| 1 | /* |
| 2 | * Copyright (c) 2000-2020 Apple Inc. All rights reserved. |
| 3 | * |
| 4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
| 5 | * |
| 6 | * This file contains Original Code and/or Modifications of Original Code |
| 7 | * as defined in and that are subject to the Apple Public Source License |
| 8 | * Version 2.0 (the 'License'). You may not use this file except in |
| 9 | * compliance with the License. The rights granted to you under the License |
| 10 | * may not be used to create, or enable the creation or redistribution of, |
| 11 | * unlawful or unlicensed copies of an Apple operating system, or to |
| 12 | * circumvent, violate, or enable the circumvention or violation of, any |
| 13 | * terms of an Apple operating system software license agreement. |
| 14 | * |
| 15 | * Please obtain a copy of the License at |
| 16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
| 17 | * |
| 18 | * The Original Code and all software distributed under the License are |
| 19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
| 20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
| 21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
| 22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
| 23 | * Please see the License for the specific language governing rights and |
| 24 | * limitations under the License. |
| 25 | * |
| 26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
| 27 | */ |
| 28 | /* |
| 29 | * @OSF_COPYRIGHT@ |
| 30 | */ |
| 31 | /* |
| 32 | * Mach Operating System |
| 33 | * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University |
| 34 | * All Rights Reserved. |
| 35 | * |
| 36 | * Permission to use, copy, modify and distribute this software and its |
| 37 | * documentation is hereby granted, provided that both the copyright |
| 38 | * notice and this permission notice appear in all copies of the |
| 39 | * software, derivative works or modified versions, and any portions |
| 40 | * thereof, and that both notices appear in supporting documentation. |
| 41 | * |
| 42 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" |
| 43 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR |
| 44 | * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. |
| 45 | * |
| 46 | * Carnegie Mellon requests users of this software to return to |
| 47 | * |
| 48 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU |
| 49 | * School of Computer Science |
| 50 | * Carnegie Mellon University |
| 51 | * Pittsburgh PA 15213-3890 |
| 52 | * |
| 53 | * any improvements or extensions that they make and grant Carnegie Mellon |
| 54 | * the rights to redistribute these changes. |
| 55 | */ |
| 56 | /* |
| 57 | */ |
| 58 | /* |
| 59 | * File: kern/zalloc.c |
| 60 | * Author: Avadis Tevanian, Jr. |
| 61 | * |
| 62 | * Zone-based memory allocator. A zone is a collection of fixed size |
| 63 | * data blocks for which quick allocation/deallocation is possible. |
| 64 | */ |
| 65 | |
| 66 | #define ZALLOC_ALLOW_DEPRECATED 1 |
| 67 | #if !ZALLOC_TEST |
| 68 | #include <mach/mach_types.h> |
| 69 | #include <mach/vm_param.h> |
| 70 | #include <mach/kern_return.h> |
| 71 | #include <mach/mach_host_server.h> |
| 72 | #include <mach/task_server.h> |
| 73 | #include <mach/machine/vm_types.h> |
| 74 | #include <machine/machine_routines.h> |
| 75 | #include <mach/vm_map.h> |
| 76 | #include <mach/sdt.h> |
| 77 | #if __x86_64__ |
| 78 | #include <i386/cpuid.h> |
| 79 | #endif |
| 80 | |
| 81 | #include <kern/bits.h> |
| 82 | #include <kern/btlog.h> |
| 83 | #include <kern/startup.h> |
| 84 | #include <kern/kern_types.h> |
| 85 | #include <kern/assert.h> |
| 86 | #include <kern/backtrace.h> |
| 87 | #include <kern/host.h> |
| 88 | #include <kern/macro_help.h> |
| 89 | #include <kern/sched.h> |
| 90 | #include <kern/locks.h> |
| 91 | #include <kern/sched_prim.h> |
| 92 | #include <kern/misc_protos.h> |
| 93 | #include <kern/thread_call.h> |
| 94 | #include <kern/zalloc_internal.h> |
| 95 | #include <kern/kalloc.h> |
| 96 | #include <kern/debug.h> |
| 97 | |
| 98 | #include <prng/random.h> |
| 99 | |
| 100 | #include <vm/pmap.h> |
| 101 | #include <vm/vm_map.h> |
| 102 | #include <vm/vm_memtag.h> |
| 103 | #include <vm/vm_kern.h> |
| 104 | #include <vm/vm_page.h> |
| 105 | #include <vm/vm_pageout.h> |
| 106 | #include <vm/vm_compressor.h> /* C_SLOT_PACKED_PTR* */ |
| 107 | |
| 108 | #include <pexpert/pexpert.h> |
| 109 | |
| 110 | #include <machine/machparam.h> |
| 111 | #include <machine/machine_routines.h> /* ml_cpu_get_info */ |
| 112 | |
| 113 | #include <os/atomic.h> |
| 114 | |
| 115 | #include <libkern/OSDebug.h> |
| 116 | #include <libkern/OSAtomic.h> |
| 117 | #include <libkern/section_keywords.h> |
| 118 | #include <sys/kdebug.h> |
| 119 | #include <sys/code_signing.h> |
| 120 | |
| 121 | #include <san/kasan.h> |
| 122 | #include <libsa/stdlib.h> |
| 123 | #include <sys/errno.h> |
| 124 | |
| 125 | #include <IOKit/IOBSD.h> |
| 126 | #include <arm64/amcc_rorgn.h> |
| 127 | |
| 128 | #if DEBUG |
| 129 | #define z_debug_assert(expr) assert(expr) |
| 130 | #else |
| 131 | #define z_debug_assert(expr) (void)(expr) |
| 132 | #endif |
| 133 | |
| 134 | #if CONFIG_PROB_GZALLOC && CONFIG_SPTM |
| 135 | #error This is not a supported configuration |
| 136 | #endif |
| 137 | |
| 138 | /* Returns pid of the task with the largest number of VM map entries. */ |
| 139 | extern pid_t find_largest_process_vm_map_entries(void); |
| 140 | |
| 141 | /* |
| 142 | * Callout to jetsam. If pid is -1, we wake up the memorystatus thread to do asynchronous kills. |
| 143 | * For any other pid we try to kill that process synchronously. |
| 144 | */ |
| 145 | extern boolean_t memorystatus_kill_on_zone_map_exhaustion(pid_t pid); |
| 146 | |
| 147 | extern zone_t vm_object_zone; |
| 148 | extern zone_t ipc_service_port_label_zone; |
| 149 | |
| 150 | ZONE_DEFINE_TYPE(percpu_u64_zone, "percpu.64" , uint64_t, |
| 151 | ZC_PERCPU | ZC_ALIGNMENT_REQUIRED | ZC_KASAN_NOREDZONE); |
| 152 | |
| 153 | #if CONFIG_KERNEL_TAGGING |
| 154 | #define ZONE_MIN_ELEM_SIZE (sizeof(uint64_t) * 2) |
| 155 | #define ZONE_ALIGN_SIZE ZONE_MIN_ELEM_SIZE |
| 156 | #else /* CONFIG_KERNEL_TAGGING */ |
| 157 | #define ZONE_MIN_ELEM_SIZE sizeof(uint64_t) |
| 158 | #define ZONE_ALIGN_SIZE ZONE_MIN_ELEM_SIZE |
| 159 | #endif /* CONFIG_KERNEL_TAGGING */ |
| 160 | |
| 161 | #define ZONE_MAX_ALLOC_SIZE (32 * 1024) |
| 162 | #if ZSECURITY_CONFIG(SAD_FENG_SHUI) |
| 163 | #define ZONE_CHUNK_ALLOC_SIZE (256 * 1024) |
| 164 | #define ZONE_GUARD_DENSE (32 * 1024) |
| 165 | #define ZONE_GUARD_SPARSE (64 * 1024) |
| 166 | #endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) */ |
| 167 | |
| 168 | #if XNU_PLATFORM_MacOSX |
| 169 | #define ZONE_MAP_MAX (32ULL << 30) |
| 170 | #define ZONE_MAP_VA_SIZE (128ULL << 30) |
| 171 | #else /* XNU_PLATFORM_MacOSX */ |
| 172 | #define ZONE_MAP_MAX (8ULL << 30) |
| 173 | #define ZONE_MAP_VA_SIZE (24ULL << 30) |
| 174 | #endif /* !XNU_PLATFORM_MacOSX */ |
| 175 | |
| 176 | __enum_closed_decl(zm_len_t, uint16_t, { |
| 177 | ZM_CHUNK_FREE = 0x0, |
| 178 | /* 1 through 8 are valid lengths */ |
| 179 | ZM_CHUNK_LEN_MAX = 0x8, |
| 180 | |
| 181 | /* PGZ magical values */ |
| 182 | ZM_PGZ_FREE = 0x0, |
| 183 | ZM_PGZ_ALLOCATED = 0xa, /* [a]llocated */ |
| 184 | ZM_PGZ_GUARD = 0xb, /* oo[b] */ |
| 185 | ZM_PGZ_DOUBLE_FREE = 0xd, /* [d]ouble_free */ |
| 186 | |
| 187 | /* secondary page markers */ |
| 188 | ZM_SECONDARY_PAGE = 0xe, |
| 189 | ZM_SECONDARY_PCPU_PAGE = 0xf, |
| 190 | }); |
| 191 | |
| 192 | static_assert(MAX_ZONES < (1u << 10), "MAX_ZONES must fit in zm_index" ); |
| 193 | |
| 194 | struct zone_page_metadata { |
| 195 | union { |
| 196 | struct { |
| 197 | /* The index of the zone this metadata page belongs to */ |
| 198 | zone_id_t zm_index : 10; |
| 199 | |
| 200 | /* |
| 201 | * This chunk ends with a guard page. |
| 202 | */ |
| 203 | uint16_t zm_guarded : 1; |
| 204 | |
| 205 | /* |
| 206 | * Whether `zm_bitmap` is an inline bitmap |
| 207 | * or a packed bitmap reference |
| 208 | */ |
| 209 | uint16_t zm_inline_bitmap : 1; |
| 210 | |
| 211 | /* |
| 212 | * Zones allocate in "chunks" of zone_t::z_chunk_pages |
| 213 | * consecutive pages, or zpercpu_count() pages if the |
| 214 | * zone is percpu. |
| 215 | * |
| 216 | * The first page of it has its metadata set with: |
| 217 | * - 0 if none of the pages are currently wired |
| 218 | * - the number of wired pages in the chunk |
| 219 | * (not scaled for percpu). |
| 220 | * |
| 221 | * Other pages in the chunk have their zm_chunk_len set |
| 222 | * to ZM_SECONDARY_PAGE or ZM_SECONDARY_PCPU_PAGE |
| 223 | * depending on whether the zone is percpu or not. |
| 224 | * For those, zm_page_index holds the index of that page |
| 225 | * in the run, and zm_subchunk_len the remaining length |
| 226 | * within the chunk. |
| 227 | * |
| 228 | * Metadata used for PGZ pages can have 3 values: |
| 229 | * - ZM_PGZ_FREE: slot is free |
| 230 | * - ZM_PGZ_ALLOCATED: slot holds an allocated element |
| 231 | * at offset (zm_pgz_orig_addr & PAGE_MASK) |
| 232 | * - ZM_PGZ_DOUBLE_FREE: slot detected a double free |
| 233 | * (will panic). |
| 234 | */ |
| 235 | zm_len_t zm_chunk_len : 4; |
| 236 | }; |
| 237 | uint16_t zm_bits; |
| 238 | }; |
| 239 | |
| 240 | union { |
| 241 | #define ZM_ALLOC_SIZE_LOCK 1u |
| 242 | uint16_t zm_alloc_size; /* first page only */ |
| 243 | struct { |
| 244 | uint8_t zm_page_index; /* secondary pages only */ |
| 245 | uint8_t zm_subchunk_len; /* secondary pages only */ |
| 246 | }; |
| 247 | uint16_t zm_oob_offs; /* in guard pages */ |
| 248 | }; |
| 249 | union { |
| 250 | uint32_t zm_bitmap; /* most zones */ |
| 251 | uint32_t zm_bump; /* permanent zones */ |
| 252 | }; |
| 253 | |
| 254 | union { |
| 255 | struct { |
| 256 | zone_pva_t zm_page_next; |
| 257 | zone_pva_t zm_page_prev; |
| 258 | }; |
| 259 | vm_offset_t zm_pgz_orig_addr; |
| 260 | struct zone_page_metadata *zm_pgz_slot_next; |
| 261 | }; |
| 262 | }; |
| 263 | static_assert(sizeof(struct zone_page_metadata) == 16, "validate packing" ); |
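/*
 * Illustrative layout (field values are hypothetical) for a fully wired,
 * guarded, 3-page chunk of a non-percpu zone, following the rules described
 * in the zm_chunk_len comment above:
 *
 *	meta[0]: zm_chunk_len = 3,                 zm_guarded = 1, zm_alloc_size/zm_bitmap in use
 *	meta[1]: zm_chunk_len = ZM_SECONDARY_PAGE, zm_page_index = 1, zm_subchunk_len = 2
 *	meta[2]: zm_chunk_len = ZM_SECONDARY_PAGE, zm_page_index = 2, zm_subchunk_len = 1
 *	meta[3]: zm_chunk_len = ZM_PGZ_GUARD,      zm_oob_offs (see PGZ_OOB_ADJUST below)
 */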
| 264 | |
| 265 | /*! |
| 266 | * @typedef zone_magazine_t |
| 267 | * |
| 268 | * @brief |
| 269 | * Magazine of cached allocations. |
| 270 | * |
| 271 | * @field zm_next linkage used by magazine depots. |
| 272 | * @field zm_elems an array of @c zc_mag_size() elements. |
| 273 | */ |
| 274 | struct zone_magazine { |
| 275 | zone_magazine_t zm_next; |
| 276 | smr_seq_t zm_seq; |
| 277 | vm_offset_t zm_elems[0]; |
| 278 | }; |
| 279 | |
| 280 | /*! |
| 281 | * @typedef zone_cache_t |
| 282 | * |
| 283 | * @brief |
| 284 | * Magazine of cached allocations. |
| 285 | * |
| 286 | * @discussion |
| 287 | * Below is a diagram of the caching system. This design is inspired by the |
| 288 | * paper "Magazines and Vmem: Extending the Slab Allocator to Many CPUs and |
| 289 | * Arbitrary Resources" by Jeff Bonwick and Jonathan Adams and the FreeBSD UMA |
| 290 | * zone allocator (itself derived from this seminal work). |
| 291 | * |
| 292 | * It is divided into 3 layers: |
| 293 | * - the per-cpu layer, |
| 294 | * - the recirculation depot layer, |
| 295 | * - the Zone Allocator. |
| 296 | * |
| 297 | * The per-cpu and recirculation depot layer use magazines (@c zone_magazine_t), |
| 298 | * which are stacks of up to @c zc_mag_size() elements. |
| 299 | * |
| 300 | * <h2>CPU layer</h2> |
| 301 | * |
| 302 | * The CPU layer (@c zone_cache_t) looks like this: |
| 303 | * |
| 304 | * ╭─ a ─ f ─┬───────── zm_depot ──────────╮ |
| 305 | * │ ╭─╮ ╭─╮ │ ╭─╮ ╭─╮ ╭─╮ ╭─╮ ╭─╮ │ |
| 306 | * │ │#│ │#│ │ │#│ │#│ │#│ │#│ │#│ │ |
| 307 | * │ │#│ │ │ │ │#│ │#│ │#│ │#│ │#│ │ |
| 308 | * │ │ │ │ │ │ │#│ │#│ │#│ │#│ │#│ │ |
| 309 | * │ ╰─╯ ╰─╯ │ ╰─╯ ╰─╯ ╰─╯ ╰─╯ ╰─╯ │ |
| 310 | * ╰─────────┴─────────────────────────────╯ |
| 311 | * |
| 312 | * It has two pre-loaded magazines (a)lloc and (f)ree which we allocate from, |
| 313 | * or free to. Serialization is achieved through disabling preemption, and only |
 * the current CPU can access those allocations. This is represented on the left
| 315 | * hand side of the diagram above. |
| 316 | * |
| 317 | * The right hand side is the per-cpu depot. It consists of @c zm_depot_count |
| 318 | * full magazines, and is protected by the @c zm_depot_lock for access. |
| 319 | * The lock is expected to absolutely never be contended, as only the local CPU |
| 320 | * tends to access the local per-cpu depot in regular operation mode. |
| 321 | * |
| 322 | * However unlike UMA, our implementation allows for the zone GC to reclaim |
 * per-CPU magazines aggressively, which is serialized with the @c zm_depot_lock.
| 324 | * |
| 325 | * |
| 326 | * <h2>Recirculation Depot</h2> |
| 327 | * |
| 328 | * The recirculation depot layer is a list similar to the per-cpu depot, |
| 329 | * however it is different in two fundamental ways: |
| 330 | * |
| 331 | * - it is protected by the regular zone lock, |
| 332 | * - elements referenced by the magazines in that layer appear free |
| 333 | * to the zone layer. |
| 334 | * |
| 335 | * |
| 336 | * <h2>Magazine circulation and sizing</h2> |
| 337 | * |
| 338 | * The caching system sizes itself dynamically. Operations that allocate/free |
 * a single element call @c zone_lock_nopreempt_check_contention() which detects
 * contention on the lock by doing a trylock and recording whether it succeeded.
| 341 | * |
| 342 | * This information is stored in the @c z_recirc_cont_cur field of the zone, |
| 343 | * and a windowed moving average is maintained in @c z_contention_wma. |
| 344 | * The periodically run function @c compute_zone_working_set_size() will then |
| 345 | * take this into account to decide to grow the number of buckets allowed |
| 346 | * in the depot or shrink it based on the @c zc_grow_level and @c zc_shrink_level |
| 347 | * thresholds. |
| 348 | * |
| 349 | * The per-cpu layer will attempt to work with its depot, finding both full and |
| 350 | * empty magazines cached there. If it can't get what it needs, then it will |
| 351 | * mediate with the zone recirculation layer. Such recirculation is done in |
| 352 | * batches in order to amortize lock holds. |
| 353 | * (See @c {zalloc,zfree}_cached_depot_recirculate()). |
| 354 | * |
 * The recirculation layer keeps track of the minimum number of magazines
 * it had over time for each of the full and empty queues. This allows for
| 357 | * @c compute_zone_working_set_size() to return memory to the system when a zone |
| 358 | * stops being used as much. |
| 359 | * |
| 360 | * <h2>Security considerations</h2> |
| 361 | * |
| 362 | * The zone caching layer has been designed to avoid returning elements in |
| 363 | * a strict LIFO behavior: @c zalloc() will allocate from the (a) magazine, |
| 364 | * and @c zfree() free to the (f) magazine, and only swap them when the |
| 365 | * requested operation cannot be fulfilled. |
| 366 | * |
| 367 | * The per-cpu overflow depot or the recirculation depots are similarly used |
| 368 | * in FIFO order. |
| 369 | * |
| 370 | * @field zc_depot_lock a lock to access @c zc_depot, @c zc_depot_cur. |
| 371 | * @field zc_alloc_cur denormalized number of elements in the (a) magazine |
| 372 | * @field zc_free_cur denormalized number of elements in the (f) magazine |
| 373 | * @field zc_alloc_elems a pointer to the array of elements in (a) |
| 374 | * @field zc_free_elems a pointer to the array of elements in (f) |
| 375 | * |
| 376 | * @field zc_depot a list of @c zc_depot_cur full magazines |
| 377 | */ |
| 378 | typedef struct zone_cache { |
| 379 | hw_lck_ticket_t zc_depot_lock; |
| 380 | uint16_t zc_alloc_cur; |
| 381 | uint16_t zc_free_cur; |
| 382 | vm_offset_t *zc_alloc_elems; |
| 383 | vm_offset_t *zc_free_elems; |
| 384 | struct zone_depot zc_depot; |
| 385 | smr_t zc_smr; |
| 386 | zone_smr_free_cb_t XNU_PTRAUTH_SIGNED_FUNCTION_PTR("zc_free" ) zc_free; |
| 387 | } __attribute__((aligned(64))) * zone_cache_t; |
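/*
 * Minimal sketch (illustrative only, this is not the actual fast path) of how
 * the pre-loaded (a) and (f) magazines described above are meant to be used;
 * preemption handling, the depot, SMR and statistics are all omitted, and the
 * function name is made up:
 *
 *	static vm_offset_t
 *	zone_cache_alloc_sketch(zone_cache_t zc)
 *	{
 *		vm_offset_t *tmp_elems;
 *		uint16_t tmp_cur;
 *
 *		if (zc->zc_alloc_cur == 0) {
 *			if (zc->zc_free_cur == 0) {
 *				return 0; // fall back to the depot / zone layer
 *			}
 *			// swap the roles of (a) and (f): keep draining the fuller one
 *			tmp_elems = zc->zc_alloc_elems;
 *			zc->zc_alloc_elems = zc->zc_free_elems;
 *			zc->zc_free_elems = tmp_elems;
 *			tmp_cur = zc->zc_alloc_cur;
 *			zc->zc_alloc_cur = zc->zc_free_cur;
 *			zc->zc_free_cur = tmp_cur;
 *		}
 *		return zc->zc_alloc_elems[--zc->zc_alloc_cur];
 *	}
 */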
| 388 | |
| 389 | #if !__x86_64__ |
| 390 | static |
| 391 | #endif |
| 392 | __security_const_late struct { |
| 393 | struct mach_vm_range zi_map_range; /* all zone submaps */ |
| 394 | struct mach_vm_range zi_ro_range; /* read-only range */ |
| 395 | struct mach_vm_range zi_meta_range; /* debugging only */ |
| 396 | struct mach_vm_range zi_bits_range; /* bits buddy allocator */ |
| 397 | struct mach_vm_range zi_xtra_range; /* vm tracking metadata */ |
| 398 | struct mach_vm_range zi_pgz_range; |
| 399 | struct zone_page_metadata *zi_pgz_meta; |
| 400 | |
| 401 | /* |
| 402 | * The metadata lives within the zi_meta_range address range. |
| 403 | * |
| 404 | * The correct formula to find a metadata index is: |
| 405 | * absolute_page_index - page_index(zi_map_range.min_address) |
| 406 | * |
| 407 | * And then this index is used to dereference zi_meta_range.min_address |
| 408 | * as a `struct zone_page_metadata` array. |
| 409 | * |
 * To avoid doing that subtraction in the various fast-paths,
 * zi_meta_base is pre-offset with that minimum page index so that
 * the math doesn't have to be redone each time.
| 413 | */ |
| 414 | struct zone_page_metadata *zi_meta_base; |
| 415 | } zone_info; |
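/*
 * Spelled-out version of the lookup described above (illustrative; the real
 * helpers are zone_pva_from_addr() / zone_pva_to_meta() further down):
 *
 *	vm_offset_t idx;
 *	struct zone_page_metadata *meta;
 *
 *	idx  = atop(addr) - atop(zone_info.zi_map_range.min_address);
 *	meta = (struct zone_page_metadata *)zone_info.zi_meta_range.min_address + idx;
 *
 *	// equivalent, using the pre-offset base: this is what
 *	// zone_meta_from_addr() does, with a sign-extended 32-bit page index
 *	meta = zone_meta_from_addr(addr);
 */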
| 416 | |
| 417 | __startup_data static struct mach_vm_range zone_map_range; |
| 418 | __startup_data static vm_map_size_t zone_meta_size; |
| 419 | __startup_data static vm_map_size_t zone_bits_size; |
| 420 | __startup_data static vm_map_size_t zone_xtra_size; |
| 421 | |
| 422 | /* |
| 423 | * Initial array of metadata for stolen memory. |
| 424 | * |
| 425 | * The numbers here have to be kept in sync with vm_map_steal_memory() |
| 426 | * so that we have reserved enough metadata. |
| 427 | * |
| 428 | * After zone_init() has run (which happens while the kernel is still single |
| 429 | * threaded), the metadata is moved to its final dynamic location, and |
| 430 | * this array is unmapped with the rest of __startup_data at lockdown. |
| 431 | */ |
| 432 | #define ZONE_EARLY_META_INLINE_COUNT 64 |
| 433 | __startup_data |
| 434 | static struct zone_page_metadata |
| 435 | zone_early_meta_array_startup[ZONE_EARLY_META_INLINE_COUNT]; |
| 436 | |
| 437 | |
| 438 | __startup_data __attribute__((aligned(PAGE_MAX_SIZE))) |
| 439 | static uint8_t zone_early_pages_to_cram[PAGE_MAX_SIZE * 16]; |
| 440 | |
| 441 | /* |
| 442 | * The zone_locks_grp allows for collecting lock statistics. |
 * All locks are associated with this group in zinit.
| 444 | * Look at tools/lockstat for debugging lock contention. |
| 445 | */ |
| 446 | LCK_GRP_DECLARE(zone_locks_grp, "zone_locks" ); |
| 447 | static LCK_MTX_DECLARE(zone_metadata_region_lck, &zone_locks_grp); |
| 448 | |
| 449 | /* |
| 450 | * The zone metadata lock protects: |
| 451 | * - metadata faulting, |
| 452 | * - VM submap VA allocations, |
| 453 | * - early gap page queue list |
| 454 | */ |
| 455 | #define zone_meta_lock() lck_mtx_lock(&zone_metadata_region_lck); |
| 456 | #define zone_meta_unlock() lck_mtx_unlock(&zone_metadata_region_lck); |
| 457 | |
| 458 | /* |
| 459 | * Exclude more than one concurrent garbage collection |
| 460 | */ |
| 461 | static LCK_GRP_DECLARE(zone_gc_lck_grp, "zone_gc" ); |
| 462 | static LCK_MTX_DECLARE(zone_gc_lock, &zone_gc_lck_grp); |
| 463 | static LCK_SPIN_DECLARE(zone_exhausted_lock, &zone_gc_lck_grp); |
| 464 | |
| 465 | /* |
| 466 | * Panic logging metadata |
| 467 | */ |
| 468 | bool panic_include_zprint = false; |
| 469 | bool panic_include_kalloc_types = false; |
| 470 | zone_t kalloc_type_src_zone = ZONE_NULL; |
| 471 | zone_t kalloc_type_dst_zone = ZONE_NULL; |
| 472 | mach_memory_info_t *panic_kext_memory_info = NULL; |
| 473 | vm_size_t panic_kext_memory_size = 0; |
| 474 | vm_offset_t panic_fault_address = 0; |
| 475 | |
| 476 | /* |
| 477 | * Protects zone_array, num_zones, num_zones_in_use, and |
| 478 | * zone_destroyed_bitmap |
| 479 | */ |
| 480 | static SIMPLE_LOCK_DECLARE(all_zones_lock, 0); |
| 481 | static zone_id_t num_zones_in_use; |
| 482 | zone_id_t _Atomic num_zones; |
| 483 | SECURITY_READ_ONLY_LATE(unsigned int) zone_view_count; |
| 484 | |
| 485 | /* |
| 486 | * Initial globals for zone stats until we can allocate the real ones. |
| 487 | * Those get migrated inside the per-CPU ones during zone_init() and |
| 488 | * this array is unmapped with the rest of __startup_data at lockdown. |
| 489 | */ |
| 490 | |
| 491 | /* zone to allocate zone_magazine structs from */ |
| 492 | static SECURITY_READ_ONLY_LATE(zone_t) zc_magazine_zone; |
| 493 | /* |
 * Zone caching is off until pid 1 is made, and until
 * compute_zone_working_set_size() runs for the first time.
| 496 | * |
| 497 | * -1 represents the "never enabled yet" value. |
| 498 | */ |
| 499 | static int8_t zone_caching_disabled = -1; |
| 500 | |
| 501 | __startup_data |
| 502 | static struct zone_stats zone_stats_startup[MAX_ZONES]; |
| 503 | struct zone zone_array[MAX_ZONES]; |
| 504 | SECURITY_READ_ONLY_LATE(zone_security_flags_t) zone_security_array[MAX_ZONES] = { |
| 505 | [0 ... MAX_ZONES - 1] = { |
| 506 | .z_kheap_id = KHEAP_ID_NONE, |
| 507 | .z_noencrypt = false, |
| 508 | .z_submap_idx = Z_SUBMAP_IDX_GENERAL_0, |
| 509 | .z_kalloc_type = false, |
| 510 | .z_sig_eq = 0 |
| 511 | }, |
| 512 | }; |
| 513 | SECURITY_READ_ONLY_LATE(struct zone_size_params) zone_ro_size_params[ZONE_ID__LAST_RO + 1]; |
| 514 | SECURITY_READ_ONLY_LATE(zone_cache_ops_t) zcache_ops[ZONE_ID__FIRST_DYNAMIC]; |
| 515 | |
| 516 | /* Initialized in zone_bootstrap(), how many "copies" the per-cpu system does */ |
| 517 | static SECURITY_READ_ONLY_LATE(unsigned) zpercpu_early_count; |
| 518 | |
| 519 | /* Used to keep track of destroyed slots in the zone_array */ |
| 520 | static bitmap_t zone_destroyed_bitmap[BITMAP_LEN(MAX_ZONES)]; |
| 521 | |
| 522 | /* number of zone mapped pages used by all zones */ |
| 523 | static size_t _Atomic zone_pages_jetsam_threshold = ~0; |
| 524 | size_t zone_pages_wired; |
| 525 | size_t zone_guard_pages; |
| 526 | |
/* Time (in ms) after which we panic for zone exhaustions */
| 528 | TUNABLE(int, zone_exhausted_timeout, "zet" , 5000); |
| 529 | static bool zone_share_always = true; |
| 530 | static TUNABLE_WRITEABLE(uint32_t, zone_early_thres_mul, "zone_early_thres_mul" , 5); |
| 531 | |
| 532 | #if VM_TAG_SIZECLASSES |
| 533 | /* |
| 534 | * Zone tagging allows for per "tag" accounting of allocations for the kalloc |
| 535 | * zones only. |
| 536 | * |
| 537 | * There are 3 kinds of tags that can be used: |
| 538 | * - pre-registered VM_KERN_MEMORY_* |
| 539 | * - dynamic tags allocated per call sites in core-kernel (using vm_tag_alloc()) |
| 540 | * - per-kext tags computed by IOKit (using the magic Z_VM_TAG_BT_BIT marker). |
| 541 | * |
| 542 | * The VM tracks the statistics in lazily allocated structures. |
| 543 | * See vm_tag_will_update_zone(), vm_tag_update_zone_size(). |
| 544 | * |
| 545 | * If for some reason the requested tag cannot be accounted for, |
| 546 | * the tag is forced to VM_KERN_MEMORY_KALLOC which is pre-allocated. |
| 547 | * |
| 548 | * Each allocated element also remembers the tag it was assigned, |
| 549 | * which lets zalloc/zfree update statistics correctly. |
| 550 | */ |
| 551 | |
| 552 | /* enable tags for zones that ask for it */ |
| 553 | static TUNABLE(bool, zone_tagging_on, "-zt" , false); |
| 554 | |
| 555 | /* |
| 556 | * Array of all sizeclasses used by kalloc variants so that we can |
| 557 | * have accounting per size class for each kalloc callsite |
| 558 | */ |
| 559 | static uint16_t zone_tags_sizeclasses[VM_TAG_SIZECLASSES]; |
| 560 | #endif /* VM_TAG_SIZECLASSES */ |
| 561 | |
| 562 | #if DEBUG || DEVELOPMENT |
| 563 | static int zalloc_simulate_vm_pressure; |
| 564 | #endif /* DEBUG || DEVELOPMENT */ |
| 565 | |
| 566 | #define Z_TUNABLE(t, n, d) \ |
| 567 | TUNABLE(t, _##n, #n, d); \ |
| 568 | __pure2 static inline t n(void) { return _##n; } |
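/*
 * For instance (illustrative expansion), Z_TUNABLE(uint16_t, zc_mag_size, 8)
 * below produces roughly:
 *
 *	TUNABLE(uint16_t, _zc_mag_size, "zc_mag_size", 8);
 *	__pure2 static inline uint16_t zc_mag_size(void) { return _zc_mag_size; }
 */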
| 569 | |
| 570 | /* |
| 571 | * Zone caching tunables |
| 572 | * |
| 573 | * zc_mag_size(): |
| 574 | * size of magazines, larger to reduce contention at the expense of memory |
| 575 | * |
| 576 | * zc_enable_level |
| 577 | * number of contentions per second after which zone caching engages |
| 578 | * automatically. |
| 579 | * |
| 580 | * 0 to disable. |
| 581 | * |
| 582 | * zc_grow_level |
| 583 | * number of contentions per second x cpu after which the number of magazines |
| 584 | * allowed in the depot can grow. (in "Z_WMA_UNIT" units). |
| 585 | * |
| 586 | * zc_shrink_level |
| 587 | * number of contentions per second x cpu below which the number of magazines |
| 588 | * allowed in the depot will shrink. (in "Z_WMA_UNIT" units). |
| 589 | * |
| 590 | * zc_pcpu_max |
| 591 | * maximum memory size in bytes that can hang from a CPU, |
| 592 | * which will affect how many magazines are allowed in the depot. |
| 593 | * |
| 594 | * The alloc/free magazines are assumed to be on average half-empty |
| 595 | * and to count for "1" unit of magazines. |
| 596 | * |
| 597 | * zc_autotrim_size |
| 598 | * Size allowed to hang extra from the recirculation depot before |
| 599 | * auto-trim kicks in. |
| 600 | * |
| 601 | * zc_autotrim_buckets |
| 602 | * |
| 603 | * How many buckets in excess of the working-set are allowed |
| 604 | * before auto-trim kicks in for empty buckets. |
| 605 | * |
| 606 | * zc_free_batch_size |
| 607 | * The size of batches of frees/reclaim that can be done keeping |
| 608 | * the zone lock held (and preemption disabled). |
| 609 | */ |
| 610 | Z_TUNABLE(uint16_t, zc_mag_size, 8); |
| 611 | static Z_TUNABLE(uint32_t, zc_enable_level, 10); |
| 612 | static Z_TUNABLE(uint32_t, zc_grow_level, 5 * Z_WMA_UNIT); |
| 613 | static Z_TUNABLE(uint32_t, zc_shrink_level, Z_WMA_UNIT / 2); |
| 614 | static Z_TUNABLE(uint32_t, zc_pcpu_max, 128 << 10); |
| 615 | static Z_TUNABLE(uint32_t, zc_autotrim_size, 16 << 10); |
| 616 | static Z_TUNABLE(uint32_t, zc_autotrim_buckets, 8); |
| 617 | static Z_TUNABLE(uint32_t, zc_free_batch_size, 256); |
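/*
 * Illustrative arithmetic for zc_pcpu_max (hypothetical numbers, the real
 * accounting lives in the working-set computation): for a zone of 128 byte
 * elements with zc_mag_size() == 8, a full magazine caches about 1KiB of
 * elements, so the default 128KiB budget allows on the order of a hundred
 * magazines to hang from each CPU, the pre-loaded (a)/(f) pair counting as
 * a single, half-empty, unit.
 */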
| 618 | |
| 619 | static SECURITY_READ_ONLY_LATE(size_t) zone_pages_wired_max; |
| 620 | static SECURITY_READ_ONLY_LATE(vm_map_t) zone_submaps[Z_SUBMAP_IDX_COUNT]; |
| 621 | static SECURITY_READ_ONLY_LATE(vm_map_t) zone_meta_map; |
| 622 | static char const * const zone_submaps_names[Z_SUBMAP_IDX_COUNT] = { |
| 623 | [Z_SUBMAP_IDX_VM] = "VM" , |
| 624 | [Z_SUBMAP_IDX_READ_ONLY] = "RO" , |
| 625 | #if ZSECURITY_CONFIG(SAD_FENG_SHUI) |
| 626 | [Z_SUBMAP_IDX_GENERAL_0] = "GEN0" , |
| 627 | [Z_SUBMAP_IDX_GENERAL_1] = "GEN1" , |
| 628 | [Z_SUBMAP_IDX_GENERAL_2] = "GEN2" , |
| 629 | [Z_SUBMAP_IDX_GENERAL_3] = "GEN3" , |
| 630 | #else |
| 631 | [Z_SUBMAP_IDX_GENERAL_0] = "GEN" , |
| 632 | #endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) */ |
| 633 | [Z_SUBMAP_IDX_DATA] = "DATA" , |
| 634 | }; |
| 635 | |
| 636 | #if __x86_64__ |
| 637 | #define ZONE_ENTROPY_CNT 8 |
| 638 | #else |
| 639 | #define ZONE_ENTROPY_CNT 2 |
| 640 | #endif |
| 641 | static struct zone_bool_gen { |
| 642 | struct bool_gen zbg_bg; |
| 643 | uint32_t zbg_entropy[ZONE_ENTROPY_CNT]; |
| 644 | } zone_bool_gen[MAX_CPUS]; |
| 645 | |
| 646 | #if CONFIG_PROB_GZALLOC |
| 647 | /* |
| 648 | * Probabilistic gzalloc |
| 649 | * ===================== |
| 650 | * |
| 651 | * |
| 652 | * Probabilistic guard zalloc samples allocations and will protect them by |
| 653 | * double-mapping the page holding them and returning the secondary virtual |
| 654 | * address to its callers. |
| 655 | * |
| 656 | * Its data structures are lazily allocated if the `pgz` or `pgz1` boot-args |
| 657 | * are set. |
| 658 | * |
| 659 | * |
| 660 | * Unlike GZalloc, PGZ uses a fixed amount of memory, and is compatible with |
| 661 | * most zalloc/kalloc features: |
| 662 | * - zone_require is functional |
| 663 | * - zone caching or zone tagging is compatible |
| 664 | * - non-blocking allocation work (they will always return NULL with gzalloc). |
| 665 | * |
| 666 | * PGZ limitations: |
| 667 | * - VA sequestering isn't respected, as the slots (which are in limited |
 *   quantity) will be reused for any type; however, the PGZ quarantine
| 669 | * somewhat mitigates the impact. |
| 670 | * - zones with elements larger than a page cannot be protected. |
| 671 | * |
| 672 | * |
| 673 | * Tunables: |
| 674 | * -------- |
| 675 | * |
| 676 | * pgz=1: |
| 677 | * Turn on probabilistic guard malloc for all zones |
| 678 | * |
| 679 | * (default on for DEVELOPMENT, off for RELEASE, or if pgz1... are specified) |
| 680 | * |
| 681 | * pgz_sample_rate=0 to 2^31 |
| 682 | * average sample rate between two guarded allocations. |
| 683 | * 0 means every allocation. |
| 684 | * |
| 685 | * The default is a random number between 1000 and 10,000 |
| 686 | * |
| 687 | * pgz_slots |
| 688 | * how many allocations to protect. |
| 689 | * |
| 690 | * Each costs: |
| 691 | * - a PTE in the pmap (when allocated) |
| 692 | * - 2 zone page meta's (every other page is a "guard" one, 32B total) |
| 693 | * - 64 bytes per backtraces. |
| 694 | * On LP64 this is <16K per 100 slots. |
| 695 | * |
| 696 | * The default is ~200 slots per G of physical ram (32k / G) |
| 697 | * |
| 698 | * TODO: |
| 699 | * - try harder to allocate elements at the "end" to catch OOB more reliably. |
| 700 | * |
| 701 | * pgz_quarantine |
| 702 | * how many slots should be free at any given time. |
| 703 | * |
| 704 | * PGZ will round robin through free slots to be reused, but free slots are |
| 705 | * important to detect use-after-free by acting as a quarantine. |
| 706 | * |
 * By default, PGZ will keep 33% of the slots around at all times.
| 708 | * |
| 709 | * pgz1=<name>, pgz2=<name>, ..., pgzn=<name>... |
| 710 | * Specific zones for which to enable probabilistic guard malloc. |
| 711 | * There must be no numbering gap (names after the gap will be ignored). |
| 712 | */ |
| 713 | #if DEBUG || DEVELOPMENT |
| 714 | static TUNABLE(bool, pgz_all, "pgz" , true); |
| 715 | #else |
| 716 | static TUNABLE(bool, pgz_all, "pgz" , false); |
| 717 | #endif |
| 718 | static TUNABLE(uint32_t, pgz_sample_rate, "pgz_sample_rate" , 0); |
| 719 | static TUNABLE(uint32_t, pgz_slots, "pgz_slots" , UINT32_MAX); |
| 720 | static TUNABLE(uint32_t, pgz_quarantine, "pgz_quarantine" , 0); |
| 721 | #endif /* CONFIG_PROB_GZALLOC */ |
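/*
 * Worked example of the defaults above (illustrative; the actual values are
 * computed at boot): a device with 8G of physical RAM gets roughly
 * 200 * 8 = 1600 slots, about a third of which (~530) stay free as the
 * quarantine, for on the order of 1600 / 100 * 16K = 256K of metadata and
 * backtrace storage; pgz_sample_rate is picked at random in [1000, 10000],
 * i.e. on average one in every few thousand eligible allocations is guarded.
 */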
| 722 | |
| 723 | static zone_t zone_find_largest(uint64_t *zone_size); |
| 724 | |
| 725 | #endif /* !ZALLOC_TEST */ |
| 726 | #pragma mark Zone metadata |
| 727 | #if !ZALLOC_TEST |
| 728 | |
| 729 | static inline bool |
| 730 | zone_has_index(zone_t z, zone_id_t zid) |
| 731 | { |
| 732 | return zone_array + zid == z; |
| 733 | } |
| 734 | |
| 735 | __abortlike |
| 736 | void |
| 737 | zone_invalid_panic(zone_t zone) |
| 738 | { |
| 739 | panic("zone %p isn't in the zone_array" , zone); |
| 740 | } |
| 741 | |
| 742 | __abortlike |
| 743 | static void |
| 744 | zone_metadata_corruption(zone_t zone, struct zone_page_metadata *meta, |
| 745 | const char *kind) |
| 746 | { |
| 747 | panic("zone metadata corruption: %s (meta %p, zone %s%s)" , |
| 748 | kind, meta, zone_heap_name(zone), zone->z_name); |
| 749 | } |
| 750 | |
| 751 | __abortlike |
| 752 | static void |
| 753 | zone_invalid_element_addr_panic(zone_t zone, vm_offset_t addr) |
| 754 | { |
| 755 | panic("zone element pointer validation failed (addr: %p, zone %s%s)" , |
| 756 | (void *)addr, zone_heap_name(zone), zone->z_name); |
| 757 | } |
| 758 | |
| 759 | __abortlike |
| 760 | static void |
| 761 | zone_page_metadata_index_confusion_panic(zone_t zone, vm_offset_t addr, |
| 762 | struct zone_page_metadata *meta) |
| 763 | { |
	zone_security_flags_t zsflags = zone_security_config(zone), src_zsflags;
| 765 | zone_id_t zidx; |
| 766 | zone_t src_zone; |
| 767 | |
| 768 | if (zsflags.z_kalloc_type) { |
| 769 | panic_include_kalloc_types = true; |
| 770 | kalloc_type_dst_zone = zone; |
| 771 | } |
| 772 | |
| 773 | zidx = meta->zm_index; |
| 774 | if (zidx >= os_atomic_load(&num_zones, relaxed)) { |
| 775 | panic("%p expected in zone %s%s[%d], but metadata has invalid zidx: %d" , |
| 776 | (void *)addr, zone_heap_name(zone), zone->z_name, zone_index(zone), |
| 777 | zidx); |
| 778 | } |
| 779 | |
| 780 | src_zone = &zone_array[zidx]; |
| 781 | src_zsflags = zone_security_array[zidx]; |
| 782 | if (src_zsflags.z_kalloc_type) { |
| 783 | panic_include_kalloc_types = true; |
| 784 | kalloc_type_src_zone = src_zone; |
| 785 | } |
| 786 | |
| 787 | panic("%p not in the expected zone %s%s[%d], but found in %s%s[%d]" , |
| 788 | (void *)addr, zone_heap_name(zone), zone->z_name, zone_index(zone), |
| 789 | zone_heap_name(src_zone), src_zone->z_name, zidx); |
| 790 | } |
| 791 | |
| 792 | __abortlike |
| 793 | static void |
| 794 | zone_page_metadata_list_corruption(zone_t zone, struct zone_page_metadata *meta) |
| 795 | { |
| 796 | panic("metadata list corruption through element %p detected in zone %s%s" , |
| 797 | meta, zone_heap_name(zone), zone->z_name); |
| 798 | } |
| 799 | |
| 800 | __abortlike |
| 801 | static void |
| 802 | zone_page_meta_accounting_panic(zone_t zone, struct zone_page_metadata *meta, |
| 803 | const char *kind) |
| 804 | { |
| 805 | panic("accounting mismatch (%s) for zone %s%s, meta %p" , kind, |
| 806 | zone_heap_name(zone), zone->z_name, meta); |
| 807 | } |
| 808 | |
| 809 | __abortlike |
| 810 | static void |
| 811 | zone_meta_double_free_panic(zone_t zone, vm_offset_t addr, const char *caller) |
| 812 | { |
| 813 | panic("%s: double free of %p to zone %s%s" , caller, |
| 814 | (void *)addr, zone_heap_name(zone), zone->z_name); |
| 815 | } |
| 816 | |
| 817 | __abortlike |
| 818 | static void |
| 819 | zone_accounting_panic(zone_t zone, const char *kind) |
| 820 | { |
| 821 | panic("accounting mismatch (%s) for zone %s%s" , kind, |
| 822 | zone_heap_name(zone), zone->z_name); |
| 823 | } |
| 824 | |
| 825 | #define zone_counter_sub(z, stat, value) ({ \ |
| 826 | if (os_sub_overflow((z)->stat, value, &(z)->stat)) { \ |
| 827 | zone_accounting_panic(z, #stat " wrap-around"); \ |
| 828 | } \ |
| 829 | (z)->stat; \ |
| 830 | }) |
| 831 | |
| 832 | static inline uint16_t |
| 833 | zone_meta_alloc_size_add(zone_t z, struct zone_page_metadata *m, |
| 834 | vm_offset_t esize) |
| 835 | { |
| 836 | if (os_add_overflow(m->zm_alloc_size, (uint16_t)esize, &m->zm_alloc_size)) { |
		zone_page_meta_accounting_panic(z, m, "alloc_size wrap-around");
| 838 | } |
| 839 | return m->zm_alloc_size; |
| 840 | } |
| 841 | |
| 842 | static inline uint16_t |
| 843 | zone_meta_alloc_size_sub(zone_t z, struct zone_page_metadata *m, |
| 844 | vm_offset_t esize) |
| 845 | { |
| 846 | if (os_sub_overflow(m->zm_alloc_size, esize, &m->zm_alloc_size)) { |
		zone_page_meta_accounting_panic(z, m, "alloc_size wrap-around");
| 848 | } |
| 849 | return m->zm_alloc_size; |
| 850 | } |
| 851 | |
| 852 | __abortlike |
| 853 | static void |
| 854 | zone_nofail_panic(zone_t zone) |
| 855 | { |
| 856 | panic("zalloc(Z_NOFAIL) can't be satisfied for zone %s%s (potential leak)" , |
| 857 | zone_heap_name(zone), zone->z_name); |
| 858 | } |
| 859 | |
| 860 | __header_always_inline bool |
| 861 | zone_spans_ro_va(vm_offset_t addr_start, vm_offset_t addr_end) |
| 862 | { |
| 863 | const struct mach_vm_range *ro_r = &zone_info.zi_ro_range; |
| 864 | struct mach_vm_range r = { addr_start, addr_end }; |
| 865 | |
	return mach_vm_range_intersects(ro_r, &r);
| 867 | } |
| 868 | |
| 869 | #define from_range(r, addr, size) \ |
| 870 | __builtin_choose_expr(__builtin_constant_p(size) ? (size) == 1 : 0, \ |
| 871 | mach_vm_range_contains(r, (mach_vm_offset_t)(addr)), \ |
| 872 | mach_vm_range_contains(r, (mach_vm_offset_t)(addr), size)) |
| 873 | |
| 874 | #define from_ro_map(addr, size) \ |
| 875 | from_range(&zone_info.zi_ro_range, addr, size) |
| 876 | |
| 877 | #define from_zone_map(addr, size) \ |
| 878 | from_range(&zone_info.zi_map_range, addr, size) |
| 879 | |
| 880 | __header_always_inline bool |
| 881 | zone_pva_is_null(zone_pva_t page) |
| 882 | { |
| 883 | return page.packed_address == 0; |
| 884 | } |
| 885 | |
| 886 | __header_always_inline bool |
| 887 | zone_pva_is_queue(zone_pva_t page) |
| 888 | { |
| 889 | // actual kernel pages have the top bit set |
| 890 | return (int32_t)page.packed_address > 0; |
| 891 | } |
| 892 | |
| 893 | __header_always_inline bool |
| 894 | zone_pva_is_equal(zone_pva_t pva1, zone_pva_t pva2) |
| 895 | { |
| 896 | return pva1.packed_address == pva2.packed_address; |
| 897 | } |
| 898 | |
| 899 | __header_always_inline zone_pva_t * |
| 900 | zone_pageq_base(void) |
| 901 | { |
| 902 | extern zone_pva_t data_seg_start[] __SEGMENT_START_SYM("__DATA" ); |
| 903 | |
| 904 | /* |
| 905 | * `-1` so that if the first __DATA variable is a page queue, |
| 906 | * it gets a non 0 index |
| 907 | */ |
| 908 | return data_seg_start - 1; |
| 909 | } |
| 910 | |
| 911 | __header_always_inline void |
| 912 | zone_queue_set_head(zone_t z, zone_pva_t queue, zone_pva_t oldv, |
| 913 | struct zone_page_metadata *meta) |
| 914 | { |
| 915 | zone_pva_t *queue_head = &zone_pageq_base()[queue.packed_address]; |
| 916 | |
	if (!zone_pva_is_equal(*queue_head, oldv)) {
		zone_page_metadata_list_corruption(z, meta);
| 919 | } |
| 920 | *queue_head = meta->zm_page_next; |
| 921 | } |
| 922 | |
| 923 | __header_always_inline zone_pva_t |
| 924 | zone_queue_encode(zone_pva_t *headp) |
| 925 | { |
	return (zone_pva_t){ .packed_address = (uint32_t)(headp - zone_pageq_base()) };
| 927 | } |
| 928 | |
| 929 | __header_always_inline zone_pva_t |
| 930 | zone_pva_from_addr(vm_address_t addr) |
| 931 | { |
| 932 | // cannot use atop() because we want to maintain the sign bit |
	return (zone_pva_t){ .packed_address = (uint32_t)((intptr_t)addr >> PAGE_SHIFT) };
| 934 | } |
| 935 | |
| 936 | __header_always_inline vm_address_t |
| 937 | zone_pva_to_addr(zone_pva_t page) |
| 938 | { |
| 939 | // cause sign extension so that we end up with the right address |
| 940 | return (vm_offset_t)(int32_t)page.packed_address << PAGE_SHIFT; |
| 941 | } |
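/*
 * Illustrative round-trip for the packed page addresses above (valid for
 * addresses whose page index fits a sign-extended 32 bits, which holds for
 * the zone map):
 *
 *	vm_address_t addr = ...some zone map address...;
 *	zone_pva_t   pva  = zone_pva_from_addr(addr);
 *
 *	assert(!zone_pva_is_queue(pva));	// top bit set => negative index
 *	assert(zone_pva_to_addr(pva) == trunc_page(addr));
 */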
| 942 | |
| 943 | __header_always_inline struct zone_page_metadata * |
| 944 | zone_pva_to_meta(zone_pva_t page) |
| 945 | { |
| 946 | return &zone_info.zi_meta_base[page.packed_address]; |
| 947 | } |
| 948 | |
| 949 | __header_always_inline zone_pva_t |
| 950 | zone_pva_from_meta(struct zone_page_metadata *meta) |
| 951 | { |
	return (zone_pva_t){ .packed_address = (uint32_t)(meta - zone_info.zi_meta_base) };
| 953 | } |
| 954 | |
| 955 | __header_always_inline struct zone_page_metadata * |
| 956 | zone_meta_from_addr(vm_offset_t addr) |
| 957 | { |
	return zone_pva_to_meta(zone_pva_from_addr(addr));
| 959 | } |
| 960 | |
| 961 | __header_always_inline zone_id_t |
| 962 | zone_index_from_ptr(const void *ptr) |
| 963 | { |
	return zone_pva_to_meta(zone_pva_from_addr((vm_offset_t)ptr))->zm_index;
| 965 | } |
| 966 | |
| 967 | __header_always_inline vm_offset_t |
| 968 | zone_meta_to_addr(struct zone_page_metadata *meta) |
| 969 | { |
| 970 | return ptoa((int32_t)(meta - zone_info.zi_meta_base)); |
| 971 | } |
| 972 | |
| 973 | __attribute__((overloadable)) |
| 974 | __header_always_inline void |
| 975 | zone_meta_validate(zone_t z, struct zone_page_metadata *meta, vm_address_t addr) |
| 976 | { |
	if (!zone_has_index(z, meta->zm_index)) {
		zone_page_metadata_index_confusion_panic(z, addr, meta);
| 979 | } |
| 980 | } |
| 981 | |
| 982 | __attribute__((overloadable)) |
| 983 | __header_always_inline void |
| 984 | zone_meta_validate(zone_t z, struct zone_page_metadata *meta) |
| 985 | { |
	zone_meta_validate(z, meta, zone_meta_to_addr(meta));
| 987 | } |
| 988 | |
| 989 | __header_always_inline void |
| 990 | zone_meta_queue_push(zone_t z, zone_pva_t *headp, |
| 991 | struct zone_page_metadata *meta) |
| 992 | { |
| 993 | zone_pva_t head = *headp; |
| 994 | zone_pva_t queue_pva = zone_queue_encode(headp); |
| 995 | struct zone_page_metadata *tmp; |
| 996 | |
| 997 | meta->zm_page_next = head; |
	if (!zone_pva_is_null(head)) {
		tmp = zone_pva_to_meta(head);
		if (!zone_pva_is_equal(tmp->zm_page_prev, queue_pva)) {
			zone_page_metadata_list_corruption(z, meta);
| 1002 | } |
| 1003 | tmp->zm_page_prev = zone_pva_from_meta(meta); |
| 1004 | } |
| 1005 | meta->zm_page_prev = queue_pva; |
| 1006 | *headp = zone_pva_from_meta(meta); |
| 1007 | } |
| 1008 | |
| 1009 | __header_always_inline struct zone_page_metadata * |
| 1010 | zone_meta_queue_pop(zone_t z, zone_pva_t *headp) |
| 1011 | { |
| 1012 | zone_pva_t head = *headp; |
	struct zone_page_metadata *meta = zone_pva_to_meta(head);
| 1014 | struct zone_page_metadata *tmp; |
| 1015 | |
| 1016 | zone_meta_validate(z, meta); |
| 1017 | |
	if (!zone_pva_is_null(meta->zm_page_next)) {
		tmp = zone_pva_to_meta(meta->zm_page_next);
		if (!zone_pva_is_equal(tmp->zm_page_prev, head)) {
			zone_page_metadata_list_corruption(z, meta);
| 1022 | } |
| 1023 | tmp->zm_page_prev = meta->zm_page_prev; |
| 1024 | } |
| 1025 | *headp = meta->zm_page_next; |
| 1026 | |
| 1027 | meta->zm_page_next = meta->zm_page_prev = (zone_pva_t){ 0 }; |
| 1028 | |
| 1029 | return meta; |
| 1030 | } |
| 1031 | |
| 1032 | __header_always_inline void |
| 1033 | zone_meta_remqueue(zone_t z, struct zone_page_metadata *meta) |
| 1034 | { |
| 1035 | zone_pva_t meta_pva = zone_pva_from_meta(meta); |
| 1036 | struct zone_page_metadata *tmp; |
| 1037 | |
	if (!zone_pva_is_null(meta->zm_page_next)) {
		tmp = zone_pva_to_meta(meta->zm_page_next);
		if (!zone_pva_is_equal(tmp->zm_page_prev, meta_pva)) {
			zone_page_metadata_list_corruption(z, meta);
		}
		tmp->zm_page_prev = meta->zm_page_prev;
	}
	if (zone_pva_is_queue(meta->zm_page_prev)) {
		zone_queue_set_head(z, meta->zm_page_prev, meta_pva, meta);
	} else {
		tmp = zone_pva_to_meta(meta->zm_page_prev);
		if (!zone_pva_is_equal(tmp->zm_page_next, meta_pva)) {
			zone_page_metadata_list_corruption(z, meta);
| 1051 | } |
| 1052 | tmp->zm_page_next = meta->zm_page_next; |
| 1053 | } |
| 1054 | |
| 1055 | meta->zm_page_next = meta->zm_page_prev = (zone_pva_t){ 0 }; |
| 1056 | } |
| 1057 | |
| 1058 | __header_always_inline void |
| 1059 | zone_meta_requeue(zone_t z, zone_pva_t *headp, |
| 1060 | struct zone_page_metadata *meta) |
| 1061 | { |
| 1062 | zone_meta_remqueue(z, meta); |
| 1063 | zone_meta_queue_push(z, headp, meta); |
| 1064 | } |
| 1065 | |
| 1066 | /* prevents a given metadata from ever reaching the z_pageq_empty queue */ |
| 1067 | static inline void |
| 1068 | zone_meta_lock_in_partial(zone_t z, struct zone_page_metadata *m, uint32_t len) |
| 1069 | { |
| 1070 | uint16_t new_size = zone_meta_alloc_size_add(z, m, ZM_ALLOC_SIZE_LOCK); |
| 1071 | |
| 1072 | assert(new_size % sizeof(vm_offset_t) == ZM_ALLOC_SIZE_LOCK); |
| 1073 | if (new_size == ZM_ALLOC_SIZE_LOCK) { |
		zone_meta_requeue(z, &z->z_pageq_partial, m);
| 1075 | zone_counter_sub(z, z_wired_empty, len); |
| 1076 | } |
| 1077 | } |
| 1078 | |
| 1079 | /* allows a given metadata to reach the z_pageq_empty queue again */ |
| 1080 | static inline void |
| 1081 | zone_meta_unlock_from_partial(zone_t z, struct zone_page_metadata *m, uint32_t len) |
| 1082 | { |
| 1083 | uint16_t new_size = zone_meta_alloc_size_sub(z, m, ZM_ALLOC_SIZE_LOCK); |
| 1084 | |
| 1085 | assert(new_size % sizeof(vm_offset_t) == 0); |
| 1086 | if (new_size == 0) { |
		zone_meta_requeue(z, &z->z_pageq_empty, m);
| 1088 | z->z_wired_empty += len; |
| 1089 | } |
| 1090 | } |
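/*
 * Illustrative note on the ZM_ALLOC_SIZE_LOCK bias used above: the sizes
 * accounted into zm_alloc_size are multiples of sizeof(vm_offset_t) (hence
 * the assert()s), so adding the odd value 1 both prevents zm_alloc_size from
 * ever returning to 0 (pinning the page out of z_pageq_empty) and remains
 * distinguishable from real allocations. For example (hypothetical sizes,
 * LP64), a locked page holding two allocated 16 byte elements has
 * zm_alloc_size == 33: 33 % sizeof(vm_offset_t) == 1 reveals the lock,
 * and 33 - 1 == 32 is the actual allocated size.
 */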
| 1091 | |
| 1092 | /* |
| 1093 | * Routine to populate a page backing metadata in the zone_metadata_region. |
| 1094 | * Must be called without the zone lock held as it might potentially block. |
| 1095 | */ |
| 1096 | static void |
| 1097 | zone_meta_populate(vm_offset_t base, vm_size_t size) |
| 1098 | { |
	struct zone_page_metadata *from = zone_meta_from_addr(base);
| 1100 | struct zone_page_metadata *to = from + atop(size); |
| 1101 | vm_offset_t page_addr = trunc_page(from); |
| 1102 | |
| 1103 | for (; page_addr < (vm_offset_t)to; page_addr += PAGE_SIZE) { |
| 1104 | #if !KASAN |
| 1105 | /* |
| 1106 | * This can race with another thread doing a populate on the same metadata |
| 1107 | * page, where we see an updated pmap but unmapped KASan shadow, causing a |
| 1108 | * fault in the shadow when we first access the metadata page. Avoid this |
| 1109 | * by always synchronizing on the zone_metadata_region lock with KASan. |
| 1110 | */ |
		if (pmap_find_phys(kernel_pmap, page_addr)) {
| 1112 | continue; |
| 1113 | } |
| 1114 | #endif |
| 1115 | |
| 1116 | for (;;) { |
| 1117 | kern_return_t ret = KERN_SUCCESS; |
| 1118 | |
| 1119 | /* |
| 1120 | * All updates to the zone_metadata_region are done |
| 1121 | * under the zone_metadata_region_lck |
| 1122 | */ |
| 1123 | zone_meta_lock(); |
			if (0 == pmap_find_phys(kernel_pmap, page_addr)) {
				ret = kernel_memory_populate(page_addr,
				    PAGE_SIZE, KMA_NOPAGEWAIT | KMA_KOBJECT | KMA_ZERO,
| 1127 | VM_KERN_MEMORY_OSFMK); |
| 1128 | } |
| 1129 | zone_meta_unlock(); |
| 1130 | |
| 1131 | if (ret == KERN_SUCCESS) { |
| 1132 | break; |
| 1133 | } |
| 1134 | |
| 1135 | /* |
| 1136 | * We can't pass KMA_NOPAGEWAIT under a global lock as it leads |
| 1137 | * to bad system deadlocks, so if the allocation failed, |
| 1138 | * we need to do the VM_PAGE_WAIT() outside of the lock. |
| 1139 | */ |
| 1140 | VM_PAGE_WAIT(); |
| 1141 | } |
| 1142 | } |
| 1143 | } |
| 1144 | |
| 1145 | __abortlike |
| 1146 | static void |
| 1147 | zone_invalid_element_panic(zone_t zone, vm_offset_t addr) |
| 1148 | { |
| 1149 | struct zone_page_metadata *meta; |
| 1150 | const char *from_cache = "" ; |
| 1151 | vm_offset_t page; |
| 1152 | |
| 1153 | if (!from_zone_map(addr, zone_elem_inner_size(zone))) { |
| 1154 | panic("addr %p being freed to zone %s%s%s, isn't from zone map" , |
| 1155 | (void *)addr, zone_heap_name(zone), zone->z_name, from_cache); |
| 1156 | } |
| 1157 | page = trunc_page(addr); |
| 1158 | meta = zone_meta_from_addr(addr); |
| 1159 | |
	if (!zone_has_index(zone, meta->zm_index)) {
| 1161 | zone_page_metadata_index_confusion_panic(zone, addr, meta); |
| 1162 | } |
| 1163 | |
| 1164 | if (meta->zm_chunk_len == ZM_SECONDARY_PCPU_PAGE) { |
| 1165 | panic("metadata %p corresponding to addr %p being freed to " |
| 1166 | "zone %s%s%s, is marked as secondary per cpu page" , |
| 1167 | meta, (void *)addr, zone_heap_name(zone), zone->z_name, |
| 1168 | from_cache); |
| 1169 | } |
| 1170 | if (meta->zm_chunk_len == ZM_SECONDARY_PAGE) { |
| 1171 | page -= ptoa(meta->zm_page_index); |
| 1172 | meta -= meta->zm_page_index; |
| 1173 | } |
| 1174 | |
| 1175 | if (meta->zm_chunk_len > ZM_CHUNK_LEN_MAX) { |
| 1176 | panic("metadata %p corresponding to addr %p being freed to " |
| 1177 | "zone %s%s%s, has chunk len greater than max" , |
| 1178 | meta, (void *)addr, zone_heap_name(zone), zone->z_name, |
| 1179 | from_cache); |
| 1180 | } |
| 1181 | |
| 1182 | if ((addr - zone_elem_inner_offs(zone) - page) % zone_elem_outer_size(zone)) { |
| 1183 | panic("addr %p being freed to zone %s%s%s, isn't aligned to " |
| 1184 | "zone element size" , (void *)addr, zone_heap_name(zone), |
| 1185 | zone->z_name, from_cache); |
| 1186 | } |
| 1187 | |
| 1188 | zone_invalid_element_addr_panic(zone, addr); |
| 1189 | } |
| 1190 | |
| 1191 | __attribute__((always_inline)) |
| 1192 | static struct zone_page_metadata * |
| 1193 | zone_element_resolve( |
| 1194 | zone_t zone, |
| 1195 | vm_offset_t addr, |
| 1196 | vm_offset_t *idx) |
| 1197 | { |
| 1198 | struct zone_page_metadata *meta; |
| 1199 | vm_offset_t offs, eidx; |
| 1200 | |
| 1201 | meta = zone_meta_from_addr(addr); |
	if (!from_zone_map(addr, 1) || !zone_has_index(zone, meta->zm_index)) {
| 1203 | zone_invalid_element_panic(zone, addr); |
| 1204 | } |
| 1205 | |
| 1206 | offs = (addr & PAGE_MASK) - zone_elem_inner_offs(zone); |
| 1207 | if (meta->zm_chunk_len == ZM_SECONDARY_PAGE) { |
| 1208 | offs += ptoa(meta->zm_page_index); |
| 1209 | meta -= meta->zm_page_index; |
| 1210 | } |
| 1211 | |
	eidx = Z_FAST_QUO(offs, zone->z_quo_magic);
| 1213 | if (eidx * zone_elem_outer_size(zone) != offs) { |
| 1214 | zone_invalid_element_panic(zone, addr); |
| 1215 | } |
| 1216 | |
| 1217 | *idx = eidx; |
| 1218 | return meta; |
| 1219 | } |
| 1220 | |
| 1221 | #if ZSECURITY_CONFIG(PGZ_OOB_ADJUST) |
| 1222 | void * |
| 1223 | zone_element_pgz_oob_adjust(void *ptr, vm_size_t req_size, vm_size_t elem_size) |
| 1224 | { |
| 1225 | vm_offset_t addr = (vm_offset_t)ptr; |
| 1226 | vm_offset_t end = addr + elem_size; |
| 1227 | vm_offset_t offs; |
| 1228 | |
| 1229 | /* |
| 1230 | * 0-sized allocations in a KALLOC_MINSIZE bucket |
| 1231 | * would be offset to the next allocation which is incorrect. |
| 1232 | */ |
| 1233 | req_size = MAX(roundup(req_size, KALLOC_MINALIGN), KALLOC_MINALIGN); |
| 1234 | |
| 1235 | /* |
| 1236 | * Given how chunks work, for a zone with PGZ guards on, |
| 1237 | * there's a single element which ends precisely |
| 1238 | * at the page boundary: the last one. |
| 1239 | */ |
| 1240 | if (req_size == elem_size || |
| 1241 | (end & PAGE_MASK) || |
| 1242 | !zone_meta_from_addr(addr)->zm_guarded) { |
| 1243 | return ptr; |
| 1244 | } |
| 1245 | |
| 1246 | offs = elem_size - req_size; |
	zone_meta_from_addr(end)->zm_oob_offs = (uint16_t)offs;
| 1248 | |
| 1249 | return (char *)addr + offs; |
| 1250 | } |
#endif /* ZSECURITY_CONFIG(PGZ_OOB_ADJUST) */
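/*
 * Worked example of the adjustment above (hypothetical sizes, assuming a
 * 16 byte KALLOC_MINALIGN): for a 48 byte element that ends exactly at a
 * guard page, a 20 byte request is rounded up to 32 bytes, offs = 48 - 32
 * = 16, and the returned pointer is pushed 16 bytes forward so that the
 * requested buffer ends flush against the guard page and a linear overflow
 * faults immediately. The 16 byte shift is stashed in the guard page's
 * zm_oob_offs so that zone_element_size() / zfree() can recover the
 * element's true start.
 */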
| 1252 | |
| 1253 | __abortlike |
| 1254 | static void |
| 1255 | zone_element_bounds_check_panic(vm_address_t addr, vm_size_t len) |
| 1256 | { |
| 1257 | struct zone_page_metadata *meta; |
| 1258 | vm_offset_t offs, size, page; |
| 1259 | zone_t zone; |
| 1260 | |
| 1261 | page = trunc_page(addr); |
| 1262 | meta = zone_meta_from_addr(addr); |
| 1263 | zone = &zone_array[meta->zm_index]; |
| 1264 | |
| 1265 | if (zone->z_percpu) { |
| 1266 | panic("zone bound checks: address %p is a per-cpu allocation" , |
| 1267 | (void *)addr); |
| 1268 | } |
| 1269 | |
| 1270 | if (meta->zm_chunk_len == ZM_SECONDARY_PAGE) { |
| 1271 | page -= ptoa(meta->zm_page_index); |
| 1272 | meta -= meta->zm_page_index; |
| 1273 | } |
| 1274 | |
| 1275 | size = zone_elem_outer_size(zone); |
	offs = Z_FAST_MOD(addr - zone_elem_inner_offs(zone) - page + size,
	    zone->z_quo_magic, size);
| 1278 | panic("zone bound checks: buffer %p of length %zd overflows " |
| 1279 | "object %p of size %zd in zone %p[%s%s]" , |
| 1280 | (void *)addr, len, (void *)(addr - offs - zone_elem_redzone(zone)), |
| 1281 | zone_elem_inner_size(zone), zone, zone_heap_name(zone), zone_name(zone)); |
| 1282 | } |
| 1283 | |
| 1284 | void |
| 1285 | zone_element_bounds_check(vm_address_t addr, vm_size_t len) |
| 1286 | { |
| 1287 | struct zone_page_metadata *meta; |
| 1288 | vm_offset_t offs, size; |
| 1289 | zone_t zone; |
| 1290 | |
| 1291 | if (!from_zone_map(addr, 1)) { |
| 1292 | return; |
| 1293 | } |
| 1294 | |
| 1295 | #if CONFIG_PROB_GZALLOC |
| 1296 | if (__improbable(pgz_owned(addr))) { |
| 1297 | meta = zone_meta_from_addr(addr); |
| 1298 | addr = trunc_page(meta->zm_pgz_orig_addr) + (addr & PAGE_MASK); |
| 1299 | } |
| 1300 | #endif /* CONFIG_PROB_GZALLOC */ |
| 1301 | meta = zone_meta_from_addr(addr); |
	zone = zone_by_id(meta->zm_index);
| 1303 | |
| 1304 | if (zone->z_percpu) { |
| 1305 | zone_element_bounds_check_panic(addr, len); |
| 1306 | } |
| 1307 | |
| 1308 | if (zone->z_permanent) { |
| 1309 | /* We don't know bounds for those */ |
| 1310 | return; |
| 1311 | } |
| 1312 | |
| 1313 | offs = (addr & PAGE_MASK) - zone_elem_inner_offs(zone); |
| 1314 | if (meta->zm_chunk_len == ZM_SECONDARY_PAGE) { |
| 1315 | offs += ptoa(meta->zm_page_index); |
| 1316 | } |
| 1317 | size = zone_elem_outer_size(zone); |
	offs = Z_FAST_MOD(offs + size, zone->z_quo_magic, size);
| 1319 | if (len + zone_elem_redzone(zone) > size - offs) { |
| 1320 | zone_element_bounds_check_panic(addr, len); |
| 1321 | } |
| 1322 | } |
| 1323 | |
| 1324 | /* |
| 1325 | * Routine to get the size of a zone allocated address. |
 * If the address doesn't belong to the zone maps, returns 0.
| 1327 | */ |
| 1328 | vm_size_t |
| 1329 | zone_element_size(void *elem, zone_t *z, bool clear_oob, vm_offset_t *oob_offs) |
| 1330 | { |
| 1331 | vm_address_t addr = (vm_address_t)elem; |
| 1332 | struct zone_page_metadata *meta; |
| 1333 | vm_size_t esize, offs, end; |
| 1334 | zone_t zone; |
| 1335 | |
| 1336 | if (from_zone_map(addr, sizeof(void *))) { |
| 1337 | meta = zone_meta_from_addr(addr); |
		zone = zone_by_id(meta->zm_index);
| 1339 | esize = zone_elem_inner_size(zone); |
| 1340 | end = vm_memtag_canonicalize_address(addr + esize); |
| 1341 | offs = 0; |
| 1342 | |
| 1343 | #if ZSECURITY_CONFIG(PGZ_OOB_ADJUST) |
| 1344 | /* |
		 * If the chunk uses guards and (addr + esize) either
		 * crosses a page boundary or lands exactly on one,
| 1347 | * we need to look harder. |
| 1348 | */ |
| 1349 | if (oob_offs && meta->zm_guarded && atop(addr ^ end)) { |
| 1350 | /* |
| 1351 | * Because in the vast majority of cases the element |
			 * size is sub-page, and meta[1] must be faulted in,
| 1353 | * we can quickly peek at whether it's a guard. |
| 1354 | * |
| 1355 | * For elements larger than a page, finding the guard |
| 1356 | * page requires a little more effort. |
| 1357 | */ |
| 1358 | if (meta[1].zm_chunk_len == ZM_PGZ_GUARD) { |
| 1359 | offs = meta[1].zm_oob_offs; |
| 1360 | if (clear_oob) { |
| 1361 | meta[1].zm_oob_offs = 0; |
| 1362 | } |
| 1363 | } else if (esize > PAGE_SIZE) { |
| 1364 | struct zone_page_metadata *gmeta; |
| 1365 | |
| 1366 | if (meta->zm_chunk_len == ZM_SECONDARY_PAGE) { |
| 1367 | gmeta = meta + meta->zm_subchunk_len; |
| 1368 | } else { |
| 1369 | gmeta = meta + zone->z_chunk_pages; |
| 1370 | } |
| 1371 | assert(gmeta->zm_chunk_len == ZM_PGZ_GUARD); |
| 1372 | |
				if (end >= zone_meta_to_addr(gmeta)) {
| 1374 | offs = gmeta->zm_oob_offs; |
| 1375 | if (clear_oob) { |
| 1376 | gmeta->zm_oob_offs = 0; |
| 1377 | } |
| 1378 | } |
| 1379 | } |
| 1380 | } |
| 1381 | #else |
| 1382 | #pragma unused(end, clear_oob) |
| 1383 | #endif /* ZSECURITY_CONFIG(PGZ_OOB_ADJUST) */ |
| 1384 | |
| 1385 | if (oob_offs) { |
| 1386 | *oob_offs = offs; |
| 1387 | } |
| 1388 | if (z) { |
| 1389 | *z = zone; |
| 1390 | } |
| 1391 | return esize; |
| 1392 | } |
| 1393 | |
| 1394 | if (oob_offs) { |
| 1395 | *oob_offs = 0; |
| 1396 | } |
| 1397 | |
| 1398 | return 0; |
| 1399 | } |
| 1400 | |
| 1401 | zone_id_t |
| 1402 | zone_id_for_element(void *addr, vm_size_t esize) |
| 1403 | { |
| 1404 | zone_id_t zid = ZONE_ID_INVALID; |
| 1405 | if (from_zone_map(addr, esize)) { |
		zid = zone_index_from_ptr(addr);
| 1407 | __builtin_assume(zid != ZONE_ID_INVALID); |
| 1408 | } |
| 1409 | return zid; |
| 1410 | } |
| 1411 | |
| 1412 | /* This function just formats the reason for the panics by redoing the checks */ |
| 1413 | __abortlike |
| 1414 | static void |
| 1415 | zone_require_panic(zone_t zone, void *addr) |
| 1416 | { |
| 1417 | uint32_t zindex; |
| 1418 | zone_t other; |
| 1419 | |
| 1420 | if (!from_zone_map(addr, zone_elem_inner_size(zone))) { |
| 1421 | panic("zone_require failed: address not in a zone (addr: %p)" , addr); |
| 1422 | } |
| 1423 | |
	zindex = zone_index_from_ptr(addr);
| 1425 | other = &zone_array[zindex]; |
| 1426 | if (zindex >= os_atomic_load(&num_zones, relaxed) || !other->z_self) { |
| 1427 | panic("zone_require failed: invalid zone index %d " |
| 1428 | "(addr: %p, expected: %s%s)" , zindex, |
| 1429 | addr, zone_heap_name(zone), zone->z_name); |
| 1430 | } else { |
| 1431 | panic("zone_require failed: address in unexpected zone id %d (%s%s) " |
| 1432 | "(addr: %p, expected: %s%s)" , |
| 1433 | zindex, zone_heap_name(other), other->z_name, |
| 1434 | addr, zone_heap_name(zone), zone->z_name); |
| 1435 | } |
| 1436 | } |
| 1437 | |
| 1438 | __abortlike |
| 1439 | static void |
| 1440 | zone_id_require_panic(zone_id_t zid, void *addr) |
| 1441 | { |
| 1442 | zone_require_panic(zone: &zone_array[zid], addr); |
| 1443 | } |
| 1444 | |
| 1445 | /* |
| 1446 | * Routines to panic if a pointer is not mapped to an expected zone. |
| 1447 | * This can be used as a means of pinning an object to the zone it is expected |
| 1448 | * to be a part of. Causes a panic if the address does not belong to any |
| 1449 | * specified zone, does not belong to any zone, has been freed and therefore |
| 1450 | * unmapped from the zone, or the pointer contains an uninitialized value that |
| 1451 | * does not belong to any zone. |
| 1452 | */ |
| 1453 | void |
| 1454 | zone_require(zone_t zone, void *addr) |
| 1455 | { |
| 1456 | vm_size_t esize = zone_elem_inner_size(zone); |
| 1457 | |
| 1458 | if (from_zone_map(addr, esize) && |
| 1459 | zone_has_index(z: zone, zid: zone_index_from_ptr(ptr: addr))) { |
| 1460 | return; |
| 1461 | } |
| 1462 | zone_require_panic(zone, addr); |
| 1463 | } |
| 1464 | |
| 1465 | void |
| 1466 | zone_id_require(zone_id_t zid, vm_size_t esize, void *addr) |
| 1467 | { |
| 1468 | if (from_zone_map(addr, esize) && zid == zone_index_from_ptr(ptr: addr)) { |
| 1469 | return; |
| 1470 | } |
| 1471 | zone_id_require_panic(zid, addr); |
| 1472 | } |
| 1473 | |
| 1474 | void |
| 1475 | zone_id_require_aligned(zone_id_t zid, void *addr) |
| 1476 | { |
| 1477 | zone_t zone = zone_by_id(zid); |
| 1478 | vm_offset_t elem, offs; |
| 1479 | |
| 1480 | elem = (vm_offset_t)addr; |
| 1481 | offs = (elem & PAGE_MASK) - zone_elem_inner_offs(zone); |
| 1482 | |
| 1483 | if (from_zone_map(addr, 1)) { |
| 1484 | struct zone_page_metadata *meta; |
| 1485 | |
| 1486 | meta = zone_meta_from_addr(addr: elem); |
| 1487 | if (meta->zm_chunk_len == ZM_SECONDARY_PAGE) { |
| 1488 | offs += ptoa(meta->zm_page_index); |
| 1489 | } |
| 1490 | |
| 1491 | if (zid == meta->zm_index && |
| 1492 | Z_FAST_ALIGNED(offs, magic: zone->z_align_magic)) { |
| 1493 | return; |
| 1494 | } |
| 1495 | } |
| 1496 | |
| 1497 | zone_invalid_element_panic(zone, addr: elem); |
| 1498 | } |
| 1499 | |
| 1500 | bool |
| 1501 | zone_owns(zone_t zone, void *addr) |
| 1502 | { |
| 1503 | vm_size_t esize = zone_elem_inner_size(zone); |
| 1504 | |
| 1505 | if (from_zone_map(addr, esize)) { |
| 1506 | return zone_has_index(z: zone, zid: zone_index_from_ptr(ptr: addr)); |
| 1507 | } |
| 1508 | return false; |
| 1509 | } |
| 1510 | |
| 1511 | static inline struct mach_vm_range |
| 1512 | zone_kmem_suballoc( |
| 1513 | mach_vm_offset_t addr, |
| 1514 | vm_size_t size, |
| 1515 | int flags, |
| 1516 | vm_tag_t tag, |
| 1517 | vm_map_t *new_map) |
| 1518 | { |
| 1519 | struct mach_vm_range r; |
| 1520 | |
| 1521 | *new_map = kmem_suballoc(parent: kernel_map, addr: &addr, size, |
| 1522 | vmc_options: VM_MAP_CREATE_NEVER_FAULTS | VM_MAP_CREATE_DISABLE_HOLELIST, |
| 1523 | vm_flags: flags, flags: KMS_PERMANENT | KMS_NOFAIL, tag).kmr_submap; |
| 1524 | |
| 1525 | r.min_address = addr; |
| 1526 | r.max_address = addr + size; |
| 1527 | return r; |
| 1528 | } |
| 1529 | |
| 1530 | #endif /* !ZALLOC_TEST */ |
| 1531 | #pragma mark Zone bits allocator |
| 1532 | |
| 1533 | /*! |
| 1534 | * @defgroup Zone Bitmap allocator |
| 1535 | * @{ |
| 1536 | * |
| 1537 | * @brief |
| 1538 | * Functions implementing the zone bitmap allocator |
| 1539 | * |
| 1540 | * @discussion |
| 1541 | * The zone allocator maintains which elements are allocated or free in bitmaps. |
| 1542 | * |
| 1543 | * When the number of elements per page is smaller than 32, it is stored inline |
| 1544 | * on the @c zone_page_metadata structure (@c zm_inline_bitmap is set, |
| 1545 | * and @c zm_bitmap used for storage). |
| 1546 | * |
| 1547 | * When the number of elements is larger, a bitmap is allocated from
| 1548 | * a buddy allocator (implemented under the @c zba_* namespace). Pointers
| 1549 | * to bitmaps are implemented as a packed 32 bit bitmap reference, stored in
| 1550 | * @c zm_bitmap. The top 3 bits (@c ZBA_ORDER_SHIFT) encode the scale (order)
| 1551 | * of the allocation in @c ZBA_GRANULE units, and hence actual allocations
| 1552 | * encoded with that scheme cannot be larger than 1024 bytes (8192 bits).
| 1553 | * |
| 1554 | * This buddy allocator can actually accommodate allocations as large
| 1555 | * as 8k on 16k systems and 2k on 4k systems. |
| 1556 | * |
| 1557 | * Note: @c zba_* functions are implementation details not meant to be used |
| 1558 | * outside of the allocation of the allocator itself. Interfaces to the rest of |
| 1559 | * the zone allocator are documented and not @c zba_* prefixed. |
| 1560 | */ |
| 1561 | |
| 1562 | #define ZBA_CHUNK_SIZE PAGE_MAX_SIZE |
| 1563 | #define ZBA_GRANULE sizeof(uint64_t) |
| 1564 | #define ZBA_GRANULE_BITS (8 * sizeof(uint64_t)) |
| 1565 | #define ZBA_MAX_ORDER (PAGE_MAX_SHIFT - 4) |
| 1566 | #define ZBA_MAX_ALLOC_ORDER 7 |
| 1567 | #define ZBA_SLOTS (ZBA_CHUNK_SIZE / ZBA_GRANULE) |
| 1568 | #define ZBA_HEADS_COUNT (ZBA_MAX_ORDER + 1) |
| 1569 | #define ZBA_PTR_MASK 0x0fffffff |
| 1570 | #define ZBA_ORDER_SHIFT 29 |
| 1571 | #define ZBA_HAS_EXTRA_BIT 0x10000000
| 1572 | |
| 1573 | static_assert(2ul * ZBA_GRANULE << ZBA_MAX_ORDER == ZBA_CHUNK_SIZE, "chunk sizes" ); |
| 1574 | static_assert(ZBA_MAX_ALLOC_ORDER <= ZBA_MAX_ORDER, "ZBA_MAX_ORDER is enough" ); |
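| | /*
| | * Illustrative sketch (comment only, not compiled): decoding a packed
| | * bitmap reference stored in @c zm_bitmap, mirroring zba_bits_ref_order()
| | * and zba_bits_ref_ptr() defined later in this file.
| | *
| | *     uint32_t  order = bref >> ZBA_ORDER_SHIFT;             // allocation order
| | *     bool      extra = bref & ZBA_HAS_EXTRA_BIT;            // has VM-tag slots
| | *     uint64_t *bits  = zba_slot_base() + (bref & ZBA_PTR_MASK);
| | *
| | * The referenced bitmap spans (ZBA_GRANULE << order) bytes,
| | * i.e. ZBA_GRANULE_BITS << order usable bits.
| | */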
| 1575 | |
| 1576 | struct zone_bits_chain { |
| 1577 | uint32_t zbc_next; |
| 1578 | uint32_t zbc_prev; |
| 1579 | } __attribute__((aligned(ZBA_GRANULE))); |
| 1580 | |
| 1581 | struct zone_bits_head { |
| 1582 | uint32_t zbh_next; |
| 1583 | uint32_t zbh_unused; |
| 1584 | } __attribute__((aligned(ZBA_GRANULE))); |
| 1585 | |
| 1586 | static_assert(sizeof(struct zone_bits_chain) == ZBA_GRANULE, "zbc size" ); |
| 1587 | static_assert(sizeof(struct zone_bits_head) == ZBA_GRANULE, "zbh size" ); |
| 1588 | |
| 1589 | struct zone_bits_allocator_meta { |
| 1590 | uint32_t zbam_left; |
| 1591 | uint32_t zbam_right; |
| 1592 | struct zone_bits_head zbam_lists[ZBA_HEADS_COUNT]; |
| 1593 | struct zone_bits_head zbam_lists_with_extra[ZBA_HEADS_COUNT];
| 1594 | }; |
| 1595 | |
| 1596 | struct zone_bits_allocator_header {
| 1597 | uint64_t zbah_bits[ZBA_SLOTS / (8 * sizeof(uint64_t))];
| 1598 | };
| 1599 | |
| 1600 | #if ZALLOC_TEST |
| 1601 | static struct zalloc_bits_allocator_test_setup { |
| 1602 | vm_offset_t zbats_base; |
| 1603 | void (*zbats_populate)(vm_address_t addr, vm_size_t size); |
| 1604 | } zba_test_info; |
| 1605 | |
| 1606 | static struct zone_bits_allocator_header * |
| 1607 | zba_base_header(void) |
| 1608 | { |
| 1609 | return (struct zone_bits_allocator_header *)zba_test_info.zbats_base; |
| 1610 | } |
| 1611 | |
| 1612 | static kern_return_t |
| 1613 | zba_populate(uint32_t n, bool with_extra __unused) |
| 1614 | { |
| 1615 | vm_address_t base = zba_test_info.zbats_base; |
| 1616 | zba_test_info.zbats_populate(base + n * ZBA_CHUNK_SIZE, ZBA_CHUNK_SIZE); |
| 1617 | |
| 1618 | return KERN_SUCCESS; |
| 1619 | } |
| 1620 | #else |
| 1621 | __startup_data __attribute__((aligned(ZBA_CHUNK_SIZE))) |
| 1622 | static uint8_t zba_chunk_startup[ZBA_CHUNK_SIZE]; |
| 1623 | |
| 1624 | static SECURITY_READ_ONLY_LATE(uint8_t) zba_xtra_shift; |
| 1625 | static LCK_MTX_DECLARE(zba_mtx, &zone_locks_grp); |
| 1626 | |
| 1627 | static struct zone_bits_allocator_header * |
| 1628 | zba_base_header(void)
| 1629 | { |
| 1630 | return (struct zone_bits_allocator_header *)zone_info.zi_bits_range.min_address; |
| 1631 | } |
| 1632 | |
| 1633 | static void |
| 1634 | zba_lock(void) |
| 1635 | { |
| 1636 | lck_mtx_lock(lck: &zba_mtx); |
| 1637 | } |
| 1638 | |
| 1639 | static void |
| 1640 | zba_unlock(void) |
| 1641 | { |
| 1642 | lck_mtx_unlock(lck: &zba_mtx); |
| 1643 | } |
| 1644 | |
| 1645 | __abortlike |
| 1646 | static void |
| 1647 | zba_memory_exhausted(void) |
| 1648 | { |
| 1649 | uint64_t zsize = 0; |
| 1650 | zone_t z = zone_find_largest(zone_size: &zsize); |
| 1651 | panic("zba_populate: out of bitmap space, " |
| 1652 | "likely due to memory leak in zone [%s%s] " |
| 1653 | "(%u%c, %d elements allocated)" , |
| 1654 | zone_heap_name(z), zone_name(z), |
| 1655 | mach_vm_size_pretty(zsize), mach_vm_size_unit(zsize), |
| 1656 | zone_count_allocated(z)); |
| 1657 | } |
| 1658 | |
| 1659 | |
| 1660 | static kern_return_t |
| 1661 | zba_populate(uint32_t n, bool with_extra)
| 1662 | { |
| 1663 | vm_size_t bits_size = ZBA_CHUNK_SIZE; |
| 1664 | vm_size_t xtra_size = bits_size * CHAR_BIT << zba_xtra_shift; |
| 1665 | vm_address_t bits_addr; |
| 1666 | vm_address_t xtra_addr; |
| 1667 | kern_return_t kr; |
| 1668 | |
| 1669 | bits_addr = zone_info.zi_bits_range.min_address + n * bits_size; |
| 1670 | xtra_addr = zone_info.zi_xtra_range.min_address + n * xtra_size; |
| 1671 | |
| 1672 | kr = kernel_memory_populate(addr: bits_addr, size: bits_size, |
| 1673 | flags: KMA_ZERO | KMA_KOBJECT | KMA_NOPAGEWAIT, |
| 1674 | VM_KERN_MEMORY_OSFMK); |
| 1675 | if (kr != KERN_SUCCESS) { |
| 1676 | return kr; |
| 1677 | } |
| 1678 | |
| 1679 | |
| 1680 | if (with_extra) { |
| 1681 | kr = kernel_memory_populate(addr: xtra_addr, size: xtra_size, |
| 1682 | flags: KMA_ZERO | KMA_KOBJECT | KMA_NOPAGEWAIT, |
| 1683 | VM_KERN_MEMORY_OSFMK); |
| 1684 | if (kr != KERN_SUCCESS) { |
| 1685 | kernel_memory_depopulate(addr: bits_addr, size: bits_size, |
| 1686 | flags: KMA_ZERO | KMA_KOBJECT | KMA_NOPAGEWAIT, |
| 1687 | VM_KERN_MEMORY_OSFMK); |
| 1688 | } |
| 1689 | } |
| 1690 | |
| 1691 | return kr; |
| 1692 | } |
| 1693 | #endif |
| 1694 | |
| 1695 | __pure2 |
| 1696 | static struct zone_bits_allocator_meta * |
| 1697 | zba_meta(void) |
| 1698 | { |
| 1699 | return (struct zone_bits_allocator_meta *)&zba_base_header()[1]; |
| 1700 | } |
| 1701 | |
| 1702 | __pure2 |
| 1703 | static uint64_t * |
| 1704 | zba_slot_base(void) |
| 1705 | { |
| 1706 | return (uint64_t *)zba_base_header(); |
| 1707 | } |
| 1708 | |
| 1709 | __pure2 |
| 1710 | static struct zone_bits_head * |
| 1711 | zba_head(uint32_t order, bool with_extra)
| 1712 | { |
| 1713 | if (with_extra) { |
| 1714 | return &zba_meta()->zbam_lists_with_extra[order]; |
| 1715 | } else { |
| 1716 | return &zba_meta()->zbam_lists[order]; |
| 1717 | } |
| 1718 | } |
| 1719 | |
| 1720 | __pure2 |
| 1721 | static uint32_t |
| 1722 | zba_head_index(struct zone_bits_head *hd) |
| 1723 | { |
| 1724 | return (uint32_t)((uint64_t *)hd - zba_slot_base()); |
| 1725 | } |
| 1726 | |
| 1727 | __pure2 |
| 1728 | static struct zone_bits_chain * |
| 1729 | zba_chain_for_index(uint32_t index) |
| 1730 | { |
| 1731 | return (struct zone_bits_chain *)(zba_slot_base() + index); |
| 1732 | } |
| 1733 | |
| 1734 | __pure2 |
| 1735 | static uint32_t |
| 1736 | zba_chain_to_index(const struct zone_bits_chain *zbc) |
| 1737 | { |
| 1738 | return (uint32_t)((const uint64_t *)zbc - zba_slot_base()); |
| 1739 | } |
| 1740 | |
| 1741 | __abortlike |
| 1742 | static void |
| 1743 | zba_head_corruption_panic(uint32_t order, bool with_extra)
| 1744 | { |
| 1745 | panic("zone bits allocator head[%d:%d:%p] is corrupt" , |
| 1746 | order, with_extra, zba_head(order, with_extra)); |
| 1747 | } |
| 1748 | |
| 1749 | __abortlike |
| 1750 | static void |
| 1751 | zba_chain_corruption_panic(struct zone_bits_chain *a, struct zone_bits_chain *b) |
| 1752 | { |
| 1753 | panic("zone bits allocator freelist is corrupt (%p <-> %p)" , a, b); |
| 1754 | } |
| 1755 | |
| 1756 | static void |
| 1757 | zba_push_block(struct zone_bits_chain *zbc, uint32_t order, bool with_extra)
| 1758 | { |
| 1759 | struct zone_bits_head *hd = zba_head(order, with_extra); |
| 1760 | uint32_t hd_index = zba_head_index(hd); |
| 1761 | uint32_t index = zba_chain_to_index(zbc); |
| 1762 | struct zone_bits_chain *next; |
| 1763 | |
| 1764 | if (hd->zbh_next) { |
| 1765 | next = zba_chain_for_index(index: hd->zbh_next); |
| 1766 | if (next->zbc_prev != hd_index) { |
| 1767 | zba_head_corruption_panic(order, with_extra); |
| 1768 | } |
| 1769 | next->zbc_prev = index; |
| 1770 | } |
| 1771 | zbc->zbc_next = hd->zbh_next; |
| 1772 | zbc->zbc_prev = hd_index; |
| 1773 | hd->zbh_next = index; |
| 1774 | } |
| 1775 | |
| 1776 | static void |
| 1777 | zba_remove_block(struct zone_bits_chain *zbc) |
| 1778 | { |
| 1779 | struct zone_bits_chain *prev = zba_chain_for_index(index: zbc->zbc_prev); |
| 1780 | uint32_t index = zba_chain_to_index(zbc); |
| 1781 | |
| 1782 | if (prev->zbc_next != index) { |
| 1783 | zba_chain_corruption_panic(a: prev, b: zbc); |
| 1784 | } |
| 1785 | if ((prev->zbc_next = zbc->zbc_next)) { |
| 1786 | struct zone_bits_chain *next = zba_chain_for_index(index: zbc->zbc_next); |
| 1787 | if (next->zbc_prev != index) { |
| 1788 | zba_chain_corruption_panic(a: zbc, b: next); |
| 1789 | } |
| 1790 | next->zbc_prev = zbc->zbc_prev; |
| 1791 | } |
| 1792 | } |
| 1793 | |
| 1794 | static vm_address_t |
| 1795 | zba_try_pop_block(uint32_t order, bool with_extra)
| 1796 | { |
| 1797 | struct zone_bits_head *hd = zba_head(order, with_extra); |
| 1798 | struct zone_bits_chain *zbc; |
| 1799 | |
| 1800 | if (hd->zbh_next == 0) { |
| 1801 | return 0; |
| 1802 | } |
| 1803 | |
| 1804 | zbc = zba_chain_for_index(index: hd->zbh_next); |
| 1805 | zba_remove_block(zbc); |
| 1806 | return (vm_address_t)zbc; |
| 1807 | } |
| 1808 | |
| 1809 | static struct zone_bits_allocator_header * |
| 1810 | zba_header(vm_offset_t addr)
| 1811 | { |
| 1812 | addr &= -(vm_offset_t)ZBA_CHUNK_SIZE; |
| 1813 | return (struct zone_bits_allocator_header *)addr; |
| 1814 | } |
| 1815 | |
| 1816 | static size_t |
| 1817 | zba_node_parent(size_t node) |
| 1818 | { |
| 1819 | return (node - 1) / 2; |
| 1820 | } |
| 1821 | |
| 1822 | static size_t |
| 1823 | zba_node_left_child(size_t node) |
| 1824 | { |
| 1825 | return node * 2 + 1; |
| 1826 | } |
| 1827 | |
| 1828 | static size_t |
| 1829 | zba_node_buddy(size_t node) |
| 1830 | { |
| 1831 | return ((node - 1) ^ 1) + 1; |
| 1832 | } |
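| | /*
| | * Illustrative sketch (comment only, not compiled): the buddy allocator uses
| | * the usual implicit binary-tree indexing, e.g. for node 5:
| | *
| | *     zba_node_parent(5)      == (5 - 1) / 2       == 2
| | *     zba_node_left_child(5)  == 5 * 2 + 1         == 11
| | *     zba_node_buddy(5)       == ((5 - 1) ^ 1) + 1 == 6
| | *
| | * Buddies are siblings of the same order, which is what lets zba_free()
| | * below coalesce blocks by walking towards the root one order at a time.
| | */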
| 1833 | |
| 1834 | static size_t |
| 1835 | zba_node(vm_offset_t addr, uint32_t order) |
| 1836 | { |
| 1837 | vm_offset_t offs = (addr % ZBA_CHUNK_SIZE) / ZBA_GRANULE; |
| 1838 | return (offs >> order) + (1 << (ZBA_MAX_ORDER - order + 1)) - 1; |
| 1839 | } |
| 1840 | |
| 1841 | static struct zone_bits_chain * |
| 1842 | zba_chain_for_node(struct zone_bits_allocator_header *zbah, size_t node, uint32_t order) |
| 1843 | { |
| 1844 | vm_offset_t offs = (node - (1 << (ZBA_MAX_ORDER - order + 1)) + 1) << order; |
| 1845 | return (struct zone_bits_chain *)((vm_offset_t)zbah + offs * ZBA_GRANULE); |
| 1846 | } |
| 1847 | |
| 1848 | static void |
| 1849 | zba_node_flip_split(struct zone_bits_allocator_header *zbah, size_t node) |
| 1850 | { |
| 1851 | zbah->zbah_bits[node / 64] ^= 1ull << (node % 64); |
| 1852 | } |
| 1853 | |
| 1854 | static bool |
| 1855 | zba_node_is_split(struct zone_bits_allocator_header *zbah, size_t node) |
| 1856 | { |
| 1857 | return zbah->zbah_bits[node / 64] & (1ull << (node % 64)); |
| 1858 | } |
| 1859 | |
| 1860 | static void |
| 1861 | zba_free(vm_offset_t addr, uint32_t order, bool with_extra)
| 1862 | { |
| 1863 | struct zone_bits_allocator_header *zbah = zba_header(addr); |
| 1864 | struct zone_bits_chain *zbc; |
| 1865 | size_t node = zba_node(addr, order); |
| 1866 | |
| 1867 | while (node) { |
| 1868 | size_t parent = zba_node_parent(node); |
| 1869 | |
| 1870 | zba_node_flip_split(zbah, node: parent); |
| 1871 | if (zba_node_is_split(zbah, node: parent)) { |
| 1872 | break; |
| 1873 | } |
| 1874 | |
| 1875 | zbc = zba_chain_for_node(zbah, node: zba_node_buddy(node), order); |
| 1876 | zba_remove_block(zbc); |
| 1877 | order++; |
| 1878 | node = parent; |
| 1879 | } |
| 1880 | |
| 1881 | zba_push_block(zbc: zba_chain_for_node(zbah, node, order), order, with_extra); |
| 1882 | } |
| 1883 | |
| 1884 | static vm_size_t |
| 1885 | zba_chunk_header_size(uint32_t n)
| 1886 | { |
| 1887 | vm_size_t hdr_size = sizeof(struct zone_bits_allocator_header); |
| 1888 | if (n == 0) { |
| 1889 | hdr_size += sizeof(struct zone_bits_allocator_meta); |
| 1890 | } |
| 1891 | return hdr_size; |
| 1892 | } |
| 1893 | |
| 1894 | static void |
| 1895 | zba_init_chunk(uint32_t n, bool with_extra)
| 1896 | { |
| 1897 | vm_size_t hdr_size = zba_chunk_header_size(n); |
| 1898 | vm_offset_t page = (vm_offset_t)zba_base_header() + n * ZBA_CHUNK_SIZE; |
| 1899 | struct zone_bits_allocator_header *zbah = zba_header(addr: page); |
| 1900 | vm_size_t size = ZBA_CHUNK_SIZE; |
| 1901 | size_t node; |
| 1902 | |
| 1903 | for (uint32_t o = ZBA_MAX_ORDER + 1; o-- > 0;) { |
| 1904 | if (size < hdr_size + (ZBA_GRANULE << o)) { |
| 1905 | continue; |
| 1906 | } |
| 1907 | size -= ZBA_GRANULE << o; |
| 1908 | node = zba_node(addr: page + size, order: o); |
| 1909 | zba_node_flip_split(zbah, node: zba_node_parent(node)); |
| 1910 | zba_push_block(zbc: zba_chain_for_node(zbah, node, order: o), order: o, with_extra); |
| 1911 | } |
| 1912 | } |
| 1913 | |
| 1914 | __attribute__((noinline)) |
| 1915 | static void |
| 1916 | zba_grow(bool with_extra)
| 1917 | { |
| 1918 | struct zone_bits_allocator_meta *meta = zba_meta(); |
| 1919 | kern_return_t kr = KERN_SUCCESS; |
| 1920 | uint32_t chunk; |
| 1921 | |
| 1922 | #if !ZALLOC_TEST |
| 1923 | if (meta->zbam_left >= meta->zbam_right) { |
| 1924 | zba_memory_exhausted(); |
| 1925 | } |
| 1926 | #endif |
| 1927 | |
| 1928 | if (with_extra) { |
| 1929 | chunk = meta->zbam_right - 1; |
| 1930 | } else { |
| 1931 | chunk = meta->zbam_left; |
| 1932 | } |
| 1933 | |
| 1934 | kr = zba_populate(n: chunk, with_extra); |
| 1935 | if (kr == KERN_SUCCESS) { |
| 1936 | if (with_extra) { |
| 1937 | meta->zbam_right -= 1; |
| 1938 | } else { |
| 1939 | meta->zbam_left += 1; |
| 1940 | } |
| 1941 | |
| 1942 | zba_init_chunk(n: chunk, with_extra); |
| 1943 | #if !ZALLOC_TEST |
| 1944 | } else { |
| 1945 | /* |
| 1946 | * zba_populate() has to be allowed to fail populating, |
| 1947 | * as we are under a global lock, we need to do the |
| 1948 | * VM_PAGE_WAIT() outside of the lock. |
| 1949 | */ |
| 1950 | assert(kr == KERN_RESOURCE_SHORTAGE); |
| 1951 | zba_unlock(); |
| 1952 | VM_PAGE_WAIT(); |
| 1953 | zba_lock(); |
| 1954 | #endif |
| 1955 | } |
| 1956 | } |
| 1957 | |
| 1958 | static vm_offset_t |
| 1959 | zba_alloc(uint32_t order, bool with_extra)
| 1960 | { |
| 1961 | struct zone_bits_allocator_header *zbah; |
| 1962 | uint32_t cur = order; |
| 1963 | vm_address_t addr; |
| 1964 | size_t node; |
| 1965 | |
| 1966 | while ((addr = zba_try_pop_block(order: cur, with_extra)) == 0) { |
| 1967 | if (__improbable(cur++ >= ZBA_MAX_ORDER)) { |
| 1968 | zba_grow(with_extra); |
| 1969 | cur = order; |
| 1970 | } |
| 1971 | } |
| 1972 | |
| 1973 | zbah = zba_header(addr); |
| 1974 | node = zba_node(addr, order: cur); |
| 1975 | zba_node_flip_split(zbah, node: zba_node_parent(node)); |
| 1976 | while (cur > order) { |
| 1977 | cur--; |
| 1978 | zba_node_flip_split(zbah, node); |
| 1979 | node = zba_node_left_child(node); |
| 1980 | zba_push_block(zbc: zba_chain_for_node(zbah, node: node + 1, order: cur), |
| 1981 | order: cur, with_extra); |
| 1982 | } |
| 1983 | |
| 1984 | return addr; |
| 1985 | } |
| 1986 | |
| 1987 | #define zba_map_index(type, n) (n / (8 * sizeof(type))) |
| 1988 | #define zba_map_bit(type, n) ((type)1 << (n % (8 * sizeof(type)))) |
| 1989 | #define zba_map_mask_lt(type, n) (zba_map_bit(type, n) - 1) |
| 1990 | #define zba_map_mask_ge(type, n) ((type)-zba_map_bit(type, n)) |
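| | /*
| | * Illustrative sketch (comment only, not compiled): for eidx == 5 on
| | * a 32-bit map word,
| | *
| | *     zba_map_index(uint32_t, 5)   == 0            // word holding bit 5
| | *     zba_map_bit(uint32_t, 5)     == 0x00000020   // 1u << 5
| | *     zba_map_mask_lt(uint32_t, 5) == 0x0000001f   // bits 0..4
| | *     zba_map_mask_ge(uint32_t, 5) == 0xffffffe0   // bits 5..31
| | */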
| 1991 | |
| 1992 | #if !ZALLOC_TEST |
| 1993 | #if VM_TAG_SIZECLASSES |
| 1994 | |
| 1995 | static void * |
| 1996 | zba_extra_ref_ptr(uint32_t bref, vm_offset_t idx) |
| 1997 | { |
| 1998 | vm_offset_t base = zone_info.zi_xtra_range.min_address; |
| 1999 | vm_offset_t offs = (bref & ZBA_PTR_MASK) * ZBA_GRANULE * CHAR_BIT; |
| 2000 | |
| 2001 | return (void *)(base + ((offs + idx) << zba_xtra_shift)); |
| 2002 | } |
| 2003 | |
| 2004 | #endif /* VM_TAG_SIZECLASSES */ |
| 2005 | |
| 2006 | static uint32_t |
| 2007 | zba_bits_ref_order(uint32_t bref) |
| 2008 | { |
| 2009 | return bref >> ZBA_ORDER_SHIFT; |
| 2010 | } |
| 2011 | |
| 2012 | static bitmap_t * |
| 2013 | zba_bits_ref_ptr(uint32_t bref) |
| 2014 | { |
| 2015 | return zba_slot_base() + (bref & ZBA_PTR_MASK); |
| 2016 | } |
| 2017 | |
| 2018 | static vm_offset_t |
| 2019 | zba_scan_bitmap_inline(zone_t zone, struct zone_page_metadata *meta, |
| 2020 | zalloc_flags_t flags, vm_offset_t eidx) |
| 2021 | { |
| 2022 | size_t i = eidx / 32; |
| 2023 | uint32_t map; |
| 2024 | |
| 2025 | if (eidx % 32) { |
| 2026 | map = meta[i].zm_bitmap & zba_map_mask_ge(uint32_t, eidx); |
| 2027 | if (map) { |
| 2028 | eidx = __builtin_ctz(map); |
| 2029 | meta[i].zm_bitmap ^= 1u << eidx; |
| 2030 | return i * 32 + eidx; |
| 2031 | } |
| 2032 | i++; |
| 2033 | } |
| 2034 | |
| 2035 | uint32_t chunk_len = meta->zm_chunk_len; |
| 2036 | if (flags & Z_PCPU) { |
| 2037 | chunk_len = zpercpu_count(); |
| 2038 | } |
| 2039 | for (int j = 0; j < chunk_len; j++, i++) { |
| 2040 | if (i >= chunk_len) { |
| 2041 | i = 0; |
| 2042 | } |
| 2043 | if (__probable(map = meta[i].zm_bitmap)) { |
| 2044 | meta[i].zm_bitmap &= map - 1; |
| 2045 | return i * 32 + __builtin_ctz(map); |
| 2046 | } |
| 2047 | } |
| 2048 | |
| 2049 | zone_page_meta_accounting_panic(zone, meta, kind: "zm_bitmap" ); |
| 2050 | } |
| 2051 | |
| 2052 | static vm_offset_t |
| 2053 | zba_scan_bitmap_ref(zone_t zone, struct zone_page_metadata *meta, |
| 2054 | vm_offset_t eidx) |
| 2055 | { |
| 2056 | uint32_t bits_size = 1 << zba_bits_ref_order(bref: meta->zm_bitmap); |
| 2057 | bitmap_t *bits = zba_bits_ref_ptr(bref: meta->zm_bitmap); |
| 2058 | size_t i = eidx / 64; |
| 2059 | uint64_t map; |
| 2060 | |
| 2061 | if (eidx % 64) { |
| 2062 | map = bits[i] & zba_map_mask_ge(uint64_t, eidx); |
| 2063 | if (map) { |
| 2064 | eidx = __builtin_ctzll(map); |
| 2065 | bits[i] ^= 1ull << eidx; |
| 2066 | return i * 64 + eidx; |
| 2067 | } |
| 2068 | i++; |
| 2069 | } |
| 2070 | |
| 2071 | for (int j = 0; j < bits_size; i++, j++) { |
| 2072 | if (i >= bits_size) { |
| 2073 | i = 0; |
| 2074 | } |
| 2075 | if (__probable(map = bits[i])) { |
| 2076 | bits[i] &= map - 1; |
| 2077 | return i * 64 + __builtin_ctzll(map); |
| 2078 | } |
| 2079 | } |
| 2080 | |
| 2081 | zone_page_meta_accounting_panic(zone, meta, kind: "zm_bitmap" ); |
| 2082 | } |
| 2083 | |
| 2084 | /*! |
| 2085 | * @function zone_meta_find_and_clear_bit |
| 2086 | * |
| 2087 | * @brief |
| 2088 | * The core of the bitmap allocator: find a bit set in the bitmaps. |
| 2089 | * |
| 2090 | * @discussion
| 2091 | * This method round-robins through the available bits, keeping a per-CPU
| 2092 | * memory of the last allocated element index.
| 2093 | *
| 2094 | * This is done in order to avoid a fully LIFO behavior which makes exploiting
| 2095 | * double-free bugs way too practical.
| 2096 | *
| 2097 | * @param zone The zone we're allocating from.
| 2098 | * @param zs The zone per-CPU statistics, which hold the round-robin cursor.
| 2099 | * @param meta The main metadata for the chunk being allocated from.
| 2099 | * @param flags The alloc flags (for @c Z_PCPU).
| 2100 | */ |
| 2101 | static vm_offset_t |
| 2102 | zone_meta_find_and_clear_bit( |
| 2103 | zone_t zone, |
| 2104 | zone_stats_t zs, |
| 2105 | struct zone_page_metadata *meta, |
| 2106 | zalloc_flags_t flags) |
| 2107 | { |
| 2108 | vm_offset_t eidx = zs->zs_alloc_rr + 1; |
| 2109 | |
| 2110 | if (meta->zm_inline_bitmap) { |
| 2111 | eidx = zba_scan_bitmap_inline(zone, meta, flags, eidx); |
| 2112 | } else { |
| 2113 | eidx = zba_scan_bitmap_ref(zone, meta, eidx); |
| 2114 | } |
| 2115 | zs->zs_alloc_rr = (uint16_t)eidx; |
| 2116 | return eidx; |
| 2117 | } |
| 2118 | |
| 2119 | /*! |
| 2120 | * @function zone_meta_bits_init_inline |
| 2121 | * |
| 2122 | * @brief |
| 2123 | * Initializes the inline zm_bitmap field(s) for a newly assigned chunk. |
| 2124 | * |
| 2125 | * @param meta The main metadata for the initialized chunk. |
| 2126 | * @param count The number of elements the chunk can hold |
| 2127 | * (which might be partial for partially populated chunks). |
| 2128 | */ |
| 2129 | static void |
| 2130 | zone_meta_bits_init_inline(struct zone_page_metadata *meta, uint32_t count) |
| 2131 | { |
| 2132 | /* |
| 2133 | * We're called with the metadata zm_bitmap fields already zeroed out. |
| 2134 | */ |
| 2135 | for (size_t i = 0; i < count / 32; i++) { |
| 2136 | meta[i].zm_bitmap = ~0u; |
| 2137 | } |
| 2138 | if (count % 32) { |
| 2139 | meta[count / 32].zm_bitmap = zba_map_mask_lt(uint32_t, count); |
| 2140 | } |
| 2141 | } |
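| | /*
| | * Illustrative sketch (comment only, not compiled): for a chunk holding
| | * 40 elements, zone_meta_bits_init_inline() leaves:
| | *
| | *     meta[0].zm_bitmap == ~0u;                            // elements 0..31 free
| | *     meta[1].zm_bitmap == zba_map_mask_lt(uint32_t, 40);  // 0xff, elements 32..39 free
| | *
| | * Any further zm_bitmap words stay zeroed since no such elements exist.
| | */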
| 2142 | |
| 2143 | /*! |
| 2144 | * @function zone_meta_bits_alloc_init |
| 2145 | * |
| 2146 | * @brief |
| 2147 | * Allocates a zm_bitmap field for a newly assigned chunk. |
| 2148 | * |
| 2149 | * @param count The number of elements the chunk can hold |
| 2150 | * (which might be partial for partially populated chunks). |
| 2151 | * @param nbits The maximum number of bits that will be used.
| 2152 | * @param with_extra Whether "VM Tracking" metadata needs to be allocated. |
| 2153 | */ |
| 2154 | static uint32_t |
| 2155 | zone_meta_bits_alloc_init(uint32_t count, uint32_t nbits, bool with_extra)
| 2156 | { |
| 2157 | static_assert(ZONE_MAX_ALLOC_SIZE / ZONE_MIN_ELEM_SIZE <= |
| 2158 | ZBA_GRANULE_BITS << ZBA_MAX_ORDER, "bitmaps will be large enough" ); |
| 2159 | |
| 2160 | uint32_t order = flsll(mask: (nbits - 1) / ZBA_GRANULE_BITS); |
| 2161 | uint64_t *bits; |
| 2162 | size_t i = 0; |
| 2163 | |
| 2164 | assert(order <= ZBA_MAX_ALLOC_ORDER); |
| 2165 | assert(count <= ZBA_GRANULE_BITS << order); |
| 2166 | |
| 2167 | zba_lock(); |
| 2168 | bits = (uint64_t *)zba_alloc(order, with_extra); |
| 2169 | zba_unlock(); |
| 2170 | |
| 2171 | while (i < count / 64) { |
| 2172 | bits[i++] = ~0ull; |
| 2173 | } |
| 2174 | if (count % 64) { |
| 2175 | bits[i++] = zba_map_mask_lt(uint64_t, count); |
| 2176 | } |
| 2177 | while (i < 1u << order) { |
| 2178 | bits[i++] = 0; |
| 2179 | } |
| 2180 | |
| 2181 | return (uint32_t)(bits - zba_slot_base()) + |
| 2182 | (order << ZBA_ORDER_SHIFT) + |
| 2183 | (with_extra ? ZBA_HAS_EXTRA_BIT : 0); |
| 2184 | } |
| 2185 | |
| 2186 | /*! |
| 2187 | * @function zone_meta_bits_merge |
| 2188 | * |
| 2189 | * @brief |
| 2190 | * Adds elements <code>[start, end)</code> to a chunk being extended. |
| 2191 | * |
| 2192 | * @param meta The main metadata for the extended chunk. |
| 2193 | * @param start The index of the first element to add to the chunk. |
| 2194 | * @param end The index of the last (exclusive) element to add. |
| 2195 | */ |
| 2196 | static void |
| 2197 | zone_meta_bits_merge(struct zone_page_metadata *meta, |
| 2198 | uint32_t start, uint32_t end) |
| 2199 | { |
| 2200 | if (meta->zm_inline_bitmap) { |
| 2201 | while (start < end) { |
| 2202 | size_t s_i = start / 32; |
| 2203 | size_t s_e = end / 32; |
| 2204 | |
| 2205 | if (s_i == s_e) { |
| 2206 | meta[s_i].zm_bitmap |= zba_map_mask_lt(uint32_t, end) & |
| 2207 | zba_map_mask_ge(uint32_t, start); |
| 2208 | break; |
| 2209 | } |
| 2210 | |
| 2211 | meta[s_i].zm_bitmap |= zba_map_mask_ge(uint32_t, start); |
| 2212 | start += 32 - (start % 32); |
| 2213 | } |
| 2214 | } else { |
| 2215 | uint64_t *bits = zba_bits_ref_ptr(bref: meta->zm_bitmap); |
| 2216 | |
| 2217 | while (start < end) { |
| 2218 | size_t s_i = start / 64; |
| 2219 | size_t s_e = end / 64; |
| 2220 | |
| 2221 | if (s_i == s_e) { |
| 2222 | bits[s_i] |= zba_map_mask_lt(uint64_t, end) & |
| 2223 | zba_map_mask_ge(uint64_t, start); |
| 2224 | break; |
| 2225 | } |
| 2226 | bits[s_i] |= zba_map_mask_ge(uint64_t, start); |
| 2227 | start += 64 - (start % 64); |
| 2228 | } |
| 2229 | } |
| 2230 | } |
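| | /*
| | * Illustrative sketch (comment only, not compiled): merging elements
| | * [20, 40) into an inline-bitmap chunk performs the equivalent of:
| | *
| | *     meta[0].zm_bitmap |= zba_map_mask_ge(uint32_t, 20);   // elements 20..31
| | *     meta[1].zm_bitmap |= zba_map_mask_lt(uint32_t, 40) &
| | *         zba_map_mask_ge(uint32_t, 32);                    // 0xff, elements 32..39
| | */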
| 2231 | |
| 2232 | /*! |
| 2233 | * @function zone_bits_free |
| 2234 | * |
| 2235 | * @brief |
| 2236 | * Frees a bitmap to the zone bitmap allocator. |
| 2237 | * |
| 2238 | * @param bref |
| 2239 | * A bitmap reference set by @c zone_meta_bits_alloc_init() in a @c zm_bitmap field.
| 2240 | */ |
| 2241 | static void |
| 2242 | zone_bits_free(uint32_t bref) |
| 2243 | { |
| 2244 | zba_lock(); |
| 2245 | zba_free(addr: (vm_offset_t)zba_bits_ref_ptr(bref), |
| 2246 | order: zba_bits_ref_order(bref), with_extra: (bref & ZBA_HAS_EXTRA_BIT)); |
| 2247 | zba_unlock(); |
| 2248 | } |
| 2249 | |
| 2250 | /*! |
| 2251 | * @function zone_meta_is_free |
| 2252 | * |
| 2253 | * @brief |
| 2254 | * Returns whether a given element appears free. |
| 2255 | */ |
| 2256 | static bool |
| 2257 | zone_meta_is_free(struct zone_page_metadata *meta, vm_offset_t eidx) |
| 2258 | { |
| 2259 | if (meta->zm_inline_bitmap) { |
| 2260 | uint32_t bit = zba_map_bit(uint32_t, eidx); |
| 2261 | return meta[zba_map_index(uint32_t, eidx)].zm_bitmap & bit; |
| 2262 | } else { |
| 2263 | bitmap_t *bits = zba_bits_ref_ptr(bref: meta->zm_bitmap); |
| 2264 | uint64_t bit = zba_map_bit(uint64_t, eidx); |
| 2265 | return bits[zba_map_index(uint64_t, eidx)] & bit; |
| 2266 | } |
| 2267 | } |
| 2268 | |
| 2269 | /*! |
| 2270 | * @function zone_meta_mark_free |
| 2271 | * |
| 2272 | * @brief |
| 2273 | * Marks an element as free and returns whether it was marked as used. |
| 2274 | */ |
| 2275 | static bool |
| 2276 | zone_meta_mark_free(struct zone_page_metadata *meta, vm_offset_t eidx) |
| 2277 | { |
| 2278 | if (meta->zm_inline_bitmap) { |
| 2279 | uint32_t bit = zba_map_bit(uint32_t, eidx); |
| 2280 | if (meta[zba_map_index(uint32_t, eidx)].zm_bitmap & bit) { |
| 2281 | return false; |
| 2282 | } |
| 2283 | meta[zba_map_index(uint32_t, eidx)].zm_bitmap ^= bit; |
| 2284 | } else { |
| 2285 | bitmap_t *bits = zba_bits_ref_ptr(bref: meta->zm_bitmap); |
| 2286 | uint64_t bit = zba_map_bit(uint64_t, eidx); |
| 2287 | if (bits[zba_map_index(uint64_t, eidx)] & bit) { |
| 2288 | return false; |
| 2289 | } |
| 2290 | bits[zba_map_index(uint64_t, eidx)] ^= bit; |
| 2291 | } |
| 2292 | return true; |
| 2293 | } |
| 2294 | |
| 2295 | #if VM_TAG_SIZECLASSES |
| 2296 | |
| 2297 | __startup_func |
| 2298 | void |
| 2299 | __zone_site_register(vm_allocation_site_t *site) |
| 2300 | { |
| 2301 | if (zone_tagging_on) { |
| 2302 | vm_tag_alloc(site); |
| 2303 | } |
| 2304 | } |
| 2305 | |
| 2306 | uint16_t |
| 2307 | zone_index_from_tag_index(uint32_t sizeclass_idx) |
| 2308 | { |
| 2309 | return zone_tags_sizeclasses[sizeclass_idx]; |
| 2310 | } |
| 2311 | |
| 2312 | #endif /* VM_TAG_SIZECLASSES */ |
| 2313 | #endif /* !ZALLOC_TEST */ |
| 2314 | /*! @} */ |
| 2315 | #pragma mark zalloc helpers |
| 2316 | #if !ZALLOC_TEST |
| 2317 | |
| 2318 | static inline void * |
| 2319 | zstack_tbi_fix(vm_offset_t elem) |
| 2320 | { |
| 2321 | #if CONFIG_KERNEL_TAGGING |
| 2322 | elem = vm_memtag_fixup_ptr(elem); |
| 2323 | #endif /* CONFIG_KERNEL_TAGGING */ |
| 2324 | return (void *)elem; |
| 2325 | } |
| 2326 | |
| 2327 | static inline vm_offset_t |
| 2328 | zstack_tbi_fill(void *addr) |
| 2329 | { |
| 2330 | vm_offset_t elem = (vm_offset_t)addr; |
| 2331 | |
| 2332 | return vm_memtag_canonicalize_address(elem); |
| 2333 | } |
| 2334 | |
| 2335 | __attribute__((always_inline)) |
| 2336 | static inline void |
| 2337 | zstack_push_no_delta(zstack_t *stack, void *addr) |
| 2338 | { |
| 2339 | vm_offset_t elem = zstack_tbi_fill(addr); |
| 2340 | |
| 2341 | *(vm_offset_t *)addr = stack->z_head - elem; |
| 2342 | stack->z_head = elem; |
| 2343 | } |
| 2344 | |
| 2345 | __attribute__((always_inline)) |
| 2346 | void |
| 2347 | zstack_push(zstack_t *stack, void *addr) |
| 2348 | { |
| 2349 | zstack_push_no_delta(stack, addr); |
| 2350 | stack->z_count++; |
| 2351 | } |
| 2352 | |
| 2353 | __attribute__((always_inline)) |
| 2354 | static inline void * |
| 2355 | zstack_pop_no_delta(zstack_t *stack) |
| 2356 | { |
| 2357 | void *addr = zstack_tbi_fix(elem: stack->z_head); |
| 2358 | |
| 2359 | stack->z_head += *(vm_offset_t *)addr; |
| 2360 | *(vm_offset_t *)addr = 0; |
| 2361 | |
| 2362 | return addr; |
| 2363 | } |
| 2364 | |
| 2365 | __attribute__((always_inline)) |
| 2366 | void * |
| 2367 | zstack_pop(zstack_t *stack) |
| 2368 | { |
| 2369 | stack->z_count--; |
| 2370 | return zstack_pop_no_delta(stack); |
| 2371 | } |
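| | /*
| | * Illustrative sketch (comment only, not compiled): zstack chains elements
| | * by storing the delta to the previous head rather than a raw pointer
| | * (ignoring pointer tagging, where zstack_tbi_fill/fix are not identity):
| | *
| | *     zstack_t stack = { 0 };
| | *     zstack_push(&stack, a);        // *(vm_offset_t *)a = 0 - a,  z_head = a
| | *     zstack_push(&stack, b);        // *(vm_offset_t *)b = a - b,  z_head = b
| | *     void *e = zstack_pop(&stack);  // e == b,  z_head = b + (a - b) == a
| | */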
| 2372 | |
| 2373 | static inline void |
| 2374 | zone_recirc_lock_nopreempt_check_contention(zone_t zone) |
| 2375 | { |
| 2376 | uint32_t ticket; |
| 2377 | |
| 2378 | if (__probable(hw_lck_ticket_reserve_nopreempt(&zone->z_recirc_lock, |
| 2379 | &ticket, &zone_locks_grp))) { |
| 2380 | return; |
| 2381 | } |
| 2382 | |
| 2383 | hw_lck_ticket_wait(&zone->z_recirc_lock, ticket, NULL, &zone_locks_grp); |
| 2384 | |
| 2385 | /* |
| 2386 | * If zone caching has been disabled due to memory pressure, |
| 2387 | * then recording contention is not useful, give the system |
| 2388 | * time to recover. |
| 2389 | */ |
| 2390 | if (__probable(!zone_caching_disabled && !zone_exhausted(zone))) { |
| 2391 | zone->z_recirc_cont_cur++; |
| 2392 | } |
| 2393 | } |
| 2394 | |
| 2395 | static inline void |
| 2396 | zone_recirc_lock_nopreempt(zone_t zone) |
| 2397 | { |
| 2398 | hw_lck_ticket_lock_nopreempt(&zone->z_recirc_lock, &zone_locks_grp); |
| 2399 | } |
| 2400 | |
| 2401 | static inline void |
| 2402 | zone_recirc_unlock_nopreempt(zone_t zone) |
| 2403 | { |
| 2404 | hw_lck_ticket_unlock_nopreempt(tlock: &zone->z_recirc_lock); |
| 2405 | } |
| 2406 | |
| 2407 | static inline void |
| 2408 | zone_lock_nopreempt_check_contention(zone_t zone) |
| 2409 | { |
| 2410 | uint32_t ticket; |
| 2411 | #if KASAN_FAKESTACK |
| 2412 | spl_t s = 0; |
| 2413 | if (zone->z_kasan_fakestacks) { |
| 2414 | s = splsched(); |
| 2415 | } |
| 2416 | #endif /* KASAN_FAKESTACK */ |
| 2417 | |
| 2418 | if (__probable(hw_lck_ticket_reserve_nopreempt(&zone->z_lock, &ticket, |
| 2419 | &zone_locks_grp))) { |
| 2420 | #if KASAN_FAKESTACK |
| 2421 | zone->z_kasan_spl = s; |
| 2422 | #endif /* KASAN_FAKESTACK */ |
| 2423 | return; |
| 2424 | } |
| 2425 | |
| 2426 | hw_lck_ticket_wait(&zone->z_lock, ticket, NULL, &zone_locks_grp); |
| 2427 | #if KASAN_FAKESTACK |
| 2428 | zone->z_kasan_spl = s; |
| 2429 | #endif /* KASAN_FAKESTACK */ |
| 2430 | |
| 2431 | /* |
| 2432 | * If zone caching has been disabled due to memory pressure, |
| 2433 | * then recording contention is not useful, give the system |
| 2434 | * time to recover. |
| 2435 | */ |
| 2436 | if (__probable(!zone_caching_disabled && |
| 2437 | !zone->z_pcpu_cache && !zone_exhausted(zone))) { |
| 2438 | zone->z_recirc_cont_cur++; |
| 2439 | } |
| 2440 | } |
| 2441 | |
| 2442 | static inline void |
| 2443 | zone_lock_nopreempt(zone_t zone) |
| 2444 | { |
| 2445 | #if KASAN_FAKESTACK |
| 2446 | spl_t s = 0; |
| 2447 | if (zone->z_kasan_fakestacks) { |
| 2448 | s = splsched(); |
| 2449 | } |
| 2450 | #endif /* KASAN_FAKESTACK */ |
| 2451 | hw_lck_ticket_lock_nopreempt(&zone->z_lock, &zone_locks_grp); |
| 2452 | #if KASAN_FAKESTACK |
| 2453 | zone->z_kasan_spl = s; |
| 2454 | #endif /* KASAN_FAKESTACK */ |
| 2455 | } |
| 2456 | |
| 2457 | static inline void |
| 2458 | zone_unlock_nopreempt(zone_t zone) |
| 2459 | { |
| 2460 | #if KASAN_FAKESTACK |
| 2461 | spl_t s = zone->z_kasan_spl; |
| 2462 | zone->z_kasan_spl = 0; |
| 2463 | #endif /* KASAN_FAKESTACK */ |
| 2464 | hw_lck_ticket_unlock_nopreempt(tlock: &zone->z_lock); |
| 2465 | #if KASAN_FAKESTACK |
| 2466 | if (zone->z_kasan_fakestacks) { |
| 2467 | splx(s); |
| 2468 | } |
| 2469 | #endif /* KASAN_FAKESTACK */ |
| 2470 | } |
| 2471 | |
| 2472 | static inline void |
| 2473 | zone_depot_lock_nopreempt(zone_cache_t zc) |
| 2474 | { |
| 2475 | hw_lck_ticket_lock_nopreempt(&zc->zc_depot_lock, &zone_locks_grp); |
| 2476 | } |
| 2477 | |
| 2478 | static inline void |
| 2479 | zone_depot_unlock_nopreempt(zone_cache_t zc) |
| 2480 | { |
| 2481 | hw_lck_ticket_unlock_nopreempt(tlock: &zc->zc_depot_lock); |
| 2482 | } |
| 2483 | |
| 2484 | static inline void |
| 2485 | zone_depot_lock(zone_cache_t zc) |
| 2486 | { |
| 2487 | hw_lck_ticket_lock(&zc->zc_depot_lock, &zone_locks_grp); |
| 2488 | } |
| 2489 | |
| 2490 | static inline void |
| 2491 | zone_depot_unlock(zone_cache_t zc) |
| 2492 | { |
| 2493 | hw_lck_ticket_unlock(tlock: &zc->zc_depot_lock); |
| 2494 | } |
| 2495 | |
| 2496 | zone_t |
| 2497 | zone_by_id(size_t zid) |
| 2498 | { |
| 2499 | return (zone_t)((uintptr_t)zone_array + zid * sizeof(struct zone)); |
| 2500 | } |
| 2501 | |
| 2502 | static inline bool |
| 2503 | zone_supports_vm(zone_t z) |
| 2504 | { |
| 2505 | /* |
| 2506 | * VM_MAP_ENTRY and VM_MAP_HOLES zones are allowed |
| 2507 | * to overcommit because they're used to reclaim memory |
| 2508 | * (VM support). |
| 2509 | */ |
| 2510 | return z >= &zone_array[ZONE_ID_VM_MAP_ENTRY] && |
| 2511 | z <= &zone_array[ZONE_ID_VM_MAP_HOLES]; |
| 2512 | } |
| 2513 | |
| 2514 | const char * |
| 2515 | zone_name(zone_t z) |
| 2516 | { |
| 2517 | return z->z_name; |
| 2518 | } |
| 2519 | |
| 2520 | const char * |
| 2521 | zone_heap_name(zone_t z) |
| 2522 | { |
| 2523 | zone_security_flags_t zsflags = zone_security_config(z); |
| 2524 | if (__probable(zsflags.z_kheap_id < KHEAP_ID_COUNT)) { |
| 2525 | return kalloc_heap_names[zsflags.z_kheap_id]; |
| 2526 | } |
| 2527 | return "invalid" ; |
| 2528 | } |
| 2529 | |
| 2530 | static uint32_t |
| 2531 | zone_alloc_pages_for_nelems(zone_t z, vm_size_t max_elems) |
| 2532 | { |
| 2533 | vm_size_t elem_count, chunks; |
| 2534 | |
| 2535 | elem_count = ptoa(z->z_percpu ? 1 : z->z_chunk_pages) / |
| 2536 | zone_elem_outer_size(zone: z); |
| 2537 | chunks = (max_elems + elem_count - 1) / elem_count; |
| 2538 | |
| 2539 | return (uint32_t)MIN(UINT32_MAX, chunks * z->z_chunk_pages); |
| 2540 | } |
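| | /*
| | * Illustrative worked example for zone_alloc_pages_for_nelems() above
| | * (hypothetical numbers): with 4K pages, z_chunk_pages == 2 and a 128 byte
| | * outer element size, a chunk holds elem_count == 8192 / 128 == 64 elements;
| | * asking for max_elems == 1000 rounds up to chunks == 16, i.e. 32 pages.
| | */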
| 2541 | |
| 2542 | static inline vm_size_t |
| 2543 | zone_submaps_approx_size(void) |
| 2544 | { |
| 2545 | vm_size_t size = 0; |
| 2546 | |
| 2547 | for (unsigned idx = 0; idx < Z_SUBMAP_IDX_COUNT; idx++) { |
| 2548 | if (zone_submaps[idx] != VM_MAP_NULL) { |
| 2549 | size += zone_submaps[idx]->size; |
| 2550 | } |
| 2551 | } |
| 2552 | |
| 2553 | return size; |
| 2554 | } |
| 2555 | |
| 2556 | static inline void |
| 2557 | zone_depot_init(struct zone_depot *zd) |
| 2558 | { |
| 2559 | *zd = (struct zone_depot){ |
| 2560 | .zd_tail = &zd->zd_head, |
| 2561 | }; |
| 2562 | } |
| 2563 | |
| 2564 | static inline void |
| 2565 | zone_depot_insert_head_full(struct zone_depot *zd, zone_magazine_t mag) |
| 2566 | { |
| 2567 | if (zd->zd_full++ == 0) { |
| 2568 | zd->zd_tail = &mag->zm_next; |
| 2569 | } |
| 2570 | mag->zm_next = zd->zd_head; |
| 2571 | zd->zd_head = mag; |
| 2572 | } |
| 2573 | |
| 2574 | static inline void |
| 2575 | zone_depot_insert_tail_full(struct zone_depot *zd, zone_magazine_t mag) |
| 2576 | { |
| 2577 | zd->zd_full++; |
| 2578 | mag->zm_next = *zd->zd_tail; |
| 2579 | *zd->zd_tail = mag; |
| 2580 | zd->zd_tail = &mag->zm_next; |
| 2581 | } |
| 2582 | |
| 2583 | static inline void |
| 2584 | zone_depot_insert_head_empty(struct zone_depot *zd, zone_magazine_t mag) |
| 2585 | { |
| 2586 | zd->zd_empty++; |
| 2587 | mag->zm_next = *zd->zd_tail; |
| 2588 | *zd->zd_tail = mag; |
| 2589 | } |
| 2590 | |
| 2591 | static inline zone_magazine_t |
| 2592 | zone_depot_pop_head_full(struct zone_depot *zd, zone_t z) |
| 2593 | { |
| 2594 | zone_magazine_t mag = zd->zd_head; |
| 2595 | |
| 2596 | assert(zd->zd_full); |
| 2597 | |
| 2598 | zd->zd_full--; |
| 2599 | if (z && z->z_recirc_full_min > zd->zd_full) { |
| 2600 | z->z_recirc_full_min = zd->zd_full; |
| 2601 | } |
| 2602 | zd->zd_head = mag->zm_next; |
| 2603 | if (zd->zd_full == 0) { |
| 2604 | zd->zd_tail = &zd->zd_head; |
| 2605 | } |
| 2606 | |
| 2607 | mag->zm_next = NULL; |
| 2608 | return mag; |
| 2609 | } |
| 2610 | |
| 2611 | static inline zone_magazine_t |
| 2612 | zone_depot_pop_head_empty(struct zone_depot *zd, zone_t z) |
| 2613 | { |
| 2614 | zone_magazine_t mag = *zd->zd_tail; |
| 2615 | |
| 2616 | assert(zd->zd_empty); |
| 2617 | |
| 2618 | zd->zd_empty--; |
| 2619 | if (z && z->z_recirc_empty_min > zd->zd_empty) { |
| 2620 | z->z_recirc_empty_min = zd->zd_empty; |
| 2621 | } |
| 2622 | *zd->zd_tail = mag->zm_next; |
| 2623 | |
| 2624 | mag->zm_next = NULL; |
| 2625 | return mag; |
| 2626 | } |
| 2627 | |
| 2628 | static inline smr_seq_t |
| 2629 | zone_depot_move_full( |
| 2630 | struct zone_depot *dst, |
| 2631 | struct zone_depot *src, |
| 2632 | uint32_t n, |
| 2633 | zone_t z) |
| 2634 | { |
| 2635 | zone_magazine_t head, last; |
| 2636 | |
| 2637 | assert(n); |
| 2638 | assert(src->zd_full >= n); |
| 2639 | |
| 2640 | src->zd_full -= n; |
| 2641 | if (z && z->z_recirc_full_min > src->zd_full) { |
| 2642 | z->z_recirc_full_min = src->zd_full; |
| 2643 | } |
| 2644 | head = last = src->zd_head; |
| 2645 | for (uint32_t i = n; i-- > 1;) { |
| 2646 | last = last->zm_next; |
| 2647 | } |
| 2648 | |
| 2649 | src->zd_head = last->zm_next; |
| 2650 | if (src->zd_full == 0) { |
| 2651 | src->zd_tail = &src->zd_head; |
| 2652 | } |
| 2653 | |
| 2654 | if (z && zone_security_array[zone_index(z)].z_lifo) { |
| 2655 | if (dst->zd_full == 0) { |
| 2656 | dst->zd_tail = &last->zm_next; |
| 2657 | } |
| 2658 | last->zm_next = dst->zd_head; |
| 2659 | dst->zd_head = head; |
| 2660 | } else { |
| 2661 | last->zm_next = *dst->zd_tail; |
| 2662 | *dst->zd_tail = head; |
| 2663 | dst->zd_tail = &last->zm_next; |
| 2664 | } |
| 2665 | dst->zd_full += n; |
| 2666 | |
| 2667 | return last->zm_seq; |
| 2668 | } |
| 2669 | |
| 2670 | static inline void |
| 2671 | zone_depot_move_empty( |
| 2672 | struct zone_depot *dst, |
| 2673 | struct zone_depot *src, |
| 2674 | uint32_t n, |
| 2675 | zone_t z) |
| 2676 | { |
| 2677 | zone_magazine_t head, last; |
| 2678 | |
| 2679 | assert(n); |
| 2680 | assert(src->zd_empty >= n); |
| 2681 | |
| 2682 | src->zd_empty -= n; |
| 2683 | if (z && z->z_recirc_empty_min > src->zd_empty) { |
| 2684 | z->z_recirc_empty_min = src->zd_empty; |
| 2685 | } |
| 2686 | head = last = *src->zd_tail; |
| 2687 | for (uint32_t i = n; i-- > 1;) { |
| 2688 | last = last->zm_next; |
| 2689 | } |
| 2690 | |
| 2691 | *src->zd_tail = last->zm_next; |
| 2692 | |
| 2693 | dst->zd_empty += n; |
| 2694 | last->zm_next = *dst->zd_tail; |
| 2695 | *dst->zd_tail = head; |
| 2696 | } |
| 2697 | |
| 2698 | static inline bool |
| 2699 | zone_depot_poll(struct zone_depot *depot, smr_t smr) |
| 2700 | { |
| 2701 | if (depot->zd_full == 0) { |
| 2702 | return false; |
| 2703 | } |
| 2704 | |
| 2705 | return smr == NULL || smr_poll(smr, goal: depot->zd_head->zm_seq); |
| 2706 | } |
| 2707 | |
| 2708 | static void |
| 2709 | zone_cache_swap_magazines(zone_cache_t cache) |
| 2710 | { |
| 2711 | uint16_t count_a = cache->zc_alloc_cur; |
| 2712 | uint16_t count_f = cache->zc_free_cur; |
| 2713 | vm_offset_t *elems_a = cache->zc_alloc_elems; |
| 2714 | vm_offset_t *elems_f = cache->zc_free_elems; |
| 2715 | |
| 2716 | z_debug_assert(count_a <= zc_mag_size()); |
| 2717 | z_debug_assert(count_f <= zc_mag_size()); |
| 2718 | |
| 2719 | cache->zc_alloc_cur = count_f; |
| 2720 | cache->zc_free_cur = count_a; |
| 2721 | cache->zc_alloc_elems = elems_f; |
| 2722 | cache->zc_free_elems = elems_a; |
| 2723 | } |
| 2724 | |
| 2725 | __pure2 |
| 2726 | static smr_t |
| 2727 | zone_cache_smr(zone_cache_t cache) |
| 2728 | { |
| 2729 | return cache->zc_smr; |
| 2730 | } |
| 2731 | |
| 2732 | /*! |
| 2733 | * @function zone_magazine_replace |
| 2734 | * |
| 2735 | * @brief |
| 2736 | * Unload a magazine and load a new one instead.
| 2737 | */ |
| 2738 | static zone_magazine_t |
| 2739 | zone_magazine_replace(zone_cache_t zc, zone_magazine_t mag, bool empty) |
| 2740 | { |
| 2741 | zone_magazine_t old; |
| 2742 | vm_offset_t **elems; |
| 2743 | |
| 2744 | mag->zm_seq = SMR_SEQ_INVALID; |
| 2745 | |
| 2746 | if (empty) { |
| 2747 | elems = &zc->zc_free_elems; |
| 2748 | zc->zc_free_cur = 0; |
| 2749 | } else { |
| 2750 | elems = &zc->zc_alloc_elems; |
| 2751 | zc->zc_alloc_cur = zc_mag_size(); |
| 2752 | } |
| 2753 | old = (zone_magazine_t)((uintptr_t)*elems - |
| 2754 | offsetof(struct zone_magazine, zm_elems)); |
| 2755 | *elems = mag->zm_elems; |
| 2756 | |
| 2757 | return old; |
| 2758 | } |
| 2759 | |
| 2760 | static zone_magazine_t |
| 2761 | zone_magazine_alloc(zalloc_flags_t flags) |
| 2762 | { |
| 2763 | return zalloc_flags(zc_magazine_zone, flags | Z_ZERO); |
| 2764 | } |
| 2765 | |
| 2766 | static void |
| 2767 | zone_magazine_free(zone_magazine_t mag) |
| 2768 | { |
| 2769 | (zfree)(zone: zc_magazine_zone, elem: mag); |
| 2770 | } |
| 2771 | |
| 2772 | static void |
| 2773 | zone_magazine_free_list(struct zone_depot *zd) |
| 2774 | { |
| 2775 | zone_magazine_t tmp, mag = *zd->zd_tail; |
| 2776 | |
| 2777 | while (mag) { |
| 2778 | tmp = mag->zm_next; |
| 2779 | zone_magazine_free(mag); |
| 2780 | mag = tmp; |
| 2781 | } |
| 2782 | |
| 2783 | *zd->zd_tail = NULL; |
| 2784 | zd->zd_empty = 0; |
| 2785 | } |
| 2786 | |
| 2787 | void |
| 2788 | zone_enable_caching(zone_t zone) |
| 2789 | { |
| 2790 | size_t size_per_mag = zone_elem_inner_size(zone) * zc_mag_size(); |
| 2791 | zone_cache_t caches; |
| 2792 | size_t depot_limit; |
| 2793 | |
| 2794 | depot_limit = zc_pcpu_max() / size_per_mag; |
| 2795 | zone->z_depot_limit = (uint16_t)MIN(depot_limit, INT16_MAX); |
| 2796 | |
| 2797 | caches = zalloc_percpu_permanent_type(struct zone_cache); |
| 2798 | zpercpu_foreach(zc, caches) { |
| 2799 | zc->zc_alloc_elems = zone_magazine_alloc(flags: Z_WAITOK | Z_NOFAIL)->zm_elems; |
| 2800 | zc->zc_free_elems = zone_magazine_alloc(flags: Z_WAITOK | Z_NOFAIL)->zm_elems; |
| 2801 | zone_depot_init(zd: &zc->zc_depot); |
| 2802 | hw_lck_ticket_init(&zc->zc_depot_lock, &zone_locks_grp); |
| 2803 | } |
| 2804 | |
| 2805 | zone_lock(zone); |
| 2806 | assert(zone->z_pcpu_cache == NULL); |
| 2807 | zone->z_pcpu_cache = caches; |
| 2808 | zone->z_recirc_cont_cur = 0; |
| 2809 | zone->z_recirc_cont_wma = 0; |
| 2810 | zone->z_elems_free_min = 0; /* becomes z_recirc_empty_min */ |
| 2811 | zone->z_elems_free_wma = 0; /* becomes z_recirc_empty_wma */ |
| 2812 | zone_unlock(zone); |
| 2813 | } |
| 2814 | |
| 2815 | bool |
| 2816 | zone_maps_owned(vm_address_t addr, vm_size_t size) |
| 2817 | { |
| 2818 | return from_zone_map(addr, size); |
| 2819 | } |
| 2820 | |
| 2821 | #if KASAN_LIGHT |
| 2822 | bool |
| 2823 | kasan_zone_maps_owned(vm_address_t addr, vm_size_t size) |
| 2824 | { |
| 2825 | return from_zone_map(addr, size) || |
| 2826 | mach_vm_range_size(&zone_info.zi_map_range) == 0; |
| 2827 | } |
| 2828 | #endif /* KASAN_LIGHT */ |
| 2829 | |
| 2830 | void |
| 2831 | zone_map_sizes( |
| 2832 | vm_map_size_t *psize, |
| 2833 | vm_map_size_t *pfree, |
| 2834 | vm_map_size_t *plargest_free) |
| 2835 | { |
| 2836 | vm_map_size_t size, free, largest; |
| 2837 | |
| 2838 | vm_map_sizes(map: zone_submaps[0], psize, pfree, plargest_free); |
| 2839 | |
| 2840 | for (uint32_t i = 1; i < Z_SUBMAP_IDX_COUNT; i++) { |
| 2841 | vm_map_sizes(map: zone_submaps[i], psize: &size, pfree: &free, plargest_free: &largest); |
| 2842 | *psize += size; |
| 2843 | *pfree += free; |
| 2844 | *plargest_free = MAX(*plargest_free, largest); |
| 2845 | } |
| 2846 | } |
| 2847 | |
| 2848 | __attribute__((always_inline)) |
| 2849 | vm_map_t |
| 2850 | zone_submap(zone_security_flags_t zsflags) |
| 2851 | { |
| 2852 | return zone_submaps[zsflags.z_submap_idx]; |
| 2853 | } |
| 2854 | |
| 2855 | unsigned |
| 2856 | zpercpu_count(void) |
| 2857 | { |
| 2858 | return zpercpu_early_count; |
| 2859 | } |
| 2860 | |
| 2861 | #if ZSECURITY_CONFIG(SAD_FENG_SHUI) || CONFIG_PROB_GZALLOC |
| 2862 | /* |
| 2863 | * Returns a random number of a given bit-width. |
| 2864 | * |
| 2865 | * DO NOT COPY THIS CODE OUTSIDE OF ZALLOC |
| 2866 | * |
| 2867 | * This uses Intel's rdrand because random() uses FP registers,
| 2868 | * which causes FP faults and allocations, neither of which is something
| 2869 | * we can do from zalloc itself due to reentrancy problems.
| 2870 | *
| 2871 | * For pre-rdrand machines, we use a poor, biased random generator
| 2872 | * that doesn't use FP. Such HW is no longer supported, but VMs of
| 2873 | * newer OSes on older bare metal are made to limp along
| 2874 | * (with reduced security) this way.
| 2875 | */ |
| 2876 | static uint64_t |
| 2877 | zalloc_random_mask64(uint32_t bits) |
| 2878 | { |
| 2879 | uint64_t mask = ~0ull >> (64 - bits); |
| 2880 | uint64_t v; |
| 2881 | |
| 2882 | #if __x86_64__ |
| 2883 | if (__probable(cpuid_features() & CPUID_FEATURE_RDRAND)) { |
| 2884 | asm volatile ("1: rdrand %0; jnc 1b\n" : "=r" (v) :: "cc" ); |
| 2885 | v &= mask; |
| 2886 | } else { |
| 2887 | disable_preemption(); |
| 2888 | int cpu = cpu_number(); |
| 2889 | v = random_bool_gen_bits(&zone_bool_gen[cpu].zbg_bg, |
| 2890 | zone_bool_gen[cpu].zbg_entropy, |
| 2891 | ZONE_ENTROPY_CNT, bits); |
| 2892 | enable_preemption(); |
| 2893 | } |
| 2894 | #else |
| 2895 | v = early_random() & mask; |
| 2896 | #endif |
| 2897 | |
| 2898 | return v; |
| 2899 | } |
| 2900 | |
| 2901 | /* |
| 2902 | * Returns a random number within [bound_min, bound_max) |
| 2903 | * |
| 2904 | * This isn't _exactly_ uniform, but the skew is small enough |
| 2905 | * not to matter for the consumers of this interface. |
| 2906 | * |
| 2907 | * Values within [bound_min, bound_min + 2^64 % (bound_max - bound_min))
| 2908 | * will be returned about (bound_max - bound_min) / 2^64 more often (relatively)
| 2909 | * than values within [bound_min + 2^64 % (bound_max - bound_min), bound_max).
| 2910 | */ |
| 2911 | static uint32_t |
| 2912 | zalloc_random_uniform32(uint32_t bound_min, uint32_t bound_max) |
| 2913 | { |
| 2914 | uint64_t delta = bound_max - bound_min; |
| 2915 | |
| 2916 | return bound_min + (uint32_t)(zalloc_random_mask64(bits: 64) % delta); |
| 2917 | } |
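| | /*
| | * Illustrative worked example: with bound_min == 0 and bound_max == 3,
| | * 2^64 == 3 * q + 1, so the value 0 is produced for q + 1 of the 2^64
| | * possible masks while 1 and 2 are each produced q times: a relative
| | * skew of 1 / q, i.e. roughly 3 / 2^64.
| | */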
| 2918 | |
| 2919 | #endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) || CONFIG_PROB_GZALLOC */ |
| 2920 | #if ZALLOC_ENABLE_LOGGING || CONFIG_PROB_GZALLOC |
| 2921 | /* |
| 2922 | * Track all kalloc zones of specified size for zlog name |
| 2923 | * kalloc.type.<size> or kalloc.type.var.<size> or kalloc.<size> |
| 2924 | * |
| 2925 | * Additionally track all shared kalloc zones with shared.kalloc |
| 2926 | */ |
| 2927 | static bool |
| 2928 | track_kalloc_zones(zone_t z, const char *logname) |
| 2929 | { |
| 2930 | const char *prefix; |
| 2931 | size_t len; |
| 2932 | zone_security_flags_t zsflags = zone_security_config(z); |
| 2933 | |
| 2934 | prefix = "kalloc.type.var." ; |
| 2935 | len = strlen(prefix); |
| 2936 | if (zsflags.z_kalloc_type && zsflags.z_kheap_id == KHEAP_ID_KT_VAR && |
| 2937 | strncmp(logname, prefix, len) == 0) { |
| 2938 | vm_size_t sizeclass = strtoul(logname + len, NULL, 0); |
| 2939 | |
| 2940 | return zone_elem_inner_size(z) == sizeclass; |
| 2941 | } |
| 2942 | |
| 2943 | prefix = "kalloc.type." ; |
| 2944 | len = strlen(prefix); |
| 2945 | if (zsflags.z_kalloc_type && zsflags.z_kheap_id != KHEAP_ID_KT_VAR && |
| 2946 | strncmp(logname, prefix, len) == 0) { |
| 2947 | vm_size_t sizeclass = strtoul(logname + len, NULL, 0); |
| 2948 | |
| 2949 | return zone_elem_inner_size(z) == sizeclass; |
| 2950 | } |
| 2951 | |
| 2952 | prefix = "kalloc." ; |
| 2953 | len = strlen(prefix); |
| 2954 | if ((zsflags.z_kheap_id || zsflags.z_kalloc_type) && |
| 2955 | strncmp(logname, prefix, len) == 0) { |
| 2956 | vm_size_t sizeclass = strtoul(logname + len, NULL, 0); |
| 2957 | |
| 2958 | return zone_elem_inner_size(z) == sizeclass; |
| 2959 | } |
| 2960 | |
| 2961 | prefix = "shared.kalloc" ; |
| 2962 | if ((zsflags.z_kheap_id == KHEAP_ID_SHARED) && |
| 2963 | (strcmp(logname, prefix) == 0)) { |
| 2964 | return true; |
| 2965 | } |
| 2966 | |
| 2967 | return false; |
| 2968 | } |
| 2969 | #endif |
| 2970 | |
| 2971 | int |
| 2972 | track_this_zone(const char *zonename, const char *logname) |
| 2973 | { |
| 2974 | unsigned int len; |
| 2975 | const char *zc = zonename; |
| 2976 | const char *lc = logname; |
| 2977 | |
| 2978 | /* |
| 2979 | * Compare the strings. We bound the compare by MAX_ZONE_NAME. |
| 2980 | */ |
| 2981 | |
| 2982 | for (len = 1; len <= MAX_ZONE_NAME; zc++, lc++, len++) { |
| 2983 | /* |
| 2984 | * If the current characters don't match, check for a space
| 2985 | * in the zone name and a corresponding period in the log name.
| 2986 | * If that's not there, then the strings don't match. |
| 2987 | */ |
| 2988 | |
| 2989 | if (*zc != *lc && !(*zc == ' ' && *lc == '.')) { |
| 2990 | break; |
| 2991 | } |
| 2992 | |
| 2993 | /* |
| 2994 | * The strings are equal so far. If we're at the end, then it's a match. |
| 2995 | */ |
| 2996 | |
| 2997 | if (*zc == '\0') { |
| 2998 | return TRUE; |
| 2999 | } |
| 3000 | } |
| 3001 | |
| 3002 | return FALSE; |
| 3003 | } |
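| | /*
| | * Illustrative example (hypothetical zone name): a zone named "vm objects"
| | * would be selected with zlog1=vm.objects, since track_this_zone() treats
| | * a '.' in the log name as matching a ' ' in the zone name.
| | */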
| 3004 | |
| 3005 | #if DEBUG || DEVELOPMENT |
| 3006 | |
| 3007 | vm_size_t |
| 3008 | zone_element_info(void *addr, vm_tag_t * ptag) |
| 3009 | { |
| 3010 | vm_size_t size = 0; |
| 3011 | vm_tag_t tag = VM_KERN_MEMORY_NONE; |
| 3012 | struct zone *src_zone; |
| 3013 | |
| 3014 | if (from_zone_map(addr, sizeof(void *))) { |
| 3015 | src_zone = zone_by_id(zone_index_from_ptr(addr)); |
| 3016 | size = zone_elem_inner_size(src_zone); |
| 3017 | #if VM_TAG_SIZECLASSES |
| 3018 | if (__improbable(src_zone->z_uses_tags)) { |
| 3019 | struct zone_page_metadata *meta; |
| 3020 | vm_offset_t eidx; |
| 3021 | vm_tag_t *slot; |
| 3022 | |
| 3023 | meta = zone_element_resolve(src_zone, |
| 3024 | (vm_offset_t)addr, &eidx); |
| 3025 | slot = zba_extra_ref_ptr(meta->zm_bitmap, eidx); |
| 3026 | tag = *slot; |
| 3027 | } |
| 3028 | #endif /* VM_TAG_SIZECLASSES */ |
| 3029 | } |
| 3030 | |
| 3031 | *ptag = tag; |
| 3032 | return size; |
| 3033 | } |
| 3034 | |
| 3035 | #endif /* DEBUG || DEVELOPMENT */ |
| 3036 | #if KASAN_CLASSIC |
| 3037 | |
| 3038 | vm_size_t |
| 3039 | kasan_quarantine_resolve(vm_address_t addr, zone_t *zonep) |
| 3040 | { |
| 3041 | zone_t zone = zone_by_id(zone_index_from_ptr((void *)addr)); |
| 3042 | |
| 3043 | *zonep = zone; |
| 3044 | return zone_elem_inner_size(zone); |
| 3045 | } |
| 3046 | |
| 3047 | #endif /* KASAN_CLASSIC */ |
| 3048 | #endif /* !ZALLOC_TEST */ |
| 3049 | #pragma mark Zone zeroing and early random |
| 3050 | #if !ZALLOC_TEST |
| 3051 | |
| 3052 | /* |
| 3053 | * Zone zeroing |
| 3054 | * |
| 3055 | * All allocations from zones are zeroed on free and are additionally
| 3056 | * checked to still be zero on alloc. The check is always on
| 3057 | * on embedded devices. A perf regression was detected on Intel
| 3058 | * because we can't use the vectorized implementation of
| 3059 | * memcmp_zero_ptr_aligned due to cyclic dependencies between
| 3060 | * initialization and allocation. Therefore we perform the check
| 3061 | * on only 20% of the allocations there.
| 3062 | */ |
| 3063 | #if ZALLOC_ENABLE_ZERO_CHECK |
| 3064 | #if defined(__x86_64__) |
| 3065 | /* |
| 3066 | * Perform zero validation on every 5th allocation
| 3067 | */ |
| 3068 | static TUNABLE(uint32_t, zzc_rate, "zzc_rate" , 5); |
| 3069 | static uint32_t PERCPU_DATA(zzc_decrementer); |
| 3070 | #endif /* defined(__x86_64__) */ |
| 3071 | |
| 3072 | /* |
| 3073 | * Determine if zero validation for allocation should be skipped |
| 3074 | */ |
| 3075 | static bool |
| 3076 | zalloc_skip_zero_check(void) |
| 3077 | { |
| 3078 | #if defined(__x86_64__) |
| 3079 | uint32_t *counterp, cnt; |
| 3080 | |
| 3081 | counterp = PERCPU_GET(zzc_decrementer); |
| 3082 | cnt = *counterp; |
| 3083 | if (__probable(cnt > 0)) { |
| 3084 | *counterp = cnt - 1; |
| 3085 | return true; |
| 3086 | } |
| 3087 | *counterp = zzc_rate - 1; |
| 3088 | #endif /* defined(__x86_64__) */
| 3089 | return false; |
| 3090 | } |
| 3091 | |
| 3092 | __abortlike |
| 3093 | static void |
| 3094 | zalloc_uaf_panic(zone_t z, uintptr_t elem, size_t size) |
| 3095 | { |
| 3096 | uint32_t esize = (uint32_t)zone_elem_inner_size(zone: z); |
| 3097 | uint32_t first_offs = ~0u; |
| 3098 | uintptr_t first_bits = 0, v; |
| 3099 | char buf[1024]; |
| 3100 | int pos = 0; |
| 3101 | |
| 3102 | buf[0] = '\0'; |
| 3103 | |
| 3104 | for (uint32_t o = 0; o < size; o += sizeof(v)) { |
| 3105 | if ((v = *(uintptr_t *)(elem + o)) == 0) { |
| 3106 | continue; |
| 3107 | } |
| 3108 | pos += scnprintf(buf + pos, sizeof(buf) - pos, "\n" |
| 3109 | "%5d: 0x%016lx" , o, v); |
| 3110 | if (first_offs > o) { |
| 3111 | first_offs = o; |
| 3112 | first_bits = v; |
| 3113 | } |
| 3114 | } |
| 3115 | |
| 3116 | (panic)(string: "[%s%s]: element modified after free " |
| 3117 | "(off:%d, val:0x%016lx, sz:%d, ptr:%p)%s" , |
| 3118 | zone_heap_name(z), zone_name(z), |
| 3119 | first_offs, first_bits, esize, (void *)elem, buf); |
| 3120 | } |
| 3121 | |
| 3122 | static void |
| 3123 | zalloc_validate_element( |
| 3124 | zone_t zone, |
| 3125 | vm_offset_t elem, |
| 3126 | vm_size_t size, |
| 3127 | zalloc_flags_t flags) |
| 3128 | { |
| 3129 | if (flags & Z_NOZZC) { |
| 3130 | return; |
| 3131 | } |
| 3132 | if (memcmp_zero_ptr_aligned(s: (void *)elem, n: size)) { |
| 3133 | zalloc_uaf_panic(z: zone, elem, size); |
| 3134 | } |
| 3135 | if (flags & Z_PCPU) { |
| 3136 | for (size_t i = zpercpu_count(); --i > 0;) { |
| 3137 | elem += PAGE_SIZE; |
| 3138 | if (memcmp_zero_ptr_aligned(s: (void *)elem, n: size)) { |
| 3139 | zalloc_uaf_panic(z: zone, elem, size); |
| 3140 | } |
| 3141 | } |
| 3142 | } |
| 3143 | } |
| 3144 | |
| 3145 | #endif /* ZALLOC_ENABLE_ZERO_CHECK */ |
| 3146 | |
| 3147 | __attribute__((noinline)) |
| 3148 | static void |
| 3149 | zone_early_scramble_rr(zone_t zone, int cpu, zone_stats_t zs) |
| 3150 | { |
| 3151 | #if KASAN_FAKESTACK |
| 3152 | /* |
| 3153 | * This can cause re-entrancy with kasan fakestacks |
| 3154 | */ |
| 3155 | #pragma unused(zone, cpu, zs) |
| 3156 | #else |
| 3157 | uint32_t bits; |
| 3158 | |
| 3159 | bits = random_bool_gen_bits(&zone_bool_gen[cpu].zbg_bg,
| 3160 | zone_bool_gen[cpu].zbg_entropy, ZONE_ENTROPY_CNT, 8);
| 3161 | |
| 3162 | zs->zs_alloc_rr += bits; |
| 3163 | zs->zs_alloc_rr %= zone->z_chunk_elems; |
| 3164 | #endif |
| 3165 | } |
| 3166 | |
| 3167 | #endif /* !ZALLOC_TEST */ |
| 3168 | #pragma mark Zone Leak Detection |
| 3169 | #if !ZALLOC_TEST |
| 3170 | #if ZALLOC_ENABLE_LOGGING || CONFIG_ZLEAKS |
| 3171 | |
| 3172 | /* |
| 3173 | * Zone leak debugging code |
| 3174 | * |
| 3175 | * When enabled, this code keeps a log to track allocations to a particular |
| 3176 | * zone that have not yet been freed. |
| 3177 | * |
| 3178 | * Examining this log will reveal the source of a zone leak. |
| 3179 | * |
| 3180 | * The log is allocated only when logging is enabled (it is off by default), |
| 3181 | * so there is no effect on the system when it's turned off. |
| 3182 | * |
| 3183 | * Zone logging is enabled with the `zlog<n>=<zone>` boot-arg for each |
| 3184 | * zone name to log, with n starting at 1. |
| 3185 | * |
| 3186 | * Leaks debugging utilizes 2 tunables: |
| 3187 | * - zlsize (in kB) which describes how much "size" the record covers |
| 3188 | * (zones with smaller elements get more records, default is 4M). |
| 3189 | * |
| 3190 | * - zlfreq (in bytes) which describes a sample rate in cumulative allocation |
| 3191 | * size at which automatic leak detection will sample allocations. |
| 3192 | * (default is 8k) |
| 3193 | * |
| 3194 | * |
| 3195 | * Zone corruption logging |
| 3196 | * |
| 3197 | * Logging can also be used to help identify the source of a zone corruption. |
| 3198 | * |
| 3199 | * First, identify the zone that is being corrupted, |
| 3200 | * then add "-zc zlog<n>=<zone name>" to the boot-args. |
| 3201 | * |
| 3202 | * When -zc is used in conjunction with zlog, |
| 3203 | * it changes the logging style to track both allocations and frees to the zone. |
| 3204 | * |
| 3205 | * When the corruption is detected, examining the log will show you the stack |
| 3206 | * traces of the callers who last allocated and freed any particular element in |
| 3207 | * the zone. |
| 3208 | * |
| 3209 | * Corruption debugging logs will have zrecs records |
| 3210 | * (tuned by the zrecs= boot-arg, 16k elements per G of RAM by default). |
| 3211 | */ |
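|      | /*
|      |  * Example boot-args (the zone name below is illustrative only):
|      |  *
|      |  *   zlog1=vm.objects zlsize=8192        leak-style logging for one zone,
|      |  *                                       with records covering ~8MB
|      |  *   -zc zlog1=vm.objects                corruption-style logging, keeping
|      |  *                                       zrecs alloc/free backtraces
|      |  */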
| 3212 | |
| 3213 | #define ZRECORDS_MAX (256u << 10) |
| 3214 | #define ZRECORDS_DEFAULT (16u << 10) |
| 3215 | static TUNABLE(uint32_t, zrecs, "zrecs" , 0); |
| 3216 | static TUNABLE(uint32_t, zlsize, "zlsize" , 4 * 1024); |
| 3217 | static TUNABLE(uint32_t, zlfreq, "zlfreq" , 8 * 1024); |
| 3218 | |
| 3219 | __startup_func |
| 3220 | static void |
| 3221 | zone_leaks_init_zrecs(void) |
| 3222 | { |
| 3223 | /* |
| 3224 | * Don't allow more than ZRECORDS_MAX records, |
| 3225 | * even if the user asked for more. |
| 3226 | * |
| 3227 | * This prevents accidentally hogging too much kernel memory |
| 3228 | * and making the system unusable. |
| 3229 | */ |
| 3230 | if (zrecs == 0) { |
| 3231 | zrecs = ZRECORDS_DEFAULT * |
| 3232 | (uint32_t)((max_mem + (1ul << 30)) >> 30); |
| 3233 | } |
| 3234 | if (zrecs > ZRECORDS_MAX) { |
| 3235 | zrecs = ZRECORDS_MAX; |
| 3236 | } |
| 3237 | } |
| 3238 | STARTUP(TUNABLES, STARTUP_RANK_MIDDLE, zone_leaks_init_zrecs); |
| 3239 | |
| 3240 | static uint32_t |
| 3241 | zone_leaks_record_count(zone_t z) |
| 3242 | { |
| 3243 | uint32_t recs = (zlsize << 10) / zone_elem_inner_size(z); |
| 3244 | |
| 3245 | return MIN(MAX(recs, ZRECORDS_DEFAULT), ZRECORDS_MAX); |
| 3246 | } |
| 3247 | |
| 3248 | static uint32_t |
| 3249 | zone_leaks_sample_rate(zone_t z) |
| 3250 | { |
| 3251 | return zlfreq / zone_elem_inner_size(z); |
| 3252 | } |
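|      | /*
|      |  * Worked example with the defaults (zlsize=4096kB, zlfreq=8192 bytes):
|      |  * for a hypothetical zone of 128-byte elements,
|      |  * zone_leaks_record_count() returns (4096 << 10) / 128 = 32768 records
|      |  * (already inside the [ZRECORDS_DEFAULT, ZRECORDS_MAX] clamp), and
|      |  * zone_leaks_sample_rate() returns 8192 / 128 = 64, i.e. roughly one
|      |  * allocation in 64 gets sampled.
|      |  */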
| 3253 | |
| 3254 | #if ZALLOC_ENABLE_LOGGING |
| 3255 | /* Log allocations and frees to help debug a zone element corruption */ |
| 3256 | static TUNABLE(bool, corruption_debug_flag, "-zc" , false); |
| 3257 | |
| 3258 | /* |
| 3259 | * A maximum of 10 zlog<n> boot args can be provided (zlog1 -> zlog10) |
| 3260 | */ |
| 3261 | #define MAX_ZONES_LOG_REQUESTS 10 |
| 3262 | |
| 3263 | /** |
| 3264 | * @function zone_setup_logging |
| 3265 | * |
| 3266 | * @abstract |
| 3267 | * Optionally sets up a zone for logging. |
| 3268 | * |
| 3269 | * @discussion |
| 3270 | * We recognize the following boot-args:
| 3271 | * |
| 3272 | * zlog=<zone_to_log> |
| 3273 | * zrecs=<num_records_in_log> |
| 3274 | * zlsize=<memory to cover for leaks> |
| 3275 | * |
| 3276 | * The zlog arg is used to specify the zone name that should be logged, |
| 3277 | * and zrecs/zlsize is used to control the size of the log. |
| 3278 | */ |
| 3279 | static void |
| 3280 | zone_setup_logging(zone_t z) |
| 3281 | { |
| 3282 | char zone_name[MAX_ZONE_NAME]; /* Temp. buffer for the zone name */ |
| 3283 | char zlog_name[MAX_ZONE_NAME]; /* Temp. buffer to create the strings zlog1, zlog2 etc... */ |
| 3284 | char zlog_val[MAX_ZONE_NAME]; /* the zone name we're logging, if any */ |
| 3285 | bool logging_on = false; |
| 3286 | |
| 3287 | /* |
| 3288 | * Append kalloc heap name to zone name (if zone is used by kalloc) |
| 3289 | */ |
| 3290 | snprintf(zone_name, MAX_ZONE_NAME, "%s%s" , zone_heap_name(z), z->z_name); |
| 3291 | |
| 3292 | /* zlog0 isn't allowed. */ |
| 3293 | for (int i = 1; i <= MAX_ZONES_LOG_REQUESTS; i++) { |
| 3294 | snprintf(zlog_name, MAX_ZONE_NAME, "zlog%d" , i); |
| 3295 | |
| 3296 | if (PE_parse_boot_argn(zlog_name, zlog_val, sizeof(zlog_val))) { |
| 3297 | if (track_this_zone(zone_name, zlog_val) || |
| 3298 | track_kalloc_zones(z, zlog_val)) { |
| 3299 | logging_on = true; |
| 3300 | break; |
| 3301 | } |
| 3302 | } |
| 3303 | } |
| 3304 | |
| 3305 | /* |
| 3306 | * Backwards compat. with the old boot-arg used to specify single zone |
| 3307 | * logging, i.e. "zlog". This needs to happen after the newer zlog<n>
| 3308 | * checks because the "zlog" prefix will match all the zlog<n>
| 3309 | * boot-args. |
| 3310 | */ |
| 3311 | if (!logging_on && |
| 3312 | PE_parse_boot_argn("zlog" , zlog_val, sizeof(zlog_val))) { |
| 3313 | if (track_this_zone(zone_name, zlog_val) || |
| 3314 | track_kalloc_zones(z, zlog_val)) { |
| 3315 | logging_on = true; |
| 3316 | } |
| 3317 | } |
| 3318 | |
| 3319 | /* |
| 3320 | * If we want to log a zone, see if we need to allocate buffer space for |
| 3321 | * the log. |
| 3322 | * |
| 3323 | * Some vm related zones are zinit'ed before we can do a kmem_alloc, so |
| 3324 | * we have to defer allocation in that case. |
| 3325 | * |
| 3326 | * zone_init() will finish the job. |
| 3327 | * |
| 3328 | * If we want to log one of the VM related zones that's set up early on, |
| 3329 | * we will skip allocation of the log until zinit is called again later |
| 3330 | * on some other zone. |
| 3331 | */ |
| 3332 | if (logging_on) { |
| 3333 | if (corruption_debug_flag) { |
| 3334 | z->z_btlog = btlog_create(BTLOG_LOG, zrecs, 0); |
| 3335 | } else { |
| 3336 | z->z_btlog = btlog_create(BTLOG_HASH, |
| 3337 | zone_leaks_record_count(z), 0); |
| 3338 | } |
| 3339 | if (z->z_btlog) { |
| 3340 | z->z_log_on = true; |
| 3341 | printf("zone[%s%s]: logging enabled\n" , |
| 3342 | zone_heap_name(z), z->z_name); |
| 3343 | } else { |
| 3344 | printf("zone[%s%s]: failed to enable logging\n" , |
| 3345 | zone_heap_name(z), z->z_name); |
| 3346 | } |
| 3347 | } |
| 3348 | } |
| 3349 | |
| 3350 | #endif /* ZALLOC_ENABLE_LOGGING */ |
| 3351 | #if KASAN_TBI |
| 3352 | static TUNABLE(uint32_t, kasan_zrecs, "kasan_zrecs" , 0); |
| 3353 | |
| 3354 | __startup_func |
| 3355 | static void |
| 3356 | kasan_tbi_init_zrecs(void) |
| 3357 | { |
| 3358 | /* |
| 3359 | * Don't allow more than ZRECORDS_MAX records, |
| 3360 | * even if the user asked for more. |
| 3361 | * |
| 3362 | * This prevents accidentally hogging too much kernel memory |
| 3363 | * and making the system unusable. |
| 3364 | */ |
| 3365 | if (kasan_zrecs == 0) { |
| 3366 | kasan_zrecs = ZRECORDS_DEFAULT * |
| 3367 | (uint32_t)((max_mem + (1ul << 30)) >> 30); |
| 3368 | } |
| 3369 | if (kasan_zrecs > ZRECORDS_MAX) { |
| 3370 | kasan_zrecs = ZRECORDS_MAX; |
| 3371 | } |
| 3372 | } |
| 3373 | STARTUP(TUNABLES, STARTUP_RANK_MIDDLE, kasan_tbi_init_zrecs); |
| 3374 | |
| 3375 | static void |
| 3376 | zone_setup_kasan_logging(zone_t z) |
| 3377 | { |
| 3378 | if (!z->z_tbi_tag) { |
| 3379 | printf("zone[%s%s]: kasan logging disabled for this zone\n" , |
| 3380 | zone_heap_name(z), z->z_name); |
| 3381 | return; |
| 3382 | } |
| 3383 | |
| 3384 | z->z_log_on = true; |
| 3385 | z->z_btlog = btlog_create(BTLOG_LOG, kasan_zrecs, 0); |
| 3386 | if (!z->z_btlog) { |
| 3387 | printf("zone[%s%s]: failed to enable kasan logging\n" , |
| 3388 | zone_heap_name(z), z->z_name); |
| 3389 | } |
| 3390 | } |
| 3391 | |
| 3392 | #endif /* KASAN_TBI */ |
| 3393 | #if CONFIG_ZLEAKS |
| 3394 | |
| 3395 | static thread_call_data_t zone_leaks_callout; |
| 3396 | |
| 3397 | /* |
| 3398 | * The zone leak detector, abbreviated 'zleak', keeps track |
| 3399 | * of a subset of the currently outstanding allocations |
| 3400 | * made by the zone allocator. |
| 3401 | * |
| 3402 | * Zones who use more than zleak_pages_per_zone_wired_threshold |
| 3403 | * pages will get a BTLOG_HASH btlog with sampling to minimize |
| 3404 | * perf impact, yet receive statistical data about the backtrace |
| 3405 | * that is the most likely to cause the leak. |
| 3406 | * |
| 3407 | * If the zone goes under the threshold enough, then the log |
| 3408 | * is disabled and backtraces freed. Data can be collected |
| 3409 | * from userspace with the zlog(1) command. |
| 3410 | */ |
| 3411 | |
| 3412 | uint32_t zleak_active; |
| 3413 | SECURITY_READ_ONLY_LATE(vm_size_t) zleak_max_zonemap_size; |
| 3414 | |
| 3415 | /* Size a zone will have before we will collect data on it */ |
| 3416 | static size_t zleak_pages_per_zone_wired_threshold = ~0; |
| 3417 | vm_size_t zleak_per_zone_tracking_threshold = ~0; |
| 3418 | |
| 3419 | static inline bool |
| 3420 | zleak_should_enable_for_zone(zone_t z) |
| 3421 | { |
| 3422 | if (z->z_log_on) { |
| 3423 | return false; |
| 3424 | } |
| 3425 | if (z->z_btlog) { |
| 3426 | return false; |
| 3427 | } |
| 3428 | if (z->z_exhausts) { |
| 3429 | return false; |
| 3430 | } |
| 3431 | if (zone_exhaustible(z)) { |
| 3432 | return z->z_wired_cur * 8 >= z->z_wired_max * 7; |
| 3433 | } |
| 3434 | return z->z_wired_cur >= zleak_pages_per_zone_wired_threshold; |
| 3435 | } |
| 3436 | |
| 3437 | static inline bool |
| 3438 | zleak_should_disable_for_zone(zone_t z) |
| 3439 | { |
| 3440 | if (z->z_log_on) { |
| 3441 | return false; |
| 3442 | } |
| 3443 | if (!z->z_btlog) { |
| 3444 | return false; |
| 3445 | } |
| 3446 | if (zone_exhaustible(z)) { |
| 3447 | return z->z_wired_cur * 8 < z->z_wired_max * 7; |
| 3448 | } |
| 3449 | return z->z_wired_cur < zleak_pages_per_zone_wired_threshold / 2; |
| 3450 | } |
| 3451 | |
| 3452 | static void |
| 3453 | zleaks_enable_async(__unused thread_call_param_t p0, __unused thread_call_param_t p1) |
| 3454 | { |
| 3455 | btlog_t log; |
| 3456 | |
| 3457 | zone_foreach(z) { |
| 3458 | if (zleak_should_disable_for_zone(z)) { |
| 3459 | log = z->z_btlog; |
| 3460 | z->z_btlog = NULL; |
| 3461 | assert(z->z_btlog_disabled == NULL); |
| 3462 | btlog_disable(log); |
| 3463 | z->z_btlog_disabled = log; |
| 3464 | os_atomic_dec(&zleak_active, relaxed); |
| 3465 | } |
| 3466 | |
| 3467 | if (zleak_should_enable_for_zone(z)) { |
| 3468 | log = z->z_btlog_disabled; |
| 3469 | if (log == NULL) { |
| 3470 | log = btlog_create(BTLOG_HASH, |
| 3471 | zone_leaks_record_count(z), |
| 3472 | zone_leaks_sample_rate(z)); |
| 3473 | } else if (btlog_enable(log) == KERN_SUCCESS) { |
| 3474 | z->z_btlog_disabled = NULL; |
| 3475 | } else { |
| 3476 | log = NULL; |
| 3477 | } |
| 3478 | os_atomic_store(&z->z_btlog, log, release); |
| 3479 | os_atomic_inc(&zleak_active, relaxed); |
| 3480 | } |
| 3481 | } |
| 3482 | } |
| 3483 | |
| 3484 | __startup_func |
| 3485 | static void |
| 3486 | zleak_init(void) |
| 3487 | { |
| 3488 | zleak_max_zonemap_size = ptoa(zone_pages_wired_max); |
| 3489 | |
| 3490 | zleak_update_threshold(&zleak_per_zone_tracking_threshold, |
| 3491 | zleak_max_zonemap_size / 8); |
| 3492 | |
| 3493 | thread_call_setup_with_options(&zone_leaks_callout, |
| 3494 | zleaks_enable_async, NULL, THREAD_CALL_PRIORITY_USER, |
| 3495 | THREAD_CALL_OPTIONS_ONCE); |
| 3496 | } |
| 3497 | STARTUP(ZALLOC, STARTUP_RANK_SECOND, zleak_init); |
| 3498 | |
| 3499 | kern_return_t |
| 3500 | zleak_update_threshold(vm_size_t *arg, uint64_t value) |
| 3501 | { |
| 3502 | if (value >= zleak_max_zonemap_size) { |
| 3503 | return KERN_INVALID_VALUE; |
| 3504 | } |
| 3505 | |
| 3506 | if (arg == &zleak_per_zone_tracking_threshold) { |
| 3507 | zleak_per_zone_tracking_threshold = (vm_size_t)value; |
| 3508 | zleak_pages_per_zone_wired_threshold = atop(value); |
| 3509 | if (startup_phase >= STARTUP_SUB_THREAD_CALL) { |
| 3510 | thread_call_enter(&zone_leaks_callout); |
| 3511 | } |
| 3512 | return KERN_SUCCESS; |
| 3513 | } |
| 3514 | |
| 3515 | return KERN_INVALID_ARGUMENT; |
| 3516 | } |
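|      | /*
|      |  * Illustrative call (assuming the value stays below
|      |  * zleak_max_zonemap_size):
|      |  *
|      |  *   zleak_update_threshold(&zleak_per_zone_tracking_threshold, 512 << 20);
|      |  *
|      |  * would set the per-zone tracking threshold to 512MB, convert it to a
|      |  * page count for the z_wired_cur comparisons, and poke
|      |  * zone_leaks_callout so zones crossing the new threshold get their
|      |  * btlog (re)evaluated.
|      |  */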
| 3517 | |
| 3518 | static void |
| 3519 | panic_display_zleaks(bool has_syms) |
| 3520 | { |
| 3521 | bool did_header = false; |
| 3522 | vm_address_t bt[BTLOG_MAX_DEPTH]; |
| 3523 | uint32_t len, count; |
| 3524 | |
| 3525 | zone_foreach(z) { |
| 3526 | btlog_t log = z->z_btlog; |
| 3527 | |
| 3528 | if (log == NULL || btlog_get_type(log) != BTLOG_HASH) { |
| 3529 | continue; |
| 3530 | } |
| 3531 | |
| 3532 | count = btlog_guess_top(log, bt, &len); |
| 3533 | if (count == 0) { |
| 3534 | continue; |
| 3535 | } |
| 3536 | |
| 3537 | if (!did_header) { |
| 3538 | paniclog_append_noflush("Zone (suspected) leak report:\n" ); |
| 3539 | did_header = true; |
| 3540 | } |
| 3541 | |
| 3542 | paniclog_append_noflush(" Zone: %s%s\n" , |
| 3543 | zone_heap_name(z), zone_name(z)); |
| 3544 | paniclog_append_noflush(" Count: %d (%ld bytes)\n" , count, |
| 3545 | (long)count * zone_scale_for_percpu(z, zone_elem_inner_size(z))); |
| 3546 | paniclog_append_noflush(" Size: %ld\n" , |
| 3547 | (long)zone_size_wired(z)); |
| 3548 | paniclog_append_noflush(" Top backtrace:\n" ); |
| 3549 | for (uint32_t i = 0; i < len; i++) { |
| 3550 | if (has_syms) { |
| 3551 | paniclog_append_noflush(" %p " , (void *)bt[i]); |
| 3552 | panic_print_symbol_name(bt[i]); |
| 3553 | paniclog_append_noflush("\n" ); |
| 3554 | } else { |
| 3555 | paniclog_append_noflush(" %p\n" , (void *)bt[i]); |
| 3556 | } |
| 3557 | } |
| 3558 | |
| 3559 | kmod_panic_dump(bt, len); |
| 3560 | paniclog_append_noflush("\n" ); |
| 3561 | } |
| 3562 | } |
| 3563 | #endif /* CONFIG_ZLEAKS */ |
| 3564 | |
| 3565 | #endif /* ZALLOC_ENABLE_LOGGING || CONFIG_ZLEAKS */
| 3566 | #if ZALLOC_ENABLE_LOGGING || CONFIG_ZLEAKS || KASAN_TBI
| 3567 | |
| 3568 | #if !KASAN_TBI |
| 3569 | __cold |
| 3570 | #endif |
| 3571 | static void |
| 3572 | zalloc_log(zone_t zone, vm_offset_t addr, uint32_t count, void *fp) |
| 3573 | { |
| 3574 | btlog_t log = zone->z_btlog; |
| 3575 | btref_get_flags_t flags = 0; |
| 3576 | btref_t ref; |
| 3577 | |
| 3578 | #if !KASAN_TBI |
| 3579 | if (!log || !btlog_sample(log)) { |
| 3580 | return; |
| 3581 | } |
| 3582 | #endif |
| 3583 | if (get_preemption_level() || zone_supports_vm(zone)) { |
| 3584 | /* |
| 3585 | * VM zones can be used by btlog, avoid reentrancy issues. |
| 3586 | */ |
| 3587 | flags = BTREF_GET_NOWAIT; |
| 3588 | } |
| 3589 | |
| 3590 | ref = btref_get(fp, flags); |
| 3591 | while (count-- > 0) { |
| 3592 | if (count) { |
| 3593 | btref_retain(ref); |
| 3594 | } |
| 3595 | btlog_record(log, (void *)addr, ZOP_ALLOC, ref); |
| 3596 | addr += *(vm_offset_t *)addr; |
| 3597 | } |
| 3598 | } |
| 3599 | |
| 3600 | #define ZALLOC_LOG(zone, addr, count) ({ \ |
| 3601 | if ((zone)->z_btlog) { \ |
| 3602 | zalloc_log(zone, addr, count, __builtin_frame_address(0)); \ |
| 3603 | } \ |
| 3604 | }) |
| 3605 | |
| 3606 | #if !KASAN_TBI |
| 3607 | __cold |
| 3608 | #endif |
| 3609 | static void |
| 3610 | zfree_log(zone_t zone, vm_offset_t addr, uint32_t count, void *fp) |
| 3611 | { |
| 3612 | btlog_t log = zone->z_btlog; |
| 3613 | btref_get_flags_t flags = 0; |
| 3614 | btref_t ref; |
| 3615 | |
| 3616 | #if !KASAN_TBI |
| 3617 | if (!log) { |
| 3618 | return; |
| 3619 | } |
| 3620 | #endif |
| 3621 | |
| 3622 | /* |
| 3623 | * See if we're doing logging on this zone. |
| 3624 | * |
| 3625 | * There are two styles of logging used depending on |
| 3626 | * whether we're trying to catch a leak or corruption. |
| 3627 | */ |
| 3628 | #if !KASAN_TBI |
| 3629 | if (btlog_get_type(log) == BTLOG_HASH) { |
| 3630 | /* |
| 3631 | * We're logging to catch a leak. |
| 3632 | * |
| 3633 | * Remove any record we might have for this element |
| 3634 | * since it's being freed. Note that we may not find it |
| 3635 | * if the buffer overflowed and that's OK. |
| 3636 | * |
| 3637 | * Since the log is of a limited size, old records get |
| 3638 | * overwritten if there are more zallocs than zfrees. |
| 3639 | */ |
| 3640 | while (count-- > 0) { |
| 3641 | btlog_erase(log, (void *)addr); |
| 3642 | addr += *(vm_offset_t *)addr; |
| 3643 | } |
| 3644 | return; |
| 3645 | } |
| 3646 | #endif /* !KASAN_TBI */ |
| 3647 | |
| 3648 | if (get_preemption_level() || zone_supports_vm(zone)) { |
| 3649 | /* |
| 3650 | * VM zones can be used by btlog, avoid reentrancy issues. |
| 3651 | */ |
| 3652 | flags = BTREF_GET_NOWAIT; |
| 3653 | } |
| 3654 | |
| 3655 | ref = btref_get(fp, flags); |
| 3656 | while (count-- > 0) { |
| 3657 | if (count) { |
| 3658 | btref_retain(ref); |
| 3659 | } |
| 3660 | btlog_record(log, (void *)addr, ZOP_FREE, ref); |
| 3661 | addr += *(vm_offset_t *)addr; |
| 3662 | } |
| 3663 | } |
| 3664 | |
| 3665 | #define ZFREE_LOG(zone, addr, count) ({ \ |
| 3666 | if ((zone)->z_btlog) { \ |
| 3667 | zfree_log(zone, addr, count, __builtin_frame_address(0)); \ |
| 3668 | } \ |
| 3669 | }) |
| 3670 | |
| 3671 | #else |
| 3672 | #define ZALLOC_LOG(...) ((void)0) |
| 3673 | #define ZFREE_LOG(...) ((void)0) |
| 3674 | #endif /* ZALLOC_ENABLE_LOGGING || CONFIG_ZLEAKS || KASAN_TBI */ |
| 3675 | #endif /* !ZALLOC_TEST */ |
| 3676 | #pragma mark zone (re)fill |
| 3677 | #if !ZALLOC_TEST |
| 3678 | |
| 3679 | /*! |
| 3680 | * @defgroup Zone Refill |
| 3681 | * @{ |
| 3682 | * |
| 3683 | * @brief |
| 3684 | * Functions handling the zone refill machinery.
| 3685 | * |
| 3686 | * @discussion |
| 3687 | * Zones are refilled based on 2 mechanisms: direct expansion, async expansion. |
| 3688 | * |
| 3689 | * @c zalloc_ext() is the codepath that kicks the zone refill when the zone is |
| 3690 | * dropping below half of its @c z_elems_rsv (0 for most zones) and will: |
| 3691 | * |
| 3692 | * - call @c zone_expand_locked() directly if the caller is allowed to block, |
| 3693 | * |
| 3694 | * - wake up the asynchronous expansion thread call if the caller is not allowed
| 3695 | * to block, or if the reserve becomes depleted. |
| 3696 | * |
| 3697 | * |
| 3698 | * <h2>Synchronous expansion</h2> |
| 3699 | * |
| 3700 | * This mechanism is actually the only one that may refill a zone, and all the |
| 3701 | * other ones funnel through this one eventually. |
| 3702 | * |
| 3703 | * @c zone_expand_locked() implements the core of the expansion mechanism, |
| 3704 | * and will do so while a caller specified predicate is true. |
| 3705 | * |
| 3706 | * Zone expansion allows for up to 2 threads to concurrently refill the zone: |
| 3707 | * - one VM privileged thread, |
| 3708 | * - one regular thread. |
| 3709 | * |
| 3710 | * Regular threads that refill will put down their identity in @c z_expander, |
| 3711 | * so that priority inversion avoidance can be implemented. |
| 3712 | * |
| 3713 | * However, VM privileged threads are allowed to use VM page reserves, |
| 3714 | * which allows for the system to recover from extreme memory pressure |
| 3715 | * situations, allowing for the few allocations that @c zone_gc() or |
| 3716 | * killing processes require. |
| 3717 | * |
| 3718 | * When a VM privileged thread is also expanding, the @c z_expander_vm_priv bit |
| 3719 | * is set. @c z_expander is not necessarily the identity of this VM privileged |
| 3720 | * thread (it is if the VM privileged thread came in first, but wouldn't be, and |
| 3721 | * could even be @c THREAD_NULL otherwise). |
| 3722 | * |
| 3723 | * Note that the pageout-scan daemon might be BG and is VM privileged. To avoid |
| 3724 | * spending a whole pointer on priority inheritance for VM privileged threads |
| 3725 | * (and other issues related to having two owners), we use the rwlock boost as |
| 3726 | * a stop gap to avoid priority inversions. |
| 3727 | * |
| 3728 | * |
| 3729 | * <h2>Chunk wiring policies</h2> |
| 3730 | * |
| 3731 | * Zones allocate memory in chunks of @c zone_t::z_chunk_pages pages at a time |
| 3732 | * to try to minimize fragmentation relative to element sizes not aligning with |
| 3733 | * a chunk size well. However, this can grow large and be hard to fulfill on |
| 3734 | * a system under a lot of memory pressure (chunks can be as long as 8 pages on |
| 3735 | * 4k page systems). |
| 3736 | * |
| 3737 | * This is why, when under memory pressure the system allows chunks to be |
| 3738 | * partially populated. The metadata of the first page in the chunk maintains |
| 3739 | * the count of actually populated pages. |
| 3740 | * |
| 3741 | * The metadata for addresses assigned to a zone are found on 4 queues:
| 3742 | * - @c z_pageq_empty has chunk heads with populated pages and no allocated |
| 3743 | * elements (those can be targeted by @c zone_gc()), |
| 3744 | * - @c z_pageq_partial has chunk heads with populated pages that are partially |
| 3745 | * used, |
| 3746 | * - @c z_pageq_full has chunk heads with populated pages with no free elements |
| 3747 | * left, |
| 3748 | * - @c z_pageq_va has either chunk heads for sequestered VA space assigned to |
| 3749 | * the zone forever, or the first secondary metadata for a chunk whose |
| 3750 | * corresponding page is not populated in the chunk. |
| 3751 | * |
| 3752 | * When new pages need to be wired/populated, chunks from the @c z_pageq_va |
| 3753 | * queues are preferred. |
| 3754 | * |
| 3755 | * |
| 3756 | * <h2>Asynchronous expansion</h2> |
| 3757 | * |
| 3758 | * This mechanism allows for refilling zones used mostly with non blocking |
| 3759 | * callers. It relies on a thread call (@c zone_expand_callout) which will |
| 3760 | * iterate all zones and refill the ones marked with @c z_async_refilling. |
| 3761 | * |
| 3762 | * NOTE: If the calling thread for zalloc_noblock is lower priority than |
| 3763 | * the thread_call, then zalloc_noblock to an empty zone may succeed. |
| 3764 | * |
| 3765 | * |
| 3766 | * <h2>Dealing with zone allocations from the mach VM code</h2> |
| 3767 | * |
| 3768 | * The implementation of the mach VM itself uses the zone allocator |
| 3769 | * for things like the vm_map_entry data structure. In order to prevent |
| 3770 | * a recursion problem when adding more pages to a zone, the VM zones |
| 3771 | * use the Z_SUBMAP_IDX_VM submap which doesn't use kmem_alloc() |
| 3772 | * or any VM map functions to allocate. |
| 3773 | * |
| 3774 | * Instead, a really simple coalescing first-fit allocator is used |
| 3775 | * for this submap, and no one else than zalloc can allocate from it. |
| 3776 | * |
| 3777 | * Memory is directly populated which doesn't require allocation of |
| 3778 | * VM map entries, and avoids recursion. The cost of this scheme however, |
| 3779 | * is that `vm_map_lookup_entry` will not function on those addresses |
| 3780 | * (nor any API relying on it). |
| 3781 | */ |
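|      | /*
|      |  * Rough sketch of the refill decision described above (simplified and
|      |  * not the literal zalloc_ext() code; the names used below exist in this
|      |  * file, but the exact flow is an approximation):
|      |  *
|      |  *   if (zalloc_needs_refill(z, flags)) {
|      |  *       if (flags & Z_NOPAGEWAIT) {
|      |  *           // caller can't block: defer to the thread call
|      |  *           zone_expand_async_schedule_if_allowed(z);
|      |  *       } else {
|      |  *           zone_expand_locked(z, flags);
|      |  *       }
|      |  *   }
|      |  */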
| 3782 | |
| 3783 | static void zone_reclaim_elements(zone_t z, uint16_t n, vm_offset_t *elems); |
| 3784 | static void zone_depot_trim(zone_t z, uint32_t target, struct zone_depot *zd); |
| 3785 | static thread_call_data_t zone_expand_callout; |
| 3786 | |
| 3787 | __attribute__((overloadable)) |
| 3788 | static inline bool |
| 3789 | zone_submap_is_sequestered(zone_submap_idx_t idx) |
| 3790 | { |
| 3791 | return idx != Z_SUBMAP_IDX_DATA; |
| 3792 | } |
| 3793 | |
| 3794 | __attribute__((overloadable)) |
| 3795 | static inline bool |
| 3796 | zone_submap_is_sequestered(zone_security_flags_t zsflags) |
| 3797 | { |
| 3798 | return zone_submap_is_sequestered(zsflags.z_submap_idx);
| 3799 | } |
| 3800 | |
| 3801 | static inline kma_flags_t |
| 3802 | zone_kma_flags(zone_t z, zone_security_flags_t zsflags, zalloc_flags_t flags) |
| 3803 | { |
| 3804 | kma_flags_t kmaflags = KMA_KOBJECT | KMA_ZERO; |
| 3805 | |
| 3806 | if (zsflags.z_noencrypt) { |
| 3807 | kmaflags |= KMA_NOENCRYPT; |
| 3808 | } |
| 3809 | if (zsflags.z_submap_idx == Z_SUBMAP_IDX_DATA) { |
| 3810 | kmaflags |= KMA_DATA; |
| 3811 | } |
| 3812 | if (flags & Z_NOPAGEWAIT) { |
| 3813 | kmaflags |= KMA_NOPAGEWAIT; |
| 3814 | } |
| 3815 | if (z->z_permanent || (!z->z_destructible && |
| 3816 | zone_submap_is_sequestered(zsflags))) { |
| 3817 | kmaflags |= KMA_PERMANENT; |
| 3818 | } |
| 3819 | if (zsflags.z_submap_from_end) { |
| 3820 | kmaflags |= KMA_LAST_FREE; |
| 3821 | } |
| 3822 | |
| 3823 | if (z->z_tbi_tag) { |
| 3824 | kmaflags |= KMA_TAG; |
| 3825 | } |
| 3826 | |
| 3827 | return kmaflags; |
| 3828 | } |
| 3829 | |
| 3830 | static inline void |
| 3831 | zone_add_wired_pages(zone_t z, uint32_t pages) |
| 3832 | { |
| 3833 | os_atomic_add(&zone_pages_wired, pages, relaxed); |
| 3834 | |
| 3835 | #if CONFIG_ZLEAKS |
| 3836 | if (__improbable(zleak_should_enable_for_zone(z) && |
| 3837 | startup_phase >= STARTUP_SUB_THREAD_CALL)) { |
| 3838 | thread_call_enter(&zone_leaks_callout); |
| 3839 | } |
| 3840 | #else |
| 3841 | (void)z; |
| 3842 | #endif |
| 3843 | } |
| 3844 | |
| 3845 | static inline void |
| 3846 | zone_remove_wired_pages(zone_t z, uint32_t pages) |
| 3847 | { |
| 3848 | os_atomic_sub(&zone_pages_wired, pages, relaxed); |
| 3849 | |
| 3850 | #if CONFIG_ZLEAKS |
| 3851 | if (__improbable(zleak_should_disable_for_zone(z) && |
| 3852 | startup_phase >= STARTUP_SUB_THREAD_CALL)) { |
| 3853 | thread_call_enter(&zone_leaks_callout); |
| 3854 | } |
| 3855 | #else |
| 3856 | (void)z; |
| 3857 | #endif |
| 3858 | } |
| 3859 | |
| 3860 | #if CONFIG_KERNEL_TAGGING |
| 3861 | static inline vm_address_t |
| 3862 | zone_tag_element(zone_t zone, vm_offset_t addr, vm_size_t elem_size) |
| 3863 | { |
| 3864 | vm_offset_t tagged_address; |
| 3865 | |
| 3866 | tagged_address = vm_memtag_assign_tag(addr, elem_size); |
| 3867 | |
| 3868 | vm_memtag_set_tag(tagged_address, elem_size); |
| 3869 | |
| 3870 | if (zone->z_percpu) { |
| 3871 | zpercpu_foreach_cpu(index) { |
| 3872 | vm_memtag_set_tag(tagged_address + ptoa(index), elem_size); |
| 3873 | } |
| 3874 | } |
| 3875 | |
| 3876 | return tagged_address; |
| 3877 | } |
| 3878 | |
| 3879 | static inline void |
| 3880 | zcram_memtag_init(zone_t zone, vm_offset_t base, uint32_t start, uint32_t end) |
| 3881 | { |
| 3882 | vm_offset_t elem_size = zone_elem_outer_size(zone); |
| 3883 | vm_offset_t oob_offs = zone_elem_outer_offs(zone); |
| 3884 | |
| 3885 | for (uint32_t i = start; i < end; i++) { |
| 3886 | vm_offset_t elem_addr = base + oob_offs + i * elem_size; |
| 3887 | |
| 3888 | (void)zone_tag_element(zone, elem_addr, elem_size); |
| 3889 | } |
| 3890 | } |
| 3891 | #endif /* CONFIG_KERNEL_TAGGING */ |
| 3892 | |
| 3893 | /*! |
| 3894 | * @function zcram_and_lock() |
| 3895 | * |
| 3896 | * @brief |
| 3897 | * Prepare some memory for being usable for allocation purposes. |
| 3898 | * |
| 3899 | * @discussion |
| 3900 | * Prepare memory in <code>[addr + ptoa(pg_start), addr + ptoa(pg_end))</code> |
| 3901 | * to be usable in the zone. |
| 3902 | * |
| 3903 | * This function assumes the metadata is already populated for the range. |
| 3904 | * |
| 3905 | * Calling this function with @c pg_start being 0 means that the memory |
| 3906 | * is either a partial chunk, or a full chunk, that isn't published anywhere |
| 3907 | * and the initialization can happen without locks held. |
| 3908 | * |
| 3909 | * Calling this function with a non zero @c pg_start means that we are extending |
| 3910 | * an existing chunk: the memory in <code>[addr, addr + ptoa(pg_start))</code>, |
| 3911 | * is already usable and published in the zone, so extending it requires holding |
| 3912 | * the zone lock. |
| 3913 | * |
| 3914 | * @param zone The zone to cram new populated pages into |
| 3915 | * @param addr The base address for the chunk(s) |
| 3916 | * @param pg_va_new The number of virtual pages newly assigned to the zone |
| 3917 | * @param pg_start The first newly populated page relative to @a addr. |
| 3918 | * @param pg_end The after-last newly populated page relative to @a addr. |
| 3919 | * @param lock 0 or ZM_ALLOC_SIZE_LOCK (used by early crams) |
| 3920 | */ |
| 3921 | static void |
| 3922 | zcram_and_lock(zone_t zone, vm_offset_t addr, uint32_t pg_va_new, |
| 3923 | uint32_t pg_start, uint32_t pg_end, uint16_t lock) |
| 3924 | { |
| 3925 | zone_id_t zindex = zone_index(zone);
| 3926 | vm_offset_t elem_size = zone_elem_outer_size(zone); |
| 3927 | uint32_t free_start = 0, free_end = 0; |
| 3928 | uint32_t oob_offs = zone_elem_outer_offs(zone); |
| 3929 | |
| 3930 | struct zone_page_metadata *meta = zone_meta_from_addr(addr); |
| 3931 | uint32_t chunk_pages = zone->z_chunk_pages; |
| 3932 | bool guarded = meta->zm_guarded; |
| 3933 | |
| 3934 | assert(pg_start < pg_end && pg_end <= chunk_pages); |
| 3935 | |
| 3936 | if (pg_start == 0) { |
| 3937 | uint16_t chunk_len = (uint16_t)pg_end; |
| 3938 | uint16_t secondary_len = ZM_SECONDARY_PAGE; |
| 3939 | bool inline_bitmap = false; |
| 3940 | |
| 3941 | if (zone->z_percpu) { |
| 3942 | chunk_len = 1; |
| 3943 | secondary_len = ZM_SECONDARY_PCPU_PAGE; |
| 3944 | assert(pg_end == zpercpu_count()); |
| 3945 | } |
| 3946 | if (!zone->z_permanent && !zone->z_uses_tags) { |
| 3947 | inline_bitmap = zone->z_chunk_elems <= 32 * chunk_pages; |
| 3948 | } |
| 3949 | |
| 3950 | free_end = (uint32_t)(ptoa(chunk_len) - oob_offs) / elem_size; |
| 3951 | |
| 3952 | meta[0] = (struct zone_page_metadata){ |
| 3953 | .zm_index = zindex, |
| 3954 | .zm_guarded = guarded, |
| 3955 | .zm_inline_bitmap = inline_bitmap, |
| 3956 | .zm_chunk_len = chunk_len, |
| 3957 | .zm_alloc_size = lock, |
| 3958 | }; |
| 3959 | |
| 3960 | if (!zone->z_permanent && !inline_bitmap) { |
| 3961 | meta[0].zm_bitmap = zone_meta_bits_alloc_init(free_end,
| 3962 | zone->z_chunk_elems, zone->z_uses_tags);
| 3963 | } |
| 3964 | |
| 3965 | for (uint16_t i = 1; i < chunk_pages; i++) { |
| 3966 | meta[i] = (struct zone_page_metadata){ |
| 3967 | .zm_index = zindex, |
| 3968 | .zm_guarded = guarded, |
| 3969 | .zm_inline_bitmap = inline_bitmap, |
| 3970 | .zm_chunk_len = secondary_len, |
| 3971 | .zm_page_index = (uint8_t)i, |
| 3972 | .zm_bitmap = meta[0].zm_bitmap, |
| 3973 | .zm_subchunk_len = (uint8_t)(chunk_pages - i), |
| 3974 | }; |
| 3975 | } |
| 3976 | |
| 3977 | if (inline_bitmap) { |
| 3978 | zone_meta_bits_init_inline(meta, free_end);
| 3979 | } |
| 3980 | } else { |
| 3981 | assert(!zone->z_percpu && !zone->z_permanent); |
| 3982 | |
| 3983 | free_end = (uint32_t)(ptoa(pg_end) - oob_offs) / elem_size; |
| 3984 | free_start = (uint32_t)(ptoa(pg_start) - oob_offs) / elem_size; |
| 3985 | } |
| 3986 | |
| 3987 | #if CONFIG_KERNEL_TAGGING |
| 3988 | if (__probable(zone->z_tbi_tag)) { |
| 3989 | zcram_memtag_init(zone, addr, free_start, free_end);
| 3990 | } |
| 3991 | #endif /* CONFIG_KERNEL_TAGGING */ |
| 3992 | |
| 3993 | #if KASAN_CLASSIC |
| 3994 | assert(pg_start == 0); /* KASAN_CLASSIC never does partial chunks */ |
| 3995 | if (zone->z_permanent) { |
| 3996 | kasan_poison_range(addr, ptoa(pg_end), ASAN_VALID); |
| 3997 | } else if (zone->z_percpu) { |
| 3998 | for (uint32_t i = 0; i < pg_end; i++) { |
| 3999 | kasan_zmem_add(addr + ptoa(i), PAGE_SIZE, |
| 4000 | zone_elem_outer_size(zone), |
| 4001 | zone_elem_outer_offs(zone), |
| 4002 | zone_elem_redzone(zone)); |
| 4003 | } |
| 4004 | } else { |
| 4005 | kasan_zmem_add(addr, ptoa(pg_end), |
| 4006 | zone_elem_outer_size(zone), |
| 4007 | zone_elem_outer_offs(zone), |
| 4008 | zone_elem_redzone(zone)); |
| 4009 | } |
| 4010 | #endif /* KASAN_CLASSIC */ |
| 4011 | |
| 4012 | /* |
| 4013 | * Insert the initialized pages / metadatas into the right lists. |
| 4014 | */ |
| 4015 | |
| 4016 | zone_lock(zone); |
| 4017 | assert(zone->z_self == zone); |
| 4018 | |
| 4019 | if (pg_start != 0) { |
| 4020 | assert(meta->zm_chunk_len == pg_start); |
| 4021 | |
| 4022 | zone_meta_bits_merge(meta, free_start, free_end);
| 4023 | meta->zm_chunk_len = (uint16_t)pg_end; |
| 4024 | |
| 4025 | /* |
| 4026 | * consume the zone_meta_lock_in_partial() |
| 4027 | * done in zone_expand_locked() |
| 4028 | */ |
| 4029 | zone_meta_alloc_size_sub(zone, meta, ZM_ALLOC_SIZE_LOCK);
| 4030 | zone_meta_remqueue(zone, meta);
| 4031 | } |
| 4032 | |
| 4033 | if (zone->z_permanent || meta->zm_alloc_size) { |
| 4034 | zone_meta_queue_push(zone, &zone->z_pageq_partial, meta);
| 4035 | } else {
| 4036 | zone_meta_queue_push(zone, &zone->z_pageq_empty, meta);
| 4037 | zone->z_wired_empty += zone->z_percpu ? 1 : pg_end; |
| 4038 | } |
| 4039 | if (pg_end < chunk_pages) { |
| 4040 | /* push any non populated residual VA on z_pageq_va */ |
| 4041 | zone_meta_queue_push(zone, &zone->z_pageq_va, meta + pg_end);
| 4042 | } |
| 4043 | |
| 4044 | zone->z_elems_free += free_end - free_start; |
| 4045 | zone->z_elems_avail += free_end - free_start; |
| 4046 | zone->z_wired_cur += zone->z_percpu ? 1 : pg_end - pg_start; |
| 4047 | if (pg_va_new) { |
| 4048 | zone->z_va_cur += zone->z_percpu ? 1 : pg_va_new; |
| 4049 | } |
| 4050 | if (zone->z_wired_hwm < zone->z_wired_cur) { |
| 4051 | zone->z_wired_hwm = zone->z_wired_cur; |
| 4052 | } |
| 4053 | |
| 4054 | #if CONFIG_ZLEAKS |
| 4055 | if (__improbable(zleak_should_enable_for_zone(zone) && |
| 4056 | startup_phase >= STARTUP_SUB_THREAD_CALL)) { |
| 4057 | thread_call_enter(&zone_leaks_callout); |
| 4058 | } |
| 4059 | #endif /* CONFIG_ZLEAKS */ |
| 4060 | |
| 4061 | zone_add_wired_pages(zone, pg_end - pg_start);
| 4062 | } |
| 4063 | |
| 4064 | static void |
| 4065 | zcram(zone_t zone, vm_offset_t addr, uint32_t pages, uint16_t lock) |
| 4066 | { |
| 4067 | uint32_t chunk_pages = zone->z_chunk_pages; |
| 4068 | |
| 4069 | assert(pages % chunk_pages == 0); |
| 4070 | for (; pages > 0; pages -= chunk_pages, addr += ptoa(chunk_pages)) { |
| 4071 | zcram_and_lock(zone, addr, chunk_pages, 0, chunk_pages, lock);
| 4072 | zone_unlock(zone); |
| 4073 | } |
| 4074 | } |
| 4075 | |
| 4076 | __startup_func |
| 4077 | void |
| 4078 | zone_cram_early(zone_t zone, vm_offset_t newmem, vm_size_t size) |
| 4079 | { |
| 4080 | uint32_t pages = (uint32_t)atop(size); |
| 4081 | |
| 4082 | |
| 4083 | assert(from_zone_map(newmem, size)); |
| 4084 | assert3u(size % ptoa(zone->z_chunk_pages), ==, 0); |
| 4085 | assert3u(startup_phase, <, STARTUP_SUB_ZALLOC); |
| 4086 | |
| 4087 | /* |
| 4088 | * The early pages we move at the pmap layer can't be "depopulated" |
| 4089 | * because there's no vm_page_t for them. |
| 4090 | * |
| 4091 | * "Lock" them so that they never hit z_pageq_empty. |
| 4092 | */ |
| 4093 | vm_memtag_bzero((void *)newmem, size); |
| 4094 | zcram(zone, newmem, pages, ZM_ALLOC_SIZE_LOCK);
| 4095 | } |
| 4096 | |
| 4097 | /*! |
| 4098 | * @function zone_submap_alloc_sequestered_va |
| 4099 | * |
| 4100 | * @brief |
| 4101 | * Allocates VA without using vm_find_space(). |
| 4102 | * |
| 4103 | * @discussion |
| 4104 | * Allocate VA quickly without using the slower vm_find_space() for cases |
| 4105 | * when the submaps are fully sequestered. |
| 4106 | * |
| 4107 | * The VM submap is used to implement the VM itself so it is always sequestered, |
| 4108 | * as it can't kmem_alloc which needs to always allocate vm entries. |
| 4109 | * However, it can use vm_map_enter() which tries to coalesce entries, which |
| 4110 | * always works, so the VM map only ever needs 2 entries (one for each end). |
| 4111 | * |
| 4112 | * The RO submap is similarly always sequestered if it exists (as a non |
| 4113 | * sequestered RO submap makes very little sense). |
| 4114 | * |
| 4115 | * The allocator is a very simple bump-allocator |
| 4116 | * that allocates from either end. |
| 4117 | */ |
| 4118 | static kern_return_t |
| 4119 | zone_submap_alloc_sequestered_va(zone_security_flags_t zsflags, uint32_t pages, |
| 4120 | vm_offset_t *addrp) |
| 4121 | { |
| 4122 | vm_size_t size = ptoa(pages); |
| 4123 | vm_map_t map = zone_submap(zsflags); |
| 4124 | vm_map_entry_t first, last; |
| 4125 | vm_map_offset_t addr; |
| 4126 | |
| 4127 | vm_map_lock(map); |
| 4128 | |
| 4129 | first = vm_map_first_entry(map); |
| 4130 | last = vm_map_last_entry(map); |
| 4131 | |
| 4132 | if (first->vme_end + size > last->vme_start) { |
| 4133 | vm_map_unlock(map); |
| 4134 | return KERN_NO_SPACE; |
| 4135 | } |
| 4136 | |
| 4137 | if (zsflags.z_submap_from_end) { |
| 4138 | last->vme_start -= size; |
| 4139 | addr = last->vme_start; |
| 4140 | VME_OFFSET_SET(last, addr);
| 4141 | } else { |
| 4142 | addr = first->vme_end; |
| 4143 | first->vme_end += size; |
| 4144 | } |
| 4145 | map->size += size; |
| 4146 | |
| 4147 | vm_map_unlock(map); |
| 4148 | |
| 4149 | *addrp = addr; |
| 4150 | return KERN_SUCCESS; |
| 4151 | } |
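|      | /*
|      |  * Worked example of the bump allocation above: if the submap's first
|      |  * entry ends at A and its last entry starts at B, a 4-page request
|      |  * (16KB with 4KB pages) either returns A and moves first->vme_end up to
|      |  * A + 16KB, or, with z_submap_from_end, moves last->vme_start down to
|      |  * B - 16KB and returns that.  It fails with KERN_NO_SPACE once the two
|      |  * ends would cross.
|      |  */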
| 4152 | |
| 4153 | void |
| 4154 | zone_fill_initially(zone_t zone, vm_size_t nelems) |
| 4155 | { |
| 4156 | kma_flags_t kmaflags = KMA_NOFAIL | KMA_PERMANENT; |
| 4157 | kern_return_t kr; |
| 4158 | vm_offset_t addr; |
| 4159 | uint32_t pages; |
| 4160 | zone_security_flags_t zsflags = zone_security_config(zone);
| 4161 | |
| 4162 | assert(!zone->z_permanent && !zone->collectable && !zone->z_destructible); |
| 4163 | assert(zone->z_elems_avail == 0); |
| 4164 | |
| 4165 | kmaflags |= zone_kma_flags(zone, zsflags, Z_WAITOK);
| 4166 | pages = zone_alloc_pages_for_nelems(zone, nelems);
| 4167 | if (zone_submap_is_sequestered(zsflags)) {
| 4168 | kr = zone_submap_alloc_sequestered_va(zsflags, pages, &addr);
| 4169 | if (kr != KERN_SUCCESS) {
| 4170 | panic("zone_submap_alloc_sequestered_va() "
| 4171 | "of %u pages failed", pages);
| 4172 | }
| 4173 | kernel_memory_populate(addr, ptoa(pages),
| 4174 | kmaflags, VM_KERN_MEMORY_ZONE);
| 4175 | } else {
| 4176 | assert(zsflags.z_submap_idx != Z_SUBMAP_IDX_READ_ONLY);
| 4177 | kmem_alloc(zone_submap(zsflags), &addr, ptoa(pages),
| 4178 | kmaflags, VM_KERN_MEMORY_ZONE);
| 4179 | } |
| 4180 | |
| 4181 | zone_meta_populate(addr, ptoa(pages));
| 4182 | zcram(zone, addr, pages, 0);
| 4183 | } |
| 4184 | |
| 4185 | #if ZSECURITY_CONFIG(SAD_FENG_SHUI) |
| 4186 | __attribute__((noinline)) |
| 4187 | static void |
| 4188 | zone_scramble_va_and_unlock( |
| 4189 | zone_t z, |
| 4190 | struct zone_page_metadata *meta, |
| 4191 | uint32_t runs, |
| 4192 | uint32_t pages, |
| 4193 | uint32_t chunk_pages, |
| 4194 | uint64_t guard_mask) |
| 4195 | { |
| 4196 | struct zone_page_metadata *arr[ZONE_CHUNK_ALLOC_SIZE / 4096]; |
| 4197 | |
| 4198 | for (uint32_t run = 0, n = 0; run < runs; run++) { |
| 4199 | arr[run] = meta + n; |
| 4200 | n += chunk_pages + ((guard_mask >> run) & 1); |
| 4201 | } |
| 4202 | |
| 4203 | /* |
| 4204 | * Fisher–Yates shuffle, for an array with indices [0, n) |
| 4205 | * |
| 4206 | * for i from n−1 downto 1 do |
| 4207 | * j ← random integer such that 0 ≤ j ≤ i |
| 4208 | * exchange a[j] and a[i] |
| 4209 | * |
| 4210 | * The point here is that early allocations aren't at a fixed |
| 4211 | * distance from each other. |
| 4212 | */ |
| 4213 | for (uint32_t i = runs - 1; i > 0; i--) { |
| 4214 | uint32_t j = zalloc_random_uniform32(0, i + 1);
| 4215 | |
| 4216 | meta = arr[j]; |
| 4217 | arr[j] = arr[i]; |
| 4218 | arr[i] = meta; |
| 4219 | } |
| 4220 | |
| 4221 | zone_lock(z);
| 4222 |
| 4223 | for (uint32_t i = 0; i < runs; i++) {
| 4224 | zone_meta_queue_push(z, &z->z_pageq_va, arr[i]);
| 4225 | } |
| 4226 | z->z_va_cur += z->z_percpu ? runs : pages; |
| 4227 | } |
| 4228 | |
| 4229 | static inline uint32_t |
| 4230 | dist_u32(uint32_t a, uint32_t b) |
| 4231 | { |
| 4232 | return a < b ? b - a : a - b; |
| 4233 | } |
| 4234 | |
| 4235 | static uint64_t |
| 4236 | zalloc_random_clear_n_bits(uint64_t mask, uint32_t pop, uint32_t n) |
| 4237 | { |
| 4238 | for (; n-- > 0; pop--) { |
| 4239 | uint32_t bit = zalloc_random_uniform32(0, pop);
| 4240 | uint64_t m = mask; |
| 4241 | |
| 4242 | for (; bit; bit--) { |
| 4243 | m &= m - 1; |
| 4244 | } |
| 4245 | |
| 4246 | mask ^= 1ull << __builtin_ctzll(m); |
| 4247 | } |
| 4248 | |
| 4249 | return mask; |
| 4250 | } |
| 4251 | |
| 4252 | /** |
| 4253 | * @function zalloc_random_bits |
| 4254 | * |
| 4255 | * @brief |
| 4256 | * Compute a random number with a specified number of bit set in a given width. |
| 4257 | * |
| 4258 | * @discussion |
| 4259 | * This function generates a "uniform" distribution of sets of bits set in |
| 4260 | * a given width, with typically less than width/4 calls to random. |
| 4261 | * |
| 4262 | * @param pop the target number of bits set. |
| 4263 | * @param width the number of bits in the random integer to generate. |
| 4264 | */ |
| 4265 | static uint64_t |
| 4266 | zalloc_random_bits(uint32_t pop, uint32_t width) |
| 4267 | { |
| 4268 | uint64_t w_mask = (1ull << width) - 1; |
| 4269 | uint64_t mask; |
| 4270 | uint32_t cur; |
| 4271 | |
| 4272 | if (3 * width / 4 <= pop) { |
| 4273 | mask = w_mask; |
| 4274 | cur = width; |
| 4275 | } else if (pop <= width / 4) { |
| 4276 | mask = 0; |
| 4277 | cur = 0; |
| 4278 | } else { |
| 4279 | /* |
| 4280 | * Choosing a random number this way will overwhelmingly
| 4281 | * contain about `width/2` set bits, +/- a few.
| 4282 | */
| 4283 | mask = zalloc_random_mask64(width);
| 4284 | cur = __builtin_popcountll(mask);
| 4285 |
| 4286 | if (dist_u32(cur, pop) > dist_u32(width - cur, pop)) {
| 4287 | /* |
| 4288 | * If the opposite mask has a closer popcount, |
| 4289 | * then start with that one as the seed. |
| 4290 | */ |
| 4291 | cur = width - cur; |
| 4292 | mask ^= w_mask; |
| 4293 | } |
| 4294 | } |
| 4295 | |
| 4296 | if (cur < pop) { |
| 4297 | /* |
| 4298 | * Setting `pop - cur` bits is really clearing that many from |
| 4299 | * the opposite mask. |
| 4300 | */ |
| 4301 | mask ^= w_mask; |
| 4302 | mask = zalloc_random_clear_n_bits(mask, width - cur, pop - cur);
| 4303 | mask ^= w_mask;
| 4304 | } else if (pop < cur) {
| 4305 | mask = zalloc_random_clear_n_bits(mask, cur, cur - pop);
| 4306 | } |
| 4307 | |
| 4308 | return mask; |
| 4309 | } |
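|      | /*
|      |  * Worked example: zalloc_random_bits(pop = 3, width = 10).  Since
|      |  * width/4 < pop < 3*width/4, a random 10-bit mask is drawn (popcount
|      |  * typically near 5), possibly replaced by its complement when that is
|      |  * closer to 3 set bits, and then zalloc_random_clear_n_bits() trims
|      |  * (or, via the complement trick, grows) the mask one bit at a time
|      |  * until exactly 3 of the 10 bits are set.
|      |  */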
| 4310 | #endif |
| 4311 | |
| 4312 | static void |
| 4313 | zone_allocate_va_locked(zone_t z, zalloc_flags_t flags) |
| 4314 | { |
| 4315 | zone_security_flags_t zsflags = zone_security_config(z); |
| 4316 | struct zone_page_metadata *meta; |
| 4317 | kma_flags_t kmaflags = zone_kma_flags(z, zsflags, flags) | KMA_VAONLY; |
| 4318 | uint32_t chunk_pages = z->z_chunk_pages; |
| 4319 | uint32_t runs, pages, guards, rnum; |
| 4320 | uint64_t guard_mask = 0; |
| 4321 | bool lead_guard = false; |
| 4322 | kern_return_t kr; |
| 4323 | vm_offset_t addr; |
| 4324 | |
| 4325 | zone_unlock(z);
| 4326 | |
| 4327 | /* |
| 4328 | * A lot of OOB exploitation techniques rely on precise placement |
| 4329 | * and interleaving of zone pages. The layout that is sought |
| 4330 | * by attackers will be C/P/T types, where: |
| 4331 | * - (C)ompromised is the type for which attackers have a bug, |
| 4332 | * - (P)adding is used to pad memory, |
| 4333 | * - (T)arget is the type that the attacker will attempt to corrupt |
| 4334 | * by exploiting (C). |
| 4335 | * |
| 4336 | * Note that in some cases C==T and P isn't needed. |
| 4337 | * |
| 4338 | * In order to make those placement games much harder, |
| 4339 | * we grow zones by random runs of memory, up to 256k. |
| 4340 | * This makes predicting the precise layout of the heap |
| 4341 | * quite more complicated. |
| 4342 | * |
| 4343 | * Note: this function makes a very heavy use of random, |
| 4344 | * however, it is mostly limited to sequestered zones, |
| 4345 | * and eventually the layout will be fixed, |
| 4346 | * and the usage of random vastly reduced. |
| 4347 | * |
| 4348 | * For non sequestered zones, there's a single call |
| 4349 | * to random in order to decide whether we want |
| 4350 | * a guard page or not. |
| 4351 | */ |
| 4352 | pages = chunk_pages; |
| 4353 | guards = 0; |
| 4354 | runs = 1; |
| 4355 | #if ZSECURITY_CONFIG(SAD_FENG_SHUI) |
| 4356 | if (!z->z_percpu && zone_submap_is_sequestered(zsflags)) { |
| 4357 | pages = atop(ZONE_CHUNK_ALLOC_SIZE); |
| 4358 | runs = (pages + chunk_pages - 1) / chunk_pages; |
| 4359 | runs = zalloc_random_uniform32(1, runs + 1);
| 4360 | pages = runs * chunk_pages; |
| 4361 | } |
| 4362 | static_assert(ZONE_CHUNK_ALLOC_SIZE / 4096 <= 64, |
| 4363 | "make sure that `runs` will never be larger than 64" ); |
| 4364 | #endif /* !ZSECURITY_CONFIG(SAD_FENG_SHUI) */ |
| 4365 | |
| 4366 | /* |
| 4367 | * For zones that are susceptible to OOB (kalloc, ZC_PGZ_USE_GUARDS),
| 4368 | * guards might be added after each chunk.
| 4369 | * |
| 4370 | * Those guard pages are marked with the ZM_PGZ_GUARD |
| 4371 | * magical chunk len, and their zm_oob_offs field |
| 4372 | * is used to remember optional shift applied |
| 4373 | * to returned elements, in order to right-align-them |
| 4374 | * as much as possible. |
| 4375 | * |
| 4376 | * In an adversarial context, while guard pages |
| 4377 | * are extremely effective against linear overflow, |
| 4378 | * using a predictable density of guard pages feels like |
| 4379 | * a missed opportunity. Which is why we chose to insert |
| 4380 | * one guard page for about 32k of memory, and place it |
| 4381 | * randomly. |
| 4382 | */ |
| 4383 | #if ZSECURITY_CONFIG(SAD_FENG_SHUI) |
| 4384 | if (z->z_percpu) { |
| 4385 | /* |
| 4386 | * For per-cpu runs, have a 75% chance to have a guard. |
| 4387 | */ |
| 4388 | rnum = zalloc_random_uniform32(0, 4 * 128);
| 4389 | guards = rnum >= 128; |
| 4390 | } else if (!zsflags.z_pgz_use_guards && !z->z_pgz_use_guards) { |
| 4391 | vm_offset_t rest; |
| 4392 | |
| 4393 | /* |
| 4394 | * For types that are less susceptible to OOBs,
| 4395 | * have a density of 1 guard every 64k, with a uniform
| 4396 | * distribution.
| 4397 | */
| 4398 | rnum = zalloc_random_uniform32(0, ZONE_GUARD_SPARSE);
| 4399 | guards = (uint32_t)ptoa(pages) / ZONE_GUARD_SPARSE; |
| 4400 | rest = (uint32_t)ptoa(pages) % ZONE_GUARD_SPARSE; |
| 4401 | guards += rnum < rest; |
| 4402 | } else if (ptoa(chunk_pages) >= ZONE_GUARD_DENSE) { |
| 4403 | /* |
| 4404 | * For chunks >= 32k, have a 75% chance of guard pages |
| 4405 | * between chunks. |
| 4406 | */ |
| 4407 | rnum = zalloc_random_uniform32(65, 129);
| 4408 | guards = runs * rnum / 128; |
| 4409 | } else { |
| 4410 | vm_offset_t rest; |
| 4411 | |
| 4412 | /* |
| 4413 | * Otherwise, aim at 1 guard every 32k, |
| 4414 | * with a uniform distribution. |
| 4415 | */ |
| 4416 | rnum = zalloc_random_uniform32(0, ZONE_GUARD_DENSE);
| 4417 | guards = (uint32_t)ptoa(pages) / ZONE_GUARD_DENSE; |
| 4418 | rest = (uint32_t)ptoa(pages) % ZONE_GUARD_DENSE; |
| 4419 | guards += rnum < rest; |
| 4420 | } |
| 4421 | assert3u(guards, <=, runs); |
| 4422 | |
| 4423 | guard_mask = 0; |
| 4424 | |
| 4425 | if (!z->z_percpu && zone_submap_is_sequestered(zsflags)) { |
| 4426 | uint32_t g = 0; |
| 4427 | |
| 4428 | /* |
| 4429 | * Several exploitation strategies rely on a C/T (compromised |
| 4430 | * then target types) ordering of pages with a sub-page reach |
| 4431 | * from C into T. |
| 4432 | * |
| 4433 | * We want to reliably thwart such exploitations |
| 4434 | * and hence force a guard page between alternating |
| 4435 | * memory types. |
| 4436 | */ |
| 4437 | guard_mask |= 1ull << (runs - 1); |
| 4438 | g++; |
| 4439 | |
| 4440 | /* |
| 4441 | * While we randomize the chunks lengths, an attacker with |
| 4442 | * precise timing control can guess when overflows happen, |
| 4443 | * and "measure" the runs, which gives them an indication |
| 4444 | * of where the next run start offset is. |
| 4445 | * |
| 4446 | * In order to make this knowledge unusable, add a guard page |
| 4447 | * _before_ the new run with a 25% probability, regardless |
| 4448 | * of whether we had enough guard pages. |
| 4449 | */ |
| 4450 | if ((rnum & 3) == 0) { |
| 4451 | lead_guard = true; |
| 4452 | g++; |
| 4453 | } |
| 4454 | if (guards > g) { |
| 4455 | guard_mask |= zalloc_random_bits(guards - g, runs - 1);
| 4456 | } else { |
| 4457 | guards = g; |
| 4458 | } |
| 4459 | } else { |
| 4460 | assert3u(runs, ==, 1); |
| 4461 | assert3u(guards, <=, 1); |
| 4462 | guard_mask = guards << (runs - 1); |
| 4463 | } |
| 4464 | #else |
| 4465 | (void)rnum; |
| 4466 | #endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) */ |
| 4467 | |
| 4468 | if (zone_submap_is_sequestered(zsflags)) { |
| 4469 | kr = zone_submap_alloc_sequestered_va(zsflags,
| 4470 | pages + guards, &addr);
| 4471 | } else {
| 4472 | assert(zsflags.z_submap_idx != Z_SUBMAP_IDX_READ_ONLY);
| 4473 | kr = kmem_alloc(zone_submap(zsflags), &addr,
| 4474 | ptoa(pages + guards), kmaflags, VM_KERN_MEMORY_ZONE);
| 4475 | } |
| 4476 | |
| 4477 | if (kr != KERN_SUCCESS) { |
| 4478 | uint64_t zone_size = 0; |
| 4479 | zone_t zone_largest = zone_find_largest(&zone_size);
| 4480 | panic("zalloc[%d]: zone map exhausted while allocating from zone [%s%s], "
| 4481 | "likely due to memory leak in zone [%s%s] "
| 4482 | "(%u%c, %d elements allocated)",
| 4483 | kr, zone_heap_name(z), zone_name(z), |
| 4484 | zone_heap_name(zone_largest), zone_name(zone_largest), |
| 4485 | mach_vm_size_pretty(zone_size), |
| 4486 | mach_vm_size_unit(zone_size), |
| 4487 | zone_count_allocated(zone_largest)); |
| 4488 | } |
| 4489 | |
| 4490 | meta = zone_meta_from_addr(addr); |
| 4491 | zone_meta_populate(addr, ptoa(pages + guards));
| 4492 | |
| 4493 | /* |
| 4494 | * Handle the leading guard page if any |
| 4495 | */ |
| 4496 | if (lead_guard) { |
| 4497 | meta[0].zm_index = zone_index(z); |
| 4498 | meta[0].zm_chunk_len = ZM_PGZ_GUARD; |
| 4499 | meta[0].zm_guarded = true; |
| 4500 | meta++; |
| 4501 | } |
| 4502 | |
| 4503 | for (uint32_t run = 0, n = 0; run < runs; run++) { |
| 4504 | bool guarded = (guard_mask >> run) & 1; |
| 4505 | |
| 4506 | for (uint32_t i = 0; i < chunk_pages; i++, n++) { |
| 4507 | meta[n].zm_index = zone_index(z); |
| 4508 | meta[n].zm_guarded = guarded; |
| 4509 | } |
| 4510 | if (guarded) { |
| 4511 | meta[n].zm_index = zone_index(z); |
| 4512 | meta[n].zm_chunk_len = ZM_PGZ_GUARD; |
| 4513 | n++; |
| 4514 | } |
| 4515 | } |
| 4516 | if (guards) { |
| 4517 | os_atomic_add(&zone_guard_pages, guards, relaxed); |
| 4518 | } |
| 4519 | |
| 4520 | #if ZSECURITY_CONFIG(SAD_FENG_SHUI) |
| 4521 | if (__improbable(zone_caching_disabled < 0)) { |
| 4522 | return zone_scramble_va_and_unlock(z, meta, runs, pages, |
| 4523 | chunk_pages, guard_mask); |
| 4524 | } |
| 4525 | #endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) */ |
| 4526 | |
| 4527 | zone_lock(z);
| 4528 |
| 4529 | for (uint32_t run = 0, n = 0; run < runs; run++) {
| 4530 | zone_meta_queue_push(z, &z->z_pageq_va, meta + n);
| 4531 | n += chunk_pages + ((guard_mask >> run) & 1); |
| 4532 | } |
| 4533 | z->z_va_cur += z->z_percpu ? runs : pages; |
| 4534 | } |
| 4535 | |
| 4536 | static inline void |
| 4537 | ZONE_TRACE_VM_KERN_REQUEST_START(vm_size_t size) |
| 4538 | { |
| 4539 | #if DEBUG || DEVELOPMENT |
| 4540 | VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START, |
| 4541 | size, 0, 0, 0); |
| 4542 | #else |
| 4543 | (void)size; |
| 4544 | #endif |
| 4545 | } |
| 4546 | |
| 4547 | static inline void |
| 4548 | ZONE_TRACE_VM_KERN_REQUEST_END(uint32_t pages) |
| 4549 | { |
| 4550 | #if DEBUG || DEVELOPMENT |
| 4551 | task_t task = current_task_early(); |
| 4552 | if (pages && task) { |
| 4553 | ledger_credit(task->ledger, task_ledgers.pages_grabbed_kern, pages); |
| 4554 | } |
| 4555 | VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END, |
| 4556 | pages, 0, 0, 0); |
| 4557 | #else |
| 4558 | (void)pages; |
| 4559 | #endif |
| 4560 | } |
| 4561 | |
| 4562 | __attribute__((noinline)) |
| 4563 | static void |
| 4564 | __ZONE_MAP_EXHAUSTED_AND_WAITING_FOR_GC__(zone_t z, uint32_t pgs) |
| 4565 | { |
| 4566 | uint64_t wait_start = 0; |
| 4567 | long mapped; |
| 4568 | |
| 4569 | thread_wakeup(VM_PAGEOUT_GC_EVENT); |
| 4570 | |
| 4571 | if (zone_supports_vm(z) || (current_thread()->options & TH_OPT_VMPRIV)) { |
| 4572 | return; |
| 4573 | } |
| 4574 | |
| 4575 | mapped = os_atomic_load(&zone_pages_wired, relaxed); |
| 4576 | |
| 4577 | /* |
| 4578 | * If the zone map is really exhausted, wait on the GC thread, |
| 4579 | * donating our priority (which is important because the GC |
| 4580 | * thread is at a rather low priority). |
| 4581 | */ |
| 4582 | for (uint32_t n = 1; mapped >= zone_pages_wired_max - pgs; n++) { |
| 4583 | uint32_t wait_ms = n * (n + 1) / 2; |
| 4584 | uint64_t interval; |
| 4585 | |
| 4586 | if (n == 1) { |
| 4587 | wait_start = mach_absolute_time(); |
| 4588 | } else { |
| 4589 | thread_wakeup(VM_PAGEOUT_GC_EVENT); |
| 4590 | } |
| 4591 | if (zone_exhausted_timeout > 0 && |
| 4592 | wait_ms > zone_exhausted_timeout) { |
| 4593 | panic("zone map exhaustion: waited for %dms " |
| 4594 | "(pages: %ld, max: %ld, wanted: %d)" , |
| 4595 | wait_ms, mapped, zone_pages_wired_max, pgs); |
| 4596 | } |
| 4597 | |
| 4598 | clock_interval_to_absolutetime_interval(wait_ms, NSEC_PER_MSEC,
| 4599 | &interval);
| 4600 |
| 4601 | lck_spin_lock(&zone_exhausted_lock);
| 4602 | lck_spin_sleep_with_inheritor(&zone_exhausted_lock,
| 4603 | LCK_SLEEP_UNLOCK, &zone_pages_wired,
| 4604 | vm_pageout_gc_thread, THREAD_UNINT, wait_start + interval);
| 4605 | |
| 4606 | mapped = os_atomic_load(&zone_pages_wired, relaxed); |
| 4607 | } |
| 4608 | } |
| 4609 | |
| 4610 | static bool |
| 4611 | zone_expand_wait_for_pages(bool waited) |
| 4612 | { |
| 4613 | if (waited) { |
| 4614 | return false; |
| 4615 | } |
| 4616 | #if DEBUG || DEVELOPMENT |
| 4617 | if (zalloc_simulate_vm_pressure) { |
| 4618 | return false; |
| 4619 | } |
| 4620 | #endif /* DEBUG || DEVELOPMENT */ |
| 4621 | return !vm_pool_low(); |
| 4622 | } |
| 4623 | |
| 4624 | static inline void |
| 4625 | zone_expand_async_schedule_if_allowed(zone_t zone) |
| 4626 | { |
| 4627 | if (zone->z_async_refilling || zone->no_callout) { |
| 4628 | return; |
| 4629 | } |
| 4630 | |
| 4631 | if (zone_exhausted(zone)) { |
| 4632 | return; |
| 4633 | } |
| 4634 | |
| 4635 | if (__improbable(startup_phase < STARTUP_SUB_EARLY_BOOT)) { |
| 4636 | return; |
| 4637 | } |
| 4638 | |
| 4639 | if (!vm_pool_low() || zone_supports_vm(zone)) {
| 4640 | zone->z_async_refilling = true;
| 4641 | thread_call_enter(&zone_expand_callout);
| 4642 | } |
| 4643 | } |
| 4644 | |
| 4645 | __attribute__((noinline)) |
| 4646 | static bool |
| 4647 | zalloc_expand_drain_exhausted_caches_locked(zone_t z) |
| 4648 | { |
| 4649 | struct zone_depot zd; |
| 4650 | zone_magazine_t mag = NULL; |
| 4651 | |
| 4652 | if (z->z_depot_size) { |
| 4653 | z->z_depot_size = 0; |
| 4654 | z->z_depot_cleanup = true; |
| 4655 | |
| 4656 | zone_depot_init(&zd);
| 4657 | zone_depot_trim(z, 0, &zd);
| 4658 | |
| 4659 | zone_recirc_lock_nopreempt(z);
| 4660 | if (zd.zd_full) {
| 4661 | zone_depot_move_full(&z->z_recirc,
| 4662 | &zd, zd.zd_full, NULL);
| 4663 | }
| 4664 | if (zd.zd_empty) {
| 4665 | zone_depot_move_empty(&z->z_recirc,
| 4666 | &zd, zd.zd_empty, NULL);
| 4667 | }
| 4668 | zone_recirc_unlock_nopreempt(z);
| 4669 | } |
| 4670 | |
| 4671 | zone_recirc_lock_nopreempt(zone: z); |
| 4672 | if (z->z_recirc.zd_full) { |
| 4673 | mag = zone_depot_pop_head_full(zd: &z->z_recirc, z); |
| 4674 | } |
| 4675 | zone_recirc_unlock_nopreempt(zone: z); |
| 4676 | |
| 4677 | if (mag) { |
| 4678 | zone_reclaim_elements(z, n: zc_mag_size(), elems: mag->zm_elems); |
| 4679 | zone_magazine_free(mag); |
| 4680 | } |
| 4681 | |
| 4682 | return mag != NULL; |
| 4683 | } |
| 4684 | |
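| | /*
| |  * Returns true when the zone should be refilled: its free count dipped to
| |  * the reserve and, if the zone is exhausted, draining the caches didn't
| |  * help and the caller insists with Z_NOFAIL.
| |  */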
| 4685 | static bool |
| 4686 | zalloc_needs_refill(zone_t zone, zalloc_flags_t flags) |
| 4687 | { |
| 4688 | if (zone->z_elems_free > zone->z_elems_rsv) { |
| 4689 | return false; |
| 4690 | } |
| 4691 | if (!zone_exhausted(zone)) { |
| 4692 | return true; |
| 4693 | } |
| 4694 | if (zone->z_pcpu_cache && zone->z_depot_size) { |
| 4695 | if (zalloc_expand_drain_exhausted_caches_locked(zone)) {
| 4696 | return false; |
| 4697 | } |
| 4698 | } |
| 4699 | return (flags & Z_NOFAIL) != 0; |
| 4700 | } |
| 4701 | |
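| | /*
| |  * Clear the exhaustion-wait state, notify ZONE_EXHAUSTED event handlers
| |  * that the condition cleared, and wake the threads sleeping in
| |  * __ZONE_EXHAUSTED_AND_WAITING_HARD__().
| |  */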
| 4702 | static void |
| 4703 | zone_wakeup_exhausted_waiters(zone_t z) |
| 4704 | { |
| 4705 | z->z_exhausted_wait = false; |
| 4706 | EVENT_INVOKE(ZONE_EXHAUSTED, zone_index(z), z, false); |
| 4707 | thread_wakeup(&z->z_expander); |
| 4708 | } |
| 4709 | |
| 4710 | __attribute__((noinline)) |
| 4711 | static void |
| 4712 | __ZONE_EXHAUSTED_AND_WAITING_HARD__(zone_t z) |
| 4713 | { |
| 4714 | if (z->z_pcpu_cache && z->z_depot_size && |
| 4715 | zalloc_expand_drain_exhausted_caches_locked(z)) { |
| 4716 | return; |
| 4717 | } |
| 4718 | |
| 4719 | if (!z->z_exhausted_wait) { |
| 4720 | zone_recirc_lock_nopreempt(z);
| 4721 | z->z_exhausted_wait = true;
| 4722 | zone_recirc_unlock_nopreempt(z);
| 4723 | EVENT_INVOKE(ZONE_EXHAUSTED, zone_index(z), z, true);
| 4724 | }
| 4725 | 
| 4726 | assert_wait(&z->z_expander, TH_UNINT);
| 4727 | zone_unlock(z);
| 4728 | thread_block(THREAD_CONTINUE_NULL);
| 4729 | zone_lock(z);
| 4730 | } |
| 4731 | |
| 4732 | static pmap_mapping_type_t |
| 4733 | zone_mapping_type(zone_t z) |
| 4734 | { |
| 4735 | zone_security_flags_t zsflags = zone_security_config(z); |
| 4736 | |
| 4737 | /* |
| 4738 |  * If the zone's z_submap_idx is neither Z_SUBMAP_IDX_DATA nor
| 4739 | * Z_SUBMAP_IDX_READ_ONLY, mark the corresponding mapping |
| 4740 | * type as PMAP_MAPPING_TYPE_RESTRICTED. |
| 4741 | */ |
| 4742 | switch (zsflags.z_submap_idx) { |
| 4743 | case Z_SUBMAP_IDX_DATA: |
| 4744 | return PMAP_MAPPING_TYPE_DEFAULT; |
| 4745 | case Z_SUBMAP_IDX_READ_ONLY: |
| 4746 | return PMAP_MAPPING_TYPE_ROZONE; |
| 4747 | default: |
| 4748 | return PMAP_MAPPING_TYPE_RESTRICTED; |
| 4749 | } |
| 4750 | } |
| 4751 | |
| 4752 | static vm_prot_t |
| 4753 | zone_page_prot(zone_security_flags_t zsflags) |
| 4754 | { |
| 4755 | switch (zsflags.z_submap_idx) { |
| 4756 | case Z_SUBMAP_IDX_READ_ONLY: |
| 4757 | return VM_PROT_READ; |
| 4758 | default: |
| 4759 | return VM_PROT_READ | VM_PROT_WRITE; |
| 4760 | } |
| 4761 | } |
| 4762 | |
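| | /*
| |  * zone_expand_locked() is the slow path that actually grows a zone: it
| |  * elects an expander (a VM-privileged thread may take over from a less
| |  * privileged one), carves out VA from partial chunks or the sequester
| |  * list, grabs and zero-fills physical pages, populates the kernel object,
| |  * and crams the new chunk into the zone. Called and returns with the zone
| |  * lock held; the lock is dropped around page allocation.
| |  */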
| 4763 | static void |
| 4764 | zone_expand_locked(zone_t z, zalloc_flags_t flags) |
| 4765 | { |
| 4766 | zone_security_flags_t zsflags = zone_security_config(z); |
| 4767 | struct zone_expand ze = { |
| 4768 | .ze_thread = current_thread(), |
| 4769 | }; |
| 4770 | |
| 4771 | if (!(ze.ze_thread->options & TH_OPT_VMPRIV) && zone_supports_vm(z)) { |
| 4772 | ze.ze_thread->options |= TH_OPT_VMPRIV; |
| 4773 | ze.ze_clear_priv = true; |
| 4774 | } |
| 4775 | |
| 4776 | if (ze.ze_thread->options & TH_OPT_VMPRIV) { |
| 4777 | /* |
| 4778 | * When the thread is VM privileged, |
| 4779 | * vm_page_grab() will call VM_PAGE_WAIT() |
| 4780 |  * without our knowledge, so we must
| 4781 |  * unfortunately assume it will be called.
| 4782 | * |
| 4783 | * In practice it's not a big deal because |
| 4784 | * Z_NOPAGEWAIT is not really used on zones |
| 4785 | * that VM privileged threads are going to expand. |
| 4786 | */ |
| 4787 | ze.ze_pg_wait = true; |
| 4788 | ze.ze_vm_priv = true; |
| 4789 | } |
| 4790 | |
| 4791 | for (;;) { |
| 4792 | if (!z->z_permanent && !zalloc_needs_refill(z, flags)) {
| 4793 | goto out; |
| 4794 | } |
| 4795 | |
| 4796 | if (z->z_expander == NULL) { |
| 4797 | z->z_expander = &ze; |
| 4798 | break; |
| 4799 | } |
| 4800 | |
| 4801 | if (ze.ze_vm_priv && !z->z_expander->ze_vm_priv) { |
| 4802 | change_sleep_inheritor(&z->z_expander, ze.ze_thread);
| 4803 | ze.ze_next = z->z_expander; |
| 4804 | z->z_expander = &ze; |
| 4805 | break; |
| 4806 | } |
| 4807 | |
| 4808 | if ((flags & Z_NOPAGEWAIT) && z->z_expander->ze_pg_wait) { |
| 4809 | goto out; |
| 4810 | } |
| 4811 | |
| 4812 | z->z_expanding_wait = true; |
| 4813 | hw_lck_ticket_sleep_with_inheritor(&z->z_lock, &zone_locks_grp,
| 4814 | LCK_SLEEP_DEFAULT, &z->z_expander, z->z_expander->ze_thread,
| 4815 | TH_UNINT, TIMEOUT_WAIT_FOREVER); |
| 4816 | } |
| 4817 | |
| 4818 | do { |
| 4819 | struct zone_page_metadata *meta = NULL; |
| 4820 | uint32_t new_va = 0, cur_pages = 0, min_pages = 0, pages = 0; |
| 4821 | vm_page_t page_list = NULL; |
| 4822 | vm_offset_t addr = 0; |
| 4823 | int waited = 0; |
| 4824 | |
| 4825 | if ((flags & Z_NOFAIL) && zone_exhausted(z)) {
| 4826 | __ZONE_EXHAUSTED_AND_WAITING_HARD__(z); |
| 4827 | continue; /* reevaluate if we really need it */ |
| 4828 | } |
| 4829 | |
| 4830 | /* |
| 4831 | * While we hold the zone lock, look if there's VA we can: |
| 4832 | * - complete from partial pages, |
| 4833 | * - reuse from the sequester list. |
| 4834 | * |
| 4835 | * When the page is being populated we pretend we allocated |
| 4836 | * an extra element so that zone_gc() can't attempt to free |
| 4837 | * the chunk (as it could become empty while we wait for pages). |
| 4838 | */ |
| 4839 | if (zone_pva_is_null(z->z_pageq_va)) {
| 4840 | zone_allocate_va_locked(z, flags); |
| 4841 | } |
| 4842 | |
| 4843 | meta = zone_meta_queue_pop(z, &z->z_pageq_va);
| 4844 | addr = zone_meta_to_addr(meta); |
| 4845 | if (meta->zm_chunk_len == ZM_SECONDARY_PAGE) { |
| 4846 | cur_pages = meta->zm_page_index; |
| 4847 | meta -= cur_pages; |
| 4848 | addr -= ptoa(cur_pages); |
| 4849 | zone_meta_lock_in_partial(z, meta, cur_pages);
| 4850 | }
| 4851 | zone_unlock(z);
| 4852 | |
| 4853 | /* |
| 4854 | * And now allocate pages to populate our VA. |
| 4855 | */ |
| 4856 | min_pages = z->z_chunk_pages; |
| 4857 | #if !KASAN_CLASSIC |
| 4858 | if (!z->z_percpu) { |
| 4859 | min_pages = (uint32_t)atop(round_page(zone_elem_outer_offs(z) + |
| 4860 | zone_elem_outer_size(z))); |
| 4861 | } |
| 4862 | #endif /* !KASAN_CLASSIC */ |
| 4863 | |
| 4864 | /* |
| 4865 | * Trigger jetsams via VM_PAGEOUT_GC_EVENT |
| 4866 | * if we're running out of zone memory |
| 4867 | */ |
| 4868 | if (__improbable(zone_map_nearing_exhaustion())) { |
| 4869 | __ZONE_MAP_EXHAUSTED_AND_WAITING_FOR_GC__(z, min_pages);
| 4870 | } |
| 4871 | |
| 4872 | ZONE_TRACE_VM_KERN_REQUEST_START(ptoa(z->z_chunk_pages - cur_pages)); |
| 4873 | |
| 4874 | while (pages < z->z_chunk_pages - cur_pages) { |
| 4875 | vm_page_t m = vm_page_grab(); |
| 4876 | |
| 4877 | if (m) { |
| 4878 | pages++; |
| 4879 | m->vmp_snext = page_list; |
| 4880 | page_list = m; |
| 4881 | vm_page_zero_fill(m);
| 4882 | continue; |
| 4883 | } |
| 4884 | |
| 4885 | if (pages >= min_pages && |
| 4886 | !zone_expand_wait_for_pages(waited)) { |
| 4887 | break; |
| 4888 | } |
| 4889 | |
| 4890 | if ((flags & Z_NOPAGEWAIT) == 0) { |
| 4891 | /* |
| 4892 | * The first time we're about to wait for pages, |
| 4893 | * mention that to waiters and wake them all. |
| 4894 | * |
| 4895 | * Set `ze_pg_wait` in our zone_expand context |
| 4896 | * so that waiters who care do not wait again. |
| 4897 | */ |
| 4898 | if (!ze.ze_pg_wait) { |
| 4899 | zone_lock(z);
| 4900 | if (z->z_expanding_wait) {
| 4901 | z->z_expanding_wait = false;
| 4902 | wakeup_all_with_inheritor(&z->z_expander,
| 4903 | THREAD_AWAKENED);
| 4904 | }
| 4905 | ze.ze_pg_wait = true;
| 4906 | zone_unlock(z);
| 4907 | } |
| 4908 | |
| 4909 | waited++; |
| 4910 | VM_PAGE_WAIT(); |
| 4911 | continue; |
| 4912 | } |
| 4913 | |
| 4914 | /* |
| 4915 | * Undo everything and bail out: |
| 4916 | * |
| 4917 | * - free pages |
| 4918 | * - undo the fake allocation if any |
| 4919 | * - put the VA back on the VA page queue. |
| 4920 | */ |
| 4921 | vm_page_free_list(page_list, FALSE);
| 4922 | ZONE_TRACE_VM_KERN_REQUEST_END(pages); |
| 4923 | |
| 4924 | zone_lock(z);
| 4925 | 
| 4926 | zone_expand_async_schedule_if_allowed(z);
| 4927 | 
| 4928 | if (cur_pages) {
| 4929 | zone_meta_unlock_from_partial(z, meta, cur_pages);
| 4930 | }
| 4931 | if (meta) {
| 4932 | zone_meta_queue_push(z, &z->z_pageq_va,
| 4933 | meta + cur_pages);
| 4934 | } |
| 4935 | goto page_shortage; |
| 4936 | } |
| 4937 | |
| 4938 | vm_object_t object; |
| 4939 | object = kernel_object_default; |
| 4940 | vm_object_lock(object); |
| 4941 | |
| 4942 | kernel_memory_populate_object_and_unlock(object, |
| 4943 | addr + ptoa(cur_pages), addr + ptoa(cur_pages), ptoa(pages), page_list,
| 4944 | zone_kma_flags(z, zsflags, flags), VM_KERN_MEMORY_ZONE,
| 4945 | zone_page_prot(zsflags), zone_mapping_type(z));
| 4946 | |
| 4947 | ZONE_TRACE_VM_KERN_REQUEST_END(pages); |
| 4948 | |
| 4949 | zcram_and_lock(z, addr, new_va, cur_pages, cur_pages + pages, 0);
| 4950 | |
| 4951 | /* |
| 4952 | * permanent zones only try once, |
| 4953 | * the retry loop is in the caller |
| 4954 | */ |
| 4955 | } while (!z->z_permanent && zalloc_needs_refill(z, flags));
| 4956 | |
| 4957 | page_shortage: |
| 4958 | if (z->z_expander == &ze) { |
| 4959 | z->z_expander = ze.ze_next; |
| 4960 | } else { |
| 4961 | assert(z->z_expander->ze_next == &ze); |
| 4962 | z->z_expander->ze_next = NULL; |
| 4963 | } |
| 4964 | if (z->z_expanding_wait) { |
| 4965 | z->z_expanding_wait = false; |
| 4966 | wakeup_all_with_inheritor(&z->z_expander, THREAD_AWAKENED);
| 4967 | } |
| 4968 | out: |
| 4969 | if (ze.ze_clear_priv) { |
| 4970 | ze.ze_thread->options &= ~TH_OPT_VMPRIV; |
| 4971 | } |
| 4972 | } |
| 4973 | |
| 4974 | static void |
| 4975 | zone_expand_async(__unused thread_call_param_t p0, __unused thread_call_param_t p1) |
| 4976 | { |
| 4977 | zone_foreach(z) { |
| 4978 | if (z->no_callout) { |
| 4979 | /* z_async_refilling will never be set */ |
| 4980 | continue; |
| 4981 | } |
| 4982 | |
| 4983 | if (!z->z_async_refilling) { |
| 4984 | /* |
| 4985 | * avoid locking all zones, because the one(s) |
| 4986 | * we're looking for have been set _before_ |
| 4987 | * thread_call_enter() was called, if we fail |
| 4988 | * to observe the bit, it means the thread-call |
| 4989 | * has been "dinged" again and we'll notice it then. |
| 4990 | */ |
| 4991 | continue; |
| 4992 | } |
| 4993 | |
| 4994 | zone_lock(z);
| 4995 | if (z->z_self && z->z_async_refilling) {
| 4996 | zone_expand_locked(z, Z_WAITOK);
| 4997 | /*
| 4998 |  * Clearing the flag _after_ we grow is important,
| 4999 |  * so that we avoid waking up the thread call
| 5000 |  * while we grow and causing it to run a second time.
| 5001 |  */
| 5002 | z->z_async_refilling = false;
| 5003 | }
| 5004 | zone_unlock(z);
| 5005 | } |
| 5006 | } |
| 5007 | |
| 5008 | #endif /* !ZALLOC_TEST */ |
| 5009 | #pragma mark zone jetsam integration |
| 5010 | #if !ZALLOC_TEST |
| 5011 | |
| 5012 | /* |
| 5013 | * We're being very conservative here and picking a value of 95%. We might need to lower this if |
| 5014 | * we find that we're not catching the problem and are still hitting zone map exhaustion panics. |
| 5015 | */ |
| 5016 | #define ZONE_MAP_JETSAM_LIMIT_DEFAULT 95 |
| 5017 | |
| 5018 | /* |
| 5019 | * Threshold above which largest zones should be included in the panic log |
| 5020 | */ |
| 5021 | #define ZONE_MAP_EXHAUSTION_PRINT_PANIC 80 |
| 5022 | |
| 5023 | /* |
| 5024 | * Trigger zone-map-exhaustion jetsams if the zone map is X% full, |
| 5025 | * where X=zone_map_jetsam_limit. |
| 5026 | * |
| 5027 | * Can be set via boot-arg "zone_map_jetsam_limit". Set to 95% by default. |
| 5028 | */ |
| 5029 | TUNABLE_WRITEABLE(unsigned int, zone_map_jetsam_limit, "zone_map_jetsam_limit",
| 5030 | ZONE_MAP_JETSAM_LIMIT_DEFAULT); |
| 5031 | |
| 5032 | kern_return_t |
| 5033 | zone_map_jetsam_set_limit(uint32_t value) |
| 5034 | { |
| 5035 | if (value <= 0 || value > 100) { |
| 5036 | return KERN_INVALID_VALUE; |
| 5037 | } |
| 5038 | |
| 5039 | zone_map_jetsam_limit = value; |
| 5040 | os_atomic_store(&zone_pages_jetsam_threshold, |
| 5041 | zone_pages_wired_max * value / 100, relaxed); |
| 5042 | return KERN_SUCCESS; |
| 5043 | } |
| 5044 | |
| 5045 | void |
| 5046 | get_zone_map_size(uint64_t *current_size, uint64_t *capacity) |
| 5047 | { |
| 5048 | vm_offset_t phys_pages = os_atomic_load(&zone_pages_wired, relaxed); |
| 5049 | *current_size = ptoa_64(phys_pages); |
| 5050 | *capacity = ptoa_64(zone_pages_wired_max); |
| 5051 | } |
| 5052 | |
| 5053 | void |
| 5054 | get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size) |
| 5055 | { |
| 5056 | zone_t largest_zone = zone_find_largest(zone_size); |
| 5057 | |
| 5058 | /* |
| 5059 | * Append kalloc heap name to zone name (if zone is used by kalloc) |
| 5060 | */ |
| 5061 | snprintf(zone_name, zone_name_len, "%s%s",
| 5062 | zone_heap_name(largest_zone), largest_zone->z_name);
| 5063 | } |
| 5064 | |
| 5065 | static bool |
| 5066 | zone_map_nearing_threshold(unsigned int threshold) |
| 5067 | { |
| 5068 | uint64_t phys_pages = os_atomic_load(&zone_pages_wired, relaxed); |
| 5069 | return phys_pages * 100 > zone_pages_wired_max * threshold; |
| 5070 | } |
| 5071 | |
| 5072 | bool |
| 5073 | zone_map_nearing_exhaustion(void) |
| 5074 | { |
| 5075 | vm_size_t pages = os_atomic_load(&zone_pages_wired, relaxed); |
| 5076 | |
| 5077 | return pages >= os_atomic_load(&zone_pages_jetsam_threshold, relaxed); |
| 5078 | } |
| 5079 | |
| 5080 | |
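| | /*
| |  * If the number of allocated VM map entries is within this percentage of
| |  * the number of VM objects, treat the VM map entries zone as the largest
| |  * one so that a single offending process can be targeted (see
| |  * kill_process_in_largest_zone() below).
| |  */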
| 5081 | #define VMENTRY_TO_VMOBJECT_COMPARISON_RATIO 98 |
| 5082 | |
| 5083 | /* |
| 5084 | * Tries to kill a single process if it can attribute one to the largest zone. If not, wakes up the memorystatus thread |
| 5085 | * to walk through the jetsam priority bands and kill processes. |
| 5086 | */ |
| 5087 | static zone_t |
| 5088 | kill_process_in_largest_zone(void) |
| 5089 | { |
| 5090 | pid_t pid = -1; |
| 5091 | uint64_t zone_size = 0; |
| 5092 | zone_t largest_zone = zone_find_largest(&zone_size);
| 5093 | 
| 5094 | printf("zone_map_exhaustion: Zone mapped %lld of %lld, used %lld, capacity %lld [jetsam limit %d%%]\n",
| 5095 | ptoa_64(os_atomic_load(&zone_pages_wired, relaxed)),
| 5096 | ptoa_64(zone_pages_wired_max),
| 5097 | (uint64_t)zone_submaps_approx_size(),
| 5098 | (uint64_t)mach_vm_range_size(&zone_info.zi_map_range),
| 5099 | zone_map_jetsam_limit);
| 5100 | printf("zone_map_exhaustion: Largest zone %s%s, size %lu\n", zone_heap_name(largest_zone),
| 5101 | largest_zone->z_name, (uintptr_t)zone_size); |
| 5102 | |
| 5103 | /* |
| 5104 | * We want to make sure we don't call this function from userspace. |
| 5105 |  * We want to make sure we don't call this function from userspace,
| 5106 |  * or we could end up trying to synchronously kill the process
| 5107 | */ |
| 5108 | assert(current_task() == kernel_task); |
| 5109 | |
| 5110 | /* |
| 5111 | * If vm_object_zone is the largest, check to see if the number of |
| 5112 | * elements in vm_map_entry_zone is comparable. |
| 5113 | * |
| 5114 | * If so, consider vm_map_entry_zone as the largest. This lets us target |
| 5115 | * a specific process to jetsam to quickly recover from the zone map |
| 5116 | * bloat. |
| 5117 | */ |
| 5118 | if (largest_zone == vm_object_zone) { |
| 5119 | unsigned int vm_object_zone_count = zone_count_allocated(vm_object_zone);
| 5120 | unsigned int vm_map_entry_zone_count = zone_count_allocated(vm_map_entry_zone); |
| 5121 | /* Is the VM map entries zone count >= 98% of the VM objects zone count? */ |
| 5122 | if (vm_map_entry_zone_count >= ((vm_object_zone_count * VMENTRY_TO_VMOBJECT_COMPARISON_RATIO) / 100)) { |
| 5123 | largest_zone = vm_map_entry_zone; |
| 5124 | printf("zone_map_exhaustion: Picking VM map entries as the zone to target, size %lu\n",
| 5125 | (uintptr_t)zone_size_wired(largest_zone));
| 5126 | } |
| 5127 | } |
| 5128 | |
| 5129 | /* TODO: Extend this to check for the largest process in other zones as well. */ |
| 5130 | if (largest_zone == vm_map_entry_zone) { |
| 5131 | pid = find_largest_process_vm_map_entries(); |
| 5132 | } else { |
| 5133 | printf("zone_map_exhaustion: Nothing to do for the largest zone [%s%s]. "
| 5134 | "Waking up memorystatus thread.\n", zone_heap_name(largest_zone),
| 5135 | largest_zone->z_name); |
| 5136 | } |
| 5137 | if (!memorystatus_kill_on_zone_map_exhaustion(pid)) { |
| 5138 | printf("zone_map_exhaustion: Call to memorystatus failed, victim pid: %d\n", pid);
| 5139 | } |
| 5140 | |
| 5141 | return largest_zone; |
| 5142 | } |
| 5143 | |
| 5144 | #endif /* !ZALLOC_TEST */ |
| 5145 | #pragma mark probabilistic gzalloc |
| 5146 | #if !ZALLOC_TEST |
| 5147 | #if CONFIG_PROB_GZALLOC |
| 5148 | |
| 5149 | extern uint32_t random(void); |
| 5150 | struct pgz_backtrace { |
| 5151 | uint32_t pgz_depth; |
| 5152 | int32_t pgz_bt[MAX_ZTRACE_DEPTH]; |
| 5153 | }; |
| 5154 | |
| 5155 | static int32_t PERCPU_DATA(pgz_sample_counter); |
| 5156 | static SECURITY_READ_ONLY_LATE(struct pgz_backtrace *) pgz_backtraces; |
| 5157 | static uint32_t pgz_uses; /* number of zones using PGZ */ |
| 5158 | static int32_t pgz_slot_avail; |
| 5159 | #if OS_ATOMIC_HAS_LLSC |
| 5160 | struct zone_page_metadata *pgz_slot_head; |
| 5161 | #else |
| 5162 | static struct pgz_slot_head { |
| 5163 | uint32_t psh_count; |
| 5164 | uint32_t psh_slot; |
| 5165 | } pgz_slot_head; |
| 5166 | #endif |
| 5167 | struct zone_page_metadata *pgz_slot_tail; |
| 5168 | static SECURITY_READ_ONLY_LATE(vm_map_t) pgz_submap; |
| 5169 | |
| 5170 | static struct zone_page_metadata * |
| 5171 | pgz_meta(uint32_t index) |
| 5172 | { |
| 5173 | return &zone_info.zi_pgz_meta[2 * index + 1]; |
| 5174 | } |
| 5175 | |
| 5176 | static struct pgz_backtrace * |
| 5177 | pgz_bt(uint32_t slot, bool free) |
| 5178 | { |
| 5179 | return &pgz_backtraces[2 * slot + free]; |
| 5180 | } |
| 5181 | |
| 5182 | static void |
| 5183 | pgz_backtrace(struct pgz_backtrace *bt, void *fp) |
| 5184 | { |
| 5185 | struct backtrace_control ctl = { |
| 5186 | .btc_frame_addr = (uintptr_t)fp, |
| 5187 | }; |
| 5188 | |
| 5189 | bt->pgz_depth = (uint32_t)backtrace_packed(BTP_KERN_OFFSET_32, |
| 5190 | (uint8_t *)bt->pgz_bt, sizeof(bt->pgz_bt), &ctl, NULL) / 4; |
| 5191 | } |
| 5192 | |
| 5193 | static uint32_t |
| 5194 | pgz_slot(vm_offset_t addr) |
| 5195 | { |
| 5196 | return (uint32_t)((addr - zone_info.zi_pgz_range.min_address) >> (PAGE_SHIFT + 1)); |
| 5197 | } |
| 5198 | |
| 5199 | static vm_offset_t |
| 5200 | pgz_addr(uint32_t slot) |
| 5201 | { |
| 5202 | return zone_info.zi_pgz_range.min_address + ptoa(2 * slot + 1); |
| 5203 | } |
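| | 
| | /*
| |  * PGZ address-space layout, as implied by pgz_meta()/pgz_slot()/pgz_addr():
| |  * the submap alternates guard pages and data pages, so slot N maps to page
| |  * 2*N+1 of the range and every protected allocation is bracketed by
| |  * unmapped guard pages, turning linear overflows into faults.
| |  */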
| 5204 | |
| 5205 | static bool |
| 5206 | pgz_sample(vm_offset_t addr, vm_size_t esize) |
| 5207 | { |
| 5208 | int32_t *counterp, cnt; |
| 5209 | |
| 5210 | if (zone_addr_size_crosses_page(addr, esize)) { |
| 5211 | return false; |
| 5212 | } |
| 5213 | |
| 5214 | /* |
| 5215 | * Note: accessing pgz_sample_counter is racy but this is |
| 5216 | * kind of acceptable given that this is not |
| 5217 |  * a security load-bearing feature.
| 5218 | */ |
| 5219 | |
| 5220 | counterp = PERCPU_GET(pgz_sample_counter); |
| 5221 | cnt = *counterp; |
| 5222 | if (__probable(cnt > 0)) { |
| 5223 | *counterp = cnt - 1; |
| 5224 | return false; |
| 5225 | } |
| 5226 | |
| 5227 | if (pgz_slot_avail <= 0) { |
| 5228 | return false; |
| 5229 | } |
| 5230 | |
| 5231 | /* |
| 5232 | * zalloc_random_uniform() might block, so when preemption is disabled, |
| 5233 | * set the counter to `-1` which will cause the next allocation |
| 5234 | * that can block to generate a new random value. |
| 5235 | * |
| 5236 | * No allocation on this CPU will sample until then. |
| 5237 | */ |
| 5238 | if (get_preemption_level()) { |
| 5239 | *counterp = -1; |
| 5240 | } else { |
| 5241 | *counterp = zalloc_random_uniform32(0, 2 * pgz_sample_rate); |
| 5242 | } |
| 5243 | |
| 5244 | return cnt == 0; |
| 5245 | } |
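| | 
| | /*
| |  * Illustrative example of the sampling math above: with
| |  * pgz_sample_rate = 1000, each per-CPU counter is drawn uniformly from
| |  * [0, 2000), so on average one allocation in roughly 1000 on that CPU
| |  * reaches cnt == 0 and gets protected (provided a slot is available).
| |  */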
| 5246 | |
| 5247 | static inline bool |
| 5248 | pgz_slot_alloc(uint32_t *slot) |
| 5249 | { |
| 5250 | struct zone_page_metadata *m; |
| 5251 | uint32_t tries = 100; |
| 5252 | |
| 5253 | disable_preemption(); |
| 5254 | |
| 5255 | #if OS_ATOMIC_USE_LLSC |
| 5256 | int32_t ov, nv; |
| 5257 | os_atomic_rmw_loop(&pgz_slot_avail, ov, nv, relaxed, { |
| 5258 | if (__improbable(ov <= 0)) { |
| 5259 | os_atomic_rmw_loop_give_up({ |
| 5260 | enable_preemption(); |
| 5261 | return false; |
| 5262 | }); |
| 5263 | } |
| 5264 | nv = ov - 1; |
| 5265 | }); |
| 5266 | #else |
| 5267 | if (__improbable(os_atomic_dec_orig(&pgz_slot_avail, relaxed) <= 0)) { |
| 5268 | os_atomic_inc(&pgz_slot_avail, relaxed); |
| 5269 | enable_preemption(); |
| 5270 | return false; |
| 5271 | } |
| 5272 | #endif |
| 5273 | |
| 5274 | again: |
| 5275 | if (__improbable(tries-- == 0)) { |
| 5276 | /* |
| 5277 | * Too much contention, |
| 5278 | * extremely unlikely but do not stay stuck. |
| 5279 | */ |
| 5280 | os_atomic_inc(&pgz_slot_avail, relaxed); |
| 5281 | enable_preemption(); |
| 5282 | return false; |
| 5283 | } |
| 5284 | |
| 5285 | #if OS_ATOMIC_HAS_LLSC |
| 5286 | uint32_t castries = 20; |
| 5287 | do { |
| 5288 | if (__improbable(castries-- == 0)) { |
| 5289 | /* |
| 5290 | * rdar://115922110 On many many cores devices, |
| 5291 | * this can fail for a very long time. |
| 5292 | */ |
| 5293 | goto again; |
| 5294 | } |
| 5295 | |
| 5296 | m = os_atomic_load_exclusive(&pgz_slot_head, dependency); |
| 5297 | if (__improbable(m->zm_pgz_slot_next == NULL)) { |
| 5298 | /* |
| 5299 | * Either we are waiting for an enqueuer (unlikely) |
| 5300 | * or we are competing with another core and |
| 5301 | * are looking at a popped element. |
| 5302 | */ |
| 5303 | os_atomic_clear_exclusive(); |
| 5304 | goto again; |
| 5305 | } |
| 5306 | } while (!os_atomic_store_exclusive(&pgz_slot_head, |
| 5307 | m->zm_pgz_slot_next, relaxed)); |
| 5308 | #else |
| 5309 | struct zone_page_metadata *base = zone_info.zi_pgz_meta; |
| 5310 | struct pgz_slot_head ov, nv; |
| 5311 | os_atomic_rmw_loop(&pgz_slot_head, ov, nv, dependency, { |
| 5312 | m = &base[ov.psh_slot * 2]; |
| 5313 | if (__improbable(m->zm_pgz_slot_next == NULL)) { |
| 5314 | /* |
| 5315 | * Either we are waiting for an enqueuer (unlikely) |
| 5316 | * or we are competing with another core and |
| 5317 | * are looking at a popped element. |
| 5318 | */ |
| 5319 | os_atomic_rmw_loop_give_up(goto again); |
| 5320 | } |
| 5321 | nv.psh_count = ov.psh_count + 1; |
| 5322 | nv.psh_slot = (uint32_t)((m->zm_pgz_slot_next - base) / 2); |
| 5323 | }); |
| 5324 | #endif |
| 5325 | |
| 5326 | enable_preemption(); |
| 5327 | |
| 5328 | m->zm_pgz_slot_next = NULL; |
| 5329 | *slot = (uint32_t)((m - zone_info.zi_pgz_meta) / 2); |
| 5330 | return true; |
| 5331 | } |
| 5332 | |
| 5333 | static inline bool |
| 5334 | pgz_slot_free(uint32_t slot) |
| 5335 | { |
| 5336 | struct zone_page_metadata *m = &zone_info.zi_pgz_meta[2 * slot]; |
| 5337 | struct zone_page_metadata *t; |
| 5338 | |
| 5339 | disable_preemption(); |
| 5340 | t = os_atomic_xchg(&pgz_slot_tail, m, relaxed); |
| 5341 | os_atomic_store(&t->zm_pgz_slot_next, m, release); |
| 5342 | os_atomic_inc(&pgz_slot_avail, relaxed); |
| 5343 | enable_preemption(); |
| 5344 | |
| 5345 | return true; |
| 5346 | } |
| 5347 | |
| 5348 | /*! |
| 5349 | * @function pgz_protect() |
| 5350 | * |
| 5351 | * @brief |
| 5352 | * Try to protect an allocation with PGZ. |
| 5353 | * |
| 5354 | * @param zone The zone the allocation was made against. |
| 5355 | * @param addr An allocated element address to protect. |
| 5356 | * @param fp The caller frame pointer (for the backtrace). |
| 5357 | * @returns The new address for the element, or @c addr. |
| 5358 | */ |
| 5359 | __attribute__((noinline)) |
| 5360 | static vm_offset_t |
| 5361 | pgz_protect(zone_t zone, vm_offset_t addr, void *fp) |
| 5362 | { |
| 5363 | kern_return_t kr; |
| 5364 | uint32_t slot; |
| 5365 | |
| 5366 | if (!pgz_slot_alloc(&slot)) { |
| 5367 | return addr; |
| 5368 | } |
| 5369 | |
| 5370 | /* |
| 5371 | * Try to double-map the page (may fail if Z_NOWAIT). |
| 5372 |  * We will always find a PA because pgz_init() pre-expanded the pmap.
| 5373 | */ |
| 5374 | pmap_paddr_t pa = kvtophys(trunc_page(addr)); |
| 5375 | vm_offset_t new_addr = pgz_addr(slot); |
| 5376 | kr = pmap_enter_options_addr(kernel_pmap, new_addr, pa, |
| 5377 | VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, TRUE, |
| 5378 | get_preemption_level() ? (PMAP_OPTIONS_NOWAIT | PMAP_OPTIONS_NOPREEMPT) : 0, |
| 5379 | NULL, PMAP_MAPPING_TYPE_INFER); |
| 5380 | |
| 5381 | if (__improbable(kr != KERN_SUCCESS)) { |
| 5382 | pgz_slot_free(slot); |
| 5383 | return addr; |
| 5384 | } |
| 5385 | |
| 5386 | struct zone_page_metadata tmp = { |
| 5387 | .zm_chunk_len = ZM_PGZ_ALLOCATED, |
| 5388 | .zm_index = zone_index(zone), |
| 5389 | }; |
| 5390 | struct zone_page_metadata *meta = pgz_meta(slot); |
| 5391 | |
| 5392 | os_atomic_store(&meta->zm_bits, tmp.zm_bits, relaxed); |
| 5393 | os_atomic_store(&meta->zm_pgz_orig_addr, addr, relaxed); |
| 5394 | pgz_backtrace(pgz_bt(slot, false), fp); |
| 5395 | |
| 5396 | return new_addr + (addr & PAGE_MASK); |
| 5397 | } |
| 5398 | |
| 5399 | /*! |
| 5400 | * @function pgz_unprotect() |
| 5401 | * |
| 5402 | * @brief |
| 5403 | * Release a PGZ slot and returns the original address of a freed element. |
| 5404 | * |
| 5405 | * @param addr A PGZ protected element address. |
| 5406 | * @param fp The caller frame pointer (for the backtrace). |
| 5407 | * @returns The non protected address for the element |
| 5408 | * that was passed to @c pgz_protect(). |
| 5409 | */ |
| 5410 | __attribute__((noinline)) |
| 5411 | static vm_offset_t |
| 5412 | pgz_unprotect(vm_offset_t addr, void *fp) |
| 5413 | { |
| 5414 | struct zone_page_metadata *meta; |
| 5415 | struct zone_page_metadata tmp; |
| 5416 | uint32_t slot; |
| 5417 | |
| 5418 | slot = pgz_slot(addr); |
| 5419 | meta = zone_meta_from_addr(addr); |
| 5420 | tmp = *meta; |
| 5421 | if (tmp.zm_chunk_len != ZM_PGZ_ALLOCATED) { |
| 5422 | goto double_free; |
| 5423 | } |
| 5424 | |
| 5425 | pmap_remove_options(kernel_pmap, vm_memtag_canonicalize_address(trunc_page(addr)), |
| 5426 | vm_memtag_canonicalize_address(trunc_page(addr) + PAGE_SIZE), |
| 5427 | PMAP_OPTIONS_REMOVE | PMAP_OPTIONS_NOPREEMPT); |
| 5428 | |
| 5429 | pgz_backtrace(pgz_bt(slot, true), fp); |
| 5430 | |
| 5431 | tmp.zm_chunk_len = ZM_PGZ_FREE; |
| 5432 | tmp.zm_bits = os_atomic_xchg(&meta->zm_bits, tmp.zm_bits, relaxed); |
| 5433 | if (tmp.zm_chunk_len != ZM_PGZ_ALLOCATED) { |
| 5434 | goto double_free; |
| 5435 | } |
| 5436 | |
| 5437 | pgz_slot_free(slot); |
| 5438 | return tmp.zm_pgz_orig_addr; |
| 5439 | |
| 5440 | double_free: |
| 5441 | panic_fault_address = addr; |
| 5442 | meta->zm_chunk_len = ZM_PGZ_DOUBLE_FREE; |
| 5443 | panic("probabilistic gzalloc double free: %p", (void *)addr);
| 5444 | } |
| 5445 | |
| 5446 | bool |
| 5447 | pgz_owned(mach_vm_address_t addr) |
| 5448 | { |
| 5449 | return mach_vm_range_contains(&zone_info.zi_pgz_range, vm_memtag_canonicalize_address(addr)); |
| 5450 | } |
| 5451 | |
| 5452 | |
| 5453 | __attribute__((always_inline)) |
| 5454 | vm_offset_t |
| 5455 | __pgz_decode(mach_vm_address_t addr, mach_vm_size_t size) |
| 5456 | { |
| 5457 | struct zone_page_metadata *meta; |
| 5458 | |
| 5459 | if (__probable(!pgz_owned(addr))) { |
| 5460 | return (vm_offset_t)addr; |
| 5461 | } |
| 5462 | |
| 5463 | if (zone_addr_size_crosses_page(addr, size)) { |
| 5464 | panic("invalid size for PGZ protected address %p:%p",
| 5465 | (void *)addr, (void *)(addr + size)); |
| 5466 | } |
| 5467 | |
| 5468 | meta = zone_meta_from_addr((vm_offset_t)addr); |
| 5469 | if (meta->zm_chunk_len != ZM_PGZ_ALLOCATED) { |
| 5470 | panic_fault_address = (vm_offset_t)addr; |
| 5471 | panic("probabilistic gzalloc use-after-free: %p", (void *)addr);
| 5472 | } |
| 5473 | |
| 5474 | return trunc_page(meta->zm_pgz_orig_addr) + (addr & PAGE_MASK); |
| 5475 | } |
| 5476 | |
| 5477 | __attribute__((always_inline)) |
| 5478 | vm_offset_t |
| 5479 | __pgz_decode_allow_invalid(vm_offset_t addr, zone_id_t zid) |
| 5480 | { |
| 5481 | struct zone_page_metadata *meta; |
| 5482 | struct zone_page_metadata tmp; |
| 5483 | |
| 5484 | if (__probable(!pgz_owned(addr))) { |
| 5485 | return addr; |
| 5486 | } |
| 5487 | |
| 5488 | meta = zone_meta_from_addr(addr); |
| 5489 | tmp.zm_bits = os_atomic_load(&meta->zm_bits, relaxed); |
| 5490 | |
| 5491 | addr = trunc_page(meta->zm_pgz_orig_addr) + (addr & PAGE_MASK); |
| 5492 | |
| 5493 | if (tmp.zm_chunk_len != ZM_PGZ_ALLOCATED) { |
| 5494 | return 0; |
| 5495 | } |
| 5496 | |
| 5497 | if (zid != ZONE_ID_ANY && tmp.zm_index != zid) { |
| 5498 | return 0; |
| 5499 | } |
| 5500 | |
| 5501 | return addr; |
| 5502 | } |
| 5503 | |
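| | /*
| |  * Decide at zone creation time whether a zone participates in PGZ: either
| |  * every eligible zone when pgz_all is enabled, or the zones matching the
| |  * "pgz1", "pgz2", ... boot-args (e.g. booting with pgz1=<zone name>; the
| |  * name here is purely illustrative, any zone or kalloc heap name accepted
| |  * by track_this_zone()/track_kalloc_zones() works). Zones with elements
| |  * larger than a page are never tracked.
| |  */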
| 5504 | static void |
| 5505 | pgz_zone_init(zone_t z) |
| 5506 | { |
| 5507 | char zn[MAX_ZONE_NAME]; |
| 5508 | char zv[MAX_ZONE_NAME]; |
| 5509 | char key[30]; |
| 5510 | |
| 5511 | if (zone_elem_inner_size(z) > PAGE_SIZE) { |
| 5512 | return; |
| 5513 | } |
| 5514 | |
| 5515 | if (pgz_all) { |
| 5516 | os_atomic_inc(&pgz_uses, relaxed); |
| 5517 | z->z_pgz_tracked = true; |
| 5518 | return; |
| 5519 | } |
| 5520 | |
| 5521 | snprintf(zn, sizeof(zn), "%s%s", zone_heap_name(z), zone_name(z));
| 5522 | 
| 5523 | for (int i = 1;; i++) {
| 5524 | snprintf(key, sizeof(key), "pgz%d", i);
| 5525 | if (!PE_parse_boot_argn(key, zv, sizeof(zv))) { |
| 5526 | break; |
| 5527 | } |
| 5528 | if (track_this_zone(zn, zv) || track_kalloc_zones(z, zv)) { |
| 5529 | os_atomic_inc(&pgz_uses, relaxed); |
| 5530 | z->z_pgz_tracked = true; |
| 5531 | break; |
| 5532 | } |
| 5533 | } |
| 5534 | } |
| 5535 | |
| 5536 | __startup_func |
| 5537 | static vm_size_t |
| 5538 | pgz_get_size(void) |
| 5539 | { |
| 5540 | if (pgz_slots == UINT32_MAX) { |
| 5541 | /* |
| 5542 |  * Scale with RAM size: about one slot per 4MB (~256 slots per GB).
| 5543 | */ |
| 5544 | pgz_slots = (uint32_t)(sane_size >> 22); |
| 5545 | } |
| 5546 | |
| 5547 | /* |
| 5548 | * Make sure that the slot allocation scheme works. |
| 5549 | * see pgz_slot_alloc() / pgz_slot_free(); |
| 5550 | */ |
| 5551 | if (pgz_slots < zpercpu_count() * 4) { |
| 5552 | pgz_slots = zpercpu_count() * 4; |
| 5553 | } |
| 5554 | if (pgz_slots >= UINT16_MAX) { |
| 5555 | pgz_slots = UINT16_MAX - 1; |
| 5556 | } |
| 5557 | |
| 5558 | /* |
| 5559 | * Quarantine is 33% of slots by default, no more than 90%. |
| 5560 | */ |
| 5561 | if (pgz_quarantine == 0) { |
| 5562 | pgz_quarantine = pgz_slots / 3; |
| 5563 | } |
| 5564 | if (pgz_quarantine > pgz_slots * 9 / 10) { |
| 5565 | pgz_quarantine = pgz_slots * 9 / 10; |
| 5566 | } |
| 5567 | pgz_slot_avail = pgz_slots - pgz_quarantine; |
| 5568 | |
| 5569 | return ptoa(2 * pgz_slots + 1); |
| 5570 | } |
| 5571 | |
| 5572 | __startup_func |
| 5573 | static void |
| 5574 | pgz_init(void) |
| 5575 | { |
| 5576 | if (!pgz_uses) { |
| 5577 | return; |
| 5578 | } |
| 5579 | |
| 5580 | if (pgz_sample_rate == 0) { |
| 5581 | /* |
| 5582 | * If no rate was provided, pick a random one that scales |
| 5583 | * with the number of protected zones. |
| 5584 | * |
| 5585 |  * Average two uniform draws (a triangular distribution)
| 5586 |  * to avoid having too many really fast sample rates.
| 5587 | */ |
| 5588 | uint32_t factor = MIN(pgz_uses, 10); |
| 5589 | uint32_t max_rate = 1000 * factor; |
| 5590 | uint32_t min_rate = 100 * factor; |
| 5591 | |
| 5592 | pgz_sample_rate = (zalloc_random_uniform32(min_rate, max_rate) + |
| 5593 | zalloc_random_uniform32(min_rate, max_rate)) / 2; |
| 5594 | } |
| 5595 | |
| 5596 | struct mach_vm_range *r = &zone_info.zi_pgz_range; |
| 5597 | zone_info.zi_pgz_meta = zone_meta_from_addr(r->min_address); |
| 5598 | zone_meta_populate(r->min_address, mach_vm_range_size(r)); |
| 5599 | |
| 5600 | for (size_t i = 0; i < 2 * pgz_slots + 1; i += 2) { |
| 5601 | zone_info.zi_pgz_meta[i].zm_chunk_len = ZM_PGZ_GUARD; |
| 5602 | } |
| 5603 | |
| 5604 | for (size_t i = 1; i < pgz_slots; i++) { |
| 5605 | zone_info.zi_pgz_meta[2 * i - 1].zm_pgz_slot_next = |
| 5606 | &zone_info.zi_pgz_meta[2 * i + 1]; |
| 5607 | } |
| 5608 | #if OS_ATOMIC_HAS_LLSC |
| 5609 | pgz_slot_head = &zone_info.zi_pgz_meta[1]; |
| 5610 | #endif |
| 5611 | pgz_slot_tail = &zone_info.zi_pgz_meta[2 * pgz_slots - 1]; |
| 5612 | |
| 5613 | pgz_backtraces = zalloc_permanent(sizeof(struct pgz_backtrace) * |
| 5614 | 2 * pgz_slots, ZALIGN_PTR); |
| 5615 | |
| 5616 | /* |
| 5617 |  * Expand the pmap so that pmap_enter_options_addr()
| 5618 |  * in pgz_protect() never needs to call pmap_expand().
| 5619 | */ |
| 5620 | for (uint32_t slot = 0; slot < pgz_slots; slot++) { |
| 5621 | (void)pmap_enter_options_addr(kernel_pmap, pgz_addr(slot), 0, |
| 5622 | VM_PROT_NONE, VM_PROT_NONE, 0, FALSE, |
| 5623 | PMAP_OPTIONS_NOENTER, NULL, PMAP_MAPPING_TYPE_INFER); |
| 5624 | } |
| 5625 | |
| 5626 | /* do this last as this will enable pgz */ |
| 5627 | percpu_foreach(counter, pgz_sample_counter) { |
| 5628 | *counter = zalloc_random_uniform32(0, 2 * pgz_sample_rate); |
| 5629 | } |
| 5630 | } |
| 5631 | STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, pgz_init); |
| 5632 | |
| 5633 | static void |
| 5634 | panic_display_pgz_bt(bool has_syms, uint32_t slot, bool free) |
| 5635 | { |
| 5636 | struct pgz_backtrace *bt = pgz_bt(slot, free); |
| 5637 | const char *what = free ? "Free" : "Allocation" ; |
| 5638 | uintptr_t buf[MAX_ZTRACE_DEPTH]; |
| 5639 | |
| 5640 | if (!ml_validate_nofault((vm_offset_t)bt, sizeof(*bt))) { |
| 5641 | paniclog_append_noflush(" Can't decode %s Backtrace\n" , what); |
| 5642 | return; |
| 5643 | } |
| 5644 | |
| 5645 | backtrace_unpack(BTP_KERN_OFFSET_32, buf, MAX_ZTRACE_DEPTH, |
| 5646 | (uint8_t *)bt->pgz_bt, 4 * bt->pgz_depth); |
| 5647 | |
| 5648 | paniclog_append_noflush(" %s Backtrace:\n" , what); |
| 5649 | for (uint32_t i = 0; i < bt->pgz_depth && i < MAX_ZTRACE_DEPTH; i++) { |
| 5650 | if (has_syms) { |
| 5651 | paniclog_append_noflush(" %p " , (void *)buf[i]); |
| 5652 | panic_print_symbol_name(buf[i]); |
| 5653 | paniclog_append_noflush("\n" ); |
| 5654 | } else { |
| 5655 | paniclog_append_noflush(" %p\n" , (void *)buf[i]); |
| 5656 | } |
| 5657 | } |
| 5658 | kmod_panic_dump((vm_offset_t *)buf, bt->pgz_depth); |
| 5659 | } |
| 5660 | |
| 5661 | static void |
| 5662 | panic_display_pgz_uaf_info(bool has_syms, vm_offset_t addr) |
| 5663 | { |
| 5664 | struct zone_page_metadata *meta; |
| 5665 | vm_offset_t elem, esize; |
| 5666 | const char *type; |
| 5667 | const char *prob; |
| 5668 | uint32_t slot; |
| 5669 | zone_t z; |
| 5670 | |
| 5671 | slot = pgz_slot(addr); |
| 5672 | meta = pgz_meta(slot); |
| 5673 | elem = pgz_addr(slot) + (meta->zm_pgz_orig_addr & PAGE_MASK); |
| 5674 | |
| 5675 | paniclog_append_noflush("Probabilistic GZAlloc Report:\n" ); |
| 5676 | |
| 5677 | if (ml_validate_nofault((vm_offset_t)meta, sizeof(*meta)) && |
| 5678 | meta->zm_index && |
| 5679 | meta->zm_index < os_atomic_load(&num_zones, relaxed)) { |
| 5680 | z = &zone_array[meta->zm_index]; |
| 5681 | } else { |
| 5682 | paniclog_append_noflush(" Zone : <unknown>\n" ); |
| 5683 | paniclog_append_noflush(" Address : %p\n" , (void *)addr); |
| 5684 | paniclog_append_noflush("\n" ); |
| 5685 | return; |
| 5686 | } |
| 5687 | |
| 5688 | esize = zone_elem_inner_size(z); |
| 5689 | paniclog_append_noflush(" Zone : %s%s\n" , |
| 5690 | zone_heap_name(z), zone_name(z)); |
| 5691 | paniclog_append_noflush(" Address : %p\n" , (void *)addr); |
| 5692 | paniclog_append_noflush(" Element : [%p, %p) of size %d\n" , |
| 5693 | (void *)elem, (void *)(elem + esize), (uint32_t)esize); |
| 5694 | |
| 5695 | if (addr < elem) { |
| 5696 | type = "out-of-bounds(underflow) + use-after-free" ; |
| 5697 | prob = "low" ; |
| 5698 | } else if (meta->zm_chunk_len == ZM_PGZ_DOUBLE_FREE) { |
| 5699 | type = "double-free" ; |
| 5700 | prob = "high" ; |
| 5701 | } else if (addr < elem + esize) { |
| 5702 | type = "use-after-free" ; |
| 5703 | prob = "high" ; |
| 5704 | } else if (meta->zm_chunk_len != ZM_PGZ_ALLOCATED) { |
| 5705 | type = "out-of-bounds + use-after-free" ; |
| 5706 | prob = "low" ; |
| 5707 | } else { |
| 5708 | type = "out-of-bounds" ; |
| 5709 | prob = "high" ; |
| 5710 | } |
| 5711 | paniclog_append_noflush(" Kind : %s (%s confidence)\n" , |
| 5712 | type, prob); |
| 5713 | if (addr < elem) { |
| 5714 | paniclog_append_noflush(" Access : %d byte(s) before\n" , |
| 5715 | (uint32_t)(elem - addr) + 1); |
| 5716 | } else if (addr < elem + esize) { |
| 5717 | paniclog_append_noflush(" Access : %d byte(s) inside\n" , |
| 5718 | (uint32_t)(addr - elem) + 1); |
| 5719 | } else { |
| 5720 | paniclog_append_noflush(" Access : %d byte(s) past\n" , |
| 5721 | (uint32_t)(addr - (elem + esize)) + 1); |
| 5722 | } |
| 5723 | |
| 5724 | panic_display_pgz_bt(has_syms, slot, false); |
| 5725 | if (meta->zm_chunk_len != ZM_PGZ_ALLOCATED) { |
| 5726 | panic_display_pgz_bt(has_syms, slot, true); |
| 5727 | } |
| 5728 | |
| 5729 | paniclog_append_noflush("\n" ); |
| 5730 | } |
| 5731 | |
| 5732 | #endif /* CONFIG_PROB_GZALLOC */ |
| 5733 | #endif /* !ZALLOC_TEST */ |
| 5734 | #pragma mark zfree |
| 5735 | #if !ZALLOC_TEST |
| 5736 | |
| 5737 | /*! |
| 5738 | * @defgroup zfree |
| 5739 | * @{ |
| 5740 | * |
| 5741 | * @brief |
| 5742 | * The codepath for zone frees. |
| 5743 | * |
| 5744 | * @discussion |
| 5745 |  * There are 4 major ways to free memory that ends up in the zone allocator:
| 5746 | * - @c zfree() |
| 5747 | * - @c zfree_percpu() |
| 5748 | * - @c kfree*() |
| 5749 | * - @c zfree_permanent() |
| 5750 | * |
| 5751 | * While permanent zones have their own allocation scheme, all other codepaths |
| 5752 | * will eventually go through the @c zfree_ext() choking point. |
| 5753 | */ |
| 5754 | |
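| | /*
| |  * zfree_drop() returns one element to its chunk with the zone lock held:
| |  * it marks the element free in the chunk bitmap (panicking on a double
| |  * free), shrinks the chunk's allocated size, requeues the chunk onto the
| |  * empty or partial page queue as needed, and wakes exhaustion waiters.
| |  */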
| 5755 | __header_always_inline void |
| 5756 | zfree_drop(zone_t zone, vm_offset_t addr) |
| 5757 | { |
| 5758 | vm_offset_t esize = zone_elem_outer_size(zone); |
| 5759 | struct zone_page_metadata *meta; |
| 5760 | vm_offset_t eidx; |
| 5761 | |
| 5762 | meta = zone_element_resolve(zone, addr, &eidx);
| 5763 | 
| 5764 | if (!zone_meta_mark_free(meta, eidx)) {
| 5765 | zone_meta_double_free_panic(zone, addr, __func__);
| 5766 | }
| 5767 | 
| 5768 | vm_offset_t old_size = meta->zm_alloc_size;
| 5769 | vm_offset_t max_size = ptoa(meta->zm_chunk_len) + ZM_ALLOC_SIZE_LOCK;
| 5770 | vm_offset_t new_size = zone_meta_alloc_size_sub(zone, meta, esize);
| 5771 | 
| 5772 | if (new_size == 0) {
| 5773 | /* whether the page was on the intermediate or all_used queue, move it to free */
| 5774 | zone_meta_requeue(zone, &zone->z_pageq_empty, meta);
| 5775 | zone->z_wired_empty += meta->zm_chunk_len;
| 5776 | } else if (old_size + esize > max_size) {
| 5777 | /* first free element on page, move from all_used */
| 5778 | zone_meta_requeue(zone, &zone->z_pageq_partial, meta);
| 5779 | }
| 5780 | 
| 5781 | if (__improbable(zone->z_exhausted_wait)) {
| 5782 | zone_wakeup_exhausted_waiters(zone);
| 5783 | } |
| 5784 | } |
| 5785 | |
| 5786 | __attribute__((noinline)) |
| 5787 | static void |
| 5788 | zfree_item(zone_t zone, vm_offset_t addr) |
| 5789 | { |
| 5790 | /* transfer preemption count to lock */ |
| 5791 | zone_lock_nopreempt_check_contention(zone); |
| 5792 | |
| 5793 | zfree_drop(zone, addr); |
| 5794 | zone->z_elems_free += 1; |
| 5795 | |
| 5796 | zone_unlock(zone); |
| 5797 | } |
| 5798 | |
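| | /*
| |  * Rebalance a per-CPU depot against the zone's recirculation layer on the
| |  * free path: overflow full magazines to the recirculation depot (under SMR
| |  * the whole depot is rotated, the deferred advance is committed, and
| |  * magazines whose grace period has elapsed may be pulled back), then top
| |  * up the per-CPU depot with empty magazines so the CPU keeps absorbing
| |  * frees.
| |  */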
| 5799 | static void |
| 5800 | zfree_cached_depot_recirculate( |
| 5801 | zone_t zone, |
| 5802 | uint32_t depot_max, |
| 5803 | zone_cache_t cache) |
| 5804 | { |
| 5805 | smr_t smr = zone_cache_smr(cache); |
| 5806 | smr_seq_t seq; |
| 5807 | uint32_t n; |
| 5808 | |
| 5809 | zone_recirc_lock_nopreempt_check_contention(zone); |
| 5810 | |
| 5811 | n = cache->zc_depot.zd_full; |
| 5812 | if (n >= depot_max) { |
| 5813 | /* |
| 5814 | * If SMR is in use, rotate the entire chunk of magazines. |
| 5815 | * |
| 5816 | * If the head of the recirculation layer is ready to be |
| 5817 | * reused, pull them back to refill a little. |
| 5818 | */ |
| 5819 | seq = zone_depot_move_full(&zone->z_recirc,
| 5820 | &cache->zc_depot, smr ? n : n - depot_max / 2, NULL);
| 5821 | 
| 5822 | if (smr) {
| 5823 | smr_deferred_advance_commit(smr, seq);
| 5824 | if (depot_max > 1 && zone_depot_poll(&zone->z_recirc, smr)) {
| 5825 | zone_depot_move_full(&cache->zc_depot,
| 5826 | &zone->z_recirc, depot_max / 2, NULL);
| 5827 | } |
| 5828 | } |
| 5829 | } |
| 5830 | |
| 5831 | n = depot_max - cache->zc_depot.zd_full; |
| 5832 | if (n > zone->z_recirc.zd_empty) { |
| 5833 | n = zone->z_recirc.zd_empty; |
| 5834 | } |
| 5835 | if (n) { |
| 5836 | zone_depot_move_empty(&cache->zc_depot, &zone->z_recirc,
| 5837 | n, zone);
| 5838 | } |
| 5839 | |
| 5840 | zone_recirc_unlock_nopreempt(zone); |
| 5841 | } |
| 5842 | |
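| | /*
| |  * Swap the CPU's full free magazine with an empty one from the
| |  * recirculation layer (allocating a fresh magazine if none is cached),
| |  * pushing the full magazine onto the recirculation depot, head or tail
| |  * depending on the zone's z_lifo policy, and waking exhaustion waiters
| |  * now that elements became reclaimable. Returns the cache on success, or
| |  * NULL so the caller falls back to freeing a single element to the zone.
| |  */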
| 5843 | static zone_cache_t |
| 5844 | zfree_cached_recirculate(zone_t zone, zone_cache_t cache) |
| 5845 | { |
| 5846 | zone_magazine_t mag = NULL, tmp = NULL; |
| 5847 | smr_t smr = zone_cache_smr(cache); |
| 5848 | bool wakeup_exhausted = false; |
| 5849 | |
| 5850 | if (zone->z_recirc.zd_empty == 0) { |
| 5851 | mag = zone_magazine_alloc(Z_NOWAIT);
| 5852 | } |
| 5853 | |
| 5854 | zone_recirc_lock_nopreempt_check_contention(zone); |
| 5855 | |
| 5856 | if (mag == NULL && zone->z_recirc.zd_empty) { |
| 5857 | mag = zone_depot_pop_head_empty(&zone->z_recirc, zone);
| 5858 | __builtin_assume(mag); |
| 5859 | } |
| 5860 | if (mag) { |
| 5861 | tmp = zone_magazine_replace(cache, mag, true);
| 5862 | if (smr) {
| 5863 | smr_deferred_advance_commit(smr, tmp->zm_seq);
| 5864 | }
| 5865 | if (zone_security_array[zone_index(zone)].z_lifo) {
| 5866 | zone_depot_insert_head_full(&zone->z_recirc, tmp);
| 5867 | } else {
| 5868 | zone_depot_insert_tail_full(&zone->z_recirc, tmp);
| 5869 | } |
| 5870 | |
| 5871 | wakeup_exhausted = zone->z_exhausted_wait; |
| 5872 | } |
| 5873 | |
| 5874 | zone_recirc_unlock_nopreempt(zone); |
| 5875 | |
| 5876 | if (__improbable(wakeup_exhausted)) { |
| 5877 | zone_lock_nopreempt(zone); |
| 5878 | if (zone->z_exhausted_wait) { |
| 5879 | zone_wakeup_exhausted_waiters(zone);
| 5880 | } |
| 5881 | zone_unlock_nopreempt(zone); |
| 5882 | } |
| 5883 | |
| 5884 | return mag ? cache : NULL; |
| 5885 | } |
| 5886 | |
| 5887 | __attribute__((noinline)) |
| 5888 | static zone_cache_t |
| 5889 | zfree_cached_trim(zone_t zone, zone_cache_t cache) |
| 5890 | { |
| 5891 | zone_magazine_t mag = NULL, tmp = NULL; |
| 5892 | uint32_t depot_max; |
| 5893 | |
| 5894 | depot_max = os_atomic_load(&zone->z_depot_size, relaxed); |
| 5895 | if (depot_max) { |
| 5896 | zone_depot_lock_nopreempt(cache);
| 5897 | 
| 5898 | if (cache->zc_depot.zd_empty == 0) {
| 5899 | zfree_cached_depot_recirculate(zone, depot_max, cache);
| 5900 | }
| 5901 | 
| 5902 | if (__probable(cache->zc_depot.zd_empty)) {
| 5903 | mag = zone_depot_pop_head_empty(&cache->zc_depot, NULL);
| 5904 | __builtin_assume(mag);
| 5905 | } else {
| 5906 | mag = zone_magazine_alloc(Z_NOWAIT);
| 5907 | }
| 5908 | if (mag) {
| 5909 | tmp = zone_magazine_replace(cache, mag, true);
| 5910 | zone_depot_insert_tail_full(&cache->zc_depot, tmp);
| 5911 | }
| 5912 | 
| 5913 | zone_depot_unlock_nopreempt(cache);
| 5914 | |
| 5915 | return mag ? cache : NULL; |
| 5916 | } |
| 5917 | |
| 5918 | return zfree_cached_recirculate(zone, cache); |
| 5919 | } |
| 5920 | |
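| | /*
| |  * Return a per-CPU cache with room to accept one freed element: use the
| |  * free magazine if it has space, swap the alloc/free magazines if the
| |  * alloc one has room, and otherwise fall back to the trim path to obtain
| |  * an empty magazine. Returns NULL when the element must go directly to
| |  * the zone.
| |  */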
| 5921 | __attribute__((always_inline)) |
| 5922 | static inline zone_cache_t |
| 5923 | zfree_cached_get_pcpu_cache(zone_t zone, int cpu) |
| 5924 | { |
| 5925 | zone_cache_t cache = zpercpu_get_cpu(zone->z_pcpu_cache, cpu); |
| 5926 | |
| 5927 | if (__probable(cache->zc_free_cur < zc_mag_size())) { |
| 5928 | return cache; |
| 5929 | } |
| 5930 | |
| 5931 | if (__probable(cache->zc_alloc_cur < zc_mag_size())) { |
| 5932 | zone_cache_swap_magazines(cache); |
| 5933 | return cache; |
| 5934 | } |
| 5935 | |
| 5936 | return zfree_cached_trim(zone, cache); |
| 5937 | } |
| 5938 | |
| 5939 | __attribute__((always_inline)) |
| 5940 | static inline zone_cache_t |
| 5941 | zfree_cached_get_pcpu_cache_smr(zone_t zone, int cpu) |
| 5942 | { |
| 5943 | zone_cache_t cache = zpercpu_get_cpu(zone->z_pcpu_cache, cpu); |
| 5944 | size_t idx = cache->zc_free_cur; |
| 5945 | |
| 5946 | if (__probable(idx + 1 < zc_mag_size())) { |
| 5947 | return cache; |
| 5948 | } |
| 5949 | |
| 5950 | /* |
| 5951 | * when SMR is in use, the bucket is tagged early with |
| 5952 | * @c smr_deferred_advance(), which costs a full barrier, |
| 5953 | * but performs no store. |
| 5954 | * |
| 5955 |  * When zones hit the recirculation layer, the advance is committed
| 5956 | * under the recirculation lock (see zfree_cached_recirculate()). |
| 5957 | * |
| 5958 | * When done this way, the zone contention detection mechanism |
| 5959 | * will adjust the size of the per-cpu depots gracefully, which |
| 5960 | * mechanically reduces the pace of these commits as usage increases. |
| 5961 | */ |
| 5962 | |
| 5963 | if (__probable(idx + 1 == zc_mag_size())) { |
| 5964 | zone_magazine_t mag; |
| 5965 | |
| 5966 | mag = (zone_magazine_t)((uintptr_t)cache->zc_free_elems - |
| 5967 | offsetof(struct zone_magazine, zm_elems)); |
| 5968 | mag->zm_seq = smr_deferred_advance(zone_cache_smr(cache));
| 5969 | return cache; |
| 5970 | } |
| 5971 | |
| 5972 | return zfree_cached_trim(zone, cache); |
| 5973 | } |
| 5974 | |
| 5975 | __attribute__((always_inline)) |
| 5976 | static inline vm_offset_t |
| 5977 | __zcache_mark_invalid(zone_t zone, vm_offset_t elem, uint64_t combined_size) |
| 5978 | { |
| 5979 | struct zone_page_metadata *meta; |
| 5980 | vm_offset_t offs; |
| 5981 | |
| 5982 | #pragma unused(combined_size) |
| 5983 | #if CONFIG_PROB_GZALLOC |
| 5984 | if (__improbable(pgz_owned(elem))) { |
| 5985 | elem = pgz_unprotect(elem, __builtin_frame_address(0)); |
| 5986 | } |
| 5987 | #endif /* CONFIG_PROB_GZALLOC */ |
| 5988 | |
| 5989 | meta = zone_meta_from_addr(elem);
| 5990 | if (!from_zone_map(elem, 1) || !zone_has_index(zone, meta->zm_index)) {
| 5991 | zone_invalid_element_panic(zone, elem);
| 5992 | }
| 5993 | 
| 5994 | offs = (elem & PAGE_MASK) - zone_elem_inner_offs(zone);
| 5995 | if (meta->zm_chunk_len == ZM_SECONDARY_PAGE) {
| 5996 | offs += ptoa(meta->zm_page_index);
| 5997 | }
| 5998 | 
| 5999 | if (!Z_FAST_ALIGNED(offs, zone->z_align_magic)) {
| 6000 | zone_invalid_element_panic(zone, elem);
| 6001 | } |
| 6002 | |
| 6003 | #if VM_TAG_SIZECLASSES |
| 6004 | if (__improbable(zone->z_uses_tags)) { |
| 6005 | vm_tag_t *slot; |
| 6006 | |
| 6007 | slot = zba_extra_ref_ptr(meta->zm_bitmap, |
| 6008 | Z_FAST_QUO(offs, zone->z_quo_magic)); |
| 6009 | vm_tag_update_zone_size(*slot, zone->z_tags_sizeclass, |
| 6010 | -(long)ZFREE_ELEM_SIZE(combined_size)); |
| 6011 | *slot = VM_KERN_MEMORY_NONE; |
| 6012 | } |
| 6013 | #endif /* VM_TAG_SIZECLASSES */ |
| 6014 | |
| 6015 | #if KASAN_CLASSIC |
| 6016 | kasan_free(elem, ZFREE_ELEM_SIZE(combined_size), |
| 6017 | ZFREE_USER_SIZE(combined_size), zone_elem_redzone(zone), |
| 6018 | zone->z_percpu, __builtin_frame_address(0)); |
| 6019 | #endif |
| 6020 | #if CONFIG_KERNEL_TAGGING |
| 6021 | if (__probable(zone->z_tbi_tag)) { |
| 6022 | elem = zone_tag_element(zone, elem, ZFREE_ELEM_SIZE(combined_size)); |
| 6023 | } |
| 6024 | #endif /* CONFIG_KERNEL_TAGGING */ |
| 6025 | |
| 6026 | return elem; |
| 6027 | } |
| 6028 | |
| 6029 | __attribute__((always_inline)) |
| 6030 | void * |
| 6031 | zcache_mark_invalid(zone_t zone, void *elem) |
| 6032 | { |
| 6033 | vm_size_t esize = zone_elem_inner_size(zone); |
| 6034 | |
| 6035 | ZFREE_LOG(zone, (vm_offset_t)elem, 1); |
| 6036 | return (void *)__zcache_mark_invalid(zone, (vm_offset_t)elem, ZFREE_PACK_SIZE(esize, esize));
| 6037 | } |
| 6038 | |
| 6039 | /* |
| 6040 | * The function is noinline when zlog can be used so that the backtracing can |
| 6041 | * reliably skip the zfree_ext() and zfree_log() |
| 6042 | * boring frames. |
| 6043 | */ |
| 6044 | #if ZALLOC_ENABLE_LOGGING |
| 6045 | __attribute__((noinline)) |
| 6046 | #endif /* ZALLOC_ENABLE_LOGGING */ |
| 6047 | void |
| 6048 | zfree_ext(zone_t zone, zone_stats_t zstats, void *addr, uint64_t combined_size) |
| 6049 | { |
| 6050 | vm_offset_t esize = ZFREE_ELEM_SIZE(combined_size); |
| 6051 | vm_offset_t elem = (vm_offset_t)addr; |
| 6052 | int cpu; |
| 6053 | |
| 6054 | DTRACE_VM2(zfree, zone_t, zone, void*, elem); |
| 6055 | |
| 6056 | ZFREE_LOG(zone, elem, 1); |
| 6057 | elem = __zcache_mark_invalid(zone, elem, combined_size); |
| 6058 | |
| 6059 | disable_preemption(); |
| 6060 | cpu = cpu_number(); |
| 6061 | zpercpu_get_cpu(zstats, cpu)->zs_mem_freed += esize; |
| 6062 | |
| 6063 | #if KASAN_CLASSIC |
| 6064 | if (zone->z_kasan_quarantine && startup_phase >= STARTUP_SUB_ZALLOC) { |
| 6065 | struct kasan_quarantine_result kqr; |
| 6066 | |
| 6067 | kqr = kasan_quarantine(elem, esize); |
| 6068 | elem = kqr.addr; |
| 6069 | zone = kqr.zone; |
| 6070 | if (elem == 0) { |
| 6071 | return enable_preemption(); |
| 6072 | } |
| 6073 | } |
| 6074 | #endif |
| 6075 | |
| 6076 | if (zone->z_pcpu_cache) { |
| 6077 | zone_cache_t cache = zfree_cached_get_pcpu_cache(zone, cpu); |
| 6078 | |
| 6079 | if (__probable(cache)) { |
| 6080 | cache->zc_free_elems[cache->zc_free_cur++] = elem; |
| 6081 | return enable_preemption(); |
| 6082 | } |
| 6083 | } |
| 6084 | |
| 6085 | return zfree_item(zone, addr: elem); |
| 6086 | } |
| 6087 | |
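| | /*
| |  * Push as many elements as fit from a zstack into the CPU's current free
| |  * magazine, marking each element invalid on the way (via the cache ops
| |  * for custom caches, or bzero + __zcache_mark_invalid() for regular
| |  * zones). Returns the remainder of the stack.
| |  */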
| 6088 | __attribute__((always_inline)) |
| 6089 | static inline zstack_t |
| 6090 | zcache_free_stack_to_cpu( |
| 6091 | zone_id_t zid, |
| 6092 | zone_cache_t cache, |
| 6093 | zstack_t stack, |
| 6094 | vm_size_t esize, |
| 6095 | zone_cache_ops_t ops, |
| 6096 | bool zero) |
| 6097 | { |
| 6098 | size_t n = MIN(zc_mag_size() - cache->zc_free_cur, stack.z_count); |
| 6099 | vm_offset_t *p; |
| 6100 | |
| 6101 | stack.z_count -= n; |
| 6102 | cache->zc_free_cur += n; |
| 6103 | p = cache->zc_free_elems + cache->zc_free_cur; |
| 6104 | |
| 6105 | do { |
| 6106 | void *o = zstack_pop_no_delta(&stack);
| 6107 | 
| 6108 | if (ops) {
| 6109 | o = ops->zc_op_mark_invalid(zid, o);
| 6110 | } else {
| 6111 | if (zero) {
| 6112 | bzero(o, esize);
| 6113 | }
| 6114 | o = (void *)__zcache_mark_invalid(zone_by_id(zid),
| 6115 | (vm_offset_t)o, ZFREE_PACK_SIZE(esize, esize));
| 6116 | } |
| 6117 | *--p = (vm_offset_t)o; |
| 6118 | } while (--n > 0); |
| 6119 | |
| 6120 | return stack; |
| 6121 | } |
| 6122 | |
| 6123 | __attribute__((always_inline)) |
| 6124 | static inline void |
| 6125 | zcache_free_1_ext(zone_id_t zid, void *addr, zone_cache_ops_t ops) |
| 6126 | { |
| 6127 | vm_offset_t elem = (vm_offset_t)addr; |
| 6128 | zone_cache_t cache; |
| 6129 | vm_size_t esize; |
| 6130 | zone_t zone = zone_by_id(zid); |
| 6131 | int cpu; |
| 6132 | |
| 6133 | ZFREE_LOG(zone, elem, 1); |
| 6134 | |
| 6135 | disable_preemption(); |
| 6136 | cpu = cpu_number(); |
| 6137 | esize = zone_elem_inner_size(zone); |
| 6138 | zpercpu_get_cpu(zone->z_stats, cpu)->zs_mem_freed += esize; |
| 6139 | if (!ops) { |
| 6140 | addr = (void *)__zcache_mark_invalid(zone, elem, |
| 6141 | ZFREE_PACK_SIZE(esize, esize)); |
| 6142 | } |
| 6143 | cache = zfree_cached_get_pcpu_cache(zone, cpu); |
| 6144 | if (__probable(cache)) { |
| 6145 | if (ops) { |
| 6146 | addr = ops->zc_op_mark_invalid(zid, addr); |
| 6147 | } |
| 6148 | cache->zc_free_elems[cache->zc_free_cur++] = (vm_offset_t)addr;
| 6149 | enable_preemption(); |
| 6150 | } else if (ops) { |
| 6151 | enable_preemption(); |
| 6152 | os_atomic_dec(&zone_by_id(zid)->z_elems_avail, relaxed); |
| 6153 | ops->zc_op_free(zid, addr); |
| 6154 | } else { |
| 6155 | zfree_item(zone, (vm_offset_t)addr);
| 6156 | } |
| 6157 | } |
| 6158 | |
| 6159 | __attribute__((always_inline)) |
| 6160 | static inline void |
| 6161 | zcache_free_n_ext(zone_id_t zid, zstack_t stack, zone_cache_ops_t ops, bool zero) |
| 6162 | { |
| 6163 | zone_t zone = zone_by_id(zid); |
| 6164 | zone_cache_t cache; |
| 6165 | vm_size_t esize; |
| 6166 | int cpu; |
| 6167 | |
| 6168 | ZFREE_LOG(zone, stack.z_head, stack.z_count); |
| 6169 | |
| 6170 | disable_preemption(); |
| 6171 | cpu = cpu_number(); |
| 6172 | esize = zone_elem_inner_size(zone); |
| 6173 | zpercpu_get_cpu(zone->z_stats, cpu)->zs_mem_freed += |
| 6174 | stack.z_count * esize; |
| 6175 | |
| 6176 | for (;;) { |
| 6177 | cache = zfree_cached_get_pcpu_cache(zone, cpu); |
| 6178 | if (__probable(cache)) { |
| 6179 | stack = zcache_free_stack_to_cpu(zid, cache, |
| 6180 | stack, esize, ops, zero); |
| 6181 | enable_preemption(); |
| 6182 | } else if (ops) { |
| 6183 | enable_preemption(); |
| 6184 | os_atomic_dec(&zone->z_elems_avail, relaxed); |
| 6185 | ops->zc_op_free(zid, zstack_pop(&stack));
| 6186 | } else {
| 6187 | vm_offset_t addr = (vm_offset_t)zstack_pop(&stack);
| 6188 | 
| 6189 | if (zero) {
| 6190 | bzero((void *)addr, esize);
| 6191 | }
| 6192 | addr = __zcache_mark_invalid(zone, addr,
| 6193 | ZFREE_PACK_SIZE(esize, esize)); |
| 6194 | zfree_item(zone, addr); |
| 6195 | } |
| 6196 | |
| 6197 | if (stack.z_count == 0) { |
| 6198 | break; |
| 6199 | } |
| 6200 | |
| 6201 | disable_preemption(); |
| 6202 | cpu = cpu_number(); |
| 6203 | } |
| 6204 | } |
| 6205 | |
| 6206 | void |
| 6207 | (zcache_free)(zone_id_t zid, void *addr, zone_cache_ops_t ops) |
| 6208 | { |
| 6209 | __builtin_assume(ops != NULL); |
| 6210 | zcache_free_1_ext(zid, addr, ops); |
| 6211 | } |
| 6212 | |
| 6213 | void |
| 6214 | (zcache_free_n)(zone_id_t zid, zstack_t stack, zone_cache_ops_t ops) |
| 6215 | { |
| 6216 | __builtin_assume(ops != NULL); |
| 6217 | zcache_free_n_ext(zid, stack, ops, false); |
| 6218 | } |
| 6219 | |
| 6220 | void |
| 6221 | (zfree_n)(zone_id_t zid, zstack_t stack) |
| 6222 | { |
| 6223 | zcache_free_n_ext(zid, stack, NULL, true); |
| 6224 | } |
| 6225 | |
| 6226 | void |
| 6227 | (zfree_nozero)(zone_id_t zid, void *addr) |
| 6228 | { |
| 6229 | zcache_free_1_ext(zid, addr, NULL); |
| 6230 | } |
| 6231 | |
| 6232 | void |
| 6233 | (zfree_nozero_n)(zone_id_t zid, zstack_t stack) |
| 6234 | { |
| 6235 | zcache_free_n_ext(zid, stack, NULL, false); |
| 6236 | } |
| 6237 | |
| 6238 | void |
| 6239 | (zfree)(zone_t zov, void *addr) |
| 6240 | { |
| 6241 | zone_t zone = zov->z_self; |
| 6242 | zone_stats_t zstats = zov->z_stats; |
| 6243 | vm_offset_t esize = zone_elem_inner_size(zone); |
| 6244 | |
| 6245 | assert(zone > &zone_array[ZONE_ID__LAST_RO]); |
| 6246 | assert(!zone->z_percpu && !zone->z_permanent && !zone->z_smr); |
| 6247 | |
| 6248 | vm_memtag_bzero(addr, esize); |
| 6249 | |
| 6250 | zfree_ext(zone, zstats, addr, ZFREE_PACK_SIZE(esize, esize)); |
| 6251 | } |
| 6252 | |
| 6253 | __attribute__((noinline)) |
| 6254 | void |
| 6255 | zfree_percpu(union zone_or_view zov, void *addr) |
| 6256 | { |
| 6257 | zone_t zone = zov.zov_view->zv_zone; |
| 6258 | zone_stats_t zstats = zov.zov_view->zv_stats; |
| 6259 | vm_offset_t esize = zone_elem_inner_size(zone); |
| 6260 | |
| 6261 | assert(zone > &zone_array[ZONE_ID__LAST_RO]); |
| 6262 | assert(zone->z_percpu); |
| 6263 | addr = (void *)__zpcpu_demangle(addr); |
| 6264 | zpercpu_foreach_cpu(i) { |
| 6265 | vm_memtag_bzero((char *)addr + ptoa(i), esize); |
| 6266 | } |
| 6267 | zfree_ext(zone, zstats, addr, ZFREE_PACK_SIZE(esize, esize)); |
| 6268 | } |
| 6269 | |
| 6270 | void |
| 6271 | (zfree_id)(zone_id_t zid, void *addr) |
| 6272 | { |
| 6273 | (zfree)(&zone_array[zid], addr);
| 6274 | } |
| 6275 | |
| 6276 | void |
| 6277 | (zfree_ro)(zone_id_t zid, void *addr) |
| 6278 | { |
| 6279 | assert(zid >= ZONE_ID__FIRST_RO && zid <= ZONE_ID__LAST_RO); |
| 6280 | zone_t zone = zone_by_id(zid); |
| 6281 | zone_stats_t zstats = zone->z_stats; |
| 6282 | vm_offset_t esize = zone_ro_size_params[zid].z_elem_size; |
| 6283 | |
| 6284 | #if ZSECURITY_CONFIG(READ_ONLY) |
| 6285 | assert(zone_security_array[zid].z_submap_idx == Z_SUBMAP_IDX_READ_ONLY); |
	pmap_ro_zone_bzero(zid, (vm_offset_t)addr, 0, esize);
| 6287 | #else |
| 6288 | (void)zid; |
| 6289 | bzero(addr, esize); |
#endif /* !ZSECURITY_CONFIG(READ_ONLY) */
| 6291 | zfree_ext(zone, zstats, addr, ZFREE_PACK_SIZE(esize, esize)); |
| 6292 | } |
| 6293 | |
| 6294 | __attribute__((noinline)) |
| 6295 | static void |
| 6296 | zfree_item_smr(zone_t zone, vm_offset_t addr) |
| 6297 | { |
| 6298 | zone_cache_t cache = zpercpu_get_cpu(zone->z_pcpu_cache, 0); |
| 6299 | vm_size_t esize = zone_elem_inner_size(zone); |
| 6300 | |
| 6301 | /* |
| 6302 | * This should be taken extremely rarely: |
| 6303 | * this happens if we failed allocating an empty bucket. |
| 6304 | */ |
	smr_synchronize(zone_cache_smr(cache));

	cache->zc_free((void *)addr, esize);
	addr = __zcache_mark_invalid(zone, addr, ZFREE_PACK_SIZE(esize, esize));
| 6309 | |
| 6310 | zfree_item(zone, addr); |
| 6311 | } |
| 6312 | |
| 6313 | void |
| 6314 | (zfree_smr)(zone_t zone, void *addr) |
| 6315 | { |
| 6316 | vm_offset_t elem = (vm_offset_t)addr; |
| 6317 | vm_offset_t esize; |
| 6318 | zone_cache_t cache; |
| 6319 | int cpu; |
| 6320 | |
| 6321 | ZFREE_LOG(zone, elem, 1); |
| 6322 | |
| 6323 | disable_preemption(); |
| 6324 | cpu = cpu_number(); |
| 6325 | #if MACH_ASSERT |
| 6326 | cache = zpercpu_get_cpu(zone->z_pcpu_cache, cpu); |
| 6327 | assert(!smr_entered_cpu_noblock(cache->zc_smr, cpu)); |
| 6328 | #endif |
| 6329 | esize = zone_elem_inner_size(zone); |
| 6330 | zpercpu_get_cpu(zone->z_stats, cpu)->zs_mem_freed += esize; |
| 6331 | cache = zfree_cached_get_pcpu_cache_smr(zone, cpu); |
| 6332 | if (__probable(cache)) { |
| 6333 | cache->zc_free_elems[cache->zc_free_cur++] = elem; |
| 6334 | enable_preemption(); |
| 6335 | } else { |
		zfree_item_smr(zone, elem);
| 6337 | } |
| 6338 | } |
| 6339 | |
| 6340 | void |
| 6341 | (zfree_id_smr)(zone_id_t zid, void *addr) |
| 6342 | { |
	(zfree_smr)(&zone_array[zid], addr);
| 6344 | } |
| 6345 | |
| 6346 | void |
| 6347 | kfree_type_impl_internal( |
| 6348 | kalloc_type_view_t kt_view, |
| 6349 | void *ptr __unsafe_indexable) |
| 6350 | { |
| 6351 | zone_t zsig = kt_view->kt_zsig; |
| 6352 | zone_t z = kt_view->kt_zv.zv_zone; |
| 6353 | struct zone_page_metadata *meta; |
| 6354 | zone_id_t zidx_meta; |
| 6355 | zone_security_flags_t zsflags_meta; |
| 6356 | zone_security_flags_t zsflags_z = zone_security_config(z); |
| 6357 | zone_security_flags_t zsflags_zsig; |
| 6358 | |
| 6359 | if (NULL == ptr) { |
| 6360 | return; |
| 6361 | } |
| 6362 | |
	meta = zone_meta_from_addr((vm_offset_t) ptr);
| 6364 | zidx_meta = meta->zm_index; |
| 6365 | zsflags_meta = zone_security_array[zidx_meta]; |
| 6366 | |
| 6367 | if ((zsflags_z.z_kheap_id == KHEAP_ID_DATA_BUFFERS) || |
	    zone_has_index(z, zidx_meta)) {
		return (zfree)(&kt_view->kt_zv, ptr);
	}
	zsflags_zsig = zone_security_config(zsig);
	if (zsflags_meta.z_sig_eq == zsflags_zsig.z_sig_eq) {
		z = zone_array + zidx_meta;
		return (zfree)(z, ptr);
	}

	return (zfree)(kt_view->kt_zshared, ptr);
| 6378 | } |
| 6379 | |
| 6380 | /*! @} */ |
| 6381 | #endif /* !ZALLOC_TEST */ |
| 6382 | #pragma mark zalloc |
| 6383 | #if !ZALLOC_TEST |
| 6384 | |
| 6385 | /*! |
| 6386 | * @defgroup zalloc |
| 6387 | * @{ |
| 6388 | * |
| 6389 | * @brief |
| 6390 | * The codepath for zone allocations. |
| 6391 | * |
| 6392 | * @discussion |
| 6393 | * There are 4 major ways to allocate memory that end up in the zone allocator: |
| 6394 | * - @c zalloc(), @c zalloc_flags(), ... |
| 6395 | * - @c zalloc_percpu() |
| 6396 | * - @c kalloc*() |
| 6397 | * - @c zalloc_permanent() |
| 6398 | * |
| 6399 | * While permanent zones have their own allocation scheme, all other codepaths |
| 6400 | * will eventually go through the @c zalloc_ext() choking point. |
| 6401 | * |
| 6402 | * @c zalloc_return() is the final function everyone tail calls into, |
| 6403 | * which prepares the element for consumption by the caller and deals with |
| 6404 | * common treatment (zone logging, tags, kasan, validation, ...). |
| 6405 | */ |
| 6406 | |
| 6407 | /*! |
| 6408 | * @function zalloc_import |
| 6409 | * |
| 6410 | * @brief |
 * Import @c n elements into the specified array, the opposite of @c zfree_drop().
| 6412 | * |
| 6413 | * @param zone The zone to import elements from |
| 6414 | * @param elems The array to import into |
| 6415 | * @param n The number of elements to import. Must be non zero, |
| 6416 | * and smaller than @c zone->z_elems_free. |
| 6417 | */ |
| 6418 | __header_always_inline vm_size_t |
| 6419 | zalloc_import( |
| 6420 | zone_t zone, |
| 6421 | vm_offset_t *elems, |
| 6422 | zalloc_flags_t flags, |
| 6423 | uint32_t n) |
| 6424 | { |
| 6425 | vm_offset_t esize = zone_elem_outer_size(zone); |
| 6426 | vm_offset_t offs = zone_elem_inner_offs(zone); |
| 6427 | zone_stats_t zs; |
| 6428 | int cpu = cpu_number(); |
| 6429 | uint32_t i = 0; |
| 6430 | |
| 6431 | zs = zpercpu_get_cpu(zone->z_stats, cpu); |
| 6432 | |
| 6433 | if (__improbable(zone_caching_disabled < 0)) { |
| 6434 | /* |
| 6435 | * In the first 10s after boot, mess with |
| 6436 | * the scan position in order to make early |
		 * allocation patterns less predictable.
| 6438 | */ |
| 6439 | zone_early_scramble_rr(zone, cpu, zs); |
| 6440 | } |
| 6441 | |
| 6442 | do { |
| 6443 | vm_offset_t page, eidx, size = 0; |
| 6444 | struct zone_page_metadata *meta; |
| 6445 | |
		if (!zone_pva_is_null(zone->z_pageq_partial)) {
			meta = zone_pva_to_meta(zone->z_pageq_partial);
			page = zone_pva_to_addr(zone->z_pageq_partial);
		} else if (!zone_pva_is_null(zone->z_pageq_empty)) {
			meta = zone_pva_to_meta(zone->z_pageq_empty);
			page = zone_pva_to_addr(zone->z_pageq_empty);
			zone_counter_sub(zone, z_wired_empty, meta->zm_chunk_len);
		} else {
			zone_accounting_panic(zone, "z_elems_free corruption");
		}

		zone_meta_validate(zone, meta, page);
| 6458 | |
| 6459 | vm_offset_t old_size = meta->zm_alloc_size; |
| 6460 | vm_offset_t max_size = ptoa(meta->zm_chunk_len) + ZM_ALLOC_SIZE_LOCK; |
| 6461 | |
| 6462 | do { |
| 6463 | eidx = zone_meta_find_and_clear_bit(zone, zs, meta, flags); |
| 6464 | elems[i++] = page + offs + eidx * esize; |
| 6465 | size += esize; |
| 6466 | } while (i < n && old_size + size + esize <= max_size); |
| 6467 | |
		vm_offset_t new_size = zone_meta_alloc_size_add(zone, meta, size);

		if (new_size + esize > max_size) {
			zone_meta_requeue(zone, &zone->z_pageq_full, meta);
		} else if (old_size == 0) {
			/* remove from free, move to intermediate */
			zone_meta_requeue(zone, &zone->z_pageq_partial, meta);
| 6475 | } |
| 6476 | } while (i < n); |
| 6477 | |
| 6478 | n = zone_counter_sub(zone, z_elems_free, n); |
| 6479 | if (zone->z_pcpu_cache == NULL && zone->z_elems_free_min > n) { |
| 6480 | zone->z_elems_free_min = n; |
| 6481 | } |
| 6482 | |
| 6483 | return zone_elem_inner_size(zone); |
| 6484 | } |
| 6485 | |
| 6486 | __attribute__((always_inline)) |
| 6487 | static inline vm_offset_t |
| 6488 | __zcache_mark_valid(zone_t zone, vm_offset_t addr, zalloc_flags_t flags) |
| 6489 | { |
| 6490 | #pragma unused(zone, flags) |
| 6491 | #if KASAN_CLASSIC || CONFIG_PROB_GZALLOC || VM_TAG_SIZECLASSES |
| 6492 | vm_offset_t esize = zone_elem_inner_size(zone); |
| 6493 | #endif |
| 6494 | |
| 6495 | #if CONFIG_KERNEL_TAGGING |
| 6496 | if (__probable(zone->z_tbi_tag)) { |
| 6497 | /* |
| 6498 | * Retrieve the memory tag assigned on free and update the pointer |
| 6499 | * metadata. |
| 6500 | */ |
| 6501 | addr = vm_memtag_fixup_ptr(addr); |
| 6502 | } |
| 6503 | #endif /* CONFIG_KERNEL_TAGGING */ |
| 6504 | |
| 6505 | #if VM_TAG_SIZECLASSES |
| 6506 | if (__improbable(zone->z_uses_tags)) { |
| 6507 | struct zone_page_metadata *meta; |
| 6508 | vm_offset_t offs; |
| 6509 | vm_tag_t *slot; |
| 6510 | vm_tag_t tag; |
| 6511 | |
| 6512 | tag = zalloc_flags_get_tag(flags); |
| 6513 | meta = zone_meta_from_addr(addr); |
| 6514 | offs = (addr & PAGE_MASK) - zone_elem_inner_offs(zone); |
| 6515 | if (meta->zm_chunk_len == ZM_SECONDARY_PAGE) { |
| 6516 | offs += ptoa(meta->zm_page_index); |
| 6517 | } |
| 6518 | |
| 6519 | slot = zba_extra_ref_ptr(meta->zm_bitmap, |
| 6520 | Z_FAST_QUO(offs, zone->z_quo_magic)); |
| 6521 | *slot = tag; |
| 6522 | |
| 6523 | vm_tag_update_zone_size(tag, zone->z_tags_sizeclass, |
| 6524 | (long)esize); |
| 6525 | } |
| 6526 | #endif /* VM_TAG_SIZECLASSES */ |
| 6527 | |
| 6528 | #if CONFIG_PROB_GZALLOC |
| 6529 | if (zone->z_pgz_tracked && pgz_sample(addr, esize)) { |
| 6530 | addr = pgz_protect(zone, addr, __builtin_frame_address(0)); |
| 6531 | } |
| 6532 | #endif |
| 6533 | |
| 6534 | #if KASAN_CLASSIC |
| 6535 | /* |
	 * KASAN_CLASSIC integration of kalloc heaps is handled by kalloc_ext()
| 6537 | */ |
| 6538 | if ((flags & Z_SKIP_KASAN) == 0) { |
| 6539 | kasan_alloc(addr, esize, esize, zone_elem_redzone(zone), |
| 6540 | (flags & Z_PCPU), __builtin_frame_address(0)); |
| 6541 | } |
| 6542 | #endif /* KASAN_CLASSIC */ |
| 6543 | |
| 6544 | return addr; |
| 6545 | } |
| 6546 | |
| 6547 | __attribute__((always_inline)) |
| 6548 | void * |
| 6549 | zcache_mark_valid(zone_t zone, void *addr) |
| 6550 | { |
	addr = (void *)__zcache_mark_valid(zone, (vm_offset_t)addr, 0);
| 6552 | ZALLOC_LOG(zone, (vm_offset_t)addr, 1); |
| 6553 | return addr; |
| 6554 | } |
| 6555 | |
| 6556 | /*! |
| 6557 | * @function zalloc_return |
| 6558 | * |
| 6559 | * @brief |
| 6560 | * Performs the tail-end of the work required on allocations before the caller |
| 6561 | * uses them. |
| 6562 | * |
| 6563 | * @discussion |
 * This function is called without any zone lock held, and with preemption
 * restored to the state it had when @c zalloc_ext() was called.
| 6566 | * |
| 6567 | * @param zone The zone we're allocating from. |
| 6568 | * @param addr The element we just allocated. |
| 6569 | * @param flags The flags passed to @c zalloc_ext() (for Z_ZERO). |
| 6570 | * @param elem_size The element size for this zone. |
| 6571 | */ |
| 6572 | __attribute__((always_inline)) |
| 6573 | static struct kalloc_result |
| 6574 | zalloc_return( |
| 6575 | zone_t zone, |
| 6576 | vm_offset_t addr, |
| 6577 | zalloc_flags_t flags, |
| 6578 | vm_offset_t elem_size) |
| 6579 | { |
| 6580 | addr = __zcache_mark_valid(zone, addr, flags); |
| 6581 | #if ZALLOC_ENABLE_ZERO_CHECK |
	zalloc_validate_element(zone, addr, elem_size, flags);
#endif /* ZALLOC_ENABLE_ZERO_CHECK */
	ZALLOC_LOG(zone, addr, 1);

	DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
	return (struct kalloc_result){ .addr = (void *)addr, .size = elem_size };
| 6588 | } |
| 6589 | |
| 6590 | static vm_size_t |
| 6591 | zalloc_get_shared_threshold(zone_t zone, vm_size_t esize) |
| 6592 | { |
| 6593 | if (esize <= 512) { |
| 6594 | return zone_early_thres_mul * page_size / 4; |
| 6595 | } else if (esize < 2048) { |
| 6596 | return zone_early_thres_mul * esize * 8; |
| 6597 | } |
| 6598 | return zone_early_thres_mul * zone->z_chunk_elems * esize; |
| 6599 | } |
| 6600 | |
| 6601 | __attribute__((noinline)) |
| 6602 | static struct kalloc_result |
| 6603 | zalloc_item(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags) |
| 6604 | { |
| 6605 | vm_offset_t esize, addr; |
| 6606 | zone_stats_t zs; |
| 6607 | |
| 6608 | zone_lock_nopreempt_check_contention(zone); |
| 6609 | |
| 6610 | zs = zpercpu_get(zstats); |
| 6611 | if (__improbable(zone->z_elems_free <= zone->z_elems_rsv / 2)) { |
| 6612 | if ((flags & Z_NOWAIT) || zone->z_elems_free) { |
| 6613 | zone_expand_async_schedule_if_allowed(zone); |
| 6614 | } else { |
			zone_expand_locked(zone, flags);
| 6616 | } |
| 6617 | if (__improbable(zone->z_elems_free == 0)) { |
| 6618 | zs->zs_alloc_fail++; |
| 6619 | zone_unlock(zone); |
| 6620 | if (__improbable(flags & Z_NOFAIL)) { |
| 6621 | zone_nofail_panic(zone); |
| 6622 | } |
| 6623 | DTRACE_VM2(zalloc, zone_t, zone, void*, NULL); |
| 6624 | return (struct kalloc_result){ }; |
| 6625 | } |
| 6626 | } |
| 6627 | |
	esize = zalloc_import(zone, &addr, flags, 1);
| 6629 | zs->zs_mem_allocated += esize; |
| 6630 | |
| 6631 | if (__improbable(!zone_share_always && |
| 6632 | !os_atomic_load(&zs->zs_alloc_not_shared, relaxed))) { |
| 6633 | if (flags & Z_SET_NOTSHARED) { |
| 6634 | vm_size_t shared_threshold = zalloc_get_shared_threshold(zone, esize); |
| 6635 | |
| 6636 | if (zs->zs_mem_allocated >= shared_threshold) { |
| 6637 | zpercpu_foreach(zs_cpu, zstats) { |
| 6638 | os_atomic_store(&zs_cpu->zs_alloc_not_shared, 1, relaxed); |
| 6639 | } |
| 6640 | } |
| 6641 | } |
| 6642 | } |
| 6643 | zone_unlock(zone); |
| 6644 | |
	return zalloc_return(zone, addr, flags, esize);
| 6646 | } |
| 6647 | |
| 6648 | static void |
| 6649 | zalloc_cached_import( |
| 6650 | zone_t zone, |
| 6651 | zalloc_flags_t flags, |
| 6652 | zone_cache_t cache) |
| 6653 | { |
| 6654 | uint16_t n_elems = zc_mag_size(); |
| 6655 | |
| 6656 | zone_lock_nopreempt(zone); |
| 6657 | |
| 6658 | if (__probable(!zone_caching_disabled && |
| 6659 | zone->z_elems_free > zone->z_elems_rsv / 2)) { |
| 6660 | if (__improbable(zone->z_elems_free <= zone->z_elems_rsv)) { |
| 6661 | zone_expand_async_schedule_if_allowed(zone); |
| 6662 | } |
| 6663 | if (zone->z_elems_free < n_elems) { |
| 6664 | n_elems = (uint16_t)zone->z_elems_free; |
| 6665 | } |
		zalloc_import(zone, cache->zc_alloc_elems, flags, n_elems);
| 6667 | cache->zc_alloc_cur = n_elems; |
| 6668 | } |
| 6669 | |
| 6670 | zone_unlock_nopreempt(zone); |
| 6671 | } |
| 6672 | |
| 6673 | static void |
| 6674 | zalloc_cached_depot_recirculate( |
| 6675 | zone_t zone, |
| 6676 | uint32_t depot_max, |
| 6677 | zone_cache_t cache, |
| 6678 | smr_t smr) |
| 6679 | { |
| 6680 | smr_seq_t seq; |
| 6681 | uint32_t n; |
| 6682 | |
| 6683 | zone_recirc_lock_nopreempt_check_contention(zone); |
| 6684 | |
| 6685 | n = cache->zc_depot.zd_empty; |
| 6686 | if (n >= depot_max) { |
		zone_depot_move_empty(&zone->z_recirc, &cache->zc_depot,
		    n - depot_max / 2, NULL);
| 6689 | } |
| 6690 | |
| 6691 | n = cache->zc_depot.zd_full; |
| 6692 | if (smr && n) { |
| 6693 | /* |
| 6694 | * if SMR is in use, it means smr_poll() failed, |
| 6695 | * so rotate the entire chunk of magazines in order |
| 6696 | * to let the sequence numbers age. |
| 6697 | */ |
		seq = zone_depot_move_full(&zone->z_recirc, &cache->zc_depot,
		    n, NULL);
| 6700 | smr_deferred_advance_commit(smr, seq); |
| 6701 | } |
| 6702 | |
| 6703 | n = depot_max - cache->zc_depot.zd_empty; |
| 6704 | if (n > zone->z_recirc.zd_full) { |
| 6705 | n = zone->z_recirc.zd_full; |
| 6706 | } |
| 6707 | |
	if (n && zone_depot_poll(&zone->z_recirc, smr)) {
		zone_depot_move_full(&cache->zc_depot, &zone->z_recirc,
		    n, zone);
| 6711 | } |
| 6712 | |
| 6713 | zone_recirc_unlock_nopreempt(zone); |
| 6714 | } |
| 6715 | |
| 6716 | static void |
| 6717 | zalloc_cached_reuse_smr(zone_t z, zone_cache_t cache, zone_magazine_t mag) |
| 6718 | { |
| 6719 | zone_smr_free_cb_t zc_free = cache->zc_free; |
	vm_size_t esize = zone_elem_inner_size(z);

	for (uint16_t i = 0; i < zc_mag_size(); i++) {
		vm_offset_t elem = mag->zm_elems[i];

		zc_free((void *)elem, zone_elem_inner_size(z));
		elem = __zcache_mark_invalid(z, elem,
| 6727 | ZFREE_PACK_SIZE(esize, esize)); |
| 6728 | mag->zm_elems[i] = elem; |
| 6729 | } |
| 6730 | } |
| 6731 | |
| 6732 | static void |
| 6733 | zalloc_cached_recirculate( |
| 6734 | zone_t zone, |
| 6735 | zone_cache_t cache) |
| 6736 | { |
| 6737 | zone_magazine_t mag = NULL; |
| 6738 | |
| 6739 | zone_recirc_lock_nopreempt_check_contention(zone); |
| 6740 | |
	if (zone_depot_poll(&zone->z_recirc, zone_cache_smr(cache))) {
		mag = zone_depot_pop_head_full(&zone->z_recirc, zone);
		if (zone_cache_smr(cache)) {
			zalloc_cached_reuse_smr(zone, cache, mag);
		}
		mag = zone_magazine_replace(cache, mag, false);
		zone_depot_insert_head_empty(&zone->z_recirc, mag);
| 6748 | } |
| 6749 | |
| 6750 | zone_recirc_unlock_nopreempt(zone); |
| 6751 | } |
| 6752 | |
| 6753 | __attribute__((noinline)) |
| 6754 | static zone_cache_t |
| 6755 | zalloc_cached_prime( |
| 6756 | zone_t zone, |
| 6757 | zone_cache_ops_t ops, |
| 6758 | zalloc_flags_t flags, |
| 6759 | zone_cache_t cache) |
| 6760 | { |
| 6761 | zone_magazine_t mag = NULL; |
| 6762 | uint32_t depot_max; |
| 6763 | smr_t smr; |
| 6764 | |
| 6765 | depot_max = os_atomic_load(&zone->z_depot_size, relaxed); |
| 6766 | if (depot_max) { |
| 6767 | smr = zone_cache_smr(cache); |
| 6768 | |
		zone_depot_lock_nopreempt(cache);

		if (!zone_depot_poll(&cache->zc_depot, smr)) {
			zalloc_cached_depot_recirculate(zone, depot_max, cache,
			    smr);
		}

		if (__probable(cache->zc_depot.zd_full)) {
			mag = zone_depot_pop_head_full(&cache->zc_depot, NULL);
			if (zone_cache_smr(cache)) {
				zalloc_cached_reuse_smr(zone, cache, mag);
			}
			mag = zone_magazine_replace(cache, mag, false);
			zone_depot_insert_head_empty(&cache->zc_depot, mag);
		}

		zone_depot_unlock_nopreempt(cache);
| 6786 | } else if (zone->z_recirc.zd_full) { |
| 6787 | zalloc_cached_recirculate(zone, cache); |
| 6788 | } |
| 6789 | |
| 6790 | if (__probable(cache->zc_alloc_cur)) { |
| 6791 | return cache; |
| 6792 | } |
| 6793 | |
| 6794 | if (ops == NULL) { |
| 6795 | zalloc_cached_import(zone, flags, cache); |
| 6796 | if (__probable(cache->zc_alloc_cur)) { |
| 6797 | return cache; |
| 6798 | } |
| 6799 | } |
| 6800 | |
| 6801 | return NULL; |
| 6802 | } |
| 6803 | |
| 6804 | __attribute__((always_inline)) |
| 6805 | static inline zone_cache_t |
| 6806 | zalloc_cached_get_pcpu_cache( |
| 6807 | zone_t zone, |
| 6808 | zone_cache_ops_t ops, |
| 6809 | int cpu, |
| 6810 | zalloc_flags_t flags) |
| 6811 | { |
| 6812 | zone_cache_t cache = zpercpu_get_cpu(zone->z_pcpu_cache, cpu); |
| 6813 | |
| 6814 | if (__probable(cache->zc_alloc_cur != 0)) { |
| 6815 | return cache; |
| 6816 | } |
| 6817 | |
| 6818 | if (__probable(cache->zc_free_cur != 0 && !cache->zc_smr)) { |
| 6819 | zone_cache_swap_magazines(cache); |
| 6820 | return cache; |
| 6821 | } |
| 6822 | |
| 6823 | return zalloc_cached_prime(zone, ops, flags, cache); |
| 6824 | } |
| 6825 | |
| 6826 | |
| 6827 | /*! |
| 6828 | * @function zalloc_ext |
| 6829 | * |
| 6830 | * @brief |
| 6831 | * The core implementation of @c zalloc(), @c zalloc_flags(), @c zalloc_percpu(). |
| 6832 | */ |
| 6833 | struct kalloc_result |
| 6834 | zalloc_ext(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags) |
| 6835 | { |
| 6836 | /* |
| 6837 | * KASan uses zalloc() for fakestack, which can be called anywhere. |
| 6838 | * However, we make sure these calls can never block. |
| 6839 | */ |
| 6840 | assertf(startup_phase < STARTUP_SUB_EARLY_BOOT || |
| 6841 | #if KASAN_FAKESTACK |
| 6842 | zone->z_kasan_fakestacks || |
| 6843 | #endif /* KASAN_FAKESTACK */ |
| 6844 | ml_get_interrupts_enabled() || |
| 6845 | ml_is_quiescing() || |
| 6846 | debug_mode_active(), |
| 6847 | "Calling {k,z}alloc from interrupt disabled context isn't allowed" ); |
| 6848 | |
| 6849 | /* |
| 6850 | * Make sure Z_NOFAIL was not obviously misused |
| 6851 | */ |
| 6852 | if (flags & Z_NOFAIL) { |
| 6853 | assert((flags & (Z_NOWAIT | Z_NOPAGEWAIT)) == 0); |
| 6854 | } |
| 6855 | |
| 6856 | #if VM_TAG_SIZECLASSES |
| 6857 | if (__improbable(zone->z_uses_tags)) { |
| 6858 | vm_tag_t tag = zalloc_flags_get_tag(flags); |
| 6859 | |
| 6860 | if (flags & Z_VM_TAG_BT_BIT) { |
| 6861 | tag = vm_tag_bt() ?: tag; |
| 6862 | } |
| 6863 | if (tag != VM_KERN_MEMORY_NONE) { |
| 6864 | tag = vm_tag_will_update_zone(tag, zone->z_tags_sizeclass, |
| 6865 | flags & (Z_WAITOK | Z_NOWAIT | Z_NOPAGEWAIT)); |
| 6866 | } |
| 6867 | if (tag == VM_KERN_MEMORY_NONE) { |
| 6868 | zone_security_flags_t zsflags = zone_security_config(zone); |
| 6869 | |
| 6870 | if (zsflags.z_kheap_id == KHEAP_ID_DATA_BUFFERS) { |
| 6871 | tag = VM_KERN_MEMORY_KALLOC_DATA; |
| 6872 | } else if (zsflags.z_kheap_id == KHEAP_ID_KT_VAR || |
| 6873 | zsflags.z_kalloc_type) { |
| 6874 | tag = VM_KERN_MEMORY_KALLOC_TYPE; |
| 6875 | } else { |
| 6876 | tag = VM_KERN_MEMORY_KALLOC; |
| 6877 | } |
| 6878 | } |
| 6879 | flags = Z_VM_TAG(flags & ~Z_VM_TAG_MASK, tag); |
| 6880 | } |
| 6881 | #endif /* VM_TAG_SIZECLASSES */ |
| 6882 | |
| 6883 | disable_preemption(); |
| 6884 | |
| 6885 | #if ZALLOC_ENABLE_ZERO_CHECK |
| 6886 | if (zalloc_skip_zero_check()) { |
| 6887 | flags |= Z_NOZZC; |
| 6888 | } |
| 6889 | #endif |
| 6890 | |
| 6891 | if (zone->z_pcpu_cache) { |
| 6892 | zone_cache_t cache; |
| 6893 | vm_offset_t index, addr, esize; |
| 6894 | int cpu = cpu_number(); |
| 6895 | |
| 6896 | cache = zalloc_cached_get_pcpu_cache(zone, NULL, cpu, flags); |
| 6897 | if (__probable(cache)) { |
| 6898 | esize = zone_elem_inner_size(zone); |
| 6899 | zpercpu_get_cpu(zstats, cpu)->zs_mem_allocated += esize; |
| 6900 | index = --cache->zc_alloc_cur; |
| 6901 | addr = cache->zc_alloc_elems[index]; |
| 6902 | cache->zc_alloc_elems[index] = 0; |
| 6903 | enable_preemption(); |
			return zalloc_return(zone, addr, flags, esize);
| 6905 | } |
| 6906 | } |
| 6907 | |
| 6908 | __attribute__((musttail)) |
| 6909 | return zalloc_item(zone, zstats, flags); |
| 6910 | } |
| 6911 | |
| 6912 | __attribute__((always_inline)) |
| 6913 | static inline zstack_t |
| 6914 | zcache_alloc_stack_from_cpu( |
| 6915 | zone_id_t zid, |
| 6916 | zone_cache_t cache, |
| 6917 | zstack_t stack, |
| 6918 | uint32_t n, |
| 6919 | zone_cache_ops_t ops) |
| 6920 | { |
| 6921 | vm_offset_t *p; |
| 6922 | |
| 6923 | n = MIN(n, cache->zc_alloc_cur); |
| 6924 | p = cache->zc_alloc_elems + cache->zc_alloc_cur; |
| 6925 | cache->zc_alloc_cur -= n; |
| 6926 | stack.z_count += n; |
| 6927 | |
| 6928 | do { |
| 6929 | vm_offset_t e = *--p; |
| 6930 | |
| 6931 | *p = 0; |
| 6932 | if (ops) { |
| 6933 | e = (vm_offset_t)ops->zc_op_mark_valid(zid, (void *)e); |
| 6934 | } else { |
			e = __zcache_mark_valid(zone_by_id(zid), e, 0);
		}
		zstack_push_no_delta(&stack, (void *)e);
| 6938 | } while (--n > 0); |
| 6939 | |
| 6940 | return stack; |
| 6941 | } |
| 6942 | |
| 6943 | __attribute__((noinline)) |
| 6944 | static zstack_t |
| 6945 | zcache_alloc_fail(zone_id_t zid, zstack_t stack, uint32_t count) |
| 6946 | { |
| 6947 | zone_t zone = zone_by_id(zid); |
| 6948 | zone_stats_t zstats = zone->z_stats; |
| 6949 | int cpu; |
| 6950 | |
| 6951 | count -= stack.z_count; |
| 6952 | |
| 6953 | disable_preemption(); |
| 6954 | cpu = cpu_number(); |
| 6955 | zpercpu_get_cpu(zstats, cpu)->zs_mem_allocated -= |
| 6956 | count * zone_elem_inner_size(zone); |
| 6957 | zpercpu_get_cpu(zstats, cpu)->zs_alloc_fail += 1; |
| 6958 | enable_preemption(); |
| 6959 | |
| 6960 | return stack; |
| 6961 | } |
| 6962 | |
| 6963 | #define ZCACHE_ALLOC_RETRY ((void *)-1) |
| 6964 | |
| 6965 | __attribute__((noinline)) |
| 6966 | static void * |
| 6967 | zcache_alloc_one( |
| 6968 | zone_id_t zid, |
| 6969 | zalloc_flags_t flags, |
| 6970 | zone_cache_ops_t ops) |
| 6971 | { |
| 6972 | zone_t zone = zone_by_id(zid); |
| 6973 | void *o; |
| 6974 | |
| 6975 | /* |
| 6976 | * First try to allocate in rudimentary zones without ever going into |
| 6977 | * __ZONE_EXHAUSTED_AND_WAITING_HARD__() by clearing Z_NOFAIL. |
| 6978 | */ |
| 6979 | enable_preemption(); |
| 6980 | o = ops->zc_op_alloc(zid, flags & ~Z_NOFAIL); |
| 6981 | if (__probable(o)) { |
| 6982 | os_atomic_inc(&zone->z_elems_avail, relaxed); |
| 6983 | } else if (__probable(flags & Z_NOFAIL)) { |
| 6984 | zone_cache_t cache; |
| 6985 | vm_offset_t index; |
| 6986 | int cpu; |
| 6987 | |
| 6988 | zone_lock(zone); |
| 6989 | |
| 6990 | cpu = cpu_number(); |
| 6991 | cache = zalloc_cached_get_pcpu_cache(zone, ops, cpu, flags); |
| 6992 | o = ZCACHE_ALLOC_RETRY; |
| 6993 | if (__probable(cache)) { |
| 6994 | index = --cache->zc_alloc_cur; |
| 6995 | o = (void *)cache->zc_alloc_elems[index]; |
| 6996 | cache->zc_alloc_elems[index] = 0; |
| 6997 | o = ops->zc_op_mark_valid(zid, o); |
| 6998 | } else if (zone->z_elems_free == 0) { |
			__ZONE_EXHAUSTED_AND_WAITING_HARD__(zone);
| 7000 | } |
| 7001 | |
| 7002 | zone_unlock(zone); |
| 7003 | } |
| 7004 | |
| 7005 | return o; |
| 7006 | } |
| 7007 | |
| 7008 | __attribute__((always_inline)) |
| 7009 | static zstack_t |
| 7010 | zcache_alloc_n_ext( |
| 7011 | zone_id_t zid, |
| 7012 | uint32_t count, |
| 7013 | zalloc_flags_t flags, |
| 7014 | zone_cache_ops_t ops) |
| 7015 | { |
| 7016 | zstack_t stack = { }; |
| 7017 | zone_cache_t cache; |
| 7018 | zone_t zone; |
| 7019 | int cpu; |
| 7020 | |
| 7021 | disable_preemption(); |
| 7022 | cpu = cpu_number(); |
| 7023 | zone = zone_by_id(zid); |
| 7024 | zpercpu_get_cpu(zone->z_stats, cpu)->zs_mem_allocated += |
| 7025 | count * zone_elem_inner_size(zone); |
| 7026 | |
| 7027 | for (;;) { |
| 7028 | cache = zalloc_cached_get_pcpu_cache(zone, ops, cpu, flags); |
| 7029 | if (__probable(cache)) { |
			stack = zcache_alloc_stack_from_cpu(zid, cache, stack,
			    count - stack.z_count, ops);
| 7032 | enable_preemption(); |
| 7033 | } else { |
| 7034 | void *o; |
| 7035 | |
| 7036 | if (ops) { |
| 7037 | o = zcache_alloc_one(zid, flags, ops); |
| 7038 | } else { |
				o = zalloc_item(zone, zone->z_stats, flags).addr;
			}
			if (__improbable(o == NULL)) {
				return zcache_alloc_fail(zid, stack, count);
			}
			if (ops == NULL || o != ZCACHE_ALLOC_RETRY) {
				zstack_push(&stack, o);
| 7046 | } |
| 7047 | } |
| 7048 | |
| 7049 | if (stack.z_count == count) { |
| 7050 | break; |
| 7051 | } |
| 7052 | |
| 7053 | disable_preemption(); |
| 7054 | cpu = cpu_number(); |
| 7055 | } |
| 7056 | |
| 7057 | ZALLOC_LOG(zone, stack.z_head, stack.z_count); |
| 7058 | |
| 7059 | return stack; |
| 7060 | } |
| 7061 | |
| 7062 | zstack_t |
| 7063 | zalloc_n(zone_id_t zid, uint32_t count, zalloc_flags_t flags) |
| 7064 | { |
| 7065 | return zcache_alloc_n_ext(zid, count, flags, NULL); |
| 7066 | } |
| 7067 | |
| 7068 | zstack_t |
| 7069 | (zcache_alloc_n)( |
| 7070 | zone_id_t zid, |
| 7071 | uint32_t count, |
| 7072 | zalloc_flags_t flags, |
| 7073 | zone_cache_ops_t ops) |
| 7074 | { |
| 7075 | __builtin_assume(ops != NULL); |
| 7076 | return zcache_alloc_n_ext(zid, count, flags, ops); |
| 7077 | } |
| 7078 | |
| 7079 | __attribute__((always_inline)) |
| 7080 | void * |
| 7081 | zalloc(zone_t zov) |
| 7082 | { |
| 7083 | return zalloc_flags(zov, Z_WAITOK); |
| 7084 | } |
| 7085 | |
| 7086 | __attribute__((always_inline)) |
| 7087 | void * |
| 7088 | zalloc_noblock(zone_t zov) |
| 7089 | { |
| 7090 | return zalloc_flags(zov, Z_NOWAIT); |
| 7091 | } |
| 7092 | |
| 7093 | void * |
| 7094 | (zalloc_flags)(zone_t zov, zalloc_flags_t flags) |
| 7095 | { |
| 7096 | zone_t zone = zov->z_self; |
| 7097 | zone_stats_t zstats = zov->z_stats; |
| 7098 | |
| 7099 | assert(zone > &zone_array[ZONE_ID__LAST_RO]); |
| 7100 | assert(!zone->z_percpu && !zone->z_permanent); |
| 7101 | return zalloc_ext(zone, zstats, flags).addr; |
| 7102 | } |
| 7103 | |
| 7104 | __attribute__((always_inline)) |
| 7105 | void * |
| 7106 | (zalloc_id)(zone_id_t zid, zalloc_flags_t flags) |
| 7107 | { |
	return (zalloc_flags)(zone_by_id(zid), flags);
| 7109 | } |
| 7110 | |
| 7111 | void * |
| 7112 | (zalloc_ro)(zone_id_t zid, zalloc_flags_t flags) |
| 7113 | { |
| 7114 | assert(zid >= ZONE_ID__FIRST_RO && zid <= ZONE_ID__LAST_RO); |
| 7115 | zone_t zone = zone_by_id(zid); |
| 7116 | zone_stats_t zstats = zone->z_stats; |
| 7117 | struct kalloc_result kr; |
| 7118 | |
| 7119 | kr = zalloc_ext(zone, zstats, flags); |
| 7120 | #if ZSECURITY_CONFIG(READ_ONLY) |
| 7121 | assert(zone_security_array[zid].z_submap_idx == Z_SUBMAP_IDX_READ_ONLY); |
| 7122 | if (kr.addr) { |
		zone_require_ro(zid, kr.size, kr.addr);
| 7124 | } |
| 7125 | #endif |
| 7126 | return kr.addr; |
| 7127 | } |
| 7128 | |
| 7129 | #if ZSECURITY_CONFIG(READ_ONLY) |
| 7130 | |
| 7131 | __attribute__((always_inline)) |
| 7132 | static bool |
| 7133 | from_current_stack(vm_offset_t addr, vm_size_t size) |
| 7134 | { |
| 7135 | vm_offset_t start = (vm_offset_t)__builtin_frame_address(0); |
| 7136 | vm_offset_t end = (start + kernel_stack_size - 1) & -kernel_stack_size; |
| 7137 | |
| 7138 | addr = vm_memtag_canonicalize_address(addr); |
| 7139 | |
| 7140 | return (addr >= start) && (addr + size < end); |
| 7141 | } |
| 7142 | |
| 7143 | /* |
 * Check if an address is from const memory, i.e. the TEXT or DATA CONST segments,
| 7145 | * or the SECURITY_READ_ONLY_LATE section. |
| 7146 | */ |
| 7147 | #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) |
| 7148 | __attribute__((always_inline)) |
| 7149 | static bool |
| 7150 | from_const_memory(const vm_offset_t addr, vm_size_t size) |
| 7151 | { |
| 7152 | return rorgn_contains(addr, size, true); |
| 7153 | } |
| 7154 | #else /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ |
| 7155 | __attribute__((always_inline)) |
| 7156 | static bool |
| 7157 | from_const_memory(const vm_offset_t addr, vm_size_t size) |
| 7158 | { |
| 7159 | #pragma unused(addr, size) |
| 7160 | return true; |
| 7161 | } |
| 7162 | #endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ |
| 7163 | |
| 7164 | __abortlike |
| 7165 | static void |
| 7166 | zalloc_ro_mut_validation_panic(zone_id_t zid, void *elem, |
| 7167 | const vm_offset_t src, vm_size_t src_size) |
| 7168 | { |
| 7169 | vm_offset_t stack_start = (vm_offset_t)__builtin_frame_address(0); |
| 7170 | vm_offset_t stack_end = (stack_start + kernel_stack_size - 1) & -kernel_stack_size; |
| 7171 | #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) |
| 7172 | extern vm_offset_t rorgn_begin; |
| 7173 | extern vm_offset_t rorgn_end; |
| 7174 | #else |
| 7175 | vm_offset_t const rorgn_begin = 0; |
| 7176 | vm_offset_t const rorgn_end = 0; |
| 7177 | #endif |
| 7178 | |
| 7179 | if (from_ro_map(src, src_size)) { |
		zone_t src_zone = &zone_array[zone_index_from_ptr((void *)src)];
		zone_t dst_zone = &zone_array[zid];
		panic("zalloc_ro_mut failed: source (%p) not from same zone as dst (%p)"
		    " (expected: %s, actual: %s)", (void *)src, elem, src_zone->z_name,
		    dst_zone->z_name);
| 7185 | } |
| 7186 | |
| 7187 | panic("zalloc_ro_mut failed: source (%p, phys %p) not from RO zone map (%p - %p), " |
| 7188 | "current stack (%p - %p) or const memory (phys %p - %p)" , |
| 7189 | (void *)src, (void*)kvtophys(src), |
| 7190 | (void *)zone_info.zi_ro_range.min_address, |
| 7191 | (void *)zone_info.zi_ro_range.max_address, |
| 7192 | (void *)stack_start, (void *)stack_end, |
| 7193 | (void *)rorgn_begin, (void *)rorgn_end); |
| 7194 | } |
| 7195 | |
| 7196 | __attribute__((always_inline)) |
| 7197 | static void |
| 7198 | zalloc_ro_mut_validate_src(zone_id_t zid, void *elem, |
| 7199 | const vm_offset_t src, vm_size_t src_size) |
| 7200 | { |
	if (from_current_stack(src, src_size) ||
	    (from_ro_map(src, src_size) &&
	    zid == zone_index_from_ptr((void *)src)) ||
	    from_const_memory(src, src_size)) {
| 7205 | return; |
| 7206 | } |
| 7207 | zalloc_ro_mut_validation_panic(zid, elem, src, src_size); |
| 7208 | } |
| 7209 | |
| 7210 | #endif /* ZSECURITY_CONFIG(READ_ONLY) */ |
| 7211 | |
| 7212 | __attribute__((noinline)) |
| 7213 | void |
| 7214 | zalloc_ro_mut(zone_id_t zid, void *elem, vm_offset_t offset, |
| 7215 | const void *new_data, vm_size_t new_data_size) |
| 7216 | { |
| 7217 | assert(zid >= ZONE_ID__FIRST_RO && zid <= ZONE_ID__LAST_RO); |
| 7218 | |
| 7219 | #if ZSECURITY_CONFIG(READ_ONLY) |
| 7220 | bool skip_src_check = false; |
| 7221 | |
| 7222 | /* |
| 7223 | * The OSEntitlements RO-zone is a little differently treated. For more |
| 7224 | * information: rdar://100518485. |
| 7225 | */ |
| 7226 | if (zid == ZONE_ID_AMFI_OSENTITLEMENTS) { |
| 7227 | code_signing_config_t cs_config = 0; |
| 7228 | |
		code_signing_configuration(NULL, &cs_config);
| 7230 | if (cs_config & CS_CONFIG_CSM_ENABLED) { |
| 7231 | skip_src_check = true; |
| 7232 | } |
| 7233 | } |
| 7234 | |
| 7235 | if (skip_src_check == false) { |
		zalloc_ro_mut_validate_src(zid, elem, (vm_offset_t)new_data,
		    new_data_size);
	}
	pmap_ro_zone_memcpy(zid, (vm_offset_t)elem, offset,
	    (vm_offset_t)new_data, new_data_size);
| 7241 | #else |
| 7242 | (void)zid; |
| 7243 | memcpy((void *)((uintptr_t)elem + offset), new_data, new_data_size); |
| 7244 | #endif |
| 7245 | } |
| 7246 | |
| 7247 | __attribute__((noinline)) |
| 7248 | uint64_t |
| 7249 | zalloc_ro_mut_atomic(zone_id_t zid, void *elem, vm_offset_t offset, |
| 7250 | zro_atomic_op_t op, uint64_t value) |
| 7251 | { |
| 7252 | assert(zid >= ZONE_ID__FIRST_RO && zid <= ZONE_ID__LAST_RO); |
| 7253 | |
| 7254 | #if ZSECURITY_CONFIG(READ_ONLY) |
	value = pmap_ro_zone_atomic_op(zid, (vm_offset_t)elem, offset, op, value);
| 7256 | #else |
| 7257 | (void)zid; |
| 7258 | value = __zalloc_ro_mut_atomic((vm_offset_t)elem + offset, op, value); |
| 7259 | #endif |
| 7260 | return value; |
| 7261 | } |
| 7262 | |
| 7263 | void |
| 7264 | zalloc_ro_clear(zone_id_t zid, void *elem, vm_offset_t offset, vm_size_t size) |
| 7265 | { |
| 7266 | assert(zid >= ZONE_ID__FIRST_RO && zid <= ZONE_ID__LAST_RO); |
| 7267 | #if ZSECURITY_CONFIG(READ_ONLY) |
	pmap_ro_zone_bzero(zid, (vm_offset_t)elem, offset, size);
| 7269 | #else |
| 7270 | (void)zid; |
| 7271 | bzero((void *)((uintptr_t)elem + offset), size); |
| 7272 | #endif |
| 7273 | } |
| 7274 | |
| 7275 | /* |
| 7276 | * This function will run in the PPL and needs to be robust |
| 7277 | * against an attacker with arbitrary kernel write. |
| 7278 | */ |
| 7279 | |
| 7280 | #if ZSECURITY_CONFIG(READ_ONLY) |
| 7281 | |
| 7282 | __abortlike |
| 7283 | static void |
| 7284 | zone_id_require_ro_panic(zone_id_t zid, void *addr) |
| 7285 | { |
| 7286 | struct zone_size_params p = zone_ro_size_params[zid]; |
| 7287 | vm_offset_t elem = (vm_offset_t)addr; |
| 7288 | uint32_t zindex; |
| 7289 | zone_t other; |
| 7290 | zone_t zone = &zone_array[zid]; |
| 7291 | |
| 7292 | if (!from_ro_map(addr, 1)) { |
| 7293 | panic("zone_require_ro failed: address not in a ro zone (addr: %p)" , addr); |
| 7294 | } |
| 7295 | |
| 7296 | if (!Z_FAST_ALIGNED(PAGE_SIZE - (elem & PAGE_MASK), magic: p.z_align_magic)) { |
| 7297 | panic("zone_require_ro failed: element improperly aligned (addr: %p)" , addr); |
| 7298 | } |
| 7299 | |
| 7300 | zindex = zone_index_from_ptr(ptr: addr); |
| 7301 | other = &zone_array[zindex]; |
| 7302 | if (zindex >= os_atomic_load(&num_zones, relaxed) || !other->z_self) { |
| 7303 | panic("zone_require_ro failed: invalid zone index %d " |
| 7304 | "(addr: %p, expected: %s%s)" , zindex, |
| 7305 | addr, zone_heap_name(zone), zone->z_name); |
| 7306 | } else { |
| 7307 | panic("zone_require_ro failed: address in unexpected zone id %d (%s%s) " |
| 7308 | "(addr: %p, expected: %s%s)" , |
| 7309 | zindex, zone_heap_name(other), other->z_name, |
| 7310 | addr, zone_heap_name(zone), zone->z_name); |
| 7311 | } |
| 7312 | } |
| 7313 | |
| 7314 | #endif /* ZSECURITY_CONFIG(READ_ONLY) */ |
| 7315 | |
| 7316 | __attribute__((always_inline)) |
| 7317 | void |
| 7318 | zone_require_ro(zone_id_t zid, vm_size_t elem_size __unused, void *addr) |
| 7319 | { |
| 7320 | #if ZSECURITY_CONFIG(READ_ONLY) |
| 7321 | struct zone_size_params p = zone_ro_size_params[zid]; |
| 7322 | vm_offset_t elem = (vm_offset_t)addr; |
| 7323 | |
| 7324 | if (!from_ro_map(addr, 1) || |
	    !Z_FAST_ALIGNED(PAGE_SIZE - (elem & PAGE_MASK), p.z_align_magic) ||
	    zid != zone_meta_from_addr(elem)->zm_index) {
| 7327 | zone_id_require_ro_panic(zid, addr); |
| 7328 | } |
| 7329 | #else |
| 7330 | #pragma unused(zid, addr) |
| 7331 | #endif |
| 7332 | } |
| 7333 | |
| 7334 | void * |
| 7335 | (zalloc_percpu)(union zone_or_view zov, zalloc_flags_t flags) |
| 7336 | { |
| 7337 | zone_t zone = zov.zov_view->zv_zone; |
| 7338 | zone_stats_t zstats = zov.zov_view->zv_stats; |
| 7339 | |
| 7340 | assert(zone > &zone_array[ZONE_ID__LAST_RO]); |
| 7341 | assert(zone->z_percpu); |
| 7342 | flags |= Z_PCPU; |
| 7343 | return (void *)__zpcpu_mangle(zalloc_ext(zone, zstats, flags).addr); |
| 7344 | } |
| 7345 | |
| 7346 | static void * |
| 7347 | _zalloc_permanent(zone_t zone, vm_size_t size, vm_offset_t mask) |
| 7348 | { |
| 7349 | struct zone_page_metadata *page_meta; |
| 7350 | vm_offset_t offs, addr; |
| 7351 | zone_pva_t pva; |
| 7352 | |
| 7353 | assert(ml_get_interrupts_enabled() || |
| 7354 | ml_is_quiescing() || |
| 7355 | debug_mode_active() || |
| 7356 | startup_phase < STARTUP_SUB_EARLY_BOOT); |
| 7357 | |
| 7358 | size = (size + mask) & ~mask; |
| 7359 | assert(size <= PAGE_SIZE); |
| 7360 | |
| 7361 | zone_lock(zone); |
| 7362 | assert(zone->z_self == zone); |
| 7363 | |
| 7364 | for (;;) { |
| 7365 | pva = zone->z_pageq_partial; |
		while (!zone_pva_is_null(pva)) {
			page_meta = zone_pva_to_meta(pva);
			if (page_meta->zm_bump + size <= PAGE_SIZE) {
				goto found;
			}
			pva = page_meta->zm_page_next;
		}

		zone_expand_locked(zone, Z_WAITOK);
| 7375 | } |
| 7376 | |
| 7377 | found: |
| 7378 | offs = (uint16_t)((page_meta->zm_bump + mask) & ~mask); |
| 7379 | page_meta->zm_bump = (uint16_t)(offs + size); |
| 7380 | page_meta->zm_alloc_size += size; |
| 7381 | zone->z_elems_free -= size; |
| 7382 | zpercpu_get(zone->z_stats)->zs_mem_allocated += size; |
| 7383 | |
| 7384 | if (page_meta->zm_alloc_size >= PAGE_SIZE - sizeof(vm_offset_t)) { |
		zone_meta_requeue(zone, &zone->z_pageq_full, page_meta);
	}

	zone_unlock(zone);

	if (zone->z_tbi_tag) {
		addr = vm_memtag_fixup_ptr(offs + zone_pva_to_addr(pva));
	} else {
		addr = offs + zone_pva_to_addr(pva);
| 7394 | } |
| 7395 | |
| 7396 | DTRACE_VM2(zalloc, zone_t, zone, void*, addr); |
| 7397 | return (void *)addr; |
| 7398 | } |
| 7399 | |
| 7400 | static void * |
| 7401 | _zalloc_permanent_large(size_t size, vm_offset_t mask, vm_tag_t tag) |
| 7402 | { |
| 7403 | vm_offset_t addr; |
| 7404 | |
	kernel_memory_allocate(kernel_map, &addr, size, mask,
	    KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_ZERO, tag);
| 7407 | |
| 7408 | return (void *)addr; |
| 7409 | } |
| 7410 | |
| 7411 | void * |
| 7412 | zalloc_permanent_tag(vm_size_t size, vm_offset_t mask, vm_tag_t tag) |
| 7413 | { |
| 7414 | if (size <= PAGE_SIZE) { |
| 7415 | zone_t zone = &zone_array[ZONE_ID_PERMANENT]; |
| 7416 | return _zalloc_permanent(zone, size, mask); |
| 7417 | } |
| 7418 | return _zalloc_permanent_large(size, mask, tag); |
| 7419 | } |
| 7420 | |
| 7421 | void * |
| 7422 | zalloc_percpu_permanent(vm_size_t size, vm_offset_t mask) |
| 7423 | { |
| 7424 | zone_t zone = &zone_array[ZONE_ID_PERCPU_PERMANENT]; |
| 7425 | return (void *)__zpcpu_mangle(_zalloc_permanent(zone, size, mask)); |
| 7426 | } |
| 7427 | |
| 7428 | /*! @} */ |
| 7429 | #endif /* !ZALLOC_TEST */ |
| 7430 | #pragma mark zone GC / trimming |
| 7431 | #if !ZALLOC_TEST |
| 7432 | |
| 7433 | static thread_call_data_t zone_trim_callout; |
| 7434 | EVENT_DEFINE(ZONE_EXHAUSTED); |
| 7435 | |
| 7436 | static void |
| 7437 | zone_reclaim_chunk( |
| 7438 | zone_t z, |
| 7439 | struct zone_page_metadata *meta, |
| 7440 | uint32_t free_count) |
| 7441 | { |
| 7442 | vm_address_t page_addr; |
| 7443 | vm_size_t size_to_free; |
| 7444 | uint32_t bitmap_ref; |
| 7445 | uint32_t page_count; |
| 7446 | zone_security_flags_t zsflags = zone_security_config(z); |
| 7447 | bool sequester = !z->z_destroyed; |
| 7448 | bool oob_guard = false; |
| 7449 | |
| 7450 | if (zone_submap_is_sequestered(zsflags)) { |
| 7451 | /* |
| 7452 | * If the entire map is sequestered, we can't return the VA. |
| 7453 | * It stays pinned to the zone forever. |
| 7454 | */ |
| 7455 | sequester = true; |
| 7456 | } |
| 7457 | |
	zone_meta_queue_pop(z, &z->z_pageq_empty);

	page_addr = zone_meta_to_addr(meta);
	page_count = meta->zm_chunk_len;
	oob_guard = meta->zm_guarded;

	if (meta->zm_alloc_size) {
		zone_metadata_corruption(z, meta, "alloc_size");
	}
	if (z->z_percpu) {
		if (page_count != 1) {
			zone_metadata_corruption(z, meta, "page_count");
		}
		size_to_free = ptoa(z->z_chunk_pages);
		zone_remove_wired_pages(z, z->z_chunk_pages);
	} else {
		if (page_count > z->z_chunk_pages) {
			zone_metadata_corruption(z, meta, "page_count");
		}
		if (page_count < z->z_chunk_pages) {
			/* Dequeue non populated VA from z_pageq_va */
			zone_meta_remqueue(z, meta + page_count);
		}
		size_to_free = ptoa(page_count);
		zone_remove_wired_pages(z, page_count);
| 7483 | } |
| 7484 | |
| 7485 | zone_counter_sub(z, z_elems_free, free_count); |
| 7486 | zone_counter_sub(z, z_elems_avail, free_count); |
| 7487 | zone_counter_sub(z, z_wired_empty, page_count); |
| 7488 | zone_counter_sub(z, z_wired_cur, page_count); |
| 7489 | |
| 7490 | if (z->z_pcpu_cache == NULL) { |
| 7491 | if (z->z_elems_free_min < free_count) { |
| 7492 | z->z_elems_free_min = 0; |
| 7493 | } else { |
| 7494 | z->z_elems_free_min -= free_count; |
| 7495 | } |
| 7496 | } |
| 7497 | if (z->z_elems_free_wma < free_count) { |
| 7498 | z->z_elems_free_wma = 0; |
| 7499 | } else { |
| 7500 | z->z_elems_free_wma -= free_count; |
| 7501 | } |
| 7502 | |
| 7503 | bitmap_ref = 0; |
| 7504 | if (sequester) { |
| 7505 | if (meta->zm_inline_bitmap) { |
| 7506 | for (int i = 0; i < meta->zm_chunk_len; i++) { |
| 7507 | meta[i].zm_bitmap = 0; |
| 7508 | } |
| 7509 | } else { |
| 7510 | bitmap_ref = meta->zm_bitmap; |
| 7511 | meta->zm_bitmap = 0; |
| 7512 | } |
| 7513 | meta->zm_chunk_len = 0; |
| 7514 | } else { |
| 7515 | if (!meta->zm_inline_bitmap) { |
| 7516 | bitmap_ref = meta->zm_bitmap; |
| 7517 | } |
| 7518 | zone_counter_sub(z, z_va_cur, z->z_percpu ? 1 : z->z_chunk_pages); |
		bzero(meta, sizeof(*meta) * (z->z_chunk_pages + oob_guard));
| 7520 | } |
| 7521 | |
| 7522 | #if CONFIG_ZLEAKS |
| 7523 | if (__improbable(zleak_should_disable_for_zone(z) && |
| 7524 | startup_phase >= STARTUP_SUB_THREAD_CALL)) { |
| 7525 | thread_call_enter(&zone_leaks_callout); |
| 7526 | } |
| 7527 | #endif /* CONFIG_ZLEAKS */ |
| 7528 | |
	zone_unlock(z);

	if (bitmap_ref) {
		zone_bits_free(bitmap_ref);
| 7533 | } |
| 7534 | |
| 7535 | /* Free the pages for metadata and account for them */ |
| 7536 | #if KASAN_CLASSIC |
| 7537 | if (z->z_percpu) { |
| 7538 | for (uint32_t i = 0; i < z->z_chunk_pages; i++) { |
| 7539 | kasan_zmem_remove(page_addr + ptoa(i), PAGE_SIZE, |
| 7540 | zone_elem_outer_size(z), |
| 7541 | zone_elem_outer_offs(z), |
| 7542 | zone_elem_redzone(z)); |
| 7543 | } |
| 7544 | } else { |
| 7545 | kasan_zmem_remove(page_addr, size_to_free, |
| 7546 | zone_elem_outer_size(z), |
| 7547 | zone_elem_outer_offs(z), |
| 7548 | zone_elem_redzone(z)); |
| 7549 | } |
| 7550 | #endif /* KASAN_CLASSIC */ |
| 7551 | |
| 7552 | if (sequester) { |
		kernel_memory_depopulate(page_addr, size_to_free,
		    KMA_KOBJECT, VM_KERN_MEMORY_ZONE);
	} else {
		assert(zsflags.z_submap_idx != Z_SUBMAP_IDX_VM);
		kmem_free(zone_submap(zsflags), page_addr,
| 7558 | ptoa(z->z_chunk_pages + oob_guard)); |
| 7559 | if (oob_guard) { |
| 7560 | os_atomic_dec(&zone_guard_pages, relaxed); |
| 7561 | } |
| 7562 | } |
| 7563 | |
| 7564 | thread_yield_to_preemption(); |
| 7565 | |
	zone_lock(z);

	if (sequester) {
		zone_meta_queue_push(z, &z->z_pageq_va, meta);
| 7570 | } |
| 7571 | } |
| 7572 | |
| 7573 | static void |
| 7574 | zone_reclaim_elements(zone_t z, uint16_t n, vm_offset_t *elems) |
| 7575 | { |
| 7576 | z_debug_assert(n <= zc_mag_size()); |
| 7577 | |
| 7578 | for (uint16_t i = 0; i < n; i++) { |
| 7579 | vm_offset_t addr = elems[i]; |
| 7580 | elems[i] = 0; |
		zfree_drop(z, addr);
| 7582 | } |
| 7583 | |
| 7584 | z->z_elems_free += n; |
| 7585 | } |
| 7586 | |
| 7587 | static void |
| 7588 | zcache_reclaim_elements(zone_id_t zid, uint16_t n, vm_offset_t *elems) |
| 7589 | { |
| 7590 | z_debug_assert(n <= zc_mag_size()); |
| 7591 | zone_cache_ops_t ops = zcache_ops[zid]; |
| 7592 | |
| 7593 | for (uint16_t i = 0; i < n; i++) { |
| 7594 | vm_offset_t addr = elems[i]; |
| 7595 | elems[i] = 0; |
| 7596 | addr = (vm_offset_t)ops->zc_op_mark_valid(zid, (void *)addr); |
| 7597 | ops->zc_op_free(zid, (void *)addr); |
| 7598 | } |
| 7599 | |
| 7600 | os_atomic_sub(&zone_by_id(zid)->z_elems_avail, n, relaxed); |
| 7601 | } |
| 7602 | |
| 7603 | static void |
| 7604 | zone_depot_trim(zone_t z, uint32_t target, struct zone_depot *zd) |
| 7605 | { |
| 7606 | zpercpu_foreach(zc, z->z_pcpu_cache) { |
| 7607 | zone_depot_lock(zc); |
| 7608 | |
| 7609 | if (zc->zc_depot.zd_full > (target + 1) / 2) { |
| 7610 | uint32_t n = zc->zc_depot.zd_full - (target + 1) / 2; |
			zone_depot_move_full(zd, &zc->zc_depot, n, NULL);
		}

		if (zc->zc_depot.zd_empty > target / 2) {
			uint32_t n = zc->zc_depot.zd_empty - target / 2;
			zone_depot_move_empty(zd, &zc->zc_depot, n, NULL);
| 7617 | } |
| 7618 | |
| 7619 | zone_depot_unlock(zc); |
| 7620 | } |
| 7621 | } |
| 7622 | |
| 7623 | __enum_decl(zone_reclaim_mode_t, uint32_t, { |
| 7624 | ZONE_RECLAIM_TRIM, |
| 7625 | ZONE_RECLAIM_DRAIN, |
| 7626 | ZONE_RECLAIM_DESTROY, |
| 7627 | }); |
| 7628 | |
| 7629 | static void |
| 7630 | zone_reclaim_pcpu(zone_t z, zone_reclaim_mode_t mode, struct zone_depot *zd) |
| 7631 | { |
| 7632 | uint32_t depot_max = 0; |
| 7633 | bool cleanup = mode != ZONE_RECLAIM_TRIM; |
| 7634 | |
| 7635 | if (z->z_depot_cleanup) { |
| 7636 | z->z_depot_cleanup = false; |
| 7637 | depot_max = z->z_depot_size; |
| 7638 | cleanup = true; |
| 7639 | } |
| 7640 | |
| 7641 | if (cleanup) { |
		zone_depot_trim(z, depot_max, zd);
	}

	if (mode == ZONE_RECLAIM_DESTROY) {
		zpercpu_foreach(zc, z->z_pcpu_cache) {
			zone_reclaim_elements(z, zc->zc_alloc_cur,
			    zc->zc_alloc_elems);
			zone_reclaim_elements(z, zc->zc_free_cur,
			    zc->zc_free_elems);
| 7651 | zc->zc_alloc_cur = zc->zc_free_cur = 0; |
| 7652 | } |
| 7653 | |
| 7654 | z->z_recirc_empty_min = 0; |
| 7655 | z->z_recirc_empty_wma = 0; |
| 7656 | z->z_recirc_full_min = 0; |
| 7657 | z->z_recirc_full_wma = 0; |
| 7658 | z->z_recirc_cont_cur = 0; |
| 7659 | z->z_recirc_cont_wma = 0; |
| 7660 | } |
| 7661 | } |
| 7662 | |
| 7663 | static void |
| 7664 | zone_reclaim_recirc_drain(zone_t z, struct zone_depot *zd) |
| 7665 | { |
| 7666 | assert(zd->zd_empty == 0); |
| 7667 | assert(zd->zd_full == 0); |
| 7668 | |
	zone_recirc_lock_nopreempt(z);

	*zd = z->z_recirc;
	if (zd->zd_full == 0) {
		zd->zd_tail = &zd->zd_head;
	}
	zone_depot_init(&z->z_recirc);
	z->z_recirc_empty_min = 0;
	z->z_recirc_empty_wma = 0;
	z->z_recirc_full_min = 0;
	z->z_recirc_full_wma = 0;

	zone_recirc_unlock_nopreempt(z);
| 7682 | } |
| 7683 | |
| 7684 | static void |
| 7685 | zone_reclaim_recirc_trim(zone_t z, struct zone_depot *zd) |
| 7686 | { |
| 7687 | for (;;) { |
| 7688 | uint32_t budget = zc_free_batch_size(); |
| 7689 | uint32_t count; |
| 7690 | bool done = true; |
| 7691 | |
		zone_recirc_lock_nopreempt(z);
| 7693 | count = MIN(z->z_recirc_empty_wma / Z_WMA_UNIT, |
| 7694 | z->z_recirc_empty_min); |
| 7695 | assert(count <= z->z_recirc.zd_empty); |
| 7696 | |
| 7697 | if (count > budget) { |
| 7698 | count = budget; |
| 7699 | done = false; |
| 7700 | } |
| 7701 | if (count) { |
| 7702 | budget -= count; |
			zone_depot_move_empty(zd, &z->z_recirc, count, NULL);
| 7704 | z->z_recirc_empty_min -= count; |
| 7705 | z->z_recirc_empty_wma -= count * Z_WMA_UNIT; |
| 7706 | } |
| 7707 | |
| 7708 | count = MIN(z->z_recirc_full_wma / Z_WMA_UNIT, |
| 7709 | z->z_recirc_full_min); |
| 7710 | assert(count <= z->z_recirc.zd_full); |
| 7711 | |
| 7712 | if (count > budget) { |
| 7713 | count = budget; |
| 7714 | done = false; |
| 7715 | } |
| 7716 | if (count) { |
			zone_depot_move_full(zd, &z->z_recirc, count, NULL);
| 7718 | z->z_recirc_full_min -= count; |
| 7719 | z->z_recirc_full_wma -= count * Z_WMA_UNIT; |
| 7720 | } |
| 7721 | |
		zone_recirc_unlock_nopreempt(z);
| 7723 | |
| 7724 | if (done) { |
| 7725 | return; |
| 7726 | } |
| 7727 | |
| 7728 | /* |
| 7729 | * If the number of magazines to reclaim is too large, |
| 7730 | * we might be keeping preemption disabled for too long. |
| 7731 | * |
| 7732 | * Drop and retake the lock to allow for preemption to occur. |
| 7733 | */ |
		zone_unlock(z);
		zone_lock(z);
| 7736 | } |
| 7737 | } |
| 7738 | |
| 7739 | /*! |
| 7740 | * @function zone_reclaim |
| 7741 | * |
| 7742 | * @brief |
 * Drains or trims the zone.
| 7744 | * |
| 7745 | * @discussion |
 * Draining the zone releases all the free elements and empty pages it caches.
| 7747 | * |
| 7748 | * Trimming the zone tries to respect the working set size, and avoids draining |
| 7749 | * the depot when it's not necessary. |
| 7750 | * |
| 7751 | * @param z The zone to reclaim from |
| 7752 | * @param mode The purpose of this reclaim. |
| 7753 | */ |
| 7754 | static void |
| 7755 | zone_reclaim(zone_t z, zone_reclaim_mode_t mode) |
| 7756 | { |
| 7757 | struct zone_depot zd; |
| 7758 | |
	zone_depot_init(&zd);

	zone_lock(z);
| 7762 | |
| 7763 | if (mode == ZONE_RECLAIM_DESTROY) { |
| 7764 | if (!z->z_destructible || z->z_elems_rsv) { |
| 7765 | panic("zdestroy: Zone %s%s isn't destructible" , |
| 7766 | zone_heap_name(z), z->z_name); |
| 7767 | } |
| 7768 | |
| 7769 | if (!z->z_self || z->z_expander || |
| 7770 | z->z_async_refilling || z->z_expanding_wait) { |
| 7771 | panic("zdestroy: Zone %s%s in an invalid state for destruction" , |
| 7772 | zone_heap_name(z), z->z_name); |
| 7773 | } |
| 7774 | |
| 7775 | #if !KASAN_CLASSIC |
| 7776 | /* |
| 7777 | * Unset the valid bit. We'll hit an assert failure on further |
| 7778 | * operations on this zone, until zinit() is called again. |
| 7779 | * |
| 7780 | * Leave the zone valid for KASan as we will see zfree's on |
| 7781 | * quarantined free elements even after the zone is destroyed. |
| 7782 | */ |
| 7783 | z->z_self = NULL; |
| 7784 | #endif |
| 7785 | z->z_destroyed = true; |
| 7786 | } else if (z->z_destroyed) { |
		return zone_unlock(z);
	} else if (zone_count_free(z) <= z->z_elems_rsv) {
		/* If the zone is under its reserve level, leave it alone. */
		return zone_unlock(z);
| 7791 | } |
| 7792 | |
| 7793 | if (z->z_pcpu_cache) { |
| 7794 | zone_magazine_t mag; |
| 7795 | uint32_t freed = 0; |
| 7796 | |
| 7797 | /* |
| 7798 | * This is all done with the zone lock held on purpose. |
| 7799 | * The work here is O(ncpu), which should still be short. |
| 7800 | * |
| 7801 | * We need to keep the lock held until we have reclaimed |
| 7802 | * at least a few magazines, otherwise if the zone has no |
| 7803 | * free elements outside of the depot, a thread performing |
		 * a concurrent allocation could try to grow the zone
| 7805 | * while we're trying to drain it. |
| 7806 | */ |
| 7807 | if (mode == ZONE_RECLAIM_TRIM) { |
			zone_reclaim_recirc_trim(z, &zd);
		} else {
			zone_reclaim_recirc_drain(z, &zd);
		}
		zone_reclaim_pcpu(z, mode, &zd);
| 7813 | |
| 7814 | if (z->z_chunk_elems) { |
| 7815 | zone_cache_t cache = zpercpu_get_cpu(z->z_pcpu_cache, 0); |
| 7816 | smr_t smr = zone_cache_smr(cache); |
| 7817 | |
| 7818 | while (zd.zd_full) { |
				mag = zone_depot_pop_head_full(&zd, NULL);
				if (smr) {
					smr_wait(smr, mag->zm_seq);
					zalloc_cached_reuse_smr(z, cache, mag);
					freed += zc_mag_size();
				}
				zone_reclaim_elements(z, zc_mag_size(),
				    mag->zm_elems);
				zone_depot_insert_head_empty(&zd, mag);

				freed += zc_mag_size();
				if (freed >= zc_free_batch_size()) {
					zone_unlock(z);
					zone_magazine_free_list(&zd);
					thread_yield_to_preemption();
					zone_lock(z);
| 7835 | freed = 0; |
| 7836 | } |
| 7837 | } |
| 7838 | } else { |
| 7839 | zone_id_t zid = zone_index(z); |
| 7840 | |
			zone_unlock(z);

			assert(zid <= ZONE_ID__FIRST_DYNAMIC && zcache_ops[zid]);

			while (zd.zd_full) {
				mag = zone_depot_pop_head_full(&zd, NULL);
				zcache_reclaim_elements(zid, zc_mag_size(),
				    mag->zm_elems);
| 7849 | zone_magazine_free(mag); |
| 7850 | } |
| 7851 | |
| 7852 | goto cleanup; |
| 7853 | } |
| 7854 | } |
| 7855 | |
	while (!zone_pva_is_null(z->z_pageq_empty)) {
| 7857 | struct zone_page_metadata *meta; |
| 7858 | uint32_t count, limit = z->z_elems_rsv * 5 / 4; |
| 7859 | |
| 7860 | if (mode == ZONE_RECLAIM_TRIM && z->z_pcpu_cache == NULL) { |
| 7861 | limit = MAX(limit, z->z_elems_free - |
| 7862 | MIN(z->z_elems_free_min, z->z_elems_free_wma)); |
| 7863 | } |
| 7864 | |
		meta = zone_pva_to_meta(z->z_pageq_empty);
		count = (uint32_t)ptoa(meta->zm_chunk_len) / zone_elem_outer_size(z);

		if (zone_count_free(z) - count < limit) {
			break;
		}

		zone_reclaim_chunk(z, meta, count);
| 7873 | } |
| 7874 | |
	zone_unlock(z);

cleanup:
	zone_magazine_free_list(&zd);
| 7879 | } |
| 7880 | |
| 7881 | void |
| 7882 | zone_drain(zone_t zone) |
| 7883 | { |
| 7884 | current_thread()->options |= TH_OPT_ZONE_PRIV; |
	lck_mtx_lock(&zone_gc_lock);
	zone_reclaim(zone, ZONE_RECLAIM_DRAIN);
	lck_mtx_unlock(&zone_gc_lock);
| 7888 | current_thread()->options &= ~TH_OPT_ZONE_PRIV; |
| 7889 | } |
| 7890 | |
| 7891 | void |
| 7892 | zcache_drain(zone_id_t zid) |
| 7893 | { |
	zone_drain(zone_by_id(zid));
| 7895 | } |
| 7896 | |
| 7897 | static void |
| 7898 | zone_reclaim_all(zone_reclaim_mode_t mode) |
| 7899 | { |
| 7900 | /* |
| 7901 | * Start with zcaches, so that they flow into the regular zones. |
| 7902 | * |
| 7903 | * Then the zones with VA sequester since depopulating |
| 7904 | * pages will not need to allocate vm map entries for holes, |
| 7905 | * which will give memory back to the system faster. |
| 7906 | */ |
| 7907 | for (zone_id_t zid = ZONE_ID__LAST_RO + 1; zid < ZONE_ID__FIRST_DYNAMIC; zid++) { |
| 7908 | zone_t z = zone_by_id(zid); |
| 7909 | |
| 7910 | if (z->z_self && z->z_chunk_elems == 0) { |
| 7911 | zone_reclaim(z, mode); |
| 7912 | } |
| 7913 | } |
| 7914 | zone_index_foreach(zid) { |
| 7915 | zone_t z = zone_by_id(zid); |
| 7916 | |
| 7917 | if (z == zc_magazine_zone || z->z_chunk_elems == 0) { |
| 7918 | continue; |
| 7919 | } |
if (zone_submap_is_sequestered(zone_security_array[zid]) &&
| 7921 | z->collectable) { |
| 7922 | zone_reclaim(z, mode); |
| 7923 | } |
| 7924 | } |
| 7925 | |
| 7926 | zone_index_foreach(zid) { |
| 7927 | zone_t z = zone_by_id(zid); |
| 7928 | |
| 7929 | if (z == zc_magazine_zone || z->z_chunk_elems == 0) { |
| 7930 | continue; |
| 7931 | } |
if (!zone_submap_is_sequestered(zone_security_array[zid]) &&
| 7933 | z->collectable) { |
| 7934 | zone_reclaim(z, mode); |
| 7935 | } |
| 7936 | } |
| 7937 | |
zone_reclaim(zc_magazine_zone, mode);
| 7939 | } |
| 7940 | |
| 7941 | void |
| 7942 | zone_userspace_reboot_checks(void) |
| 7943 | { |
vm_size_t label_zone_size = zone_size_allocated(ipc_service_port_label_zone);
if (label_zone_size != 0) {
panic("Zone %s should be empty upon userspace reboot. Actual size: %lu.",
ipc_service_port_label_zone->z_name, (unsigned long)label_zone_size);
| 7948 | } |
| 7949 | } |
| 7950 | |
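/*
 * zone_gc() maps the requested level onto a reclaim mode: TRIM and
 * JETSAM trim, DRAIN drains. The jetsam path first kills the process
 * with the largest zone footprint, and escalates to a full drain if the
 * zone map is still nearing exhaustion afterwards.
 */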
| 7951 | void |
| 7952 | zone_gc(zone_gc_level_t level) |
| 7953 | { |
| 7954 | zone_reclaim_mode_t mode; |
| 7955 | zone_t largest_zone = NULL; |
| 7956 | |
| 7957 | switch (level) { |
| 7958 | case ZONE_GC_TRIM: |
| 7959 | mode = ZONE_RECLAIM_TRIM; |
| 7960 | break; |
| 7961 | case ZONE_GC_DRAIN: |
| 7962 | mode = ZONE_RECLAIM_DRAIN; |
| 7963 | break; |
| 7964 | case ZONE_GC_JETSAM: |
| 7965 | largest_zone = kill_process_in_largest_zone(); |
| 7966 | mode = ZONE_RECLAIM_TRIM; |
| 7967 | break; |
| 7968 | } |
| 7969 | |
| 7970 | current_thread()->options |= TH_OPT_ZONE_PRIV; |
lck_mtx_lock(&zone_gc_lock);

zone_reclaim_all(mode);

if (level == ZONE_GC_JETSAM && zone_map_nearing_exhaustion()) {
/*
* If we possibly killed a process, but we're still critical,
* we need to drain harder.
*/
zone_reclaim(largest_zone, ZONE_RECLAIM_DRAIN);
zone_reclaim_all(ZONE_RECLAIM_DRAIN);
}

lck_mtx_unlock(&zone_gc_lock);
| 7985 | current_thread()->options &= ~TH_OPT_ZONE_PRIV; |
| 7986 | } |
| 7987 | |
| 7988 | void |
| 7989 | zone_gc_trim(void) |
| 7990 | { |
zone_gc(ZONE_GC_TRIM);
| 7992 | } |
| 7993 | |
| 7994 | void |
| 7995 | zone_gc_drain(void) |
| 7996 | { |
zone_gc(ZONE_GC_DRAIN);
| 7998 | } |
| 7999 | |
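/*
 * Heuristic shared by the async trim callout and the working-set timer:
 * a zone wants trimming when its depot was asked to shrink, or when the
 * smoothed count of empty/full recirculation magazines exceeds the
 * auto-trim thresholds; uncached zones instead compare their smoothed
 * free-element count against the reserve plus one chunk's worth.
 */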
| 8000 | static bool |
| 8001 | zone_trim_needed(zone_t z) |
| 8002 | { |
| 8003 | if (z->z_depot_cleanup) { |
| 8004 | return true; |
| 8005 | } |
| 8006 | |
| 8007 | if (z->z_async_refilling) { |
| 8008 | /* Don't fight with refill */ |
| 8009 | return false; |
| 8010 | } |
| 8011 | |
| 8012 | if (z->z_pcpu_cache) { |
| 8013 | uint32_t e_n, f_n; |
| 8014 | |
| 8015 | e_n = MIN(z->z_recirc_empty_wma, z->z_recirc_empty_min * Z_WMA_UNIT); |
| 8016 | f_n = MIN(z->z_recirc_full_wma, z->z_recirc_full_min * Z_WMA_UNIT); |
| 8017 | |
| 8018 | if (e_n > zc_autotrim_buckets() * Z_WMA_UNIT) { |
| 8019 | return true; |
| 8020 | } |
| 8021 | |
| 8022 | if (f_n * zc_mag_size() > z->z_elems_rsv * Z_WMA_UNIT && |
f_n * zc_mag_size() * zone_elem_inner_size(z) >
| 8024 | zc_autotrim_size() * Z_WMA_UNIT) { |
| 8025 | return true; |
| 8026 | } |
| 8027 | |
| 8028 | return false; |
| 8029 | } |
| 8030 | |
if (!zone_pva_is_null(z->z_pageq_empty)) {
| 8032 | uint32_t n; |
| 8033 | |
| 8034 | n = MIN(z->z_elems_free_wma, z->z_elems_free_min); |
| 8035 | |
| 8036 | return n >= z->z_elems_rsv + z->z_chunk_elems; |
| 8037 | } |
| 8038 | |
| 8039 | return false; |
| 8040 | } |
| 8041 | |
| 8042 | static void |
| 8043 | zone_trim_async(__unused thread_call_param_t p0, __unused thread_call_param_t p1) |
| 8044 | { |
| 8045 | current_thread()->options |= TH_OPT_ZONE_PRIV; |
| 8046 | |
| 8047 | zone_foreach(z) { |
| 8048 | if (!z->collectable || z == zc_magazine_zone) { |
| 8049 | continue; |
| 8050 | } |
| 8051 | |
if (zone_trim_needed(z)) {
lck_mtx_lock(&zone_gc_lock);
zone_reclaim(z, ZONE_RECLAIM_TRIM);
lck_mtx_unlock(&zone_gc_lock);
}
}

if (zone_trim_needed(zc_magazine_zone)) {
lck_mtx_lock(&zone_gc_lock);
zone_reclaim(zc_magazine_zone, ZONE_RECLAIM_TRIM);
lck_mtx_unlock(&zone_gc_lock);
| 8063 | } |
| 8064 | |
| 8065 | current_thread()->options &= ~TH_OPT_ZONE_PRIV; |
| 8066 | } |
| 8067 | |
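/*
 * Periodic working-set pass: fold the per-period minima into the moving
 * averages, resize each cached zone's per-CPU depot based on observed
 * recirculation contention, enable caching on zones that stay contended,
 * and schedule the async trim callout when some zone looks oversized.
 */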
| 8068 | void |
| 8069 | compute_zone_working_set_size(__unused void *param) |
| 8070 | { |
| 8071 | uint32_t zc_auto = zc_enable_level(); |
| 8072 | bool needs_trim = false; |
| 8073 | |
| 8074 | /* |
| 8075 | * Keep zone caching disabled until the first proc is made. |
| 8076 | */ |
| 8077 | if (__improbable(zone_caching_disabled < 0)) { |
| 8078 | return; |
| 8079 | } |
| 8080 | |
| 8081 | zone_caching_disabled = vm_pool_low(); |
| 8082 | |
| 8083 | if (os_mul_overflow(zc_auto, Z_WMA_UNIT, &zc_auto)) { |
| 8084 | zc_auto = 0; |
| 8085 | } |
| 8086 | |
| 8087 | zone_foreach(z) { |
| 8088 | uint32_t old, wma, cur; |
| 8089 | bool needs_caching = false; |
| 8090 | |
| 8091 | if (z->z_self != z) { |
| 8092 | continue; |
| 8093 | } |
| 8094 | |
zone_lock(z);

zone_recirc_lock_nopreempt(z);
| 8098 | |
| 8099 | if (z->z_pcpu_cache) { |
| 8100 | wma = Z_WMA_MIX(z->z_recirc_empty_wma, z->z_recirc_empty_min); |
| 8101 | z->z_recirc_empty_min = z->z_recirc.zd_empty; |
| 8102 | z->z_recirc_empty_wma = wma; |
| 8103 | } else { |
| 8104 | wma = Z_WMA_MIX(z->z_elems_free_wma, z->z_elems_free_min); |
| 8105 | z->z_elems_free_min = z->z_elems_free; |
| 8106 | z->z_elems_free_wma = wma; |
| 8107 | } |
| 8108 | |
| 8109 | wma = Z_WMA_MIX(z->z_recirc_full_wma, z->z_recirc_full_min); |
| 8110 | z->z_recirc_full_min = z->z_recirc.zd_full; |
| 8111 | z->z_recirc_full_wma = wma; |
| 8112 | |
| 8113 | /* fixed point decimal of contentions per second */ |
| 8114 | old = z->z_recirc_cont_wma; |
| 8115 | cur = z->z_recirc_cont_cur * Z_WMA_UNIT / |
| 8116 | (zpercpu_count() * ZONE_WSS_UPDATE_PERIOD); |
| 8117 | cur = (3 * old + cur) / 4; |
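/*
 * Illustrative (assumed) numbers: with old == 40 and a fresh sample of
 * 80, the blended value is (3 * 40 + 80) / 4 == 50, so one noisy period
 * only moves the estimate a quarter of the way toward the new sample.
 */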
zone_recirc_unlock_nopreempt(z);
| 8119 | |
| 8120 | if (z->z_pcpu_cache) { |
| 8121 | uint16_t size = z->z_depot_size; |
| 8122 | |
if (zone_exhausted(z)) {
| 8124 | if (z->z_depot_size) { |
| 8125 | z->z_depot_size = 0; |
| 8126 | z->z_depot_cleanup = true; |
| 8127 | } |
| 8128 | } else if (size < z->z_depot_limit && cur > zc_grow_level()) { |
| 8129 | /* |
| 8130 | * lose history on purpose now |
| 8131 | * that we just grew, to give |
| 8132 | * the sytem time to adjust. |
| 8133 | */ |
| 8134 | cur = (zc_grow_level() + zc_shrink_level()) / 2; |
| 8135 | size = size ? (3 * size + 2) / 2 : 2; |
| 8136 | z->z_depot_size = MIN(z->z_depot_limit, size); |
| 8137 | } else if (size > 0 && cur <= zc_shrink_level()) { |
| 8138 | /* |
| 8139 | * lose history on purpose now |
| 8140 | * that we just shrunk, to give |
| 8141 | * the sytem time to adjust. |
| 8142 | */ |
| 8143 | cur = (zc_grow_level() + zc_shrink_level()) / 2; |
| 8144 | z->z_depot_size = size - 1; |
| 8145 | z->z_depot_cleanup = true; |
| 8146 | } |
} else if (!z->z_nocaching && !zone_exhaustible(z) && zc_auto &&
| 8148 | old >= zc_auto && cur >= zc_auto) { |
| 8149 | needs_caching = true; |
| 8150 | } |
| 8151 | |
| 8152 | z->z_recirc_cont_wma = cur; |
| 8153 | z->z_recirc_cont_cur = 0; |
| 8154 | |
| 8155 | if (!needs_trim && zone_trim_needed(z)) { |
| 8156 | needs_trim = true; |
| 8157 | } |
| 8158 | |
zone_unlock(z);
| 8160 | |
| 8161 | if (needs_caching) { |
zone_enable_caching(z);
| 8163 | } |
| 8164 | } |
| 8165 | |
| 8166 | if (needs_trim) { |
thread_call_enter(&zone_trim_callout);
| 8168 | } |
| 8169 | } |
| 8170 | |
| 8171 | #endif /* !ZALLOC_TEST */ |
| 8172 | #pragma mark vm integration, MIG routines |
| 8173 | #if !ZALLOC_TEST |
| 8174 | |
| 8175 | extern unsigned int stack_total; |
| 8176 | #if defined (__x86_64__) |
| 8177 | extern unsigned int inuse_ptepages_count; |
| 8178 | #endif |
| 8179 | |
| 8180 | static const char * |
| 8181 | panic_print_get_typename(kalloc_type_views_t cur, kalloc_type_views_t *next, |
| 8182 | bool is_kt_var) |
| 8183 | { |
| 8184 | if (is_kt_var) { |
| 8185 | next->ktv_var = (kalloc_type_var_view_t) cur.ktv_var->kt_next; |
| 8186 | return cur.ktv_var->kt_name; |
| 8187 | } else { |
| 8188 | next->ktv_fixed = (kalloc_type_view_t) cur.ktv_fixed->kt_zv.zv_next; |
| 8189 | return cur.ktv_fixed->kt_zv.zv_name; |
| 8190 | } |
| 8191 | } |
| 8192 | |
| 8193 | static void |
| 8194 | panic_print_types_in_zone(zone_t z, const char* debug_str) |
| 8195 | { |
| 8196 | kalloc_type_views_t kt_cur = {}; |
const char *prev_type = "";
size_t skip_over_site = sizeof("site.") - 1;
| 8199 | zone_security_flags_t zsflags = zone_security_config(z); |
| 8200 | bool is_kt_var = false; |
| 8201 | |
| 8202 | if (zsflags.z_kheap_id == KHEAP_ID_KT_VAR) { |
| 8203 | uint32_t heap_id = KT_VAR_PTR_HEAP0 + ((zone_index(z) - |
| 8204 | kalloc_type_heap_array[KT_VAR_PTR_HEAP0].kh_zstart) / KHEAP_NUM_ZONES); |
| 8205 | kt_cur.ktv_var = kalloc_type_heap_array[heap_id].kt_views; |
| 8206 | is_kt_var = true; |
| 8207 | } else { |
| 8208 | kt_cur.ktv_fixed = (kalloc_type_view_t) z->z_views; |
| 8209 | } |
| 8210 | |
paniclog_append_noflush("kalloc %s in zone, %s (%s):\n",
is_kt_var ? "type arrays" : "types", debug_str, z->z_name);
| 8213 | |
| 8214 | while (kt_cur.ktv_fixed) { |
| 8215 | kalloc_type_views_t kt_next = {}; |
const char *typename = panic_print_get_typename(kt_cur, &kt_next,
is_kt_var) + skip_over_site;
if (strcmp(typename, prev_type) != 0) {
paniclog_append_noflush("\t%-50s\n", typename);
| 8220 | prev_type = typename; |
| 8221 | } |
| 8222 | kt_cur = kt_next; |
| 8223 | } |
paniclog_append_noflush("\n");
| 8225 | } |
| 8226 | |
| 8227 | static void |
| 8228 | panic_display_kalloc_types(void) |
| 8229 | { |
| 8230 | if (kalloc_type_src_zone) { |
panic_print_types_in_zone(kalloc_type_src_zone, "addr belongs to");
}
if (kalloc_type_dst_zone) {
panic_print_types_in_zone(kalloc_type_dst_zone,
"addr is being freed to");
| 8236 | } |
| 8237 | } |
| 8238 | |
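/*
 * Record up to n of the largest zones (by wired size) into
 * largest_zones/zone_size, skipping ZONE_ID_VM_PAGES. A zone is stored
 * in the first slot it beats, so the result is a set of large zones
 * rather than a strictly sorted top-n list.
 */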
| 8239 | static void |
| 8240 | zone_find_n_largest(const uint32_t n, zone_t *largest_zones, |
| 8241 | uint64_t *zone_size) |
| 8242 | { |
| 8243 | zone_index_foreach(zid) { |
| 8244 | zone_t z = &zone_array[zid]; |
| 8245 | vm_offset_t size = zone_size_wired(zone: z); |
| 8246 | |
| 8247 | if (zid == ZONE_ID_VM_PAGES) { |
| 8248 | continue; |
| 8249 | } |
| 8250 | for (uint32_t i = 0; i < n; i++) { |
| 8251 | if (size > zone_size[i]) { |
| 8252 | largest_zones[i] = z; |
| 8253 | zone_size[i] = size; |
| 8254 | break; |
| 8255 | } |
| 8256 | } |
| 8257 | } |
| 8258 | } |
| 8259 | |
| 8260 | #define NUM_LARGEST_ZONES 5 |
| 8261 | static void |
| 8262 | panic_display_largest_zones(void) |
| 8263 | { |
| 8264 | zone_t largest_zones[NUM_LARGEST_ZONES] = { NULL }; |
| 8265 | uint64_t largest_size[NUM_LARGEST_ZONES] = { 0 }; |
| 8266 | |
| 8267 | zone_find_n_largest(NUM_LARGEST_ZONES, largest_zones: (zone_t *) &largest_zones, |
| 8268 | zone_size: (uint64_t *) &largest_size); |
| 8269 | |
| 8270 | paniclog_append_noflush(format: "Largest zones:\n%-28s %10s %10s\n" , |
| 8271 | "Zone Name" , "Cur Size" , "Free Size" ); |
| 8272 | for (uint32_t i = 0; i < NUM_LARGEST_ZONES; i++) { |
| 8273 | zone_t z = largest_zones[i]; |
| 8274 | paniclog_append_noflush(format: "%-8s%-20s %9u%c %9u%c\n" , |
| 8275 | zone_heap_name(z), z->z_name, |
| 8276 | mach_vm_size_pretty(size: largest_size[i]), |
| 8277 | mach_vm_size_unit(size: largest_size[i]), |
| 8278 | mach_vm_size_pretty(size: zone_size_free(zone: z)), |
| 8279 | mach_vm_size_unit(size: zone_size_free(zone: z))); |
| 8280 | } |
| 8281 | } |
| 8282 | |
| 8283 | static void |
| 8284 | panic_display_zprint(void) |
| 8285 | { |
| 8286 | panic_display_largest_zones(); |
| 8287 | paniclog_append_noflush(format: "%-20s %10lu\n" , "Kernel Stacks" , |
| 8288 | (uintptr_t)(kernel_stack_size * stack_total)); |
| 8289 | #if defined (__x86_64__) |
| 8290 | paniclog_append_noflush("%-20s %10lu\n" , "PageTables" , |
| 8291 | (uintptr_t)ptoa(inuse_ptepages_count)); |
| 8292 | #endif |
| 8293 | paniclog_append_noflush(format: "%-20s %10llu\n" , "Kalloc.Large" , |
| 8294 | counter_load(&kalloc_large_total)); |
| 8295 | |
| 8296 | if (panic_kext_memory_info) { |
| 8297 | mach_memory_info_t *mem_info = panic_kext_memory_info; |
| 8298 | |
| 8299 | paniclog_append_noflush(format: "\n%-5s %10s\n" , "Kmod" , "Size" ); |
| 8300 | for (uint32_t i = 0; i < panic_kext_memory_size / sizeof(mem_info[0]); i++) { |
| 8301 | if ((mem_info[i].flags & VM_KERN_SITE_TYPE) != VM_KERN_SITE_KMOD) { |
| 8302 | continue; |
| 8303 | } |
| 8304 | if (mem_info[i].size > (1024 * 1024)) { |
| 8305 | paniclog_append_noflush(format: "%-5lld %10lld\n" , |
| 8306 | mem_info[i].site, mem_info[i].size); |
| 8307 | } |
| 8308 | } |
| 8309 | } |
| 8310 | } |
| 8311 | |
| 8312 | static void |
| 8313 | panic_display_zone_info(void) |
| 8314 | { |
| 8315 | paniclog_append_noflush(format: "Zone info:\n" ); |
| 8316 | paniclog_append_noflush(format: " Zone map: %p - %p\n" , |
| 8317 | (void *)zone_info.zi_map_range.min_address, |
| 8318 | (void *)zone_info.zi_map_range.max_address); |
| 8319 | #if CONFIG_PROB_GZALLOC |
| 8320 | if (pgz_submap) { |
| 8321 | paniclog_append_noflush(" . PGZ : %p - %p\n" , |
| 8322 | (void *)pgz_submap->min_offset, |
| 8323 | (void *)pgz_submap->max_offset); |
| 8324 | } |
| 8325 | #endif /* CONFIG_PROB_GZALLOC */ |
| 8326 | for (int i = 0; i < Z_SUBMAP_IDX_COUNT; i++) { |
| 8327 | vm_map_t map = zone_submaps[i]; |
| 8328 | |
| 8329 | if (map == VM_MAP_NULL) { |
| 8330 | continue; |
| 8331 | } |
| 8332 | paniclog_append_noflush(format: " . %-6s: %p - %p\n" , |
| 8333 | zone_submaps_names[i], |
| 8334 | (void *)map->min_offset, |
| 8335 | (void *)map->max_offset); |
| 8336 | } |
| 8337 | paniclog_append_noflush(format: " Metadata: %p - %p\n" |
| 8338 | " Bitmaps : %p - %p\n" |
| 8339 | " Extra : %p - %p\n" |
| 8340 | "\n" , |
| 8341 | (void *)zone_info.zi_meta_range.min_address, |
| 8342 | (void *)zone_info.zi_meta_range.max_address, |
| 8343 | (void *)zone_info.zi_bits_range.min_address, |
| 8344 | (void *)zone_info.zi_bits_range.max_address, |
| 8345 | (void *)zone_info.zi_xtra_range.min_address, |
| 8346 | (void *)zone_info.zi_xtra_range.max_address); |
| 8347 | } |
| 8348 | |
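/*
 * Best-effort classification of a fault address that falls inside the
 * zone submaps: identify the submap, copy the page metadata with a
 * nofault copy, then report the access as a wild dereference, an
 * out-of-bounds access (the fault landed on a guard page), or a likely
 * use-after-free.
 */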
| 8349 | static void |
| 8350 | panic_display_zone_fault(vm_offset_t addr) |
| 8351 | { |
| 8352 | struct zone_page_metadata meta = { }; |
| 8353 | vm_map_t map = VM_MAP_NULL; |
| 8354 | vm_offset_t oob_offs = 0, size = 0; |
| 8355 | int map_idx = -1; |
| 8356 | zone_t z = NULL; |
const char *kind = "wild deref";
| 8358 | bool oob = false; |
| 8359 | |
| 8360 | /* |
| 8361 | * First: look if we bumped into guard pages between submaps |
| 8362 | */ |
| 8363 | for (int i = 0; i < Z_SUBMAP_IDX_COUNT; i++) { |
| 8364 | map = zone_submaps[i]; |
| 8365 | if (map == VM_MAP_NULL) { |
| 8366 | continue; |
| 8367 | } |
| 8368 | |
| 8369 | if (addr >= map->min_offset && addr < map->max_offset) { |
| 8370 | map_idx = i; |
| 8371 | break; |
| 8372 | } |
| 8373 | } |
| 8374 | |
| 8375 | if (map_idx == -1) { |
| 8376 | /* this really shouldn't happen, submaps are back to back */ |
| 8377 | return; |
| 8378 | } |
| 8379 | |
| 8380 | paniclog_append_noflush(format: "Probabilistic GZAlloc Report:\n" ); |
| 8381 | |
| 8382 | /* |
| 8383 | * Second: look if there's just no metadata at all |
| 8384 | */ |
| 8385 | if (ml_nofault_copy(virtsrc: (vm_offset_t)zone_meta_from_addr(addr), |
| 8386 | virtdst: (vm_offset_t)&meta, size: sizeof(meta)) != sizeof(meta) || |
| 8387 | meta.zm_index == 0 || meta.zm_index >= MAX_ZONES || |
| 8388 | zone_array[meta.zm_index].z_self == NULL) { |
| 8389 | paniclog_append_noflush(format: " Zone : <unknown>\n" ); |
| 8390 | kind = "wild deref, missing or invalid metadata" ; |
| 8391 | } else { |
| 8392 | z = &zone_array[meta.zm_index]; |
| 8393 | paniclog_append_noflush(format: " Zone : %s%s\n" , |
| 8394 | zone_heap_name(z), zone_name(z)); |
| 8395 | if (meta.zm_chunk_len == ZM_PGZ_GUARD) { |
| 8396 | kind = "out-of-bounds (high confidence)" ; |
| 8397 | oob = true; |
| 8398 | size = zone_element_size(elem: (void *)addr, |
| 8399 | z: &z, false, oob_offs: &oob_offs); |
| 8400 | } else { |
| 8401 | kind = "use-after-free (medium confidence)" ; |
| 8402 | } |
| 8403 | } |
| 8404 | |
| 8405 | paniclog_append_noflush(format: " Address : %p\n" , (void *)addr); |
| 8406 | if (oob) { |
| 8407 | paniclog_append_noflush(format: " Element : [%p, %p) of size %d\n" , |
| 8408 | (void *)(trunc_page(addr) - (size - oob_offs)), |
| 8409 | (void *)trunc_page(addr), (uint32_t)(size - oob_offs)); |
| 8410 | } |
| 8411 | paniclog_append_noflush(format: " Submap : %s [%p; %p)\n" , |
| 8412 | zone_submaps_names[map_idx], |
| 8413 | (void *)map->min_offset, (void *)map->max_offset); |
| 8414 | paniclog_append_noflush(format: " Kind : %s\n" , kind); |
| 8415 | if (oob) { |
| 8416 | paniclog_append_noflush(format: " Access : %d byte(s) past\n" , |
| 8417 | (uint32_t)(addr & PAGE_MASK) + 1); |
| 8418 | } |
| 8419 | paniclog_append_noflush(format: " Metadata: zid:%d inl:%d cl:0x%x " |
| 8420 | "0x%04x 0x%08x 0x%08x 0x%08x\n" , |
| 8421 | meta.zm_index, meta.zm_inline_bitmap, meta.zm_chunk_len, |
| 8422 | meta.zm_alloc_size, meta.zm_bitmap, |
| 8423 | meta.zm_page_next.packed_address, |
| 8424 | meta.zm_page_prev.packed_address); |
| 8425 | paniclog_append_noflush(format: "\n" ); |
| 8426 | } |
| 8427 | |
| 8428 | void |
| 8429 | panic_display_zalloc(void) |
| 8430 | { |
| 8431 | bool keepsyms = false; |
| 8432 | |
| 8433 | PE_parse_boot_argn(arg_string: "keepsyms" , arg_ptr: &keepsyms, max_arg: sizeof(keepsyms)); |
| 8434 | |
| 8435 | panic_display_zone_info(); |
| 8436 | |
| 8437 | if (panic_fault_address) { |
| 8438 | #if CONFIG_PROB_GZALLOC |
| 8439 | if (pgz_owned(panic_fault_address)) { |
| 8440 | panic_display_pgz_uaf_info(keepsyms, panic_fault_address); |
| 8441 | } else |
| 8442 | #endif /* CONFIG_PROB_GZALLOC */ |
| 8443 | if (zone_maps_owned(addr: panic_fault_address, size: 1)) { |
| 8444 | panic_display_zone_fault(addr: panic_fault_address); |
| 8445 | } |
| 8446 | } |
| 8447 | |
| 8448 | if (panic_include_zprint) { |
| 8449 | panic_display_zprint(); |
| 8450 | } else if (zone_map_nearing_threshold(ZONE_MAP_EXHAUSTION_PRINT_PANIC)) { |
| 8451 | panic_display_largest_zones(); |
| 8452 | } |
| 8453 | #if CONFIG_ZLEAKS |
| 8454 | if (zleak_active) { |
| 8455 | panic_display_zleaks(keepsyms); |
| 8456 | } |
| 8457 | #endif |
| 8458 | if (panic_include_kalloc_types) { |
| 8459 | panic_display_kalloc_types(); |
| 8460 | } |
| 8461 | } |
| 8462 | |
| 8463 | /* |
| 8464 | * Creates a vm_map_copy_t to return to the caller of mach_* MIG calls |
| 8465 | * requesting zone information. |
* Frees unused pages towards the end of the region, and zeroes out unused
| 8467 | * space on the last page. |
| 8468 | */ |
| 8469 | static vm_map_copy_t |
| 8470 | create_vm_map_copy( |
| 8471 | vm_offset_t start_addr, |
| 8472 | vm_size_t total_size, |
| 8473 | vm_size_t used_size) |
| 8474 | { |
| 8475 | kern_return_t kr; |
| 8476 | vm_offset_t end_addr; |
| 8477 | vm_size_t free_size; |
| 8478 | vm_map_copy_t copy; |
| 8479 | |
| 8480 | if (used_size != total_size) { |
| 8481 | end_addr = start_addr + used_size; |
| 8482 | free_size = total_size - (round_page(x: end_addr) - start_addr); |
| 8483 | |
| 8484 | if (free_size >= PAGE_SIZE) { |
| 8485 | kmem_free(map: ipc_kernel_map, |
| 8486 | addr: round_page(x: end_addr), size: free_size); |
| 8487 | } |
| 8488 | bzero(s: (char *) end_addr, n: round_page(x: end_addr) - end_addr); |
| 8489 | } |
| 8490 | |
| 8491 | kr = vm_map_copyin(src_map: ipc_kernel_map, src_addr: (vm_map_address_t)start_addr, |
| 8492 | len: (vm_map_size_t)used_size, TRUE, copy_result: ©); |
| 8493 | assert(kr == KERN_SUCCESS); |
| 8494 | |
| 8495 | return copy; |
| 8496 | } |
| 8497 | |
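/*
 * Snapshot a zone into the MIG mach_zone_name_t / mach_zone_info_t
 * representation. The zone is copied under its lock; elements sitting in
 * per-CPU caches and depots are counted separately so that mzi_count
 * only reflects elements handed out to clients.
 */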
| 8498 | static boolean_t |
| 8499 | get_zone_info( |
| 8500 | zone_t z, |
| 8501 | mach_zone_name_t *zn, |
| 8502 | mach_zone_info_t *zi) |
| 8503 | { |
| 8504 | struct zone zcopy; |
| 8505 | vm_size_t cached = 0; |
| 8506 | |
| 8507 | assert(z != ZONE_NULL); |
| 8508 | zone_lock(zone: z); |
| 8509 | if (!z->z_self) { |
| 8510 | zone_unlock(zone: z); |
| 8511 | return FALSE; |
| 8512 | } |
| 8513 | zcopy = *z; |
| 8514 | if (z->z_pcpu_cache) { |
| 8515 | zpercpu_foreach(zc, z->z_pcpu_cache) { |
| 8516 | cached += zc->zc_alloc_cur + zc->zc_free_cur; |
| 8517 | cached += zc->zc_depot.zd_full * zc_mag_size(); |
| 8518 | } |
| 8519 | } |
| 8520 | zone_unlock(zone: z); |
| 8521 | |
| 8522 | if (zn != NULL) { |
| 8523 | /* |
| 8524 | * Append kalloc heap name to zone name (if zone is used by kalloc) |
| 8525 | */ |
| 8526 | char temp_zone_name[MAX_ZONE_NAME] = "" ; |
| 8527 | snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s" , |
| 8528 | zone_heap_name(z), z->z_name); |
| 8529 | |
| 8530 | /* assuming here the name data is static */ |
| 8531 | (void) __nosan_strlcpy(dst: zn->mzn_name, src: temp_zone_name, |
| 8532 | sz: strlen(s: temp_zone_name) + 1); |
| 8533 | } |
| 8534 | |
| 8535 | if (zi != NULL) { |
| 8536 | *zi = (mach_zone_info_t) { |
| 8537 | .mzi_count = zone_count_allocated(zone: &zcopy) - cached, |
| 8538 | .mzi_cur_size = ptoa_64(zone_scale_for_percpu(&zcopy, zcopy.z_wired_cur)), |
| 8539 | // max_size for zprint is now high-watermark of pages used |
| 8540 | .mzi_max_size = ptoa_64(zone_scale_for_percpu(&zcopy, zcopy.z_wired_hwm)), |
| 8541 | .mzi_elem_size = zone_scale_for_percpu(zone: &zcopy, size: zcopy.z_elem_size), |
| 8542 | .mzi_alloc_size = ptoa_64(zcopy.z_chunk_pages), |
| 8543 | .mzi_exhaustible = (uint64_t)zone_exhaustible(zone: &zcopy), |
| 8544 | }; |
| 8545 | if (zcopy.z_chunk_pages == 0) { |
| 8546 | /* this is a zcache */ |
| 8547 | zi->mzi_cur_size = zcopy.z_elems_avail * zcopy.z_elem_size; |
| 8548 | } |
| 8549 | zpercpu_foreach(zs, zcopy.z_stats) { |
| 8550 | zi->mzi_sum_size += zs->zs_mem_allocated; |
| 8551 | } |
| 8552 | if (zcopy.collectable) { |
| 8553 | SET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable, |
| 8554 | ptoa_64(zone_scale_for_percpu(&zcopy, zcopy.z_wired_empty))); |
| 8555 | SET_MZI_COLLECTABLE_FLAG(zi->mzi_collectable, TRUE); |
| 8556 | } |
| 8557 | } |
| 8558 | |
| 8559 | return TRUE; |
| 8560 | } |
| 8561 | |
| 8562 | /* mach_memory_info entitlement */ |
| 8563 | #define MEMORYINFO_ENTITLEMENT "com.apple.private.memoryinfo" |
| 8564 | |
| 8565 | /* macro needed to rate-limit mach_memory_info */ |
| 8566 | #define NSEC_DAY (NSEC_PER_SEC * 60 * 60 * 24) |
| 8567 | |
| 8568 | /* declarations necessary to call kauth_cred_issuser() */ |
| 8569 | struct ucred; |
| 8570 | extern int kauth_cred_issuser(struct ucred *); |
| 8571 | extern struct ucred *kauth_cred_get(void); |
| 8572 | |
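/*
 * Policy for mach_memory_info(): non-root callers may only request the
 * redacted form, and when CONFIG_DEBUGGER_FOR_ZONE_INFO is set the
 * unredacted form additionally requires the memoryinfo entitlement and
 * is rate limited (see the comment in the function body).
 */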
| 8573 | static kern_return_t |
| 8574 | mach_memory_info_internal( |
| 8575 | host_t host, |
| 8576 | mach_zone_name_array_t *namesp, |
| 8577 | mach_msg_type_number_t *namesCntp, |
| 8578 | mach_zone_info_array_t *infop, |
| 8579 | mach_msg_type_number_t *infoCntp, |
| 8580 | mach_memory_info_array_t *memoryInfop, |
| 8581 | mach_msg_type_number_t *memoryInfoCntp, |
| 8582 | bool redact_info); |
| 8583 | |
| 8584 | static kern_return_t |
| 8585 | mach_memory_info_security_check(bool redact_info) |
| 8586 | { |
| 8587 | /* If not root, only allow redacted calls. */ |
| 8588 | if (!kauth_cred_issuser(kauth_cred_get()) && !redact_info) { |
| 8589 | return KERN_NO_ACCESS; |
| 8590 | } |
| 8591 | |
| 8592 | if (PE_srd_fused) { |
| 8593 | return KERN_SUCCESS; |
| 8594 | } |
| 8595 | |
| 8596 | /* If does not have the memory entitlement, fail. */ |
| 8597 | #if CONFIG_DEBUGGER_FOR_ZONE_INFO |
| 8598 | if (!IOTaskHasEntitlement(current_task(), MEMORYINFO_ENTITLEMENT)) { |
| 8599 | return KERN_DENIED; |
| 8600 | } |
| 8601 | |
| 8602 | /* |
| 8603 | * On release non-mac arm devices, allow mach_memory_info |
| 8604 | * to be called twice per day per boot. memorymaintenanced |
| 8605 | * calls it once per day, which leaves room for a sysdiagnose. |
| 8606 | * Allow redacted version to be called without rate limit. |
| 8607 | */ |
| 8608 | |
| 8609 | if (!redact_info) { |
| 8610 | static uint64_t first_call = 0, second_call = 0; |
| 8611 | uint64_t now = 0; |
| 8612 | absolutetime_to_nanoseconds(ml_get_timebase(), &now); |
| 8613 | |
| 8614 | if (!first_call) { |
| 8615 | first_call = now; |
| 8616 | } else if (!second_call) { |
| 8617 | second_call = now; |
| 8618 | } else if (first_call + NSEC_DAY > now) { |
| 8619 | return KERN_DENIED; |
| 8620 | } else if (first_call + NSEC_DAY < now) { |
| 8621 | first_call = now; |
| 8622 | second_call = 0; |
| 8623 | } |
| 8624 | } |
| 8625 | #endif |
| 8626 | |
| 8627 | return KERN_SUCCESS; |
| 8628 | } |
| 8629 | |
| 8630 | kern_return_t |
| 8631 | mach_zone_info( |
| 8632 | mach_port_t host_port, |
| 8633 | mach_zone_name_array_t *namesp, |
| 8634 | mach_msg_type_number_t *namesCntp, |
| 8635 | mach_zone_info_array_t *infop, |
| 8636 | mach_msg_type_number_t *infoCntp) |
| 8637 | { |
| 8638 | return mach_memory_info(host: host_port, names: namesp, namesCnt: namesCntp, info: infop, infoCnt: infoCntp, NULL, NULL); |
| 8639 | } |
| 8640 | |
| 8641 | kern_return_t |
| 8642 | mach_memory_info( |
| 8643 | mach_port_t host_port, |
| 8644 | mach_zone_name_array_t *namesp, |
| 8645 | mach_msg_type_number_t *namesCntp, |
| 8646 | mach_zone_info_array_t *infop, |
| 8647 | mach_msg_type_number_t *infoCntp, |
| 8648 | mach_memory_info_array_t *memoryInfop, |
| 8649 | mach_msg_type_number_t *memoryInfoCntp) |
| 8650 | { |
| 8651 | bool redact_info = false; |
| 8652 | host_t host = HOST_NULL; |
| 8653 | |
| 8654 | host = convert_port_to_host_priv(port: host_port); |
| 8655 | if (host == HOST_NULL) { |
| 8656 | redact_info = true; |
| 8657 | host = convert_port_to_host(port: host_port); |
| 8658 | } |
| 8659 | |
| 8660 | return mach_memory_info_internal(host, namesp, namesCntp, infop, infoCntp, memoryInfop, memoryInfoCntp, redact_info); |
| 8661 | } |
| 8662 | |
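/*
 * Redacted reporting: sizes are zeroed out, and kalloc type zones that
 * share an element size are coalesced into a single synthetic
 * "kalloc.<size>" entry so that per-type information is not exposed.
 */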
| 8663 | static void |
| 8664 | zone_info_redact(mach_zone_info_t *zi) |
| 8665 | { |
| 8666 | zi->mzi_cur_size = 0; |
| 8667 | zi->mzi_max_size = 0; |
| 8668 | zi->mzi_alloc_size = 0; |
| 8669 | zi->mzi_sum_size = 0; |
| 8670 | zi->mzi_collectable = 0; |
| 8671 | } |
| 8672 | |
| 8673 | static bool |
| 8674 | zone_info_needs_to_be_coalesced(int zone_index) |
| 8675 | { |
| 8676 | zone_security_flags_t zsflags = zone_security_array[zone_index]; |
| 8677 | if (zsflags.z_kalloc_type || zsflags.z_kheap_id == KHEAP_ID_KT_VAR) { |
| 8678 | return true; |
| 8679 | } |
| 8680 | return false; |
| 8681 | } |
| 8682 | |
| 8683 | static bool |
| 8684 | zone_info_find_coalesce_zone( |
| 8685 | mach_zone_info_t *zi, |
| 8686 | mach_zone_info_t *info, |
| 8687 | int *coalesce, |
| 8688 | int coalesce_count, |
| 8689 | int *coalesce_index) |
| 8690 | { |
| 8691 | for (int i = 0; i < coalesce_count; i++) { |
| 8692 | if (zi->mzi_elem_size == info[coalesce[i]].mzi_elem_size) { |
| 8693 | *coalesce_index = coalesce[i]; |
| 8694 | return true; |
| 8695 | } |
| 8696 | } |
| 8697 | |
| 8698 | return false; |
| 8699 | } |
| 8700 | |
| 8701 | static void |
| 8702 | zone_info_coalesce( |
| 8703 | mach_zone_info_t *info, |
| 8704 | int coalesce_index, |
| 8705 | mach_zone_info_t *zi) |
| 8706 | { |
| 8707 | info[coalesce_index].mzi_count += zi->mzi_count; |
| 8708 | } |
| 8709 | |
| 8710 | static kern_return_t |
| 8711 | mach_memory_info_internal( |
| 8712 | host_t host, |
| 8713 | mach_zone_name_array_t *namesp, |
| 8714 | mach_msg_type_number_t *namesCntp, |
| 8715 | mach_zone_info_array_t *infop, |
| 8716 | mach_msg_type_number_t *infoCntp, |
| 8717 | mach_memory_info_array_t *memoryInfop, |
| 8718 | mach_msg_type_number_t *memoryInfoCntp, |
| 8719 | bool redact_info) |
| 8720 | { |
| 8721 | mach_zone_name_t *names; |
| 8722 | vm_offset_t names_addr; |
| 8723 | vm_size_t names_size; |
| 8724 | |
| 8725 | mach_zone_info_t *info; |
| 8726 | vm_offset_t info_addr; |
| 8727 | vm_size_t info_size; |
| 8728 | |
| 8729 | int *coalesce; |
| 8730 | vm_offset_t coalesce_addr; |
| 8731 | vm_size_t coalesce_size; |
| 8732 | int coalesce_count = 0; |
| 8733 | |
| 8734 | mach_memory_info_t *memory_info; |
| 8735 | vm_offset_t memory_info_addr; |
| 8736 | vm_size_t memory_info_size; |
| 8737 | vm_size_t memory_info_vmsize; |
| 8738 | unsigned int num_info; |
| 8739 | |
| 8740 | unsigned int max_zones, used_zones, i; |
| 8741 | mach_zone_name_t *zn; |
| 8742 | mach_zone_info_t *zi; |
| 8743 | kern_return_t kr; |
| 8744 | |
| 8745 | uint64_t zones_collectable_bytes = 0; |
| 8746 | |
| 8747 | if (host == HOST_NULL) { |
| 8748 | return KERN_INVALID_HOST; |
| 8749 | } |
| 8750 | |
| 8751 | kr = mach_memory_info_security_check(redact_info); |
| 8752 | if (kr != KERN_SUCCESS) { |
| 8753 | return kr; |
| 8754 | } |
| 8755 | |
| 8756 | /* |
| 8757 | * We assume that zones aren't freed once allocated. |
| 8758 | * We won't pick up any zones that are allocated later. |
| 8759 | */ |
| 8760 | |
| 8761 | max_zones = os_atomic_load(&num_zones, relaxed); |
| 8762 | |
| 8763 | names_size = round_page(x: max_zones * sizeof *names); |
| 8764 | kr = kmem_alloc(map: ipc_kernel_map, addrp: &names_addr, size: names_size, |
| 8765 | flags: KMA_PAGEABLE | KMA_DATA, VM_KERN_MEMORY_IPC); |
| 8766 | if (kr != KERN_SUCCESS) { |
| 8767 | return kr; |
| 8768 | } |
| 8769 | names = (mach_zone_name_t *) names_addr; |
| 8770 | |
| 8771 | info_size = round_page(x: max_zones * sizeof *info); |
| 8772 | kr = kmem_alloc(map: ipc_kernel_map, addrp: &info_addr, size: info_size, |
| 8773 | flags: KMA_PAGEABLE | KMA_DATA, VM_KERN_MEMORY_IPC); |
| 8774 | if (kr != KERN_SUCCESS) { |
| 8775 | kmem_free(map: ipc_kernel_map, |
| 8776 | addr: names_addr, size: names_size); |
| 8777 | return kr; |
| 8778 | } |
| 8779 | info = (mach_zone_info_t *) info_addr; |
| 8780 | |
| 8781 | if (redact_info) { |
| 8782 | coalesce_size = round_page(x: max_zones * sizeof *coalesce); |
| 8783 | kr = kmem_alloc(map: ipc_kernel_map, addrp: &coalesce_addr, size: coalesce_size, |
| 8784 | flags: KMA_PAGEABLE | KMA_DATA, VM_KERN_MEMORY_IPC); |
| 8785 | if (kr != KERN_SUCCESS) { |
| 8786 | kmem_free(map: ipc_kernel_map, |
| 8787 | addr: names_addr, size: names_size); |
| 8788 | kmem_free(map: ipc_kernel_map, |
| 8789 | addr: info_addr, size: info_size); |
| 8790 | return kr; |
| 8791 | } |
| 8792 | coalesce = (int *)coalesce_addr; |
| 8793 | } |
| 8794 | |
| 8795 | zn = &names[0]; |
| 8796 | zi = &info[0]; |
| 8797 | |
| 8798 | used_zones = 0; |
| 8799 | for (i = 0; i < max_zones; i++) { |
| 8800 | if (!get_zone_info(z: &(zone_array[i]), zn, zi)) { |
| 8801 | continue; |
| 8802 | } |
| 8803 | |
| 8804 | if (!redact_info) { |
| 8805 | zones_collectable_bytes += GET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable); |
| 8806 | zn++; |
| 8807 | zi++; |
| 8808 | used_zones++; |
| 8809 | continue; |
| 8810 | } |
| 8811 | |
| 8812 | zone_info_redact(zi); |
| 8813 | if (!zone_info_needs_to_be_coalesced(zone_index: i)) { |
| 8814 | zn++; |
| 8815 | zi++; |
| 8816 | used_zones++; |
| 8817 | continue; |
| 8818 | } |
| 8819 | |
| 8820 | int coalesce_index; |
| 8821 | bool found_coalesce_zone = zone_info_find_coalesce_zone(zi, info, |
| 8822 | coalesce, coalesce_count, coalesce_index: &coalesce_index); |
| 8823 | |
| 8824 | /* Didn't find a zone to coalesce */ |
| 8825 | if (!found_coalesce_zone) { |
| 8826 | /* Updates the zone name */ |
| 8827 | __nosan_bzero(dst: zn->mzn_name, MAX_ZONE_NAME); |
| 8828 | snprintf(zn->mzn_name, MAX_ZONE_NAME, "kalloc.%d" , |
| 8829 | (int)zi->mzi_elem_size); |
| 8830 | |
| 8831 | coalesce[coalesce_count] = used_zones; |
| 8832 | coalesce_count++; |
| 8833 | zn++; |
| 8834 | zi++; |
| 8835 | used_zones++; |
| 8836 | continue; |
| 8837 | } |
| 8838 | |
| 8839 | zone_info_coalesce(info, coalesce_index, zi); |
| 8840 | } |
| 8841 | |
| 8842 | if (redact_info) { |
| 8843 | kmem_free(map: ipc_kernel_map, addr: coalesce_addr, size: coalesce_size); |
| 8844 | } |
| 8845 | |
| 8846 | *namesp = (mach_zone_name_t *) create_vm_map_copy(start_addr: names_addr, total_size: names_size, used_size: used_zones * sizeof *names); |
| 8847 | *namesCntp = used_zones; |
| 8848 | |
| 8849 | *infop = (mach_zone_info_t *) create_vm_map_copy(start_addr: info_addr, total_size: info_size, used_size: used_zones * sizeof *info); |
| 8850 | *infoCntp = used_zones; |
| 8851 | |
| 8852 | num_info = 0; |
| 8853 | memory_info_addr = 0; |
| 8854 | |
| 8855 | if (memoryInfop && memoryInfoCntp) { |
| 8856 | vm_map_copy_t copy; |
| 8857 | num_info = vm_page_diagnose_estimate(); |
| 8858 | memory_info_size = num_info * sizeof(*memory_info); |
| 8859 | memory_info_vmsize = round_page(x: memory_info_size); |
| 8860 | kr = kmem_alloc(map: ipc_kernel_map, addrp: &memory_info_addr, size: memory_info_vmsize, |
| 8861 | flags: KMA_PAGEABLE | KMA_DATA, VM_KERN_MEMORY_IPC); |
| 8862 | if (kr != KERN_SUCCESS) { |
| 8863 | return kr; |
| 8864 | } |
| 8865 | |
| 8866 | kr = vm_map_wire_kernel(map: ipc_kernel_map, start: memory_info_addr, end: memory_info_addr + memory_info_vmsize, |
| 8867 | VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE); |
| 8868 | assert(kr == KERN_SUCCESS); |
| 8869 | |
| 8870 | memory_info = (mach_memory_info_t *) memory_info_addr; |
| 8871 | vm_page_diagnose(info: memory_info, num_info, zones_collectable_bytes, redact_info); |
| 8872 | |
| 8873 | kr = vm_map_unwire(map: ipc_kernel_map, start: memory_info_addr, end: memory_info_addr + memory_info_vmsize, FALSE); |
| 8874 | assert(kr == KERN_SUCCESS); |
| 8875 | |
| 8876 | kr = vm_map_copyin(src_map: ipc_kernel_map, src_addr: (vm_map_address_t)memory_info_addr, |
| 8877 | len: (vm_map_size_t)memory_info_size, TRUE, copy_result: ©); |
| 8878 | assert(kr == KERN_SUCCESS); |
| 8879 | |
| 8880 | *memoryInfop = (mach_memory_info_t *) copy; |
| 8881 | *memoryInfoCntp = num_info; |
| 8882 | } |
| 8883 | |
| 8884 | return KERN_SUCCESS; |
| 8885 | } |
| 8886 | |
| 8887 | kern_return_t |
| 8888 | mach_zone_info_for_zone( |
| 8889 | host_priv_t host, |
| 8890 | mach_zone_name_t name, |
| 8891 | mach_zone_info_t *infop) |
| 8892 | { |
| 8893 | zone_t zone_ptr; |
| 8894 | |
| 8895 | if (host == HOST_NULL) { |
| 8896 | return KERN_INVALID_HOST; |
| 8897 | } |
| 8898 | |
| 8899 | #if CONFIG_DEBUGGER_FOR_ZONE_INFO |
| 8900 | if (!PE_i_can_has_debugger(NULL)) { |
| 8901 | return KERN_INVALID_HOST; |
| 8902 | } |
| 8903 | #endif |
| 8904 | |
| 8905 | if (infop == NULL) { |
| 8906 | return KERN_INVALID_ARGUMENT; |
| 8907 | } |
| 8908 | |
| 8909 | zone_ptr = ZONE_NULL; |
| 8910 | zone_foreach(z) { |
| 8911 | /* |
| 8912 | * Append kalloc heap name to zone name (if zone is used by kalloc) |
| 8913 | */ |
| 8914 | char temp_zone_name[MAX_ZONE_NAME] = "" ; |
| 8915 | snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s" , |
| 8916 | zone_heap_name(z), z->z_name); |
| 8917 | |
| 8918 | /* Find the requested zone by name */ |
| 8919 | if (track_this_zone(zonename: temp_zone_name, logname: name.mzn_name)) { |
| 8920 | zone_ptr = z; |
| 8921 | break; |
| 8922 | } |
| 8923 | } |
| 8924 | |
| 8925 | /* No zones found with the requested zone name */ |
| 8926 | if (zone_ptr == ZONE_NULL) { |
| 8927 | return KERN_INVALID_ARGUMENT; |
| 8928 | } |
| 8929 | |
| 8930 | if (get_zone_info(z: zone_ptr, NULL, zi: infop)) { |
| 8931 | return KERN_SUCCESS; |
| 8932 | } |
| 8933 | return KERN_FAILURE; |
| 8934 | } |
| 8935 | |
| 8936 | kern_return_t |
| 8937 | mach_zone_info_for_largest_zone( |
| 8938 | host_priv_t host, |
| 8939 | mach_zone_name_t *namep, |
| 8940 | mach_zone_info_t *infop) |
| 8941 | { |
| 8942 | if (host == HOST_NULL) { |
| 8943 | return KERN_INVALID_HOST; |
| 8944 | } |
| 8945 | |
| 8946 | #if CONFIG_DEBUGGER_FOR_ZONE_INFO |
| 8947 | if (!PE_i_can_has_debugger(NULL)) { |
| 8948 | return KERN_INVALID_HOST; |
| 8949 | } |
| 8950 | #endif |
| 8951 | |
| 8952 | if (namep == NULL || infop == NULL) { |
| 8953 | return KERN_INVALID_ARGUMENT; |
| 8954 | } |
| 8955 | |
| 8956 | if (get_zone_info(z: zone_find_largest(NULL), zn: namep, zi: infop)) { |
| 8957 | return KERN_SUCCESS; |
| 8958 | } |
| 8959 | return KERN_FAILURE; |
| 8960 | } |
| 8961 | |
| 8962 | uint64_t |
| 8963 | get_zones_collectable_bytes(void) |
| 8964 | { |
| 8965 | uint64_t zones_collectable_bytes = 0; |
| 8966 | mach_zone_info_t zi; |
| 8967 | |
| 8968 | zone_foreach(z) { |
| 8969 | if (get_zone_info(z, NULL, zi: &zi)) { |
| 8970 | zones_collectable_bytes += |
| 8971 | GET_MZI_COLLECTABLE_BYTES(zi.mzi_collectable); |
| 8972 | } |
| 8973 | } |
| 8974 | |
| 8975 | return zones_collectable_bytes; |
| 8976 | } |
| 8977 | |
| 8978 | kern_return_t |
| 8979 | mach_zone_get_zlog_zones( |
| 8980 | host_priv_t host, |
| 8981 | mach_zone_name_array_t *namesp, |
| 8982 | mach_msg_type_number_t *namesCntp) |
| 8983 | { |
| 8984 | #if ZALLOC_ENABLE_LOGGING |
| 8985 | unsigned int max_zones, logged_zones, i; |
| 8986 | kern_return_t kr; |
| 8987 | zone_t zone_ptr; |
| 8988 | mach_zone_name_t *names; |
| 8989 | vm_offset_t names_addr; |
| 8990 | vm_size_t names_size; |
| 8991 | |
| 8992 | if (host == HOST_NULL) { |
| 8993 | return KERN_INVALID_HOST; |
| 8994 | } |
| 8995 | |
| 8996 | if (namesp == NULL || namesCntp == NULL) { |
| 8997 | return KERN_INVALID_ARGUMENT; |
| 8998 | } |
| 8999 | |
| 9000 | max_zones = os_atomic_load(&num_zones, relaxed); |
| 9001 | |
| 9002 | names_size = round_page(max_zones * sizeof *names); |
| 9003 | kr = kmem_alloc(ipc_kernel_map, &names_addr, names_size, |
| 9004 | KMA_PAGEABLE | KMA_DATA, VM_KERN_MEMORY_IPC); |
| 9005 | if (kr != KERN_SUCCESS) { |
| 9006 | return kr; |
| 9007 | } |
| 9008 | names = (mach_zone_name_t *) names_addr; |
| 9009 | |
| 9010 | zone_ptr = ZONE_NULL; |
| 9011 | logged_zones = 0; |
| 9012 | for (i = 0; i < max_zones; i++) { |
| 9013 | zone_t z = &(zone_array[i]); |
| 9014 | assert(z != ZONE_NULL); |
| 9015 | |
| 9016 | /* Copy out the zone name if zone logging is enabled */ |
| 9017 | if (z->z_btlog) { |
| 9018 | get_zone_info(z, &names[logged_zones], NULL); |
| 9019 | logged_zones++; |
| 9020 | } |
| 9021 | } |
| 9022 | |
| 9023 | *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, logged_zones * sizeof *names); |
| 9024 | *namesCntp = logged_zones; |
| 9025 | |
| 9026 | return KERN_SUCCESS; |
| 9027 | |
| 9028 | #else /* ZALLOC_ENABLE_LOGGING */ |
| 9029 | #pragma unused(host, namesp, namesCntp) |
| 9030 | return KERN_FAILURE; |
| 9031 | #endif /* ZALLOC_ENABLE_LOGGING */ |
| 9032 | } |
| 9033 | |
| 9034 | kern_return_t |
| 9035 | mach_zone_get_btlog_records( |
| 9036 | host_priv_t host, |
| 9037 | mach_zone_name_t name, |
| 9038 | zone_btrecord_array_t *recsp, |
| 9039 | mach_msg_type_number_t *numrecs) |
| 9040 | { |
| 9041 | #if ZALLOC_ENABLE_LOGGING |
| 9042 | zone_btrecord_t *recs; |
| 9043 | kern_return_t kr; |
| 9044 | vm_address_t addr; |
| 9045 | vm_size_t size; |
| 9046 | zone_t zone_ptr; |
| 9047 | vm_map_copy_t copy; |
| 9048 | |
| 9049 | if (host == HOST_NULL) { |
| 9050 | return KERN_INVALID_HOST; |
| 9051 | } |
| 9052 | |
| 9053 | if (recsp == NULL || numrecs == NULL) { |
| 9054 | return KERN_INVALID_ARGUMENT; |
| 9055 | } |
| 9056 | |
| 9057 | zone_ptr = ZONE_NULL; |
| 9058 | zone_foreach(z) { |
| 9059 | /* |
| 9060 | * Append kalloc heap name to zone name (if zone is used by kalloc) |
| 9061 | */ |
| 9062 | char temp_zone_name[MAX_ZONE_NAME] = "" ; |
| 9063 | snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s" , |
| 9064 | zone_heap_name(z), z->z_name); |
| 9065 | |
| 9066 | /* Find the requested zone by name */ |
| 9067 | if (track_this_zone(temp_zone_name, name.mzn_name)) { |
| 9068 | zone_ptr = z; |
| 9069 | break; |
| 9070 | } |
| 9071 | } |
| 9072 | |
| 9073 | /* No zones found with the requested zone name */ |
| 9074 | if (zone_ptr == ZONE_NULL) { |
| 9075 | return KERN_INVALID_ARGUMENT; |
| 9076 | } |
| 9077 | |
| 9078 | /* Logging not turned on for the requested zone */ |
| 9079 | if (!zone_ptr->z_btlog) { |
| 9080 | return KERN_FAILURE; |
| 9081 | } |
| 9082 | |
| 9083 | kr = btlog_get_records(zone_ptr->z_btlog, &recs, numrecs); |
| 9084 | if (kr != KERN_SUCCESS) { |
| 9085 | return kr; |
| 9086 | } |
| 9087 | |
| 9088 | addr = (vm_address_t)recs; |
| 9089 | size = sizeof(zone_btrecord_t) * *numrecs; |
| 9090 | |
| 9091 | kr = vm_map_copyin(ipc_kernel_map, addr, size, TRUE, ©); |
| 9092 | assert(kr == KERN_SUCCESS); |
| 9093 | |
| 9094 | *recsp = (zone_btrecord_t *)copy; |
| 9095 | return KERN_SUCCESS; |
| 9096 | |
| 9097 | #else /* !ZALLOC_ENABLE_LOGGING */ |
| 9098 | #pragma unused(host, name, recsp, numrecs) |
| 9099 | return KERN_FAILURE; |
| 9100 | #endif /* !ZALLOC_ENABLE_LOGGING */ |
| 9101 | } |
| 9102 | |
| 9103 | |
| 9104 | kern_return_t |
| 9105 | mach_zone_force_gc( |
| 9106 | host_t host) |
| 9107 | { |
| 9108 | if (host == HOST_NULL) { |
| 9109 | return KERN_INVALID_HOST; |
| 9110 | } |
| 9111 | |
| 9112 | #if DEBUG || DEVELOPMENT |
| 9113 | extern boolean_t(*volatile consider_buffer_cache_collect)(int); |
| 9114 | /* Callout to buffer cache GC to drop elements in the apfs zones */ |
| 9115 | if (consider_buffer_cache_collect != NULL) { |
| 9116 | (void)(*consider_buffer_cache_collect)(0); |
| 9117 | } |
| 9118 | zone_gc(ZONE_GC_DRAIN); |
| 9119 | #endif /* DEBUG || DEVELOPMENT */ |
| 9120 | return KERN_SUCCESS; |
| 9121 | } |
| 9122 | |
| 9123 | zone_t |
| 9124 | zone_find_largest(uint64_t *zone_size) |
| 9125 | { |
| 9126 | zone_t largest_zone = 0; |
| 9127 | uint64_t largest_zone_size = 0; |
| 9128 | zone_find_n_largest(n: 1, largest_zones: &largest_zone, zone_size: &largest_zone_size); |
| 9129 | if (zone_size) { |
| 9130 | *zone_size = largest_zone_size; |
| 9131 | } |
| 9132 | return largest_zone; |
| 9133 | } |
| 9134 | |
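/*
 * Lockless snapshot of a zone's basic statistics; since the counters can
 * move underneath us, the avail/free/alloc values are reconciled with an
 * overflow check instead of being trusted blindly.
 */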
| 9135 | void |
| 9136 | zone_get_stats( |
| 9137 | zone_t zone, |
| 9138 | struct zone_basic_stats *stats) |
| 9139 | { |
| 9140 | stats->zbs_avail = zone->z_elems_avail; |
| 9141 | |
| 9142 | stats->zbs_alloc_fail = 0; |
| 9143 | zpercpu_foreach(zs, zone->z_stats) { |
| 9144 | stats->zbs_alloc_fail += zs->zs_alloc_fail; |
| 9145 | } |
| 9146 | |
| 9147 | stats->zbs_cached = 0; |
| 9148 | if (zone->z_pcpu_cache) { |
| 9149 | zpercpu_foreach(zc, zone->z_pcpu_cache) { |
| 9150 | stats->zbs_cached += zc->zc_alloc_cur + |
| 9151 | zc->zc_free_cur + |
| 9152 | zc->zc_depot.zd_full * zc_mag_size(); |
| 9153 | } |
| 9154 | } |
| 9155 | |
| 9156 | stats->zbs_free = zone_count_free(zone) + stats->zbs_cached; |
| 9157 | |
| 9158 | /* |
| 9159 | * Since we don't take any locks, deal with possible inconsistencies |
| 9160 | * as the counters may have changed. |
| 9161 | */ |
| 9162 | if (os_sub_overflow(stats->zbs_avail, stats->zbs_free, |
| 9163 | &stats->zbs_alloc)) { |
| 9164 | stats->zbs_avail = stats->zbs_free; |
| 9165 | stats->zbs_alloc = 0; |
| 9166 | } |
| 9167 | } |
| 9168 | |
| 9169 | #endif /* !ZALLOC_TEST */ |
| 9170 | #pragma mark zone creation, configuration, destruction |
| 9171 | #if !ZALLOC_TEST |
| 9172 | |
| 9173 | static zone_t |
| 9174 | zone_init_defaults(zone_id_t zid) |
| 9175 | { |
| 9176 | zone_t z = &zone_array[zid]; |
| 9177 | |
| 9178 | z->z_wired_max = ~0u; |
| 9179 | z->collectable = true; |
| 9180 | |
| 9181 | hw_lck_ticket_init(&z->z_lock, &zone_locks_grp); |
| 9182 | hw_lck_ticket_init(&z->z_recirc_lock, &zone_locks_grp); |
| 9183 | zone_depot_init(zd: &z->z_recirc); |
| 9184 | return z; |
| 9185 | } |
| 9186 | |
| 9187 | void |
| 9188 | zone_set_exhaustible(zone_t zone, vm_size_t nelems, bool exhausts_by_design) |
| 9189 | { |
| 9190 | zone_lock(zone); |
| 9191 | zone->z_wired_max = zone_alloc_pages_for_nelems(z: zone, max_elems: nelems); |
| 9192 | zone->z_exhausts = exhausts_by_design; |
| 9193 | zone_unlock(zone); |
| 9194 | } |
| 9195 | |
| 9196 | void |
| 9197 | zone_raise_reserve(union zone_or_view zov, uint16_t min_elements) |
| 9198 | { |
| 9199 | zone_t zone = zov.zov_zone; |
| 9200 | |
| 9201 | if (zone < zone_array || zone > &zone_array[MAX_ZONES]) { |
| 9202 | zone = zov.zov_view->zv_zone; |
| 9203 | } else { |
| 9204 | zone = zov.zov_zone; |
| 9205 | } |
| 9206 | |
| 9207 | os_atomic_max(&zone->z_elems_rsv, min_elements, relaxed); |
| 9208 | } |
| 9209 | |
| 9210 | /** |
| 9211 | * @function zone_create_find |
| 9212 | * |
| 9213 | * @abstract |
| 9214 | * Finds an unused zone for the given name and element size. |
| 9215 | * |
| 9216 | * @param name the zone name |
| 9217 | * @param size the element size (including redzones, ...) |
| 9218 | * @param flags the flags passed to @c zone_create* |
| 9219 | * @param zid_inout the desired zone ID or ZONE_ID_ANY |
| 9220 | * |
| 9221 | * @returns a zone to initialize further. |
| 9222 | */ |
| 9223 | static zone_t |
| 9224 | zone_create_find( |
| 9225 | const char *name, |
| 9226 | vm_size_t size, |
| 9227 | zone_create_flags_t flags, |
| 9228 | zone_id_t *zid_inout) |
| 9229 | { |
| 9230 | zone_id_t nzones, zid = *zid_inout; |
| 9231 | zone_t z; |
| 9232 | |
| 9233 | simple_lock(&all_zones_lock, &zone_locks_grp); |
| 9234 | |
| 9235 | nzones = (zone_id_t)os_atomic_load(&num_zones, relaxed); |
| 9236 | assert(num_zones_in_use <= nzones && nzones < MAX_ZONES); |
| 9237 | |
| 9238 | if (__improbable(nzones < ZONE_ID__FIRST_DYNAMIC)) { |
| 9239 | /* |
| 9240 | * The first time around, make sure the reserved zone IDs |
| 9241 | * have an initialized lock as zone_index_foreach() will |
| 9242 | * enumerate them. |
| 9243 | */ |
| 9244 | while (nzones < ZONE_ID__FIRST_DYNAMIC) { |
| 9245 | zone_init_defaults(zid: nzones++); |
| 9246 | } |
| 9247 | |
| 9248 | os_atomic_store(&num_zones, nzones, release); |
| 9249 | } |
| 9250 | |
| 9251 | if (zid != ZONE_ID_ANY) { |
| 9252 | if (zid >= ZONE_ID__FIRST_DYNAMIC) { |
| 9253 | panic("zone_create: invalid desired zone ID %d for %s" , |
| 9254 | zid, name); |
| 9255 | } |
| 9256 | if (flags & ZC_DESTRUCTIBLE) { |
| 9257 | panic("zone_create: ID %d (%s) must be permanent" , zid, name); |
| 9258 | } |
| 9259 | if (zone_array[zid].z_self) { |
| 9260 | panic("zone_create: creating zone ID %d (%s) twice" , zid, name); |
| 9261 | } |
| 9262 | z = &zone_array[zid]; |
| 9263 | } else { |
| 9264 | if (flags & ZC_DESTRUCTIBLE) { |
| 9265 | /* |
| 9266 | * If possible, find a previously zdestroy'ed zone in the |
| 9267 | * zone_array that we can reuse. |
| 9268 | */ |
| 9269 | for (int i = bitmap_first(map: zone_destroyed_bitmap, MAX_ZONES); |
| 9270 | i >= 0; i = bitmap_next(map: zone_destroyed_bitmap, prev: i)) { |
| 9271 | z = &zone_array[i]; |
| 9272 | |
| 9273 | /* |
| 9274 | * If the zone name and the element size are the |
| 9275 | * same, we can just reuse the old zone struct. |
| 9276 | */ |
| 9277 | if (strcmp(s1: z->z_name, s2: name) || |
| 9278 | zone_elem_outer_size(zone: z) != size) { |
| 9279 | continue; |
| 9280 | } |
| 9281 | bitmap_clear(map: zone_destroyed_bitmap, n: i); |
| 9282 | z->z_destroyed = false; |
| 9283 | z->z_self = z; |
| 9284 | zid = (zone_id_t)i; |
| 9285 | goto out; |
| 9286 | } |
| 9287 | } |
| 9288 | |
| 9289 | zid = nzones++; |
| 9290 | z = zone_init_defaults(zid); |
| 9291 | |
| 9292 | /* |
| 9293 | * The release barrier pairs with the acquire in |
| 9294 | * zone_index_foreach() and makes sure that enumeration loops |
| 9295 | * always see an initialized zone lock. |
| 9296 | */ |
| 9297 | os_atomic_store(&num_zones, nzones, release); |
| 9298 | } |
| 9299 | |
| 9300 | out: |
| 9301 | num_zones_in_use++; |
| 9302 | simple_unlock(&all_zones_lock); |
| 9303 | |
| 9304 | *zid_inout = zid; |
| 9305 | return z; |
| 9306 | } |
| 9307 | |
| 9308 | __abortlike |
| 9309 | static void |
| 9310 | zone_create_panic(const char *name, const char *f1, const char *f2) |
| 9311 | { |
| 9312 | panic("zone_create: creating zone %s: flag %s and %s are incompatible" , |
| 9313 | name, f1, f2); |
| 9314 | } |
| 9315 | #define zone_create_assert_not_both(name, flags, current_flag, forbidden_flag) \ |
| 9316 | if ((flags) & forbidden_flag) { \ |
| 9317 | zone_create_panic(name, #current_flag, #forbidden_flag); \ |
| 9318 | } |
| 9319 | |
| 9320 | /* |
| 9321 | * Adjusts the size of the element based on minimum size, alignment |
| 9322 | * and kasan redzones |
| 9323 | */ |
| 9324 | static vm_size_t |
| 9325 | zone_elem_adjust_size( |
| 9326 | const char *name __unused, |
| 9327 | vm_size_t elem_size, |
| 9328 | zone_create_flags_t flags __unused, |
| 9329 | uint16_t *redzone __unused) |
| 9330 | { |
| 9331 | vm_size_t size; |
| 9332 | |
| 9333 | /* |
| 9334 | * Adjust element size for minimum size and pointer alignment |
| 9335 | */ |
| 9336 | size = (elem_size + ZONE_ALIGN_SIZE - 1) & -ZONE_ALIGN_SIZE; |
| 9337 | if (size < ZONE_MIN_ELEM_SIZE) { |
| 9338 | size = ZONE_MIN_ELEM_SIZE; |
| 9339 | } |
| 9340 | |
| 9341 | #if KASAN_CLASSIC |
| 9342 | /* |
| 9343 | * Expand the zone allocation size to include the redzones. |
| 9344 | * |
| 9345 | * For page-multiple zones add a full guard page because they |
| 9346 | * likely require alignment. |
| 9347 | */ |
| 9348 | uint16_t redzone_tmp; |
| 9349 | if (flags & (ZC_KASAN_NOREDZONE | ZC_PERCPU | ZC_OBJ_CACHE)) { |
| 9350 | redzone_tmp = 0; |
| 9351 | } else if ((size & PAGE_MASK) == 0) { |
| 9352 | if (size != PAGE_SIZE && (flags & ZC_ALIGNMENT_REQUIRED)) { |
panic("zone_create: zone %s can't provide more than PAGE_SIZE "
"alignment", name);
| 9355 | } |
| 9356 | redzone_tmp = PAGE_SIZE; |
| 9357 | } else if (flags & ZC_ALIGNMENT_REQUIRED) { |
| 9358 | redzone_tmp = 0; |
| 9359 | } else { |
| 9360 | redzone_tmp = KASAN_GUARD_SIZE; |
| 9361 | } |
| 9362 | size += redzone_tmp; |
| 9363 | if (redzone) { |
| 9364 | *redzone = redzone_tmp; |
| 9365 | } |
| 9366 | #endif |
| 9367 | return size; |
| 9368 | } |
| 9369 | |
| 9370 | /* |
| 9371 | * Returns the allocation chunk size that has least framentation |
| 9372 | */ |
| 9373 | static vm_size_t |
| 9374 | zone_get_min_alloc_granule( |
| 9375 | vm_size_t elem_size, |
| 9376 | zone_create_flags_t flags) |
| 9377 | { |
| 9378 | vm_size_t alloc_granule = PAGE_SIZE; |
| 9379 | if (flags & ZC_PERCPU) { |
| 9380 | alloc_granule = PAGE_SIZE * zpercpu_count(); |
| 9381 | if (PAGE_SIZE % elem_size > 256) { |
| 9382 | panic("zone_create: per-cpu zone has too much fragmentation" ); |
| 9383 | } |
| 9384 | } else if (flags & ZC_READONLY) { |
| 9385 | alloc_granule = PAGE_SIZE; |
| 9386 | } else if ((elem_size & PAGE_MASK) == 0) { |
| 9387 | /* zero fragmentation by definition */ |
| 9388 | alloc_granule = elem_size; |
| 9389 | } else if (alloc_granule % elem_size == 0) { |
| 9390 | /* zero fragmentation by definition */ |
| 9391 | } else { |
| 9392 | vm_size_t frag = (alloc_granule % elem_size) * 100 / alloc_granule; |
| 9393 | vm_size_t alloc_tmp = PAGE_SIZE; |
| 9394 | vm_size_t max_chunk_size = ZONE_MAX_ALLOC_SIZE; |
| 9395 | |
| 9396 | #if __arm64__ |
| 9397 | /* |
| 9398 | * Increase chunk size to 48K for sizes larger than 4K on 16k |
| 9399 | * machines, so as to reduce internal fragementation for kalloc |
| 9400 | * zones with sizes 12K and 24K. |
| 9401 | */ |
| 9402 | if (elem_size > 4 * 1024 && PAGE_SIZE == 16 * 1024) { |
| 9403 | max_chunk_size = 48 * 1024; |
| 9404 | } |
| 9405 | #endif |
| 9406 | while ((alloc_tmp += PAGE_SIZE) <= max_chunk_size) { |
| 9407 | vm_size_t frag_tmp = (alloc_tmp % elem_size) * 100 / alloc_tmp; |
| 9408 | if (frag_tmp < frag) { |
| 9409 | frag = frag_tmp; |
| 9410 | alloc_granule = alloc_tmp; |
| 9411 | } |
| 9412 | } |
| 9413 | } |
| 9414 | return alloc_granule; |
| 9415 | } |
| 9416 | |
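/*
 * Worked example with assumed sizes: a 12K element on a 16K-page arm64
 * device wastes 4K per page (25% fragmentation) with a single-page
 * chunk; the search in zone_get_min_alloc_granule() above may grow the
 * chunk up to 48K there, and 48K % 12K == 0, so the chunk settles at
 * 48K with zero fragmentation.
 */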
| 9417 | vm_size_t |
| 9418 | zone_get_early_alloc_size( |
| 9419 | const char *name __unused, |
| 9420 | vm_size_t elem_size, |
| 9421 | zone_create_flags_t flags, |
| 9422 | vm_size_t min_elems) |
| 9423 | { |
| 9424 | vm_size_t adjusted_size, alloc_granule, chunk_elems; |
| 9425 | |
| 9426 | adjusted_size = zone_elem_adjust_size(name, elem_size, flags, NULL); |
| 9427 | alloc_granule = zone_get_min_alloc_granule(elem_size: adjusted_size, flags); |
| 9428 | chunk_elems = alloc_granule / adjusted_size; |
| 9429 | |
| 9430 | return ((min_elems + chunk_elems - 1) / chunk_elems) * alloc_granule; |
| 9431 | } |
| 9432 | |
| 9433 | zone_t |
| 9434 | zone_create_ext( |
| 9435 | const char *name, |
| 9436 | vm_size_t size, |
| 9437 | zone_create_flags_t flags, |
| 9438 | zone_id_t zid, |
void (^extra_setup)(zone_t))
| 9440 | { |
| 9441 | zone_security_flags_t *zsflags; |
| 9442 | uint16_t redzone; |
| 9443 | zone_t z; |
| 9444 | |
| 9445 | if (size > ZONE_MAX_ALLOC_SIZE) { |
| 9446 | panic("zone_create: element size too large: %zd" , (size_t)size); |
| 9447 | } |
| 9448 | |
| 9449 | if (size < 2 * sizeof(vm_size_t)) { |
| 9450 | /* Elements are too small for kasan. */ |
| 9451 | flags |= ZC_KASAN_NOQUARANTINE | ZC_KASAN_NOREDZONE; |
| 9452 | } |
| 9453 | |
| 9454 | size = zone_elem_adjust_size(name, elem_size: size, flags, redzone: &redzone); |
| 9455 | |
| 9456 | /* |
| 9457 | * Allocate the zone slot, return early if we found an older match. |
| 9458 | */ |
| 9459 | z = zone_create_find(name, size, flags, zid_inout: &zid); |
| 9460 | if (__improbable(z->z_self)) { |
| 9461 | /* We found a zone to reuse */ |
| 9462 | return z; |
| 9463 | } |
| 9464 | zsflags = &zone_security_array[zid]; |
| 9465 | |
| 9466 | /* |
| 9467 | * Initialize the zone properly. |
| 9468 | */ |
| 9469 | |
| 9470 | /* |
| 9471 | * If the kernel is post lockdown, copy the zone name passed in. |
| 9472 | * Else simply maintain a pointer to the name string as it can only |
| 9473 | * be a core XNU zone (no unloadable kext exists before lockdown). |
| 9474 | */ |
| 9475 | if (startup_phase >= STARTUP_SUB_LOCKDOWN) { |
| 9476 | size_t nsz = MIN(strlen(name) + 1, MACH_ZONE_NAME_MAX_LEN); |
| 9477 | char *buf = zalloc_permanent(nsz, ZALIGN_NONE); |
| 9478 | strlcpy(dst: buf, src: name, n: nsz); |
| 9479 | z->z_name = buf; |
| 9480 | } else { |
| 9481 | z->z_name = name; |
| 9482 | } |
| 9483 | if (__probable(zone_array[ZONE_ID_PERCPU_PERMANENT].z_self)) { |
| 9484 | z->z_stats = zalloc_percpu_permanent_type(struct zone_stats); |
| 9485 | } else { |
| 9486 | /* |
| 9487 | * zone_init() hasn't run yet, use the storage provided by |
| 9488 | * zone_stats_startup(), and zone_init() will replace it |
| 9489 | * with the final value once the PERCPU zone exists. |
| 9490 | */ |
| 9491 | z->z_stats = __zpcpu_mangle_for_boot(&zone_stats_startup[zone_index(z)]); |
| 9492 | } |
| 9493 | |
| 9494 | if (flags & ZC_OBJ_CACHE) { |
| 9495 | zone_create_assert_not_both(name, flags, ZC_OBJ_CACHE, ZC_NOCACHING); |
| 9496 | zone_create_assert_not_both(name, flags, ZC_OBJ_CACHE, ZC_PERCPU); |
| 9497 | zone_create_assert_not_both(name, flags, ZC_OBJ_CACHE, ZC_NOGC); |
| 9498 | zone_create_assert_not_both(name, flags, ZC_OBJ_CACHE, ZC_DESTRUCTIBLE); |
| 9499 | |
| 9500 | z->z_elem_size = (uint16_t)size; |
| 9501 | z->z_chunk_pages = 0; |
| 9502 | z->z_quo_magic = 0; |
| 9503 | z->z_align_magic = 0; |
| 9504 | z->z_chunk_elems = 0; |
| 9505 | z->z_elem_offs = 0; |
| 9506 | z->no_callout = true; |
| 9507 | zsflags->z_lifo = true; |
| 9508 | } else { |
| 9509 | vm_size_t alloc = zone_get_min_alloc_granule(elem_size: size, flags); |
| 9510 | |
| 9511 | z->z_elem_size = (uint16_t)(size - redzone); |
| 9512 | z->z_chunk_pages = (uint16_t)atop(alloc); |
| 9513 | z->z_quo_magic = Z_MAGIC_QUO(size); |
| 9514 | z->z_align_magic = Z_MAGIC_ALIGNED(size); |
| 9515 | if (flags & ZC_PERCPU) { |
| 9516 | z->z_chunk_elems = (uint16_t)(PAGE_SIZE / size); |
| 9517 | z->z_elem_offs = (uint16_t)(PAGE_SIZE % size) + redzone; |
| 9518 | } else { |
| 9519 | z->z_chunk_elems = (uint16_t)(alloc / size); |
| 9520 | z->z_elem_offs = (uint16_t)(alloc % size) + redzone; |
| 9521 | } |
| 9522 | } |
| 9523 | |
| 9524 | /* |
| 9525 | * Handle KPI flags |
| 9526 | */ |
| 9527 | |
| 9528 | /* ZC_CACHING applied after all configuration is done */ |
| 9529 | if (flags & ZC_NOCACHING) { |
| 9530 | z->z_nocaching = true; |
| 9531 | } |
| 9532 | |
| 9533 | if (flags & ZC_READONLY) { |
| 9534 | zone_create_assert_not_both(name, flags, ZC_READONLY, ZC_VM); |
| 9535 | zone_create_assert_not_both(name, flags, ZC_READONLY, ZC_DATA); |
| 9536 | assert(zid <= ZONE_ID__LAST_RO); |
| 9537 | #if ZSECURITY_CONFIG(READ_ONLY) |
| 9538 | zsflags->z_submap_idx = Z_SUBMAP_IDX_READ_ONLY; |
| 9539 | #endif |
| 9540 | zone_ro_size_params[zid].z_elem_size = z->z_elem_size; |
| 9541 | zone_ro_size_params[zid].z_align_magic = z->z_align_magic; |
| 9542 | assert(size <= PAGE_SIZE); |
| 9543 | if ((PAGE_SIZE % size) * 10 >= PAGE_SIZE) { |
| 9544 | panic("Fragmentation greater than 10%% with elem size %d zone %s%s" , |
| 9545 | (uint32_t)size, zone_heap_name(z), z->z_name); |
| 9546 | } |
| 9547 | } |
| 9548 | |
| 9549 | if (flags & ZC_PERCPU) { |
| 9550 | zone_create_assert_not_both(name, flags, ZC_PERCPU, ZC_READONLY); |
| 9551 | zone_create_assert_not_both(name, flags, ZC_PERCPU, ZC_PGZ_USE_GUARDS); |
| 9552 | z->z_percpu = true; |
| 9553 | } |
| 9554 | if (flags & ZC_NOGC) { |
| 9555 | z->collectable = false; |
| 9556 | } |
| 9557 | /* |
| 9558 | * Handle ZC_NOENCRYPT from xnu only |
| 9559 | */ |
| 9560 | if (startup_phase < STARTUP_SUB_LOCKDOWN && flags & ZC_NOENCRYPT) { |
| 9561 | zsflags->z_noencrypt = true; |
| 9562 | } |
| 9563 | if (flags & ZC_NOCALLOUT) { |
| 9564 | z->no_callout = true; |
| 9565 | } |
| 9566 | if (flags & ZC_DESTRUCTIBLE) { |
| 9567 | zone_create_assert_not_both(name, flags, ZC_DESTRUCTIBLE, ZC_READONLY); |
| 9568 | z->z_destructible = true; |
| 9569 | } |
| 9570 | /* |
| 9571 | * Handle Internal flags |
| 9572 | */ |
| 9573 | #if ZSECURITY_CONFIG(SAD_FENG_SHUI) |
| 9574 | if (flags & ZC_PGZ_USE_GUARDS) { |
| 9575 | /* |
| 9576 | * Try to turn on guard pages only for zones |
| 9577 | * with a chance of OOB. |
| 9578 | */ |
| 9579 | if (startup_phase < STARTUP_SUB_LOCKDOWN) { |
| 9580 | zsflags->z_pgz_use_guards = true; |
| 9581 | } |
| 9582 | z->z_pgz_use_guards = true; |
| 9583 | } |
| 9584 | #endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) */ |
| 9585 | if (!(flags & ZC_NOTBITAG)) { |
| 9586 | z->z_tbi_tag = true; |
| 9587 | } |
| 9588 | if (flags & ZC_KALLOC_TYPE) { |
| 9589 | zsflags->z_kalloc_type = true; |
| 9590 | } |
| 9591 | if (flags & ZC_VM) { |
| 9592 | zone_create_assert_not_both(name, flags, ZC_VM, ZC_DATA); |
| 9593 | zsflags->z_submap_idx = Z_SUBMAP_IDX_VM; |
| 9594 | } |
| 9595 | if (flags & ZC_DATA) { |
| 9596 | zsflags->z_kheap_id = KHEAP_ID_DATA_BUFFERS; |
| 9597 | } |
| 9598 | #if KASAN_CLASSIC |
| 9599 | if (redzone && !(flags & ZC_KASAN_NOQUARANTINE)) { |
| 9600 | z->z_kasan_quarantine = true; |
| 9601 | } |
| 9602 | z->z_kasan_redzone = redzone; |
| 9603 | #endif /* KASAN_CLASSIC */ |
| 9604 | #if KASAN_FAKESTACK |
| 9605 | if (strncmp(name, "fakestack." , sizeof("fakestack." ) - 1) == 0) { |
| 9606 | z->z_kasan_fakestacks = true; |
| 9607 | } |
| 9608 | #endif /* KASAN_FAKESTACK */ |
| 9609 | |
| 9610 | /* |
| 9611 | * Then if there's extra tuning, do it |
| 9612 | */ |
| 9613 | if (extra_setup) { |
| 9614 | extra_setup(z); |
| 9615 | } |
| 9616 | |
| 9617 | /* |
| 9618 | * Configure debugging features |
| 9619 | */ |
| 9620 | #if CONFIG_PROB_GZALLOC |
| 9621 | if ((flags & (ZC_READONLY | ZC_PERCPU | ZC_OBJ_CACHE | ZC_NOPGZ)) == 0) { |
| 9622 | pgz_zone_init(z); |
| 9623 | } |
| 9624 | #endif |
| 9625 | if (zc_magazine_zone) { /* proxy for "has zone_init run" */ |
| 9626 | #if ZALLOC_ENABLE_LOGGING |
| 9627 | /* |
| 9628 | * Check for and set up zone leak detection |
| 9629 | * if requested via boot-args. |
| 9630 | */ |
| 9631 | zone_setup_logging(z); |
| 9632 | #endif /* ZALLOC_ENABLE_LOGGING */ |
| 9633 | #if KASAN_TBI |
| 9634 | zone_setup_kasan_logging(z); |
| 9635 | #endif /* KASAN_TBI */ |
| 9636 | } |
| 9637 | |
| 9638 | #if VM_TAG_SIZECLASSES |
| 9639 | if ((zsflags->z_kheap_id || zsflags->z_kalloc_type) && zone_tagging_on) { |
| 9640 | static uint16_t sizeclass_idx; |
| 9641 | |
| 9642 | assert(startup_phase < STARTUP_SUB_LOCKDOWN); |
| 9643 | z->z_uses_tags = true; |
| 9644 | if (zsflags->z_kheap_id == KHEAP_ID_DATA_BUFFERS) { |
| 9645 | zone_tags_sizeclasses[sizeclass_idx] = (uint16_t)size; |
| 9646 | z->z_tags_sizeclass = sizeclass_idx++; |
| 9647 | } else { |
| 9648 | uint16_t i = 0; |
| 9649 | for (; i < sizeclass_idx; i++) { |
| 9650 | if (size == zone_tags_sizeclasses[i]) { |
| 9651 | z->z_tags_sizeclass = i; |
| 9652 | break; |
| 9653 | } |
| 9654 | } |
| 9655 | |
| 9656 | /* |
| 9657 | * Size class wasn't found, add it to zone_tags_sizeclasses |
| 9658 | */ |
| 9659 | if (i == sizeclass_idx) { |
| 9660 | assert(i < VM_TAG_SIZECLASSES); |
| 9661 | zone_tags_sizeclasses[i] = (uint16_t)size; |
| 9662 | z->z_tags_sizeclass = sizeclass_idx++; |
| 9663 | } |
| 9664 | } |
| 9665 | assert(z->z_tags_sizeclass < VM_TAG_SIZECLASSES); |
| 9666 | } |
| 9667 | #endif |
| 9668 | |
| 9669 | /* |
| 9670 | * Finally, fixup properties based on security policies, boot-args, ... |
| 9671 | */ |
| 9672 | if (zsflags->z_kheap_id == KHEAP_ID_DATA_BUFFERS) { |
| 9673 | /* |
| 9674 | * We use LIFO in the data map, because workloads like network |
| 9675 | * usage or similar tend to rotate through allocations very |
		 * quickly with sometimes exploding working-sets, and using
		 * a FIFO policy might cause massive TLB thrashing with rather
| 9678 | * dramatic performance impacts. |
| 9679 | */ |
| 9680 | zsflags->z_submap_idx = Z_SUBMAP_IDX_DATA; |
| 9681 | zsflags->z_lifo = true; |
| 9682 | } |
| 9683 | |
| 9684 | if ((flags & (ZC_CACHING | ZC_OBJ_CACHE)) && !z->z_nocaching) { |
| 9685 | /* |
| 9686 | * No zone made before zone_init() can have ZC_CACHING set. |
| 9687 | */ |
| 9688 | assert(zc_magazine_zone); |
		zone_enable_caching(z);
| 9690 | } |
| 9691 | |
	zone_lock(z);
	z->z_self = z;
	zone_unlock(z);
| 9695 | |
| 9696 | return z; |
| 9697 | } |
| 9698 | |
| 9699 | void |
| 9700 | zone_set_sig_eq(zone_t zone, zone_id_t sig_eq) |
| 9701 | { |
	zone_security_array[zone_index(zone)].z_sig_eq = sig_eq;
| 9703 | } |
| 9704 | |
| 9705 | zone_id_t |
| 9706 | zone_get_sig_eq(zone_t zone) |
| 9707 | { |
	return zone_security_array[zone_index(zone)].z_sig_eq;
| 9709 | } |
| 9710 | |
| 9711 | void |
| 9712 | zone_enable_smr(zone_t zone, struct smr *smr, zone_smr_free_cb_t free_cb) |
| 9713 | { |
| 9714 | /* moving to SMR must be done before the zone has ever been used */ |
| 9715 | assert(zone->z_va_cur == 0 && !zone->z_smr && !zone->z_nocaching); |
| 9716 | assert(!zone_security_array[zone_index(zone)].z_lifo); |
| 9717 | assert((smr->smr_flags & SMR_SLEEPABLE) == 0); |
| 9718 | |
| 9719 | if (!zone->z_pcpu_cache) { |
| 9720 | zone_enable_caching(zone); |
| 9721 | } |
| 9722 | |
| 9723 | zone_lock(zone); |
| 9724 | |
| 9725 | zpercpu_foreach(it, zone->z_pcpu_cache) { |
| 9726 | it->zc_smr = smr; |
| 9727 | it->zc_free = free_cb; |
| 9728 | } |
| 9729 | zone->z_smr = true; |
| 9730 | |
| 9731 | zone_unlock(zone); |
| 9732 | } |
| 9733 | |
| 9734 | __startup_func |
| 9735 | void |
| 9736 | zone_create_startup(struct zone_create_startup_spec *spec) |
| 9737 | { |
| 9738 | zone_t z; |
| 9739 | |
	z = zone_create_ext(spec->z_name, spec->z_size,
	    spec->z_flags, spec->z_zid, spec->z_setup);
| 9742 | if (spec->z_var) { |
| 9743 | *spec->z_var = z; |
| 9744 | } |
| 9745 | } |
| 9746 | |
| 9747 | /* |
 * The first 4 fields of a zone_view and a zone alias, so that the zone_or_view_t
 * union works. Trust but verify.
| 9750 | */ |
| 9751 | #define zalloc_check_zov_alias(f1, f2) \ |
| 9752 | static_assert(offsetof(struct zone, f1) == offsetof(struct zone_view, f2)) |
| 9753 | zalloc_check_zov_alias(z_self, zv_zone); |
| 9754 | zalloc_check_zov_alias(z_stats, zv_stats); |
| 9755 | zalloc_check_zov_alias(z_name, zv_name); |
| 9756 | zalloc_check_zov_alias(z_views, zv_next); |
| 9757 | #undef zalloc_check_zov_alias |
| 9758 | |
| 9759 | __startup_func |
| 9760 | void |
| 9761 | zone_view_startup_init(struct zone_view_startup_spec *spec) |
| 9762 | { |
| 9763 | struct kalloc_heap *heap = NULL; |
| 9764 | zone_view_t zv = spec->zv_view; |
| 9765 | zone_t z; |
| 9766 | zone_security_flags_t zsflags; |
| 9767 | |
| 9768 | switch (spec->zv_heapid) { |
| 9769 | case KHEAP_ID_DATA_BUFFERS: |
| 9770 | heap = KHEAP_DATA_BUFFERS; |
| 9771 | break; |
| 9772 | default: |
| 9773 | heap = NULL; |
| 9774 | } |
| 9775 | |
| 9776 | if (heap) { |
		z = kalloc_zone_for_size(heap->kh_zstart, spec->zv_size);
| 9778 | } else { |
| 9779 | z = *spec->zv_zone; |
| 9780 | assert(spec->zv_size <= zone_elem_inner_size(z)); |
| 9781 | } |
| 9782 | |
| 9783 | assert(z); |
| 9784 | |
| 9785 | zv->zv_zone = z; |
| 9786 | zv->zv_stats = zalloc_percpu_permanent_type(struct zone_stats); |
| 9787 | zv->zv_next = z->z_views; |
| 9788 | zsflags = zone_security_config(z); |
| 9789 | if (z->z_views == NULL && zsflags.z_kheap_id == KHEAP_ID_NONE) { |
| 9790 | /* |
| 9791 | * count the raw view for zones not in a heap, |
| 9792 | * kalloc_heap_init() already counts it for its members. |
| 9793 | */ |
| 9794 | zone_view_count += 2; |
| 9795 | } else { |
| 9796 | zone_view_count += 1; |
| 9797 | } |
| 9798 | z->z_views = zv; |
| 9799 | } |
| 9800 | |
| 9801 | zone_t |
| 9802 | zone_create( |
| 9803 | const char *name, |
| 9804 | vm_size_t size, |
| 9805 | zone_create_flags_t flags) |
| 9806 | { |
| 9807 | return zone_create_ext(name, size, flags, ZONE_ID_ANY, NULL); |
| 9808 | } |
| 9809 | |
| 9810 | static_assert(ZONE_ID__LAST_RO_EXT - ZONE_ID__FIRST_RO_EXT == ZC_RO_ID__LAST); |
| 9811 | |
| 9812 | zone_id_t |
| 9813 | zone_create_ro( |
| 9814 | const char *name, |
| 9815 | vm_size_t size, |
| 9816 | zone_create_flags_t flags, |
| 9817 | zone_create_ro_id_t zc_ro_id) |
| 9818 | { |
| 9819 | assert(zc_ro_id <= ZC_RO_ID__LAST); |
| 9820 | zone_id_t reserved_zid = ZONE_ID__FIRST_RO_EXT + zc_ro_id; |
	(void)zone_create_ext(name, size, ZC_READONLY | flags, reserved_zid, NULL);
| 9822 | return reserved_zid; |
| 9823 | } |
| 9824 | |
| 9825 | zone_t |
| 9826 | zinit( |
| 9827 | vm_size_t size, /* the size of an element */ |
| 9828 | vm_size_t max __unused, /* maximum memory to use */ |
| 9829 | vm_size_t alloc __unused, /* allocation size */ |
| 9830 | const char *name) /* a name for the zone */ |
| 9831 | { |
	return zone_create(name, size, ZC_DESTRUCTIBLE);
| 9833 | } |
| 9834 | |
| 9835 | void |
| 9836 | zdestroy(zone_t z) |
| 9837 | { |
| 9838 | unsigned int zindex = zone_index(z); |
| 9839 | zone_security_flags_t zsflags = zone_security_array[zindex]; |
| 9840 | |
| 9841 | current_thread()->options |= TH_OPT_ZONE_PRIV; |
	lck_mtx_lock(&zone_gc_lock);

	zone_reclaim(z, ZONE_RECLAIM_DESTROY);

	lck_mtx_unlock(&zone_gc_lock);
	current_thread()->options &= ~TH_OPT_ZONE_PRIV;

	zone_lock(z);
| 9850 | |
| 9851 | if (!zone_submap_is_sequestered(zsflags)) { |
		while (!zone_pva_is_null(z->z_pageq_va)) {
			struct zone_page_metadata *meta;

			zone_counter_sub(z, z_va_cur, z->z_percpu ? 1 : z->z_chunk_pages);
			meta = zone_meta_queue_pop(z, &z->z_pageq_va);
			assert(meta->zm_chunk_len <= ZM_CHUNK_LEN_MAX);
			bzero(meta, sizeof(*meta) * z->z_chunk_pages);
			zone_unlock(z);
			kmem_free(zone_submap(zsflags), zone_meta_to_addr(meta),
			    ptoa(z->z_chunk_pages));
			zone_lock(z);
| 9863 | } |
| 9864 | } |
| 9865 | |
| 9866 | #if !KASAN_CLASSIC |
| 9867 | /* Assert that all counts are zero */ |
	if (z->z_elems_avail || z->z_elems_free || zone_size_wired(z) ||
| 9869 | (z->z_va_cur && !zone_submap_is_sequestered(zsflags))) { |
| 9870 | panic("zdestroy: Zone %s%s isn't empty at zdestroy() time" , |
| 9871 | zone_heap_name(z), z->z_name); |
| 9872 | } |
| 9873 | |
| 9874 | /* consistency check: make sure everything is indeed empty */ |
| 9875 | assert(zone_pva_is_null(z->z_pageq_empty)); |
| 9876 | assert(zone_pva_is_null(z->z_pageq_partial)); |
| 9877 | assert(zone_pva_is_null(z->z_pageq_full)); |
| 9878 | if (!zone_submap_is_sequestered(zsflags)) { |
| 9879 | assert(zone_pva_is_null(z->z_pageq_va)); |
| 9880 | } |
| 9881 | #endif |
| 9882 | |
	zone_unlock(z);
| 9884 | |
| 9885 | simple_lock(&all_zones_lock, &zone_locks_grp); |
| 9886 | |
| 9887 | assert(!bitmap_test(zone_destroyed_bitmap, zindex)); |
| 9888 | /* Mark the zone as empty in the bitmap */ |
	bitmap_set(zone_destroyed_bitmap, zindex);
| 9890 | num_zones_in_use--; |
| 9891 | assert(num_zones_in_use > 0); |
| 9892 | |
| 9893 | simple_unlock(&all_zones_lock); |
| 9894 | } |
| 9895 | |
| 9896 | #endif /* !ZALLOC_TEST */ |
| 9897 | #pragma mark zalloc module init |
| 9898 | #if !ZALLOC_TEST |
| 9899 | |
| 9900 | /* |
| 9901 | * Initialize the "zone of zones" which uses fixed memory allocated |
| 9902 | * earlier in memory initialization. zone_bootstrap is called |
| 9903 | * before zone_init. |
| 9904 | */ |
| 9905 | __startup_func |
| 9906 | void |
| 9907 | zone_bootstrap(void) |
| 9908 | { |
| 9909 | #if DEBUG || DEVELOPMENT |
| 9910 | #if __x86_64__ |
| 9911 | if (PE_parse_boot_argn("kernPOST" , NULL, 0)) { |
| 9912 | /* |
		 * rdar://79781535 Disable early gaps while running kernPOST on Intel;
		 * the fp faulting code gets triggered and deadlocks.
| 9915 | */ |
| 9916 | zone_caching_disabled = 1; |
| 9917 | } |
| 9918 | #endif /* __x86_64__ */ |
| 9919 | #endif /* DEBUG || DEVELOPMENT */ |
| 9920 | |
| 9921 | /* Validate struct zone_packed_virtual_address expectations */ |
| 9922 | static_assert((intptr_t)VM_MIN_KERNEL_ADDRESS < 0, "the top bit must be 1" ); |
| 9923 | if (VM_KERNEL_POINTER_SIGNIFICANT_BITS - PAGE_SHIFT > 31) { |
| 9924 | panic("zone_pva_t can't pack a kernel page address in 31 bits" ); |
| 9925 | } |
| 9926 | |
| 9927 | zpercpu_early_count = ml_early_cpu_max_number() + 1; |
	if (!PE_parse_boot_argn("zc_mag_size", NULL, 0)) {
| 9929 | /* |
| 9930 | * Scale zc_mag_size() per machine. |
| 9931 | * |
| 9932 | * - wide machines get 128B magazines to avoid all false sharing |
| 9933 | * - smaller machines but with enough RAM get a bit bigger |
| 9934 | * buckets (empirically affects networking performance) |
| 9935 | */ |
| 9936 | if (zpercpu_early_count >= 10) { |
| 9937 | _zc_mag_size = 14; |
| 9938 | } else if ((sane_size >> 30) >= 4) { |
| 9939 | _zc_mag_size = 10; |
| 9940 | } |
| 9941 | } |
| 9942 | |
| 9943 | /* |
| 9944 | * Initialize random used to scramble early allocations |
| 9945 | */ |
| 9946 | zpercpu_foreach_cpu(cpu) { |
		random_bool_init(&zone_bool_gen[cpu].zbg_bg);
| 9948 | } |
| 9949 | |
| 9950 | #if CONFIG_PROB_GZALLOC |
| 9951 | /* |
| 9952 | * Set pgz_sample_counter on the boot CPU so that we do not sample |
| 9953 | * any allocation until PGZ has been properly setup (in pgz_init()). |
| 9954 | */ |
| 9955 | *PERCPU_GET_MASTER(pgz_sample_counter) = INT32_MAX; |
| 9956 | #endif /* CONFIG_PROB_GZALLOC */ |
| 9957 | |
| 9958 | #if ZSECURITY_CONFIG(SAD_FENG_SHUI) |
| 9959 | /* |
| 9960 | * Randomly assign zones to one of the 4 general submaps, |
	 * and pick whether they allocate from the beginning
| 9962 | * or the end of it. |
| 9963 | * |
| 9964 | * A lot of OOB exploitation relies on precise interleaving |
| 9965 | * of specific types in the heap. |
| 9966 | * |
| 9967 | * Woops, you can't guarantee that anymore. |
| 9968 | */ |
| 9969 | for (zone_id_t i = 1; i < MAX_ZONES; i++) { |
		uint32_t r = zalloc_random_uniform32(0,
| 9971 | ZSECURITY_CONFIG_GENERAL_SUBMAPS * 2); |
| 9972 | |
| 9973 | zone_security_array[i].z_submap_from_end = (r & 1); |
| 9974 | zone_security_array[i].z_submap_idx += (r >> 1); |
| 9975 | } |
| 9976 | #endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) */ |
| 9977 | |
	thread_call_setup_with_options(&zone_expand_callout,
	    zone_expand_async, NULL, THREAD_CALL_PRIORITY_HIGH,
	    THREAD_CALL_OPTIONS_ONCE);

	thread_call_setup_with_options(&zone_trim_callout,
	    zone_trim_async, NULL, THREAD_CALL_PRIORITY_USER,
	    THREAD_CALL_OPTIONS_ONCE);
| 9985 | } |
| 9986 | |
| 9987 | #define ZONE_GUARD_SIZE (64UL << 10) |
| 9988 | |
| 9989 | __startup_func |
| 9990 | static void |
| 9991 | zone_tunables_fixup(void) |
| 9992 | { |
| 9993 | int wdt = 0; |
| 9994 | |
| 9995 | #if CONFIG_PROB_GZALLOC && (DEVELOPMENT || DEBUG) |
| 9996 | if (!PE_parse_boot_argn("pgz" , NULL, 0) && |
| 9997 | PE_parse_boot_argn("pgz1" , NULL, 0)) { |
| 9998 | /* |
| 9999 | * if pgz1= was used, but pgz= was not, |
| 10000 | * then the more specific pgz1 takes precedence. |
| 10001 | */ |
| 10002 | pgz_all = false; |
| 10003 | } |
| 10004 | #endif |
| 10005 | |
| 10006 | if (zone_map_jetsam_limit == 0 || zone_map_jetsam_limit > 100) { |
| 10007 | zone_map_jetsam_limit = ZONE_MAP_JETSAM_LIMIT_DEFAULT; |
| 10008 | } |
	if (PE_parse_boot_argn("wdt", &wdt, sizeof(wdt)) && wdt == -1 &&
	    !PE_parse_boot_argn("zet", NULL, 0)) {
| 10011 | zone_exhausted_timeout = -1; |
| 10012 | } |
| 10013 | } |
| 10014 | STARTUP(TUNABLES, STARTUP_RANK_MIDDLE, zone_tunables_fixup); |
| 10015 | |
| 10016 | __startup_func |
| 10017 | static void |
| 10018 | zone_submap_init( |
| 10019 | mach_vm_offset_t *submap_min, |
| 10020 | zone_submap_idx_t idx, |
| 10021 | uint64_t zone_sub_map_numer, |
| 10022 | uint64_t *remaining_denom, |
| 10023 | vm_offset_t *remaining_size) |
| 10024 | { |
| 10025 | vm_map_create_options_t vmco; |
| 10026 | vm_map_address_t addr; |
| 10027 | vm_offset_t submap_start, submap_end; |
| 10028 | vm_size_t submap_size; |
| 10029 | vm_map_t submap; |
| 10030 | vm_prot_t prot = VM_PROT_DEFAULT; |
| 10031 | vm_prot_t prot_max = VM_PROT_ALL; |
| 10032 | kern_return_t kr; |
| 10033 | |
| 10034 | submap_size = trunc_page(zone_sub_map_numer * *remaining_size / |
| 10035 | *remaining_denom); |
| 10036 | submap_start = *submap_min; |
| 10037 | |
| 10038 | if (idx == Z_SUBMAP_IDX_READ_ONLY) { |
		vm_offset_t submap_padding = pmap_ro_zone_align(submap_start) - submap_start;
		submap_start += submap_padding;
		submap_size = pmap_ro_zone_align(submap_size);
| 10042 | assert(*remaining_size >= (submap_padding + submap_size)); |
| 10043 | *remaining_size -= submap_padding; |
| 10044 | *submap_min = submap_start; |
| 10045 | } |
| 10046 | |
| 10047 | submap_end = submap_start + submap_size; |
| 10048 | if (idx == Z_SUBMAP_IDX_VM) { |
		vm_packing_verify_range("vm_compressor",
		    submap_start, submap_end, VM_PACKING_PARAMS(C_SLOT_PACKED_PTR));
		vm_packing_verify_range("vm_page",
		    submap_start, submap_end, VM_PACKING_PARAMS(VM_PAGE_PACKED_PTR));
| 10053 | } |
| 10054 | |
| 10055 | vmco = VM_MAP_CREATE_NEVER_FAULTS; |
| 10056 | if (!zone_submap_is_sequestered(idx)) { |
| 10057 | vmco |= VM_MAP_CREATE_DISABLE_HOLELIST; |
| 10058 | } |
| 10059 | |
	vm_map_will_allocate_early_map(&zone_submaps[idx]);
	submap = kmem_suballoc(kernel_map, submap_min, submap_size, vmco,
	    VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE, KMS_PERMANENT | KMS_NOFAIL,
	    VM_KERN_MEMORY_ZONE).kmr_submap;
| 10064 | |
| 10065 | if (idx == Z_SUBMAP_IDX_READ_ONLY) { |
| 10066 | zone_info.zi_ro_range.min_address = submap_start; |
| 10067 | zone_info.zi_ro_range.max_address = submap_end; |
| 10068 | prot_max = prot = VM_PROT_NONE; |
| 10069 | } |
| 10070 | |
| 10071 | addr = submap_start; |
| 10072 | vm_object_t kobject = kernel_object_default; |
	kr = vm_map_enter(submap, &addr, ZONE_GUARD_SIZE / 2, 0,
	    VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(.vm_tag = VM_KERN_MEMORY_ZONE),
	    kobject, addr, FALSE, prot, prot_max, VM_INHERIT_NONE);
| 10076 | if (kr != KERN_SUCCESS) { |
| 10077 | panic("ksubmap[%s]: failed to make first entry (%d)" , |
| 10078 | zone_submaps_names[idx], kr); |
| 10079 | } |
| 10080 | |
| 10081 | addr = submap_end - ZONE_GUARD_SIZE / 2; |
	kr = vm_map_enter(submap, &addr, ZONE_GUARD_SIZE / 2, 0,
	    VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(.vm_tag = VM_KERN_MEMORY_ZONE),
	    kobject, addr, FALSE, prot, prot_max, VM_INHERIT_NONE);
| 10085 | if (kr != KERN_SUCCESS) { |
| 10086 | panic("ksubmap[%s]: failed to make last entry (%d)" , |
| 10087 | zone_submaps_names[idx], kr); |
| 10088 | } |
| 10089 | |
| 10090 | #if DEBUG || DEVELOPMENT |
| 10091 | printf("zone_init: map %-5s %p:%p (%u%c)\n" , |
| 10092 | zone_submaps_names[idx], (void *)submap_start, (void *)submap_end, |
| 10093 | mach_vm_size_pretty(submap_size), mach_vm_size_unit(submap_size)); |
| 10094 | #endif /* DEBUG || DEVELOPMENT */ |
| 10095 | |
| 10096 | zone_submaps[idx] = submap; |
| 10097 | *submap_min = submap_end; |
| 10098 | *remaining_size -= submap_size; |
| 10099 | *remaining_denom -= zone_sub_map_numer; |
| 10100 | } |
| 10101 | |
| 10102 | static inline void |
| 10103 | zone_pva_relocate(zone_pva_t *pva, uint32_t delta) |
| 10104 | { |
	if (!zone_pva_is_null(*pva) && !zone_pva_is_queue(*pva)) {
| 10106 | pva->packed_address += delta; |
| 10107 | } |
| 10108 | } |
| 10109 | |
| 10110 | /* |
| 10111 | * Allocate metadata array and migrate bootstrap initial metadata and memory. |
| 10112 | */ |
| 10113 | __startup_func |
| 10114 | static void |
| 10115 | zone_metadata_init(void) |
| 10116 | { |
| 10117 | vm_map_t vm_map = zone_submaps[Z_SUBMAP_IDX_VM]; |
| 10118 | vm_map_entry_t first; |
| 10119 | |
| 10120 | struct mach_vm_range meta_r, bits_r, xtra_r, early_r; |
| 10121 | vm_size_t early_sz; |
| 10122 | vm_offset_t reloc_base; |
| 10123 | |
| 10124 | /* |
| 10125 | * Step 1: Allocate the metadata + bitmaps range |
| 10126 | * |
| 10127 | * Allocations can't be smaller than 8 bytes, which is 128b / 16B per 1k |
| 10128 | * of physical memory (16M per 1G). |
| 10129 | * |
| 10130 | * Let's preallocate for the worst to avoid weird panics. |
| 10131 | */ |
	vm_map_will_allocate_early_map(&zone_meta_map);
	meta_r = zone_kmem_suballoc(zone_info.zi_meta_range.min_address,
	    zone_meta_size + zone_bits_size + zone_xtra_size,
	    VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE,
	    VM_KERN_MEMORY_ZONE, &zone_meta_map);
| 10137 | meta_r.min_address += ZONE_GUARD_SIZE; |
| 10138 | meta_r.max_address -= ZONE_GUARD_SIZE; |
| 10139 | if (zone_xtra_size) { |
| 10140 | xtra_r.max_address = meta_r.max_address; |
| 10141 | meta_r.max_address -= zone_xtra_size; |
| 10142 | xtra_r.min_address = meta_r.max_address; |
| 10143 | } else { |
| 10144 | xtra_r.min_address = xtra_r.max_address = 0; |
| 10145 | } |
| 10146 | bits_r.max_address = meta_r.max_address; |
| 10147 | meta_r.max_address -= zone_bits_size; |
| 10148 | bits_r.min_address = meta_r.max_address; |
| 10149 | |
| 10150 | #if DEBUG || DEVELOPMENT |
| 10151 | printf("zone_init: metadata %p:%p (%u%c)\n" , |
| 10152 | (void *)meta_r.min_address, (void *)meta_r.max_address, |
| 10153 | mach_vm_size_pretty(mach_vm_range_size(&meta_r)), |
| 10154 | mach_vm_size_unit(mach_vm_range_size(&meta_r))); |
| 10155 | printf("zone_init: metabits %p:%p (%u%c)\n" , |
| 10156 | (void *)bits_r.min_address, (void *)bits_r.max_address, |
| 10157 | mach_vm_size_pretty(mach_vm_range_size(&bits_r)), |
| 10158 | mach_vm_size_unit(mach_vm_range_size(&bits_r))); |
| 10159 | printf("zone_init: extra %p:%p (%u%c)\n" , |
| 10160 | (void *)xtra_r.min_address, (void *)xtra_r.max_address, |
| 10161 | mach_vm_size_pretty(mach_vm_range_size(&xtra_r)), |
| 10162 | mach_vm_size_unit(mach_vm_range_size(&xtra_r))); |
| 10163 | #endif /* DEBUG || DEVELOPMENT */ |
| 10164 | |
| 10165 | bits_r.min_address = (bits_r.min_address + ZBA_CHUNK_SIZE - 1) & -ZBA_CHUNK_SIZE; |
| 10166 | bits_r.max_address = bits_r.max_address & -ZBA_CHUNK_SIZE; |
| 10167 | |
| 10168 | /* |
| 10169 | * Step 2: Install new ranges. |
| 10170 | * Relocate metadata and bits. |
| 10171 | */ |
| 10172 | early_r = zone_info.zi_map_range; |
	early_sz = mach_vm_range_size(&early_r);
| 10174 | |
| 10175 | zone_info.zi_map_range = zone_map_range; |
| 10176 | zone_info.zi_meta_range = meta_r; |
| 10177 | zone_info.zi_bits_range = bits_r; |
| 10178 | zone_info.zi_xtra_range = xtra_r; |
| 10179 | zone_info.zi_meta_base = (struct zone_page_metadata *)meta_r.min_address - |
	    zone_pva_from_addr(zone_map_range.min_address).packed_address;
| 10181 | |
| 10182 | vm_map_lock(vm_map); |
| 10183 | first = vm_map_first_entry(vm_map); |
| 10184 | reloc_base = first->vme_end; |
| 10185 | first->vme_end += early_sz; |
| 10186 | vm_map->size += early_sz; |
| 10187 | vm_map_unlock(vm_map); |
| 10188 | |
| 10189 | struct zone_page_metadata *early_meta = zone_early_meta_array_startup; |
	struct zone_page_metadata *new_meta = zone_meta_from_addr(reloc_base);
| 10191 | vm_offset_t reloc_delta = reloc_base - early_r.min_address; |
| 10192 | /* this needs to sign extend */ |
| 10193 | uint32_t pva_delta = (uint32_t)((intptr_t)reloc_delta >> PAGE_SHIFT); |
| 10194 | |
	zone_meta_populate(reloc_base, early_sz);
	memcpy(new_meta, early_meta,
	    atop(early_sz) * sizeof(struct zone_page_metadata));
| 10198 | for (uint32_t i = 0; i < atop(early_sz); i++) { |
		zone_pva_relocate(&new_meta[i].zm_page_next, pva_delta);
		zone_pva_relocate(&new_meta[i].zm_page_prev, pva_delta);
| 10201 | } |
| 10202 | |
| 10203 | static_assert(ZONE_ID_VM_MAP_ENTRY == ZONE_ID_VM_MAP + 1); |
| 10204 | static_assert(ZONE_ID_VM_MAP_HOLES == ZONE_ID_VM_MAP + 2); |
| 10205 | |
| 10206 | for (zone_id_t zid = ZONE_ID_VM_MAP; zid <= ZONE_ID_VM_MAP_HOLES; zid++) { |
		zone_pva_relocate(&zone_array[zid].z_pageq_partial, pva_delta);
		zone_pva_relocate(&zone_array[zid].z_pageq_full, pva_delta);
| 10209 | } |
| 10210 | |
	zba_populate(0, false);
	memcpy(zba_base_header(), zba_chunk_startup, sizeof(zba_chunk_startup));
| 10213 | zba_meta()->zbam_right = (uint32_t)atop(zone_bits_size); |
| 10214 | |
| 10215 | /* |
	 * Step 3: Relocate the bootstrap VM structs
| 10217 | * (including rewriting their content). |
| 10218 | */ |
| 10219 | |
	kernel_memory_populate(reloc_base, early_sz,
	    KMA_KOBJECT | KMA_NOENCRYPT | KMA_NOFAIL | KMA_TAG,
	    VM_KERN_MEMORY_OSFMK);
	__nosan_memcpy((void *)reloc_base, (void *)early_r.min_address, early_sz);
| 10224 | |
| 10225 | #if KASAN |
| 10226 | kasan_notify_address(reloc_base, early_sz); |
| 10227 | #if KASAN_TBI |
| 10228 | kasan_tbi_copy_tags(reloc_base, early_r.min_address, early_sz); |
| 10229 | #endif /* KASAN_TBI */ |
| 10230 | #endif /* KASAN */ |
| 10231 | |
	vm_map_relocate_early_maps(reloc_delta);
| 10233 | |
| 10234 | for (uint32_t i = 0; i < atop(early_sz); i++) { |
| 10235 | zone_id_t zid = new_meta[i].zm_index; |
| 10236 | zone_t z = &zone_array[zid]; |
		vm_size_t esize = zone_elem_outer_size(z);
		vm_address_t base = reloc_base + ptoa(i) + zone_elem_inner_offs(z);
| 10239 | vm_address_t addr; |
| 10240 | |
| 10241 | if (new_meta[i].zm_chunk_len >= ZM_SECONDARY_PAGE) { |
| 10242 | continue; |
| 10243 | } |
| 10244 | |
| 10245 | for (uint32_t eidx = 0; eidx < z->z_chunk_elems; eidx++) { |
			if (zone_meta_is_free(&new_meta[i], eidx)) {
| 10247 | continue; |
| 10248 | } |
| 10249 | |
| 10250 | addr = vm_memtag_fixup_ptr(base + eidx * esize); |
| 10251 | #if KASAN_CLASSIC |
| 10252 | kasan_alloc(addr, |
| 10253 | zone_elem_inner_size(z), zone_elem_inner_size(z), |
| 10254 | zone_elem_redzone(z), false, |
| 10255 | __builtin_frame_address(0)); |
| 10256 | #endif |
			vm_map_relocate_early_elem(zid, addr, reloc_delta);
| 10258 | } |
| 10259 | } |
| 10260 | } |
| 10261 | |
| 10262 | __startup_data |
| 10263 | static uint16_t submap_ratios[Z_SUBMAP_IDX_COUNT] = { |
| 10264 | #if ZSECURITY_CONFIG(READ_ONLY) |
| 10265 | [Z_SUBMAP_IDX_VM] = 15, |
| 10266 | [Z_SUBMAP_IDX_READ_ONLY] = 5, |
| 10267 | #else |
| 10268 | [Z_SUBMAP_IDX_VM] = 20, |
| 10269 | #endif /* !ZSECURITY_CONFIG(READ_ONLY) */ |
| 10270 | #if ZSECURITY_CONFIG(SAD_FENG_SHUI) |
| 10271 | [Z_SUBMAP_IDX_GENERAL_0] = 15, |
| 10272 | [Z_SUBMAP_IDX_GENERAL_1] = 15, |
| 10273 | [Z_SUBMAP_IDX_GENERAL_2] = 15, |
| 10274 | [Z_SUBMAP_IDX_GENERAL_3] = 15, |
| 10275 | [Z_SUBMAP_IDX_DATA] = 20, |
| 10276 | #else |
| 10277 | [Z_SUBMAP_IDX_GENERAL_0] = 60, |
| 10278 | [Z_SUBMAP_IDX_DATA] = 20, |
| 10279 | #endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) */ |
| 10280 | }; |
| 10281 | |
| 10282 | __startup_func |
| 10283 | static inline uint16_t |
| 10284 | zone_submap_ratios_denom(void) |
| 10285 | { |
| 10286 | uint16_t denom = 0; |
| 10287 | |
| 10288 | for (unsigned idx = 0; idx < Z_SUBMAP_IDX_COUNT; idx++) { |
| 10289 | denom += submap_ratios[idx]; |
| 10290 | } |
| 10291 | |
| 10292 | assert(denom == 100); |
| 10293 | |
| 10294 | return denom; |
| 10295 | } |
| 10296 | |
| 10297 | __startup_func |
| 10298 | static inline vm_offset_t |
| 10299 | zone_restricted_va_max(void) |
| 10300 | { |
| 10301 | vm_offset_t compressor_max = VM_PACKING_MAX_PACKABLE(C_SLOT_PACKED_PTR); |
| 10302 | vm_offset_t vm_page_max = VM_PACKING_MAX_PACKABLE(VM_PAGE_PACKED_PTR); |
| 10303 | |
| 10304 | return trunc_page(MIN(compressor_max, vm_page_max)); |
| 10305 | } |
| 10306 | |
| 10307 | __startup_func |
| 10308 | static void |
| 10309 | zone_set_map_sizes(void) |
| 10310 | { |
| 10311 | vm_size_t zsize; |
| 10312 | vm_size_t zsizearg; |
| 10313 | |
| 10314 | /* |
| 10315 | * Compute the physical limits for the zone map |
| 10316 | */ |
| 10317 | |
	if (PE_parse_boot_argn("zsize", &zsizearg, sizeof(zsizearg))) {
| 10319 | zsize = zsizearg * (1024ULL * 1024); |
| 10320 | } else { |
| 10321 | /* Set target zone size as 1/4 of physical memory */ |
| 10322 | zsize = (vm_size_t)(sane_size >> 2); |
| 10323 | zsize += zsize >> 1; |
| 10324 | } |
| 10325 | |
| 10326 | if (zsize < CONFIG_ZONE_MAP_MIN) { |
| 10327 | zsize = CONFIG_ZONE_MAP_MIN; /* Clamp to min */ |
| 10328 | } |
| 10329 | if (zsize > sane_size >> 1) { |
| 10330 | zsize = (vm_size_t)(sane_size >> 1); /* Clamp to half of RAM max */ |
| 10331 | } |
| 10332 | if (zsizearg == 0 && zsize > ZONE_MAP_MAX) { |
| 10333 | /* if zsize boot-arg not present and zsize exceeds platform maximum, clip zsize */ |
		printf("NOTE: zonemap size reduced from 0x%lx to 0x%lx\n",
| 10335 | (uintptr_t)zsize, (uintptr_t)ZONE_MAP_MAX); |
| 10336 | zsize = ZONE_MAP_MAX; |
| 10337 | } |
| 10338 | |
| 10339 | zone_pages_wired_max = (uint32_t)atop(trunc_page(zsize)); |
| 10340 | |
| 10341 | |
| 10342 | /* |
| 10343 | * Declare restrictions on zone max |
| 10344 | */ |
| 10345 | vm_offset_t vm_submap_size = round_page( |
	    (submap_ratios[Z_SUBMAP_IDX_VM] * ZONE_MAP_VA_SIZE) /
| 10347 | zone_submap_ratios_denom()); |
| 10348 | |
| 10349 | #if CONFIG_PROB_GZALLOC |
| 10350 | vm_submap_size += pgz_get_size(); |
| 10351 | #endif /* CONFIG_PROB_GZALLOC */ |
| 10352 | if (os_sub_overflow(zone_restricted_va_max(), vm_submap_size, |
| 10353 | &zone_map_range.min_address)) { |
| 10354 | zone_map_range.min_address = 0; |
| 10355 | } |
| 10356 | |
| 10357 | zone_meta_size = round_page(atop(ZONE_MAP_VA_SIZE) * |
| 10358 | sizeof(struct zone_page_metadata)) + ZONE_GUARD_SIZE * 2; |
| 10359 | |
| 10360 | static_assert(ZONE_MAP_MAX / (CHAR_BIT * KALLOC_MINSIZE) <= |
| 10361 | ZBA_PTR_MASK + 1); |
| 10362 | zone_bits_size = round_page(ptoa(zone_pages_wired_max) / |
| 10363 | (CHAR_BIT * KALLOC_MINSIZE)); |
| 10364 | |
| 10365 | #if VM_TAG_SIZECLASSES |
| 10366 | if (zone_tagging_on) { |
| 10367 | zba_xtra_shift = (uint8_t)fls(sizeof(vm_tag_t) - 1); |
| 10368 | } |
| 10369 | if (zba_xtra_shift) { |
| 10370 | /* |
| 10371 | * if we need the extra space range, then limit the size of the |
| 10372 | * bitmaps to something reasonable instead of a theoretical |
| 10373 | * worst case scenario of all zones being for the smallest |
| 10374 | * allocation granule, in order to avoid fake VA pressure on |
| 10375 | * other parts of the system. |
| 10376 | */ |
| 10377 | zone_bits_size = round_page(zone_bits_size / 8); |
| 10378 | zone_xtra_size = round_page(zone_bits_size * CHAR_BIT << zba_xtra_shift); |
| 10379 | } |
| 10380 | #endif /* VM_TAG_SIZECLASSES */ |
| 10381 | } |
| 10382 | STARTUP(KMEM, STARTUP_RANK_FIRST, zone_set_map_sizes); |
| 10383 | |
| 10384 | /* |
| 10385 | * Can't use zone_info.zi_map_range at this point as it is being used to |
| 10386 | * store the range of early pmap memory that was stolen to bootstrap the |
| 10387 | * necessary VM zones. |
| 10388 | */ |
| 10389 | KMEM_RANGE_REGISTER_STATIC(zones, &zone_map_range, ZONE_MAP_VA_SIZE); |
| 10390 | KMEM_RANGE_REGISTER_DYNAMIC(zone_meta, &zone_info.zi_meta_range, ^{ |
| 10391 | return zone_meta_size + zone_bits_size + zone_xtra_size; |
| 10392 | }); |
| 10393 | |
| 10394 | /* |
| 10395 | * Global initialization of Zone Allocator. |
| 10396 | * Runs after zone_bootstrap. |
| 10397 | */ |
| 10398 | __startup_func |
| 10399 | static void |
| 10400 | zone_init(void) |
| 10401 | { |
| 10402 | vm_size_t remaining_size = ZONE_MAP_VA_SIZE; |
| 10403 | mach_vm_offset_t submap_min = 0; |
| 10404 | uint64_t denom = zone_submap_ratios_denom(); |
| 10405 | /* |
| 10406 | * And now allocate the various pieces of VA and submaps. |
| 10407 | */ |
| 10408 | |
| 10409 | submap_min = zone_map_range.min_address; |
| 10410 | |
| 10411 | #if CONFIG_PROB_GZALLOC |
| 10412 | vm_size_t pgz_size = pgz_get_size(); |
| 10413 | |
| 10414 | vm_map_will_allocate_early_map(&pgz_submap); |
| 10415 | zone_info.zi_pgz_range = zone_kmem_suballoc(submap_min, pgz_size, |
| 10416 | VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE, |
| 10417 | VM_KERN_MEMORY_ZONE, &pgz_submap); |
| 10418 | |
| 10419 | submap_min += pgz_size; |
| 10420 | remaining_size -= pgz_size; |
| 10421 | #if DEBUG || DEVELOPMENT |
| 10422 | printf("zone_init: pgzalloc %p:%p (%u%c) [%d slots]\n" , |
| 10423 | (void *)zone_info.zi_pgz_range.min_address, |
| 10424 | (void *)zone_info.zi_pgz_range.max_address, |
| 10425 | mach_vm_size_pretty(pgz_size), mach_vm_size_unit(pgz_size), |
| 10426 | pgz_slots); |
| 10427 | #endif /* DEBUG || DEVELOPMENT */ |
| 10428 | #endif /* CONFIG_PROB_GZALLOC */ |
| 10429 | |
| 10430 | /* |
| 10431 | * Allocate the submaps |
| 10432 | */ |
| 10433 | for (zone_submap_idx_t idx = 0; idx < Z_SUBMAP_IDX_COUNT; idx++) { |
| 10434 | if (submap_ratios[idx] == 0) { |
| 10435 | zone_submaps[idx] = VM_MAP_NULL; |
| 10436 | } else { |
			zone_submap_init(&submap_min, idx, submap_ratios[idx],
			    &denom, &remaining_size);
| 10439 | } |
| 10440 | } |
| 10441 | |
| 10442 | zone_metadata_init(); |
| 10443 | |
| 10444 | #if VM_TAG_SIZECLASSES |
| 10445 | if (zone_tagging_on) { |
| 10446 | vm_allocation_zones_init(); |
| 10447 | } |
| 10448 | #endif /* VM_TAG_SIZECLASSES */ |
| 10449 | |
| 10450 | zone_create_flags_t kma_flags = ZC_NOCACHING | ZC_NOGC | ZC_NOCALLOUT | |
| 10451 | ZC_KASAN_NOQUARANTINE | ZC_KASAN_NOREDZONE | ZC_VM; |
| 10452 | |
	(void)zone_create_ext("vm.permanent", 1, kma_flags | ZC_NOTBITAG,
	    ZONE_ID_PERMANENT, ^(zone_t z) {
| 10455 | z->z_permanent = true; |
| 10456 | z->z_elem_size = 1; |
| 10457 | }); |
	(void)zone_create_ext("vm.permanent.percpu", 1,
	    kma_flags | ZC_PERCPU | ZC_NOTBITAG, ZONE_ID_PERCPU_PERMANENT, ^(zone_t z) {
| 10460 | z->z_permanent = true; |
| 10461 | z->z_elem_size = 1; |
| 10462 | }); |
| 10463 | |
	zc_magazine_zone = zone_create("zcc_magazine_zone", sizeof(struct zone_magazine) +
	    zc_mag_size() * sizeof(vm_offset_t),
	    ZC_VM | ZC_NOCACHING | ZC_ZFREE_CLEARMEM | ZC_PGZ_USE_GUARDS);
	zone_raise_reserve(zc_magazine_zone, (uint16_t)(2 * zpercpu_count()));
| 10468 | |
| 10469 | /* |
| 10470 | * Now migrate the startup statistics into their final storage, |
| 10471 | * and enable logging for early zones (that zone_create_ext() skipped). |
| 10472 | */ |
| 10473 | int cpu = cpu_number(); |
| 10474 | zone_index_foreach(idx) { |
| 10475 | zone_t tz = &zone_array[idx]; |
| 10476 | |
| 10477 | if (tz->z_stats == __zpcpu_mangle_for_boot(&zone_stats_startup[idx])) { |
| 10478 | zone_stats_t zs = zalloc_percpu_permanent_type(struct zone_stats); |
| 10479 | |
| 10480 | *zpercpu_get_cpu(zs, cpu) = *zpercpu_get_cpu(tz->z_stats, cpu); |
| 10481 | tz->z_stats = zs; |
| 10482 | } |
| 10483 | if (tz->z_self == tz) { |
| 10484 | #if ZALLOC_ENABLE_LOGGING |
| 10485 | zone_setup_logging(tz); |
| 10486 | #endif /* ZALLOC_ENABLE_LOGGING */ |
| 10487 | #if KASAN_TBI |
| 10488 | zone_setup_kasan_logging(tz); |
| 10489 | #endif /* KASAN_TBI */ |
| 10490 | } |
| 10491 | } |
| 10492 | } |
| 10493 | STARTUP(ZALLOC, STARTUP_RANK_FIRST, zone_init); |
| 10494 | |
| 10495 | void |
| 10496 | zalloc_iokit_lockdown(void) |
| 10497 | { |
| 10498 | zone_share_always = false; |
| 10499 | } |
| 10500 | |
| 10501 | void |
| 10502 | zalloc_first_proc_made(void) |
| 10503 | { |
| 10504 | zone_caching_disabled = 0; |
| 10505 | zone_early_thres_mul = 1; |
| 10506 | } |
| 10507 | |
| 10508 | __startup_func |
| 10509 | vm_offset_t |
| 10510 | zone_early_mem_init(vm_size_t size) |
| 10511 | { |
| 10512 | vm_offset_t mem; |
| 10513 | |
| 10514 | assert3u(atop(size), <=, ZONE_EARLY_META_INLINE_COUNT); |
| 10515 | |
| 10516 | /* |
| 10517 | * The zone that is used early to bring up the VM is stolen here. |
| 10518 | * |
| 10519 | * When the zone subsystem is actually initialized, |
| 10520 | * zone_metadata_init() will be called, and those pages |
| 10521 | * and the elements they contain, will be relocated into |
	 * the VM submap (even on architectures where those zones
	 * do not live there).
| 10524 | */ |
| 10525 | assert3u(size, <=, sizeof(zone_early_pages_to_cram)); |
| 10526 | mem = (vm_offset_t)zone_early_pages_to_cram; |
| 10527 | |
| 10528 | zone_info.zi_meta_base = zone_early_meta_array_startup - |
	    zone_pva_from_addr(mem).packed_address;
| 10530 | zone_info.zi_map_range.min_address = mem; |
| 10531 | zone_info.zi_map_range.max_address = mem + size; |
| 10532 | |
| 10533 | zone_info.zi_bits_range = (struct mach_vm_range){ |
| 10534 | .min_address = (mach_vm_offset_t)zba_chunk_startup, |
| 10535 | .max_address = (mach_vm_offset_t)zba_chunk_startup + |
| 10536 | sizeof(zba_chunk_startup), |
| 10537 | }; |
| 10538 | |
| 10539 | zba_meta()->zbam_left = 1; |
| 10540 | zba_meta()->zbam_right = 1; |
	zba_init_chunk(0, false);
| 10542 | |
| 10543 | return mem; |
| 10544 | } |
| 10545 | |
| 10546 | #endif /* !ZALLOC_TEST */ |
| 10547 | #pragma mark - tests |
| 10548 | #if DEBUG || DEVELOPMENT |
| 10549 | |
| 10550 | /* |
| 10551 | * Used for sysctl zone tests that aren't thread-safe. Ensure only one |
| 10552 | * thread goes through at a time. |
| 10553 | * |
| 10554 | * Or we can end up with multiple test zones (if a second zinit() comes through |
| 10555 | * before zdestroy()), which could lead us to run out of zones. |
| 10556 | */ |
| 10557 | static bool any_zone_test_running = FALSE; |
| 10558 | |
| 10559 | static uintptr_t * |
| 10560 | zone_copy_allocations(zone_t z, uintptr_t *elems, zone_pva_t page_index) |
| 10561 | { |
| 10562 | vm_offset_t elem_size = zone_elem_outer_size(z); |
| 10563 | vm_offset_t base; |
| 10564 | struct zone_page_metadata *meta; |
| 10565 | |
| 10566 | while (!zone_pva_is_null(page_index)) { |
| 10567 | base = zone_pva_to_addr(page_index) + zone_elem_inner_offs(z); |
| 10568 | meta = zone_pva_to_meta(page_index); |
| 10569 | |
| 10570 | if (meta->zm_inline_bitmap) { |
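			/*
			 * This branch and the zba_bits_ref one below walk their
			 * bitmaps the same way: each "map &= map - 1" step clears
			 * the lowest set bit still pending, so the inner loops
			 * visit every set bit of the chunk exactly once.
			 */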
| 10571 | for (size_t i = 0; i < meta->zm_chunk_len; i++) { |
| 10572 | uint32_t map = meta[i].zm_bitmap; |
| 10573 | |
| 10574 | for (; map; map &= map - 1) { |
| 10575 | *elems++ = INSTANCE_PUT(base + |
| 10576 | elem_size * __builtin_clz(map)); |
| 10577 | } |
| 10578 | base += elem_size * 32; |
| 10579 | } |
| 10580 | } else { |
| 10581 | uint32_t order = zba_bits_ref_order(meta->zm_bitmap); |
| 10582 | bitmap_t *bits = zba_bits_ref_ptr(meta->zm_bitmap); |
| 10583 | for (size_t i = 0; i < (1u << order); i++) { |
| 10584 | uint64_t map = bits[i]; |
| 10585 | |
| 10586 | for (; map; map &= map - 1) { |
| 10587 | *elems++ = INSTANCE_PUT(base + |
| 10588 | elem_size * __builtin_clzll(map)); |
| 10589 | } |
| 10590 | base += elem_size * 64; |
| 10591 | } |
| 10592 | } |
| 10593 | |
| 10594 | page_index = meta->zm_page_next; |
| 10595 | } |
| 10596 | return elems; |
| 10597 | } |
| 10598 | |
| 10599 | kern_return_t |
| 10600 | zone_leaks(const char * zoneName, uint32_t nameLen, leak_site_proc proc) |
| 10601 | { |
| 10602 | zone_t zone = NULL; |
| 10603 | uintptr_t * array; |
| 10604 | uintptr_t * next; |
| 10605 | uintptr_t element; |
| 10606 | uint32_t idx, count, found; |
| 10607 | uint32_t nobtcount; |
| 10608 | uint32_t elemSize; |
| 10609 | size_t maxElems; |
| 10610 | |
| 10611 | zone_foreach(z) { |
| 10612 | if (!z->z_name) { |
| 10613 | continue; |
| 10614 | } |
| 10615 | if (!strncmp(zoneName, z->z_name, nameLen)) { |
| 10616 | zone = z; |
| 10617 | break; |
| 10618 | } |
| 10619 | } |
| 10620 | if (zone == NULL) { |
| 10621 | return KERN_INVALID_NAME; |
| 10622 | } |
| 10623 | |
| 10624 | elemSize = (uint32_t)zone_elem_inner_size(zone); |
| 10625 | maxElems = (zone->z_elems_avail + 1) & ~1ul; |
| 10626 | |
| 10627 | array = kalloc_type_tag(vm_offset_t, maxElems, VM_KERN_MEMORY_DIAG); |
| 10628 | if (array == NULL) { |
| 10629 | return KERN_RESOURCE_SHORTAGE; |
| 10630 | } |
| 10631 | |
| 10632 | zone_lock(zone); |
| 10633 | |
| 10634 | next = array; |
| 10635 | next = zone_copy_allocations(zone, next, zone->z_pageq_partial); |
| 10636 | next = zone_copy_allocations(zone, next, zone->z_pageq_full); |
| 10637 | count = (uint32_t)(next - array); |
| 10638 | |
| 10639 | zone_unlock(zone); |
| 10640 | |
| 10641 | zone_leaks_scan(array, count, (uint32_t)zone_elem_outer_size(zone), &found); |
| 10642 | assert(found <= count); |
| 10643 | |
| 10644 | for (idx = 0; idx < count; idx++) { |
| 10645 | element = array[idx]; |
| 10646 | if (kInstanceFlagReferenced & element) { |
| 10647 | continue; |
| 10648 | } |
| 10649 | element = INSTANCE_PUT(element) & ~kInstanceFlags; |
| 10650 | } |
| 10651 | |
| 10652 | #if ZALLOC_ENABLE_LOGGING |
| 10653 | if (zone->z_btlog && !corruption_debug_flag) { |
| 10654 | // btlog_copy_backtraces_for_elements will set kInstanceFlagReferenced on elements it found |
| 10655 | static_assert(sizeof(vm_address_t) == sizeof(uintptr_t)); |
| 10656 | btlog_copy_backtraces_for_elements(zone->z_btlog, |
| 10657 | (vm_address_t *)array, &count, elemSize, proc); |
| 10658 | } |
| 10659 | #endif /* ZALLOC_ENABLE_LOGGING */ |
| 10660 | |
| 10661 | for (nobtcount = idx = 0; idx < count; idx++) { |
| 10662 | element = array[idx]; |
| 10663 | if (!element) { |
| 10664 | continue; |
| 10665 | } |
| 10666 | if (kInstanceFlagReferenced & element) { |
| 10667 | continue; |
| 10668 | } |
| 10669 | nobtcount++; |
| 10670 | } |
| 10671 | if (nobtcount) { |
| 10672 | proc(nobtcount, elemSize, BTREF_NULL); |
| 10673 | } |
| 10674 | |
| 10675 | kfree_type(vm_offset_t, maxElems, array); |
| 10676 | return KERN_SUCCESS; |
| 10677 | } |
| 10678 | |
| 10679 | static int |
| 10680 | zone_ro_basic_test_run(__unused int64_t in, int64_t *out) |
| 10681 | { |
| 10682 | zone_security_flags_t zsflags; |
| 10683 | uint32_t x = 4; |
| 10684 | uint32_t *test_ptr; |
| 10685 | |
| 10686 | if (os_atomic_xchg(&any_zone_test_running, true, relaxed)) { |
| 10687 | printf("zone_ro_basic_test: Test already running.\n" ); |
| 10688 | return EALREADY; |
| 10689 | } |
| 10690 | |
| 10691 | zsflags = zone_security_array[ZONE_ID__FIRST_RO]; |
| 10692 | |
| 10693 | for (int i = 0; i < 3; i++) { |
| 10694 | #if ZSECURITY_CONFIG(READ_ONLY) |
| 10695 | /* Basic Test: Create int zone, zalloc int, modify value, free int */ |
| 10696 | printf("zone_ro_basic_test: Basic Test iteration %d\n" , i); |
| 10697 | printf("zone_ro_basic_test: create a sub-page size zone\n" ); |
| 10698 | |
| 10699 | printf("zone_ro_basic_test: verify flags were set\n" ); |
| 10700 | assert(zsflags.z_submap_idx == Z_SUBMAP_IDX_READ_ONLY); |
| 10701 | |
| 10702 | printf("zone_ro_basic_test: zalloc an element\n" ); |
| 10703 | test_ptr = (zalloc_ro)(ZONE_ID__FIRST_RO, Z_WAITOK); |
| 10704 | assert(test_ptr); |
| 10705 | |
| 10706 | printf("zone_ro_basic_test: verify we can't write to it\n" ); |
| 10707 | assert(verify_write(&x, test_ptr, sizeof(x)) == EFAULT); |
| 10708 | |
| 10709 | x = 4; |
| 10710 | printf("zone_ro_basic_test: test zalloc_ro_mut to assign value\n" ); |
| 10711 | zalloc_ro_mut(ZONE_ID__FIRST_RO, test_ptr, 0, &x, sizeof(uint32_t)); |
| 10712 | assert(test_ptr); |
| 10713 | assert(*(uint32_t*)test_ptr == x); |
| 10714 | |
| 10715 | x = 5; |
| 10716 | printf("zone_ro_basic_test: test zalloc_ro_update_elem to assign value\n" ); |
| 10717 | zalloc_ro_update_elem(ZONE_ID__FIRST_RO, test_ptr, &x); |
| 10718 | assert(test_ptr); |
| 10719 | assert(*(uint32_t*)test_ptr == x); |
| 10720 | |
| 10721 | printf("zone_ro_basic_test: verify we can't write to it after assigning value\n" ); |
| 10722 | assert(verify_write(&x, test_ptr, sizeof(x)) == EFAULT); |
| 10723 | |
| 10724 | printf("zone_ro_basic_test: free elem\n" ); |
| 10725 | zfree_ro(ZONE_ID__FIRST_RO, test_ptr); |
| 10726 | assert(!test_ptr); |
| 10727 | #else |
| 10728 | printf("zone_ro_basic_test: Read-only allocator n/a on 32bit platforms, test functionality of API\n" ); |
| 10729 | |
| 10730 | printf("zone_ro_basic_test: verify flags were set\n" ); |
| 10731 | assert(zsflags.z_submap_idx != Z_SUBMAP_IDX_READ_ONLY); |
| 10732 | |
| 10733 | printf("zone_ro_basic_test: zalloc an element\n" ); |
| 10734 | test_ptr = (zalloc_ro)(ZONE_ID__FIRST_RO, Z_WAITOK); |
| 10735 | assert(test_ptr); |
| 10736 | |
| 10737 | x = 4; |
| 10738 | printf("zone_ro_basic_test: test zalloc_ro_mut to assign value\n" ); |
| 10739 | zalloc_ro_mut(ZONE_ID__FIRST_RO, test_ptr, 0, &x, sizeof(uint32_t)); |
| 10740 | assert(test_ptr); |
| 10741 | assert(*(uint32_t*)test_ptr == x); |
| 10742 | |
| 10743 | x = 5; |
| 10744 | printf("zone_ro_basic_test: test zalloc_ro_update_elem to assign value\n" ); |
| 10745 | zalloc_ro_update_elem(ZONE_ID__FIRST_RO, test_ptr, &x); |
| 10746 | assert(test_ptr); |
| 10747 | assert(*(uint32_t*)test_ptr == x); |
| 10748 | |
| 10749 | printf("zone_ro_basic_test: free elem\n" ); |
| 10750 | zfree_ro(ZONE_ID__FIRST_RO, test_ptr); |
| 10751 | assert(!test_ptr); |
| 10752 | #endif /* !ZSECURITY_CONFIG(READ_ONLY) */ |
| 10753 | } |
| 10754 | |
| 10755 | printf("zone_ro_basic_test: garbage collection\n" ); |
| 10756 | zone_gc(ZONE_GC_DRAIN); |
| 10757 | |
| 10758 | printf("zone_ro_basic_test: Test passed\n" ); |
| 10759 | |
| 10760 | *out = 1; |
| 10761 | os_atomic_store(&any_zone_test_running, false, relaxed); |
| 10762 | return 0; |
| 10763 | } |
| 10764 | SYSCTL_TEST_REGISTER(zone_ro_basic_test, zone_ro_basic_test_run); |
| 10765 | |
| 10766 | static int |
| 10767 | zone_basic_test_run(__unused int64_t in, int64_t *out) |
| 10768 | { |
| 10769 | static zone_t test_zone_ptr = NULL; |
| 10770 | |
| 10771 | unsigned int i = 0, max_iter = 5; |
| 10772 | void * test_ptr; |
| 10773 | zone_t test_zone; |
| 10774 | int rc = 0; |
| 10775 | |
| 10776 | if (os_atomic_xchg(&any_zone_test_running, true, relaxed)) { |
| 10777 | printf("zone_basic_test: Test already running.\n" ); |
| 10778 | return EALREADY; |
| 10779 | } |
| 10780 | |
| 10781 | printf("zone_basic_test: Testing zinit(), zalloc(), zfree() and zdestroy() on zone \"test_zone_sysctl\"\n" ); |
| 10782 | |
| 10783 | /* zinit() and zdestroy() a zone with the same name a bunch of times, verify that we get back the same zone each time */ |
| 10784 | do { |
| 10785 | test_zone = zinit(sizeof(uint64_t), 100 * sizeof(uint64_t), sizeof(uint64_t), "test_zone_sysctl" ); |
| 10786 | assert(test_zone); |
| 10787 | |
| 10788 | #if KASAN_CLASSIC |
| 10789 | if (test_zone_ptr == NULL && test_zone->z_elems_free != 0) |
| 10790 | #else |
| 10791 | if (test_zone->z_elems_free != 0) |
| 10792 | #endif |
| 10793 | { |
| 10794 | printf("zone_basic_test: free count is not zero\n" ); |
| 10795 | rc = EIO; |
| 10796 | goto out; |
| 10797 | } |
| 10798 | |
| 10799 | if (test_zone_ptr == NULL) { |
			/* Stash the zone pointer returned on the first zinit */
| 10801 | printf("zone_basic_test: zone created for the first time\n" ); |
| 10802 | test_zone_ptr = test_zone; |
| 10803 | } else if (test_zone != test_zone_ptr) { |
| 10804 | printf("zone_basic_test: old zone pointer and new zone pointer don't match\n" ); |
| 10805 | rc = EIO; |
| 10806 | goto out; |
| 10807 | } |
| 10808 | |
| 10809 | test_ptr = zalloc_flags(test_zone, Z_WAITOK | Z_NOFAIL); |
| 10810 | zfree(test_zone, test_ptr); |
| 10811 | |
| 10812 | zdestroy(test_zone); |
| 10813 | i++; |
| 10814 | |
| 10815 | printf("zone_basic_test: Iteration %d successful\n" , i); |
| 10816 | } while (i < max_iter); |
| 10817 | |
| 10818 | #if !KASAN_CLASSIC /* because of the quarantine and redzones */ |
| 10819 | /* test Z_VA_SEQUESTER */ |
| 10820 | { |
| 10821 | zone_t test_pcpu_zone; |
| 10822 | kern_return_t kr; |
| 10823 | int idx, num_allocs = 8; |
| 10824 | vm_size_t elem_size = 2 * PAGE_SIZE / num_allocs; |
| 10825 | void *allocs[num_allocs]; |
| 10826 | void **allocs_pcpu; |
| 10827 | vm_offset_t phys_pages = os_atomic_load(&zone_pages_wired, relaxed); |
| 10828 | |
| 10829 | test_zone = zone_create("test_zone_sysctl" , elem_size, |
| 10830 | ZC_DESTRUCTIBLE); |
| 10831 | assert(test_zone); |
| 10832 | |
| 10833 | test_pcpu_zone = zone_create("test_zone_sysctl.pcpu" , sizeof(uint64_t), |
| 10834 | ZC_DESTRUCTIBLE | ZC_PERCPU); |
| 10835 | assert(test_pcpu_zone); |
| 10836 | |
| 10837 | for (idx = 0; idx < num_allocs; idx++) { |
| 10838 | allocs[idx] = zalloc(test_zone); |
| 10839 | assert(NULL != allocs[idx]); |
| 10840 | printf("alloc[%d] %p\n" , idx, allocs[idx]); |
| 10841 | } |
| 10842 | for (idx = 0; idx < num_allocs; idx++) { |
| 10843 | zfree(test_zone, allocs[idx]); |
| 10844 | } |
| 10845 | assert(!zone_pva_is_null(test_zone->z_pageq_empty)); |
| 10846 | |
| 10847 | kr = kmem_alloc(kernel_map, (vm_address_t *)&allocs_pcpu, PAGE_SIZE, |
| 10848 | KMA_ZERO | KMA_KOBJECT, VM_KERN_MEMORY_DIAG); |
| 10849 | assert(kr == KERN_SUCCESS); |
| 10850 | |
| 10851 | for (idx = 0; idx < PAGE_SIZE / sizeof(uint64_t); idx++) { |
| 10852 | allocs_pcpu[idx] = zalloc_percpu(test_pcpu_zone, |
| 10853 | Z_WAITOK | Z_ZERO); |
| 10854 | assert(NULL != allocs_pcpu[idx]); |
| 10855 | } |
| 10856 | for (idx = 0; idx < PAGE_SIZE / sizeof(uint64_t); idx++) { |
| 10857 | zfree_percpu(test_pcpu_zone, allocs_pcpu[idx]); |
| 10858 | } |
| 10859 | assert(!zone_pva_is_null(test_pcpu_zone->z_pageq_empty)); |
| 10860 | |
| 10861 | printf("vm_page_wire_count %d, vm_page_free_count %d, p to v %ld%%\n" , |
| 10862 | vm_page_wire_count, vm_page_free_count, |
| 10863 | 100L * phys_pages / zone_pages_wired_max); |
| 10864 | zone_gc(ZONE_GC_DRAIN); |
| 10865 | printf("vm_page_wire_count %d, vm_page_free_count %d, p to v %ld%%\n" , |
| 10866 | vm_page_wire_count, vm_page_free_count, |
| 10867 | 100L * phys_pages / zone_pages_wired_max); |
| 10868 | |
| 10869 | unsigned int allva = 0; |
| 10870 | |
| 10871 | zone_foreach(z) { |
| 10872 | zone_lock(z); |
| 10873 | allva += z->z_wired_cur; |
| 10874 | if (zone_pva_is_null(z->z_pageq_va)) { |
| 10875 | zone_unlock(z); |
| 10876 | continue; |
| 10877 | } |
| 10878 | unsigned count = 0; |
| 10879 | uint64_t size; |
| 10880 | zone_pva_t pg = z->z_pageq_va; |
| 10881 | struct zone_page_metadata *page_meta; |
| 10882 | while (pg.packed_address) { |
| 10883 | page_meta = zone_pva_to_meta(pg); |
| 10884 | count += z->z_percpu ? 1 : z->z_chunk_pages; |
| 10885 | if (page_meta->zm_chunk_len == ZM_SECONDARY_PAGE) { |
| 10886 | count -= page_meta->zm_page_index; |
| 10887 | } |
| 10888 | pg = page_meta->zm_page_next; |
| 10889 | } |
| 10890 | size = zone_size_wired(z); |
| 10891 | if (!size) { |
| 10892 | size = 1; |
| 10893 | } |
| 10894 | printf("%s%s: seq %d, res %d, %qd %%\n" , |
| 10895 | zone_heap_name(z), z->z_name, z->z_va_cur - z->z_wired_cur, |
| 10896 | z->z_wired_cur, zone_size_allocated(z) * 100ULL / size); |
| 10897 | zone_unlock(z); |
| 10898 | } |
| 10899 | |
| 10900 | printf("total va: %d\n" , allva); |
| 10901 | |
| 10902 | assert(zone_pva_is_null(test_zone->z_pageq_empty)); |
| 10903 | assert(zone_pva_is_null(test_zone->z_pageq_partial)); |
| 10904 | assert(!zone_pva_is_null(test_zone->z_pageq_va)); |
| 10905 | assert(zone_pva_is_null(test_pcpu_zone->z_pageq_empty)); |
| 10906 | assert(zone_pva_is_null(test_pcpu_zone->z_pageq_partial)); |
| 10907 | assert(!zone_pva_is_null(test_pcpu_zone->z_pageq_va)); |
| 10908 | |
| 10909 | for (idx = 0; idx < num_allocs; idx++) { |
| 10910 | assert(0 == pmap_find_phys(kernel_pmap, (addr64_t)(uintptr_t) allocs[idx])); |
| 10911 | } |
| 10912 | |
| 10913 | /* make sure the zone is still usable after a GC */ |
| 10914 | |
| 10915 | for (idx = 0; idx < num_allocs; idx++) { |
| 10916 | allocs[idx] = zalloc(test_zone); |
| 10917 | assert(allocs[idx]); |
| 10918 | printf("alloc[%d] %p\n" , idx, allocs[idx]); |
| 10919 | } |
| 10920 | for (idx = 0; idx < num_allocs; idx++) { |
| 10921 | zfree(test_zone, allocs[idx]); |
| 10922 | } |
| 10923 | |
| 10924 | for (idx = 0; idx < PAGE_SIZE / sizeof(uint64_t); idx++) { |
| 10925 | allocs_pcpu[idx] = zalloc_percpu(test_pcpu_zone, |
| 10926 | Z_WAITOK | Z_ZERO); |
| 10927 | assert(NULL != allocs_pcpu[idx]); |
| 10928 | } |
| 10929 | for (idx = 0; idx < PAGE_SIZE / sizeof(uint64_t); idx++) { |
| 10930 | zfree_percpu(test_pcpu_zone, allocs_pcpu[idx]); |
| 10931 | } |
| 10932 | |
| 10933 | assert(!zone_pva_is_null(test_pcpu_zone->z_pageq_empty)); |
| 10934 | |
| 10935 | kmem_free(kernel_map, (vm_address_t)allocs_pcpu, PAGE_SIZE); |
| 10936 | |
| 10937 | zdestroy(test_zone); |
| 10938 | zdestroy(test_pcpu_zone); |
| 10939 | } |
| 10940 | #endif /* KASAN_CLASSIC */ |
| 10941 | |
| 10942 | printf("zone_basic_test: Test passed\n" ); |
| 10943 | |
| 10944 | |
| 10945 | *out = 1; |
| 10946 | out: |
| 10947 | os_atomic_store(&any_zone_test_running, false, relaxed); |
| 10948 | return rc; |
| 10949 | } |
| 10950 | SYSCTL_TEST_REGISTER(zone_basic_test, zone_basic_test_run); |
| 10951 | |
| 10952 | struct zone_stress_obj { |
| 10953 | TAILQ_ENTRY(zone_stress_obj) zso_link; |
| 10954 | }; |
| 10955 | |
| 10956 | struct zone_stress_ctx { |
| 10957 | thread_t zsc_leader; |
| 10958 | lck_mtx_t zsc_lock; |
| 10959 | zone_t zsc_zone; |
| 10960 | uint64_t zsc_end; |
| 10961 | uint32_t zsc_workers; |
| 10962 | }; |
| 10963 | |
| 10964 | static void |
| 10965 | zone_stress_worker(void *arg, wait_result_t __unused wr) |
| 10966 | { |
| 10967 | struct zone_stress_ctx *ctx = arg; |
| 10968 | bool leader = ctx->zsc_leader == current_thread(); |
| 10969 | TAILQ_HEAD(zone_stress_head, zone_stress_obj) head = TAILQ_HEAD_INITIALIZER(head); |
| 10970 | struct zone_bool_gen bg = { }; |
| 10971 | struct zone_stress_obj *obj; |
| 10972 | uint32_t allocs = 0; |
| 10973 | |
| 10974 | random_bool_init(&bg.zbg_bg); |
| 10975 | |
| 10976 | do { |
| 10977 | for (int i = 0; i < 2000; i++) { |
| 10978 | uint32_t what = random_bool_gen_bits(&bg.zbg_bg, |
| 10979 | bg.zbg_entropy, ZONE_ENTROPY_CNT, 1); |
| 10980 | switch (what) { |
| 10981 | case 0: |
| 10982 | case 1: |
| 10983 | if (allocs < 10000) { |
| 10984 | obj = zalloc(ctx->zsc_zone); |
| 10985 | TAILQ_INSERT_HEAD(&head, obj, zso_link); |
| 10986 | allocs++; |
| 10987 | } |
| 10988 | break; |
| 10989 | case 2: |
| 10990 | case 3: |
| 10991 | if (allocs < 10000) { |
| 10992 | obj = zalloc(ctx->zsc_zone); |
| 10993 | TAILQ_INSERT_TAIL(&head, obj, zso_link); |
| 10994 | allocs++; |
| 10995 | } |
| 10996 | break; |
| 10997 | case 4: |
| 10998 | if (leader) { |
| 10999 | zone_gc(ZONE_GC_DRAIN); |
| 11000 | } |
| 11001 | break; |
| 11002 | case 5: |
| 11003 | case 6: |
| 11004 | if (!TAILQ_EMPTY(&head)) { |
| 11005 | obj = TAILQ_FIRST(&head); |
| 11006 | TAILQ_REMOVE(&head, obj, zso_link); |
| 11007 | zfree(ctx->zsc_zone, obj); |
| 11008 | allocs--; |
| 11009 | } |
| 11010 | break; |
| 11011 | case 7: |
| 11012 | if (!TAILQ_EMPTY(&head)) { |
| 11013 | obj = TAILQ_LAST(&head, zone_stress_head); |
| 11014 | TAILQ_REMOVE(&head, obj, zso_link); |
| 11015 | zfree(ctx->zsc_zone, obj); |
| 11016 | allocs--; |
| 11017 | } |
| 11018 | break; |
| 11019 | } |
| 11020 | } |
| 11021 | } while (mach_absolute_time() < ctx->zsc_end); |
| 11022 | |
| 11023 | while (!TAILQ_EMPTY(&head)) { |
| 11024 | obj = TAILQ_FIRST(&head); |
| 11025 | TAILQ_REMOVE(&head, obj, zso_link); |
| 11026 | zfree(ctx->zsc_zone, obj); |
| 11027 | } |
| 11028 | |
| 11029 | lck_mtx_lock(&ctx->zsc_lock); |
| 11030 | if (--ctx->zsc_workers == 0) { |
| 11031 | thread_wakeup(ctx); |
| 11032 | } else if (leader) { |
| 11033 | while (ctx->zsc_workers) { |
| 11034 | lck_mtx_sleep(&ctx->zsc_lock, LCK_SLEEP_DEFAULT, ctx, |
| 11035 | THREAD_UNINT); |
| 11036 | } |
| 11037 | } |
| 11038 | lck_mtx_unlock(&ctx->zsc_lock); |
| 11039 | |
| 11040 | if (!leader) { |
| 11041 | thread_terminate_self(); |
| 11042 | __builtin_unreachable(); |
| 11043 | } |
| 11044 | } |
| 11045 | |
| 11046 | static int |
| 11047 | zone_stress_test_run(__unused int64_t in, int64_t *out) |
| 11048 | { |
| 11049 | struct zone_stress_ctx ctx = { |
| 11050 | .zsc_leader = current_thread(), |
| 11051 | .zsc_workers = 3, |
| 11052 | }; |
| 11053 | kern_return_t kr; |
| 11054 | thread_t th; |
| 11055 | |
| 11056 | if (os_atomic_xchg(&any_zone_test_running, true, relaxed)) { |
| 11057 | printf("zone_stress_test: Test already running.\n" ); |
| 11058 | return EALREADY; |
| 11059 | } |
| 11060 | |
| 11061 | lck_mtx_init(&ctx.zsc_lock, &zone_locks_grp, LCK_ATTR_NULL); |
	ctx.zsc_zone = zone_create("test_zone_344", 344,
| 11063 | ZC_DESTRUCTIBLE | ZC_NOCACHING); |
| 11064 | assert(ctx.zsc_zone->z_chunk_pages > 1); |
| 11065 | |
| 11066 | clock_interval_to_deadline(5, NSEC_PER_SEC, &ctx.zsc_end); |
| 11067 | |
| 11068 | printf("zone_stress_test: Starting (leader %p)\n" , current_thread()); |
| 11069 | |
| 11070 | os_atomic_inc(&zalloc_simulate_vm_pressure, relaxed); |
| 11071 | |
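	/* the calling thread acts as worker 0 (the leader), so spawn one fewer */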
| 11072 | for (uint32_t i = 1; i < ctx.zsc_workers; i++) { |
| 11073 | kr = kernel_thread_start_priority(zone_stress_worker, &ctx, |
| 11074 | BASEPRI_DEFAULT, &th); |
| 11075 | if (kr == KERN_SUCCESS) { |
| 11076 | printf("zone_stress_test: thread %d: %p\n" , i, th); |
| 11077 | thread_deallocate(th); |
| 11078 | } else { |
| 11079 | ctx.zsc_workers--; |
| 11080 | } |
| 11081 | } |
| 11082 | |
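	/* run the workload on this thread too; the leader also waits for the others */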
| 11083 | zone_stress_worker(&ctx, 0); |
| 11084 | |
| 11085 | lck_mtx_destroy(&ctx.zsc_lock, &zone_locks_grp); |
| 11086 | |
| 11087 | zdestroy(ctx.zsc_zone); |
| 11088 | |
| 11089 | printf("zone_stress_test: Done\n" ); |
| 11090 | |
| 11091 | *out = 1; |
| 11092 | os_atomic_dec(&zalloc_simulate_vm_pressure, relaxed); |
| 11093 | os_atomic_store(&any_zone_test_running, false, relaxed); |
| 11094 | return 0; |
| 11095 | } |
| 11096 | SYSCTL_TEST_REGISTER(zone_stress_test, zone_stress_test_run); |
| 11097 | |
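/*
 * Pad each object out to 64 pointers (512 bytes on LP64) so that only a few
 * elements fit per page, which lets chunks go empty and become reclaimable
 * quickly during the GC stress runs.
 */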
| 11098 | struct zone_gc_stress_obj { |
| 11099 | STAILQ_ENTRY(zone_gc_stress_obj) zgso_link; |
| 11100 | uintptr_t zgso_pad[63]; |
| 11101 | }; |
| 11102 | STAILQ_HEAD(zone_gc_stress_head, zone_gc_stress_obj); |
| 11103 | |
| 11104 | #define ZONE_GC_OBJ_PER_PAGE (PAGE_SIZE / sizeof(struct zone_gc_stress_obj)) |
| 11105 | |
| 11106 | KALLOC_TYPE_DEFINE(zone_gc_stress_zone, struct zone_gc_stress_obj, KT_DEFAULT); |
| 11107 | |
| 11108 | struct zone_gc_stress_ctx { |
| 11109 | bool zgsc_done; |
| 11110 | lck_mtx_t zgsc_lock; |
| 11111 | zone_t zgsc_zone; |
| 11112 | uint64_t zgsc_end; |
| 11113 | uint32_t zgsc_workers; |
| 11114 | }; |
| 11115 | |
| 11116 | static void |
| 11117 | zone_gc_stress_test_alloc_n(struct zone_gc_stress_head *head, size_t n) |
| 11118 | { |
| 11119 | struct zone_gc_stress_obj *obj; |
| 11120 | |
| 11121 | for (size_t i = 0; i < n; i++) { |
| 11122 | obj = zalloc_flags(zone_gc_stress_zone, Z_WAITOK); |
| 11123 | STAILQ_INSERT_TAIL(head, obj, zgso_link); |
| 11124 | } |
| 11125 | } |
| 11126 | |
| 11127 | static void |
| 11128 | zone_gc_stress_test_free_n(struct zone_gc_stress_head *head) |
| 11129 | { |
| 11130 | struct zone_gc_stress_obj *obj; |
| 11131 | |
| 11132 | while ((obj = STAILQ_FIRST(head))) { |
| 11133 | STAILQ_REMOVE_HEAD(head, zgso_link); |
| 11134 | zfree(zone_gc_stress_zone, obj); |
| 11135 | } |
| 11136 | } |
| 11137 | |
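/*
 * GC stress workers churn a few pages worth of elements in a tight loop until
 * the leader sets zgsc_done, keeping the zone busy while the leader runs
 * zone_reclaim() against it.
 */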
| 11138 | __dead2 |
| 11139 | static void |
| 11140 | zone_gc_stress_worker(void *arg, wait_result_t __unused wr) |
| 11141 | { |
| 11142 | struct zone_gc_stress_ctx *ctx = arg; |
| 11143 | struct zone_gc_stress_head head = STAILQ_HEAD_INITIALIZER(head); |
| 11144 | |
| 11145 | while (!ctx->zgsc_done) { |
| 11146 | zone_gc_stress_test_alloc_n(&head, ZONE_GC_OBJ_PER_PAGE * 4); |
| 11147 | zone_gc_stress_test_free_n(&head); |
| 11148 | } |
| 11149 | |
| 11150 | lck_mtx_lock(&ctx->zgsc_lock); |
| 11151 | if (--ctx->zgsc_workers == 0) { |
| 11152 | thread_wakeup(ctx); |
| 11153 | } |
| 11154 | lck_mtx_unlock(&ctx->zgsc_lock); |
| 11155 | |
| 11156 | thread_terminate_self(); |
| 11157 | __builtin_unreachable(); |
| 11158 | } |
| 11159 | |
| 11160 | static int |
| 11161 | zone_gc_stress_test_run(__unused int64_t in, int64_t *out) |
| 11162 | { |
| 11163 | struct zone_gc_stress_head head = STAILQ_HEAD_INITIALIZER(head); |
| 11164 | struct zone_gc_stress_ctx ctx = { |
| 11165 | .zgsc_workers = 3, |
| 11166 | }; |
| 11167 | kern_return_t kr; |
| 11168 | thread_t th; |
| 11169 | |
| 11170 | if (os_atomic_xchg(&any_zone_test_running, true, relaxed)) { |
| 11171 | printf("zone_gc_stress_test: Test already running.\n" ); |
| 11172 | return EALREADY; |
| 11173 | } |
| 11174 | |
| 11175 | lck_mtx_init(&ctx.zgsc_lock, &zone_locks_grp, LCK_ATTR_NULL); |
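	/*
	 * Hold zgsc_lock for the entire run: workers only take it at the very
	 * end to drop zgsc_workers and issue the wakeup, which they cannot do
	 * before the leader has gone to sleep in lck_mtx_sleep() below, so the
	 * final wakeup cannot be missed.
	 */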
| 11176 | lck_mtx_lock(&ctx.zgsc_lock); |
| 11177 | |
| 11178 | printf("zone_gc_stress_test: Starting (leader %p)\n" , current_thread()); |
| 11179 | |
| 11180 | os_atomic_inc(&zalloc_simulate_vm_pressure, relaxed); |
| 11181 | |
| 11182 | for (uint32_t i = 0; i < ctx.zgsc_workers; i++) { |
| 11183 | kr = kernel_thread_start_priority(zone_gc_stress_worker, &ctx, |
| 11184 | BASEPRI_DEFAULT, &th); |
| 11185 | if (kr == KERN_SUCCESS) { |
| 11186 | printf("zone_gc_stress_test: thread %d: %p\n" , i, th); |
| 11187 | thread_deallocate(th); |
| 11188 | } else { |
| 11189 | ctx.zgsc_workers--; |
| 11190 | } |
| 11191 | } |
| 11192 | |
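	/*
	 * Each round allocates enough elements (at least 20 pages worth) to
	 * spill past the per-CPU caches, frees them all, then trims the zone
	 * while the workers keep churning.
	 */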
| 11193 | for (uint64_t i = 0; i < in; i++) { |
| 11194 | size_t count = zc_mag_size() * zc_free_batch_size() * 10; |
| 11195 | |
| 11196 | if (count < ZONE_GC_OBJ_PER_PAGE * 20) { |
| 11197 | count = ZONE_GC_OBJ_PER_PAGE * 20; |
| 11198 | } |
| 11199 | |
| 11200 | zone_gc_stress_test_alloc_n(&head, count); |
| 11201 | zone_gc_stress_test_free_n(&head); |
| 11202 | |
| 11203 | lck_mtx_lock(&zone_gc_lock); |
| 11204 | zone_reclaim(zone_gc_stress_zone->kt_zv.zv_zone, |
| 11205 | ZONE_RECLAIM_TRIM); |
| 11206 | lck_mtx_unlock(&zone_gc_lock); |
| 11207 | |
| 11208 | printf("zone_gc_stress_test: round %lld/%lld\n" , i + 1, in); |
| 11209 | } |
| 11210 | |
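	/*
	 * Publish the done flag (the fence keeps it ordered after the workload
	 * above) and sleep until the last worker drops zgsc_workers to zero.
	 */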
| 11211 | os_atomic_thread_fence(seq_cst); |
| 11212 | ctx.zgsc_done = true; |
| 11213 | lck_mtx_sleep(&ctx.zgsc_lock, LCK_SLEEP_DEFAULT, &ctx, THREAD_UNINT); |
| 11214 | lck_mtx_unlock(&ctx.zgsc_lock); |
| 11215 | |
| 11216 | lck_mtx_destroy(&ctx.zgsc_lock, &zone_locks_grp); |
| 11217 | |
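	/* with all workers gone, fully drain the test zone before tearing down */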
| 11218 | lck_mtx_lock(&zone_gc_lock); |
| 11219 | zone_reclaim(zone_gc_stress_zone->kt_zv.zv_zone, |
| 11220 | ZONE_RECLAIM_DRAIN); |
| 11221 | lck_mtx_unlock(&zone_gc_lock); |
| 11222 | |
| 11223 | printf("zone_gc_stress_test: Done\n" ); |
| 11224 | |
| 11225 | *out = 1; |
| 11226 | os_atomic_dec(&zalloc_simulate_vm_pressure, relaxed); |
| 11227 | os_atomic_store(&any_zone_test_running, false, relaxed); |
| 11228 | return 0; |
| 11229 | } |
| 11230 | SYSCTL_TEST_REGISTER(zone_gc_stress_test, zone_gc_stress_test_run); |
| 11231 | |
| 11232 | /* |
| 11233 | * Routines to test that zone garbage collection and zone replenish threads |
| 11234 | * running at the same time don't cause problems. |
| 11235 | */ |
| 11236 | |
| 11237 | static int |
| 11238 | zone_gc_replenish_test(__unused int64_t in, int64_t *out) |
| 11239 | { |
| 11240 | zone_gc(ZONE_GC_DRAIN); |
| 11241 | *out = 1; |
| 11242 | return 0; |
| 11243 | } |
| 11244 | SYSCTL_TEST_REGISTER(zone_gc_replenish_test, zone_gc_replenish_test); |
| 11245 | |
| 11246 | static int |
| 11247 | zone_alloc_replenish_test(__unused int64_t in, int64_t *out) |
| 11248 | { |
| 11249 | zone_t z = vm_map_entry_zone; |
| 11250 | struct data { struct data *next; } *node, *list = NULL; |
| 11251 | |
| 11252 | if (z == NULL) { |
| 11253 | printf("Couldn't find a replenish zone\n" ); |
| 11254 | return EIO; |
| 11255 | } |
| 11256 | |
| 11257 | /* big enough to go past replenishment */ |
| 11258 | for (uint32_t i = 0; i < 10 * z->z_elems_rsv; ++i) { |
| 11259 | node = zalloc(z); |
| 11260 | node->next = list; |
| 11261 | list = node; |
| 11262 | } |
| 11263 | |
| 11264 | /* |
| 11265 | * release the memory we allocated |
| 11266 | */ |
| 11267 | while (list != NULL) { |
| 11268 | node = list; |
| 11269 | list = list->next; |
| 11270 | zfree(z, node); |
| 11271 | } |
| 11272 | |
| 11273 | *out = 1; |
| 11274 | return 0; |
| 11275 | } |
| 11276 | SYSCTL_TEST_REGISTER(zone_alloc_replenish_test, zone_alloc_replenish_test); |
| 11277 | |
| 11278 | #endif /* DEBUG || DEVELOPMENT */ |
| 11279 | |