/*
 * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <skywalk/os_skywalk_private.h>
#define _FN_KPRINTF
#include <pexpert/pexpert.h>	/* for PE_parse_boot_argn */
#include <libkern/OSDebug.h>	/* for OSBacktrace */
#include <kern/sched_prim.h>	/* for assert_wait */
#include <vm/vm_memtag.h>

/*
 * Memory allocator with per-CPU caching (magazines), derived from the kmem
 * magazine concept and implementation as described in the following paper:
 * http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
 *
 * That implementation is Copyright 2006 Sun Microsystems, Inc. All rights
 * reserved. Use is subject to license terms.
 *
 * This derivative differs from the original kmem slab allocator, in that:
 *
 * a) There is always a discrete bufctl per object, even for small sizes.
 *    This increases the overhead, but is necessary as Skywalk objects
 *    coming from the slab may be shared (RO or RW) with userland; therefore
 *    embedding the KVA pointer linkage in freed objects is a non-starter.
 *
 * b) Writing patterns to the slab at slab creation or destruction time
 *    (when debugging is enabled) is not implemented, as the object may
 *    be shared (RW) with userland and thus we cannot panic upon pattern
 *    mismatch episodes. This can be relaxed so that we conditionally
 *    verify the pattern for kernel-only memory.
 *
 * This derivative also differs from Darwin's mcache allocator (which itself
 * is a derivative of the original kmem slab allocator), in that:
 *
 * 1) The slab layer is internal to skmem_cache, unlike mcache's external
 *    slab layer required to support mbufs. skmem_cache also supports
 *    constructing and deconstructing objects, while mcache does not.
 *    This brings skmem_cache's model closer to that of the original
 *    kmem slab allocator.
 *
 * 2) mcache allows for batch allocation and free by way of chaining the
 *    objects together using a linked list. This requires using a part
 *    of the object to act as the linkage, which is against Skywalk's
 *    requirements of not exposing any KVA pointer to userland. Although
 *    this is supported by skmem_cache, chaining is only possible if the
 *    region is not mapped to userland. That implies that kernel-only
 *    objects can be chained provided the cache is created with batching
 *    mode enabled, and that the object is large enough to contain the
 *    skmem_obj structure.
 *
 * In other words, skmem_cache is a hybrid of a hybrid custom allocator that
 * implements features that are required by Skywalk. In addition to being
 * aware of userland access on the buffers, it also supports mirrored backend
 * memory regions. This allows a cache to manage two independent memory
 * regions, such that allocating/freeing an object from/to one results in
 * allocating/freeing a shadow object in another, thus guaranteeing that both
 * objects share the same lifetime.
 */
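
/*
 * Illustrative usage sketch (added for exposition; not taken from any actual
 * caller, and the "my_*" names below are hypothetical): a client creates a
 * cache on top of a region, allocates and frees objects through it, and
 * destroys the cache once every object has been returned.
 *
 *	struct skmem_cache *skm;
 *
 *	skm = skmem_cache_create("my_objs", sizeof(struct my_obj),
 *	    sizeof(uint64_t), my_ctor, my_dtor, NULL, NULL, my_region, 0);
 *
 *	void *obj = skmem_cache_alloc(skm, SKMEM_SLEEP);
 *	...
 *	skmem_cache_free(skm, obj);
 *	skmem_cache_destroy(skm);
 *
 * The cache must be empty (all objects freed back) before it is destroyed.
 */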

static uint32_t ncpu;	/* total # of initialized CPUs */

static LCK_MTX_DECLARE_ATTR(skmem_cache_lock, &skmem_lock_grp, &skmem_lock_attr);
static struct thread *skmem_lock_owner = THREAD_NULL;

static LCK_GRP_DECLARE(skmem_sl_lock_grp, "skmem_slab");
static LCK_GRP_DECLARE(skmem_dp_lock_grp, "skmem_depot");
static LCK_GRP_DECLARE(skmem_cpu_lock_grp, "skmem_cpu_cache");

#define SKMEM_CACHE_LOCK() do {			\
	lck_mtx_lock(&skmem_cache_lock);	\
	skmem_lock_owner = current_thread();	\
} while (0)
#define SKMEM_CACHE_UNLOCK() do {		\
	skmem_lock_owner = THREAD_NULL;		\
	lck_mtx_unlock(&skmem_cache_lock);	\
} while (0)
#define SKMEM_CACHE_LOCK_ASSERT_HELD()	\
	LCK_MTX_ASSERT(&skmem_cache_lock, LCK_MTX_ASSERT_OWNED)
#define SKMEM_CACHE_LOCK_ASSERT_NOTHELD()	\
	LCK_MTX_ASSERT(&skmem_cache_lock, LCK_MTX_ASSERT_NOTOWNED)

#define SKM_SLAB_LOCK(_skm)	\
	lck_mtx_lock(&(_skm)->skm_sl_lock)
#define SKM_SLAB_LOCK_ASSERT_HELD(_skm)	\
	LCK_MTX_ASSERT(&(_skm)->skm_sl_lock, LCK_MTX_ASSERT_OWNED)
#define SKM_SLAB_LOCK_ASSERT_NOTHELD(_skm)	\
	LCK_MTX_ASSERT(&(_skm)->skm_sl_lock, LCK_MTX_ASSERT_NOTOWNED)
#define SKM_SLAB_UNLOCK(_skm)	\
	lck_mtx_unlock(&(_skm)->skm_sl_lock)

#define SKM_DEPOT_LOCK(_skm)	\
	lck_mtx_lock(&(_skm)->skm_dp_lock)
#define SKM_DEPOT_LOCK_SPIN(_skm)	\
	lck_mtx_lock_spin(&(_skm)->skm_dp_lock)
#define SKM_DEPOT_CONVERT_LOCK(_skm)	\
	lck_mtx_convert_spin(&(_skm)->skm_dp_lock)
#define SKM_DEPOT_LOCK_TRY(_skm)	\
	lck_mtx_try_lock(&(_skm)->skm_dp_lock)
#define SKM_DEPOT_LOCK_ASSERT_HELD(_skm)	\
	LCK_MTX_ASSERT(&(_skm)->skm_dp_lock, LCK_MTX_ASSERT_OWNED)
#define SKM_DEPOT_LOCK_ASSERT_NOTHELD(_skm)	\
	LCK_MTX_ASSERT(&(_skm)->skm_dp_lock, LCK_MTX_ASSERT_NOTOWNED)
#define SKM_DEPOT_UNLOCK(_skm)	\
	lck_mtx_unlock(&(_skm)->skm_dp_lock)

#define SKM_RESIZE_LOCK(_skm)	\
	lck_mtx_lock(&(_skm)->skm_rs_lock)
#define SKM_RESIZE_LOCK_ASSERT_HELD(_skm)	\
	LCK_MTX_ASSERT(&(_skm)->skm_rs_lock, LCK_MTX_ASSERT_OWNED)
#define SKM_RESIZE_LOCK_ASSERT_NOTHELD(_skm)	\
	LCK_MTX_ASSERT(&(_skm)->skm_rs_lock, LCK_MTX_ASSERT_NOTOWNED)
#define SKM_RESIZE_UNLOCK(_skm)	\
	lck_mtx_unlock(&(_skm)->skm_rs_lock)

#define SKM_CPU_LOCK(_cp)	\
	lck_mtx_lock(&(_cp)->cp_lock)
#define SKM_CPU_LOCK_SPIN(_cp)	\
	lck_mtx_lock_spin(&(_cp)->cp_lock)
#define SKM_CPU_CONVERT_LOCK(_cp)	\
	lck_mtx_convert_spin(&(_cp)->cp_lock)
#define SKM_CPU_LOCK_ASSERT_HELD(_cp)	\
	LCK_MTX_ASSERT(&(_cp)->cp_lock, LCK_MTX_ASSERT_OWNED)
#define SKM_CPU_LOCK_ASSERT_NOTHELD(_cp)	\
	LCK_MTX_ASSERT(&(_cp)->cp_lock, LCK_MTX_ASSERT_NOTOWNED)
#define SKM_CPU_UNLOCK(_cp)	\
	lck_mtx_unlock(&(_cp)->cp_lock)

#define SKM_ZONE_MAX	256

static struct zone *skm_zone;	/* zone for skmem_cache */

static struct skmem_cache *skmem_slab_cache;	/* cache for skmem_slab */
static struct skmem_cache *skmem_bufctl_cache;	/* cache for skmem_bufctl */
static unsigned int bc_size;	/* size of bufctl */

/*
 * Magazine types (one per row.)
 *
 * The first column defines the number of objects that the magazine can hold.
 * Using that number, we derive the effective number: the aggregate count of
 * object pointers, plus 2 pointers (skmem_mag linkage + magazine type).
 * This would result in an object size that is aligned on the CPU cache
 * size boundary; the exception to this is the KASAN mode where the size
 * would be larger due to the redzone regions.
 *
 * The second column defines the alignment of the magazine. Because each
 * magazine is used at the CPU-layer cache, we need to ensure there is no
 * false sharing across the CPUs, and align the magazines to the maximum
 * cache alignment size, for simplicity. The value of 0 may be used to
 * indicate natural pointer size alignment.
 *
 * The third column defines the starting magazine type for a given cache,
 * determined at the cache's creation time based on its chunk size.
 *
 * The fourth column defines the magazine type limit for a given cache.
 * Magazine resizing will only occur if the chunk size is less than this.
 */
static struct skmem_magtype skmem_magtype[] = {
#if defined(__LP64__)
	{ .mt_magsize = 14, .mt_align = 0, .mt_minbuf = 128, .mt_maxbuf = 512,
	  .mt_cache = NULL, .mt_cname = "" },
	{ .mt_magsize = 30, .mt_align = 0, .mt_minbuf = 96, .mt_maxbuf = 256,
	  .mt_cache = NULL, .mt_cname = "" },
	{ .mt_magsize = 46, .mt_align = 0, .mt_minbuf = 64, .mt_maxbuf = 128,
	  .mt_cache = NULL, .mt_cname = "" },
	{ .mt_magsize = 62, .mt_align = 0, .mt_minbuf = 32, .mt_maxbuf = 64,
	  .mt_cache = NULL, .mt_cname = "" },
	{ .mt_magsize = 94, .mt_align = 0, .mt_minbuf = 16, .mt_maxbuf = 32,
	  .mt_cache = NULL, .mt_cname = "" },
	{ .mt_magsize = 126, .mt_align = 0, .mt_minbuf = 8, .mt_maxbuf = 16,
	  .mt_cache = NULL, .mt_cname = "" },
	{ .mt_magsize = 142, .mt_align = 0, .mt_minbuf = 0, .mt_maxbuf = 8,
	  .mt_cache = NULL, .mt_cname = "" },
	{ .mt_magsize = 158, .mt_align = 0, .mt_minbuf = 0, .mt_maxbuf = 0,
	  .mt_cache = NULL, .mt_cname = "" },
#else /* !__LP64__ */
	{ .mt_magsize = 14, .mt_align = 0, .mt_minbuf = 0, .mt_maxbuf = 0,
	  .mt_cache = NULL, .mt_cname = "" },
#endif /* !__LP64__ */
};
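
/*
 * Worked example (illustrative; assumes 64-bit pointers and 64-byte CPU
 * cache lines): the first LP64 row above holds 14 rounds, so with the two
 * extra pointers the magazine occupies (14 + 2) * 8 = 128 bytes, i.e.
 * exactly two cache lines. A cache whose chunk size is 512 bytes or more
 * starts at that row and stays there, since resizing only applies while
 * the chunk size is below mt_maxbuf.
 */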

/*
 * Hash table bounds. Start with the initial value, and rescale up to
 * the specified limit. Ideally we don't need a limit, but in practice
 * this helps guard against runaways. These values should be revisited
 * in future and be adjusted as needed.
 */
#define SKMEM_CACHE_HASH_INITIAL 64	/* initial hash table size */
#define SKMEM_CACHE_HASH_LIMIT 8192	/* hash table size limit */

#define SKMEM_CACHE_HASH_INDEX(_a, _s, _m) (((_a) >> (_s)) & (_m))
#define SKMEM_CACHE_HASH(_skm, _buf)	\
	(&(_skm)->skm_hash_table[SKMEM_CACHE_HASH_INDEX((uintptr_t)_buf, \
	(_skm)->skm_hash_shift, (_skm)->skm_hash_mask)])
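
/*
 * Bucket computation example (illustrative, hypothetical chunk size): for a
 * 2048-byte chunk the shift is flsll(2048) - 1 = 11, so object addresses
 * 2 KB apart fall into consecutive buckets, and the index is masked with
 * skm_hash_mask to stay within the table.
 */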

/*
 * The last magazine type.
 */
static struct skmem_magtype *skmem_cache_magsize_last;

static TAILQ_HEAD(, skmem_cache) skmem_cache_head;
static boolean_t skmem_cache_ready;

static int skmem_slab_alloc_locked(struct skmem_cache *,
    struct skmem_obj_info *, struct skmem_obj_info *, uint32_t);
static void skmem_slab_free_locked(struct skmem_cache *, void *);
static int skmem_slab_alloc_pseudo_locked(struct skmem_cache *,
    struct skmem_obj_info *, struct skmem_obj_info *, uint32_t);
static void skmem_slab_free_pseudo_locked(struct skmem_cache *, void *);
static struct skmem_slab *skmem_slab_create(struct skmem_cache *, uint32_t);
static void skmem_slab_destroy(struct skmem_cache *, struct skmem_slab *);
static int skmem_magazine_ctor(struct skmem_obj_info *,
    struct skmem_obj_info *, void *, uint32_t);
static void skmem_magazine_destroy(struct skmem_cache *, struct skmem_mag *,
    int);
static uint32_t skmem_depot_batch_alloc(struct skmem_cache *,
    struct skmem_maglist *, uint32_t *, struct skmem_mag **, uint32_t);
static void skmem_depot_batch_free(struct skmem_cache *, struct skmem_maglist *,
    uint32_t *, struct skmem_mag *);
static void skmem_depot_ws_update(struct skmem_cache *);
static void skmem_depot_ws_zero(struct skmem_cache *);
static void skmem_depot_ws_reap(struct skmem_cache *);
static void skmem_cache_magazine_purge(struct skmem_cache *);
static void skmem_cache_magazine_enable(struct skmem_cache *, uint32_t);
static void skmem_cache_magazine_resize(struct skmem_cache *);
static void skmem_cache_hash_rescale(struct skmem_cache *);
static void skmem_cpu_reload(struct skmem_cpu_cache *, struct skmem_mag *, int);
static void skmem_cpu_batch_reload(struct skmem_cpu_cache *,
    struct skmem_mag *, int);
static void skmem_cache_applyall(void (*)(struct skmem_cache *, uint32_t),
    uint32_t);
static void skmem_cache_reclaim(struct skmem_cache *, uint32_t);
static void skmem_cache_reap_start(void);
static void skmem_cache_reap_done(void);
static void skmem_cache_reap_func(thread_call_param_t, thread_call_param_t);
static void skmem_cache_update_func(thread_call_param_t, thread_call_param_t);
static int skmem_cache_resize_enter(struct skmem_cache *, boolean_t);
static void skmem_cache_resize_exit(struct skmem_cache *);
static void skmem_audit_bufctl(struct skmem_bufctl *);
static void skmem_audit_buf(struct skmem_cache *, struct skmem_obj *);
static int skmem_cache_mib_get_sysctl SYSCTL_HANDLER_ARGS;

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, cache,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, skmem_cache_mib_get_sysctl, "S,sk_stats_cache",
    "Skywalk cache statistics");

static volatile uint32_t skmem_cache_reaping;
static thread_call_t skmem_cache_reap_tc;
static thread_call_t skmem_cache_update_tc;

extern kern_return_t thread_terminate(thread_t);
extern unsigned int ml_wait_max_cpus(void);

#define SKMEM_DEBUG_NOMAGAZINES	0x1	/* disable magazines layer */
#define SKMEM_DEBUG_AUDIT	0x2	/* audit transactions */
#define SKMEM_DEBUG_MASK (SKMEM_DEBUG_NOMAGAZINES|SKMEM_DEBUG_AUDIT)

#if DEBUG
static uint32_t skmem_debug = SKMEM_DEBUG_AUDIT;
#else /* !DEBUG */
static uint32_t skmem_debug = 0;
#endif /* !DEBUG */

static uint32_t skmem_clear_min = 0;	/* clear on free threshold */

#define SKMEM_CACHE_UPDATE_INTERVAL 11	/* 11 seconds */
static uint32_t skmem_cache_update_interval = SKMEM_CACHE_UPDATE_INTERVAL;

#define SKMEM_DEPOT_CONTENTION 3	/* max failed trylock per interval */
static int skmem_cache_depot_contention = SKMEM_DEPOT_CONTENTION;

/*
 * Too big a value will cause overflow and thus trip the assertion; the
 * idea here is to set an upper limit for the time that a particular
 * thread is allowed to perform retries before we give up and panic.
 */
#define SKMEM_SLAB_MAX_BACKOFF (20 * USEC_PER_SEC)	/* 20 seconds, in usec */

/*
 * Threshold (in msec) after which we reset the exponential backoff value
 * back to its (random) initial value. Note that we allow the actual delay
 * to be at most twice this value.
 */
#define SKMEM_SLAB_BACKOFF_THRES 1024	/* up to ~2 sec (2048 msec) */

/*
 * To reduce the likelihood of global synchronization between threads,
 * we use some random value to start the exponential backoff.
 */
#define SKMEM_SLAB_BACKOFF_RANDOM 4	/* range is [1,4] msec */
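
/*
 * Putting the three knobs above together (illustrative sequence): a thread
 * that keeps failing to create a slab in sleeping mode first waits a random
 * 1-4 msec, doubles that wait on each retry (e.g. 3, 6, 12, ... msec),
 * re-randomizes once the delay exceeds SKMEM_SLAB_BACKOFF_THRES, and panics
 * once the total time waited crosses SKMEM_SLAB_MAX_BACKOFF.
 */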

#if (DEVELOPMENT || DEBUG)
SYSCTL_UINT(_kern_skywalk_mem, OID_AUTO, cache_update_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &skmem_cache_update_interval,
    SKMEM_CACHE_UPDATE_INTERVAL, "Cache update interval");
SYSCTL_INT(_kern_skywalk_mem, OID_AUTO, cache_depot_contention,
    CTLFLAG_RW | CTLFLAG_LOCKED, &skmem_cache_depot_contention,
    SKMEM_DEPOT_CONTENTION, "Depot contention");

static uint32_t skmem_cache_update_interval_saved = SKMEM_CACHE_UPDATE_INTERVAL;

/*
 * Called by skmem_test_start() to set the update interval.
 */
void
skmem_cache_test_start(uint32_t i)
{
	skmem_cache_update_interval_saved = skmem_cache_update_interval;
	skmem_cache_update_interval = i;
}

/*
 * Called by skmem_test_stop() to restore the update interval.
 */
void
skmem_cache_test_stop(void)
{
	skmem_cache_update_interval = skmem_cache_update_interval_saved;
}
#endif /* (DEVELOPMENT || DEBUG) */

#define SKMEM_TAG_BUFCTL_HASH "com.apple.skywalk.bufctl.hash"
static SKMEM_TAG_DEFINE(skmem_tag_bufctl_hash, SKMEM_TAG_BUFCTL_HASH);

#define SKMEM_TAG_CACHE_MIB "com.apple.skywalk.cache.mib"
static SKMEM_TAG_DEFINE(skmem_tag_cache_mib, SKMEM_TAG_CACHE_MIB);

static int __skmem_cache_pre_inited = 0;
static int __skmem_cache_inited = 0;

/*
 * Called before skmem_region_init().
 */
void
skmem_cache_pre_init(void)
{
	vm_size_t skm_size;

	ASSERT(!__skmem_cache_pre_inited);

	ncpu = ml_wait_max_cpus();

	/* allocate extra in case we need to manually align the pointer */
	if (skm_zone == NULL) {
		skm_size = SKMEM_CACHE_SIZE(ncpu);
#if KASAN
		/*
		 * When KASAN is enabled, the zone allocator adjusts the
		 * element size to include the redzone regions, in which
		 * case we assume that the elements won't start on the
		 * alignment boundary and thus need to do some fix-ups.
		 * These include increasing the effective object size
		 * which adds at least 136 bytes to the original size,
		 * as computed by SKMEM_CACHE_SIZE() above.
		 */
		skm_size += (sizeof(void *) + CHANNEL_CACHE_ALIGN_MAX);
#endif /* KASAN */
		skm_size = P2ROUNDUP(skm_size, CHANNEL_CACHE_ALIGN_MAX);
		skm_zone = zone_create(SKMEM_ZONE_PREFIX ".skm", skm_size,
		    ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM | ZC_DESTRUCTIBLE);
	}

	TAILQ_INIT(&skmem_cache_head);

	__skmem_cache_pre_inited = 1;
}

/*
 * Called after skmem_region_init().
 */
void
skmem_cache_init(void)
{
	uint32_t cpu_cache_line_size = skmem_cpu_cache_line_size();
	struct skmem_magtype *mtp;
	uint32_t i;

	_CASSERT(SKMEM_CACHE_HASH_LIMIT >= SKMEM_CACHE_HASH_INITIAL);

	_CASSERT(SKM_MODE_NOMAGAZINES == SCA_MODE_NOMAGAZINES);
	_CASSERT(SKM_MODE_AUDIT == SCA_MODE_AUDIT);
	_CASSERT(SKM_MODE_NOREDIRECT == SCA_MODE_NOREDIRECT);
	_CASSERT(SKM_MODE_BATCH == SCA_MODE_BATCH);
	_CASSERT(SKM_MODE_DYNAMIC == SCA_MODE_DYNAMIC);
	_CASSERT(SKM_MODE_CLEARONFREE == SCA_MODE_CLEARONFREE);
	_CASSERT(SKM_MODE_PSEUDO == SCA_MODE_PSEUDO);

	ASSERT(__skmem_cache_pre_inited);
	ASSERT(!__skmem_cache_inited);

	PE_parse_boot_argn("skmem_debug", &skmem_debug, sizeof(skmem_debug));
	skmem_debug &= SKMEM_DEBUG_MASK;

#if (DEVELOPMENT || DEBUG)
	PE_parse_boot_argn("skmem_clear_min", &skmem_clear_min,
	    sizeof(skmem_clear_min));
#endif /* (DEVELOPMENT || DEBUG) */
	if (skmem_clear_min == 0) {
		/* zeroing 2 CPU cache lines practically comes for free */
		skmem_clear_min = 2 * cpu_cache_line_size;
	} else {
		/* round it up to CPU cache line size */
		skmem_clear_min = (uint32_t)P2ROUNDUP(skmem_clear_min,
		    cpu_cache_line_size);
	}
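
	/*
	 * For instance (illustrative numbers only): with 64-byte cache
	 * lines the default threshold above is 128 bytes, and a
	 * hypothetical boot-arg of skmem_clear_min=100 would likewise
	 * be rounded up to 128.
	 */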

	/* create a cache for buffer control structures */
	if (skmem_debug & SKMEM_DEBUG_AUDIT) {
		bc_size = sizeof(struct skmem_bufctl_audit);
		skmem_bufctl_cache = skmem_cache_create("bufctl.audit",
		    bc_size, sizeof(uint64_t), NULL, NULL,
		    NULL, NULL, NULL, 0);
	} else {
		bc_size = sizeof(struct skmem_bufctl);
		skmem_bufctl_cache = skmem_cache_create("bufctl",
		    bc_size, sizeof(uint64_t), NULL, NULL,
		    NULL, NULL, NULL, 0);
	}

	/* create a cache for slab structures */
	skmem_slab_cache = skmem_cache_create("slab",
	    sizeof(struct skmem_slab), sizeof(uint64_t), NULL, NULL, NULL,
	    NULL, NULL, 0);

	/*
	 * Go through the magazine type table and create a cache for each.
	 */
	for (i = 0; i < sizeof(skmem_magtype) / sizeof(*mtp); i++) {
		mtp = &skmem_magtype[i];

		if (mtp->mt_align != 0 &&
		    ((mtp->mt_align & (mtp->mt_align - 1)) != 0 ||
		    mtp->mt_align < (int)cpu_cache_line_size)) {
			panic("%s: bad alignment %d", __func__, mtp->mt_align);
			/* NOTREACHED */
			__builtin_unreachable();
		}
		(void) snprintf(mtp->mt_cname, sizeof(mtp->mt_cname),
		    "mg.%d", mtp->mt_magsize);

		/* create a cache for this magazine type */
		mtp->mt_cache = skmem_cache_create(mtp->mt_cname,
		    SKMEM_MAG_SIZE(mtp->mt_magsize), mtp->mt_align,
		    skmem_magazine_ctor, NULL, NULL, mtp, NULL, 0);

		/* remember the last magazine type */
		skmem_cache_magsize_last = mtp;
	}

	VERIFY(skmem_cache_magsize_last != NULL);
	VERIFY(skmem_cache_magsize_last->mt_minbuf == 0);
	VERIFY(skmem_cache_magsize_last->mt_maxbuf == 0);

	/*
	 * Allocate thread calls for cache reap and update operations.
	 */
	skmem_cache_reap_tc =
	    thread_call_allocate_with_options(skmem_cache_reap_func,
	    NULL, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	skmem_cache_update_tc =
	    thread_call_allocate_with_options(skmem_cache_update_func,
	    NULL, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	if (skmem_cache_reap_tc == NULL || skmem_cache_update_tc == NULL) {
		panic("%s: thread_call_allocate failed", __func__);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/*
	 * We're ready; go through existing skmem_cache entries
	 * (if any) and enable the magazines layer for each.
	 */
	skmem_cache_applyall(skmem_cache_magazine_enable, 0);
	skmem_cache_ready = TRUE;

	/* and start the periodic cache update machinery */
	skmem_dispatch(skmem_cache_update_tc, NULL,
	    (skmem_cache_update_interval * NSEC_PER_SEC));

	__skmem_cache_inited = 1;
}

void
skmem_cache_fini(void)
{
	struct skmem_magtype *mtp;
	uint32_t i;

	if (__skmem_cache_inited) {
		ASSERT(TAILQ_EMPTY(&skmem_cache_head));

		for (i = 0; i < sizeof(skmem_magtype) / sizeof(*mtp); i++) {
			mtp = &skmem_magtype[i];
			skmem_cache_destroy(mtp->mt_cache);
			mtp->mt_cache = NULL;
		}
		skmem_cache_destroy(skmem_slab_cache);
		skmem_slab_cache = NULL;
		skmem_cache_destroy(skmem_bufctl_cache);
		skmem_bufctl_cache = NULL;

		if (skmem_cache_reap_tc != NULL) {
			(void) thread_call_cancel_wait(skmem_cache_reap_tc);
			(void) thread_call_free(skmem_cache_reap_tc);
			skmem_cache_reap_tc = NULL;
		}
		if (skmem_cache_update_tc != NULL) {
			(void) thread_call_cancel_wait(skmem_cache_update_tc);
			(void) thread_call_free(skmem_cache_update_tc);
			skmem_cache_update_tc = NULL;
		}

		__skmem_cache_inited = 0;
	}

	if (__skmem_cache_pre_inited) {
		if (skm_zone != NULL) {
			zdestroy(skm_zone);
			skm_zone = NULL;
		}

		__skmem_cache_pre_inited = 0;
	}
}

/*
 * Create a cache.
 */
struct skmem_cache *
skmem_cache_create(const char *name, size_t bufsize, size_t bufalign,
    skmem_ctor_fn_t ctor, skmem_dtor_fn_t dtor, skmem_reclaim_fn_t reclaim,
    void *private, struct skmem_region *region, uint32_t cflags)
{
	boolean_t pseudo = (region == NULL);
	struct skmem_magtype *mtp;
	struct skmem_cache *skm;
	void *buf;
	size_t segsize;
	size_t chunksize;
	size_t objsize;
	size_t objalign;
	uint32_t i, cpuid;

	/* enforce 64-bit minimum alignment for buffers */
	if (bufalign == 0) {
		bufalign = SKMEM_CACHE_ALIGN;
	}
	bufalign = P2ROUNDUP(bufalign, SKMEM_CACHE_ALIGN);

	/* enforce alignment to be a power of 2 */
	VERIFY(powerof2(bufalign));

	if (region == NULL) {
		struct skmem_region_params srp;

		/* batching is currently not supported on pseudo regions */
		VERIFY(!(cflags & SKMEM_CR_BATCH));

		srp = *skmem_get_default(SKMEM_REGION_INTRINSIC);
		ASSERT(srp.srp_cflags == SKMEM_REGION_CR_PSEUDO);

		/* objalign is always equal to bufalign */
		srp.srp_align = objalign = bufalign;
		srp.srp_r_obj_cnt = 1;
		srp.srp_r_obj_size = (uint32_t)bufsize;
		skmem_region_params_config(&srp);

		/* allocate region for intrinsics */
		region = skmem_region_create(name, &srp, NULL, NULL, NULL);
		VERIFY(region->skr_c_obj_size >= P2ROUNDUP(bufsize, bufalign));
		VERIFY(objalign == region->skr_align);
#if KASAN
		/*
		 * When KASAN is enabled, the zone allocator adjusts the
		 * element size to include the redzone regions, in which
		 * case we assume that the elements won't start on the
		 * alignment boundary and thus need to do some fix-ups.
		 * These include increasing the effective object size
		 * which adds at least 16 bytes to the original size,
		 * as computed by skmem_region_params_config() above.
		 */
		VERIFY(region->skr_c_obj_size >=
		    (bufsize + sizeof(uint64_t) + bufalign));
#endif /* KASAN */
		/* enable magazine resizing by default */
		cflags |= SKMEM_CR_DYNAMIC;

		/*
		 * For consistency with ZC_ZFREE_CLEARMEM on skr->zreg,
		 * even though it's a no-op since the work is done
		 * at the zone layer instead.
		 */
		cflags |= SKMEM_CR_CLEARONFREE;
	} else {
		objalign = region->skr_align;
	}

	ASSERT(region != NULL);
	ASSERT(!(region->skr_mode & SKR_MODE_MIRRORED));
	segsize = region->skr_seg_size;
	ASSERT(bufalign <= segsize);

	buf = zalloc_flags(skm_zone, Z_WAITOK | Z_ZERO);
#if KASAN
	/*
	 * In case we didn't get a cache-aligned memory, round it up
	 * accordingly. This is needed in order to get the rest of
	 * structure members aligned properly. It also means that
	 * the memory span gets shifted due to the round up, but it
	 * is okay since we've allocated extra space for this.
	 */
	skm = (struct skmem_cache *)
	    P2ROUNDUP((intptr_t)buf + sizeof(void *), CHANNEL_CACHE_ALIGN_MAX);
	void **pbuf = (void **)((intptr_t)skm - sizeof(void *));
	*pbuf = buf;
#else /* !KASAN */
	/*
	 * We expect that the zone allocator would allocate elements
	 * rounded up to the requested alignment based on the object
	 * size computed in skmem_cache_pre_init() earlier, and
	 * 'skm' is therefore the element address itself.
	 */
	skm = buf;
#endif /* !KASAN */
	VERIFY(IS_P2ALIGNED(skm, CHANNEL_CACHE_ALIGN_MAX));

	if ((skmem_debug & SKMEM_DEBUG_NOMAGAZINES) ||
	    (cflags & SKMEM_CR_NOMAGAZINES)) {
		/*
		 * Either the caller insists that this cache should not
		 * utilize magazines layer, or that the system override
		 * to disable magazines layer on all caches has been set.
		 */
		skm->skm_mode |= SKM_MODE_NOMAGAZINES;
	} else {
		/*
		 * Region must be configured with enough objects
		 * to take into account objects at the CPU layer.
		 */
		ASSERT(!(region->skr_mode & SKR_MODE_NOMAGAZINES));
	}

	if (cflags & SKMEM_CR_DYNAMIC) {
		/*
		 * Enable per-CPU cache magazine resizing.
		 */
		skm->skm_mode |= SKM_MODE_DYNAMIC;
	}

	/* region stays around after defunct? */
	if (region->skr_mode & SKR_MODE_NOREDIRECT) {
		skm->skm_mode |= SKM_MODE_NOREDIRECT;
	}

	if (cflags & SKMEM_CR_BATCH) {
		/*
		 * Batch alloc/free involves storing the next object
		 * pointer at the beginning of each object; this is
		 * okay for kernel-only regions, but not those that
		 * are mappable to user space (we can't leak kernel
		 * addresses).
		 */
		_CASSERT(offsetof(struct skmem_obj, mo_next) == 0);
		VERIFY(!(region->skr_mode & SKR_MODE_MMAPOK));

		/* batching is currently not supported on pseudo regions */
		VERIFY(!(region->skr_mode & SKR_MODE_PSEUDO));

		/* validate object size */
		VERIFY(region->skr_c_obj_size >= sizeof(struct skmem_obj));

		skm->skm_mode |= SKM_MODE_BATCH;
	}

	uuid_generate_random(skm->skm_uuid);
	(void) snprintf(skm->skm_name, sizeof(skm->skm_name),
	    "%s.%s", SKMEM_CACHE_PREFIX, name);
	skm->skm_bufsize = bufsize;
	skm->skm_bufalign = bufalign;
	skm->skm_objalign = objalign;
	skm->skm_ctor = ctor;
	skm->skm_dtor = dtor;
	skm->skm_reclaim = reclaim;
	skm->skm_private = private;
	skm->skm_slabsize = segsize;

	skm->skm_region = region;
	/* callee holds reference */
	skmem_region_slab_config(region, skm, true);
	objsize = region->skr_c_obj_size;
	skm->skm_objsize = objsize;

	if (pseudo) {
		/*
		 * Release reference from skmem_region_create()
		 * since skm->skm_region holds one now.
		 */
		ASSERT(region->skr_mode & SKR_MODE_PSEUDO);
		skmem_region_release(region);

		skm->skm_mode |= SKM_MODE_PSEUDO;

		skm->skm_slab_alloc = skmem_slab_alloc_pseudo_locked;
		skm->skm_slab_free = skmem_slab_free_pseudo_locked;
	} else {
		skm->skm_slab_alloc = skmem_slab_alloc_locked;
		skm->skm_slab_free = skmem_slab_free_locked;

		/* auditing was requested? (normal regions only) */
		if (skmem_debug & SKMEM_DEBUG_AUDIT) {
			ASSERT(bc_size == sizeof(struct skmem_bufctl_audit));
			skm->skm_mode |= SKM_MODE_AUDIT;
		}
	}

	/*
	 * Clear upon free (to slab layer) as long as the region is
	 * not marked as read-only for kernel, and if the chunk size
	 * is within the threshold or if the caller had requested it.
	 */
	if (!(region->skr_mode & SKR_MODE_KREADONLY)) {
		if (skm->skm_objsize <= skmem_clear_min ||
		    (cflags & SKMEM_CR_CLEARONFREE)) {
			skm->skm_mode |= SKM_MODE_CLEARONFREE;
		}
	}

	chunksize = bufsize;
	if (bufalign >= SKMEM_CACHE_ALIGN) {
		chunksize = P2ROUNDUP(chunksize, SKMEM_CACHE_ALIGN);
	}

	chunksize = P2ROUNDUP(chunksize, bufalign);
	if (chunksize > objsize) {
		panic("%s: (bufsize %lu, chunksize %lu) > objsize %lu",
		    __func__, bufsize, chunksize, objsize);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	ASSERT(chunksize != 0);
	skm->skm_chunksize = chunksize;
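
	/*
	 * Example of the rounding above (hypothetical sizes): bufsize 100
	 * with bufalign 64 is first rounded to the 64-bit minimum (104)
	 * and then to the 64-byte alignment, yielding a 128-byte chunk;
	 * the backing object size must be at least that large, or the
	 * panic above would have fired.
	 */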

	lck_mtx_init(&skm->skm_sl_lock, &skmem_sl_lock_grp, &skmem_lock_attr);
	TAILQ_INIT(&skm->skm_sl_partial_list);
	TAILQ_INIT(&skm->skm_sl_empty_list);

	/* allocated-address hash table */
	skm->skm_hash_initial = SKMEM_CACHE_HASH_INITIAL;
	skm->skm_hash_limit = SKMEM_CACHE_HASH_LIMIT;
	skm->skm_hash_table = sk_alloc_type_array(struct skmem_bufctl_bkt,
	    skm->skm_hash_initial, Z_WAITOK | Z_NOFAIL, skmem_tag_bufctl_hash);

	skm->skm_hash_mask = (skm->skm_hash_initial - 1);
	skm->skm_hash_shift = flsll(chunksize) - 1;

	for (i = 0; i < (skm->skm_hash_mask + 1); i++) {
		SLIST_INIT(&skm->skm_hash_table[i].bcb_head);
	}

	lck_mtx_init(&skm->skm_dp_lock, &skmem_dp_lock_grp, &skmem_lock_attr);

	/* find a suitable magazine type for this chunk size */
	for (mtp = skmem_magtype; chunksize <= mtp->mt_minbuf; mtp++) {
		continue;
	}
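
	/*
	 * Selection example (illustrative, LP64 table above): a 96-byte
	 * chunk skips the rows whose mt_minbuf is 128 and 96 and settles
	 * on the 46-round magazine; since 96 is below that row's
	 * mt_maxbuf of 128, the cache remains eligible for magazine
	 * resizing later on.
	 */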

	skm->skm_magtype = mtp;
	if (!(skm->skm_mode & SKM_MODE_NOMAGAZINES)) {
		skm->skm_cpu_mag_size = skm->skm_magtype->mt_magsize;
	}

	/*
	 * Initialize the CPU layer. Each per-CPU structure is aligned
	 * on the CPU cache line boundary to prevent false sharing.
	 */
	lck_mtx_init(&skm->skm_rs_lock, &skmem_cpu_lock_grp, &skmem_lock_attr);
	for (cpuid = 0; cpuid < ncpu; cpuid++) {
		struct skmem_cpu_cache *ccp = &skm->skm_cpu_cache[cpuid];

		VERIFY(IS_P2ALIGNED(ccp, CHANNEL_CACHE_ALIGN_MAX));
		lck_mtx_init(&ccp->cp_lock, &skmem_cpu_lock_grp,
		    &skmem_lock_attr);
		ccp->cp_rounds = -1;
		ccp->cp_prounds = -1;
	}

	SKMEM_CACHE_LOCK();
	TAILQ_INSERT_TAIL(&skmem_cache_head, skm, skm_link);
	SKMEM_CACHE_UNLOCK();

	SK_DF(SK_VERB_MEM_CACHE, "\"%s\": skm 0x%llx mode 0x%b",
	    skm->skm_name, SK_KVA(skm), skm->skm_mode, SKM_MODE_BITS);
	SK_DF(SK_VERB_MEM_CACHE,
	    " bufsz %u bufalign %u chunksz %u objsz %u slabsz %u",
	    (uint32_t)skm->skm_bufsize, (uint32_t)skm->skm_bufalign,
	    (uint32_t)skm->skm_chunksize, (uint32_t)skm->skm_objsize,
	    (uint32_t)skm->skm_slabsize);

	if (skmem_cache_ready) {
		skmem_cache_magazine_enable(skm, 0);
	}

	if (cflags & SKMEM_CR_RECLAIM) {
		skm->skm_mode |= SKM_MODE_RECLAIM;
	}

	return skm;
}

/*
 * Destroy a cache.
 */
void
skmem_cache_destroy(struct skmem_cache *skm)
{
	uint32_t cpuid;

	SKMEM_CACHE_LOCK();
	TAILQ_REMOVE(&skmem_cache_head, skm, skm_link);
	SKMEM_CACHE_UNLOCK();

	ASSERT(skm->skm_rs_busy == 0);
	ASSERT(skm->skm_rs_want == 0);

	/* purge all cached objects for this cache */
	skmem_cache_magazine_purge(skm);

	/*
	 * Panic if we detect there are unfreed objects; the caller
	 * destroying this cache is responsible for ensuring that all
	 * allocated objects have been freed prior to getting here.
	 */
	SKM_SLAB_LOCK(skm);
	if (skm->skm_sl_bufinuse != 0) {
		panic("%s: '%s' (%p) not empty (%llu unfreed)", __func__,
		    skm->skm_name, (void *)skm, skm->skm_sl_bufinuse);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	ASSERT(TAILQ_EMPTY(&skm->skm_sl_partial_list));
	ASSERT(skm->skm_sl_partial == 0);
	ASSERT(TAILQ_EMPTY(&skm->skm_sl_empty_list));
	ASSERT(skm->skm_sl_empty == 0);
	skm->skm_reclaim = NULL;
	skm->skm_ctor = NULL;
	skm->skm_dtor = NULL;
	SKM_SLAB_UNLOCK(skm);

	if (skm->skm_hash_table != NULL) {
#if (DEBUG || DEVELOPMENT)
		for (uint32_t i = 0; i < (skm->skm_hash_mask + 1); i++) {
			ASSERT(SLIST_EMPTY(&skm->skm_hash_table[i].bcb_head));
		}
#endif /* DEBUG || DEVELOPMENT */

		sk_free_type_array(struct skmem_bufctl_bkt,
		    skm->skm_hash_mask + 1, skm->skm_hash_table);
		skm->skm_hash_table = NULL;
	}

	for (cpuid = 0; cpuid < ncpu; cpuid++) {
		lck_mtx_destroy(&skm->skm_cpu_cache[cpuid].cp_lock,
		    &skmem_cpu_lock_grp);
	}
	lck_mtx_destroy(&skm->skm_rs_lock, &skmem_cpu_lock_grp);
	lck_mtx_destroy(&skm->skm_dp_lock, &skmem_dp_lock_grp);
	lck_mtx_destroy(&skm->skm_sl_lock, &skmem_sl_lock_grp);

	SK_DF(SK_VERB_MEM_CACHE, "\"%s\": skm 0x%llx",
	    skm->skm_name, SK_KVA(skm));

	/* callee releases reference */
	skmem_region_slab_config(skm->skm_region, skm, false);
	skm->skm_region = NULL;

#if KASAN
	/* get the original address since we're about to free it */
	void **pbuf = (void **)((intptr_t)skm - sizeof(void *));
	skm = *pbuf;
#endif /* KASAN */

	zfree(skm_zone, skm);
}

/*
 * Create a slab.
 */
static struct skmem_slab *
skmem_slab_create(struct skmem_cache *skm, uint32_t skmflag)
{
	struct skmem_region *skr = skm->skm_region;
	uint32_t objsize, chunks;
	size_t slabsize = skm->skm_slabsize;
	struct skmem_slab *sl;
	struct sksegment *sg, *sgm;
	char *buf, *bufm, *slab, *slabm;

	/*
	 * Allocate a segment (a slab at our layer) from the region.
	 */
	slab = skmem_region_alloc(skr, (void **)&slabm, &sg, &sgm, skmflag);
	if (slab == NULL) {
		goto rg_alloc_failure;
	}

	if ((sl = skmem_cache_alloc(skmem_slab_cache, SKMEM_SLEEP)) == NULL) {
		goto slab_alloc_failure;
	}

	ASSERT(sg != NULL);
	ASSERT(sgm == NULL || sgm->sg_index == sg->sg_index);

	bzero(sl, sizeof(*sl));
	sl->sl_cache = skm;
	sl->sl_base = buf = slab;
	sl->sl_basem = bufm = slabm;
	ASSERT(skr->skr_c_obj_size <= UINT32_MAX);
	objsize = (uint32_t)skr->skr_c_obj_size;
	ASSERT(skm->skm_objsize == objsize);
	ASSERT((slabsize / objsize) <= UINT32_MAX);
	sl->sl_chunks = chunks = (uint32_t)(slabsize / objsize);
	sl->sl_seg = sg;
	sl->sl_segm = sgm;

	/*
	 * Create one or more buffer control structures for the slab,
	 * each one tracking a chunk of raw object from the segment,
	 * and insert these into the slab's list of buffer controls.
	 */
	ASSERT(chunks > 0);
	while (chunks != 0) {
		struct skmem_bufctl *bc;

		bc = skmem_cache_alloc(skmem_bufctl_cache, SKMEM_SLEEP);
		if (bc == NULL) {
			goto bufctl_alloc_failure;
		}

		bzero(bc, bc_size);
		bc->bc_addr = buf;
		bc->bc_addrm = bufm;
		bc->bc_slab = sl;
		bc->bc_idx = (sl->sl_chunks - chunks);
		if (skr->skr_mode & SKR_MODE_SHAREOK) {
			bc->bc_flags |= SKMEM_BUFCTL_SHAREOK;
		}
		SLIST_INSERT_HEAD(&sl->sl_head, bc, bc_link);
		bc->bc_lim = objsize;
		buf += objsize;
		if (bufm != NULL) {
			bufm += objsize;
		}
		--chunks;
	}

	SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx sl 0x%llx",
	    SK_KVA(skm), SK_KVA(sl));
	SK_DF(SK_VERB_MEM_CACHE, " [%u] [0x%llx-0x%llx)", sl->sl_seg->sg_index,
	    SK_KVA(slab), SK_KVA(slab + objsize));

	return sl;

bufctl_alloc_failure:
	skmem_slab_destroy(skm, sl);

slab_alloc_failure:
	skmem_region_free(skr, slab, slabm);

rg_alloc_failure:
	os_atomic_inc(&skm->skm_sl_alloc_fail, relaxed);

	return NULL;
}
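
/*
 * Example of the carving above (hypothetical geometry): a 32 KB segment
 * backing 2 KB objects yields sl_chunks = 16, with one bufctl per chunk
 * whose bc_idx runs from 0 to 15 and whose bc_addr advances by the object
 * size on each iteration of the loop.
 */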

/*
 * Destroy a slab.
 */
static void
skmem_slab_destroy(struct skmem_cache *skm, struct skmem_slab *sl)
{
	struct skmem_bufctl *bc, *tbc;
	void *slab = sl->sl_base;
	void *slabm = sl->sl_basem;

	ASSERT(sl->sl_refcnt == 0);

	SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx sl 0x%llx",
	    SK_KVA(skm), SK_KVA(sl));
	SK_DF(SK_VERB_MEM_CACHE, " [%u] [0x%llx-0x%llx)", sl->sl_seg->sg_index,
	    SK_KVA(slab), SK_KVA((uintptr_t)slab + skm->skm_objsize));

	/*
	 * Go through the slab's list of buffer controls and free
	 * them, and then free the slab itself back to its cache.
	 */
	SLIST_FOREACH_SAFE(bc, &sl->sl_head, bc_link, tbc) {
		SLIST_REMOVE(&sl->sl_head, bc, skmem_bufctl, bc_link);
		skmem_cache_free(skmem_bufctl_cache, bc);
	}
	skmem_cache_free(skmem_slab_cache, sl);

	/* and finally free the segment back to the backing region */
	skmem_region_free(skm->skm_region, slab, slabm);
}

/*
 * Allocate a raw object from the (locked) slab layer. Normal region variant.
 */
static int
skmem_slab_alloc_locked(struct skmem_cache *skm, struct skmem_obj_info *oi,
    struct skmem_obj_info *oim, uint32_t skmflag)
{
	struct skmem_bufctl_bkt *bcb;
	struct skmem_bufctl *bc;
	struct skmem_slab *sl;
	uint32_t retries = 0;
	uint64_t boff_total = 0;	/* in usec */
	uint64_t boff = 0;		/* in msec */
	boolean_t new_slab;
	void *buf;
#if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT)
	vm_offset_t tagged_address;	/* address tagging */
	struct skmem_region *region;	/* region source for this slab */
#endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */

	/* this flag is not for the caller to set */
	VERIFY(!(skmflag & SKMEM_FAILOK));

	/*
	 * A slab is either in a partially-allocated list (at least it has
	 * a free object available), or is in the empty list (everything
	 * has been allocated.) If we can't find a partially-allocated
	 * slab, then we need to allocate a slab (segment) from the region.
	 */
again:
	SKM_SLAB_LOCK_ASSERT_HELD(skm);
	sl = TAILQ_FIRST(&skm->skm_sl_partial_list);
	if (sl == NULL) {
		uint32_t flags = skmflag;
		boolean_t retry;

		ASSERT(skm->skm_sl_partial == 0);
		SKM_SLAB_UNLOCK(skm);
		if (!(flags & SKMEM_NOSLEEP)) {
			/*
			 * Pick up a random value to start the exponential
			 * backoff, if this is the first round, or if the
			 * current value is over the threshold. Otherwise,
			 * double the backoff value.
			 */
			if (boff == 0 || boff > SKMEM_SLAB_BACKOFF_THRES) {
				read_frandom(&boff, sizeof(boff));
				boff = (boff % SKMEM_SLAB_BACKOFF_RANDOM) + 1;
				ASSERT(boff > 0);
			} else if (os_mul_overflow(boff, 2, &boff)) {
				panic_plain("\"%s\": boff counter "
				    "overflows\n", skm->skm_name);
				/* NOTREACHED */
				__builtin_unreachable();
			}
			/* add this value (in msec) to the total (in usec) */
			if (os_add_overflow(boff_total,
			    (boff * NSEC_PER_USEC), &boff_total)) {
				panic_plain("\"%s\": boff_total counter "
				    "overflows\n", skm->skm_name);
				/* NOTREACHED */
				__builtin_unreachable();
			}
		}
		/*
		 * In the event of a race between multiple threads trying
		 * to create the last remaining (or the only) slab, let the
		 * loser(s) attempt to retry after waiting a bit. The winner
		 * would have inserted the newly-created slab into the list.
		 */
		if (!(flags & SKMEM_NOSLEEP) &&
		    boff_total <= SKMEM_SLAB_MAX_BACKOFF) {
			retry = TRUE;
			++retries;
			flags |= SKMEM_FAILOK;
		} else {
			if (!(flags & SKMEM_NOSLEEP)) {
				panic_plain("\"%s\": failed to allocate "
				    "slab (sleeping mode) after %llu "
				    "msec, %u retries\n\n%s", skm->skm_name,
				    (boff_total / NSEC_PER_USEC), retries,
				    skmem_dump(skm->skm_region));
				/* NOTREACHED */
				__builtin_unreachable();
			}
			retry = FALSE;
		}

		/*
		 * Create a new slab.
		 */
		if ((sl = skmem_slab_create(skm, flags)) == NULL) {
			if (retry) {
				SK_ERR("\"%s\": failed to allocate "
				    "slab (%ssleeping mode): waiting for %llu "
				    "msec, total %llu msec, %u retries",
				    skm->skm_name,
				    (flags & SKMEM_NOSLEEP) ? "non-" : "",
				    boff, (boff_total / NSEC_PER_USEC), retries);
				VERIFY(boff > 0 && ((uint32_t)boff <=
				    (SKMEM_SLAB_BACKOFF_THRES * 2)));
				delay((uint32_t)boff * NSEC_PER_USEC);
				SKM_SLAB_LOCK(skm);
				goto again;
			} else {
				SK_RDERR(4, "\"%s\": failed to allocate slab "
				    "(%ssleeping mode)", skm->skm_name,
				    (flags & SKMEM_NOSLEEP) ? "non-" : "");
				SKM_SLAB_LOCK(skm);
			}
			return ENOMEM;
		}

		SKM_SLAB_LOCK(skm);
		skm->skm_sl_create++;
		if ((skm->skm_sl_bufinuse += sl->sl_chunks) >
		    skm->skm_sl_bufmax) {
			skm->skm_sl_bufmax = skm->skm_sl_bufinuse;
		}
	}
	skm->skm_sl_alloc++;

	new_slab = (sl->sl_refcnt == 0);
	ASSERT(new_slab || SKMEM_SLAB_IS_PARTIAL(sl));

	sl->sl_refcnt++;
	ASSERT(sl->sl_refcnt <= sl->sl_chunks);

	/*
	 * We either have a new slab, or a partially-allocated one.
	 * Remove a buffer control from the slab, and insert it to
	 * the allocated-address hash chain.
	 */
	bc = SLIST_FIRST(&sl->sl_head);
	ASSERT(bc != NULL);
	SLIST_REMOVE(&sl->sl_head, bc, skmem_bufctl, bc_link);

	/* sanity check */
	VERIFY(bc->bc_usecnt == 0);

	/*
	 * Also store the master object's region info for the caller.
	 */
	bzero(oi, sizeof(*oi));
#if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT)
	region = sl->sl_cache->skm_region;
	if (region->skr_mode & SKR_MODE_MEMTAG) {
		/*
		 * If this region is configured to be tagged, we generate a
		 * unique tag for the object address, and return this tagged
		 * address to the caller. vm_memtag_assign_tag generates a
		 * unique tag for the given address and size, and
		 * vm_memtag_set_tag commits the tag to the backing memory
		 * metadata. This tagged address is returned back to the client,
		 * and when the client frees the address, we "re-tag" the
		 * address to prevent against use-after-free attacks (more on
		 * this in skmem_cache_batch_free).
		 */
		tagged_address = vm_memtag_assign_tag((vm_offset_t)bc->bc_addr,
		    skm->skm_objsize);
		vm_memtag_set_tag(tagged_address, skm->skm_objsize);
		buf = (void *)tagged_address;
	} else {
		buf = bc->bc_addr;
	}
#else /* !CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */
	buf = bc->bc_addr;
#endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */
	SKMEM_OBJ_ADDR(oi) = buf;
	SKMEM_OBJ_BUFCTL(oi) = bc;	/* master only; NULL for slave */
	ASSERT(skm->skm_objsize <= UINT32_MAX);
	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize;
	SKMEM_OBJ_IDX_REG(oi) =
	    ((sl->sl_seg->sg_index * sl->sl_chunks) + bc->bc_idx);
	SKMEM_OBJ_IDX_SEG(oi) = bc->bc_idx;
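	/*
	 * Illustrative index example (hypothetical numbers): if this bufctl
	 * is chunk 5 of a slab with 16 chunks on segment 2, the region-wide
	 * index above works out to (2 * 16) + 5 = 37, while the
	 * segment-relative index is simply 5.
	 */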
	/*
	 * And for slave object.
	 */
	if (oim != NULL) {
		bzero(oim, sizeof(*oim));
		if (bc->bc_addrm != NULL) {
			SKMEM_OBJ_ADDR(oim) = bc->bc_addrm;
			SKMEM_OBJ_SIZE(oim) = SKMEM_OBJ_SIZE(oi);
			SKMEM_OBJ_IDX_REG(oim) = SKMEM_OBJ_IDX_REG(oi);
			SKMEM_OBJ_IDX_SEG(oim) = SKMEM_OBJ_IDX_SEG(oi);
		}
	}

	if (skm->skm_mode & SKM_MODE_BATCH) {
		((struct skmem_obj *)buf)->mo_next = NULL;
	}

	/* insert to allocated-address hash chain */
	bcb = SKMEM_CACHE_HASH(skm, buf);
	SLIST_INSERT_HEAD(&bcb->bcb_head, bc, bc_link);

	if (SLIST_EMPTY(&sl->sl_head)) {
		/*
		 * If that was the last buffer control from this slab,
		 * insert the slab into the empty list. If it was in
		 * the partially-allocated list, then remove the slab
		 * from there as well.
		 */
		ASSERT(sl->sl_refcnt == sl->sl_chunks);
		if (new_slab) {
			ASSERT(sl->sl_chunks == 1);
		} else {
			ASSERT(sl->sl_chunks > 1);
			ASSERT(skm->skm_sl_partial > 0);
			skm->skm_sl_partial--;
			TAILQ_REMOVE(&skm->skm_sl_partial_list, sl, sl_link);
		}
		skm->skm_sl_empty++;
		ASSERT(skm->skm_sl_empty != 0);
		TAILQ_INSERT_HEAD(&skm->skm_sl_empty_list, sl, sl_link);
	} else {
		/*
		 * The slab is not empty; if it was newly allocated
		 * above, then it's not in the partially-allocated
		 * list and so we insert it there.
		 */
		ASSERT(SKMEM_SLAB_IS_PARTIAL(sl));
		if (new_slab) {
			skm->skm_sl_partial++;
			ASSERT(skm->skm_sl_partial != 0);
			TAILQ_INSERT_HEAD(&skm->skm_sl_partial_list,
			    sl, sl_link);
		}
	}

	/* if auditing is enabled, record this transaction */
	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_bufctl(bc);
	}

	return 0;
}

/*
 * Allocate a raw object from the (locked) slab layer. Pseudo region variant.
 */
static int
skmem_slab_alloc_pseudo_locked(struct skmem_cache *skm,
    struct skmem_obj_info *oi, struct skmem_obj_info *oim, uint32_t skmflag)
{
	zalloc_flags_t zflags = (skmflag & SKMEM_NOSLEEP) ? Z_NOWAIT : Z_WAITOK;
	struct skmem_region *skr = skm->skm_region;
	void *obj, *buf;

	/* this flag is not for the caller to set */
	VERIFY(!(skmflag & SKMEM_FAILOK));

	SKM_SLAB_LOCK_ASSERT_HELD(skm);

	ASSERT(skr->skr_reg == NULL && skr->skr_zreg != NULL);
	/* mirrored region is not applicable */
	ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
	/* batching is not yet supported */
	ASSERT(!(skm->skm_mode & SKM_MODE_BATCH));

	if ((obj = zalloc_flags(skr->skr_zreg, zflags | Z_ZERO)) == NULL) {
		os_atomic_inc(&skm->skm_sl_alloc_fail, relaxed);
		return ENOMEM;
	}

#if KASAN
	/*
	 * Perform some fix-ups since the zone element isn't guaranteed
	 * to be on the aligned boundary. The effective object size
	 * has been adjusted accordingly by skmem_region_create() earlier
	 * at cache creation time.
	 *
	 * 'buf' is the aligned address for this object.
	 */
	buf = (void *)P2ROUNDUP((intptr_t)obj + sizeof(u_int64_t),
	    skm->skm_bufalign);

	/*
	 * Wind back a pointer size from the aligned address and
	 * save the original address so we can free it later.
	 */
	void **pbuf = (void **)((intptr_t)buf - sizeof(void *));
	*pbuf = obj;

	VERIFY(((intptr_t)buf + skm->skm_bufsize) <=
	    ((intptr_t)obj + skm->skm_objsize));
#else /* !KASAN */
	/*
	 * We expect that the zone allocator would allocate elements
	 * rounded up to the requested alignment based on the effective
	 * object size computed in skmem_region_create() earlier, and
	 * 'buf' is therefore the element address itself.
	 */
	buf = obj;
#endif /* !KASAN */

	/* make sure the object is aligned */
	VERIFY(IS_P2ALIGNED(buf, skm->skm_bufalign));

	/*
	 * Return the object's info to the caller.
	 */
	bzero(oi, sizeof(*oi));
	SKMEM_OBJ_ADDR(oi) = buf;
	ASSERT(skm->skm_objsize <= UINT32_MAX);
	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize;
	if (oim != NULL) {
		bzero(oim, sizeof(*oim));
	}

	skm->skm_sl_alloc++;
	skm->skm_sl_bufinuse++;
	if (skm->skm_sl_bufinuse > skm->skm_sl_bufmax) {
		skm->skm_sl_bufmax = skm->skm_sl_bufinuse;
	}

	return 0;
}

/*
 * Allocate a raw object from the slab layer.
 */
static int
skmem_slab_alloc(struct skmem_cache *skm, struct skmem_obj_info *oi,
    struct skmem_obj_info *oim, uint32_t skmflag)
{
	int err;

	SKM_SLAB_LOCK(skm);
	err = skm->skm_slab_alloc(skm, oi, oim, skmflag);
	SKM_SLAB_UNLOCK(skm);

	return err;
}

/*
 * Allocate raw object(s) from the slab layer.
 */
static uint32_t
skmem_slab_batch_alloc(struct skmem_cache *skm, struct skmem_obj **list,
    uint32_t num, uint32_t skmflag)
{
	uint32_t need = num;

	ASSERT(list != NULL && (skm->skm_mode & SKM_MODE_BATCH));
	*list = NULL;

	SKM_SLAB_LOCK(skm);
	for (;;) {
		struct skmem_obj_info oi, oim;

		/*
		 * Get a single raw object from the slab layer.
		 */
		if (skm->skm_slab_alloc(skm, &oi, &oim, skmflag) != 0) {
			break;
		}

		*list = SKMEM_OBJ_ADDR(&oi);
		ASSERT((*list)->mo_next == NULL);
		/* store these inside the object itself */
		(*list)->mo_info = oi;
		(*list)->mo_minfo = oim;
		list = &(*list)->mo_next;

		ASSERT(need != 0);
		if (--need == 0) {
			break;
		}
	}
	SKM_SLAB_UNLOCK(skm);

	return num - need;
}
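
/*
 * The returned objects are chained through mo_next; an internal caller could
 * walk them as sketched below (illustrative only; "want", "got" and "obj"
 * are hypothetical):
 *
 *	struct skmem_obj *list, *obj;
 *	uint32_t got = skmem_slab_batch_alloc(skm, &list, want, SKMEM_SLEEP);
 *
 *	for (obj = list; obj != NULL; obj = obj->mo_next) {
 *		... use obj; its mo_info was filled in above ...
 *	}
 */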

/*
 * Free a raw object to the (locked) slab layer. Normal region variant.
 */
static void
skmem_slab_free_locked(struct skmem_cache *skm, void *buf)
{
	struct skmem_bufctl *bc, *tbc;
	struct skmem_bufctl_bkt *bcb;
	struct skmem_slab *sl = NULL;
#if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT)
	struct skmem_region *region;
	vm_offset_t tagged_addr;
	/*
	 * If buf is tagged, then addr would have the canonicalized address.
	 * If buf is untagged, then addr is same as buf.
	 */
	void *addr = (void *)vm_memtag_canonicalize_address((vm_offset_t)buf);
#endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */

	SKM_SLAB_LOCK_ASSERT_HELD(skm);
	ASSERT(buf != NULL);
	/* caller is expected to clear mo_next */
	ASSERT(!(skm->skm_mode & SKM_MODE_BATCH) ||
	    ((struct skmem_obj *)buf)->mo_next == NULL);

	/*
	 * Search the hash chain to find a matching buffer control for the
	 * given object address. If found, remove the buffer control from
	 * the hash chain and insert it into the freelist. Otherwise, we
	 * panic since the caller has given us a bogus address.
	 */
	skm->skm_sl_free++;
	bcb = SKMEM_CACHE_HASH(skm, buf);

#if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT)
	/*
	 * If this region is configured to tag memory addresses, then buf is a
	 * tagged address. When we search for the buffer control from the hash
	 * table, we need to use the untagged address, because buffer control
	 * maintains untagged address (bc_addr). vm_memtag_canonicalize_address
	 * returns the untagged address.
	 */
	SLIST_FOREACH_SAFE(bc, &bcb->bcb_head, bc_link, tbc) {
		if (bc->bc_addr == addr) {
			SLIST_REMOVE(&bcb->bcb_head, bc, skmem_bufctl, bc_link);
			sl = bc->bc_slab;
			break;
		}
	}
#else /* !CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */
	SLIST_FOREACH_SAFE(bc, &bcb->bcb_head, bc_link, tbc) {
		if (bc->bc_addr == buf) {
			SLIST_REMOVE(&bcb->bcb_head, bc, skmem_bufctl, bc_link);
			sl = bc->bc_slab;
			break;
		}
	}
#endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */

	if (bc == NULL) {
		panic("%s: attempt to free invalid or already-freed obj %p "
		    "on skm %p", __func__, buf, skm);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	ASSERT(sl != NULL && sl->sl_cache == skm);

#if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT)
	/*
	 * We use untagged address here, because SKMEM_SLAB_MEMBER compares the
	 * address against sl_base, which is untagged.
	 */
	VERIFY(SKMEM_SLAB_MEMBER(sl, addr));
#else /* !CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */
	VERIFY(SKMEM_SLAB_MEMBER(sl, buf));
#endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */

	/* make sure this object is not currently in use by another object */
	VERIFY(bc->bc_usecnt == 0);

	/* if auditing is enabled, record this transaction */
	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_bufctl(bc);
	}

	/* if clear on free is requested, zero out the object */
	if (skm->skm_mode & SKM_MODE_CLEARONFREE) {
		bzero(buf, skm->skm_objsize);
	}

#if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT)
	/*
	 * If this region is configured to tag memory addresses, we re-tag this
	 * address as the object is freed. We do the re-tagging in the magazine
	 * layer too, but in case we need to free raw objects to the slab layer
	 * (either because SKM_MODE_NOMAGAZINES is set, or the magazine layer
	 * was not able to allocate empty magazines), we re-tag the addresses
	 * here in the slab layer. Freeing to the slab layer is symmetrical to
	 * allocating from the slab layer - when we allocate from slab layer, we
	 * tag the address, and then construct the object; when we free to the
	 * slab layer, we destruct the object, and retag the address.
	 * We do the re-tagging here, because this is right after the last usage
	 * of the buf variable (which is tagged).
	 */
	region = skm->skm_region;
	if (region->skr_mode & SKR_MODE_MEMTAG) {
		tagged_addr = vm_memtag_assign_tag((vm_offset_t)buf,
		    skm->skm_objsize);
		vm_memtag_set_tag(tagged_addr, skm->skm_objsize);
	}
#endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */

	/* insert the buffer control to the slab's freelist */
	SLIST_INSERT_HEAD(&sl->sl_head, bc, bc_link);

	ASSERT(sl->sl_refcnt >= 1);
	if (--sl->sl_refcnt == 0) {
		/*
		 * If this was the last outstanding object for the slab,
		 * remove the slab from the partially-allocated or empty
		 * list, and destroy the slab (segment) back to the region.
		 */
		if (sl->sl_chunks == 1) {
			ASSERT(skm->skm_sl_empty > 0);
			skm->skm_sl_empty--;
			TAILQ_REMOVE(&skm->skm_sl_empty_list, sl, sl_link);
		} else {
			ASSERT(skm->skm_sl_partial > 0);
			skm->skm_sl_partial--;
			TAILQ_REMOVE(&skm->skm_sl_partial_list, sl, sl_link);
		}
		ASSERT((int64_t)(skm->skm_sl_bufinuse - sl->sl_chunks) >= 0);
		skm->skm_sl_bufinuse -= sl->sl_chunks;
		skm->skm_sl_destroy++;
		SKM_SLAB_UNLOCK(skm);
		skmem_slab_destroy(skm, sl);
		SKM_SLAB_LOCK(skm);
		return;
	}

	ASSERT(bc == SLIST_FIRST(&sl->sl_head));
	if (SLIST_NEXT(bc, bc_link) == NULL) {
		/*
		 * If this is the first (potentially amongst many) object
		 * that's returned to the slab, remove the slab from the
		 * empty list and insert to end of the partially-allocated
		 * list. This should help avoid thrashing the partial slab
		 * since we avoid disturbing what's already at the front.
		 */
		ASSERT(sl->sl_refcnt == (sl->sl_chunks - 1));
		ASSERT(sl->sl_chunks > 1);
		ASSERT(skm->skm_sl_empty > 0);
		skm->skm_sl_empty--;
		TAILQ_REMOVE(&skm->skm_sl_empty_list, sl, sl_link);
		skm->skm_sl_partial++;
		ASSERT(skm->skm_sl_partial != 0);
		TAILQ_INSERT_TAIL(&skm->skm_sl_partial_list, sl, sl_link);
	}
}
1566
1567/*
1568 * Free a raw object to the (locked) slab layer. Pseudo region variant.
1569 */
1570static void
1571skmem_slab_free_pseudo_locked(struct skmem_cache *skm, void *buf)
1572{
1573 struct skmem_region *skr = skm->skm_region;
1574 void *obj = buf;
1575
1576 ASSERT(skr->skr_reg == NULL && skr->skr_zreg != NULL);
1577
1578 SKM_SLAB_LOCK_ASSERT_HELD(skm);
1579
1580 VERIFY(IS_P2ALIGNED(obj, skm->skm_bufalign));
1581
1582#if KASAN
1583 /*
1584 * Since we stuffed the original zone element address before
1585 * the buffer address in KASAN mode, get it back since we're
1586 * about to free it.
1587 */
1588 void **pbuf = (void **)((intptr_t)obj - sizeof(void *));
1589
1590 VERIFY(((intptr_t)obj + skm->skm_bufsize) <=
1591 ((intptr_t)*pbuf + skm->skm_objsize));
1592
1593 obj = *pbuf;
1594#endif /* KASAN */
1595
1596 /* free it to zone */
1597 zfree(skr->skr_zreg, obj);
1598
1599 skm->skm_sl_free++;
1600 ASSERT(skm->skm_sl_bufinuse > 0);
1601 skm->skm_sl_bufinuse--;
1602}
1603
1604/*
1605 * Free a raw object to the slab layer.
1606 */
1607static void
1608skmem_slab_free(struct skmem_cache *skm, void *buf)
1609{
1610 if (skm->skm_mode & SKM_MODE_BATCH) {
1611 ((struct skmem_obj *)buf)->mo_next = NULL;
1612 }
1613
1614 SKM_SLAB_LOCK(skm);
1615 skm->skm_slab_free(skm, buf);
1616 SKM_SLAB_UNLOCK(skm);
1617}
1618
1619/*
1620 * Free raw object(s) to the slab layer.
1621 */
1622static void
1623skmem_slab_batch_free(struct skmem_cache *skm, struct skmem_obj *list)
1624{
1625 struct skmem_obj *listn;
1626
1627 ASSERT(list != NULL && (skm->skm_mode & SKM_MODE_BATCH));
1628
1629 SKM_SLAB_LOCK(skm);
1630 for (;;) {
1631 listn = list->mo_next;
1632 list->mo_next = NULL;
1633
1634 /*
1635 * Free a single object to the slab layer.
1636 */
1637 skm->skm_slab_free(skm, (void *)list);
1638
1639 /* if no more objects to free, we're done */
1640 if ((list = listn) == NULL) {
1641 break;
1642 }
1643 }
1644 SKM_SLAB_UNLOCK(skm);
1645}
1646
1647/*
1648 * Return the object's region info.
1649 */
1650void
1651skmem_cache_get_obj_info(struct skmem_cache *skm, void *buf,
1652 struct skmem_obj_info *oi, struct skmem_obj_info *oim)
1653{
1654 struct skmem_bufctl_bkt *bcb;
1655 struct skmem_bufctl *bc;
1656 struct skmem_slab *sl;
1657
1658 /*
1659 * Search the hash chain to find a matching buffer control for the
1660 * given object address. If not found, panic since the caller has
1661 * given us a bogus address.
1662 */
1663 SKM_SLAB_LOCK(skm);
1664 bcb = SKMEM_CACHE_HASH(skm, buf);
1665 SLIST_FOREACH(bc, &bcb->bcb_head, bc_link) {
1666 if (bc->bc_addr == buf) {
1667 break;
1668 }
1669 }
1670
1671 if (__improbable(bc == NULL)) {
1672 panic("%s: %s failed to get object info for %p",
1673 __func__, skm->skm_name, buf);
1674 /* NOTREACHED */
1675 __builtin_unreachable();
1676 }
1677
1678 /*
1679 * Return the master object's info to the caller.
1680 */
1681 sl = bc->bc_slab;
1682 SKMEM_OBJ_ADDR(oi) = bc->bc_addr;
1683 SKMEM_OBJ_BUFCTL(oi) = bc; /* master only; NULL for slave */
1684 ASSERT(skm->skm_objsize <= UINT32_MAX);
1685 SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize;
1686 SKMEM_OBJ_IDX_REG(oi) =
1687 (sl->sl_seg->sg_index * sl->sl_chunks) + bc->bc_idx;
1688 SKMEM_OBJ_IDX_SEG(oi) = bc->bc_idx;
1689 /*
1690 * And for slave object.
1691 */
1692 if (oim != NULL) {
1693		bzero(oim, sizeof(*oim));
1694 if (bc->bc_addrm != NULL) {
1695 SKMEM_OBJ_ADDR(oim) = bc->bc_addrm;
1696 SKMEM_OBJ_SIZE(oim) = oi->oi_size;
1697 SKMEM_OBJ_IDX_REG(oim) = oi->oi_idx_reg;
1698 SKMEM_OBJ_IDX_SEG(oim) = oi->oi_idx_seg;
1699 }
1700 }
1701 SKM_SLAB_UNLOCK(skm);
1702}
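
/*
 * Illustrative example of the index computation above (hypothetical
 * numbers, not taken from this file): with sl_chunks == 64 objects per
 * segment, sg_index == 5 and bc_idx == 2, the info returned would be
 *
 *	SKMEM_OBJ_IDX_SEG(oi) == 2
 *	SKMEM_OBJ_IDX_REG(oi) == (5 * 64) + 2 == 322
 *
 * i.e. the object is the 3rd one within its segment and the 323rd
 * (0-based index 322) within the region as a whole.
 */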
1703
1704/*
1705 * Magazine constructor.
1706 */
1707static int
1708skmem_magazine_ctor(struct skmem_obj_info *oi, struct skmem_obj_info *oim,
1709 void *arg, uint32_t skmflag)
1710{
1711#pragma unused(oim, skmflag)
1712 struct skmem_mag *mg = SKMEM_OBJ_ADDR(oi);
1713
1714 ASSERT(oim == NULL);
1715 ASSERT(arg != NULL);
1716
1717 /*
1718 * Store it in the magazine object since we'll
1719 * need to refer to it during magazine destroy;
1720 * we can't safely refer to skm_magtype as the
1721 * depot lock may not be acquired then.
1722 */
1723 mg->mg_magtype = arg;
1724
1725 return 0;
1726}
1727
1728/*
1729 * Destroy a magazine (free each object to the slab layer).
1730 */
1731static void
1732skmem_magazine_destroy(struct skmem_cache *skm, struct skmem_mag *mg,
1733 int nrounds)
1734{
1735 int round;
1736
1737 for (round = 0; round < nrounds; round++) {
1738 void *buf = mg->mg_round[round];
1739 struct skmem_obj *next;
1740
1741 if (skm->skm_mode & SKM_MODE_BATCH) {
1742 next = ((struct skmem_obj *)buf)->mo_next;
1743 ((struct skmem_obj *)buf)->mo_next = NULL;
1744 }
1745
1746 /* deconstruct the object */
1747 if (skm->skm_dtor != NULL) {
1748 skm->skm_dtor(buf, skm->skm_private);
1749 }
1750
1751 /*
1752 * In non-batching mode, each object in the magazine has
1753		 * no linkage to its neighbor, so free each individual
1754		 * object to the slab layer now.
1755 */
1756 if (!(skm->skm_mode & SKM_MODE_BATCH)) {
1757 skmem_slab_free(skm, buf);
1758 } else {
1759 ((struct skmem_obj *)buf)->mo_next = next;
1760 }
1761 }
1762
1763 /*
1764 * In batching mode, each object is linked to its neighbor at free
1765 * time, and so take the bottom-most object and free it to the slab
1766 * layer. Because of the way the list is reversed during free, this
1767	 * will bring along the rest of the objects above it.
1768 */
1769 if (nrounds > 0 && (skm->skm_mode & SKM_MODE_BATCH)) {
1770		skmem_slab_batch_free(skm, mg->mg_round[nrounds - 1]);
1771 }
1772
1773 /* free the magazine itself back to cache */
1774 skmem_cache_free(mg->mg_magtype->mt_cache, mg);
1775}
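
/*
 * Illustrative note (derived from the batch-free path later in this
 * file, not part of the original comments): in SKM_MODE_BATCH mode the
 * free path links each object placed into a magazine to the one placed
 * before it, so that
 *
 *	mg_round[i]->mo_next == mg_round[i - 1]	for i > 0
 *	mg_round[0]->mo_next == NULL
 *
 * which is why passing mg_round[nrounds - 1] to skmem_slab_batch_free()
 * above is enough to release every object held in the magazine.
 */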
1776
1777/*
1778 * Get one or more magazines from the depot.
1779 */
1780static uint32_t
1781skmem_depot_batch_alloc(struct skmem_cache *skm, struct skmem_maglist *ml,
1782 uint32_t *count, struct skmem_mag **list, uint32_t num)
1783{
1784 SLIST_HEAD(, skmem_mag) mg_list = SLIST_HEAD_INITIALIZER(mg_list);
1785 struct skmem_mag *mg;
1786 uint32_t need = num, c = 0;
1787
1788 ASSERT(list != NULL && need > 0);
1789
1790 if (!SKM_DEPOT_LOCK_TRY(skm)) {
1791 /*
1792 * Track the amount of lock contention here; if the contention
1793		 * level is high (more than skmem_cache_depot_contention within
1794		 * a given skmem_cache_update_interval period), then we treat
1795 * it as a sign that the per-CPU layer is not using the right
1796 * magazine type, and that we'd need to resize it.
1797 */
1798 SKM_DEPOT_LOCK(skm);
1799 if (skm->skm_mode & SKM_MODE_DYNAMIC) {
1800 skm->skm_depot_contention++;
1801 }
1802 }
1803
1804 while ((mg = SLIST_FIRST(&ml->ml_list)) != NULL) {
1805 SLIST_REMOVE_HEAD(&ml->ml_list, mg_link);
1806 SLIST_INSERT_HEAD(&mg_list, mg, mg_link);
1807 ASSERT(ml->ml_total != 0);
1808 if (--ml->ml_total < ml->ml_min) {
1809 ml->ml_min = ml->ml_total;
1810 }
1811 c++;
1812 ml->ml_alloc++;
1813 if (--need == 0) {
1814 break;
1815 }
1816 }
1817 *count -= c;
1818
1819 SKM_DEPOT_UNLOCK(skm);
1820
1821 *list = SLIST_FIRST(&mg_list);
1822
1823 return num - need;
1824}
1825
1826/*
1827 * Return one or more magazines to the depot.
1828 */
1829static void
1830skmem_depot_batch_free(struct skmem_cache *skm, struct skmem_maglist *ml,
1831 uint32_t *count, struct skmem_mag *mg)
1832{
1833 struct skmem_mag *nmg;
1834 uint32_t c = 0;
1835
1836 SKM_DEPOT_LOCK(skm);
1837 while (mg != NULL) {
1838 nmg = SLIST_NEXT(mg, mg_link);
1839 SLIST_INSERT_HEAD(&ml->ml_list, mg, mg_link);
1840 ml->ml_total++;
1841 c++;
1842 mg = nmg;
1843 }
1844 *count += c;
1845 SKM_DEPOT_UNLOCK(skm);
1846}
1847
1848/*
1849 * Update the depot's working state statistics.
1850 */
1851static void
1852skmem_depot_ws_update(struct skmem_cache *skm)
1853{
1854 SKM_DEPOT_LOCK_SPIN(skm);
1855 skm->skm_full.ml_reaplimit = skm->skm_full.ml_min;
1856 skm->skm_full.ml_min = skm->skm_full.ml_total;
1857 skm->skm_empty.ml_reaplimit = skm->skm_empty.ml_min;
1858 skm->skm_empty.ml_min = skm->skm_empty.ml_total;
1859 SKM_DEPOT_UNLOCK(skm);
1860}
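
/*
 * Illustrative example of the working-set logic (hypothetical numbers):
 * suppose skm_full.ml_total is 10 and the depot never dipped below 6
 * full magazines since the previous update (ml_min == 6).  This update
 * sets ml_reaplimit to 6 and resets ml_min to 10.  If the next interval
 * again leaves at least 6 magazines untouched, skmem_depot_ws_reap()
 * computes MIN(ml_reaplimit, ml_min) == 6 and destroys those 6, since
 * they sat idle across two consecutive update intervals.
 */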
1861
1862/*
1863 * Empty the depot's working state statistics (everything's reapable).
1864 */
1865static void
1866skmem_depot_ws_zero(struct skmem_cache *skm)
1867{
1868 SKM_DEPOT_LOCK_SPIN(skm);
1869 if (skm->skm_full.ml_reaplimit != skm->skm_full.ml_total ||
1870 skm->skm_full.ml_min != skm->skm_full.ml_total ||
1871 skm->skm_empty.ml_reaplimit != skm->skm_empty.ml_total ||
1872 skm->skm_empty.ml_min != skm->skm_empty.ml_total) {
1873 skm->skm_full.ml_reaplimit = skm->skm_full.ml_total;
1874 skm->skm_full.ml_min = skm->skm_full.ml_total;
1875 skm->skm_empty.ml_reaplimit = skm->skm_empty.ml_total;
1876 skm->skm_empty.ml_min = skm->skm_empty.ml_total;
1877 skm->skm_depot_ws_zero++;
1878 }
1879 SKM_DEPOT_UNLOCK(skm);
1880}
1881
1882/*
1883 * Reap magazines that are outside of the working set.
1884 */
1885static void
1886skmem_depot_ws_reap(struct skmem_cache *skm)
1887{
1888 struct skmem_mag *mg, *nmg;
1889 uint32_t f, e, reap;
1890
1891 reap = f = MIN(skm->skm_full.ml_reaplimit, skm->skm_full.ml_min);
1892 if (reap != 0) {
1893		(void) skmem_depot_batch_alloc(skm, &skm->skm_full,
1894		    &skm->skm_depot_full, &mg, reap);
1895 while (mg != NULL) {
1896 nmg = SLIST_NEXT(mg, mg_link);
1897 SLIST_NEXT(mg, mg_link) = NULL;
1898 skmem_magazine_destroy(skm, mg,
1899			    mg->mg_magtype->mt_magsize);
1900 mg = nmg;
1901 }
1902 }
1903
1904 reap = e = MIN(skm->skm_empty.ml_reaplimit, skm->skm_empty.ml_min);
1905 if (reap != 0) {
1906		(void) skmem_depot_batch_alloc(skm, &skm->skm_empty,
1907		    &skm->skm_depot_empty, &mg, reap);
1908 while (mg != NULL) {
1909 nmg = SLIST_NEXT(mg, mg_link);
1910 SLIST_NEXT(mg, mg_link) = NULL;
1911			skmem_magazine_destroy(skm, mg, 0);
1912 mg = nmg;
1913 }
1914 }
1915
1916 if (f != 0 || e != 0) {
1917 os_atomic_inc(&skm->skm_cpu_mag_reap, relaxed);
1918 }
1919}
1920
1921/*
1922 * Performs periodic maintenance on a cache. This is serialized
1923 * through the update thread call, and so we guarantee there's at
1924 * most one update episode in the system at any given time.
1925 */
1926static void
1927skmem_cache_update(struct skmem_cache *skm, uint32_t arg)
1928{
1929#pragma unused(arg)
1930 boolean_t resize_mag = FALSE;
1931 boolean_t rescale_hash = FALSE;
1932
1933 SKMEM_CACHE_LOCK_ASSERT_HELD();
1934
1935 /* insist that we are executing in the update thread call context */
1936 ASSERT(sk_is_cache_update_protected());
1937
1938 /*
1939 * If the cache has become much larger or smaller than the
1940 * allocated-address hash table, rescale the hash table.
1941 */
1942 SKM_SLAB_LOCK(skm);
1943 if ((skm->skm_sl_bufinuse > (skm->skm_hash_mask << 1) &&
1944 (skm->skm_hash_mask + 1) < skm->skm_hash_limit) ||
1945 (skm->skm_sl_bufinuse < (skm->skm_hash_mask >> 1) &&
1946 skm->skm_hash_mask > skm->skm_hash_initial)) {
1947 rescale_hash = TRUE;
1948 }
1949 SKM_SLAB_UNLOCK(skm);
1950
1951 /*
1952 * Update the working set.
1953 */
1954 skmem_depot_ws_update(skm);
1955
1956 /*
1957 * If the contention count is greater than the threshold during
1958 * the update interval, and if we are not already at the maximum
1959 * magazine size, increase it.
1960 */
1961 SKM_DEPOT_LOCK_SPIN(skm);
1962 if (skm->skm_chunksize < skm->skm_magtype->mt_maxbuf &&
1963 (int)(skm->skm_depot_contention - skm->skm_depot_contention_prev) >
1964 skmem_cache_depot_contention) {
1965 ASSERT(skm->skm_mode & SKM_MODE_DYNAMIC);
1966 resize_mag = TRUE;
1967 }
1968 skm->skm_depot_contention_prev = skm->skm_depot_contention;
1969 SKM_DEPOT_UNLOCK(skm);
1970
1971 if (rescale_hash) {
1972 skmem_cache_hash_rescale(skm);
1973 }
1974
1975 if (resize_mag) {
1976 skmem_cache_magazine_resize(skm);
1977 }
1978}
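
/*
 * Illustrative example of the rescale trigger above (hypothetical
 * numbers): with skm_hash_mask == 1023 (1024 buckets), the table is
 * grown once skm_sl_bufinuse exceeds (1023 << 1) == 2046 objects
 * (subject to skm_hash_limit), and shrunk once it drops below
 * (1023 >> 1) == 511 objects (provided the table is still larger
 * than its initial size).
 */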
1979
1980/*
1981 * Reload the CPU's magazines with mg and its follower (if any).
1982 */
1983static void
1984skmem_cpu_batch_reload(struct skmem_cpu_cache *cp, struct skmem_mag *mg,
1985 int rounds)
1986{
1987 ASSERT((cp->cp_loaded == NULL && cp->cp_rounds == -1) ||
1988 (cp->cp_loaded && cp->cp_rounds + rounds == cp->cp_magsize));
1989 ASSERT(cp->cp_magsize > 0);
1990
1991 cp->cp_loaded = mg;
1992 cp->cp_rounds = rounds;
1993 if (__probable(SLIST_NEXT(mg, mg_link) != NULL)) {
1994 cp->cp_ploaded = SLIST_NEXT(mg, mg_link);
1995 cp->cp_prounds = rounds;
1996 SLIST_NEXT(mg, mg_link) = NULL;
1997 } else {
1998 ASSERT(SLIST_NEXT(mg, mg_link) == NULL);
1999 cp->cp_ploaded = NULL;
2000 cp->cp_prounds = -1;
2001 }
2002}
2003
2004/*
2005 * Reload the CPU's magazine with mg and save the previous one.
2006 */
2007static void
2008skmem_cpu_reload(struct skmem_cpu_cache *cp, struct skmem_mag *mg, int rounds)
2009{
2010 ASSERT((cp->cp_loaded == NULL && cp->cp_rounds == -1) ||
2011 (cp->cp_loaded && cp->cp_rounds + rounds == cp->cp_magsize));
2012 ASSERT(cp->cp_magsize > 0);
2013
2014 cp->cp_ploaded = cp->cp_loaded;
2015 cp->cp_prounds = cp->cp_rounds;
2016 cp->cp_loaded = mg;
2017 cp->cp_rounds = rounds;
2018}
2019
2020/*
2021 * Allocate a constructed object from the cache.
2022 */
2023void *
2024skmem_cache_alloc(struct skmem_cache *skm, uint32_t skmflag)
2025{
2026 struct skmem_obj *buf;
2027
2028	(void) skmem_cache_batch_alloc(skm, &buf, 1, skmflag);
2029 return buf;
2030}
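
/*
 * Minimal usage sketch (illustrative only; the cache handle and the
 * function name below are hypothetical and not part of this file):
 */
#if 0
static void
skmem_cache_alloc_example(struct skmem_cache *my_skm)
{
	void *obj;

	/* may return NULL with SKMEM_NOSLEEP when memory is tight */
	obj = skmem_cache_alloc(my_skm, SKMEM_NOSLEEP);
	if (obj == NULL) {
		return;
	}

	/* ... use the constructed object ... */

	/* return it to the magazine (or slab) layer */
	skmem_cache_free(my_skm, obj);
}
#endif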
2031
2032/*
2033 * Allocate constructed object(s) from the cache.
2034 */
2035uint32_t
2036skmem_cache_batch_alloc(struct skmem_cache *skm, struct skmem_obj **list,
2037 uint32_t num, uint32_t skmflag)
2038{
2039 struct skmem_cpu_cache *cp = SKMEM_CPU_CACHE(skm);
2040 struct skmem_obj **top = &(*list);
2041 struct skmem_mag *mg;
2042 uint32_t need = num;
2043
2044 ASSERT(list != NULL);
2045 *list = NULL;
2046
2047 if (need == 0) {
2048 return 0;
2049 }
2050 ASSERT(need == 1 || (skm->skm_mode & SKM_MODE_BATCH));
2051
2052 SKM_CPU_LOCK(cp);
2053 for (;;) {
2054 /*
2055 * If we have an object in the current CPU's loaded
2056 * magazine, return it and we're done.
2057 */
2058 if (cp->cp_rounds > 0) {
2059 int objs = MIN((unsigned int)cp->cp_rounds, need);
2060 /*
2061			 * In the SKM_MODE_BATCH case, the objects in the magazine
2062			 * are already linked, with the most recently freed one
2063 * at the head of the list; grab as many objects as we
2064 * can. Otherwise we'll just grab 1 object at most.
2065 */
2066 *list = cp->cp_loaded->mg_round[cp->cp_rounds - 1];
2067 cp->cp_rounds -= objs;
2068 cp->cp_alloc += objs;
2069
2070 if (skm->skm_mode & SKM_MODE_BATCH) {
2071 struct skmem_obj *tail =
2072 cp->cp_loaded->mg_round[cp->cp_rounds];
2073 list = &tail->mo_next;
2074 *list = NULL;
2075 }
2076
2077 /* if we got them all, return to caller */
2078 if ((need -= objs) == 0) {
2079 SKM_CPU_UNLOCK(cp);
2080 goto done;
2081 }
2082 }
2083
2084 /*
2085 * The CPU's loaded magazine is empty. If the previously
2086 * loaded magazine was full, exchange and try again.
2087 */
2088 if (cp->cp_prounds > 0) {
2089			skmem_cpu_reload(cp, cp->cp_ploaded, cp->cp_prounds);
2090 continue;
2091 }
2092
2093 /*
2094 * If the magazine layer is disabled, allocate from slab.
2095 * This can happen either because SKM_MODE_NOMAGAZINES is
2096 * set, or because we are resizing the magazine now.
2097 */
2098 if (cp->cp_magsize == 0) {
2099 break;
2100 }
2101
2102 /*
2103 * Both of the CPU's magazines are empty; try to get
2104 * full magazine(s) from the depot layer. Upon success,
2105 * reload and try again. To prevent potential thrashing,
2106 * replace both empty magazines only if the requested
2107 * count exceeds a magazine's worth of objects.
2108 */
2109		(void) skmem_depot_batch_alloc(skm, &skm->skm_full,
2110		    &skm->skm_depot_full, &mg, (need <= cp->cp_magsize) ? 1 : 2);
2111 if (mg != NULL) {
2112 SLIST_HEAD(, skmem_mag) mg_list =
2113 SLIST_HEAD_INITIALIZER(mg_list);
2114
2115 if (cp->cp_ploaded != NULL) {
2116 SLIST_INSERT_HEAD(&mg_list, cp->cp_ploaded,
2117 mg_link);
2118 }
2119 if (SLIST_NEXT(mg, mg_link) == NULL) {
2120 /*
2121 * Depot allocation returns only 1 magazine;
2122 * retain current empty magazine.
2123 */
2124				skmem_cpu_reload(cp, mg, cp->cp_magsize);
2125 } else {
2126 /*
2127 * We got 2 full magazines from depot;
2128 * release the current empty magazine
2129 * back to the depot layer.
2130 */
2131 if (cp->cp_loaded != NULL) {
2132 SLIST_INSERT_HEAD(&mg_list,
2133 cp->cp_loaded, mg_link);
2134 }
2135				skmem_cpu_batch_reload(cp, mg, cp->cp_magsize);
2136 }
2137			skmem_depot_batch_free(skm, &skm->skm_empty,
2138			    &skm->skm_depot_empty, SLIST_FIRST(&mg_list));
2139 continue;
2140 }
2141
2142 /*
2143 * The depot layer doesn't have any full magazines;
2144 * allocate directly from the slab layer.
2145 */
2146 break;
2147 }
2148 SKM_CPU_UNLOCK(cp);
2149
2150 if (__probable(num > 1 && (skm->skm_mode & SKM_MODE_BATCH) != 0)) {
2151 struct skmem_obj *rtop, *rlist, *rlistp = NULL;
2152 uint32_t rlistc, c = 0;
2153
2154 /*
2155 * Get a list of raw objects from the slab layer.
2156 */
2157		rlistc = skmem_slab_batch_alloc(skm, &rlist, need, skmflag);
2158 ASSERT(rlistc == 0 || rlist != NULL);
2159 rtop = rlist;
2160
2161 /*
2162 * Construct each object in the raw list. Upon failure,
2163 * free any remaining objects in the list back to the slab
2164 * layer, and keep the ones that were successfully constructed.
2165 * Here, "oi" and "oim" in each skmem_obj refer to the objects
2166 * coming from the master and slave regions (on mirrored
2167 * regions), respectively. They are stored inside the object
2168 * temporarily so that we can pass them to the constructor.
2169 */
2170 while (skm->skm_ctor != NULL && rlist != NULL) {
2171 struct skmem_obj_info *oi = &rlist->mo_info;
2172 struct skmem_obj_info *oim = &rlist->mo_minfo;
2173 struct skmem_obj *rlistn = rlist->mo_next;
2174
2175 /*
2176			 * Note that the constructor is guaranteed to preserve
2177			 * only the pointer-sized linkage at the top of the
2178			 * object and nothing beyond that. That means we must
2179			 * not refer to "oi" and "oim" any longer after the
2180			 * object goes through the constructor.
2181 */
2182 if (skm->skm_ctor(oi, ((SKMEM_OBJ_ADDR(oim) != NULL) ?
2183 oim : NULL), skm->skm_private, skmflag) != 0) {
2184 VERIFY(rlist->mo_next == rlistn);
2185 os_atomic_add(&skm->skm_sl_alloc_fail,
2186 rlistc - c, relaxed);
2187 if (rlistp != NULL) {
2188 rlistp->mo_next = NULL;
2189 }
2190 if (rlist == rtop) {
2191 rtop = NULL;
2192 ASSERT(c == 0);
2193 }
2194				skmem_slab_batch_free(skm, rlist);
2195 rlist = NULL;
2196 rlistc = c;
2197 break;
2198 }
2199 VERIFY(rlist->mo_next == rlistn);
2200
2201 ++c; /* # of constructed objs */
2202 rlistp = rlist;
2203 if ((rlist = rlist->mo_next) == NULL) {
2204 ASSERT(rlistc == c);
2205 break;
2206 }
2207 }
2208
2209 /*
2210 * At this point "top" points to the head of the chain we're
2211 * going to return to caller; "list" points to the tail of that
2212 * chain. The second chain begins at "rtop", and we append
2213 * that after "list" to form a single chain. "rlistc" is the
2214 * number of objects in "rtop" originated from the slab layer
2215 * that have been successfully constructed (if applicable).
2216 */
2217 ASSERT(c == 0 || rtop != NULL);
2218 need -= rlistc;
2219 *list = rtop;
2220 } else {
2221 struct skmem_obj_info oi, oim;
2222 void *buf;
2223
2224 ASSERT(*top == NULL && num == 1 && need == 1);
2225
2226 /*
2227 * Get a single raw object from the slab layer.
2228 */
2229		if (skmem_slab_alloc(skm, &oi, &oim, skmflag) != 0) {
2230 goto done;
2231 }
2232
2233 buf = SKMEM_OBJ_ADDR(&oi);
2234 ASSERT(buf != NULL);
2235
2236 /*
2237 * Construct the raw object. Here, "oi" and "oim" refer to
2238 * the objects coming from the master and slave regions (on
2239 * mirrored regions), respectively.
2240 */
2241 if (skm->skm_ctor != NULL &&
2242 skm->skm_ctor(&oi, ((SKMEM_OBJ_ADDR(&oim) != NULL) ?
2243 &oim : NULL), skm->skm_private, skmflag) != 0) {
2244 os_atomic_inc(&skm->skm_sl_alloc_fail, relaxed);
2245 skmem_slab_free(skm, buf);
2246 goto done;
2247 }
2248
2249 need = 0;
2250 *list = buf;
2251 ASSERT(!(skm->skm_mode & SKM_MODE_BATCH) ||
2252 (*list)->mo_next == NULL);
2253 }
2254
2255done:
2256 /* if auditing is enabled, record this transaction */
2257 if (__improbable(*top != NULL &&
2258 (skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
2259 skmem_audit_buf(skm, *top);
2260 }
2261
2262 return num - need;
2263}
2264
2265/*
2266 * Free a constructed object to the cache.
2267 */
2268void
2269skmem_cache_free(struct skmem_cache *skm, void *buf)
2270{
2271 if (skm->skm_mode & SKM_MODE_BATCH) {
2272 ((struct skmem_obj *)buf)->mo_next = NULL;
2273 }
2274 skmem_cache_batch_free(skm, (struct skmem_obj *)buf);
2275}
2276
2277void
2278skmem_cache_batch_free(struct skmem_cache *skm, struct skmem_obj *list)
2279{
2280 struct skmem_cpu_cache *cp = SKMEM_CPU_CACHE(skm);
2281 struct skmem_magtype *mtp;
2282 struct skmem_mag *mg;
2283 struct skmem_obj *listn;
2284#if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT)
2285 vm_offset_t tagged_address; /* address tagging */
2286 struct skmem_region *region; /* region source for this cache */
2287#endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */
2288
2289 /* if auditing is enabled, record this transaction */
2290 if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
2291 skmem_audit_buf(skm, list);
2292 }
2293
2294 SKM_CPU_LOCK(cp);
2295 for (;;) {
2296 /*
2297 * If there's an available space in the current CPU's
2298 * loaded magazine, place it there and we're done.
2299 */
2300 if ((unsigned int)cp->cp_rounds <
2301 (unsigned int)cp->cp_magsize) {
2302 /*
2303 * In the SKM_MODE_BATCH case, reverse the list
2304 * while we place each object into the magazine;
2305 * this effectively causes the most recently
2306 * freed object to be reused during allocation.
2307 */
2308 if (skm->skm_mode & SKM_MODE_BATCH) {
2309 listn = list->mo_next;
2310 list->mo_next = (cp->cp_rounds == 0) ? NULL :
2311 cp->cp_loaded->mg_round[cp->cp_rounds - 1];
2312 } else {
2313 listn = NULL;
2314 }
2315#if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT)
2316 /*
2317 * If this region is configured to be tagged, we re-tag
2318 * the address that's being freed, to protect against
2319 * use-after-free bugs. This "re-tagged" address will
2320 * reside in the CPU's loaded magazine, and when cache
2321 * alloc is called, it is returned to client as is. At
2322 * this point, we know that this object will be freed to
2323 * the CPU's loaded magazine and not down to the slab
2324 * layer, so we won't be double tagging the same address
2325 * in the magazine layer and slab layer.
2326 */
2327 region = skm->skm_region;
2328 if (region->skr_mode & SKR_MODE_MEMTAG) {
2329 tagged_address = vm_memtag_assign_tag(
2330 (vm_offset_t)list, skm->skm_objsize);
2331 vm_memtag_set_tag(tagged_address,
2332 skm->skm_objsize);
2333 cp->cp_loaded->mg_round[cp->cp_rounds++] =
2334 (void *)tagged_address;
2335 } else {
2336 cp->cp_loaded->mg_round[cp->cp_rounds++] = list;
2337 }
2338#else /* !CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */
2339 cp->cp_loaded->mg_round[cp->cp_rounds++] = list;
2340#endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */
2341 cp->cp_free++;
2342
2343 if ((list = listn) != NULL) {
2344 continue;
2345 }
2346
2347 SKM_CPU_UNLOCK(cp);
2348 return;
2349 }
2350
2351 /*
2352 * The loaded magazine is full. If the previously
2353 * loaded magazine was empty, exchange and try again.
2354 */
2355 if (cp->cp_prounds == 0) {
2356			skmem_cpu_reload(cp, cp->cp_ploaded, cp->cp_prounds);
2357 continue;
2358 }
2359
2360 /*
2361 * If the magazine layer is disabled, free to slab.
2362 * This can happen either because SKM_MODE_NOMAGAZINES
2363 * is set, or because we are resizing the magazine now.
2364 */
2365 if (cp->cp_magsize == 0) {
2366 break;
2367 }
2368
2369 /*
2370 * Both magazines for the CPU are full; try to get
2371 * empty magazine(s) from the depot. If we get one,
2372 * exchange a full magazine with it and place the
2373 * object in there.
2374 *
2375 * TODO: Because the caller currently doesn't indicate
2376 * the number of objects in the list, we choose the more
2377 * conservative approach of allocating only 1 empty
2378 * magazine (to prevent potential thrashing). Once we
2379 * have the object count, we can replace 1 with similar
2380 * logic as used in skmem_cache_batch_alloc().
2381 */
2382		(void) skmem_depot_batch_alloc(skm, &skm->skm_empty,
2383		    &skm->skm_depot_empty, &mg, 1);
2384 if (mg != NULL) {
2385 SLIST_HEAD(, skmem_mag) mg_list =
2386 SLIST_HEAD_INITIALIZER(mg_list);
2387
2388 if (cp->cp_ploaded != NULL) {
2389 SLIST_INSERT_HEAD(&mg_list, cp->cp_ploaded,
2390 mg_link);
2391 }
2392 if (SLIST_NEXT(mg, mg_link) == NULL) {
2393 /*
2394 * Depot allocation returns only 1 magazine;
2395 * retain current full magazine.
2396 */
2397				skmem_cpu_reload(cp, mg, 0);
2398 } else {
2399 /*
2400 * We got 2 empty magazines from depot;
2401 * release the current full magazine back
2402 * to the depot layer.
2403 */
2404 if (cp->cp_loaded != NULL) {
2405 SLIST_INSERT_HEAD(&mg_list,
2406 cp->cp_loaded, mg_link);
2407 }
2408				skmem_cpu_batch_reload(cp, mg, 0);
2409 }
2410			skmem_depot_batch_free(skm, &skm->skm_full,
2411			    &skm->skm_depot_full, SLIST_FIRST(&mg_list));
2412 continue;
2413 }
2414
2415 /*
2416 * We can't get any empty magazine from the depot, and
2417 * so we need to allocate one. If the allocation fails,
2418 * just fall through, deconstruct and free the object
2419 * to the slab layer.
2420 */
2421 mtp = skm->skm_magtype;
2422 SKM_CPU_UNLOCK(cp);
2423		mg = skmem_cache_alloc(mtp->mt_cache, SKMEM_NOSLEEP);
2424 SKM_CPU_LOCK(cp);
2425
2426 if (mg != NULL) {
2427 /*
2428 * We allocated an empty magazine, but since we
2429			 * dropped the CPU lock above, the magazine size
2430			 * may have changed. If that's the case, free
2431 * the magazine and try again.
2432 */
2433 if (cp->cp_magsize != mtp->mt_magsize) {
2434 SKM_CPU_UNLOCK(cp);
2435				skmem_cache_free(mtp->mt_cache, mg);
2436 SKM_CPU_LOCK(cp);
2437 continue;
2438 }
2439
2440 /*
2441 * We have a magazine with the right size;
2442 * add it to the depot and try again.
2443 */
2444 ASSERT(SLIST_NEXT(mg, mg_link) == NULL);
2445			skmem_depot_batch_free(skm, &skm->skm_empty,
2446			    &skm->skm_depot_empty, mg);
2447 continue;
2448 }
2449
2450 /*
2451 * We can't get an empty magazine, so free to slab.
2452 */
2453 break;
2454 }
2455 SKM_CPU_UNLOCK(cp);
2456
2457 /*
2458 * We weren't able to free the constructed object(s) to the
2459 * magazine layer, so deconstruct them and free to the slab.
2460 */
2461 if (__probable((skm->skm_mode & SKM_MODE_BATCH) &&
2462 list->mo_next != NULL)) {
2463 /* whatever is left from original list */
2464 struct skmem_obj *top = list;
2465
2466 while (list != NULL && skm->skm_dtor != NULL) {
2467 listn = list->mo_next;
2468 list->mo_next = NULL;
2469
2470 /* deconstruct the object */
2471 if (skm->skm_dtor != NULL) {
2472 skm->skm_dtor((void *)list, skm->skm_private);
2473 }
2474
2475 list->mo_next = listn;
2476 list = listn;
2477 }
2478
2479		skmem_slab_batch_free(skm, top);
2480 } else {
2481 /* deconstruct the object */
2482 if (skm->skm_dtor != NULL) {
2483 skm->skm_dtor((void *)list, skm->skm_private);
2484 }
2485
2486		skmem_slab_free(skm, (void *)list);
2487 }
2488}
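
/*
 * Minimal batch usage sketch (illustrative only; assumes a cache that
 * was created with SKM_MODE_BATCH; names below are hypothetical):
 */
#if 0
static void
skmem_cache_batch_example(struct skmem_cache *my_skm)
{
	struct skmem_obj *list, *obj;
	uint32_t got;

	/* may return fewer objects than requested, or none at all */
	got = skmem_cache_batch_alloc(my_skm, &list, 32, SKMEM_NOSLEEP);
	if (got == 0) {
		return;
	}

	/*
	 * The returned objects are chained through their leading
	 * mo_next pointers; detach each object before using it, since
	 * using the object may overwrite that linkage.  An untouched
	 * chain could instead be returned wholesale with
	 * skmem_cache_batch_free().
	 */
	while (list != NULL) {
		obj = list;
		list = list->mo_next;
		obj->mo_next = NULL;
		/* ... use obj ... */
		skmem_cache_free(my_skm, obj);
	}
}
#endif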
2489
2490/*
2491 * Return the maximum number of objects cached at the magazine layer
2492 * based on the chunk size. This takes into account the starting
2493 * magazine type as well as the final magazine type used in resizing.
2494 */
2495uint32_t
2496skmem_cache_magazine_max(uint32_t chunksize)
2497{
2498 struct skmem_magtype *mtp;
2499 uint32_t magsize_max;
2500
2501 VERIFY(ncpu != 0);
2502 VERIFY(chunksize > 0);
2503
2504 /* find a suitable magazine type for this chunk size */
2505 for (mtp = skmem_magtype; chunksize <= mtp->mt_minbuf; mtp++) {
2506 continue;
2507 }
2508
2509 /* and find the last magazine type */
2510 for (;;) {
2511 magsize_max = mtp->mt_magsize;
2512 if (mtp == skmem_cache_magsize_last ||
2513 chunksize >= mtp->mt_maxbuf) {
2514 break;
2515 }
2516 ++mtp;
2517 VERIFY(mtp <= skmem_cache_magsize_last);
2518 }
2519
2520 return ncpu * magsize_max * 2; /* two magazines per CPU */
2521}
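
/*
 * Illustrative example (hypothetical numbers): on a system with
 * ncpu == 8, if the chunk size selects a magazine type that can grow
 * to 64 rounds, this returns 8 * 64 * 2 == 1024, i.e. the most objects
 * that could be cached in the loaded and previously-loaded magazines
 * across all CPUs.
 */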
2522
2523/*
2524 * Return true if SKMEM_DEBUG_NOMAGAZINES is not set on skmem_debug.
2525 */
2526boolean_t
2527skmem_allow_magazines(void)
2528{
2529 return !(skmem_debug & SKMEM_DEBUG_NOMAGAZINES);
2530}
2531
2532/*
2533 * Purge all magazines from a cache and disable its per-CPU magazines layer.
2534 */
2535static void
2536skmem_cache_magazine_purge(struct skmem_cache *skm)
2537{
2538 struct skmem_cpu_cache *cp;
2539 struct skmem_mag *mg, *pmg;
2540 int rounds, prounds;
2541 uint32_t cpuid, mg_cnt = 0, pmg_cnt = 0;
2542
2543 SKM_SLAB_LOCK_ASSERT_NOTHELD(skm);
2544
2545 SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx", SK_KVA(skm));
2546
2547 for (cpuid = 0; cpuid < ncpu; cpuid++) {
2548 cp = &skm->skm_cpu_cache[cpuid];
2549
2550 SKM_CPU_LOCK_SPIN(cp);
2551 mg = cp->cp_loaded;
2552 pmg = cp->cp_ploaded;
2553 rounds = cp->cp_rounds;
2554 prounds = cp->cp_prounds;
2555 cp->cp_loaded = NULL;
2556 cp->cp_ploaded = NULL;
2557 cp->cp_rounds = -1;
2558 cp->cp_prounds = -1;
2559 cp->cp_magsize = 0;
2560 SKM_CPU_UNLOCK(cp);
2561
2562 if (mg != NULL) {
2563			skmem_magazine_destroy(skm, mg, rounds);
2564 ++mg_cnt;
2565 }
2566 if (pmg != NULL) {
2567			skmem_magazine_destroy(skm, pmg, prounds);
2568 ++pmg_cnt;
2569 }
2570 }
2571
2572 if (mg_cnt != 0 || pmg_cnt != 0) {
2573 os_atomic_inc(&skm->skm_cpu_mag_purge, relaxed);
2574 }
2575
2576 skmem_depot_ws_zero(skm);
2577 skmem_depot_ws_reap(skm);
2578}
2579
2580/*
2581 * Enable magazines on a cache. Must only be called on a cache with
2582 * its per-CPU magazines layer disabled (e.g. due to purge).
2583 */
2584static void
2585skmem_cache_magazine_enable(struct skmem_cache *skm, uint32_t arg)
2586{
2587#pragma unused(arg)
2588 struct skmem_cpu_cache *cp;
2589 uint32_t cpuid;
2590
2591 if (skm->skm_mode & SKM_MODE_NOMAGAZINES) {
2592 return;
2593 }
2594
2595 for (cpuid = 0; cpuid < ncpu; cpuid++) {
2596 cp = &skm->skm_cpu_cache[cpuid];
2597 SKM_CPU_LOCK_SPIN(cp);
2598 /* the magazines layer must be disabled at this point */
2599 ASSERT(cp->cp_loaded == NULL);
2600 ASSERT(cp->cp_ploaded == NULL);
2601 ASSERT(cp->cp_rounds == -1);
2602 ASSERT(cp->cp_prounds == -1);
2603 ASSERT(cp->cp_magsize == 0);
2604 cp->cp_magsize = skm->skm_magtype->mt_magsize;
2605 SKM_CPU_UNLOCK(cp);
2606 }
2607
2608 SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx chunksize %u magsize %d",
2609 SK_KVA(skm), (uint32_t)skm->skm_chunksize,
2610 SKMEM_CPU_CACHE(skm)->cp_magsize);
2611}
2612
2613/*
2614 * Enter the cache resize perimeter. Upon success, claim exclusivity
2615 * on the perimeter and return 0, else EBUSY. Caller may indicate
2616 * whether or not they're willing to wait.
2617 */
2618static int
2619skmem_cache_resize_enter(struct skmem_cache *skm, boolean_t can_sleep)
2620{
2621 SKM_RESIZE_LOCK(skm);
2622 if (skm->skm_rs_owner == current_thread()) {
2623 ASSERT(skm->skm_rs_busy != 0);
2624 skm->skm_rs_busy++;
2625 goto done;
2626 }
2627 if (!can_sleep) {
2628 if (skm->skm_rs_busy != 0) {
2629 SKM_RESIZE_UNLOCK(skm);
2630 return EBUSY;
2631 }
2632 } else {
2633 while (skm->skm_rs_busy != 0) {
2634 skm->skm_rs_want++;
2635			(void) assert_wait(&skm->skm_rs_busy, THREAD_UNINT);
2636 SKM_RESIZE_UNLOCK(skm);
2637 (void) thread_block(THREAD_CONTINUE_NULL);
2638 SK_DF(SK_VERB_MEM_CACHE, "waited for skm \"%s\" "
2639 "(0x%llx) busy=%u", skm->skm_name,
2640 SK_KVA(skm), skm->skm_rs_busy);
2641 SKM_RESIZE_LOCK(skm);
2642 }
2643 }
2644 SKM_RESIZE_LOCK_ASSERT_HELD(skm);
2645 ASSERT(skm->skm_rs_busy == 0);
2646 skm->skm_rs_busy++;
2647 skm->skm_rs_owner = current_thread();
2648done:
2649 SKM_RESIZE_UNLOCK(skm);
2650 return 0;
2651}
2652
2653/*
2654 * Exit the cache resize perimeter and unblock any waiters.
2655 */
2656static void
2657skmem_cache_resize_exit(struct skmem_cache *skm)
2658{
2659 uint32_t want;
2660
2661 SKM_RESIZE_LOCK(skm);
2662 ASSERT(skm->skm_rs_busy != 0);
2663 ASSERT(skm->skm_rs_owner == current_thread());
2664 if (--skm->skm_rs_busy == 0) {
2665 skm->skm_rs_owner = NULL;
2666 /*
2667 * We're done; notify anyone that has lost the race.
2668 */
2669 if ((want = skm->skm_rs_want) != 0) {
2670 skm->skm_rs_want = 0;
2671			wakeup((void *)&skm->skm_rs_busy);
2672 SKM_RESIZE_UNLOCK(skm);
2673 } else {
2674 SKM_RESIZE_UNLOCK(skm);
2675 }
2676 } else {
2677 SKM_RESIZE_UNLOCK(skm);
2678 }
2679}
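
/*
 * Typical use of the resize perimeter (illustrative sketch only; this
 * mirrors the purge/re-enable pattern used elsewhere in this file):
 */
#if 0
	if (skmem_cache_resize_enter(skm, FALSE) == 0) {
		/* exclusive: safe to tear down and rebuild per-CPU state */
		skmem_cache_magazine_purge(skm);
		skmem_cache_magazine_enable(skm, 0);
		skmem_cache_resize_exit(skm);
	}
#endif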
2680
2681/*
2682 * Recompute a cache's magazine size. This is an expensive operation
2683 * and should not be done frequently; larger magazines provide for a
2684 * higher transfer rate with the depot while smaller magazines reduce
2685 * the memory consumption.
2686 */
2687static void
2688skmem_cache_magazine_resize(struct skmem_cache *skm)
2689{
2690 struct skmem_magtype *mtp = skm->skm_magtype;
2691
2692 /* insist that we are executing in the update thread call context */
2693 ASSERT(sk_is_cache_update_protected());
2694 ASSERT(!(skm->skm_mode & SKM_MODE_NOMAGAZINES));
2695 /* depot contention only applies to dynamic mode */
2696 ASSERT(skm->skm_mode & SKM_MODE_DYNAMIC);
2697
2698 /*
2699 * Although we're executing in the context of the update thread
2700 * call, we need to protect the per-CPU states during resizing
2701 * against other synchronous cache purge/reenable requests that
2702 * could take place in parallel.
2703 */
2704 if (skm->skm_chunksize < mtp->mt_maxbuf) {
2705 (void) skmem_cache_resize_enter(skm, TRUE);
2706 skmem_cache_magazine_purge(skm);
2707
2708 /*
2709 * Upgrade to the next magazine type with larger size.
2710 */
2711 SKM_DEPOT_LOCK_SPIN(skm);
2712 skm->skm_cpu_mag_resize++;
2713 skm->skm_magtype = ++mtp;
2714 skm->skm_cpu_mag_size = skm->skm_magtype->mt_magsize;
2715 skm->skm_depot_contention_prev =
2716 skm->skm_depot_contention + INT_MAX;
2717 SKM_DEPOT_UNLOCK(skm);
2718
2719		skmem_cache_magazine_enable(skm, 0);
2720 skmem_cache_resize_exit(skm);
2721 }
2722}
2723
2724/*
2725 * Rescale the cache's allocated-address hash table.
2726 */
2727static void
2728skmem_cache_hash_rescale(struct skmem_cache *skm)
2729{
2730 struct skmem_bufctl_bkt *old_table, *new_table;
2731 size_t old_size, new_size;
2732 uint32_t i, moved = 0;
2733
2734 /* insist that we are executing in the update thread call context */
2735 ASSERT(sk_is_cache_update_protected());
2736
2737 /*
2738 * To get small average lookup time (lookup depth near 1.0), the hash
2739	 * table size should be roughly the same as (though not
2740	 * necessarily equal to) the cache size.
2741 */
2742 new_size = MAX(skm->skm_hash_initial,
2743 (1 << (flsll(3 * skm->skm_sl_bufinuse + 4) - 2)));
2744 new_size = MIN(skm->skm_hash_limit, new_size);
2745 old_size = (skm->skm_hash_mask + 1);
2746
2747 if ((old_size >> 1) <= new_size && new_size <= (old_size << 1)) {
2748 return;
2749 }
2750
2751 new_table = sk_alloc_type_array(struct skmem_bufctl_bkt, new_size,
2752 Z_NOWAIT, skmem_tag_bufctl_hash);
2753 if (__improbable(new_table == NULL)) {
2754 return;
2755 }
2756
2757 for (i = 0; i < new_size; i++) {
2758 SLIST_INIT(&new_table[i].bcb_head);
2759 }
2760
2761 SKM_SLAB_LOCK(skm);
2762
2763 old_size = (skm->skm_hash_mask + 1);
2764 old_table = skm->skm_hash_table;
2765
2766 skm->skm_hash_mask = (new_size - 1);
2767 skm->skm_hash_table = new_table;
2768 skm->skm_sl_rescale++;
2769
2770 for (i = 0; i < old_size; i++) {
2771 struct skmem_bufctl_bkt *bcb = &old_table[i];
2772 struct skmem_bufctl_bkt *new_bcb;
2773 struct skmem_bufctl *bc;
2774
2775 while ((bc = SLIST_FIRST(&bcb->bcb_head)) != NULL) {
2776 SLIST_REMOVE_HEAD(&bcb->bcb_head, bc_link);
2777 new_bcb = SKMEM_CACHE_HASH(skm, bc->bc_addr);
2778 /*
2779			 * Ideally we want to insert at the tail here, but a
2780			 * simple list doesn't give us that. The fact that we
2781			 * are essentially reversing the order is not a big
2782			 * deal here vis-a-vis the new table size.
2783 */
2784 SLIST_INSERT_HEAD(&new_bcb->bcb_head, bc, bc_link);
2785 ++moved;
2786 }
2787 ASSERT(SLIST_EMPTY(&bcb->bcb_head));
2788 }
2789
2790 SK_DF(SK_VERB_MEM_CACHE,
2791 "skm 0x%llx old_size %u new_size %u [%u moved]", SK_KVA(skm),
2792 (uint32_t)old_size, (uint32_t)new_size, moved);
2793
2794 SKM_SLAB_UNLOCK(skm);
2795
2796 sk_free_type_array(struct skmem_bufctl_bkt, old_size, old_table);
2797}
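
/*
 * Illustrative example of the sizing formula above (hypothetical
 * numbers): with skm_sl_bufinuse == 1000, 3 * 1000 + 4 == 3004 and
 * flsll(3004) == 12, so the candidate size is 1 << (12 - 2) == 1024
 * buckets (roughly one bucket per allocated object), which is then
 * clamped to the [skm_hash_initial, skm_hash_limit] range.
 */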
2798
2799/*
2800 * Apply a function to operate on all caches.
2801 */
2802static void
2803skmem_cache_applyall(void (*func)(struct skmem_cache *, uint32_t), uint32_t arg)
2804{
2805 struct skmem_cache *skm;
2806
2807 net_update_uptime();
2808
2809 SKMEM_CACHE_LOCK();
2810 TAILQ_FOREACH(skm, &skmem_cache_head, skm_link) {
2811 func(skm, arg);
2812 }
2813 SKMEM_CACHE_UNLOCK();
2814}
2815
2816/*
2817 * Reclaim unused memory from a cache.
2818 */
2819static void
2820skmem_cache_reclaim(struct skmem_cache *skm, uint32_t lowmem)
2821{
2822 /*
2823 * Inform the owner to free memory if possible; the reclaim
2824 * policy is left to the owner. This is just an advisory.
2825 */
2826 if (skm->skm_reclaim != NULL) {
2827 skm->skm_reclaim(skm->skm_private);
2828 }
2829
2830 if (lowmem) {
2831 /*
2832 * If another thread is in the process of purging or
2833 * resizing, bail out and let the currently-ongoing
2834 * purging take its natural course.
2835 */
2836 if (skmem_cache_resize_enter(skm, FALSE) == 0) {
2837 skmem_cache_magazine_purge(skm);
2838			skmem_cache_magazine_enable(skm, 0);
2839 skmem_cache_resize_exit(skm);
2840 }
2841 } else {
2842 skmem_depot_ws_reap(skm);
2843 }
2844}
2845
2846/*
2847 * Thread call callback for reap.
2848 */
2849static void
2850skmem_cache_reap_func(thread_call_param_t dummy, thread_call_param_t arg)
2851{
2852#pragma unused(dummy)
2853 void (*func)(void) = arg;
2854
2855 ASSERT(func == skmem_cache_reap_start || func == skmem_cache_reap_done);
2856 func();
2857}
2858
2859/*
2860 * Start reaping all caches; this is serialized via thread call.
2861 */
2862static void
2863skmem_cache_reap_start(void)
2864{
2865 SK_DF(SK_VERB_MEM_CACHE, "now running");
2866	skmem_cache_applyall(skmem_cache_reclaim, skmem_lowmem_check());
2867	skmem_dispatch(skmem_cache_reap_tc, skmem_cache_reap_done,
2868 (skmem_cache_update_interval * NSEC_PER_SEC));
2869}
2870
2871/*
2872 * Stop reaping; this would allow another reap request to occur.
2873 */
2874static void
2875skmem_cache_reap_done(void)
2876{
2877 volatile uint32_t *flag = &skmem_cache_reaping;
2878
2879 *flag = 0;
2880 os_atomic_thread_fence(seq_cst);
2881}
2882
2883/*
2884 * Immediately reap all unused memory of a cache. If purging,
2885 * also purge the cached objects at the CPU layer.
2886 */
2887void
2888skmem_cache_reap_now(struct skmem_cache *skm, boolean_t purge)
2889{
2890	/* if SKM_MODE_RECLAIM flag is set for this cache, we purge */
2891 if (purge || (skm->skm_mode & SKM_MODE_RECLAIM)) {
2892 /*
2893 * If another thread is in the process of purging or
2894 * resizing, bail out and let the currently-ongoing
2895 * purging take its natural course.
2896 */
2897 if (skmem_cache_resize_enter(skm, FALSE) == 0) {
2898 skmem_cache_magazine_purge(skm);
2899			skmem_cache_magazine_enable(skm, 0);
2900 skmem_cache_resize_exit(skm);
2901 }
2902 } else {
2903 skmem_depot_ws_zero(skm);
2904 skmem_depot_ws_reap(skm);
2905
2906 /* clean up cp_ploaded magazines from each CPU */
2907 SKM_SLAB_LOCK_ASSERT_NOTHELD(skm);
2908
2909 struct skmem_cpu_cache *cp;
2910 struct skmem_mag *pmg;
2911 int prounds;
2912 uint32_t cpuid;
2913
2914 for (cpuid = 0; cpuid < ncpu; cpuid++) {
2915 cp = &skm->skm_cpu_cache[cpuid];
2916
2917 SKM_CPU_LOCK_SPIN(cp);
2918 pmg = cp->cp_ploaded;
2919 prounds = cp->cp_prounds;
2920
2921 cp->cp_ploaded = NULL;
2922 cp->cp_prounds = -1;
2923 SKM_CPU_UNLOCK(cp);
2924
2925 if (pmg != NULL) {
2926				skmem_magazine_destroy(skm, pmg, prounds);
2927 }
2928 }
2929 }
2930}
2931
2932/*
2933 * Request a global reap operation to be dispatched.
2934 */
2935void
2936skmem_cache_reap(void)
2937{
2938 /* only one reaping episode is allowed at a time */
2939 if (skmem_lock_owner == current_thread() ||
2940 !os_atomic_cmpxchg(&skmem_cache_reaping, 0, 1, acq_rel)) {
2941 return;
2942 }
2943
2944	skmem_dispatch(skmem_cache_reap_tc, skmem_cache_reap_start, 0);
2945}
2946
2947/*
2948 * Reap internal caches.
2949 */
2950void
2951skmem_reap_caches(boolean_t purge)
2952{
2953	skmem_cache_reap_now(skmem_slab_cache, purge);
2954	skmem_cache_reap_now(skmem_bufctl_cache, purge);
2955
2956 /* packet buffer pool objects */
2957 pp_reap_caches(purge);
2958
2959 /* also handle the region cache(s) */
2960 skmem_region_reap_caches(purge);
2961}
2962
2963/*
2964 * Thread call callback for update.
2965 */
2966static void
2967skmem_cache_update_func(thread_call_param_t dummy, thread_call_param_t arg)
2968{
2969#pragma unused(dummy, arg)
2970 sk_protect_t protect;
2971
2972 protect = sk_cache_update_protect();
2973	skmem_cache_applyall(skmem_cache_update, 0);
2974 sk_cache_update_unprotect(protect);
2975
2976 skmem_dispatch(skmem_cache_update_tc, NULL,
2977 (skmem_cache_update_interval * NSEC_PER_SEC));
2978}
2979
2980/*
2981 * Given a buffer control, record the current transaction.
2982 */
2983__attribute__((noinline, cold, not_tail_called))
2984static inline void
2985skmem_audit_bufctl(struct skmem_bufctl *bc)
2986{
2987 struct skmem_bufctl_audit *bca = (struct skmem_bufctl_audit *)bc;
2988 struct timeval tv;
2989
2990	microuptime(&tv);
2991 bca->bc_thread = current_thread();
2992 bca->bc_timestamp = (uint32_t)((tv.tv_sec * 1000) + (tv.tv_usec / 1000));
2993	bca->bc_depth = OSBacktrace(bca->bc_stack, SKMEM_STACK_DEPTH);
2994}
2995
2996/*
2997 * Given an object, find its buffer control and record the transaction.
2998 */
2999__attribute__((noinline, cold, not_tail_called))
3000static inline void
3001skmem_audit_buf(struct skmem_cache *skm, struct skmem_obj *list)
3002{
3003 struct skmem_bufctl_bkt *bcb;
3004 struct skmem_bufctl *bc;
3005
3006 ASSERT(!(skm->skm_mode & SKM_MODE_PSEUDO));
3007
3008 SKM_SLAB_LOCK(skm);
3009 while (list != NULL) {
3010 void *buf = list;
3011
3012 bcb = SKMEM_CACHE_HASH(skm, buf);
3013 SLIST_FOREACH(bc, &bcb->bcb_head, bc_link) {
3014 if (bc->bc_addr == buf) {
3015 break;
3016 }
3017 }
3018
3019 if (__improbable(bc == NULL)) {
3020 panic("%s: %s failed to get bufctl for %p",
3021 __func__, skm->skm_name, buf);
3022 /* NOTREACHED */
3023 __builtin_unreachable();
3024 }
3025
3026 skmem_audit_bufctl(bc);
3027
3028 if (!(skm->skm_mode & SKM_MODE_BATCH)) {
3029 break;
3030 }
3031
3032 list = list->mo_next;
3033 }
3034 SKM_SLAB_UNLOCK(skm);
3035}
3036
3037static size_t
3038skmem_cache_mib_get_stats(struct skmem_cache *skm, void *out, size_t len)
3039{
3040 size_t actual_space = sizeof(struct sk_stats_cache);
3041 struct sk_stats_cache *sca = out;
3042 int contention;
3043
3044 if (out == NULL || len < actual_space) {
3045 goto done;
3046 }
3047
3048	bzero(sca, sizeof(*sca));
3049	(void) snprintf(sca->sca_name, sizeof(sca->sca_name), "%s",
3050	    skm->skm_name);
3051	uuid_copy(sca->sca_uuid, skm->skm_uuid);
3052	uuid_copy(sca->sca_ruuid, skm->skm_region->skr_uuid);
3053 sca->sca_mode = skm->skm_mode;
3054 sca->sca_bufsize = (uint64_t)skm->skm_bufsize;
3055 sca->sca_objsize = (uint64_t)skm->skm_objsize;
3056 sca->sca_chunksize = (uint64_t)skm->skm_chunksize;
3057 sca->sca_slabsize = (uint64_t)skm->skm_slabsize;
3058 sca->sca_bufalign = (uint64_t)skm->skm_bufalign;
3059 sca->sca_objalign = (uint64_t)skm->skm_objalign;
3060
3061 sca->sca_cpu_mag_size = skm->skm_cpu_mag_size;
3062 sca->sca_cpu_mag_resize = skm->skm_cpu_mag_resize;
3063 sca->sca_cpu_mag_purge = skm->skm_cpu_mag_purge;
3064 sca->sca_cpu_mag_reap = skm->skm_cpu_mag_reap;
3065 sca->sca_depot_full = skm->skm_depot_full;
3066 sca->sca_depot_empty = skm->skm_depot_empty;
3067 sca->sca_depot_ws_zero = skm->skm_depot_ws_zero;
3068 /* in case of a race this might be a negative value, turn it into 0 */
3069 if ((contention = (int)(skm->skm_depot_contention -
3070 skm->skm_depot_contention_prev)) < 0) {
3071 contention = 0;
3072 }
3073 sca->sca_depot_contention_factor = contention;
3074
3075 sca->sca_cpu_rounds = 0;
3076 sca->sca_cpu_prounds = 0;
3077 for (int cpuid = 0; cpuid < ncpu; cpuid++) {
3078 struct skmem_cpu_cache *ccp = &skm->skm_cpu_cache[cpuid];
3079
3080 SKM_CPU_LOCK(ccp);
3081 if (ccp->cp_rounds > -1) {
3082 sca->sca_cpu_rounds += ccp->cp_rounds;
3083 }
3084 if (ccp->cp_prounds > -1) {
3085 sca->sca_cpu_prounds += ccp->cp_prounds;
3086 }
3087 SKM_CPU_UNLOCK(ccp);
3088 }
3089
3090 sca->sca_sl_create = skm->skm_sl_create;
3091 sca->sca_sl_destroy = skm->skm_sl_destroy;
3092 sca->sca_sl_alloc = skm->skm_sl_alloc;
3093 sca->sca_sl_free = skm->skm_sl_free;
3094 sca->sca_sl_alloc_fail = skm->skm_sl_alloc_fail;
3095 sca->sca_sl_partial = skm->skm_sl_partial;
3096 sca->sca_sl_empty = skm->skm_sl_empty;
3097 sca->sca_sl_bufinuse = skm->skm_sl_bufinuse;
3098 sca->sca_sl_rescale = skm->skm_sl_rescale;
3099 sca->sca_sl_hash_size = (skm->skm_hash_mask + 1);
3100
3101done:
3102 return actual_space;
3103}
3104
3105static int
3106skmem_cache_mib_get_sysctl SYSCTL_HANDLER_ARGS
3107{
3108#pragma unused(arg1, arg2, oidp)
3109 struct skmem_cache *skm;
3110 size_t actual_space;
3111 size_t buffer_space;
3112 size_t allocated_space;
3113 caddr_t buffer = NULL;
3114 caddr_t scan;
3115 int error = 0;
3116
3117	if (!kauth_cred_issuser(kauth_cred_get())) {
3118 return EPERM;
3119 }
3120
3121 net_update_uptime();
3122 buffer_space = req->oldlen;
3123 if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
3124 if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
3125 buffer_space = SK_SYSCTL_ALLOC_MAX;
3126 }
3127 allocated_space = buffer_space;
3128 buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_cache_mib);
3129 if (__improbable(buffer == NULL)) {
3130 return ENOBUFS;
3131 }
3132 } else if (req->oldptr == USER_ADDR_NULL) {
3133 buffer_space = 0;
3134 }
3135 actual_space = 0;
3136 scan = buffer;
3137
3138 SKMEM_CACHE_LOCK();
3139 TAILQ_FOREACH(skm, &skmem_cache_head, skm_link) {
3140		size_t size = skmem_cache_mib_get_stats(skm, scan, buffer_space);
3141 if (scan != NULL) {
3142 if (buffer_space < size) {
3143 /* supplied buffer too small, stop copying */
3144 error = ENOMEM;
3145 break;
3146 }
3147 scan += size;
3148 buffer_space -= size;
3149 }
3150 actual_space += size;
3151 }
3152 SKMEM_CACHE_UNLOCK();
3153
3154 if (actual_space != 0) {
3155 int out_error = SYSCTL_OUT(req, buffer, actual_space);
3156 if (out_error != 0) {
3157 error = out_error;
3158 }
3159 }
3160 if (buffer != NULL) {
3161 sk_free_data(buffer, allocated_space);
3162 }
3163
3164 return error;
3165}
3166
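/*
 * Userland usage sketch (illustrative only): the handler above follows
 * the usual two-step sysctl protocol: probe with a NULL buffer to learn
 * the required size, then fetch.  The MIB name used below is an
 * assumption for illustration, not taken from this file.
 */
#if 0
#include <sys/sysctl.h>
#include <stdlib.h>

static struct sk_stats_cache *
fetch_cache_stats(size_t *countp)
{
	size_t len = 0;
	void *buf;

	/* first call: NULL buffer, kernel reports the size needed */
	if (sysctlbyname("kern.skywalk.stats.cache", NULL, &len, NULL, 0) != 0) {
		return NULL;
	}
	if ((buf = malloc(len)) == NULL) {
		return NULL;
	}
	/* second call: copy out one sk_stats_cache record per cache */
	if (sysctlbyname("kern.skywalk.stats.cache", buf, &len, NULL, 0) != 0) {
		free(buf);
		return NULL;
	}
	*countp = len / sizeof(struct sk_stats_cache);
	return buf;
}
#endif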