skmem_cache.c source code [xnu/bsd/skywalk/mem/skmem_cache.c]

1	/*
2	* Copyright (c) 2016-2021 Apple Inc. All rights reserved.
3	*
4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. The rights granted to you under the License
10	* may not be used to create, or enable the creation or redistribution of,
11	* unlawful or unlicensed copies of an Apple operating system, or to
12	* circumvent, violate, or enable the circumvention or violation of, any
13	* terms of an Apple operating system software license agreement.
14	*
15	* Please obtain a copy of the License at
16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
17	*
18	* The Original Code and all software distributed under the License are
19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23	* Please see the License for the specific language governing rights and
24	* limitations under the License.
25	*
26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27	*/
28
29	#include <skywalk/os_skywalk_private.h>
30	#define _FN_KPRINTF
31	#include <pexpert/pexpert.h> /* for PE_parse_boot_argn */
32	#include <libkern/OSDebug.h> /* for OSBacktrace */
33	#include <kern/sched_prim.h> /* for assert_wait */
34	#include <vm/vm_memtag.h>
35
36	/*
37	* Memory allocator with per-CPU caching (magazines), derived from the kmem
38	* magazine concept and implementation as described in the following paper:
39	* http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
40	*
41	* That implementation is Copyright 2006 Sun Microsystems, Inc. All rights
42	* reserved. Use is subject to license terms.
43	*
44	* This derivative differs from the original kmem slab allocator, in that:
45	*
46	* a) There is always a discrete bufctl per object, even for small sizes.
47	* This increases the overhead, but is necessary as Skywalk objects
48	* coming from the slab may be shared (RO or RW) with userland; therefore
49	* embedding the KVA pointer linkage in freed objects is a non-starter.
50	*
51	* b) Writing patterns to the slab at slab creation or destruction time
52	* (when debugging is enabled) is not implemented, as the object may
53	* be shared (RW) with userland and thus we cannot panic upon pattern
54	* mismatch episodes. This can be relaxed so that we conditionally
55	* verify the pattern for kernel-only memory.
56	*
57	* This derivative also differs from Darwin's mcache allocator (which itself
58	* is a derivative of the original kmem slab allocator), in that:
59	*
60	* 1) The slab layer is internal to skmem_cache, unlike mcache's external
61	* slab layer required to support mbufs. skmem_cache also supports
62	* constructing and deconstructing objects, while mcache does not.
63	* This brings skmem_cache's model closer to that of the original
64	* kmem slab allocator.
65	*
66	* 2) mcache allows for batch allocation and free by way of chaining the
67	* objects together using a linked list. This requires using a part
68	* of the object to act as the linkage, which is against Skywalk's
69	* requirements of not exposing any KVA pointer to userland. Although
70	* this is supported by skmem_cache, chaining is only possible if the
71	* region is not mapped to userland. That implies that kernel-only
72	* objects can be chained provided the cache is created with batching
73	* mode enabled, and that the object is large enough to contain the
74	* skmem_obj structure.
75	*
76	* In other words, skmem_cache is a hybrid of a hybrid custom allocator that
77	* implements features that are required by Skywalk. In addition to being
78	* aware of userland access on the buffers, it also supports mirrored backend
79	* memory regions. This allows a cache to manage two independent memory
80	* regions, such that allocating/freeing an object from/to one results in
81	* allocating/freeing a shadow object in another, thus guaranteeing that both
82	* objects share the same lifetime.
83	*/
84
85	static uint32_t ncpu; / total # of initialized CPUs /
86
87	static LCK_MTX_DECLARE_ATTR(skmem_cache_lock, &skmem_lock_grp, &skmem_lock_attr);
88	static struct thread *skmem_lock_owner = THREAD_NULL;
89
90	static LCK_GRP_DECLARE(skmem_sl_lock_grp, "skmem_slab");
91	static LCK_GRP_DECLARE(skmem_dp_lock_grp, "skmem_depot");
92	static LCK_GRP_DECLARE(skmem_cpu_lock_grp, "skmem_cpu_cache");
93
/*
 * Global cache lock primitives.  The owner thread is recorded right after
 * the mutex is acquired and cleared right before it is dropped.
 */
#define SKMEM_CACHE_LOCK() do {                         \
	lck_mtx_lock(&skmem_cache_lock);                \
	skmem_lock_owner = current_thread();            \
} while (0)
#define SKMEM_CACHE_UNLOCK() do {                       \
	skmem_lock_owner = THREAD_NULL;                 \
	lck_mtx_unlock(&skmem_cache_lock);              \
} while (0)
#define SKMEM_CACHE_LOCK_ASSERT_HELD()                  \
	LCK_MTX_ASSERT(&skmem_cache_lock, LCK_MTX_ASSERT_OWNED)
#define SKMEM_CACHE_LOCK_ASSERT_NOTHELD()               \
	LCK_MTX_ASSERT(&skmem_cache_lock, LCK_MTX_ASSERT_NOTOWNED)
106
/* Per-cache slab layer lock (skm_sl_lock) primitives. */
#define SKM_SLAB_LOCK(_skm)                     \
	lck_mtx_lock(&(_skm)->skm_sl_lock)
#define SKM_SLAB_LOCK_ASSERT_HELD(_skm)         \
	LCK_MTX_ASSERT(&(_skm)->skm_sl_lock, LCK_MTX_ASSERT_OWNED)
#define SKM_SLAB_LOCK_ASSERT_NOTHELD(_skm)      \
	LCK_MTX_ASSERT(&(_skm)->skm_sl_lock, LCK_MTX_ASSERT_NOTOWNED)
#define SKM_SLAB_UNLOCK(_skm)                   \
	lck_mtx_unlock(&(_skm)->skm_sl_lock)
115
/*
 * Per-cache depot layer lock (skm_dp_lock) primitives.  Besides the plain
 * blocking acquire, the depot lock can be taken in spin mode, converted
 * from spin mode to a full mutex hold, or acquired with a try-lock.
 */
#define SKM_DEPOT_LOCK(_skm)                    \
	lck_mtx_lock(&(_skm)->skm_dp_lock)
#define SKM_DEPOT_LOCK_SPIN(_skm)               \
	lck_mtx_lock_spin(&(_skm)->skm_dp_lock)
#define SKM_DEPOT_CONVERT_LOCK(_skm)            \
	lck_mtx_convert_spin(&(_skm)->skm_dp_lock)
#define SKM_DEPOT_LOCK_TRY(_skm)                \
	lck_mtx_try_lock(&(_skm)->skm_dp_lock)
#define SKM_DEPOT_LOCK_ASSERT_HELD(_skm)        \
	LCK_MTX_ASSERT(&(_skm)->skm_dp_lock, LCK_MTX_ASSERT_OWNED)
#define SKM_DEPOT_LOCK_ASSERT_NOTHELD(_skm)     \
	LCK_MTX_ASSERT(&(_skm)->skm_dp_lock, LCK_MTX_ASSERT_NOTOWNED)
#define SKM_DEPOT_UNLOCK(_skm)                  \
	lck_mtx_unlock(&(_skm)->skm_dp_lock)
130
/* Per-cache resize lock (skm_rs_lock) primitives. */
#define SKM_RESIZE_LOCK(_skm)                   \
	lck_mtx_lock(&(_skm)->skm_rs_lock)
#define SKM_RESIZE_LOCK_ASSERT_HELD(_skm)       \
	LCK_MTX_ASSERT(&(_skm)->skm_rs_lock, LCK_MTX_ASSERT_OWNED)
#define SKM_RESIZE_LOCK_ASSERT_NOTHELD(_skm)    \
	LCK_MTX_ASSERT(&(_skm)->skm_rs_lock, LCK_MTX_ASSERT_NOTOWNED)
#define SKM_RESIZE_UNLOCK(_skm)                 \
	lck_mtx_unlock(&(_skm)->skm_rs_lock)
139
/*
 * Per-CPU cache lock (cp_lock) primitives; the argument is a pointer to
 * the per-CPU structure (struct skmem_cpu_cache), not to the cache.
 */
#define SKM_CPU_LOCK(_cp)                       \
	lck_mtx_lock(&(_cp)->cp_lock)
#define SKM_CPU_LOCK_SPIN(_cp)                  \
	lck_mtx_lock_spin(&(_cp)->cp_lock)
#define SKM_CPU_CONVERT_LOCK(_cp)               \
	lck_mtx_convert_spin(&(_cp)->cp_lock)
#define SKM_CPU_LOCK_ASSERT_HELD(_cp)           \
	LCK_MTX_ASSERT(&(_cp)->cp_lock, LCK_MTX_ASSERT_OWNED)
#define SKM_CPU_LOCK_ASSERT_NOTHELD(_cp)        \
	LCK_MTX_ASSERT(&(_cp)->cp_lock, LCK_MTX_ASSERT_NOTOWNED)
#define SKM_CPU_UNLOCK(_cp)                     \
	lck_mtx_unlock(&(_cp)->cp_lock)
152
#define SKM_ZONE_MAX    256

/* zone for skmem_cache; created in skmem_cache_pre_init() */
static struct zone *skm_zone;

/* internal caches, created in skmem_cache_init() */
static struct skmem_cache *skmem_slab_cache;    /* cache for skmem_slab */
static struct skmem_cache *skmem_bufctl_cache;  /* cache for skmem_bufctl */
static unsigned int bc_size;                    /* size of bufctl */
160
161	/*
162	* Magazine types (one per row.)
163	*
164	* The first column defines the number of objects that the magazine can hold.
165	* Using that number, we derive the effective number: the aggregate count of
166	* object pointers, plus 2 pointers (skmem_mag linkage + magazine type).
167	* This would result in an object size that is aligned on the CPU cache
168	* size boundary; the exception to this is the KASAN mode where the size
169	* would be larger due to the redzone regions.
170	*
171	* The second column defines the alignment of the magazine. Because each
172	* magazine is used at the CPU-layer cache, we need to ensure there is no
173	* false sharing across the CPUs, and align the magazines to the maximum
174	* cache alignment size, for simplicity. The value of 0 may be used to
175	* indicate natural pointer size alignment.
176	*
177	* The third column defines the starting magazine type for a given cache,
178	* determined at the cache's creation time based on its chunk size.
179	*
180	* The fourth column defines the magazine type limit for a given cache.
181	* Magazine resizing will only occur if the chunk size is less than this.
182	*/
183	static struct skmem_magtype skmem_magtype[] = {
184	#if defined(__LP64__)
185	{ .mt_magsize = `14`, .mt_align = `0`, .mt_minbuf = `128`, .mt_maxbuf = `512`,
186	.mt_cache = NULL, .mt_cname = "" },
187	{ .mt_magsize = `30`, .mt_align = `0`, .mt_minbuf = `96`, .mt_maxbuf = `256`,
188	.mt_cache = NULL, .mt_cname = "" },
189	{ .mt_magsize = `46`, .mt_align = `0`, .mt_minbuf = `64`, .mt_maxbuf = `128`,
190	.mt_cache = NULL, .mt_cname = "" },
191	{ .mt_magsize = `62`, .mt_align = `0`, .mt_minbuf = `32`, .mt_maxbuf = `64`,
192	.mt_cache = NULL, .mt_cname = "" },
193	{ .mt_magsize = `94`, .mt_align = `0`, .mt_minbuf = `16`, .mt_maxbuf = `32`,
194	.mt_cache = NULL, .mt_cname = "" },
195	{ .mt_magsize = `126`, .mt_align = `0`, .mt_minbuf = `8`, .mt_maxbuf = `16`,
196	.mt_cache = NULL, .mt_cname = "" },
197	{ .mt_magsize = `142`, .mt_align = `0`, .mt_minbuf = `0`, .mt_maxbuf = `8`,
198	.mt_cache = NULL, .mt_cname = "" },
199	{ .mt_magsize = `158`, .mt_align = `0`, .mt_minbuf = `0`, .mt_maxbuf = `0`,
200	.mt_cache = NULL, .mt_cname = "" },
201	#else /* !__LP64__ */
202	{ .mt_magsize = `14`, .mt_align = `0`, .mt_minbuf = `0`, .mt_maxbuf = `0`,
203	.mt_cache = NULL, .mt_cname = "" },
204	#endif /* !__LP64__ */
205	};
206
/*
 * Hash table bounds.  Start with the initial value, and rescale up to
 * the specified limit.  Ideally we don't need a limit, but in practice
 * this helps guard against runaways.  These values should be revisited
 * in future and be adjusted as needed.
 */
#define SKMEM_CACHE_HASH_INITIAL        64      /* initial hash table size */
#define SKMEM_CACHE_HASH_LIMIT          8192    /* hash table size limit */

/* bucket index for address _a, given hash shift _s and hash mask _m */
#define SKMEM_CACHE_HASH_INDEX(_a, _s, _m)      (((_a) >> (_s)) & (_m))
/* pointer to the bucket in _skm's hash table that _buf hashes to */
#define SKMEM_CACHE_HASH(_skm, _buf)                                     \
	(&(_skm)->skm_hash_table[SKMEM_CACHE_HASH_INDEX((uintptr_t)(_buf), \
	(_skm)->skm_hash_shift, (_skm)->skm_hash_mask)])
220
221	/*
222	* The last magazine type.
223	*/
224	static struct skmem_magtype *skmem_cache_magsize_last;
225
226	static TAILQ_HEAD(, skmem_cache) skmem_cache_head;
227	static boolean_t skmem_cache_ready;
228
229	static int skmem_slab_alloc_locked(struct skmem_cache *,
230	struct skmem_obj_info , struct* skmem_obj_info *, uint32_t);
231	static void skmem_slab_free_locked(struct skmem_cache , void* *);
232	static int skmem_slab_alloc_pseudo_locked(struct skmem_cache *,
233	struct skmem_obj_info , struct* skmem_obj_info *, uint32_t);
234	static void skmem_slab_free_pseudo_locked(struct skmem_cache , void* *);
235	static struct skmem_slab skmem_slab_create(struct* skmem_cache *, uint32_t);
236	static void skmem_slab_destroy(struct skmem_cache , struct* skmem_slab *);
237	static int skmem_magazine_ctor(struct skmem_obj_info *,
238	struct skmem_obj_info , void* *, uint32_t);
239	static void skmem_magazine_destroy(struct skmem_cache , struct* skmem_mag *,
240	int);
241	static uint32_t skmem_depot_batch_alloc(struct skmem_cache *,
242	struct skmem_maglist , uint32_t , struct skmem_mag **, uint32_t);
243	static void skmem_depot_batch_free(struct skmem_cache , struct* skmem_maglist *,
244	uint32_t , struct* skmem_mag *);
245	static void skmem_depot_ws_update(struct skmem_cache *);
246	static void skmem_depot_ws_zero(struct skmem_cache *);
247	static void skmem_depot_ws_reap(struct skmem_cache *);
248	static void skmem_cache_magazine_purge(struct skmem_cache *);
249	static void skmem_cache_magazine_enable(struct skmem_cache *, uint32_t);
250	static void skmem_cache_magazine_resize(struct skmem_cache *);
251	static void skmem_cache_hash_rescale(struct skmem_cache *);
252	static void skmem_cpu_reload(struct skmem_cpu_cache , struct* skmem_mag , int*);
253	static void skmem_cpu_batch_reload(struct skmem_cpu_cache *,
254	struct skmem_mag , int*);
255	static void skmem_cache_applyall(void ()(struct* skmem_cache *, uint32_t),
256	uint32_t);
257	static void skmem_cache_reclaim(struct skmem_cache *, uint32_t);
258	static void skmem_cache_reap_start(void);
259	static void skmem_cache_reap_done(void);
260	static void skmem_cache_reap_func(thread_call_param_t, thread_call_param_t);
261	static void skmem_cache_update_func(thread_call_param_t, thread_call_param_t);
262	static int skmem_cache_resize_enter(struct skmem_cache *, boolean_t);
263	static void skmem_cache_resize_exit(struct skmem_cache *);
264	static void skmem_audit_bufctl(struct skmem_bufctl *);
265	static void skmem_audit_buf(struct skmem_cache , struct* skmem_obj *);
266	static int skmem_cache_mib_get_sysctl SYSCTL_HANDLER_ARGS;
267
268	SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, cache,
269	CTLTYPE_STRUCT \| CTLFLAG_RD \| CTLFLAG_LOCKED,
270	`0`, `0`, skmem_cache_mib_get_sysctl, "S,sk_stats_cache",
271	"Skywalk cache statistics");
272
273	static volatile uint32_t skmem_cache_reaping;
274	static thread_call_t skmem_cache_reap_tc;
275	static thread_call_t skmem_cache_update_tc;
276
277	extern kern_return_t thread_terminate(thread_t);
278	extern unsigned int ml_wait_max_cpus(void);
279
/* bits accepted from the "skmem_debug" boot-arg */
#define SKMEM_DEBUG_NOMAGAZINES 0x1     /* disable magazines layer */
#define SKMEM_DEBUG_AUDIT       0x2     /* audit transactions */
#define SKMEM_DEBUG_MASK        (SKMEM_DEBUG_NOMAGAZINES|SKMEM_DEBUG_AUDIT)

#if DEBUG
static uint32_t skmem_debug = SKMEM_DEBUG_AUDIT;
#else /* !DEBUG */
static uint32_t skmem_debug = 0;
#endif /* !DEBUG */

/*
 * Clear on free threshold; a value of 0 is replaced with twice the CPU
 * cache line size in skmem_cache_init().
 */
static uint32_t skmem_clear_min = 0;

#define SKMEM_CACHE_UPDATE_INTERVAL     11      /* 11 seconds */
static uint32_t skmem_cache_update_interval = SKMEM_CACHE_UPDATE_INTERVAL;

#define SKMEM_DEPOT_CONTENTION  3       /* max failed trylock per interval */
static int skmem_cache_depot_contention = SKMEM_DEPOT_CONTENTION;
297
/*
 * Too big a value will cause overflow and thus trip the assertion; the
 * idea here is to set an upper limit for the time that a particular
 * thread is allowed to perform retries before we give up and panic.
 */
#define SKMEM_SLAB_MAX_BACKOFF          (20 * USEC_PER_SEC) /* seconds */

/*
 * Threshold (in msec) after which we reset the exponential backoff value
 * back to its (random) initial value.  Note that we allow the actual delay
 * to be at most twice this value.
 */
#define SKMEM_SLAB_BACKOFF_THRES        1024    /* up to ~2 sec (2048 msec) */

/*
 * To reduce the likelihood of global synchronization between threads,
 * we use some random value to start the exponential backoff.
 */
#define SKMEM_SLAB_BACKOFF_RANDOM       4       /* range is [1,4] msec */
317
#if (DEVELOPMENT || DEBUG)
/* read-write tunables, exposed on DEVELOPMENT and DEBUG builds only */
SYSCTL_UINT(_kern_skywalk_mem, OID_AUTO, cache_update_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &skmem_cache_update_interval,
    SKMEM_CACHE_UPDATE_INTERVAL, "Cache update interval");
SYSCTL_INT(_kern_skywalk_mem, OID_AUTO, cache_depot_contention,
    CTLFLAG_RW | CTLFLAG_LOCKED, &skmem_cache_depot_contention,
    SKMEM_DEPOT_CONTENTION, "Depot contention");

/* update interval in effect before the most recent skmem_cache_test_start() */
static uint32_t skmem_cache_update_interval_saved = SKMEM_CACHE_UPDATE_INTERVAL;

/*
 * Called by skmem_test_start() to set the update interval.
 *
 * The current interval is saved first so that skmem_cache_test_stop()
 * can restore it; 'i' is the new interval, in the same unit as
 * SKMEM_CACHE_UPDATE_INTERVAL (seconds).
 */
void
skmem_cache_test_start(uint32_t i)
{
	skmem_cache_update_interval_saved = skmem_cache_update_interval;
	skmem_cache_update_interval = i;
}

/*
 * Called by skmem_test_stop() to restore the update interval to the
 * value saved by skmem_cache_test_start().
 */
void
skmem_cache_test_stop(void)
{
	skmem_cache_update_interval = skmem_cache_update_interval_saved;
}
#endif /* (DEVELOPMENT || DEBUG) */
347
348	#define SKMEM_TAG_BUFCTL_HASH "com.apple.skywalk.bufctl.hash"
349	static SKMEM_TAG_DEFINE(skmem_tag_bufctl_hash, SKMEM_TAG_BUFCTL_HASH);
350
351	#define SKMEM_TAG_CACHE_MIB "com.apple.skywalk.cache.mib"
352	static SKMEM_TAG_DEFINE(skmem_tag_cache_mib, SKMEM_TAG_CACHE_MIB);
353
354	static int __skmem_cache_pre_inited = `0`;
355	static int __skmem_cache_inited = `0`;
356
357	/*
358	* Called before skmem_region_init().
359	*/
360	void
361	skmem_cache_pre_init(void)
362	{
363	vm_size_t skm_size;
364
365	ASSERT(!__skmem_cache_pre_inited);
366
367	ncpu = ml_wait_max_cpus();
368
369	/ allocate extra in case we need to manually align the pointer /
370	if (skm_zone == NULL) {
371	skm_size = SKMEM_CACHE_SIZE(ncpu);
372	#if KASAN
373	/*
374	* When KASAN is enabled, the zone allocator adjusts the
375	* element size to include the redzone regions, in which
376	* case we assume that the elements won't start on the
377	* alignment boundary and thus need to do some fix-ups.
378	* These include increasing the effective object size
379	* which adds at least 136 bytes to the original size,
380	* as computed by skmem_region_params_config() above.
381	*/
382	skm_size += (sizeof(void *) + CHANNEL_CACHE_ALIGN_MAX);
383	#endif /* KASAN */
384	skm_size = P2ROUNDUP(skm_size, CHANNEL_CACHE_ALIGN_MAX);
385	skm_zone = zone_create(SKMEM_ZONE_PREFIX ".skm", size: skm_size,
386	flags: ZC_PGZ_USE_GUARDS \| ZC_ZFREE_CLEARMEM \| ZC_DESTRUCTIBLE);
387	}
388
389	TAILQ_INIT(&skmem_cache_head);
390
391	__skmem_cache_pre_inited = `1`;
392	}
393
394	/*
395	* Called after skmem_region_init().
396	*/
397	void
398	skmem_cache_init(void)
399	{
400	uint32_t cpu_cache_line_size = skmem_cpu_cache_line_size();
401	struct skmem_magtype *mtp;
402	uint32_t i;
403
404	_CASSERT(SKMEM_CACHE_HASH_LIMIT >= SKMEM_CACHE_HASH_INITIAL);
405
406	_CASSERT(SKM_MODE_NOMAGAZINES == SCA_MODE_NOMAGAZINES);
407	_CASSERT(SKM_MODE_AUDIT == SCA_MODE_AUDIT);
408	_CASSERT(SKM_MODE_NOREDIRECT == SCA_MODE_NOREDIRECT);
409	_CASSERT(SKM_MODE_BATCH == SCA_MODE_BATCH);
410	_CASSERT(SKM_MODE_DYNAMIC == SCA_MODE_DYNAMIC);
411	_CASSERT(SKM_MODE_CLEARONFREE == SCA_MODE_CLEARONFREE);
412	_CASSERT(SKM_MODE_PSEUDO == SCA_MODE_PSEUDO);
413
414	ASSERT(__skmem_cache_pre_inited);
415	ASSERT(!__skmem_cache_inited);
416
417	PE_parse_boot_argn(arg_string: "skmem_debug", arg_ptr: &skmem_debug, max_arg: sizeof(skmem_debug));
418	skmem_debug &= SKMEM_DEBUG_MASK;
419
420	#if (DEVELOPMENT \|\| DEBUG)
421	PE_parse_boot_argn("skmem_clear_min", &skmem_clear_min,
422	sizeof(skmem_clear_min));
423	#endif /* (DEVELOPMENT \|\| DEBUG) */
424	if (skmem_clear_min == `0`) {
425	/ zeroing 2 CPU cache lines practically comes for free /
426	skmem_clear_min = `2` * cpu_cache_line_size;
427	} else {
428	/ round it up to CPU cache line size /
429	skmem_clear_min = (uint32_t)P2ROUNDUP(skmem_clear_min,
430	cpu_cache_line_size);
431	}
432
433	/ create a cache for buffer control structures /
434	if (skmem_debug & SKMEM_DEBUG_AUDIT) {
435	bc_size = sizeof(struct skmem_bufctl_audit);
436	skmem_bufctl_cache = skmem_cache_create("bufctl.audit",
437	bc_size, sizeof(uint64_t), NULL, NULL,
438	NULL, NULL, NULL, `0`);
439	} else {
440	bc_size = sizeof(struct skmem_bufctl);
441	skmem_bufctl_cache = skmem_cache_create("bufctl",
442	bc_size, sizeof(uint64_t), NULL, NULL,
443	NULL, NULL, NULL, `0`);
444	}
445
446	/ create a cache for slab structures /
447	skmem_slab_cache = skmem_cache_create("slab",
448	sizeof(struct skmem_slab), sizeof(uint64_t), NULL, NULL, NULL,
449	NULL, NULL, `0`);
450
451	/*
452	* Go thru the magazine type table and create an cache for each.
453	*/
454	for (i = `0`; i < sizeof(skmem_magtype) / sizeof(*mtp); i++) {
455	mtp = &skmem_magtype[i];
456
457	if (mtp->mt_align != `0` &&
458	((mtp->mt_align & (mtp->mt_align - `1`)) != `0` \|\|
459	mtp->mt_align < (int)cpu_cache_line_size)) {
460	panic("%s: bad alignment %d", __func__, mtp->mt_align);
461	/ NOTREACHED /
462	__builtin_unreachable();
463	}
464	(void) snprintf(mtp->mt_cname, count: sizeof(mtp->mt_cname),
465	"mg.%d", mtp->mt_magsize);
466
467	/ create an cache for this magazine type /
468	mtp->mt_cache = skmem_cache_create(mtp->mt_cname,
469	SKMEM_MAG_SIZE(mtp->mt_magsize), mtp->mt_align,
470	skmem_magazine_ctor, NULL, NULL, mtp, NULL, `0`);
471
472	/ remember the last magazine type /
473	skmem_cache_magsize_last = mtp;
474	}
475
476	VERIFY(skmem_cache_magsize_last != NULL);
477	VERIFY(skmem_cache_magsize_last->mt_minbuf == `0`);
478	VERIFY(skmem_cache_magsize_last->mt_maxbuf == `0`);
479
480	/*
481	* Allocate thread calls for cache reap and update operations.
482	*/
483	skmem_cache_reap_tc =
484	thread_call_allocate_with_options(func: skmem_cache_reap_func,
485	NULL, pri: THREAD_CALL_PRIORITY_KERNEL, options: THREAD_CALL_OPTIONS_ONCE);
486	skmem_cache_update_tc =
487	thread_call_allocate_with_options(func: skmem_cache_update_func,
488	NULL, pri: THREAD_CALL_PRIORITY_KERNEL, options: THREAD_CALL_OPTIONS_ONCE);
489	if (skmem_cache_reap_tc == NULL \|\| skmem_cache_update_tc == NULL) {
490	panic("%s: thread_call_allocate failed", __func__);
491	/ NOTREACHED /
492	__builtin_unreachable();
493	}
494
495	/*
496	* We're ready; go through existing skmem_cache entries
497	* (if any) and enable the magazines layer for each.
498	*/
499	skmem_cache_applyall(skmem_cache_magazine_enable, `0`);
500	skmem_cache_ready = TRUE;
501
502	/ and start the periodic cache update machinery /
503	skmem_dispatch(skmem_cache_update_tc, NULL,
504	(skmem_cache_update_interval * NSEC_PER_SEC));
505
506	__skmem_cache_inited = `1`;
507	}
508
509	void
510	skmem_cache_fini(void)
511	{
512	struct skmem_magtype *mtp;
513	uint32_t i;
514
515	if (__skmem_cache_inited) {
516	ASSERT(TAILQ_EMPTY(&skmem_cache_head));
517
518	for (i = `0`; i < sizeof(skmem_magtype) / sizeof(*mtp); i++) {
519	mtp = &skmem_magtype[i];
520	skmem_cache_destroy(mtp->mt_cache);
521	mtp->mt_cache = NULL;
522	}
523	skmem_cache_destroy(skmem_slab_cache);
524	skmem_slab_cache = NULL;
525	skmem_cache_destroy(skmem_bufctl_cache);
526	skmem_bufctl_cache = NULL;
527
528	if (skmem_cache_reap_tc != NULL) {
529	(void) thread_call_cancel_wait(call: skmem_cache_reap_tc);
530	(void) thread_call_free(call: skmem_cache_reap_tc);
531	skmem_cache_reap_tc = NULL;
532	}
533	if (skmem_cache_update_tc != NULL) {
534	(void) thread_call_cancel_wait(call: skmem_cache_update_tc);
535	(void) thread_call_free(call: skmem_cache_update_tc);
536	skmem_cache_update_tc = NULL;
537	}
538
539	__skmem_cache_inited = `0`;
540	}
541
542	if (__skmem_cache_pre_inited) {
543	if (skm_zone != NULL) {
544	zdestroy(zone: skm_zone);
545	skm_zone = NULL;
546	}
547
548	__skmem_cache_pre_inited = `0`;
549	}
550	}
551
552	/*
553	* Create a cache.
554	*/
555	struct skmem_cache *
556	skmem_cache_create(const char *name, size_t bufsize, size_t bufalign,
557	skmem_ctor_fn_t ctor, skmem_dtor_fn_t dtor, skmem_reclaim_fn_t reclaim,
558	void private, struct* skmem_region *region, uint32_t cflags)
559	{
560	boolean_t pseudo = (region == NULL);
561	struct skmem_magtype *mtp;
562	struct skmem_cache *skm;
563	void *buf;
564	size_t segsize;
565	size_t chunksize;
566	size_t objsize;
567	size_t objalign;
568	uint32_t i, cpuid;
569
570	/ enforce 64-bit minimum alignment for buffers /
571	if (bufalign == `0`) {
572	bufalign = SKMEM_CACHE_ALIGN;
573	}
574	bufalign = P2ROUNDUP(bufalign, SKMEM_CACHE_ALIGN);
575
576	/ enforce alignment to be a power of 2 /
577	VERIFY(powerof2(bufalign));
578
579	if (region == NULL) {
580	struct skmem_region_params srp;
581
582	/ batching is currently not supported on pseudo regions /
583	VERIFY(!(cflags & SKMEM_CR_BATCH));
584
585	srp = *skmem_get_default(SKMEM_REGION_INTRINSIC);
586	ASSERT(srp.srp_cflags == SKMEM_REGION_CR_PSEUDO);
587
588	/ objalign is always equal to bufalign /
589	srp.srp_align = objalign = bufalign;
590	srp.srp_r_obj_cnt = `1`;
591	srp.srp_r_obj_size = (uint32_t)bufsize;
592	skmem_region_params_config(&srp);
593
594	/ allocate region for intrinsics /
595	region = skmem_region_create(name, &srp, NULL, NULL, NULL);
596	VERIFY(region->skr_c_obj_size >= P2ROUNDUP(bufsize, bufalign));
597	VERIFY(objalign == region->skr_align);
598	#if KASAN
599	/*
600	* When KASAN is enabled, the zone allocator adjusts the
601	* element size to include the redzone regions, in which
602	* case we assume that the elements won't start on the
603	* alignment boundary and thus need to do some fix-ups.
604	* These include increasing the effective object size
605	* which adds at least 16 bytes to the original size,
606	* as computed by skmem_region_params_config() above.
607	*/
608	VERIFY(region->skr_c_obj_size >=
609	(bufsize + sizeof(uint64_t) + bufalign));
610	#endif /* KASAN */
611	/ enable magazine resizing by default /
612	cflags \|= SKMEM_CR_DYNAMIC;
613
614	/*
615	* For consistency with ZC_ZFREE_CLEARMEM on skr->zreg,
616	* even though it's a no-op since the work is done
617	* at the zone layer instead.
618	*/
619	cflags \|= SKMEM_CR_CLEARONFREE;
620	} else {
621	objalign = region->skr_align;
622	}
623
624	ASSERT(region != NULL);
625	ASSERT(!(region->skr_mode & SKR_MODE_MIRRORED));
626	segsize = region->skr_seg_size;
627	ASSERT(bufalign <= segsize);
628
629	buf = zalloc_flags(skm_zone, Z_WAITOK \| Z_ZERO);
630	#if KASAN
631	/*
632	* In case we didn't get a cache-aligned memory, round it up
633	* accordingly. This is needed in order to get the rest of
634	* structure members aligned properly. It also means that
635	* the memory span gets shifted due to the round up, but it
636	* is okay since we've allocated extra space for this.
637	*/
638	skm = (struct skmem_cache *)
639	P2ROUNDUP((intptr_t)buf + sizeof(void *), CHANNEL_CACHE_ALIGN_MAX);
640	void *pbuf = (void* )((intptr_t)skm - sizeof*(void* *));
641	*pbuf = buf;
642	#else /* !KASAN */
643	/*
644	* We expect that the zone allocator would allocate elements
645	* rounded up to the requested alignment based on the object
646	* size computed in skmem_cache_pre_init() earlier, and
647	* 'skm' is therefore the element address itself.
648	*/
649	skm = buf;
650	#endif /* !KASAN */
651	VERIFY(IS_P2ALIGNED(skm, CHANNEL_CACHE_ALIGN_MAX));
652
653	if ((skmem_debug & SKMEM_DEBUG_NOMAGAZINES) \|\|
654	(cflags & SKMEM_CR_NOMAGAZINES)) {
655	/*
656	* Either the caller insists that this cache should not
657	* utilize magazines layer, or that the system override
658	* to disable magazines layer on all caches has been set.
659	*/
660	skm->skm_mode \|= SKM_MODE_NOMAGAZINES;
661	} else {
662	/*
663	* Region must be configured with enough objects
664	* to take into account objects at the CPU layer.
665	*/
666	ASSERT(!(region->skr_mode & SKR_MODE_NOMAGAZINES));
667	}
668
669	if (cflags & SKMEM_CR_DYNAMIC) {
670	/*
671	* Enable per-CPU cache magazine resizing.
672	*/
673	skm->skm_mode \|= SKM_MODE_DYNAMIC;
674	}
675
676	/ region stays around after defunct? /
677	if (region->skr_mode & SKR_MODE_NOREDIRECT) {
678	skm->skm_mode \|= SKM_MODE_NOREDIRECT;
679	}
680
681	if (cflags & SKMEM_CR_BATCH) {
682	/*
683	* Batch alloc/free involves storing the next object
684	* pointer at the beginning of each object; this is
685	* okay for kernel-only regions, but not those that
686	* are mappable to user space (we can't leak kernel
687	* addresses).
688	*/
689	_CASSERT(offsetof(struct skmem_obj, mo_next) == `0`);
690	VERIFY(!(region->skr_mode & SKR_MODE_MMAPOK));
691
692	/ batching is currently not supported on pseudo regions /
693	VERIFY(!(region->skr_mode & SKR_MODE_PSEUDO));
694
695	/ validate object size /
696	VERIFY(region->skr_c_obj_size >= sizeof(struct skmem_obj));
697
698	skm->skm_mode \|= SKM_MODE_BATCH;
699	}
700
701	uuid_generate_random(out: skm->skm_uuid);
702	(void) snprintf(skm->skm_name, count: sizeof(skm->skm_name),
703	"%s.%s", SKMEM_CACHE_PREFIX, name);
704	skm->skm_bufsize = bufsize;
705	skm->skm_bufalign = bufalign;
706	skm->skm_objalign = objalign;
707	skm->skm_ctor = ctor;
708	skm->skm_dtor = dtor;
709	skm->skm_reclaim = reclaim;
710	skm->skm_private = private;
711	skm->skm_slabsize = segsize;
712
713	skm->skm_region = region;
714	/ callee holds reference /
715	skmem_region_slab_config(region, skm, true);
716	objsize = region->skr_c_obj_size;
717	skm->skm_objsize = objsize;
718
719	if (pseudo) {
720	/*
721	* Release reference from skmem_region_create()
722	* since skm->skm_region holds one now.
723	*/
724	ASSERT(region->skr_mode & SKR_MODE_PSEUDO);
725	skmem_region_release(region);
726
727	skm->skm_mode \|= SKM_MODE_PSEUDO;
728
729	skm->skm_slab_alloc = skmem_slab_alloc_pseudo_locked;
730	skm->skm_slab_free = skmem_slab_free_pseudo_locked;
731	} else {
732	skm->skm_slab_alloc = skmem_slab_alloc_locked;
733	skm->skm_slab_free = skmem_slab_free_locked;
734
735	/ auditing was requested? (normal regions only) /
736	if (skmem_debug & SKMEM_DEBUG_AUDIT) {
737	ASSERT(bc_size == sizeof(struct skmem_bufctl_audit));
738	skm->skm_mode \|= SKM_MODE_AUDIT;
739	}
740	}
741
742	/*
743	* Clear upon free (to slab layer) as long as the region is
744	* not marked as read-only for kernel, and if the chunk size
745	* is within the threshold or if the caller had requested it.
746	*/
747	if (!(region->skr_mode & SKR_MODE_KREADONLY)) {
748	if (skm->skm_objsize <= skmem_clear_min \|\|
749	(cflags & SKMEM_CR_CLEARONFREE)) {
750	skm->skm_mode \|= SKM_MODE_CLEARONFREE;
751	}
752	}
753
754	chunksize = bufsize;
755	if (bufalign >= SKMEM_CACHE_ALIGN) {
756	chunksize = P2ROUNDUP(chunksize, SKMEM_CACHE_ALIGN);
757	}
758
759	chunksize = P2ROUNDUP(chunksize, bufalign);
760	if (chunksize > objsize) {
761	panic("%s: (bufsize %lu, chunksize %lu) > objsize %lu",
762	__func__, bufsize, chunksize, objsize);
763	/ NOTREACHED /
764	__builtin_unreachable();
765	}
766	ASSERT(chunksize != `0`);
767	skm->skm_chunksize = chunksize;
768
769	lck_mtx_init(lck: &skm->skm_sl_lock, grp: &skmem_sl_lock_grp, attr: &skmem_lock_attr);
770	TAILQ_INIT(&skm->skm_sl_partial_list);
771	TAILQ_INIT(&skm->skm_sl_empty_list);
772
773	/ allocated-address hash table /
774	skm->skm_hash_initial = SKMEM_CACHE_HASH_INITIAL;
775	skm->skm_hash_limit = SKMEM_CACHE_HASH_LIMIT;
776	skm->skm_hash_table = sk_alloc_type_array(struct skmem_bufctl_bkt,
777	skm->skm_hash_initial, Z_WAITOK \| Z_NOFAIL, skmem_tag_bufctl_hash);
778
779	skm->skm_hash_mask = (skm->skm_hash_initial - `1`);
780	skm->skm_hash_shift = flsll(chunksize) - `1`;
781
782	for (i = `0`; i < (skm->skm_hash_mask + `1`); i++) {
783	SLIST_INIT(&skm->skm_hash_table[i].bcb_head);
784	}
785
786	lck_mtx_init(lck: &skm->skm_dp_lock, grp: &skmem_dp_lock_grp, attr: &skmem_lock_attr);
787
788	/ find a suitable magazine type for this chunk size /
789	for (mtp = skmem_magtype; chunksize <= mtp->mt_minbuf; mtp++) {
790	continue;
791	}
792
793	skm->skm_magtype = mtp;
794	if (!(skm->skm_mode & SKM_MODE_NOMAGAZINES)) {
795	skm->skm_cpu_mag_size = skm->skm_magtype->mt_magsize;
796	}
797
798	/*
799	* Initialize the CPU layer. Each per-CPU structure is aligned
800	* on the CPU cache line boundary to prevent false sharing.
801	*/
802	lck_mtx_init(lck: &skm->skm_rs_lock, grp: &skmem_cpu_lock_grp, attr: &skmem_lock_attr);
803	for (cpuid = `0`; cpuid < ncpu; cpuid++) {
804	struct skmem_cpu_cache *ccp = &skm->skm_cpu_cache[cpuid];
805
806	VERIFY(IS_P2ALIGNED(ccp, CHANNEL_CACHE_ALIGN_MAX));
807	lck_mtx_init(lck: &ccp->cp_lock, grp: &skmem_cpu_lock_grp,
808	attr: &skmem_lock_attr);
809	ccp->cp_rounds = -`1`;
810	ccp->cp_prounds = -`1`;
811	}
812
813	SKMEM_CACHE_LOCK();
814	TAILQ_INSERT_TAIL(&skmem_cache_head, skm, skm_link);
815	SKMEM_CACHE_UNLOCK();
816
817	SK_DF(SK_VERB_MEM_CACHE, "\"%s\": skm 0x%llx mode 0x%b",
818	skm->skm_name, SK_KVA(skm), skm->skm_mode, SKM_MODE_BITS);
819	SK_DF(SK_VERB_MEM_CACHE,
820	" bufsz %u bufalign %u chunksz %u objsz %u slabsz %u",
821	(uint32_t)skm->skm_bufsize, (uint32_t)skm->skm_bufalign,
822	(uint32_t)skm->skm_chunksize, (uint32_t)skm->skm_objsize,
823	(uint32_t)skm->skm_slabsize);
824
825	if (skmem_cache_ready) {
826	skmem_cache_magazine_enable(skm, `0`);
827	}
828
829	if (cflags & SKMEM_CR_RECLAIM) {
830	skm->skm_mode \|= SKM_MODE_RECLAIM;
831	}
832
833	return skm;
834	}
835
836	/*
837	* Destroy a cache.
838	*/
839	void
840	skmem_cache_destroy(struct skmem_cache *skm)
841	{
842	uint32_t cpuid;
843
844	SKMEM_CACHE_LOCK();
845	TAILQ_REMOVE(&skmem_cache_head, skm, skm_link);
846	SKMEM_CACHE_UNLOCK();
847
848	ASSERT(skm->skm_rs_busy == `0`);
849	ASSERT(skm->skm_rs_want == `0`);
850
851	/ purge all cached objects for this cache /
852	skmem_cache_magazine_purge(skm);
853
854	/*
855	* Panic if we detect there are unfreed objects; the caller
856	* destroying this cache is responsible for ensuring that all
857	* allocated objects have been freed prior to getting here.
858	*/
859	SKM_SLAB_LOCK(skm);
860	if (skm->skm_sl_bufinuse != `0`) {
861	panic("%s: '%s' (%p) not empty (%llu unfreed)", __func__,
862	skm->skm_name, (void *)skm, skm->skm_sl_bufinuse);
863	/ NOTREACHED /
864	__builtin_unreachable();
865	}
866	ASSERT(TAILQ_EMPTY(&skm->skm_sl_partial_list));
867	ASSERT(skm->skm_sl_partial == `0`);
868	ASSERT(TAILQ_EMPTY(&skm->skm_sl_empty_list));
869	ASSERT(skm->skm_sl_empty == `0`);
870	skm->skm_reclaim = NULL;
871	skm->skm_ctor = NULL;
872	skm->skm_dtor = NULL;
873	SKM_SLAB_UNLOCK(skm);
874
875	if (skm->skm_hash_table != NULL) {
876	#if (DEBUG \|\| DEVELOPMENT)
877	for (uint32_t i = `0`; i < (skm->skm_hash_mask + `1`); i++) {
878	ASSERT(SLIST_EMPTY(&skm->skm_hash_table[i].bcb_head));
879	}
880	#endif /* DEBUG \|\| DEVELOPMENT */
881
882	sk_free_type_array(struct skmem_bufctl_bkt,
883	skm->skm_hash_mask + `1`, skm->skm_hash_table);
884	skm->skm_hash_table = NULL;
885	}
886
887	for (cpuid = `0`; cpuid < ncpu; cpuid++) {
888	lck_mtx_destroy(lck: &skm->skm_cpu_cache[cpuid].cp_lock,
889	grp: &skmem_cpu_lock_grp);
890	}
891	lck_mtx_destroy(lck: &skm->skm_rs_lock, grp: &skmem_cpu_lock_grp);
892	lck_mtx_destroy(lck: &skm->skm_dp_lock, grp: &skmem_dp_lock_grp);
893	lck_mtx_destroy(lck: &skm->skm_sl_lock, grp: &skmem_sl_lock_grp);
894
895	SK_DF(SK_VERB_MEM_CACHE, "\"%s\": skm 0x%llx",
896	skm->skm_name, SK_KVA(skm));
897
898	/ callee releases reference /
899	skmem_region_slab_config(skm->skm_region, skm, false);
900	skm->skm_region = NULL;
901
902	#if KASAN
903	/ get the original address since we're about to free it /
904	void *pbuf = (void* )((intptr_t)skm - sizeof*(void* *));
905	skm = *pbuf;
906	#endif /* KASAN */
907
908	zfree(skm_zone, skm);
909	}
910
911	/*
912	* Create a slab.
913	*/
914	static struct skmem_slab *
915	skmem_slab_create(struct skmem_cache *skm, uint32_t skmflag)
916	{
917	struct skmem_region *skr = skm->skm_region;
918	uint32_t objsize, chunks;
919	size_t slabsize = skm->skm_slabsize;
920	struct skmem_slab *sl;
921	struct sksegment sg, sgm;
922	char buf, bufm, slab, slabm;
923
924	/*
925	* Allocate a segment (a slab at our layer) from the region.
926	*/
927	slab = skmem_region_alloc(skr, (void **)&slabm, &sg, &sgm, skmflag);
928	if (slab == NULL) {
929	goto rg_alloc_failure;
930	}
931
932	if ((sl = skmem_cache_alloc(skmem_slab_cache, SKMEM_SLEEP)) == NULL) {
933	goto slab_alloc_failure;
934	}
935
936	ASSERT(sg != NULL);
937	ASSERT(sgm == NULL \|\| sgm->sg_index == sg->sg_index);
938
939	bzero(s: sl, n: sizeof(*sl));
940	sl->sl_cache = skm;
941	sl->sl_base = buf = slab;
942	sl->sl_basem = bufm = slabm;
943	ASSERT(skr->skr_c_obj_size <= UINT32_MAX);
944	objsize = (uint32_t)skr->skr_c_obj_size;
945	ASSERT(skm->skm_objsize == objsize);
946	ASSERT((slabsize / objsize) <= UINT32_MAX);
947	sl->sl_chunks = chunks = (uint32_t)(slabsize / objsize);
948	sl->sl_seg = sg;
949	sl->sl_segm = sgm;
950
951	/*
952	* Create one or more buffer control structures for the slab,
953	* each one tracking a chunk of raw object from the segment,
954	* and insert these into the slab's list of buffer controls.
955	*/
956	ASSERT(chunks > `0`);
957	while (chunks != `0`) {
958	struct skmem_bufctl *bc;
959
960	bc = skmem_cache_alloc(skmem_bufctl_cache, SKMEM_SLEEP);
961	if (bc == NULL) {
962	goto bufctl_alloc_failure;
963	}
964
965	bzero(s: bc, n: bc_size);
966	bc->bc_addr = buf;
967	bc->bc_addrm = bufm;
968	bc->bc_slab = sl;
969	bc->bc_idx = (sl->sl_chunks - chunks);
970	if (skr->skr_mode & SKR_MODE_SHAREOK) {
971	bc->bc_flags \|= SKMEM_BUFCTL_SHAREOK;
972	}
973	SLIST_INSERT_HEAD(&sl->sl_head, bc, bc_link);
974	bc->bc_lim = objsize;
975	buf += objsize;
976	if (bufm != NULL) {
977	bufm += objsize;
978	}
979	--chunks;
980	}
981
982	SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx sl 0x%llx",
983	SK_KVA(skm), SK_KVA(sl));
984	SK_DF(SK_VERB_MEM_CACHE, " [%u] [0x%llx-0x%llx)", sl->sl_seg->sg_index,
985	SK_KVA(slab), SK_KVA(slab + objsize));
986
987	return sl;
988
989	bufctl_alloc_failure:
990	skmem_slab_destroy(skm, sl);
991
992	slab_alloc_failure:
993	skmem_region_free(skr, slab, slabm);
994
995	rg_alloc_failure:
996	os_atomic_inc(&skm->skm_sl_alloc_fail, relaxed);
997
998	return NULL;
999	}
1000
1001	/*
1002	* Destroy a slab.
1003	*/
1004	static void
1005	skmem_slab_destroy(struct skmem_cache skm, struct* skmem_slab *sl)
1006	{
1007	struct skmem_bufctl bc, tbc;
1008	void *slab = sl->sl_base;
1009	void *slabm = sl->sl_basem;
1010
1011	ASSERT(sl->sl_refcnt == `0`);
1012
1013	SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx sl 0x%llx",
1014	SK_KVA(skm), SK_KVA(sl));
1015	SK_DF(SK_VERB_MEM_CACHE, " [%u] [0x%llx-0x%llx)", sl->sl_seg->sg_index,
1016	SK_KVA(slab), SK_KVA((uintptr_t)slab + skm->skm_objsize));
1017
1018	/*
1019	* Go through the slab's list of buffer controls and free
1020	* them, and then free the slab itself back to its cache.
1021	*/
1022	SLIST_FOREACH_SAFE(bc, &sl->sl_head, bc_link, tbc) {
1023	SLIST_REMOVE(&sl->sl_head, bc, skmem_bufctl, bc_link);
1024	skmem_cache_free(skmem_bufctl_cache, bc);
1025	}
1026	skmem_cache_free(skmem_slab_cache, sl);
1027
1028	/ and finally free the segment back to the backing region /
1029	skmem_region_free(skm->skm_region, slab, slabm);
1030	}
1031
1032	/*
1033	* Allocate a raw object from the (locked) slab layer. Normal region variant.
1034	*/
1035	static int
1036	skmem_slab_alloc_locked(struct skmem_cache skm, struct* skmem_obj_info *oi,
1037	struct skmem_obj_info *oim, uint32_t skmflag)
1038	{
1039	struct skmem_bufctl_bkt *bcb;
1040	struct skmem_bufctl *bc;
1041	struct skmem_slab *sl;
1042	uint32_t retries = `0`;
1043	uint64_t boff_total = `0`; / in usec /
1044	uint64_t boff = `0`; / in msec /
1045	boolean_t new_slab;
1046	void *buf;
1047	#if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT)
1048	vm_offset_t tagged_address; / address tagging /
1049	struct skmem_region region; /* region source for this slab /
1050	#endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */
1051
1052	/ this flag is not for the caller to set /
1053	VERIFY(!(skmflag & SKMEM_FAILOK));
1054
1055	/*
1056	* A slab is either in a partially-allocated list (at least it has
1057	* a free object available), or is in the empty list (everything
1058	* has been allocated.) If we can't find a partially-allocated
1059	* slab, then we need to allocate a slab (segment) from the region.
1060	*/
1061	again:
1062	SKM_SLAB_LOCK_ASSERT_HELD(skm);
1063	sl = TAILQ_FIRST(&skm->skm_sl_partial_list);
1064	if (sl == NULL) {
1065	uint32_t flags = skmflag;
1066	boolean_t retry;
1067
1068	ASSERT(skm->skm_sl_partial == `0`);
1069	SKM_SLAB_UNLOCK(skm);
1070	if (!(flags & SKMEM_NOSLEEP)) {
1071	/*
1072	* Pick up a random value to start the exponential
1073	* backoff, if this is the first round, or if the
1074	* current value is over the threshold. Otherwise,
1075	* double the backoff value.
1076	*/
1077	if (boff == `0` \|\| boff > SKMEM_SLAB_BACKOFF_THRES) {
1078	read_frandom(buffer: &boff, numBytes: sizeof(boff));
1079	boff = (boff % SKMEM_SLAB_BACKOFF_RANDOM) + `1`;
1080	ASSERT(boff > `0`);
1081	} else if (os_mul_overflow(boff, `2`, &boff)) {
1082	panic_plain("\"%s\": boff counter "
1083	"overflows\n", skm->skm_name);
1084	/ NOTREACHED /
1085	__builtin_unreachable();
1086	}
1087	/ add this value (in msec) to the total (in usec) /
1088	if (os_add_overflow(boff_total,
1089	(boff * NSEC_PER_USEC), &boff_total)) {
1090	panic_plain("\"%s\": boff_total counter "
1091	"overflows\n", skm->skm_name);
1092	/ NOTREACHED /
1093	__builtin_unreachable();
1094	}
1095	}
1096	/*
1097	* In the event of a race between multiple threads trying
1098	* to create the last remaining (or the only) slab, let the
1099	* loser(s) attempt to retry after waiting a bit. The winner
1100	* would have inserted the newly-created slab into the list.
1101	*/
1102	if (!(flags & SKMEM_NOSLEEP) &&
1103	boff_total <= SKMEM_SLAB_MAX_BACKOFF) {
1104	retry = TRUE;
1105	++retries;
1106	flags \|= SKMEM_FAILOK;
1107	} else {
1108	if (!(flags & SKMEM_NOSLEEP)) {
1109	panic_plain("\"%s\": failed to allocate "
1110	"slab (sleeping mode) after %llu "
1111	"msec, %u retries\n\n%s", skm->skm_name,
1112	(boff_total / NSEC_PER_USEC), retries,
1113	skmem_dump(skm->skm_region));
1114	/ NOTREACHED /
1115	__builtin_unreachable();
1116	}
1117	retry = FALSE;
1118	}
1119
1120	/*
1121	* Create a new slab.
1122	*/
1123	if ((sl = skmem_slab_create(skm, skmflag: flags)) == NULL) {
1124	if (retry) {
1125	SK_ERR("\"%s\": failed to allocate "
1126	"slab (%ssleeping mode): waiting for %llu "
1127	"msec, total %llu msec, %u retries",
1128	skm->skm_name,
1129	(flags & SKMEM_NOSLEEP) ? "non-" : "",
1130	boff, (boff_total / NSEC_PER_USEC), retries);
1131	VERIFY(boff > `0` && ((uint32_t)boff <=
1132	(SKMEM_SLAB_BACKOFF_THRES * `2`)));
1133	delay(usec: (uint32_t)boff * NSEC_PER_USEC);
1134	SKM_SLAB_LOCK(skm);
1135	goto again;
1136	} else {
1137	SK_RDERR(`4`, "\"%s\": failed to allocate slab "
1138	"(%ssleeping mode)", skm->skm_name,
1139	(flags & SKMEM_NOSLEEP) ? "non-" : "");
1140	SKM_SLAB_LOCK(skm);
1141	}
1142	return ENOMEM;
1143	}
1144
1145	SKM_SLAB_LOCK(skm);
1146	skm->skm_sl_create++;
1147	if ((skm->skm_sl_bufinuse += sl->sl_chunks) >
1148	skm->skm_sl_bufmax) {
1149	skm->skm_sl_bufmax = skm->skm_sl_bufinuse;
1150	}
1151	}
1152	skm->skm_sl_alloc++;
1153
1154	new_slab = (sl->sl_refcnt == `0`);
1155	ASSERT(new_slab \|\| SKMEM_SLAB_IS_PARTIAL(sl));
1156
1157	sl->sl_refcnt++;
1158	ASSERT(sl->sl_refcnt <= sl->sl_chunks);
1159
1160	/*
1161	* We either have a new slab, or a partially-allocated one.
1162	* Remove a buffer control from the slab, and insert it to
1163	* the allocated-address hash chain.
1164	*/
1165	bc = SLIST_FIRST(&sl->sl_head);
1166	ASSERT(bc != NULL);
1167	SLIST_REMOVE(&sl->sl_head, bc, skmem_bufctl, bc_link);
1168
1169	/ sanity check /
1170	VERIFY(bc->bc_usecnt == `0`);
1171
1172	/*
1173	* Also store the master object's region info for the caller.
1174	*/
1175	bzero(s: oi, n: sizeof(*oi));
1176	#if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT)
1177	region = sl->sl_cache->skm_region;
1178	if (region->skr_mode & SKR_MODE_MEMTAG) {
1179	/*
1180	* If this region is configured to be tagged, we generate a
1181	* unique tag for the object address, and return this tagged
1182	* address to the caller. vm_memtag_assign_tag generates a
1183	* unique tag for the given address and size, and
1184	* vm_memtag_set_tag commits the tag to the backing memory
1185	* metadata. This tagged address is returned back to the client,
1186	* and when the client frees the address, we "re-tag" the
1187	* address to prevent against use-after-free attacks (more on
1188	* this in skmem_cache_batch_free).
1189	*/
1190	tagged_address = vm_memtag_assign_tag((vm_offset_t)bc->bc_addr,
1191	skm->skm_objsize);
1192	vm_memtag_set_tag(tagged_address, skm->skm_objsize);
1193	buf = (void *)tagged_address;
1194	} else {
1195	buf = bc->bc_addr;
1196	}
1197	#else /* !CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */
1198	buf = bc->bc_addr;
1199	#endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */
1200	SKMEM_OBJ_ADDR(oi) = buf;
1201	SKMEM_OBJ_BUFCTL(oi) = bc; / master only; NULL for slave /
1202	ASSERT(skm->skm_objsize <= UINT32_MAX);
1203	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize;
1204	SKMEM_OBJ_IDX_REG(oi) =
1205	((sl->sl_seg->sg_index * sl->sl_chunks) + bc->bc_idx);
1206	SKMEM_OBJ_IDX_SEG(oi) = bc->bc_idx;
1207	/*
1208	* And for slave object.
1209	*/
1210	if (oim != NULL) {
1211	bzero(s: oim, n: sizeof(*oim));
1212	if (bc->bc_addrm != NULL) {
1213	SKMEM_OBJ_ADDR(oim) = bc->bc_addrm;
1214	SKMEM_OBJ_SIZE(oim) = SKMEM_OBJ_SIZE(oi);
1215	SKMEM_OBJ_IDX_REG(oim) = SKMEM_OBJ_IDX_REG(oi);
1216	SKMEM_OBJ_IDX_SEG(oim) = SKMEM_OBJ_IDX_SEG(oi);
1217	}
1218	}
1219
1220	if (skm->skm_mode & SKM_MODE_BATCH) {
1221	((struct skmem_obj *)buf)->mo_next = NULL;
1222	}
1223
1224	/ insert to allocated-address hash chain /
1225	bcb = SKMEM_CACHE_HASH(skm, buf);
1226	SLIST_INSERT_HEAD(&bcb->bcb_head, bc, bc_link);
1227
1228	if (SLIST_EMPTY(&sl->sl_head)) {
1229	/*
1230	* If that was the last buffer control from this slab,
1231	* insert the slab into the empty list. If it was in
1232	* the partially-allocated list, then remove the slab
1233	* from there as well.
1234	*/
1235	ASSERT(sl->sl_refcnt == sl->sl_chunks);
1236	if (new_slab) {
1237	ASSERT(sl->sl_chunks == `1`);
1238	} else {
1239	ASSERT(sl->sl_chunks > `1`);
1240	ASSERT(skm->skm_sl_partial > `0`);
1241	skm->skm_sl_partial--;
1242	TAILQ_REMOVE(&skm->skm_sl_partial_list, sl, sl_link);
1243	}
1244	skm->skm_sl_empty++;
1245	ASSERT(skm->skm_sl_empty != `0`);
1246	TAILQ_INSERT_HEAD(&skm->skm_sl_empty_list, sl, sl_link);
1247	} else {
1248	/*
1249	* The slab is not empty; if it was newly allocated
1250	* above, then it's not in the partially-allocated
1251	* list and so we insert it there.
1252	*/
1253	ASSERT(SKMEM_SLAB_IS_PARTIAL(sl));
1254	if (new_slab) {
1255	skm->skm_sl_partial++;
1256	ASSERT(skm->skm_sl_partial != `0`);
1257	TAILQ_INSERT_HEAD(&skm->skm_sl_partial_list,
1258	sl, sl_link);
1259	}
1260	}
1261
1262	/ if auditing is enabled, record this transaction /
1263	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != `0`)) {
1264	skmem_audit_bufctl(bc);
1265	}
1266
1267	return `0`;
1268	}
1269
1270	/*
1271	* Allocate a raw object from the (locked) slab layer. Pseudo region variant.
1272	*/
1273	static int
1274	skmem_slab_alloc_pseudo_locked(struct skmem_cache *skm,
1275	struct skmem_obj_info oi, struct* skmem_obj_info *oim, uint32_t skmflag)
1276	{
1277	zalloc_flags_t zflags = (skmflag & SKMEM_NOSLEEP) ? Z_NOWAIT : Z_WAITOK;
1278	struct skmem_region *skr = skm->skm_region;
1279	void obj, buf;
1280
1281	/ this flag is not for the caller to set /
1282	VERIFY(!(skmflag & SKMEM_FAILOK));
1283
1284	SKM_SLAB_LOCK_ASSERT_HELD(skm);
1285
1286	ASSERT(skr->skr_reg == NULL && skr->skr_zreg != NULL);
1287	/ mirrored region is not applicable /
1288	ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
1289	/ batching is not yet supported /
1290	ASSERT(!(skm->skm_mode & SKM_MODE_BATCH));
1291
1292	if ((obj = zalloc_flags(skr->skr_zreg, zflags \| Z_ZERO)) == NULL) {
1293	os_atomic_inc(&skm->skm_sl_alloc_fail, relaxed);
1294	return ENOMEM;
1295	}
1296
1297	#if KASAN
1298	/*
1299	* Perform some fix-ups since the zone element isn't guaranteed
1300	* to be on the aligned boundary. The effective object size
1301	* has been adjusted accordingly by skmem_region_create() earlier
1302	* at cache creation time.
1303	*
1304	* 'buf' is get the aligned address for this object.
1305	*/
1306	buf = (void )P2ROUNDUP((intptr_t)obj + sizeof*(u_int64_t),
1307	skm->skm_bufalign);
1308
1309	/*
1310	* Wind back a pointer size from the aligned address and
1311	* save the original address so we can free it later.
1312	*/
1313	void *pbuf = (void* )((intptr_t)buf - sizeof*(void* *));
1314	*pbuf = obj;
1315
1316	VERIFY(((intptr_t)buf + skm->skm_bufsize) <=
1317	((intptr_t)obj + skm->skm_objsize));
1318	#else /* !KASAN */
1319	/*
1320	* We expect that the zone allocator would allocate elements
1321	* rounded up to the requested alignment based on the effective
1322	* object size computed in skmem_region_create() earlier, and
1323	* 'buf' is therefore the element address itself.
1324	*/
1325	buf = obj;
1326	#endif /* !KASAN */
1327
1328	/ make sure the object is aligned /
1329	VERIFY(IS_P2ALIGNED(buf, skm->skm_bufalign));
1330
1331	/*
1332	* Return the object's info to the caller.
1333	*/
1334	bzero(s: oi, n: sizeof(*oi));
1335	SKMEM_OBJ_ADDR(oi) = buf;
1336	ASSERT(skm->skm_objsize <= UINT32_MAX);
1337	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize;
1338	if (oim != NULL) {
1339	bzero(s: oim, n: sizeof(*oim));
1340	}
1341
1342	skm->skm_sl_alloc++;
1343	skm->skm_sl_bufinuse++;
1344	if (skm->skm_sl_bufinuse > skm->skm_sl_bufmax) {
1345	skm->skm_sl_bufmax = skm->skm_sl_bufinuse;
1346	}
1347
1348	return `0`;
1349	}
1350
1351	/*
1352	* Allocate a raw object from the slab layer.
1353	*/
1354	static int
1355	skmem_slab_alloc(struct skmem_cache skm, struct* skmem_obj_info *oi,
1356	struct skmem_obj_info *oim, uint32_t skmflag)
1357	{
1358	int err;
1359
1360	SKM_SLAB_LOCK(skm);
1361	err = skm->skm_slab_alloc(skm, oi, oim, skmflag);
1362	SKM_SLAB_UNLOCK(skm);
1363
1364	return err;
1365	}
1366
1367	/*
1368	* Allocate raw object(s) from the slab layer.
1369	*/
1370	static uint32_t
1371	skmem_slab_batch_alloc(struct skmem_cache skm, struct* skmem_obj **list,
1372	uint32_t num, uint32_t skmflag)
1373	{
1374	uint32_t need = num;
1375
1376	ASSERT(list != NULL && (skm->skm_mode & SKM_MODE_BATCH));
1377	*list = NULL;
1378
1379	SKM_SLAB_LOCK(skm);
1380	for (;;) {
1381	struct skmem_obj_info oi, oim;
1382
1383	/*
1384	* Get a single raw object from the slab layer.
1385	*/
1386	if (skm->skm_slab_alloc(skm, &oi, &oim, skmflag) != `0`) {
1387	break;
1388	}
1389
1390	*list = SKMEM_OBJ_ADDR(&oi);
1391	ASSERT((*list)->mo_next == NULL);
1392	/ store these inside the object itself /
1393	(*list)->mo_info = oi;
1394	(*list)->mo_minfo = oim;
1395	list = &(*list)->mo_next;
1396
1397	ASSERT(need != `0`);
1398	if (--need == `0`) {
1399	break;
1400	}
1401	}
1402	SKM_SLAB_UNLOCK(skm);
1403
1404	return num - need;
1405	}
1406
1407	/*
1408	* Free a raw object to the (locked) slab layer. Normal region variant.
1409	*/
1410	static void
1411	skmem_slab_free_locked(struct skmem_cache skm, void* *buf)
1412	{
1413	struct skmem_bufctl bc, tbc;
1414	struct skmem_bufctl_bkt *bcb;
1415	struct skmem_slab *sl = NULL;
1416	#if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT)
1417	struct skmem_region *region;
1418	vm_offset_t tagged_addr;
1419	/*
1420	* If buf is tagged, then addr would have the canonicalized address.
1421	* If buf is untagged, then addr is same as buf.
1422	*/
1423	void addr = (void* *)vm_memtag_canonicalize_address((vm_offset_t)buf);
1424	#endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */
1425
1426	SKM_SLAB_LOCK_ASSERT_HELD(skm);
1427	ASSERT(buf != NULL);
1428	/ caller is expected to clear mo_next /
1429	ASSERT(!(skm->skm_mode & SKM_MODE_BATCH) \|\|
1430	((struct skmem_obj *)buf)->mo_next == NULL);
1431
1432	/*
1433	* Search the hash chain to find a matching buffer control for the
1434	* given object address. If found, remove the buffer control from
1435	* the hash chain and insert it into the freelist. Otherwise, we
1436	* panic since the caller has given us a bogus address.
1437	*/
1438	skm->skm_sl_free++;
1439	bcb = SKMEM_CACHE_HASH(skm, buf);
1440
1441	#if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT)
1442	/*
1443	* If this region is configured to tag memory addresses, then buf is a
1444	* tagged address. When we search for the buffer control from the hash
1445	* table, we need to use the untagged address, because buffer control
1446	* maintains untagged address (bc_addr). vm_memtag_canonicalize_address
1447	* returns the untagged address.
1448	*/
1449	SLIST_FOREACH_SAFE(bc, &bcb->bcb_head, bc_link, tbc) {
1450	if (bc->bc_addr == addr) {
1451	SLIST_REMOVE(&bcb->bcb_head, bc, skmem_bufctl, bc_link);
1452	sl = bc->bc_slab;
1453	break;
1454	}
1455	}
1456	#else /* !CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */
1457	SLIST_FOREACH_SAFE(bc, &bcb->bcb_head, bc_link, tbc) {
1458	if (bc->bc_addr == buf) {
1459	SLIST_REMOVE(&bcb->bcb_head, bc, skmem_bufctl, bc_link);
1460	sl = bc->bc_slab;
1461	break;
1462	}
1463	}
1464	#endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */
1465
1466	if (bc == NULL) {
1467	panic("%s: attempt to free invalid or already-freed obj %p "
1468	"on skm %p", __func__, buf, skm);
1469	/ NOTREACHED /
1470	__builtin_unreachable();
1471	}
1472	ASSERT(sl != NULL && sl->sl_cache == skm);
1473
1474	#if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT)
1475	/*
1476	* We use untagged address here, because SKMEM_SLAB_MEMBER compares the
1477	* address against sl_base, which is untagged.
1478	*/
1479	VERIFY(SKMEM_SLAB_MEMBER(sl, addr));
1480	#else /* !CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */
1481	VERIFY(SKMEM_SLAB_MEMBER(sl, buf));
1482	#endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */
1483
1484	/ make sure this object is not currently in use by another object /
1485	VERIFY(bc->bc_usecnt == `0`);
1486
1487	/ if auditing is enabled, record this transaction /
1488	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != `0`)) {
1489	skmem_audit_bufctl(bc);
1490	}
1491
1492	/ if clear on free is requested, zero out the object /
1493	if (skm->skm_mode & SKM_MODE_CLEARONFREE) {
1494	bzero(s: buf, n: skm->skm_objsize);
1495	}
1496
1497	#if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT)
1498	/*
1499	* If this region is configured to tag memory addresses, we re-tag this
1500	* address as the object is freed. We do the re-tagging in the magazine
1501	* layer too, but in case we need to free raw objects to the slab layer
1502	* (either becasue SKM_MODE_NOMAGAZINES is set, or the magazine layer
1503	* was not able to allocate empty magazines), we re-tag the addresses
1504	* here in the slab layer. Freeing to the slab layer is symmetrical to
1505	* allocating from the slab layer - when we allocate from slab layer, we
1506	* tag the address, and then construct the object; when we free to the
1507	* slab layer, we destruct the object, and retag the address.
1508	* We do the re-tagging here, because this is right after the last usage
1509	* of the buf variable (which is tagged).
1510	*/
1511	region = skm->skm_region;
1512	if (region->skr_mode & SKR_MODE_MEMTAG) {
1513	tagged_addr = vm_memtag_assign_tag((vm_offset_t)buf,
1514	skm->skm_objsize);
1515	vm_memtag_set_tag(tagged_addr, skm->skm_objsize);
1516	}
1517	#endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */
1518
1519	/ insert the buffer control to the slab's freelist /
1520	SLIST_INSERT_HEAD(&sl->sl_head, bc, bc_link);
1521
1522	ASSERT(sl->sl_refcnt >= `1`);
1523	if (--sl->sl_refcnt == `0`) {
1524	/*
1525	* If this was the last outstanding object for the slab,
1526	* remove the slab from the partially-allocated or empty
1527	* list, and destroy the slab (segment) back to the region.
1528	*/
1529	if (sl->sl_chunks == `1`) {
1530	ASSERT(skm->skm_sl_empty > `0`);
1531	skm->skm_sl_empty--;
1532	TAILQ_REMOVE(&skm->skm_sl_empty_list, sl, sl_link);
1533	} else {
1534	ASSERT(skm->skm_sl_partial > `0`);
1535	skm->skm_sl_partial--;
1536	TAILQ_REMOVE(&skm->skm_sl_partial_list, sl, sl_link);
1537	}
1538	ASSERT((int64_t)(skm->skm_sl_bufinuse - sl->sl_chunks) >= `0`);
1539	skm->skm_sl_bufinuse -= sl->sl_chunks;
1540	skm->skm_sl_destroy++;
1541	SKM_SLAB_UNLOCK(skm);
1542	skmem_slab_destroy(skm, sl);
1543	SKM_SLAB_LOCK(skm);
1544	return;
1545	}
1546
1547	ASSERT(bc == SLIST_FIRST(&sl->sl_head));
1548	if (SLIST_NEXT(bc, bc_link) == NULL) {
1549	/*
1550	* If this is the first (potentially amongst many) object
1551	* that's returned to the slab, remove the slab from the
1552	* empty list and insert to end of the partially-allocated
1553	* list. This should help avoid thrashing the partial slab
1554	* since we avoid disturbing what's already at the front.
1555	*/
1556	ASSERT(sl->sl_refcnt == (sl->sl_chunks - `1`));
1557	ASSERT(sl->sl_chunks > `1`);
1558	ASSERT(skm->skm_sl_empty > `0`);
1559	skm->skm_sl_empty--;
1560	TAILQ_REMOVE(&skm->skm_sl_empty_list, sl, sl_link);
1561	skm->skm_sl_partial++;
1562	ASSERT(skm->skm_sl_partial != `0`);
1563	TAILQ_INSERT_TAIL(&skm->skm_sl_partial_list, sl, sl_link);
1564	}
1565	}
1566
1567	/*
1568	* Free a raw object to the (locked) slab layer. Pseudo region variant.
1569	*/
1570	static void
1571	skmem_slab_free_pseudo_locked(struct skmem_cache skm, void* *buf)
1572	{
1573	struct skmem_region *skr = skm->skm_region;
1574	void *obj = buf;
1575
1576	ASSERT(skr->skr_reg == NULL && skr->skr_zreg != NULL);
1577
1578	SKM_SLAB_LOCK_ASSERT_HELD(skm);
1579
1580	VERIFY(IS_P2ALIGNED(obj, skm->skm_bufalign));
1581
1582	#if KASAN
1583	/*
1584	* Since we stuffed the original zone element address before
1585	* the buffer address in KASAN mode, get it back since we're
1586	* about to free it.
1587	*/
1588	void *pbuf = (void* )((intptr_t)obj - sizeof*(void* *));
1589
1590	VERIFY(((intptr_t)obj + skm->skm_bufsize) <=
1591	((intptr_t)*pbuf + skm->skm_objsize));
1592
1593	obj = *pbuf;
1594	#endif /* KASAN */
1595
1596	/ free it to zone /
1597	zfree(skr->skr_zreg, obj);
1598
1599	skm->skm_sl_free++;
1600	ASSERT(skm->skm_sl_bufinuse > `0`);
1601	skm->skm_sl_bufinuse--;
1602	}
1603
1604	/*
1605	* Free a raw object to the slab layer.
1606	*/
1607	static void
1608	skmem_slab_free(struct skmem_cache skm, void* *buf)
1609	{
1610	if (skm->skm_mode & SKM_MODE_BATCH) {
1611	((struct skmem_obj *)buf)->mo_next = NULL;
1612	}
1613
1614	SKM_SLAB_LOCK(skm);
1615	skm->skm_slab_free(skm, buf);
1616	SKM_SLAB_UNLOCK(skm);
1617	}
1618
1619	/*
1620	* Free raw object(s) to the slab layer.
1621	*/
1622	static void
1623	skmem_slab_batch_free(struct skmem_cache skm, struct* skmem_obj *list)
1624	{
1625	struct skmem_obj *listn;
1626
1627	ASSERT(list != NULL && (skm->skm_mode & SKM_MODE_BATCH));
1628
1629	SKM_SLAB_LOCK(skm);
1630	for (;;) {
1631	listn = list->mo_next;
1632	list->mo_next = NULL;
1633
1634	/*
1635	* Free a single object to the slab layer.
1636	*/
1637	skm->skm_slab_free(skm, (void *)list);
1638
1639	/ if no more objects to free, we're done /
1640	if ((list = listn) == NULL) {
1641	break;
1642	}
1643	}
1644	SKM_SLAB_UNLOCK(skm);
1645	}
1646
1647	/*
1648	* Return the object's region info.
1649	*/
1650	void
1651	skmem_cache_get_obj_info(struct skmem_cache skm, void* *buf,
1652	struct skmem_obj_info oi, struct* skmem_obj_info *oim)
1653	{
1654	struct skmem_bufctl_bkt *bcb;
1655	struct skmem_bufctl *bc;
1656	struct skmem_slab *sl;
1657
1658	/*
1659	* Search the hash chain to find a matching buffer control for the
1660	* given object address. If not found, panic since the caller has
1661	* given us a bogus address.
1662	*/
1663	SKM_SLAB_LOCK(skm);
1664	bcb = SKMEM_CACHE_HASH(skm, buf);
1665	SLIST_FOREACH(bc, &bcb->bcb_head, bc_link) {
1666	if (bc->bc_addr == buf) {
1667	break;
1668	}
1669	}
1670
1671	if (__improbable(bc == NULL)) {
1672	panic("%s: %s failed to get object info for %p",
1673	__func__, skm->skm_name, buf);
1674	/ NOTREACHED /
1675	__builtin_unreachable();
1676	}
1677
1678	/*
1679	* Return the master object's info to the caller.
1680	*/
1681	sl = bc->bc_slab;
1682	SKMEM_OBJ_ADDR(oi) = bc->bc_addr;
1683	SKMEM_OBJ_BUFCTL(oi) = bc; / master only; NULL for slave /
1684	ASSERT(skm->skm_objsize <= UINT32_MAX);
1685	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize;
1686	SKMEM_OBJ_IDX_REG(oi) =
1687	(sl->sl_seg->sg_index * sl->sl_chunks) + bc->bc_idx;
1688	SKMEM_OBJ_IDX_SEG(oi) = bc->bc_idx;
1689	/*
1690	* And for slave object.
1691	*/
1692	if (oim != NULL) {
1693	bzero(s: oim, n: sizeof(*oim));
1694	if (bc->bc_addrm != NULL) {
1695	SKMEM_OBJ_ADDR(oim) = bc->bc_addrm;
1696	SKMEM_OBJ_SIZE(oim) = oi->oi_size;
1697	SKMEM_OBJ_IDX_REG(oim) = oi->oi_idx_reg;
1698	SKMEM_OBJ_IDX_SEG(oim) = oi->oi_idx_seg;
1699	}
1700	}
1701	SKM_SLAB_UNLOCK(skm);
1702	}
1703
1704	/*
1705	* Magazine constructor.
1706	*/
1707	static int
1708	skmem_magazine_ctor(struct skmem_obj_info oi, struct* skmem_obj_info *oim,
1709	void *arg, uint32_t skmflag)
1710	{
1711	#pragma unused(oim, skmflag)
1712	struct skmem_mag *mg = SKMEM_OBJ_ADDR(oi);
1713
1714	ASSERT(oim == NULL);
1715	ASSERT(arg != NULL);
1716
1717	/*
1718	* Store it in the magazine object since we'll
1719	* need to refer to it during magazine destroy;
1720	* we can't safely refer to skm_magtype as the
1721	* depot lock may not be acquired then.
1722	*/
1723	mg->mg_magtype = arg;
1724
1725	return `0`;
1726	}
1727
1728	/*
1729	* Destroy a magazine (free each object to the slab layer).
1730	*/
1731	static void
1732	skmem_magazine_destroy(struct skmem_cache skm, struct* skmem_mag *mg,
1733	int nrounds)
1734	{
1735	int round;
1736
1737	for (round = `0`; round < nrounds; round++) {
1738	void *buf = mg->mg_round[round];
1739	struct skmem_obj *next;
1740
1741	if (skm->skm_mode & SKM_MODE_BATCH) {
1742	next = ((struct skmem_obj *)buf)->mo_next;
1743	((struct skmem_obj *)buf)->mo_next = NULL;
1744	}
1745
1746	/ deconstruct the object /
1747	if (skm->skm_dtor != NULL) {
1748	skm->skm_dtor(buf, skm->skm_private);
1749	}
1750
1751	/*
1752	* In non-batching mode, each object in the magazine has
1753	* no linkage to its neighbor, so free individual object
1754	* to the slab layer now.
1755	*/
1756	if (!(skm->skm_mode & SKM_MODE_BATCH)) {
1757	skmem_slab_free(skm, buf);
1758	} else {
1759	((struct skmem_obj *)buf)->mo_next = next;
1760	}
1761	}
1762
1763	/*
1764	* In batching mode, each object is linked to its neighbor at free
1765	* time, and so take the bottom-most object and free it to the slab
1766	* layer. Because of the way the list is reversed during free, this
1767	* will bring along the rest of objects above it.
1768	*/
1769	if (nrounds > `0` && (skm->skm_mode & SKM_MODE_BATCH)) {
1770	skmem_slab_batch_free(skm, list: mg->mg_round[nrounds - `1`]);
1771	}
1772
1773	/ free the magazine itself back to cache /
1774	skmem_cache_free(mg->mg_magtype->mt_cache, mg);
1775	}
1776
1777	/*
1778	* Get one or more magazines from the depot.
1779	*/
1780	static uint32_t
1781	skmem_depot_batch_alloc(struct skmem_cache skm, struct* skmem_maglist *ml,
1782	uint32_t count, struct* skmem_mag **list, uint32_t num)
1783	{
1784	SLIST_HEAD(, skmem_mag) mg_list = SLIST_HEAD_INITIALIZER(mg_list);
1785	struct skmem_mag *mg;
1786	uint32_t need = num, c = `0`;
1787
1788	ASSERT(list != NULL && need > `0`);
1789
1790	if (!SKM_DEPOT_LOCK_TRY(skm)) {
1791	/*
1792	* Track the amount of lock contention here; if the contention
1793	* level is high (more than skmem_cache_depot_contention per a
1794	* given skmem_cache_update_interval interval), then we treat
1795	* it as a sign that the per-CPU layer is not using the right
1796	* magazine type, and that we'd need to resize it.
1797	*/
1798	SKM_DEPOT_LOCK(skm);
1799	if (skm->skm_mode & SKM_MODE_DYNAMIC) {
1800	skm->skm_depot_contention++;
1801	}
1802	}
1803
1804	while ((mg = SLIST_FIRST(&ml->ml_list)) != NULL) {
1805	SLIST_REMOVE_HEAD(&ml->ml_list, mg_link);
1806	SLIST_INSERT_HEAD(&mg_list, mg, mg_link);
1807	ASSERT(ml->ml_total != `0`);
1808	if (--ml->ml_total < ml->ml_min) {
1809	ml->ml_min = ml->ml_total;
1810	}
1811	c++;
1812	ml->ml_alloc++;
1813	if (--need == `0`) {
1814	break;
1815	}
1816	}
1817	*count -= c;
1818
1819	SKM_DEPOT_UNLOCK(skm);
1820
1821	*list = SLIST_FIRST(&mg_list);
1822
1823	return num - need;
1824	}
1825
1826	/*
1827	* Return one or more magazines to the depot.
1828	*/
1829	static void
1830	skmem_depot_batch_free(struct skmem_cache skm, struct* skmem_maglist *ml,
1831	uint32_t count, struct* skmem_mag *mg)
1832	{
1833	struct skmem_mag *nmg;
1834	uint32_t c = `0`;
1835
1836	SKM_DEPOT_LOCK(skm);
1837	while (mg != NULL) {
1838	nmg = SLIST_NEXT(mg, mg_link);
1839	SLIST_INSERT_HEAD(&ml->ml_list, mg, mg_link);
1840	ml->ml_total++;
1841	c++;
1842	mg = nmg;
1843	}
1844	*count += c;
1845	SKM_DEPOT_UNLOCK(skm);
1846	}
1847
1848	/*
1849	* Update the depot's working state statistics.
1850	*/
1851	static void
1852	skmem_depot_ws_update(struct skmem_cache *skm)
1853	{
1854	SKM_DEPOT_LOCK_SPIN(skm);
1855	skm->skm_full.ml_reaplimit = skm->skm_full.ml_min;
1856	skm->skm_full.ml_min = skm->skm_full.ml_total;
1857	skm->skm_empty.ml_reaplimit = skm->skm_empty.ml_min;
1858	skm->skm_empty.ml_min = skm->skm_empty.ml_total;
1859	SKM_DEPOT_UNLOCK(skm);
1860	}
1861
1862	/*
1863	* Empty the depot's working state statistics (everything's reapable.)
1864	*/
1865	static void
1866	skmem_depot_ws_zero(struct skmem_cache *skm)
1867	{
1868	SKM_DEPOT_LOCK_SPIN(skm);
1869	if (skm->skm_full.ml_reaplimit != skm->skm_full.ml_total \|\|
1870	skm->skm_full.ml_min != skm->skm_full.ml_total \|\|
1871	skm->skm_empty.ml_reaplimit != skm->skm_empty.ml_total \|\|
1872	skm->skm_empty.ml_min != skm->skm_empty.ml_total) {
1873	skm->skm_full.ml_reaplimit = skm->skm_full.ml_total;
1874	skm->skm_full.ml_min = skm->skm_full.ml_total;
1875	skm->skm_empty.ml_reaplimit = skm->skm_empty.ml_total;
1876	skm->skm_empty.ml_min = skm->skm_empty.ml_total;
1877	skm->skm_depot_ws_zero++;
1878	}
1879	SKM_DEPOT_UNLOCK(skm);
1880	}
1881
1882	/*
1883	* Reap magazines that's outside of the working set.
1884	*/
1885	static void
1886	skmem_depot_ws_reap(struct skmem_cache *skm)
1887	{
1888	struct skmem_mag mg, nmg;
1889	uint32_t f, e, reap;
1890
1891	reap = f = MIN(skm->skm_full.ml_reaplimit, skm->skm_full.ml_min);
1892	if (reap != `0`) {
1893	(void) skmem_depot_batch_alloc(skm, ml: &skm->skm_full,
1894	count: &skm->skm_depot_full, list: &mg, num: reap);
1895	while (mg != NULL) {
1896	nmg = SLIST_NEXT(mg, mg_link);
1897	SLIST_NEXT(mg, mg_link) = NULL;
1898	skmem_magazine_destroy(skm, mg,
1899	nrounds: mg->mg_magtype->mt_magsize);
1900	mg = nmg;
1901	}
1902	}
1903
1904	reap = e = MIN(skm->skm_empty.ml_reaplimit, skm->skm_empty.ml_min);
1905	if (reap != `0`) {
1906	(void) skmem_depot_batch_alloc(skm, ml: &skm->skm_empty,
1907	count: &skm->skm_depot_empty, list: &mg, num: reap);
1908	while (mg != NULL) {
1909	nmg = SLIST_NEXT(mg, mg_link);
1910	SLIST_NEXT(mg, mg_link) = NULL;
1911	skmem_magazine_destroy(skm, mg, nrounds: `0`);
1912	mg = nmg;
1913	}
1914	}
1915
1916	if (f != `0` \|\| e != `0`) {
1917	os_atomic_inc(&skm->skm_cpu_mag_reap, relaxed);
1918	}
1919	}
1920
1921	/*
1922	* Performs periodic maintenance on a cache. This is serialized
1923	* through the update thread call, and so we guarantee there's at
1924	* most one update episode in the system at any given time.
1925	*/
1926	static void
1927	skmem_cache_update(struct skmem_cache *skm, uint32_t arg)
1928	{
1929	#pragma unused(arg)
1930	boolean_t resize_mag = FALSE;
1931	boolean_t rescale_hash = FALSE;
1932
1933	SKMEM_CACHE_LOCK_ASSERT_HELD();
1934
1935	/ insist that we are executing in the update thread call context /
1936	ASSERT(sk_is_cache_update_protected());
1937
1938	/*
1939	* If the cache has become much larger or smaller than the
1940	* allocated-address hash table, rescale the hash table.
1941	*/
1942	SKM_SLAB_LOCK(skm);
1943	if ((skm->skm_sl_bufinuse > (skm->skm_hash_mask << `1`) &&
1944	(skm->skm_hash_mask + `1`) < skm->skm_hash_limit) \|\|
1945	(skm->skm_sl_bufinuse < (skm->skm_hash_mask >> `1`) &&
1946	skm->skm_hash_mask > skm->skm_hash_initial)) {
1947	rescale_hash = TRUE;
1948	}
1949	SKM_SLAB_UNLOCK(skm);
1950
1951	/*
1952	* Update the working set.
1953	*/
1954	skmem_depot_ws_update(skm);
1955
1956	/*
1957	* If the contention count is greater than the threshold during
1958	* the update interval, and if we are not already at the maximum
1959	* magazine size, increase it.
1960	*/
1961	SKM_DEPOT_LOCK_SPIN(skm);
1962	if (skm->skm_chunksize < skm->skm_magtype->mt_maxbuf &&
1963	(int)(skm->skm_depot_contention - skm->skm_depot_contention_prev) >
1964	skmem_cache_depot_contention) {
1965	ASSERT(skm->skm_mode & SKM_MODE_DYNAMIC);
1966	resize_mag = TRUE;
1967	}
1968	skm->skm_depot_contention_prev = skm->skm_depot_contention;
1969	SKM_DEPOT_UNLOCK(skm);
1970
1971	if (rescale_hash) {
1972	skmem_cache_hash_rescale(skm);
1973	}
1974
1975	if (resize_mag) {
1976	skmem_cache_magazine_resize(skm);
1977	}
1978	}
1979
1980	/*
1981	* Reload the CPU's magazines with mg and its follower (if any).
1982	*/
1983	static void
1984	skmem_cpu_batch_reload(struct skmem_cpu_cache cp, struct* skmem_mag *mg,
1985	int rounds)
1986	{
1987	ASSERT((cp->cp_loaded == NULL && cp->cp_rounds == -`1`) \|\|
1988	(cp->cp_loaded && cp->cp_rounds + rounds == cp->cp_magsize));
1989	ASSERT(cp->cp_magsize > `0`);
1990
1991	cp->cp_loaded = mg;
1992	cp->cp_rounds = rounds;
1993	if (__probable(SLIST_NEXT(mg, mg_link) != NULL)) {
1994	cp->cp_ploaded = SLIST_NEXT(mg, mg_link);
1995	cp->cp_prounds = rounds;
1996	SLIST_NEXT(mg, mg_link) = NULL;
1997	} else {
1998	ASSERT(SLIST_NEXT(mg, mg_link) == NULL);
1999	cp->cp_ploaded = NULL;
2000	cp->cp_prounds = -`1`;
2001	}
2002	}
2003
2004	/*
2005	* Reload the CPU's magazine with mg and save the previous one.
2006	*/
2007	static void
2008	skmem_cpu_reload(struct skmem_cpu_cache cp, struct* skmem_mag mg, int* rounds)
2009	{
2010	ASSERT((cp->cp_loaded == NULL && cp->cp_rounds == -`1`) \|\|
2011	(cp->cp_loaded && cp->cp_rounds + rounds == cp->cp_magsize));
2012	ASSERT(cp->cp_magsize > `0`);
2013
2014	cp->cp_ploaded = cp->cp_loaded;
2015	cp->cp_prounds = cp->cp_rounds;
2016	cp->cp_loaded = mg;
2017	cp->cp_rounds = rounds;
2018	}
2019
2020	/*
2021	* Allocate a constructed object from the cache.
2022	*/
2023	void *
2024	skmem_cache_alloc(struct skmem_cache *skm, uint32_t skmflag)
2025	{
2026	struct skmem_obj *buf;
2027
2028	(void) skmem_cache_batch_alloc(skm, list: &buf, `1`, skmflag);
2029	return buf;
2030	}
2031
2032	/*
2033	* Allocate constructed object(s) from the cache.
2034	*/
2035	uint32_t
2036	skmem_cache_batch_alloc(struct skmem_cache skm, struct* skmem_obj **list,
2037	uint32_t num, uint32_t skmflag)
2038	{
2039	struct skmem_cpu_cache *cp = SKMEM_CPU_CACHE(skm);
2040	struct skmem_obj *top = &(list);
2041	struct skmem_mag *mg;
2042	uint32_t need = num;
2043
2044	ASSERT(list != NULL);
2045	*list = NULL;
2046
2047	if (need == `0`) {
2048	return `0`;
2049	}
2050	ASSERT(need == `1` \|\| (skm->skm_mode & SKM_MODE_BATCH));
2051
2052	SKM_CPU_LOCK(cp);
2053	for (;;) {
2054	/*
2055	* If we have an object in the current CPU's loaded
2056	* magazine, return it and we're done.
2057	*/
2058	if (cp->cp_rounds > `0`) {
2059	int objs = MIN((unsigned int)cp->cp_rounds, need);
2060	/*
2061	* In the SKM_MODE_BATCH case, objects in are already
2062	* linked together with the most recently freed object
2063	* at the head of the list; grab as many objects as we
2064	* can. Otherwise we'll just grab 1 object at most.
2065	*/
2066	*list = cp->cp_loaded->mg_round[cp->cp_rounds - `1`];
2067	cp->cp_rounds -= objs;
2068	cp->cp_alloc += objs;
2069
2070	if (skm->skm_mode & SKM_MODE_BATCH) {
2071	struct skmem_obj *tail =
2072	cp->cp_loaded->mg_round[cp->cp_rounds];
2073	list = &tail->mo_next;
2074	*list = NULL;
2075	}
2076
2077	/ if we got them all, return to caller /
2078	if ((need -= objs) == `0`) {
2079	SKM_CPU_UNLOCK(cp);
2080	goto done;
2081	}
2082	}
2083
2084	/*
2085	* The CPU's loaded magazine is empty. If the previously
2086	* loaded magazine was full, exchange and try again.
2087	*/
2088	if (cp->cp_prounds > `0`) {
2089	skmem_cpu_reload(cp, mg: cp->cp_ploaded, rounds: cp->cp_prounds);
2090	continue;
2091	}
2092
2093	/*
2094	* If the magazine layer is disabled, allocate from slab.
2095	* This can happen either because SKM_MODE_NOMAGAZINES is
2096	* set, or because we are resizing the magazine now.
2097	*/
2098	if (cp->cp_magsize == `0`) {
2099	break;
2100	}
2101
2102	/*
2103	* Both of the CPU's magazines are empty; try to get
2104	* full magazine(s) from the depot layer. Upon success,
2105	* reload and try again. To prevent potential thrashing,
2106	* replace both empty magazines only if the requested
2107	* count exceeds a magazine's worth of objects.
2108	*/
2109	(void) skmem_depot_batch_alloc(skm, ml: &skm->skm_full,
2110	count: &skm->skm_depot_full, list: &mg, num: (need <= cp->cp_magsize) ? `1` : `2`);
2111	if (mg != NULL) {
2112	SLIST_HEAD(, skmem_mag) mg_list =
2113	SLIST_HEAD_INITIALIZER(mg_list);
2114
2115	if (cp->cp_ploaded != NULL) {
2116	SLIST_INSERT_HEAD(&mg_list, cp->cp_ploaded,
2117	mg_link);
2118	}
2119	if (SLIST_NEXT(mg, mg_link) == NULL) {
2120	/*
2121	* Depot allocation returns only 1 magazine;
2122	* retain current empty magazine.
2123	*/
2124	skmem_cpu_reload(cp, mg, rounds: cp->cp_magsize);
2125	} else {
2126	/*
2127	* We got 2 full magazines from depot;
2128	* release the current empty magazine
2129	* back to the depot layer.
2130	*/
2131	if (cp->cp_loaded != NULL) {
2132	SLIST_INSERT_HEAD(&mg_list,
2133	cp->cp_loaded, mg_link);
2134	}
2135	skmem_cpu_batch_reload(cp, mg, rounds: cp->cp_magsize);
2136	}
2137	skmem_depot_batch_free(skm, ml: &skm->skm_empty,
2138	count: &skm->skm_depot_empty, SLIST_FIRST(&mg_list));
2139	continue;
2140	}
2141
2142	/*
2143	* The depot layer doesn't have any full magazines;
2144	* allocate directly from the slab layer.
2145	*/
2146	break;
2147	}
2148	SKM_CPU_UNLOCK(cp);
2149
2150	if (__probable(num > `1` && (skm->skm_mode & SKM_MODE_BATCH) != `0`)) {
2151	struct skmem_obj rtop, rlist, *rlistp = NULL;
2152	uint32_t rlistc, c = `0`;
2153
2154	/*
2155	* Get a list of raw objects from the slab layer.
2156	*/
2157	rlistc = skmem_slab_batch_alloc(skm, list: &rlist, num: need, skmflag);
2158	ASSERT(rlistc == `0` \|\| rlist != NULL);
2159	rtop = rlist;
2160
2161	/*
2162	* Construct each object in the raw list. Upon failure,
2163	* free any remaining objects in the list back to the slab
2164	* layer, and keep the ones that were successfully constructed.
2165	* Here, "oi" and "oim" in each skmem_obj refer to the objects
2166	* coming from the master and slave regions (on mirrored
2167	* regions), respectively. They are stored inside the object
2168	* temporarily so that we can pass them to the constructor.
2169	*/
2170	while (skm->skm_ctor != NULL && rlist != NULL) {
2171	struct skmem_obj_info *oi = &rlist->mo_info;
2172	struct skmem_obj_info *oim = &rlist->mo_minfo;
2173	struct skmem_obj *rlistn = rlist->mo_next;
2174
2175	/*
2176	* Note that the constructor guarantees at least
2177	* the size of a pointer at the top of the object
2178	* and no more than that. That means we must not
2179	* refer to "oi" and "oim" any longer after the
2180	* object goes thru the constructor.
2181	*/
2182	if (skm->skm_ctor(oi, ((SKMEM_OBJ_ADDR(oim) != NULL) ?
2183	oim : NULL), skm->skm_private, skmflag) != `0`) {
2184	VERIFY(rlist->mo_next == rlistn);
2185	os_atomic_add(&skm->skm_sl_alloc_fail,
2186	rlistc - c, relaxed);
2187	if (rlistp != NULL) {
2188	rlistp->mo_next = NULL;
2189	}
2190	if (rlist == rtop) {
2191	rtop = NULL;
2192	ASSERT(c == `0`);
2193	}
2194	skmem_slab_batch_free(skm, list: rlist);
2195	rlist = NULL;
2196	rlistc = c;
2197	break;
2198	}
2199	VERIFY(rlist->mo_next == rlistn);
2200
2201	++c; / # of constructed objs /
2202	rlistp = rlist;
2203	if ((rlist = rlist->mo_next) == NULL) {
2204	ASSERT(rlistc == c);
2205	break;
2206	}
2207	}
2208
2209	/*
2210	* At this point "top" points to the head of the chain we're
2211	* going to return to caller; "list" points to the tail of that
2212	* chain. The second chain begins at "rtop", and we append
2213	* that after "list" to form a single chain. "rlistc" is the
2214	* number of objects in "rtop" originated from the slab layer
2215	* that have been successfully constructed (if applicable).
2216	*/
2217	ASSERT(c == `0` \|\| rtop != NULL);
2218	need -= rlistc;
2219	*list = rtop;
2220	} else {
2221	struct skmem_obj_info oi, oim;
2222	void *buf;
2223
2224	ASSERT(*top == NULL && num == `1` && need == `1`);
2225
2226	/*
2227	* Get a single raw object from the slab layer.
2228	*/
2229	if (skmem_slab_alloc(skm, oi: &oi, oim: &oim, skmflag) != `0`) {
2230	goto done;
2231	}
2232
2233	buf = SKMEM_OBJ_ADDR(&oi);
2234	ASSERT(buf != NULL);
2235
2236	/*
2237	* Construct the raw object. Here, "oi" and "oim" refer to
2238	* the objects coming from the master and slave regions (on
2239	* mirrored regions), respectively.
2240	*/
2241	if (skm->skm_ctor != NULL &&
2242	skm->skm_ctor(&oi, ((SKMEM_OBJ_ADDR(&oim) != NULL) ?
2243	&oim : NULL), skm->skm_private, skmflag) != `0`) {
2244	os_atomic_inc(&skm->skm_sl_alloc_fail, relaxed);
2245	skmem_slab_free(skm, buf);
2246	goto done;
2247	}
2248
2249	need = `0`;
2250	*list = buf;
2251	ASSERT(!(skm->skm_mode & SKM_MODE_BATCH) \|\|
2252	(*list)->mo_next == NULL);
2253	}
2254
2255	done:
2256	/ if auditing is enabled, record this transaction /
2257	if (__improbable(*top != NULL &&
2258	(skm->skm_mode & SKM_MODE_AUDIT) != `0`)) {
2259	skmem_audit_buf(skm, *top);
2260	}
2261
2262	return num - need;
2263	}
2264
2265	/*
2266	* Free a constructed object to the cache.
2267	*/
2268	void
2269	skmem_cache_free(struct skmem_cache skm, void* *buf)
2270	{
2271	if (skm->skm_mode & SKM_MODE_BATCH) {
2272	((struct skmem_obj *)buf)->mo_next = NULL;
2273	}
2274	skmem_cache_batch_free(skm, (struct skmem_obj *)buf);
2275	}
2276
2277	void
2278	skmem_cache_batch_free(struct skmem_cache skm, struct* skmem_obj *list)
2279	{
2280	struct skmem_cpu_cache *cp = SKMEM_CPU_CACHE(skm);
2281	struct skmem_magtype *mtp;
2282	struct skmem_mag *mg;
2283	struct skmem_obj *listn;
2284	#if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT)
2285	vm_offset_t tagged_address; / address tagging /
2286	struct skmem_region region; /* region source for this cache /
2287	#endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */
2288
2289	/ if auditing is enabled, record this transaction /
2290	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != `0`)) {
2291	skmem_audit_buf(skm, list);
2292	}
2293
2294	SKM_CPU_LOCK(cp);
2295	for (;;) {
2296	/*
2297	* If there's an available space in the current CPU's
2298	* loaded magazine, place it there and we're done.
2299	*/
2300	if ((unsigned int)cp->cp_rounds <
2301	(unsigned int)cp->cp_magsize) {
2302	/*
2303	* In the SKM_MODE_BATCH case, reverse the list
2304	* while we place each object into the magazine;
2305	* this effectively causes the most recently
2306	* freed object to be reused during allocation.
2307	*/
2308	if (skm->skm_mode & SKM_MODE_BATCH) {
2309	listn = list->mo_next;
2310	list->mo_next = (cp->cp_rounds == `0`) ? NULL :
2311	cp->cp_loaded->mg_round[cp->cp_rounds - `1`];
2312	} else {
2313	listn = NULL;
2314	}
2315	#if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT)
2316	/*
2317	* If this region is configured to be tagged, we re-tag
2318	* the address that's being freed, to protect against
2319	* use-after-free bugs. This "re-tagged" address will
2320	* reside in the CPU's loaded magazine, and when cache
2321	* alloc is called, it is returned to client as is. At
2322	* this point, we know that this object will be freed to
2323	* the CPU's loaded magazine and not down to the slab
2324	* layer, so we won't be double tagging the same address
2325	* in the magazine layer and slab layer.
2326	*/
2327	region = skm->skm_region;
2328	if (region->skr_mode & SKR_MODE_MEMTAG) {
2329	tagged_address = vm_memtag_assign_tag(
2330	(vm_offset_t)list, skm->skm_objsize);
2331	vm_memtag_set_tag(tagged_address,
2332	skm->skm_objsize);
2333	cp->cp_loaded->mg_round[cp->cp_rounds++] =
2334	(void *)tagged_address;
2335	} else {
2336	cp->cp_loaded->mg_round[cp->cp_rounds++] = list;
2337	}
2338	#else /* !CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */
2339	cp->cp_loaded->mg_round[cp->cp_rounds++] = list;
2340	#endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */
2341	cp->cp_free++;
2342
2343	if ((list = listn) != NULL) {
2344	continue;
2345	}
2346
2347	SKM_CPU_UNLOCK(cp);
2348	return;
2349	}
2350
2351	/*
2352	* The loaded magazine is full. If the previously
2353	* loaded magazine was empty, exchange and try again.
2354	*/
2355	if (cp->cp_prounds == `0`) {
2356	skmem_cpu_reload(cp, mg: cp->cp_ploaded, rounds: cp->cp_prounds);
2357	continue;
2358	}
2359
2360	/*
2361	* If the magazine layer is disabled, free to slab.
2362	* This can happen either because SKM_MODE_NOMAGAZINES
2363	* is set, or because we are resizing the magazine now.
2364	*/
2365	if (cp->cp_magsize == `0`) {
2366	break;
2367	}
2368
2369	/*
2370	* Both magazines for the CPU are full; try to get
2371	* empty magazine(s) from the depot. If we get one,
2372	* exchange a full magazine with it and place the
2373	* object in there.
2374	*
2375	* TODO: Because the caller currently doesn't indicate
2376	* the number of objects in the list, we choose the more
2377	* conservative approach of allocating only 1 empty
2378	* magazine (to prevent potential thrashing). Once we
2379	* have the object count, we can replace 1 with similar
2380	* logic as used in skmem_cache_batch_alloc().
2381	*/
2382	(void) skmem_depot_batch_alloc(skm, ml: &skm->skm_empty,
2383	count: &skm->skm_depot_empty, list: &mg, num: `1`);
2384	if (mg != NULL) {
2385	SLIST_HEAD(, skmem_mag) mg_list =
2386	SLIST_HEAD_INITIALIZER(mg_list);
2387
2388	if (cp->cp_ploaded != NULL) {
2389	SLIST_INSERT_HEAD(&mg_list, cp->cp_ploaded,
2390	mg_link);
2391	}
2392	if (SLIST_NEXT(mg, mg_link) == NULL) {
2393	/*
2394	* Depot allocation returns only 1 magazine;
2395	* retain current full magazine.
2396	*/
2397	skmem_cpu_reload(cp, mg, rounds: `0`);
2398	} else {
2399	/*
2400	* We got 2 empty magazines from depot;
2401	* release the current full magazine back
2402	* to the depot layer.
2403	*/
2404	if (cp->cp_loaded != NULL) {
2405	SLIST_INSERT_HEAD(&mg_list,
2406	cp->cp_loaded, mg_link);
2407	}
2408	skmem_cpu_batch_reload(cp, mg, rounds: `0`);
2409	}
2410	skmem_depot_batch_free(skm, ml: &skm->skm_full,
2411	count: &skm->skm_depot_full, SLIST_FIRST(&mg_list));
2412	continue;
2413	}
2414
2415	/*
2416	* We can't get any empty magazine from the depot, and
2417	* so we need to allocate one. If the allocation fails,
2418	* just fall through, deconstruct and free the object
2419	* to the slab layer.
2420	*/
2421	mtp = skm->skm_magtype;
2422	SKM_CPU_UNLOCK(cp);
2423	mg = skmem_cache_alloc(skm: mtp->mt_cache, SKMEM_NOSLEEP);
2424	SKM_CPU_LOCK(cp);
2425
2426	if (mg != NULL) {
2427	/*
2428	* We allocated an empty magazine, but since we
2429	* dropped the CPU lock above the magazine size
2430	* may have changed. If that's the case free
2431	* the magazine and try again.
2432	*/
2433	if (cp->cp_magsize != mtp->mt_magsize) {
2434	SKM_CPU_UNLOCK(cp);
2435	skmem_cache_free(skm: mtp->mt_cache, buf: mg);
2436	SKM_CPU_LOCK(cp);
2437	continue;
2438	}
2439
2440	/*
2441	* We have a magazine with the right size;
2442	* add it to the depot and try again.
2443	*/
2444	ASSERT(SLIST_NEXT(mg, mg_link) == NULL);
2445	skmem_depot_batch_free(skm, ml: &skm->skm_empty,
2446	count: &skm->skm_depot_empty, mg);
2447	continue;
2448	}
2449
2450	/*
2451	* We can't get an empty magazine, so free to slab.
2452	*/
2453	break;
2454	}
2455	SKM_CPU_UNLOCK(cp);
2456
2457	/*
2458	* We weren't able to free the constructed object(s) to the
2459	* magazine layer, so deconstruct them and free to the slab.
2460	*/
2461	if (__probable((skm->skm_mode & SKM_MODE_BATCH) &&
2462	list->mo_next != NULL)) {
2463	/ whatever is left from original list /
2464	struct skmem_obj *top = list;
2465
2466	while (list != NULL && skm->skm_dtor != NULL) {
2467	listn = list->mo_next;
2468	list->mo_next = NULL;
2469
2470	/ deconstruct the object /
2471	if (skm->skm_dtor != NULL) {
2472	skm->skm_dtor((void *)list, skm->skm_private);
2473	}
2474
2475	list->mo_next = listn;
2476	list = listn;
2477	}
2478
2479	skmem_slab_batch_free(skm, list: top);
2480	} else {
2481	/ deconstruct the object /
2482	if (skm->skm_dtor != NULL) {
2483	skm->skm_dtor((void *)list, skm->skm_private);
2484	}
2485
2486	skmem_slab_free(skm, buf: (void *)list);
2487	}
2488	}
2489
2490	/*
2491	* Return the maximum number of objects cached at the magazine layer
2492	* based on the chunk size. This takes into account the starting
2493	* magazine type as well as the final magazine type used in resizing.
2494	*/
2495	uint32_t
2496	skmem_cache_magazine_max(uint32_t chunksize)
2497	{
2498	struct skmem_magtype *mtp;
2499	uint32_t magsize_max;
2500
2501	VERIFY(ncpu != `0`);
2502	VERIFY(chunksize > `0`);
2503
2504	/ find a suitable magazine type for this chunk size /
2505	for (mtp = skmem_magtype; chunksize <= mtp->mt_minbuf; mtp++) {
2506	continue;
2507	}
2508
2509	/ and find the last magazine type /
2510	for (;;) {
2511	magsize_max = mtp->mt_magsize;
2512	if (mtp == skmem_cache_magsize_last \|\|
2513	chunksize >= mtp->mt_maxbuf) {
2514	break;
2515	}
2516	++mtp;
2517	VERIFY(mtp <= skmem_cache_magsize_last);
2518	}
2519
2520	return ncpu * magsize_max * `2`; / two magazines per CPU /
2521	}
2522
2523	/*
2524	* Return true if SKMEM_DEBUG_NOMAGAZINES is not set on skmem_debug.
2525	*/
2526	boolean_t
2527	skmem_allow_magazines(void)
2528	{
2529	return !(skmem_debug & SKMEM_DEBUG_NOMAGAZINES);
2530	}
2531
2532	/*
2533	* Purge all magazines from a cache and disable its per-CPU magazines layer.
2534	*/
2535	static void
2536	skmem_cache_magazine_purge(struct skmem_cache *skm)
2537	{
2538	struct skmem_cpu_cache *cp;
2539	struct skmem_mag mg, pmg;
2540	int rounds, prounds;
2541	uint32_t cpuid, mg_cnt = `0`, pmg_cnt = `0`;
2542
2543	SKM_SLAB_LOCK_ASSERT_NOTHELD(skm);
2544
2545	SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx", SK_KVA(skm));
2546
2547	for (cpuid = `0`; cpuid < ncpu; cpuid++) {
2548	cp = &skm->skm_cpu_cache[cpuid];
2549
2550	SKM_CPU_LOCK_SPIN(cp);
2551	mg = cp->cp_loaded;
2552	pmg = cp->cp_ploaded;
2553	rounds = cp->cp_rounds;
2554	prounds = cp->cp_prounds;
2555	cp->cp_loaded = NULL;
2556	cp->cp_ploaded = NULL;
2557	cp->cp_rounds = -`1`;
2558	cp->cp_prounds = -`1`;
2559	cp->cp_magsize = `0`;
2560	SKM_CPU_UNLOCK(cp);
2561
2562	if (mg != NULL) {
2563	skmem_magazine_destroy(skm, mg, nrounds: rounds);
2564	++mg_cnt;
2565	}
2566	if (pmg != NULL) {
2567	skmem_magazine_destroy(skm, mg: pmg, nrounds: prounds);
2568	++pmg_cnt;
2569	}
2570	}
2571
2572	if (mg_cnt != `0` \|\| pmg_cnt != `0`) {
2573	os_atomic_inc(&skm->skm_cpu_mag_purge, relaxed);
2574	}
2575
2576	skmem_depot_ws_zero(skm);
2577	skmem_depot_ws_reap(skm);
2578	}
2579
2580	/*
2581	* Enable magazines on a cache. Must only be called on a cache with
2582	* its per-CPU magazines layer disabled (e.g. due to purge).
2583	*/
2584	static void
2585	skmem_cache_magazine_enable(struct skmem_cache *skm, uint32_t arg)
2586	{
2587	#pragma unused(arg)
2588	struct skmem_cpu_cache *cp;
2589	uint32_t cpuid;
2590
2591	if (skm->skm_mode & SKM_MODE_NOMAGAZINES) {
2592	return;
2593	}
2594
2595	for (cpuid = `0`; cpuid < ncpu; cpuid++) {
2596	cp = &skm->skm_cpu_cache[cpuid];
2597	SKM_CPU_LOCK_SPIN(cp);
2598	/ the magazines layer must be disabled at this point /
2599	ASSERT(cp->cp_loaded == NULL);
2600	ASSERT(cp->cp_ploaded == NULL);
2601	ASSERT(cp->cp_rounds == -`1`);
2602	ASSERT(cp->cp_prounds == -`1`);
2603	ASSERT(cp->cp_magsize == `0`);
2604	cp->cp_magsize = skm->skm_magtype->mt_magsize;
2605	SKM_CPU_UNLOCK(cp);
2606	}
2607
2608	SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx chunksize %u magsize %d",
2609	SK_KVA(skm), (uint32_t)skm->skm_chunksize,
2610	SKMEM_CPU_CACHE(skm)->cp_magsize);
2611	}
2612
2613	/*
2614	* Enter the cache resize perimeter. Upon success, claim exclusivity
2615	* on the perimeter and return 0, else EBUSY. Caller may indicate
2616	* whether or not they're willing to wait.
2617	*/
2618	static int
2619	skmem_cache_resize_enter(struct skmem_cache *skm, boolean_t can_sleep)
2620	{
2621	SKM_RESIZE_LOCK(skm);
2622	if (skm->skm_rs_owner == current_thread()) {
2623	ASSERT(skm->skm_rs_busy != `0`);
2624	skm->skm_rs_busy++;
2625	goto done;
2626	}
2627	if (!can_sleep) {
2628	if (skm->skm_rs_busy != `0`) {
2629	SKM_RESIZE_UNLOCK(skm);
2630	return EBUSY;
2631	}
2632	} else {
2633	while (skm->skm_rs_busy != `0`) {
2634	skm->skm_rs_want++;
2635	(void) assert_wait(event: &skm->skm_rs_busy, THREAD_UNINT);
2636	SKM_RESIZE_UNLOCK(skm);
2637	(void) thread_block(THREAD_CONTINUE_NULL);
2638	SK_DF(SK_VERB_MEM_CACHE, "waited for skm \"%s\" "
2639	"(0x%llx) busy=%u", skm->skm_name,
2640	SK_KVA(skm), skm->skm_rs_busy);
2641	SKM_RESIZE_LOCK(skm);
2642	}
2643	}
2644	SKM_RESIZE_LOCK_ASSERT_HELD(skm);
2645	ASSERT(skm->skm_rs_busy == `0`);
2646	skm->skm_rs_busy++;
2647	skm->skm_rs_owner = current_thread();
2648	done:
2649	SKM_RESIZE_UNLOCK(skm);
2650	return `0`;
2651	}
2652
2653	/*
2654	* Exit the cache resize perimeter and unblock any waiters.
2655	*/
2656	static void
2657	skmem_cache_resize_exit(struct skmem_cache *skm)
2658	{
2659	uint32_t want;
2660
2661	SKM_RESIZE_LOCK(skm);
2662	ASSERT(skm->skm_rs_busy != `0`);
2663	ASSERT(skm->skm_rs_owner == current_thread());
2664	if (--skm->skm_rs_busy == `0`) {
2665	skm->skm_rs_owner = NULL;
2666	/*
2667	* We're done; notify anyone that has lost the race.
2668	*/
2669	if ((want = skm->skm_rs_want) != `0`) {
2670	skm->skm_rs_want = `0`;
2671	wakeup(chan: (void *)&skm->skm_rs_busy);
2672	SKM_RESIZE_UNLOCK(skm);
2673	} else {
2674	SKM_RESIZE_UNLOCK(skm);
2675	}
2676	} else {
2677	SKM_RESIZE_UNLOCK(skm);
2678	}
2679	}
2680
2681	/*
2682	* Recompute a cache's magazine size. This is an expensive operation
2683	* and should not be done frequently; larger magazines provide for a
2684	* higher transfer rate with the depot while smaller magazines reduce
2685	* the memory consumption.
2686	*/
2687	static void
2688	skmem_cache_magazine_resize(struct skmem_cache *skm)
2689	{
2690	struct skmem_magtype *mtp = skm->skm_magtype;
2691
2692	/ insist that we are executing in the update thread call context /
2693	ASSERT(sk_is_cache_update_protected());
2694	ASSERT(!(skm->skm_mode & SKM_MODE_NOMAGAZINES));
2695	/ depot contention only applies to dynamic mode /
2696	ASSERT(skm->skm_mode & SKM_MODE_DYNAMIC);
2697
2698	/*
2699	* Although we're executing in the context of the update thread
2700	* call, we need to protect the per-CPU states during resizing
2701	* against other synchronous cache purge/reenable requests that
2702	* could take place in parallel.
2703	*/
2704	if (skm->skm_chunksize < mtp->mt_maxbuf) {
2705	(void) skmem_cache_resize_enter(skm, TRUE);
2706	skmem_cache_magazine_purge(skm);
2707
2708	/*
2709	* Upgrade to the next magazine type with larger size.
2710	*/
2711	SKM_DEPOT_LOCK_SPIN(skm);
2712	skm->skm_cpu_mag_resize++;
2713	skm->skm_magtype = ++mtp;
2714	skm->skm_cpu_mag_size = skm->skm_magtype->mt_magsize;
2715	skm->skm_depot_contention_prev =
2716	skm->skm_depot_contention + INT_MAX;
2717	SKM_DEPOT_UNLOCK(skm);
2718
2719	skmem_cache_magazine_enable(skm, arg: `0`);
2720	skmem_cache_resize_exit(skm);
2721	}
2722	}
2723
2724	/*
2725	* Rescale the cache's allocated-address hash table.
2726	*/
2727	static void
2728	skmem_cache_hash_rescale(struct skmem_cache *skm)
2729	{
2730	struct skmem_bufctl_bkt old_table, new_table;
2731	size_t old_size, new_size;
2732	uint32_t i, moved = `0`;
2733
2734	/ insist that we are executing in the update thread call context /
2735	ASSERT(sk_is_cache_update_protected());
2736
2737	/*
2738	* To get small average lookup time (lookup depth near 1.0), the hash
2739	* table size should be roughly the same (not necessarily equivalent)
2740	* as the cache size.
2741	*/
2742	new_size = MAX(skm->skm_hash_initial,
2743	(`1` << (flsll(`3` * skm->skm_sl_bufinuse + `4`) - `2`)));
2744	new_size = MIN(skm->skm_hash_limit, new_size);
2745	old_size = (skm->skm_hash_mask + `1`);
2746
2747	if ((old_size >> `1`) <= new_size && new_size <= (old_size << `1`)) {
2748	return;
2749	}
2750
2751	new_table = sk_alloc_type_array(struct skmem_bufctl_bkt, new_size,
2752	Z_NOWAIT, skmem_tag_bufctl_hash);
2753	if (__improbable(new_table == NULL)) {
2754	return;
2755	}
2756
2757	for (i = `0`; i < new_size; i++) {
2758	SLIST_INIT(&new_table[i].bcb_head);
2759	}
2760
2761	SKM_SLAB_LOCK(skm);
2762
2763	old_size = (skm->skm_hash_mask + `1`);
2764	old_table = skm->skm_hash_table;
2765
2766	skm->skm_hash_mask = (new_size - `1`);
2767	skm->skm_hash_table = new_table;
2768	skm->skm_sl_rescale++;
2769
2770	for (i = `0`; i < old_size; i++) {
2771	struct skmem_bufctl_bkt *bcb = &old_table[i];
2772	struct skmem_bufctl_bkt *new_bcb;
2773	struct skmem_bufctl *bc;
2774
2775	while ((bc = SLIST_FIRST(&bcb->bcb_head)) != NULL) {
2776	SLIST_REMOVE_HEAD(&bcb->bcb_head, bc_link);
2777	new_bcb = SKMEM_CACHE_HASH(skm, bc->bc_addr);
2778	/*
2779	* Ideally we want to insert tail here, but simple
2780	* list doesn't give us that. The fact that we are
2781	* essentially reversing the order is not a big deal
2782	* here vis-a-vis the new table size.
2783	*/
2784	SLIST_INSERT_HEAD(&new_bcb->bcb_head, bc, bc_link);
2785	++moved;
2786	}
2787	ASSERT(SLIST_EMPTY(&bcb->bcb_head));
2788	}
2789
2790	SK_DF(SK_VERB_MEM_CACHE,
2791	"skm 0x%llx old_size %u new_size %u [%u moved]", SK_KVA(skm),
2792	(uint32_t)old_size, (uint32_t)new_size, moved);
2793
2794	SKM_SLAB_UNLOCK(skm);
2795
2796	sk_free_type_array(struct skmem_bufctl_bkt, old_size, old_table);
2797	}
2798
2799	/*
2800	* Apply a function to operate on all caches.
2801	*/
2802	static void
2803	skmem_cache_applyall(void (func)(struct* skmem_cache *, uint32_t), uint32_t arg)
2804	{
2805	struct skmem_cache *skm;
2806
2807	net_update_uptime();
2808
2809	SKMEM_CACHE_LOCK();
2810	TAILQ_FOREACH(skm, &skmem_cache_head, skm_link) {
2811	func(skm, arg);
2812	}
2813	SKMEM_CACHE_UNLOCK();
2814	}
2815
2816	/*
2817	* Reclaim unused memory from a cache.
2818	*/
2819	static void
2820	skmem_cache_reclaim(struct skmem_cache *skm, uint32_t lowmem)
2821	{
2822	/*
2823	* Inform the owner to free memory if possible; the reclaim
2824	* policy is left to the owner. This is just an advisory.
2825	*/
2826	if (skm->skm_reclaim != NULL) {
2827	skm->skm_reclaim(skm->skm_private);
2828	}
2829
2830	if (lowmem) {
2831	/*
2832	* If another thread is in the process of purging or
2833	* resizing, bail out and let the currently-ongoing
2834	* purging take its natural course.
2835	*/
2836	if (skmem_cache_resize_enter(skm, FALSE) == `0`) {
2837	skmem_cache_magazine_purge(skm);
2838	skmem_cache_magazine_enable(skm, arg: `0`);
2839	skmem_cache_resize_exit(skm);
2840	}
2841	} else {
2842	skmem_depot_ws_reap(skm);
2843	}
2844	}
2845
2846	/*
2847	* Thread call callback for reap.
2848	*/
2849	static void
2850	skmem_cache_reap_func(thread_call_param_t dummy, thread_call_param_t arg)
2851	{
2852	#pragma unused(dummy)
2853	void (func)(void*) = arg;
2854
2855	ASSERT(func == skmem_cache_reap_start \|\| func == skmem_cache_reap_done);
2856	func();
2857	}
2858
2859	/*
2860	* Start reaping all caches; this is serialized via thread call.
2861	*/
2862	static void
2863	skmem_cache_reap_start(void)
2864	{
2865	SK_DF(SK_VERB_MEM_CACHE, "now running");
2866	skmem_cache_applyall(func: skmem_cache_reclaim, arg: skmem_lowmem_check());
2867	skmem_dispatch(skmem_cache_reap_tc, func: skmem_cache_reap_done,
2868	(skmem_cache_update_interval * NSEC_PER_SEC));
2869	}
2870
2871	/*
2872	* Stop reaping; this would allow another reap request to occur.
2873	*/
2874	static void
2875	skmem_cache_reap_done(void)
2876	{
2877	volatile uint32_t *flag = &skmem_cache_reaping;
2878
2879	*flag = `0`;
2880	os_atomic_thread_fence(seq_cst);
2881	}
2882
2883	/*
2884	* Immediately reap all unused memory of a cache. If purging,
2885	* also purge the cached objects at the CPU layer.
2886	*/
2887	void
2888	skmem_cache_reap_now(struct skmem_cache *skm, boolean_t purge)
2889	{
2890	/ if SKM_MODE_RECLIAM flag is set for this cache, we purge /
2891	if (purge \|\| (skm->skm_mode & SKM_MODE_RECLAIM)) {
2892	/*
2893	* If another thread is in the process of purging or
2894	* resizing, bail out and let the currently-ongoing
2895	* purging take its natural course.
2896	*/
2897	if (skmem_cache_resize_enter(skm, FALSE) == `0`) {
2898	skmem_cache_magazine_purge(skm);
2899	skmem_cache_magazine_enable(skm, arg: `0`);
2900	skmem_cache_resize_exit(skm);
2901	}
2902	} else {
2903	skmem_depot_ws_zero(skm);
2904	skmem_depot_ws_reap(skm);
2905
2906	/ clean up cp_ploaded magazines from each CPU /
2907	SKM_SLAB_LOCK_ASSERT_NOTHELD(skm);
2908
2909	struct skmem_cpu_cache *cp;
2910	struct skmem_mag *pmg;
2911	int prounds;
2912	uint32_t cpuid;
2913
2914	for (cpuid = `0`; cpuid < ncpu; cpuid++) {
2915	cp = &skm->skm_cpu_cache[cpuid];
2916
2917	SKM_CPU_LOCK_SPIN(cp);
2918	pmg = cp->cp_ploaded;
2919	prounds = cp->cp_prounds;
2920
2921	cp->cp_ploaded = NULL;
2922	cp->cp_prounds = -`1`;
2923	SKM_CPU_UNLOCK(cp);
2924
2925	if (pmg != NULL) {
2926	skmem_magazine_destroy(skm, mg: pmg, nrounds: prounds);
2927	}
2928	}
2929	}
2930	}
2931
2932	/*
2933	* Request a global reap operation to be dispatched.
2934	*/
2935	void
2936	skmem_cache_reap(void)
2937	{
2938	/ only one reaping episode is allowed at a time /
2939	if (skmem_lock_owner == current_thread() \|\|
2940	!os_atomic_cmpxchg(&skmem_cache_reaping, `0`, `1`, acq_rel)) {
2941	return;
2942	}
2943
2944	skmem_dispatch(skmem_cache_reap_tc, func: skmem_cache_reap_start, `0`);
2945	}
2946
2947	/*
2948	* Reap internal caches.
2949	*/
2950	void
2951	skmem_reap_caches(boolean_t purge)
2952	{
2953	skmem_cache_reap_now(skm: skmem_slab_cache, purge);
2954	skmem_cache_reap_now(skm: skmem_bufctl_cache, purge);
2955
2956	/ packet buffer pool objects /
2957	pp_reap_caches(purge);
2958
2959	/ also handle the region cache(s) /
2960	skmem_region_reap_caches(purge);
2961	}
2962
2963	/*
2964	* Thread call callback for update.
2965	*/
2966	static void
2967	skmem_cache_update_func(thread_call_param_t dummy, thread_call_param_t arg)
2968	{
2969	#pragma unused(dummy, arg)
2970	sk_protect_t protect;
2971
2972	protect = sk_cache_update_protect();
2973	skmem_cache_applyall(func: skmem_cache_update, arg: `0`);
2974	sk_cache_update_unprotect(protect);
2975
2976	skmem_dispatch(skmem_cache_update_tc, NULL,
2977	(skmem_cache_update_interval * NSEC_PER_SEC));
2978	}
2979
2980	/*
2981	* Given a buffer control, record the current transaction.
2982	*/
2983	__attribute__((noinline, cold, not_tail_called))
2984	static inline void
2985	skmem_audit_bufctl(struct skmem_bufctl *bc)
2986	{
2987	struct skmem_bufctl_audit bca = (struct* skmem_bufctl_audit *)bc;
2988	struct timeval tv;
2989
2990	microuptime(tv: &tv);
2991	bca->bc_thread = current_thread();
2992	bca->bc_timestamp = (uint32_t)((tv.tv_sec * `1000`) + (tv.tv_usec / `1000`));
2993	bca->bc_depth = OSBacktrace(bt: bca->bc_stack, SKMEM_STACK_DEPTH);
2994	}
2995
2996	/*
2997	* Given an object, find its buffer control and record the transaction.
2998	*/
2999	__attribute__((noinline, cold, not_tail_called))
3000	static inline void
3001	skmem_audit_buf(struct skmem_cache skm, struct* skmem_obj *list)
3002	{
3003	struct skmem_bufctl_bkt *bcb;
3004	struct skmem_bufctl *bc;
3005
3006	ASSERT(!(skm->skm_mode & SKM_MODE_PSEUDO));
3007
3008	SKM_SLAB_LOCK(skm);
3009	while (list != NULL) {
3010	void *buf = list;
3011
3012	bcb = SKMEM_CACHE_HASH(skm, buf);
3013	SLIST_FOREACH(bc, &bcb->bcb_head, bc_link) {
3014	if (bc->bc_addr == buf) {
3015	break;
3016	}
3017	}
3018
3019	if (__improbable(bc == NULL)) {
3020	panic("%s: %s failed to get bufctl for %p",
3021	__func__, skm->skm_name, buf);
3022	/ NOTREACHED /
3023	__builtin_unreachable();
3024	}
3025
3026	skmem_audit_bufctl(bc);
3027
3028	if (!(skm->skm_mode & SKM_MODE_BATCH)) {
3029	break;
3030	}
3031
3032	list = list->mo_next;
3033	}
3034	SKM_SLAB_UNLOCK(skm);
3035	}
3036
3037	static size_t
3038	skmem_cache_mib_get_stats(struct skmem_cache skm, void* *out, size_t len)
3039	{
3040	size_t actual_space = sizeof(struct sk_stats_cache);
3041	struct sk_stats_cache *sca = out;
3042	int contention;
3043
3044	if (out == NULL \|\| len < actual_space) {
3045	goto done;
3046	}
3047
3048	bzero(s: sca, n: sizeof(*sca));
3049	(void) snprintf(sca->sca_name, count: sizeof(sca->sca_name), "%s",
3050	skm->skm_name);
3051	uuid_copy(dst: sca->sca_uuid, src: skm->skm_uuid);
3052	uuid_copy(dst: sca->sca_ruuid, src: skm->skm_region->skr_uuid);
3053	sca->sca_mode = skm->skm_mode;
3054	sca->sca_bufsize = (uint64_t)skm->skm_bufsize;
3055	sca->sca_objsize = (uint64_t)skm->skm_objsize;
3056	sca->sca_chunksize = (uint64_t)skm->skm_chunksize;
3057	sca->sca_slabsize = (uint64_t)skm->skm_slabsize;
3058	sca->sca_bufalign = (uint64_t)skm->skm_bufalign;
3059	sca->sca_objalign = (uint64_t)skm->skm_objalign;
3060
3061	sca->sca_cpu_mag_size = skm->skm_cpu_mag_size;
3062	sca->sca_cpu_mag_resize = skm->skm_cpu_mag_resize;
3063	sca->sca_cpu_mag_purge = skm->skm_cpu_mag_purge;
3064	sca->sca_cpu_mag_reap = skm->skm_cpu_mag_reap;
3065	sca->sca_depot_full = skm->skm_depot_full;
3066	sca->sca_depot_empty = skm->skm_depot_empty;
3067	sca->sca_depot_ws_zero = skm->skm_depot_ws_zero;
3068	/ in case of a race this might be a negative value, turn it into 0 /
3069	if ((contention = (int)(skm->skm_depot_contention -
3070	skm->skm_depot_contention_prev)) < `0`) {
3071	contention = `0`;
3072	}
3073	sca->sca_depot_contention_factor = contention;
3074
3075	sca->sca_cpu_rounds = `0`;
3076	sca->sca_cpu_prounds = `0`;
3077	for (int cpuid = `0`; cpuid < ncpu; cpuid++) {
3078	struct skmem_cpu_cache *ccp = &skm->skm_cpu_cache[cpuid];
3079
3080	SKM_CPU_LOCK(ccp);
3081	if (ccp->cp_rounds > -`1`) {
3082	sca->sca_cpu_rounds += ccp->cp_rounds;
3083	}
3084	if (ccp->cp_prounds > -`1`) {
3085	sca->sca_cpu_prounds += ccp->cp_prounds;
3086	}
3087	SKM_CPU_UNLOCK(ccp);
3088	}
3089
3090	sca->sca_sl_create = skm->skm_sl_create;
3091	sca->sca_sl_destroy = skm->skm_sl_destroy;
3092	sca->sca_sl_alloc = skm->skm_sl_alloc;
3093	sca->sca_sl_free = skm->skm_sl_free;
3094	sca->sca_sl_alloc_fail = skm->skm_sl_alloc_fail;
3095	sca->sca_sl_partial = skm->skm_sl_partial;
3096	sca->sca_sl_empty = skm->skm_sl_empty;
3097	sca->sca_sl_bufinuse = skm->skm_sl_bufinuse;
3098	sca->sca_sl_rescale = skm->skm_sl_rescale;
3099	sca->sca_sl_hash_size = (skm->skm_hash_mask + `1`);
3100
3101	done:
3102	return actual_space;
3103	}
3104
3105	static int
3106	skmem_cache_mib_get_sysctl SYSCTL_HANDLER_ARGS
3107	{
3108	#pragma unused(arg1, arg2, oidp)
3109	struct skmem_cache *skm;
3110	size_t actual_space;
3111	size_t buffer_space;
3112	size_t allocated_space;
3113	caddr_t buffer = NULL;
3114	caddr_t scan;
3115	int error = `0`;
3116
3117	if (!kauth_cred_issuser(cred: kauth_cred_get())) {
3118	return EPERM;
3119	}
3120
3121	net_update_uptime();
3122	buffer_space = req->oldlen;
3123	if (req->oldptr != USER_ADDR_NULL && buffer_space != `0`) {
3124	if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
3125	buffer_space = SK_SYSCTL_ALLOC_MAX;
3126	}
3127	allocated_space = buffer_space;
3128	buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_cache_mib);
3129	if (__improbable(buffer == NULL)) {
3130	return ENOBUFS;
3131	}
3132	} else if (req->oldptr == USER_ADDR_NULL) {
3133	buffer_space = `0`;
3134	}
3135	actual_space = `0`;
3136	scan = buffer;
3137
3138	SKMEM_CACHE_LOCK();
3139	TAILQ_FOREACH(skm, &skmem_cache_head, skm_link) {
3140	size_t size = skmem_cache_mib_get_stats(skm, out: scan, len: buffer_space);
3141	if (scan != NULL) {
3142	if (buffer_space < size) {
3143	/ supplied buffer too small, stop copying /
3144	error = ENOMEM;
3145	break;
3146	}
3147	scan += size;
3148	buffer_space -= size;
3149	}
3150	actual_space += size;
3151	}
3152	SKMEM_CACHE_UNLOCK();
3153
3154	if (actual_space != `0`) {
3155	int out_error = SYSCTL_OUT(req, buffer, actual_space);
3156	if (out_error != `0`) {
3157	error = out_error;
3158	}
3159	}
3160	if (buffer != NULL) {
3161	sk_free_data(buffer, allocated_space);
3162	}
3163
3164	return error;
3165	}
3166

Browse the source code of xnu/bsd/skywalk/mem/skmem_cache.c