1/*
2 * Copyright (c) 2015-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_FREE_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56
57/*
58 * un-comment the following lines to debug the link/prepost tables
59 * NOTE: this expands each element by ~40 bytes
60 */
61//#define KEEP_WAITQ_LINK_STATS
62//#define KEEP_WAITQ_PREPOST_STATS
63
64#include <kern/ast.h>
65#include <kern/backtrace.h>
66#include <kern/kern_types.h>
67#include <kern/ltable.h>
68#include <kern/mach_param.h>
69#include <kern/queue.h>
70#include <kern/sched_prim.h>
71#include <kern/simple_lock.h>
72#include <kern/spl.h>
73#include <kern/waitq.h>
74#include <kern/zalloc.h>
75#include <kern/policy_internal.h>
76#include <kern/turnstile.h>
77
78#include <libkern/OSAtomic.h>
79#include <mach/sync_policy.h>
80#include <vm/vm_kern.h>
81
82#include <sys/kdebug.h>
83
84#if defined(KEEP_WAITQ_LINK_STATS) || defined(KEEP_WAITQ_PREPOST_STATS)
85# if !CONFIG_LTABLE_STATS
86# error "You must configure LTABLE_STATS to use WAITQ_[LINK|PREPOST]_STATS"
87# endif
88# if !CONFIG_WAITQ_STATS
89# error "You must configure WAITQ_STATS to use WAITQ_[LINK|PREPOST]_STATS"
90# endif
91#endif
92
93#if CONFIG_WAITQ_DEBUG
94#define wqdbg(fmt,...) \
95 printf("WQ[%s]: " fmt "\n", __func__, ## __VA_ARGS__)
96#else
97#define wqdbg(fmt,...) do { } while (0)
98#endif
99
100#ifdef WAITQ_VERBOSE_DEBUG
101#define wqdbg_v(fmt,...) \
102 printf("WQ[v:%s]: " fmt "\n", __func__, ## __VA_ARGS__)
103#else
104#define wqdbg_v(fmt,...) do { } while (0)
105#endif
106
107#define wqinfo(fmt,...) \
108 printf("WQ[%s]: " fmt "\n", __func__, ## __VA_ARGS__)
109
110#define wqerr(fmt,...) \
111 printf("WQ[%s] ERROR: " fmt "\n", __func__, ## __VA_ARGS__)
112
113/*
114 * file-static functions / data
115 */
116static thread_t waitq_select_one_locked(struct waitq *waitq, event64_t event,
117 uint64_t *reserved_preposts,
118 int priority, spl_t *spl);
119
120static kern_return_t waitq_select_thread_locked(struct waitq *waitq,
121 event64_t event,
122 thread_t thread, spl_t *spl);
123
124#define WAITQ_SET_MAX (task_max * 3)
125static zone_t waitq_set_zone;
126
127
128#define P2ROUNDUP(x, align) (-(-((uint32_t)(x)) & -(align)))
129#define ROUNDDOWN(x,y) (((x)/(y))*(y))
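/*
 * Illustrative arithmetic (comment only, not compiled): P2ROUNDUP() uses
 * two's-complement negation to round up to a power-of-two boundary, while
 * ROUNDDOWN() simply truncates. With align = PAGE_SIZE = 4096 (assumed here
 * only for the example):
 *
 *   P2ROUNDUP(10000, 4096) = -(-10000 & -4096)
 *                          = -(0xffffd8f0 & 0xfffff000)    (32-bit view)
 *                          = -(0xffffd000)
 *                          = 0x3000 = 12288
 *
 *   ROUNDDOWN(10000, 4096) = (10000 / 4096) * 4096 = 8192
 *
 * Note that P2ROUNDUP() is only meaningful when 'align' is a power of two.
 */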
130
131
132#if CONFIG_LTABLE_STATS || CONFIG_WAITQ_STATS
133static __inline__ void waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], int skip);
134#endif
135
136#if __arm64__
137
138#define waitq_lock_to(wq,to) \
139 (hw_lock_bit_to(&(wq)->waitq_interlock, LCK_ILOCK, to))
140
141#define waitq_lock_unlock(wq) \
142 (hw_unlock_bit(&(wq)->waitq_interlock, LCK_ILOCK))
143
144#define waitq_lock_init(wq) \
145 (wq->waitq_interlock = 0)
146
147#else
148
149#define waitq_lock_to(wq,to) \
150 (hw_lock_to(&(wq)->waitq_interlock, to))
151
152#define waitq_lock_unlock(wq) \
153 (hw_lock_unlock(&(wq)->waitq_interlock))
154
155#define waitq_lock_init(wq) \
156 (hw_lock_init(&(wq)->waitq_interlock))
157
158#endif /* __arm64__ */
159
160/*
161 * Prepost callback function for specially marked waitq sets
162 * (prepost alternative)
163 */
164extern void waitq_set__CALLING_PREPOST_HOOK__(void *ctx, void *memberctx, int priority);
165
166#define DEFAULT_MIN_FREE_TABLE_ELEM 100
167static uint32_t g_min_free_table_elem;
168static uint32_t g_min_free_cache;
169
170
171/* ----------------------------------------------------------------------
172 *
173 * SetID Link Table Implementation
174 *
175 * ---------------------------------------------------------------------- */
176static struct link_table g_wqlinktable;
177
178enum wq_link_type {
179 WQL_ALL = -1,
180 WQL_FREE = LT_FREE,
181 WQL_WQS = LT_ELEM,
182 WQL_LINK = LT_LINK,
183};
184
185struct waitq_link {
186 struct lt_elem wqte;
187
188 union {
189 /* wqt_type == WQL_WQS (LT_ELEM) */
190 struct {
191 struct waitq_set *wql_set;
192 /* uint64_t sl_prepost_id; */
193 } wql_wqs;
194
195 /* wqt_type == WQL_LINK (LT_LINK) */
196 struct {
197 uint64_t left_setid;
198 uint64_t right_setid;
199 } wql_link;
200 };
201#ifdef KEEP_WAITQ_LINK_STATS
202 thread_t sl_alloc_th;
203 task_t sl_alloc_task;
204 uintptr_t sl_alloc_bt[NWAITQ_BTFRAMES];
205 uint64_t sl_alloc_ts;
206 uintptr_t sl_invalidate_bt[NWAITQ_BTFRAMES];
207 uint64_t sl_invalidate_ts;
208 uintptr_t sl_mkvalid_bt[NWAITQ_BTFRAMES];
209 uint64_t sl_mkvalid_ts;
210 uint64_t sl_free_ts;
211#endif
212};
213#if !defined(KEEP_WAITQ_LINK_STATS)
214static_assert((sizeof(struct waitq_link) & (sizeof(struct waitq_link) - 1)) == 0,
215 "waitq_link struct must be a power of two!");
216#endif
217
218#define wql_refcnt(link) \
219 (lt_bits_refcnt((link)->wqte.lt_bits))
220
221#define wql_type(link) \
222 (lt_bits_type((link)->wqte.lt_bits))
223
224#define wql_mkvalid(link) \
225 do { \
226 lt_elem_mkvalid(&(link)->wqte); \
227 wql_do_mkvalid_stats(&(link)->wqte); \
228 } while (0)
229
230#define wql_is_valid(link) \
231 lt_bits_valid((link)->wqte.lt_bits)
232
233#define wql_setid wqte.lt_id
234
235#define WQL_WQS_POISON ((void *)(0xf00df00d))
236#define WQL_LINK_POISON (0x0bad0badffffffffull)
237
238static void wql_poison(struct link_table *table, struct lt_elem *elem)
239{
240 struct waitq_link *link = (struct waitq_link *)elem;
241 (void)table;
242
243 switch (wql_type(link)) {
244 case WQL_WQS:
245 link->wql_wqs.wql_set = WQL_WQS_POISON;
246 break;
247 case WQL_LINK:
248 link->wql_link.left_setid = WQL_LINK_POISON;
249 link->wql_link.right_setid = WQL_LINK_POISON;
250 break;
251 default:
252 break;
253 }
254#ifdef KEEP_WAITQ_LINK_STATS
255 memset(link->sl_alloc_bt, 0, sizeof(link->sl_alloc_bt));
256 link->sl_alloc_ts = 0;
257 memset(link->sl_mkvalid_bt, 0, sizeof(link->sl_mkvalid_bt));
258 link->sl_mkvalid_ts = 0;
259
260 link->sl_alloc_th = THREAD_NULL;
261 /* leave the sl_alloc_task in place for debugging */
262
263 link->sl_free_ts = mach_absolute_time();
264#endif
265}
266
267#ifdef KEEP_WAITQ_LINK_STATS
268static __inline__ void wql_do_alloc_stats(struct lt_elem *elem)
269{
270 if (elem) {
271 struct waitq_link *link = (struct waitq_link *)elem;
272 memset(link->sl_alloc_bt, 0, sizeof(link->sl_alloc_bt));
273 waitq_grab_backtrace(link->sl_alloc_bt, 0);
274 link->sl_alloc_th = current_thread();
275 link->sl_alloc_task = current_task();
276
277 assert(link->sl_alloc_ts == 0);
278 link->sl_alloc_ts = mach_absolute_time();
279
280 memset(link->sl_invalidate_bt, 0, sizeof(link->sl_invalidate_bt));
281 link->sl_invalidate_ts = 0;
282 }
283}
284
285static __inline__ void wql_do_invalidate_stats(struct lt_elem *elem)
286{
287 struct waitq_link *link = (struct waitq_link *)elem;
288
289 if (!elem)
290 return;
291
292 assert(link->sl_mkvalid_ts > 0);
293
294 memset(link->sl_invalidate_bt, 0, sizeof(link->sl_invalidate_bt));
295 link->sl_invalidate_ts = mach_absolute_time();
296 waitq_grab_backtrace(link->sl_invalidate_bt, 0);
297}
298
299static __inline__ void wql_do_mkvalid_stats(struct lt_elem *elem)
300{
301 struct waitq_link *link = (struct waitq_link *)elem;
302
303 if (!elem)
304 return;
305
306 memset(link->sl_mkvalid_bt, 0, sizeof(link->sl_mkvalid_bt));
307 link->sl_mkvalid_ts = mach_absolute_time();
308 waitq_grab_backtrace(link->sl_mkvalid_bt, 0);
309}
310#else
311#define wql_do_alloc_stats(e)
312#define wql_do_invalidate_stats(e)
313#define wql_do_mkvalid_stats(e)
314#endif /* KEEP_WAITQ_LINK_STATS */
315
316static void wql_init(void)
317{
318 uint32_t tablesz = 0, max_links = 0;
319
320 if (PE_parse_boot_argn("wql_tsize", &tablesz, sizeof(tablesz)) != TRUE)
321 tablesz = (uint32_t)g_lt_max_tbl_size;
322
323 tablesz = P2ROUNDUP(tablesz, PAGE_SIZE);
324 max_links = tablesz / sizeof(struct waitq_link);
325 assert(max_links > 0 && tablesz > 0);
326
327 /* we have a restricted index range */
328 if (max_links > (LT_IDX_MAX + 1))
329 max_links = LT_IDX_MAX + 1;
330
331 wqinfo("init linktable with max:%d elements (%d bytes)",
332 max_links, tablesz);
333 ltable_init(&g_wqlinktable, "wqslab.wql", max_links,
334 sizeof(struct waitq_link), wql_poison);
335}
336
337static void wql_ensure_free_space(void)
338{
339 if (g_wqlinktable.nelem - g_wqlinktable.used_elem < g_min_free_table_elem) {
340 /*
341 * we don't hold locks on these values, so check for underflow
342 */
343 if (g_wqlinktable.used_elem <= g_wqlinktable.nelem) {
344 wqdbg_v("Forcing table growth: nelem=%d, used=%d, min_free=%d",
345 g_wqlinktable.nelem, g_wqlinktable.used_elem,
346 g_min_free_table_elem);
347 ltable_grow(&g_wqlinktable, g_min_free_table_elem);
348 }
349 }
350}
351
352static struct waitq_link *wql_alloc_link(int type)
353{
354 struct lt_elem *elem;
355
356 elem = ltable_alloc_elem(&g_wqlinktable, type, 1, 0);
357 wql_do_alloc_stats(elem);
358 return (struct waitq_link *)elem;
359}
360
361static void wql_realloc_link(struct waitq_link *link, int type)
362{
363 ltable_realloc_elem(&g_wqlinktable, &link->wqte, type);
364#ifdef KEEP_WAITQ_LINK_STATS
365 memset(link->sl_alloc_bt, 0, sizeof(link->sl_alloc_bt));
366 link->sl_alloc_ts = 0;
367 wql_do_alloc_stats(&link->wqte);
368
369 memset(link->sl_invalidate_bt, 0, sizeof(link->sl_invalidate_bt));
370 link->sl_invalidate_ts = 0;
371#endif
372}
373
374static void wql_invalidate(struct waitq_link *link)
375{
376 lt_elem_invalidate(&link->wqte);
377 wql_do_invalidate_stats(&link->wqte);
378}
379
380static struct waitq_link *wql_get_link(uint64_t setid)
381{
382 struct lt_elem *elem;
383
384 elem = ltable_get_elem(&g_wqlinktable, setid);
385 return (struct waitq_link *)elem;
386}
387
388static void wql_put_link(struct waitq_link *link)
389{
390 if (!link)
391 return;
392 ltable_put_elem(&g_wqlinktable, (struct lt_elem *)link);
393}
394
395static struct waitq_link *wql_get_reserved(uint64_t setid, int type)
396{
397 struct lt_elem *elem;
398
399 elem = lt_elem_list_first(&g_wqlinktable, setid);
400 if (!elem)
401 return NULL;
402 ltable_realloc_elem(&g_wqlinktable, elem, type);
403 return (struct waitq_link *)elem;
404}
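/*
 * Illustrative sketch (disabled, not part of the build): the typical
 * lifecycle of a WQL_WQS link element using the helpers above. The
 * 'example_wqset' parameter and the function itself are hypothetical;
 * the call sequence mirrors how the real code allocates, publishes,
 * looks up and retires link table elements.
 */
#if 0
static void wql_lifecycle_sketch(struct waitq_set *example_wqset)
{
	/* allocate an element from the link table (holds one reference) */
	struct waitq_link *link = wql_alloc_link(WQL_WQS);
	if (!link)
		return;

	/* fill in the type-specific payload, then mark the element valid */
	link->wql_wqs.wql_set = example_wqset;
	wql_mkvalid(link);

	/* other code paths look the element up by ID, taking a reference */
	struct waitq_link *tmp = wql_get_link(link->wql_setid.id);
	if (tmp) {
		/* ... use tmp->wql_wqs.wql_set under appropriate locks ... */
		wql_put_link(tmp);      /* drop the lookup reference */
	}

	/* tear down: invalidate the element, then drop the last reference */
	wql_invalidate(link);
	wql_put_link(link);
}
#endif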
405
406
407static inline int waitq_maybe_remove_link(struct waitq *waitq,
408 uint64_t setid,
409 struct waitq_link *parent,
410 struct waitq_link *left,
411 struct waitq_link *right);
412
413enum {
414 LINK_WALK_ONE_LEVEL = 0,
415 LINK_WALK_FULL_DAG = 1,
416 LINK_WALK_FULL_DAG_UNLOCKED = 2,
417};
418
419typedef int (*wql_callback_func)(struct waitq *waitq, void *ctx,
420 struct waitq_link *link);
421
422/**
423 * walk_waitq_links: walk all table elements (of type 'link_type') pointed to by 'setid'
424 *
425 * Conditions:
426 * waitq is locked (or NULL)
427 * 'setid' is managed by 'waitq'
428 * this could be direct (waitq->waitq_set_id == setid)
429 * OR indirect (setid is the left/right ID in a LINK chain,
430 * whose root is waitq->waitq_set_id)
431 *
432 * Notes:
433 * This function uses recursion to walk the set of table elements
434 * pointed to by 'setid'. For each element encountered, 'cb' will be
435 * called. If non-zero, the return value of this callback function can
436 * early-out of the table walk.
437 *
438 * For each link element encountered, the function takes a reference to
439 * it. The reference is dropped only after the callback and any recursion
440 * has completed.
441 *
442 * The assumed table/link/tree structure:
443 * 'setid'
444 * / \
445 * / \
446 * L(LINK) R(LINK)
447 * /\ /\
448 * / \ / \
449 * / \ Rl(*) Rr(*)
450 * Ll(*) Lr(*) /\ /\
451 * /\ /\ ... ... ... ...
452 * ... ... ... ...
453 * \
454 * WQS(wqset_q.waitq_setid == Sx)
455 * (waitq set is a member of setid 'Sx')
456 *
457 * 'Sx'
458 * / \
459 * / \
460 * L(LINK) R(LINK)
461 * /\ /\
462 * ... ... ... ...
463 *
464 * The basic algorithm is as follows:
465 * *) take a reference to the table object pointed to by 'setid'
466 * *) if appropriate, call 'cb' (potentially early-out on non-zero return)
467 * *) if the link object points to a waitq set, and the walk type
468 * is 'FULL_DAG' (full directed-acyclic-graph), then try to lock
469 * the associated waitq set object and recursively walk all sets to
470 * which that set belongs. This is a DFS of the tree structure.
471 * *) recurse down the left side of the tree (following the
472 * 'left_setid' pointer in the link object)
473 * *) recurse down the right side of the tree (following the
474 * 'right_setid' pointer in the link object)
475 */
476static __attribute__((noinline))
477int walk_waitq_links(int walk_type, struct waitq *waitq,
478 uint64_t setid, int link_type,
479 void *ctx, wql_callback_func cb)
480{
481 struct waitq_link *link;
482 uint64_t nextid;
483 int wqltype;
484
485 link = wql_get_link(setid);
486
487 /* invalid link */
488 if (!link)
489 return WQ_ITERATE_CONTINUE;
490
491 setid = nextid = 0;
492 wqltype = wql_type(link);
493 if (wqltype == WQL_LINK) {
494 setid = link->wql_link.left_setid;
495 nextid = link->wql_link.right_setid;
496 }
497
498 /*
499 * Make the callback only on specified link_type (or all links)
500 * Note that after the callback, the link object may be
501 * invalid. The only valid thing we can do is put our
502 * reference to it (which may put it back on the free list)
503 */
504 if (link_type == WQL_ALL || link_type == wqltype) {
505 /* allow the callback to early-out */
506 int ret = cb(waitq, ctx, link);
507 if (ret != WQ_ITERATE_CONTINUE) {
508 wql_put_link(link);
509 return ret;
510 }
511 }
512
513 if (wqltype == WQL_WQS &&
514 (walk_type == LINK_WALK_FULL_DAG ||
515 walk_type == LINK_WALK_FULL_DAG_UNLOCKED)) {
516 /*
517 * Recurse down any sets to which this wait queue set was
518 * added. We do this just before we put our reference to
519 * the link object (which may free it).
520 */
521 struct waitq_set *wqset = link->wql_wqs.wql_set;
522 int ret = WQ_ITERATE_CONTINUE;
523 int should_unlock = 0;
524 uint64_t wqset_setid = 0;
525
526 if (waitq_set_is_valid(wqset) && walk_type == LINK_WALK_FULL_DAG) {
527 assert(!waitq_irq_safe(&wqset->wqset_q));
528 waitq_set_lock(wqset);
529 should_unlock = 1;
530 }
531
532 /*
533 * verify the linked waitq set as it could have been
534 * invalidated before we grabbed the lock!
535 */
536 if (wqset->wqset_id != link->wql_setid.id) {
537 /* This is the bottom of the tree: just get out */
538 if (should_unlock) {
539 waitq_set_unlock(wqset);
540 }
541 wql_put_link(link);
542 return WQ_ITERATE_CONTINUE;
543 }
544
545 wqset_setid = wqset->wqset_q.waitq_set_id;
546
547 if (wqset_setid > 0)
548 ret = walk_waitq_links(walk_type, &wqset->wqset_q,
549 wqset_setid, link_type, ctx, cb);
550 if (should_unlock) {
551 waitq_set_unlock(wqset);
552 }
553 if (ret != WQ_ITERATE_CONTINUE) {
554 wql_put_link(link);
555 return ret;
556 }
557 }
558
559 wql_put_link(link);
560
561 /* recurse down left side of the tree */
562 if (setid) {
563 int ret = walk_waitq_links(walk_type, waitq, setid, link_type, ctx, cb);
564 if (ret != WQ_ITERATE_CONTINUE)
565 return ret;
566 }
567
568 /* recurse down right side of the tree */
569 if (nextid)
570 return walk_waitq_links(walk_type, waitq, nextid, link_type, ctx, cb);
571
572 return WQ_ITERATE_CONTINUE;
573}
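/*
 * Illustrative sketch (disabled, not part of the build): driving
 * walk_waitq_links() with a simple callback. The callback and the counting
 * helper are hypothetical, but the calling convention (WQL_WQS filter,
 * WQ_ITERATE_* return codes, waitq->waitq_set_id as the root of the walk)
 * mirrors the real callers later in this file.
 */
#if 0
static int count_wqs_cb(struct waitq *waitq, void *ctx,
                        struct waitq_link *link)
{
	int *count = (int *)ctx;

	(void)waitq;
	assert(wql_type(link) == WQL_WQS);
	(*count)++;
	return WQ_ITERATE_CONTINUE;     /* keep walking */
}

/* count the sets that 'waitq' (locked by the caller) belongs to */
static int count_member_sets(struct waitq *waitq)
{
	int count = 0;

	(void)walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq,
	                       waitq->waitq_set_id, WQL_WQS,
	                       (void *)&count, count_wqs_cb);
	return count;
}
#endif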
574
575/* ----------------------------------------------------------------------
576 *
577 * Prepost Link Table Implementation
578 *
579 * ---------------------------------------------------------------------- */
580static struct link_table g_prepost_table;
581
582enum wq_prepost_type {
583 WQP_FREE = LT_FREE,
584 WQP_WQ = LT_ELEM,
585 WQP_POST = LT_LINK,
586};
587
588struct wq_prepost {
589 struct lt_elem wqte;
590
591 union {
592 /* wqt_type == WQP_WQ (LT_ELEM) */
593 struct {
594 struct waitq *wqp_wq_ptr;
595 } wqp_wq;
596 /* wqt_type == WQP_POST (LT_LINK) */
597 struct {
598 uint64_t wqp_next_id;
599 uint64_t wqp_wq_id;
600 } wqp_post;
601 };
602#ifdef KEEP_WAITQ_PREPOST_STATS
603 thread_t wqp_alloc_th;
604 task_t wqp_alloc_task;
605 uintptr_t wqp_alloc_bt[NWAITQ_BTFRAMES];
606#endif
607};
608#if !defined(KEEP_WAITQ_PREPOST_STATS)
609static_assert((sizeof(struct wq_prepost) & (sizeof(struct wq_prepost) - 1)) == 0,
610 "wq_prepost struct must be a power of two!");
611#endif
612
613#define wqp_refcnt(wqp) \
614 (lt_bits_refcnt((wqp)->wqte.lt_bits))
615
616#define wqp_type(wqp) \
617 (lt_bits_type((wqp)->wqte.lt_bits))
618
619#define wqp_set_valid(wqp) \
620 lt_elem_mkvalid(&(wqp)->wqte)
621
622#define wqp_is_valid(wqp) \
623 lt_bits_valid((wqp)->wqte.lt_bits)
624
625#define wqp_prepostid wqte.lt_id
626
627#define WQP_WQ_POISON (0x0bad0badffffffffull)
628#define WQP_POST_POISON (0xf00df00df00df00d)
629
630static void wqp_poison(struct link_table *table, struct lt_elem *elem)
631{
632 struct wq_prepost *wqp = (struct wq_prepost *)elem;
633 (void)table;
634
635 switch (wqp_type(wqp)) {
636 case WQP_WQ:
637 break;
638 case WQP_POST:
639 wqp->wqp_post.wqp_next_id = WQP_POST_POISON;
640 wqp->wqp_post.wqp_wq_id = WQP_POST_POISON;
641 break;
642 default:
643 break;
644 }
645}
646
647#ifdef KEEP_WAITQ_PREPOST_STATS
648static __inline__ void wqp_do_alloc_stats(struct lt_elem *elem)
649{
650 if (!elem)
651 return;
652
653 struct wq_prepost *wqp = (struct wq_prepost *)elem;
654 uintptr_t alloc_bt[NWAITQ_BTFRAMES];
655
656 waitq_grab_backtrace(alloc_bt, NWAITQ_BTFRAMES);
657
658 /* be sure to take stats for _all_ allocated objects */
659 for (;;) {
660 memcpy(wqp->wqp_alloc_bt, alloc_bt, sizeof(alloc_bt));
661 wqp->wqp_alloc_th = current_thread();
662 wqp->wqp_alloc_task = current_task();
663 wqp = (struct wq_prepost *)lt_elem_list_next(&g_prepost_table, &wqp->wqte);
664 if (!wqp)
665 break;
666 }
667}
668#else
669#define wqp_do_alloc_stats(e)
670#endif /* KEEP_WAITQ_PREPOST_STATS */
671
672static void wqp_init(void)
673{
674 uint32_t tablesz = 0, max_wqp = 0;
675
676 if (PE_parse_boot_argn("wqp_tsize", &tablesz, sizeof(tablesz)) != TRUE)
677 tablesz = (uint32_t)g_lt_max_tbl_size;
678
679 tablesz = P2ROUNDUP(tablesz, PAGE_SIZE);
680 max_wqp = tablesz / sizeof(struct wq_prepost);
681 assert(max_wqp > 0 && tablesz > 0);
682
683 /* we have a restricted index range */
684 if (max_wqp > (LT_IDX_MAX + 1))
685 max_wqp = LT_IDX_MAX + 1;
686
687 wqinfo("init prepost table with max:%d elements (%d bytes)",
688 max_wqp, tablesz);
689 ltable_init(&g_prepost_table, "wqslab.prepost", max_wqp,
690 sizeof(struct wq_prepost), wqp_poison);
691}
692
693/*
694 * Refill the per-CPU cache.
695 */
696static void wq_prepost_refill_cpu_cache(uint32_t nalloc)
697{
698 struct lt_elem *new_head, *old_head;
699 struct wqp_cache *cache;
700
701 /* require preemption enabled to allocate elements */
702 if (get_preemption_level() != 0)
703 return;
704
705 new_head = ltable_alloc_elem(&g_prepost_table,
706 LT_RESERVED, nalloc, 1);
707 if (new_head == NULL)
708 return;
709
710 disable_preemption();
711 cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
712
713 /* check once more before putting these elements on the list */
714 if (cache->avail >= WQP_CACHE_MAX) {
715 lt_elem_list_release(&g_prepost_table, new_head, LT_RESERVED);
716 enable_preemption();
717 return;
718 }
719
720 cache->avail += nalloc;
721 if (cache->head == 0 || cache->head == LT_IDX_MAX) {
722 cache->head = new_head->lt_id.id;
723 goto out;
724 }
725
726 old_head = lt_elem_list_first(&g_prepost_table, cache->head);
727 (void)lt_elem_list_link(&g_prepost_table, new_head, old_head);
728 cache->head = new_head->lt_id.id;
729
730out:
731 enable_preemption();
732 return;
733}
734
735static void wq_prepost_ensure_free_space(void)
736{
737 uint32_t free_elem;
738 uint32_t min_free;
739 struct wqp_cache *cache;
740
741 if (g_min_free_cache == 0)
742 g_min_free_cache = (WQP_CACHE_MAX * ml_get_max_cpus());
743
744 /*
745 * Ensure that we always have a pool of per-CPU prepost elements
746 */
747 disable_preemption();
748 cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
749 free_elem = cache->avail;
750 enable_preemption();
751
752 if (free_elem < (WQP_CACHE_MAX / 3))
753 wq_prepost_refill_cpu_cache(WQP_CACHE_MAX - free_elem);
754
755 /*
756 * Now ensure that we have a sufficient amount of free table space
757 */
758 free_elem = g_prepost_table.nelem - g_prepost_table.used_elem;
759 min_free = g_min_free_table_elem + g_min_free_cache;
760 if (free_elem < min_free) {
761 /*
762 * we don't hold locks on these values, so check for underflow
763 */
764 if (g_prepost_table.used_elem <= g_prepost_table.nelem) {
765 wqdbg_v("Forcing table growth: nelem=%d, used=%d, min_free=%d+%d",
766 g_prepost_table.nelem, g_prepost_table.used_elem,
767 g_min_free_table_elem, g_min_free_cache);
768 ltable_grow(&g_prepost_table, min_free);
769 }
770 }
771}
772
773static struct wq_prepost *wq_prepost_alloc(int type, int nelem)
774{
775 struct lt_elem *elem;
776 struct wq_prepost *wqp;
777 struct wqp_cache *cache;
778
779 if (type != LT_RESERVED)
780 goto do_alloc;
781 if (nelem == 0)
782 return NULL;
783
784 /*
785 * First try to grab the elements from the per-CPU cache if we are
786 * allocating RESERVED elements
787 */
788 disable_preemption();
789 cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
790 if (nelem <= (int)cache->avail) {
791 struct lt_elem *first, *next = NULL;
792 int nalloc = nelem;
793
794 cache->avail -= nelem;
795
796 /* grab the first element */
797 first = lt_elem_list_first(&g_prepost_table, cache->head);
798
799 /* find the last element and re-adjust the cache head */
800 for (elem = first; elem != NULL && nalloc > 0; elem = next) {
801 next = lt_elem_list_next(&g_prepost_table, elem);
802 if (--nalloc == 0) {
803 /* terminate the allocated list */
804 elem->lt_next_idx = LT_IDX_MAX;
805 break;
806 }
807 }
808 assert(nalloc == 0);
809 if (!next)
810 cache->head = LT_IDX_MAX;
811 else
812 cache->head = next->lt_id.id;
813 /* assert that we don't have mismatched bookkeeping */
814 assert(!(cache->head == LT_IDX_MAX && cache->avail > 0));
815 enable_preemption();
816 elem = first;
817 goto out;
818 }
819 enable_preemption();
820
821do_alloc:
822 /* fall-back to standard table allocation */
823 elem = ltable_alloc_elem(&g_prepost_table, type, nelem, 0);
824 if (!elem)
825 return NULL;
826
827out:
828 wqp = (struct wq_prepost *)elem;
829 wqp_do_alloc_stats(elem);
830 return wqp;
831}
832
833static void wq_prepost_invalidate(struct wq_prepost *wqp)
834{
835 lt_elem_invalidate(&wqp->wqte);
836}
837
838static struct wq_prepost *wq_prepost_get(uint64_t wqp_id)
839{
840 struct lt_elem *elem;
841
842 elem = ltable_get_elem(&g_prepost_table, wqp_id);
843 return (struct wq_prepost *)elem;
844}
845
846static void wq_prepost_put(struct wq_prepost *wqp)
847{
848 ltable_put_elem(&g_prepost_table, (struct lt_elem *)wqp);
849}
850
851static int wq_prepost_rlink(struct wq_prepost *parent, struct wq_prepost *child)
852{
853 return lt_elem_list_link(&g_prepost_table, &parent->wqte, &child->wqte);
854}
855
856static struct wq_prepost *wq_prepost_get_rnext(struct wq_prepost *head)
857{
858 struct lt_elem *elem;
859 struct wq_prepost *wqp;
860 uint64_t id;
861
862 elem = lt_elem_list_next(&g_prepost_table, &head->wqte);
863 if (!elem)
864 return NULL;
865 id = elem->lt_id.id;
866 elem = ltable_get_elem(&g_prepost_table, id);
867
868 if (!elem)
869 return NULL;
870 wqp = (struct wq_prepost *)elem;
871 if (elem->lt_id.id != id ||
872 wqp_type(wqp) != WQP_POST ||
873 wqp->wqp_post.wqp_next_id != head->wqp_prepostid.id) {
874 ltable_put_elem(&g_prepost_table, elem);
875 return NULL;
876 }
877
878 return wqp;
879}
880
881static void wq_prepost_reset_rnext(struct wq_prepost *wqp)
882{
883 (void)lt_elem_list_break(&g_prepost_table, &wqp->wqte);
884}
885
886
887/**
888 * remove 'wqp' from the prepost list on 'wqset'
889 *
890 * Conditions:
891 * wqset is locked
892 * caller holds a reference on wqp (and is responsible to release it)
893 *
894 * Result:
895 * wqp is invalidated, wqset is potentially updated with a new
896 * prepost ID, and the next element of the prepost list may be
897 * consumed as well (if the list contained only 2 objects)
898 */
899static int wq_prepost_remove(struct waitq_set *wqset,
900 struct wq_prepost *wqp)
901{
902 int more_posts = 1;
903 uint64_t next_id = wqp->wqp_post.wqp_next_id;
904 uint64_t wqp_id = wqp->wqp_prepostid.id;
905 struct wq_prepost *prev_wqp, *next_wqp;
906
907 assert(wqp_type(wqp) == WQP_POST);
908 assert(wqset->wqset_q.waitq_prepost == 1);
909
910 if (next_id == wqp_id) {
911 /* the list is singular and becoming empty */
912 wqset->wqset_prepost_id = 0;
913 more_posts = 0;
914 goto out;
915 }
916
917 prev_wqp = wq_prepost_get_rnext(wqp);
918 assert(prev_wqp != NULL);
919 assert(prev_wqp->wqp_post.wqp_next_id == wqp_id);
920 assert(prev_wqp->wqp_prepostid.id != wqp_id);
921 assert(wqp_type(prev_wqp) == WQP_POST);
922
923 if (prev_wqp->wqp_prepostid.id == next_id) {
924 /*
925 * There are two items in the list, and we're removing one. We
926 * only need to keep the WQP_WQ pointer from 'prev_wqp'
927 */
928 wqset->wqset_prepost_id = prev_wqp->wqp_post.wqp_wq_id;
929 wq_prepost_invalidate(prev_wqp);
930 wq_prepost_put(prev_wqp);
931 more_posts = 0;
932 goto out;
933 }
934
935 /* prev->next = next */
936 prev_wqp->wqp_post.wqp_next_id = next_id;
937
938 /* next->prev = prev */
939 next_wqp = wq_prepost_get(next_id);
940 assert(next_wqp != NULL);
941 assert(next_wqp != wqp);
942 assert(next_wqp != prev_wqp);
943 assert(wqp_type(next_wqp) == WQP_POST);
944
945 wq_prepost_reset_rnext(next_wqp);
946 wq_prepost_rlink(next_wqp, prev_wqp);
947
948 /* If we remove the head of the list, update the wqset */
949 if (wqp_id == wqset->wqset_prepost_id)
950 wqset->wqset_prepost_id = next_id;
951
952 wq_prepost_put(prev_wqp);
953 wq_prepost_put(next_wqp);
954
955out:
956 wq_prepost_reset_rnext(wqp);
957 wq_prepost_invalidate(wqp);
958 return more_posts;
959}
960
961static struct wq_prepost *wq_prepost_rfirst(uint64_t id)
962{
963 struct lt_elem *elem;
964 elem = lt_elem_list_first(&g_prepost_table, id);
965 wqp_do_alloc_stats(elem);
966 return (struct wq_prepost *)(void *)elem;
967}
968
969static struct wq_prepost *wq_prepost_rpop(uint64_t *id, int type)
970{
971 struct lt_elem *elem;
972 elem = lt_elem_list_pop(&g_prepost_table, id, type);
973 wqp_do_alloc_stats(elem);
974 return (struct wq_prepost *)(void *)elem;
975}
976
977static void wq_prepost_release_rlist(struct wq_prepost *wqp)
978{
979 int nelem = 0;
980 struct wqp_cache *cache;
981 struct lt_elem *elem;
982
983 if (!wqp)
984 return;
985
986 elem = &wqp->wqte;
987
988 /*
989 * These are reserved elements: release them back to the per-cpu pool
990 * if our cache is running low.
991 */
992 disable_preemption();
993 cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
994 if (cache->avail < WQP_CACHE_MAX) {
995 struct lt_elem *tmp = NULL;
996 if (cache->head != LT_IDX_MAX)
997 tmp = lt_elem_list_first(&g_prepost_table, cache->head);
998 nelem = lt_elem_list_link(&g_prepost_table, elem, tmp);
999 cache->head = elem->lt_id.id;
1000 cache->avail += nelem;
1001 enable_preemption();
1002 return;
1003 }
1004 enable_preemption();
1005
1006 /* release these elements back to the main table */
1007 nelem = lt_elem_list_release(&g_prepost_table, elem, LT_RESERVED);
1008
1009#if CONFIG_WAITQ_STATS
1010 g_prepost_table.nreserved_releases += 1;
1011 OSDecrementAtomic64(&g_prepost_table.nreservations);
1012#endif
1013}
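/*
 * Illustrative sketch (disabled, not part of the build): reserving prepost
 * elements ahead of time and returning the unused remainder. The function is
 * hypothetical; the higher-level reservation interfaces (not shown in this
 * section) are assumed to follow the same alloc / rpop / release pattern.
 */
#if 0
static void prepost_reserve_sketch(void)
{
	/* reserve two elements as an lt_elem_list_-style chain */
	struct wq_prepost *resv = wq_prepost_alloc(LT_RESERVED, 2);
	uint64_t reserved = resv ? resv->wqp_prepostid.id : 0;

	if (reserved == 0)
		return;

	/*
	 * ... pass &reserved into code that may consume elements via
	 * wq_prepost_rpop() (see wq_get_prepost_obj()) ...
	 */

	/* hand any leftover elements back to the per-CPU cache / table */
	wq_prepost_release_rlist(wq_prepost_rfirst(reserved));
}
#endif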
1014
1015typedef int (*wqp_callback_func)(struct waitq_set *wqset,
1016 void *ctx,
1017 struct wq_prepost *wqp,
1018 struct waitq *waitq);
1019
1020/**
1021 * iterate over a chain of preposts associated with a waitq set.
1022 *
1023 * Conditions:
1024 * wqset is locked
1025 *
1026 * Notes:
1027 * This loop performs automatic prepost chain management / culling, and
1028 * may reset or adjust the waitq set's prepost ID pointer. If you don't
1029 * want this extra processing, you can use wq_prepost_iterate().
1030 */
1031static int wq_prepost_foreach_locked(struct waitq_set *wqset,
1032 void *ctx, wqp_callback_func cb)
1033{
1034 int ret = WQ_ITERATE_SUCCESS;
1035 struct wq_prepost *wqp, *tmp_wqp;
1036
1037 assert(cb != NULL);
1038
1039 if (!wqset || !waitq_set_maybe_preposted(wqset))
1040 return WQ_ITERATE_SUCCESS;
1041
1042restart:
1043 wqp = wq_prepost_get(wqset->wqset_prepost_id);
1044 if (!wqp) {
1045 /*
1046 * The prepost object is no longer valid, reset the waitq
1047 * set's prepost id.
1048 */
1049 wqset->wqset_prepost_id = 0;
1050 return WQ_ITERATE_SUCCESS;
1051 }
1052
1053 if (wqp_type(wqp) == WQP_WQ) {
1054 uint64_t __assert_only wqp_id = wqp->wqp_prepostid.id;
1055
1056 ret = cb(wqset, ctx, wqp, wqp->wqp_wq.wqp_wq_ptr);
1057
1058 switch (ret) {
1059 case WQ_ITERATE_INVALIDATE_CONTINUE:
1060 /* the caller wants to remove the only prepost here */
1061 assert(wqp_id == wqset->wqset_prepost_id);
1062 wqset->wqset_prepost_id = 0;
1063 /* fall through */
1064 case WQ_ITERATE_CONTINUE:
1065 wq_prepost_put(wqp);
1066 ret = WQ_ITERATE_SUCCESS;
1067 break;
1068 case WQ_ITERATE_RESTART:
1069 wq_prepost_put(wqp);
1070 /* fall through */
1071 case WQ_ITERATE_DROPPED:
1072 goto restart;
1073 default:
1074 wq_prepost_put(wqp);
1075 break;
1076 }
1077 return ret;
1078 }
1079
1080 assert(wqp->wqp_prepostid.id == wqset->wqset_prepost_id);
1081 assert(wqp_type(wqp) == WQP_POST);
1082
1083 /*
1084 * At this point we know we have a list of POST objects.
1085 * Grab a handle to the last element in the list and start
1086 * the iteration.
1087 */
1088 tmp_wqp = wq_prepost_get_rnext(wqp);
1089 assert(tmp_wqp != NULL && wqp_type(tmp_wqp) == WQP_POST);
1090
1091 uint64_t last_id = tmp_wqp->wqp_prepostid.id;
1092 wq_prepost_put(tmp_wqp);
1093
1094 ret = WQ_ITERATE_SUCCESS;
1095 for (;;) {
1096 uint64_t wqp_id, first_id, next_id;
1097
1098 wqp_id = wqp->wqp_prepostid.id;
1099 first_id = wqset->wqset_prepost_id;
1100 next_id = wqp->wqp_post.wqp_next_id;
1101
1102 /* grab the WQP_WQ object this _POST points to */
1103 tmp_wqp = wq_prepost_get(wqp->wqp_post.wqp_wq_id);
1104 if (!tmp_wqp) {
1105 /*
1106 * This WQP_POST object points to an invalid
1107 * WQP_WQ object - remove the POST object from
1108 * the list.
1109 */
1110 if (wq_prepost_remove(wqset, wqp) == 0) {
1111 wq_prepost_put(wqp);
1112 goto restart;
1113 }
1114 goto next_prepost;
1115 }
1116 assert(wqp_type(tmp_wqp) == WQP_WQ);
1117 /*
1118 * make the callback: note that this could remove 'wqp' or
1119 * drop the lock on our waitq set. We need to re-validate
1120 * our state when this function returns.
1121 */
1122 ret = cb(wqset, ctx, wqp, tmp_wqp->wqp_wq.wqp_wq_ptr);
1123 wq_prepost_put(tmp_wqp);
1124
1125 switch (ret) {
1126 case WQ_ITERATE_CONTINUE:
1127 /* continue iteration */
1128 break;
1129 case WQ_ITERATE_INVALIDATE_CONTINUE:
1130 assert(next_id == wqp->wqp_post.wqp_next_id);
1131 if (wq_prepost_remove(wqset, wqp) == 0) {
1132 wq_prepost_put(wqp);
1133 goto restart;
1134 }
1135 goto next_prepost;
1136 case WQ_ITERATE_RESTART:
1137 wq_prepost_put(wqp);
1138 /* fall-through */
1139 case WQ_ITERATE_DROPPED:
1140 /* the callback dropped the ref to wqp: just restart */
1141 goto restart;
1142 default:
1143 /* break out of the iteration for some other reason */
1144 goto finish_prepost_foreach;
1145 }
1146
1147 /*
1148 * the set lock may have been dropped during callback,
1149 * if something looks different, restart the prepost iteration
1150 */
1151 if (!wqp_is_valid(wqp) ||
1152 (wqp->wqp_post.wqp_next_id != next_id) ||
1153 wqset->wqset_prepost_id != first_id) {
1154 wq_prepost_put(wqp);
1155 goto restart;
1156 }
1157
1158next_prepost:
1159 /* this was the last object in the list */
1160 if (wqp_id == last_id)
1161 break;
1162
1163 /* get the next object */
1164 tmp_wqp = wq_prepost_get(next_id);
1165 if (!tmp_wqp) {
1166 /*
1167 * At this point we've already checked our state
1168 * after the callback (which may have dropped the set
1169 * lock). If we find an invalid member of the list
1170 * then something is wrong.
1171 */
1172 panic("Invalid WQP_POST member 0x%llx in waitq set "
1173 "0x%llx prepost list (first:%llx, "
1174 "wqp:%p)",
1175 next_id, wqset->wqset_id, first_id, wqp);
1176 }
1177 wq_prepost_put(wqp);
1178 wqp = tmp_wqp;
1179
1180 assert(wqp_type(wqp) == WQP_POST);
1181 }
1182
1183finish_prepost_foreach:
1184 wq_prepost_put(wqp);
1185 if (ret == WQ_ITERATE_CONTINUE)
1186 ret = WQ_ITERATE_SUCCESS;
1187
1188 return ret;
1189}
1190
1191/**
1192 * Perform a simple loop over a chain of prepost objects
1193 *
1194 * Conditions:
1195 * If 'prepost_id' is associated with a waitq (set) then that object must
1196 * be locked before calling this function.
1197 * Callback function, 'cb', must be able to handle a NULL wqset pointer
1198 * and a NULL waitq pointer!
1199 *
1200 * Notes:
1201 * This prepost chain iteration will _not_ automatically adjust any chain
1202 * element or linkage. This is the responsibility of the caller! If you
1203 * want automatic prepost chain management (at a cost of extra CPU time),
1204 * you can use: wq_prepost_foreach_locked().
1205 */
1206static int wq_prepost_iterate(uint64_t prepost_id,
1207 void *ctx, wqp_callback_func cb)
1208{
1209 int ret;
1210 struct wq_prepost *wqp;
1211
1212 if (!prepost_id)
1213 return WQ_ITERATE_SUCCESS;
1214
1215 wqp = wq_prepost_get(prepost_id);
1216 if (!wqp)
1217 return WQ_ITERATE_SUCCESS;
1218
1219 if (wqp_type(wqp) == WQP_WQ) {
1220 ret = WQ_ITERATE_SUCCESS;
1221 if (cb)
1222 ret = cb(NULL, ctx, wqp, wqp->wqp_wq.wqp_wq_ptr);
1223
1224 if (ret != WQ_ITERATE_DROPPED)
1225 wq_prepost_put(wqp);
1226 return ret;
1227 }
1228
1229 assert(wqp->wqp_prepostid.id == prepost_id);
1230 assert(wqp_type(wqp) == WQP_POST);
1231
1232 /* at this point we know we have a list of POST objects */
1233 uint64_t next_id;
1234
1235 ret = WQ_ITERATE_CONTINUE;
1236 do {
1237 struct wq_prepost *tmp_wqp;
1238 struct waitq *wq = NULL;
1239
1240 next_id = wqp->wqp_post.wqp_next_id;
1241
1242 /* grab the WQP_WQ object this _POST points to */
1243 tmp_wqp = wq_prepost_get(wqp->wqp_post.wqp_wq_id);
1244 if (tmp_wqp) {
1245 assert(wqp_type(tmp_wqp) == WQP_WQ);
1246 wq = tmp_wqp->wqp_wq.wqp_wq_ptr;
1247 }
1248
1249 if (cb)
1250 ret = cb(NULL, ctx, wqp, wq);
1251 if (tmp_wqp)
1252 wq_prepost_put(tmp_wqp);
1253
1254 if (ret != WQ_ITERATE_CONTINUE)
1255 break;
1256
1257 tmp_wqp = wq_prepost_get(next_id);
1258 if (!tmp_wqp) {
1259 /*
1260 * the chain is broken: nothing we can do here besides
1261 * bail from the iteration.
1262 */
1263 ret = WQ_ITERATE_ABORTED;
1264 break;
1265 }
1266
1267 wq_prepost_put(wqp);
1268 wqp = tmp_wqp;
1269
1270 assert(wqp_type(wqp) == WQP_POST);
1271 } while (next_id != prepost_id);
1272
1273 if (ret != WQ_ITERATE_DROPPED)
1274 wq_prepost_put(wqp);
1275
1276 if (ret == WQ_ITERATE_CONTINUE)
1277 ret = WQ_ITERATE_SUCCESS;
1278 return ret;
1279}
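/*
 * Illustrative sketch (disabled, not part of the build): a minimal
 * wqp_callback_func driven by wq_prepost_iterate(). Both functions here are
 * hypothetical. Note that wq_prepost_iterate() invokes the callback with a
 * NULL wqset pointer, and 'waitq' may also be NULL if the underlying WQP_WQ
 * object has gone invalid, so the callback must tolerate both.
 */
#if 0
static int dump_prepost_cb(struct waitq_set *wqset, void *ctx,
                           struct wq_prepost *wqp, struct waitq *waitq)
{
	(void)wqset;
	(void)ctx;
	wqdbg_v("prepost 0x%llx -> waitq %p", wqp->wqp_prepostid.id,
	        (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq));
	return WQ_ITERATE_CONTINUE;
}

/* dump a (locked) set's prepost chain without trimming or culling it */
static void dump_preposts(struct waitq_set *wqset)
{
	(void)wq_prepost_iterate(wqset->wqset_prepost_id, NULL, dump_prepost_cb);
}
#endif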
1280
1281
1282struct _is_posted_ctx {
1283 struct waitq *posting_wq;
1284 int did_prepost;
1285};
1286
1287static int wq_is_preposted_on_set_cb(struct waitq_set *wqset, void *ctx,
1288 struct wq_prepost *wqp, struct waitq *waitq)
1289{
1290 struct _is_posted_ctx *pctx = (struct _is_posted_ctx *)ctx;
1291
1292 (void)wqset;
1293 (void)wqp;
1294
1295 /*
1296 * Don't early-out: run through the _entire_ list.
1297 * This keeps the number of stale (invalid) elements in the list to a minimum.
1298 */
1299 if (pctx->posting_wq == waitq)
1300 pctx->did_prepost = 1;
1301
1302 return WQ_ITERATE_CONTINUE;
1303}
1304
1305
1306/**
1307 * checks if 'waitq' has already preposted on 'wqset'
1308 *
1309 * Parameters:
1310 * waitq The waitq that's preposting
1311 * wqset The set onto which waitq may be preposted
1312 *
1313 * Conditions:
1314 * both waitq and wqset are locked
1315 *
1316 * Returns non-zero if 'waitq' has already preposted to 'wqset'
1317 */
1318static int wq_is_preposted_on_set(struct waitq *waitq, struct waitq_set *wqset)
1319{
1320 int ret;
1321 struct _is_posted_ctx pctx;
1322
1323 /*
1324 * If the set's only prepost matches the waitq's prepost ID,
1325 * then it obviously already preposted to the set.
1326 */
1327 if (waitq->waitq_prepost_id != 0 &&
1328 wqset->wqset_prepost_id == waitq->waitq_prepost_id)
1329 return 1;
1330
1331 /* use full prepost iteration: always trim the list */
1332 pctx.posting_wq = waitq;
1333 pctx.did_prepost = 0;
1334 ret = wq_prepost_foreach_locked(wqset, (void *)&pctx,
1335 wq_is_preposted_on_set_cb);
1336 return pctx.did_prepost;
1337}
1338
1339static struct wq_prepost *wq_get_prepost_obj(uint64_t *reserved, int type)
1340{
1341 struct wq_prepost *wqp = NULL;
1342 /*
1343 * don't fail just because the caller doesn't have enough
1344 * reservations, we've kept a low-water mark on the prepost table,
1345 * so there should be some available for us.
1346 */
1347 if (reserved && *reserved) {
1348 wqp = wq_prepost_rpop(reserved, type);
1349 assert(wqp->wqte.lt_id.idx < g_prepost_table.nelem);
1350 } else {
1351 /*
1352 * TODO: if in interrupt context, grab from a special
1353 * region / reserved list!
1354 */
1355 wqp = wq_prepost_alloc(type, 1);
1356 }
1357
1358 if (wqp == NULL)
1359 panic("Couldn't allocate prepost object!");
1360 return wqp;
1361}
1362
1363
1364/**
1365 * prepost a waitq onto a waitq set
1366 *
1367 * Parameters:
1368 * wqset The set onto which waitq will be preposted
1369 * waitq The waitq that's preposting
1370 * reserved List (lt_elem_list_ style) of pre-allocated prepost elements
1371 * Could be NULL
1372 *
1373 * Conditions:
1374 * both wqset and waitq are locked
1375 *
1376 * Notes:
1377 * If reserved is NULL, this may block on prepost table growth.
1378 */
1379static void wq_prepost_do_post_locked(struct waitq_set *wqset,
1380 struct waitq *waitq,
1381 uint64_t *reserved)
1382{
1383 struct wq_prepost *wqp_post, *wqp_head, *wqp_tail;
1384
1385 assert(waitq_held(waitq) && waitq_held(&wqset->wqset_q));
1386
1387 /*
1388 * nothing to do if it's already preposted:
1389 * note that this also culls any invalid prepost objects
1390 */
1391 if (wq_is_preposted_on_set(waitq, wqset))
1392 return;
1393
1394 assert(waitqs_is_linked(wqset));
1395
1396 /*
1397 * This function is called because an event is being posted to 'waitq'.
1398 * We need a prepost object associated with this queue. Allocate one
1399 * now if the waitq isn't already associated with one.
1400 */
1401 if (waitq->waitq_prepost_id == 0) {
1402 struct wq_prepost *wqp;
1403 wqp = wq_get_prepost_obj(reserved, WQP_WQ);
1404 wqp->wqp_wq.wqp_wq_ptr = waitq;
1405 wqp_set_valid(wqp);
1406 waitq->waitq_prepost_id = wqp->wqp_prepostid.id;
1407 wq_prepost_put(wqp);
1408 }
1409
1410#if CONFIG_LTABLE_STATS
1411 g_prepost_table.npreposts += 1;
1412#endif
1413
1414 wqdbg_v("preposting waitq %p (0x%llx) to set 0x%llx",
1415 (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq),
1416 waitq->waitq_prepost_id, wqset->wqset_id);
1417
1418 if (wqset->wqset_prepost_id == 0) {
1419 /* the set has no previous preposts */
1420 wqset->wqset_prepost_id = waitq->waitq_prepost_id;
1421 return;
1422 }
1423
1424 wqp_head = wq_prepost_get(wqset->wqset_prepost_id);
1425 if (!wqp_head) {
1426 /* the previous prepost has become invalid */
1427 wqset->wqset_prepost_id = waitq->waitq_prepost_id;
1428 return;
1429 }
1430
1431 assert(wqp_head->wqp_prepostid.id == wqset->wqset_prepost_id);
1432
1433 /*
1434 * If we get here, we're going to need at least one new wq_prepost
1435 * object. If the previous wqset_prepost_id points to a WQP_WQ, we
1436 * actually need to allocate 2 wq_prepost objects because the WQP_WQ
1437 * is tied to the waitq and shared across all sets.
1438 */
1439 wqp_post = wq_get_prepost_obj(reserved, WQP_POST);
1440
1441 wqp_post->wqp_post.wqp_wq_id = waitq->waitq_prepost_id;
1442 wqdbg_v("POST 0x%llx :: WQ 0x%llx", wqp_post->wqp_prepostid.id,
1443 waitq->waitq_prepost_id);
1444
1445 if (wqp_type(wqp_head) == WQP_WQ) {
1446 /*
1447 * We must replace the wqset_prepost_id with a pointer
1448 * to two new WQP_POST objects
1449 */
1450 uint64_t wqp_id = wqp_head->wqp_prepostid.id;
1451 wqdbg_v("set 0x%llx previous had 1 WQ prepost (0x%llx): "
1452 "replacing with two POST preposts",
1453 wqset->wqset_id, wqp_id);
1454
1455 /* drop the old reference */
1456 wq_prepost_put(wqp_head);
1457
1458 /* grab another new object (the 2nd of two) */
1459 wqp_head = wq_get_prepost_obj(reserved, WQP_POST);
1460
1461 /* point this one to the original WQP_WQ object */
1462 wqp_head->wqp_post.wqp_wq_id = wqp_id;
1463 wqdbg_v("POST 0x%llx :: WQ 0x%llx",
1464 wqp_head->wqp_prepostid.id, wqp_id);
1465
1466 /* link it to the new wqp_post object allocated earlier */
1467 wqp_head->wqp_post.wqp_next_id = wqp_post->wqp_prepostid.id;
1468 /* make the list doubly-linked and circular */
1469 wq_prepost_rlink(wqp_head, wqp_post);
1470
1471 /*
1472 * Finish setting up the new prepost: point it back to the
1473 * POST object we allocated to replace the original wqset
1474 * WQ prepost object
1475 */
1476 wqp_post->wqp_post.wqp_next_id = wqp_head->wqp_prepostid.id;
1477 wq_prepost_rlink(wqp_post, wqp_head);
1478
1479 /* mark objects valid, and reset the wqset prepost list head */
1480 wqp_set_valid(wqp_head);
1481 wqp_set_valid(wqp_post);
1482 wqset->wqset_prepost_id = wqp_head->wqp_prepostid.id;
1483
1484 /* release both references */
1485 wq_prepost_put(wqp_head);
1486 wq_prepost_put(wqp_post);
1487
1488 wqdbg_v("set 0x%llx: 0x%llx/0x%llx -> 0x%llx/0x%llx -> 0x%llx",
1489 wqset->wqset_id, wqset->wqset_prepost_id,
1490 wqp_head->wqp_prepostid.id, wqp_head->wqp_post.wqp_next_id,
1491 wqp_post->wqp_prepostid.id,
1492 wqp_post->wqp_post.wqp_next_id);
1493 return;
1494 }
1495
1496 assert(wqp_type(wqp_head) == WQP_POST);
1497
1498 /*
1499 * Add the new prepost to the end of the prepost list
1500 */
1501 wqp_tail = wq_prepost_get_rnext(wqp_head);
1502 assert(wqp_tail != NULL);
1503 assert(wqp_tail->wqp_post.wqp_next_id == wqset->wqset_prepost_id);
1504
1505 /*
1506 * link the head to the new tail
1507 * NOTE: this needs to happen first in case wqp_tail == wqp_head
1508 */
1509 wq_prepost_reset_rnext(wqp_head);
1510 wq_prepost_rlink(wqp_head, wqp_post);
1511
1512 /* point the new object to the list head, and list tail */
1513 wqp_post->wqp_post.wqp_next_id = wqp_head->wqp_prepostid.id;
1514 wq_prepost_rlink(wqp_post, wqp_tail);
1515
1516 /* point the last item in the waitq set's list to the new object */
1517 wqp_tail->wqp_post.wqp_next_id = wqp_post->wqp_prepostid.id;
1518
1519 wqp_set_valid(wqp_post);
1520
1521 wq_prepost_put(wqp_head);
1522 wq_prepost_put(wqp_tail);
1523 wq_prepost_put(wqp_post);
1524
1525 wqdbg_v("set 0x%llx (wqp:0x%llx) last_prepost:0x%llx, "
1526 "new_prepost:0x%llx->0x%llx", wqset->wqset_id,
1527 wqset->wqset_prepost_id, wqp_head->wqp_prepostid.id,
1528 wqp_post->wqp_prepostid.id, wqp_post->wqp_post.wqp_next_id);
1529
1530 return;
1531}
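/*
 * Summary of the list shapes produced by wq_prepost_do_post_locked()
 * (comment only, added for illustration):
 *
 *   no preposts:  wqset_prepost_id == 0
 *   1 prepost:    wqset_prepost_id -> WQP_WQ
 *                 (the WQP_WQ is owned by the posting waitq and may be
 *                  shared by several sets)
 *   2+ preposts:  wqset_prepost_id -> WQP_POST -> WQP_POST -> ...
 *                 a circular list, doubly linked via the reverse links,
 *                 where each WQP_POST holds the ID of a per-waitq WQP_WQ
 */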
1532
1533
1534/* ----------------------------------------------------------------------
1535 *
1536 * Stats collection / reporting
1537 *
1538 * ---------------------------------------------------------------------- */
1539#if CONFIG_LTABLE_STATS && CONFIG_WAITQ_STATS
1540static void wq_table_stats(struct link_table *table, struct wq_table_stats *stats)
1541{
1542 stats->version = WAITQ_STATS_VERSION;
1543 stats->table_elements = table->nelem;
1544 stats->table_used_elems = table->used_elem;
1545 stats->table_elem_sz = table->elem_sz;
1546 stats->table_slabs = table->nslabs;
1547 stats->table_slab_sz = table->slab_sz;
1548
1549 stats->table_num_allocs = table->nallocs;
1550 stats->table_num_preposts = table->npreposts;
1551 stats->table_num_reservations = table->nreservations;
1552
1553 stats->table_max_used = table->max_used;
1554 stats->table_avg_used = table->avg_used;
1555 stats->table_max_reservations = table->max_reservations;
1556 stats->table_avg_reservations = table->avg_reservations;
1557}
1558
1559void waitq_link_stats(struct wq_table_stats *stats)
1560{
1561 if (!stats)
1562 return;
1563 wq_table_stats(&g_wqlinktable, stats);
1564}
1565
1566void waitq_prepost_stats(struct wq_table_stats *stats)
1567{
1568 wq_table_stats(&g_prepost_table, stats);
1569}
1570#endif
1571
1572
1573/* ----------------------------------------------------------------------
1574 *
1575 * Global Wait Queues
1576 *
1577 * ---------------------------------------------------------------------- */
1578
1579static struct waitq g_boot_waitq;
1580static struct waitq *global_waitqs = &g_boot_waitq;
1581static uint32_t g_num_waitqs = 1;
1582
1583/*
1584 * Zero out the MSBs of the event, keeping only the low _EVENT_MASK_BITS bits.
1585 */
1586#define _CAST_TO_EVENT_MASK(event) ((uintptr_t)(event) & ((1ul << _EVENT_MASK_BITS) - 1ul))
1587
1588static __inline__ uint32_t waitq_hash(char *key, size_t length)
1589{
1590 uint32_t hash = jenkins_hash(key, length);
1591
1592 hash &= (g_num_waitqs - 1);
1593 return hash;
1594}
1595
1596/* return a global waitq pointer corresponding to the given event */
1597struct waitq *_global_eventq(char *event, size_t event_length)
1598{
1599 return &global_waitqs[waitq_hash(event, event_length)];
1600}
1601
1602/* return an indexed global waitq pointer */
1603struct waitq *global_waitq(int index)
1604{
1605 return &global_waitqs[index % g_num_waitqs];
1606}
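/*
 * Illustrative sketch (disabled, not part of the build): how an event is
 * hashed onto one of the global waitqs. The helper function is hypothetical;
 * the global_eventq() macro used elsewhere in this file is assumed to wrap
 * _global_eventq() in the same way.
 */
#if 0
static void global_eventq_sketch(void)
{
	static int some_kernel_object;
	event_t event = (event_t)&some_kernel_object;

	/* hash the event address into one of the g_num_waitqs global queues */
	struct waitq *wq = _global_eventq((char *)&event, sizeof(event));

	/* the same event always hashes to the same global (IRQ-safe) waitq */
	assert(wq == _global_eventq((char *)&event, sizeof(event)));
	assert(waitq_is_global(wq));
}
#endif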
1607
1608
1609#if CONFIG_LTABLE_STATS || CONFIG_WAITQ_STATS
1610/* this global is for lldb */
1611const uint32_t g_nwaitq_btframes = NWAITQ_BTFRAMES;
1612
1613static __inline__ void waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], int skip)
1614{
1615 if (skip < 0)
1616 skip = 0;
1617 uintptr_t buf[NWAITQ_BTFRAMES + skip];
1618 memset(buf, 0, (NWAITQ_BTFRAMES + skip) * sizeof(uintptr_t));
1619 backtrace(buf, g_nwaitq_btframes + skip);
1620 memcpy(&bt[0], &buf[skip], NWAITQ_BTFRAMES * sizeof(uintptr_t));
1621}
1622#else /* no stats */
1623#define waitq_grab_backtrace(...)
1624#endif
1625
1626#if CONFIG_WAITQ_STATS
1627
1628struct wq_stats g_boot_stats;
1629struct wq_stats *g_waitq_stats = &g_boot_stats;
1630
1631static __inline__ struct wq_stats *waitq_global_stats(struct waitq *waitq) {
1632 struct wq_stats *wqs;
1633 uint32_t idx;
1634
1635 if (!waitq_is_global(waitq))
1636 return NULL;
1637
1638 idx = (uint32_t)(((uintptr_t)waitq - (uintptr_t)global_waitqs) / sizeof(*waitq));
1639 assert(idx < g_num_waitqs);
1640 wqs = &g_waitq_stats[idx];
1641 return wqs;
1642}
1643
1644static __inline__ void waitq_stats_count_wait(struct waitq *waitq)
1645{
1646 struct wq_stats *wqs = waitq_global_stats(waitq);
1647 if (wqs != NULL) {
1648 wqs->waits++;
1649 waitq_grab_backtrace(wqs->last_wait, 2);
1650 }
1651}
1652
1653static __inline__ void waitq_stats_count_wakeup(struct waitq *waitq)
1654{
1655 struct wq_stats *wqs = waitq_global_stats(waitq);
1656 if (wqs != NULL) {
1657 wqs->wakeups++;
1658 waitq_grab_backtrace(wqs->last_wakeup, 2);
1659 }
1660}
1661
1662static __inline__ void waitq_stats_count_clear_wakeup(struct waitq *waitq)
1663{
1664 struct wq_stats *wqs = waitq_global_stats(waitq);
1665 if (wqs != NULL) {
1666 wqs->wakeups++;
1667 wqs->clears++;
1668 waitq_grab_backtrace(wqs->last_wakeup, 2);
1669 }
1670}
1671
1672static __inline__ void waitq_stats_count_fail(struct waitq *waitq)
1673{
1674 struct wq_stats *wqs = waitq_global_stats(waitq);
1675 if (wqs != NULL) {
1676 wqs->failed_wakeups++;
1677 waitq_grab_backtrace(wqs->last_failed_wakeup, 2);
1678 }
1679}
1680#else /* !CONFIG_WAITQ_STATS */
1681#define waitq_stats_count_wait(q) do { } while (0)
1682#define waitq_stats_count_wakeup(q) do { } while (0)
1683#define waitq_stats_count_clear_wakeup(q) do { } while (0)
1684#define waitq_stats_count_fail(q) do { } while (0)
1685#endif
1686
1687int waitq_is_valid(struct waitq *waitq)
1688{
1689 return (waitq != NULL) && waitq->waitq_isvalid;
1690}
1691
1692int waitq_set_is_valid(struct waitq_set *wqset)
1693{
1694 return (wqset != NULL) && wqset->wqset_q.waitq_isvalid && waitqs_is_set(wqset);
1695}
1696
1697int waitq_is_global(struct waitq *waitq)
1698{
1699 if (waitq >= global_waitqs && waitq < global_waitqs + g_num_waitqs)
1700 return 1;
1701 return 0;
1702}
1703
1704int waitq_irq_safe(struct waitq *waitq)
1705{
1706 /* global wait queues have this bit set on initialization */
1707 return waitq->waitq_irq;
1708}
1709
1710struct waitq * waitq_get_safeq(struct waitq *waitq)
1711{
1712 struct waitq *safeq;
1713
1714 /* Check if it's a port waitq */
1715 if (waitq_is_port_queue(waitq)) {
1716 assert(!waitq_irq_safe(waitq));
1717 safeq = ipc_port_rcv_turnstile_waitq(waitq);
1718 } else {
1719 safeq = global_eventq(waitq);
1720 }
1721 return safeq;
1722}
1723
1724static uint32_t waitq_hash_size(void)
1725{
1726 uint32_t hsize, queues;
1727
1728 if (PE_parse_boot_argn("wqsize", &hsize, sizeof(hsize)))
1729 return (hsize);
1730
1731 queues = thread_max / 5;
1732 hsize = P2ROUNDUP(queues * sizeof(struct waitq), PAGE_SIZE);
1733
1734 return hsize;
1735}
1736
1737/*
1738 * Since the priority-ordered waitq uses basepri as the
1739 * ordering key, assert that this value fits in a uint8_t.
1740 */
1741static_assert(MAXPRI <= UINT8_MAX);
1742
1743static inline void waitq_thread_insert(struct waitq *wq,
1744 thread_t thread, boolean_t fifo)
1745{
1746 if (waitq_is_turnstile_queue(wq)) {
1747 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1748 (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (THREAD_ADDED_TO_TURNSTILE_WAITQ))) | DBG_FUNC_NONE,
1749 VM_KERNEL_UNSLIDE_OR_PERM(waitq_to_turnstile(wq)),
1750 thread_tid(thread),
1751 thread->base_pri, 0, 0);
1752
1753 turnstile_stats_update(0, TSU_TURNSTILE_BLOCK_COUNT, NULL);
1754
1755 /*
1756 * For turnstile queues (which use priority queues),
1757 * insert the thread in the heap based on its current
1758 * base_pri. Note that the priority queue implementation
1759 * is currently not stable, so it does not maintain FIFO order for
1760 * threads at the same base_pri. Also, if the base_pri
1761 * of the thread changes while it is blocked in the waitq,
1762 * the thread position should be updated in the priority
1763 * queue by calling priority queue increase/decrease
1764 * operations.
1765 */
1766 priority_queue_entry_init(&(thread->wait_prioq_links));
1767 priority_queue_insert(&wq->waitq_prio_queue,
1768 &thread->wait_prioq_links, thread->base_pri,
1769 PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
1770 } else {
1771 turnstile_stats_update(0, TSU_REGULAR_WAITQ_BLOCK_COUNT, NULL);
1772 if (fifo) {
1773 enqueue_tail(&wq->waitq_queue, &thread->wait_links);
1774 } else {
1775 enqueue_head(&wq->waitq_queue, &thread->wait_links);
1776 }
1777 }
1778}
1779
1780static inline void waitq_thread_remove(struct waitq *wq,
1781 thread_t thread)
1782{
1783 if (waitq_is_turnstile_queue(wq)) {
1784 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1785 (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (THREAD_REMOVED_FROM_TURNSTILE_WAITQ))) | DBG_FUNC_NONE,
1786 VM_KERNEL_UNSLIDE_OR_PERM(waitq_to_turnstile(wq)),
1787 thread_tid(thread),
1788 0, 0, 0);
1789 priority_queue_remove(&wq->waitq_prio_queue, &thread->wait_prioq_links,
1790 PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
1791 } else {
1792 remqueue(&(thread->wait_links));
1793 }
1794}
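/*
 * Illustrative sketch (disabled, not part of the build): how the insert /
 * remove pair above is used by the block and wakeup paths. The function is
 * hypothetical and the waitq/thread locking is elided; it only shows that a
 * turnstile waitq enqueues by base_pri while a regular waitq honors the
 * FIFO flag.
 */
#if 0
static void waitq_thread_queueing_sketch(struct waitq *wq, thread_t thread)
{
	/* block path: FIFO for regular queues, priority heap for turnstiles */
	waitq_thread_insert(wq, thread, TRUE /* fifo */);

	/* ... the thread blocks and is later selected for wakeup ... */

	/* wakeup path: pull the thread off whichever queue type it is on */
	waitq_thread_remove(wq, thread);
}
#endif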
1795
1796void waitq_bootstrap(void)
1797{
1798 kern_return_t kret;
1799 uint32_t whsize, qsz, tmp32;
1800
1801 g_min_free_table_elem = DEFAULT_MIN_FREE_TABLE_ELEM;
1802 if (PE_parse_boot_argn("wqt_min_free", &tmp32, sizeof(tmp32)) == TRUE)
1803 g_min_free_table_elem = tmp32;
1804 wqdbg("Minimum free table elements: %d", g_min_free_table_elem);
1805
1806 /*
1807 * Determine the amount of memory we're willing to reserve for
1808 * the waitqueue hash table
1809 */
1810 whsize = waitq_hash_size();
1811
1812 /* Determine the number of waitqueues we can fit. */
1813 qsz = sizeof(struct waitq);
1814 whsize = ROUNDDOWN(whsize, qsz);
1815 g_num_waitqs = whsize / qsz;
1816
1817 /*
1818 * The hash algorithm requires that this be a power of 2, so we
1819 * just mask off all the low-order bits.
1820 */
1821 for (uint32_t i = 0; i < 31; i++) {
1822 uint32_t bit = (1 << i);
1823 if ((g_num_waitqs & bit) == g_num_waitqs)
1824 break;
1825 g_num_waitqs &= ~bit;
1826 }
1827 assert(g_num_waitqs > 0);
1828
1829 /* Now determine how much memory we really need. */
1830 whsize = P2ROUNDUP(g_num_waitqs * qsz, PAGE_SIZE);
1831
1832 wqdbg("allocating %d global queues (%d bytes)", g_num_waitqs, whsize);
1833 kret = kernel_memory_allocate(kernel_map, (vm_offset_t *)&global_waitqs,
1834 whsize, 0, KMA_KOBJECT|KMA_NOPAGEWAIT, VM_KERN_MEMORY_WAITQ);
1835 if (kret != KERN_SUCCESS || global_waitqs == NULL)
1836 panic("kernel_memory_allocate() failed to alloc global_waitqs"
1837 ", error: %d, whsize: 0x%x", kret, whsize);
1838
1839#if CONFIG_WAITQ_STATS
1840 whsize = P2ROUNDUP(g_num_waitqs * sizeof(struct wq_stats), PAGE_SIZE);
1841 kret = kernel_memory_allocate(kernel_map, (vm_offset_t *)&g_waitq_stats,
1842 whsize, 0, KMA_KOBJECT|KMA_NOPAGEWAIT, VM_KERN_MEMORY_WAITQ);
1843 if (kret != KERN_SUCCESS || g_waitq_stats == NULL)
1844 panic("kernel_memory_allocate() failed to alloc g_waitq_stats"
1845 ", error: %d, whsize: 0x%x", kret, whsize);
1846 memset(g_waitq_stats, 0, whsize);
1847#endif
1848
1849 for (uint32_t i = 0; i < g_num_waitqs; i++) {
1850 waitq_init(&global_waitqs[i], SYNC_POLICY_FIFO|SYNC_POLICY_DISABLE_IRQ);
1851 }
1852
1853 waitq_set_zone = zinit(sizeof(struct waitq_set),
1854 WAITQ_SET_MAX * sizeof(struct waitq_set),
1855 sizeof(struct waitq_set),
1856 "waitq sets");
1857 zone_change(waitq_set_zone, Z_NOENCRYPT, TRUE);
1858
1859 /* initialize the global waitq link table */
1860 wql_init();
1861
1862 /* initialize the global waitq prepost table */
1863 wqp_init();
1864}
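/*
 * Illustrative arithmetic (comment only, not compiled): how the sizing logic
 * in waitq_bootstrap() behaves for a hypothetical configuration. Assume
 * waitq_hash_size() returns 196608 bytes, sizeof(struct waitq) is 72, and
 * PAGE_SIZE is 4096 (all values are made up for the example):
 *
 *   whsize       = ROUNDDOWN(196608, 72)           = 196560
 *   g_num_waitqs = 196560 / 72                     = 2730
 *   power-of-two truncation                        -> 2048
 *   whsize       = P2ROUNDUP(2048 * 72, PAGE_SIZE) = 147456 (36 pages)
 *
 * Only the shape of the computation matters; the real sizes depend on
 * thread_max, the 'wqsize' boot-arg and the actual struct layout.
 */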
1865
1866
1867/* ----------------------------------------------------------------------
1868 *
1869 * Wait Queue Implementation
1870 *
1871 * ---------------------------------------------------------------------- */
1872
1873/*
1874 * Double the standard lock timeout, because wait queues tend
1875 * to iterate over a number of threads - locking each. If there is
1876 * a problem with a thread lock, it normally times out at the wait
1877 * queue level first, hiding the real problem.
1878 */
1879/* For x86, the hardware timeout is in TSC units. */
1880#if defined(__i386__) || defined(__x86_64__)
1881#define hwLockTimeOut LockTimeOutTSC
1882#else
1883#define hwLockTimeOut LockTimeOut
1884#endif
1885
1886void waitq_lock(struct waitq *wq)
1887{
1888 if (__improbable(waitq_lock_to(wq,
1889 hwLockTimeOut * 2) == 0)) {
1890 boolean_t wql_acquired = FALSE;
1891
1892 while (machine_timeout_suspended()) {
1893 mp_enable_preemption();
1894 wql_acquired = waitq_lock_to(wq,
1895 hwLockTimeOut * 2);
1896 if (wql_acquired)
1897 break;
1898 }
1899 if (wql_acquired == FALSE)
1900 panic("waitq deadlock - waitq=%p, cpu=%d\n",
1901 wq, cpu_number());
1902 }
1903#if defined(__x86_64__)
1904 pltrace(FALSE);
1905#endif
1906 assert(waitq_held(wq));
1907}
1908
1909void waitq_unlock(struct waitq *wq)
1910{
1911 assert(waitq_held(wq));
1912#if defined(__x86_64__)
1913 pltrace(TRUE);
1914#endif
1915 waitq_lock_unlock(wq);
1916}
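/*
 * Illustrative sketch (disabled, not part of the build): the canonical
 * lock/unlock pattern around a waitq. The function is hypothetical; the spl
 * handling reflects the convention used throughout this file, where callers
 * raise spl before taking the lock on an IRQ-safe waitq.
 */
#if 0
static void waitq_locking_sketch(struct waitq *wq)
{
	spl_t s = 0;

	if (waitq_irq_safe(wq))
		s = splsched();         /* block interrupts first */

	waitq_lock(wq);
	/* ... examine or modify the waitq under the interlock ... */
	waitq_unlock(wq);

	if (waitq_irq_safe(wq))
		splx(s);
}
#endif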
1917
1918
1919/**
1920 * clear the thread-related waitq state
1921 *
1922 * Conditions:
1923 * 'thread' is locked
1924 */
1925static inline void thread_clear_waitq_state(thread_t thread)
1926{
1927 thread->waitq = NULL;
1928 thread->wait_event = NO_EVENT64;
1929 thread->at_safe_point = FALSE;
1930}
1931
1932
1933typedef thread_t (*waitq_select_cb)(void *ctx, struct waitq *waitq,
1934 int is_global, thread_t thread);
1935
1936struct waitq_select_args {
1937 /* input parameters */
1938 struct waitq *posted_waitq;
1939 struct waitq *waitq;
1940 event64_t event;
1941 waitq_select_cb select_cb;
1942 void *select_ctx;
1943
1944 uint64_t *reserved_preposts;
1945
1946 /* output parameters */
1947 queue_t threadq;
1948 int max_threads;
1949 int *nthreads;
1950 spl_t *spl;
1951};
1952
1953static void do_waitq_select_n_locked(struct waitq_select_args *args);
1954
1955/**
1956 * callback invoked once for every waitq set to which a waitq belongs
1957 *
1958 * Conditions:
1959 * ctx->posted_waitq is locked
1960 * 'link' points to a valid waitq set
1961 *
1962 * Notes:
1963 * Takes the waitq set lock on the set pointed to by 'link'
1964 * Calls do_waitq_select_n_locked() which could recurse back into
1965 * this function if the waitq set is a member of other sets.
1966 * If no threads were selected, it preposts the input waitq
1967 * onto the waitq set pointed to by 'link'.
1968 */
1969static int waitq_select_walk_cb(struct waitq *waitq, void *ctx,
1970 struct waitq_link *link)
1971{
1972 int ret = WQ_ITERATE_CONTINUE;
1973 struct waitq_select_args args = *((struct waitq_select_args *)ctx);
1974 struct waitq_set *wqset;
1975
1976 (void)waitq;
1977 assert(wql_type(link) == WQL_WQS);
1978
1979 wqset = link->wql_wqs.wql_set;
1980 args.waitq = &wqset->wqset_q;
1981
1982 assert(!waitq_irq_safe(waitq));
1983 assert(!waitq_irq_safe(&wqset->wqset_q));
1984
1985 waitq_set_lock(wqset);
1986 /*
1987 * verify that the link wasn't invalidated just before
1988 * we were able to take the lock.
1989 */
1990 if (wqset->wqset_id != link->wql_setid.id)
1991 goto out_unlock;
1992
1993 assert(waitqs_is_linked(wqset));
1994
1995 /*
1996 * Find any threads waiting on this wait queue set,
1997 * and recurse into any waitq set to which this set belongs.
1998 */
1999 do_waitq_select_n_locked(&args);
2000
2001 if (*(args.nthreads) > 0 ||
2002 (args.threadq && !queue_empty(args.threadq))) {
2003 /* at least 1 thread was selected and returned: don't prepost */
2004 if (args.max_threads > 0 &&
2005 *(args.nthreads) >= args.max_threads) {
2006 /* break out of the setid walk */
2007 ret = WQ_ITERATE_FOUND;
2008 }
2009 goto out_unlock;
2010 } else {
2011 /*
2012 * No thread selected: prepost 'waitq' to 'wqset'
2013		 * if wqset can handle preposts and the event is NO_EVENT64.
2014 * We also make sure to not post waitq sets to other sets.
2015 *
2016 * If the set doesn't support preposts, but does support
2017 * prepost callout/hook interaction, invoke the predefined
2018 * callout function and pass the set's 'prepost_hook.' This
2019 * could potentially release another thread to handle events.
2020 */
2021 if (args.event == NO_EVENT64) {
2022 if (waitq_set_can_prepost(wqset)) {
2023 wq_prepost_do_post_locked(
2024 wqset, waitq, args.reserved_preposts);
2025 } else if (waitq_set_has_prepost_hook(wqset)) {
2026 waitq_set__CALLING_PREPOST_HOOK__(
2027 wqset->wqset_prepost_hook, waitq, 0);
2028 }
2029 }
2030 }
2031
2032out_unlock:
2033 waitq_set_unlock(wqset);
2034 return ret;
2035}
2036
2037/**
2038 * Routine to iterate over the waitq for non-priority ordered waitqs
2039 *
2040 * Conditions:
2041 * args->waitq (and args->posted_waitq) is locked
2042 *
2043 * Notes:
2044 * Uses the optional select callback function to refine the selection
2045 * of one or more threads from a waitq. The select callback is invoked
2046 * once for every thread that is found to be waiting on the input args->waitq.
2047 *
2048 * If one or more threads are selected, this may disable interrupts.
2049 * The previous interrupt state is returned in args->spl and should
2050 * be used in a call to splx() if threads are returned to the caller.
2051 */
2052static thread_t waitq_queue_iterate_locked(struct waitq *safeq, struct waitq *waitq,
2053 spl_t spl, struct waitq_select_args *args,
2054 uint32_t *remaining_eventmask)
2055{
2056 int max_threads = args->max_threads;
2057 int *nthreads = args->nthreads;
2058 thread_t thread = THREAD_NULL;
2059 thread_t first_thread = THREAD_NULL;
2060
2061 qe_foreach_element_safe(thread, &safeq->waitq_queue, wait_links) {
2062 thread_t t = THREAD_NULL;
2063 assert_thread_magic(thread);
2064
2065 /*
2066 * For non-priority ordered waitqs, we allow multiple events to be
2067 * mux'ed into the same waitq. Also safeqs may contain threads from
2068 * multiple waitqs. Only pick threads that match the
2069 * requested wait event.
2070 */
2071 if (thread->waitq == waitq && thread->wait_event == args->event) {
2072 t = thread;
2073 if (first_thread == THREAD_NULL)
2074 first_thread = thread;
2075
2076			/* allow the caller to further refine the selection */
2077 if (args->select_cb)
2078 t = args->select_cb(args->select_ctx, waitq,
2079 waitq_is_global(waitq), thread);
2080 if (t != THREAD_NULL) {
2081 *nthreads += 1;
2082 if (args->threadq) {
2083 /* if output queue, add locked thread to it */
2084 if (*nthreads == 1)
2085 *(args->spl) = (safeq != waitq) ? spl : splsched();
2086 thread_lock(t);
2087 thread_clear_waitq_state(t);
2088 re_queue_tail(args->threadq, &t->wait_links);
2089 }
2090 /* only enqueue up to 'max' threads */
2091 if (*nthreads >= max_threads && max_threads > 0)
2092 break;
2093 }
2094 }
2095		/* thread wasn't selected, so track its event */
2096 if (t == THREAD_NULL) {
2097 *remaining_eventmask |= (thread->waitq != safeq) ?
2098 _CAST_TO_EVENT_MASK(thread->waitq) : _CAST_TO_EVENT_MASK(thread->wait_event);
2099 }
2100 }
2101
2102 return first_thread;
2103}
2104
2105/**
2106 * Routine to iterate and remove threads from priority ordered waitqs
2107 *
2108 * Conditions:
2109 * args->waitq (and args->posted_waitq) is locked
2110 *
2111 * Notes:
2112 * The priority ordered waitqs only support maximum priority element removal.
2113 *
2114 * Also, the implementation makes sure that all threads in a priority ordered
2115 * waitq are waiting on the same wait event. This is not necessarily true for
2116 * non-priority ordered waitqs. If one or more threads are selected, this may
2117 * disable interrupts. The previous interrupt state is returned in args->spl
2118 * and should be used in a call to splx() if threads are returned to the caller.
2119 *
2120 * In the future, we could support priority ordered waitqs with multiple wait
2121 * events in the same queue. The way to implement that would be to keep removing
2122 * elements from the waitq and if the event does not match the requested one,
2123 * elements from the waitq and, if the event does not match the requested one,
2124 * add them to a local list. This local list of elements needs to be re-inserted
2125 * remaining_eventmask would need to be handled appropriately. The implementation
2126 * is not very efficient but would work functionally.
2127 */
2128static thread_t waitq_prioq_iterate_locked(struct waitq *safeq, struct waitq *waitq,
2129 spl_t spl, struct waitq_select_args *args,
2130 uint32_t *remaining_eventmask)
2131{
2132 int max_threads = args->max_threads;
2133 int *nthreads = args->nthreads;
2134 thread_t first_thread = THREAD_NULL;
2135 thread_t thread = THREAD_NULL;
2136
2137 /*
2138 * The waitq select routines need to handle two cases:
2139 * Case 1: Peek at maximum priority thread in the waitq (remove_op = 0)
2140 * Get the maximum priority thread from the waitq without removing it.
2141 * In that case args->threadq == NULL and max_threads == 1.
2142 * Case 2: Remove 'n' highest priority threads from waitq (remove_op = 1)
2143 * Get max_threads (if available) while removing them from the waitq.
2144 * In that case args->threadq != NULL and max_threads is one of {-1, 1}.
2145 *
2146 * The only possible values for remaining_eventmask for the priority queue
2147 * waitq are either 0 (for the remove all threads case) or the original
2148 * safeq->waitq_eventmask (for the lookup/remove one thread cases).
2149 */
2150 *remaining_eventmask = safeq->waitq_eventmask;
2151 boolean_t remove_op = !!(args->threadq);
2152
2153 while ((max_threads <= 0) || (*nthreads < max_threads)) {
2154
2155 if (priority_queue_empty(&(safeq->waitq_prio_queue))) {
2156 *remaining_eventmask = 0;
2157 break;
2158 }
2159
2160 if (remove_op) {
2161 thread = priority_queue_remove_max(&safeq->waitq_prio_queue,
2162 struct thread, wait_prioq_links,
2163 PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
2164 } else {
2165 /* For the peek operation, the only valid value for max_threads is 1 */
2166 assert(max_threads == 1);
2167 thread = priority_queue_max(&safeq->waitq_prio_queue,
2168 struct thread, wait_prioq_links);
2169 }
2170 /*
2171 * Ensure the wait event matches since priority ordered waitqs do not
2172 * support multiple events in the same waitq.
2173 */
2174 assert((thread->waitq == waitq) && (thread->wait_event == args->event));
2175
2176 if (args->select_cb) {
2177 /*
2178 * Call the select_cb passed into the waitq_select args. The callback
2179 * updates the select_ctx with information about the highest priority
2180 * thread which is eventually used by the caller.
2181 */
2182 thread_t __assert_only ret_thread = args->select_cb(args->select_ctx, waitq,
2183 waitq_is_global(waitq), thread);
2184 if (!remove_op) {
2185 /* For the peek operation, the thread should not be selected for addition */
2186 assert(ret_thread == THREAD_NULL);
2187 } else {
2188 /*
2189 * For the remove operation, the select routine should always return a valid
2190 * thread for priority waitqs. Since all threads in a prioq are equally
2191 * eligible, it should match the thread removed from the prioq. If this
2192 * invariant changes, the implementation would need to handle the
2193 * remaining_eventmask here correctly.
2194 */
2195 assert(ret_thread == thread);
2196 }
2197 }
2198
2199 if (first_thread == THREAD_NULL)
2200 first_thread = thread;
2201
2202 /* For the peek operation, break out early */
2203 if (!remove_op)
2204 break;
2205
2206 /* Add the thread to the result thread list */
2207 *nthreads += 1;
2208 if (*nthreads == 1)
2209 *(args->spl) = (safeq != waitq) ? spl : splsched();
2210 thread_lock(thread);
2211 thread_clear_waitq_state(thread);
2212 enqueue_tail(args->threadq, &(thread->wait_links));
2213 }
2214
2215 return first_thread;
2216}
2217
2218/**
2219 * generic thread selection from a waitq (and sets to which the waitq belongs)
2220 *
2221 * Conditions:
2222 * args->waitq (and args->posted_waitq) is locked
2223 *
2224 * Notes:
2225 * Uses the optional select callback function to refine the selection
2226 * of one or more threads from a waitq and any set to which the waitq
2227 * belongs. The select callback is invoked once for every thread that
2228 * is found to be waiting on the input args->waitq.
2229 *
2230 * If one or more threads are selected, this may disable interrupts.
2231 * The previous interrupt state is returned in args->spl and should
2232 * be used in a call to splx() if threads are returned to the caller.
2233 */
2234static void do_waitq_select_n_locked(struct waitq_select_args *args)
2235{
2236 struct waitq *waitq = args->waitq;
2237 int max_threads = args->max_threads;
2238 thread_t first_thread = THREAD_NULL;
2239 struct waitq *safeq;
2240 uint32_t remaining_eventmask = 0;
2241 uint32_t eventmask;
2242 int *nthreads = args->nthreads;
2243 spl_t spl = 0;
2244
2245 assert(max_threads != 0);
2246
2247 if (!waitq_irq_safe(waitq)) {
2248 /* JMM - add flag to waitq to avoid global lookup if no waiters */
2249 eventmask = _CAST_TO_EVENT_MASK(waitq);
2250 safeq = waitq_get_safeq(waitq);
2251 if (*nthreads == 0)
2252 spl = splsched();
2253 waitq_lock(safeq);
2254 } else {
2255 eventmask = _CAST_TO_EVENT_MASK(args->event);
2256 safeq = waitq;
2257 }
2258
2259 /*
2260 * If the safeq doesn't have an eventmask (not global) or the event
2261 * we're looking for IS set in its eventmask, then scan the threads
2262 * in that queue for ones that match the original <waitq,event> pair.
2263 */
2264 if (!waitq_is_global(safeq) ||
2265 (safeq->waitq_eventmask & eventmask) == eventmask) {
2266
2267 if (waitq_is_turnstile_queue(safeq)) {
2268 first_thread = waitq_prioq_iterate_locked(safeq, waitq,
2269 spl, args,
2270 &remaining_eventmask);
2271 } else {
2272 first_thread = waitq_queue_iterate_locked(safeq, waitq,
2273 spl, args,
2274 &remaining_eventmask);
2275 }
2276
2277 /*
2278 * Update the eventmask of global queues we just scanned:
2279 * - If we selected all the threads in the queue, we can clear its
2280 * eventmask.
2281 *
2282 * - If we didn't find enough threads to fill our needs, then we can
2283 * assume we looked at every thread in the queue and the mask we
2284 * computed is complete - so reset it.
2285 */
2286 if (waitq_is_global(safeq)) {
2287 if (waitq_empty(safeq))
2288 safeq->waitq_eventmask = 0;
2289 else if (max_threads < 0 || *nthreads < max_threads)
2290 safeq->waitq_eventmask = remaining_eventmask;
2291 }
2292 }
2293
2294 /*
2295 * Grab the first thread in the queue if no other thread was selected.
2296 * We can guarantee that no one has manipulated this thread because
2297 * it's waiting on the given waitq, and we have that waitq locked.
2298 */
2299 if (*nthreads == 0 && first_thread != THREAD_NULL && args->threadq) {
2300 /* we know this is the first (and only) thread */
2301 ++(*nthreads);
2302 *(args->spl) = (safeq != waitq) ? spl : splsched();
2303 thread_lock(first_thread);
2304 thread_clear_waitq_state(first_thread);
2305 waitq_thread_remove(safeq, first_thread);
2306 enqueue_tail(args->threadq, &(first_thread->wait_links));
2307
2308 /* update the eventmask on [now] empty global queues */
2309 if (waitq_is_global(safeq) && waitq_empty(safeq))
2310 safeq->waitq_eventmask = 0;
2311 }
2312
2313 /* unlock the safe queue if we locked one above */
2314 if (safeq != waitq) {
2315 waitq_unlock(safeq);
2316 if (*nthreads == 0)
2317 splx(spl);
2318 }
2319
2320 if (max_threads > 0 && *nthreads >= max_threads)
2321 return;
2322
2323 /*
2324 * wait queues that are not in any sets
2325 * are the bottom of the recursion
2326 */
2327 if (!waitq->waitq_set_id)
2328 return;
2329
2330 /* check to see if the set ID for this wait queue is valid */
2331 struct waitq_link *link = wql_get_link(waitq->waitq_set_id);
2332 if (!link) {
2333		/* the waitq set to which this waitq belonged has been invalidated */
2334 waitq->waitq_set_id = 0;
2335 return;
2336 }
2337
2338 wql_put_link(link);
2339
2340 /*
2341 * If this waitq is a member of any wait queue sets, we need to look
2342 * for waiting thread(s) in any of those sets, and prepost all sets that
2343 * don't have active waiters.
2344 *
2345 * Note that we do a local walk of this waitq's links - we manually
2346 * recurse down wait queue set's with non-zero wqset_q.waitq_set_id
2347 */
2348 (void)walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
2349 WQL_WQS, (void *)args, waitq_select_walk_cb);
2350}
2351
2352/**
2353 * main entry point for thread selection from a waitq
2354 *
2355 * Conditions:
2356 * waitq is locked
2357 *
2358 * Returns:
2359 * The number of threads waiting on 'waitq' for 'event' which have
2360 * been placed onto the input 'threadq'
2361 *
2362 * Notes:
2363 * The 'select_cb' function is invoked for every thread found waiting on
2364 * 'waitq' for 'event'. The thread is _not_ locked upon callback
2365 * invocation. This parameter may be NULL.
2366 *
2367 * If one or more threads are returned in 'threadq' then the caller is
2368 * responsible to call splx() using the returned 'spl' value. Each
2369 * returned thread is locked.
2370 */
2371static __inline__ int waitq_select_n_locked(struct waitq *waitq,
2372 event64_t event,
2373 waitq_select_cb select_cb,
2374 void *select_ctx,
2375 uint64_t *reserved_preposts,
2376 queue_t threadq,
2377 int max_threads, spl_t *spl)
2378{
2379 int nthreads = 0;
2380
2381 struct waitq_select_args args = {
2382 .posted_waitq = waitq,
2383 .waitq = waitq,
2384 .event = event,
2385 .select_cb = select_cb,
2386 .select_ctx = select_ctx,
2387 .reserved_preposts = reserved_preposts,
2388 .threadq = threadq,
2389 .max_threads = max_threads,
2390 .nthreads = &nthreads,
2391 .spl = spl,
2392 };
2393
2394 do_waitq_select_n_locked(&args);
2395 return nthreads;
2396}
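/*
 * Illustrative sketch of the caller-side contract described above (modeled on
 * waitq_wakeup64_all_locked later in this file; not an additional API): every
 * thread that comes back on 'threadq' is locked, and the saved spl must be
 * restored once all of them have been handled. 'wq' and 'event' are
 * placeholders, and passing NULL reserved preposts is an assumption here.
 *
 *	queue_head_t tq;
 *	spl_t spl;
 *	thread_t t;
 *	int n;
 *
 *	queue_init(&tq);
 *	waitq_lock(wq);
 *	n = waitq_select_n_locked(wq, event, NULL, NULL, NULL, &tq, -1, &spl);
 *	waitq_unlock(wq);
 *	qe_foreach_element_safe(t, &tq, wait_links) {
 *		remqueue(&t->wait_links);
 *		... act on the locked thread ...
 *		thread_unlock(t);
 *	}
 *	if (n > 0)
 *		splx(spl);
 */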
2397
2398/**
2399 * select from a waitq a single thread waiting for a given event
2400 *
2401 * Conditions:
2402 * 'waitq' is locked
2403 *
2404 * Returns:
2405 * A locked thread that's been removed from the waitq, but has not
2406 * yet been put on a run queue. Caller is responsible to call splx
2407 * with the '*spl' value.
2408 */
2409static thread_t waitq_select_one_locked(struct waitq *waitq, event64_t event,
2410 uint64_t *reserved_preposts,
2411 int priority, spl_t *spl)
2412{
2413 (void)priority;
2414 int nthreads;
2415 queue_head_t threadq;
2416
2417 queue_init(&threadq);
2418
2419 nthreads = waitq_select_n_locked(waitq, event, NULL, NULL,
2420 reserved_preposts, &threadq, 1, spl);
2421
2422 /* if we selected a thread, return it (still locked) */
2423 if (!queue_empty(&threadq)) {
2424 thread_t t;
2425 queue_entry_t qe = dequeue_head(&threadq);
2426 t = qe_element(qe, struct thread, wait_links);
2427 assert(queue_empty(&threadq)); /* there should be 1 entry */
2428 /* t has been locked and removed from all queues */
2429 return t;
2430 }
2431
2432 return THREAD_NULL;
2433}
2434
2435struct find_max_pri_ctx {
2436 integer_t max_sched_pri;
2437 integer_t max_base_pri;
2438 thread_t highest_thread;
2439};
2440
2441/**
2442 * callback function that finds the max priority thread
2443 *
2444 * Conditions:
2445 * 'waitq' is locked
2446 * 'thread' is not locked
2447 */
2448static thread_t
2449waitq_find_max_pri_cb(void *ctx_in,
2450 __unused struct waitq *waitq,
2451 __unused int is_global,
2452 thread_t thread)
2453{
2454 struct find_max_pri_ctx *ctx = (struct find_max_pri_ctx *)ctx_in;
2455
2456 /*
2457 * thread is not locked, use pri as a hint only
2458 * wake up the highest base pri, and find the highest sched pri at that base pri
2459 */
2460 integer_t sched_pri = *(volatile int16_t *)&thread->sched_pri;
2461 integer_t base_pri = *(volatile int16_t *)&thread->base_pri;
2462
2463 if (ctx->highest_thread == THREAD_NULL ||
2464 (base_pri > ctx->max_base_pri) ||
2465 (base_pri == ctx->max_base_pri && sched_pri > ctx->max_sched_pri)) {
2466 /* don't select the thread, just update ctx */
2467
2468 ctx->max_sched_pri = sched_pri;
2469 ctx->max_base_pri = base_pri;
2470 ctx->highest_thread = thread;
2471 }
2472
2473 return THREAD_NULL;
2474}
2475
2476/**
2477 * select from a waitq the highest priority thread waiting for a given event
2478 *
2479 * Conditions:
2480 * 'waitq' is locked
2481 *
2482 * Returns:
2483 * A locked thread that's been removed from the waitq, but has not
2484 * yet been put on a run queue. Caller is responsible to call splx
2485 * with the '*spl' value.
2486 */
2487static thread_t
2488waitq_select_max_locked(struct waitq *waitq, event64_t event,
2489 uint64_t *reserved_preposts,
2490 spl_t *spl)
2491{
2492 __assert_only int nthreads;
2493 assert(!waitq->waitq_set_id); /* doesn't support recursive sets */
2494
2495 struct find_max_pri_ctx ctx = {
2496 .max_sched_pri = 0,
2497 .max_base_pri = 0,
2498 .highest_thread = THREAD_NULL,
2499 };
2500
2501 /*
2502 * Scan the waitq to find the highest priority thread.
2503 * This doesn't remove any thread from the queue
2504 */
2505 nthreads = waitq_select_n_locked(waitq, event,
2506 waitq_find_max_pri_cb,
2507 &ctx, reserved_preposts, NULL, 1, spl);
2508
2509 assert(nthreads == 0);
2510
2511 if (ctx.highest_thread != THREAD_NULL) {
2512 __assert_only kern_return_t ret;
2513
2514 /* Remove only the thread we just found */
2515 ret = waitq_select_thread_locked(waitq, event, ctx.highest_thread, spl);
2516
2517 assert(ret == KERN_SUCCESS);
2518 return ctx.highest_thread;
2519 }
2520
2521 return THREAD_NULL;
2522}
2523
2524
2525struct select_thread_ctx {
2526 thread_t thread;
2527 event64_t event;
2528 spl_t *spl;
2529};
2530
2531/**
2532 * link walk callback invoked once for each set to which a waitq belongs
2533 *
2534 * Conditions:
2535 * initial waitq is locked
2536 * ctx->thread is unlocked
2537 *
2538 * Notes:
2539 * This may disable interrupts and early-out of the full DAG link walk by
2540 * returning WQ_ITERATE_FOUND. In this case, the returned thread has
2541 * been removed from the waitq, its waitq state has been reset, and the
2542 * caller is responsible to call splx() with the returned interrupt state
2543 * in ctx->spl.
2544 */
2545static int waitq_select_thread_cb(struct waitq *waitq, void *ctx,
2546 struct waitq_link *link)
2547{
2548 struct select_thread_ctx *stctx = (struct select_thread_ctx *)ctx;
2549 struct waitq_set *wqset;
2550 struct waitq *wqsetq;
2551 struct waitq *safeq;
2552 spl_t s;
2553
2554 (void)waitq;
2555
2556 thread_t thread = stctx->thread;
2557 event64_t event = stctx->event;
2558
2559 if (wql_type(link) != WQL_WQS)
2560 return WQ_ITERATE_CONTINUE;
2561
2562 wqset = link->wql_wqs.wql_set;
2563 wqsetq = &wqset->wqset_q;
2564
2565 assert(!waitq_irq_safe(waitq));
2566 assert(!waitq_irq_safe(wqsetq));
2567
2568 waitq_set_lock(wqset);
2569
2570 s = splsched();
2571
2572 /* find and lock the interrupt-safe waitq the thread is thought to be on */
2573 safeq = waitq_get_safeq(wqsetq);
2574 waitq_lock(safeq);
2575
2576 thread_lock(thread);
2577
2578 if ((thread->waitq == wqsetq) && (thread->wait_event == event)) {
2579 waitq_thread_remove(wqsetq, thread);
2580 if (waitq_empty(safeq)) {
2581 safeq->waitq_eventmask = 0;
2582 }
2583 thread_clear_waitq_state(thread);
2584 waitq_unlock(safeq);
2585 waitq_set_unlock(wqset);
2586 /*
2587 * thread still locked,
2588 * return non-zero to break out of WQS walk
2589 */
2590 *(stctx->spl) = s;
2591 return WQ_ITERATE_FOUND;
2592 }
2593
2594 thread_unlock(thread);
2595 waitq_set_unlock(wqset);
2596 waitq_unlock(safeq);
2597 splx(s);
2598
2599 return WQ_ITERATE_CONTINUE;
2600}
2601
2602/**
2603 * returns KERN_SUCCESS and locks 'thread' if-and-only-if 'thread' is waiting
2604 * on 'waitq' (or any set to which waitq belongs) for 'event'
2605 *
2606 * Conditions:
2607 * 'waitq' is locked
2608 * 'thread' is unlocked
2609 */
2610static kern_return_t waitq_select_thread_locked(struct waitq *waitq,
2611 event64_t event,
2612 thread_t thread, spl_t *spl)
2613{
2614 struct waitq *safeq;
2615 struct waitq_link *link;
2616 struct select_thread_ctx ctx;
2617 kern_return_t kr;
2618 spl_t s;
2619
2620 s = splsched();
2621
2622 /* Find and lock the interrupts disabled queue the thread is actually on */
2623 if (!waitq_irq_safe(waitq)) {
2624 safeq = waitq_get_safeq(waitq);
2625 waitq_lock(safeq);
2626 } else {
2627 safeq = waitq;
2628 }
2629
2630 thread_lock(thread);
2631
2632 if ((thread->waitq == waitq) && (thread->wait_event == event)) {
2633 waitq_thread_remove(safeq, thread);
2634 if (waitq_empty(safeq)) {
2635 safeq->waitq_eventmask = 0;
2636 }
2637 thread_clear_waitq_state(thread);
2638 *spl = s;
2639 /* thread still locked */
2640 return KERN_SUCCESS;
2641 }
2642
2643 thread_unlock(thread);
2644
2645 if (safeq != waitq)
2646 waitq_unlock(safeq);
2647
2648 splx(s);
2649
2650 if (!waitq->waitq_set_id)
2651 return KERN_NOT_WAITING;
2652
2653 /* check to see if the set ID for this wait queue is valid */
2654 link = wql_get_link(waitq->waitq_set_id);
2655 if (!link) {
2656		/* the waitq set to which this waitq belonged has been invalidated */
2657 waitq->waitq_set_id = 0;
2658 return KERN_NOT_WAITING;
2659 }
2660
2661 /*
2662 * The thread may be waiting on a wait queue set to which
2663 * the input 'waitq' belongs. Go look for the thread in
2664 * all wait queue sets. If it's there, we'll remove it
2665 * because it's equivalent to waiting directly on the input waitq.
2666 */
2667 ctx.thread = thread;
2668 ctx.event = event;
2669 ctx.spl = spl;
2670 kr = walk_waitq_links(LINK_WALK_FULL_DAG, waitq, waitq->waitq_set_id,
2671 WQL_WQS, (void *)&ctx, waitq_select_thread_cb);
2672
2673 wql_put_link(link);
2674
2675 /* we found a thread, return success */
2676 if (kr == WQ_ITERATE_FOUND)
2677 return KERN_SUCCESS;
2678
2679 return KERN_NOT_WAITING;
2680}
2681
2682static int prepost_exists_cb(struct waitq_set __unused *wqset,
2683 void __unused *ctx,
2684 struct wq_prepost __unused *wqp,
2685 struct waitq __unused *waitq)
2686{
2687 /* if we get here, then we know that there is a valid prepost object! */
2688 return WQ_ITERATE_FOUND;
2689}
2690
2691/**
2692 * declare a thread's intent to wait on 'waitq' for 'wait_event'
2693 *
2694 * Conditions:
2695 * 'waitq' is locked
2696 */
2697wait_result_t waitq_assert_wait64_locked(struct waitq *waitq,
2698 event64_t wait_event,
2699 wait_interrupt_t interruptible,
2700 wait_timeout_urgency_t urgency,
2701 uint64_t deadline,
2702 uint64_t leeway,
2703 thread_t thread)
2704{
2705 wait_result_t wait_result;
2706 int realtime = 0;
2707 struct waitq *safeq;
2708 uintptr_t eventmask;
2709 spl_t s;
2710
2711
2712 /*
2713 * Warning: Do _not_ place debugging print statements here.
2714 * The waitq is locked!
2715 */
2716 assert(!thread->started || thread == current_thread());
2717
2718 if (thread->waitq != NULL)
2719 panic("thread already waiting on %p", thread->waitq);
2720
2721 if (waitq_is_set(waitq)) {
2722 struct waitq_set *wqset = (struct waitq_set *)waitq;
2723 /*
2724 * early-out if the thread is waiting on a wait queue set
2725 * that has already been pre-posted.
2726 */
2727 if (wait_event == NO_EVENT64 && waitq_set_maybe_preposted(wqset)) {
2728 int ret;
2729 /*
2730 * Run through the list of potential preposts. Because
2731 * this is a hot path, we short-circuit the iteration
2732 * if we find just one prepost object.
2733 */
2734 ret = wq_prepost_foreach_locked(wqset, NULL,
2735 prepost_exists_cb);
2736 if (ret == WQ_ITERATE_FOUND) {
2737 s = splsched();
2738 thread_lock(thread);
2739 thread->wait_result = THREAD_AWAKENED;
2740 thread_unlock(thread);
2741 splx(s);
2742 return THREAD_AWAKENED;
2743 }
2744 }
2745 }
2746
2747 s = splsched();
2748
2749 /*
2750 * If already dealing with an irq safe wait queue, we are all set.
2751 * Otherwise, determine a global queue to use and lock it.
2752 */
2753 if (!waitq_irq_safe(waitq)) {
2754 safeq = waitq_get_safeq(waitq);
2755 eventmask = _CAST_TO_EVENT_MASK(waitq);
2756 waitq_lock(safeq);
2757 } else {
2758 safeq = waitq;
2759 eventmask = _CAST_TO_EVENT_MASK(wait_event);
2760 }
2761
2762 /* lock the thread now that we have the irq-safe waitq locked */
2763 thread_lock(thread);
2764
2765 /*
2766 * Realtime threads get priority for wait queue placements.
2767 * This allows wait_queue_wakeup_one to prefer a waiting
2768 * realtime thread, similar in principle to performing
2769 * a wait_queue_wakeup_all and allowing scheduler prioritization
2770 * to run the realtime thread, but without causing the
2771 * lock contention of that scenario.
2772 */
2773 if (thread->sched_pri >= BASEPRI_REALTIME)
2774 realtime = 1;
2775
2776 /*
2777 * This is the extent to which we currently take scheduling attributes
2778	 * into account. If the thread is vm privileged, we stick it at
2779 * the front of the queue. Later, these queues will honor the policy
2780 * value set at waitq_init time.
2781 */
2782 wait_result = thread_mark_wait_locked(thread, interruptible);
2783 /* thread->wait_result has been set */
2784 if (wait_result == THREAD_WAITING) {
2785
2786 if (!safeq->waitq_fifo
2787 || (thread->options & TH_OPT_VMPRIV) || realtime)
2788 waitq_thread_insert(safeq, thread, false);
2789 else
2790 waitq_thread_insert(safeq, thread, true);
2791
2792 /* mark the event and real waitq, even if enqueued on a global safeq */
2793 thread->wait_event = wait_event;
2794 thread->waitq = waitq;
2795
2796 if (deadline != 0) {
2797 boolean_t act;
2798
2799 act = timer_call_enter_with_leeway(&thread->wait_timer,
2800 NULL,
2801 deadline, leeway,
2802 urgency, FALSE);
2803 if (!act)
2804 thread->wait_timer_active++;
2805 thread->wait_timer_is_set = TRUE;
2806 }
2807
2808 if (waitq_is_global(safeq))
2809 safeq->waitq_eventmask |= eventmask;
2810
2811 waitq_stats_count_wait(waitq);
2812 }
2813
2814 /* unlock the thread */
2815 thread_unlock(thread);
2816
2817 /* update the inheritor's thread priority if the waitq is embedded in turnstile */
2818 if (waitq_is_turnstile_queue(safeq) && wait_result == THREAD_WAITING) {
2819 turnstile_recompute_priority_locked(waitq_to_turnstile(safeq));
2820 turnstile_update_inheritor_locked(waitq_to_turnstile(safeq));
2821 }
2822
2823 /* unlock the safeq if we locked it here */
2824 if (safeq != waitq) {
2825 waitq_unlock(safeq);
2826 }
2827
2828 splx(s);
2829
2830 return wait_result;
2831}
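/*
 * Illustrative wait-side flow for a non-IRQ-safe waitq (a sketch only):
 * THREAD_UNINT, TIMEOUT_URGENCY_USER_NORMAL, THREAD_CONTINUE_NULL,
 * thread_block() and CAST_EVENT64_T() are assumed from the scheduler/Mach
 * headers rather than defined here; 'wq' and 'event' are placeholders. The
 * waitq must be locked across the assert, and the thread only actually blocks
 * once it later calls thread_block().
 *
 *	wait_result_t wr;
 *
 *	waitq_lock(wq);
 *	wr = waitq_assert_wait64_locked(wq, CAST_EVENT64_T(event), THREAD_UNINT,
 *	                                TIMEOUT_URGENCY_USER_NORMAL,
 *	                                0, 0,  (no deadline, no leeway)
 *	                                current_thread());
 *	waitq_unlock(wq);
 *	if (wr == THREAD_WAITING)
 *		wr = thread_block(THREAD_CONTINUE_NULL);
 */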
2832
2833/**
2834 * remove 'thread' from its current blocking state on 'waitq'
2835 *
2836 * Conditions:
2837 * 'thread' is locked
2838 *
2839 * Notes:
2840 * This function is primarily used by clear_wait_internal in
2841 * sched_prim.c from the thread timer wakeup path
2842 * (i.e. the thread was waiting on 'waitq' with a timeout that expired)
2843 */
2844int waitq_pull_thread_locked(struct waitq *waitq, thread_t thread)
2845{
2846 struct waitq *safeq;
2847
2848 assert_thread_magic(thread);
2849 assert(thread->waitq == waitq);
2850
2851 /* Find the interrupts disabled queue thread is waiting on */
2852 if (!waitq_irq_safe(waitq)) {
2853 safeq = waitq_get_safeq(waitq);
2854 } else {
2855 safeq = waitq;
2856 }
2857
2858 /* thread is already locked so have to try for the waitq lock */
2859 if (!waitq_lock_try(safeq))
2860 return 0;
2861
2862 waitq_thread_remove(safeq, thread);
2863 thread_clear_waitq_state(thread);
2864 waitq_stats_count_clear_wakeup(waitq);
2865
2866 /* clear the global event mask if this was the last thread there! */
2867 if (waitq_is_global(safeq) && waitq_empty(safeq)) {
2868 safeq->waitq_eventmask = 0;
2869 /* JMM - also mark no-waiters on waitq (if not the same as the safeq) */
2870 }
2871
2872 waitq_unlock(safeq);
2873
2874 return 1;
2875}
2876
2877
2878static __inline__
2879void maybe_adjust_thread_pri(thread_t thread,
2880 int priority,
2881 __kdebug_only struct waitq *waitq)
2882{
2883
2884 /*
2885 * If the caller is requesting the waitq subsystem to promote the
2886 * priority of the awoken thread, then boost the thread's priority to
2887 * the default WAITQ_BOOST_PRIORITY (if it's not already equal or
2888 * higher priority). This boost must be removed via a call to
2889 * waitq_clear_promotion_locked before the thread waits again.
2890 *
2891 * WAITQ_PROMOTE_PRIORITY is -2.
2892 * Anything above 0 represents a mutex promotion.
2893 * The default 'no action' value is -1.
2894 * TODO: define this in a header
2895 */
2896 if (priority == WAITQ_PROMOTE_PRIORITY) {
2897 uintptr_t trace_waitq = 0;
2898 if (__improbable(kdebug_enable))
2899 trace_waitq = VM_KERNEL_UNSLIDE_OR_PERM(waitq);
2900
2901 sched_thread_promote_reason(thread, TH_SFLAG_WAITQ_PROMOTED, trace_waitq);
2902 } else if (priority > 0) {
2903 /* Mutex subsystem wants to see this thread before we 'go' it */
2904 lck_mtx_wakeup_adjust_pri(thread, priority);
2905 }
2906}
2907
2908/*
2909 * Clear a potential thread priority promotion from a waitq wakeup
2910 * with WAITQ_PROMOTE_PRIORITY.
2911 *
2912 * This must be called on the thread which was woken up with TH_SFLAG_WAITQ_PROMOTED.
2913 */
2914void waitq_clear_promotion_locked(struct waitq *waitq, thread_t thread)
2915{
2916 spl_t s;
2917
2918 assert(waitq_held(waitq));
2919 assert(thread != THREAD_NULL);
2920 assert(thread == current_thread());
2921
2922 /* This flag is only cleared by the thread itself, so safe to check outside lock */
2923 if ((thread->sched_flags & TH_SFLAG_WAITQ_PROMOTED) != TH_SFLAG_WAITQ_PROMOTED)
2924 return;
2925
2926 if (!waitq_irq_safe(waitq))
2927 s = splsched();
2928 thread_lock(thread);
2929
2930 sched_thread_unpromote_reason(thread, TH_SFLAG_WAITQ_PROMOTED, 0);
2931
2932 thread_unlock(thread);
2933 if (!waitq_irq_safe(waitq))
2934 splx(s);
2935}
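/*
 * Illustrative pairing for the promotion described above (a sketch, with 'wq'
 * and 'event' as placeholders and NULL reserved preposts assumed): a waker
 * that passes WAITQ_PROMOTE_PRIORITY must arrange for the woken thread to
 * drop the boost before it waits again.
 *
 *	(waker side)
 *	waitq_lock(wq);
 *	waitq_wakeup64_one_locked(wq, event, THREAD_AWAKENED, NULL,
 *	                          WAITQ_PROMOTE_PRIORITY, WAITQ_UNLOCK);
 *
 *	(woken thread, once it runs, with 'wq' locked again)
 *	waitq_clear_promotion_locked(wq, current_thread());
 */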
2936
2937/**
2938 * wakeup all threads waiting on 'waitq' for 'wake_event'
2939 *
2940 * Conditions:
2941 * 'waitq' is locked
2942 *
2943 * Notes:
2944 * May temporarily disable and re-enable interrupts
2945 * and re-adjust thread priority of each awoken thread.
2946 *
2947 * If the input 'lock_state' == WAITQ_UNLOCK then the waitq will have
2948 * been unlocked before calling thread_go() on any returned threads, and
2949 * is guaranteed to be unlocked upon function return.
2950 */
2951kern_return_t waitq_wakeup64_all_locked(struct waitq *waitq,
2952 event64_t wake_event,
2953 wait_result_t result,
2954 uint64_t *reserved_preposts,
2955 int priority,
2956 waitq_lock_state_t lock_state)
2957{
2958 kern_return_t ret;
2959 thread_t thread;
2960 spl_t th_spl;
2961 int nthreads;
2962 queue_head_t wakeup_queue;
2963
2964 assert(waitq_held(waitq));
2965 queue_init(&wakeup_queue);
2966
2967 nthreads = waitq_select_n_locked(waitq, wake_event, NULL, NULL,
2968 reserved_preposts,
2969 &wakeup_queue, -1, &th_spl);
2970
2971 /* set each thread running */
2972 ret = KERN_NOT_WAITING;
2973
2974#if CONFIG_WAITQ_STATS
2975 qe_foreach_element(thread, &wakeup_queue, wait_links)
2976 waitq_stats_count_wakeup(waitq);
2977#endif
2978 if (lock_state == WAITQ_UNLOCK)
2979 waitq_unlock(waitq);
2980
2981 qe_foreach_element_safe(thread, &wakeup_queue, wait_links) {
2982 assert_thread_magic(thread);
2983 remqueue(&thread->wait_links);
2984 maybe_adjust_thread_pri(thread, priority, waitq);
2985 ret = thread_go(thread, result);
2986 assert(ret == KERN_SUCCESS);
2987 thread_unlock(thread);
2988 }
2989 if (nthreads > 0)
2990 splx(th_spl);
2991 else
2992 waitq_stats_count_fail(waitq);
2993
2994 return ret;
2995}
2996
2997/**
2998 * wakeup one thread waiting on 'waitq' for 'wake_event'
2999 *
3000 * Conditions:
3001 * 'waitq' is locked
3002 *
3003 * Notes:
3004 * May temporarily disable and re-enable interrupts.
3005 */
3006kern_return_t waitq_wakeup64_one_locked(struct waitq *waitq,
3007 event64_t wake_event,
3008 wait_result_t result,
3009 uint64_t *reserved_preposts,
3010 int priority,
3011 waitq_lock_state_t lock_state)
3012{
3013 thread_t thread;
3014 spl_t th_spl;
3015
3016 assert(waitq_held(waitq));
3017
3018 if (priority == WAITQ_SELECT_MAX_PRI) {
3019 thread = waitq_select_max_locked(waitq, wake_event,
3020 reserved_preposts,
3021 &th_spl);
3022 } else {
3023 thread = waitq_select_one_locked(waitq, wake_event,
3024 reserved_preposts,
3025 priority, &th_spl);
3026 }
3027
3028
3029 if (thread != THREAD_NULL)
3030 waitq_stats_count_wakeup(waitq);
3031 else
3032 waitq_stats_count_fail(waitq);
3033
3034 if (lock_state == WAITQ_UNLOCK)
3035 waitq_unlock(waitq);
3036
3037 if (thread != THREAD_NULL) {
3038 maybe_adjust_thread_pri(thread, priority, waitq);
3039 kern_return_t ret = thread_go(thread, result);
3040 assert(ret == KERN_SUCCESS);
3041 thread_unlock(thread);
3042 splx(th_spl);
3043 return ret;
3044 }
3045
3046 return KERN_NOT_WAITING;
3047}
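/*
 * Illustrative call sequence (a sketch under the conditions above; 'wq' and
 * 'event' are placeholders, NULL reserved preposts and the CAST_EVENT64_T()
 * macro are assumptions): the waitq is locked by the caller and handed off
 * via WAITQ_UNLOCK so that thread_go() runs without the waitq lock held.
 *
 *	kern_return_t kr;
 *
 *	waitq_lock(wq);
 *	kr = waitq_wakeup64_one_locked(wq, CAST_EVENT64_T(event),
 *	                               THREAD_AWAKENED, NULL,
 *	                               -1,  (default 'no action' priority)
 *	                               WAITQ_UNLOCK);
 *	(kr == KERN_NOT_WAITING if no thread was waiting for 'event')
 */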
3048
3049/**
3050 * wakeup one thread waiting on 'waitq' for 'wake_event'
3051 *
3052 * Conditions:
3053 * 'waitq' is locked
3054 *
3055 * Returns:
3056 * A locked, runnable thread.
3057 * If return value is non-NULL, interrupts have also
3058 * been disabled, and the caller is responsible to call
3059 * splx() with the returned '*spl' value.
3060 */
3061thread_t
3062waitq_wakeup64_identify_locked(struct waitq *waitq,
3063 event64_t wake_event,
3064 wait_result_t result,
3065 spl_t *spl,
3066 uint64_t *reserved_preposts,
3067 int priority,
3068 waitq_lock_state_t lock_state)
3069{
3070 thread_t thread;
3071
3072 assert(waitq_held(waitq));
3073
3074 if (priority == WAITQ_SELECT_MAX_PRI) {
3075 thread = waitq_select_max_locked(waitq, wake_event,
3076 reserved_preposts,
3077 spl);
3078 } else {
3079 thread = waitq_select_one_locked(waitq, wake_event,
3080 reserved_preposts,
3081 priority, spl);
3082 }
3083
3084 if (thread != THREAD_NULL)
3085 waitq_stats_count_wakeup(waitq);
3086 else
3087 waitq_stats_count_fail(waitq);
3088
3089 if (lock_state == WAITQ_UNLOCK)
3090 waitq_unlock(waitq);
3091
3092 if (thread != THREAD_NULL) {
3093 kern_return_t __assert_only ret;
3094 ret = thread_go(thread, result);
3095 assert(ret == KERN_SUCCESS);
3096 }
3097
3098 return thread; /* locked if not NULL (caller responsible for spl) */
3099}
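/*
 * Illustrative handling of the return contract above (a sketch; 'wq' and
 * 'event' are placeholders): when a thread is returned it is locked, runnable,
 * and interrupts are disabled, so the caller owns both the thread_unlock()
 * and the splx().
 *
 *	spl_t spl;
 *	thread_t t;
 *
 *	waitq_lock(wq);
 *	t = waitq_wakeup64_identify_locked(wq, CAST_EVENT64_T(event),
 *	                                   THREAD_AWAKENED, &spl, NULL,
 *	                                   WAITQ_SELECT_MAX_PRI, WAITQ_UNLOCK);
 *	if (t != THREAD_NULL) {
 *		... e.g. take a reference to 't' ...
 *		thread_unlock(t);
 *		splx(spl);
 *	}
 */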
3100
3101/**
3102 * wakeup a specific thread iff it's waiting on 'waitq' for 'wake_event'
3103 *
3104 * Conditions:
3105 * 'waitq' is locked
3106 * 'thread' is unlocked
3107 *
3108 * Notes:
3109 * May temporarily disable and re-enable interrupts
3110 *
3111 * If the input lock_state == WAITQ_UNLOCK then the waitq will have been
3112 * unlocked before calling thread_go() if 'thread' is to be awoken, and
3113 * is guaranteed to be unlocked upon function return.
3114 */
3115kern_return_t waitq_wakeup64_thread_locked(struct waitq *waitq,
3116 event64_t wake_event,
3117 thread_t thread,
3118 wait_result_t result,
3119 waitq_lock_state_t lock_state)
3120{
3121 kern_return_t ret;
3122 spl_t th_spl;
3123
3124 assert(waitq_held(waitq));
3125 assert_thread_magic(thread);
3126
3127 /*
3128 * See if the thread was still waiting there. If so, it got
3129 * dequeued and returned locked.
3130 */
3131 ret = waitq_select_thread_locked(waitq, wake_event, thread, &th_spl);
3132
3133 if (ret == KERN_SUCCESS)
3134 waitq_stats_count_wakeup(waitq);
3135 else
3136 waitq_stats_count_fail(waitq);
3137
3138 if (lock_state == WAITQ_UNLOCK)
3139 waitq_unlock(waitq);
3140
3141 if (ret != KERN_SUCCESS)
3142 return KERN_NOT_WAITING;
3143
3144 ret = thread_go(thread, result);
3145 assert(ret == KERN_SUCCESS);
3146 thread_unlock(thread);
3147 splx(th_spl);
3148
3149 return ret;
3150}
3151
3152
3153
3154/* ----------------------------------------------------------------------
3155 *
3156 * In-Kernel API
3157 *
3158 * ---------------------------------------------------------------------- */
3159
3160/**
3161 * initialize a waitq object
3162 */
3163kern_return_t waitq_init(struct waitq *waitq, int policy)
3164{
3165 assert(waitq != NULL);
3166
3167 /* only FIFO and LIFO for now */
3168 if ((policy & SYNC_POLICY_FIXED_PRIORITY) != 0)
3169 return KERN_INVALID_ARGUMENT;
3170
3171 waitq->waitq_fifo = ((policy & SYNC_POLICY_REVERSED) == 0);
3172 waitq->waitq_irq = !!(policy & SYNC_POLICY_DISABLE_IRQ);
3173 waitq->waitq_prepost = 0;
3174 waitq->waitq_type = WQT_QUEUE;
3175 waitq->waitq_turnstile_or_port = !!(policy & SYNC_POLICY_TURNSTILE);
3176 waitq->waitq_eventmask = 0;
3177
3178 waitq->waitq_set_id = 0;
3179 waitq->waitq_prepost_id = 0;
3180
3181 waitq_lock_init(waitq);
3182 if (waitq_is_turnstile_queue(waitq)) {
3183 /* For turnstile, initialize it as a priority queue */
3184 priority_queue_init(&waitq->waitq_prio_queue,
3185 PRIORITY_QUEUE_BUILTIN_MAX_HEAP);
3186 assert(waitq->waitq_fifo == 0);
3187 } else {
3188 queue_init(&waitq->waitq_queue);
3189 }
3190
3191 waitq->waitq_isvalid = 1;
3192 return KERN_SUCCESS;
3193}
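/*
 * Illustrative initialization (a sketch): embedders pick a policy at init
 * time; 'my_wq' is a hypothetical embedded waitq, and the SYNC_POLICY_*
 * values used below all appear elsewhere in this file.
 *
 *	struct waitq my_wq;
 *
 *	(a FIFO queue whose lock is taken with interrupts disabled)
 *	waitq_init(&my_wq, SYNC_POLICY_FIFO | SYNC_POLICY_DISABLE_IRQ);
 *
 *	(a LIFO/reversed queue that does not disable interrupts)
 *	waitq_init(&my_wq, SYNC_POLICY_REVERSED);
 */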
3194
3195struct wq_unlink_ctx {
3196 struct waitq *unlink_wq;
3197 struct waitq_set *unlink_wqset;
3198};
3199
3200static int waitq_unlink_prepost_cb(struct waitq_set __unused *wqset, void *ctx,
3201 struct wq_prepost *wqp, struct waitq *waitq);
3202
3203/**
3204 * walk_waitq_links callback to invalidate 'link' parameter
3205 *
3206 * Conditions:
3207 * Called from walk_waitq_links.
3208 *	Note that unlike other callbacks, this one makes no assumptions about
3209 *	the 'waitq' parameter; specifically, it does not have to be locked or
3210 * even valid.
3211 */
3212static int waitq_unlink_all_cb(struct waitq *waitq, void *ctx,
3213 struct waitq_link *link)
3214{
3215 (void)waitq;
3216 (void)ctx;
3217 if (wql_type(link) == WQL_LINK && wql_is_valid(link))
3218 wql_invalidate(link);
3219
3220 if (wql_type(link) == WQL_WQS) {
3221 struct waitq_set *wqset;
3222 struct wq_unlink_ctx ulctx;
3223
3224 /*
3225 * When destroying the waitq, take the time to clear out any
3226 * preposts it may have made. This could potentially save time
3227 * on the IPC send path which would otherwise have to iterate
3228 * over lots of dead port preposts.
3229 */
3230 if (waitq->waitq_prepost_id == 0)
3231 goto out;
3232
3233 wqset = link->wql_wqs.wql_set;
3234 assert(wqset != NULL);
3235 assert(!waitq_irq_safe(&wqset->wqset_q));
3236
3237 waitq_set_lock(wqset);
3238
3239 if (!waitq_set_is_valid(wqset)) {
3240 /* someone raced us to teardown */
3241 goto out_unlock;
3242 }
3243 if (!waitq_set_maybe_preposted(wqset))
3244 goto out_unlock;
3245
3246 ulctx.unlink_wq = waitq;
3247 ulctx.unlink_wqset = wqset;
3248 (void)wq_prepost_iterate(wqset->wqset_prepost_id, &ulctx,
3249 waitq_unlink_prepost_cb);
3250out_unlock:
3251 waitq_set_unlock(wqset);
3252 }
3253
3254out:
3255 return WQ_ITERATE_CONTINUE;
3256}
3257
3258
3259/**
3260 * cleanup any link/prepost table resources associated with a waitq
3261 */
3262void waitq_deinit(struct waitq *waitq)
3263{
3264 spl_t s;
3265
3266 if (!waitq || !waitq_is_queue(waitq))
3267 return;
3268
3269 if (waitq_irq_safe(waitq))
3270 s = splsched();
3271 waitq_lock(waitq);
3272 if (!waitq_valid(waitq)) {
3273 waitq_unlock(waitq);
3274 if (waitq_irq_safe(waitq))
3275 splx(s);
3276 return;
3277 }
3278
3279 waitq->waitq_isvalid = 0;
3280
3281 if (!waitq_irq_safe(waitq)) {
3282 waitq_unlink_all_unlock(waitq);
3283 /* waitq unlocked and set links deallocated */
3284 } else {
3285 waitq_unlock(waitq);
3286 splx(s);
3287 }
3288
3289 assert(waitq_empty(waitq));
3290}
3291
3292void waitq_invalidate_locked(struct waitq *waitq)
3293{
3294 assert(waitq_held(waitq));
3295 assert(waitq_is_valid(waitq));
3296 waitq->waitq_isvalid = 0;
3297}
3298
3299/**
3300 * invalidate the given wq_prepost object
3301 *
3302 * Conditions:
3303 * Called from wq_prepost_iterate (_not_ from wq_prepost_foreach_locked!)
3304 */
3305static int wqset_clear_prepost_chain_cb(struct waitq_set __unused *wqset,
3306 void __unused *ctx,
3307 struct wq_prepost *wqp,
3308 struct waitq __unused *waitq)
3309{
3310 if (wqp_type(wqp) == WQP_POST)
3311 wq_prepost_invalidate(wqp);
3312 return WQ_ITERATE_CONTINUE;
3313}
3314
3315
3316/**
3317 * allocate and initialize a waitq set object
3318 *
3319 * Conditions:
3320 * may block
3321 *
3322 * Returns:
3323 * allocated / initialized waitq_set object.
3324 *	the waitq_set object returned does not have
3325 * a waitq_link associated.
3326 *
3327 * NULL on failure
3328 */
3329struct waitq_set *waitq_set_alloc(int policy, void *prepost_hook)
3330{
3331 struct waitq_set *wqset;
3332
3333 wqset = (struct waitq_set *)zalloc(waitq_set_zone);
3334 if (!wqset)
3335 panic("Can't allocate a new waitq set from zone %p", waitq_set_zone);
3336
3337 kern_return_t ret;
3338 ret = waitq_set_init(wqset, policy, NULL, prepost_hook);
3339 if (ret != KERN_SUCCESS) {
3340 zfree(waitq_set_zone, wqset);
3341 wqset = NULL;
3342 }
3343
3344 return wqset;
3345}
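/*
 * Illustrative lifecycle (a sketch): a set allocated here is torn down with
 * waitq_set_free() later in this file, which de-initializes and zfrees it.
 * 'wqs' is a placeholder, and the chosen policy is only an example.
 *
 *	struct waitq_set *wqs;
 *
 *	wqs = waitq_set_alloc(SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST, NULL);
 *	... link member waitqs with waitq_link(), wait/post on the set ...
 *	waitq_set_free(wqs);
 */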
3346
3347/**
3348 * initialize a waitq set object
3349 *
3350 * if no 'reserved_link' object is passed
3351 * the waitq_link will be lazily allocated
3352 * on demand through waitq_set_lazy_init_link.
3353 */
3354kern_return_t waitq_set_init(struct waitq_set *wqset,
3355 int policy, uint64_t *reserved_link,
3356 void *prepost_hook)
3357{
3358 struct waitq_link *link;
3359 kern_return_t ret;
3360
3361 memset(wqset, 0, sizeof(*wqset));
3362
3363 ret = waitq_init(&wqset->wqset_q, policy);
3364 if (ret != KERN_SUCCESS)
3365 return ret;
3366
3367 wqset->wqset_q.waitq_type = WQT_SET;
3368 if (policy & SYNC_POLICY_PREPOST) {
3369 wqset->wqset_q.waitq_prepost = 1;
3370 wqset->wqset_prepost_id = 0;
3371 assert(prepost_hook == NULL);
3372 } else {
3373 wqset->wqset_q.waitq_prepost = 0;
3374 wqset->wqset_prepost_hook = prepost_hook;
3375 }
3376
3377 if (reserved_link && *reserved_link != 0) {
3378 link = wql_get_reserved(*reserved_link, WQL_WQS);
3379
3380 if (!link)
3381 panic("Can't allocate link object for waitq set: %p", wqset);
3382
3383 /* always consume the caller's reference */
3384 *reserved_link = 0;
3385
3386 link->wql_wqs.wql_set = wqset;
3387 wql_mkvalid(link);
3388
3389 wqset->wqset_id = link->wql_setid.id;
3390 wql_put_link(link);
3391
3392 } else {
3393 /*
3394	 * Lazily allocate the link only when an actual id is needed.
3395 */
3396 wqset->wqset_id = WQSET_NOT_LINKED;
3397 }
3398
3399 return KERN_SUCCESS;
3400}
3401
3402#if DEVELOPMENT || DEBUG
3403
3404int
3405sysctl_helper_waitq_set_nelem(void)
3406{
3407 return ltable_nelem(&g_wqlinktable);
3408}
3409
3410#endif
3411
3412/**
3413 * initialize a waitq set link.
3414 *
3415 * Conditions:
3416 * may block
3417 *	locks and unlocks the waitq set lock
3418 *
3419 */
3420void
3421waitq_set_lazy_init_link(struct waitq_set *wqset)
3422{
3423 struct waitq_link *link;
3424
3425 assert(get_preemption_level() == 0 && waitq_wait_possible(current_thread()));
3426
3427 waitq_set_lock(wqset);
3428 if (!waitq_set_should_lazy_init_link(wqset)){
3429 waitq_set_unlock(wqset);
3430 return;
3431 }
3432
3433 assert(wqset->wqset_id == WQSET_NOT_LINKED);
3434 waitq_set_unlock(wqset);
3435
3436 link = wql_alloc_link(WQL_WQS);
3437 if (!link)
3438 panic("Can't allocate link object for waitq set: %p", wqset);
3439
3440 link->wql_wqs.wql_set = wqset;
3441
3442 waitq_set_lock(wqset);
3443 if (waitq_set_should_lazy_init_link(wqset)) {
3444 wql_mkvalid(link);
3445 wqset->wqset_id = link->wql_setid.id;
3446 }
3447
3448 assert(wqset->wqset_id != 0);
3449 assert(wqset->wqset_id != WQSET_NOT_LINKED);
3450
3451 waitq_set_unlock(wqset);
3452
3453 wql_put_link(link);
3454
3455 return;
3456}
3457
3458/**
3459 * checks if a waitq set needs to be linked.
3460 *
3461 */
3462boolean_t
3463waitq_set_should_lazy_init_link(struct waitq_set *wqset)
3464{
3465 if (waitqs_is_linked(wqset) || wqset->wqset_id == 0) {
3466 return FALSE;
3467 }
3468 return TRUE;
3469}
3470
3471/**
3472 * clear out / release any resources associated with a waitq set
3473 *
3474 * Conditions:
3475 * may block
3476 * Note:
3477 * This will render the waitq set invalid, and it must
3478 * be re-initialized with waitq_set_init before it can be used again
3479 */
3480void waitq_set_deinit(struct waitq_set *wqset)
3481{
3482 struct waitq_link *link = NULL;
3483 uint64_t set_id, prepost_id;
3484
3485 if (!waitqs_is_set(wqset))
3486 panic("trying to de-initialize an invalid wqset @%p", wqset);
3487
3488 assert(!waitq_irq_safe(&wqset->wqset_q));
3489
3490 waitq_set_lock(wqset);
3491
3492 set_id = wqset->wqset_id;
3493
3494 if (waitqs_is_linked(wqset) || set_id == 0) {
3495
3496 /* grab the set's link object */
3497 link = wql_get_link(set_id);
3498 if (link) {
3499 wql_invalidate(link);
3500 }
3501 /* someone raced us to deinit */
3502 if (!link || wqset->wqset_id != set_id || set_id != link->wql_setid.id) {
3503 if (link) {
3504 wql_put_link(link);
3505 }
3506 waitq_set_unlock(wqset);
3507 return;
3508 }
3509
3510 /* the link should be a valid link object at this point */
3511 assert(link != NULL && wql_type(link) == WQL_WQS);
3512
3513 wqset->wqset_id = 0;
3514 }
3515
3516 /*
3517 * This set may have a lot of preposts, or may have been a member of
3518 * many other sets. To minimize spinlock hold times, we clear out the
3519 * waitq set data structure under the lock-hold, but don't clear any
3520 * table objects. We keep handles to the prepost and set linkage
3521 * objects and free those outside the critical section.
3522 */
3523 prepost_id = 0;
3524 if (wqset->wqset_q.waitq_prepost && wqset->wqset_prepost_id) {
3525 assert(link != NULL);
3526 prepost_id = wqset->wqset_prepost_id;
3527 }
3528 /* else { TODO: notify kqueue subsystem? } */
3529 wqset->wqset_prepost_id = 0;
3530
3531 wqset->wqset_q.waitq_fifo = 0;
3532 wqset->wqset_q.waitq_prepost = 0;
3533 wqset->wqset_q.waitq_isvalid = 0;
3534
3535 /* don't clear the 'waitq_irq' bit: it's used in locking! */
3536 wqset->wqset_q.waitq_eventmask = 0;
3537
3538 waitq_unlink_all_unlock(&wqset->wqset_q);
3539 /* wqset->wqset_q unlocked and set links deallocated */
3540
3541
3542 if (link) {
3543 /*
3544 * walk_waitq_links may race with us for access to the waitq set.
3545 * If walk_waitq_links has a reference to the set, then we should wait
3546 * until the link's refcount goes to 1 (our reference) before we exit
3547 * this function. That way we ensure that the waitq set memory will
3548 * remain valid even though it's been cleared out.
3549 */
3550 while (wql_refcnt(link) > 1)
3551 delay(1);
3552 wql_put_link(link);
3553 }
3554
3555 /* drop / unlink all the prepost table objects */
3556 /* JMM - can this happen before the delay? */
3557 if (prepost_id)
3558 (void)wq_prepost_iterate(prepost_id, NULL,
3559 wqset_clear_prepost_chain_cb);
3560}
3561
3562/**
3563 * de-initialize and free an allocated waitq set object
3564 *
3565 * Conditions:
3566 * may block
3567 */
3568kern_return_t waitq_set_free(struct waitq_set *wqset)
3569{
3570 waitq_set_deinit(wqset);
3571
3572 memset(wqset, 0, sizeof(*wqset));
3573 zfree(waitq_set_zone, wqset);
3574
3575 return KERN_SUCCESS;
3576}
3577
3578#if DEVELOPMENT || DEBUG
3579#if CONFIG_WAITQ_DEBUG
3580/**
3581 * return the set ID of 'wqset'
3582 */
3583uint64_t wqset_id(struct waitq_set *wqset)
3584{
3585 if (!wqset)
3586 return 0;
3587
3588 assert(waitqs_is_set(wqset));
3589
3590 if (!waitqs_is_linked(wqset)) {
3591 waitq_set_lazy_init_link(wqset);
3592 }
3593
3594 return wqset->wqset_id;
3595}
3596
3597/**
3598 * returns a pointer to the waitq object embedded in 'wqset'
3599 */
3600struct waitq *wqset_waitq(struct waitq_set *wqset)
3601{
3602 if (!wqset)
3603 return NULL;
3604
3605 assert(waitqs_is_set(wqset));
3606
3607 return &wqset->wqset_q;
3608}
3609#endif /* CONFIG_WAITQ_DEBUG */
3610#endif /* DEVELOPMENT || DEBUG */
3611
3612
3613/**
3614 * clear all preposts originating from 'waitq'
3615 *
3616 * Conditions:
3617 * 'waitq' locked
3618 * may (rarely) spin waiting for another on-core thread to
3619 * release the last reference to the waitq's prepost link object
3620 *
3621 * NOTE:
3622 * If this function needs to spin, it will drop the waitq lock!
3623 * The return value of the function indicates whether or not this
3624 * happened: 1 == lock was dropped, 0 == lock held
3625 */
3626int waitq_clear_prepost_locked(struct waitq *waitq)
3627{
3628 struct wq_prepost *wqp;
3629 int dropped_lock = 0;
3630
3631 assert(!waitq_irq_safe(waitq));
3632
3633 if (waitq->waitq_prepost_id == 0)
3634 return 0;
3635
3636 wqp = wq_prepost_get(waitq->waitq_prepost_id);
3637 waitq->waitq_prepost_id = 0;
3638 if (wqp) {
3639 uint64_t wqp_id = wqp->wqp_prepostid.id;
3640 wqdbg_v("invalidate prepost 0x%llx (refcnt:%d)",
3641 wqp->wqp_prepostid.id, wqp_refcnt(wqp));
3642 wq_prepost_invalidate(wqp);
3643 while (wqp_refcnt(wqp) > 1) {
3644
3645 /*
3646 * Some other thread must have raced us to grab a link
3647 * object reference before we invalidated it. This
3648 * means that they are probably trying to access the
3649 * waitq to which the prepost object points. We need
3650 * to wait here until the other thread drops their
3651 * reference. We know that no one else can get a
3652 * reference (the object has been invalidated), and
3653 * that prepost references are short-lived (dropped on
3654 * a call to wq_prepost_put). We also know that no one
3655			 * blocks while holding a reference; therefore the
3656 * other reference holder must be on-core. We'll just
3657 * sit and wait for the other reference to be dropped.
3658 */
3659 disable_preemption();
3660
3661 waitq_unlock(waitq);
3662 dropped_lock = 1;
3663 /*
3664 * don't yield here, just spin and assume the other
3665 * consumer is already on core...
3666 */
3667 delay(1);
3668
3669 waitq_lock(waitq);
3670
3671 enable_preemption();
3672 }
3673 if (wqp_refcnt(wqp) > 0 && wqp->wqp_prepostid.id == wqp_id)
3674 wq_prepost_put(wqp);
3675 }
3676
3677 return dropped_lock;
3678}
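/*
 * Illustrative handling of the return value described above (a sketch; 'wq'
 * is a placeholder): if the lock was dropped, the waitq must be re-validated
 * before the caller keeps relying on state it observed earlier.
 *
 *	waitq_lock(wq);
 *	if (waitq_clear_prepost_locked(wq)) {
 *		(lock was dropped and re-taken: re-check anything cached)
 *		if (!waitq_valid(wq)) {
 *			waitq_unlock(wq);
 *			return;
 *		}
 *	}
 */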
3679
3680/**
3681 * clear all preposts originating from 'waitq'
3682 *
3683 * Conditions:
3684 * 'waitq' is not locked
3685 * may disable and re-enable interrupts
3686 */
3687void waitq_clear_prepost(struct waitq *waitq)
3688{
3689 assert(waitq_valid(waitq));
3690 assert(!waitq_irq_safe(waitq));
3691
3692 waitq_lock(waitq);
3693 /* it doesn't matter to us if the lock is dropped here */
3694 (void)waitq_clear_prepost_locked(waitq);
3695 waitq_unlock(waitq);
3696}
3697
3698/**
3699 * return the waitq's prepost object ID (allocate if necessary)
3700 *
3701 * Conditions:
3702 * 'waitq' is unlocked
3703 */
3704uint64_t waitq_get_prepost_id(struct waitq *waitq)
3705{
3706 struct wq_prepost *wqp;
3707 uint64_t wqp_id = 0;
3708
3709 if (!waitq_valid(waitq))
3710 return 0;
3711
3712 assert(!waitq_irq_safe(waitq));
3713
3714 waitq_lock(waitq);
3715
3716 if (!waitq_valid(waitq))
3717 goto out_unlock;
3718
3719 if (waitq->waitq_prepost_id) {
3720 wqp_id = waitq->waitq_prepost_id;
3721 goto out_unlock;
3722 }
3723
3724 /* don't hold a spinlock while allocating a prepost object */
3725 waitq_unlock(waitq);
3726
3727 wqp = wq_prepost_alloc(WQP_WQ, 1);
3728 if (!wqp)
3729 return 0;
3730
3731 /* re-acquire the waitq lock */
3732 waitq_lock(waitq);
3733
3734 if (!waitq_valid(waitq)) {
3735 wq_prepost_put(wqp);
3736 wqp_id = 0;
3737 goto out_unlock;
3738 }
3739
3740 if (waitq->waitq_prepost_id) {
3741 /* we were beat by someone else */
3742 wq_prepost_put(wqp);
3743 wqp_id = waitq->waitq_prepost_id;
3744 goto out_unlock;
3745 }
3746
3747 wqp->wqp_wq.wqp_wq_ptr = waitq;
3748
3749 wqp_set_valid(wqp);
3750 wqp_id = wqp->wqp_prepostid.id;
3751 waitq->waitq_prepost_id = wqp_id;
3752
3753 wq_prepost_put(wqp);
3754
3755out_unlock:
3756 waitq_unlock(waitq);
3757
3758 return wqp_id;
3759}
3760
3761
3762static int waitq_inset_cb(struct waitq *waitq, void *ctx, struct waitq_link *link)
3763{
3764 uint64_t setid = *(uint64_t *)ctx;
3765 int wqltype = wql_type(link);
3766 (void)waitq;
3767 if (wqltype == WQL_WQS && link->wql_setid.id == setid) {
3768 wqdbg_v(" waitq already in set 0x%llx", setid);
3769 return WQ_ITERATE_FOUND;
3770 } else if (wqltype == WQL_LINK) {
3771 /*
3772 * break out early if we see a link that points to the setid
3773 * in question. This saves us a step in the
3774 * iteration/recursion
3775 */
3776 wqdbg_v(" waitq already in set 0x%llx (WQL_LINK)", setid);
3777 if (link->wql_link.left_setid == setid ||
3778 link->wql_link.right_setid == setid)
3779 return WQ_ITERATE_FOUND;
3780 }
3781
3782 return WQ_ITERATE_CONTINUE;
3783}
3784
3785/**
3786 * determine if 'waitq' is a member of 'wqset'
3787 *
3788 * Conditions:
3789 *	neither 'waitq' nor 'wqset' is locked
3790 * may disable and re-enable interrupts while locking 'waitq'
3791 */
3792boolean_t waitq_member(struct waitq *waitq, struct waitq_set *wqset)
3793{
3794 kern_return_t kr = WQ_ITERATE_SUCCESS;
3795 uint64_t setid;
3796
3797 if (!waitq_valid(waitq))
3798 panic("Invalid waitq: %p", waitq);
3799 assert(!waitq_irq_safe(waitq));
3800
3801 if (!waitqs_is_set(wqset))
3802 return FALSE;
3803
3804 waitq_lock(waitq);
3805
3806 if (!waitqs_is_linked(wqset))
3807 goto out_unlock;
3808
3809 setid = wqset->wqset_id;
3810
3811 /* fast path: most waitqs are members of only 1 set */
3812 if (waitq->waitq_set_id == setid) {
3813 waitq_unlock(waitq);
3814 return TRUE;
3815 }
3816
3817 /* walk the link table and look for the Set ID of wqset */
3818 kr = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
3819 WQL_ALL, (void *)&setid, waitq_inset_cb);
3820
3821out_unlock:
3822 waitq_unlock(waitq);
3823 return (kr == WQ_ITERATE_FOUND);
3824}
3825
3826/**
3827 * Returns true if the given waitq is a member of at least 1 set
3828 */
3829boolean_t waitq_in_set(struct waitq *waitq)
3830{
3831 struct waitq_link *link;
3832 boolean_t inset = FALSE;
3833
3834 if (waitq_irq_safe(waitq))
3835 return FALSE;
3836
3837 waitq_lock(waitq);
3838
3839 if (!waitq->waitq_set_id)
3840 goto out_unlock;
3841
3842 link = wql_get_link(waitq->waitq_set_id);
3843 if (link) {
3844 /* if we get here, the waitq is in _at_least_one_ set */
3845 inset = TRUE;
3846 wql_put_link(link);
3847 } else {
3848 /* we can just optimize this for next time */
3849 waitq->waitq_set_id = 0;
3850 }
3851
3852out_unlock:
3853 waitq_unlock(waitq);
3854 return inset;
3855}
3856
3857
3858/**
3859 * pre-allocate a waitq link structure from the link table
3860 *
3861 * Conditions:
3862 * 'waitq' is not locked
3863 * may (rarely) block if link table needs to grow
3864 */
3865uint64_t waitq_link_reserve(struct waitq *waitq)
3866{
3867 struct waitq_link *link;
3868 uint64_t reserved_id = 0;
3869
3870 assert(get_preemption_level() == 0 && waitq_wait_possible(current_thread()));
3871
3872 /*
3873 * We've asserted that the caller can block, so we enforce a
3874 * minimum-free table element policy here.
3875 */
3876 wql_ensure_free_space();
3877
3878 (void)waitq;
3879 link = wql_alloc_link(LT_RESERVED);
3880 if (!link)
3881 return 0;
3882
3883 reserved_id = link->wql_setid.id;
3884
3885 return reserved_id;
3886}
3887
3888/**
3889 * release a pre-allocated waitq link structure
3890 */
3891void waitq_link_release(uint64_t id)
3892{
3893 struct waitq_link *link;
3894
3895 if (id == 0)
3896 return;
3897
3898 link = wql_get_reserved(id, WQL_LINK);
3899 if (!link)
3900 return;
3901
3902 /*
	 * if we successfully got a link object, then we know it has not
	 * been marked valid and can be released with a standard
	 * wql_put_link(), which should free the element.
3906 */
3907 wql_put_link(link);
3908#if CONFIG_LTABLE_STATS
3909 g_wqlinktable.nreserved_releases += 1;
3910#endif
3911}
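
/*
 * Example (illustrative sketch, not part of the build): pairing
 * waitq_link_reserve() with waitq_link_release() when the reservation
 * turns out to be unneeded. 'my_waitq' and 'need_link' are hypothetical.
 *
 *	uint64_t rid = waitq_link_reserve(&my_waitq);	// may block (table growth)
 *	...
 *	if (!need_link)
 *		waitq_link_release(rid);	// safe even if rid == 0
 */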
3912
3913/**
3914 * link 'waitq' to the set identified by 'setid' using the 'link' structure
3915 *
3916 * Conditions:
3917 * 'waitq' is locked
3918 * caller should have a reference to the 'link' object
3919 */
3920static kern_return_t waitq_link_internal(struct waitq *waitq,
3921 uint64_t setid, struct waitq_link *link)
3922{
3923 struct waitq_link *qlink;
3924 kern_return_t kr;
3925
3926 assert(waitq_held(waitq));
3927 assert(setid != 0);
3928 assert(setid != WQSET_NOT_LINKED);
3929
3930 /*
3931 * If the waitq_set_id field is empty, then this waitq is not
3932 * a member of any other set. All we have to do is update the
3933 * field.
3934 */
3935 if (!waitq->waitq_set_id) {
3936 waitq->waitq_set_id = setid;
3937 return KERN_SUCCESS;
3938 }
3939
3940 qlink = wql_get_link(waitq->waitq_set_id);
3941 if (!qlink) {
3942 /*
3943 * The set to which this wait queue belonged has been
3944 * destroyed / invalidated. We can re-use the waitq field.
3945 */
3946 waitq->waitq_set_id = setid;
3947 return KERN_SUCCESS;
3948 }
3949 wql_put_link(qlink);
3950
3951 /*
3952 * Check to see if it's already a member of the set.
3953 *
3954 * TODO: check for cycles!
3955 */
3956 kr = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
3957 WQL_ALL, (void *)&setid, waitq_inset_cb);
3958 if (kr == WQ_ITERATE_FOUND)
3959 return KERN_ALREADY_IN_SET;
3960
3961 /*
3962 * This wait queue is a member of at least one set already,
3963 * and _not_ a member of the given set. Use our previously
3964 * allocated link object, and hook it up to the wait queue.
3965 * Note that it's possible that one or more of the wait queue sets to
3966 * which the wait queue belongs was invalidated before we allocated
3967 * this link object. That's OK because the next time we use that
3968 * object we'll just ignore it.
3969 */
3970 link->wql_link.left_setid = setid;
3971 link->wql_link.right_setid = waitq->waitq_set_id;
3972 wql_mkvalid(link);
3973
3974 waitq->waitq_set_id = link->wql_setid.id;
3975
3976 return KERN_SUCCESS;
3977}
3978
3979/**
3980 * link 'waitq' to 'wqset'
3981 *
3982 * Conditions:
3983 * if 'lock_state' contains WAITQ_SHOULD_LOCK, 'waitq' must be unlocked.
3984 * Otherwise, 'waitq' must be locked.
3985 *
3986 * may (rarely) block on link table allocation if the table has to grow,
3987 * and no 'reserved_link' object is passed.
3988 *
3989 * may block and acquire wqset lock if the wqset passed has no link.
3990 *
3991 * Notes:
3992 * The caller can guarantee that this function will never block by
3993 * - pre-allocating a link table object and passing its ID in 'reserved_link'
 *	- and pre-allocating the waitq set link by calling waitq_set_lazy_init_link.
3995 * It is not possible to provide a reserved_link without having also linked
3996 * the wqset.
3997 */
3998kern_return_t waitq_link(struct waitq *waitq, struct waitq_set *wqset,
3999 waitq_lock_state_t lock_state, uint64_t *reserved_link)
4000{
4001 kern_return_t kr;
4002 struct waitq_link *link;
4003 int should_lock = (lock_state == WAITQ_SHOULD_LOCK);
4004
4005 if (!waitq_valid(waitq) || waitq_irq_safe(waitq))
4006 panic("Invalid waitq: %p", waitq);
4007
4008 if (!waitqs_is_set(wqset))
4009 return KERN_INVALID_ARGUMENT;
4010
4011 if (!reserved_link || *reserved_link == 0) {
4012 if (!waitqs_is_linked(wqset)) {
4013 waitq_set_lazy_init_link(wqset);
4014 }
4015 }
4016
4017 wqdbg_v("Link waitq %p to wqset 0x%llx",
4018 (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq), wqset->wqset_id);
4019
4020 /*
4021 * We _might_ need a new link object here, so we'll grab outside
4022 * the lock because the alloc call _might_ block.
4023 *
4024 * If the caller reserved a link beforehand, then wql_get_link
4025 * is guaranteed not to block because the caller holds an extra
	 * reference to the link which, in turn, holds a reference to the
4027 * link table.
4028 */
4029 if (reserved_link && *reserved_link != 0) {
4030 link = wql_get_reserved(*reserved_link, WQL_LINK);
4031 /* always consume the caller's reference */
4032 *reserved_link = 0;
4033 } else {
4034 link = wql_alloc_link(WQL_LINK);
4035 }
4036 if (!link)
4037 return KERN_NO_SPACE;
4038
4039 if (should_lock) {
4040 waitq_lock(waitq);
4041 }
4042
4043 kr = waitq_link_internal(waitq, wqset->wqset_id, link);
4044
4045 if (should_lock) {
4046 waitq_unlock(waitq);
4047 }
4048
4049 wql_put_link(link);
4050
4051 return kr;
4052}
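
/*
 * Example (illustrative sketch, not part of the build): linking without
 * blocking inside waitq_link() by reserving the link object up front, per
 * the Notes above. 'my_waitq' and 'my_set' are hypothetical.
 *
 *	uint64_t rid = waitq_link_reserve(&my_waitq);	// may block; do this early
 *	if (!waitqs_is_linked(&my_set))
 *		waitq_set_lazy_init_link(&my_set);	// may block and lock my_set
 *	...
 *	kern_return_t kr = waitq_link(&my_waitq, &my_set, WAITQ_SHOULD_LOCK, &rid);
 *	waitq_link_release(rid);	// no-op: waitq_link() zeroed the reservation
 */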
4053
4054/**
4055 * helper: unlink 'waitq' from waitq set identified by 'setid'
4056 * this function also prunes invalid objects from the tree
4057 *
4058 * Conditions:
4059 * MUST be called from walk_waitq_links link table walk
4060 * 'waitq' is locked
4061 *
4062 * Notes:
4063 * This is a helper function which compresses the link table by culling
4064 * unused or unnecessary links. See comments below for different
4065 * scenarios.
4066 */
4067static inline int waitq_maybe_remove_link(struct waitq *waitq,
4068 uint64_t setid,
4069 struct waitq_link *parent,
4070 struct waitq_link *left,
4071 struct waitq_link *right)
4072{
4073 uint64_t *wq_setid = &waitq->waitq_set_id;
4074
4075 /*
4076 * There are two scenarios:
4077 *
4078 * Scenario 1:
4079 * --------------------------------------------------------------------
4080 * waitq->waitq_set_id == parent
4081 *
4082 * parent(LINK)
4083 * / \
4084 * / \
4085 * / \
4086 * L(LINK/WQS_l) R(LINK/WQS_r)
4087 *
4088 * In this scenario, we assert that the original waitq points to the
4089 * parent link we were passed in. If WQS_l (or WQS_r) is the waitq
4090 * set we're looking for, we can set the corresponding parent
4091 * link id (left or right) to 0. To compress the tree, we can reset the
4092 * waitq_set_id of the original waitq to point to the side of the
4093 * parent that is still valid. We then discard the parent link object.
4094 */
4095 if (*wq_setid == parent->wql_setid.id) {
4096 if (!left && !right) {
4097 /* completely invalid children */
4098 wql_invalidate(parent);
4099 wqdbg_v("S1, L+R");
4100 *wq_setid = 0;
4101 return WQ_ITERATE_INVALID;
4102 } else if (!left || left->wql_setid.id == setid) {
4103 /*
			 * if the left side matches, we know it points either to the
4105 * WQS we're unlinking, or to an invalid object:
4106 * no need to invalidate it
4107 */
4108 *wq_setid = right ? right->wql_setid.id : 0;
4109 wql_invalidate(parent);
4110 wqdbg_v("S1, L");
4111 return left ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4112 } else if (!right || right->wql_setid.id == setid) {
4113 /*
			 * if the right side matches, we know it points either to the
4115 * WQS we're unlinking, or to an invalid object:
4116 * no need to invalidate it
4117 */
4118 *wq_setid = left ? left->wql_setid.id : 0;
4119 wql_invalidate(parent);
4120 wqdbg_v("S1, R");
4121 return right ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4122 }
4123 }
4124
4125 /*
4126 * the tree walk starts at the top-of-tree and moves down,
	 * so this assert is safe.
4128 */
4129 assert(left || right); /* one of them has to be valid at this point */
4130
4131 /*
4132 * Scenario 2:
4133 * --------------------------------------------------------------------
4134 * waitq->waitq_set_id == ... (OR parent)
4135 *
4136 * ...
4137 * |
4138 * parent
4139 * / \
4140 * / \
4141 * L(LINK) R(LINK)
4142 * /\ /\
4143 * / \ / \
4144 * / \ Rl(*) Rr(*)
4145 * Ll(WQS) Lr(WQS)
4146 *
4147 * In this scenario, a leaf node of either the left or right side
4148 * could be the wait queue set we're looking to unlink. We also handle
4149 * the case where one of these links is invalid. If a leaf node is
4150 * invalid or it's the set we're looking for, we can safely remove the
4151 * middle link (left or right) and point the parent link directly to
4152 * the remaining leaf node.
4153 */
4154 if (left && wql_type(left) == WQL_LINK) {
4155 uint64_t Ll, Lr;
4156 struct waitq_link *linkLl, *linkLr;
4157 assert(left->wql_setid.id != setid);
4158 Ll = left->wql_link.left_setid;
4159 Lr = left->wql_link.right_setid;
4160 linkLl = wql_get_link(Ll);
4161 linkLr = wql_get_link(Lr);
4162 if (!linkLl && !linkLr) {
4163 /*
4164 * The left object points to two invalid objects!
4165 * We can invalidate the left w/o touching the parent.
4166 */
4167 wql_invalidate(left);
4168 wqdbg_v("S2, Ll+Lr");
4169 return WQ_ITERATE_INVALID;
4170 } else if (!linkLl || Ll == setid) {
4171 /* Ll is invalid and/or the wait queue set we're looking for */
4172 parent->wql_link.left_setid = Lr;
4173 wql_invalidate(left);
4174 wql_put_link(linkLl);
4175 wql_put_link(linkLr);
4176 wqdbg_v("S2, Ll");
4177 return linkLl ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4178 } else if (!linkLr || Lr == setid) {
4179 /* Lr is invalid and/or the wait queue set we're looking for */
4180 parent->wql_link.left_setid = Ll;
4181 wql_invalidate(left);
4182 wql_put_link(linkLr);
4183 wql_put_link(linkLl);
4184 wqdbg_v("S2, Lr");
4185 return linkLr ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4186 }
4187 wql_put_link(linkLl);
4188 wql_put_link(linkLr);
4189 }
4190
4191 if (right && wql_type(right) == WQL_LINK) {
4192 uint64_t Rl, Rr;
4193 struct waitq_link *linkRl, *linkRr;
4194 assert(right->wql_setid.id != setid);
4195 Rl = right->wql_link.left_setid;
4196 Rr = right->wql_link.right_setid;
4197 linkRl = wql_get_link(Rl);
4198 linkRr = wql_get_link(Rr);
4199 if (!linkRl && !linkRr) {
4200 /*
4201 * The right object points to two invalid objects!
4202 * We can invalidate the right w/o touching the parent.
4203 */
4204 wql_invalidate(right);
4205 wqdbg_v("S2, Rl+Rr");
4206 return WQ_ITERATE_INVALID;
4207 } else if (!linkRl || Rl == setid) {
4208 /* Rl is invalid and/or the wait queue set we're looking for */
4209 parent->wql_link.right_setid = Rr;
4210 wql_invalidate(right);
4211 wql_put_link(linkRl);
4212 wql_put_link(linkRr);
4213 wqdbg_v("S2, Rl");
4214 return linkRl ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4215 } else if (!linkRr || Rr == setid) {
4216 /* Rr is invalid and/or the wait queue set we're looking for */
4217 parent->wql_link.right_setid = Rl;
4218 wql_invalidate(right);
4219 wql_put_link(linkRl);
4220 wql_put_link(linkRr);
4221 wqdbg_v("S2, Rr");
4222 return linkRr ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4223 }
4224 wql_put_link(linkRl);
4225 wql_put_link(linkRr);
4226 }
4227
4228 return WQ_ITERATE_CONTINUE;
4229}
4230
4231/**
4232 * link table walk callback that unlinks 'waitq' from 'ctx->setid'
4233 *
4234 * Conditions:
4235 * called from walk_waitq_links
4236 * 'waitq' is locked
4237 *
4238 * Notes:
4239 * uses waitq_maybe_remove_link() to compress the linktable and
4240 * perform the actual unlinking
4241 */
4242static int waitq_unlink_cb(struct waitq *waitq, void *ctx,
4243 struct waitq_link *link)
4244{
4245 uint64_t setid = *((uint64_t *)ctx);
4246 struct waitq_link *right, *left;
4247 int ret = 0;
4248
4249 if (wql_type(link) != WQL_LINK)
4250 return WQ_ITERATE_CONTINUE;
4251
4252 do {
4253 left = wql_get_link(link->wql_link.left_setid);
4254 right = wql_get_link(link->wql_link.right_setid);
4255
4256 ret = waitq_maybe_remove_link(waitq, setid, link, left, right);
4257
4258 wql_put_link(left);
4259 wql_put_link(right);
4260
4261 if (!wql_is_valid(link))
4262 return WQ_ITERATE_INVALID;
		/* a ret value of UNLINKED will break us out of the table walk */
4264 } while (ret == WQ_ITERATE_INVALID);
4265
4266 return ret;
4267}
4268
4269
4270/**
4271 * undo/remove a prepost from 'ctx' (waitq) to 'wqset'
4272 *
4273 * Conditions:
4274 * Called from wq_prepost_foreach_locked OR wq_prepost_iterate
4275 * 'wqset' may be NULL
4276 * (ctx)->unlink_wqset is locked
4277 */
4278static int waitq_unlink_prepost_cb(struct waitq_set __unused *wqset, void *ctx,
4279 struct wq_prepost *wqp, struct waitq *waitq)
4280{
4281 struct wq_unlink_ctx *ulctx = (struct wq_unlink_ctx *)ctx;
4282
4283 if (waitq != ulctx->unlink_wq)
4284 return WQ_ITERATE_CONTINUE;
4285
4286 if (wqp_type(wqp) == WQP_WQ &&
4287 wqp->wqp_prepostid.id == ulctx->unlink_wqset->wqset_prepost_id) {
4288 /* this is the only prepost on this wait queue set */
4289 wqdbg_v("unlink wqp (WQ) 0x%llx", wqp->wqp_prepostid.id);
4290 ulctx->unlink_wqset->wqset_prepost_id = 0;
4291 return WQ_ITERATE_BREAK;
4292 }
4293
4294 assert(wqp_type(wqp) == WQP_POST);
4295
4296 /*
4297 * The prepost object 'wqp' points to a waitq which should no longer
4298 * be preposted to 'ulctx->unlink_wqset'. We can remove the prepost
4299 * object from the list and break out of the iteration. Using the
4300 * context object in this way allows this same callback function to be
4301 * used from both wq_prepost_foreach_locked and wq_prepost_iterate.
4302 */
4303 wq_prepost_remove(ulctx->unlink_wqset, wqp);
4304 return WQ_ITERATE_BREAK;
4305}
4306
4307/**
4308 * unlink 'waitq' from 'wqset'
4309 *
4310 * Conditions:
4311 * 'waitq' is locked
4312 * 'wqset' is _not_ locked
4313 * may (rarely) spin in prepost clear and drop/re-acquire 'waitq' lock
4314 * (see waitq_clear_prepost_locked)
4315 */
4316static kern_return_t waitq_unlink_locked(struct waitq *waitq,
4317 struct waitq_set *wqset)
4318{
4319 uint64_t setid;
4320 kern_return_t kr;
4321
4322 assert(!waitq_irq_safe(waitq));
4323
4324 if (waitq->waitq_set_id == 0) {
4325 /*
4326 * TODO:
4327 * it doesn't belong to anyone, and it has a prepost object?
4328 * This is an artifact of not cleaning up after kqueues when
4329 * they prepost into select sets...
4330 */
4331 if (waitq->waitq_prepost_id != 0)
4332 (void)waitq_clear_prepost_locked(waitq);
4333 return KERN_NOT_IN_SET;
4334 }
4335
4336 if (!waitqs_is_linked(wqset)) {
4337 /*
4338 * No link has been allocated for the wqset,
4339 * so no waitq could have been linked to it.
4340 */
4341 return KERN_NOT_IN_SET;
4342 }
4343
4344 setid = wqset->wqset_id;
4345
4346 if (waitq->waitq_set_id == setid) {
4347 waitq->waitq_set_id = 0;
4348 /*
4349 * This was the only set to which the waitq belonged: we can
4350 * safely release the waitq's prepost object. It doesn't
4351 * matter if this function drops and re-acquires the lock
4352 * because we're not manipulating waitq state any more.
4353 */
4354 (void)waitq_clear_prepost_locked(waitq);
4355 return KERN_SUCCESS;
4356 }
4357
4358 /*
	 * The waitq was a member of more than one set, so we need to
4360 * handle potentially compressing the link table, and
4361 * adjusting the waitq->waitq_set_id value.
4362 *
4363 * Note: we can't free the waitq's associated prepost object (if any)
4364 * because it may be in use by the one or more _other_ sets to
4365 * which this queue belongs.
4366 *
4367 * Note: This function only handles a single level of the queue linkage.
4368 * Removing a waitq from a set to which it does not directly
	 *       belong is undefined. For example, if a waitq belonged to set
	 *       A, and set A belonged to set B, you could not remove the waitq
	 *       from set B.
4372 */
4373 kr = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
4374 WQL_LINK, (void *)&setid, waitq_unlink_cb);
4375
4376 if (kr == WQ_ITERATE_UNLINKED) {
4377 struct wq_unlink_ctx ulctx;
4378
4379 kr = KERN_SUCCESS; /* found it and dis-associated it */
4380
4381 /* don't look for preposts if it's not prepost-enabled */
4382 if (!wqset->wqset_q.waitq_prepost)
4383 goto out;
4384
4385 assert(!waitq_irq_safe(&wqset->wqset_q));
4386
4387 waitq_set_lock(wqset);
4388 /*
4389 * clear out any prepost from waitq into wqset
4390 * TODO: this could be more efficient than a linear search of
4391 * the waitq set's prepost list.
4392 */
4393 ulctx.unlink_wq = waitq;
4394 ulctx.unlink_wqset = wqset;
4395 (void)wq_prepost_iterate(wqset->wqset_prepost_id, (void *)&ulctx,
4396 waitq_unlink_prepost_cb);
4397 waitq_set_unlock(wqset);
4398 } else {
4399 kr = KERN_NOT_IN_SET; /* waitq is _not_ associated with wqset */
4400 }
4401
4402out:
4403 return kr;
4404}
4405
4406/**
4407 * unlink 'waitq' from 'wqset'
4408 *
4409 * Conditions:
4410 * neither 'waitq' nor 'wqset' is locked
4411 * may disable and re-enable interrupts
4412 * may (rarely) spin in prepost clear
4413 * (see waitq_clear_prepost_locked)
4414 */
4415kern_return_t waitq_unlink(struct waitq *waitq, struct waitq_set *wqset)
4416{
4417 kern_return_t kr = KERN_SUCCESS;
4418
4419 assert(waitqs_is_set(wqset));
4420
4421 /*
4422 * we allow the waitq to be invalid because the caller may be trying
4423 * to clear out old/dirty state
4424 */
4425 if (!waitq_valid(waitq))
4426 return KERN_INVALID_ARGUMENT;
4427
4428 wqdbg_v("unlink waitq %p from set 0x%llx",
4429 (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq), wqset->wqset_id);
4430
4431 assert(!waitq_irq_safe(waitq));
4432
4433 waitq_lock(waitq);
4434
4435 kr = waitq_unlink_locked(waitq, wqset);
4436
4437 waitq_unlock(waitq);
4438 return kr;
4439}
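
/*
 * Example (illustrative sketch, not part of the build): tearing down a
 * linkage made with waitq_link(). KERN_NOT_IN_SET simply means there was
 * nothing to unlink; 'my_waitq' and 'my_set' are hypothetical.
 *
 *	kern_return_t kr = waitq_unlink(&my_waitq, &my_set);
 *	assert(kr == KERN_SUCCESS || kr == KERN_NOT_IN_SET);
 */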
4440
4441/**
4442 * unlink a waitq from a waitq set, but reference the waitq by its prepost ID
4443 *
4444 * Conditions:
4445 * 'wqset' is unlocked
4446 * wqp_id may be valid or invalid
4447 */
4448void waitq_unlink_by_prepost_id(uint64_t wqp_id, struct waitq_set *wqset)
4449{
4450 struct wq_prepost *wqp;
4451
4452 disable_preemption();
4453 wqp = wq_prepost_get(wqp_id);
4454 if (wqp) {
4455 struct waitq *wq;
4456
4457 wq = wqp->wqp_wq.wqp_wq_ptr;
4458
4459 /*
4460 * lock the waitq, then release our prepost ID reference, then
4461 * unlink the waitq from the wqset: this ensures that we don't
4462 * hold a prepost ID reference during the unlink, but we also
4463 * complete the unlink operation atomically to avoid a race
4464 * with waitq_unlink[_all].
4465 */
4466 assert(!waitq_irq_safe(wq));
4467
4468 waitq_lock(wq);
4469 wq_prepost_put(wqp);
4470
4471 if (!waitq_valid(wq)) {
4472 /* someone already tore down this waitq! */
4473 waitq_unlock(wq);
4474 enable_preemption();
4475 return;
4476 }
4477
4478 /* this _may_ drop the wq lock, but that's OK */
4479 waitq_unlink_locked(wq, wqset);
4480
4481 waitq_unlock(wq);
4482 }
4483 enable_preemption();
4484 return;
4485}
4486
4487
4488/**
4489 * reference and lock a waitq by its prepost ID
4490 *
4491 * Conditions:
4492 * wqp_id may be valid or invalid
4493 *
4494 * Returns:
4495 * a locked waitq if wqp_id was valid
4496 * NULL on failure
4497 */
4498struct waitq *waitq_lock_by_prepost_id(uint64_t wqp_id)
4499{
4500 struct waitq *wq = NULL;
4501 struct wq_prepost *wqp;
4502
4503 disable_preemption();
4504 wqp = wq_prepost_get(wqp_id);
4505 if (wqp) {
4506 wq = wqp->wqp_wq.wqp_wq_ptr;
4507
4508 assert(!waitq_irq_safe(wq));
4509
4510 waitq_lock(wq);
4511 wq_prepost_put(wqp);
4512
4513 if (!waitq_valid(wq)) {
4514 /* someone already tore down this waitq! */
4515 waitq_unlock(wq);
4516 enable_preemption();
4517 return NULL;
4518 }
4519 }
4520 enable_preemption();
4521 return wq;
4522}
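
/*
 * Example (illustrative sketch, not part of the build): resolving a stashed
 * prepost ID back to a locked waitq. 'stashed_wqp_id' is hypothetical and
 * may have been invalidated since it was saved, so the NULL return must be
 * handled.
 *
 *	struct waitq *wq = waitq_lock_by_prepost_id(stashed_wqp_id);
 *	if (wq != NULL) {
 *		... operate on the locked, still-valid waitq ...
 *		waitq_unlock(wq);
 *	}
 */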
4523
4524
4525/**
4526 * unlink 'waitq' from all sets to which it belongs
4527 *
4528 * Conditions:
4529 * 'waitq' is locked on entry
4530 * returns with waitq lock dropped
4531 *
4532 * Notes:
4533 * may (rarely) spin (see waitq_clear_prepost_locked)
4534 */
4535kern_return_t waitq_unlink_all_unlock(struct waitq *waitq)
4536{
4537 uint64_t old_set_id = 0;
4538 wqdbg_v("unlink waitq %p from all sets",
4539 (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq));
4540 assert(!waitq_irq_safe(waitq));
4541
4542 /* it's not a member of any sets */
4543 if (waitq->waitq_set_id == 0) {
4544 waitq_unlock(waitq);
4545 return KERN_SUCCESS;
4546 }
4547
4548 old_set_id = waitq->waitq_set_id;
4549 waitq->waitq_set_id = 0;
4550
4551 /*
4552 * invalidate the prepost entry for this waitq.
	 * This may drop and re-acquire the waitq lock, but that's OK: even if
	 * the waitq is added to another set and preposted to that set while
	 * the lock is dropped, the state will remain consistent.
4556 */
4557 (void)waitq_clear_prepost_locked(waitq);
4558
4559 waitq_unlock(waitq);
4560
4561 if (old_set_id) {
4562 /*
4563 * Walk the link table and invalidate each LINK object that
4564 * used to connect this waitq to one or more sets: this works
4565 * because WQL_LINK objects are private to each wait queue
4566 */
4567 (void)walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, old_set_id,
4568 WQL_LINK, NULL, waitq_unlink_all_cb);
4569 }
4570
4571 return KERN_SUCCESS;
4572}
4573
4574/**
4575 * unlink 'waitq' from all sets to which it belongs
4576 *
4577 * Conditions:
4578 * 'waitq' is not locked
4579 * may disable and re-enable interrupts
4580 * may (rarely) spin
4581 * (see waitq_unlink_all_locked, waitq_clear_prepost_locked)
4582 */
4583kern_return_t waitq_unlink_all(struct waitq *waitq)
4584{
4585 kern_return_t kr = KERN_SUCCESS;
4586
4587 if (!waitq_valid(waitq))
4588 panic("Invalid waitq: %p", waitq);
4589
4590 assert(!waitq_irq_safe(waitq));
4591 waitq_lock(waitq);
4592 if (!waitq_valid(waitq)) {
4593 waitq_unlock(waitq);
4594 return KERN_SUCCESS;
4595 }
4596
4597 kr = waitq_unlink_all_unlock(waitq);
4598 /* waitq unlocked and set links deallocated */
4599
4600 return kr;
4601}
4602
4603
4604/**
4605 * unlink all waitqs from 'wqset'
4606 *
4607 * Conditions:
4608 * 'wqset' is locked on entry
 *	'wqset' is unlocked on exit
4610 *
4611 * Note:
4612 * may (rarely) spin/block (see waitq_clear_prepost_locked)
4613 */
4614kern_return_t waitq_set_unlink_all_unlock(struct waitq_set *wqset)
4615{
4616 struct waitq_link *link;
4617 uint64_t prepost_id;
4618
4619 wqdbg_v("unlink all queues from set 0x%llx", wqset->wqset_id);
4620
4621 /*
4622 * This operation does not require interaction with any of the set's
4623 * constituent wait queues. All we have to do is invalidate the SetID
4624 */
4625
	if (waitqs_is_linked(wqset)) {
4627
4628 /* invalidate and re-alloc the link object first */
4629 link = wql_get_link(wqset->wqset_id);
4630
4631 /* we may have raced with a waitq_set_deinit: handle this */
4632 if (!link) {
4633 waitq_set_unlock(wqset);
4634 return KERN_SUCCESS;
4635 }
4636
4637 wql_invalidate(link);
4638
4639 /* re-alloc the object to get a new generation ID */
4640 wql_realloc_link(link, WQL_WQS);
4641 link->wql_wqs.wql_set = wqset;
4642
4643 wqset->wqset_id = link->wql_setid.id;
4644 wql_mkvalid(link);
4645 wql_put_link(link);
4646 }
4647
4648 /* clear any preposts attached to this set */
4649 prepost_id = 0;
4650 if (wqset->wqset_q.waitq_prepost && wqset->wqset_prepost_id)
4651 prepost_id = wqset->wqset_prepost_id;
4652 /* else { TODO: notify kqueue subsystem? } */
4653 wqset->wqset_prepost_id = 0;
4654
4655 /*
4656 * clear set linkage and prepost object associated with this set:
4657 * waitq sets may prepost to other sets if, for example, they are
4658 * associated with a kqueue which is in a select set.
4659 *
4660 * This releases all the set link objects
4661 * (links to other sets to which this set was previously added)
4662 */
4663 waitq_unlink_all_unlock(&wqset->wqset_q);
4664 /* wqset->wqset_q unlocked */
4665
4666 /* drop / unlink all the prepost table objects */
4667 if (prepost_id)
4668 (void)wq_prepost_iterate(prepost_id, NULL,
4669 wqset_clear_prepost_chain_cb);
4670
4671 return KERN_SUCCESS;
4672}
4673
4674/**
4675 * unlink all waitqs from 'wqset'
4676 *
4677 * Conditions:
4678 * 'wqset' is not locked
4679 * may (rarely) spin/block (see waitq_clear_prepost_locked)
4680 */
4681kern_return_t waitq_set_unlink_all(struct waitq_set *wqset)
4682{
4683 assert(waitqs_is_set(wqset));
4684 assert(!waitq_irq_safe(&wqset->wqset_q));
4685
4686 waitq_set_lock(wqset);
4687 return waitq_set_unlink_all_unlock(wqset);
4688 /* wqset unlocked and set links and preposts deallocated */
4689}
4690
4691static int waitq_prepost_reserve_cb(struct waitq *waitq, void *ctx,
4692 struct waitq_link *link)
4693{
4694 uint32_t *num = (uint32_t *)ctx;
4695 (void)waitq;
4696
4697 /*
4698 * In the worst case, we'll have to allocate 2 prepost objects
4699 * per waitq set (if the set was already preposted by another
4700 * waitq).
4701 */
4702 if (wql_type(link) == WQL_WQS) {
4703 /*
4704 * check to see if the associated waitq actually supports
4705 * preposting
4706 */
4707 if (waitq_set_can_prepost(link->wql_wqs.wql_set))
4708 *num += 2;
4709 }
4710 return WQ_ITERATE_CONTINUE;
4711}
4712
4713static int waitq_alloc_prepost_reservation(int nalloc, struct waitq *waitq,
4714 int *did_unlock, struct wq_prepost **wqp)
4715{
4716 struct wq_prepost *tmp;
4717 struct wqp_cache *cache;
4718
4719 *did_unlock = 0;
4720
4721 /*
4722 * Before we unlock the waitq, check the per-processor prepost object
4723 * cache to see if there's enough there for us. If so, do the
4724 * allocation, keep the lock and save an entire iteration over the set
4725 * linkage!
4726 */
4727 if (waitq) {
4728 disable_preemption();
4729 cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
4730 if (nalloc <= (int)cache->avail)
4731 goto do_alloc;
4732 enable_preemption();
4733
4734 /* unlock the waitq to perform the allocation */
4735 *did_unlock = 1;
4736 waitq_unlock(waitq);
4737 }
4738
4739do_alloc:
4740 tmp = wq_prepost_alloc(LT_RESERVED, nalloc);
4741 if (!tmp)
4742 panic("Couldn't reserve %d preposts for waitq @%p (wqp@%p)",
4743 nalloc, waitq, *wqp);
4744 if (*wqp) {
4745 /* link the two lists */
4746 int __assert_only rc;
4747 rc = wq_prepost_rlink(tmp, *wqp);
4748 assert(rc == nalloc);
4749 }
4750 *wqp = tmp;
4751
4752 /*
4753 * If the caller can block, then enforce a minimum-free table element
4754 * policy here. This helps ensure that we will have enough prepost
4755 * objects for callers such as selwakeup() that can be called with
4756 * spin locks held.
4757 */
4758 if (get_preemption_level() == 0)
4759 wq_prepost_ensure_free_space();
4760
4761 if (waitq) {
4762 if (*did_unlock == 0) {
4763 /* decrement the preemption count if alloc from cache */
4764 enable_preemption();
4765 } else {
4766 /* otherwise: re-lock the waitq */
4767 waitq_lock(waitq);
4768 }
4769 }
4770
4771 return nalloc;
4772}
4773
4774static int waitq_count_prepost_reservation(struct waitq *waitq, int extra, int keep_locked)
4775{
4776 int npreposts = 0;
4777
4778 /*
4779 * If the waitq is not currently part of a set, and we're not asked to
	 * keep the waitq locked, then we'll want to have 3 in reserve
	 * just in case it becomes part of a set while we unlock and reserve.
4782 * We may need up to 1 object for the waitq, and 2 for the set.
4783 */
4784 if (waitq->waitq_set_id == 0) {
4785 npreposts = 3;
4786 } else {
4787 /* this queue has never been preposted before */
4788 if (waitq->waitq_prepost_id == 0)
4789 npreposts = 3;
4790
4791 /*
4792 * Walk the set of table linkages associated with this waitq
4793 * and count the worst-case number of prepost objects that
4794 * may be needed during a wakeup_all. We can walk this without
4795 * locking each set along the way because the table-based IDs
4796 * disconnect us from the set pointers themselves, and the
4797 * table walking is careful to read the setid values only once.
4798 * Locking each set up the chain also doesn't guarantee that
4799 * their membership won't change between the time we unlock
4800 * that set and when we actually go to prepost, so our
4801 * situation is no worse than before and we've alleviated lock
4802 * contention on any sets to which this waitq belongs.
4803 */
4804 (void)walk_waitq_links(LINK_WALK_FULL_DAG_UNLOCKED,
4805 waitq, waitq->waitq_set_id,
4806 WQL_WQS, (void *)&npreposts,
4807 waitq_prepost_reserve_cb);
4808 }
4809
4810 if (extra > 0)
4811 npreposts += extra;
4812
4813 if (npreposts == 0 && !keep_locked) {
4814 /*
		 * If we get here, we were asked to reserve some prepost
		 * objects for a waitq that has previously been preposted and
		 * is not currently a member of any (valid, prepost-enabled)
		 * sets. We have also been instructed to unlock the waitq when
		 * we're done. In this
4819 * case, we pre-allocated enough reserved objects to handle
4820 * the case where the waitq gets added to a single set when
4821 * the lock is released.
4822 */
4823 npreposts = 3;
4824 }
4825
4826 return npreposts;
4827}
4828
4829
4830/**
4831 * pre-allocate prepost objects for 'waitq'
4832 *
4833 * Conditions:
4834 * 'waitq' is not locked
4835 *
4836 * Returns:
4837 * panic on error
4838 *
4839 * 0 on success, '*reserved' is set to the head of a singly-linked
4840 * list of pre-allocated prepost objects.
4841 *
4842 * Notes:
4843 * If 'lock_state' is WAITQ_KEEP_LOCKED, this function performs the pre-allocation
4844 * atomically and returns 'waitq' locked.
4845 *
4846 * This function attempts to pre-allocate precisely enough prepost
4847 * objects based on the current set membership of 'waitq'. If the
4848 * operation is performed atomically, then the caller
 *	is guaranteed to have enough pre-allocated prepost objects to avoid
4850 * any (rare) blocking in the wakeup path.
4851 */
4852uint64_t waitq_prepost_reserve(struct waitq *waitq, int extra,
4853 waitq_lock_state_t lock_state)
4854{
4855 uint64_t reserved = 0;
4856 uint64_t prev_setid = 0, prev_prepostid = 0;
4857 struct wq_prepost *wqp = NULL;
4858 int nalloc = 0, npreposts = 0;
4859 int keep_locked = (lock_state == WAITQ_KEEP_LOCKED);
4860 int unlocked = 0;
4861
4862 wqdbg_v("Attempting to reserve prepost linkages for waitq %p (extra:%d)",
4863 (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq), extra);
4864
4865 if (waitq == NULL && extra > 0) {
4866 /*
4867 * Simple prepost object allocation:
4868 * we'll add 2 more because the waitq might need an object,
4869 * and the set itself may need a new POST object in addition
4870 * to the number of preposts requested by the caller
4871 */
4872 nalloc = waitq_alloc_prepost_reservation(extra + 2, NULL,
4873 &unlocked, &wqp);
4874 assert(nalloc == extra + 2);
4875 return wqp->wqp_prepostid.id;
4876 }
4877
4878 assert(lock_state == WAITQ_KEEP_LOCKED || lock_state == WAITQ_UNLOCK);
4879
4880 assert(!waitq_irq_safe(waitq));
4881
4882 waitq_lock(waitq);
4883
4884 /* remember the set ID that we started with */
4885 prev_setid = waitq->waitq_set_id;
4886 prev_prepostid = waitq->waitq_prepost_id;
4887
4888 /*
4889 * If the waitq is not part of a set, and we're asked to
4890 * keep the set locked, then we don't have to reserve
4891 * anything!
4892 */
4893 if (prev_setid == 0 && keep_locked)
4894 goto out;
4895
4896 npreposts = waitq_count_prepost_reservation(waitq, extra, keep_locked);
4897
4898 /* nothing for us to do! */
4899 if (npreposts == 0) {
4900 if (keep_locked)
4901 goto out;
4902 goto out_unlock;
4903 }
4904
4905try_alloc:
4906 /* this _may_ unlock and relock the waitq! */
4907 nalloc = waitq_alloc_prepost_reservation(npreposts, waitq,
4908 &unlocked, &wqp);
4909
4910 if (!unlocked) {
		/* allocation held the waitq lock: we're done! */
4912 if (keep_locked)
4913 goto out;
4914 goto out_unlock;
4915 }
4916
4917 /*
4918 * Before we return, if the allocation had to unlock the waitq, we
4919 * must check one more time to see if we have enough. If not, we'll
4920 * try to allocate the difference. If the caller requests it, we'll
4921 * also leave the waitq locked so that the use of the pre-allocated
4922 * prepost objects can be guaranteed to be enough if a wakeup_all is
4923 * performed before unlocking the waitq.
4924 */
4925
4926 /*
4927 * If the waitq is no longer associated with a set, or if the waitq's
4928 * set/prepostid has not changed since we first walked its linkage,
4929 * we're done.
4930 */
4931 if ((waitq->waitq_set_id == 0) ||
4932 (waitq->waitq_set_id == prev_setid &&
4933 waitq->waitq_prepost_id == prev_prepostid)) {
4934 if (keep_locked)
4935 goto out;
4936 goto out_unlock;
4937 }
4938
4939 npreposts = waitq_count_prepost_reservation(waitq, extra, keep_locked);
4940
4941 if (npreposts > nalloc) {
4942 prev_setid = waitq->waitq_set_id;
4943 prev_prepostid = waitq->waitq_prepost_id;
4944 npreposts = npreposts - nalloc; /* only allocate the diff */
4945 goto try_alloc;
4946 }
4947
4948 if (keep_locked)
4949 goto out;
4950
4951out_unlock:
4952 waitq_unlock(waitq);
4953out:
4954 if (wqp)
4955 reserved = wqp->wqp_prepostid.id;
4956
4957 return reserved;
4958}
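
/*
 * Example (illustrative sketch, not part of the build): reserving prepost
 * objects so that a wakeup on a non-IRQ-safe waitq never blocks while the
 * lock is held. This mirrors the pattern used by waitq_wakeup64_all()
 * later in this file; 'my_waitq' and MY_EVENT are hypothetical.
 *
 *	uint64_t resv = waitq_prepost_reserve(&my_waitq, 0, WAITQ_KEEP_LOCKED);
 *	// my_waitq is now locked with enough prepost objects reserved
 *	(void)waitq_wakeup64_all_locked(&my_waitq, MY_EVENT, THREAD_AWAKENED,
 *	    &resv, WAITQ_ALL_PRIORITIES, WAITQ_UNLOCK);
 *	waitq_prepost_release_reserve(resv);	// return whatever wasn't used
 */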
4959
4960/**
4961 * release a linked list of prepost objects allocated via _prepost_reserve
4962 *
4963 * Conditions:
4964 * may (rarely) spin waiting for prepost table growth memcpy
4965 */
4966void waitq_prepost_release_reserve(uint64_t id)
4967{
4968 struct wq_prepost *wqp;
4969
4970 wqdbg_v("releasing reserved preposts starting at: 0x%llx", id);
4971
4972 wqp = wq_prepost_rfirst(id);
4973 if (!wqp)
4974 return;
4975
4976 wq_prepost_release_rlist(wqp);
4977}
4978
4979
4980/**
4981 * clear all preposts from 'wqset'
4982 *
4983 * Conditions:
4984 * 'wqset' is not locked
4985 */
4986void waitq_set_clear_preposts(struct waitq_set *wqset)
4987{
4988 uint64_t prepost_id;
4989 spl_t spl;
4990
4991 assert(waitqs_is_set(wqset));
4992
4993 if (!wqset->wqset_q.waitq_prepost || !wqset->wqset_prepost_id)
4994 return;
4995
4996 wqdbg_v("Clearing all preposted queues on waitq_set: 0x%llx",
4997 wqset->wqset_id);
4998
4999 if (waitq_irq_safe(&wqset->wqset_q))
5000 spl = splsched();
5001 waitq_set_lock(wqset);
5002 prepost_id = wqset->wqset_prepost_id;
5003 wqset->wqset_prepost_id = 0;
5004 waitq_set_unlock(wqset);
5005 if (waitq_irq_safe(&wqset->wqset_q))
5006 splx(spl);
5007
5008 /* drop / unlink all the prepost table objects */
5009 if (prepost_id)
5010 (void)wq_prepost_iterate(prepost_id, NULL,
5011 wqset_clear_prepost_chain_cb);
5012}
5013
5014
5015/* ----------------------------------------------------------------------
5016 *
5017 * Iteration: waitq -> sets / waitq_set -> preposts
5018 *
5019 * ---------------------------------------------------------------------- */
5020
5021struct wq_it_ctx {
5022 void *input;
5023 void *ctx;
5024 waitq_iterator_t it;
5025};
5026
5027static int waitq_iterate_sets_cb(struct waitq *waitq, void *ctx,
5028 struct waitq_link *link)
5029{
5030 struct wq_it_ctx *wctx = (struct wq_it_ctx *)(ctx);
5031 struct waitq_set *wqset;
5032 int ret;
5033
5034 (void)waitq;
5035 assert(!waitq_irq_safe(waitq));
5036 assert(wql_type(link) == WQL_WQS);
5037
5038 /*
5039 * the waitq is locked, so we can just take the set lock
5040 * and call the iterator function
5041 */
5042 wqset = link->wql_wqs.wql_set;
5043 assert(wqset != NULL);
5044 assert(!waitq_irq_safe(&wqset->wqset_q));
5045 waitq_set_lock(wqset);
5046
5047 ret = wctx->it(wctx->ctx, (struct waitq *)wctx->input, wqset);
5048
5049 waitq_set_unlock(wqset);
5050 return ret;
5051}
5052
5053/**
5054 * call external iterator function for each prepost object in wqset
5055 *
5056 * Conditions:
5057 * Called from wq_prepost_foreach_locked
5058 * (wqset locked, waitq _not_ locked)
5059 */
5060static int wqset_iterate_prepost_cb(struct waitq_set *wqset, void *ctx,
5061 struct wq_prepost *wqp, struct waitq *waitq)
5062{
5063 struct wq_it_ctx *wctx = (struct wq_it_ctx *)(ctx);
5064 uint64_t wqp_id;
5065 int ret;
5066
5067 (void)wqp;
5068
5069 /*
5070 * This is a bit tricky. The 'wqset' is locked, but the 'waitq' is not.
5071 * Taking the 'waitq' lock is a lock order violation, so we need to be
5072 * careful. We also must realize that we may have taken a reference to
5073 * the 'wqp' just as the associated waitq was being torn down (or
5074 * clearing all its preposts) - see waitq_clear_prepost_locked(). If
5075 * the 'wqp' is valid and we can get the waitq lock, then we are good
5076 * to go. If not, we need to back off, check that the 'wqp' hasn't
5077 * been invalidated, and try to re-take the locks.
5078 */
5079 assert(!waitq_irq_safe(waitq));
5080
5081 if (waitq_lock_try(waitq))
5082 goto call_iterator;
5083
5084 if (!wqp_is_valid(wqp))
5085 return WQ_ITERATE_RESTART;
5086
5087 /* We are passed a prepost object with a reference on it. If neither
5088 * the waitq set nor the waitq require interrupts disabled, then we
5089 * may block on the delay(1) call below. We can't hold a prepost
5090 * object reference while blocking, so we have to give that up as well
5091 * and re-acquire it when we come back.
5092 */
5093 wqp_id = wqp->wqp_prepostid.id;
5094 wq_prepost_put(wqp);
5095 waitq_set_unlock(wqset);
5096 wqdbg_v("dropped set:%p lock waiting for wqp:%p (0x%llx -> wq:%p)",
	    wqset, wqp, wqp_id, waitq);
5098 delay(1);
5099 waitq_set_lock(wqset);
5100 wqp = wq_prepost_get(wqp_id);
5101 if (!wqp)
5102 /* someone cleared preposts while we slept! */
5103 return WQ_ITERATE_DROPPED;
5104
5105 /*
5106 * TODO:
5107 * This differs slightly from the logic in ipc_mqueue.c:
	 * ipc_mqueue_receive_on_thread(). There, if the waitq lock
	 * can't be obtained, the prepost link is moved to the back of
	 * the chain before the iteration restarts from the beginning.
	 * Here, we simply restart from the beginning without reordering.
5112 */
5113 return WQ_ITERATE_RESTART;
5114
5115call_iterator:
5116 if (!wqp_is_valid(wqp)) {
5117 ret = WQ_ITERATE_RESTART;
5118 goto out_unlock;
5119 }
5120
5121 /* call the external callback */
5122 ret = wctx->it(wctx->ctx, waitq, wqset);
5123
5124 if (ret == WQ_ITERATE_BREAK_KEEP_LOCKED) {
5125 ret = WQ_ITERATE_BREAK;
5126 goto out;
5127 }
5128
5129out_unlock:
5130 waitq_unlock(waitq);
5131out:
5132 return ret;
5133}
5134
5135/**
5136 * iterator over all sets to which the given waitq has been linked
5137 *
5138 * Conditions:
5139 * 'waitq' is locked
5140 */
5141int waitq_iterate_sets(struct waitq *waitq, void *ctx, waitq_iterator_t it)
5142{
5143 int ret;
5144 struct wq_it_ctx wctx = {
5145 .input = (void *)waitq,
5146 .ctx = ctx,
5147 .it = it,
5148 };
5149 if (!it || !waitq)
5150 return KERN_INVALID_ARGUMENT;
5151
5152 ret = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
5153 WQL_WQS, (void *)&wctx, waitq_iterate_sets_cb);
5154 if (ret == WQ_ITERATE_CONTINUE)
5155 ret = WQ_ITERATE_SUCCESS;
5156 return ret;
5157}
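
/*
 * Example (illustrative sketch, not part of the build): a callback for
 * waitq_iterate_sets() that counts the sets a waitq belongs to. The
 * callback shape shown matches the invocations in waitq_iterate_sets_cb()
 * above, and the callback runs with both the originating waitq and the
 * current wqset locked. 'count_sets_cb' and 'my_waitq' are hypothetical;
 * the caller must hold the waitq lock.
 *
 *	static int count_sets_cb(void *ctx, struct waitq *waitq,
 *				 struct waitq_set *wqset)
 *	{
 *		(void)waitq;
 *		(void)wqset;
 *		*(int *)ctx += 1;
 *		return WQ_ITERATE_CONTINUE;
 *	}
 *
 *	int nsets = 0;
 *	(void)waitq_iterate_sets(&my_waitq, &nsets, count_sets_cb);
 */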
5158
5159/**
5160 * iterator over all preposts in the given wqset
5161 *
5162 * Conditions:
5163 * 'wqset' is locked
5164 */
5165int waitq_set_iterate_preposts(struct waitq_set *wqset,
5166 void *ctx, waitq_iterator_t it)
5167{
5168 struct wq_it_ctx wctx = {
5169 .input = (void *)wqset,
5170 .ctx = ctx,
5171 .it = it,
5172 };
5173 if (!it || !wqset)
5174 return WQ_ITERATE_INVALID;
5175
5176 assert(waitq_held(&wqset->wqset_q));
5177
5178 return wq_prepost_foreach_locked(wqset, (void *)&wctx,
5179 wqset_iterate_prepost_cb);
5180}
5181
5182
5183/* ----------------------------------------------------------------------
5184 *
5185 * Higher-level APIs
5186 *
5187 * ---------------------------------------------------------------------- */
5188
5189
5190/**
5191 * declare a thread's intent to wait on 'waitq' for 'wait_event'
5192 *
5193 * Conditions:
5194 * 'waitq' is not locked
5195 */
5196wait_result_t waitq_assert_wait64(struct waitq *waitq,
5197 event64_t wait_event,
5198 wait_interrupt_t interruptible,
5199 uint64_t deadline)
5200{
5201 thread_t thread = current_thread();
5202 wait_result_t ret;
5203 spl_t s;
5204
5205 if (!waitq_valid(waitq))
5206 panic("Invalid waitq: %p", waitq);
5207
5208 if (waitq_irq_safe(waitq))
5209 s = splsched();
5210
5211 waitq_lock(waitq);
5212 ret = waitq_assert_wait64_locked(waitq, wait_event, interruptible,
5213 TIMEOUT_URGENCY_SYS_NORMAL,
5214 deadline, TIMEOUT_NO_LEEWAY, thread);
5215 waitq_unlock(waitq);
5216
5217 if (waitq_irq_safe(waitq))
5218 splx(s);
5219
5220 return ret;
5221}
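
/*
 * Example (illustrative sketch, not part of the build): the waiter side of
 * a wait/wakeup pair. The thread declares its intent to wait and then
 * blocks; 'my_waitq' and MY_EVENT are hypothetical, and a real caller
 * would normally re-check its wakeup condition in a loop.
 *
 *	wait_result_t wr;
 *
 *	wr = waitq_assert_wait64(&my_waitq, MY_EVENT, THREAD_UNINT, 0);
 *	if (wr == THREAD_WAITING)
 *		wr = thread_block(THREAD_CONTINUE_NULL);	// deadline 0: no timeout was armed
 */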
5222
5223/**
5224 * declare a thread's intent to wait on 'waitq' for 'wait_event'
5225 *
5226 * Conditions:
5227 * 'waitq' is not locked
5228 * will disable and re-enable interrupts while locking current_thread()
5229 */
5230wait_result_t waitq_assert_wait64_leeway(struct waitq *waitq,
5231 event64_t wait_event,
5232 wait_interrupt_t interruptible,
5233 wait_timeout_urgency_t urgency,
5234 uint64_t deadline,
5235 uint64_t leeway)
5236{
5237 wait_result_t ret;
5238 thread_t thread = current_thread();
5239 spl_t s;
5240
5241 if (!waitq_valid(waitq))
5242 panic("Invalid waitq: %p", waitq);
5243
5244 if (waitq_irq_safe(waitq))
5245 s = splsched();
5246
5247 waitq_lock(waitq);
5248 ret = waitq_assert_wait64_locked(waitq, wait_event, interruptible,
5249 urgency, deadline, leeway, thread);
5250 waitq_unlock(waitq);
5251
5252 if (waitq_irq_safe(waitq))
5253 splx(s);
5254
5255 return ret;
5256}
5257
5258/**
5259 * wakeup a single thread from a waitq that's waiting for a given event
5260 *
5261 * Conditions:
5262 * 'waitq' is not locked
5263 * may (rarely) block if 'waitq' is non-global and a member of 1 or more sets
5264 * may disable and re-enable interrupts
5265 *
5266 * Notes:
5267 * will _not_ block if waitq is global (or not a member of any set)
5268 */
5269kern_return_t waitq_wakeup64_one(struct waitq *waitq, event64_t wake_event,
5270 wait_result_t result, int priority)
5271{
5272 kern_return_t kr;
5273 uint64_t reserved_preposts = 0;
5274 spl_t spl;
5275
5276 if (!waitq_valid(waitq))
5277 panic("Invalid waitq: %p", waitq);
5278
5279 if (!waitq_irq_safe(waitq)) {
5280 /* reserve preposts in addition to locking the waitq */
5281 reserved_preposts = waitq_prepost_reserve(waitq, 0, WAITQ_KEEP_LOCKED);
5282 } else {
5283 spl = splsched();
5284 waitq_lock(waitq);
5285 }
5286
	/* the waitq is locked at this point (by either path above) */
5288 kr = waitq_wakeup64_one_locked(waitq, wake_event, result,
5289 &reserved_preposts, priority, WAITQ_UNLOCK);
5290
5291 if (waitq_irq_safe(waitq))
5292 splx(spl);
5293
5294 /* release any left-over prepost object (won't block/lock anything) */
5295 waitq_prepost_release_reserve(reserved_preposts);
5296
5297 return kr;
5298}
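
/*
 * Example (illustrative sketch, not part of the build): the waker side of
 * the pair sketched after waitq_assert_wait64() above. 'my_waitq' and
 * MY_EVENT are hypothetical; KERN_NOT_WAITING is a normal outcome when no
 * thread was waiting for the event.
 *
 *	kern_return_t kr;
 *
 *	kr = waitq_wakeup64_one(&my_waitq, MY_EVENT, THREAD_AWAKENED,
 *	    WAITQ_ALL_PRIORITIES);
 *	if (kr == KERN_NOT_WAITING) {
 *		// handle the 'nobody was waiting' case
 *	}
 */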
5299
5300/**
5301 * wakeup all threads from a waitq that are waiting for a given event
5302 *
5303 * Conditions:
5304 * 'waitq' is not locked
5305 * may (rarely) block if 'waitq' is non-global and a member of 1 or more sets
5306 * may disable and re-enable interrupts
5307 *
5308 * Notes:
5309 * will _not_ block if waitq is global (or not a member of any set)
5310 */
5311kern_return_t waitq_wakeup64_all(struct waitq *waitq,
5312 event64_t wake_event,
5313 wait_result_t result,
5314 int priority)
5315{
5316 kern_return_t ret;
5317 uint64_t reserved_preposts = 0;
5318 spl_t s;
5319
5320 if (!waitq_valid(waitq))
5321 panic("Invalid waitq: %p", waitq);
5322
5323 if (!waitq_irq_safe(waitq)) {
5324 /* reserve preposts in addition to locking waitq */
5325 reserved_preposts = waitq_prepost_reserve(waitq, 0,
5326 WAITQ_KEEP_LOCKED);
5327 } else {
5328 s = splsched();
5329 waitq_lock(waitq);
5330 }
5331
5332 ret = waitq_wakeup64_all_locked(waitq, wake_event, result,
5333 &reserved_preposts, priority,
5334 WAITQ_UNLOCK);
5335
5336 if (waitq_irq_safe(waitq))
5337 splx(s);
5338
5339 waitq_prepost_release_reserve(reserved_preposts);
5340
5341 return ret;
5342
5343}
5344
5345/**
5346 * wakeup a specific thread iff it's waiting on 'waitq' for 'wake_event'
5347 *
5348 * Conditions:
5349 * 'waitq' is not locked
5350 *
5351 * Notes:
5352 * May temporarily disable and re-enable interrupts
5353 */
5354kern_return_t waitq_wakeup64_thread(struct waitq *waitq,
5355 event64_t wake_event,
5356 thread_t thread,
5357 wait_result_t result)
5358{
5359 kern_return_t ret;
5360 spl_t s, th_spl;
5361
5362 if (!waitq_valid(waitq))
5363 panic("Invalid waitq: %p", waitq);
5364
5365 if (waitq_irq_safe(waitq))
5366 s = splsched();
5367 waitq_lock(waitq);
5368
5369 ret = waitq_select_thread_locked(waitq, wake_event, thread, &th_spl);
5370 /* on success, returns 'thread' locked */
5371
5372 waitq_unlock(waitq);
5373
5374 if (ret == KERN_SUCCESS) {
5375 ret = thread_go(thread, result);
5376 assert(ret == KERN_SUCCESS);
5377 thread_unlock(thread);
5378 splx(th_spl);
5379 waitq_stats_count_wakeup(waitq);
5380 } else {
5381 ret = KERN_NOT_WAITING;
5382 waitq_stats_count_fail(waitq);
5383 }
5384
5385 if (waitq_irq_safe(waitq))
5386 splx(s);
5387
5388 return ret;
5389}
5390
5391/**
5392 * wakeup a single thread from a waitq that's waiting for a given event
5393 * and return a reference to that thread
5394 * returns THREAD_NULL if no thread was waiting
5395 *
5396 * Conditions:
5397 * 'waitq' is not locked
5398 * may (rarely) block if 'waitq' is non-global and a member of 1 or more sets
5399 * may disable and re-enable interrupts
5400 *
5401 * Notes:
5402 * will _not_ block if waitq is global (or not a member of any set)
5403 */
5404thread_t
5405waitq_wakeup64_identify(struct waitq *waitq,
5406 event64_t wake_event,
5407 wait_result_t result,
5408 int priority)
5409{
5410 uint64_t reserved_preposts = 0;
5411 spl_t thread_spl = 0;
5412 thread_t thread;
5413 spl_t spl;
5414
5415 if (!waitq_valid(waitq))
5416 panic("Invalid waitq: %p", waitq);
5417
5418 if (!waitq_irq_safe(waitq)) {
5419 /* reserve preposts in addition to locking waitq */
5420 reserved_preposts = waitq_prepost_reserve(waitq, 0, WAITQ_KEEP_LOCKED);
5421 } else {
5422 spl = splsched();
5423 waitq_lock(waitq);
5424 }
5425
5426 thread = waitq_wakeup64_identify_locked(waitq, wake_event, result,
5427 &thread_spl, &reserved_preposts,
5428 priority, WAITQ_UNLOCK);
5429 /* waitq is unlocked, thread is locked */
5430
5431 if (thread != THREAD_NULL) {
5432 thread_reference(thread);
5433 thread_unlock(thread);
5434 splx(thread_spl);
5435 }
5436
5437 if (waitq_irq_safe(waitq))
5438 splx(spl);
5439
5440 /* release any left-over prepost object (won't block/lock anything) */
5441 waitq_prepost_release_reserve(reserved_preposts);
5442
5443 /* returns +1 ref to running thread or THREAD_NULL */
5444 return thread;
5445}
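
/*
 * Example (illustrative sketch, not part of the build): waking one waiter
 * and identifying it. The returned thread, if any, carries a +1 reference
 * that the caller must drop with thread_deallocate(); 'my_waitq' and
 * MY_EVENT are hypothetical.
 *
 *	thread_t woken;
 *
 *	woken = waitq_wakeup64_identify(&my_waitq, MY_EVENT, THREAD_AWAKENED,
 *	    WAITQ_ALL_PRIORITIES);
 *	if (woken != THREAD_NULL) {
 *		... inspect or act on the now-runnable thread ...
 *		thread_deallocate(woken);	// drop the +1 reference
 *	}
 */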
5446
5447