1 | /* |
2 | * Copyright (c) 2015-2021 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | /* |
29 | * @OSF_FREE_COPYRIGHT@ |
30 | */ |
31 | /* |
32 | * Mach Operating System |
33 | * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University |
34 | * All Rights Reserved. |
35 | * |
36 | * Permission to use, copy, modify and distribute this software and its |
37 | * documentation is hereby granted, provided that both the copyright |
38 | * notice and this permission notice appear in all copies of the |
39 | * software, derivative works or modified versions, and any portions |
40 | * thereof, and that both notices appear in supporting documentation. |
41 | * |
42 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" |
43 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR |
44 | * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. |
45 | * |
46 | * Carnegie Mellon requests users of this software to return to |
47 | * |
48 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU |
49 | * School of Computer Science |
50 | * Carnegie Mellon University |
51 | * Pittsburgh PA 15213-3890 |
52 | * |
53 | * any improvements or extensions that they make and grant Carnegie Mellon |
54 | * the rights to redistribute these changes. |
55 | */ |
56 | |
57 | #include <kern/ast.h> |
58 | #include <kern/backtrace.h> |
59 | #include <kern/kern_types.h> |
60 | #include <kern/mach_param.h> |
61 | #include <kern/percpu.h> |
62 | #include <kern/queue.h> |
63 | #include <kern/sched_prim.h> |
64 | #include <kern/simple_lock.h> |
65 | #include <kern/spl.h> |
66 | #include <kern/waitq.h> |
67 | #include <kern/zalloc.h> |
68 | #include <kern/policy_internal.h> |
69 | #include <kern/turnstile.h> |
70 | |
71 | #include <os/hash.h> |
72 | #include <libkern/section_keywords.h> |
73 | #include <mach/sync_policy.h> |
74 | #include <vm/vm_kern.h> |
75 | |
76 | #include <sys/kdebug.h> |
77 | |
78 | /*! |
79 | * @const waitq_set_unlink_batch |
80 | * |
81 | * @brief |
82 | * How many links are unhooked under a single set lock hold. |
83 | * |
84 | * @discussion |
85 | * Holding a waitq set lock for too long can cause |
86 | * extreme contention (when a set is being torn down concurrently |
87 | * to messages being sent to ports who used to belong to that set). |
88 | * |
89 | * In order to fight this, large wait queue sets will drop |
90 | * and reacquire their lock for each unlinking batch. |
91 | */ |
92 | static TUNABLE(uint32_t, waitq_set_unlink_batch, "waitq_set_unlink_batch" , 64); |
93 | |
94 | /*! |
95 | * @const WQL_PREPOST_MARKER |
96 | * |
97 | * @brief |
98 | * Marker set in the @c wql_wqs field of wait queue linkages to denote that |
99 | * this linkage has preposted to its wait queue set already. |
100 | * |
101 | * @discussion |
102 | * This bit is manipulated under both the wait queue and the wait queue set |
103 | * locks, and is used for two purposes: |
104 | * |
105 | * - for port set queues, it denotes in which circle queue the linkage |
106 | * is queued on (@c waitq_set::wqset_links or @c waitq_set::wqset_preposts) |
107 | * |
108 | * - as an optimization during pre-post to not walk sets this link already |
109 | * preposted to. |
110 | */ |
111 | #define WQL_PREPOST_MARKER 1ul |
112 | |
113 | #if __LP64__ |
114 | /*! |
115 | * @struct waitq_link_hdr |
116 | * |
117 | * @brief |
118 | * Common "header" between all linkages, in order to find the waitq_set |
119 | * of this linkage. |
120 | * |
121 | * @discussion |
122 | * Due to unfortunate alignment constraints on @c queue_chain_t, |
123 | * this is wildly different for LP64 and ILP32. |
124 | * |
125 | * Do note that `wql |
126 | */ |
127 | struct waitq_link_hdr { |
128 | uintptr_t wql_wqs; |
129 | }; |
130 | |
131 | /*! |
132 | * @struct waitq_sellink |
133 | * |
134 | * @brief |
135 | * Linkages used for select waitq queues to select wait queue sets. |
136 | * |
137 | * @discussion |
138 | * Select linkages are one way (queue to set) for two reasons: |
139 | * |
140 | * 1. select doesn't use the wait queue subsystem to discover which file |
141 | * descriptor woke up the set (it will instead scan all fds again), |
142 | * |
143 | * 2. all linkages are unhooked on each syscall return, so we minimize |
144 | * work to be done to be as quick as possible, using a fast invalidation |
145 | * scheme based on unique identifiers and sequestering |
146 | * (see @c select_set_nextid()). |
147 | */ |
148 | struct waitq_sellink { |
149 | uintptr_t wql_wqs; |
150 | struct waitq_link_list_entry wql_next; |
151 | uint64_t wql_setid; |
152 | }; |
153 | |
154 | /*! |
155 | * @struct waitq_link |
156 | * |
157 | * @brief |
158 | * Linkages used for port wait queues and port-set wait queue sets. |
159 | * |
160 | * @discussion |
161 | * Those linkages go both ways so that receiving messages through a port-set |
162 | * can quickly find ports that preposted to the set. |
163 | * |
164 | * It also means that unhooking linkages cannot be lazy. |
165 | */ |
166 | struct waitq_link { |
167 | uintptr_t wql_wqs; /**< wait queue set for this link */ |
168 | queue_chain_t wql_qlink; /**< linkage through the waitq list */ |
169 | queue_chain_t wql_slink; /**< linkage through the wqset list */ |
170 | struct waitq *wql_wq; /**< wait queue for this link */ |
171 | }; |
172 | #else |
173 | struct waitq_link_hdr { |
174 | uint64_t __wql_padding; |
175 | uintptr_t wql_wqs; |
176 | }; |
177 | |
178 | struct waitq_sellink { |
179 | struct waitq_link_list_entry wql_next; |
180 | uintptr_t __wql_padding; |
181 | uintptr_t wql_wqs; |
182 | uint64_t wql_setid; |
183 | }; |
184 | |
185 | struct waitq_link { |
186 | queue_chain_t wql_qlink; |
187 | uintptr_t wql_wqs; |
188 | struct waitq *wql_wq; |
189 | queue_chain_t wql_slink; |
190 | }; |
191 | #endif |
192 | |
193 | static_assert(offsetof(struct waitq_link_hdr, wql_wqs) == |
194 | offsetof(struct waitq_sellink, wql_wqs)); |
195 | static_assert(offsetof(struct waitq_link_hdr, wql_wqs) == |
196 | offsetof(struct waitq_link, wql_wqs)); |
197 | static_assert(sizeof(struct waitq) <= WQ_OPAQUE_SIZE, "waitq structure size mismatch" ); |
198 | static_assert(__alignof(struct waitq) == WQ_OPAQUE_ALIGN, "waitq structure alignment mismatch" ); |
199 | |
200 | static KALLOC_TYPE_DEFINE(waitq_sellink_zone, struct waitq_sellink, KT_PRIV_ACCT); |
201 | static KALLOC_TYPE_DEFINE(waitq_link_zone, struct waitq_link, KT_PRIV_ACCT); |
202 | ZONE_DEFINE_ID(ZONE_ID_SELECT_SET, "select_set" , struct select_set, |
203 | ZC_SEQUESTER | ZC_NOPGZ | ZC_ZFREE_CLEARMEM); |
204 | |
205 | static LCK_GRP_DECLARE(waitq_lck_grp, "waitq" ); |
206 | |
207 | static uint64_t PERCPU_DATA(select_setid); |
208 | struct waitq select_conflict_queue; |
209 | |
210 | #pragma mark waitq links |
211 | |
212 | static inline bool |
213 | waitq_is_sellink(waitq_type_t type) |
214 | { |
215 | return type == WQT_SELECT || type == WQT_SELECT_SET; |
216 | } |
217 | |
218 | static inline bool |
219 | wql_sellink_valid(struct select_set *selset, struct waitq_sellink *link) |
220 | { |
221 | return waitq_valid(waitq: selset) && selset->selset_id == link->wql_setid; |
222 | } |
223 | |
224 | static waitq_t |
225 | wql_wqs(waitq_link_t link) |
226 | { |
227 | return (waitq_t){ .wq_q: (void *)(link.wqlh->wql_wqs & ~WQL_PREPOST_MARKER) }; |
228 | } |
229 | |
230 | static bool |
231 | wql_wqs_preposted(waitq_link_t link) |
232 | { |
233 | return link.wqlh->wql_wqs & WQL_PREPOST_MARKER; |
234 | } |
235 | |
236 | static void |
237 | wql_wqs_mark_preposted(waitq_link_t link) |
238 | { |
239 | assert(!wql_wqs_preposted(link)); |
240 | link.wqlh->wql_wqs |= WQL_PREPOST_MARKER; |
241 | } |
242 | |
243 | static void |
244 | wql_wqs_clear_preposted(waitq_link_t link) |
245 | { |
246 | assert(wql_wqs_preposted(link)); |
247 | link.wqlh->wql_wqs &= ~WQL_PREPOST_MARKER; |
248 | } |
249 | |
250 | static circle_queue_t |
251 | wql_wqs_queue(struct waitq_set *wqs, struct waitq_link *link) |
252 | { |
253 | return wql_wqs_preposted(link: link) ? &wqs->wqset_preposts : &wqs->wqset_links; |
254 | } |
255 | |
256 | static void |
257 | wql_list_push(waitq_link_list_t *list, waitq_link_t link) |
258 | { |
259 | link.wqls->wql_next.next = list->next; |
260 | list->next = &link.wqls->wql_next; |
261 | } |
262 | |
263 | static inline struct waitq_sellink * |
264 | wql_list_elem(struct waitq_link_list_entry *e) |
265 | { |
266 | return e ? __container_of(e, struct waitq_sellink, wql_next) : NULL; |
267 | } |
268 | |
269 | /*! |
270 | * @function wql_list_next() |
271 | * |
272 | * @brief |
273 | * Helper function to implement wait queue link list enumeration. |
274 | * |
275 | * @param e in: pointer to the current element, |
276 | * out: pointer to the next element or NULL |
277 | * @param end which element to stop enumeration at (NULL for lists, |
278 | * or the first element enumerated for circle queues). |
279 | * @returns true (makes writing for(;;) based enumerators easier). |
280 | */ |
281 | static inline bool |
282 | wql_list_next(struct waitq_link_list_entry **e, struct waitq_link_list_entry *end) |
283 | { |
284 | if (*e == NULL || (*e)->next == end) { |
285 | *e = NULL; |
286 | } else { |
287 | *e = (*e)->next; |
288 | } |
289 | return true; |
290 | } |
291 | |
292 | #define __wql_list_foreach(it, head, end) \ |
293 | for (struct waitq_link_list_entry *__it = (head)->next, *__end = end; \ |
294 | ((it) = wql_list_elem(__it)); wql_list_next(&__it, __end)) |
295 | |
296 | #define wql_list_foreach(it, head) \ |
297 | __wql_list_foreach(it, head, NULL) |
298 | |
299 | #define wql_list_foreach_safe(it, head) \ |
300 | for (struct waitq_link_list_entry *__it = (head)->next; \ |
301 | ((it) = wql_list_elem(__it)) && wql_list_next(&__it, NULL); ) |
302 | |
303 | /* |
304 | * Gross hack: passing `__it` to `__wql_list_foreach` makes it stop whether |
305 | * we circle back to the first element or NULL (whichever comes first). |
306 | * |
307 | * This allows to have a single enumeration function oblivious to whether |
308 | * we enumerate a circle queue or a sellink list. |
309 | */ |
310 | #define waitq_link_foreach(link, waitq) \ |
311 | __wql_list_foreach((link).wqls, &(waitq).wq_q->waitq_sellinks, __it) |
312 | |
313 | static_assert(offsetof(struct waitq, waitq_sellinks) == |
314 | offsetof(struct waitq, waitq_links)); |
315 | static_assert(offsetof(struct waitq_sellink, wql_next) == |
316 | offsetof(struct waitq_link, wql_qlink.next)); |
317 | |
318 | static struct waitq_link * |
319 | wql_find(struct waitq *waitq, waitq_t wqset) |
320 | { |
321 | struct waitq_link *link; |
322 | |
323 | cqe_foreach_element(link, &waitq->waitq_links, wql_qlink) { |
324 | if (waitq_same(wq1: wql_wqs(link: link), wq2: wqset)) { |
325 | return link; |
326 | } |
327 | } |
328 | |
329 | return NULL; |
330 | } |
331 | |
332 | waitq_link_t |
333 | waitq_link_alloc(waitq_type_t type) |
334 | { |
335 | waitq_link_t link; |
336 | |
337 | if (waitq_is_sellink(type)) { |
338 | link.wqls = zalloc_flags(waitq_sellink_zone, Z_WAITOK | Z_ZERO); |
339 | } else { |
340 | link.wqll = zalloc_flags(waitq_link_zone, Z_WAITOK | Z_ZERO); |
341 | } |
342 | return link; |
343 | } |
344 | |
345 | void |
346 | waitq_link_free(waitq_type_t type, waitq_link_t link) |
347 | { |
348 | if (waitq_is_sellink(type)) { |
349 | return zfree(waitq_sellink_zone, link.wqls); |
350 | } else { |
351 | return zfree(waitq_link_zone, link.wqll); |
352 | } |
353 | } |
354 | |
355 | void |
356 | waitq_link_free_list(waitq_type_t type, waitq_link_list_t *free_l) |
357 | { |
358 | waitq_link_t link; |
359 | |
360 | wql_list_foreach_safe(link.wqls, free_l) { |
361 | waitq_link_free(type, link); |
362 | } |
363 | |
364 | free_l->next = NULL; |
365 | } |
366 | |
367 | |
368 | #pragma mark global wait queues |
369 | |
370 | static __startup_data struct waitq g_boot_waitq; |
371 | static SECURITY_READ_ONLY_LATE(struct waitq *) global_waitqs = &g_boot_waitq; |
372 | static SECURITY_READ_ONLY_LATE(uint32_t) g_num_waitqs = 1; |
373 | |
374 | /* |
375 | * Zero out the used MSBs of the event. |
376 | */ |
377 | #define _CAST_TO_EVENT_MASK(event) \ |
378 | ((waitq_flags_t)(uintptr_t)(event) & ((1ul << _EVENT_MASK_BITS) - 1ul)) |
379 | |
380 | static inline uint32_t |
381 | waitq_hash(char *key, size_t length) |
382 | { |
383 | return os_hash_jenkins(data: key, length) & (g_num_waitqs - 1); |
384 | } |
385 | |
386 | /* return a global waitq pointer corresponding to the given event */ |
387 | struct waitq * |
388 | _global_eventq(char *event, size_t event_length) |
389 | { |
390 | return &global_waitqs[waitq_hash(key: event, length: event_length)]; |
391 | } |
392 | |
393 | bool |
394 | waitq_is_valid(waitq_t waitq) |
395 | { |
396 | return waitq_valid(waitq); |
397 | } |
398 | |
399 | static inline bool |
400 | waitq_is_global(waitq_t waitq) |
401 | { |
402 | if (waitq_type(wq: waitq) != WQT_QUEUE) { |
403 | return false; |
404 | } |
405 | return waitq.wq_q >= global_waitqs && waitq.wq_q < global_waitqs + g_num_waitqs; |
406 | } |
407 | |
408 | static inline bool |
409 | waitq_empty(waitq_t wq) |
410 | { |
411 | struct turnstile *ts; |
412 | |
413 | switch (waitq_type(wq)) { |
414 | case WQT_TURNSTILE: |
415 | return priority_queue_empty(&wq.wq_q->waitq_prio_queue); |
416 | case WQT_PORT: |
417 | ts = wq.wq_q->waitq_ts; |
418 | return ts == TURNSTILE_NULL || |
419 | priority_queue_empty(&ts->ts_waitq.waitq_prio_queue); |
420 | case WQT_QUEUE: |
421 | case WQT_SELECT: |
422 | case WQT_PORT_SET: |
423 | case WQT_SELECT_SET: |
424 | return circle_queue_empty(cq: &wq.wq_q->waitq_queue); |
425 | |
426 | default: |
427 | return true; |
428 | } |
429 | } |
430 | |
431 | #if CONFIG_WAITQ_STATS |
432 | #define NWAITQ_BTFRAMES 5 |
433 | |
434 | struct wq_stats { |
435 | uint64_t waits; |
436 | uint64_t wakeups; |
437 | uint64_t clears; |
438 | uint64_t failed_wakeups; |
439 | |
440 | uintptr_t last_wait[NWAITQ_BTFRAMES]; |
441 | uintptr_t last_wakeup[NWAITQ_BTFRAMES]; |
442 | uintptr_t last_failed_wakeup[NWAITQ_BTFRAMES]; |
443 | }; |
444 | |
445 | /* this global is for lldb */ |
446 | const uint32_t g_nwaitq_btframes = NWAITQ_BTFRAMES; |
447 | struct wq_stats g_boot_stats; |
448 | struct wq_stats *g_waitq_stats = &g_boot_stats; |
449 | |
450 | static __inline__ void |
451 | waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], unsigned skip) |
452 | { |
453 | uintptr_t buf[NWAITQ_BTFRAMES + skip]; |
454 | |
455 | memset(buf, 0, (NWAITQ_BTFRAMES + skip) * sizeof(uintptr_t)); |
456 | backtrace(buf, g_nwaitq_btframes + skip, NULL, NULL); |
457 | memcpy(&bt[0], &buf[skip], NWAITQ_BTFRAMES * sizeof(uintptr_t)); |
458 | } |
459 | |
460 | static __inline__ struct wq_stats * |
461 | waitq_global_stats(waitq_t waitq) |
462 | { |
463 | struct wq_stats *wqs; |
464 | uint32_t idx; |
465 | |
466 | if (!waitq_is_global(waitq)) { |
467 | return NULL; |
468 | } |
469 | |
470 | idx = (uint32_t)(waitq.wq_q - global_waitqs); |
471 | assert(idx < g_num_waitqs); |
472 | wqs = &g_waitq_stats[idx]; |
473 | return wqs; |
474 | } |
475 | |
476 | static __inline__ void |
477 | waitq_stats_count_wait(waitq_t waitq) |
478 | { |
479 | struct wq_stats *wqs = waitq_global_stats(waitq); |
480 | if (wqs != NULL) { |
481 | wqs->waits++; |
482 | waitq_grab_backtrace(wqs->last_wait, 2); |
483 | } |
484 | } |
485 | |
486 | static __inline__ void |
487 | waitq_stats_count_wakeup(waitq_t waitq, int n) |
488 | { |
489 | struct wq_stats *wqs = waitq_global_stats(waitq); |
490 | if (wqs != NULL) { |
491 | if (n > 0) { |
492 | wqs->wakeups += n; |
493 | waitq_grab_backtrace(wqs->last_wakeup, 2); |
494 | } else { |
495 | wqs->failed_wakeups++; |
496 | waitq_grab_backtrace(wqs->last_failed_wakeup, 2); |
497 | } |
498 | } |
499 | } |
500 | |
501 | static __inline__ void |
502 | waitq_stats_count_clear_wakeup(waitq_t waitq) |
503 | { |
504 | struct wq_stats *wqs = waitq_global_stats(waitq); |
505 | if (wqs != NULL) { |
506 | wqs->wakeups++; |
507 | wqs->clears++; |
508 | waitq_grab_backtrace(wqs->last_wakeup, 2); |
509 | } |
510 | } |
511 | #else /* !CONFIG_WAITQ_STATS */ |
512 | #define waitq_stats_count_wait(q) do { } while (0) |
513 | #define waitq_stats_count_wakeup(q, n) do { } while (0) |
514 | #define waitq_stats_count_clear_wakeup(q) do { } while (0) |
515 | #endif |
516 | |
517 | static struct waitq * |
518 | waitq_get_safeq(waitq_t waitq) |
519 | { |
520 | if (waitq_type(wq: waitq) == WQT_PORT) { |
521 | struct turnstile *ts = waitq.wq_q->waitq_ts; |
522 | return ts ? &ts->ts_waitq : NULL; |
523 | } |
524 | |
525 | uint32_t hash = os_hash_kernel_pointer(pointer: waitq.wq_q); |
526 | return &global_waitqs[hash & (g_num_waitqs - 1)]; |
527 | } |
528 | |
529 | /* |
530 | * Since the priority ordered waitq uses basepri as the |
531 | * ordering key assert that this value fits in a uint8_t. |
532 | */ |
533 | static_assert(MAXPRI <= UINT8_MAX); |
534 | |
535 | static inline void |
536 | waitq_thread_insert(struct waitq *safeq, thread_t thread, |
537 | waitq_t wq, event64_t event) |
538 | { |
539 | if (waitq_type(wq: safeq) == WQT_TURNSTILE) { |
540 | turnstile_stats_update(hop: 0, flags: TSU_TURNSTILE_BLOCK_COUNT, NULL); |
541 | turnstile_waitq_add_thread_priority_queue(wq: safeq, thread); |
542 | } else { |
543 | turnstile_stats_update(hop: 0, flags: TSU_REGULAR_WAITQ_BLOCK_COUNT, NULL); |
544 | /* |
545 | * This is the extent to which we currently take scheduling |
546 | * attributes into account: |
547 | * |
548 | * - If the thread is vm privileged, we stick it at the front |
549 | * of the queue, later, these queues will honor the policy |
550 | * value set at waitq_init time. |
551 | * |
552 | * - Realtime threads get priority for wait queue placements. |
553 | * This allows wait_queue_wakeup_one to prefer a waiting |
554 | * realtime thread, similar in principle to performing |
555 | * a wait_queue_wakeup_all and allowing scheduler |
556 | * prioritization to run the realtime thread, but without |
557 | * causing the lock contention of that scenario. |
558 | */ |
559 | if (thread->sched_pri >= BASEPRI_REALTIME || |
560 | !safeq->waitq_fifo || |
561 | (thread->options & TH_OPT_VMPRIV)) { |
562 | circle_enqueue_head(cq: &safeq->waitq_queue, elt: &thread->wait_links); |
563 | } else { |
564 | circle_enqueue_tail(cq: &safeq->waitq_queue, elt: &thread->wait_links); |
565 | } |
566 | } |
567 | |
568 | /* mark the event and real waitq, even if enqueued on a global safeq */ |
569 | thread->wait_event = event; |
570 | thread->waitq = wq; |
571 | } |
572 | |
573 | /** |
574 | * clear the thread-related waitq state, moving the thread from |
575 | * TH_WAIT to TH_WAIT | TH_WAKING, where it is no longer on a waitq and |
576 | * can expect to be go'ed in the near future. |
577 | * |
578 | * Clearing the waitq prevents further propagation of a turnstile boost |
579 | * on the thread and stops a clear_wait from succeeding. |
580 | * |
581 | * Conditions: |
582 | * 'thread' is locked, thread is waiting |
583 | */ |
584 | static inline void |
585 | thread_clear_waitq_state(thread_t thread) |
586 | { |
587 | assert(thread->state & TH_WAIT); |
588 | |
589 | thread->waitq.wq_q = NULL; |
590 | thread->wait_event = NO_EVENT64; |
591 | thread->at_safe_point = FALSE; |
592 | thread->block_hint = kThreadWaitNone; |
593 | thread->state |= TH_WAKING; |
594 | } |
595 | |
596 | static inline void |
597 | waitq_thread_remove(waitq_t wq, thread_t thread) |
598 | { |
599 | if (waitq_type(wq) == WQT_TURNSTILE) { |
600 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
601 | (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, |
602 | (THREAD_REMOVED_FROM_TURNSTILE_WAITQ))) | DBG_FUNC_NONE, |
603 | VM_KERNEL_UNSLIDE_OR_PERM(waitq_to_turnstile(wq.wq_q)), |
604 | thread_tid(thread), 0, 0, 0); |
605 | priority_queue_remove(que: &wq.wq_q->waitq_prio_queue, |
606 | elt: &thread->wait_prioq_links); |
607 | } else { |
608 | circle_dequeue(cq: &wq.wq_q->waitq_queue, elt: &thread->wait_links); |
609 | if (waitq_is_global(waitq: wq) && waitq_empty(wq)) { |
610 | wq.wq_q->waitq_eventmask = 0; |
611 | } |
612 | } |
613 | |
614 | thread_clear_waitq_state(thread); |
615 | } |
616 | |
617 | bool |
618 | waitq_wait_possible(thread_t thread) |
619 | { |
620 | return waitq_is_null(wq: thread->waitq) && |
621 | ((thread->state & TH_WAKING) == 0); |
622 | } |
623 | |
624 | __startup_func |
625 | static void |
626 | waitq_bootstrap(void) |
627 | { |
628 | const uint32_t qsz = sizeof(struct waitq); |
629 | vm_offset_t whsize; |
630 | int cpu = 0; |
631 | |
632 | /* |
633 | * Determine the amount of memory we're willing to reserve for |
634 | * the waitqueue hash table |
635 | */ |
636 | if (!PE_parse_boot_argn(arg_string: "wqsize" , arg_ptr: &whsize, max_arg: sizeof(whsize))) { |
637 | whsize = round_page(x: thread_max * qsz / 5); |
638 | } |
639 | |
640 | /* |
641 | * Determine the number of waitqueues we can fit. |
642 | * The hash algorithm requires that this be a power of 2. |
643 | */ |
644 | g_num_waitqs = 0x80000000u >> __builtin_clzl(whsize / qsz); |
645 | assert(g_num_waitqs > 0); |
646 | whsize = round_page(x: g_num_waitqs * qsz); |
647 | |
648 | kmem_alloc(map: kernel_map, addrp: (vm_offset_t *)&global_waitqs, size: whsize, |
649 | flags: KMA_NOFAIL | KMA_KOBJECT | KMA_NOPAGEWAIT | KMA_PERMANENT, |
650 | VM_KERN_MEMORY_WAITQ); |
651 | |
652 | #if CONFIG_WAITQ_STATS |
653 | whsize = round_page(g_num_waitqs * sizeof(struct wq_stats)); |
654 | kmem_alloc(kernel_map, (vm_offset_t *)&g_waitq_stats, whsize, |
655 | KMA_NOFAIL | KMA_KOBJECT | KMA_NOPAGEWAIT | KMA_ZERO | KMA_PERMANENT, |
656 | VM_KERN_MEMORY_WAITQ); |
657 | #endif |
658 | |
659 | for (uint32_t i = 0; i < g_num_waitqs; i++) { |
660 | waitq_init(waitq: &global_waitqs[i], type: WQT_QUEUE, SYNC_POLICY_FIFO); |
661 | } |
662 | |
663 | waitq_init(waitq: &select_conflict_queue, type: WQT_SELECT, SYNC_POLICY_FIFO); |
664 | |
665 | percpu_foreach(setid, select_setid) { |
666 | /* is not cpu_number() but CPUs haven't been numbered yet */ |
667 | *setid = cpu++; |
668 | } |
669 | } |
670 | STARTUP(MACH_IPC, STARTUP_RANK_FIRST, waitq_bootstrap); |
671 | |
672 | |
673 | #pragma mark locking |
674 | |
675 | static hw_spin_timeout_status_t |
676 | waitq_timeout_handler(void *_lock, hw_spin_timeout_t to, hw_spin_state_t st) |
677 | { |
678 | lck_spinlock_to_info_t lsti; |
679 | hw_lck_ticket_t tmp; |
680 | struct waitq *wq = _lock; |
681 | |
682 | if (machine_timeout_suspended()) { |
683 | return HW_LOCK_TIMEOUT_CONTINUE; |
684 | } |
685 | |
686 | lsti = lck_spinlock_timeout_hit(lck: &wq->waitq_interlock, owner: 0); |
687 | tmp.tcurnext = os_atomic_load(&wq->waitq_interlock.tcurnext, relaxed); |
688 | |
689 | panic("waitq(%p) lock " HW_SPIN_TIMEOUT_FMT "; cpu=%d, " |
690 | "cticket: 0x%x, nticket: 0x%x, waiting for 0x%x, " |
691 | HW_SPIN_TIMEOUT_DETAILS_FMT, |
692 | wq, HW_SPIN_TIMEOUT_ARG(to, st), cpu_number(), |
693 | tmp.cticket, tmp.nticket, lsti->extra, |
694 | HW_SPIN_TIMEOUT_DETAILS_ARG(to, st)); |
695 | } |
696 | |
697 | static const struct hw_spin_policy waitq_spin_policy = { |
698 | .hwsp_name = "waitq" , |
699 | #if defined(__i386__) || defined(__x86_64__) |
700 | .hwsp_timeout = &LockTimeOutTSC, |
701 | #else |
702 | .hwsp_timeout_atomic = &LockTimeOut, |
703 | #endif |
704 | /* |
705 | * Double the standard lock timeout, because wait queues tend |
706 | * to iterate over a number of threads - locking each. If there is |
707 | * a problem with a thread lock, it normally times out at the wait |
708 | * queue level first, hiding the real problem. |
709 | */ |
710 | .hwsp_timeout_shift = 1, |
711 | .hwsp_lock_offset = offsetof(struct waitq, waitq_interlock), |
712 | .hwsp_op_timeout = waitq_timeout_handler, |
713 | }; |
714 | |
715 | void |
716 | waitq_invalidate(waitq_t waitq) |
717 | { |
718 | hw_lck_ticket_invalidate(tlock: &waitq.wq_q->waitq_interlock); |
719 | } |
720 | |
721 | bool |
722 | waitq_held(waitq_t wq) |
723 | { |
724 | return hw_lck_ticket_held(tlock: &wq.wq_q->waitq_interlock); |
725 | } |
726 | |
727 | void |
728 | waitq_lock(waitq_t wq) |
729 | { |
730 | (void)hw_lck_ticket_lock_to(&wq.wq_q->waitq_interlock, |
731 | &waitq_spin_policy, &waitq_lck_grp); |
732 | #if defined(__x86_64__) |
733 | pltrace(FALSE); |
734 | #endif |
735 | } |
736 | |
737 | bool |
738 | waitq_lock_try(waitq_t wq) |
739 | { |
740 | bool rc = hw_lck_ticket_lock_try(&wq.wq_q->waitq_interlock, &waitq_lck_grp); |
741 | |
742 | #if defined(__x86_64__) |
743 | if (rc) { |
744 | pltrace(FALSE); |
745 | } |
746 | #endif |
747 | return rc; |
748 | } |
749 | |
750 | bool |
751 | waitq_lock_reserve(waitq_t wq, uint32_t *ticket) |
752 | { |
753 | return hw_lck_ticket_reserve(&wq.wq_q->waitq_interlock, ticket, &waitq_lck_grp); |
754 | } |
755 | |
756 | void |
757 | waitq_lock_wait(waitq_t wq, uint32_t ticket) |
758 | { |
759 | (void)hw_lck_ticket_wait(&wq.wq_q->waitq_interlock, ticket, |
760 | &waitq_spin_policy, &waitq_lck_grp); |
761 | #if defined(__x86_64__) |
762 | pltrace(FALSE); |
763 | #endif |
764 | } |
765 | |
766 | bool |
767 | waitq_lock_allow_invalid(waitq_t wq) |
768 | { |
769 | hw_lock_status_t rc; |
770 | |
771 | rc = hw_lck_ticket_lock_allow_invalid(&wq.wq_q->waitq_interlock, |
772 | &waitq_spin_policy, &waitq_lck_grp); |
773 | |
774 | #if defined(__x86_64__) |
775 | if (rc == HW_LOCK_ACQUIRED) { |
776 | pltrace(FALSE); |
777 | } |
778 | #endif |
779 | return rc == HW_LOCK_ACQUIRED; |
780 | } |
781 | |
782 | void |
783 | waitq_unlock(waitq_t wq) |
784 | { |
785 | assert(waitq_held(wq)); |
786 | #if defined(__x86_64__) |
787 | pltrace(TRUE); |
788 | #endif |
789 | hw_lck_ticket_unlock(tlock: &wq.wq_q->waitq_interlock); |
790 | } |
791 | |
792 | |
793 | #pragma mark assert_wait / wakeup |
794 | |
795 | struct waitq_select_args { |
796 | /* input parameters */ |
797 | event64_t event; |
798 | wait_result_t result; |
799 | waitq_wakeup_flags_t flags; |
800 | uint32_t max_threads; |
801 | bool is_identified; |
802 | |
803 | /* output parameters */ |
804 | /* counts all woken threads, may have more threads than on threadq */ |
805 | uint32_t nthreads; |
806 | /* preemption is disabled while threadq is non-empty */ |
807 | circle_queue_head_t threadq; |
808 | }; |
809 | |
810 | static inline void |
811 | maybe_adjust_thread_pri( |
812 | thread_t thread, |
813 | waitq_wakeup_flags_t flags, |
814 | __kdebug_only waitq_t waitq) |
815 | { |
816 | /* |
817 | * If the caller is requesting the waitq subsystem to promote the |
818 | * priority of the awoken thread, then boost the thread's priority to |
819 | * the default WAITQ_BOOST_PRIORITY (if it's not already equal or |
820 | * higher priority). This boost must be removed via a call to |
821 | * waitq_clear_promotion_locked before the thread waits again. |
822 | */ |
823 | if (flags & WAITQ_PROMOTE_PRIORITY) { |
824 | uintptr_t trace_waitq = 0; |
825 | if (__improbable(kdebug_enable)) { |
826 | trace_waitq = VM_KERNEL_UNSLIDE_OR_PERM(waitq.wq_q); |
827 | } |
828 | |
829 | sched_thread_promote_reason(thread, TH_SFLAG_WAITQ_PROMOTED, trace_obj: trace_waitq); |
830 | } |
831 | } |
832 | |
833 | static void |
834 | waitq_select_queue_add(waitq_t waitq, thread_t thread, struct waitq_select_args *args) |
835 | { |
836 | spl_t s = splsched(); |
837 | |
838 | thread_lock(thread); |
839 | thread_clear_waitq_state(thread); |
840 | |
841 | if (!args->is_identified && thread->state & TH_RUN) { |
842 | /* |
843 | * A thread that is currently on core may try to clear its own |
844 | * wait with clear wait or by waking its own event instead of |
845 | * calling thread_block as is normally expected. After doing |
846 | * this, it expects to be able to immediately wait again. |
847 | * |
848 | * If we are currently on a different CPU and waking that |
849 | * thread, as soon as we unlock the waitq and thread, that |
850 | * operation could complete, but we would still be holding the |
851 | * thread on our flush queue, leaving it in the waking state |
852 | * where it can't yet assert another wait. |
853 | * |
854 | * Since we know that we won't actually need to enqueue the |
855 | * thread on the runq due to it being on core, we can just |
856 | * immediately unblock it here so that the thread will be in a |
857 | * waitable state after we release its thread lock from this |
858 | * lock hold. |
859 | * |
860 | * Wakeups using *_identify can't be allowed to pass |
861 | * thread block until they're resumed, so they can't use |
862 | * this path. That means they are not allowed to skip calling |
863 | * thread_block. |
864 | */ |
865 | maybe_adjust_thread_pri(thread, flags: args->flags, waitq); |
866 | thread_go(thread, wresult: args->result, false); |
867 | } else { |
868 | if (circle_queue_empty(cq: &args->threadq)) { |
869 | /* |
870 | * preemption is disabled while threads are |
871 | * on threadq - balanced in: |
872 | * waitq_resume_identified_thread |
873 | * waitq_select_queue_flush |
874 | */ |
875 | disable_preemption(); |
876 | } |
877 | |
878 | circle_enqueue_tail(cq: &args->threadq, elt: &thread->wait_links); |
879 | } |
880 | |
881 | thread_unlock(thread); |
882 | |
883 | splx(s); |
884 | } |
885 | |
886 | |
887 | #if SCHED_HYGIENE_DEBUG |
888 | |
889 | TUNABLE_DEV_WRITEABLE(uint32_t, waitq_flush_excess_threads, "waitq_flush_excess_threads" , 20); |
890 | TUNABLE_DEV_WRITEABLE(uint32_t, waitq_flush_excess_time_mt, "waitq_flush_excess_time_mt" , 7200); /* 300us */ |
891 | |
892 | #endif /* SCHED_HYGIENE_DEBUG */ |
893 | |
894 | |
895 | static void |
896 | waitq_select_queue_flush(waitq_t waitq, struct waitq_select_args *args) |
897 | { |
898 | thread_t thread = THREAD_NULL; |
899 | |
900 | assert(!circle_queue_empty(&args->threadq)); |
901 | |
902 | int flushed_threads = 0; |
903 | |
904 | #if SCHED_HYGIENE_DEBUG |
905 | uint64_t start_time = ml_get_sched_hygiene_timebase(); |
906 | disable_preemption(); |
907 | #endif /* SCHED_HYGIENE_DEBUG */ |
908 | |
909 | cqe_foreach_element_safe(thread, &args->threadq, wait_links) { |
910 | circle_dequeue(cq: &args->threadq, elt: &thread->wait_links); |
911 | assert_thread_magic(thread); |
912 | |
913 | spl_t s = splsched(); |
914 | |
915 | thread_lock(thread); |
916 | maybe_adjust_thread_pri(thread, flags: args->flags, waitq); |
917 | thread_go(thread, wresult: args->result, try_handoff: args->flags & WAITQ_HANDOFF); |
918 | thread_unlock(thread); |
919 | |
920 | splx(s); |
921 | |
922 | flushed_threads++; |
923 | } |
924 | |
925 | #if SCHED_HYGIENE_DEBUG |
926 | uint64_t end_time = ml_get_sched_hygiene_timebase(); |
927 | |
928 | /* |
929 | * Check for a combination of excess threads and long time, |
930 | * so that a single thread wakeup that gets stuck is still caught |
931 | */ |
932 | if (waitq_flush_excess_threads && waitq_flush_excess_time_mt && |
933 | flushed_threads > waitq_flush_excess_threads && |
934 | (end_time - start_time) > waitq_flush_excess_time_mt) { |
935 | /* |
936 | * Hack alert: |
937 | * |
938 | * If a wakeup-all is done with interrupts disabled, or if |
939 | * there are enough threads / lock contention to pass the |
940 | * preemption disable threshold, it can take Too Long to get |
941 | * through waking up all the threads, leading to |
942 | * the watchdog going off. |
943 | * |
944 | * While we are working on a change to break up this |
945 | * giant glob of work into smaller chunks, remove this |
946 | * time region from the watchdog's memory to avoid |
947 | * unit tests that wake up hundreds of threads on |
948 | * one semaphore from causing this to blow up. |
949 | * |
950 | * We only trigger this when seeing a combination of |
951 | * excess threads and long time, so that a single |
952 | * thread wakeup that gets stuck is still caught. |
953 | * |
954 | * This was improved with |
955 | * rdar://90325140 |
956 | * to enable interrupts during most wakeup-all's |
957 | * and will be removed with |
958 | * rdar://101110793 |
959 | */ |
960 | if (ml_get_interrupts_enabled() == false) { |
961 | ml_spin_debug_reset(current_thread()); |
962 | ml_irq_debug_abandon(); |
963 | } |
964 | abandon_preemption_disable_measurement(); |
965 | |
966 | KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_INT_MASKED_RESET), flushed_threads, end_time - start_time); |
967 | } |
968 | |
969 | enable_preemption(); |
970 | |
971 | #endif /* SCHED_HYGIENE_DEBUG */ |
972 | |
973 | /* |
974 | * match the disable when making threadq nonempty from |
975 | * waitq_select_queue_add |
976 | */ |
977 | enable_preemption(); |
978 | } |
979 | |
980 | /** |
981 | * Routine to iterate over the waitq for non-priority ordered waitqs |
982 | * |
983 | * Conditions: |
984 | * args->waitq (and the posted waitq) is locked |
985 | * |
986 | * Notes: |
987 | * If one or more threads are selected, this may disable preemption, |
988 | * which is balanced when the threadq is flushed in |
989 | * waitq_resume_identified_thread or waitq_select_queue_flush. |
990 | */ |
991 | static waitq_flags_t |
992 | waitq_queue_iterate_locked(struct waitq *safeq, struct waitq *waitq, |
993 | struct waitq_select_args *args) |
994 | { |
995 | thread_t thread = THREAD_NULL; |
996 | waitq_flags_t eventmask = 0; |
997 | |
998 | cqe_foreach_element_safe(thread, &safeq->waitq_queue, wait_links) { |
999 | assert_thread_magic(thread); |
1000 | |
1001 | /* |
1002 | * For non-priority ordered waitqs, we allow multiple events to be |
1003 | * mux'ed into the same waitq. Also safeqs may contain threads from |
1004 | * multiple waitqs. Only pick threads that match the |
1005 | * requested wait event. |
1006 | */ |
1007 | if (waitq_same(wq1: thread->waitq, wq2: waitq) && thread->wait_event == args->event) { |
1008 | /* We found a matching thread! Pull it from the queue. */ |
1009 | |
1010 | circle_dequeue(cq: &safeq->waitq_queue, elt: &thread->wait_links); |
1011 | |
1012 | waitq_select_queue_add(waitq: waitq, thread, args); |
1013 | |
1014 | if (++args->nthreads >= args->max_threads) { |
1015 | break; |
1016 | } |
1017 | } else { |
1018 | /* thread wasn't selected so track its event */ |
1019 | eventmask |= waitq_same(wq1: thread->waitq, wq2: safeq) |
1020 | ? _CAST_TO_EVENT_MASK(thread->wait_event) |
1021 | : _CAST_TO_EVENT_MASK(thread->waitq.wq_q); |
1022 | } |
1023 | } |
1024 | |
1025 | return eventmask; |
1026 | } |
1027 | |
1028 | /** |
1029 | * Routine to iterate and remove threads from priority ordered waitqs |
1030 | * |
1031 | * Conditions: |
1032 | * args->waitq (and the posted waitq) is locked |
1033 | * |
1034 | * Notes: |
1035 | * The priority ordered waitqs only support maximum priority element removal. |
1036 | * |
1037 | * Also, the implementation makes sure that all threads in a priority ordered |
1038 | * waitq are waiting on the same wait event. This is not necessarily true for |
1039 | * non-priority ordered waitqs. If one or more threads are selected, this may |
1040 | * disable preemption. |
1041 | */ |
1042 | static void |
1043 | waitq_prioq_iterate_locked( |
1044 | struct waitq *ts_wq, |
1045 | struct waitq *waitq, |
1046 | struct waitq_select_args *args) |
1047 | { |
1048 | struct turnstile *ts = waitq_to_turnstile(ts_wq); |
1049 | bool update_inheritor = (args->flags & WAITQ_UPDATE_INHERITOR); |
1050 | |
1051 | if (update_inheritor && args->max_threads == UINT32_MAX) { |
1052 | /* |
1053 | * If we are going to wake up all threads, |
1054 | * go ahead and set the inheritor to NULL. |
1055 | */ |
1056 | turnstile_kernel_update_inheritor_on_wake_locked(turnstile: ts, |
1057 | TURNSTILE_INHERITOR_NULL, flags: TURNSTILE_INHERITOR_THREAD); |
1058 | update_inheritor = false; |
1059 | } |
1060 | |
1061 | while (!priority_queue_empty(&ts_wq->waitq_prio_queue)) { |
1062 | thread_t thread; |
1063 | |
1064 | thread = priority_queue_remove_max(&ts_wq->waitq_prio_queue, |
1065 | struct thread, wait_prioq_links); |
1066 | |
1067 | assert_thread_magic(thread); |
1068 | |
1069 | /* |
1070 | * Ensure the wait event matches since priority ordered waitqs do not |
1071 | * support multiple events in the same waitq. |
1072 | */ |
1073 | assert(waitq_same(thread->waitq, waitq) && (thread->wait_event == args->event)); |
1074 | |
1075 | if (update_inheritor) { |
1076 | turnstile_inheritor_t inheritor = thread; |
1077 | |
1078 | if (priority_queue_empty(&ts_wq->waitq_prio_queue)) { |
1079 | inheritor = TURNSTILE_INHERITOR_NULL; |
1080 | } |
1081 | turnstile_kernel_update_inheritor_on_wake_locked(turnstile: ts, |
1082 | new_inheritor: inheritor, flags: TURNSTILE_INHERITOR_THREAD); |
1083 | update_inheritor = false; |
1084 | } |
1085 | |
1086 | waitq_select_queue_add(waitq: waitq, thread, args); |
1087 | |
1088 | if (++args->nthreads >= args->max_threads) { |
1089 | break; |
1090 | } |
1091 | } |
1092 | } |
1093 | |
1094 | /** |
1095 | * @function do_waitq_select_n_locked_queue |
1096 | * |
1097 | * @brief |
1098 | * Selects threads waiting on a wait queue. |
1099 | * |
1100 | * @discussion |
1101 | * @c waitq is locked. |
1102 | * If @c waitq is a set, then the wait queue posting to it is locked too. |
1103 | * |
1104 | * If one or more threads are selected, this may disable preemption. |
1105 | */ |
1106 | static void |
1107 | do_waitq_select_n_locked_queue(waitq_t waitq, struct waitq_select_args *args) |
1108 | { |
1109 | spl_t s = 0; |
1110 | |
1111 | struct waitq *safeq; |
1112 | waitq_flags_t eventmask, remaining_eventmask; |
1113 | |
1114 | if (waitq_irq_safe(waitq)) { |
1115 | eventmask = _CAST_TO_EVENT_MASK(args->event); |
1116 | safeq = waitq.wq_q; |
1117 | } else { |
1118 | /* JMM - add flag to waitq to avoid global lookup if no waiters */ |
1119 | eventmask = _CAST_TO_EVENT_MASK(waitq.wq_q); |
1120 | safeq = waitq_get_safeq(waitq); |
1121 | if (safeq == NULL) { |
1122 | return; |
1123 | } |
1124 | |
1125 | s = splsched(); |
1126 | waitq_lock(wq: safeq); |
1127 | } |
1128 | |
1129 | /* |
1130 | * If the safeq doesn't have an eventmask (not global) or the event |
1131 | * we're looking for IS set in its eventmask, then scan the threads |
1132 | * in that queue for ones that match the original <waitq,event> pair. |
1133 | */ |
1134 | if (waitq_type(wq: safeq) == WQT_TURNSTILE) { |
1135 | waitq_prioq_iterate_locked(ts_wq: safeq, waitq: waitq.wq_q, args); |
1136 | } else if (!waitq_is_global(waitq: safeq)) { |
1137 | waitq_queue_iterate_locked(safeq, waitq: waitq.wq_q, args); |
1138 | } else if ((safeq->waitq_eventmask & eventmask) == eventmask) { |
1139 | remaining_eventmask = waitq_queue_iterate_locked(safeq, |
1140 | waitq: waitq.wq_q, args); |
1141 | |
1142 | /* |
1143 | * Update the eventmask of global queues we just scanned: |
1144 | * - If we selected all the threads in the queue, |
1145 | * we can clear its eventmask. |
1146 | * |
1147 | * - If we didn't find enough threads to fill our needs, |
1148 | * then we can assume we looked at every thread in the queue |
1149 | * and the mask we computed is complete - so reset it. |
1150 | */ |
1151 | if (waitq_empty(wq: safeq)) { |
1152 | safeq->waitq_eventmask = 0; |
1153 | } else if (args->nthreads < args->max_threads) { |
1154 | safeq->waitq_eventmask = remaining_eventmask; |
1155 | } |
1156 | } |
1157 | |
1158 | /* unlock the safe queue if we locked one above */ |
1159 | if (!waitq_same(wq1: waitq, wq2: safeq)) { |
1160 | waitq_unlock(wq: safeq); |
1161 | splx(s); |
1162 | } |
1163 | } |
1164 | |
1165 | /** |
1166 | * @function do_waitq_link_select_n_locked() |
1167 | * |
1168 | * @brief |
1169 | * Selects threads waiting on any set a wait queue belongs to, |
1170 | * or preposts the wait queue onto them. |
1171 | * |
1172 | * @discussion |
1173 | * @c waitq is locked. |
1174 | */ |
1175 | __attribute__((noinline)) |
1176 | static void |
1177 | do_waitq_select_n_locked_sets(waitq_t waitq, struct waitq_select_args *args) |
1178 | { |
1179 | waitq_type_t wq_type = waitq_type(wq: waitq); |
1180 | waitq_link_t link; |
1181 | |
1182 | assert(args->event == NO_EVENT64); |
1183 | assert(waitq_preposts(waitq)); |
1184 | |
1185 | waitq_link_foreach(link, waitq) { |
1186 | waitq_t wqset = wql_wqs(link); |
1187 | |
1188 | if (wql_wqs_preposted(link)) { |
1189 | /* |
1190 | * The wql_wqs_preposted() bit is cleared |
1191 | * under both the wq/wqset lock. |
1192 | * |
1193 | * If the wqset is still preposted, |
1194 | * we really won't find threads there. |
1195 | * |
1196 | * Just mark the waitq as preposted and move on. |
1197 | */ |
1198 | if (wq_type == WQT_PORT) { |
1199 | waitq.wq_q->waitq_preposted = true; |
1200 | } |
1201 | continue; |
1202 | } |
1203 | |
1204 | if (wq_type == WQT_SELECT) { |
1205 | /* |
1206 | * If PGZ picked this select set, |
1207 | * translate it to the real address |
1208 | * |
1209 | * If it is still a select set |
1210 | * (the slot could have been reused), |
1211 | * then keep using it for the rest of the logic. |
1212 | * |
1213 | * Even in the extremely unlikely case where |
1214 | * the slot was reused for another select_set, |
1215 | * the `wql_sellink_valid` check below will |
1216 | * take care of debouncing it. But we must |
1217 | * forget the original pointer we read |
1218 | * so that we unlock the proper object. |
1219 | */ |
1220 | wqset.wqs_sel = pgz_decode_allow_invalid(wqset.wqs_sel, |
1221 | ZONE_ID_SELECT_SET); |
1222 | if (!wqset.wqs_sel) { |
1223 | continue; |
1224 | } |
1225 | if (!waitq_lock_allow_invalid(wq: wqset)) { |
1226 | continue; |
1227 | } |
1228 | if (!wql_sellink_valid(selset: wqset.wqs_sel, link: link.wqls)) { |
1229 | goto out_unlock; |
1230 | } |
1231 | } else { |
1232 | waitq_lock(wq: wqset); |
1233 | if (!waitq_valid(waitq: wqset)) { |
1234 | goto out_unlock; |
1235 | } |
1236 | } |
1237 | |
1238 | /* |
1239 | * Find any threads waiting on this wait queue set as a queue. |
1240 | */ |
1241 | do_waitq_select_n_locked_queue(waitq: wqset, args); |
1242 | |
1243 | if (args->nthreads == 0) { |
1244 | /* No thread selected: prepost 'waitq' to 'wqset' */ |
1245 | wql_wqs_mark_preposted(link); |
1246 | if (wq_type == WQT_SELECT) { |
1247 | wqset.wqs_sel->selset_preposted = true; |
1248 | } else { |
1249 | waitq.wq_q->waitq_preposted = true; |
1250 | circle_dequeue(cq: &wqset.wqs_set->wqset_links, |
1251 | elt: &link.wqll->wql_slink); |
1252 | circle_enqueue_tail(cq: &wqset.wqs_set->wqset_preposts, |
1253 | elt: &link.wqll->wql_slink); |
1254 | ipc_pset_prepost(wqset: wqset.wqs_set, waitq: waitq.wq_q); |
1255 | } |
1256 | } |
1257 | |
1258 | out_unlock: |
1259 | waitq_unlock(wq: wqset); |
1260 | |
1261 | if (args->nthreads >= args->max_threads) { |
1262 | break; |
1263 | } |
1264 | } |
1265 | } |
1266 | |
1267 | /** |
1268 | * @function do_waitq_select_n_locked |
1269 | * |
1270 | * @brief |
1271 | * Selects threads waiting on a wait queue, or preposts it. |
1272 | * |
1273 | * @discussion |
1274 | * @c waitq is locked. |
1275 | * |
1276 | * Recurses into all sets this wait queue belongs to. |
1277 | */ |
1278 | static void |
1279 | do_waitq_select_n_locked(waitq_t waitq, struct waitq_select_args *args) |
1280 | { |
1281 | do_waitq_select_n_locked_queue(waitq, args); |
1282 | |
1283 | if (args->nthreads >= args->max_threads) { |
1284 | /* already enough threads found */ |
1285 | return; |
1286 | } |
1287 | |
1288 | if (args->event != NO_EVENT64 || !waitq_preposts(wq: waitq)) { |
1289 | /* this wakeup should not recurse into sets */ |
1290 | return; |
1291 | } |
1292 | |
1293 | do_waitq_select_n_locked_sets(waitq, args); |
1294 | } |
1295 | |
1296 | static inline bool |
1297 | waitq_is_preposted_set(waitq_t waitq) |
1298 | { |
1299 | switch (waitq_type(wq: waitq)) { |
1300 | case WQT_PORT_SET: |
1301 | return waitq_set_first_prepost(wqset: waitq.wqs_set, flags: WQS_PREPOST_PEEK) != NULL; |
1302 | |
1303 | case WQT_SELECT_SET: |
1304 | return waitq.wqs_sel->selset_preposted; |
1305 | |
1306 | default: |
1307 | return false; |
1308 | } |
1309 | } |
1310 | |
1311 | wait_result_t |
1312 | waitq_assert_wait64_locked(waitq_t waitq, |
1313 | event64_t wait_event, |
1314 | wait_interrupt_t interruptible, |
1315 | wait_timeout_urgency_t urgency, |
1316 | uint64_t deadline, |
1317 | uint64_t leeway, |
1318 | thread_t thread) |
1319 | { |
1320 | wait_result_t wait_result; |
1321 | struct waitq *safeq; |
1322 | uintptr_t eventmask; |
1323 | spl_t s; |
1324 | |
1325 | switch (waitq_type(wq: waitq)) { |
1326 | case WQT_PORT: |
1327 | case WQT_SELECT: |
1328 | case WQT_PORT_SET: |
1329 | case WQT_SELECT_SET: |
1330 | assert(wait_event == NO_EVENT64); |
1331 | break; |
1332 | default: |
1333 | assert(wait_event != NO_EVENT64); |
1334 | break; |
1335 | } |
1336 | |
1337 | /* |
1338 | * Warning: Do _not_ place debugging print statements here. |
1339 | * The waitq is locked! |
1340 | */ |
1341 | assert(!thread->started || thread == current_thread()); |
1342 | |
1343 | if (!waitq_wait_possible(thread)) { |
1344 | panic("thread already waiting on %p" , thread->waitq.wq_q); |
1345 | } |
1346 | |
1347 | s = splsched(); |
1348 | |
1349 | /* |
1350 | * early-out if the thread is waiting on a wait queue set |
1351 | * that has already been pre-posted. |
1352 | * |
1353 | * Note: waitq_is_preposted_set() may unlock the waitq-set |
1354 | */ |
1355 | if (waitq_is_preposted_set(waitq)) { |
1356 | thread_lock(thread); |
1357 | thread->wait_result = THREAD_AWAKENED; |
1358 | thread_unlock(thread); |
1359 | splx(s); |
1360 | return THREAD_AWAKENED; |
1361 | } |
1362 | |
1363 | /* |
1364 | * If already dealing with an irq safe wait queue, we are all set. |
1365 | * Otherwise, determine a global queue to use and lock it. |
1366 | */ |
1367 | if (waitq_irq_safe(waitq)) { |
1368 | safeq = waitq.wq_q; |
1369 | eventmask = _CAST_TO_EVENT_MASK(wait_event); |
1370 | } else { |
1371 | safeq = waitq_get_safeq(waitq); |
1372 | if (__improbable(safeq == NULL)) { |
1373 | panic("Trying to assert_wait on a turnstile proxy " |
1374 | "that hasn't been donated one (waitq: %p)" , waitq.wq_q); |
1375 | } |
1376 | eventmask = _CAST_TO_EVENT_MASK(waitq.wq_q); |
1377 | waitq_lock(wq: safeq); |
1378 | } |
1379 | |
1380 | /* lock the thread now that we have the irq-safe waitq locked */ |
1381 | thread_lock(thread); |
1382 | |
1383 | wait_result = thread_mark_wait_locked(thread, interruptible); |
1384 | /* thread->wait_result has been set */ |
1385 | if (wait_result == THREAD_WAITING) { |
1386 | waitq_thread_insert(safeq, thread, wq: waitq, event: wait_event); |
1387 | |
1388 | if (deadline != 0) { |
1389 | bool was_active; |
1390 | |
1391 | was_active = timer_call_enter_with_leeway(call: thread->wait_timer, |
1392 | NULL, |
1393 | deadline, leeway, |
1394 | flags: urgency, FALSE); |
1395 | if (!was_active) { |
1396 | thread->wait_timer_active++; |
1397 | } |
1398 | thread->wait_timer_armed = true; |
1399 | } |
1400 | |
1401 | if (waitq_is_global(waitq: safeq)) { |
1402 | safeq->waitq_eventmask |= (waitq_flags_t)eventmask; |
1403 | } |
1404 | |
1405 | waitq_stats_count_wait(waitq); |
1406 | } |
1407 | |
1408 | /* unlock the thread */ |
1409 | thread_unlock(thread); |
1410 | |
1411 | /* update the inheritor's thread priority if the waitq is embedded in turnstile */ |
1412 | if (waitq_type(wq: safeq) == WQT_TURNSTILE && wait_result == THREAD_WAITING) { |
1413 | turnstile_recompute_priority_locked(waitq_to_turnstile(safeq)); |
1414 | turnstile_update_inheritor_locked(waitq_to_turnstile(safeq)); |
1415 | } |
1416 | |
1417 | /* unlock the safeq if we locked it here */ |
1418 | if (!waitq_same(wq1: waitq, wq2: safeq)) { |
1419 | waitq_unlock(wq: safeq); |
1420 | } |
1421 | |
1422 | splx(s); |
1423 | |
1424 | return wait_result; |
1425 | } |
1426 | |
1427 | bool |
1428 | waitq_pull_thread_locked(waitq_t waitq, thread_t thread) |
1429 | { |
1430 | struct waitq *safeq; |
1431 | uint32_t ticket; |
1432 | |
1433 | assert_thread_magic(thread); |
1434 | |
1435 | /* Find the interrupts disabled queue thread is waiting on */ |
1436 | if (waitq_irq_safe(waitq)) { |
1437 | safeq = waitq.wq_q; |
1438 | } else { |
1439 | safeq = waitq_get_safeq(waitq); |
1440 | if (__improbable(safeq == NULL)) { |
1441 | panic("Trying to clear_wait on a turnstile proxy " |
1442 | "that hasn't been donated one (waitq: %p)" , waitq.wq_q); |
1443 | } |
1444 | } |
1445 | |
1446 | /* |
1447 | * thread is already locked so have to try for the waitq lock. |
1448 | * |
1449 | * We can't wait for the waitq lock under the thread lock, |
1450 | * however we can reserve our slot in the lock queue, |
1451 | * and if that reservation requires waiting, we are guaranteed |
1452 | * that this waitq can't die until we got our turn! |
1453 | */ |
1454 | if (!waitq_lock_reserve(wq: safeq, ticket: &ticket)) { |
1455 | thread_unlock(thread); |
1456 | waitq_lock_wait(wq: safeq, ticket); |
1457 | thread_lock(thread); |
1458 | |
1459 | if (!waitq_same(wq1: waitq, wq2: thread->waitq)) { |
1460 | /* |
1461 | * While we were waiting for our reservation the thread |
1462 | * stopped waiting on this waitq, bail out. |
1463 | */ |
1464 | waitq_unlock(wq: safeq); |
1465 | return false; |
1466 | } |
1467 | } |
1468 | |
1469 | waitq_thread_remove(wq: safeq, thread); |
1470 | waitq_stats_count_clear_wakeup(waitq); |
1471 | waitq_unlock(wq: safeq); |
1472 | return true; |
1473 | } |
1474 | |
1475 | |
1476 | void |
1477 | waitq_clear_promotion_locked(waitq_t waitq, thread_t thread) |
1478 | { |
1479 | spl_t s = 0; |
1480 | |
1481 | assert(waitq_held(waitq)); |
1482 | assert(thread != THREAD_NULL); |
1483 | assert(thread == current_thread()); |
1484 | |
1485 | /* This flag is only cleared by the thread itself, so safe to check outside lock */ |
1486 | if ((thread->sched_flags & TH_SFLAG_WAITQ_PROMOTED) != TH_SFLAG_WAITQ_PROMOTED) { |
1487 | return; |
1488 | } |
1489 | |
1490 | if (!waitq_irq_safe(waitq)) { |
1491 | s = splsched(); |
1492 | } |
1493 | thread_lock(thread); |
1494 | |
1495 | sched_thread_unpromote_reason(thread, TH_SFLAG_WAITQ_PROMOTED, trace_obj: 0); |
1496 | |
1497 | thread_unlock(thread); |
1498 | if (!waitq_irq_safe(waitq)) { |
1499 | splx(s); |
1500 | } |
1501 | } |
1502 | |
1503 | static inline bool |
1504 | waitq_should_unlock(waitq_wakeup_flags_t flags) |
1505 | { |
1506 | return (flags & (WAITQ_UNLOCK | WAITQ_KEEP_LOCKED)) == WAITQ_UNLOCK; |
1507 | } |
1508 | |
1509 | static inline bool |
1510 | waitq_should_enable_interrupts(waitq_wakeup_flags_t flags) |
1511 | { |
1512 | return (flags & (WAITQ_UNLOCK | WAITQ_KEEP_LOCKED | WAITQ_ENABLE_INTERRUPTS)) == (WAITQ_UNLOCK | WAITQ_ENABLE_INTERRUPTS); |
1513 | } |
1514 | |
1515 | kern_return_t |
1516 | waitq_wakeup64_all_locked( |
1517 | waitq_t waitq, |
1518 | event64_t wake_event, |
1519 | wait_result_t result, |
1520 | waitq_wakeup_flags_t flags) |
1521 | { |
1522 | struct waitq_select_args args = { |
1523 | .event = wake_event, |
1524 | .result = result, |
1525 | .flags = flags & ~WAITQ_HANDOFF, |
1526 | .max_threads = UINT32_MAX, |
1527 | }; |
1528 | |
1529 | assert(waitq_held(waitq)); |
1530 | |
1531 | if (flags & WAITQ_ENABLE_INTERRUPTS) { |
1532 | assert(waitq_should_unlock(flags)); |
1533 | assert(ml_get_interrupts_enabled() == false); |
1534 | } |
1535 | |
1536 | do_waitq_select_n_locked(waitq, args: &args); |
1537 | waitq_stats_count_wakeup(waitq, args.nthreads); |
1538 | |
1539 | if (waitq_should_unlock(flags)) { |
1540 | waitq_unlock(wq: waitq); |
1541 | } |
1542 | |
1543 | if (waitq_should_enable_interrupts(flags)) { |
1544 | ml_set_interrupts_enabled(true); |
1545 | } |
1546 | |
1547 | if (!circle_queue_empty(cq: &args.threadq)) { |
1548 | waitq_select_queue_flush(waitq, args: &args); |
1549 | } |
1550 | |
1551 | if (args.nthreads > 0) { |
1552 | return KERN_SUCCESS; |
1553 | } |
1554 | |
1555 | return KERN_NOT_WAITING; |
1556 | } |
1557 | |
1558 | kern_return_t |
1559 | waitq_wakeup64_one_locked( |
1560 | waitq_t waitq, |
1561 | event64_t wake_event, |
1562 | wait_result_t result, |
1563 | waitq_wakeup_flags_t flags) |
1564 | { |
1565 | struct waitq_select_args args = { |
1566 | .event = wake_event, |
1567 | .result = result, |
1568 | .flags = flags, |
1569 | .max_threads = 1, |
1570 | }; |
1571 | |
1572 | assert(waitq_held(waitq)); |
1573 | |
1574 | if (flags & WAITQ_ENABLE_INTERRUPTS) { |
1575 | assert(waitq_should_unlock(flags)); |
1576 | assert(ml_get_interrupts_enabled() == false); |
1577 | } |
1578 | |
1579 | do_waitq_select_n_locked(waitq, args: &args); |
1580 | waitq_stats_count_wakeup(waitq, args.nthreads); |
1581 | |
1582 | if (waitq_should_unlock(flags)) { |
1583 | waitq_unlock(wq: waitq); |
1584 | } |
1585 | |
1586 | if (waitq_should_enable_interrupts(flags)) { |
1587 | ml_set_interrupts_enabled(true); |
1588 | } |
1589 | |
1590 | if (!circle_queue_empty(cq: &args.threadq)) { |
1591 | waitq_select_queue_flush(waitq, args: &args); |
1592 | } |
1593 | |
1594 | if (args.nthreads > 0) { |
1595 | return KERN_SUCCESS; |
1596 | } |
1597 | |
1598 | return KERN_NOT_WAITING; |
1599 | } |
1600 | |
1601 | thread_t |
1602 | waitq_wakeup64_identify_locked( |
1603 | waitq_t waitq, |
1604 | event64_t wake_event, |
1605 | wait_result_t result, |
1606 | waitq_wakeup_flags_t flags) |
1607 | { |
1608 | struct waitq_select_args args = { |
1609 | .event = wake_event, |
1610 | .result = result, |
1611 | .flags = flags, |
1612 | .max_threads = 1, |
1613 | .is_identified = true, |
1614 | }; |
1615 | |
1616 | assert(waitq_held(waitq)); |
1617 | |
1618 | do_waitq_select_n_locked(waitq, args: &args); |
1619 | waitq_stats_count_wakeup(waitq, args.nthreads); |
1620 | |
1621 | if (waitq_should_unlock(flags)) { |
1622 | waitq_unlock(wq: waitq); |
1623 | } |
1624 | |
1625 | if (waitq_should_enable_interrupts(flags)) { |
1626 | ml_set_interrupts_enabled(true); |
1627 | } |
1628 | |
1629 | if (args.nthreads > 0) { |
1630 | thread_t thread = cqe_dequeue_head(&args.threadq, struct thread, wait_links); |
1631 | |
1632 | assert(args.nthreads == 1 && circle_queue_empty(&args.threadq)); |
1633 | |
1634 | /* Thread is off waitq, not unblocked yet */ |
1635 | |
1636 | return thread; |
1637 | } |
1638 | |
1639 | return THREAD_NULL; |
1640 | } |
1641 | |
1642 | void |
1643 | waitq_resume_identified_thread( |
1644 | waitq_t waitq, |
1645 | thread_t thread, |
1646 | wait_result_t result, |
1647 | waitq_wakeup_flags_t flags) |
1648 | { |
1649 | spl_t spl = splsched(); |
1650 | |
1651 | thread_lock(thread); |
1652 | |
1653 | assert((thread->state & (TH_WAIT | TH_WAKING)) == (TH_WAIT | TH_WAKING)); |
1654 | |
1655 | maybe_adjust_thread_pri(thread, flags, waitq); |
1656 | thread_go(thread, wresult: result, try_handoff: (flags & WAITQ_HANDOFF)); |
1657 | |
1658 | thread_unlock(thread); |
1659 | splx(spl); |
1660 | |
1661 | enable_preemption(); // balance disable upon pulling thread |
1662 | } |
1663 | |
1664 | void |
1665 | waitq_resume_and_bind_identified_thread( |
1666 | waitq_t waitq, |
1667 | thread_t thread, |
1668 | processor_t processor, |
1669 | wait_result_t result, |
1670 | waitq_wakeup_flags_t flags) |
1671 | { |
1672 | spl_t spl = splsched(); |
1673 | |
1674 | thread_lock(thread); |
1675 | |
1676 | assert((thread->state & (TH_WAIT | TH_WAKING)) == (TH_WAIT | TH_WAKING)); |
1677 | |
1678 | maybe_adjust_thread_pri(thread, flags, waitq); |
1679 | thread_bind_during_wakeup(thread, processor); |
1680 | thread_go(thread, wresult: result, try_handoff: (flags & WAITQ_HANDOFF)); |
1681 | |
1682 | thread_unlock(thread); |
1683 | splx(spl); |
1684 | |
1685 | enable_preemption(); // balance disable upon pulling thread |
1686 | } |
1687 | |
1688 | kern_return_t |
1689 | waitq_wakeup64_thread_and_unlock( |
1690 | struct waitq *waitq, |
1691 | event64_t event, |
1692 | thread_t thread, |
1693 | wait_result_t result) |
1694 | { |
1695 | kern_return_t ret = KERN_NOT_WAITING; |
1696 | |
1697 | assert(waitq_irq_safe(waitq)); |
1698 | assert(waitq_held(waitq)); |
1699 | assert_thread_magic(thread); |
1700 | |
1701 | /* |
1702 | * See if the thread was still waiting there. If so, it got |
1703 | * dequeued and returned locked. |
1704 | * |
1705 | * By holding the thread locked across the go, a thread on another CPU |
1706 | * can't see itself in 'waking' state, even if it uses clear_wait. |
1707 | */ |
1708 | thread_lock(thread); |
1709 | |
	if (waitq_same(thread->waitq, waitq) && thread->wait_event == event) {
		waitq_thread_remove(waitq, thread);
1712 | ret = KERN_SUCCESS; |
1713 | } |
1714 | waitq_stats_count_wakeup(waitq, ret == KERN_SUCCESS ? 1 : 0); |
1715 | |
	waitq_unlock(waitq);
1717 | |
1718 | if (ret == KERN_SUCCESS) { |
		thread_go(thread, result, /* handoff */ false);
1720 | } |
1721 | |
1722 | thread_unlock(thread); |
1723 | |
1724 | return ret; |
1725 | } |
1726 | |
1727 | |
1728 | #pragma mark waitq |
1729 | |
1730 | __attribute__((always_inline)) |
1731 | void |
1732 | waitq_init(waitq_t waitq, waitq_type_t type, int policy) |
1733 | { |
1734 | assert((policy & SYNC_POLICY_FIXED_PRIORITY) == 0); |
1735 | |
1736 | *waitq.wq_q = (struct waitq){ |
1737 | .waitq_type = type, |
1738 | .waitq_fifo = ((policy & SYNC_POLICY_REVERSED) == 0), |
1739 | }; |
1740 | |
1741 | switch (type) { |
1742 | case WQT_INVALID: |
1743 | __builtin_trap(); |
1744 | |
1745 | case WQT_TURNSTILE: |
1746 | /* For turnstile, initialize it as a priority queue */ |
		priority_queue_init(&waitq.wq_q->waitq_prio_queue);
1748 | assert(waitq.wq_q->waitq_fifo == 0); |
1749 | break; |
1750 | |
1751 | case WQT_PORT: |
1752 | waitq.wq_q->waitq_ts = TURNSTILE_NULL; |
1753 | break; |
1754 | |
1755 | case WQT_PORT_SET: |
1756 | circle_queue_init(&waitq.wqs_set->wqset_preposts); |
1757 | OS_FALLTHROUGH; |
1758 | case WQT_SELECT_SET: |
1759 | case WQT_QUEUE: |
1760 | case WQT_SELECT: |
1761 | circle_queue_init(&waitq.wq_q->waitq_queue); |
1762 | break; |
1763 | } |
1764 | |
1765 | if (policy & SYNC_POLICY_INIT_LOCKED) { |
1766 | hw_lck_ticket_init_locked(&waitq.wq_q->waitq_interlock, &waitq_lck_grp); |
1767 | } else { |
1768 | hw_lck_ticket_init(&waitq.wq_q->waitq_interlock, &waitq_lck_grp); |
1769 | } |
1770 | } |
1771 | |
1772 | void |
1773 | waitq_deinit(waitq_t waitq) |
1774 | { |
	waitq_type_t type = waitq_type(waitq);
1776 | |
1777 | switch (type) { |
1778 | case WQT_QUEUE: |
1779 | assert(circle_queue_empty(&waitq.wq_q->waitq_queue)); |
1780 | waitq_invalidate(waitq); |
1781 | break; |
1782 | |
1783 | case WQT_TURNSTILE: |
1784 | assert(priority_queue_empty(&waitq.wq_q->waitq_prio_queue)); |
1785 | assert(waitq.wq_q->waitq_inheritor == TURNSTILE_INHERITOR_NULL); |
1786 | waitq_invalidate(waitq); |
1787 | break; |
1788 | |
1789 | case WQT_PORT: |
1790 | assert(waitq.wq_q->waitq_ts == TURNSTILE_NULL); |
1791 | assert(circle_queue_empty(&waitq.wq_q->waitq_links)); |
1792 | break; |
1793 | |
1794 | case WQT_SELECT: |
1795 | assert(waitq.wq_q->waitq_sellinks.next == NULL); |
		assert(circle_queue_empty(&waitq.wq_q->waitq_queue));
1797 | break; |
1798 | |
1799 | case WQT_PORT_SET: |
1800 | assert(circle_queue_empty(&waitq.wqs_set->wqset_queue)); |
1801 | assert(circle_queue_empty(&waitq.wqs_set->wqset_links)); |
1802 | assert(circle_queue_empty(&waitq.wqs_set->wqset_preposts)); |
1803 | break; |
1804 | |
1805 | default: |
1806 | panic("invalid wait type: %p/%d" , waitq.wq_q, type); |
1807 | } |
1808 | |
1809 | /* |
1810 | * The waitq must have been invalidated, or hw_lck_ticket_destroy() |
1811 | * below won't wait for reservations from waitq_lock_reserve(), |
1812 | * or waitq_lock_allow_invalid(). |
1813 | */ |
1814 | assert(!waitq_valid(waitq.wqs_set)); |
1815 | hw_lck_ticket_destroy(&waitq.wq_q->waitq_interlock, &waitq_lck_grp); |
1816 | } |
1817 | |
1818 | |
1819 | #pragma mark port-set sets |
1820 | |
1821 | void |
1822 | waitq_set_unlink_all_locked(struct waitq_set *wqset, waitq_link_list_t *free_l) |
1823 | { |
1824 | uint32_t batch = waitq_set_unlink_batch; |
1825 | |
	waitq_invalidate(wqset);
1827 | |
1828 | for (;;) { |
1829 | struct waitq_link *link; |
1830 | queue_entry_t elt; |
1831 | circle_queue_t q; |
1832 | struct waitq *wq; |
1833 | uint32_t ticket; |
1834 | bool stable = true; |
1835 | |
		if (!circle_queue_empty(&wqset->wqset_links)) {
			q = &wqset->wqset_links;
		} else if (!circle_queue_empty(&wqset->wqset_preposts)) {
			q = &wqset->wqset_preposts;
1840 | } else { |
1841 | break; |
1842 | } |
1843 | |
1844 | if (batch-- == 0) { |
			waitq_unlock(wqset);
			waitq_lock(wqset);
1847 | batch = waitq_set_unlink_batch; |
1848 | continue; |
1849 | } |
1850 | |
		elt = circle_queue_first(q);
1852 | link = cqe_element(elt, struct waitq_link, wql_slink); |
1853 | wq = link->wql_wq; |
1854 | |
1855 | if (__improbable(!waitq_lock_reserve(wq, &ticket))) { |
			waitq_unlock(wqset);
			waitq_lock_wait(wq, ticket);
			waitq_lock(wqset);
			stable = (elt == circle_queue_first(q) && link->wql_wq == wq);
1860 | } |
1861 | |
1862 | if (stable) { |
			circle_dequeue(q, &link->wql_slink);
			circle_dequeue(&wq->waitq_links, &link->wql_qlink);
			wql_list_push(free_l, link);
1866 | } |
1867 | |
		waitq_unlock(wq);
1869 | } |
1870 | } |
1871 | |
1872 | void |
1873 | waitq_clear_prepost_locked(struct waitq *waitq) |
1874 | { |
1875 | assert(waitq_type(waitq) == WQT_PORT); |
1876 | waitq->waitq_preposted = false; |
1877 | } |
1878 | |
1879 | void |
1880 | waitq_set_foreach_member_locked(struct waitq_set *wqs, void (^cb)(struct waitq *)) |
1881 | { |
1882 | struct waitq_link *link; |
1883 | |
1884 | cqe_foreach_element(link, &wqs->wqset_links, wql_slink) { |
1885 | cb(link->wql_wq); |
1886 | } |
1887 | |
1888 | cqe_foreach_element(link, &wqs->wqset_preposts, wql_slink) { |
1889 | cb(link->wql_wq); |
1890 | } |
1891 | } |
1892 | |
1893 | __abortlike |
1894 | static void |
1895 | __waitq_link_arguments_panic(struct waitq *waitq, struct waitq_set *wqset) |
1896 | { |
	if (!waitq_valid(waitq)) {
		panic("Invalid waitq: %p", waitq);
	}
	if (waitq_type(waitq) != WQT_PORT) {
		panic("Invalid waitq type: %p:%d", waitq, waitq->waitq_type);
	}
	panic("Invalid waitq-set: %p", wqset);
1904 | } |
1905 | |
1906 | static inline void |
1907 | __waitq_link_arguments_validate(struct waitq *waitq, struct waitq_set *wqset) |
1908 | { |
	if (!waitq_valid(waitq) ||
	    waitq_type(waitq) != WQT_PORT ||
	    waitq_type(wqset) != WQT_PORT_SET) {
1912 | __waitq_link_arguments_panic(waitq, wqset); |
1913 | } |
1914 | } |
1915 | |
1916 | __abortlike |
1917 | static void |
1918 | __waitq_invalid_panic(waitq_t waitq) |
1919 | { |
1920 | panic("Invalid waitq: %p" , waitq.wq_q); |
1921 | } |
1922 | |
1923 | static void |
1924 | __waitq_validate(waitq_t waitq) |
1925 | { |
1926 | if (!waitq_valid(waitq)) { |
1927 | __waitq_invalid_panic(waitq); |
1928 | } |
1929 | } |
1930 | |
1931 | kern_return_t |
1932 | waitq_link_locked(struct waitq *waitq, struct waitq_set *wqset, |
1933 | waitq_link_t *linkp) |
1934 | { |
1935 | assert(linkp->wqlh); |
1936 | |
1937 | __waitq_link_arguments_validate(waitq, wqset); |
1938 | |
	if (wql_find(waitq, wqset)) {
1940 | return KERN_ALREADY_IN_SET; |
1941 | } |
1942 | |
1943 | linkp->wqll->wql_wq = waitq; |
1944 | linkp->wqll->wql_wqs = (uintptr_t)wqset; |
1945 | |
	if (waitq_valid(wqset)) {
		circle_enqueue_tail(&wqset->wqset_links, &linkp->wqll->wql_slink);
		circle_enqueue_tail(&waitq->waitq_links, &linkp->wqll->wql_qlink);
1949 | *linkp = WQL_NULL; |
1950 | } |
1951 | |
1952 | return KERN_SUCCESS; |
1953 | } |
1954 | |
1955 | kern_return_t |
1956 | waitq_link_prepost_locked(struct waitq *waitq, struct waitq_set *wqset) |
1957 | { |
1958 | struct waitq_link *link; |
1959 | |
1960 | __waitq_link_arguments_validate(waitq, wqset); |
1961 | |
	link = wql_find(waitq, wqset);
1963 | if (link == NULL) { |
1964 | return KERN_NOT_IN_SET; |
1965 | } |
1966 | |
	if (!wql_wqs_preposted(link)) {
		wql_wqs_mark_preposted(link);
		waitq->waitq_preposted = true;
		circle_dequeue(&wqset->wqset_links, &link->wql_slink);
		circle_enqueue_tail(&wqset->wqset_preposts, &link->wql_slink);
1972 | ipc_pset_prepost(wqset, waitq); |
1973 | } |
1974 | |
1975 | return KERN_SUCCESS; |
1976 | } |
1977 | |
1978 | waitq_link_t |
1979 | waitq_unlink_locked(struct waitq *waitq, struct waitq_set *wqset) |
1980 | { |
1981 | struct waitq_link *link; |
1982 | |
1983 | __waitq_link_arguments_validate(waitq, wqset); |
1984 | |
	link = wql_find(waitq, wqset);
	if (link) {
		circle_dequeue(wql_wqs_queue(wqset, link), &link->wql_slink);
		circle_dequeue(&waitq->waitq_links, &link->wql_qlink);
1989 | } |
1990 | |
1991 | return (waitq_link_t){ .wqll = link }; |
1992 | } |
1993 | |
1994 | void |
1995 | waitq_unlink_all_locked(struct waitq *waitq, struct waitq_set *except_wqset, |
1996 | waitq_link_list_t *free_l) |
1997 | { |
1998 | struct waitq_link *kept_link = NULL; |
1999 | struct waitq_link *link; |
2000 | |
2001 | assert(waitq_type(waitq) == WQT_PORT); |
2002 | |
2003 | cqe_foreach_element_safe(link, &waitq->waitq_links, wql_qlink) { |
		waitq_t wqs = wql_wqs(link);
2005 | |
2006 | if (wqs.wqs_set == except_wqset) { |
2007 | kept_link = link; |
2008 | continue; |
2009 | } |
2010 | |
		waitq_lock(wqs);
		circle_dequeue(wql_wqs_queue(wqs.wqs_set, link),
		    &link->wql_slink);
		wql_list_push(free_l, link);
		waitq_unlock(wqs);
2016 | } |
2017 | |
2018 | circle_queue_init(&waitq->waitq_links); |
2019 | if (kept_link) { |
		circle_enqueue_tail(&waitq->waitq_links, &kept_link->wql_qlink);
2021 | } |
2022 | } |
2023 | |
2024 | struct waitq * |
2025 | waitq_set_first_prepost(struct waitq_set *wqset, wqs_prepost_flags_t flags) |
2026 | { |
2027 | circle_queue_t q = &wqset->wqset_preposts; |
2028 | queue_entry_t elt; |
2029 | struct waitq_link *link; |
2030 | struct waitq *wq; |
2031 | uint32_t ticket; |
2032 | |
2033 | if (__improbable(!waitq_valid(wqset))) { |
2034 | return NULL; |
2035 | } |
2036 | |
	while (!circle_queue_empty(q)) {
		elt = circle_queue_first(q);
2039 | link = cqe_element(elt, struct waitq_link, wql_slink); |
2040 | wq = link->wql_wq; |
2041 | |
2042 | if (__improbable(!waitq_lock_reserve(wq, &ticket))) { |
			waitq_unlock(wqset);
			waitq_lock_wait(wq, ticket);
			waitq_lock(wqset);
			if (!waitq_valid(wqset)) {
				waitq_unlock(wq);
2048 | return NULL; |
2049 | } |
2050 | |
			if (elt != circle_queue_first(q) || link->wql_wq != wq) {
				waitq_unlock(wq);
2053 | continue; |
2054 | } |
2055 | } |
2056 | |
2057 | if (wq->waitq_preposted) { |
2058 | if ((flags & WQS_PREPOST_PEEK) == 0) { |
				circle_queue_rotate_head_forward(q);
2060 | } |
2061 | if ((flags & WQS_PREPOST_LOCK) == 0) { |
				waitq_unlock(wq);
2063 | } |
2064 | return wq; |
2065 | } |
2066 | |
2067 | /* |
2068 | * We found a link that is no longer preposted, |
2069 | * someone must have called waitq_clear_prepost_locked() |
		 * and this set has only just noticed.
2071 | */ |
		wql_wqs_clear_preposted(link);
		waitq_unlock(wq);
2074 | |
		circle_dequeue(q, &link->wql_slink);
		circle_enqueue_tail(&wqset->wqset_links, &link->wql_slink);
2077 | } |
2078 | |
2079 | return NULL; |
2080 | } |
2081 | |
2082 | |
2083 | #pragma mark select sets |
2084 | |
2085 | /** |
2086 | * @function select_set_nextid() |
2087 | * |
2088 | * @brief |
2089 | * Generate a unique ID for a select set "generation" |
2090 | * |
2091 | * @discussion |
2092 | * This mixes the CPU number with a monotonic clock |
2093 | * (in order to avoid contention on a global atomic). |
2094 | * |
2095 | * In order for select sets to be invalidated very quickly, |
2096 | * they do not have backward linkages to their member queues. |
2097 | * |
2098 | * Instead, each time a new @c select() "pass" is initiated, |
2099 | * a new ID is generated, which is copied onto the @c waitq_sellink |
2100 | * links at the time of link. |
2101 | * |
 * The zone for select sets is sequestered, which allows select
 * wait queues to speculatively lock their set during prepost
 * and use this ID to debounce wakeups and avoid spurious ones
 * (this is only an optimization: select recovers from spurious
 * wakeups, we just want them to be very rare).
2107 | */ |
2108 | __attribute__((always_inline)) |
2109 | static inline uint64_t |
2110 | select_set_nextid(bool preemption_enabled) |
2111 | { |
2112 | /* waitq_bootstrap() set the low byte to a unique value per CPU */ |
2113 | static_assert(MAX_CPUS <= 256); |
2114 | const uint64_t inc = 256; |
2115 | uint64_t id; |
2116 | |
2117 | #ifdef __x86_64__ |
2118 | /* uncontended atomics are slower than disabling preemption on Intel */ |
2119 | if (preemption_enabled) { |
2120 | disable_preemption(); |
2121 | } |
2122 | id = (*PERCPU_GET(select_setid) += inc); |
2123 | if (preemption_enabled) { |
2124 | enable_preemption(); |
2125 | } |
2126 | #else |
2127 | /* |
	 * If preemption is enabled, this might update another CPU's
	 * setid; that will be rare and is acceptable, since it still
	 * produces a unique select ID.
	 *
	 * We chose this because uncontended atomics on !intel
	 * are faster than disabling/re-enabling preemption.
2134 | */ |
2135 | (void)preemption_enabled; |
2136 | id = os_atomic_add(PERCPU_GET(select_setid), inc, relaxed); |
2137 | #endif |
2138 | |
2139 | return id; |
2140 | } |
2141 | |
2142 | struct select_set * |
2143 | select_set_alloc(void) |
2144 | { |
2145 | struct select_set *selset; |
2146 | selset = zalloc_id(ZONE_ID_SELECT_SET, Z_ZERO | Z_WAITOK | Z_NOFAIL); |
2147 | |
	waitq_init(selset, WQT_SELECT_SET, SYNC_POLICY_FIFO);
2149 | selset->selset_id = select_set_nextid(true); |
2150 | |
2151 | return selset; |
2152 | } |
2153 | |
2154 | __abortlike |
2155 | static void |
2156 | __select_set_link_arguments_panic(struct waitq *waitq, struct select_set *set) |
2157 | { |
	if (!waitq_valid(waitq)) {
		panic("Invalid waitq: %p", waitq);
	}
	if (waitq_type(waitq) != WQT_SELECT) {
		panic("Invalid waitq type: %p:%d", waitq, waitq->waitq_type);
	}
	panic("Invalid waitq-set: %p", set);
2165 | } |
2166 | |
2167 | static inline void |
2168 | __select_set_link_arguments_validate(struct waitq *waitq, struct select_set *set) |
2169 | { |
	if (!waitq_valid(waitq) ||
	    waitq_type(waitq) != WQT_SELECT ||
	    waitq_type(set) != WQT_SELECT_SET) {
2173 | __select_set_link_arguments_panic(waitq, set); |
2174 | } |
2175 | } |
2176 | |
2177 | void |
2178 | select_set_link(struct waitq *waitq, struct select_set *set, |
2179 | waitq_link_t *linkp) |
2180 | { |
2181 | struct waitq_sellink *link; |
2182 | |
2183 | __select_set_link_arguments_validate(waitq, set); |
2184 | |
	waitq_lock(waitq);

	if (waitq == &select_conflict_queue) {
		waitq_lock(set);
		set->selset_conflict = true;
		waitq_unlock(set);
2191 | } |
2192 | |
2193 | wql_list_foreach(link, &waitq->waitq_sellinks) { |
		if (waitq_same(wql_wqs(link), set)) {
2195 | goto found; |
2196 | } |
2197 | } |
2198 | |
2199 | link = linkp->wqls; |
2200 | *linkp = WQL_NULL; |
	wql_list_push(&waitq->waitq_sellinks, link);
2202 | |
2203 | found: |
2204 | link->wql_wqs = (uintptr_t)set; |
2205 | link->wql_setid = set->selset_id; |
	waitq_unlock(waitq);
2207 | } |
2208 | |
2209 | static void |
2210 | select_set_unlink_conflict_queue(struct select_set *set) |
2211 | { |
2212 | struct waitq_link_list_entry **prev; |
2213 | struct waitq_sellink *link; |
2214 | |
	waitq_lock(&select_conflict_queue);
2216 | |
2217 | /* |
2218 | * We know the conflict queue is hooked, |
2219 | * so find the linkage and free it. |
2220 | */ |
2221 | prev = &select_conflict_queue.waitq_sellinks.next; |
2222 | for (;;) { |
2223 | assert(*prev); |
		link = wql_list_elem(*prev);
		if (waitq_same(wql_wqs(link), set)) {
2226 | *prev = link->wql_next.next; |
2227 | break; |
2228 | } |
2229 | prev = &link->wql_next.next; |
2230 | } |
2231 | |
	waitq_unlock(&select_conflict_queue);

	waitq_link_free(WQT_SELECT_SET, link);
2235 | } |
2236 | |
2237 | static void |
2238 | __select_set_reset(struct select_set *set, bool invalidate) |
2239 | { |
2240 | if (set->selset_conflict) { |
2241 | select_set_unlink_conflict_queue(set); |
2242 | } |
2243 | |
	waitq_lock(set);
	if (invalidate) {
		waitq_invalidate(set);
2247 | } |
2248 | set->selset_id = select_set_nextid(false); |
2249 | set->selset_preposted = 0; |
2250 | set->selset_conflict = 0; |
	waitq_unlock(set);
2252 | } |
2253 | |
2254 | void |
2255 | select_set_reset(struct select_set *set) |
2256 | { |
2257 | __select_set_reset(set, false); |
2258 | } |
2259 | |
2260 | void |
2261 | select_set_free(struct select_set *set) |
2262 | { |
2263 | __select_set_reset(set, true); |
2264 | hw_lck_ticket_destroy(&set->selset_interlock, &waitq_lck_grp); |
2265 | zfree_id(ZONE_ID_SELECT_SET, set); |
2266 | } |
2267 | |
2268 | void |
2269 | select_waitq_wakeup_and_deinit( |
2270 | struct waitq *waitq, |
2271 | event64_t wake_event, |
2272 | wait_result_t result) |
2273 | { |
2274 | waitq_link_list_t free_l = { }; |
2275 | |
	if (waitq_is_valid(waitq)) {
		assert(waitq_type(waitq) == WQT_SELECT);

		waitq_lock(waitq);

		waitq_wakeup64_all_locked(waitq, wake_event, result,
		    WAITQ_KEEP_LOCKED);

		waitq_invalidate(waitq);
		free_l = waitq->waitq_sellinks;
		waitq->waitq_sellinks.next = NULL;

		waitq_unlock(waitq);

		waitq_link_free_list(WQT_SELECT, &free_l);

		waitq_deinit(waitq);
2293 | } |
2294 | } |
2295 | |
2296 | #pragma mark assert_wait / wakeup (high level) |
2297 | |
2298 | wait_result_t |
2299 | waitq_assert_wait64(struct waitq *waitq, |
2300 | event64_t wait_event, |
2301 | wait_interrupt_t interruptible, |
2302 | uint64_t deadline) |
2303 | { |
2304 | thread_t thread = current_thread(); |
2305 | wait_result_t ret; |
2306 | spl_t s = 0; |
2307 | |
	__waitq_validate(waitq);

	if (waitq_irq_safe(waitq)) {
		s = splsched();
	}
	waitq_lock(waitq);

	ret = waitq_assert_wait64_locked(waitq, wait_event, interruptible,
	    TIMEOUT_URGENCY_SYS_NORMAL, deadline, TIMEOUT_NO_LEEWAY, thread);

	waitq_unlock(waitq);
	if (waitq_irq_safe(waitq)) {
2320 | splx(s); |
2321 | } |
2322 | |
2323 | return ret; |
2324 | } |
2325 | |
2326 | wait_result_t |
2327 | waitq_assert_wait64_leeway(struct waitq *waitq, |
2328 | event64_t wait_event, |
2329 | wait_interrupt_t interruptible, |
2330 | wait_timeout_urgency_t urgency, |
2331 | uint64_t deadline, |
2332 | uint64_t leeway) |
2333 | { |
2334 | wait_result_t ret; |
2335 | thread_t thread = current_thread(); |
2336 | spl_t s = 0; |
2337 | |
	__waitq_validate(waitq);

	if (waitq_irq_safe(waitq)) {
		s = splsched();
	}
	waitq_lock(waitq);

	ret = waitq_assert_wait64_locked(waitq, wait_event, interruptible,
	    urgency, deadline, leeway, thread);

	waitq_unlock(waitq);
	if (waitq_irq_safe(waitq)) {
2350 | splx(s); |
2351 | } |
2352 | |
2353 | return ret; |
2354 | } |
2355 | |
2356 | kern_return_t |
2357 | waitq_wakeup64_one( |
2358 | waitq_t waitq, |
2359 | event64_t wake_event, |
2360 | wait_result_t result, |
2361 | waitq_wakeup_flags_t flags) |
2362 | { |
2363 | __waitq_validate(waitq); |
2364 | |
2365 | spl_t spl = 0; |
2366 | |
2367 | if (waitq_irq_safe(waitq)) { |
2368 | spl = splsched(); |
2369 | } |
2370 | |
	waitq_lock(waitq);

	/* waitq is unlocked upon return, splx is handled */
	return waitq_wakeup64_one_locked(waitq, wake_event, result,
	    flags | waitq_flags_splx(spl) | WAITQ_UNLOCK);
2376 | } |
2377 | |
2378 | kern_return_t |
2379 | waitq_wakeup64_all( |
2380 | waitq_t waitq, |
2381 | event64_t wake_event, |
2382 | wait_result_t result, |
2383 | waitq_wakeup_flags_t flags) |
2384 | { |
2385 | __waitq_validate(waitq); |
2386 | |
2387 | spl_t spl = 0; |
2388 | |
2389 | if (waitq_irq_safe(waitq)) { |
2390 | spl = splsched(); |
2391 | } |
2392 | |
	waitq_lock(waitq);

	/* waitq is unlocked upon return, splx is handled */
	return waitq_wakeup64_all_locked(waitq, wake_event, result,
	    flags | waitq_flags_splx(spl) | WAITQ_UNLOCK);
2398 | } |
2399 | |
2400 | kern_return_t |
2401 | waitq_wakeup64_thread( |
2402 | struct waitq *waitq, |
2403 | event64_t event, |
2404 | thread_t thread, |
2405 | wait_result_t result) |
2406 | { |
2407 | spl_t s = splsched(); |
2408 | kern_return_t ret; |
2409 | |
	__waitq_validate(waitq);
	assert(waitq_irq_safe(waitq));
	waitq_lock(waitq);
2413 | |
2414 | ret = waitq_wakeup64_thread_and_unlock(waitq, event, thread, result); |
2415 | |
2416 | splx(s); |
2417 | |
2418 | return ret; |
2419 | } |
2420 | |
2421 | thread_t |
2422 | waitq_wakeup64_identify( |
2423 | waitq_t waitq, |
2424 | event64_t wake_event, |
2425 | wait_result_t result, |
2426 | waitq_wakeup_flags_t flags) |
2427 | { |
2428 | __waitq_validate(waitq); |
2429 | |
2430 | spl_t spl = 0; |
2431 | |
2432 | if (waitq_irq_safe(waitq)) { |
2433 | spl = splsched(); |
2434 | } |
2435 | |
	waitq_lock(waitq);

	thread_t thread = waitq_wakeup64_identify_locked(waitq, wake_event,
	    result, flags | waitq_flags_splx(spl) | WAITQ_UNLOCK);
2440 | /* waitq is unlocked, thread is not go-ed yet */ |
2441 | /* preemption disabled if thread non-null */ |
2442 | /* splx is handled */ |
2443 | |
2444 | if (thread != THREAD_NULL) { |
2445 | thread_reference(thread); |
2446 | waitq_resume_identified_thread(waitq, thread, result, flags); |
2447 | /* preemption enabled, thread go-ed */ |
2448 | /* returns +1 ref to running thread */ |
2449 | return thread; |
2450 | } |
2451 | |
2452 | return THREAD_NULL; |
2453 | } |
2454 | |
2455 | |
2456 | #pragma mark tests |
2457 | #if DEBUG || DEVELOPMENT |
2458 | |
2459 | #include <ipc/ipc_pset.h> |
2460 | #include <sys/errno.h> |
2461 | |
2462 | #define MAX_GLOBAL_TEST_QUEUES 64 |
2463 | static struct waitq wqt_waitq_array[MAX_GLOBAL_TEST_QUEUES]; |
2464 | static bool wqt_running; |
2465 | static bool wqt_init; |
2466 | |
2467 | static bool |
2468 | wqt_start(const char *test, int64_t *out) |
2469 | { |
2470 | if (os_atomic_xchg(&wqt_running, true, acquire)) { |
2471 | *out = 0; |
2472 | return false; |
2473 | } |
2474 | |
2475 | if (!wqt_init) { |
2476 | wqt_init = true; |
2477 | for (int i = 0; i < MAX_GLOBAL_TEST_QUEUES; i++) { |
2478 | waitq_init(&wqt_waitq_array[i], WQT_PORT, SYNC_POLICY_FIFO); |
2479 | } |
2480 | } |
2481 | |
2482 | printf("[WQ] starting %s\n" , test); |
2483 | return true; |
2484 | } |
2485 | |
2486 | static int |
2487 | wqt_end(const char *test, int64_t *out) |
2488 | { |
2489 | os_atomic_store(&wqt_running, false, release); |
2490 | printf("[WQ] done %s\n" , test); |
2491 | *out = 1; |
2492 | return 0; |
2493 | } |
2494 | |
2495 | static struct waitq * |
2496 | wqt_wq(uint32_t index) |
2497 | { |
2498 | return &wqt_waitq_array[index]; |
2499 | } |
2500 | |
2501 | static uint32_t |
2502 | wqt_idx(struct waitq *waitq) |
2503 | { |
2504 | assert(waitq >= wqt_waitq_array && |
2505 | waitq < wqt_waitq_array + MAX_GLOBAL_TEST_QUEUES); |
2506 | return (uint32_t)(waitq - wqt_waitq_array); |
2507 | } |
2508 | |
2509 | __attribute__((overloadable)) |
2510 | static uint64_t |
2511 | wqt_bit(uint32_t index) |
2512 | { |
2513 | return 1ull << index; |
2514 | } |
2515 | |
2516 | __attribute__((overloadable)) |
2517 | static uint64_t |
2518 | wqt_bit(struct waitq *waitq) |
2519 | { |
2520 | return wqt_bit(wqt_idx(waitq)); |
2521 | } |
2522 | |
2523 | static struct waitq_set * |
2524 | wqt_wqset_create(void) |
2525 | { |
2526 | struct waitq_set *wqset; |
2527 | |
2528 | wqset = &ipc_pset_alloc_special(ipc_space_kernel)->ips_wqset; |
2529 | printf("[WQ]: created waitq set %p\n" , wqset); |
2530 | return wqset; |
2531 | } |
2532 | |
2533 | static void |
2534 | wqt_wqset_free(struct waitq_set *wqset) |
2535 | { |
2536 | printf("[WQ]: destroying waitq set %p\n" , wqset); |
2537 | waitq_lock(wqset); |
2538 | ipc_pset_destroy(ipc_space_kernel, |
2539 | __container_of(wqset, struct ipc_pset, ips_wqset)); |
2540 | } |
2541 | |
2542 | static void |
2543 | wqt_link(uint32_t index, struct waitq_set *wqset, kern_return_t want) |
2544 | { |
2545 | struct waitq *waitq = wqt_wq(index); |
2546 | waitq_link_t link = waitq_link_alloc(WQT_PORT_SET); |
2547 | kern_return_t kr; |
2548 | |
2549 | printf("[WQ]: linking waitq [%d] to global wqset (%p)\n" , index, wqset); |
2550 | |
2551 | waitq_lock(waitq); |
2552 | waitq_lock(wqset); |
2553 | kr = waitq_link_locked(waitq, wqset, &link); |
2554 | waitq_unlock(wqset); |
2555 | waitq_unlock(waitq); |
2556 | |
2557 | if (link.wqlh) { |
2558 | waitq_link_free(WQT_PORT_SET, link); |
2559 | } |
2560 | |
2561 | printf("[WQ]:\tkr=%d\texpected=%d\n" , kr, want); |
2562 | assert(kr == want); |
2563 | } |
2564 | |
2565 | static void |
2566 | wqt_unlink(uint32_t index, struct waitq_set *wqset, kern_return_t want) |
2567 | { |
2568 | struct waitq *waitq = wqt_wq(index); |
2569 | waitq_link_t link; |
2570 | kern_return_t kr; |
2571 | |
2572 | printf("[WQ]: unlinking waitq [%d] from global wqset (%p)\n" , |
2573 | index, wqset); |
2574 | |
2575 | waitq_lock(waitq); |
2576 | waitq_lock(wqset); |
2577 | link = waitq_unlink_locked(waitq, wqset); |
2578 | waitq_unlock(wqset); |
2579 | waitq_unlock(waitq); |
2580 | |
2581 | if (link.wqlh) { |
2582 | waitq_link_free(WQT_PORT_SET, link); |
2583 | kr = KERN_SUCCESS; |
2584 | } else { |
2585 | kr = KERN_NOT_IN_SET; |
2586 | } |
2587 | |
2588 | printf("[WQ]: \tkr=%d\n" , kr); |
2589 | assert(kr == want); |
2590 | } |
2591 | |
2592 | static void |
2593 | wqt_wakeup_one(uint32_t index, event64_t event64, kern_return_t want) |
2594 | { |
2595 | kern_return_t kr; |
2596 | |
2597 | printf("[WQ]: Waking one thread on waitq [%d] event:0x%llx\n" , |
2598 | index, event64); |
2599 | kr = waitq_wakeup64_one(wqt_wq(index), event64, |
2600 | THREAD_AWAKENED, WAITQ_WAKEUP_DEFAULT); |
2601 | printf("[WQ]: \tkr=%d\n" , kr); |
2602 | assert(kr == want); |
2603 | } |
2604 | |
2605 | static void |
2606 | wqt_clear_preposts(uint32_t idx) |
2607 | { |
2608 | waitq_lock(wqt_wq(idx)); |
2609 | (void)waitq_clear_prepost_locked(wqt_wq(idx)); |
2610 | waitq_unlock(wqt_wq(idx)); |
2611 | } |
2612 | |
2613 | static void |
2614 | wqt_preposts_gc_locked(struct waitq_set *wqset) |
2615 | { |
2616 | circle_queue_t q = &wqset->wqset_preposts; |
2617 | struct waitq_link *link; |
2618 | uint32_t ticket; |
2619 | |
2620 | again: |
2621 | cqe_foreach_element_safe(link, q, wql_slink) { |
2622 | struct waitq *wq = link->wql_wq; |
2623 | |
2624 | if (!waitq_lock_reserve(wq, &ticket)) { |
2625 | waitq_unlock(wqset); |
2626 | waitq_lock_wait(wq, ticket); |
2627 | waitq_lock(wqset); |
2628 | waitq_unlock(wq); |
2629 | /* the list was possibly mutated, restart */ |
2630 | goto again; |
2631 | } |
2632 | |
2633 | if (!wq->waitq_preposted) { |
2634 | wql_wqs_clear_preposted(link); |
2635 | circle_dequeue(q, &link->wql_slink); |
2636 | circle_enqueue_tail(&wqset->wqset_links, &link->wql_slink); |
2637 | } |
2638 | |
2639 | waitq_unlock(wq); |
2640 | } |
2641 | } |
2642 | |
2643 | static void |
2644 | wqt_expect_preposts(struct waitq_set *wqset, uint64_t preposts) |
2645 | { |
2646 | struct waitq_link *link; |
2647 | uint64_t found = 0; |
2648 | |
2649 | waitq_lock(wqset); |
2650 | |
2651 | wqt_preposts_gc_locked(wqset); |
2652 | |
2653 | cqe_foreach_element(link, &wqset->wqset_preposts, wql_slink) { |
2654 | struct waitq *waitq = link->wql_wq; |
2655 | |
2656 | printf("[WQ]: found prepost %d\n" , wqt_idx(waitq)); |
2657 | assertf((found & wqt_bit(waitq)) == 0, |
2658 | "found waitq %d twice" , wqt_idx(waitq)); |
2659 | found |= wqt_bit(waitq); |
2660 | } |
2661 | |
2662 | waitq_unlock(wqset); |
2663 | |
2664 | assertf(found == preposts, "preposts expected 0x%llx, but got 0x%llx" , |
2665 | preposts, found); |
2666 | } |
2667 | |
2668 | static int |
2669 | waitq_basic_test(__unused int64_t in, int64_t *out) |
2670 | { |
2671 | struct waitq_set *wqset; |
2672 | |
2673 | if (!wqt_start(__func__, out)) { |
2674 | return EBUSY; |
2675 | } |
2676 | |
2677 | wqset = wqt_wqset_create(); |
2678 | wqt_link(10, wqset, KERN_SUCCESS); |
2679 | wqt_link(10, wqset, KERN_ALREADY_IN_SET); |
2680 | wqt_link(11, wqset, KERN_SUCCESS); |
2681 | wqt_link(11, wqset, KERN_ALREADY_IN_SET); |
2682 | wqt_link(12, wqset, KERN_SUCCESS); |
2683 | wqt_link(12, wqset, KERN_ALREADY_IN_SET); |
2684 | |
2685 | wqt_wakeup_one(10, NO_EVENT64, KERN_NOT_WAITING); |
2686 | wqt_wakeup_one(12, NO_EVENT64, KERN_NOT_WAITING); |
2687 | |
2688 | wqt_expect_preposts(wqset, wqt_bit(10) | wqt_bit(12)); |
2689 | wqt_clear_preposts(10); |
2690 | |
2691 | wqt_expect_preposts(wqset, wqt_bit(12)); |
2692 | wqt_clear_preposts(12); |
2693 | |
2694 | wqt_expect_preposts(wqset, 0); |
2695 | |
2696 | wqt_unlink(12, wqset, KERN_SUCCESS); |
2697 | wqt_unlink(12, wqset, KERN_NOT_IN_SET); |
2698 | wqt_unlink(11, wqset, KERN_SUCCESS); |
2699 | wqt_unlink(10, wqset, KERN_SUCCESS); |
2700 | wqt_wqset_free(wqset); |
2701 | |
2702 | return wqt_end(__func__, out); |
2703 | } |
2704 | SYSCTL_TEST_REGISTER(waitq_basic, waitq_basic_test); |
2705 | #endif /* DEBUG || DEVELOPMENT */ |
2706 | |