1/*
2 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_FREE_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56
57#include <kern/ast.h>
58#include <kern/backtrace.h>
59#include <kern/kern_types.h>
60#include <kern/mach_param.h>
61#include <kern/percpu.h>
62#include <kern/queue.h>
63#include <kern/sched_prim.h>
64#include <kern/simple_lock.h>
65#include <kern/spl.h>
66#include <kern/waitq.h>
67#include <kern/zalloc.h>
68#include <kern/policy_internal.h>
69#include <kern/turnstile.h>
70
71#include <os/hash.h>
72#include <libkern/section_keywords.h>
73#include <mach/sync_policy.h>
74#include <vm/vm_kern.h>
75
76#include <sys/kdebug.h>
77
78/*!
79 * @const waitq_set_unlink_batch
80 *
81 * @brief
82 * How many links are unhooked under a single set lock hold.
83 *
84 * @discussion
85 * Holding a waitq set lock for too long can cause
 * extreme contention (when a set is being torn down concurrently
 * with messages being sent to ports that used to belong to that set).
88 *
89 * In order to fight this, large wait queue sets will drop
90 * and reacquire their lock for each unlinking batch.
91 */
92static TUNABLE(uint32_t, waitq_set_unlink_batch, "waitq_set_unlink_batch", 64);
93
94/*!
95 * @const WQL_PREPOST_MARKER
96 *
97 * @brief
98 * Marker set in the @c wql_wqs field of wait queue linkages to denote that
99 * this linkage has preposted to its wait queue set already.
100 *
101 * @discussion
102 * This bit is manipulated under both the wait queue and the wait queue set
103 * locks, and is used for two purposes:
104 *
105 * - for port set queues, it denotes in which circle queue the linkage
106 * is queued on (@c waitq_set::wqset_links or @c waitq_set::wqset_preposts)
107 *
108 * - as an optimization during pre-post to not walk sets this link already
109 * preposted to.
110 */
111#define WQL_PREPOST_MARKER 1ul
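
/*
 * Illustrative sketch: the wait queue set pointer and the prepost marker
 * share the @c wql_wqs word, relying on pointer alignment to keep bit 0
 * free (see wql_wqs(), wql_wqs_mark_preposted() and waitq_link_locked()):
 *
 *	link->wql_wqs = (uintptr_t)wqs;                       // linked, not preposted
 *	link->wql_wqs |= WQL_PREPOST_MARKER;                  // mark as preposted
 *	wqs = (void *)(link->wql_wqs & ~WQL_PREPOST_MARKER);  // recover the pointer
 */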
112
113#if __LP64__
114/*!
115 * @struct waitq_link_hdr
116 *
117 * @brief
118 * Common "header" between all linkages, in order to find the waitq_set
119 * of this linkage.
120 *
121 * @discussion
122 * Due to unfortunate alignment constraints on @c queue_chain_t,
123 * this is wildly different for LP64 and ILP32.
124 *
 * Do note that `wql_wqs` is kept at the same offset in every linkage
 * structure (see the static asserts below), which is what makes this
 * common header usable for all of them.
 */
127struct waitq_link_hdr {
128 uintptr_t wql_wqs;
129};
130
131/*!
132 * @struct waitq_sellink
133 *
134 * @brief
135 * Linkages used for select waitq queues to select wait queue sets.
136 *
137 * @discussion
138 * Select linkages are one way (queue to set) for two reasons:
139 *
140 * 1. select doesn't use the wait queue subsystem to discover which file
141 * descriptor woke up the set (it will instead scan all fds again),
142 *
143 * 2. all linkages are unhooked on each syscall return, so we minimize
144 * work to be done to be as quick as possible, using a fast invalidation
145 * scheme based on unique identifiers and sequestering
146 * (see @c select_set_nextid()).
147 */
148struct waitq_sellink {
149 uintptr_t wql_wqs;
150 struct waitq_link_list_entry wql_next;
151 uint64_t wql_setid;
152};
153
154/*!
155 * @struct waitq_link
156 *
157 * @brief
158 * Linkages used for port wait queues and port-set wait queue sets.
159 *
160 * @discussion
161 * Those linkages go both ways so that receiving messages through a port-set
162 * can quickly find ports that preposted to the set.
163 *
164 * It also means that unhooking linkages cannot be lazy.
165 */
166struct waitq_link {
167 uintptr_t wql_wqs; /**< wait queue set for this link */
168 queue_chain_t wql_qlink; /**< linkage through the waitq list */
169 queue_chain_t wql_slink; /**< linkage through the wqset list */
170 struct waitq *wql_wq; /**< wait queue for this link */
171};
172#else
173struct waitq_link_hdr {
174 uint64_t __wql_padding;
175 uintptr_t wql_wqs;
176};
177
178struct waitq_sellink {
179 struct waitq_link_list_entry wql_next;
180 uintptr_t __wql_padding;
181 uintptr_t wql_wqs;
182 uint64_t wql_setid;
183};
184
185struct waitq_link {
186 queue_chain_t wql_qlink;
187 uintptr_t wql_wqs;
188 struct waitq *wql_wq;
189 queue_chain_t wql_slink;
190};
191#endif
192
193static_assert(offsetof(struct waitq_link_hdr, wql_wqs) ==
194 offsetof(struct waitq_sellink, wql_wqs));
195static_assert(offsetof(struct waitq_link_hdr, wql_wqs) ==
196 offsetof(struct waitq_link, wql_wqs));
197static_assert(sizeof(struct waitq) <= WQ_OPAQUE_SIZE, "waitq structure size mismatch");
198static_assert(__alignof(struct waitq) == WQ_OPAQUE_ALIGN, "waitq structure alignment mismatch");
199
200static KALLOC_TYPE_DEFINE(waitq_sellink_zone, struct waitq_sellink, KT_PRIV_ACCT);
201static KALLOC_TYPE_DEFINE(waitq_link_zone, struct waitq_link, KT_PRIV_ACCT);
202ZONE_DEFINE_ID(ZONE_ID_SELECT_SET, "select_set", struct select_set,
203 ZC_SEQUESTER | ZC_NOPGZ | ZC_ZFREE_CLEARMEM);
204
205static LCK_GRP_DECLARE(waitq_lck_grp, "waitq");
206
207static uint64_t PERCPU_DATA(select_setid);
208struct waitq select_conflict_queue;
209
210#pragma mark waitq links
211
212static inline bool
213waitq_is_sellink(waitq_type_t type)
214{
215 return type == WQT_SELECT || type == WQT_SELECT_SET;
216}
217
218static inline bool
219wql_sellink_valid(struct select_set *selset, struct waitq_sellink *link)
220{
221 return waitq_valid(waitq: selset) && selset->selset_id == link->wql_setid;
222}
223
224static waitq_t
225wql_wqs(waitq_link_t link)
226{
227 return (waitq_t){ .wq_q: (void *)(link.wqlh->wql_wqs & ~WQL_PREPOST_MARKER) };
228}
229
230static bool
231wql_wqs_preposted(waitq_link_t link)
232{
233 return link.wqlh->wql_wqs & WQL_PREPOST_MARKER;
234}
235
236static void
237wql_wqs_mark_preposted(waitq_link_t link)
238{
239 assert(!wql_wqs_preposted(link));
240 link.wqlh->wql_wqs |= WQL_PREPOST_MARKER;
241}
242
243static void
244wql_wqs_clear_preposted(waitq_link_t link)
245{
246 assert(wql_wqs_preposted(link));
247 link.wqlh->wql_wqs &= ~WQL_PREPOST_MARKER;
248}
249
250static circle_queue_t
251wql_wqs_queue(struct waitq_set *wqs, struct waitq_link *link)
252{
253 return wql_wqs_preposted(link: link) ? &wqs->wqset_preposts : &wqs->wqset_links;
254}
255
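/*!
 * @function wql_list_push()
 *
 * @brief
 * Pushes @c link at the head of the singly linked list @c list (LIFO).
 */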
256static void
257wql_list_push(waitq_link_list_t *list, waitq_link_t link)
258{
259 link.wqls->wql_next.next = list->next;
260 list->next = &link.wqls->wql_next;
261}
262
263static inline struct waitq_sellink *
264wql_list_elem(struct waitq_link_list_entry *e)
265{
266 return e ? __container_of(e, struct waitq_sellink, wql_next) : NULL;
267}
268
269/*!
270 * @function wql_list_next()
271 *
272 * @brief
273 * Helper function to implement wait queue link list enumeration.
274 *
275 * @param e in: pointer to the current element,
276 * out: pointer to the next element or NULL
277 * @param end which element to stop enumeration at (NULL for lists,
278 * or the first element enumerated for circle queues).
279 * @returns true (makes writing for(;;) based enumerators easier).
280 */
281static inline bool
282wql_list_next(struct waitq_link_list_entry **e, struct waitq_link_list_entry *end)
283{
284 if (*e == NULL || (*e)->next == end) {
285 *e = NULL;
286 } else {
287 *e = (*e)->next;
288 }
289 return true;
290}
291
292#define __wql_list_foreach(it, head, end) \
293 for (struct waitq_link_list_entry *__it = (head)->next, *__end = end; \
294 ((it) = wql_list_elem(__it)); wql_list_next(&__it, __end))
295
296#define wql_list_foreach(it, head) \
297 __wql_list_foreach(it, head, NULL)
298
299#define wql_list_foreach_safe(it, head) \
300 for (struct waitq_link_list_entry *__it = (head)->next; \
301 ((it) = wql_list_elem(__it)) && wql_list_next(&__it, NULL); )
302
/*
 * Gross hack: passing `__it` as the `end` argument of `__wql_list_foreach`
 * makes enumeration stop when it either circles back to the first element
 * or reaches NULL (whichever comes first).
 *
 * This allows a single enumeration function that is oblivious to whether
 * it walks a circle queue or a sellink list.
 */
310#define waitq_link_foreach(link, waitq) \
311 __wql_list_foreach((link).wqls, &(waitq).wq_q->waitq_sellinks, __it)
312
313static_assert(offsetof(struct waitq, waitq_sellinks) ==
314 offsetof(struct waitq, waitq_links));
315static_assert(offsetof(struct waitq_sellink, wql_next) ==
316 offsetof(struct waitq_link, wql_qlink.next));
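
/*
 * Illustrative usage sketch, assuming @c wq is a locked WQT_PORT or
 * WQT_SELECT wait queue (as in do_waitq_select_n_locked_sets() below):
 *
 *	waitq_link_t link;
 *
 *	waitq_link_foreach(link, wq) {
 *		waitq_t wqset = wql_wqs(link);
 *		// inspect or prepost to `wqset` here
 *	}
 */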
317
318static struct waitq_link *
319wql_find(struct waitq *waitq, waitq_t wqset)
320{
321 struct waitq_link *link;
322
323 cqe_foreach_element(link, &waitq->waitq_links, wql_qlink) {
324 if (waitq_same(wq1: wql_wqs(link: link), wq2: wqset)) {
325 return link;
326 }
327 }
328
329 return NULL;
330}
331
332waitq_link_t
333waitq_link_alloc(waitq_type_t type)
334{
335 waitq_link_t link;
336
337 if (waitq_is_sellink(type)) {
338 link.wqls = zalloc_flags(waitq_sellink_zone, Z_WAITOK | Z_ZERO);
339 } else {
340 link.wqll = zalloc_flags(waitq_link_zone, Z_WAITOK | Z_ZERO);
341 }
342 return link;
343}
344
345void
346waitq_link_free(waitq_type_t type, waitq_link_t link)
347{
348 if (waitq_is_sellink(type)) {
349 return zfree(waitq_sellink_zone, link.wqls);
350 } else {
351 return zfree(waitq_link_zone, link.wqll);
352 }
353}
354
355void
356waitq_link_free_list(waitq_type_t type, waitq_link_list_t *free_l)
357{
358 waitq_link_t link;
359
360 wql_list_foreach_safe(link.wqls, free_l) {
361 waitq_link_free(type, link);
362 }
363
364 free_l->next = NULL;
365}
366
367
368#pragma mark global wait queues
369
370static __startup_data struct waitq g_boot_waitq;
371static SECURITY_READ_ONLY_LATE(struct waitq *) global_waitqs = &g_boot_waitq;
372static SECURITY_READ_ONLY_LATE(uint32_t) g_num_waitqs = 1;
373
374/*
375 * Zero out the used MSBs of the event.
376 */
377#define _CAST_TO_EVENT_MASK(event) \
378 ((waitq_flags_t)(uintptr_t)(event) & ((1ul << _EVENT_MASK_BITS) - 1ul))
379
380static inline uint32_t
381waitq_hash(char *key, size_t length)
382{
383 return os_hash_jenkins(data: key, length) & (g_num_waitqs - 1);
384}
385
386/* return a global waitq pointer corresponding to the given event */
387struct waitq *
388_global_eventq(char *event, size_t event_length)
389{
390 return &global_waitqs[waitq_hash(key: event, length: event_length)];
391}
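
/*
 * Illustrative sketch, assuming @c some_object is the object being waited
 * on: the "event" is typically a kernel address, hashed into one of the
 * global wait queue buckets (g_num_waitqs is a power of two, so the mask
 * in waitq_hash() selects the bucket).
 *
 *	struct waitq *wq = _global_eventq((char *)&some_object,
 *	    sizeof(some_object));
 *	// wq == &global_waitqs[os_hash_jenkins(...) & (g_num_waitqs - 1)]
 */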
392
393bool
394waitq_is_valid(waitq_t waitq)
395{
396 return waitq_valid(waitq);
397}
398
399static inline bool
400waitq_is_global(waitq_t waitq)
401{
402 if (waitq_type(wq: waitq) != WQT_QUEUE) {
403 return false;
404 }
405 return waitq.wq_q >= global_waitqs && waitq.wq_q < global_waitqs + g_num_waitqs;
406}
407
408static inline bool
409waitq_empty(waitq_t wq)
410{
411 struct turnstile *ts;
412
413 switch (waitq_type(wq)) {
414 case WQT_TURNSTILE:
415 return priority_queue_empty(&wq.wq_q->waitq_prio_queue);
416 case WQT_PORT:
417 ts = wq.wq_q->waitq_ts;
418 return ts == TURNSTILE_NULL ||
419 priority_queue_empty(&ts->ts_waitq.waitq_prio_queue);
420 case WQT_QUEUE:
421 case WQT_SELECT:
422 case WQT_PORT_SET:
423 case WQT_SELECT_SET:
424 return circle_queue_empty(cq: &wq.wq_q->waitq_queue);
425
426 default:
427 return true;
428 }
429}
430
431#if CONFIG_WAITQ_STATS
432#define NWAITQ_BTFRAMES 5
433
434struct wq_stats {
435 uint64_t waits;
436 uint64_t wakeups;
437 uint64_t clears;
438 uint64_t failed_wakeups;
439
440 uintptr_t last_wait[NWAITQ_BTFRAMES];
441 uintptr_t last_wakeup[NWAITQ_BTFRAMES];
442 uintptr_t last_failed_wakeup[NWAITQ_BTFRAMES];
443};
444
445/* this global is for lldb */
446const uint32_t g_nwaitq_btframes = NWAITQ_BTFRAMES;
447struct wq_stats g_boot_stats;
448struct wq_stats *g_waitq_stats = &g_boot_stats;
449
450static __inline__ void
451waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], unsigned skip)
452{
453 uintptr_t buf[NWAITQ_BTFRAMES + skip];
454
455 memset(buf, 0, (NWAITQ_BTFRAMES + skip) * sizeof(uintptr_t));
456 backtrace(buf, g_nwaitq_btframes + skip, NULL, NULL);
457 memcpy(&bt[0], &buf[skip], NWAITQ_BTFRAMES * sizeof(uintptr_t));
458}
459
460static __inline__ struct wq_stats *
461waitq_global_stats(waitq_t waitq)
462{
463 struct wq_stats *wqs;
464 uint32_t idx;
465
466 if (!waitq_is_global(waitq)) {
467 return NULL;
468 }
469
470 idx = (uint32_t)(waitq.wq_q - global_waitqs);
471 assert(idx < g_num_waitqs);
472 wqs = &g_waitq_stats[idx];
473 return wqs;
474}
475
476static __inline__ void
477waitq_stats_count_wait(waitq_t waitq)
478{
479 struct wq_stats *wqs = waitq_global_stats(waitq);
480 if (wqs != NULL) {
481 wqs->waits++;
482 waitq_grab_backtrace(wqs->last_wait, 2);
483 }
484}
485
486static __inline__ void
487waitq_stats_count_wakeup(waitq_t waitq, int n)
488{
489 struct wq_stats *wqs = waitq_global_stats(waitq);
490 if (wqs != NULL) {
491 if (n > 0) {
492 wqs->wakeups += n;
493 waitq_grab_backtrace(wqs->last_wakeup, 2);
494 } else {
495 wqs->failed_wakeups++;
496 waitq_grab_backtrace(wqs->last_failed_wakeup, 2);
497 }
498 }
499}
500
501static __inline__ void
502waitq_stats_count_clear_wakeup(waitq_t waitq)
503{
504 struct wq_stats *wqs = waitq_global_stats(waitq);
505 if (wqs != NULL) {
506 wqs->wakeups++;
507 wqs->clears++;
508 waitq_grab_backtrace(wqs->last_wakeup, 2);
509 }
510}
511#else /* !CONFIG_WAITQ_STATS */
512#define waitq_stats_count_wait(q) do { } while (0)
513#define waitq_stats_count_wakeup(q, n) do { } while (0)
514#define waitq_stats_count_clear_wakeup(q) do { } while (0)
515#endif
516
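/*
 * Returns the IRQ-safe queue that actually holds waiters for @c waitq:
 * the turnstile's waitq for WQT_PORT queues (NULL if no turnstile has
 * been donated yet), or a hashed global waitq for everything else.
 */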
517static struct waitq *
518waitq_get_safeq(waitq_t waitq)
519{
520 if (waitq_type(wq: waitq) == WQT_PORT) {
521 struct turnstile *ts = waitq.wq_q->waitq_ts;
522 return ts ? &ts->ts_waitq : NULL;
523 }
524
525 uint32_t hash = os_hash_kernel_pointer(pointer: waitq.wq_q);
526 return &global_waitqs[hash & (g_num_waitqs - 1)];
527}
528
529/*
530 * Since the priority ordered waitq uses basepri as the
531 * ordering key assert that this value fits in a uint8_t.
532 */
533static_assert(MAXPRI <= UINT8_MAX);
534
535static inline void
536waitq_thread_insert(struct waitq *safeq, thread_t thread,
537 waitq_t wq, event64_t event)
538{
539 if (waitq_type(wq: safeq) == WQT_TURNSTILE) {
540 turnstile_stats_update(hop: 0, flags: TSU_TURNSTILE_BLOCK_COUNT, NULL);
541 turnstile_waitq_add_thread_priority_queue(wq: safeq, thread);
542 } else {
543 turnstile_stats_update(hop: 0, flags: TSU_REGULAR_WAITQ_BLOCK_COUNT, NULL);
544 /*
545 * This is the extent to which we currently take scheduling
546 * attributes into account:
547 *
548 * - If the thread is vm privileged, we stick it at the front
549 * of the queue, later, these queues will honor the policy
550 * value set at waitq_init time.
551 *
552 * - Realtime threads get priority for wait queue placements.
553 * This allows wait_queue_wakeup_one to prefer a waiting
554 * realtime thread, similar in principle to performing
555 * a wait_queue_wakeup_all and allowing scheduler
556 * prioritization to run the realtime thread, but without
557 * causing the lock contention of that scenario.
558 */
559 if (thread->sched_pri >= BASEPRI_REALTIME ||
560 !safeq->waitq_fifo ||
561 (thread->options & TH_OPT_VMPRIV)) {
562 circle_enqueue_head(cq: &safeq->waitq_queue, elt: &thread->wait_links);
563 } else {
564 circle_enqueue_tail(cq: &safeq->waitq_queue, elt: &thread->wait_links);
565 }
566 }
567
568 /* mark the event and real waitq, even if enqueued on a global safeq */
569 thread->wait_event = event;
570 thread->waitq = wq;
571}
572
573/**
574 * clear the thread-related waitq state, moving the thread from
575 * TH_WAIT to TH_WAIT | TH_WAKING, where it is no longer on a waitq and
576 * can expect to be go'ed in the near future.
577 *
578 * Clearing the waitq prevents further propagation of a turnstile boost
579 * on the thread and stops a clear_wait from succeeding.
580 *
581 * Conditions:
582 * 'thread' is locked, thread is waiting
583 */
584static inline void
585thread_clear_waitq_state(thread_t thread)
586{
587 assert(thread->state & TH_WAIT);
588
589 thread->waitq.wq_q = NULL;
590 thread->wait_event = NO_EVENT64;
591 thread->at_safe_point = FALSE;
592 thread->block_hint = kThreadWaitNone;
593 thread->state |= TH_WAKING;
594}
595
596static inline void
597waitq_thread_remove(waitq_t wq, thread_t thread)
598{
599 if (waitq_type(wq) == WQT_TURNSTILE) {
600 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
601 (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS,
602 (THREAD_REMOVED_FROM_TURNSTILE_WAITQ))) | DBG_FUNC_NONE,
603 VM_KERNEL_UNSLIDE_OR_PERM(waitq_to_turnstile(wq.wq_q)),
604 thread_tid(thread), 0, 0, 0);
605 priority_queue_remove(que: &wq.wq_q->waitq_prio_queue,
606 elt: &thread->wait_prioq_links);
607 } else {
608 circle_dequeue(cq: &wq.wq_q->waitq_queue, elt: &thread->wait_links);
609 if (waitq_is_global(waitq: wq) && waitq_empty(wq)) {
610 wq.wq_q->waitq_eventmask = 0;
611 }
612 }
613
614 thread_clear_waitq_state(thread);
615}
616
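/*
 * A thread may assert a new wait only once it is off any wait queue and
 * no longer in the TH_WAKING phase described above
 * thread_clear_waitq_state().
 */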
617bool
618waitq_wait_possible(thread_t thread)
619{
620 return waitq_is_null(wq: thread->waitq) &&
621 ((thread->state & TH_WAKING) == 0);
622}
623
624__startup_func
625static void
626waitq_bootstrap(void)
627{
628 const uint32_t qsz = sizeof(struct waitq);
629 vm_offset_t whsize;
630 int cpu = 0;
631
632 /*
633 * Determine the amount of memory we're willing to reserve for
634 * the waitqueue hash table
635 */
636 if (!PE_parse_boot_argn(arg_string: "wqsize", arg_ptr: &whsize, max_arg: sizeof(whsize))) {
637 whsize = round_page(x: thread_max * qsz / 5);
638 }
639
640 /*
641 * Determine the number of waitqueues we can fit.
642 * The hash algorithm requires that this be a power of 2.
643 */
644 g_num_waitqs = 0x80000000u >> __builtin_clzl(whsize / qsz);
645 assert(g_num_waitqs > 0);
646 whsize = round_page(x: g_num_waitqs * qsz);
647
648 kmem_alloc(map: kernel_map, addrp: (vm_offset_t *)&global_waitqs, size: whsize,
649 flags: KMA_NOFAIL | KMA_KOBJECT | KMA_NOPAGEWAIT | KMA_PERMANENT,
650 VM_KERN_MEMORY_WAITQ);
651
652#if CONFIG_WAITQ_STATS
653 whsize = round_page(g_num_waitqs * sizeof(struct wq_stats));
654 kmem_alloc(kernel_map, (vm_offset_t *)&g_waitq_stats, whsize,
655 KMA_NOFAIL | KMA_KOBJECT | KMA_NOPAGEWAIT | KMA_ZERO | KMA_PERMANENT,
656 VM_KERN_MEMORY_WAITQ);
657#endif
658
659 for (uint32_t i = 0; i < g_num_waitqs; i++) {
660 waitq_init(waitq: &global_waitqs[i], type: WQT_QUEUE, SYNC_POLICY_FIFO);
661 }
662
663 waitq_init(waitq: &select_conflict_queue, type: WQT_SELECT, SYNC_POLICY_FIFO);
664
665 percpu_foreach(setid, select_setid) {
		/* not cpu_number(): CPUs haven't been numbered yet at this point */
667 *setid = cpu++;
668 }
669}
670STARTUP(MACH_IPC, STARTUP_RANK_FIRST, waitq_bootstrap);
671
672
673#pragma mark locking
674
675static hw_spin_timeout_status_t
676waitq_timeout_handler(void *_lock, hw_spin_timeout_t to, hw_spin_state_t st)
677{
678 lck_spinlock_to_info_t lsti;
679 hw_lck_ticket_t tmp;
680 struct waitq *wq = _lock;
681
682 if (machine_timeout_suspended()) {
683 return HW_LOCK_TIMEOUT_CONTINUE;
684 }
685
686 lsti = lck_spinlock_timeout_hit(lck: &wq->waitq_interlock, owner: 0);
687 tmp.tcurnext = os_atomic_load(&wq->waitq_interlock.tcurnext, relaxed);
688
689 panic("waitq(%p) lock " HW_SPIN_TIMEOUT_FMT "; cpu=%d, "
690 "cticket: 0x%x, nticket: 0x%x, waiting for 0x%x, "
691 HW_SPIN_TIMEOUT_DETAILS_FMT,
692 wq, HW_SPIN_TIMEOUT_ARG(to, st), cpu_number(),
693 tmp.cticket, tmp.nticket, lsti->extra,
694 HW_SPIN_TIMEOUT_DETAILS_ARG(to, st));
695}
696
697static const struct hw_spin_policy waitq_spin_policy = {
698 .hwsp_name = "waitq",
699#if defined(__i386__) || defined(__x86_64__)
700 .hwsp_timeout = &LockTimeOutTSC,
701#else
702 .hwsp_timeout_atomic = &LockTimeOut,
703#endif
704 /*
705 * Double the standard lock timeout, because wait queues tend
706 * to iterate over a number of threads - locking each. If there is
707 * a problem with a thread lock, it normally times out at the wait
708 * queue level first, hiding the real problem.
709 */
710 .hwsp_timeout_shift = 1,
711 .hwsp_lock_offset = offsetof(struct waitq, waitq_interlock),
712 .hwsp_op_timeout = waitq_timeout_handler,
713};
714
715void
716waitq_invalidate(waitq_t waitq)
717{
718 hw_lck_ticket_invalidate(tlock: &waitq.wq_q->waitq_interlock);
719}
720
721bool
722waitq_held(waitq_t wq)
723{
724 return hw_lck_ticket_held(tlock: &wq.wq_q->waitq_interlock);
725}
726
727void
728waitq_lock(waitq_t wq)
729{
730 (void)hw_lck_ticket_lock_to(&wq.wq_q->waitq_interlock,
731 &waitq_spin_policy, &waitq_lck_grp);
732#if defined(__x86_64__)
733 pltrace(FALSE);
734#endif
735}
736
737bool
738waitq_lock_try(waitq_t wq)
739{
740 bool rc = hw_lck_ticket_lock_try(&wq.wq_q->waitq_interlock, &waitq_lck_grp);
741
742#if defined(__x86_64__)
743 if (rc) {
744 pltrace(FALSE);
745 }
746#endif
747 return rc;
748}
749
750bool
751waitq_lock_reserve(waitq_t wq, uint32_t *ticket)
752{
753 return hw_lck_ticket_reserve(&wq.wq_q->waitq_interlock, ticket, &waitq_lck_grp);
754}
755
756void
757waitq_lock_wait(waitq_t wq, uint32_t ticket)
758{
759 (void)hw_lck_ticket_wait(&wq.wq_q->waitq_interlock, ticket,
760 &waitq_spin_policy, &waitq_lck_grp);
761#if defined(__x86_64__)
762 pltrace(FALSE);
763#endif
764}
765
766bool
767waitq_lock_allow_invalid(waitq_t wq)
768{
769 hw_lock_status_t rc;
770
771 rc = hw_lck_ticket_lock_allow_invalid(&wq.wq_q->waitq_interlock,
772 &waitq_spin_policy, &waitq_lck_grp);
773
774#if defined(__x86_64__)
775 if (rc == HW_LOCK_ACQUIRED) {
776 pltrace(FALSE);
777 }
778#endif
779 return rc == HW_LOCK_ACQUIRED;
780}
781
782void
783waitq_unlock(waitq_t wq)
784{
785 assert(waitq_held(wq));
786#if defined(__x86_64__)
787 pltrace(TRUE);
788#endif
789 hw_lck_ticket_unlock(tlock: &wq.wq_q->waitq_interlock);
790}
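
/*
 * Illustrative sketch of the ticket reservation dance used when the
 * waitq lock must be taken while another lock (here a thread lock) is
 * already held, as in waitq_pull_thread_locked() below:
 *
 *	uint32_t ticket;
 *
 *	if (!waitq_lock_reserve(wq, &ticket)) {
 *		thread_unlock(thread);        // drop the inner lock
 *		waitq_lock_wait(wq, ticket);  // wait for our reserved turn
 *		thread_lock(thread);          // retake it and re-validate state
 *	}
 *	// the waitq is now locked
 */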
791
792
793#pragma mark assert_wait / wakeup
794
795struct waitq_select_args {
796 /* input parameters */
797 event64_t event;
798 wait_result_t result;
799 waitq_wakeup_flags_t flags;
800 uint32_t max_threads;
801 bool is_identified;
802
803 /* output parameters */
804 /* counts all woken threads, may have more threads than on threadq */
805 uint32_t nthreads;
806 /* preemption is disabled while threadq is non-empty */
807 circle_queue_head_t threadq;
808};
809
810static inline void
811maybe_adjust_thread_pri(
812 thread_t thread,
813 waitq_wakeup_flags_t flags,
814 __kdebug_only waitq_t waitq)
815{
816 /*
817 * If the caller is requesting the waitq subsystem to promote the
818 * priority of the awoken thread, then boost the thread's priority to
819 * the default WAITQ_BOOST_PRIORITY (if it's not already equal or
820 * higher priority). This boost must be removed via a call to
821 * waitq_clear_promotion_locked before the thread waits again.
822 */
823 if (flags & WAITQ_PROMOTE_PRIORITY) {
824 uintptr_t trace_waitq = 0;
825 if (__improbable(kdebug_enable)) {
826 trace_waitq = VM_KERNEL_UNSLIDE_OR_PERM(waitq.wq_q);
827 }
828
829 sched_thread_promote_reason(thread, TH_SFLAG_WAITQ_PROMOTED, trace_obj: trace_waitq);
830 }
831}
832
833static void
834waitq_select_queue_add(waitq_t waitq, thread_t thread, struct waitq_select_args *args)
835{
836 spl_t s = splsched();
837
838 thread_lock(thread);
839 thread_clear_waitq_state(thread);
840
841 if (!args->is_identified && thread->state & TH_RUN) {
842 /*
843 * A thread that is currently on core may try to clear its own
844 * wait with clear wait or by waking its own event instead of
845 * calling thread_block as is normally expected. After doing
846 * this, it expects to be able to immediately wait again.
847 *
848 * If we are currently on a different CPU and waking that
849 * thread, as soon as we unlock the waitq and thread, that
850 * operation could complete, but we would still be holding the
851 * thread on our flush queue, leaving it in the waking state
852 * where it can't yet assert another wait.
853 *
854 * Since we know that we won't actually need to enqueue the
855 * thread on the runq due to it being on core, we can just
856 * immediately unblock it here so that the thread will be in a
857 * waitable state after we release its thread lock from this
858 * lock hold.
859 *
860 * Wakeups using *_identify can't be allowed to pass
861 * thread block until they're resumed, so they can't use
862 * this path. That means they are not allowed to skip calling
863 * thread_block.
864 */
865 maybe_adjust_thread_pri(thread, flags: args->flags, waitq);
866 thread_go(thread, wresult: args->result, false);
867 } else {
868 if (circle_queue_empty(cq: &args->threadq)) {
869 /*
870 * preemption is disabled while threads are
871 * on threadq - balanced in:
872 * waitq_resume_identified_thread
873 * waitq_select_queue_flush
874 */
875 disable_preemption();
876 }
877
878 circle_enqueue_tail(cq: &args->threadq, elt: &thread->wait_links);
879 }
880
881 thread_unlock(thread);
882
883 splx(s);
884}
885
886
887#if SCHED_HYGIENE_DEBUG
888
889TUNABLE_DEV_WRITEABLE(uint32_t, waitq_flush_excess_threads, "waitq_flush_excess_threads", 20);
890TUNABLE_DEV_WRITEABLE(uint32_t, waitq_flush_excess_time_mt, "waitq_flush_excess_time_mt", 7200); /* 300us */
891
892#endif /* SCHED_HYGIENE_DEBUG */
893
894
895static void
896waitq_select_queue_flush(waitq_t waitq, struct waitq_select_args *args)
897{
898 thread_t thread = THREAD_NULL;
899
900 assert(!circle_queue_empty(&args->threadq));
901
902 int flushed_threads = 0;
903
904#if SCHED_HYGIENE_DEBUG
905 uint64_t start_time = ml_get_sched_hygiene_timebase();
906 disable_preemption();
907#endif /* SCHED_HYGIENE_DEBUG */
908
909 cqe_foreach_element_safe(thread, &args->threadq, wait_links) {
910 circle_dequeue(cq: &args->threadq, elt: &thread->wait_links);
911 assert_thread_magic(thread);
912
913 spl_t s = splsched();
914
915 thread_lock(thread);
916 maybe_adjust_thread_pri(thread, flags: args->flags, waitq);
917 thread_go(thread, wresult: args->result, try_handoff: args->flags & WAITQ_HANDOFF);
918 thread_unlock(thread);
919
920 splx(s);
921
922 flushed_threads++;
923 }
924
925#if SCHED_HYGIENE_DEBUG
926 uint64_t end_time = ml_get_sched_hygiene_timebase();
927
928 /*
929 * Check for a combination of excess threads and long time,
930 * so that a single thread wakeup that gets stuck is still caught
931 */
932 if (waitq_flush_excess_threads && waitq_flush_excess_time_mt &&
933 flushed_threads > waitq_flush_excess_threads &&
934 (end_time - start_time) > waitq_flush_excess_time_mt) {
935 /*
936 * Hack alert:
937 *
938 * If a wakeup-all is done with interrupts disabled, or if
939 * there are enough threads / lock contention to pass the
940 * preemption disable threshold, it can take Too Long to get
941 * through waking up all the threads, leading to
942 * the watchdog going off.
943 *
944 * While we are working on a change to break up this
945 * giant glob of work into smaller chunks, remove this
946 * time region from the watchdog's memory to avoid
947 * unit tests that wake up hundreds of threads on
948 * one semaphore from causing this to blow up.
949 *
950 * We only trigger this when seeing a combination of
951 * excess threads and long time, so that a single
952 * thread wakeup that gets stuck is still caught.
953 *
954 * This was improved with
955 * rdar://90325140
956 * to enable interrupts during most wakeup-all's
957 * and will be removed with
958 * rdar://101110793
959 */
960 if (ml_get_interrupts_enabled() == false) {
961 ml_spin_debug_reset(current_thread());
962 ml_irq_debug_abandon();
963 }
964 abandon_preemption_disable_measurement();
965
966 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_INT_MASKED_RESET), flushed_threads, end_time - start_time);
967 }
968
969 enable_preemption();
970
971#endif /* SCHED_HYGIENE_DEBUG */
972
973 /*
974 * match the disable when making threadq nonempty from
975 * waitq_select_queue_add
976 */
977 enable_preemption();
978}
979
980/**
981 * Routine to iterate over the waitq for non-priority ordered waitqs
982 *
983 * Conditions:
984 * args->waitq (and the posted waitq) is locked
985 *
986 * Notes:
987 * If one or more threads are selected, this may disable preemption,
988 * which is balanced when the threadq is flushed in
989 * waitq_resume_identified_thread or waitq_select_queue_flush.
990 */
991static waitq_flags_t
992waitq_queue_iterate_locked(struct waitq *safeq, struct waitq *waitq,
993 struct waitq_select_args *args)
994{
995 thread_t thread = THREAD_NULL;
996 waitq_flags_t eventmask = 0;
997
998 cqe_foreach_element_safe(thread, &safeq->waitq_queue, wait_links) {
999 assert_thread_magic(thread);
1000
1001 /*
1002 * For non-priority ordered waitqs, we allow multiple events to be
1003 * mux'ed into the same waitq. Also safeqs may contain threads from
1004 * multiple waitqs. Only pick threads that match the
1005 * requested wait event.
1006 */
1007 if (waitq_same(wq1: thread->waitq, wq2: waitq) && thread->wait_event == args->event) {
1008 /* We found a matching thread! Pull it from the queue. */
1009
1010 circle_dequeue(cq: &safeq->waitq_queue, elt: &thread->wait_links);
1011
1012 waitq_select_queue_add(waitq: waitq, thread, args);
1013
1014 if (++args->nthreads >= args->max_threads) {
1015 break;
1016 }
1017 } else {
1018 /* thread wasn't selected so track its event */
1019 eventmask |= waitq_same(wq1: thread->waitq, wq2: safeq)
1020 ? _CAST_TO_EVENT_MASK(thread->wait_event)
1021 : _CAST_TO_EVENT_MASK(thread->waitq.wq_q);
1022 }
1023 }
1024
1025 return eventmask;
1026}
1027
1028/**
1029 * Routine to iterate and remove threads from priority ordered waitqs
1030 *
1031 * Conditions:
1032 * args->waitq (and the posted waitq) is locked
1033 *
1034 * Notes:
1035 * The priority ordered waitqs only support maximum priority element removal.
1036 *
1037 * Also, the implementation makes sure that all threads in a priority ordered
1038 * waitq are waiting on the same wait event. This is not necessarily true for
1039 * non-priority ordered waitqs. If one or more threads are selected, this may
1040 * disable preemption.
1041 */
1042static void
1043waitq_prioq_iterate_locked(
1044 struct waitq *ts_wq,
1045 struct waitq *waitq,
1046 struct waitq_select_args *args)
1047{
1048 struct turnstile *ts = waitq_to_turnstile(ts_wq);
1049 bool update_inheritor = (args->flags & WAITQ_UPDATE_INHERITOR);
1050
1051 if (update_inheritor && args->max_threads == UINT32_MAX) {
1052 /*
1053 * If we are going to wake up all threads,
1054 * go ahead and set the inheritor to NULL.
1055 */
1056 turnstile_kernel_update_inheritor_on_wake_locked(turnstile: ts,
1057 TURNSTILE_INHERITOR_NULL, flags: TURNSTILE_INHERITOR_THREAD);
1058 update_inheritor = false;
1059 }
1060
1061 while (!priority_queue_empty(&ts_wq->waitq_prio_queue)) {
1062 thread_t thread;
1063
1064 thread = priority_queue_remove_max(&ts_wq->waitq_prio_queue,
1065 struct thread, wait_prioq_links);
1066
1067 assert_thread_magic(thread);
1068
1069 /*
1070 * Ensure the wait event matches since priority ordered waitqs do not
1071 * support multiple events in the same waitq.
1072 */
1073 assert(waitq_same(thread->waitq, waitq) && (thread->wait_event == args->event));
1074
1075 if (update_inheritor) {
1076 turnstile_inheritor_t inheritor = thread;
1077
1078 if (priority_queue_empty(&ts_wq->waitq_prio_queue)) {
1079 inheritor = TURNSTILE_INHERITOR_NULL;
1080 }
1081 turnstile_kernel_update_inheritor_on_wake_locked(turnstile: ts,
1082 new_inheritor: inheritor, flags: TURNSTILE_INHERITOR_THREAD);
1083 update_inheritor = false;
1084 }
1085
1086 waitq_select_queue_add(waitq: waitq, thread, args);
1087
1088 if (++args->nthreads >= args->max_threads) {
1089 break;
1090 }
1091 }
1092}
1093
1094/**
1095 * @function do_waitq_select_n_locked_queue
1096 *
1097 * @brief
1098 * Selects threads waiting on a wait queue.
1099 *
1100 * @discussion
1101 * @c waitq is locked.
1102 * If @c waitq is a set, then the wait queue posting to it is locked too.
1103 *
1104 * If one or more threads are selected, this may disable preemption.
1105 */
1106static void
1107do_waitq_select_n_locked_queue(waitq_t waitq, struct waitq_select_args *args)
1108{
1109 spl_t s = 0;
1110
1111 struct waitq *safeq;
1112 waitq_flags_t eventmask, remaining_eventmask;
1113
1114 if (waitq_irq_safe(waitq)) {
1115 eventmask = _CAST_TO_EVENT_MASK(args->event);
1116 safeq = waitq.wq_q;
1117 } else {
1118 /* JMM - add flag to waitq to avoid global lookup if no waiters */
1119 eventmask = _CAST_TO_EVENT_MASK(waitq.wq_q);
1120 safeq = waitq_get_safeq(waitq);
1121 if (safeq == NULL) {
1122 return;
1123 }
1124
1125 s = splsched();
1126 waitq_lock(wq: safeq);
1127 }
1128
1129 /*
1130 * If the safeq doesn't have an eventmask (not global) or the event
1131 * we're looking for IS set in its eventmask, then scan the threads
1132 * in that queue for ones that match the original <waitq,event> pair.
1133 */
1134 if (waitq_type(wq: safeq) == WQT_TURNSTILE) {
1135 waitq_prioq_iterate_locked(ts_wq: safeq, waitq: waitq.wq_q, args);
1136 } else if (!waitq_is_global(waitq: safeq)) {
1137 waitq_queue_iterate_locked(safeq, waitq: waitq.wq_q, args);
1138 } else if ((safeq->waitq_eventmask & eventmask) == eventmask) {
1139 remaining_eventmask = waitq_queue_iterate_locked(safeq,
1140 waitq: waitq.wq_q, args);
1141
1142 /*
1143 * Update the eventmask of global queues we just scanned:
1144 * - If we selected all the threads in the queue,
1145 * we can clear its eventmask.
1146 *
1147 * - If we didn't find enough threads to fill our needs,
1148 * then we can assume we looked at every thread in the queue
1149 * and the mask we computed is complete - so reset it.
1150 */
1151 if (waitq_empty(wq: safeq)) {
1152 safeq->waitq_eventmask = 0;
1153 } else if (args->nthreads < args->max_threads) {
1154 safeq->waitq_eventmask = remaining_eventmask;
1155 }
1156 }
1157
1158 /* unlock the safe queue if we locked one above */
1159 if (!waitq_same(wq1: waitq, wq2: safeq)) {
1160 waitq_unlock(wq: safeq);
1161 splx(s);
1162 }
1163}
1164
1165/**
 * @function do_waitq_select_n_locked_sets()
1167 *
1168 * @brief
1169 * Selects threads waiting on any set a wait queue belongs to,
1170 * or preposts the wait queue onto them.
1171 *
1172 * @discussion
1173 * @c waitq is locked.
1174 */
1175__attribute__((noinline))
1176static void
1177do_waitq_select_n_locked_sets(waitq_t waitq, struct waitq_select_args *args)
1178{
1179 waitq_type_t wq_type = waitq_type(wq: waitq);
1180 waitq_link_t link;
1181
1182 assert(args->event == NO_EVENT64);
1183 assert(waitq_preposts(waitq));
1184
1185 waitq_link_foreach(link, waitq) {
1186 waitq_t wqset = wql_wqs(link);
1187
1188 if (wql_wqs_preposted(link)) {
1189 /*
1190 * The wql_wqs_preposted() bit is cleared
1191 * under both the wq/wqset lock.
1192 *
1193 * If the wqset is still preposted,
1194 * we really won't find threads there.
1195 *
1196 * Just mark the waitq as preposted and move on.
1197 */
1198 if (wq_type == WQT_PORT) {
1199 waitq.wq_q->waitq_preposted = true;
1200 }
1201 continue;
1202 }
1203
1204 if (wq_type == WQT_SELECT) {
1205 /*
1206 * If PGZ picked this select set,
1207 * translate it to the real address
1208 *
1209 * If it is still a select set
1210 * (the slot could have been reused),
1211 * then keep using it for the rest of the logic.
1212 *
1213 * Even in the extremely unlikely case where
1214 * the slot was reused for another select_set,
1215 * the `wql_sellink_valid` check below will
1216 * take care of debouncing it. But we must
1217 * forget the original pointer we read
1218 * so that we unlock the proper object.
1219 */
1220 wqset.wqs_sel = pgz_decode_allow_invalid(wqset.wqs_sel,
1221 ZONE_ID_SELECT_SET);
1222 if (!wqset.wqs_sel) {
1223 continue;
1224 }
1225 if (!waitq_lock_allow_invalid(wq: wqset)) {
1226 continue;
1227 }
1228 if (!wql_sellink_valid(selset: wqset.wqs_sel, link: link.wqls)) {
1229 goto out_unlock;
1230 }
1231 } else {
1232 waitq_lock(wq: wqset);
1233 if (!waitq_valid(waitq: wqset)) {
1234 goto out_unlock;
1235 }
1236 }
1237
1238 /*
1239 * Find any threads waiting on this wait queue set as a queue.
1240 */
1241 do_waitq_select_n_locked_queue(waitq: wqset, args);
1242
1243 if (args->nthreads == 0) {
1244 /* No thread selected: prepost 'waitq' to 'wqset' */
1245 wql_wqs_mark_preposted(link);
1246 if (wq_type == WQT_SELECT) {
1247 wqset.wqs_sel->selset_preposted = true;
1248 } else {
1249 waitq.wq_q->waitq_preposted = true;
1250 circle_dequeue(cq: &wqset.wqs_set->wqset_links,
1251 elt: &link.wqll->wql_slink);
1252 circle_enqueue_tail(cq: &wqset.wqs_set->wqset_preposts,
1253 elt: &link.wqll->wql_slink);
1254 ipc_pset_prepost(wqset: wqset.wqs_set, waitq: waitq.wq_q);
1255 }
1256 }
1257
1258out_unlock:
1259 waitq_unlock(wq: wqset);
1260
1261 if (args->nthreads >= args->max_threads) {
1262 break;
1263 }
1264 }
1265}
1266
1267/**
1268 * @function do_waitq_select_n_locked
1269 *
1270 * @brief
1271 * Selects threads waiting on a wait queue, or preposts it.
1272 *
1273 * @discussion
1274 * @c waitq is locked.
1275 *
1276 * Recurses into all sets this wait queue belongs to.
1277 */
1278static void
1279do_waitq_select_n_locked(waitq_t waitq, struct waitq_select_args *args)
1280{
1281 do_waitq_select_n_locked_queue(waitq, args);
1282
1283 if (args->nthreads >= args->max_threads) {
1284 /* already enough threads found */
1285 return;
1286 }
1287
1288 if (args->event != NO_EVENT64 || !waitq_preposts(wq: waitq)) {
1289 /* this wakeup should not recurse into sets */
1290 return;
1291 }
1292
1293 do_waitq_select_n_locked_sets(waitq, args);
1294}
1295
1296static inline bool
1297waitq_is_preposted_set(waitq_t waitq)
1298{
1299 switch (waitq_type(wq: waitq)) {
1300 case WQT_PORT_SET:
1301 return waitq_set_first_prepost(wqset: waitq.wqs_set, flags: WQS_PREPOST_PEEK) != NULL;
1302
1303 case WQT_SELECT_SET:
1304 return waitq.wqs_sel->selset_preposted;
1305
1306 default:
1307 return false;
1308 }
1309}
1310
1311wait_result_t
1312waitq_assert_wait64_locked(waitq_t waitq,
1313 event64_t wait_event,
1314 wait_interrupt_t interruptible,
1315 wait_timeout_urgency_t urgency,
1316 uint64_t deadline,
1317 uint64_t leeway,
1318 thread_t thread)
1319{
1320 wait_result_t wait_result;
1321 struct waitq *safeq;
1322 uintptr_t eventmask;
1323 spl_t s;
1324
1325 switch (waitq_type(wq: waitq)) {
1326 case WQT_PORT:
1327 case WQT_SELECT:
1328 case WQT_PORT_SET:
1329 case WQT_SELECT_SET:
1330 assert(wait_event == NO_EVENT64);
1331 break;
1332 default:
1333 assert(wait_event != NO_EVENT64);
1334 break;
1335 }
1336
1337 /*
1338 * Warning: Do _not_ place debugging print statements here.
1339 * The waitq is locked!
1340 */
1341 assert(!thread->started || thread == current_thread());
1342
1343 if (!waitq_wait_possible(thread)) {
1344 panic("thread already waiting on %p", thread->waitq.wq_q);
1345 }
1346
1347 s = splsched();
1348
1349 /*
1350 * early-out if the thread is waiting on a wait queue set
1351 * that has already been pre-posted.
1352 *
1353 * Note: waitq_is_preposted_set() may unlock the waitq-set
1354 */
1355 if (waitq_is_preposted_set(waitq)) {
1356 thread_lock(thread);
1357 thread->wait_result = THREAD_AWAKENED;
1358 thread_unlock(thread);
1359 splx(s);
1360 return THREAD_AWAKENED;
1361 }
1362
1363 /*
1364 * If already dealing with an irq safe wait queue, we are all set.
1365 * Otherwise, determine a global queue to use and lock it.
1366 */
1367 if (waitq_irq_safe(waitq)) {
1368 safeq = waitq.wq_q;
1369 eventmask = _CAST_TO_EVENT_MASK(wait_event);
1370 } else {
1371 safeq = waitq_get_safeq(waitq);
1372 if (__improbable(safeq == NULL)) {
1373 panic("Trying to assert_wait on a turnstile proxy "
1374 "that hasn't been donated one (waitq: %p)", waitq.wq_q);
1375 }
1376 eventmask = _CAST_TO_EVENT_MASK(waitq.wq_q);
1377 waitq_lock(wq: safeq);
1378 }
1379
1380 /* lock the thread now that we have the irq-safe waitq locked */
1381 thread_lock(thread);
1382
1383 wait_result = thread_mark_wait_locked(thread, interruptible);
1384 /* thread->wait_result has been set */
1385 if (wait_result == THREAD_WAITING) {
1386 waitq_thread_insert(safeq, thread, wq: waitq, event: wait_event);
1387
1388 if (deadline != 0) {
1389 bool was_active;
1390
1391 was_active = timer_call_enter_with_leeway(call: thread->wait_timer,
1392 NULL,
1393 deadline, leeway,
1394 flags: urgency, FALSE);
1395 if (!was_active) {
1396 thread->wait_timer_active++;
1397 }
1398 thread->wait_timer_armed = true;
1399 }
1400
1401 if (waitq_is_global(waitq: safeq)) {
1402 safeq->waitq_eventmask |= (waitq_flags_t)eventmask;
1403 }
1404
1405 waitq_stats_count_wait(waitq);
1406 }
1407
1408 /* unlock the thread */
1409 thread_unlock(thread);
1410
1411 /* update the inheritor's thread priority if the waitq is embedded in turnstile */
1412 if (waitq_type(wq: safeq) == WQT_TURNSTILE && wait_result == THREAD_WAITING) {
1413 turnstile_recompute_priority_locked(waitq_to_turnstile(safeq));
1414 turnstile_update_inheritor_locked(waitq_to_turnstile(safeq));
1415 }
1416
1417 /* unlock the safeq if we locked it here */
1418 if (!waitq_same(wq1: waitq, wq2: safeq)) {
1419 waitq_unlock(wq: safeq);
1420 }
1421
1422 splx(s);
1423
1424 return wait_result;
1425}
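
/*
 * Illustrative usage sketch, assuming a regular (WQT_QUEUE) wait queue
 * locked by the caller; THREAD_UNINT, TIMEOUT_URGENCY_USER_NORMAL and
 * THREAD_CONTINUE_NULL are standard kernel constants, not defined here:
 *
 *	wait_result_t wr;
 *
 *	waitq_lock(wq);
 *	wr = waitq_assert_wait64_locked(wq, event, THREAD_UNINT,
 *	    TIMEOUT_URGENCY_USER_NORMAL, deadline, 0, current_thread());
 *	waitq_unlock(wq);
 *	if (wr == THREAD_WAITING) {
 *		wr = thread_block(THREAD_CONTINUE_NULL);
 *	}
 */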
1426
1427bool
1428waitq_pull_thread_locked(waitq_t waitq, thread_t thread)
1429{
1430 struct waitq *safeq;
1431 uint32_t ticket;
1432
1433 assert_thread_magic(thread);
1434
1435 /* Find the interrupts disabled queue thread is waiting on */
1436 if (waitq_irq_safe(waitq)) {
1437 safeq = waitq.wq_q;
1438 } else {
1439 safeq = waitq_get_safeq(waitq);
1440 if (__improbable(safeq == NULL)) {
1441 panic("Trying to clear_wait on a turnstile proxy "
1442 "that hasn't been donated one (waitq: %p)", waitq.wq_q);
1443 }
1444 }
1445
1446 /*
1447 * thread is already locked so have to try for the waitq lock.
1448 *
1449 * We can't wait for the waitq lock under the thread lock,
1450 * however we can reserve our slot in the lock queue,
1451 * and if that reservation requires waiting, we are guaranteed
1452 * that this waitq can't die until we got our turn!
1453 */
1454 if (!waitq_lock_reserve(wq: safeq, ticket: &ticket)) {
1455 thread_unlock(thread);
1456 waitq_lock_wait(wq: safeq, ticket);
1457 thread_lock(thread);
1458
1459 if (!waitq_same(wq1: waitq, wq2: thread->waitq)) {
1460 /*
1461 * While we were waiting for our reservation the thread
1462 * stopped waiting on this waitq, bail out.
1463 */
1464 waitq_unlock(wq: safeq);
1465 return false;
1466 }
1467 }
1468
1469 waitq_thread_remove(wq: safeq, thread);
1470 waitq_stats_count_clear_wakeup(waitq);
1471 waitq_unlock(wq: safeq);
1472 return true;
1473}
1474
1475
1476void
1477waitq_clear_promotion_locked(waitq_t waitq, thread_t thread)
1478{
1479 spl_t s = 0;
1480
1481 assert(waitq_held(waitq));
1482 assert(thread != THREAD_NULL);
1483 assert(thread == current_thread());
1484
1485 /* This flag is only cleared by the thread itself, so safe to check outside lock */
1486 if ((thread->sched_flags & TH_SFLAG_WAITQ_PROMOTED) != TH_SFLAG_WAITQ_PROMOTED) {
1487 return;
1488 }
1489
1490 if (!waitq_irq_safe(waitq)) {
1491 s = splsched();
1492 }
1493 thread_lock(thread);
1494
1495 sched_thread_unpromote_reason(thread, TH_SFLAG_WAITQ_PROMOTED, trace_obj: 0);
1496
1497 thread_unlock(thread);
1498 if (!waitq_irq_safe(waitq)) {
1499 splx(s);
1500 }
1501}
1502
1503static inline bool
1504waitq_should_unlock(waitq_wakeup_flags_t flags)
1505{
1506 return (flags & (WAITQ_UNLOCK | WAITQ_KEEP_LOCKED)) == WAITQ_UNLOCK;
1507}
1508
1509static inline bool
1510waitq_should_enable_interrupts(waitq_wakeup_flags_t flags)
1511{
1512 return (flags & (WAITQ_UNLOCK | WAITQ_KEEP_LOCKED | WAITQ_ENABLE_INTERRUPTS)) == (WAITQ_UNLOCK | WAITQ_ENABLE_INTERRUPTS);
1513}
1514
1515kern_return_t
1516waitq_wakeup64_all_locked(
1517 waitq_t waitq,
1518 event64_t wake_event,
1519 wait_result_t result,
1520 waitq_wakeup_flags_t flags)
1521{
1522 struct waitq_select_args args = {
1523 .event = wake_event,
1524 .result = result,
1525 .flags = flags & ~WAITQ_HANDOFF,
1526 .max_threads = UINT32_MAX,
1527 };
1528
1529 assert(waitq_held(waitq));
1530
1531 if (flags & WAITQ_ENABLE_INTERRUPTS) {
1532 assert(waitq_should_unlock(flags));
1533 assert(ml_get_interrupts_enabled() == false);
1534 }
1535
1536 do_waitq_select_n_locked(waitq, args: &args);
1537 waitq_stats_count_wakeup(waitq, args.nthreads);
1538
1539 if (waitq_should_unlock(flags)) {
1540 waitq_unlock(wq: waitq);
1541 }
1542
1543 if (waitq_should_enable_interrupts(flags)) {
1544 ml_set_interrupts_enabled(true);
1545 }
1546
1547 if (!circle_queue_empty(cq: &args.threadq)) {
1548 waitq_select_queue_flush(waitq, args: &args);
1549 }
1550
1551 if (args.nthreads > 0) {
1552 return KERN_SUCCESS;
1553 }
1554
1555 return KERN_NOT_WAITING;
1556}
1557
1558kern_return_t
1559waitq_wakeup64_one_locked(
1560 waitq_t waitq,
1561 event64_t wake_event,
1562 wait_result_t result,
1563 waitq_wakeup_flags_t flags)
1564{
1565 struct waitq_select_args args = {
1566 .event = wake_event,
1567 .result = result,
1568 .flags = flags,
1569 .max_threads = 1,
1570 };
1571
1572 assert(waitq_held(waitq));
1573
1574 if (flags & WAITQ_ENABLE_INTERRUPTS) {
1575 assert(waitq_should_unlock(flags));
1576 assert(ml_get_interrupts_enabled() == false);
1577 }
1578
1579 do_waitq_select_n_locked(waitq, args: &args);
1580 waitq_stats_count_wakeup(waitq, args.nthreads);
1581
1582 if (waitq_should_unlock(flags)) {
1583 waitq_unlock(wq: waitq);
1584 }
1585
1586 if (waitq_should_enable_interrupts(flags)) {
1587 ml_set_interrupts_enabled(true);
1588 }
1589
1590 if (!circle_queue_empty(cq: &args.threadq)) {
1591 waitq_select_queue_flush(waitq, args: &args);
1592 }
1593
1594 if (args.nthreads > 0) {
1595 return KERN_SUCCESS;
1596 }
1597
1598 return KERN_NOT_WAITING;
1599}
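
/*
 * Illustrative usage sketch, assuming the caller holds the waitq lock
 * with interrupts disabled and wants both dropped on the way out:
 *
 *	kern_return_t kr;
 *
 *	kr = waitq_wakeup64_one_locked(wq, event, THREAD_AWAKENED,
 *	    WAITQ_UNLOCK | WAITQ_ENABLE_INTERRUPTS);
 *	// kr == KERN_NOT_WAITING when no thread was waiting on `event`
 */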
1600
1601thread_t
1602waitq_wakeup64_identify_locked(
1603 waitq_t waitq,
1604 event64_t wake_event,
1605 wait_result_t result,
1606 waitq_wakeup_flags_t flags)
1607{
1608 struct waitq_select_args args = {
1609 .event = wake_event,
1610 .result = result,
1611 .flags = flags,
1612 .max_threads = 1,
1613 .is_identified = true,
1614 };
1615
1616 assert(waitq_held(waitq));
1617
1618 do_waitq_select_n_locked(waitq, args: &args);
1619 waitq_stats_count_wakeup(waitq, args.nthreads);
1620
1621 if (waitq_should_unlock(flags)) {
1622 waitq_unlock(wq: waitq);
1623 }
1624
1625 if (waitq_should_enable_interrupts(flags)) {
1626 ml_set_interrupts_enabled(true);
1627 }
1628
1629 if (args.nthreads > 0) {
1630 thread_t thread = cqe_dequeue_head(&args.threadq, struct thread, wait_links);
1631
1632 assert(args.nthreads == 1 && circle_queue_empty(&args.threadq));
1633
1634 /* Thread is off waitq, not unblocked yet */
1635
1636 return thread;
1637 }
1638
1639 return THREAD_NULL;
1640}
1641
1642void
1643waitq_resume_identified_thread(
1644 waitq_t waitq,
1645 thread_t thread,
1646 wait_result_t result,
1647 waitq_wakeup_flags_t flags)
1648{
1649 spl_t spl = splsched();
1650
1651 thread_lock(thread);
1652
1653 assert((thread->state & (TH_WAIT | TH_WAKING)) == (TH_WAIT | TH_WAKING));
1654
1655 maybe_adjust_thread_pri(thread, flags, waitq);
1656 thread_go(thread, wresult: result, try_handoff: (flags & WAITQ_HANDOFF));
1657
1658 thread_unlock(thread);
1659 splx(spl);
1660
1661 enable_preemption(); // balance disable upon pulling thread
1662}
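
/*
 * Illustrative sketch of the two-phase identify/resume pattern, assuming
 * @c wq is locked on entry and kept locked across the wakeup:
 *
 *	thread_t thread;
 *
 *	thread = waitq_wakeup64_identify_locked(wq, event, THREAD_AWAKENED,
 *	    WAITQ_KEEP_LOCKED);
 *	// `thread` is pulled from the waitq but not runnable yet,
 *	// and preemption is disabled if a thread was found
 *	waitq_unlock(wq);
 *	if (thread != THREAD_NULL) {
 *		waitq_resume_identified_thread(wq, thread, THREAD_AWAKENED, 0);
 *	}
 */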
1663
1664void
1665waitq_resume_and_bind_identified_thread(
1666 waitq_t waitq,
1667 thread_t thread,
1668 processor_t processor,
1669 wait_result_t result,
1670 waitq_wakeup_flags_t flags)
1671{
1672 spl_t spl = splsched();
1673
1674 thread_lock(thread);
1675
1676 assert((thread->state & (TH_WAIT | TH_WAKING)) == (TH_WAIT | TH_WAKING));
1677
1678 maybe_adjust_thread_pri(thread, flags, waitq);
1679 thread_bind_during_wakeup(thread, processor);
1680 thread_go(thread, wresult: result, try_handoff: (flags & WAITQ_HANDOFF));
1681
1682 thread_unlock(thread);
1683 splx(spl);
1684
1685 enable_preemption(); // balance disable upon pulling thread
1686}
1687
1688kern_return_t
1689waitq_wakeup64_thread_and_unlock(
1690 struct waitq *waitq,
1691 event64_t event,
1692 thread_t thread,
1693 wait_result_t result)
1694{
1695 kern_return_t ret = KERN_NOT_WAITING;
1696
1697 assert(waitq_irq_safe(waitq));
1698 assert(waitq_held(waitq));
1699 assert_thread_magic(thread);
1700
1701 /*
1702 * See if the thread was still waiting there. If so, it got
1703 * dequeued and returned locked.
1704 *
1705 * By holding the thread locked across the go, a thread on another CPU
1706 * can't see itself in 'waking' state, even if it uses clear_wait.
1707 */
1708 thread_lock(thread);
1709
1710 if (waitq_same(wq1: thread->waitq, wq2: waitq) && thread->wait_event == event) {
1711 waitq_thread_remove(wq: waitq, thread);
1712 ret = KERN_SUCCESS;
1713 }
1714 waitq_stats_count_wakeup(waitq, ret == KERN_SUCCESS ? 1 : 0);
1715
1716 waitq_unlock(wq: waitq);
1717
1718 if (ret == KERN_SUCCESS) {
1719 thread_go(thread, wresult: result, /* handoff */ false);
1720 }
1721
1722 thread_unlock(thread);
1723
1724 return ret;
1725}
1726
1727
1728#pragma mark waitq
1729
1730__attribute__((always_inline))
1731void
1732waitq_init(waitq_t waitq, waitq_type_t type, int policy)
1733{
1734 assert((policy & SYNC_POLICY_FIXED_PRIORITY) == 0);
1735
1736 *waitq.wq_q = (struct waitq){
1737 .waitq_type = type,
1738 .waitq_fifo = ((policy & SYNC_POLICY_REVERSED) == 0),
1739 };
1740
1741 switch (type) {
1742 case WQT_INVALID:
1743 __builtin_trap();
1744
1745 case WQT_TURNSTILE:
1746 /* For turnstile, initialize it as a priority queue */
1747 priority_queue_init(que: &waitq.wq_q->waitq_prio_queue);
1748 assert(waitq.wq_q->waitq_fifo == 0);
1749 break;
1750
1751 case WQT_PORT:
1752 waitq.wq_q->waitq_ts = TURNSTILE_NULL;
1753 break;
1754
1755 case WQT_PORT_SET:
1756 circle_queue_init(&waitq.wqs_set->wqset_preposts);
1757 OS_FALLTHROUGH;
1758 case WQT_SELECT_SET:
1759 case WQT_QUEUE:
1760 case WQT_SELECT:
1761 circle_queue_init(&waitq.wq_q->waitq_queue);
1762 break;
1763 }
1764
1765 if (policy & SYNC_POLICY_INIT_LOCKED) {
1766 hw_lck_ticket_init_locked(&waitq.wq_q->waitq_interlock, &waitq_lck_grp);
1767 } else {
1768 hw_lck_ticket_init(&waitq.wq_q->waitq_interlock, &waitq_lck_grp);
1769 }
1770}
1771
1772void
1773waitq_deinit(waitq_t waitq)
1774{
1775 waitq_type_t type = waitq_type(wq: waitq);
1776
1777 switch (type) {
1778 case WQT_QUEUE:
1779 assert(circle_queue_empty(&waitq.wq_q->waitq_queue));
1780 waitq_invalidate(waitq);
1781 break;
1782
1783 case WQT_TURNSTILE:
1784 assert(priority_queue_empty(&waitq.wq_q->waitq_prio_queue));
1785 assert(waitq.wq_q->waitq_inheritor == TURNSTILE_INHERITOR_NULL);
1786 waitq_invalidate(waitq);
1787 break;
1788
1789 case WQT_PORT:
1790 assert(waitq.wq_q->waitq_ts == TURNSTILE_NULL);
1791 assert(circle_queue_empty(&waitq.wq_q->waitq_links));
1792 break;
1793
1794 case WQT_SELECT:
1795 assert(waitq.wq_q->waitq_sellinks.next == NULL);
1796 assert(circle_queue_empty(&waitq.wqs_set->wqset_queue));
1797 break;
1798
1799 case WQT_PORT_SET:
1800 assert(circle_queue_empty(&waitq.wqs_set->wqset_queue));
1801 assert(circle_queue_empty(&waitq.wqs_set->wqset_links));
1802 assert(circle_queue_empty(&waitq.wqs_set->wqset_preposts));
1803 break;
1804
1805 default:
1806 panic("invalid wait type: %p/%d", waitq.wq_q, type);
1807 }
1808
	/*
	 * The waitq must have been invalidated, otherwise hw_lck_ticket_destroy()
	 * below won't wait for reservations taken by waitq_lock_reserve()
	 * or waitq_lock_allow_invalid().
	 */
1814 assert(!waitq_valid(waitq.wqs_set));
1815 hw_lck_ticket_destroy(&waitq.wq_q->waitq_interlock, &waitq_lck_grp);
1816}
1817
1818
1819#pragma mark port-set sets
1820
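/*
 * Unhooks every link of @c wqset, pushing them onto @c free_l for the
 * caller to free.  The set lock is dropped and retaken every
 * @c waitq_set_unlink_batch iterations to bound lock hold times
 * (see the tunable's discussion at the top of this file).
 */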
1821void
1822waitq_set_unlink_all_locked(struct waitq_set *wqset, waitq_link_list_t *free_l)
1823{
1824 uint32_t batch = waitq_set_unlink_batch;
1825
1826 waitq_invalidate(waitq: wqset);
1827
1828 for (;;) {
1829 struct waitq_link *link;
1830 queue_entry_t elt;
1831 circle_queue_t q;
1832 struct waitq *wq;
1833 uint32_t ticket;
1834 bool stable = true;
1835
1836 if (!circle_queue_empty(cq: &wqset->wqset_links)) {
1837 q = &wqset->wqset_links;
1838 } else if (!circle_queue_empty(cq: &wqset->wqset_preposts)) {
1839 q = &wqset->wqset_preposts;
1840 } else {
1841 break;
1842 }
1843
1844 if (batch-- == 0) {
1845 waitq_unlock(wq: wqset);
1846 waitq_lock(wq: wqset);
1847 batch = waitq_set_unlink_batch;
1848 continue;
1849 }
1850
1851 elt = circle_queue_first(cq: q);
1852 link = cqe_element(elt, struct waitq_link, wql_slink);
1853 wq = link->wql_wq;
1854
1855 if (__improbable(!waitq_lock_reserve(wq, &ticket))) {
1856 waitq_unlock(wq: wqset);
1857 waitq_lock_wait(wq: wq, ticket);
1858 waitq_lock(wq: wqset);
1859 stable = (elt == circle_queue_first(cq: q) && link->wql_wq == wq);
1860 }
1861
1862 if (stable) {
1863 circle_dequeue(cq: q, elt: &link->wql_slink);
1864 circle_dequeue(cq: &wq->waitq_links, elt: &link->wql_qlink);
1865 wql_list_push(list: free_l, link: link);
1866 }
1867
1868 waitq_unlock(wq: wq);
1869 }
1870}
1871
1872void
1873waitq_clear_prepost_locked(struct waitq *waitq)
1874{
1875 assert(waitq_type(waitq) == WQT_PORT);
1876 waitq->waitq_preposted = false;
1877}
1878
1879void
1880waitq_set_foreach_member_locked(struct waitq_set *wqs, void (^cb)(struct waitq *))
1881{
1882 struct waitq_link *link;
1883
1884 cqe_foreach_element(link, &wqs->wqset_links, wql_slink) {
1885 cb(link->wql_wq);
1886 }
1887
1888 cqe_foreach_element(link, &wqs->wqset_preposts, wql_slink) {
1889 cb(link->wql_wq);
1890 }
1891}
1892
1893__abortlike
1894static void
1895__waitq_link_arguments_panic(struct waitq *waitq, struct waitq_set *wqset)
1896{
1897 if (!waitq_valid(waitq: waitq)) {
1898 panic("Invalid waitq: %p", waitq);
1899 }
1900 if (waitq_type(wq: waitq) != WQT_PORT) {
1901 panic("Invalid waitq type: %p:%d", waitq, waitq->waitq_type);
1902 }
1903 panic("Invalid waitq-set: %p", wqset);
1904}
1905
1906static inline void
1907__waitq_link_arguments_validate(struct waitq *waitq, struct waitq_set *wqset)
1908{
1909 if (!waitq_valid(waitq: waitq) ||
1910 waitq_type(wq: waitq) != WQT_PORT ||
1911 waitq_type(wq: wqset) != WQT_PORT_SET) {
1912 __waitq_link_arguments_panic(waitq, wqset);
1913 }
1914}
1915
1916__abortlike
1917static void
1918__waitq_invalid_panic(waitq_t waitq)
1919{
1920 panic("Invalid waitq: %p", waitq.wq_q);
1921}
1922
1923static void
1924__waitq_validate(waitq_t waitq)
1925{
1926 if (!waitq_valid(waitq)) {
1927 __waitq_invalid_panic(waitq);
1928 }
1929}
1930
1931kern_return_t
1932waitq_link_locked(struct waitq *waitq, struct waitq_set *wqset,
1933 waitq_link_t *linkp)
1934{
1935 assert(linkp->wqlh);
1936
1937 __waitq_link_arguments_validate(waitq, wqset);
1938
1939 if (wql_find(waitq, wqset: wqset)) {
1940 return KERN_ALREADY_IN_SET;
1941 }
1942
1943 linkp->wqll->wql_wq = waitq;
1944 linkp->wqll->wql_wqs = (uintptr_t)wqset;
1945
1946 if (waitq_valid(waitq: wqset)) {
1947 circle_enqueue_tail(cq: &wqset->wqset_links, elt: &linkp->wqll->wql_slink);
1948 circle_enqueue_tail(cq: &waitq->waitq_links, elt: &linkp->wqll->wql_qlink);
1949 *linkp = WQL_NULL;
1950 }
1951
1952 return KERN_SUCCESS;
1953}

kern_return_t
waitq_link_prepost_locked(struct waitq *waitq, struct waitq_set *wqset)
{
	struct waitq_link *link;

	__waitq_link_arguments_validate(waitq, wqset);

	link = wql_find(waitq, wqset);
	if (link == NULL) {
		return KERN_NOT_IN_SET;
	}

	if (!wql_wqs_preposted(link)) {
		wql_wqs_mark_preposted(link);
		waitq->waitq_preposted = true;
		circle_dequeue(&wqset->wqset_links, &link->wql_slink);
		circle_enqueue_tail(&wqset->wqset_preposts, &link->wql_slink);
		ipc_pset_prepost(wqset, waitq);
	}

	return KERN_SUCCESS;
}

waitq_link_t
waitq_unlink_locked(struct waitq *waitq, struct waitq_set *wqset)
{
	struct waitq_link *link;

	__waitq_link_arguments_validate(waitq, wqset);

	link = wql_find(waitq, wqset);
	if (link) {
		circle_dequeue(wql_wqs_queue(wqset, link), &link->wql_slink);
		circle_dequeue(&waitq->waitq_links, &link->wql_qlink);
	}

	return (waitq_link_t){ .wqll = link };
}

void
waitq_unlink_all_locked(struct waitq *waitq, struct waitq_set *except_wqset,
    waitq_link_list_t *free_l)
{
	struct waitq_link *kept_link = NULL;
	struct waitq_link *link;

	assert(waitq_type(waitq) == WQT_PORT);

	cqe_foreach_element_safe(link, &waitq->waitq_links, wql_qlink) {
		waitq_t wqs = wql_wqs(link);

		if (wqs.wqs_set == except_wqset) {
			kept_link = link;
			continue;
		}

		waitq_lock(wqs);
		circle_dequeue(wql_wqs_queue(wqs.wqs_set, link),
		    &link->wql_slink);
		wql_list_push(free_l, link);
		waitq_unlock(wqs);
	}

	circle_queue_init(&waitq->waitq_links);
	if (kept_link) {
		circle_enqueue_tail(&waitq->waitq_links, &kept_link->wql_qlink);
	}
}

struct waitq *
waitq_set_first_prepost(struct waitq_set *wqset, wqs_prepost_flags_t flags)
{
	circle_queue_t q = &wqset->wqset_preposts;
	queue_entry_t elt;
	struct waitq_link *link;
	struct waitq *wq;
	uint32_t ticket;

	if (__improbable(!waitq_valid(wqset))) {
		return NULL;
	}

	while (!circle_queue_empty(q)) {
		elt = circle_queue_first(q);
		link = cqe_element(elt, struct waitq_link, wql_slink);
		wq = link->wql_wq;

		if (__improbable(!waitq_lock_reserve(wq, &ticket))) {
			waitq_unlock(wqset);
			waitq_lock_wait(wq, ticket);
			waitq_lock(wqset);
			if (!waitq_valid(wqset)) {
				waitq_unlock(wq);
				return NULL;
			}

			if (elt != circle_queue_first(q) || link->wql_wq != wq) {
				waitq_unlock(wq);
				continue;
			}
		}

		if (wq->waitq_preposted) {
			if ((flags & WQS_PREPOST_PEEK) == 0) {
				circle_queue_rotate_head_forward(q);
			}
			if ((flags & WQS_PREPOST_LOCK) == 0) {
				waitq_unlock(wq);
			}
			return wq;
		}

		/*
		 * We found a link that is no longer preposted:
		 * someone must have called waitq_clear_prepost_locked()
		 * and this set has only just noticed.
		 */
		wql_wqs_clear_preposted(link);
		waitq_unlock(wq);

		circle_dequeue(q, &link->wql_slink);
		circle_enqueue_tail(&wqset->wqset_links, &link->wql_slink);
	}

	return NULL;
}
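
/*
 * Illustrative sketch only (hypothetical caller): WQS_PREPOST_PEEK returns
 * the head of the prepost queue without rotating it (no round-robin), and
 * WQS_PREPOST_LOCK hands the queue back still locked so the caller can
 * consume the prepost before anyone else touches it:
 *
 *	waitq_lock(wqset);
 *	wq = waitq_set_first_prepost(wqset, WQS_PREPOST_LOCK);
 *	if (wq) {
 *		// wq is locked here; consume the prepost, then...
 *		waitq_unlock(wq);
 *	}
 *	waitq_unlock(wqset);
 */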


#pragma mark select sets

/**
 * @function select_set_nextid()
 *
 * @brief
 * Generate a unique ID for a select set "generation".
 *
 * @discussion
 * This mixes the CPU number with a per-CPU monotonic counter
 * (in order to avoid contention on a global atomic).
 *
 * In order for select sets to be invalidated very quickly,
 * they do not have backward linkages to their member queues.
 *
 * Instead, each time a new @c select() "pass" is initiated,
 * a new ID is generated, which is copied onto the @c waitq_sellink
 * links at the time of link.
 *
 * The zone for select sets is sequestered, which allows select
 * wait queues to speculatively lock their set during prepost
 * and use this ID to debounce wakeups and avoid spurious wakeups.
 * (This is only an "optimization": select recovers from spurious
 * wakeups, we just want them to be very rare.)
 */
__attribute__((always_inline))
static inline uint64_t
select_set_nextid(bool preemption_enabled)
{
	/* waitq_bootstrap() set the low byte to a unique value per CPU */
	static_assert(MAX_CPUS <= 256);
	const uint64_t inc = 256;
	uint64_t id;

#ifdef __x86_64__
	/* uncontended atomics are slower than disabling preemption on Intel */
	if (preemption_enabled) {
		disable_preemption();
	}
	id = (*PERCPU_GET(select_setid) += inc);
	if (preemption_enabled) {
		enable_preemption();
	}
#else
	/*
	 * If preemption is enabled, this might update another CPU's
	 * setid; that is rare and acceptable, since it still produces
	 * a unique select ID.
	 *
	 * We chose this because uncontended atomics on non-Intel
	 * are faster than disabling/re-enabling preemption.
	 */
	(void)preemption_enabled;
	id = os_atomic_add(PERCPU_GET(select_setid), inc, relaxed);
#endif

	return id;
}
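
/*
 * Worked example (sketch, derived from the code above): if a CPU's
 * low-byte seed were 0x03, successive calls on that CPU would return
 * 0x103, 0x203, 0x303, and so on. The low 8 bits identify the CPU and
 * the upper bits are a per-CPU counter bumped by 256 on each call, so
 * no two calls ever return the same ID.
 */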

struct select_set *
select_set_alloc(void)
{
	struct select_set *selset;
	selset = zalloc_id(ZONE_ID_SELECT_SET, Z_ZERO | Z_WAITOK | Z_NOFAIL);

	waitq_init(selset, WQT_SELECT_SET, SYNC_POLICY_FIFO);
	selset->selset_id = select_set_nextid(true);

	return selset;
}

__abortlike
static void
__select_set_link_arguments_panic(struct waitq *waitq, struct select_set *set)
{
	if (!waitq_valid(waitq)) {
		panic("Invalid waitq: %p", waitq);
	}
	if (waitq_type(waitq) != WQT_SELECT) {
		panic("Invalid waitq type: %p:%d", waitq, waitq->waitq_type);
	}
	panic("Invalid waitq-set: %p", set);
}

static inline void
__select_set_link_arguments_validate(struct waitq *waitq, struct select_set *set)
{
	if (!waitq_valid(waitq) ||
	    waitq_type(waitq) != WQT_SELECT ||
	    waitq_type(set) != WQT_SELECT_SET) {
		__select_set_link_arguments_panic(waitq, set);
	}
}

void
select_set_link(struct waitq *waitq, struct select_set *set,
    waitq_link_t *linkp)
{
	struct waitq_sellink *link;

	__select_set_link_arguments_validate(waitq, set);

	waitq_lock(waitq);

	if (waitq == &select_conflict_queue) {
		waitq_lock(set);
		set->selset_conflict = true;
		waitq_unlock(set);
	}

	wql_list_foreach(link, &waitq->waitq_sellinks) {
		if (waitq_same(wql_wqs(link), set)) {
			goto found;
		}
	}

	link = linkp->wqls;
	*linkp = WQL_NULL;
	wql_list_push(&waitq->waitq_sellinks, link);

found:
	link->wql_wqs = (uintptr_t)set;
	link->wql_setid = set->selset_id;
	waitq_unlock(waitq);
}

static void
select_set_unlink_conflict_queue(struct select_set *set)
{
	struct waitq_link_list_entry **prev;
	struct waitq_sellink *link;

	waitq_lock(&select_conflict_queue);

	/*
	 * We know the conflict queue is hooked,
	 * so find the linkage and free it.
	 */
	prev = &select_conflict_queue.waitq_sellinks.next;
	for (;;) {
		assert(*prev);
		link = wql_list_elem(*prev);
		if (waitq_same(wql_wqs(link), set)) {
			*prev = link->wql_next.next;
			break;
		}
		prev = &link->wql_next.next;
	}

	waitq_unlock(&select_conflict_queue);

	waitq_link_free(WQT_SELECT_SET, link);
}

static void
__select_set_reset(struct select_set *set, bool invalidate)
{
	if (set->selset_conflict) {
		select_set_unlink_conflict_queue(set);
	}

	waitq_lock(set);
	if (invalidate) {
		waitq_invalidate(set);
	}
	set->selset_id = select_set_nextid(false);
	set->selset_preposted = 0;
	set->selset_conflict = 0;
	waitq_unlock(set);
}

void
select_set_reset(struct select_set *set)
{
	__select_set_reset(set, false);
}

void
select_set_free(struct select_set *set)
{
	__select_set_reset(set, true);
	hw_lck_ticket_destroy(&set->selset_interlock, &waitq_lck_grp);
	zfree_id(ZONE_ID_SELECT_SET, set);
}

void
select_waitq_wakeup_and_deinit(
	struct waitq *waitq,
	event64_t wake_event,
	wait_result_t result)
{
	waitq_link_list_t free_l = { };

	if (waitq_is_valid(waitq)) {
		assert(waitq_type(waitq) == WQT_SELECT);

		waitq_lock(waitq);

		waitq_wakeup64_all_locked(waitq, wake_event, result,
		    WAITQ_KEEP_LOCKED);

		waitq_invalidate(waitq);
		free_l = waitq->waitq_sellinks;
		waitq->waitq_sellinks.next = NULL;

		waitq_unlock(waitq);

		waitq_link_free_list(WQT_SELECT, &free_l);

		waitq_deinit(waitq);
	}
}

#pragma mark assert_wait / wakeup (high level)

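/*
 * Typical usage sketch (hypothetical caller, for illustration only):
 * a waiter asserts a wait on an event and then blocks, while a waker
 * posts the same event on the same queue.
 *
 *	// waiter
 *	wr = waitq_assert_wait64(wq, CAST_EVENT64_T(obj),
 *	    THREAD_UNINT, 0 );	// deadline 0: wait forever
 *	if (wr == THREAD_WAITING) {
 *		wr = thread_block(THREAD_CONTINUE_NULL);
 *	}
 *
 *	// waker
 *	waitq_wakeup64_one(wq, CAST_EVENT64_T(obj),
 *	    THREAD_AWAKENED, WAITQ_WAKEUP_DEFAULT);
 */
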
wait_result_t
waitq_assert_wait64(struct waitq *waitq,
    event64_t wait_event,
    wait_interrupt_t interruptible,
    uint64_t deadline)
{
	thread_t thread = current_thread();
	wait_result_t ret;
	spl_t s = 0;

	__waitq_validate(waitq);

	if (waitq_irq_safe(waitq)) {
		s = splsched();
	}
	waitq_lock(waitq);

	ret = waitq_assert_wait64_locked(waitq, wait_event, interruptible,
	    TIMEOUT_URGENCY_SYS_NORMAL, deadline, TIMEOUT_NO_LEEWAY, thread);

	waitq_unlock(waitq);
	if (waitq_irq_safe(waitq)) {
		splx(s);
	}

	return ret;
}

wait_result_t
waitq_assert_wait64_leeway(struct waitq *waitq,
    event64_t wait_event,
    wait_interrupt_t interruptible,
    wait_timeout_urgency_t urgency,
    uint64_t deadline,
    uint64_t leeway)
{
	wait_result_t ret;
	thread_t thread = current_thread();
	spl_t s = 0;

	__waitq_validate(waitq);

	if (waitq_irq_safe(waitq)) {
		s = splsched();
	}
	waitq_lock(waitq);

	ret = waitq_assert_wait64_locked(waitq, wait_event, interruptible,
	    urgency, deadline, leeway, thread);

	waitq_unlock(waitq);
	if (waitq_irq_safe(waitq)) {
		splx(s);
	}

	return ret;
}

kern_return_t
waitq_wakeup64_one(
	waitq_t waitq,
	event64_t wake_event,
	wait_result_t result,
	waitq_wakeup_flags_t flags)
{
	__waitq_validate(waitq);

	spl_t spl = 0;

	if (waitq_irq_safe(waitq)) {
		spl = splsched();
	}

	waitq_lock(waitq);

	/* waitq is unlocked upon return, splx is handled */
	return waitq_wakeup64_one_locked(waitq, wake_event, result,
	    flags | waitq_flags_splx(spl) | WAITQ_UNLOCK);
}

kern_return_t
waitq_wakeup64_all(
	waitq_t waitq,
	event64_t wake_event,
	wait_result_t result,
	waitq_wakeup_flags_t flags)
{
	__waitq_validate(waitq);

	spl_t spl = 0;

	if (waitq_irq_safe(waitq)) {
		spl = splsched();
	}

	waitq_lock(waitq);

	/* waitq is unlocked upon return, splx is handled */
	return waitq_wakeup64_all_locked(waitq, wake_event, result,
	    flags | waitq_flags_splx(spl) | WAITQ_UNLOCK);
}

kern_return_t
waitq_wakeup64_thread(
	struct waitq *waitq,
	event64_t event,
	thread_t thread,
	wait_result_t result)
{
	spl_t s = splsched();
	kern_return_t ret;

	__waitq_validate(waitq);
	assert(waitq_irq_safe(waitq));
	waitq_lock(waitq);

	ret = waitq_wakeup64_thread_and_unlock(waitq, event, thread, result);

	splx(s);

	return ret;
}

thread_t
waitq_wakeup64_identify(
	waitq_t waitq,
	event64_t wake_event,
	wait_result_t result,
	waitq_wakeup_flags_t flags)
{
	__waitq_validate(waitq);

	spl_t spl = 0;

	if (waitq_irq_safe(waitq)) {
		spl = splsched();
	}

	waitq_lock(waitq);

	thread_t thread = waitq_wakeup64_identify_locked(waitq, wake_event,
	    result, flags | waitq_flags_splx(spl) | WAITQ_UNLOCK);
	/* waitq is unlocked, thread has not been set running yet */
	/* preemption disabled if thread non-null */
	/* splx is handled */

	if (thread != THREAD_NULL) {
		thread_reference(thread);
		waitq_resume_identified_thread(waitq, thread, result, flags);
		/* preemption enabled, thread has been set running */
		/* returns +1 ref to running thread */
		return thread;
	}

	return THREAD_NULL;
}
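
/*
 * Usage sketch (hypothetical caller): waitq_wakeup64_identify() hands the
 * woken thread back with an extra reference, so the caller is expected to
 * drop that reference once it is done with the thread:
 *
 *	thread_t th = waitq_wakeup64_identify(wq, CAST_EVENT64_T(obj),
 *	    THREAD_AWAKENED, WAITQ_WAKEUP_DEFAULT);
 *	if (th != THREAD_NULL) {
 *		// ... examine or act on 'th', which is already running ...
 *		thread_deallocate(th);
 *	}
 */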


#pragma mark tests
#if DEBUG || DEVELOPMENT

#include <ipc/ipc_pset.h>
#include <sys/errno.h>

#define MAX_GLOBAL_TEST_QUEUES 64
static struct waitq wqt_waitq_array[MAX_GLOBAL_TEST_QUEUES];
static bool wqt_running;
static bool wqt_init;

static bool
wqt_start(const char *test, int64_t *out)
{
	if (os_atomic_xchg(&wqt_running, true, acquire)) {
		*out = 0;
		return false;
	}

	if (!wqt_init) {
		wqt_init = true;
		for (int i = 0; i < MAX_GLOBAL_TEST_QUEUES; i++) {
			waitq_init(&wqt_waitq_array[i], WQT_PORT, SYNC_POLICY_FIFO);
		}
	}

	printf("[WQ] starting %s\n", test);
	return true;
}

static int
wqt_end(const char *test, int64_t *out)
{
	os_atomic_store(&wqt_running, false, release);
	printf("[WQ] done %s\n", test);
	*out = 1;
	return 0;
}

static struct waitq *
wqt_wq(uint32_t index)
{
	return &wqt_waitq_array[index];
}

static uint32_t
wqt_idx(struct waitq *waitq)
{
	assert(waitq >= wqt_waitq_array &&
	    waitq < wqt_waitq_array + MAX_GLOBAL_TEST_QUEUES);
	return (uint32_t)(waitq - wqt_waitq_array);
}

__attribute__((overloadable))
static uint64_t
wqt_bit(uint32_t index)
{
	return 1ull << index;
}

__attribute__((overloadable))
static uint64_t
wqt_bit(struct waitq *waitq)
{
	return wqt_bit(wqt_idx(waitq));
}

static struct waitq_set *
wqt_wqset_create(void)
{
	struct waitq_set *wqset;

	wqset = &ipc_pset_alloc_special(ipc_space_kernel)->ips_wqset;
	printf("[WQ]: created waitq set %p\n", wqset);
	return wqset;
}

static void
wqt_wqset_free(struct waitq_set *wqset)
{
	printf("[WQ]: destroying waitq set %p\n", wqset);
	waitq_lock(wqset);
	ipc_pset_destroy(ipc_space_kernel,
	    __container_of(wqset, struct ipc_pset, ips_wqset));
}

static void
wqt_link(uint32_t index, struct waitq_set *wqset, kern_return_t want)
{
	struct waitq *waitq = wqt_wq(index);
	waitq_link_t link = waitq_link_alloc(WQT_PORT_SET);
	kern_return_t kr;

	printf("[WQ]: linking waitq [%d] to global wqset (%p)\n", index, wqset);

	waitq_lock(waitq);
	waitq_lock(wqset);
	kr = waitq_link_locked(waitq, wqset, &link);
	waitq_unlock(wqset);
	waitq_unlock(waitq);

	if (link.wqlh) {
		waitq_link_free(WQT_PORT_SET, link);
	}

	printf("[WQ]:\tkr=%d\texpected=%d\n", kr, want);
	assert(kr == want);
}

static void
wqt_unlink(uint32_t index, struct waitq_set *wqset, kern_return_t want)
{
	struct waitq *waitq = wqt_wq(index);
	waitq_link_t link;
	kern_return_t kr;

	printf("[WQ]: unlinking waitq [%d] from global wqset (%p)\n",
	    index, wqset);

	waitq_lock(waitq);
	waitq_lock(wqset);
	link = waitq_unlink_locked(waitq, wqset);
	waitq_unlock(wqset);
	waitq_unlock(waitq);

	if (link.wqlh) {
		waitq_link_free(WQT_PORT_SET, link);
		kr = KERN_SUCCESS;
	} else {
		kr = KERN_NOT_IN_SET;
	}

	printf("[WQ]: \tkr=%d\n", kr);
	assert(kr == want);
}

static void
wqt_wakeup_one(uint32_t index, event64_t event64, kern_return_t want)
{
	kern_return_t kr;

	printf("[WQ]: Waking one thread on waitq [%d] event:0x%llx\n",
	    index, event64);
	kr = waitq_wakeup64_one(wqt_wq(index), event64,
	    THREAD_AWAKENED, WAITQ_WAKEUP_DEFAULT);
	printf("[WQ]: \tkr=%d\n", kr);
	assert(kr == want);
}

static void
wqt_clear_preposts(uint32_t idx)
{
	waitq_lock(wqt_wq(idx));
	(void)waitq_clear_prepost_locked(wqt_wq(idx));
	waitq_unlock(wqt_wq(idx));
}

static void
wqt_preposts_gc_locked(struct waitq_set *wqset)
{
	circle_queue_t q = &wqset->wqset_preposts;
	struct waitq_link *link;
	uint32_t ticket;

again:
	cqe_foreach_element_safe(link, q, wql_slink) {
		struct waitq *wq = link->wql_wq;

		if (!waitq_lock_reserve(wq, &ticket)) {
			waitq_unlock(wqset);
			waitq_lock_wait(wq, ticket);
			waitq_lock(wqset);
			waitq_unlock(wq);
			/* the list was possibly mutated, restart */
			goto again;
		}

		if (!wq->waitq_preposted) {
			wql_wqs_clear_preposted(link);
			circle_dequeue(q, &link->wql_slink);
			circle_enqueue_tail(&wqset->wqset_links, &link->wql_slink);
		}

		waitq_unlock(wq);
	}
}

static void
wqt_expect_preposts(struct waitq_set *wqset, uint64_t preposts)
{
	struct waitq_link *link;
	uint64_t found = 0;

	waitq_lock(wqset);

	wqt_preposts_gc_locked(wqset);

	cqe_foreach_element(link, &wqset->wqset_preposts, wql_slink) {
		struct waitq *waitq = link->wql_wq;

		printf("[WQ]: found prepost %d\n", wqt_idx(waitq));
		assertf((found & wqt_bit(waitq)) == 0,
		    "found waitq %d twice", wqt_idx(waitq));
		found |= wqt_bit(waitq);
	}

	waitq_unlock(wqset);

	assertf(found == preposts, "preposts expected 0x%llx, but got 0x%llx",
	    preposts, found);
}

static int
waitq_basic_test(__unused int64_t in, int64_t *out)
{
	struct waitq_set *wqset;

	if (!wqt_start(__func__, out)) {
		return EBUSY;
	}

	wqset = wqt_wqset_create();
	wqt_link(10, wqset, KERN_SUCCESS);
	wqt_link(10, wqset, KERN_ALREADY_IN_SET);
	wqt_link(11, wqset, KERN_SUCCESS);
	wqt_link(11, wqset, KERN_ALREADY_IN_SET);
	wqt_link(12, wqset, KERN_SUCCESS);
	wqt_link(12, wqset, KERN_ALREADY_IN_SET);

	wqt_wakeup_one(10, NO_EVENT64, KERN_NOT_WAITING);
	wqt_wakeup_one(12, NO_EVENT64, KERN_NOT_WAITING);

	wqt_expect_preposts(wqset, wqt_bit(10) | wqt_bit(12));
	wqt_clear_preposts(10);

	wqt_expect_preposts(wqset, wqt_bit(12));
	wqt_clear_preposts(12);

	wqt_expect_preposts(wqset, 0);

	wqt_unlink(12, wqset, KERN_SUCCESS);
	wqt_unlink(12, wqset, KERN_NOT_IN_SET);
	wqt_unlink(11, wqset, KERN_SUCCESS);
	wqt_unlink(10, wqset, KERN_SUCCESS);
	wqt_wqset_free(wqset);

	return wqt_end(__func__, out);
}
SYSCTL_TEST_REGISTER(waitq_basic, waitq_basic_test);
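
/*
 * Note (assumption, for illustration): tests registered through
 * SYSCTL_TEST_REGISTER() are typically driven from user space on
 * DEBUG/DEVELOPMENT kernels via their debug.test sysctl node, e.g.
 * something along the lines of:
 *
 *	sysctl debug.test.waitq_basic=1
 *
 * The exact node name depends on how SYSCTL_TEST_REGISTER() expands.
 */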
#endif /* DEBUG || DEVELOPMENT */