1 | /* |
2 | * Copyright (c) 2013-2020 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | |
29 | #include <mach/mach_types.h> |
30 | #include <mach/machine.h> |
31 | |
32 | #include <machine/machine_routines.h> |
33 | #include <machine/sched_param.h> |
34 | #include <machine/machine_cpu.h> |
35 | |
36 | #include <kern/kern_types.h> |
37 | #include <kern/debug.h> |
38 | #include <kern/mach_param.h> |
39 | #include <kern/machine.h> |
40 | #include <kern/misc_protos.h> |
41 | #include <kern/processor.h> |
42 | #include <kern/queue.h> |
43 | #include <kern/sched.h> |
44 | #include <kern/sched_prim.h> |
45 | #include <kern/task.h> |
46 | #include <kern/thread.h> |
47 | |
48 | #include <sys/kdebug.h> |
49 | |
50 | /* |
51 | * Theory Statement |
52 | * |
53 | * How does the task scheduler work? |
54 | * |
55 | * It schedules threads across a few levels. |
56 | * |
57 | * RT threads are dealt with above us |
58 | * Bound threads go into the per-processor runq |
59 | * Non-bound threads are linked on their task's sched_group's runq |
60 | * sched_groups' sched_entries are linked on the pset's runq |
61 | * |
62 | * TODO: make this explicit - bound threads should have a different enqueue fxn |
63 | * |
64 | * When we choose a new thread, we will decide whether to look at the bound runqueue, the global runqueue |
65 | * or the current group's runqueue, then dequeue the next thread in that runqueue. |
66 | * |
67 | * We then manipulate the sched_entries to reflect the invariant that: |
68 | * Each non-empty priority level in a group's runq is represented by one sched_entry enqueued in the global |
69 | * runqueue. |
70 | * |
71 | * A sched_entry represents a chance at running - for each priority in each task, there is one chance of getting |
72 | * to run. This reduces the excess contention bonus given to processes which have work spread among many threads |
73 | * as compared to processes which do the same amount of work under fewer threads. |
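 *
 * For example, a task with ten runnable threads at priority 31 contributes a single
 * sched_entry at priority 31 to the global runqueue, so at that priority it competes
 * with a task that has only one runnable thread at 31 on roughly equal terms, rather
 * than getting ten times as many chances to be picked.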
74 | * |
75 | * NOTE: Currently, the multiq scheduler only supports one pset. |
76 | * |
77 | * NOTE ABOUT thread->sched_pri: |
78 | * |
79 | * It can change after enqueue - it's changed without pset lock but with thread lock if thread->runq is 0. |
80 | * Therefore we can only depend on it not changing during the enqueue and remove path, not the dequeue. |
81 | * |
82 | * TODO: Future features: |
83 | * |
84 | * Decouple the task priority from the sched_entry priority, allowing for: |
85 | * fast task priority change without having to iterate and re-dispatch all threads in the task. |
86 | * i.e. task-wide priority, task-wide boosting |
87 | * fancier group decay features |
88 | * |
89 | * Group (or task) decay: |
90 | * Decay is used for a few different things: |
91 | * Prioritizing latency-needing threads over throughput-needing threads for time-to-running |
92 | * Balancing work between threads in a process |
93 | * Balancing work done at the same priority between different processes |
94 | * Recovering from priority inversions between two threads in the same process |
95 | * Recovering from priority inversions between two threads in different processes |
96 | * Simulating a proportional share scheduler by allowing lower priority threads |
97 | * to run for a certain percentage of the time |
98 | * |
99 | * Task decay lets us separately address the 'same process' and 'different process' needs, |
100 | * which will allow us to make smarter tradeoffs in different cases. |
101 | * For example, we could resolve priority inversion in the same process by reordering threads without dropping the |
102 | * process below low priority threads in other processes. |
103 | * |
104 | * One lock to rule them all (or at least all the runqueues) instead of the pset locks |
105 | * |
106 | * Shrink sched_entry size to the size of a queue_chain_t by inferring priority, group, and perhaps runq field. |
107 | * The entries array is 5K currently so it'd be really great to reduce. |
108 | * One way to get sched_group below 4K without a new runq structure would be to remove the extra queues above realtime. |
109 | * |
110 | * When preempting a processor, store a flag saying if the preemption |
111 | * was from a thread in the same group or different group, |
112 | * and tell choose_thread about it. |
113 | * |
114 | * When choosing a processor, bias towards those running in the same |
115 | * group as I am running (at the same priority, or within a certain band?). |
116 | * |
117 | * Decide if we need to support psets. |
118 | * Decide how to support psets - do we need duplicate entries for each pset, |
119 | * or can we get away with putting the entry in either one or the other pset? |
120 | * |
121 | * Consider the right way to handle runq count - I don't want to iterate groups. |
122 | * Perhaps keep a global counter. |
123 | * Alternate option - remove it from choose_processor. It doesn't add much value |
124 | * now that we have global runq. |
125 | * |
126 | * Need a better way of finding group to target instead of looking at current_task. |
127 | * Perhaps choose_thread could pass in the current thread? |
128 | * |
129 | * Consider unifying runq copy-pastes. |
130 | * |
131 | * Thoughts on having a group central quantum bucket: |
132 | * |
133 | * I see two algorithms to decide quanta: |
134 | * A) Hand off only when switching thread to thread in the same group |
135 | * B) Allocate and return quanta to the group's pool |
136 | * |
137 | * Issues: |
138 | * If a task blocks completely, should it come back with the leftover quanta |
139 | * or brand new quanta? |
140 | * |
 * Should I put a flag saying 'zero out the quantum you grab when you're dispatched'?
142 | * |
143 | * Resolution: |
144 | * Handing off quanta between threads will help with jumping around in the current task |
145 | * but will not help when a thread from a different task is involved. |
 * Need an algorithm that works with round-robining between threads in different tasks.
 *
 * But wait - round-robining can only be triggered by quantum expiration or blocking.
149 | * We need something that works with preemption or yielding - that's the more interesting idea. |
150 | * |
151 | * Existing algorithm - preemption doesn't re-set quantum, puts thread on head of runq. |
152 | * Blocking or quantum expiration does re-set quantum, puts thread on tail of runq. |
153 | * |
154 | * New algorithm - |
155 | * Hand off quanta when hopping between threads with same sched_group |
 * Even if the thread was blocked, it uses the last thread's remaining quantum when it starts.
157 | * |
 * If we use the 'only cycle the entry at quantum expiration' algorithm, then the quantum pool
 * starts getting interesting.
160 | * |
161 | * A thought - perhaps the handoff approach doesn't work so well in the presence of |
162 | * non-handoff wakeups i.e. wake other thread then wait then block - doesn't mean that |
163 | * woken thread will be what I switch to - other processor may have stolen it. |
164 | * What do we do there? |
165 | * |
166 | * Conclusions: |
167 | * We currently don't know of a scenario where quantum buckets on the task is beneficial. |
168 | * We will instead handoff quantum between threads in the task, and keep quantum |
169 | * on the preempted thread if it's preempted by something outside the task. |
170 | * |
171 | */ |
172 | |
173 | #if DEBUG || DEVELOPMENT |
174 | #define MULTIQ_SANITY_CHECK |
175 | #endif |
176 | |
typedef struct sched_entry {
	queue_chain_t           entry_links;    /* links in the pset entry queue at sched_pri */
	int16_t                 sched_pri;      /* scheduled (current) priority */
	int16_t                 runq;           /* 0 when not enqueued, MULTIQ_ERUNQ when on the main entry queue */
	int32_t                 pad;            /* unused padding */
} *sched_entry_t;
183 | |
184 | typedef run_queue_t entry_queue_t; /* A run queue that holds sched_entries instead of threads */ |
185 | typedef run_queue_t group_runq_t; /* A run queue that is part of a sched_group */ |
186 | |
187 | #define SCHED_ENTRY_NULL ((sched_entry_t) 0) |
188 | #define MULTIQ_ERUNQ (-4) /* Indicates entry is on the main runq */ |
189 | |
190 | /* Each level in the run queue corresponds to one entry in the entries array */ |
191 | struct sched_group { |
192 | struct sched_entry entries[NRQS]; |
193 | struct run_queue runq; |
194 | queue_chain_t sched_groups; |
195 | }; |
196 | |
197 | /* |
198 | * Keep entry on the head of the runqueue while dequeueing threads. |
199 | * Only cycle it to the end of the runqueue when a thread in the task |
200 | * hits its quantum. |
201 | */ |
202 | static boolean_t deep_drain = FALSE; |
203 | |
204 | /* Verify the consistency of the runq before touching it */ |
205 | static boolean_t multiq_sanity_check = FALSE; |
206 | |
207 | /* |
208 | * Draining threads from the current task is preferred |
209 | * when they're less than X steps below the current |
210 | * global highest priority |
211 | */ |
212 | #define DEFAULT_DRAIN_BAND_LIMIT MAXPRI |
213 | static integer_t drain_band_limit; |
214 | |
215 | /* |
216 | * Don't go below this priority level if there is something above it in another task |
217 | */ |
218 | #define DEFAULT_DRAIN_DEPTH_LIMIT MAXPRI_THROTTLE |
219 | static integer_t drain_depth_limit; |
220 | |
221 | /* |
222 | * Don't favor the task when there's something above this priority in another task. |
223 | */ |
224 | #define DEFAULT_DRAIN_CEILING BASEPRI_FOREGROUND |
225 | static integer_t drain_ceiling; |
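
/*
 * Taken together (see sched_multiq_choose_thread below): when the current thread's group
 * still has runnable threads and we are not handling a preemption, the group is favored
 * over the global runqueue unless another task's highest priority is above the group's
 * best and either is at or above drain_ceiling, is above drain_depth_limit while the
 * group's best is at or below it, or is drain_band_limit or more steps above the
 * group's best.
 */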
226 | |
static ZONE_DEFINE_TYPE(sched_group_zone, "sched groups",
    struct sched_group, ZC_NOCALLOUT);
229 | |
230 | static uint64_t num_sched_groups = 0; |
231 | static queue_head_t sched_groups; |
232 | |
static LCK_GRP_DECLARE(sched_groups_lock_grp, "sched_groups");
234 | static LCK_MTX_DECLARE(sched_groups_lock, &sched_groups_lock_grp); |
235 | |
236 | static void |
237 | sched_multiq_init(void); |
238 | |
239 | static thread_t |
240 | sched_multiq_steal_thread(processor_set_t pset); |
241 | |
242 | static void |
243 | sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context); |
244 | |
245 | static boolean_t |
246 | sched_multiq_processor_enqueue(processor_t processor, thread_t thread, |
247 | sched_options_t options); |
248 | |
249 | static boolean_t |
250 | sched_multiq_processor_queue_remove(processor_t processor, thread_t thread); |
251 | |
252 | void |
253 | sched_multiq_quantum_expire(thread_t thread); |
254 | |
255 | static ast_t |
256 | sched_multiq_processor_csw_check(processor_t processor); |
257 | |
258 | static boolean_t |
259 | sched_multiq_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte); |
260 | |
261 | static int |
262 | sched_multiq_runq_count(processor_t processor); |
263 | |
264 | static boolean_t |
265 | sched_multiq_processor_queue_empty(processor_t processor); |
266 | |
267 | static uint64_t |
268 | sched_multiq_runq_stats_count_sum(processor_t processor); |
269 | |
270 | static int |
271 | sched_multiq_processor_bound_count(processor_t processor); |
272 | |
273 | static void |
274 | sched_multiq_pset_init(processor_set_t pset); |
275 | |
276 | static void |
277 | sched_multiq_processor_init(processor_t processor); |
278 | |
279 | static thread_t |
280 | sched_multiq_choose_thread(processor_t processor, int priority, ast_t reason); |
281 | |
282 | static void |
283 | sched_multiq_processor_queue_shutdown(processor_t processor); |
284 | |
285 | static sched_mode_t |
286 | sched_multiq_initial_thread_sched_mode(task_t parent_task); |
287 | |
288 | static bool |
289 | sched_multiq_thread_avoid_processor(processor_t processor, thread_t thread, __unused ast_t reason); |
290 | |
291 | const struct sched_dispatch_table sched_multiq_dispatch = { |
	.sched_name = "multiq",
293 | .init = sched_multiq_init, |
294 | .timebase_init = sched_timeshare_timebase_init, |
295 | .processor_init = sched_multiq_processor_init, |
296 | .pset_init = sched_multiq_pset_init, |
297 | .maintenance_continuation = sched_timeshare_maintenance_continue, |
298 | .choose_thread = sched_multiq_choose_thread, |
299 | .steal_thread_enabled = sched_steal_thread_DISABLED, |
300 | .steal_thread = sched_multiq_steal_thread, |
301 | .compute_timeshare_priority = sched_compute_timeshare_priority, |
302 | .choose_node = sched_choose_node, |
303 | .choose_processor = choose_processor, |
304 | .processor_enqueue = sched_multiq_processor_enqueue, |
305 | .processor_queue_shutdown = sched_multiq_processor_queue_shutdown, |
306 | .processor_queue_remove = sched_multiq_processor_queue_remove, |
307 | .processor_queue_empty = sched_multiq_processor_queue_empty, |
308 | .priority_is_urgent = priority_is_urgent, |
309 | .processor_csw_check = sched_multiq_processor_csw_check, |
310 | .processor_queue_has_priority = sched_multiq_processor_queue_has_priority, |
311 | .initial_quantum_size = sched_timeshare_initial_quantum_size, |
312 | .initial_thread_sched_mode = sched_multiq_initial_thread_sched_mode, |
313 | .can_update_priority = can_update_priority, |
314 | .update_priority = update_priority, |
315 | .lightweight_update_priority = lightweight_update_priority, |
316 | .quantum_expire = sched_multiq_quantum_expire, |
317 | .processor_runq_count = sched_multiq_runq_count, |
318 | .processor_runq_stats_count_sum = sched_multiq_runq_stats_count_sum, |
319 | .processor_bound_count = sched_multiq_processor_bound_count, |
320 | .thread_update_scan = sched_multiq_thread_update_scan, |
321 | .multiple_psets_enabled = FALSE, |
322 | .sched_groups_enabled = TRUE, |
323 | .avoid_processor_enabled = TRUE, |
324 | .thread_avoid_processor = sched_multiq_thread_avoid_processor, |
325 | .processor_balance = sched_SMT_balance, |
326 | |
327 | .rt_runq = sched_rtlocal_runq, |
328 | .rt_init = sched_rtlocal_init, |
329 | .rt_queue_shutdown = sched_rtlocal_queue_shutdown, |
330 | .rt_runq_scan = sched_rtlocal_runq_scan, |
331 | .rt_runq_count_sum = sched_rtlocal_runq_count_sum, |
332 | .rt_steal_thread = sched_rtlocal_steal_thread, |
333 | |
334 | .qos_max_parallelism = sched_qos_max_parallelism, |
335 | .check_spill = sched_check_spill, |
336 | .ipi_policy = sched_ipi_policy, |
337 | .thread_should_yield = sched_thread_should_yield, |
338 | .run_count_incr = sched_run_incr, |
339 | .run_count_decr = sched_run_decr, |
340 | .update_thread_bucket = sched_update_thread_bucket, |
341 | .pset_made_schedulable = sched_pset_made_schedulable, |
342 | .cpu_init_completed = NULL, |
343 | .thread_eligible_for_pset = NULL, |
344 | }; |
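
/*
 * How this table is reached: the scheduler core calls through the SCHED() macros, which
 * dispatch into whichever sched_dispatch_table was selected at boot; when more than one
 * scheduler is compiled in, the selection is typically made by name against .sched_name
 * above (e.g. via a boot argument). This is a general note on usage, not behavior
 * implemented in this file.
 */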
345 | |
346 | |
347 | static void |
348 | sched_multiq_init(void) |
349 | { |
350 | #if defined(MULTIQ_SANITY_CHECK) |
	PE_parse_boot_argn("-multiq-sanity-check", &multiq_sanity_check, sizeof(multiq_sanity_check));
#endif

	PE_parse_boot_argn("-multiq-deep-drain", &deep_drain, sizeof(deep_drain));

	if (!PE_parse_boot_argn("multiq_drain_ceiling", &drain_ceiling, sizeof(drain_ceiling))) {
		drain_ceiling = DEFAULT_DRAIN_CEILING;
	}

	if (!PE_parse_boot_argn("multiq_drain_depth_limit", &drain_depth_limit, sizeof(drain_depth_limit))) {
		drain_depth_limit = DEFAULT_DRAIN_DEPTH_LIMIT;
	}

	if (!PE_parse_boot_argn("multiq_drain_band_limit", &drain_band_limit, sizeof(drain_band_limit))) {
		drain_band_limit = DEFAULT_DRAIN_BAND_LIMIT;
	}

	printf("multiq scheduler config: deep-drain %d, ceiling %d, depth limit %d, band limit %d, sanity check %d\n",
369 | deep_drain, drain_ceiling, drain_depth_limit, drain_band_limit, multiq_sanity_check); |
370 | |
371 | queue_init(&sched_groups); |
372 | |
373 | sched_timeshare_init(); |
374 | } |
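
/*
 * Illustrative usage of the tunables parsed above (the values are examples, not defaults):
 *
 *	nvram boot-args="-multiq-deep-drain multiq_drain_ceiling=60"
 *
 * enables deep drain and raises the ceiling to 60; anything left unset falls back to the
 * DEFAULT_* values.
 */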
375 | |
376 | static void |
377 | sched_multiq_processor_init(processor_t processor) |
378 | { |
	run_queue_init(&processor->runq);
380 | } |
381 | |
382 | static void |
383 | sched_multiq_pset_init(processor_set_t pset) |
384 | { |
	run_queue_init(&pset->pset_runq);
386 | } |
387 | |
388 | static sched_mode_t |
389 | sched_multiq_initial_thread_sched_mode(task_t parent_task) |
390 | { |
391 | if (parent_task == kernel_task) { |
392 | return TH_MODE_FIXED; |
393 | } else { |
394 | return TH_MODE_TIMESHARE; |
395 | } |
396 | } |
397 | |
398 | sched_group_t |
399 | sched_group_create(void) |
400 | { |
401 | sched_group_t sched_group; |
402 | |
403 | if (!SCHED(sched_groups_enabled)) { |
404 | return SCHED_GROUP_NULL; |
405 | } |
406 | |
407 | sched_group = zalloc_flags(sched_group_zone, Z_WAITOK | Z_ZERO); |
408 | |
	run_queue_init(&sched_group->runq);
410 | |
411 | for (size_t i = 0; i < NRQS; i++) { |
412 | sched_group->entries[i].runq = 0; |
413 | sched_group->entries[i].sched_pri = (int16_t)i; |
414 | } |
415 | |
	lck_mtx_lock(&sched_groups_lock);
417 | queue_enter(&sched_groups, sched_group, sched_group_t, sched_groups); |
418 | num_sched_groups++; |
	lck_mtx_unlock(&sched_groups_lock);
420 | |
421 | return sched_group; |
422 | } |
423 | |
424 | void |
425 | sched_group_destroy(sched_group_t sched_group) |
426 | { |
427 | if (!SCHED(sched_groups_enabled)) { |
428 | assert(sched_group == SCHED_GROUP_NULL); |
429 | return; |
430 | } |
431 | |
432 | assert(sched_group != SCHED_GROUP_NULL); |
433 | assert(sched_group->runq.count == 0); |
434 | |
435 | for (int i = 0; i < NRQS; i++) { |
436 | assert(sched_group->entries[i].runq == 0); |
437 | assert(sched_group->entries[i].sched_pri == i); |
438 | } |
439 | |
	lck_mtx_lock(&sched_groups_lock);
	queue_remove(&sched_groups, sched_group, sched_group_t, sched_groups);
	num_sched_groups--;
	lck_mtx_unlock(&sched_groups_lock);
444 | |
445 | zfree(sched_group_zone, sched_group); |
446 | } |
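
/*
 * Lifecycle sketch (illustrative - the callers live outside this file): a group is
 * presumably created for each task while this scheduler is active (the theory statement
 * above ties threads to their task's sched_group) and destroyed once no threads are left
 * on it, roughly:
 *
 *	sched_group_t group = sched_group_create();	// SCHED_GROUP_NULL if groups are disabled
 *	...threads in the task run with thread->sched_group == group...
 *	sched_group_destroy(group);			// asserts the group runq is drained
 */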
447 | |
448 | __attribute__((always_inline)) |
449 | static inline entry_queue_t |
450 | multiq_main_entryq(processor_t processor) |
451 | { |
452 | return (entry_queue_t)&processor->processor_set->pset_runq; |
453 | } |
454 | |
455 | __attribute__((always_inline)) |
456 | static inline run_queue_t |
457 | multiq_bound_runq(processor_t processor) |
458 | { |
459 | return &processor->runq; |
460 | } |
461 | |
462 | __attribute__((always_inline)) |
463 | static inline sched_entry_t |
464 | group_entry_for_pri(sched_group_t group, integer_t pri) |
465 | { |
466 | return &group->entries[pri]; |
467 | } |
468 | |
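/*
 * Recover the owning group from one of its embedded entries. 'entries' is the first
 * member of struct sched_group and entries[pri].sched_pri == pri (set up in
 * sched_group_create), so stepping back sched_pri elements lands on &group->entries[0],
 * which has the same address as the group. For example, if entry == &group->entries[31],
 * then entry - 31 == &group->entries[0] == (sched_entry_t)group.
 */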
469 | __attribute__((always_inline)) |
470 | static inline sched_group_t |
471 | group_for_entry(sched_entry_t entry) |
472 | { |
473 | #pragma clang diagnostic push |
474 | #pragma clang diagnostic ignored "-Wcast-align" |
475 | sched_group_t group = (sched_group_t)(entry - entry->sched_pri); |
476 | #pragma clang diagnostic pop |
477 | return group; |
478 | } |
479 | |
480 | /* Peek at the head of the runqueue */ |
481 | static sched_entry_t |
482 | entry_queue_first_entry(entry_queue_t rq) |
483 | { |
484 | assert(rq->count != 0); |
485 | |
486 | circle_queue_t queue = &rq->queues[rq->highq]; |
487 | |
488 | sched_entry_t entry = cqe_queue_first(queue, struct sched_entry, entry_links); |
489 | |
490 | assert(entry->sched_pri == rq->highq); |
491 | |
492 | return entry; |
493 | } |
494 | |
495 | #if defined(MULTIQ_SANITY_CHECK) |
496 | |
497 | #if MACH_ASSERT |
498 | __attribute__((always_inline)) |
499 | static inline boolean_t |
500 | queue_chain_linked(queue_chain_t* chain) |
501 | { |
502 | if (chain->next != NULL) { |
503 | assert(chain->prev != NULL); |
504 | return TRUE; |
505 | } else { |
506 | assert(chain->prev == NULL); |
507 | return FALSE; |
508 | } |
509 | } |
510 | #endif /* MACH_ASSERT */ |
511 | |
512 | static thread_t |
513 | group_first_thread(sched_group_t group) |
514 | { |
515 | group_runq_t rq = &group->runq; |
516 | |
517 | assert(rq->count != 0); |
518 | |
519 | circle_queue_t queue = &rq->queues[rq->highq]; |
520 | |
521 | thread_t thread = cqe_queue_first(queue, struct thread, runq_links); |
522 | |
523 | assert(thread != THREAD_NULL); |
524 | assert_thread_magic(thread); |
525 | |
526 | assert(thread->sched_group == group); |
527 | |
528 | /* TODO: May not be safe */ |
529 | assert(thread->sched_pri == rq->highq); |
530 | |
531 | return thread; |
532 | } |
533 | |
/* Panics if the entry is not in the entry runq at the expected priority */
535 | static void |
536 | entry_queue_check_entry(entry_queue_t runq, sched_entry_t entry, int expected_pri) |
537 | { |
538 | circle_queue_t q; |
539 | sched_entry_t elem; |
540 | |
541 | assert(queue_chain_linked(&entry->entry_links)); |
542 | assert(entry->runq == MULTIQ_ERUNQ); |
543 | |
544 | q = &runq->queues[expected_pri]; |
545 | |
546 | cqe_foreach_element(elem, q, entry_links) { |
547 | if (elem == entry) { |
548 | return; |
549 | } |
550 | } |
551 | |
	panic("runq %p doesn't contain entry %p at pri %d", runq, entry, expected_pri);
553 | } |
554 | |
/* Panics if the thread is not in the group's runq at its priority */
556 | static void |
557 | sched_group_check_thread(sched_group_t group, thread_t thread) |
558 | { |
559 | circle_queue_t q; |
560 | thread_t elem; |
561 | int pri = thread->sched_pri; |
562 | |
563 | thread_assert_runq_nonnull(thread); |
564 | |
565 | q = &group->runq.queues[pri]; |
566 | |
567 | cqe_foreach_element(elem, q, runq_links) { |
568 | if (elem == thread) { |
569 | return; |
570 | } |
571 | } |
572 | |
	panic("group %p doesn't contain thread %p at pri %d", group, thread, pri);
574 | } |
575 | |
576 | static void |
577 | global_check_entry_queue(entry_queue_t main_entryq) |
578 | { |
579 | if (main_entryq->count == 0) { |
580 | return; |
581 | } |
582 | |
583 | sched_entry_t entry = entry_queue_first_entry(main_entryq); |
584 | |
585 | assert(entry->runq == MULTIQ_ERUNQ); |
586 | |
587 | sched_group_t group = group_for_entry(entry); |
588 | |
589 | thread_t thread = group_first_thread(group); |
590 | |
591 | __assert_only sched_entry_t thread_entry = group_entry_for_pri(thread->sched_group, thread->sched_pri); |
592 | |
593 | assert(entry->sched_pri == group->runq.highq); |
594 | |
595 | assert(entry == thread_entry); |
596 | thread_assert_runq_nonnull(thread); |
597 | } |
598 | |
599 | static void |
600 | group_check_run_queue(entry_queue_t main_entryq, sched_group_t group) |
601 | { |
602 | if (group->runq.count == 0) { |
603 | return; |
604 | } |
605 | |
606 | thread_t thread = group_first_thread(group); |
607 | |
608 | thread_assert_runq_nonnull(thread); |
609 | |
610 | sched_entry_t sched_entry = group_entry_for_pri(thread->sched_group, thread->sched_pri); |
611 | |
612 | entry_queue_check_entry(main_entryq, sched_entry, thread->sched_pri); |
613 | |
614 | assert(sched_entry->sched_pri == thread->sched_pri); |
615 | assert(sched_entry->runq == MULTIQ_ERUNQ); |
616 | } |
617 | |
618 | #endif /* defined(MULTIQ_SANITY_CHECK) */ |
619 | |
620 | /* |
621 | * The run queue must not be empty. |
622 | */ |
623 | static sched_entry_t |
624 | entry_queue_dequeue_entry(entry_queue_t rq) |
625 | { |
626 | sched_entry_t sched_entry; |
627 | circle_queue_t queue = &rq->queues[rq->highq]; |
628 | |
629 | assert(rq->count > 0); |
630 | assert(!circle_queue_empty(queue)); |
631 | |
632 | sched_entry = cqe_dequeue_head(queue, struct sched_entry, entry_links); |
633 | |
634 | SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); |
635 | rq->count--; |
636 | if (SCHED(priority_is_urgent)(rq->highq)) { |
637 | rq->urgency--; assert(rq->urgency >= 0); |
638 | } |
	if (circle_queue_empty(queue)) {
		rq_bitmap_clear(rq->bitmap, rq->highq);
		rq->highq = bitmap_first(rq->bitmap, NRQS);
642 | } |
643 | |
644 | sched_entry->runq = 0; |
645 | |
646 | return sched_entry; |
647 | } |
648 | |
/*
 * Enqueue the entry on the run queue at its priority.
 * Returns TRUE if the entry is now the highest-priority entry on the run queue
 * (i.e. rq->highq was raised).
 */
652 | static boolean_t |
653 | entry_queue_enqueue_entry( |
654 | entry_queue_t rq, |
655 | sched_entry_t entry, |
656 | integer_t options) |
657 | { |
658 | int sched_pri = entry->sched_pri; |
659 | circle_queue_t queue = &rq->queues[sched_pri]; |
660 | boolean_t result = FALSE; |
661 | |
662 | assert(entry->runq == 0); |
663 | |
	if (circle_queue_empty(queue)) {
		circle_enqueue_tail(queue, &entry->entry_links);

		rq_bitmap_set(rq->bitmap, sched_pri);
		if (sched_pri > rq->highq) {
			rq->highq = sched_pri;
			result = TRUE;
		}
	} else {
		if (options & SCHED_TAILQ) {
			circle_enqueue_tail(queue, &entry->entry_links);
		} else {
			circle_enqueue_head(queue, &entry->entry_links);
677 | } |
678 | } |
679 | if (SCHED(priority_is_urgent)(sched_pri)) { |
680 | rq->urgency++; |
681 | } |
682 | SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); |
683 | rq->count++; |
684 | |
685 | entry->runq = MULTIQ_ERUNQ; |
686 | |
687 | return result; |
688 | } |
689 | |
690 | /* |
691 | * The entry must be in this runqueue. |
692 | */ |
693 | static void |
694 | entry_queue_remove_entry( |
695 | entry_queue_t rq, |
696 | sched_entry_t entry) |
697 | { |
698 | int sched_pri = entry->sched_pri; |
699 | |
700 | #if defined(MULTIQ_SANITY_CHECK) |
701 | if (multiq_sanity_check) { |
702 | entry_queue_check_entry(rq, entry, sched_pri); |
703 | } |
704 | #endif |
705 | |
	remqueue(&entry->entry_links);
707 | |
708 | SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); |
709 | rq->count--; |
710 | if (SCHED(priority_is_urgent)(sched_pri)) { |
711 | rq->urgency--; assert(rq->urgency >= 0); |
712 | } |
713 | |
	if (circle_queue_empty(&rq->queues[sched_pri])) {
		/* update run queue status */
		rq_bitmap_clear(rq->bitmap, sched_pri);
		rq->highq = bitmap_first(rq->bitmap, NRQS);
718 | } |
719 | |
720 | entry->runq = 0; |
721 | } |
722 | |
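/*
 * The entry must already be on this runqueue at its priority. Requeue it at the head or
 * tail of that priority level (SCHED_HEADQ/SCHED_TAILQ) without touching the count,
 * urgency, or bitmap state.
 */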
723 | static void |
724 | entry_queue_change_entry( |
725 | entry_queue_t rq, |
726 | sched_entry_t entry, |
727 | integer_t options) |
728 | { |
729 | int sched_pri = entry->sched_pri; |
730 | circle_queue_t queue = &rq->queues[sched_pri]; |
731 | |
732 | #if defined(MULTIQ_SANITY_CHECK) |
733 | if (multiq_sanity_check) { |
734 | entry_queue_check_entry(rq, entry, sched_pri); |
735 | } |
736 | #endif |
737 | |
	circle_dequeue(queue, &entry->entry_links);
	if (options & SCHED_TAILQ) {
		circle_enqueue_tail(queue, &entry->entry_links);
	} else {
		circle_enqueue_head(queue, &entry->entry_links);
743 | } |
744 | } |
745 | /* |
746 | * The run queue must not be empty. |
747 | * |
748 | * sets queue_empty to TRUE if queue is now empty at thread_pri |
749 | */ |
750 | static thread_t |
751 | group_run_queue_dequeue_thread( |
752 | group_runq_t rq, |
753 | integer_t *thread_pri, |
754 | boolean_t *queue_empty) |
755 | { |
756 | thread_t thread; |
757 | circle_queue_t queue = &rq->queues[rq->highq]; |
758 | |
759 | assert(rq->count > 0); |
760 | assert(!circle_queue_empty(queue)); |
761 | |
762 | *thread_pri = rq->highq; |
763 | |
764 | thread = cqe_dequeue_head(queue, struct thread, runq_links); |
765 | assert_thread_magic(thread); |
766 | |
767 | SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); |
768 | rq->count--; |
769 | if (SCHED(priority_is_urgent)(rq->highq)) { |
770 | rq->urgency--; assert(rq->urgency >= 0); |
771 | } |
	if (circle_queue_empty(queue)) {
		rq_bitmap_clear(rq->bitmap, rq->highq);
		rq->highq = bitmap_first(rq->bitmap, NRQS);
775 | *queue_empty = TRUE; |
776 | } else { |
777 | *queue_empty = FALSE; |
778 | } |
779 | |
780 | return thread; |
781 | } |
782 | |
/*
 * Enqueue the thread on the group run queue at thread_pri.
 * Returns TRUE if the queue was empty at thread_pri before the enqueue,
 * in which case the caller also needs to enqueue the group's entry for this priority.
 */
787 | static boolean_t |
788 | group_run_queue_enqueue_thread( |
789 | group_runq_t rq, |
790 | thread_t thread, |
791 | integer_t thread_pri, |
792 | integer_t options) |
793 | { |
794 | circle_queue_t queue = &rq->queues[thread_pri]; |
795 | boolean_t result = FALSE; |
796 | |
797 | thread_assert_runq_null(thread); |
798 | assert_thread_magic(thread); |
799 | |
	if (circle_queue_empty(queue)) {
		circle_enqueue_tail(queue, &thread->runq_links);

		rq_bitmap_set(rq->bitmap, thread_pri);
		if (thread_pri > rq->highq) {
			rq->highq = thread_pri;
		}
		result = TRUE;
	} else {
		if (options & SCHED_TAILQ) {
			circle_enqueue_tail(queue, &thread->runq_links);
		} else {
			circle_enqueue_head(queue, &thread->runq_links);
813 | } |
814 | } |
815 | if (SCHED(priority_is_urgent)(thread_pri)) { |
816 | rq->urgency++; |
817 | } |
818 | SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); |
819 | rq->count++; |
820 | |
821 | return result; |
822 | } |
823 | |
824 | /* |
825 | * The thread must be in this runqueue. |
826 | * returns TRUE if queue is now empty at thread_pri |
827 | */ |
828 | static boolean_t |
829 | group_run_queue_remove_thread( |
830 | group_runq_t rq, |
831 | thread_t thread, |
832 | integer_t thread_pri) |
833 | { |
834 | circle_queue_t queue = &rq->queues[thread_pri]; |
835 | boolean_t result = FALSE; |
836 | |
837 | assert_thread_magic(thread); |
838 | thread_assert_runq_nonnull(thread); |
839 | |
	circle_dequeue(queue, &thread->runq_links);
841 | |
842 | SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); |
843 | rq->count--; |
844 | if (SCHED(priority_is_urgent)(thread_pri)) { |
845 | rq->urgency--; assert(rq->urgency >= 0); |
846 | } |
847 | |
	if (circle_queue_empty(queue)) {
		/* update run queue status */
		rq_bitmap_clear(rq->bitmap, thread_pri);
		rq->highq = bitmap_first(rq->bitmap, NRQS);
852 | result = TRUE; |
853 | } |
854 | |
855 | thread_clear_runq_locked(thread); |
856 | |
857 | return result; |
858 | } |
859 | |
860 | /* |
861 | * A thread's sched pri may change out from under us because |
862 | * we're clearing thread->runq here without the thread locked. |
863 | * Do not rely on it to be the same as when we enqueued. |
864 | */ |
865 | static thread_t |
866 | sched_global_dequeue_thread(entry_queue_t main_entryq) |
867 | { |
868 | boolean_t pri_level_empty = FALSE; |
869 | sched_entry_t entry; |
870 | group_runq_t group_runq; |
871 | thread_t thread; |
872 | integer_t thread_pri; |
873 | sched_group_t group; |
874 | |
875 | assert(main_entryq->count > 0); |
876 | |
	entry = entry_queue_dequeue_entry(main_entryq);

	group = group_for_entry(entry);
	group_runq = &group->runq;

	thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);

	thread_clear_runq(thread);

	if (!pri_level_empty) {
		entry_queue_enqueue_entry(main_entryq, entry, SCHED_TAILQ);
888 | } |
889 | |
890 | return thread; |
891 | } |
892 | |
893 | /* Dequeue a thread from the global runq without moving the entry */ |
894 | static thread_t |
895 | sched_global_deep_drain_dequeue_thread(entry_queue_t main_entryq) |
896 | { |
897 | boolean_t pri_level_empty = FALSE; |
898 | sched_entry_t entry; |
899 | group_runq_t group_runq; |
900 | thread_t thread; |
901 | integer_t thread_pri; |
902 | sched_group_t group; |
903 | |
904 | assert(main_entryq->count > 0); |
905 | |
	entry = entry_queue_first_entry(main_entryq);

	group = group_for_entry(entry);
	group_runq = &group->runq;

	thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);

	thread_clear_runq(thread);

	if (pri_level_empty) {
		entry_queue_remove_entry(main_entryq, entry);
917 | } |
918 | |
919 | return thread; |
920 | } |
921 | |
922 | |
923 | static thread_t |
924 | sched_group_dequeue_thread( |
925 | entry_queue_t main_entryq, |
926 | sched_group_t group) |
927 | { |
928 | group_runq_t group_runq = &group->runq; |
929 | boolean_t pri_level_empty = FALSE; |
930 | thread_t thread; |
931 | integer_t thread_pri; |
932 | |
	thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);

	thread_clear_runq(thread);

	if (pri_level_empty) {
		entry_queue_remove_entry(main_entryq, group_entry_for_pri(group, thread_pri));
939 | } |
940 | |
941 | return thread; |
942 | } |
943 | |
944 | static void |
945 | sched_group_remove_thread( |
946 | entry_queue_t main_entryq, |
947 | sched_group_t group, |
948 | thread_t thread) |
949 | { |
950 | integer_t thread_pri = thread->sched_pri; |
	sched_entry_t sched_entry = group_entry_for_pri(group, thread_pri);
952 | |
953 | #if defined(MULTIQ_SANITY_CHECK) |
954 | if (multiq_sanity_check) { |
955 | global_check_entry_queue(main_entryq); |
956 | group_check_run_queue(main_entryq, group); |
957 | |
958 | sched_group_check_thread(group, thread); |
959 | entry_queue_check_entry(main_entryq, sched_entry, thread_pri); |
960 | } |
961 | #endif |
962 | |
	boolean_t pri_level_empty = group_run_queue_remove_thread(&group->runq, thread, thread_pri);

	if (pri_level_empty) {
		entry_queue_remove_entry(main_entryq, sched_entry);
967 | } |
968 | |
969 | #if defined(MULTIQ_SANITY_CHECK) |
970 | if (multiq_sanity_check) { |
971 | global_check_entry_queue(main_entryq); |
972 | group_check_run_queue(main_entryq, group); |
973 | } |
974 | #endif |
975 | } |
976 | |
977 | static void |
978 | sched_group_enqueue_thread( |
979 | entry_queue_t main_entryq, |
980 | sched_group_t group, |
981 | thread_t thread, |
982 | integer_t options) |
983 | { |
984 | #if defined(MULTIQ_SANITY_CHECK) |
985 | if (multiq_sanity_check) { |
986 | global_check_entry_queue(main_entryq); |
987 | group_check_run_queue(main_entryq, group); |
988 | } |
989 | #endif |
990 | |
991 | int sched_pri = thread->sched_pri; |
992 | |
	boolean_t pri_level_was_empty = group_run_queue_enqueue_thread(&group->runq, thread, sched_pri, options);
994 | |
995 | if (pri_level_was_empty) { |
996 | /* |
997 | * TODO: Need to figure out if passing options here is a good idea or not |
998 | * What effects would it have? |
999 | */ |
		entry_queue_enqueue_entry(main_entryq, &group->entries[sched_pri], options);
	} else if (options & SCHED_HEADQ) {
		/* The thread should be at the head of the line - move its entry to the front */
		entry_queue_change_entry(main_entryq, &group->entries[sched_pri], options);
1004 | } |
1005 | } |
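
/*
 * Worked example of the two-level enqueue above: enqueueing a priority-37 thread into
 * group G puts the thread on G->runq at 37; if G previously had no runnable thread at
 * 37, G->entries[37] is also enqueued on the main entry queue at 37; otherwise the
 * existing entry stays where it is, except that SCHED_HEADQ moves it to the head of its
 * priority level so the group is considered first.
 */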
1006 | |
1007 | /* |
1008 | * Locate a thread to execute from the run queue and return it. |
1009 | * Only choose a thread with greater or equal priority. |
1010 | * |
1011 | * pset is locked, thread is not locked. |
1012 | * |
1013 | * Returns THREAD_NULL if it cannot find a valid thread. |
1014 | * |
1015 | * Note: we cannot rely on the value of thread->sched_pri in this path because |
1016 | * we don't have the thread locked. |
1017 | * |
1018 | * TODO: Remove tracepoints |
1019 | */ |
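/*
 * Illustrative numbers (assuming the default drain limits and no preemption AST): if the
 * current group's best runnable thread is at priority 31 while the highest entry on the
 * global queue, from another task, is at 80, the group is not favored (80 is at or above
 * drain_ceiling) and the global runqueue is used; if the global highest were 33 instead,
 * the group's priority-31 thread would be chosen despite being lower priority.
 */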
1020 | static thread_t |
1021 | sched_multiq_choose_thread( |
1022 | processor_t processor, |
1023 | int priority, |
1024 | ast_t reason) |
1025 | { |
1026 | entry_queue_t main_entryq = multiq_main_entryq(processor); |
1027 | run_queue_t bound_runq = multiq_bound_runq(processor); |
1028 | |
1029 | boolean_t choose_bound_runq = FALSE; |
1030 | |
1031 | if (bound_runq->highq < priority && |
1032 | main_entryq->highq < priority) { |
1033 | return THREAD_NULL; |
1034 | } |
1035 | |
1036 | if (bound_runq->count && main_entryq->count) { |
1037 | if (bound_runq->highq >= main_entryq->highq) { |
1038 | choose_bound_runq = TRUE; |
1039 | } else { |
1040 | /* Use main runq */ |
1041 | } |
1042 | } else if (bound_runq->count) { |
1043 | choose_bound_runq = TRUE; |
1044 | } else if (main_entryq->count) { |
1045 | /* Use main runq */ |
1046 | } else { |
1047 | return THREAD_NULL; |
1048 | } |
1049 | |
1050 | if (choose_bound_runq) { |
1051 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
1052 | MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE, |
1053 | MACH_MULTIQ_BOUND, main_entryq->highq, bound_runq->highq, 0, 0); |
1054 | |
		return run_queue_dequeue(bound_runq, SCHED_HEADQ);
1056 | } |
1057 | |
1058 | sched_group_t group = current_thread()->sched_group; |
1059 | |
1060 | #if defined(MULTIQ_SANITY_CHECK) |
1061 | if (multiq_sanity_check) { |
1062 | global_check_entry_queue(main_entryq); |
1063 | group_check_run_queue(main_entryq, group); |
1064 | } |
1065 | #endif |
1066 | |
1067 | /* |
1068 | * Determine if we should look at the group or the global queue |
1069 | * |
1070 | * TODO: |
1071 | * Perhaps pass reason as a 'should look inside' argument to choose_thread |
1072 | * Should YIELD AST override drain limit? |
1073 | */ |
1074 | if (group->runq.count != 0 && (reason & AST_PREEMPTION) == 0) { |
1075 | boolean_t favor_group = TRUE; |
1076 | |
1077 | integer_t global_pri = main_entryq->highq; |
1078 | integer_t group_pri = group->runq.highq; |
1079 | |
1080 | /* |
1081 | * Favor the current group if the group is still the globally highest. |
1082 | * |
1083 | * Otherwise, consider choosing a thread from the current group |
1084 | * even if it's lower priority than the global highest priority. |
1085 | */ |
1086 | if (global_pri > group_pri) { |
1087 | /* |
1088 | * If there's something elsewhere above the depth limit, |
1089 | * don't pick a thread below the limit. |
1090 | */ |
1091 | if (global_pri > drain_depth_limit && group_pri <= drain_depth_limit) { |
1092 | favor_group = FALSE; |
1093 | } |
1094 | |
1095 | /* |
1096 | * If there's something at or above the ceiling, |
1097 | * don't favor the group. |
1098 | */ |
1099 | if (global_pri >= drain_ceiling) { |
1100 | favor_group = FALSE; |
1101 | } |
1102 | |
1103 | /* |
1104 | * Don't go more than X steps below the global highest |
1105 | */ |
1106 | if ((global_pri - group_pri) >= drain_band_limit) { |
1107 | favor_group = FALSE; |
1108 | } |
1109 | } |
1110 | |
1111 | if (favor_group) { |
1112 | /* Pull from local runq */ |
1113 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
1114 | MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE, |
1115 | MACH_MULTIQ_GROUP, global_pri, group_pri, 0, 0); |
1116 | |
1117 | return sched_group_dequeue_thread(main_entryq, group); |
1118 | } |
1119 | } |
1120 | |
1121 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
1122 | MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE, |
1123 | MACH_MULTIQ_GLOBAL, main_entryq->highq, group->runq.highq, 0, 0); |
1124 | |
1125 | /* Couldn't pull from local runq, pull from global runq instead */ |
1126 | if (deep_drain) { |
1127 | return sched_global_deep_drain_dequeue_thread(main_entryq); |
1128 | } else { |
1129 | return sched_global_dequeue_thread(main_entryq); |
1130 | } |
1131 | } |
1132 | |
1133 | |
1134 | /* |
1135 | * Thread must be locked, and not already be on a run queue. |
1136 | * pset is locked. |
1137 | */ |
1138 | static boolean_t |
1139 | sched_multiq_processor_enqueue( |
1140 | processor_t processor, |
1141 | thread_t thread, |
1142 | sched_options_t options) |
1143 | { |
1144 | boolean_t result; |
1145 | |
1146 | assert(processor == thread->chosen_processor); |
1147 | |
1148 | if (thread->bound_processor != PROCESSOR_NULL) { |
1149 | assert(thread->bound_processor == processor); |
1150 | |
		result = run_queue_enqueue(multiq_bound_runq(processor), thread, options);
		thread_set_runq_locked(thread, processor);
1153 | |
1154 | return result; |
1155 | } |
1156 | |
	sched_group_enqueue_thread(multiq_main_entryq(processor),
	    thread->sched_group,
	    thread, options);

	thread_set_runq_locked(thread, processor);
1162 | |
1163 | return FALSE; |
1164 | } |
1165 | |
1166 | /* |
1167 | * Called in the context of thread with thread and pset unlocked, |
1168 | * after updating thread priority but before propagating that priority |
1169 | * to the processor |
1170 | */ |
1171 | void |
1172 | sched_multiq_quantum_expire(thread_t thread) |
1173 | { |
1174 | if (deep_drain) { |
1175 | /* |
1176 | * Move the entry at this priority to the end of the queue, |
1177 | * to allow the next task a shot at running. |
1178 | */ |
1179 | |
1180 | processor_t processor = thread->last_processor; |
1181 | processor_set_t pset = processor->processor_set; |
1182 | entry_queue_t entryq = multiq_main_entryq(processor); |
1183 | |
1184 | pset_lock(pset); |
1185 | |
		sched_entry_t entry = group_entry_for_pri(thread->sched_group, processor->current_pri);

		if (entry->runq == MULTIQ_ERUNQ) {
			entry_queue_change_entry(entryq, entry, SCHED_TAILQ);
1190 | } |
1191 | |
1192 | pset_unlock(pset); |
1193 | } |
1194 | } |
1195 | |
1196 | static boolean_t |
1197 | sched_multiq_processor_queue_empty(processor_t processor) |
1198 | { |
1199 | return multiq_main_entryq(processor)->count == 0 && |
1200 | multiq_bound_runq(processor)->count == 0; |
1201 | } |
1202 | |
1203 | static ast_t |
1204 | sched_multiq_processor_csw_check(processor_t processor) |
1205 | { |
1206 | boolean_t has_higher; |
1207 | int pri; |
1208 | |
	if (sched_multiq_thread_avoid_processor(processor, current_thread(), AST_NONE)) {
1210 | return AST_PREEMPT | AST_URGENT; |
1211 | } |
1212 | |
1213 | entry_queue_t main_entryq = multiq_main_entryq(processor); |
1214 | run_queue_t bound_runq = multiq_bound_runq(processor); |
1215 | |
1216 | assert(processor->active_thread != NULL); |
1217 | |
1218 | pri = MAX(main_entryq->highq, bound_runq->highq); |
1219 | |
1220 | if (processor->first_timeslice) { |
1221 | has_higher = (pri > processor->current_pri); |
1222 | } else { |
1223 | has_higher = (pri >= processor->current_pri); |
1224 | } |
1225 | |
1226 | if (has_higher) { |
1227 | if (main_entryq->urgency > 0) { |
1228 | return AST_PREEMPT | AST_URGENT; |
1229 | } |
1230 | |
1231 | if (bound_runq->urgency > 0) { |
1232 | return AST_PREEMPT | AST_URGENT; |
1233 | } |
1234 | |
1235 | return AST_PREEMPT; |
1236 | } |
1237 | |
1238 | return AST_NONE; |
1239 | } |
1240 | |
1241 | static boolean_t |
1242 | sched_multiq_processor_queue_has_priority( |
1243 | processor_t processor, |
1244 | int priority, |
1245 | boolean_t gte) |
1246 | { |
1247 | run_queue_t main_runq = multiq_main_entryq(processor); |
1248 | run_queue_t bound_runq = multiq_bound_runq(processor); |
1249 | |
1250 | int qpri = MAX(main_runq->highq, bound_runq->highq); |
1251 | |
1252 | if (gte) { |
1253 | return qpri >= priority; |
1254 | } else { |
1255 | return qpri > priority; |
1256 | } |
1257 | } |
1258 | |
1259 | static int |
1260 | sched_multiq_runq_count(processor_t processor) |
1261 | { |
1262 | /* |
1263 | * TODO: Decide whether to keep a count of runnable threads in the pset |
1264 | * or just return something less than the true count. |
1265 | * |
1266 | * This needs to be fast, so no iterating the whole runq. |
1267 | * |
1268 | * Another possible decision is to remove this - with global runq |
1269 | * it doesn't make much sense. |
1270 | */ |
1271 | return multiq_main_entryq(processor)->count + multiq_bound_runq(processor)->count; |
1272 | } |
1273 | |
1274 | static uint64_t |
1275 | sched_multiq_runq_stats_count_sum(processor_t processor) |
1276 | { |
1277 | /* |
1278 | * TODO: This one does need to go through all the runqueues, but it's only needed for |
1279 | * the sched stats tool |
1280 | */ |
1281 | |
1282 | uint64_t bound_sum = multiq_bound_runq(processor)->runq_stats.count_sum; |
1283 | |
1284 | if (processor->cpu_id == processor->processor_set->cpu_set_low) { |
1285 | return bound_sum + multiq_main_entryq(processor)->runq_stats.count_sum; |
1286 | } else { |
1287 | return bound_sum; |
1288 | } |
1289 | } |
1290 | |
1291 | static int |
1292 | sched_multiq_processor_bound_count(processor_t processor) |
1293 | { |
1294 | return multiq_bound_runq(processor)->count; |
1295 | } |
1296 | |
1297 | static void |
1298 | sched_multiq_processor_queue_shutdown(processor_t processor) |
1299 | { |
1300 | processor_set_t pset = processor->processor_set; |
1301 | entry_queue_t main_entryq = multiq_main_entryq(processor); |
1302 | thread_t thread; |
1303 | queue_head_t tqueue; |
1304 | |
1305 | /* We only need to migrate threads if this is the last active processor in the pset */ |
1306 | if (pset->online_processor_count > 0) { |
1307 | pset_unlock(pset); |
1308 | return; |
1309 | } |
1310 | |
1311 | queue_init(&tqueue); |
1312 | |
1313 | /* Note that we do not remove bound threads from the queues here */ |
1314 | |
1315 | while (main_entryq->count > 0) { |
1316 | thread = sched_global_dequeue_thread(main_entryq); |
		enqueue_tail(&tqueue, &thread->runq_links);
1318 | } |
1319 | |
1320 | pset_unlock(pset); |
1321 | |
1322 | qe_foreach_element_safe(thread, &tqueue, runq_links) { |
		remqueue(&thread->runq_links);

		thread_lock(thread);

		thread_setrun(thread, SCHED_TAILQ);
1328 | |
1329 | thread_unlock(thread); |
1330 | } |
1331 | } |
1332 | |
1333 | /* |
1334 | * Thread is locked |
1335 | * |
1336 | * This is why we can never read sched_pri unless we have the thread locked. |
1337 | * Which we do in the enqueue and remove cases, but not the dequeue case. |
1338 | */ |
1339 | static boolean_t |
1340 | sched_multiq_processor_queue_remove( |
1341 | processor_t processor, |
1342 | thread_t thread) |
1343 | { |
1344 | boolean_t removed = FALSE; |
1345 | processor_set_t pset = processor->processor_set; |
1346 | |
1347 | pset_lock(pset); |
1348 | |
1349 | if (thread_get_runq_locked(thread) != PROCESSOR_NULL) { |
1350 | /* |
1351 | * Thread is on a run queue and we have a lock on |
1352 | * that run queue. |
1353 | */ |
1354 | |
1355 | thread_assert_runq_nonnull(thread); |
1356 | |
1357 | if (thread->bound_processor != PROCESSOR_NULL) { |
1358 | assert(processor == thread->bound_processor); |
			run_queue_remove(multiq_bound_runq(processor), thread);
			thread_clear_runq_locked(thread);
		} else {
			sched_group_remove_thread(multiq_main_entryq(processor),
			    thread->sched_group,
1364 | thread); |
1365 | } |
1366 | |
1367 | removed = TRUE; |
1368 | } |
1369 | |
1370 | pset_unlock(pset); |
1371 | |
1372 | return removed; |
1373 | } |
1374 | |
1375 | /* pset is locked, returned unlocked */ |
1376 | static thread_t |
1377 | sched_multiq_steal_thread(processor_set_t pset) |
1378 | { |
1379 | pset_unlock(pset); |
1380 | return THREAD_NULL; |
1381 | } |
1382 | |
1383 | /* |
1384 | * Scan the global queue for candidate groups, and scan those groups for |
1385 | * candidate threads. |
1386 | * |
1387 | * TODO: This iterates every group runq in its entirety for each entry it has in the runq, which is O(N^2) |
1388 | * Instead, iterate only the queue in the group runq matching the priority of the entry. |
1389 | * |
1390 | * Returns TRUE if retry is needed. |
1391 | */ |
1392 | static boolean_t |
1393 | group_scan(entry_queue_t runq, sched_update_scan_context_t scan_context) |
1394 | { |
1395 | int count = runq->count; |
1396 | int queue_index; |
1397 | |
1398 | assert(count >= 0); |
1399 | |
1400 | if (count == 0) { |
1401 | return FALSE; |
1402 | } |
1403 | |
	for (queue_index = bitmap_first(runq->bitmap, NRQS);
	    queue_index >= 0;
	    queue_index = bitmap_next(runq->bitmap, queue_index)) {
1407 | sched_entry_t entry; |
1408 | |
1409 | cqe_foreach_element(entry, &runq->queues[queue_index], entry_links) { |
1410 | assert(count > 0); |
1411 | |
1412 | sched_group_t group = group_for_entry(entry); |
1413 | if (group->runq.count > 0) { |
				if (runq_scan(&group->runq, scan_context)) {
1415 | return TRUE; |
1416 | } |
1417 | } |
1418 | count--; |
1419 | } |
1420 | } |
1421 | |
1422 | return FALSE; |
1423 | } |
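
/*
 * A sketch of the improvement suggested in the TODO above (hypothetical: runq_scan()
 * today walks an entire run queue, so this assumes a variant limited to one priority):
 *
 *	cqe_foreach_element(entry, &runq->queues[queue_index], entry_links) {
 *		sched_group_t group = group_for_entry(entry);
 *		if (runq_scan_priority(&group->runq, queue_index, scan_context)) {	// hypothetical helper
 *			return TRUE;
 *		}
 *	}
 *
 * which would visit each group only at the priority of its entry rather than walking the
 * whole group runqueue once per entry.
 */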
1424 | |
1425 | static void |
1426 | sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context) |
1427 | { |
1428 | boolean_t restart_needed = FALSE; |
1429 | processor_t processor = processor_list; |
1430 | processor_set_t pset; |
1431 | thread_t thread; |
1432 | spl_t s; |
1433 | |
1434 | /* |
1435 | * We update the threads associated with each processor (bound and idle threads) |
1436 | * and then update the threads in each pset runqueue. |
1437 | */ |
1438 | |
1439 | do { |
1440 | do { |
1441 | pset = processor->processor_set; |
1442 | |
1443 | s = splsched(); |
1444 | pset_lock(pset); |
1445 | |
			restart_needed = runq_scan(multiq_bound_runq(processor), scan_context);
1447 | |
1448 | pset_unlock(pset); |
1449 | splx(s); |
1450 | |
1451 | if (restart_needed) { |
1452 | break; |
1453 | } |
1454 | |
1455 | thread = processor->idle_thread; |
1456 | if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) { |
1457 | if (thread_update_add_thread(thread) == FALSE) { |
1458 | restart_needed = TRUE; |
1459 | break; |
1460 | } |
1461 | } |
1462 | } while ((processor = processor->processor_list) != NULL); |
1463 | |
1464 | /* Ok, we now have a collection of candidates -- fix them. */ |
1465 | thread_update_process_threads(); |
1466 | } while (restart_needed); |
1467 | |
1468 | pset = &pset0; |
1469 | |
1470 | do { |
1471 | do { |
1472 | s = splsched(); |
1473 | pset_lock(pset); |
1474 | |
			restart_needed = group_scan(&pset->pset_runq, scan_context);
1476 | |
1477 | pset_unlock(pset); |
1478 | splx(s); |
1479 | |
1480 | if (restart_needed) { |
1481 | break; |
1482 | } |
1483 | } while ((pset = pset->pset_list) != NULL); |
1484 | |
1485 | /* Ok, we now have a collection of candidates -- fix them. */ |
1486 | thread_update_process_threads(); |
1487 | } while (restart_needed); |
1488 | } |
1489 | |
1490 | extern int sched_allow_rt_smt; |
1491 | |
1492 | /* Return true if this thread should not continue running on this processor */ |
1493 | static bool |
1494 | sched_multiq_thread_avoid_processor(processor_t processor, thread_t thread, __unused ast_t reason) |
1495 | { |
1496 | if (processor->processor_primary != processor) { |
1497 | /* |
1498 | * This is a secondary SMT processor. If the primary is running |
1499 | * a realtime thread, only allow realtime threads on the secondary. |
1500 | */ |
1501 | if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) { |
1502 | return true; |
1503 | } |
1504 | } |
1505 | |
1506 | return false; |
1507 | } |
1508 | |