1/*
2 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_FREE_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58/*
59 * File: sched_prim.c
60 * Author: Avadis Tevanian, Jr.
61 * Date: 1986
62 *
63 * Scheduling primitives
64 *
65 */
66
67#include <debug.h>
68
69#include <mach/mach_types.h>
70#include <mach/machine.h>
71#include <mach/policy.h>
72#include <mach/sync_policy.h>
73#include <mach/thread_act.h>
74
75#include <machine/machine_routines.h>
76#include <machine/sched_param.h>
77#include <machine/machine_cpu.h>
78#include <machine/limits.h>
79#include <machine/atomic.h>
80
81#include <machine/commpage.h>
82
83#include <kern/kern_types.h>
84#include <kern/backtrace.h>
85#include <kern/clock.h>
86#include <kern/cpu_number.h>
87#include <kern/cpu_data.h>
88#include <kern/smp.h>
89#include <kern/debug.h>
90#include <kern/macro_help.h>
91#include <kern/machine.h>
92#include <kern/misc_protos.h>
93#include <kern/monotonic.h>
94#include <kern/processor.h>
95#include <kern/queue.h>
96#include <kern/recount.h>
97#include <kern/restartable.h>
98#include <kern/sched.h>
99#include <kern/sched_prim.h>
100#include <kern/sfi.h>
101#include <kern/syscall_subr.h>
102#include <kern/task.h>
103#include <kern/thread.h>
104#include <kern/thread_group.h>
105#include <kern/ledger.h>
106#include <kern/timer_queue.h>
107#include <kern/waitq.h>
108#include <kern/policy_internal.h>
109
110#include <vm/pmap.h>
111#include <vm/vm_kern.h>
112#include <vm/vm_map.h>
113#include <vm/vm_pageout.h>
114
115#include <mach/sdt.h>
116#include <mach/mach_host.h>
117#include <mach/host_info.h>
118
119#include <sys/kdebug.h>
120#include <kperf/kperf.h>
121#include <kern/kpc.h>
122#include <san/kasan.h>
123#include <kern/pms.h>
124#include <kern/host.h>
125#include <stdatomic.h>
126#include <os/atomic_private.h>
127
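/*
 * Emit these scheduler tracepoints via KDBG_MACOS_RELEASE when the platform
 * provides it; otherwise fall back to the generic KDBG_RELEASE macro.
 */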
128#ifdef KDBG_MACOS_RELEASE
129#define KTRC KDBG_MACOS_RELEASE
130#else
131#define KTRC KDBG_RELEASE
132#endif
133
134struct sched_statistics PERCPU_DATA(sched_stats);
135bool sched_stats_active;
136
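/* Saturating add for deadlines: clamps to UINT64_MAX rather than wrapping. */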
137static uint64_t
138deadline_add(uint64_t d, uint64_t e)
139{
140 uint64_t sum;
141 return os_add_overflow(d, e, &sum) ? UINT64_MAX : sum;
142}
143
144int
145rt_runq_count(processor_set_t pset)
146{
147 return os_atomic_load(&SCHED(rt_runq)(pset)->count, relaxed);
148}
149
150uint64_t
151rt_runq_earliest_deadline(processor_set_t pset)
152{
153 return os_atomic_load_wide(&SCHED(rt_runq)(pset)->earliest_deadline, relaxed);
154}
155
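/*
 * Priority (as a global sched_pri) of the first occupied bucket in the
 * pset's RT run queue, or -1 if the RT run queue is empty.
 */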
156static int
157rt_runq_priority(processor_set_t pset)
158{
159 pset_assert_locked(pset);
160 rt_queue_t rt_run_queue = SCHED(rt_runq)(pset);
161
162 bitmap_t *map = rt_run_queue->bitmap;
163 int i = bitmap_first(map, NRTQS);
164 assert(i < NRTQS);
165
166 if (i >= 0) {
167 return i + BASEPRI_RTQUEUES;
168 }
169
170 return i;
171}
172
173static thread_t rt_runq_first(rt_queue_t rt_runq);
174
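/*
 * DEBUG-only consistency check: walks every per-priority RT queue and
 * verifies the cached bitmap, counts, earliest deadlines and constraints,
 * and (when 'thread' is non-NULL) that it is actually enqueued.
 */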
175#if DEBUG
176static void
177check_rt_runq_consistency(rt_queue_t rt_run_queue, thread_t thread)
178{
179 bitmap_t *map = rt_run_queue->bitmap;
180
181 uint64_t earliest_deadline = RT_DEADLINE_NONE;
182 uint32_t constraint = RT_CONSTRAINT_NONE;
183 int ed_index = NOPRI;
184 int count = 0;
185 bool found_thread = false;
186
187 for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
188 int i = pri - BASEPRI_RTQUEUES;
189 rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
190 queue_t queue = &rt_runq->pri_queue;
191 queue_entry_t iter;
192 int n = 0;
193 uint64_t previous_deadline = 0;
194 qe_foreach(iter, queue) {
195 thread_t iter_thread = qe_element(iter, struct thread, runq_links);
196 assert_thread_magic(iter_thread);
197 if (iter_thread == thread) {
198 found_thread = true;
199 }
200 assert(iter_thread->sched_pri == (i + BASEPRI_RTQUEUES));
201 assert(iter_thread->realtime.deadline < RT_DEADLINE_NONE);
202 assert(iter_thread->realtime.constraint < RT_CONSTRAINT_NONE);
203 assert(previous_deadline <= iter_thread->realtime.deadline);
204 n++;
205 if (iter == queue_first(queue)) {
206 assert(rt_runq->pri_earliest_deadline == iter_thread->realtime.deadline);
207 assert(rt_runq->pri_constraint == iter_thread->realtime.constraint);
208 }
209 previous_deadline = iter_thread->realtime.deadline;
210 }
211 assert(n == rt_runq->pri_count);
212 if (n == 0) {
213 assert(bitmap_test(map, i) == false);
214 assert(rt_runq->pri_earliest_deadline == RT_DEADLINE_NONE);
215 assert(rt_runq->pri_constraint == RT_CONSTRAINT_NONE);
216 } else {
217 assert(bitmap_test(map, i) == true);
218 }
219 if (rt_runq->pri_earliest_deadline < earliest_deadline) {
220 earliest_deadline = rt_runq->pri_earliest_deadline;
221 constraint = rt_runq->pri_constraint;
222 ed_index = i;
223 }
224 count += n;
225 }
226 assert(os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed) == earliest_deadline);
227 assert(os_atomic_load(&rt_run_queue->count, relaxed) == count);
228 assert(os_atomic_load(&rt_run_queue->constraint, relaxed) == constraint);
229 assert(os_atomic_load(&rt_run_queue->ed_index, relaxed) == ed_index);
230 if (thread) {
231 assert(found_thread);
232 }
233}
234#define CHECK_RT_RUNQ_CONSISTENCY(q, th) check_rt_runq_consistency(q, th)
235#else
236#define CHECK_RT_RUNQ_CONSISTENCY(q, th) do {} while (0)
237#endif
238
239uint32_t rt_constraint_threshold;
240
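/*
 * True when the RT run queue's cached constraint is at or below
 * rt_constraint_threshold, i.e. tight-deadline work is queued.
 */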
241static bool
242rt_runq_is_low_latency(processor_set_t pset)
243{
244 return os_atomic_load(&SCHED(rt_runq)(pset)->constraint, relaxed) <= rt_constraint_threshold;
245}
246
247TUNABLE(bool, cpulimit_affects_quantum, "cpulimit_affects_quantum", true);
248
249/* TODO: enable this, to 50us (less than the deferred IPI latency, to beat a spill) */
250TUNABLE(uint32_t, nonurgent_preemption_timer_us, "nonurgent_preemption_timer", 0); /* microseconds */
251static uint64_t nonurgent_preemption_timer_abs = 0;
252
253#define DEFAULT_PREEMPTION_RATE 100 /* (1/s) */
254TUNABLE(int, default_preemption_rate, "preempt", DEFAULT_PREEMPTION_RATE);
255
256#define DEFAULT_BG_PREEMPTION_RATE 400 /* (1/s) */
257TUNABLE(int, default_bg_preemption_rate, "bg_preempt", DEFAULT_BG_PREEMPTION_RATE);
258
259#define MAX_UNSAFE_RT_QUANTA 100
260#define SAFE_RT_MULTIPLIER 2
261
262#define MAX_UNSAFE_FIXED_QUANTA 100
263#define SAFE_FIXED_MULTIPLIER 2
264
265TUNABLE_DEV_WRITEABLE(int, max_unsafe_rt_quanta, "max_unsafe_rt_quanta", MAX_UNSAFE_RT_QUANTA);
266TUNABLE_DEV_WRITEABLE(int, max_unsafe_fixed_quanta, "max_unsafe_fixed_quanta", MAX_UNSAFE_FIXED_QUANTA);
267
268TUNABLE_DEV_WRITEABLE(int, safe_rt_multiplier, "safe_rt_multiplier", SAFE_RT_MULTIPLIER);
TUNABLE_DEV_WRITEABLE(int, safe_fixed_multiplier, "safe_fixed_multiplier", SAFE_FIXED_MULTIPLIER);
270
271#define MAX_POLL_QUANTA 2
272TUNABLE(int, max_poll_quanta, "poll", MAX_POLL_QUANTA);
273
274#define SCHED_POLL_YIELD_SHIFT 4 /* 1/16 */
275int sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;
276
277uint64_t max_poll_computation;
278
279uint64_t max_unsafe_rt_computation;
280uint64_t max_unsafe_fixed_computation;
281uint64_t sched_safe_rt_duration;
282uint64_t sched_safe_fixed_duration;
283
284#if defined(CONFIG_SCHED_TIMESHARE_CORE)
285
286uint32_t std_quantum;
287uint32_t min_std_quantum;
288uint32_t bg_quantum;
289
290uint32_t std_quantum_us;
291uint32_t bg_quantum_us;
292
293#endif /* CONFIG_SCHED_TIMESHARE_CORE */
294
295uint32_t thread_depress_time;
296uint32_t default_timeshare_computation;
297uint32_t default_timeshare_constraint;
298
299uint32_t max_rt_quantum;
300uint32_t min_rt_quantum;
301
302uint32_t rt_deadline_epsilon;
303
304uint32_t rt_constraint_threshold;
305
306#if defined(CONFIG_SCHED_TIMESHARE_CORE)
307
308unsigned sched_tick;
309uint32_t sched_tick_interval;
310
311/* Timeshare load calculation interval (15ms) */
312uint32_t sched_load_compute_interval_us = 15000;
313uint64_t sched_load_compute_interval_abs;
314static _Atomic uint64_t sched_load_compute_deadline;
315
316uint32_t sched_pri_shifts[TH_BUCKET_MAX];
317uint32_t sched_fixed_shift;
318
319uint32_t sched_decay_usage_age_factor = 1; /* accelerate 5/8^n usage aging */
320
321/* Allow foreground to decay past default to resolve inversions */
322#define DEFAULT_DECAY_BAND_LIMIT ((BASEPRI_FOREGROUND - BASEPRI_DEFAULT) + 2)
323int sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
324
325/* Defaults for timer deadline profiling */
326#define TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT 2000000 /* Timers with deadlines <=
327 * 2ms */
328#define TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT 5000000 /* Timers with deadlines
329 * <= 5ms */
330
331uint64_t timer_deadline_tracking_bin_1;
332uint64_t timer_deadline_tracking_bin_2;
333
334#endif /* CONFIG_SCHED_TIMESHARE_CORE */
335
336thread_t sched_maintenance_thread;
337
338/* interrupts disabled lock to guard recommended cores state */
339decl_simple_lock_data(, sched_available_cores_lock);
340uint64_t perfcontrol_requested_recommended_cores = ALL_CORES_RECOMMENDED;
341uint64_t perfcontrol_system_requested_recommended_cores = ALL_CORES_RECOMMENDED;
342uint64_t perfcontrol_user_requested_recommended_cores = ALL_CORES_RECOMMENDED;
343static uint64_t usercontrol_requested_recommended_cores = ALL_CORES_RECOMMENDED;
344static uint64_t sched_online_processors = 0;
345static void sched_update_recommended_cores(uint64_t recommended_cores, processor_reason_t reason, uint32_t flags);
346static void sched_update_powered_cores(uint64_t reqested_powered_cores, processor_reason_t reason, uint32_t flags);
347
348#if __arm64__
349static void sched_recommended_cores_maintenance(void);
350uint64_t perfcontrol_failsafe_starvation_threshold;
351extern char *proc_name_address(struct proc *p);
352#endif /* __arm64__ */
353
354uint64_t sched_one_second_interval;
355boolean_t allow_direct_handoff = TRUE;
356
357/* Forwards */
358
359#if defined(CONFIG_SCHED_TIMESHARE_CORE)
360
361static void load_shift_init(void);
362static void preempt_pri_init(void);
363
364#endif /* CONFIG_SCHED_TIMESHARE_CORE */
365
366thread_t processor_idle(
367 thread_t thread,
368 processor_t processor);
369
370static ast_t
371csw_check_locked(
372 thread_t thread,
373 processor_t processor,
374 processor_set_t pset,
375 ast_t check_reason);
376
377static void processor_setrun(
378 processor_t processor,
379 thread_t thread,
380 integer_t options);
381
382static void
383sched_realtime_timebase_init(void);
384
385static void
386sched_timer_deadline_tracking_init(void);
387
388#if DEBUG
389extern int debug_task;
390#define TLOG(a, fmt, args...) if(debug_task & a) kprintf(fmt, ## args)
391#else
392#define TLOG(a, fmt, args...) do {} while (0)
393#endif
394
395static processor_t
396thread_bind_internal(
397 thread_t thread,
398 processor_t processor);
399
400static void
401sched_vm_group_maintenance(void);
402
403#if defined(CONFIG_SCHED_TIMESHARE_CORE)
404int8_t sched_load_shifts[NRQS];
405bitmap_t sched_preempt_pri[BITMAP_LEN(NRQS_MAX)];
406#endif /* CONFIG_SCHED_TIMESHARE_CORE */
407
408/*
409 * Statically allocate a buffer to hold the longest possible
410 * scheduler description string, as currently implemented.
411 * bsd/kern/kern_sysctl.c has a corresponding definition in bsd/
412 * to export to userspace via sysctl(3). If either version
413 * changes, update the other.
414 *
415 * Note that in addition to being an upper bound on the strings
416 * in the kernel, it's also an exact parameter to PE_get_default(),
417 * which interrogates the device tree on some platforms. That
418 * API requires the caller know the exact size of the device tree
419 * property, so we need both a legacy size (32) and the current size
420 * (48) to deal with old and new device trees. The device tree property
421 * is similarly padded to a fixed size so that the same kernel image
422 * can run on multiple devices with different schedulers configured
423 * in the device tree.
424 */
425char sched_string[SCHED_STRING_MAX_LENGTH];
426
427uint32_t sched_debug_flags = SCHED_DEBUG_FLAG_CHOOSE_PROCESSOR_TRACEPOINTS;
428
429/* Global flag which indicates whether Background Stepper Context is enabled */
430static int cpu_throttle_enabled = 1;
431
432#if DEVELOPMENT || DEBUG
433int enable_task_set_cluster_type = 0;
434bool system_ecore_only = false;
435#endif /* DEVELOPMENT || DEBUG */
436
void
sched_init(void)
{
	boolean_t direct_handoff = FALSE;
	kprintf("Scheduler: Default of %s\n", SCHED(sched_name));

	if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) {
		/* No boot-args, check in device tree */
		if (!PE_get_default("kern.sched_pri_decay_limit",
		    &sched_pri_decay_band_limit,
		    sizeof(sched_pri_decay_band_limit))) {
			/* Allow decay all the way to normal limits */
			sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
		}
	}

	kprintf("Setting scheduler priority decay band limit %d\n", sched_pri_decay_band_limit);

	if (PE_parse_boot_argn("sched_debug", &sched_debug_flags, sizeof(sched_debug_flags))) {
		kprintf("Scheduler: Debug flags 0x%08x\n", sched_debug_flags);
	}
	strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string));

#if __arm64__
	clock_interval_to_absolutetime_interval(expecting_ipi_wfe_timeout_usec, NSEC_PER_USEC, &expecting_ipi_wfe_timeout_mt);
#endif /* __arm64__ */

	SCHED(init)();
	SCHED(rt_init)(&pset0);
	sched_timer_deadline_tracking_init();

	SCHED(pset_init)(&pset0);
	SCHED(processor_init)(master_processor);

	if (PE_parse_boot_argn("direct_handoff", &direct_handoff, sizeof(direct_handoff))) {
		allow_direct_handoff = direct_handoff;
	}

#if DEVELOPMENT || DEBUG
	if (PE_parse_boot_argn("enable_skstsct", &enable_task_set_cluster_type, sizeof(enable_task_set_cluster_type))) {
		system_ecore_only = (enable_task_set_cluster_type == 2);
	}
#endif /* DEVELOPMENT || DEBUG */

	simple_lock_init(&sched_available_cores_lock, 0);
}

void
sched_timebase_init(void)
{
	uint64_t abstime;

	clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime);
	sched_one_second_interval = abstime;

	SCHED(timebase_init)();
	sched_realtime_timebase_init();
}
495
496#if defined(CONFIG_SCHED_TIMESHARE_CORE)
497
void
sched_timeshare_init(void)
{
	/*
	 * Calculate the timeslicing quantum
	 * in us.
	 */
	if (default_preemption_rate < 1) {
		default_preemption_rate = DEFAULT_PREEMPTION_RATE;
	}
	std_quantum_us = (1000 * 1000) / default_preemption_rate;

	printf("standard timeslicing quantum is %d us\n", std_quantum_us);

	if (default_bg_preemption_rate < 1) {
		default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
	}
	bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate;

	printf("standard background quantum is %d us\n", bg_quantum_us);

	load_shift_init();
	preempt_pri_init();
	sched_tick = 0;
}
523
524void
525sched_set_max_unsafe_rt_quanta(int max)
526{
527 const uint32_t quantum_size = SCHED(initial_quantum_size)(THREAD_NULL);
528
529 max_unsafe_rt_computation = ((uint64_t)max) * quantum_size;
530
531 const int mult = safe_rt_multiplier <= 0 ? 2 : safe_rt_multiplier;
532 sched_safe_rt_duration = mult * ((uint64_t)max) * quantum_size;
533
534
535#if DEVELOPMENT || DEBUG
536 max_unsafe_rt_quanta = max;
537#else
538 /*
539 * On RELEASE kernels, this is only called on boot where
540 * max is already equal to max_unsafe_rt_quanta.
541 */
542 assert3s(max, ==, max_unsafe_rt_quanta);
543#endif
544}
545
546void
547sched_set_max_unsafe_fixed_quanta(int max)
548{
549 const uint32_t quantum_size = SCHED(initial_quantum_size)(THREAD_NULL);
550
551 max_unsafe_fixed_computation = ((uint64_t)max) * quantum_size;
552
553 const int mult = safe_fixed_multiplier <= 0 ? 2 : safe_fixed_multiplier;
554 sched_safe_fixed_duration = mult * ((uint64_t)max) * quantum_size;
555
556#if DEVELOPMENT || DEBUG
557 max_unsafe_fixed_quanta = max;
558#else
559 /*
560 * On RELEASE kernels, this is only called on boot where
561 * max is already equal to max_unsafe_fixed_quanta.
562 */
563 assert3s(max, ==, max_unsafe_fixed_quanta);
564#endif
565}
566
void
sched_timeshare_timebase_init(void)
{
	uint64_t abstime;
	uint32_t shift;

	/* standard timeslicing quantum */
	clock_interval_to_absolutetime_interval(
		std_quantum_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	std_quantum = (uint32_t)abstime;

	/* smallest remaining quantum (250 us) */
	clock_interval_to_absolutetime_interval(250, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	min_std_quantum = (uint32_t)abstime;

	/* quantum for background tasks */
	clock_interval_to_absolutetime_interval(
		bg_quantum_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	bg_quantum = (uint32_t)abstime;

	/* scheduler tick interval */
	clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT,
	    NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	sched_tick_interval = (uint32_t)abstime;

	/* timeshare load calculation interval & deadline initialization */
	clock_interval_to_absolutetime_interval(sched_load_compute_interval_us, NSEC_PER_USEC, &sched_load_compute_interval_abs);
	os_atomic_init(&sched_load_compute_deadline, sched_load_compute_interval_abs);

	/*
	 * Compute conversion factor from usage to
	 * timesharing priorities with 5/8 ** n aging.
	 */
	abstime = (abstime * 5) / 3;
	for (shift = 0; abstime > BASEPRI_DEFAULT; ++shift) {
		abstime >>= 1;
	}
	sched_fixed_shift = shift;

	for (uint32_t i = 0; i < TH_BUCKET_MAX; i++) {
		sched_pri_shifts[i] = INT8_MAX;
	}

	sched_set_max_unsafe_rt_quanta(max_unsafe_rt_quanta);
	sched_set_max_unsafe_fixed_quanta(max_unsafe_fixed_quanta);

	max_poll_computation = ((uint64_t)max_poll_quanta) * std_quantum;
	thread_depress_time = 1 * std_quantum;
	default_timeshare_computation = std_quantum / 2;
	default_timeshare_constraint = std_quantum;

#if __arm64__
	perfcontrol_failsafe_starvation_threshold = (2 * sched_tick_interval);
#endif /* __arm64__ */

	if (nonurgent_preemption_timer_us) {
		clock_interval_to_absolutetime_interval(nonurgent_preemption_timer_us, NSEC_PER_USEC, &abstime);
		nonurgent_preemption_timer_abs = abstime;
	}
}
631
632#endif /* CONFIG_SCHED_TIMESHARE_CORE */
633
void
pset_rt_init(processor_set_t pset)
{
	for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
		int i = pri - BASEPRI_RTQUEUES;
		rt_queue_pri_t *rqi = &pset->rt_runq.rt_queue_pri[i];
		queue_init(&rqi->pri_queue);
		rqi->pri_count = 0;
		rqi->pri_earliest_deadline = RT_DEADLINE_NONE;
		rqi->pri_constraint = RT_CONSTRAINT_NONE;
	}
	os_atomic_init(&pset->rt_runq.count, 0);
	os_atomic_init(&pset->rt_runq.earliest_deadline, RT_DEADLINE_NONE);
	os_atomic_init(&pset->rt_runq.constraint, RT_CONSTRAINT_NONE);
	os_atomic_init(&pset->rt_runq.ed_index, NOPRI);
	memset(&pset->rt_runq.runq_stats, 0, sizeof pset->rt_runq.runq_stats);
}
651
/* epsilon for comparing RT deadlines */
int rt_deadline_epsilon_us = 100;

int
sched_get_rt_deadline_epsilon(void)
{
	return rt_deadline_epsilon_us;
}

void
sched_set_rt_deadline_epsilon(int new_epsilon_us)
{
	rt_deadline_epsilon_us = new_epsilon_us;

	uint64_t abstime;
	clock_interval_to_absolutetime_interval(rt_deadline_epsilon_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && ((rt_deadline_epsilon_us == 0) || (uint32_t)abstime != 0));
	rt_deadline_epsilon = (uint32_t)abstime;
}

static void
sched_realtime_timebase_init(void)
{
	uint64_t abstime;

	/* smallest rt computation (50 us) */
	clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	min_rt_quantum = (uint32_t)abstime;

	/* maximum rt computation (50 ms) */
	clock_interval_to_absolutetime_interval(
		50, 1000 * NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	max_rt_quantum = (uint32_t)abstime;

	/* constraint threshold for sending backup IPIs (4 ms) */
	clock_interval_to_absolutetime_interval(4, NSEC_PER_MSEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	rt_constraint_threshold = (uint32_t)abstime;

	/* epsilon for comparing deadlines */
	sched_set_rt_deadline_epsilon(rt_deadline_epsilon_us);
}
696
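/* Spill checking is a no-op here; the arguments are accepted for interface compatibility. */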
697void
698sched_check_spill(processor_set_t pset, thread_t thread)
699{
700 (void)pset;
701 (void)thread;
702
703 return;
704}
705
bool
sched_thread_should_yield(processor_t processor, thread_t thread)
{
	(void)thread;

	return !SCHED(processor_queue_empty)(processor) || rt_runq_count(processor->processor_set) > 0;
}

/* Default implementations of .steal_thread_enabled */
bool
sched_steal_thread_DISABLED(processor_set_t pset)
{
	(void)pset;
	return false;
}

bool
sched_steal_thread_enabled(processor_set_t pset)
{
	return bit_count(pset->node->pset_map) > 1;
}
727
728#if defined(CONFIG_SCHED_TIMESHARE_CORE)
729
730/*
731 * Set up values for timeshare
732 * loading factors.
733 */
734static void
735load_shift_init(void)
736{
737 int8_t k, *p = sched_load_shifts;
738 uint32_t i, j;
739
740 uint32_t sched_decay_penalty = 1;
741
	if (PE_parse_boot_argn("sched_decay_penalty", &sched_decay_penalty, sizeof(sched_decay_penalty))) {
		kprintf("Overriding scheduler decay penalty %u\n", sched_decay_penalty);
	}

	if (PE_parse_boot_argn("sched_decay_usage_age_factor", &sched_decay_usage_age_factor, sizeof(sched_decay_usage_age_factor))) {
		kprintf("Overriding scheduler decay usage age factor %u\n", sched_decay_usage_age_factor);
	}
749
750 if (sched_decay_penalty == 0) {
751 /*
752 * There is no penalty for timeshare threads for using too much
753 * CPU, so set all load shifts to INT8_MIN. Even under high load,
754 * sched_pri_shift will be >INT8_MAX, and there will be no
755 * penalty applied to threads (nor will sched_usage be updated per
756 * thread).
757 */
758 for (i = 0; i < NRQS; i++) {
759 sched_load_shifts[i] = INT8_MIN;
760 }
761
762 return;
763 }
764
765 *p++ = INT8_MIN; *p++ = 0;
766
767 /*
768 * For a given system load "i", the per-thread priority
769 * penalty per quantum of CPU usage is ~2^k priority
770 * levels. "sched_decay_penalty" can cause more
771 * array entries to be filled with smaller "k" values
772 */
773 for (i = 2, j = 1 << sched_decay_penalty, k = 1; i < NRQS; ++k) {
774 for (j <<= 1; (i < j) && (i < NRQS); ++i) {
775 *p++ = k;
776 }
777 }
778}
779
780static void
781preempt_pri_init(void)
782{
783 bitmap_t *p = sched_preempt_pri;
784
	for (int i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i) {
		bitmap_set(p, i);
	}

	for (int i = BASEPRI_PREEMPT; i <= MAXPRI; ++i) {
		bitmap_set(p, i);
	}
792}
793
794#endif /* CONFIG_SCHED_TIMESHARE_CORE */
795
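/* Panic if a timestamp appears to precede this CPU's last dispatch time. */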
796void
797check_monotonic_time(uint64_t ctime)
798{
799 processor_t processor = current_processor();
800 uint64_t last_dispatch = processor->last_dispatch;
801
802 if (last_dispatch > ctime) {
803 panic("Non-monotonic time: last_dispatch at 0x%llx, ctime 0x%llx",
804 last_dispatch, ctime);
805 }
806}
807
808
809/*
810 * Thread wait timer expiration.
811 * Runs in timer interrupt context with interrupts disabled.
812 */
813void
814thread_timer_expire(void *p0, __unused void *p1)
815{
816 thread_t thread = (thread_t)p0;
817
818 assert_thread_magic(thread);
819
820 assert(ml_get_interrupts_enabled() == FALSE);
821
822 thread_lock(thread);
823
824 if (thread->wait_timer_armed) {
825 thread->wait_timer_armed = false;
826 clear_wait_internal(thread, THREAD_TIMED_OUT);
827 /* clear_wait_internal may have dropped and retaken the thread lock */
828 }
829
830 thread->wait_timer_active--;
831
832 thread_unlock(thread);
833}
834
835/*
836 * thread_unblock:
837 *
838 * Unblock thread on wake up.
839 *
840 * Returns TRUE if the thread should now be placed on the runqueue.
841 *
842 * Thread must be locked.
843 *
844 * Called at splsched().
845 */
846boolean_t
847thread_unblock(
848 thread_t thread,
849 wait_result_t wresult)
850{
851 boolean_t ready_for_runq = FALSE;
852 thread_t cthread = current_thread();
853 uint32_t new_run_count;
854 int old_thread_state;
855
856 /*
857 * Set wait_result.
858 */
859 thread->wait_result = wresult;
860
861 /*
862 * Cancel pending wait timer.
863 */
864 if (thread->wait_timer_armed) {
		if (timer_call_cancel(thread->wait_timer)) {
866 thread->wait_timer_active--;
867 }
868 thread->wait_timer_armed = false;
869 }
870
871 boolean_t aticontext, pidle;
872 ml_get_power_state(&aticontext, &pidle);
873
874 /*
875 * Update scheduling state: not waiting,
876 * set running.
877 */
878 old_thread_state = thread->state;
879 thread->state = (old_thread_state | TH_RUN) &
880 ~(TH_WAIT | TH_UNINT | TH_WAIT_REPORT | TH_WAKING);
881
882 if ((old_thread_state & TH_RUN) == 0) {
883 uint64_t ctime = mach_approximate_time();
884
885 check_monotonic_time(ctime);
886
887 thread->last_made_runnable_time = thread->last_basepri_change_time = ctime;
		timer_start(&thread->runnable_timer, ctime);
889
890 ready_for_runq = TRUE;
891
892 if (old_thread_state & TH_WAIT_REPORT) {
893 (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
894 }
895
896 /* Update the runnable thread count */
897 new_run_count = SCHED(run_count_incr)(thread);
898
899#if CONFIG_SCHED_AUTO_JOIN
900 if (aticontext == FALSE && work_interval_should_propagate(cthread, thread)) {
			work_interval_auto_join_propagate(cthread, thread);
902 }
903#endif /*CONFIG_SCHED_AUTO_JOIN */
904
905 } else {
906 /*
907 * Either the thread is idling in place on another processor,
908 * or it hasn't finished context switching yet.
909 */
910 assert((thread->state & TH_IDLE) == 0);
911 /*
912 * The run count is only dropped after the context switch completes
913 * and the thread is still waiting, so we should not run_incr here
914 */
915 new_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
916 }
917
918 /*
919 * Calculate deadline for real-time threads.
920 */
921 if (thread->sched_mode == TH_MODE_REALTIME) {
922 uint64_t ctime = mach_absolute_time();
923 thread->realtime.deadline = thread->realtime.constraint + ctime;
924 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SET_RT_DEADLINE) | DBG_FUNC_NONE,
925 (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
926 }
927
928 /*
929 * Clear old quantum, fail-safe computation, etc.
930 */
931 thread->quantum_remaining = 0;
932 thread->computation_metered = 0;
933 thread->reason = AST_NONE;
934 thread->block_hint = kThreadWaitNone;
935
936 /* Obtain power-relevant interrupt and "platform-idle exit" statistics.
937 * We also account for "double hop" thread signaling via
938 * the thread callout infrastructure.
939 * DRK: consider removing the callout wakeup counters in the future
940 * they're present for verification at the moment.
941 */
942
943 if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) {
944 DTRACE_SCHED2(iwakeup, struct thread *, thread, struct proc *, current_proc());
945
946 uint64_t ttd = current_processor()->timer_call_ttd;
947
948 if (ttd) {
949 if (ttd <= timer_deadline_tracking_bin_1) {
950 thread->thread_timer_wakeups_bin_1++;
951 } else if (ttd <= timer_deadline_tracking_bin_2) {
952 thread->thread_timer_wakeups_bin_2++;
953 }
954 }
955
		ledger_credit_thread(thread, thread->t_ledger,
		    task_ledgers.interrupt_wakeups, 1);
		if (pidle) {
			ledger_credit_thread(thread, thread->t_ledger,
			    task_ledgers.platform_idle_wakeups, 1);
		}
	} else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) {
963 /* TODO: what about an interrupt that does a wake taken on a callout thread? */
964 if (cthread->callout_woken_from_icontext) {
			ledger_credit_thread(thread, thread->t_ledger,
			    task_ledgers.interrupt_wakeups, 1);
			thread->thread_callout_interrupt_wakeups++;

			if (cthread->callout_woken_from_platform_idle) {
				ledger_credit_thread(thread, thread->t_ledger,
				    task_ledgers.platform_idle_wakeups, 1);
972 thread->thread_callout_platform_idle_wakeups++;
973 }
974
975 cthread->callout_woke_thread = TRUE;
976 }
977 }
978
979 if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) {
980 thread->callout_woken_from_icontext = !!aticontext;
981 thread->callout_woken_from_platform_idle = !!pidle;
982 thread->callout_woke_thread = FALSE;
983 }
984
985#if KPERF
986 if (ready_for_runq) {
		kperf_make_runnable(thread, aticontext);
988 }
989#endif /* KPERF */
990
991 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
992 MACHDBG_CODE(DBG_MACH_SCHED, MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE,
993 (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result,
994 sched_run_buckets[TH_BUCKET_RUN], 0);
995
996 DTRACE_SCHED2(wakeup, struct thread *, thread, struct proc *, current_proc());
997
998 return ready_for_runq;
999}
1000
1001/*
1002 * Routine: thread_allowed_for_handoff
1003 * Purpose:
1004 * Check if the thread is allowed for handoff operation
1005 * Conditions:
1006 * thread lock held, IPC locks may be held.
1007 * TODO: In future, do not allow handoff if threads have different cluster
1008 * recommendations.
1009 */
1010boolean_t
1011thread_allowed_for_handoff(
1012 thread_t thread)
1013{
1014 thread_t self = current_thread();
1015
1016 if (allow_direct_handoff &&
1017 thread->sched_mode == TH_MODE_REALTIME &&
1018 self->sched_mode == TH_MODE_REALTIME) {
1019 return TRUE;
1020 }
1021
1022 return FALSE;
1023}
1024
1025/*
1026 * Routine: thread_go
1027 * Purpose:
1028 * Unblock and dispatch thread.
1029 * Conditions:
1030 * thread lock held, IPC locks may be held.
1031 * thread must have been waiting
1032 */
1033void
1034thread_go(
1035 thread_t thread,
1036 wait_result_t wresult,
1037 bool try_handoff)
1038{
1039 thread_t self = current_thread();
1040
1041 assert_thread_magic(thread);
1042
1043 assert(thread->at_safe_point == FALSE);
1044 assert(thread->wait_event == NO_EVENT64);
1045 assert(waitq_is_null(thread->waitq));
1046
1047 assert(!(thread->state & (TH_TERMINATE | TH_TERMINATE2)));
1048 assert(thread->state & TH_WAIT);
1049
1050 if (thread->started) {
1051 assert(thread->state & TH_WAKING);
1052 }
1053
1054 thread_lock_assert(thread, LCK_ASSERT_OWNED);
1055
1056 assert(ml_get_interrupts_enabled() == false);
1057
1058 if (thread_unblock(thread, wresult)) {
1059#if SCHED_TRACE_THREAD_WAKEUPS
1060 backtrace(&thread->thread_wakeup_bt[0],
1061 (sizeof(thread->thread_wakeup_bt) / sizeof(uintptr_t)), NULL,
1062 NULL);
1063#endif /* SCHED_TRACE_THREAD_WAKEUPS */
1064 if (try_handoff && thread_allowed_for_handoff(thread)) {
1065 thread_reference(thread);
1066 assert(self->handoff_thread == NULL);
1067 self->handoff_thread = thread;
1068 } else {
			thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
1070 }
1071 }
1072}
1073
1074/*
1075 * Routine: thread_mark_wait_locked
1076 * Purpose:
1077 * Mark a thread as waiting. If, given the circumstances,
1078 * it doesn't want to wait (i.e. already aborted), then
1079 * indicate that in the return value.
1080 * Conditions:
1081 * at splsched() and thread is locked.
1082 */
1083__private_extern__
1084wait_result_t
1085thread_mark_wait_locked(
1086 thread_t thread,
1087 wait_interrupt_t interruptible_orig)
1088{
1089 boolean_t at_safe_point;
1090 wait_interrupt_t interruptible = interruptible_orig;
1091
1092 if (thread->state & TH_IDLE) {
1093 panic("Invalid attempt to wait while running the idle thread");
1094 }
1095
1096 assert(!(thread->state & (TH_WAIT | TH_WAKING | TH_IDLE | TH_UNINT | TH_TERMINATE2 | TH_WAIT_REPORT)));
1097
1098 /*
1099 * The thread may have certain types of interrupts/aborts masked
1100 * off. Even if the wait location says these types of interrupts
1101 * are OK, we have to honor mask settings (outer-scoped code may
1102 * not be able to handle aborts at the moment).
1103 */
1104 interruptible &= TH_OPT_INTMASK;
1105 if (interruptible > (thread->options & TH_OPT_INTMASK)) {
1106 interruptible = thread->options & TH_OPT_INTMASK;
1107 }
1108
1109 at_safe_point = (interruptible == THREAD_ABORTSAFE);
1110
1111 if (interruptible == THREAD_UNINT ||
1112 !(thread->sched_flags & TH_SFLAG_ABORT) ||
1113 (!at_safe_point &&
1114 (thread->sched_flags & TH_SFLAG_ABORTSAFELY))) {
1115 if (!(thread->state & TH_TERMINATE)) {
1116 DTRACE_SCHED(sleep);
1117 }
1118
1119 int state_bits = TH_WAIT;
1120 if (!interruptible) {
1121 state_bits |= TH_UNINT;
1122 }
1123 if (thread->sched_call) {
1124 wait_interrupt_t mask = THREAD_WAIT_NOREPORT_USER;
			if (is_kerneltask(get_threadtask(thread))) {
1126 mask = THREAD_WAIT_NOREPORT_KERNEL;
1127 }
1128 if ((interruptible_orig & mask) == 0) {
1129 state_bits |= TH_WAIT_REPORT;
1130 }
1131 }
1132 thread->state |= state_bits;
1133 thread->at_safe_point = at_safe_point;
1134
1135 /* TODO: pass this through assert_wait instead, have
1136 * assert_wait just take a struct as an argument */
1137 assert(!thread->block_hint);
1138 thread->block_hint = thread->pending_block_hint;
1139 thread->pending_block_hint = kThreadWaitNone;
1140
1141 return thread->wait_result = THREAD_WAITING;
1142 } else {
1143 if (thread->sched_flags & TH_SFLAG_ABORTSAFELY) {
1144 thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;
1145 }
1146 }
1147 thread->pending_block_hint = kThreadWaitNone;
1148
1149 return thread->wait_result = THREAD_INTERRUPTED;
1150}
1151
1152/*
1153 * Routine: thread_interrupt_level
1154 * Purpose:
1155 * Set the maximum interruptible state for the
1156 * current thread. The effective value of any
1157 * interruptible flag passed into assert_wait
1158 * will never exceed this.
1159 *
1160 * Useful for code that must not be interrupted,
1161 * but which calls code that doesn't know that.
1162 * Returns:
1163 * The old interrupt level for the thread.
1164 */
1165__private_extern__
1166wait_interrupt_t
1167thread_interrupt_level(
1168 wait_interrupt_t new_level)
1169{
1170 thread_t thread = current_thread();
1171 wait_interrupt_t result = thread->options & TH_OPT_INTMASK;
1172
1173 thread->options = (thread->options & ~TH_OPT_INTMASK) | (new_level & TH_OPT_INTMASK);
1174
1175 return result;
1176}
1177
1178/*
1179 * assert_wait:
1180 *
1181 * Assert that the current thread is about to go to
1182 * sleep until the specified event occurs.
1183 */
1184wait_result_t
1185assert_wait(
1186 event_t event,
1187 wait_interrupt_t interruptible)
1188{
1189 if (__improbable(event == NO_EVENT)) {
1190 panic("%s() called with NO_EVENT", __func__);
1191 }
1192
1193 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1194 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1195 VM_KERNEL_UNSLIDE_OR_PERM(event), 0, 0, 0, 0);
1196
1197 struct waitq *waitq;
1198 waitq = global_eventq(event);
	return waitq_assert_wait64(waitq, CAST_EVENT64_T(event), interruptible, TIMEOUT_WAIT_FOREVER);
1200}
1201
1202/*
1203 * assert_wait_queue:
1204 *
1205 * Return the global waitq for the specified event
1206 */
1207struct waitq *
1208assert_wait_queue(
1209 event_t event)
1210{
1211 return global_eventq(event);
1212}
1213
1214wait_result_t
1215assert_wait_timeout(
1216 event_t event,
1217 wait_interrupt_t interruptible,
1218 uint32_t interval,
1219 uint32_t scale_factor)
1220{
1221 thread_t thread = current_thread();
1222 wait_result_t wresult;
1223 uint64_t deadline;
1224 spl_t s;
1225
1226 if (__improbable(event == NO_EVENT)) {
1227 panic("%s() called with NO_EVENT", __func__);
1228 }
1229
1230 struct waitq *waitq;
1231 waitq = global_eventq(event);
1232
1233 s = splsched();
	waitq_lock(waitq);
1235
	clock_interval_to_deadline(interval, scale_factor, &deadline);
1237
1238 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1239 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1240 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1241
	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1243 interruptible,
1244 TIMEOUT_URGENCY_SYS_NORMAL,
1245 deadline, TIMEOUT_NO_LEEWAY,
1246 thread);
1247
	waitq_unlock(waitq);
1249 splx(s);
1250 return wresult;
1251}
1252
1253wait_result_t
1254assert_wait_timeout_with_leeway(
1255 event_t event,
1256 wait_interrupt_t interruptible,
1257 wait_timeout_urgency_t urgency,
1258 uint32_t interval,
1259 uint32_t leeway,
1260 uint32_t scale_factor)
1261{
1262 thread_t thread = current_thread();
1263 wait_result_t wresult;
1264 uint64_t deadline;
1265 uint64_t abstime;
1266 uint64_t slop;
1267 uint64_t now;
1268 spl_t s;
1269
1270 if (__improbable(event == NO_EVENT)) {
1271 panic("%s() called with NO_EVENT", __func__);
1272 }
1273
1274 now = mach_absolute_time();
	clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime);
1276 deadline = now + abstime;
1277
	clock_interval_to_absolutetime_interval(leeway, scale_factor, &slop);
1279
1280 struct waitq *waitq;
1281 waitq = global_eventq(event);
1282
1283 s = splsched();
	waitq_lock(waitq);
1285
1286 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1287 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1288 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1289
	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	    interruptible,
	    urgency, deadline, slop,
1293 thread);
1294
	waitq_unlock(waitq);
1296 splx(s);
1297 return wresult;
1298}
1299
1300wait_result_t
1301assert_wait_deadline(
1302 event_t event,
1303 wait_interrupt_t interruptible,
1304 uint64_t deadline)
1305{
1306 thread_t thread = current_thread();
1307 wait_result_t wresult;
1308 spl_t s;
1309
1310 if (__improbable(event == NO_EVENT)) {
1311 panic("%s() called with NO_EVENT", __func__);
1312 }
1313
1314 struct waitq *waitq;
1315 waitq = global_eventq(event);
1316
1317 s = splsched();
	waitq_lock(waitq);
1319
1320 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1321 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1322 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1323
	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1325 interruptible,
1326 TIMEOUT_URGENCY_SYS_NORMAL, deadline,
1327 TIMEOUT_NO_LEEWAY, thread);
	waitq_unlock(waitq);
1329 splx(s);
1330 return wresult;
1331}
1332
1333wait_result_t
1334assert_wait_deadline_with_leeway(
1335 event_t event,
1336 wait_interrupt_t interruptible,
1337 wait_timeout_urgency_t urgency,
1338 uint64_t deadline,
1339 uint64_t leeway)
1340{
1341 thread_t thread = current_thread();
1342 wait_result_t wresult;
1343 spl_t s;
1344
1345 if (__improbable(event == NO_EVENT)) {
1346 panic("%s() called with NO_EVENT", __func__);
1347 }
1348
1349 struct waitq *waitq;
1350 waitq = global_eventq(event);
1351
1352 s = splsched();
	waitq_lock(waitq);
1354
1355 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1356 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1357 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1358
	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1360 interruptible,
1361 urgency, deadline, leeway,
1362 thread);
	waitq_unlock(waitq);
1364 splx(s);
1365 return wresult;
1366}
1367
1368void
1369sched_cond_init(
1370 sched_cond_atomic_t *cond)
1371{
1372 os_atomic_init(cond, SCHED_COND_INIT);
1373}
1374
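/*
 * Assert a wait on the condition and clear the ACTIVE bit so that a
 * subsequent sched_cond_signal() knows it must wake this thread.  If a
 * wakeup was already posted, undo the wait assertion, acknowledge the
 * wakeup, and return immediately; otherwise block (possibly resuming in
 * 'continuation').
 */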
1375wait_result_t
1376sched_cond_wait_parameter(
1377 sched_cond_atomic_t *cond,
1378 wait_interrupt_t interruptible,
1379 thread_continue_t continuation,
1380 void *parameter)
1381{
	assert_wait((event_t) cond, interruptible);
1383 /* clear active bit to indicate future wakeups will have to unblock this thread */
1384 sched_cond_t new_state = (sched_cond_t) os_atomic_andnot(cond, SCHED_COND_ACTIVE, relaxed);
1385 if (__improbable(new_state & SCHED_COND_WAKEUP)) {
1386 /* a wakeup has been issued; undo wait assertion, ack the wakeup, and return */
1387 thread_t thread = current_thread();
1388 clear_wait(thread, THREAD_AWAKENED);
1389 sched_cond_ack(cond);
1390 return THREAD_AWAKENED;
1391 }
1392 return thread_block_parameter(continuation, parameter);
1393}
1394
1395wait_result_t
1396sched_cond_wait(
1397 sched_cond_atomic_t *cond,
1398 wait_interrupt_t interruptible,
1399 thread_continue_t continuation)
1400{
1401 return sched_cond_wait_parameter(cond, interruptible, continuation, NULL);
1402}
1403
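/*
 * Acknowledge a posted wakeup: atomically set ACTIVE and clear WAKEUP.
 * The acquire ordering pairs with the release in sched_cond_signal().
 */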
1404sched_cond_t
1405sched_cond_ack(
1406 sched_cond_atomic_t *cond)
1407{
1408 sched_cond_t new_cond = (sched_cond_t) os_atomic_xor(cond, SCHED_COND_ACTIVE | SCHED_COND_WAKEUP, acquire);
1409 assert(new_cond & SCHED_COND_ACTIVE);
1410 return new_cond;
1411}
1412
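/*
 * Post a wakeup for 'thread'.  The wakeup is delivered through the waitq
 * only if this is the first pending wakeup and the waiter has already
 * gone inactive (i.e. is blocked or about to block).
 */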
1413kern_return_t
1414sched_cond_signal(
1415 sched_cond_atomic_t *cond,
1416 thread_t thread)
1417{
1418 disable_preemption();
1419 sched_cond_t old_cond = (sched_cond_t) os_atomic_or_orig(cond, SCHED_COND_WAKEUP, release);
1420 if (!(old_cond & (SCHED_COND_WAKEUP | SCHED_COND_ACTIVE))) {
1421 /* this was the first wakeup to be issued AND the thread was inactive */
		thread_wakeup_thread((event_t) cond, thread);
1423 }
1424 enable_preemption();
1425 return KERN_SUCCESS;
1426}
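
/*
 * Illustrative usage of the sched_cond primitives (a hypothetical worker
 * loop, not a client in this file): the woken thread calls sched_cond_ack()
 * to consume the wakeup, drains its pending work, and then calls
 * sched_cond_wait() to block until the next sched_cond_signal().
 */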
1427
1428/*
1429 * thread_isoncpu:
1430 *
1431 * Return TRUE if a thread is running on a processor such that an AST
1432 * is needed to pull it out of userspace execution, or if executing in
1433 * the kernel, bring to a context switch boundary that would cause
1434 * thread state to be serialized in the thread PCB.
1435 *
1436 * Thread locked, returns the same way. While locked, fields
1437 * like "state" cannot change. "runq" can change only from set to unset.
1438 */
1439static inline boolean_t
1440thread_isoncpu(thread_t thread)
1441{
1442 /* Not running or runnable */
1443 if (!(thread->state & TH_RUN)) {
1444 return FALSE;
1445 }
1446
1447 /* Waiting on a runqueue, not currently running */
1448 /* TODO: This is invalid - it can get dequeued without thread lock, but not context switched. */
1449 if (thread_get_runq(thread) != PROCESSOR_NULL) {
1450 return FALSE;
1451 }
1452
1453 /*
1454 * Thread does not have a stack yet
1455 * It could be on the stack alloc queue or preparing to be invoked
1456 */
1457 if (!thread->kernel_stack) {
1458 return FALSE;
1459 }
1460
1461 /*
1462 * Thread must be running on a processor, or
1463 * about to run, or just did run. In all these
1464 * cases, an AST to the processor is needed
1465 * to guarantee that the thread is kicked out
1466 * of userspace and the processor has
1467 * context switched (and saved register state).
1468 */
1469 return TRUE;
1470}
1471
1472/*
1473 * thread_stop:
1474 *
1475 * Force a preemption point for a thread and wait
1476 * for it to stop running on a CPU. If a stronger
1477 * guarantee is requested, wait until no longer
1478 * runnable. Arbitrates access among
1479 * multiple stop requests. (released by unstop)
1480 *
1481 * The thread must enter a wait state and stop via a
1482 * separate means.
1483 *
1484 * Returns FALSE if interrupted.
1485 */
1486boolean_t
1487thread_stop(
1488 thread_t thread,
1489 boolean_t until_not_runnable)
1490{
1491 wait_result_t wresult;
1492 spl_t s = splsched();
1493 boolean_t oncpu;
1494
1495 wake_lock(thread);
1496 thread_lock(thread);
1497
1498 while (thread->state & TH_SUSP) {
1499 thread->wake_active = TRUE;
1500 thread_unlock(thread);
1501
		wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1503 wake_unlock(thread);
1504 splx(s);
1505
1506 if (wresult == THREAD_WAITING) {
1507 wresult = thread_block(THREAD_CONTINUE_NULL);
1508 }
1509
1510 if (wresult != THREAD_AWAKENED) {
1511 return FALSE;
1512 }
1513
1514 s = splsched();
1515 wake_lock(thread);
1516 thread_lock(thread);
1517 }
1518
1519 thread->state |= TH_SUSP;
1520
1521 while ((oncpu = thread_isoncpu(thread)) ||
1522 (until_not_runnable && (thread->state & TH_RUN))) {
1523 processor_t processor;
1524
1525 if (oncpu) {
1526 assert(thread->state & TH_RUN);
1527 processor = thread->chosen_processor;
1528 cause_ast_check(processor);
1529 }
1530
1531 thread->wake_active = TRUE;
1532 thread_unlock(thread);
1533
		wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1535 wake_unlock(thread);
1536 splx(s);
1537
1538 if (wresult == THREAD_WAITING) {
1539 wresult = thread_block(THREAD_CONTINUE_NULL);
1540 }
1541
1542 if (wresult != THREAD_AWAKENED) {
1543 thread_unstop(thread);
1544 return FALSE;
1545 }
1546
1547 s = splsched();
1548 wake_lock(thread);
1549 thread_lock(thread);
1550 }
1551
1552 thread_unlock(thread);
1553 wake_unlock(thread);
1554 splx(s);
1555
1556 /*
1557 * We return with the thread unlocked. To prevent it from
1558 * transitioning to a runnable state (or from TH_RUN to
1559 * being on the CPU), the caller must ensure the thread
1560 * is stopped via an external means (such as an AST)
1561 */
1562
1563 return TRUE;
1564}
1565
1566/*
1567 * thread_unstop:
1568 *
1569 * Release a previous stop request and set
1570 * the thread running if appropriate.
1571 *
1572 * Use only after a successful stop operation.
1573 */
1574void
1575thread_unstop(
1576 thread_t thread)
1577{
1578 spl_t s = splsched();
1579
1580 wake_lock(thread);
1581 thread_lock(thread);
1582
1583 assert((thread->state & (TH_RUN | TH_WAIT | TH_SUSP)) != TH_SUSP);
1584
1585 if (thread->state & TH_SUSP) {
1586 thread->state &= ~TH_SUSP;
1587
1588 if (thread->wake_active) {
1589 thread->wake_active = FALSE;
1590 thread_unlock(thread);
1591
1592 thread_wakeup(&thread->wake_active);
1593 wake_unlock(thread);
1594 splx(s);
1595
1596 return;
1597 }
1598 }
1599
1600 thread_unlock(thread);
1601 wake_unlock(thread);
1602 splx(s);
1603}
1604
1605/*
1606 * thread_wait:
1607 *
1608 * Wait for a thread to stop running. (non-interruptible)
1609 *
1610 */
1611void
1612thread_wait(
1613 thread_t thread,
1614 boolean_t until_not_runnable)
1615{
1616 wait_result_t wresult;
1617 boolean_t oncpu;
1618 processor_t processor;
1619 spl_t s = splsched();
1620
1621 wake_lock(thread);
1622 thread_lock(thread);
1623
1624 /*
1625 * Wait until not running on a CPU. If stronger requirement
1626 * desired, wait until not runnable. Assumption: if thread is
1627 * on CPU, then TH_RUN is set, so we're not waiting in any case
1628 * where the original, pure "TH_RUN" check would have let us
1629 * finish.
1630 */
1631 while ((oncpu = thread_isoncpu(thread)) ||
1632 (until_not_runnable && (thread->state & TH_RUN))) {
1633 if (oncpu) {
1634 assert(thread->state & TH_RUN);
1635 processor = thread->chosen_processor;
1636 cause_ast_check(processor);
1637 }
1638
1639 thread->wake_active = TRUE;
1640 thread_unlock(thread);
1641
		wresult = assert_wait(&thread->wake_active, THREAD_UNINT);
1643 wake_unlock(thread);
1644 splx(s);
1645
1646 if (wresult == THREAD_WAITING) {
1647 thread_block(THREAD_CONTINUE_NULL);
1648 }
1649
1650 s = splsched();
1651 wake_lock(thread);
1652 thread_lock(thread);
1653 }
1654
1655 thread_unlock(thread);
1656 wake_unlock(thread);
1657 splx(s);
1658}
1659
1660/*
1661 * Routine: clear_wait_internal
1662 *
1663 * Clear the wait condition for the specified thread.
1664 * Start the thread executing if that is appropriate.
1665 * Arguments:
1666 * thread thread to awaken
1667 * result Wakeup result the thread should see
1668 * Conditions:
1669 * At splsched
1670 * the thread is locked.
1671 * Returns:
1672 * KERN_SUCCESS thread was rousted out a wait
1673 * KERN_FAILURE thread was waiting but could not be rousted
1674 * KERN_NOT_WAITING thread was not waiting
1675 */
1676__private_extern__ kern_return_t
1677clear_wait_internal(
1678 thread_t thread,
1679 wait_result_t wresult)
1680{
1681 waitq_t waitq = thread->waitq;
1682
1683 if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT)) {
1684 return KERN_FAILURE;
1685 }
1686
1687 /*
1688 * Check that the thread is waiting and not waking, as a waking thread
1689 * has already cleared its waitq, and is destined to be go'ed, don't
1690 * need to do it again.
1691 */
1692 if ((thread->state & (TH_WAIT | TH_TERMINATE | TH_WAKING)) != TH_WAIT) {
1693 assert(waitq_is_null(thread->waitq));
1694 return KERN_NOT_WAITING;
1695 }
1696
1697 /* may drop and retake the thread lock */
	if (!waitq_is_null(waitq) && !waitq_pull_thread_locked(waitq, thread)) {
1699 return KERN_NOT_WAITING;
1700 }
1701
1702 thread_go(thread, wresult, /* handoff */ false);
1703
1704 return KERN_SUCCESS;
1705}
1706
1707
1708/*
1709 * clear_wait:
1710 *
1711 * Clear the wait condition for the specified thread. Start the thread
1712 * executing if that is appropriate.
1713 *
1714 * parameters:
1715 * thread thread to awaken
1716 * result Wakeup result the thread should see
1717 */
1718kern_return_t
1719clear_wait(
1720 thread_t thread,
1721 wait_result_t result)
1722{
1723 kern_return_t ret;
1724 spl_t s;
1725
1726 s = splsched();
1727 thread_lock(thread);
1728
	ret = clear_wait_internal(thread, result);
1730
1731 if (thread == current_thread()) {
1732 /*
1733 * The thread must be ready to wait again immediately
1734 * after clearing its own wait.
1735 */
1736 assert((thread->state & TH_WAKING) == 0);
1737 }
1738
1739 thread_unlock(thread);
1740 splx(s);
1741 return ret;
1742}
1743
1744
1745/*
1746 * thread_wakeup_prim:
1747 *
1748 * Common routine for thread_wakeup, thread_wakeup_with_result,
1749 * and thread_wakeup_one.
1750 *
1751 */
1752kern_return_t
1753thread_wakeup_prim(
1754 event_t event,
1755 boolean_t one_thread,
1756 wait_result_t result)
1757{
1758 if (__improbable(event == NO_EVENT)) {
1759 panic("%s() called with NO_EVENT", __func__);
1760 }
1761
1762 struct waitq *wq = global_eventq(event);
1763
	if (one_thread) {
		return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), result, WAITQ_WAKEUP_DEFAULT);
	} else {
		return waitq_wakeup64_all(wq, CAST_EVENT64_T(event), result, WAITQ_WAKEUP_DEFAULT);
	}
1769}
1770
1771/*
1772 * Wakeup a specified thread if and only if it's waiting for this event
1773 */
1774kern_return_t
1775thread_wakeup_thread(
1776 event_t event,
1777 thread_t thread)
1778{
1779 if (__improbable(event == NO_EVENT)) {
1780 panic("%s() called with NO_EVENT", __func__);
1781 }
1782
1783 if (__improbable(thread == THREAD_NULL)) {
1784 panic("%s() called with THREAD_NULL", __func__);
1785 }
1786
1787 struct waitq *wq = global_eventq(event);
1788
	return waitq_wakeup64_thread(wq, CAST_EVENT64_T(event), thread, THREAD_AWAKENED);
1790}
1791
1792/*
1793 * Wakeup a thread waiting on an event and promote it to a priority.
1794 *
1795 * Requires woken thread to un-promote itself when done.
1796 */
1797kern_return_t
1798thread_wakeup_one_with_pri(
1799 event_t event,
1800 int priority)
1801{
1802 if (__improbable(event == NO_EVENT)) {
1803 panic("%s() called with NO_EVENT", __func__);
1804 }
1805
1806 struct waitq *wq = global_eventq(event);
1807
	return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
1809}
1810
1811/*
1812 * Wakeup a thread waiting on an event,
1813 * promote it to a priority,
1814 * and return a reference to the woken thread.
1815 *
1816 * Requires woken thread to un-promote itself when done.
1817 */
1818thread_t
1819thread_wakeup_identify(event_t event,
1820 int priority)
1821{
1822 if (__improbable(event == NO_EVENT)) {
1823 panic("%s() called with NO_EVENT", __func__);
1824 }
1825
1826 struct waitq *wq = global_eventq(event);
1827
	return waitq_wakeup64_identify(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
1829}
1830
1831/*
1832 * thread_bind:
1833 *
1834 * Force the current thread to execute on the specified processor.
1835 * Takes effect after the next thread_block().
1836 *
1837 * Returns the previous binding. PROCESSOR_NULL means
1838 * not bound.
1839 *
1840 * XXX - DO NOT export this to users - XXX
1841 */
1842processor_t
1843thread_bind(
1844 processor_t processor)
1845{
1846 thread_t self = current_thread();
1847 processor_t prev;
1848 spl_t s;
1849
1850 s = splsched();
1851 thread_lock(self);
1852
	prev = thread_bind_internal(self, processor);
1854
1855 thread_unlock(self);
1856 splx(s);
1857
1858 return prev;
1859}
1860
1861void
1862thread_bind_during_wakeup(thread_t thread, processor_t processor)
1863{
1864 assert(!ml_get_interrupts_enabled());
1865 assert((thread->state & (TH_WAIT | TH_WAKING)) == (TH_WAIT | TH_WAKING));
1866#if MACH_ASSERT
1867 thread_lock_assert(thread, LCK_ASSERT_OWNED);
1868#endif
1869
1870 if (thread->bound_processor != processor) {
1871 thread_bind_internal(thread, processor);
1872 }
1873}
1874
1875void
1876thread_unbind_after_queue_shutdown(
1877 thread_t thread,
1878 processor_t processor __assert_only)
1879{
1880 assert(!ml_get_interrupts_enabled());
1881
1882 thread_lock(thread);
1883
1884 if (thread->bound_processor) {
1885 bool removed;
1886
1887 assert(thread->bound_processor == processor);
1888
1889 removed = thread_run_queue_remove(thread);
1890 /*
1891 * we can always unbind even if we didn't really remove the
1892 * thread from the runqueue
1893 */
1894 thread_bind_internal(thread, PROCESSOR_NULL);
1895 if (removed) {
			thread_run_queue_reinsert(thread, SCHED_TAILQ);
1897 }
1898 }
1899
1900 thread_unlock(thread);
1901}
1902
1903/*
1904 * thread_bind_internal:
1905 *
1906 * If the specified thread is not the current thread, and it is currently
1907 * running on another CPU, a remote AST must be sent to that CPU to cause
1908 * the thread to migrate to its bound processor. Otherwise, the migration
1909 * will occur at the next quantum expiration or blocking point.
1910 *
 * When the thread is the current thread, an explicit thread_block() should
1912 * be used to force the current processor to context switch away and
1913 * let the thread migrate to the bound processor.
1914 *
1915 * Thread must be locked, and at splsched.
1916 */
1917
1918static processor_t
1919thread_bind_internal(
1920 thread_t thread,
1921 processor_t processor)
1922{
1923 processor_t prev;
1924
1925 /* <rdar://problem/15102234> */
1926 assert(thread->sched_pri < BASEPRI_RTQUEUES);
1927 /* A thread can't be bound if it's sitting on a (potentially incorrect) runqueue */
1928 thread_assert_runq_null(thread);
1929
1930 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_THREAD_BIND),
1931 thread_tid(thread), processor ? processor->cpu_id : ~0ul, 0, 0, 0);
1932
1933 prev = thread->bound_processor;
1934 thread->bound_processor = processor;
1935
1936 return prev;
1937}
1938
1939/*
1940 * thread_vm_bind_group_add:
1941 *
1942 * The "VM bind group" is a special mechanism to mark a collection
1943 * of threads from the VM subsystem that, in general, should be scheduled
1944 * with only one CPU of parallelism. To accomplish this, we initially
1945 * bind all the threads to the master processor, which has the effect
1946 * that only one of the threads in the group can execute at once, including
1947 * preempting lower-priority threads in the group. Future
1948 * implementations may use more dynamic mechanisms to prevent the collection
1949 * of VM threads from using more CPU time than desired.
1950 *
1951 * The current implementation can result in priority inversions where
1952 * compute-bound priority 95 or realtime threads that happen to have
1953 * landed on the master processor prevent the VM threads from running.
1954 * When this situation is detected, we unbind the threads for one
1955 * scheduler tick to allow the scheduler to run the threads on
1956 * additional CPUs, before restoring the binding (assuming high latency
1957 * is no longer a problem).
1958 */
1959
1960/*
1961 * The current max is provisioned for:
1962 * vm_compressor_swap_trigger_thread (92)
1963 * 2 x vm_pageout_iothread_internal (92) when vm_restricted_to_single_processor==TRUE
1964 * vm_pageout_continue (92)
1965 * memorystatus_thread (95)
1966 */
1967#define MAX_VM_BIND_GROUP_COUNT (5)
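/*
 * The provisioning list above totals 1 + 2 + 1 + 1 = 5 threads, which is
 * why MAX_VM_BIND_GROUP_COUNT is 5.
 */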
1968decl_simple_lock_data(static, sched_vm_group_list_lock);
1969static thread_t sched_vm_group_thread_list[MAX_VM_BIND_GROUP_COUNT];
1970static int sched_vm_group_thread_count;
1971static boolean_t sched_vm_group_temporarily_unbound = FALSE;
1972
1973void
1974thread_vm_bind_group_add(void)
1975{
1976 thread_t self = current_thread();
1977
1978 thread_reference(self);
1979 self->options |= TH_OPT_SCHED_VM_GROUP;
1980
1981 simple_lock(&sched_vm_group_list_lock, LCK_GRP_NULL);
1982 assert(sched_vm_group_thread_count < MAX_VM_BIND_GROUP_COUNT);
1983 sched_vm_group_thread_list[sched_vm_group_thread_count++] = self;
1984 simple_unlock(&sched_vm_group_list_lock);
1985
1986 thread_bind(master_processor);
1987
1988 /* Switch to bound processor if not already there */
1989 thread_block(THREAD_CONTINUE_NULL);
1990}
1991
1992static void
1993sched_vm_group_maintenance(void)
1994{
1995 uint64_t ctime = mach_absolute_time();
1996 uint64_t longtime = ctime - sched_tick_interval;
1997 int i;
1998 spl_t s;
1999 boolean_t high_latency_observed = FALSE;
2000 boolean_t runnable_and_not_on_runq_observed = FALSE;
2001 boolean_t bind_target_changed = FALSE;
2002 processor_t bind_target = PROCESSOR_NULL;
2003
2004 /* Make sure nobody attempts to add new threads while we are enumerating them */
2005 simple_lock(&sched_vm_group_list_lock, LCK_GRP_NULL);
2006
2007 s = splsched();
2008
2009 for (i = 0; i < sched_vm_group_thread_count; i++) {
2010 thread_t thread = sched_vm_group_thread_list[i];
2011 assert(thread != THREAD_NULL);
2012 thread_lock(thread);
2013 if ((thread->state & (TH_RUN | TH_WAIT)) == TH_RUN) {
2014 if (thread_get_runq(thread) != PROCESSOR_NULL && thread->last_made_runnable_time < longtime) {
2015 high_latency_observed = TRUE;
2016 } else if (thread_get_runq(thread) == PROCESSOR_NULL) {
2017 /* There are some cases where a thread may be transitioning that also fall into this case */
2018 runnable_and_not_on_runq_observed = TRUE;
2019 }
2020 }
2021 thread_unlock(thread);
2022
2023 if (high_latency_observed && runnable_and_not_on_runq_observed) {
2024 /* All the things we are looking for are true, stop looking */
2025 break;
2026 }
2027 }
2028
2029 splx(s);
2030
2031 if (sched_vm_group_temporarily_unbound) {
2032 /* If we turned off binding, make sure everything is OK before rebinding */
2033 if (!high_latency_observed) {
2034 /* rebind */
2035 bind_target_changed = TRUE;
2036 bind_target = master_processor;
2037 sched_vm_group_temporarily_unbound = FALSE; /* might be reset to TRUE if change cannot be completed */
2038 }
2039 } else {
2040 /*
2041 * Check if we're in a bad state, which is defined by high
2042 * latency with no core currently executing a thread. If a
2043 * single thread is making progress on a CPU, that means the
2044 * binding concept to reduce parallelism is working as
2045 * designed.
2046 */
2047 if (high_latency_observed && !runnable_and_not_on_runq_observed) {
2048 /* unbind */
2049 bind_target_changed = TRUE;
2050 bind_target = PROCESSOR_NULL;
2051 sched_vm_group_temporarily_unbound = TRUE;
2052 }
2053 }
2054
2055 if (bind_target_changed) {
2056 s = splsched();
2057 for (i = 0; i < sched_vm_group_thread_count; i++) {
2058 thread_t thread = sched_vm_group_thread_list[i];
2059 boolean_t removed;
2060 assert(thread != THREAD_NULL);
2061
2062 thread_lock(thread);
2063 removed = thread_run_queue_remove(thread);
2064 if (removed || ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT)) {
2065 thread_bind_internal(thread, bind_target);
2066 } else {
2067 /*
2068 * Thread was in the middle of being context-switched-to,
2069 * or was in the process of blocking. To avoid switching the bind
2070 * state out mid-flight, defer the change if possible.
2071 */
2072 if (bind_target == PROCESSOR_NULL) {
2073 thread_bind_internal(thread, bind_target);
2074 } else {
2075 sched_vm_group_temporarily_unbound = TRUE; /* next pass will try again */
2076 }
2077 }
2078
2079 if (removed) {
2080 thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
2081 }
2082 thread_unlock(thread);
2083 }
2084 splx(s);
2085 }
2086
2087 simple_unlock(&sched_vm_group_list_lock);
2088}
2089
2090#if defined(__x86_64__)
2091#define SCHED_AVOID_CPU0 1
2092#else
2093#define SCHED_AVOID_CPU0 0
2094#endif
2095
2096int sched_allow_rt_smt = 1;
2097int sched_avoid_cpu0 = SCHED_AVOID_CPU0;
2098int sched_allow_rt_steal = 1;
2099int sched_backup_cpu_timeout_count = 5; /* The maximum number of 10us delays to wait before using a backup cpu */
2100
2101int sched_rt_n_backup_processors = SCHED_DEFAULT_BACKUP_PROCESSORS;
2102
2103int
2104sched_get_rt_n_backup_processors(void)
2105{
2106 return sched_rt_n_backup_processors;
2107}
2108
2109void
2110sched_set_rt_n_backup_processors(int n)
2111{
2112 if (n < 0) {
2113 n = 0;
2114 } else if (n > SCHED_MAX_BACKUP_PROCESSORS) {
2115 n = SCHED_MAX_BACKUP_PROCESSORS;
2116 }
2117
2118 sched_rt_n_backup_processors = n;
2119}
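
/*
 * Example of the clamping above (values are illustrative):
 *
 *	sched_set_rt_n_backup_processors(-3);      -> stored as 0
 *	sched_set_rt_n_backup_processors(INT_MAX); -> stored as SCHED_MAX_BACKUP_PROCESSORS
 *	sched_set_rt_n_backup_processors(2);       -> stored as 2
 */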
2120
2121int sched_rt_runq_strict_priority = false;
2122
2123inline static processor_set_t
2124change_locked_pset(processor_set_t current_pset, processor_set_t new_pset)
2125{
2126 if (current_pset != new_pset) {
2127 pset_unlock(current_pset);
2128 pset_lock(new_pset);
2129 }
2130
2131 return new_pset;
2132}
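
/*
 * Typical usage of change_locked_pset(): the caller holds current_pset locked
 * and wants to continue holding new_pset locked instead, e.g.
 *
 *	pset = change_locked_pset(pset, nset);
 *
 * When the psets differ, the old lock is dropped before the new one is taken,
 * so state observed under the old lock must not be relied upon afterwards.
 */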
2133
2134/*
2135 * Invoked prior to idle entry to determine if, on SMT capable processors, an SMT
2136 * rebalancing opportunity exists when a core is (instantaneously) idle, but
2137 * other SMT-capable cores may be over-committed. TODO: some possible negatives:
2138 * - IPI thrash if this core does not remain idle following the load balancing ASTs
2139 * - Idle "thrash", when IPI issue is followed by idle entry/core power down
2140 * followed by a wakeup shortly thereafter.
2141 */
2142
2143#if (DEVELOPMENT || DEBUG)
2144int sched_smt_balance = 1;
2145#endif
2146
2147/* Invoked with pset locked, returns with pset unlocked */
2148bool
2149sched_SMT_balance(processor_t cprocessor, processor_set_t cpset)
2150{
2151 processor_t ast_processor = NULL;
2152
2153#if (DEVELOPMENT || DEBUG)
2154 if (__improbable(sched_smt_balance == 0)) {
2155 goto smt_balance_exit;
2156 }
2157#endif
2158
2159 assert(cprocessor == current_processor());
2160 if (cprocessor->is_SMT == FALSE) {
2161 goto smt_balance_exit;
2162 }
2163
2164 processor_t sib_processor = cprocessor->processor_secondary ? cprocessor->processor_secondary : cprocessor->processor_primary;
2165
2166 /* Determine if both this processor and its sibling are idle,
2167 * indicating an SMT rebalancing opportunity.
2168 */
2169 if (sib_processor->state != PROCESSOR_IDLE) {
2170 goto smt_balance_exit;
2171 }
2172
2173 processor_t sprocessor;
2174
2175 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
2176 uint64_t running_secondary_map = (cpset->cpu_state_map[PROCESSOR_RUNNING] &
2177 ~cpset->primary_map);
2178 for (int cpuid = lsb_first(running_secondary_map); cpuid >= 0; cpuid = lsb_next(running_secondary_map, cpuid)) {
2179 sprocessor = processor_array[cpuid];
2180 if ((sprocessor->processor_primary->state == PROCESSOR_RUNNING) &&
2181 (sprocessor->current_pri < BASEPRI_RTQUEUES)) {
2182 ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
2183 if (ipi_type != SCHED_IPI_NONE) {
2184 assert(sprocessor != cprocessor);
2185 ast_processor = sprocessor;
2186 break;
2187 }
2188 }
2189 }
2190
2191smt_balance_exit:
2192 pset_unlock(cpset);
2193
2194 if (ast_processor) {
2195 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_SMT_BALANCE), ast_processor->cpu_id, ast_processor->state, ast_processor->processor_primary->state, 0, 0);
2196 sched_ipi_perform(ast_processor, ipi_type);
2197 }
2198 return false;
2199}
2200
2201static cpumap_t
2202pset_available_cpumap(processor_set_t pset)
2203{
2204 return pset->cpu_available_map & pset->recommended_bitmask;
2205}
2206
2207int
2208pset_available_cpu_count(processor_set_t pset)
2209{
2210 return bit_count(pset_available_cpumap(pset));
2211}
2212
2213bool
2214pset_is_recommended(processor_set_t pset)
2215{
2216 if (!pset) {
2217 return false;
2218 }
2219 return pset_available_cpu_count(pset) > 0;
2220}
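
/*
 * Worked example for the helpers above (bit values are illustrative): with
 * cpu_available_map == 0b1111 and recommended_bitmask == 0b0111,
 * pset_available_cpumap() returns 0b0111, pset_available_cpu_count() returns 3,
 * and pset_is_recommended() returns true.
 */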
2221
2222static cpumap_t
2223pset_available_but_not_running_cpumap(processor_set_t pset)
2224{
2225 return (pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
2226 pset->recommended_bitmask;
2227}
2228
2229bool
2230pset_has_stealable_threads(processor_set_t pset)
2231{
2232 pset_assert_locked(pset);
2233
2234 cpumap_t avail_map = pset_available_but_not_running_cpumap(pset);
2235 /*
2236 * Secondary CPUs never steal, so allow stealing of threads if there are more threads than
2237 * available primary CPUs
2238 */
2239 avail_map &= pset->primary_map;
2240
2241 return (pset->pset_runq.count > 0) && ((pset->pset_runq.count + rt_runq_count(pset)) > bit_count(avail_map));
2242}
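
/*
 * Worked example (illustrative numbers): with 2 idle or dispatching recommended
 * primaries, pset_runq.count == 3 and rt_runq_count(pset) == 1, the check above
 * evaluates as (3 > 0) && ((3 + 1) > 2), so the pset reports stealable threads.
 */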
2243
2244static cpumap_t
2245pset_available_but_not_running_rt_threads_cpumap(processor_set_t pset)
2246{
2247 cpumap_t avail_map = pset_available_cpumap(pset);
2248 if (!sched_allow_rt_smt) {
2249 /*
2250 * Secondary CPUs are not allowed to run RT threads, so
2251 * only primary CPUs should be included
2252 */
2253 avail_map &= pset->primary_map;
2254 }
2255
2256 return avail_map & ~pset->realtime_map;
2257}
2258
2259static bool
2260pset_needs_a_followup_IPI(processor_set_t pset)
2261{
2262 int nbackup_cpus = 0;
2263
2264 if (rt_runq_is_low_latency(pset)) {
2265 nbackup_cpus = sched_rt_n_backup_processors;
2266 }
2267
2268 int rt_rq_count = rt_runq_count(pset);
2269
2270 return (rt_rq_count > 0) && ((rt_rq_count + nbackup_cpus - bit_count(pset->pending_AST_URGENT_cpu_mask)) > 0);
2271}
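
/*
 * Worked example (illustrative numbers): with 2 runnable RT threads, a
 * low-latency runqueue provisioning 1 backup processor, and 1 CPU already
 * holding a pending urgent AST, the check above evaluates as
 * (2 > 0) && ((2 + 1 - 1) > 0), so a followup IPI is warranted.
 */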
2272
2273bool
2274pset_has_stealable_rt_threads(processor_set_t pset)
2275{
2276 pset_node_t node = pset->node;
2277 if (bit_count(node->pset_map) == 1) {
2278 return false;
2279 }
2280
2281 cpumap_t avail_map = pset_available_but_not_running_rt_threads_cpumap(pset);
2282
2283 return rt_runq_count(pset) > bit_count(avail_map);
2284}
2285
2286static void
2287pset_update_rt_stealable_state(processor_set_t pset)
2288{
2289 if (pset_has_stealable_rt_threads(pset)) {
2290 pset->stealable_rt_threads_earliest_deadline = rt_runq_earliest_deadline(pset);
2291 } else {
2292 pset->stealable_rt_threads_earliest_deadline = RT_DEADLINE_NONE;
2293 }
2294}
2295
2296static void
2297clear_pending_AST_bits(processor_set_t pset, processor_t processor, __kdebug_only const int trace_point_number)
2298{
2299 /* Acknowledge any pending IPIs here with pset lock held */
2300 pset_assert_locked(pset);
2301 if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
2302 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END,
2303 processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, trace_point_number);
2304 }
2305 bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
2306
2307#if defined(CONFIG_SCHED_DEFERRED_AST)
2308 bit_clear(pset->pending_deferred_AST_cpu_mask, processor->cpu_id);
2309#endif
2310}
2311
2312/*
2313 * Called with pset locked, on a processor that is committing to run a new thread
2314 * Will transition an idle or dispatching processor to running as it picks up
2315 * the first new thread from the idle thread.
2316 */
2317static void
2318pset_commit_processor_to_new_thread(processor_set_t pset, processor_t processor, thread_t new_thread)
2319{
2320 pset_assert_locked(pset);
2321
2322 if (processor->state == PROCESSOR_DISPATCHING || processor->state == PROCESSOR_IDLE) {
2323 assert(current_thread() == processor->idle_thread);
2324
2325 /*
2326 * Dispatching processor is now committed to running new_thread,
2327 * so change its state to PROCESSOR_RUNNING.
2328 */
2329 pset_update_processor_state(pset, processor, PROCESSOR_RUNNING);
2330 } else {
2331 assert((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_SHUTDOWN));
2332 }
2333
2334 processor_state_update_from_thread(processor, new_thread, true);
2335
2336 if (new_thread->sched_pri >= BASEPRI_RTQUEUES) {
2337 bit_set(pset->realtime_map, processor->cpu_id);
2338 } else {
2339 bit_clear(pset->realtime_map, processor->cpu_id);
2340 }
2341 pset_update_rt_stealable_state(pset);
2342
2343 pset_node_t node = pset->node;
2344
2345 if (bit_count(node->pset_map) == 1) {
2346 /* Node has only a single pset, so skip node pset map updates */
2347 return;
2348 }
2349
2350 cpumap_t avail_map = pset_available_cpumap(pset);
2351
2352 if (new_thread->sched_pri >= BASEPRI_RTQUEUES) {
2353 if ((avail_map & pset->realtime_map) == avail_map) {
2354 /* No more non-RT CPUs in this pset */
2355 atomic_bit_clear(&node->pset_non_rt_map, pset->pset_id, memory_order_relaxed);
2356 }
2357 avail_map &= pset->primary_map;
2358 if ((avail_map & pset->realtime_map) == avail_map) {
2359 /* No more non-RT primary CPUs in this pset */
2360 atomic_bit_clear(&node->pset_non_rt_primary_map, pset->pset_id, memory_order_relaxed);
2361 }
2362 } else {
2363 if ((avail_map & pset->realtime_map) != avail_map) {
2364 if (!bit_test(atomic_load(&node->pset_non_rt_map), pset->pset_id)) {
2365 atomic_bit_set(&node->pset_non_rt_map, pset->pset_id, memory_order_relaxed);
2366 }
2367 }
2368 avail_map &= pset->primary_map;
2369 if ((avail_map & pset->realtime_map) != avail_map) {
2370 if (!bit_test(atomic_load(&node->pset_non_rt_primary_map), pset->pset_id)) {
2371 atomic_bit_set(&node->pset_non_rt_primary_map, pset->pset_id, memory_order_relaxed);
2372 }
2373 }
2374 }
2375}
2376
2377static processor_t choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills);
2378static processor_t choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline,
2379 processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus);
2380static processor_t choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries);
2381#if defined(__x86_64__)
2382static bool all_available_primaries_are_running_realtime_threads(processor_set_t pset, bool include_backups);
2383static bool these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map, bool include_backups);
2384#endif
2385static bool sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor, bool as_backup);
2386static bool processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor);
2387
2388static bool
2389other_psets_have_earlier_rt_threads_pending(processor_set_t stealing_pset, uint64_t earliest_deadline)
2390{
2391 pset_map_t pset_map = stealing_pset->node->pset_map;
2392
2393 bit_clear(pset_map, stealing_pset->pset_id);
2394
2395 for (int pset_id = lsb_first(pset_map); pset_id >= 0; pset_id = lsb_next(pset_map, pset_id)) {
2396 processor_set_t nset = pset_array[pset_id];
2397
2398 if (deadline_add(nset->stealable_rt_threads_earliest_deadline, rt_deadline_epsilon) < earliest_deadline) {
2399 return true;
2400 }
2401 }
2402
2403 return false;
2404}
2405
2406/*
2407 * starting_pset must be locked on entry; returns true if it was unlocked before returning
2408 */
2409static bool
2410choose_next_rt_processor_for_IPI(processor_set_t starting_pset, processor_t chosen_processor, bool spill_ipi,
2411 processor_t *result_processor, sched_ipi_type_t *result_ipi_type)
2412{
2413 bool starting_pset_is_unlocked = false;
2414 uint64_t earliest_deadline = rt_runq_earliest_deadline(starting_pset);
2415 int max_pri = rt_runq_priority(starting_pset);
2416 __kdebug_only uint64_t spill_tid = thread_tid(rt_runq_first(&starting_pset->rt_runq));
2417 processor_set_t pset = starting_pset;
2418 processor_t next_rt_processor = PROCESSOR_NULL;
2419 if (spill_ipi) {
2420 processor_set_t nset = next_pset(pset);
2421 assert(nset != starting_pset);
2422 pset = change_locked_pset(pset, nset);
2423 starting_pset_is_unlocked = true;
2424 }
2425 do {
2426 const bool consider_secondaries = true;
2427 next_rt_processor = choose_next_processor_for_realtime_thread(pset, max_pri, earliest_deadline, chosen_processor, consider_secondaries);
2428 if (next_rt_processor == PROCESSOR_NULL) {
2429 if (!spill_ipi) {
2430 break;
2431 }
2432 processor_set_t nset = next_pset(pset);
2433 if (nset == starting_pset) {
2434 break;
2435 }
2436 pset = change_locked_pset(pset, nset);
2437 starting_pset_is_unlocked = true;
2438 }
2439 } while (next_rt_processor == PROCESSOR_NULL);
2440 if (next_rt_processor) {
2441 if (pset != starting_pset) {
2442 if (bit_set_if_clear(pset->rt_pending_spill_cpu_mask, next_rt_processor->cpu_id)) {
2443 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_START,
2444 next_rt_processor->cpu_id, pset->rt_pending_spill_cpu_mask, starting_pset->cpu_set_low, (uintptr_t)spill_tid);
2445 }
2446 }
2447 *result_ipi_type = sched_ipi_action(next_rt_processor, NULL, SCHED_IPI_EVENT_RT_PREEMPT);
2448 *result_processor = next_rt_processor;
2449 }
2450 if (pset != starting_pset) {
2451 pset_unlock(pset);
2452 }
2453
2454 return starting_pset_is_unlocked;
2455}
2456
2457/*
2458 * backup processor - used by choose_processor to send a backup IPI to, in case the preferred processor can't respond immediately
2459 * followup processor - used in thread_select when there are still threads on the run queue and available processors
2460 * spill processor - a processor in a different processor set that is signalled to steal a thread from this run queue
2461 */
2462typedef enum {
2463 none,
2464 backup,
2465 followup,
2466 spill
2467} next_processor_type_t;
2468
2469#undef LOOP_COUNT
2470#ifdef LOOP_COUNT
2471int max_loop_count[MAX_SCHED_CPUS] = { 0 };
2472#endif
2473
2474/*
2475 * thread_select:
2476 *
2477 * Select a new thread for the current processor to execute.
2478 *
2479 * May select the current thread, which must be locked.
2480 */
2481static thread_t
2482thread_select(thread_t thread,
2483 processor_t processor,
2484 ast_t *reason)
2485{
2486 processor_set_t pset = processor->processor_set;
2487 thread_t new_thread = THREAD_NULL;
2488
2489 assert(processor == current_processor());
2490 assert((thread->state & (TH_RUN | TH_TERMINATE2)) == TH_RUN);
2491
2492 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_START,
2493 0, pset->pending_AST_URGENT_cpu_mask, 0, 0);
2494
2495 __kdebug_only int idle_reason = 0;
2496 __kdebug_only int delay_count = 0;
2497
2498#if defined(__x86_64__)
2499 int timeout_count = sched_backup_cpu_timeout_count;
2500 if ((sched_avoid_cpu0 == 1) && (processor->cpu_id == 0)) {
2501 /* Prefer cpu0 as backup */
2502 timeout_count--;
2503 } else if ((sched_avoid_cpu0 == 2) && (processor->processor_primary != processor)) {
2504 /* Prefer secondary cpu as backup */
2505 timeout_count--;
2506 }
2507#endif
2508 bool pending_AST_URGENT = false;
2509 bool pending_AST_PREEMPT = false;
2510
2511#ifdef LOOP_COUNT
2512 int loop_count = -1;
2513#endif
2514
2515 do {
2516 /*
2517 * Update the priority.
2518 */
2519 if (SCHED(can_update_priority)(thread)) {
2520 SCHED(update_priority)(thread);
2521 }
2522
2523 pset_lock(pset);
2524
2525restart:
2526#ifdef LOOP_COUNT
2527 loop_count++;
2528 if (loop_count > max_loop_count[processor->cpu_id]) {
2529 max_loop_count[processor->cpu_id] = loop_count;
2530 if (bit_count(loop_count) == 1) {
2531 kprintf("[%d]%s>max_loop_count = %d\n", processor->cpu_id, __FUNCTION__, loop_count);
2532 }
2533 }
2534#endif
2535 pending_AST_URGENT = bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id);
2536 pending_AST_PREEMPT = bit_test(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
2537
2538 processor_state_update_from_thread(processor, thread, true);
2539
2540 idle_reason = 0;
2541
2542 processor_t ast_processor = PROCESSOR_NULL;
2543 processor_t next_rt_processor = PROCESSOR_NULL;
2544 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
2545 sched_ipi_type_t next_rt_ipi_type = SCHED_IPI_NONE;
2546
2547 assert(processor->state != PROCESSOR_OFF_LINE);
2548
2549 /*
2550 * Bound threads are dispatched to a processor without going through
2551 * choose_processor(), so in those cases we must continue trying to dequeue work
2552 * as we are the only option.
2553 */
2554 if (!SCHED(processor_bound_count)(processor)) {
2555 if (!processor->is_recommended) {
2556 /*
2557 * The performance controller has provided a hint not to dispatch more threads.
2558 */
2559 idle_reason = 1;
2560 goto send_followup_ipi_before_idle;
2561 } else if (rt_runq_count(pset)) {
2562 bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor, false);
2563 /* Give the current RT thread a chance to complete */
2564 ok_to_run_realtime_thread |= (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice);
2565#if defined(__x86_64__)
2566 /*
2567 * On Intel we want to avoid SMT secondary processors and processor 0
2568 * but allow them to be used as backup processors in case the preferred chosen
2569 * processor is delayed by interrupts or processor stalls. So if it is
2570 * not ok_to_run_realtime_thread as preferred (sched_ok_to_run_realtime_thread(pset, processor, as_backup=false))
2571 * but ok_to_run_realtime_thread as backup (sched_ok_to_run_realtime_thread(pset, processor, as_backup=true))
2572 * we delay up to (timeout_count * 10us) to give the preferred processor chance
2573 * to grab the thread before the (current) backup processor does.
2574 *
2575 * timeout_count defaults to 5 but can be tuned using sysctl kern.sched_backup_cpu_timeout_count
2576 * on DEVELOPMENT || DEBUG kernels. It is also adjusted (see above) depending on whether we want to use
2577 * cpu0 before secondary cpus or not.
2578 */
2579 if (!ok_to_run_realtime_thread) {
2580 if (sched_ok_to_run_realtime_thread(pset, processor, true)) {
2581 if (timeout_count-- > 0) {
2582 pset_unlock(pset);
2583 thread_unlock(thread);
2584 delay(10);
2585 delay_count++;
2586 thread_lock(thread);
2587 pset_lock(pset);
2588 goto restart;
2589 }
2590 ok_to_run_realtime_thread = true;
2591 }
2592 }
2593#endif
2594 if (!ok_to_run_realtime_thread) {
2595 idle_reason = 2;
2596 goto send_followup_ipi_before_idle;
2597 }
2598 } else if (processor->processor_primary != processor) {
2599 /*
2600 * Should this secondary SMT processor attempt to find work? For pset runqueue systems,
2601 * we should look for work only under the same conditions that choose_processor()
2602 * would have assigned work, which is when all primary processors have been assigned work.
2603 */
2604 if ((pset->recommended_bitmask & pset->primary_map & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
2605 /* There are idle primaries */
2606 idle_reason = 3;
2607 goto idle;
2608 }
2609 }
2610 }
2611
2612 /*
2613 * Test to see if the current thread should continue
2614 * to run on this processor. Must not be attempting to wait, and not
2615 * bound to a different processor, nor be in the wrong
2616 * processor set, nor be forced to context switch by TH_SUSP.
2617 *
2618 * Note that there are never any RT threads in the regular runqueue.
2619 *
2620 * This code is insanely tricky.
2621 */
2622
2623 /* i.e. not waiting, not TH_SUSP'ed */
2624 bool still_running = ((thread->state & (TH_TERMINATE | TH_IDLE | TH_WAIT | TH_RUN | TH_SUSP)) == TH_RUN);
2625
2626 /*
2627 * Threads running on SMT processors are forced to context switch. Don't rebalance realtime threads.
2628 * TODO: This should check if it's worth it to rebalance, i.e. 'are there any idle primary processors'
2629 * <rdar://problem/47907700>
2630 *
2631 * A yielding thread shouldn't be forced to context switch.
2632 */
2633
2634 bool is_yielding = (*reason & AST_YIELD) == AST_YIELD;
2635
2636 bool needs_smt_rebalance = !is_yielding && thread->sched_pri < BASEPRI_RTQUEUES && processor->processor_primary != processor;
2637
2638 bool affinity_mismatch = thread->affinity_set != AFFINITY_SET_NULL && thread->affinity_set->aset_pset != pset;
2639
2640 bool bound_elsewhere = thread->bound_processor != PROCESSOR_NULL && thread->bound_processor != processor;
2641
2642 bool avoid_processor = !is_yielding && SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread, *reason);
2643
2644 bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor, true);
2645
2646 bool current_thread_can_keep_running = (still_running && !needs_smt_rebalance && !affinity_mismatch && !bound_elsewhere && !avoid_processor);
2647 if (current_thread_can_keep_running) {
2648 /*
2649 * This thread is eligible to keep running on this processor.
2650 *
2651 * RT threads with un-expired quantum stay on processor,
2652 * unless there's a valid RT thread with an earlier deadline
2653 * and it is still ok_to_run_realtime_thread.
2654 */
2655 if (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice) {
2656 /*
2657 * Pick a new RT thread only if ok_to_run_realtime_thread
2658 * (but the current thread is allowed to complete).
2659 */
2660 if (ok_to_run_realtime_thread) {
2661 if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
2662 goto pick_new_rt_thread;
2663 }
2664 if (rt_runq_priority(pset) > thread->sched_pri) {
2665 if (sched_rt_runq_strict_priority) {
2666 /* The next RT thread is better, so pick it off the runqueue. */
2667 goto pick_new_rt_thread;
2668 }
2669
2670 /*
2671 * See if the current lower priority thread can continue to run without causing
2672 * the higher priority thread on the runq queue to miss its deadline.
2673 */
2674 thread_t hi_thread = rt_runq_first(SCHED(rt_runq)(pset));
2675 if (thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon >= hi_thread->realtime.constraint) {
2676 /* The next RT thread is better, so pick it off the runqueue. */
2677 goto pick_new_rt_thread;
2678 }
2679 } else if ((rt_runq_count(pset) > 0) && (deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < thread->realtime.deadline)) {
2680 /* The next RT thread is better, so pick it off the runqueue. */
2681 goto pick_new_rt_thread;
2682 }
2683 if (other_psets_have_earlier_rt_threads_pending(pset, thread->realtime.deadline)) {
2684 goto pick_new_rt_thread;
2685 }
2686 }
2687
2688 /* This is still the best RT thread to run. */
2689 processor->deadline = thread->realtime.deadline;
2690
2691 sched_update_pset_load_average(pset, 0);
2692
2693 clear_pending_AST_bits(pset, processor, 1);
2694
2695 next_rt_processor = PROCESSOR_NULL;
2696 next_rt_ipi_type = SCHED_IPI_NONE;
2697
2698 bool pset_unlocked = false;
2699 __kdebug_only next_processor_type_t nptype = none;
2700 if (sched_allow_rt_steal && pset_has_stealable_rt_threads(pset)) {
2701 nptype = spill;
2702 pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, true, &next_rt_processor, &next_rt_ipi_type);
2703 } else if (pset_needs_a_followup_IPI(pset)) {
2704 nptype = followup;
2705 pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, false, &next_rt_processor, &next_rt_ipi_type);
2706 }
2707 if (!pset_unlocked) {
2708 pset_unlock(pset);
2709 }
2710
2711 if (next_rt_processor) {
2712 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
2713 next_rt_processor->cpu_id, next_rt_processor->state, nptype, 2);
2714 sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
2715 }
2716
2717 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2718 (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 1);
2719 return thread;
2720 }
2721
2722 if ((rt_runq_count(pset) == 0) &&
2723 SCHED(processor_queue_has_priority)(processor, thread->sched_pri, TRUE) == FALSE) {
2724 /* This thread is still the highest priority runnable (non-idle) thread */
2725 processor->deadline = RT_DEADLINE_NONE;
2726
2727 sched_update_pset_load_average(pset, 0);
2728
2729 clear_pending_AST_bits(pset, processor, 2);
2730
2731 pset_unlock(pset);
2732
2733 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2734 (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 2);
2735 return thread;
2736 }
2737 } else {
2738 /*
2739 * This processor must context switch.
2740 * If it's due to a rebalance, we should aggressively find this thread a new home.
2741 */
2742 if (needs_smt_rebalance || affinity_mismatch || bound_elsewhere || avoid_processor) {
2743 *reason |= AST_REBALANCE;
2744 }
2745 }
2746
2747 bool secondary_forced_idle = ((processor->processor_secondary != PROCESSOR_NULL) &&
2748 (thread_no_smt(thread) || (thread->sched_pri >= BASEPRI_RTQUEUES)) &&
2749 (processor->processor_secondary->state == PROCESSOR_IDLE));
2750
2751 /* OK, so we're not going to run the current thread. Look at the RT queue. */
2752 if (ok_to_run_realtime_thread) {
2753pick_new_rt_thread:
2754 new_thread = sched_rt_choose_thread(pset);
2755 if (new_thread != THREAD_NULL) {
2756 processor->deadline = new_thread->realtime.deadline;
2757 pset_commit_processor_to_new_thread(pset, processor, new_thread);
2758
2759 clear_pending_AST_bits(pset, processor, 3);
2760
2761 if (processor->processor_secondary != NULL) {
2762 processor_t sprocessor = processor->processor_secondary;
2763 if ((sprocessor->state == PROCESSOR_RUNNING) || (sprocessor->state == PROCESSOR_DISPATCHING)) {
2764 ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
2765 ast_processor = sprocessor;
2766 }
2767 }
2768 }
2769 }
2770
2771send_followup_ipi_before_idle:
2772 /* This might not have been cleared if we didn't call sched_rt_choose_thread() */
2773 if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
2774 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 5);
2775 }
2776 __kdebug_only next_processor_type_t nptype = none;
2777 bool pset_unlocked = false;
2778 if (sched_allow_rt_steal && pset_has_stealable_rt_threads(pset)) {
2779 nptype = spill;
2780 pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, true, &next_rt_processor, &next_rt_ipi_type);
2781 } else if (pset_needs_a_followup_IPI(pset)) {
2782 nptype = followup;
2783 pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, false, &next_rt_processor, &next_rt_ipi_type);
2784 }
2785
2786 assert(new_thread || !ast_processor);
2787 if (new_thread || next_rt_processor) {
2788 if (!pset_unlocked) {
2789 pset_unlock(pset);
2790 pset_unlocked = true;
2791 }
2792 if (ast_processor == next_rt_processor) {
2793 ast_processor = PROCESSOR_NULL;
2794 ipi_type = SCHED_IPI_NONE;
2795 }
2796
2797 if (ast_processor) {
2798 sched_ipi_perform(ast_processor, ipi_type);
2799 }
2800
2801 if (next_rt_processor) {
2802 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
2803 next_rt_processor->cpu_id, next_rt_processor->state, nptype, 3);
2804 sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
2805 }
2806
2807 if (new_thread) {
2808 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2809 (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 3);
2810 return new_thread;
2811 }
2812 }
2813
2814 if (pset_unlocked) {
2815 pset_lock(pset);
2816 }
2817
2818 if (!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
2819 /* Things changed while we dropped the lock */
2820 goto restart;
2821 }
2822
2823 if (processor->is_recommended) {
2824 bool spill_pending = bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
2825 if (sched_ok_to_run_realtime_thread(pset, processor, true) && (spill_pending || rt_runq_count(pset))) {
2826 /* Things changed while we dropped the lock */
2827 goto restart;
2828 }
2829
2830 if ((processor->processor_primary != processor) && (processor->processor_primary->current_pri >= BASEPRI_RTQUEUES)) {
2831 /* secondary can only run realtime thread */
2832 if (idle_reason == 0) {
2833 idle_reason = 4;
2834 }
2835 goto idle;
2836 }
2837 } else if (!SCHED(processor_bound_count)(processor)) {
2838 /* processor not recommended and no bound threads */
2839 if (idle_reason == 0) {
2840 idle_reason = 5;
2841 }
2842 goto idle;
2843 }
2844
2845 processor->deadline = RT_DEADLINE_NONE;
2846
2847 /* No RT threads, so let's look at the regular threads. */
2848 if ((new_thread = SCHED(choose_thread)(processor, MINPRI, *reason)) != THREAD_NULL) {
2849 pset_commit_processor_to_new_thread(pset, processor, new_thread);
2850
2851 clear_pending_AST_bits(pset, processor, 4);
2852
2853 ast_processor = PROCESSOR_NULL;
2854 ipi_type = SCHED_IPI_NONE;
2855
2856 processor_t sprocessor = processor->processor_secondary;
2857 if (sprocessor != NULL) {
2858 if (sprocessor->state == PROCESSOR_RUNNING) {
2859 if (thread_no_smt(new_thread)) {
2860 ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
2861 ast_processor = sprocessor;
2862 }
2863 } else if (secondary_forced_idle && !thread_no_smt(new_thread) && pset_has_stealable_threads(pset)) {
2864 ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_PREEMPT);
2865 ast_processor = sprocessor;
2866 }
2867 }
2868 pset_unlock(pset);
2869
2870 if (ast_processor) {
2871 sched_ipi_perform(ast_processor, ipi_type);
2872 }
2873 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2874 (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 4);
2875 return new_thread;
2876 }
2877
2878 if (processor->must_idle) {
2879 processor->must_idle = false;
2880 *reason |= AST_REBALANCE;
2881 idle_reason = 6;
2882 goto idle;
2883 }
2884
2885 if (SCHED(steal_thread_enabled)(pset) && (processor->processor_primary == processor)) {
2886 /*
2887 * No runnable threads, attempt to steal
2888 * from other processors. Returns with pset lock dropped.
2889 */
2890
2891 if ((new_thread = SCHED(steal_thread)(pset)) != THREAD_NULL) {
2892 pset_lock(pset);
2893 pset_commit_processor_to_new_thread(pset, processor, new_thread);
2894 if (!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
2895 /*
2896 * A realtime thread chose this processor while it was DISPATCHING
2897 * and the pset lock was dropped
2898 */
2899 ast_on(AST_URGENT | AST_PREEMPT);
2900 }
2901
2902 clear_pending_AST_bits(pset, processor, 5);
2903
2904 pset_unlock(pset);
2905
2906 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2907 (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 5);
2908 return new_thread;
2909 }
2910
2911 /*
2912 * If other threads have appeared, shortcut
2913 * around again.
2914 */
2915 if (SCHED(processor_bound_count)(processor)) {
2916 continue;
2917 }
2918 if (processor->is_recommended) {
2919 if (!SCHED(processor_queue_empty)(processor) || (sched_ok_to_run_realtime_thread(pset, processor, true) && (rt_runq_count(pset) > 0))) {
2920 continue;
2921 }
2922 }
2923
2924 pset_lock(pset);
2925 }
2926
2927idle:
2928 /* Someone selected this processor while we had dropped the lock */
2929 if ((!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) ||
2930 (!pending_AST_PREEMPT && bit_test(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id))) {
2931 goto restart;
2932 }
2933
2934 if ((idle_reason == 0) && current_thread_can_keep_running) {
2935 /* This thread is the only runnable (non-idle) thread */
2936 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
2937 processor->deadline = thread->realtime.deadline;
2938 } else {
2939 processor->deadline = RT_DEADLINE_NONE;
2940 }
2941
2942 sched_update_pset_load_average(pset, 0);
2943
2944 clear_pending_AST_bits(pset, processor, 6);
2945
2946 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2947 (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 6);
2948 pset_unlock(pset);
2949 return thread;
2950 }
2951
2952 /*
2953 * Nothing is runnable, or this processor must be forced idle,
2954 * so set this processor idle if it was running.
2955 */
2956 if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
2957 pset_update_processor_state(pset, processor, PROCESSOR_IDLE);
2958 processor_state_update_idle(processor);
2959 }
2960 pset_update_rt_stealable_state(pset);
2961
2962 clear_pending_AST_bits(pset, processor, 7);
2963
2964 /* Invoked with pset locked, returns with pset unlocked */
2965 processor->next_idle_short = SCHED(processor_balance)(processor, pset);
2966
2967 new_thread = processor->idle_thread;
2968 } while (new_thread == THREAD_NULL);
2969
2970 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2971 (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 10 + idle_reason);
2972 return new_thread;
2973}
2974
2975/*
2976 * thread_invoke
2977 *
2978 * Called at splsched with neither thread locked.
2979 *
2980 * Perform a context switch and start executing the new thread.
2981 *
2982 * Returns FALSE when the context switch didn't happen.
2983 * The reference to the new thread is still consumed.
2984 *
2985 * "self" is what is currently running on the processor,
2986 * "thread" is the new thread to context switch to
2987 * (which may be the same thread in some cases)
2988 */
2989static boolean_t
2990thread_invoke(
2991 thread_t self,
2992 thread_t thread,
2993 ast_t reason)
2994{
2995 if (__improbable(get_preemption_level() != 0)) {
2996 int pl = get_preemption_level();
2997 panic("thread_invoke: preemption_level %d, possible cause: %s",
2998 pl, (pl < 0 ? "unlocking an unlocked mutex or spinlock" :
2999 "blocking while holding a spinlock, or within interrupt context"));
3000 }
3001
3002 thread_continue_t continuation = self->continuation;
3003 void *parameter = self->parameter;
3004
3005 struct recount_snap snap = { 0 };
3006 recount_snapshot(&snap);
3007 uint64_t ctime = snap.rsn_time_mach;
3008
3009 check_monotonic_time(ctime);
3010
3011#ifdef CONFIG_MACH_APPROXIMATE_TIME
3012 commpage_update_mach_approximate_time(ctime);
3013#endif
3014
3015 if (ctime < thread->last_made_runnable_time) {
3016 panic("Non-monotonic time: invoke at 0x%llx, runnable at 0x%llx",
3017 ctime, thread->last_made_runnable_time);
3018 }
3019
3020#if defined(CONFIG_SCHED_TIMESHARE_CORE)
3021 if (!((thread->state & TH_IDLE) != 0 ||
3022 ((reason & AST_HANDOFF) && self->sched_mode == TH_MODE_REALTIME))) {
3023 sched_timeshare_consider_maintenance(ctime, true);
3024 }
3025#endif
3026
3027 recount_log_switch_thread(&snap);
3028
3029 assert_thread_magic(self);
3030 assert(self == current_thread());
3031 thread_assert_runq_null(self);
3032 assert((self->state & (TH_RUN | TH_TERMINATE2)) == TH_RUN);
3033
3034 thread_lock(thread);
3035
3036 assert_thread_magic(thread);
3037 assert((thread->state & (TH_RUN | TH_WAIT | TH_UNINT | TH_TERMINATE | TH_TERMINATE2)) == TH_RUN);
3038 assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == current_processor());
3039 thread_assert_runq_null(thread);
3040
3041 /* Update SFI class based on other factors */
3042 thread->sfi_class = sfi_thread_classify(thread);
3043
3044 /* Update the same_pri_latency for the thread (used by perfcontrol callouts) */
3045 thread->same_pri_latency = ctime - thread->last_basepri_change_time;
3046 /*
3047 * In case a base_pri update happened between the timestamp and
3048 * taking the thread lock
3049 */
3050 if (ctime <= thread->last_basepri_change_time) {
3051 thread->same_pri_latency = ctime - thread->last_made_runnable_time;
3052 }
3053
3054 /* Allow realtime threads to hang onto a stack. */
3055 if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack) {
3056 self->reserved_stack = self->kernel_stack;
3057 }
3058
3059 /* Prepare for spin debugging */
3060#if SCHED_HYGIENE_DEBUG
3061 ml_spin_debug_clear(thread);
3062#endif
3063
3064 if (continuation != NULL) {
3065 if (!thread->kernel_stack) {
3066 /*
3067 * If we are using a privileged stack,
3068 * check to see whether we can exchange it with
3069 * that of the other thread.
3070 */
3071 if (self->kernel_stack == self->reserved_stack && !thread->reserved_stack) {
3072 goto need_stack;
3073 }
3074
3075 /*
3076 * Context switch by performing a stack handoff.
3077 * Requires both threads to be parked in a continuation.
3078 */
3079 continuation = thread->continuation;
3080 parameter = thread->parameter;
3081
3082 processor_t processor = current_processor();
3083 processor->active_thread = thread;
3084 processor_state_update_from_thread(processor, thread, false);
3085
3086 if (thread->last_processor != processor && thread->last_processor != NULL) {
3087 if (thread->last_processor->processor_set != processor->processor_set) {
3088 thread->ps_switch++;
3089 }
3090 thread->p_switch++;
3091 }
3092 thread->last_processor = processor;
3093 thread->c_switch++;
3094 ast_context(thread);
3095
3096 thread_unlock(thread);
3097
3098 self->reason = reason;
3099
3100 processor->last_dispatch = ctime;
3101 self->last_run_time = ctime;
3102 timer_update(&thread->runnable_timer, ctime);
3103 recount_switch_thread(&snap, self, get_threadtask(self));
3104
3105 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3106 MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF) | DBG_FUNC_NONE,
3107 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
3108
3109 if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
3110 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED) | DBG_FUNC_NONE,
3111 (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
3112 }
3113
3114 DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, current_proc());
3115
3116 SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
3117
3118#if KPERF
3119 kperf_off_cpu(self);
3120#endif /* KPERF */
3121
3122 /*
3123 * This is where we actually switch thread identity,
3124 * and address space if required. However, register
3125 * state is not switched - this routine leaves the
3126 * stack and register state active on the current CPU.
3127 */
3128 TLOG(1, "thread_invoke: calling stack_handoff\n");
3129 stack_handoff(self, thread);
3130
3131 /* 'self' is now off core */
3132 assert(thread == current_thread_volatile());
3133
3134 DTRACE_SCHED(on__cpu);
3135
3136#if KPERF
3137 kperf_on_cpu(thread, continuation, NULL);
3138#endif /* KPERF */
3139
3140 recount_log_switch_thread_on(&snap);
3141
3142 thread_dispatch(self, thread);
3143
3144#if KASAN
3145 /* Old thread's stack has been moved to the new thread, so explicitly
3146 * unpoison it. */
3147 kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
3148#endif
3149
3150 thread->continuation = thread->parameter = NULL;
3151
3152 boolean_t enable_interrupts = TRUE;
3153
3154 /* idle thread needs to stay interrupts-disabled */
3155 if ((thread->state & TH_IDLE)) {
3156 enable_interrupts = FALSE;
3157 }
3158
3159 assert(continuation);
3160 call_continuation(continuation, parameter,
3161 thread->wait_result, enable_interrupts);
3162 /*NOTREACHED*/
3163 } else if (thread == self) {
3164 /* same thread but with continuation */
3165 ast_context(self);
3166
3167 thread_unlock(self);
3168
3169#if KPERF
3170 kperf_on_cpu(thread, continuation, NULL);
3171#endif /* KPERF */
3172
3173 recount_log_switch_thread_on(&snap);
3174
3175 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3176 MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
3177 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
3178
3179#if KASAN
3180 /* stack handoff to self - no thread_dispatch(), so clear the stack
3181 * and free the fakestack directly */
3182#if KASAN_CLASSIC
3183 kasan_fakestack_drop(self);
3184 kasan_fakestack_gc(self);
3185#endif /* KASAN_CLASSIC */
3186 kasan_unpoison_stack(self->kernel_stack, kernel_stack_size);
3187#endif /* KASAN */
3188
3189 self->continuation = self->parameter = NULL;
3190
3191 boolean_t enable_interrupts = TRUE;
3192
3193 /* idle thread needs to stay interrupts-disabled */
3194 if ((self->state & TH_IDLE)) {
3195 enable_interrupts = FALSE;
3196 }
3197
3198 call_continuation(continuation, parameter,
3199 self->wait_result, enable_interrupts);
3200 /*NOTREACHED*/
3201 }
3202 } else {
3203 /*
3204 * Check that the other thread has a stack
3205 */
3206 if (!thread->kernel_stack) {
3207need_stack:
3208 if (!stack_alloc_try(thread)) {
3209 thread_unlock(thread);
3210 thread_stack_enqueue(thread);
3211 return FALSE;
3212 }
3213 } else if (thread == self) {
3214 ast_context(self);
3215 thread_unlock(self);
3216
3217 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3218 MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
3219 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
3220
3221 return TRUE;
3222 }
3223 }
3224
3225 /*
3226 * Context switch by full context save.
3227 */
3228 processor_t processor = current_processor();
3229 processor->active_thread = thread;
3230 processor_state_update_from_thread(processor, thread, false);
3231
3232 if (thread->last_processor != processor && thread->last_processor != NULL) {
3233 if (thread->last_processor->processor_set != processor->processor_set) {
3234 thread->ps_switch++;
3235 }
3236 thread->p_switch++;
3237 }
3238 thread->last_processor = processor;
3239 thread->c_switch++;
3240 ast_context(thread);
3241
3242 thread_unlock(thread);
3243
3244 self->reason = reason;
3245
3246 processor->last_dispatch = ctime;
3247 self->last_run_time = ctime;
3248 timer_update(&thread->runnable_timer, ctime);
3249 recount_switch_thread(&snap, self, get_threadtask(self));
3250
3251 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3252 MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
3253 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
3254
3255 if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) {
3256 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED) | DBG_FUNC_NONE,
3257 (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
3258 }
3259
3260 DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, current_proc());
3261
3262 SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
3263
3264#if KPERF
3265 kperf_off_cpu(self);
3266#endif /* KPERF */
3267
3268 /*
3269 * This is where we actually switch register context,
3270 * and address space if required. We will next run
3271 * as a result of a subsequent context switch.
3272 *
3273 * Once registers are switched and the processor is running "thread",
3274 * the stack variables and non-volatile registers will contain whatever
3275 * was there the last time that thread blocked. No local variables should
3276 * be used after this point, except for the special case of "thread", which
3277 * the platform layer returns as the previous thread running on the processor
3278 * via the function call ABI as a return register, and "self", which may have
3279 * been stored on the stack or a non-volatile register, but a stale idea of
3280 * what was on the CPU is newly-accurate because that thread is again
3281 * running on the CPU.
3282 *
3283 * If one of the threads is using a continuation, thread_continue
3284 * is used to stitch up its context.
3285 *
3286 * If we are invoking a thread which is resuming from a continuation,
3287 * the CPU will invoke thread_continue next.
3288 *
3289 * If the current thread is parking in a continuation, then its state
3290 * won't be saved and the stack will be discarded. When the stack is
3291 * re-allocated, it will be configured to resume from thread_continue.
3292 */
3293
3294 assert(continuation == self->continuation);
3295 thread = machine_switch_context(self, continuation, thread);
3296 assert(self == current_thread_volatile());
3297 TLOG(1, "thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);
3298
3299 assert(continuation == NULL && self->continuation == NULL);
3300
3301 DTRACE_SCHED(on__cpu);
3302
3303#if KPERF
3304 kperf_on_cpu(self, NULL, __builtin_frame_address(0));
3305#endif /* KPERF */
3306
3307 /* Previous snap on the old stack is gone. */
3308 recount_log_switch_thread_on(NULL);
3309
3310 /* We have been resumed and are set to run. */
3311 thread_dispatch(thread, self);
3312
3313 return TRUE;
3314}
3315
3316#if defined(CONFIG_SCHED_DEFERRED_AST)
3317/*
3318 * pset_cancel_deferred_dispatch:
3319 *
3320 * Cancels all ASTs that we can cancel for the given processor set
3321 * if the current processor is running the last runnable thread in the
3322 * system.
3323 *
3324 * This function assumes the current thread is runnable. This must
3325 * be called with the pset unlocked.
3326 */
3327static void
3328pset_cancel_deferred_dispatch(
3329 processor_set_t pset,
3330 processor_t processor)
3331{
3332 processor_t active_processor = NULL;
3333 uint32_t sampled_sched_run_count;
3334
3335 pset_lock(pset);
3336 sampled_sched_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3337
3338 /*
3339 * If we have emptied the run queue, and our current thread is runnable, we
3340 * should tell any processors that are still DISPATCHING that they will
3341 * probably not have any work to do. In the event that there are no
3342 * pending signals that we can cancel, this is also uninteresting.
3343 *
3344 * In the unlikely event that another thread becomes runnable while we are
3345 * doing this (sched_run_count is atomically updated, not guarded), the
3346 * codepath making it runnable SHOULD (a dangerous word) need the pset lock
3347 * in order to dispatch it to a processor in our pset. So, the other
3348 * codepath will wait while we squash all cancelable ASTs, get the pset
3349 * lock, and then dispatch the freshly runnable thread. So this should be
3350 * correct (we won't accidentally have a runnable thread that hasn't been
3351 * dispatched to an idle processor), if not ideal (we may be restarting the
3352 * dispatch process, which could have some overhead).
3353 */
3354
3355 if ((sampled_sched_run_count == 1) && (pset->pending_deferred_AST_cpu_mask)) {
3356 uint64_t dispatching_map = (pset->cpu_state_map[PROCESSOR_DISPATCHING] &
3357 pset->pending_deferred_AST_cpu_mask &
3358 ~pset->pending_AST_URGENT_cpu_mask);
3359 for (int cpuid = lsb_first(dispatching_map); cpuid >= 0; cpuid = lsb_next(dispatching_map, cpuid)) {
3360 active_processor = processor_array[cpuid];
3361 /*
3362 * If a processor is DISPATCHING, it could be because of
3363 * a cancelable signal.
3364 *
3365 * IF the processor is not our
3366 * current processor (the current processor should not
3367 * be DISPATCHING, so this is a bit paranoid), AND there
3368 * is a cancelable signal pending on the processor, AND
3369 * there is no non-cancelable signal pending (as there is
3370 * no point trying to backtrack on bringing the processor
3371 * up if a signal we cannot cancel is outstanding), THEN
3372 * it should make sense to roll back the processor state
3373 * to the IDLE state.
3374 *
3375 * If the racy nature of this approach (as the signal
3376 * will be arbitrated by hardware, and can fire as we
3377 * roll back state) results in the core responding
3378 * despite being pushed back to the IDLE state, it
3379 * should be no different than if the core took some
3380 * interrupt while IDLE.
3381 */
3382 if (active_processor != processor) {
3383 /*
3384 * Squash all of the processor state back to some
3385 * reasonable facsimile of PROCESSOR_IDLE.
3386 */
3387
3388 processor_state_update_idle(active_processor);
3389 active_processor->deadline = RT_DEADLINE_NONE;
3390 pset_update_processor_state(pset, active_processor, PROCESSOR_IDLE);
3391 bit_clear(pset->pending_deferred_AST_cpu_mask, active_processor->cpu_id);
3392 machine_signal_idle_cancel(active_processor);
3393 }
3394 }
3395 }
3396
3397 pset_unlock(pset);
3398}
3399#else
3400/* We don't support deferred ASTs; everything is candycanes and sunshine. */
3401#endif
3402
3403static void
3404thread_csw_callout(
3405 thread_t old,
3406 thread_t new,
3407 uint64_t timestamp)
3408{
3409 perfcontrol_event event = (new->state & TH_IDLE) ? IDLE : CONTEXT_SWITCH;
3410 uint64_t same_pri_latency = (new->state & TH_IDLE) ? 0 : new->same_pri_latency;
3411 machine_switch_perfcontrol_context(event, timestamp, 0,
3412 same_pri_latency, old, new);
3413}
3414
3415
3416/*
3417 * thread_dispatch:
3418 *
3419 * Handle threads at context switch. Re-dispatch other thread
3420 * if still running, otherwise update run state and perform
3421 * special actions. Update quantum for other thread and begin
3422 * the quantum for ourselves.
3423 *
3424 * "thread" is the old thread that we have switched away from.
3425 * "self" is the new current thread that we have context switched to
3426 *
3427 * Called at splsched.
3428 *
3429 */
3430void
3431thread_dispatch(
3432 thread_t thread,
3433 thread_t self)
3434{
3435 processor_t processor = self->last_processor;
3436 bool was_idle = false;
3437
3438 assert(processor == current_processor());
3439 assert(self == current_thread_volatile());
3440 assert(thread != self);
3441
3442 if (thread != THREAD_NULL) {
3443 /*
3444 * Do the perfcontrol callout for context switch.
3445		 * The reasons we do it here are:
3446		 * - thread_dispatch() is called from various places that are not
3447		 *   the direct context switch path (e.g. processor shutdown), so
3448		 *   adding the callout here covers all those cases.
3449		 * - We want this callout as early as possible, so that it is close
3450		 *   to the timestamp taken in thread_invoke().
3451		 * - We want to avoid holding the thread lock while doing the
3452		 *   callout.
3453		 * - We do not want to make the callout if "thread" is NULL.
3454 */
3455 thread_csw_callout(old: thread, new: self, timestamp: processor->last_dispatch);
3456
3457#if KASAN
3458 if (thread->continuation != NULL) {
3459 /*
3460 * Thread has a continuation and the normal stack is going away.
3461 * Unpoison the stack and mark all fakestack objects as unused.
3462 */
3463#if KASAN_CLASSIC
3464 kasan_fakestack_drop(thread);
3465#endif /* KASAN_CLASSIC */
3466 if (thread->kernel_stack) {
3467 kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
3468 }
3469 }
3470
3471
3472#if KASAN_CLASSIC
3473 /*
3474 * Free all unused fakestack objects.
3475 */
3476 kasan_fakestack_gc(thread);
3477#endif /* KASAN_CLASSIC */
3478#endif /* KASAN */
3479
3480 /*
3481 * If blocked at a continuation, discard
3482 * the stack.
3483 */
3484 if (thread->continuation != NULL && thread->kernel_stack != 0) {
3485 stack_free(thread);
3486 }
3487
3488 if (thread->state & TH_IDLE) {
3489 was_idle = true;
3490 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3491 MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
3492 (uintptr_t)thread_tid(thread), 0, thread->state,
3493 sched_run_buckets[TH_BUCKET_RUN], 0);
3494 } else {
3495 int64_t consumed;
3496 int64_t remainder = 0;
3497
3498 if (processor->quantum_end > processor->last_dispatch) {
3499 remainder = processor->quantum_end -
3500 processor->last_dispatch;
3501 }
3502
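			/*
			 * CPU time charged for this stint on core: the quantum the
			 * thread was granted minus whatever portion of it was still
			 * unexpired at switch-out.
			 */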
3503 consumed = thread->quantum_remaining - remainder;
3504
3505 if ((thread->reason & AST_LEDGER) == 0) {
3506 /*
3507 * Bill CPU time to both the task and
3508 * the individual thread.
3509 */
3510 ledger_credit_thread(thread, ledger: thread->t_ledger,
3511 entry: task_ledgers.cpu_time, amount: consumed);
3512 ledger_credit_thread(thread, ledger: thread->t_threadledger,
3513 entry: thread_ledgers.cpu_time, amount: consumed);
3514 if (thread->t_bankledger) {
3515 ledger_credit_thread(thread, ledger: thread->t_bankledger,
3516 entry: bank_ledgers.cpu_time,
3517 amount: (consumed - thread->t_deduct_bank_ledger_time));
3518 }
3519 thread->t_deduct_bank_ledger_time = 0;
3520 if (consumed > 0) {
3521 /*
3522				 * consumed should never be negative, but traces occasionally
3523				 * show instances where it is.
3524 * <rdar://problem/57782596> thread_dispatch() thread CPU consumed calculation sometimes results in negative value
3525 */
3526 sched_update_pset_avg_execution_time(pset: current_processor()->processor_set, delta: consumed, curtime: processor->last_dispatch, sched_bucket: thread->th_sched_bucket);
3527 }
3528 }
3529
3530		/* For the thread that we just context switched away from, check
3531		 * whether its workqueue quantum has expired and set the AST if so.
3532 */
3533 if (thread_get_tag(thread) & THREAD_TAG_WORKQUEUE) {
3534 thread_evaluate_workqueue_quantum_expiry(thread);
3535 }
3536
3537 if (__improbable(thread->rwlock_count != 0)) {
3538 smr_mark_active_trackers_stalled(self: thread);
3539 }
3540
3541 /*
3542 * Pairs with task_restartable_ranges_synchronize
3543 */
3544 wake_lock(thread);
3545 thread_lock(thread);
3546
3547 /*
3548 * Same as ast_check(), in case we missed the IPI
3549 */
3550 thread_reset_pcs_ack_IPI(thread);
3551
3552 /*
3553 * Apply a priority floor if the thread holds a kernel resource
3554 * or explicitly requested it.
3555 * Do this before checking starting_pri to avoid overpenalizing
3556 * repeated rwlock blockers.
3557 */
3558 if (__improbable(thread->rwlock_count != 0)) {
3559 lck_rw_set_promotion_locked(thread);
3560 }
3561 if (__improbable(thread->priority_floor_count != 0)) {
3562 thread_floor_boost_set_promotion_locked(thread);
3563 }
3564
3565 boolean_t keep_quantum = processor->first_timeslice;
3566
3567 /*
3568 * Treat a thread which has dropped priority since it got on core
3569 * as having expired its quantum.
3570 */
3571 if (processor->starting_pri > thread->sched_pri) {
3572 keep_quantum = FALSE;
3573 }
3574
3575 /* Compute remainder of current quantum. */
3576 if (keep_quantum &&
3577 processor->quantum_end > processor->last_dispatch) {
3578 thread->quantum_remaining = (uint32_t)remainder;
3579 } else {
3580 thread->quantum_remaining = 0;
3581 }
3582
3583 if (thread->sched_mode == TH_MODE_REALTIME) {
3584 /*
3585 * Cancel the deadline if the thread has
3586 * consumed the entire quantum.
3587 */
3588 if (thread->quantum_remaining == 0) {
3589 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_CANCEL_RT_DEADLINE) | DBG_FUNC_NONE,
3590 (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
3591 thread->realtime.deadline = RT_DEADLINE_QUANTUM_EXPIRED;
3592 }
3593 } else {
3594#if defined(CONFIG_SCHED_TIMESHARE_CORE)
3595 /*
3596			 * For non-realtime threads, treat a tiny
3597			 * remaining quantum as an expired quantum,
3598			 * but carry what's left over into the next one.
3599 */
3600 if (thread->quantum_remaining < min_std_quantum) {
3601 thread->reason |= AST_QUANTUM;
3602 thread->quantum_remaining += SCHED(initial_quantum_size)(thread);
3603 }
3604#endif /* CONFIG_SCHED_TIMESHARE_CORE */
3605 }
3606
3607 /*
3608 * If we are doing a direct handoff then
3609 * take the remainder of the quantum.
3610 */
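		/*
		 * For example, if the old thread blocks in a handoff with 3 ms of
		 * its quantum left, the thread it hands off to runs on that
		 * remaining 3 ms rather than starting a fresh quantum, and the old
		 * thread is treated as having used its quantum up.
		 */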
3611 if ((thread->reason & (AST_HANDOFF | AST_QUANTUM)) == AST_HANDOFF) {
3612 self->quantum_remaining = thread->quantum_remaining;
3613 thread->reason |= AST_QUANTUM;
3614 thread->quantum_remaining = 0;
3615 } else {
3616#if defined(CONFIG_SCHED_MULTIQ)
3617 if (SCHED(sched_groups_enabled) &&
3618 thread->sched_group == self->sched_group) {
3619 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3620 MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUANTUM_HANDOFF),
3621 self->reason, (uintptr_t)thread_tid(thread),
3622 self->quantum_remaining, thread->quantum_remaining, 0);
3623
3624 self->quantum_remaining = thread->quantum_remaining;
3625 thread->quantum_remaining = 0;
3626 /* Don't set AST_QUANTUM here - old thread might still want to preempt someone else */
3627 }
3628#endif /* defined(CONFIG_SCHED_MULTIQ) */
3629 }
3630
3631 thread->computation_metered += (processor->last_dispatch - thread->computation_epoch);
3632
3633 if (!(thread->state & TH_WAIT)) {
3634 /*
3635 * Still runnable.
3636 */
3637 thread->last_made_runnable_time = thread->last_basepri_change_time = processor->last_dispatch;
3638
3639 machine_thread_going_off_core(old_thread: thread, FALSE, last_dispatch: processor->last_dispatch, TRUE);
3640
3641 ast_t reason = thread->reason;
3642 sched_options_t options = SCHED_NONE;
3643
3644 if (reason & AST_REBALANCE) {
3645 options |= SCHED_REBALANCE;
3646 if (reason & AST_QUANTUM) {
3647 /*
3648 * Having gone to the trouble of forcing this thread off a less preferred core,
3649 * we should force the preferable core to reschedule immediately to give this
3650 * thread a chance to run instead of just sitting on the run queue where
3651 * it may just be stolen back by the idle core we just forced it off.
3652 * But only do this at the end of a quantum to prevent cascading effects.
3653 */
3654 options |= SCHED_PREEMPT;
3655 }
3656 }
3657
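			/*
			 * Requeue policy: quantum expiration sends the thread to the
			 * tail of its run queue; an involuntary preemption puts it back
			 * at the head; anything else requeues at the tail and requests
			 * a preemption check.
			 */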
3658 if (reason & AST_QUANTUM) {
3659 options |= SCHED_TAILQ;
3660 } else if (reason & AST_PREEMPT) {
3661 options |= SCHED_HEADQ;
3662 } else {
3663 options |= (SCHED_PREEMPT | SCHED_TAILQ);
3664 }
3665
3666 thread_setrun(thread, options);
3667
3668 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3669 MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
3670 (uintptr_t)thread_tid(thread), thread->reason, thread->state,
3671 sched_run_buckets[TH_BUCKET_RUN], 0);
3672
3673 if (thread->wake_active) {
3674 thread->wake_active = FALSE;
3675 thread_unlock(thread);
3676
3677 thread_wakeup(&thread->wake_active);
3678 } else {
3679 thread_unlock(thread);
3680 }
3681
3682 wake_unlock(thread);
3683 } else {
3684 /*
3685 * Waiting.
3686 */
3687 boolean_t should_terminate = FALSE;
3688 uint32_t new_run_count;
3689 int thread_state = thread->state;
3690
3691 /* Only the first call to thread_dispatch
3692 * after explicit termination should add
3693 * the thread to the termination queue
3694 */
3695 if ((thread_state & (TH_TERMINATE | TH_TERMINATE2)) == TH_TERMINATE) {
3696 should_terminate = TRUE;
3697 thread_state |= TH_TERMINATE2;
3698 }
3699
3700 timer_stop(timer: &thread->runnable_timer, tstamp: processor->last_dispatch);
3701
3702 thread_state &= ~TH_RUN;
3703 thread->state = thread_state;
3704
3705 thread->last_made_runnable_time = thread->last_basepri_change_time = THREAD_NOT_RUNNABLE;
3706 thread->chosen_processor = PROCESSOR_NULL;
3707
3708 new_run_count = SCHED(run_count_decr)(thread);
3709
3710#if CONFIG_SCHED_AUTO_JOIN
3711 if ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0) {
3712 work_interval_auto_join_unwind(thread);
3713 }
3714#endif /* CONFIG_SCHED_AUTO_JOIN */
3715
3716#if CONFIG_SCHED_SFI
3717 if (thread->reason & AST_SFI) {
3718 thread->wait_sfi_begin_time = processor->last_dispatch;
3719 }
3720#endif
3721 machine_thread_going_off_core(old_thread: thread, thread_terminating: should_terminate, last_dispatch: processor->last_dispatch, FALSE);
3722
3723 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3724 MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
3725 (uintptr_t)thread_tid(thread), thread->reason, thread_state,
3726 new_run_count, 0);
3727
3728 if (thread_state & TH_WAIT_REPORT) {
3729 (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
3730 }
3731
3732 if (thread->wake_active) {
3733 thread->wake_active = FALSE;
3734 thread_unlock(thread);
3735
3736 thread_wakeup(&thread->wake_active);
3737 } else {
3738 thread_unlock(thread);
3739 }
3740
3741 wake_unlock(thread);
3742
3743 if (should_terminate) {
3744 thread_terminate_enqueue(thread);
3745 }
3746 }
3747 }
3748 /*
3749 * The thread could have been added to the termination queue, so it's
3750 * unsafe to use after this point.
3751 */
3752 thread = THREAD_NULL;
3753 }
3754
3755 int urgency = THREAD_URGENCY_NONE;
3756 uint64_t latency = 0;
3757
3758 /* Update (new) current thread and reprogram running timers */
3759 thread_lock(self);
3760
3761 if (!(self->state & TH_IDLE)) {
3762 uint64_t arg1, arg2;
3763
3764#if CONFIG_SCHED_SFI
3765 ast_t new_ast;
3766
3767 new_ast = sfi_thread_needs_ast(thread: self, NULL);
3768
3769 if (new_ast != AST_NONE) {
3770 ast_on(reasons: new_ast);
3771 }
3772#endif
3773
3774 if (processor->last_dispatch < self->last_made_runnable_time) {
3775 panic("Non-monotonic time: dispatch at 0x%llx, runnable at 0x%llx",
3776 processor->last_dispatch, self->last_made_runnable_time);
3777 }
3778
3779 assert(self->last_made_runnable_time <= self->last_basepri_change_time);
3780
3781 latency = processor->last_dispatch - self->last_made_runnable_time;
3782 assert(latency >= self->same_pri_latency);
3783
3784 urgency = thread_get_urgency(thread: self, rt_period: &arg1, rt_deadline: &arg2);
3785
3786 thread_tell_urgency(urgency, rt_period: arg1, rt_deadline: arg2, sched_latency: latency, nthread: self);
3787
3788 /*
3789 * Start a new CPU limit interval if the previous one has
3790 * expired. This should happen before initializing a new
3791 * quantum.
3792 */
3793 if (cpulimit_affects_quantum &&
3794 thread_cpulimit_interval_has_expired(now: processor->last_dispatch)) {
3795 thread_cpulimit_restart(now: processor->last_dispatch);
3796 }
3797
3798 /*
3799 * Get a new quantum if none remaining.
3800 */
3801 if (self->quantum_remaining == 0) {
3802 thread_quantum_init(thread: self, now: processor->last_dispatch);
3803 }
3804
3805 /*
3806 * Set up quantum timer and timeslice.
3807 */
3808 processor->quantum_end = processor->last_dispatch +
3809 self->quantum_remaining;
3810
3811 running_timer_setup(processor, timer: RUNNING_TIMER_QUANTUM, param: self,
3812 deadline: processor->quantum_end, now: processor->last_dispatch);
3813 if (was_idle) {
3814 /*
3815 * kperf's running timer is active whenever the idle thread for a
3816 * CPU is not running.
3817 */
3818 kperf_running_setup(processor, now: processor->last_dispatch);
3819 }
3820 running_timers_activate(processor);
3821 processor->first_timeslice = TRUE;
3822 } else {
3823 running_timers_deactivate(processor);
3824 processor->first_timeslice = FALSE;
3825 thread_tell_urgency(urgency: THREAD_URGENCY_NONE, rt_period: 0, rt_deadline: 0, sched_latency: 0, nthread: self);
3826 }
3827
3828 assert(self->block_hint == kThreadWaitNone);
3829 self->computation_epoch = processor->last_dispatch;
3830 /*
3831 * This relies on the interrupt time being tallied up to the thread in the
3832 * exception handler epilogue, which is before AST context where preemption
3833 * is considered (and the scheduler is potentially invoked to
3834 * context switch, here).
3835 */
3836 self->computation_interrupt_epoch = recount_current_thread_interrupt_time_mach();
3837 self->reason = AST_NONE;
3838 processor->starting_pri = self->sched_pri;
3839
3840 thread_unlock(self);
3841
3842 machine_thread_going_on_core(new_thread: self, urgency, sched_latency: latency, same_pri_latency: self->same_pri_latency,
3843 dispatch_time: processor->last_dispatch);
3844
3845#if defined(CONFIG_SCHED_DEFERRED_AST)
3846 /*
3847 * TODO: Can we state that redispatching our old thread is also
3848 * uninteresting?
3849 */
3850 if ((os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) == 1) && !(self->state & TH_IDLE)) {
3851 pset_cancel_deferred_dispatch(pset: processor->processor_set, processor);
3852 }
3853#endif
3854}
3855
3856/*
3857 * thread_block_reason:
3858 *
3859 * Forces a reschedule, blocking the caller if a wait
3860 * has been asserted.
3861 *
3862 * If a continuation is specified, then thread_invoke will
3863 * attempt to discard the thread's kernel stack. When the
3864 * thread resumes, it will execute the continuation function
3865 * on a new kernel stack.
3866 */
3867wait_result_t
3868thread_block_reason(
3869 thread_continue_t continuation,
3870 void *parameter,
3871 ast_t reason)
3872{
3873 thread_t self = current_thread();
3874 processor_t processor;
3875 thread_t new_thread;
3876 spl_t s;
3877
3878 s = splsched();
3879
3880 processor = current_processor();
3881
3882 /* If we're explicitly yielding, force a subsequent quantum */
3883 if (reason & AST_YIELD) {
3884 processor->first_timeslice = FALSE;
3885 }
3886
3887	/* We're handling all scheduling ASTs */
3888 ast_off(AST_SCHEDULING);
3889
3890 clear_pending_nonurgent_preemption(processor);
3891
3892#if PROC_REF_DEBUG
3893 if ((continuation != NULL) && (get_threadtask(self) != kernel_task)) {
3894 uthread_assert_zero_proc_refcount(get_bsdthread_info(self));
3895 }
3896#endif
3897
3898#if CONFIG_EXCLAVES
3899 if (continuation != NULL) {
3900 assert3u(self->th_exclaves_state & TH_EXCLAVES_STATE_ANY, ==, 0);
3901 }
3902#endif /* CONFIG_EXCLAVES */
3903
3904 self->continuation = continuation;
3905 self->parameter = parameter;
3906
3907 if (self->state & ~(TH_RUN | TH_IDLE)) {
3908 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3909 MACHDBG_CODE(DBG_MACH_SCHED, MACH_BLOCK),
3910 reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0);
3911 }
3912
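	/*
	 * thread_invoke() can fail (e.g. if it cannot immediately get a kernel
	 * stack for the incoming thread), in which case we reselect and retry.
	 */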
3913 do {
3914 thread_lock(self);
3915 new_thread = thread_select(thread: self, processor, reason: &reason);
3916 thread_unlock(self);
3917 } while (!thread_invoke(self, thread: new_thread, reason));
3918
3919 splx(s);
3920
3921 return self->wait_result;
3922}
3923
3924/*
3925 * thread_block:
3926 *
3927 * Block the current thread if a wait has been asserted.
3928 */
3929wait_result_t
3930thread_block(
3931 thread_continue_t continuation)
3932{
3933 return thread_block_reason(continuation, NULL, AST_NONE);
3934}
3935
3936wait_result_t
3937thread_block_parameter(
3938 thread_continue_t continuation,
3939 void *parameter)
3940{
3941 return thread_block_reason(continuation, parameter, AST_NONE);
3942}
3943
3944/*
3945 * thread_run:
3946 *
3947 * Switch directly from the current thread to the
3948 * new thread, handing off our quantum if appropriate.
3949 *
3950 * New thread must be runnable, and not on a run queue.
3951 *
3952 * Called at splsched.
3953 */
3954int
3955thread_run(
3956 thread_t self,
3957 thread_continue_t continuation,
3958 void *parameter,
3959 thread_t new_thread)
3960{
3961 ast_t reason = AST_NONE;
3962
3963 if ((self->state & TH_IDLE) == 0) {
3964 reason = AST_HANDOFF;
3965 }
3966
3967 /*
3968 * If this thread hadn't been setrun'ed, it
3969 * might not have a chosen processor, so give it one
3970 */
3971 if (new_thread->chosen_processor == NULL) {
3972 new_thread->chosen_processor = current_processor();
3973 }
3974
3975 self->continuation = continuation;
3976 self->parameter = parameter;
3977
3978 while (!thread_invoke(self, thread: new_thread, reason)) {
3979 /* the handoff failed, so we have to fall back to the normal block path */
3980 processor_t processor = current_processor();
3981
3982 reason = AST_NONE;
3983
3984 thread_lock(self);
3985 new_thread = thread_select(thread: self, processor, reason: &reason);
3986 thread_unlock(self);
3987 }
3988
3989 return self->wait_result;
3990}
3991
3992/*
3993 * thread_continue:
3994 *
3995 * Called at splsched when a thread first receives
3996 * a new stack after a continuation.
3997 *
3998 * Called with THREAD_NULL as the old thread when
3999 * invoked by machine_load_context.
4000 */
4001void
4002thread_continue(
4003 thread_t thread)
4004{
4005 thread_t self = current_thread();
4006 thread_continue_t continuation;
4007 void *parameter;
4008
4009 DTRACE_SCHED(on__cpu);
4010
4011 continuation = self->continuation;
4012 parameter = self->parameter;
4013
4014 assert(continuation != NULL);
4015
4016#if KPERF
4017 kperf_on_cpu(thread: self, continuation, NULL);
4018#endif
4019
4020 thread_dispatch(thread, self);
4021
4022 self->continuation = self->parameter = NULL;
4023
4024#if SCHED_HYGIENE_DEBUG
4025 /* Reset interrupt-masked spin debugging timeout */
4026 ml_spin_debug_clear(self);
4027#endif
4028
4029 TLOG(1, "thread_continue: calling call_continuation\n");
4030
4031 boolean_t enable_interrupts = TRUE;
4032
4033	/* The bootstrap thread and the idle thread need to stay interrupts-disabled */
4034 if (thread == THREAD_NULL || (self->state & TH_IDLE)) {
4035 enable_interrupts = FALSE;
4036 }
4037
4038#if KASAN_TBI
4039 kasan_unpoison_stack(self->kernel_stack, kernel_stack_size);
4040#endif /* KASAN_TBI */
4041
4042
4043 call_continuation(continuation, parameter, wresult: self->wait_result, enable_interrupts);
4044 /*NOTREACHED*/
4045}
4046
4047void
4048thread_quantum_init(thread_t thread, uint64_t now)
4049{
4050 uint64_t new_quantum = 0;
4051
4052 switch (thread->sched_mode) {
4053 case TH_MODE_REALTIME:
4054 new_quantum = thread->realtime.computation;
4055 new_quantum = MIN(new_quantum, max_unsafe_rt_computation);
4056 break;
4057
4058 case TH_MODE_FIXED:
4059 new_quantum = SCHED(initial_quantum_size)(thread);
4060 new_quantum = MIN(new_quantum, max_unsafe_fixed_computation);
4061 break;
4062
4063 default:
4064 new_quantum = SCHED(initial_quantum_size)(thread);
4065 break;
4066 }
4067
4068 if (cpulimit_affects_quantum) {
4069 const uint64_t cpulimit_remaining = thread_cpulimit_remaining(now);
4070
4071 /*
4072 * If there's no remaining CPU time, the ledger system will
4073 * notice and put the thread to sleep.
4074 */
4075 if (cpulimit_remaining > 0) {
4076 new_quantum = MIN(new_quantum, cpulimit_remaining);
4077 }
4078 }
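	/*
	 * For example, a timeshare thread would normally get the standard
	 * quantum here, but if only 2 ms remain in its CPU-limit interval the
	 * quantum is clipped to 2 ms, so the thread comes back through quantum
	 * expiry no later than the limit boundary.
	 */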
4079
4080 assert3u(new_quantum, <, UINT32_MAX);
4081 assert3u(new_quantum, >, 0);
4082
4083 thread->quantum_remaining = (uint32_t)new_quantum;
4084}
4085
4086uint32_t
4087sched_timeshare_initial_quantum_size(thread_t thread)
4088{
4089 if ((thread != THREAD_NULL) && thread->th_sched_bucket == TH_BUCKET_SHARE_BG) {
4090 return bg_quantum;
4091 } else {
4092 return std_quantum;
4093 }
4094}
4095
4096/*
4097 * run_queue_init:
4098 *
4099 * Initialize a run queue before first use.
4100 */
4101void
4102run_queue_init(
4103 run_queue_t rq)
4104{
4105 rq->highq = NOPRI;
4106 for (u_int i = 0; i < BITMAP_LEN(NRQS); i++) {
4107 rq->bitmap[i] = 0;
4108 }
4109 rq->urgency = rq->count = 0;
4110 for (int i = 0; i < NRQS; i++) {
4111 circle_queue_init(&rq->queues[i]);
4112 }
4113}
4114
4115/*
4116 * run_queue_dequeue:
4117 *
4118 * Perform a dequeue operation on a run queue,
4119 * and return the resulting thread.
4120 *
4121 * The run queue must be locked (see thread_run_queue_remove()
4122 * for more info), and not empty.
4123 */
4124thread_t
4125run_queue_dequeue(
4126 run_queue_t rq,
4127 sched_options_t options)
4128{
4129 thread_t thread;
4130 circle_queue_t queue = &rq->queues[rq->highq];
4131
4132 if (options & SCHED_HEADQ) {
4133 thread = cqe_dequeue_head(queue, struct thread, runq_links);
4134 } else {
4135 thread = cqe_dequeue_tail(queue, struct thread, runq_links);
4136 }
4137
4138 assert(thread != THREAD_NULL);
4139 assert_thread_magic(thread);
4140
4141 thread_clear_runq(thread);
4142 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
4143 rq->count--;
4144 if (SCHED(priority_is_urgent)(rq->highq)) {
4145 rq->urgency--; assert(rq->urgency >= 0);
4146 }
4147 if (circle_queue_empty(cq: queue)) {
4148 bitmap_clear(map: rq->bitmap, n: rq->highq);
4149 rq->highq = bitmap_first(map: rq->bitmap, NRQS);
4150 }
4151
4152 return thread;
4153}
4154
4155/*
4156 * run_queue_enqueue:
4157 *
4158 *	Perform an enqueue operation on a run queue.
4159 *
4160 * The run queue must be locked (see thread_run_queue_remove()
4161 * for more info).
4162 */
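/*
 * Returns TRUE if the enqueued thread became the new highest-priority
 * entry on the run queue, a hint that the caller may want to check for
 * preemption.
 */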
4163boolean_t
4164run_queue_enqueue(
4165 run_queue_t rq,
4166 thread_t thread,
4167 sched_options_t options)
4168{
4169 circle_queue_t queue = &rq->queues[thread->sched_pri];
4170 boolean_t result = FALSE;
4171
4172 assert_thread_magic(thread);
4173
4174 if (circle_queue_empty(cq: queue)) {
4175 circle_enqueue_tail(cq: queue, elt: &thread->runq_links);
4176
4177 rq_bitmap_set(map: rq->bitmap, n: thread->sched_pri);
4178 if (thread->sched_pri > rq->highq) {
4179 rq->highq = thread->sched_pri;
4180 result = TRUE;
4181 }
4182 } else {
4183 if (options & SCHED_TAILQ) {
4184 circle_enqueue_tail(cq: queue, elt: &thread->runq_links);
4185 } else {
4186 circle_enqueue_head(cq: queue, elt: &thread->runq_links);
4187 }
4188 }
4189 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
4190 rq->urgency++;
4191 }
4192 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
4193 rq->count++;
4194
4195 return result;
4196}
4197
4198/*
4199 * run_queue_remove:
4200 *
4201 * Remove a specific thread from a runqueue.
4202 *
4203 * The run queue must be locked.
4204 */
4205void
4206run_queue_remove(
4207 run_queue_t rq,
4208 thread_t thread)
4209{
4210 circle_queue_t queue = &rq->queues[thread->sched_pri];
4211
4212 thread_assert_runq_nonnull(thread);
4213 assert_thread_magic(thread);
4214
4215 circle_dequeue(cq: queue, elt: &thread->runq_links);
4216 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
4217 rq->count--;
4218 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
4219 rq->urgency--; assert(rq->urgency >= 0);
4220 }
4221
4222 if (circle_queue_empty(cq: queue)) {
4223 /* update run queue status */
4224 bitmap_clear(map: rq->bitmap, n: thread->sched_pri);
4225 rq->highq = bitmap_first(map: rq->bitmap, NRQS);
4226 }
4227
4228 thread_clear_runq(thread);
4229}
4230
4231/*
4232 * run_queue_peek
4233 *
4234 * Peek at the runq and return the highest
4235 * priority thread from the runq.
4236 *
4237 * The run queue must be locked.
4238 */
4239thread_t
4240run_queue_peek(
4241 run_queue_t rq)
4242{
4243 if (rq->count > 0) {
4244 circle_queue_t queue = &rq->queues[rq->highq];
4245 thread_t thread = cqe_queue_first(queue, struct thread, runq_links);
4246 assert_thread_magic(thread);
4247 return thread;
4248 } else {
4249 return THREAD_NULL;
4250 }
4251}
4252
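/*
 * rt_runq_enqueue:
 *
 * Insert a realtime thread into the deadline-ordered queue for its
 * priority band. Returns true if the thread landed at the head of that
 * band (earliest deadline), i.e. the caller should consider preempting.
 */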
4253static bool
4254rt_runq_enqueue(rt_queue_t rt_run_queue, thread_t thread, processor_t processor)
4255{
4256 int pri = thread->sched_pri;
4257 assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
4258 int i = pri - BASEPRI_RTQUEUES;
4259 rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
4260 bitmap_t *map = rt_run_queue->bitmap;
4261
4262 bitmap_set(map, n: i);
4263
4264 queue_t queue = &rt_runq->pri_queue;
4265 uint64_t deadline = thread->realtime.deadline;
4266 bool preempt = false;
4267 bool earliest = false;
4268
4269 if (queue_empty(queue)) {
4270 enqueue_tail(que: queue, elt: &thread->runq_links);
4271 preempt = true;
4272 earliest = true;
4273 rt_runq->pri_earliest_deadline = deadline;
4274 rt_runq->pri_constraint = thread->realtime.constraint;
4275 } else {
4276 /* Insert into rt_runq in thread deadline order */
4277 queue_entry_t iter;
4278 qe_foreach(iter, queue) {
4279 thread_t iter_thread = qe_element(iter, struct thread, runq_links);
4280 assert_thread_magic(iter_thread);
4281
4282 if (deadline < iter_thread->realtime.deadline) {
4283 if (iter == queue_first(queue)) {
4284 preempt = true;
4285 earliest = true;
4286 rt_runq->pri_earliest_deadline = deadline;
4287 rt_runq->pri_constraint = thread->realtime.constraint;
4288 }
4289 insque(entry: &thread->runq_links, queue_prev(iter));
4290 break;
4291 } else if (iter == queue_last(queue)) {
4292 enqueue_tail(que: queue, elt: &thread->runq_links);
4293 break;
4294 }
4295 }
4296 }
4297 if (earliest && (deadline < os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed))) {
4298 os_atomic_store_wide(&rt_run_queue->earliest_deadline, deadline, relaxed);
4299 os_atomic_store(&rt_run_queue->constraint, thread->realtime.constraint, relaxed);
4300 os_atomic_store(&rt_run_queue->ed_index, pri - BASEPRI_RTQUEUES, relaxed);
4301 }
4302
4303 SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
4304 rt_runq->pri_count++;
4305 os_atomic_inc(&rt_run_queue->count, relaxed);
4306
4307 thread_set_runq_locked(thread, new_runq: processor);
4308
4309 CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);
4310
4311 return preempt;
4312}
4313
4314static thread_t
4315rt_runq_dequeue(rt_queue_t rt_run_queue)
4316{
4317 bitmap_t *map = rt_run_queue->bitmap;
4318 int i = bitmap_first(map, NRTQS);
4319 assert((i >= 0) && (i < NRTQS));
4320
4321 rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
4322
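	/*
	 * Unless strict priority is enforced, prefer the thread with the
	 * earliest deadline over the highest-priority one, as long as running
	 * both computations back to back (plus the deadline epsilon) would
	 * still fit within the higher-priority thread's constraint.
	 */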
4323 if (!sched_rt_runq_strict_priority) {
4324 int ed_index = os_atomic_load(&rt_run_queue->ed_index, relaxed);
4325 if (ed_index != i) {
4326 assert((ed_index >= 0) && (ed_index < NRTQS));
4327 rt_queue_pri_t *ed_runq = &rt_run_queue->rt_queue_pri[ed_index];
4328
4329 thread_t ed_thread = qe_queue_first(&ed_runq->pri_queue, struct thread, runq_links);
4330 thread_t hi_thread = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
4331
4332 if (ed_thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon < hi_thread->realtime.constraint) {
4333 /* choose the earliest deadline thread */
4334 rt_runq = ed_runq;
4335 i = ed_index;
4336 }
4337 }
4338 }
4339
4340 assert(rt_runq->pri_count > 0);
4341 uint64_t earliest_deadline = RT_DEADLINE_NONE;
4342 uint32_t constraint = RT_CONSTRAINT_NONE;
4343 int ed_index = NOPRI;
4344 thread_t new_thread = qe_dequeue_head(&rt_runq->pri_queue, struct thread, runq_links);
4345 SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
4346 if (--rt_runq->pri_count > 0) {
4347 thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
4348 assert(next_rt != THREAD_NULL);
4349 earliest_deadline = next_rt->realtime.deadline;
4350 constraint = next_rt->realtime.constraint;
4351 ed_index = i;
4352 } else {
4353 bitmap_clear(map, n: i);
4354 }
4355 rt_runq->pri_earliest_deadline = earliest_deadline;
4356 rt_runq->pri_constraint = constraint;
4357
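	/*
	 * This priority band changed, so recompute the queue-wide earliest
	 * deadline, its constraint, and the index of the band holding it.
	 */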
4358 for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, prev: i)) {
4359 rt_runq = &rt_run_queue->rt_queue_pri[i];
4360 if (rt_runq->pri_earliest_deadline < earliest_deadline) {
4361 earliest_deadline = rt_runq->pri_earliest_deadline;
4362 constraint = rt_runq->pri_constraint;
4363 ed_index = i;
4364 }
4365 }
4366 os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
4367 os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
4368 os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
4369 os_atomic_dec(&rt_run_queue->count, relaxed);
4370
4371 thread_clear_runq(thread: new_thread);
4372
4373 CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);
4374
4375 return new_thread;
4376}
4377
4378static thread_t
4379rt_runq_first(rt_queue_t rt_run_queue)
4380{
4381 bitmap_t *map = rt_run_queue->bitmap;
4382 int i = bitmap_first(map, NRTQS);
4383 if (i < 0) {
4384 return THREAD_NULL;
4385 }
4386 rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
4387 thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
4388
4389 return next_rt;
4390}
4391
4392static void
4393rt_runq_remove(rt_queue_t rt_run_queue, thread_t thread)
4394{
4395 CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);
4396
4397 int pri = thread->sched_pri;
4398 assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
4399 int i = pri - BASEPRI_RTQUEUES;
4400 rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
4401 bitmap_t *map = rt_run_queue->bitmap;
4402
4403 assert(rt_runq->pri_count > 0);
4404 uint64_t earliest_deadline = RT_DEADLINE_NONE;
4405 uint32_t constraint = RT_CONSTRAINT_NONE;
4406 int ed_index = NOPRI;
4407 remqueue(elt: &thread->runq_links);
4408 SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
4409 if (--rt_runq->pri_count > 0) {
4410 thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
4411 earliest_deadline = next_rt->realtime.deadline;
4412 constraint = next_rt->realtime.constraint;
4413 ed_index = i;
4414 } else {
4415 bitmap_clear(map, n: i);
4416 }
4417 rt_runq->pri_earliest_deadline = earliest_deadline;
4418 rt_runq->pri_constraint = constraint;
4419
4420 for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, prev: i)) {
4421 rt_runq = &rt_run_queue->rt_queue_pri[i];
4422 if (rt_runq->pri_earliest_deadline < earliest_deadline) {
4423 earliest_deadline = rt_runq->pri_earliest_deadline;
4424 constraint = rt_runq->pri_constraint;
4425 ed_index = i;
4426 }
4427 }
4428 os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
4429 os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
4430 os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
4431 os_atomic_dec(&rt_run_queue->count, relaxed);
4432
4433 thread_clear_runq_locked(thread);
4434
4435 CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);
4436}
4437
4438rt_queue_t
4439sched_rtlocal_runq(processor_set_t pset)
4440{
4441 return &pset->rt_runq;
4442}
4443
4444void
4445sched_rtlocal_init(processor_set_t pset)
4446{
4447 pset_rt_init(pset);
4448}
4449
4450void
4451sched_rtlocal_queue_shutdown(processor_t processor)
4452{
4453 processor_set_t pset = processor->processor_set;
4454 thread_t thread;
4455 queue_head_t tqueue;
4456
4457 pset_lock(pset);
4458
4459 /* We only need to migrate threads if this is the last active or last recommended processor in the pset */
4460 if (bit_count(x: pset_available_cpumap(pset)) > 0) {
4461 pset_unlock(pset);
4462 return;
4463 }
4464
4465 queue_init(&tqueue);
4466
4467 while (rt_runq_count(pset) > 0) {
4468 thread = rt_runq_dequeue(rt_run_queue: &pset->rt_runq);
4469 enqueue_tail(que: &tqueue, elt: &thread->runq_links);
4470 }
4471 sched_update_pset_load_average(pset, curtime: 0);
4472 pset_update_rt_stealable_state(pset);
4473 pset_unlock(pset);
4474
4475 qe_foreach_element_safe(thread, &tqueue, runq_links) {
4476 remqueue(elt: &thread->runq_links);
4477
4478 thread_lock(thread);
4479
4480 thread_setrun(thread, options: SCHED_TAILQ);
4481
4482 thread_unlock(thread);
4483 }
4484}
4485
4486/* Assumes RT lock is not held, and acquires splsched/rt_lock itself */
4487void
4488sched_rtlocal_runq_scan(sched_update_scan_context_t scan_context)
4489{
4490 thread_t thread;
4491
4492 pset_node_t node = &pset_node0;
4493 processor_set_t pset = node->psets;
4494
4495 spl_t s = splsched();
4496 do {
4497 while (pset != NULL) {
4498 pset_lock(pset);
4499
4500 bitmap_t *map = pset->rt_runq.bitmap;
4501 for (int i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, prev: i)) {
4502 rt_queue_pri_t *rt_runq = &pset->rt_runq.rt_queue_pri[i];
4503
4504 qe_foreach_element_safe(thread, &rt_runq->pri_queue, runq_links) {
4505 if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
4506 scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
4507 }
4508 }
4509 }
4510
4511 pset_unlock(pset);
4512
4513 pset = pset->pset_list;
4514 }
4515 } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
4516 splx(s);
4517}
4518
4519int64_t
4520sched_rtlocal_runq_count_sum(void)
4521{
4522 pset_node_t node = &pset_node0;
4523 processor_set_t pset = node->psets;
4524 int64_t count = 0;
4525
4526 do {
4527 while (pset != NULL) {
4528 count += pset->rt_runq.runq_stats.count_sum;
4529
4530 pset = pset->pset_list;
4531 }
4532 } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
4533
4534 return count;
4535}
4536
4537/*
4538 * Called with stealing_pset locked and
4539 * returns with stealing_pset locked,
4540 * but the lock will have been dropped and
4541 * retaken along the way if a thread is returned.
4542 */
4543thread_t
4544sched_rtlocal_steal_thread(processor_set_t stealing_pset, uint64_t earliest_deadline)
4545{
4546 if (!sched_allow_rt_steal) {
4547 return THREAD_NULL;
4548 }
4549 pset_map_t pset_map = stealing_pset->node->pset_map;
4550
4551 bit_clear(pset_map, stealing_pset->pset_id);
4552
4553 processor_set_t pset = stealing_pset;
4554
4555 processor_set_t target_pset;
4556 uint64_t target_deadline;
4557
4558retry:
4559 target_pset = NULL;
4560 target_deadline = earliest_deadline - rt_deadline_epsilon;
4561
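	/*
	 * Look for a pset whose earliest stealable deadline beats ours by more
	 * than rt_deadline_epsilon; otherwise stealing isn't worth it.
	 */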
4562 for (int pset_id = lsb_first(bitmap: pset_map); pset_id >= 0; pset_id = lsb_next(bitmap: pset_map, previous_bit: pset_id)) {
4563 processor_set_t nset = pset_array[pset_id];
4564
4565 /*
4566 * During startup, while pset_array[] and node->pset_map are still being initialized,
4567 * the update to pset_map may become visible to this cpu before the update to pset_array[].
4568 * It would be good to avoid inserting a memory barrier here that is only needed during startup,
4569 * so just check nset is not NULL instead.
4570 */
4571 if (nset && (nset->stealable_rt_threads_earliest_deadline < target_deadline)) {
4572 target_deadline = nset->stealable_rt_threads_earliest_deadline;
4573 target_pset = nset;
4574 }
4575 }
4576
4577 if (target_pset != NULL) {
4578 pset = change_locked_pset(current_pset: pset, new_pset: target_pset);
4579 if (pset->stealable_rt_threads_earliest_deadline <= target_deadline) {
4580 thread_t new_thread = rt_runq_dequeue(rt_run_queue: &pset->rt_runq);
4581 pset_update_rt_stealable_state(pset);
4582 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_STEAL) | DBG_FUNC_NONE, (uintptr_t)thread_tid(new_thread), pset->pset_id, pset->cpu_set_low, 0);
4583
4584 pset = change_locked_pset(current_pset: pset, new_pset: stealing_pset);
4585 return new_thread;
4586 }
4587 pset = change_locked_pset(current_pset: pset, new_pset: stealing_pset);
4588 earliest_deadline = rt_runq_earliest_deadline(pset);
4589 goto retry;
4590 }
4591
4592 pset = change_locked_pset(current_pset: pset, new_pset: stealing_pset);
4593 return THREAD_NULL;
4594}
4595
4596/*
4597 * pset is locked
4598 */
4599thread_t
4600sched_rt_choose_thread(processor_set_t pset)
4601{
4602 processor_t processor = current_processor();
4603
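	/*
	 * Try to pull a thread for this CPU (clearing any spill request aimed
	 * at it along the way), retrying while a new spill request arrives;
	 * fall back to this pset's own RT run queue if stealing finds nothing.
	 */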
4604 if (SCHED(steal_thread_enabled)(pset)) {
4605 do {
4606 bool spill_pending = bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
4607 if (spill_pending) {
4608 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 2);
4609 }
4610 thread_t new_thread = SCHED(rt_steal_thread)(pset, rt_runq_earliest_deadline(pset));
4611 if (new_thread != THREAD_NULL) {
4612 if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
4613 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 3);
4614 }
4615 return new_thread;
4616 }
4617 } while (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id));
4618 }
4619
4620 if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
4621 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 4);
4622 }
4623
4624 if (rt_runq_count(pset) > 0) {
4625 thread_t new_thread = rt_runq_dequeue(SCHED(rt_runq)(pset));
4626 assert(new_thread != THREAD_NULL);
4627 pset_update_rt_stealable_state(pset);
4628 return new_thread;
4629 }
4630
4631 return THREAD_NULL;
4632}
4633
4634/*
4635 * realtime_queue_insert:
4636 *
4637 * Enqueue a thread for realtime execution.
4638 */
4639static bool
4640realtime_queue_insert(processor_t processor, processor_set_t pset, thread_t thread)
4641{
4642 pset_assert_locked(pset);
4643
4644 bool preempt = rt_runq_enqueue(SCHED(rt_runq)(pset), thread, processor);
4645 pset_update_rt_stealable_state(pset);
4646
4647 return preempt;
4648}
4649
4650/*
4651 * realtime_setrun:
4652 *
4653 * Dispatch a thread for realtime execution.
4654 *
4655 * Thread must be locked. Associated pset must
4656 * be locked, and is returned unlocked.
4657 */
4658static void
4659realtime_setrun(
4660 processor_t chosen_processor,
4661 thread_t thread)
4662{
4663 processor_set_t pset = chosen_processor->processor_set;
4664 pset_assert_locked(pset);
4665 bool pset_is_locked = true;
4666
4667 int n_backup = 0;
4668
4669 if (thread->realtime.constraint <= rt_constraint_threshold) {
4670 n_backup = sched_rt_n_backup_processors;
4671 }
4672 assert((n_backup >= 0) && (n_backup <= SCHED_MAX_BACKUP_PROCESSORS));
4673
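	/*
	 * CPUs that already have urgent ASTs pending over and above the number
	 * of queued RT threads are treated as backups already in flight, so
	 * request correspondingly fewer new ones.
	 */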
4674 int existing_backups = bit_count(x: pset->pending_AST_URGENT_cpu_mask) - rt_runq_count(pset);
4675 if (existing_backups > 0) {
4676 n_backup = n_backup - existing_backups;
4677 if (n_backup < 0) {
4678 n_backup = 0;
4679 }
4680 }
4681
4682 sched_ipi_type_t ipi_type[SCHED_MAX_BACKUP_PROCESSORS + 1] = {};
4683 processor_t ipi_processor[SCHED_MAX_BACKUP_PROCESSORS + 1] = {};
4684
4685 thread->chosen_processor = chosen_processor;
4686
4687 /* <rdar://problem/15102234> */
4688 assert(thread->bound_processor == PROCESSOR_NULL);
4689
4690 realtime_queue_insert(processor: chosen_processor, pset, thread);
4691
4692 processor_t processor = chosen_processor;
4693
4694 int count = 0;
4695 for (int i = 0; i <= n_backup; i++) {
4696 if (i == 0) {
4697 ipi_type[i] = SCHED_IPI_NONE;
4698 ipi_processor[i] = processor;
4699 count++;
4700
4701 ast_t preempt = AST_NONE;
4702 if (thread->sched_pri > processor->current_pri) {
4703 preempt = (AST_PREEMPT | AST_URGENT);
4704 } else if (thread->sched_pri == processor->current_pri) {
4705 if (deadline_add(d: thread->realtime.deadline, e: rt_deadline_epsilon) < processor->deadline) {
4706 preempt = (AST_PREEMPT | AST_URGENT);
4707 }
4708 }
4709
4710 if (preempt != AST_NONE) {
4711 if (processor->state == PROCESSOR_IDLE) {
4712 if (processor == current_processor()) {
4713 pset_update_processor_state(pset, processor, new_state: PROCESSOR_DISPATCHING);
4714 ast_on(reasons: preempt);
4715
4716 if ((preempt & AST_URGENT) == AST_URGENT) {
4717 if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4718 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4719 processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 1);
4720 }
4721 }
4722
4723 if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
4724 bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
4725 }
4726 } else {
4727 ipi_type[i] = sched_ipi_action(dst: processor, thread, event: SCHED_IPI_EVENT_RT_PREEMPT);
4728 }
4729 } else if (processor->state == PROCESSOR_DISPATCHING) {
4730 if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4731 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4732 processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 2);
4733 }
4734 } else {
4735 if (processor == current_processor()) {
4736 ast_on(reasons: preempt);
4737
4738 if ((preempt & AST_URGENT) == AST_URGENT) {
4739 if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4740 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4741 processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 3);
4742 }
4743 }
4744
4745 if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
4746 bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
4747 }
4748 } else {
4749 ipi_type[i] = sched_ipi_action(dst: processor, thread, event: SCHED_IPI_EVENT_RT_PREEMPT);
4750 }
4751 }
4752 } else {
4753				/* Selected processor was too busy; just keep the thread enqueued and let other processors drain it naturally. */
4754 }
4755 } else {
4756 if (!pset_is_locked) {
4757 pset_lock(pset);
4758 }
4759 ipi_type[i] = SCHED_IPI_NONE;
4760 ipi_processor[i] = PROCESSOR_NULL;
4761 pset_is_locked = !choose_next_rt_processor_for_IPI(starting_pset: pset, chosen_processor, false, result_processor: &ipi_processor[i], result_ipi_type: &ipi_type[i]);
4762 if (ipi_processor[i] == PROCESSOR_NULL) {
4763 break;
4764 }
4765 count++;
4766
4767 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
4768 ipi_processor[i]->cpu_id, ipi_processor[i]->state, backup, 1);
4769#if defined(__x86_64__)
4770#define p_is_good(p) (((p)->processor_primary == (p)) && ((sched_avoid_cpu0 != 1) || ((p)->cpu_id != 0)))
4771 if (n_backup == SCHED_DEFAULT_BACKUP_PROCESSORS_SMT) {
4772 processor_t p0 = ipi_processor[0];
4773 processor_t p1 = ipi_processor[1];
4774 assert(p0 && p1);
4775 if (p_is_good(p0) && p_is_good(p1)) {
4776 /*
4777 * Both the chosen processor and the first backup are non-cpu0 primaries,
4778 * so there is no need for a 2nd backup processor.
4779 */
4780 break;
4781 }
4782 }
4783#endif
4784 }
4785 }
4786
4787 if (pset_is_locked) {
4788 pset_unlock(pset);
4789 }
4790
4791 assert((count > 0) && (count <= (n_backup + 1)));
4792 for (int i = 0; i < count; i++) {
4793 assert(ipi_processor[i] != PROCESSOR_NULL);
4794 sched_ipi_perform(dst: ipi_processor[i], ipi: ipi_type[i]);
4795 }
4796}
4797
4798
4799sched_ipi_type_t
4800sched_ipi_deferred_policy(processor_set_t pset, processor_t dst,
4801 thread_t thread, __unused sched_ipi_event_t event)
4802{
4803#if defined(CONFIG_SCHED_DEFERRED_AST)
4804#if CONFIG_THREAD_GROUPS
4805 if (thread) {
4806 struct thread_group *tg = thread_group_get(t: thread);
4807 if (thread_group_uses_immediate_ipi(tg)) {
4808 return SCHED_IPI_IMMEDIATE;
4809 }
4810 }
4811#endif /* CONFIG_THREAD_GROUPS */
4812 if (!bit_test(pset->pending_deferred_AST_cpu_mask, dst->cpu_id)) {
4813 return SCHED_IPI_DEFERRED;
4814 }
4815#else /* CONFIG_SCHED_DEFERRED_AST */
4816 (void) thread;
4817 panic("Request for deferred IPI on an unsupported platform; pset: %p CPU: %d", pset, dst->cpu_id);
4818#endif /* CONFIG_SCHED_DEFERRED_AST */
4819 return SCHED_IPI_NONE;
4820}
4821
4822sched_ipi_type_t
4823sched_ipi_action(processor_t dst, thread_t thread, sched_ipi_event_t event)
4824{
4825 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
4826 assert(dst != NULL);
4827
4828 processor_set_t pset = dst->processor_set;
4829 if (current_processor() == dst) {
4830 return SCHED_IPI_NONE;
4831 }
4832
4833 bool dst_idle = (dst->state == PROCESSOR_IDLE);
4834 if (dst_idle) {
4835 pset_update_processor_state(pset, processor: dst, new_state: PROCESSOR_DISPATCHING);
4836 }
4837
4838 ipi_type = SCHED(ipi_policy)(dst, thread, dst_idle, event);
4839 switch (ipi_type) {
4840 case SCHED_IPI_NONE:
4841 return SCHED_IPI_NONE;
4842#if defined(CONFIG_SCHED_DEFERRED_AST)
4843 case SCHED_IPI_DEFERRED:
4844 bit_set(pset->pending_deferred_AST_cpu_mask, dst->cpu_id);
4845 break;
4846#endif /* CONFIG_SCHED_DEFERRED_AST */
4847 default:
4848 if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, dst->cpu_id)) {
4849 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4850 dst->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 4);
4851 }
4852 bit_set(pset->pending_AST_PREEMPT_cpu_mask, dst->cpu_id);
4853 break;
4854 }
4855 return ipi_type;
4856}
4857
4858sched_ipi_type_t
4859sched_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
4860{
4861 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
4862 boolean_t deferred_ipi_supported = false;
4863 processor_set_t pset = dst->processor_set;
4864
4865#if defined(CONFIG_SCHED_DEFERRED_AST)
4866 deferred_ipi_supported = true;
4867#endif /* CONFIG_SCHED_DEFERRED_AST */
4868
4869 switch (event) {
4870 case SCHED_IPI_EVENT_SPILL:
4871 case SCHED_IPI_EVENT_SMT_REBAL:
4872 case SCHED_IPI_EVENT_REBALANCE:
4873 case SCHED_IPI_EVENT_BOUND_THR:
4874 case SCHED_IPI_EVENT_RT_PREEMPT:
4875 /*
4876	 * The RT preempt, spill, SMT rebalance, rebalance, and bound thread
4877	 * scenarios always use immediate IPIs.
4878 */
4879 ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4880 break;
4881 case SCHED_IPI_EVENT_PREEMPT:
4882 /* In the preemption case, use immediate IPIs for RT threads */
4883 if (thread && (thread->sched_pri >= BASEPRI_RTQUEUES)) {
4884 ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4885 break;
4886 }
4887
4888 /*
4889	 * For non-RT thread preemption:
4890	 * if the core is active, use an immediate IPI;
4891	 * if the core is idle, use a deferred IPI if supported, otherwise an immediate IPI.
4892 */
4893 if (deferred_ipi_supported && dst_idle) {
4894 return sched_ipi_deferred_policy(pset, dst, thread, event);
4895 }
4896 ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4897 break;
4898 default:
4899 panic("Unrecognized scheduler IPI event type %d", event);
4900 }
4901 assert(ipi_type != SCHED_IPI_NONE);
4902 return ipi_type;
4903}
4904
4905void
4906sched_ipi_perform(processor_t dst, sched_ipi_type_t ipi)
4907{
4908 switch (ipi) {
4909 case SCHED_IPI_NONE:
4910 break;
4911 case SCHED_IPI_IDLE:
4912 machine_signal_idle(processor: dst);
4913 break;
4914 case SCHED_IPI_IMMEDIATE:
4915 cause_ast_check(processor: dst);
4916 break;
4917 case SCHED_IPI_DEFERRED:
4918 machine_signal_idle_deferred(processor: dst);
4919 break;
4920 default:
4921 panic("Unrecognized scheduler IPI type: %d", ipi);
4922 }
4923}
4924
4925#if defined(CONFIG_SCHED_TIMESHARE_CORE)
4926
4927boolean_t
4928priority_is_urgent(int priority)
4929{
4930 return bitmap_test(map: sched_preempt_pri, n: priority) ? TRUE : FALSE;
4931}
4932
4933#endif /* CONFIG_SCHED_TIMESHARE_CORE */
4934
4935/*
4936 * processor_setrun:
4937 *
4938 * Dispatch a thread for execution on a
4939 * processor.
4940 *
4941 * Thread must be locked. Associated pset must
4942 * be locked, and is returned unlocked.
4943 */
4944static void
4945processor_setrun(
4946 processor_t processor,
4947 thread_t thread,
4948 integer_t options)
4949{
4950 processor_set_t pset = processor->processor_set;
4951 pset_assert_locked(pset);
4952 ast_t preempt = AST_NONE;
4953 enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing;
4954
4955 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
4956
4957 thread->chosen_processor = processor;
4958
4959 /*
4960 * Set preemption mode.
4961 */
4962#if defined(CONFIG_SCHED_DEFERRED_AST)
4963 /* TODO: Do we need to care about urgency (see rdar://problem/20136239)? */
4964#endif
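	/*
	 * An urgent-priority thread that beats the running thread, or any
	 * enqueue onto a processor whose current thread is marked
	 * eager-preempt, asks for an urgent preemption.  A timeshare thread
	 * whose priority has decayed below its base priority only requests
	 * (non-urgent) preemption when its base priority is urgent, its
	 * current priority beats the running thread, and the caller passed
	 * SCHED_PREEMPT; everything else preempts only if the caller asked.
	 */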
4965 if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri) {
4966 preempt = (AST_PREEMPT | AST_URGENT);
4967 } else if (processor->current_is_eagerpreempt) {
4968 preempt = (AST_PREEMPT | AST_URGENT);
4969 } else if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->sched_pri < thread->base_pri)) {
4970 if (SCHED(priority_is_urgent)(thread->base_pri) && thread->sched_pri > processor->current_pri) {
4971 preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
4972 } else {
4973 preempt = AST_NONE;
4974 }
4975 } else {
4976 preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
4977 }
4978
4979 if ((options & (SCHED_PREEMPT | SCHED_REBALANCE)) == (SCHED_PREEMPT | SCHED_REBALANCE)) {
4980 /*
4981 * Having gone to the trouble of forcing this thread off a less preferred core,
4982 * we should force the preferable core to reschedule immediately to give this
4983 * thread a chance to run instead of just sitting on the run queue where
4984 * it may just be stolen back by the idle core we just forced it off.
4985 */
4986 preempt |= AST_PREEMPT;
4987 }
4988
4989 SCHED(processor_enqueue)(processor, thread, options);
4990 sched_update_pset_load_average(pset, curtime: 0);
4991
4992 if (preempt != AST_NONE) {
4993 if (processor->state == PROCESSOR_IDLE) {
4994 ipi_action = eExitIdle;
4995 } else if (processor->state == PROCESSOR_DISPATCHING) {
4996 if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4997 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4998 processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 5);
4999 }
5000 } else if ((processor->state == PROCESSOR_RUNNING ||
5001 processor->state == PROCESSOR_SHUTDOWN) &&
5002 (thread->sched_pri >= processor->current_pri)) {
5003 ipi_action = eInterruptRunning;
5004 }
5005 } else {
5006 /*
5007		 * The new thread is not important enough to preempt what is running, but
5008		 * special processor states may need special handling.
5009 */
5010 if (processor->state == PROCESSOR_SHUTDOWN &&
5011 thread->sched_pri >= processor->current_pri) {
5012 ipi_action = eInterruptRunning;
5013 } else if (processor->state == PROCESSOR_IDLE) {
5014 ipi_action = eExitIdle;
5015 } else if (processor->state == PROCESSOR_DISPATCHING) {
5016 if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
5017 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
5018 processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 6);
5019 }
5020 }
5021 }
5022
5023 if (ipi_action != eDoNothing) {
5024 if (processor == current_processor()) {
5025 if (ipi_action == eExitIdle) {
5026 pset_update_processor_state(pset, processor, new_state: PROCESSOR_DISPATCHING);
5027 }
5028 if ((preempt = csw_check_locked(thread: processor->active_thread, processor, pset, AST_NONE)) != AST_NONE) {
5029 ast_on(reasons: preempt);
5030 }
5031
5032 if ((preempt & AST_URGENT) == AST_URGENT) {
5033 if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
5034 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
5035 processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 7);
5036 }
5037 } else {
5038 if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
5039 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 7);
5040 }
5041 }
5042
5043 if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
5044 bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
5045 } else {
5046 bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
5047 }
5048 } else {
5049 sched_ipi_event_t event = (options & SCHED_REBALANCE) ? SCHED_IPI_EVENT_REBALANCE : SCHED_IPI_EVENT_PREEMPT;
5050 ipi_type = sched_ipi_action(dst: processor, thread, event);
5051 }
5052 }
5053
5054 pset_unlock(pset);
5055 sched_ipi_perform(dst: processor, ipi: ipi_type);
5056
5057 if (ipi_action != eDoNothing && processor == current_processor()) {
5058 ast_t new_preempt = update_pending_nonurgent_preemption(processor, reason: preempt);
5059 ast_on(reasons: new_preempt);
5060 }
5061}
5062
5063/*
5064 * choose_next_pset:
5065 *
5066 * Return the next sibling pset containing
5067 * available processors.
5068 *
5069 * Returns the original pset if none other is
5070 * suitable.
5071 */
5072static processor_set_t
5073choose_next_pset(
5074 processor_set_t pset)
5075{
5076 processor_set_t nset = pset;
5077
5078 do {
5079 nset = next_pset(pset: nset);
5080
5081 /*
5082 * Sometimes during startup the pset_map can contain a bit
5083 * for a pset that isn't fully published in pset_array because
5084 * the pset_map read isn't an acquire load.
5085 *
5086 * In order to avoid needing an acquire barrier here, just bail
5087 * out.
5088 */
5089 if (nset == PROCESSOR_SET_NULL) {
5090 return pset;
5091 }
5092 } while (nset->online_processor_count < 1 && nset != pset);
5093
5094 return nset;
5095}
5096
5097/*
5098 * choose_processor:
5099 *
5100 * Choose a processor for the thread, beginning at
5101 * the pset. Accepts an optional processor hint in
5102 * the pset.
5103 *
5104 * Returns a processor, possibly from a different pset.
5105 *
5106 * The thread must be locked. The pset must be locked,
5107 * and the resulting pset is locked on return.
5108 */
5109processor_t
5110choose_processor(
5111 processor_set_t starting_pset,
5112 processor_t processor,
5113 thread_t thread)
5114{
5115 processor_set_t pset = starting_pset;
5116 processor_set_t nset;
5117
5118 assert(thread->sched_pri <= MAXPRI);
5119
5120 /*
5121 * Prefer the hinted processor, when appropriate.
5122 */
5123
5124 /* Fold last processor hint from secondary processor to its primary */
5125 if (processor != PROCESSOR_NULL) {
5126 processor = processor->processor_primary;
5127 }
5128
5129 /*
5130 * Only consult platform layer if pset is active, which
5131 * it may not be in some cases when a multi-set system
5132 * is going to sleep.
5133 */
5134 if (pset->online_processor_count) {
5135 if ((processor == PROCESSOR_NULL) || (processor->processor_set == pset && processor->state == PROCESSOR_IDLE)) {
5136 processor_t mc_processor = machine_choose_processor(pset, processor);
5137 if (mc_processor != PROCESSOR_NULL) {
5138 processor = mc_processor->processor_primary;
5139 }
5140 }
5141 }
5142
5143 /*
5144 * At this point, we may have a processor hint, and we may have
5145 * an initial starting pset. If the hint is not in the pset, or
5146 * if the hint is for a processor in an invalid state, discard
5147 * the hint.
5148 */
5149 if (processor != PROCESSOR_NULL) {
5150 if (processor->processor_set != pset) {
5151 processor = PROCESSOR_NULL;
5152 } else if (!processor->is_recommended) {
5153 processor = PROCESSOR_NULL;
5154 } else {
5155 switch (processor->state) {
5156 case PROCESSOR_START:
5157 case PROCESSOR_SHUTDOWN:
5158 case PROCESSOR_PENDING_OFFLINE:
5159 case PROCESSOR_OFF_LINE:
5160 /*
5161 * Hint is for a processor that cannot support running new threads.
5162 */
5163 processor = PROCESSOR_NULL;
5164 break;
5165 case PROCESSOR_IDLE:
5166 /*
5167 * Hint is for an idle processor. Assume it is no worse than any other
5168 * idle processor. The platform layer had an opportunity to provide
5169 * the "least cost idle" processor above.
5170 */
5171 if ((thread->sched_pri < BASEPRI_RTQUEUES) || processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
5172 uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] & pset->primary_map & pset->recommended_bitmask);
5173 uint64_t non_avoided_idle_primary_map = idle_primary_map & ~pset->perfcontrol_cpu_migration_bitmask;
5174 /*
5175				 * If the rotation bitmask to force a migration is set for this core and there's an idle core
5176				 * that needn't be avoided, don't continue running on the same core.
5177 */
5178 if (!(bit_test(processor->processor_set->perfcontrol_cpu_migration_bitmask, processor->cpu_id) && non_avoided_idle_primary_map != 0)) {
5179 return processor;
5180 }
5181 }
5182 processor = PROCESSOR_NULL;
5183 break;
5184 case PROCESSOR_RUNNING:
5185 case PROCESSOR_DISPATCHING:
5186 /*
5187 * Hint is for an active CPU. This fast-path allows
5188 * realtime threads to preempt non-realtime threads
5189 * to regain their previous executing processor.
5190 */
5191 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5192 if (processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
5193 return processor;
5194 }
5195 processor = PROCESSOR_NULL;
5196 }
5197
5198 /* Otherwise, use hint as part of search below */
5199 break;
5200 default:
5201 processor = PROCESSOR_NULL;
5202 break;
5203 }
5204 }
5205 }
5206
5207 /*
5208 * Iterate through the processor sets to locate
5209 * an appropriate processor. Seed results with
5210 * a last-processor hint, if available, so that
5211 * a search must find something strictly better
5212 * to replace it.
5213 *
5214 * A primary/secondary pair of SMT processors are
5215 * "unpaired" if the primary is busy but its
5216 * corresponding secondary is idle (so the physical
5217 * core has full use of its resources).
5218 */
5219
5220 integer_t lowest_priority = MAXPRI + 1;
5221 integer_t lowest_secondary_priority = MAXPRI + 1;
5222 integer_t lowest_unpaired_primary_priority = MAXPRI + 1;
5223 integer_t lowest_idle_secondary_priority = MAXPRI + 1;
5224 integer_t lowest_count = INT_MAX;
5225 processor_t lp_processor = PROCESSOR_NULL;
5226 processor_t lp_unpaired_primary_processor = PROCESSOR_NULL;
5227 processor_t lp_idle_secondary_processor = PROCESSOR_NULL;
5228 processor_t lp_paired_secondary_processor = PROCESSOR_NULL;
5229 processor_t lc_processor = PROCESSOR_NULL;
5230
5231 if (processor != PROCESSOR_NULL) {
5232 /* All other states should be enumerated above. */
5233 assert(processor->state == PROCESSOR_RUNNING || processor->state == PROCESSOR_DISPATCHING);
5234 assert(thread->sched_pri < BASEPRI_RTQUEUES);
5235
5236 lowest_priority = processor->current_pri;
5237 lp_processor = processor;
5238
5239 lowest_count = SCHED(processor_runq_count)(processor);
5240 lc_processor = processor;
5241 }
5242
5243 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5244 pset_node_t node = pset->node;
5245 bool include_ast_urgent_pending_cpus = false;
5246 cpumap_t ast_urgent_pending;
5247try_again:
5248 ast_urgent_pending = 0;
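	/*
	 * consider_secondaries starts at 0 only when this is an SMT system with
	 * more than one pset in the node, some pset still has a primary CPU not
	 * running realtime work, and we are not on the ast_urgent_pending retry.
	 * In that case the sweep below runs twice: primaries only, then once
	 * more with secondary CPUs eligible. Otherwise it starts at 1 and the
	 * psets are swept a single time with secondaries considered.
	 */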
	int consider_secondaries = (!pset->is_SMT) || (bit_count(node->pset_map) == 1) || (node->pset_non_rt_primary_map == 0) || include_ast_urgent_pending_cpus;
	for (; consider_secondaries < 2; consider_secondaries++) {
		pset = change_locked_pset(pset, starting_pset);
5252 do {
5253 cpumap_t available_map = pset_available_cpumap(pset);
5254 if (available_map == 0) {
5255 goto no_available_cpus;
5256 }
5257
5258 processor = choose_processor_for_realtime_thread(pset, PROCESSOR_NULL, consider_secondaries, false);
5259 if (processor) {
5260 return processor;
5261 }
5262
5263 if (consider_secondaries) {
					processor = choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, include_ast_urgent_pending_cpus);
5265 if (processor) {
5266 /*
5267 * Instead of looping through all the psets to find the global
5268 * furthest deadline processor, preempt the first candidate found.
5269 * The preempted thread will then find any other available far deadline
5270 * processors to preempt.
5271 */
5272 return processor;
5273 }
5274
5275 ast_urgent_pending |= pset->pending_AST_URGENT_cpu_mask;
5276
5277 if (rt_runq_count(pset) < lowest_count) {
						int cpuid = bit_first(available_map);
5279 assert(cpuid >= 0);
5280 lc_processor = processor_array[cpuid];
5281 lowest_count = rt_runq_count(pset);
5282 }
5283 }
5284
5285no_available_cpus:
5286 nset = next_pset(pset);
5287
5288 if (nset != starting_pset) {
				pset = change_locked_pset(pset, nset);
5290 }
5291 } while (nset != starting_pset);
5292 }
5293
5294 /* Short cut for single pset nodes */
		if (bit_count(node->pset_map) == 1) {
5296 if (lc_processor) {
5297 pset_assert_locked(lc_processor->processor_set);
5298 return lc_processor;
5299 }
5300 } else {
5301 if (ast_urgent_pending && !include_ast_urgent_pending_cpus) {
5302 /* See the comment in choose_furthest_deadline_processor_for_realtime_thread() */
5303 include_ast_urgent_pending_cpus = true;
5304 goto try_again;
5305 }
5306 }
5307
5308 processor = lc_processor;
5309
5310 if (processor) {
			pset = change_locked_pset(pset, processor->processor_set);
5312 /* Check that chosen processor is still usable */
5313 cpumap_t available_map = pset_available_cpumap(pset);
5314 if (bit_test(available_map, processor->cpu_id)) {
5315 return processor;
5316 }
5317
5318 /* processor is no longer usable */
5319 processor = PROCESSOR_NULL;
5320 }
5321
5322 pset_assert_locked(pset);
5323 pset_unlock(pset);
5324 return PROCESSOR_NULL;
5325 }
5326
5327 /* No realtime threads from this point on */
5328 assert(thread->sched_pri < BASEPRI_RTQUEUES);
5329
5330 do {
5331 /*
5332 * Choose an idle processor, in pset traversal order
5333 */
5334 uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] & pset->primary_map & pset->recommended_bitmask);
5335 uint64_t preferred_idle_primary_map = idle_primary_map & pset->perfcontrol_cpu_preferred_bitmask;
5336
5337 /* there shouldn't be a pending AST if the processor is idle */
5338 assert((idle_primary_map & pset->pending_AST_URGENT_cpu_mask) == 0);
5339
5340 /*
5341 * Look at the preferred cores first.
5342 */
		int cpuid = lsb_next(preferred_idle_primary_map, pset->cpu_preferred_last_chosen);
		if (cpuid < 0) {
			cpuid = lsb_first(preferred_idle_primary_map);
5346 }
5347 if (cpuid >= 0) {
5348 processor = processor_array[cpuid];
5349 pset->cpu_preferred_last_chosen = cpuid;
5350 return processor;
5351 }
5352
5353 /*
5354 * Look at the cores that don't need to be avoided next.
5355 */
5356 if (pset->perfcontrol_cpu_migration_bitmask != 0) {
5357 uint64_t non_avoided_idle_primary_map = idle_primary_map & ~pset->perfcontrol_cpu_migration_bitmask;
			cpuid = lsb_next(non_avoided_idle_primary_map, pset->cpu_preferred_last_chosen);
			if (cpuid < 0) {
				cpuid = lsb_first(non_avoided_idle_primary_map);
5361 }
5362 if (cpuid >= 0) {
5363 processor = processor_array[cpuid];
5364 pset->cpu_preferred_last_chosen = cpuid;
5365 return processor;
5366 }
5367 }
5368
5369 /*
5370 * Fall back to any remaining idle cores if none of the preferred ones and non-avoided ones are available.
5371 */
		cpuid = lsb_first(idle_primary_map);
5373 if (cpuid >= 0) {
5374 processor = processor_array[cpuid];
5375 return processor;
5376 }
5377
5378 /*
5379 * Otherwise, enumerate active and idle processors to find primary candidates
5380 * with lower priority/etc.
5381 */
5382
5383 uint64_t active_map = ((pset->cpu_state_map[PROCESSOR_RUNNING] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
5384 pset->recommended_bitmask &
5385 ~pset->pending_AST_URGENT_cpu_mask);
5386
5387 if (SCHED(priority_is_urgent)(thread->sched_pri) == FALSE) {
5388 active_map &= ~pset->pending_AST_PREEMPT_cpu_mask;
5389 }
5390
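		/*
		 * Rotating active_map right by (last_chosen + 1) makes the
		 * lsb_first/lsb_next walk start at the CPU after the one chosen
		 * last time; the real cpuid is recovered by adding last_chosen + 1
		 * back, modulo 64. For example, with last_chosen == 5, rotid 0
		 * corresponds to cpuid 6.
		 */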
		active_map = bit_ror64(active_map, (pset->last_chosen + 1));
		for (int rotid = lsb_first(active_map); rotid >= 0; rotid = lsb_next(active_map, rotid)) {
5393 cpuid = ((rotid + pset->last_chosen + 1) & 63);
5394 processor = processor_array[cpuid];
5395
5396 integer_t cpri = processor->current_pri;
5397 processor_t primary = processor->processor_primary;
5398 if (primary != processor) {
5399 /* If primary is running a NO_SMT thread, don't choose its secondary */
				if (!((primary->state == PROCESSOR_RUNNING) && processor_active_thread_no_smt(primary))) {
5401 if (cpri < lowest_secondary_priority) {
5402 lowest_secondary_priority = cpri;
5403 lp_paired_secondary_processor = processor;
5404 }
5405 }
5406 } else {
5407 if (cpri < lowest_priority) {
5408 lowest_priority = cpri;
5409 lp_processor = processor;
5410 }
5411 }
5412
5413 integer_t ccount = SCHED(processor_runq_count)(processor);
5414 if (ccount < lowest_count) {
5415 lowest_count = ccount;
5416 lc_processor = processor;
5417 }
5418 }
5419
		/*
		 * For SMT configs, these idle secondary processors must have an active
		 * primary. Otherwise the idle primary would have short-circuited the
		 * loop above.
		 */
5424 uint64_t idle_secondary_map = (pset->cpu_state_map[PROCESSOR_IDLE] &
5425 ~pset->primary_map &
5426 pset->recommended_bitmask);
5427
5428 /* there shouldn't be a pending AST if the processor is idle */
5429 assert((idle_secondary_map & pset->pending_AST_URGENT_cpu_mask) == 0);
5430 assert((idle_secondary_map & pset->pending_AST_PREEMPT_cpu_mask) == 0);
5431
		for (cpuid = lsb_first(idle_secondary_map); cpuid >= 0; cpuid = lsb_next(idle_secondary_map, cpuid)) {
5433 processor = processor_array[cpuid];
5434
5435 processor_t cprimary = processor->processor_primary;
5436
5437 integer_t primary_pri = cprimary->current_pri;
5438
5439 /*
5440 * TODO: This should also make the same decisions
5441 * as secondary_can_run_realtime_thread
5442 *
5443 * TODO: Keep track of the pending preemption priority
5444 * of the primary to make this more accurate.
5445 */
5446
5447 /* If the primary is running a no-smt thread, then don't choose its secondary */
			if (cprimary->state == PROCESSOR_RUNNING &&
			    processor_active_thread_no_smt(cprimary)) {
5450 continue;
5451 }
5452
5453 /*
5454 * Find the idle secondary processor with the lowest priority primary
5455 *
5456 * We will choose this processor as a fallback if we find no better
5457 * primary to preempt.
5458 */
5459 if (primary_pri < lowest_idle_secondary_priority) {
5460 lp_idle_secondary_processor = processor;
5461 lowest_idle_secondary_priority = primary_pri;
5462 }
5463
			/* Find the lowest-priority active primary with an idle secondary */
5465 if (primary_pri < lowest_unpaired_primary_priority) {
5466 /* If the primary processor is offline or starting up, it's not a candidate for this path */
5467 if (cprimary->state != PROCESSOR_RUNNING &&
5468 cprimary->state != PROCESSOR_DISPATCHING) {
5469 continue;
5470 }
5471
5472 if (!cprimary->is_recommended) {
5473 continue;
5474 }
5475
5476 /* if the primary is pending preemption, don't try to re-preempt it */
5477 if (bit_test(pset->pending_AST_URGENT_cpu_mask, cprimary->cpu_id)) {
5478 continue;
5479 }
5480
5481 if (SCHED(priority_is_urgent)(thread->sched_pri) == FALSE &&
5482 bit_test(pset->pending_AST_PREEMPT_cpu_mask, cprimary->cpu_id)) {
5483 continue;
5484 }
5485
5486 lowest_unpaired_primary_priority = primary_pri;
5487 lp_unpaired_primary_processor = cprimary;
5488 }
5489 }
5490
5491 /*
5492 * We prefer preempting a primary processor over waking up its secondary.
5493 * The secondary will then be woken up by the preempted thread.
5494 */
5495 if (thread->sched_pri > lowest_unpaired_primary_priority) {
5496 pset->last_chosen = lp_unpaired_primary_processor->cpu_id;
5497 return lp_unpaired_primary_processor;
5498 }
5499
5500 /*
5501 * We prefer preempting a lower priority active processor over directly
5502 * waking up an idle secondary.
5503 * The preempted thread will then find the idle secondary.
5504 */
5505 if (thread->sched_pri > lowest_priority) {
5506 pset->last_chosen = lp_processor->cpu_id;
5507 return lp_processor;
5508 }
5509
5510 /*
5511 * lc_processor is used to indicate the best processor set run queue
5512 * on which to enqueue a thread when all available CPUs are busy with
5513 * higher priority threads, so try to make sure it is initialized.
5514 */
5515 if (lc_processor == PROCESSOR_NULL) {
5516 cpumap_t available_map = pset_available_cpumap(pset);
			cpuid = lsb_first(available_map);
5518 if (cpuid >= 0) {
5519 lc_processor = processor_array[cpuid];
5520 lowest_count = SCHED(processor_runq_count)(lc_processor);
5521 }
5522 }
5523
5524 /*
5525 * Move onto the next processor set.
5526 *
5527 * If all primary processors in this pset are running a higher
5528 * priority thread, move on to next pset. Only when we have
5529 * exhausted the search for primary processors do we
5530 * fall back to secondaries.
5531 */
5532#if CONFIG_SCHED_EDGE
		/*
		 * The edge scheduler expects a CPU to be selected from the pset it passed in
		 * as the starting pset for non-RT workloads. The edge migration algorithm
		 * should already have considered idle CPUs and loads when deciding the
		 * starting_pset, which means that this loop can be short-circuited.
		 */
5539 nset = starting_pset;
5540#else /* CONFIG_SCHED_EDGE */
5541 nset = next_pset(pset);
5542#endif /* CONFIG_SCHED_EDGE */
5543
5544 if (nset != starting_pset) {
			pset = change_locked_pset(pset, nset);
5546 }
5547 } while (nset != starting_pset);
5548
5549 /*
5550 * Make sure that we pick a running processor,
5551 * and that the correct processor set is locked.
5552 * Since we may have unlocked the candidate processor's
5553 * pset, it may have changed state.
5554 *
5555 * All primary processors are running a higher priority
	 * thread, so the only options left are enqueuing on
	 * the secondary processor that would perturb the lowest-priority
	 * primary, or on the least busy primary.
5559 */
5560
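	/*
	 * Fallback preference order: an idle secondary whose primary is running
	 * the lowest-priority work, then the secondary of the lowest-priority
	 * busy SMT pair, then the least-loaded processor found during the scan.
	 */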
5561 /* lowest_priority is evaluated in the main loops above */
5562 if (lp_idle_secondary_processor != PROCESSOR_NULL) {
5563 processor = lp_idle_secondary_processor;
5564 } else if (lp_paired_secondary_processor != PROCESSOR_NULL) {
5565 processor = lp_paired_secondary_processor;
5566 } else if (lc_processor != PROCESSOR_NULL) {
5567 processor = lc_processor;
5568 } else {
5569 processor = PROCESSOR_NULL;
5570 }
5571
5572 if (processor) {
		pset = change_locked_pset(pset, processor->processor_set);
5574 /* Check that chosen processor is still usable */
5575 cpumap_t available_map = pset_available_cpumap(pset);
5576 if (bit_test(available_map, processor->cpu_id)) {
5577 pset->last_chosen = processor->cpu_id;
5578 return processor;
5579 }
5580
5581 /* processor is no longer usable */
5582 processor = PROCESSOR_NULL;
5583 }
5584
5585 pset_assert_locked(pset);
5586 pset_unlock(pset);
5587 return PROCESSOR_NULL;
5588}
5589
5590/*
5591 * Default implementation of SCHED(choose_node)()
5592 * for single node systems
5593 */
5594pset_node_t
5595sched_choose_node(__unused thread_t thread)
5596{
5597 return &pset_node0;
5598}
5599
5600/*
5601 * choose_starting_pset:
5602 *
5603 * Choose a starting processor set for the thread.
5604 * May return a processor hint within the pset.
5605 *
5606 * Returns a starting processor set, to be used by
5607 * choose_processor.
5608 *
5609 * The thread must be locked. The resulting pset is unlocked on return,
5610 * and is chosen without taking any pset locks.
5611 */
5612processor_set_t
5613choose_starting_pset(pset_node_t node, thread_t thread, processor_t *processor_hint)
5614{
5615 processor_set_t pset;
5616 processor_t processor = PROCESSOR_NULL;
5617
5618 if (thread->affinity_set != AFFINITY_SET_NULL) {
5619 /*
5620 * Use affinity set policy hint.
5621 */
5622 pset = thread->affinity_set->aset_pset;
5623 } else if (thread->last_processor != PROCESSOR_NULL) {
5624 /*
5625 * Simple (last processor) affinity case.
5626 */
5627 processor = thread->last_processor;
5628 pset = processor->processor_set;
5629 } else {
5630 /*
5631 * No Affinity case:
5632 *
		 * Utilize a per-task hint to spread threads
5634 * among the available processor sets.
5635 * NRG this seems like the wrong thing to do.
5636 * See also task->pset_hint = pset in thread_setrun()
5637 */
5638 pset = get_threadtask(thread)->pset_hint;
5639 if (pset == PROCESSOR_SET_NULL) {
5640 pset = current_processor()->processor_set;
5641 }
5642
5643 pset = choose_next_pset(pset);
5644 }
5645
	if (!bit_test(node->pset_map, pset->pset_id)) {
		/* pset is not from this node so choose one that is */
		int id = lsb_first(node->pset_map);
		if (id < 0) {
			/* startup race, so check again under the node lock */
			lck_spin_lock(&pset_node_lock);
			if (bit_test(node->pset_map, pset->pset_id)) {
				id = pset->pset_id;
			} else {
				id = lsb_first(node->pset_map);
			}
			lck_spin_unlock(&pset_node_lock);
		}
		assert(id >= 0);
		pset = pset_array[id];
	}
5662
5663 if (bit_count(x: node->pset_map) == 1) {
5664 /* Only a single pset in this node */
5665 goto out;
5666 }
5667
5668 bool avoid_cpu0 = false;
5669
5670#if defined(__x86_64__)
5671 if ((thread->sched_pri >= BASEPRI_RTQUEUES) && sched_avoid_cpu0) {
5672 /* Avoid the pset containing cpu0 */
5673 avoid_cpu0 = true;
5674 /* Assert that cpu0 is in pset0. I expect this to be true on __x86_64__ */
5675 assert(bit_test(pset_array[0]->cpu_bitmask, 0));
5676 }
5677#endif
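	/*
	 * For realtime threads, prefer a pset that still has a primary CPU not
	 * running realtime work (pset_non_rt_primary_map), falling back to any
	 * non-RT CPU when realtime on SMT secondaries is allowed. When avoid_cpu0
	 * is set, the candidate map is rotated right by one so that pset 0 is
	 * considered last, and the chosen index is rotated back with
	 * ((rotid + 1) & 63).
	 */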
5678
5679 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5680 pset_map_t rt_target_map = atomic_load(&node->pset_non_rt_primary_map);
5681 if ((avoid_cpu0 && pset->pset_id == 0) || !bit_test(rt_target_map, pset->pset_id)) {
5682 if (avoid_cpu0) {
				rt_target_map = bit_ror64(rt_target_map, 1);
			}
			int rotid = lsb_first(rt_target_map);
			if (rotid >= 0) {
				int id = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
5688 pset = pset_array[id];
5689 goto out;
5690 }
5691 }
5692 if (!pset->is_SMT || !sched_allow_rt_smt) {
5693 /* All psets are full of RT threads - fall back to choose processor to find the furthest deadline RT thread */
5694 goto out;
5695 }
5696 rt_target_map = atomic_load(&node->pset_non_rt_map);
5697 if ((avoid_cpu0 && pset->pset_id == 0) || !bit_test(rt_target_map, pset->pset_id)) {
5698 if (avoid_cpu0) {
				rt_target_map = bit_ror64(rt_target_map, 1);
			}
			int rotid = lsb_first(rt_target_map);
			if (rotid >= 0) {
				int id = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
5704 pset = pset_array[id];
5705 goto out;
5706 }
5707 }
5708 /* All psets are full of RT threads - fall back to choose processor to find the furthest deadline RT thread */
5709 } else {
5710 pset_map_t idle_map = atomic_load(&node->pset_idle_map);
5711 if (!bit_test(idle_map, pset->pset_id)) {
			int next_idle_pset_id = lsb_first(idle_map);
5713 if (next_idle_pset_id >= 0) {
5714 pset = pset_array[next_idle_pset_id];
5715 }
5716 }
5717 }
5718
5719out:
5720 if ((processor != PROCESSOR_NULL) && (processor->processor_set != pset)) {
5721 processor = PROCESSOR_NULL;
5722 }
5723 if (processor != PROCESSOR_NULL) {
5724 *processor_hint = processor;
5725 }
5726
5727 assert(pset != NULL);
5728 return pset;
5729}
5730
5731/*
5732 * thread_setrun:
5733 *
5734 * Dispatch thread for execution, onto an idle
5735 * processor or run queue, and signal a preemption
5736 * as appropriate.
5737 *
5738 * Thread must be locked.
5739 */
5740void
5741thread_setrun(
5742 thread_t thread,
5743 sched_options_t options)
5744{
5745 processor_t processor = PROCESSOR_NULL;
5746 processor_set_t pset;
5747
5748 assert((thread->state & (TH_RUN | TH_WAIT | TH_UNINT | TH_TERMINATE | TH_TERMINATE2)) == TH_RUN);
5749 thread_assert_runq_null(thread);
5750
5751#if CONFIG_PREADOPT_TG
5752 /* We know that the thread is not in the runq by virtue of being in this
5753 * function and the thread is not self since we are running. We can safely
5754 * resolve the thread group hierarchy and modify the thread's thread group
5755 * here. */
	thread_resolve_and_enforce_thread_group_hierarchy_if_needed(thread);
5757#endif
5758
5759 /*
5760 * Update priority if needed.
5761 */
5762 if (SCHED(can_update_priority)(thread)) {
5763 SCHED(update_priority)(thread);
5764 }
5765 thread->sfi_class = sfi_thread_classify(thread);
5766
5767 if (thread->bound_processor == PROCESSOR_NULL) {
5768 /*
5769 * Unbound case.
5770 *
5771 * Usually, this loop will only be executed once,
5772 * but if CLPC derecommends a processor after it has been chosen,
5773 * or if a processor is shut down after it is chosen,
5774 * choose_processor() may return NULL, so a retry
5775 * may be necessary. A single retry will usually
5776 * be enough, and we can't afford to retry too many times
5777 * because interrupts are disabled.
5778 */
5779#define CHOOSE_PROCESSOR_MAX_RETRIES 3
5780 for (int retry = 0; retry <= CHOOSE_PROCESSOR_MAX_RETRIES; retry++) {
5781 processor_t processor_hint = PROCESSOR_NULL;
5782 pset_node_t node = SCHED(choose_node)(thread);
			processor_set_t starting_pset = choose_starting_pset(node, thread, &processor_hint);
5784
5785 pset_lock(starting_pset);
5786
5787 processor = SCHED(choose_processor)(starting_pset, processor_hint, thread);
5788 if (processor != PROCESSOR_NULL) {
5789 pset = processor->processor_set;
5790 pset_assert_locked(pset);
5791 break;
5792 }
5793 }
5794 /*
5795 * If choose_processor() still returns NULL,
5796 * which is very unlikely,
5797 * choose the master_processor, which is always
5798 * safe to choose.
5799 */
5800 if (processor == PROCESSOR_NULL) {
5801 /* Choose fallback processor */
5802 processor = master_processor;
5803 pset = processor->processor_set;
5804 pset_lock(pset);
5805 assert((pset_available_cpu_count(pset) > 0) || (processor->state != PROCESSOR_OFF_LINE && processor->is_recommended));
5806 }
5807 task_t task = get_threadtask(thread);
5808 if (!(task->t_flags & TF_USE_PSET_HINT_CLUSTER_TYPE)) {
5809 task->pset_hint = pset; /* NRG this is done without holding the task lock */
5810 }
5811 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
5812 (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
5813 assert((pset_available_cpu_count(pset) > 0) || (processor->state != PROCESSOR_OFF_LINE && processor->is_recommended));
5814 } else {
5815 /*
5816 * Bound case:
5817 *
5818 * Unconditionally dispatch on the processor.
5819 */
5820 processor = thread->bound_processor;
5821 pset = processor->processor_set;
5822 pset_lock(pset);
5823
5824 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
5825 (uintptr_t)thread_tid(thread), (uintptr_t)-2, processor->cpu_id, processor->state, 0);
5826 }
5827
5828 /*
5829 * Dispatch the thread on the chosen processor.
5830 * TODO: This should be based on sched_mode, not sched_pri
5831 */
5832 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		realtime_setrun(processor, thread);
5834 } else {
5835 processor_setrun(processor, thread, options);
5836 }
5837 /* pset is now unlocked */
5838 if (thread->bound_processor == PROCESSOR_NULL) {
5839 SCHED(check_spill)(pset, thread);
5840 }
5841}
5842
5843processor_set_t
5844task_choose_pset(
5845 task_t task)
5846{
5847 processor_set_t pset = task->pset_hint;
5848
5849 if (pset != PROCESSOR_SET_NULL) {
5850 pset = choose_next_pset(pset);
5851 }
5852
5853 return pset;
5854}
5855
5856/*
5857 * Check for a preemption point in
5858 * the current context.
5859 *
5860 * Called at splsched with thread locked.
5861 */
5862ast_t
5863csw_check(
5864 thread_t thread,
5865 processor_t processor,
5866 ast_t check_reason)
5867{
5868 processor_set_t pset = processor->processor_set;
5869
5870 assert(thread == processor->active_thread);
5871
5872 pset_lock(pset);
5873
5874 processor_state_update_from_thread(processor, thread, true);
5875
5876 ast_t preempt = csw_check_locked(thread, processor, pset, check_reason);
5877
5878 /* Acknowledge the IPI if we decided not to preempt */
5879
5880 if ((preempt & AST_URGENT) == 0) {
5881 if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
5882 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 8);
5883 }
5884 }
5885
5886 if ((preempt & AST_PREEMPT) == 0) {
5887 bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
5888 }
5889
5890 pset_unlock(pset);
5891
	return update_pending_nonurgent_preemption(processor, preempt);
5893}
5894
5895void
5896clear_pending_nonurgent_preemption(processor_t processor)
5897{
5898 if (!processor->pending_nonurgent_preemption) {
5899 return;
5900 }
5901
5902 KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREEMPT_TIMER_ACTIVE) | DBG_FUNC_END);
5903
5904 processor->pending_nonurgent_preemption = false;
	running_timer_clear(processor, RUNNING_TIMER_PREEMPT);
5906}
5907
5908ast_t
5909update_pending_nonurgent_preemption(processor_t processor, ast_t reason)
5910{
5911 if ((reason & (AST_URGENT | AST_PREEMPT)) != (AST_PREEMPT)) {
5912 clear_pending_nonurgent_preemption(processor);
5913 return reason;
5914 }
5915
5916 if (nonurgent_preemption_timer_abs == 0) {
5917 /* Preemption timer not enabled */
5918 return reason;
5919 }
5920
5921 if (current_thread()->state & TH_IDLE) {
5922 /* idle threads don't need nonurgent preemption */
5923 return reason;
5924 }
5925
5926 if (processor->pending_nonurgent_preemption) {
5927 /* Timer is already armed, no need to do it again */
5928 return reason;
5929 }
5930
5931 if (ml_did_interrupt_userspace()) {
5932 /*
5933 * We're preempting userspace here, so we don't need
5934 * to defer the preemption. Force AST_URGENT
5935 * so that we can avoid arming this timer without risking
5936 * ast_taken_user deciding to spend too long in kernel
5937 * space to handle other ASTs.
5938 */
5939
5940 return reason | AST_URGENT;
5941 }
5942
5943 /*
5944 * We've decided to do a nonurgent preemption when running in
5945 * kernelspace. We defer the preemption until reaching userspace boundary
5946 * to give a grace period for locks etc to be dropped and to reach
5947 * a clean preemption point, so that the preempting thread doesn't
5948 * always immediately hit the lock that the waking thread still holds.
5949 *
5950 * Arm a timer to enforce that the preemption executes within a bounded
5951 * time if the thread doesn't block or return to userspace quickly.
5952 */
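	/*
	 * If the running timer fires before the thread blocks or reaches the
	 * userspace boundary, thread_preempt_expire() below clears the pending
	 * state and re-runs csw_check() with AST_URGENT so the preemption is no
	 * longer deferred.
	 */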
5953
5954 processor->pending_nonurgent_preemption = true;
5955 KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREEMPT_TIMER_ACTIVE) | DBG_FUNC_START,
5956 reason);
5957
5958 uint64_t now = mach_absolute_time();
5959
5960 uint64_t deadline = now + nonurgent_preemption_timer_abs;
5961
	running_timer_enter(processor, RUNNING_TIMER_PREEMPT, NULL,
	    now, deadline);
5964
5965 return reason;
5966}
5967
5968/*
5969 * Check for preemption at splsched with
5970 * pset and thread locked
5971 */
5972ast_t
5973csw_check_locked(
5974 thread_t thread,
5975 processor_t processor,
5976 processor_set_t pset,
5977 ast_t check_reason)
5978{
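	/*
	 * The checks below run roughly in decreasing order of urgency: a
	 * derecommended CPU or a pending realtime spill forces an urgent
	 * preemption, then the realtime run queue is consulted, then the
	 * scheduler-specific run-queue check, the avoid-processor policy,
	 * SMT secondary rebalancing, thread suspension, and finally SFI.
	 */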
5979 /*
5980 * If the current thread is running on a processor that is no longer recommended,
5981 * urgently preempt it, at which point thread_select() should
5982 * try to idle the processor and re-dispatch the thread to a recommended processor.
5983 */
5984 if (!processor->is_recommended) {
5985 return check_reason | AST_PREEMPT | AST_URGENT;
5986 }
5987
5988 if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
5989 return check_reason | AST_PREEMPT | AST_URGENT;
5990 }
5991
5992 if (rt_runq_count(pset) > 0) {
5993 if ((rt_runq_priority(pset) > processor->current_pri) || !processor->first_timeslice) {
5994 return check_reason | AST_PREEMPT | AST_URGENT;
		} else if (deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < processor->deadline) {
5996 return check_reason | AST_PREEMPT | AST_URGENT;
5997 } else {
5998 return check_reason | AST_PREEMPT;
5999 }
6000 }
6001
6002 ast_t result = SCHED(processor_csw_check)(processor);
6003 if (result != AST_NONE) {
6004 return check_reason | result | (thread_is_eager_preempt(thread) ? AST_URGENT : AST_NONE);
6005 }
6006
6007 /*
6008 * Same for avoid-processor
6009 *
6010 * TODO: Should these set AST_REBALANCE?
6011 */
6012 if (SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread, check_reason)) {
6013 return check_reason | AST_PREEMPT;
6014 }
6015
6016 /*
6017 * Even though we could continue executing on this processor, a
6018 * secondary SMT core should try to shed load to another primary core.
6019 *
6020 * TODO: Should this do the same check that thread_select does? i.e.
6021 * if no bound threads target this processor, and idle primaries exist, preempt
6022 * The case of RT threads existing is already taken care of above
6023 */
6024
6025 if (processor->current_pri < BASEPRI_RTQUEUES &&
6026 processor->processor_primary != processor) {
6027 return check_reason | AST_PREEMPT;
6028 }
6029
6030 if (thread->state & TH_SUSP) {
6031 return check_reason | AST_PREEMPT;
6032 }
6033
6034#if CONFIG_SCHED_SFI
6035 /*
6036 * Current thread may not need to be preempted, but maybe needs
6037 * an SFI wait?
6038 */
6039 result = sfi_thread_needs_ast(thread, NULL);
6040 if (result != AST_NONE) {
6041 return result;
6042 }
6043#endif
6044
6045 return AST_NONE;
6046}
6047
6048/*
6049 * Handle preemption IPI or IPI in response to setting an AST flag
6050 * Triggered by cause_ast_check
6051 * Called at splsched
6052 */
6053void
6054ast_check(processor_t processor)
6055{
6056 smr_ack_ipi();
6057
6058 if (processor->state != PROCESSOR_RUNNING &&
6059 processor->state != PROCESSOR_SHUTDOWN) {
6060 return;
6061 }
6062
6063 SCHED_DEBUG_AST_CHECK_KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED,
6064 MACH_SCHED_AST_CHECK) | DBG_FUNC_START);
6065
6066 thread_t thread = processor->active_thread;
6067
6068 assert(thread == current_thread());
6069
6070 /*
6071 * Pairs with task_restartable_ranges_synchronize
6072 */
6073 thread_lock(thread);
6074
6075 thread_reset_pcs_ack_IPI(thread);
6076
6077 /*
6078 * Propagate thread ast to processor.
6079 * (handles IPI in response to setting AST flag)
6080 */
6081 ast_propagate(thread);
6082
6083 /*
6084 * Stash the old urgency and perfctl values to find out if
6085 * csw_check updates them.
6086 */
6087 thread_urgency_t old_urgency = processor->current_urgency;
6088 perfcontrol_class_t old_perfctl_class = processor->current_perfctl_class;
6089
6090 ast_t preempt;
6091
6092 if ((preempt = csw_check(thread, processor, AST_NONE)) != AST_NONE) {
		ast_on(preempt);
6094 }
6095
6096 if (old_urgency != processor->current_urgency) {
6097 /*
6098 * Urgency updates happen with the thread lock held (ugh).
6099 * TODO: This doesn't notice QoS changes...
6100 */
6101 uint64_t urgency_param1, urgency_param2;
6102
		thread_urgency_t urgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
		thread_tell_urgency(urgency, urgency_param1, urgency_param2, 0, thread);
6105 }
6106
6107 thread_unlock(thread);
6108
6109 if (old_perfctl_class != processor->current_perfctl_class) {
6110 /*
6111 * We updated the perfctl class of this thread from another core.
6112 * Let CLPC know that the currently running thread has a new
6113 * class.
6114 */
6115
		machine_switch_perfcontrol_state_update(PERFCONTROL_ATTR_UPDATE,
		    mach_approximate_time(), 0, thread);
6118 }
6119
6120 SCHED_DEBUG_AST_CHECK_KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED,
6121 MACH_SCHED_AST_CHECK) | DBG_FUNC_END, preempt);
6122}
6123
6124
6125void
6126thread_preempt_expire(
6127 timer_call_param_t p0,
6128 __unused timer_call_param_t p1)
6129{
6130 processor_t processor = p0;
6131
6132 assert(processor == current_processor());
6133 assert(p1 == NULL);
6134
6135 thread_t thread = current_thread();
6136
6137 /*
6138 * This is set and cleared by the current core, so we will
6139 * never see a race with running timer expiration
6140 */
6141 assert(processor->pending_nonurgent_preemption);
6142
6143 clear_pending_nonurgent_preemption(processor);
6144
6145 thread_lock(thread);
6146
6147 /*
6148 * Check again to see if it's still worth a
6149 * context switch, but this time force enable kernel preemption
6150 */
6151
6152 ast_t preempt = csw_check(thread, processor, AST_URGENT);
6153
6154 if (preempt) {
		ast_on(preempt);
6156 }
6157
6158 thread_unlock(thread);
6159
6160 KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREEMPT_TIMER_ACTIVE), preempt);
6161}
6162
6163
6164/*
6165 * set_sched_pri:
6166 *
6167 * Set the scheduled priority of the specified thread.
6168 *
6169 * This may cause the thread to change queues.
6170 *
6171 * Thread must be locked.
6172 */
6173void
6174set_sched_pri(
6175 thread_t thread,
6176 int16_t new_priority,
6177 set_sched_pri_options_t options)
6178{
6179 bool is_current_thread = (thread == current_thread());
6180 bool removed_from_runq = false;
6181 bool lazy_update = ((options & SETPRI_LAZY) == SETPRI_LAZY);
6182
6183 int16_t old_priority = thread->sched_pri;
6184
6185 /* If we're already at this priority, no need to mess with the runqueue */
6186 if (new_priority == old_priority) {
6187#if CONFIG_SCHED_CLUTCH
6188 /* For the first thread in the system, the priority is correct but
6189 * th_sched_bucket is still TH_BUCKET_RUN. Since the clutch
6190 * scheduler relies on the bucket being set for all threads, update
6191 * its bucket here.
6192 */
6193 if (thread->th_sched_bucket == TH_BUCKET_RUN) {
6194 assert(thread == vm_pageout_scan_thread);
6195 SCHED(update_thread_bucket)(thread);
6196 }
6197#endif /* CONFIG_SCHED_CLUTCH */
6198
6199 return;
6200 }
6201
6202 if (is_current_thread) {
6203 assert(thread->state & TH_RUN);
6204 thread_assert_runq_null(thread);
6205 } else {
6206 removed_from_runq = thread_run_queue_remove(thread);
6207 }
6208
6209 thread->sched_pri = new_priority;
6210
6211#if CONFIG_SCHED_CLUTCH
6212 /*
6213 * Since for the clutch scheduler, the thread's bucket determines its runq
6214 * in the hierarchy it is important to update the bucket when the thread
6215 * lock is held and the thread has been removed from the runq hierarchy.
6216 */
6217 SCHED(update_thread_bucket)(thread);
6218
6219#endif /* CONFIG_SCHED_CLUTCH */
6220
6221 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
6222 (uintptr_t)thread_tid(thread),
6223 thread->base_pri,
6224 thread->sched_pri,
6225 thread->sched_usage,
6226 0);
6227
6228 if (removed_from_runq) {
		thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
6230 } else if (is_current_thread) {
6231 processor_t processor = thread->last_processor;
6232 assert(processor == current_processor());
6233
6234 thread_urgency_t old_urgency = processor->current_urgency;
6235
6236 /*
6237 * When dropping in priority, check if the thread no longer belongs on core.
6238 * If a thread raises its own priority, don't aggressively rebalance it.
6239 * <rdar://problem/31699165>
6240 *
6241 * csw_check does a processor_state_update_from_thread, but
6242 * we should do our own if we're being lazy.
6243 */
6244 if (!lazy_update && new_priority < old_priority) {
6245 ast_t preempt;
6246
6247 if ((preempt = csw_check(thread, processor, AST_NONE)) != AST_NONE) {
				ast_on(preempt);
6249 }
6250 } else {
6251 processor_state_update_from_thread(processor, thread, false);
6252 }
6253
6254 /*
6255 * set_sched_pri doesn't alter RT params. We expect direct base priority/QoS
6256 * class alterations from user space to occur relatively infrequently, hence
6257 * those are lazily handled. QoS classes have distinct priority bands, and QoS
6258 * inheritance is expected to involve priority changes.
6259 */
6260 if (processor->current_urgency != old_urgency) {
6261 uint64_t urgency_param1, urgency_param2;
6262
			thread_urgency_t new_urgency = thread_get_urgency(thread,
			    &urgency_param1, &urgency_param2);

			thread_tell_urgency(new_urgency, urgency_param1,
			    urgency_param2, 0, thread);
6268 }
6269
6270 /* TODO: only call this if current_perfctl_class changed */
6271 uint64_t ctime = mach_approximate_time();
		machine_thread_going_on_core(thread, processor->current_urgency, 0, 0, ctime);
6273 } else if (thread->state & TH_RUN) {
6274 processor_t processor = thread->last_processor;
6275
6276 if (!lazy_update &&
6277 processor != PROCESSOR_NULL &&
6278 processor != current_processor() &&
6279 processor->active_thread == thread) {
6280 cause_ast_check(processor);
6281 }
6282 }
6283}
6284
6285/*
6286 * thread_run_queue_remove_for_handoff
6287 *
6288 * Pull a thread or its (recursive) push target out of the runqueue
6289 * so that it is ready for thread_run()
6290 *
6291 * Called at splsched
6292 *
6293 * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
6294 * This may be different than the thread that was passed in.
6295 */
6296thread_t
6297thread_run_queue_remove_for_handoff(thread_t thread)
6298{
6299 thread_t pulled_thread = THREAD_NULL;
6300
6301 thread_lock(thread);
6302
6303 /*
6304 * Check that the thread is not bound to a different processor,
6305 * NO_SMT flag is not set on the thread, cluster type of
6306 * processor matches with thread if the thread is pinned to a
6307 * particular cluster and that realtime is not involved.
6308 *
6309 * Next, pull it off its run queue. If it doesn't come, it's not eligible.
6310 */
6311 processor_t processor = current_processor();
6312 if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)
6313 && (!thread_no_smt(thread))
6314 && (processor->current_pri < BASEPRI_RTQUEUES)
6315 && (thread->sched_pri < BASEPRI_RTQUEUES)
6316#if __AMP__
6317 && ((thread->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) ||
6318 processor->processor_set->pset_id == thread->th_bound_cluster_id)
6319#endif /* __AMP__ */
6320 ) {
6321 if (thread_run_queue_remove(thread)) {
6322 pulled_thread = thread;
6323 }
6324 }
6325
6326 thread_unlock(thread);
6327
6328 return pulled_thread;
6329}
6330
6331/*
6332 * thread_prepare_for_handoff
6333 *
6334 * Make the thread ready for handoff.
6335 * If the thread was runnable then pull it off the runq, if the thread could
6336 * not be pulled, return NULL.
6337 *
6338 * If the thread was woken up from wait for handoff, make sure it is not bound to
6339 * different processor.
6340 *
6341 * Called at splsched
6342 *
6343 * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
6344 * This may be different than the thread that was passed in.
6345 */
6346thread_t
6347thread_prepare_for_handoff(thread_t thread, thread_handoff_option_t option)
6348{
6349 thread_t pulled_thread = THREAD_NULL;
6350
6351 if (option & THREAD_HANDOFF_SETRUN_NEEDED) {
6352 processor_t processor = current_processor();
6353 thread_lock(thread);
6354
6355 /*
6356 * Check that the thread is not bound to a different processor,
6357 * NO_SMT flag is not set on the thread and cluster type of
6358 * processor matches with thread if the thread is pinned to a
6359 * particular cluster. Call setrun instead if above conditions
6360 * are not satisfied.
6361 */
6362 if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)
6363 && (!thread_no_smt(thread))
6364#if __AMP__
6365 && ((thread->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) ||
6366 processor->processor_set->pset_id == thread->th_bound_cluster_id)
6367#endif /* __AMP__ */
6368 ) {
6369 pulled_thread = thread;
6370 } else {
			thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
6372 }
6373 thread_unlock(thread);
6374 } else {
6375 pulled_thread = thread_run_queue_remove_for_handoff(thread);
6376 }
6377
6378 return pulled_thread;
6379}
6380
6381/*
6382 * thread_run_queue_remove:
6383 *
6384 * Remove a thread from its current run queue and
6385 * return TRUE if successful.
6386 *
6387 * Thread must be locked.
6388 *
6389 * If thread->runq is PROCESSOR_NULL, the thread will not re-enter the
6390 * run queues because the caller locked the thread. Otherwise
6391 * the thread is on a run queue, but could be chosen for dispatch
6392 * and removed by another processor under a different lock, which
6393 * will set thread->runq to PROCESSOR_NULL.
6394 *
6395 * Hence the thread select path must not rely on anything that could
6396 * be changed under the thread lock after calling this function,
6397 * most importantly thread->sched_pri.
6398 */
6399boolean_t
6400thread_run_queue_remove(
6401 thread_t thread)
6402{
6403 boolean_t removed = FALSE;
6404
6405 if ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT) {
6406 /* Thread isn't runnable */
6407 thread_assert_runq_null(thread);
6408 return FALSE;
6409 }
6410
6411 processor_t processor = thread_get_runq(thread);
6412 if (processor == PROCESSOR_NULL) {
6413 /*
6414 * The thread is either not on the runq,
6415 * or is in the midst of being removed from the runq.
6416 *
6417 * runq is set to NULL under the pset lock, not the thread
6418 * lock, so the thread may still be in the process of being dequeued
6419 * from the runq. It will wait in invoke for the thread lock to be
6420 * dropped.
6421 */
6422
6423 return FALSE;
6424 }
6425
6426 if (thread->sched_pri < BASEPRI_RTQUEUES) {
6427 return SCHED(processor_queue_remove)(processor, thread);
6428 }
6429
6430 processor_set_t pset = processor->processor_set;
6431
6432 pset_lock(pset);
6433
6434 /*
6435 * Must re-read the thread runq after acquiring the pset lock, in
6436 * case another core swooped in before us to dequeue the thread.
6437 */
6438 if (thread_get_runq_locked(thread) != PROCESSOR_NULL) {
6439 /*
6440 * Thread is on the RT run queue and we have a lock on
6441 * that run queue.
6442 */
6443 rt_runq_remove(SCHED(rt_runq)(pset), thread);
6444 pset_update_rt_stealable_state(pset);
6445
6446 removed = TRUE;
6447 }
6448
6449 pset_unlock(pset);
6450
6451 return removed;
6452}
6453
6454/*
6455 * Put the thread back where it goes after a thread_run_queue_remove
6456 *
6457 * Thread must have been removed under the same thread lock hold
6458 *
6459 * thread locked, at splsched
6460 */
6461void
6462thread_run_queue_reinsert(thread_t thread, sched_options_t options)
6463{
6464 thread_assert_runq_null(thread);
6465 assert(thread->state & (TH_RUN));
6466
6467 thread_setrun(thread, options);
6468}
6469
6470void
6471sys_override_cpu_throttle(boolean_t enable_override)
6472{
6473 if (enable_override) {
6474 cpu_throttle_enabled = 0;
6475 } else {
6476 cpu_throttle_enabled = 1;
6477 }
6478}
6479
6480thread_urgency_t
6481thread_get_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2)
6482{
6483 uint64_t urgency_param1 = 0, urgency_param2 = 0;
6484 task_t task = get_threadtask_early(thread);
6485
6486 thread_urgency_t urgency;
6487
6488 if (thread == NULL || task == TASK_NULL || (thread->state & TH_IDLE)) {
6489 urgency_param1 = 0;
6490 urgency_param2 = 0;
6491
6492 urgency = THREAD_URGENCY_NONE;
6493 } else if (thread->sched_mode == TH_MODE_REALTIME) {
6494 urgency_param1 = thread->realtime.period;
6495 urgency_param2 = thread->realtime.deadline;
6496
6497 urgency = THREAD_URGENCY_REAL_TIME;
6498 } else if (cpu_throttle_enabled &&
6499 (thread->sched_pri <= MAXPRI_THROTTLE) &&
6500 (thread->base_pri <= MAXPRI_THROTTLE)) {
6501 /*
6502 * Threads that are running at low priority but are not
6503 * tagged with a specific QoS are separated out from
6504 * the "background" urgency. Performance management
6505 * subsystem can decide to either treat these threads
6506 * as normal threads or look at other signals like thermal
6507 * levels for optimal power/perf tradeoffs for a platform.
6508 */
6509 boolean_t thread_lacks_qos = (proc_get_effective_thread_policy(thread, TASK_POLICY_QOS) == THREAD_QOS_UNSPECIFIED); //thread_has_qos_policy(thread);
6510 boolean_t task_is_suppressed = (proc_get_effective_task_policy(task, TASK_POLICY_SUP_ACTIVE) == 0x1);
6511
6512 /*
6513 * Background urgency applied when thread priority is
6514 * MAXPRI_THROTTLE or lower and thread is not promoted
6515 * and thread has a QoS specified
6516 */
6517 urgency_param1 = thread->sched_pri;
6518 urgency_param2 = thread->base_pri;
6519
6520 if (thread_lacks_qos && !task_is_suppressed) {
6521 urgency = THREAD_URGENCY_LOWPRI;
6522 } else {
6523 urgency = THREAD_URGENCY_BACKGROUND;
6524 }
6525 } else {
6526 /* For otherwise unclassified threads, report throughput QoS parameters */
6527 urgency_param1 = proc_get_effective_thread_policy(thread, TASK_POLICY_THROUGH_QOS);
6528 urgency_param2 = proc_get_effective_task_policy(task, TASK_POLICY_THROUGH_QOS);
6529 urgency = THREAD_URGENCY_NORMAL;
6530 }
6531
6532 if (arg1 != NULL) {
6533 *arg1 = urgency_param1;
6534 }
6535 if (arg2 != NULL) {
6536 *arg2 = urgency_param2;
6537 }
6538
6539 return urgency;
6540}
6541
6542perfcontrol_class_t
6543thread_get_perfcontrol_class(thread_t thread)
6544{
6545 /* Special case handling */
6546 if (thread->state & TH_IDLE) {
6547 return PERFCONTROL_CLASS_IDLE;
6548 }
6549
6550 if (thread->sched_mode == TH_MODE_REALTIME) {
6551 return PERFCONTROL_CLASS_REALTIME;
6552 }
6553
6554 /* perfcontrol_class based on base_pri */
6555 if (thread->base_pri <= MAXPRI_THROTTLE) {
6556 return PERFCONTROL_CLASS_BACKGROUND;
6557 } else if (thread->base_pri <= BASEPRI_UTILITY) {
6558 return PERFCONTROL_CLASS_UTILITY;
6559 } else if (thread->base_pri <= BASEPRI_DEFAULT) {
6560 return PERFCONTROL_CLASS_NONUI;
6561 } else if (thread->base_pri <= BASEPRI_USER_INITIATED) {
6562 return PERFCONTROL_CLASS_USER_INITIATED;
6563 } else if (thread->base_pri <= BASEPRI_FOREGROUND) {
6564 return PERFCONTROL_CLASS_UI;
6565 } else {
6566 if (get_threadtask(thread) == kernel_task) {
6567 /*
6568 * Classify Above UI kernel threads as PERFCONTROL_CLASS_KERNEL.
6569 * All other lower priority kernel threads should be treated
6570 * as regular threads for performance control purposes.
6571 */
6572 return PERFCONTROL_CLASS_KERNEL;
6573 }
6574 return PERFCONTROL_CLASS_ABOVEUI;
6575 }
6576}
6577
6578/*
6579 * This is the processor idle loop, which just looks for other threads
6580 * to execute. Processor idle threads invoke this without supplying a
6581 * current thread to idle without an asserted wait state.
6582 *
 * Returns the next thread to execute if dispatched directly.
6584 */
6585
6586#if 0
6587#define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__)
6588#else
6589#define IDLE_KERNEL_DEBUG_CONSTANT(...) do { } while(0)
6590#endif
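/*
 * Flip the "#if 0" above to "#if 1" to turn IDLE_KERNEL_DEBUG_CONSTANT into
 * real KERNEL_DEBUG_CONSTANT tracepoints when debugging the idle loop.
 */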
6591
6592#if (DEVELOPMENT || DEBUG)
6593int sched_idle_delay_cpuid = -1;
6594#endif
6595
6596thread_t
6597processor_idle(
6598 thread_t thread,
6599 processor_t processor)
6600{
6601 processor_set_t pset = processor->processor_set;
6602 struct recount_snap snap = { 0 };
6603
6604 (void)splsched();
6605
6606 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
6607 MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_START,
6608 (uintptr_t)thread_tid(thread), 0, 0, 0, 0);
6609
6610 SCHED_STATS_INC(idle_transitions);
6611 assert(processor->running_timers_active == false);
6612
	recount_snapshot(&snap);
	recount_processor_idle(&processor->pr_recount, &snap);
6615
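	/*
	 * Stay in this loop until the CPU is needed: the processor leaves the
	 * PROCESSOR_IDLE state, an urgent (or deferred) AST is pending, a
	 * realtime spill is pending, realtime work arrives for a recommended
	 * primary, bound work arrives for this processor, or (for primaries)
	 * the local run queue becomes non-empty after waking from idle.
	 */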
6616 while (1) {
6617 /*
6618 * Ensure that updates to my processor and pset state,
6619 * made by the IPI source processor before sending the IPI,
6620 * are visible on this processor now (even though we don't
6621 * take the pset lock yet).
6622 */
6623 atomic_thread_fence(memory_order_acquire);
6624
6625 if (processor->state != PROCESSOR_IDLE) {
6626 break;
6627 }
6628 if (bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
6629 break;
6630 }
6631#if defined(CONFIG_SCHED_DEFERRED_AST)
6632 if (bit_test(pset->pending_deferred_AST_cpu_mask, processor->cpu_id)) {
6633 break;
6634 }
6635#endif
6636 if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
6637 break;
6638 }
6639
6640 if (processor->is_recommended && (processor->processor_primary == processor)) {
6641 if (rt_runq_count(pset)) {
6642 break;
6643 }
6644 } else {
6645 if (SCHED(processor_bound_count)(processor)) {
6646 break;
6647 }
6648 }
6649
6650 IDLE_KERNEL_DEBUG_CONSTANT(
6651 MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -1, 0);
6652
6653 machine_track_platform_idle(TRUE);
6654
6655 machine_idle();
6656 /* returns with interrupts enabled */
6657
6658 machine_track_platform_idle(FALSE);
6659
6660#if (DEVELOPMENT || DEBUG)
6661 if (processor->cpu_id == sched_idle_delay_cpuid) {
6662 delay(500);
6663 }
6664#endif
6665
6666 (void)splsched();
6667
6668 atomic_thread_fence(memory_order_acquire);
6669
6670 IDLE_KERNEL_DEBUG_CONSTANT(
6671 MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -2, 0);
6672
6673 /*
6674 * Check if we should call sched_timeshare_consider_maintenance() here.
6675 * The CPU was woken out of idle due to an interrupt and we should do the
6676 * call only if the processor is still idle. If the processor is non-idle,
6677 * the threads running on the processor would do the call as part of
		 * context switching.
6679 */
6680 if (processor->state == PROCESSOR_IDLE) {
			sched_timeshare_consider_maintenance(mach_absolute_time(), true);
6682 }
6683
6684 if (!SCHED(processor_queue_empty)(processor)) {
6685 /* Secondary SMT processors respond to directed wakeups
6686 * exclusively. Some platforms induce 'spurious' SMT wakeups.
6687 */
6688 if (processor->processor_primary == processor) {
6689 break;
6690 }
6691 }
6692 }
6693
	recount_snapshot(&snap);
	recount_processor_run(&processor->pr_recount, &snap);
	smr_cpu_join(processor, snap.rsn_time_mach);
6697
6698 ast_t reason = AST_NONE;
6699
6700 /* We're handling all scheduling AST's */
6701 ast_off(AST_SCHEDULING);
6702
6703 /*
6704 * thread_select will move the processor from dispatching to running,
6705 * or put it in idle if there's nothing to do.
6706 */
6707 thread_t cur_thread = current_thread();
6708
6709 thread_lock(cur_thread);
	thread_t new_thread = thread_select(cur_thread, processor, &reason);
6711 thread_unlock(cur_thread);
6712
6713 assert(processor->running_timers_active == false);
6714
6715 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
6716 MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_END,
6717 (uintptr_t)thread_tid(thread), processor->state, (uintptr_t)thread_tid(new_thread), reason, 0);
6718
6719 return new_thread;
6720}
6721
6722/*
6723 * Each processor has a dedicated thread which
6724 * executes the idle loop when there is no suitable
6725 * previous context.
6726 *
6727 * This continuation is entered with interrupts disabled.
6728 */
6729void
6730idle_thread(__assert_only void* parameter,
6731 __unused wait_result_t result)
6732{
6733 assert(ml_get_interrupts_enabled() == FALSE);
6734 assert(parameter == NULL);
6735
6736 processor_t processor = current_processor();
6737
	smr_cpu_leave(processor, processor->last_dispatch);
6739
6740 /*
6741 * Ensure that anything running in idle context triggers
6742 * preemption-disabled checks.
6743 */
6744 disable_preemption_without_measurements();
6745
6746 /*
6747 * Enable interrupts temporarily to handle any pending interrupts
6748 * or IPIs before deciding to sleep
6749 */
6750 spllo();
6751
6752 thread_t new_thread = processor_idle(THREAD_NULL, processor);
6753 /* returns with interrupts disabled */
6754
6755 enable_preemption();
6756
6757 if (new_thread != THREAD_NULL) {
		thread_run(processor->idle_thread,
		    idle_thread, NULL, new_thread);
6760 /*NOTREACHED*/
6761 }
6762
	thread_block(idle_thread);
6764 /*NOTREACHED*/
6765}
6766
6767kern_return_t
6768idle_thread_create(
6769 processor_t processor)
6770{
6771 kern_return_t result;
6772 thread_t thread;
6773 spl_t s;
6774 char name[MAXTHREADNAMESIZE];
6775
	result = kernel_thread_create(idle_thread, NULL, MAXPRI_KERNEL, &thread);
6777 if (result != KERN_SUCCESS) {
6778 return result;
6779 }
6780
6781 snprintf(name, sizeof(name), "idle #%d", processor->cpu_id);
	thread_set_thread_name(thread, name);
6783
6784 s = splsched();
6785 thread_lock(thread);
6786 thread->bound_processor = processor;
6787 processor->idle_thread = thread;
6788 thread->sched_pri = thread->base_pri = IDLEPRI;
6789 thread->state = (TH_RUN | TH_IDLE);
6790 thread->options |= TH_OPT_IDLE_THREAD;
6791 thread->last_made_runnable_time = thread->last_basepri_change_time = mach_absolute_time();
6792 thread_unlock(thread);
6793 splx(s);
6794
6795 thread_deallocate(thread);
6796
6797 return KERN_SUCCESS;
6798}
6799
6800static void sched_update_powered_cores_continue(void);
6801
6802/*
6803 * sched_startup:
6804 *
6805 * Kicks off scheduler services.
6806 *
6807 * Called at splsched.
6808 */
6809void
6810sched_startup(void)
6811{
6812 kern_return_t result;
6813 thread_t thread;
6814
6815 simple_lock_init(&sched_vm_group_list_lock, 0);
6816
	result = kernel_thread_start_priority((thread_continue_t)sched_init_thread,
	    NULL, MAXPRI_KERNEL, &thread);
6819 if (result != KERN_SUCCESS) {
6820 panic("sched_startup");
6821 }
6822
6823 thread_deallocate(thread);
6824
6825 assert_thread_magic(thread);
6826
6827 /*
6828 * Yield to the sched_init_thread once, to
6829 * initialize our own thread after being switched
6830 * back to.
6831 *
6832 * The current thread is the only other thread
6833 * active at this point.
6834 */
6835 thread_block(THREAD_CONTINUE_NULL);
6836
	result = kernel_thread_start_priority((thread_continue_t)sched_update_powered_cores_continue,
	    NULL, MAXPRI_KERNEL, &thread);
6839 if (result != KERN_SUCCESS) {
6840 panic("sched_startup");
6841 }
6842
6843 thread_deallocate(thread);
6844
6845 assert_thread_magic(thread);
6846}
6847
6848#if __arm64__
6849static _Atomic uint64_t sched_perfcontrol_callback_deadline;
6850#endif /* __arm64__ */
6851
6852
6853#if defined(CONFIG_SCHED_TIMESHARE_CORE)
6854
6855static volatile uint64_t sched_maintenance_deadline;
6856static uint64_t sched_tick_last_abstime;
6857static uint64_t sched_tick_delta;
6858uint64_t sched_tick_max_delta;
6859
6860
/*
 * sched_timeshare_maintenance_continue:
 *
 * Perform periodic bookkeeping functions about ten
 * times per second.
 */
6867void
6868sched_timeshare_maintenance_continue(void)
6869{
6870 uint64_t sched_tick_ctime, late_time;
6871
6872 struct sched_update_scan_context scan_context = {
6873 .earliest_bg_make_runnable_time = UINT64_MAX,
6874 .earliest_normal_make_runnable_time = UINT64_MAX,
6875 .earliest_rt_make_runnable_time = UINT64_MAX
6876 };
6877
6878 sched_tick_ctime = mach_absolute_time();
6879
6880 if (__improbable(sched_tick_last_abstime == 0)) {
6881 sched_tick_last_abstime = sched_tick_ctime;
6882 late_time = 0;
6883 sched_tick_delta = 1;
6884 } else {
6885 late_time = sched_tick_ctime - sched_tick_last_abstime;
6886 sched_tick_delta = late_time / sched_tick_interval;
		/* Ensure a delta of at least 1, since the interval could be
		 * slightly smaller than the sched_tick_interval due to dispatch
		 * latencies.
		 */
6891 sched_tick_delta = MAX(sched_tick_delta, 1);
6892
6893 /* In the event interrupt latencies or platform
6894 * idle events that advanced the timebase resulted
6895 * in periods where no threads were dispatched,
6896 * cap the maximum "tick delta" at SCHED_TICK_MAX_DELTA
6897 * iterations.
6898 */
6899 sched_tick_delta = MIN(sched_tick_delta, SCHED_TICK_MAX_DELTA);
6900
6901 sched_tick_last_abstime = sched_tick_ctime;
6902 sched_tick_max_delta = MAX(sched_tick_delta, sched_tick_max_delta);
6903 }
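	/*
	 * Example: if 2.5 sched_tick_intervals have elapsed since the last
	 * pass, late_time / sched_tick_interval yields 2 pseudo-ticks, which
	 * is then clamped to the range [1, SCHED_TICK_MAX_DELTA].
	 */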
6904
6905 scan_context.sched_tick_last_abstime = sched_tick_last_abstime;
6906 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_START,
6907 sched_tick_delta, late_time, 0, 0, 0);
6908
	/* Add a number of pseudo-ticks corresponding to the elapsed interval.
	 * This can be greater than 1 when substantial intervals pass with
	 * all processors idle, which rarely happens in practice.
	 */
6913
6914 sched_tick += sched_tick_delta;
6915
6916 update_vm_info();
6917
6918 /*
6919 * Compute various averages.
6920 */
6921 compute_averages(sched_tick_delta);
6922
6923 /*
6924 * Scan the run queues for threads which
6925 * may need to be updated, and find the earliest runnable thread on the runqueue
6926 * to report its latency.
6927 */
6928 SCHED(thread_update_scan)(&scan_context);
6929
6930 SCHED(rt_runq_scan)(&scan_context);
6931
6932 uint64_t ctime = mach_absolute_time();
6933
6934 uint64_t bg_max_latency = (ctime > scan_context.earliest_bg_make_runnable_time) ?
6935 ctime - scan_context.earliest_bg_make_runnable_time : 0;
6936
6937 uint64_t default_max_latency = (ctime > scan_context.earliest_normal_make_runnable_time) ?
6938 ctime - scan_context.earliest_normal_make_runnable_time : 0;
6939
6940 uint64_t realtime_max_latency = (ctime > scan_context.earliest_rt_make_runnable_time) ?
6941 ctime - scan_context.earliest_rt_make_runnable_time : 0;
6942
6943 machine_max_runnable_latency(bg_max_latency, default_max_latency, realtime_max_latency);
6944
6945 /*
6946 * Check to see if the special sched VM group needs attention.
6947 */
6948 sched_vm_group_maintenance();
6949
6950#if __arm64__
6951 /* Check to see if the recommended cores failsafe is active */
6952 sched_recommended_cores_maintenance();
6953#endif /* __arm64__ */
6954
6955
6956#if DEBUG || DEVELOPMENT
6957#if __x86_64__
6958#include <i386/misc_protos.h>
6959 /* Check for long-duration interrupts */
6960 mp_interrupt_watchdog();
6961#endif /* __x86_64__ */
6962#endif /* DEBUG || DEVELOPMENT */
6963
6964 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_END,
6965 sched_pri_shifts[TH_BUCKET_SHARE_FG], sched_pri_shifts[TH_BUCKET_SHARE_BG],
6966 sched_pri_shifts[TH_BUCKET_SHARE_UT], sched_pri_shifts[TH_BUCKET_SHARE_DF], 0);
6967
	assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT);
	thread_block((thread_continue_t)sched_timeshare_maintenance_continue);
6970 /*NOTREACHED*/
6971}
6972
6973static uint64_t sched_maintenance_wakeups;
6974
6975/*
6976 * Determine if the set of routines formerly driven by a maintenance timer
6977 * must be invoked, based on a deadline comparison. Signals the scheduler
6978 * maintenance thread on deadline expiration. Must be invoked at an interval
6979 * lower than the "sched_tick_interval", currently accomplished by
6980 * invocation via the quantum expiration timer and at context switch time.
6981 * Performance matters: this routine reuses a timestamp approximating the
6982 * current absolute time received from the caller, and should perform
6983 * no more than a comparison against the deadline in the common case.
6984 */
6985void
6986sched_timeshare_consider_maintenance(uint64_t ctime, bool safe_point)
6987{
6988 uint64_t deadline = sched_maintenance_deadline;
6989
6990 if (__improbable(ctime >= deadline)) {
6991 if (__improbable(current_thread() == sched_maintenance_thread)) {
6992 return;
6993 }
6994 OSMemoryBarrier();
6995
6996 uint64_t ndeadline = ctime + sched_tick_interval;
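		/*
		 * Only one of the racing callers will win the compare-and-swap
		 * below; that caller advances the deadline and wakes the
		 * maintenance thread, the others simply fall through.
		 */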
6997
6998 if (__probable(os_atomic_cmpxchg(&sched_maintenance_deadline, deadline, ndeadline, seq_cst))) {
6999 thread_wakeup((event_t)sched_timeshare_maintenance_continue);
7000 sched_maintenance_wakeups++;
7001 smr_maintenance(ctime);
7002 }
7003 }
7004
7005 smr_cpu_tick(ctime, safe_point);
7006
7007#if !CONFIG_SCHED_CLUTCH
	/*
	 * Only non-clutch schedulers use the global EWMA load calculation. For the
	 * clutch scheduler, load is maintained at the thread-group and bucket level.
	 */
7012 uint64_t load_compute_deadline = os_atomic_load_wide(&sched_load_compute_deadline, relaxed);
7013
7014 if (__improbable(load_compute_deadline && ctime >= load_compute_deadline)) {
7015 uint64_t new_deadline = 0;
7016 if (os_atomic_cmpxchg(&sched_load_compute_deadline, load_compute_deadline, new_deadline, relaxed)) {
7017 compute_sched_load();
7018 new_deadline = ctime + sched_load_compute_interval_abs;
7019 os_atomic_store_wide(&sched_load_compute_deadline, new_deadline, relaxed);
7020 }
7021 }
7022#endif /* CONFIG_SCHED_CLUTCH */
7023
7024#if __arm64__
7025 uint64_t perf_deadline = os_atomic_load(&sched_perfcontrol_callback_deadline, relaxed);
7026
7027 if (__improbable(perf_deadline && ctime >= perf_deadline)) {
7028 /* CAS in 0, if success, make callback. Otherwise let the next context switch check again. */
7029 if (os_atomic_cmpxchg(&sched_perfcontrol_callback_deadline, perf_deadline, 0, relaxed)) {
			machine_perfcontrol_deadline_passed(perf_deadline);
7031 }
7032 }
7033#endif /* __arm64__ */
7034}
7035
7036#endif /* CONFIG_SCHED_TIMESHARE_CORE */
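
/*
 * sched_init_thread:
 *
 * Entry point for the scheduler maintenance thread: blocks once, names
 * and records itself as sched_maintenance_thread, then enters the
 * scheduler's maintenance continuation, which never returns.
 */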
7037
7038void
7039sched_init_thread(void)
7040{
7041 thread_block(THREAD_CONTINUE_NULL);
7042
7043 thread_t thread = current_thread();
7044
	thread_set_thread_name(thread, "sched_maintenance_thread");
7046
7047 sched_maintenance_thread = thread;
7048
7049 SCHED(maintenance_continuation)();
7050
7051 /*NOTREACHED*/
7052}
7053
7054#if defined(CONFIG_SCHED_TIMESHARE_CORE)
7055
7056/*
7057 * thread_update_scan / runq_scan:
7058 *
7059 * Scan the run queues to account for timesharing threads
7060 * which need to be updated.
7061 *
7062 * Scanner runs in two passes. Pass one squirrels likely
7063 * threads away in an array, pass two does the update.
7064 *
7065 * This is necessary because the run queue is locked for
7066 * the candidate scan, but the thread is locked for the update.
7067 *
7068 * Array should be sized to make forward progress, without
7069 * disabling preemption for long periods.
7070 */
7071
7072#define THREAD_UPDATE_SIZE 128
7073
7074static thread_t thread_update_array[THREAD_UPDATE_SIZE];
7075static uint32_t thread_update_count = 0;
7076
7077/* Returns TRUE if thread was added, FALSE if thread_update_array is full */
7078boolean_t
7079thread_update_add_thread(thread_t thread)
7080{
7081 if (thread_update_count == THREAD_UPDATE_SIZE) {
7082 return FALSE;
7083 }
7084
7085 thread_update_array[thread_update_count++] = thread;
7086 thread_reference(thread);
7087 return TRUE;
7088}
7089
7090void
7091thread_update_process_threads(void)
7092{
7093 assert(thread_update_count <= THREAD_UPDATE_SIZE);
7094
7095 for (uint32_t i = 0; i < thread_update_count; i++) {
7096 thread_t thread = thread_update_array[i];
7097 assert_thread_magic(thread);
7098 thread_update_array[i] = THREAD_NULL;
7099
7100 spl_t s = splsched();
7101 thread_lock(thread);
7102 if (!(thread->state & (TH_WAIT)) && thread->sched_stamp != sched_tick) {
7103 SCHED(update_priority)(thread);
7104 }
7105 thread_unlock(thread);
7106 splx(s);
7107
7108 thread_deallocate(thread);
7109 }
7110
7111 thread_update_count = 0;
7112}
7113
7114static boolean_t
7115runq_scan_thread(
7116 thread_t thread,
7117 sched_update_scan_context_t scan_context)
7118{
7119 assert_thread_magic(thread);
7120
7121 if (thread->sched_stamp != sched_tick &&
7122 thread->sched_mode == TH_MODE_TIMESHARE) {
7123 if (thread_update_add_thread(thread) == FALSE) {
7124 return TRUE;
7125 }
7126 }
7127
7128 if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
7129 if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) {
7130 scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time;
7131 }
7132 } else {
7133 if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) {
7134 scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time;
7135 }
7136 }
7137
7138 return FALSE;
7139}
7140
7141/*
7142 * Scan a runq for candidate threads.
7143 *
7144 * Returns TRUE if retry is needed.
7145 */
7146boolean_t
7147runq_scan(
7148 run_queue_t runq,
7149 sched_update_scan_context_t scan_context)
7150{
7151 int count = runq->count;
7152 int queue_index;
7153
7154 assert(count >= 0);
7155
7156 if (count == 0) {
7157 return FALSE;
7158 }
7159
	for (queue_index = bitmap_first(runq->bitmap, NRQS);
	    queue_index >= 0;
	    queue_index = bitmap_next(runq->bitmap, queue_index)) {
7163 thread_t thread;
7164 circle_queue_t queue = &runq->queues[queue_index];
7165
7166 cqe_foreach_element(thread, queue, runq_links) {
7167 assert(count > 0);
7168 if (runq_scan_thread(thread, scan_context) == TRUE) {
7169 return TRUE;
7170 }
7171 count--;
7172 }
7173 }
7174
7175 return FALSE;
7176}
7177
7178#if CONFIG_SCHED_CLUTCH
7179
7180boolean_t
7181sched_clutch_timeshare_scan(
7182 queue_t thread_queue,
7183 uint16_t thread_count,
7184 sched_update_scan_context_t scan_context)
7185{
7186 if (thread_count == 0) {
7187 return FALSE;
7188 }
7189
7190 thread_t thread;
7191 qe_foreach_element_safe(thread, thread_queue, th_clutch_timeshare_link) {
7192 if (runq_scan_thread(thread, scan_context) == TRUE) {
7193 return TRUE;
7194 }
7195 thread_count--;
7196 }
7197
7198 assert(thread_count == 0);
7199 return FALSE;
7200}
7201
7202
7203#endif /* CONFIG_SCHED_CLUTCH */
7204
7205#endif /* CONFIG_SCHED_TIMESHARE_CORE */
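
/*
 * Eager-preemption flag handling. Setting TH_SFLAG_EAGERPREEMPT on the
 * current thread re-runs csw_check() and blocks right away if that returns
 * a pending AST; setting it on a thread running on another processor sends
 * that processor an AST check so it notices the change promptly.
 */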
7206
7207bool
7208thread_is_eager_preempt(thread_t thread)
7209{
7210 return thread->sched_flags & TH_SFLAG_EAGERPREEMPT;
7211}
7212
7213void
7214thread_set_eager_preempt(thread_t thread)
7215{
7216 spl_t s = splsched();
7217 thread_lock(thread);
7218
7219 assert(!thread_is_eager_preempt(thread));
7220
7221 thread->sched_flags |= TH_SFLAG_EAGERPREEMPT;
7222
7223 if (thread == current_thread()) {
7224 /* csw_check updates current_is_eagerpreempt on the processor */
		ast_t ast = csw_check(thread, current_processor(), AST_NONE);
7226
7227 thread_unlock(thread);
7228
7229 if (ast != AST_NONE) {
			thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
7231 }
7232 } else {
7233 processor_t last_processor = thread->last_processor;
7234
7235 if (last_processor != PROCESSOR_NULL &&
7236 last_processor->state == PROCESSOR_RUNNING &&
7237 last_processor->active_thread == thread) {
			cause_ast_check(last_processor);
7239 }
7240
7241 thread_unlock(thread);
7242 }
7243
7244 splx(s);
7245}
7246
7247void
7248thread_clear_eager_preempt(thread_t thread)
7249{
7250 spl_t s = splsched();
7251 thread_lock(thread);
7252
7253 assert(thread_is_eager_preempt(thread));
7254
7255 thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT;
7256
7257 if (thread == current_thread()) {
7258 current_processor()->current_is_eagerpreempt = false;
7259 }
7260
7261 thread_unlock(thread);
7262 splx(s);
7263}
7264
7265/*
7266 * Scheduling statistics
7267 */
7268void
7269sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri)
7270{
7271 struct sched_statistics *stats;
7272 boolean_t to_realtime = FALSE;
7273
7274 stats = PERCPU_GET_RELATIVE(sched_stats, processor, processor);
7275 stats->csw_count++;
7276
7277 if (otherpri >= BASEPRI_REALTIME) {
7278 stats->rt_sched_count++;
7279 to_realtime = TRUE;
7280 }
7281
7282 if ((reasons & AST_PREEMPT) != 0) {
7283 stats->preempt_count++;
7284
7285 if (selfpri >= BASEPRI_REALTIME) {
7286 stats->preempted_rt_count++;
7287 }
7288
7289 if (to_realtime) {
7290 stats->preempted_by_rt_count++;
7291 }
7292 }
7293}
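
/*
 * Accumulate the time-weighted run queue depth: the queue spent
 * (timestamp - last_change_timestamp) at a depth of old_count, so add that
 * product to count_sum before recording the new change time.
 */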
7294
7295void
7296sched_stats_handle_runq_change(struct runq_stats *stats, int old_count)
7297{
7298 uint64_t timestamp = mach_absolute_time();
7299
7300 stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count;
7301 stats->last_change_timestamp = timestamp;
7302}
7303
7304/*
7305 * For calls from assembly code
7306 */
7307#undef thread_wakeup
7308void
7309thread_wakeup(
7310 event_t x);
7311
7312void
7313thread_wakeup(
7314 event_t x)
7315{
7316 thread_wakeup_with_result(x, THREAD_AWAKENED);
7317}
7318
7319boolean_t
7320preemption_enabled(void)
7321{
7322 return get_preemption_level() == 0 && ml_get_interrupts_enabled();
7323}
7324
7325static void
7326sched_timer_deadline_tracking_init(void)
7327{
	nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT, &timer_deadline_tracking_bin_1);
	nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2);
7330}
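
/*
 * Requested powered-core state. latest_requested_* hold the most recent
 * request handed to the scheduler; current_requested_powered_cores is the
 * mask most recently applied by sched_update_powered_cores_continue().
 * These fields are read and written under sched_available_cores_lock,
 * while cluster_powerdown_lock serializes the actual power transitions.
 */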
7331
7332static uint64_t latest_requested_powered_cores = ALL_CORES_POWERED;
7333processor_reason_t latest_requested_reason = REASON_NONE;
7334static uint64_t current_requested_powered_cores = ALL_CORES_POWERED;
7335bool perfcontrol_sleep_override = false;
7336
7337LCK_GRP_DECLARE(cluster_powerdown_grp, "cluster_powerdown");
7338LCK_MTX_DECLARE(cluster_powerdown_lock, &cluster_powerdown_grp);
7339int32_t cluster_powerdown_suspend_count = 0;
7340
7341bool
7342sched_is_in_sleep(void)
7343{
7344 os_atomic_thread_fence(acquire);
7345 return perfcontrol_sleep_override;
7346}
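
/*
 * Continuation for the powered-cores update thread. Repeatedly applies the
 * latest requested powered-core mask until the request stops changing, then
 * blocks until sched_perfcontrol_update_powered_cores() wakes it again.
 * The actual power transitions are skipped while cluster powerdown is
 * suspended or the system is entering sleep.
 */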
7347
7348static void
7349sched_update_powered_cores_continue(void)
7350{
	lck_mtx_lock(&cluster_powerdown_lock);
7352
7353 if (!cluster_powerdown_suspend_count) {
7354 spl_t s = splsched();
7355 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7356
7357 uint64_t latest = latest_requested_powered_cores;
7358 processor_reason_t reason = latest_requested_reason;
7359 uint64_t current = current_requested_powered_cores;
7360 current_requested_powered_cores = latest;
7361 bool in_sleep = perfcontrol_sleep_override;
7362
7363 simple_unlock(&sched_available_cores_lock);
7364 splx(s);
7365
7366 while (latest != current) {
7367 if (!in_sleep) {
7368 assert((reason == REASON_CLPC_SYSTEM) || (reason == REASON_CLPC_USER));
				sched_update_powered_cores(latest, reason, SHUTDOWN_TEMPORARY | WAIT_FOR_LAST_START);
7370 }
7371
7372 s = splsched();
7373 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7374
7375 latest = latest_requested_powered_cores;
7376 reason = latest_requested_reason;
7377 current = current_requested_powered_cores;
7378 current_requested_powered_cores = latest;
7379 in_sleep = perfcontrol_sleep_override;
7380
7381 simple_unlock(&sched_available_cores_lock);
7382 splx(s);
7383 }
7384
		assert_wait((event_t)sched_update_powered_cores_continue, THREAD_UNINT);
7386
7387 s = splsched();
7388 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7389 if (latest_requested_powered_cores != current_requested_powered_cores) {
			clear_wait(current_thread(), THREAD_AWAKENED);
7391 }
7392 simple_unlock(&sched_available_cores_lock);
7393 splx(s);
7394 }
7395
	lck_mtx_unlock(&cluster_powerdown_lock);

	thread_block((thread_continue_t)sched_update_powered_cores_continue);
7399 /*NOTREACHED*/
7400}
7401
7402void
7403sched_perfcontrol_update_powered_cores(uint64_t requested_powered_cores, processor_reason_t reason, __unused uint32_t flags)
7404{
7405 assert((reason == REASON_CLPC_SYSTEM) || (reason == REASON_CLPC_USER));
7406
7407#if DEVELOPMENT || DEBUG
7408 if (flags & (ASSERT_IN_SLEEP | ASSERT_POWERDOWN_SUSPENDED)) {
7409 if (flags & ASSERT_POWERDOWN_SUSPENDED) {
7410 assert(cluster_powerdown_suspend_count > 0);
7411 }
7412 if (flags & ASSERT_IN_SLEEP) {
7413 assert(perfcontrol_sleep_override == true);
7414 }
7415 return;
7416 }
7417#endif
7418
7419 spl_t s = splsched();
7420 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7421
7422 bool should_wakeup = !cluster_powerdown_suspend_count;
7423 if (should_wakeup) {
7424 latest_requested_powered_cores = requested_powered_cores;
7425 latest_requested_reason = reason;
7426 }
7427
7428 simple_unlock(&sched_available_cores_lock);
7429 splx(s);
7430
7431 if (should_wakeup) {
		thread_wakeup((event_t)sched_update_powered_cores_continue);
7433 }
7434}
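
/*
 * Temporarily prevent cluster powerdown. The first suspend resets the
 * requested state to ALL_CORES_POWERED and powers every core back up with
 * the state locked; later CLPC requests are ignored until the matching
 * resume_cluster_powerdown() drops the last suspend reference.
 */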
7435
7436void
7437suspend_cluster_powerdown(void)
7438{
	lck_mtx_lock(&cluster_powerdown_lock);
7440
7441 assert(cluster_powerdown_suspend_count >= 0);
7442
7443 bool first_suspend = (cluster_powerdown_suspend_count == 0);
7444 if (first_suspend) {
7445 spl_t s = splsched();
7446 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7447 latest_requested_powered_cores = ALL_CORES_POWERED;
7448 current_requested_powered_cores = ALL_CORES_POWERED;
7449 latest_requested_reason = REASON_SYSTEM;
7450 simple_unlock(&sched_available_cores_lock);
7451 splx(s);
7452 }
7453
7454 cluster_powerdown_suspend_count++;
7455
7456 if (first_suspend) {
		kprintf("%s>calling sched_update_powered_cores(ALL_CORES_POWERED, REASON_SYSTEM, LOCK_STATE | WAIT_FOR_START)\n", __FUNCTION__);
		sched_update_powered_cores(ALL_CORES_POWERED, REASON_SYSTEM, LOCK_STATE | WAIT_FOR_START);
7459 }
7460
	lck_mtx_unlock(&cluster_powerdown_lock);
7462}
7463
7464void
7465resume_cluster_powerdown(void)
7466{
	lck_mtx_lock(&cluster_powerdown_lock);
7468
7469 if (cluster_powerdown_suspend_count <= 0) {
7470 panic("resume_cluster_powerdown() called with cluster_powerdown_suspend_count=%d\n", cluster_powerdown_suspend_count);
7471 }
7472
7473 cluster_powerdown_suspend_count--;
7474
7475 bool last_resume = (cluster_powerdown_suspend_count == 0);
7476
7477 if (last_resume) {
7478 spl_t s = splsched();
7479 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7480 latest_requested_powered_cores = ALL_CORES_POWERED;
7481 current_requested_powered_cores = ALL_CORES_POWERED;
7482 latest_requested_reason = REASON_SYSTEM;
7483 simple_unlock(&sched_available_cores_lock);
7484 splx(s);
7485
		kprintf("%s>calling sched_update_powered_cores(ALL_CORES_POWERED, REASON_SYSTEM, UNLOCK_STATE)\n", __FUNCTION__);
		sched_update_powered_cores(ALL_CORES_POWERED, REASON_SYSTEM, UNLOCK_STATE);
7488 }
7489
	lck_mtx_unlock(&cluster_powerdown_lock);
7491}
7492
7493LCK_MTX_DECLARE(user_cluster_powerdown_lock, &cluster_powerdown_grp);
7494static bool user_suspended_cluster_powerdown = false;
7495
7496kern_return_t
7497suspend_cluster_powerdown_from_user(void)
7498{
7499 kern_return_t ret = KERN_FAILURE;
7500
	lck_mtx_lock(&user_cluster_powerdown_lock);
7502
7503 if (!user_suspended_cluster_powerdown) {
7504 suspend_cluster_powerdown();
7505 user_suspended_cluster_powerdown = true;
7506 ret = KERN_SUCCESS;
7507 }
7508
	lck_mtx_unlock(&user_cluster_powerdown_lock);
7510
7511 return ret;
7512}
7513
7514kern_return_t
7515resume_cluster_powerdown_from_user(void)
7516{
7517 kern_return_t ret = KERN_FAILURE;
7518
	lck_mtx_lock(&user_cluster_powerdown_lock);
7520
7521 if (user_suspended_cluster_powerdown) {
7522 resume_cluster_powerdown();
7523 user_suspended_cluster_powerdown = false;
7524 ret = KERN_SUCCESS;
7525 }
7526
	lck_mtx_unlock(&user_cluster_powerdown_lock);
7528
7529 return ret;
7530}
7531
7532int
7533get_cluster_powerdown_user_suspended(void)
7534{
	lck_mtx_lock(&user_cluster_powerdown_lock);
7536
7537 int ret = (int)user_suspended_cluster_powerdown;
7538
	lck_mtx_unlock(&user_cluster_powerdown_lock);
7540
7541 return ret;
7542}
7543
7544#if DEVELOPMENT || DEBUG
7545/* Functions to support the temporary sysctl */
7546static uint64_t saved_requested_powered_cores = ALL_CORES_POWERED;
7547void
7548sched_set_powered_cores(int requested_powered_cores)
7549{
7550 processor_reason_t reason = bit_test(requested_powered_cores, 31) ? REASON_CLPC_USER : REASON_CLPC_SYSTEM;
7551 uint32_t flags = requested_powered_cores & 0x30000000;
7552
7553 saved_requested_powered_cores = requested_powered_cores;
7554
7555 requested_powered_cores = bits(requested_powered_cores, 28, 0);
7556
7557 sched_perfcontrol_update_powered_cores(requested_powered_cores, reason, flags);
7558}
7559int
7560sched_get_powered_cores(void)
7561{
7562 return (int)saved_requested_powered_cores;
7563}
7564#endif
7565
7566/*
7567 * Ensure that all cores are powered and recommended before sleep
7568 */
7569void
7570sched_override_available_cores_for_sleep(void)
7571{
7572 spl_t s = splsched();
7573 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7574
7575 if (perfcontrol_sleep_override == false) {
7576 perfcontrol_sleep_override = true;
7577#if __arm__ || __arm64__
		sched_update_recommended_cores(ALL_CORES_RECOMMENDED, REASON_SYSTEM, 0);
7579#endif
7580 }
7581
7582 simple_unlock(&sched_available_cores_lock);
7583 splx(s);
7584
7585 suspend_cluster_powerdown();
7586}
7587
7588/*
7589 * Restore the previously recommended cores, but leave all cores powered
7590 * after sleep
7591 */
7592void
7593sched_restore_available_cores_after_sleep(void)
7594{
7595 spl_t s = splsched();
7596 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7597
7598 if (perfcontrol_sleep_override == true) {
7599 perfcontrol_sleep_override = false;
7600#if __arm__ || __arm64__
		sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores,
		    REASON_NONE, 0);
7603#endif
7604 }
7605
7606 simple_unlock(&sched_available_cores_lock);
7607 splx(s);
7608
7609 resume_cluster_powerdown();
7610}
7611
7612#if __arm__ || __arm64__
7613
7614uint32_t perfcontrol_requested_recommended_core_count = MAX_CPUS;
7615bool perfcontrol_failsafe_active = false;
7616
7617uint64_t perfcontrol_failsafe_maintenance_runnable_time;
7618uint64_t perfcontrol_failsafe_activation_time;
7619uint64_t perfcontrol_failsafe_deactivation_time;
7620
7621/* data covering who likely caused it and how long they ran */
7622#define FAILSAFE_NAME_LEN 33 /* (2*MAXCOMLEN)+1 from size of p_name */
7623char perfcontrol_failsafe_name[FAILSAFE_NAME_LEN];
7624int perfcontrol_failsafe_pid;
7625uint64_t perfcontrol_failsafe_tid;
7626uint64_t perfcontrol_failsafe_thread_timer_at_start;
7627uint64_t perfcontrol_failsafe_thread_timer_last_seen;
7628uint64_t perfcontrol_failsafe_recommended_at_trigger;
7629
7630/*
7631 * Perf controller calls here to update the recommended core bitmask.
7632 * If the failsafe is active, we don't immediately apply the new value.
7633 * Instead, we store the new request and use it after the failsafe deactivates.
7634 *
7635 * If the failsafe is not active, immediately apply the update.
7636 *
7637 * No scheduler locks are held, no other locks are held that scheduler might depend on,
7638 * interrupts are enabled
7639 *
7640 * currently prototype is in osfmk/arm/machine_routines.h
7641 */
7642void
7643sched_perfcontrol_update_recommended_cores_reason(uint64_t recommended_cores, processor_reason_t reason, uint32_t flags)
7644{
7645 assert(preemption_enabled());
7646
7647 spl_t s = splsched();
7648 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7649
7650 if (reason == REASON_CLPC_SYSTEM) {
7651 perfcontrol_system_requested_recommended_cores = recommended_cores;
7652 } else {
7653 assert(reason == REASON_CLPC_USER);
7654 perfcontrol_user_requested_recommended_cores = recommended_cores;
7655 }
7656
7657 perfcontrol_requested_recommended_cores = perfcontrol_system_requested_recommended_cores & perfcontrol_user_requested_recommended_cores;
7658 perfcontrol_requested_recommended_core_count = __builtin_popcountll(perfcontrol_requested_recommended_cores);
7659
7660 if ((perfcontrol_failsafe_active == false) && (perfcontrol_sleep_override == false)) {
		sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores, reason, flags);
7662 } else {
7663 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
7664 MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_NONE,
7665 perfcontrol_requested_recommended_cores,
7666 sched_maintenance_thread->last_made_runnable_time, 0, 0, 0);
7667 }
7668
7669 simple_unlock(&sched_available_cores_lock);
7670 splx(s);
7671}
7672
7673void
7674sched_perfcontrol_update_recommended_cores(uint32_t recommended_cores)
7675{
	sched_perfcontrol_update_recommended_cores_reason(recommended_cores, REASON_CLPC_USER, 0);
7677}
7678
7679/*
7680 * Consider whether we need to activate the recommended cores failsafe
7681 *
7682 * Called from quantum timer interrupt context of a realtime thread
7683 * No scheduler locks are held, interrupts are disabled
7684 */
7685void
7686sched_consider_recommended_cores(uint64_t ctime, thread_t cur_thread)
7687{
7688 /*
7689 * Check if a realtime thread is starving the system
7690 * and bringing up non-recommended cores would help
7691 *
7692 * TODO: Is this the correct check for recommended == possible cores?
7693 * TODO: Validate the checks without the relevant lock are OK.
7694 */
7695
7696 if (__improbable(perfcontrol_failsafe_active == TRUE)) {
7697 /* keep track of how long the responsible thread runs */
7698 uint64_t cur_th_time = recount_current_thread_time_mach();
7699
7700 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7701
7702 if (perfcontrol_failsafe_active == TRUE &&
7703 cur_thread->thread_id == perfcontrol_failsafe_tid) {
7704 perfcontrol_failsafe_thread_timer_last_seen = cur_th_time;
7705 }
7706
7707 simple_unlock(&sched_available_cores_lock);
7708
7709 /* we're already trying to solve the problem, so bail */
7710 return;
7711 }
7712
7713 /* The failsafe won't help if there are no more processors to enable */
7714 if (__probable(perfcontrol_requested_recommended_core_count >= processor_count)) {
7715 return;
7716 }
7717
7718 uint64_t too_long_ago = ctime - perfcontrol_failsafe_starvation_threshold;
7719
7720 /* Use the maintenance thread as our canary in the coal mine */
7721 thread_t m_thread = sched_maintenance_thread;
7722
7723 /* If it doesn't look bad, nothing to see here */
7724 if (__probable(m_thread->last_made_runnable_time >= too_long_ago)) {
7725 return;
7726 }
7727
7728 /* It looks bad, take the lock to be sure */
7729 thread_lock(m_thread);
7730
	if (thread_get_runq(m_thread) == PROCESSOR_NULL ||
7732 (m_thread->state & (TH_RUN | TH_WAIT)) != TH_RUN ||
7733 m_thread->last_made_runnable_time >= too_long_ago) {
7734 /*
7735 * Maintenance thread is either on cpu or blocked, and
7736 * therefore wouldn't benefit from more cores
7737 */
7738 thread_unlock(m_thread);
7739 return;
7740 }
7741
7742 uint64_t maintenance_runnable_time = m_thread->last_made_runnable_time;
7743
7744 thread_unlock(m_thread);
7745
7746 /*
7747 * There are cores disabled at perfcontrol's recommendation, but the
7748 * system is so overloaded that the maintenance thread can't run.
7749 * That likely means that perfcontrol can't run either, so it can't fix
7750 * the recommendation. We have to kick in a failsafe to keep from starving.
7751 *
7752 * When the maintenance thread has been starved for too long,
7753 * ignore the recommendation from perfcontrol and light up all the cores.
7754 *
7755 * TODO: Consider weird states like boot, sleep, or debugger
7756 */
7757
7758 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7759
7760 if (perfcontrol_failsafe_active == TRUE) {
7761 simple_unlock(&sched_available_cores_lock);
7762 return;
7763 }
7764
7765 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
7766 MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_START,
7767 perfcontrol_requested_recommended_cores, maintenance_runnable_time, 0, 0, 0);
7768
7769 perfcontrol_failsafe_active = TRUE;
7770 perfcontrol_failsafe_activation_time = mach_absolute_time();
7771 perfcontrol_failsafe_maintenance_runnable_time = maintenance_runnable_time;
7772 perfcontrol_failsafe_recommended_at_trigger = perfcontrol_requested_recommended_cores;
7773
7774 /* Capture some data about who screwed up (assuming that the thread on core is at fault) */
7775 task_t task = get_threadtask(cur_thread);
7776 perfcontrol_failsafe_pid = task_pid(task);
	strlcpy(perfcontrol_failsafe_name, proc_name_address(get_bsdtask_info(task)), sizeof(perfcontrol_failsafe_name));
7778
7779 perfcontrol_failsafe_tid = cur_thread->thread_id;
7780
7781 /* Blame the thread for time it has run recently */
7782 uint64_t recent_computation = (ctime - cur_thread->computation_epoch) + cur_thread->computation_metered;
7783
7784 uint64_t last_seen = recount_current_thread_time_mach();
7785
7786 /* Compute the start time of the bad behavior in terms of the thread's on core time */
7787 perfcontrol_failsafe_thread_timer_at_start = last_seen - recent_computation;
7788 perfcontrol_failsafe_thread_timer_last_seen = last_seen;
7789
7790 /* Ignore the previously recommended core configuration */
	sched_update_recommended_cores(ALL_CORES_RECOMMENDED, REASON_SYSTEM, 0);
7792
7793 simple_unlock(&sched_available_cores_lock);
7794}
7795
7796/*
7797 * Now that our bacon has been saved by the failsafe, consider whether to turn it off
7798 *
7799 * Runs in the context of the maintenance thread, no locks held
7800 */
7801static void
7802sched_recommended_cores_maintenance(void)
7803{
7804 /* Common case - no failsafe, nothing to be done here */
7805 if (__probable(perfcontrol_failsafe_active == FALSE)) {
7806 return;
7807 }
7808
7809 uint64_t ctime = mach_absolute_time();
7810
7811 boolean_t print_diagnostic = FALSE;
7812 char p_name[FAILSAFE_NAME_LEN] = "";
7813
7814 spl_t s = splsched();
7815 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7816
7817 /* Check again, under the lock, to avoid races */
7818 if (perfcontrol_failsafe_active == FALSE) {
7819 goto out;
7820 }
7821
7822 /*
7823 * Ensure that the other cores get another few ticks to run some threads
7824 * If we don't have this hysteresis, the maintenance thread is the first
7825 * to run, and then it immediately kills the other cores
7826 */
7827 if ((ctime - perfcontrol_failsafe_activation_time) < perfcontrol_failsafe_starvation_threshold) {
7828 goto out;
7829 }
7830
7831 /* Capture some diagnostic state under the lock so we can print it out later */
7832
7833 int pid = perfcontrol_failsafe_pid;
7834 uint64_t tid = perfcontrol_failsafe_tid;
7835
7836 uint64_t thread_usage = perfcontrol_failsafe_thread_timer_last_seen -
7837 perfcontrol_failsafe_thread_timer_at_start;
7838 uint64_t rec_cores_before = perfcontrol_failsafe_recommended_at_trigger;
7839 uint64_t rec_cores_after = perfcontrol_requested_recommended_cores;
7840 uint64_t failsafe_duration = ctime - perfcontrol_failsafe_activation_time;
	strlcpy(p_name, perfcontrol_failsafe_name, sizeof(p_name));
7842
7843 print_diagnostic = TRUE;
7844
7845 /* Deactivate the failsafe and reinstate the requested recommendation settings */
7846
7847 perfcontrol_failsafe_deactivation_time = ctime;
7848 perfcontrol_failsafe_active = FALSE;
7849
7850 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
7851 MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_END,
7852 perfcontrol_requested_recommended_cores, failsafe_duration, 0, 0, 0);
7853
	sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores,
	    REASON_NONE, 0);
7856
7857out:
7858 simple_unlock(&sched_available_cores_lock);
7859 splx(s);
7860
7861 if (print_diagnostic) {
7862 uint64_t failsafe_duration_ms = 0, thread_usage_ms = 0;
7863
		absolutetime_to_nanoseconds(failsafe_duration, &failsafe_duration_ms);
		failsafe_duration_ms = failsafe_duration_ms / NSEC_PER_MSEC;

		absolutetime_to_nanoseconds(thread_usage, &thread_usage_ms);
		thread_usage_ms = thread_usage_ms / NSEC_PER_MSEC;

		printf("recommended core failsafe kicked in for %lld ms "
7871 "likely due to %s[%d] thread 0x%llx spending "
7872 "%lld ms on cpu at realtime priority - "
7873 "new recommendation: 0x%llx -> 0x%llx\n",
7874 failsafe_duration_ms, p_name, pid, tid, thread_usage_ms,
7875 rec_cores_before, rec_cores_after);
7876 }
7877}
7878
#endif /* __arm__ || __arm64__ */
7880
7881kern_return_t
7882sched_processor_enable(processor_t processor, boolean_t enable)
7883{
7884 assert(preemption_enabled());
7885
7886 if (processor == master_processor) {
7887 /* The system can hang if this is allowed */
7888 return KERN_NOT_SUPPORTED;
7889 }
7890
7891 spl_t s = splsched();
7892 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7893
7894 if (enable) {
7895 bit_set(usercontrol_requested_recommended_cores, processor->cpu_id);
7896 } else {
7897 bit_clear(usercontrol_requested_recommended_cores, processor->cpu_id);
7898 }
7899
7900#if __arm64__
7901 if ((perfcontrol_failsafe_active == false) && (perfcontrol_sleep_override == false)) {
		sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores,
		    REASON_USER, 0);
7904 } else {
7905 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
7906 MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_NONE,
7907 perfcontrol_requested_recommended_cores,
7908 sched_maintenance_thread->last_made_runnable_time, 0, 0, 0);
7909 }
7910#else /* __arm64__ */
7911 sched_update_recommended_cores(usercontrol_requested_recommended_cores, REASON_USER, 0);
7912#endif /* ! __arm64__ */
7913
7914 simple_unlock(&sched_available_cores_lock);
7915 splx(s);
7916
7917 return KERN_SUCCESS;
7918}
7919
7920void
7921sched_mark_processor_online_locked(processor_t processor, __assert_only processor_reason_t reason)
7922{
7923 assert((processor != master_processor) || (reason == REASON_SYSTEM));
7924
7925 bit_set(sched_online_processors, processor->cpu_id);
7926}
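
/*
 * Remove a processor from the online mask. REASON_SYSTEM requests always
 * succeed; other callers are refused if the processor is already offline
 * (KERN_NOT_IN_SET) or if it is the last online, recommended core left
 * (KERN_RESOURCE_SHORTAGE).
 */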
7927
7928kern_return_t
7929sched_mark_processor_offline(processor_t processor, processor_reason_t reason)
7930{
7931 assert((processor != master_processor) || (reason == REASON_SYSTEM));
7932 kern_return_t ret = KERN_SUCCESS;
7933
7934 spl_t s = splsched();
7935 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7936
7937 if (reason == REASON_SYSTEM) {
7938 bit_clear(sched_online_processors, processor->cpu_id);
7939 simple_unlock(&sched_available_cores_lock);
7940 splx(s);
7941 return ret;
7942 }
7943
7944 uint64_t available_cores = sched_online_processors & perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores;
7945
7946 if (!bit_test(sched_online_processors, processor->cpu_id)) {
7947 /* Processor is already offline */
7948 ret = KERN_NOT_IN_SET;
7949 } else if (available_cores == BIT(processor->cpu_id)) {
7950 ret = KERN_RESOURCE_SHORTAGE;
7951 } else {
7952 bit_clear(sched_online_processors, processor->cpu_id);
7953 ret = KERN_SUCCESS;
7954 }
7955
7956 simple_unlock(&sched_available_cores_lock);
7957 splx(s);
7958
7959 return ret;
7960}
7961
7962/*
7963 * Apply a new recommended cores mask to the processors it affects
7964 * Runs after considering failsafes and such
7965 *
7966 * Iterate over processors and update their ->is_recommended field.
7967 * If a processor is running, we let it drain out at its next
7968 * quantum expiration or blocking point. If a processor is idle, there
7969 * may be more work for it to do, so IPI it.
7970 *
7971 * interrupts disabled, sched_available_cores_lock is held
7972 */
7973static void
7974sched_update_recommended_cores(uint64_t recommended_cores, processor_reason_t reason, __unused uint32_t flags)
7975{
7976 uint64_t needs_exit_idle_mask = 0x0;
7977
7978 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_START,
7979 recommended_cores,
7980#if __arm64__
7981 perfcontrol_failsafe_active, 0, 0);
7982#else /* __arm64__ */
7983 0, 0, 0);
7984#endif /* ! __arm64__ */
7985
7986 if (__builtin_popcountll(recommended_cores & sched_online_processors) == 0) {
7987 bit_set(recommended_cores, master_processor->cpu_id); /* add boot processor or we hang */
7988 }
7989
7990 /* First set recommended cores */
7991 for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
7993 processor_set_t pset = pset_array[pset_id];
7994
7995 cpumap_t changed_recommendations = (recommended_cores & pset->cpu_bitmask) ^ pset->recommended_bitmask;
7996 cpumap_t newly_recommended = changed_recommendations & recommended_cores;
7997
7998 if (newly_recommended == 0) {
7999 /* Nothing to do */
8000 continue;
8001 }
8002
8003 pset_lock(pset);
8004
			for (int cpu_id = lsb_first(newly_recommended); cpu_id >= 0; cpu_id = lsb_next(newly_recommended, cpu_id)) {
8006 processor_t processor = processor_array[cpu_id];
8007 processor->is_recommended = TRUE;
8008 processor->last_recommend_reason = reason;
8009 bit_set(pset->recommended_bitmask, processor->cpu_id);
8010
8011 if (processor->state == PROCESSOR_IDLE) {
8012 if (processor != current_processor()) {
8013 bit_set(needs_exit_idle_mask, processor->cpu_id);
8014 }
8015 }
8016 if ((processor->state != PROCESSOR_OFF_LINE) && (processor->state != PROCESSOR_PENDING_OFFLINE)) {
8017 os_atomic_inc(&processor_avail_count_user, relaxed);
8018 if (processor->processor_primary == processor) {
8019 os_atomic_inc(&primary_processor_avail_count_user, relaxed);
8020 }
8021 SCHED(pset_made_schedulable)(processor, pset, false);
8022 }
8023 }
8024 pset_update_rt_stealable_state(pset);
8025
8026 pset_unlock(pset);
8027
			for (int cpu_id = lsb_first(newly_recommended); cpu_id >= 0;
			    cpu_id = lsb_next(newly_recommended, cpu_id)) {
8030 smr_cpu_up(processor_array[cpu_id],
8031 SMR_CPU_REASON_IGNORED);
8032 }
8033 }
8034 }
8035
8036 /* Now shutdown not recommended cores */
8037 for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
8039 processor_set_t pset = pset_array[pset_id];
8040
8041 cpumap_t changed_recommendations = (recommended_cores & pset->cpu_bitmask) ^ pset->recommended_bitmask;
8042 cpumap_t newly_unrecommended = changed_recommendations & ~recommended_cores;
8043
8044 if (newly_unrecommended == 0) {
8045 /* Nothing to do */
8046 continue;
8047 }
8048
8049 pset_lock(pset);
8050
			for (int cpu_id = lsb_first(newly_unrecommended); cpu_id >= 0; cpu_id = lsb_next(newly_unrecommended, cpu_id)) {
8052 processor_t processor = processor_array[cpu_id];
8053 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
8054
8055 processor->is_recommended = FALSE;
8056 if (reason != REASON_NONE) {
8057 processor->last_derecommend_reason = reason;
8058 }
8059 bit_clear(pset->recommended_bitmask, processor->cpu_id);
8060 if ((processor->state != PROCESSOR_OFF_LINE) && (processor->state != PROCESSOR_PENDING_OFFLINE)) {
8061 os_atomic_dec(&processor_avail_count_user, relaxed);
8062 if (processor->processor_primary == processor) {
8063 os_atomic_dec(&primary_processor_avail_count_user, relaxed);
8064 }
8065 }
8066 pset_update_rt_stealable_state(pset);
8067
8068 if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
8069 ipi_type = SCHED_IPI_IMMEDIATE;
8070 }
8071 SCHED(processor_queue_shutdown)(processor);
8072 /* pset unlocked */
8073
8074 SCHED(rt_queue_shutdown)(processor);
8075
8076 if (ipi_type == SCHED_IPI_NONE) {
8077 /*
8078 * If the core is idle,
8079 * we can directly mark the processor
8080 * as "Ignored"
8081 *
8082 * Otherwise, smr will detect this
8083 * during smr_cpu_leave() when the
8084 * processor actually idles.
8085 */
8086 smr_cpu_down(processor, SMR_CPU_REASON_IGNORED);
8087 } else if (processor == current_processor()) {
8088 ast_on(AST_PREEMPT);
8089 } else {
					sched_ipi_perform(processor, ipi_type);
8091 }
8092
8093 pset_lock(pset);
8094 }
8095 pset_unlock(pset);
8096 }
8097 }
8098
8099#if defined(__x86_64__)
8100 commpage_update_active_cpus();
8101#endif
8102 /* Issue all pending IPIs now that the pset lock has been dropped */
	for (int cpuid = lsb_first(needs_exit_idle_mask); cpuid >= 0; cpuid = lsb_next(needs_exit_idle_mask, cpuid)) {
8104 processor_t processor = processor_array[cpuid];
8105 machine_signal_idle(processor);
8106 }
8107
8108 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_END,
8109 needs_exit_idle_mask, 0, 0, 0);
8110}
8111
8112static void
8113sched_update_powered_cores(uint64_t requested_powered_cores, processor_reason_t reason, uint32_t flags)
8114{
8115 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_UPDATE_POWERED_CORES) | DBG_FUNC_START,
8116 requested_powered_cores, reason, flags, 0);
8117
8118 assert((flags & (LOCK_STATE | UNLOCK_STATE)) ? (reason == REASON_SYSTEM) && (requested_powered_cores == ALL_CORES_POWERED) : 1);
8119
8120 /*
8121 * Loop through newly set requested_powered_cores and start them.
8122 * Loop through newly cleared requested_powered_cores and shut them down.
8123 */
8124
8125 if ((reason == REASON_CLPC_SYSTEM) || (reason == REASON_CLPC_USER)) {
8126 flags |= SHUTDOWN_TEMPORARY;
8127 }
8128
8129 /* First set powered cores */
8130 cpumap_t started_cores = 0ull;
8131 for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
8133 processor_set_t pset = pset_array[pset_id];
8134
8135 spl_t s = splsched();
8136 pset_lock(pset);
8137 cpumap_t pset_requested_powered_cores = requested_powered_cores & pset->cpu_bitmask;
8138 cpumap_t powered_cores = (pset->cpu_state_map[PROCESSOR_START] | pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING] | pset->cpu_state_map[PROCESSOR_RUNNING]);
8139 cpumap_t requested_changes = pset_requested_powered_cores ^ powered_cores;
8140 pset_unlock(pset);
8141 splx(s);
8142
8143 cpumap_t newly_powered = requested_changes & requested_powered_cores;
8144
8145 cpumap_t cpu_map = newly_powered;
8146
8147 if (flags & (LOCK_STATE | UNLOCK_STATE)) {
8148 /*
8149 * We need to change the lock state even if
8150 * we don't need to change the actual state.
8151 */
8152 cpu_map = pset_requested_powered_cores;
8153 /* But not the master_processor, which is always implicitly locked */
8154 bit_clear(cpu_map, master_processor->cpu_id);
8155 }
8156
8157 if (cpu_map == 0) {
8158 /* Nothing to do */
8159 continue;
8160 }
8161
			for (int cpu_id = lsb_first(cpu_map); cpu_id >= 0; cpu_id = lsb_next(cpu_map, cpu_id)) {
8163 processor_t processor = processor_array[cpu_id];
8164 processor_start_reason(processor, reason, flags);
8165 bit_set(started_cores, cpu_id);
8166 }
8167 }
8168 }
8169 if (flags & WAIT_FOR_LAST_START) {
		for (int cpu_id = lsb_first(started_cores); cpu_id >= 0; cpu_id = lsb_next(started_cores, cpu_id)) {
8171 processor_t processor = processor_array[cpu_id];
8172 processor_wait_for_start(processor);
8173 }
8174 }
8175
8176 /* Now shutdown not powered cores */
8177 for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
8179 processor_set_t pset = pset_array[pset_id];
8180
8181 spl_t s = splsched();
8182 pset_lock(pset);
8183 cpumap_t powered_cores = (pset->cpu_state_map[PROCESSOR_START] | pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING] | pset->cpu_state_map[PROCESSOR_RUNNING]);
8184 cpumap_t requested_changes = (requested_powered_cores & pset->cpu_bitmask) ^ powered_cores;
8185 pset_unlock(pset);
8186 splx(s);
8187
8188 cpumap_t newly_unpowered = requested_changes & ~requested_powered_cores;
8189
8190 if (newly_unpowered == 0) {
8191 /* Nothing to do */
8192 continue;
8193 }
8194
			for (int cpu_id = lsb_first(newly_unpowered); cpu_id >= 0; cpu_id = lsb_next(newly_unpowered, cpu_id)) {
8196 processor_t processor = processor_array[cpu_id];
8197
8198 processor_exit_reason(processor, reason, flags);
8199 }
8200 }
8201 }
8202
8203 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_UPDATE_POWERED_CORES) | DBG_FUNC_END, 0, 0, 0, 0);
8204}
8205
8206void
8207thread_set_options(uint32_t thopt)
8208{
8209 spl_t x;
8210 thread_t t = current_thread();
8211
8212 x = splsched();
8213 thread_lock(t);
8214
8215 t->options |= thopt;
8216
8217 thread_unlock(t);
8218 splx(x);
8219}
8220
8221void
8222thread_set_pending_block_hint(thread_t thread, block_hint_t block_hint)
8223{
8224 thread->pending_block_hint = block_hint;
8225}
8226
8227uint32_t
8228qos_max_parallelism(int qos, uint64_t options)
8229{
8230 return SCHED(qos_max_parallelism)(qos, options);
8231}
8232
8233uint32_t
8234sched_qos_max_parallelism(__unused int qos, uint64_t options)
8235{
8236 host_basic_info_data_t hinfo;
8237 mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
8238
8239
	/*
	 * QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE should only be used on AMP
	 * platforms, which implement their own qos_max_parallelism() interfaces.
	 */
8244 assert((options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) == 0);
8245
8246 /* Query the machine layer for core information */
	__assert_only kern_return_t kret = host_info(host_self(), HOST_BASIC_INFO,
	    (host_info_t)&hinfo, &count);
8249 assert(kret == KERN_SUCCESS);
8250
8251 if (options & QOS_PARALLELISM_COUNT_LOGICAL) {
8252 return hinfo.logical_cpu;
8253 } else {
8254 return hinfo.physical_cpu;
8255 }
8256}
8257
8258int sched_allow_NO_SMT_threads = 1;
8259bool
8260thread_no_smt(thread_t thread)
8261{
8262 return sched_allow_NO_SMT_threads &&
8263 (thread->bound_processor == PROCESSOR_NULL) &&
8264 ((thread->sched_flags & TH_SFLAG_NO_SMT) || (get_threadtask(thread)->t_flags & TF_NO_SMT));
8265}
8266
8267bool
8268processor_active_thread_no_smt(processor_t processor)
8269{
8270 return sched_allow_NO_SMT_threads && !processor->current_is_bound && processor->current_is_NO_SMT;
8271}
8272
8273#if __arm64__
8274
/*
 * Set up a new timer deadline or replace the old one
 *
 * Returns true if it canceled an old timer, false if it did not
 */
8280boolean_t
8281sched_perfcontrol_update_callback_deadline(uint64_t new_deadline)
8282{
8283 /*
8284 * Exchange deadline for new deadline, if old deadline was nonzero,
8285 * then I cancelled the callback, otherwise I didn't
8286 */
8287
8288 return os_atomic_xchg(&sched_perfcontrol_callback_deadline, new_deadline,
8289 relaxed) != 0;
8290}
8291
8292/*
8293 * Set global SFI window (in usec)
8294 */
8295kern_return_t
8296sched_perfcontrol_sfi_set_window(uint64_t window_usecs)
8297{
8298 kern_return_t ret = KERN_NOT_SUPPORTED;
8299#if CONFIG_THREAD_GROUPS
8300 if (window_usecs == 0ULL) {
8301 ret = sfi_window_cancel();
8302 } else {
8303 ret = sfi_set_window(window_usecs);
8304 }
8305#endif // CONFIG_THREAD_GROUPS
8306 return ret;
8307}
8308
8309/*
8310 * Set background and maintenance SFI class offtimes
8311 */
8312kern_return_t
8313sched_perfcontrol_sfi_set_bg_offtime(uint64_t offtime_usecs)
8314{
8315 kern_return_t ret = KERN_NOT_SUPPORTED;
8316#if CONFIG_THREAD_GROUPS
8317 if (offtime_usecs == 0ULL) {
8318 ret = sfi_class_offtime_cancel(SFI_CLASS_MAINTENANCE);
8319 ret |= sfi_class_offtime_cancel(SFI_CLASS_DARWIN_BG);
8320 } else {
8321 ret = sfi_set_class_offtime(SFI_CLASS_MAINTENANCE, offtime_usecs);
8322 ret |= sfi_set_class_offtime(SFI_CLASS_DARWIN_BG, offtime_usecs);
8323 }
8324#endif // CONFIG_THREAD_GROUPS
8325 return ret;
8326}
8327
8328/*
8329 * Set utility SFI class offtime
8330 */
8331kern_return_t
8332sched_perfcontrol_sfi_set_utility_offtime(uint64_t offtime_usecs)
8333{
8334 kern_return_t ret = KERN_NOT_SUPPORTED;
8335#if CONFIG_THREAD_GROUPS
8336 if (offtime_usecs == 0ULL) {
8337 ret = sfi_class_offtime_cancel(SFI_CLASS_UTILITY);
8338 } else {
8339 ret = sfi_set_class_offtime(SFI_CLASS_UTILITY, offtime_usecs);
8340 }
8341#endif // CONFIG_THREAD_GROUPS
8342 return ret;
8343}
8344
8345#endif /* __arm64__ */
8346
8347#if CONFIG_SCHED_EDGE
8348
8349#define SCHED_PSET_LOAD_EWMA_TC_NSECS 10000000u
8350
/*
 * sched_edge_pset_running_higher_bucket()
 *
 * Routine to calculate cumulative running counts for each scheduling
 * bucket. This lets the load calculation determine whether a cluster is
 * running any threads at a QoS lower than that of the thread being
 * migrated.
 */
8359
8360static void
8361sched_edge_pset_running_higher_bucket(processor_set_t pset, uint32_t *running_higher)
8362{
8363 bitmap_t *active_map = &pset->cpu_state_map[PROCESSOR_RUNNING];
8364
8365 /* Edge Scheduler Optimization */
8366 for (int cpu = bitmap_first(active_map, MAX_CPUS); cpu >= 0; cpu = bitmap_next(active_map, cpu)) {
8367 sched_bucket_t cpu_bucket = os_atomic_load(&pset->cpu_running_buckets[cpu], relaxed);
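		/*
		 * Each running CPU counts toward its own bucket and every
		 * numerically higher (lower-priority) bucket, so
		 * running_higher[b] ends up as the number of CPUs running
		 * threads at bucket b's priority or above.
		 */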
8368 for (sched_bucket_t bucket = cpu_bucket; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
8369 running_higher[bucket]++;
8370 }
8371 }
8372}
8373
8374/*
8375 * sched_update_pset_load_average()
8376 *
8377 * Updates the load average for each sched bucket for a cluster.
8378 * This routine must be called with the pset lock held.
8379 */
8380void
8381sched_update_pset_load_average(processor_set_t pset, uint64_t curtime)
8382{
8383 int avail_cpu_count = pset_available_cpu_count(pset);
8384 if (avail_cpu_count == 0) {
8385 /* Looks like the pset is not runnable any more; nothing to do here */
8386 return;
8387 }
8388
8389 /*
8390 * Edge Scheduler Optimization
8391 *
8392 * See if more callers of this routine can pass in timestamps to avoid the
8393 * mach_absolute_time() call here.
8394 */
8395
8396 if (!curtime) {
8397 curtime = mach_absolute_time();
8398 }
8399 uint64_t last_update = os_atomic_load(&pset->pset_load_last_update, relaxed);
8400 int64_t delta_ticks = curtime - last_update;
8401 if (delta_ticks < 0) {
8402 return;
8403 }
8404
8405 uint64_t delta_nsecs = 0;
8406 absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);
8407
8408 if (__improbable(delta_nsecs > UINT32_MAX)) {
8409 delta_nsecs = UINT32_MAX;
8410 }
8411
8412#if CONFIG_SCHED_EDGE
8413 /* Update the shared resource load on the pset */
8414 for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
8415 uint64_t shared_rsrc_runnable_load = sched_edge_shared_rsrc_runnable_load(&pset->pset_clutch_root, shared_rsrc_type);
8416 uint64_t shared_rsrc_running_load = bit_count(pset->cpu_running_cluster_shared_rsrc_thread[shared_rsrc_type]);
8417 uint64_t new_shared_load = shared_rsrc_runnable_load + shared_rsrc_running_load;
8418 uint64_t old_shared_load = os_atomic_xchg(&pset->pset_cluster_shared_rsrc_load[shared_rsrc_type], new_shared_load, relaxed);
8419 if (old_shared_load != new_shared_load) {
8420 KTRC(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_SHARED_LOAD) | DBG_FUNC_NONE, pset->pset_cluster_id, shared_rsrc_type, new_shared_load, shared_rsrc_running_load);
8421 }
8422 }
8423#endif /* CONFIG_SCHED_EDGE */
8424
8425 uint32_t running_higher[TH_BUCKET_SCHED_MAX] = {0};
8426 sched_edge_pset_running_higher_bucket(pset, running_higher);
8427
8428 for (sched_bucket_t sched_bucket = TH_BUCKET_FIXPRI; sched_bucket < TH_BUCKET_SCHED_MAX; sched_bucket++) {
8429 uint64_t old_load_average = os_atomic_load(&pset->pset_load_average[sched_bucket], relaxed);
8430 uint64_t old_load_average_factor = old_load_average * SCHED_PSET_LOAD_EWMA_TC_NSECS;
8431 uint32_t current_runq_depth = (sched_edge_cluster_cumulative_count(&pset->pset_clutch_root, sched_bucket) + rt_runq_count(pset) + running_higher[sched_bucket]) / avail_cpu_count;
8432
		/*
		 * For the new load average, multiply current_runq_depth by delta_nsecs (which results in a 32.0 value).
		 * Since we want to maintain the load average as a 24.8 fixed-point value for precision, the
		 * new load average needs to be shifted before it can be added to the old load average.
		 */
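		/*
		 * Equivalently, with TC = SCHED_PSET_LOAD_EWMA_TC_NSECS and
		 * dt = delta_nsecs (load averages in 24.8 fixed point):
		 *   load_avg' = (load_avg * TC + ((depth * dt) << FRACTION_BITS)) / (dt + TC)
		 */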
8438 uint64_t new_load_average_factor = (current_runq_depth * delta_nsecs) << SCHED_PSET_LOAD_EWMA_FRACTION_BITS;
8439
8440 /*
8441 * For extremely parallel workloads, it is important that the load average on a cluster moves zero to non-zero
8442 * instantly to allow threads to be migrated to other (potentially idle) clusters quickly. Hence use the EWMA
8443 * when the system is already loaded; otherwise for an idle system use the latest load average immediately.
8444 */
8445 int old_load_shifted = (int)((old_load_average + SCHED_PSET_LOAD_EWMA_ROUND_BIT) >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
8446 boolean_t load_uptick = (old_load_shifted == 0) && (current_runq_depth != 0);
8447 boolean_t load_downtick = (old_load_shifted != 0) && (current_runq_depth == 0);
8448 uint64_t load_average;
8449 if (load_uptick || load_downtick) {
8450 load_average = (current_runq_depth << SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
8451 } else {
8452 /* Indicates a loaded system; use EWMA for load average calculation */
8453 load_average = (old_load_average_factor + new_load_average_factor) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
8454 }
8455 os_atomic_store(&pset->pset_load_average[sched_bucket], load_average, relaxed);
8456 if (load_average != old_load_average) {
8457 KTRC(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_LOAD_AVG) | DBG_FUNC_NONE, pset->pset_cluster_id, (load_average >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS), load_average & SCHED_PSET_LOAD_EWMA_FRACTION_MASK, sched_bucket);
8458 }
8459 }
8460 os_atomic_store(&pset->pset_load_last_update, curtime, relaxed);
8461}
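
/*
 * Update the cluster's per-bucket average thread execution time (stored in
 * microseconds) as an EWMA with the same SCHED_PSET_LOAD_EWMA_TC_NSECS time
 * constant used for the load average above.
 */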
8462
8463void
8464sched_update_pset_avg_execution_time(processor_set_t pset, uint64_t execution_time, uint64_t curtime, sched_bucket_t sched_bucket)
8465{
8466 pset_execution_time_t old_execution_time_packed, new_execution_time_packed;
8467 uint64_t avg_thread_execution_time = 0;
8468
8469 os_atomic_rmw_loop(&pset->pset_execution_time[sched_bucket].pset_execution_time_packed,
8470 old_execution_time_packed.pset_execution_time_packed,
8471 new_execution_time_packed.pset_execution_time_packed, relaxed, {
8472 uint64_t last_update = old_execution_time_packed.pset_execution_time_last_update;
8473 int64_t delta_ticks = curtime - last_update;
8474 if (delta_ticks < 0) {
8475 /*
8476 * Its possible that another CPU came in and updated the pset_execution_time
8477 * before this CPU could do it. Since the average execution time is meant to
8478 * be an approximate measure per cluster, ignore the older update.
8479 */
8480 os_atomic_rmw_loop_give_up(return );
8481 }
8482 uint64_t delta_nsecs = 0;
8483 absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);
8484
8485 uint64_t nanotime = 0;
8486 absolutetime_to_nanoseconds(execution_time, &nanotime);
8487 uint64_t execution_time_us = nanotime / NSEC_PER_USEC;
8488
8489 uint64_t old_execution_time = (old_execution_time_packed.pset_avg_thread_execution_time * SCHED_PSET_LOAD_EWMA_TC_NSECS);
8490 uint64_t new_execution_time = (execution_time_us * delta_nsecs);
8491
8492 avg_thread_execution_time = (old_execution_time + new_execution_time) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
8493 new_execution_time_packed.pset_avg_thread_execution_time = avg_thread_execution_time;
8494 new_execution_time_packed.pset_execution_time_last_update = curtime;
8495 });
	if (new_execution_time_packed.pset_avg_thread_execution_time != old_execution_time_packed.pset_avg_thread_execution_time) {
8497 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_AVG_EXEC_TIME) | DBG_FUNC_NONE, pset->pset_cluster_id, avg_thread_execution_time, sched_bucket);
8498 }
8499}
8500
8501uint64_t
8502sched_pset_cluster_shared_rsrc_load(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type)
8503{
8504 return os_atomic_load(&pset->pset_cluster_shared_rsrc_load[shared_rsrc_type], relaxed);
8505}
8506
8507#else /* CONFIG_SCHED_EDGE */
8508
8509void
8510sched_update_pset_load_average(processor_set_t pset, __unused uint64_t curtime)
8511{
8512 int non_rt_load = pset->pset_runq.count;
	int load = ((bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + non_rt_load + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT);
8514 int new_load_average = ((int)pset->load_average + load) >> 1;
8515
8516 pset->load_average = new_load_average;
8517#if (DEVELOPMENT || DEBUG)
8518#if __AMP__
8519 if (pset->pset_cluster_type == PSET_AMP_P) {
8520 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_LOAD_AVERAGE) | DBG_FUNC_NONE, sched_get_pset_load_average(pset, 0), (bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + pset->pset_runq.count + rt_runq_count(pset)));
8521 }
8522#endif
8523#endif
8524}
8525
8526void
8527sched_update_pset_avg_execution_time(__unused processor_set_t pset, __unused uint64_t execution_time, __unused uint64_t curtime, __unused sched_bucket_t sched_bucket)
8528{
8529}
8530
8531#endif /* CONFIG_SCHED_EDGE */
8532
8533/* pset is locked */
8534static bool
8535processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor)
8536{
8537 int cpuid = processor->cpu_id;
8538#if defined(__x86_64__)
8539 if (sched_avoid_cpu0 && (cpuid == 0)) {
8540 return false;
8541 }
8542#endif
8543
8544 cpumap_t fasttrack_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
8545
8546 return bit_test(fasttrack_map, cpuid);
8547}
8548
8549/* pset is locked */
8550static processor_t
8551choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills)
8552{
8553#if defined(__x86_64__)
8554 bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
8555#else
8556 const bool avoid_cpu0 = false;
8557#endif
8558 cpumap_t cpu_map;
8559
8560try_again:
8561 cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
8562 if (skip_processor) {
8563 bit_clear(cpu_map, skip_processor->cpu_id);
8564 }
8565 if (skip_spills) {
8566 cpu_map &= ~pset->rt_pending_spill_cpu_mask;
8567 }
8568
8569 if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
8570 bit_clear(cpu_map, 0);
8571 }
8572
8573 cpumap_t primary_map = cpu_map & pset->primary_map;
8574 if (avoid_cpu0) {
		primary_map = bit_ror64(primary_map, 1);
8576 }
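	/*
	 * Rotating the map right by one moves cpu0 into the most significant
	 * bit, so lsb_first() prefers every other primary before cpu0; the
	 * (rotid + 1) & 63 below maps the rotated index back to a cpu id.
	 */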
8577
	int rotid = lsb_first(primary_map);
8579 if (rotid >= 0) {
8580 int cpuid = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
8581
8582 processor_t processor = processor_array[cpuid];
8583
8584 return processor;
8585 }
8586
8587 if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
8588 goto out;
8589 }
8590
8591 if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
8592 /* Also avoid cpu1 */
8593 bit_clear(cpu_map, 1);
8594 }
8595
8596 /* Consider secondary processors whose primary is actually running a realtime thread */
8597 cpumap_t secondary_map = cpu_map & ~pset->primary_map & (pset->realtime_map << 1);
8598 if (avoid_cpu0) {
8599 /* Also avoid cpu1 */
8600 secondary_map = bit_ror64(bitmap: secondary_map, n: 2);
8601 }
8602 rotid = lsb_first(bitmap: secondary_map);
8603 if (rotid >= 0) {
8604 int cpuid = avoid_cpu0 ? ((rotid + 2) & 63) : rotid;
8605
8606 processor_t processor = processor_array[cpuid];
8607
8608 return processor;
8609 }
8610
8611 /* Consider secondary processors */
8612 secondary_map = cpu_map & ~pset->primary_map;
8613 if (avoid_cpu0) {
8614 /* Also avoid cpu1 */
8615 secondary_map = bit_ror64(bitmap: secondary_map, n: 2);
8616 }
8617 rotid = lsb_first(bitmap: secondary_map);
8618 if (rotid >= 0) {
8619 int cpuid = avoid_cpu0 ? ((rotid + 2) & 63) : rotid;
8620
8621 processor_t processor = processor_array[cpuid];
8622
8623 return processor;
8624 }
8625
	/*
	 * Ideally the compiler would optimize this block away when avoid_cpu0
	 * is a const bool false, but it still complains about the assignment
	 * in that case, hence the explicit #if around it.
	 */
	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
#if defined(__x86_64__)
		avoid_cpu0 = false;
#else
		assert(0);
#endif
		goto try_again;
	}

out:
	if (skip_processor) {
		return PROCESSOR_NULL;
	}

	/*
	 * If we didn't find an obvious processor to choose, but there are still more CPUs
	 * not already running realtime threads than realtime threads in the realtime run queue,
	 * this thread belongs in this pset, so choose some other processor in this pset
	 * to ensure the thread is enqueued here.
	 */
	cpumap_t non_realtime_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
		cpu_map = non_realtime_map;
		assert(cpu_map != 0);
		int cpuid = bit_first(cpu_map);
		assert(cpuid >= 0);
		return processor_array[cpuid];
	}

	if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
		goto skip_secondaries;
	}

	non_realtime_map = pset_available_cpumap(pset) & ~pset->realtime_map;
	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
		cpu_map = non_realtime_map;
		assert(cpu_map != 0);
		int cpuid = bit_first(cpu_map);
		assert(cpuid >= 0);
		return processor_array[cpuid];
	}

skip_secondaries:
	return PROCESSOR_NULL;
}
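
/*
 * Illustrative sketch (not compiled): when avoid_cpu0 is set, the selection
 * above rotates the candidate map right by one bit before taking the lowest
 * set bit, so cpu0 is chosen only when no other primary is free, and the
 * rotated index is mapped back with (rotid + 1) & 63.  The example below walks
 * that trick on a standalone mask; the helper name is invented for the example.
 */
#if 0
static int
example_pick_primary_avoiding_cpu0(cpumap_t candidate_map)
{
	/* Rotate right by one: bit 0 (cpu0) moves to bit 63, bit 1 (cpu1) to bit 0, and so on. */
	cpumap_t rotated = bit_ror64(candidate_map, 1);

	int rotid = lsb_first(rotated);
	if (rotid < 0) {
		return -1;              /* no candidate at all */
	}

	/* Undo the rotation to recover the real cpu id; cpu0 comes out last. */
	return (rotid + 1) & 63;
}
#endif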

/*
 * Choose the processor with (1) the lowest priority less than max_pri and (2) the furthest deadline for that priority.
 * If all available processors are at max_pri, choose the furthest deadline that is greater than minimum_deadline.
 *
 * pset is locked.
 */
static processor_t
choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus)
{
	uint64_t furthest_deadline = deadline_add(minimum_deadline, rt_deadline_epsilon);
	processor_t fd_processor = PROCESSOR_NULL;
	int lowest_priority = max_pri;

	cpumap_t cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask;
	if (skip_processor) {
		bit_clear(cpu_map, skip_processor->cpu_id);
	}
	if (skip_spills) {
		cpu_map &= ~pset->rt_pending_spill_cpu_mask;
	}

	for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
		processor_t processor = processor_array[cpuid];

		if (processor->current_pri > lowest_priority) {
			continue;
		}

		if (processor->current_pri < lowest_priority) {
			lowest_priority = processor->current_pri;
			furthest_deadline = processor->deadline;
			fd_processor = processor;
			continue;
		}

		if (processor->deadline > furthest_deadline) {
			furthest_deadline = processor->deadline;
			fd_processor = processor;
		}
	}

	if (fd_processor) {
		return fd_processor;
	}
	/*
	 * There is a possible race condition when there are multiple processor sets.
	 * choose_processor() takes pset lock A, sees the pending_AST_URGENT_cpu_mask set for a processor in that set and finds no suitable candidate CPU,
	 * so it drops pset lock A and tries to take pset lock B. Meanwhile the pending_AST_URGENT_cpu_mask CPU is looking for a thread to run and holds
	 * pset lock B. It doesn't find any threads (because the candidate thread isn't yet on any run queue), so drops lock B, takes lock A again to clear
	 * the pending_AST_URGENT_cpu_mask bit, and keeps running the current (far deadline) thread. choose_processor() now has lock B and can only find
	 * the lowest count processor in set B so enqueues it on set B's run queue but doesn't IPI anyone. (The lowest count includes all threads,
	 * near and far deadlines, so will prefer a low count of earlier deadlines to a high count of far deadlines, which is suboptimal for EDF scheduling.
	 * To make a better choice we would need to know how many threads with earlier deadlines than the candidate thread exist on each pset's run queue.
	 * But even if we chose the better run queue, we still wouldn't send an IPI in this case.)
	 *
	 * The mitigation is to also look for suitable CPUs that have their pending_AST_URGENT_cpu_mask bit set where there are no earlier deadline threads
	 * on the run queue of that pset.
	 */
	if (include_ast_urgent_pending_cpus && (rt_runq_earliest_deadline(pset) > furthest_deadline)) {
		cpu_map = pset_available_cpumap(pset) & pset->pending_AST_URGENT_cpu_mask;
		assert(skip_processor == PROCESSOR_NULL);
		assert(skip_spills == false);

		for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
			processor_t processor = processor_array[cpuid];

			if (processor->current_pri > lowest_priority) {
				continue;
			}

			if (processor->current_pri < lowest_priority) {
				lowest_priority = processor->current_pri;
				furthest_deadline = processor->deadline;
				fd_processor = processor;
				continue;
			}

			if (processor->deadline > furthest_deadline) {
				furthest_deadline = processor->deadline;
				fd_processor = processor;
			}
		}
	}

	return fd_processor;
}
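
/*
 * Illustrative sketch (not compiled): the preemption-victim search above keeps
 * the processor running at the lowest priority, breaking ties by the furthest
 * deadline beyond (minimum_deadline + rt_deadline_epsilon).  The standalone
 * example below applies the same comparison to plain arrays; the arrays and
 * helper name are invented for the example.
 */
#if 0
static int
example_pick_furthest_deadline_victim(const int *pri, const uint64_t *deadline, int ncpus,
    int max_pri, uint64_t minimum_deadline)
{
	uint64_t furthest_deadline = deadline_add(minimum_deadline, rt_deadline_epsilon);
	int lowest_priority = max_pri;
	int victim = -1;

	for (int i = 0; i < ncpus; i++) {
		if (pri[i] > lowest_priority) {
			continue;               /* running something more important; never a victim */
		}
		if (pri[i] < lowest_priority || deadline[i] > furthest_deadline) {
			/* Lower priority always wins; equal priority only with a later deadline. */
			lowest_priority = pri[i];
			furthest_deadline = deadline[i];
			victim = i;
		}
	}
	return victim;
}
#endif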

/* pset is locked */
static processor_t
choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries)
{
	bool skip_spills = true;
	bool include_ast_urgent_pending_cpus = false;

	processor_t next_processor = choose_processor_for_realtime_thread(pset, skip_processor, consider_secondaries, skip_spills);
	if (next_processor != PROCESSOR_NULL) {
		return next_processor;
	}

	next_processor = choose_furthest_deadline_processor_for_realtime_thread(pset, max_pri, minimum_deadline, skip_processor, skip_spills, include_ast_urgent_pending_cpus);
	return next_processor;
}

#if defined(__x86_64__)
/* pset is locked */
static bool
all_available_primaries_are_running_realtime_threads(processor_set_t pset, bool include_backups)
{
	bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
	int nbackup_cpus = 0;

	if (include_backups && rt_runq_is_low_latency(pset)) {
		nbackup_cpus = sched_rt_n_backup_processors;
	}

	cpumap_t cpu_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
		bit_clear(cpu_map, 0);
	}
	return (rt_runq_count(pset) + nbackup_cpus) > bit_count(cpu_map);
}

/* pset is locked */
static bool
these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map, bool include_backups)
{
	int nbackup_cpus = 0;

	if (include_backups && rt_runq_is_low_latency(pset)) {
		nbackup_cpus = sched_rt_n_backup_processors;
	}

	cpumap_t cpu_map = pset_available_cpumap(pset) & these_map & ~pset->realtime_map;
	return (rt_runq_count(pset) + nbackup_cpus) > bit_count(cpu_map);
}
#endif
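
/*
 * Illustrative sketch (not compiled): both helpers above answer "is realtime
 * demand about to exceed supply?" by comparing the realtime run-queue depth
 * (plus a configurable number of backup processors when the queue is
 * low-latency) against the count of candidate CPUs not already running
 * realtime threads.  The worked numbers in the comment below are invented.
 */
#if 0
static bool
example_rt_demand_exceeds_supply(int rt_runq_depth, int nbackup_cpus, cpumap_t idle_for_rt_map)
{
	/*
	 * e.g. rt_runq_depth = 2, nbackup_cpus = 1, and three candidate CPUs free:
	 * 2 + 1 > 3 is false, so there is still headroom.
	 */
	return (rt_runq_depth + nbackup_cpus) > bit_count(idle_for_rt_map);
}
#endif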

static bool
sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor, bool as_backup)
{
	if (!processor->is_recommended) {
		return false;
	}
	bool ok_to_run_realtime_thread = true;
#if defined(__x86_64__)
	bool spill_pending = bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
	if (spill_pending) {
		return true;
	}
	if (processor->cpu_id == 0) {
		if (sched_avoid_cpu0 == 1) {
			ok_to_run_realtime_thread = these_processors_are_running_realtime_threads(pset, pset->primary_map & ~0x1, as_backup);
		} else if (sched_avoid_cpu0 == 2) {
			ok_to_run_realtime_thread = these_processors_are_running_realtime_threads(pset, ~0x3, as_backup);
		}
	} else if (sched_avoid_cpu0 && (processor->cpu_id == 1) && processor->is_SMT) {
		ok_to_run_realtime_thread = sched_allow_rt_smt && these_processors_are_running_realtime_threads(pset, ~0x2, as_backup);
	} else if (processor->processor_primary != processor) {
		ok_to_run_realtime_thread = (sched_allow_rt_smt && all_available_primaries_are_running_realtime_threads(pset, as_backup));
	}
#else
	(void)pset;
	(void)processor;
	(void)as_backup;
#endif
	return ok_to_run_realtime_thread;
}
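
/*
 * Summarizing the Intel-only policy above: a pending realtime spill always
 * allows the processor to take the thread; cpu0 only accepts realtime work
 * once the other candidate processors are saturated with it (and when cpu0 is
 * being avoided, its SMT sibling cpu1 is restricted the same way); and any
 * other SMT secondary only accepts realtime work when sched_allow_rt_smt is
 * set and all available primaries are already running realtime threads.
 */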

void
sched_pset_made_schedulable(__unused processor_t processor, processor_set_t pset, boolean_t drop_lock)
{
	if (drop_lock) {
		pset_unlock(pset);
	}
}

void
thread_set_no_smt(bool set)
{
	if (!system_is_SMT) {
		/* Not a machine that supports SMT */
		return;
	}

	thread_t thread = current_thread();

	spl_t s = splsched();
	thread_lock(thread);
	if (set) {
		thread->sched_flags |= TH_SFLAG_NO_SMT;
	}
	thread_unlock(thread);
	splx(s);
}

bool
thread_get_no_smt(void)
{
	return current_thread()->sched_flags & TH_SFLAG_NO_SMT;
}
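
/*
 * Illustrative usage sketch (not compiled).  Note that, as written above,
 * thread_set_no_smt(false) does not clear TH_SFLAG_NO_SMT, so the flag is
 * effectively one-way for a thread.
 */
#if 0
static void
example_opt_current_thread_out_of_smt(void)
{
	if (!thread_get_no_smt()) {
		thread_set_no_smt(true);        /* this thread will no longer share a core */
	}
	/* On SMT-capable hardware the flag now reads back as set. */
	assert(thread_get_no_smt());
}
#endif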

extern void task_set_no_smt(task_t);
void
task_set_no_smt(task_t task)
{
	if (!system_is_SMT) {
		/* Not a machine that supports SMT */
		return;
	}

	if (task == TASK_NULL) {
		task = current_task();
	}

	task_lock(task);
	task->t_flags |= TF_NO_SMT;
	task_unlock(task);
}

#if DEBUG || DEVELOPMENT
extern void sysctl_task_set_no_smt(char no_smt);
void
sysctl_task_set_no_smt(char no_smt)
{
	if (!system_is_SMT) {
		/* Not a machine that supports SMT */
		return;
	}

	task_t task = current_task();

	task_lock(task);
	if (no_smt == '1') {
		task->t_flags |= TF_NO_SMT;
	}
	task_unlock(task);
}

extern char sysctl_task_get_no_smt(void);
char
sysctl_task_get_no_smt(void)
{
	task_t task = current_task();

	if (task->t_flags & TF_NO_SMT) {
		return '1';
	}
	return '0';
}
#endif /* DEBUG || DEVELOPMENT */


__private_extern__ void
thread_bind_cluster_type(thread_t thread, char cluster_type, bool soft_bound)
{
#if __AMP__
	spl_t s = splsched();
	thread_lock(thread);
	thread->sched_flags &= ~(TH_SFLAG_BOUND_SOFT);
	thread->th_bound_cluster_id = THREAD_BOUND_CLUSTER_NONE;
	if (soft_bound) {
		thread->sched_flags |= TH_SFLAG_BOUND_SOFT;
	}
	switch (cluster_type) {
	case 'e':
	case 'E':
		if (pset0.pset_cluster_type == PSET_AMP_E) {
			thread->th_bound_cluster_id = pset0.pset_id;
		} else if (pset_node1.psets != PROCESSOR_SET_NULL) {
			thread->th_bound_cluster_id = pset_node1.psets->pset_id;
		}
		break;
	case 'p':
	case 'P':
		if (pset0.pset_cluster_type == PSET_AMP_P) {
			thread->th_bound_cluster_id = pset0.pset_id;
		} else if (pset_node1.psets != PROCESSOR_SET_NULL) {
			thread->th_bound_cluster_id = pset_node1.psets->pset_id;
		}
		break;
	default:
		break;
	}
	thread_unlock(thread);
	splx(s);

	if (thread == current_thread()) {
		thread_block(THREAD_CONTINUE_NULL);
	}
#else /* __AMP__ */
	(void)thread;
	(void)cluster_type;
	(void)soft_bound;
#endif /* __AMP__ */
}
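
/*
 * Illustrative usage sketch (not compiled): on an AMP system this soft-binds
 * the calling thread to the performance cluster, then clears the binding.
 * The helper name and sequence are an example only.
 */
#if 0
static void
example_prefer_pcluster_for_current_thread(void)
{
	/* Soft bind: the scheduler may still run the thread elsewhere under pressure. */
	thread_bind_cluster_type(current_thread(), 'P', true);

	/* ... latency-sensitive work ... */

	/* Any unrecognized cluster type (e.g. '0') leaves the thread unbound. */
	thread_bind_cluster_type(current_thread(), '0', false);
}
#endif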

extern uint32_t thread_bound_cluster_id(thread_t thread);
uint32_t
thread_bound_cluster_id(thread_t thread)
{
	return thread->th_bound_cluster_id;
}

__private_extern__ kern_return_t
thread_bind_cluster_id(thread_t thread, uint32_t cluster_id, thread_bind_option_t options)
{
#if __AMP__

	processor_set_t pset = NULL;

	/* Treat binding to THREAD_BOUND_CLUSTER_NONE as a request to unbind. */
	if ((options & THREAD_UNBIND) || cluster_id == THREAD_BOUND_CLUSTER_NONE) {
		/* If the thread was not actually bound to any cluster, there is nothing to do here */
		if (thread_bound_cluster_id(thread) == THREAD_BOUND_CLUSTER_NONE) {
			return KERN_SUCCESS;
		}
	} else {
		/* Validate the inputs for the bind case */
		int max_clusters = ml_get_cluster_count();
		if (cluster_id >= max_clusters) {
			/* Invalid cluster id */
			return KERN_INVALID_VALUE;
		}
		pset = pset_array[cluster_id];
		if (pset == NULL) {
			/* Cluster has not been initialized yet */
			return KERN_INVALID_VALUE;
		}
		if (options & THREAD_BIND_ELIGIBLE_ONLY) {
			if (SCHED(thread_eligible_for_pset(thread, pset)) == false) {
				/* Thread is not recommended for the cluster type */
				return KERN_INVALID_POLICY;
			}
		}
	}

	spl_t s = splsched();
	thread_lock(thread);

	/* Unbind the thread from its previous bound state */
	thread->sched_flags &= ~(TH_SFLAG_BOUND_SOFT);
	thread->th_bound_cluster_id = THREAD_BOUND_CLUSTER_NONE;

	if (options & THREAD_UNBIND) {
		/* Nothing more to do here */
		goto thread_bind_cluster_complete;
	}

	if (options & THREAD_BIND_SOFT) {
		thread->sched_flags |= TH_SFLAG_BOUND_SOFT;
	}
	thread->th_bound_cluster_id = cluster_id;

thread_bind_cluster_complete:
	thread_unlock(thread);
	splx(s);

	if (thread == current_thread()) {
		thread_block(THREAD_CONTINUE_NULL);
	}
#else /* __AMP__ */
	(void)thread;
	(void)cluster_id;
	(void)options;
#endif /* __AMP__ */
	return KERN_SUCCESS;
}
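
/*
 * Illustrative usage sketch (not compiled): soft-bind the current thread to a
 * caller-chosen cluster only if the scheduler considers it eligible, then
 * unbind.  The helper name and cluster id are assumptions for the example.
 */
#if 0
static kern_return_t
example_soft_bind_current_thread_to_cluster(uint32_t cluster_id)
{
	kern_return_t kr = thread_bind_cluster_id(current_thread(), cluster_id,
	    THREAD_BIND_SOFT | THREAD_BIND_ELIGIBLE_ONLY);
	if (kr != KERN_SUCCESS) {
		return kr;              /* invalid cluster, or thread not eligible */
	}

	/* ... run on the requested cluster ... */

	/* Unbind; with THREAD_UNBIND the cluster_id argument is ignored. */
	return thread_bind_cluster_id(current_thread(), 0, THREAD_UNBIND);
}
#endif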

#if DEVELOPMENT || DEBUG
extern int32_t sysctl_get_bound_cpuid(void);
int32_t
sysctl_get_bound_cpuid(void)
{
	int32_t cpuid = -1;
	thread_t self = current_thread();

	processor_t processor = self->bound_processor;
	if (processor == NULL) {
		cpuid = -1;
	} else {
		cpuid = processor->cpu_id;
	}

	return cpuid;
}

extern kern_return_t sysctl_thread_bind_cpuid(int32_t cpuid);
kern_return_t
sysctl_thread_bind_cpuid(int32_t cpuid)
{
	processor_t processor = PROCESSOR_NULL;

	if (cpuid == -1) {
		goto unbind;
	}

	if (cpuid < 0 || cpuid >= MAX_SCHED_CPUS) {
		return KERN_INVALID_VALUE;
	}

	processor = processor_array[cpuid];
	if (processor == PROCESSOR_NULL) {
		return KERN_INVALID_VALUE;
	}

#if __AMP__

	thread_t thread = current_thread();

	if (thread->th_bound_cluster_id != THREAD_BOUND_CLUSTER_NONE) {
		if ((thread->sched_flags & TH_SFLAG_BOUND_SOFT) == 0) {
			/* Cannot hard-bind an already hard-cluster-bound thread */
			return KERN_NOT_SUPPORTED;
		}
	}

#endif /* __AMP__ */

unbind:
	thread_bind(processor);

	thread_block(THREAD_CONTINUE_NULL);
	return KERN_SUCCESS;
}
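
/*
 * Illustrative usage sketch (not compiled): hard-bind the current thread to a
 * specific CPU for a measurement, then release the binding by passing -1.
 * The helper name and cpu id are assumptions for the example.
 */
#if 0
static void
example_measure_on_cpu(int32_t cpuid)
{
	if (sysctl_thread_bind_cpuid(cpuid) != KERN_SUCCESS) {
		return;                 /* invalid or uninitialized CPU */
	}
	assert(sysctl_get_bound_cpuid() == cpuid);

	/* ... run the measurement pinned to this CPU ... */

	(void)sysctl_thread_bind_cpuid(-1);     /* unbind */
}
#endif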

extern char sysctl_get_task_cluster_type(void);
char
sysctl_get_task_cluster_type(void)
{
	task_t task = current_task();
	processor_set_t pset_hint = task->pset_hint;

	if (!pset_hint) {
		return '0';
	}

#if __AMP__
	if (pset_hint->pset_cluster_type == PSET_AMP_E) {
		return 'E';
	} else if (pset_hint->pset_cluster_type == PSET_AMP_P) {
		return 'P';
	}
#endif

	return '0';
}
#if __AMP__
static processor_set_t
find_pset_of_type(pset_cluster_type_t t)
{
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		if (node->pset_cluster_type != t) {
			continue;
		}

		processor_set_t pset = PROCESSOR_SET_NULL;
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
			pset = pset_array[pset_id];
			/* Prefer a pset with recommended processors */
			if (pset->recommended_bitmask != 0) {
				assert(pset->pset_cluster_type == t);
				return pset;
			}
		}
		/* Otherwise return whatever was found last */
		return pset;
	}

	return PROCESSOR_SET_NULL;
}
#endif

extern void sysctl_task_set_cluster_type(char cluster_type);
void
sysctl_task_set_cluster_type(char cluster_type)
{
	task_t task = current_task();
	processor_set_t pset_hint = PROCESSOR_SET_NULL;

#if __AMP__
	switch (cluster_type) {
	case 'e':
	case 'E':
		pset_hint = find_pset_of_type(PSET_AMP_E);
		break;
	case 'p':
	case 'P':
		pset_hint = find_pset_of_type(PSET_AMP_P);
		break;
	default:
		break;
	}

	if (pset_hint) {
		task_lock(task);
		task->t_flags |= TF_USE_PSET_HINT_CLUSTER_TYPE;
		task->pset_hint = pset_hint;
		task_unlock(task);

		thread_block(THREAD_CONTINUE_NULL);
	}
#else
	(void)cluster_type;
	(void)task;
	(void)pset_hint;
#endif
}
/*
 * The quantum length used for the Fixed and RT scheduling modes. In general
 * the quantum can vary, for example for background or QoS-constrained threads.
 */
extern uint64_t sysctl_get_quantum_us(void);
uint64_t
sysctl_get_quantum_us(void)
{
	uint32_t quantum;
	uint64_t quantum_ns;

	quantum = SCHED(initial_quantum_size)(THREAD_NULL);
	absolutetime_to_nanoseconds(quantum, &quantum_ns);

	return quantum_ns / 1000;
}
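
/*
 * Worked example (the timeslice value is an assumption, not taken from this
 * file): with a typical 10 ms default timeslice, initial_quantum_size()
 * returns that span in mach absolute time units, absolutetime_to_nanoseconds()
 * converts it to 10,000,000 ns, and the division by 1000 reports 10000 us to
 * the sysctl caller.
 */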

#endif /* DEVELOPMENT || DEBUG */
