1/*
2 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_FREE_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58/*
59 * File: sched_prim.c
60 * Author: Avadis Tevanian, Jr.
61 * Date: 1986
62 *
63 * Scheduling primitives
64 *
65 */
66
67#include <debug.h>
68
69#include <mach/mach_types.h>
70#include <mach/machine.h>
71#include <mach/policy.h>
72#include <mach/sync_policy.h>
73#include <mach/thread_act.h>
74
75#include <machine/machine_routines.h>
76#include <machine/sched_param.h>
77#include <machine/machine_cpu.h>
78#include <machine/machlimits.h>
79#include <machine/atomic.h>
80
81#include <machine/commpage.h>
82
83#include <kern/kern_types.h>
84#include <kern/backtrace.h>
85#include <kern/clock.h>
86#include <kern/counters.h>
87#include <kern/cpu_number.h>
88#include <kern/cpu_data.h>
89#include <kern/smp.h>
90#include <kern/debug.h>
91#include <kern/macro_help.h>
92#include <kern/machine.h>
93#include <kern/misc_protos.h>
94#if MONOTONIC
95#include <kern/monotonic.h>
96#endif /* MONOTONIC */
97#include <kern/processor.h>
98#include <kern/queue.h>
99#include <kern/sched.h>
100#include <kern/sched_prim.h>
101#include <kern/sfi.h>
102#include <kern/syscall_subr.h>
103#include <kern/task.h>
104#include <kern/thread.h>
105#include <kern/ledger.h>
106#include <kern/timer_queue.h>
107#include <kern/waitq.h>
108#include <kern/policy_internal.h>
109#include <kern/cpu_quiesce.h>
110
111#include <vm/pmap.h>
112#include <vm/vm_kern.h>
113#include <vm/vm_map.h>
114#include <vm/vm_pageout.h>
115
116#include <mach/sdt.h>
117#include <mach/mach_host.h>
118#include <mach/host_info.h>
119
120#include <sys/kdebug.h>
121#include <kperf/kperf.h>
122#include <kern/kpc.h>
123#include <san/kasan.h>
124#include <kern/pms.h>
125#include <kern/host.h>
126#include <stdatomic.h>
127
128int rt_runq_count(processor_set_t pset)
129{
130 return atomic_load_explicit(&SCHED(rt_runq)(pset)->count, memory_order_relaxed);
131}
132
133void rt_runq_count_incr(processor_set_t pset)
134{
135 atomic_fetch_add_explicit(&SCHED(rt_runq)(pset)->count, 1, memory_order_relaxed);
136}
137
138void rt_runq_count_decr(processor_set_t pset)
139{
140 atomic_fetch_sub_explicit(&SCHED(rt_runq)(pset)->count, 1, memory_order_relaxed);
141}
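/*
 * Illustrative sketch (not part of this file; the "example_" names are
 * hypothetical): the accessors above use C11 relaxed atomics because the
 * RT run-queue count is only an advisory hint, so readers tolerate a
 * briefly stale value and no ordering with other memory is required.
 * The same pattern in stand-alone form:
 */
#if 0
#include <stdatomic.h>

struct example_counter {
	_Atomic int count;
};

static inline int
example_counter_read(struct example_counter *c)
{
	/* relaxed load: an atomic snapshot is enough, no ordering needed */
	return atomic_load_explicit(&c->count, memory_order_relaxed);
}

static inline void
example_counter_incr(struct example_counter *c)
{
	/* relaxed increment: the count is advisory only */
	atomic_fetch_add_explicit(&c->count, 1, memory_order_relaxed);
}
#endif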
142
143#define DEFAULT_PREEMPTION_RATE 100 /* (1/s) */
144int default_preemption_rate = DEFAULT_PREEMPTION_RATE;
145
146#define DEFAULT_BG_PREEMPTION_RATE 400 /* (1/s) */
147int default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
148
149#define MAX_UNSAFE_QUANTA 800
150int max_unsafe_quanta = MAX_UNSAFE_QUANTA;
151
152#define MAX_POLL_QUANTA 2
153int max_poll_quanta = MAX_POLL_QUANTA;
154
155#define SCHED_POLL_YIELD_SHIFT 4 /* 1/16 */
156int sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;
157
158uint64_t max_poll_computation;
159
160uint64_t max_unsafe_computation;
161uint64_t sched_safe_duration;
162
163#if defined(CONFIG_SCHED_TIMESHARE_CORE)
164
165uint32_t std_quantum;
166uint32_t min_std_quantum;
167uint32_t bg_quantum;
168
169uint32_t std_quantum_us;
170uint32_t bg_quantum_us;
171
172#endif /* CONFIG_SCHED_TIMESHARE_CORE */
173
174uint32_t thread_depress_time;
175uint32_t default_timeshare_computation;
176uint32_t default_timeshare_constraint;
177
178uint32_t max_rt_quantum;
179uint32_t min_rt_quantum;
180
181#if defined(CONFIG_SCHED_TIMESHARE_CORE)
182
183unsigned sched_tick;
184uint32_t sched_tick_interval;
185
186/* Timeshare load calculation interval (15ms) */
187uint32_t sched_load_compute_interval_us = 15000;
188uint64_t sched_load_compute_interval_abs;
189static _Atomic uint64_t sched_load_compute_deadline;
190
191uint32_t sched_pri_shifts[TH_BUCKET_MAX];
192uint32_t sched_fixed_shift;
193
194uint32_t sched_decay_usage_age_factor = 1; /* accelerate 5/8^n usage aging */
195
196/* Allow foreground to decay past default to resolve inversions */
197#define DEFAULT_DECAY_BAND_LIMIT ((BASEPRI_FOREGROUND - BASEPRI_DEFAULT) + 2)
198int sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
199
200/* Defaults for timer deadline profiling */
201#define TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT 2000000 /* Timers with deadlines <=
202 * 2ms */
203#define TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT 5000000 /* Timers with deadlines
204 * <= 5ms */
205
206uint64_t timer_deadline_tracking_bin_1;
207uint64_t timer_deadline_tracking_bin_2;
208
209#endif /* CONFIG_SCHED_TIMESHARE_CORE */
210
211thread_t sched_maintenance_thread;
212
213#if __arm__ || __arm64__
214/* interrupts disabled lock to guard recommended cores state */
215decl_simple_lock_data(static,sched_recommended_cores_lock);
216static void sched_recommended_cores_maintenance(void);
217static void sched_update_recommended_cores(uint32_t recommended_cores);
218
219uint64_t perfcontrol_failsafe_starvation_threshold;
220extern char *proc_name_address(struct proc *p);
221
222#endif /* __arm__ || __arm64__ */
223
224uint64_t sched_one_second_interval;
225
226/* Forwards */
227
228#if defined(CONFIG_SCHED_TIMESHARE_CORE)
229
230static void load_shift_init(void);
231static void preempt_pri_init(void);
232
233#endif /* CONFIG_SCHED_TIMESHARE_CORE */
234
235#if CONFIG_SCHED_IDLE_IN_PLACE
236static thread_t thread_select_idle(
237 thread_t thread,
238 processor_t processor);
239#endif
240
241thread_t processor_idle(
242 thread_t thread,
243 processor_t processor);
244
245ast_t
246csw_check_locked( processor_t processor,
247 processor_set_t pset,
248 ast_t check_reason);
249
250static void processor_setrun(
251 processor_t processor,
252 thread_t thread,
253 integer_t options);
254
255static void
256sched_realtime_timebase_init(void);
257
258static void
259sched_timer_deadline_tracking_init(void);
260
261#if DEBUG
262extern int debug_task;
263#define TLOG(a, fmt, args...) if(debug_task & a) kprintf(fmt, ## args)
264#else
265#define TLOG(a, fmt, args...) do {} while (0)
266#endif
267
268static processor_t
269thread_bind_internal(
270 thread_t thread,
271 processor_t processor);
272
273static void
274sched_vm_group_maintenance(void);
275
276#if defined(CONFIG_SCHED_TIMESHARE_CORE)
277int8_t sched_load_shifts[NRQS];
278bitmap_t sched_preempt_pri[BITMAP_LEN(NRQS)];
279#endif /* CONFIG_SCHED_TIMESHARE_CORE */
280
281const struct sched_dispatch_table *sched_current_dispatch = NULL;
282
283/*
284 * Statically allocate a buffer to hold the longest possible
285 * scheduler description string, as currently implemented.
286 * bsd/kern/kern_sysctl.c has a corresponding definition in bsd/
287 * to export to userspace via sysctl(3). If either version
288 * changes, update the other.
289 *
290 * Note that in addition to being an upper bound on the strings
291 * in the kernel, it's also an exact parameter to PE_get_default(),
292 * which interrogates the device tree on some platforms. That
293 * API requires the caller know the exact size of the device tree
294 * property, so we need both a legacy size (32) and the current size
295 * (48) to deal with old and new device trees. The device tree property
296 * is similarly padded to a fixed size so that the same kernel image
297 * can run on multiple devices with different schedulers configured
298 * in the device tree.
299 */
300char sched_string[SCHED_STRING_MAX_LENGTH];
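/*
 * Illustrative sketch (hypothetical property name and helper, not built):
 * how a fixed-size device tree property of the kind described above can be
 * read. Because PE_get_default() must be passed the exact property length,
 * a consumer that supports both old and new device trees tries the current
 * padded size first and then falls back to the legacy size.
 */
#if 0
static void
example_read_sched_property(char *buf)
{
	/* buf must hold at least SCHED_STRING_MAX_LENGTH bytes */
	if (!PE_get_default("kern.sched", buf, SCHED_STRING_MAX_LENGTH) &&
	    !PE_get_default("kern.sched", buf, 32)) {
		buf[0] = '\0';	/* no device tree override present */
	}
}
#endif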
301
302uint32_t sched_debug_flags = SCHED_DEBUG_FLAG_CHOOSE_PROCESSOR_TRACEPOINTS;
303
304/* Global flag which indicates whether Background Stepper Context is enabled */
305static int cpu_throttle_enabled = 1;
306
307#if DEBUG
308
309/* Since using the indirect function dispatch table has a negative impact on
310 * context switch performance, only allow DEBUG kernels to use that mechanism.
311 */
312static void
313sched_init_override(void)
314{
315 char sched_arg[SCHED_STRING_MAX_LENGTH] = { '\0' };
316
317 /* Check for runtime selection of the scheduler algorithm */
318 if (!PE_parse_boot_argn("sched", sched_arg, sizeof (sched_arg))) {
319 sched_arg[0] = '\0';
320 }
321 if (strlen(sched_arg) > 0) {
322 if (0) {
323 /* Allow pattern below */
324#if defined(CONFIG_SCHED_TRADITIONAL)
325 } else if (0 == strcmp(sched_arg, sched_traditional_dispatch.sched_name)) {
326 sched_current_dispatch = &sched_traditional_dispatch;
327 } else if (0 == strcmp(sched_arg, sched_traditional_with_pset_runqueue_dispatch.sched_name)) {
328 sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
329#endif
330#if defined(CONFIG_SCHED_MULTIQ)
331 } else if (0 == strcmp(sched_arg, sched_multiq_dispatch.sched_name)) {
332 sched_current_dispatch = &sched_multiq_dispatch;
333 } else if (0 == strcmp(sched_arg, sched_dualq_dispatch.sched_name)) {
334 sched_current_dispatch = &sched_dualq_dispatch;
335#endif
336 } else {
337#if defined(CONFIG_SCHED_TRADITIONAL)
338 printf("Unrecognized scheduler algorithm: %s\n", sched_arg);
339 printf("Scheduler: Using instead: %s\n", sched_traditional_with_pset_runqueue_dispatch.sched_name);
340 sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
341#else
342 panic("Unrecognized scheduler algorithm: %s", sched_arg);
343#endif
344 }
345 kprintf("Scheduler: Runtime selection of %s\n", SCHED(sched_name));
346 } else {
347#if defined(CONFIG_SCHED_MULTIQ)
348 sched_current_dispatch = &sched_dualq_dispatch;
349#elif defined(CONFIG_SCHED_TRADITIONAL)
350 sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
351#else
352#error No default scheduler implementation
353#endif
354 kprintf("Scheduler: Default of %s\n", SCHED(sched_name));
355 }
356}
357
358#endif /* DEBUG */
359
360void
361sched_init(void)
362{
363#if DEBUG
364 sched_init_override();
365#else /* DEBUG */
366 kprintf("Scheduler: Default of %s\n", SCHED(sched_name));
367#endif /* DEBUG */
368
369 if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) {
370 /* No boot-args, check in device tree */
371 if (!PE_get_default("kern.sched_pri_decay_limit",
372 &sched_pri_decay_band_limit,
373 sizeof(sched_pri_decay_band_limit))) {
374 /* Allow decay all the way to normal limits */
375 sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
376 }
377 }
378
379 kprintf("Setting scheduler priority decay band limit %d\n", sched_pri_decay_band_limit);
380
381 if (PE_parse_boot_argn("sched_debug", &sched_debug_flags, sizeof(sched_debug_flags))) {
382 kprintf("Scheduler: Debug flags 0x%08x\n", sched_debug_flags);
383 }
384 strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string));
385
386 cpu_quiescent_counter_init();
387
388 SCHED(init)();
389 SCHED(rt_init)(&pset0);
390 sched_timer_deadline_tracking_init();
391
392 SCHED(pset_init)(&pset0);
393 SCHED(processor_init)(master_processor);
394}
395
396void
397sched_timebase_init(void)
398{
399 uint64_t abstime;
400
401 clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime);
402 sched_one_second_interval = abstime;
403
404 SCHED(timebase_init)();
405 sched_realtime_timebase_init();
406}
407
408#if defined(CONFIG_SCHED_TIMESHARE_CORE)
409
410void
411sched_timeshare_init(void)
412{
413 /*
414 * Calculate the timeslicing quantum
415 * in us.
416 */
417 if (default_preemption_rate < 1)
418 default_preemption_rate = DEFAULT_PREEMPTION_RATE;
419 std_quantum_us = (1000 * 1000) / default_preemption_rate;
420
421 printf("standard timeslicing quantum is %d us\n", std_quantum_us);
422
423 if (default_bg_preemption_rate < 1)
424 default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
425 bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate;
426
427 printf("standard background quantum is %d us\n", bg_quantum_us);
428
429 load_shift_init();
430 preempt_pri_init();
431 sched_tick = 0;
432}
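/*
 * Worked example with the defaults above: DEFAULT_PREEMPTION_RATE of 100/s
 * gives std_quantum_us = 1000000 / 100 = 10000 us (10 ms), and
 * DEFAULT_BG_PREEMPTION_RATE of 400/s gives bg_quantum_us = 1000000 / 400 =
 * 2500 us (2.5 ms); overriding either rate scales its quantum inversely.
 */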
433
434void
435sched_timeshare_timebase_init(void)
436{
437 uint64_t abstime;
438 uint32_t shift;
439
440 /* standard timeslicing quantum */
441 clock_interval_to_absolutetime_interval(
442 std_quantum_us, NSEC_PER_USEC, &abstime);
443 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
444 std_quantum = (uint32_t)abstime;
445
446 /* smallest remaining quantum (250 us) */
447 clock_interval_to_absolutetime_interval(250, NSEC_PER_USEC, &abstime);
448 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
449 min_std_quantum = (uint32_t)abstime;
450
451 /* quantum for background tasks */
452 clock_interval_to_absolutetime_interval(
453 bg_quantum_us, NSEC_PER_USEC, &abstime);
454 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
455 bg_quantum = (uint32_t)abstime;
456
457 /* scheduler tick interval */
458 clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT,
459 NSEC_PER_USEC, &abstime);
460 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
461 sched_tick_interval = (uint32_t)abstime;
462
463 /* timeshare load calculation interval & deadline initialization */
464 clock_interval_to_absolutetime_interval(sched_load_compute_interval_us, NSEC_PER_USEC, &sched_load_compute_interval_abs);
465 sched_load_compute_deadline = sched_load_compute_interval_abs;
466
467 /*
468 * Compute conversion factor from usage to
469 * timesharing priorities with 5/8 ** n aging.
470 */
471 abstime = (abstime * 5) / 3;
472 for (shift = 0; abstime > BASEPRI_DEFAULT; ++shift)
473 abstime >>= 1;
474 sched_fixed_shift = shift;
475
476 for (uint32_t i = 0 ; i < TH_BUCKET_MAX ; i++)
477 sched_pri_shifts[i] = INT8_MAX;
478
479 max_unsafe_computation = ((uint64_t)max_unsafe_quanta) * std_quantum;
480 sched_safe_duration = 2 * ((uint64_t)max_unsafe_quanta) * std_quantum;
481
482 max_poll_computation = ((uint64_t)max_poll_quanta) * std_quantum;
483 thread_depress_time = 1 * std_quantum;
484 default_timeshare_computation = std_quantum / 2;
485 default_timeshare_constraint = std_quantum;
486
487#if __arm__ || __arm64__
488 perfcontrol_failsafe_starvation_threshold = (2 * sched_tick_interval);
489#endif /* __arm__ || __arm64__ */
490}
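/*
 * Worked example of the derived values above, assuming the default 10 ms
 * std_quantum: max_unsafe_computation = 800 quanta = 8 s,
 * sched_safe_duration = 16 s, max_poll_computation = 2 quanta = 20 ms,
 * thread_depress_time = 10 ms, default_timeshare_computation = 5 ms and
 * default_timeshare_constraint = 10 ms. The scheduler tick interval is
 * USEC_PER_SEC >> SCHED_TICK_SHIFT microseconds, i.e. one second divided
 * by 2^SCHED_TICK_SHIFT.
 */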
491
492#endif /* CONFIG_SCHED_TIMESHARE_CORE */
493
494void
495pset_rt_init(processor_set_t pset)
496{
497 rt_lock_init(pset);
498
499 pset->rt_runq.count = 0;
500 queue_init(&pset->rt_runq.queue);
501 memset(&pset->rt_runq.runq_stats, 0, sizeof pset->rt_runq.runq_stats);
502}
503
504rt_queue_t
505sched_rtglobal_runq(processor_set_t pset)
506{
507 (void)pset;
508
509 return &pset0.rt_runq;
510}
511
512void
513sched_rtglobal_init(processor_set_t pset)
514{
515 if (pset == &pset0) {
516 return pset_rt_init(pset);
517 }
518
519 /* Only pset0 rt_runq is used, so make it easy to detect
520 * buggy accesses to others.
521 */
522 memset(&pset->rt_runq, 0xfd, sizeof pset->rt_runq);
523}
524
525void
526sched_rtglobal_queue_shutdown(processor_t processor)
527{
528 (void)processor;
529}
530
531static void
532sched_realtime_timebase_init(void)
533{
534 uint64_t abstime;
535
536 /* smallest rt computation (50 us) */
537 clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
538 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
539 min_rt_quantum = (uint32_t)abstime;
540
541 /* maximum rt computation (50 ms) */
542 clock_interval_to_absolutetime_interval(
543 50, 1000*NSEC_PER_USEC, &abstime);
544 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
545 max_rt_quantum = (uint32_t)abstime;
546
547}
548
549void
550sched_check_spill(processor_set_t pset, thread_t thread)
551{
552 (void)pset;
553 (void)thread;
554
555 return;
556}
557
558bool
559sched_thread_should_yield(processor_t processor, thread_t thread)
560{
561 (void)thread;
562
563 return (!SCHED(processor_queue_empty)(processor) || rt_runq_count(processor->processor_set) > 0);
564}
565
566#if defined(CONFIG_SCHED_TIMESHARE_CORE)
567
568/*
569 * Set up values for timeshare
570 * loading factors.
571 */
572static void
573load_shift_init(void)
574{
575 int8_t k, *p = sched_load_shifts;
576 uint32_t i, j;
577
578 uint32_t sched_decay_penalty = 1;
579
580 if (PE_parse_boot_argn("sched_decay_penalty", &sched_decay_penalty, sizeof (sched_decay_penalty))) {
581 kprintf("Overriding scheduler decay penalty %u\n", sched_decay_penalty);
582 }
583
584 if (PE_parse_boot_argn("sched_decay_usage_age_factor", &sched_decay_usage_age_factor, sizeof (sched_decay_usage_age_factor))) {
585 kprintf("Overriding scheduler decay usage age factor %u\n", sched_decay_usage_age_factor);
586 }
587
588 if (sched_decay_penalty == 0) {
589 /*
590 * There is no penalty for timeshare threads for using too much
591 * CPU, so set all load shifts to INT8_MIN. Even under high load,
592 * sched_pri_shift will be >INT8_MAX, and there will be no
593 * penalty applied to threads (nor will sched_usage be updated per
594 * thread).
595 */
596 for (i = 0; i < NRQS; i++) {
597 sched_load_shifts[i] = INT8_MIN;
598 }
599
600 return;
601 }
602
603 *p++ = INT8_MIN; *p++ = 0;
604
605 /*
606 * For a given system load "i", the per-thread priority
607 * penalty per quantum of CPU usage is ~2^k priority
608 * levels. "sched_decay_penalty" can cause more
609 * array entries to be filled with smaller "k" values
610 */
611 for (i = 2, j = 1 << sched_decay_penalty, k = 1; i < NRQS; ++k) {
612 for (j <<= 1; (i < j) && (i < NRQS); ++i)
613 *p++ = k;
614 }
615}
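/*
 * Worked example, assuming the default sched_decay_penalty of 1: the loops
 * above fill sched_load_shifts[] as
 *	load 0 -> INT8_MIN (no penalty), load 1 -> 0,
 *	loads 2-3 -> 1, loads 4-7 -> 2, loads 8-15 -> 3, ...
 * i.e. k is roughly floor(log2(load)), so the per-quantum penalty of
 * ~2^k priority levels tracks the run-queue load. Raising
 * sched_decay_penalty fills more entries with smaller k, which (per the
 * comment above) reduces the per-quantum penalty at the same load.
 */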
616
617static void
618preempt_pri_init(void)
619{
620 bitmap_t *p = sched_preempt_pri;
621
622 for (int i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i)
623 bitmap_set(p, i);
624
625 for (int i = BASEPRI_PREEMPT; i <= MAXPRI; ++i)
626 bitmap_set(p, i);
627}
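/*
 * Illustrative sketch (hypothetical helper, not built): consumers of the
 * bitmap initialized above test a single bit to decide whether a newly
 * runnable priority is "urgent" enough to warrant immediate preemption,
 * along the lines of:
 */
#if 0
static boolean_t
example_priority_is_urgent(int priority)
{
	return bitmap_test(sched_preempt_pri, priority) ? TRUE : FALSE;
}
#endif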
628
629#endif /* CONFIG_SCHED_TIMESHARE_CORE */
630
631/*
632 * Thread wait timer expiration.
633 */
634void
635thread_timer_expire(
636 void *p0,
637 __unused void *p1)
638{
639 thread_t thread = p0;
640 spl_t s;
641
642 assert_thread_magic(thread);
643
644 s = splsched();
645 thread_lock(thread);
646 if (--thread->wait_timer_active == 0) {
647 if (thread->wait_timer_is_set) {
648 thread->wait_timer_is_set = FALSE;
649 clear_wait_internal(thread, THREAD_TIMED_OUT);
650 }
651 }
652 thread_unlock(thread);
653 splx(s);
654}
655
656/*
657 * thread_unblock:
658 *
659 * Unblock thread on wake up.
660 *
661 * Returns TRUE if the thread should now be placed on the runqueue.
662 *
663 * Thread must be locked.
664 *
665 * Called at splsched().
666 */
667boolean_t
668thread_unblock(
669 thread_t thread,
670 wait_result_t wresult)
671{
672 boolean_t ready_for_runq = FALSE;
673 thread_t cthread = current_thread();
674 uint32_t new_run_count;
675 int old_thread_state;
676
677 /*
678 * Set wait_result.
679 */
680 thread->wait_result = wresult;
681
682 /*
683 * Cancel pending wait timer.
684 */
685 if (thread->wait_timer_is_set) {
686 if (timer_call_cancel(&thread->wait_timer))
687 thread->wait_timer_active--;
688 thread->wait_timer_is_set = FALSE;
689 }
690
691 /*
692 * Update scheduling state: not waiting,
693 * set running.
694 */
695 old_thread_state = thread->state;
696 thread->state = (old_thread_state | TH_RUN) &
697 ~(TH_WAIT|TH_UNINT|TH_WAIT_REPORT);
698
699 if ((old_thread_state & TH_RUN) == 0) {
700 uint64_t ctime = mach_approximate_time();
701 thread->last_made_runnable_time = thread->last_basepri_change_time = ctime;
702 timer_start(&thread->runnable_timer, ctime);
703
704 ready_for_runq = TRUE;
705
706 if (old_thread_state & TH_WAIT_REPORT) {
707 (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
708 }
709
710 /* Update the runnable thread count */
711 new_run_count = sched_run_incr(thread);
712 } else {
713 /*
714 * Either the thread is idling in place on another processor,
715 * or it hasn't finished context switching yet.
716 */
717#if CONFIG_SCHED_IDLE_IN_PLACE
718 if (thread->state & TH_IDLE) {
719 processor_t processor = thread->last_processor;
720
721 if (processor != current_processor())
722 machine_signal_idle(processor);
723 }
724#else
725 assert((thread->state & TH_IDLE) == 0);
726#endif
727 /*
728 * The run count is only dropped after the context switch completes
729 * and the thread is still waiting, so we should not run_incr here
730 */
731 new_run_count = sched_run_buckets[TH_BUCKET_RUN];
732 }
733
734
735 /*
736 * Calculate deadline for real-time threads.
737 */
738 if (thread->sched_mode == TH_MODE_REALTIME) {
739 uint64_t ctime;
740
741 ctime = mach_absolute_time();
742 thread->realtime.deadline = thread->realtime.constraint + ctime;
743 }
744
745 /*
746 * Clear old quantum, fail-safe computation, etc.
747 */
748 thread->quantum_remaining = 0;
749 thread->computation_metered = 0;
750 thread->reason = AST_NONE;
751 thread->block_hint = kThreadWaitNone;
752
753 /* Obtain power-relevant interrupt and "platform-idle exit" statistics.
754 * We also account for "double hop" thread signaling via
755 * the thread callout infrastructure.
756 * DRK: consider removing the callout wakeup counters in the future;
757 * they're present for verification at the moment.
758 */
759 boolean_t aticontext, pidle;
760 ml_get_power_state(&aticontext, &pidle);
761
762 if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) {
763 DTRACE_SCHED2(iwakeup, struct thread *, thread, struct proc *, thread->task->bsd_info);
764
765 uint64_t ttd = PROCESSOR_DATA(current_processor(), timer_call_ttd);
766
767 if (ttd) {
768 if (ttd <= timer_deadline_tracking_bin_1)
769 thread->thread_timer_wakeups_bin_1++;
770 else
771 if (ttd <= timer_deadline_tracking_bin_2)
772 thread->thread_timer_wakeups_bin_2++;
773 }
774
775 ledger_credit_thread(thread, thread->t_ledger,
776 task_ledgers.interrupt_wakeups, 1);
777 if (pidle) {
778 ledger_credit_thread(thread, thread->t_ledger,
779 task_ledgers.platform_idle_wakeups, 1);
780 }
781
782 } else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) {
783 /* TODO: what about an interrupt that does a wake taken on a callout thread? */
784 if (cthread->callout_woken_from_icontext) {
785 ledger_credit_thread(thread, thread->t_ledger,
786 task_ledgers.interrupt_wakeups, 1);
787 thread->thread_callout_interrupt_wakeups++;
788
789 if (cthread->callout_woken_from_platform_idle) {
790 ledger_credit_thread(thread, thread->t_ledger,
791 task_ledgers.platform_idle_wakeups, 1);
792 thread->thread_callout_platform_idle_wakeups++;
793 }
794
795 cthread->callout_woke_thread = TRUE;
796 }
797 }
798
799 if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) {
800 thread->callout_woken_from_icontext = aticontext;
801 thread->callout_woken_from_platform_idle = pidle;
802 thread->callout_woke_thread = FALSE;
803 }
804
805#if KPERF
806 if (ready_for_runq) {
807 kperf_make_runnable(thread, aticontext);
808 }
809#endif /* KPERF */
810
811 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
812 MACHDBG_CODE(DBG_MACH_SCHED,MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE,
813 (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result,
814 sched_run_buckets[TH_BUCKET_RUN], 0);
815
816 DTRACE_SCHED2(wakeup, struct thread *, thread, struct proc *, thread->task->bsd_info);
817
818 return (ready_for_runq);
819}
820
821/*
822 * Routine: thread_go
823 * Purpose:
824 * Unblock and dispatch thread.
825 * Conditions:
826 * thread lock held, IPC locks may be held.
827 * thread must have been pulled from wait queue under same lock hold.
828 * thread must have been waiting
829 * Returns:
830 * KERN_SUCCESS - Thread was set running
831 *
832 * TODO: This should return void
833 */
834kern_return_t
835thread_go(
836 thread_t thread,
837 wait_result_t wresult)
838{
839 assert_thread_magic(thread);
840
841 assert(thread->at_safe_point == FALSE);
842 assert(thread->wait_event == NO_EVENT64);
843 assert(thread->waitq == NULL);
844
845 assert(!(thread->state & (TH_TERMINATE|TH_TERMINATE2)));
846 assert(thread->state & TH_WAIT);
847
848
849 if (thread_unblock(thread, wresult)) {
850#if SCHED_TRACE_THREAD_WAKEUPS
851 backtrace(&thread->thread_wakeup_bt[0],
852 (sizeof(thread->thread_wakeup_bt)/sizeof(uintptr_t)));
853#endif
854 thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
855 }
856
857 return (KERN_SUCCESS);
858}
859
860/*
861 * Routine: thread_mark_wait_locked
862 * Purpose:
863 * Mark a thread as waiting. If, given the circumstances,
864 * it doesn't want to wait (i.e. already aborted), then
865 * indicate that in the return value.
866 * Conditions:
867 * at splsched() and thread is locked.
868 */
869__private_extern__
870wait_result_t
871thread_mark_wait_locked(
872 thread_t thread,
873 wait_interrupt_t interruptible_orig)
874{
875 boolean_t at_safe_point;
876 wait_interrupt_t interruptible = interruptible_orig;
877
878 assert(!(thread->state & (TH_WAIT|TH_IDLE|TH_UNINT|TH_TERMINATE2|TH_WAIT_REPORT)));
879
880 /*
881 * The thread may have certain types of interrupts/aborts masked
882 * off. Even if the wait location says these types of interrupts
883 * are OK, we have to honor mask settings (outer-scoped code may
884 * not be able to handle aborts at the moment).
885 */
886 interruptible &= TH_OPT_INTMASK;
887 if (interruptible > (thread->options & TH_OPT_INTMASK))
888 interruptible = thread->options & TH_OPT_INTMASK;
889
890 at_safe_point = (interruptible == THREAD_ABORTSAFE);
891
892 if ( interruptible == THREAD_UNINT ||
893 !(thread->sched_flags & TH_SFLAG_ABORT) ||
894 (!at_safe_point &&
895 (thread->sched_flags & TH_SFLAG_ABORTSAFELY))) {
896
897 if ( !(thread->state & TH_TERMINATE))
898 DTRACE_SCHED(sleep);
899
900 int state_bits = TH_WAIT;
901 if (!interruptible) {
902 state_bits |= TH_UNINT;
903 }
904 if (thread->sched_call) {
905 wait_interrupt_t mask = THREAD_WAIT_NOREPORT_USER;
906 if (is_kerneltask(thread->task)) {
907 mask = THREAD_WAIT_NOREPORT_KERNEL;
908 }
909 if ((interruptible_orig & mask) == 0) {
910 state_bits |= TH_WAIT_REPORT;
911 }
912 }
913 thread->state |= state_bits;
914 thread->at_safe_point = at_safe_point;
915
916 /* TODO: pass this through assert_wait instead, have
917 * assert_wait just take a struct as an argument */
918 assert(!thread->block_hint);
919 thread->block_hint = thread->pending_block_hint;
920 thread->pending_block_hint = kThreadWaitNone;
921
922 return (thread->wait_result = THREAD_WAITING);
923 } else {
924 if (thread->sched_flags & TH_SFLAG_ABORTSAFELY)
925 thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;
926 }
927 thread->pending_block_hint = kThreadWaitNone;
928
929 return (thread->wait_result = THREAD_INTERRUPTED);
930}
931
932/*
933 * Routine: thread_interrupt_level
934 * Purpose:
935 * Set the maximum interruptible state for the
936 * current thread. The effective value of any
937 * interruptible flag passed into assert_wait
938 * will never exceed this.
939 *
940 * Useful for code that must not be interrupted,
941 * but which calls code that doesn't know that.
942 * Returns:
943 * The old interrupt level for the thread.
944 */
945__private_extern__
946wait_interrupt_t
947thread_interrupt_level(
948 wait_interrupt_t new_level)
949{
950 thread_t thread = current_thread();
951 wait_interrupt_t result = thread->options & TH_OPT_INTMASK;
952
953 thread->options = (thread->options & ~TH_OPT_INTMASK) | (new_level & TH_OPT_INTMASK);
954
955 return result;
956}
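/*
 * Usage sketch (illustrative, not built): callers that must not be aborted
 * across a blocking region raise the level to THREAD_UNINT around that
 * region and restore the previous level afterwards.
 */
#if 0
	wait_interrupt_t wsave;

	wsave = thread_interrupt_level(THREAD_UNINT);
	/* ... code that may block but must not be aborted ... */
	thread_interrupt_level(wsave);
#endif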
957
958/*
959 * assert_wait:
960 *
961 * Assert that the current thread is about to go to
962 * sleep until the specified event occurs.
963 */
964wait_result_t
965assert_wait(
966 event_t event,
967 wait_interrupt_t interruptible)
968{
969 if (__improbable(event == NO_EVENT))
970 panic("%s() called with NO_EVENT", __func__);
971
972 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
973 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
974 VM_KERNEL_UNSLIDE_OR_PERM(event), 0, 0, 0, 0);
975
976 struct waitq *waitq;
977 waitq = global_eventq(event);
978 return waitq_assert_wait64(waitq, CAST_EVENT64_T(event), interruptible, TIMEOUT_WAIT_FOREVER);
979}
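/*
 * Usage sketch (illustrative, not built; 'lock' and 'flag' are hypothetical):
 * the classic pairing of assert_wait() with thread_block() on the waiter
 * side and thread_wakeup() on the waker side. The wait is asserted while
 * the condition is still protected, the lock is dropped, and only then does
 * the thread block; a wakeup issued in between is not lost because the
 * thread is already enqueued on the event's waitq.
 */
#if 0
	/* waiter */
	simple_lock(&lock);
	while (!flag) {
		assert_wait((event_t)&flag, THREAD_UNINT);
		simple_unlock(&lock);
		thread_block(THREAD_CONTINUE_NULL);
		simple_lock(&lock);
	}
	simple_unlock(&lock);

	/* waker */
	simple_lock(&lock);
	flag = TRUE;
	simple_unlock(&lock);
	thread_wakeup((event_t)&flag);
#endif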
980
981/*
982 * assert_wait_queue:
983 *
984 * Return the global waitq for the specified event
985 */
986struct waitq *
987assert_wait_queue(
988 event_t event)
989{
990 return global_eventq(event);
991}
992
993wait_result_t
994assert_wait_timeout(
995 event_t event,
996 wait_interrupt_t interruptible,
997 uint32_t interval,
998 uint32_t scale_factor)
999{
1000 thread_t thread = current_thread();
1001 wait_result_t wresult;
1002 uint64_t deadline;
1003 spl_t s;
1004
1005 if (__improbable(event == NO_EVENT))
1006 panic("%s() called with NO_EVENT", __func__);
1007
1008 struct waitq *waitq;
1009 waitq = global_eventq(event);
1010
1011 s = splsched();
1012 waitq_lock(waitq);
1013
1014 clock_interval_to_deadline(interval, scale_factor, &deadline);
1015
1016 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1017 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1018 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1019
1020 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1021 interruptible,
1022 TIMEOUT_URGENCY_SYS_NORMAL,
1023 deadline, TIMEOUT_NO_LEEWAY,
1024 thread);
1025
1026 waitq_unlock(waitq);
1027 splx(s);
1028 return wresult;
1029}
1030
1031wait_result_t
1032assert_wait_timeout_with_leeway(
1033 event_t event,
1034 wait_interrupt_t interruptible,
1035 wait_timeout_urgency_t urgency,
1036 uint32_t interval,
1037 uint32_t leeway,
1038 uint32_t scale_factor)
1039{
1040 thread_t thread = current_thread();
1041 wait_result_t wresult;
1042 uint64_t deadline;
1043 uint64_t abstime;
1044 uint64_t slop;
1045 uint64_t now;
1046 spl_t s;
1047
1048 if (__improbable(event == NO_EVENT))
1049 panic("%s() called with NO_EVENT", __func__);
1050
1051 now = mach_absolute_time();
1052 clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime);
1053 deadline = now + abstime;
1054
1055 clock_interval_to_absolutetime_interval(leeway, scale_factor, &slop);
1056
1057 struct waitq *waitq;
1058 waitq = global_eventq(event);
1059
1060 s = splsched();
1061 waitq_lock(waitq);
1062
1063 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1064 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1065 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1066
1067 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1068 interruptible,
1069 urgency, deadline, slop,
1070 thread);
1071
1072 waitq_unlock(waitq);
1073 splx(s);
1074 return wresult;
1075}
1076
1077wait_result_t
1078assert_wait_deadline(
1079 event_t event,
1080 wait_interrupt_t interruptible,
1081 uint64_t deadline)
1082{
1083 thread_t thread = current_thread();
1084 wait_result_t wresult;
1085 spl_t s;
1086
1087 if (__improbable(event == NO_EVENT))
1088 panic("%s() called with NO_EVENT", __func__);
1089
1090 struct waitq *waitq;
1091 waitq = global_eventq(event);
1092
1093 s = splsched();
1094 waitq_lock(waitq);
1095
1096 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1097 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1098 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1099
1100 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1101 interruptible,
1102 TIMEOUT_URGENCY_SYS_NORMAL, deadline,
1103 TIMEOUT_NO_LEEWAY, thread);
1104 waitq_unlock(waitq);
1105 splx(s);
1106 return wresult;
1107}
1108
1109wait_result_t
1110assert_wait_deadline_with_leeway(
1111 event_t event,
1112 wait_interrupt_t interruptible,
1113 wait_timeout_urgency_t urgency,
1114 uint64_t deadline,
1115 uint64_t leeway)
1116{
1117 thread_t thread = current_thread();
1118 wait_result_t wresult;
1119 spl_t s;
1120
1121 if (__improbable(event == NO_EVENT))
1122 panic("%s() called with NO_EVENT", __func__);
1123
1124 struct waitq *waitq;
1125 waitq = global_eventq(event);
1126
1127 s = splsched();
1128 waitq_lock(waitq);
1129
1130 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1131 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1132 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1133
1134 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1135 interruptible,
1136 urgency, deadline, leeway,
1137 thread);
1138 waitq_unlock(waitq);
1139 splx(s);
1140 return wresult;
1141}
1142
1143/*
1144 * thread_isoncpu:
1145 *
1146 * Return TRUE if a thread is running on a processor such that an AST
1147 * is needed to pull it out of userspace execution, or, if it is executing
1148 * in the kernel, to bring it to a context switch boundary that would cause
1149 * its thread state to be serialized in the thread PCB.
1150 *
1151 * Thread locked, returns the same way. While locked, fields
1152 * like "state" cannot change. "runq" can change only from set to unset.
1153 */
1154static inline boolean_t
1155thread_isoncpu(thread_t thread)
1156{
1157 /* Not running or runnable */
1158 if (!(thread->state & TH_RUN))
1159 return (FALSE);
1160
1161 /* Waiting on a runqueue, not currently running */
1162 /* TODO: This is invalid - it can get dequeued without thread lock, but not context switched. */
1163 if (thread->runq != PROCESSOR_NULL)
1164 return (FALSE);
1165
1166 /*
1167 * Thread does not have a stack yet
1168 * It could be on the stack alloc queue or preparing to be invoked
1169 */
1170 if (!thread->kernel_stack)
1171 return (FALSE);
1172
1173 /*
1174 * Thread must be running on a processor, or
1175 * about to run, or just did run. In all these
1176 * cases, an AST to the processor is needed
1177 * to guarantee that the thread is kicked out
1178 * of userspace and the processor has
1179 * context switched (and saved register state).
1180 */
1181 return (TRUE);
1182}
1183
1184/*
1185 * thread_stop:
1186 *
1187 * Force a preemption point for a thread and wait
1188 * for it to stop running on a CPU. If a stronger
1189 * guarantee is requested, wait until no longer
1190 * runnable. Arbitrates access among
1191 * multiple stop requests. (released by unstop)
1192 *
1193 * The thread must enter a wait state and stop via a
1194 * separate means.
1195 *
1196 * Returns FALSE if interrupted.
1197 */
1198boolean_t
1199thread_stop(
1200 thread_t thread,
1201 boolean_t until_not_runnable)
1202{
1203 wait_result_t wresult;
1204 spl_t s = splsched();
1205 boolean_t oncpu;
1206
1207 wake_lock(thread);
1208 thread_lock(thread);
1209
1210 while (thread->state & TH_SUSP) {
1211 thread->wake_active = TRUE;
1212 thread_unlock(thread);
1213
1214 wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1215 wake_unlock(thread);
1216 splx(s);
1217
1218 if (wresult == THREAD_WAITING)
1219 wresult = thread_block(THREAD_CONTINUE_NULL);
1220
1221 if (wresult != THREAD_AWAKENED)
1222 return (FALSE);
1223
1224 s = splsched();
1225 wake_lock(thread);
1226 thread_lock(thread);
1227 }
1228
1229 thread->state |= TH_SUSP;
1230
1231 while ((oncpu = thread_isoncpu(thread)) ||
1232 (until_not_runnable && (thread->state & TH_RUN))) {
1233 processor_t processor;
1234
1235 if (oncpu) {
1236 assert(thread->state & TH_RUN);
1237 processor = thread->chosen_processor;
1238 cause_ast_check(processor);
1239 }
1240
1241 thread->wake_active = TRUE;
1242 thread_unlock(thread);
1243
1244 wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1245 wake_unlock(thread);
1246 splx(s);
1247
1248 if (wresult == THREAD_WAITING)
1249 wresult = thread_block(THREAD_CONTINUE_NULL);
1250
1251 if (wresult != THREAD_AWAKENED) {
1252 thread_unstop(thread);
1253 return (FALSE);
1254 }
1255
1256 s = splsched();
1257 wake_lock(thread);
1258 thread_lock(thread);
1259 }
1260
1261 thread_unlock(thread);
1262 wake_unlock(thread);
1263 splx(s);
1264
1265 /*
1266 * We return with the thread unlocked. To prevent it from
1267 * transitioning to a runnable state (or from TH_RUN to
1268 * being on the CPU), the caller must ensure the thread
1269 * is stopped via an external means (such as an AST)
1270 */
1271
1272 return (TRUE);
1273}
1274
1275/*
1276 * thread_unstop:
1277 *
1278 * Release a previous stop request and set
1279 * the thread running if appropriate.
1280 *
1281 * Use only after a successful stop operation.
1282 */
1283void
1284thread_unstop(
1285 thread_t thread)
1286{
1287 spl_t s = splsched();
1288
1289 wake_lock(thread);
1290 thread_lock(thread);
1291
1292 assert((thread->state & (TH_RUN|TH_WAIT|TH_SUSP)) != TH_SUSP);
1293
1294 if (thread->state & TH_SUSP) {
1295 thread->state &= ~TH_SUSP;
1296
1297 if (thread->wake_active) {
1298 thread->wake_active = FALSE;
1299 thread_unlock(thread);
1300
1301 thread_wakeup(&thread->wake_active);
1302 wake_unlock(thread);
1303 splx(s);
1304
1305 return;
1306 }
1307 }
1308
1309 thread_unlock(thread);
1310 wake_unlock(thread);
1311 splx(s);
1312}
1313
1314/*
1315 * thread_wait:
1316 *
1317 * Wait for a thread to stop running. (non-interruptible)
1318 *
1319 */
1320void
1321thread_wait(
1322 thread_t thread,
1323 boolean_t until_not_runnable)
1324{
1325 wait_result_t wresult;
1326 boolean_t oncpu;
1327 processor_t processor;
1328 spl_t s = splsched();
1329
1330 wake_lock(thread);
1331 thread_lock(thread);
1332
1333 /*
1334 * Wait until not running on a CPU. If stronger requirement
1335 * desired, wait until not runnable. Assumption: if thread is
1336 * on CPU, then TH_RUN is set, so we're not waiting in any case
1337 * where the original, pure "TH_RUN" check would have let us
1338 * finish.
1339 */
1340 while ((oncpu = thread_isoncpu(thread)) ||
1341 (until_not_runnable && (thread->state & TH_RUN))) {
1342
1343 if (oncpu) {
1344 assert(thread->state & TH_RUN);
1345 processor = thread->chosen_processor;
1346 cause_ast_check(processor);
1347 }
1348
1349 thread->wake_active = TRUE;
1350 thread_unlock(thread);
1351
1352 wresult = assert_wait(&thread->wake_active, THREAD_UNINT);
1353 wake_unlock(thread);
1354 splx(s);
1355
1356 if (wresult == THREAD_WAITING)
1357 thread_block(THREAD_CONTINUE_NULL);
1358
1359 s = splsched();
1360 wake_lock(thread);
1361 thread_lock(thread);
1362 }
1363
1364 thread_unlock(thread);
1365 wake_unlock(thread);
1366 splx(s);
1367}
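/*
 * Usage sketch (illustrative, not built): the stop/unstop pair brackets code
 * that must inspect or manipulate another thread while it is guaranteed to
 * be off-CPU; thread_wait() above is the non-interruptible variant for
 * callers that only need the thread to stop running, without arbitration.
 */
#if 0
	if (!thread_stop(thread, FALSE)) {
		/* interrupted while waiting for the thread to stop */
		return KERN_ABORTED;
	}

	/* the thread is not running on any CPU at this point */

	thread_unstop(thread);
#endif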
1368
1369/*
1370 * Routine: clear_wait_internal
1371 *
1372 * Clear the wait condition for the specified thread.
1373 * Start the thread executing if that is appropriate.
1374 * Arguments:
1375 * thread thread to awaken
1376 * result Wakeup result the thread should see
1377 * Conditions:
1378 * At splsched
1379 * the thread is locked.
1380 * Returns:
1381 * KERN_SUCCESS thread was rousted out a wait
1382 * KERN_FAILURE thread was waiting but could not be rousted
1383 * KERN_NOT_WAITING thread was not waiting
1384 */
1385__private_extern__ kern_return_t
1386clear_wait_internal(
1387 thread_t thread,
1388 wait_result_t wresult)
1389{
1390 uint32_t i = LockTimeOutUsec;
1391 struct waitq *waitq = thread->waitq;
1392
1393 do {
1394 if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT))
1395 return (KERN_FAILURE);
1396
1397 if (waitq != NULL) {
1398 if (!waitq_pull_thread_locked(waitq, thread)) {
1399 thread_unlock(thread);
1400 delay(1);
1401 if (i > 0 && !machine_timeout_suspended())
1402 i--;
1403 thread_lock(thread);
1404 if (waitq != thread->waitq)
1405 return KERN_NOT_WAITING;
1406 continue;
1407 }
1408 }
1409
1410 /* TODO: Can we instead assert TH_TERMINATE is not set? */
1411 if ((thread->state & (TH_WAIT|TH_TERMINATE)) == TH_WAIT)
1412 return (thread_go(thread, wresult));
1413 else
1414 return (KERN_NOT_WAITING);
1415 } while (i > 0);
1416
1417 panic("clear_wait_internal: deadlock: thread=%p, wq=%p, cpu=%d\n",
1418 thread, waitq, cpu_number());
1419
1420 return (KERN_FAILURE);
1421}
1422
1423
1424/*
1425 * clear_wait:
1426 *
1427 * Clear the wait condition for the specified thread. Start the thread
1428 * executing if that is appropriate.
1429 *
1430 * parameters:
1431 * thread thread to awaken
1432 * result Wakeup result the thread should see
1433 */
1434kern_return_t
1435clear_wait(
1436 thread_t thread,
1437 wait_result_t result)
1438{
1439 kern_return_t ret;
1440 spl_t s;
1441
1442 s = splsched();
1443 thread_lock(thread);
1444 ret = clear_wait_internal(thread, result);
1445 thread_unlock(thread);
1446 splx(s);
1447 return ret;
1448}
1449
1450
1451/*
1452 * thread_wakeup_prim:
1453 *
1454 * Common routine for thread_wakeup, thread_wakeup_with_result,
1455 * and thread_wakeup_one.
1456 *
1457 */
1458kern_return_t
1459thread_wakeup_prim(
1460 event_t event,
1461 boolean_t one_thread,
1462 wait_result_t result)
1463{
1464 if (__improbable(event == NO_EVENT))
1465 panic("%s() called with NO_EVENT", __func__);
1466
1467 struct waitq *wq = global_eventq(event);
1468
1469 if (one_thread)
1470 return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES);
1471 else
1472 return waitq_wakeup64_all(wq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES);
1473}
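/*
 * For reference, the familiar wakeup calls are thin wrappers over this
 * primitive; in the corresponding header they expand roughly as:
 *
 *	thread_wakeup(x)                -> thread_wakeup_prim((x), FALSE, THREAD_AWAKENED)
 *	thread_wakeup_with_result(x, z) -> thread_wakeup_prim((x), FALSE, (z))
 *	thread_wakeup_one(x)            -> thread_wakeup_prim((x), TRUE, THREAD_AWAKENED)
 */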
1474
1475/*
1476 * Wakeup a specified thread if and only if it's waiting for this event
1477 */
1478kern_return_t
1479thread_wakeup_thread(
1480 event_t event,
1481 thread_t thread)
1482{
1483 if (__improbable(event == NO_EVENT))
1484 panic("%s() called with NO_EVENT", __func__);
1485
1486 if (__improbable(thread == THREAD_NULL))
1487 panic("%s() called with THREAD_NULL", __func__);
1488
1489 struct waitq *wq = global_eventq(event);
1490
1491 return waitq_wakeup64_thread(wq, CAST_EVENT64_T(event), thread, THREAD_AWAKENED);
1492}
1493
1494/*
1495 * Wakeup a thread waiting on an event and promote it to a priority.
1496 *
1497 * Requires woken thread to un-promote itself when done.
1498 */
1499kern_return_t
1500thread_wakeup_one_with_pri(
1501 event_t event,
1502 int priority)
1503{
1504 if (__improbable(event == NO_EVENT))
1505 panic("%s() called with NO_EVENT", __func__);
1506
1507 struct waitq *wq = global_eventq(event);
1508
1509 return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
1510}
1511
1512/*
1513 * Wakeup a thread waiting on an event,
1514 * promote it to a priority,
1515 * and return a reference to the woken thread.
1516 *
1517 * Requires woken thread to un-promote itself when done.
1518 */
1519thread_t
1520thread_wakeup_identify(event_t event,
1521 int priority)
1522{
1523 if (__improbable(event == NO_EVENT))
1524 panic("%s() called with NO_EVENT", __func__);
1525
1526 struct waitq *wq = global_eventq(event);
1527
1528 return waitq_wakeup64_identify(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
1529}
1530
1531/*
1532 * thread_bind:
1533 *
1534 * Force the current thread to execute on the specified processor.
1535 * Takes effect after the next thread_block().
1536 *
1537 * Returns the previous binding. PROCESSOR_NULL means
1538 * not bound.
1539 *
1540 * XXX - DO NOT export this to users - XXX
1541 */
1542processor_t
1543thread_bind(
1544 processor_t processor)
1545{
1546 thread_t self = current_thread();
1547 processor_t prev;
1548 spl_t s;
1549
1550 s = splsched();
1551 thread_lock(self);
1552
1553 prev = thread_bind_internal(self, processor);
1554
1555 thread_unlock(self);
1556 splx(s);
1557
1558 return (prev);
1559}
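/*
 * Usage sketch (illustrative, not built): binding only takes effect after
 * the next thread_block(), so callers pair thread_bind() with a
 * thread_block() to migrate immediately, and restore the previous binding
 * when done (thread_vm_bind_group_add() below uses the bind-then-block
 * half of this pattern).
 */
#if 0
	processor_t prev;

	prev = thread_bind(master_processor);
	thread_block(THREAD_CONTINUE_NULL);	/* switch to the bound processor */

	/* ... work that must run on master_processor ... */

	thread_bind(prev);
	thread_block(THREAD_CONTINUE_NULL);
#endif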
1560
1561/*
1562 * thread_bind_internal:
1563 *
1564 * If the specified thread is not the current thread, and it is currently
1565 * running on another CPU, a remote AST must be sent to that CPU to cause
1566 * the thread to migrate to its bound processor. Otherwise, the migration
1567 * will occur at the next quantum expiration or blocking point.
1568 *
1569 * When the thread is the current thread, an explicit thread_block() should
1570 * be used to force the current processor to context switch away and
1571 * let the thread migrate to the bound processor.
1572 *
1573 * Thread must be locked, and at splsched.
1574 */
1575
1576static processor_t
1577thread_bind_internal(
1578 thread_t thread,
1579 processor_t processor)
1580{
1581 processor_t prev;
1582
1583 /* <rdar://problem/15102234> */
1584 assert(thread->sched_pri < BASEPRI_RTQUEUES);
1585 /* A thread can't be bound if it's sitting on a (potentially incorrect) runqueue */
1586 assert(thread->runq == PROCESSOR_NULL);
1587
1588 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_THREAD_BIND), thread_tid(thread), processor ? (uintptr_t)processor->cpu_id : (uintptr_t)-1, 0, 0, 0);
1589
1590 prev = thread->bound_processor;
1591 thread->bound_processor = processor;
1592
1593 return (prev);
1594}
1595
1596/*
1597 * thread_vm_bind_group_add:
1598 *
1599 * The "VM bind group" is a special mechanism to mark a collection
1600 * of threads from the VM subsystem that, in general, should be scheduled
1601 * with only one CPU of parallelism. To accomplish this, we initially
1602 * bind all the threads to the master processor, which has the effect
1603 * that only one of the threads in the group can execute at once, including
1604 * preempting threads in the group that are a lower priority. Future
1605 * implementations may use more dynamic mechanisms to prevent the collection
1606 * of VM threads from using more CPU time than desired.
1607 *
1608 * The current implementation can result in priority inversions where
1609 * compute-bound priority 95 or realtime threads that happen to have
1610 * landed on the master processor prevent the VM threads from running.
1611 * When this situation is detected, we unbind the threads for one
1612 * scheduler tick to allow the scheduler to run the threads on
1613 * additional CPUs, before restoring the binding (assuming high latency
1614 * is no longer a problem).
1615 */
1616
1617/*
1618 * The current max is provisioned for:
1619 * vm_compressor_swap_trigger_thread (92)
1620 * 2 x vm_pageout_iothread_internal (92) when vm_restricted_to_single_processor==TRUE
1621 * vm_pageout_continue (92)
1622 * memorystatus_thread (95)
1623 */
1624#define MAX_VM_BIND_GROUP_COUNT (5)
1625decl_simple_lock_data(static,sched_vm_group_list_lock);
1626static thread_t sched_vm_group_thread_list[MAX_VM_BIND_GROUP_COUNT];
1627static int sched_vm_group_thread_count;
1628static boolean_t sched_vm_group_temporarily_unbound = FALSE;
1629
1630void
1631thread_vm_bind_group_add(void)
1632{
1633 thread_t self = current_thread();
1634
1635 thread_reference_internal(self);
1636 self->options |= TH_OPT_SCHED_VM_GROUP;
1637
1638 simple_lock(&sched_vm_group_list_lock);
1639 assert(sched_vm_group_thread_count < MAX_VM_BIND_GROUP_COUNT);
1640 sched_vm_group_thread_list[sched_vm_group_thread_count++] = self;
1641 simple_unlock(&sched_vm_group_list_lock);
1642
1643 thread_bind(master_processor);
1644
1645 /* Switch to bound processor if not already there */
1646 thread_block(THREAD_CONTINUE_NULL);
1647}
1648
1649static void
1650sched_vm_group_maintenance(void)
1651{
1652 uint64_t ctime = mach_absolute_time();
1653 uint64_t longtime = ctime - sched_tick_interval;
1654 int i;
1655 spl_t s;
1656 boolean_t high_latency_observed = FALSE;
1657 boolean_t runnable_and_not_on_runq_observed = FALSE;
1658 boolean_t bind_target_changed = FALSE;
1659 processor_t bind_target = PROCESSOR_NULL;
1660
1661 /* Make sure nobody attempts to add new threads while we are enumerating them */
1662 simple_lock(&sched_vm_group_list_lock);
1663
1664 s = splsched();
1665
1666 for (i=0; i < sched_vm_group_thread_count; i++) {
1667 thread_t thread = sched_vm_group_thread_list[i];
1668 assert(thread != THREAD_NULL);
1669 thread_lock(thread);
1670 if ((thread->state & (TH_RUN|TH_WAIT)) == TH_RUN) {
1671 if (thread->runq != PROCESSOR_NULL && thread->last_made_runnable_time < longtime) {
1672 high_latency_observed = TRUE;
1673 } else if (thread->runq == PROCESSOR_NULL) {
1674 /* There are some cases where a thread may be transitioning that also fall into this case */
1675 runnable_and_not_on_runq_observed = TRUE;
1676 }
1677 }
1678 thread_unlock(thread);
1679
1680 if (high_latency_observed && runnable_and_not_on_runq_observed) {
1681 /* All the things we are looking for are true, stop looking */
1682 break;
1683 }
1684 }
1685
1686 splx(s);
1687
1688 if (sched_vm_group_temporarily_unbound) {
1689 /* If we turned off binding, make sure everything is OK before rebinding */
1690 if (!high_latency_observed) {
1691 /* rebind */
1692 bind_target_changed = TRUE;
1693 bind_target = master_processor;
1694 sched_vm_group_temporarily_unbound = FALSE; /* might be reset to TRUE if change cannot be completed */
1695 }
1696 } else {
1697 /*
1698 * Check if we're in a bad state, which is defined by high
1699 * latency with no core currently executing a thread. If a
1700 * single thread is making progress on a CPU, that means the
1701 * binding concept to reduce parallelism is working as
1702 * designed.
1703 */
1704 if (high_latency_observed && !runnable_and_not_on_runq_observed) {
1705 /* unbind */
1706 bind_target_changed = TRUE;
1707 bind_target = PROCESSOR_NULL;
1708 sched_vm_group_temporarily_unbound = TRUE;
1709 }
1710 }
1711
1712 if (bind_target_changed) {
1713 s = splsched();
1714 for (i=0; i < sched_vm_group_thread_count; i++) {
1715 thread_t thread = sched_vm_group_thread_list[i];
1716 boolean_t removed;
1717 assert(thread != THREAD_NULL);
1718
1719 thread_lock(thread);
1720 removed = thread_run_queue_remove(thread);
1721 if (removed || ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT)) {
1722 thread_bind_internal(thread, bind_target);
1723 } else {
1724 /*
1725 * Thread was in the middle of being context-switched-to,
1726 * or was in the process of blocking. To avoid switching the bind
1727 * state out mid-flight, defer the change if possible.
1728 */
1729 if (bind_target == PROCESSOR_NULL) {
1730 thread_bind_internal(thread, bind_target);
1731 } else {
1732 sched_vm_group_temporarily_unbound = TRUE; /* next pass will try again */
1733 }
1734 }
1735
1736 if (removed) {
1737 thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
1738 }
1739 thread_unlock(thread);
1740 }
1741 splx(s);
1742 }
1743
1744 simple_unlock(&sched_vm_group_list_lock);
1745}
1746
1747/* Invoked prior to idle entry to determine if, on SMT capable processors, an SMT
1748 * rebalancing opportunity exists when a core is (instantaneously) idle, but
1749 * other SMT-capable cores may be over-committed. TODO: some possible negatives:
1750 * IPI thrash if this core does not remain idle following the load balancing ASTs
1751 * Idle "thrash", when IPI issue is followed by idle entry/core power down
1752 * followed by a wakeup shortly thereafter.
1753 */
1754
1755#if (DEVELOPMENT || DEBUG)
1756int sched_smt_balance = 1;
1757#endif
1758
1759#if __SMP__
1760/* Invoked with pset locked, returns with pset unlocked */
1761void
1762sched_SMT_balance(processor_t cprocessor, processor_set_t cpset) {
1763 processor_t ast_processor = NULL;
1764
1765#if (DEVELOPMENT || DEBUG)
1766 if (__improbable(sched_smt_balance == 0))
1767 goto smt_balance_exit;
1768#endif
1769
1770 assert(cprocessor == current_processor());
1771 if (cprocessor->is_SMT == FALSE)
1772 goto smt_balance_exit;
1773
1774 processor_t sib_processor = cprocessor->processor_secondary ? cprocessor->processor_secondary : cprocessor->processor_primary;
1775
1776 /* Determine if both this processor and its sibling are idle,
1777 * indicating an SMT rebalancing opportunity.
1778 */
1779 if (sib_processor->state != PROCESSOR_IDLE)
1780 goto smt_balance_exit;
1781
1782 processor_t sprocessor;
1783
1784 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
1785 uint64_t running_secondary_map = (cpset->cpu_state_map[PROCESSOR_RUNNING] &
1786 ~cpset->primary_map);
1787 for (int cpuid = lsb_first(running_secondary_map); cpuid >= 0; cpuid = lsb_next(running_secondary_map, cpuid)) {
1788 sprocessor = processor_array[cpuid];
1789 if ((sprocessor->processor_primary->state == PROCESSOR_RUNNING) &&
1790 (sprocessor->current_pri < BASEPRI_RTQUEUES)) {
1791
1792 ipi_type = sched_ipi_action(sprocessor, NULL, false, SCHED_IPI_EVENT_SMT_REBAL);
1793 if (ipi_type != SCHED_IPI_NONE) {
1794 assert(sprocessor != cprocessor);
1795 ast_processor = sprocessor;
1796 break;
1797 }
1798 }
1799 }
1800
1801smt_balance_exit:
1802 pset_unlock(cpset);
1803
1804 if (ast_processor) {
1805 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_SMT_BALANCE), ast_processor->cpu_id, ast_processor->state, ast_processor->processor_primary->state, 0, 0);
1806 sched_ipi_perform(ast_processor, ipi_type);
1807 }
1808}
1809#else
1810/* Invoked with pset locked, returns with pset unlocked */
1811void
1812sched_SMT_balance(__unused processor_t cprocessor, processor_set_t cpset)
1813{
1814 pset_unlock(cpset);
1815}
1816#endif /* __SMP__ */
1817
1818static processor_t choose_processor_for_realtime_thread(processor_set_t pset);
1819static bool all_available_primaries_are_running_realtime_threads(processor_set_t pset);
1820int sched_allow_rt_smt = 1;
1821
1822/*
1823 * thread_select:
1824 *
1825 * Select a new thread for the current processor to execute.
1826 *
1827 * May select the current thread, which must be locked.
1828 */
1829static thread_t
1830thread_select(thread_t thread,
1831 processor_t processor,
1832 ast_t *reason)
1833{
1834 processor_set_t pset = processor->processor_set;
1835 thread_t new_thread = THREAD_NULL;
1836
1837 assert(processor == current_processor());
1838 assert((thread->state & (TH_RUN|TH_TERMINATE2)) == TH_RUN);
1839
1840 do {
1841 /*
1842 * Update the priority.
1843 */
1844 if (SCHED(can_update_priority)(thread))
1845 SCHED(update_priority)(thread);
1846
1847 processor_state_update_from_thread(processor, thread);
1848
1849 pset_lock(pset);
1850
1851 assert(processor->state != PROCESSOR_OFF_LINE);
1852
1853 if (!processor->is_recommended) {
1854 /*
1855 * The performance controller has provided a hint to not dispatch more threads,
1856 * unless they are bound to us (and thus we are the only option).
1857 */
1858 if (!SCHED(processor_bound_count)(processor)) {
1859 goto idle;
1860 }
1861 } else if (processor->processor_primary != processor) {
1862 /*
1863 * Should this secondary SMT processor attempt to find work? For pset runqueue systems,
1864 * we should look for work only under the same conditions that choose_processor()
1865 * would have assigned work, which is when all primary processors have been assigned work.
1866 *
1867 * An exception is that bound threads are dispatched to a processor without going through
1868 * choose_processor(), so in those cases we should continue trying to dequeue work.
1869 */
1870 if (!SCHED(processor_bound_count)(processor)) {
1871 if ((pset->recommended_bitmask & pset->primary_map & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
1872 goto idle;
1873 }
1874
1875 /* There are no idle primaries */
1876
1877 if (processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) {
1878 bool secondary_can_run_realtime_thread = sched_allow_rt_smt && rt_runq_count(pset) && all_available_primaries_are_running_realtime_threads(pset);
1879 if (!secondary_can_run_realtime_thread) {
1880 goto idle;
1881 }
1882 }
1883 }
1884 }
1885
1886 /*
1887 * Test to see if the current thread should continue
1888 * to run on this processor. Must not be attempting to wait, and not
1889 * bound to a different processor, nor be in the wrong
1890 * processor set, nor be forced to context switch by TH_SUSP.
1891 *
1892 * Note that there are never any RT threads in the regular runqueue.
1893 *
1894 * This code is extremely subtle and tricky.
1895 */
1896
1897 /* i.e. not waiting, not TH_SUSP'ed */
1898 boolean_t still_running = ((thread->state & (TH_TERMINATE|TH_IDLE|TH_WAIT|TH_RUN|TH_SUSP)) == TH_RUN);
1899
1900 /*
1901 * Threads running on SMT processors are forced to context switch. Don't rebalance realtime threads.
1902 * TODO: This should check if it's worth it to rebalance, i.e. 'are there any idle primary processors'
1903 */
1904 boolean_t needs_smt_rebalance = (thread->sched_pri < BASEPRI_RTQUEUES && processor->processor_primary != processor);
1905
1906 boolean_t affinity_mismatch = (thread->affinity_set != AFFINITY_SET_NULL && thread->affinity_set->aset_pset != pset);
1907
1908 boolean_t bound_elsewhere = (thread->bound_processor != PROCESSOR_NULL && thread->bound_processor != processor);
1909
1910 boolean_t avoid_processor = (SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread));
1911
1912 if (still_running && !needs_smt_rebalance && !affinity_mismatch && !bound_elsewhere && !avoid_processor) {
1913 /*
1914 * This thread is eligible to keep running on this processor.
1915 *
1916 * RT threads with un-expired quantum stay on processor,
1917 * unless there's a valid RT thread with an earlier deadline.
1918 */
1919 if (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice) {
1920 if (rt_runq_count(pset) > 0) {
1921
1922 rt_lock_lock(pset);
1923
1924 if (rt_runq_count(pset) > 0) {
1925
1926 thread_t next_rt = qe_queue_first(&SCHED(rt_runq)(pset)->queue, struct thread, runq_links);
1927
1928 if (next_rt->realtime.deadline < processor->deadline &&
1929 (next_rt->bound_processor == PROCESSOR_NULL ||
1930 next_rt->bound_processor == processor)) {
1931 /* The next RT thread is better, so pick it off the runqueue. */
1932 goto pick_new_rt_thread;
1933 }
1934 }
1935
1936 rt_lock_unlock(pset);
1937 }
1938
1939 /* This is still the best RT thread to run. */
1940 processor->deadline = thread->realtime.deadline;
1941
1942 sched_update_pset_load_average(pset);
1943
1944 processor_t next_rt_processor = PROCESSOR_NULL;
1945 sched_ipi_type_t next_rt_ipi_type = SCHED_IPI_NONE;
1946
1947 if (rt_runq_count(pset) > 0) {
1948 next_rt_processor = choose_processor_for_realtime_thread(pset);
1949 if (next_rt_processor) {
1950 next_rt_ipi_type = sched_ipi_action(next_rt_processor, NULL, false, SCHED_IPI_EVENT_PREEMPT);
1951 }
1952 }
1953 pset_unlock(pset);
1954
1955 if (next_rt_processor) {
1956 sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
1957 }
1958
1959 return (thread);
1960 }
1961
1962 if ((rt_runq_count(pset) == 0) &&
1963 SCHED(processor_queue_has_priority)(processor, thread->sched_pri, TRUE) == FALSE) {
1964 /* This thread is still the highest priority runnable (non-idle) thread */
1965 processor->deadline = UINT64_MAX;
1966
1967 sched_update_pset_load_average(pset);
1968 pset_unlock(pset);
1969
1970 return (thread);
1971 }
1972 } else {
1973 /*
1974 * This processor must context switch.
1975 * If it's due to a rebalance, we should aggressively find this thread a new home.
1976 */
1977 if (needs_smt_rebalance || affinity_mismatch || bound_elsewhere || avoid_processor)
1978 *reason |= AST_REBALANCE;
1979 }
1980
1981 /* OK, so we're not going to run the current thread. Look at the RT queue. */
1982 if (rt_runq_count(pset) > 0) {
1983
1984 rt_lock_lock(pset);
1985
1986 if (rt_runq_count(pset) > 0) {
1987 thread_t next_rt = qe_queue_first(&SCHED(rt_runq)(pset)->queue, struct thread, runq_links);
1988
1989 if (__probable((next_rt->bound_processor == PROCESSOR_NULL ||
1990 (next_rt->bound_processor == processor)))) {
1991pick_new_rt_thread:
1992 new_thread = qe_dequeue_head(&SCHED(rt_runq)(pset)->queue, struct thread, runq_links);
1993
1994 new_thread->runq = PROCESSOR_NULL;
1995 SCHED_STATS_RUNQ_CHANGE(&SCHED(rt_runq)(pset)->runq_stats, rt_runq_count(pset));
1996 rt_runq_count_decr(pset);
1997
1998 processor->deadline = new_thread->realtime.deadline;
1999 processor_state_update_from_thread(processor, new_thread);
2000
2001 rt_lock_unlock(pset);
2002 sched_update_pset_load_average(pset);
2003
2004 processor_t ast_processor = PROCESSOR_NULL;
2005 processor_t next_rt_processor = PROCESSOR_NULL;
2006 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
2007 sched_ipi_type_t next_rt_ipi_type = SCHED_IPI_NONE;
2008
2009 if (processor->processor_secondary != NULL) {
2010 processor_t sprocessor = processor->processor_secondary;
2011 if ((sprocessor->state == PROCESSOR_RUNNING) || (sprocessor->state == PROCESSOR_DISPATCHING)) {
2012 ipi_type = sched_ipi_action(sprocessor, NULL, false, SCHED_IPI_EVENT_SMT_REBAL);
2013 ast_processor = sprocessor;
2014 }
2015 }
2016 if (rt_runq_count(pset) > 0) {
2017 next_rt_processor = choose_processor_for_realtime_thread(pset);
2018 if (next_rt_processor) {
2019 next_rt_ipi_type = sched_ipi_action(next_rt_processor, NULL, false, SCHED_IPI_EVENT_PREEMPT);
2020 }
2021 }
2022 pset_unlock(pset);
2023
2024 if (ast_processor) {
2025 sched_ipi_perform(ast_processor, ipi_type);
2026 }
2027
2028 if (next_rt_processor) {
2029 sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
2030 }
2031
2032 return (new_thread);
2033 }
2034 }
2035
2036 rt_lock_unlock(pset);
2037 }
2038
2039 processor->deadline = UINT64_MAX;
2040
2041 /* No RT threads, so let's look at the regular threads. */
2042 if ((new_thread = SCHED(choose_thread)(processor, MINPRI, *reason)) != THREAD_NULL) {
2043 sched_update_pset_load_average(pset);
2044 processor_state_update_from_thread(processor, new_thread);
2045 pset_unlock(pset);
2046 return (new_thread);
2047 }
2048
2049#if __SMP__
2050 if (SCHED(steal_thread_enabled)) {
2051 /*
2052 * No runnable threads, attempt to steal
2053 * from other processors. Returns with pset lock dropped.
2054 */
2055
2056 if ((new_thread = SCHED(steal_thread)(pset)) != THREAD_NULL) {
2057 return (new_thread);
2058 }
2059
2060 /*
2061 * If other threads have appeared, shortcut
2062 * around again.
2063 */
2064 if (!SCHED(processor_queue_empty)(processor) || rt_runq_count(pset) > 0)
2065 continue;
2066
2067 pset_lock(pset);
2068 }
2069#endif
2070
2071 idle:
2072 /*
2073 * Nothing is runnable, so set this processor idle if it
2074 * was running.
2075 */
2076 if (processor->state == PROCESSOR_RUNNING) {
2077 pset_update_processor_state(pset, processor, PROCESSOR_IDLE);
2078 }
2079
2080#if __SMP__
2081 /* Invoked with pset locked, returns with pset unlocked */
2082 SCHED(processor_balance)(processor, pset);
2083#else
2084 pset_unlock(pset);
2085#endif
2086
2087#if CONFIG_SCHED_IDLE_IN_PLACE
2088 /*
2089 * Choose idle thread if fast idle is not possible.
2090 */
2091 if (processor->processor_primary != processor)
2092 return (processor->idle_thread);
2093
2094 if ((thread->state & (TH_IDLE|TH_TERMINATE|TH_SUSP)) || !(thread->state & TH_WAIT) || thread->wake_active || thread->sched_pri >= BASEPRI_RTQUEUES)
2095 return (processor->idle_thread);
2096
2097 /*
2098 * Perform idling activities directly without a
2099 * context switch. Return dispatched thread,
2100 * else check again for a runnable thread.
2101 */
2102 new_thread = thread_select_idle(thread, processor);
2103
2104#else /* !CONFIG_SCHED_IDLE_IN_PLACE */
2105
2106 /*
2107 * Do a full context switch to idle so that the current
2108 * thread can start running on another processor without
2109 * waiting for the fast-idled processor to wake up.
2110 */
2111 new_thread = processor->idle_thread;
2112
2113#endif /* !CONFIG_SCHED_IDLE_IN_PLACE */
2114
2115 } while (new_thread == THREAD_NULL);
2116
2117 return (new_thread);
2118}
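
/*
 * Informal summary of thread_select()'s search order, derived from the code
 * above rather than from a separate specification:
 *   1. Keep the current thread if it is still runnable here, is not being
 *      rebalanced, bound, or steered elsewhere, and nothing better is waiting.
 *   2. Otherwise take the earliest-deadline eligible thread from the
 *      realtime run queue.
 *   3. Otherwise take the highest-priority thread from the regular run queue.
 *   4. Otherwise attempt to steal a thread from another processor (SMP only).
 *   5. Otherwise select the idle thread (or idle in place, if configured).
 */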
2119
2120#if CONFIG_SCHED_IDLE_IN_PLACE
2121/*
2122 * thread_select_idle:
2123 *
2124 * Idle the processor using the current thread context.
2125 *
2126 * Called with thread locked, then dropped and relocked.
2127 */
2128static thread_t
2129thread_select_idle(
2130 thread_t thread,
2131 processor_t processor)
2132{
2133 thread_t new_thread;
2134 uint64_t arg1, arg2;
2135 int urgency;
2136
2137 sched_run_decr(thread);
2138
2139 thread->state |= TH_IDLE;
	processor_state_update_idle(processor);
2141
2142 /* Reload precise timing global policy to thread-local policy */
2143 thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
2144
2145 thread_unlock(thread);
2146
2147 /*
2148 * Switch execution timing to processor idle thread.
2149 */
2150 processor->last_dispatch = mach_absolute_time();
2151
2152#ifdef CONFIG_MACH_APPROXIMATE_TIME
2153 commpage_update_mach_approximate_time(processor->last_dispatch);
2154#endif
2155
2156 thread->last_run_time = processor->last_dispatch;
2157 processor_timer_switch_thread(processor->last_dispatch,
2158 &processor->idle_thread->system_timer);
2159 PROCESSOR_DATA(processor, kernel_timer) = &processor->idle_thread->system_timer;
2160
2161
2162 /*
2163 * Cancel the quantum timer while idling.
2164 */
2165 timer_call_quantum_timer_cancel(&processor->quantum_timer);
2166 processor->first_timeslice = FALSE;
2167
2168 if (thread->sched_call) {
2169 (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
2170 }
2171
2172 thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, NULL);
2173
2174 /*
2175 * Enable interrupts and perform idling activities. No
2176 * preemption due to TH_IDLE being set.
2177 */
2178 spllo(); new_thread = processor_idle(thread, processor);
2179
2180 /*
2181 * Return at splsched.
2182 */
2183 if (thread->sched_call) {
2184 (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
2185 }
2186
2187 thread_lock(thread);
2188
2189 /*
2190 * If awakened, switch to thread timer and start a new quantum.
2191 * Otherwise skip; we will context switch to another thread or return here.
2192 */
2193 if (!(thread->state & TH_WAIT)) {
2194 uint64_t time_now = processor->last_dispatch = mach_absolute_time();
2195 processor_timer_switch_thread(time_now, &thread->system_timer);
2196 timer_update(&thread->runnable_timer, time_now);
2197 PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
2198 thread_quantum_init(thread);
2199 processor->quantum_end = time_now + thread->quantum_remaining;
2200 timer_call_quantum_timer_enter(&processor->quantum_timer,
2201 thread, processor->quantum_end, time_now);
2202 processor->first_timeslice = TRUE;
2203
2204 thread->computation_epoch = time_now;
2205 }
2206
2207 thread->state &= ~TH_IDLE;
2208
2209 urgency = thread_get_urgency(thread, &arg1, &arg2);
2210
2211 thread_tell_urgency(urgency, arg1, arg2, 0, new_thread);
2212
2213 sched_run_incr(thread);
2214
2215 return (new_thread);
2216}
2217#endif /* CONFIG_SCHED_IDLE_IN_PLACE */
2218
2219/*
2220 * thread_invoke
2221 *
2222 * Called at splsched with neither thread locked.
2223 *
2224 * Perform a context switch and start executing the new thread.
2225 *
2226 * Returns FALSE when the context switch didn't happen.
2227 * The reference to the new thread is still consumed.
2228 *
2229 * "self" is what is currently running on the processor,
2230 * "thread" is the new thread to context switch to
2231 * (which may be the same thread in some cases)
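 *
 * Informally, there are two switch paths below: a stack handoff, taken when
 * the outgoing thread is blocking with a continuation and the incoming thread
 * has no kernel stack of its own, and a full register context switch via
 * machine_switch_context() otherwise.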
2232 */
2233static boolean_t
2234thread_invoke(
2235 thread_t self,
2236 thread_t thread,
2237 ast_t reason)
2238{
2239 if (__improbable(get_preemption_level() != 0)) {
2240 int pl = get_preemption_level();
2241 panic("thread_invoke: preemption_level %d, possible cause: %s",
2242 pl, (pl < 0 ? "unlocking an unlocked mutex or spinlock" :
2243 "blocking while holding a spinlock, or within interrupt context"));
2244 }
2245
2246 thread_continue_t continuation = self->continuation;
2247 void *parameter = self->parameter;
2248 processor_t processor;
2249
2250 uint64_t ctime = mach_absolute_time();
2251
2252#ifdef CONFIG_MACH_APPROXIMATE_TIME
2253 commpage_update_mach_approximate_time(ctime);
2254#endif
2255
2256#if defined(CONFIG_SCHED_TIMESHARE_CORE)
2257 if ((thread->state & TH_IDLE) == 0)
2258 sched_timeshare_consider_maintenance(ctime);
2259#endif
2260
2261#if MONOTONIC
2262 mt_sched_update(self);
2263#endif /* MONOTONIC */
2264
2265 assert_thread_magic(self);
2266 assert(self == current_thread());
2267 assert(self->runq == PROCESSOR_NULL);
2268 assert((self->state & (TH_RUN|TH_TERMINATE2)) == TH_RUN);
2269
2270 thread_lock(thread);
2271
2272 assert_thread_magic(thread);
2273 assert((thread->state & (TH_RUN|TH_WAIT|TH_UNINT|TH_TERMINATE|TH_TERMINATE2)) == TH_RUN);
2274 assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == current_processor());
2275 assert(thread->runq == PROCESSOR_NULL);
2276
2277 /* Reload precise timing global policy to thread-local policy */
2278 thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
2279
2280 /* Update SFI class based on other factors */
2281 thread->sfi_class = sfi_thread_classify(thread);
2282
2283 /* Update the same_pri_latency for the thread (used by perfcontrol callouts) */
2284 thread->same_pri_latency = ctime - thread->last_basepri_change_time;
2285 /*
2286 * In case a base_pri update happened between the timestamp and
2287 * taking the thread lock
2288 */
2289 if (ctime <= thread->last_basepri_change_time)
2290 thread->same_pri_latency = ctime - thread->last_made_runnable_time;
2291
2292 /* Allow realtime threads to hang onto a stack. */
2293 if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack)
2294 self->reserved_stack = self->kernel_stack;
2295
2296 /* Prepare for spin debugging */
2297#if INTERRUPT_MASKED_DEBUG
2298 ml_spin_debug_clear(thread);
2299#endif
2300
2301 if (continuation != NULL) {
2302 if (!thread->kernel_stack) {
2303 /*
2304 * If we are using a privileged stack,
2305 * check to see whether we can exchange it with
2306 * that of the other thread.
2307 */
2308 if (self->kernel_stack == self->reserved_stack && !thread->reserved_stack)
2309 goto need_stack;
2310
2311 /*
2312 * Context switch by performing a stack handoff.
2313 */
2314 continuation = thread->continuation;
2315 parameter = thread->parameter;
2316
2317 processor = current_processor();
2318 processor->active_thread = thread;
2319 processor_state_update_from_thread(processor, thread);
2320
2321 if (thread->last_processor != processor && thread->last_processor != NULL) {
2322 if (thread->last_processor->processor_set != processor->processor_set)
2323 thread->ps_switch++;
2324 thread->p_switch++;
2325 }
2326 thread->last_processor = processor;
2327 thread->c_switch++;
2328 ast_context(thread);
2329
2330 thread_unlock(thread);
2331
2332 self->reason = reason;
2333
2334 processor->last_dispatch = ctime;
2335 self->last_run_time = ctime;
2336 processor_timer_switch_thread(ctime, &thread->system_timer);
2337 timer_update(&thread->runnable_timer, ctime);
2338 PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
2339
2340 /*
2341 * Since non-precise user/kernel time doesn't update the state timer
2342 * during privilege transitions, synthesize an event now.
2343 */
2344 if (!thread->precise_user_kernel_time) {
2345 timer_update(PROCESSOR_DATA(processor, current_state), ctime);
2346 }
2347
2348 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2349 MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF)|DBG_FUNC_NONE,
2350 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2351
2352 if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
2353 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)|DBG_FUNC_NONE,
2354 (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
2355 }
2356
2357 DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, thread->task->bsd_info);
2358
2359 SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
2360
2361#if KPERF
2362 kperf_off_cpu(self);
2363#endif /* KPERF */
2364
2365 TLOG(1, "thread_invoke: calling stack_handoff\n");
2366 stack_handoff(self, thread);
2367
2368 /* 'self' is now off core */
2369 assert(thread == current_thread_volatile());
2370
2371 DTRACE_SCHED(on__cpu);
2372
2373#if KPERF
2374 kperf_on_cpu(thread, continuation, NULL);
2375#endif /* KPERF */
2376
2377 thread_dispatch(self, thread);
2378
2379#if KASAN
2380 /* Old thread's stack has been moved to the new thread, so explicitly
2381 * unpoison it. */
2382 kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
2383#endif
2384
2385 thread->continuation = thread->parameter = NULL;
2386
2387 counter(c_thread_invoke_hits++);
2388
2389 assert(continuation);
2390 call_continuation(continuation, parameter, thread->wait_result, TRUE);
2391 /*NOTREACHED*/
2392 }
2393 else if (thread == self) {
2394 /* same thread but with continuation */
2395 ast_context(self);
2396 counter(++c_thread_invoke_same);
2397
2398 thread_unlock(self);
2399
2400#if KPERF
2401 kperf_on_cpu(thread, continuation, NULL);
2402#endif /* KPERF */
2403
2404 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2405 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2406 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2407
2408#if KASAN
2409 /* stack handoff to self - no thread_dispatch(), so clear the stack
2410 * and free the fakestack directly */
2411 kasan_fakestack_drop(self);
2412 kasan_fakestack_gc(self);
2413 kasan_unpoison_stack(self->kernel_stack, kernel_stack_size);
2414#endif
2415
2416 self->continuation = self->parameter = NULL;
2417
2418 call_continuation(continuation, parameter, self->wait_result, TRUE);
2419 /*NOTREACHED*/
2420 }
2421 } else {
2422 /*
2423 * Check that the other thread has a stack
2424 */
2425 if (!thread->kernel_stack) {
2426need_stack:
2427 if (!stack_alloc_try(thread)) {
2428 counter(c_thread_invoke_misses++);
2429 thread_unlock(thread);
2430 thread_stack_enqueue(thread);
2431 return (FALSE);
2432 }
2433 } else if (thread == self) {
2434 ast_context(self);
2435 counter(++c_thread_invoke_same);
2436 thread_unlock(self);
2437
2438 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2439 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2440 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2441
2442 return (TRUE);
2443 }
2444 }
2445
2446 /*
2447 * Context switch by full context save.
2448 */
2449 processor = current_processor();
2450 processor->active_thread = thread;
2451 processor_state_update_from_thread(processor, thread);
2452
2453 if (thread->last_processor != processor && thread->last_processor != NULL) {
2454 if (thread->last_processor->processor_set != processor->processor_set)
2455 thread->ps_switch++;
2456 thread->p_switch++;
2457 }
2458 thread->last_processor = processor;
2459 thread->c_switch++;
2460 ast_context(thread);
2461
2462 thread_unlock(thread);
2463
2464 counter(c_thread_invoke_csw++);
2465
2466 self->reason = reason;
2467
2468 processor->last_dispatch = ctime;
2469 self->last_run_time = ctime;
2470 processor_timer_switch_thread(ctime, &thread->system_timer);
2471 timer_update(&thread->runnable_timer, ctime);
2472 PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
2473
2474 /*
2475 * Since non-precise user/kernel time doesn't update the state timer
2476 * during privilege transitions, synthesize an event now.
2477 */
2478 if (!thread->precise_user_kernel_time) {
2479 timer_update(PROCESSOR_DATA(processor, current_state), ctime);
2480 }
2481
2482 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2483 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2484 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2485
2486 if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) {
2487 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)|DBG_FUNC_NONE,
2488 (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
2489 }
2490
2491 DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, thread->task->bsd_info);
2492
2493 SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
2494
2495#if KPERF
2496 kperf_off_cpu(self);
2497#endif /* KPERF */
2498
2499 /*
2500 * This is where we actually switch register context,
2501 * and address space if required. We will next run
2502 * as a result of a subsequent context switch.
2503 *
2504 * Once registers are switched and the processor is running "thread",
2505 * the stack variables and non-volatile registers will contain whatever
2506 * was there the last time that thread blocked. No local variables should
2507 * be used after this point, except for the special case of "thread", which
2508 * the platform layer returns as the previous thread running on the processor
	 * via the function call ABI as a return register, and "self", which may have
	 * been stored on the stack or in a non-volatile register; its saved value,
	 * stale while this thread was off core, is accurate again because that
	 * thread is once more running on the CPU.
2513 */
2514 assert(continuation == self->continuation);
2515 thread = machine_switch_context(self, continuation, thread);
2516 assert(self == current_thread_volatile());
2517 TLOG(1,"thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);
2518
2519 DTRACE_SCHED(on__cpu);
2520
2521#if KPERF
2522 kperf_on_cpu(self, NULL, __builtin_frame_address(0));
2523#endif /* KPERF */
2524
2525 /*
2526 * We have been resumed and are set to run.
2527 */
2528 thread_dispatch(thread, self);
2529
2530 if (continuation) {
2531 self->continuation = self->parameter = NULL;
2532
2533 call_continuation(continuation, parameter, self->wait_result, TRUE);
2534 /*NOTREACHED*/
2535 }
2536
2537 return (TRUE);
2538}
2539
2540#if defined(CONFIG_SCHED_DEFERRED_AST)
2541/*
2542 * pset_cancel_deferred_dispatch:
2543 *
2544 * Cancels all ASTs that we can cancel for the given processor set
2545 * if the current processor is running the last runnable thread in the
2546 * system.
2547 *
2548 * This function assumes the current thread is runnable. This must
2549 * be called with the pset unlocked.
2550 */
2551static void
2552pset_cancel_deferred_dispatch(
2553 processor_set_t pset,
2554 processor_t processor)
2555{
2556 processor_t active_processor = NULL;
2557 uint32_t sampled_sched_run_count;
2558
2559 pset_lock(pset);
2560 sampled_sched_run_count = (volatile uint32_t) sched_run_buckets[TH_BUCKET_RUN];
2561
2562 /*
2563 * If we have emptied the run queue, and our current thread is runnable, we
2564 * should tell any processors that are still DISPATCHING that they will
2565 * probably not have any work to do. In the event that there are no
2566 * pending signals that we can cancel, this is also uninteresting.
2567 *
2568 * In the unlikely event that another thread becomes runnable while we are
2569 * doing this (sched_run_count is atomically updated, not guarded), the
2570 * codepath making it runnable SHOULD (a dangerous word) need the pset lock
2571 * in order to dispatch it to a processor in our pset. So, the other
2572 * codepath will wait while we squash all cancelable ASTs, get the pset
2573 * lock, and then dispatch the freshly runnable thread. So this should be
2574 * correct (we won't accidentally have a runnable thread that hasn't been
2575 * dispatched to an idle processor), if not ideal (we may be restarting the
2576 * dispatch process, which could have some overhead).
2577 */
2578
2579 if ((sampled_sched_run_count == 1) && (pset->pending_deferred_AST_cpu_mask)) {
2580 uint64_t dispatching_map = (pset->cpu_state_map[PROCESSOR_DISPATCHING] &
2581 pset->pending_deferred_AST_cpu_mask &
2582 ~pset->pending_AST_cpu_mask);
2583 for (int cpuid = lsb_first(dispatching_map); cpuid >= 0; cpuid = lsb_next(dispatching_map, cpuid)) {
2584 active_processor = processor_array[cpuid];
2585 /*
2586 * If a processor is DISPATCHING, it could be because of
2587 * a cancelable signal.
2588 *
2589 * IF the processor is not our
2590 * current processor (the current processor should not
2591 * be DISPATCHING, so this is a bit paranoid), AND there
2592 * is a cancelable signal pending on the processor, AND
2593 * there is no non-cancelable signal pending (as there is
2594 * no point trying to backtrack on bringing the processor
2595 * up if a signal we cannot cancel is outstanding), THEN
2596 * it should make sense to roll back the processor state
2597 * to the IDLE state.
2598 *
			 * If the racy nature of this approach (as the signal
2600 * will be arbitrated by hardware, and can fire as we
2601 * roll back state) results in the core responding
2602 * despite being pushed back to the IDLE state, it
2603 * should be no different than if the core took some
2604 * interrupt while IDLE.
2605 */
2606 if (active_processor != processor) {
2607 /*
2608 * Squash all of the processor state back to some
2609 * reasonable facsimile of PROCESSOR_IDLE.
2610 */
2611
2612 assert(active_processor->next_thread == THREAD_NULL);
2613 processor_state_update_idle(active_processor);
2614 active_processor->deadline = UINT64_MAX;
2615 pset_update_processor_state(pset, active_processor, PROCESSOR_IDLE);
2616 bit_clear(pset->pending_deferred_AST_cpu_mask, active_processor->cpu_id);
2617 machine_signal_idle_cancel(active_processor);
2618 }
2619
2620 }
2621 }
2622
2623 pset_unlock(pset);
2624}
2625#else
2626/* We don't support deferred ASTs; everything is candycanes and sunshine. */
2627#endif
2628
2629static void
2630thread_csw_callout(
2631 thread_t old,
2632 thread_t new,
2633 uint64_t timestamp)
2634{
2635 perfcontrol_event event = (new->state & TH_IDLE) ? IDLE : CONTEXT_SWITCH;
2636 uint64_t same_pri_latency = (new->state & TH_IDLE) ? 0 : new->same_pri_latency;
2637 machine_switch_perfcontrol_context(event, timestamp, 0,
2638 same_pri_latency, old, new);
2639}
2640
2641
2642/*
2643 * thread_dispatch:
2644 *
2645 * Handle threads at context switch. Re-dispatch other thread
2646 * if still running, otherwise update run state and perform
2647 * special actions. Update quantum for other thread and begin
2648 * the quantum for ourselves.
2649 *
2650 * "thread" is the old thread that we have switched away from.
2651 * "self" is the new current thread that we have context switched to
2652 *
2653 * Called at splsched.
2654 */
2655void
2656thread_dispatch(
2657 thread_t thread,
2658 thread_t self)
2659{
2660 processor_t processor = self->last_processor;
2661
2662 assert(processor == current_processor());
2663 assert(self == current_thread_volatile());
2664 assert(thread != self);
2665
2666 if (thread != THREAD_NULL) {
2667 /*
2668 * Do the perfcontrol callout for context switch.
2669 * The reason we do this here is:
2670 * - thread_dispatch() is called from various places that are not
2671 * the direct context switch path for eg. processor shutdown etc.
2672 * So adding the callout here covers all those cases.
2673 * - We want this callout as early as possible to be close
2674 * to the timestamp taken in thread_invoke()
2675 * - We want to avoid holding the thread lock while doing the
2676 * callout
2677 * - We do not want to callout if "thread" is NULL.
2678 */
2679 thread_csw_callout(thread, self, processor->last_dispatch);
2680
2681#if KASAN
2682 if (thread->continuation != NULL) {
2683 /*
2684 * Thread has a continuation and the normal stack is going away.
2685 * Unpoison the stack and mark all fakestack objects as unused.
2686 */
2687 kasan_fakestack_drop(thread);
2688 if (thread->kernel_stack) {
2689 kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
2690 }
2691 }
2692
2693 /*
2694 * Free all unused fakestack objects.
2695 */
2696 kasan_fakestack_gc(thread);
2697#endif
2698
2699 /*
2700 * If blocked at a continuation, discard
2701 * the stack.
2702 */
2703 if (thread->continuation != NULL && thread->kernel_stack != 0)
2704 stack_free(thread);
2705
2706 if (thread->state & TH_IDLE) {
2707 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2708 MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE,
2709 (uintptr_t)thread_tid(thread), 0, thread->state,
2710 sched_run_buckets[TH_BUCKET_RUN], 0);
2711 } else {
2712 int64_t consumed;
2713 int64_t remainder = 0;
2714
2715 if (processor->quantum_end > processor->last_dispatch)
2716 remainder = processor->quantum_end -
2717 processor->last_dispatch;
2718
2719 consumed = thread->quantum_remaining - remainder;
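			/*
			 * Worked example with hypothetical numbers: if this thread came
			 * on core with a 10ms quantum (quantum_remaining) and is being
			 * switched off 4ms before quantum_end, then remainder is 4ms and
			 * consumed is 6ms, which is the CPU time billed to the ledgers
			 * below.
			 */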
2720
2721 if ((thread->reason & AST_LEDGER) == 0) {
2722 /*
2723 * Bill CPU time to both the task and
2724 * the individual thread.
2725 */
2726 ledger_credit_thread(thread, thread->t_ledger,
2727 task_ledgers.cpu_time, consumed);
2728 ledger_credit_thread(thread, thread->t_threadledger,
2729 thread_ledgers.cpu_time, consumed);
2730 if (thread->t_bankledger) {
2731 ledger_credit_thread(thread, thread->t_bankledger,
2732 bank_ledgers.cpu_time,
2733 (consumed - thread->t_deduct_bank_ledger_time));
2734 }
2735 thread->t_deduct_bank_ledger_time = 0;
2736 }
2737
2738 wake_lock(thread);
2739 thread_lock(thread);
2740
2741 /*
			 * Apply a priority floor if the thread holds a kernel resource.
2743 * Do this before checking starting_pri to avoid overpenalizing
2744 * repeated rwlock blockers.
2745 */
2746 if (__improbable(thread->rwlock_count != 0))
2747 lck_rw_set_promotion_locked(thread);
2748
2749 boolean_t keep_quantum = processor->first_timeslice;
2750
2751 /*
2752 * Treat a thread which has dropped priority since it got on core
2753 * as having expired its quantum.
2754 */
2755 if (processor->starting_pri > thread->sched_pri)
2756 keep_quantum = FALSE;
2757
2758 /* Compute remainder of current quantum. */
2759 if (keep_quantum &&
2760 processor->quantum_end > processor->last_dispatch)
2761 thread->quantum_remaining = (uint32_t)remainder;
2762 else
2763 thread->quantum_remaining = 0;
2764
2765 if (thread->sched_mode == TH_MODE_REALTIME) {
2766 /*
2767 * Cancel the deadline if the thread has
2768 * consumed the entire quantum.
2769 */
2770 if (thread->quantum_remaining == 0) {
2771 thread->realtime.deadline = UINT64_MAX;
2772 }
2773 } else {
2774#if defined(CONFIG_SCHED_TIMESHARE_CORE)
2775 /*
2776 * For non-realtime threads treat a tiny
2777 * remaining quantum as an expired quantum
2778 * but include what's left next time.
2779 */
2780 if (thread->quantum_remaining < min_std_quantum) {
2781 thread->reason |= AST_QUANTUM;
2782 thread->quantum_remaining += SCHED(initial_quantum_size)(thread);
2783 }
2784#endif /* CONFIG_SCHED_TIMESHARE_CORE */
2785 }
2786
2787 /*
2788 * If we are doing a direct handoff then
2789 * take the remainder of the quantum.
2790 */
2791 if ((thread->reason & (AST_HANDOFF|AST_QUANTUM)) == AST_HANDOFF) {
2792 self->quantum_remaining = thread->quantum_remaining;
2793 thread->reason |= AST_QUANTUM;
2794 thread->quantum_remaining = 0;
2795 } else {
2796#if defined(CONFIG_SCHED_MULTIQ)
2797 if (SCHED(sched_groups_enabled) &&
2798 thread->sched_group == self->sched_group) {
2799 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2800 MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUANTUM_HANDOFF),
2801 self->reason, (uintptr_t)thread_tid(thread),
2802 self->quantum_remaining, thread->quantum_remaining, 0);
2803
2804 self->quantum_remaining = thread->quantum_remaining;
2805 thread->quantum_remaining = 0;
2806 /* Don't set AST_QUANTUM here - old thread might still want to preempt someone else */
2807 }
2808#endif /* defined(CONFIG_SCHED_MULTIQ) */
2809 }
2810
2811 thread->computation_metered += (processor->last_dispatch - thread->computation_epoch);
2812
2813 if (!(thread->state & TH_WAIT)) {
2814 /*
2815 * Still runnable.
2816 */
2817 thread->last_made_runnable_time = thread->last_basepri_change_time = processor->last_dispatch;
2818
2819 machine_thread_going_off_core(thread, FALSE, processor->last_dispatch);
2820
2821 ast_t reason = thread->reason;
2822 sched_options_t options = SCHED_NONE;
2823
2824 if (reason & AST_REBALANCE) {
2825 options |= SCHED_REBALANCE;
2826 if (reason & AST_QUANTUM) {
2827 /*
2828 * Having gone to the trouble of forcing this thread off a less preferred core,
2829 * we should force the preferable core to reschedule immediately to give this
2830 * thread a chance to run instead of just sitting on the run queue where
2831 * it may just be stolen back by the idle core we just forced it off.
2832 * But only do this at the end of a quantum to prevent cascading effects.
2833 */
2834 options |= SCHED_PREEMPT;
2835 }
2836 }
2837
2838 if (reason & AST_QUANTUM)
2839 options |= SCHED_TAILQ;
2840 else if (reason & AST_PREEMPT)
2841 options |= SCHED_HEADQ;
2842 else
2843 options |= (SCHED_PREEMPT | SCHED_TAILQ);
2844
2845 thread_setrun(thread, options);
2846
2847 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2848 MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE,
2849 (uintptr_t)thread_tid(thread), thread->reason, thread->state,
2850 sched_run_buckets[TH_BUCKET_RUN], 0);
2851
2852 if (thread->wake_active) {
2853 thread->wake_active = FALSE;
2854 thread_unlock(thread);
2855
2856 thread_wakeup(&thread->wake_active);
2857 } else {
2858 thread_unlock(thread);
2859 }
2860
2861 wake_unlock(thread);
2862 } else {
2863 /*
2864 * Waiting.
2865 */
2866 boolean_t should_terminate = FALSE;
2867 uint32_t new_run_count;
2868 int thread_state = thread->state;
2869
2870 /* Only the first call to thread_dispatch
2871 * after explicit termination should add
2872 * the thread to the termination queue
2873 */
2874 if ((thread_state & (TH_TERMINATE|TH_TERMINATE2)) == TH_TERMINATE) {
2875 should_terminate = TRUE;
2876 thread_state |= TH_TERMINATE2;
2877 }
2878
2879 timer_stop(&thread->runnable_timer, processor->last_dispatch);
2880
2881 thread_state &= ~TH_RUN;
2882 thread->state = thread_state;
2883
2884 thread->last_made_runnable_time = thread->last_basepri_change_time = THREAD_NOT_RUNNABLE;
2885 thread->chosen_processor = PROCESSOR_NULL;
2886
2887 new_run_count = sched_run_decr(thread);
2888
2889#if CONFIG_SCHED_SFI
2890 if (thread->reason & AST_SFI) {
2891 thread->wait_sfi_begin_time = processor->last_dispatch;
2892 }
2893#endif
2894
2895 machine_thread_going_off_core(thread, should_terminate, processor->last_dispatch);
2896
2897 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2898 MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE,
2899 (uintptr_t)thread_tid(thread), thread->reason, thread_state,
2900 new_run_count, 0);
2901
2902 if (thread_state & TH_WAIT_REPORT) {
2903 (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
2904 }
2905
2906 if (thread->wake_active) {
2907 thread->wake_active = FALSE;
2908 thread_unlock(thread);
2909
2910 thread_wakeup(&thread->wake_active);
2911 } else {
2912 thread_unlock(thread);
2913 }
2914
2915 wake_unlock(thread);
2916
2917 if (should_terminate)
2918 thread_terminate_enqueue(thread);
2919 }
2920 }
2921 }
2922
2923 int urgency = THREAD_URGENCY_NONE;
2924 uint64_t latency = 0;
2925
2926 /* Update (new) current thread and reprogram quantum timer */
2927 thread_lock(self);
2928
2929 if (!(self->state & TH_IDLE)) {
2930 uint64_t arg1, arg2;
2931
2932#if CONFIG_SCHED_SFI
2933 ast_t new_ast;
2934
2935 new_ast = sfi_thread_needs_ast(self, NULL);
2936
2937 if (new_ast != AST_NONE) {
2938 ast_on(new_ast);
2939 }
2940#endif
2941
2942 assertf(processor->last_dispatch >= self->last_made_runnable_time,
2943 "Non-monotonic time? dispatch at 0x%llx, runnable at 0x%llx",
2944 processor->last_dispatch, self->last_made_runnable_time);
2945
2946 assert(self->last_made_runnable_time <= self->last_basepri_change_time);
2947
2948 latency = processor->last_dispatch - self->last_made_runnable_time;
2949 assert(latency >= self->same_pri_latency);
2950
2951 urgency = thread_get_urgency(self, &arg1, &arg2);
2952
2953 thread_tell_urgency(urgency, arg1, arg2, latency, self);
2954
2955 /*
2956 * Get a new quantum if none remaining.
2957 */
2958 if (self->quantum_remaining == 0) {
2959 thread_quantum_init(self);
2960 }
2961
2962 /*
2963 * Set up quantum timer and timeslice.
2964 */
2965 processor->quantum_end = processor->last_dispatch + self->quantum_remaining;
2966 timer_call_quantum_timer_enter(&processor->quantum_timer, self,
2967 processor->quantum_end, processor->last_dispatch);
2968
2969 processor->first_timeslice = TRUE;
2970 } else {
2971 timer_call_quantum_timer_cancel(&processor->quantum_timer);
2972 processor->first_timeslice = FALSE;
2973
2974 thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, self);
2975 }
2976
2977 assert(self->block_hint == kThreadWaitNone);
2978 self->computation_epoch = processor->last_dispatch;
2979 self->reason = AST_NONE;
2980 processor->starting_pri = self->sched_pri;
2981
2982 thread_unlock(self);
2983
2984 machine_thread_going_on_core(self, urgency, latency, self->same_pri_latency,
2985 processor->last_dispatch);
2986
2987#if defined(CONFIG_SCHED_DEFERRED_AST)
2988 /*
2989 * TODO: Can we state that redispatching our old thread is also
2990 * uninteresting?
2991 */
2992 if ((((volatile uint32_t)sched_run_buckets[TH_BUCKET_RUN]) == 1) &&
2993 !(self->state & TH_IDLE)) {
2994 pset_cancel_deferred_dispatch(processor->processor_set, processor);
2995 }
2996#endif
2997}
2998
2999/*
3000 * thread_block_reason:
3001 *
3002 * Forces a reschedule, blocking the caller if a wait
3003 * has been asserted.
3004 *
3005 * If a continuation is specified, then thread_invoke will
3006 * attempt to discard the thread's kernel stack. When the
3007 * thread resumes, it will execute the continuation function
3008 * on a new kernel stack.
3009 */
3010counter(mach_counter_t c_thread_block_calls = 0;)
3011
3012wait_result_t
3013thread_block_reason(
3014 thread_continue_t continuation,
3015 void *parameter,
3016 ast_t reason)
3017{
3018 thread_t self = current_thread();
3019 processor_t processor;
3020 thread_t new_thread;
3021 spl_t s;
3022
3023 counter(++c_thread_block_calls);
3024
3025 s = splsched();
3026
3027 processor = current_processor();
3028
3029 /* If we're explicitly yielding, force a subsequent quantum */
3030 if (reason & AST_YIELD)
3031 processor->first_timeslice = FALSE;
3032
	/* We're handling all scheduling ASTs */
3034 ast_off(AST_SCHEDULING);
3035
3036#if PROC_REF_DEBUG
3037 if ((continuation != NULL) && (self->task != kernel_task)) {
3038 if (uthread_get_proc_refcount(self->uthread) != 0) {
3039 panic("thread_block_reason with continuation uthread %p with uu_proc_refcount != 0", self->uthread);
3040 }
3041 }
3042#endif
3043
3044 self->continuation = continuation;
3045 self->parameter = parameter;
3046
3047 if (self->state & ~(TH_RUN | TH_IDLE)) {
3048 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3049 MACHDBG_CODE(DBG_MACH_SCHED,MACH_BLOCK),
3050 reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0);
3051 }
3052
3053 do {
3054 thread_lock(self);
3055 new_thread = thread_select(self, processor, &reason);
3056 thread_unlock(self);
3057 } while (!thread_invoke(self, new_thread, reason));
3058
3059 splx(s);
3060
3061 return (self->wait_result);
3062}
3063
3064/*
3065 * thread_block:
3066 *
3067 * Block the current thread if a wait has been asserted.
3068 */
3069wait_result_t
3070thread_block(
3071 thread_continue_t continuation)
3072{
3073 return thread_block_reason(continuation, NULL, AST_NONE);
3074}
3075
3076wait_result_t
3077thread_block_parameter(
3078 thread_continue_t continuation,
3079 void *parameter)
3080{
3081 return thread_block_reason(continuation, parameter, AST_NONE);
3082}
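
/*
 * Typical usage (an informal sketch, not a new interface): a blocking wait is
 * normally expressed as an assert_wait() on an event, followed by dropping any
 * locks the wakeup path needs and calling thread_block(); "object" below is a
 * placeholder wait event owned by the caller.
 *
 *	assert_wait((event_t)&object, THREAD_UNINT);
 *	...unlock...
 *	wait_result_t wr = thread_block(THREAD_CONTINUE_NULL);
 */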
3083
3084/*
3085 * thread_run:
3086 *
3087 * Switch directly from the current thread to the
3088 * new thread, handing off our quantum if appropriate.
3089 *
3090 * New thread must be runnable, and not on a run queue.
3091 *
3092 * Called at splsched.
3093 */
3094int
3095thread_run(
3096 thread_t self,
3097 thread_continue_t continuation,
3098 void *parameter,
3099 thread_t new_thread)
3100{
3101 ast_t reason = AST_HANDOFF;
3102
3103 self->continuation = continuation;
3104 self->parameter = parameter;
3105
3106 while (!thread_invoke(self, new_thread, reason)) {
3107 /* the handoff failed, so we have to fall back to the normal block path */
3108 processor_t processor = current_processor();
3109
3110 reason = AST_NONE;
3111
3112 thread_lock(self);
3113 new_thread = thread_select(self, processor, &reason);
3114 thread_unlock(self);
3115 }
3116
3117 return (self->wait_result);
3118}
3119
3120/*
3121 * thread_continue:
3122 *
3123 * Called at splsched when a thread first receives
3124 * a new stack after a continuation.
3125 */
3126void
3127thread_continue(
3128 thread_t thread)
3129{
3130 thread_t self = current_thread();
3131 thread_continue_t continuation;
3132 void *parameter;
3133
3134 DTRACE_SCHED(on__cpu);
3135
3136 continuation = self->continuation;
3137 parameter = self->parameter;
3138
3139#if KPERF
3140 kperf_on_cpu(self, continuation, NULL);
3141#endif
3142
3143 thread_dispatch(thread, self);
3144
3145 self->continuation = self->parameter = NULL;
3146
3147#if INTERRUPT_MASKED_DEBUG
3148 /* Reset interrupt-masked spin debugging timeout */
3149 ml_spin_debug_clear(self);
3150#endif
3151
3152 TLOG(1, "thread_continue: calling call_continuation\n");
3153
3154 boolean_t enable_interrupts = thread != THREAD_NULL;
3155 call_continuation(continuation, parameter, self->wait_result, enable_interrupts);
3156 /*NOTREACHED*/
3157}
3158
3159void
3160thread_quantum_init(thread_t thread)
3161{
3162 if (thread->sched_mode == TH_MODE_REALTIME) {
3163 thread->quantum_remaining = thread->realtime.computation;
3164 } else {
3165 thread->quantum_remaining = SCHED(initial_quantum_size)(thread);
3166 }
3167}
3168
3169uint32_t
3170sched_timeshare_initial_quantum_size(thread_t thread)
3171{
3172 if ((thread != THREAD_NULL) && thread->th_sched_bucket == TH_BUCKET_SHARE_BG)
3173 return bg_quantum;
3174 else
3175 return std_quantum;
3176}
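
/*
 * Informal note: realtime threads receive their declared computation time as
 * their quantum (see thread_quantum_init() above), while timeshare threads
 * receive std_quantum, or the smaller bg_quantum for background threads; the
 * concrete values are platform- and configuration-dependent.
 */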
3177
3178/*
3179 * run_queue_init:
3180 *
3181 * Initialize a run queue before first use.
3182 */
3183void
3184run_queue_init(
3185 run_queue_t rq)
3186{
3187 rq->highq = NOPRI;
3188 for (u_int i = 0; i < BITMAP_LEN(NRQS); i++)
3189 rq->bitmap[i] = 0;
3190 rq->urgency = rq->count = 0;
3191 for (int i = 0; i < NRQS; i++)
3192 queue_init(&rq->queues[i]);
3193}
3194
3195/*
3196 * run_queue_dequeue:
3197 *
3198 * Perform a dequeue operation on a run queue,
3199 * and return the resulting thread.
3200 *
3201 * The run queue must be locked (see thread_run_queue_remove()
3202 * for more info), and not empty.
3203 */
3204thread_t
3205run_queue_dequeue(
3206 run_queue_t rq,
3207 integer_t options)
3208{
3209 thread_t thread;
3210 queue_t queue = &rq->queues[rq->highq];
3211
3212 if (options & SCHED_HEADQ) {
3213 thread = qe_dequeue_head(queue, struct thread, runq_links);
3214 } else {
3215 thread = qe_dequeue_tail(queue, struct thread, runq_links);
3216 }
3217
3218 assert(thread != THREAD_NULL);
3219 assert_thread_magic(thread);
3220
3221 thread->runq = PROCESSOR_NULL;
3222 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
3223 rq->count--;
3224 if (SCHED(priority_is_urgent)(rq->highq)) {
3225 rq->urgency--; assert(rq->urgency >= 0);
3226 }
3227 if (queue_empty(queue)) {
3228 bitmap_clear(rq->bitmap, rq->highq);
3229 rq->highq = bitmap_first(rq->bitmap, NRQS);
3230 }
3231
3232 return thread;
3233}
3234
3235/*
3236 * run_queue_enqueue:
3237 *
 * Perform an enqueue operation on a run queue.
3239 *
3240 * The run queue must be locked (see thread_run_queue_remove()
3241 * for more info).
3242 */
3243boolean_t
3244run_queue_enqueue(
3245 run_queue_t rq,
3246 thread_t thread,
3247 integer_t options)
3248{
3249 queue_t queue = &rq->queues[thread->sched_pri];
3250 boolean_t result = FALSE;
3251
3252 assert_thread_magic(thread);
3253
3254 if (queue_empty(queue)) {
3255 enqueue_tail(queue, &thread->runq_links);
3256
3257 rq_bitmap_set(rq->bitmap, thread->sched_pri);
3258 if (thread->sched_pri > rq->highq) {
3259 rq->highq = thread->sched_pri;
3260 result = TRUE;
3261 }
3262 } else {
3263 if (options & SCHED_TAILQ)
3264 enqueue_tail(queue, &thread->runq_links);
3265 else
3266 enqueue_head(queue, &thread->runq_links);
3267 }
3268 if (SCHED(priority_is_urgent)(thread->sched_pri))
3269 rq->urgency++;
3270 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
3271 rq->count++;
3272
3273 return (result);
3274}
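
/*
 * Note (informational): the TRUE return value above indicates that the
 * enqueued thread raised rq->highq, i.e. it is now the highest-priority entry
 * in this run queue, which callers can treat as a hint that a preemption
 * check may be warranted.
 */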
3275
3276/*
3277 * run_queue_remove:
3278 *
3279 * Remove a specific thread from a runqueue.
3280 *
3281 * The run queue must be locked.
3282 */
3283void
3284run_queue_remove(
3285 run_queue_t rq,
3286 thread_t thread)
3287{
3288 assert(thread->runq != PROCESSOR_NULL);
3289 assert_thread_magic(thread);
3290
3291 remqueue(&thread->runq_links);
3292 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
3293 rq->count--;
3294 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
3295 rq->urgency--; assert(rq->urgency >= 0);
3296 }
3297
3298 if (queue_empty(&rq->queues[thread->sched_pri])) {
3299 /* update run queue status */
3300 bitmap_clear(rq->bitmap, thread->sched_pri);
3301 rq->highq = bitmap_first(rq->bitmap, NRQS);
3302 }
3303
3304 thread->runq = PROCESSOR_NULL;
3305}
3306
3307/* Assumes RT lock is not held, and acquires splsched/rt_lock itself */
3308void
3309sched_rtglobal_runq_scan(sched_update_scan_context_t scan_context)
3310{
3311 spl_t s;
3312 thread_t thread;
3313
3314 processor_set_t pset = &pset0;
3315
3316 s = splsched();
3317 rt_lock_lock(pset);
3318
3319 qe_foreach_element_safe(thread, &pset->rt_runq.queue, runq_links) {
3320 if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
3321 scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
3322 }
3323 }
3324
3325 rt_lock_unlock(pset);
3326 splx(s);
3327}
3328
3329int64_t
3330sched_rtglobal_runq_count_sum(void)
3331{
3332 return pset0.rt_runq.runq_stats.count_sum;
3333}
3334
3335/*
3336 * realtime_queue_insert:
3337 *
3338 * Enqueue a thread for realtime execution.
3339 */
3340static boolean_t
3341realtime_queue_insert(processor_t processor, processor_set_t pset, thread_t thread)
3342{
3343 queue_t queue = &SCHED(rt_runq)(pset)->queue;
3344 uint64_t deadline = thread->realtime.deadline;
3345 boolean_t preempt = FALSE;
3346
3347 rt_lock_lock(pset);
3348
3349 if (queue_empty(queue)) {
3350 enqueue_tail(queue, &thread->runq_links);
3351 preempt = TRUE;
3352 } else {
3353 /* Insert into rt_runq in thread deadline order */
3354 queue_entry_t iter;
3355 qe_foreach(iter, queue) {
3356 thread_t iter_thread = qe_element(iter, struct thread, runq_links);
3357 assert_thread_magic(iter_thread);
3358
3359 if (deadline < iter_thread->realtime.deadline) {
3360 if (iter == queue_first(queue))
3361 preempt = TRUE;
3362 insque(&thread->runq_links, queue_prev(iter));
3363 break;
3364 } else if (iter == queue_last(queue)) {
3365 enqueue_tail(queue, &thread->runq_links);
3366 break;
3367 }
3368 }
3369 }
3370
3371 thread->runq = processor;
3372 SCHED_STATS_RUNQ_CHANGE(&SCHED(rt_runq)(pset)->runq_stats, rt_runq_count(pset));
3373 rt_runq_count_incr(pset);
3374
3375 rt_lock_unlock(pset);
3376
3377 return (preempt);
3378}
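
/*
 * Example with hypothetical deadlines: if the queue already holds threads
 * with deadlines 100, 200 and 300, inserting a thread with deadline 150
 * places it between 100 and 200 and returns FALSE; inserting one with
 * deadline 50 places it at the head and returns TRUE, signalling that the
 * new thread is now the earliest deadline on this pset.
 */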
3379
3380/*
3381 * realtime_setrun:
3382 *
3383 * Dispatch a thread for realtime execution.
3384 *
3385 * Thread must be locked. Associated pset must
3386 * be locked, and is returned unlocked.
3387 */
3388static void
3389realtime_setrun(
3390 processor_t processor,
3391 thread_t thread)
3392{
3393 processor_set_t pset = processor->processor_set;
3394 pset_assert_locked(pset);
3395 ast_t preempt;
3396
3397 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
3398
3399 thread->chosen_processor = processor;
3400
3401 /* <rdar://problem/15102234> */
3402 assert(thread->bound_processor == PROCESSOR_NULL);
3403
3404 /*
3405 * Dispatch directly onto idle processor.
3406 */
3407 if ( (thread->bound_processor == processor)
3408 && processor->state == PROCESSOR_IDLE) {
3409
3410 processor->next_thread = thread;
3411 processor_state_update_from_thread(processor, thread);
3412 processor->deadline = thread->realtime.deadline;
3413 pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
3414
3415 ipi_type = sched_ipi_action(processor, thread, true, SCHED_IPI_EVENT_BOUND_THR);
3416 pset_unlock(pset);
3417 sched_ipi_perform(processor, ipi_type);
3418 return;
3419 }
3420
3421 if (processor->current_pri < BASEPRI_RTQUEUES)
3422 preempt = (AST_PREEMPT | AST_URGENT);
3423 else if (thread->realtime.deadline < processor->deadline)
3424 preempt = (AST_PREEMPT | AST_URGENT);
3425 else
3426 preempt = AST_NONE;
3427
3428 realtime_queue_insert(processor, pset, thread);
3429
3430 ipi_type = SCHED_IPI_NONE;
3431 if (preempt != AST_NONE) {
3432 if (processor->state == PROCESSOR_IDLE) {
3433 processor->next_thread = THREAD_NULL;
3434 processor_state_update_from_thread(processor, thread);
3435 processor->deadline = thread->realtime.deadline;
3436 pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
3437 if (processor == current_processor()) {
3438 ast_on(preempt);
3439 } else {
3440 ipi_type = sched_ipi_action(processor, thread, true, SCHED_IPI_EVENT_PREEMPT);
3441 }
3442 } else if (processor->state == PROCESSOR_DISPATCHING) {
3443 if ((processor->next_thread == THREAD_NULL) && ((processor->current_pri < thread->sched_pri) || (processor->deadline > thread->realtime.deadline))) {
3444 processor_state_update_from_thread(processor, thread);
3445 processor->deadline = thread->realtime.deadline;
3446 }
3447 } else {
3448 if (processor == current_processor()) {
3449 ast_on(preempt);
3450 } else {
3451 ipi_type = sched_ipi_action(processor, thread, false, SCHED_IPI_EVENT_PREEMPT);
3452 }
3453 }
3454 } else {
3455 /* Selected processor was too busy, just keep thread enqueued and let other processors drain it naturally. */
3456 }
3457
3458 pset_unlock(pset);
3459 sched_ipi_perform(processor, ipi_type);
3460}
3461
3462
3463sched_ipi_type_t sched_ipi_deferred_policy(processor_set_t pset, processor_t dst,
3464 __unused sched_ipi_event_t event)
3465{
3466#if defined(CONFIG_SCHED_DEFERRED_AST)
3467 if (!bit_test(pset->pending_deferred_AST_cpu_mask, dst->cpu_id)) {
3468 return SCHED_IPI_DEFERRED;
3469 }
3470#else /* CONFIG_SCHED_DEFERRED_AST */
3471 panic("Request for deferred IPI on an unsupported platform; pset: %p CPU: %d", pset, dst->cpu_id);
3472#endif /* CONFIG_SCHED_DEFERRED_AST */
3473 return SCHED_IPI_NONE;
3474}
3475
3476sched_ipi_type_t sched_ipi_action(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
3477{
3478 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
3479 assert(dst != NULL);
3480
3481 processor_set_t pset = dst->processor_set;
3482 if (current_processor() == dst) {
3483 return SCHED_IPI_NONE;
3484 }
3485
3486 if (bit_test(pset->pending_AST_cpu_mask, dst->cpu_id)) {
3487 return SCHED_IPI_NONE;
3488 }
3489
3490 ipi_type = SCHED(ipi_policy)(dst, thread, dst_idle, event);
3491 switch(ipi_type) {
3492 case SCHED_IPI_NONE:
3493 return SCHED_IPI_NONE;
3494#if defined(CONFIG_SCHED_DEFERRED_AST)
3495 case SCHED_IPI_DEFERRED:
3496 bit_set(pset->pending_deferred_AST_cpu_mask, dst->cpu_id);
3497 break;
3498#endif /* CONFIG_SCHED_DEFERRED_AST */
3499 default:
3500 bit_set(pset->pending_AST_cpu_mask, dst->cpu_id);
3501 break;
3502 }
3503 return ipi_type;
3504}
3505
3506sched_ipi_type_t sched_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
3507{
3508 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
3509 boolean_t deferred_ipi_supported = false;
3510 processor_set_t pset = dst->processor_set;
3511
3512#if defined(CONFIG_SCHED_DEFERRED_AST)
3513 deferred_ipi_supported = true;
3514#endif /* CONFIG_SCHED_DEFERRED_AST */
3515
3516 switch(event) {
3517 case SCHED_IPI_EVENT_SPILL:
3518 case SCHED_IPI_EVENT_SMT_REBAL:
3519 case SCHED_IPI_EVENT_REBALANCE:
3520 case SCHED_IPI_EVENT_BOUND_THR:
3521 /*
	 * The spill, SMT rebalance, rebalance and bound thread
	 * scenarios always use immediate IPIs.
3524 */
3525 ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
3526 break;
3527 case SCHED_IPI_EVENT_PREEMPT:
3528 /* In the preemption case, use immediate IPIs for RT threads */
3529 if (thread && (thread->sched_pri >= BASEPRI_RTQUEUES)) {
3530 ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
3531 break;
3532 }
3533
3534 /*
	 * For non-RT thread preemption:
	 * if the core is active, use an immediate IPI;
	 * if the core is idle, use a deferred IPI if supported, otherwise an immediate IPI.
3538 */
3539 if (deferred_ipi_supported && dst_idle) {
3540 return sched_ipi_deferred_policy(pset, dst, event);
3541 }
3542 ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
3543 break;
3544 default:
3545 panic("Unrecognized scheduler IPI event type %d", event);
3546 }
3547 assert(ipi_type != SCHED_IPI_NONE);
3548 return ipi_type;
3549}
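
/*
 * Informal summary of the default policy above: spill, SMT rebalance,
 * rebalance and bound-thread events always use an immediate signal
 * (SCHED_IPI_IDLE for an idle target, SCHED_IPI_IMMEDIATE otherwise);
 * preemption uses an immediate signal for realtime threads and for busy
 * targets, and a deferred signal for idle targets when
 * CONFIG_SCHED_DEFERRED_AST is available (or none if a deferred signal is
 * already pending there).
 */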
3550
3551void sched_ipi_perform(processor_t dst, sched_ipi_type_t ipi)
3552{
3553 switch (ipi) {
3554 case SCHED_IPI_NONE:
3555 break;
3556 case SCHED_IPI_IDLE:
3557 machine_signal_idle(dst);
3558 break;
3559 case SCHED_IPI_IMMEDIATE:
3560 cause_ast_check(dst);
3561 break;
3562 case SCHED_IPI_DEFERRED:
3563 machine_signal_idle_deferred(dst);
3564 break;
3565 default:
3566 panic("Unrecognized scheduler IPI type: %d", ipi);
3567 }
3568}
3569
3570#if defined(CONFIG_SCHED_TIMESHARE_CORE)
3571
3572boolean_t
3573priority_is_urgent(int priority)
3574{
3575 return bitmap_test(sched_preempt_pri, priority) ? TRUE : FALSE;
3576}
3577
3578#endif /* CONFIG_SCHED_TIMESHARE_CORE */
3579
3580/*
3581 * processor_setrun:
3582 *
3583 * Dispatch a thread for execution on a
3584 * processor.
3585 *
3586 * Thread must be locked. Associated pset must
3587 * be locked, and is returned unlocked.
3588 */
3589static void
3590processor_setrun(
3591 processor_t processor,
3592 thread_t thread,
3593 integer_t options)
3594{
3595 processor_set_t pset = processor->processor_set;
3596 pset_assert_locked(pset);
3597 ast_t preempt;
3598 enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing;
3599
3600 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
3601
3602 thread->chosen_processor = processor;
3603
3604 /*
3605 * Dispatch directly onto idle processor.
3606 */
3607 if ( (SCHED(direct_dispatch_to_idle_processors) ||
3608 thread->bound_processor == processor)
3609 && processor->state == PROCESSOR_IDLE) {
3610
3611 processor->next_thread = thread;
3612 processor_state_update_from_thread(processor, thread);
3613 processor->deadline = UINT64_MAX;
3614 pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
3615
3616 ipi_type = sched_ipi_action(processor, thread, true, SCHED_IPI_EVENT_BOUND_THR);
3617 pset_unlock(pset);
3618 sched_ipi_perform(processor, ipi_type);
3619 return;
3620 }
3621
3622 /*
3623 * Set preemption mode.
3624 */
3625#if defined(CONFIG_SCHED_DEFERRED_AST)
3626 /* TODO: Do we need to care about urgency (see rdar://problem/20136239)? */
3627#endif
3628 if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri)
3629 preempt = (AST_PREEMPT | AST_URGENT);
	else if (processor->active_thread && thread_eager_preemption(processor->active_thread))
		preempt = (AST_PREEMPT | AST_URGENT);
	else if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->sched_pri < thread->base_pri)) {
		if (SCHED(priority_is_urgent)(thread->base_pri) && thread->sched_pri > processor->current_pri) {
			preempt = (options & SCHED_PREEMPT) ? AST_PREEMPT : AST_NONE;
		} else {
			preempt = AST_NONE;
		}
	} else
		preempt = (options & SCHED_PREEMPT) ? AST_PREEMPT : AST_NONE;
3640
3641 if ((options & (SCHED_PREEMPT|SCHED_REBALANCE)) == (SCHED_PREEMPT|SCHED_REBALANCE)) {
3642 /*
3643 * Having gone to the trouble of forcing this thread off a less preferred core,
3644 * we should force the preferable core to reschedule immediately to give this
3645 * thread a chance to run instead of just sitting on the run queue where
3646 * it may just be stolen back by the idle core we just forced it off.
3647 */
3648 preempt |= AST_PREEMPT;
3649 }
3650
3651 SCHED(processor_enqueue)(processor, thread, options);
3652 sched_update_pset_load_average(pset);
3653
3654 if (preempt != AST_NONE) {
3655 if (processor->state == PROCESSOR_IDLE) {
3656 processor->next_thread = THREAD_NULL;
3657 processor_state_update_from_thread(processor, thread);
3658 processor->deadline = UINT64_MAX;
3659 pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
3660 ipi_action = eExitIdle;
3661 } else if ( processor->state == PROCESSOR_DISPATCHING) {
3662 if ((processor->next_thread == THREAD_NULL) && (processor->current_pri < thread->sched_pri)) {
3663 processor_state_update_from_thread(processor, thread);
3664 processor->deadline = UINT64_MAX;
3665 }
3666 } else if ( (processor->state == PROCESSOR_RUNNING ||
3667 processor->state == PROCESSOR_SHUTDOWN) &&
3668 (thread->sched_pri >= processor->current_pri)) {
3669 ipi_action = eInterruptRunning;
3670 }
3671 } else {
3672 /*
		 * New thread is not important enough to preempt what is running, but
		 * special processor states may need special handling.
3675 */
3676 if (processor->state == PROCESSOR_SHUTDOWN &&
3677 thread->sched_pri >= processor->current_pri ) {
3678 ipi_action = eInterruptRunning;
3679 } else if (processor->state == PROCESSOR_IDLE) {
3680
3681 processor->next_thread = THREAD_NULL;
3682 processor_state_update_from_thread(processor, thread);
3683 processor->deadline = UINT64_MAX;
3684 pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
3685
3686 ipi_action = eExitIdle;
3687 }
3688 }
3689
3690 if (ipi_action != eDoNothing) {
3691 if (processor == current_processor()) {
3692 if (csw_check_locked(processor, pset, AST_NONE) != AST_NONE)
3693 ast_on(preempt);
3694 } else {
3695 sched_ipi_event_t event = (options & SCHED_REBALANCE) ? SCHED_IPI_EVENT_REBALANCE : SCHED_IPI_EVENT_PREEMPT;
3696 ipi_type = sched_ipi_action(processor, thread, (ipi_action == eExitIdle), event);
3697 }
3698 }
3699 pset_unlock(pset);
3700 sched_ipi_perform(processor, ipi_type);
3701}
3702
3703/*
3704 * choose_next_pset:
3705 *
3706 * Return the next sibling pset containing
3707 * available processors.
3708 *
3709 * Returns the original pset if none other is
3710 * suitable.
3711 */
3712static processor_set_t
3713choose_next_pset(
3714 processor_set_t pset)
3715{
3716 processor_set_t nset = pset;
3717
3718 do {
3719 nset = next_pset(nset);
3720 } while (nset->online_processor_count < 1 && nset != pset);
3721
3722 return (nset);
3723}
3724
3725/*
3726 * choose_processor:
3727 *
3728 * Choose a processor for the thread, beginning at
3729 * the pset. Accepts an optional processor hint in
3730 * the pset.
3731 *
3732 * Returns a processor, possibly from a different pset.
3733 *
3734 * The thread must be locked. The pset must be locked,
3735 * and the resulting pset is locked on return.
3736 */
3737processor_t
3738choose_processor(
3739 processor_set_t starting_pset,
3740 processor_t processor,
3741 thread_t thread)
3742{
3743 processor_set_t pset = starting_pset;
3744 processor_set_t nset;
3745
3746 assert(thread->sched_pri <= BASEPRI_RTQUEUES);
3747
3748 /*
3749 * Prefer the hinted processor, when appropriate.
3750 */
3751
3752 /* Fold last processor hint from secondary processor to its primary */
3753 if (processor != PROCESSOR_NULL) {
3754 processor = processor->processor_primary;
3755 }
3756
3757 /*
3758 * Only consult platform layer if pset is active, which
3759 * it may not be in some cases when a multi-set system
3760 * is going to sleep.
3761 */
3762 if (pset->online_processor_count) {
3763 if ((processor == PROCESSOR_NULL) || (processor->processor_set == pset && processor->state == PROCESSOR_IDLE)) {
3764 processor_t mc_processor = machine_choose_processor(pset, processor);
3765 if (mc_processor != PROCESSOR_NULL)
3766 processor = mc_processor->processor_primary;
3767 }
3768 }
3769
3770 /*
3771 * At this point, we may have a processor hint, and we may have
3772 * an initial starting pset. If the hint is not in the pset, or
3773 * if the hint is for a processor in an invalid state, discard
3774 * the hint.
3775 */
3776 if (processor != PROCESSOR_NULL) {
3777 if (processor->processor_set != pset) {
3778 processor = PROCESSOR_NULL;
3779 } else if (!processor->is_recommended) {
3780 processor = PROCESSOR_NULL;
3781 } else {
3782 switch (processor->state) {
3783 case PROCESSOR_START:
3784 case PROCESSOR_SHUTDOWN:
3785 case PROCESSOR_OFF_LINE:
3786 /*
3787 * Hint is for a processor that cannot support running new threads.
3788 */
3789 processor = PROCESSOR_NULL;
3790 break;
3791 case PROCESSOR_IDLE:
3792 /*
3793 * Hint is for an idle processor. Assume it is no worse than any other
3794 * idle processor. The platform layer had an opportunity to provide
3795 * the "least cost idle" processor above.
3796 */
3797 return (processor);
3798 case PROCESSOR_RUNNING:
3799 case PROCESSOR_DISPATCHING:
3800 /*
3801 * Hint is for an active CPU. This fast-path allows
3802 * realtime threads to preempt non-realtime threads
3803 * to regain their previous executing processor.
3804 */
3805 if ((thread->sched_pri >= BASEPRI_RTQUEUES) &&
3806 (processor->current_pri < BASEPRI_RTQUEUES))
3807 return (processor);
3808
3809 /* Otherwise, use hint as part of search below */
3810 break;
3811 default:
3812 processor = PROCESSOR_NULL;
3813 break;
3814 }
3815 }
3816 }
3817
3818 /*
3819 * Iterate through the processor sets to locate
3820 * an appropriate processor. Seed results with
3821 * a last-processor hint, if available, so that
3822 * a search must find something strictly better
3823 * to replace it.
3824 *
3825	 * A primary/secondary pair of SMT processors is
3826 * "unpaired" if the primary is busy but its
3827 * corresponding secondary is idle (so the physical
3828 * core has full use of its resources).
3829 */
3830
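	/*
	 * Naming convention for the candidates tracked below: "lp_" tracks the
	 * lowest-priority processors seen, "lc_" the lowest run-queue count,
	 * and "fd_" the furthest realtime deadline; the hint (if any) seeds
	 * lp_/lc_/fd_ so the scan must find something strictly better.
	 */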
3831 integer_t lowest_priority = MAXPRI + 1;
3832 integer_t lowest_secondary_priority = MAXPRI + 1;
3833 integer_t lowest_unpaired_primary_priority = MAXPRI + 1;
3834 integer_t lowest_count = INT_MAX;
3835 uint64_t furthest_deadline = 1;
3836 processor_t lp_processor = PROCESSOR_NULL;
3837 processor_t lp_unpaired_primary_processor = PROCESSOR_NULL;
3838 processor_t lp_unpaired_secondary_processor = PROCESSOR_NULL;
3839 processor_t lp_paired_secondary_processor = PROCESSOR_NULL;
3840 processor_t lc_processor = PROCESSOR_NULL;
3841 processor_t fd_processor = PROCESSOR_NULL;
3842
3843 if (processor != PROCESSOR_NULL) {
3844 /* All other states should be enumerated above. */
3845 assert(processor->state == PROCESSOR_RUNNING || processor->state == PROCESSOR_DISPATCHING);
3846
3847 lowest_priority = processor->current_pri;
3848 lp_processor = processor;
3849
3850 if (processor->current_pri >= BASEPRI_RTQUEUES) {
3851 furthest_deadline = processor->deadline;
3852 fd_processor = processor;
3853 }
3854
3855 lowest_count = SCHED(processor_runq_count)(processor);
3856 lc_processor = processor;
3857 }
3858
3859 do {
3860 /*
3861 * Choose an idle processor, in pset traversal order
3862 */
3863
3864 uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] &
3865 pset->primary_map &
3866 pset->recommended_bitmask &
3867 ~pset->pending_AST_cpu_mask);
3868
3869 int cpuid = lsb_first(idle_primary_map);
3870 if (cpuid >= 0) {
3871 processor = processor_array[cpuid];
3872 return processor;
3873 }
3874
3875		/*
3876		 * Otherwise, enumerate the active (running or dispatching) processors to find
3877		 * candidates with the lowest priority, shortest run queue, or furthest realtime deadline.
3878		 */
3879
3880 uint64_t active_map = ((pset->cpu_state_map[PROCESSOR_RUNNING] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
3881 pset->recommended_bitmask &
3882 ~pset->pending_AST_cpu_mask);
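		/*
		 * Rotating the map right by (last_chosen + 1) makes lsb_first() visit
		 * active CPUs starting just after the last chosen one, so placements
		 * rotate around the pset; "(rotid + last_chosen + 1) & 63" below maps
		 * each rotated bit index back to the real cpuid (mod 64).
		 */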
3883 active_map = bit_ror64(active_map, (pset->last_chosen + 1));
3884 for (int rotid = lsb_first(active_map); rotid >= 0; rotid = lsb_next(active_map, rotid)) {
3885 cpuid = ((rotid + pset->last_chosen + 1) & 63);
3886 processor = processor_array[cpuid];
3887
3888 integer_t cpri = processor->current_pri;
3889 if (processor->processor_primary != processor) {
3890 if (cpri < lowest_secondary_priority) {
3891 lowest_secondary_priority = cpri;
3892 lp_paired_secondary_processor = processor;
3893 }
3894 } else {
3895 if (cpri < lowest_priority) {
3896 lowest_priority = cpri;
3897 lp_processor = processor;
3898 }
3899 }
3900
3901 if ((cpri >= BASEPRI_RTQUEUES) && (processor->deadline > furthest_deadline)) {
3902 furthest_deadline = processor->deadline;
3903 fd_processor = processor;
3904 }
3905
3906 integer_t ccount = SCHED(processor_runq_count)(processor);
3907 if (ccount < lowest_count) {
3908 lowest_count = ccount;
3909 lc_processor = processor;
3910 }
3911 }
3912
3913 /*
3914		 * For SMT configs, these idle secondary processors must have an active primary;
3915		 * otherwise the idle primary would have short-circuited the loop above.
3916 */
3917 uint64_t idle_secondary_map = (pset->cpu_state_map[PROCESSOR_IDLE] &
3918 ~pset->primary_map &
3919 pset->recommended_bitmask &
3920 ~pset->pending_AST_cpu_mask);
3921
3922 for (cpuid = lsb_first(idle_secondary_map); cpuid >= 0; cpuid = lsb_next(idle_secondary_map, cpuid)) {
3923 processor = processor_array[cpuid];
3924
3925 processor_t cprimary = processor->processor_primary;
3926
3927 if (!cprimary->is_recommended) {
3928 continue;
3929 }
3930 if (bit_test(pset->pending_AST_cpu_mask, cprimary->cpu_id)) {
3931 continue;
3932 }
3933
3934 /* If the primary processor is offline or starting up, it's not a candidate for this path */
3935 if (cprimary->state == PROCESSOR_RUNNING || cprimary->state == PROCESSOR_DISPATCHING) {
3936 integer_t primary_pri = cprimary->current_pri;
3937
3938 if (primary_pri < lowest_unpaired_primary_priority) {
3939 lowest_unpaired_primary_priority = primary_pri;
3940 lp_unpaired_primary_processor = cprimary;
3941 lp_unpaired_secondary_processor = processor;
3942 }
3943 }
3944 }
3945
3946
3947 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
3948
3949 /*
3950 * For realtime threads, the most important aspect is
3951 * scheduling latency, so we attempt to assign threads
3952 * to good preemption candidates (assuming an idle primary
3953 * processor was not available above).
3954 */
3955
3956 if (thread->sched_pri > lowest_unpaired_primary_priority) {
3957 pset->last_chosen = lp_unpaired_primary_processor->cpu_id;
3958 return lp_unpaired_primary_processor;
3959 }
3960 if (thread->sched_pri > lowest_priority) {
3961 pset->last_chosen = lp_processor->cpu_id;
3962 return lp_processor;
3963 }
3964 if (sched_allow_rt_smt && (thread->sched_pri > lowest_secondary_priority)) {
3965 pset->last_chosen = lp_paired_secondary_processor->cpu_id;
3966 return lp_paired_secondary_processor;
3967 }
3968 if (thread->realtime.deadline < furthest_deadline)
3969 return fd_processor;
3970
3971 /*
3972 * If all primary and secondary CPUs are busy with realtime
3973			 * threads with deadlines earlier than ours, move on to the
3974			 * next pset.
3975 */
3976 }
3977 else {
3978
3979 if (thread->sched_pri > lowest_unpaired_primary_priority) {
3980 pset->last_chosen = lp_unpaired_primary_processor->cpu_id;
3981 return lp_unpaired_primary_processor;
3982 }
3983 if (thread->sched_pri > lowest_priority) {
3984 pset->last_chosen = lp_processor->cpu_id;
3985 return lp_processor;
3986 }
3987
3988 /*
3989			 * If all primary processors in this pset are running higher
3990			 * priority threads, move on to the next pset. Only when we have
3991 * exhausted this search do we fall back to other heuristics.
3992 */
3993 }
3994
3995 /*
3996 * Move onto the next processor set.
3997 */
3998 nset = next_pset(pset);
3999
4000 if (nset != starting_pset) {
4001 pset_unlock(pset);
4002
4003 pset = nset;
4004 pset_lock(pset);
4005 }
4006 } while (nset != starting_pset);
4007
4008 /*
4009 * Make sure that we pick a running processor,
4010 * and that the correct processor set is locked.
4011	 * Since we may have unlocked the candidate processor's
4012	 * pset, it may have changed state.
4013	 *
4014	 * All primary processors are running higher priority
4015	 * threads, so the only options left are enqueuing on
4016	 * the secondary processor that would perturb the lowest-priority
4017	 * primary, or the least busy primary.
4018 */
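	/*
	 * Fallback preference order below: the idle secondary whose busy primary
	 * had the lowest priority ("unpaired"), then the lowest-priority active
	 * secondary, then the processor with the shortest run queue, and finally
	 * master_processor as the last resort.
	 */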
4019 do {
4020
4021 /* lowest_priority is evaluated in the main loops above */
4022 if (lp_unpaired_secondary_processor != PROCESSOR_NULL) {
4023 processor = lp_unpaired_secondary_processor;
4024 lp_unpaired_secondary_processor = PROCESSOR_NULL;
4025 } else if (lp_paired_secondary_processor != PROCESSOR_NULL) {
4026 processor = lp_paired_secondary_processor;
4027 lp_paired_secondary_processor = PROCESSOR_NULL;
4028 } else if (lc_processor != PROCESSOR_NULL) {
4029 processor = lc_processor;
4030 lc_processor = PROCESSOR_NULL;
4031 } else {
4032 /*
4033 * All processors are executing higher
4034 * priority threads, and the lowest_count
4035 * candidate was not usable
4036 */
4037 processor = master_processor;
4038 }
4039
4040 /*
4041 * Check that the correct processor set is
4042 * returned locked.
4043 */
4044 if (pset != processor->processor_set) {
4045 pset_unlock(pset);
4046 pset = processor->processor_set;
4047 pset_lock(pset);
4048 }
4049
4050 /*
4051 * We must verify that the chosen processor is still available.
4052 * master_processor is an exception, since we may need to preempt
4053 * a running thread on it during processor shutdown (for sleep),
4054 * and that thread needs to be enqueued on its runqueue to run
4055 * when the processor is restarted.
4056 */
4057 if (processor != master_processor && (processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE))
4058 processor = PROCESSOR_NULL;
4059
4060 } while (processor == PROCESSOR_NULL);
4061
4062 pset->last_chosen = processor->cpu_id;
4063 return processor;
4064}
4065
4066/*
4067 * thread_setrun:
4068 *
4069 * Dispatch thread for execution, onto an idle
4070 * processor or run queue, and signal a preemption
4071 * as appropriate.
4072 *
4073 * Thread must be locked.
4074 */
4075void
4076thread_setrun(
4077 thread_t thread,
4078 integer_t options)
4079{
4080 processor_t processor;
4081 processor_set_t pset;
4082
4083 assert((thread->state & (TH_RUN|TH_WAIT|TH_UNINT|TH_TERMINATE|TH_TERMINATE2)) == TH_RUN);
4084 assert(thread->runq == PROCESSOR_NULL);
4085
4086 /*
4087 * Update priority if needed.
4088 */
4089 if (SCHED(can_update_priority)(thread))
4090 SCHED(update_priority)(thread);
4091
4092 thread->sfi_class = sfi_thread_classify(thread);
4093
4094 assert(thread->runq == PROCESSOR_NULL);
4095
4096#if __SMP__
4097 if (thread->bound_processor == PROCESSOR_NULL) {
4098 /*
4099 * Unbound case.
4100 */
4101 if (thread->affinity_set != AFFINITY_SET_NULL) {
4102 /*
4103 * Use affinity set policy hint.
4104 */
4105 pset = thread->affinity_set->aset_pset;
4106 pset_lock(pset);
4107
4108 processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
4109 pset = processor->processor_set;
4110
4111 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
4112 (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
4113 } else if (thread->last_processor != PROCESSOR_NULL) {
4114 /*
4115 * Simple (last processor) affinity case.
4116 */
4117 processor = thread->last_processor;
4118 pset = processor->processor_set;
4119 pset_lock(pset);
4120 processor = SCHED(choose_processor)(pset, processor, thread);
4121 pset = processor->processor_set;
4122
4123 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
4124 (uintptr_t)thread_tid(thread), thread->last_processor->cpu_id, processor->cpu_id, processor->state, 0);
4125 } else {
4126 /*
4127 * No Affinity case:
4128 *
4129			 * Utilize a per-task hint to spread threads
4130 * among the available processor sets.
4131 */
4132 task_t task = thread->task;
4133
4134 pset = task->pset_hint;
4135 if (pset == PROCESSOR_SET_NULL)
4136 pset = current_processor()->processor_set;
4137
4138 pset = choose_next_pset(pset);
4139 pset_lock(pset);
4140
4141 processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
4142 pset = processor->processor_set;
4143 task->pset_hint = pset;
4144
4145 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
4146 (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
4147 }
4148 } else {
4149 /*
4150 * Bound case:
4151 *
4152 * Unconditionally dispatch on the processor.
4153 */
4154 processor = thread->bound_processor;
4155 pset = processor->processor_set;
4156 pset_lock(pset);
4157
4158 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
4159 (uintptr_t)thread_tid(thread), (uintptr_t)-2, processor->cpu_id, processor->state, 0);
4160 }
4161#else /* !__SMP__ */
4162 /* Only one processor to choose */
4163 assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == master_processor);
4164 processor = master_processor;
4165 pset = processor->processor_set;
4166 pset_lock(pset);
4167#endif /* !__SMP__ */
4168
4169 /*
4170 * Dispatch the thread on the chosen processor.
4171 * TODO: This should be based on sched_mode, not sched_pri
4172 */
4173 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
4174 realtime_setrun(processor, thread);
4175 } else {
4176 processor_setrun(processor, thread, options);
4177 }
4178 /* pset is now unlocked */
4179 if (thread->bound_processor == PROCESSOR_NULL) {
4180 SCHED(check_spill)(pset, thread);
4181 }
4182}
4183
4184processor_set_t
4185task_choose_pset(
4186 task_t task)
4187{
4188 processor_set_t pset = task->pset_hint;
4189
4190 if (pset != PROCESSOR_SET_NULL)
4191 pset = choose_next_pset(pset);
4192
4193 return (pset);
4194}
4195
4196/*
4197 * Check for a preemption point in
4198 * the current context.
4199 *
4200 * Called at splsched with thread locked.
4201 */
4202ast_t
4203csw_check(
4204 processor_t processor,
4205 ast_t check_reason)
4206{
4207 processor_set_t pset = processor->processor_set;
4208 ast_t result;
4209
4210 pset_lock(pset);
4211
4212 /* If we were sent a remote AST and interrupted a running processor, acknowledge it here with pset lock held */
4213 bit_clear(pset->pending_AST_cpu_mask, processor->cpu_id);
4214
4215 result = csw_check_locked(processor, pset, check_reason);
4216
4217 pset_unlock(pset);
4218
4219 return result;
4220}
4221
4222/*
4223 * Check for preemption at splsched with
4224 * pset and thread locked
4225 */
4226ast_t
4227csw_check_locked(
4228 processor_t processor,
4229 processor_set_t pset,
4230 ast_t check_reason)
4231{
4232 ast_t result;
4233 thread_t thread = processor->active_thread;
4234
4235 if (processor->first_timeslice) {
4236 if (rt_runq_count(pset) > 0)
4237 return (check_reason | AST_PREEMPT | AST_URGENT);
4238 }
4239 else {
4240 if (rt_runq_count(pset) > 0) {
4241 if (BASEPRI_RTQUEUES > processor->current_pri)
4242 return (check_reason | AST_PREEMPT | AST_URGENT);
4243 else
4244 return (check_reason | AST_PREEMPT);
4245 }
4246 }
4247
4248#if __SMP__
4249 /*
4250 * If the current thread is running on a processor that is no longer recommended,
4251 * urgently preempt it, at which point thread_select() should
4252 * try to idle the processor and re-dispatch the thread to a recommended processor.
4253 */
4254 if (!processor->is_recommended) {
4255 return (check_reason | AST_PREEMPT | AST_URGENT);
4256 }
4257#endif
4258
4259 result = SCHED(processor_csw_check)(processor);
4260 if (result != AST_NONE)
4261 return (check_reason | result | (thread_eager_preemption(thread) ? AST_URGENT : AST_NONE));
4262
4263#if __SMP__
4264 /*
4265 * Same for avoid-processor
4266 *
4267 * TODO: Should these set AST_REBALANCE?
4268 */
4269 if (SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread)) {
4270 return (check_reason | AST_PREEMPT);
4271 }
4272
4273 /*
4274 * Even though we could continue executing on this processor, a
4275 * secondary SMT core should try to shed load to another primary core.
4276 *
4277 * TODO: Should this do the same check that thread_select does? i.e.
4278	 * if no bound threads target this processor, and idle primaries exist, preempt.
4279	 * The case of RT threads existing is already taken care of above.
4280 */
4281
4282 if (processor->current_pri < BASEPRI_RTQUEUES &&
4283 processor->processor_primary != processor)
4284 return (check_reason | AST_PREEMPT);
4285#endif
4286
4287 if (thread->state & TH_SUSP)
4288 return (check_reason | AST_PREEMPT);
4289
4290#if CONFIG_SCHED_SFI
4291 /*
4292 * Current thread may not need to be preempted, but maybe needs
4293 * an SFI wait?
4294 */
4295 result = sfi_thread_needs_ast(thread, NULL);
4296 if (result != AST_NONE)
4297 return (check_reason | result);
4298#endif
4299
4300 return (AST_NONE);
4301}
4302
4303/*
4304 * set_sched_pri:
4305 *
4306 * Set the scheduled priority of the specified thread.
4307 *
4308 * This may cause the thread to change queues.
4309 *
4310 * Thread must be locked.
4311 */
4312void
4313set_sched_pri(
4314 thread_t thread,
4315 int new_priority,
4316 set_sched_pri_options_t options)
4317{
4318 thread_t cthread = current_thread();
4319 boolean_t is_current_thread = (thread == cthread) ? TRUE : FALSE;
4320 int curgency, nurgency;
4321 uint64_t urgency_param1, urgency_param2;
4322 boolean_t removed_from_runq = FALSE;
4323
4324 bool lazy_update = ((options & SETPRI_LAZY) == SETPRI_LAZY);
4325
4326 int old_priority = thread->sched_pri;
4327
4328 /* If we're already at this priority, no need to mess with the runqueue */
4329 if (new_priority == old_priority)
4330 return;
4331
4332 if (is_current_thread) {
4333 assert(thread->runq == PROCESSOR_NULL);
4334 curgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
4335 } else {
4336 removed_from_runq = thread_run_queue_remove(thread);
4337 }
4338
4339 thread->sched_pri = new_priority;
4340
4341 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
4342 (uintptr_t)thread_tid(thread),
4343 thread->base_pri,
4344 thread->sched_pri,
4345 thread->sched_usage,
4346 0);
4347
4348 if (is_current_thread) {
4349 nurgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
4350 /*
4351 * set_sched_pri doesn't alter RT params. We expect direct base priority/QoS
4352 * class alterations from user space to occur relatively infrequently, hence
4353 * those are lazily handled. QoS classes have distinct priority bands, and QoS
4354 * inheritance is expected to involve priority changes.
4355 */
4356 uint64_t ctime = mach_approximate_time();
4357 if (nurgency != curgency) {
4358 thread_tell_urgency(nurgency, urgency_param1, urgency_param2, 0, thread);
4359 }
4360 machine_thread_going_on_core(thread, nurgency, 0, 0, ctime);
4361 }
4362
4363 if (removed_from_runq)
4364 thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
4365 else if (thread->state & TH_RUN) {
4366 processor_t processor = thread->last_processor;
4367
4368 if (is_current_thread) {
4369 processor_state_update_from_thread(processor, thread);
4370
4371 /*
4372 * When dropping in priority, check if the thread no longer belongs on core.
4373 * If a thread raises its own priority, don't aggressively rebalance it.
4374 * <rdar://problem/31699165>
4375 */
4376 if (!lazy_update && new_priority < old_priority) {
4377 ast_t preempt;
4378
4379 if ((preempt = csw_check(processor, AST_NONE)) != AST_NONE)
4380 ast_on(preempt);
4381 }
4382 } else if (!lazy_update && processor != PROCESSOR_NULL &&
4383 processor != current_processor() && processor->active_thread == thread) {
4384 cause_ast_check(processor);
4385 }
4386 }
4387}
4388
4389/*
4390 * thread_run_queue_remove_for_handoff
4391 *
4392 * Pull a thread or its (recursive) push target out of the runqueue
4393 * so that it is ready for thread_run()
4394 *
4395 * Called at splsched
4396 *
4397 * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
4398 * This may be different than the thread that was passed in.
4399 */
4400thread_t
4401thread_run_queue_remove_for_handoff(thread_t thread)
4402{
4403 thread_t pulled_thread = THREAD_NULL;
4404
4405 thread_lock(thread);
4406
4407 /*
4408 * Check that the thread is not bound
4409 * to a different processor, and that realtime
4410 * is not involved.
4411 *
4412 * Next, pull it off its run queue. If it
4413 * doesn't come, it's not eligible.
4414 */
4415
4416 processor_t processor = current_processor();
4417 if (processor->current_pri < BASEPRI_RTQUEUES && thread->sched_pri < BASEPRI_RTQUEUES &&
4418 (thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)) {
4419
4420 if (thread_run_queue_remove(thread))
4421 pulled_thread = thread;
4422 }
4423
4424 thread_unlock(thread);
4425
4426 return pulled_thread;
4427}
4428
4429/*
4430 * thread_run_queue_remove:
4431 *
4432 * Remove a thread from its current run queue and
4433 * return TRUE if successful.
4434 *
4435 * Thread must be locked.
4436 *
4437 * If thread->runq is PROCESSOR_NULL, the thread will not re-enter the
4438 * run queues because the caller locked the thread. Otherwise
4439 * the thread is on a run queue, but could be chosen for dispatch
4440 * and removed by another processor under a different lock, which
4441 * will set thread->runq to PROCESSOR_NULL.
4442 *
4443 * Hence the thread select path must not rely on anything that could
4444 * be changed under the thread lock after calling this function,
4445 * most importantly thread->sched_pri.
4446 */
4447boolean_t
4448thread_run_queue_remove(
4449 thread_t thread)
4450{
4451 boolean_t removed = FALSE;
4452 processor_t processor = thread->runq;
4453
4454 if ((thread->state & (TH_RUN|TH_WAIT)) == TH_WAIT) {
4455 /* Thread isn't runnable */
4456 assert(thread->runq == PROCESSOR_NULL);
4457 return FALSE;
4458 }
4459
4460 if (processor == PROCESSOR_NULL) {
4461 /*
4462 * The thread is either not on the runq,
4463 * or is in the midst of being removed from the runq.
4464 *
4465 * runq is set to NULL under the pset lock, not the thread
4466 * lock, so the thread may still be in the process of being dequeued
4467 * from the runq. It will wait in invoke for the thread lock to be
4468 * dropped.
4469 */
4470
4471 return FALSE;
4472 }
4473
4474 if (thread->sched_pri < BASEPRI_RTQUEUES) {
4475 return SCHED(processor_queue_remove)(processor, thread);
4476 }
4477
4478 processor_set_t pset = processor->processor_set;
4479
4480 rt_lock_lock(pset);
4481
4482 if (thread->runq != PROCESSOR_NULL) {
4483 /*
4484 * Thread is on the RT run queue and we have a lock on
4485 * that run queue.
4486 */
4487
4488 remqueue(&thread->runq_links);
4489 SCHED_STATS_RUNQ_CHANGE(&SCHED(rt_runq)(pset)->runq_stats, rt_runq_count(pset));
4490 rt_runq_count_decr(pset);
4491
4492 thread->runq = PROCESSOR_NULL;
4493
4494 removed = TRUE;
4495 }
4496
4497 rt_lock_unlock(pset);
4498
4499 return (removed);
4500}
4501
4502/*
4503 * Put the thread back where it goes after a thread_run_queue_remove
4504 *
4505 * Thread must have been removed under the same thread lock hold
4506 *
4507 * thread locked, at splsched
4508 */
4509void
4510thread_run_queue_reinsert(thread_t thread, integer_t options)
4511{
4512 assert(thread->runq == PROCESSOR_NULL);
4513 assert(thread->state & (TH_RUN));
4514
4515 thread_setrun(thread, options);
4516}
4517
4518void
4519sys_override_cpu_throttle(boolean_t enable_override)
4520{
4521 if (enable_override)
4522 cpu_throttle_enabled = 0;
4523 else
4524 cpu_throttle_enabled = 1;
4525}
4526
4527int
4528thread_get_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2)
4529{
4530 if (thread == NULL || (thread->state & TH_IDLE)) {
4531 *arg1 = 0;
4532 *arg2 = 0;
4533
4534 return (THREAD_URGENCY_NONE);
4535 } else if (thread->sched_mode == TH_MODE_REALTIME) {
4536 *arg1 = thread->realtime.period;
4537 *arg2 = thread->realtime.deadline;
4538
4539 return (THREAD_URGENCY_REAL_TIME);
4540 } else if (cpu_throttle_enabled &&
4541 ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
4542 /*
4543		 * Background urgency is applied when both the scheduled and base priorities are at or below MAXPRI_THROTTLE (i.e. the thread is not promoted)
4544 */
4545 *arg1 = thread->sched_pri;
4546 *arg2 = thread->base_pri;
4547
4548 return (THREAD_URGENCY_BACKGROUND);
4549 } else {
4550 /* For otherwise unclassified threads, report throughput QoS
4551 * parameters
4552 */
4553 *arg1 = proc_get_effective_thread_policy(thread, TASK_POLICY_THROUGH_QOS);
4554 *arg2 = proc_get_effective_task_policy(thread->task, TASK_POLICY_THROUGH_QOS);
4555
4556 return (THREAD_URGENCY_NORMAL);
4557 }
4558}
4559
4560perfcontrol_class_t
4561thread_get_perfcontrol_class(thread_t thread)
4562{
4563 /* Special case handling */
4564 if (thread->state & TH_IDLE)
4565 return PERFCONTROL_CLASS_IDLE;
4566 if (thread->task == kernel_task)
4567 return PERFCONTROL_CLASS_KERNEL;
4568 if (thread->sched_mode == TH_MODE_REALTIME)
4569 return PERFCONTROL_CLASS_REALTIME;
4570
4571 /* perfcontrol_class based on base_pri */
4572 if (thread->base_pri <= MAXPRI_THROTTLE)
4573 return PERFCONTROL_CLASS_BACKGROUND;
4574 else if (thread->base_pri <= BASEPRI_UTILITY)
4575 return PERFCONTROL_CLASS_UTILITY;
4576 else if (thread->base_pri <= BASEPRI_DEFAULT)
4577 return PERFCONTROL_CLASS_NONUI;
4578 else if (thread->base_pri <= BASEPRI_FOREGROUND)
4579 return PERFCONTROL_CLASS_UI;
4580 else
4581 return PERFCONTROL_CLASS_ABOVEUI;
4582}
4583
4584/*
4585 * This is the processor idle loop, which just looks for other threads
4586 * to execute. Processor idle threads invoke this without supplying a
4587 * current thread, in order to idle without an asserted wait state.
4588 *
4589 * Returns the next thread to execute if one was dispatched directly.
4590 */
4591
4592#if 0
4593#define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__)
4594#else
4595#define IDLE_KERNEL_DEBUG_CONSTANT(...) do { } while(0)
4596#endif
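/*
 * The "#if 0" above leaves the per-iteration idle-loop tracepoints compiled
 * out; switching it to "#if 1" routes IDLE_KERNEL_DEBUG_CONSTANT() through
 * KERNEL_DEBUG_CONSTANT() when debugging the idle loop.
 */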
4597
4598thread_t
4599processor_idle(
4600 thread_t thread,
4601 processor_t processor)
4602{
4603 processor_set_t pset = processor->processor_set;
4604 thread_t new_thread;
4605 int state;
4606 (void)splsched();
4607
4608 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4609 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_START,
4610 (uintptr_t)thread_tid(thread), 0, 0, 0, 0);
4611
4612 SCHED_STATS_CPU_IDLE_START(processor);
4613
4614 uint64_t ctime = mach_absolute_time();
4615
4616 timer_switch(&PROCESSOR_DATA(processor, system_state), ctime, &PROCESSOR_DATA(processor, idle_state));
4617 PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, idle_state);
4618
4619 cpu_quiescent_counter_leave(ctime);
4620
4621 while (1) {
4622 /*
4623 * Ensure that updates to my processor and pset state,
4624 * made by the IPI source processor before sending the IPI,
4625 * are visible on this processor now (even though we don't
4626 * take the pset lock yet).
4627 */
4628 atomic_thread_fence(memory_order_acquire);
4629
4630 if (processor->state != PROCESSOR_IDLE)
4631 break;
4632 if (bit_test(pset->pending_AST_cpu_mask, processor->cpu_id))
4633 break;
4634#if defined(CONFIG_SCHED_DEFERRED_AST)
4635 if (bit_test(pset->pending_deferred_AST_cpu_mask, processor->cpu_id))
4636 break;
4637#endif
4638 if (processor->is_recommended && (processor->processor_primary == processor)) {
4639 if (rt_runq_count(pset))
4640 break;
4641 } else {
4642 if (SCHED(processor_bound_count)(processor))
4643 break;
4644 }
4645
4646#if CONFIG_SCHED_IDLE_IN_PLACE
4647 if (thread != THREAD_NULL) {
4648			/* Did the idle-in-place thread wake up? */
4649 if ((thread->state & (TH_WAIT|TH_SUSP)) != TH_WAIT || thread->wake_active)
4650 break;
4651 }
4652#endif
4653
4654 IDLE_KERNEL_DEBUG_CONSTANT(
4655 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -1, 0);
4656
4657 machine_track_platform_idle(TRUE);
4658
4659 machine_idle();
4660
4661 machine_track_platform_idle(FALSE);
4662
4663 (void)splsched();
4664
4665 /*
4666 * Check if we should call sched_timeshare_consider_maintenance() here.
4667 * The CPU was woken out of idle due to an interrupt and we should do the
4668 * call only if the processor is still idle. If the processor is non-idle,
4669 * the threads running on the processor would do the call as part of
4670		 * context switching.
4671 */
4672 if (processor->state == PROCESSOR_IDLE) {
4673 sched_timeshare_consider_maintenance(mach_absolute_time());
4674 }
4675
4676 IDLE_KERNEL_DEBUG_CONSTANT(
4677 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -2, 0);
4678
4679 if (!SCHED(processor_queue_empty)(processor)) {
4680 /* Secondary SMT processors respond to directed wakeups
4681 * exclusively. Some platforms induce 'spurious' SMT wakeups.
4682 */
4683 if (processor->processor_primary == processor)
4684 break;
4685 }
4686 }
4687
4688 ctime = mach_absolute_time();
4689
4690 timer_switch(&PROCESSOR_DATA(processor, idle_state), ctime, &PROCESSOR_DATA(processor, system_state));
4691 PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, system_state);
4692
4693 cpu_quiescent_counter_join(ctime);
4694
4695 pset_lock(pset);
4696
4697 /* If we were sent a remote AST and came out of idle, acknowledge it here with pset lock held */
4698 bit_clear(pset->pending_AST_cpu_mask, processor->cpu_id);
4699#if defined(CONFIG_SCHED_DEFERRED_AST)
4700 bit_clear(pset->pending_deferred_AST_cpu_mask, processor->cpu_id);
4701#endif
4702
4703 state = processor->state;
4704 if (state == PROCESSOR_DISPATCHING) {
4705 /*
4706		 * Common case -- cpu dispatched.
4707 */
4708 new_thread = processor->next_thread;
4709 processor->next_thread = THREAD_NULL;
4710 pset_update_processor_state(pset, processor, PROCESSOR_RUNNING);
4711
4712 if ((new_thread != THREAD_NULL) && (SCHED(processor_queue_has_priority)(processor, new_thread->sched_pri, FALSE) ||
4713 (rt_runq_count(pset) > 0)) ) {
4714 /* Something higher priority has popped up on the runqueue - redispatch this thread elsewhere */
4715 processor_state_update_idle(processor);
4716 processor->deadline = UINT64_MAX;
4717
4718 pset_unlock(pset);
4719
4720 thread_lock(new_thread);
4721 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REDISPATCH), (uintptr_t)thread_tid(new_thread), new_thread->sched_pri, rt_runq_count(pset), 0, 0);
4722 thread_setrun(new_thread, SCHED_HEADQ);
4723 thread_unlock(new_thread);
4724
4725 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4726 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4727 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
4728
4729 return (THREAD_NULL);
4730 }
4731
4732 sched_update_pset_load_average(pset);
4733
4734 pset_unlock(pset);
4735
4736 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4737 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4738 (uintptr_t)thread_tid(thread), state, (uintptr_t)thread_tid(new_thread), 0, 0);
4739
4740 return (new_thread);
4741
4742 } else if (state == PROCESSOR_IDLE) {
4743 pset_update_processor_state(pset, processor, PROCESSOR_RUNNING);
4744 processor_state_update_idle(processor);
4745 processor->deadline = UINT64_MAX;
4746
4747 } else if (state == PROCESSOR_SHUTDOWN) {
4748 /*
4749 * Going off-line. Force a
4750 * reschedule.
4751 */
4752 if ((new_thread = processor->next_thread) != THREAD_NULL) {
4753 processor->next_thread = THREAD_NULL;
4754 processor_state_update_idle(processor);
4755 processor->deadline = UINT64_MAX;
4756
4757 pset_unlock(pset);
4758
4759 thread_lock(new_thread);
4760 thread_setrun(new_thread, SCHED_HEADQ);
4761 thread_unlock(new_thread);
4762
4763 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4764 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4765 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
4766
4767 return (THREAD_NULL);
4768 }
4769 }
4770
4771 pset_unlock(pset);
4772
4773 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4774 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4775 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
4776
4777 return (THREAD_NULL);
4778}
4779
4780/*
4781 * Each processor has a dedicated thread which
4782 * executes the idle loop when there is no suitable
4783 * previous context.
4784 */
4785void
4786idle_thread(void)
4787{
4788 processor_t processor = current_processor();
4789 thread_t new_thread;
4790
4791 new_thread = processor_idle(THREAD_NULL, processor);
4792 if (new_thread != THREAD_NULL) {
4793 thread_run(processor->idle_thread, (thread_continue_t)idle_thread, NULL, new_thread);
4794 /*NOTREACHED*/
4795 }
4796
4797 thread_block((thread_continue_t)idle_thread);
4798 /*NOTREACHED*/
4799}
4800
4801kern_return_t
4802idle_thread_create(
4803 processor_t processor)
4804{
4805 kern_return_t result;
4806 thread_t thread;
4807 spl_t s;
4808 char name[MAXTHREADNAMESIZE];
4809
4810 result = kernel_thread_create((thread_continue_t)idle_thread, NULL, MAXPRI_KERNEL, &thread);
4811 if (result != KERN_SUCCESS)
4812 return (result);
4813
4814 snprintf(name, sizeof(name), "idle #%d", processor->cpu_id);
4815 thread_set_thread_name(thread, name);
4816
4817 s = splsched();
4818 thread_lock(thread);
4819 thread->bound_processor = processor;
4820 processor->idle_thread = thread;
4821 thread->sched_pri = thread->base_pri = IDLEPRI;
4822 thread->state = (TH_RUN | TH_IDLE);
4823 thread->options |= TH_OPT_IDLE_THREAD;
4824 thread_unlock(thread);
4825 splx(s);
4826
4827 thread_deallocate(thread);
4828
4829 return (KERN_SUCCESS);
4830}
4831
4832/*
4833 * sched_startup:
4834 *
4835 * Kicks off scheduler services.
4836 *
4837 * Called at splsched.
4838 */
4839void
4840sched_startup(void)
4841{
4842 kern_return_t result;
4843 thread_t thread;
4844
4845 simple_lock_init(&sched_vm_group_list_lock, 0);
4846
4847#if __arm__ || __arm64__
4848 simple_lock_init(&sched_recommended_cores_lock, 0);
4849#endif /* __arm__ || __arm64__ */
4850
4851 result = kernel_thread_start_priority((thread_continue_t)sched_init_thread,
4852 (void *)SCHED(maintenance_continuation), MAXPRI_KERNEL, &thread);
4853 if (result != KERN_SUCCESS)
4854 panic("sched_startup");
4855
4856 thread_deallocate(thread);
4857
4858 assert_thread_magic(thread);
4859
4860 /*
4861 * Yield to the sched_init_thread once, to
4862 * initialize our own thread after being switched
4863 * back to.
4864 *
4865 * The current thread is the only other thread
4866 * active at this point.
4867 */
4868 thread_block(THREAD_CONTINUE_NULL);
4869}
4870
4871#if __arm64__
4872static _Atomic uint64_t sched_perfcontrol_callback_deadline;
4873#endif /* __arm64__ */
4874
4875
4876#if defined(CONFIG_SCHED_TIMESHARE_CORE)
4877
4878static volatile uint64_t sched_maintenance_deadline;
4879static uint64_t sched_tick_last_abstime;
4880static uint64_t sched_tick_delta;
4881uint64_t sched_tick_max_delta;
4882
4883
4884/*
4885 * sched_timeshare_maintenance_continue:
4886 *
4887 * Perform periodic bookkeeping functions about ten
4888 * times per second.
4889 */
4890void
4891sched_timeshare_maintenance_continue(void)
4892{
4893 uint64_t sched_tick_ctime, late_time;
4894
4895 struct sched_update_scan_context scan_context = {
4896 .earliest_bg_make_runnable_time = UINT64_MAX,
4897 .earliest_normal_make_runnable_time = UINT64_MAX,
4898 .earliest_rt_make_runnable_time = UINT64_MAX
4899 };
4900
4901 sched_tick_ctime = mach_absolute_time();
4902
4903 if (__improbable(sched_tick_last_abstime == 0)) {
4904 sched_tick_last_abstime = sched_tick_ctime;
4905 late_time = 0;
4906 sched_tick_delta = 1;
4907 } else {
4908 late_time = sched_tick_ctime - sched_tick_last_abstime;
4909 sched_tick_delta = late_time / sched_tick_interval;
4910		/* Ensure a delta of at least 1, since the interval could be slightly
4911 * smaller than the sched_tick_interval due to dispatch
4912 * latencies.
4913 */
4914 sched_tick_delta = MAX(sched_tick_delta, 1);
4915
4916		/* In the event that interrupt latencies or platform
4917 * idle events that advanced the timebase resulted
4918 * in periods where no threads were dispatched,
4919 * cap the maximum "tick delta" at SCHED_TICK_MAX_DELTA
4920 * iterations.
4921 */
4922 sched_tick_delta = MIN(sched_tick_delta, SCHED_TICK_MAX_DELTA);
4923
4924 sched_tick_last_abstime = sched_tick_ctime;
4925 sched_tick_max_delta = MAX(sched_tick_delta, sched_tick_max_delta);
4926 }
4927
4928 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE)|DBG_FUNC_START,
4929 sched_tick_delta, late_time, 0, 0, 0);
4930
4931	/* Add a number of pseudo-ticks corresponding to the elapsed interval.
4932	 * This could be greater than 1 if substantial intervals occur where
4933	 * all processors are idle, which rarely happens in practice.
4934 */
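	/*
	 * For example, if the maintenance thread was delayed by roughly three
	 * tick intervals, sched_tick_delta is 3 (capped at SCHED_TICK_MAX_DELTA)
	 * and sched_tick advances by 3; compute_averages() is handed the same
	 * delta so the load averages can account for the missed intervals.
	 */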
4935
4936 sched_tick += sched_tick_delta;
4937
4938 update_vm_info();
4939
4940 /*
4941 * Compute various averages.
4942 */
4943 compute_averages(sched_tick_delta);
4944
4945 /*
4946 * Scan the run queues for threads which
4947 * may need to be updated, and find the earliest runnable thread on the runqueue
4948 * to report its latency.
4949 */
4950 SCHED(thread_update_scan)(&scan_context);
4951
4952 SCHED(rt_runq_scan)(&scan_context);
4953
4954 uint64_t ctime = mach_absolute_time();
4955
4956 uint64_t bg_max_latency = (ctime > scan_context.earliest_bg_make_runnable_time) ?
4957 ctime - scan_context.earliest_bg_make_runnable_time : 0;
4958
4959 uint64_t default_max_latency = (ctime > scan_context.earliest_normal_make_runnable_time) ?
4960 ctime - scan_context.earliest_normal_make_runnable_time : 0;
4961
4962 uint64_t realtime_max_latency = (ctime > scan_context.earliest_rt_make_runnable_time) ?
4963 ctime - scan_context.earliest_rt_make_runnable_time : 0;
4964
4965 machine_max_runnable_latency(bg_max_latency, default_max_latency, realtime_max_latency);
4966
4967 /*
4968 * Check to see if the special sched VM group needs attention.
4969 */
4970 sched_vm_group_maintenance();
4971
4972#if __arm__ || __arm64__
4973 /* Check to see if the recommended cores failsafe is active */
4974 sched_recommended_cores_maintenance();
4975#endif /* __arm__ || __arm64__ */
4976
4977
4978#if DEBUG || DEVELOPMENT
4979#if __x86_64__
4980#include <i386/misc_protos.h>
4981 /* Check for long-duration interrupts */
4982 mp_interrupt_watchdog();
4983#endif /* __x86_64__ */
4984#endif /* DEBUG || DEVELOPMENT */
4985
4986 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_END,
4987 sched_pri_shifts[TH_BUCKET_SHARE_FG], sched_pri_shifts[TH_BUCKET_SHARE_BG],
4988 sched_pri_shifts[TH_BUCKET_SHARE_UT], sched_pri_shifts[TH_BUCKET_SHARE_DF], 0);
4989
4990 assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT);
4991 thread_block((thread_continue_t)sched_timeshare_maintenance_continue);
4992 /*NOTREACHED*/
4993}
4994
4995static uint64_t sched_maintenance_wakeups;
4996
4997/*
4998 * Determine if the set of routines formerly driven by a maintenance timer
4999 * must be invoked, based on a deadline comparison. Signals the scheduler
5000 * maintenance thread on deadline expiration. Must be invoked at an interval
5001 * lower than the "sched_tick_interval", currently accomplished by
5002 * invocation via the quantum expiration timer and at context switch time.
5003 * Performance matters: this routine reuses a timestamp approximating the
5004 * current absolute time received from the caller, and should perform
5005 * no more than a comparison against the deadline in the common case.
5006 */
5007void
5008sched_timeshare_consider_maintenance(uint64_t ctime)
5009{
5010 cpu_quiescent_counter_checkin(ctime);
5011
5012 uint64_t deadline = sched_maintenance_deadline;
5013
5014 if (__improbable(ctime >= deadline)) {
5015 if (__improbable(current_thread() == sched_maintenance_thread))
5016 return;
5017 OSMemoryBarrier();
5018
5019 uint64_t ndeadline = ctime + sched_tick_interval;
5020
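		/*
		 * Every CPU runs this check, but only the first one to observe the
		 * expired deadline wins the compare-and-swap below; that CPU advances
		 * the deadline and wakes the maintenance thread, while the others
		 * either fail the swap or already see the new deadline.
		 */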
5021 if (__probable(__sync_bool_compare_and_swap(&sched_maintenance_deadline, deadline, ndeadline))) {
5022 thread_wakeup((event_t)sched_timeshare_maintenance_continue);
5023 sched_maintenance_wakeups++;
5024 }
5025 }
5026
5027 uint64_t load_compute_deadline = __c11_atomic_load(&sched_load_compute_deadline, memory_order_relaxed);
5028
5029 if (__improbable(load_compute_deadline && ctime >= load_compute_deadline)) {
5030 uint64_t new_deadline = 0;
5031 if (__c11_atomic_compare_exchange_strong(&sched_load_compute_deadline, &load_compute_deadline, new_deadline,
5032 memory_order_relaxed, memory_order_relaxed)) {
5033 compute_sched_load();
5034 new_deadline = ctime + sched_load_compute_interval_abs;
5035 __c11_atomic_store(&sched_load_compute_deadline, new_deadline, memory_order_relaxed);
5036 }
5037 }
5038
5039#if __arm64__
5040 uint64_t perf_deadline = __c11_atomic_load(&sched_perfcontrol_callback_deadline, memory_order_relaxed);
5041
5042 if (__improbable(perf_deadline && ctime >= perf_deadline)) {
5043 /* CAS in 0, if success, make callback. Otherwise let the next context switch check again. */
5044 if (__c11_atomic_compare_exchange_strong(&sched_perfcontrol_callback_deadline, &perf_deadline, 0,
5045 memory_order_relaxed, memory_order_relaxed)) {
5046 machine_perfcontrol_deadline_passed(perf_deadline);
5047 }
5048 }
5049#endif /* __arm64__ */
5050
5051}
5052
5053#endif /* CONFIG_SCHED_TIMESHARE_CORE */
5054
5055void
5056sched_init_thread(void (*continuation)(void))
5057{
5058 thread_block(THREAD_CONTINUE_NULL);
5059
5060 thread_t thread = current_thread();
5061
5062 thread_set_thread_name(thread, "sched_maintenance_thread");
5063
5064 sched_maintenance_thread = thread;
5065
5066 continuation();
5067
5068 /*NOTREACHED*/
5069}
5070
5071#if defined(CONFIG_SCHED_TIMESHARE_CORE)
5072
5073/*
5074 * thread_update_scan / runq_scan:
5075 *
5076 * Scan the run queues to account for timesharing threads
5077 * which need to be updated.
5078 *
5079 * Scanner runs in two passes. Pass one squirrels likely
5080 * threads away in an array, pass two does the update.
5081 *
5082 * This is necessary because the run queue is locked for
5083 * the candidate scan, but the thread is locked for the update.
5084 *
5085 * Array should be sized to make forward progress, without
5086 * disabling preemption for long periods.
5087 */
5088
5089#define THREAD_UPDATE_SIZE 128
5090
5091static thread_t thread_update_array[THREAD_UPDATE_SIZE];
5092static uint32_t thread_update_count = 0;
5093
5094/* Returns TRUE if thread was added, FALSE if thread_update_array is full */
5095boolean_t
5096thread_update_add_thread(thread_t thread)
5097{
5098 if (thread_update_count == THREAD_UPDATE_SIZE)
5099 return (FALSE);
5100
5101 thread_update_array[thread_update_count++] = thread;
5102 thread_reference_internal(thread);
5103 return (TRUE);
5104}
5105
5106void
5107thread_update_process_threads(void)
5108{
5109 assert(thread_update_count <= THREAD_UPDATE_SIZE);
5110
5111 for (uint32_t i = 0 ; i < thread_update_count ; i++) {
5112 thread_t thread = thread_update_array[i];
5113 assert_thread_magic(thread);
5114 thread_update_array[i] = THREAD_NULL;
5115
5116 spl_t s = splsched();
5117 thread_lock(thread);
5118 if (!(thread->state & (TH_WAIT)) && thread->sched_stamp != sched_tick) {
5119 SCHED(update_priority)(thread);
5120 }
5121 thread_unlock(thread);
5122 splx(s);
5123
5124 thread_deallocate(thread);
5125 }
5126
5127 thread_update_count = 0;
5128}
5129
5130/*
5131 * Scan a runq for candidate threads.
5132 *
5133 * Returns TRUE if retry is needed.
5134 */
5135boolean_t
5136runq_scan(
5137 run_queue_t runq,
5138 sched_update_scan_context_t scan_context)
5139{
5140 int count = runq->count;
5141 int queue_index;
5142
5143 assert(count >= 0);
5144
5145 if (count == 0)
5146 return FALSE;
5147
5148 for (queue_index = bitmap_first(runq->bitmap, NRQS);
5149 queue_index >= 0;
5150 queue_index = bitmap_next(runq->bitmap, queue_index)) {
5151
5152 thread_t thread;
5153 queue_t queue = &runq->queues[queue_index];
5154
5155 qe_foreach_element(thread, queue, runq_links) {
5156 assert(count > 0);
5157 assert_thread_magic(thread);
5158
5159 if (thread->sched_stamp != sched_tick &&
5160 thread->sched_mode == TH_MODE_TIMESHARE) {
5161 if (thread_update_add_thread(thread) == FALSE)
5162 return TRUE;
5163 }
5164
5165 if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
5166 if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) {
5167 scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time;
5168 }
5169 } else {
5170 if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) {
5171 scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time;
5172 }
5173 }
5174 count--;
5175 }
5176 }
5177
5178 return FALSE;
5179}
5180
5181#endif /* CONFIG_SCHED_TIMESHARE_CORE */
5182
5183boolean_t
5184thread_eager_preemption(thread_t thread)
5185{
5186 return ((thread->sched_flags & TH_SFLAG_EAGERPREEMPT) != 0);
5187}
5188
5189void
5190thread_set_eager_preempt(thread_t thread)
5191{
5192 spl_t x;
5193 processor_t p;
5194 ast_t ast = AST_NONE;
5195
5196 x = splsched();
5197 p = current_processor();
5198
5199 thread_lock(thread);
5200 thread->sched_flags |= TH_SFLAG_EAGERPREEMPT;
5201
5202 if (thread == current_thread()) {
5203
5204 ast = csw_check(p, AST_NONE);
5205 thread_unlock(thread);
5206 if (ast != AST_NONE) {
5207 (void) thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
5208 }
5209 } else {
5210 p = thread->last_processor;
5211
5212 if (p != PROCESSOR_NULL && p->state == PROCESSOR_RUNNING &&
5213 p->active_thread == thread) {
5214 cause_ast_check(p);
5215 }
5216
5217 thread_unlock(thread);
5218 }
5219
5220 splx(x);
5221}
5222
5223void
5224thread_clear_eager_preempt(thread_t thread)
5225{
5226 spl_t x;
5227
5228 x = splsched();
5229 thread_lock(thread);
5230
5231 thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT;
5232
5233 thread_unlock(thread);
5234 splx(x);
5235}
5236
5237/*
5238 * Scheduling statistics
5239 */
5240void
5241sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri)
5242{
5243 struct processor_sched_statistics *stats;
5244 boolean_t to_realtime = FALSE;
5245
5246 stats = &processor->processor_data.sched_stats;
5247 stats->csw_count++;
5248
5249 if (otherpri >= BASEPRI_REALTIME) {
5250 stats->rt_sched_count++;
5251 to_realtime = TRUE;
5252 }
5253
5254 if ((reasons & AST_PREEMPT) != 0) {
5255 stats->preempt_count++;
5256
5257 if (selfpri >= BASEPRI_REALTIME) {
5258 stats->preempted_rt_count++;
5259 }
5260
5261 if (to_realtime) {
5262 stats->preempted_by_rt_count++;
5263 }
5264
5265 }
5266}
5267
5268void
5269sched_stats_handle_runq_change(struct runq_stats *stats, int old_count)
5270{
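	/*
	 * count_sum accumulates (time since last change * queue depth), so
	 * dividing it by the total elapsed time yields the time-weighted
	 * average run-queue depth over the measurement window.
	 */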
5271 uint64_t timestamp = mach_absolute_time();
5272
5273 stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count;
5274 stats->last_change_timestamp = timestamp;
5275}
5276
5277/*
5278 * For calls from assembly code
5279 */
5280#undef thread_wakeup
5281void
5282thread_wakeup(
5283 event_t x);
5284
5285void
5286thread_wakeup(
5287 event_t x)
5288{
5289 thread_wakeup_with_result(x, THREAD_AWAKENED);
5290}
5291
5292boolean_t
5293preemption_enabled(void)
5294{
5295 return (get_preemption_level() == 0 && ml_get_interrupts_enabled());
5296}
5297
5298static void
5299sched_timer_deadline_tracking_init(void) {
5300 nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT, &timer_deadline_tracking_bin_1);
5301 nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2);
5302}
5303
5304#if __arm__ || __arm64__
5305
5306uint32_t perfcontrol_requested_recommended_cores = ALL_CORES_RECOMMENDED;
5307uint32_t perfcontrol_requested_recommended_core_count = MAX_CPUS;
5308bool perfcontrol_failsafe_active = false;
5309bool perfcontrol_sleep_override = false;
5310
5311uint64_t perfcontrol_failsafe_maintenance_runnable_time;
5312uint64_t perfcontrol_failsafe_activation_time;
5313uint64_t perfcontrol_failsafe_deactivation_time;
5314
5315/* data covering who likely caused it and how long they ran */
5316#define FAILSAFE_NAME_LEN 33 /* (2*MAXCOMLEN)+1 from size of p_name */
5317char perfcontrol_failsafe_name[FAILSAFE_NAME_LEN];
5318int perfcontrol_failsafe_pid;
5319uint64_t perfcontrol_failsafe_tid;
5320uint64_t perfcontrol_failsafe_thread_timer_at_start;
5321uint64_t perfcontrol_failsafe_thread_timer_last_seen;
5322uint32_t perfcontrol_failsafe_recommended_at_trigger;
5323
5324/*
5325 * Perf controller calls here to update the recommended core bitmask.
5326 * If the failsafe is active, we don't immediately apply the new value.
5327 * Instead, we store the new request and use it after the failsafe deactivates.
5328 *
5329 * If the failsafe is not active, immediately apply the update.
5330 *
5331 * No scheduler locks are held, no other locks are held that scheduler might depend on,
5332 * interrupts are enabled
5333 *
5334 * currently the prototype is in osfmk/arm/machine_routines.h
5335 */
5336void
5337sched_perfcontrol_update_recommended_cores(uint32_t recommended_cores)
5338{
5339 assert(preemption_enabled());
5340
5341 spl_t s = splsched();
5342 simple_lock(&sched_recommended_cores_lock);
5343
5344 perfcontrol_requested_recommended_cores = recommended_cores;
5345 perfcontrol_requested_recommended_core_count = __builtin_popcountll(recommended_cores);
5346
5347 if ((perfcontrol_failsafe_active == false) && (perfcontrol_sleep_override == false))
5348 sched_update_recommended_cores(perfcontrol_requested_recommended_cores);
5349 else
5350 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
5351 MACHDBG_CODE(DBG_MACH_SCHED,MACH_REC_CORES_FAILSAFE) | DBG_FUNC_NONE,
5352 perfcontrol_requested_recommended_cores,
5353 sched_maintenance_thread->last_made_runnable_time, 0, 0, 0);
5354
5355 simple_unlock(&sched_recommended_cores_lock);
5356 splx(s);
5357}
5358
5359void
5360sched_override_recommended_cores_for_sleep(void)
5361{
5362 spl_t s = splsched();
5363 simple_lock(&sched_recommended_cores_lock);
5364
5365 if (perfcontrol_sleep_override == false) {
5366 perfcontrol_sleep_override = true;
5367 sched_update_recommended_cores(ALL_CORES_RECOMMENDED);
5368 }
5369
5370 simple_unlock(&sched_recommended_cores_lock);
5371 splx(s);
5372}
5373
5374void
5375sched_restore_recommended_cores_after_sleep(void)
5376{
5377 spl_t s = splsched();
5378 simple_lock(&sched_recommended_cores_lock);
5379
5380 if (perfcontrol_sleep_override == true) {
5381 perfcontrol_sleep_override = false;
5382 sched_update_recommended_cores(perfcontrol_requested_recommended_cores);
5383 }
5384
5385 simple_unlock(&sched_recommended_cores_lock);
5386 splx(s);
5387}
5388
5389/*
5390 * Consider whether we need to activate the recommended cores failsafe
5391 *
5392 * Called from quantum timer interrupt context of a realtime thread
5393 * No scheduler locks are held, interrupts are disabled
5394 */
5395void
5396sched_consider_recommended_cores(uint64_t ctime, thread_t cur_thread)
5397{
5398 /*
5399 * Check if a realtime thread is starving the system
5400 * and bringing up non-recommended cores would help
5401 *
5402 * TODO: Is this the correct check for recommended == possible cores?
5403 * TODO: Validate the checks without the relevant lock are OK.
5404 */
5405
5406 if (__improbable(perfcontrol_failsafe_active == TRUE)) {
5407 /* keep track of how long the responsible thread runs */
5408
5409 simple_lock(&sched_recommended_cores_lock);
5410
5411 if (perfcontrol_failsafe_active == TRUE &&
5412 cur_thread->thread_id == perfcontrol_failsafe_tid) {
5413 perfcontrol_failsafe_thread_timer_last_seen = timer_grab(&cur_thread->user_timer) +
5414 timer_grab(&cur_thread->system_timer);
5415 }
5416
5417 simple_unlock(&sched_recommended_cores_lock);
5418
5419 /* we're already trying to solve the problem, so bail */
5420 return;
5421 }
5422
5423 /* The failsafe won't help if there are no more processors to enable */
5424 if (__probable(perfcontrol_requested_recommended_core_count >= processor_count))
5425 return;
5426
5427 uint64_t too_long_ago = ctime - perfcontrol_failsafe_starvation_threshold;
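	/*
	 * The failsafe only considers triggering when the maintenance thread was
	 * last made runnable more than perfcontrol_failsafe_starvation_threshold
	 * ago; the checks below confirm, under the thread lock, that it is still
	 * waiting on a run queue.
	 */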
5428
5429 /* Use the maintenance thread as our canary in the coal mine */
5430 thread_t m_thread = sched_maintenance_thread;
5431
5432 /* If it doesn't look bad, nothing to see here */
5433 if (__probable(m_thread->last_made_runnable_time >= too_long_ago))
5434 return;
5435
5436 /* It looks bad, take the lock to be sure */
5437 thread_lock(m_thread);
5438
5439 if (m_thread->runq == PROCESSOR_NULL ||
5440 (m_thread->state & (TH_RUN|TH_WAIT)) != TH_RUN ||
5441 m_thread->last_made_runnable_time >= too_long_ago) {
5442 /*
5443 * Maintenance thread is either on cpu or blocked, and
5444 * therefore wouldn't benefit from more cores
5445 */
5446 thread_unlock(m_thread);
5447 return;
5448 }
5449
5450 uint64_t maintenance_runnable_time = m_thread->last_made_runnable_time;
5451
5452 thread_unlock(m_thread);
5453
5454 /*
5455 * There are cores disabled at perfcontrol's recommendation, but the
5456 * system is so overloaded that the maintenance thread can't run.
5457 * That likely means that perfcontrol can't run either, so it can't fix
5458 * the recommendation. We have to kick in a failsafe to keep from starving.
5459 *
5460 * When the maintenance thread has been starved for too long,
5461 * ignore the recommendation from perfcontrol and light up all the cores.
5462 *
5463 * TODO: Consider weird states like boot, sleep, or debugger
5464 */
5465
5466 simple_lock(&sched_recommended_cores_lock);
5467
5468 if (perfcontrol_failsafe_active == TRUE) {
5469 simple_unlock(&sched_recommended_cores_lock);
5470 return;
5471 }
5472
5473 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
5474 MACHDBG_CODE(DBG_MACH_SCHED,MACH_REC_CORES_FAILSAFE) | DBG_FUNC_START,
5475 perfcontrol_requested_recommended_cores, maintenance_runnable_time, 0, 0, 0);
5476
5477 perfcontrol_failsafe_active = TRUE;
5478 perfcontrol_failsafe_activation_time = mach_absolute_time();
5479 perfcontrol_failsafe_maintenance_runnable_time = maintenance_runnable_time;
5480 perfcontrol_failsafe_recommended_at_trigger = perfcontrol_requested_recommended_cores;
5481
5482 /* Capture some data about who screwed up (assuming that the thread on core is at fault) */
5483 task_t task = cur_thread->task;
5484 perfcontrol_failsafe_pid = task_pid(task);
5485 strlcpy(perfcontrol_failsafe_name, proc_name_address(task->bsd_info), sizeof(perfcontrol_failsafe_name));
5486
5487 perfcontrol_failsafe_tid = cur_thread->thread_id;
5488
5489 /* Blame the thread for time it has run recently */
5490 uint64_t recent_computation = (ctime - cur_thread->computation_epoch) + cur_thread->computation_metered;
5491
5492 uint64_t last_seen = timer_grab(&cur_thread->user_timer) + timer_grab(&cur_thread->system_timer);
5493
5494 /* Compute the start time of the bad behavior in terms of the thread's on core time */
5495 perfcontrol_failsafe_thread_timer_at_start = last_seen - recent_computation;
5496 perfcontrol_failsafe_thread_timer_last_seen = last_seen;
5497
5498 /* Ignore the previously recommended core configuration */
5499 sched_update_recommended_cores(ALL_CORES_RECOMMENDED);
5500
5501 simple_unlock(&sched_recommended_cores_lock);
5502}
5503
5504/*
5505 * Now that our bacon has been saved by the failsafe, consider whether to turn it off
5506 *
5507 * Runs in the context of the maintenance thread, no locks held
5508 */
5509static void
5510sched_recommended_cores_maintenance(void)
5511{
5512 /* Common case - no failsafe, nothing to be done here */
5513 if (__probable(perfcontrol_failsafe_active == FALSE))
5514 return;
5515
5516 uint64_t ctime = mach_absolute_time();
5517
5518 boolean_t print_diagnostic = FALSE;
5519 char p_name[FAILSAFE_NAME_LEN] = "";
5520
5521 spl_t s = splsched();
5522 simple_lock(&sched_recommended_cores_lock);
5523
5524 /* Check again, under the lock, to avoid races */
5525 if (perfcontrol_failsafe_active == FALSE)
5526 goto out;
5527
5528 /*
5529	 * Ensure that the other cores get another few ticks to run some threads.
5530	 * Without this hysteresis, the maintenance thread would be the first
5531	 * to run, and would immediately de-recommend the other cores again.
5532 */
5533 if ((ctime - perfcontrol_failsafe_activation_time) < perfcontrol_failsafe_starvation_threshold)
5534 goto out;
5535
5536 /* Capture some diagnostic state under the lock so we can print it out later */
5537
5538 int pid = perfcontrol_failsafe_pid;
5539 uint64_t tid = perfcontrol_failsafe_tid;
5540
5541 uint64_t thread_usage = perfcontrol_failsafe_thread_timer_last_seen -
5542 perfcontrol_failsafe_thread_timer_at_start;
5543 uint32_t rec_cores_before = perfcontrol_failsafe_recommended_at_trigger;
5544 uint32_t rec_cores_after = perfcontrol_requested_recommended_cores;
5545 uint64_t failsafe_duration = ctime - perfcontrol_failsafe_activation_time;
5546 strlcpy(p_name, perfcontrol_failsafe_name, sizeof(p_name));
5547
5548 print_diagnostic = TRUE;
5549
5550 /* Deactivate the failsafe and reinstate the requested recommendation settings */
5551
5552 perfcontrol_failsafe_deactivation_time = ctime;
5553 perfcontrol_failsafe_active = FALSE;
5554
5555 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
5556 MACHDBG_CODE(DBG_MACH_SCHED,MACH_REC_CORES_FAILSAFE) | DBG_FUNC_END,
5557 perfcontrol_requested_recommended_cores, failsafe_duration, 0, 0, 0);
5558
5559 sched_update_recommended_cores(perfcontrol_requested_recommended_cores);
5560
5561out:
5562 simple_unlock(&sched_recommended_cores_lock);
5563 splx(s);
5564
5565 if (print_diagnostic) {
5566 uint64_t failsafe_duration_ms = 0, thread_usage_ms = 0;
5567
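		/*
		 * Both durations are in mach absolute time units; convert to
		 * nanoseconds and then to milliseconds for the log message.
		 */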
5568 absolutetime_to_nanoseconds(failsafe_duration, &failsafe_duration_ms);
5569 failsafe_duration_ms = failsafe_duration_ms / NSEC_PER_MSEC;
5570
5571 absolutetime_to_nanoseconds(thread_usage, &thread_usage_ms);
5572 thread_usage_ms = thread_usage_ms / NSEC_PER_MSEC;
5573
5574 printf("recommended core failsafe kicked in for %lld ms "
5575 "likely due to %s[%d] thread 0x%llx spending "
5576 "%lld ms on cpu at realtime priority - "
5577 "new recommendation: 0x%x -> 0x%x\n",
5578 failsafe_duration_ms, p_name, pid, tid, thread_usage_ms,
5579 rec_cores_before, rec_cores_after);
5580 }
5581}
5582
5583/*
5584 * Apply a new recommended cores mask to the processors it affects
5585 * Runs after considering failsafes and such
5586 *
5587 * Iterate over processors and update their ->is_recommended field.
5588 * If a processor is running, we let it drain out at its next
5589 * quantum expiration or blocking point. If a processor is idle, there
5590 * may be more work for it to do, so IPI it.
5591 *
5592 * interrupts disabled, sched_recommended_cores_lock is held
5593 */
5594static void
5595sched_update_recommended_cores(uint32_t recommended_cores)
5596{
5597 processor_set_t pset, nset;
5598 processor_t processor;
5599 uint64_t needs_exit_idle_mask = 0x0;
5600
5601 processor = processor_list;
5602 pset = processor->processor_set;
5603
5604 KDBG(MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_START,
5605 recommended_cores, perfcontrol_failsafe_active, 0, 0);
5606
5607 if (__builtin_popcount(recommended_cores) == 0) {
5608 bit_set(recommended_cores, master_processor->cpu_id); /* add boot processor or we hang */
5609 }
5610
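	/*
	 * Walk the global processor list twice: first mark the newly
	 * recommended processors, then de-recommend and shut down the rest.
	 * The pset lock is handed off whenever the walk crosses into a
	 * different pset, so at most one pset lock is held at a time.
	 */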
5611 /* First set recommended cores */
5612 pset_lock(pset);
5613 do {
5614
5615 nset = processor->processor_set;
5616 if (nset != pset) {
5617 pset_unlock(pset);
5618 pset = nset;
5619 pset_lock(pset);
5620 }
5621
5622 if (bit_test(recommended_cores, processor->cpu_id)) {
5623 processor->is_recommended = TRUE;
5624 bit_set(pset->recommended_bitmask, processor->cpu_id);
5625
5626 if (processor->state == PROCESSOR_IDLE) {
5627 if (processor != current_processor()) {
5628 bit_set(needs_exit_idle_mask, processor->cpu_id);
5629 }
5630 }
5631 }
5632 } while ((processor = processor->processor_list) != NULL);
5633 pset_unlock(pset);
5634
5635	/* Now shut down the cores that are not recommended */
5636 processor = processor_list;
5637 pset = processor->processor_set;
5638
5639 pset_lock(pset);
5640 do {
5641
5642 nset = processor->processor_set;
5643 if (nset != pset) {
5644 pset_unlock(pset);
5645 pset = nset;
5646 pset_lock(pset);
5647 }
5648
5649 if (!bit_test(recommended_cores, processor->cpu_id)) {
5650 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
5651
5652 processor->is_recommended = FALSE;
5653 bit_clear(pset->recommended_bitmask, processor->cpu_id);
5654
5655 if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
5656 ipi_type = SCHED_IPI_IMMEDIATE;
5657 }
5658 SCHED(processor_queue_shutdown)(processor);
5659 /* pset unlocked */
5660
5661 SCHED(rt_queue_shutdown)(processor);
5662
5663 if (ipi_type != SCHED_IPI_NONE) {
5664 if (processor == current_processor()) {
5665 ast_on(AST_PREEMPT);
5666 } else {
5667 sched_ipi_perform(processor, ipi_type);
5668 }
5669 }
5670
5671 pset_lock(pset);
5672 }
5673 } while ((processor = processor->processor_list) != NULL);
5674 pset_unlock(pset);
5675
5676 /* Issue all pending IPIs now that the pset lock has been dropped */
5677 for (int cpuid = lsb_first(needs_exit_idle_mask); cpuid >= 0; cpuid = lsb_next(needs_exit_idle_mask, cpuid)) {
5678 processor = processor_array[cpuid];
5679 machine_signal_idle(processor);
5680 }
5681
5682 KDBG(MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_END,
5683 needs_exit_idle_mask, 0, 0, 0);
5684}
5685#endif /* __arm__ || __arm64__ */
5686
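/*
 * OR the given option bits into the current thread's options, under
 * the thread lock with interrupts disabled.
 */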
5687void thread_set_options(uint32_t thopt) {
5688 spl_t x;
5689 thread_t t = current_thread();
5690
5691 x = splsched();
5692 thread_lock(t);
5693
5694 t->options |= thopt;
5695
5696 thread_unlock(t);
5697 splx(x);
5698}
5699
5700void thread_set_pending_block_hint(thread_t thread, block_hint_t block_hint) {
5701 thread->pending_block_hint = block_hint;
5702}
5703
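/*
 * Forward to the active scheduler's implementation via the SCHED()
 * dispatch table.
 */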
5704uint32_t qos_max_parallelism(int qos, uint64_t options)
5705{
5706 return SCHED(qos_max_parallelism)(qos, options);
5707}
5708
5709uint32_t sched_qos_max_parallelism(__unused int qos, uint64_t options)
5710{
5711 host_basic_info_data_t hinfo;
5712 mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
5713 /* Query the machine layer for core information */
5714 __assert_only kern_return_t kret = host_info(host_self(), HOST_BASIC_INFO,
5715 (host_info_t)&hinfo, &count);
5716 assert(kret == KERN_SUCCESS);
5717
5718 /* We would not want multiple realtime threads running on the
5719	 * same physical core, even on SMT-capable machines.
5720 */
5721 if (options & QOS_PARALLELISM_REALTIME) {
5722 return hinfo.physical_cpu;
5723 }
5724
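	/*
	 * For illustration (hypothetical machine with 4 physical cores and
	 * 2-way SMT, i.e. 8 logical CPUs): QOS_PARALLELISM_REALTIME returns
	 * 4 above, while the checks below return 8 for
	 * QOS_PARALLELISM_COUNT_LOGICAL and 4 otherwise.
	 */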
5725 if (options & QOS_PARALLELISM_COUNT_LOGICAL) {
5726 return hinfo.logical_cpu;
5727 } else {
5728 return hinfo.physical_cpu;
5729 }
5730}
5731
5732#if __arm64__
5733
5734/*
5735 * Set up a new timer, or replace the old timer with the new one
5736 *
5737 * Returns TRUE if an old timer was canceled, FALSE if there was none to cancel
5738 */
5739boolean_t
5740sched_perfcontrol_update_callback_deadline(uint64_t new_deadline)
5741{
5742	/*
5743	 * Atomically exchange the old deadline for the new one.  If the old deadline
5744	 * was nonzero, a pending callback was canceled; otherwise there was nothing to cancel.
5745	 */
5746
5747 uint64_t old_deadline = __c11_atomic_load(&sched_perfcontrol_callback_deadline,
5748 memory_order_relaxed);
5749
5750
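	/*
	 * compare_exchange_weak may fail spuriously, so retry until it
	 * succeeds; the net effect is an unconditional atomic exchange,
	 * leaving old_deadline holding whatever value was actually replaced.
	 */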
5751 while (!__c11_atomic_compare_exchange_weak(&sched_perfcontrol_callback_deadline,
5752 &old_deadline, new_deadline,
5753 memory_order_relaxed, memory_order_relaxed));
5754
5755
5756	/* old_deadline now holds the value actually replaced; it may differ from the initial load if we raced */
5757
5758 return (old_deadline != 0) ? TRUE : FALSE;
5759}
5760
5761#endif /* __arm64__ */
5762
5763void
5764sched_update_pset_load_average(processor_set_t pset)
5765{
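	/*
	 * The instantaneous load is the count of running CPUs plus the
	 * runnable threads on the pset and realtime run queues, in fixed
	 * point (PSET_LOAD_NUMERATOR_SHIFT).  Halving the sum of the old
	 * average and the new sample gives an exponential moving average
	 * with weight 1/2 per update.
	 */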
5766 int load = ((bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + pset->pset_runq.count + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT);
5767 int new_load_average = (pset->load_average + load) >> 1;
5768
5769 pset->load_average = new_load_average;
5770
5771#if (DEVELOPMENT || DEBUG)
5772#endif
5773}
5774
5775/* pset is locked */
5776static processor_t
5777choose_processor_for_realtime_thread(processor_set_t pset)
5778{
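	/*
	 * Consider only recommended processors without a pending AST.
	 * Return the first primary that is idle, or that is running or
	 * dispatching below realtime priority; secondaries are considered
	 * the same way only if sched_allow_rt_smt is set.
	 */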
5779 uint64_t cpu_map = (pset->cpu_bitmask & pset->recommended_bitmask & ~pset->pending_AST_cpu_mask);
5780
5781 for (int cpuid = lsb_first(cpu_map); cpuid >= 0; cpuid = lsb_next(cpu_map, cpuid)) {
5782 processor_t processor = processor_array[cpuid];
5783
5784 if (processor->processor_primary != processor) {
5785 continue;
5786 }
5787
5788 if (processor->state == PROCESSOR_IDLE) {
5789 return processor;
5790 }
5791
5792 if ((processor->state != PROCESSOR_RUNNING) && (processor->state != PROCESSOR_DISPATCHING)) {
5793 continue;
5794 }
5795
5796 if (processor->current_pri >= BASEPRI_RTQUEUES) {
5797 continue;
5798 }
5799
5800 return processor;
5801
5802 }
5803
5804 if (!sched_allow_rt_smt) {
5805 return PROCESSOR_NULL;
5806 }
5807
5808 /* Consider secondary processors */
5809 for (int cpuid = lsb_first(cpu_map); cpuid >= 0; cpuid = lsb_next(cpu_map, cpuid)) {
5810 processor_t processor = processor_array[cpuid];
5811
5812 if (processor->processor_primary == processor) {
5813 continue;
5814 }
5815
5816 if (processor->state == PROCESSOR_IDLE) {
5817 return processor;
5818 }
5819
5820 if ((processor->state != PROCESSOR_RUNNING) && (processor->state != PROCESSOR_DISPATCHING)) {
5821 continue;
5822 }
5823
5824 if (processor->current_pri >= BASEPRI_RTQUEUES) {
5825 continue;
5826 }
5827
5828 return processor;
5829
5830 }
5831
5832 return PROCESSOR_NULL;
5833}
5834
5835/* pset is locked */
5836static bool
5837all_available_primaries_are_running_realtime_threads(processor_set_t pset)
5838{
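	/*
	 * Return false as soon as a recommended primary is found that is
	 * idle, dispatching, or running a thread below realtime priority;
	 * processors in any other state are skipped rather than waited for.
	 */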
5839 uint64_t cpu_map = (pset->cpu_bitmask & pset->recommended_bitmask);
5840
5841 for (int cpuid = lsb_first(cpu_map); cpuid >= 0; cpuid = lsb_next(cpu_map, cpuid)) {
5842 processor_t processor = processor_array[cpuid];
5843
5844 if (processor->processor_primary != processor) {
5845 continue;
5846 }
5847
5848 if (processor->state == PROCESSOR_IDLE) {
5849 return false;
5850 }
5851
5852 if (processor->state == PROCESSOR_DISPATCHING) {
5853 return false;
5854 }
5855
5856 if (processor->state != PROCESSOR_RUNNING) {
5857 /*
5858 * All other processor states are considered unavailable to run
5859 * realtime threads. In particular, we prefer an available secondary
5860 * processor over the risk of leaving a realtime thread on the run queue
5861		 * while waiting for a processor in the PROCESSOR_START state,
5862		 * which should in any case be rare.
5863 */
5864 continue;
5865 }
5866
5867 if (processor->current_pri < BASEPRI_RTQUEUES) {
5868 return false;
5869 }
5870 }
5871
5872 return true;
5873}
5874
5875
5876