1/*
2 * Copyright (c) 2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <mach/mach_types.h>
30#include <mach/machine.h>
31#include <machine/machine_routines.h>
32#include <machine/sched_param.h>
33#include <machine/machine_cpu.h>
34#include <kern/kern_types.h>
35#include <kern/debug.h>
36#include <kern/machine.h>
37#include <kern/misc_protos.h>
38#include <kern/processor.h>
39#include <kern/queue.h>
40#include <kern/sched.h>
41#include <kern/sched_prim.h>
42#include <kern/task.h>
43#include <kern/thread.h>
44#include <kern/sched_clutch.h>
45#include <machine/atomic.h>
46#include <kern/sched_clutch.h>
47#include <sys/kdebug.h>
48
49#if CONFIG_SCHED_EDGE
50#include <kern/sched_amp_common.h>
51#endif /* CONFIG_SCHED_EDGE */
52
53#if CONFIG_SCHED_CLUTCH
54
55/* Forward declarations of static routines */
56
57/* Root level hierarchy management */
58static void sched_clutch_root_init(sched_clutch_root_t, processor_set_t);
59static void sched_clutch_root_bucket_init(sched_clutch_root_bucket_t, sched_bucket_t, bool);
60static void sched_clutch_root_pri_update(sched_clutch_root_t);
61static void sched_clutch_root_urgency_inc(sched_clutch_root_t, thread_t);
62static void sched_clutch_root_urgency_dec(sched_clutch_root_t, thread_t);
63
64__enum_decl(sched_clutch_highest_root_bucket_type_t, uint32_t, {
65 SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_NONE = 0,
66 SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY = 1,
67 SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL = 2,
68});
69
70static sched_clutch_root_bucket_t sched_clutch_root_highest_root_bucket(sched_clutch_root_t, uint64_t, sched_clutch_highest_root_bucket_type_t);
71
72#if CONFIG_SCHED_EDGE
73/* Support for foreign threads on AMP platforms */
74static boolean_t sched_clutch_root_foreign_empty(sched_clutch_root_t);
75static thread_t sched_clutch_root_highest_foreign_thread_remove(sched_clutch_root_t);
76#endif /* CONFIG_SCHED_EDGE */
77
78/* Root bucket level hierarchy management */
79static uint64_t sched_clutch_root_bucket_deadline_calculate(sched_clutch_root_bucket_t, uint64_t);
80static void sched_clutch_root_bucket_deadline_update(sched_clutch_root_bucket_t, sched_clutch_root_t, uint64_t);
81
82/* Options for clutch bucket ordering in the runq */
83__options_decl(sched_clutch_bucket_options_t, uint32_t, {
84 SCHED_CLUTCH_BUCKET_OPTIONS_NONE = 0x0,
85 /* Round robin clutch bucket on thread removal */
86 SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR = 0x1,
87 /* Insert clutch bucket at head (for thread preemption) */
88 SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ = 0x2,
89 /* Insert clutch bucket at tail (default) */
90 SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ = 0x4,
91});
92
93/* Clutch bucket level hierarchy management */
94static void sched_clutch_bucket_hierarchy_insert(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
95static void sched_clutch_bucket_hierarchy_remove(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
96static boolean_t sched_clutch_bucket_runnable(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
97static boolean_t sched_clutch_bucket_update(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
98static void sched_clutch_bucket_empty(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
99static uint8_t sched_clutch_bucket_pri_calculate(sched_clutch_bucket_t, uint64_t);
100
101/* Clutch bucket group level properties management */
102static void sched_clutch_bucket_group_cpu_usage_update(sched_clutch_bucket_group_t, uint64_t);
103static void sched_clutch_bucket_group_cpu_adjust(sched_clutch_bucket_group_t, uint8_t);
104static void sched_clutch_bucket_group_timeshare_update(sched_clutch_bucket_group_t, sched_clutch_bucket_t, uint64_t);
105static uint8_t sched_clutch_bucket_group_pending_ageout(sched_clutch_bucket_group_t, uint64_t);
106static uint32_t sched_clutch_bucket_group_run_count_inc(sched_clutch_bucket_group_t);
107static uint32_t sched_clutch_bucket_group_run_count_dec(sched_clutch_bucket_group_t);
108static uint8_t sched_clutch_bucket_group_interactivity_score_calculate(sched_clutch_bucket_group_t, uint64_t);
109
110/* Clutch timeshare properties updates */
111static uint32_t sched_clutch_run_bucket_incr(sched_clutch_t, sched_bucket_t);
112static uint32_t sched_clutch_run_bucket_decr(sched_clutch_t, sched_bucket_t);
113
114/* Clutch membership management */
115static boolean_t sched_clutch_thread_insert(sched_clutch_root_t, thread_t, integer_t);
116static void sched_clutch_thread_remove(sched_clutch_root_t, thread_t, uint64_t, sched_clutch_bucket_options_t);
117static thread_t sched_clutch_thread_highest_remove(sched_clutch_root_t);
118
119/* Clutch properties updates */
120static uint32_t sched_clutch_root_urgency(sched_clutch_root_t);
121static uint32_t sched_clutch_root_count_sum(sched_clutch_root_t);
122static int sched_clutch_root_priority(sched_clutch_root_t);
123static sched_clutch_bucket_t sched_clutch_root_bucket_highest_clutch_bucket(sched_clutch_root_bucket_t);
124static boolean_t sched_thread_sched_pri_promoted(thread_t);
125
126#if CONFIG_SCHED_EDGE
127/* System based routines */
128static bool sched_edge_pset_available(processor_set_t);
129static uint32_t sched_edge_thread_bound_cluster_id(thread_t);
130static int sched_edge_iterate_clusters_ordered(processor_set_t, uint64_t, int);
131
132/* Global indicating the maximum number of clusters on the current platform */
133static int sched_edge_max_clusters = 0;
134#endif /* CONFIG_SCHED_EDGE */
135
136/* Helper debugging routines */
137static inline void sched_clutch_hierarchy_locked_assert(sched_clutch_root_t);
138
139extern processor_set_t pset_array[MAX_PSETS];
140
141/*
142 * Special markers for buckets that have invalid WCELs/quantums etc.
143 */
144#define SCHED_CLUTCH_INVALID_TIME_32 ((uint32_t)~0)
145#define SCHED_CLUTCH_INVALID_TIME_64 ((uint64_t)~0)
146
147/*
148 * Root level bucket WCELs
149 *
150 * The root level bucket selection algorithm is an Earliest Deadline
151 * First (EDF) algorithm where the deadline for each bucket is defined
152 * by its worst-case-execution-latency and the timestamp at which the
153 * bucket was made runnable.
154 *
155 */
156static uint32_t sched_clutch_root_bucket_wcel_us[TH_BUCKET_SCHED_MAX] = {
157 SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
158 0, /* FG */
159 37500, /* IN (37.5ms) */
160 75000, /* DF (75ms) */
161 150000, /* UT (150ms) */
162 250000 /* BG (250ms) */
163};
164static uint64_t sched_clutch_root_bucket_wcel[TH_BUCKET_SCHED_MAX] = {0};
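/*
 * For example, with the WCEL values above, a UT root bucket that becomes
 * runnable at time t gets an EDF deadline of (t + 150ms) while an IN root
 * bucket made runnable at the same time gets (t + 37.5ms), so the IN bucket
 * is selected first unless the UT bucket has already been waiting much longer.
 * Roughly (see sched_clutch_root_bucket_deadline_calculate() below):
 *
 *     deadline = makerunnable_timestamp + sched_clutch_root_bucket_wcel[bucket];
 */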
165
166/*
167 * Root level bucket warp
168 *
169 * Each root level bucket has a warp value associated with it as well.
170 * The warp value allows the root bucket to effectively warp ahead of
171 * lower priority buckets for a limited time even if it has a later
172 * deadline. The warping behavior provides extra (but limited)
173 * opportunity for high priority buckets to remain responsive.
174 */
175
176/* Special warp deadline value to indicate that the bucket has not used any warp yet */
177#define SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED (SCHED_CLUTCH_INVALID_TIME_64)
178
179/* Warp window durations for various tiers */
180static uint32_t sched_clutch_root_bucket_warp_us[TH_BUCKET_SCHED_MAX] = {
181 SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
182 8000, /* FG (8ms) */
183 4000, /* IN (4ms) */
184 2000, /* DF (2ms) */
185 1000, /* UT (1ms) */
186 0 /* BG (0ms) */
187};
188static uint64_t sched_clutch_root_bucket_warp[TH_BUCKET_SCHED_MAX] = {0};
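/*
 * For example, an FG root bucket whose deadline is later than a runnable UT
 * bucket's can still be selected for up to its 8ms warp window; once that warp
 * is exhausted, selection reverts to plain EDF order until the FG bucket is
 * next picked in its natural order, at which point its warp is replenished.
 */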
189
190/*
191 * Thread level quantum
192 *
193 * The algorithm defines quantums for threads at various buckets. This
194 * (combined with the root level bucket quantums) restricts how much
195 * the lower priority levels can preempt the higher priority threads.
196 */
197
198#if XNU_TARGET_OS_OSX
199static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
200 10000, /* FIXPRI (10ms) */
201 10000, /* FG (10ms) */
202 10000, /* IN (10ms) */
203 10000, /* DF (10ms) */
204 4000, /* UT (4ms) */
205 2000 /* BG (2ms) */
206};
207#else /* XNU_TARGET_OS_OSX */
208static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
209 10000, /* FIXPRI (10ms) */
210 10000, /* FG (10ms) */
211 8000, /* IN (8ms) */
212 6000, /* DF (6ms) */
213 4000, /* UT (4ms) */
214 2000 /* BG (2ms) */
215};
216#endif /* XNU_TARGET_OS_OSX */
217
218static uint64_t sched_clutch_thread_quantum[TH_BUCKET_SCHED_MAX] = {0};
219
220/*
221 * sched_clutch_us_to_abstime()
222 *
223 * Initializer for converting all durations in usec to abstime
224 */
225static void
226sched_clutch_us_to_abstime(uint32_t *us_vals, uint64_t *abstime_vals)
227{
228 for (int i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
229 if (us_vals[i] == SCHED_CLUTCH_INVALID_TIME_32) {
230 abstime_vals[i] = SCHED_CLUTCH_INVALID_TIME_64;
231 } else {
232 clock_interval_to_absolutetime_interval(us_vals[i],
233 NSEC_PER_USEC, &abstime_vals[i]);
234 }
235 }
236}
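/*
 * This helper is expected to be called once during scheduler initialization to
 * populate the abstime tables from the usec tables above, along the lines of:
 *
 *     sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
 *     sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
 *     sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
 */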
237
238/* Clutch/Edge Scheduler Debugging support */
239#define SCHED_CLUTCH_DBG_THR_COUNT_PACK(a, b, c) ((uint64_t)c | ((uint64_t)b << 16) | ((uint64_t)a << 32))
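/*
 * The macro packs three 16-bit thread counts into a single tracepoint argument:
 * c occupies bits [15:0], b bits [31:16] and a bits [47:32]. For example,
 * SCHED_CLUTCH_DBG_THR_COUNT_PACK(1, 2, 3) evaluates to 0x0000000100020003.
 */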
240
241#if DEVELOPMENT || DEBUG
242
243/*
244 * sched_clutch_hierarchy_locked_assert()
245 *
246 * Debugging helper routine. Asserts that the hierarchy is locked. The locking
247 * for the hierarchy depends on where the hierarchy is hooked. The current
248 * implementation hooks the hierarchy at the pset, so the hierarchy is locked
249 * using the pset lock.
250 */
251static inline void
252sched_clutch_hierarchy_locked_assert(
253 sched_clutch_root_t root_clutch)
254{
255 pset_assert_locked(root_clutch->scr_pset);
256}
257
258#else /* DEVELOPMENT || DEBUG */
259
260static inline void
261sched_clutch_hierarchy_locked_assert(
262 __unused sched_clutch_root_t root_clutch)
263{
264}
265
266#endif /* DEVELOPMENT || DEBUG */
267
268/*
269 * sched_clutch_thr_count_inc()
270 *
271 * Increment thread count at a hierarchy level with overflow checks.
272 */
273static void
274sched_clutch_thr_count_inc(
275 uint16_t *thr_count)
276{
277 if (__improbable(os_inc_overflow(thr_count))) {
278 panic("sched_clutch thread count overflowed!");
279 }
280}
281
282/*
283 * sched_clutch_thr_count_dec()
284 *
285 * Decrement thread count at a hierarchy level with underflow checks.
286 */
287static void
288sched_clutch_thr_count_dec(
289 uint16_t *thr_count)
290{
291 if (__improbable(os_dec_overflow(thr_count))) {
292 panic("sched_clutch thread count underflowed!");
293 }
294}
295
296static sched_bucket_t
297sched_convert_pri_to_bucket(uint8_t priority)
298{
299 sched_bucket_t bucket = TH_BUCKET_RUN;
300
301 if (priority > BASEPRI_USER_INITIATED) {
302 bucket = TH_BUCKET_SHARE_FG;
303 } else if (priority > BASEPRI_DEFAULT) {
304 bucket = TH_BUCKET_SHARE_IN;
305 } else if (priority > BASEPRI_UTILITY) {
306 bucket = TH_BUCKET_SHARE_DF;
307 } else if (priority > MAXPRI_THROTTLE) {
308 bucket = TH_BUCKET_SHARE_UT;
309 } else {
310 bucket = TH_BUCKET_SHARE_BG;
311 }
312 return bucket;
313}
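/*
 * For example, with the usual priority band values (BASEPRI_USER_INITIATED = 37,
 * BASEPRI_DEFAULT = 31, BASEPRI_UTILITY = 20, MAXPRI_THROTTLE = 4), a timeshare
 * thread at priority 47 maps to TH_BUCKET_SHARE_FG, one at priority 31 maps to
 * TH_BUCKET_SHARE_DF and one at priority 4 maps to TH_BUCKET_SHARE_BG.
 */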
314
315/*
316 * sched_clutch_thread_bucket_map()
317 *
318 * Map a thread to a scheduling bucket for the clutch/edge scheduler
319 * based on its scheduling mode and the priority attribute passed in.
320 */
321static sched_bucket_t
322sched_clutch_thread_bucket_map(thread_t thread, int pri)
323{
324 switch (thread->sched_mode) {
325 case TH_MODE_FIXED:
326 if (pri >= BASEPRI_FOREGROUND) {
327 return TH_BUCKET_FIXPRI;
328 } else {
329 return sched_convert_pri_to_bucket(pri);
330 }
331
332 case TH_MODE_REALTIME:
333 return TH_BUCKET_FIXPRI;
334
335 case TH_MODE_TIMESHARE:
336 return sched_convert_pri_to_bucket(pri);
337
338 default:
339 panic("unexpected mode: %d", thread->sched_mode);
340 break;
341 }
342}
343
344/*
345 * The clutch scheduler attempts to age out the CPU usage of clutch bucket groups
346 * based on the amount of time they have been pending and the load at that
347 * scheduling bucket level. Since the clutch bucket groups are global (i.e. they
348 * span multiple clusters), it is important to keep the load as a global counter as well.
349 */
350static uint32_t _Atomic sched_clutch_global_bucket_load[TH_BUCKET_SCHED_MAX];
351
352/*
353 * sched_clutch_root_init()
354 *
355 * Routine to initialize the scheduler hierarchy root.
356 */
357static void
358sched_clutch_root_init(
359 sched_clutch_root_t root_clutch,
360 processor_set_t pset)
361{
362 root_clutch->scr_thr_count = 0;
363 root_clutch->scr_priority = NOPRI;
364 root_clutch->scr_urgency = 0;
365 root_clutch->scr_pset = pset;
366#if CONFIG_SCHED_EDGE
367 root_clutch->scr_cluster_id = pset->pset_cluster_id;
368#else /* CONFIG_SCHED_EDGE */
369 root_clutch->scr_cluster_id = 0;
370#endif /* CONFIG_SCHED_EDGE */
371
372 for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
373 root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type] = 0;
374 }
375 /* Initialize the queue which maintains all runnable clutch_buckets for timesharing purposes */
376 queue_init(&root_clutch->scr_clutch_buckets);
377
378 /* Initialize the priority queue which maintains all runnable foreign clutch buckets */
379 priority_queue_init(&root_clutch->scr_foreign_buckets);
380 bzero(&root_clutch->scr_cumulative_run_count, sizeof(root_clutch->scr_cumulative_run_count));
381 bitmap_zero(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
382 bitmap_zero(root_clutch->scr_bound_warp_available, TH_BUCKET_SCHED_MAX);
383 priority_queue_init(&root_clutch->scr_bound_root_buckets);
384
385 /* Initialize the bitmap and priority queue of runnable root buckets */
386 priority_queue_init(&root_clutch->scr_unbound_root_buckets);
387 bitmap_zero(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
388 bitmap_zero(root_clutch->scr_unbound_warp_available, TH_BUCKET_SCHED_MAX);
389
390 /* Initialize all the root buckets */
391 for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
392 sched_clutch_root_bucket_init(&root_clutch->scr_unbound_buckets[i], i, false);
393 sched_clutch_root_bucket_init(&root_clutch->scr_bound_buckets[i], i, true);
394 }
395}
396
397/*
398 * Clutch Bucket Runqueues
399 *
400 * The clutch buckets are maintained in a runq at the root bucket level. The
401 * runq organization allows clutch buckets to be ordered based on various
402 * factors such as:
403 *
404 * - Clutch buckets are round robin'ed at the same priority level when a
405 * thread is selected from a clutch bucket. This prevents a clutch bucket
406 * from starving out other clutch buckets at the same priority.
407 *
408 * - Clutch buckets are inserted at the head when they become runnable due to
409 * thread preemption. This allows threads that were preempted to maintain
410 * their order in the queue.
411 */
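/*
 * For instance, when a clutch bucket becomes runnable again because one of its
 * threads was preempted, callers are expected to enqueue it at the head of its
 * priority level, roughly:
 *
 *     sched_clutch_bucket_runq_enqueue(&root_bucket->scrb_clutch_buckets,
 *         clutch_bucket, SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ);
 *
 * whereas ordinary wakeups use SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ (the default).
 */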
412
413/*
414 * sched_clutch_bucket_runq_init()
415 *
416 * Initialize a clutch bucket runq.
417 */
418static void
419sched_clutch_bucket_runq_init(
420 sched_clutch_bucket_runq_t clutch_buckets_rq)
421{
422 clutch_buckets_rq->scbrq_highq = NOPRI;
423 for (uint8_t i = 0; i < BITMAP_LEN(NRQS); i++) {
424 clutch_buckets_rq->scbrq_bitmap[i] = 0;
425 }
426 clutch_buckets_rq->scbrq_count = 0;
427 for (int i = 0; i < NRQS; i++) {
428 circle_queue_init(&clutch_buckets_rq->scbrq_queues[i]);
429 }
430}
431
432/*
433 * sched_clutch_bucket_runq_empty()
434 *
435 * Returns whether the clutch bucket runq is empty.
436 */
437static boolean_t
438sched_clutch_bucket_runq_empty(
439 sched_clutch_bucket_runq_t clutch_buckets_rq)
440{
441 return clutch_buckets_rq->scbrq_count == 0;
442}
443
444/*
445 * sched_clutch_bucket_runq_peek()
446 *
447 * Returns the highest priority clutch bucket in the runq.
448 */
449static sched_clutch_bucket_t
450sched_clutch_bucket_runq_peek(
451 sched_clutch_bucket_runq_t clutch_buckets_rq)
452{
453 if (clutch_buckets_rq->scbrq_count > 0) {
454 circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_buckets_rq->scbrq_highq];
455 return cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink);
456 } else {
457 return NULL;
458 }
459}
460
461/*
462 * sched_clutch_bucket_runq_enqueue()
463 *
464 * Enqueue a clutch bucket into the runq based on the options passed in.
465 */
466static void
467sched_clutch_bucket_runq_enqueue(
468 sched_clutch_bucket_runq_t clutch_buckets_rq,
469 sched_clutch_bucket_t clutch_bucket,
470 sched_clutch_bucket_options_t options)
471{
472 circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
473 if (circle_queue_empty(queue)) {
474 circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
475 bitmap_set(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
476 if (clutch_bucket->scb_priority > clutch_buckets_rq->scbrq_highq) {
477 clutch_buckets_rq->scbrq_highq = clutch_bucket->scb_priority;
478 }
479 } else {
480 if (options & SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ) {
481 circle_enqueue_head(queue, &clutch_bucket->scb_runqlink);
482 } else {
483 /*
484 * Default behavior (handles SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ &
485 * SCHED_CLUTCH_BUCKET_OPTIONS_NONE)
486 */
487 circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
488 }
489 }
490 clutch_buckets_rq->scbrq_count++;
491}
492
493/*
494 * sched_clutch_bucket_runq_remove()
495 *
496 * Remove a clutch bucket from the runq.
497 */
498static void
499sched_clutch_bucket_runq_remove(
500 sched_clutch_bucket_runq_t clutch_buckets_rq,
501 sched_clutch_bucket_t clutch_bucket)
502{
503 circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
504 circle_dequeue(queue, &clutch_bucket->scb_runqlink);
505 assert(clutch_buckets_rq->scbrq_count > 0);
506 clutch_buckets_rq->scbrq_count--;
507 if (circle_queue_empty(queue)) {
508 bitmap_clear(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
509 clutch_buckets_rq->scbrq_highq = bitmap_first(clutch_buckets_rq->scbrq_bitmap, NRQS);
510 }
511}
512
513static void
514sched_clutch_bucket_runq_rotate(
515 sched_clutch_bucket_runq_t clutch_buckets_rq,
516 sched_clutch_bucket_t clutch_bucket)
517{
518 circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
519 assert(clutch_bucket == cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink));
520 circle_queue_rotate_head_forward(queue);
521}
522
523/*
524 * sched_clutch_root_bucket_init()
525 *
526 * Routine to initialize root buckets.
527 */
528static void
529sched_clutch_root_bucket_init(
530 sched_clutch_root_bucket_t root_bucket,
531 sched_bucket_t bucket,
532 bool bound_root_bucket)
533{
534 root_bucket->scrb_bucket = bucket;
535 if (bound_root_bucket) {
536 /* For bound root buckets, initialize the bound thread runq. */
537 root_bucket->scrb_bound = true;
538 run_queue_init(&root_bucket->scrb_bound_thread_runq);
539 } else {
540 /*
541 * The unbounded root buckets contain a runq of runnable clutch buckets
542 * which then hold the runnable threads.
543 */
544 root_bucket->scrb_bound = false;
545 sched_clutch_bucket_runq_init(&root_bucket->scrb_clutch_buckets);
546 }
547 priority_queue_entry_init(&root_bucket->scrb_pqlink);
548 root_bucket->scrb_pqlink.deadline = SCHED_CLUTCH_INVALID_TIME_64;
549 root_bucket->scrb_warped_deadline = 0;
550 root_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[root_bucket->scrb_bucket];
551 root_bucket->scrb_starvation_avoidance = false;
552 root_bucket->scrb_starvation_ts = 0;
553}
554
555/*
556 * Special case scheduling for Above UI bucket.
557 *
558 * AboveUI threads are typically system critical threads that need low latency
559 * which is why they are handled specially.
560 *
561 * Since the priority range for AboveUI and FG Timeshare buckets overlap, it is
562 * important to maintain some native priority order between those buckets. For unbounded
563 * root buckets, the policy is to compare the highest clutch buckets of both root buckets; if the
564 * Above UI bucket is higher, schedule it immediately. Otherwise fall through to the
565 * deadline based scheduling, which should pick up the timeshare buckets. For the bound
566 * case, the policy simply compares the priority of the highest runnable threads in
567 * the above UI and timeshare buckets.
568 *
569 * The implementation allows extremely low latency CPU access for Above UI threads
570 * while supporting the use case of high priority timeshare threads contending with
571 * lower priority fixed priority threads.
572 */
573
574
575/*
576 * sched_clutch_root_unbound_select_aboveui()
577 *
578 * Routine to determine if the above UI unbounded bucket should be selected for execution.
579 */
580static bool
581sched_clutch_root_unbound_select_aboveui(
582 sched_clutch_root_t root_clutch)
583{
584 if (bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_FIXPRI)) {
585 sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
586 sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_unbound_buckets[TH_BUCKET_SHARE_FG];
587 if (!bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SHARE_FG)) {
588 /* If the timeshare FG bucket is not runnable, pick the aboveUI bucket for scheduling */
589 return true;
590 }
591 sched_clutch_bucket_t clutch_bucket_aboveui = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_aboveui);
592 sched_clutch_bucket_t clutch_bucket_sharefg = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_sharefg);
593 if (clutch_bucket_aboveui->scb_priority >= clutch_bucket_sharefg->scb_priority) {
594 return true;
595 }
596 }
597 return false;
598}
599
600/*
601 * sched_clutch_root_bound_select_aboveui()
602 *
603 * Routine to determine if the above UI bounded bucket should be selected for execution.
604 */
605static bool
606sched_clutch_root_bound_select_aboveui(
607 sched_clutch_root_t root_clutch)
608{
609 sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
610 sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_bound_buckets[TH_BUCKET_SHARE_FG];
611 if (root_bucket_aboveui->scrb_bound_thread_runq.count == 0) {
612 return false;
613 }
614 return root_bucket_aboveui->scrb_bound_thread_runq.highq >= root_bucket_sharefg->scrb_bound_thread_runq.highq;
615}
616
617/*
618 * sched_clutch_root_highest_root_bucket()
619 *
620 * Main routine to find the highest runnable root level bucket.
621 * This routine is called from performance sensitive contexts; so it is
622 * crucial to keep this O(1). The type parameter determines whether
623 * the selection logic should look at unbounded threads only (for
624 * cross-cluster stealing operations) or both bounded and unbounded
625 * threads (for selecting next thread for execution on current cluster).
626 */
627static sched_clutch_root_bucket_t
628sched_clutch_root_highest_root_bucket(
629 sched_clutch_root_t root_clutch,
630 uint64_t timestamp,
631 sched_clutch_highest_root_bucket_type_t type)
632{
633 sched_clutch_hierarchy_locked_assert(root_clutch);
634 int highest_runnable_bucket = -1;
635 if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
636 highest_runnable_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
637 } else {
638 int highest_unbound_runnable_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
639 int highest_bound_runnable_bucket = bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
640 highest_runnable_bucket = (highest_bound_runnable_bucket != -1) ? ((highest_unbound_runnable_bucket != -1) ? MIN(highest_bound_runnable_bucket, highest_unbound_runnable_bucket) : highest_bound_runnable_bucket) : highest_unbound_runnable_bucket;
641 }
642
643 if (highest_runnable_bucket == -1) {
644 return NULL;
645 }
646
647 /* Above UI root bucket selection (see comment above for more details on this special case handling) */
648 bool unbound_aboveui = sched_clutch_root_unbound_select_aboveui(root_clutch);
649 if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
650 if (unbound_aboveui) {
651 return &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
652 }
653 /* Fall through to selecting a timeshare root bucket */
654 } else {
655 bool bound_aboveui = sched_clutch_root_bound_select_aboveui(root_clutch);
656 sched_clutch_root_bucket_t unbound_aboveui_root_bucket = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
657 sched_clutch_root_bucket_t bound_aboveui_root_bucket = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
658
659 if (unbound_aboveui && bound_aboveui) {
660 /*
661 * In this scenario both the bounded and unbounded above UI buckets are runnable; choose based on the
662 * highest runnable priority in both buckets.
663 */
664 int bound_aboveui_pri = root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI].scrb_bound_thread_runq.highq;
665 sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(unbound_aboveui_root_bucket);
666 int unbound_aboveui_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
667 return (bound_aboveui_pri >= unbound_aboveui_pri) ? bound_aboveui_root_bucket : unbound_aboveui_root_bucket;
668 }
669 if (unbound_aboveui) {
670 return unbound_aboveui_root_bucket;
671 }
672 if (bound_aboveui) {
673 return bound_aboveui_root_bucket;
674 }
675 /* Fall through to selecting a timeshare root bucket */
676 }
677
678 /*
679 * Above UI bucket is not runnable or has a low priority runnable thread; use the
680 * earliest deadline model to schedule threads. The idea is that as the timeshare
681 * buckets use CPU, they will drop their interactivity score/sched priority and
682 * allow the low priority AboveUI buckets to be scheduled.
683 */
684
685 /* Find the earliest deadline bucket */
686 sched_clutch_root_bucket_t edf_bucket = NULL;
687 sched_clutch_root_bucket_t warp_bucket = NULL;
688 int warp_bucket_index = -1;
689
690evaluate_root_buckets:
691 if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
692 edf_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
693 } else {
694 sched_clutch_root_bucket_t unbound_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
695 sched_clutch_root_bucket_t bound_bucket = priority_queue_min(&root_clutch->scr_bound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
696 if (bound_bucket && unbound_bucket) {
697 /* If bound and unbound root buckets are runnable, select the one with the earlier deadline */
698 edf_bucket = (bound_bucket->scrb_pqlink.deadline <= unbound_bucket->scrb_pqlink.deadline) ? bound_bucket : unbound_bucket;
699 } else {
700 edf_bucket = (bound_bucket) ? bound_bucket : unbound_bucket;
701 }
702 }
703 /*
704 * Check if any of the buckets have warp available. The implementation only allows root buckets to warp ahead of
705 * buckets of the same type (i.e. bound/unbound). The reason for doing that is that warping is a concept that
706 * makes sense between root buckets of the same type since it is effectively a scheduling advantage over a lower
707 * QoS root bucket.
708 */
709 bitmap_t *warp_available_bitmap = (edf_bucket->scrb_bound) ? (root_clutch->scr_bound_warp_available) : (root_clutch->scr_unbound_warp_available);
710 warp_bucket_index = bitmap_lsb_first(warp_available_bitmap, TH_BUCKET_SCHED_MAX);
711
712 if ((warp_bucket_index == -1) || (warp_bucket_index >= edf_bucket->scrb_bucket)) {
713 /* No higher buckets have warp left; best choice is the EDF based bucket */
714 if (edf_bucket->scrb_starvation_avoidance) {
715 /*
716 * Indicates that the earliest deadline bucket is in starvation avoidance mode. Check to see if the
717 * starvation avoidance window is still open and return this bucket if it is.
718 *
719 * The starvation avoidance window is calculated based on the quantum of threads at that bucket and
720 * the number of CPUs in the cluster. The idea is to provide roughly one quantum's worth of starvation
721 * avoidance across all CPUs.
722 */
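 /*
  * For example, with the 10ms FG thread quantum on a hypothetical 4-CPU
  * cluster, the starvation avoidance window below works out to 2.5ms.
  */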
723 uint64_t starvation_window = sched_clutch_thread_quantum[edf_bucket->scrb_bucket] / pset_available_cpu_count(root_clutch->scr_pset);
724 if (timestamp < (edf_bucket->scrb_starvation_ts + starvation_window)) {
725 return edf_bucket;
726 } else {
727 /* Starvation avoidance window is over; update deadline and re-evaluate EDF */
728 edf_bucket->scrb_starvation_avoidance = false;
729 edf_bucket->scrb_starvation_ts = 0;
730 sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp);
731 }
732 goto evaluate_root_buckets;
733 }
734
735 /* Looks like the EDF bucket is not in starvation avoidance mode; check if it should be */
736 if (highest_runnable_bucket < edf_bucket->scrb_bucket) {
737 /* Since a higher bucket is runnable, it indicates that the EDF bucket should be in starvation avoidance */
738 edf_bucket->scrb_starvation_avoidance = true;
739 edf_bucket->scrb_starvation_ts = timestamp;
740 } else {
741 /* EDF bucket is being selected in the natural order; update deadline and reset warp */
742 sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp);
743 edf_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[edf_bucket->scrb_bucket];
744 edf_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
745 if (edf_bucket->scrb_bound) {
746 bitmap_set(root_clutch->scr_bound_warp_available, edf_bucket->scrb_bucket);
747 } else {
748 bitmap_set(root_clutch->scr_unbound_warp_available, edf_bucket->scrb_bucket);
749 }
750 }
751 return edf_bucket;
752 }
753
754 /*
755 * Looks like there is a root bucket which is higher in the natural priority
756 * order than edf_bucket and might have some warp remaining.
757 */
758 warp_bucket = (edf_bucket->scrb_bound) ? &root_clutch->scr_bound_buckets[warp_bucket_index] : &root_clutch->scr_unbound_buckets[warp_bucket_index];
759 if (warp_bucket->scrb_warped_deadline == SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
760 /* Root bucket has not used any of its warp; set a deadline to expire its warp and return it */
761 warp_bucket->scrb_warped_deadline = timestamp + warp_bucket->scrb_warp_remaining;
762 sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
763 return warp_bucket;
764 }
765 if (warp_bucket->scrb_warped_deadline > timestamp) {
766 /* Root bucket already has a warp window open with some warp remaining */
767 sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
768 return warp_bucket;
769 }
770
771 /* For this bucket, the warp window was opened sometime in the past but has now
772 * expired. Mark the bucket as not available for warp anymore and re-run the
773 * warp bucket selection logic.
774 */
775 warp_bucket->scrb_warp_remaining = 0;
776 if (warp_bucket->scrb_bound) {
777 bitmap_clear(root_clutch->scr_bound_warp_available, warp_bucket->scrb_bucket);
778 } else {
779 bitmap_clear(root_clutch->scr_unbound_warp_available, warp_bucket->scrb_bucket);
780 }
781 goto evaluate_root_buckets;
782}
783
784/*
785 * sched_clutch_root_bucket_deadline_calculate()
786 *
787 * Calculate the deadline for the bucket based on its WCEL
788 */
789static uint64_t
790sched_clutch_root_bucket_deadline_calculate(
791 sched_clutch_root_bucket_t root_bucket,
792 uint64_t timestamp)
793{
794 /* For the fixpri AboveUI bucket, always return the earliest possible deadline */
795 if (root_bucket->scrb_bucket < TH_BUCKET_SHARE_FG) {
796 return 0;
797 }
798
799 /* For all timeshare buckets set the deadline as current time + worst-case-execution-latency */
800 return timestamp + sched_clutch_root_bucket_wcel[root_bucket->scrb_bucket];
801}
802
803/*
804 * sched_clutch_root_bucket_deadline_update()
805 *
806 * Routine to update the deadline of the root bucket when it is selected.
807 * Updating the deadline also moves the root_bucket in the EDF priority
808 * queue.
809 */
810static void
811sched_clutch_root_bucket_deadline_update(
812 sched_clutch_root_bucket_t root_bucket,
813 sched_clutch_root_t root_clutch,
814 uint64_t timestamp)
815{
816 if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
817 /* The algorithm never uses the deadlines for scheduling TH_BUCKET_FIXPRI bucket */
818 return;
819 }
820
821 uint64_t old_deadline = root_bucket->scrb_pqlink.deadline;
822 uint64_t new_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
823 if (__improbable(old_deadline > new_deadline)) {
824 panic("old_deadline (%llu) > new_deadline (%llu); root_bucket (%d); timestamp (%llu)", old_deadline, new_deadline, root_bucket->scrb_bucket, timestamp);
825 }
826 if (old_deadline != new_deadline) {
827 root_bucket->scrb_pqlink.deadline = new_deadline;
828 struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
829 priority_queue_entry_increased(prioq, &root_bucket->scrb_pqlink);
830 }
831}
832
833/*
834 * sched_clutch_root_bucket_runnable()
835 *
836 * Routine to insert a newly runnable root bucket into the hierarchy.
837 * Also updates the deadline and warp parameters as necessary.
838 */
839static void
840sched_clutch_root_bucket_runnable(
841 sched_clutch_root_bucket_t root_bucket,
842 sched_clutch_root_t root_clutch,
843 uint64_t timestamp)
844{
845 /* Mark the root bucket as runnable */
846 bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
847 bitmap_set(runnable_bitmap, root_bucket->scrb_bucket);
848
849 if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
850 /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
851 return;
852 }
853
854 if (root_bucket->scrb_starvation_avoidance == false) {
855 /*
856 * Only update the deadline if the bucket was not in starvation avoidance mode. If the bucket was in
857 * starvation avoidance and its window has expired, the highest root bucket selection logic will notice
858 * that and fix it up.
859 */
860 root_bucket->scrb_pqlink.deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
861 }
862 struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
863 priority_queue_insert(prioq, &root_bucket->scrb_pqlink);
864 if (root_bucket->scrb_warp_remaining) {
865 /* Since the bucket has some warp remaining and it is now runnable, mark it as available for warp */
866 bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
867 bitmap_set(warp_bitmap, root_bucket->scrb_bucket);
868 }
869}
870
871/*
872 * sched_clutch_root_bucket_empty()
873 *
874 * Routine to remove an empty root bucket from the hierarchy.
875 * Also updates the deadline and warp parameters as necessary.
876 */
877static void
878sched_clutch_root_bucket_empty(
879 sched_clutch_root_bucket_t root_bucket,
880 sched_clutch_root_t root_clutch,
881 uint64_t timestamp)
882{
883 bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
884 bitmap_clear(runnable_bitmap, root_bucket->scrb_bucket);
885
886 if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
887 /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
888 return;
889 }
890
891 struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
892 priority_queue_remove(prioq, &root_bucket->scrb_pqlink);
893
894 bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
895 bitmap_clear(warp_bitmap, root_bucket->scrb_bucket);
896
897 if (root_bucket->scrb_warped_deadline != SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
898 if (root_bucket->scrb_warped_deadline > timestamp) {
899 /*
900 * For root buckets that were using the warp, check if the warp
901 * deadline is in the future. If yes, remove the wall time the
902 * warp was active and update the warp remaining. This allows
903 * the root bucket to use the remaining warp the next time it
904 * becomes runnable.
905 */
906 root_bucket->scrb_warp_remaining = root_bucket->scrb_warped_deadline - timestamp;
907 } else {
908 /*
909 * If the root bucket's warped deadline is in the past, it has used up
910 * all the warp it was assigned. Empty out its warp remaining.
911 */
912 root_bucket->scrb_warp_remaining = 0;
913 }
914 }
915}
916
917static int
918sched_clutch_global_bucket_load_get(
919 sched_bucket_t bucket)
920{
921 return (int)os_atomic_load(&sched_clutch_global_bucket_load[bucket], relaxed);
922}
923
924/*
925 * sched_clutch_root_pri_update()
926 *
927 * The root level priority is used for thread selection and preemption
928 * logic.
929 *
930 * The logic uses the same decision as thread selection for deciding between the
931 * above UI and timeshare buckets. If one of the timesharing buckets has to be
932 * used for priority calculation, the logic is slightly different from thread
933 * selection, because thread selection considers deadlines, warps etc. to
934 * decide the most optimal bucket at a given timestamp. Since the priority
935 * value is used for preemption decisions only, it needs to be based on the
936 * highest runnable thread available in the timeshare domain. This logic can
937 * be made more sophisticated if there are cases of unnecessary preemption
938 * being seen in workloads.
939 */
940static void
941sched_clutch_root_pri_update(
942 sched_clutch_root_t root_clutch)
943{
944 sched_clutch_hierarchy_locked_assert(root_clutch);
945 int16_t root_bound_pri = NOPRI;
946 int16_t root_unbound_pri = NOPRI;
947
948 if (bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
949 goto root_pri_update_unbound;
950 }
951 sched_clutch_root_bucket_t root_bucket_bound = NULL;
952 if (sched_clutch_root_bound_select_aboveui(root_clutch)) {
953 root_bucket_bound = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
954 } else {
955 int root_bucket_index = bitmap_lsb_next(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
956 assert(root_bucket_index != -1);
957 root_bucket_bound = &root_clutch->scr_bound_buckets[root_bucket_index];
958 }
959 root_bound_pri = root_bucket_bound->scrb_bound_thread_runq.highq;
960
961root_pri_update_unbound:
962 if (bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
963 goto root_pri_update_complete;
964 }
965 sched_clutch_root_bucket_t root_bucket_unbound = NULL;
966 if (sched_clutch_root_unbound_select_aboveui(root_clutch)) {
967 root_bucket_unbound = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
968 } else {
969 int root_bucket_index = bitmap_lsb_next(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
970 assert(root_bucket_index != -1);
971 root_bucket_unbound = &root_clutch->scr_unbound_buckets[root_bucket_index];
972 }
973 /* For the selected root bucket, find the highest priority clutch bucket */
974 sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_unbound);
975 root_unbound_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
976
977root_pri_update_complete:
978 root_clutch->scr_priority = MAX(root_bound_pri, root_unbound_pri);
979}
980
981/*
982 * sched_clutch_root_urgency_inc()
983 *
984 * Routine to increment the urgency at the root level based on the thread
985 * priority that is being inserted into the hierarchy. The root urgency
986 * counter is updated based on the urgency of threads in any of the
987 * clutch buckets which are part of the hierarchy.
988 *
989 * Always called with the pset lock held.
990 */
991static void
992sched_clutch_root_urgency_inc(
993 sched_clutch_root_t root_clutch,
994 thread_t thread)
995{
996 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
997 root_clutch->scr_urgency++;
998 }
999}
1000
1001/*
1002 * sched_clutch_root_urgency_dec()
1003 *
1004 * Routine to decrement the urgency at the root level based on the thread
1005 * priority that is being removed from the hierarchy. The root urgency
1006 * counter is updated based on the urgency of threads in any of the
1007 * clutch buckets which are part of the hierarchy.
1008 *
1009 * Always called with the pset lock held.
1010 */
1011static void
1012sched_clutch_root_urgency_dec(
1013 sched_clutch_root_t root_clutch,
1014 thread_t thread)
1015{
1016 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
1017 root_clutch->scr_urgency--;
1018 }
1019}
1020
1021/*
1022 * Clutch bucket level scheduling
1023 *
1024 * The second level of scheduling is the clutch bucket level scheduling
1025 * which tries to schedule thread groups within root_buckets. Each
1026 * clutch represents a thread group and a clutch_bucket_group represents
1027 * threads at a particular sched_bucket within that thread group. The
1028 * clutch_bucket_group contains a clutch_bucket per cluster on the system
1029 * where it holds the runnable threads destined for execution on that
1030 * cluster.
1031 *
1032 * The goal of this level of scheduling is to allow interactive thread
1033 * groups low latency access to the CPU. It also provides slight
1034 * scheduling preference for App and unrestricted thread groups.
1035 *
1036 * The clutch bucket scheduling algorithm measures an interactivity
1037 * score for all clutch bucket groups. The interactivity score is based
1038 * on the ratio of the CPU used and the voluntary blocking of threads
1039 * within the clutch bucket group. The algorithm is very close to the ULE
1040 * scheduler on FreeBSD in terms of calculations. The interactivity
1041 * score provides an interactivity boost in the range of
1042 * [0:SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI * 2] which allows interactive
1043 * thread groups to win over CPU spinners.
1044 *
1045 * The interactivity score of the clutch bucket group is combined with the
1046 * highest base/promoted priority of threads in the clutch bucket to form
1047 * the overall priority of the clutch bucket.
1048 */
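/*
 * With the default settings below, the interactivity boost falls in the range
 * [0:16]. For example, a clutch bucket whose highest runnable thread is at
 * priority 31 and whose clutch bucket group currently scores 16 competes at
 * roughly priority 47, letting it win over a CPU-bound group whose highest
 * thread sits at the same base priority but whose score has decayed to 0.
 */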
1049
1050/* Priority boost range for interactivity */
1051#define SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT (8)
1052uint8_t sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
1053
1054/* Window to scale the cpu usage and blocked values (currently 500ms). It is the threshold of used+blocked. */
1055uint64_t sched_clutch_bucket_group_adjust_threshold = 0;
1056#define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS (500000)
1057
1058/* The ratio to scale the cpu/blocked time per window */
1059#define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO (10)
1060
1061/* Initial value for voluntary blocking time for the clutch_bucket */
1062#define SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID (uint64_t)(~0)
1063
1064/* Value indicating the clutch bucket is not pending execution */
1065#define SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID ((uint64_t)(~0))
1066
1067/*
1068 * Thread group CPU starvation avoidance
1069 *
1070 * In heavily CPU contended scenarios, it is possible that some thread groups
1071 * which have a low interactivity score do not get CPU time at all. In order to
1072 * resolve that, the scheduler tries to age out the CPU usage of the clutch
1073 * bucket group when it has been pending execution for a certain time as defined
1074 * by the sched_clutch_bucket_group_pending_delta_us values below.
1075 *
1076 * The values chosen here are very close to the WCEL values for each sched bucket.
1077 * These values are multiplied by the load average of the relevant root bucket to
1078 * provide an estimate of the actual clutch bucket load.
1079 */
1080static uint32_t sched_clutch_bucket_group_pending_delta_us[TH_BUCKET_SCHED_MAX] = {
1081 SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
1082 10000, /* FG */
1083 37500, /* IN */
1084 75000, /* DF */
1085 150000, /* UT */
1086 250000, /* BG */
1087};
1088static uint64_t sched_clutch_bucket_group_pending_delta[TH_BUCKET_SCHED_MAX] = {0};
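/*
 * For example, an IN clutch bucket group that has been pending while the IN
 * root bucket carries a load of 4 would have its CPU usage aged out after
 * roughly 4 * 37.5ms = 150ms of pending time (see
 * sched_clutch_bucket_group_pending_ageout()).
 */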
1089
1090/*
1091 * sched_clutch_bucket_init()
1092 *
1093 * Initializer for clutch buckets.
1094 */
1095static void
1096sched_clutch_bucket_init(
1097 sched_clutch_bucket_t clutch_bucket,
1098 sched_clutch_bucket_group_t clutch_bucket_group,
1099 sched_bucket_t bucket)
1100{
1101 clutch_bucket->scb_bucket = bucket;
1102 /* scb_priority will be recalculated when a thread is inserted in the clutch bucket */
1103 clutch_bucket->scb_priority = 0;
1104#if CONFIG_SCHED_EDGE
1105 clutch_bucket->scb_foreign = false;
1106 priority_queue_entry_init(&clutch_bucket->scb_foreignlink);
1107#endif /* CONFIG_SCHED_EDGE */
1108 clutch_bucket->scb_group = clutch_bucket_group;
1109 clutch_bucket->scb_root = NULL;
1110 priority_queue_init(&clutch_bucket->scb_clutchpri_prioq);
1111 priority_queue_init(&clutch_bucket->scb_thread_runq);
1112 queue_init(&clutch_bucket->scb_thread_timeshare_queue);
1113}
1114
1115/*
1116 * sched_clutch_bucket_group_init()
1117 *
1118 * Initializer for clutch bucket groups.
1119 */
1120static void
1121sched_clutch_bucket_group_init(
1122 sched_clutch_bucket_group_t clutch_bucket_group,
1123 sched_clutch_t clutch,
1124 sched_bucket_t bucket)
1125{
1126 bzero(clutch_bucket_group, sizeof(struct sched_clutch_bucket_group));
1127 clutch_bucket_group->scbg_bucket = bucket;
1128 clutch_bucket_group->scbg_clutch = clutch;
1129
1130 int max_clusters = ml_get_cluster_count();
1131 clutch_bucket_group->scbg_clutch_buckets = kalloc_type(struct sched_clutch_bucket, max_clusters, Z_WAITOK | Z_ZERO);
1132 for (int i = 0; i < max_clusters; i++) {
1133 sched_clutch_bucket_init(&clutch_bucket_group->scbg_clutch_buckets[i], clutch_bucket_group, bucket);
1134 }
1135
1136 os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, 0, relaxed);
1137 os_atomic_store(&clutch_bucket_group->scbg_pri_shift, INT8_MAX, relaxed);
1138 os_atomic_store(&clutch_bucket_group->scbg_preferred_cluster, pset0.pset_cluster_id, relaxed);
1139 /*
1140 * All thread groups should be initialized to be interactive; this allows the newly launched
1141 * thread groups to fairly compete with already running thread groups.
1142 */
1143 clutch_bucket_group->scbg_interactivity_data.scct_count = (sched_clutch_bucket_group_interactive_pri * 2);
1144 clutch_bucket_group->scbg_interactivity_data.scct_timestamp = 0;
1145 os_atomic_store(&clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked, (clutch_cpu_data_t)sched_clutch_bucket_group_adjust_threshold, relaxed);
1146 clutch_bucket_group->scbg_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
1147 clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
1148 clutch_bucket_group->scbg_amp_rebalance_last_chosen = UINT32_MAX;
1149}
1150
1151static void
1152sched_clutch_bucket_group_destroy(
1153 sched_clutch_bucket_group_t clutch_bucket_group)
1154{
1155 kfree_type(struct sched_clutch_bucket, ml_get_cluster_count(),
1156 clutch_bucket_group->scbg_clutch_buckets);
1157}
1158
1159/*
1160 * sched_clutch_init_with_thread_group()
1161 *
1162 * Initialize the sched_clutch when the thread group is being created
1163 */
1164void
1165sched_clutch_init_with_thread_group(
1166 sched_clutch_t clutch,
1167 struct thread_group *tg)
1168{
1169 os_atomic_store(&clutch->sc_thr_count, 0, relaxed);
1170
1171 /* Initialize all the clutch buckets */
1172 for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
1173 sched_clutch_bucket_group_init(&(clutch->sc_clutch_groups[i]), clutch, i);
1174 }
1175
1176 /* Grouping specific fields */
1177 clutch->sc_tg = tg;
1178}
1179
1180/*
1181 * sched_clutch_destroy()
1182 *
1183 * Destructor for clutch; called from thread group release code.
1184 */
1185void
1186sched_clutch_destroy(
1187 sched_clutch_t clutch)
1188{
1189 assert(os_atomic_load(&clutch->sc_thr_count, relaxed) == 0);
1190 for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
1191 sched_clutch_bucket_group_destroy(&(clutch->sc_clutch_groups[i]));
1192 }
1193}
1194
1195#if CONFIG_SCHED_EDGE
1196
1197/*
1198 * Edge Scheduler Preferred Cluster Mechanism
1199 *
1200 * In order to have better control over various QoS buckets within a thread group, the Edge
1201 * scheduler allows CLPC to specify a preferred cluster for each QoS level in a TG. These
1202 * preferences are stored at the sched_clutch_bucket_group level since that represents all
1203 * threads at a particular QoS level within a sched_clutch. For any lookup of preferred
1204 * cluster, the logic always goes back to the preference stored at the clutch_bucket_group.
1205 */
1206
1207static uint32_t
1208sched_edge_clutch_bucket_group_preferred_cluster(sched_clutch_bucket_group_t clutch_bucket_group)
1209{
1210 return os_atomic_load(&clutch_bucket_group->scbg_preferred_cluster, relaxed);
1211}
1212
1213static uint32_t
1214sched_clutch_bucket_preferred_cluster(sched_clutch_bucket_t clutch_bucket)
1215{
1216 return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket->scb_group);
1217}
1218
1219uint32_t
1220sched_edge_thread_preferred_cluster(thread_t thread)
1221{
1222 if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1223 /* For threads bound to a specific cluster, return the bound cluster id */
1224 return sched_edge_thread_bound_cluster_id(thread);
1225 }
1226
1227 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1228 sched_bucket_t sched_bucket = thread->th_sched_bucket;
1229 if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
1230 sched_bucket = sched_clutch_thread_bucket_map(thread, thread->base_pri);
1231 }
1232 sched_clutch_bucket_group_t clutch_bucket_group = &clutch->sc_clutch_groups[sched_bucket];
1233 return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket_group);
1234}
1235
1236/*
1237 * Edge Scheduler Foreign Bucket Support
1238 *
1239 * In the Edge Scheduler, each cluster maintains a priority queue of clutch buckets containing
1240 * threads that are not native to the cluster. A clutch bucket is considered native if its
1241 * preferred cluster has the same type as the cluster it is enqueued in. The foreign clutch
1242 * bucket priority queue is used for rebalance operations to get threads back to their native
1243 * cluster quickly.
1244 *
1245 * It is possible to make this policy even more aggressive by considering all clusters that
1246 * are not the preferred cluster as the foreign cluster, but that would mean a lot of thread
1247 * migrations which might have performance implications.
1248 */
1249
1250static void
1251sched_clutch_bucket_mark_native(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
1252{
1253 if (clutch_bucket->scb_foreign) {
1254 clutch_bucket->scb_foreign = false;
1255 priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1256 }
1257}
1258
1259static void
1260sched_clutch_bucket_mark_foreign(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
1261{
1262 if (!clutch_bucket->scb_foreign) {
1263 clutch_bucket->scb_foreign = true;
1264 priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
1265 priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1266 }
1267}
1268
1269/*
1270 * Edge Scheduler Cumulative Load Average
1271 *
1272 * The Edge scheduler maintains a per-QoS/scheduling bucket load average for
1273 * making thread migration decisions. The per-bucket load is maintained as a
1274 * cumulative count since higher scheduling buckets impact load on lower buckets
1275 * for thread migration decisions.
1276 *
1277 */
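/*
 * For example, making a single TH_BUCKET_SHARE_IN thread runnable bumps the
 * cumulative counters for IN, DF, UT and BG (but not FIXPRI or FG), since an
 * IN thread contributes load to every bucket at or below its level; the
 * fall-through switch statements below implement exactly this.
 */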
1278
1279static void
1280sched_edge_cluster_cumulative_count_incr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1281{
1282 switch (bucket) {
1283 case TH_BUCKET_FIXPRI: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
1284 case TH_BUCKET_SHARE_FG: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
1285 case TH_BUCKET_SHARE_IN: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
1286 case TH_BUCKET_SHARE_DF: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
1287 case TH_BUCKET_SHARE_UT: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
1288 case TH_BUCKET_SHARE_BG: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
1289 default:
1290 panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_incr()");
1291 }
1292}
1293
1294static void
1295sched_edge_cluster_cumulative_count_decr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1296{
1297 switch (bucket) {
1298 case TH_BUCKET_FIXPRI: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
1299 case TH_BUCKET_SHARE_FG: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
1300 case TH_BUCKET_SHARE_IN: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
1301 case TH_BUCKET_SHARE_DF: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
1302 case TH_BUCKET_SHARE_UT: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
1303 case TH_BUCKET_SHARE_BG: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
1304 default:
1305 panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_decr()");
1306 }
1307}
1308
1309uint16_t
1310sched_edge_cluster_cumulative_count(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1311{
1312 return os_atomic_load(&root_clutch->scr_cumulative_run_count[bucket], relaxed);
1313}
1314
1315#endif /* CONFIG_SCHED_EDGE */
1316
1317/*
1318 * sched_clutch_bucket_hierarchy_insert()
1319 *
1320 * Routine to insert a newly runnable clutch_bucket into the root hierarchy.
1321 */
1322static void
1323sched_clutch_bucket_hierarchy_insert(
1324 sched_clutch_root_t root_clutch,
1325 sched_clutch_bucket_t clutch_bucket,
1326 sched_bucket_t bucket,
1327 uint64_t timestamp,
1328 sched_clutch_bucket_options_t options)
1329{
1330 sched_clutch_hierarchy_locked_assert(root_clutch);
1331 if (bucket > TH_BUCKET_FIXPRI) {
1332 /* Enqueue the timeshare clutch buckets into the global runnable clutch_bucket list; used for sched tick operations */
1333 enqueue_tail(&root_clutch->scr_clutch_buckets, &clutch_bucket->scb_listlink);
1334 }
1335#if CONFIG_SCHED_EDGE
1336 /* Check if the bucket is a foreign clutch bucket and add it to the foreign buckets list */
1337 uint32_t preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
1338 if (pset_type_for_id(preferred_cluster) != pset_type_for_id(root_clutch->scr_cluster_id)) {
1339 sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
1340 }
1341#endif /* CONFIG_SCHED_EDGE */
1342 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];
1343
1344 /* If this is the first clutch bucket in the root bucket, insert the root bucket into the root priority queue */
1345	if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1346 sched_clutch_root_bucket_runnable(root_bucket, root_clutch, timestamp);
1347 }
1348
1349 /* Insert the clutch bucket into the root bucket run queue with order based on options */
1350	sched_clutch_bucket_runq_enqueue(&root_bucket->scrb_clutch_buckets, clutch_bucket, options);
1351 os_atomic_store(&clutch_bucket->scb_root, root_clutch, relaxed);
1352 os_atomic_inc(&sched_clutch_global_bucket_load[bucket], relaxed);
1353}
1354
1355/*
1356 * sched_clutch_bucket_hierarchy_remove()
1357 *
1358 * Routine to remove an empty clutch bucket from the root hierarchy.
1359 */
1360static void
1361sched_clutch_bucket_hierarchy_remove(
1362 sched_clutch_root_t root_clutch,
1363 sched_clutch_bucket_t clutch_bucket,
1364 sched_bucket_t bucket,
1365 uint64_t timestamp,
1366 __unused sched_clutch_bucket_options_t options)
1367{
1368 sched_clutch_hierarchy_locked_assert(root_clutch);
1369 if (bucket > TH_BUCKET_FIXPRI) {
1370 /* Remove the timeshare clutch bucket from the globally runnable clutch_bucket list */
1371		remqueue(&clutch_bucket->scb_listlink);
1372 }
1373#if CONFIG_SCHED_EDGE
1374 sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
1375#endif /* CONFIG_SCHED_EDGE */
1376
1377 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];
1378
1379 /* Remove the clutch bucket from the root bucket priority queue */
1380	sched_clutch_bucket_runq_remove(&root_bucket->scrb_clutch_buckets, clutch_bucket);
1381 os_atomic_store(&clutch_bucket->scb_root, NULL, relaxed);
1382
1383 /* If the root bucket priority queue is now empty, remove it from the root priority queue */
1384	if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1385 sched_clutch_root_bucket_empty(root_bucket, root_clutch, timestamp);
1386 }
1387 os_atomic_dec(&sched_clutch_global_bucket_load[bucket], relaxed);
1388}
1389
1390/*
1391 * sched_clutch_bucket_base_pri()
1392 *
1393 * Calculates the "base" priority of the clutch bucket, which is equal to the max of the
1394 * highest base_pri and the highest sched_pri in the clutch bucket.
1395 */
1396static uint8_t
1397sched_clutch_bucket_base_pri(
1398 sched_clutch_bucket_t clutch_bucket)
1399{
1400 assert(priority_queue_empty(&clutch_bucket->scb_thread_runq) == false);
1401 /*
1402 * Since the clutch bucket can contain threads that are members of the group due
1403 * to the sched_pri being promoted or due to their base pri, the base priority of
1404 * the entire clutch bucket should be based on the highest thread (promoted or base)
1405 * in the clutch bucket.
1406 */
1407 uint8_t max_pri = 0;
1408 if (!priority_queue_empty(&clutch_bucket->scb_clutchpri_prioq)) {
1409 max_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
1410 }
1411 return max_pri;
1412}
1413
1414/*
1415 * sched_clutch_interactivity_from_cpu_data()
1416 *
1417 * Routine to calculate the interactivity score of a clutch bucket group from its CPU usage
1418 */
1419static uint8_t
1420sched_clutch_interactivity_from_cpu_data(sched_clutch_bucket_group_t clutch_bucket_group)
1421{
1422 sched_clutch_bucket_cpu_data_t scb_cpu_data;
1423 scb_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, relaxed);
1424 clutch_cpu_data_t cpu_used = scb_cpu_data.cpu_data.scbcd_cpu_used;
1425 clutch_cpu_data_t cpu_blocked = scb_cpu_data.cpu_data.scbcd_cpu_blocked;
1426 uint8_t interactive_score = 0;
1427
1428 if ((cpu_blocked == 0) && (cpu_used == 0)) {
1429 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
1430 }
1431 /*
1432 * For all timeshare buckets, calculate the interactivity score of the bucket
1433 * and add it to the base priority
1434 */
1435 if (cpu_blocked > cpu_used) {
1436 /* Interactive clutch_bucket case */
1437 interactive_score = sched_clutch_bucket_group_interactive_pri +
1438 ((sched_clutch_bucket_group_interactive_pri * (cpu_blocked - cpu_used)) / cpu_blocked);
1439 } else {
1440 /* Non-interactive clutch_bucket case */
1441 interactive_score = ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) / cpu_used);
1442 }
1443 return interactive_score;
1444}
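
/*
 * Worked example for the score calculation above (hypothetical numbers, assuming
 * sched_clutch_bucket_group_interactive_pri == 8): a group that was blocked for
 * 300 units and used 100 units of CPU scores 8 + (8 * (300 - 100)) / 300 = 13,
 * whereas a group that used 300 units and was blocked for only 100 scores
 * (8 * 100) / 300 = 2. The score therefore ranges from ~0 for CPU-bound groups up
 * to 2 * sched_clutch_bucket_group_interactive_pri for mostly-blocked groups, and
 * is added on top of the clutch bucket base priority in
 * sched_clutch_bucket_pri_calculate().
 */
#if 0
	/* Interactive case: blocked time dominates CPU usage */
	assert(8 + (8 * (300 - 100)) / 300 == 13);
	/* Non-interactive case: CPU usage dominates blocked time */
	assert((8 * 100) / 300 == 2);
#endif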
1445
1446/*
1447 * sched_clutch_bucket_pri_calculate()
1448 *
1449 * The priority calculation algorithm for the clutch_bucket is a slight
1450 * modification on the ULE interactivity score. It uses the base priority
1451 * of the clutch bucket and applies an interactivity score boost to the
1452 * highly responsive clutch buckets.
1453 */
1454static uint8_t
1455sched_clutch_bucket_pri_calculate(
1456 sched_clutch_bucket_t clutch_bucket,
1457 uint64_t timestamp)
1458{
1459 /* For empty clutch buckets, return priority 0 */
1460 if (clutch_bucket->scb_thr_count == 0) {
1461 return 0;
1462 }
1463
1464 uint8_t base_pri = sched_clutch_bucket_base_pri(clutch_bucket);
1465 uint8_t interactive_score = sched_clutch_bucket_group_interactivity_score_calculate(clutch_bucket->scb_group, timestamp);
1466
1467 assert(((uint64_t)base_pri + interactive_score) <= UINT8_MAX);
1468 uint8_t pri = base_pri + interactive_score;
1469 if (pri != clutch_bucket->scb_priority) {
1470 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_PRI) | DBG_FUNC_NONE,
1471 thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, pri, interactive_score, 0);
1472 }
1473 return pri;
1474}
1475
1476/*
1477 * sched_clutch_root_bucket_highest_clutch_bucket()
1478 *
1479 * Routine to find the highest priority clutch bucket
1480 * within the root bucket.
1481 */
1482static sched_clutch_bucket_t
1483sched_clutch_root_bucket_highest_clutch_bucket(
1484 sched_clutch_root_bucket_t root_bucket)
1485{
1486	if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1487 return NULL;
1488 }
1489	return sched_clutch_bucket_runq_peek(&root_bucket->scrb_clutch_buckets);
1490}
1491
1492/*
1493 * sched_clutch_bucket_runnable()
1494 *
1495 * Perform all operations needed when a new clutch bucket becomes runnable.
1496 * It involves inserting the clutch_bucket into the hierarchy and updating the
1497 * root priority appropriately.
1498 */
1499static boolean_t
1500sched_clutch_bucket_runnable(
1501 sched_clutch_bucket_t clutch_bucket,
1502 sched_clutch_root_t root_clutch,
1503 uint64_t timestamp,
1504 sched_clutch_bucket_options_t options)
1505{
1506 sched_clutch_hierarchy_locked_assert(root_clutch);
1507 /* Since the clutch bucket became newly runnable, update its pending timestamp */
1508 clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1509	sched_clutch_bucket_hierarchy_insert(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
1510
1511 /* Update the timesharing properties of this clutch_bucket; also done every sched_tick */
1512 sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, timestamp);
1513 int16_t root_old_pri = root_clutch->scr_priority;
1514 sched_clutch_root_pri_update(root_clutch);
1515 return root_clutch->scr_priority > root_old_pri;
1516}
1517
1518/*
1519 * sched_clutch_bucket_update()
1520 *
1521 * Update the clutch_bucket's position in the hierarchy. This routine is
1522 * called when a new thread is inserted or removed from a runnable clutch
1523 * bucket. The options specify some properties about the clutch bucket
1524 * insertion order into the clutch bucket runq.
1525 */
1526static boolean_t
1527sched_clutch_bucket_update(
1528 sched_clutch_bucket_t clutch_bucket,
1529 sched_clutch_root_t root_clutch,
1530 uint64_t timestamp,
1531 sched_clutch_bucket_options_t options)
1532{
1533 sched_clutch_hierarchy_locked_assert(root_clutch);
1534 uint64_t new_pri = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1535 sched_clutch_bucket_runq_t bucket_runq = &root_clutch->scr_unbound_buckets[clutch_bucket->scb_bucket].scrb_clutch_buckets;
1536 if (new_pri == clutch_bucket->scb_priority) {
1537 /*
1538 * If SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR is specified, move the clutch bucket
1539 * to the end of the runq. Typically used when a thread is selected for execution
1540 * from a clutch bucket.
1541 */
1542 if (options & SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR) {
1543			sched_clutch_bucket_runq_rotate(bucket_runq, clutch_bucket);
1544 }
1545 return false;
1546 }
1547	sched_clutch_bucket_runq_remove(bucket_runq, clutch_bucket);
1548#if CONFIG_SCHED_EDGE
1549 if (clutch_bucket->scb_foreign) {
1550 priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1551 }
1552#endif /* CONFIG_SCHED_EDGE */
1553 clutch_bucket->scb_priority = new_pri;
1554#if CONFIG_SCHED_EDGE
1555 if (clutch_bucket->scb_foreign) {
1556 priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
1557 priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1558 }
1559#endif /* CONFIG_SCHED_EDGE */
1560	sched_clutch_bucket_runq_enqueue(bucket_runq, clutch_bucket, options);
1561
1562 int16_t root_old_pri = root_clutch->scr_priority;
1563 sched_clutch_root_pri_update(root_clutch);
1564 return root_clutch->scr_priority > root_old_pri;
1565}
1566
1567/*
1568 * sched_clutch_bucket_empty()
1569 *
1570 * Perform all the operations needed when a clutch_bucket is no longer runnable.
1571 * It involves removing the clutch bucket from the hierarchy and updating the root
1572 * priority appropriately.
1573 */
1574static void
1575sched_clutch_bucket_empty(
1576 sched_clutch_bucket_t clutch_bucket,
1577 sched_clutch_root_t root_clutch,
1578 uint64_t timestamp,
1579 sched_clutch_bucket_options_t options)
1580{
1581 sched_clutch_hierarchy_locked_assert(root_clutch);
1582	sched_clutch_bucket_hierarchy_remove(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
1583 clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1584 sched_clutch_root_pri_update(root_clutch);
1585}
1586
1587/*
1588 * sched_clutch_cpu_usage_update()
1589 *
1590 * Routine to update CPU usage of the thread in the hierarchy.
1591 */
1592void
1593sched_clutch_cpu_usage_update(
1594 thread_t thread,
1595 uint64_t delta)
1596{
1597 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread) || SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1598 return;
1599 }
1600
1601 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1602 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
1603 sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, delta);
1604}
1605
1606/*
1607 * sched_clutch_bucket_group_cpu_usage_update()
1608 *
1609 * Routine to update the CPU usage of the clutch_bucket.
1610 */
1611static void
1612sched_clutch_bucket_group_cpu_usage_update(
1613 sched_clutch_bucket_group_t clutch_bucket_group,
1614 uint64_t delta)
1615{
1616 if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
1617		/* The Above UI bucket always has the maximum interactivity score, so there is nothing to do here */
1618 return;
1619 }
1620 delta = MIN(delta, sched_clutch_bucket_group_adjust_threshold);
1621 os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_used), (clutch_cpu_data_t)delta, relaxed);
1622}
1623
1624/*
1625 * sched_clutch_bucket_group_cpu_pending_adjust()
1626 *
1627 * Routine to calculate the adjusted CPU usage value based on the pending intervals. The calculation is done
1628 * such that one "pending interval" provides one point improvement in interactivity score.
1629 */
1630static inline uint64_t
1631sched_clutch_bucket_group_cpu_pending_adjust(
1632 uint64_t cpu_used,
1633 uint64_t cpu_blocked,
1634 uint8_t pending_intervals)
1635{
1636 uint64_t cpu_used_adjusted = 0;
1637 if (cpu_blocked < cpu_used) {
1638 cpu_used_adjusted = (sched_clutch_bucket_group_interactive_pri * cpu_blocked * cpu_used);
1639 cpu_used_adjusted = cpu_used_adjusted / ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) + (cpu_used * pending_intervals));
1640 } else {
1641 uint64_t adjust_factor = (cpu_blocked * pending_intervals) / sched_clutch_bucket_group_interactive_pri;
1642 cpu_used_adjusted = (adjust_factor > cpu_used) ? 0 : (cpu_used - adjust_factor);
1643 }
1644 return cpu_used_adjusted;
1645}
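
/*
 * Worked example (hypothetical numbers, assuming
 * sched_clutch_bucket_group_interactive_pri == 8): a CPU-bound group with
 * cpu_used = 400 and cpu_blocked = 100 that has been pending for 2 intervals gets
 * its usage adjusted to (8 * 100 * 400) / ((8 * 100) + (400 * 2)) = 200, which
 * moves its interactivity score from (8 * 100) / 400 = 2 to (8 * 100) / 200 = 4,
 * i.e. roughly one point of improvement per pending interval.
 */
#if 0
	assert(sched_clutch_bucket_group_cpu_pending_adjust(400, 100, 2) == 200);
#endif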
1646
1647/*
1648 * sched_clutch_bucket_group_cpu_adjust()
1649 *
1650 * Routine to scale the cpu usage and blocked time once the sum gets bigger
1651 * than sched_clutch_bucket_group_adjust_threshold. Allows the values to remain
1652 * manageable and maintain the same ratio while allowing clutch buckets to
1653 * adjust behavior and reflect in the interactivity score in a reasonable
1654 * amount of time. Also adjusts the CPU usage based on pending_intervals
1655 * which allows ageout of CPU to avoid starvation in highly contended scenarios.
1656 */
1657static void
1658sched_clutch_bucket_group_cpu_adjust(
1659 sched_clutch_bucket_group_t clutch_bucket_group,
1660 uint8_t pending_intervals)
1661{
1662 sched_clutch_bucket_cpu_data_t old_cpu_data = {};
1663 sched_clutch_bucket_cpu_data_t new_cpu_data = {};
1664 os_atomic_rmw_loop(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, old_cpu_data.scbcd_cpu_data_packed, new_cpu_data.scbcd_cpu_data_packed, relaxed, {
1665 clutch_cpu_data_t cpu_used = old_cpu_data.cpu_data.scbcd_cpu_used;
1666 clutch_cpu_data_t cpu_blocked = old_cpu_data.cpu_data.scbcd_cpu_blocked;
1667
1668 if ((pending_intervals == 0) && (cpu_used + cpu_blocked) < sched_clutch_bucket_group_adjust_threshold) {
1669 /* No changes to the CPU used and blocked values */
1670 os_atomic_rmw_loop_give_up();
1671 }
1672 if ((cpu_used + cpu_blocked) >= sched_clutch_bucket_group_adjust_threshold) {
1673 /* Only keep the recent CPU history to better indicate how this TG has been behaving */
1674 cpu_used = cpu_used / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
1675 cpu_blocked = cpu_blocked / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
1676 }
1677 /* Use the shift passed in to ageout the CPU usage */
1678 cpu_used = (clutch_cpu_data_t)sched_clutch_bucket_group_cpu_pending_adjust(cpu_used, cpu_blocked, pending_intervals);
1679 new_cpu_data.cpu_data.scbcd_cpu_used = cpu_used;
1680 new_cpu_data.cpu_data.scbcd_cpu_blocked = cpu_blocked;
1681 });
1682}
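
/*
 * Note on the scaling step above: dividing cpu_used and cpu_blocked by the same
 * SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO preserves their ratio, so the
 * interactivity score derived from them is (modulo integer truncation) unchanged;
 * only the window of history they represent shrinks. For example, with a
 * hypothetical interactive pri of 8, both (8 * 2000) / 6000 and (8 * 1000) / 3000
 * evaluate to 2.
 */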
1683
1684/*
1685 * Thread level scheduling algorithm
1686 *
1687 * The thread level scheduling algorithm uses the mach timeshare
1688 * decay based algorithm to achieve sharing between threads within the
1689 * same clutch bucket. The load/priority shifts etc. are all maintained
1690 * at the clutch bucket level and used for decay calculation of the
1691 * threads. The load sampling is still driven off the scheduler tick
1692 * for runnable clutch buckets (it does not use the new higher frequency
1693 * EWMA based load calculation). The idea is that the contention and load
1694 * within clutch_buckets should be limited enough to not see heavy decay
1695 * and timeshare effectively.
1696 */
1697
1698/*
1699 * sched_clutch_thread_run_bucket_incr() / sched_clutch_run_bucket_incr()
1700 *
1701 * Increment the run count for the clutch bucket associated with the
1702 * thread.
1703 */
1704uint32_t
1705sched_clutch_thread_run_bucket_incr(
1706 thread_t thread,
1707 sched_bucket_t bucket)
1708{
1709 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1710 return 0;
1711 }
1712 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1713 return sched_clutch_run_bucket_incr(clutch, bucket);
1714}
1715
1716static uint32_t
1717sched_clutch_run_bucket_incr(
1718 sched_clutch_t clutch,
1719 sched_bucket_t bucket)
1720{
1721 assert(bucket != TH_BUCKET_RUN);
1722 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
1723 return sched_clutch_bucket_group_run_count_inc(clutch_bucket_group);
1724}
1725
1726/*
1727 * sched_clutch_thread_run_bucket_decr() / sched_clutch_run_bucket_decr()
1728 *
1729 * Decrement the run count for the clutch bucket associated with the
1730 * thread.
1731 */
1732uint32_t
1733sched_clutch_thread_run_bucket_decr(
1734 thread_t thread,
1735 sched_bucket_t bucket)
1736{
1737 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1738 return 0;
1739 }
1740 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1741 return sched_clutch_run_bucket_decr(clutch, bucket);
1742}
1743
1744static uint32_t
1745sched_clutch_run_bucket_decr(
1746 sched_clutch_t clutch,
1747 sched_bucket_t bucket)
1748{
1749 assert(bucket != TH_BUCKET_RUN);
1750 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
1751 return sched_clutch_bucket_group_run_count_dec(clutch_bucket_group);
1752}
1753
1754/*
1755 * sched_clutch_bucket_group_timeshare_update()
1756 *
1757 * Routine to update the load and priority shift for the clutch_bucket_group
1758 * every sched_tick. For multi-cluster platforms, each QoS level will have multiple
1759 * clutch buckets with runnable threads in them. So it is important to maintain
1760 * the timesharing information at the clutch_bucket_group level instead of
1761 * individual clutch buckets (because the algorithm is trying to timeshare all
1762 * threads at the same QoS irrespective of which hierarchy they are enqueued in).
1763 *
1764 * The routine is called from the sched tick handling code to make sure this value
1765 * is updated at least once every sched tick. For clutch bucket groups which have
1766 * not been runnable for very long, the clutch_bucket_group maintains a "last
1767 * updated schedtick" parameter. As threads become runnable in the clutch bucket group,
1768 * if this value is outdated, the load and shifts are updated.
1769 *
1770 * Possible optimization:
1771 * - The current algorithm samples the load every sched tick (125ms).
1772 * This is prone to spikes in runnable counts; if that turns out to be
1773 * a problem, a simple solution would be to do the EWMA trick to sample
1774 * load at every load_tick (30ms) and use the averaged value for the pri
1775 * shift calculation.
1776 */
1777static void
1778sched_clutch_bucket_group_timeshare_update(
1779 sched_clutch_bucket_group_t clutch_bucket_group,
1780 sched_clutch_bucket_t clutch_bucket,
1781 uint64_t ctime)
1782{
1783 if (clutch_bucket_group->scbg_bucket < TH_BUCKET_SHARE_FG) {
1784 /* No timesharing needed for fixed priority Above UI threads */
1785 return;
1786 }
1787
1788 /*
1789 * Update the timeshare parameters for the clutch bucket group
1790	 * if they haven't been updated in this tick.
1791 */
1792 uint32_t sched_ts = os_atomic_load(&clutch_bucket_group->scbg_timeshare_tick, relaxed);
1793 uint32_t current_sched_ts = sched_tick;
1794 if (sched_ts < current_sched_ts) {
1795 os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, current_sched_ts, relaxed);
1796 /* NCPU wide workloads should not experience decay */
1797 uint64_t bucket_group_run_count = os_atomic_load_wide(&clutch_bucket_group->scbg_blocked_data.scct_count, relaxed) - 1;
1798 uint32_t bucket_group_load = (uint32_t)(bucket_group_run_count / processor_avail_count);
1799 bucket_group_load = MIN(bucket_group_load, NRQS - 1);
1800 uint32_t pri_shift = sched_fixed_shift - sched_load_shifts[bucket_group_load];
1801 /* Ensure that the pri_shift value is reasonable */
1802 pri_shift = (pri_shift > SCHED_PRI_SHIFT_MAX) ? INT8_MAX : pri_shift;
1803 os_atomic_store(&clutch_bucket_group->scbg_pri_shift, pri_shift, relaxed);
1804 }
1805
1806 /*
1807 * Update the clutch bucket priority; this allows clutch buckets that have been pending
1808 * for a long time to get an updated interactivity score.
1809 */
1810	sched_clutch_bucket_update(clutch_bucket, clutch_bucket->scb_root, ctime, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
1811}
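
/*
 * Worked example for the load/pri_shift update above (hypothetical numbers): with
 * 9 runnable/running threads in the clutch bucket group on a system where
 * processor_avail_count == 8, bucket_group_load = (9 - 1) / 8 = 1. Higher loads
 * index further into sched_load_shifts[], which yields a smaller pri_shift and
 * therefore a larger timeshare decay (sched_usage >> pri_shift) for threads in
 * this group; a load of 0 (at most NCPU-wide) typically ends up clamped to
 * INT8_MAX, i.e. no decay, matching the intent of the comment above.
 */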
1812
1813/*
1814 * sched_clutch_thread_clutch_update()
1815 *
1816 * Routine called when the thread changes its thread group. The current
1817 * implementation relies on the fact that the thread group is changed only from
1818 * the context of the thread itself or when the thread is runnable but not in a
1819 * runqueue. Due to this fact, the thread group change causes only counter
1820 * updates in the old & new clutch buckets and no hierarchy changes. The routine
1821 * also attributes the CPU used so far to the old clutch.
1822 */
1823void
1824sched_clutch_thread_clutch_update(
1825 thread_t thread,
1826 sched_clutch_t old_clutch,
1827 sched_clutch_t new_clutch)
1828{
1829 uint32_t cpu_delta;
1830
1831 if (old_clutch) {
1832 assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
1833
1834		sched_clutch_run_bucket_decr(old_clutch, thread->th_sched_bucket);
1835 /*
1836 * Calculate the CPU used by this thread in the old bucket and
1837 * add it to the old clutch bucket. This uses the same CPU usage
1838 * logic as update_priority etc.
1839 */
1840 sched_tick_delta(thread, cpu_delta);
1841 if (thread->pri_shift < INT8_MAX) {
1842 thread->sched_usage += cpu_delta;
1843 }
1844 thread->cpu_delta += cpu_delta;
1845 if (!SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1846 sched_clutch_bucket_group_t clutch_bucket_group = &(old_clutch->sc_clutch_groups[thread->th_sched_bucket]);
1847			sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, cpu_delta);
1848 }
1849 }
1850
1851 if (new_clutch) {
1852		sched_clutch_run_bucket_incr(new_clutch, thread->th_sched_bucket);
1853 }
1854}
1855
1856/* Thread Insertion/Removal/Selection routines */
1857
1858#if CONFIG_SCHED_EDGE
1859
1860/*
1861 * Edge Scheduler Bound Thread Support
1862 *
1863 * The edge scheduler allows threads to be bound to specific clusters. The scheduler
1864 * maintains a separate runq on the clutch root to hold these bound threads. These
1865 * bound threads count towards the root priority and thread count, but are ignored
1866 * for thread migration/steal decisions. Bound threads that are enqueued in the
1867 * separate runq have the th_bound_cluster_enqueued flag set to allow easy
1868 * removal.
1869 *
1870 * Bound Threads Timesharing
1871 * The bound threads share the timesharing properties of the clutch bucket group they are
1872 * part of. They contribute to the load and use priority shifts/decay values from the
1873 * clutch bucket group.
1874 */
1875
1876static boolean_t
1877sched_edge_bound_thread_insert(
1878 sched_clutch_root_t root_clutch,
1879 thread_t thread,
1880 integer_t options)
1881{
1882 /* Update the clutch runnable count and priority */
1883 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
1884 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
1885 if (root_bucket->scrb_bound_thread_runq.count == 0) {
1886 sched_clutch_root_bucket_runnable(root_bucket, root_clutch, mach_absolute_time());
1887 }
1888
1889 assert((thread->th_bound_cluster_enqueued) == false);
1890 run_queue_enqueue(&root_bucket->scrb_bound_thread_runq, thread, options);
1891 thread->th_bound_cluster_enqueued = true;
1892
1893 /* Increment the urgency counter for the root if necessary */
1894 sched_clutch_root_urgency_inc(root_clutch, thread);
1895
1896 int16_t root_old_pri = root_clutch->scr_priority;
1897 sched_clutch_root_pri_update(root_clutch);
1898 return root_clutch->scr_priority > root_old_pri;
1899}
1900
1901static void
1902sched_edge_bound_thread_remove(
1903 sched_clutch_root_t root_clutch,
1904 thread_t thread)
1905{
1906 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
1907 assert((thread->th_bound_cluster_enqueued) == true);
1908 run_queue_remove(&root_bucket->scrb_bound_thread_runq, thread);
1909 thread->th_bound_cluster_enqueued = false;
1910
1911 /* Decrement the urgency counter for the root if necessary */
1912 sched_clutch_root_urgency_dec(root_clutch, thread);
1913
1914 /* Update the clutch runnable count and priority */
1915 sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
1916 if (root_bucket->scrb_bound_thread_runq.count == 0) {
1917 sched_clutch_root_bucket_empty(root_bucket, root_clutch, mach_absolute_time());
1918 }
1919 sched_clutch_root_pri_update(root_clutch);
1920}
1921
1922/*
1923 * Edge Scheduler cluster shared resource threads load balancing
1924 *
1925 * The Edge scheduler attempts to load balance cluster shared resource intensive threads
1926 * across clusters in order to reduce contention on the shared resources. It achieves
1927 * that by maintaining the runnable and running shared resource load on each cluster
1928 * and balancing the load across multiple clusters.
1929 *
1930 * The current implementation for cluster shared resource load balancing looks at
1931 * the per-cluster load at thread runnable time to enqueue the thread in the appropriate
1932 * cluster. The thread is enqueued in the cluster bound runqueue to ensure idle CPUs
1933 * do not steal/rebalance shared resource threads. Some more details for the implementation:
1934 *
1935 * - When threads are tagged as shared resource, they go through the cluster selection logic
1936 * which looks at cluster shared resource loads and picks a cluster accordingly. The thread is
1937 * enqueued in the cluster bound runqueue.
1938 *
1939 * - When the threads start running and call avoid_processor, the load balancing logic will be
1940 * invoked and cause the thread to be sent to a more preferred cluster if one exists and has
1941 * no shared resource load.
1942 *
1943 * - If a CPU in a preferred cluster is going idle and that cluster has no more shared load,
1944 * it will look at running shared resource threads on foreign clusters and actively rebalance them.
1945 *
1946 * - Runnable shared resource threads are not stolen by the preferred cluster CPUs as they
1947 * go idle intentionally.
1948 *
1949 * - One caveat of this design is that if a preferred CPU has already run and finished its shared
1950 * resource thread execution, it will not go out and steal the runnable thread in the non-preferred cluster.
1951 * The rebalancing will happen when the thread actually runs on a non-preferred cluster and one of the
1952 * events listed above happen.
1953 *
1954 * - Also, it currently does not consider other properties such as thread priorities and
1955 * QoS level thread load in the thread placement decision.
1956 *
1957 * Edge Scheduler cluster shared resource thread scheduling policy
1958 *
1959 * The threads for shared resources can be scheduled using one of the two policies:
1960 *
1961 * EDGE_SHARED_RSRC_SCHED_POLICY_RR
1962 * This policy distributes the threads so that they spread across all available clusters
1963 * irrespective of type. The idea is that this scheduling policy will put a shared resource
1964 * thread on each cluster on the platform before it starts doubling up on clusters.
1965 *
1966 * EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST
1967 * This policy distributes threads so that the threads first fill up all the capacity on
1968 * the preferred cluster and its homogeneous peers before spilling to different core type.
1969 * The current implementation defines capacity based on the number of CPUs in the cluster;
1970 * so a cluster's shared resource is considered full if there are "n" runnable + running
1971 * shared resource threads on the cluster with n cpus. This policy is different from the
1972 * default scheduling policy of the edge scheduler since this always tries to fill up the
1973 * native clusters to capacity even when non-native clusters might be idle.
1974 */
1975__options_decl(edge_shared_rsrc_sched_policy_t, uint32_t, {
1976 EDGE_SHARED_RSRC_SCHED_POLICY_RR = 0,
1977 EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST = 1,
1978});
1979
1980const edge_shared_rsrc_sched_policy_t edge_shared_rsrc_policy[CLUSTER_SHARED_RSRC_TYPE_COUNT] = {
1981 [CLUSTER_SHARED_RSRC_TYPE_RR] = EDGE_SHARED_RSRC_SCHED_POLICY_RR,
1982 [CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST] = EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST,
1983};
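
/*
 * Illustrative sketch (not compiled, hypothetical helper): the capacity notion used
 * by EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST described above amounts to
 * comparing a cluster's runnable + running shared resource load against its CPU
 * count; a cluster at or above that load is considered full for spill purposes.
 */
#if 0
static bool
sched_edge_shared_rsrc_cluster_full_example(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type)
{
	/* Treat the cluster as full once there is one shared resource thread per CPU */
	return sched_pset_cluster_shared_rsrc_load(pset, shared_rsrc_type) >= (uint64_t)pset->cpu_set_count;
}
#endif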
1984
1985static void
1986sched_edge_shared_rsrc_runnable_load_incr(sched_clutch_root_t root_clutch, thread_t thread)
1987{
1988 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
1989 root_clutch->scr_shared_rsrc_load_runnable[CLUSTER_SHARED_RSRC_TYPE_RR]++;
1990 thread->th_shared_rsrc_enqueued[CLUSTER_SHARED_RSRC_TYPE_RR] = true;
1991 }
1992 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
1993 root_clutch->scr_shared_rsrc_load_runnable[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST]++;
1994 thread->th_shared_rsrc_enqueued[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST] = true;
1995 }
1996}
1997
1998static void
1999sched_edge_shared_rsrc_runnable_load_decr(sched_clutch_root_t root_clutch, thread_t thread)
2000{
2001 for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
2002 if (thread->th_shared_rsrc_enqueued[shared_rsrc_type]) {
2003 thread->th_shared_rsrc_enqueued[shared_rsrc_type] = false;
2004 root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type]--;
2005 }
2006 }
2007}
2008
2009uint16_t
2010sched_edge_shared_rsrc_runnable_load(sched_clutch_root_t root_clutch, cluster_shared_rsrc_type_t shared_rsrc_type)
2011{
2012 return root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type];
2013}
2014
2015/*
2016 * sched_edge_shared_rsrc_idle()
2017 *
2018 * Routine used to determine if the shared resource for the pset is idle. This is
2019 * used by a CPU going idle to decide if it should rebalance a running shared resource
2020 * thread from a non-preferred cluster.
2021 */
2022static boolean_t
2023sched_edge_shared_rsrc_idle(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type)
2024{
2025 return sched_pset_cluster_shared_rsrc_load(pset, shared_rsrc_type) == 0;
2026}
2027
2028/*
2029 * sched_edge_thread_shared_rsrc_type
2030 *
2031 * This routine decides if a given thread needs special handling for being a
2032 * heavy shared resource user. It is valid for the same thread to be using
2033 * several shared resources at the same time and have multiple policy flags set.
2034 * This routine determines which of those properties will be used for load
2035 * balancing and migration decisions.
2036 */
2037static cluster_shared_rsrc_type_t
2038sched_edge_thread_shared_rsrc_type(thread_t thread)
2039{
2040 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
2041 return CLUSTER_SHARED_RSRC_TYPE_RR;
2042 }
2043 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
2044 return CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST;
2045 }
2046 return CLUSTER_SHARED_RSRC_TYPE_NONE;
2047}
2048
2049#endif /* CONFIG_SCHED_EDGE */
2050
2051/*
2052 * sched_clutch_thread_bound_lookup()
2053 *
2054 * Routine to lookup the highest priority runnable thread in a bounded root bucket.
2055 */
2056static thread_t
2057sched_clutch_thread_bound_lookup(
2058 __unused sched_clutch_root_t root_clutch,
2059 sched_clutch_root_bucket_t root_bucket)
2060{
2061	return run_queue_peek(&root_bucket->scrb_bound_thread_runq);
2062}
2063
2064/*
2065 * Clutch Bucket Group Thread Counts and Pending time calculation
2066 *
2067 * The pending time on the clutch_bucket_group allows the scheduler to track if it
2068 * needs to ageout the CPU usage because the clutch_bucket_group has been pending for
2069 * a very long time. The pending time is set to the timestamp as soon as a thread becomes
2070 * runnable. When a thread is picked up for execution from this clutch_bucket_group, the
2071 * pending time is advanced to the time of thread selection.
2072 *
2073 * Since threads for a clutch bucket group can be added or removed from multiple CPUs
2074 * simultaneously, it is important that the updates to thread counts and pending timestamps
2075 * happen atomically. The implementation relies on the following aspects to make that work
2076 * as expected:
2077 * - The clutch scheduler would be deployed on single cluster platforms where the pset lock
2078 * is held when threads are added/removed and pending timestamps are updated
2079 * - The thread count and pending timestamp can be updated atomically using double wide
2080 * 128 bit atomics
2081 *
2082 * Clutch bucket group interactivity timestamp and score updates also rely on the properties
2083 * above to atomically update the interactivity score for a clutch bucket group.
2084 */
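
/*
 * Minimal user-space analogue of the double-wide update described above
 * (illustrative sketch only, not kernel code): a 128-bit compare-exchange keeps
 * the thread count and the pending timestamp consistent with each other without
 * taking a lock, which is the same pattern the os_atomic_rmw_loop() usage below
 * relies on.
 */
#if 0
#include <stdatomic.h>
#include <stdint.h>

struct counter_time {
	uint64_t count;
	uint64_t timestamp;
};

static void
counter_time_inc(_Atomic struct counter_time *ct, uint64_t now)
{
	struct counter_time oldval = atomic_load_explicit(ct, memory_order_relaxed);
	struct counter_time newval;
	do {
		newval = oldval;
		newval.count = oldval.count + 1;
		if (oldval.count == 0) {
			/* First thread became runnable; start the pending clock */
			newval.timestamp = now;
		}
	} while (!atomic_compare_exchange_weak_explicit(ct, &oldval, newval,
	    memory_order_relaxed, memory_order_relaxed));
}
#endif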
2085
2086#if CONFIG_SCHED_EDGE
2087
2088static void
2089sched_clutch_bucket_group_thr_count_inc(
2090 sched_clutch_bucket_group_t clutch_bucket_group,
2091 uint64_t timestamp)
2092{
2093 sched_clutch_counter_time_t old_pending_data;
2094 sched_clutch_counter_time_t new_pending_data;
2095 os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2096 new_pending_data.scct_count = old_pending_data.scct_count + 1;
2097 new_pending_data.scct_timestamp = old_pending_data.scct_timestamp;
2098 if (old_pending_data.scct_count == 0) {
2099 new_pending_data.scct_timestamp = timestamp;
2100 }
2101 });
2102}
2103
2104static void
2105sched_clutch_bucket_group_thr_count_dec(
2106 sched_clutch_bucket_group_t clutch_bucket_group,
2107 uint64_t timestamp)
2108{
2109 sched_clutch_counter_time_t old_pending_data;
2110 sched_clutch_counter_time_t new_pending_data;
2111 os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2112 new_pending_data.scct_count = old_pending_data.scct_count - 1;
2113 if (new_pending_data.scct_count == 0) {
2114 new_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
2115 } else {
2116 new_pending_data.scct_timestamp = timestamp;
2117 }
2118 });
2119}
2120
2121static uint8_t
2122sched_clutch_bucket_group_pending_ageout(
2123 sched_clutch_bucket_group_t clutch_bucket_group,
2124 uint64_t timestamp)
2125{
2126 int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket);
2127 sched_clutch_counter_time_t old_pending_data;
2128 sched_clutch_counter_time_t new_pending_data;
2129 uint8_t cpu_usage_shift = 0;
2130
2131 os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2132 cpu_usage_shift = 0;
2133 uint64_t old_pending_ts = old_pending_data.scct_timestamp;
2134 bool old_update = (old_pending_ts >= timestamp);
2135 bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID);
2136 bool no_bucket_load = (bucket_load == 0);
2137 if (old_update || no_pending_time || no_bucket_load) {
2138 os_atomic_rmw_loop_give_up();
2139 }
2140
2141 /* Calculate the time the clutch bucket group has been pending */
2142 uint64_t pending_delta = timestamp - old_pending_ts;
2143 uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] * bucket_load;
2144 if (pending_delta < interactivity_delta) {
2145 os_atomic_rmw_loop_give_up();
2146 }
2147 cpu_usage_shift = (pending_delta / interactivity_delta);
2148 new_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta);
2149 new_pending_data.scct_count = old_pending_data.scct_count;
2150 });
2151 return cpu_usage_shift;
2152}
2153
2154static uint8_t
2155sched_clutch_bucket_group_interactivity_score_calculate(
2156 sched_clutch_bucket_group_t clutch_bucket_group,
2157 uint64_t timestamp)
2158{
2159 if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
2160 /*
2161 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
2162 * priorities, make sure all AboveUI buckets are marked interactive.
2163 */
2164 assert(clutch_bucket_group->scbg_interactivity_data.scct_count == (2 * sched_clutch_bucket_group_interactive_pri));
2165 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2166 }
2167 /* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */
2168 uint8_t pending_intervals = sched_clutch_bucket_group_pending_ageout(clutch_bucket_group, timestamp);
2169 /* Adjust CPU stats based on the calculated shift and to make sure only recent behavior is used */
2170 sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group, pending_intervals);
2171 uint8_t interactivity_score = sched_clutch_interactivity_from_cpu_data(clutch_bucket_group);
2172 sched_clutch_counter_time_t old_interactivity_data;
2173 sched_clutch_counter_time_t new_interactivity_data;
2174
2175 bool score_updated = os_atomic_rmw_loop(&clutch_bucket_group->scbg_interactivity_data.scct_packed, old_interactivity_data.scct_packed, new_interactivity_data.scct_packed, relaxed, {
2176 if (old_interactivity_data.scct_timestamp >= timestamp) {
2177 os_atomic_rmw_loop_give_up();
2178 }
2179 new_interactivity_data.scct_timestamp = timestamp;
2180 if (old_interactivity_data.scct_timestamp != 0) {
2181 new_interactivity_data.scct_count = interactivity_score;
2182 }
2183 });
2184 if (score_updated) {
2185 return (uint8_t)new_interactivity_data.scct_count;
2186 } else {
2187 return (uint8_t)old_interactivity_data.scct_count;
2188 }
2189}
2190
2191#else /* CONFIG_SCHED_EDGE */
2192
2193/*
2194 * For the clutch scheduler, atomicity is ensured by making sure all operations
2195 * are happening under the pset lock of the only cluster present on the platform.
2196 */
2197static void
2198sched_clutch_bucket_group_thr_count_inc(
2199 sched_clutch_bucket_group_t clutch_bucket_group,
2200 uint64_t timestamp)
2201{
2202	sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2203 if (clutch_bucket_group->scbg_pending_data.scct_count == 0) {
2204 clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp;
2205 }
2206 clutch_bucket_group->scbg_pending_data.scct_count++;
2207}
2208
2209static void
2210sched_clutch_bucket_group_thr_count_dec(
2211 sched_clutch_bucket_group_t clutch_bucket_group,
2212 uint64_t timestamp)
2213{
2214	sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2215 clutch_bucket_group->scbg_pending_data.scct_count--;
2216 if (clutch_bucket_group->scbg_pending_data.scct_count == 0) {
2217 clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
2218 } else {
2219 clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp;
2220 }
2221}
2222
2223static uint8_t
2224sched_clutch_bucket_group_pending_ageout(
2225 sched_clutch_bucket_group_t clutch_bucket_group,
2226 uint64_t timestamp)
2227{
2228	sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2229	int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket);
2230 uint64_t old_pending_ts = clutch_bucket_group->scbg_pending_data.scct_timestamp;
2231 bool old_update = (old_pending_ts >= timestamp);
2232 bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID);
2233 bool no_bucket_load = (bucket_load == 0);
2234 if (old_update || no_pending_time || no_bucket_load) {
2235 return 0;
2236 }
2237 uint64_t pending_delta = timestamp - old_pending_ts;
2238 uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] * bucket_load;
2239 if (pending_delta < interactivity_delta) {
2240 return 0;
2241 }
2242 uint8_t cpu_usage_shift = (pending_delta / interactivity_delta);
2243 clutch_bucket_group->scbg_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta);
2244 return cpu_usage_shift;
2245}
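
/*
 * Worked example for the ageout above (hypothetical units): if the per-bucket
 * pending delta is 10 time units and the global bucket load is 4, then
 * interactivity_delta = 40. A clutch bucket group that has been pending for 100
 * units yields cpu_usage_shift = 100 / 40 = 2, and its pending timestamp is
 * advanced by 2 * 40 = 80 units so that the remaining 20 units continue to accrue
 * towards the next interval.
 */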
2246
2247static uint8_t
2248sched_clutch_bucket_group_interactivity_score_calculate(
2249 sched_clutch_bucket_group_t clutch_bucket_group,
2250 uint64_t timestamp)
2251{
2252	sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2253 if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
2254 /*
2255 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
2256 * priorities, make sure all AboveUI buckets are marked interactive.
2257 */
2258 assert(clutch_bucket_group->scbg_interactivity_data.scct_count == (2 * sched_clutch_bucket_group_interactive_pri));
2259 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2260 }
2261 /* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */
2262 uint8_t pending_intervals = sched_clutch_bucket_group_pending_ageout(clutch_bucket_group, timestamp);
2263 /* Adjust CPU stats based on the calculated shift and to make sure only recent behavior is used */
2264 sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group, pending_intervals);
2265 uint8_t interactivity_score = sched_clutch_interactivity_from_cpu_data(clutch_bucket_group);
2266 if (timestamp > clutch_bucket_group->scbg_interactivity_data.scct_timestamp) {
2267 clutch_bucket_group->scbg_interactivity_data.scct_count = interactivity_score;
2268 clutch_bucket_group->scbg_interactivity_data.scct_timestamp = timestamp;
2269 return interactivity_score;
2270 } else {
2271 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2272 }
2273}
2274
2275#endif /* CONFIG_SCHED_EDGE */
2276
2277/*
2278 * Clutch Bucket Group Run Count and Blocked Time Accounting
2279 *
2280 * The clutch bucket group maintains the number of runnable/running threads in the group.
2281 * Since the blocked time of the clutch bucket group is based on this count, it is
2282 * important to make sure the blocking timestamp and the run count are updated atomically.
2283 *
2284 * Since the run count increments happen without any pset locks held, the scheduler updates
2285 * the count & timestamp using double wide 128 bit atomics.
2286 */
2287
2288static uint32_t
2289sched_clutch_bucket_group_run_count_inc(
2290 sched_clutch_bucket_group_t clutch_bucket_group)
2291{
2292 sched_clutch_counter_time_t old_blocked_data;
2293 sched_clutch_counter_time_t new_blocked_data;
2294
2295 bool update_blocked_time = false;
2296 os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, {
2297 new_blocked_data.scct_count = old_blocked_data.scct_count + 1;
2298 new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp;
2299 update_blocked_time = false;
2300 if (old_blocked_data.scct_count == 0) {
2301 new_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
2302 update_blocked_time = true;
2303 }
2304 });
2305 if (update_blocked_time && (old_blocked_data.scct_timestamp != SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID)) {
2306 uint64_t ctime = mach_absolute_time();
2307 if (ctime > old_blocked_data.scct_timestamp) {
2308 uint64_t blocked_time = ctime - old_blocked_data.scct_timestamp;
2309 blocked_time = MIN(blocked_time, sched_clutch_bucket_group_adjust_threshold);
2310 os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked), (clutch_cpu_data_t)blocked_time, relaxed);
2311 }
2312 }
2313 return (uint32_t)new_blocked_data.scct_count;
2314}
2315
2316static uint32_t
2317sched_clutch_bucket_group_run_count_dec(
2318 sched_clutch_bucket_group_t clutch_bucket_group)
2319{
2320 sched_clutch_counter_time_t old_blocked_data;
2321 sched_clutch_counter_time_t new_blocked_data;
2322
2323 uint64_t ctime = mach_absolute_time();
2324 os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, {
2325 new_blocked_data.scct_count = old_blocked_data.scct_count - 1;
2326 new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp;
2327 if (new_blocked_data.scct_count == 0) {
2328 new_blocked_data.scct_timestamp = ctime;
2329 }
2330 });
2331 return (uint32_t)new_blocked_data.scct_count;
2332}
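
/*
 * Example timeline for the accounting above (hypothetical timestamps): when the
 * last runnable/running thread of a group blocks at t = 100, the run count drops
 * 1 -> 0 and scct_timestamp records 100. When a thread in the group becomes
 * runnable again at t = 160, the count goes 0 -> 1 and the 60 units of blocked
 * time are credited to scbcd_cpu_blocked (capped at
 * sched_clutch_bucket_group_adjust_threshold), raising the group's interactivity
 * score on the next calculation.
 */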
2333
2334/*
2335 * sched_clutch_thread_insert()
2336 *
2337 * Routine to insert a thread into the sched clutch hierarchy.
2338 * Update the counts at all levels of the hierarchy and insert the nodes
2339 * as they become runnable. Always called with the pset lock held.
2340 */
2341static boolean_t
2342sched_clutch_thread_insert(
2343 sched_clutch_root_t root_clutch,
2344 thread_t thread,
2345 integer_t options)
2346{
2347 boolean_t result = FALSE;
2348
2349 sched_clutch_hierarchy_locked_assert(root_clutch);
2350#if CONFIG_SCHED_EDGE
2351 sched_edge_cluster_cumulative_count_incr(root_clutch, thread->th_sched_bucket);
2352 sched_edge_shared_rsrc_runnable_load_incr(root_clutch, thread);
2353 /*
2354 * Check if the thread is bound and is being enqueued in its desired bound cluster.
2355 * One scenario where a bound thread might not be getting enqueued in the bound cluster
2356 * hierarchy would be if the thread is "soft" bound and the bound cluster is
2357 * de-recommended. In that case, the thread should be treated as an unbound
2358 * thread.
2359 */
2360 if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) && (sched_edge_thread_bound_cluster_id(thread) == root_clutch->scr_cluster_id)) {
2361 return sched_edge_bound_thread_insert(root_clutch, thread, options);
2362 }
2363 /*
2364 * Use bound runqueue for shared resource threads. See "cluster shared resource
2365 * threads load balancing" section for details.
2366 */
2367 if (sched_edge_thread_shared_rsrc_type(thread) != CLUSTER_SHARED_RSRC_TYPE_NONE) {
2368 return sched_edge_bound_thread_insert(root_clutch, thread, options);
2369 }
2370
2371#endif /* CONFIG_SCHED_EDGE */
2372 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2373 assert(thread->thread_group == clutch->sc_tg);
2374
2375 uint64_t current_timestamp = mach_absolute_time();
2376 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
2377 sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
2378 assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch));
2379
2380 /*
2381 * Thread linkage in clutch_bucket
2382 *
2383 * A thread has a few linkages within the clutch bucket:
2384 * - A stable priority queue linkage which is the main runqueue (based on sched_pri) for the clutch bucket
2385 * - A regular priority queue linkage which is based on thread's base/promoted pri (used for clutch bucket priority calculation)
2386 * - A queue linkage used for timesharing operations of threads at the scheduler tick
2387 */
2388
2389 /* Insert thread into the clutch_bucket stable priority runqueue using sched_pri */
2390 thread->th_clutch_runq_link.stamp = current_timestamp;
2391 priority_queue_entry_set_sched_pri(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link, thread->sched_pri,
2392 (options & SCHED_TAILQ) ? PRIORITY_QUEUE_ENTRY_NONE : PRIORITY_QUEUE_ENTRY_PREEMPTED);
2393	priority_queue_insert(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
2394
2395 /* Insert thread into clutch_bucket priority queue based on the promoted or base priority */
2396 priority_queue_entry_set_sched_pri(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link,
2397 sched_thread_sched_pri_promoted(thread) ? thread->sched_pri : thread->base_pri, false);
2398	priority_queue_insert(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);
2399
2400 /* Insert thread into timesharing queue of the clutch bucket */
2401	enqueue_tail(&clutch_bucket->scb_thread_timeshare_queue, &thread->th_clutch_timeshare_link);
2402
2403 /* Increment the urgency counter for the root if necessary */
2404 sched_clutch_root_urgency_inc(root_clutch, thread);
2405
2406 os_atomic_inc(&clutch->sc_thr_count, relaxed);
2407	sched_clutch_bucket_group_thr_count_inc(clutch_bucket->scb_group, current_timestamp);
2408
2409 /* Enqueue the clutch into the hierarchy (if needed) and update properties; pick the insertion order based on thread options */
2410 sched_clutch_bucket_options_t scb_options = (options & SCHED_HEADQ) ? SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ : SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ;
2411 if (clutch_bucket->scb_thr_count == 0) {
2412		sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
2413		sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2414		result = sched_clutch_bucket_runnable(clutch_bucket, root_clutch, current_timestamp, scb_options);
2415 } else {
2416		sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
2417		sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2418		result = sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, scb_options);
2419 }
2420
2421 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
2422 root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
2423 SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
2424 return result;
2425}
2426
2427/*
2428 * sched_clutch_thread_remove()
2429 *
2430 * Routine to remove a thread from the sched clutch hierarchy.
2431 * Update the counts at all levels of the hierarchy and remove the nodes
2432 * as they become empty. Always called with the pset lock held.
2433 */
2434static void
2435sched_clutch_thread_remove(
2436 sched_clutch_root_t root_clutch,
2437 thread_t thread,
2438 uint64_t current_timestamp,
2439 sched_clutch_bucket_options_t options)
2440{
2441 sched_clutch_hierarchy_locked_assert(root_clutch);
2442#if CONFIG_SCHED_EDGE
2443 sched_edge_cluster_cumulative_count_decr(root_clutch, thread->th_sched_bucket);
2444 sched_edge_shared_rsrc_runnable_load_decr(root_clutch, thread);
2445
2446 if (thread->th_bound_cluster_enqueued) {
2447 sched_edge_bound_thread_remove(root_clutch, thread);
2448 return;
2449 }
2450#endif /* CONFIG_SCHED_EDGE */
2451 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2452 assert(thread->thread_group == clutch->sc_tg);
2453 thread_assert_runq_nonnull(thread);
2454
2455 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
2456 sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
2457 assert(clutch_bucket->scb_root == root_clutch);
2458
2459 /* Decrement the urgency counter for the root if necessary */
2460 sched_clutch_root_urgency_dec(root_clutch, thread);
2461 /* Remove thread from the clutch_bucket */
2462	priority_queue_remove(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
2463	remqueue(&thread->th_clutch_timeshare_link);
2464
2465	priority_queue_remove(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);
2466
2467 /*
2468 * Warning: After this point, the thread's scheduling fields may be
2469 * modified by other cores that acquire the thread lock.
2470 */
2471 thread_clear_runq(thread);
2472
2473 /* Update counts at various levels of the hierarchy */
2474 os_atomic_dec(&clutch->sc_thr_count, relaxed);
2475	sched_clutch_bucket_group_thr_count_dec(clutch_bucket->scb_group, current_timestamp);
2476	sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
2477	sched_clutch_thr_count_dec(&clutch_bucket->scb_thr_count);
2478
2479 /* Remove the clutch from hierarchy (if needed) and update properties */
2480 if (clutch_bucket->scb_thr_count == 0) {
2481		sched_clutch_bucket_empty(clutch_bucket, root_clutch, current_timestamp, options);
2482 } else {
2483		sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, options);
2484 }
2485
2486 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
2487 root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
2488 SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
2489}
2490
2491/*
2492 * sched_clutch_thread_unbound_lookup()
2493 *
2494 * Routine to find the highest unbound thread in the root clutch.
2495 * Helps find threads easily for steal/migrate scenarios in the
2496 * Edge scheduler.
2497 */
2498static thread_t
2499sched_clutch_thread_unbound_lookup(
2500 sched_clutch_root_t root_clutch,
2501 sched_clutch_root_bucket_t root_bucket)
2502{
2503 sched_clutch_hierarchy_locked_assert(root_clutch);
2504
2505 /* Find the highest priority clutch bucket in this root bucket */
2506 sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket);
2507 assert(clutch_bucket != NULL);
2508
2509 /* Find the highest priority runnable thread in this clutch bucket */
2510 thread_t thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
2511 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE,
2512 thread_tid(thread), thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, 0, 0);
2513 return thread;
2514}
2515
2516/*
2517 * sched_clutch_thread_highest_remove()
2518 *
2519 * Routine to find and remove the highest priority thread
2520 * from the sched clutch hierarchy. The algorithm looks at the
2521 * hierarchy for the most eligible runnable thread and calls
2522 * sched_clutch_thread_remove(). Always called with the
2523 * pset lock held.
2524 */
2525static thread_t
2526sched_clutch_thread_highest_remove(
2527 sched_clutch_root_t root_clutch)
2528{
2529 sched_clutch_hierarchy_locked_assert(root_clutch);
2530 uint64_t current_timestamp = mach_absolute_time();
2531
2532	sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(root_clutch, current_timestamp, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
2533 if (root_bucket == NULL) {
2534 return THREAD_NULL;
2535 }
2536
2537 thread_t highest_thread = THREAD_NULL;
2538 if (root_bucket->scrb_bound) {
2539 highest_thread = sched_clutch_thread_bound_lookup(root_clutch, root_bucket);
2540 } else {
2541 highest_thread = sched_clutch_thread_unbound_lookup(root_clutch, root_bucket);
2542 }
2543	sched_clutch_thread_remove(root_clutch, highest_thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
2544 return highest_thread;
2545}
2546
2547/* High level global accessor routines */
2548
2549/*
2550 * sched_clutch_root_urgency()
2551 *
2552 * Routine to get the urgency of the highest runnable
2553 * thread in the hierarchy.
2554 */
2555static uint32_t
2556sched_clutch_root_urgency(
2557 sched_clutch_root_t root_clutch)
2558{
2559 return root_clutch->scr_urgency;
2560}
2561
2562/*
2563 * sched_clutch_root_count_sum()
2564 *
2565 * The count_sum mechanism is used for scheduler runq
2566 * statistics calculation. It's only useful for debugging
2567 * purposes; since it takes a mach_absolute_time() on
2568 * other scheduler implementations, it's better to avoid
2569 * populating this until absolutely necessary.
2570 */
2571static uint32_t
2572sched_clutch_root_count_sum(
2573 __unused sched_clutch_root_t root_clutch)
2574{
2575 return 0;
2576}
2577
2578/*
2579 * sched_clutch_root_priority()
2580 *
2581 * Routine to get the priority of the highest runnable
2582 * thread in the hierarchy.
2583 */
2584static int
2585sched_clutch_root_priority(
2586 sched_clutch_root_t root_clutch)
2587{
2588 return root_clutch->scr_priority;
2589}
2590
2591/*
2592 * sched_clutch_root_count()
2593 *
2594 * Returns total number of runnable threads in the hierarchy.
2595 */
2596uint32_t
2597sched_clutch_root_count(
2598 sched_clutch_root_t root_clutch)
2599{
2600 return root_clutch->scr_thr_count;
2601}
2602
2603#if CONFIG_SCHED_EDGE
2604
2605/*
2606 * sched_clutch_root_foreign_empty()
2607 *
2608 * Routine to check if the foreign clutch bucket priority list is empty for a cluster.
2609 */
2610static boolean_t
2611sched_clutch_root_foreign_empty(
2612 sched_clutch_root_t root_clutch)
2613{
2614 return priority_queue_empty(&root_clutch->scr_foreign_buckets);
2615}
2616
2617/*
2618 * sched_clutch_root_highest_foreign_thread_remove()
2619 *
2620 * Routine to remove and return the highest priority thread from the highest priority foreign clutch bucket in a cluster.
2621 * Must be called with the pset for the cluster locked.
2622 */
2623static thread_t
2624sched_clutch_root_highest_foreign_thread_remove(
2625 sched_clutch_root_t root_clutch)
2626{
2627 thread_t thread = THREAD_NULL;
2628 if (priority_queue_empty(&root_clutch->scr_foreign_buckets)) {
2629 return thread;
2630 }
2631 sched_clutch_bucket_t clutch_bucket = priority_queue_max(&root_clutch->scr_foreign_buckets, struct sched_clutch_bucket, scb_foreignlink);
2632 thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
2633 sched_clutch_thread_remove(root_clutch, thread, mach_absolute_time(), 0);
2634 return thread;
2635}
2636
2637#endif /* CONFIG_SCHED_EDGE */
2638
2639/*
2640 * sched_clutch_thread_pri_shift()
2641 *
2642 * Routine to get the priority shift value for a thread.
2643 * Since the timesharing is done at the clutch_bucket level,
2644 * this routine gets the clutch_bucket and retrieves the
2645 * values from there.
2646 */
2647uint32_t
2648sched_clutch_thread_pri_shift(
2649 thread_t thread,
2650 sched_bucket_t bucket)
2651{
2652 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
2653 return INT8_MAX;
2654 }
2655 assert(bucket != TH_BUCKET_RUN);
2656 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2657 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
2658 return os_atomic_load(&clutch_bucket_group->scbg_pri_shift, relaxed);
2659}
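/*
 * Illustrative sketch of how the pri_shift returned above is typically consumed:
 * the classic Mach timeshare decay lowers a thread's scheduled priority as its
 * CPU usage grows, scaled by pri_shift. The helper name and the exact clamping
 * below are assumptions for illustration only, not the kernel's actual priority
 * computation; note that returning INT8_MAX above effectively disables decay.
 */
#if 0
static int
example_timeshare_pri(int base_pri, uint64_t sched_usage, uint32_t pri_shift)
{
	if (pri_shift >= INT8_MAX) {
		/* Ineligible threads are not decayed based on usage */
		return base_pri;
	}
	int decay = (int)(sched_usage >> pri_shift);
	int pri = base_pri - decay;
	return (pri < MINPRI_USER) ? MINPRI_USER : pri;
}
#endif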
2660
2661#pragma mark -- Clutch Scheduler Algorithm
2662
2663static void
2664sched_clutch_init(void);
2665
2666static thread_t
2667sched_clutch_steal_thread(processor_set_t pset);
2668
2669static void
2670sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context);
2671
2672static boolean_t
2673sched_clutch_processor_enqueue(processor_t processor, thread_t thread,
2674 sched_options_t options);
2675
2676static boolean_t
2677sched_clutch_processor_queue_remove(processor_t processor, thread_t thread);
2678
2679static ast_t
2680sched_clutch_processor_csw_check(processor_t processor);
2681
2682static boolean_t
2683sched_clutch_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
2684
2685static int
2686sched_clutch_runq_count(processor_t processor);
2687
2688static boolean_t
2689sched_clutch_processor_queue_empty(processor_t processor);
2690
2691static uint64_t
2692sched_clutch_runq_stats_count_sum(processor_t processor);
2693
2694static int
2695sched_clutch_processor_bound_count(processor_t processor);
2696
2697static void
2698sched_clutch_pset_init(processor_set_t pset);
2699
2700static void
2701sched_clutch_processor_init(processor_t processor);
2702
2703static thread_t
2704sched_clutch_choose_thread(processor_t processor, int priority, ast_t reason);
2705
2706static void
2707sched_clutch_processor_queue_shutdown(processor_t processor);
2708
2709static sched_mode_t
2710sched_clutch_initial_thread_sched_mode(task_t parent_task);
2711
2712static uint32_t
2713sched_clutch_initial_quantum_size(thread_t thread);
2714
2715static bool
2716sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread, __unused ast_t reason);
2717
2718static uint32_t
2719sched_clutch_run_incr(thread_t thread);
2720
2721static uint32_t
2722sched_clutch_run_decr(thread_t thread);
2723
2724static void
2725sched_clutch_update_thread_bucket(thread_t thread);
2726
2727static void
2728sched_clutch_thread_group_recommendation_change(struct thread_group *tg, cluster_type_t new_recommendation);
2729
2730const struct sched_dispatch_table sched_clutch_dispatch = {
2731 .sched_name = "clutch",
2732 .init = sched_clutch_init,
2733 .timebase_init = sched_timeshare_timebase_init,
2734 .processor_init = sched_clutch_processor_init,
2735 .pset_init = sched_clutch_pset_init,
2736 .maintenance_continuation = sched_timeshare_maintenance_continue,
2737 .choose_thread = sched_clutch_choose_thread,
2738 .steal_thread_enabled = sched_steal_thread_enabled,
2739 .steal_thread = sched_clutch_steal_thread,
2740 .compute_timeshare_priority = sched_compute_timeshare_priority,
2741 .choose_node = sched_choose_node,
2742 .choose_processor = choose_processor,
2743 .processor_enqueue = sched_clutch_processor_enqueue,
2744 .processor_queue_shutdown = sched_clutch_processor_queue_shutdown,
2745 .processor_queue_remove = sched_clutch_processor_queue_remove,
2746 .processor_queue_empty = sched_clutch_processor_queue_empty,
2747 .priority_is_urgent = priority_is_urgent,
2748 .processor_csw_check = sched_clutch_processor_csw_check,
2749 .processor_queue_has_priority = sched_clutch_processor_queue_has_priority,
2750 .initial_quantum_size = sched_clutch_initial_quantum_size,
2751 .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode,
2752 .can_update_priority = can_update_priority,
2753 .update_priority = update_priority,
2754 .lightweight_update_priority = lightweight_update_priority,
2755 .quantum_expire = sched_default_quantum_expire,
2756 .processor_runq_count = sched_clutch_runq_count,
2757 .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum,
2758 .processor_bound_count = sched_clutch_processor_bound_count,
2759 .thread_update_scan = sched_clutch_thread_update_scan,
2760 .multiple_psets_enabled = TRUE,
2761 .sched_groups_enabled = FALSE,
2762 .avoid_processor_enabled = TRUE,
2763 .thread_avoid_processor = sched_clutch_thread_avoid_processor,
2764 .processor_balance = sched_SMT_balance,
2765
2766 .rt_runq = sched_rtlocal_runq,
2767 .rt_init = sched_rtlocal_init,
2768 .rt_queue_shutdown = sched_rtlocal_queue_shutdown,
2769 .rt_runq_scan = sched_rtlocal_runq_scan,
2770 .rt_runq_count_sum = sched_rtlocal_runq_count_sum,
2771 .rt_steal_thread = sched_rtlocal_steal_thread,
2772
2773 .qos_max_parallelism = sched_qos_max_parallelism,
2774 .check_spill = sched_check_spill,
2775 .ipi_policy = sched_ipi_policy,
2776 .thread_should_yield = sched_thread_should_yield,
2777 .run_count_incr = sched_clutch_run_incr,
2778 .run_count_decr = sched_clutch_run_decr,
2779 .update_thread_bucket = sched_clutch_update_thread_bucket,
2780 .pset_made_schedulable = sched_pset_made_schedulable,
2781 .thread_group_recommendation_change = sched_clutch_thread_group_recommendation_change,
2782 .cpu_init_completed = NULL,
2783 .thread_eligible_for_pset = NULL,
2784};
2785
2786__attribute__((always_inline))
2787static inline run_queue_t
2788sched_clutch_bound_runq(processor_t processor)
2789{
2790 return &processor->runq;
2791}
2792
2793__attribute__((always_inline))
2794static inline sched_clutch_root_t
2795sched_clutch_processor_root_clutch(processor_t processor)
2796{
2797 return &processor->processor_set->pset_clutch_root;
2798}
2799
2800__attribute__((always_inline))
2801static inline run_queue_t
2802sched_clutch_thread_bound_runq(processor_t processor, __assert_only thread_t thread)
2803{
2804 assert(thread->bound_processor == processor);
2805 return sched_clutch_bound_runq(processor);
2806}
2807
2808static uint32_t
2809sched_clutch_initial_quantum_size(thread_t thread)
2810{
2811 if (thread == THREAD_NULL) {
2812 return std_quantum;
2813 }
2814 assert(sched_clutch_thread_quantum[thread->th_sched_bucket] <= UINT32_MAX);
2815 return (uint32_t)sched_clutch_thread_quantum[thread->th_sched_bucket];
2816}
2817
2818static sched_mode_t
2819sched_clutch_initial_thread_sched_mode(task_t parent_task)
2820{
2821 if (parent_task == kernel_task) {
2822 return TH_MODE_FIXED;
2823 } else {
2824 return TH_MODE_TIMESHARE;
2825 }
2826}
2827
2828static void
2829sched_clutch_processor_init(processor_t processor)
2830{
2831 run_queue_init(&processor->runq);
2832}
2833
2834static void
2835sched_clutch_pset_init(processor_set_t pset)
2836{
2837 sched_clutch_root_init(&pset->pset_clutch_root, pset);
2838}
2839
2840static void
2841sched_clutch_tunables_init(void)
2842{
2843 sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
2844 sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
2845 sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
2846 clock_interval_to_absolutetime_interval(SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS,
2847 NSEC_PER_USEC, &sched_clutch_bucket_group_adjust_threshold);
2848 assert(sched_clutch_bucket_group_adjust_threshold <= CLUTCH_CPU_DATA_MAX);
2849 sched_clutch_us_to_abstime(sched_clutch_bucket_group_pending_delta_us, sched_clutch_bucket_group_pending_delta);
2850}
2851
2852static void
2853sched_clutch_init(void)
2854{
2855 if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) {
2856 sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
2857 }
2858 sched_timeshare_init();
2859 sched_clutch_tunables_init();
2860}
2861
2862static thread_t
2863sched_clutch_choose_thread(
2864 processor_t processor,
2865 int priority,
2866 __unused ast_t reason)
2867{
2868 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
2869 uint32_t clutch_count = sched_clutch_root_count(sched_clutch_processor_root_clutch(processor));
2870 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2871 boolean_t choose_from_boundq = false;
2872
2873 if (bound_runq->highq < priority &&
2874 clutch_pri < priority) {
2875 return THREAD_NULL;
2876 }
2877
2878 if (bound_runq->count && clutch_count) {
2879 if (bound_runq->highq >= clutch_pri) {
2880 choose_from_boundq = true;
2881 }
2882 } else if (bound_runq->count) {
2883 choose_from_boundq = true;
2884 } else if (clutch_count) {
2885 choose_from_boundq = false;
2886 } else {
2887 return THREAD_NULL;
2888 }
2889
2890 thread_t thread = THREAD_NULL;
2891 if (choose_from_boundq == false) {
2892 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
2893 thread = sched_clutch_thread_highest_remove(pset_clutch_root);
2894 } else {
2895 thread = run_queue_dequeue(bound_runq, SCHED_HEADQ);
2896 }
2897 return thread;
2898}
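/*
 * Illustrative sketch of the selection rule above, factored as a standalone
 * predicate. The helper name is hypothetical and it assumes the caller has
 * already handled the case where both queues are empty (THREAD_NULL above);
 * it simply restates the decision: prefer the bound runq whenever it is
 * non-empty and its highest priority is at least the clutch root priority.
 */
#if 0
static bool
example_choose_from_boundq(int bound_count, int bound_highq,
    uint32_t clutch_count, int clutch_pri)
{
	if (bound_count && clutch_count) {
		return bound_highq >= clutch_pri;
	}
	return bound_count != 0;
}
#endif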
2899
2900static boolean_t
2901sched_clutch_processor_enqueue(
2902 processor_t processor,
2903 thread_t thread,
2904 sched_options_t options)
2905{
2906 boolean_t result;
2907
2908 thread_set_runq_locked(thread, processor);
2909 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
2910 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
2911 result = sched_clutch_thread_insert(pset_clutch_root, thread, options);
2912 } else {
2913 run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
2914 result = run_queue_enqueue(rq, thread, options);
2915 }
2916 return result;
2917}
2918
2919static boolean_t
2920sched_clutch_processor_queue_empty(processor_t processor)
2921{
2922 return sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0 &&
2923 sched_clutch_bound_runq(processor)->count == 0;
2924}
2925
2926static ast_t
2927sched_clutch_processor_csw_check(processor_t processor)
2928{
2929 boolean_t has_higher;
2930 int pri;
2931
2932 if (sched_clutch_thread_avoid_processor(processor, current_thread(), AST_NONE)) {
2933 return AST_PREEMPT | AST_URGENT;
2934 }
2935
2936 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2937 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
2938
2939 assert(processor->active_thread != NULL);
2940
2941 pri = MAX(clutch_pri, bound_runq->highq);
2942
2943 if (processor->first_timeslice) {
2944 has_higher = (pri > processor->current_pri);
2945 } else {
2946 has_higher = (pri >= processor->current_pri);
2947 }
2948
2949 if (has_higher) {
2950 if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) {
2951 return AST_PREEMPT | AST_URGENT;
2952 }
2953
2954 if (bound_runq->urgency > 0) {
2955 return AST_PREEMPT | AST_URGENT;
2956 }
2957
2958 return AST_PREEMPT;
2959 }
2960
2961 return AST_NONE;
2962}
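/*
 * Worked example for the preemption check above (illustrative numbers): with the
 * current thread at priority 31 on its first timeslice and a highest runnable
 * priority of 37 across the clutch root and the bound runq, has_higher is true;
 * the result is AST_PREEMPT | AST_URGENT only if either the clutch root or the
 * bound runq reports a non-zero urgency count, otherwise plain AST_PREEMPT.
 */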
2963
2964static boolean_t
2965sched_clutch_processor_queue_has_priority(processor_t processor,
2966 int priority,
2967 boolean_t gte)
2968{
2969 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2970
2971 int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq);
2972
2973 if (gte) {
2974 return qpri >= priority;
2975 } else {
2976 return qpri > priority;
2977 }
2978}
2979
2980static int
2981sched_clutch_runq_count(processor_t processor)
2982{
2983 return (int)sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) + sched_clutch_bound_runq(processor)->count;
2984}
2985
2986static uint64_t
2987sched_clutch_runq_stats_count_sum(processor_t processor)
2988{
2989 uint64_t bound_sum = sched_clutch_bound_runq(processor)->runq_stats.count_sum;
2990
2991 if (processor->cpu_id == processor->processor_set->cpu_set_low) {
2992 return bound_sum + sched_clutch_root_count_sum(sched_clutch_processor_root_clutch(processor));
2993 } else {
2994 return bound_sum;
2995 }
2996}
2997static int
2998sched_clutch_processor_bound_count(processor_t processor)
2999{
3000 return sched_clutch_bound_runq(processor)->count;
3001}
3002
3003static void
3004sched_clutch_processor_queue_shutdown(processor_t processor)
3005{
3006 processor_set_t pset = processor->processor_set;
3007 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3008 thread_t thread;
3009 queue_head_t tqueue;
3010
3011 /* We only need to migrate threads if this is the last active processor in the pset */
3012 if (pset->online_processor_count > 0) {
3013 pset_unlock(pset);
3014 return;
3015 }
3016
3017 queue_init(&tqueue);
3018 while (sched_clutch_root_count(pset_clutch_root) > 0) {
3019 thread = sched_clutch_thread_highest_remove(pset_clutch_root);
3020 enqueue_tail(&tqueue, &thread->runq_links);
3021 }
3022
3023 pset_unlock(pset);
3024
3025 qe_foreach_element_safe(thread, &tqueue, runq_links) {
3026 remqueue(&thread->runq_links);
3027 thread_lock(thread);
3028 thread_setrun(thread, SCHED_TAILQ);
3029 thread_unlock(thread);
3030 }
3031}
3032
3033static boolean_t
3034sched_clutch_processor_queue_remove(
3035 processor_t processor,
3036 thread_t thread)
3037{
3038 processor_set_t pset = processor->processor_set;
3039
3040 pset_lock(pset);
3041
3042 if (processor == thread_get_runq_locked(thread)) {
3043 /*
3044 * Thread is on a run queue and we have a lock on
3045 * that run queue.
3046 */
3047 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
3048 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3049 sched_clutch_thread_remove(pset_clutch_root, thread, mach_absolute_time(), SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
3050 } else {
3051 run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
3052 run_queue_remove(rq, thread);
3053 }
3054 } else {
3055 /*
3056 * The thread left the run queue before we could
3057 * lock the run queue.
3058 */
3059 thread_assert_runq_null(thread);
3060 processor = PROCESSOR_NULL;
3061 }
3062
3063 pset_unlock(pset);
3064
3065 return processor != PROCESSOR_NULL;
3066}
3067
3068static thread_t
3069sched_clutch_steal_thread(__unused processor_set_t pset)
3070{
3071 /* Thread stealing is not enabled for single cluster clutch scheduler platforms */
3072 return THREAD_NULL;
3073}
3074
3075static void
3076sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context)
3077{
3078 boolean_t restart_needed = FALSE;
3079 processor_t processor = processor_list;
3080 processor_set_t pset;
3081 thread_t thread;
3082 spl_t s;
3083
3084 /*
3085 * We update the threads associated with each processor (bound and idle threads)
3086 * and then update the threads in each pset runqueue.
3087 */
3088
3089 do {
3090 do {
3091 pset = processor->processor_set;
3092
3093 s = splsched();
3094 pset_lock(pset);
3095
3096 restart_needed = runq_scan(sched_clutch_bound_runq(processor), scan_context);
3097
3098 pset_unlock(pset);
3099 splx(s);
3100
3101 if (restart_needed) {
3102 break;
3103 }
3104
3105 thread = processor->idle_thread;
3106 if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
3107 if (thread_update_add_thread(thread) == FALSE) {
3108 restart_needed = TRUE;
3109 break;
3110 }
3111 }
3112 } while ((processor = processor->processor_list) != NULL);
3113
3114 /* Ok, we now have a collection of candidates -- fix them. */
3115 thread_update_process_threads();
3116 } while (restart_needed);
3117
3118 pset_node_t node = &pset_node0;
3119 pset = node->psets;
3120
3121 do {
3122 do {
3123 restart_needed = FALSE;
3124 while (pset != NULL) {
3125 s = splsched();
3126 pset_lock(pset);
3127
3128 if (sched_clutch_root_count(&pset->pset_clutch_root) > 0) {
3129 for (sched_bucket_t bucket = TH_BUCKET_SHARE_FG; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
3130 restart_needed = runq_scan(&pset->pset_clutch_root.scr_bound_buckets[bucket].scrb_bound_thread_runq, scan_context);
3131 if (restart_needed) {
3132 break;
3133 }
3134 }
3135 queue_t clutch_bucket_list = &pset->pset_clutch_root.scr_clutch_buckets;
3136 sched_clutch_bucket_t clutch_bucket;
3137 qe_foreach_element(clutch_bucket, clutch_bucket_list, scb_listlink) {
3138 sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, scan_context->sched_tick_last_abstime);
3139 restart_needed = sched_clutch_timeshare_scan(&clutch_bucket->scb_thread_timeshare_queue, clutch_bucket->scb_thr_count, scan_context);
3140 if (restart_needed) {
3141 break;
3142 }
3143 }
3144 }
3145
3146 pset_unlock(pset);
3147 splx(s);
3148
3149 if (restart_needed) {
3150 break;
3151 }
3152 pset = pset->pset_list;
3153 }
3154
3155 if (restart_needed) {
3156 break;
3157 }
3158 } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
3159
3160 /* Ok, we now have a collection of candidates -- fix them. */
3161 thread_update_process_threads();
3162 } while (restart_needed);
3163}
3164
3165extern int sched_allow_rt_smt;
3166
3167/* Return true if this thread should not continue running on this processor */
3168static bool
3169sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread, __unused ast_t reason)
3170{
3171 if (processor->processor_primary != processor) {
3172 /*
3173 * This is a secondary SMT processor. If the primary is running
3174 * a realtime thread, only allow realtime threads on the secondary.
3175 */
3176 if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) {
3177 return true;
3178 }
3179 }
3180
3181 return false;
3182}
3183
3184/*
3185 * For the clutch scheduler, the run counts are maintained in the clutch
3186 * buckets (i.e. the thread group scheduling structure).
3187 */
3188static uint32_t
3189sched_clutch_run_incr(thread_t thread)
3190{
3191 assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
3192 uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3193 sched_clutch_thread_run_bucket_incr(thread, thread->th_sched_bucket);
3194 return new_count;
3195}
3196
3197static uint32_t
3198sched_clutch_run_decr(thread_t thread)
3199{
3200 assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN);
3201 uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3202 sched_clutch_thread_run_bucket_decr(thread, thread->th_sched_bucket);
3203 return new_count;
3204}
3205
3206/*
3207 * For threads that have changed sched_pri without changing the
3208 * base_pri for any reason other than decay, use the sched_pri
3209 * as the bucketizing priority instead of base_pri. All such
3210 * changes are typically due to kernel locking primitives boosts
3211 * or demotions.
3212 */
3213static boolean_t
3214sched_thread_sched_pri_promoted(thread_t thread)
3215{
3216 return (thread->sched_flags & TH_SFLAG_PROMOTED) ||
3217 (thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) ||
3218 (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) ||
3219 (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) ||
3220 (thread->kern_promotion_schedpri != 0);
3221}
3222
3223/*
3224 * Routine to update the scheduling bucket for the thread.
3225 *
3226 * In the clutch scheduler implementation, the thread's bucket
3227 * is based on sched_pri if it was promoted due to a kernel
3228 * primitive; otherwise it is based on the thread's base_pri. This
3229 * enhancement allows promoted threads to reach a higher priority
3230 * bucket and potentially get selected sooner for scheduling.
3231 *
3232 * Also, the clutch scheduler does not honor fixed priority below
3233 * FG priority. It simply puts those threads in the corresponding
3234 * timeshare bucket. The reason for doing that is that it is
3235 * extremely hard to define the scheduling properties of such threads
3236 * and they typically lead to performance issues.
3237 */
3238
3239void
3240sched_clutch_update_thread_bucket(thread_t thread)
3241{
3242 sched_bucket_t old_bucket = thread->th_sched_bucket;
3243 thread_assert_runq_null(thread);
3244 int pri = (sched_thread_sched_pri_promoted(thread)) ? thread->sched_pri : thread->base_pri;
3245 sched_bucket_t new_bucket = sched_clutch_thread_bucket_map(thread, pri);
3246
3247 if (old_bucket == new_bucket) {
3248 return;
3249 }
3250
3251 thread->th_sched_bucket = new_bucket;
3252 thread->pri_shift = sched_clutch_thread_pri_shift(thread, new_bucket);
3253 /*
3254 * Since this is called after the thread has been removed from the runq,
3255 * only the run counts need to be updated. The re-insert into the runq
3256 * would put the thread into the correct new bucket's runq.
3257 */
3258 if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
3259 sched_clutch_thread_run_bucket_decr(thread, old_bucket);
3260 sched_clutch_thread_run_bucket_incr(thread, new_bucket);
3261 }
3262}
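/*
 * Illustrative example of the bucketizing rule above (numbers are hypothetical):
 * a thread with base_pri 37 that is currently promoted to sched_pri 80 by a
 * kernel locking primitive is bucketized using 80, which typically maps it to a
 * higher root bucket until the promotion is dropped; an unpromoted thread is
 * always bucketized by its base_pri.
 */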
3263
3264static void
3265sched_clutch_thread_group_recommendation_change(__unused struct thread_group *tg, __unused cluster_type_t new_recommendation)
3266{
3267 /* Clutch ignores the recommendation because, unlike the Edge scheduler,
3268 * Clutch does not migrate threads across cluster types on its own.
3269 */
3270}
3271
3272#if CONFIG_SCHED_EDGE
3273
3274/* Implementation of the AMP version of the clutch scheduler */
3275
3276static void
3277sched_edge_init(void);
3278
3279static void
3280sched_edge_pset_init(processor_set_t pset);
3281
3282static thread_t
3283sched_edge_processor_idle(processor_set_t pset);
3284
3285static ast_t
3286sched_edge_processor_csw_check(processor_t processor);
3287
3288static boolean_t
3289sched_edge_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
3290
3291static boolean_t
3292sched_edge_processor_queue_empty(processor_t processor);
3293
3294static thread_t
3295sched_edge_choose_thread(processor_t processor, int priority, ast_t reason);
3296
3297static void
3298sched_edge_processor_queue_shutdown(processor_t processor);
3299
3300static processor_t
3301sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread);
3302
3303static bool
3304sched_edge_thread_avoid_processor(processor_t processor, thread_t thread, ast_t reason);
3305
3306static bool
3307sched_edge_balance(processor_t cprocessor, processor_set_t cpset);
3308
3309static void
3310sched_edge_check_spill(processor_set_t pset, thread_t thread);
3311
3312static bool
3313sched_edge_thread_should_yield(processor_t processor, thread_t thread);
3314
3315static void
3316sched_edge_pset_made_schedulable(processor_t processor, processor_set_t dst_pset, boolean_t drop_lock);
3317
3318static void
3319sched_edge_cpu_init_completed(void);
3320
3321static bool
3322sched_edge_thread_eligible_for_pset(thread_t thread, processor_set_t pset);
3323
3324static bool
3325sched_edge_steal_thread_enabled(processor_set_t pset);
3326
3327static sched_ipi_type_t
3328sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event);
3329
3330static uint32_t
3331sched_edge_qos_max_parallelism(int qos, uint64_t options);
3332
3333static uint32_t
3334sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket);
3335
3336const struct sched_dispatch_table sched_edge_dispatch = {
3337 .sched_name = "edge",
3338 .init = sched_edge_init,
3339 .timebase_init = sched_timeshare_timebase_init,
3340 .processor_init = sched_clutch_processor_init,
3341 .pset_init = sched_edge_pset_init,
3342 .maintenance_continuation = sched_timeshare_maintenance_continue,
3343 .choose_thread = sched_edge_choose_thread,
3344 .steal_thread_enabled = sched_edge_steal_thread_enabled,
3345 .steal_thread = sched_edge_processor_idle,
3346 .compute_timeshare_priority = sched_compute_timeshare_priority,
3347 .choose_node = sched_choose_node,
3348 .choose_processor = sched_edge_choose_processor,
3349 .processor_enqueue = sched_clutch_processor_enqueue,
3350 .processor_queue_shutdown = sched_edge_processor_queue_shutdown,
3351 .processor_queue_remove = sched_clutch_processor_queue_remove,
3352 .processor_queue_empty = sched_edge_processor_queue_empty,
3353 .priority_is_urgent = priority_is_urgent,
3354 .processor_csw_check = sched_edge_processor_csw_check,
3355 .processor_queue_has_priority = sched_edge_processor_queue_has_priority,
3356 .initial_quantum_size = sched_clutch_initial_quantum_size,
3357 .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode,
3358 .can_update_priority = can_update_priority,
3359 .update_priority = update_priority,
3360 .lightweight_update_priority = lightweight_update_priority,
3361 .quantum_expire = sched_default_quantum_expire,
3362 .processor_runq_count = sched_clutch_runq_count,
3363 .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum,
3364 .processor_bound_count = sched_clutch_processor_bound_count,
3365 .thread_update_scan = sched_clutch_thread_update_scan,
3366 .multiple_psets_enabled = TRUE,
3367 .sched_groups_enabled = FALSE,
3368 .avoid_processor_enabled = TRUE,
3369 .thread_avoid_processor = sched_edge_thread_avoid_processor,
3370 .processor_balance = sched_edge_balance,
3371
3372 .rt_runq = sched_rtlocal_runq,
3373 .rt_init = sched_rtlocal_init,
3374 .rt_queue_shutdown = sched_rtlocal_queue_shutdown,
3375 .rt_runq_scan = sched_rtlocal_runq_scan,
3376 .rt_runq_count_sum = sched_rtlocal_runq_count_sum,
3377 .rt_steal_thread = sched_rtlocal_steal_thread,
3378
3379 .qos_max_parallelism = sched_edge_qos_max_parallelism,
3380 .check_spill = sched_edge_check_spill,
3381 .ipi_policy = sched_edge_ipi_policy,
3382 .thread_should_yield = sched_edge_thread_should_yield,
3383 .run_count_incr = sched_clutch_run_incr,
3384 .run_count_decr = sched_clutch_run_decr,
3385 .update_thread_bucket = sched_clutch_update_thread_bucket,
3386 .pset_made_schedulable = sched_edge_pset_made_schedulable,
3387 .thread_group_recommendation_change = NULL,
3388 .cpu_init_completed = sched_edge_cpu_init_completed,
3389 .thread_eligible_for_pset = sched_edge_thread_eligible_for_pset,
3390};
3391
3392static bitmap_t sched_edge_available_pset_bitmask[BITMAP_LEN(MAX_PSETS)];
3393
3394/*
3395 * sched_edge_pset_available()
3396 *
3397 * Routine to determine if a pset is available for scheduling.
3398 */
3399static bool
3400sched_edge_pset_available(processor_set_t pset)
3401{
3402 if (pset == NULL) {
3403 return false;
3404 }
3405 return pset_available_cpu_count(pset) > 0;
3406}
3407
3408/*
3409 * sched_edge_thread_bound_cluster_id()
3410 *
3411 * Routine to determine which cluster a particular thread is bound to. Uses
3412 * the th_bound_cluster_id field on the thread to map back to a specific cluster id.
3413 *
3414 * <Edge Multi-cluster Support Needed>
3415 */
3416static uint32_t
3417sched_edge_thread_bound_cluster_id(thread_t thread)
3418{
3419 assert(SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread));
3420 return thread->th_bound_cluster_id;
3421}
3422
3423/* Forward declaration for some thread migration routines */
3424static boolean_t sched_edge_foreign_runnable_thread_available(processor_set_t pset);
3425static boolean_t sched_edge_foreign_running_thread_available(processor_set_t pset);
3426static processor_set_t sched_edge_steal_candidate(processor_set_t pset);
3427static processor_set_t sched_edge_migrate_candidate(processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks);
3428
3429/*
3430 * sched_edge_config_set()
3431 *
3432 * Support to update an edge configuration. Typically used by CLPC to affect thread migration
3433 * policies in the scheduler.
3434 */
3435static void
3436sched_edge_config_set(uint32_t src_cluster, uint32_t dst_cluster, sched_clutch_edge edge_config)
3437{
3438 sched_clutch_edge *edge = &pset_array[src_cluster]->sched_edges[dst_cluster];
3439 edge->sce_edge_packed = edge_config.sce_edge_packed;
3440}
3441
3442/*
3443 * sched_edge_config_get()
3444 *
3445 * Support to get an edge configuration. Typically used by CLPC to query edge configs to decide
3446 * if it needs to update edges.
3447 */
3448static sched_clutch_edge
3449sched_edge_config_get(uint32_t src_cluster, uint32_t dst_cluster)
3450{
3451 return pset_array[src_cluster]->sched_edges[dst_cluster];
3452}
3453
3454/*
3455 * sched_edge_matrix_set()
3456 *
3457 * Routine to update various edges in the cluster edge matrix. The edge_changes_bitmap
3458 * indicates which edges need to be updated. Both the edge_matrix & edge_changes_bitmap
3459 * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array.
3460 */
3461void
3462sched_edge_matrix_set(sched_clutch_edge *edge_matrix, bool *edge_changes_bitmap, __unused uint64_t flags, uint64_t matrix_order)
3463{
3464 uint32_t edge_index = 0;
3465 for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) {
3466 for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) {
3467 if (edge_changes_bitmap[edge_index]) {
3468 sched_edge_config_set(src_cluster, dst_cluster, edge_matrix[edge_index]);
3469 }
3470 edge_index++;
3471 }
3472 }
3473}
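/*
 * Illustrative sketch of the flattened row-major indexing used above: for a
 * system with matrix_order == 2, the edge from cluster 1 to cluster 0 lives at
 * index (1 * matrix_order + 0) == 2. The helper below is hypothetical and only
 * demonstrates the index math.
 */
#if 0
static inline uint32_t
example_edge_index(uint32_t src_cluster, uint32_t dst_cluster, uint64_t matrix_order)
{
	return (uint32_t)((src_cluster * matrix_order) + dst_cluster);
}
#endif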
3474
3475/*
3476 * sched_edge_matrix_get()
3477 *
3478 * Routine to retrieve various edges in the cluster edge matrix. The edge_request_bitmap
3479 * indicates which edges need to be retrieved. Both the edge_matrix & edge_request_bitmap
3480 * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array.
3481 */
3482void
3483sched_edge_matrix_get(sched_clutch_edge *edge_matrix, bool *edge_request_bitmap, __unused uint64_t flags, uint64_t matrix_order)
3484{
3485 uint32_t edge_index = 0;
3486 for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) {
3487 for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) {
3488 if (edge_request_bitmap[edge_index]) {
3489 edge_matrix[edge_index] = sched_edge_config_get(src_cluster, dst_cluster);
3490 }
3491 edge_index++;
3492 }
3493 }
3494}
3495
3496/*
3497 * sched_edge_init()
3498 *
3499 * Routine to initialize the data structures for the Edge scheduler.
3500 */
3501static void
3502sched_edge_init(void)
3503{
3504 if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) {
3505 sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
3506 }
3507 sched_timeshare_init();
3508 sched_clutch_tunables_init();
3509 sched_edge_max_clusters = ml_get_cluster_count();
3510}
3511
3512static void
3513sched_edge_pset_init(processor_set_t pset)
3514{
3515 uint32_t pset_cluster_id = pset->pset_cluster_id;
3516 pset->pset_type = (pset->pset_cluster_type == PSET_AMP_P) ? CLUSTER_TYPE_P : CLUSTER_TYPE_E;
3517
3518 /* Set the edge weight and properties for the pset itself */
3519 bitmap_clear(pset->foreign_psets, pset_cluster_id);
3520 bitmap_clear(pset->native_psets, pset_cluster_id);
3521 bitmap_clear(pset->local_psets, pset_cluster_id);
3522 bitmap_clear(pset->remote_psets, pset_cluster_id);
3523 pset->sched_edges[pset_cluster_id].sce_edge_packed = (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0}.sce_edge_packed;
3524 sched_clutch_root_init(&pset->pset_clutch_root, pset);
3525 bitmap_set(sched_edge_available_pset_bitmask, pset_cluster_id);
3526}
3527
3528static thread_t
3529sched_edge_choose_thread(
3530 processor_t processor,
3531 int priority,
3532 __unused ast_t reason)
3533{
3534 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
3535 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3536 boolean_t choose_from_boundq = false;
3537
3538 if ((bound_runq->highq < priority) &&
3539 (clutch_pri < priority)) {
3540 return THREAD_NULL;
3541 }
3542
3543 if (bound_runq->highq >= clutch_pri) {
3544 choose_from_boundq = true;
3545 }
3546
3547 thread_t thread = THREAD_NULL;
3548 if (choose_from_boundq == false) {
3549 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3550 thread = sched_clutch_thread_highest_remove(pset_clutch_root);
3551 } else {
3552 thread = run_queue_dequeue(bound_runq, SCHED_HEADQ);
3553 }
3554 return thread;
3555}
3556
3557static boolean_t
3558sched_edge_processor_queue_empty(processor_t processor)
3559{
3560 return (sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0) &&
3561 (sched_clutch_bound_runq(processor)->count == 0);
3562}
3563
3564static void
3565sched_edge_check_spill(__unused processor_set_t pset, __unused thread_t thread)
3566{
3567 assert(thread->bound_processor == PROCESSOR_NULL);
3568}
3569
3570__options_decl(sched_edge_thread_yield_reason_t, uint32_t, {
3571 SCHED_EDGE_YIELD_RUNQ_NONEMPTY = 0x0,
3572 SCHED_EDGE_YIELD_FOREIGN_RUNNABLE = 0x1,
3573 SCHED_EDGE_YIELD_FOREIGN_RUNNING = 0x2,
3574 SCHED_EDGE_YIELD_STEAL_POSSIBLE = 0x3,
3575 SCHED_EDGE_YIELD_DISALLOW = 0x4,
3576});
3577
3578static bool
3579sched_edge_thread_should_yield(processor_t processor, __unused thread_t thread)
3580{
3581 if (!sched_edge_processor_queue_empty(processor) || (rt_runq_count(processor->processor_set) > 0)) {
3582 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3583 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_RUNQ_NONEMPTY);
3584 return true;
3585 }
3586
3587 /*
3588 * The yield logic should follow the same logic that steal_thread() does, since
3589 * thread_should_yield() is effectively a quick check of whether, if the
3590 * current thread gave up the CPU, any other thread would execute
3591 * on this CPU. So it needs to provide the same answer as the steal_thread()/
3592 * processor idle logic.
3593 */
3594 if (sched_edge_foreign_runnable_thread_available(processor->processor_set)) {
3595 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3596 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNABLE);
3597 return true;
3598 }
3599 if (sched_edge_foreign_running_thread_available(processor->processor_set)) {
3600 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3601 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNING);
3602 return true;
3603 }
3604
3605 processor_set_t steal_candidate = sched_edge_steal_candidate(processor->processor_set);
3606 if (steal_candidate != NULL) {
3607 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3608 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_STEAL_POSSIBLE);
3609 return true;
3610 }
3611
3612 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE, thread_tid(thread), processor->processor_set->pset_cluster_id,
3613 0, SCHED_EDGE_YIELD_DISALLOW);
3614 return false;
3615}
3616
3617static ast_t
3618sched_edge_processor_csw_check(processor_t processor)
3619{
3620 boolean_t has_higher;
3621 int pri;
3622
3623 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
3624 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3625
3626 assert(processor->active_thread != NULL);
3627
3628 pri = MAX(clutch_pri, bound_runq->highq);
3629
3630 if (processor->first_timeslice) {
3631 has_higher = (pri > processor->current_pri);
3632 } else {
3633 has_higher = (pri >= processor->current_pri);
3634 }
3635
3636 if (has_higher) {
3637 if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) {
3638 return AST_PREEMPT | AST_URGENT;
3639 }
3640
3641 if (bound_runq->urgency > 0) {
3642 return AST_PREEMPT | AST_URGENT;
3643 }
3644
3645 return AST_PREEMPT;
3646 }
3647
3648 return AST_NONE;
3649}
3650
3651static boolean_t
3652sched_edge_processor_queue_has_priority(processor_t processor,
3653 int priority,
3654 boolean_t gte)
3655{
3656 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3657
3658 int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq);
3659 if (gte) {
3660 return qpri >= priority;
3661 } else {
3662 return qpri > priority;
3663 }
3664}
3665
3666static void
3667sched_edge_processor_queue_shutdown(processor_t processor)
3668{
3669 processor_set_t pset = processor->processor_set;
3670 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3671 thread_t thread;
3672 queue_head_t tqueue;
3673
3674 /* We only need to migrate threads if this is the last active or last recommended processor in the pset */
3675 if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) {
3676 pset_unlock(pset);
3677 return;
3678 }
3679
3680 bitmap_clear(sched_edge_available_pset_bitmask, pset->pset_cluster_id);
3681
3682 queue_init(&tqueue);
3683 while (sched_clutch_root_count(pset_clutch_root) > 0) {
3684 thread = sched_clutch_thread_highest_remove(pset_clutch_root);
3685 enqueue_tail(&tqueue, &thread->runq_links);
3686 }
3687 pset_unlock(pset);
3688
3689 qe_foreach_element_safe(thread, &tqueue, runq_links) {
3690 remqueue(&thread->runq_links);
3691 thread_lock(thread);
3692 thread_setrun(thread, SCHED_TAILQ);
3693 thread_unlock(thread);
3694 }
3695}
3696
3697/*
3698 * sched_edge_cluster_load_metric()
3699 *
3700 * The load metric for a cluster is a measure of the average scheduling latency
3701 * experienced by threads on that cluster. It is a product of the average number
3702 * of threads in the runqueue and the average execution time for threads. The metric
3703 * has special values in the following cases:
3704 * - UINT32_MAX: If the cluster is not available for scheduling, its load is set to
3705 * the maximum value to prevent any threads from migrating to this cluster.
3706 * - 0: If there are idle CPUs in the cluster or the runqueue is empty; this allows threads
3707 * to be spread across the platform quickly for ncpu-wide workloads.
3708 */
3709static uint32_t
3710sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket)
3711{
3712 if (sched_edge_pset_available(pset) == false) {
3713 return UINT32_MAX;
3714 }
3715 return (uint32_t)sched_get_pset_load_average(pset, sched_bucket);
3716}
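/*
 * Worked example (illustrative): a cluster with an idle CPU or an empty runqueue
 * reports a load metric of 0 for the queried bucket, so a runnable thread can be
 * spread to it immediately; a cluster with no available CPUs reports UINT32_MAX,
 * so no thread is migrated to it.
 */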
3717
3718/*
3719 *
3720 * Edge Scheduler Steal/Rebalance logic
3721 *
3722 * = Generic scheduler logic =
3723 *
3724 * The SCHED(steal_thread) scheduler callout is invoked when the processor does not
3725 * find any thread for execution in its runqueue. The aim of the steal operation
3726 * is to find other threads running/runnable in other clusters which should be
3727 * executed here.
3728 *
3729 * If the steal callout does not return a thread, the thread_select() logic calls
3730 * SCHED(processor_balance) callout which is supposed to IPI other CPUs to rebalance
3731 * threads and idle out the current CPU.
3732 *
3733 * = SCHED(steal_thread) for Edge Scheduler =
3734 *
3735 * The edge scheduler hooks into sched_edge_processor_idle() for steal_thread. This
3736 * routine tries to do the following operations in order:
3737 * (1) Find foreign runnable threads in non-native cluster
3738 * runqueues (sched_edge_foreign_runnable_thread_remove())
3739 * (2) Check if foreign threads are running on the non-native
3740 * clusters (sched_edge_foreign_running_thread_available())
3741 * - If yes, return THREAD_NULL for the steal callout and
3742 * perform rebalancing as part of SCHED(processor_balance) i.e. sched_edge_balance()
3743 * (3) Steal a thread from another cluster based on edge
3744 * weights (sched_edge_steal_thread())
3745 *
3746 * = SCHED(processor_balance) for Edge Scheduler =
3747 *
3748 * If steal_thread did not return a thread for the processor, use
3749 * sched_edge_balance() to rebalance foreign running threads and idle out this CPU.
3750 *
3751 * = Clutch Bucket Preferred Cluster Overrides =
3752 *
3753 * Since these operations (just like thread migrations on enqueue)
3754 * move threads across clusters, they need support for handling clutch
3755 * bucket group level preferred cluster recommendations.
3756 * For (1), a clutch bucket will be in the foreign runnable queue based
3757 * on the clutch bucket group preferred cluster.
3758 * For (2), the running thread will set the bit on the processor based
3759 * on its preferred cluster type.
3760 * For (3), the edge configuration would prevent threads from being stolen
3761 * in the wrong direction.
3762 *
3763 * = SCHED(thread_should_yield) =
3764 * The thread_should_yield() logic needs to have the same logic as sched_edge_processor_idle()
3765 * since that is expecting the same answer as if thread_select() was called on a core
3766 * with an empty runqueue.
3767 */
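/*
 * Condensed sketch of the idle-time ordering described above. The helper names
 * are placeholders; the actual implementation is sched_edge_processor_idle()
 * further below, which also attempts a steal from native clusters between
 * steps (1) and (2).
 *
 *	thread = foreign_runnable_remove();			// (1)
 *	if (thread == THREAD_NULL && foreign_running_available()) {
 *		return THREAD_NULL;				// (2) rebalance via sched_edge_balance()
 *	}
 *	if (thread == THREAD_NULL) {
 *		thread = steal_by_edge_weight();		// (3)
 *	}
 */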
3768
3769static bool
3770sched_edge_steal_thread_enabled(__unused processor_set_t pset)
3771{
3772 /*
3773 * For edge scheduler, the gating for steal is being done by sched_edge_steal_candidate()
3774 */
3775 return true;
3776}
3777
3778static processor_set_t
3779sched_edge_steal_candidate(processor_set_t pset)
3780{
3781 uint32_t dst_cluster_id = pset->pset_cluster_id;
3782 for (int cluster_id = 0; cluster_id < sched_edge_max_clusters; cluster_id++) {
3783 processor_set_t candidate_pset = pset_array[cluster_id];
3784 if (cluster_id == dst_cluster_id) {
3785 continue;
3786 }
3787 if (candidate_pset == NULL) {
3788 continue;
3789 }
3790 sched_clutch_edge *incoming_edge = &pset_array[cluster_id]->sched_edges[dst_cluster_id];
3791 if (incoming_edge->sce_steal_allowed && (bitmap_lsb_first(candidate_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) != -1)) {
3792 return candidate_pset;
3793 }
3794 }
3795 return NULL;
3796}
3797
3798static boolean_t
3799sched_edge_foreign_runnable_thread_available(processor_set_t pset)
3800{
3801 /* Find all the clusters that are foreign for this cluster */
3802 bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
3803 for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
3804 /*
3805 * For each cluster, see if there are any runnable foreign threads.
3806 * This check is currently being done without the pset lock to make it cheap for
3807 * the common case.
3808 */
3809 processor_set_t target_pset = pset_array[cluster];
3810 if (sched_edge_pset_available(target_pset) == false) {
3811 continue;
3812 }
3813
3814 if (!sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
3815 return true;
3816 }
3817 }
3818 return false;
3819}
3820
3821static thread_t
3822sched_edge_foreign_runnable_thread_remove(processor_set_t pset, uint64_t ctime)
3823{
3824 thread_t thread = THREAD_NULL;
3825
3826 /* Find all the clusters that are foreign for this cluster */
3827 bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
3828 for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
3829 /*
3830 * For each cluster, see if there are any runnable foreign threads.
3831 * This check is currently being done without the pset lock to make it cheap for
3832 * the common case.
3833 */
3834 processor_set_t target_pset = pset_array[cluster];
3835 if (sched_edge_pset_available(target_pset) == false) {
3836 continue;
3837 }
3838
3839 if (sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
3840 continue;
3841 }
3842 /*
3843 * Looks like there are runnable foreign threads in the hierarchy; lock the pset
3844 * and get the highest priority thread.
3845 */
3846 pset_lock(target_pset);
3847 if (sched_edge_pset_available(target_pset)) {
3848 thread = sched_clutch_root_highest_foreign_thread_remove(&target_pset->pset_clutch_root);
3849 sched_update_pset_load_average(target_pset, ctime);
3850 }
3851 pset_unlock(target_pset);
3852
3853 /*
3854 * Edge Scheduler Optimization
3855 *
3856 * The current implementation immediately returns as soon as it finds a foreign
3857 * runnable thread. This could be enhanced to look at highest priority threads
3858 * from all foreign clusters and pick the highest amongst them. That would need
3859 * some form of global state across psets to make that kind of check cheap.
3860 */
3861 if (thread != THREAD_NULL) {
3862 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNABLE) | DBG_FUNC_NONE, thread_tid(thread), pset->pset_cluster_id, target_pset->pset_cluster_id, 0);
3863 break;
3864 }
3865 /* Looks like the thread escaped after the check but before the pset lock was taken; continue the search */
3866 }
3867
3868 return thread;
3869}
3870
3871/*
3872 * sched_edge_cpu_running_foreign_shared_rsrc_available()
3873 *
3874 * Routine to determine if the thread running on a CPU is a shared resource thread
3875 * and can be rebalanced to the cluster with an idle CPU. It is used to determine if
3876 * a CPU going idle on a pset should rebalance a running shared-resource-heavy thread
3877 * from another, non-ideal cluster based on the idle pset's shared resource load.
3878 */
3879static boolean_t
3880sched_edge_cpu_running_foreign_shared_rsrc_available(processor_set_t target_pset, int foreign_cpu, processor_set_t idle_pset)
3881{
3882 boolean_t idle_pset_shared_rsrc_rr_idle = sched_edge_shared_rsrc_idle(idle_pset, CLUSTER_SHARED_RSRC_TYPE_RR);
3883 if (bit_test(target_pset->cpu_running_cluster_shared_rsrc_thread[CLUSTER_SHARED_RSRC_TYPE_RR], foreign_cpu) && !idle_pset_shared_rsrc_rr_idle) {
3884 return false;
3885 }
3886
3887 boolean_t idle_pset_shared_rsrc_biu_idle = sched_edge_shared_rsrc_idle(idle_pset, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST);
3888 if (bit_test(target_pset->cpu_running_cluster_shared_rsrc_thread[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST], foreign_cpu) && !idle_pset_shared_rsrc_biu_idle) {
3889 return false;
3890 }
3891 return true;
3892}
3893
3894static boolean_t
3895sched_edge_foreign_running_thread_available(processor_set_t pset)
3896{
3897 bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
3898 int cluster = -1;
3899 while ((cluster = sched_edge_iterate_clusters_ordered(pset, foreign_pset_bitmap[0], cluster)) != -1) {
3900 /* Skip the pset if its not schedulable */
3901 processor_set_t target_pset = pset_array[cluster];
3902 if (sched_edge_pset_available(target_pset) == false) {
3903 continue;
3904 }
3905
3906 uint64_t running_foreign_bitmap = target_pset->cpu_state_map[PROCESSOR_RUNNING] & target_pset->cpu_running_foreign;
3907 for (int cpu_foreign = bit_first(running_foreign_bitmap); cpu_foreign >= 0; cpu_foreign = bit_next(running_foreign_bitmap, cpu_foreign)) {
3908 if (!sched_edge_cpu_running_foreign_shared_rsrc_available(target_pset, cpu_foreign, pset)) {
3909 continue;
3910 }
3911 return true;
3912 }
3913 }
3914 return false;
3915}
3916
3917static bool
3918sched_edge_steal_possible(processor_set_t idle_pset, processor_set_t candidate_pset)
3919{
3920 int highest_runnable_bucket = bitmap_lsb_first(candidate_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
3921 if (highest_runnable_bucket == -1) {
3922 /* Candidate cluster runq is empty */
3923 return false;
3924 }
3925
3926 if (idle_pset->pset_cluster_type == candidate_pset->pset_cluster_type) {
3927 /* Always allow stealing from homogeneous clusters */
3928 return true;
3929 } else {
3930 /* Use the load metrics for highest runnable bucket since that would be stolen next */
3931 int32_t candidate_load = sched_edge_cluster_load_metric(candidate_pset, (sched_bucket_t)highest_runnable_bucket);
3932 return candidate_load > 0;
3933 }
3934}
3935
3936static thread_t
3937sched_edge_steal_thread(processor_set_t pset, uint64_t candidate_pset_bitmap)
3938{
3939 thread_t thread = THREAD_NULL;
3940
3941 /*
3942 * Edge Scheduler Optimization
3943 *
3944 * The logic today bails as soon as it finds a cluster where the cluster load is
3945 * greater than the edge weight. Maybe it should have a more advanced version
3946 * which looks for the maximum delta etc.
3947 */
3948 int cluster_id = -1;
3949 while ((cluster_id = sched_edge_iterate_clusters_ordered(pset, candidate_pset_bitmap, cluster_id)) != -1) {
3950 processor_set_t steal_from_pset = pset_array[cluster_id];
3951 if (steal_from_pset == NULL) {
3952 continue;
3953 }
3954 sched_clutch_edge *incoming_edge = &pset_array[cluster_id]->sched_edges[pset->pset_cluster_id];
3955 if (incoming_edge->sce_steal_allowed == false) {
3956 continue;
3957 }
3958 pset_lock(steal_from_pset);
3959 if (sched_edge_steal_possible(pset, steal_from_pset)) {
3960 uint64_t current_timestamp = mach_absolute_time();
3961 sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(&steal_from_pset->pset_clutch_root, current_timestamp, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY);
3962 thread = sched_clutch_thread_unbound_lookup(&steal_from_pset->pset_clutch_root, root_bucket);
3963 sched_clutch_thread_remove(&steal_from_pset->pset_clutch_root, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
3964 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STEAL) | DBG_FUNC_NONE, thread_tid(thread), pset->pset_cluster_id, steal_from_pset->pset_cluster_id, 0);
3965 sched_update_pset_load_average(steal_from_pset, current_timestamp);
3966 }
3967 pset_unlock(steal_from_pset);
3968 if (thread != THREAD_NULL) {
3969 break;
3970 }
3971 }
3972 return thread;
3973}
3974
3975/*
3976 * sched_edge_processor_idle()
3977 *
3978 * The routine is the implementation for steal_thread() for the Edge scheduler.
3979 */
3980static thread_t
3981sched_edge_processor_idle(processor_set_t pset)
3982{
3983 thread_t thread = THREAD_NULL;
3984
3985 uint64_t ctime = mach_absolute_time();
3986
3987 processor_t processor = current_processor();
3988 bit_clear(pset->pending_spill_cpu_mask, processor->cpu_id);
3989
3990 /* Each of the operations acquire the lock for the pset they target */
3991 pset_unlock(pset);
3992
3993 /* Find highest priority runnable thread on all non-native clusters */
3994 thread = sched_edge_foreign_runnable_thread_remove(pset, ctime);
3995 if (thread != THREAD_NULL) {
3996 return thread;
3997 }
3998
3999 /* Find highest priority runnable thread on all native clusters */
4000 thread = sched_edge_steal_thread(pset, pset->native_psets[0]);
4001 if (thread != THREAD_NULL) {
4002 return thread;
4003 }
4004
4005 /* Find foreign running threads to rebalance; the actual rebalance is done in sched_edge_balance() */
4006 boolean_t rebalance_needed = sched_edge_foreign_running_thread_available(pset);
4007 if (rebalance_needed) {
4008 return THREAD_NULL;
4009 }
4010
4011 /* No foreign threads found; find a thread to steal from all clusters based on weights/loads etc. */
4012 thread = sched_edge_steal_thread(pset, pset->native_psets[0] | pset->foreign_psets[0]);
4013 return thread;
4014}
4015
4016/* Return true if this shared resource thread has a better cluster to run on */
4017static bool
4018sched_edge_shared_rsrc_migrate_possible(thread_t thread, processor_set_t preferred_pset, processor_set_t current_pset)
4019{
4020 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4021 uint64_t current_pset_load = sched_pset_cluster_shared_rsrc_load(current_pset, shared_rsrc_type);
4022 /*
4023 * Adjust the current pset load to discount the current thread only if the current pset is a preferred pset type. This allows the
4024 * scheduler to rebalance threads from non-preferred cluster to an idle cluster of the preferred type.
4025 *
4026 * Edge Scheduler Optimization
4027 * For multi-cluster machines, it might be useful to enhance this mechanism to migrate between clusters of the preferred type.
4028 */
4029 uint64_t current_pset_adjusted_load = (current_pset->pset_type != preferred_pset->pset_type) ? current_pset_load : (current_pset_load - 1);
4030
4031 uint64_t eligible_pset_bitmask = 0;
4032 if (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST) {
4033 /*
4034 * For the EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST policy, the load balancing occurs
4035 * only among clusters native with the preferred cluster.
4036 */
4037 eligible_pset_bitmask = preferred_pset->native_psets[0];
4038 bit_set(eligible_pset_bitmask, preferred_pset->pset_cluster_id);
4039 } else {
4040 /* For EDGE_SHARED_RSRC_SCHED_POLICY_RR, the load balancing happens among all clusters */
4041 eligible_pset_bitmask = sched_edge_available_pset_bitmask[0];
4042 }
4043
4044 /* For each eligible cluster check if there is an under-utilized cluster; return true if there is */
4045 for (int cluster_id = bit_first(eligible_pset_bitmask); cluster_id >= 0; cluster_id = bit_next(eligible_pset_bitmask, cluster_id)) {
4046 if (cluster_id == current_pset->pset_cluster_id) {
4047 continue;
4048 }
4049 uint64_t cluster_load = sched_pset_cluster_shared_rsrc_load(pset_array[cluster_id], shared_rsrc_type);
4050 if (current_pset_adjusted_load > cluster_load) {
4051 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHARED_RSRC_MIGRATE) | DBG_FUNC_NONE, current_pset_load, current_pset->pset_cluster_id, cluster_load, cluster_id);
4052 return true;
4053 }
4054 }
4055 return false;
4056}
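/*
 * Worked example for the comparison above (illustrative counts): if the current
 * pset is of the preferred type and carries 3 threads of this shared resource
 * type, its adjusted load is 2 (the current thread discounts itself); if an
 * eligible sibling cluster carries only 1, then 2 > 1 and the routine returns
 * true, recommending migration toward the less loaded cluster.
 */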
4057
4058/* Return true if this thread should not continue running on this processor */
4059static bool
4060sched_edge_thread_avoid_processor(processor_t processor, thread_t thread, ast_t reason)
4061{
4062 processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
4063 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
4064 return sched_edge_shared_rsrc_migrate_possible(thread, preferred_pset, processor->processor_set);
4065 }
4066 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
4067 if (processor->processor_set->pset_type != preferred_pset->pset_type) {
4068 return true;
4069 }
4070 return sched_edge_shared_rsrc_migrate_possible(thread, preferred_pset, processor->processor_set);
4071 }
4072
4073 /*
4074 * For long running parallel workloads, it is important to rebalance threads across
4075 * E/P clusters so that they make equal forward progress. This is achieved through
4076 * threads expiring their quantum on the non-preferred cluster type and explicitly
4077 * rebalancing to the preferred cluster runqueue.
4078 */
4079 if (processor->processor_set->pset_type != preferred_pset->pset_type) {
4080 return true;
4081 }
4082 /* If the preferred pset for the thread is now idle, try and migrate thread to that cluster */
4083 if ((processor->processor_set != preferred_pset) &&
4084 (sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket) == 0)) {
4085 return true;
4086 }
4087
4088 /*
4089 * On quantum expiry, check the migration bitmask to see if this thread should be migrated off this core.
4090 * A migration is only recommended if there's also an idle core available that needn't be avoided.
4091 */
4092 if (reason & AST_QUANTUM) {
4093 if (bit_test(processor->processor_set->perfcontrol_cpu_migration_bitmask, processor->cpu_id)) {
4094 uint64_t non_avoided_idle_primary_map = processor->processor_set->cpu_state_map[PROCESSOR_IDLE] & processor->processor_set->recommended_bitmask & ~processor->processor_set->perfcontrol_cpu_migration_bitmask;
4095 if (non_avoided_idle_primary_map != 0) {
4096 return true;
4097 }
4098 }
4099 }
4100
4101 return false;
4102}
4103
4104static bool
4105sched_edge_balance(__unused processor_t cprocessor, processor_set_t cpset)
4106{
4107 assert(cprocessor == current_processor());
4108 pset_unlock(cpset);
4109
4110 uint64_t ast_processor_map = 0;
4111 sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
4112
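	/*
	 * Walk the clusters that are foreign with respect to this pset and look for CPUs
	 * currently running threads which are foreign to the cluster they are running on;
	 * such CPUs are candidates for a rebalance IPI so that their threads can migrate
	 * to a more appropriate cluster.
	 */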
4113 bitmap_t *foreign_pset_bitmap = cpset->foreign_psets;
4114 for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
		/* Skip the pset if it's not schedulable */
4116 processor_set_t target_pset = pset_array[cluster];
4117 if (sched_edge_pset_available(target_pset) == false) {
4118 continue;
4119 }
4120
4121 pset_lock(target_pset);
4122 uint64_t cpu_running_foreign_map = (target_pset->cpu_running_foreign & target_pset->cpu_state_map[PROCESSOR_RUNNING]);
4123 for (int cpuid = lsb_first(cpu_running_foreign_map); cpuid >= 0; cpuid = lsb_next(cpu_running_foreign_map, cpuid)) {
4124 if (!sched_edge_cpu_running_foreign_shared_rsrc_available(target_pset, cpuid, cpset)) {
4125 continue;
4126 }
4127 processor_t target_cpu = processor_array[cpuid];
			ipi_type[cpuid] = sched_ipi_action(target_cpu, NULL, SCHED_IPI_EVENT_REBALANCE);
4129 if (ipi_type[cpuid] != SCHED_IPI_NONE) {
4130 bit_set(ast_processor_map, cpuid);
4131 }
4132 }
4133 pset_unlock(target_pset);
4134 }
4135
4136 for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
4137 processor_t ast_processor = processor_array[cpuid];
4138 sched_ipi_perform(ast_processor, ipi_type[cpuid]);
4139 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNING) | DBG_FUNC_NONE, 0, cprocessor->cpu_id, cpuid, 0);
4140 }
4141
	/* The core should do a light-weight idle using WFE if it just sent out rebalance IPIs */
4143 return ast_processor_map != 0;
4144}
4145
4146/*
4147 * sched_edge_migration_check()
4148 *
4149 * Routine to evaluate an edge between two clusters to decide if migration is possible
4150 * across that edge. Also updates the selected_pset and max_edge_delta out parameters
 * accordingly. The return value indicates whether the invoking routine should short-circuit
 * the search because an ideal candidate has been found. The routine looks at either the
 * regular edges and cluster loads or the shared resource loads, based on the type of thread.
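 *
 * Illustrative example (hypothetical numbers): if the preferred cluster reports a load
 * of 100 for a non-shared-resource thread's bucket, a candidate cluster reports 20, and
 * the edge towards the candidate carries a migration weight of 64, the delta of 80 clears
 * the weight and the candidate becomes the current best choice; with a candidate load of
 * 50, the delta of 50 would fall below the weight and the candidate would be rejected.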
4154 */
4155static bool
4156sched_edge_migration_check(uint32_t cluster_id, processor_set_t preferred_pset,
4157 uint32_t preferred_cluster_load, thread_t thread, processor_set_t *selected_pset, uint32_t *max_edge_delta)
4158{
4159 uint32_t preferred_cluster_id = preferred_pset->pset_cluster_id;
4160 cluster_type_t preferred_cluster_type = pset_type_for_id(preferred_cluster_id);
4161 processor_set_t dst_pset = pset_array[cluster_id];
4162 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4163 bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
4164
4165 if (cluster_id == preferred_cluster_id) {
4166 return false;
4167 }
4168
4169 if (dst_pset == NULL) {
4170 return false;
4171 }
4172
4173 sched_clutch_edge *edge = preferred_pset->sched_edges;
4174 if (edge[cluster_id].sce_migration_allowed == false) {
4175 return false;
4176 }
4177 uint32_t dst_load = shared_rsrc_thread ? (uint32_t)sched_pset_cluster_shared_rsrc_load(dst_pset, shared_rsrc_type) : sched_edge_cluster_load_metric(dst_pset, thread->th_sched_bucket);
4178 if (dst_load == 0) {
4179 /* The candidate cluster is idle; select it immediately for execution */
4180 *selected_pset = dst_pset;
4181 *max_edge_delta = preferred_cluster_load;
4182 return true;
4183 }
4184
4185 uint32_t edge_delta = 0;
4186 if (dst_load > preferred_cluster_load) {
4187 return false;
4188 }
4189 edge_delta = preferred_cluster_load - dst_load;
4190 if (!shared_rsrc_thread && (edge_delta < edge[cluster_id].sce_migration_weight)) {
4191 /*
		 * For non-shared-resource threads, use the edge migration weight to decide whether
		 * the preferred cluster is over-committed enough at the QoS level of this thread,
		 * relative to the candidate, to justify the migration.
4194 */
4195 return false;
4196 }
4197
4198 if (edge_delta < *max_edge_delta) {
4199 return false;
4200 }
4201 if (edge_delta == *max_edge_delta) {
4202 /* If the edge delta is the same as the max delta, make sure a homogeneous cluster is picked */
4203 boolean_t selected_homogeneous = (pset_type_for_id((*selected_pset)->pset_cluster_id) == preferred_cluster_type);
4204 boolean_t candidate_homogeneous = (pset_type_for_id(dst_pset->pset_cluster_id) == preferred_cluster_type);
4205 if (selected_homogeneous || !candidate_homogeneous) {
4206 return false;
4207 }
4208 }
4209 /* dst_pset seems to be the best candidate for migration; however other candidates should still be evaluated */
4210 *max_edge_delta = edge_delta;
4211 *selected_pset = dst_pset;
4212 return false;
4213}
4214
4215/*
4216 * sched_edge_iterate_clusters_ordered()
4217 *
 * Routine to iterate clusters in die-local order. For multi-die machines,
 * the routine ensures that the candidate clusters on the same die as the
 * passed-in pset are returned before the remote-die clusters. This should
4221 * be used in all places where cluster selection in die order matters.
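 *
 * For example (hypothetical topology): if clusters 0 and 1 share a die with the
 * starting pset while cluster 2 sits on a remote die, successive calls return both
 * local candidates before the remote one, with each call resuming from previous_cluster.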
4222 */
4223
4224static int
4225sched_edge_iterate_clusters_ordered(processor_set_t starting_pset, uint64_t candidate_map, int previous_cluster)
4226{
4227 int cluster_id = -1;
4228
4229 uint64_t local_candidate_map = starting_pset->local_psets[0] & candidate_map;
4230 uint64_t remote_candidate_map = starting_pset->remote_psets[0] & candidate_map;
4231
4232 if (previous_cluster == -1) {
4233 /* previous_cluster == -1 indicates the initial condition */
4234 cluster_id = bit_first(local_candidate_map);
4235 if (cluster_id != -1) {
4236 return cluster_id;
4237 }
4238 return bit_first(remote_candidate_map);
4239 } else {
4240 /*
4241 * After the initial condition, the routine attempts to return a
4242 * cluster in the previous_cluster's locality. If none is available,
4243 * it looks at remote clusters.
4244 */
4245 if (bit_test(local_candidate_map, previous_cluster)) {
4246 cluster_id = bit_next(local_candidate_map, previous_cluster);
4247 if (cluster_id != -1) {
4248 return cluster_id;
4249 } else {
4250 return bit_first(remote_candidate_map);
4251 }
4252 }
4253 return bit_next(remote_candidate_map, previous_cluster);
4254 }
4255}
4256
4257/*
4258 * sched_edge_migrate_edges_evaluate()
4259 *
4260 * Routine to find the candidate for thread migration based on edge weights.
4261 *
4262 * Returns the most ideal cluster for execution of this thread based on outgoing edges of the preferred pset. Can
 * return preferred_pset if it is the best destination for this thread.
4264 */
4265static processor_set_t
4266sched_edge_migrate_edges_evaluate(processor_set_t preferred_pset, uint32_t preferred_cluster_load, thread_t thread)
4267{
4268 processor_set_t selected_pset = preferred_pset;
4269 uint32_t max_edge_delta = 0;
4270 bool search_complete = false;
4271 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4272 bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
4273
4274 bitmap_t *foreign_pset_bitmap = preferred_pset->foreign_psets;
4275 bitmap_t *native_pset_bitmap = preferred_pset->native_psets;
4276 /* Always start the search with the native clusters */
4277 int cluster_id = -1;
4278 while ((cluster_id = sched_edge_iterate_clusters_ordered(preferred_pset, native_pset_bitmap[0], cluster_id)) != -1) {
4279 search_complete = sched_edge_migration_check(cluster_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta);
4280 if (search_complete) {
4281 break;
4282 }
4283 }
4284
4285 if (search_complete) {
4286 return selected_pset;
4287 }
4288
4289 if (shared_rsrc_thread && (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST)) {
4290 /*
4291 * If the shared resource scheduling policy is EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST, the scheduler tries
4292 * to fill up the preferred cluster and its homogeneous peers first.
4293 */
4294
4295 if (max_edge_delta > 0) {
4296 /*
			 * A non-zero max_edge_delta means there is a peer cluster of the same type as the
			 * preferred cluster (since the code above only looks at the native_psets) which is
			 * running fewer shared-resource threads than the preferred cluster. This indicates
			 * that there is capacity on a native cluster where this thread should be placed.
4301 */
4302 return selected_pset;
4303 }
4304 /*
		 * All peer native clusters are at the same or higher shared resource usage;
		 * check if the preferred cluster has any capacity left.
4307 */
4308 if (sched_pset_cluster_shared_rsrc_load(preferred_pset, shared_rsrc_type) < pset_available_cpu_count(preferred_pset)) {
4309 return preferred_pset;
4310 }
4311 /*
		 * Looks like the preferred cluster and all its native peers are full of shared resource
		 * threads; we need to start looking at non-native clusters for capacity.
4314 */
4315 }
4316
4317 /* Now look at the non-native clusters */
4318 cluster_id = -1;
4319 while ((cluster_id = sched_edge_iterate_clusters_ordered(preferred_pset, foreign_pset_bitmap[0], cluster_id)) != -1) {
4320 search_complete = sched_edge_migration_check(cluster_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta);
4321 if (search_complete) {
4322 break;
4323 }
4324 }
4325 return selected_pset;
4326}
4327
4328/*
4329 * sched_edge_candidate_alternative()
4330 *
4331 * Routine to find an alternative cluster from candidate_cluster_bitmap since the
4332 * selected_pset is not available for execution. The logic tries to prefer homogeneous
4333 * clusters over heterogeneous clusters since this is typically used in thread
4334 * placement decisions.
4335 */
4336_Static_assert(MAX_PSETS <= 64, "Unable to fit maximum number of psets in uint64_t bitmask");
4337static processor_set_t
4338sched_edge_candidate_alternative(processor_set_t selected_pset, uint64_t candidate_cluster_bitmap)
4339{
4340 /*
4341 * It looks like the most ideal pset is not available for scheduling currently.
4342 * Try to find a homogeneous cluster that is still available.
4343 */
4344 uint64_t available_native_clusters = selected_pset->native_psets[0] & candidate_cluster_bitmap;
4345 int available_cluster_id = lsb_first(available_native_clusters);
4346 if (available_cluster_id == -1) {
4347 /* Looks like none of the homogeneous clusters are available; pick the first available cluster */
4348 available_cluster_id = bit_first(candidate_cluster_bitmap);
4349 }
4350 assert(available_cluster_id != -1);
4351 return pset_array[available_cluster_id];
4352}
4353
4354/*
4355 * sched_edge_switch_pset_lock()
4356 *
4357 * Helper routine for sched_edge_migrate_candidate() which switches pset locks (if needed) based on
4358 * switch_pset_locks.
4359 * Returns the newly locked pset after the switch.
4360 */
4361static processor_set_t
4362sched_edge_switch_pset_lock(processor_set_t selected_pset, processor_set_t locked_pset, bool switch_pset_locks)
4363{
4364 if (!switch_pset_locks) {
4365 return locked_pset;
4366 }
4367 if (selected_pset != locked_pset) {
4368 pset_unlock(locked_pset);
4369 pset_lock(selected_pset);
4370 return selected_pset;
4371 } else {
4372 return locked_pset;
4373 }
4374}
4375
4376/*
4377 * sched_edge_amp_rebalance_pset()
4378 *
 * Routine to decide where a thread that is eligible for AMP rebalance (i.e.
 * has executed on the non-preferred cluster type for a while) should be enqueued.
4381 * The algorithm maintains a history of AMP rebalance decisions on the clutch
4382 * bucket group of the workload and round-robins between clusters to ensure
4383 * that all threads get a chance on the performance cores and make equal
4384 * progress.
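 *
 * For example (hypothetical cluster ids): with eligible clusters {0, 2}, a last-chosen
 * value of 0 rotates the next rebalance to cluster 2, and a last-chosen value of 2
 * wraps back around to cluster 0.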
4385 */
4386static processor_set_t
4387sched_edge_amp_rebalance_pset(processor_set_t preferred_pset, thread_t thread)
4388{
4389 sched_clutch_t clutch = sched_clutch_for_thread(thread);
4390 sched_clutch_bucket_group_t clutch_bucket_group = &clutch->sc_clutch_groups[thread->th_sched_bucket];
4391
4392 uint32_t last_chosen_cluster, new_chosen_cluster;
4393
4394 /* Only AMP rebalance within clusters native to the preferred cluster */
4395 uint64_t eligible_pset_bitmask = preferred_pset->native_psets[0];
4396 /* Preferred cluster is also eligible for rebalancing */
4397 bit_set(eligible_pset_bitmask, preferred_pset->pset_cluster_id);
4398 /* Atomically update the AMP rebalance cluster for the clutch bucket group */
4399 os_atomic_rmw_loop(&clutch_bucket_group->scbg_amp_rebalance_last_chosen, last_chosen_cluster, new_chosen_cluster, relaxed, {
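		/*
		 * If no valid last-chosen cluster is recorded for this clutch bucket group
		 * (UINT32_MAX), start at the preferred cluster.
		 */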
4400 if (last_chosen_cluster == UINT32_MAX) {
4401 new_chosen_cluster = preferred_pset->pset_cluster_id;
4402 } else {
4403 new_chosen_cluster = lsb_next(eligible_pset_bitmask, last_chosen_cluster);
4404 if (new_chosen_cluster == -1) {
4405 /* Rotate to the start of the eligible bitmask */
4406 new_chosen_cluster = lsb_first(eligible_pset_bitmask);
4407 }
4408 }
4409 });
4410 return pset_array[new_chosen_cluster];
4411}
4412
4413/*
4414 * sched_edge_migrate_candidate()
4415 *
4416 * Routine to find an appropriate cluster for scheduling a thread. The routine looks at the properties of
4417 * the thread and the preferred cluster to determine the best available pset for scheduling.
4418 *
4419 * The switch_pset_locks parameter defines whether the routine should switch pset locks to provide an
4420 * accurate scheduling decision. This mode is typically used when choosing a pset for scheduling a thread since the
4421 * decision has to be synchronized with another CPU changing the recommendation of clusters available
 * on the system. If this parameter is set to false, this routine returns a best-effort indication of
 * the cluster the thread should be scheduled on. It is typically used in fast path contexts (such as
 * SCHED(thread_avoid_processor)) to determine if there is a possibility of scheduling this thread on a
 * more appropriate cluster.
4426 *
4427 * Routine returns the most ideal cluster for scheduling. If switch_pset_locks is set, it ensures that the
4428 * resultant pset lock is held.
4429 */
4430static processor_set_t
4431sched_edge_migrate_candidate(_Nullable processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks)
4432{
4433 processor_set_t selected_pset = preferred_pset;
4434 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4435 bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
4436
4437 if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
		/* For bound threads, always recommend the cluster the thread is bound to */
4439 selected_pset = pset_array[sched_edge_thread_bound_cluster_id(thread)];
4440 if (selected_pset != NULL) {
4441 locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
4442 if (sched_edge_pset_available(selected_pset) || (SCHED_CLUTCH_THREAD_CLUSTER_BOUND_SOFT(thread) == false)) {
4443 /*
4444 * If the bound cluster is not available, check if the thread is soft bound. For soft bound threads,
4445 * fall through to the regular cluster selection logic which handles unavailable clusters
4446 * appropriately. If the thread is hard bound, then return the bound cluster always.
4447 */
4448 return selected_pset;
4449 }
4450 }
4451 }
4452
4453 uint64_t candidate_cluster_bitmap = mask(sched_edge_max_clusters);
4454#if DEVELOPMENT || DEBUG
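	/*
	 * Development-only override: if the task has opted into using its pset hint for
	 * cluster type selection, honor the hint whenever it names a different cluster
	 * type than the currently selected pset.
	 */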
4455 extern int enable_task_set_cluster_type;
4456 task_t task = get_threadtask(thread);
4457 if (enable_task_set_cluster_type && (task->t_flags & TF_USE_PSET_HINT_CLUSTER_TYPE)) {
4458 processor_set_t pset_hint = task->pset_hint;
4459 if (pset_hint && (selected_pset == NULL || selected_pset->pset_cluster_type != pset_hint->pset_cluster_type)) {
4460 selected_pset = pset_hint;
4461 goto migrate_candidate_available_check;
4462 }
4463 }
4464#endif
4465
4466 if (preferred_pset == NULL) {
4467 /* The preferred_pset has not finished initializing at boot */
4468 goto migrate_candidate_available_check;
4469 }
4470
4471 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		/* For realtime threads, always try to schedule them on the preferred pset */
4473 goto migrate_candidate_available_check;
4474 }
4475
4476 uint32_t preferred_cluster_load = shared_rsrc_thread ? (uint32_t)sched_pset_cluster_shared_rsrc_load(preferred_pset, shared_rsrc_type) : sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket);
4477 if (preferred_cluster_load == 0) {
4478 goto migrate_candidate_available_check;
4479 }
4480
4481 /*
4482 * If a thread is being rebalanced for achieving equal progress of parallel workloads,
4483 * it needs to end up on the preferred runqueue. This mechanism should only be used for
4484 * threads which have been previously migrated to the non-preferred cluster type.
4485 *
4486 * The AMP rebalancing mechanism is available for regular threads or shared resource
4487 * threads with the EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST policy.
4488 */
4489 bool amp_rebalance_eligible = (!shared_rsrc_thread) || (shared_rsrc_thread && (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST));
4490 if (amp_rebalance_eligible) {
4491 boolean_t amp_rebalance = (thread->reason & (AST_REBALANCE | AST_QUANTUM)) == (AST_REBALANCE | AST_QUANTUM);
4492 if (amp_rebalance) {
4493 boolean_t non_preferred_pset = (thread->last_processor->processor_set->pset_type != preferred_pset->pset_type);
4494 if (non_preferred_pset) {
4495 selected_pset = sched_edge_amp_rebalance_pset(preferred_pset, thread);
4496 goto migrate_candidate_available_check;
4497 }
4498 }
4499 }
4500
4501 /* Look at edge weights to decide the most ideal migration candidate for this thread */
4502 selected_pset = sched_edge_migrate_edges_evaluate(preferred_pset, preferred_cluster_load, thread);
4503
4504migrate_candidate_available_check:
4505 if (selected_pset == NULL) {
4506 /* The selected_pset has not finished initializing at boot */
4507 pset_unlock(locked_pset);
4508 return NULL;
4509 }
4510
4511 locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
4512 if (sched_edge_pset_available(selected_pset) == true) {
4513 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_OVERLOAD) | DBG_FUNC_NONE, thread_tid(thread), preferred_pset->pset_cluster_id, selected_pset->pset_cluster_id, preferred_cluster_load);
4514 return selected_pset;
4515 }
4516 /* Looks like selected_pset is not available for scheduling; remove it from candidate_cluster_bitmap */
4517 bitmap_clear(&candidate_cluster_bitmap, selected_pset->pset_cluster_id);
4518 if (__improbable(bitmap_first(&candidate_cluster_bitmap, sched_edge_max_clusters) == -1)) {
4519 pset_unlock(locked_pset);
4520 return NULL;
4521 }
4522 /* Try and find an alternative for the selected pset */
4523 selected_pset = sched_edge_candidate_alternative(selected_pset, candidate_cluster_bitmap);
4524 goto migrate_candidate_available_check;
4525}
4526
4527static processor_t
4528sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread)
4529{
4530 /* Bound threads don't call this function */
4531 assert(thread->bound_processor == PROCESSOR_NULL);
4532 processor_t chosen_processor = PROCESSOR_NULL;
4533
4534 /*
4535 * sched_edge_preferred_pset() returns the preferred pset for a given thread.
	 * It should take the passed-in "pset" as a hint which represents the recency metric for
4537 * pset selection logic.
4538 */
4539 processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
4540 processor_set_t chosen_pset = preferred_pset;
4541 /*
4542 * If the preferred pset is overloaded, find a pset which is the best candidate to migrate
4543 * threads to. sched_edge_migrate_candidate() returns the preferred pset
4544 * if it has capacity; otherwise finds the best candidate pset to migrate this thread to.
4545 *
4546 * Edge Scheduler Optimization
4547 * It might be useful to build a recency metric for the thread for multiple clusters and
4548 * factor that into the migration decisions.
4549 */
4550 chosen_pset = sched_edge_migrate_candidate(preferred_pset, thread, pset, true);
4551 if (chosen_pset) {
4552 chosen_processor = choose_processor(chosen_pset, processor, thread);
4553 }
4554 /* For RT threads, choose_processor() can return a different cluster than the one passed into it */
4555 assert(chosen_processor ? chosen_processor->processor_set->pset_type == chosen_pset->pset_type : true);
4556 return chosen_processor;
4557}
4558
4559/*
4560 * sched_edge_clutch_bucket_threads_drain()
4561 *
4562 * Drains all the runnable threads which are not restricted to the root_clutch (due to clutch
4563 * bucket overrides etc.) into a local thread queue.
4564 */
4565static void
4566sched_edge_clutch_bucket_threads_drain(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch, queue_t clutch_threads)
4567{
4568 thread_t thread = THREAD_NULL;
4569 uint64_t current_timestamp = mach_approximate_time();
4570 qe_foreach_element_safe(thread, &clutch_bucket->scb_thread_timeshare_queue, th_clutch_timeshare_link) {
4571 sched_clutch_thread_remove(root_clutch, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
4572 enqueue_tail(clutch_threads, &thread->runq_links);
4573 }
4574}
4575
4576/*
4577 * sched_edge_run_drained_threads()
4578 *
4579 * Makes all drained threads in a local queue runnable.
4580 */
4581static void
4582sched_edge_run_drained_threads(queue_t clutch_threads)
4583{
4584 thread_t thread;
4585 /* Now setrun all the threads in the local queue */
4586 qe_foreach_element_safe(thread, clutch_threads, runq_links) {
4587 remqueue(&thread->runq_links);
4588 thread_lock(thread);
4589 thread_setrun(thread, SCHED_TAILQ);
4590 thread_unlock(thread);
4591 }
4592}
4593
4594/*
4595 * sched_edge_update_preferred_cluster()
4596 *
4597 * Routine to update the preferred cluster for QoS buckets within a thread group.
 * The buckets to be updated are specified as a bitmap (clutch_bucket_modify_bitmap).
4599 */
4600static void
4601sched_edge_update_preferred_cluster(
4602 sched_clutch_t sched_clutch,
4603 bitmap_t *clutch_bucket_modify_bitmap,
4604 uint32_t *tg_bucket_preferred_cluster)
4605{
4606 for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
4607 os_atomic_store(&sched_clutch->sc_clutch_groups[bucket].scbg_preferred_cluster, tg_bucket_preferred_cluster[bucket], relaxed);
4608 }
4609}
4610
4611/*
4612 * sched_edge_migrate_thread_group_runnable_threads()
4613 *
4614 * Routine to implement the migration of threads on a cluster when the thread group
4615 * recommendation is updated. The migration works using a 2-phase
4616 * algorithm.
4617 *
4618 * Phase 1: With the pset lock held, check the recommendation of the clutch buckets.
4619 * For each clutch bucket, if it needs to be migrated immediately, drain the threads
4620 * into a local thread queue. Otherwise mark the clutch bucket as native/foreign as
4621 * appropriate.
4622 *
4623 * Phase 2: After unlocking the pset, drain all the threads from the local thread
4624 * queue and mark them runnable which should land them in the right hierarchy.
4625 *
4626 * The routine assumes that the preferences for the clutch buckets/clutch bucket
4627 * groups have already been updated by the caller.
4628 *
4629 * - Called with the pset locked and interrupts disabled.
4630 * - Returns with the pset unlocked.
4631 */
4632static void
4633sched_edge_migrate_thread_group_runnable_threads(
4634 sched_clutch_t sched_clutch,
4635 sched_clutch_root_t root_clutch,
4636 bitmap_t *clutch_bucket_modify_bitmap,
4637 __unused uint32_t *tg_bucket_preferred_cluster,
4638 bool migrate_immediately)
4639{
4640 /* Queue to hold threads that have been drained from clutch buckets to be migrated */
4641 queue_head_t clutch_threads;
4642 queue_init(&clutch_threads);
4643
4644 for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
4645 /* Get the clutch bucket for this cluster and sched bucket */
4646 sched_clutch_bucket_group_t clutch_bucket_group = &(sched_clutch->sc_clutch_groups[bucket]);
4647 sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
4648 sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed);
4649 if (scb_root == NULL) {
4650 /* Clutch bucket not runnable or already in the right hierarchy; nothing to do here */
4651 assert(clutch_bucket->scb_thr_count == 0);
4652 continue;
4653 }
4654 assert(scb_root == root_clutch);
4655 uint32_t clutch_bucket_preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
4656
4657 if (migrate_immediately) {
4658 /*
4659 * For transitions where threads need to be migrated immediately, drain the threads into a
4660 * local queue unless we are looking at the clutch buckets for the newly recommended
4661 * cluster.
4662 */
4663 if (root_clutch->scr_cluster_id != clutch_bucket_preferred_cluster) {
4664 sched_edge_clutch_bucket_threads_drain(clutch_bucket, scb_root, &clutch_threads);
4665 } else {
4666 sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
4667 }
4668 } else {
4669 /* Check if this cluster is the same type as the newly recommended cluster */
4670 boolean_t homogeneous_cluster = (pset_type_for_id(root_clutch->scr_cluster_id) == pset_type_for_id(clutch_bucket_preferred_cluster));
4671 /*
4672 * If threads do not have to be migrated immediately, just change the native/foreign
4673 * flag on the clutch bucket.
4674 */
4675 if (homogeneous_cluster) {
4676 sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
4677 } else {
4678 sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
4679 }
4680 }
4681 }
4682
4683 pset_unlock(root_clutch->scr_pset);
4684 sched_edge_run_drained_threads(&clutch_threads);
4685}
4686
4687/*
4688 * sched_edge_migrate_thread_group_running_threads()
4689 *
4690 * Routine to find all running threads of a thread group on a specific cluster
4691 * and IPI them if they need to be moved immediately.
4692 */
4693static void
4694sched_edge_migrate_thread_group_running_threads(
4695 sched_clutch_t sched_clutch,
4696 sched_clutch_root_t root_clutch,
4697 __unused bitmap_t *clutch_bucket_modify_bitmap,
4698 uint32_t *tg_bucket_preferred_cluster,
4699 bool migrate_immediately)
4700{
4701 if (migrate_immediately == false) {
4702 /* If CLPC has recommended not to move threads immediately, nothing to do here */
4703 return;
4704 }
4705
4706 /*
4707 * Edge Scheduler Optimization
4708 *
4709 * When the system has a large number of clusters and cores, it might be useful to
4710 * narrow down the iteration by using a thread running bitmap per clutch.
4711 */
4712 uint64_t ast_processor_map = 0;
4713 sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
4714
4715 uint64_t running_map = root_clutch->scr_pset->cpu_state_map[PROCESSOR_RUNNING];
4716 /*
	 * Iterate all CPUs and look for the ones running threads from this thread group which are
	 * not restricted to this specific cluster (due to overrides etc.)
4719 */
4720 for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
4721 processor_t src_processor = processor_array[cpuid];
4722 boolean_t expected_tg = (src_processor->current_thread_group == sched_clutch->sc_tg);
4723 sched_bucket_t processor_sched_bucket = src_processor->processor_set->cpu_running_buckets[cpuid];
4724 if (processor_sched_bucket == TH_BUCKET_SCHED_MAX) {
4725 continue;
4726 }
4727 boolean_t non_preferred_cluster = tg_bucket_preferred_cluster[processor_sched_bucket] != root_clutch->scr_cluster_id;
4728
4729 if (expected_tg && non_preferred_cluster) {
4730 ipi_type[cpuid] = sched_ipi_action(src_processor, NULL, SCHED_IPI_EVENT_REBALANCE);
4731 if (ipi_type[cpuid] != SCHED_IPI_NONE) {
4732 bit_set(ast_processor_map, cpuid);
4733 } else if (src_processor == current_processor()) {
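				/*
				 * The thread that needs to migrate is running on the current processor;
				 * no cross-CPU IPI was generated, so mark a pending preemption for this
				 * CPU and request the AST locally instead.
				 */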
4734 bit_set(root_clutch->scr_pset->pending_AST_PREEMPT_cpu_mask, cpuid);
4735 ast_t new_preempt = update_pending_nonurgent_preemption(src_processor, AST_PREEMPT);
4736 ast_on(new_preempt);
4737 }
4738 }
4739 }
4740
4741 /* Perform all the IPIs */
4742 if (bit_first(ast_processor_map) != -1) {
4743 for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
4744 processor_t ast_processor = processor_array[cpuid];
4745 sched_ipi_perform(ast_processor, ipi_type[cpuid]);
4746 }
4747 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, thread_group_get_id(sched_clutch->sc_tg), ast_processor_map, 0, 0);
4748 }
4749}
4750
4751/*
4752 * sched_edge_tg_preferred_cluster_change()
4753 *
4754 * Routine to handle changes to a thread group's recommendation. In the Edge Scheduler, the preferred cluster
4755 * is specified on a per-QoS basis within a thread group. The routine updates the preferences and performs
4756 * thread migrations based on the policy specified by CLPC.
4757 * tg_bucket_preferred_cluster is an array of size TH_BUCKET_SCHED_MAX which specifies the new preferred cluster
4758 * for each QoS within the thread group.
4759 */
4760void
4761sched_edge_tg_preferred_cluster_change(struct thread_group *tg, uint32_t *tg_bucket_preferred_cluster, sched_perfcontrol_preferred_cluster_options_t options)
4762{
4763 sched_clutch_t clutch = sched_clutch_for_thread_group(tg);
4764 /*
4765 * In order to optimize the processing, create a bitmap which represents all QoS buckets
4766 * for which the preferred cluster has changed.
4767 */
4768 bitmap_t clutch_bucket_modify_bitmap[BITMAP_LEN(TH_BUCKET_SCHED_MAX)] = {0};
4769 for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
4770 uint32_t old_preferred_cluster = sched_edge_clutch_bucket_group_preferred_cluster(&clutch->sc_clutch_groups[bucket]);
4771 uint32_t new_preferred_cluster = tg_bucket_preferred_cluster[bucket];
4772 if (old_preferred_cluster != new_preferred_cluster) {
4773 bitmap_set(clutch_bucket_modify_bitmap, bucket);
4774 }
4775 }
4776 if (bitmap_lsb_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
4777 /* No changes in any clutch buckets; nothing to do here */
4778 return;
4779 }
4780
4781 for (uint32_t cluster_id = 0; cluster_id < sched_edge_max_clusters; cluster_id++) {
4782 processor_set_t pset = pset_array[cluster_id];
4783 spl_t s = splsched();
4784 pset_lock(pset);
4785 /*
4786 * The first operation is to update the preferred cluster for all QoS buckets within the
4787 * thread group so that any future threads becoming runnable would see the new preferred
4788 * cluster value.
4789 */
4790 sched_edge_update_preferred_cluster(clutch, clutch_bucket_modify_bitmap, tg_bucket_preferred_cluster);
4791 /*
4792 * Currently iterates all clusters looking for running threads for a TG to be migrated. Can be optimized
4793 * by keeping a per-clutch bitmap of clusters running threads for a particular TG.
4794 *
4795 * Edge Scheduler Optimization
4796 */
4797 /* Migrate all running threads of the TG on this cluster based on options specified by CLPC */
4798 sched_edge_migrate_thread_group_running_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
4799 tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNING));
4800 /* Migrate all runnable threads of the TG in this cluster's hierarchy based on options specified by CLPC */
4801 sched_edge_migrate_thread_group_runnable_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
4802 tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNABLE));
4803 /* sched_edge_migrate_thread_group_runnable_threads() returns with pset unlocked */
4804 splx(s);
4805 }
4806}
4807
4808/*
4809 * sched_edge_pset_made_schedulable()
4810 *
4811 * Routine to migrate all the clutch buckets which are not in their recommended
 * pset hierarchy now that a new pset has become schedulable. It's possible that this
 * routine is called when the pset is already marked schedulable.
4814 *
4815 * Invoked with the pset lock held and interrupts disabled.
4816 */
4817static void
4818sched_edge_pset_made_schedulable(__unused processor_t processor, processor_set_t dst_pset, boolean_t drop_lock)
4819{
4820 if (bitmap_test(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id)) {
4821 /* Nothing to do here since pset is already marked schedulable */
4822 if (drop_lock) {
4823 pset_unlock(dst_pset);
4824 }
4825 return;
4826 }
4827
4828 bitmap_set(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id);
4829
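	/*
	 * sched_edge_processor_idle() returns with dst_pset unlocked, which is why the
	 * lock is re-acquired below only when the caller expects it to still be held.
	 */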
4830 thread_t thread = sched_edge_processor_idle(dst_pset);
4831 if (thread != THREAD_NULL) {
4832 thread_lock(thread);
4833 thread_setrun(thread, SCHED_TAILQ);
4834 thread_unlock(thread);
4835 }
4836
4837 if (!drop_lock) {
4838 pset_lock(dst_pset);
4839 }
4840}
4841
4842/*
4843 * sched_edge_cpu_init_completed()
4844 *
4845 * Callback routine from the platform layer once all CPUs/clusters have been initialized. This
4846 * provides an opportunity for the edge scheduler to initialize all the edge parameters.
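 *
 * The default edge matrix configured here allows migration and stealing freely between
 * clusters of the same type, gates P->E migration behind a non-zero migration weight,
 * and disallows E->P migration and stealing entirely.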
4847 */
4848static void
4849sched_edge_cpu_init_completed(void)
4850{
4851 spl_t s = splsched();
4852 for (int src_cluster_id = 0; src_cluster_id < sched_edge_max_clusters; src_cluster_id++) {
4853 processor_set_t src_pset = pset_array[src_cluster_id];
4854 pset_lock(src_pset);
4855
4856 /* For each cluster, set all its outgoing edge parameters */
4857 for (int dst_cluster_id = 0; dst_cluster_id < sched_edge_max_clusters; dst_cluster_id++) {
4858 if (dst_cluster_id == src_cluster_id) {
4859 continue;
4860 }
4861 processor_set_t dst_pset = pset_array[dst_cluster_id];
4862 if (src_pset->pset_type == dst_pset->pset_type) {
4863 /* P->P/E->E edge config */
4864 bitmap_clear(src_pset->foreign_psets, dst_cluster_id);
4865 bitmap_set(src_pset->native_psets, dst_cluster_id);
4866 sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 1, .sce_steal_allowed = 1});
4867 } else if ((src_pset->pset_type == CLUSTER_TYPE_P) && (dst_pset->pset_type == CLUSTER_TYPE_E)) {
4868 /* P->E edge config */
4869 bitmap_set(src_pset->foreign_psets, dst_cluster_id);
4870 bitmap_clear(src_pset->native_psets, dst_cluster_id);
4871 sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 64, .sce_migration_allowed = 1, .sce_steal_allowed = 1});
4872 } else {
4873 /* E->P edge config */
4874 bitmap_set(src_pset->foreign_psets, dst_cluster_id);
4875 bitmap_clear(src_pset->native_psets, dst_cluster_id);
4876 sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0});
4877 }
4878 bool clusters_local = (ml_get_die_id(src_cluster_id) == ml_get_die_id(dst_cluster_id));
4879 if (clusters_local) {
4880 bitmap_set(src_pset->local_psets, dst_cluster_id);
4881 bitmap_clear(src_pset->remote_psets, dst_cluster_id);
4882 } else {
4883 bitmap_set(src_pset->remote_psets, dst_cluster_id);
4884 bitmap_clear(src_pset->local_psets, dst_cluster_id);
4885 }
4886 }
4887
4888 pset_unlock(src_pset);
4889 }
4890 splx(s);
4891}
4892
4893static bool
4894sched_edge_thread_eligible_for_pset(thread_t thread, processor_set_t pset)
4895{
4896 uint32_t preferred_cluster_id = sched_edge_thread_preferred_cluster(thread);
4897 if (preferred_cluster_id == pset->pset_cluster_id) {
4898 return true;
4899 } else {
4900 processor_set_t preferred_pset = pset_array[preferred_cluster_id];
4901 return preferred_pset->sched_edges[pset->pset_cluster_id].sce_migration_allowed;
4902 }
4903}
4904
4905extern int sched_amp_spill_deferred_ipi;
4906extern int sched_amp_pcores_preempt_immediate_ipi;
4907
4908int sched_edge_migrate_ipi_immediate = 1;
4909
4910sched_ipi_type_t
4911sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
4912{
4913 processor_set_t pset = dst->processor_set;
4914 assert(dst != current_processor());
4915
4916 boolean_t deferred_ipi_supported = false;
4917#if defined(CONFIG_SCHED_DEFERRED_AST)
4918 deferred_ipi_supported = true;
4919#endif /* CONFIG_SCHED_DEFERRED_AST */
4920
4921 switch (event) {
4922 case SCHED_IPI_EVENT_SPILL:
4923 /* For Spill event, use deferred IPIs if sched_amp_spill_deferred_ipi set */
		if (deferred_ipi_supported && sched_amp_spill_deferred_ipi) {
4925 return sched_ipi_deferred_policy(pset, dst, thread, event);
4926 }
4927 break;
4928 case SCHED_IPI_EVENT_PREEMPT:
		/*
		 * For preemption, the default policy is to use deferred IPIs
		 * for non-RT P-core preemption. Override that behavior if
		 * sched_amp_pcores_preempt_immediate_ipi is set.
		 */
4933 if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
4934 if (sched_amp_pcores_preempt_immediate_ipi && (pset_type_for_id(pset->pset_cluster_id) == CLUSTER_TYPE_P)) {
4935 return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4936 }
4937 if (sched_edge_migrate_ipi_immediate) {
4938 processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
4939 /*
4940 * For IPI'ing CPUs that are homogeneous with the preferred cluster, use immediate IPIs
4941 */
4942 if (preferred_pset->pset_type == pset->pset_type) {
4943 return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4944 }
4945 /*
				 * For workloads that are going wide, it might be useful to use an immediate IPI to
				 * wake up the idle CPU if the scheduler estimates that the preferred pset will
				 * be busy for the deferred IPI timeout. The Edge Scheduler uses the average execution
				 * latency on the preferred pset as an estimate of busyness.
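				 * The per-bucket average execution time is maintained in microseconds, hence the
				 * NSEC_PER_USEC scaling before comparing it with the deferred IPI timeout
				 * (which is expressed in nanoseconds).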
4950 */
4951 if ((preferred_pset->pset_execution_time[thread->th_sched_bucket].pset_avg_thread_execution_time * NSEC_PER_USEC) >= ml_cpu_signal_deferred_get_timer()) {
4952 return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4953 }
4954 }
4955 }
4956 break;
4957 default:
4958 break;
4959 }
4960 /* Default back to the global policy for all other scenarios */
4961 return sched_ipi_policy(dst, thread, dst_idle, event);
4962}
4963
4964/*
4965 * sched_edge_qos_max_parallelism()
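 *
 * Returns the maximum recommended parallelism for the given QoS class: a CPU count by
 * default, or a cluster count when the caller passes QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE.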
4966 */
4967uint32_t
4968sched_edge_qos_max_parallelism(int qos, uint64_t options)
4969{
4970 uint32_t ecpu_count = ml_get_cpu_number_type(CLUSTER_TYPE_E, false, false);
4971 uint32_t pcpu_count = ml_get_cpu_number_type(CLUSTER_TYPE_P, false, false);
4972 uint32_t ecluster_count = ml_get_cluster_number_type(CLUSTER_TYPE_E);
4973 uint32_t pcluster_count = ml_get_cluster_number_type(CLUSTER_TYPE_P);
4974
4975 if (options & QOS_PARALLELISM_REALTIME) {
		/*
		 * For realtime threads on AMP, limit the width to just the
		 * P-cores since we do not spill/rebalance RT threads.
		 */
4980 return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? pcluster_count : pcpu_count;
4981 }
4982
4983 /*
4984 * The Edge scheduler supports per-QoS recommendations for thread groups.
4985 * This enables lower QoS buckets (such as UT) to be scheduled on all
4986 * CPUs on the system.
4987 *
4988 * The only restriction is for BG/Maintenance QoS classes for which the
4989 * performance controller would never recommend execution on the P-cores.
4990 * If that policy changes in the future, this value should be changed.
4991 */
4992 switch (qos) {
4993 case THREAD_QOS_BACKGROUND:
4994 case THREAD_QOS_MAINTENANCE:
4995 return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? ecluster_count : ecpu_count;
4996 default:
4997 return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? (ecluster_count + pcluster_count) : (ecpu_count + pcpu_count);
4998 }
4999}
5000
5001
5002
5003#endif /* CONFIG_SCHED_EDGE */
5004
5005#endif /* CONFIG_SCHED_CLUTCH */
5006