1/*
2 * Copyright (c) 2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <mach/mach_types.h>
30#include <mach/machine.h>
31#include <machine/machine_routines.h>
32#include <machine/sched_param.h>
33#include <machine/machine_cpu.h>
34#include <kern/kern_types.h>
35#include <kern/debug.h>
36#include <kern/machine.h>
37#include <kern/misc_protos.h>
38#include <kern/processor.h>
39#include <kern/queue.h>
40#include <kern/sched.h>
41#include <kern/sched_prim.h>
42#include <kern/task.h>
43#include <kern/thread.h>
44#include <kern/sched_clutch.h>
45#include <machine/atomic.h>
46#include <kern/sched_clutch.h>
47#include <sys/kdebug.h>
48
49#if CONFIG_SCHED_EDGE
50#include <kern/sched_amp_common.h>
51#endif /* CONFIG_SCHED_EDGE */
52
53#if CONFIG_SCHED_CLUTCH
54
55/* Forward declarations of static routines */
56
57/* Root level hierarchy management */
58static void sched_clutch_root_init(sched_clutch_root_t, processor_set_t);
59static void sched_clutch_root_bucket_init(sched_clutch_root_bucket_t, sched_bucket_t, bool);
60static void sched_clutch_root_pri_update(sched_clutch_root_t);
61static void sched_clutch_root_urgency_inc(sched_clutch_root_t, thread_t);
62static void sched_clutch_root_urgency_dec(sched_clutch_root_t, thread_t);
63
64__enum_decl(sched_clutch_highest_root_bucket_type_t, uint32_t, {
65 SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_NONE = 0,
66 SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY = 1,
67 SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL = 2,
68});
69
70static sched_clutch_root_bucket_t sched_clutch_root_highest_root_bucket(sched_clutch_root_t, uint64_t, sched_clutch_highest_root_bucket_type_t);
71
72#if CONFIG_SCHED_EDGE
73/* Support for foreign threads on AMP platforms */
74static boolean_t sched_clutch_root_foreign_empty(sched_clutch_root_t);
75static thread_t sched_clutch_root_highest_foreign_thread_remove(sched_clutch_root_t);
76#endif /* CONFIG_SCHED_EDGE */
77
78/* Root bucket level hierarchy management */
79static uint64_t sched_clutch_root_bucket_deadline_calculate(sched_clutch_root_bucket_t, uint64_t);
80static void sched_clutch_root_bucket_deadline_update(sched_clutch_root_bucket_t, sched_clutch_root_t, uint64_t);
81
82/* Options for clutch bucket ordering in the runq */
83__options_decl(sched_clutch_bucket_options_t, uint32_t, {
84 SCHED_CLUTCH_BUCKET_OPTIONS_NONE = 0x0,
85 /* Round robin clutch bucket on thread removal */
86 SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR = 0x1,
87 /* Insert clutch bucket at head (for thread preemption) */
88 SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ = 0x2,
89 /* Insert clutch bucket at tail (default) */
90 SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ = 0x4,
91});
92
93/* Clutch bucket level hierarchy management */
94static void sched_clutch_bucket_hierarchy_insert(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
95static void sched_clutch_bucket_hierarchy_remove(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
96static boolean_t sched_clutch_bucket_runnable(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
97static boolean_t sched_clutch_bucket_update(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
98static void sched_clutch_bucket_empty(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
99static uint8_t sched_clutch_bucket_pri_calculate(sched_clutch_bucket_t, uint64_t);
100
101/* Clutch bucket group level properties management */
102static void sched_clutch_bucket_group_cpu_usage_update(sched_clutch_bucket_group_t, uint64_t);
103static void sched_clutch_bucket_group_cpu_adjust(sched_clutch_bucket_group_t, uint8_t);
104static void sched_clutch_bucket_group_timeshare_update(sched_clutch_bucket_group_t, sched_clutch_bucket_t, uint64_t);
105static uint8_t sched_clutch_bucket_group_pending_ageout(sched_clutch_bucket_group_t, uint64_t);
106static uint32_t sched_clutch_bucket_group_run_count_inc(sched_clutch_bucket_group_t);
107static uint32_t sched_clutch_bucket_group_run_count_dec(sched_clutch_bucket_group_t);
108static uint8_t sched_clutch_bucket_group_interactivity_score_calculate(sched_clutch_bucket_group_t, uint64_t);
109
110/* Clutch timeshare properties updates */
111static uint32_t sched_clutch_run_bucket_incr(sched_clutch_t, sched_bucket_t);
112static uint32_t sched_clutch_run_bucket_decr(sched_clutch_t, sched_bucket_t);
113
114/* Clutch membership management */
115static boolean_t sched_clutch_thread_insert(sched_clutch_root_t, thread_t, integer_t);
116static void sched_clutch_thread_remove(sched_clutch_root_t, thread_t, uint64_t, sched_clutch_bucket_options_t);
117static thread_t sched_clutch_thread_highest_remove(sched_clutch_root_t);
118
119/* Clutch properties updates */
120static uint32_t sched_clutch_root_urgency(sched_clutch_root_t);
121static uint32_t sched_clutch_root_count_sum(sched_clutch_root_t);
122static int sched_clutch_root_priority(sched_clutch_root_t);
123static sched_clutch_bucket_t sched_clutch_root_bucket_highest_clutch_bucket(sched_clutch_root_bucket_t);
124static boolean_t sched_thread_sched_pri_promoted(thread_t);
125
126#if CONFIG_SCHED_EDGE
127/* System based routines */
128static bool sched_edge_pset_available(processor_set_t);
129static uint32_t sched_edge_thread_bound_cluster_id(thread_t);
130static int sched_edge_iterate_clusters_ordered(processor_set_t, uint64_t, int);
131
132/* Global indicating the maximum number of clusters on the current platform */
133static int sched_edge_max_clusters = 0;
134#endif /* CONFIG_SCHED_EDGE */
135
136/* Helper debugging routines */
137static inline void sched_clutch_hierarchy_locked_assert(sched_clutch_root_t);
138
139extern processor_set_t pset_array[MAX_PSETS];
140
141/*
142 * Special markers for buckets that have invalid WCELs/quantums etc.
143 */
144#define SCHED_CLUTCH_INVALID_TIME_32 ((uint32_t)~0)
145#define SCHED_CLUTCH_INVALID_TIME_64 ((uint64_t)~0)
146
147/*
148 * Root level bucket WCELs
149 *
150 * The root level bucket selection algorithm is an Earliest Deadline
151 * First (EDF) algorithm where the deadline for each bucket is defined
152 * by its worst-case-execution-latency and the timestamp at which the
153 * bucket was made runnable.
154 *
155 */
156static uint32_t sched_clutch_root_bucket_wcel_us[TH_BUCKET_SCHED_MAX] = {
157 SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
158 0, /* FG */
159 37500, /* IN (37.5ms) */
160 75000, /* DF (75ms) */
161 150000, /* UT (150ms) */
162 250000 /* BG (250ms) */
163};
164static uint64_t sched_clutch_root_bucket_wcel[TH_BUCKET_SCHED_MAX] = {0};
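/*
 * For example, with the WCEL values above, a UT root bucket that becomes
 * runnable at time t gets an EDF deadline of (t + 150ms) while an IN root
 * bucket made runnable at the same time gets (t + 37.5ms), so the IN bucket
 * is selected first unless the UT bucket has already been waiting much longer.
 * Roughly (see sched_clutch_root_bucket_deadline_calculate() below):
 *
 *     deadline = makerunnable_timestamp + sched_clutch_root_bucket_wcel[bucket];
 */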
165
166/*
167 * Root level bucket warp
168 *
169 * Each root level bucket has a warp value associated with it as well.
170 * The warp value allows the root bucket to effectively warp ahead of
171 * lower priority buckets for a limited time even if it has a later
172 * deadline. The warping behavior provides extra (but limited)
173 * opportunity for high priority buckets to remain responsive.
174 */
175
176/* Special warp deadline value to indicate that the bucket has not used any warp yet */
177#define SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED (SCHED_CLUTCH_INVALID_TIME_64)
178
179/* Warp window durations for various tiers */
180static uint32_t sched_clutch_root_bucket_warp_us[TH_BUCKET_SCHED_MAX] = {
181 SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
182 8000, /* FG (8ms) */
183 4000, /* IN (4ms) */
184 2000, /* DF (2ms) */
185 1000, /* UT (1ms) */
186 0 /* BG (0ms) */
187};
188static uint64_t sched_clutch_root_bucket_warp[TH_BUCKET_SCHED_MAX] = {0};
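/*
 * For example, an FG root bucket whose deadline is later than a runnable UT
 * bucket's can still be selected for up to its 8ms warp window; once that warp
 * is exhausted, selection reverts to plain EDF order until the FG bucket is
 * next picked in its natural order, at which point its warp is replenished.
 */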
189
190/*
191 * Thread level quantum
192 *
193 * The algorithm defines quantums for threads at various buckets. This
194 * (combined with the root level bucket quantums) restricts how much
195 * the lower priority levels can preempt the higher priority threads.
196 */
197
198#if XNU_TARGET_OS_OSX
199static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
200 10000, /* FIXPRI (10ms) */
201 10000, /* FG (10ms) */
202 10000, /* IN (10ms) */
203 10000, /* DF (10ms) */
204 4000, /* UT (4ms) */
205 2000 /* BG (2ms) */
206};
207#else /* XNU_TARGET_OS_OSX */
208static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
209 10000, /* FIXPRI (10ms) */
210 10000, /* FG (10ms) */
211 8000, /* IN (8ms) */
212 6000, /* DF (6ms) */
213 4000, /* UT (4ms) */
214 2000 /* BG (2ms) */
215};
216#endif /* XNU_TARGET_OS_OSX */
217
218static uint64_t sched_clutch_thread_quantum[TH_BUCKET_SCHED_MAX] = {0};
219
220/*
221 * sched_clutch_us_to_abstime()
222 *
223 * Initializer for converting all durations in usec to abstime
224 */
225static void
226sched_clutch_us_to_abstime(uint32_t *us_vals, uint64_t *abstime_vals)
227{
228 for (int i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
229 if (us_vals[i] == SCHED_CLUTCH_INVALID_TIME_32) {
230 abstime_vals[i] = SCHED_CLUTCH_INVALID_TIME_64;
231 } else {
232 clock_interval_to_absolutetime_interval(us_vals[i],
233 NSEC_PER_USEC, &abstime_vals[i]);
234 }
235 }
236}
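/*
 * This helper is expected to be called once during scheduler initialization to
 * populate the abstime tables from the usec tables above, along the lines of:
 *
 *     sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
 *     sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
 *     sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
 */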
237
238/* Clutch/Edge Scheduler Debugging support */
239#define SCHED_CLUTCH_DBG_THR_COUNT_PACK(a, b, c) ((uint64_t)c | ((uint64_t)b << 16) | ((uint64_t)a << 32))
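/*
 * The macro packs three 16-bit thread counts into a single tracepoint argument:
 * c occupies bits [15:0], b bits [31:16] and a bits [47:32]. For example,
 * SCHED_CLUTCH_DBG_THR_COUNT_PACK(1, 2, 3) evaluates to 0x0000000100020003.
 */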
240
241#if DEVELOPMENT || DEBUG
242
243/*
244 * sched_clutch_hierarchy_locked_assert()
245 *
246 * Debugging helper routine. Asserts that the hierarchy is locked. The locking
247 * for the hierarchy depends on where the hierarchy is hooked. The current
248 * implementation hooks the hierarchy at the pset, so the hierarchy is locked
249 * using the pset lock.
250 */
251static inline void
252sched_clutch_hierarchy_locked_assert(
253 sched_clutch_root_t root_clutch)
254{
255 pset_assert_locked(root_clutch->scr_pset);
256}
257
258#else /* DEVELOPMENT || DEBUG */
259
260static inline void
261sched_clutch_hierarchy_locked_assert(
262 __unused sched_clutch_root_t root_clutch)
263{
264}
265
266#endif /* DEVELOPMENT || DEBUG */
267
268/*
269 * sched_clutch_thr_count_inc()
270 *
271 * Increment thread count at a hierarchy level with overflow checks.
272 */
273static void
274sched_clutch_thr_count_inc(
275 uint16_t *thr_count)
276{
277 if (__improbable(os_inc_overflow(thr_count))) {
278 panic("sched_clutch thread count overflowed!");
279 }
280}
281
282/*
283 * sched_clutch_thr_count_dec()
284 *
285 * Decrement thread count at a hierarchy level with underflow checks.
286 */
287static void
288sched_clutch_thr_count_dec(
289 uint16_t *thr_count)
290{
291 if (__improbable(os_dec_overflow(thr_count))) {
292 panic("sched_clutch thread count underflowed!");
293 }
294}
295
296static sched_bucket_t
297sched_convert_pri_to_bucket(uint8_t priority)
298{
299 sched_bucket_t bucket = TH_BUCKET_RUN;
300
301 if (priority > BASEPRI_USER_INITIATED) {
302 bucket = TH_BUCKET_SHARE_FG;
303 } else if (priority > BASEPRI_DEFAULT) {
304 bucket = TH_BUCKET_SHARE_IN;
305 } else if (priority > BASEPRI_UTILITY) {
306 bucket = TH_BUCKET_SHARE_DF;
307 } else if (priority > MAXPRI_THROTTLE) {
308 bucket = TH_BUCKET_SHARE_UT;
309 } else {
310 bucket = TH_BUCKET_SHARE_BG;
311 }
312 return bucket;
313}
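/*
 * For example, with the usual priority band values (BASEPRI_USER_INITIATED = 37,
 * BASEPRI_DEFAULT = 31, BASEPRI_UTILITY = 20, MAXPRI_THROTTLE = 4), a timeshare
 * thread at priority 47 maps to TH_BUCKET_SHARE_FG, one at priority 31 maps to
 * TH_BUCKET_SHARE_DF and one at priority 4 maps to TH_BUCKET_SHARE_BG.
 */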
314
315/*
316 * sched_clutch_thread_bucket_map()
317 *
318 * Map a thread to a scheduling bucket for the clutch/edge scheduler
319 * based on its scheduling mode and the priority attribute passed in.
320 */
321static sched_bucket_t
322sched_clutch_thread_bucket_map(thread_t thread, int pri)
323{
324 switch (thread->sched_mode) {
325 case TH_MODE_FIXED:
326 if (pri >= BASEPRI_FOREGROUND) {
327 return TH_BUCKET_FIXPRI;
328 } else {
329 return sched_convert_pri_to_bucket(pri);
330 }
331
332 case TH_MODE_REALTIME:
333 return TH_BUCKET_FIXPRI;
334
335 case TH_MODE_TIMESHARE:
336 return sched_convert_pri_to_bucket(pri);
337
338 default:
339 panic("unexpected mode: %d", thread->sched_mode);
340 break;
341 }
342}
343
344/*
345 * The clutch scheduler attempts to age out the CPU usage of clutch bucket groups
346 * based on the amount of time they have been pending and the load at that
347 * scheduling bucket level. Since the clutch bucket groups are global (i.e. they
348 * span multiple clusters), it is important to keep the load as a global counter as well.
349 */
350static uint32_t _Atomic sched_clutch_global_bucket_load[TH_BUCKET_SCHED_MAX];
351
352/*
353 * sched_clutch_root_init()
354 *
355 * Routine to initialize the scheduler hierarchy root.
356 */
357static void
358sched_clutch_root_init(
359 sched_clutch_root_t root_clutch,
360 processor_set_t pset)
361{
362 root_clutch->scr_thr_count = 0;
363 root_clutch->scr_priority = NOPRI;
364 root_clutch->scr_urgency = 0;
365 root_clutch->scr_pset = pset;
366#if CONFIG_SCHED_EDGE
367 root_clutch->scr_cluster_id = pset->pset_cluster_id;
368#else /* CONFIG_SCHED_EDGE */
369 root_clutch->scr_cluster_id = 0;
370#endif /* CONFIG_SCHED_EDGE */
371
372 for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
373 root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type] = 0;
374 }
375 /* Initialize the queue which maintains all runnable clutch_buckets for timesharing purposes */
376 queue_init(&root_clutch->scr_clutch_buckets);
377
378 /* Initialize the priority queue which maintains all runnable foreign clutch buckets */
379 priority_queue_init(&root_clutch->scr_foreign_buckets);
380 bzero(&root_clutch->scr_cumulative_run_count, sizeof(root_clutch->scr_cumulative_run_count));
381 bitmap_zero(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
382 bitmap_zero(root_clutch->scr_bound_warp_available, TH_BUCKET_SCHED_MAX);
383 priority_queue_init(&root_clutch->scr_bound_root_buckets);
384
385 /* Initialize the bitmap and priority queue of runnable root buckets */
386 priority_queue_init(&root_clutch->scr_unbound_root_buckets);
387 bitmap_zero(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
388 bitmap_zero(root_clutch->scr_unbound_warp_available, TH_BUCKET_SCHED_MAX);
389
390 /* Initialize all the root buckets */
391 for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
392 sched_clutch_root_bucket_init(&root_clutch->scr_unbound_buckets[i], i, false);
393 sched_clutch_root_bucket_init(&root_clutch->scr_bound_buckets[i], i, true);
394 }
395}
396
397/*
398 * Clutch Bucket Runqueues
399 *
400 * The clutch buckets are maintained in a runq at the root bucket level. The
401 * runq organization allows clutch buckets to be ordered based on various
402 * factors such as:
403 *
404 * - Clutch buckets are round robin'ed at the same priority level when a
405 * thread is selected from a clutch bucket. This prevents a clutch bucket
406 * from starving out other clutch buckets at the same priority.
407 *
408 * - Clutch buckets are inserted at the head when they become runnable due to
409 * thread preemption. This allows threads that were preempted to maintain
410 * their order in the queue.
411 */
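/*
 * For instance, when a clutch bucket becomes runnable again because one of its
 * threads was preempted, callers are expected to enqueue it at the head of its
 * priority level, roughly:
 *
 *     sched_clutch_bucket_runq_enqueue(&root_bucket->scrb_clutch_buckets,
 *         clutch_bucket, SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ);
 *
 * whereas ordinary wakeups use SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ (the default).
 */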
412
413/*
414 * sched_clutch_bucket_runq_init()
415 *
416 * Initialize a clutch bucket runq.
417 */
418static void
419sched_clutch_bucket_runq_init(
420 sched_clutch_bucket_runq_t clutch_buckets_rq)
421{
422 clutch_buckets_rq->scbrq_highq = NOPRI;
423 for (uint8_t i = 0; i < BITMAP_LEN(NRQS); i++) {
424 clutch_buckets_rq->scbrq_bitmap[i] = 0;
425 }
426 clutch_buckets_rq->scbrq_count = 0;
427 for (int i = 0; i < NRQS; i++) {
428 circle_queue_init(&clutch_buckets_rq->scbrq_queues[i]);
429 }
430}
431
432/*
433 * sched_clutch_bucket_runq_empty()
434 *
435 * Returns whether the clutch bucket runq is empty.
436 */
437static boolean_t
438sched_clutch_bucket_runq_empty(
439 sched_clutch_bucket_runq_t clutch_buckets_rq)
440{
441 return clutch_buckets_rq->scbrq_count == 0;
442}
443
444/*
445 * sched_clutch_bucket_runq_peek()
446 *
447 * Returns the highest priority clutch bucket in the runq.
448 */
449static sched_clutch_bucket_t
450sched_clutch_bucket_runq_peek(
451 sched_clutch_bucket_runq_t clutch_buckets_rq)
452{
453 if (clutch_buckets_rq->scbrq_count > 0) {
454 circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_buckets_rq->scbrq_highq];
455 return cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink);
456 } else {
457 return NULL;
458 }
459}
460
461/*
462 * sched_clutch_bucket_runq_enqueue()
463 *
464 * Enqueue a clutch bucket into the runq based on the options passed in.
465 */
466static void
467sched_clutch_bucket_runq_enqueue(
468 sched_clutch_bucket_runq_t clutch_buckets_rq,
469 sched_clutch_bucket_t clutch_bucket,
470 sched_clutch_bucket_options_t options)
471{
472 circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
473 if (circle_queue_empty(queue)) {
474 circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
475 bitmap_set(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
476 if (clutch_bucket->scb_priority > clutch_buckets_rq->scbrq_highq) {
477 clutch_buckets_rq->scbrq_highq = clutch_bucket->scb_priority;
478 }
479 } else {
480 if (options & SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ) {
481 circle_enqueue_head(queue, &clutch_bucket->scb_runqlink);
482 } else {
483 /*
484 * Default behavior (handles SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ &
485 * SCHED_CLUTCH_BUCKET_OPTIONS_NONE)
486 */
487 circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
488 }
489 }
490 clutch_buckets_rq->scbrq_count++;
491}
492
493/*
494 * sched_clutch_bucket_runq_remove()
495 *
496 * Remove a clutch bucket from the runq.
497 */
498static void
499sched_clutch_bucket_runq_remove(
500 sched_clutch_bucket_runq_t clutch_buckets_rq,
501 sched_clutch_bucket_t clutch_bucket)
502{
503 circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
504 circle_dequeue(queue, &clutch_bucket->scb_runqlink);
505 assert(clutch_buckets_rq->scbrq_count > 0);
506 clutch_buckets_rq->scbrq_count--;
507 if (circle_queue_empty(queue)) {
508 bitmap_clear(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
509 clutch_buckets_rq->scbrq_highq = bitmap_first(clutch_buckets_rq->scbrq_bitmap, NRQS);
510 }
511}
512
513static void
514sched_clutch_bucket_runq_rotate(
515 sched_clutch_bucket_runq_t clutch_buckets_rq,
516 sched_clutch_bucket_t clutch_bucket)
517{
518 circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
519 assert(clutch_bucket == cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink));
520 circle_queue_rotate_head_forward(queue);
521}
522
523/*
524 * sched_clutch_root_bucket_init()
525 *
526 * Routine to initialize root buckets.
527 */
528static void
529sched_clutch_root_bucket_init(
530 sched_clutch_root_bucket_t root_bucket,
531 sched_bucket_t bucket,
532 bool bound_root_bucket)
533{
534 root_bucket->scrb_bucket = bucket;
535 if (bound_root_bucket) {
536 /* For bound root buckets, initialize the bound thread runq. */
537 root_bucket->scrb_bound = true;
538 run_queue_init(&root_bucket->scrb_bound_thread_runq);
539 } else {
540 /*
541 * The unbounded root buckets contain a runq of runnable clutch buckets
542 * which then hold the runnable threads.
543 */
544 root_bucket->scrb_bound = false;
545 sched_clutch_bucket_runq_init(&root_bucket->scrb_clutch_buckets);
546 }
547 priority_queue_entry_init(&root_bucket->scrb_pqlink);
548 root_bucket->scrb_pqlink.deadline = SCHED_CLUTCH_INVALID_TIME_64;
549 root_bucket->scrb_warped_deadline = 0;
550 root_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[root_bucket->scrb_bucket];
551 root_bucket->scrb_starvation_avoidance = false;
552 root_bucket->scrb_starvation_ts = 0;
553}
554
555/*
556 * Special case scheduling for Above UI bucket.
557 *
558 * AboveUI threads are typically system critical threads that need low latency
559 * which is why they are handled specially.
560 *
561 * Since the priority range for AboveUI and FG Timeshare buckets overlap, it is
562 * important to maintain some native priority order between those buckets. For unbounded
563 * root buckets, the policy is to compare the highest clutch buckets of both root buckets; if the
564 * Above UI bucket is higher, schedule it immediately. Otherwise fall through to the
565 * deadline based scheduling, which should pick up the timeshare buckets. For the bound
566 * case, the policy simply compares the priority of the highest runnable threads in
567 * the above UI and timeshare buckets.
568 *
569 * The implementation allows extremely low latency CPU access for Above UI threads
570 * while supporting the use case of high priority timeshare threads contending with
571 * lower priority fixed priority threads.
572 */
573
574
575/*
576 * sched_clutch_root_unbound_select_aboveui()
577 *
578 * Routine to determine if the above UI unbounded bucket should be selected for execution.
579 */
580static bool
581sched_clutch_root_unbound_select_aboveui(
582 sched_clutch_root_t root_clutch)
583{
584 if (bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_FIXPRI)) {
585 sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
586 sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_unbound_buckets[TH_BUCKET_SHARE_FG];
587 if (!bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SHARE_FG)) {
588 /* If the timeshare FG bucket is not runnable, pick the aboveUI bucket for scheduling */
589 return true;
590 }
591 sched_clutch_bucket_t clutch_bucket_aboveui = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_aboveui);
592 sched_clutch_bucket_t clutch_bucket_sharefg = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_sharefg);
593 if (clutch_bucket_aboveui->scb_priority >= clutch_bucket_sharefg->scb_priority) {
594 return true;
595 }
596 }
597 return false;
598}
599
600/*
601 * sched_clutch_root_bound_select_aboveui()
602 *
603 * Routine to determine if the above UI bounded bucket should be selected for execution.
604 */
605static bool
606sched_clutch_root_bound_select_aboveui(
607 sched_clutch_root_t root_clutch)
608{
609 sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
610 sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_bound_buckets[TH_BUCKET_SHARE_FG];
611 if (root_bucket_aboveui->scrb_bound_thread_runq.count == 0) {
612 return false;
613 }
614 return root_bucket_aboveui->scrb_bound_thread_runq.highq >= root_bucket_sharefg->scrb_bound_thread_runq.highq;
615}
616
617/*
618 * sched_clutch_root_highest_root_bucket()
619 *
620 * Main routine to find the highest runnable root level bucket.
621 * This routine is called from performance sensitive contexts; so it is
622 * crucial to keep this O(1). The type parameter determines whether
623 * the selection logic should look at unbounded threads only (for
624 * cross-cluster stealing operations) or both bounded and unbounded
625 * threads (for selecting next thread for execution on current cluster).
626 */
627static sched_clutch_root_bucket_t
628sched_clutch_root_highest_root_bucket(
629 sched_clutch_root_t root_clutch,
630 uint64_t timestamp,
631 sched_clutch_highest_root_bucket_type_t type)
632{
633 sched_clutch_hierarchy_locked_assert(root_clutch);
634 int highest_runnable_bucket = -1;
635 if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
636 highest_runnable_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
637 } else {
638 int highest_unbound_runnable_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
639 int highest_bound_runnable_bucket = bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
640 highest_runnable_bucket = (highest_bound_runnable_bucket != -1) ? ((highest_unbound_runnable_bucket != -1) ? MIN(highest_bound_runnable_bucket, highest_unbound_runnable_bucket) : highest_bound_runnable_bucket) : highest_unbound_runnable_bucket;
641 }
642
643 if (highest_runnable_bucket == -1) {
644 return NULL;
645 }
646
647 /* Above UI root bucket selection (see comment above for more details on this special case handling) */
648 bool unbound_aboveui = sched_clutch_root_unbound_select_aboveui(root_clutch);
649 if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
650 if (unbound_aboveui) {
651 return &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
652 }
653 /* Fall through to selecting a timeshare root bucket */
654 } else {
655 bool bound_aboveui = sched_clutch_root_bound_select_aboveui(root_clutch);
656 sched_clutch_root_bucket_t unbound_aboveui_root_bucket = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
657 sched_clutch_root_bucket_t bound_aboveui_root_bucket = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
658
659 if (unbound_aboveui && bound_aboveui) {
660 /*
661 * In this scenario both the bounded and unbounded above UI buckets are runnable; choose based on the
662 * highest runnable priority in both buckets.
663 */
664 int bound_aboveui_pri = root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI].scrb_bound_thread_runq.highq;
665 sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(unbound_aboveui_root_bucket);
666 int unbound_aboveui_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
667 return (bound_aboveui_pri >= unbound_aboveui_pri) ? bound_aboveui_root_bucket : unbound_aboveui_root_bucket;
668 }
669 if (unbound_aboveui) {
670 return unbound_aboveui_root_bucket;
671 }
672 if (bound_aboveui) {
673 return bound_aboveui_root_bucket;
674 }
675 /* Fall through to selecting a timeshare root bucket */
676 }
677
678 /*
679 * Above UI bucket is not runnable or has a low priority runnable thread; use the
680 * earliest deadline model to schedule threads. The idea is that as the timeshare
681 * buckets use CPU, they will drop their interactivity score/sched priority and
682 * allow the low priority AboveUI buckets to be scheduled.
683 */
684
685 /* Find the earliest deadline bucket */
686 sched_clutch_root_bucket_t edf_bucket = NULL;
687 sched_clutch_root_bucket_t warp_bucket = NULL;
688 int warp_bucket_index = -1;
689
690evaluate_root_buckets:
691 if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
692 edf_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
693 } else {
694 sched_clutch_root_bucket_t unbound_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
695 sched_clutch_root_bucket_t bound_bucket = priority_queue_min(&root_clutch->scr_bound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
696 if (bound_bucket && unbound_bucket) {
697 /* If bound and unbound root buckets are runnable, select the one with the earlier deadline */
698 edf_bucket = (bound_bucket->scrb_pqlink.deadline <= unbound_bucket->scrb_pqlink.deadline) ? bound_bucket : unbound_bucket;
699 } else {
700 edf_bucket = (bound_bucket) ? bound_bucket : unbound_bucket;
701 }
702 }
703 /*
704 * Check if any of the buckets have warp available. The implementation only allows root buckets to warp ahead of
705 * buckets of the same type (i.e. bound/unbound). The reason for doing that is that warping is a concept that
706 * makes sense between root buckets of the same type since it is effectively a scheduling advantage over a lower
707 * QoS root bucket.
708 */
709 bitmap_t *warp_available_bitmap = (edf_bucket->scrb_bound) ? (root_clutch->scr_bound_warp_available) : (root_clutch->scr_unbound_warp_available);
710 warp_bucket_index = bitmap_lsb_first(warp_available_bitmap, TH_BUCKET_SCHED_MAX);
711
712 if ((warp_bucket_index == -1) || (warp_bucket_index >= edf_bucket->scrb_bucket)) {
713 /* No higher buckets have warp left; best choice is the EDF based bucket */
714 if (edf_bucket->scrb_starvation_avoidance) {
715 /*
716 * Indicates that the earliest deadline bucket is in starvation avoidance mode. Check to see if the
717 * starvation avoidance window is still open and return this bucket if it is.
718 *
719 * The starvation avoidance window is calculated based on the quantum of threads at that bucket and
720 * the number of CPUs in the cluster. The idea is to provide roughly one quantum's worth of starvation
721 * avoidance across all CPUs.
722 */
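 /*
  * For example, with the 10ms FG thread quantum on a hypothetical 4-CPU
  * cluster, the starvation avoidance window below works out to 2.5ms.
  */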
723 uint64_t starvation_window = sched_clutch_thread_quantum[edf_bucket->scrb_bucket] / pset_available_cpu_count(root_clutch->scr_pset);
724 if (timestamp < (edf_bucket->scrb_starvation_ts + starvation_window)) {
725 return edf_bucket;
726 } else {
727 /* Starvation avoidance window is over; update deadline and re-evaluate EDF */
728 edf_bucket->scrb_starvation_avoidance = false;
729 edf_bucket->scrb_starvation_ts = 0;
730 sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp);
731 }
732 goto evaluate_root_buckets;
733 }
734
735 /* Looks like the EDF bucket is not in starvation avoidance mode; check if it should be */
736 if (highest_runnable_bucket < edf_bucket->scrb_bucket) {
737 /* Since a higher bucket is runnable, it indicates that the EDF bucket should be in starvation avoidance */
738 edf_bucket->scrb_starvation_avoidance = true;
739 edf_bucket->scrb_starvation_ts = timestamp;
740 } else {
741 /* EDF bucket is being selected in the natural order; update deadline and reset warp */
742 sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp);
743 edf_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[edf_bucket->scrb_bucket];
744 edf_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
745 if (edf_bucket->scrb_bound) {
746 bitmap_set(root_clutch->scr_bound_warp_available, edf_bucket->scrb_bucket);
747 } else {
748 bitmap_set(root_clutch->scr_unbound_warp_available, edf_bucket->scrb_bucket);
749 }
750 }
751 return edf_bucket;
752 }
753
754 /*
755 * Looks like there is a root bucket which is higher in the natural priority
756 * order than edf_bucket and might have some warp remaining.
757 */
758 warp_bucket = (edf_bucket->scrb_bound) ? &root_clutch->scr_bound_buckets[warp_bucket_index] : &root_clutch->scr_unbound_buckets[warp_bucket_index];
759 if (warp_bucket->scrb_warped_deadline == SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
760 /* Root bucket has not used any of its warp; set a deadline to expire its warp and return it */
761 warp_bucket->scrb_warped_deadline = timestamp + warp_bucket->scrb_warp_remaining;
762 sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
763 return warp_bucket;
764 }
765 if (warp_bucket->scrb_warped_deadline > timestamp) {
766 /* Root bucket already has a warp window open with some warp remaining */
767 sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
768 return warp_bucket;
769 }
770
771 /* For this bucket, the warp window was opened sometime in the past but has now
772 * expired. Mark the bucket as not available for warp anymore and re-run the
773 * warp bucket selection logic.
774 */
775 warp_bucket->scrb_warp_remaining = 0;
776 if (warp_bucket->scrb_bound) {
777 bitmap_clear(root_clutch->scr_bound_warp_available, warp_bucket->scrb_bucket);
778 } else {
779 bitmap_clear(root_clutch->scr_unbound_warp_available, warp_bucket->scrb_bucket);
780 }
781 goto evaluate_root_buckets;
782}
783
784/*
785 * sched_clutch_root_bucket_deadline_calculate()
786 *
787 * Calculate the deadline for the bucket based on its WCEL
788 */
789static uint64_t
790sched_clutch_root_bucket_deadline_calculate(
791 sched_clutch_root_bucket_t root_bucket,
792 uint64_t timestamp)
793{
794 /* For the fixpri AboveUI bucket, always return the earliest possible deadline */
795 if (root_bucket->scrb_bucket < TH_BUCKET_SHARE_FG) {
796 return 0;
797 }
798
799 /* For all timeshare buckets set the deadline as current time + worst-case-execution-latency */
800 return timestamp + sched_clutch_root_bucket_wcel[root_bucket->scrb_bucket];
801}
802
803/*
804 * sched_clutch_root_bucket_deadline_update()
805 *
806 * Routine to update the deadline of the root bucket when it is selected.
807 * Updating the deadline also moves the root_bucket in the EDF priority
808 * queue.
809 */
810static void
811sched_clutch_root_bucket_deadline_update(
812 sched_clutch_root_bucket_t root_bucket,
813 sched_clutch_root_t root_clutch,
814 uint64_t timestamp)
815{
816 if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
817 /* The algorithm never uses the deadlines for scheduling TH_BUCKET_FIXPRI bucket */
818 return;
819 }
820
821 uint64_t old_deadline = root_bucket->scrb_pqlink.deadline;
822 uint64_t new_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
823 if (__improbable(old_deadline > new_deadline)) {
824 panic("old_deadline (%llu) > new_deadline (%llu); root_bucket (%d); timestamp (%llu)", old_deadline, new_deadline, root_bucket->scrb_bucket, timestamp);
825 }
826 if (old_deadline != new_deadline) {
827 root_bucket->scrb_pqlink.deadline = new_deadline;
828 struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
829 priority_queue_entry_increased(prioq, &root_bucket->scrb_pqlink);
830 }
831}
832
833/*
834 * sched_clutch_root_bucket_runnable()
835 *
836 * Routine to insert a newly runnable root bucket into the hierarchy.
837 * Also updates the deadline and warp parameters as necessary.
838 */
839static void
840sched_clutch_root_bucket_runnable(
841 sched_clutch_root_bucket_t root_bucket,
842 sched_clutch_root_t root_clutch,
843 uint64_t timestamp)
844{
845 /* Mark the root bucket as runnable */
846 bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
847 bitmap_set(runnable_bitmap, root_bucket->scrb_bucket);
848
849 if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
850 /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
851 return;
852 }
853
854 if (root_bucket->scrb_starvation_avoidance == false) {
855 /*
856 * Only update the deadline if the bucket was not in starvation avoidance mode. If the bucket was in
857 * starvation avoidance and its window has expired, the highest root bucket selection logic will notice
858 * that and fix it up.
859 */
860 root_bucket->scrb_pqlink.deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
861 }
862 struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
863 priority_queue_insert(prioq, &root_bucket->scrb_pqlink);
864 if (root_bucket->scrb_warp_remaining) {
865 /* Since the bucket has some warp remaining and it is now runnable, mark it as available for warp */
866 bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
867 bitmap_set(warp_bitmap, root_bucket->scrb_bucket);
868 }
869}
870
871/*
872 * sched_clutch_root_bucket_empty()
873 *
874 * Routine to remove an empty root bucket from the hierarchy.
875 * Also updates the deadline and warp parameters as necessary.
876 */
877static void
878sched_clutch_root_bucket_empty(
879 sched_clutch_root_bucket_t root_bucket,
880 sched_clutch_root_t root_clutch,
881 uint64_t timestamp)
882{
883 bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
884 bitmap_clear(runnable_bitmap, root_bucket->scrb_bucket);
885
886 if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
887 /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
888 return;
889 }
890
891 struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
892 priority_queue_remove(prioq, &root_bucket->scrb_pqlink);
893
894 bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
895 bitmap_clear(warp_bitmap, root_bucket->scrb_bucket);
896
897 if (root_bucket->scrb_warped_deadline != SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
898 if (root_bucket->scrb_warped_deadline > timestamp) {
899 /*
900 * For root buckets that were using the warp, check if the warp
901 * deadline is in the future. If yes, remove the wall time the
902 * warp was active and update the warp remaining. This allows
903 * the root bucket to use the remaining warp the next time it
904 * becomes runnable.
905 */
906 root_bucket->scrb_warp_remaining = root_bucket->scrb_warped_deadline - timestamp;
907 } else {
908 /*
909 * If the root bucket's warped deadline is in the past, it has used up
910 * all the warp it was assigned. Empty out its warp remaining.
911 */
912 root_bucket->scrb_warp_remaining = 0;
913 }
914 }
915}
916
917static int
918sched_clutch_global_bucket_load_get(
919 sched_bucket_t bucket)
920{
921 return (int)os_atomic_load(&sched_clutch_global_bucket_load[bucket], relaxed);
922}
923
924/*
925 * sched_clutch_root_pri_update()
926 *
927 * The root level priority is used for thread selection and preemption
928 * logic.
929 *
930 * The logic uses the same decision as thread selection for deciding between the
931 * above UI and timeshare buckets. If one of the timesharing buckets has to be
932 * used for priority calculation, the logic is slightly different from thread
933 * selection, because thread selection considers deadlines, warps etc. to
934 * decide the most optimal bucket at a given timestamp. Since the priority
935 * value is used for preemption decisions only, it needs to be based on the
936 * highest runnable thread available in the timeshare domain. This logic can
937 * be made more sophisticated if there are cases of unnecessary preemption
938 * being seen in workloads.
939 */
940static void
941sched_clutch_root_pri_update(
942 sched_clutch_root_t root_clutch)
943{
944 sched_clutch_hierarchy_locked_assert(root_clutch);
945 int16_t root_bound_pri = NOPRI;
946 int16_t root_unbound_pri = NOPRI;
947
948 if (bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
949 goto root_pri_update_unbound;
950 }
951 sched_clutch_root_bucket_t root_bucket_bound = NULL;
952 if (sched_clutch_root_bound_select_aboveui(root_clutch)) {
953 root_bucket_bound = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
954 } else {
955 int root_bucket_index = bitmap_lsb_next(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
956 assert(root_bucket_index != -1);
957 root_bucket_bound = &root_clutch->scr_bound_buckets[root_bucket_index];
958 }
959 root_bound_pri = root_bucket_bound->scrb_bound_thread_runq.highq;
960
961root_pri_update_unbound:
962 if (bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
963 goto root_pri_update_complete;
964 }
965 sched_clutch_root_bucket_t root_bucket_unbound = NULL;
966 if (sched_clutch_root_unbound_select_aboveui(root_clutch)) {
967 root_bucket_unbound = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
968 } else {
969 int root_bucket_index = bitmap_lsb_next(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
970 assert(root_bucket_index != -1);
971 root_bucket_unbound = &root_clutch->scr_unbound_buckets[root_bucket_index];
972 }
973 /* For the selected root bucket, find the highest priority clutch bucket */
974 sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_unbound);
975 root_unbound_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
976
977root_pri_update_complete:
978 root_clutch->scr_priority = MAX(root_bound_pri, root_unbound_pri);
979}
980
981/*
982 * sched_clutch_root_urgency_inc()
983 *
984 * Routine to increment the urgency at the root level based on the thread
985 * priority that is being inserted into the hierarchy. The root urgency
986 * counter is updated based on the urgency of threads in any of the
987 * clutch buckets which are part of the hierarchy.
988 *
989 * Always called with the pset lock held.
990 */
991static void
992sched_clutch_root_urgency_inc(
993 sched_clutch_root_t root_clutch,
994 thread_t thread)
995{
996 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
997 root_clutch->scr_urgency++;
998 }
999}
1000
1001/*
1002 * sched_clutch_root_urgency_dec()
1003 *
1004 * Routine to decrement the urgency at the root level based on the thread
1005 * priority that is being removed from the hierarchy. The root urgency
1006 * counter is updated based on the urgency of threads in any of the
1007 * clutch buckets which are part of the hierarchy.
1008 *
1009 * Always called with the pset lock held.
1010 */
1011static void
1012sched_clutch_root_urgency_dec(
1013 sched_clutch_root_t root_clutch,
1014 thread_t thread)
1015{
1016 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
1017 root_clutch->scr_urgency--;
1018 }
1019}
1020
1021/*
1022 * Clutch bucket level scheduling
1023 *
1024 * The second level of scheduling is the clutch bucket level scheduling
1025 * which tries to schedule thread groups within root_buckets. Each
1026 * clutch represents a thread group and a clutch_bucket_group represents
1027 * threads at a particular sched_bucket within that thread group. The
1028 * clutch_bucket_group contains a clutch_bucket per cluster on the system
1029 * where it holds the runnable threads destined for execution on that
1030 * cluster.
1031 *
1032 * The goal of this level of scheduling is to allow interactive thread
1033 * groups low latency access to the CPU. It also provides slight
1034 * scheduling preference for App and unrestricted thread groups.
1035 *
1036 * The clutch bucket scheduling algorithm measures an interactivity
1037 * score for all clutch bucket groups. The interactivity score is based
1038 * on the ratio of the CPU used and the voluntary blocking of threads
1039 * within the clutch bucket group. The algorithm is very close to the ULE
1040 * scheduler on FreeBSD in terms of calculations. The interactivity
1041 * score provides an interactivity boost in the range of
1042 * [0:SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI * 2] which allows interactive
1043 * thread groups to win over CPU spinners.
1044 *
1045 * The interactivity score of the clutch bucket group is combined with the
1046 * highest base/promoted priority of threads in the clutch bucket to form
1047 * the overall priority of the clutch bucket.
1048 */
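/*
 * With the default settings below, the interactivity boost falls in the range
 * [0:16]. For example, a clutch bucket whose highest runnable thread is at
 * priority 31 and whose clutch bucket group currently scores 16 competes at
 * roughly priority 47, letting it win over a CPU-bound group whose highest
 * thread sits at the same base priority but whose score has decayed to 0.
 */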
1049
1050/* Priority boost range for interactivity */
1051#define SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT (8)
1052uint8_t sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
1053
1054/* Window to scale the cpu usage and blocked values (currently 500ms). It is the threshold of used+blocked. */
1055uint64_t sched_clutch_bucket_group_adjust_threshold = 0;
1056#define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS (500000)
1057
1058/* The ratio to scale the cpu/blocked time per window */
1059#define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO (10)
1060
1061/* Initial value for voluntary blocking time for the clutch_bucket */
1062#define SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID (uint64_t)(~0)
1063
1064/* Value indicating the clutch bucket is not pending execution */
1065#define SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID ((uint64_t)(~0))
1066
1067/*
1068 * Thread group CPU starvation avoidance
1069 *
1070 * In heavily CPU contended scenarios, it is possible that some thread groups
1071 * which have a low interactivity score do not get CPU time at all. In order to
1072 * resolve that, the scheduler tries to age out the CPU usage of the clutch
1073 * bucket group when it has been pending execution for a certain time as defined
1074 * by the sched_clutch_bucket_group_pending_delta_us values below.
1075 *
1076 * The values chosen here are very close to the WCEL values for each sched bucket.
1077 * These values are multiplied by the load average of the relevant root bucket to
1078 * provide an estimate of the actual clutch bucket load.
1079 */
1080static uint32_t sched_clutch_bucket_group_pending_delta_us[TH_BUCKET_SCHED_MAX] = {
1081 SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
1082 10000, /* FG */
1083 37500, /* IN */
1084 75000, /* DF */
1085 150000, /* UT */
1086 250000, /* BG */
1087};
1088static uint64_t sched_clutch_bucket_group_pending_delta[TH_BUCKET_SCHED_MAX] = {0};
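/*
 * For example, an IN clutch bucket group that has been pending while the IN
 * root bucket carries a load of 4 would have its CPU usage aged out after
 * roughly 4 * 37.5ms = 150ms of pending time (see
 * sched_clutch_bucket_group_pending_ageout()).
 */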
1089
1090/*
1091 * sched_clutch_bucket_init()
1092 *
1093 * Initializer for clutch buckets.
1094 */
1095static void
1096sched_clutch_bucket_init(
1097 sched_clutch_bucket_t clutch_bucket,
1098 sched_clutch_bucket_group_t clutch_bucket_group,
1099 sched_bucket_t bucket)
1100{
1101 clutch_bucket->scb_bucket = bucket;
1102 /* scb_priority will be recalculated when a thread is inserted in the clutch bucket */
1103 clutch_bucket->scb_priority = 0;
1104#if CONFIG_SCHED_EDGE
1105 clutch_bucket->scb_foreign = false;
1106 priority_queue_entry_init(&clutch_bucket->scb_foreignlink);
1107#endif /* CONFIG_SCHED_EDGE */
1108 clutch_bucket->scb_group = clutch_bucket_group;
1109 clutch_bucket->scb_root = NULL;
1110 priority_queue_init(&clutch_bucket->scb_clutchpri_prioq);
1111 priority_queue_init(&clutch_bucket->scb_thread_runq);
1112 queue_init(&clutch_bucket->scb_thread_timeshare_queue);
1113}
1114
1115/*
1116 * sched_clutch_bucket_group_init()
1117 *
1118 * Initializer for clutch bucket groups.
1119 */
1120static void
1121sched_clutch_bucket_group_init(
1122 sched_clutch_bucket_group_t clutch_bucket_group,
1123 sched_clutch_t clutch,
1124 sched_bucket_t bucket)
1125{
1126 bzero(clutch_bucket_group, sizeof(struct sched_clutch_bucket_group));
1127 clutch_bucket_group->scbg_bucket = bucket;
1128 clutch_bucket_group->scbg_clutch = clutch;
1129
1130 int max_clusters = ml_get_cluster_count();
1131 clutch_bucket_group->scbg_clutch_buckets = kalloc_type(struct sched_clutch_bucket, max_clusters, Z_WAITOK | Z_ZERO);
1132 for (int i = 0; i < max_clusters; i++) {
1133 sched_clutch_bucket_init(&clutch_bucket_group->scbg_clutch_buckets[i], clutch_bucket_group, bucket);
1134 }
1135
1136 os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, 0, relaxed);
1137 os_atomic_store(&clutch_bucket_group->scbg_pri_shift, INT8_MAX, relaxed);
1138 os_atomic_store(&clutch_bucket_group->scbg_preferred_cluster, pset0.pset_cluster_id, relaxed);
1139 /*
1140 * All thread groups should be initialized to be interactive; this allows the newly launched
1141 * thread groups to fairly compete with already running thread groups.
1142 */
1143 clutch_bucket_group->scbg_interactivity_data.scct_count = (sched_clutch_bucket_group_interactive_pri * 2);
1144 clutch_bucket_group->scbg_interactivity_data.scct_timestamp = 0;
1145 os_atomic_store(&clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked, (clutch_cpu_data_t)sched_clutch_bucket_group_adjust_threshold, relaxed);
1146 clutch_bucket_group->scbg_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
1147 clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
1148 clutch_bucket_group->scbg_amp_rebalance_last_chosen = UINT32_MAX;
1149}
1150
1151static void
1152sched_clutch_bucket_group_destroy(
1153 sched_clutch_bucket_group_t clutch_bucket_group)
1154{
1155 kfree_type(struct sched_clutch_bucket, ml_get_cluster_count(),
1156 clutch_bucket_group->scbg_clutch_buckets);
1157}
1158
1159/*
1160 * sched_clutch_init_with_thread_group()
1161 *
1162 * Initialize the sched_clutch when the thread group is being created
1163 */
1164void
1165sched_clutch_init_with_thread_group(
1166 sched_clutch_t clutch,
1167 struct thread_group *tg)
1168{
1169 os_atomic_store(&clutch->sc_thr_count, 0, relaxed);
1170
1171 /* Initialize all the clutch buckets */
1172 for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
1173 sched_clutch_bucket_group_init(&(clutch->sc_clutch_groups[i]), clutch, i);
1174 }
1175
1176 /* Grouping specific fields */
1177 clutch->sc_tg = tg;
1178}
1179
1180/*
1181 * sched_clutch_destroy()
1182 *
1183 * Destructor for clutch; called from thread group release code.
1184 */
1185void
1186sched_clutch_destroy(
1187 sched_clutch_t clutch)
1188{
1189 assert(os_atomic_load(&clutch->sc_thr_count, relaxed) == 0);
1190 for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
1191 sched_clutch_bucket_group_destroy(&(clutch->sc_clutch_groups[i]));
1192 }
1193}
1194
1195#if CONFIG_SCHED_EDGE
1196
1197/*
1198 * Edge Scheduler Preferred Cluster Mechanism
1199 *
1200 * In order to have better control over various QoS buckets within a thread group, the Edge
1201 * scheduler allows CLPC to specify a preferred cluster for each QoS level in a TG. These
1202 * preferences are stored at the sched_clutch_bucket_group level since that represents all
1203 * threads at a particular QoS level within a sched_clutch. For any lookup of preferred
1204 * cluster, the logic always goes back to the preference stored at the clutch_bucket_group.
1205 */
1206
1207static uint32_t
1208sched_edge_clutch_bucket_group_preferred_cluster(sched_clutch_bucket_group_t clutch_bucket_group)
1209{
1210 return os_atomic_load(&clutch_bucket_group->scbg_preferred_cluster, relaxed);
1211}
1212
1213static uint32_t
1214sched_clutch_bucket_preferred_cluster(sched_clutch_bucket_t clutch_bucket)
1215{
1216 return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket->scb_group);
1217}
1218
1219uint32_t
1220sched_edge_thread_preferred_cluster(thread_t thread)
1221{
1222 if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1223 /* For threads bound to a specific cluster, return the bound cluster id */
1224 return sched_edge_thread_bound_cluster_id(thread);
1225 }
1226
1227 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1228 sched_bucket_t sched_bucket = thread->th_sched_bucket;
1229 if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
1230 sched_bucket = sched_clutch_thread_bucket_map(thread, thread->base_pri);
1231 }
1232 sched_clutch_bucket_group_t clutch_bucket_group = &clutch->sc_clutch_groups[sched_bucket];
1233 return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket_group);
1234}
1235
1236/*
1237 * Edge Scheduler Foreign Bucket Support
1238 *
1239 * In the Edge Scheduler, each cluster maintains a priority queue of clutch buckets containing
1240 * threads that are not native to the cluster. A clutch bucket is considered native if its
1241 * preferred cluster has the same type as the cluster it is enqueued in. The foreign clutch
1242 * bucket priority queue is used for rebalance operations to get threads back to their native
1243 * cluster quickly.
1244 *
1245 * It is possible to make this policy even more aggressive by considering all clusters that
1246 * are not the preferred cluster as the foreign cluster, but that would mean a lot of thread
1247 * migrations which might have performance implications.
1248 */
1249
1250static void
1251sched_clutch_bucket_mark_native(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
1252{
1253 if (clutch_bucket->scb_foreign) {
1254 clutch_bucket->scb_foreign = false;
1255 priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1256 }
1257}
1258
1259static void
1260sched_clutch_bucket_mark_foreign(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
1261{
1262 if (!clutch_bucket->scb_foreign) {
1263 clutch_bucket->scb_foreign = true;
1264 priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
1265 priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1266 }
1267}
1268
1269/*
1270 * Edge Scheduler Cumulative Load Average
1271 *
1272 * The Edge scheduler maintains a per-QoS/scheduling bucket load average for
1273 * making thread migration decisions. The per-bucket load is maintained as a
1274 * cumulative count since higher scheduling buckets impact load on lower buckets
1275 * for thread migration decisions.
1276 *
1277 */
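/*
 * For example, making a single TH_BUCKET_SHARE_IN thread runnable bumps the
 * cumulative counters for IN, DF, UT and BG (but not FIXPRI or FG), since an
 * IN thread contributes load to every bucket at or below its level; the
 * fall-through switch statements below implement exactly this.
 */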
1278
1279static void
1280sched_edge_cluster_cumulative_count_incr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1281{
1282 switch (bucket) {
1283 case TH_BUCKET_FIXPRI: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
1284 case TH_BUCKET_SHARE_FG: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
1285 case TH_BUCKET_SHARE_IN: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
1286 case TH_BUCKET_SHARE_DF: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
1287 case TH_BUCKET_SHARE_UT: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
1288 case TH_BUCKET_SHARE_BG: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
1289 default:
1290 panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_incr()");
1291 }
1292}
1293
1294static void
1295sched_edge_cluster_cumulative_count_decr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1296{
1297 switch (bucket) {
1298 case TH_BUCKET_FIXPRI: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
1299 case TH_BUCKET_SHARE_FG: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
1300 case TH_BUCKET_SHARE_IN: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
1301 case TH_BUCKET_SHARE_DF: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
1302 case TH_BUCKET_SHARE_UT: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
1303 case TH_BUCKET_SHARE_BG: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
1304 default:
1305 panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_decr()");
1306 }
1307}
1308
1309uint16_t
1310sched_edge_cluster_cumulative_count(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1311{
1312 return os_atomic_load(&root_clutch->scr_cumulative_run_count[bucket], relaxed);
1313}
1314
1315#endif /* CONFIG_SCHED_EDGE */
1316
1317/*
1318 * sched_clutch_bucket_hierarchy_insert()
1319 *
1320 * Routine to insert a newly runnable clutch_bucket into the root hierarchy.
1321 */
1322static void
1323sched_clutch_bucket_hierarchy_insert(
1324 sched_clutch_root_t root_clutch,
1325 sched_clutch_bucket_t clutch_bucket,
1326 sched_bucket_t bucket,
1327 uint64_t timestamp,
1328 sched_clutch_bucket_options_t options)
1329{
1330 sched_clutch_hierarchy_locked_assert(root_clutch);
1331 if (bucket > TH_BUCKET_FIXPRI) {
1332 /* Enqueue the timeshare clutch buckets into the global runnable clutch_bucket list; used for sched tick operations */
1333 enqueue_tail(&root_clutch->scr_clutch_buckets, &clutch_bucket->scb_listlink);
1334 }
1335#if CONFIG_SCHED_EDGE
1336 /* Check if the bucket is a foreign clutch bucket and add it to the foreign buckets list */
1337 uint32_t preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
1338 if (pset_type_for_id(preferred_cluster) != pset_type_for_id(root_clutch->scr_cluster_id)) {
1339 sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
1340 }
1341#endif /* CONFIG_SCHED_EDGE */
1342 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];
1343
1344 /* If this is the first clutch bucket in the root bucket, insert the root bucket into the root priority queue */
1345	if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1346 sched_clutch_root_bucket_runnable(root_bucket, root_clutch, timestamp);
1347 }
1348
1349 /* Insert the clutch bucket into the root bucket run queue with order based on options */
1350	sched_clutch_bucket_runq_enqueue(&root_bucket->scrb_clutch_buckets, clutch_bucket, options);
1351 os_atomic_store(&clutch_bucket->scb_root, root_clutch, relaxed);
1352 os_atomic_inc(&sched_clutch_global_bucket_load[bucket], relaxed);
1353}
1354
1355/*
1356 * sched_clutch_bucket_hierarchy_remove()
1357 *
1358 * Routine to remove an empty clutch bucket from the root hierarchy.
1359 */
1360static void
1361sched_clutch_bucket_hierarchy_remove(
1362 sched_clutch_root_t root_clutch,
1363 sched_clutch_bucket_t clutch_bucket,
1364 sched_bucket_t bucket,
1365 uint64_t timestamp,
1366 __unused sched_clutch_bucket_options_t options)
1367{
1368 sched_clutch_hierarchy_locked_assert(root_clutch);
1369 if (bucket > TH_BUCKET_FIXPRI) {
1370 /* Remove the timeshare clutch bucket from the globally runnable clutch_bucket list */
1371		remqueue(&clutch_bucket->scb_listlink);
1372 }
1373#if CONFIG_SCHED_EDGE
1374 sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
1375#endif /* CONFIG_SCHED_EDGE */
1376
1377 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];
1378
1379 /* Remove the clutch bucket from the root bucket priority queue */
1380	sched_clutch_bucket_runq_remove(&root_bucket->scrb_clutch_buckets, clutch_bucket);
1381 os_atomic_store(&clutch_bucket->scb_root, NULL, relaxed);
1382
1383 /* If the root bucket priority queue is now empty, remove it from the root priority queue */
1384	if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1385 sched_clutch_root_bucket_empty(root_bucket, root_clutch, timestamp);
1386 }
1387 os_atomic_dec(&sched_clutch_global_bucket_load[bucket], relaxed);
1388}
1389
1390/*
1391 * sched_clutch_bucket_base_pri()
1392 *
1393 * Calculates the "base" priority of the clutch bucket, which is equal to the max of the
1394 * highest base_pri and the highest sched_pri in the clutch bucket.
1395 */
1396static uint8_t
1397sched_clutch_bucket_base_pri(
1398 sched_clutch_bucket_t clutch_bucket)
1399{
1400 assert(priority_queue_empty(&clutch_bucket->scb_thread_runq) == false);
1401 /*
1402 * Since the clutch bucket can contain threads that are members of the group due
1403 * to the sched_pri being promoted or due to their base pri, the base priority of
1404 * the entire clutch bucket should be based on the highest thread (promoted or base)
1405 * in the clutch bucket.
1406 */
1407 uint8_t max_pri = 0;
1408 if (!priority_queue_empty(&clutch_bucket->scb_clutchpri_prioq)) {
1409 max_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
1410 }
1411 return max_pri;
1412}
1413
1414/*
1415 * sched_clutch_interactivity_from_cpu_data()
1416 *
1417 * Routine to calculate the interactivity score of a clutch bucket group from its CPU usage
1418 */
1419static uint8_t
1420sched_clutch_interactivity_from_cpu_data(sched_clutch_bucket_group_t clutch_bucket_group)
1421{
1422 sched_clutch_bucket_cpu_data_t scb_cpu_data;
1423 scb_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, relaxed);
1424 clutch_cpu_data_t cpu_used = scb_cpu_data.cpu_data.scbcd_cpu_used;
1425 clutch_cpu_data_t cpu_blocked = scb_cpu_data.cpu_data.scbcd_cpu_blocked;
1426 uint8_t interactive_score = 0;
1427
1428 if ((cpu_blocked == 0) && (cpu_used == 0)) {
1429 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
1430 }
1431 /*
1432 * For all timeshare buckets, calculate the interactivity score of the bucket
1433 * and add it to the base priority
1434 */
1435 if (cpu_blocked > cpu_used) {
1436 /* Interactive clutch_bucket case */
1437 interactive_score = sched_clutch_bucket_group_interactive_pri +
1438 ((sched_clutch_bucket_group_interactive_pri * (cpu_blocked - cpu_used)) / cpu_blocked);
1439 } else {
1440 /* Non-interactive clutch_bucket case */
1441 interactive_score = ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) / cpu_used);
1442 }
1443 return interactive_score;
1444}
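
/*
 * Worked example for the score calculation above (hypothetical numbers, assuming
 * sched_clutch_bucket_group_interactive_pri == 8): a group that was blocked for
 * 300 units and used 100 units of CPU scores 8 + (8 * (300 - 100)) / 300 = 13,
 * whereas a group that used 300 units and was blocked for only 100 scores
 * (8 * 100) / 300 = 2. The score therefore ranges from ~0 for CPU-bound groups up
 * to 2 * sched_clutch_bucket_group_interactive_pri for mostly-blocked groups, and
 * is added on top of the clutch bucket base priority in
 * sched_clutch_bucket_pri_calculate().
 */
#if 0
	/* Interactive case: blocked time dominates CPU usage */
	assert(8 + (8 * (300 - 100)) / 300 == 13);
	/* Non-interactive case: CPU usage dominates blocked time */
	assert((8 * 100) / 300 == 2);
#endif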
1445
1446/*
1447 * sched_clutch_bucket_pri_calculate()
1448 *
1449 * The priority calculation algorithm for the clutch_bucket is a slight
1450 * modification on the ULE interactivity score. It uses the base priority
1451 * of the clutch bucket and applies an interactivity score boost to the
1452 * highly responsive clutch buckets.
1453 */
1454static uint8_t
1455sched_clutch_bucket_pri_calculate(
1456 sched_clutch_bucket_t clutch_bucket,
1457 uint64_t timestamp)
1458{
1459 /* For empty clutch buckets, return priority 0 */
1460 if (clutch_bucket->scb_thr_count == 0) {
1461 return 0;
1462 }
1463
1464 uint8_t base_pri = sched_clutch_bucket_base_pri(clutch_bucket);
1465 uint8_t interactive_score = sched_clutch_bucket_group_interactivity_score_calculate(clutch_bucket->scb_group, timestamp);
1466
1467 assert(((uint64_t)base_pri + interactive_score) <= UINT8_MAX);
1468 uint8_t pri = base_pri + interactive_score;
1469 if (pri != clutch_bucket->scb_priority) {
1470 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_PRI) | DBG_FUNC_NONE,
1471 thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, pri, interactive_score, 0);
1472 }
1473 return pri;
1474}
1475
1476/*
1477 * sched_clutch_root_bucket_highest_clutch_bucket()
1478 *
1479 * Routine to find the highest priority clutch bucket
1480 * within the root bucket.
1481 */
1482static sched_clutch_bucket_t
1483sched_clutch_root_bucket_highest_clutch_bucket(
1484 sched_clutch_root_bucket_t root_bucket)
1485{
1486	if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1487 return NULL;
1488 }
1489	return sched_clutch_bucket_runq_peek(&root_bucket->scrb_clutch_buckets);
1490}
1491
1492/*
1493 * sched_clutch_bucket_runnable()
1494 *
1495 * Perform all operations needed when a new clutch bucket becomes runnable.
1496 * It involves inserting the clutch_bucket into the hierarchy and updating the
1497 * root priority appropriately.
1498 */
1499static boolean_t
1500sched_clutch_bucket_runnable(
1501 sched_clutch_bucket_t clutch_bucket,
1502 sched_clutch_root_t root_clutch,
1503 uint64_t timestamp,
1504 sched_clutch_bucket_options_t options)
1505{
1506 sched_clutch_hierarchy_locked_assert(root_clutch);
1507 /* Since the clutch bucket became newly runnable, update its pending timestamp */
1508 clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1509	sched_clutch_bucket_hierarchy_insert(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
1510
1511 /* Update the timesharing properties of this clutch_bucket; also done every sched_tick */
1512 sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, timestamp);
1513 int16_t root_old_pri = root_clutch->scr_priority;
1514 sched_clutch_root_pri_update(root_clutch);
1515 return root_clutch->scr_priority > root_old_pri;
1516}
1517
1518/*
1519 * sched_clutch_bucket_update()
1520 *
1521 * Update the clutch_bucket's position in the hierarchy. This routine is
1522 * called when a new thread is inserted or removed from a runnable clutch
1523 * bucket. The options specify some properties about the clutch bucket
1524 * insertion order into the clutch bucket runq.
1525 */
1526static boolean_t
1527sched_clutch_bucket_update(
1528 sched_clutch_bucket_t clutch_bucket,
1529 sched_clutch_root_t root_clutch,
1530 uint64_t timestamp,
1531 sched_clutch_bucket_options_t options)
1532{
1533 sched_clutch_hierarchy_locked_assert(root_clutch);
1534 uint64_t new_pri = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1535 sched_clutch_bucket_runq_t bucket_runq = &root_clutch->scr_unbound_buckets[clutch_bucket->scb_bucket].scrb_clutch_buckets;
1536 if (new_pri == clutch_bucket->scb_priority) {
1537 /*
1538 * If SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR is specified, move the clutch bucket
1539 * to the end of the runq. Typically used when a thread is selected for execution
1540 * from a clutch bucket.
1541 */
1542 if (options & SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR) {
1543			sched_clutch_bucket_runq_rotate(bucket_runq, clutch_bucket);
1544 }
1545 return false;
1546 }
1547	sched_clutch_bucket_runq_remove(bucket_runq, clutch_bucket);
1548#if CONFIG_SCHED_EDGE
1549 if (clutch_bucket->scb_foreign) {
1550 priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1551 }
1552#endif /* CONFIG_SCHED_EDGE */
1553 clutch_bucket->scb_priority = new_pri;
1554#if CONFIG_SCHED_EDGE
1555 if (clutch_bucket->scb_foreign) {
1556 priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
1557 priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1558 }
1559#endif /* CONFIG_SCHED_EDGE */
1560	sched_clutch_bucket_runq_enqueue(bucket_runq, clutch_bucket, options);
1561
1562 int16_t root_old_pri = root_clutch->scr_priority;
1563 sched_clutch_root_pri_update(root_clutch);
1564 return root_clutch->scr_priority > root_old_pri;
1565}
1566
1567/*
1568 * sched_clutch_bucket_empty()
1569 *
1570 * Perform all the operations needed when a clutch_bucket is no longer runnable.
1571 * It involves removing the clutch bucket from the hierarchy and updating the root
1572 * priority appropriately.
1573 */
1574static void
1575sched_clutch_bucket_empty(
1576 sched_clutch_bucket_t clutch_bucket,
1577 sched_clutch_root_t root_clutch,
1578 uint64_t timestamp,
1579 sched_clutch_bucket_options_t options)
1580{
1581 sched_clutch_hierarchy_locked_assert(root_clutch);
1582	sched_clutch_bucket_hierarchy_remove(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
1583 clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1584 sched_clutch_root_pri_update(root_clutch);
1585}
1586
1587/*
1588 * sched_clutch_cpu_usage_update()
1589 *
1590 * Routine to update CPU usage of the thread in the hierarchy.
1591 */
1592void
1593sched_clutch_cpu_usage_update(
1594 thread_t thread,
1595 uint64_t delta)
1596{
1597 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread) || SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1598 return;
1599 }
1600
1601 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1602 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
1603 sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, delta);
1604}
1605
1606/*
1607 * sched_clutch_bucket_group_cpu_usage_update()
1608 *
1609 * Routine to update the CPU usage of the clutch_bucket.
1610 */
1611static void
1612sched_clutch_bucket_group_cpu_usage_update(
1613 sched_clutch_bucket_group_t clutch_bucket_group,
1614 uint64_t delta)
1615{
1616 if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
1617		/* The Above UI bucket always has the maximum interactivity score, so there is nothing to do here */
1618 return;
1619 }
1620 delta = MIN(delta, sched_clutch_bucket_group_adjust_threshold);
1621 os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_used), (clutch_cpu_data_t)delta, relaxed);
1622}
1623
1624/*
1625 * sched_clutch_bucket_group_cpu_pending_adjust()
1626 *
1627 * Routine to calculate the adjusted CPU usage value based on the pending intervals. The calculation is done
1628 * such that one "pending interval" provides one point improvement in interactivity score.
1629 */
1630static inline uint64_t
1631sched_clutch_bucket_group_cpu_pending_adjust(
1632 uint64_t cpu_used,
1633 uint64_t cpu_blocked,
1634 uint8_t pending_intervals)
1635{
1636 uint64_t cpu_used_adjusted = 0;
1637 if (cpu_blocked < cpu_used) {
1638 cpu_used_adjusted = (sched_clutch_bucket_group_interactive_pri * cpu_blocked * cpu_used);
1639 cpu_used_adjusted = cpu_used_adjusted / ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) + (cpu_used * pending_intervals));
1640 } else {
1641 uint64_t adjust_factor = (cpu_blocked * pending_intervals) / sched_clutch_bucket_group_interactive_pri;
1642 cpu_used_adjusted = (adjust_factor > cpu_used) ? 0 : (cpu_used - adjust_factor);
1643 }
1644 return cpu_used_adjusted;
1645}
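
/*
 * Worked example (hypothetical numbers, assuming
 * sched_clutch_bucket_group_interactive_pri == 8): a CPU-bound group with
 * cpu_used = 400 and cpu_blocked = 100 that has been pending for 2 intervals gets
 * its usage adjusted to (8 * 100 * 400) / ((8 * 100) + (400 * 2)) = 200, which
 * moves its interactivity score from (8 * 100) / 400 = 2 to (8 * 100) / 200 = 4,
 * i.e. roughly one point of improvement per pending interval.
 */
#if 0
	assert(sched_clutch_bucket_group_cpu_pending_adjust(400, 100, 2) == 200);
#endif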
1646
1647/*
1648 * sched_clutch_bucket_group_cpu_adjust()
1649 *
1650 * Routine to scale the cpu usage and blocked time once the sum gets bigger
1651 * than sched_clutch_bucket_group_adjust_threshold. Allows the values to remain
1652 * manageable and maintain the same ratio while allowing clutch buckets to
1653 * adjust behavior and reflect in the interactivity score in a reasonable
1654 * amount of time. Also adjusts the CPU usage based on pending_intervals
1655 * which allows ageout of CPU to avoid starvation in highly contended scenarios.
1656 */
1657static void
1658sched_clutch_bucket_group_cpu_adjust(
1659 sched_clutch_bucket_group_t clutch_bucket_group,
1660 uint8_t pending_intervals)
1661{
1662 sched_clutch_bucket_cpu_data_t old_cpu_data = {};
1663 sched_clutch_bucket_cpu_data_t new_cpu_data = {};
1664 os_atomic_rmw_loop(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, old_cpu_data.scbcd_cpu_data_packed, new_cpu_data.scbcd_cpu_data_packed, relaxed, {
1665 clutch_cpu_data_t cpu_used = old_cpu_data.cpu_data.scbcd_cpu_used;
1666 clutch_cpu_data_t cpu_blocked = old_cpu_data.cpu_data.scbcd_cpu_blocked;
1667
1668 if ((pending_intervals == 0) && (cpu_used + cpu_blocked) < sched_clutch_bucket_group_adjust_threshold) {
1669 /* No changes to the CPU used and blocked values */
1670 os_atomic_rmw_loop_give_up();
1671 }
1672 if ((cpu_used + cpu_blocked) >= sched_clutch_bucket_group_adjust_threshold) {
1673 /* Only keep the recent CPU history to better indicate how this TG has been behaving */
1674 cpu_used = cpu_used / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
1675 cpu_blocked = cpu_blocked / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
1676 }
1677 /* Use the shift passed in to ageout the CPU usage */
1678 cpu_used = (clutch_cpu_data_t)sched_clutch_bucket_group_cpu_pending_adjust(cpu_used, cpu_blocked, pending_intervals);
1679 new_cpu_data.cpu_data.scbcd_cpu_used = cpu_used;
1680 new_cpu_data.cpu_data.scbcd_cpu_blocked = cpu_blocked;
1681 });
1682}
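
/*
 * Note on the scaling step above: dividing cpu_used and cpu_blocked by the same
 * SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO preserves their ratio, so the
 * interactivity score derived from them is (modulo integer truncation) unchanged;
 * only the window of history they represent shrinks. For example, with a
 * hypothetical interactive pri of 8, both (8 * 2000) / 6000 and (8 * 1000) / 3000
 * evaluate to 2.
 */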
1683
1684/*
1685 * Thread level scheduling algorithm
1686 *
1687 * The thread level scheduling algorithm uses the mach timeshare
1688 * decay based algorithm to achieve sharing between threads within the
1689 * same clutch bucket. The load/priority shifts etc. are all maintained
1690 * at the clutch bucket level and used for decay calculation of the
1691 * threads. The load sampling is still driven off the scheduler tick
1692 * for runnable clutch buckets (it does not use the new higher frequency
1693 * EWMA based load calculation). The idea is that the contention and load
1694 * within clutch_buckets should be limited enough to not see heavy decay
1695 * and timeshare effectively.
1696 */
1697
1698/*
1699 * sched_clutch_thread_run_bucket_incr() / sched_clutch_run_bucket_incr()
1700 *
1701 * Increment the run count for the clutch bucket associated with the
1702 * thread.
1703 */
1704uint32_t
1705sched_clutch_thread_run_bucket_incr(
1706 thread_t thread,
1707 sched_bucket_t bucket)
1708{
1709 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1710 return 0;
1711 }
1712 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1713 return sched_clutch_run_bucket_incr(clutch, bucket);
1714}
1715
1716static uint32_t
1717sched_clutch_run_bucket_incr(
1718 sched_clutch_t clutch,
1719 sched_bucket_t bucket)
1720{
1721 assert(bucket != TH_BUCKET_RUN);
1722 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
1723 return sched_clutch_bucket_group_run_count_inc(clutch_bucket_group);
1724}
1725
1726/*
1727 * sched_clutch_thread_run_bucket_decr() / sched_clutch_run_bucket_decr()
1728 *
1729 * Decrement the run count for the clutch bucket associated with the
1730 * thread.
1731 */
1732uint32_t
1733sched_clutch_thread_run_bucket_decr(
1734 thread_t thread,
1735 sched_bucket_t bucket)
1736{
1737 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1738 return 0;
1739 }
1740 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1741 return sched_clutch_run_bucket_decr(clutch, bucket);
1742}
1743
1744static uint32_t
1745sched_clutch_run_bucket_decr(
1746 sched_clutch_t clutch,
1747 sched_bucket_t bucket)
1748{
1749 assert(bucket != TH_BUCKET_RUN);
1750 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
1751 return sched_clutch_bucket_group_run_count_dec(clutch_bucket_group);
1752}
1753
1754/*
1755 * sched_clutch_bucket_group_timeshare_update()
1756 *
1757 * Routine to update the load and priority shift for the clutch_bucket_group
1758 * every sched_tick. For multi-cluster platforms, each QoS level will have multiple
1759 * clutch buckets with runnable threads in them. So it is important to maintain
1760 * the timesharing information at the clutch_bucket_group level instead of
1761 * individual clutch buckets (because the algorithm is trying to timeshare all
1762 * threads at the same QoS irrespective of which hierarchy they are enqueued in).
1763 *
1764 * The routine is called from the sched tick handling code to make sure this value
1765 * is updated at least once every sched tick. For clutch bucket groups which have
1766 * not been runnable for very long, the clutch_bucket_group maintains a "last
1767 * updated schedtick" parameter. As threads become runnable in the clutch bucket group,
1768 * if this value is outdated, the load and shifts are updated.
1769 *
1770 * Possible optimization:
1771 * - The current algorithm samples the load every sched tick (125ms).
1772 * This is prone to spikes in runnable counts; if that turns out to be
1773 * a problem, a simple solution would be to do the EWMA trick to sample
1774 * load at every load_tick (30ms) and use the averaged value for the pri
1775 * shift calculation.
1776 */
1777static void
1778sched_clutch_bucket_group_timeshare_update(
1779 sched_clutch_bucket_group_t clutch_bucket_group,
1780 sched_clutch_bucket_t clutch_bucket,
1781 uint64_t ctime)
1782{
1783 if (clutch_bucket_group->scbg_bucket < TH_BUCKET_SHARE_FG) {
1784 /* No timesharing needed for fixed priority Above UI threads */
1785 return;
1786 }
1787
1788 /*
1789 * Update the timeshare parameters for the clutch bucket group
1790	 * if they haven't been updated in this tick.
1791 */
1792 uint32_t sched_ts = os_atomic_load(&clutch_bucket_group->scbg_timeshare_tick, relaxed);
1793 uint32_t current_sched_ts = sched_tick;
1794 if (sched_ts < current_sched_ts) {
1795 os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, current_sched_ts, relaxed);
1796 /* NCPU wide workloads should not experience decay */
1797 uint64_t bucket_group_run_count = os_atomic_load_wide(&clutch_bucket_group->scbg_blocked_data.scct_count, relaxed) - 1;
1798 uint32_t bucket_group_load = (uint32_t)(bucket_group_run_count / processor_avail_count);
1799 bucket_group_load = MIN(bucket_group_load, NRQS - 1);
1800 uint32_t pri_shift = sched_fixed_shift - sched_load_shifts[bucket_group_load];
1801 /* Ensure that the pri_shift value is reasonable */
1802 pri_shift = (pri_shift > SCHED_PRI_SHIFT_MAX) ? INT8_MAX : pri_shift;
1803 os_atomic_store(&clutch_bucket_group->scbg_pri_shift, pri_shift, relaxed);
1804 }
1805
1806 /*
1807 * Update the clutch bucket priority; this allows clutch buckets that have been pending
1808 * for a long time to get an updated interactivity score.
1809 */
1810	sched_clutch_bucket_update(clutch_bucket, clutch_bucket->scb_root, ctime, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
1811}
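
/*
 * Worked example for the load/pri_shift update above (hypothetical numbers): with
 * 9 runnable/running threads in the clutch bucket group on a system where
 * processor_avail_count == 8, bucket_group_load = (9 - 1) / 8 = 1. Higher loads
 * index further into sched_load_shifts[], which yields a smaller pri_shift and
 * therefore a larger timeshare decay (sched_usage >> pri_shift) for threads in
 * this group; a load of 0 (at most NCPU-wide) typically ends up clamped to
 * INT8_MAX, i.e. no decay, matching the intent of the comment above.
 */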
1812
1813/*
1814 * sched_clutch_thread_clutch_update()
1815 *
1816 * Routine called when the thread changes its thread group. The current
1817 * implementation relies on the fact that the thread group is changed only from
1818 * the context of the thread itself or when the thread is runnable but not in a
1819 * runqueue. Due to this fact, the thread group change causes only counter
1820 * updates in the old & new clutch buckets and no hierarchy changes. The routine
1821 * also attributes the CPU used so far to the old clutch.
1822 */
1823void
1824sched_clutch_thread_clutch_update(
1825 thread_t thread,
1826 sched_clutch_t old_clutch,
1827 sched_clutch_t new_clutch)
1828{
1829 uint32_t cpu_delta;
1830
1831 if (old_clutch) {
1832 assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
1833
1834		sched_clutch_run_bucket_decr(old_clutch, thread->th_sched_bucket);
1835 /*
1836 * Calculate the CPU used by this thread in the old bucket and
1837 * add it to the old clutch bucket. This uses the same CPU usage
1838 * logic as update_priority etc.
1839 */
1840 sched_tick_delta(thread, cpu_delta);
1841 if (thread->pri_shift < INT8_MAX) {
1842 thread->sched_usage += cpu_delta;
1843 }
1844 thread->cpu_delta += cpu_delta;
1845 if (!SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1846 sched_clutch_bucket_group_t clutch_bucket_group = &(old_clutch->sc_clutch_groups[thread->th_sched_bucket]);
1847			sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, cpu_delta);
1848 }
1849 }
1850
1851 if (new_clutch) {
1852		sched_clutch_run_bucket_incr(new_clutch, thread->th_sched_bucket);
1853 }
1854}
1855
1856/* Thread Insertion/Removal/Selection routines */
1857
1858#if CONFIG_SCHED_EDGE
1859
1860/*
1861 * Edge Scheduler Bound Thread Support
1862 *
1863 * The edge scheduler allows threads to be bound to specific clusters. The scheduler
1864 * maintains a separate runq on the clutch root to hold these bound threads. These
1865 * bound threads count towards the root priority and thread count, but are ignored
1866 * for thread migration/steal decisions. Bound threads that are enqueued in the
1867 * separate runq have the th_bound_cluster_enqueued flag set to allow easy
1868 * removal.
1869 *
1870 * Bound Threads Timesharing
1871 * The bound threads share the timesharing properties of the clutch bucket group they are
1872 * part of. They contribute to the load and use priority shifts/decay values from the
1873 * clutch bucket group.
1874 */
1875
1876static boolean_t
1877sched_edge_bound_thread_insert(
1878 sched_clutch_root_t root_clutch,
1879 thread_t thread,
1880 integer_t options)
1881{
1882 /* Update the clutch runnable count and priority */
1883 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
1884 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
1885 if (root_bucket->scrb_bound_thread_runq.count == 0) {
1886 sched_clutch_root_bucket_runnable(root_bucket, root_clutch, mach_absolute_time());
1887 }
1888
1889 assert((thread->th_bound_cluster_enqueued) == false);
1890 run_queue_enqueue(&root_bucket->scrb_bound_thread_runq, thread, options);
1891 thread->th_bound_cluster_enqueued = true;
1892
1893 /* Increment the urgency counter for the root if necessary */
1894 sched_clutch_root_urgency_inc(root_clutch, thread);
1895
1896 int16_t root_old_pri = root_clutch->scr_priority;
1897 sched_clutch_root_pri_update(root_clutch);
1898 return root_clutch->scr_priority > root_old_pri;
1899}
1900
1901static void
1902sched_edge_bound_thread_remove(
1903 sched_clutch_root_t root_clutch,
1904 thread_t thread)
1905{
1906 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
1907 assert((thread->th_bound_cluster_enqueued) == true);
1908 run_queue_remove(&root_bucket->scrb_bound_thread_runq, thread);
1909 thread->th_bound_cluster_enqueued = false;
1910
1911 /* Decrement the urgency counter for the root if necessary */
1912 sched_clutch_root_urgency_dec(root_clutch, thread);
1913
1914 /* Update the clutch runnable count and priority */
1915 sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
1916 if (root_bucket->scrb_bound_thread_runq.count == 0) {
1917 sched_clutch_root_bucket_empty(root_bucket, root_clutch, mach_absolute_time());
1918 }
1919 sched_clutch_root_pri_update(root_clutch);
1920}
1921
1922/*
1923 * Edge Scheduler cluster shared resource threads load balancing
1924 *
1925 * The Edge scheduler attempts to load balance cluster shared resource intensive threads
1926 * across clusters in order to reduce contention on the shared resources. It achieves
1927 * that by maintaining the runnable and running shared resource load on each cluster
1928 * and balancing the load across multiple clusters.
1929 *
1930 * The current implementation for cluster shared resource load balancing looks at
1931 * the per-cluster load at thread runnable time to enqueue the thread in the appropriate
1932 * cluster. The thread is enqueued in the cluster bound runqueue to ensure idle CPUs
1933 * do not steal/rebalance shared resource threads. Some more details for the implementation:
1934 *
1935 * - When threads are tagged as shared resource, they go through the cluster selection logic
1936 * which looks at cluster shared resource loads and picks a cluster accordingly. The thread is
1937 * enqueued in the cluster bound runqueue.
1938 *
1939 * - When the threads start running and call avoid_processor, the load balancing logic will be
1940 * invoked and cause the thread to be sent to a more preferred cluster if one exists and has
1941 * no shared resource load.
1942 *
1943 * - If a CPU in a preferred cluster is going idle and that cluster has no more shared load,
1944 * it will look at running shared resource threads on foreign clusters and actively rebalance them.
1945 *
1946 * - Runnable shared resource threads are not stolen by the preferred cluster CPUs as they
1947 * go idle intentionally.
1948 *
1949 * - One caveat of this design is that if a preferred CPU has already run and finished its shared
1950 * resource thread execution, it will not go out and steal the runnable thread in the non-preferred cluster.
1951 * The rebalancing will happen when the thread actually runs on a non-preferred cluster and one of the
1952 * events listed above happen.
1953 *
1954 * - Also, it currently does not consider other properties such as thread priorities and
1955 * QoS level thread load in the thread placement decision.
1956 *
1957 * Edge Scheduler cluster shared resource thread scheduling policy
1958 *
1959 * The threads for shared resources can be scheduled using one of the two policies:
1960 *
1961 * EDGE_SHARED_RSRC_SCHED_POLICY_RR
1962 * This policy distributes the threads so that they spread across all available clusters
1963 * irrespective of type. The idea is that this scheduling policy will put a shared resource
1964 * thread on each cluster on the platform before it starts doubling up on clusters.
1965 *
1966 * EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST
1967 * This policy distributes threads so that the threads first fill up all the capacity on
1968 * the preferred cluster and its homogeneous peers before spilling to different core type.
1969 * The current implementation defines capacity based on the number of CPUs in the cluster;
1970 * so a cluster's shared resource is considered full if there are "n" runnable + running
1971 * shared resource threads on the cluster with n cpus. This policy is different from the
1972 * default scheduling policy of the edge scheduler since this always tries to fill up the
1973 * native clusters to capacity even when non-native clusters might be idle.
1974 */
1975__options_decl(edge_shared_rsrc_sched_policy_t, uint32_t, {
1976 EDGE_SHARED_RSRC_SCHED_POLICY_RR = 0,
1977 EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST = 1,
1978});
1979
1980const edge_shared_rsrc_sched_policy_t edge_shared_rsrc_policy[CLUSTER_SHARED_RSRC_TYPE_COUNT] = {
1981 [CLUSTER_SHARED_RSRC_TYPE_RR] = EDGE_SHARED_RSRC_SCHED_POLICY_RR,
1982 [CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST] = EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST,
1983};
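
/*
 * Illustrative sketch (not compiled, hypothetical helper): the capacity notion used
 * by EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST described above amounts to
 * comparing a cluster's runnable + running shared resource load against its CPU
 * count; a cluster at or above that load is considered full for spill purposes.
 */
#if 0
static bool
sched_edge_shared_rsrc_cluster_full_example(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type)
{
	/* Treat the cluster as full once there is one shared resource thread per CPU */
	return sched_pset_cluster_shared_rsrc_load(pset, shared_rsrc_type) >= (uint64_t)pset->cpu_set_count;
}
#endif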
1984
1985static void
1986sched_edge_shared_rsrc_runnable_load_incr(sched_clutch_root_t root_clutch, thread_t thread)
1987{
1988 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
1989 root_clutch->scr_shared_rsrc_load_runnable[CLUSTER_SHARED_RSRC_TYPE_RR]++;
1990 thread->th_shared_rsrc_enqueued[CLUSTER_SHARED_RSRC_TYPE_RR] = true;
1991 }
1992 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
1993 root_clutch->scr_shared_rsrc_load_runnable[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST]++;
1994 thread->th_shared_rsrc_enqueued[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST] = true;
1995 }
1996}
1997
1998static void
1999sched_edge_shared_rsrc_runnable_load_decr(sched_clutch_root_t root_clutch, thread_t thread)
2000{
2001 for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
2002 if (thread->th_shared_rsrc_enqueued[shared_rsrc_type]) {
2003 thread->th_shared_rsrc_enqueued[shared_rsrc_type] = false;
2004 root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type]--;
2005 }
2006 }
2007}
2008
2009uint16_t
2010sched_edge_shared_rsrc_runnable_load(sched_clutch_root_t root_clutch, cluster_shared_rsrc_type_t shared_rsrc_type)
2011{
2012 return root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type];
2013}
2014
2015/*
2016 * sched_edge_shared_rsrc_idle()
2017 *
2018 * Routine used to determine if the shared resource for the pset is idle. This is
2019 * used by a CPU going idle to decide if it should rebalance a running shared resource
2020 * thread from a non-preferred cluster.
2021 */
2022static boolean_t
2023sched_edge_shared_rsrc_idle(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type)
2024{
2025 return sched_pset_cluster_shared_rsrc_load(pset, shared_rsrc_type) == 0;
2026}
2027
2028/*
2029 * sched_edge_thread_shared_rsrc_type
2030 *
2031 * This routine decides if a given thread needs special handling for being a
2032 * heavy shared resource user. It is valid for the same thread to be using
2033 * several shared resources at the same time and have multiple policy flags set.
2034 * This routine determines which of those properties will be used for load
2035 * balancing and migration decisions.
2036 */
2037static cluster_shared_rsrc_type_t
2038sched_edge_thread_shared_rsrc_type(thread_t thread)
2039{
2040 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
2041 return CLUSTER_SHARED_RSRC_TYPE_RR;
2042 }
2043 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
2044 return CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST;
2045 }
2046 return CLUSTER_SHARED_RSRC_TYPE_NONE;
2047}
2048
2049#endif /* CONFIG_SCHED_EDGE */
2050
2051/*
2052 * sched_clutch_thread_bound_lookup()
2053 *
2054 * Routine to lookup the highest priority runnable thread in a bounded root bucket.
2055 */
2056static thread_t
2057sched_clutch_thread_bound_lookup(
2058 __unused sched_clutch_root_t root_clutch,
2059 sched_clutch_root_bucket_t root_bucket)
2060{
2061	return run_queue_peek(&root_bucket->scrb_bound_thread_runq);
2062}
2063
2064/*
2065 * Clutch Bucket Group Thread Counts and Pending time calculation
2066 *
2067 * The pending time on the clutch_bucket_group allows the scheduler to track if it
2068 * needs to ageout the CPU usage because the clutch_bucket_group has been pending for
2069 * a very long time. The pending time is set to the timestamp as soon as a thread becomes
2070 * runnable. When a thread is picked up for execution from this clutch_bucket_group, the
2071 * pending time is advanced to the time of thread selection.
2072 *
2073 * Since threads for a clutch bucket group can be added or removed from multiple CPUs
2074 * simultaneously, it is important that the updates to thread counts and pending timestamps
2075 * happen atomically. The implementation relies on the following aspects to make that work
2076 * as expected:
2077 * - The clutch scheduler would be deployed on single cluster platforms where the pset lock
2078 * is held when threads are added/removed and pending timestamps are updated
2079 * - The thread count and pending timestamp can be updated atomically using double wide
2080 * 128 bit atomics
2081 *
2082 * Clutch bucket group interactivity timestamp and score updates also rely on the properties
2083 * above to atomically update the interactivity score for a clutch bucket group.
2084 */
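
/*
 * Minimal user-space analogue of the double-wide update described above
 * (illustrative sketch only, not kernel code): a 128-bit compare-exchange keeps
 * the thread count and the pending timestamp consistent with each other without
 * taking a lock, which is the same pattern the os_atomic_rmw_loop() usage below
 * relies on.
 */
#if 0
#include <stdatomic.h>
#include <stdint.h>

struct counter_time {
	uint64_t count;
	uint64_t timestamp;
};

static void
counter_time_inc(_Atomic struct counter_time *ct, uint64_t now)
{
	struct counter_time oldval = atomic_load_explicit(ct, memory_order_relaxed);
	struct counter_time newval;
	do {
		newval = oldval;
		newval.count = oldval.count + 1;
		if (oldval.count == 0) {
			/* First thread became runnable; start the pending clock */
			newval.timestamp = now;
		}
	} while (!atomic_compare_exchange_weak_explicit(ct, &oldval, newval,
	    memory_order_relaxed, memory_order_relaxed));
}
#endif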
2085
2086#if CONFIG_SCHED_EDGE
2087
2088static void
2089sched_clutch_bucket_group_thr_count_inc(
2090 sched_clutch_bucket_group_t clutch_bucket_group,
2091 uint64_t timestamp)
2092{
2093 sched_clutch_counter_time_t old_pending_data;
2094 sched_clutch_counter_time_t new_pending_data;
2095 os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2096 new_pending_data.scct_count = old_pending_data.scct_count + 1;
2097 new_pending_data.scct_timestamp = old_pending_data.scct_timestamp;
2098 if (old_pending_data.scct_count == 0) {
2099 new_pending_data.scct_timestamp = timestamp;
2100 }
2101 });
2102}
2103
2104static void
2105sched_clutch_bucket_group_thr_count_dec(
2106 sched_clutch_bucket_group_t clutch_bucket_group,
2107 uint64_t timestamp)
2108{
2109 sched_clutch_counter_time_t old_pending_data;
2110 sched_clutch_counter_time_t new_pending_data;
2111 os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2112 new_pending_data.scct_count = old_pending_data.scct_count - 1;
2113 if (new_pending_data.scct_count == 0) {
2114 new_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
2115 } else {
2116 new_pending_data.scct_timestamp = timestamp;
2117 }
2118 });
2119}
2120
2121static uint8_t
2122sched_clutch_bucket_group_pending_ageout(
2123 sched_clutch_bucket_group_t clutch_bucket_group,
2124 uint64_t timestamp)
2125{
2126 int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket);
2127 sched_clutch_counter_time_t old_pending_data;
2128 sched_clutch_counter_time_t new_pending_data;
2129 uint8_t cpu_usage_shift = 0;
2130
2131 os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2132 cpu_usage_shift = 0;
2133 uint64_t old_pending_ts = old_pending_data.scct_timestamp;
2134 bool old_update = (old_pending_ts >= timestamp);
2135 bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID);
2136 bool no_bucket_load = (bucket_load == 0);
2137 if (old_update || no_pending_time || no_bucket_load) {
2138 os_atomic_rmw_loop_give_up();
2139 }
2140
2141 /* Calculate the time the clutch bucket group has been pending */
2142 uint64_t pending_delta = timestamp - old_pending_ts;
2143 uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] * bucket_load;
2144 if (pending_delta < interactivity_delta) {
2145 os_atomic_rmw_loop_give_up();
2146 }
2147 cpu_usage_shift = (pending_delta / interactivity_delta);
2148 new_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta);
2149 new_pending_data.scct_count = old_pending_data.scct_count;
2150 });
2151 return cpu_usage_shift;
2152}
2153
2154static uint8_t
2155sched_clutch_bucket_group_interactivity_score_calculate(
2156 sched_clutch_bucket_group_t clutch_bucket_group,
2157 uint64_t timestamp)
2158{
2159 if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
2160 /*
2161 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
2162 * priorities, make sure all AboveUI buckets are marked interactive.
2163 */
2164 assert(clutch_bucket_group->scbg_interactivity_data.scct_count == (2 * sched_clutch_bucket_group_interactive_pri));
2165 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2166 }
2167 /* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */
2168 uint8_t pending_intervals = sched_clutch_bucket_group_pending_ageout(clutch_bucket_group, timestamp);
2169 /* Adjust CPU stats based on the calculated shift and to make sure only recent behavior is used */
2170 sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group, pending_intervals);
2171 uint8_t interactivity_score = sched_clutch_interactivity_from_cpu_data(clutch_bucket_group);
2172 sched_clutch_counter_time_t old_interactivity_data;
2173 sched_clutch_counter_time_t new_interactivity_data;
2174
2175 bool score_updated = os_atomic_rmw_loop(&clutch_bucket_group->scbg_interactivity_data.scct_packed, old_interactivity_data.scct_packed, new_interactivity_data.scct_packed, relaxed, {
2176 if (old_interactivity_data.scct_timestamp >= timestamp) {
2177 os_atomic_rmw_loop_give_up();
2178 }
2179 new_interactivity_data.scct_timestamp = timestamp;
2180 if (old_interactivity_data.scct_timestamp != 0) {
2181 new_interactivity_data.scct_count = interactivity_score;
2182 }
2183 });
2184 if (score_updated) {
2185 return (uint8_t)new_interactivity_data.scct_count;
2186 } else {
2187 return (uint8_t)old_interactivity_data.scct_count;
2188 }
2189}
2190
2191#else /* CONFIG_SCHED_EDGE */
2192
2193/*
2194 * For the clutch scheduler, atomicity is ensured by making sure all operations
2195 * are happening under the pset lock of the only cluster present on the platform.
2196 */
2197static void
2198sched_clutch_bucket_group_thr_count_inc(
2199 sched_clutch_bucket_group_t clutch_bucket_group,
2200 uint64_t timestamp)
2201{
2202	sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2203 if (clutch_bucket_group->scbg_pending_data.scct_count == 0) {
2204 clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp;
2205 }
2206 clutch_bucket_group->scbg_pending_data.scct_count++;
2207}
2208
2209static void
2210sched_clutch_bucket_group_thr_count_dec(
2211 sched_clutch_bucket_group_t clutch_bucket_group,
2212 uint64_t timestamp)
2213{
2214	sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2215 clutch_bucket_group->scbg_pending_data.scct_count--;
2216 if (clutch_bucket_group->scbg_pending_data.scct_count == 0) {
2217 clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
2218 } else {
2219 clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp;
2220 }
2221}
2222
2223static uint8_t
2224sched_clutch_bucket_group_pending_ageout(
2225 sched_clutch_bucket_group_t clutch_bucket_group,
2226 uint64_t timestamp)
2227{
2228	sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2229	int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket);
2230 uint64_t old_pending_ts = clutch_bucket_group->scbg_pending_data.scct_timestamp;
2231 bool old_update = (old_pending_ts >= timestamp);
2232 bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID);
2233 bool no_bucket_load = (bucket_load == 0);
2234 if (old_update || no_pending_time || no_bucket_load) {
2235 return 0;
2236 }
2237 uint64_t pending_delta = timestamp - old_pending_ts;
2238 uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] * bucket_load;
2239 if (pending_delta < interactivity_delta) {
2240 return 0;
2241 }
2242 uint8_t cpu_usage_shift = (pending_delta / interactivity_delta);
2243 clutch_bucket_group->scbg_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta);
2244 return cpu_usage_shift;
2245}
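
/*
 * Worked example for the ageout above (hypothetical units): if the per-bucket
 * pending delta is 10 time units and the global bucket load is 4, then
 * interactivity_delta = 40. A clutch bucket group that has been pending for 100
 * units yields cpu_usage_shift = 100 / 40 = 2, and its pending timestamp is
 * advanced by 2 * 40 = 80 units so that the remaining 20 units continue to accrue
 * towards the next interval.
 */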
2246
2247static uint8_t
2248sched_clutch_bucket_group_interactivity_score_calculate(
2249 sched_clutch_bucket_group_t clutch_bucket_group,
2250 uint64_t timestamp)
2251{
2252	sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2253 if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
2254 /*
2255 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
2256 * priorities, make sure all AboveUI buckets are marked interactive.
2257 */
2258 assert(clutch_bucket_group->scbg_interactivity_data.scct_count == (2 * sched_clutch_bucket_group_interactive_pri));
2259 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2260 }
2261 /* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */
2262 uint8_t pending_intervals = sched_clutch_bucket_group_pending_ageout(clutch_bucket_group, timestamp);
2263 /* Adjust CPU stats based on the calculated shift and to make sure only recent behavior is used */
2264 sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group, pending_intervals);
2265 uint8_t interactivity_score = sched_clutch_interactivity_from_cpu_data(clutch_bucket_group);
2266 if (timestamp > clutch_bucket_group->scbg_interactivity_data.scct_timestamp) {
2267 clutch_bucket_group->scbg_interactivity_data.scct_count = interactivity_score;
2268 clutch_bucket_group->scbg_interactivity_data.scct_timestamp = timestamp;
2269 return interactivity_score;
2270 } else {
2271 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2272 }
2273}
2274
2275#endif /* CONFIG_SCHED_EDGE */
2276
2277/*
2278 * Clutch Bucket Group Run Count and Blocked Time Accounting
2279 *
2280 * The clutch bucket group maintains the number of runnable/running threads in the group.
2281 * Since the blocked time of the clutch bucket group is based on this count, it is
2282 * important to make sure the blocking timestamp and the run count are updated atomically.
2283 *
2284 * Since the run count increments happen without any pset locks held, the scheduler updates
2285 * the count & timestamp using double wide 128 bit atomics.
2286 */
2287
2288static uint32_t
2289sched_clutch_bucket_group_run_count_inc(
2290 sched_clutch_bucket_group_t clutch_bucket_group)
2291{
2292 sched_clutch_counter_time_t old_blocked_data;
2293 sched_clutch_counter_time_t new_blocked_data;
2294
2295 bool update_blocked_time = false;
2296 os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, {
2297 new_blocked_data.scct_count = old_blocked_data.scct_count + 1;
2298 new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp;
2299 update_blocked_time = false;
2300 if (old_blocked_data.scct_count == 0) {
2301 new_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
2302 update_blocked_time = true;
2303 }
2304 });
2305 if (update_blocked_time && (old_blocked_data.scct_timestamp != SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID)) {
2306 uint64_t ctime = mach_absolute_time();
2307 if (ctime > old_blocked_data.scct_timestamp) {
2308 uint64_t blocked_time = ctime - old_blocked_data.scct_timestamp;
2309 blocked_time = MIN(blocked_time, sched_clutch_bucket_group_adjust_threshold);
2310 os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked), (clutch_cpu_data_t)blocked_time, relaxed);
2311 }
2312 }
2313 return (uint32_t)new_blocked_data.scct_count;
2314}
2315
2316static uint32_t
2317sched_clutch_bucket_group_run_count_dec(
2318 sched_clutch_bucket_group_t clutch_bucket_group)
2319{
2320 sched_clutch_counter_time_t old_blocked_data;
2321 sched_clutch_counter_time_t new_blocked_data;
2322
2323 uint64_t ctime = mach_absolute_time();
2324 os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, {
2325 new_blocked_data.scct_count = old_blocked_data.scct_count - 1;
2326 new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp;
2327 if (new_blocked_data.scct_count == 0) {
2328 new_blocked_data.scct_timestamp = ctime;
2329 }
2330 });
2331 return (uint32_t)new_blocked_data.scct_count;
2332}
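
/*
 * Example timeline for the accounting above (hypothetical timestamps): when the
 * last runnable/running thread of a group blocks at t = 100, the run count drops
 * 1 -> 0 and scct_timestamp records 100. When a thread in the group becomes
 * runnable again at t = 160, the count goes 0 -> 1 and the 60 units of blocked
 * time are credited to scbcd_cpu_blocked (capped at
 * sched_clutch_bucket_group_adjust_threshold), raising the group's interactivity
 * score on the next calculation.
 */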
2333
2334/*
2335 * sched_clutch_thread_insert()
2336 *
2337 * Routine to insert a thread into the sched clutch hierarchy.
2338 * Update the counts at all levels of the hierarchy and insert the nodes
2339 * as they become runnable. Always called with the pset lock held.
2340 */
2341static boolean_t
2342sched_clutch_thread_insert(
2343 sched_clutch_root_t root_clutch,
2344 thread_t thread,
2345 integer_t options)
2346{
2347 boolean_t result = FALSE;
2348
2349 sched_clutch_hierarchy_locked_assert(root_clutch);
2350#if CONFIG_SCHED_EDGE
2351 sched_edge_cluster_cumulative_count_incr(root_clutch, thread->th_sched_bucket);
2352 sched_edge_shared_rsrc_runnable_load_incr(root_clutch, thread);
2353 /*
2354 * Check if the thread is bound and is being enqueued in its desired bound cluster.
2355 * One scenario where a bound thread might not be getting enqueued in the bound cluster
2356 * hierarchy would be if the thread is "soft" bound and the bound cluster is
2357 * de-recommended. In that case, the thread should be treated as an unbound
2358 * thread.
2359 */
2360 if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) && (sched_edge_thread_bound_cluster_id(thread) == root_clutch->scr_cluster_id)) {
2361 return sched_edge_bound_thread_insert(root_clutch, thread, options);
2362 }
2363 /*
2364 * Use bound runqueue for shared resource threads. See "cluster shared resource
2365 * threads load balancing" section for details.
2366 */
2367 if (sched_edge_thread_shared_rsrc_type(thread) != CLUSTER_SHARED_RSRC_TYPE_NONE) {
2368 return sched_edge_bound_thread_insert(root_clutch, thread, options);
2369 }
2370
2371#endif /* CONFIG_SCHED_EDGE */
2372 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2373 assert(thread->thread_group == clutch->sc_tg);
2374
2375 uint64_t current_timestamp = mach_absolute_time();
2376 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
2377 sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
2378 assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch));
2379
2380 /*
2381 * Thread linkage in clutch_bucket
2382 *
2383 * A thread has a few linkages within the clutch bucket:
2384 * - A stable priority queue linkage which is the main runqueue (based on sched_pri) for the clutch bucket
2385 * - A regular priority queue linkage which is based on thread's base/promoted pri (used for clutch bucket priority calculation)
2386 * - A queue linkage used for timesharing operations of threads at the scheduler tick
2387 */
2388
2389 /* Insert thread into the clutch_bucket stable priority runqueue using sched_pri */
2390 thread->th_clutch_runq_link.stamp = current_timestamp;
2391 priority_queue_entry_set_sched_pri(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link, thread->sched_pri,
2392 (options & SCHED_TAILQ) ? PRIORITY_QUEUE_ENTRY_NONE : PRIORITY_QUEUE_ENTRY_PREEMPTED);
2393	priority_queue_insert(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
2394
2395 /* Insert thread into clutch_bucket priority queue based on the promoted or base priority */
2396 priority_queue_entry_set_sched_pri(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link,
2397 sched_thread_sched_pri_promoted(thread) ? thread->sched_pri : thread->base_pri, false);
2398	priority_queue_insert(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);
2399
2400 /* Insert thread into timesharing queue of the clutch bucket */
2401	enqueue_tail(&clutch_bucket->scb_thread_timeshare_queue, &thread->th_clutch_timeshare_link);
2402
2403 /* Increment the urgency counter for the root if necessary */
2404 sched_clutch_root_urgency_inc(root_clutch, thread);
2405
2406 os_atomic_inc(&clutch->sc_thr_count, relaxed);
2407	sched_clutch_bucket_group_thr_count_inc(clutch_bucket->scb_group, current_timestamp);
2408
2409 /* Enqueue the clutch into the hierarchy (if needed) and update properties; pick the insertion order based on thread options */
2410 sched_clutch_bucket_options_t scb_options = (options & SCHED_HEADQ) ? SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ : SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ;
2411 if (clutch_bucket->scb_thr_count == 0) {
2412		sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
2413		sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2414		result = sched_clutch_bucket_runnable(clutch_bucket, root_clutch, current_timestamp, scb_options);
2415 } else {
2416		sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
2417		sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2418		result = sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, scb_options);
2419 }
2420
2421 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
2422 root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
2423 SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
2424 return result;
2425}
2426
2427/*
2428 * sched_clutch_thread_remove()
2429 *
2430 * Routine to remove a thread from the sched clutch hierarchy.
2431 * Update the counts at all levels of the hierarchy and remove the nodes
2432 * as they become empty. Always called with the pset lock held.
2433 */
2434static void
2435sched_clutch_thread_remove(
2436 sched_clutch_root_t root_clutch,
2437 thread_t thread,
2438 uint64_t current_timestamp,
2439 sched_clutch_bucket_options_t options)
2440{
2441 sched_clutch_hierarchy_locked_assert(root_clutch);
2442#if CONFIG_SCHED_EDGE
2443 sched_edge_cluster_cumulative_count_decr(root_clutch, thread->th_sched_bucket);
2444 sched_edge_shared_rsrc_runnable_load_decr(root_clutch, thread);
2445
2446 if (thread->th_bound_cluster_enqueued) {
2447 sched_edge_bound_thread_remove(root_clutch, thread);
2448 return;
2449 }
2450#endif /* CONFIG_SCHED_EDGE */
2451 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2452 assert(thread->thread_group == clutch->sc_tg);
2453 thread_assert_runq_nonnull(thread);
2454
2455 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
2456 sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
2457 assert(clutch_bucket->scb_root == root_clutch);
2458
2459 /* Decrement the urgency counter for the root if necessary */
2460 sched_clutch_root_urgency_dec(root_clutch, thread);
2461 /* Remove thread from the clutch_bucket */
2462	priority_queue_remove(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
2463	remqueue(&thread->th_clutch_timeshare_link);
2464
2465	priority_queue_remove(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);
2466
2467 /*
2468 * Warning: After this point, the thread's scheduling fields may be
2469 * modified by other cores that acquire the thread lock.
2470 */
2471 thread_clear_runq(thread);
2472
2473 /* Update counts at various levels of the hierarchy */
2474 os_atomic_dec(&clutch->sc_thr_count, relaxed);
2475	sched_clutch_bucket_group_thr_count_dec(clutch_bucket->scb_group, current_timestamp);
2476	sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
2477	sched_clutch_thr_count_dec(&clutch_bucket->scb_thr_count);
2478
2479 /* Remove the clutch from hierarchy (if needed) and update properties */
2480 if (clutch_bucket->scb_thr_count == 0) {
2481		sched_clutch_bucket_empty(clutch_bucket, root_clutch, current_timestamp, options);
2482 } else {
2483		sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, options);
2484 }
2485
2486 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
2487 root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
2488 SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
2489}
2490
2491/*
2492 * sched_clutch_thread_unbound_lookup()
2493 *
2494 * Routine to find the highest unbound thread in the root clutch.
2495 * Helps find threads easily for steal/migrate scenarios in the
2496 * Edge scheduler.
2497 */
2498static thread_t
2499sched_clutch_thread_unbound_lookup(
2500 sched_clutch_root_t root_clutch,
2501 sched_clutch_root_bucket_t root_bucket)
2502{
2503 sched_clutch_hierarchy_locked_assert(root_clutch);
2504
2505 /* Find the highest priority clutch bucket in this root bucket */
2506 sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket);
2507 assert(clutch_bucket != NULL);
2508
2509 /* Find the highest priority runnable thread in this clutch bucket */
2510 thread_t thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
2511 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE,
2512 thread_tid(thread), thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, 0, 0);
2513 return thread;
2514}
2515
2516/*
2517 * sched_clutch_thread_highest_remove()
2518 *
2519 * Routine to find and remove the highest priority thread
2520 * from the sched clutch hierarchy. The algorithm looks at the
2521 * hierarchy for the most eligible runnable thread and calls
2522 * sched_clutch_thread_remove(). Always called with the
2523 * pset lock held.
2524 */
2525static thread_t
2526sched_clutch_thread_highest_remove(
2527 sched_clutch_root_t root_clutch)
2528{
2529 sched_clutch_hierarchy_locked_assert(root_clutch);
2530 uint64_t current_timestamp = mach_absolute_time();
2531
2532	sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(root_clutch, current_timestamp, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
2533 if (root_bucket == NULL) {
2534 return THREAD_NULL;
2535 }
2536
2537 thread_t highest_thread = THREAD_NULL;
2538 if (root_bucket->scrb_bound) {
2539 highest_thread = sched_clutch_thread_bound_lookup(root_clutch, root_bucket);
2540 } else {
2541 highest_thread = sched_clutch_thread_unbound_lookup(root_clutch, root_bucket);
2542 }
2543	sched_clutch_thread_remove(root_clutch, highest_thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
2544 return highest_thread;
2545}
2546
2547/* High level global accessor routines */
2548
2549/*
2550 * sched_clutch_root_urgency()
2551 *
2552 * Routine to get the urgency of the highest runnable
2553 * thread in the hierarchy.
2554 */
2555static uint32_t
2556sched_clutch_root_urgency(
2557 sched_clutch_root_t root_clutch)
2558{
2559 return root_clutch->scr_urgency;
2560}
2561
2562/*
2563 * sched_clutch_root_count_sum()
2564 *
2565 * The count_sum mechanism is used for scheduler runq
2566 * statistics calculation. It's only useful for debugging
2567 * purposes; since it takes a mach_absolute_time() on
2568 * other scheduler implementations, it's better to avoid
2569 * populating this until absolutely necessary.
2570 */
2571static uint32_t
2572sched_clutch_root_count_sum(
2573 __unused sched_clutch_root_t root_clutch)
2574{
2575 return 0;
2576}
2577
2578/*
2579 * sched_clutch_root_priority()
2580 *
2581 * Routine to get the priority of the highest runnable
2582 * thread in the hierarchy.
2583 */
2584static int
2585sched_clutch_root_priority(
2586 sched_clutch_root_t root_clutch)
2587{
2588 return root_clutch->scr_priority;
2589}
2590
2591/*
2592 * sched_clutch_root_count()
2593 *
2594 * Returns total number of runnable threads in the hierarchy.
2595 */
2596uint32_t
2597sched_clutch_root_count(
2598 sched_clutch_root_t root_clutch)
2599{
2600 return root_clutch->scr_thr_count;
2601}
2602
2603#if CONFIG_SCHED_EDGE
2604
2605/*
2606 * sched_clutch_root_foreign_empty()
2607 *
2608 * Routine to check if the foreign clutch bucket priority list is empty for a cluster.
2609 */
2610static boolean_t
2611sched_clutch_root_foreign_empty(
2612 sched_clutch_root_t root_clutch)
2613{
2614 return priority_queue_empty(&root_clutch->scr_foreign_buckets);
2615}
2616
2617/*
2618 * sched_clutch_root_highest_foreign_thread_remove()
2619 *
2620 * Routine to remove and return the highest priority thread from the highest priority foreign clutch bucket in a cluster.
2621 * Must be called with the pset for the cluster locked.
2622 */
2623static thread_t
2624sched_clutch_root_highest_foreign_thread_remove(
2625 sched_clutch_root_t root_clutch)
2626{
2627 thread_t thread = THREAD_NULL;
2628 if (priority_queue_empty(&root_clutch->scr_foreign_buckets)) {
2629 return thread;
2630 }
2631 sched_clutch_bucket_t clutch_bucket = priority_queue_max(&root_clutch->scr_foreign_buckets, struct sched_clutch_bucket, scb_foreignlink);
2632 thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
2633 sched_clutch_thread_remove(root_clutch, thread, mach_absolute_time(), 0);
2634 return thread;
2635}
2636
2637#endif /* CONFIG_SCHED_EDGE */
2638
2639/*
2640 * sched_clutch_thread_pri_shift()
2641 *
2642 * Routine to get the priority shift value for a thread.
2643 * Since the timesharing is done at the clutch_bucket level,
2644 * this routine gets the clutch_bucket and retrieves the
2645 * values from there.
2646 */
2647uint32_t
2648sched_clutch_thread_pri_shift(
2649 thread_t thread,
2650 sched_bucket_t bucket)
2651{
2652 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
2653 return INT8_MAX;
2654 }
2655 assert(bucket != TH_BUCKET_RUN);
2656 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2657 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
2658 return os_atomic_load(&clutch_bucket_group->scbg_pri_shift, relaxed);
2659}
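/*
 * Illustrative sketch of how the pri_shift returned above is typically consumed:
 * the classic Mach timeshare decay lowers a thread's scheduled priority as its
 * CPU usage grows, scaled by pri_shift. The helper name and the exact clamping
 * below are assumptions for illustration only, not the kernel's actual priority
 * computation; note that returning INT8_MAX above effectively disables decay.
 */
#if 0
static int
example_timeshare_pri(int base_pri, uint64_t sched_usage, uint32_t pri_shift)
{
	if (pri_shift >= INT8_MAX) {
		/* Ineligible threads are not decayed based on usage */
		return base_pri;
	}
	int decay = (int)(sched_usage >> pri_shift);
	int pri = base_pri - decay;
	return (pri < MINPRI_USER) ? MINPRI_USER : pri;
}
#endif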
2660
2661#pragma mark -- Clutch Scheduler Algorithm
2662
2663static void
2664sched_clutch_init(void);
2665
2666static thread_t
2667sched_clutch_steal_thread(processor_set_t pset);
2668
2669static void
2670sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context);
2671
2672static boolean_t
2673sched_clutch_processor_enqueue(processor_t processor, thread_t thread,
2674 sched_options_t options);
2675
2676static boolean_t
2677sched_clutch_processor_queue_remove(processor_t processor, thread_t thread);
2678
2679static ast_t
2680sched_clutch_processor_csw_check(processor_t processor);
2681
2682static boolean_t
2683sched_clutch_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
2684
2685static int
2686sched_clutch_runq_count(processor_t processor);
2687
2688static boolean_t
2689sched_clutch_processor_queue_empty(processor_t processor);
2690
2691static uint64_t
2692sched_clutch_runq_stats_count_sum(processor_t processor);
2693
2694static int
2695sched_clutch_processor_bound_count(processor_t processor);
2696
2697static void
2698sched_clutch_pset_init(processor_set_t pset);
2699
2700static void
2701sched_clutch_processor_init(processor_t processor);
2702
2703static thread_t
2704sched_clutch_choose_thread(processor_t processor, int priority, ast_t reason);
2705
2706static void
2707sched_clutch_processor_queue_shutdown(processor_t processor);
2708
2709static sched_mode_t
2710sched_clutch_initial_thread_sched_mode(task_t parent_task);
2711
2712static uint32_t
2713sched_clutch_initial_quantum_size(thread_t thread);
2714
2715static bool
2716sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread, __unused ast_t reason);
2717
2718static uint32_t
2719sched_clutch_run_incr(thread_t thread);
2720
2721static uint32_t
2722sched_clutch_run_decr(thread_t thread);
2723
2724static void
2725sched_clutch_update_thread_bucket(thread_t thread);
2726
2727static void
2728sched_clutch_thread_group_recommendation_change(struct thread_group *tg, cluster_type_t new_recommendation);
2729
2730const struct sched_dispatch_table sched_clutch_dispatch = {
2731 .sched_name = "clutch",
2732 .init = sched_clutch_init,
2733 .timebase_init = sched_timeshare_timebase_init,
2734 .processor_init = sched_clutch_processor_init,
2735 .pset_init = sched_clutch_pset_init,
2736 .maintenance_continuation = sched_timeshare_maintenance_continue,
2737 .choose_thread = sched_clutch_choose_thread,
2738 .steal_thread_enabled = sched_steal_thread_enabled,
2739 .steal_thread = sched_clutch_steal_thread,
2740 .compute_timeshare_priority = sched_compute_timeshare_priority,
2741 .choose_node = sched_choose_node,
2742 .choose_processor = choose_processor,
2743 .processor_enqueue = sched_clutch_processor_enqueue,
2744 .processor_queue_shutdown = sched_clutch_processor_queue_shutdown,
2745 .processor_queue_remove = sched_clutch_processor_queue_remove,
2746 .processor_queue_empty = sched_clutch_processor_queue_empty,
2747 .priority_is_urgent = priority_is_urgent,
2748 .processor_csw_check = sched_clutch_processor_csw_check,
2749 .processor_queue_has_priority = sched_clutch_processor_queue_has_priority,
2750 .initial_quantum_size = sched_clutch_initial_quantum_size,
2751 .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode,
2752 .can_update_priority = can_update_priority,
2753 .update_priority = update_priority,
2754 .lightweight_update_priority = lightweight_update_priority,
2755 .quantum_expire = sched_default_quantum_expire,
2756 .processor_runq_count = sched_clutch_runq_count,
2757 .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum,
2758 .processor_bound_count = sched_clutch_processor_bound_count,
2759 .thread_update_scan = sched_clutch_thread_update_scan,
2760 .multiple_psets_enabled = TRUE,
2761 .sched_groups_enabled = FALSE,
2762 .avoid_processor_enabled = TRUE,
2763 .thread_avoid_processor = sched_clutch_thread_avoid_processor,
2764 .processor_balance = sched_SMT_balance,
2765
2766 .rt_runq = sched_rtlocal_runq,
2767 .rt_init = sched_rtlocal_init,
2768 .rt_queue_shutdown = sched_rtlocal_queue_shutdown,
2769 .rt_runq_scan = sched_rtlocal_runq_scan,
2770 .rt_runq_count_sum = sched_rtlocal_runq_count_sum,
2771 .rt_steal_thread = sched_rtlocal_steal_thread,
2772
2773 .qos_max_parallelism = sched_qos_max_parallelism,
2774 .check_spill = sched_check_spill,
2775 .ipi_policy = sched_ipi_policy,
2776 .thread_should_yield = sched_thread_should_yield,
2777 .run_count_incr = sched_clutch_run_incr,
2778 .run_count_decr = sched_clutch_run_decr,
2779 .update_thread_bucket = sched_clutch_update_thread_bucket,
2780 .pset_made_schedulable = sched_pset_made_schedulable,
2781 .thread_group_recommendation_change = sched_clutch_thread_group_recommendation_change,
2782 .cpu_init_completed = NULL,
2783 .thread_eligible_for_pset = NULL,
2784};
2785
2786__attribute__((always_inline))
2787static inline run_queue_t
2788sched_clutch_bound_runq(processor_t processor)
2789{
2790 return &processor->runq;
2791}
2792
2793__attribute__((always_inline))
2794static inline sched_clutch_root_t
2795sched_clutch_processor_root_clutch(processor_t processor)
2796{
2797 return &processor->processor_set->pset_clutch_root;
2798}
2799
2800__attribute__((always_inline))
2801static inline run_queue_t
2802sched_clutch_thread_bound_runq(processor_t processor, __assert_only thread_t thread)
2803{
2804 assert(thread->bound_processor == processor);
2805 return sched_clutch_bound_runq(processor);
2806}
2807
2808static uint32_t
2809sched_clutch_initial_quantum_size(thread_t thread)
2810{
2811 if (thread == THREAD_NULL) {
2812 return std_quantum;
2813 }
2814 assert(sched_clutch_thread_quantum[thread->th_sched_bucket] <= UINT32_MAX);
2815 return (uint32_t)sched_clutch_thread_quantum[thread->th_sched_bucket];
2816}
2817
2818static sched_mode_t
2819sched_clutch_initial_thread_sched_mode(task_t parent_task)
2820{
2821 if (parent_task == kernel_task) {
2822 return TH_MODE_FIXED;
2823 } else {
2824 return TH_MODE_TIMESHARE;
2825 }
2826}
2827
2828static void
2829sched_clutch_processor_init(processor_t processor)
2830{
2831 run_queue_init(&processor->runq);
2832}
2833
2834static void
2835sched_clutch_pset_init(processor_set_t pset)
2836{
2837 sched_clutch_root_init(&pset->pset_clutch_root, pset);
2838}
2839
2840static void
2841sched_clutch_tunables_init(void)
2842{
2843 sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
2844 sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
2845 sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
2846 clock_interval_to_absolutetime_interval(SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS,
2847 NSEC_PER_USEC, &sched_clutch_bucket_group_adjust_threshold);
2848 assert(sched_clutch_bucket_group_adjust_threshold <= CLUTCH_CPU_DATA_MAX);
2849 sched_clutch_us_to_abstime(sched_clutch_bucket_group_pending_delta_us, sched_clutch_bucket_group_pending_delta);
2850}
2851
2852static void
2853sched_clutch_init(void)
2854{
2855 if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) {
2856 sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
2857 }
2858 sched_timeshare_init();
2859 sched_clutch_tunables_init();
2860}
2861
2862static thread_t
2863sched_clutch_choose_thread(
2864 processor_t processor,
2865 int priority,
2866 __unused ast_t reason)
2867{
2868 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
2869 uint32_t clutch_count = sched_clutch_root_count(sched_clutch_processor_root_clutch(processor));
2870 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2871 boolean_t choose_from_boundq = false;
2872
2873 if (bound_runq->highq < priority &&
2874 clutch_pri < priority) {
2875 return THREAD_NULL;
2876 }
2877
2878 if (bound_runq->count && clutch_count) {
2879 if (bound_runq->highq >= clutch_pri) {
2880 choose_from_boundq = true;
2881 }
2882 } else if (bound_runq->count) {
2883 choose_from_boundq = true;
2884 } else if (clutch_count) {
2885 choose_from_boundq = false;
2886 } else {
2887 return THREAD_NULL;
2888 }
2889
2890 thread_t thread = THREAD_NULL;
2891 if (choose_from_boundq == false) {
2892 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
2893 thread = sched_clutch_thread_highest_remove(pset_clutch_root);
2894 } else {
2895 thread = run_queue_dequeue(bound_runq, SCHED_HEADQ);
2896 }
2897 return thread;
2898}
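/*
 * Illustrative sketch of the selection rule above, factored as a standalone
 * predicate. The helper name is hypothetical and it assumes the caller has
 * already handled the case where both queues are empty (THREAD_NULL above);
 * it simply restates the decision: prefer the bound runq whenever it is
 * non-empty and its highest priority is at least the clutch root priority.
 */
#if 0
static bool
example_choose_from_boundq(int bound_count, int bound_highq,
    uint32_t clutch_count, int clutch_pri)
{
	if (bound_count && clutch_count) {
		return bound_highq >= clutch_pri;
	}
	return bound_count != 0;
}
#endif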
2899
2900static boolean_t
2901sched_clutch_processor_enqueue(
2902 processor_t processor,
2903 thread_t thread,
2904 sched_options_t options)
2905{
2906 boolean_t result;
2907
2908 thread_set_runq_locked(thread, processor);
2909 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
2910 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
2911 result = sched_clutch_thread_insert(pset_clutch_root, thread, options);
2912 } else {
2913 run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
2914 result = run_queue_enqueue(rq, thread, options);
2915 }
2916 return result;
2917}
2918
2919static boolean_t
2920sched_clutch_processor_queue_empty(processor_t processor)
2921{
2922 return sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0 &&
2923 sched_clutch_bound_runq(processor)->count == 0;
2924}
2925
2926static ast_t
2927sched_clutch_processor_csw_check(processor_t processor)
2928{
2929 boolean_t has_higher;
2930 int pri;
2931
2932 if (sched_clutch_thread_avoid_processor(processor, current_thread(), AST_NONE)) {
2933 return AST_PREEMPT | AST_URGENT;
2934 }
2935
2936 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2937 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
2938
2939 assert(processor->active_thread != NULL);
2940
2941 pri = MAX(clutch_pri, bound_runq->highq);
2942
2943 if (processor->first_timeslice) {
2944 has_higher = (pri > processor->current_pri);
2945 } else {
2946 has_higher = (pri >= processor->current_pri);
2947 }
2948
2949 if (has_higher) {
2950 if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) {
2951 return AST_PREEMPT | AST_URGENT;
2952 }
2953
2954 if (bound_runq->urgency > 0) {
2955 return AST_PREEMPT | AST_URGENT;
2956 }
2957
2958 return AST_PREEMPT;
2959 }
2960
2961 return AST_NONE;
2962}
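/*
 * Worked example for the preemption check above (illustrative numbers): with the
 * current thread at priority 31 on its first timeslice and a highest runnable
 * priority of 37 across the clutch root and the bound runq, has_higher is true;
 * the result is AST_PREEMPT | AST_URGENT only if either the clutch root or the
 * bound runq reports a non-zero urgency count, otherwise plain AST_PREEMPT.
 */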
2963
2964static boolean_t
2965sched_clutch_processor_queue_has_priority(processor_t processor,
2966 int priority,
2967 boolean_t gte)
2968{
2969 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2970
2971 int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq);
2972
2973 if (gte) {
2974 return qpri >= priority;
2975 } else {
2976 return qpri > priority;
2977 }
2978}
2979
2980static int
2981sched_clutch_runq_count(processor_t processor)
2982{
2983 return (int)sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) + sched_clutch_bound_runq(processor)->count;
2984}
2985
2986static uint64_t
2987sched_clutch_runq_stats_count_sum(processor_t processor)
2988{
2989 uint64_t bound_sum = sched_clutch_bound_runq(processor)->runq_stats.count_sum;
2990
2991 if (processor->cpu_id == processor->processor_set->cpu_set_low) {
2992 return bound_sum + sched_clutch_root_count_sum(sched_clutch_processor_root_clutch(processor));
2993 } else {
2994 return bound_sum;
2995 }
2996}
2997static int
2998sched_clutch_processor_bound_count(processor_t processor)
2999{
3000 return sched_clutch_bound_runq(processor)->count;
3001}
3002
3003static void
3004sched_clutch_processor_queue_shutdown(processor_t processor)
3005{
3006 processor_set_t pset = processor->processor_set;
3007 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3008 thread_t thread;
3009 queue_head_t tqueue;
3010
3011 /* We only need to migrate threads if this is the last active processor in the pset */
3012 if (pset->online_processor_count > 0) {
3013 pset_unlock(pset);
3014 return;
3015 }
3016
3017 queue_init(&tqueue);
3018 while (sched_clutch_root_count(pset_clutch_root) > 0) {
3019 thread = sched_clutch_thread_highest_remove(pset_clutch_root);
3020 enqueue_tail(&tqueue, &thread->runq_links);
3021 }
3022
3023 pset_unlock(pset);
3024
3025 qe_foreach_element_safe(thread, &tqueue, runq_links) {
3026 remqueue(&thread->runq_links);
3027 thread_lock(thread);
3028 thread_setrun(thread, SCHED_TAILQ);
3029 thread_unlock(thread);
3030 }
3031}
3032
3033static boolean_t
3034sched_clutch_processor_queue_remove(
3035 processor_t processor,
3036 thread_t thread)
3037{
3038 processor_set_t pset = processor->processor_set;
3039
3040 pset_lock(pset);
3041
3042 if (processor == thread_get_runq_locked(thread)) {
3043 /*
3044 * Thread is on a run queue and we have a lock on
3045 * that run queue.
3046 */
3047 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
3048 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3049 sched_clutch_thread_remove(pset_clutch_root, thread, mach_absolute_time(), SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
3050 } else {
3051 run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
3052 run_queue_remove(rq, thread);
3053 }
3054 } else {
3055 /*
3056 * The thread left the run queue before we could
3057 * lock the run queue.
3058 */
3059 thread_assert_runq_null(thread);
3060 processor = PROCESSOR_NULL;
3061 }
3062
3063 pset_unlock(pset);
3064
3065 return processor != PROCESSOR_NULL;
3066}
3067
3068static thread_t
3069sched_clutch_steal_thread(__unused processor_set_t pset)
3070{
3071 /* Thread stealing is not enabled for single cluster clutch scheduler platforms */
3072 return THREAD_NULL;
3073}
3074
3075static void
3076sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context)
3077{
3078 boolean_t restart_needed = FALSE;
3079 processor_t processor = processor_list;
3080 processor_set_t pset;
3081 thread_t thread;
3082 spl_t s;
3083
3084 /*
3085 * We update the threads associated with each processor (bound and idle threads)
3086 * and then update the threads in each pset runqueue.
3087 */
3088
3089 do {
3090 do {
3091 pset = processor->processor_set;
3092
3093 s = splsched();
3094 pset_lock(pset);
3095
3096 restart_needed = runq_scan(sched_clutch_bound_runq(processor), scan_context);
3097
3098 pset_unlock(pset);
3099 splx(s);
3100
3101 if (restart_needed) {
3102 break;
3103 }
3104
3105 thread = processor->idle_thread;
3106 if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
3107 if (thread_update_add_thread(thread) == FALSE) {
3108 restart_needed = TRUE;
3109 break;
3110 }
3111 }
3112 } while ((processor = processor->processor_list) != NULL);
3113
3114 /* Ok, we now have a collection of candidates -- fix them. */
3115 thread_update_process_threads();
3116 } while (restart_needed);
3117
3118 pset_node_t node = &pset_node0;
3119 pset = node->psets;
3120
3121 do {
3122 do {
3123 restart_needed = FALSE;
3124 while (pset != NULL) {
3125 s = splsched();
3126 pset_lock(pset);
3127
3128 if (sched_clutch_root_count(&pset->pset_clutch_root) > 0) {
3129 for (sched_bucket_t bucket = TH_BUCKET_SHARE_FG; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
3130 restart_needed = runq_scan(&pset->pset_clutch_root.scr_bound_buckets[bucket].scrb_bound_thread_runq, scan_context);
3131 if (restart_needed) {
3132 break;
3133 }
3134 }
3135 queue_t clutch_bucket_list = &pset->pset_clutch_root.scr_clutch_buckets;
3136 sched_clutch_bucket_t clutch_bucket;
3137 qe_foreach_element(clutch_bucket, clutch_bucket_list, scb_listlink) {
3138 sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, scan_context->sched_tick_last_abstime);
3139 restart_needed = sched_clutch_timeshare_scan(&clutch_bucket->scb_thread_timeshare_queue, clutch_bucket->scb_thr_count, scan_context);
3140 if (restart_needed) {
3141 break;
3142 }
3143 }
3144 }
3145
3146 pset_unlock(pset);
3147 splx(s);
3148
3149 if (restart_needed) {
3150 break;
3151 }
3152 pset = pset->pset_list;
3153 }
3154
3155 if (restart_needed) {
3156 break;
3157 }
3158 } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
3159
3160 /* Ok, we now have a collection of candidates -- fix them. */
3161 thread_update_process_threads();
3162 } while (restart_needed);
3163}
3164
3165extern int sched_allow_rt_smt;
3166
3167/* Return true if this thread should not continue running on this processor */
3168static bool
3169sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread, __unused ast_t reason)
3170{
3171 if (processor->processor_primary != processor) {
3172 /*
3173 * This is a secondary SMT processor. If the primary is running
3174 * a realtime thread, only allow realtime threads on the secondary.
3175 */
3176 if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) {
3177 return true;
3178 }
3179 }
3180
3181 return false;
3182}
3183
3184/*
3185 * For the clutch scheduler, the run counts are maintained in the clutch
3186 * buckets (i.e. the thread group scheduling structure).
3187 */
3188static uint32_t
3189sched_clutch_run_incr(thread_t thread)
3190{
3191 assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
3192 uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3193 sched_clutch_thread_run_bucket_incr(thread, thread->th_sched_bucket);
3194 return new_count;
3195}
3196
3197static uint32_t
3198sched_clutch_run_decr(thread_t thread)
3199{
3200 assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN);
3201 uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3202 sched_clutch_thread_run_bucket_decr(thread, thread->th_sched_bucket);
3203 return new_count;
3204}
3205
3206/*
3207 * For threads that have changed sched_pri without changing the
3208 * base_pri for any reason other than decay, use the sched_pri
3209 * as the bucketizing priority instead of base_pri. All such
3210 * changes are typically due to kernel locking primitives boosts
3211 * or demotions.
3212 */
3213static boolean_t
3214sched_thread_sched_pri_promoted(thread_t thread)
3215{
3216 return (thread->sched_flags & TH_SFLAG_PROMOTED) ||
3217 (thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) ||
3218 (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) ||
3219 (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) ||
3220 (thread->kern_promotion_schedpri != 0);
3221}
3222
3223/*
3224 * Routine to update the scheduling bucket for the thread.
3225 *
3226 * In the clutch scheduler implementation, the thread's bucket
3227 * is based on sched_pri if it was promoted due to a kernel
3228 * primitive; otherwise it is based on the thread's base_pri. This
3229 * enhancement allows promoted threads to reach a higher priority
3230 * bucket and potentially get selected sooner for scheduling.
3231 *
3232 * Also, the clutch scheduler does not honor fixed priority below
3233 * FG priority. It simply puts those threads in the corresponding
3234 * timeshare bucket. The reason for doing that is that it is
3235 * extremely hard to define the scheduling properties of such threads
3236 * and they typically lead to performance issues.
3237 */
3238
3239void
3240sched_clutch_update_thread_bucket(thread_t thread)
3241{
3242 sched_bucket_t old_bucket = thread->th_sched_bucket;
3243 thread_assert_runq_null(thread);
3244 int pri = (sched_thread_sched_pri_promoted(thread)) ? thread->sched_pri : thread->base_pri;
3245 sched_bucket_t new_bucket = sched_clutch_thread_bucket_map(thread, pri);
3246
3247 if (old_bucket == new_bucket) {
3248 return;
3249 }
3250
3251 thread->th_sched_bucket = new_bucket;
3252 thread->pri_shift = sched_clutch_thread_pri_shift(thread, new_bucket);
3253 /*
3254 * Since this is called after the thread has been removed from the runq,
3255 * only the run counts need to be updated. The re-insert into the runq
3256 * would put the thread into the correct new bucket's runq.
3257 */
3258 if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
3259 sched_clutch_thread_run_bucket_decr(thread, old_bucket);
3260 sched_clutch_thread_run_bucket_incr(thread, new_bucket);
3261 }
3262}
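/*
 * Illustrative example of the bucketizing rule above (numbers are hypothetical):
 * a thread with base_pri 37 that is currently promoted to sched_pri 80 by a
 * kernel locking primitive is bucketized using 80, which typically maps it to a
 * higher root bucket until the promotion is dropped; an unpromoted thread is
 * always bucketized by its base_pri.
 */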
3263
3264static void
3265sched_clutch_thread_group_recommendation_change(__unused struct thread_group *tg, __unused cluster_type_t new_recommendation)
3266{
3267 /* Clutch ignores the recommendation because, unlike the Edge scheduler,
3268 * Clutch does not migrate threads across cluster types on its own.
3269 */
3270}
3271
3272#if CONFIG_SCHED_EDGE
3273
3274/* Implementation of the AMP version of the clutch scheduler */
3275
3276static void
3277sched_edge_init(void);
3278
3279static void
3280sched_edge_pset_init(processor_set_t pset);
3281
3282static thread_t
3283sched_edge_processor_idle(processor_set_t pset);
3284
3285static ast_t
3286sched_edge_processor_csw_check(processor_t processor);
3287
3288static boolean_t
3289sched_edge_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
3290
3291static boolean_t
3292sched_edge_processor_queue_empty(processor_t processor);
3293
3294static thread_t
3295sched_edge_choose_thread(processor_t processor, int priority, ast_t reason);
3296
3297static void
3298sched_edge_processor_queue_shutdown(processor_t processor);
3299
3300static processor_t
3301sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread);
3302
3303static bool
3304sched_edge_thread_avoid_processor(processor_t processor, thread_t thread, ast_t reason);
3305
3306static bool
3307sched_edge_balance(processor_t cprocessor, processor_set_t cpset);
3308
3309static void
3310sched_edge_check_spill(processor_set_t pset, thread_t thread);
3311
3312static bool
3313sched_edge_thread_should_yield(processor_t processor, thread_t thread);
3314
3315static void
3316sched_edge_pset_made_schedulable(processor_t processor, processor_set_t dst_pset, boolean_t drop_lock);
3317
3318static void
3319sched_edge_cpu_init_completed(void);
3320
3321static bool
3322sched_edge_thread_eligible_for_pset(thread_t thread, processor_set_t pset);
3323
3324static bool
3325sched_edge_steal_thread_enabled(processor_set_t pset);
3326
3327static sched_ipi_type_t
3328sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event);
3329
3330static uint32_t
3331sched_edge_qos_max_parallelism(int qos, uint64_t options);
3332
3333static uint32_t
3334sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket);
3335
3336const struct sched_dispatch_table sched_edge_dispatch = {
3337 .sched_name = "edge",
3338 .init = sched_edge_init,
3339 .timebase_init = sched_timeshare_timebase_init,
3340 .processor_init = sched_clutch_processor_init,
3341 .pset_init = sched_edge_pset_init,
3342 .maintenance_continuation = sched_timeshare_maintenance_continue,
3343 .choose_thread = sched_edge_choose_thread,
3344 .steal_thread_enabled = sched_edge_steal_thread_enabled,
3345 .steal_thread = sched_edge_processor_idle,
3346 .compute_timeshare_priority = sched_compute_timeshare_priority,
3347 .choose_node = sched_choose_node,
3348 .choose_processor = sched_edge_choose_processor,
3349 .processor_enqueue = sched_clutch_processor_enqueue,
3350 .processor_queue_shutdown = sched_edge_processor_queue_shutdown,
3351 .processor_queue_remove = sched_clutch_processor_queue_remove,
3352 .processor_queue_empty = sched_edge_processor_queue_empty,
3353 .priority_is_urgent = priority_is_urgent,
3354 .processor_csw_check = sched_edge_processor_csw_check,
3355 .processor_queue_has_priority = sched_edge_processor_queue_has_priority,
3356 .initial_quantum_size = sched_clutch_initial_quantum_size,
3357 .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode,
3358 .can_update_priority = can_update_priority,
3359 .update_priority = update_priority,
3360 .lightweight_update_priority = lightweight_update_priority,
3361 .quantum_expire = sched_default_quantum_expire,
3362 .processor_runq_count = sched_clutch_runq_count,
3363 .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum,
3364 .processor_bound_count = sched_clutch_processor_bound_count,
3365 .thread_update_scan = sched_clutch_thread_update_scan,
3366 .multiple_psets_enabled = TRUE,
3367 .sched_groups_enabled = FALSE,
3368 .avoid_processor_enabled = TRUE,
3369 .thread_avoid_processor = sched_edge_thread_avoid_processor,
3370 .processor_balance = sched_edge_balance,
3371
3372 .rt_runq = sched_rtlocal_runq,
3373 .rt_init = sched_rtlocal_init,
3374 .rt_queue_shutdown = sched_rtlocal_queue_shutdown,
3375 .rt_runq_scan = sched_rtlocal_runq_scan,
3376 .rt_runq_count_sum = sched_rtlocal_runq_count_sum,
3377 .rt_steal_thread = sched_rtlocal_steal_thread,
3378
3379 .qos_max_parallelism = sched_edge_qos_max_parallelism,
3380 .check_spill = sched_edge_check_spill,
3381 .ipi_policy = sched_edge_ipi_policy,
3382 .thread_should_yield = sched_edge_thread_should_yield,
3383 .run_count_incr = sched_clutch_run_incr,
3384 .run_count_decr = sched_clutch_run_decr,
3385 .update_thread_bucket = sched_clutch_update_thread_bucket,
3386 .pset_made_schedulable = sched_edge_pset_made_schedulable,
3387 .thread_group_recommendation_change = NULL,
3388 .cpu_init_completed = sched_edge_cpu_init_completed,
3389 .thread_eligible_for_pset = sched_edge_thread_eligible_for_pset,
3390};
3391
3392static bitmap_t sched_edge_available_pset_bitmask[BITMAP_LEN(MAX_PSETS)];
3393
3394/*
3395 * sched_edge_pset_available()
3396 *
3397 * Routine to determine if a pset is available for scheduling.
3398 */
3399static bool
3400sched_edge_pset_available(processor_set_t pset)
3401{
3402 if (pset == NULL) {
3403 return false;
3404 }
3405 return pset_available_cpu_count(pset) > 0;
3406}
3407
3408/*
3409 * sched_edge_thread_bound_cluster_id()
3410 *
3411 * Routine to determine which cluster a particular thread is bound to. Uses
3412 * the th_bound_cluster_id field on the thread to map back to a specific cluster id.
3413 *
3414 * <Edge Multi-cluster Support Needed>
3415 */
3416static uint32_t
3417sched_edge_thread_bound_cluster_id(thread_t thread)
3418{
3419 assert(SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread));
3420 return thread->th_bound_cluster_id;
3421}
3422
3423/* Forward declaration for some thread migration routines */
3424static boolean_t sched_edge_foreign_runnable_thread_available(processor_set_t pset);
3425static boolean_t sched_edge_foreign_running_thread_available(processor_set_t pset);
3426static processor_set_t sched_edge_steal_candidate(processor_set_t pset);
3427static processor_set_t sched_edge_migrate_candidate(processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks);
3428
3429/*
3430 * sched_edge_config_set()
3431 *
3432 * Support to update an edge configuration. Typically used by CLPC to affect thread migration
3433 * policies in the scheduler.
3434 */
3435static void
3436sched_edge_config_set(uint32_t src_cluster, uint32_t dst_cluster, sched_clutch_edge edge_config)
3437{
3438 sched_clutch_edge *edge = &pset_array[src_cluster]->sched_edges[dst_cluster];
3439 edge->sce_edge_packed = edge_config.sce_edge_packed;
3440}
3441
3442/*
3443 * sched_edge_config_get()
3444 *
3445 * Support to get an edge configuration. Typically used by CLPC to query edge configs to decide
3446 * if it needs to update edges.
3447 */
3448static sched_clutch_edge
3449sched_edge_config_get(uint32_t src_cluster, uint32_t dst_cluster)
3450{
3451 return pset_array[src_cluster]->sched_edges[dst_cluster];
3452}
3453
3454/*
3455 * sched_edge_matrix_set()
3456 *
3457 * Routine to update various edges in the cluster edge matrix. The edge_changes_bitmap
3458 * indicates which edges need to be updated. Both the edge_matrix & edge_changes_bitmap
3459 * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array.
3460 */
3461void
3462sched_edge_matrix_set(sched_clutch_edge *edge_matrix, bool *edge_changes_bitmap, __unused uint64_t flags, uint64_t matrix_order)
3463{
3464 uint32_t edge_index = 0;
3465 for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) {
3466 for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) {
3467 if (edge_changes_bitmap[edge_index]) {
3468 sched_edge_config_set(src_cluster, dst_cluster, edge_matrix[edge_index]);
3469 }
3470 edge_index++;
3471 }
3472 }
3473}
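/*
 * Illustrative sketch of the flattened row-major indexing used above: for a
 * system with matrix_order == 2, the edge from cluster 1 to cluster 0 lives at
 * index (1 * matrix_order + 0) == 2. The helper below is hypothetical and only
 * demonstrates the index math.
 */
#if 0
static inline uint32_t
example_edge_index(uint32_t src_cluster, uint32_t dst_cluster, uint64_t matrix_order)
{
	return (uint32_t)((src_cluster * matrix_order) + dst_cluster);
}
#endif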
3474
3475/*
3476 * sched_edge_matrix_get()
3477 *
3478 * Routine to retrieve various edges in the cluster edge matrix. The edge_request_bitmap
3479 * indicates which edges need to be retrieved. Both the edge_matrix & edge_request_bitmap
3480 * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array.
3481 */
3482void
3483sched_edge_matrix_get(sched_clutch_edge *edge_matrix, bool *edge_request_bitmap, __unused uint64_t flags, uint64_t matrix_order)
3484{
3485 uint32_t edge_index = 0;
3486 for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) {
3487 for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) {
3488 if (edge_request_bitmap[edge_index]) {
3489 edge_matrix[edge_index] = sched_edge_config_get(src_cluster, dst_cluster);
3490 }
3491 edge_index++;
3492 }
3493 }
3494}
3495
3496/*
3497 * sched_edge_init()
3498 *
3499 * Routine to initialize the data structures for the Edge scheduler.
3500 */
3501static void
3502sched_edge_init(void)
3503{
3504 if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) {
3505 sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
3506 }
3507 sched_timeshare_init();
3508 sched_clutch_tunables_init();
3509 sched_edge_max_clusters = ml_get_cluster_count();
3510}
3511
3512static void
3513sched_edge_pset_init(processor_set_t pset)
3514{
3515 uint32_t pset_cluster_id = pset->pset_cluster_id;
3516 pset->pset_type = (pset->pset_cluster_type == PSET_AMP_P) ? CLUSTER_TYPE_P : CLUSTER_TYPE_E;
3517
3518 /* Set the edge weight and properties for the pset itself */
3519 bitmap_clear(pset->foreign_psets, pset_cluster_id);
3520 bitmap_clear(pset->native_psets, pset_cluster_id);
3521 bitmap_clear(pset->local_psets, pset_cluster_id);
3522 bitmap_clear(pset->remote_psets, pset_cluster_id);
3523 pset->sched_edges[pset_cluster_id].sce_edge_packed = (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0}.sce_edge_packed;
3524 sched_clutch_root_init(&pset->pset_clutch_root, pset);
3525 bitmap_set(sched_edge_available_pset_bitmask, pset_cluster_id);
3526}
3527
3528static thread_t
3529sched_edge_choose_thread(
3530 processor_t processor,
3531 int priority,
3532 __unused ast_t reason)
3533{
3534 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
3535 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3536 boolean_t choose_from_boundq = false;
3537
3538 if ((bound_runq->highq < priority) &&
3539 (clutch_pri < priority)) {
3540 return THREAD_NULL;
3541 }
3542
3543 if (bound_runq->highq >= clutch_pri) {
3544 choose_from_boundq = true;
3545 }
3546
3547 thread_t thread = THREAD_NULL;
3548 if (choose_from_boundq == false) {
3549 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3550 thread = sched_clutch_thread_highest_remove(pset_clutch_root);
3551 } else {
3552 thread = run_queue_dequeue(bound_runq, SCHED_HEADQ);
3553 }
3554 return thread;
3555}
3556
3557static boolean_t
3558sched_edge_processor_queue_empty(processor_t processor)
3559{
3560 return (sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0) &&
3561 (sched_clutch_bound_runq(processor)->count == 0);
3562}
3563
3564static void
3565sched_edge_check_spill(__unused processor_set_t pset, __unused thread_t thread)
3566{
3567 assert(thread->bound_processor == PROCESSOR_NULL);
3568}
3569
3570__options_decl(sched_edge_thread_yield_reason_t, uint32_t, {
3571 SCHED_EDGE_YIELD_RUNQ_NONEMPTY = 0x0,
3572 SCHED_EDGE_YIELD_FOREIGN_RUNNABLE = 0x1,
3573 SCHED_EDGE_YIELD_FOREIGN_RUNNING = 0x2,
3574 SCHED_EDGE_YIELD_STEAL_POSSIBLE = 0x3,
3575 SCHED_EDGE_YIELD_DISALLOW = 0x4,
3576});
3577
3578static bool
3579sched_edge_thread_should_yield(processor_t processor, __unused thread_t thread)
3580{
3581 if (!sched_edge_processor_queue_empty(processor) || (rt_runq_count(processor->processor_set) > 0)) {
3582 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3583 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_RUNQ_NONEMPTY);
3584 return true;
3585 }
3586
3587 /*
3588 * The yield logic should follow the same logic that steal_thread() does, since
3589 * thread_should_yield() is effectively a quick check of whether, if the
3590 * current thread gave up the CPU, any other thread would execute
3591 * on this CPU. So it needs to provide the same answer as the steal_thread()/
3592 * processor idle logic.
3593 */
3594 if (sched_edge_foreign_runnable_thread_available(processor->processor_set)) {
3595 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3596 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNABLE);
3597 return true;
3598 }
3599 if (sched_edge_foreign_running_thread_available(processor->processor_set)) {
3600 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3601 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNING);
3602 return true;
3603 }
3604
3605 processor_set_t steal_candidate = sched_edge_steal_candidate(processor->processor_set);
3606 if (steal_candidate != NULL) {
3607 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3608 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_STEAL_POSSIBLE);
3609 return true;
3610 }
3611
3612 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE, thread_tid(thread), processor->processor_set->pset_cluster_id,
3613 0, SCHED_EDGE_YIELD_DISALLOW);
3614 return false;
3615}
3616
3617static ast_t
3618sched_edge_processor_csw_check(processor_t processor)
3619{
3620 boolean_t has_higher;
3621 int pri;
3622
3623 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
3624 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3625
3626 assert(processor->active_thread != NULL);
3627
3628 pri = MAX(clutch_pri, bound_runq->highq);
3629
3630 if (processor->first_timeslice) {
3631 has_higher = (pri > processor->current_pri);
3632 } else {
3633 has_higher = (pri >= processor->current_pri);
3634 }
3635
3636 if (has_higher) {
3637 if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) {
3638 return AST_PREEMPT | AST_URGENT;
3639 }
3640
3641 if (bound_runq->urgency > 0) {
3642 return AST_PREEMPT | AST_URGENT;
3643 }
3644
3645 return AST_PREEMPT;
3646 }
3647
3648 return AST_NONE;
3649}
3650
3651static boolean_t
3652sched_edge_processor_queue_has_priority(processor_t processor,
3653 int priority,
3654 boolean_t gte)
3655{
3656 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3657
3658 int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq);
3659 if (gte) {
3660 return qpri >= priority;
3661 } else {
3662 return qpri > priority;
3663 }
3664}
3665
3666static void
3667sched_edge_processor_queue_shutdown(processor_t processor)
3668{
3669 processor_set_t pset = processor->processor_set;
3670 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3671 thread_t thread;
3672 queue_head_t tqueue;
3673
3674 /* We only need to migrate threads if this is the last active or last recommended processor in the pset */
3675 if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) {
3676 pset_unlock(pset);
3677 return;
3678 }
3679
3680 bitmap_clear(sched_edge_available_pset_bitmask, pset->pset_cluster_id);
3681
3682 queue_init(&tqueue);
3683 while (sched_clutch_root_count(pset_clutch_root) > 0) {
3684 thread = sched_clutch_thread_highest_remove(pset_clutch_root);
3685 enqueue_tail(&tqueue, &thread->runq_links);
3686 }
3687 pset_unlock(pset);
3688
3689 qe_foreach_element_safe(thread, &tqueue, runq_links) {
3690 remqueue(&thread->runq_links);
3691 thread_lock(thread);
3692 thread_setrun(thread, SCHED_TAILQ);
3693 thread_unlock(thread);
3694 }
3695}
3696
3697/*
3698 * sched_edge_cluster_load_metric()
3699 *
3700 * The load metric for a cluster is a measure of the average scheduling latency
3701 * experienced by threads on that cluster. It is a product of the average number
3702 * of threads in the runqueue and the average execution time for threads. The metric
3703 * has special values in the following cases:
3704 * - UINT32_MAX: If the cluster is not available for scheduling, its load is set to
3705 * the maximum value to prevent any threads from migrating to this cluster.
3706 * - 0: If there are idle CPUs in the cluster or the runqueue is empty; this allows threads
3707 * to be spread across the platform quickly for ncpu-wide workloads.
3708 */
3709static uint32_t
3710sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket)
3711{
3712 if (sched_edge_pset_available(pset) == false) {
3713 return UINT32_MAX;
3714 }
3715 return (uint32_t)sched_get_pset_load_average(pset, sched_bucket);
3716}
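/*
 * Worked example (illustrative): a cluster with an idle CPU or an empty runqueue
 * reports a load metric of 0 for the queried bucket, so a runnable thread can be
 * spread to it immediately; a cluster with no available CPUs reports UINT32_MAX,
 * so no thread is migrated to it.
 */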
3717
3718/*
3719 *
3720 * Edge Scheduler Steal/Rebalance logic
3721 *
3722 * = Generic scheduler logic =
3723 *
3724 * The SCHED(steal_thread) scheduler callout is invoked when the processor does not
3725 * find any thread for execution in its runqueue. The aim of the steal operation
3726 * is to find other threads running/runnable in other clusters which should be
3727 * executed here.
3728 *
3729 * If the steal callout does not return a thread, the thread_select() logic calls
3730 * SCHED(processor_balance) callout which is supposed to IPI other CPUs to rebalance
3731 * threads and idle out the current CPU.
3732 *
3733 * = SCHED(steal_thread) for Edge Scheduler =
3734 *
3735 * The edge scheduler hooks into sched_edge_processor_idle() for steal_thread. This
3736 * routine tries to do the following operations in order:
3737 * (1) Find foreign runnable threads in non-native cluster
3738 * runqueues (sched_edge_foreign_runnable_thread_remove())
3739 * (2) Check if foreign threads are running on the non-native
3740 * clusters (sched_edge_foreign_running_thread_available())
3741 * - If yes, return THREAD_NULL for the steal callout and
3742 * perform rebalancing as part of SCHED(processor_balance) i.e. sched_edge_balance()
3743 * (3) Steal a thread from another cluster based on edge
3744 * weights (sched_edge_steal_thread())
3745 *
3746 * = SCHED(processor_balance) for Edge Scheduler =
3747 *
3748 * If steal_thread did not return a thread for the processor, use
3749 * sched_edge_balance() to rebalance foreign running threads and idle out this CPU.
3750 *
3751 * = Clutch Bucket Preferred Cluster Overrides =
3752 *
3753 * Since these operations (just like thread migrations on enqueue)
3754 * move threads across clusters, they need support for handling clutch
3755 * bucket group level preferred cluster recommendations.
3756 * For (1), a clutch bucket will be in the foreign runnable queue based
3757 * on the clutch bucket group preferred cluster.
3758 * For (2), the running thread will set the bit on the processor based
3759 * on its preferred cluster type.
3760 * For (3), the edge configuration would prevent threads from being stolen
3761 * in the wrong direction.
3762 *
3763 * = SCHED(thread_should_yield) =
3764 * The thread_should_yield() logic needs to have the same logic as sched_edge_processor_idle()
3765 * since that is expecting the same answer as if thread_select() was called on a core
3766 * with an empty runqueue.
3767 */
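/*
 * Condensed sketch of the idle-time ordering described above. The helper names
 * are placeholders; the actual implementation is sched_edge_processor_idle()
 * further below, which also attempts a steal from native clusters between
 * steps (1) and (2).
 *
 *	thread = foreign_runnable_remove();			// (1)
 *	if (thread == THREAD_NULL && foreign_running_available()) {
 *		return THREAD_NULL;				// (2) rebalance via sched_edge_balance()
 *	}
 *	if (thread == THREAD_NULL) {
 *		thread = steal_by_edge_weight();		// (3)
 *	}
 */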
3768
3769static bool
3770sched_edge_steal_thread_enabled(__unused processor_set_t pset)
3771{
3772 /*
3773 * For edge scheduler, the gating for steal is being done by sched_edge_steal_candidate()
3774 */
3775 return true;
3776}
3777
3778static processor_set_t
3779sched_edge_steal_candidate(processor_set_t pset)
3780{
3781 uint32_t dst_cluster_id = pset->pset_cluster_id;
3782 for (int cluster_id = 0; cluster_id < sched_edge_max_clusters; cluster_id++) {
3783 processor_set_t candidate_pset = pset_array[cluster_id];
3784 if (cluster_id == dst_cluster_id) {
3785 continue;
3786 }
3787 if (candidate_pset == NULL) {
3788 continue;
3789 }
3790 sched_clutch_edge *incoming_edge = &pset_array[cluster_id]->sched_edges[dst_cluster_id];
3791 if (incoming_edge->sce_steal_allowed && (bitmap_lsb_first(candidate_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) != -1)) {
3792 return candidate_pset;
3793 }
3794 }
3795 return NULL;
3796}
3797
3798static boolean_t
3799sched_edge_foreign_runnable_thread_available(processor_set_t pset)
3800{
3801 /* Find all the clusters that are foreign for this cluster */
3802 bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
3803 for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
3804 /*
3805 * For each cluster, see if there are any runnable foreign threads.
3806 * This check is currently being done without the pset lock to make it cheap for
3807 * the common case.
3808 */
3809 processor_set_t target_pset = pset_array[cluster];
3810 if (sched_edge_pset_available(target_pset) == false) {
3811 continue;
3812 }
3813
3814 if (!sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
3815 return true;
3816 }
3817 }
3818 return false;
3819}
3820
3821static thread_t
3822sched_edge_foreign_runnable_thread_remove(processor_set_t pset, uint64_t ctime)
3823{
3824 thread_t thread = THREAD_NULL;
3825
3826 /* Find all the clusters that are foreign for this cluster */
3827 bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
3828 for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
3829 /*
3830 * For each cluster, see if there are any runnable foreign threads.
3831 * This check is currently being done without the pset lock to make it cheap for
3832 * the common case.
3833 */
3834 processor_set_t target_pset = pset_array[cluster];
3835 if (sched_edge_pset_available(target_pset) == false) {
3836 continue;
3837 }
3838
3839 if (sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
3840 continue;
3841 }
3842 /*
3843 * Looks like there are runnable foreign threads in the hierarchy; lock the pset
3844 * and get the highest priority thread.
3845 */
3846 pset_lock(target_pset);
3847 if (sched_edge_pset_available(target_pset)) {
3848 thread = sched_clutch_root_highest_foreign_thread_remove(&target_pset->pset_clutch_root);
3849 sched_update_pset_load_average(target_pset, ctime);
3850 }
3851 pset_unlock(target_pset);
3852
3853 /*
3854 * Edge Scheduler Optimization
3855 *
3856 * The current implementation immediately returns as soon as it finds a foreign
3857 * runnable thread. This could be enhanced to look at highest priority threads
3858 * from all foreign clusters and pick the highest amongst them. That would need
3859 * some form of global state across psets to make that kind of check cheap.
3860 */
3861 if (thread != THREAD_NULL) {
3862 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNABLE) | DBG_FUNC_NONE, thread_tid(thread), pset->pset_cluster_id, target_pset->pset_cluster_id, 0);
3863 break;
3864 }
3865 /* Looks like the thread escaped after the check but before the pset lock was taken; continue the search */
3866 }
3867
3868 return thread;
3869}
3870
3871/*
3872 * sched_edge_cpu_running_foreign_shared_rsrc_available()
3873 *
3874 * Routine to determine if the thread running on a CPU is a shared resource thread
3875 * and can be rebalanced to the cluster with an idle CPU. It is used to determine if
3876 * a CPU going idle on a pset should rebalance a running shared-resource-heavy thread
3877 * from another, non-ideal cluster based on the idle pset's shared resource load.
3878 */
3879static boolean_t
3880sched_edge_cpu_running_foreign_shared_rsrc_available(processor_set_t target_pset, int foreign_cpu, processor_set_t idle_pset)
3881{
3882 boolean_t idle_pset_shared_rsrc_rr_idle = sched_edge_shared_rsrc_idle(idle_pset, CLUSTER_SHARED_RSRC_TYPE_RR);
3883 if (bit_test(target_pset->cpu_running_cluster_shared_rsrc_thread[CLUSTER_SHARED_RSRC_TYPE_RR], foreign_cpu) && !idle_pset_shared_rsrc_rr_idle) {
3884 return false;
3885 }
3886
3887 boolean_t idle_pset_shared_rsrc_biu_idle = sched_edge_shared_rsrc_idle(idle_pset, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST);
3888 if (bit_test(target_pset->cpu_running_cluster_shared_rsrc_thread[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST], foreign_cpu) && !idle_pset_shared_rsrc_biu_idle) {
3889 return false;
3890 }
3891 return true;
3892}
3893
3894static boolean_t
3895sched_edge_foreign_running_thread_available(processor_set_t pset)
3896{
3897 bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
3898 int cluster = -1;
3899 while ((cluster = sched_edge_iterate_clusters_ordered(pset, foreign_pset_bitmap[0], cluster)) != -1) {
3900 /* Skip the pset if its not schedulable */
3901 processor_set_t target_pset = pset_array[cluster];
3902 if (sched_edge_pset_available(target_pset) == false) {
3903 continue;
3904 }
3905
3906 uint64_t running_foreign_bitmap = target_pset->cpu_state_map[PROCESSOR_RUNNING] & target_pset->cpu_running_foreign;
3907 for (int cpu_foreign = bit_first(running_foreign_bitmap); cpu_foreign >= 0; cpu_foreign = bit_next(running_foreign_bitmap, cpu_foreign)) {
3908 if (!sched_edge_cpu_running_foreign_shared_rsrc_available(target_pset, cpu_foreign, pset)) {
3909 continue;
3910 }
3911 return true;
3912 }
3913 }
3914 return false;
3915}
3916
3917static bool
3918sched_edge_steal_possible(processor_set_t idle_pset, processor_set_t candidate_pset)
3919{
3920 int highest_runnable_bucket = bitmap_lsb_first(candidate_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
3921 if (highest_runnable_bucket == -1) {
3922 /* Candidate cluster runq is empty */
3923 return false;
3924 }
3925
3926 if (idle_pset->pset_cluster_type == candidate_pset->pset_cluster_type) {
3927 /* Always allow stealing from homogeneous clusters */
3928 return true;
3929 } else {
3930 /* Use the load metrics for highest runnable bucket since that would be stolen next */
3931 int32_t candidate_load = sched_edge_cluster_load_metric(candidate_pset, (sched_bucket_t)highest_runnable_bucket);
3932 return candidate_load > 0;
3933 }
3934}
3935
3936static thread_t
3937sched_edge_steal_thread(processor_set_t pset, uint64_t candidate_pset_bitmap)
3938{
3939 thread_t thread = THREAD_NULL;
3940
3941 /*
3942 * Edge Scheduler Optimization
3943 *
3944 * The logic today bails as soon as it finds a cluster where the cluster load is
3945 * greater than the edge weight. Maybe it should have a more advanced version
3946 * which looks for the maximum delta etc.
3947 */
3948 int cluster_id = -1;
3949 while ((cluster_id = sched_edge_iterate_clusters_ordered(pset, candidate_pset_bitmap, cluster_id)) != -1) {
3950 processor_set_t steal_from_pset = pset_array[cluster_id];
3951 if (steal_from_pset == NULL) {
3952 continue;
3953 }
3954 sched_clutch_edge *incoming_edge = &pset_array[cluster_id]->sched_edges[pset->pset_cluster_id];
3955 if (incoming_edge->sce_steal_allowed == false) {
3956 continue;
3957 }
3958 pset_lock(steal_from_pset);
3959 if (sched_edge_steal_possible(pset, steal_from_pset)) {
3960 uint64_t current_timestamp = mach_absolute_time();
3961 sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(&steal_from_pset->pset_clutch_root, current_timestamp, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY);
3962 thread = sched_clutch_thread_unbound_lookup(&steal_from_pset->pset_clutch_root, root_bucket);
3963 sched_clutch_thread_remove(&steal_from_pset->pset_clutch_root, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
3964 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STEAL) | DBG_FUNC_NONE, thread_tid(thread), pset->pset_cluster_id, steal_from_pset->pset_cluster_id, 0);
3965 sched_update_pset_load_average(steal_from_pset, current_timestamp);
3966 }
3967 pset_unlock(steal_from_pset);
3968 if (thread != THREAD_NULL) {
3969 break;
3970 }
3971 }
3972 return thread;
3973}
3974
3975/*
3976 * sched_edge_processor_idle()
3977 *
3978 * The routine is the implementation for steal_thread() for the Edge scheduler.
3979 */
3980static thread_t
3981sched_edge_processor_idle(processor_set_t pset)
3982{
3983 thread_t thread = THREAD_NULL;
3984
3985 uint64_t ctime = mach_absolute_time();
3986
3987 processor_t processor = current_processor();
3988 bit_clear(pset->pending_spill_cpu_mask, processor->cpu_id);
3989
3990 /* Each of the operations acquire the lock for the pset they target */
3991 pset_unlock(pset);
3992
3993 /* Find highest priority runnable thread on all non-native clusters */
3994 thread = sched_edge_foreign_runnable_thread_remove(pset, ctime);
3995 if (thread != THREAD_NULL) {
3996 return thread;
3997 }
3998
3999 /* Find highest priority runnable thread on all native clusters */
4000 thread = sched_edge_steal_thread(pset, pset->native_psets[0]);
4001 if (thread != THREAD_NULL) {
4002 return thread;
4003 }
4004
4005 /* Find foreign running threads to rebalance; the actual rebalance is done in sched_edge_balance() */
4006 boolean_t rebalance_needed = sched_edge_foreign_running_thread_available(pset);
4007 if (rebalance_needed) {
4008 return THREAD_NULL;
4009 }
4010
4011 /* No foreign threads found; find a thread to steal from all clusters based on weights/loads etc. */
4012 thread = sched_edge_steal_thread(pset, pset->native_psets[0] | pset->foreign_psets[0]);
4013 return thread;
4014}
4015
4016/* Return true if this shared resource thread has a better cluster to run on */
4017static bool
4018sched_edge_shared_rsrc_migrate_possible(thread_t thread, processor_set_t preferred_pset, processor_set_t current_pset)
4019{
4020 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4021 uint64_t current_pset_load = sched_pset_cluster_shared_rsrc_load(current_pset, shared_rsrc_type);
4022 /*
4023 * Adjust the current pset load to discount the current thread only if the current pset is a preferred pset type. This allows the
4024 * scheduler to rebalance threads from non-preferred cluster to an idle cluster of the preferred type.
4025 *
4026 * Edge Scheduler Optimization
4027 * For multi-cluster machines, it might be useful to enhance this mechanism to migrate between clusters of the preferred type.
4028 */
4029 uint64_t current_pset_adjusted_load = (current_pset->pset_type != preferred_pset->pset_type) ? current_pset_load : (current_pset_load - 1);
4030
4031 uint64_t eligible_pset_bitmask = 0;
4032 if (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST) {
4033 /*
4034 * For the EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST policy, the load balancing occurs
4035 * only among clusters native with the preferred cluster.
4036 */
4037 eligible_pset_bitmask = preferred_pset->native_psets[0];
4038 bit_set(eligible_pset_bitmask, preferred_pset->pset_cluster_id);
4039 } else {
4040 /* For EDGE_SHARED_RSRC_SCHED_POLICY_RR, the load balancing happens among all clusters */
4041 eligible_pset_bitmask = sched_edge_available_pset_bitmask[0];
4042 }
4043
4044 /* For each eligible cluster check if there is an under-utilized cluster; return true if there is */
4045 for (int cluster_id = bit_first(eligible_pset_bitmask); cluster_id >= 0; cluster_id = bit_next(eligible_pset_bitmask, cluster_id)) {
4046 if (cluster_id == current_pset->pset_cluster_id) {
4047 continue;
4048 }
4049 uint64_t cluster_load = sched_pset_cluster_shared_rsrc_load(pset_array[cluster_id], shared_rsrc_type);
4050 if (current_pset_adjusted_load > cluster_load) {
4051 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHARED_RSRC_MIGRATE) | DBG_FUNC_NONE, current_pset_load, current_pset->pset_cluster_id, cluster_load, cluster_id);
4052 return true;
4053 }
4054 }
4055 return false;
4056}
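/*
 * Worked example for the comparison above (illustrative counts): if the current
 * pset is of the preferred type and carries 3 threads of this shared resource
 * type, its adjusted load is 2 (the current thread discounts itself); if an
 * eligible sibling cluster carries only 1, then 2 > 1 and the routine returns
 * true, recommending migration toward the less loaded cluster.
 */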
4057
4058/* Return true if this thread should not continue running on this processor */
4059static bool
4060sched_edge_thread_avoid_processor(processor_t processor, thread_t thread, ast_t reason)
4061{
4062 processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
4063 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
4064 return sched_edge_shared_rsrc_migrate_possible(thread, preferred_pset, processor->processor_set);
4065 }
4066 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
4067 if (processor->processor_set->pset_type != preferred_pset->pset_type) {
4068 return true;
4069 }
4070 return sched_edge_shared_rsrc_migrate_possible(thread, preferred_pset, processor->processor_set);
4071 }
4072
4073 /*
4074 * For long running parallel workloads, it is important to rebalance threads across
4075 * E/P clusters so that they make equal forward progress. This is achieved through
4076 * threads expiring their quantum on the non-preferred cluster type and explicitly
4077 * rebalancing to the preferred cluster runqueue.
4078 */
4079 if (processor->processor_set->pset_type != preferred_pset->pset_type) {
4080 return true;
4081 }
4082 /* If the preferred pset for the thread is now idle, try and migrate thread to that cluster */
4083 if ((processor->processor_set != preferred_pset) &&
4084 (sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket) == 0)) {
4085 return true;
4086 }
4087
4088 /*
4089 * On quantum expiry, check the migration bitmask to see if this thread should be migrated off this core.
4090 * A migration is only recommended if there's also an idle core available that needn't be avoided.
4091 */
4092 if (reason & AST_QUANTUM) {
4093 if (bit_test(processor->processor_set->perfcontrol_cpu_migration_bitmask, processor->cpu_id)) {
4094 uint64_t non_avoided_idle_primary_map = processor->processor_set->cpu_state_map[PROCESSOR_IDLE] & processor->processor_set->recommended_bitmask & ~processor->processor_set->perfcontrol_cpu_migration_bitmask;
4095 if (non_avoided_idle_primary_map != 0) {
4096 return true;
4097 }
4098 }
4099 }
4100
4101 return false;
4102}
4103
4104static bool
4105sched_edge_balance(__unused processor_t cprocessor, processor_set_t cpset)
4106{
4107 assert(cprocessor == current_processor());
4108 pset_unlock(cpset);
4109
4110 uint64_t ast_processor_map = 0;
4111 sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
4112
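	/*
	 * Walk the clusters that are foreign with respect to this pset and look for CPUs
	 * currently running threads which are foreign to the cluster they are running on;
	 * such CPUs are candidates for a rebalance IPI so that their threads can migrate
	 * to a more appropriate cluster.
	 */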
4113 bitmap_t *foreign_pset_bitmap = cpset->foreign_psets;
4114 for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
		/* Skip the pset if it's not schedulable */
4116 processor_set_t target_pset = pset_array[cluster];
4117 if (sched_edge_pset_available(target_pset) == false) {
4118 continue;
4119 }
4120
4121 pset_lock(target_pset);
4122 uint64_t cpu_running_foreign_map = (target_pset->cpu_running_foreign & target_pset->cpu_state_map[PROCESSOR_RUNNING]);
4123 for (int cpuid = lsb_first(cpu_running_foreign_map); cpuid >= 0; cpuid = lsb_next(cpu_running_foreign_map, cpuid)) {
4124 if (!sched_edge_cpu_running_foreign_shared_rsrc_available(target_pset, cpuid, cpset)) {
4125 continue;
4126 }
4127 processor_t target_cpu = processor_array[cpuid];
			ipi_type[cpuid] = sched_ipi_action(target_cpu, NULL, SCHED_IPI_EVENT_REBALANCE);
4129 if (ipi_type[cpuid] != SCHED_IPI_NONE) {
4130 bit_set(ast_processor_map, cpuid);
4131 }
4132 }
4133 pset_unlock(target_pset);
4134 }
4135
4136 for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
4137 processor_t ast_processor = processor_array[cpuid];
4138 sched_ipi_perform(ast_processor, ipi_type[cpuid]);
4139 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNING) | DBG_FUNC_NONE, 0, cprocessor->cpu_id, cpuid, 0);
4140 }
4141
	/* The core should do a light-weight idle using WFE if it just sent out rebalance IPIs */
4143 return ast_processor_map != 0;
4144}
4145
4146/*
4147 * sched_edge_migration_check()
4148 *
4149 * Routine to evaluate an edge between two clusters to decide if migration is possible
4150 * across that edge. Also updates the selected_pset and max_edge_delta out parameters
 * accordingly. The return value indicates whether the invoking routine should short-circuit
 * the search because an ideal candidate has been found. The routine looks at either the
 * regular edges and cluster loads or the shared resource loads, based on the type of thread.
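 *
 * Illustrative example (hypothetical numbers): if the preferred cluster reports a load
 * of 100 for a non-shared-resource thread's bucket, a candidate cluster reports 20, and
 * the edge towards the candidate carries a migration weight of 64, the delta of 80 clears
 * the weight and the candidate becomes the current best choice; with a candidate load of
 * 50, the delta of 50 would fall below the weight and the candidate would be rejected.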
4154 */
4155static bool
4156sched_edge_migration_check(uint32_t cluster_id, processor_set_t preferred_pset,
4157 uint32_t preferred_cluster_load, thread_t thread, processor_set_t *selected_pset, uint32_t *max_edge_delta)
4158{
4159 uint32_t preferred_cluster_id = preferred_pset->pset_cluster_id;
4160 cluster_type_t preferred_cluster_type = pset_type_for_id(preferred_cluster_id);
4161 processor_set_t dst_pset = pset_array[cluster_id];
4162 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4163 bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
4164
4165 if (cluster_id == preferred_cluster_id) {
4166 return false;
4167 }
4168
4169 if (dst_pset == NULL) {
4170 return false;
4171 }
4172
4173 sched_clutch_edge *edge = preferred_pset->sched_edges;
4174 if (edge[cluster_id].sce_migration_allowed == false) {
4175 return false;
4176 }
4177 uint32_t dst_load = shared_rsrc_thread ? (uint32_t)sched_pset_cluster_shared_rsrc_load(dst_pset, shared_rsrc_type) : sched_edge_cluster_load_metric(dst_pset, thread->th_sched_bucket);
4178 if (dst_load == 0) {
4179 /* The candidate cluster is idle; select it immediately for execution */
4180 *selected_pset = dst_pset;
4181 *max_edge_delta = preferred_cluster_load;
4182 return true;
4183 }
4184
4185 uint32_t edge_delta = 0;
4186 if (dst_load > preferred_cluster_load) {
4187 return false;
4188 }
4189 edge_delta = preferred_cluster_load - dst_load;
4190 if (!shared_rsrc_thread && (edge_delta < edge[cluster_id].sce_migration_weight)) {
4191 /*
		 * For non-shared-resource threads, use the edge migration weight to decide whether
		 * the preferred cluster is over-committed enough at the QoS level of this thread,
		 * relative to the candidate, to justify the migration.
4194 */
4195 return false;
4196 }
4197
4198 if (edge_delta < *max_edge_delta) {
4199 return false;
4200 }
4201 if (edge_delta == *max_edge_delta) {
4202 /* If the edge delta is the same as the max delta, make sure a homogeneous cluster is picked */
4203 boolean_t selected_homogeneous = (pset_type_for_id((*selected_pset)->pset_cluster_id) == preferred_cluster_type);
4204 boolean_t candidate_homogeneous = (pset_type_for_id(dst_pset->pset_cluster_id) == preferred_cluster_type);
4205 if (selected_homogeneous || !candidate_homogeneous) {
4206 return false;
4207 }
4208 }
4209 /* dst_pset seems to be the best candidate for migration; however other candidates should still be evaluated */
4210 *max_edge_delta = edge_delta;
4211 *selected_pset = dst_pset;
4212 return false;
4213}
4214
4215/*
4216 * sched_edge_iterate_clusters_ordered()
4217 *
 * Routine to iterate clusters in die-local order. For multi-die machines,
 * the routine ensures that the candidate clusters on the same die as the
 * passed-in pset are returned before the remote-die clusters. This should
4221 * be used in all places where cluster selection in die order matters.
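 *
 * For example (hypothetical topology): if clusters 0 and 1 share a die with the
 * starting pset while cluster 2 sits on a remote die, successive calls return both
 * local candidates before the remote one, with each call resuming from previous_cluster.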
4222 */
4223
4224static int
4225sched_edge_iterate_clusters_ordered(processor_set_t starting_pset, uint64_t candidate_map, int previous_cluster)
4226{
4227 int cluster_id = -1;
4228
4229 uint64_t local_candidate_map = starting_pset->local_psets[0] & candidate_map;
4230 uint64_t remote_candidate_map = starting_pset->remote_psets[0] & candidate_map;
4231
4232 if (previous_cluster == -1) {
4233 /* previous_cluster == -1 indicates the initial condition */
4234 cluster_id = bit_first(local_candidate_map);
4235 if (cluster_id != -1) {
4236 return cluster_id;
4237 }
4238 return bit_first(remote_candidate_map);
4239 } else {
4240 /*
4241 * After the initial condition, the routine attempts to return a
4242 * cluster in the previous_cluster's locality. If none is available,
4243 * it looks at remote clusters.
4244 */
4245 if (bit_test(local_candidate_map, previous_cluster)) {
4246 cluster_id = bit_next(local_candidate_map, previous_cluster);
4247 if (cluster_id != -1) {
4248 return cluster_id;
4249 } else {
4250 return bit_first(remote_candidate_map);
4251 }
4252 }
4253 return bit_next(remote_candidate_map, previous_cluster);
4254 }
4255}
4256
4257/*
4258 * sched_edge_migrate_edges_evaluate()
4259 *
4260 * Routine to find the candidate for thread migration based on edge weights.
4261 *
4262 * Returns the most ideal cluster for execution of this thread based on outgoing edges of the preferred pset. Can
 * return preferred_pset if it is the best destination for this thread.
4264 */
4265static processor_set_t
4266sched_edge_migrate_edges_evaluate(processor_set_t preferred_pset, uint32_t preferred_cluster_load, thread_t thread)
4267{
4268 processor_set_t selected_pset = preferred_pset;
4269 uint32_t max_edge_delta = 0;
4270 bool search_complete = false;
4271 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4272 bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
4273
4274 bitmap_t *foreign_pset_bitmap = preferred_pset->foreign_psets;
4275 bitmap_t *native_pset_bitmap = preferred_pset->native_psets;
4276 /* Always start the search with the native clusters */
4277 int cluster_id = -1;
4278 while ((cluster_id = sched_edge_iterate_clusters_ordered(preferred_pset, native_pset_bitmap[0], cluster_id)) != -1) {
4279 search_complete = sched_edge_migration_check(cluster_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta);
4280 if (search_complete) {
4281 break;
4282 }
4283 }
4284
4285 if (search_complete) {
4286 return selected_pset;
4287 }
4288
4289 if (shared_rsrc_thread && (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST)) {
4290 /*
4291 * If the shared resource scheduling policy is EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST, the scheduler tries
4292 * to fill up the preferred cluster and its homogeneous peers first.
4293 */
4294
4295 if (max_edge_delta > 0) {
4296 /*
			 * A non-zero max_edge_delta means there is a peer cluster of the same type as the
			 * preferred cluster (since the code above only looks at the native_psets) which is
			 * running fewer shared-resource threads than the preferred cluster. This indicates
			 * that there is capacity on a native cluster where this thread should be placed.
4301 */
4302 return selected_pset;
4303 }
4304 /*
		 * All peer native clusters are at the same or higher shared resource usage;
		 * check if the preferred cluster has any capacity left.
4307 */
4308 if (sched_pset_cluster_shared_rsrc_load(preferred_pset, shared_rsrc_type) < pset_available_cpu_count(preferred_pset)) {
4309 return preferred_pset;
4310 }
4311 /*
		 * Looks like the preferred cluster and all its native peers are full of shared resource
		 * threads; we need to start looking at non-native clusters for capacity.
4314 */
4315 }
4316
4317 /* Now look at the non-native clusters */
4318 cluster_id = -1;
4319 while ((cluster_id = sched_edge_iterate_clusters_ordered(preferred_pset, foreign_pset_bitmap[0], cluster_id)) != -1) {
4320 search_complete = sched_edge_migration_check(cluster_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta);
4321 if (search_complete) {
4322 break;
4323 }
4324 }
4325 return selected_pset;
4326}
4327
4328/*
4329 * sched_edge_candidate_alternative()
4330 *
4331 * Routine to find an alternative cluster from candidate_cluster_bitmap since the
4332 * selected_pset is not available for execution. The logic tries to prefer homogeneous
4333 * clusters over heterogeneous clusters since this is typically used in thread
4334 * placement decisions.
4335 */
4336_Static_assert(MAX_PSETS <= 64, "Unable to fit maximum number of psets in uint64_t bitmask");
4337static processor_set_t
4338sched_edge_candidate_alternative(processor_set_t selected_pset, uint64_t candidate_cluster_bitmap)
4339{
4340 /*
4341 * It looks like the most ideal pset is not available for scheduling currently.
4342 * Try to find a homogeneous cluster that is still available.
4343 */
4344 uint64_t available_native_clusters = selected_pset->native_psets[0] & candidate_cluster_bitmap;
4345 int available_cluster_id = lsb_first(available_native_clusters);
4346 if (available_cluster_id == -1) {
4347 /* Looks like none of the homogeneous clusters are available; pick the first available cluster */
4348 available_cluster_id = bit_first(candidate_cluster_bitmap);
4349 }
4350 assert(available_cluster_id != -1);
4351 return pset_array[available_cluster_id];
4352}
4353
4354/*
4355 * sched_edge_switch_pset_lock()
4356 *
4357 * Helper routine for sched_edge_migrate_candidate() which switches pset locks (if needed) based on
4358 * switch_pset_locks.
4359 * Returns the newly locked pset after the switch.
4360 */
4361static processor_set_t
4362sched_edge_switch_pset_lock(processor_set_t selected_pset, processor_set_t locked_pset, bool switch_pset_locks)
4363{
4364 if (!switch_pset_locks) {
4365 return locked_pset;
4366 }
4367 if (selected_pset != locked_pset) {
4368 pset_unlock(locked_pset);
4369 pset_lock(selected_pset);
4370 return selected_pset;
4371 } else {
4372 return locked_pset;
4373 }
4374}
4375
4376/*
4377 * sched_edge_amp_rebalance_pset()
4378 *
 * Routine to decide where a thread that is eligible for AMP rebalance (i.e.
 * has executed on the non-preferred cluster type for a while) should be enqueued.
4381 * The algorithm maintains a history of AMP rebalance decisions on the clutch
4382 * bucket group of the workload and round-robins between clusters to ensure
4383 * that all threads get a chance on the performance cores and make equal
4384 * progress.
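 *
 * For example (hypothetical cluster ids): with eligible clusters {0, 2}, a last-chosen
 * value of 0 rotates the next rebalance to cluster 2, and a last-chosen value of 2
 * wraps back around to cluster 0.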
4385 */
4386static processor_set_t
4387sched_edge_amp_rebalance_pset(processor_set_t preferred_pset, thread_t thread)
4388{
4389 sched_clutch_t clutch = sched_clutch_for_thread(thread);
4390 sched_clutch_bucket_group_t clutch_bucket_group = &clutch->sc_clutch_groups[thread->th_sched_bucket];
4391
4392 uint32_t last_chosen_cluster, new_chosen_cluster;
4393
4394 /* Only AMP rebalance within clusters native to the preferred cluster */
4395 uint64_t eligible_pset_bitmask = preferred_pset->native_psets[0];
4396 /* Preferred cluster is also eligible for rebalancing */
4397 bit_set(eligible_pset_bitmask, preferred_pset->pset_cluster_id);
4398 /* Atomically update the AMP rebalance cluster for the clutch bucket group */
4399 os_atomic_rmw_loop(&clutch_bucket_group->scbg_amp_rebalance_last_chosen, last_chosen_cluster, new_chosen_cluster, relaxed, {
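		/*
		 * If no valid last-chosen cluster is recorded for this clutch bucket group
		 * (UINT32_MAX), start at the preferred cluster.
		 */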
4400 if (last_chosen_cluster == UINT32_MAX) {
4401 new_chosen_cluster = preferred_pset->pset_cluster_id;
4402 } else {
4403 new_chosen_cluster = lsb_next(eligible_pset_bitmask, last_chosen_cluster);
4404 if (new_chosen_cluster == -1) {
4405 /* Rotate to the start of the eligible bitmask */
4406 new_chosen_cluster = lsb_first(eligible_pset_bitmask);
4407 }
4408 }
4409 });
4410 return pset_array[new_chosen_cluster];
4411}
4412
4413/*
4414 * sched_edge_migrate_candidate()
4415 *
4416 * Routine to find an appropriate cluster for scheduling a thread. The routine looks at the properties of
4417 * the thread and the preferred cluster to determine the best available pset for scheduling.
4418 *
4419 * The switch_pset_locks parameter defines whether the routine should switch pset locks to provide an
4420 * accurate scheduling decision. This mode is typically used when choosing a pset for scheduling a thread since the
4421 * decision has to be synchronized with another CPU changing the recommendation of clusters available
 * on the system. If this parameter is set to false, this routine returns a best-effort indication of
 * the cluster the thread should be scheduled on. It is typically used in fast path contexts (such as
 * SCHED(thread_avoid_processor)) to determine if there is a possibility of scheduling this thread on a
 * more appropriate cluster.
4426 *
4427 * Routine returns the most ideal cluster for scheduling. If switch_pset_locks is set, it ensures that the
4428 * resultant pset lock is held.
4429 */
4430static processor_set_t
4431sched_edge_migrate_candidate(_Nullable processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks)
4432{
4433 processor_set_t selected_pset = preferred_pset;
4434 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4435 bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
4436
4437 if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
		/* For bound threads, always recommend the cluster the thread is bound to */
4439 selected_pset = pset_array[sched_edge_thread_bound_cluster_id(thread)];
4440 if (selected_pset != NULL) {
4441 locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
4442 if (sched_edge_pset_available(selected_pset) || (SCHED_CLUTCH_THREAD_CLUSTER_BOUND_SOFT(thread) == false)) {
4443 /*
4444 * If the bound cluster is not available, check if the thread is soft bound. For soft bound threads,
4445 * fall through to the regular cluster selection logic which handles unavailable clusters
4446 * appropriately. If the thread is hard bound, then return the bound cluster always.
4447 */
4448 return selected_pset;
4449 }
4450 }
4451 }
4452
4453 uint64_t candidate_cluster_bitmap = mask(sched_edge_max_clusters);
4454#if DEVELOPMENT || DEBUG
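	/*
	 * Development-only override: if the task has opted into using its pset hint for
	 * cluster type selection, honor the hint whenever it names a different cluster
	 * type than the currently selected pset.
	 */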
4455 extern int enable_task_set_cluster_type;
4456 task_t task = get_threadtask(thread);
4457 if (enable_task_set_cluster_type && (task->t_flags & TF_USE_PSET_HINT_CLUSTER_TYPE)) {
4458 processor_set_t pset_hint = task->pset_hint;
4459 if (pset_hint && (selected_pset == NULL || selected_pset->pset_cluster_type != pset_hint->pset_cluster_type)) {
4460 selected_pset = pset_hint;
4461 goto migrate_candidate_available_check;
4462 }
4463 }
4464#endif
4465
4466 if (preferred_pset == NULL) {
4467 /* The preferred_pset has not finished initializing at boot */
4468 goto migrate_candidate_available_check;
4469 }
4470
4471 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		/* For realtime threads, always try to schedule them on the preferred pset */
4473 goto migrate_candidate_available_check;
4474 }
4475
4476 uint32_t preferred_cluster_load = shared_rsrc_thread ? (uint32_t)sched_pset_cluster_shared_rsrc_load(preferred_pset, shared_rsrc_type) : sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket);
4477 if (preferred_cluster_load == 0) {
4478 goto migrate_candidate_available_check;
4479 }
4480
4481 /*
4482 * If a thread is being rebalanced for achieving equal progress of parallel workloads,
4483 * it needs to end up on the preferred runqueue. This mechanism should only be used for
4484 * threads which have been previously migrated to the non-preferred cluster type.
4485 *
4486 * The AMP rebalancing mechanism is available for regular threads or shared resource
4487 * threads with the EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST policy.
4488 */
4489 bool amp_rebalance_eligible = (!shared_rsrc_thread) || (shared_rsrc_thread && (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST));
4490 if (amp_rebalance_eligible) {
4491 boolean_t amp_rebalance = (thread->reason & (AST_REBALANCE | AST_QUANTUM)) == (AST_REBALANCE | AST_QUANTUM);
4492 if (amp_rebalance) {
4493 boolean_t non_preferred_pset = (thread->last_processor->processor_set->pset_type != preferred_pset->pset_type);
4494 if (non_preferred_pset) {
4495 selected_pset = sched_edge_amp_rebalance_pset(preferred_pset, thread);
4496 goto migrate_candidate_available_check;
4497 }
4498 }
4499 }
4500
4501 /* Look at edge weights to decide the most ideal migration candidate for this thread */
4502 selected_pset = sched_edge_migrate_edges_evaluate(preferred_pset, preferred_cluster_load, thread);
4503
4504migrate_candidate_available_check:
4505 if (selected_pset == NULL) {
4506 /* The selected_pset has not finished initializing at boot */
4507 pset_unlock(locked_pset);
4508 return NULL;
4509 }
4510
4511 locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
4512 if (sched_edge_pset_available(selected_pset) == true) {
4513 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_OVERLOAD) | DBG_FUNC_NONE, thread_tid(thread), preferred_pset->pset_cluster_id, selected_pset->pset_cluster_id, preferred_cluster_load);
4514 return selected_pset;
4515 }
4516 /* Looks like selected_pset is not available for scheduling; remove it from candidate_cluster_bitmap */
4517 bitmap_clear(&candidate_cluster_bitmap, selected_pset->pset_cluster_id);
4518 if (__improbable(bitmap_first(&candidate_cluster_bitmap, sched_edge_max_clusters) == -1)) {
4519 pset_unlock(locked_pset);
4520 return NULL;
4521 }
4522 /* Try and find an alternative for the selected pset */
4523 selected_pset = sched_edge_candidate_alternative(selected_pset, candidate_cluster_bitmap);
4524 goto migrate_candidate_available_check;
4525}
4526
4527static processor_t
4528sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread)
4529{
4530 /* Bound threads don't call this function */
4531 assert(thread->bound_processor == PROCESSOR_NULL);
4532 processor_t chosen_processor = PROCESSOR_NULL;
4533
4534 /*
4535 * sched_edge_preferred_pset() returns the preferred pset for a given thread.
	 * It should take the passed-in "pset" as a hint which represents the recency metric for
4537 * pset selection logic.
4538 */
4539 processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
4540 processor_set_t chosen_pset = preferred_pset;
4541 /*
4542 * If the preferred pset is overloaded, find a pset which is the best candidate to migrate
4543 * threads to. sched_edge_migrate_candidate() returns the preferred pset
4544 * if it has capacity; otherwise finds the best candidate pset to migrate this thread to.
4545 *
4546 * Edge Scheduler Optimization
4547 * It might be useful to build a recency metric for the thread for multiple clusters and
4548 * factor that into the migration decisions.
4549 */
4550 chosen_pset = sched_edge_migrate_candidate(preferred_pset, thread, pset, true);
4551 if (chosen_pset) {
4552 chosen_processor = choose_processor(chosen_pset, processor, thread);
4553 }
4554 /* For RT threads, choose_processor() can return a different cluster than the one passed into it */
4555 assert(chosen_processor ? chosen_processor->processor_set->pset_type == chosen_pset->pset_type : true);
4556 return chosen_processor;
4557}
4558
4559/*
4560 * sched_edge_clutch_bucket_threads_drain()
4561 *
4562 * Drains all the runnable threads which are not restricted to the root_clutch (due to clutch
4563 * bucket overrides etc.) into a local thread queue.
4564 */
4565static void
4566sched_edge_clutch_bucket_threads_drain(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch, queue_t clutch_threads)
4567{
4568 thread_t thread = THREAD_NULL;
4569 uint64_t current_timestamp = mach_approximate_time();
4570 qe_foreach_element_safe(thread, &clutch_bucket->scb_thread_timeshare_queue, th_clutch_timeshare_link) {
4571 sched_clutch_thread_remove(root_clutch, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
4572 enqueue_tail(clutch_threads, &thread->runq_links);
4573 }
4574}
4575
4576/*
4577 * sched_edge_run_drained_threads()
4578 *
4579 * Makes all drained threads in a local queue runnable.
4580 */
4581static void
4582sched_edge_run_drained_threads(queue_t clutch_threads)
4583{
4584 thread_t thread;
4585 /* Now setrun all the threads in the local queue */
4586 qe_foreach_element_safe(thread, clutch_threads, runq_links) {
4587 remqueue(&thread->runq_links);
4588 thread_lock(thread);
4589 thread_setrun(thread, SCHED_TAILQ);
4590 thread_unlock(thread);
4591 }
4592}
4593
4594/*
4595 * sched_edge_update_preferred_cluster()
4596 *
4597 * Routine to update the preferred cluster for QoS buckets within a thread group.
 * The buckets to be updated are specified as a bitmap (clutch_bucket_modify_bitmap).
4599 */
4600static void
4601sched_edge_update_preferred_cluster(
4602 sched_clutch_t sched_clutch,
4603 bitmap_t *clutch_bucket_modify_bitmap,
4604 uint32_t *tg_bucket_preferred_cluster)
4605{
4606 for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
4607 os_atomic_store(&sched_clutch->sc_clutch_groups[bucket].scbg_preferred_cluster, tg_bucket_preferred_cluster[bucket], relaxed);
4608 }
4609}
4610
4611/*
4612 * sched_edge_migrate_thread_group_runnable_threads()
4613 *
4614 * Routine to implement the migration of threads on a cluster when the thread group
4615 * recommendation is updated. The migration works using a 2-phase
4616 * algorithm.
4617 *
4618 * Phase 1: With the pset lock held, check the recommendation of the clutch buckets.
4619 * For each clutch bucket, if it needs to be migrated immediately, drain the threads
4620 * into a local thread queue. Otherwise mark the clutch bucket as native/foreign as
4621 * appropriate.
4622 *
4623 * Phase 2: After unlocking the pset, drain all the threads from the local thread
4624 * queue and mark them runnable which should land them in the right hierarchy.
4625 *
4626 * The routine assumes that the preferences for the clutch buckets/clutch bucket
4627 * groups have already been updated by the caller.
4628 *
4629 * - Called with the pset locked and interrupts disabled.
4630 * - Returns with the pset unlocked.
4631 */
4632static void
4633sched_edge_migrate_thread_group_runnable_threads(
4634 sched_clutch_t sched_clutch,
4635 sched_clutch_root_t root_clutch,
4636 bitmap_t *clutch_bucket_modify_bitmap,
4637 __unused uint32_t *tg_bucket_preferred_cluster,
4638 bool migrate_immediately)
4639{
4640 /* Queue to hold threads that have been drained from clutch buckets to be migrated */
4641 queue_head_t clutch_threads;
4642 queue_init(&clutch_threads);
4643
4644 for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
4645 /* Get the clutch bucket for this cluster and sched bucket */
4646 sched_clutch_bucket_group_t clutch_bucket_group = &(sched_clutch->sc_clutch_groups[bucket]);
4647 sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
4648 sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed);
4649 if (scb_root == NULL) {
4650 /* Clutch bucket not runnable or already in the right hierarchy; nothing to do here */
4651 assert(clutch_bucket->scb_thr_count == 0);
4652 continue;
4653 }
4654 assert(scb_root == root_clutch);
4655 uint32_t clutch_bucket_preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
4656
4657 if (migrate_immediately) {
4658 /*
4659 * For transitions where threads need to be migrated immediately, drain the threads into a
4660 * local queue unless we are looking at the clutch buckets for the newly recommended
4661 * cluster.
4662 */
4663 if (root_clutch->scr_cluster_id != clutch_bucket_preferred_cluster) {
4664 sched_edge_clutch_bucket_threads_drain(clutch_bucket, scb_root, &clutch_threads);
4665 } else {
4666 sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
4667 }
4668 } else {
4669 /* Check if this cluster is the same type as the newly recommended cluster */
4670 boolean_t homogeneous_cluster = (pset_type_for_id(root_clutch->scr_cluster_id) == pset_type_for_id(clutch_bucket_preferred_cluster));
4671 /*
4672 * If threads do not have to be migrated immediately, just change the native/foreign
4673 * flag on the clutch bucket.
4674 */
4675 if (homogeneous_cluster) {
4676 sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
4677 } else {
4678 sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
4679 }
4680 }
4681 }
4682
4683 pset_unlock(root_clutch->scr_pset);
4684 sched_edge_run_drained_threads(&clutch_threads);
4685}
4686
4687/*
4688 * sched_edge_migrate_thread_group_running_threads()
4689 *
4690 * Routine to find all running threads of a thread group on a specific cluster
4691 * and IPI them if they need to be moved immediately.
4692 */
4693static void
4694sched_edge_migrate_thread_group_running_threads(
4695 sched_clutch_t sched_clutch,
4696 sched_clutch_root_t root_clutch,
4697 __unused bitmap_t *clutch_bucket_modify_bitmap,
4698 uint32_t *tg_bucket_preferred_cluster,
4699 bool migrate_immediately)
4700{
4701 if (migrate_immediately == false) {
4702 /* If CLPC has recommended not to move threads immediately, nothing to do here */
4703 return;
4704 }
4705
4706 /*
4707 * Edge Scheduler Optimization
4708 *
4709 * When the system has a large number of clusters and cores, it might be useful to
4710 * narrow down the iteration by using a thread running bitmap per clutch.
4711 */
4712 uint64_t ast_processor_map = 0;
4713 sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
4714
4715 uint64_t running_map = root_clutch->scr_pset->cpu_state_map[PROCESSOR_RUNNING];
4716 /*
	 * Iterate all CPUs and look for the ones running threads from this thread group which are
	 * not restricted to this specific cluster (due to overrides etc.)
4719 */
4720 for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
4721 processor_t src_processor = processor_array[cpuid];
4722 boolean_t expected_tg = (src_processor->current_thread_group == sched_clutch->sc_tg);
4723 sched_bucket_t processor_sched_bucket = src_processor->processor_set->cpu_running_buckets[cpuid];
4724 if (processor_sched_bucket == TH_BUCKET_SCHED_MAX) {
4725 continue;
4726 }
4727 boolean_t non_preferred_cluster = tg_bucket_preferred_cluster[processor_sched_bucket] != root_clutch->scr_cluster_id;
4728
4729 if (expected_tg && non_preferred_cluster) {
4730 ipi_type[cpuid] = sched_ipi_action(src_processor, NULL, SCHED_IPI_EVENT_REBALANCE);
4731 if (ipi_type[cpuid] != SCHED_IPI_NONE) {
4732 bit_set(ast_processor_map, cpuid);
4733 } else if (src_processor == current_processor()) {
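				/*
				 * The thread that needs to migrate is running on the current processor;
				 * no cross-CPU IPI was generated, so mark a pending preemption for this
				 * CPU and request the AST locally instead.
				 */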
4734 bit_set(root_clutch->scr_pset->pending_AST_PREEMPT_cpu_mask, cpuid);
4735 ast_t new_preempt = update_pending_nonurgent_preemption(src_processor, AST_PREEMPT);
4736 ast_on(new_preempt);
4737 }
4738 }
4739 }
4740
4741 /* Perform all the IPIs */
4742 if (bit_first(ast_processor_map) != -1) {
4743 for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
4744 processor_t ast_processor = processor_array[cpuid];
4745 sched_ipi_perform(ast_processor, ipi_type[cpuid]);
4746 }
4747 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, thread_group_get_id(sched_clutch->sc_tg), ast_processor_map, 0, 0);
4748 }
4749}
4750
4751/*
4752 * sched_edge_tg_preferred_cluster_change()
4753 *
4754 * Routine to handle changes to a thread group's recommendation. In the Edge Scheduler, the preferred cluster
4755 * is specified on a per-QoS basis within a thread group. The routine updates the preferences and performs
4756 * thread migrations based on the policy specified by CLPC.
4757 * tg_bucket_preferred_cluster is an array of size TH_BUCKET_SCHED_MAX which specifies the new preferred cluster
4758 * for each QoS within the thread group.
4759 */
4760void
4761sched_edge_tg_preferred_cluster_change(struct thread_group *tg, uint32_t *tg_bucket_preferred_cluster, sched_perfcontrol_preferred_cluster_options_t options)
4762{
4763 sched_clutch_t clutch = sched_clutch_for_thread_group(tg);
4764 /*
4765 * In order to optimize the processing, create a bitmap which represents all QoS buckets
4766 * for which the preferred cluster has changed.
4767 */
4768 bitmap_t clutch_bucket_modify_bitmap[BITMAP_LEN(TH_BUCKET_SCHED_MAX)] = {0};
4769 for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
4770 uint32_t old_preferred_cluster = sched_edge_clutch_bucket_group_preferred_cluster(&clutch->sc_clutch_groups[bucket]);
4771 uint32_t new_preferred_cluster = tg_bucket_preferred_cluster[bucket];
4772 if (old_preferred_cluster != new_preferred_cluster) {
4773 bitmap_set(clutch_bucket_modify_bitmap, bucket);
4774 }
4775 }
4776 if (bitmap_lsb_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
4777 /* No changes in any clutch buckets; nothing to do here */
4778 return;
4779 }
4780
4781 for (uint32_t cluster_id = 0; cluster_id < sched_edge_max_clusters; cluster_id++) {
4782 processor_set_t pset = pset_array[cluster_id];
4783 spl_t s = splsched();
4784 pset_lock(pset);
4785 /*
4786 * The first operation is to update the preferred cluster for all QoS buckets within the
4787 * thread group so that any future threads becoming runnable would see the new preferred
4788 * cluster value.
4789 */
4790 sched_edge_update_preferred_cluster(clutch, clutch_bucket_modify_bitmap, tg_bucket_preferred_cluster);
4791 /*
4792 * Currently iterates all clusters looking for running threads for a TG to be migrated. Can be optimized
4793 * by keeping a per-clutch bitmap of clusters running threads for a particular TG.
4794 *
4795 * Edge Scheduler Optimization
4796 */
4797 /* Migrate all running threads of the TG on this cluster based on options specified by CLPC */
4798 sched_edge_migrate_thread_group_running_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
4799 tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNING));
4800 /* Migrate all runnable threads of the TG in this cluster's hierarchy based on options specified by CLPC */
4801 sched_edge_migrate_thread_group_runnable_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
4802 tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNABLE));
4803 /* sched_edge_migrate_thread_group_runnable_threads() returns with pset unlocked */
4804 splx(s);
4805 }
4806}
4807
4808/*
4809 * sched_edge_pset_made_schedulable()
4810 *
4811 * Routine to migrate all the clutch buckets which are not in their recommended
 * pset hierarchy now that a new pset has become schedulable. It's possible that this
 * routine is called when the pset is already marked schedulable.
4814 *
4815 * Invoked with the pset lock held and interrupts disabled.
4816 */
4817static void
4818sched_edge_pset_made_schedulable(__unused processor_t processor, processor_set_t dst_pset, boolean_t drop_lock)
4819{
4820 if (bitmap_test(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id)) {
4821 /* Nothing to do here since pset is already marked schedulable */
4822 if (drop_lock) {
4823 pset_unlock(dst_pset);
4824 }
4825 return;
4826 }
4827
4828 bitmap_set(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id);
4829
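	/*
	 * sched_edge_processor_idle() returns with dst_pset unlocked, which is why the
	 * lock is re-acquired below only when the caller expects it to still be held.
	 */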
4830 thread_t thread = sched_edge_processor_idle(dst_pset);
4831 if (thread != THREAD_NULL) {
4832 thread_lock(thread);
4833 thread_setrun(thread, SCHED_TAILQ);
4834 thread_unlock(thread);
4835 }
4836
4837 if (!drop_lock) {
4838 pset_lock(dst_pset);
4839 }
4840}
4841
4842/*
4843 * sched_edge_cpu_init_completed()
4844 *
4845 * Callback routine from the platform layer once all CPUs/clusters have been initialized. This
4846 * provides an opportunity for the edge scheduler to initialize all the edge parameters.
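 *
 * The default edge matrix configured here allows migration and stealing freely between
 * clusters of the same type, gates P->E migration behind a non-zero migration weight,
 * and disallows E->P migration and stealing entirely.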
4847 */
4848static void
4849sched_edge_cpu_init_completed(void)
4850{
4851 spl_t s = splsched();
4852 for (int src_cluster_id = 0; src_cluster_id < sched_edge_max_clusters; src_cluster_id++) {
4853 processor_set_t src_pset = pset_array[src_cluster_id];
4854 pset_lock(src_pset);
4855
4856 /* For each cluster, set all its outgoing edge parameters */
4857 for (int dst_cluster_id = 0; dst_cluster_id < sched_edge_max_clusters; dst_cluster_id++) {
4858 if (dst_cluster_id == src_cluster_id) {
4859 continue;
4860 }
4861 processor_set_t dst_pset = pset_array[dst_cluster_id];
4862 if (src_pset->pset_type == dst_pset->pset_type) {
4863 /* P->P/E->E edge config */
4864 bitmap_clear(src_pset->foreign_psets, dst_cluster_id);
4865 bitmap_set(src_pset->native_psets, dst_cluster_id);
4866 sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 1, .sce_steal_allowed = 1});
4867 } else if ((src_pset->pset_type == CLUSTER_TYPE_P) && (dst_pset->pset_type == CLUSTER_TYPE_E)) {
4868 /* P->E edge config */
4869 bitmap_set(src_pset->foreign_psets, dst_cluster_id);
4870 bitmap_clear(src_pset->native_psets, dst_cluster_id);
4871 sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 64, .sce_migration_allowed = 1, .sce_steal_allowed = 1});
4872 } else {
4873 /* E->P edge config */
4874 bitmap_set(src_pset->foreign_psets, dst_cluster_id);
4875 bitmap_clear(src_pset->native_psets, dst_cluster_id);
4876 sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0});
4877 }
4878 bool clusters_local = (ml_get_die_id(src_cluster_id) == ml_get_die_id(dst_cluster_id));
4879 if (clusters_local) {
4880 bitmap_set(src_pset->local_psets, dst_cluster_id);
4881 bitmap_clear(src_pset->remote_psets, dst_cluster_id);
4882 } else {
4883 bitmap_set(src_pset->remote_psets, dst_cluster_id);
4884 bitmap_clear(src_pset->local_psets, dst_cluster_id);
4885 }
4886 }
4887
4888 pset_unlock(src_pset);
4889 }
4890 splx(s);
4891}
4892
4893static bool
4894sched_edge_thread_eligible_for_pset(thread_t thread, processor_set_t pset)
4895{
4896 uint32_t preferred_cluster_id = sched_edge_thread_preferred_cluster(thread);
4897 if (preferred_cluster_id == pset->pset_cluster_id) {
4898 return true;
4899 } else {
4900 processor_set_t preferred_pset = pset_array[preferred_cluster_id];
4901 return preferred_pset->sched_edges[pset->pset_cluster_id].sce_migration_allowed;
4902 }
4903}
4904
4905extern int sched_amp_spill_deferred_ipi;
4906extern int sched_amp_pcores_preempt_immediate_ipi;
4907
4908int sched_edge_migrate_ipi_immediate = 1;
4909
4910sched_ipi_type_t
4911sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
4912{
4913 processor_set_t pset = dst->processor_set;
4914 assert(dst != current_processor());
4915
4916 boolean_t deferred_ipi_supported = false;
4917#if defined(CONFIG_SCHED_DEFERRED_AST)
4918 deferred_ipi_supported = true;
4919#endif /* CONFIG_SCHED_DEFERRED_AST */
4920
4921 switch (event) {
4922 case SCHED_IPI_EVENT_SPILL:
4923 /* For Spill event, use deferred IPIs if sched_amp_spill_deferred_ipi set */
		if (deferred_ipi_supported && sched_amp_spill_deferred_ipi) {
4925 return sched_ipi_deferred_policy(pset, dst, thread, event);
4926 }
4927 break;
4928 case SCHED_IPI_EVENT_PREEMPT:
		/*
		 * For preemption, the default policy is to use deferred IPIs
		 * for non-RT P-core preemption. Override that behavior if
		 * sched_amp_pcores_preempt_immediate_ipi is set.
		 */
4933 if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
4934 if (sched_amp_pcores_preempt_immediate_ipi && (pset_type_for_id(pset->pset_cluster_id) == CLUSTER_TYPE_P)) {
4935 return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4936 }
4937 if (sched_edge_migrate_ipi_immediate) {
4938 processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
4939 /*
4940 * For IPI'ing CPUs that are homogeneous with the preferred cluster, use immediate IPIs
4941 */
4942 if (preferred_pset->pset_type == pset->pset_type) {
4943 return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4944 }
4945 /*
				 * For workloads that are going wide, it might be useful to use an immediate IPI to
				 * wake up the idle CPU if the scheduler estimates that the preferred pset will
				 * be busy for the deferred IPI timeout. The Edge Scheduler uses the average execution
				 * latency on the preferred pset as an estimate of busyness.
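				 * The per-bucket average execution time is maintained in microseconds, hence the
				 * NSEC_PER_USEC scaling before comparing it with the deferred IPI timeout
				 * (which is expressed in nanoseconds).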
4950 */
4951 if ((preferred_pset->pset_execution_time[thread->th_sched_bucket].pset_avg_thread_execution_time * NSEC_PER_USEC) >= ml_cpu_signal_deferred_get_timer()) {
4952 return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4953 }
4954 }
4955 }
4956 break;
4957 default:
4958 break;
4959 }
4960 /* Default back to the global policy for all other scenarios */
4961 return sched_ipi_policy(dst, thread, dst_idle, event);
4962}
4963
4964/*
4965 * sched_edge_qos_max_parallelism()
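 *
 * Returns the maximum recommended parallelism for the given QoS class: a CPU count by
 * default, or a cluster count when the caller passes QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE.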
4966 */
4967uint32_t
4968sched_edge_qos_max_parallelism(int qos, uint64_t options)
4969{
4970 uint32_t ecpu_count = ml_get_cpu_number_type(CLUSTER_TYPE_E, false, false);
4971 uint32_t pcpu_count = ml_get_cpu_number_type(CLUSTER_TYPE_P, false, false);
4972 uint32_t ecluster_count = ml_get_cluster_number_type(CLUSTER_TYPE_E);
4973 uint32_t pcluster_count = ml_get_cluster_number_type(CLUSTER_TYPE_P);
4974
4975 if (options & QOS_PARALLELISM_REALTIME) {
		/*
		 * For realtime threads on AMP, limit the width to just the
		 * P-cores since we do not spill/rebalance RT threads.
		 */
4980 return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? pcluster_count : pcpu_count;
4981 }
4982
4983 /*
4984 * The Edge scheduler supports per-QoS recommendations for thread groups.
4985 * This enables lower QoS buckets (such as UT) to be scheduled on all
4986 * CPUs on the system.
4987 *
4988 * The only restriction is for BG/Maintenance QoS classes for which the
4989 * performance controller would never recommend execution on the P-cores.
4990 * If that policy changes in the future, this value should be changed.
4991 */
4992 switch (qos) {
4993 case THREAD_QOS_BACKGROUND:
4994 case THREAD_QOS_MAINTENANCE:
4995 return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? ecluster_count : ecpu_count;
4996 default:
4997 return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? (ecluster_count + pcluster_count) : (ecpu_count + pcpu_count);
4998 }
4999}
5000
5001
5002
5003#endif /* CONFIG_SCHED_EDGE */
5004
5005#endif /* CONFIG_SCHED_CLUTCH */
5006