1/*
2 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_FREE_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58/*
59 * File: sched_prim.c
60 * Author: Avadis Tevanian, Jr.
61 * Date: 1986
62 *
63 * Scheduling primitives
64 *
65 */
66
67#include <debug.h>
68
69#include <mach/mach_types.h>
70#include <mach/machine.h>
71#include <mach/policy.h>
72#include <mach/sync_policy.h>
73#include <mach/thread_act.h>
74
75#include <machine/machine_routines.h>
76#include <machine/sched_param.h>
77#include <machine/machine_cpu.h>
78#include <machine/limits.h>
79#include <machine/atomic.h>
80
81#include <machine/commpage.h>
82
83#include <kern/kern_types.h>
84#include <kern/backtrace.h>
85#include <kern/clock.h>
86#include <kern/cpu_number.h>
87#include <kern/cpu_data.h>
88#include <kern/smp.h>
89#include <kern/debug.h>
90#include <kern/macro_help.h>
91#include <kern/machine.h>
92#include <kern/misc_protos.h>
93#include <kern/monotonic.h>
94#include <kern/processor.h>
95#include <kern/queue.h>
96#include <kern/recount.h>
97#include <kern/restartable.h>
98#include <kern/sched.h>
99#include <kern/sched_prim.h>
100#include <kern/sfi.h>
101#include <kern/syscall_subr.h>
102#include <kern/task.h>
103#include <kern/thread.h>
104#include <kern/thread_group.h>
105#include <kern/ledger.h>
106#include <kern/timer_queue.h>
107#include <kern/waitq.h>
108#include <kern/policy_internal.h>
109
110#include <vm/pmap.h>
111#include <vm/vm_kern.h>
112#include <vm/vm_map.h>
113#include <vm/vm_pageout.h>
114
115#include <mach/sdt.h>
116#include <mach/mach_host.h>
117#include <mach/host_info.h>
118
119#include <sys/kdebug.h>
120#include <kperf/kperf.h>
121#include <kern/kpc.h>
122#include <san/kasan.h>
123#include <kern/pms.h>
124#include <kern/host.h>
125#include <stdatomic.h>
126#include <os/atomic_private.h>
127
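/*
 * Emit these scheduler tracepoints via KDBG_MACOS_RELEASE when the platform
 * provides it; otherwise fall back to the generic KDBG_RELEASE macro.
 */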
128#ifdef KDBG_MACOS_RELEASE
129#define KTRC KDBG_MACOS_RELEASE
130#else
131#define KTRC KDBG_RELEASE
132#endif
133
134struct sched_statistics PERCPU_DATA(sched_stats);
135bool sched_stats_active;
136
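/* Saturating add for deadlines: clamps to UINT64_MAX rather than wrapping. */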
137static uint64_t
138deadline_add(uint64_t d, uint64_t e)
139{
140 uint64_t sum;
141 return os_add_overflow(d, e, &sum) ? UINT64_MAX : sum;
142}
143
144int
145rt_runq_count(processor_set_t pset)
146{
147 return os_atomic_load(&SCHED(rt_runq)(pset)->count, relaxed);
148}
149
150uint64_t
151rt_runq_earliest_deadline(processor_set_t pset)
152{
153 return os_atomic_load_wide(&SCHED(rt_runq)(pset)->earliest_deadline, relaxed);
154}
155
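/*
 * Priority (as a global sched_pri) of the first occupied bucket in the
 * pset's RT run queue, or -1 if the RT run queue is empty.
 */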
156static int
157rt_runq_priority(processor_set_t pset)
158{
159 pset_assert_locked(pset);
160 rt_queue_t rt_run_queue = SCHED(rt_runq)(pset);
161
162 bitmap_t *map = rt_run_queue->bitmap;
163 int i = bitmap_first(map, NRTQS);
164 assert(i < NRTQS);
165
166 if (i >= 0) {
167 return i + BASEPRI_RTQUEUES;
168 }
169
170 return i;
171}
172
173static thread_t rt_runq_first(rt_queue_t rt_runq);
174
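/*
 * DEBUG-only consistency check: walks every per-priority RT queue and
 * verifies the cached bitmap, counts, earliest deadlines and constraints,
 * and (when 'thread' is non-NULL) that it is actually enqueued.
 */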
175#if DEBUG
176static void
177check_rt_runq_consistency(rt_queue_t rt_run_queue, thread_t thread)
178{
179 bitmap_t *map = rt_run_queue->bitmap;
180
181 uint64_t earliest_deadline = RT_DEADLINE_NONE;
182 uint32_t constraint = RT_CONSTRAINT_NONE;
183 int ed_index = NOPRI;
184 int count = 0;
185 bool found_thread = false;
186
187 for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
188 int i = pri - BASEPRI_RTQUEUES;
189 rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
190 queue_t queue = &rt_runq->pri_queue;
191 queue_entry_t iter;
192 int n = 0;
193 uint64_t previous_deadline = 0;
194 qe_foreach(iter, queue) {
195 thread_t iter_thread = qe_element(iter, struct thread, runq_links);
196 assert_thread_magic(iter_thread);
197 if (iter_thread == thread) {
198 found_thread = true;
199 }
200 assert(iter_thread->sched_pri == (i + BASEPRI_RTQUEUES));
201 assert(iter_thread->realtime.deadline < RT_DEADLINE_NONE);
202 assert(iter_thread->realtime.constraint < RT_CONSTRAINT_NONE);
203 assert(previous_deadline <= iter_thread->realtime.deadline);
204 n++;
205 if (iter == queue_first(queue)) {
206 assert(rt_runq->pri_earliest_deadline == iter_thread->realtime.deadline);
207 assert(rt_runq->pri_constraint == iter_thread->realtime.constraint);
208 }
209 previous_deadline = iter_thread->realtime.deadline;
210 }
211 assert(n == rt_runq->pri_count);
212 if (n == 0) {
213 assert(bitmap_test(map, i) == false);
214 assert(rt_runq->pri_earliest_deadline == RT_DEADLINE_NONE);
215 assert(rt_runq->pri_constraint == RT_CONSTRAINT_NONE);
216 } else {
217 assert(bitmap_test(map, i) == true);
218 }
219 if (rt_runq->pri_earliest_deadline < earliest_deadline) {
220 earliest_deadline = rt_runq->pri_earliest_deadline;
221 constraint = rt_runq->pri_constraint;
222 ed_index = i;
223 }
224 count += n;
225 }
226 assert(os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed) == earliest_deadline);
227 assert(os_atomic_load(&rt_run_queue->count, relaxed) == count);
228 assert(os_atomic_load(&rt_run_queue->constraint, relaxed) == constraint);
229 assert(os_atomic_load(&rt_run_queue->ed_index, relaxed) == ed_index);
230 if (thread) {
231 assert(found_thread);
232 }
233}
234#define CHECK_RT_RUNQ_CONSISTENCY(q, th) check_rt_runq_consistency(q, th)
235#else
236#define CHECK_RT_RUNQ_CONSISTENCY(q, th) do {} while (0)
237#endif
238
239uint32_t rt_constraint_threshold;
240
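/*
 * True when the RT run queue's cached constraint is at or below
 * rt_constraint_threshold, i.e. tight-deadline work is queued.
 */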
241static bool
242rt_runq_is_low_latency(processor_set_t pset)
243{
244 return os_atomic_load(&SCHED(rt_runq)(pset)->constraint, relaxed) <= rt_constraint_threshold;
245}
246
247TUNABLE(bool, cpulimit_affects_quantum, "cpulimit_affects_quantum", true);
248
249/* TODO: enable this, to 50us (less than the deferred IPI latency, to beat a spill) */
250TUNABLE(uint32_t, nonurgent_preemption_timer_us, "nonurgent_preemption_timer", 0); /* microseconds */
251static uint64_t nonurgent_preemption_timer_abs = 0;
252
253#define DEFAULT_PREEMPTION_RATE 100 /* (1/s) */
254TUNABLE(int, default_preemption_rate, "preempt", DEFAULT_PREEMPTION_RATE);
255
256#define DEFAULT_BG_PREEMPTION_RATE 400 /* (1/s) */
257TUNABLE(int, default_bg_preemption_rate, "bg_preempt", DEFAULT_BG_PREEMPTION_RATE);
258
259#define MAX_UNSAFE_RT_QUANTA 100
260#define SAFE_RT_MULTIPLIER 2
261
262#define MAX_UNSAFE_FIXED_QUANTA 100
263#define SAFE_FIXED_MULTIPLIER 2
264
265TUNABLE_DEV_WRITEABLE(int, max_unsafe_rt_quanta, "max_unsafe_rt_quanta", MAX_UNSAFE_RT_QUANTA);
266TUNABLE_DEV_WRITEABLE(int, max_unsafe_fixed_quanta, "max_unsafe_fixed_quanta", MAX_UNSAFE_FIXED_QUANTA);
267
268TUNABLE_DEV_WRITEABLE(int, safe_rt_multiplier, "safe_rt_multiplier", SAFE_RT_MULTIPLIER);
TUNABLE_DEV_WRITEABLE(int, safe_fixed_multiplier, "safe_fixed_multiplier", SAFE_FIXED_MULTIPLIER);
270
271#define MAX_POLL_QUANTA 2
272TUNABLE(int, max_poll_quanta, "poll", MAX_POLL_QUANTA);
273
274#define SCHED_POLL_YIELD_SHIFT 4 /* 1/16 */
275int sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;
276
277uint64_t max_poll_computation;
278
279uint64_t max_unsafe_rt_computation;
280uint64_t max_unsafe_fixed_computation;
281uint64_t sched_safe_rt_duration;
282uint64_t sched_safe_fixed_duration;
283
284#if defined(CONFIG_SCHED_TIMESHARE_CORE)
285
286uint32_t std_quantum;
287uint32_t min_std_quantum;
288uint32_t bg_quantum;
289
290uint32_t std_quantum_us;
291uint32_t bg_quantum_us;
292
293#endif /* CONFIG_SCHED_TIMESHARE_CORE */
294
295uint32_t thread_depress_time;
296uint32_t default_timeshare_computation;
297uint32_t default_timeshare_constraint;
298
299uint32_t max_rt_quantum;
300uint32_t min_rt_quantum;
301
302uint32_t rt_deadline_epsilon;
303
304uint32_t rt_constraint_threshold;
305
306#if defined(CONFIG_SCHED_TIMESHARE_CORE)
307
308unsigned sched_tick;
309uint32_t sched_tick_interval;
310
311/* Timeshare load calculation interval (15ms) */
312uint32_t sched_load_compute_interval_us = 15000;
313uint64_t sched_load_compute_interval_abs;
314static _Atomic uint64_t sched_load_compute_deadline;
315
316uint32_t sched_pri_shifts[TH_BUCKET_MAX];
317uint32_t sched_fixed_shift;
318
319uint32_t sched_decay_usage_age_factor = 1; /* accelerate 5/8^n usage aging */
320
321/* Allow foreground to decay past default to resolve inversions */
322#define DEFAULT_DECAY_BAND_LIMIT ((BASEPRI_FOREGROUND - BASEPRI_DEFAULT) + 2)
323int sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
324
325/* Defaults for timer deadline profiling */
326#define TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT 2000000 /* Timers with deadlines <=
327 * 2ms */
328#define TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT 5000000 /* Timers with deadlines
329 * <= 5ms */
330
331uint64_t timer_deadline_tracking_bin_1;
332uint64_t timer_deadline_tracking_bin_2;
333
334#endif /* CONFIG_SCHED_TIMESHARE_CORE */
335
336thread_t sched_maintenance_thread;
337
338/* interrupts disabled lock to guard recommended cores state */
339decl_simple_lock_data(, sched_available_cores_lock);
340uint64_t perfcontrol_requested_recommended_cores = ALL_CORES_RECOMMENDED;
341uint64_t perfcontrol_system_requested_recommended_cores = ALL_CORES_RECOMMENDED;
342uint64_t perfcontrol_user_requested_recommended_cores = ALL_CORES_RECOMMENDED;
343static uint64_t usercontrol_requested_recommended_cores = ALL_CORES_RECOMMENDED;
344static uint64_t sched_online_processors = 0;
345static void sched_update_recommended_cores(uint64_t recommended_cores, processor_reason_t reason, uint32_t flags);
346static void sched_update_powered_cores(uint64_t reqested_powered_cores, processor_reason_t reason, uint32_t flags);
347
348#if __arm64__
349static void sched_recommended_cores_maintenance(void);
350uint64_t perfcontrol_failsafe_starvation_threshold;
351extern char *proc_name_address(struct proc *p);
352#endif /* __arm64__ */
353
354uint64_t sched_one_second_interval;
355boolean_t allow_direct_handoff = TRUE;
356
357/* Forwards */
358
359#if defined(CONFIG_SCHED_TIMESHARE_CORE)
360
361static void load_shift_init(void);
362static void preempt_pri_init(void);
363
364#endif /* CONFIG_SCHED_TIMESHARE_CORE */
365
366thread_t processor_idle(
367 thread_t thread,
368 processor_t processor);
369
370static ast_t
371csw_check_locked(
372 thread_t thread,
373 processor_t processor,
374 processor_set_t pset,
375 ast_t check_reason);
376
377static void processor_setrun(
378 processor_t processor,
379 thread_t thread,
380 integer_t options);
381
382static void
383sched_realtime_timebase_init(void);
384
385static void
386sched_timer_deadline_tracking_init(void);
387
388#if DEBUG
389extern int debug_task;
390#define TLOG(a, fmt, args...) if(debug_task & a) kprintf(fmt, ## args)
391#else
392#define TLOG(a, fmt, args...) do {} while (0)
393#endif
394
395static processor_t
396thread_bind_internal(
397 thread_t thread,
398 processor_t processor);
399
400static void
401sched_vm_group_maintenance(void);
402
403#if defined(CONFIG_SCHED_TIMESHARE_CORE)
404int8_t sched_load_shifts[NRQS];
405bitmap_t sched_preempt_pri[BITMAP_LEN(NRQS_MAX)];
406#endif /* CONFIG_SCHED_TIMESHARE_CORE */
407
408/*
409 * Statically allocate a buffer to hold the longest possible
410 * scheduler description string, as currently implemented.
411 * bsd/kern/kern_sysctl.c has a corresponding definition in bsd/
412 * to export to userspace via sysctl(3). If either version
413 * changes, update the other.
414 *
415 * Note that in addition to being an upper bound on the strings
416 * in the kernel, it's also an exact parameter to PE_get_default(),
417 * which interrogates the device tree on some platforms. That
418 * API requires the caller know the exact size of the device tree
419 * property, so we need both a legacy size (32) and the current size
420 * (48) to deal with old and new device trees. The device tree property
421 * is similarly padded to a fixed size so that the same kernel image
422 * can run on multiple devices with different schedulers configured
423 * in the device tree.
424 */
425char sched_string[SCHED_STRING_MAX_LENGTH];
426
427uint32_t sched_debug_flags = SCHED_DEBUG_FLAG_CHOOSE_PROCESSOR_TRACEPOINTS;
428
429/* Global flag which indicates whether Background Stepper Context is enabled */
430static int cpu_throttle_enabled = 1;
431
432#if DEVELOPMENT || DEBUG
433int enable_task_set_cluster_type = 0;
434bool system_ecore_only = false;
435#endif /* DEVELOPMENT || DEBUG */
436
void
sched_init(void)
{
	boolean_t direct_handoff = FALSE;
	kprintf("Scheduler: Default of %s\n", SCHED(sched_name));

	if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) {
		/* No boot-args, check in device tree */
		if (!PE_get_default("kern.sched_pri_decay_limit",
		    &sched_pri_decay_band_limit,
		    sizeof(sched_pri_decay_band_limit))) {
			/* Allow decay all the way to normal limits */
			sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
		}
	}

	kprintf("Setting scheduler priority decay band limit %d\n", sched_pri_decay_band_limit);

	if (PE_parse_boot_argn("sched_debug", &sched_debug_flags, sizeof(sched_debug_flags))) {
		kprintf("Scheduler: Debug flags 0x%08x\n", sched_debug_flags);
	}
	strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string));

#if __arm64__
	clock_interval_to_absolutetime_interval(expecting_ipi_wfe_timeout_usec, NSEC_PER_USEC, &expecting_ipi_wfe_timeout_mt);
#endif /* __arm64__ */

	SCHED(init)();
	SCHED(rt_init)(&pset0);
	sched_timer_deadline_tracking_init();

	SCHED(pset_init)(&pset0);
	SCHED(processor_init)(master_processor);

	if (PE_parse_boot_argn("direct_handoff", &direct_handoff, sizeof(direct_handoff))) {
		allow_direct_handoff = direct_handoff;
	}

#if DEVELOPMENT || DEBUG
	if (PE_parse_boot_argn("enable_skstsct", &enable_task_set_cluster_type, sizeof(enable_task_set_cluster_type))) {
		system_ecore_only = (enable_task_set_cluster_type == 2);
	}
#endif /* DEVELOPMENT || DEBUG */

	simple_lock_init(&sched_available_cores_lock, 0);
}

void
sched_timebase_init(void)
{
	uint64_t abstime;

	clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime);
	sched_one_second_interval = abstime;

	SCHED(timebase_init)();
	sched_realtime_timebase_init();
}
495
496#if defined(CONFIG_SCHED_TIMESHARE_CORE)
497
void
sched_timeshare_init(void)
{
	/*
	 * Calculate the timeslicing quantum
	 * in us.
	 */
	if (default_preemption_rate < 1) {
		default_preemption_rate = DEFAULT_PREEMPTION_RATE;
	}
	std_quantum_us = (1000 * 1000) / default_preemption_rate;

	printf("standard timeslicing quantum is %d us\n", std_quantum_us);

	if (default_bg_preemption_rate < 1) {
		default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
	}
	bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate;

	printf("standard background quantum is %d us\n", bg_quantum_us);

	load_shift_init();
	preempt_pri_init();
	sched_tick = 0;
}
523
524void
525sched_set_max_unsafe_rt_quanta(int max)
526{
527 const uint32_t quantum_size = SCHED(initial_quantum_size)(THREAD_NULL);
528
529 max_unsafe_rt_computation = ((uint64_t)max) * quantum_size;
530
531 const int mult = safe_rt_multiplier <= 0 ? 2 : safe_rt_multiplier;
532 sched_safe_rt_duration = mult * ((uint64_t)max) * quantum_size;
533
534
535#if DEVELOPMENT || DEBUG
536 max_unsafe_rt_quanta = max;
537#else
538 /*
539 * On RELEASE kernels, this is only called on boot where
540 * max is already equal to max_unsafe_rt_quanta.
541 */
542 assert3s(max, ==, max_unsafe_rt_quanta);
543#endif
544}
545
546void
547sched_set_max_unsafe_fixed_quanta(int max)
548{
549 const uint32_t quantum_size = SCHED(initial_quantum_size)(THREAD_NULL);
550
551 max_unsafe_fixed_computation = ((uint64_t)max) * quantum_size;
552
553 const int mult = safe_fixed_multiplier <= 0 ? 2 : safe_fixed_multiplier;
554 sched_safe_fixed_duration = mult * ((uint64_t)max) * quantum_size;
555
556#if DEVELOPMENT || DEBUG
557 max_unsafe_fixed_quanta = max;
558#else
559 /*
560 * On RELEASE kernels, this is only called on boot where
561 * max is already equal to max_unsafe_fixed_quanta.
562 */
563 assert3s(max, ==, max_unsafe_fixed_quanta);
564#endif
565}
566
void
sched_timeshare_timebase_init(void)
{
	uint64_t abstime;
	uint32_t shift;

	/* standard timeslicing quantum */
	clock_interval_to_absolutetime_interval(
		std_quantum_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	std_quantum = (uint32_t)abstime;

	/* smallest remaining quantum (250 us) */
	clock_interval_to_absolutetime_interval(250, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	min_std_quantum = (uint32_t)abstime;

	/* quantum for background tasks */
	clock_interval_to_absolutetime_interval(
		bg_quantum_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	bg_quantum = (uint32_t)abstime;

	/* scheduler tick interval */
	clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT,
	    NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	sched_tick_interval = (uint32_t)abstime;

	/* timeshare load calculation interval & deadline initialization */
	clock_interval_to_absolutetime_interval(sched_load_compute_interval_us, NSEC_PER_USEC, &sched_load_compute_interval_abs);
	os_atomic_init(&sched_load_compute_deadline, sched_load_compute_interval_abs);

	/*
	 * Compute conversion factor from usage to
	 * timesharing priorities with 5/8 ** n aging.
	 */
	abstime = (abstime * 5) / 3;
	for (shift = 0; abstime > BASEPRI_DEFAULT; ++shift) {
		abstime >>= 1;
	}
	sched_fixed_shift = shift;

	for (uint32_t i = 0; i < TH_BUCKET_MAX; i++) {
		sched_pri_shifts[i] = INT8_MAX;
	}

	sched_set_max_unsafe_rt_quanta(max_unsafe_rt_quanta);
	sched_set_max_unsafe_fixed_quanta(max_unsafe_fixed_quanta);

	max_poll_computation = ((uint64_t)max_poll_quanta) * std_quantum;
	thread_depress_time = 1 * std_quantum;
	default_timeshare_computation = std_quantum / 2;
	default_timeshare_constraint = std_quantum;

#if __arm64__
	perfcontrol_failsafe_starvation_threshold = (2 * sched_tick_interval);
#endif /* __arm64__ */

	if (nonurgent_preemption_timer_us) {
		clock_interval_to_absolutetime_interval(nonurgent_preemption_timer_us, NSEC_PER_USEC, &abstime);
		nonurgent_preemption_timer_abs = abstime;
	}
}
631
632#endif /* CONFIG_SCHED_TIMESHARE_CORE */
633
void
pset_rt_init(processor_set_t pset)
{
	for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) {
		int i = pri - BASEPRI_RTQUEUES;
		rt_queue_pri_t *rqi = &pset->rt_runq.rt_queue_pri[i];
		queue_init(&rqi->pri_queue);
		rqi->pri_count = 0;
		rqi->pri_earliest_deadline = RT_DEADLINE_NONE;
		rqi->pri_constraint = RT_CONSTRAINT_NONE;
	}
	os_atomic_init(&pset->rt_runq.count, 0);
	os_atomic_init(&pset->rt_runq.earliest_deadline, RT_DEADLINE_NONE);
	os_atomic_init(&pset->rt_runq.constraint, RT_CONSTRAINT_NONE);
	os_atomic_init(&pset->rt_runq.ed_index, NOPRI);
	memset(&pset->rt_runq.runq_stats, 0, sizeof pset->rt_runq.runq_stats);
}
651
/* epsilon for comparing RT deadlines */
int rt_deadline_epsilon_us = 100;

int
sched_get_rt_deadline_epsilon(void)
{
	return rt_deadline_epsilon_us;
}

void
sched_set_rt_deadline_epsilon(int new_epsilon_us)
{
	rt_deadline_epsilon_us = new_epsilon_us;

	uint64_t abstime;
	clock_interval_to_absolutetime_interval(rt_deadline_epsilon_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && ((rt_deadline_epsilon_us == 0) || (uint32_t)abstime != 0));
	rt_deadline_epsilon = (uint32_t)abstime;
}

static void
sched_realtime_timebase_init(void)
{
	uint64_t abstime;

	/* smallest rt computation (50 us) */
	clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	min_rt_quantum = (uint32_t)abstime;

	/* maximum rt computation (50 ms) */
	clock_interval_to_absolutetime_interval(
		50, 1000 * NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	max_rt_quantum = (uint32_t)abstime;

	/* constraint threshold for sending backup IPIs (4 ms) */
	clock_interval_to_absolutetime_interval(4, NSEC_PER_MSEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	rt_constraint_threshold = (uint32_t)abstime;

	/* epsilon for comparing deadlines */
	sched_set_rt_deadline_epsilon(rt_deadline_epsilon_us);
}
696
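/* Spill checking is a no-op here; the arguments are accepted for interface compatibility. */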
697void
698sched_check_spill(processor_set_t pset, thread_t thread)
699{
700 (void)pset;
701 (void)thread;
702
703 return;
704}
705
bool
sched_thread_should_yield(processor_t processor, thread_t thread)
{
	(void)thread;

	return !SCHED(processor_queue_empty)(processor) || rt_runq_count(processor->processor_set) > 0;
}

/* Default implementations of .steal_thread_enabled */
bool
sched_steal_thread_DISABLED(processor_set_t pset)
{
	(void)pset;
	return false;
}

bool
sched_steal_thread_enabled(processor_set_t pset)
{
	return bit_count(pset->node->pset_map) > 1;
}
727
728#if defined(CONFIG_SCHED_TIMESHARE_CORE)
729
730/*
731 * Set up values for timeshare
732 * loading factors.
733 */
734static void
735load_shift_init(void)
736{
737 int8_t k, *p = sched_load_shifts;
738 uint32_t i, j;
739
740 uint32_t sched_decay_penalty = 1;
741
	if (PE_parse_boot_argn("sched_decay_penalty", &sched_decay_penalty, sizeof(sched_decay_penalty))) {
		kprintf("Overriding scheduler decay penalty %u\n", sched_decay_penalty);
	}

	if (PE_parse_boot_argn("sched_decay_usage_age_factor", &sched_decay_usage_age_factor, sizeof(sched_decay_usage_age_factor))) {
		kprintf("Overriding scheduler decay usage age factor %u\n", sched_decay_usage_age_factor);
	}
749
750 if (sched_decay_penalty == 0) {
751 /*
752 * There is no penalty for timeshare threads for using too much
753 * CPU, so set all load shifts to INT8_MIN. Even under high load,
754 * sched_pri_shift will be >INT8_MAX, and there will be no
755 * penalty applied to threads (nor will sched_usage be updated per
756 * thread).
757 */
758 for (i = 0; i < NRQS; i++) {
759 sched_load_shifts[i] = INT8_MIN;
760 }
761
762 return;
763 }
764
765 *p++ = INT8_MIN; *p++ = 0;
766
767 /*
768 * For a given system load "i", the per-thread priority
769 * penalty per quantum of CPU usage is ~2^k priority
770 * levels. "sched_decay_penalty" can cause more
771 * array entries to be filled with smaller "k" values
772 */
773 for (i = 2, j = 1 << sched_decay_penalty, k = 1; i < NRQS; ++k) {
774 for (j <<= 1; (i < j) && (i < NRQS); ++i) {
775 *p++ = k;
776 }
777 }
778}
779
780static void
781preempt_pri_init(void)
782{
783 bitmap_t *p = sched_preempt_pri;
784
	for (int i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i) {
		bitmap_set(p, i);
	}

	for (int i = BASEPRI_PREEMPT; i <= MAXPRI; ++i) {
		bitmap_set(p, i);
	}
792}
793
794#endif /* CONFIG_SCHED_TIMESHARE_CORE */
795
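/* Panic if a timestamp appears to precede this CPU's last dispatch time. */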
796void
797check_monotonic_time(uint64_t ctime)
798{
799 processor_t processor = current_processor();
800 uint64_t last_dispatch = processor->last_dispatch;
801
802 if (last_dispatch > ctime) {
803 panic("Non-monotonic time: last_dispatch at 0x%llx, ctime 0x%llx",
804 last_dispatch, ctime);
805 }
806}
807
808
809/*
810 * Thread wait timer expiration.
811 * Runs in timer interrupt context with interrupts disabled.
812 */
813void
814thread_timer_expire(void *p0, __unused void *p1)
815{
816 thread_t thread = (thread_t)p0;
817
818 assert_thread_magic(thread);
819
820 assert(ml_get_interrupts_enabled() == FALSE);
821
822 thread_lock(thread);
823
824 if (thread->wait_timer_armed) {
825 thread->wait_timer_armed = false;
826 clear_wait_internal(thread, THREAD_TIMED_OUT);
827 /* clear_wait_internal may have dropped and retaken the thread lock */
828 }
829
830 thread->wait_timer_active--;
831
832 thread_unlock(thread);
833}
834
835/*
836 * thread_unblock:
837 *
838 * Unblock thread on wake up.
839 *
840 * Returns TRUE if the thread should now be placed on the runqueue.
841 *
842 * Thread must be locked.
843 *
844 * Called at splsched().
845 */
846boolean_t
847thread_unblock(
848 thread_t thread,
849 wait_result_t wresult)
850{
851 boolean_t ready_for_runq = FALSE;
852 thread_t cthread = current_thread();
853 uint32_t new_run_count;
854 int old_thread_state;
855
856 /*
857 * Set wait_result.
858 */
859 thread->wait_result = wresult;
860
861 /*
862 * Cancel pending wait timer.
863 */
864 if (thread->wait_timer_armed) {
		if (timer_call_cancel(thread->wait_timer)) {
866 thread->wait_timer_active--;
867 }
868 thread->wait_timer_armed = false;
869 }
870
871 boolean_t aticontext, pidle;
872 ml_get_power_state(&aticontext, &pidle);
873
874 /*
875 * Update scheduling state: not waiting,
876 * set running.
877 */
878 old_thread_state = thread->state;
879 thread->state = (old_thread_state | TH_RUN) &
880 ~(TH_WAIT | TH_UNINT | TH_WAIT_REPORT | TH_WAKING);
881
882 if ((old_thread_state & TH_RUN) == 0) {
883 uint64_t ctime = mach_approximate_time();
884
885 check_monotonic_time(ctime);
886
887 thread->last_made_runnable_time = thread->last_basepri_change_time = ctime;
		timer_start(&thread->runnable_timer, ctime);
889
890 ready_for_runq = TRUE;
891
892 if (old_thread_state & TH_WAIT_REPORT) {
893 (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
894 }
895
896 /* Update the runnable thread count */
897 new_run_count = SCHED(run_count_incr)(thread);
898
899#if CONFIG_SCHED_AUTO_JOIN
900 if (aticontext == FALSE && work_interval_should_propagate(cthread, thread)) {
			work_interval_auto_join_propagate(cthread, thread);
902 }
903#endif /*CONFIG_SCHED_AUTO_JOIN */
904
905 } else {
906 /*
907 * Either the thread is idling in place on another processor,
908 * or it hasn't finished context switching yet.
909 */
910 assert((thread->state & TH_IDLE) == 0);
911 /*
912 * The run count is only dropped after the context switch completes
913 * and the thread is still waiting, so we should not run_incr here
914 */
915 new_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
916 }
917
918 /*
919 * Calculate deadline for real-time threads.
920 */
921 if (thread->sched_mode == TH_MODE_REALTIME) {
922 uint64_t ctime = mach_absolute_time();
923 thread->realtime.deadline = thread->realtime.constraint + ctime;
924 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SET_RT_DEADLINE) | DBG_FUNC_NONE,
925 (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
926 }
927
928 /*
929 * Clear old quantum, fail-safe computation, etc.
930 */
931 thread->quantum_remaining = 0;
932 thread->computation_metered = 0;
933 thread->reason = AST_NONE;
934 thread->block_hint = kThreadWaitNone;
935
936 /* Obtain power-relevant interrupt and "platform-idle exit" statistics.
937 * We also account for "double hop" thread signaling via
938 * the thread callout infrastructure.
939 * DRK: consider removing the callout wakeup counters in the future
940 * they're present for verification at the moment.
941 */
942
943 if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) {
944 DTRACE_SCHED2(iwakeup, struct thread *, thread, struct proc *, current_proc());
945
946 uint64_t ttd = current_processor()->timer_call_ttd;
947
948 if (ttd) {
949 if (ttd <= timer_deadline_tracking_bin_1) {
950 thread->thread_timer_wakeups_bin_1++;
951 } else if (ttd <= timer_deadline_tracking_bin_2) {
952 thread->thread_timer_wakeups_bin_2++;
953 }
954 }
955
		ledger_credit_thread(thread, thread->t_ledger,
		    task_ledgers.interrupt_wakeups, 1);
		if (pidle) {
			ledger_credit_thread(thread, thread->t_ledger,
			    task_ledgers.platform_idle_wakeups, 1);
		}
	} else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) {
963 /* TODO: what about an interrupt that does a wake taken on a callout thread? */
964 if (cthread->callout_woken_from_icontext) {
			ledger_credit_thread(thread, thread->t_ledger,
			    task_ledgers.interrupt_wakeups, 1);
			thread->thread_callout_interrupt_wakeups++;

			if (cthread->callout_woken_from_platform_idle) {
				ledger_credit_thread(thread, thread->t_ledger,
				    task_ledgers.platform_idle_wakeups, 1);
972 thread->thread_callout_platform_idle_wakeups++;
973 }
974
975 cthread->callout_woke_thread = TRUE;
976 }
977 }
978
979 if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) {
980 thread->callout_woken_from_icontext = !!aticontext;
981 thread->callout_woken_from_platform_idle = !!pidle;
982 thread->callout_woke_thread = FALSE;
983 }
984
985#if KPERF
986 if (ready_for_runq) {
		kperf_make_runnable(thread, aticontext);
988 }
989#endif /* KPERF */
990
991 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
992 MACHDBG_CODE(DBG_MACH_SCHED, MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE,
993 (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result,
994 sched_run_buckets[TH_BUCKET_RUN], 0);
995
996 DTRACE_SCHED2(wakeup, struct thread *, thread, struct proc *, current_proc());
997
998 return ready_for_runq;
999}
1000
1001/*
1002 * Routine: thread_allowed_for_handoff
1003 * Purpose:
1004 * Check if the thread is allowed for handoff operation
1005 * Conditions:
1006 * thread lock held, IPC locks may be held.
1007 * TODO: In future, do not allow handoff if threads have different cluster
1008 * recommendations.
1009 */
1010boolean_t
1011thread_allowed_for_handoff(
1012 thread_t thread)
1013{
1014 thread_t self = current_thread();
1015
1016 if (allow_direct_handoff &&
1017 thread->sched_mode == TH_MODE_REALTIME &&
1018 self->sched_mode == TH_MODE_REALTIME) {
1019 return TRUE;
1020 }
1021
1022 return FALSE;
1023}
1024
1025/*
1026 * Routine: thread_go
1027 * Purpose:
1028 * Unblock and dispatch thread.
1029 * Conditions:
1030 * thread lock held, IPC locks may be held.
1031 * thread must have been waiting
1032 */
1033void
1034thread_go(
1035 thread_t thread,
1036 wait_result_t wresult,
1037 bool try_handoff)
1038{
1039 thread_t self = current_thread();
1040
1041 assert_thread_magic(thread);
1042
1043 assert(thread->at_safe_point == FALSE);
1044 assert(thread->wait_event == NO_EVENT64);
1045 assert(waitq_is_null(thread->waitq));
1046
1047 assert(!(thread->state & (TH_TERMINATE | TH_TERMINATE2)));
1048 assert(thread->state & TH_WAIT);
1049
1050 if (thread->started) {
1051 assert(thread->state & TH_WAKING);
1052 }
1053
1054 thread_lock_assert(thread, LCK_ASSERT_OWNED);
1055
1056 assert(ml_get_interrupts_enabled() == false);
1057
1058 if (thread_unblock(thread, wresult)) {
1059#if SCHED_TRACE_THREAD_WAKEUPS
1060 backtrace(&thread->thread_wakeup_bt[0],
1061 (sizeof(thread->thread_wakeup_bt) / sizeof(uintptr_t)), NULL,
1062 NULL);
1063#endif /* SCHED_TRACE_THREAD_WAKEUPS */
1064 if (try_handoff && thread_allowed_for_handoff(thread)) {
1065 thread_reference(thread);
1066 assert(self->handoff_thread == NULL);
1067 self->handoff_thread = thread;
1068 } else {
			thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
1070 }
1071 }
1072}
1073
1074/*
1075 * Routine: thread_mark_wait_locked
1076 * Purpose:
1077 * Mark a thread as waiting. If, given the circumstances,
1078 * it doesn't want to wait (i.e. already aborted), then
1079 * indicate that in the return value.
1080 * Conditions:
1081 * at splsched() and thread is locked.
1082 */
1083__private_extern__
1084wait_result_t
1085thread_mark_wait_locked(
1086 thread_t thread,
1087 wait_interrupt_t interruptible_orig)
1088{
1089 boolean_t at_safe_point;
1090 wait_interrupt_t interruptible = interruptible_orig;
1091
1092 if (thread->state & TH_IDLE) {
1093 panic("Invalid attempt to wait while running the idle thread");
1094 }
1095
1096 assert(!(thread->state & (TH_WAIT | TH_WAKING | TH_IDLE | TH_UNINT | TH_TERMINATE2 | TH_WAIT_REPORT)));
1097
1098 /*
1099 * The thread may have certain types of interrupts/aborts masked
1100 * off. Even if the wait location says these types of interrupts
1101 * are OK, we have to honor mask settings (outer-scoped code may
1102 * not be able to handle aborts at the moment).
1103 */
1104 interruptible &= TH_OPT_INTMASK;
1105 if (interruptible > (thread->options & TH_OPT_INTMASK)) {
1106 interruptible = thread->options & TH_OPT_INTMASK;
1107 }
1108
1109 at_safe_point = (interruptible == THREAD_ABORTSAFE);
1110
1111 if (interruptible == THREAD_UNINT ||
1112 !(thread->sched_flags & TH_SFLAG_ABORT) ||
1113 (!at_safe_point &&
1114 (thread->sched_flags & TH_SFLAG_ABORTSAFELY))) {
1115 if (!(thread->state & TH_TERMINATE)) {
1116 DTRACE_SCHED(sleep);
1117 }
1118
1119 int state_bits = TH_WAIT;
1120 if (!interruptible) {
1121 state_bits |= TH_UNINT;
1122 }
1123 if (thread->sched_call) {
1124 wait_interrupt_t mask = THREAD_WAIT_NOREPORT_USER;
			if (is_kerneltask(get_threadtask(thread))) {
1126 mask = THREAD_WAIT_NOREPORT_KERNEL;
1127 }
1128 if ((interruptible_orig & mask) == 0) {
1129 state_bits |= TH_WAIT_REPORT;
1130 }
1131 }
1132 thread->state |= state_bits;
1133 thread->at_safe_point = at_safe_point;
1134
1135 /* TODO: pass this through assert_wait instead, have
1136 * assert_wait just take a struct as an argument */
1137 assert(!thread->block_hint);
1138 thread->block_hint = thread->pending_block_hint;
1139 thread->pending_block_hint = kThreadWaitNone;
1140
1141 return thread->wait_result = THREAD_WAITING;
1142 } else {
1143 if (thread->sched_flags & TH_SFLAG_ABORTSAFELY) {
1144 thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;
1145 }
1146 }
1147 thread->pending_block_hint = kThreadWaitNone;
1148
1149 return thread->wait_result = THREAD_INTERRUPTED;
1150}
1151
1152/*
1153 * Routine: thread_interrupt_level
1154 * Purpose:
1155 * Set the maximum interruptible state for the
1156 * current thread. The effective value of any
1157 * interruptible flag passed into assert_wait
1158 * will never exceed this.
1159 *
1160 * Useful for code that must not be interrupted,
1161 * but which calls code that doesn't know that.
1162 * Returns:
1163 * The old interrupt level for the thread.
1164 */
1165__private_extern__
1166wait_interrupt_t
1167thread_interrupt_level(
1168 wait_interrupt_t new_level)
1169{
1170 thread_t thread = current_thread();
1171 wait_interrupt_t result = thread->options & TH_OPT_INTMASK;
1172
1173 thread->options = (thread->options & ~TH_OPT_INTMASK) | (new_level & TH_OPT_INTMASK);
1174
1175 return result;
1176}
1177
1178/*
1179 * assert_wait:
1180 *
1181 * Assert that the current thread is about to go to
1182 * sleep until the specified event occurs.
1183 */
1184wait_result_t
1185assert_wait(
1186 event_t event,
1187 wait_interrupt_t interruptible)
1188{
1189 if (__improbable(event == NO_EVENT)) {
1190 panic("%s() called with NO_EVENT", __func__);
1191 }
1192
1193 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1194 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1195 VM_KERNEL_UNSLIDE_OR_PERM(event), 0, 0, 0, 0);
1196
1197 struct waitq *waitq;
1198 waitq = global_eventq(event);
	return waitq_assert_wait64(waitq, CAST_EVENT64_T(event), interruptible, TIMEOUT_WAIT_FOREVER);
1200}
1201
1202/*
1203 * assert_wait_queue:
1204 *
1205 * Return the global waitq for the specified event
1206 */
1207struct waitq *
1208assert_wait_queue(
1209 event_t event)
1210{
1211 return global_eventq(event);
1212}
1213
1214wait_result_t
1215assert_wait_timeout(
1216 event_t event,
1217 wait_interrupt_t interruptible,
1218 uint32_t interval,
1219 uint32_t scale_factor)
1220{
1221 thread_t thread = current_thread();
1222 wait_result_t wresult;
1223 uint64_t deadline;
1224 spl_t s;
1225
1226 if (__improbable(event == NO_EVENT)) {
1227 panic("%s() called with NO_EVENT", __func__);
1228 }
1229
1230 struct waitq *waitq;
1231 waitq = global_eventq(event);
1232
1233 s = splsched();
	waitq_lock(waitq);
1235
	clock_interval_to_deadline(interval, scale_factor, &deadline);
1237
1238 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1239 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1240 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1241
	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1243 interruptible,
1244 TIMEOUT_URGENCY_SYS_NORMAL,
1245 deadline, TIMEOUT_NO_LEEWAY,
1246 thread);
1247
	waitq_unlock(waitq);
1249 splx(s);
1250 return wresult;
1251}
1252
1253wait_result_t
1254assert_wait_timeout_with_leeway(
1255 event_t event,
1256 wait_interrupt_t interruptible,
1257 wait_timeout_urgency_t urgency,
1258 uint32_t interval,
1259 uint32_t leeway,
1260 uint32_t scale_factor)
1261{
1262 thread_t thread = current_thread();
1263 wait_result_t wresult;
1264 uint64_t deadline;
1265 uint64_t abstime;
1266 uint64_t slop;
1267 uint64_t now;
1268 spl_t s;
1269
1270 if (__improbable(event == NO_EVENT)) {
1271 panic("%s() called with NO_EVENT", __func__);
1272 }
1273
1274 now = mach_absolute_time();
	clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime);
1276 deadline = now + abstime;
1277
	clock_interval_to_absolutetime_interval(leeway, scale_factor, &slop);
1279
1280 struct waitq *waitq;
1281 waitq = global_eventq(event);
1282
1283 s = splsched();
	waitq_lock(waitq);
1285
1286 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1287 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1288 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1289
	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	    interruptible,
	    urgency, deadline, slop,
1293 thread);
1294
	waitq_unlock(waitq);
1296 splx(s);
1297 return wresult;
1298}
1299
1300wait_result_t
1301assert_wait_deadline(
1302 event_t event,
1303 wait_interrupt_t interruptible,
1304 uint64_t deadline)
1305{
1306 thread_t thread = current_thread();
1307 wait_result_t wresult;
1308 spl_t s;
1309
1310 if (__improbable(event == NO_EVENT)) {
1311 panic("%s() called with NO_EVENT", __func__);
1312 }
1313
1314 struct waitq *waitq;
1315 waitq = global_eventq(event);
1316
1317 s = splsched();
	waitq_lock(waitq);
1319
1320 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1321 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1322 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1323
	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1325 interruptible,
1326 TIMEOUT_URGENCY_SYS_NORMAL, deadline,
1327 TIMEOUT_NO_LEEWAY, thread);
	waitq_unlock(waitq);
1329 splx(s);
1330 return wresult;
1331}
1332
1333wait_result_t
1334assert_wait_deadline_with_leeway(
1335 event_t event,
1336 wait_interrupt_t interruptible,
1337 wait_timeout_urgency_t urgency,
1338 uint64_t deadline,
1339 uint64_t leeway)
1340{
1341 thread_t thread = current_thread();
1342 wait_result_t wresult;
1343 spl_t s;
1344
1345 if (__improbable(event == NO_EVENT)) {
1346 panic("%s() called with NO_EVENT", __func__);
1347 }
1348
1349 struct waitq *waitq;
1350 waitq = global_eventq(event);
1351
1352 s = splsched();
	waitq_lock(waitq);
1354
1355 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1356 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT) | DBG_FUNC_NONE,
1357 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1358
	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1360 interruptible,
1361 urgency, deadline, leeway,
1362 thread);
	waitq_unlock(waitq);
1364 splx(s);
1365 return wresult;
1366}
1367
1368void
1369sched_cond_init(
1370 sched_cond_atomic_t *cond)
1371{
1372 os_atomic_init(cond, SCHED_COND_INIT);
1373}
1374
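/*
 * Assert a wait on the condition and clear the ACTIVE bit so that a
 * subsequent sched_cond_signal() knows it must wake this thread.  If a
 * wakeup was already posted, undo the wait assertion, acknowledge the
 * wakeup, and return immediately; otherwise block (possibly resuming in
 * 'continuation').
 */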
1375wait_result_t
1376sched_cond_wait_parameter(
1377 sched_cond_atomic_t *cond,
1378 wait_interrupt_t interruptible,
1379 thread_continue_t continuation,
1380 void *parameter)
1381{
	assert_wait((event_t) cond, interruptible);
1383 /* clear active bit to indicate future wakeups will have to unblock this thread */
1384 sched_cond_t new_state = (sched_cond_t) os_atomic_andnot(cond, SCHED_COND_ACTIVE, relaxed);
1385 if (__improbable(new_state & SCHED_COND_WAKEUP)) {
1386 /* a wakeup has been issued; undo wait assertion, ack the wakeup, and return */
1387 thread_t thread = current_thread();
1388 clear_wait(thread, THREAD_AWAKENED);
1389 sched_cond_ack(cond);
1390 return THREAD_AWAKENED;
1391 }
1392 return thread_block_parameter(continuation, parameter);
1393}
1394
1395wait_result_t
1396sched_cond_wait(
1397 sched_cond_atomic_t *cond,
1398 wait_interrupt_t interruptible,
1399 thread_continue_t continuation)
1400{
1401 return sched_cond_wait_parameter(cond, interruptible, continuation, NULL);
1402}
1403
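/*
 * Acknowledge a posted wakeup: atomically set ACTIVE and clear WAKEUP.
 * The acquire ordering pairs with the release in sched_cond_signal().
 */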
1404sched_cond_t
1405sched_cond_ack(
1406 sched_cond_atomic_t *cond)
1407{
1408 sched_cond_t new_cond = (sched_cond_t) os_atomic_xor(cond, SCHED_COND_ACTIVE | SCHED_COND_WAKEUP, acquire);
1409 assert(new_cond & SCHED_COND_ACTIVE);
1410 return new_cond;
1411}
1412
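/*
 * Post a wakeup for 'thread'.  The wakeup is delivered through the waitq
 * only if this is the first pending wakeup and the waiter has already
 * gone inactive (i.e. is blocked or about to block).
 */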
1413kern_return_t
1414sched_cond_signal(
1415 sched_cond_atomic_t *cond,
1416 thread_t thread)
1417{
1418 disable_preemption();
1419 sched_cond_t old_cond = (sched_cond_t) os_atomic_or_orig(cond, SCHED_COND_WAKEUP, release);
1420 if (!(old_cond & (SCHED_COND_WAKEUP | SCHED_COND_ACTIVE))) {
1421 /* this was the first wakeup to be issued AND the thread was inactive */
		thread_wakeup_thread((event_t) cond, thread);
1423 }
1424 enable_preemption();
1425 return KERN_SUCCESS;
1426}
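
/*
 * Illustrative usage of the sched_cond primitives (a hypothetical worker
 * loop, not a client in this file): the woken thread calls sched_cond_ack()
 * to consume the wakeup, drains its pending work, and then calls
 * sched_cond_wait() to block until the next sched_cond_signal().
 */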
1427
1428/*
1429 * thread_isoncpu:
1430 *
1431 * Return TRUE if a thread is running on a processor such that an AST
1432 * is needed to pull it out of userspace execution, or if executing in
1433 * the kernel, bring to a context switch boundary that would cause
1434 * thread state to be serialized in the thread PCB.
1435 *
1436 * Thread locked, returns the same way. While locked, fields
1437 * like "state" cannot change. "runq" can change only from set to unset.
1438 */
1439static inline boolean_t
1440thread_isoncpu(thread_t thread)
1441{
1442 /* Not running or runnable */
1443 if (!(thread->state & TH_RUN)) {
1444 return FALSE;
1445 }
1446
1447 /* Waiting on a runqueue, not currently running */
1448 /* TODO: This is invalid - it can get dequeued without thread lock, but not context switched. */
1449 if (thread_get_runq(thread) != PROCESSOR_NULL) {
1450 return FALSE;
1451 }
1452
1453 /*
1454 * Thread does not have a stack yet
1455 * It could be on the stack alloc queue or preparing to be invoked
1456 */
1457 if (!thread->kernel_stack) {
1458 return FALSE;
1459 }
1460
1461 /*
1462 * Thread must be running on a processor, or
1463 * about to run, or just did run. In all these
1464 * cases, an AST to the processor is needed
1465 * to guarantee that the thread is kicked out
1466 * of userspace and the processor has
1467 * context switched (and saved register state).
1468 */
1469 return TRUE;
1470}
1471
1472/*
1473 * thread_stop:
1474 *
1475 * Force a preemption point for a thread and wait
1476 * for it to stop running on a CPU. If a stronger
1477 * guarantee is requested, wait until no longer
1478 * runnable. Arbitrates access among
1479 * multiple stop requests. (released by unstop)
1480 *
1481 * The thread must enter a wait state and stop via a
1482 * separate means.
1483 *
1484 * Returns FALSE if interrupted.
1485 */
1486boolean_t
1487thread_stop(
1488 thread_t thread,
1489 boolean_t until_not_runnable)
1490{
1491 wait_result_t wresult;
1492 spl_t s = splsched();
1493 boolean_t oncpu;
1494
1495 wake_lock(thread);
1496 thread_lock(thread);
1497
1498 while (thread->state & TH_SUSP) {
1499 thread->wake_active = TRUE;
1500 thread_unlock(thread);
1501
		wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1503 wake_unlock(thread);
1504 splx(s);
1505
1506 if (wresult == THREAD_WAITING) {
1507 wresult = thread_block(THREAD_CONTINUE_NULL);
1508 }
1509
1510 if (wresult != THREAD_AWAKENED) {
1511 return FALSE;
1512 }
1513
1514 s = splsched();
1515 wake_lock(thread);
1516 thread_lock(thread);
1517 }
1518
1519 thread->state |= TH_SUSP;
1520
1521 while ((oncpu = thread_isoncpu(thread)) ||
1522 (until_not_runnable && (thread->state & TH_RUN))) {
1523 processor_t processor;
1524
1525 if (oncpu) {
1526 assert(thread->state & TH_RUN);
1527 processor = thread->chosen_processor;
1528 cause_ast_check(processor);
1529 }
1530
1531 thread->wake_active = TRUE;
1532 thread_unlock(thread);
1533
		wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1535 wake_unlock(thread);
1536 splx(s);
1537
1538 if (wresult == THREAD_WAITING) {
1539 wresult = thread_block(THREAD_CONTINUE_NULL);
1540 }
1541
1542 if (wresult != THREAD_AWAKENED) {
1543 thread_unstop(thread);
1544 return FALSE;
1545 }
1546
1547 s = splsched();
1548 wake_lock(thread);
1549 thread_lock(thread);
1550 }
1551
1552 thread_unlock(thread);
1553 wake_unlock(thread);
1554 splx(s);
1555
1556 /*
1557 * We return with the thread unlocked. To prevent it from
1558 * transitioning to a runnable state (or from TH_RUN to
1559 * being on the CPU), the caller must ensure the thread
1560 * is stopped via an external means (such as an AST)
1561 */
1562
1563 return TRUE;
1564}
1565
1566/*
1567 * thread_unstop:
1568 *
1569 * Release a previous stop request and set
1570 * the thread running if appropriate.
1571 *
1572 * Use only after a successful stop operation.
1573 */
1574void
1575thread_unstop(
1576 thread_t thread)
1577{
1578 spl_t s = splsched();
1579
1580 wake_lock(thread);
1581 thread_lock(thread);
1582
1583 assert((thread->state & (TH_RUN | TH_WAIT | TH_SUSP)) != TH_SUSP);
1584
1585 if (thread->state & TH_SUSP) {
1586 thread->state &= ~TH_SUSP;
1587
1588 if (thread->wake_active) {
1589 thread->wake_active = FALSE;
1590 thread_unlock(thread);
1591
1592 thread_wakeup(&thread->wake_active);
1593 wake_unlock(thread);
1594 splx(s);
1595
1596 return;
1597 }
1598 }
1599
1600 thread_unlock(thread);
1601 wake_unlock(thread);
1602 splx(s);
1603}
1604
1605/*
1606 * thread_wait:
1607 *
1608 * Wait for a thread to stop running. (non-interruptible)
1609 *
1610 */
1611void
1612thread_wait(
1613 thread_t thread,
1614 boolean_t until_not_runnable)
1615{
1616 wait_result_t wresult;
1617 boolean_t oncpu;
1618 processor_t processor;
1619 spl_t s = splsched();
1620
1621 wake_lock(thread);
1622 thread_lock(thread);
1623
1624 /*
1625 * Wait until not running on a CPU. If stronger requirement
1626 * desired, wait until not runnable. Assumption: if thread is
1627 * on CPU, then TH_RUN is set, so we're not waiting in any case
1628 * where the original, pure "TH_RUN" check would have let us
1629 * finish.
1630 */
1631 while ((oncpu = thread_isoncpu(thread)) ||
1632 (until_not_runnable && (thread->state & TH_RUN))) {
1633 if (oncpu) {
1634 assert(thread->state & TH_RUN);
1635 processor = thread->chosen_processor;
1636 cause_ast_check(processor);
1637 }
1638
1639 thread->wake_active = TRUE;
1640 thread_unlock(thread);
1641
		wresult = assert_wait(&thread->wake_active, THREAD_UNINT);
1643 wake_unlock(thread);
1644 splx(s);
1645
1646 if (wresult == THREAD_WAITING) {
1647 thread_block(THREAD_CONTINUE_NULL);
1648 }
1649
1650 s = splsched();
1651 wake_lock(thread);
1652 thread_lock(thread);
1653 }
1654
1655 thread_unlock(thread);
1656 wake_unlock(thread);
1657 splx(s);
1658}
1659
1660/*
1661 * Routine: clear_wait_internal
1662 *
1663 * Clear the wait condition for the specified thread.
1664 * Start the thread executing if that is appropriate.
1665 * Arguments:
1666 * thread thread to awaken
1667 * result Wakeup result the thread should see
1668 * Conditions:
1669 * At splsched
1670 * the thread is locked.
1671 * Returns:
1672 * KERN_SUCCESS thread was rousted out a wait
1673 * KERN_FAILURE thread was waiting but could not be rousted
1674 * KERN_NOT_WAITING thread was not waiting
1675 */
1676__private_extern__ kern_return_t
1677clear_wait_internal(
1678 thread_t thread,
1679 wait_result_t wresult)
1680{
1681 waitq_t waitq = thread->waitq;
1682
1683 if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT)) {
1684 return KERN_FAILURE;
1685 }
1686
1687 /*
1688 * Check that the thread is waiting and not waking, as a waking thread
1689 * has already cleared its waitq, and is destined to be go'ed, don't
1690 * need to do it again.
1691 */
1692 if ((thread->state & (TH_WAIT | TH_TERMINATE | TH_WAKING)) != TH_WAIT) {
1693 assert(waitq_is_null(thread->waitq));
1694 return KERN_NOT_WAITING;
1695 }
1696
1697 /* may drop and retake the thread lock */
	if (!waitq_is_null(waitq) && !waitq_pull_thread_locked(waitq, thread)) {
1699 return KERN_NOT_WAITING;
1700 }
1701
1702 thread_go(thread, wresult, /* handoff */ false);
1703
1704 return KERN_SUCCESS;
1705}
1706
1707
1708/*
1709 * clear_wait:
1710 *
1711 * Clear the wait condition for the specified thread. Start the thread
1712 * executing if that is appropriate.
1713 *
1714 * parameters:
1715 * thread thread to awaken
1716 * result Wakeup result the thread should see
1717 */
1718kern_return_t
1719clear_wait(
1720 thread_t thread,
1721 wait_result_t result)
1722{
1723 kern_return_t ret;
1724 spl_t s;
1725
1726 s = splsched();
1727 thread_lock(thread);
1728
	ret = clear_wait_internal(thread, result);
1730
1731 if (thread == current_thread()) {
1732 /*
1733 * The thread must be ready to wait again immediately
1734 * after clearing its own wait.
1735 */
1736 assert((thread->state & TH_WAKING) == 0);
1737 }
1738
1739 thread_unlock(thread);
1740 splx(s);
1741 return ret;
1742}
1743
1744
1745/*
1746 * thread_wakeup_prim:
1747 *
1748 * Common routine for thread_wakeup, thread_wakeup_with_result,
1749 * and thread_wakeup_one.
1750 *
1751 */
1752kern_return_t
1753thread_wakeup_prim(
1754 event_t event,
1755 boolean_t one_thread,
1756 wait_result_t result)
1757{
1758 if (__improbable(event == NO_EVENT)) {
1759 panic("%s() called with NO_EVENT", __func__);
1760 }
1761
1762 struct waitq *wq = global_eventq(event);
1763
	if (one_thread) {
		return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), result, WAITQ_WAKEUP_DEFAULT);
	} else {
		return waitq_wakeup64_all(wq, CAST_EVENT64_T(event), result, WAITQ_WAKEUP_DEFAULT);
	}
1769}
1770
1771/*
1772 * Wakeup a specified thread if and only if it's waiting for this event
1773 */
1774kern_return_t
1775thread_wakeup_thread(
1776 event_t event,
1777 thread_t thread)
1778{
1779 if (__improbable(event == NO_EVENT)) {
1780 panic("%s() called with NO_EVENT", __func__);
1781 }
1782
1783 if (__improbable(thread == THREAD_NULL)) {
1784 panic("%s() called with THREAD_NULL", __func__);
1785 }
1786
1787 struct waitq *wq = global_eventq(event);
1788
	return waitq_wakeup64_thread(wq, CAST_EVENT64_T(event), thread, THREAD_AWAKENED);
1790}
1791
1792/*
1793 * Wakeup a thread waiting on an event and promote it to a priority.
1794 *
1795 * Requires woken thread to un-promote itself when done.
1796 */
1797kern_return_t
1798thread_wakeup_one_with_pri(
1799 event_t event,
1800 int priority)
1801{
1802 if (__improbable(event == NO_EVENT)) {
1803 panic("%s() called with NO_EVENT", __func__);
1804 }
1805
1806 struct waitq *wq = global_eventq(event);
1807
	return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
1809}
1810
1811/*
1812 * Wakeup a thread waiting on an event,
1813 * promote it to a priority,
1814 * and return a reference to the woken thread.
1815 *
1816 * Requires woken thread to un-promote itself when done.
1817 */
1818thread_t
1819thread_wakeup_identify(event_t event,
1820 int priority)
1821{
1822 if (__improbable(event == NO_EVENT)) {
1823 panic("%s() called with NO_EVENT", __func__);
1824 }
1825
1826 struct waitq *wq = global_eventq(event);
1827
	return waitq_wakeup64_identify(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
1829}
1830
1831/*
1832 * thread_bind:
1833 *
1834 * Force the current thread to execute on the specified processor.
1835 * Takes effect after the next thread_block().
1836 *
1837 * Returns the previous binding. PROCESSOR_NULL means
1838 * not bound.
1839 *
1840 * XXX - DO NOT export this to users - XXX
1841 */
1842processor_t
1843thread_bind(
1844 processor_t processor)
1845{
1846 thread_t self = current_thread();
1847 processor_t prev;
1848 spl_t s;
1849
1850 s = splsched();
1851 thread_lock(self);
1852
	prev = thread_bind_internal(self, processor);
1854
1855 thread_unlock(self);
1856 splx(s);
1857
1858 return prev;
1859}
1860
1861void
1862thread_bind_during_wakeup(thread_t thread, processor_t processor)
1863{
1864 assert(!ml_get_interrupts_enabled());
1865 assert((thread->state & (TH_WAIT | TH_WAKING)) == (TH_WAIT | TH_WAKING));
1866#if MACH_ASSERT
1867 thread_lock_assert(thread, LCK_ASSERT_OWNED);
1868#endif
1869
1870 if (thread->bound_processor != processor) {
1871 thread_bind_internal(thread, processor);
1872 }
1873}
1874
1875void
1876thread_unbind_after_queue_shutdown(
1877 thread_t thread,
1878 processor_t processor __assert_only)
1879{
1880 assert(!ml_get_interrupts_enabled());
1881
1882 thread_lock(thread);
1883
1884 if (thread->bound_processor) {
1885 bool removed;
1886
1887 assert(thread->bound_processor == processor);
1888
1889 removed = thread_run_queue_remove(thread);
1890 /*
1891 * we can always unbind even if we didn't really remove the
1892 * thread from the runqueue
1893 */
1894 thread_bind_internal(thread, PROCESSOR_NULL);
1895 if (removed) {
			thread_run_queue_reinsert(thread, SCHED_TAILQ);
1897 }
1898 }
1899
1900 thread_unlock(thread);
1901}
1902
1903/*
1904 * thread_bind_internal:
1905 *
1906 * If the specified thread is not the current thread, and it is currently
1907 * running on another CPU, a remote AST must be sent to that CPU to cause
1908 * the thread to migrate to its bound processor. Otherwise, the migration
1909 * will occur at the next quantum expiration or blocking point.
1910 *
 * When the thread is the current thread, an explicit thread_block() should
1912 * be used to force the current processor to context switch away and
1913 * let the thread migrate to the bound processor.
1914 *
1915 * Thread must be locked, and at splsched.
1916 */
1917
1918static processor_t
1919thread_bind_internal(
1920 thread_t thread,
1921 processor_t processor)
1922{
1923 processor_t prev;
1924
1925 /* <rdar://problem/15102234> */
1926 assert(thread->sched_pri < BASEPRI_RTQUEUES);
1927 /* A thread can't be bound if it's sitting on a (potentially incorrect) runqueue */
1928 thread_assert_runq_null(thread);
1929
1930 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_THREAD_BIND),
1931 thread_tid(thread), processor ? processor->cpu_id : ~0ul, 0, 0, 0);
1932
1933 prev = thread->bound_processor;
1934 thread->bound_processor = processor;
1935
1936 return prev;
1937}
1938
1939/*
1940 * thread_vm_bind_group_add:
1941 *
1942 * The "VM bind group" is a special mechanism to mark a collection
1943 * of threads from the VM subsystem that, in general, should be scheduled
1944 * with only one CPU of parallelism. To accomplish this, we initially
1945 * bind all the threads to the master processor, which has the effect
1946 * that only one of the threads in the group can execute at once, including
1947 * preempting lower-priority threads in the group. Future
1948 * implementations may use more dynamic mechanisms to prevent the collection
1949 * of VM threads from using more CPU time than desired.
1950 *
1951 * The current implementation can result in priority inversions where
1952 * compute-bound priority 95 or realtime threads that happen to have
1953 * landed on the master processor prevent the VM threads from running.
1954 * When this situation is detected, we unbind the threads for one
1955 * scheduler tick to allow the scheduler to run the threads on
1956 * additional CPUs, before restoring the binding (assuming high latency
1957 * is no longer a problem).
1958 */
1959
1960/*
1961 * The current max is provisioned for:
1962 * vm_compressor_swap_trigger_thread (92)
1963 * 2 x vm_pageout_iothread_internal (92) when vm_restricted_to_single_processor==TRUE
1964 * vm_pageout_continue (92)
1965 * memorystatus_thread (95)
1966 */
1967#define MAX_VM_BIND_GROUP_COUNT (5)
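/*
 * The provisioning list above totals 1 + 2 + 1 + 1 = 5 threads, which is
 * why MAX_VM_BIND_GROUP_COUNT is 5.
 */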
1968decl_simple_lock_data(static, sched_vm_group_list_lock);
1969static thread_t sched_vm_group_thread_list[MAX_VM_BIND_GROUP_COUNT];
1970static int sched_vm_group_thread_count;
1971static boolean_t sched_vm_group_temporarily_unbound = FALSE;
1972
1973void
1974thread_vm_bind_group_add(void)
1975{
1976 thread_t self = current_thread();
1977
1978 thread_reference(self);
1979 self->options |= TH_OPT_SCHED_VM_GROUP;
1980
1981 simple_lock(&sched_vm_group_list_lock, LCK_GRP_NULL);
1982 assert(sched_vm_group_thread_count < MAX_VM_BIND_GROUP_COUNT);
1983 sched_vm_group_thread_list[sched_vm_group_thread_count++] = self;
1984 simple_unlock(&sched_vm_group_list_lock);
1985
1986 thread_bind(master_processor);
1987
1988 /* Switch to bound processor if not already there */
1989 thread_block(THREAD_CONTINUE_NULL);
1990}
1991
1992static void
1993sched_vm_group_maintenance(void)
1994{
1995 uint64_t ctime = mach_absolute_time();
1996 uint64_t longtime = ctime - sched_tick_interval;
1997 int i;
1998 spl_t s;
1999 boolean_t high_latency_observed = FALSE;
2000 boolean_t runnable_and_not_on_runq_observed = FALSE;
2001 boolean_t bind_target_changed = FALSE;
2002 processor_t bind_target = PROCESSOR_NULL;
2003
2004 /* Make sure nobody attempts to add new threads while we are enumerating them */
2005 simple_lock(&sched_vm_group_list_lock, LCK_GRP_NULL);
2006
2007 s = splsched();
2008
2009 for (i = 0; i < sched_vm_group_thread_count; i++) {
2010 thread_t thread = sched_vm_group_thread_list[i];
2011 assert(thread != THREAD_NULL);
2012 thread_lock(thread);
2013 if ((thread->state & (TH_RUN | TH_WAIT)) == TH_RUN) {
2014 if (thread_get_runq(thread) != PROCESSOR_NULL && thread->last_made_runnable_time < longtime) {
2015 high_latency_observed = TRUE;
2016 } else if (thread_get_runq(thread) == PROCESSOR_NULL) {
2017 /* There are some cases where a thread may be transitioning that also fall into this case */
2018 runnable_and_not_on_runq_observed = TRUE;
2019 }
2020 }
2021 thread_unlock(thread);
2022
2023 if (high_latency_observed && runnable_and_not_on_runq_observed) {
2024 /* All the things we are looking for are true, stop looking */
2025 break;
2026 }
2027 }
2028
2029 splx(s);
2030
2031 if (sched_vm_group_temporarily_unbound) {
2032 /* If we turned off binding, make sure everything is OK before rebinding */
2033 if (!high_latency_observed) {
2034 /* rebind */
2035 bind_target_changed = TRUE;
2036 bind_target = master_processor;
2037 sched_vm_group_temporarily_unbound = FALSE; /* might be reset to TRUE if change cannot be completed */
2038 }
2039 } else {
2040 /*
2041 * Check if we're in a bad state, which is defined by high
2042 * latency with no core currently executing a thread. If a
2043 * single thread is making progress on a CPU, that means the
2044 * binding concept to reduce parallelism is working as
2045 * designed.
2046 */
2047 if (high_latency_observed && !runnable_and_not_on_runq_observed) {
2048 /* unbind */
2049 bind_target_changed = TRUE;
2050 bind_target = PROCESSOR_NULL;
2051 sched_vm_group_temporarily_unbound = TRUE;
2052 }
2053 }
2054
2055 if (bind_target_changed) {
2056 s = splsched();
2057 for (i = 0; i < sched_vm_group_thread_count; i++) {
2058 thread_t thread = sched_vm_group_thread_list[i];
2059 boolean_t removed;
2060 assert(thread != THREAD_NULL);
2061
2062 thread_lock(thread);
2063 removed = thread_run_queue_remove(thread);
2064 if (removed || ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT)) {
2065 thread_bind_internal(thread, bind_target);
2066 } else {
2067 /*
2068 * Thread was in the middle of being context-switched-to,
2069 * or was in the process of blocking. To avoid switching the bind
2070 * state out mid-flight, defer the change if possible.
2071 */
2072 if (bind_target == PROCESSOR_NULL) {
2073 thread_bind_internal(thread, bind_target);
2074 } else {
2075 sched_vm_group_temporarily_unbound = TRUE; /* next pass will try again */
2076 }
2077 }
2078
2079 if (removed) {
2080 thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
2081 }
2082 thread_unlock(thread);
2083 }
2084 splx(s);
2085 }
2086
2087 simple_unlock(&sched_vm_group_list_lock);
2088}
2089
2090#if defined(__x86_64__)
2091#define SCHED_AVOID_CPU0 1
2092#else
2093#define SCHED_AVOID_CPU0 0
2094#endif
2095
2096int sched_allow_rt_smt = 1;
2097int sched_avoid_cpu0 = SCHED_AVOID_CPU0;
2098int sched_allow_rt_steal = 1;
2099int sched_backup_cpu_timeout_count = 5; /* The maximum number of 10us delays to wait before using a backup cpu */
2100
2101int sched_rt_n_backup_processors = SCHED_DEFAULT_BACKUP_PROCESSORS;
2102
2103int
2104sched_get_rt_n_backup_processors(void)
2105{
2106 return sched_rt_n_backup_processors;
2107}
2108
2109void
2110sched_set_rt_n_backup_processors(int n)
2111{
2112 if (n < 0) {
2113 n = 0;
2114 } else if (n > SCHED_MAX_BACKUP_PROCESSORS) {
2115 n = SCHED_MAX_BACKUP_PROCESSORS;
2116 }
2117
2118 sched_rt_n_backup_processors = n;
2119}
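
/*
 * Example of the clamping above (values are illustrative):
 *
 *	sched_set_rt_n_backup_processors(-3);      -> stored as 0
 *	sched_set_rt_n_backup_processors(INT_MAX); -> stored as SCHED_MAX_BACKUP_PROCESSORS
 *	sched_set_rt_n_backup_processors(2);       -> stored as 2
 */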
2120
2121int sched_rt_runq_strict_priority = false;
2122
2123inline static processor_set_t
2124change_locked_pset(processor_set_t current_pset, processor_set_t new_pset)
2125{
2126 if (current_pset != new_pset) {
2127 pset_unlock(current_pset);
2128 pset_lock(new_pset);
2129 }
2130
2131 return new_pset;
2132}
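
/*
 * Typical usage of change_locked_pset(): the caller holds current_pset locked
 * and wants to continue holding new_pset locked instead, e.g.
 *
 *	pset = change_locked_pset(pset, nset);
 *
 * When the psets differ, the old lock is dropped before the new one is taken,
 * so state observed under the old lock must not be relied upon afterwards.
 */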
2133
2134/*
2135 * Invoked prior to idle entry to determine if, on SMT capable processors, an SMT
2136 * rebalancing opportunity exists when a core is (instantaneously) idle, but
2137 * other SMT-capable cores may be over-committed. TODO: some possible negatives:
2138 * - IPI thrash if this core does not remain idle following the load balancing ASTs
2139 * - Idle "thrash", when IPI issue is followed by idle entry/core power down
2140 * followed by a wakeup shortly thereafter.
2141 */
2142
2143#if (DEVELOPMENT || DEBUG)
2144int sched_smt_balance = 1;
2145#endif
2146
2147/* Invoked with pset locked, returns with pset unlocked */
2148bool
2149sched_SMT_balance(processor_t cprocessor, processor_set_t cpset)
2150{
2151 processor_t ast_processor = NULL;
2152
2153#if (DEVELOPMENT || DEBUG)
2154 if (__improbable(sched_smt_balance == 0)) {
2155 goto smt_balance_exit;
2156 }
2157#endif
2158
2159 assert(cprocessor == current_processor());
2160 if (cprocessor->is_SMT == FALSE) {
2161 goto smt_balance_exit;
2162 }
2163
2164 processor_t sib_processor = cprocessor->processor_secondary ? cprocessor->processor_secondary : cprocessor->processor_primary;
2165
2166 /* Determine if both this processor and its sibling are idle,
2167 * indicating an SMT rebalancing opportunity.
2168 */
2169 if (sib_processor->state != PROCESSOR_IDLE) {
2170 goto smt_balance_exit;
2171 }
2172
2173 processor_t sprocessor;
2174
2175 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
2176 uint64_t running_secondary_map = (cpset->cpu_state_map[PROCESSOR_RUNNING] &
2177 ~cpset->primary_map);
2178 for (int cpuid = lsb_first(running_secondary_map); cpuid >= 0; cpuid = lsb_next(running_secondary_map, cpuid)) {
2179 sprocessor = processor_array[cpuid];
2180 if ((sprocessor->processor_primary->state == PROCESSOR_RUNNING) &&
2181 (sprocessor->current_pri < BASEPRI_RTQUEUES)) {
2182 ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
2183 if (ipi_type != SCHED_IPI_NONE) {
2184 assert(sprocessor != cprocessor);
2185 ast_processor = sprocessor;
2186 break;
2187 }
2188 }
2189 }
2190
2191smt_balance_exit:
2192 pset_unlock(cpset);
2193
2194 if (ast_processor) {
2195 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_SMT_BALANCE), ast_processor->cpu_id, ast_processor->state, ast_processor->processor_primary->state, 0, 0);
2196 sched_ipi_perform(ast_processor, ipi_type);
2197 }
2198 return false;
2199}
2200
2201static cpumap_t
2202pset_available_cpumap(processor_set_t pset)
2203{
2204 return pset->cpu_available_map & pset->recommended_bitmask;
2205}
2206
2207int
2208pset_available_cpu_count(processor_set_t pset)
2209{
2210 return bit_count(pset_available_cpumap(pset));
2211}
2212
2213bool
2214pset_is_recommended(processor_set_t pset)
2215{
2216 if (!pset) {
2217 return false;
2218 }
2219 return pset_available_cpu_count(pset) > 0;
2220}
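
/*
 * Worked example for the helpers above (bit values are illustrative): with
 * cpu_available_map == 0b1111 and recommended_bitmask == 0b0111,
 * pset_available_cpumap() returns 0b0111, pset_available_cpu_count() returns 3,
 * and pset_is_recommended() returns true.
 */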
2221
2222static cpumap_t
2223pset_available_but_not_running_cpumap(processor_set_t pset)
2224{
2225 return (pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
2226 pset->recommended_bitmask;
2227}
2228
2229bool
2230pset_has_stealable_threads(processor_set_t pset)
2231{
2232 pset_assert_locked(pset);
2233
2234 cpumap_t avail_map = pset_available_but_not_running_cpumap(pset);
2235 /*
2236 * Secondary CPUs never steal, so allow stealing of threads if there are more threads than
2237 * available primary CPUs
2238 */
2239 avail_map &= pset->primary_map;
2240
2241 return (pset->pset_runq.count > 0) && ((pset->pset_runq.count + rt_runq_count(pset)) > bit_count(avail_map));
2242}
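
/*
 * Worked example (illustrative numbers): with 2 idle or dispatching recommended
 * primaries, pset_runq.count == 3 and rt_runq_count(pset) == 1, the check above
 * evaluates as (3 > 0) && ((3 + 1) > 2), so the pset reports stealable threads.
 */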
2243
2244static cpumap_t
2245pset_available_but_not_running_rt_threads_cpumap(processor_set_t pset)
2246{
2247 cpumap_t avail_map = pset_available_cpumap(pset);
2248 if (!sched_allow_rt_smt) {
2249 /*
2250 * Secondary CPUs are not allowed to run RT threads, so
2251 * only primary CPUs should be included
2252 */
2253 avail_map &= pset->primary_map;
2254 }
2255
2256 return avail_map & ~pset->realtime_map;
2257}
2258
2259static bool
2260pset_needs_a_followup_IPI(processor_set_t pset)
2261{
2262 int nbackup_cpus = 0;
2263
2264 if (rt_runq_is_low_latency(pset)) {
2265 nbackup_cpus = sched_rt_n_backup_processors;
2266 }
2267
2268 int rt_rq_count = rt_runq_count(pset);
2269
2270 return (rt_rq_count > 0) && ((rt_rq_count + nbackup_cpus - bit_count(pset->pending_AST_URGENT_cpu_mask)) > 0);
2271}
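
/*
 * Worked example (illustrative numbers): with 2 runnable RT threads, a
 * low-latency runqueue provisioning 1 backup processor, and 1 CPU already
 * holding a pending urgent AST, the check above evaluates as
 * (2 > 0) && ((2 + 1 - 1) > 0), so a followup IPI is warranted.
 */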
2272
2273bool
2274pset_has_stealable_rt_threads(processor_set_t pset)
2275{
2276 pset_node_t node = pset->node;
2277 if (bit_count(node->pset_map) == 1) {
2278 return false;
2279 }
2280
2281 cpumap_t avail_map = pset_available_but_not_running_rt_threads_cpumap(pset);
2282
2283 return rt_runq_count(pset) > bit_count(avail_map);
2284}
2285
2286static void
2287pset_update_rt_stealable_state(processor_set_t pset)
2288{
2289 if (pset_has_stealable_rt_threads(pset)) {
2290 pset->stealable_rt_threads_earliest_deadline = rt_runq_earliest_deadline(pset);
2291 } else {
2292 pset->stealable_rt_threads_earliest_deadline = RT_DEADLINE_NONE;
2293 }
2294}
2295
2296static void
2297clear_pending_AST_bits(processor_set_t pset, processor_t processor, __kdebug_only const int trace_point_number)
2298{
2299 /* Acknowledge any pending IPIs here with pset lock held */
2300 pset_assert_locked(pset);
2301 if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
2302 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END,
2303 processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, trace_point_number);
2304 }
2305 bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
2306
2307#if defined(CONFIG_SCHED_DEFERRED_AST)
2308 bit_clear(pset->pending_deferred_AST_cpu_mask, processor->cpu_id);
2309#endif
2310}
2311
2312/*
2313 * Called with pset locked, on a processor that is committing to run a new thread
2314 * Will transition an idle or dispatching processor to running as it picks up
2315 * the first new thread from the idle thread.
2316 */
2317static void
2318pset_commit_processor_to_new_thread(processor_set_t pset, processor_t processor, thread_t new_thread)
2319{
2320 pset_assert_locked(pset);
2321
2322 if (processor->state == PROCESSOR_DISPATCHING || processor->state == PROCESSOR_IDLE) {
2323 assert(current_thread() == processor->idle_thread);
2324
2325 /*
2326 * Dispatching processor is now committed to running new_thread,
2327 * so change its state to PROCESSOR_RUNNING.
2328 */
2329 pset_update_processor_state(pset, processor, PROCESSOR_RUNNING);
2330 } else {
2331 assert((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_SHUTDOWN));
2332 }
2333
2334 processor_state_update_from_thread(processor, new_thread, true);
2335
2336 if (new_thread->sched_pri >= BASEPRI_RTQUEUES) {
2337 bit_set(pset->realtime_map, processor->cpu_id);
2338 } else {
2339 bit_clear(pset->realtime_map, processor->cpu_id);
2340 }
2341 pset_update_rt_stealable_state(pset);
2342
2343 pset_node_t node = pset->node;
2344
2345 if (bit_count(node->pset_map) == 1) {
2346 /* Node has only a single pset, so skip node pset map updates */
2347 return;
2348 }
2349
2350 cpumap_t avail_map = pset_available_cpumap(pset);
2351
2352 if (new_thread->sched_pri >= BASEPRI_RTQUEUES) {
2353 if ((avail_map & pset->realtime_map) == avail_map) {
2354 /* No more non-RT CPUs in this pset */
2355 atomic_bit_clear(&node->pset_non_rt_map, pset->pset_id, memory_order_relaxed);
2356 }
2357 avail_map &= pset->primary_map;
2358 if ((avail_map & pset->realtime_map) == avail_map) {
2359 /* No more non-RT primary CPUs in this pset */
2360 atomic_bit_clear(&node->pset_non_rt_primary_map, pset->pset_id, memory_order_relaxed);
2361 }
2362 } else {
2363 if ((avail_map & pset->realtime_map) != avail_map) {
2364 if (!bit_test(atomic_load(&node->pset_non_rt_map), pset->pset_id)) {
2365 atomic_bit_set(&node->pset_non_rt_map, pset->pset_id, memory_order_relaxed);
2366 }
2367 }
2368 avail_map &= pset->primary_map;
2369 if ((avail_map & pset->realtime_map) != avail_map) {
2370 if (!bit_test(atomic_load(&node->pset_non_rt_primary_map), pset->pset_id)) {
2371 atomic_bit_set(&node->pset_non_rt_primary_map, pset->pset_id, memory_order_relaxed);
2372 }
2373 }
2374 }
2375}
2376
2377static processor_t choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills);
2378static processor_t choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline,
2379 processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus);
2380static processor_t choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries);
2381#if defined(__x86_64__)
2382static bool all_available_primaries_are_running_realtime_threads(processor_set_t pset, bool include_backups);
2383static bool these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map, bool include_backups);
2384#endif
2385static bool sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor, bool as_backup);
2386static bool processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor);
2387
2388static bool
2389other_psets_have_earlier_rt_threads_pending(processor_set_t stealing_pset, uint64_t earliest_deadline)
2390{
2391 pset_map_t pset_map = stealing_pset->node->pset_map;
2392
2393 bit_clear(pset_map, stealing_pset->pset_id);
2394
2395 for (int pset_id = lsb_first(pset_map); pset_id >= 0; pset_id = lsb_next(pset_map, pset_id)) {
2396 processor_set_t nset = pset_array[pset_id];
2397
2398 if (deadline_add(nset->stealable_rt_threads_earliest_deadline, rt_deadline_epsilon) < earliest_deadline) {
2399 return true;
2400 }
2401 }
2402
2403 return false;
2404}
2405
2406/*
2407 * starting_pset must be locked on entry; returns true if it was unlocked before returning
2408 */
2409static bool
2410choose_next_rt_processor_for_IPI(processor_set_t starting_pset, processor_t chosen_processor, bool spill_ipi,
2411 processor_t *result_processor, sched_ipi_type_t *result_ipi_type)
2412{
2413 bool starting_pset_is_unlocked = false;
2414 uint64_t earliest_deadline = rt_runq_earliest_deadline(starting_pset);
2415 int max_pri = rt_runq_priority(starting_pset);
2416 __kdebug_only uint64_t spill_tid = thread_tid(rt_runq_first(&starting_pset->rt_runq));
2417 processor_set_t pset = starting_pset;
2418 processor_t next_rt_processor = PROCESSOR_NULL;
2419 if (spill_ipi) {
2420 processor_set_t nset = next_pset(pset);
2421 assert(nset != starting_pset);
2422 pset = change_locked_pset(pset, nset);
2423 starting_pset_is_unlocked = true;
2424 }
2425 do {
2426 const bool consider_secondaries = true;
2427 next_rt_processor = choose_next_processor_for_realtime_thread(pset, max_pri, earliest_deadline, chosen_processor, consider_secondaries);
2428 if (next_rt_processor == PROCESSOR_NULL) {
2429 if (!spill_ipi) {
2430 break;
2431 }
2432 processor_set_t nset = next_pset(pset);
2433 if (nset == starting_pset) {
2434 break;
2435 }
2436 pset = change_locked_pset(pset, nset);
2437 starting_pset_is_unlocked = true;
2438 }
2439 } while (next_rt_processor == PROCESSOR_NULL);
2440 if (next_rt_processor) {
2441 if (pset != starting_pset) {
2442 if (bit_set_if_clear(pset->rt_pending_spill_cpu_mask, next_rt_processor->cpu_id)) {
2443 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_START,
2444 next_rt_processor->cpu_id, pset->rt_pending_spill_cpu_mask, starting_pset->cpu_set_low, (uintptr_t)spill_tid);
2445 }
2446 }
2447 *result_ipi_type = sched_ipi_action(next_rt_processor, NULL, SCHED_IPI_EVENT_RT_PREEMPT);
2448 *result_processor = next_rt_processor;
2449 }
2450 if (pset != starting_pset) {
2451 pset_unlock(pset);
2452 }
2453
2454 return starting_pset_is_unlocked;
2455}
2456
2457/*
2458 * backup processor - used by choose_processor to send a backup IPI to, in case the preferred processor can't respond immediately
2459 * followup processor - used in thread_select when there are still threads on the run queue and available processors
2460 * spill processor - a processor in a different processor set that is signalled to steal a thread from this run queue
2461 */
2462typedef enum {
2463 none,
2464 backup,
2465 followup,
2466 spill
2467} next_processor_type_t;
2468
2469#undef LOOP_COUNT
2470#ifdef LOOP_COUNT
2471int max_loop_count[MAX_SCHED_CPUS] = { 0 };
2472#endif
2473
2474/*
2475 * thread_select:
2476 *
2477 * Select a new thread for the current processor to execute.
2478 *
2479 * May select the current thread, which must be locked.
2480 */
2481static thread_t
2482thread_select(thread_t thread,
2483 processor_t processor,
2484 ast_t *reason)
2485{
2486 processor_set_t pset = processor->processor_set;
2487 thread_t new_thread = THREAD_NULL;
2488
2489 assert(processor == current_processor());
2490 assert((thread->state & (TH_RUN | TH_TERMINATE2)) == TH_RUN);
2491
2492 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_START,
2493 0, pset->pending_AST_URGENT_cpu_mask, 0, 0);
2494
2495 __kdebug_only int idle_reason = 0;
2496 __kdebug_only int delay_count = 0;
2497
2498#if defined(__x86_64__)
2499 int timeout_count = sched_backup_cpu_timeout_count;
2500 if ((sched_avoid_cpu0 == 1) && (processor->cpu_id == 0)) {
2501 /* Prefer cpu0 as backup */
2502 timeout_count--;
2503 } else if ((sched_avoid_cpu0 == 2) && (processor->processor_primary != processor)) {
2504 /* Prefer secondary cpu as backup */
2505 timeout_count--;
2506 }
2507#endif
2508 bool pending_AST_URGENT = false;
2509 bool pending_AST_PREEMPT = false;
2510
2511#ifdef LOOP_COUNT
2512 int loop_count = -1;
2513#endif
2514
2515 do {
2516 /*
2517 * Update the priority.
2518 */
2519 if (SCHED(can_update_priority)(thread)) {
2520 SCHED(update_priority)(thread);
2521 }
2522
2523 pset_lock(pset);
2524
2525restart:
2526#ifdef LOOP_COUNT
2527 loop_count++;
2528 if (loop_count > max_loop_count[processor->cpu_id]) {
2529 max_loop_count[processor->cpu_id] = loop_count;
2530 if (bit_count(loop_count) == 1) {
2531 kprintf("[%d]%s>max_loop_count = %d\n", processor->cpu_id, __FUNCTION__, loop_count);
2532 }
2533 }
2534#endif
2535 pending_AST_URGENT = bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id);
2536 pending_AST_PREEMPT = bit_test(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
2537
2538 processor_state_update_from_thread(processor, thread, true);
2539
2540 idle_reason = 0;
2541
2542 processor_t ast_processor = PROCESSOR_NULL;
2543 processor_t next_rt_processor = PROCESSOR_NULL;
2544 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
2545 sched_ipi_type_t next_rt_ipi_type = SCHED_IPI_NONE;
2546
2547 assert(processor->state != PROCESSOR_OFF_LINE);
2548
2549 /*
2550 * Bound threads are dispatched to a processor without going through
2551 * choose_processor(), so in those cases we must continue trying to dequeue work
2552 * as we are the only option.
2553 */
2554 if (!SCHED(processor_bound_count)(processor)) {
2555 if (!processor->is_recommended) {
2556 /*
2557 * The performance controller has provided a hint not to dispatch more threads.
2558 */
2559 idle_reason = 1;
2560 goto send_followup_ipi_before_idle;
2561 } else if (rt_runq_count(pset)) {
2562 bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor, false);
2563 /* Give the current RT thread a chance to complete */
2564 ok_to_run_realtime_thread |= (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice);
2565#if defined(__x86_64__)
2566 /*
2567 * On Intel we want to avoid SMT secondary processors and processor 0
2568 * but allow them to be used as backup processors in case the preferred chosen
2569 * processor is delayed by interrupts or processor stalls. So if it is
2570 * not ok_to_run_realtime_thread as preferred (sched_ok_to_run_realtime_thread(pset, processor, as_backup=false))
2571 * but ok_to_run_realtime_thread as backup (sched_ok_to_run_realtime_thread(pset, processor, as_backup=true))
2572 * we delay up to (timeout_count * 10us) to give the preferred processor chance
2573 * to grab the thread before the (current) backup processor does.
2574 *
2575 * timeout_count defaults to 5 but can be tuned using sysctl kern.sched_backup_cpu_timeout_count
2576 * on DEVELOPMENT || DEBUG kernels. It is also adjusted (see above) depending on whether we want to use
2577 * cpu0 before secondary cpus or not.
2578 */
2579 if (!ok_to_run_realtime_thread) {
2580 if (sched_ok_to_run_realtime_thread(pset, processor, true)) {
2581 if (timeout_count-- > 0) {
2582 pset_unlock(pset);
2583 thread_unlock(thread);
2584 delay(10);
2585 delay_count++;
2586 thread_lock(thread);
2587 pset_lock(pset);
2588 goto restart;
2589 }
2590 ok_to_run_realtime_thread = true;
2591 }
2592 }
2593#endif
2594 if (!ok_to_run_realtime_thread) {
2595 idle_reason = 2;
2596 goto send_followup_ipi_before_idle;
2597 }
2598 } else if (processor->processor_primary != processor) {
2599 /*
2600 * Should this secondary SMT processor attempt to find work? For pset runqueue systems,
2601 * we should look for work only under the same conditions that choose_processor()
2602 * would have assigned work, which is when all primary processors have been assigned work.
2603 */
2604 if ((pset->recommended_bitmask & pset->primary_map & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
2605 /* There are idle primaries */
2606 idle_reason = 3;
2607 goto idle;
2608 }
2609 }
2610 }
2611
2612 /*
2613 * Test to see if the current thread should continue
2614 * to run on this processor. Must not be attempting to wait, and not
2615 * bound to a different processor, nor be in the wrong
2616 * processor set, nor be forced to context switch by TH_SUSP.
2617 *
2618 * Note that there are never any RT threads in the regular runqueue.
2619 *
2620 * This code is insanely tricky.
2621 */
2622
2623 /* i.e. not waiting, not TH_SUSP'ed */
2624 bool still_running = ((thread->state & (TH_TERMINATE | TH_IDLE | TH_WAIT | TH_RUN | TH_SUSP)) == TH_RUN);
2625
2626 /*
2627 * Threads running on SMT processors are forced to context switch. Don't rebalance realtime threads.
2628 * TODO: This should check if it's worth it to rebalance, i.e. 'are there any idle primary processors'
2629 * <rdar://problem/47907700>
2630 *
2631 * A yielding thread shouldn't be forced to context switch.
2632 */
2633
2634 bool is_yielding = (*reason & AST_YIELD) == AST_YIELD;
2635
2636 bool needs_smt_rebalance = !is_yielding && thread->sched_pri < BASEPRI_RTQUEUES && processor->processor_primary != processor;
2637
2638 bool affinity_mismatch = thread->affinity_set != AFFINITY_SET_NULL && thread->affinity_set->aset_pset != pset;
2639
2640 bool bound_elsewhere = thread->bound_processor != PROCESSOR_NULL && thread->bound_processor != processor;
2641
2642 bool avoid_processor = !is_yielding && SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread, *reason);
2643
2644 bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor, true);
2645
2646 bool current_thread_can_keep_running = (still_running && !needs_smt_rebalance && !affinity_mismatch && !bound_elsewhere && !avoid_processor);
2647 if (current_thread_can_keep_running) {
2648 /*
2649 * This thread is eligible to keep running on this processor.
2650 *
2651 * RT threads with un-expired quantum stay on processor,
2652 * unless there's a valid RT thread with an earlier deadline
2653 * and it is still ok_to_run_realtime_thread.
2654 */
2655 if (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice) {
2656 /*
2657 * Pick a new RT thread only if ok_to_run_realtime_thread
2658 * (but the current thread is allowed to complete).
2659 */
2660 if (ok_to_run_realtime_thread) {
2661 if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
2662 goto pick_new_rt_thread;
2663 }
2664 if (rt_runq_priority(pset) > thread->sched_pri) {
2665 if (sched_rt_runq_strict_priority) {
2666 /* The next RT thread is better, so pick it off the runqueue. */
2667 goto pick_new_rt_thread;
2668 }
2669
2670 /*
2671 * See if the current lower priority thread can continue to run without causing
2672 * the higher priority thread on the runq queue to miss its deadline.
2673 */
2674 thread_t hi_thread = rt_runq_first(SCHED(rt_runq)(pset));
2675 if (thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon >= hi_thread->realtime.constraint) {
2676 /* The next RT thread is better, so pick it off the runqueue. */
2677 goto pick_new_rt_thread;
2678 }
2679 } else if ((rt_runq_count(pset) > 0) && (deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < thread->realtime.deadline)) {
2680 /* The next RT thread is better, so pick it off the runqueue. */
2681 goto pick_new_rt_thread;
2682 }
2683 if (other_psets_have_earlier_rt_threads_pending(pset, thread->realtime.deadline)) {
2684 goto pick_new_rt_thread;
2685 }
2686 }
2687
2688 /* This is still the best RT thread to run. */
2689 processor->deadline = thread->realtime.deadline;
2690
2691 sched_update_pset_load_average(pset, 0);
2692
2693 clear_pending_AST_bits(pset, processor, 1);
2694
2695 next_rt_processor = PROCESSOR_NULL;
2696 next_rt_ipi_type = SCHED_IPI_NONE;
2697
2698 bool pset_unlocked = false;
2699 __kdebug_only next_processor_type_t nptype = none;
2700 if (sched_allow_rt_steal && pset_has_stealable_rt_threads(pset)) {
2701 nptype = spill;
2702 pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, true, &next_rt_processor, &next_rt_ipi_type);
2703 } else if (pset_needs_a_followup_IPI(pset)) {
2704 nptype = followup;
2705 pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, false, &next_rt_processor, &next_rt_ipi_type);
2706 }
2707 if (!pset_unlocked) {
2708 pset_unlock(pset);
2709 }
2710
2711 if (next_rt_processor) {
2712 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
2713 next_rt_processor->cpu_id, next_rt_processor->state, nptype, 2);
2714 sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
2715 }
2716
2717 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2718 (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 1);
2719 return thread;
2720 }
2721
2722 if ((rt_runq_count(pset) == 0) &&
2723 SCHED(processor_queue_has_priority)(processor, thread->sched_pri, TRUE) == FALSE) {
2724 /* This thread is still the highest priority runnable (non-idle) thread */
2725 processor->deadline = RT_DEADLINE_NONE;
2726
2727 sched_update_pset_load_average(pset, 0);
2728
2729 clear_pending_AST_bits(pset, processor, 2);
2730
2731 pset_unlock(pset);
2732
2733 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2734 (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 2);
2735 return thread;
2736 }
2737 } else {
2738 /*
2739 * This processor must context switch.
2740 * If it's due to a rebalance, we should aggressively find this thread a new home.
2741 */
2742 if (needs_smt_rebalance || affinity_mismatch || bound_elsewhere || avoid_processor) {
2743 *reason |= AST_REBALANCE;
2744 }
2745 }
2746
2747 bool secondary_forced_idle = ((processor->processor_secondary != PROCESSOR_NULL) &&
2748 (thread_no_smt(thread) || (thread->sched_pri >= BASEPRI_RTQUEUES)) &&
2749 (processor->processor_secondary->state == PROCESSOR_IDLE));
2750
2751 /* OK, so we're not going to run the current thread. Look at the RT queue. */
2752 if (ok_to_run_realtime_thread) {
2753pick_new_rt_thread:
2754 new_thread = sched_rt_choose_thread(pset);
2755 if (new_thread != THREAD_NULL) {
2756 processor->deadline = new_thread->realtime.deadline;
2757 pset_commit_processor_to_new_thread(pset, processor, new_thread);
2758
2759 clear_pending_AST_bits(pset, processor, 3);
2760
2761 if (processor->processor_secondary != NULL) {
2762 processor_t sprocessor = processor->processor_secondary;
2763 if ((sprocessor->state == PROCESSOR_RUNNING) || (sprocessor->state == PROCESSOR_DISPATCHING)) {
2764 ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
2765 ast_processor = sprocessor;
2766 }
2767 }
2768 }
2769 }
2770
2771send_followup_ipi_before_idle:
2772 /* This might not have been cleared if we didn't call sched_rt_choose_thread() */
2773 if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
2774 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 5);
2775 }
2776 __kdebug_only next_processor_type_t nptype = none;
2777 bool pset_unlocked = false;
2778 if (sched_allow_rt_steal && pset_has_stealable_rt_threads(pset)) {
2779 nptype = spill;
2780 pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, true, &next_rt_processor, &next_rt_ipi_type);
2781 } else if (pset_needs_a_followup_IPI(pset)) {
2782 nptype = followup;
2783 pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, false, &next_rt_processor, &next_rt_ipi_type);
2784 }
2785
2786 assert(new_thread || !ast_processor);
2787 if (new_thread || next_rt_processor) {
2788 if (!pset_unlocked) {
2789 pset_unlock(pset);
2790 pset_unlocked = true;
2791 }
2792 if (ast_processor == next_rt_processor) {
2793 ast_processor = PROCESSOR_NULL;
2794 ipi_type = SCHED_IPI_NONE;
2795 }
2796
2797 if (ast_processor) {
2798 sched_ipi_perform(ast_processor, ipi_type);
2799 }
2800
2801 if (next_rt_processor) {
2802 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
2803 next_rt_processor->cpu_id, next_rt_processor->state, nptype, 3);
2804 sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
2805 }
2806
2807 if (new_thread) {
2808 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2809 (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 3);
2810 return new_thread;
2811 }
2812 }
2813
2814 if (pset_unlocked) {
2815 pset_lock(pset);
2816 }
2817
2818 if (!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
2819 /* Things changed while we dropped the lock */
2820 goto restart;
2821 }
2822
2823 if (processor->is_recommended) {
2824 bool spill_pending = bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
2825 if (sched_ok_to_run_realtime_thread(pset, processor, true) && (spill_pending || rt_runq_count(pset))) {
2826 /* Things changed while we dropped the lock */
2827 goto restart;
2828 }
2829
2830 if ((processor->processor_primary != processor) && (processor->processor_primary->current_pri >= BASEPRI_RTQUEUES)) {
2831 /* secondary can only run realtime thread */
2832 if (idle_reason == 0) {
2833 idle_reason = 4;
2834 }
2835 goto idle;
2836 }
2837 } else if (!SCHED(processor_bound_count)(processor)) {
2838 /* processor not recommended and no bound threads */
2839 if (idle_reason == 0) {
2840 idle_reason = 5;
2841 }
2842 goto idle;
2843 }
2844
2845 processor->deadline = RT_DEADLINE_NONE;
2846
2847 /* No RT threads, so let's look at the regular threads. */
2848 if ((new_thread = SCHED(choose_thread)(processor, MINPRI, *reason)) != THREAD_NULL) {
2849 pset_commit_processor_to_new_thread(pset, processor, new_thread);
2850
2851 clear_pending_AST_bits(pset, processor, 4);
2852
2853 ast_processor = PROCESSOR_NULL;
2854 ipi_type = SCHED_IPI_NONE;
2855
2856 processor_t sprocessor = processor->processor_secondary;
2857 if (sprocessor != NULL) {
2858 if (sprocessor->state == PROCESSOR_RUNNING) {
2859 if (thread_no_smt(new_thread)) {
2860 ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_SMT_REBAL);
2861 ast_processor = sprocessor;
2862 }
2863 } else if (secondary_forced_idle && !thread_no_smt(new_thread) && pset_has_stealable_threads(pset)) {
2864 ipi_type = sched_ipi_action(sprocessor, NULL, SCHED_IPI_EVENT_PREEMPT);
2865 ast_processor = sprocessor;
2866 }
2867 }
2868 pset_unlock(pset);
2869
2870 if (ast_processor) {
2871 sched_ipi_perform(ast_processor, ipi_type);
2872 }
2873 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2874 (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 4);
2875 return new_thread;
2876 }
2877
2878 if (processor->must_idle) {
2879 processor->must_idle = false;
2880 *reason |= AST_REBALANCE;
2881 idle_reason = 6;
2882 goto idle;
2883 }
2884
2885 if (SCHED(steal_thread_enabled)(pset) && (processor->processor_primary == processor)) {
2886 /*
2887 * No runnable threads, attempt to steal
2888 * from other processors. Returns with pset lock dropped.
2889 */
2890
2891 if ((new_thread = SCHED(steal_thread)(pset)) != THREAD_NULL) {
2892 pset_lock(pset);
2893 pset_commit_processor_to_new_thread(pset, processor, new_thread);
2894 if (!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
2895 /*
2896 * A realtime thread chose this processor while it was DISPATCHING
2897 * and the pset lock was dropped
2898 */
2899 ast_on(AST_URGENT | AST_PREEMPT);
2900 }
2901
2902 clear_pending_AST_bits(pset, processor, 5);
2903
2904 pset_unlock(pset);
2905
2906 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2907 (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 5);
2908 return new_thread;
2909 }
2910
2911 /*
2912 * If other threads have appeared, shortcut
2913 * around again.
2914 */
2915 if (SCHED(processor_bound_count)(processor)) {
2916 continue;
2917 }
2918 if (processor->is_recommended) {
2919 if (!SCHED(processor_queue_empty)(processor) || (sched_ok_to_run_realtime_thread(pset, processor, true) && (rt_runq_count(pset) > 0))) {
2920 continue;
2921 }
2922 }
2923
2924 pset_lock(pset);
2925 }
2926
2927idle:
2928 /* Someone selected this processor while we had dropped the lock */
2929 if ((!pending_AST_URGENT && bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) ||
2930 (!pending_AST_PREEMPT && bit_test(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id))) {
2931 goto restart;
2932 }
2933
2934 if ((idle_reason == 0) && current_thread_can_keep_running) {
2935 /* This thread is the only runnable (non-idle) thread */
2936 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
2937 processor->deadline = thread->realtime.deadline;
2938 } else {
2939 processor->deadline = RT_DEADLINE_NONE;
2940 }
2941
2942 sched_update_pset_load_average(pset, 0);
2943
2944 clear_pending_AST_bits(pset, processor, 6);
2945
2946 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2947 (uintptr_t)thread_tid(thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 6);
2948 pset_unlock(pset);
2949 return thread;
2950 }
2951
2952 /*
2953 * Nothing is runnable, or this processor must be forced idle,
2954 * so set this processor idle if it was running.
2955 */
2956 if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
2957 pset_update_processor_state(pset, processor, PROCESSOR_IDLE);
2958 processor_state_update_idle(processor);
2959 }
2960 pset_update_rt_stealable_state(pset);
2961
2962 clear_pending_AST_bits(pset, processor, 7);
2963
2964 /* Invoked with pset locked, returns with pset unlocked */
2965 processor->next_idle_short = SCHED(processor_balance)(processor, pset);
2966
2967 new_thread = processor->idle_thread;
2968 } while (new_thread == THREAD_NULL);
2969
2970 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SELECT) | DBG_FUNC_END,
2971 (uintptr_t)thread_tid(new_thread), pset->pending_AST_URGENT_cpu_mask, delay_count, 10 + idle_reason);
2972 return new_thread;
2973}
2974
2975/*
2976 * thread_invoke
2977 *
2978 * Called at splsched with neither thread locked.
2979 *
2980 * Perform a context switch and start executing the new thread.
2981 *
2982 * Returns FALSE when the context switch didn't happen.
2983 * The reference to the new thread is still consumed.
2984 *
2985 * "self" is what is currently running on the processor,
2986 * "thread" is the new thread to context switch to
2987 * (which may be the same thread in some cases)
2988 */
2989static boolean_t
2990thread_invoke(
2991 thread_t self,
2992 thread_t thread,
2993 ast_t reason)
2994{
2995 if (__improbable(get_preemption_level() != 0)) {
2996 int pl = get_preemption_level();
2997 panic("thread_invoke: preemption_level %d, possible cause: %s",
2998 pl, (pl < 0 ? "unlocking an unlocked mutex or spinlock" :
2999 "blocking while holding a spinlock, or within interrupt context"));
3000 }
3001
3002 thread_continue_t continuation = self->continuation;
3003 void *parameter = self->parameter;
3004
3005 struct recount_snap snap = { 0 };
3006 recount_snapshot(&snap);
3007 uint64_t ctime = snap.rsn_time_mach;
3008
3009 check_monotonic_time(ctime);
3010
3011#ifdef CONFIG_MACH_APPROXIMATE_TIME
3012 commpage_update_mach_approximate_time(ctime);
3013#endif
3014
3015 if (ctime < thread->last_made_runnable_time) {
3016 panic("Non-monotonic time: invoke at 0x%llx, runnable at 0x%llx",
3017 ctime, thread->last_made_runnable_time);
3018 }
3019
3020#if defined(CONFIG_SCHED_TIMESHARE_CORE)
3021 if (!((thread->state & TH_IDLE) != 0 ||
3022 ((reason & AST_HANDOFF) && self->sched_mode == TH_MODE_REALTIME))) {
3023 sched_timeshare_consider_maintenance(ctime, true);
3024 }
3025#endif
3026
3027 recount_log_switch_thread(&snap);
3028
3029 assert_thread_magic(self);
3030 assert(self == current_thread());
3031 thread_assert_runq_null(self);
3032 assert((self->state & (TH_RUN | TH_TERMINATE2)) == TH_RUN);
3033
3034 thread_lock(thread);
3035
3036 assert_thread_magic(thread);
3037 assert((thread->state & (TH_RUN | TH_WAIT | TH_UNINT | TH_TERMINATE | TH_TERMINATE2)) == TH_RUN);
3038 assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == current_processor());
3039 thread_assert_runq_null(thread);
3040
3041 /* Update SFI class based on other factors */
3042 thread->sfi_class = sfi_thread_classify(thread);
3043
3044 /* Update the same_pri_latency for the thread (used by perfcontrol callouts) */
3045 thread->same_pri_latency = ctime - thread->last_basepri_change_time;
3046 /*
3047 * In case a base_pri update happened between the timestamp and
3048 * taking the thread lock
3049 */
3050 if (ctime <= thread->last_basepri_change_time) {
3051 thread->same_pri_latency = ctime - thread->last_made_runnable_time;
3052 }
3053
3054 /* Allow realtime threads to hang onto a stack. */
3055 if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack) {
3056 self->reserved_stack = self->kernel_stack;
3057 }
3058
3059 /* Prepare for spin debugging */
3060#if SCHED_HYGIENE_DEBUG
3061 ml_spin_debug_clear(thread);
3062#endif
3063
3064 if (continuation != NULL) {
3065 if (!thread->kernel_stack) {
3066 /*
3067 * If we are using a privileged stack,
3068 * check to see whether we can exchange it with
3069 * that of the other thread.
3070 */
3071 if (self->kernel_stack == self->reserved_stack && !thread->reserved_stack) {
3072 goto need_stack;
3073 }
3074
3075 /*
3076 * Context switch by performing a stack handoff.
3077 * Requires both threads to be parked in a continuation.
3078 */
3079 continuation = thread->continuation;
3080 parameter = thread->parameter;
3081
3082 processor_t processor = current_processor();
3083 processor->active_thread = thread;
3084 processor_state_update_from_thread(processor, thread, false);
3085
3086 if (thread->last_processor != processor && thread->last_processor != NULL) {
3087 if (thread->last_processor->processor_set != processor->processor_set) {
3088 thread->ps_switch++;
3089 }
3090 thread->p_switch++;
3091 }
3092 thread->last_processor = processor;
3093 thread->c_switch++;
3094 ast_context(thread);
3095
3096 thread_unlock(thread);
3097
3098 self->reason = reason;
3099
3100 processor->last_dispatch = ctime;
3101 self->last_run_time = ctime;
3102 timer_update(&thread->runnable_timer, ctime);
3103 recount_switch_thread(&snap, self, get_threadtask(self));
3104
3105 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3106 MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF) | DBG_FUNC_NONE,
3107 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
3108
3109 if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
3110 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED) | DBG_FUNC_NONE,
3111 (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
3112 }
3113
3114 DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, current_proc());
3115
3116 SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
3117
3118#if KPERF
3119 kperf_off_cpu(self);
3120#endif /* KPERF */
3121
3122 /*
3123 * This is where we actually switch thread identity,
3124 * and address space if required. However, register
3125 * state is not switched - this routine leaves the
3126 * stack and register state active on the current CPU.
3127 */
3128 TLOG(1, "thread_invoke: calling stack_handoff\n");
3129 stack_handoff(self, thread);
3130
3131 /* 'self' is now off core */
3132 assert(thread == current_thread_volatile());
3133
3134 DTRACE_SCHED(on__cpu);
3135
3136#if KPERF
3137 kperf_on_cpu(thread, continuation, NULL);
3138#endif /* KPERF */
3139
3140 recount_log_switch_thread_on(&snap);
3141
3142 thread_dispatch(self, thread);
3143
3144#if KASAN
3145 /* Old thread's stack has been moved to the new thread, so explicitly
3146 * unpoison it. */
3147 kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
3148#endif
3149
3150 thread->continuation = thread->parameter = NULL;
3151
3152 boolean_t enable_interrupts = TRUE;
3153
3154 /* idle thread needs to stay interrupts-disabled */
3155 if ((thread->state & TH_IDLE)) {
3156 enable_interrupts = FALSE;
3157 }
3158
3159 assert(continuation);
3160 call_continuation(continuation, parameter,
3161 thread->wait_result, enable_interrupts);
3162 /*NOTREACHED*/
3163 } else if (thread == self) {
3164 /* same thread but with continuation */
3165 ast_context(self);
3166
3167 thread_unlock(self);
3168
3169#if KPERF
3170 kperf_on_cpu(thread, continuation, NULL);
3171#endif /* KPERF */
3172
3173 recount_log_switch_thread_on(&snap);
3174
3175 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3176 MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
3177 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
3178
3179#if KASAN
3180 /* stack handoff to self - no thread_dispatch(), so clear the stack
3181 * and free the fakestack directly */
3182#if KASAN_CLASSIC
3183 kasan_fakestack_drop(self);
3184 kasan_fakestack_gc(self);
3185#endif /* KASAN_CLASSIC */
3186 kasan_unpoison_stack(self->kernel_stack, kernel_stack_size);
3187#endif /* KASAN */
3188
3189 self->continuation = self->parameter = NULL;
3190
3191 boolean_t enable_interrupts = TRUE;
3192
3193 /* idle thread needs to stay interrupts-disabled */
3194 if ((self->state & TH_IDLE)) {
3195 enable_interrupts = FALSE;
3196 }
3197
3198 call_continuation(continuation, parameter,
3199 self->wait_result, enable_interrupts);
3200 /*NOTREACHED*/
3201 }
3202 } else {
3203 /*
3204 * Check that the other thread has a stack
3205 */
3206 if (!thread->kernel_stack) {
3207need_stack:
3208 if (!stack_alloc_try(thread)) {
3209 thread_unlock(thread);
3210 thread_stack_enqueue(thread);
3211 return FALSE;
3212 }
3213 } else if (thread == self) {
3214 ast_context(self);
3215 thread_unlock(self);
3216
3217 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3218 MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
3219 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
3220
3221 return TRUE;
3222 }
3223 }
3224
3225 /*
3226 * Context switch by full context save.
3227 */
3228 processor_t processor = current_processor();
3229 processor->active_thread = thread;
3230 processor_state_update_from_thread(processor, thread, false);
3231
3232 if (thread->last_processor != processor && thread->last_processor != NULL) {
3233 if (thread->last_processor->processor_set != processor->processor_set) {
3234 thread->ps_switch++;
3235 }
3236 thread->p_switch++;
3237 }
3238 thread->last_processor = processor;
3239 thread->c_switch++;
3240 ast_context(thread);
3241
3242 thread_unlock(thread);
3243
3244 self->reason = reason;
3245
3246 processor->last_dispatch = ctime;
3247 self->last_run_time = ctime;
3248 timer_update(&thread->runnable_timer, ctime);
3249 recount_switch_thread(&snap, self, get_threadtask(self));
3250
3251 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3252 MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED) | DBG_FUNC_NONE,
3253 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
3254
3255 if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) {
3256 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED) | DBG_FUNC_NONE,
3257 (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
3258 }
3259
3260 DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, current_proc());
3261
3262 SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
3263
3264#if KPERF
3265 kperf_off_cpu(self);
3266#endif /* KPERF */
3267
3268 /*
3269 * This is where we actually switch register context,
3270 * and address space if required. We will next run
3271 * as a result of a subsequent context switch.
3272 *
3273 * Once registers are switched and the processor is running "thread",
3274 * the stack variables and non-volatile registers will contain whatever
3275 * was there the last time that thread blocked. No local variables should
3276 * be used after this point, except for the special case of "thread", which
3277 * the platform layer returns as the previous thread running on the processor
3278 * via the function call ABI as a return register, and "self", which may have
3279 * been stored on the stack or a non-volatile register, but a stale idea of
3280 * what was on the CPU is newly-accurate because that thread is again
3281 * running on the CPU.
3282 *
3283 * If one of the threads is using a continuation, thread_continue
3284 * is used to stitch up its context.
3285 *
3286 * If we are invoking a thread which is resuming from a continuation,
3287 * the CPU will invoke thread_continue next.
3288 *
3289 * If the current thread is parking in a continuation, then its state
3290 * won't be saved and the stack will be discarded. When the stack is
3291 * re-allocated, it will be configured to resume from thread_continue.
3292 */
3293
3294 assert(continuation == self->continuation);
3295 thread = machine_switch_context(self, continuation, thread);
3296 assert(self == current_thread_volatile());
3297 TLOG(1, "thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);
3298
3299 assert(continuation == NULL && self->continuation == NULL);
3300
3301 DTRACE_SCHED(on__cpu);
3302
3303#if KPERF
3304 kperf_on_cpu(self, NULL, __builtin_frame_address(0));
3305#endif /* KPERF */
3306
3307 /* Previous snap on the old stack is gone. */
3308 recount_log_switch_thread_on(NULL);
3309
3310 /* We have been resumed and are set to run. */
3311 thread_dispatch(thread, self);
3312
3313 return TRUE;
3314}
3315
3316#if defined(CONFIG_SCHED_DEFERRED_AST)
3317/*
3318 * pset_cancel_deferred_dispatch:
3319 *
3320 * Cancels all ASTs that we can cancel for the given processor set
3321 * if the current processor is running the last runnable thread in the
3322 * system.
3323 *
3324 * This function assumes the current thread is runnable. This must
3325 * be called with the pset unlocked.
3326 */
3327static void
3328pset_cancel_deferred_dispatch(
3329 processor_set_t pset,
3330 processor_t processor)
3331{
3332 processor_t active_processor = NULL;
3333 uint32_t sampled_sched_run_count;
3334
3335 pset_lock(pset);
3336 sampled_sched_run_count = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3337
3338 /*
3339 * If we have emptied the run queue, and our current thread is runnable, we
3340 * should tell any processors that are still DISPATCHING that they will
3341 * probably not have any work to do. In the event that there are no
3342 * pending signals that we can cancel, this is also uninteresting.
3343 *
3344 * In the unlikely event that another thread becomes runnable while we are
3345 * doing this (sched_run_count is atomically updated, not guarded), the
3346 * codepath making it runnable SHOULD (a dangerous word) need the pset lock
3347 * in order to dispatch it to a processor in our pset. So, the other
3348 * codepath will wait while we squash all cancelable ASTs, get the pset
3349 * lock, and then dispatch the freshly runnable thread. So this should be
3350 * correct (we won't accidentally have a runnable thread that hasn't been
3351 * dispatched to an idle processor), if not ideal (we may be restarting the
3352 * dispatch process, which could have some overhead).
3353 */
3354
3355 if ((sampled_sched_run_count == 1) && (pset->pending_deferred_AST_cpu_mask)) {
3356 uint64_t dispatching_map = (pset->cpu_state_map[PROCESSOR_DISPATCHING] &
3357 pset->pending_deferred_AST_cpu_mask &
3358 ~pset->pending_AST_URGENT_cpu_mask);
3359 for (int cpuid = lsb_first(dispatching_map); cpuid >= 0; cpuid = lsb_next(dispatching_map, cpuid)) {
3360 active_processor = processor_array[cpuid];
3361 /*
3362 * If a processor is DISPATCHING, it could be because of
3363 * a cancelable signal.
3364 *
3365 * IF the processor is not our
3366 * current processor (the current processor should not
3367 * be DISPATCHING, so this is a bit paranoid), AND there
3368 * is a cancelable signal pending on the processor, AND
3369 * there is no non-cancelable signal pending (as there is
3370 * no point trying to backtrack on bringing the processor
3371 * up if a signal we cannot cancel is outstanding), THEN
3372 * it should make sense to roll back the processor state
3373 * to the IDLE state.
3374 *
3375 * If the racy nature of this approach (as the signal
3376 * will be arbitrated by hardware, and can fire as we
3377 * roll back state) results in the core responding
3378 * despite being pushed back to the IDLE state, it
3379 * should be no different than if the core took some
3380 * interrupt while IDLE.
3381 */
3382 if (active_processor != processor) {
3383 /*
3384 * Squash all of the processor state back to some
3385 * reasonable facsimile of PROCESSOR_IDLE.
3386 */
3387
3388 processor_state_update_idle(active_processor);
3389 active_processor->deadline = RT_DEADLINE_NONE;
3390 pset_update_processor_state(pset, active_processor, PROCESSOR_IDLE);
3391 bit_clear(pset->pending_deferred_AST_cpu_mask, active_processor->cpu_id);
3392 machine_signal_idle_cancel(active_processor);
3393 }
3394 }
3395 }
3396
3397 pset_unlock(pset);
3398}
3399#else
3400/* We don't support deferred ASTs; everything is candycanes and sunshine. */
3401#endif
3402
3403static void
3404thread_csw_callout(
3405 thread_t old,
3406 thread_t new,
3407 uint64_t timestamp)
3408{
3409 perfcontrol_event event = (new->state & TH_IDLE) ? IDLE : CONTEXT_SWITCH;
3410 uint64_t same_pri_latency = (new->state & TH_IDLE) ? 0 : new->same_pri_latency;
3411 machine_switch_perfcontrol_context(event, timestamp, 0,
3412 same_pri_latency, old, new);
3413}
3414
3415
3416/*
3417 * thread_dispatch:
3418 *
3419 * Handle threads at context switch. Re-dispatch other thread
3420 * if still running, otherwise update run state and perform
3421 * special actions. Update quantum for other thread and begin
3422 * the quantum for ourselves.
3423 *
3424 * "thread" is the old thread that we have switched away from.
3425 * "self" is the new current thread that we have context switched to
3426 *
3427 * Called at splsched.
3428 *
3429 */
3430void
3431thread_dispatch(
3432 thread_t thread,
3433 thread_t self)
3434{
3435 processor_t processor = self->last_processor;
3436 bool was_idle = false;
3437
3438 assert(processor == current_processor());
3439 assert(self == current_thread_volatile());
3440 assert(thread != self);
3441
3442 if (thread != THREAD_NULL) {
3443 /*
3444 * Do the perfcontrol callout for context switch.
3445		 * The reasons we do it here are:
3446		 * - thread_dispatch() is called from various places that are not
3447		 *   the direct context switch path (e.g. processor shutdown), so
3448		 *   adding the callout here covers all those cases.
3449		 * - We want this callout as early as possible, so that it is close
3450		 *   to the timestamp taken in thread_invoke().
3451		 * - We want to avoid holding the thread lock while doing the
3452		 *   callout.
3453		 * - We do not want to make the callout if "thread" is NULL.
3454 */
3455 thread_csw_callout(old: thread, new: self, timestamp: processor->last_dispatch);
3456
3457#if KASAN
3458 if (thread->continuation != NULL) {
3459 /*
3460 * Thread has a continuation and the normal stack is going away.
3461 * Unpoison the stack and mark all fakestack objects as unused.
3462 */
3463#if KASAN_CLASSIC
3464 kasan_fakestack_drop(thread);
3465#endif /* KASAN_CLASSIC */
3466 if (thread->kernel_stack) {
3467 kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
3468 }
3469 }
3470
3471
3472#if KASAN_CLASSIC
3473 /*
3474 * Free all unused fakestack objects.
3475 */
3476 kasan_fakestack_gc(thread);
3477#endif /* KASAN_CLASSIC */
3478#endif /* KASAN */
3479
3480 /*
3481 * If blocked at a continuation, discard
3482 * the stack.
3483 */
3484 if (thread->continuation != NULL && thread->kernel_stack != 0) {
3485 stack_free(thread);
3486 }
3487
3488 if (thread->state & TH_IDLE) {
3489 was_idle = true;
3490 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3491 MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
3492 (uintptr_t)thread_tid(thread), 0, thread->state,
3493 sched_run_buckets[TH_BUCKET_RUN], 0);
3494 } else {
3495 int64_t consumed;
3496 int64_t remainder = 0;
3497
3498 if (processor->quantum_end > processor->last_dispatch) {
3499 remainder = processor->quantum_end -
3500 processor->last_dispatch;
3501 }
3502
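			/*
			 * CPU time charged for this stint on core: the quantum the
			 * thread was granted minus whatever portion of it was still
			 * unexpired at switch-out.
			 */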
3503 consumed = thread->quantum_remaining - remainder;
3504
3505 if ((thread->reason & AST_LEDGER) == 0) {
3506 /*
3507 * Bill CPU time to both the task and
3508 * the individual thread.
3509 */
3510 ledger_credit_thread(thread, ledger: thread->t_ledger,
3511 entry: task_ledgers.cpu_time, amount: consumed);
3512 ledger_credit_thread(thread, ledger: thread->t_threadledger,
3513 entry: thread_ledgers.cpu_time, amount: consumed);
3514 if (thread->t_bankledger) {
3515 ledger_credit_thread(thread, ledger: thread->t_bankledger,
3516 entry: bank_ledgers.cpu_time,
3517 amount: (consumed - thread->t_deduct_bank_ledger_time));
3518 }
3519 thread->t_deduct_bank_ledger_time = 0;
3520 if (consumed > 0) {
3521 /*
3522				 * consumed should never be negative, but traces occasionally
3523				 * show instances where it is.
3524 * <rdar://problem/57782596> thread_dispatch() thread CPU consumed calculation sometimes results in negative value
3525 */
3526 sched_update_pset_avg_execution_time(pset: current_processor()->processor_set, delta: consumed, curtime: processor->last_dispatch, sched_bucket: thread->th_sched_bucket);
3527 }
3528 }
3529
3530		/* For the thread that we just context switched away from, check
3531		 * whether its workqueue quantum has expired and set the AST if so.
3532 */
3533 if (thread_get_tag(thread) & THREAD_TAG_WORKQUEUE) {
3534 thread_evaluate_workqueue_quantum_expiry(thread);
3535 }
3536
3537 if (__improbable(thread->rwlock_count != 0)) {
3538 smr_mark_active_trackers_stalled(self: thread);
3539 }
3540
3541 /*
3542 * Pairs with task_restartable_ranges_synchronize
3543 */
3544 wake_lock(thread);
3545 thread_lock(thread);
3546
3547 /*
3548 * Same as ast_check(), in case we missed the IPI
3549 */
3550 thread_reset_pcs_ack_IPI(thread);
3551
3552 /*
3553 * Apply a priority floor if the thread holds a kernel resource
3554 * or explicitly requested it.
3555 * Do this before checking starting_pri to avoid overpenalizing
3556 * repeated rwlock blockers.
3557 */
3558 if (__improbable(thread->rwlock_count != 0)) {
3559 lck_rw_set_promotion_locked(thread);
3560 }
3561 if (__improbable(thread->priority_floor_count != 0)) {
3562 thread_floor_boost_set_promotion_locked(thread);
3563 }
3564
3565 boolean_t keep_quantum = processor->first_timeslice;
3566
3567 /*
3568 * Treat a thread which has dropped priority since it got on core
3569 * as having expired its quantum.
3570 */
3571 if (processor->starting_pri > thread->sched_pri) {
3572 keep_quantum = FALSE;
3573 }
3574
3575 /* Compute remainder of current quantum. */
3576 if (keep_quantum &&
3577 processor->quantum_end > processor->last_dispatch) {
3578 thread->quantum_remaining = (uint32_t)remainder;
3579 } else {
3580 thread->quantum_remaining = 0;
3581 }
3582
3583 if (thread->sched_mode == TH_MODE_REALTIME) {
3584 /*
3585 * Cancel the deadline if the thread has
3586 * consumed the entire quantum.
3587 */
3588 if (thread->quantum_remaining == 0) {
3589 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_CANCEL_RT_DEADLINE) | DBG_FUNC_NONE,
3590 (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
3591 thread->realtime.deadline = RT_DEADLINE_QUANTUM_EXPIRED;
3592 }
3593 } else {
3594#if defined(CONFIG_SCHED_TIMESHARE_CORE)
3595 /*
3596			 * For non-realtime threads, treat a tiny
3597			 * remaining quantum as an expired quantum,
3598			 * but carry what's left over into the next one.
3599 */
3600 if (thread->quantum_remaining < min_std_quantum) {
3601 thread->reason |= AST_QUANTUM;
3602 thread->quantum_remaining += SCHED(initial_quantum_size)(thread);
3603 }
3604#endif /* CONFIG_SCHED_TIMESHARE_CORE */
3605 }
3606
3607 /*
3608 * If we are doing a direct handoff then
3609 * take the remainder of the quantum.
3610 */
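		/*
		 * For example, if the old thread blocks in a handoff with 3 ms of
		 * its quantum left, the thread it hands off to runs on that
		 * remaining 3 ms rather than starting a fresh quantum, and the old
		 * thread is treated as having used its quantum up.
		 */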
3611 if ((thread->reason & (AST_HANDOFF | AST_QUANTUM)) == AST_HANDOFF) {
3612 self->quantum_remaining = thread->quantum_remaining;
3613 thread->reason |= AST_QUANTUM;
3614 thread->quantum_remaining = 0;
3615 } else {
3616#if defined(CONFIG_SCHED_MULTIQ)
3617 if (SCHED(sched_groups_enabled) &&
3618 thread->sched_group == self->sched_group) {
3619 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3620 MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUANTUM_HANDOFF),
3621 self->reason, (uintptr_t)thread_tid(thread),
3622 self->quantum_remaining, thread->quantum_remaining, 0);
3623
3624 self->quantum_remaining = thread->quantum_remaining;
3625 thread->quantum_remaining = 0;
3626 /* Don't set AST_QUANTUM here - old thread might still want to preempt someone else */
3627 }
3628#endif /* defined(CONFIG_SCHED_MULTIQ) */
3629 }
3630
3631 thread->computation_metered += (processor->last_dispatch - thread->computation_epoch);
3632
3633 if (!(thread->state & TH_WAIT)) {
3634 /*
3635 * Still runnable.
3636 */
3637 thread->last_made_runnable_time = thread->last_basepri_change_time = processor->last_dispatch;
3638
3639 machine_thread_going_off_core(old_thread: thread, FALSE, last_dispatch: processor->last_dispatch, TRUE);
3640
3641 ast_t reason = thread->reason;
3642 sched_options_t options = SCHED_NONE;
3643
3644 if (reason & AST_REBALANCE) {
3645 options |= SCHED_REBALANCE;
3646 if (reason & AST_QUANTUM) {
3647 /*
3648 * Having gone to the trouble of forcing this thread off a less preferred core,
3649 * we should force the preferable core to reschedule immediately to give this
3650 * thread a chance to run instead of just sitting on the run queue where
3651 * it may just be stolen back by the idle core we just forced it off.
3652 * But only do this at the end of a quantum to prevent cascading effects.
3653 */
3654 options |= SCHED_PREEMPT;
3655 }
3656 }
3657
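			/*
			 * Requeue policy: quantum expiration sends the thread to the
			 * tail of its run queue; an involuntary preemption puts it back
			 * at the head; anything else requeues at the tail and requests
			 * a preemption check.
			 */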
3658 if (reason & AST_QUANTUM) {
3659 options |= SCHED_TAILQ;
3660 } else if (reason & AST_PREEMPT) {
3661 options |= SCHED_HEADQ;
3662 } else {
3663 options |= (SCHED_PREEMPT | SCHED_TAILQ);
3664 }
3665
3666 thread_setrun(thread, options);
3667
3668 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3669 MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
3670 (uintptr_t)thread_tid(thread), thread->reason, thread->state,
3671 sched_run_buckets[TH_BUCKET_RUN], 0);
3672
3673 if (thread->wake_active) {
3674 thread->wake_active = FALSE;
3675 thread_unlock(thread);
3676
3677 thread_wakeup(&thread->wake_active);
3678 } else {
3679 thread_unlock(thread);
3680 }
3681
3682 wake_unlock(thread);
3683 } else {
3684 /*
3685 * Waiting.
3686 */
3687 boolean_t should_terminate = FALSE;
3688 uint32_t new_run_count;
3689 int thread_state = thread->state;
3690
3691 /* Only the first call to thread_dispatch
3692 * after explicit termination should add
3693 * the thread to the termination queue
3694 */
3695 if ((thread_state & (TH_TERMINATE | TH_TERMINATE2)) == TH_TERMINATE) {
3696 should_terminate = TRUE;
3697 thread_state |= TH_TERMINATE2;
3698 }
3699
3700 timer_stop(timer: &thread->runnable_timer, tstamp: processor->last_dispatch);
3701
3702 thread_state &= ~TH_RUN;
3703 thread->state = thread_state;
3704
3705 thread->last_made_runnable_time = thread->last_basepri_change_time = THREAD_NOT_RUNNABLE;
3706 thread->chosen_processor = PROCESSOR_NULL;
3707
3708 new_run_count = SCHED(run_count_decr)(thread);
3709
3710#if CONFIG_SCHED_AUTO_JOIN
3711 if ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0) {
3712 work_interval_auto_join_unwind(thread);
3713 }
3714#endif /* CONFIG_SCHED_AUTO_JOIN */
3715
3716#if CONFIG_SCHED_SFI
3717 if (thread->reason & AST_SFI) {
3718 thread->wait_sfi_begin_time = processor->last_dispatch;
3719 }
3720#endif
3721 machine_thread_going_off_core(old_thread: thread, thread_terminating: should_terminate, last_dispatch: processor->last_dispatch, FALSE);
3722
3723 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3724 MACHDBG_CODE(DBG_MACH_SCHED, MACH_DISPATCH) | DBG_FUNC_NONE,
3725 (uintptr_t)thread_tid(thread), thread->reason, thread_state,
3726 new_run_count, 0);
3727
3728 if (thread_state & TH_WAIT_REPORT) {
3729 (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
3730 }
3731
3732 if (thread->wake_active) {
3733 thread->wake_active = FALSE;
3734 thread_unlock(thread);
3735
3736 thread_wakeup(&thread->wake_active);
3737 } else {
3738 thread_unlock(thread);
3739 }
3740
3741 wake_unlock(thread);
3742
3743 if (should_terminate) {
3744 thread_terminate_enqueue(thread);
3745 }
3746 }
3747 }
3748 /*
3749 * The thread could have been added to the termination queue, so it's
3750 * unsafe to use after this point.
3751 */
3752 thread = THREAD_NULL;
3753 }
3754
3755 int urgency = THREAD_URGENCY_NONE;
3756 uint64_t latency = 0;
3757
3758 /* Update (new) current thread and reprogram running timers */
3759 thread_lock(self);
3760
3761 if (!(self->state & TH_IDLE)) {
3762 uint64_t arg1, arg2;
3763
3764#if CONFIG_SCHED_SFI
3765 ast_t new_ast;
3766
3767 new_ast = sfi_thread_needs_ast(thread: self, NULL);
3768
3769 if (new_ast != AST_NONE) {
3770 ast_on(reasons: new_ast);
3771 }
3772#endif
3773
3774 if (processor->last_dispatch < self->last_made_runnable_time) {
3775 panic("Non-monotonic time: dispatch at 0x%llx, runnable at 0x%llx",
3776 processor->last_dispatch, self->last_made_runnable_time);
3777 }
3778
3779 assert(self->last_made_runnable_time <= self->last_basepri_change_time);
3780
3781 latency = processor->last_dispatch - self->last_made_runnable_time;
3782 assert(latency >= self->same_pri_latency);
3783
3784 urgency = thread_get_urgency(thread: self, rt_period: &arg1, rt_deadline: &arg2);
3785
3786 thread_tell_urgency(urgency, rt_period: arg1, rt_deadline: arg2, sched_latency: latency, nthread: self);
3787
3788 /*
3789 * Start a new CPU limit interval if the previous one has
3790 * expired. This should happen before initializing a new
3791 * quantum.
3792 */
3793 if (cpulimit_affects_quantum &&
3794 thread_cpulimit_interval_has_expired(now: processor->last_dispatch)) {
3795 thread_cpulimit_restart(now: processor->last_dispatch);
3796 }
3797
3798 /*
3799 * Get a new quantum if none remaining.
3800 */
3801 if (self->quantum_remaining == 0) {
3802 thread_quantum_init(thread: self, now: processor->last_dispatch);
3803 }
3804
3805 /*
3806 * Set up quantum timer and timeslice.
3807 */
3808 processor->quantum_end = processor->last_dispatch +
3809 self->quantum_remaining;
3810
3811 running_timer_setup(processor, timer: RUNNING_TIMER_QUANTUM, param: self,
3812 deadline: processor->quantum_end, now: processor->last_dispatch);
3813 if (was_idle) {
3814 /*
3815 * kperf's running timer is active whenever the idle thread for a
3816 * CPU is not running.
3817 */
3818 kperf_running_setup(processor, now: processor->last_dispatch);
3819 }
3820 running_timers_activate(processor);
3821 processor->first_timeslice = TRUE;
3822 } else {
3823 running_timers_deactivate(processor);
3824 processor->first_timeslice = FALSE;
3825 thread_tell_urgency(urgency: THREAD_URGENCY_NONE, rt_period: 0, rt_deadline: 0, sched_latency: 0, nthread: self);
3826 }
3827
3828 assert(self->block_hint == kThreadWaitNone);
3829 self->computation_epoch = processor->last_dispatch;
3830 /*
3831 * This relies on the interrupt time being tallied up to the thread in the
3832 * exception handler epilogue, which is before AST context where preemption
3833 * is considered (and the scheduler is potentially invoked to
3834 * context switch, here).
3835 */
3836 self->computation_interrupt_epoch = recount_current_thread_interrupt_time_mach();
3837 self->reason = AST_NONE;
3838 processor->starting_pri = self->sched_pri;
3839
3840 thread_unlock(self);
3841
3842 machine_thread_going_on_core(new_thread: self, urgency, sched_latency: latency, same_pri_latency: self->same_pri_latency,
3843 dispatch_time: processor->last_dispatch);
3844
3845#if defined(CONFIG_SCHED_DEFERRED_AST)
3846 /*
3847 * TODO: Can we state that redispatching our old thread is also
3848 * uninteresting?
3849 */
3850 if ((os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) == 1) && !(self->state & TH_IDLE)) {
3851 pset_cancel_deferred_dispatch(pset: processor->processor_set, processor);
3852 }
3853#endif
3854}
3855
3856/*
3857 * thread_block_reason:
3858 *
3859 * Forces a reschedule, blocking the caller if a wait
3860 * has been asserted.
3861 *
3862 * If a continuation is specified, then thread_invoke will
3863 * attempt to discard the thread's kernel stack. When the
3864 * thread resumes, it will execute the continuation function
3865 * on a new kernel stack.
3866 */
3867wait_result_t
3868thread_block_reason(
3869 thread_continue_t continuation,
3870 void *parameter,
3871 ast_t reason)
3872{
3873 thread_t self = current_thread();
3874 processor_t processor;
3875 thread_t new_thread;
3876 spl_t s;
3877
3878 s = splsched();
3879
3880 processor = current_processor();
3881
3882 /* If we're explicitly yielding, force a subsequent quantum */
3883 if (reason & AST_YIELD) {
3884 processor->first_timeslice = FALSE;
3885 }
3886
3887	/* We're handling all scheduling ASTs */
3888 ast_off(AST_SCHEDULING);
3889
3890 clear_pending_nonurgent_preemption(processor);
3891
3892#if PROC_REF_DEBUG
3893 if ((continuation != NULL) && (get_threadtask(self) != kernel_task)) {
3894 uthread_assert_zero_proc_refcount(get_bsdthread_info(self));
3895 }
3896#endif
3897
3898#if CONFIG_EXCLAVES
3899 if (continuation != NULL) {
3900 assert3u(self->th_exclaves_state & TH_EXCLAVES_STATE_ANY, ==, 0);
3901 }
3902#endif /* CONFIG_EXCLAVES */
3903
3904 self->continuation = continuation;
3905 self->parameter = parameter;
3906
3907 if (self->state & ~(TH_RUN | TH_IDLE)) {
3908 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3909 MACHDBG_CODE(DBG_MACH_SCHED, MACH_BLOCK),
3910 reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0);
3911 }
3912
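	/*
	 * thread_invoke() can fail (e.g. if it cannot immediately get a kernel
	 * stack for the incoming thread), in which case we reselect and retry.
	 */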
3913 do {
3914 thread_lock(self);
3915 new_thread = thread_select(thread: self, processor, reason: &reason);
3916 thread_unlock(self);
3917 } while (!thread_invoke(self, thread: new_thread, reason));
3918
3919 splx(s);
3920
3921 return self->wait_result;
3922}
3923
3924/*
3925 * thread_block:
3926 *
3927 * Block the current thread if a wait has been asserted.
3928 */
3929wait_result_t
3930thread_block(
3931 thread_continue_t continuation)
3932{
3933 return thread_block_reason(continuation, NULL, AST_NONE);
3934}
3935
3936wait_result_t
3937thread_block_parameter(
3938 thread_continue_t continuation,
3939 void *parameter)
3940{
3941 return thread_block_reason(continuation, parameter, AST_NONE);
3942}
3943
3944/*
3945 * thread_run:
3946 *
3947 * Switch directly from the current thread to the
3948 * new thread, handing off our quantum if appropriate.
3949 *
3950 * New thread must be runnable, and not on a run queue.
3951 *
3952 * Called at splsched.
3953 */
3954int
3955thread_run(
3956 thread_t self,
3957 thread_continue_t continuation,
3958 void *parameter,
3959 thread_t new_thread)
3960{
3961 ast_t reason = AST_NONE;
3962
3963 if ((self->state & TH_IDLE) == 0) {
3964 reason = AST_HANDOFF;
3965 }
3966
3967 /*
3968 * If this thread hadn't been setrun'ed, it
3969 * might not have a chosen processor, so give it one
3970 */
3971 if (new_thread->chosen_processor == NULL) {
3972 new_thread->chosen_processor = current_processor();
3973 }
3974
3975 self->continuation = continuation;
3976 self->parameter = parameter;
3977
3978 while (!thread_invoke(self, thread: new_thread, reason)) {
3979 /* the handoff failed, so we have to fall back to the normal block path */
3980 processor_t processor = current_processor();
3981
3982 reason = AST_NONE;
3983
3984 thread_lock(self);
3985 new_thread = thread_select(thread: self, processor, reason: &reason);
3986 thread_unlock(self);
3987 }
3988
3989 return self->wait_result;
3990}
3991
3992/*
3993 * thread_continue:
3994 *
3995 * Called at splsched when a thread first receives
3996 * a new stack after a continuation.
3997 *
3998 * Called with THREAD_NULL as the old thread when
3999 * invoked by machine_load_context.
4000 */
4001void
4002thread_continue(
4003 thread_t thread)
4004{
4005 thread_t self = current_thread();
4006 thread_continue_t continuation;
4007 void *parameter;
4008
4009 DTRACE_SCHED(on__cpu);
4010
4011 continuation = self->continuation;
4012 parameter = self->parameter;
4013
4014 assert(continuation != NULL);
4015
4016#if KPERF
4017 kperf_on_cpu(thread: self, continuation, NULL);
4018#endif
4019
4020 thread_dispatch(thread, self);
4021
4022 self->continuation = self->parameter = NULL;
4023
4024#if SCHED_HYGIENE_DEBUG
4025 /* Reset interrupt-masked spin debugging timeout */
4026 ml_spin_debug_clear(self);
4027#endif
4028
4029 TLOG(1, "thread_continue: calling call_continuation\n");
4030
4031 boolean_t enable_interrupts = TRUE;
4032
4033	/* The bootstrap thread and the idle thread need to stay interrupts-disabled */
4034 if (thread == THREAD_NULL || (self->state & TH_IDLE)) {
4035 enable_interrupts = FALSE;
4036 }
4037
4038#if KASAN_TBI
4039 kasan_unpoison_stack(self->kernel_stack, kernel_stack_size);
4040#endif /* KASAN_TBI */
4041
4042
4043 call_continuation(continuation, parameter, wresult: self->wait_result, enable_interrupts);
4044 /*NOTREACHED*/
4045}
4046
4047void
4048thread_quantum_init(thread_t thread, uint64_t now)
4049{
4050 uint64_t new_quantum = 0;
4051
4052 switch (thread->sched_mode) {
4053 case TH_MODE_REALTIME:
4054 new_quantum = thread->realtime.computation;
4055 new_quantum = MIN(new_quantum, max_unsafe_rt_computation);
4056 break;
4057
4058 case TH_MODE_FIXED:
4059 new_quantum = SCHED(initial_quantum_size)(thread);
4060 new_quantum = MIN(new_quantum, max_unsafe_fixed_computation);
4061 break;
4062
4063 default:
4064 new_quantum = SCHED(initial_quantum_size)(thread);
4065 break;
4066 }
4067
4068 if (cpulimit_affects_quantum) {
4069 const uint64_t cpulimit_remaining = thread_cpulimit_remaining(now);
4070
4071 /*
4072 * If there's no remaining CPU time, the ledger system will
4073 * notice and put the thread to sleep.
4074 */
4075 if (cpulimit_remaining > 0) {
4076 new_quantum = MIN(new_quantum, cpulimit_remaining);
4077 }
4078 }
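	/*
	 * For example, a timeshare thread would normally get the standard
	 * quantum here, but if only 2 ms remain in its CPU-limit interval the
	 * quantum is clipped to 2 ms, so the thread comes back through quantum
	 * expiry no later than the limit boundary.
	 */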
4079
4080 assert3u(new_quantum, <, UINT32_MAX);
4081 assert3u(new_quantum, >, 0);
4082
4083 thread->quantum_remaining = (uint32_t)new_quantum;
4084}
4085
4086uint32_t
4087sched_timeshare_initial_quantum_size(thread_t thread)
4088{
4089 if ((thread != THREAD_NULL) && thread->th_sched_bucket == TH_BUCKET_SHARE_BG) {
4090 return bg_quantum;
4091 } else {
4092 return std_quantum;
4093 }
4094}
4095
4096/*
4097 * run_queue_init:
4098 *
4099 * Initialize a run queue before first use.
4100 */
4101void
4102run_queue_init(
4103 run_queue_t rq)
4104{
4105 rq->highq = NOPRI;
4106 for (u_int i = 0; i < BITMAP_LEN(NRQS); i++) {
4107 rq->bitmap[i] = 0;
4108 }
4109 rq->urgency = rq->count = 0;
4110 for (int i = 0; i < NRQS; i++) {
4111 circle_queue_init(&rq->queues[i]);
4112 }
4113}
4114
4115/*
4116 * run_queue_dequeue:
4117 *
4118 * Perform a dequeue operation on a run queue,
4119 * and return the resulting thread.
4120 *
4121 * The run queue must be locked (see thread_run_queue_remove()
4122 * for more info), and not empty.
4123 */
4124thread_t
4125run_queue_dequeue(
4126 run_queue_t rq,
4127 sched_options_t options)
4128{
4129 thread_t thread;
4130 circle_queue_t queue = &rq->queues[rq->highq];
4131
4132 if (options & SCHED_HEADQ) {
4133 thread = cqe_dequeue_head(queue, struct thread, runq_links);
4134 } else {
4135 thread = cqe_dequeue_tail(queue, struct thread, runq_links);
4136 }
4137
4138 assert(thread != THREAD_NULL);
4139 assert_thread_magic(thread);
4140
4141 thread_clear_runq(thread);
4142 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
4143 rq->count--;
4144 if (SCHED(priority_is_urgent)(rq->highq)) {
4145 rq->urgency--; assert(rq->urgency >= 0);
4146 }
4147 if (circle_queue_empty(cq: queue)) {
4148 bitmap_clear(map: rq->bitmap, n: rq->highq);
4149 rq->highq = bitmap_first(map: rq->bitmap, NRQS);
4150 }
4151
4152 return thread;
4153}
4154
4155/*
4156 * run_queue_enqueue:
4157 *
4158 *	Perform an enqueue operation on a run queue.
4159 *
4160 * The run queue must be locked (see thread_run_queue_remove()
4161 * for more info).
4162 */
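/*
 * Returns TRUE if the enqueued thread became the new highest-priority
 * entry on the run queue, a hint that the caller may want to check for
 * preemption.
 */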
4163boolean_t
4164run_queue_enqueue(
4165 run_queue_t rq,
4166 thread_t thread,
4167 sched_options_t options)
4168{
4169 circle_queue_t queue = &rq->queues[thread->sched_pri];
4170 boolean_t result = FALSE;
4171
4172 assert_thread_magic(thread);
4173
4174 if (circle_queue_empty(cq: queue)) {
4175 circle_enqueue_tail(cq: queue, elt: &thread->runq_links);
4176
4177 rq_bitmap_set(map: rq->bitmap, n: thread->sched_pri);
4178 if (thread->sched_pri > rq->highq) {
4179 rq->highq = thread->sched_pri;
4180 result = TRUE;
4181 }
4182 } else {
4183 if (options & SCHED_TAILQ) {
4184 circle_enqueue_tail(cq: queue, elt: &thread->runq_links);
4185 } else {
4186 circle_enqueue_head(cq: queue, elt: &thread->runq_links);
4187 }
4188 }
4189 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
4190 rq->urgency++;
4191 }
4192 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
4193 rq->count++;
4194
4195 return result;
4196}
4197
4198/*
4199 * run_queue_remove:
4200 *
4201 * Remove a specific thread from a runqueue.
4202 *
4203 * The run queue must be locked.
4204 */
4205void
4206run_queue_remove(
4207 run_queue_t rq,
4208 thread_t thread)
4209{
4210 circle_queue_t queue = &rq->queues[thread->sched_pri];
4211
4212 thread_assert_runq_nonnull(thread);
4213 assert_thread_magic(thread);
4214
4215 circle_dequeue(cq: queue, elt: &thread->runq_links);
4216 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
4217 rq->count--;
4218 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
4219 rq->urgency--; assert(rq->urgency >= 0);
4220 }
4221
4222 if (circle_queue_empty(cq: queue)) {
4223 /* update run queue status */
4224 bitmap_clear(map: rq->bitmap, n: thread->sched_pri);
4225 rq->highq = bitmap_first(map: rq->bitmap, NRQS);
4226 }
4227
4228 thread_clear_runq(thread);
4229}
4230
4231/*
4232 * run_queue_peek
4233 *
4234 * Peek at the runq and return the highest
4235 * priority thread from the runq.
4236 *
4237 * The run queue must be locked.
4238 */
4239thread_t
4240run_queue_peek(
4241 run_queue_t rq)
4242{
4243 if (rq->count > 0) {
4244 circle_queue_t queue = &rq->queues[rq->highq];
4245 thread_t thread = cqe_queue_first(queue, struct thread, runq_links);
4246 assert_thread_magic(thread);
4247 return thread;
4248 } else {
4249 return THREAD_NULL;
4250 }
4251}
4252
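/*
 * rt_runq_enqueue:
 *
 * Insert a realtime thread into the deadline-ordered queue for its
 * priority band. Returns true if the thread landed at the head of that
 * band (earliest deadline), i.e. the caller should consider preempting.
 */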
4253static bool
4254rt_runq_enqueue(rt_queue_t rt_run_queue, thread_t thread, processor_t processor)
4255{
4256 int pri = thread->sched_pri;
4257 assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
4258 int i = pri - BASEPRI_RTQUEUES;
4259 rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
4260 bitmap_t *map = rt_run_queue->bitmap;
4261
4262 bitmap_set(map, n: i);
4263
4264 queue_t queue = &rt_runq->pri_queue;
4265 uint64_t deadline = thread->realtime.deadline;
4266 bool preempt = false;
4267 bool earliest = false;
4268
4269 if (queue_empty(queue)) {
4270 enqueue_tail(que: queue, elt: &thread->runq_links);
4271 preempt = true;
4272 earliest = true;
4273 rt_runq->pri_earliest_deadline = deadline;
4274 rt_runq->pri_constraint = thread->realtime.constraint;
4275 } else {
4276 /* Insert into rt_runq in thread deadline order */
4277 queue_entry_t iter;
4278 qe_foreach(iter, queue) {
4279 thread_t iter_thread = qe_element(iter, struct thread, runq_links);
4280 assert_thread_magic(iter_thread);
4281
4282 if (deadline < iter_thread->realtime.deadline) {
4283 if (iter == queue_first(queue)) {
4284 preempt = true;
4285 earliest = true;
4286 rt_runq->pri_earliest_deadline = deadline;
4287 rt_runq->pri_constraint = thread->realtime.constraint;
4288 }
4289 insque(entry: &thread->runq_links, queue_prev(iter));
4290 break;
4291 } else if (iter == queue_last(queue)) {
4292 enqueue_tail(que: queue, elt: &thread->runq_links);
4293 break;
4294 }
4295 }
4296 }
4297 if (earliest && (deadline < os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed))) {
4298 os_atomic_store_wide(&rt_run_queue->earliest_deadline, deadline, relaxed);
4299 os_atomic_store(&rt_run_queue->constraint, thread->realtime.constraint, relaxed);
4300 os_atomic_store(&rt_run_queue->ed_index, pri - BASEPRI_RTQUEUES, relaxed);
4301 }
4302
4303 SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
4304 rt_runq->pri_count++;
4305 os_atomic_inc(&rt_run_queue->count, relaxed);
4306
4307 thread_set_runq_locked(thread, new_runq: processor);
4308
4309 CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);
4310
4311 return preempt;
4312}
4313
4314static thread_t
4315rt_runq_dequeue(rt_queue_t rt_run_queue)
4316{
4317 bitmap_t *map = rt_run_queue->bitmap;
4318 int i = bitmap_first(map, NRTQS);
4319 assert((i >= 0) && (i < NRTQS));
4320
4321 rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
4322
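	/*
	 * Unless strict priority is enforced, prefer the thread with the
	 * earliest deadline over the highest-priority one, as long as running
	 * both computations back to back (plus the deadline epsilon) would
	 * still fit within the higher-priority thread's constraint.
	 */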
4323 if (!sched_rt_runq_strict_priority) {
4324 int ed_index = os_atomic_load(&rt_run_queue->ed_index, relaxed);
4325 if (ed_index != i) {
4326 assert((ed_index >= 0) && (ed_index < NRTQS));
4327 rt_queue_pri_t *ed_runq = &rt_run_queue->rt_queue_pri[ed_index];
4328
4329 thread_t ed_thread = qe_queue_first(&ed_runq->pri_queue, struct thread, runq_links);
4330 thread_t hi_thread = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
4331
4332 if (ed_thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon < hi_thread->realtime.constraint) {
4333 /* choose the earliest deadline thread */
4334 rt_runq = ed_runq;
4335 i = ed_index;
4336 }
4337 }
4338 }
4339
4340 assert(rt_runq->pri_count > 0);
4341 uint64_t earliest_deadline = RT_DEADLINE_NONE;
4342 uint32_t constraint = RT_CONSTRAINT_NONE;
4343 int ed_index = NOPRI;
4344 thread_t new_thread = qe_dequeue_head(&rt_runq->pri_queue, struct thread, runq_links);
4345 SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
4346 if (--rt_runq->pri_count > 0) {
4347 thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
4348 assert(next_rt != THREAD_NULL);
4349 earliest_deadline = next_rt->realtime.deadline;
4350 constraint = next_rt->realtime.constraint;
4351 ed_index = i;
4352 } else {
4353 bitmap_clear(map, n: i);
4354 }
4355 rt_runq->pri_earliest_deadline = earliest_deadline;
4356 rt_runq->pri_constraint = constraint;
4357
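	/*
	 * This priority band changed, so recompute the queue-wide earliest
	 * deadline, its constraint, and the index of the band holding it.
	 */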
4358 for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, prev: i)) {
4359 rt_runq = &rt_run_queue->rt_queue_pri[i];
4360 if (rt_runq->pri_earliest_deadline < earliest_deadline) {
4361 earliest_deadline = rt_runq->pri_earliest_deadline;
4362 constraint = rt_runq->pri_constraint;
4363 ed_index = i;
4364 }
4365 }
4366 os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
4367 os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
4368 os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
4369 os_atomic_dec(&rt_run_queue->count, relaxed);
4370
4371 thread_clear_runq(thread: new_thread);
4372
4373 CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);
4374
4375 return new_thread;
4376}
4377
4378static thread_t
4379rt_runq_first(rt_queue_t rt_run_queue)
4380{
4381 bitmap_t *map = rt_run_queue->bitmap;
4382 int i = bitmap_first(map, NRTQS);
4383 if (i < 0) {
4384 return THREAD_NULL;
4385 }
4386 rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
4387 thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
4388
4389 return next_rt;
4390}
4391
4392static void
4393rt_runq_remove(rt_queue_t rt_run_queue, thread_t thread)
4394{
4395 CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread);
4396
4397 int pri = thread->sched_pri;
4398 assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI));
4399 int i = pri - BASEPRI_RTQUEUES;
4400 rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i];
4401 bitmap_t *map = rt_run_queue->bitmap;
4402
4403 assert(rt_runq->pri_count > 0);
4404 uint64_t earliest_deadline = RT_DEADLINE_NONE;
4405 uint32_t constraint = RT_CONSTRAINT_NONE;
4406 int ed_index = NOPRI;
4407 remqueue(elt: &thread->runq_links);
4408 SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed));
4409 if (--rt_runq->pri_count > 0) {
4410 thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links);
4411 earliest_deadline = next_rt->realtime.deadline;
4412 constraint = next_rt->realtime.constraint;
4413 ed_index = i;
4414 } else {
4415 bitmap_clear(map, n: i);
4416 }
4417 rt_runq->pri_earliest_deadline = earliest_deadline;
4418 rt_runq->pri_constraint = constraint;
4419
4420 for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, prev: i)) {
4421 rt_runq = &rt_run_queue->rt_queue_pri[i];
4422 if (rt_runq->pri_earliest_deadline < earliest_deadline) {
4423 earliest_deadline = rt_runq->pri_earliest_deadline;
4424 constraint = rt_runq->pri_constraint;
4425 ed_index = i;
4426 }
4427 }
4428 os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed);
4429 os_atomic_store(&rt_run_queue->constraint, constraint, relaxed);
4430 os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed);
4431 os_atomic_dec(&rt_run_queue->count, relaxed);
4432
4433 thread_clear_runq_locked(thread);
4434
4435 CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL);
4436}
4437
4438rt_queue_t
4439sched_rtlocal_runq(processor_set_t pset)
4440{
4441 return &pset->rt_runq;
4442}
4443
4444void
4445sched_rtlocal_init(processor_set_t pset)
4446{
4447 pset_rt_init(pset);
4448}
4449
4450void
4451sched_rtlocal_queue_shutdown(processor_t processor)
4452{
4453 processor_set_t pset = processor->processor_set;
4454 thread_t thread;
4455 queue_head_t tqueue;
4456
4457 pset_lock(pset);
4458
4459 /* We only need to migrate threads if this is the last active or last recommended processor in the pset */
4460 if (bit_count(x: pset_available_cpumap(pset)) > 0) {
4461 pset_unlock(pset);
4462 return;
4463 }
4464
4465 queue_init(&tqueue);
4466
4467 while (rt_runq_count(pset) > 0) {
4468 thread = rt_runq_dequeue(rt_run_queue: &pset->rt_runq);
4469 enqueue_tail(que: &tqueue, elt: &thread->runq_links);
4470 }
4471 sched_update_pset_load_average(pset, curtime: 0);
4472 pset_update_rt_stealable_state(pset);
4473 pset_unlock(pset);
4474
4475 qe_foreach_element_safe(thread, &tqueue, runq_links) {
4476 remqueue(elt: &thread->runq_links);
4477
4478 thread_lock(thread);
4479
4480 thread_setrun(thread, options: SCHED_TAILQ);
4481
4482 thread_unlock(thread);
4483 }
4484}
4485
4486/* Assumes RT lock is not held, and acquires splsched/rt_lock itself */
4487void
4488sched_rtlocal_runq_scan(sched_update_scan_context_t scan_context)
4489{
4490 thread_t thread;
4491
4492 pset_node_t node = &pset_node0;
4493 processor_set_t pset = node->psets;
4494
4495 spl_t s = splsched();
4496 do {
4497 while (pset != NULL) {
4498 pset_lock(pset);
4499
4500 bitmap_t *map = pset->rt_runq.bitmap;
4501 for (int i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, prev: i)) {
4502 rt_queue_pri_t *rt_runq = &pset->rt_runq.rt_queue_pri[i];
4503
4504 qe_foreach_element_safe(thread, &rt_runq->pri_queue, runq_links) {
4505 if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
4506 scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
4507 }
4508 }
4509 }
4510
4511 pset_unlock(pset);
4512
4513 pset = pset->pset_list;
4514 }
4515 } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
4516 splx(s);
4517}
4518
4519int64_t
4520sched_rtlocal_runq_count_sum(void)
4521{
4522 pset_node_t node = &pset_node0;
4523 processor_set_t pset = node->psets;
4524 int64_t count = 0;
4525
4526 do {
4527 while (pset != NULL) {
4528 count += pset->rt_runq.runq_stats.count_sum;
4529
4530 pset = pset->pset_list;
4531 }
4532 } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
4533
4534 return count;
4535}
4536
4537/*
4538 * Called with stealing_pset locked and
4539 * returns with stealing_pset locked,
4540 * but the lock will have been dropped and
4541 * retaken along the way if a thread is returned.
4542 */
4543thread_t
4544sched_rtlocal_steal_thread(processor_set_t stealing_pset, uint64_t earliest_deadline)
4545{
4546 if (!sched_allow_rt_steal) {
4547 return THREAD_NULL;
4548 }
4549 pset_map_t pset_map = stealing_pset->node->pset_map;
4550
4551 bit_clear(pset_map, stealing_pset->pset_id);
4552
4553 processor_set_t pset = stealing_pset;
4554
4555 processor_set_t target_pset;
4556 uint64_t target_deadline;
4557
4558retry:
4559 target_pset = NULL;
4560 target_deadline = earliest_deadline - rt_deadline_epsilon;
4561
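	/*
	 * Look for a pset whose earliest stealable deadline beats ours by more
	 * than rt_deadline_epsilon; otherwise stealing isn't worth it.
	 */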
4562 for (int pset_id = lsb_first(bitmap: pset_map); pset_id >= 0; pset_id = lsb_next(bitmap: pset_map, previous_bit: pset_id)) {
4563 processor_set_t nset = pset_array[pset_id];
4564
4565 /*
4566 * During startup, while pset_array[] and node->pset_map are still being initialized,
4567 * the update to pset_map may become visible to this cpu before the update to pset_array[].
4568 * It would be good to avoid inserting a memory barrier here that is only needed during startup,
4569 * so just check nset is not NULL instead.
4570 */
4571 if (nset && (nset->stealable_rt_threads_earliest_deadline < target_deadline)) {
4572 target_deadline = nset->stealable_rt_threads_earliest_deadline;
4573 target_pset = nset;
4574 }
4575 }
4576
4577 if (target_pset != NULL) {
4578 pset = change_locked_pset(current_pset: pset, new_pset: target_pset);
4579 if (pset->stealable_rt_threads_earliest_deadline <= target_deadline) {
4580 thread_t new_thread = rt_runq_dequeue(rt_run_queue: &pset->rt_runq);
4581 pset_update_rt_stealable_state(pset);
4582 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_STEAL) | DBG_FUNC_NONE, (uintptr_t)thread_tid(new_thread), pset->pset_id, pset->cpu_set_low, 0);
4583
4584 pset = change_locked_pset(current_pset: pset, new_pset: stealing_pset);
4585 return new_thread;
4586 }
4587 pset = change_locked_pset(current_pset: pset, new_pset: stealing_pset);
4588 earliest_deadline = rt_runq_earliest_deadline(pset);
4589 goto retry;
4590 }
4591
4592 pset = change_locked_pset(current_pset: pset, new_pset: stealing_pset);
4593 return THREAD_NULL;
4594}
4595
4596/*
4597 * pset is locked
4598 */
4599thread_t
4600sched_rt_choose_thread(processor_set_t pset)
4601{
4602 processor_t processor = current_processor();
4603
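	/*
	 * Try to pull a thread for this CPU (clearing any spill request aimed
	 * at it along the way), retrying while a new spill request arrives;
	 * fall back to this pset's own RT run queue if stealing finds nothing.
	 */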
4604 if (SCHED(steal_thread_enabled)(pset)) {
4605 do {
4606 bool spill_pending = bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
4607 if (spill_pending) {
4608 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 2);
4609 }
4610 thread_t new_thread = SCHED(rt_steal_thread)(pset, rt_runq_earliest_deadline(pset));
4611 if (new_thread != THREAD_NULL) {
4612 if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
4613 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 3);
4614 }
4615 return new_thread;
4616 }
4617 } while (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id));
4618 }
4619
4620 if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
4621 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 4);
4622 }
4623
4624 if (rt_runq_count(pset) > 0) {
4625 thread_t new_thread = rt_runq_dequeue(SCHED(rt_runq)(pset));
4626 assert(new_thread != THREAD_NULL);
4627 pset_update_rt_stealable_state(pset);
4628 return new_thread;
4629 }
4630
4631 return THREAD_NULL;
4632}
4633
4634/*
4635 * realtime_queue_insert:
4636 *
4637 * Enqueue a thread for realtime execution.
4638 */
4639static bool
4640realtime_queue_insert(processor_t processor, processor_set_t pset, thread_t thread)
4641{
4642 pset_assert_locked(pset);
4643
4644 bool preempt = rt_runq_enqueue(SCHED(rt_runq)(pset), thread, processor);
4645 pset_update_rt_stealable_state(pset);
4646
4647 return preempt;
4648}
4649
4650/*
4651 * realtime_setrun:
4652 *
4653 * Dispatch a thread for realtime execution.
4654 *
4655 * Thread must be locked. Associated pset must
4656 * be locked, and is returned unlocked.
4657 */
4658static void
4659realtime_setrun(
4660 processor_t chosen_processor,
4661 thread_t thread)
4662{
4663 processor_set_t pset = chosen_processor->processor_set;
4664 pset_assert_locked(pset);
4665 bool pset_is_locked = true;
4666
4667 int n_backup = 0;
4668
4669 if (thread->realtime.constraint <= rt_constraint_threshold) {
4670 n_backup = sched_rt_n_backup_processors;
4671 }
4672 assert((n_backup >= 0) && (n_backup <= SCHED_MAX_BACKUP_PROCESSORS));
4673
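	/*
	 * CPUs that already have urgent ASTs pending over and above the number
	 * of queued RT threads are treated as backups already in flight, so
	 * request correspondingly fewer new ones.
	 */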
4674 int existing_backups = bit_count(x: pset->pending_AST_URGENT_cpu_mask) - rt_runq_count(pset);
4675 if (existing_backups > 0) {
4676 n_backup = n_backup - existing_backups;
4677 if (n_backup < 0) {
4678 n_backup = 0;
4679 }
4680 }
4681
4682 sched_ipi_type_t ipi_type[SCHED_MAX_BACKUP_PROCESSORS + 1] = {};
4683 processor_t ipi_processor[SCHED_MAX_BACKUP_PROCESSORS + 1] = {};
4684
4685 thread->chosen_processor = chosen_processor;
4686
4687 /* <rdar://problem/15102234> */
4688 assert(thread->bound_processor == PROCESSOR_NULL);
4689
4690 realtime_queue_insert(processor: chosen_processor, pset, thread);
4691
4692 processor_t processor = chosen_processor;
4693
4694 int count = 0;
4695 for (int i = 0; i <= n_backup; i++) {
4696 if (i == 0) {
4697 ipi_type[i] = SCHED_IPI_NONE;
4698 ipi_processor[i] = processor;
4699 count++;
4700
4701 ast_t preempt = AST_NONE;
4702 if (thread->sched_pri > processor->current_pri) {
4703 preempt = (AST_PREEMPT | AST_URGENT);
4704 } else if (thread->sched_pri == processor->current_pri) {
4705 if (deadline_add(d: thread->realtime.deadline, e: rt_deadline_epsilon) < processor->deadline) {
4706 preempt = (AST_PREEMPT | AST_URGENT);
4707 }
4708 }
4709
4710 if (preempt != AST_NONE) {
4711 if (processor->state == PROCESSOR_IDLE) {
4712 if (processor == current_processor()) {
4713 pset_update_processor_state(pset, processor, new_state: PROCESSOR_DISPATCHING);
4714 ast_on(reasons: preempt);
4715
4716 if ((preempt & AST_URGENT) == AST_URGENT) {
4717 if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4718 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4719 processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 1);
4720 }
4721 }
4722
4723 if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
4724 bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
4725 }
4726 } else {
4727 ipi_type[i] = sched_ipi_action(dst: processor, thread, event: SCHED_IPI_EVENT_RT_PREEMPT);
4728 }
4729 } else if (processor->state == PROCESSOR_DISPATCHING) {
4730 if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4731 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4732 processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 2);
4733 }
4734 } else {
4735 if (processor == current_processor()) {
4736 ast_on(reasons: preempt);
4737
4738 if ((preempt & AST_URGENT) == AST_URGENT) {
4739 if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4740 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4741 processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 3);
4742 }
4743 }
4744
4745 if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
4746 bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
4747 }
4748 } else {
4749 ipi_type[i] = sched_ipi_action(dst: processor, thread, event: SCHED_IPI_EVENT_RT_PREEMPT);
4750 }
4751 }
4752 } else {
4753				/* Selected processor was too busy; just keep the thread enqueued and let other processors drain it naturally. */
4754 }
4755 } else {
4756 if (!pset_is_locked) {
4757 pset_lock(pset);
4758 }
4759 ipi_type[i] = SCHED_IPI_NONE;
4760 ipi_processor[i] = PROCESSOR_NULL;
4761 pset_is_locked = !choose_next_rt_processor_for_IPI(starting_pset: pset, chosen_processor, false, result_processor: &ipi_processor[i], result_ipi_type: &ipi_type[i]);
4762 if (ipi_processor[i] == PROCESSOR_NULL) {
4763 break;
4764 }
4765 count++;
4766
4767 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_NEXT_PROCESSOR) | DBG_FUNC_NONE,
4768 ipi_processor[i]->cpu_id, ipi_processor[i]->state, backup, 1);
4769#if defined(__x86_64__)
4770#define p_is_good(p) (((p)->processor_primary == (p)) && ((sched_avoid_cpu0 != 1) || ((p)->cpu_id != 0)))
4771 if (n_backup == SCHED_DEFAULT_BACKUP_PROCESSORS_SMT) {
4772 processor_t p0 = ipi_processor[0];
4773 processor_t p1 = ipi_processor[1];
4774 assert(p0 && p1);
4775 if (p_is_good(p0) && p_is_good(p1)) {
4776 /*
4777 * Both the chosen processor and the first backup are non-cpu0 primaries,
4778 * so there is no need for a 2nd backup processor.
4779 */
4780 break;
4781 }
4782 }
4783#endif
4784 }
4785 }
4786
4787 if (pset_is_locked) {
4788 pset_unlock(pset);
4789 }
4790
4791 assert((count > 0) && (count <= (n_backup + 1)));
4792 for (int i = 0; i < count; i++) {
4793 assert(ipi_processor[i] != PROCESSOR_NULL);
4794 sched_ipi_perform(dst: ipi_processor[i], ipi: ipi_type[i]);
4795 }
4796}
4797
4798
4799sched_ipi_type_t
4800sched_ipi_deferred_policy(processor_set_t pset, processor_t dst,
4801 thread_t thread, __unused sched_ipi_event_t event)
4802{
4803#if defined(CONFIG_SCHED_DEFERRED_AST)
4804#if CONFIG_THREAD_GROUPS
4805 if (thread) {
4806 struct thread_group *tg = thread_group_get(t: thread);
4807 if (thread_group_uses_immediate_ipi(tg)) {
4808 return SCHED_IPI_IMMEDIATE;
4809 }
4810 }
4811#endif /* CONFIG_THREAD_GROUPS */
4812 if (!bit_test(pset->pending_deferred_AST_cpu_mask, dst->cpu_id)) {
4813 return SCHED_IPI_DEFERRED;
4814 }
4815#else /* CONFIG_SCHED_DEFERRED_AST */
4816 (void) thread;
4817 panic("Request for deferred IPI on an unsupported platform; pset: %p CPU: %d", pset, dst->cpu_id);
4818#endif /* CONFIG_SCHED_DEFERRED_AST */
4819 return SCHED_IPI_NONE;
4820}
4821
4822sched_ipi_type_t
4823sched_ipi_action(processor_t dst, thread_t thread, sched_ipi_event_t event)
4824{
4825 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
4826 assert(dst != NULL);
4827
4828 processor_set_t pset = dst->processor_set;
4829 if (current_processor() == dst) {
4830 return SCHED_IPI_NONE;
4831 }
4832
4833 bool dst_idle = (dst->state == PROCESSOR_IDLE);
4834 if (dst_idle) {
4835 pset_update_processor_state(pset, processor: dst, new_state: PROCESSOR_DISPATCHING);
4836 }
4837
4838 ipi_type = SCHED(ipi_policy)(dst, thread, dst_idle, event);
4839 switch (ipi_type) {
4840 case SCHED_IPI_NONE:
4841 return SCHED_IPI_NONE;
4842#if defined(CONFIG_SCHED_DEFERRED_AST)
4843 case SCHED_IPI_DEFERRED:
4844 bit_set(pset->pending_deferred_AST_cpu_mask, dst->cpu_id);
4845 break;
4846#endif /* CONFIG_SCHED_DEFERRED_AST */
4847 default:
4848 if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, dst->cpu_id)) {
4849 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4850 dst->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 4);
4851 }
4852 bit_set(pset->pending_AST_PREEMPT_cpu_mask, dst->cpu_id);
4853 break;
4854 }
4855 return ipi_type;
4856}
4857
4858sched_ipi_type_t
4859sched_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
4860{
4861 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
4862 boolean_t deferred_ipi_supported = false;
4863 processor_set_t pset = dst->processor_set;
4864
4865#if defined(CONFIG_SCHED_DEFERRED_AST)
4866 deferred_ipi_supported = true;
4867#endif /* CONFIG_SCHED_DEFERRED_AST */
4868
4869 switch (event) {
4870 case SCHED_IPI_EVENT_SPILL:
4871 case SCHED_IPI_EVENT_SMT_REBAL:
4872 case SCHED_IPI_EVENT_REBALANCE:
4873 case SCHED_IPI_EVENT_BOUND_THR:
4874 case SCHED_IPI_EVENT_RT_PREEMPT:
4875 /*
4876	 * The RT preempt, spill, SMT rebalance, rebalance, and bound thread
4877	 * scenarios always use immediate IPIs.
4878 */
4879 ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4880 break;
4881 case SCHED_IPI_EVENT_PREEMPT:
4882 /* In the preemption case, use immediate IPIs for RT threads */
4883 if (thread && (thread->sched_pri >= BASEPRI_RTQUEUES)) {
4884 ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4885 break;
4886 }
4887
4888 /*
4889	 * For non-RT thread preemption:
4890	 * if the core is active, use an immediate IPI;
4891	 * if the core is idle, use a deferred IPI if supported, otherwise an immediate IPI.
4892 */
4893 if (deferred_ipi_supported && dst_idle) {
4894 return sched_ipi_deferred_policy(pset, dst, thread, event);
4895 }
4896 ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4897 break;
4898 default:
4899 panic("Unrecognized scheduler IPI event type %d", event);
4900 }
4901 assert(ipi_type != SCHED_IPI_NONE);
4902 return ipi_type;
4903}
4904
4905void
4906sched_ipi_perform(processor_t dst, sched_ipi_type_t ipi)
4907{
4908 switch (ipi) {
4909 case SCHED_IPI_NONE:
4910 break;
4911 case SCHED_IPI_IDLE:
4912 machine_signal_idle(processor: dst);
4913 break;
4914 case SCHED_IPI_IMMEDIATE:
4915 cause_ast_check(processor: dst);
4916 break;
4917 case SCHED_IPI_DEFERRED:
4918 machine_signal_idle_deferred(processor: dst);
4919 break;
4920 default:
4921 panic("Unrecognized scheduler IPI type: %d", ipi);
4922 }
4923}
4924
4925#if defined(CONFIG_SCHED_TIMESHARE_CORE)
4926
4927boolean_t
4928priority_is_urgent(int priority)
4929{
4930 return bitmap_test(map: sched_preempt_pri, n: priority) ? TRUE : FALSE;
4931}
4932
4933#endif /* CONFIG_SCHED_TIMESHARE_CORE */
4934
4935/*
4936 * processor_setrun:
4937 *
4938 * Dispatch a thread for execution on a
4939 * processor.
4940 *
4941 * Thread must be locked. Associated pset must
4942 * be locked, and is returned unlocked.
4943 */
4944static void
4945processor_setrun(
4946 processor_t processor,
4947 thread_t thread,
4948 integer_t options)
4949{
4950 processor_set_t pset = processor->processor_set;
4951 pset_assert_locked(pset);
4952 ast_t preempt = AST_NONE;
4953 enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing;
4954
4955 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
4956
4957 thread->chosen_processor = processor;
4958
4959 /*
4960 * Set preemption mode.
4961 */
4962#if defined(CONFIG_SCHED_DEFERRED_AST)
4963 /* TODO: Do we need to care about urgency (see rdar://problem/20136239)? */
4964#endif
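	/*
	 * An urgent-priority thread that beats the running thread, or any
	 * enqueue onto a processor whose current thread is marked
	 * eager-preempt, asks for an urgent preemption.  A timeshare thread
	 * whose priority has decayed below its base priority only requests
	 * (non-urgent) preemption when its base priority is urgent, its
	 * current priority beats the running thread, and the caller passed
	 * SCHED_PREEMPT; everything else preempts only if the caller asked.
	 */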
4965 if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri) {
4966 preempt = (AST_PREEMPT | AST_URGENT);
4967 } else if (processor->current_is_eagerpreempt) {
4968 preempt = (AST_PREEMPT | AST_URGENT);
4969 } else if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->sched_pri < thread->base_pri)) {
4970 if (SCHED(priority_is_urgent)(thread->base_pri) && thread->sched_pri > processor->current_pri) {
4971 preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
4972 } else {
4973 preempt = AST_NONE;
4974 }
4975 } else {
4976 preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
4977 }
4978
4979 if ((options & (SCHED_PREEMPT | SCHED_REBALANCE)) == (SCHED_PREEMPT | SCHED_REBALANCE)) {
4980 /*
4981 * Having gone to the trouble of forcing this thread off a less preferred core,
4982 * we should force the preferable core to reschedule immediately to give this
4983 * thread a chance to run instead of just sitting on the run queue where
4984 * it may just be stolen back by the idle core we just forced it off.
4985 */
4986 preempt |= AST_PREEMPT;
4987 }
4988
4989 SCHED(processor_enqueue)(processor, thread, options);
4990 sched_update_pset_load_average(pset, curtime: 0);
4991
4992 if (preempt != AST_NONE) {
4993 if (processor->state == PROCESSOR_IDLE) {
4994 ipi_action = eExitIdle;
4995 } else if (processor->state == PROCESSOR_DISPATCHING) {
4996 if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
4997 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
4998 processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 5);
4999 }
5000 } else if ((processor->state == PROCESSOR_RUNNING ||
5001 processor->state == PROCESSOR_SHUTDOWN) &&
5002 (thread->sched_pri >= processor->current_pri)) {
5003 ipi_action = eInterruptRunning;
5004 }
5005 } else {
5006 /*
5007		 * The new thread is not important enough to preempt what is running, but
5008		 * special processor states may need special handling.
5009 */
5010 if (processor->state == PROCESSOR_SHUTDOWN &&
5011 thread->sched_pri >= processor->current_pri) {
5012 ipi_action = eInterruptRunning;
5013 } else if (processor->state == PROCESSOR_IDLE) {
5014 ipi_action = eExitIdle;
5015 } else if (processor->state == PROCESSOR_DISPATCHING) {
5016 if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
5017 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
5018 processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 6);
5019 }
5020 }
5021 }
5022
5023 if (ipi_action != eDoNothing) {
5024 if (processor == current_processor()) {
5025 if (ipi_action == eExitIdle) {
5026 pset_update_processor_state(pset, processor, new_state: PROCESSOR_DISPATCHING);
5027 }
5028 if ((preempt = csw_check_locked(thread: processor->active_thread, processor, pset, AST_NONE)) != AST_NONE) {
5029 ast_on(reasons: preempt);
5030 }
5031
5032 if ((preempt & AST_URGENT) == AST_URGENT) {
5033 if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
5034 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
5035 processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, (uintptr_t)thread_tid(thread), 7);
5036 }
5037 } else {
5038 if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
5039 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 7);
5040 }
5041 }
5042
5043 if ((preempt & AST_PREEMPT) == AST_PREEMPT) {
5044 bit_set(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
5045 } else {
5046 bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
5047 }
5048 } else {
5049 sched_ipi_event_t event = (options & SCHED_REBALANCE) ? SCHED_IPI_EVENT_REBALANCE : SCHED_IPI_EVENT_PREEMPT;
5050 ipi_type = sched_ipi_action(dst: processor, thread, event);
5051 }
5052 }
5053
5054 pset_unlock(pset);
5055 sched_ipi_perform(dst: processor, ipi: ipi_type);
5056
5057 if (ipi_action != eDoNothing && processor == current_processor()) {
5058 ast_t new_preempt = update_pending_nonurgent_preemption(processor, reason: preempt);
5059 ast_on(reasons: new_preempt);
5060 }
5061}
5062
5063/*
5064 * choose_next_pset:
5065 *
5066 * Return the next sibling pset containing
5067 * available processors.
5068 *
5069 * Returns the original pset if none other is
5070 * suitable.
5071 */
5072static processor_set_t
5073choose_next_pset(
5074 processor_set_t pset)
5075{
5076 processor_set_t nset = pset;
5077
5078 do {
5079 nset = next_pset(pset: nset);
5080
5081 /*
5082 * Sometimes during startup the pset_map can contain a bit
5083 * for a pset that isn't fully published in pset_array because
5084 * the pset_map read isn't an acquire load.
5085 *
5086 * In order to avoid needing an acquire barrier here, just bail
5087 * out.
5088 */
5089 if (nset == PROCESSOR_SET_NULL) {
5090 return pset;
5091 }
5092 } while (nset->online_processor_count < 1 && nset != pset);
5093
5094 return nset;
5095}
5096
5097/*
5098 * choose_processor:
5099 *
5100 * Choose a processor for the thread, beginning at
5101 * the pset. Accepts an optional processor hint in
5102 * the pset.
5103 *
5104 * Returns a processor, possibly from a different pset.
5105 *
5106 * The thread must be locked. The pset must be locked,
5107 * and the resulting pset is locked on return.
5108 */
5109processor_t
5110choose_processor(
5111 processor_set_t starting_pset,
5112 processor_t processor,
5113 thread_t thread)
5114{
5115 processor_set_t pset = starting_pset;
5116 processor_set_t nset;
5117
5118 assert(thread->sched_pri <= MAXPRI);
5119
5120 /*
5121 * Prefer the hinted processor, when appropriate.
5122 */
5123
5124 /* Fold last processor hint from secondary processor to its primary */
5125 if (processor != PROCESSOR_NULL) {
5126 processor = processor->processor_primary;
5127 }
5128
5129 /*
5130 * Only consult platform layer if pset is active, which
5131 * it may not be in some cases when a multi-set system
5132 * is going to sleep.
5133 */
5134 if (pset->online_processor_count) {
5135 if ((processor == PROCESSOR_NULL) || (processor->processor_set == pset && processor->state == PROCESSOR_IDLE)) {
5136 processor_t mc_processor = machine_choose_processor(pset, processor);
5137 if (mc_processor != PROCESSOR_NULL) {
5138 processor = mc_processor->processor_primary;
5139 }
5140 }
5141 }
5142
5143 /*
5144 * At this point, we may have a processor hint, and we may have
5145 * an initial starting pset. If the hint is not in the pset, or
5146 * if the hint is for a processor in an invalid state, discard
5147 * the hint.
5148 */
5149 if (processor != PROCESSOR_NULL) {
5150 if (processor->processor_set != pset) {
5151 processor = PROCESSOR_NULL;
5152 } else if (!processor->is_recommended) {
5153 processor = PROCESSOR_NULL;
5154 } else {
5155 switch (processor->state) {
5156 case PROCESSOR_START:
5157 case PROCESSOR_SHUTDOWN:
5158 case PROCESSOR_PENDING_OFFLINE:
5159 case PROCESSOR_OFF_LINE:
5160 /*
5161 * Hint is for a processor that cannot support running new threads.
5162 */
5163 processor = PROCESSOR_NULL;
5164 break;
5165 case PROCESSOR_IDLE:
5166 /*
5167 * Hint is for an idle processor. Assume it is no worse than any other
5168 * idle processor. The platform layer had an opportunity to provide
5169 * the "least cost idle" processor above.
5170 */
5171 if ((thread->sched_pri < BASEPRI_RTQUEUES) || processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
5172 uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] & pset->primary_map & pset->recommended_bitmask);
5173 uint64_t non_avoided_idle_primary_map = idle_primary_map & ~pset->perfcontrol_cpu_migration_bitmask;
5174 /*
5175				 * If the rotation bitmask to force a migration is set for this core and there's an idle core
5176				 * that needn't be avoided, don't continue running on the same core.
5177 */
5178 if (!(bit_test(processor->processor_set->perfcontrol_cpu_migration_bitmask, processor->cpu_id) && non_avoided_idle_primary_map != 0)) {
5179 return processor;
5180 }
5181 }
5182 processor = PROCESSOR_NULL;
5183 break;
5184 case PROCESSOR_RUNNING:
5185 case PROCESSOR_DISPATCHING:
5186 /*
5187 * Hint is for an active CPU. This fast-path allows
5188 * realtime threads to preempt non-realtime threads
5189 * to regain their previous executing processor.
5190 */
5191 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5192 if (processor_is_fast_track_candidate_for_realtime_thread(pset, processor)) {
5193 return processor;
5194 }
5195 processor = PROCESSOR_NULL;
5196 }
5197
5198 /* Otherwise, use hint as part of search below */
5199 break;
5200 default:
5201 processor = PROCESSOR_NULL;
5202 break;
5203 }
5204 }
5205 }
5206
5207 /*
5208 * Iterate through the processor sets to locate
5209 * an appropriate processor. Seed results with
5210 * a last-processor hint, if available, so that
5211 * a search must find something strictly better
5212 * to replace it.
5213 *
5214 * A primary/secondary pair of SMT processors are
5215 * "unpaired" if the primary is busy but its
5216 * corresponding secondary is idle (so the physical
5217 * core has full use of its resources).
5218 */
5219
5220 integer_t lowest_priority = MAXPRI + 1;
5221 integer_t lowest_secondary_priority = MAXPRI + 1;
5222 integer_t lowest_unpaired_primary_priority = MAXPRI + 1;
5223 integer_t lowest_idle_secondary_priority = MAXPRI + 1;
5224 integer_t lowest_count = INT_MAX;
5225 processor_t lp_processor = PROCESSOR_NULL;
5226 processor_t lp_unpaired_primary_processor = PROCESSOR_NULL;
5227 processor_t lp_idle_secondary_processor = PROCESSOR_NULL;
5228 processor_t lp_paired_secondary_processor = PROCESSOR_NULL;
5229 processor_t lc_processor = PROCESSOR_NULL;
5230
5231 if (processor != PROCESSOR_NULL) {
5232 /* All other states should be enumerated above. */
5233 assert(processor->state == PROCESSOR_RUNNING || processor->state == PROCESSOR_DISPATCHING);
5234 assert(thread->sched_pri < BASEPRI_RTQUEUES);
5235
5236 lowest_priority = processor->current_pri;
5237 lp_processor = processor;
5238
5239 lowest_count = SCHED(processor_runq_count)(processor);
5240 lc_processor = processor;
5241 }
5242
5243 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5244 pset_node_t node = pset->node;
5245 bool include_ast_urgent_pending_cpus = false;
5246 cpumap_t ast_urgent_pending;
5247try_again:
5248 ast_urgent_pending = 0;
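	/*
	 * consider_secondaries starts at 0 only when this is an SMT system with
	 * more than one pset in the node, some pset still has a primary CPU not
	 * running realtime work, and we are not on the ast_urgent_pending retry.
	 * In that case the sweep below runs twice: primaries only, then once
	 * more with secondary CPUs eligible. Otherwise it starts at 1 and the
	 * psets are swept a single time with secondaries considered.
	 */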
	int consider_secondaries = (!pset->is_SMT) || (bit_count(node->pset_map) == 1) || (node->pset_non_rt_primary_map == 0) || include_ast_urgent_pending_cpus;
	for (; consider_secondaries < 2; consider_secondaries++) {
		pset = change_locked_pset(pset, starting_pset);
5252 do {
5253 cpumap_t available_map = pset_available_cpumap(pset);
5254 if (available_map == 0) {
5255 goto no_available_cpus;
5256 }
5257
5258 processor = choose_processor_for_realtime_thread(pset, PROCESSOR_NULL, consider_secondaries, false);
5259 if (processor) {
5260 return processor;
5261 }
5262
5263 if (consider_secondaries) {
					processor = choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, include_ast_urgent_pending_cpus);
5265 if (processor) {
5266 /*
5267 * Instead of looping through all the psets to find the global
5268 * furthest deadline processor, preempt the first candidate found.
5269 * The preempted thread will then find any other available far deadline
5270 * processors to preempt.
5271 */
5272 return processor;
5273 }
5274
5275 ast_urgent_pending |= pset->pending_AST_URGENT_cpu_mask;
5276
5277 if (rt_runq_count(pset) < lowest_count) {
						int cpuid = bit_first(available_map);
5279 assert(cpuid >= 0);
5280 lc_processor = processor_array[cpuid];
5281 lowest_count = rt_runq_count(pset);
5282 }
5283 }
5284
5285no_available_cpus:
5286 nset = next_pset(pset);
5287
5288 if (nset != starting_pset) {
				pset = change_locked_pset(pset, nset);
5290 }
5291 } while (nset != starting_pset);
5292 }
5293
5294 /* Short cut for single pset nodes */
		if (bit_count(node->pset_map) == 1) {
5296 if (lc_processor) {
5297 pset_assert_locked(lc_processor->processor_set);
5298 return lc_processor;
5299 }
5300 } else {
5301 if (ast_urgent_pending && !include_ast_urgent_pending_cpus) {
5302 /* See the comment in choose_furthest_deadline_processor_for_realtime_thread() */
5303 include_ast_urgent_pending_cpus = true;
5304 goto try_again;
5305 }
5306 }
5307
5308 processor = lc_processor;
5309
5310 if (processor) {
			pset = change_locked_pset(pset, processor->processor_set);
5312 /* Check that chosen processor is still usable */
5313 cpumap_t available_map = pset_available_cpumap(pset);
5314 if (bit_test(available_map, processor->cpu_id)) {
5315 return processor;
5316 }
5317
5318 /* processor is no longer usable */
5319 processor = PROCESSOR_NULL;
5320 }
5321
5322 pset_assert_locked(pset);
5323 pset_unlock(pset);
5324 return PROCESSOR_NULL;
5325 }
5326
5327 /* No realtime threads from this point on */
5328 assert(thread->sched_pri < BASEPRI_RTQUEUES);
5329
5330 do {
5331 /*
5332 * Choose an idle processor, in pset traversal order
5333 */
5334 uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] & pset->primary_map & pset->recommended_bitmask);
5335 uint64_t preferred_idle_primary_map = idle_primary_map & pset->perfcontrol_cpu_preferred_bitmask;
5336
5337 /* there shouldn't be a pending AST if the processor is idle */
5338 assert((idle_primary_map & pset->pending_AST_URGENT_cpu_mask) == 0);
5339
5340 /*
5341 * Look at the preferred cores first.
5342 */
		int cpuid = lsb_next(preferred_idle_primary_map, pset->cpu_preferred_last_chosen);
		if (cpuid < 0) {
			cpuid = lsb_first(preferred_idle_primary_map);
5346 }
5347 if (cpuid >= 0) {
5348 processor = processor_array[cpuid];
5349 pset->cpu_preferred_last_chosen = cpuid;
5350 return processor;
5351 }
5352
5353 /*
5354 * Look at the cores that don't need to be avoided next.
5355 */
5356 if (pset->perfcontrol_cpu_migration_bitmask != 0) {
5357 uint64_t non_avoided_idle_primary_map = idle_primary_map & ~pset->perfcontrol_cpu_migration_bitmask;
			cpuid = lsb_next(non_avoided_idle_primary_map, pset->cpu_preferred_last_chosen);
			if (cpuid < 0) {
				cpuid = lsb_first(non_avoided_idle_primary_map);
5361 }
5362 if (cpuid >= 0) {
5363 processor = processor_array[cpuid];
5364 pset->cpu_preferred_last_chosen = cpuid;
5365 return processor;
5366 }
5367 }
5368
5369 /*
5370 * Fall back to any remaining idle cores if none of the preferred ones and non-avoided ones are available.
5371 */
		cpuid = lsb_first(idle_primary_map);
5373 if (cpuid >= 0) {
5374 processor = processor_array[cpuid];
5375 return processor;
5376 }
5377
5378 /*
5379 * Otherwise, enumerate active and idle processors to find primary candidates
5380 * with lower priority/etc.
5381 */
5382
5383 uint64_t active_map = ((pset->cpu_state_map[PROCESSOR_RUNNING] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
5384 pset->recommended_bitmask &
5385 ~pset->pending_AST_URGENT_cpu_mask);
5386
5387 if (SCHED(priority_is_urgent)(thread->sched_pri) == FALSE) {
5388 active_map &= ~pset->pending_AST_PREEMPT_cpu_mask;
5389 }
5390
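		/*
		 * Rotating active_map right by (last_chosen + 1) makes the
		 * lsb_first/lsb_next walk start at the CPU after the one chosen
		 * last time; the real cpuid is recovered by adding last_chosen + 1
		 * back, modulo 64. For example, with last_chosen == 5, rotid 0
		 * corresponds to cpuid 6.
		 */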
		active_map = bit_ror64(active_map, (pset->last_chosen + 1));
		for (int rotid = lsb_first(active_map); rotid >= 0; rotid = lsb_next(active_map, rotid)) {
5393 cpuid = ((rotid + pset->last_chosen + 1) & 63);
5394 processor = processor_array[cpuid];
5395
5396 integer_t cpri = processor->current_pri;
5397 processor_t primary = processor->processor_primary;
5398 if (primary != processor) {
5399 /* If primary is running a NO_SMT thread, don't choose its secondary */
				if (!((primary->state == PROCESSOR_RUNNING) && processor_active_thread_no_smt(primary))) {
5401 if (cpri < lowest_secondary_priority) {
5402 lowest_secondary_priority = cpri;
5403 lp_paired_secondary_processor = processor;
5404 }
5405 }
5406 } else {
5407 if (cpri < lowest_priority) {
5408 lowest_priority = cpri;
5409 lp_processor = processor;
5410 }
5411 }
5412
5413 integer_t ccount = SCHED(processor_runq_count)(processor);
5414 if (ccount < lowest_count) {
5415 lowest_count = ccount;
5416 lc_processor = processor;
5417 }
5418 }
5419
		/*
		 * For SMT configs, these idle secondary processors must have an active
		 * primary. Otherwise the idle primary would have short-circuited the
		 * loop above.
		 */
5424 uint64_t idle_secondary_map = (pset->cpu_state_map[PROCESSOR_IDLE] &
5425 ~pset->primary_map &
5426 pset->recommended_bitmask);
5427
5428 /* there shouldn't be a pending AST if the processor is idle */
5429 assert((idle_secondary_map & pset->pending_AST_URGENT_cpu_mask) == 0);
5430 assert((idle_secondary_map & pset->pending_AST_PREEMPT_cpu_mask) == 0);
5431
		for (cpuid = lsb_first(idle_secondary_map); cpuid >= 0; cpuid = lsb_next(idle_secondary_map, cpuid)) {
5433 processor = processor_array[cpuid];
5434
5435 processor_t cprimary = processor->processor_primary;
5436
5437 integer_t primary_pri = cprimary->current_pri;
5438
5439 /*
5440 * TODO: This should also make the same decisions
5441 * as secondary_can_run_realtime_thread
5442 *
5443 * TODO: Keep track of the pending preemption priority
5444 * of the primary to make this more accurate.
5445 */
5446
5447 /* If the primary is running a no-smt thread, then don't choose its secondary */
			if (cprimary->state == PROCESSOR_RUNNING &&
			    processor_active_thread_no_smt(cprimary)) {
5450 continue;
5451 }
5452
5453 /*
5454 * Find the idle secondary processor with the lowest priority primary
5455 *
5456 * We will choose this processor as a fallback if we find no better
5457 * primary to preempt.
5458 */
5459 if (primary_pri < lowest_idle_secondary_priority) {
5460 lp_idle_secondary_processor = processor;
5461 lowest_idle_secondary_priority = primary_pri;
5462 }
5463
			/* Find the lowest-priority active primary with an idle secondary */
5465 if (primary_pri < lowest_unpaired_primary_priority) {
5466 /* If the primary processor is offline or starting up, it's not a candidate for this path */
5467 if (cprimary->state != PROCESSOR_RUNNING &&
5468 cprimary->state != PROCESSOR_DISPATCHING) {
5469 continue;
5470 }
5471
5472 if (!cprimary->is_recommended) {
5473 continue;
5474 }
5475
5476 /* if the primary is pending preemption, don't try to re-preempt it */
5477 if (bit_test(pset->pending_AST_URGENT_cpu_mask, cprimary->cpu_id)) {
5478 continue;
5479 }
5480
5481 if (SCHED(priority_is_urgent)(thread->sched_pri) == FALSE &&
5482 bit_test(pset->pending_AST_PREEMPT_cpu_mask, cprimary->cpu_id)) {
5483 continue;
5484 }
5485
5486 lowest_unpaired_primary_priority = primary_pri;
5487 lp_unpaired_primary_processor = cprimary;
5488 }
5489 }
5490
5491 /*
5492 * We prefer preempting a primary processor over waking up its secondary.
5493 * The secondary will then be woken up by the preempted thread.
5494 */
5495 if (thread->sched_pri > lowest_unpaired_primary_priority) {
5496 pset->last_chosen = lp_unpaired_primary_processor->cpu_id;
5497 return lp_unpaired_primary_processor;
5498 }
5499
5500 /*
5501 * We prefer preempting a lower priority active processor over directly
5502 * waking up an idle secondary.
5503 * The preempted thread will then find the idle secondary.
5504 */
5505 if (thread->sched_pri > lowest_priority) {
5506 pset->last_chosen = lp_processor->cpu_id;
5507 return lp_processor;
5508 }
5509
5510 /*
5511 * lc_processor is used to indicate the best processor set run queue
5512 * on which to enqueue a thread when all available CPUs are busy with
5513 * higher priority threads, so try to make sure it is initialized.
5514 */
5515 if (lc_processor == PROCESSOR_NULL) {
5516 cpumap_t available_map = pset_available_cpumap(pset);
			cpuid = lsb_first(available_map);
5518 if (cpuid >= 0) {
5519 lc_processor = processor_array[cpuid];
5520 lowest_count = SCHED(processor_runq_count)(lc_processor);
5521 }
5522 }
5523
5524 /*
5525 * Move onto the next processor set.
5526 *
5527 * If all primary processors in this pset are running a higher
5528 * priority thread, move on to next pset. Only when we have
5529 * exhausted the search for primary processors do we
5530 * fall back to secondaries.
5531 */
5532#if CONFIG_SCHED_EDGE
		/*
		 * The edge scheduler expects a CPU to be selected from the pset it passed in
		 * as the starting pset for non-RT workloads. The edge migration algorithm
		 * should already have considered idle CPUs and loads when deciding the
		 * starting_pset, which means that this loop can be short-circuited.
		 */
5539 nset = starting_pset;
5540#else /* CONFIG_SCHED_EDGE */
5541 nset = next_pset(pset);
5542#endif /* CONFIG_SCHED_EDGE */
5543
5544 if (nset != starting_pset) {
			pset = change_locked_pset(pset, nset);
5546 }
5547 } while (nset != starting_pset);
5548
5549 /*
5550 * Make sure that we pick a running processor,
5551 * and that the correct processor set is locked.
5552 * Since we may have unlocked the candidate processor's
5553 * pset, it may have changed state.
5554 *
5555 * All primary processors are running a higher priority
	 * thread, so the only options left are enqueuing on
	 * the secondary processor that would perturb the lowest-priority
	 * primary, or on the least busy primary.
5559 */
5560
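	/*
	 * Fallback preference order: an idle secondary whose primary is running
	 * the lowest-priority work, then the secondary of the lowest-priority
	 * busy SMT pair, then the least-loaded processor found during the scan.
	 */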
5561 /* lowest_priority is evaluated in the main loops above */
5562 if (lp_idle_secondary_processor != PROCESSOR_NULL) {
5563 processor = lp_idle_secondary_processor;
5564 } else if (lp_paired_secondary_processor != PROCESSOR_NULL) {
5565 processor = lp_paired_secondary_processor;
5566 } else if (lc_processor != PROCESSOR_NULL) {
5567 processor = lc_processor;
5568 } else {
5569 processor = PROCESSOR_NULL;
5570 }
5571
5572 if (processor) {
		pset = change_locked_pset(pset, processor->processor_set);
5574 /* Check that chosen processor is still usable */
5575 cpumap_t available_map = pset_available_cpumap(pset);
5576 if (bit_test(available_map, processor->cpu_id)) {
5577 pset->last_chosen = processor->cpu_id;
5578 return processor;
5579 }
5580
5581 /* processor is no longer usable */
5582 processor = PROCESSOR_NULL;
5583 }
5584
5585 pset_assert_locked(pset);
5586 pset_unlock(pset);
5587 return PROCESSOR_NULL;
5588}
5589
5590/*
5591 * Default implementation of SCHED(choose_node)()
5592 * for single node systems
5593 */
5594pset_node_t
5595sched_choose_node(__unused thread_t thread)
5596{
5597 return &pset_node0;
5598}
5599
5600/*
5601 * choose_starting_pset:
5602 *
5603 * Choose a starting processor set for the thread.
5604 * May return a processor hint within the pset.
5605 *
5606 * Returns a starting processor set, to be used by
5607 * choose_processor.
5608 *
5609 * The thread must be locked. The resulting pset is unlocked on return,
5610 * and is chosen without taking any pset locks.
5611 */
5612processor_set_t
5613choose_starting_pset(pset_node_t node, thread_t thread, processor_t *processor_hint)
5614{
5615 processor_set_t pset;
5616 processor_t processor = PROCESSOR_NULL;
5617
5618 if (thread->affinity_set != AFFINITY_SET_NULL) {
5619 /*
5620 * Use affinity set policy hint.
5621 */
5622 pset = thread->affinity_set->aset_pset;
5623 } else if (thread->last_processor != PROCESSOR_NULL) {
5624 /*
5625 * Simple (last processor) affinity case.
5626 */
5627 processor = thread->last_processor;
5628 pset = processor->processor_set;
5629 } else {
5630 /*
5631 * No Affinity case:
5632 *
		 * Utilize a per-task hint to spread threads
5634 * among the available processor sets.
5635 * NRG this seems like the wrong thing to do.
5636 * See also task->pset_hint = pset in thread_setrun()
5637 */
5638 pset = get_threadtask(thread)->pset_hint;
5639 if (pset == PROCESSOR_SET_NULL) {
5640 pset = current_processor()->processor_set;
5641 }
5642
5643 pset = choose_next_pset(pset);
5644 }
5645
	if (!bit_test(node->pset_map, pset->pset_id)) {
		/* pset is not from this node so choose one that is */
		int id = lsb_first(node->pset_map);
		if (id < 0) {
			/* startup race, so check again under the node lock */
			lck_spin_lock(&pset_node_lock);
			if (bit_test(node->pset_map, pset->pset_id)) {
				id = pset->pset_id;
			} else {
				id = lsb_first(node->pset_map);
			}
			lck_spin_unlock(&pset_node_lock);
		}
		assert(id >= 0);
		pset = pset_array[id];
	}
5662
5663 if (bit_count(x: node->pset_map) == 1) {
5664 /* Only a single pset in this node */
5665 goto out;
5666 }
5667
5668 bool avoid_cpu0 = false;
5669
5670#if defined(__x86_64__)
5671 if ((thread->sched_pri >= BASEPRI_RTQUEUES) && sched_avoid_cpu0) {
5672 /* Avoid the pset containing cpu0 */
5673 avoid_cpu0 = true;
5674 /* Assert that cpu0 is in pset0. I expect this to be true on __x86_64__ */
5675 assert(bit_test(pset_array[0]->cpu_bitmask, 0));
5676 }
5677#endif
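	/*
	 * For realtime threads, prefer a pset that still has a primary CPU not
	 * running realtime work (pset_non_rt_primary_map), falling back to any
	 * non-RT CPU when realtime on SMT secondaries is allowed. When avoid_cpu0
	 * is set, the candidate map is rotated right by one so that pset 0 is
	 * considered last, and the chosen index is rotated back with
	 * ((rotid + 1) & 63).
	 */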
5678
5679 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5680 pset_map_t rt_target_map = atomic_load(&node->pset_non_rt_primary_map);
5681 if ((avoid_cpu0 && pset->pset_id == 0) || !bit_test(rt_target_map, pset->pset_id)) {
5682 if (avoid_cpu0) {
				rt_target_map = bit_ror64(rt_target_map, 1);
			}
			int rotid = lsb_first(rt_target_map);
			if (rotid >= 0) {
				int id = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
5688 pset = pset_array[id];
5689 goto out;
5690 }
5691 }
5692 if (!pset->is_SMT || !sched_allow_rt_smt) {
5693 /* All psets are full of RT threads - fall back to choose processor to find the furthest deadline RT thread */
5694 goto out;
5695 }
5696 rt_target_map = atomic_load(&node->pset_non_rt_map);
5697 if ((avoid_cpu0 && pset->pset_id == 0) || !bit_test(rt_target_map, pset->pset_id)) {
5698 if (avoid_cpu0) {
				rt_target_map = bit_ror64(rt_target_map, 1);
			}
			int rotid = lsb_first(rt_target_map);
			if (rotid >= 0) {
				int id = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
5704 pset = pset_array[id];
5705 goto out;
5706 }
5707 }
5708 /* All psets are full of RT threads - fall back to choose processor to find the furthest deadline RT thread */
5709 } else {
5710 pset_map_t idle_map = atomic_load(&node->pset_idle_map);
5711 if (!bit_test(idle_map, pset->pset_id)) {
			int next_idle_pset_id = lsb_first(idle_map);
5713 if (next_idle_pset_id >= 0) {
5714 pset = pset_array[next_idle_pset_id];
5715 }
5716 }
5717 }
5718
5719out:
5720 if ((processor != PROCESSOR_NULL) && (processor->processor_set != pset)) {
5721 processor = PROCESSOR_NULL;
5722 }
5723 if (processor != PROCESSOR_NULL) {
5724 *processor_hint = processor;
5725 }
5726
5727 assert(pset != NULL);
5728 return pset;
5729}
5730
5731/*
5732 * thread_setrun:
5733 *
5734 * Dispatch thread for execution, onto an idle
5735 * processor or run queue, and signal a preemption
5736 * as appropriate.
5737 *
5738 * Thread must be locked.
5739 */
5740void
5741thread_setrun(
5742 thread_t thread,
5743 sched_options_t options)
5744{
5745 processor_t processor = PROCESSOR_NULL;
5746 processor_set_t pset;
5747
5748 assert((thread->state & (TH_RUN | TH_WAIT | TH_UNINT | TH_TERMINATE | TH_TERMINATE2)) == TH_RUN);
5749 thread_assert_runq_null(thread);
5750
5751#if CONFIG_PREADOPT_TG
5752 /* We know that the thread is not in the runq by virtue of being in this
5753 * function and the thread is not self since we are running. We can safely
5754 * resolve the thread group hierarchy and modify the thread's thread group
5755 * here. */
	thread_resolve_and_enforce_thread_group_hierarchy_if_needed(thread);
5757#endif
5758
5759 /*
5760 * Update priority if needed.
5761 */
5762 if (SCHED(can_update_priority)(thread)) {
5763 SCHED(update_priority)(thread);
5764 }
5765 thread->sfi_class = sfi_thread_classify(thread);
5766
5767 if (thread->bound_processor == PROCESSOR_NULL) {
5768 /*
5769 * Unbound case.
5770 *
5771 * Usually, this loop will only be executed once,
5772 * but if CLPC derecommends a processor after it has been chosen,
5773 * or if a processor is shut down after it is chosen,
5774 * choose_processor() may return NULL, so a retry
5775 * may be necessary. A single retry will usually
5776 * be enough, and we can't afford to retry too many times
5777 * because interrupts are disabled.
5778 */
5779#define CHOOSE_PROCESSOR_MAX_RETRIES 3
5780 for (int retry = 0; retry <= CHOOSE_PROCESSOR_MAX_RETRIES; retry++) {
5781 processor_t processor_hint = PROCESSOR_NULL;
5782 pset_node_t node = SCHED(choose_node)(thread);
			processor_set_t starting_pset = choose_starting_pset(node, thread, &processor_hint);
5784
5785 pset_lock(starting_pset);
5786
5787 processor = SCHED(choose_processor)(starting_pset, processor_hint, thread);
5788 if (processor != PROCESSOR_NULL) {
5789 pset = processor->processor_set;
5790 pset_assert_locked(pset);
5791 break;
5792 }
5793 }
5794 /*
5795 * If choose_processor() still returns NULL,
5796 * which is very unlikely,
5797 * choose the master_processor, which is always
5798 * safe to choose.
5799 */
5800 if (processor == PROCESSOR_NULL) {
5801 /* Choose fallback processor */
5802 processor = master_processor;
5803 pset = processor->processor_set;
5804 pset_lock(pset);
5805 assert((pset_available_cpu_count(pset) > 0) || (processor->state != PROCESSOR_OFF_LINE && processor->is_recommended));
5806 }
5807 task_t task = get_threadtask(thread);
5808 if (!(task->t_flags & TF_USE_PSET_HINT_CLUSTER_TYPE)) {
5809 task->pset_hint = pset; /* NRG this is done without holding the task lock */
5810 }
5811 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
5812 (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
5813 assert((pset_available_cpu_count(pset) > 0) || (processor->state != PROCESSOR_OFF_LINE && processor->is_recommended));
5814 } else {
5815 /*
5816 * Bound case:
5817 *
5818 * Unconditionally dispatch on the processor.
5819 */
5820 processor = thread->bound_processor;
5821 pset = processor->processor_set;
5822 pset_lock(pset);
5823
5824 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT_IST(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR) | DBG_FUNC_NONE,
5825 (uintptr_t)thread_tid(thread), (uintptr_t)-2, processor->cpu_id, processor->state, 0);
5826 }
5827
5828 /*
5829 * Dispatch the thread on the chosen processor.
5830 * TODO: This should be based on sched_mode, not sched_pri
5831 */
5832 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		realtime_setrun(processor, thread);
5834 } else {
5835 processor_setrun(processor, thread, options);
5836 }
5837 /* pset is now unlocked */
5838 if (thread->bound_processor == PROCESSOR_NULL) {
5839 SCHED(check_spill)(pset, thread);
5840 }
5841}
5842
5843processor_set_t
5844task_choose_pset(
5845 task_t task)
5846{
5847 processor_set_t pset = task->pset_hint;
5848
5849 if (pset != PROCESSOR_SET_NULL) {
5850 pset = choose_next_pset(pset);
5851 }
5852
5853 return pset;
5854}
5855
5856/*
5857 * Check for a preemption point in
5858 * the current context.
5859 *
5860 * Called at splsched with thread locked.
5861 */
5862ast_t
5863csw_check(
5864 thread_t thread,
5865 processor_t processor,
5866 ast_t check_reason)
5867{
5868 processor_set_t pset = processor->processor_set;
5869
5870 assert(thread == processor->active_thread);
5871
5872 pset_lock(pset);
5873
5874 processor_state_update_from_thread(processor, thread, true);
5875
5876 ast_t preempt = csw_check_locked(thread, processor, pset, check_reason);
5877
5878 /* Acknowledge the IPI if we decided not to preempt */
5879
5880 if ((preempt & AST_URGENT) == 0) {
5881 if (bit_clear_if_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
5882 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_END, processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 8);
5883 }
5884 }
5885
5886 if ((preempt & AST_PREEMPT) == 0) {
5887 bit_clear(pset->pending_AST_PREEMPT_cpu_mask, processor->cpu_id);
5888 }
5889
5890 pset_unlock(pset);
5891
	return update_pending_nonurgent_preemption(processor, preempt);
5893}
5894
5895void
5896clear_pending_nonurgent_preemption(processor_t processor)
5897{
5898 if (!processor->pending_nonurgent_preemption) {
5899 return;
5900 }
5901
5902 KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREEMPT_TIMER_ACTIVE) | DBG_FUNC_END);
5903
5904 processor->pending_nonurgent_preemption = false;
	running_timer_clear(processor, RUNNING_TIMER_PREEMPT);
5906}
5907
5908ast_t
5909update_pending_nonurgent_preemption(processor_t processor, ast_t reason)
5910{
5911 if ((reason & (AST_URGENT | AST_PREEMPT)) != (AST_PREEMPT)) {
5912 clear_pending_nonurgent_preemption(processor);
5913 return reason;
5914 }
5915
5916 if (nonurgent_preemption_timer_abs == 0) {
5917 /* Preemption timer not enabled */
5918 return reason;
5919 }
5920
5921 if (current_thread()->state & TH_IDLE) {
5922 /* idle threads don't need nonurgent preemption */
5923 return reason;
5924 }
5925
5926 if (processor->pending_nonurgent_preemption) {
5927 /* Timer is already armed, no need to do it again */
5928 return reason;
5929 }
5930
5931 if (ml_did_interrupt_userspace()) {
5932 /*
5933 * We're preempting userspace here, so we don't need
5934 * to defer the preemption. Force AST_URGENT
5935 * so that we can avoid arming this timer without risking
5936 * ast_taken_user deciding to spend too long in kernel
5937 * space to handle other ASTs.
5938 */
5939
5940 return reason | AST_URGENT;
5941 }
5942
5943 /*
5944 * We've decided to do a nonurgent preemption when running in
5945 * kernelspace. We defer the preemption until reaching userspace boundary
5946 * to give a grace period for locks etc to be dropped and to reach
5947 * a clean preemption point, so that the preempting thread doesn't
5948 * always immediately hit the lock that the waking thread still holds.
5949 *
5950 * Arm a timer to enforce that the preemption executes within a bounded
5951 * time if the thread doesn't block or return to userspace quickly.
5952 */
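	/*
	 * If the running timer fires before the thread blocks or reaches the
	 * userspace boundary, thread_preempt_expire() below clears the pending
	 * state and re-runs csw_check() with AST_URGENT so the preemption is no
	 * longer deferred.
	 */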
5953
5954 processor->pending_nonurgent_preemption = true;
5955 KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREEMPT_TIMER_ACTIVE) | DBG_FUNC_START,
5956 reason);
5957
5958 uint64_t now = mach_absolute_time();
5959
5960 uint64_t deadline = now + nonurgent_preemption_timer_abs;
5961
	running_timer_enter(processor, RUNNING_TIMER_PREEMPT, NULL,
	    now, deadline);
5964
5965 return reason;
5966}
5967
5968/*
5969 * Check for preemption at splsched with
5970 * pset and thread locked
5971 */
5972ast_t
5973csw_check_locked(
5974 thread_t thread,
5975 processor_t processor,
5976 processor_set_t pset,
5977 ast_t check_reason)
5978{
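	/*
	 * The checks below run roughly in decreasing order of urgency: a
	 * derecommended CPU or a pending realtime spill forces an urgent
	 * preemption, then the realtime run queue is consulted, then the
	 * scheduler-specific run-queue check, the avoid-processor policy,
	 * SMT secondary rebalancing, thread suspension, and finally SFI.
	 */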
5979 /*
5980 * If the current thread is running on a processor that is no longer recommended,
5981 * urgently preempt it, at which point thread_select() should
5982 * try to idle the processor and re-dispatch the thread to a recommended processor.
5983 */
5984 if (!processor->is_recommended) {
5985 return check_reason | AST_PREEMPT | AST_URGENT;
5986 }
5987
5988 if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
5989 return check_reason | AST_PREEMPT | AST_URGENT;
5990 }
5991
5992 if (rt_runq_count(pset) > 0) {
5993 if ((rt_runq_priority(pset) > processor->current_pri) || !processor->first_timeslice) {
5994 return check_reason | AST_PREEMPT | AST_URGENT;
		} else if (deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < processor->deadline) {
5996 return check_reason | AST_PREEMPT | AST_URGENT;
5997 } else {
5998 return check_reason | AST_PREEMPT;
5999 }
6000 }
6001
6002 ast_t result = SCHED(processor_csw_check)(processor);
6003 if (result != AST_NONE) {
6004 return check_reason | result | (thread_is_eager_preempt(thread) ? AST_URGENT : AST_NONE);
6005 }
6006
6007 /*
6008 * Same for avoid-processor
6009 *
6010 * TODO: Should these set AST_REBALANCE?
6011 */
6012 if (SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread, check_reason)) {
6013 return check_reason | AST_PREEMPT;
6014 }
6015
6016 /*
6017 * Even though we could continue executing on this processor, a
6018 * secondary SMT core should try to shed load to another primary core.
6019 *
6020 * TODO: Should this do the same check that thread_select does? i.e.
6021 * if no bound threads target this processor, and idle primaries exist, preempt
6022 * The case of RT threads existing is already taken care of above
6023 */
6024
6025 if (processor->current_pri < BASEPRI_RTQUEUES &&
6026 processor->processor_primary != processor) {
6027 return check_reason | AST_PREEMPT;
6028 }
6029
6030 if (thread->state & TH_SUSP) {
6031 return check_reason | AST_PREEMPT;
6032 }
6033
6034#if CONFIG_SCHED_SFI
6035 /*
6036 * Current thread may not need to be preempted, but maybe needs
6037 * an SFI wait?
6038 */
6039 result = sfi_thread_needs_ast(thread, NULL);
6040 if (result != AST_NONE) {
6041 return result;
6042 }
6043#endif
6044
6045 return AST_NONE;
6046}
6047
6048/*
6049 * Handle preemption IPI or IPI in response to setting an AST flag
6050 * Triggered by cause_ast_check
6051 * Called at splsched
6052 */
6053void
6054ast_check(processor_t processor)
6055{
6056 smr_ack_ipi();
6057
6058 if (processor->state != PROCESSOR_RUNNING &&
6059 processor->state != PROCESSOR_SHUTDOWN) {
6060 return;
6061 }
6062
6063 SCHED_DEBUG_AST_CHECK_KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED,
6064 MACH_SCHED_AST_CHECK) | DBG_FUNC_START);
6065
6066 thread_t thread = processor->active_thread;
6067
6068 assert(thread == current_thread());
6069
6070 /*
6071 * Pairs with task_restartable_ranges_synchronize
6072 */
6073 thread_lock(thread);
6074
6075 thread_reset_pcs_ack_IPI(thread);
6076
6077 /*
6078 * Propagate thread ast to processor.
6079 * (handles IPI in response to setting AST flag)
6080 */
6081 ast_propagate(thread);
6082
6083 /*
6084 * Stash the old urgency and perfctl values to find out if
6085 * csw_check updates them.
6086 */
6087 thread_urgency_t old_urgency = processor->current_urgency;
6088 perfcontrol_class_t old_perfctl_class = processor->current_perfctl_class;
6089
6090 ast_t preempt;
6091
6092 if ((preempt = csw_check(thread, processor, AST_NONE)) != AST_NONE) {
		ast_on(preempt);
6094 }
6095
6096 if (old_urgency != processor->current_urgency) {
6097 /*
6098 * Urgency updates happen with the thread lock held (ugh).
6099 * TODO: This doesn't notice QoS changes...
6100 */
6101 uint64_t urgency_param1, urgency_param2;
6102
		thread_urgency_t urgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
		thread_tell_urgency(urgency, urgency_param1, urgency_param2, 0, thread);
6105 }
6106
6107 thread_unlock(thread);
6108
6109 if (old_perfctl_class != processor->current_perfctl_class) {
6110 /*
6111 * We updated the perfctl class of this thread from another core.
6112 * Let CLPC know that the currently running thread has a new
6113 * class.
6114 */
6115
		machine_switch_perfcontrol_state_update(PERFCONTROL_ATTR_UPDATE,
		    mach_approximate_time(), 0, thread);
6118 }
6119
6120 SCHED_DEBUG_AST_CHECK_KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED,
6121 MACH_SCHED_AST_CHECK) | DBG_FUNC_END, preempt);
6122}
6123
6124
6125void
6126thread_preempt_expire(
6127 timer_call_param_t p0,
6128 __unused timer_call_param_t p1)
6129{
6130 processor_t processor = p0;
6131
6132 assert(processor == current_processor());
6133 assert(p1 == NULL);
6134
6135 thread_t thread = current_thread();
6136
6137 /*
6138 * This is set and cleared by the current core, so we will
6139 * never see a race with running timer expiration
6140 */
6141 assert(processor->pending_nonurgent_preemption);
6142
6143 clear_pending_nonurgent_preemption(processor);
6144
6145 thread_lock(thread);
6146
6147 /*
6148 * Check again to see if it's still worth a
6149 * context switch, but this time force enable kernel preemption
6150 */
6151
6152 ast_t preempt = csw_check(thread, processor, AST_URGENT);
6153
6154 if (preempt) {
		ast_on(preempt);
6156 }
6157
6158 thread_unlock(thread);
6159
6160 KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREEMPT_TIMER_ACTIVE), preempt);
6161}
6162
6163
6164/*
6165 * set_sched_pri:
6166 *
6167 * Set the scheduled priority of the specified thread.
6168 *
6169 * This may cause the thread to change queues.
6170 *
6171 * Thread must be locked.
6172 */
6173void
6174set_sched_pri(
6175 thread_t thread,
6176 int16_t new_priority,
6177 set_sched_pri_options_t options)
6178{
6179 bool is_current_thread = (thread == current_thread());
6180 bool removed_from_runq = false;
6181 bool lazy_update = ((options & SETPRI_LAZY) == SETPRI_LAZY);
6182
6183 int16_t old_priority = thread->sched_pri;
6184
6185 /* If we're already at this priority, no need to mess with the runqueue */
6186 if (new_priority == old_priority) {
6187#if CONFIG_SCHED_CLUTCH
6188 /* For the first thread in the system, the priority is correct but
6189 * th_sched_bucket is still TH_BUCKET_RUN. Since the clutch
6190 * scheduler relies on the bucket being set for all threads, update
6191 * its bucket here.
6192 */
6193 if (thread->th_sched_bucket == TH_BUCKET_RUN) {
6194 assert(thread == vm_pageout_scan_thread);
6195 SCHED(update_thread_bucket)(thread);
6196 }
6197#endif /* CONFIG_SCHED_CLUTCH */
6198
6199 return;
6200 }
6201
6202 if (is_current_thread) {
6203 assert(thread->state & TH_RUN);
6204 thread_assert_runq_null(thread);
6205 } else {
6206 removed_from_runq = thread_run_queue_remove(thread);
6207 }
6208
6209 thread->sched_pri = new_priority;
6210
6211#if CONFIG_SCHED_CLUTCH
6212 /*
6213 * Since for the clutch scheduler, the thread's bucket determines its runq
6214 * in the hierarchy it is important to update the bucket when the thread
6215 * lock is held and the thread has been removed from the runq hierarchy.
6216 */
6217 SCHED(update_thread_bucket)(thread);
6218
6219#endif /* CONFIG_SCHED_CLUTCH */
6220
6221 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
6222 (uintptr_t)thread_tid(thread),
6223 thread->base_pri,
6224 thread->sched_pri,
6225 thread->sched_usage,
6226 0);
6227
6228 if (removed_from_runq) {
		thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
6230 } else if (is_current_thread) {
6231 processor_t processor = thread->last_processor;
6232 assert(processor == current_processor());
6233
6234 thread_urgency_t old_urgency = processor->current_urgency;
6235
6236 /*
6237 * When dropping in priority, check if the thread no longer belongs on core.
6238 * If a thread raises its own priority, don't aggressively rebalance it.
6239 * <rdar://problem/31699165>
6240 *
6241 * csw_check does a processor_state_update_from_thread, but
6242 * we should do our own if we're being lazy.
6243 */
6244 if (!lazy_update && new_priority < old_priority) {
6245 ast_t preempt;
6246
6247 if ((preempt = csw_check(thread, processor, AST_NONE)) != AST_NONE) {
				ast_on(preempt);
6249 }
6250 } else {
6251 processor_state_update_from_thread(processor, thread, false);
6252 }
6253
6254 /*
6255 * set_sched_pri doesn't alter RT params. We expect direct base priority/QoS
6256 * class alterations from user space to occur relatively infrequently, hence
6257 * those are lazily handled. QoS classes have distinct priority bands, and QoS
6258 * inheritance is expected to involve priority changes.
6259 */
6260 if (processor->current_urgency != old_urgency) {
6261 uint64_t urgency_param1, urgency_param2;
6262
			thread_urgency_t new_urgency = thread_get_urgency(thread,
			    &urgency_param1, &urgency_param2);

			thread_tell_urgency(new_urgency, urgency_param1,
			    urgency_param2, 0, thread);
6268 }
6269
6270 /* TODO: only call this if current_perfctl_class changed */
6271 uint64_t ctime = mach_approximate_time();
		machine_thread_going_on_core(thread, processor->current_urgency, 0, 0, ctime);
6273 } else if (thread->state & TH_RUN) {
6274 processor_t processor = thread->last_processor;
6275
6276 if (!lazy_update &&
6277 processor != PROCESSOR_NULL &&
6278 processor != current_processor() &&
6279 processor->active_thread == thread) {
6280 cause_ast_check(processor);
6281 }
6282 }
6283}
6284
6285/*
6286 * thread_run_queue_remove_for_handoff
6287 *
6288 * Pull a thread or its (recursive) push target out of the runqueue
6289 * so that it is ready for thread_run()
6290 *
6291 * Called at splsched
6292 *
6293 * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
6294 * This may be different than the thread that was passed in.
6295 */
6296thread_t
6297thread_run_queue_remove_for_handoff(thread_t thread)
6298{
6299 thread_t pulled_thread = THREAD_NULL;
6300
6301 thread_lock(thread);
6302
6303 /*
6304 * Check that the thread is not bound to a different processor,
6305 * NO_SMT flag is not set on the thread, cluster type of
6306 * processor matches with thread if the thread is pinned to a
6307 * particular cluster and that realtime is not involved.
6308 *
6309 * Next, pull it off its run queue. If it doesn't come, it's not eligible.
6310 */
6311 processor_t processor = current_processor();
6312 if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)
6313 && (!thread_no_smt(thread))
6314 && (processor->current_pri < BASEPRI_RTQUEUES)
6315 && (thread->sched_pri < BASEPRI_RTQUEUES)
6316#if __AMP__
6317 && ((thread->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) ||
6318 processor->processor_set->pset_id == thread->th_bound_cluster_id)
6319#endif /* __AMP__ */
6320 ) {
6321 if (thread_run_queue_remove(thread)) {
6322 pulled_thread = thread;
6323 }
6324 }
6325
6326 thread_unlock(thread);
6327
6328 return pulled_thread;
6329}
6330
6331/*
6332 * thread_prepare_for_handoff
6333 *
6334 * Make the thread ready for handoff.
6335 * If the thread was runnable then pull it off the runq, if the thread could
6336 * not be pulled, return NULL.
6337 *
6338 * If the thread was woken up from wait for handoff, make sure it is not bound to
6339 * different processor.
6340 *
6341 * Called at splsched
6342 *
6343 * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
6344 * This may be different than the thread that was passed in.
6345 */
6346thread_t
6347thread_prepare_for_handoff(thread_t thread, thread_handoff_option_t option)
6348{
6349 thread_t pulled_thread = THREAD_NULL;
6350
6351 if (option & THREAD_HANDOFF_SETRUN_NEEDED) {
6352 processor_t processor = current_processor();
6353 thread_lock(thread);
6354
6355 /*
6356 * Check that the thread is not bound to a different processor,
6357 * NO_SMT flag is not set on the thread and cluster type of
6358 * processor matches with thread if the thread is pinned to a
6359 * particular cluster. Call setrun instead if above conditions
6360 * are not satisfied.
6361 */
6362 if ((thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)
6363 && (!thread_no_smt(thread))
6364#if __AMP__
6365 && ((thread->th_bound_cluster_id == THREAD_BOUND_CLUSTER_NONE) ||
6366 processor->processor_set->pset_id == thread->th_bound_cluster_id)
6367#endif /* __AMP__ */
6368 ) {
6369 pulled_thread = thread;
6370 } else {
			thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
6372 }
6373 thread_unlock(thread);
6374 } else {
6375 pulled_thread = thread_run_queue_remove_for_handoff(thread);
6376 }
6377
6378 return pulled_thread;
6379}
6380
6381/*
6382 * thread_run_queue_remove:
6383 *
6384 * Remove a thread from its current run queue and
6385 * return TRUE if successful.
6386 *
6387 * Thread must be locked.
6388 *
6389 * If thread->runq is PROCESSOR_NULL, the thread will not re-enter the
6390 * run queues because the caller locked the thread. Otherwise
6391 * the thread is on a run queue, but could be chosen for dispatch
6392 * and removed by another processor under a different lock, which
6393 * will set thread->runq to PROCESSOR_NULL.
6394 *
6395 * Hence the thread select path must not rely on anything that could
6396 * be changed under the thread lock after calling this function,
6397 * most importantly thread->sched_pri.
6398 */
6399boolean_t
6400thread_run_queue_remove(
6401 thread_t thread)
6402{
6403 boolean_t removed = FALSE;
6404
6405 if ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT) {
6406 /* Thread isn't runnable */
6407 thread_assert_runq_null(thread);
6408 return FALSE;
6409 }
6410
6411 processor_t processor = thread_get_runq(thread);
6412 if (processor == PROCESSOR_NULL) {
6413 /*
6414 * The thread is either not on the runq,
6415 * or is in the midst of being removed from the runq.
6416 *
6417 * runq is set to NULL under the pset lock, not the thread
6418 * lock, so the thread may still be in the process of being dequeued
6419 * from the runq. It will wait in invoke for the thread lock to be
6420 * dropped.
6421 */
6422
6423 return FALSE;
6424 }
6425
6426 if (thread->sched_pri < BASEPRI_RTQUEUES) {
6427 return SCHED(processor_queue_remove)(processor, thread);
6428 }
6429
6430 processor_set_t pset = processor->processor_set;
6431
6432 pset_lock(pset);
6433
6434 /*
6435 * Must re-read the thread runq after acquiring the pset lock, in
6436 * case another core swooped in before us to dequeue the thread.
6437 */
6438 if (thread_get_runq_locked(thread) != PROCESSOR_NULL) {
6439 /*
6440 * Thread is on the RT run queue and we have a lock on
6441 * that run queue.
6442 */
6443 rt_runq_remove(SCHED(rt_runq)(pset), thread);
6444 pset_update_rt_stealable_state(pset);
6445
6446 removed = TRUE;
6447 }
6448
6449 pset_unlock(pset);
6450
6451 return removed;
6452}
6453
6454/*
6455 * Put the thread back where it goes after a thread_run_queue_remove
6456 *
6457 * Thread must have been removed under the same thread lock hold
6458 *
6459 * thread locked, at splsched
6460 */
6461void
6462thread_run_queue_reinsert(thread_t thread, sched_options_t options)
6463{
6464 thread_assert_runq_null(thread);
6465 assert(thread->state & (TH_RUN));
6466
6467 thread_setrun(thread, options);
6468}
6469
6470void
6471sys_override_cpu_throttle(boolean_t enable_override)
6472{
6473 if (enable_override) {
6474 cpu_throttle_enabled = 0;
6475 } else {
6476 cpu_throttle_enabled = 1;
6477 }
6478}
6479
6480thread_urgency_t
6481thread_get_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2)
6482{
6483 uint64_t urgency_param1 = 0, urgency_param2 = 0;
6484 task_t task = get_threadtask_early(thread);
6485
6486 thread_urgency_t urgency;
6487
6488 if (thread == NULL || task == TASK_NULL || (thread->state & TH_IDLE)) {
6489 urgency_param1 = 0;
6490 urgency_param2 = 0;
6491
6492 urgency = THREAD_URGENCY_NONE;
6493 } else if (thread->sched_mode == TH_MODE_REALTIME) {
6494 urgency_param1 = thread->realtime.period;
6495 urgency_param2 = thread->realtime.deadline;
6496
6497 urgency = THREAD_URGENCY_REAL_TIME;
6498 } else if (cpu_throttle_enabled &&
6499 (thread->sched_pri <= MAXPRI_THROTTLE) &&
6500 (thread->base_pri <= MAXPRI_THROTTLE)) {
6501 /*
6502 * Threads that are running at low priority but are not
6503 * tagged with a specific QoS are separated out from
6504 * the "background" urgency. Performance management
6505 * subsystem can decide to either treat these threads
6506 * as normal threads or look at other signals like thermal
6507 * levels for optimal power/perf tradeoffs for a platform.
6508 */
6509 boolean_t thread_lacks_qos = (proc_get_effective_thread_policy(thread, TASK_POLICY_QOS) == THREAD_QOS_UNSPECIFIED); //thread_has_qos_policy(thread);
6510 boolean_t task_is_suppressed = (proc_get_effective_task_policy(task, TASK_POLICY_SUP_ACTIVE) == 0x1);
6511
6512 /*
6513 * Background urgency applied when thread priority is
6514 * MAXPRI_THROTTLE or lower and thread is not promoted
6515 * and thread has a QoS specified
6516 */
6517 urgency_param1 = thread->sched_pri;
6518 urgency_param2 = thread->base_pri;
6519
6520 if (thread_lacks_qos && !task_is_suppressed) {
6521 urgency = THREAD_URGENCY_LOWPRI;
6522 } else {
6523 urgency = THREAD_URGENCY_BACKGROUND;
6524 }
6525 } else {
6526 /* For otherwise unclassified threads, report throughput QoS parameters */
6527 urgency_param1 = proc_get_effective_thread_policy(thread, TASK_POLICY_THROUGH_QOS);
6528 urgency_param2 = proc_get_effective_task_policy(task, TASK_POLICY_THROUGH_QOS);
6529 urgency = THREAD_URGENCY_NORMAL;
6530 }
6531
6532 if (arg1 != NULL) {
6533 *arg1 = urgency_param1;
6534 }
6535 if (arg2 != NULL) {
6536 *arg2 = urgency_param2;
6537 }
6538
6539 return urgency;
6540}
6541
6542perfcontrol_class_t
6543thread_get_perfcontrol_class(thread_t thread)
6544{
6545 /* Special case handling */
6546 if (thread->state & TH_IDLE) {
6547 return PERFCONTROL_CLASS_IDLE;
6548 }
6549
6550 if (thread->sched_mode == TH_MODE_REALTIME) {
6551 return PERFCONTROL_CLASS_REALTIME;
6552 }
6553
6554 /* perfcontrol_class based on base_pri */
6555 if (thread->base_pri <= MAXPRI_THROTTLE) {
6556 return PERFCONTROL_CLASS_BACKGROUND;
6557 } else if (thread->base_pri <= BASEPRI_UTILITY) {
6558 return PERFCONTROL_CLASS_UTILITY;
6559 } else if (thread->base_pri <= BASEPRI_DEFAULT) {
6560 return PERFCONTROL_CLASS_NONUI;
6561 } else if (thread->base_pri <= BASEPRI_USER_INITIATED) {
6562 return PERFCONTROL_CLASS_USER_INITIATED;
6563 } else if (thread->base_pri <= BASEPRI_FOREGROUND) {
6564 return PERFCONTROL_CLASS_UI;
6565 } else {
6566 if (get_threadtask(thread) == kernel_task) {
6567 /*
6568 * Classify Above UI kernel threads as PERFCONTROL_CLASS_KERNEL.
6569 * All other lower priority kernel threads should be treated
6570 * as regular threads for performance control purposes.
6571 */
6572 return PERFCONTROL_CLASS_KERNEL;
6573 }
6574 return PERFCONTROL_CLASS_ABOVEUI;
6575 }
6576}
6577
6578/*
6579 * This is the processor idle loop, which just looks for other threads
6580 * to execute. Processor idle threads invoke this without supplying a
6581 * current thread to idle without an asserted wait state.
6582 *
 * Returns the next thread to execute if dispatched directly.
6584 */
6585
6586#if 0
6587#define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__)
6588#else
6589#define IDLE_KERNEL_DEBUG_CONSTANT(...) do { } while(0)
6590#endif
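/*
 * Flip the "#if 0" above to "#if 1" to turn IDLE_KERNEL_DEBUG_CONSTANT into
 * real KERNEL_DEBUG_CONSTANT tracepoints when debugging the idle loop.
 */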
6591
6592#if (DEVELOPMENT || DEBUG)
6593int sched_idle_delay_cpuid = -1;
6594#endif
6595
6596thread_t
6597processor_idle(
6598 thread_t thread,
6599 processor_t processor)
6600{
6601 processor_set_t pset = processor->processor_set;
6602 struct recount_snap snap = { 0 };
6603
6604 (void)splsched();
6605
6606 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
6607 MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_START,
6608 (uintptr_t)thread_tid(thread), 0, 0, 0, 0);
6609
6610 SCHED_STATS_INC(idle_transitions);
6611 assert(processor->running_timers_active == false);
6612
	recount_snapshot(&snap);
	recount_processor_idle(&processor->pr_recount, &snap);
6615
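	/*
	 * Stay in this loop until the CPU is needed: the processor leaves the
	 * PROCESSOR_IDLE state, an urgent (or deferred) AST is pending, a
	 * realtime spill is pending, realtime work arrives for a recommended
	 * primary, bound work arrives for this processor, or (for primaries)
	 * the local run queue becomes non-empty after waking from idle.
	 */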
6616 while (1) {
6617 /*
6618 * Ensure that updates to my processor and pset state,
6619 * made by the IPI source processor before sending the IPI,
6620 * are visible on this processor now (even though we don't
6621 * take the pset lock yet).
6622 */
6623 atomic_thread_fence(memory_order_acquire);
6624
6625 if (processor->state != PROCESSOR_IDLE) {
6626 break;
6627 }
6628 if (bit_test(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
6629 break;
6630 }
6631#if defined(CONFIG_SCHED_DEFERRED_AST)
6632 if (bit_test(pset->pending_deferred_AST_cpu_mask, processor->cpu_id)) {
6633 break;
6634 }
6635#endif
6636 if (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) {
6637 break;
6638 }
6639
6640 if (processor->is_recommended && (processor->processor_primary == processor)) {
6641 if (rt_runq_count(pset)) {
6642 break;
6643 }
6644 } else {
6645 if (SCHED(processor_bound_count)(processor)) {
6646 break;
6647 }
6648 }
6649
6650 IDLE_KERNEL_DEBUG_CONSTANT(
6651 MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -1, 0);
6652
6653 machine_track_platform_idle(TRUE);
6654
6655 machine_idle();
6656 /* returns with interrupts enabled */
6657
6658 machine_track_platform_idle(FALSE);
6659
6660#if (DEVELOPMENT || DEBUG)
6661 if (processor->cpu_id == sched_idle_delay_cpuid) {
6662 delay(500);
6663 }
6664#endif
6665
6666 (void)splsched();
6667
6668 atomic_thread_fence(memory_order_acquire);
6669
6670 IDLE_KERNEL_DEBUG_CONSTANT(
6671 MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -2, 0);
6672
6673 /*
6674 * Check if we should call sched_timeshare_consider_maintenance() here.
6675 * The CPU was woken out of idle due to an interrupt and we should do the
6676 * call only if the processor is still idle. If the processor is non-idle,
6677 * the threads running on the processor would do the call as part of
		 * context switching.
6679 */
6680 if (processor->state == PROCESSOR_IDLE) {
			sched_timeshare_consider_maintenance(mach_absolute_time(), true);
6682 }
6683
6684 if (!SCHED(processor_queue_empty)(processor)) {
6685 /* Secondary SMT processors respond to directed wakeups
6686 * exclusively. Some platforms induce 'spurious' SMT wakeups.
6687 */
6688 if (processor->processor_primary == processor) {
6689 break;
6690 }
6691 }
6692 }
6693
	recount_snapshot(&snap);
	recount_processor_run(&processor->pr_recount, &snap);
	smr_cpu_join(processor, snap.rsn_time_mach);
6697
6698 ast_t reason = AST_NONE;
6699
6700 /* We're handling all scheduling AST's */
6701 ast_off(AST_SCHEDULING);
6702
6703 /*
6704 * thread_select will move the processor from dispatching to running,
6705 * or put it in idle if there's nothing to do.
6706 */
6707 thread_t cur_thread = current_thread();
6708
6709 thread_lock(cur_thread);
	thread_t new_thread = thread_select(cur_thread, processor, &reason);
6711 thread_unlock(cur_thread);
6712
6713 assert(processor->running_timers_active == false);
6714
6715 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
6716 MACHDBG_CODE(DBG_MACH_SCHED, MACH_IDLE) | DBG_FUNC_END,
6717 (uintptr_t)thread_tid(thread), processor->state, (uintptr_t)thread_tid(new_thread), reason, 0);
6718
6719 return new_thread;
6720}
6721
6722/*
6723 * Each processor has a dedicated thread which
6724 * executes the idle loop when there is no suitable
6725 * previous context.
6726 *
6727 * This continuation is entered with interrupts disabled.
6728 */
6729void
6730idle_thread(__assert_only void* parameter,
6731 __unused wait_result_t result)
6732{
6733 assert(ml_get_interrupts_enabled() == FALSE);
6734 assert(parameter == NULL);
6735
6736 processor_t processor = current_processor();
6737
	smr_cpu_leave(processor, processor->last_dispatch);
6739
6740 /*
6741 * Ensure that anything running in idle context triggers
6742 * preemption-disabled checks.
6743 */
6744 disable_preemption_without_measurements();
6745
6746 /*
6747 * Enable interrupts temporarily to handle any pending interrupts
6748 * or IPIs before deciding to sleep
6749 */
6750 spllo();
6751
6752 thread_t new_thread = processor_idle(THREAD_NULL, processor);
6753 /* returns with interrupts disabled */
6754
6755 enable_preemption();
6756
6757 if (new_thread != THREAD_NULL) {
		thread_run(processor->idle_thread,
		    idle_thread, NULL, new_thread);
6760 /*NOTREACHED*/
6761 }
6762
	thread_block(idle_thread);
6764 /*NOTREACHED*/
6765}
6766
6767kern_return_t
6768idle_thread_create(
6769 processor_t processor)
6770{
6771 kern_return_t result;
6772 thread_t thread;
6773 spl_t s;
6774 char name[MAXTHREADNAMESIZE];
6775
	result = kernel_thread_create(idle_thread, NULL, MAXPRI_KERNEL, &thread);
6777 if (result != KERN_SUCCESS) {
6778 return result;
6779 }
6780
6781 snprintf(name, sizeof(name), "idle #%d", processor->cpu_id);
	thread_set_thread_name(thread, name);
6783
6784 s = splsched();
6785 thread_lock(thread);
6786 thread->bound_processor = processor;
6787 processor->idle_thread = thread;
6788 thread->sched_pri = thread->base_pri = IDLEPRI;
6789 thread->state = (TH_RUN | TH_IDLE);
6790 thread->options |= TH_OPT_IDLE_THREAD;
6791 thread->last_made_runnable_time = thread->last_basepri_change_time = mach_absolute_time();
6792 thread_unlock(thread);
6793 splx(s);
6794
6795 thread_deallocate(thread);
6796
6797 return KERN_SUCCESS;
6798}
6799
6800static void sched_update_powered_cores_continue(void);
6801
6802/*
6803 * sched_startup:
6804 *
6805 * Kicks off scheduler services.
6806 *
6807 * Called at splsched.
6808 */
6809void
6810sched_startup(void)
6811{
6812 kern_return_t result;
6813 thread_t thread;
6814
6815 simple_lock_init(&sched_vm_group_list_lock, 0);
6816
	result = kernel_thread_start_priority((thread_continue_t)sched_init_thread,
	    NULL, MAXPRI_KERNEL, &thread);
6819 if (result != KERN_SUCCESS) {
6820 panic("sched_startup");
6821 }
6822
6823 thread_deallocate(thread);
6824
6825 assert_thread_magic(thread);
6826
6827 /*
6828 * Yield to the sched_init_thread once, to
6829 * initialize our own thread after being switched
6830 * back to.
6831 *
6832 * The current thread is the only other thread
6833 * active at this point.
6834 */
6835 thread_block(THREAD_CONTINUE_NULL);
6836
	result = kernel_thread_start_priority((thread_continue_t)sched_update_powered_cores_continue,
	    NULL, MAXPRI_KERNEL, &thread);
6839 if (result != KERN_SUCCESS) {
6840 panic("sched_startup");
6841 }
6842
6843 thread_deallocate(thread);
6844
6845 assert_thread_magic(thread);
6846}
6847
6848#if __arm64__
6849static _Atomic uint64_t sched_perfcontrol_callback_deadline;
6850#endif /* __arm64__ */
6851
6852
6853#if defined(CONFIG_SCHED_TIMESHARE_CORE)
6854
6855static volatile uint64_t sched_maintenance_deadline;
6856static uint64_t sched_tick_last_abstime;
6857static uint64_t sched_tick_delta;
6858uint64_t sched_tick_max_delta;
6859
6860
/*
 * sched_timeshare_maintenance_continue:
 *
 * Perform periodic bookkeeping functions about ten
 * times per second.
 */
6867void
6868sched_timeshare_maintenance_continue(void)
6869{
6870 uint64_t sched_tick_ctime, late_time;
6871
6872 struct sched_update_scan_context scan_context = {
6873 .earliest_bg_make_runnable_time = UINT64_MAX,
6874 .earliest_normal_make_runnable_time = UINT64_MAX,
6875 .earliest_rt_make_runnable_time = UINT64_MAX
6876 };
6877
6878 sched_tick_ctime = mach_absolute_time();
6879
6880 if (__improbable(sched_tick_last_abstime == 0)) {
6881 sched_tick_last_abstime = sched_tick_ctime;
6882 late_time = 0;
6883 sched_tick_delta = 1;
6884 } else {
6885 late_time = sched_tick_ctime - sched_tick_last_abstime;
6886 sched_tick_delta = late_time / sched_tick_interval;
		/* Ensure a delta of at least 1, since the interval could be
		 * slightly smaller than the sched_tick_interval due to dispatch
		 * latencies.
		 */
6891 sched_tick_delta = MAX(sched_tick_delta, 1);
6892
6893 /* In the event interrupt latencies or platform
6894 * idle events that advanced the timebase resulted
6895 * in periods where no threads were dispatched,
6896 * cap the maximum "tick delta" at SCHED_TICK_MAX_DELTA
6897 * iterations.
6898 */
6899 sched_tick_delta = MIN(sched_tick_delta, SCHED_TICK_MAX_DELTA);
6900
6901 sched_tick_last_abstime = sched_tick_ctime;
6902 sched_tick_max_delta = MAX(sched_tick_delta, sched_tick_max_delta);
6903 }
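	/*
	 * Example: if 2.5 sched_tick_intervals have elapsed since the last
	 * pass, late_time / sched_tick_interval yields 2 pseudo-ticks, which
	 * is then clamped to the range [1, SCHED_TICK_MAX_DELTA].
	 */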
6904
6905 scan_context.sched_tick_last_abstime = sched_tick_last_abstime;
6906 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_START,
6907 sched_tick_delta, late_time, 0, 0, 0);
6908
	/* Add a number of pseudo-ticks corresponding to the elapsed interval.
	 * This can be greater than 1 when substantial intervals pass with
	 * all processors idle, which rarely happens in practice.
	 */
6913
6914 sched_tick += sched_tick_delta;
6915
6916 update_vm_info();
6917
6918 /*
6919 * Compute various averages.
6920 */
6921 compute_averages(sched_tick_delta);
6922
6923 /*
6924 * Scan the run queues for threads which
6925 * may need to be updated, and find the earliest runnable thread on the runqueue
6926 * to report its latency.
6927 */
6928 SCHED(thread_update_scan)(&scan_context);
6929
6930 SCHED(rt_runq_scan)(&scan_context);
6931
6932 uint64_t ctime = mach_absolute_time();
6933
6934 uint64_t bg_max_latency = (ctime > scan_context.earliest_bg_make_runnable_time) ?
6935 ctime - scan_context.earliest_bg_make_runnable_time : 0;
6936
6937 uint64_t default_max_latency = (ctime > scan_context.earliest_normal_make_runnable_time) ?
6938 ctime - scan_context.earliest_normal_make_runnable_time : 0;
6939
6940 uint64_t realtime_max_latency = (ctime > scan_context.earliest_rt_make_runnable_time) ?
6941 ctime - scan_context.earliest_rt_make_runnable_time : 0;
6942
6943 machine_max_runnable_latency(bg_max_latency, default_max_latency, realtime_max_latency);
6944
6945 /*
6946 * Check to see if the special sched VM group needs attention.
6947 */
6948 sched_vm_group_maintenance();
6949
6950#if __arm64__
6951 /* Check to see if the recommended cores failsafe is active */
6952 sched_recommended_cores_maintenance();
6953#endif /* __arm64__ */
6954
6955
6956#if DEBUG || DEVELOPMENT
6957#if __x86_64__
6958#include <i386/misc_protos.h>
6959 /* Check for long-duration interrupts */
6960 mp_interrupt_watchdog();
6961#endif /* __x86_64__ */
6962#endif /* DEBUG || DEVELOPMENT */
6963
6964 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_END,
6965 sched_pri_shifts[TH_BUCKET_SHARE_FG], sched_pri_shifts[TH_BUCKET_SHARE_BG],
6966 sched_pri_shifts[TH_BUCKET_SHARE_UT], sched_pri_shifts[TH_BUCKET_SHARE_DF], 0);
6967
	assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT);
	thread_block((thread_continue_t)sched_timeshare_maintenance_continue);
6970 /*NOTREACHED*/
6971}
6972
6973static uint64_t sched_maintenance_wakeups;
6974
6975/*
6976 * Determine if the set of routines formerly driven by a maintenance timer
6977 * must be invoked, based on a deadline comparison. Signals the scheduler
6978 * maintenance thread on deadline expiration. Must be invoked at an interval
6979 * lower than the "sched_tick_interval", currently accomplished by
6980 * invocation via the quantum expiration timer and at context switch time.
6981 * Performance matters: this routine reuses a timestamp approximating the
6982 * current absolute time received from the caller, and should perform
6983 * no more than a comparison against the deadline in the common case.
6984 */
6985void
6986sched_timeshare_consider_maintenance(uint64_t ctime, bool safe_point)
6987{
6988 uint64_t deadline = sched_maintenance_deadline;
6989
6990 if (__improbable(ctime >= deadline)) {
6991 if (__improbable(current_thread() == sched_maintenance_thread)) {
6992 return;
6993 }
6994 OSMemoryBarrier();
6995
6996 uint64_t ndeadline = ctime + sched_tick_interval;
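		/*
		 * Only one of the racing callers will win the compare-and-swap
		 * below; that caller advances the deadline and wakes the
		 * maintenance thread, the others simply fall through.
		 */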
6997
6998 if (__probable(os_atomic_cmpxchg(&sched_maintenance_deadline, deadline, ndeadline, seq_cst))) {
6999 thread_wakeup((event_t)sched_timeshare_maintenance_continue);
7000 sched_maintenance_wakeups++;
7001 smr_maintenance(ctime);
7002 }
7003 }
7004
7005 smr_cpu_tick(ctime, safe_point);
7006
7007#if !CONFIG_SCHED_CLUTCH
	/*
	 * Only non-clutch schedulers use the global EWMA load calculation. For the
	 * clutch scheduler, load is maintained at the thread-group and bucket level.
	 */
7012 uint64_t load_compute_deadline = os_atomic_load_wide(&sched_load_compute_deadline, relaxed);
7013
7014 if (__improbable(load_compute_deadline && ctime >= load_compute_deadline)) {
7015 uint64_t new_deadline = 0;
7016 if (os_atomic_cmpxchg(&sched_load_compute_deadline, load_compute_deadline, new_deadline, relaxed)) {
7017 compute_sched_load();
7018 new_deadline = ctime + sched_load_compute_interval_abs;
7019 os_atomic_store_wide(&sched_load_compute_deadline, new_deadline, relaxed);
7020 }
7021 }
7022#endif /* CONFIG_SCHED_CLUTCH */
7023
7024#if __arm64__
7025 uint64_t perf_deadline = os_atomic_load(&sched_perfcontrol_callback_deadline, relaxed);
7026
7027 if (__improbable(perf_deadline && ctime >= perf_deadline)) {
7028 /* CAS in 0, if success, make callback. Otherwise let the next context switch check again. */
7029 if (os_atomic_cmpxchg(&sched_perfcontrol_callback_deadline, perf_deadline, 0, relaxed)) {
			machine_perfcontrol_deadline_passed(perf_deadline);
7031 }
7032 }
7033#endif /* __arm64__ */
7034}
7035
7036#endif /* CONFIG_SCHED_TIMESHARE_CORE */
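
/*
 * sched_init_thread:
 *
 * Entry point for the scheduler maintenance thread: blocks once, names
 * and records itself as sched_maintenance_thread, then enters the
 * scheduler's maintenance continuation, which never returns.
 */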
7037
7038void
7039sched_init_thread(void)
7040{
7041 thread_block(THREAD_CONTINUE_NULL);
7042
7043 thread_t thread = current_thread();
7044
	thread_set_thread_name(thread, "sched_maintenance_thread");
7046
7047 sched_maintenance_thread = thread;
7048
7049 SCHED(maintenance_continuation)();
7050
7051 /*NOTREACHED*/
7052}
7053
7054#if defined(CONFIG_SCHED_TIMESHARE_CORE)
7055
7056/*
7057 * thread_update_scan / runq_scan:
7058 *
7059 * Scan the run queues to account for timesharing threads
7060 * which need to be updated.
7061 *
7062 * Scanner runs in two passes. Pass one squirrels likely
7063 * threads away in an array, pass two does the update.
7064 *
7065 * This is necessary because the run queue is locked for
7066 * the candidate scan, but the thread is locked for the update.
7067 *
7068 * Array should be sized to make forward progress, without
7069 * disabling preemption for long periods.
7070 */
7071
7072#define THREAD_UPDATE_SIZE 128
7073
7074static thread_t thread_update_array[THREAD_UPDATE_SIZE];
7075static uint32_t thread_update_count = 0;
7076
7077/* Returns TRUE if thread was added, FALSE if thread_update_array is full */
7078boolean_t
7079thread_update_add_thread(thread_t thread)
7080{
7081 if (thread_update_count == THREAD_UPDATE_SIZE) {
7082 return FALSE;
7083 }
7084
7085 thread_update_array[thread_update_count++] = thread;
7086 thread_reference(thread);
7087 return TRUE;
7088}
7089
7090void
7091thread_update_process_threads(void)
7092{
7093 assert(thread_update_count <= THREAD_UPDATE_SIZE);
7094
7095 for (uint32_t i = 0; i < thread_update_count; i++) {
7096 thread_t thread = thread_update_array[i];
7097 assert_thread_magic(thread);
7098 thread_update_array[i] = THREAD_NULL;
7099
7100 spl_t s = splsched();
7101 thread_lock(thread);
7102 if (!(thread->state & (TH_WAIT)) && thread->sched_stamp != sched_tick) {
7103 SCHED(update_priority)(thread);
7104 }
7105 thread_unlock(thread);
7106 splx(s);
7107
7108 thread_deallocate(thread);
7109 }
7110
7111 thread_update_count = 0;
7112}
7113
7114static boolean_t
7115runq_scan_thread(
7116 thread_t thread,
7117 sched_update_scan_context_t scan_context)
7118{
7119 assert_thread_magic(thread);
7120
7121 if (thread->sched_stamp != sched_tick &&
7122 thread->sched_mode == TH_MODE_TIMESHARE) {
7123 if (thread_update_add_thread(thread) == FALSE) {
7124 return TRUE;
7125 }
7126 }
7127
7128 if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
7129 if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) {
7130 scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time;
7131 }
7132 } else {
7133 if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) {
7134 scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time;
7135 }
7136 }
7137
7138 return FALSE;
7139}
7140
7141/*
7142 * Scan a runq for candidate threads.
7143 *
7144 * Returns TRUE if retry is needed.
7145 */
7146boolean_t
7147runq_scan(
7148 run_queue_t runq,
7149 sched_update_scan_context_t scan_context)
7150{
7151 int count = runq->count;
7152 int queue_index;
7153
7154 assert(count >= 0);
7155
7156 if (count == 0) {
7157 return FALSE;
7158 }
7159
	for (queue_index = bitmap_first(runq->bitmap, NRQS);
	    queue_index >= 0;
	    queue_index = bitmap_next(runq->bitmap, queue_index)) {
7163 thread_t thread;
7164 circle_queue_t queue = &runq->queues[queue_index];
7165
7166 cqe_foreach_element(thread, queue, runq_links) {
7167 assert(count > 0);
7168 if (runq_scan_thread(thread, scan_context) == TRUE) {
7169 return TRUE;
7170 }
7171 count--;
7172 }
7173 }
7174
7175 return FALSE;
7176}
7177
7178#if CONFIG_SCHED_CLUTCH
7179
7180boolean_t
7181sched_clutch_timeshare_scan(
7182 queue_t thread_queue,
7183 uint16_t thread_count,
7184 sched_update_scan_context_t scan_context)
7185{
7186 if (thread_count == 0) {
7187 return FALSE;
7188 }
7189
7190 thread_t thread;
7191 qe_foreach_element_safe(thread, thread_queue, th_clutch_timeshare_link) {
7192 if (runq_scan_thread(thread, scan_context) == TRUE) {
7193 return TRUE;
7194 }
7195 thread_count--;
7196 }
7197
7198 assert(thread_count == 0);
7199 return FALSE;
7200}
7201
7202
7203#endif /* CONFIG_SCHED_CLUTCH */
7204
7205#endif /* CONFIG_SCHED_TIMESHARE_CORE */
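
/*
 * Eager-preemption flag handling. Setting TH_SFLAG_EAGERPREEMPT on the
 * current thread re-runs csw_check() and blocks right away if that returns
 * a pending AST; setting it on a thread running on another processor sends
 * that processor an AST check so it notices the change promptly.
 */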
7206
7207bool
7208thread_is_eager_preempt(thread_t thread)
7209{
7210 return thread->sched_flags & TH_SFLAG_EAGERPREEMPT;
7211}
7212
7213void
7214thread_set_eager_preempt(thread_t thread)
7215{
7216 spl_t s = splsched();
7217 thread_lock(thread);
7218
7219 assert(!thread_is_eager_preempt(thread));
7220
7221 thread->sched_flags |= TH_SFLAG_EAGERPREEMPT;
7222
7223 if (thread == current_thread()) {
7224 /* csw_check updates current_is_eagerpreempt on the processor */
		ast_t ast = csw_check(thread, current_processor(), AST_NONE);
7226
7227 thread_unlock(thread);
7228
7229 if (ast != AST_NONE) {
			thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
7231 }
7232 } else {
7233 processor_t last_processor = thread->last_processor;
7234
7235 if (last_processor != PROCESSOR_NULL &&
7236 last_processor->state == PROCESSOR_RUNNING &&
7237 last_processor->active_thread == thread) {
			cause_ast_check(last_processor);
7239 }
7240
7241 thread_unlock(thread);
7242 }
7243
7244 splx(s);
7245}
7246
7247void
7248thread_clear_eager_preempt(thread_t thread)
7249{
7250 spl_t s = splsched();
7251 thread_lock(thread);
7252
7253 assert(thread_is_eager_preempt(thread));
7254
7255 thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT;
7256
7257 if (thread == current_thread()) {
7258 current_processor()->current_is_eagerpreempt = false;
7259 }
7260
7261 thread_unlock(thread);
7262 splx(s);
7263}
7264
7265/*
7266 * Scheduling statistics
7267 */
7268void
7269sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri)
7270{
7271 struct sched_statistics *stats;
7272 boolean_t to_realtime = FALSE;
7273
7274 stats = PERCPU_GET_RELATIVE(sched_stats, processor, processor);
7275 stats->csw_count++;
7276
7277 if (otherpri >= BASEPRI_REALTIME) {
7278 stats->rt_sched_count++;
7279 to_realtime = TRUE;
7280 }
7281
7282 if ((reasons & AST_PREEMPT) != 0) {
7283 stats->preempt_count++;
7284
7285 if (selfpri >= BASEPRI_REALTIME) {
7286 stats->preempted_rt_count++;
7287 }
7288
7289 if (to_realtime) {
7290 stats->preempted_by_rt_count++;
7291 }
7292 }
7293}
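
/*
 * Accumulate the time-weighted run queue depth: the queue spent
 * (timestamp - last_change_timestamp) at a depth of old_count, so add that
 * product to count_sum before recording the new change time.
 */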
7294
7295void
7296sched_stats_handle_runq_change(struct runq_stats *stats, int old_count)
7297{
7298 uint64_t timestamp = mach_absolute_time();
7299
7300 stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count;
7301 stats->last_change_timestamp = timestamp;
7302}
7303
7304/*
7305 * For calls from assembly code
7306 */
7307#undef thread_wakeup
7308void
7309thread_wakeup(
7310 event_t x);
7311
7312void
7313thread_wakeup(
7314 event_t x)
7315{
7316 thread_wakeup_with_result(x, THREAD_AWAKENED);
7317}
7318
7319boolean_t
7320preemption_enabled(void)
7321{
7322 return get_preemption_level() == 0 && ml_get_interrupts_enabled();
7323}
7324
7325static void
7326sched_timer_deadline_tracking_init(void)
7327{
	nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT, &timer_deadline_tracking_bin_1);
	nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2);
7330}
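
/*
 * Requested powered-core state. latest_requested_* hold the most recent
 * request handed to the scheduler; current_requested_powered_cores is the
 * mask most recently applied by sched_update_powered_cores_continue().
 * These fields are read and written under sched_available_cores_lock,
 * while cluster_powerdown_lock serializes the actual power transitions.
 */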
7331
7332static uint64_t latest_requested_powered_cores = ALL_CORES_POWERED;
7333processor_reason_t latest_requested_reason = REASON_NONE;
7334static uint64_t current_requested_powered_cores = ALL_CORES_POWERED;
7335bool perfcontrol_sleep_override = false;
7336
7337LCK_GRP_DECLARE(cluster_powerdown_grp, "cluster_powerdown");
7338LCK_MTX_DECLARE(cluster_powerdown_lock, &cluster_powerdown_grp);
7339int32_t cluster_powerdown_suspend_count = 0;
7340
7341bool
7342sched_is_in_sleep(void)
7343{
7344 os_atomic_thread_fence(acquire);
7345 return perfcontrol_sleep_override;
7346}
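
/*
 * Continuation for the powered-cores update thread. Repeatedly applies the
 * latest requested powered-core mask until the request stops changing, then
 * blocks until sched_perfcontrol_update_powered_cores() wakes it again.
 * The actual power transitions are skipped while cluster powerdown is
 * suspended or the system is entering sleep.
 */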
7347
7348static void
7349sched_update_powered_cores_continue(void)
7350{
	lck_mtx_lock(&cluster_powerdown_lock);
7352
7353 if (!cluster_powerdown_suspend_count) {
7354 spl_t s = splsched();
7355 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7356
7357 uint64_t latest = latest_requested_powered_cores;
7358 processor_reason_t reason = latest_requested_reason;
7359 uint64_t current = current_requested_powered_cores;
7360 current_requested_powered_cores = latest;
7361 bool in_sleep = perfcontrol_sleep_override;
7362
7363 simple_unlock(&sched_available_cores_lock);
7364 splx(s);
7365
7366 while (latest != current) {
7367 if (!in_sleep) {
7368 assert((reason == REASON_CLPC_SYSTEM) || (reason == REASON_CLPC_USER));
				sched_update_powered_cores(latest, reason, SHUTDOWN_TEMPORARY | WAIT_FOR_LAST_START);
7370 }
7371
7372 s = splsched();
7373 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7374
7375 latest = latest_requested_powered_cores;
7376 reason = latest_requested_reason;
7377 current = current_requested_powered_cores;
7378 current_requested_powered_cores = latest;
7379 in_sleep = perfcontrol_sleep_override;
7380
7381 simple_unlock(&sched_available_cores_lock);
7382 splx(s);
7383 }
7384
		assert_wait((event_t)sched_update_powered_cores_continue, THREAD_UNINT);
7386
7387 s = splsched();
7388 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7389 if (latest_requested_powered_cores != current_requested_powered_cores) {
			clear_wait(current_thread(), THREAD_AWAKENED);
7391 }
7392 simple_unlock(&sched_available_cores_lock);
7393 splx(s);
7394 }
7395
	lck_mtx_unlock(&cluster_powerdown_lock);

	thread_block((thread_continue_t)sched_update_powered_cores_continue);
7399 /*NOTREACHED*/
7400}
7401
7402void
7403sched_perfcontrol_update_powered_cores(uint64_t requested_powered_cores, processor_reason_t reason, __unused uint32_t flags)
7404{
7405 assert((reason == REASON_CLPC_SYSTEM) || (reason == REASON_CLPC_USER));
7406
7407#if DEVELOPMENT || DEBUG
7408 if (flags & (ASSERT_IN_SLEEP | ASSERT_POWERDOWN_SUSPENDED)) {
7409 if (flags & ASSERT_POWERDOWN_SUSPENDED) {
7410 assert(cluster_powerdown_suspend_count > 0);
7411 }
7412 if (flags & ASSERT_IN_SLEEP) {
7413 assert(perfcontrol_sleep_override == true);
7414 }
7415 return;
7416 }
7417#endif
7418
7419 spl_t s = splsched();
7420 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7421
7422 bool should_wakeup = !cluster_powerdown_suspend_count;
7423 if (should_wakeup) {
7424 latest_requested_powered_cores = requested_powered_cores;
7425 latest_requested_reason = reason;
7426 }
7427
7428 simple_unlock(&sched_available_cores_lock);
7429 splx(s);
7430
7431 if (should_wakeup) {
		thread_wakeup((event_t)sched_update_powered_cores_continue);
7433 }
7434}
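
/*
 * Temporarily prevent cluster powerdown. The first suspend resets the
 * requested state to ALL_CORES_POWERED and powers every core back up with
 * the state locked; later CLPC requests are ignored until the matching
 * resume_cluster_powerdown() drops the last suspend reference.
 */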
7435
7436void
7437suspend_cluster_powerdown(void)
7438{
	lck_mtx_lock(&cluster_powerdown_lock);
7440
7441 assert(cluster_powerdown_suspend_count >= 0);
7442
7443 bool first_suspend = (cluster_powerdown_suspend_count == 0);
7444 if (first_suspend) {
7445 spl_t s = splsched();
7446 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7447 latest_requested_powered_cores = ALL_CORES_POWERED;
7448 current_requested_powered_cores = ALL_CORES_POWERED;
7449 latest_requested_reason = REASON_SYSTEM;
7450 simple_unlock(&sched_available_cores_lock);
7451 splx(s);
7452 }
7453
7454 cluster_powerdown_suspend_count++;
7455
7456 if (first_suspend) {
		kprintf("%s>calling sched_update_powered_cores(ALL_CORES_POWERED, REASON_SYSTEM, LOCK_STATE | WAIT_FOR_START)\n", __FUNCTION__);
		sched_update_powered_cores(ALL_CORES_POWERED, REASON_SYSTEM, LOCK_STATE | WAIT_FOR_START);
7459 }
7460
	lck_mtx_unlock(&cluster_powerdown_lock);
7462}
7463
7464void
7465resume_cluster_powerdown(void)
7466{
	lck_mtx_lock(&cluster_powerdown_lock);
7468
7469 if (cluster_powerdown_suspend_count <= 0) {
7470 panic("resume_cluster_powerdown() called with cluster_powerdown_suspend_count=%d\n", cluster_powerdown_suspend_count);
7471 }
7472
7473 cluster_powerdown_suspend_count--;
7474
7475 bool last_resume = (cluster_powerdown_suspend_count == 0);
7476
7477 if (last_resume) {
7478 spl_t s = splsched();
7479 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7480 latest_requested_powered_cores = ALL_CORES_POWERED;
7481 current_requested_powered_cores = ALL_CORES_POWERED;
7482 latest_requested_reason = REASON_SYSTEM;
7483 simple_unlock(&sched_available_cores_lock);
7484 splx(s);
7485
		kprintf("%s>calling sched_update_powered_cores(ALL_CORES_POWERED, REASON_SYSTEM, UNLOCK_STATE)\n", __FUNCTION__);
		sched_update_powered_cores(ALL_CORES_POWERED, REASON_SYSTEM, UNLOCK_STATE);
7488 }
7489
	lck_mtx_unlock(&cluster_powerdown_lock);
7491}
7492
7493LCK_MTX_DECLARE(user_cluster_powerdown_lock, &cluster_powerdown_grp);
7494static bool user_suspended_cluster_powerdown = false;
7495
7496kern_return_t
7497suspend_cluster_powerdown_from_user(void)
7498{
7499 kern_return_t ret = KERN_FAILURE;
7500
	lck_mtx_lock(&user_cluster_powerdown_lock);
7502
7503 if (!user_suspended_cluster_powerdown) {
7504 suspend_cluster_powerdown();
7505 user_suspended_cluster_powerdown = true;
7506 ret = KERN_SUCCESS;
7507 }
7508
	lck_mtx_unlock(&user_cluster_powerdown_lock);
7510
7511 return ret;
7512}
7513
7514kern_return_t
7515resume_cluster_powerdown_from_user(void)
7516{
7517 kern_return_t ret = KERN_FAILURE;
7518
	lck_mtx_lock(&user_cluster_powerdown_lock);
7520
7521 if (user_suspended_cluster_powerdown) {
7522 resume_cluster_powerdown();
7523 user_suspended_cluster_powerdown = false;
7524 ret = KERN_SUCCESS;
7525 }
7526
	lck_mtx_unlock(&user_cluster_powerdown_lock);
7528
7529 return ret;
7530}
7531
7532int
7533get_cluster_powerdown_user_suspended(void)
7534{
	lck_mtx_lock(&user_cluster_powerdown_lock);
7536
7537 int ret = (int)user_suspended_cluster_powerdown;
7538
	lck_mtx_unlock(&user_cluster_powerdown_lock);
7540
7541 return ret;
7542}
7543
7544#if DEVELOPMENT || DEBUG
7545/* Functions to support the temporary sysctl */
7546static uint64_t saved_requested_powered_cores = ALL_CORES_POWERED;
7547void
7548sched_set_powered_cores(int requested_powered_cores)
7549{
7550 processor_reason_t reason = bit_test(requested_powered_cores, 31) ? REASON_CLPC_USER : REASON_CLPC_SYSTEM;
7551 uint32_t flags = requested_powered_cores & 0x30000000;
7552
7553 saved_requested_powered_cores = requested_powered_cores;
7554
7555 requested_powered_cores = bits(requested_powered_cores, 28, 0);
7556
7557 sched_perfcontrol_update_powered_cores(requested_powered_cores, reason, flags);
7558}
7559int
7560sched_get_powered_cores(void)
7561{
7562 return (int)saved_requested_powered_cores;
7563}
7564#endif
7565
7566/*
7567 * Ensure that all cores are powered and recommended before sleep
7568 */
7569void
7570sched_override_available_cores_for_sleep(void)
7571{
7572 spl_t s = splsched();
7573 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7574
7575 if (perfcontrol_sleep_override == false) {
7576 perfcontrol_sleep_override = true;
7577#if __arm__ || __arm64__
		sched_update_recommended_cores(ALL_CORES_RECOMMENDED, REASON_SYSTEM, 0);
7579#endif
7580 }
7581
7582 simple_unlock(&sched_available_cores_lock);
7583 splx(s);
7584
7585 suspend_cluster_powerdown();
7586}
7587
7588/*
7589 * Restore the previously recommended cores, but leave all cores powered
7590 * after sleep
7591 */
7592void
7593sched_restore_available_cores_after_sleep(void)
7594{
7595 spl_t s = splsched();
7596 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7597
7598 if (perfcontrol_sleep_override == true) {
7599 perfcontrol_sleep_override = false;
7600#if __arm__ || __arm64__
		sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores,
		    REASON_NONE, 0);
7603#endif
7604 }
7605
7606 simple_unlock(&sched_available_cores_lock);
7607 splx(s);
7608
7609 resume_cluster_powerdown();
7610}
7611
7612#if __arm__ || __arm64__
7613
7614uint32_t perfcontrol_requested_recommended_core_count = MAX_CPUS;
7615bool perfcontrol_failsafe_active = false;
7616
7617uint64_t perfcontrol_failsafe_maintenance_runnable_time;
7618uint64_t perfcontrol_failsafe_activation_time;
7619uint64_t perfcontrol_failsafe_deactivation_time;
7620
7621/* data covering who likely caused it and how long they ran */
7622#define FAILSAFE_NAME_LEN 33 /* (2*MAXCOMLEN)+1 from size of p_name */
7623char perfcontrol_failsafe_name[FAILSAFE_NAME_LEN];
7624int perfcontrol_failsafe_pid;
7625uint64_t perfcontrol_failsafe_tid;
7626uint64_t perfcontrol_failsafe_thread_timer_at_start;
7627uint64_t perfcontrol_failsafe_thread_timer_last_seen;
7628uint64_t perfcontrol_failsafe_recommended_at_trigger;
7629
7630/*
7631 * Perf controller calls here to update the recommended core bitmask.
7632 * If the failsafe is active, we don't immediately apply the new value.
7633 * Instead, we store the new request and use it after the failsafe deactivates.
7634 *
7635 * If the failsafe is not active, immediately apply the update.
7636 *
7637 * No scheduler locks are held, no other locks are held that scheduler might depend on,
7638 * interrupts are enabled
7639 *
7640 * currently prototype is in osfmk/arm/machine_routines.h
7641 */
7642void
7643sched_perfcontrol_update_recommended_cores_reason(uint64_t recommended_cores, processor_reason_t reason, uint32_t flags)
7644{
7645 assert(preemption_enabled());
7646
7647 spl_t s = splsched();
7648 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7649
7650 if (reason == REASON_CLPC_SYSTEM) {
7651 perfcontrol_system_requested_recommended_cores = recommended_cores;
7652 } else {
7653 assert(reason == REASON_CLPC_USER);
7654 perfcontrol_user_requested_recommended_cores = recommended_cores;
7655 }
7656
7657 perfcontrol_requested_recommended_cores = perfcontrol_system_requested_recommended_cores & perfcontrol_user_requested_recommended_cores;
7658 perfcontrol_requested_recommended_core_count = __builtin_popcountll(perfcontrol_requested_recommended_cores);
7659
7660 if ((perfcontrol_failsafe_active == false) && (perfcontrol_sleep_override == false)) {
		sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores, reason, flags);
7662 } else {
7663 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
7664 MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_NONE,
7665 perfcontrol_requested_recommended_cores,
7666 sched_maintenance_thread->last_made_runnable_time, 0, 0, 0);
7667 }
7668
7669 simple_unlock(&sched_available_cores_lock);
7670 splx(s);
7671}
7672
7673void
7674sched_perfcontrol_update_recommended_cores(uint32_t recommended_cores)
7675{
	sched_perfcontrol_update_recommended_cores_reason(recommended_cores, REASON_CLPC_USER, 0);
7677}
7678
7679/*
7680 * Consider whether we need to activate the recommended cores failsafe
7681 *
7682 * Called from quantum timer interrupt context of a realtime thread
7683 * No scheduler locks are held, interrupts are disabled
7684 */
7685void
7686sched_consider_recommended_cores(uint64_t ctime, thread_t cur_thread)
7687{
7688 /*
7689 * Check if a realtime thread is starving the system
7690 * and bringing up non-recommended cores would help
7691 *
7692 * TODO: Is this the correct check for recommended == possible cores?
7693 * TODO: Validate the checks without the relevant lock are OK.
7694 */
7695
7696 if (__improbable(perfcontrol_failsafe_active == TRUE)) {
7697 /* keep track of how long the responsible thread runs */
7698 uint64_t cur_th_time = recount_current_thread_time_mach();
7699
7700 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7701
7702 if (perfcontrol_failsafe_active == TRUE &&
7703 cur_thread->thread_id == perfcontrol_failsafe_tid) {
7704 perfcontrol_failsafe_thread_timer_last_seen = cur_th_time;
7705 }
7706
7707 simple_unlock(&sched_available_cores_lock);
7708
7709 /* we're already trying to solve the problem, so bail */
7710 return;
7711 }
7712
7713 /* The failsafe won't help if there are no more processors to enable */
7714 if (__probable(perfcontrol_requested_recommended_core_count >= processor_count)) {
7715 return;
7716 }
7717
7718 uint64_t too_long_ago = ctime - perfcontrol_failsafe_starvation_threshold;
7719
7720 /* Use the maintenance thread as our canary in the coal mine */
7721 thread_t m_thread = sched_maintenance_thread;
7722
7723 /* If it doesn't look bad, nothing to see here */
7724 if (__probable(m_thread->last_made_runnable_time >= too_long_ago)) {
7725 return;
7726 }
7727
7728 /* It looks bad, take the lock to be sure */
7729 thread_lock(m_thread);
7730
	if (thread_get_runq(m_thread) == PROCESSOR_NULL ||
7732 (m_thread->state & (TH_RUN | TH_WAIT)) != TH_RUN ||
7733 m_thread->last_made_runnable_time >= too_long_ago) {
7734 /*
7735 * Maintenance thread is either on cpu or blocked, and
7736 * therefore wouldn't benefit from more cores
7737 */
7738 thread_unlock(m_thread);
7739 return;
7740 }
7741
7742 uint64_t maintenance_runnable_time = m_thread->last_made_runnable_time;
7743
7744 thread_unlock(m_thread);
7745
7746 /*
7747 * There are cores disabled at perfcontrol's recommendation, but the
7748 * system is so overloaded that the maintenance thread can't run.
7749 * That likely means that perfcontrol can't run either, so it can't fix
7750 * the recommendation. We have to kick in a failsafe to keep from starving.
7751 *
7752 * When the maintenance thread has been starved for too long,
7753 * ignore the recommendation from perfcontrol and light up all the cores.
7754 *
7755 * TODO: Consider weird states like boot, sleep, or debugger
7756 */
7757
7758 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7759
7760 if (perfcontrol_failsafe_active == TRUE) {
7761 simple_unlock(&sched_available_cores_lock);
7762 return;
7763 }
7764
7765 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
7766 MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_START,
7767 perfcontrol_requested_recommended_cores, maintenance_runnable_time, 0, 0, 0);
7768
7769 perfcontrol_failsafe_active = TRUE;
7770 perfcontrol_failsafe_activation_time = mach_absolute_time();
7771 perfcontrol_failsafe_maintenance_runnable_time = maintenance_runnable_time;
7772 perfcontrol_failsafe_recommended_at_trigger = perfcontrol_requested_recommended_cores;
7773
7774 /* Capture some data about who screwed up (assuming that the thread on core is at fault) */
7775 task_t task = get_threadtask(cur_thread);
7776 perfcontrol_failsafe_pid = task_pid(task);
	strlcpy(perfcontrol_failsafe_name, proc_name_address(get_bsdtask_info(task)), sizeof(perfcontrol_failsafe_name));
7778
7779 perfcontrol_failsafe_tid = cur_thread->thread_id;
7780
7781 /* Blame the thread for time it has run recently */
7782 uint64_t recent_computation = (ctime - cur_thread->computation_epoch) + cur_thread->computation_metered;
7783
7784 uint64_t last_seen = recount_current_thread_time_mach();
7785
7786 /* Compute the start time of the bad behavior in terms of the thread's on core time */
7787 perfcontrol_failsafe_thread_timer_at_start = last_seen - recent_computation;
7788 perfcontrol_failsafe_thread_timer_last_seen = last_seen;
7789
7790 /* Ignore the previously recommended core configuration */
	sched_update_recommended_cores(ALL_CORES_RECOMMENDED, REASON_SYSTEM, 0);
7792
7793 simple_unlock(&sched_available_cores_lock);
7794}
7795
7796/*
7797 * Now that our bacon has been saved by the failsafe, consider whether to turn it off
7798 *
7799 * Runs in the context of the maintenance thread, no locks held
7800 */
7801static void
7802sched_recommended_cores_maintenance(void)
7803{
7804 /* Common case - no failsafe, nothing to be done here */
7805 if (__probable(perfcontrol_failsafe_active == FALSE)) {
7806 return;
7807 }
7808
7809 uint64_t ctime = mach_absolute_time();
7810
7811 boolean_t print_diagnostic = FALSE;
7812 char p_name[FAILSAFE_NAME_LEN] = "";
7813
7814 spl_t s = splsched();
7815 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7816
7817 /* Check again, under the lock, to avoid races */
7818 if (perfcontrol_failsafe_active == FALSE) {
7819 goto out;
7820 }
7821
7822 /*
7823 * Ensure that the other cores get another few ticks to run some threads
7824 * If we don't have this hysteresis, the maintenance thread is the first
7825 * to run, and then it immediately kills the other cores
7826 */
7827 if ((ctime - perfcontrol_failsafe_activation_time) < perfcontrol_failsafe_starvation_threshold) {
7828 goto out;
7829 }
7830
7831 /* Capture some diagnostic state under the lock so we can print it out later */
7832
7833 int pid = perfcontrol_failsafe_pid;
7834 uint64_t tid = perfcontrol_failsafe_tid;
7835
7836 uint64_t thread_usage = perfcontrol_failsafe_thread_timer_last_seen -
7837 perfcontrol_failsafe_thread_timer_at_start;
7838 uint64_t rec_cores_before = perfcontrol_failsafe_recommended_at_trigger;
7839 uint64_t rec_cores_after = perfcontrol_requested_recommended_cores;
7840 uint64_t failsafe_duration = ctime - perfcontrol_failsafe_activation_time;
	strlcpy(p_name, perfcontrol_failsafe_name, sizeof(p_name));
7842
7843 print_diagnostic = TRUE;
7844
7845 /* Deactivate the failsafe and reinstate the requested recommendation settings */
7846
7847 perfcontrol_failsafe_deactivation_time = ctime;
7848 perfcontrol_failsafe_active = FALSE;
7849
7850 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
7851 MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_END,
7852 perfcontrol_requested_recommended_cores, failsafe_duration, 0, 0, 0);
7853
	sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores,
	    REASON_NONE, 0);
7856
7857out:
7858 simple_unlock(&sched_available_cores_lock);
7859 splx(s);
7860
7861 if (print_diagnostic) {
7862 uint64_t failsafe_duration_ms = 0, thread_usage_ms = 0;
7863
		absolutetime_to_nanoseconds(failsafe_duration, &failsafe_duration_ms);
		failsafe_duration_ms = failsafe_duration_ms / NSEC_PER_MSEC;

		absolutetime_to_nanoseconds(thread_usage, &thread_usage_ms);
		thread_usage_ms = thread_usage_ms / NSEC_PER_MSEC;

		printf("recommended core failsafe kicked in for %lld ms "
7871 "likely due to %s[%d] thread 0x%llx spending "
7872 "%lld ms on cpu at realtime priority - "
7873 "new recommendation: 0x%llx -> 0x%llx\n",
7874 failsafe_duration_ms, p_name, pid, tid, thread_usage_ms,
7875 rec_cores_before, rec_cores_after);
7876 }
7877}
7878
#endif /* __arm__ || __arm64__ */
7880
7881kern_return_t
7882sched_processor_enable(processor_t processor, boolean_t enable)
7883{
7884 assert(preemption_enabled());
7885
7886 if (processor == master_processor) {
7887 /* The system can hang if this is allowed */
7888 return KERN_NOT_SUPPORTED;
7889 }
7890
7891 spl_t s = splsched();
7892 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7893
7894 if (enable) {
7895 bit_set(usercontrol_requested_recommended_cores, processor->cpu_id);
7896 } else {
7897 bit_clear(usercontrol_requested_recommended_cores, processor->cpu_id);
7898 }
7899
7900#if __arm64__
7901 if ((perfcontrol_failsafe_active == false) && (perfcontrol_sleep_override == false)) {
		sched_update_recommended_cores(perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores,
		    REASON_USER, 0);
7904 } else {
7905 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
7906 MACHDBG_CODE(DBG_MACH_SCHED, MACH_REC_CORES_FAILSAFE) | DBG_FUNC_NONE,
7907 perfcontrol_requested_recommended_cores,
7908 sched_maintenance_thread->last_made_runnable_time, 0, 0, 0);
7909 }
7910#else /* __arm64__ */
7911 sched_update_recommended_cores(usercontrol_requested_recommended_cores, REASON_USER, 0);
7912#endif /* ! __arm64__ */
7913
7914 simple_unlock(&sched_available_cores_lock);
7915 splx(s);
7916
7917 return KERN_SUCCESS;
7918}
7919
7920void
7921sched_mark_processor_online_locked(processor_t processor, __assert_only processor_reason_t reason)
7922{
7923 assert((processor != master_processor) || (reason == REASON_SYSTEM));
7924
7925 bit_set(sched_online_processors, processor->cpu_id);
7926}
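
/*
 * Remove a processor from the online mask. REASON_SYSTEM requests always
 * succeed; other callers are refused if the processor is already offline
 * (KERN_NOT_IN_SET) or if it is the last online, recommended core left
 * (KERN_RESOURCE_SHORTAGE).
 */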
7927
7928kern_return_t
7929sched_mark_processor_offline(processor_t processor, processor_reason_t reason)
7930{
7931 assert((processor != master_processor) || (reason == REASON_SYSTEM));
7932 kern_return_t ret = KERN_SUCCESS;
7933
7934 spl_t s = splsched();
7935 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
7936
7937 if (reason == REASON_SYSTEM) {
7938 bit_clear(sched_online_processors, processor->cpu_id);
7939 simple_unlock(&sched_available_cores_lock);
7940 splx(s);
7941 return ret;
7942 }
7943
7944 uint64_t available_cores = sched_online_processors & perfcontrol_requested_recommended_cores & usercontrol_requested_recommended_cores;
7945
7946 if (!bit_test(sched_online_processors, processor->cpu_id)) {
7947 /* Processor is already offline */
7948 ret = KERN_NOT_IN_SET;
7949 } else if (available_cores == BIT(processor->cpu_id)) {
7950 ret = KERN_RESOURCE_SHORTAGE;
7951 } else {
7952 bit_clear(sched_online_processors, processor->cpu_id);
7953 ret = KERN_SUCCESS;
7954 }
7955
7956 simple_unlock(&sched_available_cores_lock);
7957 splx(s);
7958
7959 return ret;
7960}
7961
7962/*
7963 * Apply a new recommended cores mask to the processors it affects
7964 * Runs after considering failsafes and such
7965 *
7966 * Iterate over processors and update their ->is_recommended field.
7967 * If a processor is running, we let it drain out at its next
7968 * quantum expiration or blocking point. If a processor is idle, there
7969 * may be more work for it to do, so IPI it.
7970 *
7971 * interrupts disabled, sched_available_cores_lock is held
7972 */
7973static void
7974sched_update_recommended_cores(uint64_t recommended_cores, processor_reason_t reason, __unused uint32_t flags)
7975{
7976 uint64_t needs_exit_idle_mask = 0x0;
7977
7978 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_START,
7979 recommended_cores,
7980#if __arm64__
7981 perfcontrol_failsafe_active, 0, 0);
7982#else /* __arm64__ */
7983 0, 0, 0);
7984#endif /* ! __arm64__ */
7985
7986 if (__builtin_popcountll(recommended_cores & sched_online_processors) == 0) {
7987 bit_set(recommended_cores, master_processor->cpu_id); /* add boot processor or we hang */
7988 }
7989
7990 /* First set recommended cores */
7991 for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
7993 processor_set_t pset = pset_array[pset_id];
7994
7995 cpumap_t changed_recommendations = (recommended_cores & pset->cpu_bitmask) ^ pset->recommended_bitmask;
7996 cpumap_t newly_recommended = changed_recommendations & recommended_cores;
7997
7998 if (newly_recommended == 0) {
7999 /* Nothing to do */
8000 continue;
8001 }
8002
8003 pset_lock(pset);
8004
			for (int cpu_id = lsb_first(newly_recommended); cpu_id >= 0; cpu_id = lsb_next(newly_recommended, cpu_id)) {
8006 processor_t processor = processor_array[cpu_id];
8007 processor->is_recommended = TRUE;
8008 processor->last_recommend_reason = reason;
8009 bit_set(pset->recommended_bitmask, processor->cpu_id);
8010
8011 if (processor->state == PROCESSOR_IDLE) {
8012 if (processor != current_processor()) {
8013 bit_set(needs_exit_idle_mask, processor->cpu_id);
8014 }
8015 }
8016 if ((processor->state != PROCESSOR_OFF_LINE) && (processor->state != PROCESSOR_PENDING_OFFLINE)) {
8017 os_atomic_inc(&processor_avail_count_user, relaxed);
8018 if (processor->processor_primary == processor) {
8019 os_atomic_inc(&primary_processor_avail_count_user, relaxed);
8020 }
8021 SCHED(pset_made_schedulable)(processor, pset, false);
8022 }
8023 }
8024 pset_update_rt_stealable_state(pset);
8025
8026 pset_unlock(pset);
8027
			for (int cpu_id = lsb_first(newly_recommended); cpu_id >= 0;
			    cpu_id = lsb_next(newly_recommended, cpu_id)) {
8030 smr_cpu_up(processor_array[cpu_id],
8031 SMR_CPU_REASON_IGNORED);
8032 }
8033 }
8034 }
8035
8036 /* Now shutdown not recommended cores */
8037 for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
8039 processor_set_t pset = pset_array[pset_id];
8040
8041 cpumap_t changed_recommendations = (recommended_cores & pset->cpu_bitmask) ^ pset->recommended_bitmask;
8042 cpumap_t newly_unrecommended = changed_recommendations & ~recommended_cores;
8043
8044 if (newly_unrecommended == 0) {
8045 /* Nothing to do */
8046 continue;
8047 }
8048
8049 pset_lock(pset);
8050
			for (int cpu_id = lsb_first(newly_unrecommended); cpu_id >= 0; cpu_id = lsb_next(newly_unrecommended, cpu_id)) {
8052 processor_t processor = processor_array[cpu_id];
8053 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
8054
8055 processor->is_recommended = FALSE;
8056 if (reason != REASON_NONE) {
8057 processor->last_derecommend_reason = reason;
8058 }
8059 bit_clear(pset->recommended_bitmask, processor->cpu_id);
8060 if ((processor->state != PROCESSOR_OFF_LINE) && (processor->state != PROCESSOR_PENDING_OFFLINE)) {
8061 os_atomic_dec(&processor_avail_count_user, relaxed);
8062 if (processor->processor_primary == processor) {
8063 os_atomic_dec(&primary_processor_avail_count_user, relaxed);
8064 }
8065 }
8066 pset_update_rt_stealable_state(pset);
8067
8068 if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
8069 ipi_type = SCHED_IPI_IMMEDIATE;
8070 }
8071 SCHED(processor_queue_shutdown)(processor);
8072 /* pset unlocked */
8073
8074 SCHED(rt_queue_shutdown)(processor);
8075
8076 if (ipi_type == SCHED_IPI_NONE) {
8077 /*
8078 * If the core is idle,
8079 * we can directly mark the processor
8080 * as "Ignored"
8081 *
8082 * Otherwise, smr will detect this
8083 * during smr_cpu_leave() when the
8084 * processor actually idles.
8085 */
8086 smr_cpu_down(processor, SMR_CPU_REASON_IGNORED);
8087 } else if (processor == current_processor()) {
8088 ast_on(AST_PREEMPT);
8089 } else {
					sched_ipi_perform(processor, ipi_type);
8091 }
8092
8093 pset_lock(pset);
8094 }
8095 pset_unlock(pset);
8096 }
8097 }
8098
8099#if defined(__x86_64__)
8100 commpage_update_active_cpus();
8101#endif
8102 /* Issue all pending IPIs now that the pset lock has been dropped */
	for (int cpuid = lsb_first(needs_exit_idle_mask); cpuid >= 0; cpuid = lsb_next(needs_exit_idle_mask, cpuid)) {
8104 processor_t processor = processor_array[cpuid];
8105 machine_signal_idle(processor);
8106 }
8107
8108 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_END,
8109 needs_exit_idle_mask, 0, 0, 0);
8110}
8111
8112static void
8113sched_update_powered_cores(uint64_t requested_powered_cores, processor_reason_t reason, uint32_t flags)
8114{
8115 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_UPDATE_POWERED_CORES) | DBG_FUNC_START,
8116 requested_powered_cores, reason, flags, 0);
8117
8118 assert((flags & (LOCK_STATE | UNLOCK_STATE)) ? (reason == REASON_SYSTEM) && (requested_powered_cores == ALL_CORES_POWERED) : 1);
8119
8120 /*
8121 * Loop through newly set requested_powered_cores and start them.
8122 * Loop through newly cleared requested_powered_cores and shut them down.
8123 */
8124
8125 if ((reason == REASON_CLPC_SYSTEM) || (reason == REASON_CLPC_USER)) {
8126 flags |= SHUTDOWN_TEMPORARY;
8127 }
8128
8129 /* First set powered cores */
8130 cpumap_t started_cores = 0ull;
8131 for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
8133 processor_set_t pset = pset_array[pset_id];
8134
8135 spl_t s = splsched();
8136 pset_lock(pset);
8137 cpumap_t pset_requested_powered_cores = requested_powered_cores & pset->cpu_bitmask;
8138 cpumap_t powered_cores = (pset->cpu_state_map[PROCESSOR_START] | pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING] | pset->cpu_state_map[PROCESSOR_RUNNING]);
8139 cpumap_t requested_changes = pset_requested_powered_cores ^ powered_cores;
8140 pset_unlock(pset);
8141 splx(s);
8142
8143 cpumap_t newly_powered = requested_changes & requested_powered_cores;
8144
8145 cpumap_t cpu_map = newly_powered;
8146
8147 if (flags & (LOCK_STATE | UNLOCK_STATE)) {
8148 /*
8149 * We need to change the lock state even if
8150 * we don't need to change the actual state.
8151 */
8152 cpu_map = pset_requested_powered_cores;
8153 /* But not the master_processor, which is always implicitly locked */
8154 bit_clear(cpu_map, master_processor->cpu_id);
8155 }
8156
8157 if (cpu_map == 0) {
8158 /* Nothing to do */
8159 continue;
8160 }
8161
			for (int cpu_id = lsb_first(cpu_map); cpu_id >= 0; cpu_id = lsb_next(cpu_map, cpu_id)) {
8163 processor_t processor = processor_array[cpu_id];
8164 processor_start_reason(processor, reason, flags);
8165 bit_set(started_cores, cpu_id);
8166 }
8167 }
8168 }
8169 if (flags & WAIT_FOR_LAST_START) {
		for (int cpu_id = lsb_first(started_cores); cpu_id >= 0; cpu_id = lsb_next(started_cores, cpu_id)) {
8171 processor_t processor = processor_array[cpu_id];
8172 processor_wait_for_start(processor);
8173 }
8174 }
8175
8176 /* Now shutdown not powered cores */
8177 for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
8179 processor_set_t pset = pset_array[pset_id];
8180
8181 spl_t s = splsched();
8182 pset_lock(pset);
8183 cpumap_t powered_cores = (pset->cpu_state_map[PROCESSOR_START] | pset->cpu_state_map[PROCESSOR_IDLE] | pset->cpu_state_map[PROCESSOR_DISPATCHING] | pset->cpu_state_map[PROCESSOR_RUNNING]);
8184 cpumap_t requested_changes = (requested_powered_cores & pset->cpu_bitmask) ^ powered_cores;
8185 pset_unlock(pset);
8186 splx(s);
8187
8188 cpumap_t newly_unpowered = requested_changes & ~requested_powered_cores;
8189
8190 if (newly_unpowered == 0) {
8191 /* Nothing to do */
8192 continue;
8193 }
8194
			for (int cpu_id = lsb_first(newly_unpowered); cpu_id >= 0; cpu_id = lsb_next(newly_unpowered, cpu_id)) {
8196 processor_t processor = processor_array[cpu_id];
8197
8198 processor_exit_reason(processor, reason, flags);
8199 }
8200 }
8201 }
8202
8203 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_UPDATE_POWERED_CORES) | DBG_FUNC_END, 0, 0, 0, 0);
8204}
8205
8206void
8207thread_set_options(uint32_t thopt)
8208{
8209 spl_t x;
8210 thread_t t = current_thread();
8211
8212 x = splsched();
8213 thread_lock(t);
8214
8215 t->options |= thopt;
8216
8217 thread_unlock(t);
8218 splx(x);
8219}
8220
8221void
8222thread_set_pending_block_hint(thread_t thread, block_hint_t block_hint)
8223{
8224 thread->pending_block_hint = block_hint;
8225}
8226
8227uint32_t
8228qos_max_parallelism(int qos, uint64_t options)
8229{
8230 return SCHED(qos_max_parallelism)(qos, options);
8231}
8232
8233uint32_t
8234sched_qos_max_parallelism(__unused int qos, uint64_t options)
8235{
8236 host_basic_info_data_t hinfo;
8237 mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
8238
8239
	/*
	 * QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE should only be used on AMP
	 * platforms, which implement their own qos_max_parallelism() interfaces.
	 */
8244 assert((options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) == 0);
8245
8246 /* Query the machine layer for core information */
	__assert_only kern_return_t kret = host_info(host_self(), HOST_BASIC_INFO,
	    (host_info_t)&hinfo, &count);
8249 assert(kret == KERN_SUCCESS);
8250
8251 if (options & QOS_PARALLELISM_COUNT_LOGICAL) {
8252 return hinfo.logical_cpu;
8253 } else {
8254 return hinfo.physical_cpu;
8255 }
8256}
8257
8258int sched_allow_NO_SMT_threads = 1;
8259bool
8260thread_no_smt(thread_t thread)
8261{
8262 return sched_allow_NO_SMT_threads &&
8263 (thread->bound_processor == PROCESSOR_NULL) &&
8264 ((thread->sched_flags & TH_SFLAG_NO_SMT) || (get_threadtask(thread)->t_flags & TF_NO_SMT));
8265}
8266
8267bool
8268processor_active_thread_no_smt(processor_t processor)
8269{
8270 return sched_allow_NO_SMT_threads && !processor->current_is_bound && processor->current_is_NO_SMT;
8271}
8272
8273#if __arm64__
8274
/*
 * Set up a new timer deadline or replace the old one
 *
 * Returns true if it canceled an old timer, false if it did not
 */
8280boolean_t
8281sched_perfcontrol_update_callback_deadline(uint64_t new_deadline)
8282{
8283 /*
8284 * Exchange deadline for new deadline, if old deadline was nonzero,
8285 * then I cancelled the callback, otherwise I didn't
8286 */
8287
8288 return os_atomic_xchg(&sched_perfcontrol_callback_deadline, new_deadline,
8289 relaxed) != 0;
8290}
8291
8292/*
8293 * Set global SFI window (in usec)
8294 */
8295kern_return_t
8296sched_perfcontrol_sfi_set_window(uint64_t window_usecs)
8297{
8298 kern_return_t ret = KERN_NOT_SUPPORTED;
8299#if CONFIG_THREAD_GROUPS
8300 if (window_usecs == 0ULL) {
8301 ret = sfi_window_cancel();
8302 } else {
8303 ret = sfi_set_window(window_usecs);
8304 }
8305#endif // CONFIG_THREAD_GROUPS
8306 return ret;
8307}
8308
8309/*
8310 * Set background and maintenance SFI class offtimes
8311 */
8312kern_return_t
8313sched_perfcontrol_sfi_set_bg_offtime(uint64_t offtime_usecs)
8314{
8315 kern_return_t ret = KERN_NOT_SUPPORTED;
8316#if CONFIG_THREAD_GROUPS
8317 if (offtime_usecs == 0ULL) {
8318 ret = sfi_class_offtime_cancel(SFI_CLASS_MAINTENANCE);
8319 ret |= sfi_class_offtime_cancel(SFI_CLASS_DARWIN_BG);
8320 } else {
8321 ret = sfi_set_class_offtime(SFI_CLASS_MAINTENANCE, offtime_usecs);
8322 ret |= sfi_set_class_offtime(SFI_CLASS_DARWIN_BG, offtime_usecs);
8323 }
8324#endif // CONFIG_THREAD_GROUPS
8325 return ret;
8326}
8327
8328/*
8329 * Set utility SFI class offtime
8330 */
8331kern_return_t
8332sched_perfcontrol_sfi_set_utility_offtime(uint64_t offtime_usecs)
8333{
8334 kern_return_t ret = KERN_NOT_SUPPORTED;
8335#if CONFIG_THREAD_GROUPS
8336 if (offtime_usecs == 0ULL) {
8337 ret = sfi_class_offtime_cancel(SFI_CLASS_UTILITY);
8338 } else {
8339 ret = sfi_set_class_offtime(SFI_CLASS_UTILITY, offtime_usecs);
8340 }
8341#endif // CONFIG_THREAD_GROUPS
8342 return ret;
8343}
8344
8345#endif /* __arm64__ */
8346
8347#if CONFIG_SCHED_EDGE
8348
8349#define SCHED_PSET_LOAD_EWMA_TC_NSECS 10000000u
8350
/*
 * sched_edge_pset_running_higher_bucket()
 *
 * Routine to calculate cumulative running counts for each scheduling
 * bucket. This lets the load calculation determine whether a cluster is
 * running any threads at a QoS lower than that of the thread being
 * migrated.
 */
8359
8360static void
8361sched_edge_pset_running_higher_bucket(processor_set_t pset, uint32_t *running_higher)
8362{
8363 bitmap_t *active_map = &pset->cpu_state_map[PROCESSOR_RUNNING];
8364
8365 /* Edge Scheduler Optimization */
8366 for (int cpu = bitmap_first(active_map, MAX_CPUS); cpu >= 0; cpu = bitmap_next(active_map, cpu)) {
8367 sched_bucket_t cpu_bucket = os_atomic_load(&pset->cpu_running_buckets[cpu], relaxed);
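		/*
		 * Each running CPU counts toward its own bucket and every
		 * numerically higher (lower-priority) bucket, so
		 * running_higher[b] ends up as the number of CPUs running
		 * threads at bucket b's priority or above.
		 */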
8368 for (sched_bucket_t bucket = cpu_bucket; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
8369 running_higher[bucket]++;
8370 }
8371 }
8372}
8373
8374/*
8375 * sched_update_pset_load_average()
8376 *
8377 * Updates the load average for each sched bucket for a cluster.
8378 * This routine must be called with the pset lock held.
8379 */
8380void
8381sched_update_pset_load_average(processor_set_t pset, uint64_t curtime)
8382{
8383 int avail_cpu_count = pset_available_cpu_count(pset);
8384 if (avail_cpu_count == 0) {
8385 /* Looks like the pset is not runnable any more; nothing to do here */
8386 return;
8387 }
8388
8389 /*
8390 * Edge Scheduler Optimization
8391 *
8392 * See if more callers of this routine can pass in timestamps to avoid the
8393 * mach_absolute_time() call here.
8394 */
8395
8396 if (!curtime) {
8397 curtime = mach_absolute_time();
8398 }
8399 uint64_t last_update = os_atomic_load(&pset->pset_load_last_update, relaxed);
8400 int64_t delta_ticks = curtime - last_update;
8401 if (delta_ticks < 0) {
8402 return;
8403 }
8404
8405 uint64_t delta_nsecs = 0;
8406 absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);
8407
8408 if (__improbable(delta_nsecs > UINT32_MAX)) {
8409 delta_nsecs = UINT32_MAX;
8410 }
8411
8412#if CONFIG_SCHED_EDGE
8413 /* Update the shared resource load on the pset */
8414 for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
8415 uint64_t shared_rsrc_runnable_load = sched_edge_shared_rsrc_runnable_load(&pset->pset_clutch_root, shared_rsrc_type);
8416 uint64_t shared_rsrc_running_load = bit_count(pset->cpu_running_cluster_shared_rsrc_thread[shared_rsrc_type]);
8417 uint64_t new_shared_load = shared_rsrc_runnable_load + shared_rsrc_running_load;
8418 uint64_t old_shared_load = os_atomic_xchg(&pset->pset_cluster_shared_rsrc_load[shared_rsrc_type], new_shared_load, relaxed);
8419 if (old_shared_load != new_shared_load) {
8420 KTRC(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_SHARED_LOAD) | DBG_FUNC_NONE, pset->pset_cluster_id, shared_rsrc_type, new_shared_load, shared_rsrc_running_load);
8421 }
8422 }
8423#endif /* CONFIG_SCHED_EDGE */
8424
8425 uint32_t running_higher[TH_BUCKET_SCHED_MAX] = {0};
8426 sched_edge_pset_running_higher_bucket(pset, running_higher);
8427
8428 for (sched_bucket_t sched_bucket = TH_BUCKET_FIXPRI; sched_bucket < TH_BUCKET_SCHED_MAX; sched_bucket++) {
8429 uint64_t old_load_average = os_atomic_load(&pset->pset_load_average[sched_bucket], relaxed);
8430 uint64_t old_load_average_factor = old_load_average * SCHED_PSET_LOAD_EWMA_TC_NSECS;
8431 uint32_t current_runq_depth = (sched_edge_cluster_cumulative_count(&pset->pset_clutch_root, sched_bucket) + rt_runq_count(pset) + running_higher[sched_bucket]) / avail_cpu_count;
8432
		/*
		 * For the new load average, multiply current_runq_depth by delta_nsecs (which results in a 32.0 value).
		 * Since we want to maintain the load average as a 24.8 fixed-point value for precision, the
		 * new load average needs to be shifted before it can be added to the old load average.
		 */
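		/*
		 * Equivalently, with TC = SCHED_PSET_LOAD_EWMA_TC_NSECS and
		 * dt = delta_nsecs (load averages in 24.8 fixed point):
		 *   load_avg' = (load_avg * TC + ((depth * dt) << FRACTION_BITS)) / (dt + TC)
		 */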
8438 uint64_t new_load_average_factor = (current_runq_depth * delta_nsecs) << SCHED_PSET_LOAD_EWMA_FRACTION_BITS;
8439
8440 /*
8441 * For extremely parallel workloads, it is important that the load average on a cluster moves zero to non-zero
8442 * instantly to allow threads to be migrated to other (potentially idle) clusters quickly. Hence use the EWMA
8443 * when the system is already loaded; otherwise for an idle system use the latest load average immediately.
8444 */
8445 int old_load_shifted = (int)((old_load_average + SCHED_PSET_LOAD_EWMA_ROUND_BIT) >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
8446 boolean_t load_uptick = (old_load_shifted == 0) && (current_runq_depth != 0);
8447 boolean_t load_downtick = (old_load_shifted != 0) && (current_runq_depth == 0);
8448 uint64_t load_average;
8449 if (load_uptick || load_downtick) {
8450 load_average = (current_runq_depth << SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
8451 } else {
8452 /* Indicates a loaded system; use EWMA for load average calculation */
8453 load_average = (old_load_average_factor + new_load_average_factor) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
8454 }
8455 os_atomic_store(&pset->pset_load_average[sched_bucket], load_average, relaxed);
8456 if (load_average != old_load_average) {
8457 KTRC(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_LOAD_AVG) | DBG_FUNC_NONE, pset->pset_cluster_id, (load_average >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS), load_average & SCHED_PSET_LOAD_EWMA_FRACTION_MASK, sched_bucket);
8458 }
8459 }
8460 os_atomic_store(&pset->pset_load_last_update, curtime, relaxed);
8461}
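
/*
 * Update the cluster's per-bucket average thread execution time (stored in
 * microseconds) as an EWMA with the same SCHED_PSET_LOAD_EWMA_TC_NSECS time
 * constant used for the load average above.
 */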
8462
8463void
8464sched_update_pset_avg_execution_time(processor_set_t pset, uint64_t execution_time, uint64_t curtime, sched_bucket_t sched_bucket)
8465{
8466 pset_execution_time_t old_execution_time_packed, new_execution_time_packed;
8467 uint64_t avg_thread_execution_time = 0;
8468
8469 os_atomic_rmw_loop(&pset->pset_execution_time[sched_bucket].pset_execution_time_packed,
8470 old_execution_time_packed.pset_execution_time_packed,
8471 new_execution_time_packed.pset_execution_time_packed, relaxed, {
8472 uint64_t last_update = old_execution_time_packed.pset_execution_time_last_update;
8473 int64_t delta_ticks = curtime - last_update;
8474 if (delta_ticks < 0) {
8475 /*
8476 * Its possible that another CPU came in and updated the pset_execution_time
8477 * before this CPU could do it. Since the average execution time is meant to
8478 * be an approximate measure per cluster, ignore the older update.
8479 */
8480 os_atomic_rmw_loop_give_up(return );
8481 }
8482 uint64_t delta_nsecs = 0;
8483 absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);
8484
8485 uint64_t nanotime = 0;
8486 absolutetime_to_nanoseconds(execution_time, &nanotime);
8487 uint64_t execution_time_us = nanotime / NSEC_PER_USEC;
8488
8489 uint64_t old_execution_time = (old_execution_time_packed.pset_avg_thread_execution_time * SCHED_PSET_LOAD_EWMA_TC_NSECS);
8490 uint64_t new_execution_time = (execution_time_us * delta_nsecs);
8491
8492 avg_thread_execution_time = (old_execution_time + new_execution_time) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
8493 new_execution_time_packed.pset_avg_thread_execution_time = avg_thread_execution_time;
8494 new_execution_time_packed.pset_execution_time_last_update = curtime;
8495 });
	if (new_execution_time_packed.pset_avg_thread_execution_time != old_execution_time_packed.pset_avg_thread_execution_time) {
8497 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_AVG_EXEC_TIME) | DBG_FUNC_NONE, pset->pset_cluster_id, avg_thread_execution_time, sched_bucket);
8498 }
8499}
8500
8501uint64_t
8502sched_pset_cluster_shared_rsrc_load(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type)
8503{
8504 return os_atomic_load(&pset->pset_cluster_shared_rsrc_load[shared_rsrc_type], relaxed);
8505}
8506
8507#else /* CONFIG_SCHED_EDGE */
8508
8509void
8510sched_update_pset_load_average(processor_set_t pset, __unused uint64_t curtime)
8511{
8512 int non_rt_load = pset->pset_runq.count;
	int load = ((bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + non_rt_load + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT);
8514 int new_load_average = ((int)pset->load_average + load) >> 1;
8515
8516 pset->load_average = new_load_average;
8517#if (DEVELOPMENT || DEBUG)
8518#if __AMP__
8519 if (pset->pset_cluster_type == PSET_AMP_P) {
8520 KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_LOAD_AVERAGE) | DBG_FUNC_NONE, sched_get_pset_load_average(pset, 0), (bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + pset->pset_runq.count + rt_runq_count(pset)));
8521 }
8522#endif
8523#endif
8524}
8525
8526void
8527sched_update_pset_avg_execution_time(__unused processor_set_t pset, __unused uint64_t execution_time, __unused uint64_t curtime, __unused sched_bucket_t sched_bucket)
8528{
8529}
8530
8531#endif /* CONFIG_SCHED_EDGE */
8532
8533/* pset is locked */
8534static bool
8535processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor)
8536{
8537 int cpuid = processor->cpu_id;
8538#if defined(__x86_64__)
8539 if (sched_avoid_cpu0 && (cpuid == 0)) {
8540 return false;
8541 }
8542#endif
8543
8544 cpumap_t fasttrack_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
8545
8546 return bit_test(fasttrack_map, cpuid);
8547}
8548
8549/* pset is locked */
8550static processor_t
8551choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills)
8552{
8553#if defined(__x86_64__)
8554 bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
8555#else
8556 const bool avoid_cpu0 = false;
8557#endif
8558 cpumap_t cpu_map;
8559
8560try_again:
8561 cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map;
8562 if (skip_processor) {
8563 bit_clear(cpu_map, skip_processor->cpu_id);
8564 }
8565 if (skip_spills) {
8566 cpu_map &= ~pset->rt_pending_spill_cpu_mask;
8567 }
8568
8569 if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
8570 bit_clear(cpu_map, 0);
8571 }
8572
8573 cpumap_t primary_map = cpu_map & pset->primary_map;
8574 if (avoid_cpu0) {
		primary_map = bit_ror64(primary_map, 1);
8576 }
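	/*
	 * Rotating the map right by one moves cpu0 into the most significant
	 * bit, so lsb_first() prefers every other primary before cpu0; the
	 * (rotid + 1) & 63 below maps the rotated index back to a cpu id.
	 */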
8577
	int rotid = lsb_first(primary_map);
8579 if (rotid >= 0) {
8580 int cpuid = avoid_cpu0 ? ((rotid + 1) & 63) : rotid;
8581
8582 processor_t processor = processor_array[cpuid];
8583
8584 return processor;
8585 }
8586
8587 if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
8588 goto out;
8589 }
8590
8591 if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
8592 /* Also avoid cpu1 */
8593 bit_clear(cpu_map, 1);
8594 }
8595
8596 /* Consider secondary processors whose primary is actually running a realtime thread */
8597 cpumap_t secondary_map = cpu_map & ~pset->primary_map & (pset->realtime_map << 1);
8598 if (avoid_cpu0) {
8599 /* Also avoid cpu1 */
8600 secondary_map = bit_ror64(bitmap: secondary_map, n: 2);
8601 }
8602 rotid = lsb_first(bitmap: secondary_map);
8603 if (rotid >= 0) {
8604 int cpuid = avoid_cpu0 ? ((rotid + 2) & 63) : rotid;
8605
8606 processor_t processor = processor_array[cpuid];
8607
8608 return processor;
8609 }
8610
8611 /* Consider secondary processors */
8612 secondary_map = cpu_map & ~pset->primary_map;
8613 if (avoid_cpu0) {
8614 /* Also avoid cpu1 */
8615 secondary_map = bit_ror64(bitmap: secondary_map, n: 2);
8616 }
8617 rotid = lsb_first(bitmap: secondary_map);
8618 if (rotid >= 0) {
8619 int cpuid = avoid_cpu0 ? ((rotid + 2) & 63) : rotid;
8620
8621 processor_t processor = processor_array[cpuid];
8622
8623 return processor;
8624 }
8625
	/*
	 * Ideally the compiler would optimize this block away when avoid_cpu0
	 * is a const bool false, but it still complains about the assignment
	 * in that case, hence the explicit #if around it.
	 */
	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
#if defined(__x86_64__)
		avoid_cpu0 = false;
#else
		assert(0);
#endif
		goto try_again;
	}

out:
	if (skip_processor) {
		return PROCESSOR_NULL;
	}

	/*
	 * If we didn't find an obvious processor to choose, but there are still more CPUs
	 * not already running realtime threads than realtime threads in the realtime run queue,
	 * this thread belongs in this pset, so choose some other processor in this pset
	 * to ensure the thread is enqueued here.
	 */
	cpumap_t non_realtime_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
		cpu_map = non_realtime_map;
		assert(cpu_map != 0);
		int cpuid = bit_first(cpu_map);
		assert(cpuid >= 0);
		return processor_array[cpuid];
	}

	if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) {
		goto skip_secondaries;
	}

	non_realtime_map = pset_available_cpumap(pset) & ~pset->realtime_map;
	if (bit_count(non_realtime_map) > rt_runq_count(pset)) {
		cpu_map = non_realtime_map;
		assert(cpu_map != 0);
		int cpuid = bit_first(cpu_map);
		assert(cpuid >= 0);
		return processor_array[cpuid];
	}

skip_secondaries:
	return PROCESSOR_NULL;
}
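
/*
 * Illustrative sketch (not compiled): when avoid_cpu0 is set, the selection
 * above rotates the candidate map right by one bit before taking the lowest
 * set bit, so cpu0 is chosen only when no other primary is free, and the
 * rotated index is mapped back with (rotid + 1) & 63.  The example below walks
 * that trick on a standalone mask; the helper name is invented for the example.
 */
#if 0
static int
example_pick_primary_avoiding_cpu0(cpumap_t candidate_map)
{
	/* Rotate right by one: bit 0 (cpu0) moves to bit 63, bit 1 (cpu1) to bit 0, and so on. */
	cpumap_t rotated = bit_ror64(candidate_map, 1);

	int rotid = lsb_first(rotated);
	if (rotid < 0) {
		return -1;              /* no candidate at all */
	}

	/* Undo the rotation to recover the real cpu id; cpu0 comes out last. */
	return (rotid + 1) & 63;
}
#endif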

/*
 * Choose the processor with (1) the lowest priority less than max_pri and (2) the furthest deadline for that priority.
 * If all available processors are at max_pri, choose the furthest deadline that is greater than minimum_deadline.
 *
 * pset is locked.
 */
static processor_t
choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus)
{
	uint64_t furthest_deadline = deadline_add(minimum_deadline, rt_deadline_epsilon);
	processor_t fd_processor = PROCESSOR_NULL;
	int lowest_priority = max_pri;

	cpumap_t cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask;
	if (skip_processor) {
		bit_clear(cpu_map, skip_processor->cpu_id);
	}
	if (skip_spills) {
		cpu_map &= ~pset->rt_pending_spill_cpu_mask;
	}

	for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
		processor_t processor = processor_array[cpuid];

		if (processor->current_pri > lowest_priority) {
			continue;
		}

		if (processor->current_pri < lowest_priority) {
			lowest_priority = processor->current_pri;
			furthest_deadline = processor->deadline;
			fd_processor = processor;
			continue;
		}

		if (processor->deadline > furthest_deadline) {
			furthest_deadline = processor->deadline;
			fd_processor = processor;
		}
	}

	if (fd_processor) {
		return fd_processor;
	}
	/*
	 * There is a possible race condition when there are multiple processor sets.
	 * choose_processor() takes pset lock A, sees the pending_AST_URGENT_cpu_mask set for a processor in that set and finds no suitable candidate CPU,
	 * so it drops pset lock A and tries to take pset lock B. Meanwhile the pending_AST_URGENT_cpu_mask CPU is looking for a thread to run and holds
	 * pset lock B. It doesn't find any threads (because the candidate thread isn't yet on any run queue), so drops lock B, takes lock A again to clear
	 * the pending_AST_URGENT_cpu_mask bit, and keeps running the current (far deadline) thread. choose_processor() now has lock B and can only find
	 * the lowest count processor in set B so enqueues it on set B's run queue but doesn't IPI anyone. (The lowest count includes all threads,
	 * near and far deadlines, so will prefer a low count of earlier deadlines to a high count of far deadlines, which is suboptimal for EDF scheduling.
	 * To make a better choice we would need to know how many threads with earlier deadlines than the candidate thread exist on each pset's run queue.
	 * But even if we chose the better run queue, we still wouldn't send an IPI in this case.)
	 *
	 * The mitigation is to also look for suitable CPUs that have their pending_AST_URGENT_cpu_mask bit set where there are no earlier deadline threads
	 * on the run queue of that pset.
	 */
	if (include_ast_urgent_pending_cpus && (rt_runq_earliest_deadline(pset) > furthest_deadline)) {
		cpu_map = pset_available_cpumap(pset) & pset->pending_AST_URGENT_cpu_mask;
		assert(skip_processor == PROCESSOR_NULL);
		assert(skip_spills == false);

		for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) {
			processor_t processor = processor_array[cpuid];

			if (processor->current_pri > lowest_priority) {
				continue;
			}

			if (processor->current_pri < lowest_priority) {
				lowest_priority = processor->current_pri;
				furthest_deadline = processor->deadline;
				fd_processor = processor;
				continue;
			}

			if (processor->deadline > furthest_deadline) {
				furthest_deadline = processor->deadline;
				fd_processor = processor;
			}
		}
	}

	return fd_processor;
}
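
/*
 * Illustrative sketch (not compiled): the preemption-victim search above keeps
 * the processor running at the lowest priority, breaking ties by the furthest
 * deadline beyond (minimum_deadline + rt_deadline_epsilon).  The standalone
 * example below applies the same comparison to plain arrays; the arrays and
 * helper name are invented for the example.
 */
#if 0
static int
example_pick_furthest_deadline_victim(const int *pri, const uint64_t *deadline, int ncpus,
    int max_pri, uint64_t minimum_deadline)
{
	uint64_t furthest_deadline = deadline_add(minimum_deadline, rt_deadline_epsilon);
	int lowest_priority = max_pri;
	int victim = -1;

	for (int i = 0; i < ncpus; i++) {
		if (pri[i] > lowest_priority) {
			continue;               /* running something more important; never a victim */
		}
		if (pri[i] < lowest_priority || deadline[i] > furthest_deadline) {
			/* Lower priority always wins; equal priority only with a later deadline. */
			lowest_priority = pri[i];
			furthest_deadline = deadline[i];
			victim = i;
		}
	}
	return victim;
}
#endif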

/* pset is locked */
static processor_t
choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries)
{
	bool skip_spills = true;
	bool include_ast_urgent_pending_cpus = false;

	processor_t next_processor = choose_processor_for_realtime_thread(pset, skip_processor, consider_secondaries, skip_spills);
	if (next_processor != PROCESSOR_NULL) {
		return next_processor;
	}

	next_processor = choose_furthest_deadline_processor_for_realtime_thread(pset, max_pri, minimum_deadline, skip_processor, skip_spills, include_ast_urgent_pending_cpus);
	return next_processor;
}

#if defined(__x86_64__)
/* pset is locked */
static bool
all_available_primaries_are_running_realtime_threads(processor_set_t pset, bool include_backups)
{
	bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0);
	int nbackup_cpus = 0;

	if (include_backups && rt_runq_is_low_latency(pset)) {
		nbackup_cpus = sched_rt_n_backup_processors;
	}

	cpumap_t cpu_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map;
	if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) {
		bit_clear(cpu_map, 0);
	}
	return (rt_runq_count(pset) + nbackup_cpus) > bit_count(cpu_map);
}

/* pset is locked */
static bool
these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map, bool include_backups)
{
	int nbackup_cpus = 0;

	if (include_backups && rt_runq_is_low_latency(pset)) {
		nbackup_cpus = sched_rt_n_backup_processors;
	}

	cpumap_t cpu_map = pset_available_cpumap(pset) & these_map & ~pset->realtime_map;
	return (rt_runq_count(pset) + nbackup_cpus) > bit_count(cpu_map);
}
#endif
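
/*
 * Illustrative sketch (not compiled): both helpers above answer "is realtime
 * demand about to exceed supply?" by comparing the realtime run-queue depth
 * (plus a configurable number of backup processors when the queue is
 * low-latency) against the count of candidate CPUs not already running
 * realtime threads.  The worked numbers in the comment below are invented.
 */
#if 0
static bool
example_rt_demand_exceeds_supply(int rt_runq_depth, int nbackup_cpus, cpumap_t idle_for_rt_map)
{
	/*
	 * e.g. rt_runq_depth = 2, nbackup_cpus = 1, and three candidate CPUs free:
	 * 2 + 1 > 3 is false, so there is still headroom.
	 */
	return (rt_runq_depth + nbackup_cpus) > bit_count(idle_for_rt_map);
}
#endif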

static bool
sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor, bool as_backup)
{
	if (!processor->is_recommended) {
		return false;
	}
	bool ok_to_run_realtime_thread = true;
#if defined(__x86_64__)
	bool spill_pending = bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id);
	if (spill_pending) {
		return true;
	}
	if (processor->cpu_id == 0) {
		if (sched_avoid_cpu0 == 1) {
			ok_to_run_realtime_thread = these_processors_are_running_realtime_threads(pset, pset->primary_map & ~0x1, as_backup);
		} else if (sched_avoid_cpu0 == 2) {
			ok_to_run_realtime_thread = these_processors_are_running_realtime_threads(pset, ~0x3, as_backup);
		}
	} else if (sched_avoid_cpu0 && (processor->cpu_id == 1) && processor->is_SMT) {
		ok_to_run_realtime_thread = sched_allow_rt_smt && these_processors_are_running_realtime_threads(pset, ~0x2, as_backup);
	} else if (processor->processor_primary != processor) {
		ok_to_run_realtime_thread = (sched_allow_rt_smt && all_available_primaries_are_running_realtime_threads(pset, as_backup));
	}
#else
	(void)pset;
	(void)processor;
	(void)as_backup;
#endif
	return ok_to_run_realtime_thread;
}
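
/*
 * Summarizing the Intel-only policy above: a pending realtime spill always
 * allows the processor to take the thread; cpu0 only accepts realtime work
 * once the other candidate processors are saturated with it (and when cpu0 is
 * being avoided, its SMT sibling cpu1 is restricted the same way); and any
 * other SMT secondary only accepts realtime work when sched_allow_rt_smt is
 * set and all available primaries are already running realtime threads.
 */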

void
sched_pset_made_schedulable(__unused processor_t processor, processor_set_t pset, boolean_t drop_lock)
{
	if (drop_lock) {
		pset_unlock(pset);
	}
}

void
thread_set_no_smt(bool set)
{
	if (!system_is_SMT) {
		/* Not a machine that supports SMT */
		return;
	}

	thread_t thread = current_thread();

	spl_t s = splsched();
	thread_lock(thread);
	if (set) {
		thread->sched_flags |= TH_SFLAG_NO_SMT;
	}
	thread_unlock(thread);
	splx(s);
}

bool
thread_get_no_smt(void)
{
	return current_thread()->sched_flags & TH_SFLAG_NO_SMT;
}
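
/*
 * Illustrative usage sketch (not compiled).  Note that, as written above,
 * thread_set_no_smt(false) does not clear TH_SFLAG_NO_SMT, so the flag is
 * effectively one-way for a thread.
 */
#if 0
static void
example_opt_current_thread_out_of_smt(void)
{
	if (!thread_get_no_smt()) {
		thread_set_no_smt(true);        /* this thread will no longer share a core */
	}
	/* On SMT-capable hardware the flag now reads back as set. */
	assert(thread_get_no_smt());
}
#endif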

extern void task_set_no_smt(task_t);
void
task_set_no_smt(task_t task)
{
	if (!system_is_SMT) {
		/* Not a machine that supports SMT */
		return;
	}

	if (task == TASK_NULL) {
		task = current_task();
	}

	task_lock(task);
	task->t_flags |= TF_NO_SMT;
	task_unlock(task);
}

#if DEBUG || DEVELOPMENT
extern void sysctl_task_set_no_smt(char no_smt);
void
sysctl_task_set_no_smt(char no_smt)
{
	if (!system_is_SMT) {
		/* Not a machine that supports SMT */
		return;
	}

	task_t task = current_task();

	task_lock(task);
	if (no_smt == '1') {
		task->t_flags |= TF_NO_SMT;
	}
	task_unlock(task);
}

extern char sysctl_task_get_no_smt(void);
char
sysctl_task_get_no_smt(void)
{
	task_t task = current_task();

	if (task->t_flags & TF_NO_SMT) {
		return '1';
	}
	return '0';
}
#endif /* DEBUG || DEVELOPMENT */


__private_extern__ void
thread_bind_cluster_type(thread_t thread, char cluster_type, bool soft_bound)
{
#if __AMP__
	spl_t s = splsched();
	thread_lock(thread);
	thread->sched_flags &= ~(TH_SFLAG_BOUND_SOFT);
	thread->th_bound_cluster_id = THREAD_BOUND_CLUSTER_NONE;
	if (soft_bound) {
		thread->sched_flags |= TH_SFLAG_BOUND_SOFT;
	}
	switch (cluster_type) {
	case 'e':
	case 'E':
		if (pset0.pset_cluster_type == PSET_AMP_E) {
			thread->th_bound_cluster_id = pset0.pset_id;
		} else if (pset_node1.psets != PROCESSOR_SET_NULL) {
			thread->th_bound_cluster_id = pset_node1.psets->pset_id;
		}
		break;
	case 'p':
	case 'P':
		if (pset0.pset_cluster_type == PSET_AMP_P) {
			thread->th_bound_cluster_id = pset0.pset_id;
		} else if (pset_node1.psets != PROCESSOR_SET_NULL) {
			thread->th_bound_cluster_id = pset_node1.psets->pset_id;
		}
		break;
	default:
		break;
	}
	thread_unlock(thread);
	splx(s);

	if (thread == current_thread()) {
		thread_block(THREAD_CONTINUE_NULL);
	}
#else /* __AMP__ */
	(void)thread;
	(void)cluster_type;
	(void)soft_bound;
#endif /* __AMP__ */
}
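
/*
 * Illustrative usage sketch (not compiled): on an AMP system this soft-binds
 * the calling thread to the performance cluster, then clears the binding.
 * The helper name and sequence are an example only.
 */
#if 0
static void
example_prefer_pcluster_for_current_thread(void)
{
	/* Soft bind: the scheduler may still run the thread elsewhere under pressure. */
	thread_bind_cluster_type(current_thread(), 'P', true);

	/* ... latency-sensitive work ... */

	/* Any unrecognized cluster type (e.g. '0') leaves the thread unbound. */
	thread_bind_cluster_type(current_thread(), '0', false);
}
#endif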

extern uint32_t thread_bound_cluster_id(thread_t thread);
uint32_t
thread_bound_cluster_id(thread_t thread)
{
	return thread->th_bound_cluster_id;
}

__private_extern__ kern_return_t
thread_bind_cluster_id(thread_t thread, uint32_t cluster_id, thread_bind_option_t options)
{
#if __AMP__

	processor_set_t pset = NULL;

	/* Treat binding to THREAD_BOUND_CLUSTER_NONE as a request to unbind. */
	if ((options & THREAD_UNBIND) || cluster_id == THREAD_BOUND_CLUSTER_NONE) {
		/* If the thread was not actually bound to any cluster, there is nothing to do here */
		if (thread_bound_cluster_id(thread) == THREAD_BOUND_CLUSTER_NONE) {
			return KERN_SUCCESS;
		}
	} else {
		/* Validate the inputs for the bind case */
		int max_clusters = ml_get_cluster_count();
		if (cluster_id >= max_clusters) {
			/* Invalid cluster id */
			return KERN_INVALID_VALUE;
		}
		pset = pset_array[cluster_id];
		if (pset == NULL) {
			/* Cluster has not been initialized yet */
			return KERN_INVALID_VALUE;
		}
		if (options & THREAD_BIND_ELIGIBLE_ONLY) {
			if (SCHED(thread_eligible_for_pset(thread, pset)) == false) {
				/* Thread is not recommended for the cluster type */
				return KERN_INVALID_POLICY;
			}
		}
	}

	spl_t s = splsched();
	thread_lock(thread);

	/* Unbind the thread from its previous bound state */
	thread->sched_flags &= ~(TH_SFLAG_BOUND_SOFT);
	thread->th_bound_cluster_id = THREAD_BOUND_CLUSTER_NONE;

	if (options & THREAD_UNBIND) {
		/* Nothing more to do here */
		goto thread_bind_cluster_complete;
	}

	if (options & THREAD_BIND_SOFT) {
		thread->sched_flags |= TH_SFLAG_BOUND_SOFT;
	}
	thread->th_bound_cluster_id = cluster_id;

thread_bind_cluster_complete:
	thread_unlock(thread);
	splx(s);

	if (thread == current_thread()) {
		thread_block(THREAD_CONTINUE_NULL);
	}
#else /* __AMP__ */
	(void)thread;
	(void)cluster_id;
	(void)options;
#endif /* __AMP__ */
	return KERN_SUCCESS;
}
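
/*
 * Illustrative usage sketch (not compiled): soft-bind the current thread to a
 * caller-chosen cluster only if the scheduler considers it eligible, then
 * unbind.  The helper name and cluster id are assumptions for the example.
 */
#if 0
static kern_return_t
example_soft_bind_current_thread_to_cluster(uint32_t cluster_id)
{
	kern_return_t kr = thread_bind_cluster_id(current_thread(), cluster_id,
	    THREAD_BIND_SOFT | THREAD_BIND_ELIGIBLE_ONLY);
	if (kr != KERN_SUCCESS) {
		return kr;              /* invalid cluster, or thread not eligible */
	}

	/* ... run on the requested cluster ... */

	/* Unbind; with THREAD_UNBIND the cluster_id argument is ignored. */
	return thread_bind_cluster_id(current_thread(), 0, THREAD_UNBIND);
}
#endif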

#if DEVELOPMENT || DEBUG
extern int32_t sysctl_get_bound_cpuid(void);
int32_t
sysctl_get_bound_cpuid(void)
{
	int32_t cpuid = -1;
	thread_t self = current_thread();

	processor_t processor = self->bound_processor;
	if (processor == NULL) {
		cpuid = -1;
	} else {
		cpuid = processor->cpu_id;
	}

	return cpuid;
}

extern kern_return_t sysctl_thread_bind_cpuid(int32_t cpuid);
kern_return_t
sysctl_thread_bind_cpuid(int32_t cpuid)
{
	processor_t processor = PROCESSOR_NULL;

	if (cpuid == -1) {
		goto unbind;
	}

	if (cpuid < 0 || cpuid >= MAX_SCHED_CPUS) {
		return KERN_INVALID_VALUE;
	}

	processor = processor_array[cpuid];
	if (processor == PROCESSOR_NULL) {
		return KERN_INVALID_VALUE;
	}

#if __AMP__

	thread_t thread = current_thread();

	if (thread->th_bound_cluster_id != THREAD_BOUND_CLUSTER_NONE) {
		if ((thread->sched_flags & TH_SFLAG_BOUND_SOFT) == 0) {
			/* Cannot hard-bind an already hard-cluster-bound thread */
			return KERN_NOT_SUPPORTED;
		}
	}

#endif /* __AMP__ */

unbind:
	thread_bind(processor);

	thread_block(THREAD_CONTINUE_NULL);
	return KERN_SUCCESS;
}
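
/*
 * Illustrative usage sketch (not compiled): hard-bind the current thread to a
 * specific CPU for a measurement, then release the binding by passing -1.
 * The helper name and cpu id are assumptions for the example.
 */
#if 0
static void
example_measure_on_cpu(int32_t cpuid)
{
	if (sysctl_thread_bind_cpuid(cpuid) != KERN_SUCCESS) {
		return;                 /* invalid or uninitialized CPU */
	}
	assert(sysctl_get_bound_cpuid() == cpuid);

	/* ... run the measurement pinned to this CPU ... */

	(void)sysctl_thread_bind_cpuid(-1);     /* unbind */
}
#endif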

extern char sysctl_get_task_cluster_type(void);
char
sysctl_get_task_cluster_type(void)
{
	task_t task = current_task();
	processor_set_t pset_hint = task->pset_hint;

	if (!pset_hint) {
		return '0';
	}

#if __AMP__
	if (pset_hint->pset_cluster_type == PSET_AMP_E) {
		return 'E';
	} else if (pset_hint->pset_cluster_type == PSET_AMP_P) {
		return 'P';
	}
#endif

	return '0';
}
#if __AMP__
static processor_set_t
find_pset_of_type(pset_cluster_type_t t)
{
	for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) {
		if (node->pset_cluster_type != t) {
			continue;
		}

		processor_set_t pset = PROCESSOR_SET_NULL;
		for (int pset_id = lsb_first(node->pset_map); pset_id >= 0; pset_id = lsb_next(node->pset_map, pset_id)) {
			pset = pset_array[pset_id];
			/* Prefer a pset with recommended processors */
			if (pset->recommended_bitmask != 0) {
				assert(pset->pset_cluster_type == t);
				return pset;
			}
		}
		/* Otherwise return whatever was found last */
		return pset;
	}

	return PROCESSOR_SET_NULL;
}
#endif

extern void sysctl_task_set_cluster_type(char cluster_type);
void
sysctl_task_set_cluster_type(char cluster_type)
{
	task_t task = current_task();
	processor_set_t pset_hint = PROCESSOR_SET_NULL;

#if __AMP__
	switch (cluster_type) {
	case 'e':
	case 'E':
		pset_hint = find_pset_of_type(PSET_AMP_E);
		break;
	case 'p':
	case 'P':
		pset_hint = find_pset_of_type(PSET_AMP_P);
		break;
	default:
		break;
	}

	if (pset_hint) {
		task_lock(task);
		task->t_flags |= TF_USE_PSET_HINT_CLUSTER_TYPE;
		task->pset_hint = pset_hint;
		task_unlock(task);

		thread_block(THREAD_CONTINUE_NULL);
	}
#else
	(void)cluster_type;
	(void)task;
	(void)pset_hint;
#endif
}
/*
 * The quantum length used for the Fixed and RT scheduling modes. In general
 * the quantum can vary, for example for background or QoS-constrained threads.
 */
extern uint64_t sysctl_get_quantum_us(void);
uint64_t
sysctl_get_quantum_us(void)
{
	uint32_t quantum;
	uint64_t quantum_ns;

	quantum = SCHED(initial_quantum_size)(THREAD_NULL);
	absolutetime_to_nanoseconds(quantum, &quantum_ns);

	return quantum_ns / 1000;
}
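
/*
 * Worked example (the timeslice value is an assumption, not taken from this
 * file): with a typical 10 ms default timeslice, initial_quantum_size()
 * returns that span in mach absolute time units, absolutetime_to_nanoseconds()
 * converts it to 10,000,000 ns, and the division by 1000 reports 10000 us to
 * the sysctl caller.
 */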

#endif /* DEVELOPMENT || DEBUG */
