1/*
2 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_FREE_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58/*
59 * File: sched_prim.c
60 * Author: Avadis Tevanian, Jr.
61 * Date: 1986
62 *
63 * Scheduling primitives
64 *
65 */
66
67#include <debug.h>
68
69#include <mach/mach_types.h>
70#include <mach/machine.h>
71#include <mach/policy.h>
72#include <mach/sync_policy.h>
73#include <mach/thread_act.h>
74
75#include <machine/machine_routines.h>
76#include <machine/sched_param.h>
77#include <machine/machine_cpu.h>
78#include <machine/machlimits.h>
79#include <machine/atomic.h>
80
81#include <machine/commpage.h>
82
83#include <kern/kern_types.h>
84#include <kern/backtrace.h>
85#include <kern/clock.h>
86#include <kern/counters.h>
87#include <kern/cpu_number.h>
88#include <kern/cpu_data.h>
89#include <kern/smp.h>
90#include <kern/debug.h>
91#include <kern/macro_help.h>
92#include <kern/machine.h>
93#include <kern/misc_protos.h>
94#if MONOTONIC
95#include <kern/monotonic.h>
96#endif /* MONOTONIC */
97#include <kern/processor.h>
98#include <kern/queue.h>
99#include <kern/sched.h>
100#include <kern/sched_prim.h>
101#include <kern/sfi.h>
102#include <kern/syscall_subr.h>
103#include <kern/task.h>
104#include <kern/thread.h>
105#include <kern/ledger.h>
106#include <kern/timer_queue.h>
107#include <kern/waitq.h>
108#include <kern/policy_internal.h>
109#include <kern/cpu_quiesce.h>
110
111#include <vm/pmap.h>
112#include <vm/vm_kern.h>
113#include <vm/vm_map.h>
114#include <vm/vm_pageout.h>
115
116#include <mach/sdt.h>
117#include <mach/mach_host.h>
118#include <mach/host_info.h>
119
120#include <sys/kdebug.h>
121#include <kperf/kperf.h>
122#include <kern/kpc.h>
123#include <san/kasan.h>
124#include <kern/pms.h>
125#include <kern/host.h>
126#include <stdatomic.h>
127
128int rt_runq_count(processor_set_t pset)
129{
130 return atomic_load_explicit(&SCHED(rt_runq)(pset)->count, memory_order_relaxed);
131}
132
133void rt_runq_count_incr(processor_set_t pset)
134{
135 atomic_fetch_add_explicit(&SCHED(rt_runq)(pset)->count, 1, memory_order_relaxed);
136}
137
138void rt_runq_count_decr(processor_set_t pset)
139{
140 atomic_fetch_sub_explicit(&SCHED(rt_runq)(pset)->count, 1, memory_order_relaxed);
141}
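/*
 * Illustrative sketch (not part of this file; the "example_" names are
 * hypothetical): the accessors above use C11 relaxed atomics because the
 * RT run-queue count is only an advisory hint, so readers tolerate a
 * briefly stale value and no ordering with other memory is required.
 * The same pattern in stand-alone form:
 */
#if 0
#include <stdatomic.h>

struct example_counter {
	_Atomic int count;
};

static inline int
example_counter_read(struct example_counter *c)
{
	/* relaxed load: an atomic snapshot is enough, no ordering needed */
	return atomic_load_explicit(&c->count, memory_order_relaxed);
}

static inline void
example_counter_incr(struct example_counter *c)
{
	/* relaxed increment: the count is advisory only */
	atomic_fetch_add_explicit(&c->count, 1, memory_order_relaxed);
}
#endif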
142
143#define DEFAULT_PREEMPTION_RATE 100 /* (1/s) */
144int default_preemption_rate = DEFAULT_PREEMPTION_RATE;
145
146#define DEFAULT_BG_PREEMPTION_RATE 400 /* (1/s) */
147int default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
148
149#define MAX_UNSAFE_QUANTA 800
150int max_unsafe_quanta = MAX_UNSAFE_QUANTA;
151
152#define MAX_POLL_QUANTA 2
153int max_poll_quanta = MAX_POLL_QUANTA;
154
155#define SCHED_POLL_YIELD_SHIFT 4 /* 1/16 */
156int sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;
157
158uint64_t max_poll_computation;
159
160uint64_t max_unsafe_computation;
161uint64_t sched_safe_duration;
162
163#if defined(CONFIG_SCHED_TIMESHARE_CORE)
164
165uint32_t std_quantum;
166uint32_t min_std_quantum;
167uint32_t bg_quantum;
168
169uint32_t std_quantum_us;
170uint32_t bg_quantum_us;
171
172#endif /* CONFIG_SCHED_TIMESHARE_CORE */
173
174uint32_t thread_depress_time;
175uint32_t default_timeshare_computation;
176uint32_t default_timeshare_constraint;
177
178uint32_t max_rt_quantum;
179uint32_t min_rt_quantum;
180
181#if defined(CONFIG_SCHED_TIMESHARE_CORE)
182
183unsigned sched_tick;
184uint32_t sched_tick_interval;
185
186/* Timeshare load calculation interval (15ms) */
187uint32_t sched_load_compute_interval_us = 15000;
188uint64_t sched_load_compute_interval_abs;
189static _Atomic uint64_t sched_load_compute_deadline;
190
191uint32_t sched_pri_shifts[TH_BUCKET_MAX];
192uint32_t sched_fixed_shift;
193
194uint32_t sched_decay_usage_age_factor = 1; /* accelerate 5/8^n usage aging */
195
196/* Allow foreground to decay past default to resolve inversions */
197#define DEFAULT_DECAY_BAND_LIMIT ((BASEPRI_FOREGROUND - BASEPRI_DEFAULT) + 2)
198int sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
199
200/* Defaults for timer deadline profiling */
201#define TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT 2000000 /* Timers with deadlines <=
202 * 2ms */
203#define TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT 5000000 /* Timers with deadlines
204 * <= 5ms */
205
206uint64_t timer_deadline_tracking_bin_1;
207uint64_t timer_deadline_tracking_bin_2;
208
209#endif /* CONFIG_SCHED_TIMESHARE_CORE */
210
211thread_t sched_maintenance_thread;
212
213#if __arm__ || __arm64__
214/* interrupts disabled lock to guard recommended cores state */
215decl_simple_lock_data(static,sched_recommended_cores_lock);
216static void sched_recommended_cores_maintenance(void);
217static void sched_update_recommended_cores(uint32_t recommended_cores);
218
219uint64_t perfcontrol_failsafe_starvation_threshold;
220extern char *proc_name_address(struct proc *p);
221
222#endif /* __arm__ || __arm64__ */
223
224uint64_t sched_one_second_interval;
225
226/* Forwards */
227
228#if defined(CONFIG_SCHED_TIMESHARE_CORE)
229
230static void load_shift_init(void);
231static void preempt_pri_init(void);
232
233#endif /* CONFIG_SCHED_TIMESHARE_CORE */
234
235#if CONFIG_SCHED_IDLE_IN_PLACE
236static thread_t thread_select_idle(
237 thread_t thread,
238 processor_t processor);
239#endif
240
241thread_t processor_idle(
242 thread_t thread,
243 processor_t processor);
244
245ast_t
246csw_check_locked( processor_t processor,
247 processor_set_t pset,
248 ast_t check_reason);
249
250static void processor_setrun(
251 processor_t processor,
252 thread_t thread,
253 integer_t options);
254
255static void
256sched_realtime_timebase_init(void);
257
258static void
259sched_timer_deadline_tracking_init(void);
260
261#if DEBUG
262extern int debug_task;
263#define TLOG(a, fmt, args...) if(debug_task & a) kprintf(fmt, ## args)
264#else
265#define TLOG(a, fmt, args...) do {} while (0)
266#endif
267
268static processor_t
269thread_bind_internal(
270 thread_t thread,
271 processor_t processor);
272
273static void
274sched_vm_group_maintenance(void);
275
276#if defined(CONFIG_SCHED_TIMESHARE_CORE)
277int8_t sched_load_shifts[NRQS];
278bitmap_t sched_preempt_pri[BITMAP_LEN(NRQS)];
279#endif /* CONFIG_SCHED_TIMESHARE_CORE */
280
281const struct sched_dispatch_table *sched_current_dispatch = NULL;
282
283/*
284 * Statically allocate a buffer to hold the longest possible
285 * scheduler description string, as currently implemented.
286 * bsd/kern/kern_sysctl.c has a corresponding definition in bsd/
287 * to export to userspace via sysctl(3). If either version
288 * changes, update the other.
289 *
290 * Note that in addition to being an upper bound on the strings
291 * in the kernel, it's also an exact parameter to PE_get_default(),
292 * which interrogates the device tree on some platforms. That
293 * API requires the caller know the exact size of the device tree
294 * property, so we need both a legacy size (32) and the current size
295 * (48) to deal with old and new device trees. The device tree property
296 * is similarly padded to a fixed size so that the same kernel image
297 * can run on multiple devices with different schedulers configured
298 * in the device tree.
299 */
300char sched_string[SCHED_STRING_MAX_LENGTH];
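/*
 * Illustrative sketch (hypothetical property name and helper, not built):
 * how a fixed-size device tree property of the kind described above can be
 * read. Because PE_get_default() must be passed the exact property length,
 * a consumer that supports both old and new device trees tries the current
 * padded size first and then falls back to the legacy size.
 */
#if 0
static void
example_read_sched_property(char *buf)
{
	/* buf must hold at least SCHED_STRING_MAX_LENGTH bytes */
	if (!PE_get_default("kern.sched", buf, SCHED_STRING_MAX_LENGTH) &&
	    !PE_get_default("kern.sched", buf, 32)) {
		buf[0] = '\0';	/* no device tree override present */
	}
}
#endif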
301
302uint32_t sched_debug_flags = SCHED_DEBUG_FLAG_CHOOSE_PROCESSOR_TRACEPOINTS;
303
304/* Global flag which indicates whether Background Stepper Context is enabled */
305static int cpu_throttle_enabled = 1;
306
307#if DEBUG
308
309/* Since using the indirect function dispatch table has a negative impact on
310 * context switch performance, only allow DEBUG kernels to use that mechanism.
311 */
312static void
313sched_init_override(void)
314{
315 char sched_arg[SCHED_STRING_MAX_LENGTH] = { '\0' };
316
317 /* Check for runtime selection of the scheduler algorithm */
318 if (!PE_parse_boot_argn("sched", sched_arg, sizeof (sched_arg))) {
319 sched_arg[0] = '\0';
320 }
321 if (strlen(sched_arg) > 0) {
322 if (0) {
323 /* Allow pattern below */
324#if defined(CONFIG_SCHED_TRADITIONAL)
325 } else if (0 == strcmp(sched_arg, sched_traditional_dispatch.sched_name)) {
326 sched_current_dispatch = &sched_traditional_dispatch;
327 } else if (0 == strcmp(sched_arg, sched_traditional_with_pset_runqueue_dispatch.sched_name)) {
328 sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
329#endif
330#if defined(CONFIG_SCHED_MULTIQ)
331 } else if (0 == strcmp(sched_arg, sched_multiq_dispatch.sched_name)) {
332 sched_current_dispatch = &sched_multiq_dispatch;
333 } else if (0 == strcmp(sched_arg, sched_dualq_dispatch.sched_name)) {
334 sched_current_dispatch = &sched_dualq_dispatch;
335#endif
336 } else {
337#if defined(CONFIG_SCHED_TRADITIONAL)
338 printf("Unrecognized scheduler algorithm: %s\n", sched_arg);
339 printf("Scheduler: Using instead: %s\n", sched_traditional_with_pset_runqueue_dispatch.sched_name);
340 sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
341#else
342 panic("Unrecognized scheduler algorithm: %s", sched_arg);
343#endif
344 }
345 kprintf("Scheduler: Runtime selection of %s\n", SCHED(sched_name));
346 } else {
347#if defined(CONFIG_SCHED_MULTIQ)
348 sched_current_dispatch = &sched_dualq_dispatch;
349#elif defined(CONFIG_SCHED_TRADITIONAL)
350 sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
351#else
352#error No default scheduler implementation
353#endif
354 kprintf("Scheduler: Default of %s\n", SCHED(sched_name));
355 }
356}
357
358#endif /* DEBUG */
359
360void
361sched_init(void)
362{
363#if DEBUG
364 sched_init_override();
365#else /* DEBUG */
366 kprintf("Scheduler: Default of %s\n", SCHED(sched_name));
367#endif /* DEBUG */
368
369 if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) {
370 /* No boot-args, check in device tree */
371 if (!PE_get_default("kern.sched_pri_decay_limit",
372 &sched_pri_decay_band_limit,
373 sizeof(sched_pri_decay_band_limit))) {
374 /* Allow decay all the way to normal limits */
375 sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
376 }
377 }
378
379 kprintf("Setting scheduler priority decay band limit %d\n", sched_pri_decay_band_limit);
380
381 if (PE_parse_boot_argn("sched_debug", &sched_debug_flags, sizeof(sched_debug_flags))) {
382 kprintf("Scheduler: Debug flags 0x%08x\n", sched_debug_flags);
383 }
384 strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string));
385
386 cpu_quiescent_counter_init();
387
388 SCHED(init)();
389 SCHED(rt_init)(&pset0);
390 sched_timer_deadline_tracking_init();
391
392 SCHED(pset_init)(&pset0);
393 SCHED(processor_init)(master_processor);
394}
395
396void
397sched_timebase_init(void)
398{
399 uint64_t abstime;
400
401 clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime);
402 sched_one_second_interval = abstime;
403
404 SCHED(timebase_init)();
405 sched_realtime_timebase_init();
406}
407
408#if defined(CONFIG_SCHED_TIMESHARE_CORE)
409
410void
411sched_timeshare_init(void)
412{
413 /*
414 * Calculate the timeslicing quantum
415 * in us.
416 */
417 if (default_preemption_rate < 1)
418 default_preemption_rate = DEFAULT_PREEMPTION_RATE;
419 std_quantum_us = (1000 * 1000) / default_preemption_rate;
420
421 printf("standard timeslicing quantum is %d us\n", std_quantum_us);
422
423 if (default_bg_preemption_rate < 1)
424 default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
425 bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate;
426
427 printf("standard background quantum is %d us\n", bg_quantum_us);
428
429 load_shift_init();
430 preempt_pri_init();
431 sched_tick = 0;
432}
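/*
 * Worked example with the defaults above: DEFAULT_PREEMPTION_RATE of 100/s
 * gives std_quantum_us = 1000000 / 100 = 10000 us (10 ms), and
 * DEFAULT_BG_PREEMPTION_RATE of 400/s gives bg_quantum_us = 1000000 / 400 =
 * 2500 us (2.5 ms); overriding either rate scales its quantum inversely.
 */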
433
434void
435sched_timeshare_timebase_init(void)
436{
437 uint64_t abstime;
438 uint32_t shift;
439
440 /* standard timeslicing quantum */
441 clock_interval_to_absolutetime_interval(
442 std_quantum_us, NSEC_PER_USEC, &abstime);
443 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
444 std_quantum = (uint32_t)abstime;
445
446 /* smallest remaining quantum (250 us) */
447 clock_interval_to_absolutetime_interval(250, NSEC_PER_USEC, &abstime);
448 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
449 min_std_quantum = (uint32_t)abstime;
450
451 /* quantum for background tasks */
452 clock_interval_to_absolutetime_interval(
453 bg_quantum_us, NSEC_PER_USEC, &abstime);
454 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
455 bg_quantum = (uint32_t)abstime;
456
457 /* scheduler tick interval */
458 clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT,
459 NSEC_PER_USEC, &abstime);
460 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
461 sched_tick_interval = (uint32_t)abstime;
462
463 /* timeshare load calculation interval & deadline initialization */
464 clock_interval_to_absolutetime_interval(sched_load_compute_interval_us, NSEC_PER_USEC, &sched_load_compute_interval_abs);
465 sched_load_compute_deadline = sched_load_compute_interval_abs;
466
467 /*
468 * Compute conversion factor from usage to
469 * timesharing priorities with 5/8 ** n aging.
470 */
471 abstime = (abstime * 5) / 3;
472 for (shift = 0; abstime > BASEPRI_DEFAULT; ++shift)
473 abstime >>= 1;
474 sched_fixed_shift = shift;
475
476 for (uint32_t i = 0 ; i < TH_BUCKET_MAX ; i++)
477 sched_pri_shifts[i] = INT8_MAX;
478
479 max_unsafe_computation = ((uint64_t)max_unsafe_quanta) * std_quantum;
480 sched_safe_duration = 2 * ((uint64_t)max_unsafe_quanta) * std_quantum;
481
482 max_poll_computation = ((uint64_t)max_poll_quanta) * std_quantum;
483 thread_depress_time = 1 * std_quantum;
484 default_timeshare_computation = std_quantum / 2;
485 default_timeshare_constraint = std_quantum;
486
487#if __arm__ || __arm64__
488 perfcontrol_failsafe_starvation_threshold = (2 * sched_tick_interval);
489#endif /* __arm__ || __arm64__ */
490}
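/*
 * Worked example of the derived values above, assuming the default 10 ms
 * std_quantum: max_unsafe_computation = 800 quanta = 8 s,
 * sched_safe_duration = 16 s, max_poll_computation = 2 quanta = 20 ms,
 * thread_depress_time = 10 ms, default_timeshare_computation = 5 ms and
 * default_timeshare_constraint = 10 ms. The scheduler tick interval is
 * USEC_PER_SEC >> SCHED_TICK_SHIFT microseconds, i.e. one second divided
 * by 2^SCHED_TICK_SHIFT.
 */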
491
492#endif /* CONFIG_SCHED_TIMESHARE_CORE */
493
494void
495pset_rt_init(processor_set_t pset)
496{
497 rt_lock_init(pset);
498
499 pset->rt_runq.count = 0;
500 queue_init(&pset->rt_runq.queue);
501 memset(&pset->rt_runq.runq_stats, 0, sizeof pset->rt_runq.runq_stats);
502}
503
504rt_queue_t
505sched_rtglobal_runq(processor_set_t pset)
506{
507 (void)pset;
508
509 return &pset0.rt_runq;
510}
511
512void
513sched_rtglobal_init(processor_set_t pset)
514{
515 if (pset == &pset0) {
516 return pset_rt_init(pset);
517 }
518
519 /* Only pset0 rt_runq is used, so make it easy to detect
520 * buggy accesses to others.
521 */
522 memset(&pset->rt_runq, 0xfd, sizeof pset->rt_runq);
523}
524
525void
526sched_rtglobal_queue_shutdown(processor_t processor)
527{
528 (void)processor;
529}
530
531static void
532sched_realtime_timebase_init(void)
533{
534 uint64_t abstime;
535
536 /* smallest rt computation (50 us) */
537 clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
538 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
539 min_rt_quantum = (uint32_t)abstime;
540
541 /* maximum rt computation (50 ms) */
542 clock_interval_to_absolutetime_interval(
543 50, 1000*NSEC_PER_USEC, &abstime);
544 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
545 max_rt_quantum = (uint32_t)abstime;
546
547}
548
549void
550sched_check_spill(processor_set_t pset, thread_t thread)
551{
552 (void)pset;
553 (void)thread;
554
555 return;
556}
557
558bool
559sched_thread_should_yield(processor_t processor, thread_t thread)
560{
561 (void)thread;
562
563 return (!SCHED(processor_queue_empty)(processor) || rt_runq_count(processor->processor_set) > 0);
564}
565
566#if defined(CONFIG_SCHED_TIMESHARE_CORE)
567
568/*
569 * Set up values for timeshare
570 * loading factors.
571 */
572static void
573load_shift_init(void)
574{
575 int8_t k, *p = sched_load_shifts;
576 uint32_t i, j;
577
578 uint32_t sched_decay_penalty = 1;
579
580 if (PE_parse_boot_argn("sched_decay_penalty", &sched_decay_penalty, sizeof (sched_decay_penalty))) {
581 kprintf("Overriding scheduler decay penalty %u\n", sched_decay_penalty);
582 }
583
584 if (PE_parse_boot_argn("sched_decay_usage_age_factor", &sched_decay_usage_age_factor, sizeof (sched_decay_usage_age_factor))) {
585 kprintf("Overriding scheduler decay usage age factor %u\n", sched_decay_usage_age_factor);
586 }
587
588 if (sched_decay_penalty == 0) {
589 /*
590 * There is no penalty for timeshare threads for using too much
591 * CPU, so set all load shifts to INT8_MIN. Even under high load,
592 * sched_pri_shift will be >INT8_MAX, and there will be no
593 * penalty applied to threads (nor will sched_usage be updated per
594 * thread).
595 */
596 for (i = 0; i < NRQS; i++) {
597 sched_load_shifts[i] = INT8_MIN;
598 }
599
600 return;
601 }
602
603 *p++ = INT8_MIN; *p++ = 0;
604
605 /*
606 * For a given system load "i", the per-thread priority
607 * penalty per quantum of CPU usage is ~2^k priority
608 * levels. "sched_decay_penalty" can cause more
609 * array entries to be filled with smaller "k" values
610 */
611 for (i = 2, j = 1 << sched_decay_penalty, k = 1; i < NRQS; ++k) {
612 for (j <<= 1; (i < j) && (i < NRQS); ++i)
613 *p++ = k;
614 }
615}
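/*
 * Worked example, assuming the default sched_decay_penalty of 1: the loops
 * above fill sched_load_shifts[] as
 *	load 0 -> INT8_MIN (no penalty), load 1 -> 0,
 *	loads 2-3 -> 1, loads 4-7 -> 2, loads 8-15 -> 3, ...
 * i.e. k is roughly floor(log2(load)), so the per-quantum penalty of
 * ~2^k priority levels tracks the run-queue load. Raising
 * sched_decay_penalty fills more entries with smaller k, which (per the
 * comment above) reduces the per-quantum penalty at the same load.
 */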
616
617static void
618preempt_pri_init(void)
619{
620 bitmap_t *p = sched_preempt_pri;
621
622 for (int i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i)
623 bitmap_set(p, i);
624
625 for (int i = BASEPRI_PREEMPT; i <= MAXPRI; ++i)
626 bitmap_set(p, i);
627}
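/*
 * Illustrative sketch (hypothetical helper, not built): consumers of the
 * bitmap initialized above test a single bit to decide whether a newly
 * runnable priority is "urgent" enough to warrant immediate preemption,
 * along the lines of:
 */
#if 0
static boolean_t
example_priority_is_urgent(int priority)
{
	return bitmap_test(sched_preempt_pri, priority) ? TRUE : FALSE;
}
#endif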
628
629#endif /* CONFIG_SCHED_TIMESHARE_CORE */
630
631/*
632 * Thread wait timer expiration.
633 */
634void
635thread_timer_expire(
636 void *p0,
637 __unused void *p1)
638{
639 thread_t thread = p0;
640 spl_t s;
641
642 assert_thread_magic(thread);
643
644 s = splsched();
645 thread_lock(thread);
646 if (--thread->wait_timer_active == 0) {
647 if (thread->wait_timer_is_set) {
648 thread->wait_timer_is_set = FALSE;
649 clear_wait_internal(thread, THREAD_TIMED_OUT);
650 }
651 }
652 thread_unlock(thread);
653 splx(s);
654}
655
656/*
657 * thread_unblock:
658 *
659 * Unblock thread on wake up.
660 *
661 * Returns TRUE if the thread should now be placed on the runqueue.
662 *
663 * Thread must be locked.
664 *
665 * Called at splsched().
666 */
667boolean_t
668thread_unblock(
669 thread_t thread,
670 wait_result_t wresult)
671{
672 boolean_t ready_for_runq = FALSE;
673 thread_t cthread = current_thread();
674 uint32_t new_run_count;
675 int old_thread_state;
676
677 /*
678 * Set wait_result.
679 */
680 thread->wait_result = wresult;
681
682 /*
683 * Cancel pending wait timer.
684 */
685 if (thread->wait_timer_is_set) {
686 if (timer_call_cancel(&thread->wait_timer))
687 thread->wait_timer_active--;
688 thread->wait_timer_is_set = FALSE;
689 }
690
691 /*
692 * Update scheduling state: not waiting,
693 * set running.
694 */
695 old_thread_state = thread->state;
696 thread->state = (old_thread_state | TH_RUN) &
697 ~(TH_WAIT|TH_UNINT|TH_WAIT_REPORT);
698
699 if ((old_thread_state & TH_RUN) == 0) {
700 uint64_t ctime = mach_approximate_time();
701 thread->last_made_runnable_time = thread->last_basepri_change_time = ctime;
702 timer_start(&thread->runnable_timer, ctime);
703
704 ready_for_runq = TRUE;
705
706 if (old_thread_state & TH_WAIT_REPORT) {
707 (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
708 }
709
710 /* Update the runnable thread count */
711 new_run_count = sched_run_incr(thread);
712 } else {
713 /*
714 * Either the thread is idling in place on another processor,
715 * or it hasn't finished context switching yet.
716 */
717#if CONFIG_SCHED_IDLE_IN_PLACE
718 if (thread->state & TH_IDLE) {
719 processor_t processor = thread->last_processor;
720
721 if (processor != current_processor())
722 machine_signal_idle(processor);
723 }
724#else
725 assert((thread->state & TH_IDLE) == 0);
726#endif
727 /*
728 * The run count is only dropped after the context switch completes
729 * and the thread is still waiting, so we should not run_incr here
730 */
731 new_run_count = sched_run_buckets[TH_BUCKET_RUN];
732 }
733
734
735 /*
736 * Calculate deadline for real-time threads.
737 */
738 if (thread->sched_mode == TH_MODE_REALTIME) {
739 uint64_t ctime;
740
741 ctime = mach_absolute_time();
742 thread->realtime.deadline = thread->realtime.constraint + ctime;
743 }
744
745 /*
746 * Clear old quantum, fail-safe computation, etc.
747 */
748 thread->quantum_remaining = 0;
749 thread->computation_metered = 0;
750 thread->reason = AST_NONE;
751 thread->block_hint = kThreadWaitNone;
752
753 /* Obtain power-relevant interrupt and "platform-idle exit" statistics.
754 * We also account for "double hop" thread signaling via
755 * the thread callout infrastructure.
756 * DRK: consider removing the callout wakeup counters in the future;
757 * they're present for verification at the moment.
758 */
759 boolean_t aticontext, pidle;
760 ml_get_power_state(&aticontext, &pidle);
761
762 if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) {
763 DTRACE_SCHED2(iwakeup, struct thread *, thread, struct proc *, thread->task->bsd_info);
764
765 uint64_t ttd = PROCESSOR_DATA(current_processor(), timer_call_ttd);
766
767 if (ttd) {
768 if (ttd <= timer_deadline_tracking_bin_1)
769 thread->thread_timer_wakeups_bin_1++;
770 else
771 if (ttd <= timer_deadline_tracking_bin_2)
772 thread->thread_timer_wakeups_bin_2++;
773 }
774
775 ledger_credit_thread(thread, thread->t_ledger,
776 task_ledgers.interrupt_wakeups, 1);
777 if (pidle) {
778 ledger_credit_thread(thread, thread->t_ledger,
779 task_ledgers.platform_idle_wakeups, 1);
780 }
781
782 } else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) {
783 /* TODO: what about an interrupt that does a wake taken on a callout thread? */
784 if (cthread->callout_woken_from_icontext) {
785 ledger_credit_thread(thread, thread->t_ledger,
786 task_ledgers.interrupt_wakeups, 1);
787 thread->thread_callout_interrupt_wakeups++;
788
789 if (cthread->callout_woken_from_platform_idle) {
790 ledger_credit_thread(thread, thread->t_ledger,
791 task_ledgers.platform_idle_wakeups, 1);
792 thread->thread_callout_platform_idle_wakeups++;
793 }
794
795 cthread->callout_woke_thread = TRUE;
796 }
797 }
798
799 if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) {
800 thread->callout_woken_from_icontext = aticontext;
801 thread->callout_woken_from_platform_idle = pidle;
802 thread->callout_woke_thread = FALSE;
803 }
804
805#if KPERF
806 if (ready_for_runq) {
807 kperf_make_runnable(thread, aticontext);
808 }
809#endif /* KPERF */
810
811 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
812 MACHDBG_CODE(DBG_MACH_SCHED,MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE,
813 (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result,
814 sched_run_buckets[TH_BUCKET_RUN], 0);
815
816 DTRACE_SCHED2(wakeup, struct thread *, thread, struct proc *, thread->task->bsd_info);
817
818 return (ready_for_runq);
819}
820
821/*
822 * Routine: thread_go
823 * Purpose:
824 * Unblock and dispatch thread.
825 * Conditions:
826 * thread lock held, IPC locks may be held.
827 * thread must have been pulled from wait queue under same lock hold.
828 * thread must have been waiting
829 * Returns:
830 * KERN_SUCCESS - Thread was set running
831 *
832 * TODO: This should return void
833 */
834kern_return_t
835thread_go(
836 thread_t thread,
837 wait_result_t wresult)
838{
839 assert_thread_magic(thread);
840
841 assert(thread->at_safe_point == FALSE);
842 assert(thread->wait_event == NO_EVENT64);
843 assert(thread->waitq == NULL);
844
845 assert(!(thread->state & (TH_TERMINATE|TH_TERMINATE2)));
846 assert(thread->state & TH_WAIT);
847
848
849 if (thread_unblock(thread, wresult)) {
850#if SCHED_TRACE_THREAD_WAKEUPS
851 backtrace(&thread->thread_wakeup_bt[0],
852 (sizeof(thread->thread_wakeup_bt)/sizeof(uintptr_t)));
853#endif
854 thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
855 }
856
857 return (KERN_SUCCESS);
858}
859
860/*
861 * Routine: thread_mark_wait_locked
862 * Purpose:
863 * Mark a thread as waiting. If, given the circumstances,
864 * it doesn't want to wait (i.e. already aborted), then
865 * indicate that in the return value.
866 * Conditions:
867 * at splsched() and thread is locked.
868 */
869__private_extern__
870wait_result_t
871thread_mark_wait_locked(
872 thread_t thread,
873 wait_interrupt_t interruptible_orig)
874{
875 boolean_t at_safe_point;
876 wait_interrupt_t interruptible = interruptible_orig;
877
878 assert(!(thread->state & (TH_WAIT|TH_IDLE|TH_UNINT|TH_TERMINATE2|TH_WAIT_REPORT)));
879
880 /*
881 * The thread may have certain types of interrupts/aborts masked
882 * off. Even if the wait location says these types of interrupts
883 * are OK, we have to honor mask settings (outer-scoped code may
884 * not be able to handle aborts at the moment).
885 */
886 interruptible &= TH_OPT_INTMASK;
887 if (interruptible > (thread->options & TH_OPT_INTMASK))
888 interruptible = thread->options & TH_OPT_INTMASK;
889
890 at_safe_point = (interruptible == THREAD_ABORTSAFE);
891
892 if ( interruptible == THREAD_UNINT ||
893 !(thread->sched_flags & TH_SFLAG_ABORT) ||
894 (!at_safe_point &&
895 (thread->sched_flags & TH_SFLAG_ABORTSAFELY))) {
896
897 if ( !(thread->state & TH_TERMINATE))
898 DTRACE_SCHED(sleep);
899
900 int state_bits = TH_WAIT;
901 if (!interruptible) {
902 state_bits |= TH_UNINT;
903 }
904 if (thread->sched_call) {
905 wait_interrupt_t mask = THREAD_WAIT_NOREPORT_USER;
906 if (is_kerneltask(thread->task)) {
907 mask = THREAD_WAIT_NOREPORT_KERNEL;
908 }
909 if ((interruptible_orig & mask) == 0) {
910 state_bits |= TH_WAIT_REPORT;
911 }
912 }
913 thread->state |= state_bits;
914 thread->at_safe_point = at_safe_point;
915
916 /* TODO: pass this through assert_wait instead, have
917 * assert_wait just take a struct as an argument */
918 assert(!thread->block_hint);
919 thread->block_hint = thread->pending_block_hint;
920 thread->pending_block_hint = kThreadWaitNone;
921
922 return (thread->wait_result = THREAD_WAITING);
923 } else {
924 if (thread->sched_flags & TH_SFLAG_ABORTSAFELY)
925 thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;
926 }
927 thread->pending_block_hint = kThreadWaitNone;
928
929 return (thread->wait_result = THREAD_INTERRUPTED);
930}
931
932/*
933 * Routine: thread_interrupt_level
934 * Purpose:
935 * Set the maximum interruptible state for the
936 * current thread. The effective value of any
937 * interruptible flag passed into assert_wait
938 * will never exceed this.
939 *
940 * Useful for code that must not be interrupted,
941 * but which calls code that doesn't know that.
942 * Returns:
943 * The old interrupt level for the thread.
944 */
945__private_extern__
946wait_interrupt_t
947thread_interrupt_level(
948 wait_interrupt_t new_level)
949{
950 thread_t thread = current_thread();
951 wait_interrupt_t result = thread->options & TH_OPT_INTMASK;
952
953 thread->options = (thread->options & ~TH_OPT_INTMASK) | (new_level & TH_OPT_INTMASK);
954
955 return result;
956}
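/*
 * Usage sketch (illustrative, not built): callers that must not be aborted
 * across a blocking region raise the level to THREAD_UNINT around that
 * region and restore the previous level afterwards.
 */
#if 0
	wait_interrupt_t wsave;

	wsave = thread_interrupt_level(THREAD_UNINT);
	/* ... code that may block but must not be aborted ... */
	thread_interrupt_level(wsave);
#endif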
957
958/*
959 * assert_wait:
960 *
961 * Assert that the current thread is about to go to
962 * sleep until the specified event occurs.
963 */
964wait_result_t
965assert_wait(
966 event_t event,
967 wait_interrupt_t interruptible)
968{
969 if (__improbable(event == NO_EVENT))
970 panic("%s() called with NO_EVENT", __func__);
971
972 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
973 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
974 VM_KERNEL_UNSLIDE_OR_PERM(event), 0, 0, 0, 0);
975
976 struct waitq *waitq;
977 waitq = global_eventq(event);
978 return waitq_assert_wait64(waitq, CAST_EVENT64_T(event), interruptible, TIMEOUT_WAIT_FOREVER);
979}
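/*
 * Usage sketch (illustrative, not built; 'lock' and 'flag' are hypothetical):
 * the classic pairing of assert_wait() with thread_block() on the waiter
 * side and thread_wakeup() on the waker side. The wait is asserted while
 * the condition is still protected, the lock is dropped, and only then does
 * the thread block; a wakeup issued in between is not lost because the
 * thread is already enqueued on the event's waitq.
 */
#if 0
	/* waiter */
	simple_lock(&lock);
	while (!flag) {
		assert_wait((event_t)&flag, THREAD_UNINT);
		simple_unlock(&lock);
		thread_block(THREAD_CONTINUE_NULL);
		simple_lock(&lock);
	}
	simple_unlock(&lock);

	/* waker */
	simple_lock(&lock);
	flag = TRUE;
	simple_unlock(&lock);
	thread_wakeup((event_t)&flag);
#endif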
980
981/*
982 * assert_wait_queue:
983 *
984 * Return the global waitq for the specified event
985 */
986struct waitq *
987assert_wait_queue(
988 event_t event)
989{
990 return global_eventq(event);
991}
992
993wait_result_t
994assert_wait_timeout(
995 event_t event,
996 wait_interrupt_t interruptible,
997 uint32_t interval,
998 uint32_t scale_factor)
999{
1000 thread_t thread = current_thread();
1001 wait_result_t wresult;
1002 uint64_t deadline;
1003 spl_t s;
1004
1005 if (__improbable(event == NO_EVENT))
1006 panic("%s() called with NO_EVENT", __func__);
1007
1008 struct waitq *waitq;
1009 waitq = global_eventq(event);
1010
1011 s = splsched();
1012 waitq_lock(waitq);
1013
1014 clock_interval_to_deadline(interval, scale_factor, &deadline);
1015
1016 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1017 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1018 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1019
1020 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1021 interruptible,
1022 TIMEOUT_URGENCY_SYS_NORMAL,
1023 deadline, TIMEOUT_NO_LEEWAY,
1024 thread);
1025
1026 waitq_unlock(waitq);
1027 splx(s);
1028 return wresult;
1029}
1030
1031wait_result_t
1032assert_wait_timeout_with_leeway(
1033 event_t event,
1034 wait_interrupt_t interruptible,
1035 wait_timeout_urgency_t urgency,
1036 uint32_t interval,
1037 uint32_t leeway,
1038 uint32_t scale_factor)
1039{
1040 thread_t thread = current_thread();
1041 wait_result_t wresult;
1042 uint64_t deadline;
1043 uint64_t abstime;
1044 uint64_t slop;
1045 uint64_t now;
1046 spl_t s;
1047
1048 if (__improbable(event == NO_EVENT))
1049 panic("%s() called with NO_EVENT", __func__);
1050
1051 now = mach_absolute_time();
1052 clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime);
1053 deadline = now + abstime;
1054
1055 clock_interval_to_absolutetime_interval(leeway, scale_factor, &slop);
1056
1057 struct waitq *waitq;
1058 waitq = global_eventq(event);
1059
1060 s = splsched();
1061 waitq_lock(waitq);
1062
1063 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1064 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1065 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1066
1067 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1068 interruptible,
1069 urgency, deadline, slop,
1070 thread);
1071
1072 waitq_unlock(waitq);
1073 splx(s);
1074 return wresult;
1075}
1076
1077wait_result_t
1078assert_wait_deadline(
1079 event_t event,
1080 wait_interrupt_t interruptible,
1081 uint64_t deadline)
1082{
1083 thread_t thread = current_thread();
1084 wait_result_t wresult;
1085 spl_t s;
1086
1087 if (__improbable(event == NO_EVENT))
1088 panic("%s() called with NO_EVENT", __func__);
1089
1090 struct waitq *waitq;
1091 waitq = global_eventq(event);
1092
1093 s = splsched();
1094 waitq_lock(waitq);
1095
1096 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1097 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1098 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1099
1100 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1101 interruptible,
1102 TIMEOUT_URGENCY_SYS_NORMAL, deadline,
1103 TIMEOUT_NO_LEEWAY, thread);
1104 waitq_unlock(waitq);
1105 splx(s);
1106 return wresult;
1107}
1108
1109wait_result_t
1110assert_wait_deadline_with_leeway(
1111 event_t event,
1112 wait_interrupt_t interruptible,
1113 wait_timeout_urgency_t urgency,
1114 uint64_t deadline,
1115 uint64_t leeway)
1116{
1117 thread_t thread = current_thread();
1118 wait_result_t wresult;
1119 spl_t s;
1120
1121 if (__improbable(event == NO_EVENT))
1122 panic("%s() called with NO_EVENT", __func__);
1123
1124 struct waitq *waitq;
1125 waitq = global_eventq(event);
1126
1127 s = splsched();
1128 waitq_lock(waitq);
1129
1130 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1131 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1132 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1133
1134 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1135 interruptible,
1136 urgency, deadline, leeway,
1137 thread);
1138 waitq_unlock(waitq);
1139 splx(s);
1140 return wresult;
1141}
1142
1143/*
1144 * thread_isoncpu:
1145 *
1146 * Return TRUE if a thread is running on a processor such that an AST
1147 * is needed to pull it out of userspace execution, or, if it is executing
1148 * in the kernel, to bring it to a context switch boundary that would cause
1149 * its thread state to be serialized in the thread PCB.
1150 *
1151 * Thread locked, returns the same way. While locked, fields
1152 * like "state" cannot change. "runq" can change only from set to unset.
1153 */
1154static inline boolean_t
1155thread_isoncpu(thread_t thread)
1156{
1157 /* Not running or runnable */
1158 if (!(thread->state & TH_RUN))
1159 return (FALSE);
1160
1161 /* Waiting on a runqueue, not currently running */
1162 /* TODO: This is invalid - it can get dequeued without thread lock, but not context switched. */
1163 if (thread->runq != PROCESSOR_NULL)
1164 return (FALSE);
1165
1166 /*
1167 * Thread does not have a stack yet
1168 * It could be on the stack alloc queue or preparing to be invoked
1169 */
1170 if (!thread->kernel_stack)
1171 return (FALSE);
1172
1173 /*
1174 * Thread must be running on a processor, or
1175 * about to run, or just did run. In all these
1176 * cases, an AST to the processor is needed
1177 * to guarantee that the thread is kicked out
1178 * of userspace and the processor has
1179 * context switched (and saved register state).
1180 */
1181 return (TRUE);
1182}
1183
1184/*
1185 * thread_stop:
1186 *
1187 * Force a preemption point for a thread and wait
1188 * for it to stop running on a CPU. If a stronger
1189 * guarantee is requested, wait until no longer
1190 * runnable. Arbitrates access among
1191 * multiple stop requests. (released by unstop)
1192 *
1193 * The thread must enter a wait state and stop via a
1194 * separate means.
1195 *
1196 * Returns FALSE if interrupted.
1197 */
1198boolean_t
1199thread_stop(
1200 thread_t thread,
1201 boolean_t until_not_runnable)
1202{
1203 wait_result_t wresult;
1204 spl_t s = splsched();
1205 boolean_t oncpu;
1206
1207 wake_lock(thread);
1208 thread_lock(thread);
1209
1210 while (thread->state & TH_SUSP) {
1211 thread->wake_active = TRUE;
1212 thread_unlock(thread);
1213
1214 wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1215 wake_unlock(thread);
1216 splx(s);
1217
1218 if (wresult == THREAD_WAITING)
1219 wresult = thread_block(THREAD_CONTINUE_NULL);
1220
1221 if (wresult != THREAD_AWAKENED)
1222 return (FALSE);
1223
1224 s = splsched();
1225 wake_lock(thread);
1226 thread_lock(thread);
1227 }
1228
1229 thread->state |= TH_SUSP;
1230
1231 while ((oncpu = thread_isoncpu(thread)) ||
1232 (until_not_runnable && (thread->state & TH_RUN))) {
1233 processor_t processor;
1234
1235 if (oncpu) {
1236 assert(thread->state & TH_RUN);
1237 processor = thread->chosen_processor;
1238 cause_ast_check(processor);
1239 }
1240
1241 thread->wake_active = TRUE;
1242 thread_unlock(thread);
1243
1244 wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1245 wake_unlock(thread);
1246 splx(s);
1247
1248 if (wresult == THREAD_WAITING)
1249 wresult = thread_block(THREAD_CONTINUE_NULL);
1250
1251 if (wresult != THREAD_AWAKENED) {
1252 thread_unstop(thread);
1253 return (FALSE);
1254 }
1255
1256 s = splsched();
1257 wake_lock(thread);
1258 thread_lock(thread);
1259 }
1260
1261 thread_unlock(thread);
1262 wake_unlock(thread);
1263 splx(s);
1264
1265 /*
1266 * We return with the thread unlocked. To prevent it from
1267 * transitioning to a runnable state (or from TH_RUN to
1268 * being on the CPU), the caller must ensure the thread
1269 * is stopped via an external means (such as an AST)
1270 */
1271
1272 return (TRUE);
1273}
1274
1275/*
1276 * thread_unstop:
1277 *
1278 * Release a previous stop request and set
1279 * the thread running if appropriate.
1280 *
1281 * Use only after a successful stop operation.
1282 */
1283void
1284thread_unstop(
1285 thread_t thread)
1286{
1287 spl_t s = splsched();
1288
1289 wake_lock(thread);
1290 thread_lock(thread);
1291
1292 assert((thread->state & (TH_RUN|TH_WAIT|TH_SUSP)) != TH_SUSP);
1293
1294 if (thread->state & TH_SUSP) {
1295 thread->state &= ~TH_SUSP;
1296
1297 if (thread->wake_active) {
1298 thread->wake_active = FALSE;
1299 thread_unlock(thread);
1300
1301 thread_wakeup(&thread->wake_active);
1302 wake_unlock(thread);
1303 splx(s);
1304
1305 return;
1306 }
1307 }
1308
1309 thread_unlock(thread);
1310 wake_unlock(thread);
1311 splx(s);
1312}
1313
1314/*
1315 * thread_wait:
1316 *
1317 * Wait for a thread to stop running. (non-interruptible)
1318 *
1319 */
1320void
1321thread_wait(
1322 thread_t thread,
1323 boolean_t until_not_runnable)
1324{
1325 wait_result_t wresult;
1326 boolean_t oncpu;
1327 processor_t processor;
1328 spl_t s = splsched();
1329
1330 wake_lock(thread);
1331 thread_lock(thread);
1332
1333 /*
1334 * Wait until not running on a CPU. If stronger requirement
1335 * desired, wait until not runnable. Assumption: if thread is
1336 * on CPU, then TH_RUN is set, so we're not waiting in any case
1337 * where the original, pure "TH_RUN" check would have let us
1338 * finish.
1339 */
1340 while ((oncpu = thread_isoncpu(thread)) ||
1341 (until_not_runnable && (thread->state & TH_RUN))) {
1342
1343 if (oncpu) {
1344 assert(thread->state & TH_RUN);
1345 processor = thread->chosen_processor;
1346 cause_ast_check(processor);
1347 }
1348
1349 thread->wake_active = TRUE;
1350 thread_unlock(thread);
1351
1352 wresult = assert_wait(&thread->wake_active, THREAD_UNINT);
1353 wake_unlock(thread);
1354 splx(s);
1355
1356 if (wresult == THREAD_WAITING)
1357 thread_block(THREAD_CONTINUE_NULL);
1358
1359 s = splsched();
1360 wake_lock(thread);
1361 thread_lock(thread);
1362 }
1363
1364 thread_unlock(thread);
1365 wake_unlock(thread);
1366 splx(s);
1367}
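/*
 * Usage sketch (illustrative, not built): the stop/unstop pair brackets code
 * that must inspect or manipulate another thread while it is guaranteed to
 * be off-CPU; thread_wait() above is the non-interruptible variant for
 * callers that only need the thread to stop running, without arbitration.
 */
#if 0
	if (!thread_stop(thread, FALSE)) {
		/* interrupted while waiting for the thread to stop */
		return KERN_ABORTED;
	}

	/* the thread is not running on any CPU at this point */

	thread_unstop(thread);
#endif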
1368
1369/*
1370 * Routine: clear_wait_internal
1371 *
1372 * Clear the wait condition for the specified thread.
1373 * Start the thread executing if that is appropriate.
1374 * Arguments:
1375 * thread thread to awaken
1376 * result Wakeup result the thread should see
1377 * Conditions:
1378 * At splsched
1379 * the thread is locked.
1380 * Returns:
1381 * KERN_SUCCESS thread was rousted out a wait
1382 * KERN_FAILURE thread was waiting but could not be rousted
1383 * KERN_NOT_WAITING thread was not waiting
1384 */
1385__private_extern__ kern_return_t
1386clear_wait_internal(
1387 thread_t thread,
1388 wait_result_t wresult)
1389{
1390 uint32_t i = LockTimeOutUsec;
1391 struct waitq *waitq = thread->waitq;
1392
1393 do {
1394 if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT))
1395 return (KERN_FAILURE);
1396
1397 if (waitq != NULL) {
1398 if (!waitq_pull_thread_locked(waitq, thread)) {
1399 thread_unlock(thread);
1400 delay(1);
1401 if (i > 0 && !machine_timeout_suspended())
1402 i--;
1403 thread_lock(thread);
1404 if (waitq != thread->waitq)
1405 return KERN_NOT_WAITING;
1406 continue;
1407 }
1408 }
1409
1410 /* TODO: Can we instead assert TH_TERMINATE is not set? */
1411 if ((thread->state & (TH_WAIT|TH_TERMINATE)) == TH_WAIT)
1412 return (thread_go(thread, wresult));
1413 else
1414 return (KERN_NOT_WAITING);
1415 } while (i > 0);
1416
1417 panic("clear_wait_internal: deadlock: thread=%p, wq=%p, cpu=%d\n",
1418 thread, waitq, cpu_number());
1419
1420 return (KERN_FAILURE);
1421}
1422
1423
1424/*
1425 * clear_wait:
1426 *
1427 * Clear the wait condition for the specified thread. Start the thread
1428 * executing if that is appropriate.
1429 *
1430 * parameters:
1431 * thread thread to awaken
1432 * result Wakeup result the thread should see
1433 */
1434kern_return_t
1435clear_wait(
1436 thread_t thread,
1437 wait_result_t result)
1438{
1439 kern_return_t ret;
1440 spl_t s;
1441
1442 s = splsched();
1443 thread_lock(thread);
1444 ret = clear_wait_internal(thread, result);
1445 thread_unlock(thread);
1446 splx(s);
1447 return ret;
1448}
1449
1450
1451/*
1452 * thread_wakeup_prim:
1453 *
1454 * Common routine for thread_wakeup, thread_wakeup_with_result,
1455 * and thread_wakeup_one.
1456 *
1457 */
1458kern_return_t
1459thread_wakeup_prim(
1460 event_t event,
1461 boolean_t one_thread,
1462 wait_result_t result)
1463{
1464 if (__improbable(event == NO_EVENT))
1465 panic("%s() called with NO_EVENT", __func__);
1466
1467 struct waitq *wq = global_eventq(event);
1468
1469 if (one_thread)
1470 return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES);
1471 else
1472 return waitq_wakeup64_all(wq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES);
1473}
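/*
 * For reference, the familiar wakeup calls are thin wrappers over this
 * primitive; in the corresponding header they expand roughly as:
 *
 *	thread_wakeup(x)                -> thread_wakeup_prim((x), FALSE, THREAD_AWAKENED)
 *	thread_wakeup_with_result(x, z) -> thread_wakeup_prim((x), FALSE, (z))
 *	thread_wakeup_one(x)            -> thread_wakeup_prim((x), TRUE, THREAD_AWAKENED)
 */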
1474
1475/*
1476 * Wakeup a specified thread if and only if it's waiting for this event
1477 */
1478kern_return_t
1479thread_wakeup_thread(
1480 event_t event,
1481 thread_t thread)
1482{
1483 if (__improbable(event == NO_EVENT))
1484 panic("%s() called with NO_EVENT", __func__);
1485
1486 if (__improbable(thread == THREAD_NULL))
1487 panic("%s() called with THREAD_NULL", __func__);
1488
1489 struct waitq *wq = global_eventq(event);
1490
1491 return waitq_wakeup64_thread(wq, CAST_EVENT64_T(event), thread, THREAD_AWAKENED);
1492}
1493
1494/*
1495 * Wakeup a thread waiting on an event and promote it to a priority.
1496 *
1497 * Requires woken thread to un-promote itself when done.
1498 */
1499kern_return_t
1500thread_wakeup_one_with_pri(
1501 event_t event,
1502 int priority)
1503{
1504 if (__improbable(event == NO_EVENT))
1505 panic("%s() called with NO_EVENT", __func__);
1506
1507 struct waitq *wq = global_eventq(event);
1508
1509 return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
1510}
1511
1512/*
1513 * Wakeup a thread waiting on an event,
1514 * promote it to a priority,
1515 * and return a reference to the woken thread.
1516 *
1517 * Requires woken thread to un-promote itself when done.
1518 */
1519thread_t
1520thread_wakeup_identify(event_t event,
1521 int priority)
1522{
1523 if (__improbable(event == NO_EVENT))
1524 panic("%s() called with NO_EVENT", __func__);
1525
1526 struct waitq *wq = global_eventq(event);
1527
1528 return waitq_wakeup64_identify(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
1529}
1530
1531/*
1532 * thread_bind:
1533 *
1534 * Force the current thread to execute on the specified processor.
1535 * Takes effect after the next thread_block().
1536 *
1537 * Returns the previous binding. PROCESSOR_NULL means
1538 * not bound.
1539 *
1540 * XXX - DO NOT export this to users - XXX
1541 */
1542processor_t
1543thread_bind(
1544 processor_t processor)
1545{
1546 thread_t self = current_thread();
1547 processor_t prev;
1548 spl_t s;
1549
1550 s = splsched();
1551 thread_lock(self);
1552
1553 prev = thread_bind_internal(self, processor);
1554
1555 thread_unlock(self);
1556 splx(s);
1557
1558 return (prev);
1559}
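/*
 * Usage sketch (illustrative, not built): binding only takes effect after
 * the next thread_block(), so callers pair thread_bind() with a
 * thread_block() to migrate immediately, and restore the previous binding
 * when done (thread_vm_bind_group_add() below uses the bind-then-block
 * half of this pattern).
 */
#if 0
	processor_t prev;

	prev = thread_bind(master_processor);
	thread_block(THREAD_CONTINUE_NULL);	/* switch to the bound processor */

	/* ... work that must run on master_processor ... */

	thread_bind(prev);
	thread_block(THREAD_CONTINUE_NULL);
#endif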
1560
1561/*
1562 * thread_bind_internal:
1563 *
1564 * If the specified thread is not the current thread, and it is currently
1565 * running on another CPU, a remote AST must be sent to that CPU to cause
1566 * the thread to migrate to its bound processor. Otherwise, the migration
1567 * will occur at the next quantum expiration or blocking point.
1568 *
1569 * When the thread is the current thread, an explicit thread_block() should
1570 * be used to force the current processor to context switch away and
1571 * let the thread migrate to the bound processor.
1572 *
1573 * Thread must be locked, and at splsched.
1574 */
1575
1576static processor_t
1577thread_bind_internal(
1578 thread_t thread,
1579 processor_t processor)
1580{
1581 processor_t prev;
1582
1583 /* <rdar://problem/15102234> */
1584 assert(thread->sched_pri < BASEPRI_RTQUEUES);
1585 /* A thread can't be bound if it's sitting on a (potentially incorrect) runqueue */
1586 assert(thread->runq == PROCESSOR_NULL);
1587
1588 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_THREAD_BIND), thread_tid(thread), processor ? (uintptr_t)processor->cpu_id : (uintptr_t)-1, 0, 0, 0);
1589
1590 prev = thread->bound_processor;
1591 thread->bound_processor = processor;
1592
1593 return (prev);
1594}
1595
1596/*
1597 * thread_vm_bind_group_add:
1598 *
1599 * The "VM bind group" is a special mechanism to mark a collection
1600 * of threads from the VM subsystem that, in general, should be scheduled
1601 * with only one CPU of parallelism. To accomplish this, we initially
1602 * bind all the threads to the master processor, which has the effect
1603 * that only one of the threads in the group can execute at once, including
1604 * preempting threads in the group that are a lower priority. Future
1605 * implementations may use more dynamic mechanisms to prevent the collection
1606 * of VM threads from using more CPU time than desired.
1607 *
1608 * The current implementation can result in priority inversions where
1609 * compute-bound priority 95 or realtime threads that happen to have
1610 * landed on the master processor prevent the VM threads from running.
1611 * When this situation is detected, we unbind the threads for one
1612 * scheduler tick to allow the scheduler to run the threads on
1613 * additional CPUs, before restoring the binding (assuming high latency
1614 * is no longer a problem).
1615 */
1616
1617/*
1618 * The current max is provisioned for:
1619 * vm_compressor_swap_trigger_thread (92)
1620 * 2 x vm_pageout_iothread_internal (92) when vm_restricted_to_single_processor==TRUE
1621 * vm_pageout_continue (92)
1622 * memorystatus_thread (95)
1623 */
1624#define MAX_VM_BIND_GROUP_COUNT (5)
1625decl_simple_lock_data(static,sched_vm_group_list_lock);
1626static thread_t sched_vm_group_thread_list[MAX_VM_BIND_GROUP_COUNT];
1627static int sched_vm_group_thread_count;
1628static boolean_t sched_vm_group_temporarily_unbound = FALSE;
1629
1630void
1631thread_vm_bind_group_add(void)
1632{
1633 thread_t self = current_thread();
1634
1635 thread_reference_internal(self);
1636 self->options |= TH_OPT_SCHED_VM_GROUP;
1637
1638 simple_lock(&sched_vm_group_list_lock);
1639 assert(sched_vm_group_thread_count < MAX_VM_BIND_GROUP_COUNT);
1640 sched_vm_group_thread_list[sched_vm_group_thread_count++] = self;
1641 simple_unlock(&sched_vm_group_list_lock);
1642
1643 thread_bind(master_processor);
1644
1645 /* Switch to bound processor if not already there */
1646 thread_block(THREAD_CONTINUE_NULL);
1647}
1648
1649static void
1650sched_vm_group_maintenance(void)
1651{
1652 uint64_t ctime = mach_absolute_time();
1653 uint64_t longtime = ctime - sched_tick_interval;
1654 int i;
1655 spl_t s;
1656 boolean_t high_latency_observed = FALSE;
1657 boolean_t runnable_and_not_on_runq_observed = FALSE;
1658 boolean_t bind_target_changed = FALSE;
1659 processor_t bind_target = PROCESSOR_NULL;
1660
1661 /* Make sure nobody attempts to add new threads while we are enumerating them */
1662 simple_lock(&sched_vm_group_list_lock);
1663
1664 s = splsched();
1665
1666 for (i=0; i < sched_vm_group_thread_count; i++) {
1667 thread_t thread = sched_vm_group_thread_list[i];
1668 assert(thread != THREAD_NULL);
1669 thread_lock(thread);
1670 if ((thread->state & (TH_RUN|TH_WAIT)) == TH_RUN) {
1671 if (thread->runq != PROCESSOR_NULL && thread->last_made_runnable_time < longtime) {
1672 high_latency_observed = TRUE;
1673 } else if (thread->runq == PROCESSOR_NULL) {
1674 /* There are some cases where a thread may be transitioning that also fall into this case */
1675 runnable_and_not_on_runq_observed = TRUE;
1676 }
1677 }
1678 thread_unlock(thread);
1679
1680 if (high_latency_observed && runnable_and_not_on_runq_observed) {
1681 /* All the things we are looking for are true, stop looking */
1682 break;
1683 }
1684 }
1685
1686 splx(s);
1687
1688 if (sched_vm_group_temporarily_unbound) {
1689 /* If we turned off binding, make sure everything is OK before rebinding */
1690 if (!high_latency_observed) {
1691 /* rebind */
1692 bind_target_changed = TRUE;
1693 bind_target = master_processor;
1694 sched_vm_group_temporarily_unbound = FALSE; /* might be reset to TRUE if change cannot be completed */
1695 }
1696 } else {
1697 /*
1698 * Check if we're in a bad state, which is defined by high
1699 * latency with no core currently executing a thread. If a
1700 * single thread is making progress on a CPU, that means the
1701 * binding concept to reduce parallelism is working as
1702 * designed.
1703 */
1704 if (high_latency_observed && !runnable_and_not_on_runq_observed) {
1705 /* unbind */
1706 bind_target_changed = TRUE;
1707 bind_target = PROCESSOR_NULL;
1708 sched_vm_group_temporarily_unbound = TRUE;
1709 }
1710 }
1711
1712 if (bind_target_changed) {
1713 s = splsched();
1714 for (i=0; i < sched_vm_group_thread_count; i++) {
1715 thread_t thread = sched_vm_group_thread_list[i];
1716 boolean_t removed;
1717 assert(thread != THREAD_NULL);
1718
1719 thread_lock(thread);
1720 removed = thread_run_queue_remove(thread);
1721 if (removed || ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT)) {
1722 thread_bind_internal(thread, bind_target);
1723 } else {
1724 /*
1725 * Thread was in the middle of being context-switched-to,
1726 * or was in the process of blocking. To avoid switching the bind
1727 * state out mid-flight, defer the change if possible.
1728 */
1729 if (bind_target == PROCESSOR_NULL) {
1730 thread_bind_internal(thread, bind_target);
1731 } else {
1732 sched_vm_group_temporarily_unbound = TRUE; /* next pass will try again */
1733 }
1734 }
1735
1736 if (removed) {
1737 thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
1738 }
1739 thread_unlock(thread);
1740 }
1741 splx(s);
1742 }
1743
1744 simple_unlock(&sched_vm_group_list_lock);
1745}
1746
1747/* Invoked prior to idle entry to determine if, on SMT capable processors, an SMT
1748 * rebalancing opportunity exists when a core is (instantaneously) idle, but
1749 * other SMT-capable cores may be over-committed. TODO: some possible negatives:
1750 * IPI thrash if this core does not remain idle following the load balancing ASTs
1751 * Idle "thrash", when IPI issue is followed by idle entry/core power down
1752 * followed by a wakeup shortly thereafter.
1753 */
1754
1755#if (DEVELOPMENT || DEBUG)
1756int sched_smt_balance = 1;
1757#endif
1758
1759#if __SMP__
1760/* Invoked with pset locked, returns with pset unlocked */
1761void
1762sched_SMT_balance(processor_t cprocessor, processor_set_t cpset) {
1763 processor_t ast_processor = NULL;
1764
1765#if (DEVELOPMENT || DEBUG)
1766 if (__improbable(sched_smt_balance == 0))
1767 goto smt_balance_exit;
1768#endif
1769
1770 assert(cprocessor == current_processor());
1771 if (cprocessor->is_SMT == FALSE)
1772 goto smt_balance_exit;
1773
1774 processor_t sib_processor = cprocessor->processor_secondary ? cprocessor->processor_secondary : cprocessor->processor_primary;
1775
1776 /* Determine if both this processor and its sibling are idle,
1777 * indicating an SMT rebalancing opportunity.
1778 */
1779 if (sib_processor->state != PROCESSOR_IDLE)
1780 goto smt_balance_exit;
1781
1782 processor_t sprocessor;
1783
1784 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
1785 uint64_t running_secondary_map = (cpset->cpu_state_map[PROCESSOR_RUNNING] &
1786 ~cpset->primary_map);
1787 for (int cpuid = lsb_first(running_secondary_map); cpuid >= 0; cpuid = lsb_next(running_secondary_map, cpuid)) {
1788 sprocessor = processor_array[cpuid];
1789 if ((sprocessor->processor_primary->state == PROCESSOR_RUNNING) &&
1790 (sprocessor->current_pri < BASEPRI_RTQUEUES)) {
1791
1792 ipi_type = sched_ipi_action(sprocessor, NULL, false, SCHED_IPI_EVENT_SMT_REBAL);
1793 if (ipi_type != SCHED_IPI_NONE) {
1794 assert(sprocessor != cprocessor);
1795 ast_processor = sprocessor;
1796 break;
1797 }
1798 }
1799 }
1800
1801smt_balance_exit:
1802 pset_unlock(cpset);
1803
1804 if (ast_processor) {
1805 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_SMT_BALANCE), ast_processor->cpu_id, ast_processor->state, ast_processor->processor_primary->state, 0, 0);
1806 sched_ipi_perform(ast_processor, ipi_type);
1807 }
1808}
1809#else
1810/* Invoked with pset locked, returns with pset unlocked */
1811void
1812sched_SMT_balance(__unused processor_t cprocessor, processor_set_t cpset)
1813{
1814 pset_unlock(cpset);
1815}
1816#endif /* __SMP__ */
1817
1818static processor_t choose_processor_for_realtime_thread(processor_set_t pset);
1819static bool all_available_primaries_are_running_realtime_threads(processor_set_t pset);
1820int sched_allow_rt_smt = 1;
1821
1822/*
1823 * thread_select:
1824 *
1825 * Select a new thread for the current processor to execute.
1826 *
1827 * May select the current thread, which must be locked.
1828 */
1829static thread_t
1830thread_select(thread_t thread,
1831 processor_t processor,
1832 ast_t *reason)
1833{
1834 processor_set_t pset = processor->processor_set;
1835 thread_t new_thread = THREAD_NULL;
1836
1837 assert(processor == current_processor());
1838 assert((thread->state & (TH_RUN|TH_TERMINATE2)) == TH_RUN);
1839
1840 do {
1841 /*
1842 * Update the priority.
1843 */
1844 if (SCHED(can_update_priority)(thread))
1845 SCHED(update_priority)(thread);
1846
1847 processor_state_update_from_thread(processor, thread);
1848
1849 pset_lock(pset);
1850
1851 assert(processor->state != PROCESSOR_OFF_LINE);
1852
1853 if (!processor->is_recommended) {
1854 /*
1855 * The performance controller has provided a hint to not dispatch more threads,
1856 * unless they are bound to us (and thus we are the only option).
1857 */
1858 if (!SCHED(processor_bound_count)(processor)) {
1859 goto idle;
1860 }
1861 } else if (processor->processor_primary != processor) {
1862 /*
1863 * Should this secondary SMT processor attempt to find work? For pset runqueue systems,
1864 * we should look for work only under the same conditions that choose_processor()
1865 * would have assigned work, which is when all primary processors have been assigned work.
1866 *
1867 * An exception is that bound threads are dispatched to a processor without going through
1868 * choose_processor(), so in those cases we should continue trying to dequeue work.
1869 */
1870 if (!SCHED(processor_bound_count)(processor)) {
1871 if ((pset->recommended_bitmask & pset->primary_map & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
1872 goto idle;
1873 }
1874
1875 /* There are no idle primaries */
1876
1877 if (processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) {
1878 bool secondary_can_run_realtime_thread = sched_allow_rt_smt && rt_runq_count(pset) && all_available_primaries_are_running_realtime_threads(pset);
1879 if (!secondary_can_run_realtime_thread) {
1880 goto idle;
1881 }
1882 }
1883 }
1884 }
1885
1886 /*
1887 * Test to see if the current thread should continue
1888 * to run on this processor. Must not be attempting to wait, and not
1889 * bound to a different processor, nor be in the wrong
1890 * processor set, nor be forced to context switch by TH_SUSP.
1891 *
1892 * Note that there are never any RT threads in the regular runqueue.
1893 *
1894 * This code is extremely subtle and tricky.
1895 */
1896
1897 /* i.e. not waiting, not TH_SUSP'ed */
1898 boolean_t still_running = ((thread->state & (TH_TERMINATE|TH_IDLE|TH_WAIT|TH_RUN|TH_SUSP)) == TH_RUN);
1899
1900 /*
1901 * Threads running on SMT processors are forced to context switch. Don't rebalance realtime threads.
1902 * TODO: This should check if it's worth it to rebalance, i.e. 'are there any idle primary processors'
1903 */
1904 boolean_t needs_smt_rebalance = (thread->sched_pri < BASEPRI_RTQUEUES && processor->processor_primary != processor);
1905
1906 boolean_t affinity_mismatch = (thread->affinity_set != AFFINITY_SET_NULL && thread->affinity_set->aset_pset != pset);
1907
1908 boolean_t bound_elsewhere = (thread->bound_processor != PROCESSOR_NULL && thread->bound_processor != processor);
1909
1910 boolean_t avoid_processor = (SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread));
1911
1912 if (still_running && !needs_smt_rebalance && !affinity_mismatch && !bound_elsewhere && !avoid_processor) {
1913 /*
1914 * This thread is eligible to keep running on this processor.
1915 *
1916 * RT threads with un-expired quantum stay on processor,
1917 * unless there's a valid RT thread with an earlier deadline.
1918 */
1919 if (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice) {
1920 if (rt_runq_count(pset) > 0) {
1921
1922 rt_lock_lock(pset);
1923
1924 if (rt_runq_count(pset) > 0) {
1925
1926 thread_t next_rt = qe_queue_first(&SCHED(rt_runq)(pset)->queue, struct thread, runq_links);
1927
1928 if (next_rt->realtime.deadline < processor->deadline &&
1929 (next_rt->bound_processor == PROCESSOR_NULL ||
1930 next_rt->bound_processor == processor)) {
1931 /* The next RT thread is better, so pick it off the runqueue. */
1932 goto pick_new_rt_thread;
1933 }
1934 }
1935
1936 rt_lock_unlock(pset);
1937 }
1938
1939 /* This is still the best RT thread to run. */
1940 processor->deadline = thread->realtime.deadline;
1941
1942 sched_update_pset_load_average(pset);
1943
1944 processor_t next_rt_processor = PROCESSOR_NULL;
1945 sched_ipi_type_t next_rt_ipi_type = SCHED_IPI_NONE;
1946
1947 if (rt_runq_count(pset) > 0) {
1948 next_rt_processor = choose_processor_for_realtime_thread(pset);
1949 if (next_rt_processor) {
1950 next_rt_ipi_type = sched_ipi_action(next_rt_processor, NULL, false, SCHED_IPI_EVENT_PREEMPT);
1951 }
1952 }
1953 pset_unlock(pset);
1954
1955 if (next_rt_processor) {
1956 sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
1957 }
1958
1959 return (thread);
1960 }
1961
1962 if ((rt_runq_count(pset) == 0) &&
1963 SCHED(processor_queue_has_priority)(processor, thread->sched_pri, TRUE) == FALSE) {
1964 /* This thread is still the highest priority runnable (non-idle) thread */
1965 processor->deadline = UINT64_MAX;
1966
1967 sched_update_pset_load_average(pset);
1968 pset_unlock(pset);
1969
1970 return (thread);
1971 }
1972 } else {
1973 /*
1974 * This processor must context switch.
1975 * If it's due to a rebalance, we should aggressively find this thread a new home.
1976 */
1977 if (needs_smt_rebalance || affinity_mismatch || bound_elsewhere || avoid_processor)
1978 *reason |= AST_REBALANCE;
1979 }
1980
1981 /* OK, so we're not going to run the current thread. Look at the RT queue. */
1982 if (rt_runq_count(pset) > 0) {
1983
1984 rt_lock_lock(pset);
1985
1986 if (rt_runq_count(pset) > 0) {
1987 thread_t next_rt = qe_queue_first(&SCHED(rt_runq)(pset)->queue, struct thread, runq_links);
1988
1989 if (__probable((next_rt->bound_processor == PROCESSOR_NULL ||
1990 (next_rt->bound_processor == processor)))) {
1991pick_new_rt_thread:
1992 new_thread = qe_dequeue_head(&SCHED(rt_runq)(pset)->queue, struct thread, runq_links);
1993
1994 new_thread->runq = PROCESSOR_NULL;
1995 SCHED_STATS_RUNQ_CHANGE(&SCHED(rt_runq)(pset)->runq_stats, rt_runq_count(pset));
1996 rt_runq_count_decr(pset);
1997
1998 processor->deadline = new_thread->realtime.deadline;
1999 processor_state_update_from_thread(processor, new_thread);
2000
2001 rt_lock_unlock(pset);
2002 sched_update_pset_load_average(pset);
2003
2004 processor_t ast_processor = PROCESSOR_NULL;
2005 processor_t next_rt_processor = PROCESSOR_NULL;
2006 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
2007 sched_ipi_type_t next_rt_ipi_type = SCHED_IPI_NONE;
2008
2009 if (processor->processor_secondary != NULL) {
2010 processor_t sprocessor = processor->processor_secondary;
2011 if ((sprocessor->state == PROCESSOR_RUNNING) || (sprocessor->state == PROCESSOR_DISPATCHING)) {
2012 ipi_type = sched_ipi_action(sprocessor, NULL, false, SCHED_IPI_EVENT_SMT_REBAL);
2013 ast_processor = sprocessor;
2014 }
2015 }
2016 if (rt_runq_count(pset) > 0) {
2017 next_rt_processor = choose_processor_for_realtime_thread(pset);
2018 if (next_rt_processor) {
2019 next_rt_ipi_type = sched_ipi_action(next_rt_processor, NULL, false, SCHED_IPI_EVENT_PREEMPT);
2020 }
2021 }
2022 pset_unlock(pset);
2023
2024 if (ast_processor) {
2025 sched_ipi_perform(ast_processor, ipi_type);
2026 }
2027
2028 if (next_rt_processor) {
2029 sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
2030 }
2031
2032 return (new_thread);
2033 }
2034 }
2035
2036 rt_lock_unlock(pset);
2037 }
2038
2039 processor->deadline = UINT64_MAX;
2040
2041 /* No RT threads, so let's look at the regular threads. */
2042 if ((new_thread = SCHED(choose_thread)(processor, MINPRI, *reason)) != THREAD_NULL) {
2043 sched_update_pset_load_average(pset);
2044 processor_state_update_from_thread(processor, new_thread);
2045 pset_unlock(pset);
2046 return (new_thread);
2047 }
2048
2049#if __SMP__
2050 if (SCHED(steal_thread_enabled)) {
2051 /*
2052 * No runnable threads, attempt to steal
2053 * from other processors. Returns with pset lock dropped.
2054 */
2055
2056 if ((new_thread = SCHED(steal_thread)(pset)) != THREAD_NULL) {
2057 return (new_thread);
2058 }
2059
2060 /*
2061 * If other threads have appeared, shortcut
2062 * around again.
2063 */
2064 if (!SCHED(processor_queue_empty)(processor) || rt_runq_count(pset) > 0)
2065 continue;
2066
2067 pset_lock(pset);
2068 }
2069#endif
2070
2071 idle:
2072 /*
2073 * Nothing is runnable, so set this processor idle if it
2074 * was running.
2075 */
2076 if (processor->state == PROCESSOR_RUNNING) {
2077 pset_update_processor_state(pset, processor, PROCESSOR_IDLE);
2078 }
2079
2080#if __SMP__
2081 /* Invoked with pset locked, returns with pset unlocked */
2082 SCHED(processor_balance)(processor, pset);
2083#else
2084 pset_unlock(pset);
2085#endif
2086
2087#if CONFIG_SCHED_IDLE_IN_PLACE
2088 /*
2089 * Choose idle thread if fast idle is not possible.
2090 */
2091 if (processor->processor_primary != processor)
2092 return (processor->idle_thread);
2093
2094 if ((thread->state & (TH_IDLE|TH_TERMINATE|TH_SUSP)) || !(thread->state & TH_WAIT) || thread->wake_active || thread->sched_pri >= BASEPRI_RTQUEUES)
2095 return (processor->idle_thread);
2096
2097 /*
2098 * Perform idling activities directly without a
2099 * context switch. Return dispatched thread,
2100 * else check again for a runnable thread.
2101 */
2102 new_thread = thread_select_idle(thread, processor);
2103
2104#else /* !CONFIG_SCHED_IDLE_IN_PLACE */
2105
2106 /*
2107 * Do a full context switch to idle so that the current
2108 * thread can start running on another processor without
2109 * waiting for the fast-idled processor to wake up.
2110 */
2111 new_thread = processor->idle_thread;
2112
2113#endif /* !CONFIG_SCHED_IDLE_IN_PLACE */
2114
2115 } while (new_thread == THREAD_NULL);
2116
2117 return (new_thread);
2118}
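
/*
 * Informal summary of thread_select()'s search order, derived from the code
 * above rather than from a separate specification:
 *   1. Keep the current thread if it is still runnable here, is not being
 *      rebalanced, bound, or steered elsewhere, and nothing better is waiting.
 *   2. Otherwise take the earliest-deadline eligible thread from the
 *      realtime run queue.
 *   3. Otherwise take the highest-priority thread from the regular run queue.
 *   4. Otherwise attempt to steal a thread from another processor (SMP only).
 *   5. Otherwise select the idle thread (or idle in place, if configured).
 */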
2119
2120#if CONFIG_SCHED_IDLE_IN_PLACE
2121/*
2122 * thread_select_idle:
2123 *
2124 * Idle the processor using the current thread context.
2125 *
2126 * Called with thread locked, then dropped and relocked.
2127 */
2128static thread_t
2129thread_select_idle(
2130 thread_t thread,
2131 processor_t processor)
2132{
2133 thread_t new_thread;
2134 uint64_t arg1, arg2;
2135 int urgency;
2136
2137 sched_run_decr(thread);
2138
2139 thread->state |= TH_IDLE;
	processor_state_update_idle(processor);
2141
2142 /* Reload precise timing global policy to thread-local policy */
2143 thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
2144
2145 thread_unlock(thread);
2146
2147 /*
2148 * Switch execution timing to processor idle thread.
2149 */
2150 processor->last_dispatch = mach_absolute_time();
2151
2152#ifdef CONFIG_MACH_APPROXIMATE_TIME
2153 commpage_update_mach_approximate_time(processor->last_dispatch);
2154#endif
2155
2156 thread->last_run_time = processor->last_dispatch;
2157 processor_timer_switch_thread(processor->last_dispatch,
2158 &processor->idle_thread->system_timer);
2159 PROCESSOR_DATA(processor, kernel_timer) = &processor->idle_thread->system_timer;
2160
2161
2162 /*
2163 * Cancel the quantum timer while idling.
2164 */
2165 timer_call_quantum_timer_cancel(&processor->quantum_timer);
2166 processor->first_timeslice = FALSE;
2167
2168 if (thread->sched_call) {
2169 (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
2170 }
2171
2172 thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, NULL);
2173
2174 /*
2175 * Enable interrupts and perform idling activities. No
2176 * preemption due to TH_IDLE being set.
2177 */
2178 spllo(); new_thread = processor_idle(thread, processor);
2179
2180 /*
2181 * Return at splsched.
2182 */
2183 if (thread->sched_call) {
2184 (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
2185 }
2186
2187 thread_lock(thread);
2188
2189 /*
2190 * If awakened, switch to thread timer and start a new quantum.
2191 * Otherwise skip; we will context switch to another thread or return here.
2192 */
2193 if (!(thread->state & TH_WAIT)) {
2194 uint64_t time_now = processor->last_dispatch = mach_absolute_time();
2195 processor_timer_switch_thread(time_now, &thread->system_timer);
2196 timer_update(&thread->runnable_timer, time_now);
2197 PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
2198 thread_quantum_init(thread);
2199 processor->quantum_end = time_now + thread->quantum_remaining;
2200 timer_call_quantum_timer_enter(&processor->quantum_timer,
2201 thread, processor->quantum_end, time_now);
2202 processor->first_timeslice = TRUE;
2203
2204 thread->computation_epoch = time_now;
2205 }
2206
2207 thread->state &= ~TH_IDLE;
2208
2209 urgency = thread_get_urgency(thread, &arg1, &arg2);
2210
2211 thread_tell_urgency(urgency, arg1, arg2, 0, new_thread);
2212
2213 sched_run_incr(thread);
2214
2215 return (new_thread);
2216}
2217#endif /* CONFIG_SCHED_IDLE_IN_PLACE */
2218
2219/*
2220 * thread_invoke
2221 *
2222 * Called at splsched with neither thread locked.
2223 *
2224 * Perform a context switch and start executing the new thread.
2225 *
2226 * Returns FALSE when the context switch didn't happen.
2227 * The reference to the new thread is still consumed.
2228 *
2229 * "self" is what is currently running on the processor,
2230 * "thread" is the new thread to context switch to
2231 * (which may be the same thread in some cases)
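 *
 * Informally, there are two switch paths below: a stack handoff, taken when
 * the outgoing thread is blocking with a continuation and the incoming thread
 * has no kernel stack of its own, and a full register context switch via
 * machine_switch_context() otherwise.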
2232 */
2233static boolean_t
2234thread_invoke(
2235 thread_t self,
2236 thread_t thread,
2237 ast_t reason)
2238{
2239 if (__improbable(get_preemption_level() != 0)) {
2240 int pl = get_preemption_level();
2241 panic("thread_invoke: preemption_level %d, possible cause: %s",
2242 pl, (pl < 0 ? "unlocking an unlocked mutex or spinlock" :
2243 "blocking while holding a spinlock, or within interrupt context"));
2244 }
2245
2246 thread_continue_t continuation = self->continuation;
2247 void *parameter = self->parameter;
2248 processor_t processor;
2249
2250 uint64_t ctime = mach_absolute_time();
2251
2252#ifdef CONFIG_MACH_APPROXIMATE_TIME
2253 commpage_update_mach_approximate_time(ctime);
2254#endif
2255
2256#if defined(CONFIG_SCHED_TIMESHARE_CORE)
2257 if ((thread->state & TH_IDLE) == 0)
2258 sched_timeshare_consider_maintenance(ctime);
2259#endif
2260
2261#if MONOTONIC
2262 mt_sched_update(self);
2263#endif /* MONOTONIC */
2264
2265 assert_thread_magic(self);
2266 assert(self == current_thread());
2267 assert(self->runq == PROCESSOR_NULL);
2268 assert((self->state & (TH_RUN|TH_TERMINATE2)) == TH_RUN);
2269
2270 thread_lock(thread);
2271
2272 assert_thread_magic(thread);
2273 assert((thread->state & (TH_RUN|TH_WAIT|TH_UNINT|TH_TERMINATE|TH_TERMINATE2)) == TH_RUN);
2274 assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == current_processor());
2275 assert(thread->runq == PROCESSOR_NULL);
2276
2277 /* Reload precise timing global policy to thread-local policy */
2278 thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
2279
2280 /* Update SFI class based on other factors */
2281 thread->sfi_class = sfi_thread_classify(thread);
2282
2283 /* Update the same_pri_latency for the thread (used by perfcontrol callouts) */
2284 thread->same_pri_latency = ctime - thread->last_basepri_change_time;
2285 /*
2286 * In case a base_pri update happened between the timestamp and
2287 * taking the thread lock
2288 */
2289 if (ctime <= thread->last_basepri_change_time)
2290 thread->same_pri_latency = ctime - thread->last_made_runnable_time;
2291
2292 /* Allow realtime threads to hang onto a stack. */
2293 if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack)
2294 self->reserved_stack = self->kernel_stack;
2295
2296 /* Prepare for spin debugging */
2297#if INTERRUPT_MASKED_DEBUG
2298 ml_spin_debug_clear(thread);
2299#endif
2300
2301 if (continuation != NULL) {
2302 if (!thread->kernel_stack) {
2303 /*
2304 * If we are using a privileged stack,
2305 * check to see whether we can exchange it with
2306 * that of the other thread.
2307 */
2308 if (self->kernel_stack == self->reserved_stack && !thread->reserved_stack)
2309 goto need_stack;
2310
2311 /*
2312 * Context switch by performing a stack handoff.
2313 */
2314 continuation = thread->continuation;
2315 parameter = thread->parameter;
2316
2317 processor = current_processor();
2318 processor->active_thread = thread;
2319 processor_state_update_from_thread(processor, thread);
2320
2321 if (thread->last_processor != processor && thread->last_processor != NULL) {
2322 if (thread->last_processor->processor_set != processor->processor_set)
2323 thread->ps_switch++;
2324 thread->p_switch++;
2325 }
2326 thread->last_processor = processor;
2327 thread->c_switch++;
2328 ast_context(thread);
2329
2330 thread_unlock(thread);
2331
2332 self->reason = reason;
2333
2334 processor->last_dispatch = ctime;
2335 self->last_run_time = ctime;
2336 processor_timer_switch_thread(ctime, &thread->system_timer);
2337 timer_update(&thread->runnable_timer, ctime);
2338 PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
2339
2340 /*
2341 * Since non-precise user/kernel time doesn't update the state timer
2342 * during privilege transitions, synthesize an event now.
2343 */
2344 if (!thread->precise_user_kernel_time) {
2345 timer_update(PROCESSOR_DATA(processor, current_state), ctime);
2346 }
2347
2348 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2349 MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF)|DBG_FUNC_NONE,
2350 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2351
2352 if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
2353 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)|DBG_FUNC_NONE,
2354 (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
2355 }
2356
2357 DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, thread->task->bsd_info);
2358
2359 SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
2360
2361#if KPERF
2362 kperf_off_cpu(self);
2363#endif /* KPERF */
2364
2365 TLOG(1, "thread_invoke: calling stack_handoff\n");
2366 stack_handoff(self, thread);
2367
2368 /* 'self' is now off core */
2369 assert(thread == current_thread_volatile());
2370
2371 DTRACE_SCHED(on__cpu);
2372
2373#if KPERF
2374 kperf_on_cpu(thread, continuation, NULL);
2375#endif /* KPERF */
2376
2377 thread_dispatch(self, thread);
2378
2379#if KASAN
2380 /* Old thread's stack has been moved to the new thread, so explicitly
2381 * unpoison it. */
2382 kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
2383#endif
2384
2385 thread->continuation = thread->parameter = NULL;
2386
2387 counter(c_thread_invoke_hits++);
2388
2389 assert(continuation);
2390 call_continuation(continuation, parameter, thread->wait_result, TRUE);
2391 /*NOTREACHED*/
2392 }
2393 else if (thread == self) {
2394 /* same thread but with continuation */
2395 ast_context(self);
2396 counter(++c_thread_invoke_same);
2397
2398 thread_unlock(self);
2399
2400#if KPERF
2401 kperf_on_cpu(thread, continuation, NULL);
2402#endif /* KPERF */
2403
2404 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2405 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2406 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2407
2408#if KASAN
2409 /* stack handoff to self - no thread_dispatch(), so clear the stack
2410 * and free the fakestack directly */
2411 kasan_fakestack_drop(self);
2412 kasan_fakestack_gc(self);
2413 kasan_unpoison_stack(self->kernel_stack, kernel_stack_size);
2414#endif
2415
2416 self->continuation = self->parameter = NULL;
2417
2418 call_continuation(continuation, parameter, self->wait_result, TRUE);
2419 /*NOTREACHED*/
2420 }
2421 } else {
2422 /*
2423 * Check that the other thread has a stack
2424 */
2425 if (!thread->kernel_stack) {
2426need_stack:
2427 if (!stack_alloc_try(thread)) {
2428 counter(c_thread_invoke_misses++);
2429 thread_unlock(thread);
2430 thread_stack_enqueue(thread);
2431 return (FALSE);
2432 }
2433 } else if (thread == self) {
2434 ast_context(self);
2435 counter(++c_thread_invoke_same);
2436 thread_unlock(self);
2437
2438 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2439 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2440 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2441
2442 return (TRUE);
2443 }
2444 }
2445
2446 /*
2447 * Context switch by full context save.
2448 */
2449 processor = current_processor();
2450 processor->active_thread = thread;
2451 processor_state_update_from_thread(processor, thread);
2452
2453 if (thread->last_processor != processor && thread->last_processor != NULL) {
2454 if (thread->last_processor->processor_set != processor->processor_set)
2455 thread->ps_switch++;
2456 thread->p_switch++;
2457 }
2458 thread->last_processor = processor;
2459 thread->c_switch++;
2460 ast_context(thread);
2461
2462 thread_unlock(thread);
2463
2464 counter(c_thread_invoke_csw++);
2465
2466 self->reason = reason;
2467
2468 processor->last_dispatch = ctime;
2469 self->last_run_time = ctime;
2470 processor_timer_switch_thread(ctime, &thread->system_timer);
2471 timer_update(&thread->runnable_timer, ctime);
2472 PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
2473
2474 /*
2475 * Since non-precise user/kernel time doesn't update the state timer
2476 * during privilege transitions, synthesize an event now.
2477 */
2478 if (!thread->precise_user_kernel_time) {
2479 timer_update(PROCESSOR_DATA(processor, current_state), ctime);
2480 }
2481
2482 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2483 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2484 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2485
2486 if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) {
2487 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)|DBG_FUNC_NONE,
2488 (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
2489 }
2490
2491 DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, thread->task->bsd_info);
2492
2493 SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
2494
2495#if KPERF
2496 kperf_off_cpu(self);
2497#endif /* KPERF */
2498
2499 /*
2500 * This is where we actually switch register context,
2501 * and address space if required. We will next run
2502 * as a result of a subsequent context switch.
2503 *
2504 * Once registers are switched and the processor is running "thread",
2505 * the stack variables and non-volatile registers will contain whatever
2506 * was there the last time that thread blocked. No local variables should
2507 * be used after this point, except for the special case of "thread", which
2508 * the platform layer returns as the previous thread running on the processor
	 * via the function call ABI as a return register, and "self", which may have
	 * been stored on the stack or in a non-volatile register; its saved value,
	 * stale while this thread was off core, is accurate again because that
	 * thread is once more running on the CPU.
2513 */
2514 assert(continuation == self->continuation);
2515 thread = machine_switch_context(self, continuation, thread);
2516 assert(self == current_thread_volatile());
2517 TLOG(1,"thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);
2518
2519 DTRACE_SCHED(on__cpu);
2520
2521#if KPERF
2522 kperf_on_cpu(self, NULL, __builtin_frame_address(0));
2523#endif /* KPERF */
2524
2525 /*
2526 * We have been resumed and are set to run.
2527 */
2528 thread_dispatch(thread, self);
2529
2530 if (continuation) {
2531 self->continuation = self->parameter = NULL;
2532
2533 call_continuation(continuation, parameter, self->wait_result, TRUE);
2534 /*NOTREACHED*/
2535 }
2536
2537 return (TRUE);
2538}
2539
2540#if defined(CONFIG_SCHED_DEFERRED_AST)
2541/*
2542 * pset_cancel_deferred_dispatch:
2543 *
2544 * Cancels all ASTs that we can cancel for the given processor set
2545 * if the current processor is running the last runnable thread in the
2546 * system.
2547 *
2548 * This function assumes the current thread is runnable. This must
2549 * be called with the pset unlocked.
2550 */
2551static void
2552pset_cancel_deferred_dispatch(
2553 processor_set_t pset,
2554 processor_t processor)
2555{
2556 processor_t active_processor = NULL;
2557 uint32_t sampled_sched_run_count;
2558
2559 pset_lock(pset);
2560 sampled_sched_run_count = (volatile uint32_t) sched_run_buckets[TH_BUCKET_RUN];
2561
2562 /*
2563 * If we have emptied the run queue, and our current thread is runnable, we
2564 * should tell any processors that are still DISPATCHING that they will
2565 * probably not have any work to do. In the event that there are no
2566 * pending signals that we can cancel, this is also uninteresting.
2567 *
2568 * In the unlikely event that another thread becomes runnable while we are
2569 * doing this (sched_run_count is atomically updated, not guarded), the
2570 * codepath making it runnable SHOULD (a dangerous word) need the pset lock
2571 * in order to dispatch it to a processor in our pset. So, the other
2572 * codepath will wait while we squash all cancelable ASTs, get the pset
2573 * lock, and then dispatch the freshly runnable thread. So this should be
2574 * correct (we won't accidentally have a runnable thread that hasn't been
2575 * dispatched to an idle processor), if not ideal (we may be restarting the
2576 * dispatch process, which could have some overhead).
2577 */
2578
2579 if ((sampled_sched_run_count == 1) && (pset->pending_deferred_AST_cpu_mask)) {
2580 uint64_t dispatching_map = (pset->cpu_state_map[PROCESSOR_DISPATCHING] &
2581 pset->pending_deferred_AST_cpu_mask &
2582 ~pset->pending_AST_cpu_mask);
2583 for (int cpuid = lsb_first(dispatching_map); cpuid >= 0; cpuid = lsb_next(dispatching_map, cpuid)) {
2584 active_processor = processor_array[cpuid];
2585 /*
2586 * If a processor is DISPATCHING, it could be because of
2587 * a cancelable signal.
2588 *
2589 * IF the processor is not our
2590 * current processor (the current processor should not
2591 * be DISPATCHING, so this is a bit paranoid), AND there
2592 * is a cancelable signal pending on the processor, AND
2593 * there is no non-cancelable signal pending (as there is
2594 * no point trying to backtrack on bringing the processor
2595 * up if a signal we cannot cancel is outstanding), THEN
2596 * it should make sense to roll back the processor state
2597 * to the IDLE state.
2598 *
			 * If the racy nature of this approach (as the signal
2600 * will be arbitrated by hardware, and can fire as we
2601 * roll back state) results in the core responding
2602 * despite being pushed back to the IDLE state, it
2603 * should be no different than if the core took some
2604 * interrupt while IDLE.
2605 */
2606 if (active_processor != processor) {
2607 /*
2608 * Squash all of the processor state back to some
2609 * reasonable facsimile of PROCESSOR_IDLE.
2610 */
2611
2612 assert(active_processor->next_thread == THREAD_NULL);
2613 processor_state_update_idle(active_processor);
2614 active_processor->deadline = UINT64_MAX;
2615 pset_update_processor_state(pset, active_processor, PROCESSOR_IDLE);
2616 bit_clear(pset->pending_deferred_AST_cpu_mask, active_processor->cpu_id);
2617 machine_signal_idle_cancel(active_processor);
2618 }
2619
2620 }
2621 }
2622
2623 pset_unlock(pset);
2624}
2625#else
2626/* We don't support deferred ASTs; everything is candycanes and sunshine. */
2627#endif
2628
2629static void
2630thread_csw_callout(
2631 thread_t old,
2632 thread_t new,
2633 uint64_t timestamp)
2634{
2635 perfcontrol_event event = (new->state & TH_IDLE) ? IDLE : CONTEXT_SWITCH;
2636 uint64_t same_pri_latency = (new->state & TH_IDLE) ? 0 : new->same_pri_latency;
2637 machine_switch_perfcontrol_context(event, timestamp, 0,
2638 same_pri_latency, old, new);
2639}
2640
2641
2642/*
2643 * thread_dispatch:
2644 *
2645 * Handle threads at context switch. Re-dispatch other thread
2646 * if still running, otherwise update run state and perform
2647 * special actions. Update quantum for other thread and begin
2648 * the quantum for ourselves.
2649 *
2650 * "thread" is the old thread that we have switched away from.
2651 * "self" is the new current thread that we have context switched to
2652 *
2653 * Called at splsched.
2654 */
2655void
2656thread_dispatch(
2657 thread_t thread,
2658 thread_t self)
2659{
2660 processor_t processor = self->last_processor;
2661
2662 assert(processor == current_processor());
2663 assert(self == current_thread_volatile());
2664 assert(thread != self);
2665
2666 if (thread != THREAD_NULL) {
2667 /*
2668 * Do the perfcontrol callout for context switch.
2669 * The reason we do this here is:
2670 * - thread_dispatch() is called from various places that are not
2671 * the direct context switch path for eg. processor shutdown etc.
2672 * So adding the callout here covers all those cases.
2673 * - We want this callout as early as possible to be close
2674 * to the timestamp taken in thread_invoke()
2675 * - We want to avoid holding the thread lock while doing the
2676 * callout
2677 * - We do not want to callout if "thread" is NULL.
2678 */
2679 thread_csw_callout(thread, self, processor->last_dispatch);
2680
2681#if KASAN
2682 if (thread->continuation != NULL) {
2683 /*
2684 * Thread has a continuation and the normal stack is going away.
2685 * Unpoison the stack and mark all fakestack objects as unused.
2686 */
2687 kasan_fakestack_drop(thread);
2688 if (thread->kernel_stack) {
2689 kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
2690 }
2691 }
2692
2693 /*
2694 * Free all unused fakestack objects.
2695 */
2696 kasan_fakestack_gc(thread);
2697#endif
2698
2699 /*
2700 * If blocked at a continuation, discard
2701 * the stack.
2702 */
2703 if (thread->continuation != NULL && thread->kernel_stack != 0)
2704 stack_free(thread);
2705
2706 if (thread->state & TH_IDLE) {
2707 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2708 MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE,
2709 (uintptr_t)thread_tid(thread), 0, thread->state,
2710 sched_run_buckets[TH_BUCKET_RUN], 0);
2711 } else {
2712 int64_t consumed;
2713 int64_t remainder = 0;
2714
2715 if (processor->quantum_end > processor->last_dispatch)
2716 remainder = processor->quantum_end -
2717 processor->last_dispatch;
2718
2719 consumed = thread->quantum_remaining - remainder;
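			/*
			 * Worked example with hypothetical numbers: if this thread came
			 * on core with a 10ms quantum (quantum_remaining) and is being
			 * switched off 4ms before quantum_end, then remainder is 4ms and
			 * consumed is 6ms, which is the CPU time billed to the ledgers
			 * below.
			 */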
2720
2721 if ((thread->reason & AST_LEDGER) == 0) {
2722 /*
2723 * Bill CPU time to both the task and
2724 * the individual thread.
2725 */
2726 ledger_credit_thread(thread, thread->t_ledger,
2727 task_ledgers.cpu_time, consumed);
2728 ledger_credit_thread(thread, thread->t_threadledger,
2729 thread_ledgers.cpu_time, consumed);
2730 if (thread->t_bankledger) {
2731 ledger_credit_thread(thread, thread->t_bankledger,
2732 bank_ledgers.cpu_time,
2733 (consumed - thread->t_deduct_bank_ledger_time));
2734 }
2735 thread->t_deduct_bank_ledger_time = 0;
2736 }
2737
2738 wake_lock(thread);
2739 thread_lock(thread);
2740
2741 /*
			 * Apply a priority floor if the thread holds a kernel resource.
2743 * Do this before checking starting_pri to avoid overpenalizing
2744 * repeated rwlock blockers.
2745 */
2746 if (__improbable(thread->rwlock_count != 0))
2747 lck_rw_set_promotion_locked(thread);
2748
2749 boolean_t keep_quantum = processor->first_timeslice;
2750
2751 /*
2752 * Treat a thread which has dropped priority since it got on core
2753 * as having expired its quantum.
2754 */
2755 if (processor->starting_pri > thread->sched_pri)
2756 keep_quantum = FALSE;
2757
2758 /* Compute remainder of current quantum. */
2759 if (keep_quantum &&
2760 processor->quantum_end > processor->last_dispatch)
2761 thread->quantum_remaining = (uint32_t)remainder;
2762 else
2763 thread->quantum_remaining = 0;
2764
2765 if (thread->sched_mode == TH_MODE_REALTIME) {
2766 /*
2767 * Cancel the deadline if the thread has
2768 * consumed the entire quantum.
2769 */
2770 if (thread->quantum_remaining == 0) {
2771 thread->realtime.deadline = UINT64_MAX;
2772 }
2773 } else {
2774#if defined(CONFIG_SCHED_TIMESHARE_CORE)
2775 /*
2776 * For non-realtime threads treat a tiny
2777 * remaining quantum as an expired quantum
2778 * but include what's left next time.
2779 */
2780 if (thread->quantum_remaining < min_std_quantum) {
2781 thread->reason |= AST_QUANTUM;
2782 thread->quantum_remaining += SCHED(initial_quantum_size)(thread);
2783 }
2784#endif /* CONFIG_SCHED_TIMESHARE_CORE */
2785 }
2786
2787 /*
2788 * If we are doing a direct handoff then
2789 * take the remainder of the quantum.
2790 */
2791 if ((thread->reason & (AST_HANDOFF|AST_QUANTUM)) == AST_HANDOFF) {
2792 self->quantum_remaining = thread->quantum_remaining;
2793 thread->reason |= AST_QUANTUM;
2794 thread->quantum_remaining = 0;
2795 } else {
2796#if defined(CONFIG_SCHED_MULTIQ)
2797 if (SCHED(sched_groups_enabled) &&
2798 thread->sched_group == self->sched_group) {
2799 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2800 MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUANTUM_HANDOFF),
2801 self->reason, (uintptr_t)thread_tid(thread),
2802 self->quantum_remaining, thread->quantum_remaining, 0);
2803
2804 self->quantum_remaining = thread->quantum_remaining;
2805 thread->quantum_remaining = 0;
2806 /* Don't set AST_QUANTUM here - old thread might still want to preempt someone else */
2807 }
2808#endif /* defined(CONFIG_SCHED_MULTIQ) */
2809 }
2810
2811 thread->computation_metered += (processor->last_dispatch - thread->computation_epoch);
2812
2813 if (!(thread->state & TH_WAIT)) {
2814 /*
2815 * Still runnable.
2816 */
2817 thread->last_made_runnable_time = thread->last_basepri_change_time = processor->last_dispatch;
2818
2819 machine_thread_going_off_core(thread, FALSE, processor->last_dispatch);
2820
2821 ast_t reason = thread->reason;
2822 sched_options_t options = SCHED_NONE;
2823
2824 if (reason & AST_REBALANCE) {
2825 options |= SCHED_REBALANCE;
2826 if (reason & AST_QUANTUM) {
2827 /*
2828 * Having gone to the trouble of forcing this thread off a less preferred core,
2829 * we should force the preferable core to reschedule immediately to give this
2830 * thread a chance to run instead of just sitting on the run queue where
2831 * it may just be stolen back by the idle core we just forced it off.
2832 * But only do this at the end of a quantum to prevent cascading effects.
2833 */
2834 options |= SCHED_PREEMPT;
2835 }
2836 }
2837
2838 if (reason & AST_QUANTUM)
2839 options |= SCHED_TAILQ;
2840 else if (reason & AST_PREEMPT)
2841 options |= SCHED_HEADQ;
2842 else
2843 options |= (SCHED_PREEMPT | SCHED_TAILQ);
2844
2845 thread_setrun(thread, options);
2846
2847 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2848 MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE,
2849 (uintptr_t)thread_tid(thread), thread->reason, thread->state,
2850 sched_run_buckets[TH_BUCKET_RUN], 0);
2851
2852 if (thread->wake_active) {
2853 thread->wake_active = FALSE;
2854 thread_unlock(thread);
2855
2856 thread_wakeup(&thread->wake_active);
2857 } else {
2858 thread_unlock(thread);
2859 }
2860
2861 wake_unlock(thread);
2862 } else {
2863 /*
2864 * Waiting.
2865 */
2866 boolean_t should_terminate = FALSE;
2867 uint32_t new_run_count;
2868 int thread_state = thread->state;
2869
2870 /* Only the first call to thread_dispatch
2871 * after explicit termination should add
2872 * the thread to the termination queue
2873 */
2874 if ((thread_state & (TH_TERMINATE|TH_TERMINATE2)) == TH_TERMINATE) {
2875 should_terminate = TRUE;
2876 thread_state |= TH_TERMINATE2;
2877 }
2878
2879 timer_stop(&thread->runnable_timer, processor->last_dispatch);
2880
2881 thread_state &= ~TH_RUN;
2882 thread->state = thread_state;
2883
2884 thread->last_made_runnable_time = thread->last_basepri_change_time = THREAD_NOT_RUNNABLE;
2885 thread->chosen_processor = PROCESSOR_NULL;
2886
2887 new_run_count = sched_run_decr(thread);
2888
2889#if CONFIG_SCHED_SFI
2890 if (thread->reason & AST_SFI) {
2891 thread->wait_sfi_begin_time = processor->last_dispatch;
2892 }
2893#endif
2894
2895 machine_thread_going_off_core(thread, should_terminate, processor->last_dispatch);
2896
2897 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2898 MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE,
2899 (uintptr_t)thread_tid(thread), thread->reason, thread_state,
2900 new_run_count, 0);
2901
2902 if (thread_state & TH_WAIT_REPORT) {
2903 (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
2904 }
2905
2906 if (thread->wake_active) {
2907 thread->wake_active = FALSE;
2908 thread_unlock(thread);
2909
2910 thread_wakeup(&thread->wake_active);
2911 } else {
2912 thread_unlock(thread);
2913 }
2914
2915 wake_unlock(thread);
2916
2917 if (should_terminate)
2918 thread_terminate_enqueue(thread);
2919 }
2920 }
2921 }
2922
2923 int urgency = THREAD_URGENCY_NONE;
2924 uint64_t latency = 0;
2925
2926 /* Update (new) current thread and reprogram quantum timer */
2927 thread_lock(self);
2928
2929 if (!(self->state & TH_IDLE)) {
2930 uint64_t arg1, arg2;
2931
2932#if CONFIG_SCHED_SFI
2933 ast_t new_ast;
2934
2935 new_ast = sfi_thread_needs_ast(self, NULL);
2936
2937 if (new_ast != AST_NONE) {
2938 ast_on(new_ast);
2939 }
2940#endif
2941
2942 assertf(processor->last_dispatch >= self->last_made_runnable_time,
2943 "Non-monotonic time? dispatch at 0x%llx, runnable at 0x%llx",
2944 processor->last_dispatch, self->last_made_runnable_time);
2945
2946 assert(self->last_made_runnable_time <= self->last_basepri_change_time);
2947
2948 latency = processor->last_dispatch - self->last_made_runnable_time;
2949 assert(latency >= self->same_pri_latency);
2950
2951 urgency = thread_get_urgency(self, &arg1, &arg2);
2952
2953 thread_tell_urgency(urgency, arg1, arg2, latency, self);
2954
2955 /*
2956 * Get a new quantum if none remaining.
2957 */
2958 if (self->quantum_remaining == 0) {
2959 thread_quantum_init(self);
2960 }
2961
2962 /*
2963 * Set up quantum timer and timeslice.
2964 */
2965 processor->quantum_end = processor->last_dispatch + self->quantum_remaining;
2966 timer_call_quantum_timer_enter(&processor->quantum_timer, self,
2967 processor->quantum_end, processor->last_dispatch);
2968
2969 processor->first_timeslice = TRUE;
2970 } else {
2971 timer_call_quantum_timer_cancel(&processor->quantum_timer);
2972 processor->first_timeslice = FALSE;
2973
2974 thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, self);
2975 }
2976
2977 assert(self->block_hint == kThreadWaitNone);
2978 self->computation_epoch = processor->last_dispatch;
2979 self->reason = AST_NONE;
2980 processor->starting_pri = self->sched_pri;
2981
2982 thread_unlock(self);
2983
2984 machine_thread_going_on_core(self, urgency, latency, self->same_pri_latency,
2985 processor->last_dispatch);
2986
2987#if defined(CONFIG_SCHED_DEFERRED_AST)
2988 /*
2989 * TODO: Can we state that redispatching our old thread is also
2990 * uninteresting?
2991 */
2992 if ((((volatile uint32_t)sched_run_buckets[TH_BUCKET_RUN]) == 1) &&
2993 !(self->state & TH_IDLE)) {
2994 pset_cancel_deferred_dispatch(processor->processor_set, processor);
2995 }
2996#endif
2997}
2998
2999/*
3000 * thread_block_reason:
3001 *
3002 * Forces a reschedule, blocking the caller if a wait
3003 * has been asserted.
3004 *
3005 * If a continuation is specified, then thread_invoke will
3006 * attempt to discard the thread's kernel stack. When the
3007 * thread resumes, it will execute the continuation function
3008 * on a new kernel stack.
3009 */
3010counter(mach_counter_t c_thread_block_calls = 0;)
3011
3012wait_result_t
3013thread_block_reason(
3014 thread_continue_t continuation,
3015 void *parameter,
3016 ast_t reason)
3017{
3018 thread_t self = current_thread();
3019 processor_t processor;
3020 thread_t new_thread;
3021 spl_t s;
3022
3023 counter(++c_thread_block_calls);
3024
3025 s = splsched();
3026
3027 processor = current_processor();
3028
3029 /* If we're explicitly yielding, force a subsequent quantum */
3030 if (reason & AST_YIELD)
3031 processor->first_timeslice = FALSE;
3032
	/* We're handling all scheduling ASTs */
3034 ast_off(AST_SCHEDULING);
3035
3036#if PROC_REF_DEBUG
3037 if ((continuation != NULL) && (self->task != kernel_task)) {
3038 if (uthread_get_proc_refcount(self->uthread) != 0) {
3039 panic("thread_block_reason with continuation uthread %p with uu_proc_refcount != 0", self->uthread);
3040 }
3041 }
3042#endif
3043
3044 self->continuation = continuation;
3045 self->parameter = parameter;
3046
3047 if (self->state & ~(TH_RUN | TH_IDLE)) {
3048 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3049 MACHDBG_CODE(DBG_MACH_SCHED,MACH_BLOCK),
3050 reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0);
3051 }
3052
3053 do {
3054 thread_lock(self);
3055 new_thread = thread_select(self, processor, &reason);
3056 thread_unlock(self);
3057 } while (!thread_invoke(self, new_thread, reason));
3058
3059 splx(s);
3060
3061 return (self->wait_result);
3062}
3063
3064/*
3065 * thread_block:
3066 *
3067 * Block the current thread if a wait has been asserted.
3068 */
3069wait_result_t
3070thread_block(
3071 thread_continue_t continuation)
3072{
3073 return thread_block_reason(continuation, NULL, AST_NONE);
3074}
3075
3076wait_result_t
3077thread_block_parameter(
3078 thread_continue_t continuation,
3079 void *parameter)
3080{
3081 return thread_block_reason(continuation, parameter, AST_NONE);
3082}
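
/*
 * Typical usage (an informal sketch, not a new interface): a blocking wait is
 * normally expressed as an assert_wait() on an event, followed by dropping any
 * locks the wakeup path needs and calling thread_block(); "object" below is a
 * placeholder wait event owned by the caller.
 *
 *	assert_wait((event_t)&object, THREAD_UNINT);
 *	...unlock...
 *	wait_result_t wr = thread_block(THREAD_CONTINUE_NULL);
 */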
3083
3084/*
3085 * thread_run:
3086 *
3087 * Switch directly from the current thread to the
3088 * new thread, handing off our quantum if appropriate.
3089 *
3090 * New thread must be runnable, and not on a run queue.
3091 *
3092 * Called at splsched.
3093 */
3094int
3095thread_run(
3096 thread_t self,
3097 thread_continue_t continuation,
3098 void *parameter,
3099 thread_t new_thread)
3100{
3101 ast_t reason = AST_HANDOFF;
3102
3103 self->continuation = continuation;
3104 self->parameter = parameter;
3105
3106 while (!thread_invoke(self, new_thread, reason)) {
3107 /* the handoff failed, so we have to fall back to the normal block path */
3108 processor_t processor = current_processor();
3109
3110 reason = AST_NONE;
3111
3112 thread_lock(self);
3113 new_thread = thread_select(self, processor, &reason);
3114 thread_unlock(self);
3115 }
3116
3117 return (self->wait_result);
3118}
3119
3120/*
3121 * thread_continue:
3122 *
3123 * Called at splsched when a thread first receives
3124 * a new stack after a continuation.
3125 */
3126void
3127thread_continue(
3128 thread_t thread)
3129{
3130 thread_t self = current_thread();
3131 thread_continue_t continuation;
3132 void *parameter;
3133
3134 DTRACE_SCHED(on__cpu);
3135
3136 continuation = self->continuation;
3137 parameter = self->parameter;
3138
3139#if KPERF
3140 kperf_on_cpu(self, continuation, NULL);
3141#endif
3142
3143 thread_dispatch(thread, self);
3144
3145 self->continuation = self->parameter = NULL;
3146
3147#if INTERRUPT_MASKED_DEBUG
3148 /* Reset interrupt-masked spin debugging timeout */
3149 ml_spin_debug_clear(self);
3150#endif
3151
3152 TLOG(1, "thread_continue: calling call_continuation\n");
3153
3154 boolean_t enable_interrupts = thread != THREAD_NULL;
3155 call_continuation(continuation, parameter, self->wait_result, enable_interrupts);
3156 /*NOTREACHED*/
3157}
3158
3159void
3160thread_quantum_init(thread_t thread)
3161{
3162 if (thread->sched_mode == TH_MODE_REALTIME) {
3163 thread->quantum_remaining = thread->realtime.computation;
3164 } else {
3165 thread->quantum_remaining = SCHED(initial_quantum_size)(thread);
3166 }
3167}
3168
3169uint32_t
3170sched_timeshare_initial_quantum_size(thread_t thread)
3171{
3172 if ((thread != THREAD_NULL) && thread->th_sched_bucket == TH_BUCKET_SHARE_BG)
3173 return bg_quantum;
3174 else
3175 return std_quantum;
3176}
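
/*
 * Informal note: realtime threads receive their declared computation time as
 * their quantum (see thread_quantum_init() above), while timeshare threads
 * receive std_quantum, or the smaller bg_quantum for background threads; the
 * concrete values are platform- and configuration-dependent.
 */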
3177
3178/*
3179 * run_queue_init:
3180 *
3181 * Initialize a run queue before first use.
3182 */
3183void
3184run_queue_init(
3185 run_queue_t rq)
3186{
3187 rq->highq = NOPRI;
3188 for (u_int i = 0; i < BITMAP_LEN(NRQS); i++)
3189 rq->bitmap[i] = 0;
3190 rq->urgency = rq->count = 0;
3191 for (int i = 0; i < NRQS; i++)
3192 queue_init(&rq->queues[i]);
3193}
3194
3195/*
3196 * run_queue_dequeue:
3197 *
3198 * Perform a dequeue operation on a run queue,
3199 * and return the resulting thread.
3200 *
3201 * The run queue must be locked (see thread_run_queue_remove()
3202 * for more info), and not empty.
3203 */
3204thread_t
3205run_queue_dequeue(
3206 run_queue_t rq,
3207 integer_t options)
3208{
3209 thread_t thread;
3210 queue_t queue = &rq->queues[rq->highq];
3211
3212 if (options & SCHED_HEADQ) {
3213 thread = qe_dequeue_head(queue, struct thread, runq_links);
3214 } else {
3215 thread = qe_dequeue_tail(queue, struct thread, runq_links);
3216 }
3217
3218 assert(thread != THREAD_NULL);
3219 assert_thread_magic(thread);
3220
3221 thread->runq = PROCESSOR_NULL;
3222 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
3223 rq->count--;
3224 if (SCHED(priority_is_urgent)(rq->highq)) {
3225 rq->urgency--; assert(rq->urgency >= 0);
3226 }
3227 if (queue_empty(queue)) {
3228 bitmap_clear(rq->bitmap, rq->highq);
3229 rq->highq = bitmap_first(rq->bitmap, NRQS);
3230 }
3231
3232 return thread;
3233}
3234
3235/*
3236 * run_queue_enqueue:
3237 *
 * Perform an enqueue operation on a run queue.
3239 *
3240 * The run queue must be locked (see thread_run_queue_remove()
3241 * for more info).
3242 */
3243boolean_t
3244run_queue_enqueue(
3245 run_queue_t rq,
3246 thread_t thread,
3247 integer_t options)
3248{
3249 queue_t queue = &rq->queues[thread->sched_pri];
3250 boolean_t result = FALSE;
3251
3252 assert_thread_magic(thread);
3253
3254 if (queue_empty(queue)) {
3255 enqueue_tail(queue, &thread->runq_links);
3256
3257 rq_bitmap_set(rq->bitmap, thread->sched_pri);
3258 if (thread->sched_pri > rq->highq) {
3259 rq->highq = thread->sched_pri;
3260 result = TRUE;
3261 }
3262 } else {
3263 if (options & SCHED_TAILQ)
3264 enqueue_tail(queue, &thread->runq_links);
3265 else
3266 enqueue_head(queue, &thread->runq_links);
3267 }
3268 if (SCHED(priority_is_urgent)(thread->sched_pri))
3269 rq->urgency++;
3270 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
3271 rq->count++;
3272
3273 return (result);
3274}
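
/*
 * Note (informational): the TRUE return value above indicates that the
 * enqueued thread raised rq->highq, i.e. it is now the highest-priority entry
 * in this run queue, which callers can treat as a hint that a preemption
 * check may be warranted.
 */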
3275
3276/*
3277 * run_queue_remove:
3278 *
3279 * Remove a specific thread from a runqueue.
3280 *
3281 * The run queue must be locked.
3282 */
3283void
3284run_queue_remove(
3285 run_queue_t rq,
3286 thread_t thread)
3287{
3288 assert(thread->runq != PROCESSOR_NULL);
3289 assert_thread_magic(thread);
3290
3291 remqueue(&thread->runq_links);
3292 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
3293 rq->count--;
3294 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
3295 rq->urgency--; assert(rq->urgency >= 0);
3296 }
3297
3298 if (queue_empty(&rq->queues[thread->sched_pri])) {
3299 /* update run queue status */
3300 bitmap_clear(rq->bitmap, thread->sched_pri);
3301 rq->highq = bitmap_first(rq->bitmap, NRQS);
3302 }
3303
3304 thread->runq = PROCESSOR_NULL;
3305}
3306
3307/* Assumes RT lock is not held, and acquires splsched/rt_lock itself */
3308void
3309sched_rtglobal_runq_scan(sched_update_scan_context_t scan_context)
3310{
3311 spl_t s;
3312 thread_t thread;
3313
3314 processor_set_t pset = &pset0;
3315
3316 s = splsched();
3317 rt_lock_lock(pset);
3318
3319 qe_foreach_element_safe(thread, &pset->rt_runq.queue, runq_links) {
3320 if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
3321 scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
3322 }
3323 }
3324
3325 rt_lock_unlock(pset);
3326 splx(s);
3327}
3328
3329int64_t
3330sched_rtglobal_runq_count_sum(void)
3331{
3332 return pset0.rt_runq.runq_stats.count_sum;
3333}
3334
3335/*
3336 * realtime_queue_insert:
3337 *
3338 * Enqueue a thread for realtime execution.
3339 */
3340static boolean_t
3341realtime_queue_insert(processor_t processor, processor_set_t pset, thread_t thread)
3342{
3343 queue_t queue = &SCHED(rt_runq)(pset)->queue;
3344 uint64_t deadline = thread->realtime.deadline;
3345 boolean_t preempt = FALSE;
3346
3347 rt_lock_lock(pset);
3348
3349 if (queue_empty(queue)) {
3350 enqueue_tail(queue, &thread->runq_links);
3351 preempt = TRUE;
3352 } else {
3353 /* Insert into rt_runq in thread deadline order */
3354 queue_entry_t iter;
3355 qe_foreach(iter, queue) {
3356 thread_t iter_thread = qe_element(iter, struct thread, runq_links);
3357 assert_thread_magic(iter_thread);
3358
3359 if (deadline < iter_thread->realtime.deadline) {
3360 if (iter == queue_first(queue))
3361 preempt = TRUE;
3362 insque(&thread->runq_links, queue_prev(iter));
3363 break;
3364 } else if (iter == queue_last(queue)) {
3365 enqueue_tail(queue, &thread->runq_links);
3366 break;
3367 }
3368 }
3369 }
3370
3371 thread->runq = processor;
3372 SCHED_STATS_RUNQ_CHANGE(&SCHED(rt_runq)(pset)->runq_stats, rt_runq_count(pset));
3373 rt_runq_count_incr(pset);
3374
3375 rt_lock_unlock(pset);
3376
3377 return (preempt);
3378}
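
/*
 * Example with hypothetical deadlines: if the queue already holds threads
 * with deadlines 100, 200 and 300, inserting a thread with deadline 150
 * places it between 100 and 200 and returns FALSE; inserting one with
 * deadline 50 places it at the head and returns TRUE, signalling that the
 * new thread is now the earliest deadline on this pset.
 */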
3379
3380/*
3381 * realtime_setrun:
3382 *
3383 * Dispatch a thread for realtime execution.
3384 *
3385 * Thread must be locked. Associated pset must
3386 * be locked, and is returned unlocked.
3387 */
3388static void
3389realtime_setrun(
3390 processor_t processor,
3391 thread_t thread)
3392{
3393 processor_set_t pset = processor->processor_set;
3394 pset_assert_locked(pset);
3395 ast_t preempt;
3396
3397 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
3398
3399 thread->chosen_processor = processor;
3400
3401 /* <rdar://problem/15102234> */
3402 assert(thread->bound_processor == PROCESSOR_NULL);
3403
3404 /*
3405 * Dispatch directly onto idle processor.
3406 */
3407 if ( (thread->bound_processor == processor)
3408 && processor->state == PROCESSOR_IDLE) {
3409
3410 processor->next_thread = thread;
3411 processor_state_update_from_thread(processor, thread);
3412 processor->deadline = thread->realtime.deadline;
3413 pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
3414
3415 ipi_type = sched_ipi_action(processor, thread, true, SCHED_IPI_EVENT_BOUND_THR);
3416 pset_unlock(pset);
3417 sched_ipi_perform(processor, ipi_type);
3418 return;
3419 }
3420
3421 if (processor->current_pri < BASEPRI_RTQUEUES)
3422 preempt = (AST_PREEMPT | AST_URGENT);
3423 else if (thread->realtime.deadline < processor->deadline)
3424 preempt = (AST_PREEMPT | AST_URGENT);
3425 else
3426 preempt = AST_NONE;
3427
3428 realtime_queue_insert(processor, pset, thread);
3429
3430 ipi_type = SCHED_IPI_NONE;
3431 if (preempt != AST_NONE) {
3432 if (processor->state == PROCESSOR_IDLE) {
3433 processor->next_thread = THREAD_NULL;
3434 processor_state_update_from_thread(processor, thread);
3435 processor->deadline = thread->realtime.deadline;
3436 pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
3437 if (processor == current_processor()) {
3438 ast_on(preempt);
3439 } else {
3440 ipi_type = sched_ipi_action(processor, thread, true, SCHED_IPI_EVENT_PREEMPT);
3441 }
3442 } else if (processor->state == PROCESSOR_DISPATCHING) {
3443 if ((processor->next_thread == THREAD_NULL) && ((processor->current_pri < thread->sched_pri) || (processor->deadline > thread->realtime.deadline))) {
3444 processor_state_update_from_thread(processor, thread);
3445 processor->deadline = thread->realtime.deadline;
3446 }
3447 } else {
3448 if (processor == current_processor()) {
3449 ast_on(preempt);
3450 } else {
3451 ipi_type = sched_ipi_action(processor, thread, false, SCHED_IPI_EVENT_PREEMPT);
3452 }
3453 }
3454 } else {
3455 /* Selected processor was too busy, just keep thread enqueued and let other processors drain it naturally. */
3456 }
3457
3458 pset_unlock(pset);
3459 sched_ipi_perform(processor, ipi_type);
3460}
3461
3462
3463sched_ipi_type_t sched_ipi_deferred_policy(processor_set_t pset, processor_t dst,
3464 __unused sched_ipi_event_t event)
3465{
3466#if defined(CONFIG_SCHED_DEFERRED_AST)
3467 if (!bit_test(pset->pending_deferred_AST_cpu_mask, dst->cpu_id)) {
3468 return SCHED_IPI_DEFERRED;
3469 }
3470#else /* CONFIG_SCHED_DEFERRED_AST */
3471 panic("Request for deferred IPI on an unsupported platform; pset: %p CPU: %d", pset, dst->cpu_id);
3472#endif /* CONFIG_SCHED_DEFERRED_AST */
3473 return SCHED_IPI_NONE;
3474}
3475
3476sched_ipi_type_t sched_ipi_action(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
3477{
3478 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
3479 assert(dst != NULL);
3480
3481 processor_set_t pset = dst->processor_set;
3482 if (current_processor() == dst) {
3483 return SCHED_IPI_NONE;
3484 }
3485
3486 if (bit_test(pset->pending_AST_cpu_mask, dst->cpu_id)) {
3487 return SCHED_IPI_NONE;
3488 }
3489
3490 ipi_type = SCHED(ipi_policy)(dst, thread, dst_idle, event);
3491 switch(ipi_type) {
3492 case SCHED_IPI_NONE:
3493 return SCHED_IPI_NONE;
3494#if defined(CONFIG_SCHED_DEFERRED_AST)
3495 case SCHED_IPI_DEFERRED:
3496 bit_set(pset->pending_deferred_AST_cpu_mask, dst->cpu_id);
3497 break;
3498#endif /* CONFIG_SCHED_DEFERRED_AST */
3499 default:
3500 bit_set(pset->pending_AST_cpu_mask, dst->cpu_id);
3501 break;
3502 }
3503 return ipi_type;
3504}
3505
3506sched_ipi_type_t sched_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
3507{
3508 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
3509 boolean_t deferred_ipi_supported = false;
3510 processor_set_t pset = dst->processor_set;
3511
3512#if defined(CONFIG_SCHED_DEFERRED_AST)
3513 deferred_ipi_supported = true;
3514#endif /* CONFIG_SCHED_DEFERRED_AST */
3515
3516 switch(event) {
3517 case SCHED_IPI_EVENT_SPILL:
3518 case SCHED_IPI_EVENT_SMT_REBAL:
3519 case SCHED_IPI_EVENT_REBALANCE:
3520 case SCHED_IPI_EVENT_BOUND_THR:
3521 /*
	 * The spill, SMT rebalance, rebalance and bound thread
	 * scenarios always use immediate IPIs.
3524 */
3525 ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
3526 break;
3527 case SCHED_IPI_EVENT_PREEMPT:
3528 /* In the preemption case, use immediate IPIs for RT threads */
3529 if (thread && (thread->sched_pri >= BASEPRI_RTQUEUES)) {
3530 ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
3531 break;
3532 }
3533
3534 /*
	 * For non-RT thread preemption:
	 * if the core is active, use an immediate IPI;
	 * if the core is idle, use a deferred IPI if supported, otherwise an immediate IPI.
3538 */
3539 if (deferred_ipi_supported && dst_idle) {
3540 return sched_ipi_deferred_policy(pset, dst, event);
3541 }
3542 ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
3543 break;
3544 default:
3545 panic("Unrecognized scheduler IPI event type %d", event);
3546 }
3547 assert(ipi_type != SCHED_IPI_NONE);
3548 return ipi_type;
3549}
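
/*
 * Informal summary of the default policy above: spill, SMT rebalance,
 * rebalance and bound-thread events always use an immediate signal
 * (SCHED_IPI_IDLE for an idle target, SCHED_IPI_IMMEDIATE otherwise);
 * preemption uses an immediate signal for realtime threads and for busy
 * targets, and a deferred signal for idle targets when
 * CONFIG_SCHED_DEFERRED_AST is available (or none if a deferred signal is
 * already pending there).
 */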
3550
3551void sched_ipi_perform(processor_t dst, sched_ipi_type_t ipi)
3552{
3553 switch (ipi) {
3554 case SCHED_IPI_NONE:
3555 break;
3556 case SCHED_IPI_IDLE:
3557 machine_signal_idle(dst);
3558 break;
3559 case SCHED_IPI_IMMEDIATE:
3560 cause_ast_check(dst);
3561 break;
3562 case SCHED_IPI_DEFERRED:
3563 machine_signal_idle_deferred(dst);
3564 break;
3565 default:
3566 panic("Unrecognized scheduler IPI type: %d", ipi);
3567 }
3568}
3569
3570#if defined(CONFIG_SCHED_TIMESHARE_CORE)
3571
3572boolean_t
3573priority_is_urgent(int priority)
3574{
3575 return bitmap_test(sched_preempt_pri, priority) ? TRUE : FALSE;
3576}
3577
3578#endif /* CONFIG_SCHED_TIMESHARE_CORE */
3579
3580/*
3581 * processor_setrun:
3582 *
3583 * Dispatch a thread for execution on a
3584 * processor.
3585 *
3586 * Thread must be locked. Associated pset must
3587 * be locked, and is returned unlocked.
3588 */
3589static void
3590processor_setrun(
3591 processor_t processor,
3592 thread_t thread,
3593 integer_t options)
3594{
3595 processor_set_t pset = processor->processor_set;
3596 pset_assert_locked(pset);
3597 ast_t preempt;
3598 enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing;
3599
3600 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
3601
3602 thread->chosen_processor = processor;
3603
3604 /*
3605 * Dispatch directly onto idle processor.
3606 */
3607 if ( (SCHED(direct_dispatch_to_idle_processors) ||
3608 thread->bound_processor == processor)
3609 && processor->state == PROCESSOR_IDLE) {
3610
3611 processor->next_thread = thread;
3612 processor_state_update_from_thread(processor, thread);
3613 processor->deadline = UINT64_MAX;
3614 pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
3615
3616 ipi_type = sched_ipi_action(processor, thread, true, SCHED_IPI_EVENT_BOUND_THR);
3617 pset_unlock(pset);
3618 sched_ipi_perform(processor, ipi_type);
3619 return;
3620 }
3621
3622 /*
3623 * Set preemption mode.
3624 */
3625#if defined(CONFIG_SCHED_DEFERRED_AST)
3626 /* TODO: Do we need to care about urgency (see rdar://problem/20136239)? */
3627#endif
3628 if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri)
3629 preempt = (AST_PREEMPT | AST_URGENT);
	else if (processor->active_thread && thread_eager_preemption(processor->active_thread))
		preempt = (AST_PREEMPT | AST_URGENT);
	else if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->sched_pri < thread->base_pri)) {
		if (SCHED(priority_is_urgent)(thread->base_pri) && thread->sched_pri > processor->current_pri) {
			preempt = (options & SCHED_PREEMPT) ? AST_PREEMPT : AST_NONE;
		} else {
			preempt = AST_NONE;
		}
	} else
		preempt = (options & SCHED_PREEMPT) ? AST_PREEMPT : AST_NONE;
3640
3641 if ((options & (SCHED_PREEMPT|SCHED_REBALANCE)) == (SCHED_PREEMPT|SCHED_REBALANCE)) {
3642 /*
3643 * Having gone to the trouble of forcing this thread off a less preferred core,
3644 * we should force the preferable core to reschedule immediately to give this
3645 * thread a chance to run instead of just sitting on the run queue where
3646 * it may just be stolen back by the idle core we just forced it off.
3647 */
3648 preempt |= AST_PREEMPT;
3649 }
3650
3651 SCHED(processor_enqueue)(processor, thread, options);
3652 sched_update_pset_load_average(pset);
3653
3654 if (preempt != AST_NONE) {
3655 if (processor->state == PROCESSOR_IDLE) {
3656 processor->next_thread = THREAD_NULL;
3657 processor_state_update_from_thread(processor, thread);
3658 processor->deadline = UINT64_MAX;
3659 pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
3660 ipi_action = eExitIdle;
3661 } else if ( processor->state == PROCESSOR_DISPATCHING) {
3662 if ((processor->next_thread == THREAD_NULL) && (processor->current_pri < thread->sched_pri)) {
3663 processor_state_update_from_thread(processor, thread);
3664 processor->deadline = UINT64_MAX;
3665 }
3666 } else if ( (processor->state == PROCESSOR_RUNNING ||
3667 processor->state == PROCESSOR_SHUTDOWN) &&
3668 (thread->sched_pri >= processor->current_pri)) {
3669 ipi_action = eInterruptRunning;
3670 }
3671 } else {
3672 /*
		 * New thread is not important enough to preempt what is running, but
		 * special processor states may need special handling.
3675 */
3676 if (processor->state == PROCESSOR_SHUTDOWN &&
3677 thread->sched_pri >= processor->current_pri ) {
3678 ipi_action = eInterruptRunning;
3679 } else if (processor->state == PROCESSOR_IDLE) {
3680
3681 processor->next_thread = THREAD_NULL;
3682 processor_state_update_from_thread(processor, thread);
3683 processor->deadline = UINT64_MAX;
3684 pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
3685
3686 ipi_action = eExitIdle;
3687 }
3688 }
3689
3690 if (ipi_action != eDoNothing) {
3691 if (processor == current_processor()) {
3692 if (csw_check_locked(processor, pset, AST_NONE) != AST_NONE)
3693 ast_on(preempt);
3694 } else {
3695 sched_ipi_event_t event = (options & SCHED_REBALANCE) ? SCHED_IPI_EVENT_REBALANCE : SCHED_IPI_EVENT_PREEMPT;
3696 ipi_type = sched_ipi_action(processor, thread, (ipi_action == eExitIdle), event);
3697 }
3698 }
3699 pset_unlock(pset);
3700 sched_ipi_perform(processor, ipi_type);
3701}
3702
3703/*
3704 * choose_next_pset:
3705 *
3706 * Return the next sibling pset containing
3707 * available processors.
3708 *
3709 * Returns the original pset if none other is
3710 * suitable.
3711 */
3712static processor_set_t
3713choose_next_pset(
3714 processor_set_t pset)
3715{
3716 processor_set_t nset = pset;
3717
3718 do {
3719 nset = next_pset(nset);
3720 } while (nset->online_processor_count < 1 && nset != pset);
3721
3722 return (nset);
3723}
3724
3725/*
3726 * choose_processor:
3727 *
3728 * Choose a processor for the thread, beginning at
3729 * the pset. Accepts an optional processor hint in
3730 * the pset.
3731 *
3732 * Returns a processor, possibly from a different pset.
3733 *
3734 * The thread must be locked. The pset must be locked,
3735 * and the resulting pset is locked on return.
3736 */
3737processor_t
3738choose_processor(
3739 processor_set_t starting_pset,
3740 processor_t processor,
3741 thread_t thread)
3742{
3743 processor_set_t pset = starting_pset;
3744 processor_set_t nset;
3745
3746 assert(thread->sched_pri <= BASEPRI_RTQUEUES);
3747
3748 /*
3749 * Prefer the hinted processor, when appropriate.
3750 */
3751
3752 /* Fold last processor hint from secondary processor to its primary */
3753 if (processor != PROCESSOR_NULL) {
3754 processor = processor->processor_primary;
3755 }
3756
3757 /*
3758 * Only consult platform layer if pset is active, which
3759 * it may not be in some cases when a multi-set system
3760 * is going to sleep.
3761 */
3762 if (pset->online_processor_count) {
3763 if ((processor == PROCESSOR_NULL) || (processor->processor_set == pset && processor->state == PROCESSOR_IDLE)) {
3764 processor_t mc_processor = machine_choose_processor(pset, processor);
3765 if (mc_processor != PROCESSOR_NULL)
3766 processor = mc_processor->processor_primary;
3767 }
3768 }
3769
3770 /*
3771 * At this point, we may have a processor hint, and we may have
3772 * an initial starting pset. If the hint is not in the pset, or
3773 * if the hint is for a processor in an invalid state, discard
3774 * the hint.
3775 */
3776 if (processor != PROCESSOR_NULL) {
3777 if (processor->processor_set != pset) {
3778 processor = PROCESSOR_NULL;
3779 } else if (!processor->is_recommended) {
3780 processor = PROCESSOR_NULL;
3781 } else {
3782 switch (processor->state) {
3783 case PROCESSOR_START:
3784 case PROCESSOR_SHUTDOWN:
3785 case PROCESSOR_OFF_LINE:
3786 /*
3787 * Hint is for a processor that cannot support running new threads.
3788 */
3789 processor = PROCESSOR_NULL;
3790 break;
3791 case PROCESSOR_IDLE:
3792 /*
3793 * Hint is for an idle processor. Assume it is no worse than any other
3794 * idle processor. The platform layer had an opportunity to provide
3795 * the "least cost idle" processor above.
3796 */
3797 return (processor);
3798 case PROCESSOR_RUNNING:
3799 case PROCESSOR_DISPATCHING:
3800 /*
3801 * Hint is for an active CPU. This fast-path allows
3802 * realtime threads to preempt non-realtime threads
3803 * to regain their previous executing processor.
3804 */
3805 if ((thread->sched_pri >= BASEPRI_RTQUEUES) &&
3806 (processor->current_pri < BASEPRI_RTQUEUES))
3807 return (processor);
3808
3809 /* Otherwise, use hint as part of search below */
3810 break;
3811 default:
3812 processor = PROCESSOR_NULL;
3813 break;
3814 }
3815 }
3816 }
3817
3818 /*
3819 * Iterate through the processor sets to locate
3820 * an appropriate processor. Seed results with
3821 * a last-processor hint, if available, so that
3822 * a search must find something strictly better
3823 * to replace it.
3824 *
3825	 * A primary/secondary pair of SMT processors is
3826 * "unpaired" if the primary is busy but its
3827 * corresponding secondary is idle (so the physical
3828 * core has full use of its resources).
3829 */
3830
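	/*
	 * Naming convention for the candidates tracked below: "lp_" tracks the
	 * lowest-priority processors seen, "lc_" the lowest run-queue count,
	 * and "fd_" the furthest realtime deadline; the hint (if any) seeds
	 * lp_/lc_/fd_ so the scan must find something strictly better.
	 */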
3831 integer_t lowest_priority = MAXPRI + 1;
3832 integer_t lowest_secondary_priority = MAXPRI + 1;
3833 integer_t lowest_unpaired_primary_priority = MAXPRI + 1;
3834 integer_t lowest_count = INT_MAX;
3835 uint64_t furthest_deadline = 1;
3836 processor_t lp_processor = PROCESSOR_NULL;
3837 processor_t lp_unpaired_primary_processor = PROCESSOR_NULL;
3838 processor_t lp_unpaired_secondary_processor = PROCESSOR_NULL;
3839 processor_t lp_paired_secondary_processor = PROCESSOR_NULL;
3840 processor_t lc_processor = PROCESSOR_NULL;
3841 processor_t fd_processor = PROCESSOR_NULL;
3842
3843 if (processor != PROCESSOR_NULL) {
3844 /* All other states should be enumerated above. */
3845 assert(processor->state == PROCESSOR_RUNNING || processor->state == PROCESSOR_DISPATCHING);
3846
3847 lowest_priority = processor->current_pri;
3848 lp_processor = processor;
3849
3850 if (processor->current_pri >= BASEPRI_RTQUEUES) {
3851 furthest_deadline = processor->deadline;
3852 fd_processor = processor;
3853 }
3854
3855 lowest_count = SCHED(processor_runq_count)(processor);
3856 lc_processor = processor;
3857 }
3858
3859 do {
3860 /*
3861 * Choose an idle processor, in pset traversal order
3862 */
3863
3864 uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] &
3865 pset->primary_map &
3866 pset->recommended_bitmask &
3867 ~pset->pending_AST_cpu_mask);
3868
3869 int cpuid = lsb_first(idle_primary_map);
3870 if (cpuid >= 0) {
3871 processor = processor_array[cpuid];
3872 return processor;
3873 }
3874
3875		/*
3876		 * Otherwise, enumerate the active (running or dispatching) processors to find
3877		 * candidates with the lowest priority, shortest run queue, or furthest realtime deadline.
3878		 */
3879
3880 uint64_t active_map = ((pset->cpu_state_map[PROCESSOR_RUNNING] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
3881 pset->recommended_bitmask &
3882 ~pset->pending_AST_cpu_mask);
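		/*
		 * Rotating the map right by (last_chosen + 1) makes lsb_first() visit
		 * active CPUs starting just after the last chosen one, so placements
		 * rotate around the pset; "(rotid + last_chosen + 1) & 63" below maps
		 * each rotated bit index back to the real cpuid (mod 64).
		 */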
3883 active_map = bit_ror64(active_map, (pset->last_chosen + 1));
3884 for (int rotid = lsb_first(active_map); rotid >= 0; rotid = lsb_next(active_map, rotid)) {
3885 cpuid = ((rotid + pset->last_chosen + 1) & 63);
3886 processor = processor_array[cpuid];
3887
3888 integer_t cpri = processor->current_pri;
3889 if (processor->processor_primary != processor) {
3890 if (cpri < lowest_secondary_priority) {
3891 lowest_secondary_priority = cpri;
3892 lp_paired_secondary_processor = processor;
3893 }
3894 } else {
3895 if (cpri < lowest_priority) {
3896 lowest_priority = cpri;
3897 lp_processor = processor;
3898 }
3899 }
3900
3901 if ((cpri >= BASEPRI_RTQUEUES) && (processor->deadline > furthest_deadline)) {
3902 furthest_deadline = processor->deadline;
3903 fd_processor = processor;
3904 }
3905
3906 integer_t ccount = SCHED(processor_runq_count)(processor);
3907 if (ccount < lowest_count) {
3908 lowest_count = ccount;
3909 lc_processor = processor;
3910 }
3911 }
3912
3913 /*
3914		 * For SMT configs, these idle secondary processors must have an active primary;
3915		 * otherwise the idle primary would have short-circuited the loop above.
3916 */
3917 uint64_t idle_secondary_map = (pset->cpu_state_map[PROCESSOR_IDLE] &
3918 ~pset->primary_map &
3919 pset->recommended_bitmask &
3920 ~pset->pending_AST_cpu_mask);
3921
3922 for (cpuid = lsb_first(idle_secondary_map); cpuid >= 0; cpuid = lsb_next(idle_secondary_map, cpuid)) {
3923 processor = processor_array[cpuid];
3924
3925 processor_t cprimary = processor->processor_primary;
3926
3927 if (!cprimary->is_recommended) {
3928 continue;
3929 }
3930 if (bit_test(pset->pending_AST_cpu_mask, cprimary->cpu_id)) {
3931 continue;
3932 }
3933
3934 /* If the primary processor is offline or starting up, it's not a candidate for this path */
3935 if (cprimary->state == PROCESSOR_RUNNING || cprimary->state == PROCESSOR_DISPATCHING) {
3936 integer_t primary_pri = cprimary->current_pri;
3937
3938 if (primary_pri < lowest_unpaired_primary_priority) {
3939 lowest_unpaired_primary_priority = primary_pri;
3940 lp_unpaired_primary_processor = cprimary;
3941 lp_unpaired_secondary_processor = processor;
3942 }
3943 }
3944 }
3945
3946
3947 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
3948
3949 /*
3950 * For realtime threads, the most important aspect is
3951 * scheduling latency, so we attempt to assign threads
3952 * to good preemption candidates (assuming an idle primary
3953 * processor was not available above).
3954 */
3955
3956 if (thread->sched_pri > lowest_unpaired_primary_priority) {
3957 pset->last_chosen = lp_unpaired_primary_processor->cpu_id;
3958 return lp_unpaired_primary_processor;
3959 }
3960 if (thread->sched_pri > lowest_priority) {
3961 pset->last_chosen = lp_processor->cpu_id;
3962 return lp_processor;
3963 }
3964 if (sched_allow_rt_smt && (thread->sched_pri > lowest_secondary_priority)) {
3965 pset->last_chosen = lp_paired_secondary_processor->cpu_id;
3966 return lp_paired_secondary_processor;
3967 }
3968 if (thread->realtime.deadline < furthest_deadline)
3969 return fd_processor;
3970
3971 /*
3972 * If all primary and secondary CPUs are busy with realtime
3973			 * threads with deadlines earlier than ours, move on to the
3974			 * next pset.
3975 */
3976 }
3977 else {
3978
3979 if (thread->sched_pri > lowest_unpaired_primary_priority) {
3980 pset->last_chosen = lp_unpaired_primary_processor->cpu_id;
3981 return lp_unpaired_primary_processor;
3982 }
3983 if (thread->sched_pri > lowest_priority) {
3984 pset->last_chosen = lp_processor->cpu_id;
3985 return lp_processor;
3986 }
3987
3988 /*
3989			 * If all primary processors in this pset are running higher
3990			 * priority threads, move on to the next pset. Only when we have
3991 * exhausted this search do we fall back to other heuristics.
3992 */
3993 }
3994
3995 /*
3996 * Move onto the next processor set.
3997 */
3998 nset = next_pset(pset);
3999
4000 if (nset != starting_pset) {
4001 pset_unlock(pset);
4002
4003 pset = nset;
4004 pset_lock(pset);
4005 }
4006 } while (nset != starting_pset);
4007
4008 /*
4009 * Make sure that we pick a running processor,
4010 * and that the correct processor set is locked.
4011	 * Since we may have unlocked the candidate processor's
4012	 * pset, it may have changed state.
4013	 *
4014	 * All primary processors are running higher priority
4015	 * threads, so the only options left are enqueuing on
4016	 * the secondary processor that would perturb the lowest-priority
4017	 * primary, or the least busy primary.
4018 */
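	/*
	 * Fallback preference order below: the idle secondary whose busy primary
	 * had the lowest priority ("unpaired"), then the lowest-priority active
	 * secondary, then the processor with the shortest run queue, and finally
	 * master_processor as the last resort.
	 */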
4019 do {
4020
4021 /* lowest_priority is evaluated in the main loops above */
4022 if (lp_unpaired_secondary_processor != PROCESSOR_NULL) {
4023 processor = lp_unpaired_secondary_processor;
4024 lp_unpaired_secondary_processor = PROCESSOR_NULL;
4025 } else if (lp_paired_secondary_processor != PROCESSOR_NULL) {
4026 processor = lp_paired_secondary_processor;
4027 lp_paired_secondary_processor = PROCESSOR_NULL;
4028 } else if (lc_processor != PROCESSOR_NULL) {
4029 processor = lc_processor;
4030 lc_processor = PROCESSOR_NULL;
4031 } else {
4032 /*
4033 * All processors are executing higher
4034 * priority threads, and the lowest_count
4035 * candidate was not usable
4036 */
4037 processor = master_processor;
4038 }
4039
4040 /*
4041 * Check that the correct processor set is
4042 * returned locked.
4043 */
4044 if (pset != processor->processor_set) {
4045 pset_unlock(pset);
4046 pset = processor->processor_set;
4047 pset_lock(pset);
4048 }
4049
4050 /*
4051 * We must verify that the chosen processor is still available.
4052 * master_processor is an exception, since we may need to preempt
4053 * a running thread on it during processor shutdown (for sleep),
4054 * and that thread needs to be enqueued on its runqueue to run
4055 * when the processor is restarted.
4056 */
4057 if (processor != master_processor && (processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE))
4058 processor = PROCESSOR_NULL;
4059
4060 } while (processor == PROCESSOR_NULL);
4061
4062 pset->last_chosen = processor->cpu_id;
4063 return processor;
4064}
4065
4066/*
4067 * thread_setrun:
4068 *
4069 * Dispatch thread for execution, onto an idle
4070 * processor or run queue, and signal a preemption
4071 * as appropriate.
4072 *
4073 * Thread must be locked.
4074 */
4075void
4076thread_setrun(
4077 thread_t thread,
4078 integer_t options)
4079{
4080 processor_t processor;
4081 processor_set_t pset;
4082
4083 assert((thread->state & (TH_RUN|TH_WAIT|TH_UNINT|TH_TERMINATE|TH_TERMINATE2)) == TH_RUN);
4084 assert(thread->runq == PROCESSOR_NULL);
4085
4086 /*
4087 * Update priority if needed.
4088 */
4089 if (SCHED(can_update_priority)(thread))
4090 SCHED(update_priority)(thread);
4091
4092 thread->sfi_class = sfi_thread_classify(thread);
4093
4094 assert(thread->runq == PROCESSOR_NULL);
4095
4096#if __SMP__
4097 if (thread->bound_processor == PROCESSOR_NULL) {
4098 /*
4099 * Unbound case.
4100 */
4101 if (thread->affinity_set != AFFINITY_SET_NULL) {
4102 /*
4103 * Use affinity set policy hint.
4104 */
4105 pset = thread->affinity_set->aset_pset;
4106 pset_lock(pset);
4107
4108 processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
4109 pset = processor->processor_set;
4110
4111 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
4112 (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
4113 } else if (thread->last_processor != PROCESSOR_NULL) {
4114 /*
4115 * Simple (last processor) affinity case.
4116 */
4117 processor = thread->last_processor;
4118 pset = processor->processor_set;
4119 pset_lock(pset);
4120 processor = SCHED(choose_processor)(pset, processor, thread);
4121 pset = processor->processor_set;
4122
4123 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
4124 (uintptr_t)thread_tid(thread), thread->last_processor->cpu_id, processor->cpu_id, processor->state, 0);
4125 } else {
4126 /*
4127 * No Affinity case:
4128 *
4129			 * Utilize a per-task hint to spread threads
4130 * among the available processor sets.
4131 */
4132 task_t task = thread->task;
4133
4134 pset = task->pset_hint;
4135 if (pset == PROCESSOR_SET_NULL)
4136 pset = current_processor()->processor_set;
4137
4138 pset = choose_next_pset(pset);
4139 pset_lock(pset);
4140
4141 processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
4142 pset = processor->processor_set;
4143 task->pset_hint = pset;
4144
4145 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
4146 (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
4147 }
4148 } else {
4149 /*
4150 * Bound case:
4151 *
4152 * Unconditionally dispatch on the processor.
4153 */
4154 processor = thread->bound_processor;
4155 pset = processor->processor_set;
4156 pset_lock(pset);
4157
4158 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
4159 (uintptr_t)thread_tid(thread), (uintptr_t)-2, processor->cpu_id, processor->state, 0);
4160 }
4161#else /* !__SMP__ */
4162 /* Only one processor to choose */
4163 assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == master_processor);
4164 processor = master_processor;
4165 pset = processor->processor_set;
4166 pset_lock(pset);
4167#endif /* !__SMP__ */
4168
4169 /*
4170 * Dispatch the thread on the chosen processor.
4171 * TODO: This should be based on sched_mode, not sched_pri
4172 */
4173 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
4174 realtime_setrun(processor, thread);
4175 } else {
4176 processor_setrun(processor, thread, options);
4177 }
4178 /* pset is now unlocked */
4179 if (thread->bound_processor == PROCESSOR_NULL) {
4180 SCHED(check_spill)(pset, thread);
4181 }
4182}
4183
4184processor_set_t
4185task_choose_pset(
4186 task_t task)
4187{
4188 processor_set_t pset = task->pset_hint;
4189
4190 if (pset != PROCESSOR_SET_NULL)
4191 pset = choose_next_pset(pset);
4192
4193 return (pset);
4194}
4195
4196/*
4197 * Check for a preemption point in
4198 * the current context.
4199 *
4200 * Called at splsched with thread locked.
4201 */
4202ast_t
4203csw_check(
4204 processor_t processor,
4205 ast_t check_reason)
4206{
4207 processor_set_t pset = processor->processor_set;
4208 ast_t result;
4209
4210 pset_lock(pset);
4211
4212 /* If we were sent a remote AST and interrupted a running processor, acknowledge it here with pset lock held */
4213 bit_clear(pset->pending_AST_cpu_mask, processor->cpu_id);
4214
4215 result = csw_check_locked(processor, pset, check_reason);
4216
4217 pset_unlock(pset);
4218
4219 return result;
4220}
4221
4222/*
4223 * Check for preemption at splsched with
4224 * pset and thread locked
4225 */
4226ast_t
4227csw_check_locked(
4228 processor_t processor,
4229 processor_set_t pset,
4230 ast_t check_reason)
4231{
4232 ast_t result;
4233 thread_t thread = processor->active_thread;
4234
4235 if (processor->first_timeslice) {
4236 if (rt_runq_count(pset) > 0)
4237 return (check_reason | AST_PREEMPT | AST_URGENT);
4238 }
4239 else {
4240 if (rt_runq_count(pset) > 0) {
4241 if (BASEPRI_RTQUEUES > processor->current_pri)
4242 return (check_reason | AST_PREEMPT | AST_URGENT);
4243 else
4244 return (check_reason | AST_PREEMPT);
4245 }
4246 }
4247
4248#if __SMP__
4249 /*
4250 * If the current thread is running on a processor that is no longer recommended,
4251 * urgently preempt it, at which point thread_select() should
4252 * try to idle the processor and re-dispatch the thread to a recommended processor.
4253 */
4254 if (!processor->is_recommended) {
4255 return (check_reason | AST_PREEMPT | AST_URGENT);
4256 }
4257#endif
4258
4259 result = SCHED(processor_csw_check)(processor);
4260 if (result != AST_NONE)
4261 return (check_reason | result | (thread_eager_preemption(thread) ? AST_URGENT : AST_NONE));
4262
4263#if __SMP__
4264 /*
4265 * Same for avoid-processor
4266 *
4267 * TODO: Should these set AST_REBALANCE?
4268 */
4269 if (SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread)) {
4270 return (check_reason | AST_PREEMPT);
4271 }
4272
4273 /*
4274 * Even though we could continue executing on this processor, a
4275 * secondary SMT core should try to shed load to another primary core.
4276 *
4277 * TODO: Should this do the same check that thread_select does? i.e.
4278	 * if no bound threads target this processor, and idle primaries exist, preempt.
4279	 * The case of RT threads existing is already taken care of above.
4280 */
4281
4282 if (processor->current_pri < BASEPRI_RTQUEUES &&
4283 processor->processor_primary != processor)
4284 return (check_reason | AST_PREEMPT);
4285#endif
4286
4287 if (thread->state & TH_SUSP)
4288 return (check_reason | AST_PREEMPT);
4289
4290#if CONFIG_SCHED_SFI
4291 /*
4292 * Current thread may not need to be preempted, but maybe needs
4293 * an SFI wait?
4294 */
4295 result = sfi_thread_needs_ast(thread, NULL);
4296 if (result != AST_NONE)
4297 return (check_reason | result);
4298#endif
4299
4300 return (AST_NONE);
4301}
4302
4303/*
4304 * set_sched_pri:
4305 *
4306 * Set the scheduled priority of the specified thread.
4307 *
4308 * This may cause the thread to change queues.
4309 *
4310 * Thread must be locked.
4311 */
4312void
4313set_sched_pri(
4314 thread_t thread,
4315 int new_priority,
4316 set_sched_pri_options_t options)
4317{
4318 thread_t cthread = current_thread();
4319 boolean_t is_current_thread = (thread == cthread) ? TRUE : FALSE;
4320 int curgency, nurgency;
4321 uint64_t urgency_param1, urgency_param2;
4322 boolean_t removed_from_runq = FALSE;
4323
4324 bool lazy_update = ((options & SETPRI_LAZY) == SETPRI_LAZY);
4325
4326 int old_priority = thread->sched_pri;
4327
4328 /* If we're already at this priority, no need to mess with the runqueue */
4329 if (new_priority == old_priority)
4330 return;
4331
4332 if (is_current_thread) {
4333 assert(thread->runq == PROCESSOR_NULL);
4334 curgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
4335 } else {
4336 removed_from_runq = thread_run_queue_remove(thread);
4337 }
4338
4339 thread->sched_pri = new_priority;
4340
4341 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
4342 (uintptr_t)thread_tid(thread),
4343 thread->base_pri,
4344 thread->sched_pri,
4345 thread->sched_usage,
4346 0);
4347
4348 if (is_current_thread) {
4349 nurgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
4350 /*
4351 * set_sched_pri doesn't alter RT params. We expect direct base priority/QoS
4352 * class alterations from user space to occur relatively infrequently, hence
4353 * those are lazily handled. QoS classes have distinct priority bands, and QoS
4354 * inheritance is expected to involve priority changes.
4355 */
4356 uint64_t ctime = mach_approximate_time();
4357 if (nurgency != curgency) {
4358 thread_tell_urgency(nurgency, urgency_param1, urgency_param2, 0, thread);
4359 }
4360 machine_thread_going_on_core(thread, nurgency, 0, 0, ctime);
4361 }
4362
4363 if (removed_from_runq)
4364 thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
4365 else if (thread->state & TH_RUN) {
4366 processor_t processor = thread->last_processor;
4367
4368 if (is_current_thread) {
4369 processor_state_update_from_thread(processor, thread);
4370
4371 /*
4372 * When dropping in priority, check if the thread no longer belongs on core.
4373 * If a thread raises its own priority, don't aggressively rebalance it.
4374 * <rdar://problem/31699165>
4375 */
4376 if (!lazy_update && new_priority < old_priority) {
4377 ast_t preempt;
4378
4379 if ((preempt = csw_check(processor, AST_NONE)) != AST_NONE)
4380 ast_on(preempt);
4381 }
4382 } else if (!lazy_update && processor != PROCESSOR_NULL &&
4383 processor != current_processor() && processor->active_thread == thread) {
4384 cause_ast_check(processor);
4385 }
4386 }
4387}
4388
4389/*
4390 * thread_run_queue_remove_for_handoff
4391 *
4392 * Pull a thread or its (recursive) push target out of the runqueue
4393 * so that it is ready for thread_run()
4394 *
4395 * Called at splsched
4396 *
4397 * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
4398 * This may be different than the thread that was passed in.
4399 */
4400thread_t
4401thread_run_queue_remove_for_handoff(thread_t thread)
4402{
4403 thread_t pulled_thread = THREAD_NULL;
4404
4405 thread_lock(thread);
4406
4407 /*
4408 * Check that the thread is not bound
4409 * to a different processor, and that realtime
4410 * is not involved.
4411 *
4412 * Next, pull it off its run queue. If it
4413 * doesn't come, it's not eligible.
4414 */
4415
4416 processor_t processor = current_processor();
4417 if (processor->current_pri < BASEPRI_RTQUEUES && thread->sched_pri < BASEPRI_RTQUEUES &&
4418 (thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)) {
4419
4420 if (thread_run_queue_remove(thread))
4421 pulled_thread = thread;
4422 }
4423
4424 thread_unlock(thread);
4425
4426 return pulled_thread;
4427}
4428
4429/*
4430 * thread_run_queue_remove:
4431 *
4432 * Remove a thread from its current run queue and
4433 * return TRUE if successful.
4434 *
4435 * Thread must be locked.
4436 *
4437 * If thread->runq is PROCESSOR_NULL, the thread will not re-enter the
4438 * run queues because the caller locked the thread. Otherwise
4439 * the thread is on a run queue, but could be chosen for dispatch
4440 * and removed by another processor under a different lock, which
4441 * will set thread->runq to PROCESSOR_NULL.
4442 *
4443 * Hence the thread select path must not rely on anything that could
4444 * be changed under the thread lock after calling this function,
4445 * most importantly thread->sched_pri.
4446 */
4447boolean_t
4448thread_run_queue_remove(
4449 thread_t thread)
4450{
4451 boolean_t removed = FALSE;
4452 processor_t processor = thread->runq;
4453
4454 if ((thread->state & (TH_RUN|TH_WAIT)) == TH_WAIT) {
4455 /* Thread isn't runnable */
4456 assert(thread->runq == PROCESSOR_NULL);
4457 return FALSE;
4458 }
4459
4460 if (processor == PROCESSOR_NULL) {
4461 /*
4462 * The thread is either not on the runq,
4463 * or is in the midst of being removed from the runq.
4464 *
4465 * runq is set to NULL under the pset lock, not the thread
4466 * lock, so the thread may still be in the process of being dequeued
4467 * from the runq. It will wait in invoke for the thread lock to be
4468 * dropped.
4469 */
4470
4471 return FALSE;
4472 }
4473
4474 if (thread->sched_pri < BASEPRI_RTQUEUES) {
4475 return SCHED(processor_queue_remove)(processor, thread);
4476 }
4477
4478 processor_set_t pset = processor->processor_set;
4479
4480 rt_lock_lock(pset);
4481
4482 if (thread->runq != PROCESSOR_NULL) {
4483 /*
4484 * Thread is on the RT run queue and we have a lock on
4485 * that run queue.
4486 */
4487
4488 remqueue(&thread->runq_links);
4489 SCHED_STATS_RUNQ_CHANGE(&SCHED(rt_runq)(pset)->runq_stats, rt_runq_count(pset));
4490 rt_runq_count_decr(pset);
4491
4492 thread->runq = PROCESSOR_NULL;
4493
4494 removed = TRUE;
4495 }
4496
4497 rt_lock_unlock(pset);
4498
4499 return (removed);
4500}
4501
4502/*
4503 * Put the thread back where it goes after a thread_run_queue_remove
4504 *
4505 * Thread must have been removed under the same thread lock hold
4506 *
4507 * thread locked, at splsched
4508 */
4509void
4510thread_run_queue_reinsert(thread_t thread, integer_t options)
4511{
4512 assert(thread->runq == PROCESSOR_NULL);
4513 assert(thread->state & (TH_RUN));
4514
4515 thread_setrun(thread, options);
4516}
4517
4518void
4519sys_override_cpu_throttle(boolean_t enable_override)
4520{
4521 if (enable_override)
4522 cpu_throttle_enabled = 0;
4523 else
4524 cpu_throttle_enabled = 1;
4525}
4526
4527int
4528thread_get_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2)
4529{
4530 if (thread == NULL || (thread->state & TH_IDLE)) {
4531 *arg1 = 0;
4532 *arg2 = 0;
4533
4534 return (THREAD_URGENCY_NONE);
4535 } else if (thread->sched_mode == TH_MODE_REALTIME) {
4536 *arg1 = thread->realtime.period;
4537 *arg2 = thread->realtime.deadline;
4538
4539 return (THREAD_URGENCY_REAL_TIME);
4540 } else if (cpu_throttle_enabled &&
4541 ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
4542 /*
4543		 * Background urgency is applied when both the scheduled and base priorities are at or below MAXPRI_THROTTLE (i.e. the thread is not promoted)
4544 */
4545 *arg1 = thread->sched_pri;
4546 *arg2 = thread->base_pri;
4547
4548 return (THREAD_URGENCY_BACKGROUND);
4549 } else {
4550 /* For otherwise unclassified threads, report throughput QoS
4551 * parameters
4552 */
4553 *arg1 = proc_get_effective_thread_policy(thread, TASK_POLICY_THROUGH_QOS);
4554 *arg2 = proc_get_effective_task_policy(thread->task, TASK_POLICY_THROUGH_QOS);
4555
4556 return (THREAD_URGENCY_NORMAL);
4557 }
4558}
4559
4560perfcontrol_class_t
4561thread_get_perfcontrol_class(thread_t thread)
4562{
4563 /* Special case handling */
4564 if (thread->state & TH_IDLE)
4565 return PERFCONTROL_CLASS_IDLE;
4566 if (thread->task == kernel_task)
4567 return PERFCONTROL_CLASS_KERNEL;
4568 if (thread->sched_mode == TH_MODE_REALTIME)
4569 return PERFCONTROL_CLASS_REALTIME;
4570
4571 /* perfcontrol_class based on base_pri */
4572 if (thread->base_pri <= MAXPRI_THROTTLE)
4573 return PERFCONTROL_CLASS_BACKGROUND;
4574 else if (thread->base_pri <= BASEPRI_UTILITY)
4575 return PERFCONTROL_CLASS_UTILITY;
4576 else if (thread->base_pri <= BASEPRI_DEFAULT)
4577 return PERFCONTROL_CLASS_NONUI;
4578 else if (thread->base_pri <= BASEPRI_FOREGROUND)
4579 return PERFCONTROL_CLASS_UI;
4580 else
4581 return PERFCONTROL_CLASS_ABOVEUI;
4582}
4583
4584/*
4585 * This is the processor idle loop, which just looks for other threads
4586 * to execute. Processor idle threads invoke this without supplying a
4587 * current thread, in order to idle without an asserted wait state.
4588 *
4589 * Returns the next thread to execute if one was dispatched directly.
4590 */
4591
4592#if 0
4593#define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__)
4594#else
4595#define IDLE_KERNEL_DEBUG_CONSTANT(...) do { } while(0)
4596#endif
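/*
 * The "#if 0" above leaves the per-iteration idle-loop tracepoints compiled
 * out; switching it to "#if 1" routes IDLE_KERNEL_DEBUG_CONSTANT() through
 * KERNEL_DEBUG_CONSTANT() when debugging the idle loop.
 */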
4597
4598thread_t
4599processor_idle(
4600 thread_t thread,
4601 processor_t processor)
4602{
4603 processor_set_t pset = processor->processor_set;
4604 thread_t new_thread;
4605 int state;
4606 (void)splsched();
4607
4608 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4609 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_START,
4610 (uintptr_t)thread_tid(thread), 0, 0, 0, 0);
4611
4612 SCHED_STATS_CPU_IDLE_START(processor);
4613
4614 uint64_t ctime = mach_absolute_time();
4615
4616 timer_switch(&PROCESSOR_DATA(processor, system_state), ctime, &PROCESSOR_DATA(processor, idle_state));
4617 PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, idle_state);
4618
4619 cpu_quiescent_counter_leave(ctime);
4620
4621 while (1) {
4622 /*
4623 * Ensure that updates to my processor and pset state,
4624 * made by the IPI source processor before sending the IPI,
4625 * are visible on this processor now (even though we don't
4626 * take the pset lock yet).
4627 */
4628 atomic_thread_fence(memory_order_acquire);
4629
4630 if (processor->state != PROCESSOR_IDLE)
4631 break;
4632 if (bit_test(pset->pending_AST_cpu_mask, processor->cpu_id))
4633 break;
4634#if defined(CONFIG_SCHED_DEFERRED_AST)
4635 if (bit_test(pset->pending_deferred_AST_cpu_mask, processor->cpu_id))
4636 break;
4637#endif
4638 if (processor->is_recommended && (processor->processor_primary == processor)) {
4639 if (rt_runq_count(pset))
4640 break;
4641 } else {
4642 if (SCHED(processor_bound_count)(processor))
4643 break;
4644 }
4645
4646#if CONFIG_SCHED_IDLE_IN_PLACE
4647 if (thread != THREAD_NULL) {
4648			/* Did the idle-in-place thread wake up? */
4649 if ((thread->state & (TH_WAIT|TH_SUSP)) != TH_WAIT || thread->wake_active)
4650 break;
4651 }
4652#endif
4653
4654 IDLE_KERNEL_DEBUG_CONSTANT(
4655 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -1, 0);
4656
4657 machine_track_platform_idle(TRUE);
4658
4659 machine_idle();
4660
4661 machine_track_platform_idle(FALSE);
4662
4663 (void)splsched();
4664
4665 /*
4666 * Check if we should call sched_timeshare_consider_maintenance() here.
4667 * The CPU was woken out of idle due to an interrupt and we should do the
4668 * call only if the processor is still idle. If the processor is non-idle,
4669 * the threads running on the processor would do the call as part of
4670		 * context switching.
4671 */
4672 if (processor->state == PROCESSOR_IDLE) {
4673 sched_timeshare_consider_maintenance(mach_absolute_time());
4674 }
4675
4676 IDLE_KERNEL_DEBUG_CONSTANT(
4677 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -2, 0);
4678
4679 if (!SCHED(processor_queue_empty)(processor)) {
4680 /* Secondary SMT processors respond to directed wakeups
4681 * exclusively. Some platforms induce 'spurious' SMT wakeups.
4682 */
4683 if (processor->processor_primary == processor)
4684 break;
4685 }
4686 }
4687
4688 ctime = mach_absolute_time();
4689
4690 timer_switch(&PROCESSOR_DATA(processor, idle_state), ctime, &PROCESSOR_DATA(processor, system_state));
4691 PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, system_state);
4692
4693 cpu_quiescent_counter_join(ctime);
4694
4695 pset_lock(pset);
4696
4697 /* If we were sent a remote AST and came out of idle, acknowledge it here with pset lock held */
4698 bit_clear(pset->pending_AST_cpu_mask, processor->cpu_id);
4699#if defined(CONFIG_SCHED_DEFERRED_AST)
4700 bit_clear(pset->pending_deferred_AST_cpu_mask, processor->cpu_id);
4701#endif
4702
4703 state = processor->state;
4704 if (state == PROCESSOR_DISPATCHING) {
4705 /*
4706		 * Common case -- cpu dispatched.
4707 */
4708 new_thread = processor->next_thread;
4709 processor->next_thread = THREAD_NULL;
4710 pset_update_processor_state(pset, processor, PROCESSOR_RUNNING);
4711
4712 if ((new_thread != THREAD_NULL) && (SCHED(processor_queue_has_priority)(processor, new_thread->sched_pri, FALSE) ||
4713 (rt_runq_count(pset) > 0)) ) {
4714 /* Something higher priority has popped up on the runqueue - redispatch this thread elsewhere */
4715 processor_state_update_idle(processor);
4716 processor->deadline = UINT64_MAX;
4717
4718 pset_unlock(pset);
4719
4720 thread_lock(new_thread);
4721 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REDISPATCH), (uintptr_t)thread_tid(new_thread), new_thread->sched_pri, rt_runq_count(pset), 0, 0);
4722 thread_setrun(new_thread, SCHED_HEADQ);
4723 thread_unlock(new_thread);
4724
4725 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4726 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4727 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
4728
4729 return (THREAD_NULL);
4730 }
4731
4732 sched_update_pset_load_average(pset);
4733
4734 pset_unlock(pset);
4735
4736 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4737 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4738 (uintptr_t)thread_tid(thread), state, (uintptr_t)thread_tid(new_thread), 0, 0);
4739
4740 return (new_thread);
4741
4742 } else if (state == PROCESSOR_IDLE) {
4743 pset_update_processor_state(pset, processor, PROCESSOR_RUNNING);
4744 processor_state_update_idle(processor);
4745 processor->deadline = UINT64_MAX;
4746
4747 } else if (state == PROCESSOR_SHUTDOWN) {
4748 /*
4749 * Going off-line. Force a
4750 * reschedule.
4751 */
4752 if ((new_thread = processor->next_thread) != THREAD_NULL) {
4753 processor->next_thread = THREAD_NULL;
4754 processor_state_update_idle(processor);
4755 processor->deadline = UINT64_MAX;
4756
4757 pset_unlock(pset);
4758
4759 thread_lock(new_thread);
4760 thread_setrun(new_thread, SCHED_HEADQ);
4761 thread_unlock(new_thread);
4762
4763 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4764 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4765 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
4766
4767 return (THREAD_NULL);
4768 }
4769 }
4770
4771 pset_unlock(pset);
4772
4773 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4774 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4775 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
4776
4777 return (THREAD_NULL);
4778}
4779
4780/*
4781 * Each processor has a dedicated thread which
4782 * executes the idle loop when there is no suitable
4783 * previous context.
4784 */
4785void
4786idle_thread(void)
4787{
4788 processor_t processor = current_processor();
4789 thread_t new_thread;
4790
4791 new_thread = processor_idle(THREAD_NULL, processor);
4792 if (new_thread != THREAD_NULL) {
4793 thread_run(processor->idle_thread, (thread_continue_t)idle_thread, NULL, new_thread);
4794 /*NOTREACHED*/
4795 }
4796
4797 thread_block((thread_continue_t)idle_thread);
4798 /*NOTREACHED*/
4799}
4800
4801kern_return_t
4802idle_thread_create(
4803 processor_t processor)
4804{
4805 kern_return_t result;
4806 thread_t thread;
4807 spl_t s;
4808 char name[MAXTHREADNAMESIZE];
4809
4810 result = kernel_thread_create((thread_continue_t)idle_thread, NULL, MAXPRI_KERNEL, &thread);
4811 if (result != KERN_SUCCESS)
4812 return (result);
4813
4814 snprintf(name, sizeof(name), "idle #%d", processor->cpu_id);
4815 thread_set_thread_name(thread, name);
4816
4817 s = splsched();
4818 thread_lock(thread);
4819 thread->bound_processor = processor;
4820 processor->idle_thread = thread;
4821 thread->sched_pri = thread->base_pri = IDLEPRI;
4822 thread->state = (TH_RUN | TH_IDLE);
4823 thread->options |= TH_OPT_IDLE_THREAD;
4824 thread_unlock(thread);
4825 splx(s);
4826
4827 thread_deallocate(thread);
4828
4829 return (KERN_SUCCESS);
4830}
4831
4832/*
4833 * sched_startup:
4834 *
4835 * Kicks off scheduler services.
4836 *
4837 * Called at splsched.
4838 */
4839void
4840sched_startup(void)
4841{
4842 kern_return_t result;
4843 thread_t thread;
4844
4845 simple_lock_init(&sched_vm_group_list_lock, 0);
4846
4847#if __arm__ || __arm64__
4848 simple_lock_init(&sched_recommended_cores_lock, 0);
4849#endif /* __arm__ || __arm64__ */
4850
4851 result = kernel_thread_start_priority((thread_continue_t)sched_init_thread,
4852 (void *)SCHED(maintenance_continuation), MAXPRI_KERNEL, &thread);
4853 if (result != KERN_SUCCESS)
4854 panic("sched_startup");
4855
4856 thread_deallocate(thread);
4857
4858 assert_thread_magic(thread);
4859
4860 /*
4861 * Yield to the sched_init_thread once, to
4862 * initialize our own thread after being switched
4863 * back to.
4864 *
4865 * The current thread is the only other thread
4866 * active at this point.
4867 */
4868 thread_block(THREAD_CONTINUE_NULL);
4869}
4870
4871#if __arm64__
4872static _Atomic uint64_t sched_perfcontrol_callback_deadline;
4873#endif /* __arm64__ */
4874
4875
4876#if defined(CONFIG_SCHED_TIMESHARE_CORE)
4877
4878static volatile uint64_t sched_maintenance_deadline;
4879static uint64_t sched_tick_last_abstime;
4880static uint64_t sched_tick_delta;
4881uint64_t sched_tick_max_delta;
4882
4883
4884/*
4885 * sched_timeshare_maintenance_continue:
4886 *
4887 * Perform periodic bookkeeping functions about ten
4888 * times per second.
4889 */
4890void
4891sched_timeshare_maintenance_continue(void)
4892{
4893 uint64_t sched_tick_ctime, late_time;
4894
4895 struct sched_update_scan_context scan_context = {
4896 .earliest_bg_make_runnable_time = UINT64_MAX,
4897 .earliest_normal_make_runnable_time = UINT64_MAX,
4898 .earliest_rt_make_runnable_time = UINT64_MAX
4899 };
4900
4901 sched_tick_ctime = mach_absolute_time();
4902
4903 if (__improbable(sched_tick_last_abstime == 0)) {
4904 sched_tick_last_abstime = sched_tick_ctime;
4905 late_time = 0;
4906 sched_tick_delta = 1;
4907 } else {
4908 late_time = sched_tick_ctime - sched_tick_last_abstime;
4909 sched_tick_delta = late_time / sched_tick_interval;
4910		/* Ensure a delta of at least 1, since the interval could be slightly
4911 * smaller than the sched_tick_interval due to dispatch
4912 * latencies.
4913 */
4914 sched_tick_delta = MAX(sched_tick_delta, 1);
4915
4916		/* In the event that interrupt latencies or platform
4917 * idle events that advanced the timebase resulted
4918 * in periods where no threads were dispatched,
4919 * cap the maximum "tick delta" at SCHED_TICK_MAX_DELTA
4920 * iterations.
4921 */
4922 sched_tick_delta = MIN(sched_tick_delta, SCHED_TICK_MAX_DELTA);
4923
4924 sched_tick_last_abstime = sched_tick_ctime;
4925 sched_tick_max_delta = MAX(sched_tick_delta, sched_tick_max_delta);
4926 }
4927
4928 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE)|DBG_FUNC_START,
4929 sched_tick_delta, late_time, 0, 0, 0);
4930
4931	/* Add a number of pseudo-ticks corresponding to the elapsed interval.
4932	 * This could be greater than 1 if substantial intervals occur where
4933	 * all processors are idle, which rarely happens in practice.
4934 */
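	/*
	 * For example, if the maintenance thread was delayed by roughly three
	 * tick intervals, sched_tick_delta is 3 (capped at SCHED_TICK_MAX_DELTA)
	 * and sched_tick advances by 3; compute_averages() is handed the same
	 * delta so the load averages can account for the missed intervals.
	 */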
4935
4936 sched_tick += sched_tick_delta;
4937
4938 update_vm_info();
4939
4940 /*
4941 * Compute various averages.
4942 */
4943 compute_averages(sched_tick_delta);
4944
4945 /*
4946 * Scan the run queues for threads which
4947 * may need to be updated, and find the earliest runnable thread on the runqueue
4948 * to report its latency.
4949 */
4950 SCHED(thread_update_scan)(&scan_context);
4951
4952 SCHED(rt_runq_scan)(&scan_context);
4953
4954 uint64_t ctime = mach_absolute_time();
4955
4956 uint64_t bg_max_latency = (ctime > scan_context.earliest_bg_make_runnable_time) ?
4957 ctime - scan_context.earliest_bg_make_runnable_time : 0;
4958
4959 uint64_t default_max_latency = (ctime > scan_context.earliest_normal_make_runnable_time) ?
4960 ctime - scan_context.earliest_normal_make_runnable_time : 0;
4961
4962 uint64_t realtime_max_latency = (ctime > scan_context.earliest_rt_make_runnable_time) ?
4963 ctime - scan_context.earliest_rt_make_runnable_time : 0;
4964
4965 machine_max_runnable_latency(bg_max_latency, default_max_latency, realtime_max_latency);
4966
4967 /*
4968 * Check to see if the special sched VM group needs attention.
4969 */
4970 sched_vm_group_maintenance();
4971
4972#if __arm__ || __arm64__
4973 /* Check to see if the recommended cores failsafe is active */
4974 sched_recommended_cores_maintenance();
4975#endif /* __arm__ || __arm64__ */
4976
4977
4978#if DEBUG || DEVELOPMENT
4979#if __x86_64__
4980#include <i386/misc_protos.h>
4981 /* Check for long-duration interrupts */
4982 mp_interrupt_watchdog();
4983#endif /* __x86_64__ */
4984#endif /* DEBUG || DEVELOPMENT */
4985
4986 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_END,
4987 sched_pri_shifts[TH_BUCKET_SHARE_FG], sched_pri_shifts[TH_BUCKET_SHARE_BG],
4988 sched_pri_shifts[TH_BUCKET_SHARE_UT], sched_pri_shifts[TH_BUCKET_SHARE_DF], 0);
4989
4990 assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT);
4991 thread_block((thread_continue_t)sched_timeshare_maintenance_continue);
4992 /*NOTREACHED*/
4993}
4994
4995static uint64_t sched_maintenance_wakeups;
4996
4997/*
4998 * Determine if the set of routines formerly driven by a maintenance timer
4999 * must be invoked, based on a deadline comparison. Signals the scheduler
5000 * maintenance thread on deadline expiration. Must be invoked at an interval
5001 * lower than the "sched_tick_interval", currently accomplished by
5002 * invocation via the quantum expiration timer and at context switch time.
5003 * Performance matters: this routine reuses a timestamp approximating the
5004 * current absolute time received from the caller, and should perform
5005 * no more than a comparison against the deadline in the common case.
5006 */
5007void
5008sched_timeshare_consider_maintenance(uint64_t ctime)
5009{
5010 cpu_quiescent_counter_checkin(ctime);
5011
5012 uint64_t deadline = sched_maintenance_deadline;
5013
5014 if (__improbable(ctime >= deadline)) {
5015 if (__improbable(current_thread() == sched_maintenance_thread))
5016 return;
5017 OSMemoryBarrier();
5018
5019 uint64_t ndeadline = ctime + sched_tick_interval;
5020
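		/*
		 * Every CPU runs this check, but only the first one to observe the
		 * expired deadline wins the compare-and-swap below; that CPU advances
		 * the deadline and wakes the maintenance thread, while the others
		 * either fail the swap or already see the new deadline.
		 */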
5021 if (__probable(__sync_bool_compare_and_swap(&sched_maintenance_deadline, deadline, ndeadline))) {
5022 thread_wakeup((event_t)sched_timeshare_maintenance_continue);
5023 sched_maintenance_wakeups++;
5024 }
5025 }
5026
5027 uint64_t load_compute_deadline = __c11_atomic_load(&sched_load_compute_deadline, memory_order_relaxed);
5028
5029 if (__improbable(load_compute_deadline && ctime >= load_compute_deadline)) {
5030 uint64_t new_deadline = 0;
5031 if (__c11_atomic_compare_exchange_strong(&sched_load_compute_deadline, &load_compute_deadline, new_deadline,
5032 memory_order_relaxed, memory_order_relaxed)) {
5033 compute_sched_load();
5034 new_deadline = ctime + sched_load_compute_interval_abs;
5035 __c11_atomic_store(&sched_load_compute_deadline, new_deadline, memory_order_relaxed);
5036 }
5037 }
5038
5039#if __arm64__
5040 uint64_t perf_deadline = __c11_atomic_load(&sched_perfcontrol_callback_deadline, memory_order_relaxed);
5041
5042 if (__improbable(perf_deadline && ctime >= perf_deadline)) {
5043 /* CAS in 0, if success, make callback. Otherwise let the next context switch check again. */
5044 if (__c11_atomic_compare_exchange_strong(&sched_perfcontrol_callback_deadline, &perf_deadline, 0,
5045 memory_order_relaxed, memory_order_relaxed)) {
5046 machine_perfcontrol_deadline_passed(perf_deadline);
5047 }
5048 }
5049#endif /* __arm64__ */
5050
5051}
5052
5053#endif /* CONFIG_SCHED_TIMESHARE_CORE */
5054
5055void
5056sched_init_thread(void (*continuation)(void))
5057{
5058 thread_block(THREAD_CONTINUE_NULL);
5059
5060 thread_t thread = current_thread();
5061
5062 thread_set_thread_name(thread, "sched_maintenance_thread");
5063
5064 sched_maintenance_thread = thread;
5065
5066 continuation();
5067
5068 /*NOTREACHED*/
5069}
5070
5071#if defined(CONFIG_SCHED_TIMESHARE_CORE)
5072
5073/*
5074 * thread_update_scan / runq_scan:
5075 *
5076 * Scan the run queues to account for timesharing threads
5077 * which need to be updated.
5078 *
5079 * Scanner runs in two passes. Pass one squirrels likely
5080 * threads away in an array, pass two does the update.
5081 *
5082 * This is necessary because the run queue is locked for
5083 * the candidate scan, but the thread is locked for the update.
5084 *
5085 * Array should be sized to make forward progress, without
5086 * disabling preemption for long periods.
5087 */
5088
5089#define THREAD_UPDATE_SIZE 128
5090
5091static thread_t thread_update_array[THREAD_UPDATE_SIZE];
5092static uint32_t thread_update_count = 0;
5093
5094/* Returns TRUE if thread was added, FALSE if thread_update_array is full */
5095boolean_t
5096thread_update_add_thread(thread_t thread)
5097{
5098 if (thread_update_count == THREAD_UPDATE_SIZE)
5099 return (FALSE);
5100
5101 thread_update_array[thread_update_count++] = thread;
5102 thread_reference_internal(thread);
5103 return (TRUE);
5104}
5105
5106void
5107thread_update_process_threads(void)
5108{
5109 assert(thread_update_count <= THREAD_UPDATE_SIZE);
5110
5111 for (uint32_t i = 0 ; i < thread_update_count ; i++) {
5112 thread_t thread = thread_update_array[i];
5113 assert_thread_magic(thread);
5114 thread_update_array[i] = THREAD_NULL;
5115
5116 spl_t s = splsched();
5117 thread_lock(thread);
5118 if (!(thread->state & (TH_WAIT)) && thread->sched_stamp != sched_tick) {
5119 SCHED(update_priority)(thread);
5120 }
5121 thread_unlock(thread);
5122 splx(s);
5123
5124 thread_deallocate(thread);
5125 }
5126
5127 thread_update_count = 0;
5128}
5129
5130/*
5131 * Scan a runq for candidate threads.
5132 *
5133 * Returns TRUE if retry is needed.
5134 */
5135boolean_t
5136runq_scan(
5137 run_queue_t runq,
5138 sched_update_scan_context_t scan_context)
5139{
5140 int count = runq->count;
5141 int queue_index;
5142
5143 assert(count >= 0);
5144
5145 if (count == 0)
5146 return FALSE;
5147
5148 for (queue_index = bitmap_first(runq->bitmap, NRQS);
5149 queue_index >= 0;
5150 queue_index = bitmap_next(runq->bitmap, queue_index)) {
5151
5152 thread_t thread;
5153 queue_t queue = &runq->queues[queue_index];
5154
5155 qe_foreach_element(thread, queue, runq_links) {
5156 assert(count > 0);
5157 assert_thread_magic(thread);
5158
5159 if (thread->sched_stamp != sched_tick &&
5160 thread->sched_mode == TH_MODE_TIMESHARE) {
5161 if (thread_update_add_thread(thread) == FALSE)
5162 return TRUE;
5163 }
5164
5165 if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
5166 if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) {
5167 scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time;
5168 }
5169 } else {
5170 if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) {
5171 scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time;
5172 }
5173 }
5174 count--;
5175 }
5176 }
5177
5178 return FALSE;
5179}
5180
5181#endif /* CONFIG_SCHED_TIMESHARE_CORE */
5182
5183boolean_t
5184thread_eager_preemption(thread_t thread)
5185{
5186 return ((thread->sched_flags & TH_SFLAG_EAGERPREEMPT) != 0);
5187}
5188
5189void
5190thread_set_eager_preempt(thread_t thread)
5191{
5192 spl_t x;
5193 processor_t p;
5194 ast_t ast = AST_NONE;
5195
5196 x = splsched();
5197 p = current_processor();
5198
5199 thread_lock(thread);
5200 thread->sched_flags |= TH_SFLAG_EAGERPREEMPT;
5201
5202 if (thread == current_thread()) {
5203
5204 ast = csw_check(p, AST_NONE);
5205 thread_unlock(thread);
5206 if (ast != AST_NONE) {
5207 (void) thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
5208 }
5209 } else {
5210 p = thread->last_processor;
5211
5212 if (p != PROCESSOR_NULL && p->state == PROCESSOR_RUNNING &&
5213 p->active_thread == thread) {
5214 cause_ast_check(p);
5215 }
5216
5217 thread_unlock(thread);
5218 }
5219
5220 splx(x);
5221}
5222
5223void
5224thread_clear_eager_preempt(thread_t thread)
5225{
5226 spl_t x;
5227
5228 x = splsched();
5229 thread_lock(thread);
5230
5231 thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT;
5232
5233 thread_unlock(thread);
5234 splx(x);
5235}
5236
5237/*
5238 * Scheduling statistics
5239 */
5240void
5241sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri)
5242{
5243 struct processor_sched_statistics *stats;
5244 boolean_t to_realtime = FALSE;
5245
5246 stats = &processor->processor_data.sched_stats;
5247 stats->csw_count++;
5248
5249 if (otherpri >= BASEPRI_REALTIME) {
5250 stats->rt_sched_count++;
5251 to_realtime = TRUE;
5252 }
5253
5254 if ((reasons & AST_PREEMPT) != 0) {
5255 stats->preempt_count++;
5256
5257 if (selfpri >= BASEPRI_REALTIME) {
5258 stats->preempted_rt_count++;
5259 }
5260
5261 if (to_realtime) {
5262 stats->preempted_by_rt_count++;
5263 }
5264
5265 }
5266}
5267
5268void
5269sched_stats_handle_runq_change(struct runq_stats *stats, int old_count)
5270{
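	/*
	 * count_sum accumulates (time since last change * queue depth), so
	 * dividing it by the total elapsed time yields the time-weighted
	 * average run-queue depth over the measurement window.
	 */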
5271 uint64_t timestamp = mach_absolute_time();
5272
5273 stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count;
5274 stats->last_change_timestamp = timestamp;
5275}
5276
5277/*
5278 * For calls from assembly code
5279 */
5280#undef thread_wakeup
5281void
5282thread_wakeup(
5283 event_t x);
5284
5285void
5286thread_wakeup(
5287 event_t x)
5288{
5289 thread_wakeup_with_result(x, THREAD_AWAKENED);
5290}
5291
5292boolean_t
5293preemption_enabled(void)
5294{
5295 return (get_preemption_level() == 0 && ml_get_interrupts_enabled());
5296}
5297
5298static void
5299sched_timer_deadline_tracking_init(void) {
5300 nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT, &timer_deadline_tracking_bin_1);
5301 nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2);
5302}
5303
5304#if __arm__ || __arm64__
5305
5306uint32_t perfcontrol_requested_recommended_cores = ALL_CORES_RECOMMENDED;
5307uint32_t perfcontrol_requested_recommended_core_count = MAX_CPUS;
5308bool perfcontrol_failsafe_active = false;
5309bool perfcontrol_sleep_override = false;
5310
5311uint64_t perfcontrol_failsafe_maintenance_runnable_time;
5312uint64_t perfcontrol_failsafe_activation_time;
5313uint64_t perfcontrol_failsafe_deactivation_time;
5314
5315/* data covering who likely caused it and how long they ran */
5316#define FAILSAFE_NAME_LEN 33 /* (2*MAXCOMLEN)+1 from size of p_name */
5317char perfcontrol_failsafe_name[FAILSAFE_NAME_LEN];
5318int perfcontrol_failsafe_pid;
5319uint64_t perfcontrol_failsafe_tid;
5320uint64_t perfcontrol_failsafe_thread_timer_at_start;
5321uint64_t perfcontrol_failsafe_thread_timer_last_seen;
5322uint32_t perfcontrol_failsafe_recommended_at_trigger;
5323
5324/*
5325 * Perf controller calls here to update the recommended core bitmask.
5326 * If the failsafe is active, we don't immediately apply the new value.
5327 * Instead, we store the new request and use it after the failsafe deactivates.
5328 *
5329 * If the failsafe is not active, immediately apply the update.
5330 *
5331 * No scheduler locks are held, no other locks are held that scheduler might depend on,
5332 * interrupts are enabled
5333 *
5334 * currently the prototype is in osfmk/arm/machine_routines.h
5335 */
5336void
5337sched_perfcontrol_update_recommended_cores(uint32_t recommended_cores)
5338{
5339 assert(preemption_enabled());
5340
5341 spl_t s = splsched();
5342 simple_lock(&sched_recommended_cores_lock);
5343
5344 perfcontrol_requested_recommended_cores = recommended_cores;
5345 perfcontrol_requested_recommended_core_count = __builtin_popcountll(recommended_cores);
5346
5347 if ((perfcontrol_failsafe_active == false) && (perfcontrol_sleep_override == false))
5348 sched_update_recommended_cores(perfcontrol_requested_recommended_cores);
5349 else
5350 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
5351 MACHDBG_CODE(DBG_MACH_SCHED,MACH_REC_CORES_FAILSAFE) | DBG_FUNC_NONE,
5352 perfcontrol_requested_recommended_cores,
5353 sched_maintenance_thread->last_made_runnable_time, 0, 0, 0);
5354
5355 simple_unlock(&sched_recommended_cores_lock);
5356 splx(s);
5357}
5358
5359void
5360sched_override_recommended_cores_for_sleep(void)
5361{
5362 spl_t s = splsched();
5363 simple_lock(&sched_recommended_cores_lock);
5364
5365 if (perfcontrol_sleep_override == false) {
5366 perfcontrol_sleep_override = true;
5367 sched_update_recommended_cores(ALL_CORES_RECOMMENDED);
5368 }
5369
5370 simple_unlock(&sched_recommended_cores_lock);
5371 splx(s);
5372}
5373
5374void
5375sched_restore_recommended_cores_after_sleep(void)
5376{
5377 spl_t s = splsched();
5378 simple_lock(&sched_recommended_cores_lock);
5379
5380 if (perfcontrol_sleep_override == true) {
5381 perfcontrol_sleep_override = false;
5382 sched_update_recommended_cores(perfcontrol_requested_recommended_cores);
5383 }
5384
5385 simple_unlock(&sched_recommended_cores_lock);
5386 splx(s);
5387}
5388
5389/*
5390 * Consider whether we need to activate the recommended cores failsafe
5391 *
5392 * Called from quantum timer interrupt context of a realtime thread
5393 * No scheduler locks are held, interrupts are disabled
5394 */
5395void
5396sched_consider_recommended_cores(uint64_t ctime, thread_t cur_thread)
5397{
5398 /*
5399 * Check if a realtime thread is starving the system
5400 * and bringing up non-recommended cores would help
5401 *
5402 * TODO: Is this the correct check for recommended == possible cores?
5403 * TODO: Validate the checks without the relevant lock are OK.
5404 */
5405
5406 if (__improbable(perfcontrol_failsafe_active == TRUE)) {
5407 /* keep track of how long the responsible thread runs */
5408
5409 simple_lock(&sched_recommended_cores_lock);
5410
5411 if (perfcontrol_failsafe_active == TRUE &&
5412 cur_thread->thread_id == perfcontrol_failsafe_tid) {
5413 perfcontrol_failsafe_thread_timer_last_seen = timer_grab(&cur_thread->user_timer) +
5414 timer_grab(&cur_thread->system_timer);
5415 }
5416
5417 simple_unlock(&sched_recommended_cores_lock);
5418
5419 /* we're already trying to solve the problem, so bail */
5420 return;
5421 }
5422
5423 /* The failsafe won't help if there are no more processors to enable */
5424 if (__probable(perfcontrol_requested_recommended_core_count >= processor_count))
5425 return;
5426
5427 uint64_t too_long_ago = ctime - perfcontrol_failsafe_starvation_threshold;
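	/*
	 * The failsafe only considers triggering when the maintenance thread was
	 * last made runnable more than perfcontrol_failsafe_starvation_threshold
	 * ago; the checks below confirm, under the thread lock, that it is still
	 * waiting on a run queue.
	 */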
5428
5429 /* Use the maintenance thread as our canary in the coal mine */
5430 thread_t m_thread = sched_maintenance_thread;
5431
5432 /* If it doesn't look bad, nothing to see here */
5433 if (__probable(m_thread->last_made_runnable_time >= too_long_ago))
5434 return;
5435
5436 /* It looks bad, take the lock to be sure */
5437 thread_lock(m_thread);
5438
5439 if (m_thread->runq == PROCESSOR_NULL ||
5440 (m_thread->state & (TH_RUN|TH_WAIT)) != TH_RUN ||
5441 m_thread->last_made_runnable_time >= too_long_ago) {
5442 /*
5443 * Maintenance thread is either on cpu or blocked, and
5444 * therefore wouldn't benefit from more cores
5445 */
5446 thread_unlock(m_thread);
5447 return;
5448 }
5449
5450 uint64_t maintenance_runnable_time = m_thread->last_made_runnable_time;
5451
5452 thread_unlock(m_thread);
5453
5454 /*
5455 * There are cores disabled at perfcontrol's recommendation, but the
5456 * system is so overloaded that the maintenance thread can't run.
5457 * That likely means that perfcontrol can't run either, so it can't fix
5458 * the recommendation. We have to kick in a failsafe to keep from starving.
5459 *
5460 * When the maintenance thread has been starved for too long,
5461 * ignore the recommendation from perfcontrol and light up all the cores.
5462 *
5463 * TODO: Consider weird states like boot, sleep, or debugger
5464 */
5465
5466 simple_lock(&sched_recommended_cores_lock);
5467
5468 if (perfcontrol_failsafe_active == TRUE) {
5469 simple_unlock(&sched_recommended_cores_lock);
5470 return;
5471 }
5472
5473 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
5474 MACHDBG_CODE(DBG_MACH_SCHED,MACH_REC_CORES_FAILSAFE) | DBG_FUNC_START,
5475 perfcontrol_requested_recommended_cores, maintenance_runnable_time, 0, 0, 0);
5476
5477 perfcontrol_failsafe_active = TRUE;
5478 perfcontrol_failsafe_activation_time = mach_absolute_time();
5479 perfcontrol_failsafe_maintenance_runnable_time = maintenance_runnable_time;
5480 perfcontrol_failsafe_recommended_at_trigger = perfcontrol_requested_recommended_cores;
5481
5482 /* Capture some data about who screwed up (assuming that the thread on core is at fault) */
5483 task_t task = cur_thread->task;
5484 perfcontrol_failsafe_pid = task_pid(task);
5485 strlcpy(perfcontrol_failsafe_name, proc_name_address(task->bsd_info), sizeof(perfcontrol_failsafe_name));
5486
5487 perfcontrol_failsafe_tid = cur_thread->thread_id;
5488
5489 /* Blame the thread for time it has run recently */
5490 uint64_t recent_computation = (ctime - cur_thread->computation_epoch) + cur_thread->computation_metered;
5491
5492 uint64_t last_seen = timer_grab(&cur_thread->user_timer) + timer_grab(&cur_thread->system_timer);
5493
5494 /* Compute the start time of the bad behavior in terms of the thread's on core time */
5495 perfcontrol_failsafe_thread_timer_at_start = last_seen - recent_computation;
5496 perfcontrol_failsafe_thread_timer_last_seen = last_seen;
5497
5498 /* Ignore the previously recommended core configuration */
5499 sched_update_recommended_cores(ALL_CORES_RECOMMENDED);
5500
5501 simple_unlock(&sched_recommended_cores_lock);
5502}
5503
5504/*
5505 * Now that our bacon has been saved by the failsafe, consider whether to turn it off
5506 *
5507 * Runs in the context of the maintenance thread, no locks held
5508 */
5509static void
5510sched_recommended_cores_maintenance(void)
5511{
5512 /* Common case - no failsafe, nothing to be done here */
5513 if (__probable(perfcontrol_failsafe_active == FALSE))
5514 return;
5515
5516 uint64_t ctime = mach_absolute_time();
5517
5518 boolean_t print_diagnostic = FALSE;
5519 char p_name[FAILSAFE_NAME_LEN] = "";
5520
5521 spl_t s = splsched();
5522 simple_lock(&sched_recommended_cores_lock);
5523
5524 /* Check again, under the lock, to avoid races */
5525 if (perfcontrol_failsafe_active == FALSE)
5526 goto out;
5527
5528 /*
5529	 * Ensure that the other cores get another few ticks to run some threads.
5530	 * Without this hysteresis, the maintenance thread would be the first
5531	 * to run, and would immediately de-recommend the other cores again.
5532 */
5533 if ((ctime - perfcontrol_failsafe_activation_time) < perfcontrol_failsafe_starvation_threshold)
5534 goto out;
5535
5536 /* Capture some diagnostic state under the lock so we can print it out later */
5537
5538 int pid = perfcontrol_failsafe_pid;
5539 uint64_t tid = perfcontrol_failsafe_tid;
5540
5541 uint64_t thread_usage = perfcontrol_failsafe_thread_timer_last_seen -
5542 perfcontrol_failsafe_thread_timer_at_start;
5543 uint32_t rec_cores_before = perfcontrol_failsafe_recommended_at_trigger;
5544 uint32_t rec_cores_after = perfcontrol_requested_recommended_cores;
5545 uint64_t failsafe_duration = ctime - perfcontrol_failsafe_activation_time;
5546 strlcpy(p_name, perfcontrol_failsafe_name, sizeof(p_name));
5547
5548 print_diagnostic = TRUE;
5549
5550 /* Deactivate the failsafe and reinstate the requested recommendation settings */
5551
5552 perfcontrol_failsafe_deactivation_time = ctime;
5553 perfcontrol_failsafe_active = FALSE;
5554
5555 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
5556 MACHDBG_CODE(DBG_MACH_SCHED,MACH_REC_CORES_FAILSAFE) | DBG_FUNC_END,
5557 perfcontrol_requested_recommended_cores, failsafe_duration, 0, 0, 0);
5558
5559 sched_update_recommended_cores(perfcontrol_requested_recommended_cores);
5560
5561out:
5562 simple_unlock(&sched_recommended_cores_lock);
5563 splx(s);
5564
5565 if (print_diagnostic) {
5566 uint64_t failsafe_duration_ms = 0, thread_usage_ms = 0;
5567
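		/*
		 * Both durations are in mach absolute time units; convert to
		 * nanoseconds and then to milliseconds for the log message.
		 */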
5568 absolutetime_to_nanoseconds(failsafe_duration, &failsafe_duration_ms);
5569 failsafe_duration_ms = failsafe_duration_ms / NSEC_PER_MSEC;
5570
5571 absolutetime_to_nanoseconds(thread_usage, &thread_usage_ms);
5572 thread_usage_ms = thread_usage_ms / NSEC_PER_MSEC;
5573
5574 printf("recommended core failsafe kicked in for %lld ms "
5575 "likely due to %s[%d] thread 0x%llx spending "
5576 "%lld ms on cpu at realtime priority - "
5577 "new recommendation: 0x%x -> 0x%x\n",
5578 failsafe_duration_ms, p_name, pid, tid, thread_usage_ms,
5579 rec_cores_before, rec_cores_after);
5580 }
5581}
5582
5583/*
5584 * Apply a new recommended cores mask to the processors it affects
5585 * Runs after considering failsafes and such
5586 *
5587 * Iterate over processors and update their ->is_recommended field.
5588 * If a processor is running, we let it drain out at its next
5589 * quantum expiration or blocking point. If a processor is idle, there
5590 * may be more work for it to do, so IPI it.
5591 *
5592 * interrupts disabled, sched_recommended_cores_lock is held
5593 */
5594static void
5595sched_update_recommended_cores(uint32_t recommended_cores)
5596{
5597 processor_set_t pset, nset;
5598 processor_t processor;
5599 uint64_t needs_exit_idle_mask = 0x0;
5600
5601 processor = processor_list;
5602 pset = processor->processor_set;
5603
5604 KDBG(MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_START,
5605 recommended_cores, perfcontrol_failsafe_active, 0, 0);
5606
5607 if (__builtin_popcount(recommended_cores) == 0) {
5608 bit_set(recommended_cores, master_processor->cpu_id); /* add boot processor or we hang */
5609 }
5610
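	/*
	 * Walk the global processor list twice: first mark the newly
	 * recommended processors, then de-recommend and shut down the rest.
	 * The pset lock is handed off whenever the walk crosses into a
	 * different pset, so at most one pset lock is held at a time.
	 */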
5611 /* First set recommended cores */
5612 pset_lock(pset);
5613 do {
5614
5615 nset = processor->processor_set;
5616 if (nset != pset) {
5617 pset_unlock(pset);
5618 pset = nset;
5619 pset_lock(pset);
5620 }
5621
5622 if (bit_test(recommended_cores, processor->cpu_id)) {
5623 processor->is_recommended = TRUE;
5624 bit_set(pset->recommended_bitmask, processor->cpu_id);
5625
5626 if (processor->state == PROCESSOR_IDLE) {
5627 if (processor != current_processor()) {
5628 bit_set(needs_exit_idle_mask, processor->cpu_id);
5629 }
5630 }
5631 }
5632 } while ((processor = processor->processor_list) != NULL);
5633 pset_unlock(pset);
5634
5635	/* Now shut down the cores that are not recommended */
5636 processor = processor_list;
5637 pset = processor->processor_set;
5638
5639 pset_lock(pset);
5640 do {
5641
5642 nset = processor->processor_set;
5643 if (nset != pset) {
5644 pset_unlock(pset);
5645 pset = nset;
5646 pset_lock(pset);
5647 }
5648
5649 if (!bit_test(recommended_cores, processor->cpu_id)) {
5650 sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
5651
5652 processor->is_recommended = FALSE;
5653 bit_clear(pset->recommended_bitmask, processor->cpu_id);
5654
5655 if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
5656 ipi_type = SCHED_IPI_IMMEDIATE;
5657 }
5658 SCHED(processor_queue_shutdown)(processor);
5659 /* pset unlocked */
5660
5661 SCHED(rt_queue_shutdown)(processor);
5662
5663 if (ipi_type != SCHED_IPI_NONE) {
5664 if (processor == current_processor()) {
5665 ast_on(AST_PREEMPT);
5666 } else {
5667 sched_ipi_perform(processor, ipi_type);
5668 }
5669 }
5670
5671 pset_lock(pset);
5672 }
5673 } while ((processor = processor->processor_list) != NULL);
5674 pset_unlock(pset);
5675
5676 /* Issue all pending IPIs now that the pset lock has been dropped */
5677 for (int cpuid = lsb_first(needs_exit_idle_mask); cpuid >= 0; cpuid = lsb_next(needs_exit_idle_mask, cpuid)) {
5678 processor = processor_array[cpuid];
5679 machine_signal_idle(processor);
5680 }
5681
5682 KDBG(MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_END,
5683 needs_exit_idle_mask, 0, 0, 0);
5684}
5685#endif /* __arm__ || __arm64__ */
5686
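/*
 * OR the given option bits into the current thread's options, under
 * the thread lock with interrupts disabled.
 */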
5687void thread_set_options(uint32_t thopt) {
5688 spl_t x;
5689 thread_t t = current_thread();
5690
5691 x = splsched();
5692 thread_lock(t);
5693
5694 t->options |= thopt;
5695
5696 thread_unlock(t);
5697 splx(x);
5698}
5699
5700void thread_set_pending_block_hint(thread_t thread, block_hint_t block_hint) {
5701 thread->pending_block_hint = block_hint;
5702}
5703
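/*
 * Forward to the active scheduler's implementation via the SCHED()
 * dispatch table.
 */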
5704uint32_t qos_max_parallelism(int qos, uint64_t options)
5705{
5706 return SCHED(qos_max_parallelism)(qos, options);
5707}
5708
5709uint32_t sched_qos_max_parallelism(__unused int qos, uint64_t options)
5710{
5711 host_basic_info_data_t hinfo;
5712 mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
5713 /* Query the machine layer for core information */
5714 __assert_only kern_return_t kret = host_info(host_self(), HOST_BASIC_INFO,
5715 (host_info_t)&hinfo, &count);
5716 assert(kret == KERN_SUCCESS);
5717
5718 /* We would not want multiple realtime threads running on the
5719	 * same physical core, even on SMT-capable machines.
5720 */
5721 if (options & QOS_PARALLELISM_REALTIME) {
5722 return hinfo.physical_cpu;
5723 }
5724
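	/*
	 * For illustration (hypothetical machine with 4 physical cores and
	 * 2-way SMT, i.e. 8 logical CPUs): QOS_PARALLELISM_REALTIME returns
	 * 4 above, while the checks below return 8 for
	 * QOS_PARALLELISM_COUNT_LOGICAL and 4 otherwise.
	 */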
5725 if (options & QOS_PARALLELISM_COUNT_LOGICAL) {
5726 return hinfo.logical_cpu;
5727 } else {
5728 return hinfo.physical_cpu;
5729 }
5730}
5731
5732#if __arm64__
5733
5734/*
5735 * Set up a new timer, or replace the old timer with the new one
5736 *
5737 * Returns TRUE if an old timer was canceled, FALSE if there was none to cancel
5738 */
5739boolean_t
5740sched_perfcontrol_update_callback_deadline(uint64_t new_deadline)
5741{
5742	/*
5743	 * Atomically exchange the old deadline for the new one.  If the old deadline
5744	 * was nonzero, a pending callback was canceled; otherwise there was nothing to cancel.
5745	 */
5746
5747 uint64_t old_deadline = __c11_atomic_load(&sched_perfcontrol_callback_deadline,
5748 memory_order_relaxed);
5749
5750
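	/*
	 * compare_exchange_weak may fail spuriously, so retry until it
	 * succeeds; the net effect is an unconditional atomic exchange,
	 * leaving old_deadline holding whatever value was actually replaced.
	 */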
5751 while (!__c11_atomic_compare_exchange_weak(&sched_perfcontrol_callback_deadline,
5752 &old_deadline, new_deadline,
5753 memory_order_relaxed, memory_order_relaxed));
5754
5755
5756	/* old_deadline now holds the value actually replaced; it may differ from the initial load if we raced */
5757
5758 return (old_deadline != 0) ? TRUE : FALSE;
5759}
5760
5761#endif /* __arm64__ */
5762
5763void
5764sched_update_pset_load_average(processor_set_t pset)
5765{
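	/*
	 * The instantaneous load is the count of running CPUs plus the
	 * runnable threads on the pset and realtime run queues, in fixed
	 * point (PSET_LOAD_NUMERATOR_SHIFT).  Halving the sum of the old
	 * average and the new sample gives an exponential moving average
	 * with weight 1/2 per update.
	 */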
5766 int load = ((bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + pset->pset_runq.count + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT);
5767 int new_load_average = (pset->load_average + load) >> 1;
5768
5769 pset->load_average = new_load_average;
5770
5771#if (DEVELOPMENT || DEBUG)
5772#endif
5773}
5774
5775/* pset is locked */
5776static processor_t
5777choose_processor_for_realtime_thread(processor_set_t pset)
5778{
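	/*
	 * Consider only recommended processors without a pending AST.
	 * Return the first primary that is idle, or that is running or
	 * dispatching below realtime priority; secondaries are considered
	 * the same way only if sched_allow_rt_smt is set.
	 */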
5779 uint64_t cpu_map = (pset->cpu_bitmask & pset->recommended_bitmask & ~pset->pending_AST_cpu_mask);
5780
5781 for (int cpuid = lsb_first(cpu_map); cpuid >= 0; cpuid = lsb_next(cpu_map, cpuid)) {
5782 processor_t processor = processor_array[cpuid];
5783
5784 if (processor->processor_primary != processor) {
5785 continue;
5786 }
5787
5788 if (processor->state == PROCESSOR_IDLE) {
5789 return processor;
5790 }
5791
5792 if ((processor->state != PROCESSOR_RUNNING) && (processor->state != PROCESSOR_DISPATCHING)) {
5793 continue;
5794 }
5795
5796 if (processor->current_pri >= BASEPRI_RTQUEUES) {
5797 continue;
5798 }
5799
5800 return processor;
5801
5802 }
5803
5804 if (!sched_allow_rt_smt) {
5805 return PROCESSOR_NULL;
5806 }
5807
5808 /* Consider secondary processors */
5809 for (int cpuid = lsb_first(cpu_map); cpuid >= 0; cpuid = lsb_next(cpu_map, cpuid)) {
5810 processor_t processor = processor_array[cpuid];
5811
5812 if (processor->processor_primary == processor) {
5813 continue;
5814 }
5815
5816 if (processor->state == PROCESSOR_IDLE) {
5817 return processor;
5818 }
5819
5820 if ((processor->state != PROCESSOR_RUNNING) && (processor->state != PROCESSOR_DISPATCHING)) {
5821 continue;
5822 }
5823
5824 if (processor->current_pri >= BASEPRI_RTQUEUES) {
5825 continue;
5826 }
5827
5828 return processor;
5829
5830 }
5831
5832 return PROCESSOR_NULL;
5833}
5834
5835/* pset is locked */
5836static bool
5837all_available_primaries_are_running_realtime_threads(processor_set_t pset)
5838{
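	/*
	 * Return false as soon as a recommended primary is found that is
	 * idle, dispatching, or running a thread below realtime priority;
	 * processors in any other state are skipped rather than waited for.
	 */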
5839 uint64_t cpu_map = (pset->cpu_bitmask & pset->recommended_bitmask);
5840
5841 for (int cpuid = lsb_first(cpu_map); cpuid >= 0; cpuid = lsb_next(cpu_map, cpuid)) {
5842 processor_t processor = processor_array[cpuid];
5843
5844 if (processor->processor_primary != processor) {
5845 continue;
5846 }
5847
5848 if (processor->state == PROCESSOR_IDLE) {
5849 return false;
5850 }
5851
5852 if (processor->state == PROCESSOR_DISPATCHING) {
5853 return false;
5854 }
5855
5856 if (processor->state != PROCESSOR_RUNNING) {
5857 /*
5858 * All other processor states are considered unavailable to run
5859 * realtime threads. In particular, we prefer an available secondary
5860 * processor over the risk of leaving a realtime thread on the run queue
5861		 * while waiting for a processor in the PROCESSOR_START state,
5862		 * which should in any case be rare.
5863 */
5864 continue;
5865 }
5866
5867 if (processor->current_pri < BASEPRI_RTQUEUES) {
5868 return false;
5869 }
5870 }
5871
5872 return true;
5873}
5874
5875
5876