1 | /* |
2 | * Copyright (c) 2000-2016 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | /* |
29 | * @OSF_FREE_COPYRIGHT@ |
30 | */ |
31 | /* |
32 | * Mach Operating System |
33 | * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University |
34 | * All Rights Reserved. |
35 | * |
36 | * Permission to use, copy, modify and distribute this software and its |
37 | * documentation is hereby granted, provided that both the copyright |
38 | * notice and this permission notice appear in all copies of the |
39 | * software, derivative works or modified versions, and any portions |
40 | * thereof, and that both notices appear in supporting documentation. |
41 | * |
42 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" |
43 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR |
44 | * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. |
45 | * |
46 | * Carnegie Mellon requests users of this software to return to |
47 | * |
48 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU |
49 | * School of Computer Science |
50 | * Carnegie Mellon University |
51 | * Pittsburgh PA 15213-3890 |
52 | * |
53 | * any improvements or extensions that they make and grant Carnegie Mellon |
54 | * the rights to redistribute these changes. |
55 | */ |
56 | /* |
57 | */ |
58 | /* |
59 | * File: sched_prim.c |
60 | * Author: Avadis Tevanian, Jr. |
61 | * Date: 1986 |
62 | * |
63 | * Scheduling primitives |
64 | * |
65 | */ |
66 | |
67 | #include <debug.h> |
68 | |
69 | #include <mach/mach_types.h> |
70 | #include <mach/machine.h> |
71 | #include <mach/policy.h> |
72 | #include <mach/sync_policy.h> |
73 | #include <mach/thread_act.h> |
74 | |
75 | #include <machine/machine_routines.h> |
76 | #include <machine/sched_param.h> |
77 | #include <machine/machine_cpu.h> |
78 | #include <machine/machlimits.h> |
79 | #include <machine/atomic.h> |
80 | |
81 | #include <machine/commpage.h> |
82 | |
83 | #include <kern/kern_types.h> |
84 | #include <kern/backtrace.h> |
85 | #include <kern/clock.h> |
86 | #include <kern/counters.h> |
87 | #include <kern/cpu_number.h> |
88 | #include <kern/cpu_data.h> |
89 | #include <kern/smp.h> |
90 | #include <kern/debug.h> |
91 | #include <kern/macro_help.h> |
92 | #include <kern/machine.h> |
93 | #include <kern/misc_protos.h> |
94 | #if MONOTONIC |
95 | #include <kern/monotonic.h> |
96 | #endif /* MONOTONIC */ |
97 | #include <kern/processor.h> |
98 | #include <kern/queue.h> |
99 | #include <kern/sched.h> |
100 | #include <kern/sched_prim.h> |
101 | #include <kern/sfi.h> |
102 | #include <kern/syscall_subr.h> |
103 | #include <kern/task.h> |
104 | #include <kern/thread.h> |
105 | #include <kern/ledger.h> |
106 | #include <kern/timer_queue.h> |
107 | #include <kern/waitq.h> |
108 | #include <kern/policy_internal.h> |
109 | #include <kern/cpu_quiesce.h> |
110 | |
111 | #include <vm/pmap.h> |
112 | #include <vm/vm_kern.h> |
113 | #include <vm/vm_map.h> |
114 | #include <vm/vm_pageout.h> |
115 | |
116 | #include <mach/sdt.h> |
117 | #include <mach/mach_host.h> |
118 | #include <mach/host_info.h> |
119 | |
120 | #include <sys/kdebug.h> |
121 | #include <kperf/kperf.h> |
122 | #include <kern/kpc.h> |
123 | #include <san/kasan.h> |
124 | #include <kern/pms.h> |
125 | #include <kern/host.h> |
126 | #include <stdatomic.h> |
127 | |
128 | int rt_runq_count(processor_set_t pset) |
129 | { |
130 | return atomic_load_explicit(&SCHED(rt_runq)(pset)->count, memory_order_relaxed); |
131 | } |
132 | |
133 | void rt_runq_count_incr(processor_set_t pset) |
134 | { |
135 | atomic_fetch_add_explicit(&SCHED(rt_runq)(pset)->count, 1, memory_order_relaxed); |
136 | } |
137 | |
138 | void rt_runq_count_decr(processor_set_t pset) |
139 | { |
140 | atomic_fetch_sub_explicit(&SCHED(rt_runq)(pset)->count, 1, memory_order_relaxed); |
141 | } |
142 | |
143 | #define DEFAULT_PREEMPTION_RATE 100 /* (1/s) */ |
144 | int default_preemption_rate = DEFAULT_PREEMPTION_RATE; |
145 | |
146 | #define DEFAULT_BG_PREEMPTION_RATE 400 /* (1/s) */ |
147 | int default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE; |
148 | |
149 | #define MAX_UNSAFE_QUANTA 800 |
150 | int max_unsafe_quanta = MAX_UNSAFE_QUANTA; |
151 | |
152 | #define MAX_POLL_QUANTA 2 |
153 | int max_poll_quanta = MAX_POLL_QUANTA; |
154 | |
155 | #define SCHED_POLL_YIELD_SHIFT 4 /* 1/16 */ |
156 | int sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT; |
157 | |
158 | uint64_t max_poll_computation; |
159 | |
160 | uint64_t max_unsafe_computation; |
161 | uint64_t sched_safe_duration; |
162 | |
163 | #if defined(CONFIG_SCHED_TIMESHARE_CORE) |
164 | |
165 | uint32_t std_quantum; |
166 | uint32_t min_std_quantum; |
167 | uint32_t bg_quantum; |
168 | |
169 | uint32_t std_quantum_us; |
170 | uint32_t bg_quantum_us; |
171 | |
172 | #endif /* CONFIG_SCHED_TIMESHARE_CORE */ |
173 | |
174 | uint32_t thread_depress_time; |
175 | uint32_t default_timeshare_computation; |
176 | uint32_t default_timeshare_constraint; |
177 | |
178 | uint32_t max_rt_quantum; |
179 | uint32_t min_rt_quantum; |
180 | |
181 | #if defined(CONFIG_SCHED_TIMESHARE_CORE) |
182 | |
183 | unsigned sched_tick; |
184 | uint32_t sched_tick_interval; |
185 | |
186 | /* Timeshare load calculation interval (15ms) */ |
187 | uint32_t sched_load_compute_interval_us = 15000; |
188 | uint64_t sched_load_compute_interval_abs; |
189 | static _Atomic uint64_t sched_load_compute_deadline; |
190 | |
191 | uint32_t sched_pri_shifts[TH_BUCKET_MAX]; |
192 | uint32_t sched_fixed_shift; |
193 | |
194 | uint32_t sched_decay_usage_age_factor = 1; /* accelerate 5/8^n usage aging */ |
195 | |
196 | /* Allow foreground to decay past default to resolve inversions */ |
197 | #define DEFAULT_DECAY_BAND_LIMIT ((BASEPRI_FOREGROUND - BASEPRI_DEFAULT) + 2) |
198 | int sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT; |
199 | |
200 | /* Defaults for timer deadline profiling */ |
#define TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT	2000000	/* Timers with deadlines <= 2ms */
#define TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT	5000000	/* Timers with deadlines <= 5ms */
205 | |
206 | uint64_t timer_deadline_tracking_bin_1; |
207 | uint64_t timer_deadline_tracking_bin_2; |
208 | |
209 | #endif /* CONFIG_SCHED_TIMESHARE_CORE */ |
210 | |
211 | thread_t sched_maintenance_thread; |
212 | |
213 | #if __arm__ || __arm64__ |
214 | /* interrupts disabled lock to guard recommended cores state */ |
215 | decl_simple_lock_data(static,sched_recommended_cores_lock); |
216 | static void sched_recommended_cores_maintenance(void); |
217 | static void sched_update_recommended_cores(uint32_t recommended_cores); |
218 | |
219 | uint64_t perfcontrol_failsafe_starvation_threshold; |
220 | extern char *proc_name_address(struct proc *p); |
221 | |
222 | #endif /* __arm__ || __arm64__ */ |
223 | |
224 | uint64_t sched_one_second_interval; |
225 | |
226 | /* Forwards */ |
227 | |
228 | #if defined(CONFIG_SCHED_TIMESHARE_CORE) |
229 | |
230 | static void load_shift_init(void); |
231 | static void preempt_pri_init(void); |
232 | |
233 | #endif /* CONFIG_SCHED_TIMESHARE_CORE */ |
234 | |
235 | #if CONFIG_SCHED_IDLE_IN_PLACE |
236 | static thread_t thread_select_idle( |
237 | thread_t thread, |
238 | processor_t processor); |
239 | #endif |
240 | |
241 | thread_t processor_idle( |
242 | thread_t thread, |
243 | processor_t processor); |
244 | |
245 | ast_t |
246 | csw_check_locked( processor_t processor, |
247 | processor_set_t pset, |
248 | ast_t check_reason); |
249 | |
250 | static void processor_setrun( |
251 | processor_t processor, |
252 | thread_t thread, |
253 | integer_t options); |
254 | |
255 | static void |
256 | sched_realtime_timebase_init(void); |
257 | |
258 | static void |
259 | sched_timer_deadline_tracking_init(void); |
260 | |
261 | #if DEBUG |
262 | extern int debug_task; |
263 | #define TLOG(a, fmt, args...) if(debug_task & a) kprintf(fmt, ## args) |
264 | #else |
265 | #define TLOG(a, fmt, args...) do {} while (0) |
266 | #endif |
267 | |
268 | static processor_t |
269 | thread_bind_internal( |
270 | thread_t thread, |
271 | processor_t processor); |
272 | |
273 | static void |
274 | sched_vm_group_maintenance(void); |
275 | |
276 | #if defined(CONFIG_SCHED_TIMESHARE_CORE) |
277 | int8_t sched_load_shifts[NRQS]; |
278 | bitmap_t sched_preempt_pri[BITMAP_LEN(NRQS)]; |
279 | #endif /* CONFIG_SCHED_TIMESHARE_CORE */ |
280 | |
281 | const struct sched_dispatch_table *sched_current_dispatch = NULL; |
282 | |
283 | /* |
284 | * Statically allocate a buffer to hold the longest possible |
285 | * scheduler description string, as currently implemented. |
 * bsd/kern/kern_sysctl.c has a corresponding definition used to export
 * the string to userspace via sysctl(3). If either version changes,
 * update the other.
289 | * |
290 | * Note that in addition to being an upper bound on the strings |
291 | * in the kernel, it's also an exact parameter to PE_get_default(), |
292 | * which interrogates the device tree on some platforms. That |
293 | * API requires the caller know the exact size of the device tree |
294 | * property, so we need both a legacy size (32) and the current size |
295 | * (48) to deal with old and new device trees. The device tree property |
296 | * is similarly padded to a fixed size so that the same kernel image |
297 | * can run on multiple devices with different schedulers configured |
298 | * in the device tree. |
299 | */ |
300 | char sched_string[SCHED_STRING_MAX_LENGTH]; |
301 | |
302 | uint32_t sched_debug_flags = SCHED_DEBUG_FLAG_CHOOSE_PROCESSOR_TRACEPOINTS; |
303 | |
304 | /* Global flag which indicates whether Background Stepper Context is enabled */ |
305 | static int cpu_throttle_enabled = 1; |
306 | |
307 | #if DEBUG |
308 | |
309 | /* Since using the indirect function dispatch table has a negative impact on |
310 | * context switch performance, only allow DEBUG kernels to use that mechanism. |
311 | */ |
312 | static void |
313 | sched_init_override(void) |
314 | { |
315 | char sched_arg[SCHED_STRING_MAX_LENGTH] = { '\0' }; |
316 | |
317 | /* Check for runtime selection of the scheduler algorithm */ |
318 | if (!PE_parse_boot_argn("sched" , sched_arg, sizeof (sched_arg))) { |
319 | sched_arg[0] = '\0'; |
320 | } |
321 | if (strlen(sched_arg) > 0) { |
322 | if (0) { |
323 | /* Allow pattern below */ |
324 | #if defined(CONFIG_SCHED_TRADITIONAL) |
325 | } else if (0 == strcmp(sched_arg, sched_traditional_dispatch.sched_name)) { |
326 | sched_current_dispatch = &sched_traditional_dispatch; |
327 | } else if (0 == strcmp(sched_arg, sched_traditional_with_pset_runqueue_dispatch.sched_name)) { |
328 | sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch; |
329 | #endif |
330 | #if defined(CONFIG_SCHED_MULTIQ) |
331 | } else if (0 == strcmp(sched_arg, sched_multiq_dispatch.sched_name)) { |
332 | sched_current_dispatch = &sched_multiq_dispatch; |
333 | } else if (0 == strcmp(sched_arg, sched_dualq_dispatch.sched_name)) { |
334 | sched_current_dispatch = &sched_dualq_dispatch; |
335 | #endif |
336 | } else { |
337 | #if defined(CONFIG_SCHED_TRADITIONAL) |
338 | printf("Unrecognized scheduler algorithm: %s\n" , sched_arg); |
339 | printf("Scheduler: Using instead: %s\n" , sched_traditional_with_pset_runqueue_dispatch.sched_name); |
340 | sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch; |
341 | #else |
342 | panic("Unrecognized scheduler algorithm: %s" , sched_arg); |
343 | #endif |
344 | } |
345 | kprintf("Scheduler: Runtime selection of %s\n" , SCHED(sched_name)); |
346 | } else { |
347 | #if defined(CONFIG_SCHED_MULTIQ) |
348 | sched_current_dispatch = &sched_dualq_dispatch; |
349 | #elif defined(CONFIG_SCHED_TRADITIONAL) |
350 | sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch; |
351 | #else |
352 | #error No default scheduler implementation |
353 | #endif |
354 | kprintf("Scheduler: Default of %s\n" , SCHED(sched_name)); |
355 | } |
356 | } |
357 | |
358 | #endif /* DEBUG */ |
359 | |
360 | void |
361 | sched_init(void) |
362 | { |
363 | #if DEBUG |
364 | sched_init_override(); |
365 | #else /* DEBUG */ |
366 | kprintf("Scheduler: Default of %s\n" , SCHED(sched_name)); |
367 | #endif /* DEBUG */ |
368 | |
369 | if (!PE_parse_boot_argn("sched_pri_decay_limit" , &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) { |
370 | /* No boot-args, check in device tree */ |
371 | if (!PE_get_default("kern.sched_pri_decay_limit" , |
372 | &sched_pri_decay_band_limit, |
373 | sizeof(sched_pri_decay_band_limit))) { |
374 | /* Allow decay all the way to normal limits */ |
375 | sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT; |
376 | } |
377 | } |
378 | |
379 | kprintf("Setting scheduler priority decay band limit %d\n" , sched_pri_decay_band_limit); |
380 | |
381 | if (PE_parse_boot_argn("sched_debug" , &sched_debug_flags, sizeof(sched_debug_flags))) { |
382 | kprintf("Scheduler: Debug flags 0x%08x\n" , sched_debug_flags); |
383 | } |
384 | strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string)); |
385 | |
386 | cpu_quiescent_counter_init(); |
387 | |
388 | SCHED(init)(); |
389 | SCHED(rt_init)(&pset0); |
390 | sched_timer_deadline_tracking_init(); |
391 | |
392 | SCHED(pset_init)(&pset0); |
393 | SCHED(processor_init)(master_processor); |
394 | } |
395 | |
396 | void |
397 | sched_timebase_init(void) |
398 | { |
399 | uint64_t abstime; |
400 | |
401 | clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime); |
402 | sched_one_second_interval = abstime; |
403 | |
404 | SCHED(timebase_init)(); |
405 | sched_realtime_timebase_init(); |
406 | } |
407 | |
408 | #if defined(CONFIG_SCHED_TIMESHARE_CORE) |
409 | |
410 | void |
411 | sched_timeshare_init(void) |
412 | { |
413 | /* |
414 | * Calculate the timeslicing quantum |
415 | * in us. |
416 | */ |
417 | if (default_preemption_rate < 1) |
418 | default_preemption_rate = DEFAULT_PREEMPTION_RATE; |
419 | std_quantum_us = (1000 * 1000) / default_preemption_rate; |
420 | |
421 | printf("standard timeslicing quantum is %d us\n" , std_quantum_us); |
422 | |
423 | if (default_bg_preemption_rate < 1) |
424 | default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE; |
425 | bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate; |
426 | |
427 | printf("standard background quantum is %d us\n" , bg_quantum_us); |
428 | |
429 | load_shift_init(); |
430 | preempt_pri_init(); |
431 | sched_tick = 0; |
432 | } |
433 | |
434 | void |
435 | sched_timeshare_timebase_init(void) |
436 | { |
437 | uint64_t abstime; |
438 | uint32_t shift; |
439 | |
440 | /* standard timeslicing quantum */ |
441 | clock_interval_to_absolutetime_interval( |
442 | std_quantum_us, NSEC_PER_USEC, &abstime); |
443 | assert((abstime >> 32) == 0 && (uint32_t)abstime != 0); |
444 | std_quantum = (uint32_t)abstime; |
445 | |
446 | /* smallest remaining quantum (250 us) */ |
447 | clock_interval_to_absolutetime_interval(250, NSEC_PER_USEC, &abstime); |
448 | assert((abstime >> 32) == 0 && (uint32_t)abstime != 0); |
449 | min_std_quantum = (uint32_t)abstime; |
450 | |
451 | /* quantum for background tasks */ |
452 | clock_interval_to_absolutetime_interval( |
453 | bg_quantum_us, NSEC_PER_USEC, &abstime); |
454 | assert((abstime >> 32) == 0 && (uint32_t)abstime != 0); |
455 | bg_quantum = (uint32_t)abstime; |
456 | |
457 | /* scheduler tick interval */ |
458 | clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT, |
459 | NSEC_PER_USEC, &abstime); |
460 | assert((abstime >> 32) == 0 && (uint32_t)abstime != 0); |
461 | sched_tick_interval = (uint32_t)abstime; |
462 | |
463 | /* timeshare load calculation interval & deadline initialization */ |
464 | clock_interval_to_absolutetime_interval(sched_load_compute_interval_us, NSEC_PER_USEC, &sched_load_compute_interval_abs); |
465 | sched_load_compute_deadline = sched_load_compute_interval_abs; |
466 | |
467 | /* |
468 | * Compute conversion factor from usage to |
469 | * timesharing priorities with 5/8 ** n aging. |
470 | */ |
471 | abstime = (abstime * 5) / 3; |
472 | for (shift = 0; abstime > BASEPRI_DEFAULT; ++shift) |
473 | abstime >>= 1; |
474 | sched_fixed_shift = shift; |
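
	/*
	 * Worked example (illustrative only, assuming a timebase where one
	 * absolute-time unit equals 1ns and the usual SCHED_TICK_SHIFT of 3):
	 * the 125ms sched_tick_interval scaled by 5/3 is roughly 208333333
	 * units, which must be halved 23 times before it drops to
	 * BASEPRI_DEFAULT (31), so sched_fixed_shift would be 23.
	 */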
475 | |
476 | for (uint32_t i = 0 ; i < TH_BUCKET_MAX ; i++) |
477 | sched_pri_shifts[i] = INT8_MAX; |
478 | |
479 | max_unsafe_computation = ((uint64_t)max_unsafe_quanta) * std_quantum; |
480 | sched_safe_duration = 2 * ((uint64_t)max_unsafe_quanta) * std_quantum; |
481 | |
482 | max_poll_computation = ((uint64_t)max_poll_quanta) * std_quantum; |
483 | thread_depress_time = 1 * std_quantum; |
484 | default_timeshare_computation = std_quantum / 2; |
485 | default_timeshare_constraint = std_quantum; |
486 | |
487 | #if __arm__ || __arm64__ |
488 | perfcontrol_failsafe_starvation_threshold = (2 * sched_tick_interval); |
489 | #endif /* __arm__ || __arm64__ */ |
490 | } |
491 | |
492 | #endif /* CONFIG_SCHED_TIMESHARE_CORE */ |
493 | |
494 | void |
495 | pset_rt_init(processor_set_t pset) |
496 | { |
497 | rt_lock_init(pset); |
498 | |
499 | pset->rt_runq.count = 0; |
500 | queue_init(&pset->rt_runq.queue); |
501 | memset(&pset->rt_runq.runq_stats, 0, sizeof pset->rt_runq.runq_stats); |
502 | } |
503 | |
504 | rt_queue_t |
505 | sched_rtglobal_runq(processor_set_t pset) |
506 | { |
507 | (void)pset; |
508 | |
509 | return &pset0.rt_runq; |
510 | } |
511 | |
512 | void |
513 | sched_rtglobal_init(processor_set_t pset) |
514 | { |
515 | if (pset == &pset0) { |
516 | return pset_rt_init(pset); |
517 | } |
518 | |
519 | /* Only pset0 rt_runq is used, so make it easy to detect |
520 | * buggy accesses to others. |
521 | */ |
522 | memset(&pset->rt_runq, 0xfd, sizeof pset->rt_runq); |
523 | } |
524 | |
525 | void |
526 | sched_rtglobal_queue_shutdown(processor_t processor) |
527 | { |
528 | (void)processor; |
529 | } |
530 | |
531 | static void |
532 | sched_realtime_timebase_init(void) |
533 | { |
534 | uint64_t abstime; |
535 | |
	/* smallest rt computation (50 us) */
537 | clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime); |
538 | assert((abstime >> 32) == 0 && (uint32_t)abstime != 0); |
539 | min_rt_quantum = (uint32_t)abstime; |
540 | |
541 | /* maximum rt computation (50 ms) */ |
542 | clock_interval_to_absolutetime_interval( |
543 | 50, 1000*NSEC_PER_USEC, &abstime); |
544 | assert((abstime >> 32) == 0 && (uint32_t)abstime != 0); |
545 | max_rt_quantum = (uint32_t)abstime; |
546 | |
547 | } |
548 | |
549 | void |
550 | sched_check_spill(processor_set_t pset, thread_t thread) |
551 | { |
552 | (void)pset; |
553 | (void)thread; |
554 | |
555 | return; |
556 | } |
557 | |
558 | bool |
559 | sched_thread_should_yield(processor_t processor, thread_t thread) |
560 | { |
561 | (void)thread; |
562 | |
563 | return (!SCHED(processor_queue_empty)(processor) || rt_runq_count(processor->processor_set) > 0); |
564 | } |
565 | |
566 | #if defined(CONFIG_SCHED_TIMESHARE_CORE) |
567 | |
568 | /* |
569 | * Set up values for timeshare |
570 | * loading factors. |
571 | */ |
572 | static void |
573 | load_shift_init(void) |
574 | { |
575 | int8_t k, *p = sched_load_shifts; |
576 | uint32_t i, j; |
577 | |
578 | uint32_t sched_decay_penalty = 1; |
579 | |
580 | if (PE_parse_boot_argn("sched_decay_penalty" , &sched_decay_penalty, sizeof (sched_decay_penalty))) { |
581 | kprintf("Overriding scheduler decay penalty %u\n" , sched_decay_penalty); |
582 | } |
583 | |
584 | if (PE_parse_boot_argn("sched_decay_usage_age_factor" , &sched_decay_usage_age_factor, sizeof (sched_decay_usage_age_factor))) { |
585 | kprintf("Overriding scheduler decay usage age factor %u\n" , sched_decay_usage_age_factor); |
586 | } |
587 | |
588 | if (sched_decay_penalty == 0) { |
589 | /* |
590 | * There is no penalty for timeshare threads for using too much |
591 | * CPU, so set all load shifts to INT8_MIN. Even under high load, |
592 | * sched_pri_shift will be >INT8_MAX, and there will be no |
593 | * penalty applied to threads (nor will sched_usage be updated per |
594 | * thread). |
595 | */ |
596 | for (i = 0; i < NRQS; i++) { |
597 | sched_load_shifts[i] = INT8_MIN; |
598 | } |
599 | |
600 | return; |
601 | } |
602 | |
603 | *p++ = INT8_MIN; *p++ = 0; |
604 | |
605 | /* |
606 | * For a given system load "i", the per-thread priority |
607 | * penalty per quantum of CPU usage is ~2^k priority |
608 | * levels. "sched_decay_penalty" can cause more |
609 | * array entries to be filled with smaller "k" values |
610 | */ |
611 | for (i = 2, j = 1 << sched_decay_penalty, k = 1; i < NRQS; ++k) { |
612 | for (j <<= 1; (i < j) && (i < NRQS); ++i) |
613 | *p++ = k; |
614 | } |
615 | } |
616 | |
617 | static void |
618 | preempt_pri_init(void) |
619 | { |
620 | bitmap_t *p = sched_preempt_pri; |
621 | |
622 | for (int i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i) |
623 | bitmap_set(p, i); |
624 | |
625 | for (int i = BASEPRI_PREEMPT; i <= MAXPRI; ++i) |
626 | bitmap_set(p, i); |
627 | } |
628 | |
629 | #endif /* CONFIG_SCHED_TIMESHARE_CORE */ |
630 | |
631 | /* |
632 | * Thread wait timer expiration. |
633 | */ |
634 | void |
635 | thread_timer_expire( |
636 | void *p0, |
637 | __unused void *p1) |
638 | { |
639 | thread_t thread = p0; |
640 | spl_t s; |
641 | |
642 | assert_thread_magic(thread); |
643 | |
644 | s = splsched(); |
645 | thread_lock(thread); |
646 | if (--thread->wait_timer_active == 0) { |
647 | if (thread->wait_timer_is_set) { |
648 | thread->wait_timer_is_set = FALSE; |
649 | clear_wait_internal(thread, THREAD_TIMED_OUT); |
650 | } |
651 | } |
652 | thread_unlock(thread); |
653 | splx(s); |
654 | } |
655 | |
656 | /* |
657 | * thread_unblock: |
658 | * |
659 | * Unblock thread on wake up. |
660 | * |
661 | * Returns TRUE if the thread should now be placed on the runqueue. |
662 | * |
663 | * Thread must be locked. |
664 | * |
665 | * Called at splsched(). |
666 | */ |
667 | boolean_t |
668 | thread_unblock( |
669 | thread_t thread, |
670 | wait_result_t wresult) |
671 | { |
672 | boolean_t ready_for_runq = FALSE; |
673 | thread_t cthread = current_thread(); |
674 | uint32_t new_run_count; |
675 | int old_thread_state; |
676 | |
677 | /* |
678 | * Set wait_result. |
679 | */ |
680 | thread->wait_result = wresult; |
681 | |
682 | /* |
683 | * Cancel pending wait timer. |
684 | */ |
685 | if (thread->wait_timer_is_set) { |
686 | if (timer_call_cancel(&thread->wait_timer)) |
687 | thread->wait_timer_active--; |
688 | thread->wait_timer_is_set = FALSE; |
689 | } |
690 | |
691 | /* |
692 | * Update scheduling state: not waiting, |
693 | * set running. |
694 | */ |
695 | old_thread_state = thread->state; |
696 | thread->state = (old_thread_state | TH_RUN) & |
697 | ~(TH_WAIT|TH_UNINT|TH_WAIT_REPORT); |
698 | |
699 | if ((old_thread_state & TH_RUN) == 0) { |
700 | uint64_t ctime = mach_approximate_time(); |
701 | thread->last_made_runnable_time = thread->last_basepri_change_time = ctime; |
702 | timer_start(&thread->runnable_timer, ctime); |
703 | |
704 | ready_for_runq = TRUE; |
705 | |
706 | if (old_thread_state & TH_WAIT_REPORT) { |
707 | (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread); |
708 | } |
709 | |
710 | /* Update the runnable thread count */ |
711 | new_run_count = sched_run_incr(thread); |
712 | } else { |
713 | /* |
714 | * Either the thread is idling in place on another processor, |
715 | * or it hasn't finished context switching yet. |
716 | */ |
717 | #if CONFIG_SCHED_IDLE_IN_PLACE |
718 | if (thread->state & TH_IDLE) { |
719 | processor_t processor = thread->last_processor; |
720 | |
721 | if (processor != current_processor()) |
722 | machine_signal_idle(processor); |
723 | } |
724 | #else |
725 | assert((thread->state & TH_IDLE) == 0); |
726 | #endif |
727 | /* |
728 | * The run count is only dropped after the context switch completes |
729 | * and the thread is still waiting, so we should not run_incr here |
730 | */ |
731 | new_run_count = sched_run_buckets[TH_BUCKET_RUN]; |
732 | } |
733 | |
734 | |
735 | /* |
736 | * Calculate deadline for real-time threads. |
737 | */ |
738 | if (thread->sched_mode == TH_MODE_REALTIME) { |
739 | uint64_t ctime; |
740 | |
741 | ctime = mach_absolute_time(); |
742 | thread->realtime.deadline = thread->realtime.constraint + ctime; |
743 | } |
744 | |
745 | /* |
746 | * Clear old quantum, fail-safe computation, etc. |
747 | */ |
748 | thread->quantum_remaining = 0; |
749 | thread->computation_metered = 0; |
750 | thread->reason = AST_NONE; |
751 | thread->block_hint = kThreadWaitNone; |
752 | |
753 | /* Obtain power-relevant interrupt and "platform-idle exit" statistics. |
754 | * We also account for "double hop" thread signaling via |
755 | * the thread callout infrastructure. |
	 * DRK: consider removing the callout wakeup counters in the future;
	 * they're present for verification at the moment.
758 | */ |
759 | boolean_t aticontext, pidle; |
760 | ml_get_power_state(&aticontext, &pidle); |
761 | |
762 | if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) { |
763 | DTRACE_SCHED2(iwakeup, struct thread *, thread, struct proc *, thread->task->bsd_info); |
764 | |
765 | uint64_t ttd = PROCESSOR_DATA(current_processor(), timer_call_ttd); |
766 | |
767 | if (ttd) { |
768 | if (ttd <= timer_deadline_tracking_bin_1) |
769 | thread->thread_timer_wakeups_bin_1++; |
770 | else |
771 | if (ttd <= timer_deadline_tracking_bin_2) |
772 | thread->thread_timer_wakeups_bin_2++; |
773 | } |
774 | |
775 | ledger_credit_thread(thread, thread->t_ledger, |
776 | task_ledgers.interrupt_wakeups, 1); |
777 | if (pidle) { |
778 | ledger_credit_thread(thread, thread->t_ledger, |
779 | task_ledgers.platform_idle_wakeups, 1); |
780 | } |
781 | |
782 | } else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) { |
783 | /* TODO: what about an interrupt that does a wake taken on a callout thread? */ |
784 | if (cthread->callout_woken_from_icontext) { |
785 | ledger_credit_thread(thread, thread->t_ledger, |
786 | task_ledgers.interrupt_wakeups, 1); |
787 | thread->thread_callout_interrupt_wakeups++; |
788 | |
789 | if (cthread->callout_woken_from_platform_idle) { |
790 | ledger_credit_thread(thread, thread->t_ledger, |
791 | task_ledgers.platform_idle_wakeups, 1); |
792 | thread->thread_callout_platform_idle_wakeups++; |
793 | } |
794 | |
795 | cthread->callout_woke_thread = TRUE; |
796 | } |
797 | } |
798 | |
799 | if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) { |
800 | thread->callout_woken_from_icontext = aticontext; |
801 | thread->callout_woken_from_platform_idle = pidle; |
802 | thread->callout_woke_thread = FALSE; |
803 | } |
804 | |
805 | #if KPERF |
806 | if (ready_for_runq) { |
807 | kperf_make_runnable(thread, aticontext); |
808 | } |
809 | #endif /* KPERF */ |
810 | |
811 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
812 | MACHDBG_CODE(DBG_MACH_SCHED,MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE, |
813 | (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result, |
814 | sched_run_buckets[TH_BUCKET_RUN], 0); |
815 | |
816 | DTRACE_SCHED2(wakeup, struct thread *, thread, struct proc *, thread->task->bsd_info); |
817 | |
818 | return (ready_for_runq); |
819 | } |
820 | |
821 | /* |
822 | * Routine: thread_go |
823 | * Purpose: |
824 | * Unblock and dispatch thread. |
825 | * Conditions: |
826 | * thread lock held, IPC locks may be held. |
827 | * thread must have been pulled from wait queue under same lock hold. |
828 | * thread must have been waiting |
829 | * Returns: |
830 | * KERN_SUCCESS - Thread was set running |
831 | * |
832 | * TODO: This should return void |
833 | */ |
834 | kern_return_t |
835 | thread_go( |
836 | thread_t thread, |
837 | wait_result_t wresult) |
838 | { |
839 | assert_thread_magic(thread); |
840 | |
841 | assert(thread->at_safe_point == FALSE); |
842 | assert(thread->wait_event == NO_EVENT64); |
843 | assert(thread->waitq == NULL); |
844 | |
845 | assert(!(thread->state & (TH_TERMINATE|TH_TERMINATE2))); |
846 | assert(thread->state & TH_WAIT); |
847 | |
848 | |
849 | if (thread_unblock(thread, wresult)) { |
850 | #if SCHED_TRACE_THREAD_WAKEUPS |
851 | backtrace(&thread->thread_wakeup_bt[0], |
852 | (sizeof(thread->thread_wakeup_bt)/sizeof(uintptr_t))); |
853 | #endif |
854 | thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ); |
855 | } |
856 | |
857 | return (KERN_SUCCESS); |
858 | } |
859 | |
860 | /* |
861 | * Routine: thread_mark_wait_locked |
862 | * Purpose: |
863 | * Mark a thread as waiting. If, given the circumstances, |
864 | * it doesn't want to wait (i.e. already aborted), then |
865 | * indicate that in the return value. |
866 | * Conditions: |
867 | * at splsched() and thread is locked. |
868 | */ |
869 | __private_extern__ |
870 | wait_result_t |
871 | thread_mark_wait_locked( |
872 | thread_t thread, |
873 | wait_interrupt_t interruptible_orig) |
874 | { |
875 | boolean_t at_safe_point; |
876 | wait_interrupt_t interruptible = interruptible_orig; |
877 | |
878 | assert(!(thread->state & (TH_WAIT|TH_IDLE|TH_UNINT|TH_TERMINATE2|TH_WAIT_REPORT))); |
879 | |
880 | /* |
881 | * The thread may have certain types of interrupts/aborts masked |
882 | * off. Even if the wait location says these types of interrupts |
883 | * are OK, we have to honor mask settings (outer-scoped code may |
884 | * not be able to handle aborts at the moment). |
885 | */ |
886 | interruptible &= TH_OPT_INTMASK; |
887 | if (interruptible > (thread->options & TH_OPT_INTMASK)) |
888 | interruptible = thread->options & TH_OPT_INTMASK; |
889 | |
890 | at_safe_point = (interruptible == THREAD_ABORTSAFE); |
891 | |
892 | if ( interruptible == THREAD_UNINT || |
893 | !(thread->sched_flags & TH_SFLAG_ABORT) || |
894 | (!at_safe_point && |
895 | (thread->sched_flags & TH_SFLAG_ABORTSAFELY))) { |
896 | |
897 | if ( !(thread->state & TH_TERMINATE)) |
898 | DTRACE_SCHED(sleep); |
899 | |
900 | int state_bits = TH_WAIT; |
901 | if (!interruptible) { |
902 | state_bits |= TH_UNINT; |
903 | } |
904 | if (thread->sched_call) { |
905 | wait_interrupt_t mask = THREAD_WAIT_NOREPORT_USER; |
906 | if (is_kerneltask(thread->task)) { |
907 | mask = THREAD_WAIT_NOREPORT_KERNEL; |
908 | } |
909 | if ((interruptible_orig & mask) == 0) { |
910 | state_bits |= TH_WAIT_REPORT; |
911 | } |
912 | } |
913 | thread->state |= state_bits; |
914 | thread->at_safe_point = at_safe_point; |
915 | |
916 | /* TODO: pass this through assert_wait instead, have |
917 | * assert_wait just take a struct as an argument */ |
918 | assert(!thread->block_hint); |
919 | thread->block_hint = thread->pending_block_hint; |
920 | thread->pending_block_hint = kThreadWaitNone; |
921 | |
922 | return (thread->wait_result = THREAD_WAITING); |
923 | } else { |
924 | if (thread->sched_flags & TH_SFLAG_ABORTSAFELY) |
925 | thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK; |
926 | } |
927 | thread->pending_block_hint = kThreadWaitNone; |
928 | |
929 | return (thread->wait_result = THREAD_INTERRUPTED); |
930 | } |
931 | |
932 | /* |
933 | * Routine: thread_interrupt_level |
934 | * Purpose: |
935 | * Set the maximum interruptible state for the |
936 | * current thread. The effective value of any |
937 | * interruptible flag passed into assert_wait |
938 | * will never exceed this. |
939 | * |
940 | * Useful for code that must not be interrupted, |
941 | * but which calls code that doesn't know that. |
942 | * Returns: |
943 | * The old interrupt level for the thread. |
944 | */ |
945 | __private_extern__ |
946 | wait_interrupt_t |
947 | thread_interrupt_level( |
948 | wait_interrupt_t new_level) |
949 | { |
950 | thread_t thread = current_thread(); |
951 | wait_interrupt_t result = thread->options & TH_OPT_INTMASK; |
952 | |
953 | thread->options = (thread->options & ~TH_OPT_INTMASK) | (new_level & TH_OPT_INTMASK); |
954 | |
955 | return result; |
956 | } |
957 | |
958 | /* |
959 | * assert_wait: |
960 | * |
961 | * Assert that the current thread is about to go to |
962 | * sleep until the specified event occurs. |
963 | */ |
964 | wait_result_t |
965 | assert_wait( |
966 | event_t event, |
967 | wait_interrupt_t interruptible) |
968 | { |
969 | if (__improbable(event == NO_EVENT)) |
970 | panic("%s() called with NO_EVENT" , __func__); |
971 | |
972 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
973 | MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE, |
974 | VM_KERNEL_UNSLIDE_OR_PERM(event), 0, 0, 0, 0); |
975 | |
976 | struct waitq *waitq; |
977 | waitq = global_eventq(event); |
978 | return waitq_assert_wait64(waitq, CAST_EVENT64_T(event), interruptible, TIMEOUT_WAIT_FOREVER); |
979 | } |
980 | |
981 | /* |
982 | * assert_wait_queue: |
983 | * |
984 | * Return the global waitq for the specified event |
985 | */ |
986 | struct waitq * |
987 | assert_wait_queue( |
988 | event_t event) |
989 | { |
990 | return global_eventq(event); |
991 | } |
992 | |
993 | wait_result_t |
994 | assert_wait_timeout( |
995 | event_t event, |
996 | wait_interrupt_t interruptible, |
997 | uint32_t interval, |
998 | uint32_t scale_factor) |
999 | { |
1000 | thread_t thread = current_thread(); |
1001 | wait_result_t wresult; |
1002 | uint64_t deadline; |
1003 | spl_t s; |
1004 | |
1005 | if (__improbable(event == NO_EVENT)) |
1006 | panic("%s() called with NO_EVENT" , __func__); |
1007 | |
1008 | struct waitq *waitq; |
1009 | waitq = global_eventq(event); |
1010 | |
1011 | s = splsched(); |
1012 | waitq_lock(waitq); |
1013 | |
1014 | clock_interval_to_deadline(interval, scale_factor, &deadline); |
1015 | |
1016 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
1017 | MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE, |
1018 | VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0); |
1019 | |
1020 | wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event), |
1021 | interruptible, |
1022 | TIMEOUT_URGENCY_SYS_NORMAL, |
1023 | deadline, TIMEOUT_NO_LEEWAY, |
1024 | thread); |
1025 | |
1026 | waitq_unlock(waitq); |
1027 | splx(s); |
1028 | return wresult; |
1029 | } |
1030 | |
1031 | wait_result_t |
1032 | assert_wait_timeout_with_leeway( |
1033 | event_t event, |
1034 | wait_interrupt_t interruptible, |
1035 | wait_timeout_urgency_t urgency, |
1036 | uint32_t interval, |
1037 | uint32_t leeway, |
1038 | uint32_t scale_factor) |
1039 | { |
1040 | thread_t thread = current_thread(); |
1041 | wait_result_t wresult; |
1042 | uint64_t deadline; |
1043 | uint64_t abstime; |
1044 | uint64_t slop; |
1045 | uint64_t now; |
1046 | spl_t s; |
1047 | |
1048 | if (__improbable(event == NO_EVENT)) |
1049 | panic("%s() called with NO_EVENT" , __func__); |
1050 | |
1051 | now = mach_absolute_time(); |
1052 | clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime); |
1053 | deadline = now + abstime; |
1054 | |
1055 | clock_interval_to_absolutetime_interval(leeway, scale_factor, &slop); |
1056 | |
1057 | struct waitq *waitq; |
1058 | waitq = global_eventq(event); |
1059 | |
1060 | s = splsched(); |
1061 | waitq_lock(waitq); |
1062 | |
1063 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
1064 | MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE, |
1065 | VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0); |
1066 | |
1067 | wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event), |
1068 | interruptible, |
1069 | urgency, deadline, slop, |
1070 | thread); |
1071 | |
1072 | waitq_unlock(waitq); |
1073 | splx(s); |
1074 | return wresult; |
1075 | } |
1076 | |
1077 | wait_result_t |
1078 | assert_wait_deadline( |
1079 | event_t event, |
1080 | wait_interrupt_t interruptible, |
1081 | uint64_t deadline) |
1082 | { |
1083 | thread_t thread = current_thread(); |
1084 | wait_result_t wresult; |
1085 | spl_t s; |
1086 | |
1087 | if (__improbable(event == NO_EVENT)) |
1088 | panic("%s() called with NO_EVENT" , __func__); |
1089 | |
1090 | struct waitq *waitq; |
1091 | waitq = global_eventq(event); |
1092 | |
1093 | s = splsched(); |
1094 | waitq_lock(waitq); |
1095 | |
1096 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
1097 | MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE, |
1098 | VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0); |
1099 | |
1100 | wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event), |
1101 | interruptible, |
1102 | TIMEOUT_URGENCY_SYS_NORMAL, deadline, |
1103 | TIMEOUT_NO_LEEWAY, thread); |
1104 | waitq_unlock(waitq); |
1105 | splx(s); |
1106 | return wresult; |
1107 | } |
1108 | |
1109 | wait_result_t |
1110 | assert_wait_deadline_with_leeway( |
1111 | event_t event, |
1112 | wait_interrupt_t interruptible, |
1113 | wait_timeout_urgency_t urgency, |
1114 | uint64_t deadline, |
1115 | uint64_t leeway) |
1116 | { |
1117 | thread_t thread = current_thread(); |
1118 | wait_result_t wresult; |
1119 | spl_t s; |
1120 | |
1121 | if (__improbable(event == NO_EVENT)) |
1122 | panic("%s() called with NO_EVENT" , __func__); |
1123 | |
1124 | struct waitq *waitq; |
1125 | waitq = global_eventq(event); |
1126 | |
1127 | s = splsched(); |
1128 | waitq_lock(waitq); |
1129 | |
1130 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
1131 | MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE, |
1132 | VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0); |
1133 | |
1134 | wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event), |
1135 | interruptible, |
1136 | urgency, deadline, leeway, |
1137 | thread); |
1138 | waitq_unlock(waitq); |
1139 | splx(s); |
1140 | return wresult; |
1141 | } |
1142 | |
1143 | /* |
1144 | * thread_isoncpu: |
1145 | * |
1146 | * Return TRUE if a thread is running on a processor such that an AST |
 * is needed to pull it out of userspace execution, or, if executing in
 * the kernel, to bring it to a context switch boundary that would cause
 * thread state to be serialized in the thread PCB.
1150 | * |
1151 | * Thread locked, returns the same way. While locked, fields |
1152 | * like "state" cannot change. "runq" can change only from set to unset. |
1153 | */ |
1154 | static inline boolean_t |
1155 | thread_isoncpu(thread_t thread) |
1156 | { |
1157 | /* Not running or runnable */ |
1158 | if (!(thread->state & TH_RUN)) |
1159 | return (FALSE); |
1160 | |
1161 | /* Waiting on a runqueue, not currently running */ |
1162 | /* TODO: This is invalid - it can get dequeued without thread lock, but not context switched. */ |
1163 | if (thread->runq != PROCESSOR_NULL) |
1164 | return (FALSE); |
1165 | |
1166 | /* |
1167 | * Thread does not have a stack yet |
1168 | * It could be on the stack alloc queue or preparing to be invoked |
1169 | */ |
1170 | if (!thread->kernel_stack) |
1171 | return (FALSE); |
1172 | |
1173 | /* |
1174 | * Thread must be running on a processor, or |
1175 | * about to run, or just did run. In all these |
1176 | * cases, an AST to the processor is needed |
1177 | * to guarantee that the thread is kicked out |
1178 | * of userspace and the processor has |
1179 | * context switched (and saved register state). |
1180 | */ |
1181 | return (TRUE); |
1182 | } |
1183 | |
1184 | /* |
1185 | * thread_stop: |
1186 | * |
1187 | * Force a preemption point for a thread and wait |
1188 | * for it to stop running on a CPU. If a stronger |
1189 | * guarantee is requested, wait until no longer |
1190 | * runnable. Arbitrates access among |
1191 | * multiple stop requests. (released by unstop) |
1192 | * |
1193 | * The thread must enter a wait state and stop via a |
1194 | * separate means. |
1195 | * |
1196 | * Returns FALSE if interrupted. |
1197 | */ |
1198 | boolean_t |
1199 | thread_stop( |
1200 | thread_t thread, |
1201 | boolean_t until_not_runnable) |
1202 | { |
1203 | wait_result_t wresult; |
1204 | spl_t s = splsched(); |
1205 | boolean_t oncpu; |
1206 | |
1207 | wake_lock(thread); |
1208 | thread_lock(thread); |
1209 | |
1210 | while (thread->state & TH_SUSP) { |
1211 | thread->wake_active = TRUE; |
1212 | thread_unlock(thread); |
1213 | |
1214 | wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE); |
1215 | wake_unlock(thread); |
1216 | splx(s); |
1217 | |
1218 | if (wresult == THREAD_WAITING) |
1219 | wresult = thread_block(THREAD_CONTINUE_NULL); |
1220 | |
1221 | if (wresult != THREAD_AWAKENED) |
1222 | return (FALSE); |
1223 | |
1224 | s = splsched(); |
1225 | wake_lock(thread); |
1226 | thread_lock(thread); |
1227 | } |
1228 | |
1229 | thread->state |= TH_SUSP; |
1230 | |
1231 | while ((oncpu = thread_isoncpu(thread)) || |
1232 | (until_not_runnable && (thread->state & TH_RUN))) { |
1233 | processor_t processor; |
1234 | |
1235 | if (oncpu) { |
1236 | assert(thread->state & TH_RUN); |
1237 | processor = thread->chosen_processor; |
1238 | cause_ast_check(processor); |
1239 | } |
1240 | |
1241 | thread->wake_active = TRUE; |
1242 | thread_unlock(thread); |
1243 | |
1244 | wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE); |
1245 | wake_unlock(thread); |
1246 | splx(s); |
1247 | |
1248 | if (wresult == THREAD_WAITING) |
1249 | wresult = thread_block(THREAD_CONTINUE_NULL); |
1250 | |
1251 | if (wresult != THREAD_AWAKENED) { |
1252 | thread_unstop(thread); |
1253 | return (FALSE); |
1254 | } |
1255 | |
1256 | s = splsched(); |
1257 | wake_lock(thread); |
1258 | thread_lock(thread); |
1259 | } |
1260 | |
1261 | thread_unlock(thread); |
1262 | wake_unlock(thread); |
1263 | splx(s); |
1264 | |
1265 | /* |
1266 | * We return with the thread unlocked. To prevent it from |
1267 | * transitioning to a runnable state (or from TH_RUN to |
1268 | * being on the CPU), the caller must ensure the thread |
1269 | * is stopped via an external means (such as an AST) |
1270 | */ |
1271 | |
1272 | return (TRUE); |
1273 | } |
1274 | |
1275 | /* |
1276 | * thread_unstop: |
1277 | * |
1278 | * Release a previous stop request and set |
1279 | * the thread running if appropriate. |
1280 | * |
1281 | * Use only after a successful stop operation. |
1282 | */ |
1283 | void |
1284 | thread_unstop( |
1285 | thread_t thread) |
1286 | { |
1287 | spl_t s = splsched(); |
1288 | |
1289 | wake_lock(thread); |
1290 | thread_lock(thread); |
1291 | |
1292 | assert((thread->state & (TH_RUN|TH_WAIT|TH_SUSP)) != TH_SUSP); |
1293 | |
1294 | if (thread->state & TH_SUSP) { |
1295 | thread->state &= ~TH_SUSP; |
1296 | |
1297 | if (thread->wake_active) { |
1298 | thread->wake_active = FALSE; |
1299 | thread_unlock(thread); |
1300 | |
1301 | thread_wakeup(&thread->wake_active); |
1302 | wake_unlock(thread); |
1303 | splx(s); |
1304 | |
1305 | return; |
1306 | } |
1307 | } |
1308 | |
1309 | thread_unlock(thread); |
1310 | wake_unlock(thread); |
1311 | splx(s); |
1312 | } |
1313 | |
1314 | /* |
1315 | * thread_wait: |
1316 | * |
1317 | * Wait for a thread to stop running. (non-interruptible) |
1318 | * |
1319 | */ |
1320 | void |
1321 | thread_wait( |
1322 | thread_t thread, |
1323 | boolean_t until_not_runnable) |
1324 | { |
1325 | wait_result_t wresult; |
1326 | boolean_t oncpu; |
1327 | processor_t processor; |
1328 | spl_t s = splsched(); |
1329 | |
1330 | wake_lock(thread); |
1331 | thread_lock(thread); |
1332 | |
1333 | /* |
1334 | * Wait until not running on a CPU. If stronger requirement |
1335 | * desired, wait until not runnable. Assumption: if thread is |
1336 | * on CPU, then TH_RUN is set, so we're not waiting in any case |
1337 | * where the original, pure "TH_RUN" check would have let us |
1338 | * finish. |
1339 | */ |
1340 | while ((oncpu = thread_isoncpu(thread)) || |
1341 | (until_not_runnable && (thread->state & TH_RUN))) { |
1342 | |
1343 | if (oncpu) { |
1344 | assert(thread->state & TH_RUN); |
1345 | processor = thread->chosen_processor; |
1346 | cause_ast_check(processor); |
1347 | } |
1348 | |
1349 | thread->wake_active = TRUE; |
1350 | thread_unlock(thread); |
1351 | |
1352 | wresult = assert_wait(&thread->wake_active, THREAD_UNINT); |
1353 | wake_unlock(thread); |
1354 | splx(s); |
1355 | |
1356 | if (wresult == THREAD_WAITING) |
1357 | thread_block(THREAD_CONTINUE_NULL); |
1358 | |
1359 | s = splsched(); |
1360 | wake_lock(thread); |
1361 | thread_lock(thread); |
1362 | } |
1363 | |
1364 | thread_unlock(thread); |
1365 | wake_unlock(thread); |
1366 | splx(s); |
1367 | } |
1368 | |
1369 | /* |
1370 | * Routine: clear_wait_internal |
1371 | * |
1372 | * Clear the wait condition for the specified thread. |
1373 | * Start the thread executing if that is appropriate. |
1374 | * Arguments: |
1375 | * thread thread to awaken |
1376 | * result Wakeup result the thread should see |
1377 | * Conditions: |
1378 | * At splsched |
1379 | * the thread is locked. |
1380 | * Returns: |
1381 | * KERN_SUCCESS thread was rousted out a wait |
1382 | * KERN_FAILURE thread was waiting but could not be rousted |
1383 | * KERN_NOT_WAITING thread was not waiting |
1384 | */ |
1385 | __private_extern__ kern_return_t |
1386 | clear_wait_internal( |
1387 | thread_t thread, |
1388 | wait_result_t wresult) |
1389 | { |
1390 | uint32_t i = LockTimeOutUsec; |
1391 | struct waitq *waitq = thread->waitq; |
1392 | |
1393 | do { |
1394 | if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT)) |
1395 | return (KERN_FAILURE); |
1396 | |
1397 | if (waitq != NULL) { |
1398 | if (!waitq_pull_thread_locked(waitq, thread)) { |
1399 | thread_unlock(thread); |
1400 | delay(1); |
1401 | if (i > 0 && !machine_timeout_suspended()) |
1402 | i--; |
1403 | thread_lock(thread); |
1404 | if (waitq != thread->waitq) |
1405 | return KERN_NOT_WAITING; |
1406 | continue; |
1407 | } |
1408 | } |
1409 | |
1410 | /* TODO: Can we instead assert TH_TERMINATE is not set? */ |
1411 | if ((thread->state & (TH_WAIT|TH_TERMINATE)) == TH_WAIT) |
1412 | return (thread_go(thread, wresult)); |
1413 | else |
1414 | return (KERN_NOT_WAITING); |
1415 | } while (i > 0); |
1416 | |
1417 | panic("clear_wait_internal: deadlock: thread=%p, wq=%p, cpu=%d\n" , |
1418 | thread, waitq, cpu_number()); |
1419 | |
1420 | return (KERN_FAILURE); |
1421 | } |
1422 | |
1423 | |
1424 | /* |
1425 | * clear_wait: |
1426 | * |
1427 | * Clear the wait condition for the specified thread. Start the thread |
1428 | * executing if that is appropriate. |
1429 | * |
1430 | * parameters: |
1431 | * thread thread to awaken |
1432 | * result Wakeup result the thread should see |
1433 | */ |
1434 | kern_return_t |
1435 | clear_wait( |
1436 | thread_t thread, |
1437 | wait_result_t result) |
1438 | { |
1439 | kern_return_t ret; |
1440 | spl_t s; |
1441 | |
1442 | s = splsched(); |
1443 | thread_lock(thread); |
1444 | ret = clear_wait_internal(thread, result); |
1445 | thread_unlock(thread); |
1446 | splx(s); |
1447 | return ret; |
1448 | } |
1449 | |
1450 | |
1451 | /* |
1452 | * thread_wakeup_prim: |
1453 | * |
1454 | * Common routine for thread_wakeup, thread_wakeup_with_result, |
1455 | * and thread_wakeup_one. |
1456 | * |
1457 | */ |
1458 | kern_return_t |
1459 | thread_wakeup_prim( |
1460 | event_t event, |
1461 | boolean_t one_thread, |
1462 | wait_result_t result) |
1463 | { |
1464 | if (__improbable(event == NO_EVENT)) |
1465 | panic("%s() called with NO_EVENT" , __func__); |
1466 | |
1467 | struct waitq *wq = global_eventq(event); |
1468 | |
1469 | if (one_thread) |
1470 | return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES); |
1471 | else |
1472 | return waitq_wakeup64_all(wq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES); |
1473 | } |
1474 | |
1475 | /* |
1476 | * Wakeup a specified thread if and only if it's waiting for this event |
1477 | */ |
1478 | kern_return_t |
1479 | thread_wakeup_thread( |
1480 | event_t event, |
1481 | thread_t thread) |
1482 | { |
1483 | if (__improbable(event == NO_EVENT)) |
1484 | panic("%s() called with NO_EVENT" , __func__); |
1485 | |
1486 | if (__improbable(thread == THREAD_NULL)) |
1487 | panic("%s() called with THREAD_NULL" , __func__); |
1488 | |
1489 | struct waitq *wq = global_eventq(event); |
1490 | |
1491 | return waitq_wakeup64_thread(wq, CAST_EVENT64_T(event), thread, THREAD_AWAKENED); |
1492 | } |
1493 | |
1494 | /* |
1495 | * Wakeup a thread waiting on an event and promote it to a priority. |
1496 | * |
1497 | * Requires woken thread to un-promote itself when done. |
1498 | */ |
1499 | kern_return_t |
1500 | thread_wakeup_one_with_pri( |
1501 | event_t event, |
1502 | int priority) |
1503 | { |
1504 | if (__improbable(event == NO_EVENT)) |
1505 | panic("%s() called with NO_EVENT" , __func__); |
1506 | |
1507 | struct waitq *wq = global_eventq(event); |
1508 | |
1509 | return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority); |
1510 | } |
1511 | |
1512 | /* |
1513 | * Wakeup a thread waiting on an event, |
1514 | * promote it to a priority, |
1515 | * and return a reference to the woken thread. |
1516 | * |
1517 | * Requires woken thread to un-promote itself when done. |
1518 | */ |
1519 | thread_t |
1520 | thread_wakeup_identify(event_t event, |
1521 | int priority) |
1522 | { |
1523 | if (__improbable(event == NO_EVENT)) |
1524 | panic("%s() called with NO_EVENT" , __func__); |
1525 | |
1526 | struct waitq *wq = global_eventq(event); |
1527 | |
1528 | return waitq_wakeup64_identify(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority); |
1529 | } |
1530 | |
1531 | /* |
1532 | * thread_bind: |
1533 | * |
1534 | * Force the current thread to execute on the specified processor. |
1535 | * Takes effect after the next thread_block(). |
1536 | * |
1537 | * Returns the previous binding. PROCESSOR_NULL means |
1538 | * not bound. |
1539 | * |
1540 | * XXX - DO NOT export this to users - XXX |
1541 | */ |
1542 | processor_t |
1543 | thread_bind( |
1544 | processor_t processor) |
1545 | { |
1546 | thread_t self = current_thread(); |
1547 | processor_t prev; |
1548 | spl_t s; |
1549 | |
1550 | s = splsched(); |
1551 | thread_lock(self); |
1552 | |
1553 | prev = thread_bind_internal(self, processor); |
1554 | |
1555 | thread_unlock(self); |
1556 | splx(s); |
1557 | |
1558 | return (prev); |
1559 | } |
1560 | |
1561 | /* |
1562 | * thread_bind_internal: |
1563 | * |
1564 | * If the specified thread is not the current thread, and it is currently |
1565 | * running on another CPU, a remote AST must be sent to that CPU to cause |
1566 | * the thread to migrate to its bound processor. Otherwise, the migration |
1567 | * will occur at the next quantum expiration or blocking point. |
1568 | * |
 * When the thread is the current thread, an explicit thread_block() should
1570 | * be used to force the current processor to context switch away and |
1571 | * let the thread migrate to the bound processor. |
1572 | * |
1573 | * Thread must be locked, and at splsched. |
1574 | */ |
1575 | |
1576 | static processor_t |
1577 | thread_bind_internal( |
1578 | thread_t thread, |
1579 | processor_t processor) |
1580 | { |
1581 | processor_t prev; |
1582 | |
1583 | /* <rdar://problem/15102234> */ |
1584 | assert(thread->sched_pri < BASEPRI_RTQUEUES); |
1585 | /* A thread can't be bound if it's sitting on a (potentially incorrect) runqueue */ |
1586 | assert(thread->runq == PROCESSOR_NULL); |
1587 | |
1588 | KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_THREAD_BIND), thread_tid(thread), processor ? (uintptr_t)processor->cpu_id : (uintptr_t)-1, 0, 0, 0); |
1589 | |
1590 | prev = thread->bound_processor; |
1591 | thread->bound_processor = processor; |
1592 | |
1593 | return (prev); |
1594 | } |
1595 | |
1596 | /* |
1597 | * thread_vm_bind_group_add: |
1598 | * |
1599 | * The "VM bind group" is a special mechanism to mark a collection |
1600 | * of threads from the VM subsystem that, in general, should be scheduled |
1601 | * with only one CPU of parallelism. To accomplish this, we initially |
1602 | * bind all the threads to the master processor, which has the effect |
1603 | * that only one of the threads in the group can execute at once, including |
 * preempting threads in the group that are at a lower priority. Future
 * implementations may use more dynamic mechanisms to prevent the collection
 * of VM threads from using more CPU time than desired.
1607 | * |
1608 | * The current implementation can result in priority inversions where |
1609 | * compute-bound priority 95 or realtime threads that happen to have |
1610 | * landed on the master processor prevent the VM threads from running. |
1611 | * When this situation is detected, we unbind the threads for one |
 * scheduler tick to allow the scheduler to run the threads on
1613 | * additional CPUs, before restoring the binding (assuming high latency |
1614 | * is no longer a problem). |
1615 | */ |
1616 | |
1617 | /* |
1618 | * The current max is provisioned for: |
1619 | * vm_compressor_swap_trigger_thread (92) |
1620 | * 2 x vm_pageout_iothread_internal (92) when vm_restricted_to_single_processor==TRUE |
1621 | * vm_pageout_continue (92) |
1622 | * memorystatus_thread (95) |
1623 | */ |
1624 | #define MAX_VM_BIND_GROUP_COUNT (5) |
1625 | decl_simple_lock_data(static,sched_vm_group_list_lock); |
1626 | static thread_t sched_vm_group_thread_list[MAX_VM_BIND_GROUP_COUNT]; |
1627 | static int sched_vm_group_thread_count; |
1628 | static boolean_t sched_vm_group_temporarily_unbound = FALSE; |
1629 | |
1630 | void |
1631 | thread_vm_bind_group_add(void) |
1632 | { |
1633 | thread_t self = current_thread(); |
1634 | |
1635 | thread_reference_internal(self); |
1636 | self->options |= TH_OPT_SCHED_VM_GROUP; |
1637 | |
1638 | simple_lock(&sched_vm_group_list_lock); |
1639 | assert(sched_vm_group_thread_count < MAX_VM_BIND_GROUP_COUNT); |
1640 | sched_vm_group_thread_list[sched_vm_group_thread_count++] = self; |
1641 | simple_unlock(&sched_vm_group_list_lock); |
1642 | |
1643 | thread_bind(master_processor); |
1644 | |
1645 | /* Switch to bound processor if not already there */ |
1646 | thread_block(THREAD_CONTINUE_NULL); |
1647 | } |
1648 | |
1649 | static void |
1650 | sched_vm_group_maintenance(void) |
1651 | { |
1652 | uint64_t ctime = mach_absolute_time(); |
1653 | uint64_t longtime = ctime - sched_tick_interval; |
1654 | int i; |
1655 | spl_t s; |
1656 | boolean_t high_latency_observed = FALSE; |
1657 | boolean_t runnable_and_not_on_runq_observed = FALSE; |
1658 | boolean_t bind_target_changed = FALSE; |
1659 | processor_t bind_target = PROCESSOR_NULL; |
1660 | |
1661 | /* Make sure nobody attempts to add new threads while we are enumerating them */ |
1662 | simple_lock(&sched_vm_group_list_lock); |
1663 | |
1664 | s = splsched(); |
1665 | |
1666 | for (i=0; i < sched_vm_group_thread_count; i++) { |
1667 | thread_t thread = sched_vm_group_thread_list[i]; |
1668 | assert(thread != THREAD_NULL); |
1669 | thread_lock(thread); |
1670 | if ((thread->state & (TH_RUN|TH_WAIT)) == TH_RUN) { |
1671 | if (thread->runq != PROCESSOR_NULL && thread->last_made_runnable_time < longtime) { |
1672 | high_latency_observed = TRUE; |
1673 | } else if (thread->runq == PROCESSOR_NULL) { |
				/* There are some cases where a thread that is transitioning also falls into this case */
1675 | runnable_and_not_on_runq_observed = TRUE; |
1676 | } |
1677 | } |
1678 | thread_unlock(thread); |
1679 | |
1680 | if (high_latency_observed && runnable_and_not_on_runq_observed) { |
1681 | /* All the things we are looking for are true, stop looking */ |
1682 | break; |
1683 | } |
1684 | } |
1685 | |
1686 | splx(s); |
1687 | |
1688 | if (sched_vm_group_temporarily_unbound) { |
1689 | /* If we turned off binding, make sure everything is OK before rebinding */ |
1690 | if (!high_latency_observed) { |
1691 | /* rebind */ |
1692 | bind_target_changed = TRUE; |
1693 | bind_target = master_processor; |
1694 | sched_vm_group_temporarily_unbound = FALSE; /* might be reset to TRUE if change cannot be completed */ |
1695 | } |
1696 | } else { |
1697 | /* |
1698 | * Check if we're in a bad state, which is defined by high |
1699 | * latency with no core currently executing a thread. If a |
1700 | * single thread is making progress on a CPU, that means the |
1701 | * binding concept to reduce parallelism is working as |
1702 | * designed. |
1703 | */ |
1704 | if (high_latency_observed && !runnable_and_not_on_runq_observed) { |
1705 | /* unbind */ |
1706 | bind_target_changed = TRUE; |
1707 | bind_target = PROCESSOR_NULL; |
1708 | sched_vm_group_temporarily_unbound = TRUE; |
1709 | } |
1710 | } |
1711 | |
1712 | if (bind_target_changed) { |
1713 | s = splsched(); |
1714 | for (i=0; i < sched_vm_group_thread_count; i++) { |
1715 | thread_t thread = sched_vm_group_thread_list[i]; |
1716 | boolean_t removed; |
1717 | assert(thread != THREAD_NULL); |
1718 | |
1719 | thread_lock(thread); |
1720 | removed = thread_run_queue_remove(thread); |
1721 | if (removed || ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT)) { |
1722 | thread_bind_internal(thread, bind_target); |
1723 | } else { |
1724 | /* |
1725 | * Thread was in the middle of being context-switched-to, |
1726 | * or was in the process of blocking. To avoid switching the bind |
1727 | * state out mid-flight, defer the change if possible. |
1728 | */ |
1729 | if (bind_target == PROCESSOR_NULL) { |
1730 | thread_bind_internal(thread, bind_target); |
1731 | } else { |
1732 | sched_vm_group_temporarily_unbound = TRUE; /* next pass will try again */ |
1733 | } |
1734 | } |
1735 | |
1736 | if (removed) { |
1737 | thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ); |
1738 | } |
1739 | thread_unlock(thread); |
1740 | } |
1741 | splx(s); |
1742 | } |
1743 | |
1744 | simple_unlock(&sched_vm_group_list_lock); |
1745 | } |
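
/*
 * Compact summary of the decision made above (informational only):
 *
 *	currently unbound?	high latency?	member running on a CPU?	action
 *	no			yes		no				temporarily unbind the group
 *	no			otherwise	-				leave bindings alone
 *	yes			no		-				rebind to master_processor
 *	yes			yes		-				stay unbound for another tick
 *
 * "Member running on a CPU" corresponds to runnable_and_not_on_runq_observed:
 * if some group member is runnable but not sitting on a run queue, it is most
 * likely executing, so the reduced-parallelism binding is doing its job and
 * is left in place.
 */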
1746 | |
/* Invoked prior to idle entry to determine if, on SMT capable processors, an SMT
 * rebalancing opportunity exists when a core is (instantaneously) idle, but
 * other SMT-capable cores may be over-committed. TODO: some possible negatives:
 *  - IPI thrash if this core does not remain idle following the load balancing ASTs
 *  - Idle "thrash", when IPI issue is followed by idle entry/core power down
 *    followed by a wakeup shortly thereafter.
 */
1754 | |
1755 | #if (DEVELOPMENT || DEBUG) |
1756 | int sched_smt_balance = 1; |
1757 | #endif |
1758 | |
1759 | #if __SMP__ |
1760 | /* Invoked with pset locked, returns with pset unlocked */ |
1761 | void |
sched_SMT_balance(processor_t cprocessor, processor_set_t cpset)
{
1763 | processor_t ast_processor = NULL; |
1764 | |
1765 | #if (DEVELOPMENT || DEBUG) |
1766 | if (__improbable(sched_smt_balance == 0)) |
1767 | goto smt_balance_exit; |
1768 | #endif |
1769 | |
1770 | assert(cprocessor == current_processor()); |
1771 | if (cprocessor->is_SMT == FALSE) |
1772 | goto smt_balance_exit; |
1773 | |
1774 | processor_t sib_processor = cprocessor->processor_secondary ? cprocessor->processor_secondary : cprocessor->processor_primary; |
1775 | |
1776 | /* Determine if both this processor and its sibling are idle, |
1777 | * indicating an SMT rebalancing opportunity. |
1778 | */ |
1779 | if (sib_processor->state != PROCESSOR_IDLE) |
1780 | goto smt_balance_exit; |
1781 | |
1782 | processor_t sprocessor; |
1783 | |
1784 | sched_ipi_type_t ipi_type = SCHED_IPI_NONE; |
1785 | uint64_t running_secondary_map = (cpset->cpu_state_map[PROCESSOR_RUNNING] & |
1786 | ~cpset->primary_map); |
1787 | for (int cpuid = lsb_first(running_secondary_map); cpuid >= 0; cpuid = lsb_next(running_secondary_map, cpuid)) { |
1788 | sprocessor = processor_array[cpuid]; |
1789 | if ((sprocessor->processor_primary->state == PROCESSOR_RUNNING) && |
1790 | (sprocessor->current_pri < BASEPRI_RTQUEUES)) { |
1791 | |
1792 | ipi_type = sched_ipi_action(sprocessor, NULL, false, SCHED_IPI_EVENT_SMT_REBAL); |
1793 | if (ipi_type != SCHED_IPI_NONE) { |
1794 | assert(sprocessor != cprocessor); |
1795 | ast_processor = sprocessor; |
1796 | break; |
1797 | } |
1798 | } |
1799 | } |
1800 | |
1801 | smt_balance_exit: |
1802 | pset_unlock(cpset); |
1803 | |
1804 | if (ast_processor) { |
1805 | KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_SMT_BALANCE), ast_processor->cpu_id, ast_processor->state, ast_processor->processor_primary->state, 0, 0); |
1806 | sched_ipi_perform(ast_processor, ipi_type); |
1807 | } |
1808 | } |
1809 | #else |
1810 | /* Invoked with pset locked, returns with pset unlocked */ |
1811 | void |
1812 | sched_SMT_balance(__unused processor_t cprocessor, processor_set_t cpset) |
1813 | { |
1814 | pset_unlock(cpset); |
1815 | } |
1816 | #endif /* __SMP__ */ |
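
/*
 * Illustrative sketch (not part of the build): the locking contract for the
 * balance callout as used on the idle path in thread_select() below.  The
 * caller enters with the pset lock held and must not touch it afterwards,
 * because the callee (e.g. sched_SMT_balance() on SMP configurations) drops
 * it on every path, including the !__SMP__ stub above.
 *
 *	s = splsched();
 *	pset_lock(pset);
 *	// ... conclude that this processor is about to go idle ...
 *	SCHED(processor_balance)(processor, pset);	// returns with pset unlocked
 *	splx(s);
 */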
1817 | |
1818 | static processor_t choose_processor_for_realtime_thread(processor_set_t pset); |
1819 | static bool all_available_primaries_are_running_realtime_threads(processor_set_t pset); |
1820 | int sched_allow_rt_smt = 1; |
1821 | |
1822 | /* |
1823 | * thread_select: |
1824 | * |
1825 | * Select a new thread for the current processor to execute. |
1826 | * |
1827 | * May select the current thread, which must be locked. |
1828 | */ |
1829 | static thread_t |
1830 | thread_select(thread_t thread, |
1831 | processor_t processor, |
1832 | ast_t *reason) |
1833 | { |
1834 | processor_set_t pset = processor->processor_set; |
1835 | thread_t new_thread = THREAD_NULL; |
1836 | |
1837 | assert(processor == current_processor()); |
1838 | assert((thread->state & (TH_RUN|TH_TERMINATE2)) == TH_RUN); |
1839 | |
1840 | do { |
1841 | /* |
1842 | * Update the priority. |
1843 | */ |
1844 | if (SCHED(can_update_priority)(thread)) |
1845 | SCHED(update_priority)(thread); |
1846 | |
1847 | processor_state_update_from_thread(processor, thread); |
1848 | |
1849 | pset_lock(pset); |
1850 | |
1851 | assert(processor->state != PROCESSOR_OFF_LINE); |
1852 | |
1853 | if (!processor->is_recommended) { |
1854 | /* |
1855 | * The performance controller has provided a hint to not dispatch more threads, |
		 * unless they are bound to us (and thus we are the only option).
1857 | */ |
1858 | if (!SCHED(processor_bound_count)(processor)) { |
1859 | goto idle; |
1860 | } |
1861 | } else if (processor->processor_primary != processor) { |
1862 | /* |
1863 | * Should this secondary SMT processor attempt to find work? For pset runqueue systems, |
1864 | * we should look for work only under the same conditions that choose_processor() |
1865 | * would have assigned work, which is when all primary processors have been assigned work. |
1866 | * |
1867 | * An exception is that bound threads are dispatched to a processor without going through |
1868 | * choose_processor(), so in those cases we should continue trying to dequeue work. |
1869 | */ |
1870 | if (!SCHED(processor_bound_count)(processor)) { |
1871 | if ((pset->recommended_bitmask & pset->primary_map & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) { |
1872 | goto idle; |
1873 | } |
1874 | |
1875 | /* There are no idle primaries */ |
1876 | |
1877 | if (processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) { |
1878 | bool secondary_can_run_realtime_thread = sched_allow_rt_smt && rt_runq_count(pset) && all_available_primaries_are_running_realtime_threads(pset); |
1879 | if (!secondary_can_run_realtime_thread) { |
1880 | goto idle; |
1881 | } |
1882 | } |
1883 | } |
1884 | } |
1885 | |
1886 | /* |
1887 | * Test to see if the current thread should continue |
1888 | * to run on this processor. Must not be attempting to wait, and not |
1889 | * bound to a different processor, nor be in the wrong |
1890 | * processor set, nor be forced to context switch by TH_SUSP. |
1891 | * |
1892 | * Note that there are never any RT threads in the regular runqueue. |
1893 | * |
		 * This code is insanely tricky.
1895 | */ |
1896 | |
1897 | /* i.e. not waiting, not TH_SUSP'ed */ |
1898 | boolean_t still_running = ((thread->state & (TH_TERMINATE|TH_IDLE|TH_WAIT|TH_RUN|TH_SUSP)) == TH_RUN); |
1899 | |
1900 | /* |
1901 | * Threads running on SMT processors are forced to context switch. Don't rebalance realtime threads. |
1902 | * TODO: This should check if it's worth it to rebalance, i.e. 'are there any idle primary processors' |
1903 | */ |
1904 | boolean_t needs_smt_rebalance = (thread->sched_pri < BASEPRI_RTQUEUES && processor->processor_primary != processor); |
1905 | |
1906 | boolean_t affinity_mismatch = (thread->affinity_set != AFFINITY_SET_NULL && thread->affinity_set->aset_pset != pset); |
1907 | |
1908 | boolean_t bound_elsewhere = (thread->bound_processor != PROCESSOR_NULL && thread->bound_processor != processor); |
1909 | |
1910 | boolean_t avoid_processor = (SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread)); |
1911 | |
1912 | if (still_running && !needs_smt_rebalance && !affinity_mismatch && !bound_elsewhere && !avoid_processor) { |
1913 | /* |
1914 | * This thread is eligible to keep running on this processor. |
1915 | * |
1916 | * RT threads with un-expired quantum stay on processor, |
1917 | * unless there's a valid RT thread with an earlier deadline. |
1918 | */ |
1919 | if (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice) { |
1920 | if (rt_runq_count(pset) > 0) { |
1921 | |
1922 | rt_lock_lock(pset); |
1923 | |
1924 | if (rt_runq_count(pset) > 0) { |
1925 | |
1926 | thread_t next_rt = qe_queue_first(&SCHED(rt_runq)(pset)->queue, struct thread, runq_links); |
1927 | |
1928 | if (next_rt->realtime.deadline < processor->deadline && |
1929 | (next_rt->bound_processor == PROCESSOR_NULL || |
1930 | next_rt->bound_processor == processor)) { |
1931 | /* The next RT thread is better, so pick it off the runqueue. */ |
1932 | goto pick_new_rt_thread; |
1933 | } |
1934 | } |
1935 | |
1936 | rt_lock_unlock(pset); |
1937 | } |
1938 | |
1939 | /* This is still the best RT thread to run. */ |
1940 | processor->deadline = thread->realtime.deadline; |
1941 | |
1942 | sched_update_pset_load_average(pset); |
1943 | |
1944 | processor_t next_rt_processor = PROCESSOR_NULL; |
1945 | sched_ipi_type_t next_rt_ipi_type = SCHED_IPI_NONE; |
1946 | |
1947 | if (rt_runq_count(pset) > 0) { |
1948 | next_rt_processor = choose_processor_for_realtime_thread(pset); |
1949 | if (next_rt_processor) { |
1950 | next_rt_ipi_type = sched_ipi_action(next_rt_processor, NULL, false, SCHED_IPI_EVENT_PREEMPT); |
1951 | } |
1952 | } |
1953 | pset_unlock(pset); |
1954 | |
1955 | if (next_rt_processor) { |
1956 | sched_ipi_perform(next_rt_processor, next_rt_ipi_type); |
1957 | } |
1958 | |
1959 | return (thread); |
1960 | } |
1961 | |
1962 | if ((rt_runq_count(pset) == 0) && |
1963 | SCHED(processor_queue_has_priority)(processor, thread->sched_pri, TRUE) == FALSE) { |
1964 | /* This thread is still the highest priority runnable (non-idle) thread */ |
1965 | processor->deadline = UINT64_MAX; |
1966 | |
1967 | sched_update_pset_load_average(pset); |
1968 | pset_unlock(pset); |
1969 | |
1970 | return (thread); |
1971 | } |
1972 | } else { |
1973 | /* |
1974 | * This processor must context switch. |
1975 | * If it's due to a rebalance, we should aggressively find this thread a new home. |
1976 | */ |
1977 | if (needs_smt_rebalance || affinity_mismatch || bound_elsewhere || avoid_processor) |
1978 | *reason |= AST_REBALANCE; |
1979 | } |
1980 | |
1981 | /* OK, so we're not going to run the current thread. Look at the RT queue. */ |
1982 | if (rt_runq_count(pset) > 0) { |
1983 | |
1984 | rt_lock_lock(pset); |
1985 | |
1986 | if (rt_runq_count(pset) > 0) { |
1987 | thread_t next_rt = qe_queue_first(&SCHED(rt_runq)(pset)->queue, struct thread, runq_links); |
1988 | |
1989 | if (__probable((next_rt->bound_processor == PROCESSOR_NULL || |
1990 | (next_rt->bound_processor == processor)))) { |
1991 | pick_new_rt_thread: |
1992 | new_thread = qe_dequeue_head(&SCHED(rt_runq)(pset)->queue, struct thread, runq_links); |
1993 | |
1994 | new_thread->runq = PROCESSOR_NULL; |
1995 | SCHED_STATS_RUNQ_CHANGE(&SCHED(rt_runq)(pset)->runq_stats, rt_runq_count(pset)); |
1996 | rt_runq_count_decr(pset); |
1997 | |
1998 | processor->deadline = new_thread->realtime.deadline; |
1999 | processor_state_update_from_thread(processor, new_thread); |
2000 | |
2001 | rt_lock_unlock(pset); |
2002 | sched_update_pset_load_average(pset); |
2003 | |
2004 | processor_t ast_processor = PROCESSOR_NULL; |
2005 | processor_t next_rt_processor = PROCESSOR_NULL; |
2006 | sched_ipi_type_t ipi_type = SCHED_IPI_NONE; |
2007 | sched_ipi_type_t next_rt_ipi_type = SCHED_IPI_NONE; |
2008 | |
2009 | if (processor->processor_secondary != NULL) { |
2010 | processor_t sprocessor = processor->processor_secondary; |
2011 | if ((sprocessor->state == PROCESSOR_RUNNING) || (sprocessor->state == PROCESSOR_DISPATCHING)) { |
2012 | ipi_type = sched_ipi_action(sprocessor, NULL, false, SCHED_IPI_EVENT_SMT_REBAL); |
2013 | ast_processor = sprocessor; |
2014 | } |
2015 | } |
2016 | if (rt_runq_count(pset) > 0) { |
2017 | next_rt_processor = choose_processor_for_realtime_thread(pset); |
2018 | if (next_rt_processor) { |
2019 | next_rt_ipi_type = sched_ipi_action(next_rt_processor, NULL, false, SCHED_IPI_EVENT_PREEMPT); |
2020 | } |
2021 | } |
2022 | pset_unlock(pset); |
2023 | |
2024 | if (ast_processor) { |
2025 | sched_ipi_perform(ast_processor, ipi_type); |
2026 | } |
2027 | |
2028 | if (next_rt_processor) { |
2029 | sched_ipi_perform(next_rt_processor, next_rt_ipi_type); |
2030 | } |
2031 | |
2032 | return (new_thread); |
2033 | } |
2034 | } |
2035 | |
2036 | rt_lock_unlock(pset); |
2037 | } |
2038 | |
2039 | processor->deadline = UINT64_MAX; |
2040 | |
2041 | /* No RT threads, so let's look at the regular threads. */ |
2042 | if ((new_thread = SCHED(choose_thread)(processor, MINPRI, *reason)) != THREAD_NULL) { |
2043 | sched_update_pset_load_average(pset); |
2044 | processor_state_update_from_thread(processor, new_thread); |
2045 | pset_unlock(pset); |
2046 | return (new_thread); |
2047 | } |
2048 | |
2049 | #if __SMP__ |
2050 | if (SCHED(steal_thread_enabled)) { |
2051 | /* |
2052 | * No runnable threads, attempt to steal |
2053 | * from other processors. Returns with pset lock dropped. |
2054 | */ |
2055 | |
2056 | if ((new_thread = SCHED(steal_thread)(pset)) != THREAD_NULL) { |
2057 | return (new_thread); |
2058 | } |
2059 | |
2060 | /* |
2061 | * If other threads have appeared, shortcut |
2062 | * around again. |
2063 | */ |
2064 | if (!SCHED(processor_queue_empty)(processor) || rt_runq_count(pset) > 0) |
2065 | continue; |
2066 | |
2067 | pset_lock(pset); |
2068 | } |
2069 | #endif |
2070 | |
2071 | idle: |
2072 | /* |
2073 | * Nothing is runnable, so set this processor idle if it |
2074 | * was running. |
2075 | */ |
2076 | if (processor->state == PROCESSOR_RUNNING) { |
2077 | pset_update_processor_state(pset, processor, PROCESSOR_IDLE); |
2078 | } |
2079 | |
2080 | #if __SMP__ |
2081 | /* Invoked with pset locked, returns with pset unlocked */ |
2082 | SCHED(processor_balance)(processor, pset); |
2083 | #else |
2084 | pset_unlock(pset); |
2085 | #endif |
2086 | |
2087 | #if CONFIG_SCHED_IDLE_IN_PLACE |
2088 | /* |
2089 | * Choose idle thread if fast idle is not possible. |
2090 | */ |
2091 | if (processor->processor_primary != processor) |
2092 | return (processor->idle_thread); |
2093 | |
2094 | if ((thread->state & (TH_IDLE|TH_TERMINATE|TH_SUSP)) || !(thread->state & TH_WAIT) || thread->wake_active || thread->sched_pri >= BASEPRI_RTQUEUES) |
2095 | return (processor->idle_thread); |
2096 | |
2097 | /* |
2098 | * Perform idling activities directly without a |
2099 | * context switch. Return dispatched thread, |
2100 | * else check again for a runnable thread. |
2101 | */ |
2102 | new_thread = thread_select_idle(thread, processor); |
2103 | |
2104 | #else /* !CONFIG_SCHED_IDLE_IN_PLACE */ |
2105 | |
2106 | /* |
2107 | * Do a full context switch to idle so that the current |
2108 | * thread can start running on another processor without |
2109 | * waiting for the fast-idled processor to wake up. |
2110 | */ |
2111 | new_thread = processor->idle_thread; |
2112 | |
2113 | #endif /* !CONFIG_SCHED_IDLE_IN_PLACE */ |
2114 | |
2115 | } while (new_thread == THREAD_NULL); |
2116 | |
2117 | return (new_thread); |
2118 | } |
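
/*
 * Informal outline of the selection loop above (orientation only; the real
 * code also handles SMT secondaries, recommendation hints and lock ordering):
 *
 *	do {
 *		if (the current thread may keep running here and nothing
 *		    better is waiting)
 *			return the current thread;
 *		if (the realtime run queue is non-empty and its head is eligible)
 *			return the dequeued realtime thread;
 *		if (SCHED(choose_thread)() finds a runnable thread)
 *			return it;
 *		if (stealing is enabled and a thread can be stolen from another
 *		    processor)
 *			return the stolen thread;
 *		otherwise mark the processor idle and either idle in place
 *		(then retry) or return the idle thread;
 *	} while (no thread has been found);
 */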
2119 | |
2120 | #if CONFIG_SCHED_IDLE_IN_PLACE |
2121 | /* |
2122 | * thread_select_idle: |
2123 | * |
2124 | * Idle the processor using the current thread context. |
2125 | * |
2126 | * Called with thread locked, then dropped and relocked. |
2127 | */ |
2128 | static thread_t |
2129 | thread_select_idle( |
2130 | thread_t thread, |
2131 | processor_t processor) |
2132 | { |
2133 | thread_t new_thread; |
2134 | uint64_t arg1, arg2; |
2135 | int urgency; |
2136 | |
2137 | sched_run_decr(thread); |
2138 | |
2139 | thread->state |= TH_IDLE; |
	processor_state_update_idle(processor);
2141 | |
2142 | /* Reload precise timing global policy to thread-local policy */ |
2143 | thread->precise_user_kernel_time = use_precise_user_kernel_time(thread); |
2144 | |
2145 | thread_unlock(thread); |
2146 | |
2147 | /* |
2148 | * Switch execution timing to processor idle thread. |
2149 | */ |
2150 | processor->last_dispatch = mach_absolute_time(); |
2151 | |
2152 | #ifdef CONFIG_MACH_APPROXIMATE_TIME |
2153 | commpage_update_mach_approximate_time(processor->last_dispatch); |
2154 | #endif |
2155 | |
2156 | thread->last_run_time = processor->last_dispatch; |
2157 | processor_timer_switch_thread(processor->last_dispatch, |
2158 | &processor->idle_thread->system_timer); |
2159 | PROCESSOR_DATA(processor, kernel_timer) = &processor->idle_thread->system_timer; |
2160 | |
2161 | |
2162 | /* |
2163 | * Cancel the quantum timer while idling. |
2164 | */ |
2165 | timer_call_quantum_timer_cancel(&processor->quantum_timer); |
2166 | processor->first_timeslice = FALSE; |
2167 | |
2168 | if (thread->sched_call) { |
2169 | (*thread->sched_call)(SCHED_CALL_BLOCK, thread); |
2170 | } |
2171 | |
2172 | thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, NULL); |
2173 | |
2174 | /* |
2175 | * Enable interrupts and perform idling activities. No |
2176 | * preemption due to TH_IDLE being set. |
2177 | */ |
2178 | spllo(); new_thread = processor_idle(thread, processor); |
2179 | |
2180 | /* |
2181 | * Return at splsched. |
2182 | */ |
2183 | if (thread->sched_call) { |
2184 | (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread); |
2185 | } |
2186 | |
2187 | thread_lock(thread); |
2188 | |
2189 | /* |
2190 | * If awakened, switch to thread timer and start a new quantum. |
2191 | * Otherwise skip; we will context switch to another thread or return here. |
2192 | */ |
2193 | if (!(thread->state & TH_WAIT)) { |
2194 | uint64_t time_now = processor->last_dispatch = mach_absolute_time(); |
2195 | processor_timer_switch_thread(time_now, &thread->system_timer); |
2196 | timer_update(&thread->runnable_timer, time_now); |
2197 | PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer; |
2198 | thread_quantum_init(thread); |
2199 | processor->quantum_end = time_now + thread->quantum_remaining; |
2200 | timer_call_quantum_timer_enter(&processor->quantum_timer, |
2201 | thread, processor->quantum_end, time_now); |
2202 | processor->first_timeslice = TRUE; |
2203 | |
2204 | thread->computation_epoch = time_now; |
2205 | } |
2206 | |
2207 | thread->state &= ~TH_IDLE; |
2208 | |
2209 | urgency = thread_get_urgency(thread, &arg1, &arg2); |
2210 | |
2211 | thread_tell_urgency(urgency, arg1, arg2, 0, new_thread); |
2212 | |
2213 | sched_run_incr(thread); |
2214 | |
2215 | return (new_thread); |
2216 | } |
2217 | #endif /* CONFIG_SCHED_IDLE_IN_PLACE */ |
2218 | |
2219 | /* |
2220 | * thread_invoke |
2221 | * |
2222 | * Called at splsched with neither thread locked. |
2223 | * |
2224 | * Perform a context switch and start executing the new thread. |
2225 | * |
2226 | * Returns FALSE when the context switch didn't happen. |
2227 | * The reference to the new thread is still consumed. |
2228 | * |
2229 | * "self" is what is currently running on the processor, |
2230 | * "thread" is the new thread to context switch to |
2231 | * (which may be the same thread in some cases) |
2232 | */ |
2233 | static boolean_t |
2234 | thread_invoke( |
2235 | thread_t self, |
2236 | thread_t thread, |
2237 | ast_t reason) |
2238 | { |
2239 | if (__improbable(get_preemption_level() != 0)) { |
2240 | int pl = get_preemption_level(); |
		panic("thread_invoke: preemption_level %d, possible cause: %s",
		    pl, (pl < 0 ? "unlocking an unlocked mutex or spinlock" :
		    "blocking while holding a spinlock, or within interrupt context"));
2244 | } |
2245 | |
2246 | thread_continue_t continuation = self->continuation; |
2247 | void *parameter = self->parameter; |
2248 | processor_t processor; |
2249 | |
2250 | uint64_t ctime = mach_absolute_time(); |
2251 | |
2252 | #ifdef CONFIG_MACH_APPROXIMATE_TIME |
2253 | commpage_update_mach_approximate_time(ctime); |
2254 | #endif |
2255 | |
2256 | #if defined(CONFIG_SCHED_TIMESHARE_CORE) |
2257 | if ((thread->state & TH_IDLE) == 0) |
2258 | sched_timeshare_consider_maintenance(ctime); |
2259 | #endif |
2260 | |
2261 | #if MONOTONIC |
2262 | mt_sched_update(self); |
2263 | #endif /* MONOTONIC */ |
2264 | |
2265 | assert_thread_magic(self); |
2266 | assert(self == current_thread()); |
2267 | assert(self->runq == PROCESSOR_NULL); |
2268 | assert((self->state & (TH_RUN|TH_TERMINATE2)) == TH_RUN); |
2269 | |
2270 | thread_lock(thread); |
2271 | |
2272 | assert_thread_magic(thread); |
2273 | assert((thread->state & (TH_RUN|TH_WAIT|TH_UNINT|TH_TERMINATE|TH_TERMINATE2)) == TH_RUN); |
2274 | assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == current_processor()); |
2275 | assert(thread->runq == PROCESSOR_NULL); |
2276 | |
2277 | /* Reload precise timing global policy to thread-local policy */ |
2278 | thread->precise_user_kernel_time = use_precise_user_kernel_time(thread); |
2279 | |
2280 | /* Update SFI class based on other factors */ |
2281 | thread->sfi_class = sfi_thread_classify(thread); |
2282 | |
2283 | /* Update the same_pri_latency for the thread (used by perfcontrol callouts) */ |
2284 | thread->same_pri_latency = ctime - thread->last_basepri_change_time; |
2285 | /* |
2286 | * In case a base_pri update happened between the timestamp and |
2287 | * taking the thread lock |
2288 | */ |
2289 | if (ctime <= thread->last_basepri_change_time) |
2290 | thread->same_pri_latency = ctime - thread->last_made_runnable_time; |
2291 | |
2292 | /* Allow realtime threads to hang onto a stack. */ |
2293 | if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack) |
2294 | self->reserved_stack = self->kernel_stack; |
2295 | |
2296 | /* Prepare for spin debugging */ |
2297 | #if INTERRUPT_MASKED_DEBUG |
2298 | ml_spin_debug_clear(thread); |
2299 | #endif |
2300 | |
2301 | if (continuation != NULL) { |
2302 | if (!thread->kernel_stack) { |
2303 | /* |
2304 | * If we are using a privileged stack, |
2305 | * check to see whether we can exchange it with |
2306 | * that of the other thread. |
2307 | */ |
2308 | if (self->kernel_stack == self->reserved_stack && !thread->reserved_stack) |
2309 | goto need_stack; |
2310 | |
2311 | /* |
2312 | * Context switch by performing a stack handoff. |
2313 | */ |
2314 | continuation = thread->continuation; |
2315 | parameter = thread->parameter; |
2316 | |
2317 | processor = current_processor(); |
2318 | processor->active_thread = thread; |
2319 | processor_state_update_from_thread(processor, thread); |
2320 | |
2321 | if (thread->last_processor != processor && thread->last_processor != NULL) { |
2322 | if (thread->last_processor->processor_set != processor->processor_set) |
2323 | thread->ps_switch++; |
2324 | thread->p_switch++; |
2325 | } |
2326 | thread->last_processor = processor; |
2327 | thread->c_switch++; |
2328 | ast_context(thread); |
2329 | |
2330 | thread_unlock(thread); |
2331 | |
2332 | self->reason = reason; |
2333 | |
2334 | processor->last_dispatch = ctime; |
2335 | self->last_run_time = ctime; |
2336 | processor_timer_switch_thread(ctime, &thread->system_timer); |
2337 | timer_update(&thread->runnable_timer, ctime); |
2338 | PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer; |
2339 | |
2340 | /* |
2341 | * Since non-precise user/kernel time doesn't update the state timer |
2342 | * during privilege transitions, synthesize an event now. |
2343 | */ |
2344 | if (!thread->precise_user_kernel_time) { |
2345 | timer_update(PROCESSOR_DATA(processor, current_state), ctime); |
2346 | } |
2347 | |
2348 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
2349 | MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF)|DBG_FUNC_NONE, |
2350 | self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0); |
2351 | |
2352 | if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) { |
2353 | SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)|DBG_FUNC_NONE, |
2354 | (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0); |
2355 | } |
2356 | |
2357 | DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, thread->task->bsd_info); |
2358 | |
2359 | SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri); |
2360 | |
2361 | #if KPERF |
2362 | kperf_off_cpu(self); |
2363 | #endif /* KPERF */ |
2364 | |
			TLOG(1, "thread_invoke: calling stack_handoff\n");
2366 | stack_handoff(self, thread); |
2367 | |
2368 | /* 'self' is now off core */ |
2369 | assert(thread == current_thread_volatile()); |
2370 | |
2371 | DTRACE_SCHED(on__cpu); |
2372 | |
2373 | #if KPERF |
2374 | kperf_on_cpu(thread, continuation, NULL); |
2375 | #endif /* KPERF */ |
2376 | |
2377 | thread_dispatch(self, thread); |
2378 | |
2379 | #if KASAN |
2380 | /* Old thread's stack has been moved to the new thread, so explicitly |
2381 | * unpoison it. */ |
2382 | kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size); |
2383 | #endif |
2384 | |
2385 | thread->continuation = thread->parameter = NULL; |
2386 | |
2387 | counter(c_thread_invoke_hits++); |
2388 | |
2389 | assert(continuation); |
2390 | call_continuation(continuation, parameter, thread->wait_result, TRUE); |
2391 | /*NOTREACHED*/ |
2392 | } |
2393 | else if (thread == self) { |
2394 | /* same thread but with continuation */ |
2395 | ast_context(self); |
2396 | counter(++c_thread_invoke_same); |
2397 | |
2398 | thread_unlock(self); |
2399 | |
2400 | #if KPERF |
2401 | kperf_on_cpu(thread, continuation, NULL); |
2402 | #endif /* KPERF */ |
2403 | |
2404 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
2405 | MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE, |
2406 | self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0); |
2407 | |
2408 | #if KASAN |
2409 | /* stack handoff to self - no thread_dispatch(), so clear the stack |
2410 | * and free the fakestack directly */ |
2411 | kasan_fakestack_drop(self); |
2412 | kasan_fakestack_gc(self); |
2413 | kasan_unpoison_stack(self->kernel_stack, kernel_stack_size); |
2414 | #endif |
2415 | |
2416 | self->continuation = self->parameter = NULL; |
2417 | |
2418 | call_continuation(continuation, parameter, self->wait_result, TRUE); |
2419 | /*NOTREACHED*/ |
2420 | } |
2421 | } else { |
2422 | /* |
2423 | * Check that the other thread has a stack |
2424 | */ |
2425 | if (!thread->kernel_stack) { |
2426 | need_stack: |
2427 | if (!stack_alloc_try(thread)) { |
2428 | counter(c_thread_invoke_misses++); |
2429 | thread_unlock(thread); |
2430 | thread_stack_enqueue(thread); |
2431 | return (FALSE); |
2432 | } |
2433 | } else if (thread == self) { |
2434 | ast_context(self); |
2435 | counter(++c_thread_invoke_same); |
2436 | thread_unlock(self); |
2437 | |
2438 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
2439 | MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE, |
2440 | self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0); |
2441 | |
2442 | return (TRUE); |
2443 | } |
2444 | } |
2445 | |
2446 | /* |
2447 | * Context switch by full context save. |
2448 | */ |
2449 | processor = current_processor(); |
2450 | processor->active_thread = thread; |
2451 | processor_state_update_from_thread(processor, thread); |
2452 | |
2453 | if (thread->last_processor != processor && thread->last_processor != NULL) { |
2454 | if (thread->last_processor->processor_set != processor->processor_set) |
2455 | thread->ps_switch++; |
2456 | thread->p_switch++; |
2457 | } |
2458 | thread->last_processor = processor; |
2459 | thread->c_switch++; |
2460 | ast_context(thread); |
2461 | |
2462 | thread_unlock(thread); |
2463 | |
2464 | counter(c_thread_invoke_csw++); |
2465 | |
2466 | self->reason = reason; |
2467 | |
2468 | processor->last_dispatch = ctime; |
2469 | self->last_run_time = ctime; |
2470 | processor_timer_switch_thread(ctime, &thread->system_timer); |
2471 | timer_update(&thread->runnable_timer, ctime); |
2472 | PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer; |
2473 | |
2474 | /* |
2475 | * Since non-precise user/kernel time doesn't update the state timer |
2476 | * during privilege transitions, synthesize an event now. |
2477 | */ |
2478 | if (!thread->precise_user_kernel_time) { |
2479 | timer_update(PROCESSOR_DATA(processor, current_state), ctime); |
2480 | } |
2481 | |
2482 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
2483 | MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE, |
2484 | self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0); |
2485 | |
2486 | if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) { |
2487 | SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)|DBG_FUNC_NONE, |
2488 | (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0); |
2489 | } |
2490 | |
2491 | DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, thread->task->bsd_info); |
2492 | |
2493 | SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri); |
2494 | |
2495 | #if KPERF |
2496 | kperf_off_cpu(self); |
2497 | #endif /* KPERF */ |
2498 | |
2499 | /* |
2500 | * This is where we actually switch register context, |
2501 | * and address space if required. We will next run |
2502 | * as a result of a subsequent context switch. |
2503 | * |
2504 | * Once registers are switched and the processor is running "thread", |
2505 | * the stack variables and non-volatile registers will contain whatever |
2506 | * was there the last time that thread blocked. No local variables should |
2507 | * be used after this point, except for the special case of "thread", which |
2508 | * the platform layer returns as the previous thread running on the processor |
2509 | * via the function call ABI as a return register, and "self", which may have |
2510 | * been stored on the stack or a non-volatile register, but a stale idea of |
2511 | * what was on the CPU is newly-accurate because that thread is again |
2512 | * running on the CPU. |
2513 | */ |
2514 | assert(continuation == self->continuation); |
2515 | thread = machine_switch_context(self, continuation, thread); |
2516 | assert(self == current_thread_volatile()); |
	TLOG(1, "thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);
2518 | |
2519 | DTRACE_SCHED(on__cpu); |
2520 | |
2521 | #if KPERF |
2522 | kperf_on_cpu(self, NULL, __builtin_frame_address(0)); |
2523 | #endif /* KPERF */ |
2524 | |
2525 | /* |
2526 | * We have been resumed and are set to run. |
2527 | */ |
2528 | thread_dispatch(thread, self); |
2529 | |
2530 | if (continuation) { |
2531 | self->continuation = self->parameter = NULL; |
2532 | |
2533 | call_continuation(continuation, parameter, self->wait_result, TRUE); |
2534 | /*NOTREACHED*/ |
2535 | } |
2536 | |
2537 | return (TRUE); |
2538 | } |
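
/*
 * Illustrative sketch (not part of the build): the retry contract for
 * thread_invoke().  A FALSE return means the switch did not happen (for
 * example the target had no kernel stack and stack_alloc_try() failed), so
 * callers such as thread_block_reason() and thread_run() below re-select
 * and try again:
 *
 *	do {
 *		thread_lock(self);
 *		new_thread = thread_select(self, processor, &reason);
 *		thread_unlock(self);
 *	} while (!thread_invoke(self, new_thread, reason));
 */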
2539 | |
2540 | #if defined(CONFIG_SCHED_DEFERRED_AST) |
2541 | /* |
2542 | * pset_cancel_deferred_dispatch: |
2543 | * |
2544 | * Cancels all ASTs that we can cancel for the given processor set |
2545 | * if the current processor is running the last runnable thread in the |
2546 | * system. |
2547 | * |
2548 | * This function assumes the current thread is runnable. This must |
2549 | * be called with the pset unlocked. |
2550 | */ |
2551 | static void |
2552 | pset_cancel_deferred_dispatch( |
2553 | processor_set_t pset, |
2554 | processor_t processor) |
2555 | { |
2556 | processor_t active_processor = NULL; |
2557 | uint32_t sampled_sched_run_count; |
2558 | |
2559 | pset_lock(pset); |
2560 | sampled_sched_run_count = (volatile uint32_t) sched_run_buckets[TH_BUCKET_RUN]; |
2561 | |
2562 | /* |
2563 | * If we have emptied the run queue, and our current thread is runnable, we |
2564 | * should tell any processors that are still DISPATCHING that they will |
2565 | * probably not have any work to do. In the event that there are no |
2566 | * pending signals that we can cancel, this is also uninteresting. |
2567 | * |
2568 | * In the unlikely event that another thread becomes runnable while we are |
2569 | * doing this (sched_run_count is atomically updated, not guarded), the |
2570 | * codepath making it runnable SHOULD (a dangerous word) need the pset lock |
2571 | * in order to dispatch it to a processor in our pset. So, the other |
2572 | * codepath will wait while we squash all cancelable ASTs, get the pset |
2573 | * lock, and then dispatch the freshly runnable thread. So this should be |
2574 | * correct (we won't accidentally have a runnable thread that hasn't been |
2575 | * dispatched to an idle processor), if not ideal (we may be restarting the |
2576 | * dispatch process, which could have some overhead). |
2577 | */ |
2578 | |
2579 | if ((sampled_sched_run_count == 1) && (pset->pending_deferred_AST_cpu_mask)) { |
2580 | uint64_t dispatching_map = (pset->cpu_state_map[PROCESSOR_DISPATCHING] & |
2581 | pset->pending_deferred_AST_cpu_mask & |
2582 | ~pset->pending_AST_cpu_mask); |
2583 | for (int cpuid = lsb_first(dispatching_map); cpuid >= 0; cpuid = lsb_next(dispatching_map, cpuid)) { |
2584 | active_processor = processor_array[cpuid]; |
2585 | /* |
2586 | * If a processor is DISPATCHING, it could be because of |
2587 | * a cancelable signal. |
2588 | * |
2589 | * IF the processor is not our |
2590 | * current processor (the current processor should not |
2591 | * be DISPATCHING, so this is a bit paranoid), AND there |
2592 | * is a cancelable signal pending on the processor, AND |
2593 | * there is no non-cancelable signal pending (as there is |
2594 | * no point trying to backtrack on bringing the processor |
2595 | * up if a signal we cannot cancel is outstanding), THEN |
2596 | * it should make sense to roll back the processor state |
2597 | * to the IDLE state. |
2598 | * |
			 * If the racy nature of this approach (as the signal
2600 | * will be arbitrated by hardware, and can fire as we |
2601 | * roll back state) results in the core responding |
2602 | * despite being pushed back to the IDLE state, it |
2603 | * should be no different than if the core took some |
2604 | * interrupt while IDLE. |
2605 | */ |
2606 | if (active_processor != processor) { |
2607 | /* |
2608 | * Squash all of the processor state back to some |
2609 | * reasonable facsimile of PROCESSOR_IDLE. |
2610 | */ |
2611 | |
2612 | assert(active_processor->next_thread == THREAD_NULL); |
2613 | processor_state_update_idle(active_processor); |
2614 | active_processor->deadline = UINT64_MAX; |
2615 | pset_update_processor_state(pset, active_processor, PROCESSOR_IDLE); |
2616 | bit_clear(pset->pending_deferred_AST_cpu_mask, active_processor->cpu_id); |
2617 | machine_signal_idle_cancel(active_processor); |
2618 | } |
2619 | |
2620 | } |
2621 | } |
2622 | |
2623 | pset_unlock(pset); |
2624 | } |
2625 | #else |
2626 | /* We don't support deferred ASTs; everything is candycanes and sunshine. */ |
2627 | #endif |
2628 | |
2629 | static void |
2630 | thread_csw_callout( |
2631 | thread_t old, |
2632 | thread_t new, |
2633 | uint64_t timestamp) |
2634 | { |
2635 | perfcontrol_event event = (new->state & TH_IDLE) ? IDLE : CONTEXT_SWITCH; |
2636 | uint64_t same_pri_latency = (new->state & TH_IDLE) ? 0 : new->same_pri_latency; |
2637 | machine_switch_perfcontrol_context(event, timestamp, 0, |
2638 | same_pri_latency, old, new); |
2639 | } |
2640 | |
2641 | |
2642 | /* |
2643 | * thread_dispatch: |
2644 | * |
2645 | * Handle threads at context switch. Re-dispatch other thread |
2646 | * if still running, otherwise update run state and perform |
2647 | * special actions. Update quantum for other thread and begin |
2648 | * the quantum for ourselves. |
2649 | * |
2650 | * "thread" is the old thread that we have switched away from. |
2651 | * "self" is the new current thread that we have context switched to |
2652 | * |
2653 | * Called at splsched. |
2654 | */ |
2655 | void |
2656 | thread_dispatch( |
2657 | thread_t thread, |
2658 | thread_t self) |
2659 | { |
2660 | processor_t processor = self->last_processor; |
2661 | |
2662 | assert(processor == current_processor()); |
2663 | assert(self == current_thread_volatile()); |
2664 | assert(thread != self); |
2665 | |
2666 | if (thread != THREAD_NULL) { |
2667 | /* |
2668 | * Do the perfcontrol callout for context switch. |
2669 | * The reason we do this here is: |
2670 | * - thread_dispatch() is called from various places that are not |
2671 | * the direct context switch path for eg. processor shutdown etc. |
2672 | * So adding the callout here covers all those cases. |
2673 | * - We want this callout as early as possible to be close |
2674 | * to the timestamp taken in thread_invoke() |
2675 | * - We want to avoid holding the thread lock while doing the |
2676 | * callout |
2677 | * - We do not want to callout if "thread" is NULL. |
2678 | */ |
2679 | thread_csw_callout(thread, self, processor->last_dispatch); |
2680 | |
2681 | #if KASAN |
2682 | if (thread->continuation != NULL) { |
2683 | /* |
2684 | * Thread has a continuation and the normal stack is going away. |
2685 | * Unpoison the stack and mark all fakestack objects as unused. |
2686 | */ |
2687 | kasan_fakestack_drop(thread); |
2688 | if (thread->kernel_stack) { |
2689 | kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size); |
2690 | } |
2691 | } |
2692 | |
2693 | /* |
2694 | * Free all unused fakestack objects. |
2695 | */ |
2696 | kasan_fakestack_gc(thread); |
2697 | #endif |
2698 | |
2699 | /* |
2700 | * If blocked at a continuation, discard |
2701 | * the stack. |
2702 | */ |
2703 | if (thread->continuation != NULL && thread->kernel_stack != 0) |
2704 | stack_free(thread); |
2705 | |
2706 | if (thread->state & TH_IDLE) { |
2707 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
2708 | MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE, |
2709 | (uintptr_t)thread_tid(thread), 0, thread->state, |
2710 | sched_run_buckets[TH_BUCKET_RUN], 0); |
2711 | } else { |
2712 | int64_t consumed; |
2713 | int64_t remainder = 0; |
2714 | |
2715 | if (processor->quantum_end > processor->last_dispatch) |
2716 | remainder = processor->quantum_end - |
2717 | processor->last_dispatch; |
2718 | |
2719 | consumed = thread->quantum_remaining - remainder; |
2720 | |
2721 | if ((thread->reason & AST_LEDGER) == 0) { |
2722 | /* |
2723 | * Bill CPU time to both the task and |
2724 | * the individual thread. |
2725 | */ |
2726 | ledger_credit_thread(thread, thread->t_ledger, |
2727 | task_ledgers.cpu_time, consumed); |
2728 | ledger_credit_thread(thread, thread->t_threadledger, |
2729 | thread_ledgers.cpu_time, consumed); |
2730 | if (thread->t_bankledger) { |
2731 | ledger_credit_thread(thread, thread->t_bankledger, |
2732 | bank_ledgers.cpu_time, |
2733 | (consumed - thread->t_deduct_bank_ledger_time)); |
2734 | } |
2735 | thread->t_deduct_bank_ledger_time = 0; |
2736 | } |
2737 | |
2738 | wake_lock(thread); |
2739 | thread_lock(thread); |
2740 | |
2741 | /* |
2742 | * Apply a priority floor if the thread holds a kernel resource |
2743 | * Do this before checking starting_pri to avoid overpenalizing |
2744 | * repeated rwlock blockers. |
2745 | */ |
2746 | if (__improbable(thread->rwlock_count != 0)) |
2747 | lck_rw_set_promotion_locked(thread); |
2748 | |
2749 | boolean_t keep_quantum = processor->first_timeslice; |
2750 | |
2751 | /* |
2752 | * Treat a thread which has dropped priority since it got on core |
2753 | * as having expired its quantum. |
2754 | */ |
2755 | if (processor->starting_pri > thread->sched_pri) |
2756 | keep_quantum = FALSE; |
2757 | |
2758 | /* Compute remainder of current quantum. */ |
2759 | if (keep_quantum && |
2760 | processor->quantum_end > processor->last_dispatch) |
2761 | thread->quantum_remaining = (uint32_t)remainder; |
2762 | else |
2763 | thread->quantum_remaining = 0; |
2764 | |
2765 | if (thread->sched_mode == TH_MODE_REALTIME) { |
2766 | /* |
2767 | * Cancel the deadline if the thread has |
2768 | * consumed the entire quantum. |
2769 | */ |
2770 | if (thread->quantum_remaining == 0) { |
2771 | thread->realtime.deadline = UINT64_MAX; |
2772 | } |
2773 | } else { |
2774 | #if defined(CONFIG_SCHED_TIMESHARE_CORE) |
2775 | /* |
2776 | * For non-realtime threads treat a tiny |
2777 | * remaining quantum as an expired quantum |
2778 | * but include what's left next time. |
2779 | */ |
2780 | if (thread->quantum_remaining < min_std_quantum) { |
2781 | thread->reason |= AST_QUANTUM; |
2782 | thread->quantum_remaining += SCHED(initial_quantum_size)(thread); |
2783 | } |
2784 | #endif /* CONFIG_SCHED_TIMESHARE_CORE */ |
2785 | } |
2786 | |
2787 | /* |
2788 | * If we are doing a direct handoff then |
2789 | * take the remainder of the quantum. |
2790 | */ |
2791 | if ((thread->reason & (AST_HANDOFF|AST_QUANTUM)) == AST_HANDOFF) { |
2792 | self->quantum_remaining = thread->quantum_remaining; |
2793 | thread->reason |= AST_QUANTUM; |
2794 | thread->quantum_remaining = 0; |
2795 | } else { |
2796 | #if defined(CONFIG_SCHED_MULTIQ) |
2797 | if (SCHED(sched_groups_enabled) && |
2798 | thread->sched_group == self->sched_group) { |
2799 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
2800 | MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUANTUM_HANDOFF), |
2801 | self->reason, (uintptr_t)thread_tid(thread), |
2802 | self->quantum_remaining, thread->quantum_remaining, 0); |
2803 | |
2804 | self->quantum_remaining = thread->quantum_remaining; |
2805 | thread->quantum_remaining = 0; |
2806 | /* Don't set AST_QUANTUM here - old thread might still want to preempt someone else */ |
2807 | } |
2808 | #endif /* defined(CONFIG_SCHED_MULTIQ) */ |
2809 | } |
2810 | |
2811 | thread->computation_metered += (processor->last_dispatch - thread->computation_epoch); |
2812 | |
2813 | if (!(thread->state & TH_WAIT)) { |
2814 | /* |
2815 | * Still runnable. |
2816 | */ |
2817 | thread->last_made_runnable_time = thread->last_basepri_change_time = processor->last_dispatch; |
2818 | |
2819 | machine_thread_going_off_core(thread, FALSE, processor->last_dispatch); |
2820 | |
2821 | ast_t reason = thread->reason; |
2822 | sched_options_t options = SCHED_NONE; |
2823 | |
2824 | if (reason & AST_REBALANCE) { |
2825 | options |= SCHED_REBALANCE; |
2826 | if (reason & AST_QUANTUM) { |
2827 | /* |
2828 | * Having gone to the trouble of forcing this thread off a less preferred core, |
2829 | * we should force the preferable core to reschedule immediately to give this |
2830 | * thread a chance to run instead of just sitting on the run queue where |
2831 | * it may just be stolen back by the idle core we just forced it off. |
2832 | * But only do this at the end of a quantum to prevent cascading effects. |
2833 | */ |
2834 | options |= SCHED_PREEMPT; |
2835 | } |
2836 | } |
2837 | |
2838 | if (reason & AST_QUANTUM) |
2839 | options |= SCHED_TAILQ; |
2840 | else if (reason & AST_PREEMPT) |
2841 | options |= SCHED_HEADQ; |
2842 | else |
2843 | options |= (SCHED_PREEMPT | SCHED_TAILQ); |
2844 | |
2845 | thread_setrun(thread, options); |
2846 | |
2847 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
2848 | MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE, |
2849 | (uintptr_t)thread_tid(thread), thread->reason, thread->state, |
2850 | sched_run_buckets[TH_BUCKET_RUN], 0); |
2851 | |
2852 | if (thread->wake_active) { |
2853 | thread->wake_active = FALSE; |
2854 | thread_unlock(thread); |
2855 | |
2856 | thread_wakeup(&thread->wake_active); |
2857 | } else { |
2858 | thread_unlock(thread); |
2859 | } |
2860 | |
2861 | wake_unlock(thread); |
2862 | } else { |
2863 | /* |
2864 | * Waiting. |
2865 | */ |
2866 | boolean_t should_terminate = FALSE; |
2867 | uint32_t new_run_count; |
2868 | int thread_state = thread->state; |
2869 | |
2870 | /* Only the first call to thread_dispatch |
2871 | * after explicit termination should add |
2872 | * the thread to the termination queue |
2873 | */ |
2874 | if ((thread_state & (TH_TERMINATE|TH_TERMINATE2)) == TH_TERMINATE) { |
2875 | should_terminate = TRUE; |
2876 | thread_state |= TH_TERMINATE2; |
2877 | } |
2878 | |
2879 | timer_stop(&thread->runnable_timer, processor->last_dispatch); |
2880 | |
2881 | thread_state &= ~TH_RUN; |
2882 | thread->state = thread_state; |
2883 | |
2884 | thread->last_made_runnable_time = thread->last_basepri_change_time = THREAD_NOT_RUNNABLE; |
2885 | thread->chosen_processor = PROCESSOR_NULL; |
2886 | |
2887 | new_run_count = sched_run_decr(thread); |
2888 | |
2889 | #if CONFIG_SCHED_SFI |
2890 | if (thread->reason & AST_SFI) { |
2891 | thread->wait_sfi_begin_time = processor->last_dispatch; |
2892 | } |
2893 | #endif |
2894 | |
2895 | machine_thread_going_off_core(thread, should_terminate, processor->last_dispatch); |
2896 | |
2897 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
2898 | MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE, |
2899 | (uintptr_t)thread_tid(thread), thread->reason, thread_state, |
2900 | new_run_count, 0); |
2901 | |
2902 | if (thread_state & TH_WAIT_REPORT) { |
2903 | (*thread->sched_call)(SCHED_CALL_BLOCK, thread); |
2904 | } |
2905 | |
2906 | if (thread->wake_active) { |
2907 | thread->wake_active = FALSE; |
2908 | thread_unlock(thread); |
2909 | |
2910 | thread_wakeup(&thread->wake_active); |
2911 | } else { |
2912 | thread_unlock(thread); |
2913 | } |
2914 | |
2915 | wake_unlock(thread); |
2916 | |
2917 | if (should_terminate) |
2918 | thread_terminate_enqueue(thread); |
2919 | } |
2920 | } |
2921 | } |
2922 | |
2923 | int urgency = THREAD_URGENCY_NONE; |
2924 | uint64_t latency = 0; |
2925 | |
2926 | /* Update (new) current thread and reprogram quantum timer */ |
2927 | thread_lock(self); |
2928 | |
2929 | if (!(self->state & TH_IDLE)) { |
2930 | uint64_t arg1, arg2; |
2931 | |
2932 | #if CONFIG_SCHED_SFI |
2933 | ast_t new_ast; |
2934 | |
2935 | new_ast = sfi_thread_needs_ast(self, NULL); |
2936 | |
2937 | if (new_ast != AST_NONE) { |
2938 | ast_on(new_ast); |
2939 | } |
2940 | #endif |
2941 | |
2942 | assertf(processor->last_dispatch >= self->last_made_runnable_time, |
		    "Non-monotonic time? dispatch at 0x%llx, runnable at 0x%llx",
2944 | processor->last_dispatch, self->last_made_runnable_time); |
2945 | |
2946 | assert(self->last_made_runnable_time <= self->last_basepri_change_time); |
2947 | |
2948 | latency = processor->last_dispatch - self->last_made_runnable_time; |
2949 | assert(latency >= self->same_pri_latency); |
2950 | |
2951 | urgency = thread_get_urgency(self, &arg1, &arg2); |
2952 | |
2953 | thread_tell_urgency(urgency, arg1, arg2, latency, self); |
2954 | |
2955 | /* |
2956 | * Get a new quantum if none remaining. |
2957 | */ |
2958 | if (self->quantum_remaining == 0) { |
2959 | thread_quantum_init(self); |
2960 | } |
2961 | |
2962 | /* |
2963 | * Set up quantum timer and timeslice. |
2964 | */ |
2965 | processor->quantum_end = processor->last_dispatch + self->quantum_remaining; |
2966 | timer_call_quantum_timer_enter(&processor->quantum_timer, self, |
2967 | processor->quantum_end, processor->last_dispatch); |
2968 | |
2969 | processor->first_timeslice = TRUE; |
2970 | } else { |
2971 | timer_call_quantum_timer_cancel(&processor->quantum_timer); |
2972 | processor->first_timeslice = FALSE; |
2973 | |
2974 | thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, self); |
2975 | } |
2976 | |
2977 | assert(self->block_hint == kThreadWaitNone); |
2978 | self->computation_epoch = processor->last_dispatch; |
2979 | self->reason = AST_NONE; |
2980 | processor->starting_pri = self->sched_pri; |
2981 | |
2982 | thread_unlock(self); |
2983 | |
2984 | machine_thread_going_on_core(self, urgency, latency, self->same_pri_latency, |
2985 | processor->last_dispatch); |
2986 | |
2987 | #if defined(CONFIG_SCHED_DEFERRED_AST) |
2988 | /* |
2989 | * TODO: Can we state that redispatching our old thread is also |
2990 | * uninteresting? |
2991 | */ |
2992 | if ((((volatile uint32_t)sched_run_buckets[TH_BUCKET_RUN]) == 1) && |
2993 | !(self->state & TH_IDLE)) { |
2994 | pset_cancel_deferred_dispatch(processor->processor_set, processor); |
2995 | } |
2996 | #endif |
2997 | } |
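
/*
 * Informal outline of how thread_dispatch() treats the outgoing thread
 * (orientation only):
 *
 *	for a non-idle outgoing thread:
 *		bill the consumed CPU time and compute the quantum remainder;
 *		if it is still runnable (no TH_WAIT):
 *			thread_setrun() it so another processor can pick it up,
 *			with SCHED_PREEMPT if it was rebalanced off this core at
 *			the end of its quantum;
 *		else:
 *			clear TH_RUN, stop its runnable timer, drop the run
 *			count, and, on the first dispatch after TH_TERMINATE,
 *			enqueue it for termination;
 *
 *	the incoming thread ("self") then has its urgency reported and gets a
 *	fresh quantum timer, unless it is the idle thread.
 */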
2998 | |
2999 | /* |
3000 | * thread_block_reason: |
3001 | * |
3002 | * Forces a reschedule, blocking the caller if a wait |
3003 | * has been asserted. |
3004 | * |
3005 | * If a continuation is specified, then thread_invoke will |
3006 | * attempt to discard the thread's kernel stack. When the |
3007 | * thread resumes, it will execute the continuation function |
3008 | * on a new kernel stack. |
3009 | */ |
3010 | counter(mach_counter_t c_thread_block_calls = 0;) |
3011 | |
3012 | wait_result_t |
3013 | thread_block_reason( |
3014 | thread_continue_t continuation, |
3015 | void *parameter, |
3016 | ast_t reason) |
3017 | { |
3018 | thread_t self = current_thread(); |
3019 | processor_t processor; |
3020 | thread_t new_thread; |
3021 | spl_t s; |
3022 | |
3023 | counter(++c_thread_block_calls); |
3024 | |
3025 | s = splsched(); |
3026 | |
3027 | processor = current_processor(); |
3028 | |
3029 | /* If we're explicitly yielding, force a subsequent quantum */ |
3030 | if (reason & AST_YIELD) |
3031 | processor->first_timeslice = FALSE; |
3032 | |
3033 | /* We're handling all scheduling AST's */ |
3034 | ast_off(AST_SCHEDULING); |
3035 | |
3036 | #if PROC_REF_DEBUG |
3037 | if ((continuation != NULL) && (self->task != kernel_task)) { |
3038 | if (uthread_get_proc_refcount(self->uthread) != 0) { |
			panic("thread_block_reason with continuation uthread %p with uu_proc_refcount != 0", self->uthread);
3040 | } |
3041 | } |
3042 | #endif |
3043 | |
3044 | self->continuation = continuation; |
3045 | self->parameter = parameter; |
3046 | |
3047 | if (self->state & ~(TH_RUN | TH_IDLE)) { |
3048 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
3049 | MACHDBG_CODE(DBG_MACH_SCHED,MACH_BLOCK), |
3050 | reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0); |
3051 | } |
3052 | |
3053 | do { |
3054 | thread_lock(self); |
3055 | new_thread = thread_select(self, processor, &reason); |
3056 | thread_unlock(self); |
3057 | } while (!thread_invoke(self, new_thread, reason)); |
3058 | |
3059 | splx(s); |
3060 | |
3061 | return (self->wait_result); |
3062 | } |
3063 | |
3064 | /* |
3065 | * thread_block: |
3066 | * |
3067 | * Block the current thread if a wait has been asserted. |
3068 | */ |
3069 | wait_result_t |
3070 | thread_block( |
3071 | thread_continue_t continuation) |
3072 | { |
3073 | return thread_block_reason(continuation, NULL, AST_NONE); |
3074 | } |
3075 | |
3076 | wait_result_t |
3077 | thread_block_parameter( |
3078 | thread_continue_t continuation, |
3079 | void *parameter) |
3080 | { |
3081 | return thread_block_reason(continuation, parameter, AST_NONE); |
3082 | } |
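
/*
 * Illustrative sketch (not part of the build): the usual wait pattern built
 * on assert_wait()/thread_block().  The event, parameter, and continuation
 * names are hypothetical.  When a continuation is supplied, thread_invoke()
 * may discard the kernel stack, so no stack state survives the block and
 * control does not return here; execution resumes in the continuation, which
 * receives the wait result.
 *
 *	static void example_wakeup_continue(void *param, wait_result_t wr);
 *
 *	assert_wait((event_t)&example_event, THREAD_UNINT);
 *	// ... drop any locks the waker will need ...
 *	thread_block_parameter(example_wakeup_continue, example_param);
 *	// NOTREACHED - execution resumes in example_wakeup_continue()
 */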
3083 | |
3084 | /* |
3085 | * thread_run: |
3086 | * |
3087 | * Switch directly from the current thread to the |
3088 | * new thread, handing off our quantum if appropriate. |
3089 | * |
3090 | * New thread must be runnable, and not on a run queue. |
3091 | * |
3092 | * Called at splsched. |
3093 | */ |
3094 | int |
3095 | thread_run( |
3096 | thread_t self, |
3097 | thread_continue_t continuation, |
3098 | void *parameter, |
3099 | thread_t new_thread) |
3100 | { |
3101 | ast_t reason = AST_HANDOFF; |
3102 | |
3103 | self->continuation = continuation; |
3104 | self->parameter = parameter; |
3105 | |
3106 | while (!thread_invoke(self, new_thread, reason)) { |
3107 | /* the handoff failed, so we have to fall back to the normal block path */ |
3108 | processor_t processor = current_processor(); |
3109 | |
3110 | reason = AST_NONE; |
3111 | |
3112 | thread_lock(self); |
3113 | new_thread = thread_select(self, processor, &reason); |
3114 | thread_unlock(self); |
3115 | } |
3116 | |
3117 | return (self->wait_result); |
3118 | } |
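
/*
 * Illustrative sketch (not part of the build): a direct handoff using
 * thread_run().  "target" is hypothetical and must be runnable, referenced,
 * and not on any run queue (for example, it was just pulled off a wait queue
 * by the caller).
 *
 *	spl_t s = splsched();
 *	// ... make target runnable without placing it on a run queue ...
 *	(void)thread_run(current_thread(), THREAD_CONTINUE_NULL, NULL, target);
 *	// we resume here once this thread is scheduled again
 *	splx(s);
 */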
3119 | |
3120 | /* |
3121 | * thread_continue: |
3122 | * |
3123 | * Called at splsched when a thread first receives |
3124 | * a new stack after a continuation. |
3125 | */ |
3126 | void |
3127 | thread_continue( |
3128 | thread_t thread) |
3129 | { |
3130 | thread_t self = current_thread(); |
3131 | thread_continue_t continuation; |
3132 | void *parameter; |
3133 | |
3134 | DTRACE_SCHED(on__cpu); |
3135 | |
3136 | continuation = self->continuation; |
3137 | parameter = self->parameter; |
3138 | |
3139 | #if KPERF |
3140 | kperf_on_cpu(self, continuation, NULL); |
3141 | #endif |
3142 | |
3143 | thread_dispatch(thread, self); |
3144 | |
3145 | self->continuation = self->parameter = NULL; |
3146 | |
3147 | #if INTERRUPT_MASKED_DEBUG |
3148 | /* Reset interrupt-masked spin debugging timeout */ |
3149 | ml_spin_debug_clear(self); |
3150 | #endif |
3151 | |
	TLOG(1, "thread_continue: calling call_continuation\n");
3153 | |
3154 | boolean_t enable_interrupts = thread != THREAD_NULL; |
3155 | call_continuation(continuation, parameter, self->wait_result, enable_interrupts); |
3156 | /*NOTREACHED*/ |
3157 | } |
3158 | |
3159 | void |
3160 | thread_quantum_init(thread_t thread) |
3161 | { |
3162 | if (thread->sched_mode == TH_MODE_REALTIME) { |
3163 | thread->quantum_remaining = thread->realtime.computation; |
3164 | } else { |
3165 | thread->quantum_remaining = SCHED(initial_quantum_size)(thread); |
3166 | } |
3167 | } |
3168 | |
3169 | uint32_t |
3170 | sched_timeshare_initial_quantum_size(thread_t thread) |
3171 | { |
3172 | if ((thread != THREAD_NULL) && thread->th_sched_bucket == TH_BUCKET_SHARE_BG) |
3173 | return bg_quantum; |
3174 | else |
3175 | return std_quantum; |
3176 | } |
3177 | |
3178 | /* |
3179 | * run_queue_init: |
3180 | * |
3181 | * Initialize a run queue before first use. |
3182 | */ |
3183 | void |
3184 | run_queue_init( |
3185 | run_queue_t rq) |
3186 | { |
3187 | rq->highq = NOPRI; |
3188 | for (u_int i = 0; i < BITMAP_LEN(NRQS); i++) |
3189 | rq->bitmap[i] = 0; |
3190 | rq->urgency = rq->count = 0; |
3191 | for (int i = 0; i < NRQS; i++) |
3192 | queue_init(&rq->queues[i]); |
3193 | } |
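
/*
 * Invariants maintained by the run_queue operations below (informational):
 *  - bit i of rq->bitmap is set iff rq->queues[i] is non-empty
 *  - rq->highq is the highest set bit (NOPRI when the whole queue is empty)
 *  - rq->count is the total number of enqueued threads, and rq->urgency
 *    counts those at priorities SCHED(priority_is_urgent)() considers urgent
 */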
3194 | |
3195 | /* |
3196 | * run_queue_dequeue: |
3197 | * |
3198 | * Perform a dequeue operation on a run queue, |
3199 | * and return the resulting thread. |
3200 | * |
3201 | * The run queue must be locked (see thread_run_queue_remove() |
3202 | * for more info), and not empty. |
3203 | */ |
3204 | thread_t |
3205 | run_queue_dequeue( |
3206 | run_queue_t rq, |
3207 | integer_t options) |
3208 | { |
3209 | thread_t thread; |
3210 | queue_t queue = &rq->queues[rq->highq]; |
3211 | |
3212 | if (options & SCHED_HEADQ) { |
3213 | thread = qe_dequeue_head(queue, struct thread, runq_links); |
3214 | } else { |
3215 | thread = qe_dequeue_tail(queue, struct thread, runq_links); |
3216 | } |
3217 | |
3218 | assert(thread != THREAD_NULL); |
3219 | assert_thread_magic(thread); |
3220 | |
3221 | thread->runq = PROCESSOR_NULL; |
3222 | SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); |
3223 | rq->count--; |
3224 | if (SCHED(priority_is_urgent)(rq->highq)) { |
3225 | rq->urgency--; assert(rq->urgency >= 0); |
3226 | } |
3227 | if (queue_empty(queue)) { |
3228 | bitmap_clear(rq->bitmap, rq->highq); |
3229 | rq->highq = bitmap_first(rq->bitmap, NRQS); |
3230 | } |
3231 | |
3232 | return thread; |
3233 | } |
3234 | |
3235 | /* |
3236 | * run_queue_enqueue: |
3237 | * |
 *	Perform an enqueue operation on a run queue.
3239 | * |
3240 | * The run queue must be locked (see thread_run_queue_remove() |
3241 | * for more info). |
3242 | */ |
3243 | boolean_t |
3244 | run_queue_enqueue( |
3245 | run_queue_t rq, |
3246 | thread_t thread, |
3247 | integer_t options) |
3248 | { |
3249 | queue_t queue = &rq->queues[thread->sched_pri]; |
3250 | boolean_t result = FALSE; |
3251 | |
3252 | assert_thread_magic(thread); |
3253 | |
3254 | if (queue_empty(queue)) { |
3255 | enqueue_tail(queue, &thread->runq_links); |
3256 | |
3257 | rq_bitmap_set(rq->bitmap, thread->sched_pri); |
3258 | if (thread->sched_pri > rq->highq) { |
3259 | rq->highq = thread->sched_pri; |
3260 | result = TRUE; |
3261 | } |
3262 | } else { |
3263 | if (options & SCHED_TAILQ) |
3264 | enqueue_tail(queue, &thread->runq_links); |
3265 | else |
3266 | enqueue_head(queue, &thread->runq_links); |
3267 | } |
3268 | if (SCHED(priority_is_urgent)(thread->sched_pri)) |
3269 | rq->urgency++; |
3270 | SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); |
3271 | rq->count++; |
3272 | |
3273 | return (result); |
3274 | } |
3275 | |
3276 | /* |
3277 | * run_queue_remove: |
3278 | * |
3279 | * Remove a specific thread from a runqueue. |
3280 | * |
3281 | * The run queue must be locked. |
3282 | */ |
3283 | void |
3284 | run_queue_remove( |
3285 | run_queue_t rq, |
3286 | thread_t thread) |
3287 | { |
3288 | assert(thread->runq != PROCESSOR_NULL); |
3289 | assert_thread_magic(thread); |
3290 | |
3291 | remqueue(&thread->runq_links); |
3292 | SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); |
3293 | rq->count--; |
3294 | if (SCHED(priority_is_urgent)(thread->sched_pri)) { |
3295 | rq->urgency--; assert(rq->urgency >= 0); |
3296 | } |
3297 | |
3298 | if (queue_empty(&rq->queues[thread->sched_pri])) { |
3299 | /* update run queue status */ |
3300 | bitmap_clear(rq->bitmap, thread->sched_pri); |
3301 | rq->highq = bitmap_first(rq->bitmap, NRQS); |
3302 | } |
3303 | |
3304 | thread->runq = PROCESSOR_NULL; |
3305 | } |
3306 | |
3307 | /* Assumes RT lock is not held, and acquires splsched/rt_lock itself */ |
3308 | void |
3309 | sched_rtglobal_runq_scan(sched_update_scan_context_t scan_context) |
3310 | { |
3311 | spl_t s; |
3312 | thread_t thread; |
3313 | |
3314 | processor_set_t pset = &pset0; |
3315 | |
3316 | s = splsched(); |
3317 | rt_lock_lock(pset); |
3318 | |
3319 | qe_foreach_element_safe(thread, &pset->rt_runq.queue, runq_links) { |
3320 | if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) { |
3321 | scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time; |
3322 | } |
3323 | } |
3324 | |
3325 | rt_lock_unlock(pset); |
3326 | splx(s); |
3327 | } |
3328 | |
3329 | int64_t |
3330 | sched_rtglobal_runq_count_sum(void) |
3331 | { |
3332 | return pset0.rt_runq.runq_stats.count_sum; |
3333 | } |
3334 | |
3335 | /* |
3336 | * realtime_queue_insert: |
3337 | * |
3338 | * Enqueue a thread for realtime execution. |
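 *
 * Returns TRUE if the thread is now at the head of the run queue,
 * i.e. it has the earliest deadline of the enqueued threads.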
3339 | */ |
3340 | static boolean_t |
3341 | realtime_queue_insert(processor_t processor, processor_set_t pset, thread_t thread) |
3342 | { |
3343 | queue_t queue = &SCHED(rt_runq)(pset)->queue; |
3344 | uint64_t deadline = thread->realtime.deadline; |
3345 | boolean_t preempt = FALSE; |
3346 | |
3347 | rt_lock_lock(pset); |
3348 | |
3349 | if (queue_empty(queue)) { |
3350 | enqueue_tail(queue, &thread->runq_links); |
3351 | preempt = TRUE; |
3352 | } else { |
3353 | /* Insert into rt_runq in thread deadline order */ |
3354 | queue_entry_t iter; |
3355 | qe_foreach(iter, queue) { |
3356 | thread_t iter_thread = qe_element(iter, struct thread, runq_links); |
3357 | assert_thread_magic(iter_thread); |
3358 | |
3359 | if (deadline < iter_thread->realtime.deadline) { |
3360 | if (iter == queue_first(queue)) |
3361 | preempt = TRUE; |
3362 | insque(&thread->runq_links, queue_prev(iter)); |
3363 | break; |
3364 | } else if (iter == queue_last(queue)) { |
3365 | enqueue_tail(queue, &thread->runq_links); |
3366 | break; |
3367 | } |
3368 | } |
3369 | } |
3370 | |
3371 | thread->runq = processor; |
3372 | SCHED_STATS_RUNQ_CHANGE(&SCHED(rt_runq)(pset)->runq_stats, rt_runq_count(pset)); |
3373 | rt_runq_count_incr(pset); |
3374 | |
3375 | rt_lock_unlock(pset); |
3376 | |
3377 | return (preempt); |
3378 | } |
3379 | |
3380 | /* |
3381 | * realtime_setrun: |
3382 | * |
3383 | * Dispatch a thread for realtime execution. |
3384 | * |
3385 | * Thread must be locked. Associated pset must |
3386 | * be locked, and is returned unlocked. |
3387 | */ |
3388 | static void |
3389 | realtime_setrun( |
3390 | processor_t processor, |
3391 | thread_t thread) |
3392 | { |
3393 | processor_set_t pset = processor->processor_set; |
3394 | pset_assert_locked(pset); |
3395 | ast_t preempt; |
3396 | |
3397 | sched_ipi_type_t ipi_type = SCHED_IPI_NONE; |
3398 | |
3399 | thread->chosen_processor = processor; |
3400 | |
3401 | /* <rdar://problem/15102234> */ |
3402 | assert(thread->bound_processor == PROCESSOR_NULL); |
3403 | |
3404 | /* |
3405 | * Dispatch directly onto idle processor. |
3406 | */ |
3407 | if ( (thread->bound_processor == processor) |
3408 | && processor->state == PROCESSOR_IDLE) { |
3409 | |
3410 | processor->next_thread = thread; |
3411 | processor_state_update_from_thread(processor, thread); |
3412 | processor->deadline = thread->realtime.deadline; |
3413 | pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING); |
3414 | |
3415 | ipi_type = sched_ipi_action(processor, thread, true, SCHED_IPI_EVENT_BOUND_THR); |
3416 | pset_unlock(pset); |
3417 | sched_ipi_perform(processor, ipi_type); |
3418 | return; |
3419 | } |
3420 | |
3421 | if (processor->current_pri < BASEPRI_RTQUEUES) |
3422 | preempt = (AST_PREEMPT | AST_URGENT); |
3423 | else if (thread->realtime.deadline < processor->deadline) |
3424 | preempt = (AST_PREEMPT | AST_URGENT); |
3425 | else |
3426 | preempt = AST_NONE; |
3427 | |
3428 | realtime_queue_insert(processor, pset, thread); |
3429 | |
3430 | ipi_type = SCHED_IPI_NONE; |
3431 | if (preempt != AST_NONE) { |
3432 | if (processor->state == PROCESSOR_IDLE) { |
3433 | processor->next_thread = THREAD_NULL; |
3434 | processor_state_update_from_thread(processor, thread); |
3435 | processor->deadline = thread->realtime.deadline; |
3436 | pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING); |
3437 | if (processor == current_processor()) { |
3438 | ast_on(preempt); |
3439 | } else { |
3440 | ipi_type = sched_ipi_action(processor, thread, true, SCHED_IPI_EVENT_PREEMPT); |
3441 | } |
3442 | } else if (processor->state == PROCESSOR_DISPATCHING) { |
3443 | if ((processor->next_thread == THREAD_NULL) && ((processor->current_pri < thread->sched_pri) || (processor->deadline > thread->realtime.deadline))) { |
3444 | processor_state_update_from_thread(processor, thread); |
3445 | processor->deadline = thread->realtime.deadline; |
3446 | } |
3447 | } else { |
3448 | if (processor == current_processor()) { |
3449 | ast_on(preempt); |
3450 | } else { |
3451 | ipi_type = sched_ipi_action(processor, thread, false, SCHED_IPI_EVENT_PREEMPT); |
3452 | } |
3453 | } |
3454 | } else { |
3455 | /* Selected processor was too busy, just keep thread enqueued and let other processors drain it naturally. */ |
3456 | } |
3457 | |
3458 | pset_unlock(pset); |
3459 | sched_ipi_perform(processor, ipi_type); |
3460 | } |
3461 | |
3462 | |
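/*
 * sched_ipi_deferred_policy:
 *
 * Decide whether a deferred IPI can be used for the destination CPU.
 * Returns SCHED_IPI_DEFERRED unless a deferred AST is already pending
 * for that CPU; on platforms without deferred AST support this panics.
 */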
3463 | sched_ipi_type_t sched_ipi_deferred_policy(processor_set_t pset, processor_t dst, |
3464 | __unused sched_ipi_event_t event) |
3465 | { |
3466 | #if defined(CONFIG_SCHED_DEFERRED_AST) |
3467 | if (!bit_test(pset->pending_deferred_AST_cpu_mask, dst->cpu_id)) { |
3468 | return SCHED_IPI_DEFERRED; |
3469 | } |
3470 | #else /* CONFIG_SCHED_DEFERRED_AST */ |
panic("Request for deferred IPI on an unsupported platform; pset: %p CPU: %d", pset, dst->cpu_id);
3472 | #endif /* CONFIG_SCHED_DEFERRED_AST */ |
3473 | return SCHED_IPI_NONE; |
3474 | } |
3475 | |
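/*
 * sched_ipi_action:
 *
 * Select the IPI to send to the destination processor for the given
 * event, and mark the corresponding pending-AST bit in the pset so
 * duplicate IPIs are suppressed. Returns SCHED_IPI_NONE if the
 * destination is the current processor or already has an AST pending;
 * the caller performs the returned IPI via sched_ipi_perform().
 */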
3476 | sched_ipi_type_t sched_ipi_action(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event) |
3477 | { |
3478 | sched_ipi_type_t ipi_type = SCHED_IPI_NONE; |
3479 | assert(dst != NULL); |
3480 | |
3481 | processor_set_t pset = dst->processor_set; |
3482 | if (current_processor() == dst) { |
3483 | return SCHED_IPI_NONE; |
3484 | } |
3485 | |
3486 | if (bit_test(pset->pending_AST_cpu_mask, dst->cpu_id)) { |
3487 | return SCHED_IPI_NONE; |
3488 | } |
3489 | |
3490 | ipi_type = SCHED(ipi_policy)(dst, thread, dst_idle, event); |
3491 | switch(ipi_type) { |
3492 | case SCHED_IPI_NONE: |
3493 | return SCHED_IPI_NONE; |
3494 | #if defined(CONFIG_SCHED_DEFERRED_AST) |
3495 | case SCHED_IPI_DEFERRED: |
3496 | bit_set(pset->pending_deferred_AST_cpu_mask, dst->cpu_id); |
3497 | break; |
3498 | #endif /* CONFIG_SCHED_DEFERRED_AST */ |
3499 | default: |
3500 | bit_set(pset->pending_AST_cpu_mask, dst->cpu_id); |
3501 | break; |
3502 | } |
3503 | return ipi_type; |
3504 | } |
3505 | |
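/*
 * sched_ipi_policy:
 *
 * Default policy mapping a scheduler IPI event to an IPI type, based
 * on the event, whether the destination is idle, and whether deferred
 * ASTs are supported on this platform.
 */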
3506 | sched_ipi_type_t sched_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event) |
3507 | { |
3508 | sched_ipi_type_t ipi_type = SCHED_IPI_NONE; |
3509 | boolean_t deferred_ipi_supported = false; |
3510 | processor_set_t pset = dst->processor_set; |
3511 | |
3512 | #if defined(CONFIG_SCHED_DEFERRED_AST) |
3513 | deferred_ipi_supported = true; |
3514 | #endif /* CONFIG_SCHED_DEFERRED_AST */ |
3515 | |
3516 | switch(event) { |
3517 | case SCHED_IPI_EVENT_SPILL: |
3518 | case SCHED_IPI_EVENT_SMT_REBAL: |
3519 | case SCHED_IPI_EVENT_REBALANCE: |
3520 | case SCHED_IPI_EVENT_BOUND_THR: |
3521 | /* |
 * The spill, SMT rebalance, rebalance, and bound thread
 * scenarios always use immediate IPIs.
3524 | */ |
3525 | ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE; |
3526 | break; |
3527 | case SCHED_IPI_EVENT_PREEMPT: |
3528 | /* In the preemption case, use immediate IPIs for RT threads */ |
3529 | if (thread && (thread->sched_pri >= BASEPRI_RTQUEUES)) { |
3530 | ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE; |
3531 | break; |
3532 | } |
3533 | |
/*
 * For non-RT thread preemption:
 * if the core is active, use an immediate IPI;
 * if the core is idle, use a deferred IPI if supported, otherwise an immediate IPI.
 */
3539 | if (deferred_ipi_supported && dst_idle) { |
3540 | return sched_ipi_deferred_policy(pset, dst, event); |
3541 | } |
3542 | ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE; |
3543 | break; |
3544 | default: |
panic("Unrecognized scheduler IPI event type %d", event);
3546 | } |
3547 | assert(ipi_type != SCHED_IPI_NONE); |
3548 | return ipi_type; |
3549 | } |
3550 | |
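/*
 * sched_ipi_perform:
 *
 * Deliver the previously selected IPI to the destination processor:
 * signal it out of idle, cause an AST check, or signal a deferred
 * idle wakeup.
 */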
3551 | void sched_ipi_perform(processor_t dst, sched_ipi_type_t ipi) |
3552 | { |
3553 | switch (ipi) { |
3554 | case SCHED_IPI_NONE: |
3555 | break; |
3556 | case SCHED_IPI_IDLE: |
3557 | machine_signal_idle(dst); |
3558 | break; |
3559 | case SCHED_IPI_IMMEDIATE: |
3560 | cause_ast_check(dst); |
3561 | break; |
3562 | case SCHED_IPI_DEFERRED: |
3563 | machine_signal_idle_deferred(dst); |
3564 | break; |
3565 | default: |
panic("Unrecognized scheduler IPI type: %d", ipi);
3567 | } |
3568 | } |
3569 | |
3570 | #if defined(CONFIG_SCHED_TIMESHARE_CORE) |
3571 | |
3572 | boolean_t |
3573 | priority_is_urgent(int priority) |
3574 | { |
3575 | return bitmap_test(sched_preempt_pri, priority) ? TRUE : FALSE; |
3576 | } |
3577 | |
3578 | #endif /* CONFIG_SCHED_TIMESHARE_CORE */ |
3579 | |
3580 | /* |
3581 | * processor_setrun: |
3582 | * |
3583 | * Dispatch a thread for execution on a |
3584 | * processor. |
3585 | * |
3586 | * Thread must be locked. Associated pset must |
3587 | * be locked, and is returned unlocked. |
3588 | */ |
3589 | static void |
3590 | processor_setrun( |
3591 | processor_t processor, |
3592 | thread_t thread, |
3593 | integer_t options) |
3594 | { |
3595 | processor_set_t pset = processor->processor_set; |
3596 | pset_assert_locked(pset); |
3597 | ast_t preempt; |
3598 | enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing; |
3599 | |
3600 | sched_ipi_type_t ipi_type = SCHED_IPI_NONE; |
3601 | |
3602 | thread->chosen_processor = processor; |
3603 | |
3604 | /* |
3605 | * Dispatch directly onto idle processor. |
3606 | */ |
3607 | if ( (SCHED(direct_dispatch_to_idle_processors) || |
3608 | thread->bound_processor == processor) |
3609 | && processor->state == PROCESSOR_IDLE) { |
3610 | |
3611 | processor->next_thread = thread; |
3612 | processor_state_update_from_thread(processor, thread); |
3613 | processor->deadline = UINT64_MAX; |
3614 | pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING); |
3615 | |
3616 | ipi_type = sched_ipi_action(processor, thread, true, SCHED_IPI_EVENT_BOUND_THR); |
3617 | pset_unlock(pset); |
3618 | sched_ipi_perform(processor, ipi_type); |
3619 | return; |
3620 | } |
3621 | |
3622 | /* |
3623 | * Set preemption mode. |
3624 | */ |
3625 | #if defined(CONFIG_SCHED_DEFERRED_AST) |
3626 | /* TODO: Do we need to care about urgency (see rdar://problem/20136239)? */ |
3627 | #endif |
3628 | if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri) |
3629 | preempt = (AST_PREEMPT | AST_URGENT); |
else if (processor->active_thread && thread_eager_preemption(processor->active_thread))
preempt = (AST_PREEMPT | AST_URGENT);
else if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->sched_pri < thread->base_pri)) {
if (SCHED(priority_is_urgent)(thread->base_pri) && thread->sched_pri > processor->current_pri) {
3634 | preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE; |
3635 | } else { |
3636 | preempt = AST_NONE; |
3637 | } |
3638 | } else |
3639 | preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE; |
3640 | |
3641 | if ((options & (SCHED_PREEMPT|SCHED_REBALANCE)) == (SCHED_PREEMPT|SCHED_REBALANCE)) { |
3642 | /* |
3643 | * Having gone to the trouble of forcing this thread off a less preferred core, |
3644 | * we should force the preferable core to reschedule immediately to give this |
3645 | * thread a chance to run instead of just sitting on the run queue where |
3646 | * it may just be stolen back by the idle core we just forced it off. |
3647 | */ |
3648 | preempt |= AST_PREEMPT; |
3649 | } |
3650 | |
3651 | SCHED(processor_enqueue)(processor, thread, options); |
3652 | sched_update_pset_load_average(pset); |
3653 | |
3654 | if (preempt != AST_NONE) { |
3655 | if (processor->state == PROCESSOR_IDLE) { |
3656 | processor->next_thread = THREAD_NULL; |
3657 | processor_state_update_from_thread(processor, thread); |
3658 | processor->deadline = UINT64_MAX; |
3659 | pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING); |
3660 | ipi_action = eExitIdle; |
3661 | } else if ( processor->state == PROCESSOR_DISPATCHING) { |
3662 | if ((processor->next_thread == THREAD_NULL) && (processor->current_pri < thread->sched_pri)) { |
3663 | processor_state_update_from_thread(processor, thread); |
3664 | processor->deadline = UINT64_MAX; |
3665 | } |
3666 | } else if ( (processor->state == PROCESSOR_RUNNING || |
3667 | processor->state == PROCESSOR_SHUTDOWN) && |
3668 | (thread->sched_pri >= processor->current_pri)) { |
3669 | ipi_action = eInterruptRunning; |
3670 | } |
3671 | } else { |
3672 | /* |
3673 | * New thread is not important enough to preempt what is running, but |
3674 | * special processor states may need special handling |
3675 | */ |
3676 | if (processor->state == PROCESSOR_SHUTDOWN && |
3677 | thread->sched_pri >= processor->current_pri ) { |
3678 | ipi_action = eInterruptRunning; |
3679 | } else if (processor->state == PROCESSOR_IDLE) { |
3680 | |
3681 | processor->next_thread = THREAD_NULL; |
3682 | processor_state_update_from_thread(processor, thread); |
3683 | processor->deadline = UINT64_MAX; |
3684 | pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING); |
3685 | |
3686 | ipi_action = eExitIdle; |
3687 | } |
3688 | } |
3689 | |
3690 | if (ipi_action != eDoNothing) { |
3691 | if (processor == current_processor()) { |
3692 | if (csw_check_locked(processor, pset, AST_NONE) != AST_NONE) |
3693 | ast_on(preempt); |
3694 | } else { |
3695 | sched_ipi_event_t event = (options & SCHED_REBALANCE) ? SCHED_IPI_EVENT_REBALANCE : SCHED_IPI_EVENT_PREEMPT; |
3696 | ipi_type = sched_ipi_action(processor, thread, (ipi_action == eExitIdle), event); |
3697 | } |
3698 | } |
3699 | pset_unlock(pset); |
3700 | sched_ipi_perform(processor, ipi_type); |
3701 | } |
3702 | |
3703 | /* |
3704 | * choose_next_pset: |
3705 | * |
3706 | * Return the next sibling pset containing |
3707 | * available processors. |
3708 | * |
3709 | * Returns the original pset if none other is |
3710 | * suitable. |
3711 | */ |
3712 | static processor_set_t |
3713 | choose_next_pset( |
3714 | processor_set_t pset) |
3715 | { |
3716 | processor_set_t nset = pset; |
3717 | |
3718 | do { |
3719 | nset = next_pset(nset); |
3720 | } while (nset->online_processor_count < 1 && nset != pset); |
3721 | |
3722 | return (nset); |
3723 | } |
3724 | |
3725 | /* |
3726 | * choose_processor: |
3727 | * |
3728 | * Choose a processor for the thread, beginning at |
3729 | * the pset. Accepts an optional processor hint in |
3730 | * the pset. |
3731 | * |
3732 | * Returns a processor, possibly from a different pset. |
3733 | * |
3734 | * The thread must be locked. The pset must be locked, |
3735 | * and the resulting pset is locked on return. |
3736 | */ |
3737 | processor_t |
3738 | choose_processor( |
3739 | processor_set_t starting_pset, |
3740 | processor_t processor, |
3741 | thread_t thread) |
3742 | { |
3743 | processor_set_t pset = starting_pset; |
3744 | processor_set_t nset; |
3745 | |
3746 | assert(thread->sched_pri <= BASEPRI_RTQUEUES); |
3747 | |
3748 | /* |
3749 | * Prefer the hinted processor, when appropriate. |
3750 | */ |
3751 | |
3752 | /* Fold last processor hint from secondary processor to its primary */ |
3753 | if (processor != PROCESSOR_NULL) { |
3754 | processor = processor->processor_primary; |
3755 | } |
3756 | |
3757 | /* |
3758 | * Only consult platform layer if pset is active, which |
3759 | * it may not be in some cases when a multi-set system |
3760 | * is going to sleep. |
3761 | */ |
3762 | if (pset->online_processor_count) { |
3763 | if ((processor == PROCESSOR_NULL) || (processor->processor_set == pset && processor->state == PROCESSOR_IDLE)) { |
3764 | processor_t mc_processor = machine_choose_processor(pset, processor); |
3765 | if (mc_processor != PROCESSOR_NULL) |
3766 | processor = mc_processor->processor_primary; |
3767 | } |
3768 | } |
3769 | |
3770 | /* |
3771 | * At this point, we may have a processor hint, and we may have |
3772 | * an initial starting pset. If the hint is not in the pset, or |
3773 | * if the hint is for a processor in an invalid state, discard |
3774 | * the hint. |
3775 | */ |
3776 | if (processor != PROCESSOR_NULL) { |
3777 | if (processor->processor_set != pset) { |
3778 | processor = PROCESSOR_NULL; |
3779 | } else if (!processor->is_recommended) { |
3780 | processor = PROCESSOR_NULL; |
3781 | } else { |
3782 | switch (processor->state) { |
3783 | case PROCESSOR_START: |
3784 | case PROCESSOR_SHUTDOWN: |
3785 | case PROCESSOR_OFF_LINE: |
3786 | /* |
3787 | * Hint is for a processor that cannot support running new threads. |
3788 | */ |
3789 | processor = PROCESSOR_NULL; |
3790 | break; |
3791 | case PROCESSOR_IDLE: |
3792 | /* |
3793 | * Hint is for an idle processor. Assume it is no worse than any other |
3794 | * idle processor. The platform layer had an opportunity to provide |
3795 | * the "least cost idle" processor above. |
3796 | */ |
3797 | return (processor); |
3798 | case PROCESSOR_RUNNING: |
3799 | case PROCESSOR_DISPATCHING: |
3800 | /* |
3801 | * Hint is for an active CPU. This fast-path allows |
3802 | * realtime threads to preempt non-realtime threads |
3803 | * to regain their previous executing processor. |
3804 | */ |
3805 | if ((thread->sched_pri >= BASEPRI_RTQUEUES) && |
3806 | (processor->current_pri < BASEPRI_RTQUEUES)) |
3807 | return (processor); |
3808 | |
3809 | /* Otherwise, use hint as part of search below */ |
3810 | break; |
3811 | default: |
3812 | processor = PROCESSOR_NULL; |
3813 | break; |
3814 | } |
3815 | } |
3816 | } |
3817 | |
3818 | /* |
3819 | * Iterate through the processor sets to locate |
3820 | * an appropriate processor. Seed results with |
3821 | * a last-processor hint, if available, so that |
3822 | * a search must find something strictly better |
3823 | * to replace it. |
3824 | * |
3825 | * A primary/secondary pair of SMT processors are |
3826 | * "unpaired" if the primary is busy but its |
3827 | * corresponding secondary is idle (so the physical |
3828 | * core has full use of its resources). |
3829 | */ |
3830 | |
3831 | integer_t lowest_priority = MAXPRI + 1; |
3832 | integer_t lowest_secondary_priority = MAXPRI + 1; |
3833 | integer_t lowest_unpaired_primary_priority = MAXPRI + 1; |
3834 | integer_t lowest_count = INT_MAX; |
3835 | uint64_t furthest_deadline = 1; |
3836 | processor_t lp_processor = PROCESSOR_NULL; |
3837 | processor_t lp_unpaired_primary_processor = PROCESSOR_NULL; |
3838 | processor_t lp_unpaired_secondary_processor = PROCESSOR_NULL; |
3839 | processor_t lp_paired_secondary_processor = PROCESSOR_NULL; |
3840 | processor_t lc_processor = PROCESSOR_NULL; |
3841 | processor_t fd_processor = PROCESSOR_NULL; |
3842 | |
3843 | if (processor != PROCESSOR_NULL) { |
3844 | /* All other states should be enumerated above. */ |
3845 | assert(processor->state == PROCESSOR_RUNNING || processor->state == PROCESSOR_DISPATCHING); |
3846 | |
3847 | lowest_priority = processor->current_pri; |
3848 | lp_processor = processor; |
3849 | |
3850 | if (processor->current_pri >= BASEPRI_RTQUEUES) { |
3851 | furthest_deadline = processor->deadline; |
3852 | fd_processor = processor; |
3853 | } |
3854 | |
3855 | lowest_count = SCHED(processor_runq_count)(processor); |
3856 | lc_processor = processor; |
3857 | } |
3858 | |
3859 | do { |
3860 | /* |
3861 | * Choose an idle processor, in pset traversal order |
3862 | */ |
3863 | |
3864 | uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] & |
3865 | pset->primary_map & |
3866 | pset->recommended_bitmask & |
3867 | ~pset->pending_AST_cpu_mask); |
3868 | |
3869 | int cpuid = lsb_first(idle_primary_map); |
3870 | if (cpuid >= 0) { |
3871 | processor = processor_array[cpuid]; |
3872 | return processor; |
3873 | } |
3874 | |
3875 | /* |
3876 | * Otherwise, enumerate active and idle processors to find primary candidates |
3877 | * with lower priority/etc. |
3878 | */ |
3879 | |
3880 | uint64_t active_map = ((pset->cpu_state_map[PROCESSOR_RUNNING] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) & |
3881 | pset->recommended_bitmask & |
3882 | ~pset->pending_AST_cpu_mask); |
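/*
 * Rotate the active map so that the search begins just after the
 * last chosen CPU, spreading placements across the pset.
 */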
3883 | active_map = bit_ror64(active_map, (pset->last_chosen + 1)); |
3884 | for (int rotid = lsb_first(active_map); rotid >= 0; rotid = lsb_next(active_map, rotid)) { |
3885 | cpuid = ((rotid + pset->last_chosen + 1) & 63); |
3886 | processor = processor_array[cpuid]; |
3887 | |
3888 | integer_t cpri = processor->current_pri; |
3889 | if (processor->processor_primary != processor) { |
3890 | if (cpri < lowest_secondary_priority) { |
3891 | lowest_secondary_priority = cpri; |
3892 | lp_paired_secondary_processor = processor; |
3893 | } |
3894 | } else { |
3895 | if (cpri < lowest_priority) { |
3896 | lowest_priority = cpri; |
3897 | lp_processor = processor; |
3898 | } |
3899 | } |
3900 | |
3901 | if ((cpri >= BASEPRI_RTQUEUES) && (processor->deadline > furthest_deadline)) { |
3902 | furthest_deadline = processor->deadline; |
3903 | fd_processor = processor; |
3904 | } |
3905 | |
3906 | integer_t ccount = SCHED(processor_runq_count)(processor); |
3907 | if (ccount < lowest_count) { |
3908 | lowest_count = ccount; |
3909 | lc_processor = processor; |
3910 | } |
3911 | } |
3912 | |
3913 | /* |
 * For SMT configs, these idle secondary processors must have an active primary;
 * otherwise the idle primary would have short-circuited the loop above.
3916 | */ |
3917 | uint64_t idle_secondary_map = (pset->cpu_state_map[PROCESSOR_IDLE] & |
3918 | ~pset->primary_map & |
3919 | pset->recommended_bitmask & |
3920 | ~pset->pending_AST_cpu_mask); |
3921 | |
3922 | for (cpuid = lsb_first(idle_secondary_map); cpuid >= 0; cpuid = lsb_next(idle_secondary_map, cpuid)) { |
3923 | processor = processor_array[cpuid]; |
3924 | |
3925 | processor_t cprimary = processor->processor_primary; |
3926 | |
3927 | if (!cprimary->is_recommended) { |
3928 | continue; |
3929 | } |
3930 | if (bit_test(pset->pending_AST_cpu_mask, cprimary->cpu_id)) { |
3931 | continue; |
3932 | } |
3933 | |
3934 | /* If the primary processor is offline or starting up, it's not a candidate for this path */ |
3935 | if (cprimary->state == PROCESSOR_RUNNING || cprimary->state == PROCESSOR_DISPATCHING) { |
3936 | integer_t primary_pri = cprimary->current_pri; |
3937 | |
3938 | if (primary_pri < lowest_unpaired_primary_priority) { |
3939 | lowest_unpaired_primary_priority = primary_pri; |
3940 | lp_unpaired_primary_processor = cprimary; |
3941 | lp_unpaired_secondary_processor = processor; |
3942 | } |
3943 | } |
3944 | } |
3945 | |
3946 | |
3947 | if (thread->sched_pri >= BASEPRI_RTQUEUES) { |
3948 | |
3949 | /* |
3950 | * For realtime threads, the most important aspect is |
3951 | * scheduling latency, so we attempt to assign threads |
3952 | * to good preemption candidates (assuming an idle primary |
3953 | * processor was not available above). |
3954 | */ |
3955 | |
3956 | if (thread->sched_pri > lowest_unpaired_primary_priority) { |
3957 | pset->last_chosen = lp_unpaired_primary_processor->cpu_id; |
3958 | return lp_unpaired_primary_processor; |
3959 | } |
3960 | if (thread->sched_pri > lowest_priority) { |
3961 | pset->last_chosen = lp_processor->cpu_id; |
3962 | return lp_processor; |
3963 | } |
3964 | if (sched_allow_rt_smt && (thread->sched_pri > lowest_secondary_priority)) { |
3965 | pset->last_chosen = lp_paired_secondary_processor->cpu_id; |
3966 | return lp_paired_secondary_processor; |
3967 | } |
3968 | if (thread->realtime.deadline < furthest_deadline) |
3969 | return fd_processor; |
3970 | |
3971 | /* |
 * If all primary and secondary CPUs are busy with realtime
 * threads whose deadlines are earlier than ours, move on to the
 * next pset.
3975 | */ |
3976 | } |
3977 | else { |
3978 | |
3979 | if (thread->sched_pri > lowest_unpaired_primary_priority) { |
3980 | pset->last_chosen = lp_unpaired_primary_processor->cpu_id; |
3981 | return lp_unpaired_primary_processor; |
3982 | } |
3983 | if (thread->sched_pri > lowest_priority) { |
3984 | pset->last_chosen = lp_processor->cpu_id; |
3985 | return lp_processor; |
3986 | } |
3987 | |
3988 | /* |
 * If all primary processors in this pset are running higher
 * priority threads, move on to the next pset. Only when we have
 * exhausted this search do we fall back to other heuristics.
3992 | */ |
3993 | } |
3994 | |
3995 | /* |
3996 | * Move onto the next processor set. |
3997 | */ |
3998 | nset = next_pset(pset); |
3999 | |
4000 | if (nset != starting_pset) { |
4001 | pset_unlock(pset); |
4002 | |
4003 | pset = nset; |
4004 | pset_lock(pset); |
4005 | } |
4006 | } while (nset != starting_pset); |
4007 | |
/*
 * Make sure that we pick a running processor,
 * and that the correct processor set is locked.
 * Since we may have unlocked the candidate processor's
 * pset, it may have changed state.
 *
 * All primary processors are running higher priority
 * threads, so the only options left are enqueuing on
 * the secondary processor that would perturb the
 * lowest-priority primary, or the least busy primary.
 */
4019 | do { |
4020 | |
4021 | /* lowest_priority is evaluated in the main loops above */ |
4022 | if (lp_unpaired_secondary_processor != PROCESSOR_NULL) { |
4023 | processor = lp_unpaired_secondary_processor; |
4024 | lp_unpaired_secondary_processor = PROCESSOR_NULL; |
4025 | } else if (lp_paired_secondary_processor != PROCESSOR_NULL) { |
4026 | processor = lp_paired_secondary_processor; |
4027 | lp_paired_secondary_processor = PROCESSOR_NULL; |
4028 | } else if (lc_processor != PROCESSOR_NULL) { |
4029 | processor = lc_processor; |
4030 | lc_processor = PROCESSOR_NULL; |
4031 | } else { |
4032 | /* |
4033 | * All processors are executing higher |
4034 | * priority threads, and the lowest_count |
4035 | * candidate was not usable |
4036 | */ |
4037 | processor = master_processor; |
4038 | } |
4039 | |
4040 | /* |
4041 | * Check that the correct processor set is |
4042 | * returned locked. |
4043 | */ |
4044 | if (pset != processor->processor_set) { |
4045 | pset_unlock(pset); |
4046 | pset = processor->processor_set; |
4047 | pset_lock(pset); |
4048 | } |
4049 | |
4050 | /* |
4051 | * We must verify that the chosen processor is still available. |
4052 | * master_processor is an exception, since we may need to preempt |
4053 | * a running thread on it during processor shutdown (for sleep), |
4054 | * and that thread needs to be enqueued on its runqueue to run |
4055 | * when the processor is restarted. |
4056 | */ |
4057 | if (processor != master_processor && (processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE)) |
4058 | processor = PROCESSOR_NULL; |
4059 | |
4060 | } while (processor == PROCESSOR_NULL); |
4061 | |
4062 | pset->last_chosen = processor->cpu_id; |
4063 | return processor; |
4064 | } |
4065 | |
4066 | /* |
4067 | * thread_setrun: |
4068 | * |
4069 | * Dispatch thread for execution, onto an idle |
4070 | * processor or run queue, and signal a preemption |
4071 | * as appropriate. |
4072 | * |
4073 | * Thread must be locked. |
4074 | */ |
4075 | void |
4076 | thread_setrun( |
4077 | thread_t thread, |
4078 | integer_t options) |
4079 | { |
4080 | processor_t processor; |
4081 | processor_set_t pset; |
4082 | |
4083 | assert((thread->state & (TH_RUN|TH_WAIT|TH_UNINT|TH_TERMINATE|TH_TERMINATE2)) == TH_RUN); |
4084 | assert(thread->runq == PROCESSOR_NULL); |
4085 | |
4086 | /* |
4087 | * Update priority if needed. |
4088 | */ |
4089 | if (SCHED(can_update_priority)(thread)) |
4090 | SCHED(update_priority)(thread); |
4091 | |
4092 | thread->sfi_class = sfi_thread_classify(thread); |
4093 | |
4094 | assert(thread->runq == PROCESSOR_NULL); |
4095 | |
4096 | #if __SMP__ |
4097 | if (thread->bound_processor == PROCESSOR_NULL) { |
4098 | /* |
4099 | * Unbound case. |
4100 | */ |
4101 | if (thread->affinity_set != AFFINITY_SET_NULL) { |
4102 | /* |
4103 | * Use affinity set policy hint. |
4104 | */ |
4105 | pset = thread->affinity_set->aset_pset; |
4106 | pset_lock(pset); |
4107 | |
4108 | processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread); |
4109 | pset = processor->processor_set; |
4110 | |
4111 | SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE, |
4112 | (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0); |
4113 | } else if (thread->last_processor != PROCESSOR_NULL) { |
4114 | /* |
4115 | * Simple (last processor) affinity case. |
4116 | */ |
4117 | processor = thread->last_processor; |
4118 | pset = processor->processor_set; |
4119 | pset_lock(pset); |
4120 | processor = SCHED(choose_processor)(pset, processor, thread); |
4121 | pset = processor->processor_set; |
4122 | |
4123 | SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE, |
4124 | (uintptr_t)thread_tid(thread), thread->last_processor->cpu_id, processor->cpu_id, processor->state, 0); |
4125 | } else { |
4126 | /* |
4127 | * No Affinity case: |
4128 | * |
 * Utilize a per-task hint to spread threads
4130 | * among the available processor sets. |
4131 | */ |
4132 | task_t task = thread->task; |
4133 | |
4134 | pset = task->pset_hint; |
4135 | if (pset == PROCESSOR_SET_NULL) |
4136 | pset = current_processor()->processor_set; |
4137 | |
4138 | pset = choose_next_pset(pset); |
4139 | pset_lock(pset); |
4140 | |
4141 | processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread); |
4142 | pset = processor->processor_set; |
4143 | task->pset_hint = pset; |
4144 | |
4145 | SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE, |
4146 | (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0); |
4147 | } |
4148 | } else { |
4149 | /* |
4150 | * Bound case: |
4151 | * |
4152 | * Unconditionally dispatch on the processor. |
4153 | */ |
4154 | processor = thread->bound_processor; |
4155 | pset = processor->processor_set; |
4156 | pset_lock(pset); |
4157 | |
4158 | SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE, |
4159 | (uintptr_t)thread_tid(thread), (uintptr_t)-2, processor->cpu_id, processor->state, 0); |
4160 | } |
4161 | #else /* !__SMP__ */ |
4162 | /* Only one processor to choose */ |
4163 | assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == master_processor); |
4164 | processor = master_processor; |
4165 | pset = processor->processor_set; |
4166 | pset_lock(pset); |
4167 | #endif /* !__SMP__ */ |
4168 | |
4169 | /* |
4170 | * Dispatch the thread on the chosen processor. |
4171 | * TODO: This should be based on sched_mode, not sched_pri |
4172 | */ |
4173 | if (thread->sched_pri >= BASEPRI_RTQUEUES) { |
4174 | realtime_setrun(processor, thread); |
4175 | } else { |
4176 | processor_setrun(processor, thread, options); |
4177 | } |
4178 | /* pset is now unlocked */ |
4179 | if (thread->bound_processor == PROCESSOR_NULL) { |
4180 | SCHED(check_spill)(pset, thread); |
4181 | } |
4182 | } |
4183 | |
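/*
 * task_choose_pset:
 *
 * Choose a starting pset for the task based on its pset hint,
 * advancing to the next pset with online processors. Returns
 * PROCESSOR_SET_NULL if the task has no hint.
 */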
4184 | processor_set_t |
4185 | task_choose_pset( |
4186 | task_t task) |
4187 | { |
4188 | processor_set_t pset = task->pset_hint; |
4189 | |
4190 | if (pset != PROCESSOR_SET_NULL) |
4191 | pset = choose_next_pset(pset); |
4192 | |
4193 | return (pset); |
4194 | } |
4195 | |
4196 | /* |
4197 | * Check for a preemption point in |
4198 | * the current context. |
4199 | * |
4200 | * Called at splsched with thread locked. |
4201 | */ |
4202 | ast_t |
4203 | csw_check( |
4204 | processor_t processor, |
4205 | ast_t check_reason) |
4206 | { |
4207 | processor_set_t pset = processor->processor_set; |
4208 | ast_t result; |
4209 | |
4210 | pset_lock(pset); |
4211 | |
4212 | /* If we were sent a remote AST and interrupted a running processor, acknowledge it here with pset lock held */ |
4213 | bit_clear(pset->pending_AST_cpu_mask, processor->cpu_id); |
4214 | |
4215 | result = csw_check_locked(processor, pset, check_reason); |
4216 | |
4217 | pset_unlock(pset); |
4218 | |
4219 | return result; |
4220 | } |
4221 | |
4222 | /* |
4223 | * Check for preemption at splsched with |
4224 | * pset and thread locked |
4225 | */ |
4226 | ast_t |
4227 | csw_check_locked( |
4228 | processor_t processor, |
4229 | processor_set_t pset, |
4230 | ast_t check_reason) |
4231 | { |
4232 | ast_t result; |
4233 | thread_t thread = processor->active_thread; |
4234 | |
4235 | if (processor->first_timeslice) { |
4236 | if (rt_runq_count(pset) > 0) |
4237 | return (check_reason | AST_PREEMPT | AST_URGENT); |
4238 | } |
4239 | else { |
4240 | if (rt_runq_count(pset) > 0) { |
4241 | if (BASEPRI_RTQUEUES > processor->current_pri) |
4242 | return (check_reason | AST_PREEMPT | AST_URGENT); |
4243 | else |
4244 | return (check_reason | AST_PREEMPT); |
4245 | } |
4246 | } |
4247 | |
4248 | #if __SMP__ |
4249 | /* |
4250 | * If the current thread is running on a processor that is no longer recommended, |
4251 | * urgently preempt it, at which point thread_select() should |
4252 | * try to idle the processor and re-dispatch the thread to a recommended processor. |
4253 | */ |
4254 | if (!processor->is_recommended) { |
4255 | return (check_reason | AST_PREEMPT | AST_URGENT); |
4256 | } |
4257 | #endif |
4258 | |
4259 | result = SCHED(processor_csw_check)(processor); |
4260 | if (result != AST_NONE) |
4261 | return (check_reason | result | (thread_eager_preemption(thread) ? AST_URGENT : AST_NONE)); |
4262 | |
4263 | #if __SMP__ |
4264 | /* |
4265 | * Same for avoid-processor |
4266 | * |
4267 | * TODO: Should these set AST_REBALANCE? |
4268 | */ |
4269 | if (SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread)) { |
4270 | return (check_reason | AST_PREEMPT); |
4271 | } |
4272 | |
4273 | /* |
4274 | * Even though we could continue executing on this processor, a |
4275 | * secondary SMT core should try to shed load to another primary core. |
4276 | * |
4277 | * TODO: Should this do the same check that thread_select does? i.e. |
4278 | * if no bound threads target this processor, and idle primaries exist, preempt |
4279 | * The case of RT threads existing is already taken care of above |
4280 | */ |
4281 | |
4282 | if (processor->current_pri < BASEPRI_RTQUEUES && |
4283 | processor->processor_primary != processor) |
4284 | return (check_reason | AST_PREEMPT); |
4285 | #endif |
4286 | |
4287 | if (thread->state & TH_SUSP) |
4288 | return (check_reason | AST_PREEMPT); |
4289 | |
4290 | #if CONFIG_SCHED_SFI |
4291 | /* |
 * The current thread may not need to be preempted, but it may need
 * to begin an SFI wait.
4294 | */ |
4295 | result = sfi_thread_needs_ast(thread, NULL); |
4296 | if (result != AST_NONE) |
4297 | return (check_reason | result); |
4298 | #endif |
4299 | |
4300 | return (AST_NONE); |
4301 | } |
4302 | |
4303 | /* |
4304 | * set_sched_pri: |
4305 | * |
4306 | * Set the scheduled priority of the specified thread. |
4307 | * |
4308 | * This may cause the thread to change queues. |
4309 | * |
4310 | * Thread must be locked. |
4311 | */ |
4312 | void |
4313 | set_sched_pri( |
4314 | thread_t thread, |
4315 | int new_priority, |
4316 | set_sched_pri_options_t options) |
4317 | { |
4318 | thread_t cthread = current_thread(); |
4319 | boolean_t is_current_thread = (thread == cthread) ? TRUE : FALSE; |
4320 | int curgency, nurgency; |
4321 | uint64_t urgency_param1, urgency_param2; |
4322 | boolean_t removed_from_runq = FALSE; |
4323 | |
4324 | bool lazy_update = ((options & SETPRI_LAZY) == SETPRI_LAZY); |
4325 | |
4326 | int old_priority = thread->sched_pri; |
4327 | |
4328 | /* If we're already at this priority, no need to mess with the runqueue */ |
4329 | if (new_priority == old_priority) |
4330 | return; |
4331 | |
4332 | if (is_current_thread) { |
4333 | assert(thread->runq == PROCESSOR_NULL); |
4334 | curgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2); |
4335 | } else { |
4336 | removed_from_runq = thread_run_queue_remove(thread); |
4337 | } |
4338 | |
4339 | thread->sched_pri = new_priority; |
4340 | |
4341 | KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY), |
4342 | (uintptr_t)thread_tid(thread), |
4343 | thread->base_pri, |
4344 | thread->sched_pri, |
4345 | thread->sched_usage, |
4346 | 0); |
4347 | |
4348 | if (is_current_thread) { |
4349 | nurgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2); |
4350 | /* |
4351 | * set_sched_pri doesn't alter RT params. We expect direct base priority/QoS |
4352 | * class alterations from user space to occur relatively infrequently, hence |
4353 | * those are lazily handled. QoS classes have distinct priority bands, and QoS |
4354 | * inheritance is expected to involve priority changes. |
4355 | */ |
4356 | uint64_t ctime = mach_approximate_time(); |
4357 | if (nurgency != curgency) { |
4358 | thread_tell_urgency(nurgency, urgency_param1, urgency_param2, 0, thread); |
4359 | } |
4360 | machine_thread_going_on_core(thread, nurgency, 0, 0, ctime); |
4361 | } |
4362 | |
4363 | if (removed_from_runq) |
4364 | thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ); |
4365 | else if (thread->state & TH_RUN) { |
4366 | processor_t processor = thread->last_processor; |
4367 | |
4368 | if (is_current_thread) { |
4369 | processor_state_update_from_thread(processor, thread); |
4370 | |
4371 | /* |
4372 | * When dropping in priority, check if the thread no longer belongs on core. |
4373 | * If a thread raises its own priority, don't aggressively rebalance it. |
4374 | * <rdar://problem/31699165> |
4375 | */ |
4376 | if (!lazy_update && new_priority < old_priority) { |
4377 | ast_t preempt; |
4378 | |
4379 | if ((preempt = csw_check(processor, AST_NONE)) != AST_NONE) |
4380 | ast_on(preempt); |
4381 | } |
4382 | } else if (!lazy_update && processor != PROCESSOR_NULL && |
4383 | processor != current_processor() && processor->active_thread == thread) { |
4384 | cause_ast_check(processor); |
4385 | } |
4386 | } |
4387 | } |
4388 | |
4389 | /* |
4390 | * thread_run_queue_remove_for_handoff |
4391 | * |
4392 | * Pull a thread or its (recursive) push target out of the runqueue |
4393 | * so that it is ready for thread_run() |
4394 | * |
4395 | * Called at splsched |
4396 | * |
4397 | * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled. |
4398 | * This may be different than the thread that was passed in. |
4399 | */ |
4400 | thread_t |
4401 | thread_run_queue_remove_for_handoff(thread_t thread) { |
4402 | |
4403 | thread_t pulled_thread = THREAD_NULL; |
4404 | |
4405 | thread_lock(thread); |
4406 | |
4407 | /* |
4408 | * Check that the thread is not bound |
4409 | * to a different processor, and that realtime |
4410 | * is not involved. |
4411 | * |
 * Next, pull it off its run queue. If it
 * can't be removed, it's not eligible.
4414 | */ |
4415 | |
4416 | processor_t processor = current_processor(); |
4417 | if (processor->current_pri < BASEPRI_RTQUEUES && thread->sched_pri < BASEPRI_RTQUEUES && |
4418 | (thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)) { |
4419 | |
4420 | if (thread_run_queue_remove(thread)) |
4421 | pulled_thread = thread; |
4422 | } |
4423 | |
4424 | thread_unlock(thread); |
4425 | |
4426 | return pulled_thread; |
4427 | } |
4428 | |
4429 | /* |
4430 | * thread_run_queue_remove: |
4431 | * |
4432 | * Remove a thread from its current run queue and |
4433 | * return TRUE if successful. |
4434 | * |
4435 | * Thread must be locked. |
4436 | * |
4437 | * If thread->runq is PROCESSOR_NULL, the thread will not re-enter the |
4438 | * run queues because the caller locked the thread. Otherwise |
4439 | * the thread is on a run queue, but could be chosen for dispatch |
4440 | * and removed by another processor under a different lock, which |
4441 | * will set thread->runq to PROCESSOR_NULL. |
4442 | * |
4443 | * Hence the thread select path must not rely on anything that could |
4444 | * be changed under the thread lock after calling this function, |
4445 | * most importantly thread->sched_pri. |
4446 | */ |
4447 | boolean_t |
4448 | thread_run_queue_remove( |
4449 | thread_t thread) |
4450 | { |
4451 | boolean_t removed = FALSE; |
4452 | processor_t processor = thread->runq; |
4453 | |
4454 | if ((thread->state & (TH_RUN|TH_WAIT)) == TH_WAIT) { |
4455 | /* Thread isn't runnable */ |
4456 | assert(thread->runq == PROCESSOR_NULL); |
4457 | return FALSE; |
4458 | } |
4459 | |
4460 | if (processor == PROCESSOR_NULL) { |
4461 | /* |
4462 | * The thread is either not on the runq, |
4463 | * or is in the midst of being removed from the runq. |
4464 | * |
4465 | * runq is set to NULL under the pset lock, not the thread |
4466 | * lock, so the thread may still be in the process of being dequeued |
 * from the runq. It will wait in thread_invoke() for the thread lock
 * to be dropped.
4469 | */ |
4470 | |
4471 | return FALSE; |
4472 | } |
4473 | |
4474 | if (thread->sched_pri < BASEPRI_RTQUEUES) { |
4475 | return SCHED(processor_queue_remove)(processor, thread); |
4476 | } |
4477 | |
4478 | processor_set_t pset = processor->processor_set; |
4479 | |
4480 | rt_lock_lock(pset); |
4481 | |
4482 | if (thread->runq != PROCESSOR_NULL) { |
4483 | /* |
4484 | * Thread is on the RT run queue and we have a lock on |
4485 | * that run queue. |
4486 | */ |
4487 | |
4488 | remqueue(&thread->runq_links); |
4489 | SCHED_STATS_RUNQ_CHANGE(&SCHED(rt_runq)(pset)->runq_stats, rt_runq_count(pset)); |
4490 | rt_runq_count_decr(pset); |
4491 | |
4492 | thread->runq = PROCESSOR_NULL; |
4493 | |
4494 | removed = TRUE; |
4495 | } |
4496 | |
4497 | rt_lock_unlock(pset); |
4498 | |
4499 | return (removed); |
4500 | } |
4501 | |
4502 | /* |
4503 | * Put the thread back where it goes after a thread_run_queue_remove |
4504 | * |
4505 | * Thread must have been removed under the same thread lock hold |
4506 | * |
4507 | * thread locked, at splsched |
4508 | */ |
4509 | void |
4510 | thread_run_queue_reinsert(thread_t thread, integer_t options) |
4511 | { |
4512 | assert(thread->runq == PROCESSOR_NULL); |
4513 | assert(thread->state & (TH_RUN)); |
4514 | |
4515 | thread_setrun(thread, options); |
4516 | } |
4517 | |
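/*
 * sys_override_cpu_throttle:
 *
 * Set or clear the CPU throttle override: enabling the override
 * disables throttling, disabling it restores throttling.
 */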
4518 | void |
4519 | sys_override_cpu_throttle(boolean_t enable_override) |
4520 | { |
4521 | if (enable_override) |
4522 | cpu_throttle_enabled = 0; |
4523 | else |
4524 | cpu_throttle_enabled = 1; |
4525 | } |
4526 | |
4527 | int |
4528 | thread_get_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2) |
4529 | { |
4530 | if (thread == NULL || (thread->state & TH_IDLE)) { |
4531 | *arg1 = 0; |
4532 | *arg2 = 0; |
4533 | |
4534 | return (THREAD_URGENCY_NONE); |
4535 | } else if (thread->sched_mode == TH_MODE_REALTIME) { |
4536 | *arg1 = thread->realtime.period; |
4537 | *arg2 = thread->realtime.deadline; |
4538 | |
4539 | return (THREAD_URGENCY_REAL_TIME); |
4540 | } else if (cpu_throttle_enabled && |
4541 | ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) { |
4542 | /* |
4543 | * Background urgency applied when thread priority is MAXPRI_THROTTLE or lower and thread is not promoted |
4544 | */ |
4545 | *arg1 = thread->sched_pri; |
4546 | *arg2 = thread->base_pri; |
4547 | |
4548 | return (THREAD_URGENCY_BACKGROUND); |
4549 | } else { |
4550 | /* For otherwise unclassified threads, report throughput QoS |
4551 | * parameters |
4552 | */ |
4553 | *arg1 = proc_get_effective_thread_policy(thread, TASK_POLICY_THROUGH_QOS); |
4554 | *arg2 = proc_get_effective_task_policy(thread->task, TASK_POLICY_THROUGH_QOS); |
4555 | |
4556 | return (THREAD_URGENCY_NORMAL); |
4557 | } |
4558 | } |
4559 | |
4560 | perfcontrol_class_t |
4561 | thread_get_perfcontrol_class(thread_t thread) |
4562 | { |
4563 | /* Special case handling */ |
4564 | if (thread->state & TH_IDLE) |
4565 | return PERFCONTROL_CLASS_IDLE; |
4566 | if (thread->task == kernel_task) |
4567 | return PERFCONTROL_CLASS_KERNEL; |
4568 | if (thread->sched_mode == TH_MODE_REALTIME) |
4569 | return PERFCONTROL_CLASS_REALTIME; |
4570 | |
4571 | /* perfcontrol_class based on base_pri */ |
4572 | if (thread->base_pri <= MAXPRI_THROTTLE) |
4573 | return PERFCONTROL_CLASS_BACKGROUND; |
4574 | else if (thread->base_pri <= BASEPRI_UTILITY) |
4575 | return PERFCONTROL_CLASS_UTILITY; |
4576 | else if (thread->base_pri <= BASEPRI_DEFAULT) |
4577 | return PERFCONTROL_CLASS_NONUI; |
4578 | else if (thread->base_pri <= BASEPRI_FOREGROUND) |
4579 | return PERFCONTROL_CLASS_UI; |
4580 | else |
4581 | return PERFCONTROL_CLASS_ABOVEUI; |
4582 | } |
4583 | |
4584 | /* |
 * This is the processor idle loop, which just looks for other threads
 * to execute. Processor idle threads invoke this without supplying a
 * current thread; a current thread may also be supplied, to idle in
 * place without an asserted wait state.
 *
 * Returns the next thread to execute if dispatched directly.
4590 | */ |
4591 | |
4592 | #if 0 |
4593 | #define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__) |
4594 | #else |
4595 | #define IDLE_KERNEL_DEBUG_CONSTANT(...) do { } while(0) |
4596 | #endif |
4597 | |
4598 | thread_t |
4599 | processor_idle( |
4600 | thread_t thread, |
4601 | processor_t processor) |
4602 | { |
4603 | processor_set_t pset = processor->processor_set; |
4604 | thread_t new_thread; |
4605 | int state; |
4606 | (void)splsched(); |
4607 | |
4608 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
4609 | MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_START, |
4610 | (uintptr_t)thread_tid(thread), 0, 0, 0, 0); |
4611 | |
4612 | SCHED_STATS_CPU_IDLE_START(processor); |
4613 | |
4614 | uint64_t ctime = mach_absolute_time(); |
4615 | |
4616 | timer_switch(&PROCESSOR_DATA(processor, system_state), ctime, &PROCESSOR_DATA(processor, idle_state)); |
4617 | PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, idle_state); |
4618 | |
4619 | cpu_quiescent_counter_leave(ctime); |
4620 | |
4621 | while (1) { |
4622 | /* |
4623 | * Ensure that updates to my processor and pset state, |
4624 | * made by the IPI source processor before sending the IPI, |
4625 | * are visible on this processor now (even though we don't |
4626 | * take the pset lock yet). |
4627 | */ |
4628 | atomic_thread_fence(memory_order_acquire); |
4629 | |
4630 | if (processor->state != PROCESSOR_IDLE) |
4631 | break; |
4632 | if (bit_test(pset->pending_AST_cpu_mask, processor->cpu_id)) |
4633 | break; |
4634 | #if defined(CONFIG_SCHED_DEFERRED_AST) |
4635 | if (bit_test(pset->pending_deferred_AST_cpu_mask, processor->cpu_id)) |
4636 | break; |
4637 | #endif |
4638 | if (processor->is_recommended && (processor->processor_primary == processor)) { |
4639 | if (rt_runq_count(pset)) |
4640 | break; |
4641 | } else { |
4642 | if (SCHED(processor_bound_count)(processor)) |
4643 | break; |
4644 | } |
4645 | |
4646 | #if CONFIG_SCHED_IDLE_IN_PLACE |
4647 | if (thread != THREAD_NULL) { |
/* Did the idle-in-place thread wake up? */
4649 | if ((thread->state & (TH_WAIT|TH_SUSP)) != TH_WAIT || thread->wake_active) |
4650 | break; |
4651 | } |
4652 | #endif |
4653 | |
4654 | IDLE_KERNEL_DEBUG_CONSTANT( |
4655 | MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -1, 0); |
4656 | |
4657 | machine_track_platform_idle(TRUE); |
4658 | |
4659 | machine_idle(); |
4660 | |
4661 | machine_track_platform_idle(FALSE); |
4662 | |
4663 | (void)splsched(); |
4664 | |
4665 | /* |
 * Check if we should call sched_timeshare_consider_maintenance() here.
 * The CPU was woken out of idle due to an interrupt, and we should make
 * the call only if the processor is still idle. If the processor is
 * non-idle, the threads running on the processor will make the call as
 * part of context switching.
4671 | */ |
4672 | if (processor->state == PROCESSOR_IDLE) { |
4673 | sched_timeshare_consider_maintenance(mach_absolute_time()); |
4674 | } |
4675 | |
4676 | IDLE_KERNEL_DEBUG_CONSTANT( |
4677 | MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -2, 0); |
4678 | |
4679 | if (!SCHED(processor_queue_empty)(processor)) { |
4680 | /* Secondary SMT processors respond to directed wakeups |
4681 | * exclusively. Some platforms induce 'spurious' SMT wakeups. |
4682 | */ |
4683 | if (processor->processor_primary == processor) |
4684 | break; |
4685 | } |
4686 | } |
4687 | |
4688 | ctime = mach_absolute_time(); |
4689 | |
4690 | timer_switch(&PROCESSOR_DATA(processor, idle_state), ctime, &PROCESSOR_DATA(processor, system_state)); |
4691 | PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, system_state); |
4692 | |
4693 | cpu_quiescent_counter_join(ctime); |
4694 | |
4695 | pset_lock(pset); |
4696 | |
4697 | /* If we were sent a remote AST and came out of idle, acknowledge it here with pset lock held */ |
4698 | bit_clear(pset->pending_AST_cpu_mask, processor->cpu_id); |
4699 | #if defined(CONFIG_SCHED_DEFERRED_AST) |
4700 | bit_clear(pset->pending_deferred_AST_cpu_mask, processor->cpu_id); |
4701 | #endif |
4702 | |
4703 | state = processor->state; |
4704 | if (state == PROCESSOR_DISPATCHING) { |
4705 | /* |
 * Common case -- cpu dispatched.
4707 | */ |
4708 | new_thread = processor->next_thread; |
4709 | processor->next_thread = THREAD_NULL; |
4710 | pset_update_processor_state(pset, processor, PROCESSOR_RUNNING); |
4711 | |
4712 | if ((new_thread != THREAD_NULL) && (SCHED(processor_queue_has_priority)(processor, new_thread->sched_pri, FALSE) || |
4713 | (rt_runq_count(pset) > 0)) ) { |
4714 | /* Something higher priority has popped up on the runqueue - redispatch this thread elsewhere */ |
4715 | processor_state_update_idle(processor); |
4716 | processor->deadline = UINT64_MAX; |
4717 | |
4718 | pset_unlock(pset); |
4719 | |
4720 | thread_lock(new_thread); |
4721 | KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REDISPATCH), (uintptr_t)thread_tid(new_thread), new_thread->sched_pri, rt_runq_count(pset), 0, 0); |
4722 | thread_setrun(new_thread, SCHED_HEADQ); |
4723 | thread_unlock(new_thread); |
4724 | |
4725 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
4726 | MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END, |
4727 | (uintptr_t)thread_tid(thread), state, 0, 0, 0); |
4728 | |
4729 | return (THREAD_NULL); |
4730 | } |
4731 | |
4732 | sched_update_pset_load_average(pset); |
4733 | |
4734 | pset_unlock(pset); |
4735 | |
4736 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
4737 | MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END, |
4738 | (uintptr_t)thread_tid(thread), state, (uintptr_t)thread_tid(new_thread), 0, 0); |
4739 | |
4740 | return (new_thread); |
4741 | |
4742 | } else if (state == PROCESSOR_IDLE) { |
4743 | pset_update_processor_state(pset, processor, PROCESSOR_RUNNING); |
4744 | processor_state_update_idle(processor); |
4745 | processor->deadline = UINT64_MAX; |
4746 | |
4747 | } else if (state == PROCESSOR_SHUTDOWN) { |
4748 | /* |
4749 | * Going off-line. Force a |
4750 | * reschedule. |
4751 | */ |
4752 | if ((new_thread = processor->next_thread) != THREAD_NULL) { |
4753 | processor->next_thread = THREAD_NULL; |
4754 | processor_state_update_idle(processor); |
4755 | processor->deadline = UINT64_MAX; |
4756 | |
4757 | pset_unlock(pset); |
4758 | |
4759 | thread_lock(new_thread); |
4760 | thread_setrun(new_thread, SCHED_HEADQ); |
4761 | thread_unlock(new_thread); |
4762 | |
4763 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
4764 | MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END, |
4765 | (uintptr_t)thread_tid(thread), state, 0, 0, 0); |
4766 | |
4767 | return (THREAD_NULL); |
4768 | } |
4769 | } |
4770 | |
4771 | pset_unlock(pset); |
4772 | |
4773 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
4774 | MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END, |
4775 | (uintptr_t)thread_tid(thread), state, 0, 0, 0); |
4776 | |
4777 | return (THREAD_NULL); |
4778 | } |
4779 | |
4780 | /* |
4781 | * Each processor has a dedicated thread which |
4782 | * executes the idle loop when there is no suitable |
4783 | * previous context. |
4784 | */ |
4785 | void |
4786 | idle_thread(void) |
4787 | { |
4788 | processor_t processor = current_processor(); |
4789 | thread_t new_thread; |
4790 | |
4791 | new_thread = processor_idle(THREAD_NULL, processor); |
4792 | if (new_thread != THREAD_NULL) { |
4793 | thread_run(processor->idle_thread, (thread_continue_t)idle_thread, NULL, new_thread); |
4794 | /*NOTREACHED*/ |
4795 | } |
4796 | |
4797 | thread_block((thread_continue_t)idle_thread); |
4798 | /*NOTREACHED*/ |
4799 | } |
4800 | |
4801 | kern_return_t |
4802 | idle_thread_create( |
4803 | processor_t processor) |
4804 | { |
4805 | kern_return_t result; |
4806 | thread_t thread; |
4807 | spl_t s; |
4808 | char name[MAXTHREADNAMESIZE]; |
4809 | |
4810 | result = kernel_thread_create((thread_continue_t)idle_thread, NULL, MAXPRI_KERNEL, &thread); |
4811 | if (result != KERN_SUCCESS) |
4812 | return (result); |
4813 | |
snprintf(name, sizeof(name), "idle #%d", processor->cpu_id);
4815 | thread_set_thread_name(thread, name); |
4816 | |
4817 | s = splsched(); |
4818 | thread_lock(thread); |
4819 | thread->bound_processor = processor; |
4820 | processor->idle_thread = thread; |
4821 | thread->sched_pri = thread->base_pri = IDLEPRI; |
4822 | thread->state = (TH_RUN | TH_IDLE); |
4823 | thread->options |= TH_OPT_IDLE_THREAD; |
4824 | thread_unlock(thread); |
4825 | splx(s); |
4826 | |
4827 | thread_deallocate(thread); |
4828 | |
4829 | return (KERN_SUCCESS); |
4830 | } |
4831 | |
4832 | /* |
4833 | * sched_startup: |
4834 | * |
4835 | * Kicks off scheduler services. |
4836 | * |
4837 | * Called at splsched. |
4838 | */ |
4839 | void |
4840 | sched_startup(void) |
4841 | { |
4842 | kern_return_t result; |
4843 | thread_t thread; |
4844 | |
4845 | simple_lock_init(&sched_vm_group_list_lock, 0); |
4846 | |
4847 | #if __arm__ || __arm64__ |
4848 | simple_lock_init(&sched_recommended_cores_lock, 0); |
4849 | #endif /* __arm__ || __arm64__ */ |
4850 | |
4851 | result = kernel_thread_start_priority((thread_continue_t)sched_init_thread, |
4852 | (void *)SCHED(maintenance_continuation), MAXPRI_KERNEL, &thread); |
4853 | if (result != KERN_SUCCESS) |
panic("sched_startup");
4855 | |
4856 | thread_deallocate(thread); |
4857 | |
4858 | assert_thread_magic(thread); |
4859 | |
4860 | /* |
4861 | * Yield to the sched_init_thread once, to |
4862 | * initialize our own thread after being switched |
4863 | * back to. |
4864 | * |
4865 | * The current thread is the only other thread |
4866 | * active at this point. |
4867 | */ |
4868 | thread_block(THREAD_CONTINUE_NULL); |
4869 | } |
4870 | |
4871 | #if __arm64__ |
4872 | static _Atomic uint64_t sched_perfcontrol_callback_deadline; |
4873 | #endif /* __arm64__ */ |
4874 | |
4875 | |
4876 | #if defined(CONFIG_SCHED_TIMESHARE_CORE) |
4877 | |
4878 | static volatile uint64_t sched_maintenance_deadline; |
4879 | static uint64_t sched_tick_last_abstime; |
4880 | static uint64_t sched_tick_delta; |
4881 | uint64_t sched_tick_max_delta; |
4882 | |
4883 | |
/*
 * sched_timeshare_maintenance_continue:
 *
 * Perform periodic bookkeeping functions about ten
 * times per second.
 */
4890 | void |
4891 | sched_timeshare_maintenance_continue(void) |
4892 | { |
4893 | uint64_t sched_tick_ctime, late_time; |
4894 | |
4895 | struct sched_update_scan_context scan_context = { |
4896 | .earliest_bg_make_runnable_time = UINT64_MAX, |
4897 | .earliest_normal_make_runnable_time = UINT64_MAX, |
4898 | .earliest_rt_make_runnable_time = UINT64_MAX |
4899 | }; |
4900 | |
4901 | sched_tick_ctime = mach_absolute_time(); |
4902 | |
4903 | if (__improbable(sched_tick_last_abstime == 0)) { |
4904 | sched_tick_last_abstime = sched_tick_ctime; |
4905 | late_time = 0; |
4906 | sched_tick_delta = 1; |
4907 | } else { |
4908 | late_time = sched_tick_ctime - sched_tick_last_abstime; |
4909 | sched_tick_delta = late_time / sched_tick_interval; |
		/* Ensure a delta of at least 1, since the measured interval
		 * could be slightly shorter than sched_tick_interval due to
		 * dispatch latencies.
		 */
4914 | sched_tick_delta = MAX(sched_tick_delta, 1); |
4915 | |
		/* If interrupt latencies or platform idle events that
		 * advanced the timebase resulted in long periods where no
		 * threads were dispatched, cap the "tick delta" at
		 * SCHED_TICK_MAX_DELTA iterations.
		 */
4922 | sched_tick_delta = MIN(sched_tick_delta, SCHED_TICK_MAX_DELTA); |
4923 | |
4924 | sched_tick_last_abstime = sched_tick_ctime; |
4925 | sched_tick_max_delta = MAX(sched_tick_delta, sched_tick_max_delta); |
4926 | } |
4927 | |
4928 | KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE)|DBG_FUNC_START, |
4929 | sched_tick_delta, late_time, 0, 0, 0); |
4930 | |
	/* Add a number of pseudo-ticks corresponding to the elapsed interval.
	 * This can be greater than 1 if there were substantial intervals during
	 * which all processors were idle, which is rare in practice.
	 */
4935 | |
4936 | sched_tick += sched_tick_delta; |
4937 | |
4938 | update_vm_info(); |
4939 | |
4940 | /* |
4941 | * Compute various averages. |
4942 | */ |
4943 | compute_averages(sched_tick_delta); |
4944 | |
	/*
	 * Scan the run queues for threads that may need to be updated, and
	 * record the earliest make-runnable time observed on each run queue
	 * so that the worst-case runnable latency can be reported.
	 */
4950 | SCHED(thread_update_scan)(&scan_context); |
4951 | |
4952 | SCHED(rt_runq_scan)(&scan_context); |
4953 | |
4954 | uint64_t ctime = mach_absolute_time(); |
4955 | |
4956 | uint64_t bg_max_latency = (ctime > scan_context.earliest_bg_make_runnable_time) ? |
4957 | ctime - scan_context.earliest_bg_make_runnable_time : 0; |
4958 | |
4959 | uint64_t default_max_latency = (ctime > scan_context.earliest_normal_make_runnable_time) ? |
4960 | ctime - scan_context.earliest_normal_make_runnable_time : 0; |
4961 | |
4962 | uint64_t realtime_max_latency = (ctime > scan_context.earliest_rt_make_runnable_time) ? |
4963 | ctime - scan_context.earliest_rt_make_runnable_time : 0; |
4964 | |
4965 | machine_max_runnable_latency(bg_max_latency, default_max_latency, realtime_max_latency); |
4966 | |
4967 | /* |
4968 | * Check to see if the special sched VM group needs attention. |
4969 | */ |
4970 | sched_vm_group_maintenance(); |
4971 | |
4972 | #if __arm__ || __arm64__ |
4973 | /* Check to see if the recommended cores failsafe is active */ |
4974 | sched_recommended_cores_maintenance(); |
4975 | #endif /* __arm__ || __arm64__ */ |
4976 | |
4977 | |
4978 | #if DEBUG || DEVELOPMENT |
4979 | #if __x86_64__ |
4980 | #include <i386/misc_protos.h> |
4981 | /* Check for long-duration interrupts */ |
4982 | mp_interrupt_watchdog(); |
4983 | #endif /* __x86_64__ */ |
4984 | #endif /* DEBUG || DEVELOPMENT */ |
4985 | |
4986 | KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_END, |
4987 | sched_pri_shifts[TH_BUCKET_SHARE_FG], sched_pri_shifts[TH_BUCKET_SHARE_BG], |
4988 | sched_pri_shifts[TH_BUCKET_SHARE_UT], sched_pri_shifts[TH_BUCKET_SHARE_DF], 0); |
4989 | |
4990 | assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT); |
4991 | thread_block((thread_continue_t)sched_timeshare_maintenance_continue); |
4992 | /*NOTREACHED*/ |
4993 | } |
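/*
 * Illustrative sketch (not compiled code): the maintenance thread parks by
 * asserting a wait on its own continuation address and blocking with that
 * same continuation, so every wakeup restarts the function from the top on a
 * fresh stack. The waker side, sched_timeshare_consider_maintenance() below,
 * simply issues a thread_wakeup() on the same event:
 *
 *	assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT);
 *	thread_block((thread_continue_t)sched_timeshare_maintenance_continue);
 *
 *	thread_wakeup((event_t)sched_timeshare_maintenance_continue);	// waker side
 */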
4994 | |
4995 | static uint64_t sched_maintenance_wakeups; |
4996 | |
4997 | /* |
4998 | * Determine if the set of routines formerly driven by a maintenance timer |
4999 | * must be invoked, based on a deadline comparison. Signals the scheduler |
5000 | * maintenance thread on deadline expiration. Must be invoked at an interval |
5001 | * lower than the "sched_tick_interval", currently accomplished by |
5002 | * invocation via the quantum expiration timer and at context switch time. |
5003 | * Performance matters: this routine reuses a timestamp approximating the |
5004 | * current absolute time received from the caller, and should perform |
5005 | * no more than a comparison against the deadline in the common case. |
5006 | */ |
5007 | void |
sched_timeshare_consider_maintenance(uint64_t ctime)
{
5010 | cpu_quiescent_counter_checkin(ctime); |
5011 | |
5012 | uint64_t deadline = sched_maintenance_deadline; |
5013 | |
5014 | if (__improbable(ctime >= deadline)) { |
5015 | if (__improbable(current_thread() == sched_maintenance_thread)) |
5016 | return; |
5017 | OSMemoryBarrier(); |
5018 | |
5019 | uint64_t ndeadline = ctime + sched_tick_interval; |
5020 | |
5021 | if (__probable(__sync_bool_compare_and_swap(&sched_maintenance_deadline, deadline, ndeadline))) { |
5022 | thread_wakeup((event_t)sched_timeshare_maintenance_continue); |
5023 | sched_maintenance_wakeups++; |
5024 | } |
5025 | } |
5026 | |
5027 | uint64_t load_compute_deadline = __c11_atomic_load(&sched_load_compute_deadline, memory_order_relaxed); |
5028 | |
5029 | if (__improbable(load_compute_deadline && ctime >= load_compute_deadline)) { |
5030 | uint64_t new_deadline = 0; |
5031 | if (__c11_atomic_compare_exchange_strong(&sched_load_compute_deadline, &load_compute_deadline, new_deadline, |
5032 | memory_order_relaxed, memory_order_relaxed)) { |
5033 | compute_sched_load(); |
5034 | new_deadline = ctime + sched_load_compute_interval_abs; |
5035 | __c11_atomic_store(&sched_load_compute_deadline, new_deadline, memory_order_relaxed); |
5036 | } |
5037 | } |
5038 | |
5039 | #if __arm64__ |
5040 | uint64_t perf_deadline = __c11_atomic_load(&sched_perfcontrol_callback_deadline, memory_order_relaxed); |
5041 | |
5042 | if (__improbable(perf_deadline && ctime >= perf_deadline)) { |
5043 | /* CAS in 0, if success, make callback. Otherwise let the next context switch check again. */ |
5044 | if (__c11_atomic_compare_exchange_strong(&sched_perfcontrol_callback_deadline, &perf_deadline, 0, |
5045 | memory_order_relaxed, memory_order_relaxed)) { |
5046 | machine_perfcontrol_deadline_passed(perf_deadline); |
5047 | } |
5048 | } |
5049 | #endif /* __arm64__ */ |
5050 | |
5051 | } |
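/*
 * Illustrative sketch (not compiled code): callers on the quantum expiration
 * and context switch paths pass in a timestamp they already hold, so the
 * common case costs only a load and a comparison per deadline, e.g.:
 *
 *	uint64_t ctime = mach_absolute_time();	// or a timestamp already in hand
 *	sched_timeshare_consider_maintenance(ctime);
 */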
5052 | |
5053 | #endif /* CONFIG_SCHED_TIMESHARE_CORE */ |
5054 | |
5055 | void |
5056 | sched_init_thread(void (*continuation)(void)) |
5057 | { |
5058 | thread_block(THREAD_CONTINUE_NULL); |
5059 | |
5060 | thread_t thread = current_thread(); |
5061 | |
	thread_set_thread_name(thread, "sched_maintenance_thread");
5063 | |
5064 | sched_maintenance_thread = thread; |
5065 | |
5066 | continuation(); |
5067 | |
5068 | /*NOTREACHED*/ |
5069 | } |
5070 | |
5071 | #if defined(CONFIG_SCHED_TIMESHARE_CORE) |
5072 | |
5073 | /* |
5074 | * thread_update_scan / runq_scan: |
5075 | * |
5076 | * Scan the run queues to account for timesharing threads |
5077 | * which need to be updated. |
5078 | * |
5079 | * Scanner runs in two passes. Pass one squirrels likely |
5080 | * threads away in an array, pass two does the update. |
5081 | * |
5082 | * This is necessary because the run queue is locked for |
5083 | * the candidate scan, but the thread is locked for the update. |
5084 | * |
5085 | * Array should be sized to make forward progress, without |
5086 | * disabling preemption for long periods. |
5087 | */ |
5088 | |
5089 | #define THREAD_UPDATE_SIZE 128 |
5090 | |
5091 | static thread_t thread_update_array[THREAD_UPDATE_SIZE]; |
5092 | static uint32_t thread_update_count = 0; |
5093 | |
5094 | /* Returns TRUE if thread was added, FALSE if thread_update_array is full */ |
5095 | boolean_t |
5096 | thread_update_add_thread(thread_t thread) |
5097 | { |
5098 | if (thread_update_count == THREAD_UPDATE_SIZE) |
5099 | return (FALSE); |
5100 | |
5101 | thread_update_array[thread_update_count++] = thread; |
5102 | thread_reference_internal(thread); |
5103 | return (TRUE); |
5104 | } |
5105 | |
5106 | void |
5107 | thread_update_process_threads(void) |
5108 | { |
5109 | assert(thread_update_count <= THREAD_UPDATE_SIZE); |
5110 | |
5111 | for (uint32_t i = 0 ; i < thread_update_count ; i++) { |
5112 | thread_t thread = thread_update_array[i]; |
5113 | assert_thread_magic(thread); |
5114 | thread_update_array[i] = THREAD_NULL; |
5115 | |
5116 | spl_t s = splsched(); |
5117 | thread_lock(thread); |
5118 | if (!(thread->state & (TH_WAIT)) && thread->sched_stamp != sched_tick) { |
5119 | SCHED(update_priority)(thread); |
5120 | } |
5121 | thread_unlock(thread); |
5122 | splx(s); |
5123 | |
5124 | thread_deallocate(thread); |
5125 | } |
5126 | |
5127 | thread_update_count = 0; |
5128 | } |
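/*
 * Illustrative sketch (not compiled code), assuming a scheduler's
 * thread_update_scan() implementation drives the two passes roughly like
 * this: pass one collects candidates under the run-queue lock via
 * runq_scan(), pass two updates them with only the thread locked, and the
 * whole sequence repeats whenever the update array filled up:
 *
 *	boolean_t restart;
 *	do {
 *		spl_t s = splsched();
 *		// acquire the relevant run-queue lock here
 *		restart = runq_scan(runq, scan_context);	// pass one: collect
 *		// release the run-queue lock here
 *		splx(s);
 *
 *		thread_update_process_threads();		// pass two: update
 *	} while (restart);
 */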
5129 | |
5130 | /* |
5131 | * Scan a runq for candidate threads. |
5132 | * |
5133 | * Returns TRUE if retry is needed. |
5134 | */ |
5135 | boolean_t |
5136 | runq_scan( |
5137 | run_queue_t runq, |
5138 | sched_update_scan_context_t scan_context) |
5139 | { |
5140 | int count = runq->count; |
5141 | int queue_index; |
5142 | |
5143 | assert(count >= 0); |
5144 | |
5145 | if (count == 0) |
5146 | return FALSE; |
5147 | |
5148 | for (queue_index = bitmap_first(runq->bitmap, NRQS); |
5149 | queue_index >= 0; |
5150 | queue_index = bitmap_next(runq->bitmap, queue_index)) { |
5151 | |
5152 | thread_t thread; |
5153 | queue_t queue = &runq->queues[queue_index]; |
5154 | |
5155 | qe_foreach_element(thread, queue, runq_links) { |
5156 | assert(count > 0); |
5157 | assert_thread_magic(thread); |
5158 | |
5159 | if (thread->sched_stamp != sched_tick && |
5160 | thread->sched_mode == TH_MODE_TIMESHARE) { |
5161 | if (thread_update_add_thread(thread) == FALSE) |
5162 | return TRUE; |
5163 | } |
5164 | |
5165 | if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) { |
5166 | if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) { |
5167 | scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time; |
5168 | } |
5169 | } else { |
5170 | if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) { |
5171 | scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time; |
5172 | } |
5173 | } |
5174 | count--; |
5175 | } |
5176 | } |
5177 | |
5178 | return FALSE; |
5179 | } |
5180 | |
5181 | #endif /* CONFIG_SCHED_TIMESHARE_CORE */ |
5182 | |
5183 | boolean_t |
5184 | thread_eager_preemption(thread_t thread) |
5185 | { |
5186 | return ((thread->sched_flags & TH_SFLAG_EAGERPREEMPT) != 0); |
5187 | } |
5188 | |
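/*
 * thread_set_eager_preempt:
 *
 * Mark a thread for eager preemption. If the target is the current thread,
 * re-run the context switch check immediately and block if an AST is
 * pending; otherwise, poke the processor the thread was last seen running
 * on so that it re-evaluates its AST state.
 */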
5189 | void |
5190 | thread_set_eager_preempt(thread_t thread) |
5191 | { |
5192 | spl_t x; |
5193 | processor_t p; |
5194 | ast_t ast = AST_NONE; |
5195 | |
5196 | x = splsched(); |
5197 | p = current_processor(); |
5198 | |
5199 | thread_lock(thread); |
5200 | thread->sched_flags |= TH_SFLAG_EAGERPREEMPT; |
5201 | |
5202 | if (thread == current_thread()) { |
5203 | |
5204 | ast = csw_check(p, AST_NONE); |
5205 | thread_unlock(thread); |
5206 | if (ast != AST_NONE) { |
5207 | (void) thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast); |
5208 | } |
5209 | } else { |
5210 | p = thread->last_processor; |
5211 | |
5212 | if (p != PROCESSOR_NULL && p->state == PROCESSOR_RUNNING && |
5213 | p->active_thread == thread) { |
5214 | cause_ast_check(p); |
5215 | } |
5216 | |
5217 | thread_unlock(thread); |
5218 | } |
5219 | |
5220 | splx(x); |
5221 | } |
5222 | |
5223 | void |
5224 | thread_clear_eager_preempt(thread_t thread) |
5225 | { |
5226 | spl_t x; |
5227 | |
5228 | x = splsched(); |
5229 | thread_lock(thread); |
5230 | |
5231 | thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT; |
5232 | |
5233 | thread_unlock(thread); |
5234 | splx(x); |
5235 | } |
5236 | |
5237 | /* |
5238 | * Scheduling statistics |
5239 | */ |
5240 | void |
5241 | sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri) |
5242 | { |
5243 | struct processor_sched_statistics *stats; |
5244 | boolean_t to_realtime = FALSE; |
5245 | |
5246 | stats = &processor->processor_data.sched_stats; |
5247 | stats->csw_count++; |
5248 | |
5249 | if (otherpri >= BASEPRI_REALTIME) { |
5250 | stats->rt_sched_count++; |
5251 | to_realtime = TRUE; |
5252 | } |
5253 | |
5254 | if ((reasons & AST_PREEMPT) != 0) { |
5255 | stats->preempt_count++; |
5256 | |
5257 | if (selfpri >= BASEPRI_REALTIME) { |
5258 | stats->preempted_rt_count++; |
5259 | } |
5260 | |
5261 | if (to_realtime) { |
5262 | stats->preempted_by_rt_count++; |
5263 | } |
5264 | |
5265 | } |
5266 | } |
5267 | |
5268 | void |
5269 | sched_stats_handle_runq_change(struct runq_stats *stats, int old_count) |
5270 | { |
5271 | uint64_t timestamp = mach_absolute_time(); |
5272 | |
5273 | stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count; |
5274 | stats->last_change_timestamp = timestamp; |
5275 | } |
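/*
 * Worked example (illustrative): count_sum accumulates run-queue depth
 * weighted by the time spent at that depth, so an average depth over a
 * window is count_sum / window_length. For instance, if a queue held 3
 * threads for 2 ms and then 1 thread for 8 ms:
 *
 *	count_sum     = 3 x 2 ms + 1 x 8 ms = 14 thread-ms
 *	average depth = 14 thread-ms / 10 ms = 1.4 threads
 *
 * (the real sums are kept in mach absolute time units, not milliseconds).
 */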
5276 | |
5277 | /* |
5278 | * For calls from assembly code |
5279 | */ |
5280 | #undef thread_wakeup |
5281 | void |
5282 | thread_wakeup( |
5283 | event_t x); |
5284 | |
5285 | void |
5286 | thread_wakeup( |
5287 | event_t x) |
5288 | { |
5289 | thread_wakeup_with_result(x, THREAD_AWAKENED); |
5290 | } |
5291 | |
5292 | boolean_t |
5293 | preemption_enabled(void) |
5294 | { |
5295 | return (get_preemption_level() == 0 && ml_get_interrupts_enabled()); |
5296 | } |
5297 | |
5298 | static void |
sched_timer_deadline_tracking_init(void)
{
5300 | nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT, &timer_deadline_tracking_bin_1); |
5301 | nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2); |
5302 | } |
5303 | |
5304 | #if __arm__ || __arm64__ |
5305 | |
5306 | uint32_t perfcontrol_requested_recommended_cores = ALL_CORES_RECOMMENDED; |
5307 | uint32_t perfcontrol_requested_recommended_core_count = MAX_CPUS; |
5308 | bool perfcontrol_failsafe_active = false; |
5309 | bool perfcontrol_sleep_override = false; |
5310 | |
5311 | uint64_t perfcontrol_failsafe_maintenance_runnable_time; |
5312 | uint64_t perfcontrol_failsafe_activation_time; |
5313 | uint64_t perfcontrol_failsafe_deactivation_time; |
5314 | |
5315 | /* data covering who likely caused it and how long they ran */ |
5316 | #define FAILSAFE_NAME_LEN 33 /* (2*MAXCOMLEN)+1 from size of p_name */ |
5317 | char perfcontrol_failsafe_name[FAILSAFE_NAME_LEN]; |
5318 | int perfcontrol_failsafe_pid; |
5319 | uint64_t perfcontrol_failsafe_tid; |
5320 | uint64_t perfcontrol_failsafe_thread_timer_at_start; |
5321 | uint64_t perfcontrol_failsafe_thread_timer_last_seen; |
5322 | uint32_t perfcontrol_failsafe_recommended_at_trigger; |
5323 | |
/*
 * The perf controller calls here to update the recommended core bitmask.
 * If the failsafe is active, we don't immediately apply the new value.
 * Instead, we store the new request and apply it once the failsafe deactivates.
 *
 * If the failsafe is not active, the update is applied immediately.
 *
 * No scheduler locks are held, no other locks that the scheduler might depend
 * on are held, and interrupts are enabled.
 *
 * The prototype currently lives in osfmk/arm/machine_routines.h.
 */
5336 | void |
5337 | sched_perfcontrol_update_recommended_cores(uint32_t recommended_cores) |
5338 | { |
5339 | assert(preemption_enabled()); |
5340 | |
5341 | spl_t s = splsched(); |
5342 | simple_lock(&sched_recommended_cores_lock); |
5343 | |
5344 | perfcontrol_requested_recommended_cores = recommended_cores; |
5345 | perfcontrol_requested_recommended_core_count = __builtin_popcountll(recommended_cores); |
5346 | |
5347 | if ((perfcontrol_failsafe_active == false) && (perfcontrol_sleep_override == false)) |
5348 | sched_update_recommended_cores(perfcontrol_requested_recommended_cores); |
5349 | else |
5350 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
5351 | MACHDBG_CODE(DBG_MACH_SCHED,MACH_REC_CORES_FAILSAFE) | DBG_FUNC_NONE, |
5352 | perfcontrol_requested_recommended_cores, |
5353 | sched_maintenance_thread->last_made_runnable_time, 0, 0, 0); |
5354 | |
5355 | simple_unlock(&sched_recommended_cores_lock); |
5356 | splx(s); |
5357 | } |
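/*
 * Illustrative sketch (not compiled code), assuming a hypothetical perf
 * controller that wants to recommend only CPUs 0 and 1: it builds a bitmask
 * keyed by cpu_id and hands it to the scheduler; the failsafe logic above
 * decides when the request actually takes effect.
 *
 *	uint32_t mask = 0;
 *	bit_set(mask, 0);
 *	bit_set(mask, 1);
 *	sched_perfcontrol_update_recommended_cores(mask);
 */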
5358 | |
5359 | void |
5360 | sched_override_recommended_cores_for_sleep(void) |
5361 | { |
5362 | spl_t s = splsched(); |
5363 | simple_lock(&sched_recommended_cores_lock); |
5364 | |
5365 | if (perfcontrol_sleep_override == false) { |
5366 | perfcontrol_sleep_override = true; |
5367 | sched_update_recommended_cores(ALL_CORES_RECOMMENDED); |
5368 | } |
5369 | |
5370 | simple_unlock(&sched_recommended_cores_lock); |
5371 | splx(s); |
5372 | } |
5373 | |
5374 | void |
5375 | sched_restore_recommended_cores_after_sleep(void) |
5376 | { |
5377 | spl_t s = splsched(); |
5378 | simple_lock(&sched_recommended_cores_lock); |
5379 | |
5380 | if (perfcontrol_sleep_override == true) { |
5381 | perfcontrol_sleep_override = false; |
5382 | sched_update_recommended_cores(perfcontrol_requested_recommended_cores); |
5383 | } |
5384 | |
5385 | simple_unlock(&sched_recommended_cores_lock); |
5386 | splx(s); |
5387 | } |
5388 | |
5389 | /* |
5390 | * Consider whether we need to activate the recommended cores failsafe |
5391 | * |
5392 | * Called from quantum timer interrupt context of a realtime thread |
5393 | * No scheduler locks are held, interrupts are disabled |
5394 | */ |
5395 | void |
5396 | sched_consider_recommended_cores(uint64_t ctime, thread_t cur_thread) |
5397 | { |
	/*
	 * Check whether a realtime thread is starving the system and whether
	 * bringing up non-recommended cores would help.
	 *
	 * TODO: Is this the correct check for recommended == possible cores?
	 * TODO: Validate that the checks done without the relevant lock are OK.
	 */
5405 | |
5406 | if (__improbable(perfcontrol_failsafe_active == TRUE)) { |
5407 | /* keep track of how long the responsible thread runs */ |
5408 | |
5409 | simple_lock(&sched_recommended_cores_lock); |
5410 | |
5411 | if (perfcontrol_failsafe_active == TRUE && |
5412 | cur_thread->thread_id == perfcontrol_failsafe_tid) { |
5413 | perfcontrol_failsafe_thread_timer_last_seen = timer_grab(&cur_thread->user_timer) + |
5414 | timer_grab(&cur_thread->system_timer); |
5415 | } |
5416 | |
5417 | simple_unlock(&sched_recommended_cores_lock); |
5418 | |
5419 | /* we're already trying to solve the problem, so bail */ |
5420 | return; |
5421 | } |
5422 | |
5423 | /* The failsafe won't help if there are no more processors to enable */ |
5424 | if (__probable(perfcontrol_requested_recommended_core_count >= processor_count)) |
5425 | return; |
5426 | |
5427 | uint64_t too_long_ago = ctime - perfcontrol_failsafe_starvation_threshold; |
5428 | |
5429 | /* Use the maintenance thread as our canary in the coal mine */ |
5430 | thread_t m_thread = sched_maintenance_thread; |
5431 | |
5432 | /* If it doesn't look bad, nothing to see here */ |
5433 | if (__probable(m_thread->last_made_runnable_time >= too_long_ago)) |
5434 | return; |
5435 | |
5436 | /* It looks bad, take the lock to be sure */ |
5437 | thread_lock(m_thread); |
5438 | |
5439 | if (m_thread->runq == PROCESSOR_NULL || |
5440 | (m_thread->state & (TH_RUN|TH_WAIT)) != TH_RUN || |
5441 | m_thread->last_made_runnable_time >= too_long_ago) { |
5442 | /* |
5443 | * Maintenance thread is either on cpu or blocked, and |
5444 | * therefore wouldn't benefit from more cores |
5445 | */ |
5446 | thread_unlock(m_thread); |
5447 | return; |
5448 | } |
5449 | |
5450 | uint64_t maintenance_runnable_time = m_thread->last_made_runnable_time; |
5451 | |
5452 | thread_unlock(m_thread); |
5453 | |
5454 | /* |
5455 | * There are cores disabled at perfcontrol's recommendation, but the |
5456 | * system is so overloaded that the maintenance thread can't run. |
5457 | * That likely means that perfcontrol can't run either, so it can't fix |
5458 | * the recommendation. We have to kick in a failsafe to keep from starving. |
5459 | * |
5460 | * When the maintenance thread has been starved for too long, |
5461 | * ignore the recommendation from perfcontrol and light up all the cores. |
5462 | * |
5463 | * TODO: Consider weird states like boot, sleep, or debugger |
5464 | */ |
5465 | |
5466 | simple_lock(&sched_recommended_cores_lock); |
5467 | |
5468 | if (perfcontrol_failsafe_active == TRUE) { |
5469 | simple_unlock(&sched_recommended_cores_lock); |
5470 | return; |
5471 | } |
5472 | |
5473 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
5474 | MACHDBG_CODE(DBG_MACH_SCHED,MACH_REC_CORES_FAILSAFE) | DBG_FUNC_START, |
5475 | perfcontrol_requested_recommended_cores, maintenance_runnable_time, 0, 0, 0); |
5476 | |
5477 | perfcontrol_failsafe_active = TRUE; |
5478 | perfcontrol_failsafe_activation_time = mach_absolute_time(); |
5479 | perfcontrol_failsafe_maintenance_runnable_time = maintenance_runnable_time; |
5480 | perfcontrol_failsafe_recommended_at_trigger = perfcontrol_requested_recommended_cores; |
5481 | |
5482 | /* Capture some data about who screwed up (assuming that the thread on core is at fault) */ |
5483 | task_t task = cur_thread->task; |
5484 | perfcontrol_failsafe_pid = task_pid(task); |
5485 | strlcpy(perfcontrol_failsafe_name, proc_name_address(task->bsd_info), sizeof(perfcontrol_failsafe_name)); |
5486 | |
5487 | perfcontrol_failsafe_tid = cur_thread->thread_id; |
5488 | |
5489 | /* Blame the thread for time it has run recently */ |
5490 | uint64_t recent_computation = (ctime - cur_thread->computation_epoch) + cur_thread->computation_metered; |
5491 | |
5492 | uint64_t last_seen = timer_grab(&cur_thread->user_timer) + timer_grab(&cur_thread->system_timer); |
5493 | |
5494 | /* Compute the start time of the bad behavior in terms of the thread's on core time */ |
5495 | perfcontrol_failsafe_thread_timer_at_start = last_seen - recent_computation; |
5496 | perfcontrol_failsafe_thread_timer_last_seen = last_seen; |
5497 | |
5498 | /* Ignore the previously recommended core configuration */ |
5499 | sched_update_recommended_cores(ALL_CORES_RECOMMENDED); |
5500 | |
5501 | simple_unlock(&sched_recommended_cores_lock); |
5502 | } |
5503 | |
5504 | /* |
5505 | * Now that our bacon has been saved by the failsafe, consider whether to turn it off |
5506 | * |
5507 | * Runs in the context of the maintenance thread, no locks held |
5508 | */ |
5509 | static void |
5510 | sched_recommended_cores_maintenance(void) |
5511 | { |
5512 | /* Common case - no failsafe, nothing to be done here */ |
5513 | if (__probable(perfcontrol_failsafe_active == FALSE)) |
5514 | return; |
5515 | |
5516 | uint64_t ctime = mach_absolute_time(); |
5517 | |
5518 | boolean_t print_diagnostic = FALSE; |
	char p_name[FAILSAFE_NAME_LEN] = "";
5520 | |
5521 | spl_t s = splsched(); |
5522 | simple_lock(&sched_recommended_cores_lock); |
5523 | |
5524 | /* Check again, under the lock, to avoid races */ |
5525 | if (perfcontrol_failsafe_active == FALSE) |
5526 | goto out; |
5527 | |
	/*
	 * Ensure that the other cores get another few ticks to run some threads.
	 * Without this hysteresis, the maintenance thread would be the first
	 * to run, and it would then immediately shut the other cores back down.
	 */
5533 | if ((ctime - perfcontrol_failsafe_activation_time) < perfcontrol_failsafe_starvation_threshold) |
5534 | goto out; |
5535 | |
5536 | /* Capture some diagnostic state under the lock so we can print it out later */ |
5537 | |
5538 | int pid = perfcontrol_failsafe_pid; |
5539 | uint64_t tid = perfcontrol_failsafe_tid; |
5540 | |
5541 | uint64_t thread_usage = perfcontrol_failsafe_thread_timer_last_seen - |
5542 | perfcontrol_failsafe_thread_timer_at_start; |
5543 | uint32_t rec_cores_before = perfcontrol_failsafe_recommended_at_trigger; |
5544 | uint32_t rec_cores_after = perfcontrol_requested_recommended_cores; |
5545 | uint64_t failsafe_duration = ctime - perfcontrol_failsafe_activation_time; |
5546 | strlcpy(p_name, perfcontrol_failsafe_name, sizeof(p_name)); |
5547 | |
5548 | print_diagnostic = TRUE; |
5549 | |
5550 | /* Deactivate the failsafe and reinstate the requested recommendation settings */ |
5551 | |
5552 | perfcontrol_failsafe_deactivation_time = ctime; |
5553 | perfcontrol_failsafe_active = FALSE; |
5554 | |
5555 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
5556 | MACHDBG_CODE(DBG_MACH_SCHED,MACH_REC_CORES_FAILSAFE) | DBG_FUNC_END, |
5557 | perfcontrol_requested_recommended_cores, failsafe_duration, 0, 0, 0); |
5558 | |
5559 | sched_update_recommended_cores(perfcontrol_requested_recommended_cores); |
5560 | |
5561 | out: |
5562 | simple_unlock(&sched_recommended_cores_lock); |
5563 | splx(s); |
5564 | |
5565 | if (print_diagnostic) { |
5566 | uint64_t failsafe_duration_ms = 0, thread_usage_ms = 0; |
5567 | |
5568 | absolutetime_to_nanoseconds(failsafe_duration, &failsafe_duration_ms); |
5569 | failsafe_duration_ms = failsafe_duration_ms / NSEC_PER_MSEC; |
5570 | |
5571 | absolutetime_to_nanoseconds(thread_usage, &thread_usage_ms); |
5572 | thread_usage_ms = thread_usage_ms / NSEC_PER_MSEC; |
5573 | |
5574 | printf("recommended core failsafe kicked in for %lld ms " |
5575 | "likely due to %s[%d] thread 0x%llx spending " |
5576 | "%lld ms on cpu at realtime priority - " |
5577 | "new recommendation: 0x%x -> 0x%x\n" , |
5578 | failsafe_duration_ms, p_name, pid, tid, thread_usage_ms, |
5579 | rec_cores_before, rec_cores_after); |
5580 | } |
5581 | } |
5582 | |
5583 | /* |
5584 | * Apply a new recommended cores mask to the processors it affects |
5585 | * Runs after considering failsafes and such |
5586 | * |
5587 | * Iterate over processors and update their ->is_recommended field. |
5588 | * If a processor is running, we let it drain out at its next |
5589 | * quantum expiration or blocking point. If a processor is idle, there |
5590 | * may be more work for it to do, so IPI it. |
5591 | * |
5592 | * interrupts disabled, sched_recommended_cores_lock is held |
5593 | */ |
5594 | static void |
5595 | sched_update_recommended_cores(uint32_t recommended_cores) |
5596 | { |
5597 | processor_set_t pset, nset; |
5598 | processor_t processor; |
5599 | uint64_t needs_exit_idle_mask = 0x0; |
5600 | |
5601 | processor = processor_list; |
5602 | pset = processor->processor_set; |
5603 | |
5604 | KDBG(MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_START, |
5605 | recommended_cores, perfcontrol_failsafe_active, 0, 0); |
5606 | |
5607 | if (__builtin_popcount(recommended_cores) == 0) { |
5608 | bit_set(recommended_cores, master_processor->cpu_id); /* add boot processor or we hang */ |
5609 | } |
5610 | |
5611 | /* First set recommended cores */ |
5612 | pset_lock(pset); |
5613 | do { |
5614 | |
5615 | nset = processor->processor_set; |
5616 | if (nset != pset) { |
5617 | pset_unlock(pset); |
5618 | pset = nset; |
5619 | pset_lock(pset); |
5620 | } |
5621 | |
5622 | if (bit_test(recommended_cores, processor->cpu_id)) { |
5623 | processor->is_recommended = TRUE; |
5624 | bit_set(pset->recommended_bitmask, processor->cpu_id); |
5625 | |
5626 | if (processor->state == PROCESSOR_IDLE) { |
5627 | if (processor != current_processor()) { |
5628 | bit_set(needs_exit_idle_mask, processor->cpu_id); |
5629 | } |
5630 | } |
5631 | } |
5632 | } while ((processor = processor->processor_list) != NULL); |
5633 | pset_unlock(pset); |
5634 | |
5635 | /* Now shutdown not recommended cores */ |
5636 | processor = processor_list; |
5637 | pset = processor->processor_set; |
5638 | |
5639 | pset_lock(pset); |
5640 | do { |
5641 | |
5642 | nset = processor->processor_set; |
5643 | if (nset != pset) { |
5644 | pset_unlock(pset); |
5645 | pset = nset; |
5646 | pset_lock(pset); |
5647 | } |
5648 | |
5649 | if (!bit_test(recommended_cores, processor->cpu_id)) { |
5650 | sched_ipi_type_t ipi_type = SCHED_IPI_NONE; |
5651 | |
5652 | processor->is_recommended = FALSE; |
5653 | bit_clear(pset->recommended_bitmask, processor->cpu_id); |
5654 | |
5655 | if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) { |
5656 | ipi_type = SCHED_IPI_IMMEDIATE; |
5657 | } |
5658 | SCHED(processor_queue_shutdown)(processor); |
5659 | /* pset unlocked */ |
5660 | |
5661 | SCHED(rt_queue_shutdown)(processor); |
5662 | |
5663 | if (ipi_type != SCHED_IPI_NONE) { |
5664 | if (processor == current_processor()) { |
5665 | ast_on(AST_PREEMPT); |
5666 | } else { |
5667 | sched_ipi_perform(processor, ipi_type); |
5668 | } |
5669 | } |
5670 | |
5671 | pset_lock(pset); |
5672 | } |
5673 | } while ((processor = processor->processor_list) != NULL); |
5674 | pset_unlock(pset); |
5675 | |
5676 | /* Issue all pending IPIs now that the pset lock has been dropped */ |
5677 | for (int cpuid = lsb_first(needs_exit_idle_mask); cpuid >= 0; cpuid = lsb_next(needs_exit_idle_mask, cpuid)) { |
5678 | processor = processor_array[cpuid]; |
5679 | machine_signal_idle(processor); |
5680 | } |
5681 | |
5682 | KDBG(MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_END, |
5683 | needs_exit_idle_mask, 0, 0, 0); |
5684 | } |
5685 | #endif /* __arm__ || __arm64__ */ |
5686 | |
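/*
 * Set the given TH_OPT_* option bits on the current thread, under the
 * thread lock at splsched.
 */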
void
thread_set_options(uint32_t thopt)
{
5688 | spl_t x; |
5689 | thread_t t = current_thread(); |
5690 | |
5691 | x = splsched(); |
5692 | thread_lock(t); |
5693 | |
5694 | t->options |= thopt; |
5695 | |
5696 | thread_unlock(t); |
5697 | splx(x); |
5698 | } |
5699 | |
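/*
 * Stash a hint on the thread describing the kind of object it is about to
 * block on.
 */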
void
thread_set_pending_block_hint(thread_t thread, block_hint_t block_hint)
{
5701 | thread->pending_block_hint = block_hint; |
5702 | } |
5703 | |
uint32_t
qos_max_parallelism(int qos, uint64_t options)
5705 | { |
5706 | return SCHED(qos_max_parallelism)(qos, options); |
5707 | } |
5708 | |
uint32_t
sched_qos_max_parallelism(__unused int qos, uint64_t options)
5710 | { |
5711 | host_basic_info_data_t hinfo; |
5712 | mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT; |
5713 | /* Query the machine layer for core information */ |
5714 | __assert_only kern_return_t kret = host_info(host_self(), HOST_BASIC_INFO, |
5715 | (host_info_t)&hinfo, &count); |
5716 | assert(kret == KERN_SUCCESS); |
5717 | |
	/* We would not want multiple realtime threads running on the
	 * same physical core, even on SMT-capable machines.
	 */
5721 | if (options & QOS_PARALLELISM_REALTIME) { |
5722 | return hinfo.physical_cpu; |
5723 | } |
5724 | |
5725 | if (options & QOS_PARALLELISM_COUNT_LOGICAL) { |
5726 | return hinfo.logical_cpu; |
5727 | } else { |
5728 | return hinfo.physical_cpu; |
5729 | } |
5730 | } |
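/*
 * Illustrative sketch (not compiled code): a caller sizing a work pool for a
 * given QoS asks for either the logical or the physical CPU width, e.g.
 *
 *	uint32_t width    = qos_max_parallelism(qos, QOS_PARALLELISM_COUNT_LOGICAL);
 *	uint32_t rt_width = qos_max_parallelism(qos, QOS_PARALLELISM_REALTIME);
 *
 * where the realtime variant is capped at one thread per physical core.
 */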
5731 | |
5732 | #if __arm64__ |
5733 | |
/*
 * Set up a new perfcontrol callback deadline, replacing any previous one.
 *
 * Returns TRUE if a pending (nonzero) deadline was cancelled, FALSE if not.
 */
5739 | boolean_t |
5740 | sched_perfcontrol_update_callback_deadline(uint64_t new_deadline) |
5741 | { |
	/*
	 * Exchange the old deadline for the new deadline. If the old deadline
	 * was nonzero, a pending callback was cancelled; otherwise nothing was
	 * pending.
	 */
5746 | |
5747 | uint64_t old_deadline = __c11_atomic_load(&sched_perfcontrol_callback_deadline, |
5748 | memory_order_relaxed); |
5749 | |
5750 | |
5751 | while (!__c11_atomic_compare_exchange_weak(&sched_perfcontrol_callback_deadline, |
5752 | &old_deadline, new_deadline, |
5753 | memory_order_relaxed, memory_order_relaxed)); |
5754 | |
5755 | |
	/* old_deadline now holds the previous value, which may differ from the initial load if the exchange raced */
5757 | |
5758 | return (old_deadline != 0) ? TRUE : FALSE; |
5759 | } |
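/*
 * Illustrative sketch (not compiled code): a caller arms (or re-arms) the
 * callback by passing an absolute deadline and, by implication of the check
 * in sched_timeshare_consider_maintenance(), disarms it by passing 0; the
 * return value reports whether a previously armed deadline was replaced.
 *
 *	uint64_t deadline = mach_absolute_time() + interval_abs;  // interval_abs: caller-chosen
 *	boolean_t replaced = sched_perfcontrol_update_callback_deadline(deadline);
 */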
5760 | |
5761 | #endif /* __arm64__ */ |
5762 | |
5763 | void |
5764 | sched_update_pset_load_average(processor_set_t pset) |
5765 | { |
5766 | int load = ((bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + pset->pset_runq.count + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT); |
5767 | int new_load_average = (pset->load_average + load) >> 1; |
5768 | |
5769 | pset->load_average = new_load_average; |
5770 | |
5771 | #if (DEVELOPMENT || DEBUG) |
5772 | #endif |
5773 | } |
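/*
 * Worked example (illustrative): the pset load average is an exponentially
 * weighted moving average with a weight of 1/2 per update, kept in fixed
 * point (the instantaneous sample is shifted left by
 * PSET_LOAD_NUMERATOR_SHIFT before averaging). With 2 running CPUs, 1 thread
 * on the pset run queue and 1 realtime thread queued, the sample is
 *
 *	load = (2 + 1 + 1) << PSET_LOAD_NUMERATOR_SHIFT
 *
 * and the stored average decays halfway toward each new sample:
 *
 *	new_load_average = (old_load_average + load) / 2
 */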
5774 | |
5775 | /* pset is locked */ |
5776 | static processor_t |
5777 | choose_processor_for_realtime_thread(processor_set_t pset) |
5778 | { |
5779 | uint64_t cpu_map = (pset->cpu_bitmask & pset->recommended_bitmask & ~pset->pending_AST_cpu_mask); |
5780 | |
5781 | for (int cpuid = lsb_first(cpu_map); cpuid >= 0; cpuid = lsb_next(cpu_map, cpuid)) { |
5782 | processor_t processor = processor_array[cpuid]; |
5783 | |
5784 | if (processor->processor_primary != processor) { |
5785 | continue; |
5786 | } |
5787 | |
5788 | if (processor->state == PROCESSOR_IDLE) { |
5789 | return processor; |
5790 | } |
5791 | |
5792 | if ((processor->state != PROCESSOR_RUNNING) && (processor->state != PROCESSOR_DISPATCHING)) { |
5793 | continue; |
5794 | } |
5795 | |
5796 | if (processor->current_pri >= BASEPRI_RTQUEUES) { |
5797 | continue; |
5798 | } |
5799 | |
5800 | return processor; |
5801 | |
5802 | } |
5803 | |
5804 | if (!sched_allow_rt_smt) { |
5805 | return PROCESSOR_NULL; |
5806 | } |
5807 | |
5808 | /* Consider secondary processors */ |
5809 | for (int cpuid = lsb_first(cpu_map); cpuid >= 0; cpuid = lsb_next(cpu_map, cpuid)) { |
5810 | processor_t processor = processor_array[cpuid]; |
5811 | |
5812 | if (processor->processor_primary == processor) { |
5813 | continue; |
5814 | } |
5815 | |
5816 | if (processor->state == PROCESSOR_IDLE) { |
5817 | return processor; |
5818 | } |
5819 | |
5820 | if ((processor->state != PROCESSOR_RUNNING) && (processor->state != PROCESSOR_DISPATCHING)) { |
5821 | continue; |
5822 | } |
5823 | |
5824 | if (processor->current_pri >= BASEPRI_RTQUEUES) { |
5825 | continue; |
5826 | } |
5827 | |
5828 | return processor; |
5829 | |
5830 | } |
5831 | |
5832 | return PROCESSOR_NULL; |
5833 | } |
5834 | |
5835 | /* pset is locked */ |
5836 | static bool |
5837 | all_available_primaries_are_running_realtime_threads(processor_set_t pset) |
5838 | { |
5839 | uint64_t cpu_map = (pset->cpu_bitmask & pset->recommended_bitmask); |
5840 | |
5841 | for (int cpuid = lsb_first(cpu_map); cpuid >= 0; cpuid = lsb_next(cpu_map, cpuid)) { |
5842 | processor_t processor = processor_array[cpuid]; |
5843 | |
5844 | if (processor->processor_primary != processor) { |
5845 | continue; |
5846 | } |
5847 | |
5848 | if (processor->state == PROCESSOR_IDLE) { |
5849 | return false; |
5850 | } |
5851 | |
5852 | if (processor->state == PROCESSOR_DISPATCHING) { |
5853 | return false; |
5854 | } |
5855 | |
5856 | if (processor->state != PROCESSOR_RUNNING) { |
5857 | /* |
5858 | * All other processor states are considered unavailable to run |
5859 | * realtime threads. In particular, we prefer an available secondary |
5860 | * processor over the risk of leaving a realtime thread on the run queue |
5861 | * while waiting for a processor in PROCESSOR_START state, |
5862 | * which should anyway be a rare case. |
5863 | */ |
5864 | continue; |
5865 | } |
5866 | |
5867 | if (processor->current_pri < BASEPRI_RTQUEUES) { |
5868 | return false; |
5869 | } |
5870 | } |
5871 | |
5872 | return true; |
5873 | } |
5874 | |
5875 | |
5876 | |