1 | /* |
2 | * Copyright (c) 2007-2023 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | |
29 | /* |
30 | * Routines for preemption disablement, |
31 | * which prevents the current thread from giving up its current CPU. |
32 | */ |
33 | |
34 | #include <arm/cpu_data.h> |
35 | #include <arm/cpu_data_internal.h> |
36 | #include <arm/preemption_disable_internal.h> |
37 | #include <kern/cpu_data.h> |
38 | #include <kern/percpu.h> |
39 | #include <kern/thread.h> |
40 | #include <mach/machine/sdt.h> |
41 | #include <os/base.h> |
42 | #include <stdint.h> |
43 | #include <sys/kdebug.h> |
44 | |
45 | #if SCHED_HYGIENE_DEBUG |
46 | static void |
47 | _do_disable_preemption_without_measurements(void); |
48 | #endif |
49 | |
50 | /* |
51 | * This function checks whether an AST_URGENT has been pended. |
52 | * |
53 | * It is called once the preemption has been reenabled, which means the thread |
54 | * may have been preempted right before this was called, and when this function |
55 | * actually performs the check, we've changed CPU. |
56 | * |
57 | * This race is however benign: the point of AST_URGENT is to trigger a context |
58 | * switch, so if one happened, there's nothing left to check for, and AST_URGENT |
59 | * was cleared in the process. |
60 | * |
61 | * It follows that this check cannot have false negatives, which allows us |
62 | * to avoid fiddling with interrupt state for the vast majority of cases |
63 | * when the check will actually be negative. |
64 | */ |
65 | static OS_NOINLINE |
66 | void |
67 | kernel_preempt_check(void) |
68 | { |
69 | uint64_t state; |
70 | |
71 | /* If interrupts are masked, we can't take an AST here */ |
72 | state = __builtin_arm_rsr64("DAIF" ); |
73 | if (state & DAIF_IRQF) { |
74 | return; |
75 | } |
76 | |
77 | /* disable interrupts (IRQ FIQ ASYNCF) */ |
78 | __builtin_arm_wsr64("DAIFSet" , DAIFSC_STANDARD_DISABLE); |
79 | |
80 | /* |
81 | * Reload cpu_pending_ast: a context switch would cause it to change. |
82 | * Now that interrupts are disabled, this will debounce false positives. |
83 | */ |
84 | if (current_thread()->machine.CpuDatap->cpu_pending_ast & AST_URGENT) { |
85 | ast_taken_kernel(); |
86 | } |
87 | |
88 | /* restore the original interrupt mask */ |
89 | __builtin_arm_wsr64("DAIF" , state); |
90 | } |
91 | |
92 | static inline void |
93 | _enable_preemption_write_count(thread_t thread, unsigned int count) |
94 | { |
95 | os_atomic_store(&thread->machine.preemption_count, count, compiler_acq_rel); |
96 | |
97 | /* |
98 | * This check is racy and could load from another CPU's pending_ast mask, |
99 | * but as described above, this can't have false negatives. |
100 | */ |
101 | if (count == 0) { |
102 | if (__improbable(thread->machine.CpuDatap->cpu_pending_ast & AST_URGENT)) { |
103 | return kernel_preempt_check(); |
104 | } |
105 | } |
106 | } |
107 | |
108 | /* |
109 | * This function is written in a way that the codegen is extremely short. |
110 | * |
111 | * LTO isn't smart enough to inline it, yet it is profitable because |
112 | * the vast majority of callers use current_thread() already. |
113 | * |
114 | * TODO: It is unfortunate that we have to load |
115 | * sched_preemption_disable_debug_mode |
116 | * |
117 | * /!\ Breaking inlining causes zalloc to be roughly 10% slower /!\ |
118 | */ |
119 | OS_ALWAYS_INLINE |
120 | void |
121 | _disable_preemption(void) |
122 | { |
123 | thread_t thread = current_thread(); |
124 | unsigned int count = thread->machine.preemption_count; |
125 | |
126 | os_atomic_store(&thread->machine.preemption_count, |
127 | count + 1, compiler_acq_rel); |
128 | |
129 | #if SCHED_HYGIENE_DEBUG |
130 | /* |
131 | * Note that this is not the only place preemption gets disabled, |
132 | * it also gets modified on ISR and PPL entry/exit. Both of those |
133 | * events will be treated specially however, and |
134 | * increment/decrement being paired around their entry/exit means |
135 | * that collection here is not desynced otherwise. |
136 | */ |
137 | |
138 | if (__improbable(count == 0 && sched_preemption_disable_debug_mode)) { |
139 | __attribute__((musttail)) |
140 | return _prepare_preemption_disable_measurement(); |
141 | } |
142 | #endif /* SCHED_HYGIENE_DEBUG */ |
143 | } |
144 | |
145 | /* |
146 | * This variant of disable_preemption() allows disabling preemption |
147 | * without taking measurements (and later potentially triggering |
148 | * actions on those). |
149 | */ |
150 | OS_ALWAYS_INLINE |
151 | void |
152 | _disable_preemption_without_measurements(void) |
153 | { |
154 | thread_t thread = current_thread(); |
155 | unsigned int count = thread->machine.preemption_count; |
156 | |
157 | #if SCHED_HYGIENE_DEBUG |
158 | _do_disable_preemption_without_measurements(); |
159 | #endif /* SCHED_HYGIENE_DEBUG */ |
160 | |
161 | os_atomic_store(&thread->machine.preemption_count, |
162 | count + 1, compiler_acq_rel); |
163 | } |
164 | |
165 | /* |
166 | * To help _enable_preemption() inline everywhere with LTO, |
167 | * we keep these nice non inlineable functions as the panic() |
168 | * codegen setup is quite large and for weird reasons causes a frame. |
169 | */ |
170 | __abortlike |
171 | static void |
172 | _enable_preemption_underflow(void) |
173 | { |
174 | panic("Preemption count underflow" ); |
175 | } |
176 | |
177 | /* |
178 | * This function is written in a way that the codegen is extremely short. |
179 | * |
180 | * LTO isn't smart enough to inline it, yet it is profitable because |
181 | * the vast majority of callers use current_thread() already. |
182 | * |
183 | * The SCHED_HYGIENE_MARKER trick is used so that we do not have to load |
184 | * unrelated fields of current_thread(). |
185 | * |
186 | * /!\ Breaking inlining causes zalloc to be roughly 10% slower /!\ |
187 | */ |
188 | OS_ALWAYS_INLINE |
189 | void |
190 | _enable_preemption(void) |
191 | { |
192 | thread_t thread = current_thread(); |
193 | unsigned int count = thread->machine.preemption_count; |
194 | |
195 | if (__improbable(count == 0)) { |
196 | _enable_preemption_underflow(); |
197 | } |
198 | |
199 | #if SCHED_HYGIENE_DEBUG |
200 | if (__improbable(count == SCHED_HYGIENE_MARKER + 1)) { |
201 | return _collect_preemption_disable_measurement(); |
202 | } |
203 | #endif /* SCHED_HYGIENE_DEBUG */ |
204 | |
205 | _enable_preemption_write_count(thread, count: count - 1); |
206 | } |
207 | |
208 | OS_ALWAYS_INLINE |
209 | unsigned int |
210 | get_preemption_level_for_thread(thread_t thread) |
211 | { |
212 | unsigned int count = thread->machine.preemption_count; |
213 | |
214 | #if SCHED_HYGIENE_DEBUG |
215 | /* |
216 | * hide this "flag" from callers, |
217 | * and it would make the count look negative anyway |
218 | * which some people dislike |
219 | */ |
220 | count &= ~SCHED_HYGIENE_MARKER; |
221 | #endif |
222 | return (int)count; |
223 | } |
224 | |
225 | OS_ALWAYS_INLINE |
226 | int |
227 | get_preemption_level(void) |
228 | { |
229 | return get_preemption_level_for_thread(thread: current_thread()); |
230 | } |
231 | |
232 | #if SCHED_HYGIENE_DEBUG |
233 | |
/*
 * NOTE(review): preemption_disable_max_mt is not referenced anywhere else in
 * the visible part of this file -- presumably consumed elsewhere; confirm.
 */
uint64_t _Atomic PERCPU_DATA_HACK_78750602(preemption_disable_max_mt);

/*
 * Platform defaults for the preemption-disable timeout and for the action
 * taken when it is exceeded. The timeout is registered below with
 * MACHINE_TIMEOUT_UNIT_TIMEBASE, i.e. it is expressed in timebase ticks.
 * NOTE(review): 120000 ticks == 5ms presumably assumes a 24 MHz timebase -- confirm.
 */
#if XNU_PLATFORM_iPhoneOS
#define DEFAULT_PREEMPTION_TIMEOUT 120000 /* 5ms */
#define DEFAULT_PREEMPTION_MODE SCHED_HYGIENE_MODE_PANIC
#else
#define DEFAULT_PREEMPTION_TIMEOUT 0 /* Disabled */
#define DEFAULT_PREEMPTION_MODE SCHED_HYGIENE_MODE_OFF
#endif /* XNU_PLATFORM_iPhoneOS */

/*
 * Threshold compared against the measured preemption-disabled duration in
 * _collect_preemption_disable_measurement(); a value of 0 disables the check.
 * kprintf_spam_mt_pred is registered as its skip predicate.
 */
MACHINE_TIMEOUT_DEV_WRITEABLE(sched_preemption_disable_threshold_mt, "sched-preemption",
    DEFAULT_PREEMPTION_TIMEOUT, MACHINE_TIMEOUT_UNIT_TIMEBASE, kprintf_spam_mt_pred);
/*
 * Debug mode: a non-zero value makes _disable_preemption() start measurements,
 * and SCHED_HYGIENE_MODE_PANIC turns a threshold violation into a panic.
 */
TUNABLE_DT_WRITEABLE(sched_hygiene_mode_t, sched_preemption_disable_debug_mode,
    "machine-timeouts",
    "sched-preemption-disable-mode", /* DT property names have to be 31 chars max */
    "sched_preemption_disable_debug_mode",
    DEFAULT_PREEMPTION_MODE,
    TUNABLE_DT_CHECK_CHOSEN);

/* Per-CPU measurement state: start snapshot, abandon flag and maximum duration. */
struct _preemption_disable_pcpu PERCPU_DATA(_preemption_disable_pcpu_data);
254 | |
255 | /* |
256 | ** Start a measurement window for the current CPU's preemption disable timeout. |
257 | * |
258 | * Interrupts must be disabled when calling this function, |
259 | * but the assertion has been elided as this is on the fast path. |
260 | */ |
261 | static void |
262 | _preemption_disable_snap_start(void) |
263 | { |
264 | struct _preemption_disable_pcpu *pcpu = PERCPU_GET(_preemption_disable_pcpu_data); |
265 | pcpu->pdp_abandon = false; |
266 | pcpu->pdp_start.pds_mach_time = ml_get_sched_hygiene_timebase(); |
267 | pcpu->pdp_start.pds_int_mach_time = recount_current_processor_interrupt_time_mach(); |
268 | #if CONFIG_CPU_COUNTERS |
269 | if (__probable(sched_hygiene_debug_pmc)) { |
270 | mt_cur_cpu_cycles_instrs_speculative(&pcpu->pdp_start.pds_cycles, |
271 | &pcpu->pdp_start.pds_instrs); |
272 | } |
273 | #endif /* CONFIG_CPU_COUNTERS */ |
274 | } |
275 | |
276 | /* |
277 | ** |
278 | * End a measurement window for the current CPU's preemption disable timeout, |
279 | * using the snapshot started by _preemption_disable_snap_start(). |
280 | * |
281 | * @param start An out-parameter for the starting snapshot, |
282 | * captured while interrupts are disabled. |
283 | * |
284 | * @param now An out-parameter for the current times, |
285 | * captured at the same time as the start and with interrupts disabled. |
286 | * This is meant for computing a delta. |
287 | * Even with @link sched_hygiene_debug_pmc , the PMCs will not be read. |
288 | * This allows their (relatively expensive) reads to happen only if the time threshold has been violated. |
289 | * |
290 | * @return Whether to abandon the current measurement due to a call to abandon_preemption_disable_measurement(). |
291 | */ |
292 | static bool |
293 | _preemption_disable_snap_end( |
294 | struct _preemption_disable_snap *start, |
295 | struct _preemption_disable_snap *now) |
296 | { |
297 | struct _preemption_disable_pcpu *pcpu = PERCPU_GET(_preemption_disable_pcpu_data); |
298 | |
299 | const bool int_masked_debug = false; |
300 | const bool istate = ml_set_interrupts_enabled_with_debug(false, int_masked_debug); |
301 | /* |
302 | * Collect start time and current time with interrupts disabled. |
303 | * Otherwise an interrupt coming in after grabbing the timestamp |
304 | * could spuriously inflate the measurement, because it will |
305 | * adjust preemption_disable_mt only after we already grabbed |
306 | * it. |
307 | * |
308 | * (Even worse if we collected the current time first: Then a |
309 | * subsequent interrupt could adjust preemption_disable_mt to |
310 | * make the duration go negative after subtracting the already |
311 | * grabbed time. With interrupts disabled we don't care much about |
312 | * the order.) |
313 | */ |
314 | |
315 | *start = pcpu->pdp_start; |
316 | uint64_t now_time = ml_get_sched_hygiene_timebase(); |
317 | now->pds_mach_time = now_time; |
318 | now->pds_int_mach_time = recount_current_processor_interrupt_time_mach(); |
319 | const bool abandon = pcpu->pdp_abandon; |
320 | const uint64_t max_duration = os_atomic_load(&pcpu->pdp_max_mach_duration, relaxed); |
321 | |
322 | pcpu->pdp_start.pds_mach_time = 0; |
323 | |
324 | /* |
325 | * Don't need to reset (or even save) pdp_abandon here: |
326 | * abandon_preemption_disable_measurement is a no-op anyway |
327 | * if pdp_start.pds_mach_time == 0 (which we just set), and it |
328 | * will stay that way until the next call to |
329 | * _collect_preemption_disable_measurement. |
330 | */ |
331 | ml_set_interrupts_enabled_with_debug(istate, int_masked_debug); |
332 | if (__probable(!abandon)) { |
333 | const int64_t gross_duration = now_time - start->pds_mach_time; |
334 | if (__improbable(gross_duration > max_duration)) { |
335 | os_atomic_store(&pcpu->pdp_max_mach_duration, gross_duration, relaxed); |
336 | } |
337 | } |
338 | return abandon; |
339 | } |
340 | |
341 | OS_NOINLINE |
342 | void |
343 | _prepare_preemption_disable_measurement(void) |
344 | { |
345 | thread_t thread = current_thread(); |
346 | |
347 | if (thread->machine.inthandler_timestamp == 0) { |
348 | /* |
349 | * Only prepare a measurement if not currently in an interrupt |
350 | * handler. |
351 | * |
352 | * We are only interested in the net duration of disabled |
353 | * preemption, that is: The time in which preemption was |
354 | * disabled, minus the intervals in which any (likely |
355 | * unrelated) interrupts were handled. |
356 | * recount_current_thread_interrupt_time_mach() will remove those |
357 | * intervals, however we also do not even start measuring |
358 | * preemption disablement if we are already within handling of |
359 | * an interrupt when preemption was disabled (the resulting |
360 | * net time would be 0). |
361 | * |
362 | * Interrupt handling duration is handled separately, and any |
363 | * long intervals of preemption disablement are counted |
364 | * towards that. |
365 | */ |
366 | |
367 | bool const int_masked_debug = false; |
368 | bool istate = ml_set_interrupts_enabled_with_debug(false, int_masked_debug); |
369 | thread->machine.preemption_count |= SCHED_HYGIENE_MARKER; |
370 | _preemption_disable_snap_start(); |
371 | ml_set_interrupts_enabled_with_debug(istate, int_masked_debug); |
372 | } |
373 | } |
374 | |
375 | OS_NOINLINE |
376 | void |
377 | _collect_preemption_disable_measurement(void) |
378 | { |
379 | struct _preemption_disable_snap start = { 0 }; |
380 | struct _preemption_disable_snap now = { 0 }; |
381 | const bool abandon = _preemption_disable_snap_end(&start, &now); |
382 | |
383 | if (__improbable(abandon)) { |
384 | goto out; |
385 | } |
386 | |
387 | int64_t const gross_duration = now.pds_mach_time - start.pds_mach_time; |
388 | uint64_t const threshold = os_atomic_load(&sched_preemption_disable_threshold_mt, relaxed); |
389 | if (__improbable(threshold > 0 && gross_duration >= threshold)) { |
390 | /* |
391 | * Double check that the time spent not handling interrupts is over the threshold. |
392 | */ |
393 | int64_t const interrupt_duration = now.pds_int_mach_time - start.pds_int_mach_time; |
394 | int64_t const net_duration = gross_duration - interrupt_duration; |
395 | assert3u(net_duration, >=, 0); |
396 | if (net_duration < threshold) { |
397 | goto out; |
398 | } |
399 | |
400 | uint64_t average_freq = 0; |
401 | uint64_t average_cpi_whole = 0; |
402 | uint64_t average_cpi_fractional = 0; |
403 | |
404 | #if CONFIG_CPU_COUNTERS |
405 | if (__probable(sched_hygiene_debug_pmc)) { |
406 | /* |
407 | * We're getting these values a bit late, but getting them |
408 | * is a bit expensive, so we take the slight hit in |
409 | * accuracy for the reported values (which aren't very |
410 | * stable anyway). |
411 | */ |
412 | const bool int_masked_debug = false; |
413 | const bool istate = ml_set_interrupts_enabled_with_debug(false, int_masked_debug); |
414 | mt_cur_cpu_cycles_instrs_speculative(&now.pds_cycles, &now.pds_instrs); |
415 | ml_set_interrupts_enabled_with_debug(istate, int_masked_debug); |
416 | const uint64_t cycles_elapsed = now.pds_cycles - start.pds_cycles; |
417 | const uint64_t instrs_retired = now.pds_instrs - start.pds_instrs; |
418 | |
419 | uint64_t duration_ns; |
420 | absolutetime_to_nanoseconds(gross_duration, &duration_ns); |
421 | |
422 | average_freq = cycles_elapsed / (duration_ns / 1000); |
423 | average_cpi_whole = cycles_elapsed / instrs_retired; |
424 | average_cpi_fractional = |
425 | ((cycles_elapsed * 100) / instrs_retired) % 100; |
426 | } |
427 | #endif /* CONFIG_CPU_COUNTERS */ |
428 | |
429 | if (__probable(sched_preemption_disable_debug_mode == SCHED_HYGIENE_MODE_PANIC)) { |
430 | panic("preemption disable timeout exceeded: %llu >= %llu mt ticks (start: %llu, now: %llu, gross: %llu, inttime: %llu), " |
431 | "freq = %llu MHz, CPI = %llu.%llu" , |
432 | net_duration, threshold, start.pds_mach_time, now.pds_mach_time, |
433 | gross_duration, interrupt_duration, |
434 | average_freq, average_cpi_whole, average_cpi_fractional); |
435 | } |
436 | |
437 | DTRACE_SCHED4(mach_preemption_expired, uint64_t, net_duration, uint64_t, gross_duration, |
438 | uint64_t, average_cpi_whole, uint64_t, average_cpi_fractional); |
439 | KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PREEMPTION_EXPIRED), net_duration, gross_duration, average_cpi_whole, average_cpi_fractional); |
440 | } |
441 | |
442 | out: |
443 | /* |
444 | * the preemption count is SCHED_HYGIENE_MARKER, we need to clear it. |
445 | */ |
446 | _enable_preemption_write_count(current_thread(), 0); |
447 | } |
448 | |
449 | /* |
450 | * Abandon a potential preemption disable measurement. Useful for |
451 | * example for the idle thread, which would just spuriously |
452 | * trigger the threshold while actually idling, which we don't |
453 | * care about. |
454 | */ |
455 | void |
456 | abandon_preemption_disable_measurement(void) |
457 | { |
458 | const bool int_masked_debug = false; |
459 | bool istate = ml_set_interrupts_enabled_with_debug(false, int_masked_debug); |
460 | struct _preemption_disable_pcpu *pcpu = PERCPU_GET(_preemption_disable_pcpu_data); |
461 | if (pcpu->pdp_start.pds_mach_time != 0) { |
462 | pcpu->pdp_abandon = true; |
463 | } |
464 | ml_set_interrupts_enabled_with_debug(istate, int_masked_debug); |
465 | } |
466 | |
467 | /* Inner part of disable_preemption_without_measuerments() */ |
468 | OS_ALWAYS_INLINE |
469 | static void |
470 | _do_disable_preemption_without_measurements(void) |
471 | { |
472 | /* |
473 | * Inform _collect_preemption_disable_measurement() |
474 | * that we didn't really care. |
475 | */ |
476 | struct _preemption_disable_pcpu *pcpu = PERCPU_GET(_preemption_disable_pcpu_data); |
477 | pcpu->pdp_abandon = true; |
478 | } |
479 | |
480 | /** |
481 | * Reset the max interrupt durations of all CPUs. |
482 | */ |
483 | void preemption_disable_reset_max_durations(void); |
484 | void |
485 | preemption_disable_reset_max_durations(void) |
486 | { |
487 | percpu_foreach(pcpu, _preemption_disable_pcpu_data) { |
488 | os_atomic_store(&pcpu->pdp_max_mach_duration, 0, relaxed); |
489 | } |
490 | } |
491 | |
492 | unsigned int preemption_disable_get_max_durations(uint64_t *durations, size_t count); |
493 | unsigned int |
494 | preemption_disable_get_max_durations(uint64_t *durations, size_t count) |
495 | { |
496 | int cpu = 0; |
497 | percpu_foreach(pcpu, _preemption_disable_pcpu_data) { |
498 | if (cpu < count) { |
499 | durations[cpu++] = os_atomic_load(&pcpu->pdp_max_mach_duration, relaxed); |
500 | } |
501 | } |
502 | return cpu; |
503 | } |
504 | |
505 | /* |
506 | * Skip predicate for sched_preemption_disable, which would trigger |
507 | * spuriously when kprintf spam is enabled. |
508 | */ |
509 | bool |
510 | kprintf_spam_mt_pred(struct machine_timeout_spec const __unused *spec) |
511 | { |
512 | bool const kprintf_spam_enabled = !(disable_kprintf_output || disable_serial_output); |
513 | return kprintf_spam_enabled; |
514 | } |
515 | |
/*
 * Exported wrapper around abandon_preemption_disable_measurement(),
 * provided for AppleCLPC as a workaround to rdar://91668370.
 *
 * Only for AppleCLPC!
 */
void
sched_perfcontrol_abandon_preemption_disable_measurement(void)
{
	/* Plain forwarding call; the wrapper only exists to be exported. */
	abandon_preemption_disable_measurement();
}
526 | |
527 | #else /* SCHED_HYGIENE_DEBUG */ |
528 | |
/*
 * Without SCHED_HYGIENE_DEBUG there is no measurement to abandon, but the
 * symbol is exported and therefore still has to be defined.
 */
void
sched_perfcontrol_abandon_preemption_disable_measurement(void)
{
	/* Intentionally empty. */
}
534 | |
535 | #endif /* SCHED_HYGIENE_DEBUG */ |
536 | |