/*
 * Copyright (c) 2007-2023 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Routines for preemption disablement,
 * which prevents the current thread from giving up its current CPU.
 */
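
/*
 * Typical usage, as a rough sketch (callers normally go through the
 * disable_preemption() / enable_preemption() wrappers, which funnel into
 * the functions below):
 *
 *	disable_preemption();
 *	... access per-CPU state without risk of migration ...
 *	enable_preemption();
 *
 * Calls nest: the per-thread count is incremented on each disable, and
 * preemption is only truly re-enabled once it drops back to zero.
 */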

#include <arm/cpu_data.h>
#include <arm/cpu_data_internal.h>
#include <arm/preemption_disable_internal.h>
#include <kern/cpu_data.h>
#include <kern/percpu.h>
#include <kern/thread.h>
#include <mach/machine/sdt.h>
#include <os/base.h>
#include <stdint.h>
#include <sys/kdebug.h>

#if SCHED_HYGIENE_DEBUG
static void
_do_disable_preemption_without_measurements(void);
#endif

/*
 * This function checks whether an AST_URGENT has been pended.
 *
 * It is called once preemption has been reenabled, which means the thread
 * may have been preempted right before this was called, and by the time this
 * function actually performs the check, we may have changed CPUs.
 *
 * This race is however benign: the point of AST_URGENT is to trigger a context
 * switch, so if one happened, there's nothing left to check for, and AST_URGENT
 * was cleared in the process.
 *
 * It follows that this check cannot have false negatives, which allows us
 * to avoid fiddling with interrupt state for the vast majority of cases
 * when the check will actually be negative.
 */
static OS_NOINLINE
void
kernel_preempt_check(void)
{
	uint64_t state;

	/* If interrupts are masked, we can't take an AST here */
	state = __builtin_arm_rsr64("DAIF");
	if (state & DAIF_IRQF) {
		return;
	}

	/* disable interrupts (IRQ FIQ ASYNCF) */
	__builtin_arm_wsr64("DAIFSet", DAIFSC_STANDARD_DISABLE);

	/*
	 * Reload cpu_pending_ast: a context switch would cause it to change.
	 * Now that interrupts are disabled, this will debounce false positives.
	 */
	if (current_thread()->machine.CpuDatap->cpu_pending_ast & AST_URGENT) {
		ast_taken_kernel();
	}

	/* restore the original interrupt mask */
	__builtin_arm_wsr64("DAIF", state);
}

static inline void
_enable_preemption_write_count(thread_t thread, unsigned int count)
{
	os_atomic_store(&thread->machine.preemption_count, count, compiler_acq_rel);

	/*
	 * This check is racy and could load from another CPU's pending_ast mask,
	 * but as described above, this can't have false negatives.
	 */
	if (count == 0) {
		if (__improbable(thread->machine.CpuDatap->cpu_pending_ast & AST_URGENT)) {
			return kernel_preempt_check();
		}
	}
}

/*
 * This function is written in a way that the codegen is extremely short.
 *
 * LTO isn't smart enough to inline it, yet it is profitable because
 * the vast majority of callers use current_thread() already.
 *
 * TODO: It is unfortunate that we have to load
 * sched_preemption_disable_debug_mode
 *
 * /!\ Breaking inlining causes zalloc to be roughly 10% slower /!\
 */
OS_ALWAYS_INLINE
void
_disable_preemption(void)
{
	thread_t thread = current_thread();
	unsigned int count = thread->machine.preemption_count;

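	/*
	 * Note: the count is per-thread, so no hardware ordering is needed;
	 * compiler_acq_rel is assumed here to act as a compiler-only barrier
	 * that keeps the increment from moving relative to the critical
	 * section it protects.
	 */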
	os_atomic_store(&thread->machine.preemption_count,
	    count + 1, compiler_acq_rel);

#if SCHED_HYGIENE_DEBUG
	/*
	 * Note that this is not the only place preemption gets disabled,
	 * it also gets modified on ISR and PPL entry/exit. Both of those
	 * events will be treated specially however, and
	 * increment/decrement being paired around their entry/exit means
	 * that collection here is not desynced otherwise.
	 */

	if (__improbable(count == 0 && sched_preemption_disable_debug_mode)) {
		__attribute__((musttail))
		return _prepare_preemption_disable_measurement();
	}
#endif /* SCHED_HYGIENE_DEBUG */
}

/*
 * This variant of disable_preemption() allows disabling preemption
 * without taking measurements (and later potentially triggering
 * actions on those).
 */
OS_ALWAYS_INLINE
void
_disable_preemption_without_measurements(void)
{
	thread_t thread = current_thread();
	unsigned int count = thread->machine.preemption_count;

#if SCHED_HYGIENE_DEBUG
	_do_disable_preemption_without_measurements();
#endif /* SCHED_HYGIENE_DEBUG */

	os_atomic_store(&thread->machine.preemption_count,
	    count + 1, compiler_acq_rel);
}

/*
 * To help _enable_preemption() inline everywhere with LTO,
 * we keep these nice non-inlineable functions as the panic()
 * codegen setup is quite large and for weird reasons causes a frame.
 */
__abortlike
static void
_enable_preemption_underflow(void)
{
	panic("Preemption count underflow");
}

/*
 * This function is written in a way that the codegen is extremely short.
 *
 * LTO isn't smart enough to inline it, yet it is profitable because
 * the vast majority of callers use current_thread() already.
 *
 * The SCHED_HYGIENE_MARKER trick is used so that we do not have to load
 * unrelated fields of current_thread().
 *
 * /!\ Breaking inlining causes zalloc to be roughly 10% slower /!\
 */
OS_ALWAYS_INLINE
void
_enable_preemption(void)
{
	thread_t thread = current_thread();
	unsigned int count = thread->machine.preemption_count;

	if (__improbable(count == 0)) {
		_enable_preemption_underflow();
	}

#if SCHED_HYGIENE_DEBUG
	if (__improbable(count == SCHED_HYGIENE_MARKER + 1)) {
		return _collect_preemption_disable_measurement();
	}
#endif /* SCHED_HYGIENE_DEBUG */

	_enable_preemption_write_count(thread, count - 1);
}

OS_ALWAYS_INLINE
int
get_preemption_level_for_thread(thread_t thread)
{
	unsigned int count = thread->machine.preemption_count;

#if SCHED_HYGIENE_DEBUG
	/*
	 * Hide this "flag" from callers; it would also make the count
	 * look negative, which some people dislike.
	 */
	count &= ~SCHED_HYGIENE_MARKER;
#endif
	return (int)count;
}

OS_ALWAYS_INLINE
int
get_preemption_level(void)
{
	return get_preemption_level_for_thread(current_thread());
}

#if SCHED_HYGIENE_DEBUG

uint64_t _Atomic PERCPU_DATA_HACK_78750602(preemption_disable_max_mt);

#if XNU_PLATFORM_iPhoneOS
#define DEFAULT_PREEMPTION_TIMEOUT 120000 /* 5ms */
#define DEFAULT_PREEMPTION_MODE SCHED_HYGIENE_MODE_PANIC
#else
#define DEFAULT_PREEMPTION_TIMEOUT 0 /* Disabled */
#define DEFAULT_PREEMPTION_MODE SCHED_HYGIENE_MODE_OFF
#endif /* XNU_PLATFORM_iPhoneOS */
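
/*
 * For reference (assuming the 24 MHz ARM generic timer these platforms
 * typically use): 120000 timebase ticks / 24 ticks-per-usec = 5000 usec,
 * i.e. the 5 ms default above.
 */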

MACHINE_TIMEOUT_DEV_WRITEABLE(sched_preemption_disable_threshold_mt, "sched-preemption",
    DEFAULT_PREEMPTION_TIMEOUT, MACHINE_TIMEOUT_UNIT_TIMEBASE, kprintf_spam_mt_pred);
TUNABLE_DT_WRITEABLE(sched_hygiene_mode_t, sched_preemption_disable_debug_mode,
    "machine-timeouts",
    "sched-preemption-disable-mode", /* DT property names have to be 31 chars max */
    "sched_preemption_disable_debug_mode",
    DEFAULT_PREEMPTION_MODE,
    TUNABLE_DT_CHECK_CHOSEN);

struct _preemption_disable_pcpu PERCPU_DATA(_preemption_disable_pcpu_data);

/**
 * Start a measurement window for the current CPU's preemption disable timeout.
 *
 * Interrupts must be disabled when calling this function,
 * but the assertion has been elided as this is on the fast path.
 */
static void
_preemption_disable_snap_start(void)
{
	struct _preemption_disable_pcpu *pcpu = PERCPU_GET(_preemption_disable_pcpu_data);
	pcpu->pdp_abandon = false;
	pcpu->pdp_start.pds_mach_time = ml_get_sched_hygiene_timebase();
	pcpu->pdp_start.pds_int_mach_time = recount_current_processor_interrupt_time_mach();
#if CONFIG_CPU_COUNTERS
	if (__probable(sched_hygiene_debug_pmc)) {
		mt_cur_cpu_cycles_instrs_speculative(&pcpu->pdp_start.pds_cycles,
		    &pcpu->pdp_start.pds_instrs);
	}
#endif /* CONFIG_CPU_COUNTERS */
}

/**
 * End a measurement window for the current CPU's preemption disable timeout,
 * using the snapshot started by _preemption_disable_snap_start().
 *
 * @param start An out-parameter for the starting snapshot,
 * captured while interrupts are disabled.
 *
 * @param now An out-parameter for the current times,
 * captured at the same time as the start and with interrupts disabled.
 * This is meant for computing a delta.
 * Even with @link sched_hygiene_debug_pmc, the PMCs will not be read.
 * This allows their (relatively expensive) reads to happen only if the
 * time threshold has been violated.
 *
 * @return Whether to abandon the current measurement due to a call to
 * abandon_preemption_disable_measurement().
 */
static bool
_preemption_disable_snap_end(
	struct _preemption_disable_snap *start,
	struct _preemption_disable_snap *now)
{
	struct _preemption_disable_pcpu *pcpu = PERCPU_GET(_preemption_disable_pcpu_data);

	const bool int_masked_debug = false;
	const bool istate = ml_set_interrupts_enabled_with_debug(false, int_masked_debug);
	/*
	 * Collect start time and current time with interrupts disabled.
	 * Otherwise an interrupt coming in after grabbing the timestamp
	 * could spuriously inflate the measurement, because it will
	 * adjust preemption_disable_mt only after we already grabbed
	 * it.
	 *
	 * (Even worse if we collected the current time first: Then a
	 * subsequent interrupt could adjust preemption_disable_mt to
	 * make the duration go negative after subtracting the already
	 * grabbed time. With interrupts disabled we don't care much about
	 * the order.)
	 */

	*start = pcpu->pdp_start;
	uint64_t now_time = ml_get_sched_hygiene_timebase();
	now->pds_mach_time = now_time;
	now->pds_int_mach_time = recount_current_processor_interrupt_time_mach();
	const bool abandon = pcpu->pdp_abandon;
	const uint64_t max_duration = os_atomic_load(&pcpu->pdp_max_mach_duration, relaxed);

	pcpu->pdp_start.pds_mach_time = 0;

	/*
	 * Don't need to reset (or even save) pdp_abandon here:
	 * abandon_preemption_disable_measurement is a no-op anyway
	 * if pdp_start.pds_mach_time == 0 (which we just set), and it
	 * will stay that way until the next call to
	 * _collect_preemption_disable_measurement.
	 */
	ml_set_interrupts_enabled_with_debug(istate, int_masked_debug);
	if (__probable(!abandon)) {
		const int64_t gross_duration = now_time - start->pds_mach_time;
		if (__improbable(gross_duration > max_duration)) {
			os_atomic_store(&pcpu->pdp_max_mach_duration, gross_duration, relaxed);
		}
	}
	return abandon;
}

OS_NOINLINE
void
_prepare_preemption_disable_measurement(void)
{
	thread_t thread = current_thread();

	if (thread->machine.inthandler_timestamp == 0) {
		/*
		 * Only prepare a measurement if not currently in an interrupt
		 * handler.
		 *
		 * We are only interested in the net duration of disabled
		 * preemption, that is: The time in which preemption was
		 * disabled, minus the intervals in which any (likely
		 * unrelated) interrupts were handled.
		 * recount_current_processor_interrupt_time_mach() will remove those
		 * intervals, however we also do not even start measuring
		 * preemption disablement if we are already within handling of
		 * an interrupt when preemption was disabled (the resulting
		 * net time would be 0).
		 *
		 * Interrupt handling duration is handled separately, and any
		 * long intervals of preemption disablement are counted
		 * towards that.
		 */

		bool const int_masked_debug = false;
		bool istate = ml_set_interrupts_enabled_with_debug(false, int_masked_debug);
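		/*
		 * Tag the outermost disable with SCHED_HYGIENE_MARKER:
		 * _enable_preemption() recognizes the marker (the count reads
		 * as SCHED_HYGIENE_MARKER + 1) and collects the measurement
		 * on the matching enable.
		 */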
		thread->machine.preemption_count |= SCHED_HYGIENE_MARKER;
		_preemption_disable_snap_start();
		ml_set_interrupts_enabled_with_debug(istate, int_masked_debug);
	}
}

OS_NOINLINE
void
_collect_preemption_disable_measurement(void)
{
	struct _preemption_disable_snap start = { 0 };
	struct _preemption_disable_snap now = { 0 };
	const bool abandon = _preemption_disable_snap_end(&start, &now);

	if (__improbable(abandon)) {
		goto out;
	}

	int64_t const gross_duration = now.pds_mach_time - start.pds_mach_time;
	uint64_t const threshold = os_atomic_load(&sched_preemption_disable_threshold_mt, relaxed);
	if (__improbable(threshold > 0 && gross_duration >= threshold)) {
		/*
		 * Double check that the time spent not handling interrupts is over the threshold.
		 */
		int64_t const interrupt_duration = now.pds_int_mach_time - start.pds_int_mach_time;
		int64_t const net_duration = gross_duration - interrupt_duration;
		assert3u(net_duration, >=, 0);
		if (net_duration < threshold) {
			goto out;
		}

		uint64_t average_freq = 0;
		uint64_t average_cpi_whole = 0;
		uint64_t average_cpi_fractional = 0;

#if CONFIG_CPU_COUNTERS
		if (__probable(sched_hygiene_debug_pmc)) {
			/*
			 * We're getting these values a bit late, but getting them
			 * is a bit expensive, so we take the slight hit in
			 * accuracy for the reported values (which aren't very
			 * stable anyway).
			 */
			const bool int_masked_debug = false;
			const bool istate = ml_set_interrupts_enabled_with_debug(false, int_masked_debug);
			mt_cur_cpu_cycles_instrs_speculative(&now.pds_cycles, &now.pds_instrs);
			ml_set_interrupts_enabled_with_debug(istate, int_masked_debug);
			const uint64_t cycles_elapsed = now.pds_cycles - start.pds_cycles;
			const uint64_t instrs_retired = now.pds_instrs - start.pds_instrs;

			uint64_t duration_ns;
			absolutetime_to_nanoseconds(gross_duration, &duration_ns);

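			/*
			 * Cycles per microsecond is MHz; the CPI is split into a
			 * whole part and its first two decimal digits (hence the
			 * "* 100 ... % 100" below).
			 */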
			average_freq = cycles_elapsed / (duration_ns / 1000);
			average_cpi_whole = cycles_elapsed / instrs_retired;
			average_cpi_fractional =
			    ((cycles_elapsed * 100) / instrs_retired) % 100;
		}
#endif /* CONFIG_CPU_COUNTERS */

		if (__probable(sched_preemption_disable_debug_mode == SCHED_HYGIENE_MODE_PANIC)) {
			panic("preemption disable timeout exceeded: %llu >= %llu mt ticks (start: %llu, now: %llu, gross: %llu, inttime: %llu), "
			    "freq = %llu MHz, CPI = %llu.%llu",
			    net_duration, threshold, start.pds_mach_time, now.pds_mach_time,
			    gross_duration, interrupt_duration,
			    average_freq, average_cpi_whole, average_cpi_fractional);
		}

		DTRACE_SCHED4(mach_preemption_expired, uint64_t, net_duration, uint64_t, gross_duration,
		    uint64_t, average_cpi_whole, uint64_t, average_cpi_fractional);
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PREEMPTION_EXPIRED), net_duration, gross_duration, average_cpi_whole, average_cpi_fractional);
	}

out:
	/*
	 * The preemption count still carries SCHED_HYGIENE_MARKER (it reads
	 * as SCHED_HYGIENE_MARKER + 1 here); write 0 to clear the marker and
	 * re-enable preemption.
	 */
	_enable_preemption_write_count(current_thread(), 0);
}

/*
 * Abandon a potential preemption disable measurement. Useful, for
 * example, for the idle thread, which would otherwise spuriously
 * trigger the threshold while merely idling, a case we don't
 * care about.
 */
void
abandon_preemption_disable_measurement(void)
{
	const bool int_masked_debug = false;
	bool istate = ml_set_interrupts_enabled_with_debug(false, int_masked_debug);
	struct _preemption_disable_pcpu *pcpu = PERCPU_GET(_preemption_disable_pcpu_data);
	if (pcpu->pdp_start.pds_mach_time != 0) {
		pcpu->pdp_abandon = true;
	}
	ml_set_interrupts_enabled_with_debug(istate, int_masked_debug);
}
/* Inner part of _disable_preemption_without_measurements() */
OS_ALWAYS_INLINE
static void
_do_disable_preemption_without_measurements(void)
{
	/*
	 * Inform _collect_preemption_disable_measurement()
	 * that we didn't really care.
	 */
	struct _preemption_disable_pcpu *pcpu = PERCPU_GET(_preemption_disable_pcpu_data);
	pcpu->pdp_abandon = true;
}

/**
 * Reset the maximum recorded preemption-disable durations of all CPUs.
 */
void preemption_disable_reset_max_durations(void);
void
preemption_disable_reset_max_durations(void)
{
	percpu_foreach(pcpu, _preemption_disable_pcpu_data) {
		os_atomic_store(&pcpu->pdp_max_mach_duration, 0, relaxed);
	}
}

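/**
 * Copy the per-CPU maximum preemption-disable durations (in mach timebase
 * ticks) into `durations`, up to `count` entries, and return the number of
 * entries written.
 */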
unsigned int preemption_disable_get_max_durations(uint64_t *durations, size_t count);
unsigned int
preemption_disable_get_max_durations(uint64_t *durations, size_t count)
{
	int cpu = 0;
	percpu_foreach(pcpu, _preemption_disable_pcpu_data) {
		if (cpu < count) {
			durations[cpu++] = os_atomic_load(&pcpu->pdp_max_mach_duration, relaxed);
		}
	}
	return cpu;
}

/*
 * Skip predicate for sched_preemption_disable, which would trigger
 * spuriously when kprintf spam is enabled.
 */
bool
kprintf_spam_mt_pred(struct machine_timeout_spec const __unused *spec)
{
	bool const kprintf_spam_enabled = !(disable_kprintf_output || disable_serial_output);
	return kprintf_spam_enabled;
}

/*
 * Abandon function exported for AppleCLPC, as a workaround to rdar://91668370.
 *
 * Only for AppleCLPC!
 */
void
sched_perfcontrol_abandon_preemption_disable_measurement(void)
{
	abandon_preemption_disable_measurement();
}

#else /* SCHED_HYGIENE_DEBUG */

void
sched_perfcontrol_abandon_preemption_disable_measurement(void)
{
	// No-op. Function is exported, so needs to be defined
}

#endif /* SCHED_HYGIENE_DEBUG */