1/*
2 * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58/*
59 * File: kern/machine.c
60 * Author: Avadis Tevanian, Jr.
61 * Date: 1987
62 *
63 * Support for machine independent machine abstraction.
64 */
65
66#include <string.h>
67
68#include <mach/mach_types.h>
69#include <mach/boolean.h>
70#include <mach/kern_return.h>
71#include <mach/machine.h>
72#include <mach/host_info.h>
73#include <mach/host_reboot.h>
74#include <mach/host_priv_server.h>
75#include <mach/processor_server.h>
76#include <mach/sdt.h>
77
78#include <kern/kern_types.h>
79#include <kern/cpu_data.h>
80#include <kern/ipc_host.h>
81#include <kern/host.h>
82#include <kern/machine.h>
83#include <kern/misc_protos.h>
84#include <kern/percpu.h>
85#include <kern/processor.h>
86#include <kern/queue.h>
87#include <kern/sched.h>
88#include <kern/startup.h>
89#include <kern/task.h>
90#include <kern/thread.h>
91#include <kern/iotrace.h>
92
93#include <libkern/OSDebug.h>
94#if ML_IO_TIMEOUTS_ENABLED
95#include <libkern/tree.h>
96#endif
97
98#include <pexpert/device_tree.h>
99
100#include <machine/commpage.h>
101#include <machine/machine_routines.h>
102
103#if HIBERNATION
104#include <IOKit/IOHibernatePrivate.h>
105#endif
106#include <IOKit/IOPlatformExpert.h>
107
108#if CONFIG_DTRACE
109extern void (*dtrace_cpu_state_changed_hook)(int, boolean_t);
110#endif
111
112#if defined(__arm64__)
113extern void wait_while_mp_kdp_trap(bool check_SIGPdebug);
114#if CONFIG_SPTM
115#include <arm64/sptm/pmap/pmap_data.h>
116#else
117#include <arm/pmap/pmap_data.h>
118#endif /* CONFIG_SPTM */
119#endif /* defined(__arm64__) */
120
121#if defined(__x86_64__)
122#include <i386/panic_notify.h>
123#endif
124
125#if ML_IO_TIMEOUTS_ENABLED
126#if defined(__x86_64__)
127#define ml_io_timestamp mach_absolute_time
128#else
129#define ml_io_timestamp ml_get_timebase
130#endif /* __x86_64__ */
131#endif /* ML_IO_TIMEOUTS_ENABLED */
132
133/*
134 * Exported variables:
135 */
136
137struct machine_info machine_info;
138
139/* Forwards */
140static void
141processor_doshutdown(processor_t processor);
142
143static void
144processor_offline(void * parameter, __unused wait_result_t result);
145
146static void
147processor_offline_intstack(processor_t processor) __dead2;
148
149static void
150processor_up_update_counts(processor_t processor)
151{
	ml_cpu_up_update_counts(processor->cpu_id);
153
154 os_atomic_inc(&processor_avail_count, relaxed);
155 if (processor->is_recommended) {
156 os_atomic_inc(&processor_avail_count_user, relaxed);
157 }
158 if (processor->processor_primary == processor) {
159 os_atomic_inc(&primary_processor_avail_count, relaxed);
160 if (processor->is_recommended) {
161 os_atomic_inc(&primary_processor_avail_count_user, relaxed);
162 }
163 }
164 commpage_update_active_cpus();
165}
166
167/*
168 * processor_up:
169 *
170 * Flag processor as up and running, and available
171 * for scheduling.
172 */
173void
174processor_up(
175 processor_t processor)
176{
177 processor_set_t pset;
178 spl_t s;
179
180 s = splsched();
181 init_ast_check(processor);
182
183#if defined(__arm64__)
184 /*
185 * A processor coming online won't have received a SIGPdebug signal
186 * to cause it to spin while a stackshot or panic is taking place,
187 * so spin here on mp_kdp_trap.
188 *
189 * However, since cpu_signal() is not yet enabled for this processor,
190 * there is a race if we have just passed this when a cpu_signal()
191 * is attempted. The sender will assume the cpu is offline, so it will
192 * not end up spinning anywhere. See processor_offline() for the fix
193 * for this race.
194 */
195 wait_while_mp_kdp_trap(false);
196#endif
197
198 pset = processor->processor_set;
199 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
200 pset_lock(pset);
201
202 ++pset->online_processor_count;
203 simple_lock(&processor->start_state_lock, LCK_GRP_NULL);
	pset_update_processor_state(pset, processor, PROCESSOR_RUNNING);
205 simple_unlock(&processor->start_state_lock);
206 bool temporary = processor->shutdown_temporary;
207 if (temporary) {
208 processor->shutdown_temporary = false;
209 } else {
210 processor_up_update_counts(processor);
211 }
212 if (processor->is_recommended) {
213 SCHED(pset_made_schedulable)(processor, pset, false);
214 }
215 pset_unlock(pset);
216 ml_cpu_up();
217 smr_cpu_up(processor, SMR_CPU_REASON_OFFLINE);
	sched_mark_processor_online_locked(processor, processor->last_startup_reason);
219 simple_unlock(&sched_available_cores_lock);
220 splx(s);
221
222 thread_wakeup((event_t)&processor->state);
223
224#if CONFIG_DTRACE
225 if (dtrace_cpu_state_changed_hook) {
226 (*dtrace_cpu_state_changed_hook)(processor->cpu_id, TRUE);
227 }
228#endif
229}
230#include <atm/atm_internal.h>
231
232kern_return_t
233host_reboot(
234 host_priv_t host_priv,
235 int options)
236{
237 if (host_priv == HOST_PRIV_NULL) {
238 return KERN_INVALID_HOST;
239 }
240
241#if DEVELOPMENT || DEBUG
242 if (options & HOST_REBOOT_DEBUGGER) {
243 Debugger("Debugger");
244 return KERN_SUCCESS;
245 }
246#endif
247
248 if (options & HOST_REBOOT_UPSDELAY) {
249 // UPS power cutoff path
		PEHaltRestart(kPEUPSDelayHaltCPU);
251 } else {
		halt_all_cpus(!(options & HOST_REBOOT_HALT));
253 }
254
255 return KERN_SUCCESS;
256}
257
258kern_return_t
259processor_assign(
260 __unused processor_t processor,
261 __unused processor_set_t new_pset,
262 __unused boolean_t wait)
263{
264 return KERN_FAILURE;
265}
266
267static void
268processor_down_update_counts(processor_t processor)
269{
	ml_cpu_down_update_counts(processor->cpu_id);
271
272 os_atomic_dec(&processor_avail_count, relaxed);
273 if (processor->is_recommended) {
274 os_atomic_dec(&processor_avail_count_user, relaxed);
275 }
276 if (processor->processor_primary == processor) {
277 os_atomic_dec(&primary_processor_avail_count, relaxed);
278 if (processor->is_recommended) {
279 os_atomic_dec(&primary_processor_avail_count_user, relaxed);
280 }
281 }
282 commpage_update_active_cpus();
283}
284
285extern lck_mtx_t processor_updown_lock;
286
287kern_return_t
288processor_shutdown(
289 processor_t processor,
290 processor_reason_t reason,
291 uint32_t flags)
292{
	if (!ml_cpu_can_exit(processor->cpu_id, reason)) {
294 /*
295 * Failure if disallowed by arch code.
296 */
297 return KERN_NOT_SUPPORTED;
298 }
299
	lck_mtx_lock(&processor_updown_lock);
301
302 spl_t s = splsched();
303 processor_set_t pset = processor->processor_set;
304
305 pset_lock(pset);
306
307 if (processor->state == PROCESSOR_START) {
308 pset_unlock(pset);
309 splx(s);
310
311 processor_wait_for_start(processor);
312
313 s = splsched();
314 pset_lock(pset);
315 }
316
317 /*
318 * If the processor is dispatching, let it finish.
319 */
320 while (processor->state == PROCESSOR_DISPATCHING) {
321 pset_unlock(pset);
322 splx(s);
		delay(1);
324 s = splsched();
325 pset_lock(pset);
326 }
327 pset_unlock(pset);
328 splx(s);
329
330 kern_return_t mark_ret = sched_mark_processor_offline(processor, reason);
331 if (mark_ret != KERN_SUCCESS) {
332 /* Must fail or we deadlock */
		lck_mtx_unlock(&processor_updown_lock);
334 return KERN_FAILURE;
335 }
336
	ml_cpu_begin_state_transition(processor->cpu_id);
338 s = splsched();
339
340 pset_lock(pset);
341 if (processor->state == PROCESSOR_OFF_LINE) {
342 /*
343 * Success if already shutdown.
344 */
345 if (processor->shutdown_temporary && !(flags & SHUTDOWN_TEMPORARY)) {
346 /* Convert a temporary shutdown into a permanent shutdown */
347 processor->shutdown_temporary = false;
348 processor_down_update_counts(processor);
349 }
350 pset_unlock(pset);
351 splx(s);
		ml_cpu_end_state_transition(processor->cpu_id);

		lck_mtx_unlock(&processor_updown_lock);
355 return KERN_SUCCESS;
356 }
357
358 if (processor->shutdown_locked && (reason != REASON_SYSTEM)) {
359 /*
360 * Failure if processor is locked against shutdown.
361 */
362 pset_unlock(pset);
363 splx(s);
364
		lck_mtx_unlock(&processor_updown_lock);
366 return KERN_FAILURE;
367 }
368
369 /*
370 * If the processor is dispatching, let it finish.
371 */
372 while (processor->state == PROCESSOR_DISPATCHING) {
373 pset_unlock(pset);
374 splx(s);
		delay(1);
376 s = splsched();
377 pset_lock(pset);
378 }
379
380 /*
381 * Success if already being shutdown with matching SHUTDOWN_TEMPORARY flag.
382 */
383 if ((processor->state == PROCESSOR_SHUTDOWN) || (processor->state == PROCESSOR_PENDING_OFFLINE)) {
384 bool success = (flags & SHUTDOWN_TEMPORARY) ? processor->shutdown_temporary : !processor->shutdown_temporary;
385
386 pset_unlock(pset);
387 splx(s);
		ml_cpu_end_state_transition(processor->cpu_id);

		lck_mtx_unlock(&processor_updown_lock);
391 return success ? KERN_SUCCESS : KERN_FAILURE;
392 }
393
	ml_broadcast_cpu_event(CPU_EXIT_REQUESTED, processor->cpu_id);
	pset_update_processor_state(pset, processor, PROCESSOR_SHUTDOWN);
396 processor->last_shutdown_reason = reason;
397 if (flags & SHUTDOWN_TEMPORARY) {
398 processor->shutdown_temporary = true;
399 }
400 pset_unlock(pset);
401
402 processor_doshutdown(processor);
403 splx(s);
404
	cpu_exit_wait(processor->cpu_id);
406
407 if (processor != master_processor) {
408 s = splsched();
409 pset_lock(pset);
		pset_update_processor_state(pset, processor, PROCESSOR_OFF_LINE);
411 pset_unlock(pset);
412 splx(s);
413 }
414
	ml_cpu_end_state_transition(processor->cpu_id);
	ml_broadcast_cpu_event(CPU_EXITED, processor->cpu_id);
	ml_cpu_power_disable(processor->cpu_id);
418
	lck_mtx_unlock(&processor_updown_lock);
420 return KERN_SUCCESS;
421}
422
423/*
424 * Called with interrupts disabled.
425 */
426static void
427processor_doshutdown(
428 processor_t processor)
429{
430 thread_t self = current_thread();
431
432 /*
433 * Get onto the processor to shutdown
434 */
435 processor_t prev = thread_bind(processor);
436 thread_block(THREAD_CONTINUE_NULL);
437
438 /* interrupts still disabled */
439 assert(ml_get_interrupts_enabled() == FALSE);
440
441 assert(processor == current_processor());
442 assert(processor->state == PROCESSOR_SHUTDOWN);
443
444#if CONFIG_DTRACE
445 if (dtrace_cpu_state_changed_hook) {
446 (*dtrace_cpu_state_changed_hook)(processor->cpu_id, FALSE);
447 }
448#endif
449
450#if defined(__arm64__)
451 /*
452 * Catch a processor going offline
453 * while a panic or stackshot is in progress, as it won't
454 * receive a SIGPdebug now that interrupts are disabled.
455 */
456 wait_while_mp_kdp_trap(false);
457#endif
458
459 smr_cpu_down(processor, SMR_CPU_REASON_OFFLINE);
460 ml_cpu_down();
461
462#if HIBERNATION
463 if (processor_avail_count < 2) {
464 hibernate_vm_lock();
465 hibernate_vm_unlock();
466 }
467#endif
468
469 processor_set_t pset = processor->processor_set;
470
471 pset_lock(pset);
	pset_update_processor_state(pset, processor, PROCESSOR_PENDING_OFFLINE);
473 --pset->online_processor_count;
474 if (!processor->shutdown_temporary) {
475 processor_down_update_counts(processor);
476 }
477 SCHED(processor_queue_shutdown)(processor);
478 /* pset lock dropped */
479 SCHED(rt_queue_shutdown)(processor);
480
	thread_bind(prev);
482
483 /* interrupts still disabled */
484
485 /*
486 * Continue processor shutdown on the processor's idle thread.
487 * The handoff won't fail because the idle thread has a reserved stack.
488 * Switching to the idle thread leaves interrupts disabled,
489 * so we can't accidentally take an interrupt after the context switch.
490 */
491 thread_t shutdown_thread = processor->idle_thread;
492 shutdown_thread->continuation = processor_offline;
493 shutdown_thread->parameter = processor;
494
	thread_run(self, NULL, NULL, shutdown_thread);
496}
497
498/*
499 * Called in the context of the idle thread to shut down the processor
500 *
501 * A shut-down processor looks like it's 'running' the idle thread parked
502 * in this routine, but it's actually been powered off and has no hardware state.
503 */
504static void
505processor_offline(
506 void * parameter,
507 __unused wait_result_t result)
508{
509 processor_t processor = (processor_t) parameter;
510 thread_t self = current_thread();
511 __assert_only thread_t old_thread = THREAD_NULL;
512
513 assert(processor == current_processor());
514 assert(self->state & TH_IDLE);
515 assert(processor->idle_thread == self);
516 assert(ml_get_interrupts_enabled() == FALSE);
517 assert(self->continuation == NULL);
518 assert(processor->processor_offlined == false);
519 assert(processor->running_timers_active == false);
520
521 bool enforce_quiesce_safety = gEnforcePlatformActionSafety;
522
523 /*
524 * Scheduling is now disabled for this processor.
525 * Ensure that primitives that need scheduling (like mutexes) know this.
526 */
527 if (enforce_quiesce_safety) {
528 disable_preemption_without_measurements();
529 }
530
531 /* convince slave_main to come back here */
532 processor->processor_offlined = true;
533
534 /*
535 * Switch to the interrupt stack and shut down the processor.
536 *
537 * When the processor comes back, it will eventually call load_context which
538 * restores the context saved by machine_processor_shutdown, returning here.
539 */
	old_thread = machine_processor_shutdown(self, processor_offline_intstack, processor);
541
542 /* old_thread should be NULL because we got here through Load_context */
543 assert(old_thread == THREAD_NULL);
544
545 assert(processor == current_processor());
546 assert(processor->idle_thread == current_thread());
547
548 assert(ml_get_interrupts_enabled() == FALSE);
549 assert(self->continuation == NULL);
550
551 /* Extract the machine_param value stashed by slave_main */
552 void * machine_param = self->parameter;
553 self->parameter = NULL;
554
555 /* Re-initialize the processor */
556 slave_machine_init(machine_param);
557
558 assert(processor->processor_offlined == true);
559 processor->processor_offlined = false;
560
561 if (enforce_quiesce_safety) {
562 enable_preemption();
563 }
564
565#if defined(__arm64__)
	/*
	 * See the comments about mp_kdp_trap in processor_up().
	 *
	 * SIGPdisabled is cleared (allowing cpu_signal() to succeed for this processor)
	 * the first time we take an IPI. slave_machine_init(), above, arranges for that
	 * via cpu_machine_init()->PE_cpu_machine_init()->PE_cpu_signal(), which sends a
	 * self-IPI that is delivered once interrupts are enabled. So enable interrupts
	 * briefly here so that cpu_signal() can succeed before we spin on mp_kdp_trap.
	 */
575 ml_set_interrupts_enabled(TRUE);
576
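	/*
	 * Interrupts were only enabled long enough to take the pending
	 * self-IPI; disable them again before spinning on mp_kdp_trap.
	 */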
577 ml_set_interrupts_enabled(FALSE);
578
579 wait_while_mp_kdp_trap(true);
580
581 /*
582 * At this point,
583 * if a stackshot or panic is in progress, we either spin on mp_kdp_trap
	 * successfully received a SIGPdebug signal which will cause us to
585 * break out of the spin on mp_kdp_trap and instead
586 * spin next time interrupts are enabled in idle_thread().
587 */
588#endif
589
590 /*
591 * Now that the processor is back, invoke the idle thread to find out what to do next.
592 * idle_thread will enable interrupts.
593 */
	thread_block(idle_thread);
595 /*NOTREACHED*/
596}
597
598/*
599 * Complete the shutdown and place the processor offline.
600 *
601 * Called at splsched in the shutdown context
602 * (i.e. on the idle thread, on the interrupt stack)
603 *
604 * The onlining half of this is done in load_context().
605 */
606static void
607processor_offline_intstack(
608 processor_t processor)
609{
610 assert(processor == current_processor());
611 assert(processor->active_thread == current_thread());
612
613 struct recount_snap snap = { 0 };
	recount_snapshot(&snap);
	recount_processor_idle(&processor->pr_recount, &snap);

	smr_cpu_leave(processor, processor->last_dispatch);
618
619 PMAP_DEACTIVATE_KERNEL(processor->cpu_id);
620
621 cpu_sleep();
622 panic("zombie processor");
623 /*NOTREACHED*/
624}
625
626kern_return_t
627host_get_boot_info(
628 host_priv_t host_priv,
629 kernel_boot_info_t boot_info)
630{
631 const char *src = "";
632 if (host_priv == HOST_PRIV_NULL) {
633 return KERN_INVALID_HOST;
634 }
635
	/*
	 * Copy the operator-supplied boot string (NUL-terminated), followed by
	 * any standardized strings generated from it.
	 */
	src = machine_boot_info(boot_info, KERNEL_BOOT_INFO_MAX);
641 if (src != boot_info) {
642 (void) strncpy(boot_info, src, KERNEL_BOOT_INFO_MAX);
643 }
644
645 return KERN_SUCCESS;
646}
647
648// These are configured through sysctls.
649#if DEVELOPMENT || DEBUG
650uint32_t phy_read_panic = 1;
651uint32_t phy_write_panic = 1;
652uint64_t simulate_stretched_io = 0;
653#else
654uint32_t phy_read_panic = 0;
655uint32_t phy_write_panic = 0;
656#endif
657
658#if !defined(__x86_64__)
659
660#if DEVELOPMENT || DEBUG
661static const uint64_t TIMEBASE_TICKS_PER_USEC = 24000000ULL / USEC_PER_SEC;
662static const uint64_t DEFAULT_TRACE_PHY_TIMEOUT = 100 * TIMEBASE_TICKS_PER_USEC;
663#else
664static const uint64_t DEFAULT_TRACE_PHY_TIMEOUT = 0;
665#endif
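
/*
 * For reference: with the 24 MHz timebase assumed above, TIMEBASE_TICKS_PER_USEC
 * is 24, so the development default of 100 * 24 = 2400 ticks corresponds to a
 * 100us trace threshold; on release kernels tracing defaults to off (0).
 */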
666
667// The MACHINE_TIMEOUT facility only exists on ARM.
668MACHINE_TIMEOUT_DEV_WRITEABLE(report_phy_read_delay_to, "report-phy-read-delay", 0, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
669MACHINE_TIMEOUT_DEV_WRITEABLE(report_phy_write_delay_to, "report-phy-write-delay", 0, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
670MACHINE_TIMEOUT_DEV_WRITEABLE(trace_phy_read_delay_to, "trace-phy-read-delay", DEFAULT_TRACE_PHY_TIMEOUT, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
671MACHINE_TIMEOUT_DEV_WRITEABLE(trace_phy_write_delay_to, "trace-phy-write-delay", DEFAULT_TRACE_PHY_TIMEOUT, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
672
673#if SCHED_HYGIENE_DEBUG
674/*
675 * Note: The interrupt-masked timeout goes through two initializations - one
676 * early in boot and one later. Thus this function is also called twice and
677 * can't be marked '__startup_func'.
678 */
679static void
680ml_io_init_timeouts(void)
681{
682 /*
683 * The timeouts may be completely disabled via an override.
684 */
685 if (kern_feature_override(KF_IO_TIMEOUT_OVRD)) {
686 os_atomic_store(&report_phy_write_delay_to, 0, relaxed);
687 os_atomic_store(&report_phy_read_delay_to, 0, relaxed);
688 return;
689 }
690
691 /*
692 * There may be no interrupt masked timeout set.
693 */
	const uint64_t interrupt_masked_to = os_atomic_load(&interrupt_masked_timeout, relaxed);
	if (interrupt_masked_to == 0) {
		return;
	}
698
699 /*
700 * Inherit from the interrupt masked timeout if smaller and the timeout
701 * hasn't been explicitly set via boot-arg.
702 */
703 uint64_t arg = 0;
704
705 if (!PE_parse_boot_argn("ml-timeout-report-phy-read-delay", &arg, sizeof(arg))) {
706 uint64_t report_phy_read_delay = os_atomic_load(&report_phy_read_delay_to, relaxed);
707 report_phy_read_delay = report_phy_read_delay == 0 ?
708 interrupt_masked_to :
709 MIN(report_phy_read_delay, interrupt_masked_to);
710 os_atomic_store(&report_phy_read_delay_to, report_phy_read_delay, relaxed);
711 }
712
713 if (!PE_parse_boot_argn("ml-timeout-report-phy-write-delay", &arg, sizeof(arg))) {
714 uint64_t report_phy_write_delay = os_atomic_load(&report_phy_write_delay_to, relaxed);
715 report_phy_write_delay = report_phy_write_delay == 0 ?
716 interrupt_masked_to :
717 MIN(report_phy_write_delay, interrupt_masked_to);
718 os_atomic_store(&report_phy_write_delay_to, report_phy_write_delay, relaxed);
719 }
720}
721
722/*
723 * It's important that this happens after machine timeouts have initialized so
724 * the correct timeouts can be inherited.
725 */
726STARTUP(TIMEOUTS, STARTUP_RANK_SECOND, ml_io_init_timeouts);
727#endif /* SCHED_HYGIENE_DEBUG */
728
729extern pmap_paddr_t kvtophys(vm_offset_t va);
730#endif
731
732#if ML_IO_TIMEOUTS_ENABLED
733
734static LCK_GRP_DECLARE(io_timeout_override_lock_grp, "io_timeout_override");
735static LCK_SPIN_DECLARE(io_timeout_override_lock, &io_timeout_override_lock_grp);
736
737struct io_timeout_override_entry {
738 RB_ENTRY(io_timeout_override_entry) tree;
739
740 uintptr_t iovaddr_base;
741 unsigned int size;
742 uint32_t read_timeout;
743 uint32_t write_timeout;
744};
745
746static inline int
747io_timeout_override_cmp(const struct io_timeout_override_entry *a, const struct io_timeout_override_entry *b)
748{
749 if (a->iovaddr_base < b->iovaddr_base) {
750 return -1;
751 } else if (a->iovaddr_base > b->iovaddr_base) {
752 return 1;
753 } else {
754 return 0;
755 }
756}
757
758static RB_HEAD(io_timeout_override, io_timeout_override_entry) io_timeout_override_root;
759RB_PROTOTYPE_PREV(io_timeout_override, io_timeout_override_entry, tree, io_timeout_override_cmp);
760RB_GENERATE_PREV(io_timeout_override, io_timeout_override_entry, tree, io_timeout_override_cmp);
761
762#endif /* ML_IO_TIMEOUTS_ENABLED */
763
764int
765ml_io_increase_timeouts(uintptr_t iovaddr_base, unsigned int size, uint32_t read_timeout_us, uint32_t write_timeout_us)
766{
767#if ML_IO_TIMEOUTS_ENABLED
768 const size_t MAX_SIZE = 4096;
769 const uint64_t MAX_TIMEOUT_ABS = UINT32_MAX;
770
771 assert(preemption_enabled());
772
773 int ret = KERN_SUCCESS;
774
775 if (size == 0) {
776 return KERN_INVALID_ARGUMENT;
777 }
778
779 uintptr_t iovaddr_end;
780 if (size > MAX_SIZE || os_add_overflow(iovaddr_base, size - 1, &iovaddr_end)) {
781 return KERN_INVALID_ARGUMENT;
782 }
783
784 uint64_t read_timeout_abs, write_timeout_abs;
	nanoseconds_to_absolutetime(NSEC_PER_USEC * read_timeout_us, &read_timeout_abs);
	nanoseconds_to_absolutetime(NSEC_PER_USEC * write_timeout_us, &write_timeout_abs);
787 if (read_timeout_abs > MAX_TIMEOUT_ABS || write_timeout_abs > MAX_TIMEOUT_ABS) {
788 return KERN_INVALID_ARGUMENT;
789 }
790
791 struct io_timeout_override_entry *node = kalloc_type(struct io_timeout_override_entry, Z_WAITOK | Z_ZERO | Z_NOFAIL);
792 node->iovaddr_base = iovaddr_base;
793 node->size = size;
794 node->read_timeout = (uint32_t)read_timeout_abs;
795 node->write_timeout = (uint32_t)write_timeout_abs;
796
797 /*
798 * Interrupt handlers are allowed to call ml_io_{read,write}*, so
799 * interrupts must be disabled any time io_timeout_override_lock is
800 * held. Otherwise the CPU could take an interrupt while holding the
801 * lock, invoke an ISR that calls ml_io_{read,write}*, and deadlock
802 * trying to acquire the lock again.
803 */
804 boolean_t istate = ml_set_interrupts_enabled(FALSE);
	lck_spin_lock(&io_timeout_override_lock);
806 if (RB_INSERT(io_timeout_override, &io_timeout_override_root, node)) {
807 ret = KERN_INVALID_ARGUMENT;
808 goto out;
809 }
810
811 /* Check that this didn't create any new overlaps */
812 struct io_timeout_override_entry *prev = RB_PREV(io_timeout_override, &io_timeout_override_root, node);
813 if (prev && (prev->iovaddr_base + prev->size) > node->iovaddr_base) {
814 RB_REMOVE(io_timeout_override, &io_timeout_override_root, node);
815 ret = KERN_INVALID_ARGUMENT;
816 goto out;
817 }
818 struct io_timeout_override_entry *next = RB_NEXT(io_timeout_override, &io_timeout_override_root, node);
819 if (next && (node->iovaddr_base + node->size) > next->iovaddr_base) {
820 RB_REMOVE(io_timeout_override, &io_timeout_override_root, node);
821 ret = KERN_INVALID_ARGUMENT;
822 goto out;
823 }
824
825out:
	lck_spin_unlock(&io_timeout_override_lock);
	ml_set_interrupts_enabled(istate);
828 if (ret != KERN_SUCCESS) {
829 kfree_type(struct io_timeout_override_entry, node);
830 }
831 return ret;
832#else /* !ML_IO_TIMEOUTS_ENABLED */
833#pragma unused(iovaddr_base, size, read_timeout_us, write_timeout_us)
834 return KERN_SUCCESS;
835#endif
836}
837
838int
839ml_io_reset_timeouts(uintptr_t iovaddr_base, unsigned int size)
840{
841#if ML_IO_TIMEOUTS_ENABLED
842 assert(preemption_enabled());
843
844 struct io_timeout_override_entry key = { .iovaddr_base = iovaddr_base };
845
846 boolean_t istate = ml_set_interrupts_enabled(FALSE);
	lck_spin_lock(&io_timeout_override_lock);
848 struct io_timeout_override_entry *node = RB_FIND(io_timeout_override, &io_timeout_override_root, &key);
849 if (node) {
850 if (node->size == size) {
851 RB_REMOVE(io_timeout_override, &io_timeout_override_root, node);
852 } else {
853 node = NULL;
854 }
855 }
	lck_spin_unlock(&io_timeout_override_lock);
	ml_set_interrupts_enabled(istate);
858
859 if (!node) {
860 return KERN_NOT_FOUND;
861 }
862
863 kfree_type(struct io_timeout_override_entry, node);
864#else /* !ML_IO_TIMEOUTS_ENABLED */
865#pragma unused(iovaddr_base, size)
866#endif
867 return KERN_SUCCESS;
868}
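
/*
 * Illustrative sketch (not part of the build): a driver that knows a device
 * window is slow to respond could raise the MMIO timeouts for that window and
 * restore them on teardown. 'regs_base' and 'regs_size' are hypothetical, and
 * both calls must be made with preemption enabled.
 *
 *	// During attach: allow up to 200us reads / 300us writes for this window.
 *	int kr = ml_io_increase_timeouts(regs_base, regs_size, 200, 300);
 *	assert(kr == KERN_SUCCESS);
 *	...
 *	// During detach: remove the override (base and size must match exactly).
 *	kr = ml_io_reset_timeouts(regs_base, regs_size);
 *	assert(kr == KERN_SUCCESS);
 */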
869
870#if ML_IO_TIMEOUTS_ENABLED
871
872static bool
873override_io_timeouts_va(uintptr_t vaddr, uint64_t *read_timeout, uint64_t *write_timeout)
874{
875 assert(!ml_get_interrupts_enabled());
876
	lck_spin_lock(&io_timeout_override_lock);
	struct io_timeout_override_entry *node = RB_ROOT(&io_timeout_override_root);

	/* RB_FIND() only does exact-key lookups, so open-code a range-containment search. */
	while (node) {
882 if (node->iovaddr_base <= vaddr && vaddr < node->iovaddr_base + node->size) {
883 if (read_timeout) {
884 *read_timeout = node->read_timeout;
885 }
886 if (write_timeout) {
887 *write_timeout = node->write_timeout;
888 }
			lck_spin_unlock(&io_timeout_override_lock);
890 return true;
891 } else if (vaddr < node->iovaddr_base) {
892 node = RB_LEFT(node, tree);
893 } else {
894 node = RB_RIGHT(node, tree);
895 }
896 }
	lck_spin_unlock(&io_timeout_override_lock);
898
899 return false;
900}
901
902static bool
903override_io_timeouts_pa(uint64_t paddr, uint64_t *read_timeout, uint64_t *write_timeout)
904{
905#if defined(__arm64__)
906 /*
907 * PCIe regions are marked with PMAP_IO_RANGE_STRONG_SYNC. Apply a
908 * timeout greater than the PCIe completion timeout (50ms). In some
909 * cases those timeouts can stack so make the timeout significantly
910 * higher.
911 */
912 #define STRONG_SYNC_TIMEOUT 1800000 /* 75ms */
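	/* 1,800,000 timebase ticks at the 24 MHz timebase typical of these SoCs is 75ms. */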
913
914 pmap_io_range_t *range = pmap_find_io_attr(paddr);
915 if (range != NULL && (range->wimg & PMAP_IO_RANGE_STRONG_SYNC) != 0) {
916 if (read_timeout) {
917 *read_timeout = STRONG_SYNC_TIMEOUT;
918 }
919 if (write_timeout) {
920 *write_timeout = STRONG_SYNC_TIMEOUT;
921 }
922
923 return true;
924 }
925#else
926 (void)paddr;
927 (void)read_timeout;
928 (void)write_timeout;
929#endif /* __arm64__ */
930 return false;
931}
932
933void
934override_io_timeouts(uintptr_t vaddr, uint64_t paddr, uint64_t *read_timeout, uint64_t *write_timeout)
935{
936 if (vaddr != 0 &&
937 override_io_timeouts_va(vaddr, read_timeout, write_timeout)) {
938 return;
939 }
940
941 if (paddr != 0 &&
942 override_io_timeouts_pa(paddr, read_timeout, write_timeout)) {
943 return;
944 }
945}
946#endif /* ML_IO_TIMEOUTS_ENABLED */
947
948unsigned long long
949ml_io_read(uintptr_t vaddr, int size)
950{
951 unsigned long long result = 0;
952 unsigned char s1;
953 unsigned short s2;
954
955#ifdef ML_IO_VERIFY_UNCACHEABLE
956 uintptr_t const paddr = pmap_verify_noncacheable(vaddr);
957#elif defined(ML_IO_TIMEOUTS_ENABLED)
	uintptr_t const paddr = kvtophys(vaddr);
959#endif
960
961#ifdef ML_IO_TIMEOUTS_ENABLED
962 uint64_t sabs, eabs;
963 boolean_t istate, timeread = FALSE;
964 uint64_t report_read_delay;
965#if __x86_64__
966 report_read_delay = report_phy_read_delay;
967#else
968 report_read_delay = os_atomic_load(&report_phy_read_delay_to, relaxed);
969 uint64_t const trace_phy_read_delay = os_atomic_load(&trace_phy_read_delay_to, relaxed);
970#endif /* __x86_64__ */
971
972 if (__improbable(report_read_delay != 0)) {
973 istate = ml_set_interrupts_enabled(FALSE);
974 sabs = ml_io_timestamp();
975 timeread = TRUE;
976 }
977
978#ifdef ML_IO_SIMULATE_STRETCHED_ENABLED
979 if (__improbable(timeread && simulate_stretched_io)) {
980 sabs -= simulate_stretched_io;
981 }
982#endif /* ML_IO_SIMULATE_STRETCHED_ENABLED */
983#endif /* ML_IO_TIMEOUTS_ENABLED */
984
985#if DEVELOPMENT || DEBUG
986 boolean_t use_fences = !kern_feature_override(KF_IO_TIMEOUT_OVRD);
987 if (use_fences) {
988 ml_timebase_to_memory_fence();
989 }
990#endif
991
992 switch (size) {
993 case 1:
994 s1 = *(volatile unsigned char *)vaddr;
995 result = s1;
996 break;
997 case 2:
998 s2 = *(volatile unsigned short *)vaddr;
999 result = s2;
1000 break;
1001 case 4:
1002 result = *(volatile unsigned int *)vaddr;
1003 break;
1004 case 8:
1005 result = *(volatile unsigned long long *)vaddr;
1006 break;
1007 default:
1008 panic("Invalid size %d for ml_io_read(%p)", size, (void *)vaddr);
1009 break;
1010 }
1011
1012#if DEVELOPMENT || DEBUG
1013 if (use_fences) {
1014 ml_memory_to_timebase_fence();
1015 }
1016#endif
1017
1018#ifdef ML_IO_TIMEOUTS_ENABLED
1019 if (__improbable(timeread == TRUE)) {
1020 eabs = ml_io_timestamp();
1021
1022 /* Prevent the processor from calling iotrace during its
1023 * initialization procedure. */
1024 if (current_processor()->state == PROCESSOR_RUNNING) {
1025 iotrace(IOTRACE_IO_READ, vaddr, paddr, size, result, sabs, eabs - sabs);
1026 }
1027
1028 if (__improbable((eabs - sabs) > report_read_delay)) {
1029 DTRACE_PHYSLAT5(physioread, uint64_t, (eabs - sabs),
1030 uint64_t, vaddr, uint32_t, size, uint64_t, paddr, uint64_t, result);
1031
1032 uint64_t override = 0;
			override_io_timeouts(vaddr, paddr, &override, NULL);
1034
1035 if (override != 0) {
1036#if SCHED_HYGIENE_DEBUG
1037 /*
1038 * The IO timeout was overridden. As interrupts are disabled in
1039 * order to accurately measure IO time this can cause the
1040 * interrupt masked timeout threshold to be exceeded. If the
1041 * interrupt masked debug mode is set to panic, abandon the
1042 * measurement. If in trace mode leave it as-is for
1043 * observability.
1044 */
1045 if (interrupt_masked_debug_mode == SCHED_HYGIENE_MODE_PANIC) {
1046 ml_spin_debug_clear(current_thread());
1047 ml_irq_debug_abandon();
1048 }
1049#endif
1050 report_read_delay = override;
1051 }
1052 }
1053
1054 if (__improbable((eabs - sabs) > report_read_delay)) {
1055 if (phy_read_panic && (machine_timeout_suspended() == FALSE)) {
1056#if defined(__x86_64__)
1057 panic_notify();
1058#endif /* defined(__x86_64__) */
1059 uint64_t nsec = 0;
				absolutetime_to_nanoseconds(eabs - sabs, &nsec);
1061 panic("Read from IO vaddr 0x%lx paddr 0x%lx took %llu ns, "
1062 "result: 0x%llx (start: %llu, end: %llu), ceiling: %llu",
1063 vaddr, paddr, nsec, result, sabs, eabs,
1064 report_read_delay);
1065 }
1066 }
1067
1068 if (__improbable(trace_phy_read_delay > 0 && (eabs - sabs) > trace_phy_read_delay)) {
1069 KDBG(MACHDBG_CODE(DBG_MACH_IO, DBC_MACH_IO_MMIO_READ),
1070 (eabs - sabs), VM_KERNEL_UNSLIDE_OR_PERM(vaddr), paddr, result);
1071 }
1072
		(void)ml_set_interrupts_enabled(istate);
1074 }
1075#endif /* ML_IO_TIMEOUTS_ENABLED */
1076 return result;
1077}
1078
1079unsigned int
1080ml_io_read8(uintptr_t vaddr)
1081{
	return (unsigned) ml_io_read(vaddr, 1);
1083}
1084
1085unsigned int
1086ml_io_read16(uintptr_t vaddr)
1087{
	return (unsigned) ml_io_read(vaddr, 2);
1089}
1090
1091unsigned int
1092ml_io_read32(uintptr_t vaddr)
1093{
	return (unsigned) ml_io_read(vaddr, 4);
1095}
1096
1097unsigned long long
1098ml_io_read64(uintptr_t vaddr)
1099{
	return ml_io_read(vaddr, 8);
1101}
1102
1103/* ml_io_write* */
1104
1105void
1106ml_io_write(uintptr_t vaddr, uint64_t val, int size)
1107{
1108#ifdef ML_IO_VERIFY_UNCACHEABLE
1109 uintptr_t const paddr = pmap_verify_noncacheable(vaddr);
1110#elif defined(ML_IO_TIMEOUTS_ENABLED)
	uintptr_t const paddr = kvtophys(vaddr);
1112#endif
1113
1114#ifdef ML_IO_TIMEOUTS_ENABLED
1115 uint64_t sabs, eabs;
1116 boolean_t istate, timewrite = FALSE;
1117 uint64_t report_write_delay;
1118#if __x86_64__
1119 report_write_delay = report_phy_write_delay;
1120#else
1121 report_write_delay = os_atomic_load(&report_phy_write_delay_to, relaxed);
1122 uint64_t trace_phy_write_delay = os_atomic_load(&trace_phy_write_delay_to, relaxed);
#endif /* __x86_64__ */
1124 if (__improbable(report_write_delay != 0)) {
1125 istate = ml_set_interrupts_enabled(FALSE);
1126 sabs = ml_io_timestamp();
1127 timewrite = TRUE;
1128 }
1129
1130#ifdef ML_IO_SIMULATE_STRETCHED_ENABLED
1131 if (__improbable(timewrite && simulate_stretched_io)) {
1132 sabs -= simulate_stretched_io;
1133 }
#endif /* ML_IO_SIMULATE_STRETCHED_ENABLED */
1135#endif /* ML_IO_TIMEOUTS_ENABLED */
1136
1137#if DEVELOPMENT || DEBUG
1138 boolean_t use_fences = !kern_feature_override(KF_IO_TIMEOUT_OVRD);
1139 if (use_fences) {
1140 ml_timebase_to_memory_fence();
1141 }
1142#endif
1143
1144 switch (size) {
1145 case 1:
1146 *(volatile uint8_t *)vaddr = (uint8_t)val;
1147 break;
1148 case 2:
1149 *(volatile uint16_t *)vaddr = (uint16_t)val;
1150 break;
1151 case 4:
1152 *(volatile uint32_t *)vaddr = (uint32_t)val;
1153 break;
1154 case 8:
1155 *(volatile uint64_t *)vaddr = (uint64_t)val;
1156 break;
1157 default:
1158 panic("Invalid size %d for ml_io_write(%p, 0x%llx)", size, (void *)vaddr, val);
1159 break;
1160 }
1161
1162#if DEVELOPMENT || DEBUG
1163 if (use_fences) {
1164 ml_memory_to_timebase_fence();
1165 }
1166#endif
1167
1168#ifdef ML_IO_TIMEOUTS_ENABLED
1169 if (__improbable(timewrite == TRUE)) {
1170 eabs = ml_io_timestamp();
1171
1172
1173 /* Prevent the processor from calling iotrace during its
1174 * initialization procedure. */
1175 if (current_processor()->state == PROCESSOR_RUNNING) {
1176 iotrace(IOTRACE_IO_WRITE, vaddr, paddr, size, val, sabs, eabs - sabs);
1177 }
1178
1179
1180 if (__improbable((eabs - sabs) > report_write_delay)) {
1181 DTRACE_PHYSLAT5(physiowrite, uint64_t, (eabs - sabs),
1182 uint64_t, vaddr, uint32_t, size, uint64_t, paddr, uint64_t, val);
1183
1184 uint64_t override = 0;
			override_io_timeouts(vaddr, paddr, NULL, &override);
1186
1187 if (override != 0) {
1188#if SCHED_HYGIENE_DEBUG
1189 /*
1190 * The IO timeout was overridden. As interrupts are disabled in
1191 * order to accurately measure IO time this can cause the
1192 * interrupt masked timeout threshold to be exceeded. If the
1193 * interrupt masked debug mode is set to panic, abandon the
1194 * measurement. If in trace mode leave it as-is for
1195 * observability.
1196 */
1197 if (interrupt_masked_debug_mode == SCHED_HYGIENE_MODE_PANIC) {
1198 ml_spin_debug_clear(current_thread());
1199 ml_irq_debug_abandon();
1200 }
1201#endif
1202 report_write_delay = override;
1203 }
1204 }
1205
1206 if (__improbable((eabs - sabs) > report_write_delay)) {
1207 if (phy_write_panic && (machine_timeout_suspended() == FALSE)) {
1208#if defined(__x86_64__)
1209 panic_notify();
1210#endif /* defined(__x86_64__) */
1211
1212 uint64_t nsec = 0;
				absolutetime_to_nanoseconds(eabs - sabs, &nsec);
1214 panic("Write to IO vaddr %p paddr %p val 0x%llx took %llu ns,"
1215 " (start: %llu, end: %llu), ceiling: %llu",
1216 (void *)vaddr, (void *)paddr, val, nsec, sabs, eabs,
1217 report_write_delay);
1218 }
1219 }
1220
1221 if (__improbable(trace_phy_write_delay > 0 && (eabs - sabs) > trace_phy_write_delay)) {
1222 KDBG(MACHDBG_CODE(DBG_MACH_IO, DBC_MACH_IO_MMIO_WRITE),
1223 (eabs - sabs), VM_KERNEL_UNSLIDE_OR_PERM(vaddr), paddr, val);
1224 }
1225
		(void)ml_set_interrupts_enabled(istate);
1227 }
1228#endif /* ML_IO_TIMEOUTS_ENABLED */
1229}
1230
1231void
1232ml_io_write8(uintptr_t vaddr, uint8_t val)
1233{
	ml_io_write(vaddr, val, 1);
1235}
1236
1237void
1238ml_io_write16(uintptr_t vaddr, uint16_t val)
1239{
	ml_io_write(vaddr, val, 2);
1241}
1242
1243void
1244ml_io_write32(uintptr_t vaddr, uint32_t val)
1245{
	ml_io_write(vaddr, val, 4);
1247}
1248
1249void
1250ml_io_write64(uintptr_t vaddr, uint64_t val)
1251{
	ml_io_write(vaddr, val, 8);
1253}
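
/*
 * Illustrative sketch (not part of the build): the typed wrappers above are the
 * usual way to touch device registers so that the latency reporting and iotrace
 * hooks in ml_io_read()/ml_io_write() apply. 'regs_base' and the register
 * offsets are hypothetical.
 *
 *	uint32_t status = ml_io_read32(regs_base + 0x04);
 *	if (status & 0x1) {
 *		ml_io_write32(regs_base + 0x08, 0x1);	// acknowledge, hypothetical layout
 *	}
 */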
1254
1255struct cpu_callback_chain_elem {
1256 cpu_callback_t fn;
1257 void *param;
1258 struct cpu_callback_chain_elem *next;
1259};
1260
1261static struct cpu_callback_chain_elem *cpu_callback_chain;
1262static LCK_GRP_DECLARE(cpu_callback_chain_lock_grp, "cpu_callback_chain");
1263static LCK_SPIN_DECLARE(cpu_callback_chain_lock, &cpu_callback_chain_lock_grp);
1264
1265void
1266cpu_event_register_callback(cpu_callback_t fn, void *param)
1267{
1268 struct cpu_callback_chain_elem *new_elem;
1269
1270 new_elem = zalloc_permanent_type(struct cpu_callback_chain_elem);
1271 if (!new_elem) {
1272 panic("can't allocate cpu_callback_chain_elem");
1273 }
1274
	lck_spin_lock(&cpu_callback_chain_lock);
1276 new_elem->next = cpu_callback_chain;
1277 new_elem->fn = fn;
1278 new_elem->param = param;
1279 os_atomic_store(&cpu_callback_chain, new_elem, release);
	lck_spin_unlock(&cpu_callback_chain_lock);
1281}
1282
1283__attribute__((noreturn))
1284void
1285cpu_event_unregister_callback(__unused cpu_callback_t fn)
1286{
1287 panic("Unfortunately, cpu_event_unregister_callback is unimplemented.");
1288}
1289
1290void
1291ml_broadcast_cpu_event(enum cpu_event event, unsigned int cpu_or_cluster)
1292{
1293 struct cpu_callback_chain_elem *cursor;
1294
1295 cursor = os_atomic_load(&cpu_callback_chain, dependency);
1296 for (; cursor != NULL; cursor = cursor->next) {
1297 cursor->fn(cursor->param, event, cpu_or_cluster);
1298 }
1299}
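
/*
 * Illustrative sketch (not part of the build): a subsystem that must react to
 * cores coming and going could register a callback once at init time; the
 * names below are hypothetical.
 *
 *	static void
 *	my_cpu_event_cb(void *param __unused, enum cpu_event event, unsigned int cpu_or_cluster)
 *	{
 *		if (event == CPU_EXITED) {
 *			// e.g. tear down per-CPU state for 'cpu_or_cluster'
 *		}
 *	}
 *	...
 *	cpu_event_register_callback(my_cpu_event_cb, NULL);
 *
 * Callbacks cannot currently be unregistered (cpu_event_unregister_callback
 * panics), so only register long-lived handlers.
 */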
1300
1301// Initialize Machine Timeouts (see the MACHINE_TIMEOUT macro
1302// definition)
1303
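/*
 * Worked example (hypothetical timeout named "foo", suffix "-b"): the base
 * value is taken from, in increasing order of precedence,
 *
 *	/machine-timeouts/foo-b            (device tree)
 *	/chosen/machine-timeouts/foo-b     (device tree, chosen)
 *	ml-timeout-foo-b                   (boot-arg)
 *
 * and is then multiplied by any "foo-b-scale" / "ml-timeout-foo-b-scale" and
 * global-scale factors found the same way.
 */
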
1304void
1305machine_timeout_init_with_suffix(const struct machine_timeout_spec *spec, char const *suffix)
1306{
1307 if (spec->skip_predicate != NULL && spec->skip_predicate(spec)) {
1308 // This timeout should be disabled.
1309 os_atomic_store_wide((uint64_t*)spec->ptr, 0, relaxed);
1310 return;
1311 }
1312
1313 assert(suffix != NULL);
1314 assert(strlen(spec->name) <= MACHINE_TIMEOUT_MAX_NAME_LEN);
1315
	size_t const suffix_len = strlen(suffix);

	size_t const dt_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + suffix_len + 1;
	char dt_name[dt_name_size];

	strlcpy(dt_name, spec->name, dt_name_size);
	strlcat(dt_name, suffix, dt_name_size);

	size_t const scale_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + suffix_len + strlen("-scale") + 1;
	char scale_name[scale_name_size];

	strlcpy(scale_name, spec->name, scale_name_size);
	strlcat(scale_name, suffix, scale_name_size);
	strlcat(scale_name, "-scale", scale_name_size);

	size_t const boot_arg_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + strlen("ml-timeout-") + suffix_len + 1;
	char boot_arg_name[boot_arg_name_size];

	strlcpy(boot_arg_name, "ml-timeout-", boot_arg_name_size);
	strlcat(boot_arg_name, spec->name, boot_arg_name_size);
	strlcat(boot_arg_name, suffix, boot_arg_name_size);

	size_t const boot_arg_scale_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN +
	    strlen("ml-timeout-") + strlen("-scale") + suffix_len + 1;
	char boot_arg_scale_name[boot_arg_scale_name_size];

	strlcpy(boot_arg_scale_name, "ml-timeout-", boot_arg_scale_name_size);
	strlcat(boot_arg_scale_name, spec->name, boot_arg_scale_name_size);
	strlcat(boot_arg_scale_name, suffix, boot_arg_scale_name_size);
	strlcat(boot_arg_scale_name, "-scale", boot_arg_scale_name_size);
1346
1347
1348 /*
1349 * Determine base value from DT and boot-args.
1350 */
1351
1352 DTEntry base, chosen;
1353
	if (SecureDTLookupEntry(NULL, "/machine-timeouts", &base) != kSuccess) {
		base = NULL;
	}

	if (SecureDTLookupEntry(NULL, "/chosen/machine-timeouts", &chosen) != kSuccess) {
		chosen = NULL;
	}
1361
1362 uint64_t timeout = spec->default_value;
1363 bool found = false;
1364
1365 uint64_t const *data = NULL;
1366 unsigned int data_size = sizeof(*data);
1367
1368 /* First look in /machine-timeouts/<name> */
	if (base != NULL && SecureDTGetProperty(base, dt_name, (const void **)&data, &data_size) == kSuccess) {
1370 if (data_size != sizeof(*data)) {
1371 panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/%s", __func__, data_size, dt_name);
1372 }
1373
1374 timeout = *data;
1375 found = true;
1376 }
1377
1378 /* A value in /chosen/machine-timeouts/<name> overrides */
	if (chosen != NULL && SecureDTGetProperty(chosen, dt_name, (const void **)&data, &data_size) == kSuccess) {
1380 if (data_size != sizeof(*data)) {
1381 panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/%s", __func__, data_size, dt_name);
1382 }
1383
1384 timeout = *data;
1385 found = true;
1386 }
1387
1388 /* A boot-arg ml-timeout-<name> overrides */
1389 uint64_t boot_arg = 0;
1390
	if (PE_parse_boot_argn(boot_arg_name, &boot_arg, sizeof(boot_arg))) {
1392 timeout = boot_arg;
1393 found = true;
1394 }
1395
1396
1397 /*
1398 * Determine scale value from DT and boot-args.
1399 */
1400
1401 uint64_t scale = 1;
	uint32_t const *scale_data;
	unsigned int scale_size = sizeof(*scale_data);
1404
1405 /* If there is a scale factor /machine-timeouts/<name>-scale, apply it. */
	if (base != NULL && SecureDTGetProperty(base, scale_name, (const void **)&scale_data, &scale_size) == kSuccess) {
1407 if (scale_size != sizeof(*scale_data)) {
1408 panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/%s-scale", __func__, scale_size, dt_name);
1409 }
1410
1411 scale = *scale_data;
1412 }
1413
1414 /* If there is a scale factor /chosen/machine-timeouts/<name>-scale, use that. */
	if (chosen != NULL && SecureDTGetProperty(chosen, scale_name, (const void **)&scale_data, &scale_size) == kSuccess) {
1416 if (scale_size != sizeof(*scale_data)) {
1417 panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/%s-scale", __func__,
1418 scale_size, dt_name);
1419 }
1420
1421 scale = *scale_data;
1422 }
1423
1424 /* Finally, a boot-arg ml-timeout-<name>-scale takes precedence. */
	if (PE_parse_boot_argn(boot_arg_scale_name, &boot_arg, sizeof(boot_arg))) {
1426 scale = boot_arg;
1427 }
1428
1429 static bool global_scale_set;
1430 static uint64_t global_scale;
1431
1432 if (!global_scale_set) {
1433 /* Apply /machine-timeouts/global-scale if present */
		if (SecureDTGetProperty(base, "global-scale", (const void **)&scale_data, &scale_size) == kSuccess) {
1435 if (scale_size != sizeof(*scale_data)) {
1436 panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/global-scale", __func__,
1437 scale_size);
1438 }
1439
1440 global_scale = *scale_data;
1441 global_scale_set = true;
1442 }
1443
1444 /* Use /chosen/machine-timeouts/global-scale if present */
		if (SecureDTGetProperty(chosen, "global-scale", (const void **)&scale_data, &scale_size) == kSuccess) {
1446 if (scale_size != sizeof(*scale_data)) {
1447 panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/global-scale", __func__,
1448 scale_size);
1449 }
1450
1451 global_scale = *scale_data;
1452 global_scale_set = true;
1453 }
1454
1455 /* Finally, the boot-arg ml-timeout-global-scale takes precedence. */
		if (PE_parse_boot_argn("ml-timeout-global-scale", &boot_arg, sizeof(boot_arg))) {
1457 global_scale = boot_arg;
1458 global_scale_set = true;
1459 }
1460 }
1461
1462 if (global_scale_set) {
1463 scale *= global_scale;
1464 }
1465
1466 /* Compute the final timeout, and done. */
1467 if (found && timeout > 0) {
1468 /* Only apply inherent unit scale if the value came in
1469 * externally. */
1470
1471 if (spec->unit_scale == MACHINE_TIMEOUT_UNIT_TIMEBASE) {
1472 uint64_t nanoseconds = timeout / 1000;
			nanoseconds_to_absolutetime(nanoseconds, &timeout);
1474 } else {
1475 timeout /= spec->unit_scale;
1476 }
1477
1478 if (timeout == 0) {
1479 /* Ensure unit scaling did not disable the timeout. */
1480 timeout = 1;
1481 }
1482 }
1483
1484 if (os_mul_overflow(timeout, scale, &timeout)) {
1485 timeout = UINT64_MAX; // clamp
1486 }
1487
1488 os_atomic_store_wide((uint64_t*)spec->ptr, timeout, relaxed);
1489}
1490
1491void
1492machine_timeout_init(const struct machine_timeout_spec *spec)
1493{
	machine_timeout_init_with_suffix(spec, "");
1495}
1496
1497#if DEVELOPMENT || DEBUG
1498/*
1499 * Late timeout (re-)initialization, at the end of bsd_init()
1500 */
1501void
1502machine_timeout_bsd_init(void)
1503{
1504 char const * const __unused mt_suffix = "-b";
1505#if SCHED_HYGIENE_DEBUG
1506 machine_timeout_init_with_suffix(MACHINE_TIMEOUT_SPEC_REF(interrupt_masked_timeout), mt_suffix);
1507 machine_timeout_init_with_suffix(MACHINE_TIMEOUT_SPEC_REF(sched_preemption_disable_threshold_mt), mt_suffix);
1508
1509 /*
1510 * The io timeouts can inherit from interrupt_masked_timeout.
1511 * Re-initialize, as interrupt_masked_timeout may have changed.
1512 */
1513 ml_io_init_timeouts();
1514
1515 extern void preemption_disable_reset_max_durations(void);
1516 /*
1517 * Reset the preemption disable stats, so that they are not
1518 * polluted by long early boot code.
1519 */
1520 preemption_disable_reset_max_durations();
1521#endif /* SCHED_HYGIENE_DEBUG */
1522}
1523#endif /* DEVELOPMENT || DEBUG */
1524
1525#if ML_IO_TIMEOUTS_ENABLED && CONFIG_XNUPOST
1526#include <tests/xnupost.h>
1527
1528extern kern_return_t ml_io_timeout_test(void);
1529
1530static inline void
1531ml_io_timeout_test_get_timeouts(uintptr_t vaddr, uint64_t *read_timeout, uint64_t *write_timeout)
1532{
1533 *read_timeout = 0;
1534 *write_timeout = 0;
1535
1536 boolean_t istate = ml_set_interrupts_enabled(FALSE);
1537 override_io_timeouts(vaddr, 0, read_timeout, write_timeout);
1538 ml_set_interrupts_enabled(istate);
1539}
1540
1541kern_return_t
1542ml_io_timeout_test(void)
1543{
1544 const size_t SIZE = 16;
1545 uintptr_t iovaddr_base1 = (uintptr_t)&ml_io_timeout_test;
1546 uintptr_t iovaddr_base2 = iovaddr_base1 + SIZE;
1547 uintptr_t vaddr1 = iovaddr_base1 + SIZE / 2;
1548 uintptr_t vaddr2 = iovaddr_base2 + SIZE / 2;
1549
1550 const uint64_t READ_TIMEOUT1_US = 50000, WRITE_TIMEOUT1_US = 50001;
1551 const uint64_t READ_TIMEOUT2_US = 50002, WRITE_TIMEOUT2_US = 50003;
1552 uint64_t read_timeout1_abs, write_timeout1_abs;
1553 uint64_t read_timeout2_abs, write_timeout2_abs;
1554 nanoseconds_to_absolutetime(NSEC_PER_USEC * READ_TIMEOUT1_US, &read_timeout1_abs);
1555 nanoseconds_to_absolutetime(NSEC_PER_USEC * WRITE_TIMEOUT1_US, &write_timeout1_abs);
1556 nanoseconds_to_absolutetime(NSEC_PER_USEC * READ_TIMEOUT2_US, &read_timeout2_abs);
1557 nanoseconds_to_absolutetime(NSEC_PER_USEC * WRITE_TIMEOUT2_US, &write_timeout2_abs);
1558
1559 int err = ml_io_increase_timeouts(iovaddr_base1, 0, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1560 T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for empty region");
1561
1562 err = ml_io_increase_timeouts(iovaddr_base1, 4097, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1563 T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for region > 4096 bytes");
1564
1565 err = ml_io_increase_timeouts(UINTPTR_MAX, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1566 T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for overflowed region");
1567
1568 err = ml_io_increase_timeouts(iovaddr_base1, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1569 T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Setting timeout for first VA region should succeed");
1570
1571 err = ml_io_increase_timeouts(iovaddr_base2, SIZE, READ_TIMEOUT2_US, WRITE_TIMEOUT2_US);
1572 T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Setting timeout for second VA region should succeed");
1573
1574 err = ml_io_increase_timeouts(iovaddr_base1, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1575 T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for same region twice");
1576
1577 err = ml_io_increase_timeouts(vaddr1, (uint32_t)(vaddr2 - vaddr1), READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1578 T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for overlapping regions");
1579
1580 uint64_t read_timeout, write_timeout;
1581 ml_io_timeout_test_get_timeouts(vaddr1, &read_timeout, &write_timeout);
1582 T_EXPECT_EQ_ULLONG(read_timeout, read_timeout1_abs, "Read timeout for first region");
1583 T_EXPECT_EQ_ULLONG(write_timeout, write_timeout1_abs, "Write timeout for first region");
1584
1585 ml_io_timeout_test_get_timeouts(vaddr2, &read_timeout, &write_timeout);
	T_EXPECT_EQ_ULLONG(read_timeout, read_timeout2_abs, "Read timeout for second region");
	T_EXPECT_EQ_ULLONG(write_timeout, write_timeout2_abs, "Write timeout for second region");
1588
1589 ml_io_timeout_test_get_timeouts(iovaddr_base2 + SIZE, &read_timeout, &write_timeout);
1590 T_EXPECT_EQ_ULLONG(read_timeout, 0, "Read timeout without override");
1591 T_EXPECT_EQ_ULLONG(write_timeout, 0, "Write timeout without override");
1592
1593 err = ml_io_reset_timeouts(iovaddr_base1 + 1, SIZE - 1);
1594 T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for subregion");
1595
1596 err = ml_io_reset_timeouts(iovaddr_base2 + SIZE, SIZE);
1597 T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for non-existent region");
1598
1599 err = ml_io_reset_timeouts(iovaddr_base1, SIZE);
1600 T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Resetting timeout for first VA region should succeed");
1601
1602 ml_io_timeout_test_get_timeouts(vaddr1, &read_timeout, &write_timeout);
1603 T_EXPECT_EQ_ULLONG(read_timeout, 0, "Read timeout for reset region");
1604 T_EXPECT_EQ_ULLONG(write_timeout, 0, "Write timeout for reset region");
1605
1606 err = ml_io_reset_timeouts(iovaddr_base1, SIZE);
1607 T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for same region twice");
1608
1609 err = ml_io_reset_timeouts(iovaddr_base2, SIZE);
1610 T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Resetting timeout for second VA region should succeed");
1611
1612 return KERN_SUCCESS;
1613}
#endif /* ML_IO_TIMEOUTS_ENABLED && CONFIG_XNUPOST */
1615