machine.c source code [xnu/osfmk/kern/machine.c]

1	/*
2	* Copyright (c) 2000-2019 Apple Inc. All rights reserved.
3	*
4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. The rights granted to you under the License
10	* may not be used to create, or enable the creation or redistribution of,
11	* unlawful or unlicensed copies of an Apple operating system, or to
12	* circumvent, violate, or enable the circumvention or violation of, any
13	* terms of an Apple operating system software license agreement.
14	*
15	* Please obtain a copy of the License at
16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
17	*
18	* The Original Code and all software distributed under the License are
19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23	* Please see the License for the specific language governing rights and
24	* limitations under the License.
25	*
26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27	*/
28	/*
29	* @OSF_COPYRIGHT@
30	*/
31	/*
32	* Mach Operating System
33	* Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34	* All Rights Reserved.
35	*
36	* Permission to use, copy, modify and distribute this software and its
37	* documentation is hereby granted, provided that both the copyright
38	* notice and this permission notice appear in all copies of the
39	* software, derivative works or modified versions, and any portions
40	* thereof, and that both notices appear in supporting documentation.
41	*
42	* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43	* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44	* ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45	*
46	* Carnegie Mellon requests users of this software to return to
47	*
48	* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49	* School of Computer Science
50	* Carnegie Mellon University
51	* Pittsburgh PA 15213-3890
52	*
53	* any improvements or extensions that they make and grant Carnegie Mellon
54	* the rights to redistribute these changes.
55	*/
56	/*
57	*/
58	/*
59	* File: kern/machine.c
60	* Author: Avadis Tevanian, Jr.
61	* Date: 1987
62	*
63	* Support for machine independent machine abstraction.
64	*/
65
66	#include <string.h>
67
68	#include <mach/mach_types.h>
69	#include <mach/boolean.h>
70	#include <mach/kern_return.h>
71	#include <mach/machine.h>
72	#include <mach/host_info.h>
73	#include <mach/host_reboot.h>
74	#include <mach/host_priv_server.h>
75	#include <mach/processor_server.h>
76	#include <mach/sdt.h>
77
78	#include <kern/kern_types.h>
79	#include <kern/cpu_data.h>
80	#include <kern/ipc_host.h>
81	#include <kern/host.h>
82	#include <kern/machine.h>
83	#include <kern/misc_protos.h>
84	#include <kern/percpu.h>
85	#include <kern/processor.h>
86	#include <kern/queue.h>
87	#include <kern/sched.h>
88	#include <kern/startup.h>
89	#include <kern/task.h>
90	#include <kern/thread.h>
91	#include <kern/iotrace.h>
92
93	#include <libkern/OSDebug.h>
94	#if ML_IO_TIMEOUTS_ENABLED
95	#include <libkern/tree.h>
96	#endif
97
98	#include <pexpert/device_tree.h>
99
100	#include <machine/commpage.h>
101	#include <machine/machine_routines.h>
102
103	#if HIBERNATION
104	#include <IOKit/IOHibernatePrivate.h>
105	#endif
106	#include <IOKit/IOPlatformExpert.h>
107
#if CONFIG_DTRACE
/*
 * Optional DTrace hook. Code in this file tests the pointer for NULL and
 * then calls it as (*hook)(cpu_id, TRUE) when a processor comes up and
 * (*hook)(cpu_id, FALSE) when one is being shut down.
 */
extern void (*dtrace_cpu_state_changed_hook)(int, boolean_t);
#endif
111
112	#if defined(__arm64__)
113	extern void wait_while_mp_kdp_trap(bool check_SIGPdebug);
114	#if CONFIG_SPTM
115	#include <arm64/sptm/pmap/pmap_data.h>
116	#else
117	#include <arm/pmap/pmap_data.h>
118	#endif /* CONFIG_SPTM */
119	#endif /* defined(__arm64__) */
120
121	#if defined(__x86_64__)
122	#include <i386/panic_notify.h>
123	#endif
124
125	#if ML_IO_TIMEOUTS_ENABLED
126	#if defined(__x86_64__)
127	#define ml_io_timestamp mach_absolute_time
128	#else
129	#define ml_io_timestamp ml_get_timebase
130	#endif /* __x86_64__ */
131	#endif /* ML_IO_TIMEOUTS_ENABLED */
132
133	/*
134	* Exported variables:
135	*/
136
137	struct machine_info machine_info;
138
139	/ Forwards /
140	static void
141	processor_doshutdown(processor_t processor);
142
143	static void
144	processor_offline(void * parameter, __unused wait_result_t result);
145
146	static void
147	processor_offline_intstack(processor_t processor) __dead2;
148
149	static void
150	processor_up_update_counts(processor_t processor)
151	{
152	ml_cpu_up_update_counts(cpu_id: processor->cpu_id);
153
154	os_atomic_inc(&processor_avail_count, relaxed);
155	if (processor->is_recommended) {
156	os_atomic_inc(&processor_avail_count_user, relaxed);
157	}
158	if (processor->processor_primary == processor) {
159	os_atomic_inc(&primary_processor_avail_count, relaxed);
160	if (processor->is_recommended) {
161	os_atomic_inc(&primary_processor_avail_count_user, relaxed);
162	}
163	}
164	commpage_update_active_cpus();
165	}
166
167	/*
168	* processor_up:
169	*
170	* Flag processor as up and running, and available
171	* for scheduling.
172	*/
173	void
174	processor_up(
175	processor_t processor)
176	{
177	processor_set_t pset;
178	spl_t s;
179
180	s = splsched();
181	init_ast_check(processor);
182
183	#if defined(__arm64__)
184	/*
185	* A processor coming online won't have received a SIGPdebug signal
186	* to cause it to spin while a stackshot or panic is taking place,
187	* so spin here on mp_kdp_trap.
188	*
189	* However, since cpu_signal() is not yet enabled for this processor,
190	* there is a race if we have just passed this when a cpu_signal()
191	* is attempted. The sender will assume the cpu is offline, so it will
192	* not end up spinning anywhere. See processor_offline() for the fix
193	* for this race.
194	*/
195	wait_while_mp_kdp_trap(false);
196	#endif
197
198	pset = processor->processor_set;
199	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
200	pset_lock(pset);
201
202	++pset->online_processor_count;
203	simple_lock(&processor->start_state_lock, LCK_GRP_NULL);
204	pset_update_processor_state(pset, processor, new_state: PROCESSOR_RUNNING);
205	simple_unlock(&processor->start_state_lock);
206	bool temporary = processor->shutdown_temporary;
207	if (temporary) {
208	processor->shutdown_temporary = false;
209	} else {
210	processor_up_update_counts(processor);
211	}
212	if (processor->is_recommended) {
213	SCHED(pset_made_schedulable)(processor, pset, false);
214	}
215	pset_unlock(pset);
216	ml_cpu_up();
217	smr_cpu_up(processor, SMR_CPU_REASON_OFFLINE);
218	sched_mark_processor_online_locked(processor, reason: processor->last_startup_reason);
219	simple_unlock(&sched_available_cores_lock);
220	splx(s);
221
222	thread_wakeup((event_t)&processor->state);
223
224	#if CONFIG_DTRACE
225	if (dtrace_cpu_state_changed_hook) {
226	(*dtrace_cpu_state_changed_hook)(processor->cpu_id, TRUE);
227	}
228	#endif
229	}
230	#include <atm/atm_internal.h>
231
232	kern_return_t
233	host_reboot(
234	host_priv_t host_priv,
235	int options)
236	{
237	if (host_priv == HOST_PRIV_NULL) {
238	return KERN_INVALID_HOST;
239	}
240
241	#if DEVELOPMENT \|\| DEBUG
242	if (options & HOST_REBOOT_DEBUGGER) {
243	Debugger("Debugger");
244	return KERN_SUCCESS;
245	}
246	#endif
247
248	if (options & HOST_REBOOT_UPSDELAY) {
249	// UPS power cutoff path
250	PEHaltRestart( type: kPEUPSDelayHaltCPU );
251	} else {
252	halt_all_cpus(reboot: !(options & HOST_REBOOT_HALT));
253	}
254
255	return KERN_SUCCESS;
256	}
257
258	kern_return_t
259	processor_assign(
260	__unused processor_t processor,
261	__unused processor_set_t new_pset,
262	__unused boolean_t wait)
263	{
264	return KERN_FAILURE;
265	}
266
267	static void
268	processor_down_update_counts(processor_t processor)
269	{
270	ml_cpu_down_update_counts(cpu_id: processor->cpu_id);
271
272	os_atomic_dec(&processor_avail_count, relaxed);
273	if (processor->is_recommended) {
274	os_atomic_dec(&processor_avail_count_user, relaxed);
275	}
276	if (processor->processor_primary == processor) {
277	os_atomic_dec(&primary_processor_avail_count, relaxed);
278	if (processor->is_recommended) {
279	os_atomic_dec(&primary_processor_avail_count_user, relaxed);
280	}
281	}
282	commpage_update_active_cpus();
283	}
284
285	extern lck_mtx_t processor_updown_lock;
286
287	kern_return_t
288	processor_shutdown(
289	processor_t processor,
290	processor_reason_t reason,
291	uint32_t flags)
292	{
293	if (!ml_cpu_can_exit(cpu_id: processor->cpu_id, reason)) {
294	/*
295	* Failure if disallowed by arch code.
296	*/
297	return KERN_NOT_SUPPORTED;
298	}
299
300	lck_mtx_lock(lck: &processor_updown_lock);
301
302	spl_t s = splsched();
303	processor_set_t pset = processor->processor_set;
304
305	pset_lock(pset);
306
307	if (processor->state == PROCESSOR_START) {
308	pset_unlock(pset);
309	splx(s);
310
311	processor_wait_for_start(processor);
312
313	s = splsched();
314	pset_lock(pset);
315	}
316
317	/*
318	* If the processor is dispatching, let it finish.
319	*/
320	while (processor->state == PROCESSOR_DISPATCHING) {
321	pset_unlock(pset);
322	splx(s);
323	delay(usec: `1`);
324	s = splsched();
325	pset_lock(pset);
326	}
327	pset_unlock(pset);
328	splx(s);
329
330	kern_return_t mark_ret = sched_mark_processor_offline(processor, reason);
331	if (mark_ret != KERN_SUCCESS) {
332	/ Must fail or we deadlock /
333	lck_mtx_unlock(lck: &processor_updown_lock);
334	return KERN_FAILURE;
335	}
336
337	ml_cpu_begin_state_transition(cpu_id: processor->cpu_id);
338	s = splsched();
339
340	pset_lock(pset);
341	if (processor->state == PROCESSOR_OFF_LINE) {
342	/*
343	* Success if already shutdown.
344	*/
345	if (processor->shutdown_temporary && !(flags & SHUTDOWN_TEMPORARY)) {
346	/ Convert a temporary shutdown into a permanent shutdown /
347	processor->shutdown_temporary = false;
348	processor_down_update_counts(processor);
349	}
350	pset_unlock(pset);
351	splx(s);
352	ml_cpu_end_state_transition(cpu_id: processor->cpu_id);
353
354	lck_mtx_unlock(lck: &processor_updown_lock);
355	return KERN_SUCCESS;
356	}
357
358	if (processor->shutdown_locked && (reason != REASON_SYSTEM)) {
359	/*
360	* Failure if processor is locked against shutdown.
361	*/
362	pset_unlock(pset);
363	splx(s);
364
365	lck_mtx_unlock(lck: &processor_updown_lock);
366	return KERN_FAILURE;
367	}
368
369	/*
370	* If the processor is dispatching, let it finish.
371	*/
372	while (processor->state == PROCESSOR_DISPATCHING) {
373	pset_unlock(pset);
374	splx(s);
375	delay(usec: `1`);
376	s = splsched();
377	pset_lock(pset);
378	}
379
380	/*
381	* Success if already being shutdown with matching SHUTDOWN_TEMPORARY flag.
382	*/
383	if ((processor->state == PROCESSOR_SHUTDOWN) \|\| (processor->state == PROCESSOR_PENDING_OFFLINE)) {
384	bool success = (flags & SHUTDOWN_TEMPORARY) ? processor->shutdown_temporary : !processor->shutdown_temporary;
385
386	pset_unlock(pset);
387	splx(s);
388	ml_cpu_end_state_transition(cpu_id: processor->cpu_id);
389
390	lck_mtx_unlock(lck: &processor_updown_lock);
391	return success ? KERN_SUCCESS : KERN_FAILURE;
392	}
393
394	ml_broadcast_cpu_event(event: CPU_EXIT_REQUESTED, cpu_or_cluster: processor->cpu_id);
395	pset_update_processor_state(pset, processor, new_state: PROCESSOR_SHUTDOWN);
396	processor->last_shutdown_reason = reason;
397	if (flags & SHUTDOWN_TEMPORARY) {
398	processor->shutdown_temporary = true;
399	}
400	pset_unlock(pset);
401
402	processor_doshutdown(processor);
403	splx(s);
404
405	cpu_exit_wait(slot_num: processor->cpu_id);
406
407	if (processor != master_processor) {
408	s = splsched();
409	pset_lock(pset);
410	pset_update_processor_state(pset, processor, new_state: PROCESSOR_OFF_LINE);
411	pset_unlock(pset);
412	splx(s);
413	}
414
415	ml_cpu_end_state_transition(cpu_id: processor->cpu_id);
416	ml_broadcast_cpu_event(event: CPU_EXITED, cpu_or_cluster: processor->cpu_id);
417	ml_cpu_power_disable(cpu_id: processor->cpu_id);
418
419	lck_mtx_unlock(lck: &processor_updown_lock);
420	return KERN_SUCCESS;
421	}
422
423	/*
424	* Called with interrupts disabled.
425	*/
426	static void
427	processor_doshutdown(
428	processor_t processor)
429	{
430	thread_t self = current_thread();
431
432	/*
433	* Get onto the processor to shutdown
434	*/
435	processor_t prev = thread_bind(processor);
436	thread_block(THREAD_CONTINUE_NULL);
437
438	/ interrupts still disabled /
439	assert(ml_get_interrupts_enabled() == FALSE);
440
441	assert(processor == current_processor());
442	assert(processor->state == PROCESSOR_SHUTDOWN);
443
444	#if CONFIG_DTRACE
445	if (dtrace_cpu_state_changed_hook) {
446	(*dtrace_cpu_state_changed_hook)(processor->cpu_id, FALSE);
447	}
448	#endif
449
450	#if defined(__arm64__)
451	/*
452	* Catch a processor going offline
453	* while a panic or stackshot is in progress, as it won't
454	* receive a SIGPdebug now that interrupts are disabled.
455	*/
456	wait_while_mp_kdp_trap(false);
457	#endif
458
459	smr_cpu_down(processor, SMR_CPU_REASON_OFFLINE);
460	ml_cpu_down();
461
462	#if HIBERNATION
463	if (processor_avail_count < `2`) {
464	hibernate_vm_lock();
465	hibernate_vm_unlock();
466	}
467	#endif
468
469	processor_set_t pset = processor->processor_set;
470
471	pset_lock(pset);
472	pset_update_processor_state(pset, processor, new_state: PROCESSOR_PENDING_OFFLINE);
473	--pset->online_processor_count;
474	if (!processor->shutdown_temporary) {
475	processor_down_update_counts(processor);
476	}
477	SCHED(processor_queue_shutdown)(processor);
478	/ pset lock dropped /
479	SCHED(rt_queue_shutdown)(processor);
480
481	thread_bind(processor: prev);
482
483	/ interrupts still disabled /
484
485	/*
486	* Continue processor shutdown on the processor's idle thread.
487	* The handoff won't fail because the idle thread has a reserved stack.
488	* Switching to the idle thread leaves interrupts disabled,
489	* so we can't accidentally take an interrupt after the context switch.
490	*/
491	thread_t shutdown_thread = processor->idle_thread;
492	shutdown_thread->continuation = processor_offline;
493	shutdown_thread->parameter = processor;
494
495	thread_run(self, NULL, NULL, new_thread: shutdown_thread);
496	}
497
498	/*
499	* Called in the context of the idle thread to shut down the processor
500	*
501	* A shut-down processor looks like it's 'running' the idle thread parked
502	* in this routine, but it's actually been powered off and has no hardware state.
503	*/
504	static void
505	processor_offline(
506	void * parameter,
507	__unused wait_result_t result)
508	{
509	processor_t processor = (processor_t) parameter;
510	thread_t self = current_thread();
511	__assert_only thread_t old_thread = THREAD_NULL;
512
513	assert(processor == current_processor());
514	assert(self->state & TH_IDLE);
515	assert(processor->idle_thread == self);
516	assert(ml_get_interrupts_enabled() == FALSE);
517	assert(self->continuation == NULL);
518	assert(processor->processor_offlined == false);
519	assert(processor->running_timers_active == false);
520
521	bool enforce_quiesce_safety = gEnforcePlatformActionSafety;
522
523	/*
524	* Scheduling is now disabled for this processor.
525	* Ensure that primitives that need scheduling (like mutexes) know this.
526	*/
527	if (enforce_quiesce_safety) {
528	disable_preemption_without_measurements();
529	}
530
531	/ convince slave_main to come back here /
532	processor->processor_offlined = true;
533
534	/*
535	* Switch to the interrupt stack and shut down the processor.
536	*
537	* When the processor comes back, it will eventually call load_context which
538	* restores the context saved by machine_processor_shutdown, returning here.
539	*/
540	old_thread = machine_processor_shutdown(thread: self, doshutdown: processor_offline_intstack, processor);
541
542	/ old_thread should be NULL because we got here through Load_context /
543	assert(old_thread == THREAD_NULL);
544
545	assert(processor == current_processor());
546	assert(processor->idle_thread == current_thread());
547
548	assert(ml_get_interrupts_enabled() == FALSE);
549	assert(self->continuation == NULL);
550
551	/ Extract the machine_param value stashed by slave_main /
552	void * machine_param = self->parameter;
553	self->parameter = NULL;
554
555	/ Re-initialize the processor /
556	slave_machine_init(machine_param);
557
558	assert(processor->processor_offlined == true);
559	processor->processor_offlined = false;
560
561	if (enforce_quiesce_safety) {
562	enable_preemption();
563	}
564
565	#if defined(__arm64__)
566	/*
567	* See the comments for DebuggerLock in processor_up().
568	*
569	* SIGPdisabled is cleared (to enable cpu_signal() to succeed with this processor)
570	* the first time we take an IPI. This is triggered by slave_machine_init(), above,
571	* which calls cpu_machine_init()->PE_cpu_machine_init()->PE_cpu_signal() which sends
572	* a self-IPI to ensure that happens when we enable interrupts. So enable interrupts
573	* here so that cpu_signal() can succeed before we spin on mp_kdp_trap.
574	*/
575	ml_set_interrupts_enabled(TRUE);
576
577	ml_set_interrupts_enabled(FALSE);
578
579	wait_while_mp_kdp_trap(true);
580
581	/*
582	* At this point,
583	* if a stackshot or panic is in progress, we either spin on mp_kdp_trap
584	* or we sucessfully received a SIGPdebug signal which will cause us to
585	* break out of the spin on mp_kdp_trap and instead
586	* spin next time interrupts are enabled in idle_thread().
587	*/
588	#endif
589
590	/*
591	* Now that the processor is back, invoke the idle thread to find out what to do next.
592	* idle_thread will enable interrupts.
593	*/
594	thread_block(continuation: idle_thread);
595	/NOTREACHED/
596	}
597
598	/*
599	* Complete the shutdown and place the processor offline.
600	*
601	* Called at splsched in the shutdown context
602	* (i.e. on the idle thread, on the interrupt stack)
603	*
604	* The onlining half of this is done in load_context().
605	*/
606	static void
607	processor_offline_intstack(
608	processor_t processor)
609	{
610	assert(processor == current_processor());
611	assert(processor->active_thread == current_thread());
612
613	struct recount_snap snap = { `0` };
614	recount_snapshot(snap: &snap);
615	recount_processor_idle(pr: &processor->pr_recount, snap: &snap);
616
617	smr_cpu_leave(processor, ctime: processor->last_dispatch);
618
619	PMAP_DEACTIVATE_KERNEL(processor->cpu_id);
620
621	cpu_sleep();
622	panic("zombie processor");
623	/NOTREACHED/
624	}
625
626	kern_return_t
627	host_get_boot_info(
628	host_priv_t host_priv,
629	kernel_boot_info_t boot_info)
630	{
631	const char *src = "";
632	if (host_priv == HOST_PRIV_NULL) {
633	return KERN_INVALID_HOST;
634	}
635
636	/*
637	* Copy first operator string terminated by '\0' followed by
638	* standardized strings generated from boot string.
639	*/
640	src = machine_boot_info(buf: boot_info, KERNEL_BOOT_INFO_MAX);
641	if (src != boot_info) {
642	(void) strncpy(boot_info, src, KERNEL_BOOT_INFO_MAX);
643	}
644
645	return KERN_SUCCESS;
646	}
647
648	// These are configured through sysctls.
649	#if DEVELOPMENT \|\| DEBUG
650	uint32_t phy_read_panic = `1`;
651	uint32_t phy_write_panic = `1`;
652	uint64_t simulate_stretched_io = `0`;
653	#else
654	uint32_t phy_read_panic = `0`;
655	uint32_t phy_write_panic = `0`;
656	#endif
657
658	#if !defined(__x86_64__)
659
660	#if DEVELOPMENT \|\| DEBUG
661	static const uint64_t TIMEBASE_TICKS_PER_USEC = `24000000ULL` / USEC_PER_SEC;
662	static const uint64_t DEFAULT_TRACE_PHY_TIMEOUT = `100` * TIMEBASE_TICKS_PER_USEC;
663	#else
664	static const uint64_t DEFAULT_TRACE_PHY_TIMEOUT = `0`;
665	#endif
666
667	// The MACHINE_TIMEOUT facility only exists on ARM.
668	MACHINE_TIMEOUT_DEV_WRITEABLE(report_phy_read_delay_to, "report-phy-read-delay", `0`, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
669	MACHINE_TIMEOUT_DEV_WRITEABLE(report_phy_write_delay_to, "report-phy-write-delay", `0`, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
670	MACHINE_TIMEOUT_DEV_WRITEABLE(trace_phy_read_delay_to, "trace-phy-read-delay", DEFAULT_TRACE_PHY_TIMEOUT, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
671	MACHINE_TIMEOUT_DEV_WRITEABLE(trace_phy_write_delay_to, "trace-phy-write-delay", DEFAULT_TRACE_PHY_TIMEOUT, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
672
673	#if SCHED_HYGIENE_DEBUG
674	/*
675	* Note: The interrupt-masked timeout goes through two initializations - one
676	* early in boot and one later. Thus this function is also called twice and
677	* can't be marked '__startup_func'.
678	*/
679	static void
680	ml_io_init_timeouts(void)
681	{
682	/*
683	* The timeouts may be completely disabled via an override.
684	*/
685	if (kern_feature_override(KF_IO_TIMEOUT_OVRD)) {
686	os_atomic_store(&report_phy_write_delay_to, `0`, relaxed);
687	os_atomic_store(&report_phy_read_delay_to, `0`, relaxed);
688	return;
689	}
690
691	/*
692	* There may be no interrupt masked timeout set.
693	*/
694	const uint64_t interrupt_masked_to = os_atomic_load(&interrupt_masked_timeout, relaxed);
695	if (interrupt_masked_timeout == `0`) {
696	return;
697	}
698
699	/*
700	* Inherit from the interrupt masked timeout if smaller and the timeout
701	* hasn't been explicitly set via boot-arg.
702	*/
703	uint64_t arg = `0`;
704
705	if (!PE_parse_boot_argn("ml-timeout-report-phy-read-delay", &arg, sizeof(arg))) {
706	uint64_t report_phy_read_delay = os_atomic_load(&report_phy_read_delay_to, relaxed);
707	report_phy_read_delay = report_phy_read_delay == `0` ?
708	interrupt_masked_to :
709	MIN(report_phy_read_delay, interrupt_masked_to);
710	os_atomic_store(&report_phy_read_delay_to, report_phy_read_delay, relaxed);
711	}
712
713	if (!PE_parse_boot_argn("ml-timeout-report-phy-write-delay", &arg, sizeof(arg))) {
714	uint64_t report_phy_write_delay = os_atomic_load(&report_phy_write_delay_to, relaxed);
715	report_phy_write_delay = report_phy_write_delay == `0` ?
716	interrupt_masked_to :
717	MIN(report_phy_write_delay, interrupt_masked_to);
718	os_atomic_store(&report_phy_write_delay_to, report_phy_write_delay, relaxed);
719	}
720	}
721
722	/*
723	* It's important that this happens after machine timeouts have initialized so
724	* the correct timeouts can be inherited.
725	*/
726	STARTUP(TIMEOUTS, STARTUP_RANK_SECOND, ml_io_init_timeouts);
727	#endif /* SCHED_HYGIENE_DEBUG */
728
729	extern pmap_paddr_t kvtophys(vm_offset_t va);
730	#endif
731
732	#if ML_IO_TIMEOUTS_ENABLED
733
734	static LCK_GRP_DECLARE(io_timeout_override_lock_grp, "io_timeout_override");
735	static LCK_SPIN_DECLARE(io_timeout_override_lock, &io_timeout_override_lock_grp);
736
737	struct io_timeout_override_entry {
738	RB_ENTRY(io_timeout_override_entry) tree;
739
740	uintptr_t iovaddr_base;
741	unsigned int size;
742	uint32_t read_timeout;
743	uint32_t write_timeout;
744	};
745
746	static inline int
747	io_timeout_override_cmp(const struct io_timeout_override_entry a, const* struct io_timeout_override_entry *b)
748	{
749	if (a->iovaddr_base < b->iovaddr_base) {
750	return -`1`;
751	} else if (a->iovaddr_base > b->iovaddr_base) {
752	return `1`;
753	} else {
754	return `0`;
755	}
756	}
757
758	static RB_HEAD(io_timeout_override, io_timeout_override_entry) io_timeout_override_root;
759	RB_PROTOTYPE_PREV(io_timeout_override, io_timeout_override_entry, tree, io_timeout_override_cmp);
760	RB_GENERATE_PREV(io_timeout_override, io_timeout_override_entry, tree, io_timeout_override_cmp);
761
762	#endif /* ML_IO_TIMEOUTS_ENABLED */
763
764	int
765	ml_io_increase_timeouts(uintptr_t iovaddr_base, unsigned int size, uint32_t read_timeout_us, uint32_t write_timeout_us)
766	{
767	#if ML_IO_TIMEOUTS_ENABLED
768	const size_t MAX_SIZE = `4096`;
769	const uint64_t MAX_TIMEOUT_ABS = UINT32_MAX;
770
771	assert(preemption_enabled());
772
773	int ret = KERN_SUCCESS;
774
775	if (size == `0`) {
776	return KERN_INVALID_ARGUMENT;
777	}
778
779	uintptr_t iovaddr_end;
780	if (size > MAX_SIZE \|\| os_add_overflow(iovaddr_base, size - `1`, &iovaddr_end)) {
781	return KERN_INVALID_ARGUMENT;
782	}
783
784	uint64_t read_timeout_abs, write_timeout_abs;
785	nanoseconds_to_absolutetime(NSEC_PER_USEC * read_timeout_us, result: &read_timeout_abs);
786	nanoseconds_to_absolutetime(NSEC_PER_USEC * write_timeout_us, result: &write_timeout_abs);
787	if (read_timeout_abs > MAX_TIMEOUT_ABS \|\| write_timeout_abs > MAX_TIMEOUT_ABS) {
788	return KERN_INVALID_ARGUMENT;
789	}
790
791	struct io_timeout_override_entry node = kalloc_type(struct* io_timeout_override_entry, Z_WAITOK \| Z_ZERO \| Z_NOFAIL);
792	node->iovaddr_base = iovaddr_base;
793	node->size = size;
794	node->read_timeout = (uint32_t)read_timeout_abs;
795	node->write_timeout = (uint32_t)write_timeout_abs;
796
797	/*
798	* Interrupt handlers are allowed to call ml_io_{read,write}*, so
799	* interrupts must be disabled any time io_timeout_override_lock is
800	* held. Otherwise the CPU could take an interrupt while holding the
801	* lock, invoke an ISR that calls ml_io_{read,write}*, and deadlock
802	* trying to acquire the lock again.
803	*/
804	boolean_t istate = ml_set_interrupts_enabled(FALSE);
805	lck_spin_lock(lck: &io_timeout_override_lock);
806	if (RB_INSERT(io_timeout_override, &io_timeout_override_root, node)) {
807	ret = KERN_INVALID_ARGUMENT;
808	goto out;
809	}
810
811	/ Check that this didn't create any new overlaps /
812	struct io_timeout_override_entry *prev = RB_PREV(io_timeout_override, &io_timeout_override_root, node);
813	if (prev && (prev->iovaddr_base + prev->size) > node->iovaddr_base) {
814	RB_REMOVE(io_timeout_override, &io_timeout_override_root, node);
815	ret = KERN_INVALID_ARGUMENT;
816	goto out;
817	}
818	struct io_timeout_override_entry *next = RB_NEXT(io_timeout_override, &io_timeout_override_root, node);
819	if (next && (node->iovaddr_base + node->size) > next->iovaddr_base) {
820	RB_REMOVE(io_timeout_override, &io_timeout_override_root, node);
821	ret = KERN_INVALID_ARGUMENT;
822	goto out;
823	}
824
825	out:
826	lck_spin_unlock(lck: &io_timeout_override_lock);
827	ml_set_interrupts_enabled(enable: istate);
828	if (ret != KERN_SUCCESS) {
829	kfree_type(struct io_timeout_override_entry, node);
830	}
831	return ret;
832	#else /* !ML_IO_TIMEOUTS_ENABLED */
833	#pragma unused(iovaddr_base, size, read_timeout_us, write_timeout_us)
834	return KERN_SUCCESS;
835	#endif
836	}
837
838	int
839	ml_io_reset_timeouts(uintptr_t iovaddr_base, unsigned int size)
840	{
841	#if ML_IO_TIMEOUTS_ENABLED
842	assert(preemption_enabled());
843
844	struct io_timeout_override_entry key = { .iovaddr_base = iovaddr_base };
845
846	boolean_t istate = ml_set_interrupts_enabled(FALSE);
847	lck_spin_lock(lck: &io_timeout_override_lock);
848	struct io_timeout_override_entry *node = RB_FIND(io_timeout_override, &io_timeout_override_root, &key);
849	if (node) {
850	if (node->size == size) {
851	RB_REMOVE(io_timeout_override, &io_timeout_override_root, node);
852	} else {
853	node = NULL;
854	}
855	}
856	lck_spin_unlock(lck: &io_timeout_override_lock);
857	ml_set_interrupts_enabled(enable: istate);
858
859	if (!node) {
860	return KERN_NOT_FOUND;
861	}
862
863	kfree_type(struct io_timeout_override_entry, node);
864	#else /* !ML_IO_TIMEOUTS_ENABLED */
865	#pragma unused(iovaddr_base, size)
866	#endif
867	return KERN_SUCCESS;
868	}
869
870	#if ML_IO_TIMEOUTS_ENABLED
871
872	static bool
873	override_io_timeouts_va(uintptr_t vaddr, uint64_t read_timeout, uint64_t write_timeout)
874	{
875	assert(!ml_get_interrupts_enabled());
876
877	struct io_timeout_override_entry *node = RB_ROOT(&io_timeout_override_root);
878
879	lck_spin_lock(lck: &io_timeout_override_lock);
880	/ RB_FIND() doesn't support custom cmp functions, so we have to open-code our own /
881	while (node) {
882	if (node->iovaddr_base <= vaddr && vaddr < node->iovaddr_base + node->size) {
883	if (read_timeout) {
884	*read_timeout = node->read_timeout;
885	}
886	if (write_timeout) {
887	*write_timeout = node->write_timeout;
888	}
889	lck_spin_unlock(lck: &io_timeout_override_lock);
890	return true;
891	} else if (vaddr < node->iovaddr_base) {
892	node = RB_LEFT(node, tree);
893	} else {
894	node = RB_RIGHT(node, tree);
895	}
896	}
897	lck_spin_unlock(lck: &io_timeout_override_lock);
898
899	return false;
900	}
901
/*
 *	override_io_timeouts_pa:
 *
 *	Physical-address based override.  On arm64, if `paddr` lies in an
 *	I/O range flagged PMAP_IO_RANGE_STRONG_SYNC, store
 *	STRONG_SYNC_TIMEOUT through whichever of `read_timeout` /
 *	`write_timeout` is non-NULL and return true.  Returns false in all
 *	other cases, including on every other architecture.
 */
static bool
override_io_timeouts_pa(uint64_t paddr, uint64_t *read_timeout, uint64_t *write_timeout)
{
#if defined(__arm64__)
	/*
	 * PCIe regions are marked with PMAP_IO_RANGE_STRONG_SYNC. Apply a
	 * timeout greater than the PCIe completion timeout (50ms). In some
	 * cases those timeouts can stack so make the timeout significantly
	 * higher.
	 */
	#define STRONG_SYNC_TIMEOUT 1800000 /* 75ms */

	pmap_io_range_t *range = pmap_find_io_attr(paddr);
	if (range != NULL && (range->wimg & PMAP_IO_RANGE_STRONG_SYNC) != 0) {
		if (read_timeout) {
			*read_timeout = STRONG_SYNC_TIMEOUT;
		}
		if (write_timeout) {
			*write_timeout = STRONG_SYNC_TIMEOUT;
		}

		return true;
	}
#else
	(void)paddr;
	(void)read_timeout;
	(void)write_timeout;
#endif /* __arm64__ */
	return false;
}
932
/*
 *	override_io_timeouts:
 *
 *	Apply any registered timeout override for an MMIO access.  The
 *	virtual-address tree is consulted first (when vaddr is non-zero);
 *	only if it has no match is the physical-address rule tried (when
 *	paddr is non-zero).  The outputs are left untouched when nothing
 *	matches; either output pointer may be NULL.
 */
void
override_io_timeouts(uintptr_t vaddr, uint64_t paddr, uint64_t *read_timeout, uint64_t *write_timeout)
{
	if (vaddr != 0 &&
	    override_io_timeouts_va(vaddr, read_timeout, write_timeout)) {
		return;
	}

	if (paddr != 0 &&
	    override_io_timeouts_pa(paddr, read_timeout, write_timeout)) {
		return;
	}
}
946	#endif /* ML_IO_TIMEOUTS_ENABLED */
947
/*
 *	ml_io_read:
 *
 *	Perform a volatile MMIO read of `size` bytes (1, 2, 4 or 8) at
 *	kernel virtual address `vaddr` and return the value zero-extended
 *	to unsigned long long.  Any other size panics.
 *
 *	When ML_IO_TIMEOUTS_ENABLED is defined and a read report threshold
 *	is configured, the access is timed with interrupts disabled.  If it
 *	exceeds the threshold (after applying any per-range override) the
 *	kernel panics when phy_read_panic is set and machine timeouts are
 *	not suspended; accesses slower than the trace threshold emit a
 *	kdebug event.
 */
unsigned long long
ml_io_read(uintptr_t vaddr, int size)
{
	unsigned long long result = 0;
	unsigned char s1;
	unsigned short s2;

#ifdef ML_IO_VERIFY_UNCACHEABLE
	uintptr_t const paddr = pmap_verify_noncacheable(vaddr);
#elif defined(ML_IO_TIMEOUTS_ENABLED)
	uintptr_t const paddr = kvtophys(vaddr);
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	/* sabs/eabs/istate are only meaningful when timeread is TRUE. */
	uint64_t sabs, eabs;
	boolean_t istate, timeread = FALSE;
	uint64_t report_read_delay;
#if __x86_64__
	report_read_delay = report_phy_read_delay;
#else
	report_read_delay = os_atomic_load(&report_phy_read_delay_to, relaxed);
	uint64_t const trace_phy_read_delay = os_atomic_load(&trace_phy_read_delay_to, relaxed);
#endif /* __x86_64__ */

	if (__improbable(report_read_delay != 0)) {
		istate = ml_set_interrupts_enabled(FALSE);
		sabs = ml_io_timestamp();
		timeread = TRUE;
	}

#ifdef ML_IO_SIMULATE_STRETCHED_ENABLED
	if (__improbable(timeread && simulate_stretched_io)) {
		/* Pretend the access started earlier to exercise the slow path. */
		sabs -= simulate_stretched_io;
	}
#endif /* ML_IO_SIMULATE_STRETCHED_ENABLED */
#endif /* ML_IO_TIMEOUTS_ENABLED */

#if DEVELOPMENT || DEBUG
	boolean_t use_fences = !kern_feature_override(KF_IO_TIMEOUT_OVRD);
	if (use_fences) {
		ml_timebase_to_memory_fence();
	}
#endif

	switch (size) {
	case 1:
		s1 = *(volatile unsigned char *)vaddr;
		result = s1;
		break;
	case 2:
		s2 = *(volatile unsigned short *)vaddr;
		result = s2;
		break;
	case 4:
		result = *(volatile unsigned int *)vaddr;
		break;
	case 8:
		result = *(volatile unsigned long long *)vaddr;
		break;
	default:
		panic("Invalid size %d for ml_io_read(%p)", size, (void *)vaddr);
		break;
	}

#if DEVELOPMENT || DEBUG
	if (use_fences) {
		ml_memory_to_timebase_fence();
	}
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	if (__improbable(timeread == TRUE)) {
		eabs = ml_io_timestamp();

		/* Prevent the processor from calling iotrace during its
		 * initialization procedure. */
		if (current_processor()->state == PROCESSOR_RUNNING) {
			iotrace(IOTRACE_IO_READ, vaddr, paddr, size, result, sabs, eabs - sabs);
		}

		if (__improbable((eabs - sabs) > report_read_delay)) {
			DTRACE_PHYSLAT5(physioread, uint64_t, (eabs - sabs),
			    uint64_t, vaddr, uint32_t, size, uint64_t, paddr, uint64_t, result);

			uint64_t override = 0;
			override_io_timeouts(vaddr, paddr, &override, NULL);

			if (override != 0) {
#if SCHED_HYGIENE_DEBUG
				/*
				 * The IO timeout was overridden. As interrupts are disabled in
				 * order to accurately measure IO time this can cause the
				 * interrupt masked timeout threshold to be exceeded.  If the
				 * interrupt masked debug mode is set to panic, abandon the
				 * measurement. If in trace mode leave it as-is for
				 * observability.
				 */
				if (interrupt_masked_debug_mode == SCHED_HYGIENE_MODE_PANIC) {
					ml_spin_debug_clear(current_thread());
					ml_irq_debug_abandon();
				}
#endif
				report_read_delay = override;
			}
		}

		/* Re-test: report_read_delay may have been raised by an override. */
		if (__improbable((eabs - sabs) > report_read_delay)) {
			if (phy_read_panic && (machine_timeout_suspended() == FALSE)) {
#if defined(__x86_64__)
				panic_notify();
#endif /* defined(__x86_64__) */
				uint64_t nsec = 0;
				absolutetime_to_nanoseconds(eabs - sabs, &nsec);
				panic("Read from IO vaddr 0x%lx paddr 0x%lx took %llu ns, "
				    "result: 0x%llx (start: %llu, end: %llu), ceiling: %llu",
				    vaddr, paddr, nsec, result, sabs, eabs,
				    report_read_delay);
			}
		}

		if (__improbable(trace_phy_read_delay > 0 && (eabs - sabs) > trace_phy_read_delay)) {
			KDBG(MACHDBG_CODE(DBG_MACH_IO, DBC_MACH_IO_MMIO_READ),
			    (eabs - sabs), VM_KERNEL_UNSLIDE_OR_PERM(vaddr), paddr, result);
		}

		(void)ml_set_interrupts_enabled(istate);
	}
#endif /* ML_IO_TIMEOUTS_ENABLED */
	return result;
}
1078
/*
 * Read a 1-byte value from the I/O address vaddr through ml_io_read()
 * and return it converted to unsigned int.
 */
unsigned int
ml_io_read8(uintptr_t vaddr)
{
	return (unsigned) ml_io_read(vaddr, 1);
}
1084
/*
 * Read a 2-byte value from the I/O address vaddr through ml_io_read()
 * and return it converted to unsigned int.
 */
unsigned int
ml_io_read16(uintptr_t vaddr)
{
	return (unsigned) ml_io_read(vaddr, 2);
}
1090
/*
 * Read a 4-byte value from the I/O address vaddr through ml_io_read()
 * and return it converted to unsigned int.
 */
unsigned int
ml_io_read32(uintptr_t vaddr)
{
	return (unsigned) ml_io_read(vaddr, 4);
}
1096
/*
 * Read an 8-byte value from the I/O address vaddr through ml_io_read()
 * and return it unmodified.
 */
unsigned long long
ml_io_read64(uintptr_t vaddr)
{
	return ml_io_read(vaddr, 8);
}
1102
/* ml_io_write* */
1104
/*
 * ml_io_write
 *
 * Store the low `size` bytes of `val` to the I/O virtual address `vaddr`
 * with a single volatile access.  `size` must be 1, 2, 4 or 8; any other
 * value panics.
 *
 * When ML_IO_TIMEOUTS_ENABLED is defined and the write report threshold is
 * non-zero, the store is timed with interrupts disabled.  After the store:
 *   - the access is passed to iotrace() if the current processor is in
 *     PROCESSOR_RUNNING state;
 *   - if the elapsed time exceeds the threshold, a DTrace probe fires and
 *     override_io_timeouts() is asked for a per-address threshold, which
 *     replaces the default one when non-zero;
 *   - if the elapsed time still exceeds the threshold, phy_write_panic is
 *     set and machine timeouts are not suspended, the kernel panics;
 *   - if the elapsed time exceeds a non-zero trace threshold, a kdebug
 *     event is emitted.
 * The previous interrupt state is restored before returning.
 *
 * On DEVELOPMENT/DEBUG kernels the store is bracketed by timebase/memory
 * fences unless KF_IO_TIMEOUT_OVRD is set via kern_feature_override().
 *
 * NOTE(review): on x86_64 `trace_phy_write_delay` is not declared in this
 * function, so it must come from an enclosing scope that is not visible
 * here -- confirm.
 */
void
ml_io_write(uintptr_t vaddr, uint64_t val, int size)
{
#ifdef ML_IO_VERIFY_UNCACHEABLE
	uintptr_t const paddr = pmap_verify_noncacheable(vaddr);
#elif defined(ML_IO_TIMEOUTS_ENABLED)
	uintptr_t const paddr = kvtophys(vaddr);
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	uint64_t sabs, eabs;
	boolean_t istate, timewrite = FALSE;
	uint64_t report_write_delay;
#if __x86_64__
	report_write_delay = report_phy_write_delay;
#else
	report_write_delay = os_atomic_load(&report_phy_write_delay_to, relaxed);
	uint64_t const trace_phy_write_delay = os_atomic_load(&trace_phy_write_delay_to, relaxed);
#endif /* __x86_64__ */
	if (__improbable(report_write_delay != 0)) {
		/* Interrupts stay off for the whole measurement so it is accurate. */
		istate = ml_set_interrupts_enabled(FALSE);
		sabs = ml_io_timestamp();
		timewrite = TRUE;
	}

#ifdef ML_IO_SIMULATE_STRETCHED_ENABLED
	if (__improbable(timewrite && simulate_stretched_io)) {
		/* Pretend the access started earlier to simulate a slow write. */
		sabs -= simulate_stretched_io;
	}
#endif /* ML_IO_SIMULATE_STRETCHED_ENABLED */
#endif /* ML_IO_TIMEOUTS_ENABLED */

#if DEVELOPMENT || DEBUG
	boolean_t use_fences = !kern_feature_override(KF_IO_TIMEOUT_OVRD);
	if (use_fences) {
		ml_timebase_to_memory_fence();
	}
#endif

	switch (size) {
	case 1:
		*(volatile uint8_t *)vaddr = (uint8_t)val;
		break;
	case 2:
		*(volatile uint16_t *)vaddr = (uint16_t)val;
		break;
	case 4:
		*(volatile uint32_t *)vaddr = (uint32_t)val;
		break;
	case 8:
		*(volatile uint64_t *)vaddr = (uint64_t)val;
		break;
	default:
		panic("Invalid size %d for ml_io_write(%p, 0x%llx)", size, (void *)vaddr, val);
		break;
	}

#if DEVELOPMENT || DEBUG
	if (use_fences) {
		ml_memory_to_timebase_fence();
	}
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	if (__improbable(timewrite == TRUE)) {
		eabs = ml_io_timestamp();


		/* Prevent the processor from calling iotrace during its
		 * initialization procedure. */
		if (current_processor()->state == PROCESSOR_RUNNING) {
			iotrace(IOTRACE_IO_WRITE, vaddr, paddr, size, val, sabs, eabs - sabs);
		}


		if (__improbable((eabs - sabs) > report_write_delay)) {
			DTRACE_PHYSLAT5(physiowrite, uint64_t, (eabs - sabs),
			    uint64_t, vaddr, uint32_t, size, uint64_t, paddr, uint64_t, val);

			uint64_t override = 0;
			override_io_timeouts(vaddr, paddr, NULL, &override);

			if (override != 0) {
#if SCHED_HYGIENE_DEBUG
				/*
				 * The IO timeout was overridden. As interrupts are disabled in
				 * order to accurately measure IO time this can cause the
				 * interrupt masked timeout threshold to be exceeded. If the
				 * interrupt masked debug mode is set to panic, abandon the
				 * measurement. If in trace mode leave it as-is for
				 * observability.
				 */
				if (interrupt_masked_debug_mode == SCHED_HYGIENE_MODE_PANIC) {
					ml_spin_debug_clear(current_thread());
					ml_irq_debug_abandon();
				}
#endif
				report_write_delay = override;
			}
		}

		/* Re-check: the threshold may have been raised by an override above. */
		if (__improbable((eabs - sabs) > report_write_delay)) {
			if (phy_write_panic && (machine_timeout_suspended() == FALSE)) {
#if defined(__x86_64__)
				panic_notify();
#endif /* defined(__x86_64__) */

				uint64_t nsec = 0;
				absolutetime_to_nanoseconds(eabs - sabs, &nsec);
				panic("Write to IO vaddr %p paddr %p val 0x%llx took %llu ns,"
				    " (start: %llu, end: %llu), ceiling: %llu",
				    (void *)vaddr, (void *)paddr, val, nsec, sabs, eabs,
				    report_write_delay);
			}
		}

		if (__improbable(trace_phy_write_delay > 0 && (eabs - sabs) > trace_phy_write_delay)) {
			KDBG(MACHDBG_CODE(DBG_MACH_IO, DBC_MACH_IO_MMIO_WRITE),
			    (eabs - sabs), VM_KERNEL_UNSLIDE_OR_PERM(vaddr), paddr, val);
		}

		(void)ml_set_interrupts_enabled(istate);
	}
#endif /* ML_IO_TIMEOUTS_ENABLED */
}
1230
/*
 * Write the 1-byte value val to the I/O address vaddr through ml_io_write().
 */
void
ml_io_write8(uintptr_t vaddr, uint8_t val)
{
	ml_io_write(vaddr, val, 1);
}
1236
/*
 * Write the 2-byte value val to the I/O address vaddr through ml_io_write().
 */
void
ml_io_write16(uintptr_t vaddr, uint16_t val)
{
	ml_io_write(vaddr, val, 2);
}
1242
/*
 * Write the 4-byte value val to the I/O address vaddr through ml_io_write().
 */
void
ml_io_write32(uintptr_t vaddr, uint32_t val)
{
	ml_io_write(vaddr, val, 4);
}
1248
/*
 * Write the 8-byte value val to the I/O address vaddr through ml_io_write().
 */
void
ml_io_write64(uintptr_t vaddr, uint64_t val)
{
	ml_io_write(vaddr, val, 8);
}
1254
/*
 * One registered CPU event callback.  Elements are linked into a singly
 * linked list through `next`.
 */
struct cpu_callback_chain_elem {
	cpu_callback_t fn;                      /* callback to invoke */
	void *param;                            /* passed to fn as its first argument */
	struct cpu_callback_chain_elem *next;   /* next element, or NULL at the tail */
};
1260
/* Head of the registered callback list. */
static struct cpu_callback_chain_elem *cpu_callback_chain;
/* Lock group and spin lock taken by cpu_event_register_callback() while it links a new element. */
static LCK_GRP_DECLARE(cpu_callback_chain_lock_grp, "cpu_callback_chain");
static LCK_SPIN_DECLARE(cpu_callback_chain_lock, &cpu_callback_chain_lock_grp);
1264
1265	void
1266	cpu_event_register_callback(cpu_callback_t fn, void *param)
1267	{
1268	struct cpu_callback_chain_elem *new_elem;
1269
1270	new_elem = zalloc_permanent_type(struct cpu_callback_chain_elem);
1271	if (!new_elem) {
1272	panic("can't allocate cpu_callback_chain_elem");
1273	}
1274
1275	lck_spin_lock(lck: &cpu_callback_chain_lock);
1276	new_elem->next = cpu_callback_chain;
1277	new_elem->fn = fn;
1278	new_elem->param = param;
1279	os_atomic_store(&cpu_callback_chain, new_elem, release);
1280	lck_spin_unlock(lck: &cpu_callback_chain_lock);
1281	}
1282
1283	__attribute__((noreturn))
1284	void
1285	cpu_event_unregister_callback(__unused cpu_callback_t fn)
1286	{
1287	panic("Unfortunately, cpu_event_unregister_callback is unimplemented.");
1288	}
1289
1290	void
1291	ml_broadcast_cpu_event(enum cpu_event event, unsigned int cpu_or_cluster)
1292	{
1293	struct cpu_callback_chain_elem *cursor;
1294
1295	cursor = os_atomic_load(&cpu_callback_chain, dependency);
1296	for (; cursor != NULL; cursor = cursor->next) {
1297	cursor->fn(cursor->param, event, cpu_or_cluster);
1298	}
1299	}
1300
1301	// Initialize Machine Timeouts (see the MACHINE_TIMEOUT macro
1302	// definition)
1303
1304	void
1305	machine_timeout_init_with_suffix(const struct machine_timeout_spec spec, char* const *suffix)
1306	{
1307	if (spec->skip_predicate != NULL && spec->skip_predicate(spec)) {
1308	// This timeout should be disabled.
1309	os_atomic_store_wide((uint64_t*)spec->ptr, `0`, relaxed);
1310	return;
1311	}
1312
1313	assert(suffix != NULL);
1314	assert(strlen(spec->name) <= MACHINE_TIMEOUT_MAX_NAME_LEN);
1315
1316	size_t const suffix_len = strlen(s: suffix);
1317
1318	size_t const dt_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + suffix_len + `1`;
1319	char dt_name[dt_name_size];
1320
1321	strlcpy(dst: dt_name, src: spec->name, n: dt_name_size);
1322	strlcat(dst: dt_name, src: suffix, n: dt_name_size);
1323
1324	size_t const scale_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + suffix_len + strlen(s: "-scale") + `1`;
1325	char scale_name[scale_name_size];
1326
1327	strlcpy(dst: scale_name, src: spec->name, n: scale_name_size);
1328	strlcat(dst: scale_name, src: suffix, n: scale_name_size);
1329	strlcat(dst: scale_name, src: "-scale", n: scale_name_size);
1330
1331	size_t const boot_arg_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + strlen(s: "ml-timeout-") + suffix_len + `1`;
1332	char boot_arg_name[boot_arg_name_size];
1333
1334	strlcpy(dst: boot_arg_name, src: "ml-timeout-", n: boot_arg_name_size);
1335	strlcat(dst: boot_arg_name, src: spec->name, n: boot_arg_name_size);
1336	strlcat(dst: boot_arg_name, src: suffix, n: boot_arg_name_size);
1337
1338	size_t const boot_arg_scale_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN +
1339	strlen(s: "ml-timeout-") + strlen(s: "-scale") + suffix_len + `1`;
1340	char boot_arg_scale_name[boot_arg_scale_name_size];
1341
1342	strlcpy(dst: boot_arg_scale_name, src: "ml-timeout-", n: boot_arg_scale_name_size);
1343	strlcat(dst: boot_arg_scale_name, src: spec->name, n: boot_arg_scale_name_size);
1344	strlcat(dst: boot_arg_scale_name, src: suffix, n: boot_arg_name_size);
1345	strlcat(dst: boot_arg_scale_name, src: "-scale", n: boot_arg_scale_name_size);
1346
1347
1348	/*
1349	* Determine base value from DT and boot-args.
1350	*/
1351
1352	DTEntry base, chosen;
1353
1354	if (SecureDTLookupEntry(NULL, pathName: "/machine-timeouts", foundEntry: &base) != kSuccess) {
1355	base = NULL;
1356	}
1357
1358	if (SecureDTLookupEntry(NULL, pathName: "/chosen/machine-timeouts", foundEntry: &chosen) != kSuccess) {
1359	chosen = NULL;
1360	}
1361
1362	uint64_t timeout = spec->default_value;
1363	bool found = false;
1364
1365	uint64_t const *data = NULL;
1366	unsigned int data_size = sizeof(*data);
1367
1368	/ First look in /machine-timeouts/<name> /
1369	if (base != NULL && SecureDTGetProperty(entry: base, propertyName: dt_name, propertyValue: (const void **)&data, propertySize: &data_size) == kSuccess) {
1370	if (data_size != sizeof(*data)) {
1371	panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/%s", __func__, data_size, dt_name);
1372	}
1373
1374	timeout = *data;
1375	found = true;
1376	}
1377
1378	/ A value in /chosen/machine-timeouts/<name> overrides /
1379	if (chosen != NULL && SecureDTGetProperty(entry: chosen, propertyName: dt_name, propertyValue: (const void **)&data, propertySize: &data_size) == kSuccess) {
1380	if (data_size != sizeof(*data)) {
1381	panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/%s", __func__, data_size, dt_name);
1382	}
1383
1384	timeout = *data;
1385	found = true;
1386	}
1387
1388	/ A boot-arg ml-timeout-<name> overrides /
1389	uint64_t boot_arg = `0`;
1390
1391	if (PE_parse_boot_argn(arg_string: boot_arg_name, arg_ptr: &boot_arg, max_arg: sizeof(boot_arg))) {
1392	timeout = boot_arg;
1393	found = true;
1394	}
1395
1396
1397	/*
1398	* Determine scale value from DT and boot-args.
1399	*/
1400
1401	uint64_t scale = `1`;
1402	uint32_t const *scale_data;
1403	unsigned int scale_size = sizeof(scale_data);
1404
1405	/ If there is a scale factor /machine-timeouts/<name>-scale, apply it. /
1406	if (base != NULL && SecureDTGetProperty(entry: base, propertyName: scale_name, propertyValue: (const void **)&scale_data, propertySize: &scale_size) == kSuccess) {
1407	if (scale_size != sizeof(*scale_data)) {
1408	panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/%s-scale", __func__, scale_size, dt_name);
1409	}
1410
1411	scale = *scale_data;
1412	}
1413
1414	/ If there is a scale factor /chosen/machine-timeouts/<name>-scale, use that. /
1415	if (chosen != NULL && SecureDTGetProperty(entry: chosen, propertyName: scale_name, propertyValue: (const void **)&scale_data, propertySize: &scale_size) == kSuccess) {
1416	if (scale_size != sizeof(*scale_data)) {
1417	panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/%s-scale", __func__,
1418	scale_size, dt_name);
1419	}
1420
1421	scale = *scale_data;
1422	}
1423
1424	/ Finally, a boot-arg ml-timeout-<name>-scale takes precedence. /
1425	if (PE_parse_boot_argn(arg_string: boot_arg_scale_name, arg_ptr: &boot_arg, max_arg: sizeof(boot_arg))) {
1426	scale = boot_arg;
1427	}
1428
1429	static bool global_scale_set;
1430	static uint64_t global_scale;
1431
1432	if (!global_scale_set) {
1433	/ Apply /machine-timeouts/global-scale if present /
1434	if (SecureDTGetProperty(entry: base, propertyName: "global-scale", propertyValue: (const void **)&scale_data, propertySize: &scale_size) == kSuccess) {
1435	if (scale_size != sizeof(*scale_data)) {
1436	panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/global-scale", __func__,
1437	scale_size);
1438	}
1439
1440	global_scale = *scale_data;
1441	global_scale_set = true;
1442	}
1443
1444	/ Use /chosen/machine-timeouts/global-scale if present /
1445	if (SecureDTGetProperty(entry: chosen, propertyName: "global-scale", propertyValue: (const void **)&scale_data, propertySize: &scale_size) == kSuccess) {
1446	if (scale_size != sizeof(*scale_data)) {
1447	panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/global-scale", __func__,
1448	scale_size);
1449	}
1450
1451	global_scale = *scale_data;
1452	global_scale_set = true;
1453	}
1454
1455	/ Finally, the boot-arg ml-timeout-global-scale takes precedence. /
1456	if (PE_parse_boot_argn(arg_string: "ml-timeout-global-scale", arg_ptr: &boot_arg, max_arg: sizeof(boot_arg))) {
1457	global_scale = boot_arg;
1458	global_scale_set = true;
1459	}
1460	}
1461
1462	if (global_scale_set) {
1463	scale *= global_scale;
1464	}
1465
1466	/ Compute the final timeout, and done. /
1467	if (found && timeout > `0`) {
1468	/ Only apply inherent unit scale if the value came in*
1469	* externally. */
1470
1471	if (spec->unit_scale == MACHINE_TIMEOUT_UNIT_TIMEBASE) {
1472	uint64_t nanoseconds = timeout / `1000`;
1473	nanoseconds_to_absolutetime(nanoseconds, result: &timeout);
1474	} else {
1475	timeout /= spec->unit_scale;
1476	}
1477
1478	if (timeout == `0`) {
1479	/ Ensure unit scaling did not disable the timeout. /
1480	timeout = `1`;
1481	}
1482	}
1483
1484	if (os_mul_overflow(timeout, scale, &timeout)) {
1485	timeout = UINT64_MAX; // clamp
1486	}
1487
1488	os_atomic_store_wide((uint64_t*)spec->ptr, timeout, relaxed);
1489	}
1490
/*
 * Initialize the machine timeout described by `spec` using an empty name
 * suffix; see machine_timeout_init_with_suffix().
 */
void
machine_timeout_init(const struct machine_timeout_spec *spec)
{
	machine_timeout_init_with_suffix(spec, "");
}
1496
#if DEVELOPMENT || DEBUG
/*
 * Late timeout (re-)initialization, at the end of bsd_init()
 *
 * With SCHED_HYGIENE_DEBUG, re-runs machine timeout initialization for
 * interrupt_masked_timeout and sched_preemption_disable_threshold_mt using
 * the "-b" name suffix, re-initializes the I/O timeouts, and resets the
 * preemption-disable duration statistics.  Without SCHED_HYGIENE_DEBUG the
 * function does nothing.
 */
void
machine_timeout_bsd_init(void)
{
	char const * const __unused mt_suffix = "-b";
#if SCHED_HYGIENE_DEBUG
	machine_timeout_init_with_suffix(MACHINE_TIMEOUT_SPEC_REF(interrupt_masked_timeout), mt_suffix);
	machine_timeout_init_with_suffix(MACHINE_TIMEOUT_SPEC_REF(sched_preemption_disable_threshold_mt), mt_suffix);

	/*
	 * The io timeouts can inherit from interrupt_masked_timeout.
	 * Re-initialize, as interrupt_masked_timeout may have changed.
	 */
	ml_io_init_timeouts();

	extern void preemption_disable_reset_max_durations(void);
	/*
	 * Reset the preemption disable stats, so that they are not
	 * polluted by long early boot code.
	 */
	preemption_disable_reset_max_durations();
#endif /* SCHED_HYGIENE_DEBUG */
}
#endif /* DEVELOPMENT || DEBUG */
1524
1525	#if ML_IO_TIMEOUTS_ENABLED && CONFIG_XNUPOST
1526	#include <tests/xnupost.h>
1527
1528	extern kern_return_t ml_io_timeout_test(void);
1529
1530	static inline void
1531	ml_io_timeout_test_get_timeouts(uintptr_t vaddr, uint64_t read_timeout, uint64_t write_timeout)
1532	{
1533	*read_timeout = `0`;
1534	*write_timeout = `0`;
1535
1536	boolean_t istate = ml_set_interrupts_enabled(FALSE);
1537	override_io_timeouts(vaddr, `0`, read_timeout, write_timeout);
1538	ml_set_interrupts_enabled(istate);
1539	}
1540
1541	kern_return_t
1542	ml_io_timeout_test(void)
1543	{
1544	const size_t SIZE = `16`;
1545	uintptr_t iovaddr_base1 = (uintptr_t)&ml_io_timeout_test;
1546	uintptr_t iovaddr_base2 = iovaddr_base1 + SIZE;
1547	uintptr_t vaddr1 = iovaddr_base1 + SIZE / `2`;
1548	uintptr_t vaddr2 = iovaddr_base2 + SIZE / `2`;
1549
1550	const uint64_t READ_TIMEOUT1_US = `50000`, WRITE_TIMEOUT1_US = `50001`;
1551	const uint64_t READ_TIMEOUT2_US = `50002`, WRITE_TIMEOUT2_US = `50003`;
1552	uint64_t read_timeout1_abs, write_timeout1_abs;
1553	uint64_t read_timeout2_abs, write_timeout2_abs;
1554	nanoseconds_to_absolutetime(NSEC_PER_USEC * READ_TIMEOUT1_US, &read_timeout1_abs);
1555	nanoseconds_to_absolutetime(NSEC_PER_USEC * WRITE_TIMEOUT1_US, &write_timeout1_abs);
1556	nanoseconds_to_absolutetime(NSEC_PER_USEC * READ_TIMEOUT2_US, &read_timeout2_abs);
1557	nanoseconds_to_absolutetime(NSEC_PER_USEC * WRITE_TIMEOUT2_US, &write_timeout2_abs);
1558
1559	int err = ml_io_increase_timeouts(iovaddr_base1, `0`, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1560	T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for empty region");
1561
1562	err = ml_io_increase_timeouts(iovaddr_base1, `4097`, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1563	T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for region > 4096 bytes");
1564
1565	err = ml_io_increase_timeouts(UINTPTR_MAX, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1566	T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for overflowed region");
1567
1568	err = ml_io_increase_timeouts(iovaddr_base1, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1569	T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Setting timeout for first VA region should succeed");
1570
1571	err = ml_io_increase_timeouts(iovaddr_base2, SIZE, READ_TIMEOUT2_US, WRITE_TIMEOUT2_US);
1572	T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Setting timeout for second VA region should succeed");
1573
1574	err = ml_io_increase_timeouts(iovaddr_base1, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1575	T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for same region twice");
1576
1577	err = ml_io_increase_timeouts(vaddr1, (uint32_t)(vaddr2 - vaddr1), READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1578	T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for overlapping regions");
1579
1580	uint64_t read_timeout, write_timeout;
1581	ml_io_timeout_test_get_timeouts(vaddr1, &read_timeout, &write_timeout);
1582	T_EXPECT_EQ_ULLONG(read_timeout, read_timeout1_abs, "Read timeout for first region");
1583	T_EXPECT_EQ_ULLONG(write_timeout, write_timeout1_abs, "Write timeout for first region");
1584
1585	ml_io_timeout_test_get_timeouts(vaddr2, &read_timeout, &write_timeout);
1586	T_EXPECT_EQ_ULLONG(read_timeout, read_timeout2_abs, "Read timeout for first region");
1587	T_EXPECT_EQ_ULLONG(write_timeout, write_timeout2_abs, "Write timeout for first region");
1588
1589	ml_io_timeout_test_get_timeouts(iovaddr_base2 + SIZE, &read_timeout, &write_timeout);
1590	T_EXPECT_EQ_ULLONG(read_timeout, `0`, "Read timeout without override");
1591	T_EXPECT_EQ_ULLONG(write_timeout, `0`, "Write timeout without override");
1592
1593	err = ml_io_reset_timeouts(iovaddr_base1 + `1`, SIZE - `1`);
1594	T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for subregion");
1595
1596	err = ml_io_reset_timeouts(iovaddr_base2 + SIZE, SIZE);
1597	T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for non-existent region");
1598
1599	err = ml_io_reset_timeouts(iovaddr_base1, SIZE);
1600	T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Resetting timeout for first VA region should succeed");
1601
1602	ml_io_timeout_test_get_timeouts(vaddr1, &read_timeout, &write_timeout);
1603	T_EXPECT_EQ_ULLONG(read_timeout, `0`, "Read timeout for reset region");
1604	T_EXPECT_EQ_ULLONG(write_timeout, `0`, "Write timeout for reset region");
1605
1606	err = ml_io_reset_timeouts(iovaddr_base1, SIZE);
1607	T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for same region twice");
1608
1609	err = ml_io_reset_timeouts(iovaddr_base2, SIZE);
1610	T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Resetting timeout for second VA region should succeed");
1611
1612	return KERN_SUCCESS;
1613	}
1614	#endif /* CONFIG_XNUPOST */
1615

Browse the source code of xnu/osfmk/kern/machine.c