1 | /* |
2 | * Copyright (c) 2007-2017 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | |
29 | #include <arm64/machine_machdep.h> |
30 | #include <arm64/proc_reg.h> |
31 | #include <arm/machine_cpu.h> |
32 | #include <arm/cpu_internal.h> |
33 | #include <arm/cpuid.h> |
34 | #include <arm/cpu_data.h> |
35 | #include <arm/cpu_data_internal.h> |
36 | #include <arm/caches_internal.h> |
37 | #include <arm/misc_protos.h> |
38 | #include <arm/machdep_call.h> |
39 | #include <arm/machine_routines.h> |
40 | #include <arm/rtclock.h> |
41 | #include <arm/cpuid_internal.h> |
42 | #include <arm/cpu_capabilities.h> |
43 | #include <console/serial_protos.h> |
44 | #include <kern/machine.h> |
45 | #include <kern/misc_protos.h> |
46 | #include <prng/random.h> |
47 | #include <kern/startup.h> |
48 | #include <kern/thread.h> |
49 | #include <kern/timer_queue.h> |
50 | #include <mach/machine.h> |
51 | #include <machine/atomic.h> |
52 | #include <machine/config.h> |
53 | #include <vm/pmap.h> |
54 | #include <vm/vm_page.h> |
55 | #include <vm/vm_shared_region.h> |
56 | #include <vm/vm_map.h> |
57 | #include <sys/codesign.h> |
58 | #include <sys/kdebug.h> |
59 | #include <kern/coalition.h> |
60 | #include <pexpert/device_tree.h> |
61 | |
62 | #include <IOKit/IOPlatformExpert.h> |
63 | #if HIBERNATION |
64 | #include <IOKit/IOHibernatePrivate.h> |
65 | #endif /* HIBERNATION */ |
66 | |
67 | #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) |
68 | #include <arm64/amcc_rorgn.h> |
69 | #endif |
70 | |
71 | |
72 | #if CONFIG_SPTM |
73 | #include <arm64/sptm/sptm.h> |
74 | #endif /* CONFIG_SPTM */ |
75 | |
76 | #include <libkern/section_keywords.h> |
77 | |
78 | /** |
79 | * On supported hardware, debuggable builds make the HID bits read-only |
80 | * without locking them. This lets people manually modify HID bits while |
81 | * debugging, since they can use a debugging tool to first reset the HID |
 * bits back to read/write. However, it will still catch xnu changes that
83 | * accidentally write to HID bits after they've been made read-only. |
84 | */ |
SECURITY_READ_ONLY_LATE(bool) skip_spr_lockdown_glb = false;
86 | |
87 | /* |
88 | * On some SoCs, PIO lockdown is applied in assembly in early boot by |
89 | * secondary CPUs. |
90 | * Since the cluster_pio_ro_ctl value is dynamic, it is stored here by the |
91 | * primary CPU so that it doesn't have to be computed each time by the |
92 | * startup code. |
93 | */ |
94 | SECURITY_READ_ONLY_LATE(uint64_t) cluster_pio_ro_ctl_mask_glb = 0; |
95 | |
96 | #if CONFIG_CPU_COUNTERS |
97 | #include <kern/kpc.h> |
98 | #endif /* CONFIG_CPU_COUNTERS */ |
99 | |
100 | #define MPIDR_CPU_ID(mpidr_el1_val) (((mpidr_el1_val) & MPIDR_AFF0_MASK) >> MPIDR_AFF0_SHIFT) |
101 | #define MPIDR_CLUSTER_ID(mpidr_el1_val) (((mpidr_el1_val) & MPIDR_AFF1_MASK) >> MPIDR_AFF1_SHIFT) |
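
/*
 * Illustrative sketch (not part of the code below): decoding a raw MPIDR_EL1
 * value with the macros above; the local variable names are hypothetical.
 *
 *	uint64_t mpidr;
 *	MRS(mpidr, "MPIDR_EL1");
 *	unsigned int core_in_cluster = MPIDR_CPU_ID(mpidr);     // AFF0 field
 *	unsigned int cluster         = MPIDR_CLUSTER_ID(mpidr); // AFF1 field
 */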
102 | |
103 | #if HAS_CLUSTER |
104 | static uint8_t cluster_initialized = 0; |
105 | #endif |
106 | |
MACHINE_TIMEOUT_DEV_WRITEABLE(LockTimeOut, "lock", 6e6 /* 0.25s */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
machine_timeout_t LockTimeOutUsec; // computed in ml_init_lock_timeout

MACHINE_TIMEOUT_DEV_WRITEABLE(TLockTimeOut, "ticket-lock", 3e6 /* 0.125s */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);

MACHINE_TIMEOUT_DEV_WRITEABLE(MutexSpin, "mutex-spin", 240 /* 10us */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
113 | |
114 | uint64_t low_MutexSpin; |
115 | int64_t high_MutexSpin; |
116 | |
117 | |
118 | |
119 | static uint64_t ml_wfe_hint_max_interval; |
120 | #define MAX_WFE_HINT_INTERVAL_US (500ULL) |
121 | |
122 | /* Must be less than cpu_idle_latency to ensure ml_delay_should_spin is true */ |
TUNABLE(uint32_t, yield_delay_us, "yield_delay_us", 0);
124 | |
125 | extern vm_offset_t segLOWEST; |
126 | extern vm_offset_t segLOWESTTEXT; |
127 | extern vm_offset_t segLASTB; |
128 | extern unsigned long segSizeLAST; |
129 | |
130 | /* ARM64 specific bounds; used to test for presence in the kernelcache. */ |
131 | extern vm_offset_t vm_kernelcache_base; |
132 | extern vm_offset_t vm_kernelcache_top; |
133 | |
134 | /* Location of the physmap / physical aperture */ |
135 | extern uint64_t physmap_base; |
136 | |
137 | #if defined(CONFIG_SPTM) |
138 | extern const arm_physrange_t *arm_vm_kernelcache_ranges; |
139 | extern int arm_vm_kernelcache_numranges; |
140 | #else /* defined(CONFIG_SPTM) */ |
141 | extern vm_offset_t arm_vm_kernelcache_phys_start; |
142 | extern vm_offset_t arm_vm_kernelcache_phys_end; |
143 | #endif /* defined(CONFIG_SPTM) */ |
144 | |
145 | #if defined(HAS_IPI) |
146 | unsigned int gFastIPI = 1; |
147 | #define kDeferredIPITimerDefault (64 * NSEC_PER_USEC) /* in nanoseconds */ |
static TUNABLE_WRITEABLE(uint64_t, deferred_ipi_timer_ns, "fastipitimeout",
149 | kDeferredIPITimerDefault); |
150 | #endif /* defined(HAS_IPI) */ |
151 | |
152 | thread_t Idle_context(void); |
153 | |
154 | SECURITY_READ_ONLY_LATE(bool) cpu_config_correct = true; |
155 | |
156 | SECURITY_READ_ONLY_LATE(static ml_topology_cpu_t) topology_cpu_array[MAX_CPUS]; |
157 | SECURITY_READ_ONLY_LATE(static ml_topology_cluster_t) topology_cluster_array[MAX_CPU_CLUSTERS]; |
158 | SECURITY_READ_ONLY_LATE(static ml_topology_info_t) topology_info = { |
159 | .version = CPU_TOPOLOGY_VERSION, |
160 | .cpus = topology_cpu_array, |
161 | .clusters = topology_cluster_array, |
162 | }; |
163 | |
164 | _Atomic unsigned int cluster_type_num_active_cpus[MAX_CPU_TYPES]; |
165 | |
166 | /** |
167 | * Represents the offset of each cluster within a hypothetical array of MAX_CPUS |
168 | * entries of an arbitrary data type. This is intended for use by specialized consumers |
169 | * that must quickly access per-CPU data using only the physical CPU ID (MPIDR_EL1), |
170 | * as follows: |
171 | * hypothetical_array[cluster_offsets[AFF1] + AFF0] |
172 | * Most consumers should instead use general-purpose facilities such as PERCPU or |
173 | * ml_get_cpu_number(). |
174 | */ |
175 | SECURITY_READ_ONLY_LATE(int64_t) cluster_offsets[MAX_CPU_CLUSTER_PHY_ID + 1]; |
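
/*
 * Illustrative sketch of the lookup described above, assuming a hypothetical
 * per-CPU array sized MAX_CPUS and a raw MPIDR_EL1 value:
 *
 *	uint64_t mpidr;
 *	MRS(mpidr, "MPIDR_EL1");
 *	entry = &hypothetical_array[cluster_offsets[MPIDR_CLUSTER_ID(mpidr)] +
 *	    MPIDR_CPU_ID(mpidr)];
 *
 * General-purpose code should still prefer PERCPU or ml_get_cpu_number().
 */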
176 | |
177 | SECURITY_READ_ONLY_LATE(static uint32_t) arm64_eventi = UINT32_MAX; |
178 | |
179 | extern uint32_t lockdown_done; |
180 | |
181 | /** |
182 | * Represents regions of virtual address space that should be reserved |
183 | * (pre-mapped) in each user address space. |
184 | */ |
185 | static const struct vm_reserved_region vm_reserved_regions[] = { |
186 | { |
187 | .vmrr_name = "GPU Carveout" , |
188 | .vmrr_addr = MACH_VM_MIN_GPU_CARVEOUT_ADDRESS, |
189 | .vmrr_size = (vm_map_size_t)(MACH_VM_MAX_GPU_CARVEOUT_ADDRESS - MACH_VM_MIN_GPU_CARVEOUT_ADDRESS) |
190 | }, |
191 | /* |
192 | * Reserve the virtual memory space representing the commpage nesting region |
193 | * to prevent user processes from allocating memory within it. The actual |
194 | * page table entries for the commpage are inserted by vm_commpage_enter(). |
195 | * This vm_map_enter() just prevents userspace from allocating/deallocating |
196 | * anything within the entire commpage nested region. |
197 | */ |
198 | { |
199 | .vmrr_name = "commpage nesting" , |
200 | .vmrr_addr = _COMM_PAGE64_NESTING_START, |
201 | .vmrr_size = _COMM_PAGE64_NESTING_SIZE |
202 | } |
203 | }; |
204 | |
205 | uint32_t get_arm_cpu_version(void); |
206 | |
207 | #if defined(HAS_IPI) |
208 | static inline void |
209 | ml_cpu_signal_type(unsigned int cpu_mpidr, uint32_t type) |
210 | { |
211 | #if HAS_CLUSTER |
212 | uint64_t local_mpidr; |
213 | /* NOTE: this logic expects that we are called in a non-preemptible |
214 | * context, or at least one in which the calling thread is bound |
215 | * to a single CPU. Otherwise we may migrate between choosing which |
216 | * IPI mechanism to use and issuing the IPI. */ |
	MRS(local_mpidr, "MPIDR_EL1");
	if (MPIDR_CLUSTER_ID(local_mpidr) == MPIDR_CLUSTER_ID(cpu_mpidr)) {
		uint64_t x = type | MPIDR_CPU_ID(cpu_mpidr);
		MSR("S3_5_C15_C0_0", x);
	} else {
#define IPI_RR_TARGET_CLUSTER_SHIFT 16
		uint64_t x = type | (MPIDR_CLUSTER_ID(cpu_mpidr) << IPI_RR_TARGET_CLUSTER_SHIFT) | MPIDR_CPU_ID(cpu_mpidr);
		MSR("S3_5_C15_C0_1", x);
	}
#else
	uint64_t x = type | MPIDR_CPU_ID(cpu_mpidr);
	MSR("S3_5_C15_C0_1", x);
229 | #endif |
230 | /* The recommended local/global IPI sequence is: |
231 | * DSB <sys> (This ensures visibility of e.g. older stores to the |
232 | * pending CPU signals bit vector in DRAM prior to IPI reception, |
233 | * and is present in cpu_signal_internal()) |
234 | * MSR S3_5_C15_C0_1, Xt |
235 | * ISB |
236 | */ |
237 | __builtin_arm_isb(ISB_SY); |
238 | } |
239 | #endif |
240 | |
241 | #if !defined(HAS_IPI) |
242 | __dead2 |
243 | #endif |
244 | void |
245 | ml_cpu_signal(unsigned int cpu_mpidr __unused) |
246 | { |
247 | #if defined(HAS_IPI) |
248 | ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_IMMEDIATE); |
249 | #else |
250 | panic("Platform does not support ACC Fast IPI" ); |
251 | #endif |
252 | } |
253 | |
254 | #if !defined(HAS_IPI) |
255 | __dead2 |
256 | #endif |
257 | void |
258 | ml_cpu_signal_deferred_adjust_timer(uint64_t nanosecs) |
259 | { |
260 | #if defined(HAS_IPI) |
261 | /* adjust IPI_CR timer countdown value for deferred IPI |
262 | * accepts input in nanosecs, convert to absolutetime (REFCLK ticks), |
263 | * clamp maximum REFCLK ticks to 0xFFFF (16 bit field) |
264 | * |
265 | * global register, should only require a single write to update all |
266 | * CPU cores: from Skye ACC user spec section 5.7.3.3 |
267 | * |
268 | * IPICR is a global register but there are two copies in ACC: one at pBLK and one at eBLK. |
269 | * IPICR write SPR token also traverses both pCPM and eCPM rings and updates both copies. |
270 | */ |
271 | uint64_t abstime; |
272 | |
273 | nanoseconds_to_absolutetime(nanosecs, &abstime); |
274 | |
275 | abstime = MIN(abstime, 0xFFFF); |
276 | |
277 | /* update deferred_ipi_timer_ns with the new clamped value */ |
278 | absolutetime_to_nanoseconds(abstime, &deferred_ipi_timer_ns); |
279 | |
280 | MSR("S3_5_C15_C3_1" , abstime); |
281 | #else |
282 | (void)nanosecs; |
283 | panic("Platform does not support ACC Fast IPI" ); |
284 | #endif |
285 | } |
286 | |
287 | uint64_t |
288 | ml_cpu_signal_deferred_get_timer() |
289 | { |
290 | #if defined(HAS_IPI) |
291 | return deferred_ipi_timer_ns; |
292 | #else |
293 | return 0; |
294 | #endif |
295 | } |
296 | |
297 | #if !defined(HAS_IPI) |
298 | __dead2 |
299 | #endif |
300 | void |
301 | ml_cpu_signal_deferred(unsigned int cpu_mpidr __unused) |
302 | { |
303 | #if defined(HAS_IPI) |
304 | ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_DEFERRED); |
305 | #else |
306 | panic("Platform does not support ACC Fast IPI deferral" ); |
307 | #endif |
308 | } |
309 | |
310 | #if !defined(HAS_IPI) |
311 | __dead2 |
312 | #endif |
313 | void |
314 | ml_cpu_signal_retract(unsigned int cpu_mpidr __unused) |
315 | { |
316 | #if defined(HAS_IPI) |
317 | ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_RETRACT); |
318 | #else |
319 | panic("Platform does not support ACC Fast IPI retraction" ); |
320 | #endif |
321 | } |
322 | |
323 | extern uint32_t idle_proximate_io_wfe_unmasked; |
324 | |
325 | #define CPUPM_IDLE_WFE 0x5310300 |
326 | static bool |
327 | wfe_process_recommendation(void) |
328 | { |
329 | bool ipending = false; |
330 | if (__probable(idle_proximate_io_wfe_unmasked == 1)) { |
331 | /* Check for an active perf. controller generated |
332 | * WFE recommendation for this cluster. |
333 | */ |
334 | cpu_data_t *cdp = getCpuDatap(); |
335 | uint32_t cid = cdp->cpu_cluster_id; |
336 | uint64_t wfe_ttd = 0; |
337 | uint64_t wfe_deadline = 0; |
338 | |
		if ((wfe_ttd = ml_cluster_wfe_timeout(cid)) != 0) {
340 | wfe_deadline = mach_absolute_time() + wfe_ttd; |
341 | } |
342 | |
343 | if (wfe_deadline != 0) { |
344 | /* Poll issuing event-bounded WFEs until an interrupt |
345 | * arrives or the WFE recommendation expires |
346 | */ |
347 | #if DEVELOPMENT || DEBUG |
348 | uint64_t wc = cdp->wfe_count; |
349 | KDBG(CPUPM_IDLE_WFE | DBG_FUNC_START, ipending, wc, wfe_ttd, cdp->cpu_stat.irq_ex_cnt_wake); |
350 | #endif |
351 | /* Issue WFE until the recommendation expires, |
352 | * with IRQs unmasked. |
353 | */ |
354 | ipending = wfe_to_deadline_or_interrupt(cid, wfe_deadline, cdp, true, true); |
355 | #if DEVELOPMENT || DEBUG |
356 | KDBG(CPUPM_IDLE_WFE | DBG_FUNC_END, ipending, cdp->wfe_count - wc, wfe_deadline, cdp->cpu_stat.irq_ex_cnt_wake); |
357 | #endif |
358 | } |
359 | } |
360 | return ipending; |
361 | } |
362 | |
363 | void |
364 | machine_idle(void) |
365 | { |
366 | /* Interrupts are expected to be masked on entry or re-entry via |
367 | * Idle_load_context() |
368 | */ |
369 | assert((__builtin_arm_rsr("DAIF" ) & (DAIF_IRQF | DAIF_FIQF)) == (DAIF_IRQF | DAIF_FIQF)); |
370 | /* Check for, and act on, a WFE recommendation. |
371 | * Bypasses context spill/fill for a minor perf. increment. |
372 | * May unmask and restore IRQ+FIQ mask. |
373 | */ |
374 | if (wfe_process_recommendation() == false) { |
375 | /* If WFE recommendation absent, or WFE deadline |
376 | * arrived with no interrupt pending/processed, |
377 | * fall back to WFI. |
378 | */ |
379 | Idle_context(); |
380 | } |
381 | __builtin_arm_wsr("DAIFClr" , (DAIFSC_IRQF | DAIFSC_FIQF)); |
382 | } |
383 | |
384 | void |
385 | OSSynchronizeIO(void) |
386 | { |
387 | __builtin_arm_dsb(DSB_SY); |
388 | } |
389 | |
390 | uint64_t |
391 | get_aux_control(void) |
392 | { |
393 | uint64_t value; |
394 | |
395 | MRS(value, "ACTLR_EL1" ); |
396 | return value; |
397 | } |
398 | |
399 | uint64_t |
400 | get_mmu_control(void) |
401 | { |
402 | uint64_t value; |
403 | |
404 | MRS(value, "SCTLR_EL1" ); |
405 | return value; |
406 | } |
407 | |
408 | uint64_t |
409 | get_tcr(void) |
410 | { |
411 | uint64_t value; |
412 | |
413 | MRS(value, "TCR_EL1" ); |
414 | return value; |
415 | } |
416 | |
417 | boolean_t |
418 | ml_get_interrupts_enabled(void) |
419 | { |
420 | uint64_t value; |
421 | |
422 | MRS(value, "DAIF" ); |
423 | if (value & DAIF_IRQF) { |
424 | return FALSE; |
425 | } |
426 | return TRUE; |
427 | } |
428 | |
429 | pmap_paddr_t |
430 | get_mmu_ttb(void) |
431 | { |
432 | pmap_paddr_t value; |
433 | |
434 | MRS(value, "TTBR0_EL1" ); |
435 | return value; |
436 | } |
437 | |
438 | uint32_t |
439 | get_arm_cpu_version(void) |
440 | { |
441 | uint32_t value = machine_read_midr(); |
442 | |
443 | /* Compose the register values into 8 bits; variant[7:4], revision[3:0]. */ |
444 | return ((value & MIDR_EL1_REV_MASK) >> MIDR_EL1_REV_SHIFT) | ((value & MIDR_EL1_VAR_MASK) >> (MIDR_EL1_VAR_SHIFT - 4)); |
445 | } |
446 | |
447 | bool |
448 | ml_feature_supported(uint32_t feature_bit) |
449 | { |
450 | uint64_t aidr_el1_value = 0; |
451 | |
452 | MRS(aidr_el1_value, "AIDR_EL1" ); |
453 | |
454 | #ifdef APPLEAVALANCHE |
455 | #endif // APPLEAVALANCHE |
456 | |
457 | return aidr_el1_value & feature_bit; |
458 | } |
459 | |
460 | /* |
461 | * user_cont_hwclock_allowed() |
462 | * |
463 | * Indicates whether we allow EL0 to read the virtual timebase (CNTVCT_EL0) |
464 | * as a continuous time source (e.g. from mach_continuous_time) |
465 | */ |
466 | boolean_t |
467 | user_cont_hwclock_allowed(void) |
468 | { |
469 | #if HAS_CONTINUOUS_HWCLOCK |
470 | return TRUE; |
471 | #else |
472 | return FALSE; |
473 | #endif |
474 | } |
475 | |
476 | /* |
477 | * user_timebase_type() |
478 | * |
479 | * Indicates type of EL0 virtual timebase read (CNTVCT_EL0). |
480 | * |
481 | * USER_TIMEBASE_NONE: EL0 has no access to timebase register |
482 | * USER_TIMEBASE_SPEC: EL0 has access to speculative timebase reads (CNTVCT_EL0) |
 * USER_TIMEBASE_NOSPEC: EL0 has access to non-speculative timebase reads (CNTVCTSS_EL0)
484 | * |
485 | */ |
486 | |
487 | uint8_t |
488 | user_timebase_type(void) |
489 | { |
490 | #if HAS_ACNTVCT |
491 | return USER_TIMEBASE_NOSPEC_APPLE; |
492 | #elif __ARM_ARCH_8_6__ |
493 | return USER_TIMEBASE_NOSPEC; |
494 | #else |
495 | return USER_TIMEBASE_SPEC; |
496 | #endif |
497 | } |
498 | |
499 | void |
500 | machine_startup(__unused boot_args * args) |
501 | { |
502 | #if defined(HAS_IPI) && (DEVELOPMENT || DEBUG) |
503 | if (!PE_parse_boot_argn("fastipi" , &gFastIPI, sizeof(gFastIPI))) { |
504 | gFastIPI = 1; |
505 | } |
506 | #endif /* defined(HAS_IPI) && (DEVELOPMENT || DEBUG)*/ |
507 | |
508 | |
509 | machine_conf(); |
510 | |
511 | |
512 | /* |
513 | * Kick off the kernel bootstrap. |
514 | */ |
515 | kernel_bootstrap(); |
516 | /* NOTREACHED */ |
517 | } |
518 | |
519 | typedef void (*invalidate_fn_t)(void); |
520 | |
521 | static SECURITY_READ_ONLY_LATE(invalidate_fn_t) invalidate_hmac_function = NULL; |
522 | |
523 | void set_invalidate_hmac_function(invalidate_fn_t fn); |
524 | |
525 | void |
526 | set_invalidate_hmac_function(invalidate_fn_t fn) |
527 | { |
528 | if (NULL != invalidate_hmac_function) { |
529 | panic("Invalidate HMAC function already set" ); |
530 | } |
531 | |
532 | invalidate_hmac_function = fn; |
533 | } |
534 | |
535 | void |
536 | machine_lockdown(void) |
537 | { |
538 | |
539 | #if CONFIG_SPTM |
540 | /** |
541 | * On devices that make use of the SPTM, the SPTM is responsible for |
542 | * managing system register locks. Due to this, we skip the call to |
543 | * spr_lockdown() below. |
544 | */ |
545 | #else |
546 | #endif |
547 | |
	arm_vm_prot_finalize(PE_state.bootArgs);
549 | |
550 | #if CONFIG_KERNEL_INTEGRITY |
551 | #if KERNEL_INTEGRITY_WT |
552 | /* Watchtower |
553 | * |
554 | * Notify the monitor about the completion of early kernel bootstrap. |
555 | * From this point forward it will enforce the integrity of kernel text, |
556 | * rodata and page tables. |
557 | */ |
558 | |
559 | #ifdef MONITOR |
560 | monitor_call(MONITOR_LOCKDOWN, 0, 0, 0); |
561 | #endif |
562 | #endif /* KERNEL_INTEGRITY_WT */ |
563 | |
564 | #if CONFIG_SPTM |
565 | extern void pmap_prepare_commpages(void); |
566 | pmap_prepare_commpages(); |
567 | |
568 | /** |
569 | * sptm_lockdown_xnu() disables preemption like all SPTM calls, but may take |
570 | * a fair amount of time as it involves retyping a large number of pages. |
571 | * This preemption latency is not really a concern since we're still fairly |
572 | * early in the boot process, so just explicitly disable preemption before |
573 | * invoking the SPTM and abandon preemption latency measurements before |
574 | * re-enabling it. |
575 | */ |
576 | disable_preemption(); |
577 | /* Signal the SPTM that XNU is ready for RO memory to actually become read-only */ |
578 | sptm_lockdown_xnu(); |
579 | #if SCHED_HYGIENE_DEBUG |
580 | abandon_preemption_disable_measurement(); |
581 | #endif /* SCHED_HYGIENE_DEBUG */ |
582 | enable_preemption(); |
583 | #else |
584 | #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) |
585 | /* KTRR |
586 | * |
587 | * Lock physical KTRR region. KTRR region is read-only. Memory outside |
588 | * the region is not executable at EL1. |
589 | */ |
590 | |
591 | rorgn_lockdown(); |
592 | #endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ |
593 | #endif /* CONFIG_SPTM */ |
594 | |
595 | #if XNU_MONITOR |
596 | pmap_lockdown_ppl(); |
597 | #endif |
598 | |
599 | #endif /* CONFIG_KERNEL_INTEGRITY */ |
600 | |
601 | |
602 | if (NULL != invalidate_hmac_function) { |
603 | invalidate_hmac_function(); |
604 | } |
605 | |
606 | lockdown_done = 1; |
607 | } |
608 | |
609 | |
610 | char * |
611 | machine_boot_info( |
612 | __unused char *buf, |
613 | __unused vm_size_t size) |
614 | { |
615 | return PE_boot_args(); |
616 | } |
617 | |
618 | void |
619 | slave_machine_init(__unused void *param) |
620 | { |
621 | cpu_machine_init(); /* Initialize the processor */ |
622 | clock_init(); /* Init the clock */ |
623 | } |
624 | |
625 | /* |
626 | * Routine: machine_processor_shutdown |
627 | * Function: |
628 | */ |
629 | thread_t |
630 | machine_processor_shutdown( |
631 | __unused thread_t thread, |
632 | void (*doshutdown)(processor_t), |
633 | processor_t processor) |
634 | { |
635 | return Shutdown_context(doshutdown, processor); |
636 | } |
637 | |
638 | /* |
639 | * Routine: ml_init_lock_timeout |
640 | * Function: |
641 | */ |
642 | static void __startup_func |
643 | ml_init_lock_timeout(void) |
644 | { |
645 | /* |
646 | * This function is called after STARTUP_SUB_TIMEOUTS |
647 | * initialization, so using the "legacy" boot-args here overrides |
648 | * the ml-timeout-... configuration. (Given that these boot-args |
649 | * here are usually explicitly specified, this makes sense by |
	 * overriding ml-timeout-..., which may come from the device tree.)
651 | */ |
652 | |
653 | uint64_t lto_timeout_ns; |
654 | uint64_t lto_abstime; |
655 | uint32_t slto; |
656 | |
657 | if (PE_parse_boot_argn(arg_string: "slto_us" , arg_ptr: &slto, max_arg: sizeof(slto))) { |
658 | lto_timeout_ns = slto * NSEC_PER_USEC; |
659 | nanoseconds_to_absolutetime(nanoseconds: lto_timeout_ns, result: <o_abstime); |
660 | os_atomic_store(&LockTimeOut, lto_abstime, relaxed); |
661 | } else { |
662 | lto_abstime = os_atomic_load(&LockTimeOut, relaxed); |
663 | absolutetime_to_nanoseconds(abstime: lto_abstime, result: <o_timeout_ns); |
664 | } |
665 | |
666 | os_atomic_store(&LockTimeOutUsec, lto_timeout_ns / NSEC_PER_USEC, relaxed); |
667 | |
668 | if (PE_parse_boot_argn(arg_string: "tlto_us" , arg_ptr: &slto, max_arg: sizeof(slto))) { |
669 | nanoseconds_to_absolutetime(nanoseconds: slto * NSEC_PER_USEC, result: <o_abstime); |
670 | os_atomic_store(&TLockTimeOut, lto_abstime, relaxed); |
671 | } else if (lto_abstime != 0) { |
672 | os_atomic_store(&TLockTimeOut, lto_abstime >> 1, relaxed); |
673 | } // else take default from MACHINE_TIMEOUT. |
674 | |
675 | uint64_t mtxspin; |
676 | uint64_t mtx_abstime; |
677 | if (PE_parse_boot_argn(arg_string: "mtxspin" , arg_ptr: &mtxspin, max_arg: sizeof(mtxspin))) { |
678 | if (mtxspin > USEC_PER_SEC >> 4) { |
679 | mtxspin = USEC_PER_SEC >> 4; |
680 | } |
681 | nanoseconds_to_absolutetime(nanoseconds: mtxspin * NSEC_PER_USEC, result: &mtx_abstime); |
682 | os_atomic_store(&MutexSpin, mtx_abstime, relaxed); |
683 | } else { |
684 | mtx_abstime = os_atomic_load(&MutexSpin, relaxed); |
685 | } |
686 | |
687 | low_MutexSpin = os_atomic_load(&MutexSpin, relaxed); |
688 | /* |
689 | * high_MutexSpin should be initialized as low_MutexSpin * real_ncpus, but |
690 | * real_ncpus is not set at this time |
691 | * |
692 | * NOTE: active spinning is disabled in arm. It can be activated |
693 | * by setting high_MutexSpin through the sysctl. |
694 | */ |
695 | high_MutexSpin = low_MutexSpin; |
696 | |
697 | uint64_t maxwfeus = MAX_WFE_HINT_INTERVAL_US; |
698 | PE_parse_boot_argn(arg_string: "max_wfe_us" , arg_ptr: &maxwfeus, max_arg: sizeof(maxwfeus)); |
699 | nanoseconds_to_absolutetime(nanoseconds: maxwfeus * NSEC_PER_USEC, result: &ml_wfe_hint_max_interval); |
700 | } |
701 | STARTUP(TIMEOUTS, STARTUP_RANK_MIDDLE, ml_init_lock_timeout); |
702 | |
703 | |
704 | /* |
705 | * This is called when all of the ml_processor_info_t structures have been |
706 | * initialized and all the processors have been started through processor_start(). |
707 | * |
708 | * Required by the scheduler subsystem. |
709 | */ |
710 | void |
711 | ml_cpu_init_completed(void) |
712 | { |
713 | if (SCHED(cpu_init_completed) != NULL) { |
714 | SCHED(cpu_init_completed)(); |
715 | } |
716 | } |
717 | |
718 | /* |
719 | * These are called from the machine-independent routine cpu_up() |
720 | * to perform machine-dependent info updates. |
721 | * |
722 | * The update to CPU counts needs to be separate from other actions |
723 | * because we don't update the counts when CLPC causes temporary |
724 | * cluster powerdown events, as these must be transparent to the user. |
725 | */ |
726 | void |
727 | ml_cpu_up(void) |
728 | { |
729 | } |
730 | |
731 | void |
732 | ml_cpu_up_update_counts(int cpu_id) |
733 | { |
734 | ml_topology_cpu_t *cpu = &ml_get_topology_info()->cpus[cpu_id]; |
735 | |
736 | os_atomic_inc(&cluster_type_num_active_cpus[cpu->cluster_type], relaxed); |
737 | |
738 | os_atomic_inc(&machine_info.physical_cpu, relaxed); |
739 | os_atomic_inc(&machine_info.logical_cpu, relaxed); |
740 | } |
741 | |
742 | /* |
743 | * These are called from the machine-independent routine cpu_down() |
744 | * to perform machine-dependent info updates. |
745 | * |
746 | * The update to CPU counts needs to be separate from other actions |
747 | * because we don't update the counts when CLPC causes temporary |
748 | * cluster powerdown events, as these must be transparent to the user. |
749 | */ |
750 | void |
751 | ml_cpu_down(void) |
752 | { |
753 | /* |
754 | * If we want to deal with outstanding IPIs, we need to |
	 * do so relatively early in the processor_doshutdown path,
756 | * as we pend decrementer interrupts using the IPI |
757 | * mechanism if we cannot immediately service them (if |
758 | * IRQ is masked). Do so now. |
759 | * |
760 | * We aren't on the interrupt stack here; would it make |
761 | * more sense to disable signaling and then enable |
762 | * interrupts? It might be a bit cleaner. |
763 | */ |
764 | cpu_data_t *cpu_data_ptr = getCpuDatap(); |
765 | cpu_data_ptr->cpu_running = FALSE; |
766 | |
767 | if (cpu_data_ptr != &BootCpuData) { |
768 | /* |
769 | * Move all of this cpu's timers to the master/boot cpu, |
770 | * and poke it in case there's a sooner deadline for it to schedule. |
771 | */ |
		timer_queue_shutdown(&cpu_data_ptr->rtclock_timer.queue);
		kern_return_t rv = cpu_xcall(BootCpuData.cpu_number, &timer_queue_expire_local, &ml_cpu_down);
		if (rv != KERN_SUCCESS) {
			panic("ml_cpu_down: IPI failure %d", rv);
776 | } |
777 | } |
778 | |
779 | cpu_signal_handler_internal(TRUE); |
780 | } |
781 | void |
782 | ml_cpu_down_update_counts(int cpu_id) |
783 | { |
784 | ml_topology_cpu_t *cpu = &ml_get_topology_info()->cpus[cpu_id]; |
785 | |
786 | os_atomic_dec(&cluster_type_num_active_cpus[cpu->cluster_type], relaxed); |
787 | |
788 | os_atomic_dec(&machine_info.physical_cpu, relaxed); |
789 | os_atomic_dec(&machine_info.logical_cpu, relaxed); |
790 | } |
791 | |
792 | |
793 | unsigned int |
794 | ml_get_machine_mem(void) |
795 | { |
796 | return machine_info.memory_size; |
797 | } |
798 | |
799 | __attribute__((noreturn)) |
800 | void |
801 | halt_all_cpus(boolean_t reboot) |
802 | { |
803 | if (reboot) { |
804 | printf(format: "MACH Reboot\n" ); |
805 | PEHaltRestart(type: kPERestartCPU); |
806 | } else { |
807 | printf(format: "CPU halted\n" ); |
808 | PEHaltRestart(type: kPEHaltCPU); |
809 | } |
810 | while (1) { |
811 | ; |
812 | } |
813 | } |
814 | |
815 | __attribute__((noreturn)) |
816 | void |
817 | halt_cpu(void) |
818 | { |
819 | halt_all_cpus(FALSE); |
820 | } |
821 | |
822 | /* |
823 | * Routine: machine_signal_idle |
824 | * Function: |
825 | */ |
826 | void |
827 | machine_signal_idle( |
828 | processor_t processor) |
829 | { |
	cpu_signal(processor_to_cpu_datap(processor), SIGPnop, (void *)NULL, (void *)NULL);
831 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0); |
832 | } |
833 | |
834 | void |
835 | machine_signal_idle_deferred( |
836 | processor_t processor) |
837 | { |
	cpu_signal_deferred(processor_to_cpu_datap(processor));
839 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_DEFERRED_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0); |
840 | } |
841 | |
842 | void |
843 | machine_signal_idle_cancel( |
844 | processor_t processor) |
845 | { |
	cpu_signal_cancel(processor_to_cpu_datap(processor));
847 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_CANCEL_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0); |
848 | } |
849 | |
850 | /* |
851 | * Routine: ml_install_interrupt_handler |
852 | * Function: Initialize Interrupt Handler |
853 | */ |
854 | void |
855 | ml_install_interrupt_handler( |
856 | void *nub, |
857 | int source, |
858 | void *target, |
859 | IOInterruptHandler handler, |
860 | void *refCon) |
861 | { |
862 | cpu_data_t *cpu_data_ptr; |
863 | boolean_t current_state; |
864 | |
865 | current_state = ml_set_interrupts_enabled(FALSE); |
866 | cpu_data_ptr = getCpuDatap(); |
867 | |
868 | cpu_data_ptr->interrupt_nub = nub; |
869 | cpu_data_ptr->interrupt_source = source; |
870 | cpu_data_ptr->interrupt_target = target; |
871 | cpu_data_ptr->interrupt_handler = handler; |
872 | cpu_data_ptr->interrupt_refCon = refCon; |
873 | |
	(void) ml_set_interrupts_enabled(current_state);
875 | } |
876 | |
877 | /* |
878 | * Routine: ml_init_interrupt |
879 | * Function: Initialize Interrupts |
880 | */ |
881 | void |
882 | ml_init_interrupt(void) |
883 | { |
884 | #if defined(HAS_IPI) |
885 | /* |
886 | * ml_init_interrupt will get called once for each CPU, but this is redundant |
	 * because there is only one global copy of the register for Skye. Do it only
	 * on the bootstrap CPU.
889 | */ |
890 | if (getCpuDatap()->cluster_master) { |
891 | ml_cpu_signal_deferred_adjust_timer(deferred_ipi_timer_ns); |
892 | } |
893 | #endif |
894 | } |
895 | |
896 | /* |
897 | * Routine: ml_init_timebase |
 * Function: register and setup Timebase, Decrementer services
899 | */ |
900 | void |
901 | ml_init_timebase( |
902 | void *args, |
903 | tbd_ops_t tbd_funcs, |
904 | vm_offset_t int_address, |
905 | vm_offset_t int_value __unused) |
906 | { |
907 | cpu_data_t *cpu_data_ptr; |
908 | |
909 | cpu_data_ptr = (cpu_data_t *)args; |
910 | |
911 | if ((cpu_data_ptr == &BootCpuData) |
912 | && (rtclock_timebase_func.tbd_fiq_handler == (void *)NULL)) { |
913 | rtclock_timebase_func = *tbd_funcs; |
914 | rtclock_timebase_addr = int_address; |
915 | } |
916 | } |
917 | |
918 | #define ML_READPROP_MANDATORY UINT64_MAX |
919 | |
920 | static uint64_t |
921 | ml_readprop(const DTEntry entry, const char *propertyName, uint64_t default_value) |
922 | { |
923 | void const *prop; |
924 | unsigned int propSize; |
925 | |
	if (SecureDTGetProperty(entry, propertyName, &prop, &propSize) == kSuccess) {
927 | if (propSize == sizeof(uint8_t)) { |
928 | return *((uint8_t const *)prop); |
929 | } else if (propSize == sizeof(uint16_t)) { |
930 | return *((uint16_t const *)prop); |
931 | } else if (propSize == sizeof(uint32_t)) { |
932 | return *((uint32_t const *)prop); |
933 | } else if (propSize == sizeof(uint64_t)) { |
934 | return *((uint64_t const *)prop); |
935 | } else { |
936 | panic("CPU property '%s' has bad size %u" , propertyName, propSize); |
937 | } |
938 | } else { |
939 | if (default_value == ML_READPROP_MANDATORY) { |
940 | panic("Missing mandatory property '%s'" , propertyName); |
941 | } |
942 | return default_value; |
943 | } |
944 | } |
945 | |
946 | static boolean_t |
947 | ml_read_reg_range(const DTEntry entry, const char *propertyName, uint64_t *pa_ptr, uint64_t *len_ptr) |
948 | { |
949 | uint64_t const *prop; |
950 | unsigned int propSize; |
951 | |
	if (SecureDTGetProperty(entry, propertyName, (void const **)&prop, &propSize) != kSuccess) {
		return FALSE;
	}

	if (propSize != sizeof(uint64_t) * 2) {
		panic("Wrong property size for %s", propertyName);
958 | } |
959 | |
960 | *pa_ptr = prop[0]; |
961 | *len_ptr = prop[1]; |
962 | return TRUE; |
963 | } |
964 | |
965 | static boolean_t |
966 | ml_is_boot_cpu(const DTEntry entry) |
967 | { |
968 | void const *prop; |
969 | unsigned int propSize; |
970 | |
971 | if (SecureDTGetProperty(entry, propertyName: "state" , propertyValue: &prop, propertySize: &propSize) != kSuccess) { |
972 | panic("unable to retrieve state for cpu" ); |
973 | } |
974 | |
975 | if (strncmp(s1: (char const *)prop, s2: "running" , n: propSize) == 0) { |
976 | return TRUE; |
977 | } else { |
978 | return FALSE; |
979 | } |
980 | } |
981 | |
982 | static void |
983 | ml_read_chip_revision(unsigned int *rev __unused) |
984 | { |
985 | // The CPU_VERSION_* macros are only defined on APPLE_ARM64_ARCH_FAMILY builds |
986 | #ifdef APPLE_ARM64_ARCH_FAMILY |
987 | DTEntry entryP; |
988 | |
	if ((SecureDTFindEntry("name", "arm-io", &entryP) == kSuccess)) {
		*rev = (unsigned int)ml_readprop(entryP, "chip-revision", CPU_VERSION_UNKNOWN);
991 | } else { |
992 | *rev = CPU_VERSION_UNKNOWN; |
993 | } |
994 | #endif |
995 | } |
996 | |
997 | void |
998 | ml_parse_cpu_topology(void) |
999 | { |
1000 | DTEntry entry, child __unused; |
1001 | OpaqueDTEntryIterator iter; |
1002 | uint32_t cpu_boot_arg = MAX_CPUS; |
1003 | uint64_t cpumask_boot_arg = ULLONG_MAX; |
1004 | int err; |
1005 | |
1006 | int64_t cluster_phys_to_logical[MAX_CPU_CLUSTER_PHY_ID + 1]; |
1007 | int64_t cluster_max_cpu_phys_id[MAX_CPU_CLUSTER_PHY_ID + 1]; |
	const boolean_t cpus_boot_arg_present = PE_parse_boot_argn("cpus", &cpu_boot_arg, sizeof(cpu_boot_arg));
	const boolean_t cpumask_boot_arg_present = PE_parse_boot_argn("cpumask", &cpumask_boot_arg, sizeof(cpumask_boot_arg));
1010 | |
1011 | // The cpus=N and cpumask=N boot args cannot be used simultaneously. Flag this |
1012 | // so that we trigger a panic later in the boot process, once serial is enabled. |
1013 | if (cpus_boot_arg_present && cpumask_boot_arg_present) { |
1014 | cpu_config_correct = false; |
1015 | } |
1016 | |
	err = SecureDTLookupEntry(NULL, "/cpus", &entry);
	assert(err == kSuccess);

	err = SecureDTInitEntryIterator(entry, &iter);
1021 | assert(err == kSuccess); |
1022 | |
1023 | for (int i = 0; i <= MAX_CPU_CLUSTER_PHY_ID; i++) { |
1024 | cluster_offsets[i] = -1; |
1025 | cluster_phys_to_logical[i] = -1; |
1026 | cluster_max_cpu_phys_id[i] = 0; |
1027 | } |
1028 | |
	while (kSuccess == SecureDTIterateEntries(&iter, &child)) {
		boolean_t is_boot_cpu = ml_is_boot_cpu(child);
1031 | boolean_t cpu_enabled = cpumask_boot_arg & 1; |
1032 | cpumask_boot_arg >>= 1; |
1033 | |
1034 | // Boot CPU disabled in cpumask. Flag this so that we trigger a panic |
1035 | // later in the boot process, once serial is enabled. |
1036 | if (is_boot_cpu && !cpu_enabled) { |
1037 | cpu_config_correct = false; |
1038 | } |
1039 | |
1040 | // Ignore this CPU if it has been disabled by the cpumask= boot-arg. |
1041 | if (!is_boot_cpu && !cpu_enabled) { |
1042 | continue; |
1043 | } |
1044 | |
1045 | // If the number of CPUs is constrained by the cpus= boot-arg, and the boot CPU hasn't |
1046 | // been added to the topology struct yet, and we only have one slot left, then skip |
1047 | // every other non-boot CPU in order to leave room for the boot CPU. |
1048 | // |
1049 | // e.g. if the boot-args say "cpus=3" and CPU4 is the boot CPU, then the cpus[] |
1050 | // array will list CPU0, CPU1, and CPU4. CPU2-CPU3 and CPU5-CPUn will be omitted. |
1051 | if (topology_info.num_cpus >= (cpu_boot_arg - 1) && topology_info.boot_cpu == NULL && !is_boot_cpu) { |
1052 | continue; |
1053 | } |
1054 | if (topology_info.num_cpus >= cpu_boot_arg) { |
1055 | break; |
1056 | } |
1057 | |
1058 | ml_topology_cpu_t *cpu = &topology_info.cpus[topology_info.num_cpus]; |
1059 | |
1060 | cpu->cpu_id = topology_info.num_cpus++; |
1061 | assert(cpu->cpu_id < MAX_CPUS); |
1062 | topology_info.max_cpu_id = MAX(topology_info.max_cpu_id, cpu->cpu_id); |
1063 | |
1064 | cpu->die_id = 0; |
1065 | topology_info.max_die_id = 0; |
1066 | |
		cpu->phys_id = (uint32_t)ml_readprop(child, "reg", ML_READPROP_MANDATORY);

		cpu->l2_access_penalty = (uint32_t)ml_readprop(child, "l2-access-penalty", 0);
		cpu->l2_cache_size = (uint32_t)ml_readprop(child, "l2-cache-size", 0);
		cpu->l2_cache_id = (uint32_t)ml_readprop(child, "l2-cache-id", 0);
		cpu->l3_cache_size = (uint32_t)ml_readprop(child, "l3-cache-size", 0);
		cpu->l3_cache_id = (uint32_t)ml_readprop(child, "l3-cache-id", 0);

		ml_read_reg_range(child, "cpu-uttdbg-reg", &cpu->cpu_UTTDBG_pa, &cpu->cpu_UTTDBG_len);
		ml_read_reg_range(child, "cpu-impl-reg", &cpu->cpu_IMPL_pa, &cpu->cpu_IMPL_len);
		ml_read_reg_range(child, "coresight-reg", &cpu->coresight_pa, &cpu->coresight_len);
1078 | cpu->cluster_type = CLUSTER_TYPE_SMP; |
1079 | |
		int cluster_type = (int)ml_readprop(child, "cluster-type", 0);
1081 | if (cluster_type == 'E') { |
1082 | cpu->cluster_type = CLUSTER_TYPE_E; |
1083 | } else if (cluster_type == 'P') { |
1084 | cpu->cluster_type = CLUSTER_TYPE_P; |
1085 | } |
1086 | |
1087 | topology_info.cluster_type_num_cpus[cpu->cluster_type]++; |
1088 | |
1089 | /* |
1090 | * Since we want to keep a linear cluster ID space, we cannot just rely |
1091 | * on the value provided by EDT. Instead, use the MPIDR value to see if we have |
1092 | * seen this exact cluster before. If so, then reuse that cluster ID for this CPU. |
1093 | */ |
1094 | #if HAS_CLUSTER |
1095 | uint32_t phys_cluster_id = MPIDR_CLUSTER_ID(cpu->phys_id); |
1096 | #else |
1097 | uint32_t phys_cluster_id = (cpu->cluster_type == CLUSTER_TYPE_P); |
1098 | #endif |
1099 | assert(phys_cluster_id <= MAX_CPU_CLUSTER_PHY_ID); |
1100 | cpu->cluster_id = ((cluster_phys_to_logical[phys_cluster_id] == -1) ? |
1101 | topology_info.num_clusters : cluster_phys_to_logical[phys_cluster_id]); |
1102 | |
1103 | assert(cpu->cluster_id < MAX_CPU_CLUSTERS); |
1104 | |
1105 | ml_topology_cluster_t *cluster = &topology_info.clusters[cpu->cluster_id]; |
1106 | if (cluster->num_cpus == 0) { |
1107 | assert(topology_info.num_clusters < MAX_CPU_CLUSTERS); |
1108 | |
1109 | topology_info.num_clusters++; |
1110 | topology_info.max_cluster_id = MAX(topology_info.max_cluster_id, cpu->cluster_id); |
1111 | topology_info.cluster_types |= (1 << cpu->cluster_type); |
1112 | |
1113 | cluster->cluster_id = cpu->cluster_id; |
1114 | cluster->cluster_type = cpu->cluster_type; |
1115 | cluster->first_cpu_id = cpu->cpu_id; |
1116 | assert(cluster_phys_to_logical[phys_cluster_id] == -1); |
1117 | cluster_phys_to_logical[phys_cluster_id] = cpu->cluster_id; |
1118 | |
1119 | topology_info.cluster_type_num_clusters[cluster->cluster_type]++; |
1120 | |
1121 | // Since we don't have a per-cluster EDT node, this is repeated in each CPU node. |
1122 | // If we wind up with a bunch of these, we might want to create separate per-cluster |
1123 | // EDT nodes and have the CPU nodes reference them through a phandle. |
1124 | ml_read_reg_range(entry: child, propertyName: "acc-impl-reg" , pa_ptr: &cluster->acc_IMPL_pa, len_ptr: &cluster->acc_IMPL_len); |
1125 | ml_read_reg_range(entry: child, propertyName: "cpm-impl-reg" , pa_ptr: &cluster->cpm_IMPL_pa, len_ptr: &cluster->cpm_IMPL_len); |
1126 | } |
1127 | |
1128 | #if HAS_CLUSTER |
1129 | if (MPIDR_CPU_ID(cpu->phys_id) > cluster_max_cpu_phys_id[phys_cluster_id]) { |
1130 | cluster_max_cpu_phys_id[phys_cluster_id] = MPIDR_CPU_ID(cpu->phys_id); |
1131 | } |
1132 | #endif |
1133 | |
		cpu->die_cluster_id = (int)ml_readprop(child, "die-cluster-id", MPIDR_CLUSTER_ID(cpu->phys_id));
		cpu->cluster_core_id = (int)ml_readprop(child, "cluster-core-id", MPIDR_CPU_ID(cpu->phys_id));
1136 | |
1137 | cluster->num_cpus++; |
1138 | cluster->cpu_mask |= 1ULL << cpu->cpu_id; |
1139 | |
1140 | if (is_boot_cpu) { |
1141 | assert(topology_info.boot_cpu == NULL); |
1142 | topology_info.boot_cpu = cpu; |
1143 | topology_info.boot_cluster = cluster; |
1144 | } |
1145 | |
1146 | #if CONFIG_SPTM |
1147 | sptm_register_cpu(cpu->phys_id); |
1148 | #endif |
1149 | } |
1150 | |
1151 | #if HAS_CLUSTER |
1152 | /* |
1153 | * Build the cluster offset array, ensuring that the region reserved |
1154 | * for each physical cluster contains enough entries to be indexed |
1155 | * by the maximum physical CPU ID (AFF0) within the cluster. |
1156 | */ |
1157 | unsigned int cur_cluster_offset = 0; |
1158 | for (int i = 0; i <= MAX_CPU_CLUSTER_PHY_ID; i++) { |
1159 | if (cluster_phys_to_logical[i] != -1) { |
1160 | cluster_offsets[i] = cur_cluster_offset; |
1161 | cur_cluster_offset += (cluster_max_cpu_phys_id[i] + 1); |
1162 | } |
1163 | } |
1164 | assert(cur_cluster_offset <= MAX_CPUS); |
1165 | #else |
1166 | /* |
1167 | * For H10, there are really 2 physical clusters, but they are not separated |
1168 | * into distinct ACCs. AFF1 therefore always reports 0, and AFF0 numbering |
1169 | * is linear across both clusters. For the purpose of MPIDR_EL1-based indexing, |
1170 | * treat H10 and earlier devices as though they contain a single cluster. |
1171 | */ |
1172 | cluster_offsets[0] = 0; |
1173 | #endif |
1174 | assert(topology_info.boot_cpu != NULL); |
	ml_read_chip_revision(&topology_info.chip_revision);
1176 | |
1177 | /* |
1178 | * Set TPIDR_EL0 to indicate the correct cpu number & cluster id, |
1179 | * as we may not be booting from cpu 0. Userspace will consume |
1180 | * the current CPU number through this register. For non-boot |
1181 | * cores, this is done in start.s (start_cpu) using the per-cpu |
1182 | * data object. |
1183 | */ |
1184 | ml_topology_cpu_t *boot_cpu = topology_info.boot_cpu; |
1185 | uint64_t tpidr_el0 = ((boot_cpu->cpu_id << MACHDEP_TPIDR_CPUNUM_SHIFT) & MACHDEP_TPIDR_CPUNUM_MASK) | \ |
1186 | ((boot_cpu->cluster_id << MACHDEP_TPIDR_CLUSTERID_SHIFT) & MACHDEP_TPIDR_CLUSTERID_MASK); |
1187 | assert(((tpidr_el0 & MACHDEP_TPIDR_CPUNUM_MASK) >> MACHDEP_TPIDR_CPUNUM_SHIFT) == boot_cpu->cpu_id); |
1188 | assert(((tpidr_el0 & MACHDEP_TPIDR_CLUSTERID_MASK) >> MACHDEP_TPIDR_CLUSTERID_SHIFT) == boot_cpu->cluster_id); |
1189 | __builtin_arm_wsr64("TPIDR_EL0" , tpidr_el0); |
1190 | |
1191 | __builtin_arm_wsr64("TPIDRRO_EL0" , 0); |
1192 | } |
1193 | |
1194 | const ml_topology_info_t * |
1195 | ml_get_topology_info(void) |
1196 | { |
1197 | return &topology_info; |
1198 | } |
1199 | |
1200 | void |
1201 | ml_map_cpu_pio(void) |
1202 | { |
1203 | unsigned int i; |
1204 | |
1205 | for (i = 0; i < topology_info.num_cpus; i++) { |
1206 | ml_topology_cpu_t *cpu = &topology_info.cpus[i]; |
1207 | if (cpu->cpu_IMPL_pa) { |
			cpu->cpu_IMPL_regs = (vm_offset_t)ml_io_map(cpu->cpu_IMPL_pa, cpu->cpu_IMPL_len);
			cpu->coresight_regs = (vm_offset_t)ml_io_map(cpu->coresight_pa, cpu->coresight_len);
		}
		if (cpu->cpu_UTTDBG_pa) {
			cpu->cpu_UTTDBG_regs = (vm_offset_t)ml_io_map(cpu->cpu_UTTDBG_pa, cpu->cpu_UTTDBG_len);
1213 | } |
1214 | } |
1215 | |
1216 | for (i = 0; i < topology_info.num_clusters; i++) { |
1217 | ml_topology_cluster_t *cluster = &topology_info.clusters[i]; |
1218 | if (cluster->acc_IMPL_pa) { |
			cluster->acc_IMPL_regs = (vm_offset_t)ml_io_map(cluster->acc_IMPL_pa, cluster->acc_IMPL_len);
		}
		if (cluster->cpm_IMPL_pa) {
			cluster->cpm_IMPL_regs = (vm_offset_t)ml_io_map(cluster->cpm_IMPL_pa, cluster->cpm_IMPL_len);
1223 | } |
1224 | } |
1225 | } |
1226 | |
1227 | unsigned int |
1228 | ml_get_cpu_count(void) |
1229 | { |
1230 | return topology_info.num_cpus; |
1231 | } |
1232 | |
1233 | unsigned int |
1234 | ml_get_cluster_count(void) |
1235 | { |
1236 | return topology_info.num_clusters; |
1237 | } |
1238 | |
1239 | int |
1240 | ml_get_boot_cpu_number(void) |
1241 | { |
1242 | return topology_info.boot_cpu->cpu_id; |
1243 | } |
1244 | |
1245 | cluster_type_t |
1246 | ml_get_boot_cluster_type(void) |
1247 | { |
1248 | return topology_info.boot_cluster->cluster_type; |
1249 | } |
1250 | |
1251 | int |
1252 | ml_get_cpu_number(uint32_t phys_id) |
1253 | { |
1254 | phys_id &= MPIDR_AFF1_MASK | MPIDR_AFF0_MASK; |
1255 | |
1256 | for (unsigned i = 0; i < topology_info.num_cpus; i++) { |
1257 | if (topology_info.cpus[i].phys_id == phys_id) { |
1258 | return i; |
1259 | } |
1260 | } |
1261 | |
1262 | return -1; |
1263 | } |
1264 | |
1265 | int |
1266 | ml_get_cluster_number(uint32_t phys_id) |
1267 | { |
1268 | int cpu_id = ml_get_cpu_number(phys_id); |
1269 | if (cpu_id < 0) { |
1270 | return -1; |
1271 | } |
1272 | |
1273 | ml_topology_cpu_t *cpu = &topology_info.cpus[cpu_id]; |
1274 | |
1275 | return cpu->cluster_id; |
1276 | } |
1277 | |
1278 | unsigned int |
1279 | ml_get_cpu_number_local(void) |
1280 | { |
1281 | uint64_t mpidr_el1_value = 0; |
1282 | unsigned cpu_id; |
1283 | |
1284 | /* We identify the CPU based on the constant bits of MPIDR_EL1. */ |
1285 | MRS(mpidr_el1_value, "MPIDR_EL1" ); |
1286 | cpu_id = ml_get_cpu_number(phys_id: (uint32_t)mpidr_el1_value); |
1287 | |
1288 | assert(cpu_id <= (unsigned int)ml_get_max_cpu_number()); |
1289 | |
1290 | return cpu_id; |
1291 | } |
1292 | |
1293 | int |
1294 | ml_get_cluster_number_local() |
1295 | { |
1296 | uint64_t mpidr_el1_value = 0; |
1297 | unsigned cluster_id; |
1298 | |
1299 | /* We identify the cluster based on the constant bits of MPIDR_EL1. */ |
1300 | MRS(mpidr_el1_value, "MPIDR_EL1" ); |
1301 | cluster_id = ml_get_cluster_number(phys_id: (uint32_t)mpidr_el1_value); |
1302 | |
1303 | assert(cluster_id <= (unsigned int)ml_get_max_cluster_number()); |
1304 | |
1305 | return cluster_id; |
1306 | } |
1307 | |
1308 | int |
1309 | ml_get_max_cpu_number(void) |
1310 | { |
1311 | return topology_info.max_cpu_id; |
1312 | } |
1313 | |
1314 | int |
1315 | ml_get_max_cluster_number(void) |
1316 | { |
1317 | return topology_info.max_cluster_id; |
1318 | } |
1319 | |
1320 | unsigned int |
1321 | ml_get_first_cpu_id(unsigned int cluster_id) |
1322 | { |
1323 | return topology_info.clusters[cluster_id].first_cpu_id; |
1324 | } |
1325 | |
static_assert(MAX_CPUS <= 256, "MAX_CPUS must fit in _COMM_PAGE_CPU_TO_CLUSTER; Increase table size if needed");
1327 | |
1328 | void |
1329 | ml_map_cpus_to_clusters(uint8_t *table) |
1330 | { |
1331 | for (uint16_t cpu_id = 0; cpu_id < topology_info.num_cpus; cpu_id++) { |
1332 | *(table + cpu_id) = (uint8_t)(topology_info.cpus[cpu_id].cluster_id); |
1333 | } |
1334 | } |
1335 | |
1336 | /* |
1337 | * Return the die id of a cluster. |
1338 | */ |
1339 | unsigned int |
1340 | ml_get_die_id(unsigned int cluster_id) |
1341 | { |
1342 | /* |
1343 | * The current implementation gets the die_id from the |
1344 | * first CPU of the cluster. |
1345 | * rdar://80917654 (Add the die_id field to the cluster topology info) |
1346 | */ |
1347 | unsigned int first_cpu = ml_get_first_cpu_id(cluster_id); |
1348 | return topology_info.cpus[first_cpu].die_id; |
1349 | } |
1350 | |
1351 | /* |
1352 | * Return the index of a cluster in its die. |
1353 | */ |
1354 | unsigned int |
1355 | ml_get_die_cluster_id(unsigned int cluster_id) |
1356 | { |
1357 | /* |
1358 | * The current implementation gets the die_id from the |
1359 | * first CPU of the cluster. |
1360 | * rdar://80917654 (Add the die_id field to the cluster topology info) |
1361 | */ |
1362 | unsigned int first_cpu = ml_get_first_cpu_id(cluster_id); |
1363 | return topology_info.cpus[first_cpu].die_cluster_id; |
1364 | } |
1365 | |
1366 | /* |
1367 | * Return the highest die id of the system. |
1368 | */ |
1369 | unsigned int |
1370 | ml_get_max_die_id(void) |
1371 | { |
1372 | return topology_info.max_die_id; |
1373 | } |
1374 | |
1375 | void |
1376 | ml_lockdown_init() |
1377 | { |
1378 | #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) |
1379 | rorgn_stash_range(); |
1380 | #endif |
1381 | } |
1382 | |
1383 | kern_return_t |
1384 | ml_lockdown_handler_register(lockdown_handler_t f, void *this) |
1385 | { |
1386 | if (!f) { |
1387 | return KERN_FAILURE; |
1388 | } |
1389 | |
1390 | assert(lockdown_done); |
1391 | f(this); // XXX: f this whole function |
1392 | |
1393 | return KERN_SUCCESS; |
1394 | } |
1395 | |
1396 | static mcache_flush_function mcache_flush_func; |
1397 | static void* mcache_flush_service; |
1398 | kern_return_t |
1399 | ml_mcache_flush_callback_register(mcache_flush_function func, void *service) |
1400 | { |
1401 | mcache_flush_service = service; |
1402 | mcache_flush_func = func; |
1403 | |
1404 | return KERN_SUCCESS; |
1405 | } |
1406 | |
1407 | kern_return_t |
1408 | ml_mcache_flush(void) |
1409 | { |
1410 | if (!mcache_flush_func) { |
1411 | panic("Cannot flush M$ with no flush callback registered" ); |
1412 | |
1413 | return KERN_FAILURE; |
1414 | } else { |
1415 | return mcache_flush_func(mcache_flush_service); |
1416 | } |
1417 | } |
1418 | |
1419 | |
1420 | extern lck_mtx_t pset_create_lock; |
1421 | |
1422 | kern_return_t |
1423 | ml_processor_register(ml_processor_info_t *in_processor_info, |
1424 | processor_t *processor_out, ipi_handler_t *ipi_handler_out, |
1425 | perfmon_interrupt_handler_func *pmi_handler_out) |
1426 | { |
1427 | cpu_data_t *this_cpu_datap; |
1428 | processor_set_t pset; |
1429 | boolean_t is_boot_cpu; |
1430 | static unsigned int reg_cpu_count = 0; |
1431 | |
1432 | if (in_processor_info->log_id > (uint32_t)ml_get_max_cpu_number()) { |
1433 | return KERN_FAILURE; |
1434 | } |
1435 | |
	if ((unsigned)OSIncrementAtomic((SInt32*)&reg_cpu_count) >= topology_info.num_cpus) {
1437 | return KERN_FAILURE; |
1438 | } |
1439 | |
1440 | if (in_processor_info->log_id != (uint32_t)ml_get_boot_cpu_number()) { |
1441 | is_boot_cpu = FALSE; |
1442 | this_cpu_datap = cpu_data_alloc(FALSE); |
		cpu_data_init(this_cpu_datap);
1444 | } else { |
1445 | this_cpu_datap = &BootCpuData; |
1446 | is_boot_cpu = TRUE; |
1447 | } |
1448 | |
1449 | assert(in_processor_info->log_id <= (uint32_t)ml_get_max_cpu_number()); |
1450 | |
1451 | this_cpu_datap->cpu_id = in_processor_info->cpu_id; |
1452 | |
1453 | if (!is_boot_cpu) { |
1454 | this_cpu_datap->cpu_number = (unsigned short)(in_processor_info->log_id); |
1455 | |
		if (cpu_data_register(this_cpu_datap) != KERN_SUCCESS) {
1457 | goto processor_register_error; |
1458 | } |
1459 | assert((this_cpu_datap->cpu_number & MACHDEP_TPIDR_CPUNUM_MASK) == this_cpu_datap->cpu_number); |
1460 | } |
1461 | |
1462 | this_cpu_datap->cpu_idle_notify = in_processor_info->processor_idle; |
1463 | this_cpu_datap->cpu_cache_dispatch = (cache_dispatch_t)in_processor_info->platform_cache_dispatch; |
	nanoseconds_to_absolutetime((uint64_t) in_processor_info->powergate_latency, &this_cpu_datap->cpu_idle_latency);
	this_cpu_datap->cpu_reset_assist = kvtophys(in_processor_info->powergate_stub_addr);
1466 | |
1467 | this_cpu_datap->idle_timer_notify = in_processor_info->idle_timer; |
1468 | this_cpu_datap->idle_timer_refcon = in_processor_info->idle_timer_refcon; |
1469 | |
1470 | this_cpu_datap->platform_error_handler = in_processor_info->platform_error_handler; |
1471 | this_cpu_datap->cpu_regmap_paddr = in_processor_info->regmap_paddr; |
1472 | this_cpu_datap->cpu_phys_id = in_processor_info->phys_id; |
1473 | this_cpu_datap->cpu_l2_access_penalty = in_processor_info->l2_access_penalty; |
1474 | |
1475 | this_cpu_datap->cpu_cluster_type = in_processor_info->cluster_type; |
1476 | this_cpu_datap->cpu_cluster_id = in_processor_info->cluster_id; |
1477 | this_cpu_datap->cpu_l2_id = in_processor_info->l2_cache_id; |
1478 | this_cpu_datap->cpu_l2_size = in_processor_info->l2_cache_size; |
1479 | this_cpu_datap->cpu_l3_id = in_processor_info->l3_cache_id; |
1480 | this_cpu_datap->cpu_l3_size = in_processor_info->l3_cache_size; |
1481 | |
1482 | /* |
1483 | * Encode cpu_id, cluster_id to be stored in TPIDR_EL0 (see |
1484 | * cswitch.s:set_thread_registers, start.s:start_cpu) for consumption |
1485 | * by userspace. |
1486 | */ |
1487 | this_cpu_datap->cpu_tpidr_el0 = ((this_cpu_datap->cpu_number << MACHDEP_TPIDR_CPUNUM_SHIFT) & MACHDEP_TPIDR_CPUNUM_MASK) | \ |
1488 | ((this_cpu_datap->cpu_cluster_id << MACHDEP_TPIDR_CLUSTERID_SHIFT) & MACHDEP_TPIDR_CLUSTERID_MASK); |
1489 | assert(((this_cpu_datap->cpu_tpidr_el0 & MACHDEP_TPIDR_CPUNUM_MASK) >> MACHDEP_TPIDR_CPUNUM_SHIFT) == this_cpu_datap->cpu_number); |
1490 | assert(((this_cpu_datap->cpu_tpidr_el0 & MACHDEP_TPIDR_CLUSTERID_MASK) >> MACHDEP_TPIDR_CLUSTERID_SHIFT) == this_cpu_datap->cpu_cluster_id); |
1491 | |
1492 | #if HAS_CLUSTER |
1493 | this_cpu_datap->cluster_master = !OSTestAndSet(this_cpu_datap->cpu_cluster_id, &cluster_initialized); |
1494 | #else /* HAS_CLUSTER */ |
1495 | this_cpu_datap->cluster_master = is_boot_cpu; |
1496 | #endif /* HAS_CLUSTER */ |
	lck_mtx_lock(&pset_create_lock);
	pset = pset_find(in_processor_info->cluster_id, NULL);
	kprintf("[%d]%s>pset_find(cluster_id=%d) returned pset %d\n", current_processor()->cpu_id, __FUNCTION__, in_processor_info->cluster_id, pset ? pset->pset_id : -1);
1500 | if (pset == NULL) { |
1501 | #if __AMP__ |
1502 | pset_cluster_type_t pset_cluster_type = this_cpu_datap->cpu_cluster_type == CLUSTER_TYPE_E ? PSET_AMP_E : PSET_AMP_P; |
1503 | pset = pset_create(ml_get_boot_cluster_type() == this_cpu_datap->cpu_cluster_type ? &pset_node0 : &pset_node1, pset_cluster_type, this_cpu_datap->cpu_cluster_id, this_cpu_datap->cpu_cluster_id); |
1504 | assert(pset != PROCESSOR_SET_NULL); |
1505 | kprintf("[%d]%s>pset_create(cluster_id=%d) returned pset %d\n" , current_processor()->cpu_id, __FUNCTION__, this_cpu_datap->cpu_cluster_id, pset->pset_id); |
1506 | #else /* __AMP__ */ |
1507 | pset_cluster_type_t pset_cluster_type = PSET_SMP; |
1508 | pset = pset_create(node: &pset_node0, pset_type: pset_cluster_type, pset_cluster_id: this_cpu_datap->cpu_cluster_id, pset_id: this_cpu_datap->cpu_cluster_id); |
1509 | assert(pset != PROCESSOR_SET_NULL); |
1510 | #endif /* __AMP__ */ |
1511 | } |
1512 | kprintf(fmt: "[%d]%s>cpu_id %p cluster_id %d cpu_number %d is type %d\n" , current_processor()->cpu_id, __FUNCTION__, in_processor_info->cpu_id, in_processor_info->cluster_id, this_cpu_datap->cpu_number, in_processor_info->cluster_type); |
1513 | lck_mtx_unlock(lck: &pset_create_lock); |
1514 | |
1515 | processor_t processor = PERCPU_GET_RELATIVE(processor, cpu_data, this_cpu_datap); |
1516 | if (!is_boot_cpu) { |
		processor_init(processor, this_cpu_datap->cpu_number, pset);
1518 | |
1519 | if (this_cpu_datap->cpu_l2_access_penalty) { |
1520 | /* |
1521 | * Cores that have a non-zero L2 access penalty compared |
1522 | * to the boot processor should be de-prioritized by the |
1523 | * scheduler, so that threads use the cores with better L2 |
1524 | * preferentially. |
1525 | */ |
1526 | processor_set_primary(processor, master_processor); |
1527 | } |
1528 | } |
1529 | |
1530 | *processor_out = processor; |
1531 | *ipi_handler_out = cpu_signal_handler; |
1532 | #if CPMU_AIC_PMI && CONFIG_CPU_COUNTERS |
1533 | *pmi_handler_out = mt_cpmu_aic_pmi; |
1534 | #else |
1535 | *pmi_handler_out = NULL; |
1536 | #endif /* CPMU_AIC_PMI && CONFIG_CPU_COUNTERS */ |
1537 | if (in_processor_info->idle_tickle != (idle_tickle_t *) NULL) { |
1538 | *in_processor_info->idle_tickle = (idle_tickle_t) cpu_idle_tickle; |
1539 | } |
1540 | |
1541 | #if CONFIG_CPU_COUNTERS |
1542 | if (kpc_register_cpu(this_cpu_datap) != TRUE) { |
1543 | goto processor_register_error; |
1544 | } |
1545 | #endif /* CONFIG_CPU_COUNTERS */ |
1546 | |
1547 | |
1548 | if (!is_boot_cpu) { |
random_cpu_init(this_cpu_datap->cpu_number);
1550 | // now let next CPU register itself |
1551 | OSIncrementAtomic((SInt32*)&real_ncpus); |
1552 | } |
1553 | |
1554 | return KERN_SUCCESS; |
1555 | |
1556 | processor_register_error: |
1557 | #if CONFIG_CPU_COUNTERS |
1558 | kpc_unregister_cpu(this_cpu_datap); |
1559 | #endif /* CONFIG_CPU_COUNTERS */ |
1560 | if (!is_boot_cpu) { |
cpu_data_free(this_cpu_datap);
1562 | } |
1563 | |
1564 | return KERN_FAILURE; |
1565 | } |
1566 | |
1567 | void |
1568 | ml_init_arm_debug_interface( |
1569 | void * in_cpu_datap, |
1570 | vm_offset_t virt_address) |
1571 | { |
1572 | ((cpu_data_t *)in_cpu_datap)->cpu_debug_interface_map = virt_address; |
1573 | do_debugid(); |
1574 | } |
1575 | |
1576 | /* |
1577 | * Routine: init_ast_check |
1578 | * Function: |
1579 | */ |
1580 | void |
1581 | init_ast_check( |
1582 | __unused processor_t processor) |
1583 | { |
1584 | } |
1585 | |
1586 | /* |
1587 | * Routine: cause_ast_check |
1588 | * Function: |
1589 | */ |
1590 | void |
1591 | cause_ast_check( |
1592 | processor_t processor) |
1593 | { |
1594 | if (current_processor() != processor) { |
cpu_signal(processor_to_cpu_datap(processor), SIGPast, (void *)NULL, (void *)NULL);
1596 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), processor->cpu_id, 1 /* ast */, 0, 0, 0); |
1597 | } |
1598 | } |
1599 | |
1600 | extern uint32_t cpu_idle_count; |
1601 | |
1602 | void |
1603 | ml_get_power_state(boolean_t *icp, boolean_t *pidlep) |
1604 | { |
1605 | *icp = ml_at_interrupt_context(); |
1606 | *pidlep = (cpu_idle_count == real_ncpus); |
1607 | } |
1608 | |
1609 | /* |
1610 | * Routine: ml_cause_interrupt |
1611 | * Function: Generate a fake interrupt |
1612 | */ |
1613 | void |
1614 | ml_cause_interrupt(void) |
1615 | { |
1616 | return; /* BS_XXX */ |
1617 | } |
1618 | |
1619 | /* Map memory map IO space */ |
1620 | vm_offset_t |
1621 | ml_io_map( |
1622 | vm_offset_t phys_addr, |
1623 | vm_size_t size) |
1624 | { |
1625 | return io_map(phys_addr, size, VM_WIMG_IO, VM_PROT_DEFAULT, false); |
1626 | } |
1627 | |
1628 | /* Map memory map IO space (with protections specified) */ |
1629 | vm_offset_t |
1630 | ml_io_map_with_prot( |
1631 | vm_offset_t phys_addr, |
1632 | vm_size_t size, |
1633 | vm_prot_t prot) |
1634 | { |
1635 | return io_map(phys_addr, size, VM_WIMG_IO, prot, false); |
1636 | } |
1637 | |
1638 | vm_offset_t |
1639 | ml_io_map_unmappable( |
1640 | vm_offset_t phys_addr, |
1641 | vm_size_t size, |
1642 | unsigned int flags) |
1643 | { |
1644 | return io_map(phys_addr, size, flags, VM_PROT_DEFAULT, true); |
1645 | } |
1646 | |
1647 | vm_offset_t |
1648 | ml_io_map_wcomb( |
1649 | vm_offset_t phys_addr, |
1650 | vm_size_t size) |
1651 | { |
1652 | return io_map(phys_addr, size, VM_WIMG_WCOMB, VM_PROT_DEFAULT, false); |
1653 | } |
1654 | |
1655 | void |
1656 | ml_io_unmap(vm_offset_t addr, vm_size_t sz) |
1657 | { |
pmap_remove(kernel_pmap, addr, addr + sz);
kmem_free(kernel_map, addr, sz);
1660 | } |
1661 | |
1662 | vm_map_address_t |
1663 | ml_map_high_window( |
1664 | vm_offset_t phys_addr, |
1665 | vm_size_t len) |
1666 | { |
return pmap_map_high_window_bd(phys_addr, len, VM_PROT_READ | VM_PROT_WRITE);
1668 | } |
1669 | |
1670 | vm_offset_t |
1671 | ml_static_ptovirt( |
1672 | vm_offset_t paddr) |
1673 | { |
return phystokv(paddr);
1675 | } |
1676 | |
1677 | vm_offset_t |
1678 | ml_static_slide( |
1679 | vm_offset_t vaddr) |
1680 | { |
1681 | vm_offset_t slid_vaddr = 0; |
1682 | |
1683 | #if CONFIG_SPTM |
1684 | if ((vaddr >= vm_sptm_offsets.unslid_base) && (vaddr < vm_sptm_offsets.unslid_top)) { |
1685 | slid_vaddr = vaddr + vm_sptm_offsets.slide; |
1686 | } else if ((vaddr >= vm_txm_offsets.unslid_base) && (vaddr < vm_txm_offsets.unslid_top)) { |
1687 | slid_vaddr = vaddr + vm_txm_offsets.slide; |
1688 | } else |
1689 | #endif /* CONFIG_SPTM */ |
1690 | { |
1691 | slid_vaddr = vaddr + vm_kernel_slide; |
1692 | } |
1693 | |
1694 | if (!VM_KERNEL_IS_SLID(slid_vaddr)) { |
1695 | /* This is only intended for use on static kernel addresses. */ |
1696 | return 0; |
1697 | } |
1698 | |
1699 | return slid_vaddr; |
1700 | } |
1701 | |
1702 | vm_offset_t |
1703 | ml_static_unslide( |
1704 | vm_offset_t vaddr) |
1705 | { |
1706 | if (!VM_KERNEL_IS_SLID(vaddr)) { |
1707 | /* This is only intended for use on static kernel addresses. */ |
1708 | return 0; |
1709 | } |
1710 | |
1711 | #if CONFIG_SPTM |
1712 | /** |
1713 | * Addresses coming from the SPTM and TXM have a different slide than the |
1714 | * rest of the kernel. |
1715 | */ |
1716 | if ((vaddr >= vm_sptm_offsets.slid_base) && (vaddr < vm_sptm_offsets.slid_top)) { |
1717 | return vaddr - vm_sptm_offsets.slide; |
1718 | } |
1719 | |
1720 | if ((vaddr >= vm_txm_offsets.slid_base) && (vaddr < vm_txm_offsets.slid_top)) { |
1721 | return vaddr - vm_txm_offsets.slide; |
1722 | } |
1723 | #endif /* CONFIG_SPTM */ |
1724 | |
1725 | return vaddr - vm_kernel_slide; |
1726 | } |
1727 | |
1728 | extern tt_entry_t *arm_kva_to_tte(vm_offset_t va); |
1729 | |
1730 | kern_return_t |
1731 | ml_static_protect( |
1732 | vm_offset_t vaddr, /* kernel virtual address */ |
1733 | vm_size_t size, |
1734 | vm_prot_t new_prot __unused) |
1735 | { |
1736 | #if CONFIG_SPTM |
1737 | /** |
1738 | * Retype any frames that may be passed to the VM to XNU_DEFAULT. |
1739 | */ |
1740 | for (vm_offset_t sptm_vaddr_cur = vaddr; sptm_vaddr_cur < trunc_page_64(vaddr + size); sptm_vaddr_cur += PAGE_SIZE) { |
/* Check if this frame is XNU_DEFAULT and only retype it if it is not */
1742 | sptm_paddr_t sptm_paddr_cur = kvtophys_nofail(sptm_vaddr_cur); |
1743 | sptm_frame_type_t current_type = sptm_get_frame_type(sptm_paddr_cur); |
1744 | if (current_type != XNU_DEFAULT) { |
1745 | sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL}; |
1746 | sptm_retype(sptm_paddr_cur, current_type, XNU_DEFAULT, retype_params); |
1747 | } |
1748 | } |
1749 | |
1750 | return KERN_SUCCESS; |
1751 | #else /* CONFIG_SPTM */ |
1752 | pt_entry_t arm_prot = 0; |
1753 | pt_entry_t arm_block_prot = 0; |
1754 | vm_offset_t vaddr_cur; |
1755 | ppnum_t ppn; |
1756 | kern_return_t result = KERN_SUCCESS; |
1757 | |
1758 | if (vaddr < physmap_base) { |
1759 | panic("ml_static_protect(): %p < %p" , (void *) vaddr, (void *) physmap_base); |
1760 | return KERN_FAILURE; |
1761 | } |
1762 | |
1763 | assert((vaddr & (PAGE_SIZE - 1)) == 0); /* must be page aligned */ |
1764 | |
1765 | if ((new_prot & VM_PROT_WRITE) && (new_prot & VM_PROT_EXECUTE)) { |
1766 | panic("ml_static_protect(): WX request on %p" , (void *) vaddr); |
1767 | } |
1768 | if (lockdown_done && (new_prot & VM_PROT_EXECUTE)) { |
1769 | panic("ml_static_protect(): attempt to inject executable mapping on %p" , (void *) vaddr); |
1770 | } |
1771 | |
1772 | /* Set up the protection bits, and block bits so we can validate block mappings. */ |
1773 | if (new_prot & VM_PROT_WRITE) { |
1774 | arm_prot |= ARM_PTE_AP(AP_RWNA); |
1775 | arm_block_prot |= ARM_TTE_BLOCK_AP(AP_RWNA); |
1776 | } else { |
1777 | arm_prot |= ARM_PTE_AP(AP_RONA); |
1778 | arm_block_prot |= ARM_TTE_BLOCK_AP(AP_RONA); |
1779 | } |
1780 | |
1781 | arm_prot |= ARM_PTE_NX; |
1782 | arm_block_prot |= ARM_TTE_BLOCK_NX; |
1783 | |
1784 | if (!(new_prot & VM_PROT_EXECUTE)) { |
1785 | arm_prot |= ARM_PTE_PNX; |
1786 | arm_block_prot |= ARM_TTE_BLOCK_PNX; |
1787 | } |
1788 | |
1789 | for (vaddr_cur = vaddr; |
1790 | vaddr_cur < trunc_page_64(vaddr + size); |
1791 | vaddr_cur += PAGE_SIZE) { |
ppn = pmap_find_phys(kernel_pmap, vaddr_cur);
1793 | if (ppn != (vm_offset_t) NULL) { |
1794 | tt_entry_t *tte2; |
1795 | pt_entry_t *pte_p; |
1796 | pt_entry_t ptmp; |
1797 | |
1798 | #if XNU_MONITOR |
1799 | assert(!pmap_is_monitor(ppn)); |
1800 | assert(!TEST_PAGE_RATIO_4); |
1801 | #endif |
1802 | |
tte2 = arm_kva_to_tte(vaddr_cur);
1804 | |
1805 | if (((*tte2) & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) { |
1806 | if ((((*tte2) & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) && |
1807 | ((*tte2 & (ARM_TTE_BLOCK_NXMASK | ARM_TTE_BLOCK_PNXMASK | ARM_TTE_BLOCK_APMASK)) == arm_block_prot)) { |
1808 | /* |
1809 | * We can support ml_static_protect on a block mapping if the mapping already has |
1810 | * the desired protections. We still want to run checks on a per-page basis. |
1811 | */ |
1812 | continue; |
1813 | } |
1814 | |
1815 | result = KERN_FAILURE; |
1816 | break; |
1817 | } |
1818 | |
pte_p = (pt_entry_t *)&((tt_entry_t*)(phystokv((*tte2) & ARM_TTE_TABLE_MASK)))[(((vaddr_cur) & ARM_TT_L3_INDEX_MASK) >> ARM_TT_L3_SHIFT)];
1820 | ptmp = *pte_p; |
1821 | |
1822 | if ((ptmp & ARM_PTE_HINT_MASK) && ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot)) { |
1823 | /* |
1824 | * The contiguous hint is similar to a block mapping for ml_static_protect; if the existing |
1825 | * protections do not match the desired protections, then we will fail (as we cannot update |
1826 | * this mapping without updating other mappings as well). |
1827 | */ |
1828 | result = KERN_FAILURE; |
1829 | break; |
1830 | } |
1831 | |
1832 | __unreachable_ok_push |
1833 | if (TEST_PAGE_RATIO_4) { |
1834 | { |
1835 | unsigned int i; |
1836 | pt_entry_t *ptep_iter; |
1837 | |
1838 | ptep_iter = pte_p; |
1839 | for (i = 0; i < 4; i++, ptep_iter++) { |
1840 | /* Note that there is a hole in the HINT sanity checking here. */ |
1841 | ptmp = *ptep_iter; |
1842 | |
1843 | /* We only need to update the page tables if the protections do not match. */ |
1844 | if ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot) { |
1845 | ptmp = (ptmp & ~(ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) | arm_prot; |
1846 | *ptep_iter = ptmp; |
1847 | } |
1848 | } |
1849 | } |
1850 | } else { |
1851 | ptmp = *pte_p; |
1852 | /* We only need to update the page tables if the protections do not match. */ |
1853 | if ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot) { |
1854 | ptmp = (ptmp & ~(ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) | arm_prot; |
1855 | *pte_p = ptmp; |
1856 | } |
1857 | } |
1858 | __unreachable_ok_pop |
1859 | } |
1860 | } |
1861 | |
1862 | if (vaddr_cur > vaddr) { |
1863 | assert(((vaddr_cur - vaddr) & 0xFFFFFFFF00000000ULL) == 0); |
flush_mmu_tlb_region(vaddr, (uint32_t)(vaddr_cur - vaddr));
1865 | } |
1866 | |
1867 | |
1868 | return result; |
1869 | #endif /* CONFIG_SPTM */ |
1870 | } |
1871 | |
1872 | #if defined(CONFIG_SPTM) |
1873 | /* |
1874 | * Returns true if the given physical address is in one of the boot kernelcache ranges. |
1875 | */ |
1876 | static bool |
1877 | ml_physaddr_in_bootkc_range(vm_offset_t physaddr) |
1878 | { |
1879 | for (int i = 0; i < arm_vm_kernelcache_numranges; i++) { |
1880 | if (physaddr >= arm_vm_kernelcache_ranges[i].start_phys && physaddr < arm_vm_kernelcache_ranges[i].end_phys) { |
1881 | return true; |
1882 | } |
1883 | } |
1884 | return false; |
1885 | } |
1886 | #endif /* defined(CONFIG_SPTM) */ |
1887 | |
1888 | /* |
1889 | * Routine: ml_static_mfree |
1890 | * Function: |
1891 | */ |
1892 | void |
1893 | ml_static_mfree( |
1894 | vm_offset_t vaddr, |
1895 | vm_size_t size) |
1896 | { |
1897 | vm_offset_t vaddr_cur; |
1898 | vm_offset_t paddr_cur; |
1899 | ppnum_t ppn; |
1900 | uint32_t freed_pages = 0; |
1901 | uint32_t freed_kernelcache_pages = 0; |
1902 | |
1903 | |
1904 | /* It is acceptable (if bad) to fail to free. */ |
1905 | if (vaddr < physmap_base) { |
1906 | return; |
1907 | } |
1908 | |
1909 | assert((vaddr & (PAGE_SIZE - 1)) == 0); /* must be page aligned */ |
1910 | |
1911 | for (vaddr_cur = vaddr; |
1912 | vaddr_cur < trunc_page_64(vaddr + size); |
1913 | vaddr_cur += PAGE_SIZE) { |
ppn = pmap_find_phys(kernel_pmap, vaddr_cur);
1915 | if (ppn != (vm_offset_t) NULL) { |
1916 | /* |
1917 | * It is not acceptable to fail to update the protections on a page |
1918 | * we will release to the VM. We need to either panic or continue. |
1919 | * For now, we'll panic (to help flag if there is memory we can |
1920 | * reclaim). |
1921 | */ |
if (ml_static_protect(vaddr_cur, PAGE_SIZE, VM_PROT_WRITE | VM_PROT_READ) != KERN_SUCCESS) {
panic("Failed ml_static_mfree on %p", (void *) vaddr_cur);
1924 | } |
1925 | |
1926 | paddr_cur = ptoa(ppn); |
1927 | |
1928 | |
vm_page_create(ppn, (ppn + 1));
1930 | freed_pages++; |
1931 | #if defined(CONFIG_SPTM) |
1932 | if (ml_physaddr_in_bootkc_range(paddr_cur)) { |
1933 | #else |
1934 | if (paddr_cur >= arm_vm_kernelcache_phys_start && paddr_cur < arm_vm_kernelcache_phys_end) { |
1935 | #endif |
1936 | freed_kernelcache_pages++; |
1937 | } |
1938 | } |
1939 | } |
1940 | vm_page_lockspin_queues(); |
1941 | vm_page_wire_count -= freed_pages; |
1942 | vm_page_wire_count_initial -= freed_pages; |
1943 | vm_page_kernelcache_count -= freed_kernelcache_pages; |
1944 | vm_page_unlock_queues(); |
1945 | #if DEBUG |
1946 | kprintf("ml_static_mfree: Released 0x%x pages at VA %p, size:0x%llx, last ppn: 0x%x, +%d bad\n" , freed_pages, (void *)vaddr, (uint64_t)size, ppn, bad_page_cnt); |
1947 | #endif |
1948 | } |
1949 | |
1950 | /* |
1951 | * Routine: ml_page_protection_type |
1952 | * Function: Returns the type of page protection that the system supports. |
1953 | */ |
1954 | ml_page_protection_t |
1955 | ml_page_protection_type(void) |
1956 | { |
1957 | #if CONFIG_SPTM |
1958 | return 2; |
1959 | #elif XNU_MONITOR |
1960 | return 1; |
1961 | #else |
1962 | return 0; |
1963 | #endif |
1964 | } |
1965 | |
1966 | /* virtual to physical on wired pages */ |
1967 | vm_offset_t |
1968 | ml_vtophys(vm_offset_t vaddr) |
1969 | { |
return kvtophys(vaddr);
1971 | } |
1972 | |
1973 | /* |
1974 | * Routine: ml_nofault_copy |
1975 | * Function: Perform a physical mode copy if the source and destination have |
1976 | * valid translations in the kernel pmap. If translations are present, they are |
1977 | * assumed to be wired; e.g., no attempt is made to guarantee that the |
1978 | * translations obtained remain valid for the duration of the copy process. |
1979 | */ |
1980 | vm_size_t |
1981 | ml_nofault_copy(vm_offset_t virtsrc, vm_offset_t virtdst, vm_size_t size) |
1982 | { |
1983 | addr64_t cur_phys_dst, cur_phys_src; |
1984 | vm_size_t count, nbytes = 0; |
1985 | |
1986 | while (size > 0) { |
if (!(cur_phys_src = kvtophys(virtsrc))) {
break;
}
if (!(cur_phys_dst = kvtophys(virtdst))) {
1991 | break; |
1992 | } |
1993 | if (!pmap_valid_address(trunc_page_64(cur_phys_dst)) || |
1994 | !pmap_valid_address(trunc_page_64(cur_phys_src))) { |
1995 | break; |
1996 | } |
1997 | count = PAGE_SIZE - (cur_phys_src & PAGE_MASK); |
1998 | if (count > (PAGE_SIZE - (cur_phys_dst & PAGE_MASK))) { |
1999 | count = PAGE_SIZE - (cur_phys_dst & PAGE_MASK); |
2000 | } |
2001 | if (count > size) { |
2002 | count = size; |
2003 | } |
2004 | |
bcopy_phys(cur_phys_src, cur_phys_dst, count);
2006 | |
2007 | nbytes += count; |
2008 | virtsrc += count; |
2009 | virtdst += count; |
2010 | size -= count; |
2011 | } |
2012 | |
2013 | return nbytes; |
2014 | } |
2015 | |
2016 | /* |
2017 | * Routine: ml_validate_nofault |
* Function: Validate that this address range has valid translations
* in the kernel pmap. If translations are present, they are
* assumed to be wired; i.e. no attempt is made to guarantee
* that the translations persist after the check.
2022 | * Returns: TRUE if the range is mapped and will not cause a fault, |
2023 | * FALSE otherwise. |
2024 | */ |
2025 | |
2026 | boolean_t |
2027 | ml_validate_nofault( |
2028 | vm_offset_t virtsrc, vm_size_t size) |
2029 | { |
2030 | addr64_t cur_phys_src; |
2031 | uint32_t count; |
2032 | |
2033 | while (size > 0) { |
if (!(cur_phys_src = kvtophys(virtsrc))) {
2035 | return FALSE; |
2036 | } |
2037 | if (!pmap_valid_address(trunc_page_64(cur_phys_src))) { |
2038 | return FALSE; |
2039 | } |
2040 | count = (uint32_t)(PAGE_SIZE - (cur_phys_src & PAGE_MASK)); |
2041 | if (count > size) { |
2042 | count = (uint32_t)size; |
2043 | } |
2044 | |
2045 | virtsrc += count; |
2046 | size -= count; |
2047 | } |
2048 | |
2049 | return TRUE; |
2050 | } |
2051 | |
2052 | void |
2053 | ml_get_bouncepool_info(vm_offset_t * phys_addr, vm_size_t * size) |
2054 | { |
2055 | *phys_addr = 0; |
2056 | *size = 0; |
2057 | } |
2058 | |
2059 | void |
2060 | active_rt_threads(__unused boolean_t active) |
2061 | { |
2062 | } |
2063 | |
2064 | static void |
2065 | cpu_qos_cb_default(__unused int urgency, __unused uint64_t qos_param1, __unused uint64_t qos_param2) |
2066 | { |
2067 | return; |
2068 | } |
2069 | |
2070 | cpu_qos_update_t cpu_qos_update = cpu_qos_cb_default; |
2071 | |
2072 | void |
2073 | cpu_qos_update_register(cpu_qos_update_t cpu_qos_cb) |
2074 | { |
2075 | if (cpu_qos_cb != NULL) { |
2076 | cpu_qos_update = cpu_qos_cb; |
2077 | } else { |
2078 | cpu_qos_update = cpu_qos_cb_default; |
2079 | } |
2080 | } |
2081 | |
2082 | void |
2083 | thread_tell_urgency(thread_urgency_t urgency, uint64_t rt_period, uint64_t rt_deadline, uint64_t sched_latency __unused, __unused thread_t nthread) |
2084 | { |
2085 | SCHED_DEBUG_PLATFORM_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_URGENCY) | DBG_FUNC_START, urgency, rt_period, rt_deadline, sched_latency, 0); |
2086 | |
2087 | cpu_qos_update((int)urgency, rt_period, rt_deadline); |
2088 | |
2089 | SCHED_DEBUG_PLATFORM_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_URGENCY) | DBG_FUNC_END, urgency, rt_period, rt_deadline, 0, 0); |
2090 | } |
2091 | |
2092 | void |
2093 | machine_run_count(__unused uint32_t count) |
2094 | { |
2095 | } |
2096 | |
2097 | processor_t |
2098 | machine_choose_processor(__unused processor_set_t pset, processor_t processor) |
2099 | { |
2100 | return processor; |
2101 | } |
2102 | |
2103 | #if KASAN |
2104 | vm_offset_t ml_stack_base(void); |
2105 | vm_size_t ml_stack_size(void); |
2106 | |
2107 | vm_offset_t |
2108 | ml_stack_base(void) |
2109 | { |
2110 | uintptr_t local = (uintptr_t) &local; |
2111 | vm_offset_t intstack_top_ptr; |
2112 | |
2113 | intstack_top_ptr = getCpuDatap()->intstack_top; |
2114 | if ((local < intstack_top_ptr) && (local > intstack_top_ptr - INTSTACK_SIZE)) { |
2115 | return intstack_top_ptr - INTSTACK_SIZE; |
2116 | } else { |
2117 | return current_thread()->kernel_stack; |
2118 | } |
2119 | } |
2120 | vm_size_t |
2121 | ml_stack_size(void) |
2122 | { |
2123 | uintptr_t local = (uintptr_t) &local; |
2124 | vm_offset_t intstack_top_ptr; |
2125 | |
2126 | intstack_top_ptr = getCpuDatap()->intstack_top; |
2127 | if ((local < intstack_top_ptr) && (local > intstack_top_ptr - INTSTACK_SIZE)) { |
2128 | return INTSTACK_SIZE; |
2129 | } else { |
2130 | return kernel_stack_size; |
2131 | } |
2132 | } |
2133 | #endif |
2134 | |
2135 | #ifdef CONFIG_KCOV |
2136 | |
2137 | kcov_cpu_data_t * |
2138 | current_kcov_data(void) |
2139 | { |
return &current_cpu_datap()->cpu_kcov_data;
2141 | } |
2142 | |
2143 | kcov_cpu_data_t * |
2144 | cpu_kcov_data(int cpuid) |
2145 | { |
2146 | return &cpu_datap(cpuid)->cpu_kcov_data; |
2147 | } |
2148 | |
2149 | #endif /* CONFIG_KCOV */ |
2150 | |
2151 | boolean_t |
2152 | machine_timeout_suspended(void) |
2153 | { |
2154 | return FALSE; |
2155 | } |
2156 | |
2157 | kern_return_t |
2158 | ml_interrupt_prewarm(__unused uint64_t deadline) |
2159 | { |
2160 | return KERN_FAILURE; |
2161 | } |
2162 | |
2163 | /* |
2164 | * Assumes fiq, irq disabled. |
2165 | */ |
2166 | void |
2167 | ml_set_decrementer(uint32_t dec_value) |
2168 | { |
2169 | cpu_data_t *cdp = getCpuDatap(); |
2170 | |
2171 | assert(ml_get_interrupts_enabled() == FALSE); |
2172 | cdp->cpu_decrementer = dec_value; |
2173 | |
2174 | if (cdp->cpu_set_decrementer_func) { |
2175 | cdp->cpu_set_decrementer_func(dec_value); |
2176 | } else { |
2177 | __builtin_arm_wsr64("CNTV_TVAL_EL0" , (uint64_t)dec_value); |
2178 | } |
2179 | } |
2180 | |
2181 | /** |
2182 | * Perform a read of the timebase which is permitted to be executed |
2183 | * speculatively and/or out of program order. |
2184 | */ |
2185 | static inline uint64_t |
2186 | speculative_timebase(void) |
2187 | { |
2188 | return __builtin_arm_rsr64("CNTVCT_EL0" ); |
2189 | } |
2190 | |
2191 | /** |
2192 | * Read a non-speculative view of the timebase if one is available, |
* otherwise fall back on an ISB to prevent speculation and
2194 | * enforce ordering. |
2195 | */ |
2196 | static inline uint64_t |
2197 | nonspeculative_timebase(void) |
2198 | { |
2199 | #if defined(HAS_ACNTVCT) |
2200 | return __builtin_arm_rsr64("S3_4_c15_c10_6" ); |
2201 | #elif __ARM_ARCH_8_6__ |
2202 | return __builtin_arm_rsr64("CNTVCTSS_EL0" ); |
2203 | #else |
2204 | // ISB required by ARMV7C.b section B8.1.2 & ARMv8 section D6.1.2 |
2205 | // "Reads of CNT[PV]CT[_EL0] can occur speculatively and out of order relative |
2206 | // to other instructions executed on the same processor." |
2207 | __builtin_arm_isb(ISB_SY); |
2208 | return speculative_timebase(); |
2209 | #endif |
2210 | } |
2211 | |
2212 | |
2213 | uint64_t |
2214 | ml_get_hwclock() |
2215 | { |
2216 | uint64_t timebase = nonspeculative_timebase(); |
2217 | return timebase; |
2218 | } |
2219 | |
2220 | uint64_t |
2221 | ml_get_timebase() |
2222 | { |
2223 | uint64_t clock, timebase; |
2224 | |
2225 | //the retry is for the case where S2R catches us in the middle of this. see rdar://77019633 |
2226 | do { |
2227 | timebase = getCpuDatap()->cpu_base_timebase; |
2228 | os_compiler_barrier(); |
2229 | clock = ml_get_hwclock(); |
2230 | os_compiler_barrier(); |
2231 | } while (getCpuDatap()->cpu_base_timebase != timebase); |
2232 | |
2233 | return clock + timebase; |
2234 | } |
2235 | |
2236 | /** |
2237 | * Issue a barrier that guarantees all prior memory accesses will complete |
2238 | * before any subsequent timebase reads. |
2239 | */ |
2240 | void |
2241 | ml_memory_to_timebase_fence(void) |
2242 | { |
2243 | __builtin_arm_dmb(DMB_SY); |
2244 | const uint64_t take_backwards_branch = 0; |
2245 | asm volatile ( |
2246 | "1:" |
2247 | "ldr x0, [%[take_backwards_branch]]" "\n" |
2248 | "cbnz x0, 1b" "\n" |
2249 | : |
2250 | : [take_backwards_branch] "r" (&take_backwards_branch) |
2251 | : "x0" |
2252 | ); |
2253 | |
2254 | /* throwaway read to prevent ml_get_speculative_timebase() reordering */ |
2255 | (void)ml_get_hwclock(); |
2256 | } |
2257 | |
2258 | /** |
2259 | * Issue a barrier that guarantees all prior timebase reads will |
2260 | * be ordered before any subsequent memory accesses. |
2261 | */ |
2262 | void |
2263 | ml_timebase_to_memory_fence(void) |
2264 | { |
2265 | __builtin_arm_isb(ISB_SY); |
2266 | } |
2267 | |
2268 | /* |
2269 | * Get the speculative timebase without an ISB. |
2270 | */ |
2271 | uint64_t |
2272 | ml_get_speculative_timebase(void) |
2273 | { |
2274 | uint64_t clock, timebase; |
2275 | |
2276 | //the retry is for the case where S2R catches us in the middle of this. see rdar://77019633&77697482 |
2277 | do { |
2278 | timebase = getCpuDatap()->cpu_base_timebase; |
2279 | os_compiler_barrier(); |
2280 | clock = speculative_timebase(); |
2281 | |
2282 | os_compiler_barrier(); |
2283 | } while (getCpuDatap()->cpu_base_timebase != timebase); |
2284 | |
2285 | return clock + timebase; |
2286 | } |
2287 | |
2288 | uint64_t |
2289 | ml_get_timebase_entropy(void) |
2290 | { |
2291 | return ml_get_speculative_timebase(); |
2292 | } |
2293 | |
2294 | uint32_t |
2295 | ml_get_decrementer(void) |
2296 | { |
2297 | cpu_data_t *cdp = getCpuDatap(); |
2298 | uint32_t dec; |
2299 | |
2300 | assert(ml_get_interrupts_enabled() == FALSE); |
2301 | |
2302 | if (cdp->cpu_get_decrementer_func) { |
2303 | dec = cdp->cpu_get_decrementer_func(); |
2304 | } else { |
2305 | uint64_t wide_val; |
2306 | |
wide_val = __builtin_arm_rsr64("CNTV_TVAL_EL0");
2308 | dec = (uint32_t)wide_val; |
2309 | assert(wide_val == (uint64_t)dec); |
2310 | } |
2311 | |
2312 | return dec; |
2313 | } |
2314 | |
2315 | boolean_t |
2316 | ml_get_timer_pending(void) |
2317 | { |
uint64_t cntv_ctl = __builtin_arm_rsr64("CNTV_CTL_EL0");
2319 | return ((cntv_ctl & CNTV_CTL_EL0_ISTATUS) != 0) ? TRUE : FALSE; |
2320 | } |
2321 | |
2322 | __attribute__((noreturn)) |
2323 | void |
2324 | platform_syscall(arm_saved_state_t *state) |
2325 | { |
2326 | uint32_t code; |
2327 | |
2328 | #define platform_syscall_kprintf(x...) /* kprintf("platform_syscall: " x) */ |
2329 | |
code = (uint32_t)get_saved_state_reg(state, 3);
2331 | |
2332 | KDBG(MACHDBG_CODE(DBG_MACH_MACHDEP_EXCP_SC_ARM, code) | DBG_FUNC_START, |
2333 | get_saved_state_reg(state, 0), |
2334 | get_saved_state_reg(state, 1), |
2335 | get_saved_state_reg(state, 2)); |
2336 | |
2337 | switch (code) { |
2338 | case 2: |
2339 | /* set cthread */ |
2340 | platform_syscall_kprintf("set cthread self.\n" ); |
2341 | thread_set_cthread_self(get_saved_state_reg(iss: state, reg: 0)); |
2342 | break; |
2343 | case 3: |
2344 | /* get cthread */ |
2345 | platform_syscall_kprintf("get cthread self.\n" ); |
2346 | set_user_saved_state_reg(iss: state, reg: 0, value: thread_get_cthread_self()); |
2347 | break; |
2348 | case 0: /* I-Cache flush (removed) */ |
2349 | case 1: /* D-Cache flush (removed) */ |
2350 | default: |
2351 | platform_syscall_kprintf("unknown: %d\n" , code); |
2352 | break; |
2353 | } |
2354 | |
2355 | KDBG(MACHDBG_CODE(DBG_MACH_MACHDEP_EXCP_SC_ARM, code) | DBG_FUNC_END, |
2356 | get_saved_state_reg(state, 0)); |
2357 | |
2358 | thread_exception_return(); |
2359 | } |
2360 | |
2361 | static void |
2362 | _enable_timebase_event_stream(uint32_t bit_index) |
2363 | { |
2364 | uint64_t cntkctl; /* One wants to use 32 bits, but "mrs" prefers it this way */ |
2365 | |
2366 | if (bit_index >= 64) { |
2367 | panic("%s: invalid bit index (%u)" , __FUNCTION__, bit_index); |
2368 | } |
2369 | |
2370 | __asm__ volatile ("mrs %0, CNTKCTL_EL1" : "=r" (cntkctl)); |
2371 | |
2372 | cntkctl |= (bit_index << CNTKCTL_EL1_EVENTI_SHIFT); |
2373 | cntkctl |= CNTKCTL_EL1_EVNTEN; |
2374 | cntkctl |= CNTKCTL_EL1_EVENTDIR; /* 1->0; why not? */ |
2375 | |
2376 | /* |
2377 | * If the SOC supports it (and it isn't broken), enable |
2378 | * EL0 access to the timebase registers. |
2379 | */ |
2380 | if (user_timebase_type() != USER_TIMEBASE_NONE) { |
2381 | cntkctl |= (CNTKCTL_EL1_PL0PCTEN | CNTKCTL_EL1_PL0VCTEN); |
2382 | } |
2383 | |
2384 | __builtin_arm_wsr64("CNTKCTL_EL1" , cntkctl); |
2385 | } |
2386 | |
2387 | /* |
2388 | * Turn timer on, unmask that interrupt. |
2389 | */ |
2390 | static void |
2391 | _enable_virtual_timer(void) |
2392 | { |
2393 | uint64_t cntvctl = CNTV_CTL_EL0_ENABLE; /* One wants to use 32 bits, but "mrs" prefers it this way */ |
2394 | |
2395 | __builtin_arm_wsr64("CNTV_CTL_EL0" , cntvctl); |
2396 | /* disable the physical timer as a precaution, as its registers reset to architecturally unknown values */ |
2397 | __builtin_arm_wsr64("CNTP_CTL_EL0" , CNTP_CTL_EL0_IMASKED); |
2398 | } |
2399 | |
2400 | void |
2401 | fiq_context_init(boolean_t enable_fiq __unused) |
2402 | { |
2403 | /* Interrupts still disabled. */ |
2404 | assert(ml_get_interrupts_enabled() == FALSE); |
2405 | _enable_virtual_timer(); |
2406 | } |
2407 | |
2408 | void |
2409 | wfe_timeout_init(void) |
2410 | { |
_enable_timebase_event_stream(arm64_eventi);
2412 | } |
2413 | |
2414 | /** |
2415 | * Configures, but does not enable, the WFE event stream. The event stream |
2416 | * generates an event at a set interval to act as a timeout for WFEs. |
2417 | * |
2418 | * This function sets the static global variable arm64_eventi to be the proper |
2419 | * bit index for the CNTKCTL_EL1.EVENTI field to generate events at the correct |
2420 | * period (1us unless specified by the "wfe_events_sec" boot-arg). arm64_eventi |
2421 | * is used by wfe_timeout_init to actually poke the registers and enable the |
2422 | * event stream. |
2423 | * |
2424 | * The CNTKCTL_EL1.EVENTI field contains the index of the bit of CNTVCT_EL0 that |
2425 | * is the trigger for the system to generate an event. The trigger can occur on |
2426 | * either the rising or falling edge of the bit depending on the value of |
2427 | * CNTKCTL_EL1.EVNTDIR. This is arbitrary for our purposes, so we use the |
2428 | * falling edge (1->0) transition to generate events. |
2429 | */ |
2430 | void |
2431 | wfe_timeout_configure(void) |
2432 | { |
2433 | /* Could fill in our own ops here, if we needed them */ |
2434 | uint64_t ticks_per_sec, ticks_per_event, events_per_sec = 0; |
2435 | uint32_t bit_index; |
2436 | |
if (PE_parse_boot_argn("wfe_events_sec", &events_per_sec, sizeof(events_per_sec))) {
2438 | if (events_per_sec <= 0) { |
2439 | events_per_sec = 1; |
2440 | } else if (events_per_sec > USEC_PER_SEC) { |
2441 | events_per_sec = USEC_PER_SEC; |
2442 | } |
2443 | } else { |
2444 | events_per_sec = USEC_PER_SEC; |
2445 | } |
2446 | ticks_per_sec = gPEClockFrequencyInfo.timebase_frequency_hz; |
2447 | ticks_per_event = ticks_per_sec / events_per_sec; |
2448 | |
2449 | /* Bit index of next power of two greater than ticks_per_event */ |
bit_index = flsll(ticks_per_event) - 1;
/* Round up to the next power of two if ticks_per_event is not already a power of two */
2452 | if ((ticks_per_event & ((1 << bit_index) - 1)) != 0) { |
2453 | bit_index++; |
2454 | } |
2455 | |
2456 | /* |
2457 | * The timer can only trigger on rising or falling edge, not both; we don't |
2458 | * care which we trigger on, but we do need to adjust which bit we are |
2459 | * interested in to account for this. |
2460 | * |
2461 | * In particular, we set CNTKCTL_EL1.EVENTDIR to trigger events on the |
2462 | * falling edge of the given bit. Therefore, we must decrement the bit index |
2463 | * by one as when the bit before the one we care about makes a 1 -> 0 |
2464 | * transition, the bit we care about makes a 0 -> 1 transition. |
2465 | * |
2466 | * For example if we want an event generated every 8 ticks (if we calculated |
2467 | * a bit_index of 3), we would want the event to be generated whenever the |
2468 | * lower four bits of the counter transition from 0b0111 -> 0b1000. We can |
2469 | * see that the bit at index 2 makes a falling transition in this scenario, |
2470 | * so we would want EVENTI to be 2 instead of 3. |
2471 | */ |
2472 | if (bit_index != 0) { |
2473 | bit_index--; |
2474 | } |
2475 | |
2476 | arm64_eventi = bit_index; |
2477 | } |
2478 | |
2479 | boolean_t |
2480 | ml_delay_should_spin(uint64_t interval) |
2481 | { |
2482 | cpu_data_t *cdp = getCpuDatap(); |
2483 | |
2484 | if (cdp->cpu_idle_latency) { |
2485 | return (interval < cdp->cpu_idle_latency) ? TRUE : FALSE; |
2486 | } else { |
2487 | /* |
2488 | * Early boot, latency is unknown. Err on the side of blocking, |
2489 | * which should always be safe, even if slow |
2490 | */ |
2491 | return FALSE; |
2492 | } |
2493 | } |
2494 | |
2495 | boolean_t |
2496 | ml_thread_is64bit(thread_t thread) |
2497 | { |
2498 | return thread_is_64bit_addr(thread); |
2499 | } |
2500 | |
2501 | void |
2502 | ml_delay_on_yield(void) |
2503 | { |
2504 | #if DEVELOPMENT || DEBUG |
2505 | if (yield_delay_us) { |
2506 | delay(yield_delay_us); |
2507 | } |
2508 | #endif |
2509 | } |
2510 | |
2511 | void |
2512 | ml_timer_evaluate(void) |
2513 | { |
2514 | } |
2515 | |
2516 | boolean_t |
2517 | ml_timer_forced_evaluation(void) |
2518 | { |
2519 | return FALSE; |
2520 | } |
2521 | |
2522 | void |
2523 | ml_gpu_stat_update(__unused uint64_t gpu_ns_delta) |
2524 | { |
2525 | /* |
2526 | * For now: update the resource coalition stats of the |
2527 | * current thread's coalition |
2528 | */ |
task_coalition_update_gpu_stats(current_task(), gpu_ns_delta);
2530 | } |
2531 | |
2532 | uint64_t |
2533 | ml_gpu_stat(__unused thread_t t) |
2534 | { |
2535 | return 0; |
2536 | } |
2537 | |
2538 | thread_t |
2539 | current_thread(void) |
2540 | { |
2541 | return current_thread_fast(); |
2542 | } |
2543 | |
2544 | #if defined(HAS_APPLE_PAC) |
2545 | uint8_t |
2546 | ml_task_get_disable_user_jop(task_t task) |
2547 | { |
2548 | assert(task); |
2549 | return task->disable_user_jop; |
2550 | } |
2551 | |
2552 | void |
2553 | ml_task_set_disable_user_jop(task_t task, uint8_t disable_user_jop) |
2554 | { |
2555 | assert(task); |
2556 | task->disable_user_jop = disable_user_jop; |
2557 | } |
2558 | |
2559 | void |
2560 | ml_thread_set_disable_user_jop(thread_t thread, uint8_t disable_user_jop) |
2561 | { |
2562 | assert(thread); |
2563 | if (disable_user_jop) { |
2564 | thread->machine.arm_machine_flags |= ARM_MACHINE_THREAD_DISABLE_USER_JOP; |
2565 | } else { |
2566 | thread->machine.arm_machine_flags &= ~ARM_MACHINE_THREAD_DISABLE_USER_JOP; |
2567 | } |
2568 | } |
2569 | |
2570 | void |
2571 | ml_task_set_rop_pid(task_t task, task_t parent_task, boolean_t inherit) |
2572 | { |
2573 | if (inherit) { |
2574 | task->rop_pid = parent_task->rop_pid; |
2575 | } else { |
2576 | task->rop_pid = early_random(); |
2577 | } |
2578 | } |
2579 | |
2580 | /** |
2581 | * jop_pid may be inherited from the parent task or generated inside the shared |
2582 | * region. Unfortunately these two parameters are available at very different |
2583 | * times during task creation, so we need to split this into two steps. |
2584 | */ |
2585 | void |
2586 | ml_task_set_jop_pid(task_t task, task_t parent_task, boolean_t inherit, boolean_t disable_user_jop) |
2587 | { |
2588 | if (inherit) { |
2589 | task->jop_pid = parent_task->jop_pid; |
2590 | } else if (disable_user_jop) { |
2591 | task->jop_pid = ml_non_arm64e_user_jop_pid(); |
2592 | } else { |
2593 | task->jop_pid = ml_default_jop_pid(); |
2594 | } |
2595 | } |
2596 | |
2597 | void |
2598 | ml_task_set_jop_pid_from_shared_region(task_t task, boolean_t disable_user_jop) |
2599 | { |
2600 | if (disable_user_jop) { |
2601 | task->jop_pid = ml_non_arm64e_user_jop_pid(); |
2602 | return; |
2603 | } |
2604 | |
2605 | vm_shared_region_t sr = vm_shared_region_get(task); |
2606 | /* |
2607 | * If there's no shared region, we can assign the key arbitrarily. This |
2608 | * typically happens when Mach-O image activation failed part of the way |
2609 | * through, and this task is in the middle of dying with SIGKILL anyway. |
2610 | */ |
2611 | if (__improbable(!sr)) { |
2612 | task->jop_pid = early_random(); |
2613 | return; |
2614 | } |
vm_shared_region_deallocate(sr);
2616 | |
2617 | /* |
2618 | * Similarly we have to worry about jetsam having killed the task and |
2619 | * already cleared the shared_region_id. |
2620 | */ |
2621 | task_lock(task); |
2622 | if (task->shared_region_id != NULL) { |
task->jop_pid = shared_region_find_key(task->shared_region_id);
2624 | } else { |
2625 | task->jop_pid = early_random(); |
2626 | } |
2627 | task_unlock(task); |
2628 | } |
2629 | |
2630 | void |
2631 | ml_thread_set_jop_pid(thread_t thread, task_t task) |
2632 | { |
2633 | thread->machine.jop_pid = task->jop_pid; |
2634 | } |
2635 | #endif /* defined(HAS_APPLE_PAC) */ |
2636 | |
2637 | #if DEVELOPMENT || DEBUG |
2638 | static uint64_t minor_badness_suffered = 0; |
2639 | #endif |
2640 | void |
2641 | ml_report_minor_badness(uint32_t __unused badness_id) |
2642 | { |
2643 | #if DEVELOPMENT || DEBUG |
2644 | (void)os_atomic_or(&minor_badness_suffered, 1ULL << badness_id, relaxed); |
2645 | #endif |
2646 | } |
2647 | |
2648 | #if defined(HAS_APPLE_PAC) |
2649 | #if __ARM_ARCH_8_6__ || APPLEVIRTUALPLATFORM |
2650 | /** |
2651 | * The ARMv8.6 implementation is also safe for non-FPAC CPUs, but less efficient; |
* guest kernels need to use it because they do not know at compile time whether
2653 | * the host CPU supports FPAC. |
2654 | */ |
2655 | |
2656 | /** |
2657 | * Emulates the poisoning done by ARMv8.3-PAuth instructions on auth failure. |
2658 | */ |
2659 | static void * |
2660 | ml_poison_ptr(void *ptr, ptrauth_key key) |
2661 | { |
2662 | bool b_key = key & (1ULL << 0); |
2663 | uint64_t error_code; |
2664 | if (b_key) { |
2665 | error_code = 2; |
2666 | } else { |
2667 | error_code = 1; |
2668 | } |
2669 | |
2670 | bool kernel_pointer = (uintptr_t)ptr & (1ULL << 55); |
2671 | bool data_key = key & (1ULL << 1); |
2672 | /* When PAC is enabled, only userspace data pointers use TBI, regardless of boot parameters */ |
2673 | bool tbi = data_key && !kernel_pointer; |
2674 | unsigned int poison_shift; |
2675 | if (tbi) { |
2676 | poison_shift = 53; |
2677 | } else { |
2678 | poison_shift = 61; |
2679 | } |
2680 | |
2681 | uintptr_t poisoned = (uintptr_t)ptr; |
2682 | poisoned &= ~(3ULL << poison_shift); |
2683 | poisoned |= error_code << poison_shift; |
2684 | return (void *)poisoned; |
2685 | } |
2686 | |
2687 | /* |
2688 | * ptrauth_sign_unauthenticated() reimplemented using asm volatile, forcing the |
2689 | * compiler to assume this operation has side-effects and cannot be reordered |
2690 | */ |
2691 | #define ptrauth_sign_volatile(__value, __suffix, __data) \ |
2692 | ({ \ |
2693 | void *__ret = __value; \ |
2694 | asm volatile ( \ |
2695 | "pac" #__suffix " %[value], %[data]" \ |
2696 | : [value] "+r"(__ret) \ |
2697 | : [data] "r"(__data) \ |
2698 | ); \ |
2699 | __ret; \ |
2700 | }) |
2701 | |
2702 | #define ml_auth_ptr_unchecked_for_key(_ptr, _suffix, _key, _modifier) \ |
2703 | do { \ |
2704 | void *stripped = ptrauth_strip(_ptr, _key); \ |
2705 | void *reauthed = ptrauth_sign_volatile(stripped, _suffix, _modifier); \ |
2706 | if (__probable(_ptr == reauthed)) { \ |
2707 | _ptr = stripped; \ |
2708 | } else { \ |
2709 | _ptr = ml_poison_ptr(stripped, _key); \ |
2710 | } \ |
2711 | } while (0) |
2712 | |
2713 | #define _ml_auth_ptr_unchecked(_ptr, _suffix, _modifier) \ |
2714 | ml_auth_ptr_unchecked_for_key(_ptr, _suffix, ptrauth_key_as ## _suffix, _modifier) |
2715 | #else |
2716 | #define _ml_auth_ptr_unchecked(_ptr, _suffix, _modifier) \ |
2717 | asm volatile ("aut" #_suffix " %[ptr], %[modifier]" : [ptr] "+r"(_ptr) : [modifier] "r"(_modifier)); |
2718 | #endif /* __ARM_ARCH_8_6__ || APPLEVIRTUALPLATFORM */ |
2719 | |
2720 | /** |
2721 | * Authenticates a signed pointer without trapping on failure. |
2722 | * |
2723 | * @warning This function must be called with interrupts disabled. |
2724 | * |
2725 | * @warning Pointer authentication failure should normally be treated as a fatal |
2726 | * error. This function is intended for a handful of callers that cannot panic |
2727 | * on failure, and that understand the risks in handling a poisoned return |
2728 | * value. Other code should generally use the trapping variant |
2729 | * ptrauth_auth_data() instead. |
2730 | * |
2731 | * @param ptr the pointer to authenticate |
2732 | * @param key which key to use for authentication |
2733 | * @param modifier a modifier to mix into the key |
2734 | * @return an authenticated version of ptr, possibly with poison bits set |
2735 | */ |
2736 | void * |
2737 | ml_auth_ptr_unchecked(void *ptr, ptrauth_key key, uint64_t modifier) |
2738 | { |
2739 | switch (key & 0x3) { |
2740 | case ptrauth_key_asia: |
2741 | _ml_auth_ptr_unchecked(ptr, ia, modifier); |
2742 | break; |
2743 | case ptrauth_key_asib: |
2744 | _ml_auth_ptr_unchecked(ptr, ib, modifier); |
2745 | break; |
2746 | case ptrauth_key_asda: |
2747 | _ml_auth_ptr_unchecked(ptr, da, modifier); |
2748 | break; |
2749 | case ptrauth_key_asdb: |
2750 | _ml_auth_ptr_unchecked(ptr, db, modifier); |
2751 | break; |
2752 | } |
2753 | |
2754 | return ptr; |
2755 | } |
2756 | #endif /* defined(HAS_APPLE_PAC) */ |
2757 | |
2758 | #ifdef CONFIG_XNUPOST |
2759 | void |
2760 | ml_expect_fault_begin(expected_fault_handler_t expected_fault_handler, uintptr_t expected_fault_addr) |
2761 | { |
2762 | thread_t thread = current_thread(); |
2763 | thread->machine.expected_fault_handler = expected_fault_handler; |
2764 | thread->machine.expected_fault_addr = expected_fault_addr; |
2765 | thread->machine.expected_fault_pc = 0; |
2766 | } |
2767 | |
2768 | /** Expect an exception to be thrown at EXPECTED_FAULT_PC */ |
2769 | void |
2770 | ml_expect_fault_pc_begin(expected_fault_handler_t expected_fault_handler, uintptr_t expected_fault_pc) |
2771 | { |
2772 | thread_t thread = current_thread(); |
2773 | thread->machine.expected_fault_handler = expected_fault_handler; |
2774 | thread->machine.expected_fault_addr = 0; |
2775 | uintptr_t raw_func = (uintptr_t)ptrauth_strip( |
2776 | (void *)expected_fault_pc, |
2777 | ptrauth_key_function_pointer); |
2778 | thread->machine.expected_fault_pc = raw_func; |
2779 | } |
2780 | |
2781 | void |
2782 | ml_expect_fault_end(void) |
2783 | { |
2784 | thread_t thread = current_thread(); |
2785 | thread->machine.expected_fault_handler = NULL; |
2786 | thread->machine.expected_fault_addr = 0; |
2787 | thread->machine.expected_fault_pc = 0; |
2788 | } |
2789 | #endif /* CONFIG_XNUPOST */ |
2790 | |
2791 | void |
2792 | ml_hibernate_active_pre(void) |
2793 | { |
2794 | #if HIBERNATION |
2795 | if (kIOHibernateStateWakingFromHibernate == gIOHibernateState) { |
2796 | |
2797 | hibernate_rebuild_vm_structs(); |
2798 | } |
2799 | #endif /* HIBERNATION */ |
2800 | } |
2801 | |
2802 | void |
2803 | ml_hibernate_active_post(void) |
2804 | { |
2805 | #if HIBERNATION |
2806 | if (kIOHibernateStateWakingFromHibernate == gIOHibernateState) { |
2807 | hibernate_machine_init(); |
2808 | hibernate_vm_lock_end(); |
2809 | current_cpu_datap()->cpu_hibernate = 0; |
2810 | } |
2811 | #endif /* HIBERNATION */ |
2812 | } |
2813 | |
2814 | /** |
2815 | * Return back a machine-dependent array of address space regions that should be |
2816 | * reserved by the VM (pre-mapped in the address space). This will prevent user |
2817 | * processes from allocating or deallocating from within these regions. |
2818 | * |
2819 | * @param vm_is64bit True if the process has a 64-bit address space. |
2820 | * @param regions An out parameter representing an array of regions to reserve. |
2821 | * |
2822 | * @return The number of reserved regions returned through `regions`. |
2823 | */ |
2824 | size_t |
2825 | ml_get_vm_reserved_regions(bool vm_is64bit, const struct vm_reserved_region **regions) |
2826 | { |
2827 | assert(regions != NULL); |
2828 | |
2829 | /** |
2830 | * Reserved regions only apply to 64-bit address spaces. This is because |
2831 | * we only expect to grow the maximum user VA address on 64-bit address spaces |
2832 | * (we've essentially already reached the max for 32-bit spaces). The reserved |
2833 | * regions should safely fall outside of the max user VA for 32-bit processes. |
2834 | */ |
2835 | if (vm_is64bit) { |
2836 | *regions = vm_reserved_regions; |
2837 | return ARRAY_COUNT(vm_reserved_regions); |
2838 | } else { |
2839 | /* Don't reserve any VA regions on arm64_32 processes. */ |
2840 | *regions = NULL; |
2841 | return 0; |
2842 | } |
2843 | } |
2844 | |
2845 | /* These WFE recommendations are expected to be updated on a relatively |
2846 | * infrequent cadence, possibly from a different cluster, hence |
2847 | * false cacheline sharing isn't expected to be material |
2848 | */ |
2849 | static uint64_t arm64_cluster_wfe_recs[MAX_CPU_CLUSTERS]; |
2850 | |
2851 | uint32_t |
2852 | ml_update_cluster_wfe_recommendation(uint32_t wfe_cluster_id, uint64_t wfe_timeout_abstime_interval, __unused uint64_t wfe_hint_flags) |
2853 | { |
2854 | assert(wfe_cluster_id < MAX_CPU_CLUSTERS); |
2855 | assert(wfe_timeout_abstime_interval <= ml_wfe_hint_max_interval); |
2856 | os_atomic_store(&arm64_cluster_wfe_recs[wfe_cluster_id], wfe_timeout_abstime_interval, relaxed); |
2857 | return 0; /* Success */ |
2858 | } |
2859 | |
2860 | #if DEVELOPMENT || DEBUG |
2861 | int wfe_rec_max = 0; |
2862 | int wfe_rec_none = 0; |
2863 | uint64_t wfe_rec_override_mat = 0; |
2864 | uint64_t wfe_rec_clamp = 0; |
2865 | #endif |
2866 | |
2867 | uint64_t |
2868 | ml_cluster_wfe_timeout(uint32_t wfe_cluster_id) |
2869 | { |
/* This and its consumer do not synchronize vis-a-vis updates
2871 | * of the recommendation; races are acceptable. |
2872 | */ |
2873 | uint64_t wfet = os_atomic_load(&arm64_cluster_wfe_recs[wfe_cluster_id], relaxed); |
2874 | #if DEVELOPMENT || DEBUG |
2875 | if (wfe_rec_clamp) { |
2876 | wfet = MIN(wfe_rec_clamp, wfet); |
2877 | } |
2878 | |
2879 | if (wfe_rec_max) { |
2880 | for (int i = 0; i < MAX_CPU_CLUSTERS; i++) { |
2881 | if (arm64_cluster_wfe_recs[i] > wfet) { |
2882 | wfet = arm64_cluster_wfe_recs[i]; |
2883 | } |
2884 | } |
2885 | } |
2886 | |
2887 | if (wfe_rec_none) { |
2888 | wfet = 0; |
2889 | } |
2890 | |
2891 | if (wfe_rec_override_mat) { |
2892 | wfet = wfe_rec_override_mat; |
2893 | } |
2894 | #endif |
2895 | return wfet; |
2896 | } |
2897 | |
2898 | __pure2 bool |
2899 | ml_addr_in_non_xnu_stack(__unused uintptr_t addr) |
2900 | { |
2901 | #if CONFIG_SPTM |
2902 | /** |
2903 | * If the address is within one of the SPTM-allocated per-cpu stacks, then |
2904 | * return true. |
2905 | */ |
2906 | if ((addr >= SPTMArgs->cpu_stack_papt_start) && |
2907 | (addr < SPTMArgs->cpu_stack_papt_end)) { |
2908 | return true; |
2909 | } |
2910 | |
2911 | /** |
2912 | * If the address is within one of the TXM thread stacks, then return true. |
2913 | * The SPTM guarantees that these stacks are virtually contiguous. |
2914 | */ |
2915 | if ((addr >= SPTMArgs->txm_thread_stacks[0]) && |
2916 | (addr < SPTMArgs->txm_thread_stacks[MAX_CPUS - 1])) { |
2917 | return true; |
2918 | } |
2919 | |
2920 | return false; |
2921 | #elif XNU_MONITOR |
2922 | return (addr >= (uintptr_t)pmap_stacks_start) && (addr < (uintptr_t)pmap_stacks_end); |
2923 | #else |
2924 | return false; |
2925 | #endif /* CONFIG_SPTM || XNU_MONITOR */ |
2926 | } |
2927 | |
2928 | uint64_t |
2929 | ml_get_backtrace_pc(struct arm_saved_state *state) |
2930 | { |
2931 | assert((state != NULL) && is_saved_state64(state)); |
2932 | |
2933 | #if CONFIG_SPTM |
2934 | /** |
2935 | * On SPTM-based systems, when a non-XNU domain (e.g., SPTM) is interrupted, |
2936 | * the PC value saved into the state is not the actual PC at the interrupted |
2937 | * point, but a fixed value to a handler that knows how to re-enter the |
2938 | * interrupted domain. The interrupted domain's actual PC value is saved |
2939 | * into x14, so let's return that instead. |
2940 | */ |
2941 | if (ml_addr_in_non_xnu_stack(get_saved_state_fp(state))) { |
2942 | return saved_state64(state)->x[14]; |
2943 | } |
2944 | #endif /* CONFIG_SPTM */ |
2945 | |
return get_saved_state_pc(state);
2947 | } |
2948 | |
2949 | |
2950 | bool |
2951 | ml_paddr_is_exclaves_owned(vm_offset_t paddr) |
2952 | { |
2953 | #if CONFIG_SPTM |
2954 | const sptm_frame_type_t type = sptm_get_frame_type(paddr); |
2955 | return type == SK_DEFAULT || type == SK_IO; // SK_SHARED_R[OW] are not exclusively exclaves frames |
2956 | #else |
2957 | #pragma unused(paddr) |
2958 | return false; |
2959 | #endif /* CONFIG_SPTM */ |
2960 | } |
2961 | |
2962 | /** |
2963 | * Panic because an ARM saved-state accessor expected user saved-state but was |
2964 | * passed non-user saved-state. |
2965 | * |
2966 | * @param ss invalid saved-state (CPSR.M != EL0) |
2967 | */ |
2968 | void |
2969 | ml_panic_on_invalid_old_cpsr(const arm_saved_state_t *ss) |
2970 | { |
2971 | panic("invalid CPSR in user saved-state %p" , ss); |
2972 | } |
2973 | |
2974 | /** |
2975 | * Panic because an ARM saved-state accessor was passed user saved-state and |
2976 | * asked to assign a non-user CPSR. |
2977 | * |
2978 | * @param ss original EL0 saved-state |
2979 | * @param cpsr invalid new CPSR value (CPSR.M != EL0) |
2980 | */ |
2981 | void |
2982 | ml_panic_on_invalid_new_cpsr(const arm_saved_state_t *ss, uint32_t cpsr) |
2983 | { |
2984 | panic("attempt to set non-user CPSR %#010x on user saved-state %p" , cpsr, ss); |
2985 | } |
2986 | |
2987 | /** |
2988 | * Explicitly preallocates a floating point save area. |
2989 | * This is a noop on ARM because preallocation isn't required at this time. |
2990 | */ |
2991 | void |
2992 | ml_fp_save_area_prealloc(void) |
2993 | { |
2994 | } |
2995 | |