1/*
2 * Copyright (c) 2007-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <arm64/machine_machdep.h>
30#include <arm64/proc_reg.h>
31#include <arm/machine_cpu.h>
32#include <arm/cpu_internal.h>
33#include <arm/cpuid.h>
34#include <arm/cpu_data.h>
35#include <arm/cpu_data_internal.h>
36#include <arm/caches_internal.h>
37#include <arm/misc_protos.h>
38#include <arm/machdep_call.h>
39#include <arm/machine_routines.h>
40#include <arm/rtclock.h>
41#include <arm/cpuid_internal.h>
42#include <arm/cpu_capabilities.h>
43#include <console/serial_protos.h>
44#include <kern/machine.h>
45#include <kern/misc_protos.h>
46#include <prng/random.h>
47#include <kern/startup.h>
48#include <kern/thread.h>
49#include <kern/timer_queue.h>
50#include <mach/machine.h>
51#include <machine/atomic.h>
52#include <machine/config.h>
53#include <vm/pmap.h>
54#include <vm/vm_page.h>
55#include <vm/vm_shared_region.h>
56#include <vm/vm_map.h>
57#include <sys/codesign.h>
58#include <sys/kdebug.h>
59#include <kern/coalition.h>
60#include <pexpert/device_tree.h>
61
62#include <IOKit/IOPlatformExpert.h>
63#if HIBERNATION
64#include <IOKit/IOHibernatePrivate.h>
65#endif /* HIBERNATION */
66
67#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
68#include <arm64/amcc_rorgn.h>
69#endif
70
71
72#if CONFIG_SPTM
73#include <arm64/sptm/sptm.h>
74#endif /* CONFIG_SPTM */
75
76#include <libkern/section_keywords.h>
77
78/**
79 * On supported hardware, debuggable builds make the HID bits read-only
80 * without locking them. This lets people manually modify HID bits while
81 * debugging, since they can use a debugging tool to first reset the HID
 * bits back to read/write. However, it will still catch xnu changes that
83 * accidentally write to HID bits after they've been made read-only.
84 */
85SECURITY_READ_ONLY_LATE(bool) skip_spr_lockdown_glb = 0;
86
87/*
88 * On some SoCs, PIO lockdown is applied in assembly in early boot by
89 * secondary CPUs.
90 * Since the cluster_pio_ro_ctl value is dynamic, it is stored here by the
91 * primary CPU so that it doesn't have to be computed each time by the
92 * startup code.
93 */
94SECURITY_READ_ONLY_LATE(uint64_t) cluster_pio_ro_ctl_mask_glb = 0;
95
96#if CONFIG_CPU_COUNTERS
97#include <kern/kpc.h>
98#endif /* CONFIG_CPU_COUNTERS */
99
100#define MPIDR_CPU_ID(mpidr_el1_val) (((mpidr_el1_val) & MPIDR_AFF0_MASK) >> MPIDR_AFF0_SHIFT)
101#define MPIDR_CLUSTER_ID(mpidr_el1_val) (((mpidr_el1_val) & MPIDR_AFF1_MASK) >> MPIDR_AFF1_SHIFT)
102
103#if HAS_CLUSTER
104static uint8_t cluster_initialized = 0;
105#endif
106
107MACHINE_TIMEOUT_DEV_WRITEABLE(LockTimeOut, "lock", 6e6 /* 0.25s */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
108machine_timeout_t LockTimeOutUsec; // computed in ml_init_lock_timeout
109
110MACHINE_TIMEOUT_DEV_WRITEABLE(TLockTimeOut, "ticket-lock", 3e6 /* 0.125s */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
111
112MACHINE_TIMEOUT_DEV_WRITEABLE(MutexSpin, "mutex-spin", 240 /* 10us */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
113
114uint64_t low_MutexSpin;
115int64_t high_MutexSpin;
116
117
118
119static uint64_t ml_wfe_hint_max_interval;
120#define MAX_WFE_HINT_INTERVAL_US (500ULL)
121
122/* Must be less than cpu_idle_latency to ensure ml_delay_should_spin is true */
123TUNABLE(uint32_t, yield_delay_us, "yield_delay_us", 0);
124
125extern vm_offset_t segLOWEST;
126extern vm_offset_t segLOWESTTEXT;
127extern vm_offset_t segLASTB;
128extern unsigned long segSizeLAST;
129
130/* ARM64 specific bounds; used to test for presence in the kernelcache. */
131extern vm_offset_t vm_kernelcache_base;
132extern vm_offset_t vm_kernelcache_top;
133
134/* Location of the physmap / physical aperture */
135extern uint64_t physmap_base;
136
137#if defined(CONFIG_SPTM)
138extern const arm_physrange_t *arm_vm_kernelcache_ranges;
139extern int arm_vm_kernelcache_numranges;
140#else /* defined(CONFIG_SPTM) */
141extern vm_offset_t arm_vm_kernelcache_phys_start;
142extern vm_offset_t arm_vm_kernelcache_phys_end;
143#endif /* defined(CONFIG_SPTM) */
144
145#if defined(HAS_IPI)
146unsigned int gFastIPI = 1;
147#define kDeferredIPITimerDefault (64 * NSEC_PER_USEC) /* in nanoseconds */
148static TUNABLE_WRITEABLE(uint64_t, deferred_ipi_timer_ns, "fastipitimeout",
149 kDeferredIPITimerDefault);
150#endif /* defined(HAS_IPI) */
151
152thread_t Idle_context(void);
153
154SECURITY_READ_ONLY_LATE(bool) cpu_config_correct = true;
155
156SECURITY_READ_ONLY_LATE(static ml_topology_cpu_t) topology_cpu_array[MAX_CPUS];
157SECURITY_READ_ONLY_LATE(static ml_topology_cluster_t) topology_cluster_array[MAX_CPU_CLUSTERS];
158SECURITY_READ_ONLY_LATE(static ml_topology_info_t) topology_info = {
159 .version = CPU_TOPOLOGY_VERSION,
160 .cpus = topology_cpu_array,
161 .clusters = topology_cluster_array,
162};
163
164_Atomic unsigned int cluster_type_num_active_cpus[MAX_CPU_TYPES];
165
166/**
167 * Represents the offset of each cluster within a hypothetical array of MAX_CPUS
168 * entries of an arbitrary data type. This is intended for use by specialized consumers
169 * that must quickly access per-CPU data using only the physical CPU ID (MPIDR_EL1),
170 * as follows:
171 * hypothetical_array[cluster_offsets[AFF1] + AFF0]
172 * Most consumers should instead use general-purpose facilities such as PERCPU or
173 * ml_get_cpu_number().
174 */
175SECURITY_READ_ONLY_LATE(int64_t) cluster_offsets[MAX_CPU_CLUSTER_PHY_ID + 1];
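
/*
 * Illustrative sketch (not part of this file): a specialized consumer with a
 * hypothetical per-CPU array laid out as described above could locate its
 * entry from MPIDR_EL1 alone. The array name and element type below are
 * assumptions for illustration only.
 *
 *	uint64_t mpidr;
 *	MRS(mpidr, "MPIDR_EL1");
 *	unsigned int aff0 = MPIDR_CPU_ID(mpidr);
 *	unsigned int aff1 = MPIDR_CLUSTER_ID(mpidr);
 *	my_entry_t *entry = &hypothetical_array[cluster_offsets[aff1] + aff0];
 */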
176
177SECURITY_READ_ONLY_LATE(static uint32_t) arm64_eventi = UINT32_MAX;
178
179extern uint32_t lockdown_done;
180
181/**
182 * Represents regions of virtual address space that should be reserved
183 * (pre-mapped) in each user address space.
184 */
185static const struct vm_reserved_region vm_reserved_regions[] = {
186 {
187 .vmrr_name = "GPU Carveout",
188 .vmrr_addr = MACH_VM_MIN_GPU_CARVEOUT_ADDRESS,
189 .vmrr_size = (vm_map_size_t)(MACH_VM_MAX_GPU_CARVEOUT_ADDRESS - MACH_VM_MIN_GPU_CARVEOUT_ADDRESS)
190 },
191 /*
192 * Reserve the virtual memory space representing the commpage nesting region
193 * to prevent user processes from allocating memory within it. The actual
194 * page table entries for the commpage are inserted by vm_commpage_enter().
195 * This vm_map_enter() just prevents userspace from allocating/deallocating
196 * anything within the entire commpage nested region.
197 */
198 {
199 .vmrr_name = "commpage nesting",
200 .vmrr_addr = _COMM_PAGE64_NESTING_START,
201 .vmrr_size = _COMM_PAGE64_NESTING_SIZE
202 }
203};
204
205uint32_t get_arm_cpu_version(void);
206
207#if defined(HAS_IPI)
208static inline void
209ml_cpu_signal_type(unsigned int cpu_mpidr, uint32_t type)
210{
211#if HAS_CLUSTER
212 uint64_t local_mpidr;
213 /* NOTE: this logic expects that we are called in a non-preemptible
214 * context, or at least one in which the calling thread is bound
215 * to a single CPU. Otherwise we may migrate between choosing which
216 * IPI mechanism to use and issuing the IPI. */
217 MRS(local_mpidr, "MPIDR_EL1");
218 if (MPIDR_CLUSTER_ID(local_mpidr) == MPIDR_CLUSTER_ID(cpu_mpidr)) {
219 uint64_t x = type | MPIDR_CPU_ID(cpu_mpidr);
220 MSR("S3_5_C15_C0_0", x);
221 } else {
222 #define IPI_RR_TARGET_CLUSTER_SHIFT 16
223 uint64_t x = type | (MPIDR_CLUSTER_ID(cpu_mpidr) << IPI_RR_TARGET_CLUSTER_SHIFT) | MPIDR_CPU_ID(cpu_mpidr);
224 MSR("S3_5_C15_C0_1", x);
225 }
226#else
227 uint64_t x = type | MPIDR_CPU_ID(cpu_mpidr);
228 MSR("S3_5_C15_C0_1", x);
229#endif
230 /* The recommended local/global IPI sequence is:
231 * DSB <sys> (This ensures visibility of e.g. older stores to the
232 * pending CPU signals bit vector in DRAM prior to IPI reception,
233 * and is present in cpu_signal_internal())
234 * MSR S3_5_C15_C0_1, Xt
235 * ISB
236 */
237 __builtin_arm_isb(ISB_SY);
238}
239#endif
240
241#if !defined(HAS_IPI)
242__dead2
243#endif
244void
245ml_cpu_signal(unsigned int cpu_mpidr __unused)
246{
247#if defined(HAS_IPI)
248 ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_IMMEDIATE);
249#else
250 panic("Platform does not support ACC Fast IPI");
251#endif
252}
253
254#if !defined(HAS_IPI)
255__dead2
256#endif
257void
258ml_cpu_signal_deferred_adjust_timer(uint64_t nanosecs)
259{
260#if defined(HAS_IPI)
261 /* adjust IPI_CR timer countdown value for deferred IPI
262 * accepts input in nanosecs, convert to absolutetime (REFCLK ticks),
263 * clamp maximum REFCLK ticks to 0xFFFF (16 bit field)
264 *
265 * global register, should only require a single write to update all
266 * CPU cores: from Skye ACC user spec section 5.7.3.3
267 *
268 * IPICR is a global register but there are two copies in ACC: one at pBLK and one at eBLK.
269 * IPICR write SPR token also traverses both pCPM and eCPM rings and updates both copies.
270 */
271 uint64_t abstime;
272
273 nanoseconds_to_absolutetime(nanosecs, &abstime);
274
275 abstime = MIN(abstime, 0xFFFF);
276
277 /* update deferred_ipi_timer_ns with the new clamped value */
278 absolutetime_to_nanoseconds(abstime, &deferred_ipi_timer_ns);
279
280 MSR("S3_5_C15_C3_1", abstime);
281#else
282 (void)nanosecs;
283 panic("Platform does not support ACC Fast IPI");
284#endif
285}
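
/*
 * Worked example (assuming a 24 MHz REFCLK timebase, which is an assumption
 * and not guaranteed by this file): the default of 64 us converts to roughly
 * 64 * 24 = 1536 timebase ticks, comfortably below the 0xFFFF clamp. A caller
 * requesting roughly 2.7 ms or more would hit the clamp, and
 * deferred_ipi_timer_ns would be written back with the clamped, smaller value.
 */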
286
287uint64_t
288ml_cpu_signal_deferred_get_timer()
289{
290#if defined(HAS_IPI)
291 return deferred_ipi_timer_ns;
292#else
293 return 0;
294#endif
295}
296
297#if !defined(HAS_IPI)
298__dead2
299#endif
300void
301ml_cpu_signal_deferred(unsigned int cpu_mpidr __unused)
302{
303#if defined(HAS_IPI)
304 ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_DEFERRED);
305#else
306 panic("Platform does not support ACC Fast IPI deferral");
307#endif
308}
309
310#if !defined(HAS_IPI)
311__dead2
312#endif
313void
314ml_cpu_signal_retract(unsigned int cpu_mpidr __unused)
315{
316#if defined(HAS_IPI)
317 ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_RETRACT);
318#else
319 panic("Platform does not support ACC Fast IPI retraction");
320#endif
321}
322
323extern uint32_t idle_proximate_io_wfe_unmasked;
324
325#define CPUPM_IDLE_WFE 0x5310300
326static bool
327wfe_process_recommendation(void)
328{
329 bool ipending = false;
330 if (__probable(idle_proximate_io_wfe_unmasked == 1)) {
331 /* Check for an active perf. controller generated
332 * WFE recommendation for this cluster.
333 */
334 cpu_data_t *cdp = getCpuDatap();
335 uint32_t cid = cdp->cpu_cluster_id;
336 uint64_t wfe_ttd = 0;
337 uint64_t wfe_deadline = 0;
338
		if ((wfe_ttd = ml_cluster_wfe_timeout(cid)) != 0) {
340 wfe_deadline = mach_absolute_time() + wfe_ttd;
341 }
342
343 if (wfe_deadline != 0) {
344 /* Poll issuing event-bounded WFEs until an interrupt
345 * arrives or the WFE recommendation expires
346 */
347#if DEVELOPMENT || DEBUG
348 uint64_t wc = cdp->wfe_count;
349 KDBG(CPUPM_IDLE_WFE | DBG_FUNC_START, ipending, wc, wfe_ttd, cdp->cpu_stat.irq_ex_cnt_wake);
350#endif
351 /* Issue WFE until the recommendation expires,
352 * with IRQs unmasked.
353 */
354 ipending = wfe_to_deadline_or_interrupt(cid, wfe_deadline, cdp, true, true);
355#if DEVELOPMENT || DEBUG
356 KDBG(CPUPM_IDLE_WFE | DBG_FUNC_END, ipending, cdp->wfe_count - wc, wfe_deadline, cdp->cpu_stat.irq_ex_cnt_wake);
357#endif
358 }
359 }
360 return ipending;
361}
362
363void
364machine_idle(void)
365{
366 /* Interrupts are expected to be masked on entry or re-entry via
367 * Idle_load_context()
368 */
369 assert((__builtin_arm_rsr("DAIF") & (DAIF_IRQF | DAIF_FIQF)) == (DAIF_IRQF | DAIF_FIQF));
370 /* Check for, and act on, a WFE recommendation.
371 * Bypasses context spill/fill for a minor perf. increment.
372 * May unmask and restore IRQ+FIQ mask.
373 */
374 if (wfe_process_recommendation() == false) {
375 /* If WFE recommendation absent, or WFE deadline
376 * arrived with no interrupt pending/processed,
377 * fall back to WFI.
378 */
379 Idle_context();
380 }
381 __builtin_arm_wsr("DAIFClr", (DAIFSC_IRQF | DAIFSC_FIQF));
382}
383
384void
385OSSynchronizeIO(void)
386{
387 __builtin_arm_dsb(DSB_SY);
388}
389
390uint64_t
391get_aux_control(void)
392{
393 uint64_t value;
394
395 MRS(value, "ACTLR_EL1");
396 return value;
397}
398
399uint64_t
400get_mmu_control(void)
401{
402 uint64_t value;
403
404 MRS(value, "SCTLR_EL1");
405 return value;
406}
407
408uint64_t
409get_tcr(void)
410{
411 uint64_t value;
412
413 MRS(value, "TCR_EL1");
414 return value;
415}
416
417boolean_t
418ml_get_interrupts_enabled(void)
419{
420 uint64_t value;
421
422 MRS(value, "DAIF");
423 if (value & DAIF_IRQF) {
424 return FALSE;
425 }
426 return TRUE;
427}
428
429pmap_paddr_t
430get_mmu_ttb(void)
431{
432 pmap_paddr_t value;
433
434 MRS(value, "TTBR0_EL1");
435 return value;
436}
437
438uint32_t
439get_arm_cpu_version(void)
440{
441 uint32_t value = machine_read_midr();
442
443 /* Compose the register values into 8 bits; variant[7:4], revision[3:0]. */
444 return ((value & MIDR_EL1_REV_MASK) >> MIDR_EL1_REV_SHIFT) | ((value & MIDR_EL1_VAR_MASK) >> (MIDR_EL1_VAR_SHIFT - 4));
445}
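
/*
 * Illustrative sketch: a caller can split the composed value back into its
 * fields as follows (the local names are for the example only).
 *
 *	uint32_t version = get_arm_cpu_version();
 *	uint32_t variant = (version >> 4) & 0xF;	// MIDR_EL1 variant field
 *	uint32_t revision = version & 0xF;		// MIDR_EL1 revision field
 */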
446
447bool
448ml_feature_supported(uint32_t feature_bit)
449{
450 uint64_t aidr_el1_value = 0;
451
452 MRS(aidr_el1_value, "AIDR_EL1");
453
454#ifdef APPLEAVALANCHE
455#endif // APPLEAVALANCHE
456
457 return aidr_el1_value & feature_bit;
458}
459
460/*
461 * user_cont_hwclock_allowed()
462 *
463 * Indicates whether we allow EL0 to read the virtual timebase (CNTVCT_EL0)
464 * as a continuous time source (e.g. from mach_continuous_time)
465 */
466boolean_t
467user_cont_hwclock_allowed(void)
468{
469#if HAS_CONTINUOUS_HWCLOCK
470 return TRUE;
471#else
472 return FALSE;
473#endif
474}
475
476/*
477 * user_timebase_type()
478 *
479 * Indicates type of EL0 virtual timebase read (CNTVCT_EL0).
480 *
481 * USER_TIMEBASE_NONE: EL0 has no access to timebase register
482 * USER_TIMEBASE_SPEC: EL0 has access to speculative timebase reads (CNTVCT_EL0)
483 * USER_TIMEBASE_NOSPEC: EL0 has access to non speculative timebase reads (CNTVCTSS_EL0)
484 *
485 */
486
487uint8_t
488user_timebase_type(void)
489{
490#if HAS_ACNTVCT
491 return USER_TIMEBASE_NOSPEC_APPLE;
492#elif __ARM_ARCH_8_6__
493 return USER_TIMEBASE_NOSPEC;
494#else
495 return USER_TIMEBASE_SPEC;
496#endif
497}
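
/*
 * Illustrative sketch (assumptions noted): an EL0 reader that honors the
 * reported timebase type might look like the following. How the type value
 * reaches userspace (e.g. via the commpage) is outside this example, the
 * helper name is hypothetical, and CNTVCTSS_EL0 requires assembler support
 * for the FEAT_ECV register name.
 *
 *	static inline uint64_t
 *	read_user_timebase(uint8_t tb_type)
 *	{
 *		uint64_t val;
 *		if (tb_type == USER_TIMEBASE_SPEC) {
 *			__asm__ volatile ("isb; mrs %0, CNTVCT_EL0" : "=r"(val));
 *		} else {
 *			__asm__ volatile ("mrs %0, CNTVCTSS_EL0" : "=r"(val));
 *		}
 *		return val;
 *	}
 */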
498
499void
500machine_startup(__unused boot_args * args)
501{
502#if defined(HAS_IPI) && (DEVELOPMENT || DEBUG)
503 if (!PE_parse_boot_argn("fastipi", &gFastIPI, sizeof(gFastIPI))) {
504 gFastIPI = 1;
505 }
506#endif /* defined(HAS_IPI) && (DEVELOPMENT || DEBUG)*/
507
508
509 machine_conf();
510
511
512 /*
513 * Kick off the kernel bootstrap.
514 */
515 kernel_bootstrap();
516 /* NOTREACHED */
517}
518
519typedef void (*invalidate_fn_t)(void);
520
521static SECURITY_READ_ONLY_LATE(invalidate_fn_t) invalidate_hmac_function = NULL;
522
523void set_invalidate_hmac_function(invalidate_fn_t fn);
524
525void
526set_invalidate_hmac_function(invalidate_fn_t fn)
527{
528 if (NULL != invalidate_hmac_function) {
529 panic("Invalidate HMAC function already set");
530 }
531
532 invalidate_hmac_function = fn;
533}
534
535void
536machine_lockdown(void)
537{
538
539#if CONFIG_SPTM
540 /**
541 * On devices that make use of the SPTM, the SPTM is responsible for
542 * managing system register locks. Due to this, we skip the call to
543 * spr_lockdown() below.
544 */
545#else
546#endif
547
	arm_vm_prot_finalize(PE_state.bootArgs);
549
550#if CONFIG_KERNEL_INTEGRITY
551#if KERNEL_INTEGRITY_WT
552 /* Watchtower
553 *
554 * Notify the monitor about the completion of early kernel bootstrap.
555 * From this point forward it will enforce the integrity of kernel text,
556 * rodata and page tables.
557 */
558
559#ifdef MONITOR
560 monitor_call(MONITOR_LOCKDOWN, 0, 0, 0);
561#endif
562#endif /* KERNEL_INTEGRITY_WT */
563
564#if CONFIG_SPTM
565 extern void pmap_prepare_commpages(void);
566 pmap_prepare_commpages();
567
568 /**
569 * sptm_lockdown_xnu() disables preemption like all SPTM calls, but may take
570 * a fair amount of time as it involves retyping a large number of pages.
571 * This preemption latency is not really a concern since we're still fairly
572 * early in the boot process, so just explicitly disable preemption before
573 * invoking the SPTM and abandon preemption latency measurements before
574 * re-enabling it.
575 */
576 disable_preemption();
577 /* Signal the SPTM that XNU is ready for RO memory to actually become read-only */
578 sptm_lockdown_xnu();
579#if SCHED_HYGIENE_DEBUG
580 abandon_preemption_disable_measurement();
581#endif /* SCHED_HYGIENE_DEBUG */
582 enable_preemption();
583#else
584#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
585 /* KTRR
586 *
587 * Lock physical KTRR region. KTRR region is read-only. Memory outside
588 * the region is not executable at EL1.
589 */
590
591 rorgn_lockdown();
592#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
593#endif /* CONFIG_SPTM */
594
595#if XNU_MONITOR
596 pmap_lockdown_ppl();
597#endif
598
599#endif /* CONFIG_KERNEL_INTEGRITY */
600
601
602 if (NULL != invalidate_hmac_function) {
603 invalidate_hmac_function();
604 }
605
606 lockdown_done = 1;
607}
608
609
610char *
611machine_boot_info(
612 __unused char *buf,
613 __unused vm_size_t size)
614{
615 return PE_boot_args();
616}
617
618void
619slave_machine_init(__unused void *param)
620{
621 cpu_machine_init(); /* Initialize the processor */
622 clock_init(); /* Init the clock */
623}
624
625/*
626 * Routine: machine_processor_shutdown
627 * Function:
628 */
629thread_t
630machine_processor_shutdown(
631 __unused thread_t thread,
632 void (*doshutdown)(processor_t),
633 processor_t processor)
634{
635 return Shutdown_context(doshutdown, processor);
636}
637
638/*
639 * Routine: ml_init_lock_timeout
640 * Function:
641 */
642static void __startup_func
643ml_init_lock_timeout(void)
644{
	/*
	 * This function is called after STARTUP_SUB_TIMEOUTS
	 * initialization, so using the "legacy" boot-args here overrides
	 * the ml-timeout-... configuration. (Given that these boot-args
	 * are usually explicitly specified, it makes sense for them to
	 * override ml-timeout-..., which may come from the device tree.)
	 */
652
653 uint64_t lto_timeout_ns;
654 uint64_t lto_abstime;
655 uint32_t slto;
656
	if (PE_parse_boot_argn("slto_us", &slto, sizeof(slto))) {
		lto_timeout_ns = slto * NSEC_PER_USEC;
		nanoseconds_to_absolutetime(lto_timeout_ns, &lto_abstime);
660 os_atomic_store(&LockTimeOut, lto_abstime, relaxed);
661 } else {
662 lto_abstime = os_atomic_load(&LockTimeOut, relaxed);
		absolutetime_to_nanoseconds(lto_abstime, &lto_timeout_ns);
664 }
665
666 os_atomic_store(&LockTimeOutUsec, lto_timeout_ns / NSEC_PER_USEC, relaxed);
667
	if (PE_parse_boot_argn("tlto_us", &slto, sizeof(slto))) {
		nanoseconds_to_absolutetime(slto * NSEC_PER_USEC, &lto_abstime);
670 os_atomic_store(&TLockTimeOut, lto_abstime, relaxed);
671 } else if (lto_abstime != 0) {
672 os_atomic_store(&TLockTimeOut, lto_abstime >> 1, relaxed);
673 } // else take default from MACHINE_TIMEOUT.
674
675 uint64_t mtxspin;
676 uint64_t mtx_abstime;
	if (PE_parse_boot_argn("mtxspin", &mtxspin, sizeof(mtxspin))) {
		if (mtxspin > USEC_PER_SEC >> 4) {
			mtxspin = USEC_PER_SEC >> 4;
		}
		nanoseconds_to_absolutetime(mtxspin * NSEC_PER_USEC, &mtx_abstime);
682 os_atomic_store(&MutexSpin, mtx_abstime, relaxed);
683 } else {
684 mtx_abstime = os_atomic_load(&MutexSpin, relaxed);
685 }
686
687 low_MutexSpin = os_atomic_load(&MutexSpin, relaxed);
688 /*
689 * high_MutexSpin should be initialized as low_MutexSpin * real_ncpus, but
690 * real_ncpus is not set at this time
691 *
	 * NOTE: active spinning is disabled on ARM. It can be activated
693 * by setting high_MutexSpin through the sysctl.
694 */
695 high_MutexSpin = low_MutexSpin;
696
697 uint64_t maxwfeus = MAX_WFE_HINT_INTERVAL_US;
	PE_parse_boot_argn("max_wfe_us", &maxwfeus, sizeof(maxwfeus));
	nanoseconds_to_absolutetime(maxwfeus * NSEC_PER_USEC, &ml_wfe_hint_max_interval);
700}
701STARTUP(TIMEOUTS, STARTUP_RANK_MIDDLE, ml_init_lock_timeout);
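
/*
 * Worked example (assuming the common 24 MHz timebase, which this file does
 * not guarantee): booting with "slto_us=250000" yields 250000 us = 250 ms,
 * i.e. 250000 * 24 = 6,000,000 timebase ticks, matching the LockTimeOut
 * default above; TLockTimeOut then defaults to half of that value.
 */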
702
703
704/*
705 * This is called when all of the ml_processor_info_t structures have been
706 * initialized and all the processors have been started through processor_start().
707 *
708 * Required by the scheduler subsystem.
709 */
710void
711ml_cpu_init_completed(void)
712{
713 if (SCHED(cpu_init_completed) != NULL) {
714 SCHED(cpu_init_completed)();
715 }
716}
717
718/*
719 * These are called from the machine-independent routine cpu_up()
720 * to perform machine-dependent info updates.
721 *
722 * The update to CPU counts needs to be separate from other actions
723 * because we don't update the counts when CLPC causes temporary
724 * cluster powerdown events, as these must be transparent to the user.
725 */
726void
727ml_cpu_up(void)
728{
729}
730
731void
732ml_cpu_up_update_counts(int cpu_id)
733{
734 ml_topology_cpu_t *cpu = &ml_get_topology_info()->cpus[cpu_id];
735
736 os_atomic_inc(&cluster_type_num_active_cpus[cpu->cluster_type], relaxed);
737
738 os_atomic_inc(&machine_info.physical_cpu, relaxed);
739 os_atomic_inc(&machine_info.logical_cpu, relaxed);
740}
741
742/*
743 * These are called from the machine-independent routine cpu_down()
744 * to perform machine-dependent info updates.
745 *
746 * The update to CPU counts needs to be separate from other actions
747 * because we don't update the counts when CLPC causes temporary
748 * cluster powerdown events, as these must be transparent to the user.
749 */
750void
751ml_cpu_down(void)
752{
753 /*
754 * If we want to deal with outstanding IPIs, we need to
	 * do so relatively early in the processor_doshutdown path,
756 * as we pend decrementer interrupts using the IPI
757 * mechanism if we cannot immediately service them (if
758 * IRQ is masked). Do so now.
759 *
760 * We aren't on the interrupt stack here; would it make
761 * more sense to disable signaling and then enable
762 * interrupts? It might be a bit cleaner.
763 */
764 cpu_data_t *cpu_data_ptr = getCpuDatap();
765 cpu_data_ptr->cpu_running = FALSE;
766
767 if (cpu_data_ptr != &BootCpuData) {
768 /*
769 * Move all of this cpu's timers to the master/boot cpu,
770 * and poke it in case there's a sooner deadline for it to schedule.
771 */
		timer_queue_shutdown(&cpu_data_ptr->rtclock_timer.queue);
773 kern_return_t rv = cpu_xcall(BootCpuData.cpu_number, &timer_queue_expire_local, &ml_cpu_down);
774 if (rv != KERN_SUCCESS) {
775 panic("ml_cpu_down: IPI failure %d", rv);
776 }
777 }
778
779 cpu_signal_handler_internal(TRUE);
780}
781void
782ml_cpu_down_update_counts(int cpu_id)
783{
784 ml_topology_cpu_t *cpu = &ml_get_topology_info()->cpus[cpu_id];
785
786 os_atomic_dec(&cluster_type_num_active_cpus[cpu->cluster_type], relaxed);
787
788 os_atomic_dec(&machine_info.physical_cpu, relaxed);
789 os_atomic_dec(&machine_info.logical_cpu, relaxed);
790}
791
792
793unsigned int
794ml_get_machine_mem(void)
795{
796 return machine_info.memory_size;
797}
798
799__attribute__((noreturn))
800void
801halt_all_cpus(boolean_t reboot)
802{
	if (reboot) {
		printf("MACH Reboot\n");
		PEHaltRestart(kPERestartCPU);
	} else {
		printf("CPU halted\n");
		PEHaltRestart(kPEHaltCPU);
	}
810 while (1) {
811 ;
812 }
813}
814
815__attribute__((noreturn))
816void
817halt_cpu(void)
818{
819 halt_all_cpus(FALSE);
820}
821
822/*
823 * Routine: machine_signal_idle
824 * Function:
825 */
826void
827machine_signal_idle(
828 processor_t processor)
829{
	cpu_signal(processor_to_cpu_datap(processor), SIGPnop, (void *)NULL, (void *)NULL);
831 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0);
832}
833
834void
835machine_signal_idle_deferred(
836 processor_t processor)
837{
	cpu_signal_deferred(processor_to_cpu_datap(processor));
839 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_DEFERRED_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0);
840}
841
842void
843machine_signal_idle_cancel(
844 processor_t processor)
845{
	cpu_signal_cancel(processor_to_cpu_datap(processor));
847 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_CANCEL_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0);
848}
849
850/*
851 * Routine: ml_install_interrupt_handler
852 * Function: Initialize Interrupt Handler
853 */
854void
855ml_install_interrupt_handler(
856 void *nub,
857 int source,
858 void *target,
859 IOInterruptHandler handler,
860 void *refCon)
861{
862 cpu_data_t *cpu_data_ptr;
863 boolean_t current_state;
864
865 current_state = ml_set_interrupts_enabled(FALSE);
866 cpu_data_ptr = getCpuDatap();
867
868 cpu_data_ptr->interrupt_nub = nub;
869 cpu_data_ptr->interrupt_source = source;
870 cpu_data_ptr->interrupt_target = target;
871 cpu_data_ptr->interrupt_handler = handler;
872 cpu_data_ptr->interrupt_refCon = refCon;
873
	(void) ml_set_interrupts_enabled(current_state);
875}
876
877/*
878 * Routine: ml_init_interrupt
879 * Function: Initialize Interrupts
880 */
881void
882ml_init_interrupt(void)
883{
884#if defined(HAS_IPI)
	/*
	 * ml_init_interrupt will get called once for each CPU, but this is
	 * redundant because there is only one global copy of the register
	 * for Skye. Do it only on the bootstrap CPU.
	 */
890 if (getCpuDatap()->cluster_master) {
891 ml_cpu_signal_deferred_adjust_timer(deferred_ipi_timer_ns);
892 }
893#endif
894}
895
896/*
897 * Routine: ml_init_timebase
 * Function: register and set up Timebase, Decrementer services
899 */
900void
901ml_init_timebase(
902 void *args,
903 tbd_ops_t tbd_funcs,
904 vm_offset_t int_address,
905 vm_offset_t int_value __unused)
906{
907 cpu_data_t *cpu_data_ptr;
908
909 cpu_data_ptr = (cpu_data_t *)args;
910
911 if ((cpu_data_ptr == &BootCpuData)
912 && (rtclock_timebase_func.tbd_fiq_handler == (void *)NULL)) {
913 rtclock_timebase_func = *tbd_funcs;
914 rtclock_timebase_addr = int_address;
915 }
916}
917
918#define ML_READPROP_MANDATORY UINT64_MAX
919
920static uint64_t
921ml_readprop(const DTEntry entry, const char *propertyName, uint64_t default_value)
922{
923 void const *prop;
924 unsigned int propSize;
925
	if (SecureDTGetProperty(entry, propertyName, &prop, &propSize) == kSuccess) {
927 if (propSize == sizeof(uint8_t)) {
928 return *((uint8_t const *)prop);
929 } else if (propSize == sizeof(uint16_t)) {
930 return *((uint16_t const *)prop);
931 } else if (propSize == sizeof(uint32_t)) {
932 return *((uint32_t const *)prop);
933 } else if (propSize == sizeof(uint64_t)) {
934 return *((uint64_t const *)prop);
935 } else {
936 panic("CPU property '%s' has bad size %u", propertyName, propSize);
937 }
938 } else {
939 if (default_value == ML_READPROP_MANDATORY) {
940 panic("Missing mandatory property '%s'", propertyName);
941 }
942 return default_value;
943 }
944}
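
/*
 * Illustrative usage (the first property name is hypothetical): read an
 * optional per-CPU device tree property with a fallback default, or require
 * one outright.
 *
 *	uint32_t budget = (uint32_t)ml_readprop(child, "example-budget", 0);
 *	uint64_t reg = ml_readprop(child, "reg", ML_READPROP_MANDATORY);
 */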
945
946static boolean_t
947ml_read_reg_range(const DTEntry entry, const char *propertyName, uint64_t *pa_ptr, uint64_t *len_ptr)
948{
949 uint64_t const *prop;
950 unsigned int propSize;
951
	if (SecureDTGetProperty(entry, propertyName, (void const **)&prop, &propSize) != kSuccess) {
953 return FALSE;
954 }
955
956 if (propSize != sizeof(uint64_t) * 2) {
957 panic("Wrong property size for %s", propertyName);
958 }
959
960 *pa_ptr = prop[0];
961 *len_ptr = prop[1];
962 return TRUE;
963}
964
965static boolean_t
966ml_is_boot_cpu(const DTEntry entry)
967{
968 void const *prop;
969 unsigned int propSize;
970
	if (SecureDTGetProperty(entry, "state", &prop, &propSize) != kSuccess) {
972 panic("unable to retrieve state for cpu");
973 }
974
	if (strncmp((char const *)prop, "running", propSize) == 0) {
976 return TRUE;
977 } else {
978 return FALSE;
979 }
980}
981
982static void
983ml_read_chip_revision(unsigned int *rev __unused)
984{
985 // The CPU_VERSION_* macros are only defined on APPLE_ARM64_ARCH_FAMILY builds
986#ifdef APPLE_ARM64_ARCH_FAMILY
987 DTEntry entryP;
988
	if ((SecureDTFindEntry("name", "arm-io", &entryP) == kSuccess)) {
		*rev = (unsigned int)ml_readprop(entryP, "chip-revision", CPU_VERSION_UNKNOWN);
991 } else {
992 *rev = CPU_VERSION_UNKNOWN;
993 }
994#endif
995}
996
997void
998ml_parse_cpu_topology(void)
999{
1000 DTEntry entry, child __unused;
1001 OpaqueDTEntryIterator iter;
1002 uint32_t cpu_boot_arg = MAX_CPUS;
1003 uint64_t cpumask_boot_arg = ULLONG_MAX;
1004 int err;
1005
1006 int64_t cluster_phys_to_logical[MAX_CPU_CLUSTER_PHY_ID + 1];
1007 int64_t cluster_max_cpu_phys_id[MAX_CPU_CLUSTER_PHY_ID + 1];
	const boolean_t cpus_boot_arg_present = PE_parse_boot_argn("cpus", &cpu_boot_arg, sizeof(cpu_boot_arg));
	const boolean_t cpumask_boot_arg_present = PE_parse_boot_argn("cpumask", &cpumask_boot_arg, sizeof(cpumask_boot_arg));
1010
1011 // The cpus=N and cpumask=N boot args cannot be used simultaneously. Flag this
1012 // so that we trigger a panic later in the boot process, once serial is enabled.
1013 if (cpus_boot_arg_present && cpumask_boot_arg_present) {
1014 cpu_config_correct = false;
1015 }
1016
	err = SecureDTLookupEntry(NULL, "/cpus", &entry);
1018 assert(err == kSuccess);
1019
	err = SecureDTInitEntryIterator(entry, &iter);
1021 assert(err == kSuccess);
1022
1023 for (int i = 0; i <= MAX_CPU_CLUSTER_PHY_ID; i++) {
1024 cluster_offsets[i] = -1;
1025 cluster_phys_to_logical[i] = -1;
1026 cluster_max_cpu_phys_id[i] = 0;
1027 }
1028
	while (kSuccess == SecureDTIterateEntries(&iter, &child)) {
		boolean_t is_boot_cpu = ml_is_boot_cpu(child);
1031 boolean_t cpu_enabled = cpumask_boot_arg & 1;
1032 cpumask_boot_arg >>= 1;
1033
1034 // Boot CPU disabled in cpumask. Flag this so that we trigger a panic
1035 // later in the boot process, once serial is enabled.
1036 if (is_boot_cpu && !cpu_enabled) {
1037 cpu_config_correct = false;
1038 }
1039
1040 // Ignore this CPU if it has been disabled by the cpumask= boot-arg.
1041 if (!is_boot_cpu && !cpu_enabled) {
1042 continue;
1043 }
1044
1045 // If the number of CPUs is constrained by the cpus= boot-arg, and the boot CPU hasn't
1046 // been added to the topology struct yet, and we only have one slot left, then skip
1047 // every other non-boot CPU in order to leave room for the boot CPU.
1048 //
1049 // e.g. if the boot-args say "cpus=3" and CPU4 is the boot CPU, then the cpus[]
1050 // array will list CPU0, CPU1, and CPU4. CPU2-CPU3 and CPU5-CPUn will be omitted.
1051 if (topology_info.num_cpus >= (cpu_boot_arg - 1) && topology_info.boot_cpu == NULL && !is_boot_cpu) {
1052 continue;
1053 }
1054 if (topology_info.num_cpus >= cpu_boot_arg) {
1055 break;
1056 }
1057
1058 ml_topology_cpu_t *cpu = &topology_info.cpus[topology_info.num_cpus];
1059
1060 cpu->cpu_id = topology_info.num_cpus++;
1061 assert(cpu->cpu_id < MAX_CPUS);
1062 topology_info.max_cpu_id = MAX(topology_info.max_cpu_id, cpu->cpu_id);
1063
1064 cpu->die_id = 0;
1065 topology_info.max_die_id = 0;
1066
		cpu->phys_id = (uint32_t)ml_readprop(child, "reg", ML_READPROP_MANDATORY);

		cpu->l2_access_penalty = (uint32_t)ml_readprop(child, "l2-access-penalty", 0);
		cpu->l2_cache_size = (uint32_t)ml_readprop(child, "l2-cache-size", 0);
		cpu->l2_cache_id = (uint32_t)ml_readprop(child, "l2-cache-id", 0);
		cpu->l3_cache_size = (uint32_t)ml_readprop(child, "l3-cache-size", 0);
		cpu->l3_cache_id = (uint32_t)ml_readprop(child, "l3-cache-id", 0);

		ml_read_reg_range(child, "cpu-uttdbg-reg", &cpu->cpu_UTTDBG_pa, &cpu->cpu_UTTDBG_len);
		ml_read_reg_range(child, "cpu-impl-reg", &cpu->cpu_IMPL_pa, &cpu->cpu_IMPL_len);
		ml_read_reg_range(child, "coresight-reg", &cpu->coresight_pa, &cpu->coresight_len);
1078 cpu->cluster_type = CLUSTER_TYPE_SMP;
1079
		int cluster_type = (int)ml_readprop(child, "cluster-type", 0);
1081 if (cluster_type == 'E') {
1082 cpu->cluster_type = CLUSTER_TYPE_E;
1083 } else if (cluster_type == 'P') {
1084 cpu->cluster_type = CLUSTER_TYPE_P;
1085 }
1086
1087 topology_info.cluster_type_num_cpus[cpu->cluster_type]++;
1088
1089 /*
1090 * Since we want to keep a linear cluster ID space, we cannot just rely
1091 * on the value provided by EDT. Instead, use the MPIDR value to see if we have
1092 * seen this exact cluster before. If so, then reuse that cluster ID for this CPU.
1093 */
1094#if HAS_CLUSTER
1095 uint32_t phys_cluster_id = MPIDR_CLUSTER_ID(cpu->phys_id);
1096#else
1097 uint32_t phys_cluster_id = (cpu->cluster_type == CLUSTER_TYPE_P);
1098#endif
1099 assert(phys_cluster_id <= MAX_CPU_CLUSTER_PHY_ID);
1100 cpu->cluster_id = ((cluster_phys_to_logical[phys_cluster_id] == -1) ?
1101 topology_info.num_clusters : cluster_phys_to_logical[phys_cluster_id]);
1102
1103 assert(cpu->cluster_id < MAX_CPU_CLUSTERS);
1104
1105 ml_topology_cluster_t *cluster = &topology_info.clusters[cpu->cluster_id];
1106 if (cluster->num_cpus == 0) {
1107 assert(topology_info.num_clusters < MAX_CPU_CLUSTERS);
1108
1109 topology_info.num_clusters++;
1110 topology_info.max_cluster_id = MAX(topology_info.max_cluster_id, cpu->cluster_id);
1111 topology_info.cluster_types |= (1 << cpu->cluster_type);
1112
1113 cluster->cluster_id = cpu->cluster_id;
1114 cluster->cluster_type = cpu->cluster_type;
1115 cluster->first_cpu_id = cpu->cpu_id;
1116 assert(cluster_phys_to_logical[phys_cluster_id] == -1);
1117 cluster_phys_to_logical[phys_cluster_id] = cpu->cluster_id;
1118
1119 topology_info.cluster_type_num_clusters[cluster->cluster_type]++;
1120
1121 // Since we don't have a per-cluster EDT node, this is repeated in each CPU node.
1122 // If we wind up with a bunch of these, we might want to create separate per-cluster
1123 // EDT nodes and have the CPU nodes reference them through a phandle.
			ml_read_reg_range(child, "acc-impl-reg", &cluster->acc_IMPL_pa, &cluster->acc_IMPL_len);
			ml_read_reg_range(child, "cpm-impl-reg", &cluster->cpm_IMPL_pa, &cluster->cpm_IMPL_len);
1126 }
1127
1128#if HAS_CLUSTER
1129 if (MPIDR_CPU_ID(cpu->phys_id) > cluster_max_cpu_phys_id[phys_cluster_id]) {
1130 cluster_max_cpu_phys_id[phys_cluster_id] = MPIDR_CPU_ID(cpu->phys_id);
1131 }
1132#endif
1133
		cpu->die_cluster_id = (int)ml_readprop(child, "die-cluster-id", MPIDR_CLUSTER_ID(cpu->phys_id));
		cpu->cluster_core_id = (int)ml_readprop(child, "cluster-core-id", MPIDR_CPU_ID(cpu->phys_id));
1136
1137 cluster->num_cpus++;
1138 cluster->cpu_mask |= 1ULL << cpu->cpu_id;
1139
1140 if (is_boot_cpu) {
1141 assert(topology_info.boot_cpu == NULL);
1142 topology_info.boot_cpu = cpu;
1143 topology_info.boot_cluster = cluster;
1144 }
1145
1146#if CONFIG_SPTM
1147 sptm_register_cpu(cpu->phys_id);
1148#endif
1149 }
1150
1151#if HAS_CLUSTER
1152 /*
1153 * Build the cluster offset array, ensuring that the region reserved
1154 * for each physical cluster contains enough entries to be indexed
1155 * by the maximum physical CPU ID (AFF0) within the cluster.
1156 */
1157 unsigned int cur_cluster_offset = 0;
1158 for (int i = 0; i <= MAX_CPU_CLUSTER_PHY_ID; i++) {
1159 if (cluster_phys_to_logical[i] != -1) {
1160 cluster_offsets[i] = cur_cluster_offset;
1161 cur_cluster_offset += (cluster_max_cpu_phys_id[i] + 1);
1162 }
1163 }
1164 assert(cur_cluster_offset <= MAX_CPUS);
1165#else
1166 /*
1167 * For H10, there are really 2 physical clusters, but they are not separated
1168 * into distinct ACCs. AFF1 therefore always reports 0, and AFF0 numbering
1169 * is linear across both clusters. For the purpose of MPIDR_EL1-based indexing,
1170 * treat H10 and earlier devices as though they contain a single cluster.
1171 */
1172 cluster_offsets[0] = 0;
1173#endif
1174 assert(topology_info.boot_cpu != NULL);
	ml_read_chip_revision(&topology_info.chip_revision);
1176
1177 /*
1178 * Set TPIDR_EL0 to indicate the correct cpu number & cluster id,
1179 * as we may not be booting from cpu 0. Userspace will consume
1180 * the current CPU number through this register. For non-boot
1181 * cores, this is done in start.s (start_cpu) using the per-cpu
1182 * data object.
1183 */
1184 ml_topology_cpu_t *boot_cpu = topology_info.boot_cpu;
1185 uint64_t tpidr_el0 = ((boot_cpu->cpu_id << MACHDEP_TPIDR_CPUNUM_SHIFT) & MACHDEP_TPIDR_CPUNUM_MASK) | \
1186 ((boot_cpu->cluster_id << MACHDEP_TPIDR_CLUSTERID_SHIFT) & MACHDEP_TPIDR_CLUSTERID_MASK);
1187 assert(((tpidr_el0 & MACHDEP_TPIDR_CPUNUM_MASK) >> MACHDEP_TPIDR_CPUNUM_SHIFT) == boot_cpu->cpu_id);
1188 assert(((tpidr_el0 & MACHDEP_TPIDR_CLUSTERID_MASK) >> MACHDEP_TPIDR_CLUSTERID_SHIFT) == boot_cpu->cluster_id);
1189 __builtin_arm_wsr64("TPIDR_EL0", tpidr_el0);
1190
1191 __builtin_arm_wsr64("TPIDRRO_EL0", 0);
1192}
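
/*
 * Illustrative sketch (not part of this file): userspace can recover the CPU
 * number and cluster ID from TPIDR_EL0 using the same shifts and masks, e.g.
 *
 *	uint64_t tpidr;
 *	__asm__ volatile ("mrs %0, TPIDR_EL0" : "=r"(tpidr));
 *	unsigned int cpu_num = (tpidr & MACHDEP_TPIDR_CPUNUM_MASK) >> MACHDEP_TPIDR_CPUNUM_SHIFT;
 *	unsigned int cluster = (tpidr & MACHDEP_TPIDR_CLUSTERID_MASK) >> MACHDEP_TPIDR_CLUSTERID_SHIFT;
 *
 * The masks are kernel-side constants; a real userspace consumer would need
 * its own copies of these definitions.
 */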
1193
1194const ml_topology_info_t *
1195ml_get_topology_info(void)
1196{
1197 return &topology_info;
1198}
1199
1200void
1201ml_map_cpu_pio(void)
1202{
1203 unsigned int i;
1204
1205 for (i = 0; i < topology_info.num_cpus; i++) {
1206 ml_topology_cpu_t *cpu = &topology_info.cpus[i];
1207 if (cpu->cpu_IMPL_pa) {
			cpu->cpu_IMPL_regs = (vm_offset_t)ml_io_map(cpu->cpu_IMPL_pa, cpu->cpu_IMPL_len);
			cpu->coresight_regs = (vm_offset_t)ml_io_map(cpu->coresight_pa, cpu->coresight_len);
		}
		if (cpu->cpu_UTTDBG_pa) {
			cpu->cpu_UTTDBG_regs = (vm_offset_t)ml_io_map(cpu->cpu_UTTDBG_pa, cpu->cpu_UTTDBG_len);
1213 }
1214 }
1215
1216 for (i = 0; i < topology_info.num_clusters; i++) {
1217 ml_topology_cluster_t *cluster = &topology_info.clusters[i];
1218 if (cluster->acc_IMPL_pa) {
			cluster->acc_IMPL_regs = (vm_offset_t)ml_io_map(cluster->acc_IMPL_pa, cluster->acc_IMPL_len);
		}
		if (cluster->cpm_IMPL_pa) {
			cluster->cpm_IMPL_regs = (vm_offset_t)ml_io_map(cluster->cpm_IMPL_pa, cluster->cpm_IMPL_len);
1223 }
1224 }
1225}
1226
1227unsigned int
1228ml_get_cpu_count(void)
1229{
1230 return topology_info.num_cpus;
1231}
1232
1233unsigned int
1234ml_get_cluster_count(void)
1235{
1236 return topology_info.num_clusters;
1237}
1238
1239int
1240ml_get_boot_cpu_number(void)
1241{
1242 return topology_info.boot_cpu->cpu_id;
1243}
1244
1245cluster_type_t
1246ml_get_boot_cluster_type(void)
1247{
1248 return topology_info.boot_cluster->cluster_type;
1249}
1250
1251int
1252ml_get_cpu_number(uint32_t phys_id)
1253{
1254 phys_id &= MPIDR_AFF1_MASK | MPIDR_AFF0_MASK;
1255
1256 for (unsigned i = 0; i < topology_info.num_cpus; i++) {
1257 if (topology_info.cpus[i].phys_id == phys_id) {
1258 return i;
1259 }
1260 }
1261
1262 return -1;
1263}
1264
1265int
1266ml_get_cluster_number(uint32_t phys_id)
1267{
1268 int cpu_id = ml_get_cpu_number(phys_id);
1269 if (cpu_id < 0) {
1270 return -1;
1271 }
1272
1273 ml_topology_cpu_t *cpu = &topology_info.cpus[cpu_id];
1274
1275 return cpu->cluster_id;
1276}
1277
1278unsigned int
1279ml_get_cpu_number_local(void)
1280{
1281 uint64_t mpidr_el1_value = 0;
1282 unsigned cpu_id;
1283
1284 /* We identify the CPU based on the constant bits of MPIDR_EL1. */
1285 MRS(mpidr_el1_value, "MPIDR_EL1");
	cpu_id = ml_get_cpu_number((uint32_t)mpidr_el1_value);
1287
1288 assert(cpu_id <= (unsigned int)ml_get_max_cpu_number());
1289
1290 return cpu_id;
1291}
1292
1293int
1294ml_get_cluster_number_local()
1295{
1296 uint64_t mpidr_el1_value = 0;
1297 unsigned cluster_id;
1298
1299 /* We identify the cluster based on the constant bits of MPIDR_EL1. */
1300 MRS(mpidr_el1_value, "MPIDR_EL1");
	cluster_id = ml_get_cluster_number((uint32_t)mpidr_el1_value);
1302
1303 assert(cluster_id <= (unsigned int)ml_get_max_cluster_number());
1304
1305 return cluster_id;
1306}
1307
1308int
1309ml_get_max_cpu_number(void)
1310{
1311 return topology_info.max_cpu_id;
1312}
1313
1314int
1315ml_get_max_cluster_number(void)
1316{
1317 return topology_info.max_cluster_id;
1318}
1319
1320unsigned int
1321ml_get_first_cpu_id(unsigned int cluster_id)
1322{
1323 return topology_info.clusters[cluster_id].first_cpu_id;
1324}
1325
1326static_assert(MAX_CPUS <= 256, "MAX_CPUS must fit in _COMM_PAGE_CPU_TO_CLUSTER; Increase table size if needed");
1327
1328void
1329ml_map_cpus_to_clusters(uint8_t *table)
1330{
1331 for (uint16_t cpu_id = 0; cpu_id < topology_info.num_cpus; cpu_id++) {
1332 *(table + cpu_id) = (uint8_t)(topology_info.cpus[cpu_id].cluster_id);
1333 }
1334}
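
/*
 * Illustrative usage (hypothetical consumer): given a table populated by the
 * function above (e.g. the commpage copy), a logical CPU number maps to its
 * cluster with a single byte load.
 *
 *	uint8_t cluster_id = table[cpu_number];	// cpu_number < ml_get_cpu_count()
 */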
1335
1336/*
1337 * Return the die id of a cluster.
1338 */
1339unsigned int
1340ml_get_die_id(unsigned int cluster_id)
1341{
1342 /*
1343 * The current implementation gets the die_id from the
1344 * first CPU of the cluster.
1345 * rdar://80917654 (Add the die_id field to the cluster topology info)
1346 */
1347 unsigned int first_cpu = ml_get_first_cpu_id(cluster_id);
1348 return topology_info.cpus[first_cpu].die_id;
1349}
1350
1351/*
1352 * Return the index of a cluster in its die.
1353 */
1354unsigned int
1355ml_get_die_cluster_id(unsigned int cluster_id)
1356{
1357 /*
1358 * The current implementation gets the die_id from the
1359 * first CPU of the cluster.
1360 * rdar://80917654 (Add the die_id field to the cluster topology info)
1361 */
1362 unsigned int first_cpu = ml_get_first_cpu_id(cluster_id);
1363 return topology_info.cpus[first_cpu].die_cluster_id;
1364}
1365
1366/*
1367 * Return the highest die id of the system.
1368 */
1369unsigned int
1370ml_get_max_die_id(void)
1371{
1372 return topology_info.max_die_id;
1373}
1374
1375void
1376ml_lockdown_init()
1377{
1378#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
1379 rorgn_stash_range();
1380#endif
1381}
1382
1383kern_return_t
1384ml_lockdown_handler_register(lockdown_handler_t f, void *this)
1385{
1386 if (!f) {
1387 return KERN_FAILURE;
1388 }
1389
1390 assert(lockdown_done);
1391 f(this); // XXX: f this whole function
1392
1393 return KERN_SUCCESS;
1394}
1395
1396static mcache_flush_function mcache_flush_func;
1397static void* mcache_flush_service;
1398kern_return_t
1399ml_mcache_flush_callback_register(mcache_flush_function func, void *service)
1400{
1401 mcache_flush_service = service;
1402 mcache_flush_func = func;
1403
1404 return KERN_SUCCESS;
1405}
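
/*
 * Illustrative usage (hypothetical driver): a platform driver registers its
 * flush routine once during initialization; the names below are assumptions.
 *
 *	static kern_return_t my_mcache_flush(void *service);
 *	...
 *	ml_mcache_flush_callback_register(my_mcache_flush, my_service);
 *	...
 *	kern_return_t kr = ml_mcache_flush();
 */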
1406
1407kern_return_t
1408ml_mcache_flush(void)
1409{
1410 if (!mcache_flush_func) {
1411 panic("Cannot flush M$ with no flush callback registered");
1412
1413 return KERN_FAILURE;
1414 } else {
1415 return mcache_flush_func(mcache_flush_service);
1416 }
1417}
1418
1419
1420extern lck_mtx_t pset_create_lock;
1421
1422kern_return_t
1423ml_processor_register(ml_processor_info_t *in_processor_info,
1424 processor_t *processor_out, ipi_handler_t *ipi_handler_out,
1425 perfmon_interrupt_handler_func *pmi_handler_out)
1426{
1427 cpu_data_t *this_cpu_datap;
1428 processor_set_t pset;
1429 boolean_t is_boot_cpu;
1430 static unsigned int reg_cpu_count = 0;
1431
1432 if (in_processor_info->log_id > (uint32_t)ml_get_max_cpu_number()) {
1433 return KERN_FAILURE;
1434 }
1435
1436 if ((unsigned)OSIncrementAtomic((SInt32*)&reg_cpu_count) >= topology_info.num_cpus) {
1437 return KERN_FAILURE;
1438 }
1439
1440 if (in_processor_info->log_id != (uint32_t)ml_get_boot_cpu_number()) {
1441 is_boot_cpu = FALSE;
1442 this_cpu_datap = cpu_data_alloc(FALSE);
		cpu_data_init(this_cpu_datap);
1444 } else {
1445 this_cpu_datap = &BootCpuData;
1446 is_boot_cpu = TRUE;
1447 }
1448
1449 assert(in_processor_info->log_id <= (uint32_t)ml_get_max_cpu_number());
1450
1451 this_cpu_datap->cpu_id = in_processor_info->cpu_id;
1452
1453 if (!is_boot_cpu) {
1454 this_cpu_datap->cpu_number = (unsigned short)(in_processor_info->log_id);
1455
		if (cpu_data_register(this_cpu_datap) != KERN_SUCCESS) {
1457 goto processor_register_error;
1458 }
1459 assert((this_cpu_datap->cpu_number & MACHDEP_TPIDR_CPUNUM_MASK) == this_cpu_datap->cpu_number);
1460 }
1461
1462 this_cpu_datap->cpu_idle_notify = in_processor_info->processor_idle;
1463 this_cpu_datap->cpu_cache_dispatch = (cache_dispatch_t)in_processor_info->platform_cache_dispatch;
	nanoseconds_to_absolutetime((uint64_t) in_processor_info->powergate_latency, &this_cpu_datap->cpu_idle_latency);
	this_cpu_datap->cpu_reset_assist = kvtophys(in_processor_info->powergate_stub_addr);
1466
1467 this_cpu_datap->idle_timer_notify = in_processor_info->idle_timer;
1468 this_cpu_datap->idle_timer_refcon = in_processor_info->idle_timer_refcon;
1469
1470 this_cpu_datap->platform_error_handler = in_processor_info->platform_error_handler;
1471 this_cpu_datap->cpu_regmap_paddr = in_processor_info->regmap_paddr;
1472 this_cpu_datap->cpu_phys_id = in_processor_info->phys_id;
1473 this_cpu_datap->cpu_l2_access_penalty = in_processor_info->l2_access_penalty;
1474
1475 this_cpu_datap->cpu_cluster_type = in_processor_info->cluster_type;
1476 this_cpu_datap->cpu_cluster_id = in_processor_info->cluster_id;
1477 this_cpu_datap->cpu_l2_id = in_processor_info->l2_cache_id;
1478 this_cpu_datap->cpu_l2_size = in_processor_info->l2_cache_size;
1479 this_cpu_datap->cpu_l3_id = in_processor_info->l3_cache_id;
1480 this_cpu_datap->cpu_l3_size = in_processor_info->l3_cache_size;
1481
1482 /*
1483 * Encode cpu_id, cluster_id to be stored in TPIDR_EL0 (see
1484 * cswitch.s:set_thread_registers, start.s:start_cpu) for consumption
1485 * by userspace.
1486 */
1487 this_cpu_datap->cpu_tpidr_el0 = ((this_cpu_datap->cpu_number << MACHDEP_TPIDR_CPUNUM_SHIFT) & MACHDEP_TPIDR_CPUNUM_MASK) | \
1488 ((this_cpu_datap->cpu_cluster_id << MACHDEP_TPIDR_CLUSTERID_SHIFT) & MACHDEP_TPIDR_CLUSTERID_MASK);
1489 assert(((this_cpu_datap->cpu_tpidr_el0 & MACHDEP_TPIDR_CPUNUM_MASK) >> MACHDEP_TPIDR_CPUNUM_SHIFT) == this_cpu_datap->cpu_number);
1490 assert(((this_cpu_datap->cpu_tpidr_el0 & MACHDEP_TPIDR_CLUSTERID_MASK) >> MACHDEP_TPIDR_CLUSTERID_SHIFT) == this_cpu_datap->cpu_cluster_id);
1491
1492#if HAS_CLUSTER
1493 this_cpu_datap->cluster_master = !OSTestAndSet(this_cpu_datap->cpu_cluster_id, &cluster_initialized);
1494#else /* HAS_CLUSTER */
1495 this_cpu_datap->cluster_master = is_boot_cpu;
1496#endif /* HAS_CLUSTER */
	lck_mtx_lock(&pset_create_lock);
	pset = pset_find(in_processor_info->cluster_id, NULL);
	kprintf("[%d]%s>pset_find(cluster_id=%d) returned pset %d\n", current_processor()->cpu_id, __FUNCTION__, in_processor_info->cluster_id, pset ? pset->pset_id : -1);
1500 if (pset == NULL) {
1501#if __AMP__
1502 pset_cluster_type_t pset_cluster_type = this_cpu_datap->cpu_cluster_type == CLUSTER_TYPE_E ? PSET_AMP_E : PSET_AMP_P;
1503 pset = pset_create(ml_get_boot_cluster_type() == this_cpu_datap->cpu_cluster_type ? &pset_node0 : &pset_node1, pset_cluster_type, this_cpu_datap->cpu_cluster_id, this_cpu_datap->cpu_cluster_id);
1504 assert(pset != PROCESSOR_SET_NULL);
1505 kprintf("[%d]%s>pset_create(cluster_id=%d) returned pset %d\n", current_processor()->cpu_id, __FUNCTION__, this_cpu_datap->cpu_cluster_id, pset->pset_id);
1506#else /* __AMP__ */
1507 pset_cluster_type_t pset_cluster_type = PSET_SMP;
		pset = pset_create(&pset_node0, pset_cluster_type, this_cpu_datap->cpu_cluster_id, this_cpu_datap->cpu_cluster_id);
1509 assert(pset != PROCESSOR_SET_NULL);
1510#endif /* __AMP__ */
1511 }
	kprintf("[%d]%s>cpu_id %p cluster_id %d cpu_number %d is type %d\n", current_processor()->cpu_id, __FUNCTION__, in_processor_info->cpu_id, in_processor_info->cluster_id, this_cpu_datap->cpu_number, in_processor_info->cluster_type);
	lck_mtx_unlock(&pset_create_lock);
1514
1515 processor_t processor = PERCPU_GET_RELATIVE(processor, cpu_data, this_cpu_datap);
1516 if (!is_boot_cpu) {
		processor_init(processor, this_cpu_datap->cpu_number, pset);
1518
1519 if (this_cpu_datap->cpu_l2_access_penalty) {
1520 /*
1521 * Cores that have a non-zero L2 access penalty compared
1522 * to the boot processor should be de-prioritized by the
1523 * scheduler, so that threads use the cores with better L2
1524 * preferentially.
1525 */
1526 processor_set_primary(processor, master_processor);
1527 }
1528 }
1529
1530 *processor_out = processor;
1531 *ipi_handler_out = cpu_signal_handler;
1532#if CPMU_AIC_PMI && CONFIG_CPU_COUNTERS
1533 *pmi_handler_out = mt_cpmu_aic_pmi;
1534#else
1535 *pmi_handler_out = NULL;
1536#endif /* CPMU_AIC_PMI && CONFIG_CPU_COUNTERS */
1537 if (in_processor_info->idle_tickle != (idle_tickle_t *) NULL) {
1538 *in_processor_info->idle_tickle = (idle_tickle_t) cpu_idle_tickle;
1539 }
1540
1541#if CONFIG_CPU_COUNTERS
1542 if (kpc_register_cpu(this_cpu_datap) != TRUE) {
1543 goto processor_register_error;
1544 }
1545#endif /* CONFIG_CPU_COUNTERS */
1546
1547
1548 if (!is_boot_cpu) {
		random_cpu_init(this_cpu_datap->cpu_number);
1550 // now let next CPU register itself
1551 OSIncrementAtomic((SInt32*)&real_ncpus);
1552 }
1553
1554 return KERN_SUCCESS;
1555
1556processor_register_error:
1557#if CONFIG_CPU_COUNTERS
1558 kpc_unregister_cpu(this_cpu_datap);
1559#endif /* CONFIG_CPU_COUNTERS */
1560 if (!is_boot_cpu) {
		cpu_data_free(this_cpu_datap);
1562 }
1563
1564 return KERN_FAILURE;
1565}
1566
1567void
1568ml_init_arm_debug_interface(
1569 void * in_cpu_datap,
1570 vm_offset_t virt_address)
1571{
1572 ((cpu_data_t *)in_cpu_datap)->cpu_debug_interface_map = virt_address;
1573 do_debugid();
1574}
1575
1576/*
1577 * Routine: init_ast_check
1578 * Function:
1579 */
1580void
1581init_ast_check(
1582 __unused processor_t processor)
1583{
1584}
1585
1586/*
1587 * Routine: cause_ast_check
1588 * Function:
1589 */
1590void
1591cause_ast_check(
1592 processor_t processor)
1593{
1594 if (current_processor() != processor) {
1595 cpu_signal(target: processor_to_cpu_datap(processor), SIGPast, p0: (void *)NULL, p1: (void *)NULL);
1596 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), processor->cpu_id, 1 /* ast */, 0, 0, 0);
1597 }
1598}
1599
1600extern uint32_t cpu_idle_count;
1601
1602void
1603ml_get_power_state(boolean_t *icp, boolean_t *pidlep)
1604{
1605 *icp = ml_at_interrupt_context();
1606 *pidlep = (cpu_idle_count == real_ncpus);
1607}
1608
1609/*
1610 * Routine: ml_cause_interrupt
1611 * Function: Generate a fake interrupt
1612 */
1613void
1614ml_cause_interrupt(void)
1615{
1616 return; /* BS_XXX */
1617}
1618
1619/* Map memory map IO space */
1620vm_offset_t
1621ml_io_map(
1622 vm_offset_t phys_addr,
1623 vm_size_t size)
1624{
1625 return io_map(phys_addr, size, VM_WIMG_IO, VM_PROT_DEFAULT, false);
1626}
1627
1628/* Map memory map IO space (with protections specified) */
1629vm_offset_t
1630ml_io_map_with_prot(
1631 vm_offset_t phys_addr,
1632 vm_size_t size,
1633 vm_prot_t prot)
1634{
1635 return io_map(phys_addr, size, VM_WIMG_IO, prot, false);
1636}
1637
1638vm_offset_t
1639ml_io_map_unmappable(
1640 vm_offset_t phys_addr,
1641 vm_size_t size,
1642 unsigned int flags)
1643{
1644 return io_map(phys_addr, size, flags, VM_PROT_DEFAULT, true);
1645}
1646
1647vm_offset_t
1648ml_io_map_wcomb(
1649 vm_offset_t phys_addr,
1650 vm_size_t size)
1651{
1652 return io_map(phys_addr, size, VM_WIMG_WCOMB, VM_PROT_DEFAULT, false);
1653}
1654
1655void
1656ml_io_unmap(vm_offset_t addr, vm_size_t sz)
1657{
	pmap_remove(kernel_pmap, addr, addr + sz);
	kmem_free(kernel_map, addr, sz);
1660}
1661
1662vm_map_address_t
1663ml_map_high_window(
1664 vm_offset_t phys_addr,
1665 vm_size_t len)
1666{
	return pmap_map_high_window_bd(phys_addr, len, VM_PROT_READ | VM_PROT_WRITE);
1668}
1669
1670vm_offset_t
1671ml_static_ptovirt(
1672 vm_offset_t paddr)
1673{
	return phystokv(paddr);
1675}
1676
1677vm_offset_t
1678ml_static_slide(
1679 vm_offset_t vaddr)
1680{
1681 vm_offset_t slid_vaddr = 0;
1682
1683#if CONFIG_SPTM
1684 if ((vaddr >= vm_sptm_offsets.unslid_base) && (vaddr < vm_sptm_offsets.unslid_top)) {
1685 slid_vaddr = vaddr + vm_sptm_offsets.slide;
1686 } else if ((vaddr >= vm_txm_offsets.unslid_base) && (vaddr < vm_txm_offsets.unslid_top)) {
1687 slid_vaddr = vaddr + vm_txm_offsets.slide;
1688 } else
1689#endif /* CONFIG_SPTM */
1690 {
1691 slid_vaddr = vaddr + vm_kernel_slide;
1692 }
1693
1694 if (!VM_KERNEL_IS_SLID(slid_vaddr)) {
1695 /* This is only intended for use on static kernel addresses. */
1696 return 0;
1697 }
1698
1699 return slid_vaddr;
1700}
1701
1702vm_offset_t
1703ml_static_unslide(
1704 vm_offset_t vaddr)
1705{
1706 if (!VM_KERNEL_IS_SLID(vaddr)) {
1707 /* This is only intended for use on static kernel addresses. */
1708 return 0;
1709 }
1710
1711#if CONFIG_SPTM
1712 /**
1713 * Addresses coming from the SPTM and TXM have a different slide than the
1714 * rest of the kernel.
1715 */
1716 if ((vaddr >= vm_sptm_offsets.slid_base) && (vaddr < vm_sptm_offsets.slid_top)) {
1717 return vaddr - vm_sptm_offsets.slide;
1718 }
1719
1720 if ((vaddr >= vm_txm_offsets.slid_base) && (vaddr < vm_txm_offsets.slid_top)) {
1721 return vaddr - vm_txm_offsets.slide;
1722 }
1723#endif /* CONFIG_SPTM */
1724
1725 return vaddr - vm_kernel_slide;
1726}
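
/*
 * Illustrative sketch: for a static kernel address, slide and unslide are
 * inverses; addresses outside the known static ranges yield 0.
 *
 *	vm_offset_t slid = ml_static_slide(unslid_addr);	// unslid_addr is hypothetical
 *	assert(slid == 0 || ml_static_unslide(slid) == unslid_addr);
 */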
1727
1728extern tt_entry_t *arm_kva_to_tte(vm_offset_t va);
1729
1730kern_return_t
1731ml_static_protect(
1732 vm_offset_t vaddr, /* kernel virtual address */
1733 vm_size_t size,
1734 vm_prot_t new_prot __unused)
1735{
1736#if CONFIG_SPTM
1737 /**
1738 * Retype any frames that may be passed to the VM to XNU_DEFAULT.
1739 */
1740 for (vm_offset_t sptm_vaddr_cur = vaddr; sptm_vaddr_cur < trunc_page_64(vaddr + size); sptm_vaddr_cur += PAGE_SIZE) {
1741 /* Check if this frame is XNU_DEFAULT and only retype it if is not */
1742 sptm_paddr_t sptm_paddr_cur = kvtophys_nofail(sptm_vaddr_cur);
1743 sptm_frame_type_t current_type = sptm_get_frame_type(sptm_paddr_cur);
1744 if (current_type != XNU_DEFAULT) {
1745 sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
1746 sptm_retype(sptm_paddr_cur, current_type, XNU_DEFAULT, retype_params);
1747 }
1748 }
1749
1750 return KERN_SUCCESS;
#else /* CONFIG_SPTM */
	pt_entry_t arm_prot = 0;
	pt_entry_t arm_block_prot = 0;
	vm_offset_t vaddr_cur;
	ppnum_t ppn;
	kern_return_t result = KERN_SUCCESS;

	if (vaddr < physmap_base) {
		panic("ml_static_protect(): %p < %p", (void *) vaddr, (void *) physmap_base);
		return KERN_FAILURE;
	}

	assert((vaddr & (PAGE_SIZE - 1)) == 0); /* must be page aligned */

	if ((new_prot & VM_PROT_WRITE) && (new_prot & VM_PROT_EXECUTE)) {
		panic("ml_static_protect(): WX request on %p", (void *) vaddr);
	}
	if (lockdown_done && (new_prot & VM_PROT_EXECUTE)) {
		panic("ml_static_protect(): attempt to inject executable mapping on %p", (void *) vaddr);
	}

	/* Set up the protection bits, and block bits so we can validate block mappings. */
	if (new_prot & VM_PROT_WRITE) {
		arm_prot |= ARM_PTE_AP(AP_RWNA);
		arm_block_prot |= ARM_TTE_BLOCK_AP(AP_RWNA);
	} else {
		arm_prot |= ARM_PTE_AP(AP_RONA);
		arm_block_prot |= ARM_TTE_BLOCK_AP(AP_RONA);
	}

	arm_prot |= ARM_PTE_NX;
	arm_block_prot |= ARM_TTE_BLOCK_NX;

	if (!(new_prot & VM_PROT_EXECUTE)) {
		arm_prot |= ARM_PTE_PNX;
		arm_block_prot |= ARM_TTE_BLOCK_PNX;
	}

	for (vaddr_cur = vaddr;
	    vaddr_cur < trunc_page_64(vaddr + size);
	    vaddr_cur += PAGE_SIZE) {
		ppn = pmap_find_phys(kernel_pmap, vaddr_cur);
		if (ppn != (vm_offset_t) NULL) {
			tt_entry_t *tte2;
			pt_entry_t *pte_p;
			pt_entry_t ptmp;

#if XNU_MONITOR
			assert(!pmap_is_monitor(ppn));
			assert(!TEST_PAGE_RATIO_4);
#endif

			tte2 = arm_kva_to_tte(vaddr_cur);

			if (((*tte2) & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) {
				if ((((*tte2) & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) &&
				    ((*tte2 & (ARM_TTE_BLOCK_NXMASK | ARM_TTE_BLOCK_PNXMASK | ARM_TTE_BLOCK_APMASK)) == arm_block_prot)) {
					/*
					 * We can support ml_static_protect on a block mapping if the mapping already has
					 * the desired protections. We still want to run checks on a per-page basis.
					 */
					continue;
				}

				result = KERN_FAILURE;
				break;
			}

			pte_p = (pt_entry_t *)&((tt_entry_t*)(phystokv((*tte2) & ARM_TTE_TABLE_MASK)))[(((vaddr_cur) & ARM_TT_L3_INDEX_MASK) >> ARM_TT_L3_SHIFT)];
			ptmp = *pte_p;

			if ((ptmp & ARM_PTE_HINT_MASK) && ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot)) {
				/*
				 * The contiguous hint is similar to a block mapping for ml_static_protect; if the existing
				 * protections do not match the desired protections, then we will fail (as we cannot update
				 * this mapping without updating other mappings as well).
				 */
				result = KERN_FAILURE;
				break;
			}

			__unreachable_ok_push
			if (TEST_PAGE_RATIO_4) {
				{
					unsigned int i;
					pt_entry_t *ptep_iter;

					ptep_iter = pte_p;
					for (i = 0; i < 4; i++, ptep_iter++) {
						/* Note that there is a hole in the HINT sanity checking here. */
						ptmp = *ptep_iter;

						/* We only need to update the page tables if the protections do not match. */
						if ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot) {
							ptmp = (ptmp & ~(ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) | arm_prot;
							*ptep_iter = ptmp;
						}
					}
				}
			} else {
				ptmp = *pte_p;
				/* We only need to update the page tables if the protections do not match. */
				if ((ptmp & (ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) != arm_prot) {
					ptmp = (ptmp & ~(ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) | arm_prot;
					*pte_p = ptmp;
				}
			}
			__unreachable_ok_pop
		}
	}

	if (vaddr_cur > vaddr) {
		assert(((vaddr_cur - vaddr) & 0xFFFFFFFF00000000ULL) == 0);
		flush_mmu_tlb_region(vaddr, (uint32_t)(vaddr_cur - vaddr));
	}

	return result;
#endif /* CONFIG_SPTM */
}

#if defined(CONFIG_SPTM)
/*
 * Returns true if the given physical address is in one of the boot kernelcache ranges.
 */
static bool
ml_physaddr_in_bootkc_range(vm_offset_t physaddr)
{
	for (int i = 0; i < arm_vm_kernelcache_numranges; i++) {
		if (physaddr >= arm_vm_kernelcache_ranges[i].start_phys && physaddr < arm_vm_kernelcache_ranges[i].end_phys) {
			return true;
		}
	}
	return false;
}
#endif /* defined(CONFIG_SPTM) */

/*
 * Routine: ml_static_mfree
 * Function: Release statically mapped (wired) pages back to the VM so that
 *           they can be managed as ordinary free memory.
 */
void
ml_static_mfree(
	vm_offset_t vaddr,
	vm_size_t size)
{
	vm_offset_t vaddr_cur;
	vm_offset_t paddr_cur;
	ppnum_t ppn;
	uint32_t freed_pages = 0;
	uint32_t freed_kernelcache_pages = 0;

	/* It is acceptable (if bad) to fail to free. */
	if (vaddr < physmap_base) {
		return;
	}

	assert((vaddr & (PAGE_SIZE - 1)) == 0); /* must be page aligned */

	for (vaddr_cur = vaddr;
	    vaddr_cur < trunc_page_64(vaddr + size);
	    vaddr_cur += PAGE_SIZE) {
		ppn = pmap_find_phys(kernel_pmap, vaddr_cur);
		if (ppn != (vm_offset_t) NULL) {
			/*
			 * It is not acceptable to fail to update the protections on a page
			 * we will release to the VM. We need to either panic or continue.
			 * For now, we'll panic (to help flag if there is memory we can
			 * reclaim).
			 */
			if (ml_static_protect(vaddr_cur, PAGE_SIZE, VM_PROT_WRITE | VM_PROT_READ) != KERN_SUCCESS) {
				panic("Failed ml_static_mfree on %p", (void *) vaddr_cur);
			}

			paddr_cur = ptoa(ppn);

			vm_page_create(ppn, (ppn + 1));
			freed_pages++;
#if defined(CONFIG_SPTM)
			if (ml_physaddr_in_bootkc_range(paddr_cur)) {
#else
			if (paddr_cur >= arm_vm_kernelcache_phys_start && paddr_cur < arm_vm_kernelcache_phys_end) {
#endif
				freed_kernelcache_pages++;
			}
		}
	}
	vm_page_lockspin_queues();
	vm_page_wire_count -= freed_pages;
	vm_page_wire_count_initial -= freed_pages;
	vm_page_kernelcache_count -= freed_kernelcache_pages;
	vm_page_unlock_queues();
#if DEBUG
	kprintf("ml_static_mfree: Released 0x%x pages at VA %p, size:0x%llx, last ppn: 0x%x, +%d bad\n", freed_pages, (void *)vaddr, (uint64_t)size, ppn, bad_page_cnt);
#endif
}
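
/*
 * Minimal usage sketch (illustrative only): once a page-aligned, boot-time
 * scratch region is no longer needed, its pages are handed back to the VM.
 * The `bootstrap_scratch_*` names are hypothetical and do not exist in this
 * file.
 */
#if 0
static void
example_release_bootstrap_scratch(vm_offset_t bootstrap_scratch_base, vm_size_t bootstrap_scratch_size)
{
	assert((bootstrap_scratch_base & PAGE_MASK) == 0);
	assert((bootstrap_scratch_size & PAGE_MASK) == 0);

	/* Pages become ordinary free memory; the VA range must not be touched afterwards. */
	ml_static_mfree(bootstrap_scratch_base, bootstrap_scratch_size);
}
#endif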

/*
 * Routine: ml_page_protection_type
 * Function: Returns the type of page protection that the system supports.
 */
ml_page_protection_t
ml_page_protection_type(void)
{
#if CONFIG_SPTM
	return 2;       /* SPTM-based page protection */
#elif XNU_MONITOR
	return 1;       /* PPL-based page protection */
#else
	return 0;       /* no page protection monitor */
#endif
}

/* virtual to physical on wired pages */
vm_offset_t
ml_vtophys(vm_offset_t vaddr)
{
	return kvtophys(vaddr);
}

/*
 * Routine: ml_nofault_copy
 * Function: Perform a physical mode copy if the source and destination have
 * valid translations in the kernel pmap. If translations are present, they are
 * assumed to be wired; i.e., no attempt is made to guarantee that the
 * translations obtained remain valid for the duration of the copy process.
 */
vm_size_t
ml_nofault_copy(vm_offset_t virtsrc, vm_offset_t virtdst, vm_size_t size)
{
	addr64_t cur_phys_dst, cur_phys_src;
	vm_size_t count, nbytes = 0;

	while (size > 0) {
		if (!(cur_phys_src = kvtophys(virtsrc))) {
			break;
		}
		if (!(cur_phys_dst = kvtophys(virtdst))) {
			break;
		}
		if (!pmap_valid_address(trunc_page_64(cur_phys_dst)) ||
		    !pmap_valid_address(trunc_page_64(cur_phys_src))) {
			break;
		}
		count = PAGE_SIZE - (cur_phys_src & PAGE_MASK);
		if (count > (PAGE_SIZE - (cur_phys_dst & PAGE_MASK))) {
			count = PAGE_SIZE - (cur_phys_dst & PAGE_MASK);
		}
		if (count > size) {
			count = size;
		}

		bcopy_phys(cur_phys_src, cur_phys_dst, count);

		nbytes += count;
		virtsrc += count;
		virtdst += count;
		size -= count;
	}

	return nbytes;
}
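
/*
 * Minimal usage sketch (illustrative only): copying a small buffer between two
 * wired kernel mappings and checking how many bytes actually moved. The caller
 * must already know both ranges are mapped and wired; ml_nofault_copy simply
 * stops early at the first untranslated page. All names here are hypothetical.
 */
#if 0
static bool
example_nofault_copy(vm_offset_t wired_src, vm_offset_t wired_dst, vm_size_t len)
{
	vm_size_t copied = ml_nofault_copy(wired_src, wired_dst, len);

	/* A short copy means one of the ranges lost its translation part-way through. */
	return copied == len;
}
#endif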

/*
 * Routine: ml_validate_nofault
 * Function: Validate that this address range has valid translations
 * in the kernel pmap. If translations are present, they are
 * assumed to be wired; i.e., no attempt is made to guarantee
 * that the translations persist after the check.
 * Returns: TRUE if the range is mapped and will not cause a fault,
 * FALSE otherwise.
 */

boolean_t
ml_validate_nofault(
	vm_offset_t virtsrc, vm_size_t size)
{
	addr64_t cur_phys_src;
	uint32_t count;

	while (size > 0) {
		if (!(cur_phys_src = kvtophys(virtsrc))) {
			return FALSE;
		}
		if (!pmap_valid_address(trunc_page_64(cur_phys_src))) {
			return FALSE;
		}
		count = (uint32_t)(PAGE_SIZE - (cur_phys_src & PAGE_MASK));
		if (count > size) {
			count = (uint32_t)size;
		}

		virtsrc += count;
		size -= count;
	}

	return TRUE;
}

void
ml_get_bouncepool_info(vm_offset_t * phys_addr, vm_size_t * size)
{
	*phys_addr = 0;
	*size = 0;
}

void
active_rt_threads(__unused boolean_t active)
{
}

static void
cpu_qos_cb_default(__unused int urgency, __unused uint64_t qos_param1, __unused uint64_t qos_param2)
{
	return;
}

cpu_qos_update_t cpu_qos_update = cpu_qos_cb_default;

void
cpu_qos_update_register(cpu_qos_update_t cpu_qos_cb)
{
	if (cpu_qos_cb != NULL) {
		cpu_qos_update = cpu_qos_cb;
	} else {
		cpu_qos_update = cpu_qos_cb_default;
	}
}
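
/*
 * Minimal usage sketch (illustrative only): a platform component installing a
 * CPU QoS callback so it is notified when thread urgency changes, then later
 * restoring the default by passing NULL. `example_qos_cb` is a hypothetical
 * function, not part of this file.
 */
#if 0
static void
example_qos_cb(int urgency, uint64_t rt_period, uint64_t rt_deadline)
{
	/* React to urgency changes, e.g. adjust a performance controller. */
	(void)urgency;
	(void)rt_period;
	(void)rt_deadline;
}

static void
example_register_qos_cb(void)
{
	cpu_qos_update_register(example_qos_cb);	/* install the callback */
	/* ... */
	cpu_qos_update_register(NULL);			/* fall back to the default no-op callback */
}
#endif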

void
thread_tell_urgency(thread_urgency_t urgency, uint64_t rt_period, uint64_t rt_deadline, uint64_t sched_latency __unused, __unused thread_t nthread)
{
	SCHED_DEBUG_PLATFORM_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_URGENCY) | DBG_FUNC_START, urgency, rt_period, rt_deadline, sched_latency, 0);

	cpu_qos_update((int)urgency, rt_period, rt_deadline);

	SCHED_DEBUG_PLATFORM_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_URGENCY) | DBG_FUNC_END, urgency, rt_period, rt_deadline, 0, 0);
}

void
machine_run_count(__unused uint32_t count)
{
}

processor_t
machine_choose_processor(__unused processor_set_t pset, processor_t processor)
{
	return processor;
}

#if KASAN
vm_offset_t ml_stack_base(void);
vm_size_t ml_stack_size(void);

vm_offset_t
ml_stack_base(void)
{
	uintptr_t local = (uintptr_t) &local;
	vm_offset_t intstack_top_ptr;

	intstack_top_ptr = getCpuDatap()->intstack_top;
	if ((local < intstack_top_ptr) && (local > intstack_top_ptr - INTSTACK_SIZE)) {
		return intstack_top_ptr - INTSTACK_SIZE;
	} else {
		return current_thread()->kernel_stack;
	}
}

vm_size_t
ml_stack_size(void)
{
	uintptr_t local = (uintptr_t) &local;
	vm_offset_t intstack_top_ptr;

	intstack_top_ptr = getCpuDatap()->intstack_top;
	if ((local < intstack_top_ptr) && (local > intstack_top_ptr - INTSTACK_SIZE)) {
		return INTSTACK_SIZE;
	} else {
		return kernel_stack_size;
	}
}
#endif

#ifdef CONFIG_KCOV

kcov_cpu_data_t *
current_kcov_data(void)
{
	return &current_cpu_datap()->cpu_kcov_data;
}

kcov_cpu_data_t *
cpu_kcov_data(int cpuid)
{
	return &cpu_datap(cpuid)->cpu_kcov_data;
}

#endif /* CONFIG_KCOV */

boolean_t
machine_timeout_suspended(void)
{
	return FALSE;
}

kern_return_t
ml_interrupt_prewarm(__unused uint64_t deadline)
{
	return KERN_FAILURE;
}

/*
 * Assumes fiq, irq disabled.
 */
void
ml_set_decrementer(uint32_t dec_value)
{
	cpu_data_t *cdp = getCpuDatap();

	assert(ml_get_interrupts_enabled() == FALSE);
	cdp->cpu_decrementer = dec_value;

	if (cdp->cpu_set_decrementer_func) {
		cdp->cpu_set_decrementer_func(dec_value);
	} else {
		__builtin_arm_wsr64("CNTV_TVAL_EL0", (uint64_t)dec_value);
	}
}

/**
 * Perform a read of the timebase which is permitted to be executed
 * speculatively and/or out of program order.
 */
static inline uint64_t
speculative_timebase(void)
{
	return __builtin_arm_rsr64("CNTVCT_EL0");
}

/**
 * Read a non-speculative view of the timebase if one is available,
 * otherwise fall back on an ISB to prevent speculation and enforce
 * ordering.
 */
static inline uint64_t
nonspeculative_timebase(void)
{
#if defined(HAS_ACNTVCT)
	return __builtin_arm_rsr64("S3_4_c15_c10_6");
#elif __ARM_ARCH_8_6__
	return __builtin_arm_rsr64("CNTVCTSS_EL0");
#else
	// ISB required by ARMV7C.b section B8.1.2 & ARMv8 section D6.1.2
	// "Reads of CNT[PV]CT[_EL0] can occur speculatively and out of order relative
	// to other instructions executed on the same processor."
	__builtin_arm_isb(ISB_SY);
	return speculative_timebase();
#endif
}

uint64_t
ml_get_hwclock()
{
	uint64_t timebase = nonspeculative_timebase();
	return timebase;
}

uint64_t
ml_get_timebase()
{
	uint64_t clock, timebase;

	// The retry is for the case where S2R catches us in the middle of this. See rdar://77019633.
	do {
		timebase = getCpuDatap()->cpu_base_timebase;
		os_compiler_barrier();
		clock = ml_get_hwclock();
		os_compiler_barrier();
	} while (getCpuDatap()->cpu_base_timebase != timebase);

	return clock + timebase;
}

/**
 * Issue a barrier that guarantees all prior memory accesses will complete
 * before any subsequent timebase reads.
 */
void
ml_memory_to_timebase_fence(void)
{
	__builtin_arm_dmb(DMB_SY);
	const uint64_t take_backwards_branch = 0;
	asm volatile (
		"1:"
		"ldr x0, [%[take_backwards_branch]]" "\n"
		"cbnz x0, 1b" "\n"
		:
		: [take_backwards_branch] "r"(&take_backwards_branch)
		: "x0"
	);

	/* throwaway read to prevent ml_get_speculative_timebase() reordering */
	(void)ml_get_hwclock();
}

/**
 * Issue a barrier that guarantees all prior timebase reads will
 * be ordered before any subsequent memory accesses.
 */
void
ml_timebase_to_memory_fence(void)
{
	__builtin_arm_isb(ISB_SY);
}
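
/*
 * Minimal measurement sketch (illustrative only): bracketing a workload with
 * the two fences so the timebase reads cannot be reordered against the memory
 * accesses being timed. `example_workload` is a hypothetical callback.
 */
#if 0
static uint64_t
example_timed_region(void (*example_workload)(void))
{
	ml_memory_to_timebase_fence();		/* prior accesses complete before the first read */
	uint64_t start = ml_get_timebase();
	ml_timebase_to_memory_fence();		/* start read ordered before the workload's accesses */

	example_workload();

	ml_memory_to_timebase_fence();		/* workload's accesses complete before the second read */
	uint64_t end = ml_get_timebase();
	ml_timebase_to_memory_fence();

	return end - start;			/* elapsed time in timebase ticks */
}
#endif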

/*
 * Get the speculative timebase without an ISB.
 */
uint64_t
ml_get_speculative_timebase(void)
{
	uint64_t clock, timebase;

	// The retry is for the case where S2R catches us in the middle of this. See rdar://77019633&77697482.
	do {
		timebase = getCpuDatap()->cpu_base_timebase;
		os_compiler_barrier();
		clock = speculative_timebase();

		os_compiler_barrier();
	} while (getCpuDatap()->cpu_base_timebase != timebase);

	return clock + timebase;
}

uint64_t
ml_get_timebase_entropy(void)
{
	return ml_get_speculative_timebase();
}

uint32_t
ml_get_decrementer(void)
{
	cpu_data_t *cdp = getCpuDatap();
	uint32_t dec;

	assert(ml_get_interrupts_enabled() == FALSE);

	if (cdp->cpu_get_decrementer_func) {
		dec = cdp->cpu_get_decrementer_func();
	} else {
		uint64_t wide_val;

		wide_val = __builtin_arm_rsr64("CNTV_TVAL_EL0");
		dec = (uint32_t)wide_val;
		assert(wide_val == (uint64_t)dec);
	}

	return dec;
}

boolean_t
ml_get_timer_pending(void)
{
	uint64_t cntv_ctl = __builtin_arm_rsr64("CNTV_CTL_EL0");
	return ((cntv_ctl & CNTV_CTL_EL0_ISTATUS) != 0) ? TRUE : FALSE;
}

__attribute__((noreturn))
void
platform_syscall(arm_saved_state_t *state)
{
	uint32_t code;

#define platform_syscall_kprintf(x...) /* kprintf("platform_syscall: " x) */

	code = (uint32_t)get_saved_state_reg(state, 3);

	KDBG(MACHDBG_CODE(DBG_MACH_MACHDEP_EXCP_SC_ARM, code) | DBG_FUNC_START,
	    get_saved_state_reg(state, 0),
	    get_saved_state_reg(state, 1),
	    get_saved_state_reg(state, 2));

	switch (code) {
	case 2:
		/* set cthread */
		platform_syscall_kprintf("set cthread self.\n");
		thread_set_cthread_self(get_saved_state_reg(state, 0));
		break;
	case 3:
		/* get cthread */
		platform_syscall_kprintf("get cthread self.\n");
		set_user_saved_state_reg(state, 0, thread_get_cthread_self());
		break;
	case 0: /* I-Cache flush (removed) */
	case 1: /* D-Cache flush (removed) */
	default:
		platform_syscall_kprintf("unknown: %d\n", code);
		break;
	}

	KDBG(MACHDBG_CODE(DBG_MACH_MACHDEP_EXCP_SC_ARM, code) | DBG_FUNC_END,
	    get_saved_state_reg(state, 0));

	thread_exception_return();
}

static void
_enable_timebase_event_stream(uint32_t bit_index)
{
	uint64_t cntkctl; /* One wants to use 32 bits, but "mrs" prefers it this way */

	if (bit_index >= 64) {
		panic("%s: invalid bit index (%u)", __FUNCTION__, bit_index);
	}

	__asm__ volatile ("mrs %0, CNTKCTL_EL1" : "=r"(cntkctl));

	cntkctl |= (bit_index << CNTKCTL_EL1_EVENTI_SHIFT);
	cntkctl |= CNTKCTL_EL1_EVNTEN;
	cntkctl |= CNTKCTL_EL1_EVENTDIR; /* 1->0; why not? */

	/*
	 * If the SOC supports it (and it isn't broken), enable
	 * EL0 access to the timebase registers.
	 */
	if (user_timebase_type() != USER_TIMEBASE_NONE) {
		cntkctl |= (CNTKCTL_EL1_PL0PCTEN | CNTKCTL_EL1_PL0VCTEN);
	}

	__builtin_arm_wsr64("CNTKCTL_EL1", cntkctl);
}

/*
 * Turn timer on, unmask that interrupt.
 */
static void
_enable_virtual_timer(void)
{
	uint64_t cntvctl = CNTV_CTL_EL0_ENABLE; /* One wants to use 32 bits, but "mrs" prefers it this way */

	__builtin_arm_wsr64("CNTV_CTL_EL0", cntvctl);
	/* disable the physical timer as a precaution, as its registers reset to architecturally unknown values */
	__builtin_arm_wsr64("CNTP_CTL_EL0", CNTP_CTL_EL0_IMASKED);
}

void
fiq_context_init(boolean_t enable_fiq __unused)
{
	/* Interrupts still disabled. */
	assert(ml_get_interrupts_enabled() == FALSE);
	_enable_virtual_timer();
}

void
wfe_timeout_init(void)
{
	_enable_timebase_event_stream(arm64_eventi);
}

/**
 * Configures, but does not enable, the WFE event stream. The event stream
 * generates an event at a set interval to act as a timeout for WFEs.
 *
 * This function sets the static global variable arm64_eventi to be the proper
 * bit index for the CNTKCTL_EL1.EVENTI field to generate events at the correct
 * period (1us unless specified by the "wfe_events_sec" boot-arg). arm64_eventi
 * is used by wfe_timeout_init to actually poke the registers and enable the
 * event stream.
 *
 * The CNTKCTL_EL1.EVENTI field contains the index of the bit of CNTVCT_EL0 that
 * is the trigger for the system to generate an event. The trigger can occur on
 * either the rising or falling edge of the bit depending on the value of
 * CNTKCTL_EL1.EVNTDIR. This is arbitrary for our purposes, so we use the
 * falling edge (1->0) transition to generate events.
 */
void
wfe_timeout_configure(void)
{
	/* Could fill in our own ops here, if we needed them */
	uint64_t ticks_per_sec, ticks_per_event, events_per_sec = 0;
	uint32_t bit_index;

	if (PE_parse_boot_argn("wfe_events_sec", &events_per_sec, sizeof(events_per_sec))) {
		if (events_per_sec <= 0) {
			events_per_sec = 1;
		} else if (events_per_sec > USEC_PER_SEC) {
			events_per_sec = USEC_PER_SEC;
		}
	} else {
		events_per_sec = USEC_PER_SEC;
	}
	ticks_per_sec = gPEClockFrequencyInfo.timebase_frequency_hz;
	ticks_per_event = ticks_per_sec / events_per_sec;

	/* Bit index of next power of two greater than ticks_per_event */
	bit_index = flsll(ticks_per_event) - 1;
	/* Round up to next power of two if ticks_per_event is initially power of two */
	if ((ticks_per_event & ((1 << bit_index) - 1)) != 0) {
		bit_index++;
	}

	/*
	 * The timer can only trigger on the rising or falling edge, not both; we don't
	 * care which we trigger on, but we do need to adjust which bit we are
	 * interested in to account for this.
	 *
	 * In particular, we set CNTKCTL_EL1.EVNTDIR to trigger events on the
	 * falling edge of the given bit. Therefore, we must decrement the bit index
	 * by one: when the bit below the one we care about makes a 1 -> 0
	 * transition, the bit we care about makes a 0 -> 1 transition.
	 *
	 * For example, if we want an event generated every 8 ticks (i.e., we calculated
	 * a bit_index of 3), we would want the event to be generated whenever the
	 * lower four bits of the counter transition from 0b0111 -> 0b1000. We can
	 * see that the bit at index 2 makes a falling transition in this scenario,
	 * so we would want EVENTI to be 2 instead of 3.
	 */
	if (bit_index != 0) {
		bit_index--;
	}

	arm64_eventi = bit_index;
}
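
/*
 * Worked example (illustrative, assuming a 24 MHz timebase): with the default
 * 1,000,000 events/sec,
 *
 *   ticks_per_event = 24,000,000 / 1,000,000 = 24
 *   flsll(24) - 1   = 4            (24 = 0b11000, highest set bit is index 4)
 *   24 is not a power of two, so bit_index rounds up to 5  (2^5 = 32 ticks)
 *   EVNTDIR is falling-edge, so the configured EVENTI becomes 5 - 1 = 4
 *
 * yielding one event roughly every 32 timebase ticks (~1.3us), the smallest
 * power-of-two period that is at least the requested 1us.
 */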

boolean_t
ml_delay_should_spin(uint64_t interval)
{
	cpu_data_t *cdp = getCpuDatap();

	if (cdp->cpu_idle_latency) {
		return (interval < cdp->cpu_idle_latency) ? TRUE : FALSE;
	} else {
		/*
		 * Early boot, latency is unknown. Err on the side of blocking,
		 * which should always be safe, even if slow.
		 */
		return FALSE;
	}
}
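
/*
 * Minimal usage sketch (illustrative only): a driver deciding between a short
 * busy-wait and a blocking wait for a hardware settle time, based on whether
 * the wait is shorter than the CPU's idle-exit latency. `example_block_and_wait`
 * is a hypothetical blocking helper, not part of this file.
 */
#if 0
extern void example_block_and_wait(uint64_t abstime);	/* hypothetical */

static void
example_wait_for_hardware(uint32_t settle_usecs)
{
	uint64_t settle_abstime;

	nanoseconds_to_absolutetime((uint64_t)settle_usecs * NSEC_PER_USEC, &settle_abstime);

	if (ml_delay_should_spin(settle_abstime)) {
		delay(settle_usecs);			/* short enough that spinning beats blocking */
	} else {
		example_block_and_wait(settle_abstime);	/* long enough that blocking is worthwhile */
	}
}
#endif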

boolean_t
ml_thread_is64bit(thread_t thread)
{
	return thread_is_64bit_addr(thread);
}

void
ml_delay_on_yield(void)
{
#if DEVELOPMENT || DEBUG
	if (yield_delay_us) {
		delay(yield_delay_us);
	}
#endif
}

void
ml_timer_evaluate(void)
{
}

boolean_t
ml_timer_forced_evaluation(void)
{
	return FALSE;
}

void
ml_gpu_stat_update(__unused uint64_t gpu_ns_delta)
{
	/*
	 * For now: update the resource coalition stats of the
	 * current thread's coalition
	 */
	task_coalition_update_gpu_stats(current_task(), gpu_ns_delta);
}

uint64_t
ml_gpu_stat(__unused thread_t t)
{
	return 0;
}

thread_t
current_thread(void)
{
	return current_thread_fast();
}

#if defined(HAS_APPLE_PAC)
uint8_t
ml_task_get_disable_user_jop(task_t task)
{
	assert(task);
	return task->disable_user_jop;
}

void
ml_task_set_disable_user_jop(task_t task, uint8_t disable_user_jop)
{
	assert(task);
	task->disable_user_jop = disable_user_jop;
}

void
ml_thread_set_disable_user_jop(thread_t thread, uint8_t disable_user_jop)
{
	assert(thread);
	if (disable_user_jop) {
		thread->machine.arm_machine_flags |= ARM_MACHINE_THREAD_DISABLE_USER_JOP;
	} else {
		thread->machine.arm_machine_flags &= ~ARM_MACHINE_THREAD_DISABLE_USER_JOP;
	}
}

void
ml_task_set_rop_pid(task_t task, task_t parent_task, boolean_t inherit)
{
	if (inherit) {
		task->rop_pid = parent_task->rop_pid;
	} else {
		task->rop_pid = early_random();
	}
}

/**
 * jop_pid may be inherited from the parent task or generated inside the shared
 * region. Unfortunately these two parameters are available at very different
 * times during task creation, so we need to split this into two steps.
 */
void
ml_task_set_jop_pid(task_t task, task_t parent_task, boolean_t inherit, boolean_t disable_user_jop)
{
	if (inherit) {
		task->jop_pid = parent_task->jop_pid;
	} else if (disable_user_jop) {
		task->jop_pid = ml_non_arm64e_user_jop_pid();
	} else {
		task->jop_pid = ml_default_jop_pid();
	}
}

void
ml_task_set_jop_pid_from_shared_region(task_t task, boolean_t disable_user_jop)
{
	if (disable_user_jop) {
		task->jop_pid = ml_non_arm64e_user_jop_pid();
		return;
	}

	vm_shared_region_t sr = vm_shared_region_get(task);
	/*
	 * If there's no shared region, we can assign the key arbitrarily. This
	 * typically happens when Mach-O image activation failed part of the way
	 * through, and this task is in the middle of dying with SIGKILL anyway.
	 */
	if (__improbable(!sr)) {
		task->jop_pid = early_random();
		return;
	}
	vm_shared_region_deallocate(sr);

	/*
	 * Similarly we have to worry about jetsam having killed the task and
	 * already cleared the shared_region_id.
	 */
	task_lock(task);
	if (task->shared_region_id != NULL) {
		task->jop_pid = shared_region_find_key(task->shared_region_id);
	} else {
		task->jop_pid = early_random();
	}
	task_unlock(task);
}

void
ml_thread_set_jop_pid(thread_t thread, task_t task)
{
	thread->machine.jop_pid = task->jop_pid;
}
#endif /* defined(HAS_APPLE_PAC) */

#if DEVELOPMENT || DEBUG
static uint64_t minor_badness_suffered = 0;
#endif
void
ml_report_minor_badness(uint32_t __unused badness_id)
{
#if DEVELOPMENT || DEBUG
	(void)os_atomic_or(&minor_badness_suffered, 1ULL << badness_id, relaxed);
#endif
}

#if defined(HAS_APPLE_PAC)
#if __ARM_ARCH_8_6__ || APPLEVIRTUALPLATFORM
/**
 * The ARMv8.6 implementation is also safe for non-FPAC CPUs, but less efficient;
 * guest kernels need to use it because they do not know at compile time whether
 * the host CPU supports FPAC.
 */

/**
 * Emulates the poisoning done by ARMv8.3-PAuth instructions on auth failure.
 */
static void *
ml_poison_ptr(void *ptr, ptrauth_key key)
{
	bool b_key = key & (1ULL << 0);
	uint64_t error_code;
	if (b_key) {
		error_code = 2;
	} else {
		error_code = 1;
	}

	bool kernel_pointer = (uintptr_t)ptr & (1ULL << 55);
	bool data_key = key & (1ULL << 1);
	/* When PAC is enabled, only userspace data pointers use TBI, regardless of boot parameters */
	bool tbi = data_key && !kernel_pointer;
	unsigned int poison_shift;
	if (tbi) {
		poison_shift = 53;
	} else {
		poison_shift = 61;
	}

	uintptr_t poisoned = (uintptr_t)ptr;
	poisoned &= ~(3ULL << poison_shift);
	poisoned |= error_code << poison_shift;
	return (void *)poisoned;
}

/*
 * ptrauth_sign_unauthenticated() reimplemented using asm volatile, forcing the
 * compiler to assume this operation has side-effects and cannot be reordered
 */
#define ptrauth_sign_volatile(__value, __suffix, __data) \
	({ \
	        void *__ret = __value; \
	        asm volatile ( \
	                "pac" #__suffix " %[value], %[data]" \
	                : [value] "+r"(__ret) \
	                : [data] "r"(__data) \
	        ); \
	        __ret; \
	})

#define ml_auth_ptr_unchecked_for_key(_ptr, _suffix, _key, _modifier) \
	do { \
	        void *stripped = ptrauth_strip(_ptr, _key); \
	        void *reauthed = ptrauth_sign_volatile(stripped, _suffix, _modifier); \
	        if (__probable(_ptr == reauthed)) { \
	                _ptr = stripped; \
	        } else { \
	                _ptr = ml_poison_ptr(stripped, _key); \
	        } \
	} while (0)

#define _ml_auth_ptr_unchecked(_ptr, _suffix, _modifier) \
	ml_auth_ptr_unchecked_for_key(_ptr, _suffix, ptrauth_key_as ## _suffix, _modifier)
#else
#define _ml_auth_ptr_unchecked(_ptr, _suffix, _modifier) \
	asm volatile ("aut" #_suffix " %[ptr], %[modifier]" : [ptr] "+r"(_ptr) : [modifier] "r"(_modifier));
#endif /* __ARM_ARCH_8_6__ || APPLEVIRTUALPLATFORM */

/**
 * Authenticates a signed pointer without trapping on failure.
 *
 * @warning This function must be called with interrupts disabled.
 *
 * @warning Pointer authentication failure should normally be treated as a fatal
 * error. This function is intended for a handful of callers that cannot panic
 * on failure, and that understand the risks in handling a poisoned return
 * value. Other code should generally use the trapping variant
 * ptrauth_auth_data() instead.
 *
 * @param ptr the pointer to authenticate
 * @param key which key to use for authentication
 * @param modifier a modifier to mix into the key
 * @return an authenticated version of ptr, possibly with poison bits set
 */
void *
ml_auth_ptr_unchecked(void *ptr, ptrauth_key key, uint64_t modifier)
{
	switch (key & 0x3) {
	case ptrauth_key_asia:
		_ml_auth_ptr_unchecked(ptr, ia, modifier);
		break;
	case ptrauth_key_asib:
		_ml_auth_ptr_unchecked(ptr, ib, modifier);
		break;
	case ptrauth_key_asda:
		_ml_auth_ptr_unchecked(ptr, da, modifier);
		break;
	case ptrauth_key_asdb:
		_ml_auth_ptr_unchecked(ptr, db, modifier);
		break;
	}

	return ptr;
}
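
/*
 * Minimal usage sketch (illustrative only): authenticating a signed pointer
 * without trapping, then checking whether the result still matches the
 * stripped value before using it. The `signed_fn` and `ctx_discriminator`
 * names are hypothetical; the caller is assumed to run with interrupts
 * disabled, per the warning above.
 */
#if 0
static bool
example_auth_without_trap(void *signed_fn, uint64_t ctx_discriminator)
{
	void *authed = ml_auth_ptr_unchecked(signed_fn, ptrauth_key_asia, ctx_discriminator);

	/* On failure the result carries poison bits, so it no longer equals the stripped pointer. */
	return authed == ptrauth_strip(signed_fn, ptrauth_key_asia);
}
#endif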
#endif /* defined(HAS_APPLE_PAC) */

#ifdef CONFIG_XNUPOST
void
ml_expect_fault_begin(expected_fault_handler_t expected_fault_handler, uintptr_t expected_fault_addr)
{
	thread_t thread = current_thread();
	thread->machine.expected_fault_handler = expected_fault_handler;
	thread->machine.expected_fault_addr = expected_fault_addr;
	thread->machine.expected_fault_pc = 0;
}

/** Expect an exception to be thrown at EXPECTED_FAULT_PC */
void
ml_expect_fault_pc_begin(expected_fault_handler_t expected_fault_handler, uintptr_t expected_fault_pc)
{
	thread_t thread = current_thread();
	thread->machine.expected_fault_handler = expected_fault_handler;
	thread->machine.expected_fault_addr = 0;
	uintptr_t raw_func = (uintptr_t)ptrauth_strip(
		(void *)expected_fault_pc,
		ptrauth_key_function_pointer);
	thread->machine.expected_fault_pc = raw_func;
}

void
ml_expect_fault_end(void)
{
	thread_t thread = current_thread();
	thread->machine.expected_fault_handler = NULL;
	thread->machine.expected_fault_addr = 0;
	thread->machine.expected_fault_pc = 0;
}
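
/*
 * Minimal test-pattern sketch (illustrative only): bracketing a deliberately
 * faulting access with the expect-fault APIs so the exception path can route
 * the fault to a test handler instead of treating it as fatal. The handler
 * signature, `example_fault_handler`, and `probe_address` are assumptions for
 * this sketch, as is the recovery mechanism inside the handler.
 */
#if 0
static bool example_fault_seen;

static bool
example_fault_handler(arm_saved_state_t *state)
{
	(void)state;
	example_fault_seen = true;
	return true;			/* report the fault as expected/handled */
}

static void
example_expected_fault_test(uintptr_t probe_address)
{
	ml_expect_fault_begin(example_fault_handler, probe_address);
	/* ... perform the access that is expected to fault ... */
	ml_expect_fault_end();

	assert(example_fault_seen);
}
#endif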
#endif /* CONFIG_XNUPOST */

void
ml_hibernate_active_pre(void)
{
#if HIBERNATION
	if (kIOHibernateStateWakingFromHibernate == gIOHibernateState) {
		hibernate_rebuild_vm_structs();
	}
#endif /* HIBERNATION */
}

void
ml_hibernate_active_post(void)
{
#if HIBERNATION
	if (kIOHibernateStateWakingFromHibernate == gIOHibernateState) {
		hibernate_machine_init();
		hibernate_vm_lock_end();
		current_cpu_datap()->cpu_hibernate = 0;
	}
#endif /* HIBERNATION */
}

/**
 * Return a machine-dependent array of address space regions that should be
 * reserved by the VM (pre-mapped in the address space). This will prevent user
 * processes from allocating or deallocating from within these regions.
 *
 * @param vm_is64bit True if the process has a 64-bit address space.
 * @param regions An out parameter representing an array of regions to reserve.
 *
 * @return The number of reserved regions returned through `regions`.
 */
size_t
ml_get_vm_reserved_regions(bool vm_is64bit, const struct vm_reserved_region **regions)
{
	assert(regions != NULL);

	/**
	 * Reserved regions only apply to 64-bit address spaces. This is because
	 * we only expect to grow the maximum user VA address on 64-bit address spaces
	 * (we've essentially already reached the max for 32-bit spaces). The reserved
	 * regions should safely fall outside of the max user VA for 32-bit processes.
	 */
	if (vm_is64bit) {
		*regions = vm_reserved_regions;
		return ARRAY_COUNT(vm_reserved_regions);
	} else {
		/* Don't reserve any VA regions on arm64_32 processes. */
		*regions = NULL;
		return 0;
	}
}
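
/*
 * Minimal usage sketch (illustrative only): querying the reserved regions for
 * a 64-bit process and walking the returned array. The vm_reserved_region
 * field names used here (`vmrr_name`, `vmrr_addr`, `vmrr_size`) follow the
 * convention used elsewhere in xnu but should be treated as assumptions in
 * this sketch.
 */
#if 0
static void
example_dump_reserved_regions(void)
{
	const struct vm_reserved_region *regions = NULL;
	size_t count = ml_get_vm_reserved_regions(true, &regions);

	for (size_t i = 0; i < count; i++) {
		kprintf("reserved region %zu: %s [0x%llx, 0x%llx)\n",
		    i, regions[i].vmrr_name,
		    (uint64_t)regions[i].vmrr_addr,
		    (uint64_t)(regions[i].vmrr_addr + regions[i].vmrr_size));
	}
}
#endif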

/* These WFE recommendations are expected to be updated on a relatively
 * infrequent cadence, possibly from a different cluster, hence
 * false cacheline sharing isn't expected to be material.
 */
static uint64_t arm64_cluster_wfe_recs[MAX_CPU_CLUSTERS];

uint32_t
ml_update_cluster_wfe_recommendation(uint32_t wfe_cluster_id, uint64_t wfe_timeout_abstime_interval, __unused uint64_t wfe_hint_flags)
{
	assert(wfe_cluster_id < MAX_CPU_CLUSTERS);
	assert(wfe_timeout_abstime_interval <= ml_wfe_hint_max_interval);
	os_atomic_store(&arm64_cluster_wfe_recs[wfe_cluster_id], wfe_timeout_abstime_interval, relaxed);
	return 0; /* Success */
}

#if DEVELOPMENT || DEBUG
int wfe_rec_max = 0;
int wfe_rec_none = 0;
uint64_t wfe_rec_override_mat = 0;
uint64_t wfe_rec_clamp = 0;
#endif

uint64_t
ml_cluster_wfe_timeout(uint32_t wfe_cluster_id)
{
	/* This and its consumer do not synchronize with updates of the
	 * recommendation; races are acceptable.
	 */
	uint64_t wfet = os_atomic_load(&arm64_cluster_wfe_recs[wfe_cluster_id], relaxed);
#if DEVELOPMENT || DEBUG
	if (wfe_rec_clamp) {
		wfet = MIN(wfe_rec_clamp, wfet);
	}

	if (wfe_rec_max) {
		for (int i = 0; i < MAX_CPU_CLUSTERS; i++) {
			if (arm64_cluster_wfe_recs[i] > wfet) {
				wfet = arm64_cluster_wfe_recs[i];
			}
		}
	}

	if (wfe_rec_none) {
		wfet = 0;
	}

	if (wfe_rec_override_mat) {
		wfet = wfe_rec_override_mat;
	}
#endif
	return wfet;
}

__pure2 bool
ml_addr_in_non_xnu_stack(__unused uintptr_t addr)
{
#if CONFIG_SPTM
	/**
	 * If the address is within one of the SPTM-allocated per-cpu stacks, then
	 * return true.
	 */
	if ((addr >= SPTMArgs->cpu_stack_papt_start) &&
	    (addr < SPTMArgs->cpu_stack_papt_end)) {
		return true;
	}

	/**
	 * If the address is within one of the TXM thread stacks, then return true.
	 * The SPTM guarantees that these stacks are virtually contiguous.
	 */
	if ((addr >= SPTMArgs->txm_thread_stacks[0]) &&
	    (addr < SPTMArgs->txm_thread_stacks[MAX_CPUS - 1])) {
		return true;
	}

	return false;
#elif XNU_MONITOR
	return (addr >= (uintptr_t)pmap_stacks_start) && (addr < (uintptr_t)pmap_stacks_end);
#else
	return false;
#endif /* CONFIG_SPTM || XNU_MONITOR */
}

uint64_t
ml_get_backtrace_pc(struct arm_saved_state *state)
{
	assert((state != NULL) && is_saved_state64(state));

#if CONFIG_SPTM
	/**
	 * On SPTM-based systems, when a non-XNU domain (e.g., SPTM) is interrupted,
	 * the PC value saved into the state is not the actual PC at the interrupted
	 * point, but a fixed value to a handler that knows how to re-enter the
	 * interrupted domain. The interrupted domain's actual PC value is saved
	 * into x14, so let's return that instead.
	 */
	if (ml_addr_in_non_xnu_stack(get_saved_state_fp(state))) {
		return saved_state64(state)->x[14];
	}
#endif /* CONFIG_SPTM */

	return get_saved_state_pc(state);
}

bool
ml_paddr_is_exclaves_owned(vm_offset_t paddr)
{
#if CONFIG_SPTM
	const sptm_frame_type_t type = sptm_get_frame_type(paddr);
	return type == SK_DEFAULT || type == SK_IO; // SK_SHARED_R[OW] are not exclusively exclaves frames
#else
	#pragma unused(paddr)
	return false;
#endif /* CONFIG_SPTM */
}

/**
 * Panic because an ARM saved-state accessor expected user saved-state but was
 * passed non-user saved-state.
 *
 * @param ss invalid saved-state (CPSR.M != EL0)
 */
void
ml_panic_on_invalid_old_cpsr(const arm_saved_state_t *ss)
{
	panic("invalid CPSR in user saved-state %p", ss);
}

/**
 * Panic because an ARM saved-state accessor was passed user saved-state and
 * asked to assign a non-user CPSR.
 *
 * @param ss original EL0 saved-state
 * @param cpsr invalid new CPSR value (CPSR.M != EL0)
 */
void
ml_panic_on_invalid_new_cpsr(const arm_saved_state_t *ss, uint32_t cpsr)
{
	panic("attempt to set non-user CPSR %#010x on user saved-state %p", cpsr, ss);
}

/**
 * Explicitly preallocates a floating point save area.
 * This is a noop on ARM because preallocation isn't required at this time.
 */
void
ml_fp_save_area_prealloc(void)
{
}