/*
 * Copyright (c) 2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <mach/mach_types.h>
#include <mach/task.h>

#include <kern/ast.h>
#include <kern/kalloc.h>
#include <kern/kern_types.h>
#include <kern/mach_param.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/processor.h>
#include <kern/queue.h>
#include <kern/restartable.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/waitq.h>

#include <os/atomic_private.h>
#include <os/hash.h>
#include <os/refcnt.h>

/**
 * @file osfmk/kern/restartable.c
 *
 * @brief
 * This module implements restartable userspace functions.
 *
 * @discussion
 * task_restartable_ranges_register() allows a task to configure
 * its restartable ranges, only once per task,
 * before it has made its second thread.
 *
 * task_restartable_ranges_synchronize() can later be used to trigger
 * restarts for threads with a PC in a restartable region.
 *
 * It is implemented with an AST (AST_RESET_PCS) that causes threads,
 * as they return to userspace, to reset any PC inside a restartable region
 * to the recovery address of that region.
 *
 * Because signal delivery would mask the proper saved PC for threads,
 * sigreturn also forcefully sets the AST and goes through this logic
 * every single time.
 */
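
/*
 * Illustrative userspace usage (a sketch under assumptions, not part of this
 * file; only the two MiG routines, the range field names, and mach_task_self()
 * come from real interfaces, the other symbols are hypothetical):
 *
 *	task_restartable_range_t range = {
 *		.location      = (mach_vm_address_t)my_critical_section,
 *		.length        = my_critical_section_len,
 *		.recovery_offs = my_recovery_offset,
 *		.flags         = 0,
 *	};
 *
 *	// Register once, while the process is still single threaded.
 *	kr = task_restartable_ranges_register(mach_task_self(), &range, 1);
 *
 *	// Later, force any thread currently inside a range to restart it:
 *	kr = task_restartable_ranges_synchronize(mach_task_self());
 */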

typedef int (*cmpfunc_t)(const void *a, const void *b);
extern void qsort(void *a, size_t n, size_t es, cmpfunc_t cmp);

#define RR_RANGES_MAX 64
struct restartable_ranges {
	queue_chain_t            rr_link;
	os_refcnt_t              rr_ref;
	uint32_t                 rr_count;
	uint32_t                 rr_hash;
	task_restartable_range_t rr_ranges[RR_RANGES_MAX];
};

#if DEBUG || DEVELOPMENT
#define RR_HASH_SIZE 256
#else
// Release kernel userspace should have shared caches and a single registration
#define RR_HASH_SIZE 16
#endif

static queue_head_t rr_hash[RR_HASH_SIZE];
LCK_GRP_DECLARE(rr_lock_grp, "restartable ranges");
LCK_SPIN_DECLARE(rr_spinlock, &rr_lock_grp);

#define rr_lock()   lck_spin_lock_grp(&rr_spinlock, &rr_lock_grp)
#define rr_unlock() lck_spin_unlock(&rr_spinlock)

#pragma mark internals

/**
 * @function _ranges_cmp
 *
 * @brief
 * Compares two ranges.
 */
static int
_ranges_cmp(const void *_r1, const void *_r2)
{
	const task_restartable_range_t *r1 = _r1;
	const task_restartable_range_t *r2 = _r2;

	if (r1->location != r2->location) {
		return r1->location < r2->location ? -1 : 1;
	}
	if (r1->length == r2->length) {
		return 0;
	}
	return r1->length < r2->length ? -1 : 1;
}

/**
 * @function _ranges_validate
 *
 * @brief
 * Validates an array of PC ranges for wraps and intersections.
 *
 * @discussion
 * This sorts and modifies the input.
 *
 * The ranges must:
 * - not wrap around,
 * - have a length/recovery offset within a page of the range start.
 *
 * @returns
 * - KERN_SUCCESS: ranges are valid
 * - KERN_INVALID_ARGUMENT: ranges are invalid
 */
static kern_return_t
_ranges_validate(task_t task, task_restartable_range_t *ranges, uint32_t count)
{
	qsort(ranges, count, sizeof(task_restartable_range_t), _ranges_cmp);
	uint64_t limit = task_has_64Bit_data(task) ? UINT64_MAX : UINT32_MAX;
	uint64_t end, recovery;

	if (count == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	for (size_t i = 0; i < count; i++) {
		if (ranges[i].length > TASK_RESTARTABLE_OFFSET_MAX ||
		    ranges[i].recovery_offs > TASK_RESTARTABLE_OFFSET_MAX) {
			return KERN_INVALID_ARGUMENT;
		}
		if (ranges[i].flags) {
			return KERN_INVALID_ARGUMENT;
		}
		if (os_add_overflow(ranges[i].location, ranges[i].length, &end)) {
			return KERN_INVALID_ARGUMENT;
		}
		if (os_add_overflow(ranges[i].location, ranges[i].recovery_offs, &recovery)) {
			return KERN_INVALID_ARGUMENT;
		}
		if (ranges[i].location > limit || end > limit || recovery > limit) {
			return KERN_INVALID_ARGUMENT;
		}
		if (i + 1 < count && end > ranges[i + 1].location) {
			return KERN_INVALID_ARGUMENT;
		}
	}

	return KERN_SUCCESS;
}
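
/*
 * For illustration (hypothetical addresses), a 64-bit task could register:
 *
 *	{ .location = 0x100000, .length = 0x40, .recovery_offs = 0x40 }
 *	{ .location = 0x100080, .length = 0x20, .recovery_offs = 0x20 }
 *
 * which validates fine, whereas also passing
 * { .location = 0x100030, .length = 0x20, .recovery_offs = 0x20 } fails with
 * KERN_INVALID_ARGUMENT: once sorted, that range starts before the first one
 * ends (0x100030 < 0x100000 + 0x40).
 */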

/**
 * @function _ranges_lookup
 *
 * @brief
 * Looks up the recovery address for a given PC within a set of ranges.
 *
 * @returns
 * - 0: the PC isn't inside any of the ranges
 * - the recovery address (range start + recovery offset) otherwise.
 */
__attribute__((always_inline))
static mach_vm_address_t
_ranges_lookup(struct restartable_ranges *rr, mach_vm_address_t pc)
{
	task_restartable_range_t *ranges = rr->rr_ranges;
	uint32_t l = 0, r = rr->rr_count;

	if (pc <= ranges[0].location) {
		return 0;
	}
	if (pc >= ranges[r - 1].location + ranges[r - 1].length) {
		return 0;
	}

	while (l < r) {
		uint32_t i = (r + l) / 2;
		mach_vm_address_t location = ranges[i].location;

		if (pc <= location) {
			/* if the PC is exactly at pc_start, no reset is needed */
			r = i;
		} else if (location + ranges[i].length <= pc) {
			/* if the PC is exactly at the end, it's out of the function */
			l = i + 1;
		} else {
			/* else it's strictly in the range, return the recovery pc */
			return location + ranges[i].recovery_offs;
		}
	}

	return 0;
}
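
/*
 * Worked example (hypothetical values): with a single registered range
 * { .location = 0x1000, .length = 0x40, .recovery_offs = 0x40 },
 * _ranges_lookup() returns 0 for pc == 0x1000 (exactly at the start) and for
 * pc >= 0x1040 (at or past the end), and returns 0x1040, the recovery
 * address, for any pc strictly inside (0x1000, 0x1040).
 */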

/**
 * @function _restartable_ranges_dispose
 *
 * @brief
 * Helper to dispose of a range that has reached a 0 refcount.
 */
__attribute__((noinline))
static void
_restartable_ranges_dispose(struct restartable_ranges *rr, bool hash_remove)
{
	if (hash_remove) {
		rr_lock();
		remqueue(&rr->rr_link);
		rr_unlock();
	}
	kfree_type(struct restartable_ranges, rr);
}

/**
 * @function _restartable_ranges_equals
 *
 * @brief
 * Helper to compare two restartable ranges.
 */
static bool
_restartable_ranges_equals(
	const struct restartable_ranges *rr1,
	const struct restartable_ranges *rr2)
{
	size_t rr1_size = rr1->rr_count * sizeof(task_restartable_range_t);
	return rr1->rr_hash == rr2->rr_hash &&
	       rr1->rr_count == rr2->rr_count &&
	       memcmp(rr1->rr_ranges, rr2->rr_ranges, rr1_size) == 0;
}

/**
 * @function _restartable_ranges_create
 *
 * @brief
 * Helper to create a uniqued restartable range.
 *
 * @returns
 * - KERN_SUCCESS
 * - KERN_INVALID_ARGUMENT: the validation of the new ranges failed.
 * - KERN_RESOURCE_SHORTAGE: too many ranges, out of memory
 */
static kern_return_t
_restartable_ranges_create(task_t task, task_restartable_range_t *ranges,
    uint32_t count, struct restartable_ranges **rr_storage)
{
	struct restartable_ranges *rr, *rr_found, *rr_base;
	queue_head_t *head;
	uint32_t base_count, total_count;
	size_t base_size, size;
	kern_return_t kr;

	rr_base = *rr_storage;
	base_count = rr_base ? rr_base->rr_count : 0;
	base_size = sizeof(task_restartable_range_t) * base_count;
	size = sizeof(task_restartable_range_t) * count;

	if (os_add_overflow(base_count, count, &total_count)) {
		return KERN_INVALID_ARGUMENT;
	}
	if (total_count > RR_RANGES_MAX) {
		return KERN_RESOURCE_SHORTAGE;
	}

	rr = kalloc_type(struct restartable_ranges,
	    (zalloc_flags_t) (Z_WAITOK | Z_ZERO | Z_NOFAIL));

	queue_chain_init(rr->rr_link);
	os_ref_init(&rr->rr_ref, NULL);
	rr->rr_count = total_count;
	if (base_size) {
		memcpy(rr->rr_ranges, rr_base->rr_ranges, base_size);
	}
	memcpy(rr->rr_ranges + base_count, ranges, size);
	kr = _ranges_validate(task, rr->rr_ranges, total_count);
	if (kr) {
		_restartable_ranges_dispose(rr, false);
		return kr;
	}
	rr->rr_hash = os_hash_jenkins(rr->rr_ranges,
	    rr->rr_count * sizeof(task_restartable_range_t));

	head = &rr_hash[rr->rr_hash % RR_HASH_SIZE];

	rr_lock();
	queue_iterate(head, rr_found, struct restartable_ranges *, rr_link) {
		if (_restartable_ranges_equals(rr, rr_found) &&
		    os_ref_retain_try(&rr_found->rr_ref)) {
			goto found;
		}
	}

	enqueue_tail(head, &rr->rr_link);
	rr_found = rr;

found:
	if (rr_base && os_ref_release_relaxed(&rr_base->rr_ref) == 0) {
		remqueue(&rr_base->rr_link);
	} else {
		rr_base = NULL;
	}
	rr_unlock();

	*rr_storage = rr_found;

	if (rr_found != rr) {
		_restartable_ranges_dispose(rr, false);
	}
	if (rr_base) {
		_restartable_ranges_dispose(rr_base, false);
	}
	return KERN_SUCCESS;
}
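
/*
 * For example (illustrative): when two tasks register byte-for-byte identical
 * range arrays (the common shared-cache case), the second registration hashes
 * to the same rr_hash bucket, matches through _restartable_ranges_equals(),
 * and simply retains the existing struct restartable_ranges instead of
 * keeping a duplicate allocation.
 */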

#pragma mark extern interfaces

__attribute__((always_inline))
void
restartable_ranges_release(struct restartable_ranges *rr)
{
	if (os_ref_release_relaxed(&rr->rr_ref) == 0) {
		_restartable_ranges_dispose(rr, true);
	}
}

__attribute__((always_inline))
void
thread_reset_pcs_will_fault(thread_t thread)
{
	/*
	 * Called in the exception handling code while interrupts
	 * are still disabled.
	 */
	os_atomic_store(&thread->t_rr_state.trr_fault_state,
	    (uint8_t)TRR_FAULT_PENDING, relaxed);
}

__attribute__((always_inline))
void
thread_reset_pcs_done_faulting(struct thread *thread)
{
	thread_rr_state_t state = {
		.trr_ipi_ack_pending = ~0,
	};

	/*
	 * Called by the exception handling code on the way back,
	 * or when the thread is terminated.
	 */
	state.trr_value = os_atomic_and_orig(&thread->t_rr_state.trr_value,
	    state.trr_value, relaxed);

	if (__improbable(state.trr_sync_waiting)) {
		task_t task = get_threadtask(thread);

		task_lock(task);
		wakeup_all_with_inheritor(&thread->t_rr_state, THREAD_AWAKENED);
		task_unlock(task);
	}
}

void
thread_reset_pcs_ack_IPI(struct thread *thread)
{
	thread_rr_state_t trrs;

	/*
	 * Called under the thread lock from IPI or CSwitch context.
	 */
	trrs.trr_value = os_atomic_load(&thread->t_rr_state.trr_value, relaxed);
	if (__improbable(trrs.trr_ipi_ack_pending)) {
		trrs.trr_ipi_ack_pending = false;
		if (trrs.trr_fault_state) {
			assert3u(trrs.trr_fault_state, ==, TRR_FAULT_PENDING);
			trrs.trr_fault_state = TRR_FAULT_OBSERVED;
		}
		os_atomic_store(&thread->t_rr_state.trr_value,
		    trrs.trr_value, relaxed);
	}
}

static bool
thread_rr_wait_if_needed(task_t task, thread_t thread)
{
	thread_rr_state_t state;
	bool did_unlock = false;

	state.trr_value = os_atomic_load(&thread->t_rr_state.trr_value, relaxed);
	if (state.trr_value == 0) {
		return did_unlock;
	}

	assert(state.trr_sync_waiting == 0);

	thread_reference(thread);

	/*
	 * The thread_rr_state state machine is:
	 *
	 *                    ,-------------- IPI ack -------------.
	 *                    v                                     |
	 * .-----> {f:N, w:0, ipi:0} --- IPI sent ---> {f:N, w:0, ipi:1}
	 * |           ^      |                             ^      |
	 * |           |      |                             |      |
	 * |         fault  will                          fault  will
	 * |         done   fault                         done   fault
	 * |           |      |                             |      |
	 * |           |      v                             |      v
	 * |        {f:P, w:0, ipi:0} --- IPI sent ---> {f:P, w:0, ipi:1}
	 * |                  |                                    |
	 * |                  |                                    |
	 * |      act_set_ast_reset_pcs()                          |
	 * |                  |                                    |
	 * |                  v                                    |
	 * +------ {f:O, w:0, ipi:0} <--- IPI Ack ----------------'
	 * |                  |
	 * |                  |
	 * |          wait_if_needed()
	 * |                  |
	 * |                  v
	 * `------ {f:O, w:1, ipi:0}
	 */

	while (state.trr_ipi_ack_pending) {
		disable_preemption();
		task_unlock(task);

		state.trr_value =
		    hw_wait_while_equals32(&thread->t_rr_state.trr_value,
		    state.trr_value);

		enable_preemption();
		task_lock(task);

		did_unlock = true;
	}

	/*
	 * If a VM fault is in flight, we must wait for it to resolve
	 * before we can return from task_restartable_ranges_synchronize(),
	 * as the memory we're faulting against might be freed by the caller
	 * as soon as it returns, leading to a crash.
	 */
	if (state.trr_fault_state == TRR_FAULT_OBSERVED) {
		thread_rr_state_t nstate = {
			.trr_fault_state = TRR_FAULT_OBSERVED,
			.trr_sync_waiting = 1,
		};

		if (os_atomic_cmpxchg(&thread->t_rr_state, state,
		    nstate, relaxed)) {
			lck_mtx_sleep_with_inheritor(&task->lock,
			    LCK_SLEEP_DEFAULT, &thread->t_rr_state,
			    thread, THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
			did_unlock = true;
		}
	}

#if MACH_ASSERT
	state.trr_value = os_atomic_load(&thread->t_rr_state.trr_value, relaxed);
	assert3u(state.trr_fault_state, !=, TRR_FAULT_OBSERVED);
	assert3u(state.trr_ipi_ack_pending, ==, 0);
	assert3u(state.trr_sync_waiting, ==, 0);
#endif

	thread_deallocate_safe(thread);
	return did_unlock;
}

bool
thread_reset_pcs_in_range(task_t task, thread_t thread)
{
	return _ranges_lookup(task->t_rr_ranges, machine_thread_pc(thread)) != 0;
}

void
thread_reset_pcs_ast(task_t task, thread_t thread)
{
	struct restartable_ranges *rr;
	mach_vm_address_t pc;

	/*
	 * Because restartable ranges are set while the task only has one thread
	 * and can't be mutated outside of this, no lock is required to read this.
	 */
	rr = task->t_rr_ranges;
	if (thread->active && rr) {
		pc = _ranges_lookup(rr, machine_thread_pc(thread));

		if (pc) {
			machine_thread_reset_pc(thread, pc);
		}
	}

#if MACH_ASSERT
	thread_rr_state_t state;

	state.trr_value = os_atomic_load(&thread->t_rr_state.trr_value, relaxed);
	assert3u(state.trr_fault_state, ==, TRR_FAULT_NONE);
	assert3u(state.trr_sync_waiting, ==, 0);
#endif
}

void
restartable_init(void)
{
	for (size_t i = 0; i < RR_HASH_SIZE; i++) {
		queue_head_init(rr_hash[i]);
	}
}

#pragma mark MiG interfaces

kern_return_t
task_restartable_ranges_register(
	task_t                     task,
	task_restartable_range_t  *ranges,
	mach_msg_type_number_t     count)
{
	kern_return_t kr;

	if (task != current_task()) {
		return KERN_FAILURE;
	}

#if CONFIG_ROSETTA
	// <rdar://problem/48527888> Obj-C adoption of task_restartable_ranges_register breaks Cambria
	if (task_is_translated(task)) {
		return KERN_RESOURCE_SHORTAGE;
	}
#endif

	kr = _ranges_validate(task, ranges, count);

	if (kr == KERN_SUCCESS) {
		task_lock(task);

		if (task->thread_count > 1) {
			kr = KERN_NOT_SUPPORTED;
#if !DEBUG && !DEVELOPMENT
		} else if (task->t_rr_ranges) {
			/*
			 * For security reasons, on release kernels,
			 * only allow for this to be configured once.
			 *
			 * But to be able to test the feature we need
			 * to relax this for dev kernels.
			 */
			kr = KERN_NOT_SUPPORTED;
#endif
		} else {
			kr = _restartable_ranges_create(task, ranges, count,
			    &task->t_rr_ranges);
		}

		task_unlock(task);
	}

	return kr;
}

kern_return_t
task_restartable_ranges_synchronize(task_t task)
{
	thread_pri_floor_t token;
	thread_t thread;
	bool needs_wait = false;
	kern_return_t kr = KERN_SUCCESS;

	if (task != current_task()) {
		return KERN_FAILURE;
	}

	/*
	 * t_rr_ranges can only be set if the process is single threaded.
	 * As a result, `t_rr_ranges` can _always_ be looked at
	 * from current_thread() without holding a lock:
	 * - either because it's the only thread in the task
	 * - or because the existence of another thread precludes
	 *   modification
	 */
	if (!task->t_rr_ranges) {
		return KERN_SUCCESS;
	}

	/*
	 * When initiating a GC, artificially raise the priority for the
	 * duration of sending ASTs; we want to be preemptible, but this
	 * sequence has to terminate in a timely fashion.
	 */
	token = thread_priority_floor_start();

	task_lock(task);

	/*
	 * In order to avoid trivial deadlocks of 2 threads trying
	 * to wait on each other while in kernel, disallow
	 * concurrent usage of task_restartable_ranges_synchronize().
	 *
	 * At the time this code was written, the one client (Objective-C)
	 * does this under lock which guarantees ordering. If we ever need
	 * more clients, the library around restartable ranges will have
	 * to synchronize in userspace.
	 */
	if (task->task_rr_in_flight) {
		kr = KERN_ALREADY_WAITING;
		goto out;
	}

	task->task_rr_in_flight = true;

	/*
	 * Pair with the acquire barriers handling RR_TSTATE_ONCORE.
	 *
	 * For threads that weren't on core, we rely on the fact
	 * that we are taking their lock in act_set_ast_reset_pcs()
	 * and that the context switch path will also take it before
	 * resuming them, which provides the required ordering.
	 *
	 * For new threads not existing yet, the task_lock()
	 * is taken to add them to the task thread list,
	 * which also synchronizes with this code.
	 */
	os_atomic_thread_fence(release);

	/*
	 * Set all the AST_RESET_PCS, and see if any thread needs
	 * actual acknowledgement.
	 */
	queue_iterate(&task->threads, thread, thread_t, task_threads) {
		if (thread != current_thread()) {
			needs_wait |= act_set_ast_reset_pcs(task, thread);
		}
	}

	/*
	 * Now wait for acknowledgement if we need any
	 */
	while (needs_wait) {
		needs_wait = false;

		queue_iterate(&task->threads, thread, thread_t, task_threads) {
			if (thread == current_thread()) {
				continue;
			}

			needs_wait = thread_rr_wait_if_needed(task, thread);
			if (needs_wait) {
				/*
				 * We dropped the task lock,
				 * we need to restart enumerating threads.
				 */
				break;
			}
		}
	}

	task->task_rr_in_flight = false;

out:
	task_unlock(task);

	thread_priority_floor_end(&token);

	return kr;
}