1 | /* |
2 | * Copyright (c) 2020 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | |
29 | #include <sys/sysctl.h> |
30 | |
31 | #include <kern/cpu_data.h> |
32 | |
33 | #if __arm64__ |
34 | #include <arm/machine_routines.h> |
35 | #endif /* __arm64__ */ |
36 | |
37 | #if CONFIG_DEBUG_SYSCALL_REJECTION |
38 | |
39 | #include <mach/mach_time.h> |
40 | |
41 | #include <kern/bits.h> |
42 | #include <kern/clock.h> |
43 | #include <kern/exc_guard.h> |
44 | #include <kern/exception.h> |
45 | #include <kern/kalloc.h> |
46 | #include <kern/simple_lock.h> |
47 | #include <kern/startup.h> |
48 | #include <kern/syscall_sw.h> |
49 | #include <kern/task.h> |
50 | |
51 | #include <pexpert/pexpert.h> |
52 | |
53 | #include <sys/syscall.h> |
54 | #include <sys/sysent.h> |
55 | #include <sys/systm.h> |
56 | #include <sys/types.h> |
57 | #include <sys/user.h> |
58 | #include <sys/variant_internal.h> |
59 | |
60 | #include <sys/kern_debug.h> |
61 | |
/*
 * Values of debug_syscall_rejection_mode (see debug_syscall_rejection_handle):
 *   IGNORE - a rejected call is ignored unless the thread's flags force it
 *            to be fatal.
 *   GUARD  - a guard exception is raised; it is only fatal if the thread's
 *            flags force it to be.
 *   CRASH  - a rejected call is always treated as fatal.
 */
#define SYSCALL_REJECTION_MODE_IGNORE 0
#define SYSCALL_REJECTION_MODE_GUARD 1
#define SYSCALL_REJECTION_MODE_CRASH 2

/*
 * System-wide rejection mode.  Defaults to GUARD on DEVELOPMENT/DEBUG
 * kernels and to IGNORE everywhere else.
 * NOTE(review): the string argument is presumably the boot-arg name that
 * can override the default -- confirm against TUNABLE_WRITEABLE.
 */
TUNABLE_WRITEABLE(int, debug_syscall_rejection_mode, "syscall_rejection_mode" ,
#if DEVELOPMENT || DEBUG
SYSCALL_REJECTION_MODE_GUARD
#else
SYSCALL_REJECTION_MODE_IGNORE
#endif
);
73 | |
74 | static int |
75 | sysctl_debug_syscall_rejection_mode(struct sysctl_oid __unused *oidp, void * __unused arg1, int __unused arg2, |
76 | struct sysctl_req *req) |
77 | { |
78 | int error, changed; |
79 | int value = *(int *) arg1; |
80 | |
81 | if (!os_variant_has_internal_diagnostics("com.apple.xnu" )) { |
82 | return ENOTSUP; |
83 | } |
84 | |
85 | error = sysctl_io_number(req, value, sizeof(value), &value, &changed); |
86 | if (!error && changed) { |
87 | debug_syscall_rejection_mode = value; |
88 | } |
89 | return error; |
90 | } |
91 | |
92 | void |
93 | reset_debug_syscall_rejection_mode(void) |
94 | { |
95 | if (!os_variant_has_internal_diagnostics("com.apple.xnu" )) { |
96 | debug_syscall_rejection_mode = 0; |
97 | } |
98 | } |
99 | |
/*
 * Registers debug_syscall_rejection_mode under the _kern node as a
 * read/write integer ("I").  The address of debug_syscall_rejection_mode is
 * passed as arg1 to sysctl_debug_syscall_rejection_mode, which performs the
 * access check and the update.
 */
SYSCTL_PROC(_kern, OID_AUTO, debug_syscall_rejection_mode, CTLFLAG_RW | CTLFLAG_LOCKED,
&debug_syscall_rejection_mode, 0, sysctl_debug_syscall_rejection_mode, "I" , "0: ignore, 1: non-fatal, 2: crash" );
102 | |
103 | |
static size_t const predefined_masks = 2; // 0: null mask (all 0), 1: all mask (all 1)

/*
 * The number of masks is derived from the mask selector width:
 *
 * A selector is just made of an index into syscall_rejection_masks,
 * with the exception of the highest bit, which indicates whether the
 * mask is to be added as an "allow" mask or a "deny" mask.
 * Additionally, predefined masks don't actually have storage and are
 * handled specially, so syscall_rejection_masks starts with the first
 * non-predefined mask (and is sized appropriately).
 *
 * Consequently a mask id `id` lives at
 * syscall_rejection_masks[id - predefined_masks], and valid ids are
 * predefined_masks .. predefined_masks + syscall_rejection_mask_count - 1.
 * A NULL entry means that mask has not been defined.
 */
static size_t const syscall_rejection_mask_count = SYSCALL_REJECTION_SELECTOR_MASK_COUNT - predefined_masks;
static syscall_rejection_mask_t syscall_rejection_masks[syscall_rejection_mask_count];

/*
 * Size used for every mask buffer in this file (allocation, memset and
 * memcpy all use it): a bitmap with one bit per mach trap followed by one
 * bit per BSD system call.
 */
#define SR_MASK_SIZE (BITMAP_SIZE(mach_trap_count + nsysent))

/* Mutex taken around every read and write of syscall_rejection_masks. */
static LCK_GRP_DECLARE(syscall_rejection_lck_grp, "syscall rejection lock" );
static LCK_MTX_DECLARE(syscall_rejection_mtx, &syscall_rejection_lck_grp);
123 | |
124 | bool |
125 | debug_syscall_rejection_handle(int syscall_mach_trap_number) |
126 | { |
127 | uthread_t ut = current_uthread(); |
128 | uint64_t const flags = ut->syscall_rejection_flags; |
129 | bool fatal = (bool)(flags & SYSCALL_REJECTION_FLAGS_FORCE_FATAL); |
130 | |
131 | switch (debug_syscall_rejection_mode) { |
132 | case SYSCALL_REJECTION_MODE_IGNORE: |
133 | if (!fatal) { |
134 | /* ignore */ |
135 | break; |
136 | } |
137 | OS_FALLTHROUGH; |
138 | case SYSCALL_REJECTION_MODE_CRASH: |
139 | fatal = true; |
140 | OS_FALLTHROUGH; |
141 | case SYSCALL_REJECTION_MODE_GUARD: { |
142 | if (flags & SYSCALL_REJECTION_FLAGS_ONCE) { |
143 | int const number = syscall_mach_trap_number < 0 ? -syscall_mach_trap_number : (mach_trap_count + syscall_mach_trap_number); |
144 | |
145 | // don't trip on this system call again |
146 | bitmap_set(ut->syscall_rejection_mask, number); |
147 | bitmap_set(ut->syscall_rejection_once_mask, number); |
148 | } |
149 | |
150 | mach_exception_code_t code = 0; |
151 | EXC_GUARD_ENCODE_TYPE(code, GUARD_TYPE_REJECTED_SC); |
152 | EXC_GUARD_ENCODE_FLAVOR(code, 0); |
153 | EXC_GUARD_ENCODE_TARGET(code, syscall_mach_trap_number < 0); |
154 | mach_exception_subcode_t subcode = |
155 | syscall_mach_trap_number < 0 ? -syscall_mach_trap_number : syscall_mach_trap_number; |
156 | |
157 | if (!fatal) { |
158 | task_violated_guard(code, subcode, NULL, TRUE); |
159 | } else { |
160 | thread_guard_violation(current_thread(), code, subcode, fatal); |
161 | } |
162 | break; |
163 | }; |
164 | default: |
165 | /* ignore */ |
166 | ; |
167 | } |
168 | return fatal; |
169 | } |
170 | |
171 | extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code, |
172 | mach_exception_data_type_t subcode); |
173 | |
174 | void |
175 | rejected_syscall_guard_ast( |
176 | thread_t t, |
177 | mach_exception_data_type_t code, |
178 | mach_exception_data_type_t subcode) |
179 | { |
180 | const bool fatal = true; |
181 | /* |
182 | * Check if anyone has registered for Synchronous EXC_GUARD, if yes then, |
183 | * deliver it synchronously and then kill the process, else kill the process |
184 | * and deliver the exception via EXC_CORPSE_NOTIFY. Always kill the process if we are not in dev mode. |
185 | */ |
186 | if (task_exception_notify(EXC_GUARD, code, subcode, fatal) == KERN_SUCCESS) { |
187 | psignal_uthread(t, SIGSYS); |
188 | } else { |
189 | exit_with_guard_exception(current_proc(), code, subcode); |
190 | } |
191 | } |
192 | |
193 | |
194 | static void |
195 | _syscall_rejection_apply_mask(syscall_rejection_mask_t dest, const syscall_rejection_mask_t src, bool apply_as_allow) |
196 | { |
197 | assert(dest != NULL); |
198 | assert(src != NULL); |
199 | |
200 | if (apply_as_allow) { |
201 | bitmap_or(dest, dest, src, mach_trap_count + nsysent); |
202 | } else { |
203 | bitmap_and_not(dest, dest, src, mach_trap_count + nsysent); |
204 | } |
205 | } |
206 | |
207 | /* |
208 | * The masks to apply are passed to the kernel as packed selectors, |
209 | * which are just however many of the selector data type fit into one |
210 | * (or more) fields of the natural word size (i.e. a register). This |
211 | * avoids copying from user space. |
212 | * |
213 | * More specifically, at the time of this writing, a selector is 7 |
214 | * bits wide, and there are two uint64_t arguments |
215 | * (args->packed_selectors<n>), so up to 18 selectors can be |
216 | * specified, which are then stuffed into the 128 bits of the |
 * arguments.  If fewer than 18 masks are requested to be applied, the
 * remaining selectors will just be left as 0, which naturally
 * resolves as the "empty" or "NULL" mask that changes nothing.
220 | * |
221 | * The libsyscall wrapper provides a more convenient interface where |
222 | * an array (up to 18 elements long) and its length are passed in, |
223 | * which the wrapper then packs into packed_selectors of the actual |
224 | * system call. |
225 | */ |
226 | |
227 | int |
228 | sys_debug_syscall_reject_config(struct proc *p __unused, struct debug_syscall_reject_config_args *args, int *retval) |
229 | { |
230 | int error = 0; |
231 | |
232 | *retval = 0; |
233 | |
234 | uthread_t ut = current_uthread(); |
235 | |
236 | bitmap_t mask[SR_MASK_SIZE / sizeof(bitmap_t)]; |
237 | // syscall rejection masks are always reset to "deny all" |
238 | memset(mask, 0, SR_MASK_SIZE); |
239 | |
240 | lck_mtx_lock(&syscall_rejection_mtx); |
241 | |
242 | for (int i = 0; |
243 | i + SYSCALL_REJECTION_SELECTOR_BITS < (sizeof(args->packed_selectors1) + sizeof(args->packed_selectors2)) * 8; |
244 | i += SYSCALL_REJECTION_SELECTOR_BITS) { |
245 | #define s_left_shift(x, n) ((n) < 0 ? ((x) >> -(n)) : ((x) << (n))) |
246 | |
247 | syscall_rejection_selector_t const selector = (syscall_rejection_selector_t) |
248 | (((i < 64 ? (args->packed_selectors1 >> i) : 0) | |
249 | (i > 64 - SYSCALL_REJECTION_SELECTOR_BITS ? s_left_shift(args->packed_selectors2, 64 - i) : 0)) & SYSCALL_REJECTION_SELECTOR_MASK); |
250 | bool const is_allow_mask = selector & SYSCALL_REJECTION_IS_ALLOW_MASK; |
251 | int const mask_index = selector & SYSCALL_REJECTION_INDEX_MASK; |
252 | |
253 | if (mask_index == SYSCALL_REJECTION_NULL) { |
254 | // mask 0 is always empty (nothing to apply) |
255 | continue; |
256 | } |
257 | |
258 | if (mask_index == SYSCALL_REJECTION_ALL) { |
259 | // mask 1 is always full (overrides everything) |
260 | memset(mask, is_allow_mask ? 0xff : 0x00, SR_MASK_SIZE); |
261 | continue; |
262 | } |
263 | |
264 | syscall_rejection_mask_t mask_to_apply = syscall_rejection_masks[mask_index - predefined_masks]; |
265 | |
266 | if (mask_to_apply == NULL) { |
267 | error = ENOENT; |
268 | goto out_locked; |
269 | } |
270 | |
271 | _syscall_rejection_apply_mask(mask, mask_to_apply, is_allow_mask); |
272 | } |
273 | |
274 | /* Not RT-safe, but only necessary once. */ |
275 | if (ut->syscall_rejection_mask == NULL) { |
276 | ut->syscall_rejection_mask = kalloc_data(SR_MASK_SIZE, Z_WAITOK); |
277 | |
278 | if (ut->syscall_rejection_mask == NULL) { |
279 | error = ENOMEM; |
280 | goto out_locked; |
281 | } |
282 | } |
283 | |
284 | memcpy(ut->syscall_rejection_mask, mask, SR_MASK_SIZE); |
285 | |
286 | if ((args->flags & SYSCALL_REJECTION_FLAGS_ONCE)) { |
287 | if (ut->syscall_rejection_once_mask == NULL) { |
288 | ut->syscall_rejection_once_mask = kalloc_data(SR_MASK_SIZE, Z_WAITOK); |
289 | |
290 | if (ut->syscall_rejection_once_mask == NULL) { |
291 | kfree_data(ut->syscall_rejection_mask, SR_MASK_SIZE); |
292 | ut->syscall_rejection_mask = NULL; |
293 | error = ENOMEM; |
294 | goto out_locked; |
295 | } |
296 | |
297 | memset(ut->syscall_rejection_once_mask, 0, SR_MASK_SIZE); |
298 | } else { |
299 | // prevent the already hit syscalls from hitting again. |
300 | bitmap_or(ut->syscall_rejection_mask, ut->syscall_rejection_mask, ut->syscall_rejection_once_mask, mach_trap_count + nsysent); |
301 | } |
302 | } |
303 | |
304 | out_locked: |
305 | lck_mtx_unlock(&syscall_rejection_mtx); |
306 | |
307 | if (error == 0) { |
308 | ut->syscall_rejection_flags = args->flags; |
309 | } |
310 | |
311 | if (error == ENOENT && debug_syscall_rejection_mode == SYSCALL_REJECTION_MODE_IGNORE) { |
312 | /* Existing code may rely on the system call failing |
313 | * gracefully if syscall rejection is currently off. */ |
314 | error = 0; |
315 | } |
316 | |
317 | return error; |
318 | } |
319 | |
320 | /* |
321 | * debug_syscall_reject |
322 | * |
323 | * Compatibility interface to the old form of the system call. |
324 | */ |
325 | int |
326 | debug_syscall_reject(struct proc *p, struct debug_syscall_reject_args *args, int *retval) |
327 | { |
328 | struct debug_syscall_reject_config_args new_args; |
329 | |
330 | bzero(&new_args, sizeof(new_args)); |
331 | new_args.packed_selectors1 = args->packed_selectors; |
332 | // packed_selectors2 left empty |
333 | new_args.flags = SYSCALL_REJECTION_FLAGS_DEFAULT; |
334 | |
335 | return sys_debug_syscall_reject_config(p, &new_args, retval); |
336 | } |
337 | |
338 | |
339 | static bool |
340 | _syscall_rejection_add(syscall_rejection_mask_t dst, char const *name) |
341 | { |
342 | /* |
343 | * Yes, this function is O(n+m), making the whole act of setting a |
344 | * mask O(l*(n+m)), but defining masks is done rarely enough (and |
345 | * i, n and m small enough) for this to not matter. |
346 | */ |
347 | |
348 | for (int i = 0; i < mach_trap_count; i++) { |
349 | if (strcmp(mach_syscall_name_table[i], name) == 0) { |
350 | bitmap_set(dst, i); |
351 | return true; |
352 | } |
353 | } |
354 | |
355 | extern char const *syscallnames[]; |
356 | |
357 | for (int i = 0; i < nsysent; i++) { |
358 | if (strcmp(syscallnames[i], name) == 0) { |
359 | bitmap_set(dst, i + mach_trap_count); |
360 | return true; |
361 | } |
362 | } |
363 | |
364 | printf("%s: trying to add non-existing syscall/mach trap '%s'\n" , __func__, name); |
365 | return false; |
366 | } |
367 | |
368 | /* Pretty much arbitrary, we just don't want userspace to pass |
369 | * unreasonably large buffers to parse. */ |
370 | static size_t const max_input_size = 16 * PAGE_MAX_SIZE; |
371 | |
372 | static int |
373 | _sysctl_debug_syscall_rejection_masks(struct sysctl_oid __unused *oidp, void * __unused arg1, int __unused arg2, |
374 | struct sysctl_req *req) |
375 | { |
376 | size_t const max_name_len = 128; |
377 | char name[max_name_len]; |
378 | |
379 | if (req->newptr == 0) { |
380 | return 0; |
381 | } |
382 | |
383 | if (req->newlen > max_input_size) { |
384 | return E2BIG; |
385 | } |
386 | |
387 | size_t const len = req->newlen; |
388 | char *buf = kalloc_data(len + 1, Z_WAITOK); |
389 | |
390 | if (buf == NULL) { |
391 | return ENOMEM; |
392 | } |
393 | |
394 | /* |
395 | * sysctl_io_string always copies out the given buffer as the |
396 | * "old" value if requested. We could construct a text |
397 | * representation of existing masks, but this is not particularly |
398 | * interesting, so we just return the dummy string "<masks>". |
399 | */ |
400 | strlcpy(buf, "<masks>" , len + 1); |
401 | int changed = 0; |
402 | int error = sysctl_io_string(req, buf, len + 1, 0, &changed); |
403 | |
404 | if (error != 0 || !changed) { |
405 | goto out; |
406 | } |
407 | |
408 | char const *p = buf; |
409 | |
410 | int id = 0; |
411 | int l = 0; |
412 | int n = sscanf(p, "%i: %n" , &id, &l); |
413 | |
414 | if (n != 1 || id < predefined_masks || id > syscall_rejection_mask_count + predefined_masks) { |
415 | printf("%s: invalid mask id %i (or conversion failed)\n" , __FUNCTION__, id); |
416 | error = EINVAL; |
417 | goto out; |
418 | } |
419 | |
420 | p += l; |
421 | |
422 | syscall_rejection_mask_t new_mask = kalloc_data(SR_MASK_SIZE, |
423 | Z_WAITOK | Z_ZERO); |
424 | if (new_mask == NULL) { |
425 | printf("%s: allocating new mask for id %i failed\n" , __FUNCTION__, id); |
426 | error = ENOMEM; |
427 | goto out; |
428 | } |
429 | |
430 | error = 0; |
431 | |
432 | while (p < buf + len && *p != 0) { |
433 | name[0] = 0; |
434 | n = sscanf(p, "%127s %n" , name, &l); |
435 | if (n != 1 || name[0] == 0) { |
436 | error = EINVAL; |
437 | kfree_data(new_mask, SR_MASK_SIZE); |
438 | goto out; |
439 | } |
440 | |
441 | if (!_syscall_rejection_add(new_mask, name)) { |
442 | error = ENOENT; |
443 | kfree_data(new_mask, SR_MASK_SIZE); |
444 | goto out; |
445 | } |
446 | |
447 | p += l; |
448 | } |
449 | |
450 | |
451 | syscall_rejection_mask_t to_free = NULL; |
452 | |
453 | lck_mtx_lock(&syscall_rejection_mtx); |
454 | |
455 | syscall_rejection_mask_t *target_mask = &syscall_rejection_masks[id - predefined_masks]; |
456 | |
457 | to_free = *target_mask; |
458 | *target_mask = new_mask; |
459 | |
460 | lck_mtx_unlock(&syscall_rejection_mtx); |
461 | |
462 | kfree_data(to_free, SR_MASK_SIZE); |
463 | out: |
464 | |
465 | kfree_data(buf, len + 1); |
466 | return error; |
467 | } |
468 | |
/*
 * Registers syscall_rejection_masks under the _kern node as a write-only
 * (CTLFLAG_WR), masked string sysctl handled by
 * _sysctl_debug_syscall_rejection_masks.  No arg1/arg2 are passed.
 */
SYSCTL_PROC(_kern, OID_AUTO, syscall_rejection_masks, CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_LOCKED,
0, 0, _sysctl_debug_syscall_rejection_masks, "A" , "system call rejection masks" );
471 | |
472 | #else /* CONFIG_DEBUG_SYSCALL_REJECTION */ |
473 | |
474 | #include <sys/kern_debug.h> |
475 | |
476 | int |
477 | sys_debug_syscall_reject_config(struct proc * __unused p, struct debug_syscall_reject_config_args * __unused args, int __unused *ret) |
478 | { |
479 | /* not supported. */ |
480 | return ENOTSUP; |
481 | } |
482 | |
483 | int |
484 | debug_syscall_reject(struct proc * __unused p, struct debug_syscall_reject_args * __unused args, int * __unused retval) |
485 | { |
486 | /* not supported. */ |
487 | return ENOTSUP; |
488 | } |
489 | |
/*
 * Stub used when CONFIG_DEBUG_SYSCALL_REJECTION is not configured: there
 * is no rejection mode to reset, so this does nothing.
 */
void
reset_debug_syscall_rejection_mode(void)
{
	/* not supported. */
}
495 | |
496 | #endif /* CONFIG_DEBUG_SYSCALL_REJECTION */ |
497 | |
498 | #if __arm64__ && (DEBUG || DEVELOPMENT) |
499 | |
/*
 * Busy-wait for the given number of nanoseconds.
 *
 * The duration is converted to absolute-time units and added to the
 * current mach_absolute_time() to form a deadline; the function then polls
 * mach_absolute_time() until the deadline is reached.
 */
static void
_spinfor(uint64_t nanoseconds)
{
	uint64_t duration_abs = 0;
	nanoseconds_to_absolutetime(nanoseconds, &duration_abs);

	uint64_t const deadline = mach_absolute_time() + duration_abs;

	while (mach_absolute_time() < deadline) {
		// Spinning.
		continue;
	}
}
512 | |
513 | static int |
514 | _sysctl_debug_disable_interrupts_test(struct sysctl_oid __unused *oidp, void * __unused arg1, int __unused arg2, |
515 | struct sysctl_req *req) |
516 | { |
517 | int error = 0; |
518 | |
519 | if (req->newptr == 0) { |
520 | goto out; |
521 | } |
522 | |
523 | uint64_t val = 0; |
524 | error = sysctl_io_number(req, 0, sizeof(val), &val, NULL); |
525 | |
526 | if (error != 0 || val == 0) { |
527 | goto out; |
528 | } |
529 | |
530 | boolean_t istate = ml_set_interrupts_enabled(false); |
531 | _spinfor(val); |
532 | ml_set_interrupts_enabled(istate); |
533 | |
534 | out: |
535 | return error; |
536 | } |
537 | |
538 | static int |
539 | _sysctl_debug_disable_preemption_test(struct sysctl_oid __unused *oidp, void * __unused arg1, int __unused arg2, |
540 | struct sysctl_req *req) |
541 | { |
542 | int error = 0; |
543 | |
544 | if (req->newptr == 0) { |
545 | goto out; |
546 | } |
547 | |
548 | uint64_t val = 0; |
549 | error = sysctl_io_number(req, 0, sizeof(val), &val, NULL); |
550 | |
551 | if (error != 0 || val == 0) { |
552 | goto out; |
553 | } |
554 | |
555 | disable_preemption(); |
556 | _spinfor(val); |
557 | enable_preemption(); |
558 | |
559 | out: |
560 | return error; |
561 | } |
562 | |
/*
 * Registers debug_disable_interrupts_test under the _kern node: a
 * write-only, masked 64-bit ("Q") sysctl whose written value is the spin
 * duration in nanoseconds, handled by _sysctl_debug_disable_interrupts_test.
 */
SYSCTL_PROC(_kern, OID_AUTO, debug_disable_interrupts_test, CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_LOCKED,
0, 0, _sysctl_debug_disable_interrupts_test, "Q" , "disable interrupts for specified number of nanoseconds, for testing" );

/*
 * Registers debug_disable_preemption_test under the _kern node: a
 * write-only, masked 64-bit ("Q") sysctl whose written value is the spin
 * duration in nanoseconds, handled by _sysctl_debug_disable_preemption_test.
 */
SYSCTL_PROC(_kern, OID_AUTO, debug_disable_preemption_test, CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_LOCKED,
0, 0, _sysctl_debug_disable_preemption_test, "Q" , "disable preemption for specified number of nanoseconds, for testing" );
568 | |
569 | #endif /* __arm64__ && (DEBUG || DEVELOPMENT) */ |
570 | |