1 | /* |
2 | * CDDL HEADER START |
3 | * |
4 | * The contents of this file are subject to the terms of the |
5 | * Common Development and Distribution License (the "License"). |
6 | * You may not use this file except in compliance with the License. |
7 | * |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
9 | * or http://www.opensolaris.org/os/licensing. |
10 | * See the License for the specific language governing permissions |
11 | * and limitations under the License. |
12 | * |
13 | * When distributing Covered Code, include this CDDL HEADER in each |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
15 | * If applicable, add the following below this CDDL HEADER, with the |
16 | * fields enclosed by brackets "[]" replaced with your own identifying |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] |
18 | * |
19 | * CDDL HEADER END |
20 | */ |
21 | /* |
22 | * Copyright 2009 Sun Microsystems, Inc. All rights reserved. |
23 | * Use is subject to license terms. |
24 | */ |
25 | |
26 | #include <ptrauth.h> |
27 | |
28 | #include <kern/thread.h> |
29 | #include <mach/thread_status.h> |
30 | |
/* XXX All of these should really be derived from syscall_sw.h */
#if defined (__x86_64__)
/* Bit position of the class field within an x86_64 trap number. */
#define SYSCALL_CLASS_SHIFT 24
/*
 * Eight-bit class field.
 * NOTE(review): 0xFF << 24 does not fit in a signed int; kept textually as-is
 * in case an identical definition from syscall_sw.h is also in scope -- confirm
 * before changing the literal to unsigned.
 */
#define SYSCALL_CLASS_MASK (0xFF << SYSCALL_CLASS_SHIFT)
/* All bits outside the class field; applied to rax to obtain the call number. */
#define SYSCALL_NUMBER_MASK (~SYSCALL_CLASS_MASK)
/* Low 16 bits; applied to eax to obtain the call number for 32-bit saved state. */
#define I386_SYSCALL_NUMBER_MASK (0xFFFF)
#endif
38 | |
39 | #include <sys/param.h> |
40 | #include <sys/systm.h> |
41 | #include <sys/proc.h> |
42 | #include <sys/errno.h> |
43 | #include <sys/ioctl.h> |
44 | #include <sys/conf.h> |
45 | #include <sys/fcntl.h> |
46 | #include <sys/syscall.h> |
47 | #include <miscfs/devfs/devfs.h> |
48 | |
49 | #include <sys/dtrace.h> |
50 | #include <sys/dtrace_impl.h> |
51 | #include <sys/systrace_args.h> |
52 | #include "systrace.h" |
53 | #include <sys/stat.h> |
54 | #include <sys/systm.h> |
55 | #include <sys/conf.h> |
56 | #include <sys/user.h> |
57 | |
58 | #include <machine/pal_routines.h> |
59 | |
/*
 * Artificial-frame counts handed to dtrace_probe_create() for the syscall
 * and mach_trap probes.  Every supported architecture uses the same values;
 * anything else is rejected at build time.
 */
#if defined(__x86_64__) || defined(__arm64__)
#define SYSTRACE_ARTIFICIAL_FRAMES      2
#define MACHTRACE_ARTIFICIAL_FRAMES     3
#else
#error Unknown Architecture
#endif
69 | |
/*
 * Number of elements in a uthread's uu_arg[] array.  The NULL pointer is only
 * used inside sizeof, so nothing is dereferenced.
 */
#define SYSTRACE_NARGS (int)(sizeof(((uthread_t)NULL)->uu_arg) / sizeof(((uthread_t)NULL)->uu_arg[0]))
/* Number of syscall_arg_t slots that fit in struct mach_call_args. */
#define MACHTRACE_NARGS (int)(sizeof(struct mach_call_args) / sizeof(syscall_arg_t))
72 | |
73 | #include <sys/sysent.h> |
/* Solaris-derived code below says sy_callc / NSYSCALL; map both onto Darwin names. */
#define sy_callc sy_call /* Map Solaris slot name to Darwin's */
#define NSYSCALL nsysent /* and is less than 500 or so */

/* System call names, indexed by system call number when probes are created. */
extern const char *syscallnames[];
78 | |
79 | #include <sys/dtrace_glue.h> |
/* Solaris primitive names mapped onto the DTrace glue layer's equivalents. */
#define casptr dtrace_casptr
#define membar_enter dtrace_membar_producer

/*
 * Constant stand-ins for the Solaris loadable-syscall tests: with these
 * values "LOADABLE_SYSCALL(a) && !LOADED_SYSCALL(a)" is always false.
 */
#define LOADABLE_SYSCALL(a) 0 /* Not pertinent to Darwin. */
#define LOADED_SYSCALL(a) 1 /* Not pertinent to Darwin. */
85 | |
/*
 * Taken around every rewrite of a dispatch-table slot (sysent[] and
 * mach_trap_table[]) in the enable/disable routines below.
 */
static LCK_MTX_DECLARE_ATTR(dtrace_systrace_lock,
    &dtrace_lck_grp, &dtrace_lck_attr); /* probe state lock */

/* Shadow table for sysent[]; allocated on the first systrace_provide() call. */
systrace_sysent_t *systrace_sysent = NULL;
/* Probe hook: set to dtrace_probe on attach, or systrace_stub if attach fails. */
void (*systrace_probe)(dtrace_id_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t);

/* Provider callbacks defined near the end of this file. */
static uint64_t systrace_getargval(void *, dtrace_id_t, void *, int, int);
static void systrace_getargdesc(void *, dtrace_id_t, void *, dtrace_argdesc_t *);
94 | |
95 | void |
96 | systrace_stub(dtrace_id_t id, uint64_t arg0, uint64_t arg1, |
97 | uint64_t arg2, uint64_t arg3, uint64_t arg4) |
98 | { |
99 | #pragma unused(id,arg0,arg1,arg2,arg3,arg4) |
100 | } |
101 | |
102 | int32_t |
103 | dtrace_systrace_syscall(struct proc *pp, void *uap, int *rv) |
104 | { |
105 | unsigned short code; /* The system call number */ |
106 | |
107 | systrace_sysent_t *sy; |
108 | dtrace_id_t id; |
109 | int32_t rval; |
110 | syscall_arg_t *ip = (syscall_arg_t *)uap; |
111 | uint64_t uargs[SYSTRACE_NARGS] = {0}; |
112 | |
113 | #if defined (__x86_64__) |
114 | { |
115 | pal_register_cache_state(current_thread(), VALID); |
116 | x86_saved_state_t *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread()); |
117 | |
118 | if (is_saved_state64(tagged_regs)) { |
119 | x86_saved_state64_t *regs = saved_state64(tagged_regs); |
120 | code = regs->rax & SYSCALL_NUMBER_MASK; |
121 | /* |
122 | * Check for indirect system call... system call number |
123 | * passed as 'arg0' |
124 | */ |
125 | if (code == 0) { |
126 | code = regs->rdi; |
127 | } |
128 | } else { |
129 | code = saved_state32(tagged_regs)->eax & I386_SYSCALL_NUMBER_MASK; |
130 | |
131 | if (code == 0) { |
132 | vm_offset_t params = (vm_offset_t) (saved_state32(tagged_regs)->uesp + sizeof(int)); |
133 | code = fuword(params); |
134 | } |
135 | } |
136 | } |
137 | #elif defined(__arm64__) |
138 | { |
139 | /* |
140 | * On arm64, syscall numbers depend on a flavor (indirect or not) |
141 | * ... and for u32 can be in either r0 or r12 |
142 | * ... and for u64 can be in either x0 or x16 |
143 | */ |
144 | |
145 | /* see bsd/dev/arm/systemcalls.c:arm_get_syscall_number */ |
146 | arm_saved_state_t *arm_regs = (arm_saved_state_t *) find_user_regs(thread: current_thread()); |
147 | |
148 | if (is_saved_state32(iss: arm_regs)) { |
149 | /* Check for indirect system call */ |
150 | if (saved_state32(iss: arm_regs)->r[12] != 0) { |
151 | code = saved_state32(iss: arm_regs)->r[12]; |
152 | } else { |
153 | code = saved_state32(iss: arm_regs)->r[0]; |
154 | } |
155 | } else { |
156 | /* Check for indirect system call */ |
157 | if (saved_state64(iss: arm_regs)->x[ARM64_SYSCALL_CODE_REG_NUM] != 0) { |
158 | code = saved_state64(iss: arm_regs)->x[ARM64_SYSCALL_CODE_REG_NUM]; |
159 | } else { |
160 | code = saved_state64(iss: arm_regs)->x[0]; |
161 | } |
162 | } |
163 | } |
164 | #else |
165 | #error Unknown Architecture |
166 | #endif |
167 | |
168 | // Bounds "check" the value of code a la unix_syscall |
169 | sy = (code >= nsysent) ? &systrace_sysent[SYS_invalid] : &systrace_sysent[code]; |
170 | |
171 | systrace_args(sysnum: code, params: ip, uarg: uargs); |
172 | |
173 | if ((id = sy->stsy_entry) != DTRACE_IDNONE) { |
174 | uthread_t uthread = current_uthread(); |
175 | if (uthread) { |
176 | uthread->t_dtrace_syscall_args = uargs; |
177 | } |
178 | |
179 | static_assert(SYSTRACE_NARGS >= 5, "not enough system call arguments" ); |
180 | (*systrace_probe)(id, uargs[0], uargs[1], uargs[2], uargs[3], uargs[4]); |
181 | |
182 | if (uthread) { |
183 | uthread->t_dtrace_syscall_args = NULL; |
184 | } |
185 | } |
186 | |
187 | |
188 | |
189 | #if 0 /* XXX */ |
190 | /* |
191 | * APPLE NOTE: Not implemented. |
192 | * We want to explicitly allow DTrace consumers to stop a process |
193 | * before it actually executes the meat of the syscall. |
194 | */ |
195 | p = ttoproc(curthread); |
196 | mutex_enter(&p->p_lock); |
197 | if (curthread->t_dtrace_stop && !curthread->t_lwp->lwp_nostop) { |
198 | curthread->t_dtrace_stop = 0; |
199 | stop(PR_REQUESTED, 0); |
200 | } |
201 | mutex_exit(&p->p_lock); |
202 | #endif |
203 | |
204 | rval = (*sy->stsy_underlying)(pp, uap, rv); |
205 | |
206 | if ((id = sy->stsy_return) != DTRACE_IDNONE) { |
207 | uint64_t munged_rv0, munged_rv1; |
208 | uthread_t uthread = current_uthread(); |
209 | |
210 | if (uthread) { |
211 | uthread->t_dtrace_errno = rval; /* Establish t_dtrace_errno now in case this enabling refers to it. */ |
212 | } |
213 | /* |
214 | * "Decode" rv for use in the call to dtrace_probe() |
215 | */ |
216 | if (rval == ERESTART) { |
217 | munged_rv0 = -1LL; /* System call will be reissued in user mode. Make DTrace report a -1 return. */ |
218 | munged_rv1 = -1LL; |
219 | } else if (rval != EJUSTRETURN) { |
220 | if (rval) { |
221 | munged_rv0 = -1LL; /* Mimic what libc will do. */ |
222 | munged_rv1 = -1LL; |
223 | } else { |
224 | switch (sy->stsy_return_type) { |
225 | case _SYSCALL_RET_INT_T: |
226 | munged_rv0 = rv[0]; |
227 | munged_rv1 = rv[1]; |
228 | break; |
229 | case _SYSCALL_RET_UINT_T: |
230 | munged_rv0 = ((u_int)rv[0]); |
231 | munged_rv1 = ((u_int)rv[1]); |
232 | break; |
233 | case _SYSCALL_RET_OFF_T: |
234 | case _SYSCALL_RET_UINT64_T: |
235 | munged_rv0 = *(u_int64_t *)rv; |
236 | munged_rv1 = 0LL; |
237 | break; |
238 | case _SYSCALL_RET_ADDR_T: |
239 | case _SYSCALL_RET_SIZE_T: |
240 | case _SYSCALL_RET_SSIZE_T: |
241 | munged_rv0 = *(user_addr_t *)rv; |
242 | munged_rv1 = 0LL; |
243 | break; |
244 | case _SYSCALL_RET_NONE: |
245 | munged_rv0 = 0LL; |
246 | munged_rv1 = 0LL; |
247 | break; |
248 | default: |
249 | munged_rv0 = 0LL; |
250 | munged_rv1 = 0LL; |
251 | break; |
252 | } |
253 | } |
254 | } else { |
255 | munged_rv0 = 0LL; |
256 | munged_rv1 = 0LL; |
257 | } |
258 | |
259 | /* |
260 | * <http://mail.opensolaris.org/pipermail/dtrace-discuss/2007-January/003276.html> says: |
261 | * |
262 | * "This is a bit of an historical artifact. At first, the syscall provider just |
263 | * had its return value in arg0, and the fbt and pid providers had their return |
264 | * values in arg1 (so that we could use arg0 for the offset of the return site). |
265 | * |
266 | * We inevitably started writing scripts where we wanted to see the return |
267 | * values from probes in all three providers, and we made this script easier |
268 | * to write by replicating the syscall return values in arg1 to match fbt and |
269 | * pid. We debated briefly about removing the return value from arg0, but |
270 | * decided that it would be less confusing to have the same data in two places |
271 | * than to have some non-helpful, non-intuitive value in arg0. |
272 | * |
273 | * This change was made 4/23/2003 according to the DTrace project's putback log." |
274 | */ |
275 | (*systrace_probe)(id, munged_rv0, munged_rv0, munged_rv1, (uint64_t)rval, 0); |
276 | } |
277 | |
278 | return rval; |
279 | } |
280 | |
281 | void |
282 | dtrace_systrace_syscall_return(unsigned short code, int rval, int *rv) |
283 | { |
284 | systrace_sysent_t *sy; |
285 | dtrace_id_t id; |
286 | |
287 | // Bounds "check" the value of code a la unix_syscall_return |
288 | sy = (code >= nsysent) ? &systrace_sysent[SYS_invalid] : &systrace_sysent[code]; |
289 | |
290 | if ((id = sy->stsy_return) != DTRACE_IDNONE) { |
291 | uint64_t munged_rv0, munged_rv1; |
292 | uthread_t uthread = current_uthread(); |
293 | |
294 | if (uthread) { |
295 | uthread->t_dtrace_errno = rval; /* Establish t_dtrace_errno now in case this enabling refers to it. */ |
296 | } |
297 | /* |
298 | * "Decode" rv for use in the call to dtrace_probe() |
299 | */ |
300 | if (rval == ERESTART) { |
301 | munged_rv0 = -1LL; /* System call will be reissued in user mode. Make DTrace report a -1 return. */ |
302 | munged_rv1 = -1LL; |
303 | } else if (rval != EJUSTRETURN) { |
304 | if (rval) { |
305 | munged_rv0 = -1LL; /* Mimic what libc will do. */ |
306 | munged_rv1 = -1LL; |
307 | } else { |
308 | switch (sy->stsy_return_type) { |
309 | case _SYSCALL_RET_INT_T: |
310 | munged_rv0 = rv[0]; |
311 | munged_rv1 = rv[1]; |
312 | break; |
313 | case _SYSCALL_RET_UINT_T: |
314 | munged_rv0 = ((u_int)rv[0]); |
315 | munged_rv1 = ((u_int)rv[1]); |
316 | break; |
317 | case _SYSCALL_RET_OFF_T: |
318 | case _SYSCALL_RET_UINT64_T: |
319 | munged_rv0 = *(u_int64_t *)rv; |
320 | munged_rv1 = 0LL; |
321 | break; |
322 | case _SYSCALL_RET_ADDR_T: |
323 | case _SYSCALL_RET_SIZE_T: |
324 | case _SYSCALL_RET_SSIZE_T: |
325 | munged_rv0 = *(user_addr_t *)rv; |
326 | munged_rv1 = 0LL; |
327 | break; |
328 | case _SYSCALL_RET_NONE: |
329 | munged_rv0 = 0LL; |
330 | munged_rv1 = 0LL; |
331 | break; |
332 | default: |
333 | munged_rv0 = 0LL; |
334 | munged_rv1 = 0LL; |
335 | break; |
336 | } |
337 | } |
338 | } else { |
339 | munged_rv0 = 0LL; |
340 | munged_rv1 = 0LL; |
341 | } |
342 | |
343 | (*systrace_probe)(id, munged_rv0, munged_rv0, munged_rv1, (uint64_t)rval, 0); |
344 | } |
345 | } |
346 | |
/*
 * Encoding of the per-probe cookie passed to dtrace_probe_create():
 * bits 0..(SYSTRACE_SHIFT - 1) hold the call number, and the bit at
 * SYSTRACE_SHIFT is set for an entry probe and clear for a return probe.
 */
#define SYSTRACE_SHIFT          16
#define SYSTRACE_ISENTRY(x)     (((int)(x)) >> SYSTRACE_SHIFT)
#define SYSTRACE_SYSNUM(x)      (((int)(x)) & ((1 << SYSTRACE_SHIFT) - 1))
#define SYSTRACE_ENTRY(id)      ((id) | (1 << SYSTRACE_SHIFT))
#define SYSTRACE_RETURN(id)     (id)

/*
 * NOTE(review): NSYSCALL is mapped to nsysent; if that is a variable rather
 * than a macro constant the preprocessor evaluates it as 0 and this check
 * can never fire -- confirm.
 */
#if (NSYSCALL >= (1 << SYSTRACE_SHIFT))
#error 1 << SYSTRACE_SHIFT must exceed number of system calls
#endif
356 | |
/* Provider handle filled in by dtrace_register() in systrace_attach(). */
static dtrace_provider_id_t systrace_id;

/*
 * APPLE NOTE: Avoid name clash with Darwin automagic conf symbol.
 * See balanced undef below.
 *
 * While this macro is in effect, "systrace_init" in the code that follows
 * names the static table-building helper, not the public systrace_init()
 * defined near the end of the file.
 */
#define systrace_init _systrace_init
364 | |
365 | static void |
366 | systrace_init(const struct sysent *actual, systrace_sysent_t **interposed) |
367 | { |
368 | systrace_sysent_t *ssysent = *interposed; /* Avoid sysent shadow warning |
369 | * from bsd/sys/sysent.h */ |
370 | unsigned int i; |
371 | |
372 | if (ssysent == NULL) { |
373 | *interposed = ssysent = kmem_zalloc(sizeof(systrace_sysent_t) * |
374 | NSYSCALL, KM_SLEEP); |
375 | } |
376 | |
377 | for (i = 0; i < NSYSCALL; i++) { |
378 | /* Use of volatile protects the if statement below from being optimized away */ |
379 | const volatile struct sysent *a = &actual[i]; |
380 | systrace_sysent_t *s = &ssysent[i]; |
381 | |
382 | if (LOADABLE_SYSCALL(a) && !LOADED_SYSCALL(a)) { |
383 | continue; |
384 | } |
385 | |
386 | if (a->sy_callc == dtrace_systrace_syscall) { |
387 | continue; |
388 | } |
389 | |
390 | s->stsy_underlying = a->sy_callc; |
391 | s->stsy_return_type = a->sy_return_type; |
392 | } |
393 | } |
394 | |
395 | |
396 | /*ARGSUSED*/ |
397 | static void |
398 | systrace_provide(void *arg, const dtrace_probedesc_t *desc) |
399 | { |
400 | #pragma unused(arg) /* __APPLE__ */ |
401 | unsigned int i; |
402 | |
403 | if (desc != NULL) { |
404 | return; |
405 | } |
406 | |
407 | systrace_init(actual: sysent, interposed: &systrace_sysent); |
408 | |
409 | for (i = 0; i < NSYSCALL; i++) { |
410 | if (systrace_sysent[i].stsy_underlying == NULL) { |
411 | continue; |
412 | } |
413 | |
414 | if (dtrace_probe_lookup(systrace_id, NULL, |
415 | syscallnames[i], "entry" ) != 0) { |
416 | continue; |
417 | } |
418 | |
419 | (void) dtrace_probe_create(systrace_id, NULL, syscallnames[i], |
420 | "entry" , SYSTRACE_ARTIFICIAL_FRAMES, |
421 | (void *)((uintptr_t)SYSTRACE_ENTRY(i))); |
422 | (void) dtrace_probe_create(systrace_id, NULL, syscallnames[i], |
423 | "return" , SYSTRACE_ARTIFICIAL_FRAMES, |
424 | (void *)((uintptr_t)SYSTRACE_RETURN(i))); |
425 | |
426 | systrace_sysent[i].stsy_entry = DTRACE_IDNONE; |
427 | systrace_sysent[i].stsy_return = DTRACE_IDNONE; |
428 | } |
429 | } |
430 | #undef systrace_init |
431 | |
432 | /*ARGSUSED*/ |
433 | static void |
434 | systrace_destroy(void *arg, dtrace_id_t id, void *parg) |
435 | { |
436 | #pragma unused(arg,id) /* __APPLE__ */ |
437 | |
438 | int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg); |
439 | |
440 | #pragma unused(sysnum) /* __APPLE__ */ |
441 | /* |
442 | * There's nothing to do here but assert that we have actually been |
443 | * disabled. |
444 | */ |
445 | if (SYSTRACE_ISENTRY((uintptr_t)parg)) { |
446 | ASSERT(systrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE); |
447 | } else { |
448 | ASSERT(systrace_sysent[sysnum].stsy_return == DTRACE_IDNONE); |
449 | } |
450 | } |
451 | |
452 | /*ARGSUSED*/ |
453 | static int |
454 | systrace_enable(void *arg, dtrace_id_t id, void *parg) |
455 | { |
456 | #pragma unused(arg) /* __APPLE__ */ |
457 | |
458 | int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg); |
459 | int enabled = (systrace_sysent[sysnum].stsy_entry != DTRACE_IDNONE || |
460 | systrace_sysent[sysnum].stsy_return != DTRACE_IDNONE); |
461 | |
462 | if (SYSTRACE_ISENTRY((uintptr_t)parg)) { |
463 | systrace_sysent[sysnum].stsy_entry = id; |
464 | } else { |
465 | systrace_sysent[sysnum].stsy_return = id; |
466 | } |
467 | |
468 | if (enabled) { |
469 | ASSERT(sysent[sysnum].sy_callc == dtrace_systrace_syscall); |
470 | return 0; |
471 | } |
472 | |
473 | lck_mtx_lock(lck: &dtrace_systrace_lock); |
474 | if (sysent[sysnum].sy_callc == systrace_sysent[sysnum].stsy_underlying) { |
475 | /* It is not possible to write to sysent[] directly because it is const. */ |
476 | vm_offset_t dss = ptrauth_nop_cast(vm_offset_t, &dtrace_systrace_syscall); |
477 | ml_nofault_copy(virtsrc: (vm_offset_t)&dss, virtdst: (vm_offset_t)&sysent[sysnum].sy_callc, size: sizeof(vm_offset_t)); |
478 | } |
479 | lck_mtx_unlock(lck: &dtrace_systrace_lock); |
480 | |
481 | return 0; |
482 | } |
483 | |
484 | /*ARGSUSED*/ |
485 | static void |
486 | systrace_disable(void *arg, dtrace_id_t id, void *parg) |
487 | { |
488 | #pragma unused(arg,id) /* __APPLE__ */ |
489 | |
490 | int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg); |
491 | int disable = (systrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE || |
492 | systrace_sysent[sysnum].stsy_return == DTRACE_IDNONE); |
493 | |
494 | if (disable) { |
495 | /* |
496 | * Usage of volatile protects the if statement below from being optimized away. |
497 | * |
498 | * Compilers are clever and know that const array values can't change in time |
499 | * and the if below is always false. That is because it can't see that DTrace |
500 | * injects dtrace_systrace_syscall dynamically and violates constness of the |
501 | * array. |
502 | */ |
503 | volatile const struct sysent *syscallent = &sysent[sysnum]; |
504 | |
505 | lck_mtx_lock(lck: &dtrace_systrace_lock); |
506 | if (syscallent->sy_callc == dtrace_systrace_syscall) { |
507 | ml_nofault_copy(virtsrc: (vm_offset_t)&systrace_sysent[sysnum].stsy_underlying, |
508 | virtdst: (vm_offset_t)&syscallent->sy_callc, size: sizeof(vm_offset_t)); |
509 | } |
510 | lck_mtx_unlock(lck: &dtrace_systrace_lock); |
511 | } |
512 | |
513 | if (SYSTRACE_ISENTRY((uintptr_t)parg)) { |
514 | systrace_sysent[sysnum].stsy_entry = DTRACE_IDNONE; |
515 | } else { |
516 | systrace_sysent[sysnum].stsy_return = DTRACE_IDNONE; |
517 | } |
518 | } |
519 | |
/*
 * Stability attributes passed to dtrace_register() for the "syscall" provider.
 * NOTE(review): the five rows presumably follow the dtrace_pattr_t field order
 * (provider, module, function, name, arguments) -- confirm against
 * <sys/dtrace.h>.
 */
static dtrace_pattr_t systrace_attr = {
	{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
	{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
	{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
	{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
	{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
};
527 | |
/*
 * Provider operations passed to dtrace_register() for the "syscall" provider.
 * Callbacks this provider does not implement are left NULL.
 */
static dtrace_pops_t systrace_pops = {
	.dtps_provide =         systrace_provide,
	.dtps_provide_module =  NULL,
	.dtps_enable =          systrace_enable,
	.dtps_disable =         systrace_disable,
	.dtps_suspend =         NULL,
	.dtps_resume =          NULL,
	.dtps_getargdesc =      systrace_getargdesc,
	.dtps_getargval =       systrace_getargval,
	.dtps_usermode =        NULL,
	.dtps_destroy =         systrace_destroy
};
540 | |
541 | static int |
542 | systrace_attach(dev_info_t *devi) |
543 | { |
544 | systrace_probe = (void*)&dtrace_probe; |
545 | membar_enter(); |
546 | |
547 | if (ddi_create_minor_node(devi, "systrace" , S_IFCHR, 0, |
548 | DDI_PSEUDO, 0) == DDI_FAILURE || |
549 | dtrace_register("syscall" , &systrace_attr, DTRACE_PRIV_USER, NULL, |
550 | &systrace_pops, NULL, &systrace_id) != 0) { |
551 | systrace_probe = systrace_stub; |
552 | ddi_remove_minor_node(devi, NULL); |
553 | return DDI_FAILURE; |
554 | } |
555 | |
556 | return DDI_SUCCESS; |
557 | } |
558 | |
559 | |
560 | /* |
561 | * APPLE NOTE: systrace_detach not implemented |
562 | */ |
563 | #if !defined(__APPLE__) |
564 | static int |
565 | systrace_detach(dev_info_t *devi, ddi_detach_cmd_t cmd) |
566 | { |
567 | switch (cmd) { |
568 | case DDI_DETACH: |
569 | break; |
570 | case DDI_SUSPEND: |
571 | return DDI_SUCCESS; |
572 | default: |
573 | return DDI_FAILURE; |
574 | } |
575 | |
576 | if (dtrace_unregister(systrace_id) != 0) { |
577 | return DDI_FAILURE; |
578 | } |
579 | |
580 | ddi_remove_minor_node(devi, NULL); |
581 | systrace_probe = systrace_stub; |
582 | return DDI_SUCCESS; |
583 | } |
584 | #endif /* __APPLE__ */ |
585 | |
586 | |
/* Signature through which Mach trap handlers are invoked by this file. */
typedef kern_return_t (*mach_call_t)(void *);

/* APPLE NOTE: From #include <kern/syscall_sw.h> which may be changed for 64 bit! */
#if CONFIG_REQUIRES_U32_MUNGING
typedef void    mach_munge_t(void *);
#elif __arm__ && (__BIGGEST_ALIGNMENT__ > 4)
typedef int     mach_munge_t(const void *, void *);
#endif

/*
 * Local declaration of one Mach trap table entry.
 * NOTE(review): this is a hand-copied layout; it must stay in step with the
 * definition used by the code that owns mach_trap_table -- confirm on update.
 */
typedef struct {
	unsigned char           mach_trap_arg_count; /* Number of trap arguments (Arch independent) */
	unsigned char           mach_trap_u32_words; /* number of 32-bit words to copyin for U32 */
	unsigned char           mach_trap_returns_port;
	unsigned char           __mach_trap_padding;
	kern_return_t           (*mach_trap_function)(void *); /* handler; the slot machtrace_enable() redirects */
#if CONFIG_REQUIRES_U32_MUNGING || (__arm__ && (__BIGGEST_ALIGNMENT__ > 4))
	mach_munge_t            *mach_trap_arg_munge32; /* system call argument munger routine for 32-bit */
#endif
#if     MACH_ASSERT
	const char              *mach_trap_name;
#endif /* MACH_ASSERT */
} mach_trap_t;
609 | |
610 | |
/* Declared number of slots in the Mach trap table and its name table. */
#define MACH_TRAP_TABLE_COUNT   128

extern const mach_trap_t mach_trap_table[MACH_TRAP_TABLE_COUNT];
/* Run-time trap count; used as NSYSCALL by the mach_trap provider code below. */
extern const int mach_trap_count;
/* Trap names, indexed by trap number when probes are created. */
extern const char * const mach_syscall_name_table[MACH_TRAP_TABLE_COUNT];
616 | |
617 | |
/* XXX From osfmk/i386/bsd_i386.c */
/*
 * Argument block handed to a Mach trap handler.  This file treats it as a
 * flat array of syscall_arg_t; MACHTRACE_NARGS is derived from its size.
 */
struct mach_call_args {
	syscall_arg_t arg1;
	syscall_arg_t arg2;
	syscall_arg_t arg3;
	syscall_arg_t arg4;
	syscall_arg_t arg5;
	syscall_arg_t arg6;
	syscall_arg_t arg7;
	syscall_arg_t arg8;
	syscall_arg_t arg9;
};
630 | |
/* From here on NSYSCALL counts Mach traps rather than BSD system calls. */
#undef NSYSCALL
#define NSYSCALL mach_trap_count

/*
 * mach_trap_count is a run-time variable, which the preprocessor would
 * evaluate as 0 and so make the check below unable to fire.  Compare against
 * the compile-time table size instead: trap numbers index mach_trap_table[],
 * so they are bounded by MACH_TRAP_TABLE_COUNT.
 */
#if ((1 << SYSTRACE_SHIFT) <= MACH_TRAP_TABLE_COUNT)
#error 1 << SYSTRACE_SHIFT must exceed number of Mach traps
#endif
637 | |
/* Shadow-table entry for one Mach trap. */
typedef struct machtrace_sysent {
	dtrace_id_t     stsy_entry;     /* entry probe id, DTRACE_IDNONE when disabled */
	dtrace_id_t     stsy_return;    /* return probe id, DTRACE_IDNONE when disabled */
	kern_return_t   (*stsy_underlying)(void *); /* original trap handler */
	int32_t         stsy_return_type; /* never assigned in this file; zero from kmem_zalloc */
} machtrace_sysent_t;

/* Shadow table for mach_trap_table[]; allocated on the first machtrace_provide() call. */
static machtrace_sysent_t *machtrace_sysent = NULL;

/* Probe hook: set to dtrace_probe on attach, or systrace_stub if attach fails. */
void (*machtrace_probe)(dtrace_id_t, uint64_t, uint64_t,
    uint64_t, uint64_t, uint64_t);

static uint64_t machtrace_getarg(void *, dtrace_id_t, void *, int, int);

/* Provider handle filled in by dtrace_register() in machtrace_attach(). */
static dtrace_provider_id_t machtrace_id;
653 | |
654 | static kern_return_t |
655 | dtrace_machtrace_syscall(struct mach_call_args *args) |
656 | { |
657 | int code; /* The mach call number */ |
658 | |
659 | machtrace_sysent_t *sy; |
660 | dtrace_id_t id; |
661 | kern_return_t rval; |
662 | #if 0 /* XXX */ |
663 | proc_t *p; |
664 | #endif |
665 | syscall_arg_t *ip = (syscall_arg_t *)args; |
666 | mach_call_t mach_call; |
667 | |
668 | #if defined (__x86_64__) |
669 | { |
670 | pal_register_cache_state(current_thread(), VALID); |
671 | x86_saved_state_t *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread()); |
672 | |
673 | if (is_saved_state64(tagged_regs)) { |
674 | code = saved_state64(tagged_regs)->rax & SYSCALL_NUMBER_MASK; |
675 | } else { |
676 | code = -saved_state32(tagged_regs)->eax; |
677 | } |
678 | } |
679 | #elif defined(__arm64__) |
680 | { |
681 | /* From arm/thread_status.h:get_saved_state_svc_number */ |
682 | arm_saved_state_t *arm_regs = (arm_saved_state_t *) find_user_regs(thread: current_thread()); |
683 | if (is_saved_state32(iss: arm_regs)) { |
684 | code = (int)saved_state32(iss: arm_regs)->r[12]; |
685 | } else { |
686 | code = (int)saved_state64(iss: arm_regs)->x[ARM64_SYSCALL_CODE_REG_NUM]; |
687 | } |
688 | |
689 | /* From bsd/arm64.c:mach_syscall */ |
690 | ASSERT(code < 0); /* Otherwise it would be a Unix syscall */ |
691 | code = -code; |
692 | } |
693 | #else |
694 | #error Unknown Architecture |
695 | #endif |
696 | |
697 | sy = &machtrace_sysent[code]; |
698 | |
699 | if ((id = sy->stsy_entry) != DTRACE_IDNONE) { |
700 | uthread_t uthread = current_uthread(); |
701 | |
702 | if (uthread) { |
703 | uthread->t_dtrace_syscall_args = (void *)ip; |
704 | } |
705 | |
706 | (*machtrace_probe)(id, *ip, *(ip + 1), *(ip + 2), *(ip + 3), *(ip + 4)); |
707 | |
708 | if (uthread) { |
709 | uthread->t_dtrace_syscall_args = (void *)0; |
710 | } |
711 | } |
712 | |
713 | #if 0 /* XXX */ |
714 | /* |
715 | * APPLE NOTE: Not implemented. |
716 | * We want to explicitly allow DTrace consumers to stop a process |
717 | * before it actually executes the meat of the syscall. |
718 | */ |
719 | p = ttoproc(curthread); |
720 | mutex_enter(&p->p_lock); |
721 | if (curthread->t_dtrace_stop && !curthread->t_lwp->lwp_nostop) { |
722 | curthread->t_dtrace_stop = 0; |
723 | stop(PR_REQUESTED, 0); |
724 | } |
725 | mutex_exit(&p->p_lock); |
726 | #endif |
727 | |
728 | mach_call = (mach_call_t)(*sy->stsy_underlying); |
729 | rval = mach_call(args); |
730 | |
731 | if ((id = sy->stsy_return) != DTRACE_IDNONE) { |
732 | (*machtrace_probe)(id, (uint64_t)rval, 0, 0, 0, 0); |
733 | } |
734 | |
735 | return rval; |
736 | } |
737 | |
738 | static void |
739 | machtrace_init(const mach_trap_t *actual, machtrace_sysent_t **interposed) |
740 | { |
741 | machtrace_sysent_t *msysent = *interposed; |
742 | int i; |
743 | |
744 | if (msysent == NULL) { |
745 | *interposed = msysent = kmem_zalloc(sizeof(machtrace_sysent_t) * |
746 | NSYSCALL, KM_SLEEP); |
747 | } |
748 | |
749 | for (i = 0; i < NSYSCALL; i++) { |
750 | const volatile mach_trap_t *a = &actual[i]; |
751 | machtrace_sysent_t *s = &msysent[i]; |
752 | |
753 | if (LOADABLE_SYSCALL(a) && !LOADED_SYSCALL(a)) { |
754 | continue; |
755 | } |
756 | |
757 | if (a->mach_trap_function == (mach_call_t)(dtrace_machtrace_syscall)) { |
758 | continue; |
759 | } |
760 | |
761 | s->stsy_underlying = a->mach_trap_function; |
762 | } |
763 | } |
764 | |
765 | /*ARGSUSED*/ |
766 | static void |
767 | machtrace_provide(void *arg, const dtrace_probedesc_t *desc) |
768 | { |
769 | #pragma unused(arg) /* __APPLE__ */ |
770 | |
771 | int i; |
772 | |
773 | if (desc != NULL) { |
774 | return; |
775 | } |
776 | |
777 | machtrace_init(actual: mach_trap_table, interposed: &machtrace_sysent); |
778 | |
779 | for (i = 0; i < NSYSCALL; i++) { |
780 | if (machtrace_sysent[i].stsy_underlying == NULL) { |
781 | continue; |
782 | } |
783 | |
784 | if (dtrace_probe_lookup(machtrace_id, NULL, |
785 | mach_syscall_name_table[i], "entry" ) != 0) { |
786 | continue; |
787 | } |
788 | |
789 | (void) dtrace_probe_create(machtrace_id, NULL, mach_syscall_name_table[i], |
790 | "entry" , MACHTRACE_ARTIFICIAL_FRAMES, |
791 | (void *)((uintptr_t)SYSTRACE_ENTRY(i))); |
792 | (void) dtrace_probe_create(machtrace_id, NULL, mach_syscall_name_table[i], |
793 | "return" , MACHTRACE_ARTIFICIAL_FRAMES, |
794 | (void *)((uintptr_t)SYSTRACE_RETURN(i))); |
795 | |
796 | machtrace_sysent[i].stsy_entry = DTRACE_IDNONE; |
797 | machtrace_sysent[i].stsy_return = DTRACE_IDNONE; |
798 | } |
799 | } |
800 | |
801 | /*ARGSUSED*/ |
802 | static void |
803 | machtrace_destroy(void *arg, dtrace_id_t id, void *parg) |
804 | { |
805 | #pragma unused(arg,id) /* __APPLE__ */ |
806 | int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg); |
807 | |
808 | #pragma unused(sysnum) /* __APPLE__ */ |
809 | |
810 | /* |
811 | * There's nothing to do here but assert that we have actually been |
812 | * disabled. |
813 | */ |
814 | if (SYSTRACE_ISENTRY((uintptr_t)parg)) { |
815 | ASSERT(machtrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE); |
816 | } else { |
817 | ASSERT(machtrace_sysent[sysnum].stsy_return == DTRACE_IDNONE); |
818 | } |
819 | } |
820 | |
821 | /*ARGSUSED*/ |
822 | static int |
823 | machtrace_enable(void *arg, dtrace_id_t id, void *parg) |
824 | { |
825 | #pragma unused(arg) /* __APPLE__ */ |
826 | |
827 | int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg); |
828 | int enabled = (machtrace_sysent[sysnum].stsy_entry != DTRACE_IDNONE || |
829 | machtrace_sysent[sysnum].stsy_return != DTRACE_IDNONE); |
830 | |
831 | if (SYSTRACE_ISENTRY((uintptr_t)parg)) { |
832 | machtrace_sysent[sysnum].stsy_entry = id; |
833 | } else { |
834 | machtrace_sysent[sysnum].stsy_return = id; |
835 | } |
836 | |
837 | if (enabled) { |
838 | ASSERT(mach_trap_table[sysnum].mach_trap_function == (void *)dtrace_machtrace_syscall); |
839 | return 0; |
840 | } |
841 | |
842 | lck_mtx_lock(lck: &dtrace_systrace_lock); |
843 | |
844 | if (mach_trap_table[sysnum].mach_trap_function == machtrace_sysent[sysnum].stsy_underlying) { |
845 | /* It is not possible to write to mach_trap_table[] directly because it is const. */ |
846 | vm_offset_t dss = ptrauth_nop_cast(vm_offset_t, &dtrace_machtrace_syscall); |
847 | ml_nofault_copy(virtsrc: (vm_offset_t)&dss, virtdst: (vm_offset_t)&mach_trap_table[sysnum].mach_trap_function, size: sizeof(vm_offset_t)); |
848 | } |
849 | |
850 | lck_mtx_unlock(lck: &dtrace_systrace_lock); |
851 | |
852 | return 0; |
853 | } |
854 | |
855 | /*ARGSUSED*/ |
856 | static void |
857 | machtrace_disable(void *arg, dtrace_id_t id, void *parg) |
858 | { |
859 | #pragma unused(arg,id) /* __APPLE__ */ |
860 | |
861 | int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg); |
862 | int disable = (machtrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE || |
863 | machtrace_sysent[sysnum].stsy_return == DTRACE_IDNONE); |
864 | |
865 | if (disable) { |
866 | /* |
867 | * Usage of volatile protects the if statement below from being optimized away. |
868 | * |
869 | * Compilers are clever and know that const array values can't change in time |
870 | * and the if below is always false. That is because it can't see that DTrace |
871 | * injects dtrace_machtrace_syscall dynamically and violates constness of the |
872 | * array. |
873 | */ |
874 | volatile const mach_trap_t *machtrap = &mach_trap_table[sysnum]; |
875 | |
876 | lck_mtx_lock(lck: &dtrace_systrace_lock); |
877 | if (machtrap->mach_trap_function == (mach_call_t)dtrace_machtrace_syscall) { |
878 | ml_nofault_copy(virtsrc: (vm_offset_t)&machtrace_sysent[sysnum].stsy_underlying, |
879 | virtdst: (vm_offset_t)&machtrap->mach_trap_function, size: sizeof(vm_offset_t)); |
880 | } |
881 | lck_mtx_unlock(lck: &dtrace_systrace_lock); |
882 | } |
883 | |
884 | if (SYSTRACE_ISENTRY((uintptr_t)parg)) { |
885 | machtrace_sysent[sysnum].stsy_entry = DTRACE_IDNONE; |
886 | } else { |
887 | machtrace_sysent[sysnum].stsy_return = DTRACE_IDNONE; |
888 | } |
889 | } |
890 | |
/*
 * Stability attributes passed to dtrace_register() for the "mach_trap"
 * provider; the values match those of the "syscall" provider's table.
 * NOTE(review): the five rows presumably follow the dtrace_pattr_t field order
 * (provider, module, function, name, arguments) -- confirm against
 * <sys/dtrace.h>.
 */
static dtrace_pattr_t machtrace_attr = {
	{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
	{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
	{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
	{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
	{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
};
898 | |
/*
 * Provider operations passed to dtrace_register() for the "mach_trap"
 * provider.  Unlike the "syscall" provider it supplies no getargdesc
 * callback.
 */
static dtrace_pops_t machtrace_pops = {
	.dtps_provide =         machtrace_provide,
	.dtps_provide_module =  NULL,
	.dtps_enable =          machtrace_enable,
	.dtps_disable =         machtrace_disable,
	.dtps_suspend =         NULL,
	.dtps_resume =          NULL,
	.dtps_getargdesc =      NULL,
	.dtps_getargval =       machtrace_getarg,
	.dtps_usermode =        NULL,
	.dtps_destroy =         machtrace_destroy
};
911 | |
912 | static int |
913 | machtrace_attach(dev_info_t *devi) |
914 | { |
915 | machtrace_probe = dtrace_probe; |
916 | membar_enter(); |
917 | |
918 | if (ddi_create_minor_node(devi, "machtrace" , S_IFCHR, 0, |
919 | DDI_PSEUDO, 0) == DDI_FAILURE || |
920 | dtrace_register("mach_trap" , &machtrace_attr, DTRACE_PRIV_USER, NULL, |
921 | &machtrace_pops, NULL, &machtrace_id) != 0) { |
922 | machtrace_probe = (void*)&systrace_stub; |
923 | ddi_remove_minor_node(devi, NULL); |
924 | return DDI_FAILURE; |
925 | } |
926 | |
927 | return DDI_SUCCESS; |
928 | } |
929 | |
930 | d_open_t _systrace_open; |
931 | |
932 | int |
933 | _systrace_open(dev_t dev, int flags, int devtype, struct proc *p) |
934 | { |
935 | #pragma unused(dev,flags,devtype,p) |
936 | return 0; |
937 | } |
938 | |
/* Slot hint handed to cdevsw_add() by systrace_init(). */
#define SYSTRACE_MAJOR          -24 /* let the kernel pick the device number */

/*
 * Character-device switch for the systrace device.  Only open is
 * implemented; every other entry point is one of the eno_* handlers.
 */
static struct cdevsw systrace_cdevsw =
{
	.d_open = _systrace_open,
	.d_close = eno_opcl,
	.d_read = eno_rdwrt,
	.d_write = eno_rdwrt,
	.d_ioctl = eno_ioctl,
	.d_stop = eno_stop,
	.d_reset = eno_reset,
	.d_select = eno_select,
	.d_mmap = eno_mmap,
	.d_strategy = eno_strat,
	.d_reserved_1 = eno_getc,
	.d_reserved_2 = eno_putc,
};
956 | |
957 | void systrace_init( void ); |
958 | |
959 | void |
960 | systrace_init( void ) |
961 | { |
962 | if (dtrace_sdt_probes_restricted()) { |
963 | return; |
964 | } |
965 | |
966 | int majdevno = cdevsw_add(SYSTRACE_MAJOR, &systrace_cdevsw); |
967 | |
968 | if (majdevno < 0) { |
969 | printf("systrace_init: failed to allocate a major number!\n" ); |
970 | return; |
971 | } |
972 | |
973 | systrace_attach(devi: (dev_info_t*)(uintptr_t)majdevno); |
974 | machtrace_attach(devi: (dev_info_t*)(uintptr_t)majdevno); |
975 | } |
976 | #undef SYSTRACE_MAJOR |
977 | |
978 | static uint64_t |
979 | systrace_getargval(void *arg, dtrace_id_t id, void *parg, int argno, int aframes) |
980 | { |
981 | #pragma unused(arg,id,parg,aframes) /* __APPLE__ */ |
982 | uint64_t val = 0; |
983 | uint64_t *uargs = NULL; |
984 | |
985 | uthread_t uthread = current_uthread(); |
986 | |
987 | if (uthread) { |
988 | uargs = uthread->t_dtrace_syscall_args; |
989 | } |
990 | if (!uargs) { |
991 | return 0; |
992 | } |
993 | if (argno < 0 || argno >= SYSTRACE_NARGS) { |
994 | return 0; |
995 | } |
996 | |
997 | DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); |
998 | val = uargs[argno]; |
999 | DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); |
1000 | return val; |
1001 | } |
1002 | |
1003 | static void |
1004 | systrace_getargdesc(void *arg, dtrace_id_t id, void *parg, |
1005 | dtrace_argdesc_t *desc) |
1006 | { |
1007 | #pragma unused(arg, id) |
1008 | int sysnum = SYSTRACE_SYSNUM(parg); |
1009 | uthread_t uthread = current_uthread(); |
1010 | uint64_t *uargs = NULL; |
1011 | |
1012 | if (!uthread) { |
1013 | desc->dtargd_ndx = DTRACE_ARGNONE; |
1014 | return; |
1015 | } |
1016 | |
1017 | uargs = uthread->t_dtrace_syscall_args; |
1018 | |
1019 | if (SYSTRACE_ISENTRY((uintptr_t)parg)) { |
1020 | systrace_entry_setargdesc(sysnum, ndx: desc->dtargd_ndx, |
1021 | desc: desc->dtargd_native, descsz: sizeof(desc->dtargd_native)); |
1022 | } else { |
1023 | systrace_return_setargdesc(sysnum, ndx: desc->dtargd_ndx, |
1024 | desc: desc->dtargd_native, descsz: sizeof(desc->dtargd_native)); |
1025 | } |
1026 | |
1027 | if (desc->dtargd_native[0] == '\0') { |
1028 | desc->dtargd_ndx = DTRACE_ARGNONE; |
1029 | } |
1030 | } |
1031 | |
1032 | static uint64_t |
1033 | machtrace_getarg(void *arg, dtrace_id_t id, void *parg, int argno, int aframes) |
1034 | { |
1035 | #pragma unused(arg,id,parg,aframes) /* __APPLE__ */ |
1036 | uint64_t val = 0; |
1037 | syscall_arg_t *stack = (syscall_arg_t *)NULL; |
1038 | |
1039 | uthread_t uthread = current_uthread(); |
1040 | |
1041 | if (uthread) { |
1042 | stack = (syscall_arg_t *)uthread->t_dtrace_syscall_args; |
1043 | } |
1044 | |
1045 | if (!stack) { |
1046 | return 0; |
1047 | } |
1048 | |
1049 | if (argno < 0 || argno >= MACHTRACE_NARGS) { |
1050 | return 0; |
1051 | } |
1052 | |
1053 | DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); |
1054 | /* dtrace_probe arguments arg0 .. arg4 are 64bits wide */ |
1055 | val = (uint64_t)*(stack + argno); |
1056 | DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); |
1057 | return val; |
1058 | } |
1059 | |