1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#include <ptrauth.h>
27
28#include <kern/thread.h>
29#include <mach/thread_status.h>
30
/* XXX All of these should really be derived from syscall_sw.h */
#if defined(__x86_64__)
/* Bits 24..31 of the syscall word select the syscall class. */
#define SYSCALL_CLASS_SHIFT       24
#define SYSCALL_CLASS_MASK        (0xFF << SYSCALL_CLASS_SHIFT)
/* Everything outside the class bits, i.e. the syscall number itself. */
#define SYSCALL_NUMBER_MASK       (~SYSCALL_CLASS_MASK)
/* Mask applied to eax when the caller's saved state is 32-bit. */
#define I386_SYSCALL_NUMBER_MASK  (0xFFFF)
#endif /* __x86_64__ */
38
39#include <sys/param.h>
40#include <sys/systm.h>
41#include <sys/proc.h>
42#include <sys/errno.h>
43#include <sys/ioctl.h>
44#include <sys/conf.h>
45#include <sys/fcntl.h>
46#include <sys/syscall.h>
47#include <miscfs/devfs/devfs.h>
48
49#include <sys/dtrace.h>
50#include <sys/dtrace_impl.h>
51#include <sys/systrace_args.h>
52#include "systrace.h"
53#include <sys/stat.h>
54#include <sys/systm.h>
55#include <sys/conf.h>
56#include <sys/user.h>
57
58#include <machine/pal_routines.h>
59
/*
 * Artificial-frame counts passed to dtrace_probe_create() for the syscall
 * and Mach-trap probes. Both supported architectures use the same values.
 */
#if defined(__x86_64__) || defined(__arm64__)
#define SYSTRACE_ARTIFICIAL_FRAMES   2
#define MACHTRACE_ARTIFICIAL_FRAMES  3
#else
#error Unknown Architecture
#endif

/* Element count of the uthread uu_arg[] array, computed without an instance. */
#define SYSTRACE_NARGS (int)(sizeof(((uthread_t)NULL)->uu_arg) / sizeof(((uthread_t)NULL)->uu_arg[0]))
/* Number of syscall_arg_t slots in struct mach_call_args. */
#define MACHTRACE_NARGS (int)(sizeof(struct mach_call_args) / sizeof(syscall_arg_t))
72
73#include <sys/sysent.h>
74#define sy_callc sy_call /* Map Solaris slot name to Darwin's */
75#define NSYSCALL nsysent /* and is less than 500 or so */
76
77extern const char *syscallnames[];
78
79#include <sys/dtrace_glue.h>
80#define casptr dtrace_casptr
81#define membar_enter dtrace_membar_producer
82
83#define LOADABLE_SYSCALL(a) 0 /* Not pertinent to Darwin. */
84#define LOADED_SYSCALL(a) 1 /* Not pertinent to Darwin. */
85
86static LCK_MTX_DECLARE_ATTR(dtrace_systrace_lock,
87 &dtrace_lck_grp, &dtrace_lck_attr); /* probe state lock */
88
89systrace_sysent_t *systrace_sysent = NULL;
90void (*systrace_probe)(dtrace_id_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t);
91
92static uint64_t systrace_getargval(void *, dtrace_id_t, void *, int, int);
93static void systrace_getargdesc(void *, dtrace_id_t, void *, dtrace_argdesc_t *);
94
95void
96systrace_stub(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
97 uint64_t arg2, uint64_t arg3, uint64_t arg4)
98{
99#pragma unused(id,arg0,arg1,arg2,arg3,arg4)
100}
101
102int32_t
103dtrace_systrace_syscall(struct proc *pp, void *uap, int *rv)
104{
105 unsigned short code; /* The system call number */
106
107 systrace_sysent_t *sy;
108 dtrace_id_t id;
109 int32_t rval;
110 syscall_arg_t *ip = (syscall_arg_t *)uap;
111 uint64_t uargs[SYSTRACE_NARGS] = {0};
112
113#if defined (__x86_64__)
114 {
115 pal_register_cache_state(current_thread(), VALID);
116 x86_saved_state_t *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread());
117
118 if (is_saved_state64(tagged_regs)) {
119 x86_saved_state64_t *regs = saved_state64(tagged_regs);
120 code = regs->rax & SYSCALL_NUMBER_MASK;
121 /*
122 * Check for indirect system call... system call number
123 * passed as 'arg0'
124 */
125 if (code == 0) {
126 code = regs->rdi;
127 }
128 } else {
129 code = saved_state32(tagged_regs)->eax & I386_SYSCALL_NUMBER_MASK;
130
131 if (code == 0) {
132 vm_offset_t params = (vm_offset_t) (saved_state32(tagged_regs)->uesp + sizeof(int));
133 code = fuword(params);
134 }
135 }
136 }
137#elif defined(__arm64__)
138 {
139 /*
140 * On arm64, syscall numbers depend on a flavor (indirect or not)
141 * ... and for u32 can be in either r0 or r12
142 * ... and for u64 can be in either x0 or x16
143 */
144
145 /* see bsd/dev/arm/systemcalls.c:arm_get_syscall_number */
146 arm_saved_state_t *arm_regs = (arm_saved_state_t *) find_user_regs(thread: current_thread());
147
148 if (is_saved_state32(iss: arm_regs)) {
149 /* Check for indirect system call */
150 if (saved_state32(iss: arm_regs)->r[12] != 0) {
151 code = saved_state32(iss: arm_regs)->r[12];
152 } else {
153 code = saved_state32(iss: arm_regs)->r[0];
154 }
155 } else {
156 /* Check for indirect system call */
157 if (saved_state64(iss: arm_regs)->x[ARM64_SYSCALL_CODE_REG_NUM] != 0) {
158 code = saved_state64(iss: arm_regs)->x[ARM64_SYSCALL_CODE_REG_NUM];
159 } else {
160 code = saved_state64(iss: arm_regs)->x[0];
161 }
162 }
163 }
164#else
165#error Unknown Architecture
166#endif
167
168 // Bounds "check" the value of code a la unix_syscall
169 sy = (code >= nsysent) ? &systrace_sysent[SYS_invalid] : &systrace_sysent[code];
170
171 systrace_args(sysnum: code, params: ip, uarg: uargs);
172
173 if ((id = sy->stsy_entry) != DTRACE_IDNONE) {
174 uthread_t uthread = current_uthread();
175 if (uthread) {
176 uthread->t_dtrace_syscall_args = uargs;
177 }
178
179 static_assert(SYSTRACE_NARGS >= 5, "not enough system call arguments");
180 (*systrace_probe)(id, uargs[0], uargs[1], uargs[2], uargs[3], uargs[4]);
181
182 if (uthread) {
183 uthread->t_dtrace_syscall_args = NULL;
184 }
185 }
186
187
188
189#if 0 /* XXX */
190 /*
191 * APPLE NOTE: Not implemented.
192 * We want to explicitly allow DTrace consumers to stop a process
193 * before it actually executes the meat of the syscall.
194 */
195 p = ttoproc(curthread);
196 mutex_enter(&p->p_lock);
197 if (curthread->t_dtrace_stop && !curthread->t_lwp->lwp_nostop) {
198 curthread->t_dtrace_stop = 0;
199 stop(PR_REQUESTED, 0);
200 }
201 mutex_exit(&p->p_lock);
202#endif
203
204 rval = (*sy->stsy_underlying)(pp, uap, rv);
205
206 if ((id = sy->stsy_return) != DTRACE_IDNONE) {
207 uint64_t munged_rv0, munged_rv1;
208 uthread_t uthread = current_uthread();
209
210 if (uthread) {
211 uthread->t_dtrace_errno = rval; /* Establish t_dtrace_errno now in case this enabling refers to it. */
212 }
213 /*
214 * "Decode" rv for use in the call to dtrace_probe()
215 */
216 if (rval == ERESTART) {
217 munged_rv0 = -1LL; /* System call will be reissued in user mode. Make DTrace report a -1 return. */
218 munged_rv1 = -1LL;
219 } else if (rval != EJUSTRETURN) {
220 if (rval) {
221 munged_rv0 = -1LL; /* Mimic what libc will do. */
222 munged_rv1 = -1LL;
223 } else {
224 switch (sy->stsy_return_type) {
225 case _SYSCALL_RET_INT_T:
226 munged_rv0 = rv[0];
227 munged_rv1 = rv[1];
228 break;
229 case _SYSCALL_RET_UINT_T:
230 munged_rv0 = ((u_int)rv[0]);
231 munged_rv1 = ((u_int)rv[1]);
232 break;
233 case _SYSCALL_RET_OFF_T:
234 case _SYSCALL_RET_UINT64_T:
235 munged_rv0 = *(u_int64_t *)rv;
236 munged_rv1 = 0LL;
237 break;
238 case _SYSCALL_RET_ADDR_T:
239 case _SYSCALL_RET_SIZE_T:
240 case _SYSCALL_RET_SSIZE_T:
241 munged_rv0 = *(user_addr_t *)rv;
242 munged_rv1 = 0LL;
243 break;
244 case _SYSCALL_RET_NONE:
245 munged_rv0 = 0LL;
246 munged_rv1 = 0LL;
247 break;
248 default:
249 munged_rv0 = 0LL;
250 munged_rv1 = 0LL;
251 break;
252 }
253 }
254 } else {
255 munged_rv0 = 0LL;
256 munged_rv1 = 0LL;
257 }
258
259 /*
260 * <http://mail.opensolaris.org/pipermail/dtrace-discuss/2007-January/003276.html> says:
261 *
262 * "This is a bit of an historical artifact. At first, the syscall provider just
263 * had its return value in arg0, and the fbt and pid providers had their return
264 * values in arg1 (so that we could use arg0 for the offset of the return site).
265 *
266 * We inevitably started writing scripts where we wanted to see the return
267 * values from probes in all three providers, and we made this script easier
268 * to write by replicating the syscall return values in arg1 to match fbt and
269 * pid. We debated briefly about removing the return value from arg0, but
270 * decided that it would be less confusing to have the same data in two places
271 * than to have some non-helpful, non-intuitive value in arg0.
272 *
273 * This change was made 4/23/2003 according to the DTrace project's putback log."
274 */
275 (*systrace_probe)(id, munged_rv0, munged_rv0, munged_rv1, (uint64_t)rval, 0);
276 }
277
278 return rval;
279}
280
281void
282dtrace_systrace_syscall_return(unsigned short code, int rval, int *rv)
283{
284 systrace_sysent_t *sy;
285 dtrace_id_t id;
286
287 // Bounds "check" the value of code a la unix_syscall_return
288 sy = (code >= nsysent) ? &systrace_sysent[SYS_invalid] : &systrace_sysent[code];
289
290 if ((id = sy->stsy_return) != DTRACE_IDNONE) {
291 uint64_t munged_rv0, munged_rv1;
292 uthread_t uthread = current_uthread();
293
294 if (uthread) {
295 uthread->t_dtrace_errno = rval; /* Establish t_dtrace_errno now in case this enabling refers to it. */
296 }
297 /*
298 * "Decode" rv for use in the call to dtrace_probe()
299 */
300 if (rval == ERESTART) {
301 munged_rv0 = -1LL; /* System call will be reissued in user mode. Make DTrace report a -1 return. */
302 munged_rv1 = -1LL;
303 } else if (rval != EJUSTRETURN) {
304 if (rval) {
305 munged_rv0 = -1LL; /* Mimic what libc will do. */
306 munged_rv1 = -1LL;
307 } else {
308 switch (sy->stsy_return_type) {
309 case _SYSCALL_RET_INT_T:
310 munged_rv0 = rv[0];
311 munged_rv1 = rv[1];
312 break;
313 case _SYSCALL_RET_UINT_T:
314 munged_rv0 = ((u_int)rv[0]);
315 munged_rv1 = ((u_int)rv[1]);
316 break;
317 case _SYSCALL_RET_OFF_T:
318 case _SYSCALL_RET_UINT64_T:
319 munged_rv0 = *(u_int64_t *)rv;
320 munged_rv1 = 0LL;
321 break;
322 case _SYSCALL_RET_ADDR_T:
323 case _SYSCALL_RET_SIZE_T:
324 case _SYSCALL_RET_SSIZE_T:
325 munged_rv0 = *(user_addr_t *)rv;
326 munged_rv1 = 0LL;
327 break;
328 case _SYSCALL_RET_NONE:
329 munged_rv0 = 0LL;
330 munged_rv1 = 0LL;
331 break;
332 default:
333 munged_rv0 = 0LL;
334 munged_rv1 = 0LL;
335 break;
336 }
337 }
338 } else {
339 munged_rv0 = 0LL;
340 munged_rv1 = 0LL;
341 }
342
343 (*systrace_probe)(id, munged_rv0, munged_rv0, munged_rv1, (uint64_t)rval, 0);
344 }
345}
346
/*
 * Layout of the per-probe cookie (the parg given to dtrace_probe_create()):
 * bit SYSTRACE_SHIFT is set for an "entry" probe and clear for a "return"
 * probe; the low SYSTRACE_SHIFT bits hold the system call number.
 */
#define SYSTRACE_SHIFT          16
#define SYSTRACE_ISENTRY(x)     ((int)(x) >> SYSTRACE_SHIFT)
#define SYSTRACE_SYSNUM(x)      ((int)(x) & ((1 << SYSTRACE_SHIFT) - 1))
#define SYSTRACE_ENTRY(id)      ((1 << SYSTRACE_SHIFT) | (id))
#define SYSTRACE_RETURN(id)     (id)

/*
 * NOTE(review): NSYSCALL expands to an identifier rather than a literal;
 * inside #if such identifiers evaluate to 0, so this check may never be
 * able to fire -- confirm.
 */
#if ((1 << SYSTRACE_SHIFT) <= NSYSCALL)
#error 1 << SYSTRACE_SHIFT must exceed number of system calls
#endif
356
357static dtrace_provider_id_t systrace_id;
358
359/*
360 * APPLE NOTE: Avoid name clash with Darwin automagic conf symbol.
361 * See balanced undef below.
362 */
363#define systrace_init _systrace_init
364
365static void
366systrace_init(const struct sysent *actual, systrace_sysent_t **interposed)
367{
368 systrace_sysent_t *ssysent = *interposed; /* Avoid sysent shadow warning
369 * from bsd/sys/sysent.h */
370 unsigned int i;
371
372 if (ssysent == NULL) {
373 *interposed = ssysent = kmem_zalloc(sizeof(systrace_sysent_t) *
374 NSYSCALL, KM_SLEEP);
375 }
376
377 for (i = 0; i < NSYSCALL; i++) {
378 /* Use of volatile protects the if statement below from being optimized away */
379 const volatile struct sysent *a = &actual[i];
380 systrace_sysent_t *s = &ssysent[i];
381
382 if (LOADABLE_SYSCALL(a) && !LOADED_SYSCALL(a)) {
383 continue;
384 }
385
386 if (a->sy_callc == dtrace_systrace_syscall) {
387 continue;
388 }
389
390 s->stsy_underlying = a->sy_callc;
391 s->stsy_return_type = a->sy_return_type;
392 }
393}
394
395
396/*ARGSUSED*/
397static void
398systrace_provide(void *arg, const dtrace_probedesc_t *desc)
399{
400#pragma unused(arg) /* __APPLE__ */
401 unsigned int i;
402
403 if (desc != NULL) {
404 return;
405 }
406
407 systrace_init(actual: sysent, interposed: &systrace_sysent);
408
409 for (i = 0; i < NSYSCALL; i++) {
410 if (systrace_sysent[i].stsy_underlying == NULL) {
411 continue;
412 }
413
414 if (dtrace_probe_lookup(systrace_id, NULL,
415 syscallnames[i], "entry") != 0) {
416 continue;
417 }
418
419 (void) dtrace_probe_create(systrace_id, NULL, syscallnames[i],
420 "entry", SYSTRACE_ARTIFICIAL_FRAMES,
421 (void *)((uintptr_t)SYSTRACE_ENTRY(i)));
422 (void) dtrace_probe_create(systrace_id, NULL, syscallnames[i],
423 "return", SYSTRACE_ARTIFICIAL_FRAMES,
424 (void *)((uintptr_t)SYSTRACE_RETURN(i)));
425
426 systrace_sysent[i].stsy_entry = DTRACE_IDNONE;
427 systrace_sysent[i].stsy_return = DTRACE_IDNONE;
428 }
429}
430#undef systrace_init
431
432/*ARGSUSED*/
433static void
434systrace_destroy(void *arg, dtrace_id_t id, void *parg)
435{
436#pragma unused(arg,id) /* __APPLE__ */
437
438 int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
439
440#pragma unused(sysnum) /* __APPLE__ */
441 /*
442 * There's nothing to do here but assert that we have actually been
443 * disabled.
444 */
445 if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
446 ASSERT(systrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE);
447 } else {
448 ASSERT(systrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
449 }
450}
451
452/*ARGSUSED*/
453static int
454systrace_enable(void *arg, dtrace_id_t id, void *parg)
455{
456#pragma unused(arg) /* __APPLE__ */
457
458 int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
459 int enabled = (systrace_sysent[sysnum].stsy_entry != DTRACE_IDNONE ||
460 systrace_sysent[sysnum].stsy_return != DTRACE_IDNONE);
461
462 if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
463 systrace_sysent[sysnum].stsy_entry = id;
464 } else {
465 systrace_sysent[sysnum].stsy_return = id;
466 }
467
468 if (enabled) {
469 ASSERT(sysent[sysnum].sy_callc == dtrace_systrace_syscall);
470 return 0;
471 }
472
473 lck_mtx_lock(lck: &dtrace_systrace_lock);
474 if (sysent[sysnum].sy_callc == systrace_sysent[sysnum].stsy_underlying) {
475 /* It is not possible to write to sysent[] directly because it is const. */
476 vm_offset_t dss = ptrauth_nop_cast(vm_offset_t, &dtrace_systrace_syscall);
477 ml_nofault_copy(virtsrc: (vm_offset_t)&dss, virtdst: (vm_offset_t)&sysent[sysnum].sy_callc, size: sizeof(vm_offset_t));
478 }
479 lck_mtx_unlock(lck: &dtrace_systrace_lock);
480
481 return 0;
482}
483
484/*ARGSUSED*/
485static void
486systrace_disable(void *arg, dtrace_id_t id, void *parg)
487{
488#pragma unused(arg,id) /* __APPLE__ */
489
490 int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
491 int disable = (systrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE ||
492 systrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
493
494 if (disable) {
495 /*
496 * Usage of volatile protects the if statement below from being optimized away.
497 *
498 * Compilers are clever and know that const array values can't change in time
499 * and the if below is always false. That is because it can't see that DTrace
500 * injects dtrace_systrace_syscall dynamically and violates constness of the
501 * array.
502 */
503 volatile const struct sysent *syscallent = &sysent[sysnum];
504
505 lck_mtx_lock(lck: &dtrace_systrace_lock);
506 if (syscallent->sy_callc == dtrace_systrace_syscall) {
507 ml_nofault_copy(virtsrc: (vm_offset_t)&systrace_sysent[sysnum].stsy_underlying,
508 virtdst: (vm_offset_t)&syscallent->sy_callc, size: sizeof(vm_offset_t));
509 }
510 lck_mtx_unlock(lck: &dtrace_systrace_lock);
511 }
512
513 if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
514 systrace_sysent[sysnum].stsy_entry = DTRACE_IDNONE;
515 } else {
516 systrace_sysent[sysnum].stsy_return = DTRACE_IDNONE;
517 }
518}
519
520static dtrace_pattr_t systrace_attr = {
521 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
522 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
523 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
524 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
525 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
526};
527
528static dtrace_pops_t systrace_pops = {
529 .dtps_provide = systrace_provide,
530 .dtps_provide_module = NULL,
531 .dtps_enable = systrace_enable,
532 .dtps_disable = systrace_disable,
533 .dtps_suspend = NULL,
534 .dtps_resume = NULL,
535 .dtps_getargdesc = systrace_getargdesc,
536 .dtps_getargval = systrace_getargval,
537 .dtps_usermode = NULL,
538 .dtps_destroy = systrace_destroy
539};
540
541static int
542systrace_attach(dev_info_t *devi)
543{
544 systrace_probe = (void*)&dtrace_probe;
545 membar_enter();
546
547 if (ddi_create_minor_node(devi, "systrace", S_IFCHR, 0,
548 DDI_PSEUDO, 0) == DDI_FAILURE ||
549 dtrace_register("syscall", &systrace_attr, DTRACE_PRIV_USER, NULL,
550 &systrace_pops, NULL, &systrace_id) != 0) {
551 systrace_probe = systrace_stub;
552 ddi_remove_minor_node(devi, NULL);
553 return DDI_FAILURE;
554 }
555
556 return DDI_SUCCESS;
557}
558
559
560/*
561 * APPLE NOTE: systrace_detach not implemented
562 */
563#if !defined(__APPLE__)
564static int
565systrace_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
566{
567 switch (cmd) {
568 case DDI_DETACH:
569 break;
570 case DDI_SUSPEND:
571 return DDI_SUCCESS;
572 default:
573 return DDI_FAILURE;
574 }
575
576 if (dtrace_unregister(systrace_id) != 0) {
577 return DDI_FAILURE;
578 }
579
580 ddi_remove_minor_node(devi, NULL);
581 systrace_probe = systrace_stub;
582 return DDI_SUCCESS;
583}
584#endif /* __APPLE__ */
585
586
587typedef kern_return_t (*mach_call_t)(void *);
588
589/* APPLE NOTE: From #include <kern/syscall_sw.h> which may be changed for 64 bit! */
590#if CONFIG_REQUIRES_U32_MUNGING
591typedef void mach_munge_t(void *);
592#elif __arm__ && (__BIGGEST_ALIGNMENT__ > 4)
593typedef int mach_munge_t(const void *, void *);
594#endif
595
596typedef struct {
597 unsigned char mach_trap_arg_count; /* Number of trap arguments (Arch independant) */
598 unsigned char mach_trap_u32_words; /* number of 32-bit words to copyin for U32 */
599 unsigned char mach_trap_returns_port;
600 unsigned char __mach_trap_padding;
601 kern_return_t (*mach_trap_function)(void *);
602#if CONFIG_REQUIRES_U32_MUNGING || (__arm__ && (__BIGGEST_ALIGNMENT__ > 4))
603 mach_munge_t *mach_trap_arg_munge32; /* system call argument munger routine for 32-bit */
604#endif
605#if MACH_ASSERT
606 const char *mach_trap_name;
607#endif /* MACH_ASSERT */
608} mach_trap_t;
609
610
611#define MACH_TRAP_TABLE_COUNT 128
612
613extern const mach_trap_t mach_trap_table[MACH_TRAP_TABLE_COUNT];
614extern const int mach_trap_count;
615extern const char * const mach_syscall_name_table[MACH_TRAP_TABLE_COUNT];
616
617
618/* XXX From osfmk/i386/bsd_i386.c */
619struct mach_call_args {
620 syscall_arg_t arg1;
621 syscall_arg_t arg2;
622 syscall_arg_t arg3;
623 syscall_arg_t arg4;
624 syscall_arg_t arg5;
625 syscall_arg_t arg6;
626 syscall_arg_t arg7;
627 syscall_arg_t arg8;
628 syscall_arg_t arg9;
629};
630
631#undef NSYSCALL
632#define NSYSCALL mach_trap_count
633
634#if ((1 << SYSTRACE_SHIFT) <= NSYSCALL)
635#error 1 << SYSTRACE_SHIFT must exceed number of Mach traps
636#endif
637
638typedef struct machtrace_sysent {
639 dtrace_id_t stsy_entry;
640 dtrace_id_t stsy_return;
641 kern_return_t (*stsy_underlying)(void *);
642 int32_t stsy_return_type;
643} machtrace_sysent_t;
644
645static machtrace_sysent_t *machtrace_sysent = NULL;
646
647void (*machtrace_probe)(dtrace_id_t, uint64_t, uint64_t,
648 uint64_t, uint64_t, uint64_t);
649
650static uint64_t machtrace_getarg(void *, dtrace_id_t, void *, int, int);
651
652static dtrace_provider_id_t machtrace_id;
653
654static kern_return_t
655dtrace_machtrace_syscall(struct mach_call_args *args)
656{
657 int code; /* The mach call number */
658
659 machtrace_sysent_t *sy;
660 dtrace_id_t id;
661 kern_return_t rval;
662#if 0 /* XXX */
663 proc_t *p;
664#endif
665 syscall_arg_t *ip = (syscall_arg_t *)args;
666 mach_call_t mach_call;
667
668#if defined (__x86_64__)
669 {
670 pal_register_cache_state(current_thread(), VALID);
671 x86_saved_state_t *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread());
672
673 if (is_saved_state64(tagged_regs)) {
674 code = saved_state64(tagged_regs)->rax & SYSCALL_NUMBER_MASK;
675 } else {
676 code = -saved_state32(tagged_regs)->eax;
677 }
678 }
679#elif defined(__arm64__)
680 {
681 /* From arm/thread_status.h:get_saved_state_svc_number */
682 arm_saved_state_t *arm_regs = (arm_saved_state_t *) find_user_regs(thread: current_thread());
683 if (is_saved_state32(iss: arm_regs)) {
684 code = (int)saved_state32(iss: arm_regs)->r[12];
685 } else {
686 code = (int)saved_state64(iss: arm_regs)->x[ARM64_SYSCALL_CODE_REG_NUM];
687 }
688
689 /* From bsd/arm64.c:mach_syscall */
690 ASSERT(code < 0); /* Otherwise it would be a Unix syscall */
691 code = -code;
692 }
693#else
694#error Unknown Architecture
695#endif
696
697 sy = &machtrace_sysent[code];
698
699 if ((id = sy->stsy_entry) != DTRACE_IDNONE) {
700 uthread_t uthread = current_uthread();
701
702 if (uthread) {
703 uthread->t_dtrace_syscall_args = (void *)ip;
704 }
705
706 (*machtrace_probe)(id, *ip, *(ip + 1), *(ip + 2), *(ip + 3), *(ip + 4));
707
708 if (uthread) {
709 uthread->t_dtrace_syscall_args = (void *)0;
710 }
711 }
712
713#if 0 /* XXX */
714 /*
715 * APPLE NOTE: Not implemented.
716 * We want to explicitly allow DTrace consumers to stop a process
717 * before it actually executes the meat of the syscall.
718 */
719 p = ttoproc(curthread);
720 mutex_enter(&p->p_lock);
721 if (curthread->t_dtrace_stop && !curthread->t_lwp->lwp_nostop) {
722 curthread->t_dtrace_stop = 0;
723 stop(PR_REQUESTED, 0);
724 }
725 mutex_exit(&p->p_lock);
726#endif
727
728 mach_call = (mach_call_t)(*sy->stsy_underlying);
729 rval = mach_call(args);
730
731 if ((id = sy->stsy_return) != DTRACE_IDNONE) {
732 (*machtrace_probe)(id, (uint64_t)rval, 0, 0, 0, 0);
733 }
734
735 return rval;
736}
737
738static void
739machtrace_init(const mach_trap_t *actual, machtrace_sysent_t **interposed)
740{
741 machtrace_sysent_t *msysent = *interposed;
742 int i;
743
744 if (msysent == NULL) {
745 *interposed = msysent = kmem_zalloc(sizeof(machtrace_sysent_t) *
746 NSYSCALL, KM_SLEEP);
747 }
748
749 for (i = 0; i < NSYSCALL; i++) {
750 const volatile mach_trap_t *a = &actual[i];
751 machtrace_sysent_t *s = &msysent[i];
752
753 if (LOADABLE_SYSCALL(a) && !LOADED_SYSCALL(a)) {
754 continue;
755 }
756
757 if (a->mach_trap_function == (mach_call_t)(dtrace_machtrace_syscall)) {
758 continue;
759 }
760
761 s->stsy_underlying = a->mach_trap_function;
762 }
763}
764
765/*ARGSUSED*/
766static void
767machtrace_provide(void *arg, const dtrace_probedesc_t *desc)
768{
769#pragma unused(arg) /* __APPLE__ */
770
771 int i;
772
773 if (desc != NULL) {
774 return;
775 }
776
777 machtrace_init(actual: mach_trap_table, interposed: &machtrace_sysent);
778
779 for (i = 0; i < NSYSCALL; i++) {
780 if (machtrace_sysent[i].stsy_underlying == NULL) {
781 continue;
782 }
783
784 if (dtrace_probe_lookup(machtrace_id, NULL,
785 mach_syscall_name_table[i], "entry") != 0) {
786 continue;
787 }
788
789 (void) dtrace_probe_create(machtrace_id, NULL, mach_syscall_name_table[i],
790 "entry", MACHTRACE_ARTIFICIAL_FRAMES,
791 (void *)((uintptr_t)SYSTRACE_ENTRY(i)));
792 (void) dtrace_probe_create(machtrace_id, NULL, mach_syscall_name_table[i],
793 "return", MACHTRACE_ARTIFICIAL_FRAMES,
794 (void *)((uintptr_t)SYSTRACE_RETURN(i)));
795
796 machtrace_sysent[i].stsy_entry = DTRACE_IDNONE;
797 machtrace_sysent[i].stsy_return = DTRACE_IDNONE;
798 }
799}
800
801/*ARGSUSED*/
802static void
803machtrace_destroy(void *arg, dtrace_id_t id, void *parg)
804{
805#pragma unused(arg,id) /* __APPLE__ */
806 int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
807
808#pragma unused(sysnum) /* __APPLE__ */
809
810 /*
811 * There's nothing to do here but assert that we have actually been
812 * disabled.
813 */
814 if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
815 ASSERT(machtrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE);
816 } else {
817 ASSERT(machtrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
818 }
819}
820
821/*ARGSUSED*/
822static int
823machtrace_enable(void *arg, dtrace_id_t id, void *parg)
824{
825#pragma unused(arg) /* __APPLE__ */
826
827 int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
828 int enabled = (machtrace_sysent[sysnum].stsy_entry != DTRACE_IDNONE ||
829 machtrace_sysent[sysnum].stsy_return != DTRACE_IDNONE);
830
831 if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
832 machtrace_sysent[sysnum].stsy_entry = id;
833 } else {
834 machtrace_sysent[sysnum].stsy_return = id;
835 }
836
837 if (enabled) {
838 ASSERT(mach_trap_table[sysnum].mach_trap_function == (void *)dtrace_machtrace_syscall);
839 return 0;
840 }
841
842 lck_mtx_lock(lck: &dtrace_systrace_lock);
843
844 if (mach_trap_table[sysnum].mach_trap_function == machtrace_sysent[sysnum].stsy_underlying) {
845 /* It is not possible to write to mach_trap_table[] directly because it is const. */
846 vm_offset_t dss = ptrauth_nop_cast(vm_offset_t, &dtrace_machtrace_syscall);
847 ml_nofault_copy(virtsrc: (vm_offset_t)&dss, virtdst: (vm_offset_t)&mach_trap_table[sysnum].mach_trap_function, size: sizeof(vm_offset_t));
848 }
849
850 lck_mtx_unlock(lck: &dtrace_systrace_lock);
851
852 return 0;
853}
854
855/*ARGSUSED*/
856static void
857machtrace_disable(void *arg, dtrace_id_t id, void *parg)
858{
859#pragma unused(arg,id) /* __APPLE__ */
860
861 int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
862 int disable = (machtrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE ||
863 machtrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
864
865 if (disable) {
866 /*
867 * Usage of volatile protects the if statement below from being optimized away.
868 *
869 * Compilers are clever and know that const array values can't change in time
870 * and the if below is always false. That is because it can't see that DTrace
871 * injects dtrace_machtrace_syscall dynamically and violates constness of the
872 * array.
873 */
874 volatile const mach_trap_t *machtrap = &mach_trap_table[sysnum];
875
876 lck_mtx_lock(lck: &dtrace_systrace_lock);
877 if (machtrap->mach_trap_function == (mach_call_t)dtrace_machtrace_syscall) {
878 ml_nofault_copy(virtsrc: (vm_offset_t)&machtrace_sysent[sysnum].stsy_underlying,
879 virtdst: (vm_offset_t)&machtrap->mach_trap_function, size: sizeof(vm_offset_t));
880 }
881 lck_mtx_unlock(lck: &dtrace_systrace_lock);
882 }
883
884 if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
885 machtrace_sysent[sysnum].stsy_entry = DTRACE_IDNONE;
886 } else {
887 machtrace_sysent[sysnum].stsy_return = DTRACE_IDNONE;
888 }
889}
890
891static dtrace_pattr_t machtrace_attr = {
892 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
893 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
894 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
895 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
896 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
897};
898
899static dtrace_pops_t machtrace_pops = {
900 .dtps_provide = machtrace_provide,
901 .dtps_provide_module = NULL,
902 .dtps_enable = machtrace_enable,
903 .dtps_disable = machtrace_disable,
904 .dtps_suspend = NULL,
905 .dtps_resume = NULL,
906 .dtps_getargdesc = NULL,
907 .dtps_getargval = machtrace_getarg,
908 .dtps_usermode = NULL,
909 .dtps_destroy = machtrace_destroy
910};
911
912static int
913machtrace_attach(dev_info_t *devi)
914{
915 machtrace_probe = dtrace_probe;
916 membar_enter();
917
918 if (ddi_create_minor_node(devi, "machtrace", S_IFCHR, 0,
919 DDI_PSEUDO, 0) == DDI_FAILURE ||
920 dtrace_register("mach_trap", &machtrace_attr, DTRACE_PRIV_USER, NULL,
921 &machtrace_pops, NULL, &machtrace_id) != 0) {
922 machtrace_probe = (void*)&systrace_stub;
923 ddi_remove_minor_node(devi, NULL);
924 return DDI_FAILURE;
925 }
926
927 return DDI_SUCCESS;
928}
929
930d_open_t _systrace_open;
931
932int
933_systrace_open(dev_t dev, int flags, int devtype, struct proc *p)
934{
935#pragma unused(dev,flags,devtype,p)
936 return 0;
937}
938
939#define SYSTRACE_MAJOR -24 /* let the kernel pick the device number */
940
941static struct cdevsw systrace_cdevsw =
942{
943 .d_open = _systrace_open,
944 .d_close = eno_opcl,
945 .d_read = eno_rdwrt,
946 .d_write = eno_rdwrt,
947 .d_ioctl = eno_ioctl,
948 .d_stop = eno_stop,
949 .d_reset = eno_reset,
950 .d_select = eno_select,
951 .d_mmap = eno_mmap,
952 .d_strategy = eno_strat,
953 .d_reserved_1 = eno_getc,
954 .d_reserved_2 = eno_putc,
955};
956
957void systrace_init( void );
958
959void
960systrace_init( void )
961{
962 if (dtrace_sdt_probes_restricted()) {
963 return;
964 }
965
966 int majdevno = cdevsw_add(SYSTRACE_MAJOR, &systrace_cdevsw);
967
968 if (majdevno < 0) {
969 printf("systrace_init: failed to allocate a major number!\n");
970 return;
971 }
972
973 systrace_attach(devi: (dev_info_t*)(uintptr_t)majdevno);
974 machtrace_attach(devi: (dev_info_t*)(uintptr_t)majdevno);
975}
976#undef SYSTRACE_MAJOR
977
978static uint64_t
979systrace_getargval(void *arg, dtrace_id_t id, void *parg, int argno, int aframes)
980{
981#pragma unused(arg,id,parg,aframes) /* __APPLE__ */
982 uint64_t val = 0;
983 uint64_t *uargs = NULL;
984
985 uthread_t uthread = current_uthread();
986
987 if (uthread) {
988 uargs = uthread->t_dtrace_syscall_args;
989 }
990 if (!uargs) {
991 return 0;
992 }
993 if (argno < 0 || argno >= SYSTRACE_NARGS) {
994 return 0;
995 }
996
997 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
998 val = uargs[argno];
999 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
1000 return val;
1001}
1002
1003static void
1004systrace_getargdesc(void *arg, dtrace_id_t id, void *parg,
1005 dtrace_argdesc_t *desc)
1006{
1007#pragma unused(arg, id)
1008 int sysnum = SYSTRACE_SYSNUM(parg);
1009 uthread_t uthread = current_uthread();
1010 uint64_t *uargs = NULL;
1011
1012 if (!uthread) {
1013 desc->dtargd_ndx = DTRACE_ARGNONE;
1014 return;
1015 }
1016
1017 uargs = uthread->t_dtrace_syscall_args;
1018
1019 if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
1020 systrace_entry_setargdesc(sysnum, ndx: desc->dtargd_ndx,
1021 desc: desc->dtargd_native, descsz: sizeof(desc->dtargd_native));
1022 } else {
1023 systrace_return_setargdesc(sysnum, ndx: desc->dtargd_ndx,
1024 desc: desc->dtargd_native, descsz: sizeof(desc->dtargd_native));
1025 }
1026
1027 if (desc->dtargd_native[0] == '\0') {
1028 desc->dtargd_ndx = DTRACE_ARGNONE;
1029 }
1030}
1031
1032static uint64_t
1033machtrace_getarg(void *arg, dtrace_id_t id, void *parg, int argno, int aframes)
1034{
1035#pragma unused(arg,id,parg,aframes) /* __APPLE__ */
1036 uint64_t val = 0;
1037 syscall_arg_t *stack = (syscall_arg_t *)NULL;
1038
1039 uthread_t uthread = current_uthread();
1040
1041 if (uthread) {
1042 stack = (syscall_arg_t *)uthread->t_dtrace_syscall_args;
1043 }
1044
1045 if (!stack) {
1046 return 0;
1047 }
1048
1049 if (argno < 0 || argno >= MACHTRACE_NARGS) {
1050 return 0;
1051 }
1052
1053 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
1054 /* dtrace_probe arguments arg0 .. arg4 are 64bits wide */
1055 val = (uint64_t)*(stack + argno);
1056 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
1057 return val;
1058}
1059