/*
 * Copyright (c) 2005-2021 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <kern/thread.h>

#include <sys/time.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/dtrace.h>
#include <sys/dtrace_impl.h>
#include <machine/atomic.h>
#include <libkern/OSKextLibPrivate.h>
#include <kern/kern_types.h>
#include <kern/timer_call.h>
#include <kern/thread_call.h>
#include <kern/task.h>
#include <kern/sched_prim.h>
#include <miscfs/devfs/devfs.h>
#include <kern/kalloc.h>

#include <mach/vm_param.h>
#include <mach/mach_vm.h>
#include <mach/task.h>
#include <vm/vm_map.h> /* All the bits we care about are guarded by MACH_KERNEL_PRIVATE :-( */

/*
 * pid/proc
 */
/* Solaris proc_t is the struct. Darwin's proc_t is a pointer to it. */
#define proc_t struct proc /* Steer clear of the Darwin typedef for proc_t */

KALLOC_HEAP_DEFINE(KHEAP_DTRACE, "dtrace", KHEAP_ID_KT_VAR);

void
dtrace_sprlock(proc_t *p)
{
	lck_mtx_lock(&p->p_dtrace_sprlock);
}

void
dtrace_sprunlock(proc_t *p)
{
	lck_mtx_unlock(&p->p_dtrace_sprlock);
}

/* Not called from probe context */
proc_t *
sprlock(pid_t pid)
{
	proc_t* p;

	if ((p = proc_find(pid)) == PROC_NULL) {
		return PROC_NULL;
	}

	task_suspend_internal(proc_task(p));

	dtrace_sprlock(p);

	return p;
}

/* Not called from probe context */
void
sprunlock(proc_t *p)
{
	if (p != PROC_NULL) {
		dtrace_sprunlock(p);

		task_resume_internal(proc_task(p));

		proc_rele(p);
	}
}

/*
 * uread/uwrite
 */

// These are not exported from vm_map.h.
extern kern_return_t vm_map_read_user(vm_map_t map, vm_map_address_t src_addr, void *dst_p, vm_size_t size);
extern kern_return_t vm_map_write_user(vm_map_t map, void *src_p, vm_map_address_t dst_addr, vm_size_t size);

/* Not called from probe context */
int
uread(proc_t *p, void *buf, user_size_t len, user_addr_t a)
{
	kern_return_t ret;

	ASSERT(p != PROC_NULL);
	ASSERT(proc_task(p) != NULL);

	task_t task = proc_task(p);

	/*
	 * Grab a reference to the task vm_map_t to make sure
	 * the map isn't pulled out from under us.
	 *
	 * Because the proc_lock is not held at all times on all code
	 * paths leading here, it is possible for the proc to have
	 * exited. If the map is null, fail.
	 */
	vm_map_t map = get_task_map_reference(task);
	if (map) {
		ret = vm_map_read_user(map, (vm_map_address_t)a, buf, (vm_size_t)len);
		vm_map_deallocate(map);
	} else {
		ret = KERN_TERMINATED;
	}

	return (int)ret;
}


/* Not called from probe context */
int
uwrite(proc_t *p, void *buf, user_size_t len, user_addr_t a)
{
	kern_return_t ret;

	ASSERT(p != NULL);
	ASSERT(proc_task(p) != NULL);

	task_t task = proc_task(p);

	/*
	 * Grab a reference to the task vm_map_t to make sure
	 * the map isn't pulled out from under us.
	 *
	 * Because the proc_lock is not held at all times on all code
	 * paths leading here, it is possible for the proc to have
	 * exited. If the map is null, fail.
	 */
	vm_map_t map = get_task_map_reference(task);
	if (map) {
		/* Find the memory permissions. */
		uint32_t nestingDepth = 999999;
		vm_region_submap_short_info_data_64_t info;
		mach_msg_type_number_t count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
		mach_vm_address_t address = (mach_vm_address_t)a;
		mach_vm_size_t sizeOfRegion = (mach_vm_size_t)len;

		ret = mach_vm_region_recurse(map, &address, &sizeOfRegion, &nestingDepth, (vm_region_recurse_info_t)&info, &count);
		if (ret != KERN_SUCCESS) {
			goto done;
		}

		vm_prot_t reprotect;

		if (!(info.protection & VM_PROT_WRITE)) {
			/* Save the original protection values for restoration later */
			reprotect = info.protection;

			if (info.max_protection & VM_PROT_WRITE) {
				/* The memory is not currently writable, but can be made writable. */
				ret = mach_vm_protect(map, (mach_vm_offset_t)a, (mach_vm_size_t)len, 0, (reprotect & ~VM_PROT_EXECUTE) | VM_PROT_WRITE);
			} else {
				/*
				 * The memory is not currently writable, and cannot be made writable. We need to COW this memory.
				 *
				 * Strangely, "reprotect | VM_PROT_COPY" alone fails; request the full set instead.
				 */
				ret = mach_vm_protect(map, (mach_vm_offset_t)a, (mach_vm_size_t)len, 0, VM_PROT_COPY | VM_PROT_READ | VM_PROT_WRITE);
			}

			if (ret != KERN_SUCCESS) {
				goto done;
			}
		} else {
			/* The memory was already writable. */
			reprotect = VM_PROT_NONE;
		}

		ret = vm_map_write_user(map,
		    buf,
		    (vm_map_address_t)a,
		    (vm_size_t)len);

		dtrace_flush_caches();

		if (ret != KERN_SUCCESS) {
			goto done;
		}

		if (reprotect != VM_PROT_NONE) {
			ASSERT(reprotect & VM_PROT_EXECUTE);
			ret = mach_vm_protect(map, (mach_vm_offset_t)a, (mach_vm_size_t)len, 0, reprotect);
		}

done:
		vm_map_deallocate(map);
	} else {
		ret = KERN_TERMINATED;
	}

	return (int)ret;
}
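
/*
 * Example (illustrative sketch): the fasttrap-style pattern for patching user
 * text composes the helpers above. The helper name, error choices and the
 * 32-bit instruction value are hypothetical; sprlock(), uread(), uwrite() and
 * sprunlock() are the real primitives.
 *
 *	static int
 *	patch_user_word(pid_t pid, user_addr_t addr, uint32_t newinstr)
 *	{
 *		uint32_t old;
 *		proc_t *p = sprlock(pid);	// find, suspend and lock the target
 *		if (p == PROC_NULL) {
 *			return ESRCH;
 *		}
 *		if (uread(p, &old, sizeof(old), addr) != KERN_SUCCESS ||
 *		    uwrite(p, &newinstr, sizeof(newinstr), addr) != KERN_SUCCESS) {
 *			sprunlock(p);		// resume and release on failure
 *			return EFAULT;
 *		}
 *		sprunlock(p);			// resume the task, drop the proc ref
 *		return 0;
 *	}
 */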

/*
 * cpuvar
 */
LCK_MTX_DECLARE_ATTR(cpu_lock, &dtrace_lck_grp, &dtrace_lck_attr);
LCK_MTX_DECLARE_ATTR(cyc_lock, &dtrace_lck_grp, &dtrace_lck_attr);
LCK_MTX_DECLARE_ATTR(mod_lock, &dtrace_lck_grp, &dtrace_lck_attr);

dtrace_cpu_t *cpu_list;
cpu_core_t *cpu_core; /* XXX TLB lockdown? */

/*
 * cred_t
 */

/*
 * dtrace_CRED() can be called from probe context. We cannot simply call kauth_cred_get() since
 * that function may try to resolve a lazy credential binding, which entails taking the proc_lock.
 */
cred_t *
dtrace_CRED(void)
{
	return current_thread_ro_unchecked()->tro_cred;
}

int
PRIV_POLICY_CHOICE(void* cred, int priv, int all)
{
#pragma unused(priv, all)
	return kauth_cred_issuser(cred); /* XXX TODO: How is this different from PRIV_POLICY_ONLY? */
}

int
PRIV_POLICY_ONLY(void *cr, int priv, int boolean)
{
#pragma unused(priv, boolean)
	return kauth_cred_issuser(cr); /* XXX TODO: HAS_PRIVILEGE(cr, priv); */
}

uid_t
crgetuid(const cred_t *cr)
{
	/* kauth_cred_getuid() takes a non-const cred, so work on a local copy. */
	cred_t copy_cr = *cr;
	return kauth_cred_getuid(&copy_cr);
}

/*
 * "cyclic"
 */

typedef struct wrap_timer_call {
	/* node attributes */
	cyc_handler_t hdlr;
	cyc_time_t when;
	uint64_t deadline;
	int cpuid;
	boolean_t suspended;
	struct timer_call call;

	/* next item in the linked list */
	LIST_ENTRY(wrap_timer_call) entries;
} wrap_timer_call_t;

#define WAKEUP_REAPER 0x7FFFFFFFFFFFFFFFLL
#define NEARLY_FOREVER 0x7FFFFFFFFFFFFFFELL


typedef struct cyc_list {
	cyc_omni_handler_t cyl_omni;
	wrap_timer_call_t cyl_wrap_by_cpus[];
} cyc_list_t;

/* CPU going online/offline notifications */
void (*dtrace_cpu_state_changed_hook)(int, boolean_t) = NULL;
void dtrace_cpu_state_changed(int, boolean_t);

void
dtrace_install_cpu_hooks(void)
{
	dtrace_cpu_state_changed_hook = dtrace_cpu_state_changed;
}

void
dtrace_cpu_state_changed(int cpuid, boolean_t is_running)
{
	wrap_timer_call_t *wrapTC = NULL;
	boolean_t suspend = (is_running ? FALSE : TRUE);
	dtrace_icookie_t s;

	/* Ensure that we're not going to leave the CPU */
	s = dtrace_interrupt_disable();

	LIST_FOREACH(wrapTC, &(cpu_list[cpuid].cpu_cyc_list), entries) {
		assert3u(wrapTC->cpuid, ==, cpuid);
		if (suspend) {
			assert(!wrapTC->suspended);
			/* If this fails, we'll panic anyway, so let's do this now. */
			if (!timer_call_cancel(&wrapTC->call)) {
				panic("timer_call_cancel() failed to cancel a timer call: %p",
				    &wrapTC->call);
			}
			wrapTC->suspended = TRUE;
		} else {
			/* Rearm the timer, but ensure it was suspended first. */
			assert(wrapTC->suspended);
			clock_deadline_for_periodic_event(wrapTC->when.cyt_interval, mach_absolute_time(),
			    &wrapTC->deadline);
			timer_call_enter1(&wrapTC->call, (void*) wrapTC, wrapTC->deadline,
			    TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL);
			wrapTC->suspended = FALSE;
		}
	}

	/* Restore the previous interrupt state. */
	dtrace_interrupt_enable(s);
}

static void
_timer_call_apply_cyclic( void *ignore, void *vTChdl )
{
#pragma unused(ignore)
	wrap_timer_call_t *wrapTC = (wrap_timer_call_t *)vTChdl;

	(*(wrapTC->hdlr.cyh_func))( wrapTC->hdlr.cyh_arg );

	clock_deadline_for_periodic_event( wrapTC->when.cyt_interval, mach_absolute_time(), &(wrapTC->deadline));
	timer_call_enter1( &(wrapTC->call), (void *)wrapTC, wrapTC->deadline, TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL );
}

static cyclic_id_t
timer_call_add_cyclic(wrap_timer_call_t *wrapTC, cyc_handler_t *handler, cyc_time_t *when)
{
	uint64_t now;
	dtrace_icookie_t s;

	timer_call_setup( &(wrapTC->call), _timer_call_apply_cyclic, NULL );
	wrapTC->hdlr = *handler;
	wrapTC->when = *when;

	nanoseconds_to_absolutetime( wrapTC->when.cyt_interval, (uint64_t *)&wrapTC->when.cyt_interval );

	now = mach_absolute_time();
	wrapTC->deadline = now;

	clock_deadline_for_periodic_event( wrapTC->when.cyt_interval, now, &(wrapTC->deadline));

	/* Insert the timer to the list of the running timers on this CPU, and start it. */
	s = dtrace_interrupt_disable();
	wrapTC->cpuid = cpu_number();
	LIST_INSERT_HEAD(&cpu_list[wrapTC->cpuid].cpu_cyc_list, wrapTC, entries);
	timer_call_enter1(&wrapTC->call, (void*) wrapTC, wrapTC->deadline,
	    TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL);
	wrapTC->suspended = FALSE;
	dtrace_interrupt_enable(s);

	return (cyclic_id_t)wrapTC;
}

/*
 * Executed on the CPU the timer is running on.
 */
static void
timer_call_remove_cyclic(wrap_timer_call_t *wrapTC)
{
	assert(wrapTC);
	assert(cpu_number() == wrapTC->cpuid);

	if (!timer_call_cancel(&wrapTC->call)) {
		panic("timer_call_remove_cyclic() failed to cancel a timer call");
	}

	LIST_REMOVE(wrapTC, entries);
}

static void *
timer_call_get_cyclic_arg(wrap_timer_call_t *wrapTC)
{
	return wrapTC ? wrapTC->hdlr.cyh_arg : NULL;
}

cyclic_id_t
cyclic_timer_add(cyc_handler_t *handler, cyc_time_t *when)
{
	wrap_timer_call_t *wrapTC = kalloc_type(wrap_timer_call_t, Z_ZERO | Z_WAITOK);
	if (NULL == wrapTC) {
		return CYCLIC_NONE;
	} else {
		return timer_call_add_cyclic( wrapTC, handler, when );
	}
}

void
cyclic_timer_remove(cyclic_id_t cyclic)
{
	ASSERT( cyclic != CYCLIC_NONE );

	/* Removing a timer call must be done on the CPU the timer is running on. */
	wrap_timer_call_t *wrapTC = (wrap_timer_call_t *) cyclic;
	dtrace_xcall(wrapTC->cpuid, (dtrace_xcall_t) timer_call_remove_cyclic, (void*) cyclic);

	kfree_type(wrap_timer_call_t, wrapTC);
}

static void
_cyclic_add_omni(cyc_list_t *cyc_list)
{
	cyc_time_t cT;
	cyc_handler_t cH;
	cyc_omni_handler_t *omni = &cyc_list->cyl_omni;

	(omni->cyo_online)(omni->cyo_arg, CPU, &cH, &cT);

	wrap_timer_call_t *wrapTC = &cyc_list->cyl_wrap_by_cpus[cpu_number()];
	timer_call_add_cyclic(wrapTC, &cH, &cT);
}

cyclic_id_list_t
cyclic_add_omni(cyc_omni_handler_t *omni)
{
	cyc_list_t *cyc_list = kalloc_type(cyc_list_t, wrap_timer_call_t, NCPU, Z_WAITOK | Z_ZERO);

	if (NULL == cyc_list) {
		return NULL;
	}

	cyc_list->cyl_omni = *omni;

	dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)_cyclic_add_omni, (void *)cyc_list);

	return (cyclic_id_list_t)cyc_list;
}

static void
_cyclic_remove_omni(cyc_list_t *cyc_list)
{
	cyc_omni_handler_t *omni = &cyc_list->cyl_omni;
	void *oarg;
	wrap_timer_call_t *wrapTC;

	/*
	 * If the processor was offline when dtrace started, we did not allocate
	 * a cyclic timer for this CPU.
	 */
	if ((wrapTC = &cyc_list->cyl_wrap_by_cpus[cpu_number()]) != NULL) {
		oarg = timer_call_get_cyclic_arg(wrapTC);
		timer_call_remove_cyclic(wrapTC);
		(omni->cyo_offline)(omni->cyo_arg, CPU, oarg);
	}
}

void
cyclic_remove_omni(cyclic_id_list_t cyc_list)
{
	ASSERT(cyc_list != NULL);

	dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)_cyclic_remove_omni, (void *)cyc_list);
	void *cyc_list_p = (void *)cyc_list;
	kfree_type(cyc_list_t, wrap_timer_call_t, NCPU, cyc_list_p);
}
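
/*
 * Example (illustrative sketch): registering a per-CPU "omni" cyclic through
 * cyclic_add_omni()/cyclic_remove_omni() above. The callback names, argument
 * and 10ms interval are hypothetical; cyc_omni_handler_t and its cyo_online/
 * cyo_offline/cyo_arg fields are the real interface being emulated.
 *
 *	static void
 *	example_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when)
 *	{
 *		hdlr->cyh_func = example_tick;		// will fire on this CPU
 *		hdlr->cyh_arg = arg;
 *		when->cyt_when = 0;			// start immediately
 *		when->cyt_interval = 10000000;		// every 10ms, in nanoseconds
 *	}
 *
 *	cyc_omni_handler_t omni = {
 *		.cyo_online  = example_online,
 *		.cyo_offline = example_offline,
 *		.cyo_arg     = NULL,
 *	};
 *	cyclic_id_list_t id = cyclic_add_omni(&omni);	// arms one timer per CPU
 *	...
 *	cyclic_remove_omni(id);				// cancels and frees them all
 */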

typedef struct wrap_thread_call {
	thread_call_t TChdl;
	cyc_handler_t hdlr;
	cyc_time_t when;
	uint64_t deadline;
} wrap_thread_call_t;

/*
 * _cyclic_apply will run on some thread under kernel_task. That's OK for the
 * cleaner and the deadman, but too distant in time and place for the profile provider.
 */
static void
_cyclic_apply( void *ignore, void *vTChdl )
{
#pragma unused(ignore)
	wrap_thread_call_t *wrapTC = (wrap_thread_call_t *)vTChdl;

	(*(wrapTC->hdlr.cyh_func))( wrapTC->hdlr.cyh_arg );

	clock_deadline_for_periodic_event( wrapTC->when.cyt_interval, mach_absolute_time(), &(wrapTC->deadline));
	(void)thread_call_enter1_delayed( wrapTC->TChdl, (void *)wrapTC, wrapTC->deadline );

	/* Did cyclic_remove request a wakeup call when this thread call was re-armed? */
	if (wrapTC->when.cyt_interval == WAKEUP_REAPER) {
		thread_wakeup((event_t)wrapTC);
	}
}

cyclic_id_t
cyclic_add(cyc_handler_t *handler, cyc_time_t *when)
{
	uint64_t now;

	wrap_thread_call_t *wrapTC = kalloc_type(wrap_thread_call_t, Z_ZERO | Z_WAITOK);
	if (NULL == wrapTC) {
		return CYCLIC_NONE;
	}

	wrapTC->TChdl = thread_call_allocate( _cyclic_apply, NULL );
	wrapTC->hdlr = *handler;
	wrapTC->when = *when;

	ASSERT(when->cyt_when == 0);
	ASSERT(when->cyt_interval < WAKEUP_REAPER);

	nanoseconds_to_absolutetime(wrapTC->when.cyt_interval, (uint64_t *)&wrapTC->when.cyt_interval);

	now = mach_absolute_time();
	wrapTC->deadline = now;

	clock_deadline_for_periodic_event( wrapTC->when.cyt_interval, now, &(wrapTC->deadline));
	(void)thread_call_enter1_delayed( wrapTC->TChdl, (void *)wrapTC, wrapTC->deadline );

	return (cyclic_id_t)wrapTC;
}

static void
noop_cyh_func(void * ignore)
{
#pragma unused(ignore)
}

void
cyclic_remove(cyclic_id_t cyclic)
{
	wrap_thread_call_t *wrapTC = (wrap_thread_call_t *)cyclic;

	ASSERT(cyclic != CYCLIC_NONE);

	while (!thread_call_cancel(wrapTC->TChdl)) {
		int ret = assert_wait(wrapTC, THREAD_UNINT);
		ASSERT(ret == THREAD_WAITING);

		wrapTC->when.cyt_interval = WAKEUP_REAPER;

		ret = thread_block(THREAD_CONTINUE_NULL);
		ASSERT(ret == THREAD_AWAKENED);
	}

	if (thread_call_free(wrapTC->TChdl)) {
		kfree_type(wrap_thread_call_t, wrapTC);
	} else {
		/* Gut this cyclic and move on ... */
		wrapTC->hdlr.cyh_func = noop_cyh_func;
		wrapTC->when.cyt_interval = NEARLY_FOREVER;
	}
}
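
/*
 * Example (illustrative sketch): a thread-call-backed cyclic, the flavor the
 * dtrace cleaner and deadman use. The handler name and one-second interval
 * are hypothetical; cyclic_add()/cyclic_remove() are the real entry points.
 *
 *	static void example_clean(void *arg);	// runs on a kernel_task thread
 *
 *	cyc_handler_t hdlr = {
 *		.cyh_func = example_clean,
 *		.cyh_arg  = NULL,
 *	};
 *	cyc_time_t when = {
 *		.cyt_when     = 0,		// required: start immediately
 *		.cyt_interval = 1000000000LL,	// once per second, in nanoseconds
 *	};
 *	cyclic_id_t id = cyclic_add(&hdlr, &when);
 *	...
 *	cyclic_remove(id);			// blocks until the call is cancelled
 */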

int
ddi_driver_major(dev_info_t *devi)
{
	return (int)major(CAST_DOWN_EXPLICIT(int, devi));
}

int
ddi_create_minor_node(dev_info_t *dip, const char *name, int spec_type,
    minor_t minor_num, const char *node_type, int flag)
{
#pragma unused(spec_type,node_type,flag)
	dev_t dev = makedev( ddi_driver_major(dip), minor_num );

	if (NULL == devfs_make_node( dev, DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666, "%s", name )) {
		return DDI_FAILURE;
	} else {
		return DDI_SUCCESS;
	}
}

void
ddi_remove_minor_node(dev_info_t *dip, char *name)
{
#pragma unused(dip,name)
/* XXX called from dtrace_detach, so NOTREACHED for now. */
}

major_t
getemajor( dev_t d )
{
	return (major_t) major(d);
}

minor_t
getminor( dev_t d )
{
	return (minor_t) minor(d);
}

extern void Debugger(const char*);

void
debug_enter(char *c)
{
	Debugger(c);
}

/*
 * kmem
 */

// rdar://88962505
__typed_allocators_ignore_push

void *
dt_kmem_alloc_tag(size_t size, int kmflag, vm_tag_t tag)
{
#pragma unused(kmflag)

/*
 * We ignore the M_NOWAIT bit in kmflag (all of kmflag, in fact).
 * Requests larger than 8K with M_NOWAIT fail in kalloc_ext.
 */
	return kheap_alloc_tag(KHEAP_DTRACE, size, Z_WAITOK, tag);
}

void *
dt_kmem_zalloc_tag(size_t size, int kmflag, vm_tag_t tag)
{
#pragma unused(kmflag)

/*
 * We ignore the M_NOWAIT bit in kmflag (all of kmflag, in fact).
 * Requests larger than 8K with M_NOWAIT fail in kalloc_ext.
 */
	return kheap_alloc_tag(KHEAP_DTRACE, size, Z_WAITOK | Z_ZERO, tag);
}

void
dt_kmem_free(void *buf, size_t size)
{
	kheap_free(KHEAP_DTRACE, buf, size);
}

__typed_allocators_ignore_pop


/*
 * aligned dt_kmem allocator
 * align should be a power of two
 */

void*
dt_kmem_alloc_aligned_tag(size_t size, size_t align, int kmflag, vm_tag_t tag)
{
	void *mem, **addr_to_free;
	intptr_t mem_aligned;
	size_t *size_to_free, hdr_size;

	/* Must be a power of two. */
	assert(align != 0);
	assert((align & (align - 1)) == 0);

	/*
	 * We are going to add a header to the allocation. It contains
	 * the address to free and the total size of the buffer.
	 */
	hdr_size = sizeof(size_t) + sizeof(void*);
	mem = dt_kmem_alloc_tag(size + align + hdr_size, kmflag, tag);
	if (mem == NULL) {
		return NULL;
	}

	mem_aligned = (intptr_t) (((intptr_t) mem + align + hdr_size) & ~(align - 1));

	/* Write the address to free in the header. */
	addr_to_free = (void**) (mem_aligned - sizeof(void*));
	*addr_to_free = mem;

	/* Write the size to free in the header. */
	size_to_free = (size_t*) (mem_aligned - hdr_size);
	*size_to_free = size + align + hdr_size;

	return (void*) mem_aligned;
}

void*
dt_kmem_zalloc_aligned_tag(size_t size, size_t align, int kmflag, vm_tag_t tag)
{
	void* buf;

	buf = dt_kmem_alloc_aligned_tag(size, align, kmflag, tag);

	if (!buf) {
		return NULL;
	}

	bzero(buf, size);

	return buf;
}

void
dt_kmem_free_aligned(void* buf, size_t size)
{
#pragma unused(size)
	intptr_t ptr = (intptr_t) buf;
	void **addr_to_free = (void**) (ptr - sizeof(void*));
	size_t *size_to_free = (size_t*) (ptr - (sizeof(size_t) + sizeof(void*)));

	if (buf == NULL) {
		return;
	}

	dt_kmem_free(*addr_to_free, *size_to_free);
}
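
/*
 * Layout note (illustrative): assuming 8-byte pointers and, say, align = 16,
 * dt_kmem_alloc_aligned_tag() produces
 *
 *	mem                                        mem_aligned (returned)
 *	 |                                          |
 *	 v                                          v
 *	 [ padding ... ][ total size ][ raw addr ][ user buffer of 'size' bytes ]
 *	                 ^ size_to_free ^ addr_to_free
 *
 * dt_kmem_free_aligned() reads the two header words just below the returned
 * pointer to recover the original allocation address and length to free.
 */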

/*
 * vmem (the Solaris arena/resource allocator) is used by DTrace solely to hand out resource ids
 */
typedef unsigned int u_daddr_t;
#include "blist.h"

/* By passing around blist *handles*, the underlying blist can be resized as needed. */
struct blist_hdl {
	blist_t blist;
};

vmem_t *
vmem_create(const char *name, void *base, size_t size, size_t quantum, void *ignore5,
    void *ignore6, vmem_t *source, size_t qcache_max, int vmflag)
{
#pragma unused(name,quantum,ignore5,ignore6,source,qcache_max,vmflag)
	blist_t bl;
	struct blist_hdl *p = kalloc_type(struct blist_hdl, Z_WAITOK);

	ASSERT(quantum == 1);
	ASSERT(NULL == ignore5);
	ASSERT(NULL == ignore6);
	ASSERT(NULL == source);
	ASSERT(0 == qcache_max);
	ASSERT(size <= INT32_MAX);
	ASSERT(vmflag & VMC_IDENTIFIER);

	size = MIN(128, size); /* Clamp to 128 initially, since the underlying data structure is pre-allocated */

	p->blist = bl = blist_create((daddr_t)size);
	blist_free(bl, 0, (daddr_t)size);
	if (base) {
		blist_alloc( bl, (daddr_t)(uintptr_t)base ); /* Chomp off initial ID(s) */
	}
	return (vmem_t *)p;
}

void *
vmem_alloc(vmem_t *vmp, size_t size, int vmflag)
{
#pragma unused(vmflag)
	struct blist_hdl *q = (struct blist_hdl *)vmp;
	blist_t bl = q->blist;
	daddr_t p;

	p = blist_alloc(bl, (daddr_t)size);

	if (p == SWAPBLK_NONE) {
		blist_resize(&bl, (bl->bl_blocks) << 1, 1);
		q->blist = bl;
		p = blist_alloc(bl, (daddr_t)size);
		if (p == SWAPBLK_NONE) {
			panic("vmem_alloc: failure after blist_resize!");
		}
	}

	return (void *)(uintptr_t)p;
}

void
vmem_free(vmem_t *vmp, void *vaddr, size_t size)
{
	struct blist_hdl *p = (struct blist_hdl *)vmp;

	blist_free( p->blist, (daddr_t)(uintptr_t)vaddr, (daddr_t)size );
}

void
vmem_destroy(vmem_t *vmp)
{
	struct blist_hdl *p = (struct blist_hdl *)vmp;

	blist_destroy( p->blist );
	kfree_type(struct blist_hdl, p);
}
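
/*
 * Example (illustrative sketch): dtrace uses this shim purely as an integer
 * ID arena. The arena name, limit and flags below are hypothetical; passing
 * base = 1 "chomps off" ID 0 so valid identifiers are never zero.
 *
 *	vmem_t *arena = vmem_create("example_ids", (void *)(uintptr_t)1, INT32_MAX, 1,
 *	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
 *
 *	uint32_t id = (uint32_t)(uintptr_t)vmem_alloc(arena, 1, VM_BESTFIT | VM_SLEEP);
 *	...
 *	vmem_free(arena, (void *)(uintptr_t)id, 1);
 *	vmem_destroy(arena);
 */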

/*
 * Timing
 */

/*
 * dtrace_gethrestime() provides the "walltimestamp", a value that is anchored at
 * January 1, 1970. Because it can be called from probe context, it must take no locks.
 */

hrtime_t
dtrace_gethrestime(void)
{
	clock_sec_t secs;
	clock_nsec_t nanosecs;
	uint64_t secs64, ns64;

	clock_get_calendar_nanotime_nowait(&secs, &nanosecs);
	secs64 = (uint64_t)secs;
	ns64 = (uint64_t)nanosecs;

	ns64 = ns64 + (secs64 * 1000000000LL);
	return ns64;
}

/*
 * dtrace_gethrtime() provides high-resolution timestamps with machine-dependent origin.
 * Hence its primary use is to specify intervals.
 */

hrtime_t
dtrace_abs_to_nano(uint64_t elapsed)
{
	static mach_timebase_info_data_t sTimebaseInfo = { 0, 0 };

	/*
	 * If this is the first time we've run, get the timebase.
	 * We can use denom == 0 to indicate that sTimebaseInfo is
	 * uninitialised because it makes no sense to have a zero
	 * denominator in a fraction.
	 */

	if (sTimebaseInfo.denom == 0) {
		(void) clock_timebase_info(&sTimebaseInfo);
	}

	/*
	 * Convert to nanoseconds.
	 * return (elapsed * (uint64_t)sTimebaseInfo.numer)/(uint64_t)sTimebaseInfo.denom;
	 *
	 * Provided the final result is representable in 64 bits the following maneuver will
	 * deliver that result without intermediate overflow.
	 */
	if (sTimebaseInfo.denom == sTimebaseInfo.numer) {
		return elapsed;
	} else if (sTimebaseInfo.denom == 1) {
		return elapsed * (uint64_t)sTimebaseInfo.numer;
	} else {
		/* Decompose elapsed = eta32 * 2^32 + eps32: */
		uint64_t eta32 = elapsed >> 32;
		uint64_t eps32 = elapsed & 0x00000000ffffffffLL;

		uint32_t numer = sTimebaseInfo.numer, denom = sTimebaseInfo.denom;

		/* Form product of elapsed64 (decomposed) and numer: */
		uint64_t mu64 = numer * eta32;
		uint64_t lambda64 = numer * eps32;

		/* Divide the constituents by denom: */
		uint64_t q32 = mu64 / denom;
		uint64_t r32 = mu64 - (q32 * denom); /* mu64 % denom */

		return (q32 << 32) + ((r32 << 32) + lambda64) / denom;
	}
}
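
/*
 * Worked form of the identity used above (editorial note). With
 * elapsed = eta32 * 2^32 + eps32, mu64 = numer * eta32, lambda64 = numer * eps32
 * and mu64 = q32 * denom + r32:
 *
 *	elapsed * numer / denom
 *	    = (mu64 * 2^32 + lambda64) / denom
 *	    = q32 * 2^32 + (r32 * 2^32 + lambda64) / denom
 *
 * which is exactly the returned expression. The point of the decomposition is
 * to avoid forming elapsed * numer directly, a product that can overflow
 * 64 bits even when the final quotient fits.
 */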

hrtime_t
dtrace_gethrtime(void)
{
	static uint64_t start = 0;

	if (start == 0) {
		start = mach_absolute_time();
	}

	return dtrace_abs_to_nano(mach_absolute_time() - start);
}

/*
 * Atomicity and synchronization
 */
uint32_t
dtrace_cas32(uint32_t *target, uint32_t cmp, uint32_t new)
{
	if (OSCompareAndSwap((UInt32)cmp, (UInt32)new, (volatile UInt32 *)target )) {
		return cmp;
	} else {
		return ~cmp; /* Must return something *other* than cmp */
	}
}

void *
dtrace_casptr(void *target, void *cmp, void *new)
{
	if (OSCompareAndSwapPtr( cmp, new, (void**)target )) {
		return cmp;
	} else {
		return (void *)(~(uintptr_t)cmp); /* Must return something *other* than cmp */
	}
}
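
/*
 * Example (illustrative sketch): dtrace_cas32() reports success by returning
 * the compare value, so lock-free updates are written as retry loops. The
 * counter pointer below is hypothetical.
 *
 *	uint32_t old, new;
 *	do {
 *		old = *counterp;
 *		new = old + 1;
 *	} while (dtrace_cas32(counterp, old, new) != old);
 */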

/*
 * Interrupt manipulation
 */
dtrace_icookie_t
dtrace_interrupt_disable(void)
{
	return (dtrace_icookie_t)ml_set_interrupts_enabled(FALSE);
}

void
dtrace_interrupt_enable(dtrace_icookie_t reenable)
{
	(void)ml_set_interrupts_enabled((boolean_t)reenable);
}

/*
 * MP coordination
 */
static void
dtrace_sync_func(void)
{
}

/*
 * dtrace_sync() is not called from probe context.
 */
void
dtrace_sync(void)
{
	dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_sync_func, NULL);
}

/*
 * The dtrace_copyin/out/instr and dtrace_fuword* routines can be called from probe context.
 */

extern kern_return_t dtrace_copyio_preflight(addr64_t);
extern kern_return_t dtrace_copyio_postflight(addr64_t);

static int
dtrace_copycheck(user_addr_t uaddr, uintptr_t kaddr, size_t size)
{
#pragma unused(kaddr)

	ASSERT(kaddr + size >= kaddr);

	if (uaddr + size < uaddr || /* Avoid address wrap. */
	    KERN_FAILURE == dtrace_copyio_preflight(uaddr)) { /* Machine specific setup/constraints. */
		DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = uaddr;
		return 0;
	}
	return 1;
}

void
dtrace_copyin(user_addr_t src, uintptr_t dst, size_t len, volatile uint16_t *flags)
{
#pragma unused(flags)

	if (dtrace_copycheck( src, dst, len )) {
		if (copyin((const user_addr_t)src, (char *)dst, (vm_size_t)len)) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = src;
		}
		dtrace_copyio_postflight(src);
	}
}

void
dtrace_copyinstr(user_addr_t src, uintptr_t dst, size_t len, volatile uint16_t *flags)
{
#pragma unused(flags)

	size_t actual;

	if (dtrace_copycheck( src, dst, len )) {
		/* copyin as many as 'len' bytes. */
		int error = copyinstr((const user_addr_t)src, (char *)dst, (vm_size_t)len, &actual);

		/*
		 * ENAMETOOLONG is returned when 'len' bytes have been copied in but the NUL terminator was
		 * not encountered. That does not require raising CPU_DTRACE_BADADDR, and we press on.
		 * Note that we do *not* stuff a NUL terminator when returning ENAMETOOLONG; that's left
		 * to the caller.
		 */
		if (error && error != ENAMETOOLONG) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = src;
		}
		dtrace_copyio_postflight(src);
	}
}

void
dtrace_copyout(uintptr_t src, user_addr_t dst, size_t len, volatile uint16_t *flags)
{
#pragma unused(flags)

	if (dtrace_copycheck( dst, src, len )) {
		if (copyout((const void *)src, dst, (vm_size_t)len)) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = dst;
		}
		dtrace_copyio_postflight(dst);
	}
}

void
dtrace_copyoutstr(uintptr_t src, user_addr_t dst, size_t len, volatile uint16_t *flags)
{
#pragma unused(flags)

	size_t actual;

	if (dtrace_copycheck( dst, src, len )) {
		/*
		 * ENAMETOOLONG is returned when 'len' bytes have been copied out but the NUL terminator was
		 * not encountered. We raise CPU_DTRACE_BADADDR in that case.
		 * Note that we do *not* stuff a NUL terminator when returning ENAMETOOLONG; that's left
		 * to the caller.
		 */
		if (copyoutstr((const void *)src, dst, (size_t)len, &actual)) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = dst;
		}
		dtrace_copyio_postflight(dst);
	}
}

extern const int copysize_limit_panic;

int
dtrace_copy_maxsize(void)
{
	return copysize_limit_panic;
}


int
dtrace_buffer_copyout(const void *kaddr, user_addr_t uaddr, vm_size_t nbytes)
{
	int maxsize = dtrace_copy_maxsize();
	/*
	 * Partition the copyout in copysize_limit_panic-sized chunks
	 */
	while (nbytes >= (vm_size_t)maxsize) {
		if (copyout(kaddr, uaddr, maxsize) != 0) {
			return EFAULT;
		}

		nbytes -= maxsize;
		uaddr += maxsize;
		kaddr = (const void *)((uintptr_t)kaddr + maxsize);
	}
	if (nbytes > 0) {
		if (copyout(kaddr, uaddr, nbytes) != 0) {
			return EFAULT;
		}
	}

	return 0;
}
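
/*
 * Worked example (editorial note): with a hypothetical maxsize of 0x4000
 * bytes and nbytes = 0x9000, the loop above issues two 0x4000-byte copyouts
 * and the trailing "if (nbytes > 0)" copies the remaining 0x1000. The real
 * chunk size is whatever copysize_limit_panic is on the running kernel.
 */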

uint8_t
dtrace_fuword8(user_addr_t uaddr)
{
	uint8_t ret = 0;

	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
	if (dtrace_copycheck( uaddr, (uintptr_t)&ret, sizeof(ret))) {
		if (copyin((const user_addr_t)uaddr, (char *)&ret, sizeof(ret))) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = uaddr;
		}
		dtrace_copyio_postflight(uaddr);
	}
	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);

	return ret;
}

uint16_t
dtrace_fuword16(user_addr_t uaddr)
{
	uint16_t ret = 0;

	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
	if (dtrace_copycheck( uaddr, (uintptr_t)&ret, sizeof(ret))) {
		if (copyin((const user_addr_t)uaddr, (char *)&ret, sizeof(ret))) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = uaddr;
		}
		dtrace_copyio_postflight(uaddr);
	}
	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);

	return ret;
}

uint32_t
dtrace_fuword32(user_addr_t uaddr)
{
	uint32_t ret = 0;

	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
	if (dtrace_copycheck( uaddr, (uintptr_t)&ret, sizeof(ret))) {
		if (copyin((const user_addr_t)uaddr, (char *)&ret, sizeof(ret))) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = uaddr;
		}
		dtrace_copyio_postflight(uaddr);
	}
	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);

	return ret;
}

uint64_t
dtrace_fuword64(user_addr_t uaddr)
{
	uint64_t ret = 0;

	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
	if (dtrace_copycheck( uaddr, (uintptr_t)&ret, sizeof(ret))) {
		if (copyin((const user_addr_t)uaddr, (char *)&ret, sizeof(ret))) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = uaddr;
		}
		dtrace_copyio_postflight(uaddr);
	}
	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);

	return ret;
}

/*
 * Emulation of Solaris fuword / suword
 * Called from the fasttrap provider, so the use of copyin/out requires fewer safeguards.
 */

int
fuword8(user_addr_t uaddr, uint8_t *value)
{
	if (copyin((const user_addr_t)uaddr, (char *)value, sizeof(uint8_t)) != 0) {
		return -1;
	}

	return 0;
}

int
fuword16(user_addr_t uaddr, uint16_t *value)
{
	if (copyin((const user_addr_t)uaddr, (char *)value, sizeof(uint16_t)) != 0) {
		return -1;
	}

	return 0;
}

int
fuword32(user_addr_t uaddr, uint32_t *value)
{
	if (copyin((const user_addr_t)uaddr, (char *)value, sizeof(uint32_t)) != 0) {
		return -1;
	}

	return 0;
}

int
fuword64(user_addr_t uaddr, uint64_t *value)
{
	if (copyin((const user_addr_t)uaddr, (char *)value, sizeof(uint64_t)) != 0) {
		return -1;
	}

	return 0;
}

void
fuword32_noerr(user_addr_t uaddr, uint32_t *value)
{
	if (copyin((const user_addr_t)uaddr, (char *)value, sizeof(uint32_t))) {
		*value = 0;
	}
}

void
fuword64_noerr(user_addr_t uaddr, uint64_t *value)
{
	if (copyin((const user_addr_t)uaddr, (char *)value, sizeof(uint64_t))) {
		*value = 0;
	}
}

int
suword64(user_addr_t addr, uint64_t value)
{
	if (copyout((const void *)&value, addr, sizeof(value)) != 0) {
		return -1;
	}

	return 0;
}

int
suword32(user_addr_t addr, uint32_t value)
{
	if (copyout((const void *)&value, addr, sizeof(value)) != 0) {
		return -1;
	}

	return 0;
}

/*
 * Miscellaneous
 */
extern boolean_t dtrace_tally_fault(user_addr_t);

boolean_t
dtrace_tally_fault(user_addr_t uaddr)
{
	DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
	cpu_core[CPU->cpu_id].cpuc_dtrace_illval = uaddr;
	return DTRACE_CPUFLAG_ISSET(CPU_DTRACE_NOFAULT) ? TRUE : FALSE;
}

#define TOTTY 0x02
extern int prf(const char *, va_list, int, struct tty *); /* bsd/kern/subr_prf.h */

int
vuprintf(const char *format, va_list ap)
{
	return prf(format, ap, TOTTY, NULL);
}

/* Not called from probe context */
void
cmn_err( int level, const char *format, ... )
{
#pragma unused(level)
	va_list alist;

	va_start(alist, format);
	vuprintf(format, alist);
	va_end(alist);
	uprintf("\n");
}

const void*
bsearch(const void *key, const void *base0, size_t nmemb, size_t size, int (*compar)(const void *, const void *))
{
	const char *base = base0;
	size_t lim;
	int cmp;
	const void *p;
	for (lim = nmemb; lim != 0; lim >>= 1) {
		p = base + (lim >> 1) * size;
		cmp = (*compar)(key, p);
		if (cmp == 0) {
			return p;
		}
		if (cmp > 0) { /* key > p: move right */
			base = (const char *)p + size;
			lim--;
		} /* else move left */
	}
	return NULL;
}
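
/*
 * Example (illustrative sketch): this bsearch() expects a sorted array and a
 * strcmp-style comparator. The table and comparator below are hypothetical.
 *
 *	static int
 *	cmp_u32(const void *key, const void *elem)
 *	{
 *		uint32_t k = *(const uint32_t *)key, e = *(const uint32_t *)elem;
 *		return (k > e) - (k < e);
 *	}
 *
 *	static const uint32_t table[] = { 2, 3, 5, 7, 11 };
 *	uint32_t key = 7;
 *	const uint32_t *hit = bsearch(&key, table, 5, sizeof(table[0]), cmp_u32);
 *	// hit points at table[3]; NULL would mean "not found".
 */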

/*
 * Runtime and ABI
 */
uintptr_t
dtrace_caller(int ignore)
{
#pragma unused(ignore)
	return -1; /* Just as in Solaris dtrace_asm.s */
}

int
dtrace_getstackdepth(int aframes)
{
	struct frame *fp = (struct frame *)__builtin_frame_address(0);
	struct frame *nextfp, *minfp, *stacktop;
	int depth = 0;
	int on_intr;

	if ((on_intr = CPU_ON_INTR(CPU)) != 0) {
		stacktop = (struct frame *)dtrace_get_cpu_int_stack_top();
	} else {
		stacktop = (struct frame *)(dtrace_get_kernel_stack(current_thread()) + kernel_stack_size);
	}

	minfp = fp;

	aframes++;

	for (;;) {
		depth++;

		nextfp = *(struct frame **)fp;

		if (nextfp <= minfp || nextfp >= stacktop) {
			if (on_intr) {
				/*
				 * Hop from interrupt stack to thread stack.
				 */
				vm_offset_t kstack_base = dtrace_get_kernel_stack(current_thread());

				minfp = (struct frame *)kstack_base;
				stacktop = (struct frame *)(kstack_base + kernel_stack_size);

				on_intr = 0;
				continue;
			}
			break;
		}

		fp = nextfp;
		minfp = fp;
	}

	if (depth <= aframes) {
		return 0;
	}

	return depth - aframes;
}

int
dtrace_addr_in_module(const void* addr, const struct modctl *ctl)
{
	return OSKextKextForAddress(addr) == (void*)ctl->mod_address;
}

/*
 * Unconsidered
 */
void
dtrace_vtime_enable(void)
{
}

void
dtrace_vtime_disable(void)
{
}