dtrace.c source code [xnu/bsd/dev/dtrace/dtrace.c]

1	/*
2	* CDDL HEADER START
3	*
4	* The contents of this file are subject to the terms of the
5	* Common Development and Distribution License (the "License").
6	* You may not use this file except in compliance with the License.
7	*
8	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9	* or http://www.opensolaris.org/os/licensing.
10	* See the License for the specific language governing permissions
11	* and limitations under the License.
12	*
13	* When distributing Covered Code, include this CDDL HEADER in each
14	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15	* If applicable, add the following below this CDDL HEADER, with the
16	* fields enclosed by brackets "[]" replaced with your own identifying
17	* information: Portions Copyright [yyyy] [name of copyright owner]
18	*
19	* CDDL HEADER END
20	*/
21
22	/*
23	* Portions Copyright (c) 2013, 2016, Joyent, Inc. All rights reserved.
24	* Portions Copyright (c) 2013 by Delphix. All rights reserved.
25	*/
26
27	/*
28	* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
29	* Use is subject to license terms.
30	*/
31
32	/*
33	* DTrace - Dynamic Tracing for Solaris
34	*
35	* This is the implementation of the Solaris Dynamic Tracing framework
36	* (DTrace). The user-visible interface to DTrace is described at length in
37	* the "Solaris Dynamic Tracing Guide". The interfaces between the libdtrace
38	* library, the in-kernel DTrace framework, and the DTrace providers are
39	* described in the block comments in the <sys/dtrace.h> header file. The
40	* internal architecture of DTrace is described in the block comments in the
41	* <sys/dtrace_impl.h> header file. The comments contained within the DTrace
42	* implementation very much assume mastery of all of these sources; if one has
43	* an unanswered question about the implementation, one should consult them
44	* first.
45	*
46	* The functions here are ordered roughly as follows:
47	*
48	* - Probe context functions
49	* - Probe hashing functions
50	* - Non-probe context utility functions
51	* - Matching functions
52	* - Provider-to-Framework API functions
53	* - Probe management functions
54	* - DIF object functions
55	* - Format functions
56	* - Predicate functions
57	* - ECB functions
58	* - Buffer functions
59	* - Enabling functions
60	* - DOF functions
61	* - Anonymous enabling functions
62	* - Process functions
63	* - Consumer state functions
64	* - Helper functions
65	* - Hook functions
66	* - Driver cookbook functions
67	*
68	* Each group of functions begins with a block comment labelled the "DTrace
69	* [Group] Functions", allowing one to find each block by searching forward
70	* on capital-f functions.
71	*/
72	#include <sys/errno.h>
73	#include <sys/types.h>
74	#include <sys/stat.h>
75	#include <sys/conf.h>
76	#include <sys/random.h>
77	#include <sys/systm.h>
78	#include <sys/dtrace_impl.h>
79	#include <sys/param.h>
80	#include <sys/proc_internal.h>
81	#include <sys/ioctl.h>
82	#include <sys/fcntl.h>
83	#include <miscfs/devfs/devfs.h>
84	#include <sys/malloc.h>
85	#include <sys/kernel_types.h>
86	#include <sys/proc_internal.h>
87	#include <sys/uio_internal.h>
88	#include <sys/kauth.h>
89	#include <vm/pmap.h>
90	#include <sys/user.h>
91	#include <mach/exception_types.h>
92	#include <sys/signalvar.h>
93	#include <mach/task.h>
94	#include <kern/ast.h>
95	#include <kern/hvg_hypercall.h>
96	#include <kern/sched_prim.h>
97	#include <kern/processor.h>
98	#include <kern/task.h>
99	#include <kern/zalloc.h>
100	#include <netinet/in.h>
101	#include <libkern/sysctl.h>
102	#include <sys/kdebug.h>
103	#include <sys/sdt_impl.h>
104
105	#if CONFIG_PERVASIVE_CPI
106	#include <kern/monotonic.h>
107	#include <machine/monotonic.h>
108	#endif /* CONFIG_PERVASIVE_CPI */
109
110	#include "dtrace_xoroshiro128_plus.h"
111
112	#include <IOKit/IOPlatformExpert.h>
113
114	#include <kern/cpu_data.h>
115
116	extern addr64_t kvtophys(vm_offset_t va);
117
118	extern uint32_t pmap_find_phys(void *, uint64_t);
119	extern boolean_t pmap_valid_page(uint32_t);
120	extern void OSKextRegisterKextsWithDTrace(void);
121	extern kmod_info_t g_kernel_kmod_info;
122	extern void commpage_update_dof(boolean_t enabled);
123
124	/ Solaris proc_t is the struct. Darwin's proc_t is a pointer to it. /
125	#define proc_t struct proc /* Steer clear of the Darwin typedef for proc_t */
126
127	#define t_predcache t_dtrace_predcache /* Cosmetic. Helps readability of thread.h */
128
129	extern void dtrace_suspend(void);
130	extern void dtrace_resume(void);
131	extern void dtrace_early_init(void);
132	extern int dtrace_keep_kernel_symbols(void);
133	extern void dtrace_init(void);
134	extern void helper_init(void);
135	extern void fasttrap_init(void);
136
137	static int dtrace_lazy_dofs_duplicate(proc_t , proc_t );
138	extern void dtrace_lazy_dofs_destroy(proc_t *);
139	extern void dtrace_postinit(void);
140
141	extern void dtrace_proc_fork(proc_t, proc_t, int);
142	extern void dtrace_proc_exec(proc_t*);
143	extern void dtrace_proc_exit(proc_t*);
144
145	/*
146	* DTrace Tunable Variables
147	*
148	* The following variables may be dynamically tuned by using sysctl(8), the
149	* variables being stored in the kern.dtrace namespace. For example:
150	* sysctl kern.dtrace.dof_maxsize = 1048575 # 1M
151	*
152	* In general, the only variables that one should be tuning this way are those
153	* that affect system-wide DTrace behavior, and for which the default behavior
154	* is undesirable. Most of these variables are tunable on a per-consumer
155	* basis using DTrace options, and need not be tuned on a system-wide basis.
156	* When tuning these variables, avoid pathological values; while some attempt
157	* is made to verify the integrity of these variables, they are not considered
158	* part of the supported interface to DTrace, and they are therefore not
159	* checked comprehensively.
160	*/
161	uint64_t dtrace_buffer_memory_maxsize = `0`; / initialized in dtrace_init /
162	uint64_t dtrace_buffer_memory_inuse = `0`;
163	int dtrace_destructive_disallow = `1`;
164	dtrace_optval_t dtrace_nonroot_maxsize = (`16` * `1024` * `1024`);
165	size_t dtrace_difo_maxsize = (`256` * `1024`);
166	dtrace_optval_t dtrace_dof_maxsize = (`512` * `1024`);
167	dtrace_optval_t dtrace_statvar_maxsize = (`16` * `1024`);
168	dtrace_optval_t dtrace_statvar_maxsize_max = (`16` * `10` * `1024`);
169	size_t dtrace_actions_max = (`16` * `1024`);
170	size_t dtrace_retain_max = `1024`;
171	dtrace_optval_t dtrace_helper_actions_max = `32`;
172	dtrace_optval_t dtrace_helper_providers_max = `64`;
173	dtrace_optval_t dtrace_dstate_defsize = (`1` * `1024` * `1024`);
174	size_t dtrace_strsize_default = `256`;
175	dtrace_optval_t dtrace_strsize_min = `8`;
176	dtrace_optval_t dtrace_strsize_max = `65536`;
177	dtrace_optval_t dtrace_cleanrate_default = `990099000`; / 1.1 hz /
178	dtrace_optval_t dtrace_cleanrate_min = `20000000`; / 50 hz /
179	dtrace_optval_t dtrace_cleanrate_max = (uint64_t)`60` * NANOSEC; / 1/minute /
180	dtrace_optval_t dtrace_aggrate_default = NANOSEC; / 1 hz /
181	dtrace_optval_t dtrace_statusrate_default = NANOSEC; / 1 hz /
182	dtrace_optval_t dtrace_statusrate_max = (hrtime_t)`10` * NANOSEC; / 6/minute /
183	dtrace_optval_t dtrace_switchrate_default = NANOSEC; / 1 hz /
184	dtrace_optval_t dtrace_nspec_default = `1`;
185	dtrace_optval_t dtrace_specsize_default = `32` * `1024`;
186	dtrace_optval_t dtrace_stackframes_default = `20`;
187	dtrace_optval_t dtrace_ustackframes_default = `20`;
188	dtrace_optval_t dtrace_jstackframes_default = `50`;
189	dtrace_optval_t dtrace_jstackstrsize_default = `512`;
190	dtrace_optval_t dtrace_buflimit_default = `75`;
191	dtrace_optval_t dtrace_buflimit_min = `1`;
192	dtrace_optval_t dtrace_buflimit_max = `99`;
193	size_t dtrace_nprobes_default = `4`;
194	int dtrace_msgdsize_max = `128`;
195	hrtime_t dtrace_chill_max = `500` * (NANOSEC / MILLISEC); / 500 ms /
196	hrtime_t dtrace_chill_interval = NANOSEC; / 1000 ms /
197	int dtrace_devdepth_max = `32`;
198	int dtrace_err_verbose;
199	hrtime_t dtrace_deadman_interval = NANOSEC;
200	hrtime_t dtrace_deadman_timeout = (hrtime_t)`10` * NANOSEC;
201	hrtime_t dtrace_deadman_user = (hrtime_t)`30` * NANOSEC;
202
203	/*
204	* DTrace External Variables
205	*
206	* As dtrace(7D) is a kernel module, any DTrace variables are obviously
207	* available to DTrace consumers via the backtick (`) syntax. One of these,
208	* dtrace_zero, is made deliberately so: it is provided as a source of
209	* well-known, zero-filled memory. While this variable is not documented,
210	* it is used by some translators as an implementation detail.
211	*/
const char	dtrace_zero[256] = { 0 };	/* zero-filled memory */
unsigned int	dtrace_max_cpus = 0;		/* number of enabled cpus */
214	/*
215	* DTrace Internal Variables
216	*/
217	static dev_info_t dtrace_devi; /* device info /
218	static vmem_t dtrace_arena; /* probe ID arena /
219	static dtrace_probe_t *dtrace_probes; /* array of all probes /
220	static int dtrace_nprobes; / number of probes /
221	static dtrace_provider_t dtrace_provider; /* provider list /
222	static dtrace_meta_t dtrace_meta_pid; /* user-land meta provider /
223	static int dtrace_opens; / number of opens /
224	static int dtrace_helpers; / number of helpers /
225	static dtrace_hash_t *dtrace_strings;
226	static dtrace_hash_t dtrace_byprov; /* probes hashed by provider /
227	static dtrace_hash_t dtrace_bymod; /* probes hashed by module /
228	static dtrace_hash_t dtrace_byfunc; /* probes hashed by function /
229	static dtrace_hash_t dtrace_byname; /* probes hashed by name /
230	static dtrace_toxrange_t dtrace_toxrange; /* toxic range array /
231	static int dtrace_toxranges; / number of toxic ranges /
232	static int dtrace_toxranges_max; / size of toxic range array /
233	static dtrace_anon_t dtrace_anon; / anonymous enabling /
234	static uint64_t dtrace_vtime_references; / number of vtimestamp refs /
235	static kthread_t dtrace_panicked; /* panicking thread /
236	static dtrace_ecb_t dtrace_ecb_create_cache; /* cached created ECB /
237	static dtrace_genid_t dtrace_probegen; / current probe generation /
238	static dtrace_helpers_t dtrace_deferred_pid; /* deferred helper list /
239	static dtrace_enabling_t dtrace_retained; /* list of retained enablings /
240	static dtrace_genid_t dtrace_retained_gen; / current retained enab gen /
241	static dtrace_dynvar_t dtrace_dynhash_sink; / end of dynamic hash chains /
242
static int		dtrace_dof_mode;	/* See dtrace_impl.h for a description of Darwin's dof modes. */

/*
 * This doesn't quite fit as an internal variable, as it must be accessed in
 * fbt_provide and sdt_provide. It's clearly not a dtrace tunable variable
 * either...
 */
int			dtrace_kernel_symbol_mode;	/* See dtrace_impl.h for a description of Darwin's kernel symbol modes. */
static uint32_t		dtrace_wake_clients;
static uint8_t		dtrace_kerneluuid[16];	/* the 128-bit uuid */
252
253	/*
254	* To save memory, some common memory allocations are given a
255	* unique zone. For example, dtrace_probe_t is 72 bytes in size,
256	* which means it would fall into the kalloc.128 bucket. With
257	* 20k elements allocated, the space saved is substantial.
258	*/
259
260	static ZONE_DEFINE_TYPE(dtrace_probe_t_zone, "dtrace.dtrace_probe_t",
261	dtrace_probe_t, ZC_PGZ_USE_GUARDS);
262
263	static ZONE_DEFINE(dtrace_state_pcpu_zone, "dtrace.dtrace_dstate_percpu_t",
264	sizeof(dtrace_dstate_percpu_t), ZC_PERCPU);
265
266	static int dtrace_module_unloaded(struct kmod_info *kmod);
267
268	/*
269	* DTrace Locking
270	* DTrace is protected by three (relatively coarse-grained) locks:
271	*
272	* (1) dtrace_lock is required to manipulate essentially any DTrace state,
273	* including enabling state, probes, ECBs, consumer state, helper state,
274	* etc. Importantly, dtrace_lock is _not_ required when in probe context;
275	* probe context is lock-free -- synchronization is handled via the
276	* dtrace_sync() cross call mechanism.
277	*
278	* (2) dtrace_provider_lock is required when manipulating provider state, or
279	* when provider state must be held constant.
280	*
281	* (3) dtrace_meta_lock is required when manipulating meta provider state, or
282	* when meta provider state must be held constant.
283	*
284	* The lock ordering between these three locks is dtrace_meta_lock before
285	* dtrace_provider_lock before dtrace_lock. (In particular, there are
286	* several places where dtrace_provider_lock is held by the framework as it
287	* calls into the providers -- which then call back into the framework,
288	* grabbing dtrace_lock.)
289	*
290	* There are two other locks in the mix: mod_lock and cpu_lock. With respect
291	* to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
292	* role as a coarse-grained lock; it is acquired before both of these locks.
293	* With respect to dtrace_meta_lock, its behavior is stranger: cpu_lock must
294	* be acquired _between_ dtrace_meta_lock and any other DTrace locks.
295	* mod_lock is similar with respect to dtrace_provider_lock in that it must be
296	* acquired _between_ dtrace_provider_lock and dtrace_lock.
297	*/
298
299
300	/*
301	* APPLE NOTE:
302	*
303	* For porting purposes, all kmutex_t vars have been changed
304	* to lck_mtx_t, which require explicit initialization.
305	*
306	* kmutex_t becomes lck_mtx_t
307	* mutex_enter() becomes lck_mtx_lock()
308	* mutex_exit() becomes lck_mtx_unlock()
309	*
310	* Lock asserts are changed like this:
311	*
312	* ASSERT(MUTEX_HELD(&cpu_lock));
313	* becomes:
314	* LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
315	*
316	*/
317	static LCK_MTX_DECLARE_ATTR(dtrace_lock,
318	&dtrace_lck_grp, &dtrace_lck_attr); / probe state lock /
319	static LCK_MTX_DECLARE_ATTR(dtrace_provider_lock,
320	&dtrace_lck_grp, &dtrace_lck_attr); / provider state lock /
321	static LCK_MTX_DECLARE_ATTR(dtrace_meta_lock,
322	&dtrace_lck_grp, &dtrace_lck_attr); / meta-provider state lock /
323	static LCK_RW_DECLARE_ATTR(dtrace_dof_mode_lock,
324	&dtrace_lck_grp, &dtrace_lck_attr); / dof mode lock /
325
326	/*
327	* DTrace Provider Variables
328	*
329	* These are the variables relating to DTrace as a provider (that is, the
330	* provider of the BEGIN, END, and ERROR probes).
331	*/
332	static dtrace_pattr_t dtrace_provider_attr = {
333	{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
334	{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
335	{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
336	{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
337	{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
338	};
339
340	static void
341	dtrace_provide_nullop(void arg, const* dtrace_probedesc_t *desc)
342	{
343	#pragma unused(arg, desc)
344	}
345
/*
 * No-op dtps_provide_module entry point for dtrace_provider_ops; both
 * arguments are ignored.
 */
static void
dtrace_provide_module_nullop(void *arg, struct modctl *ctl)
{
#pragma unused(arg, ctl)
}
351
352	static int
353	dtrace_enable_nullop(void arg, dtrace_id_t id, void* *parg)
354	{
355	#pragma unused(arg, id, parg)
356	return (`0`);
357	}
358
359	static void
360	dtrace_disable_nullop(void arg, dtrace_id_t id, void* *parg)
361	{
362	#pragma unused(arg, id, parg)
363	}
364
365	static void
366	dtrace_suspend_nullop(void arg, dtrace_id_t id, void* *parg)
367	{
368	#pragma unused(arg, id, parg)
369	}
370
371	static void
372	dtrace_resume_nullop(void arg, dtrace_id_t id, void* *parg)
373	{
374	#pragma unused(arg, id, parg)
375	}
376
377	static void
378	dtrace_destroy_nullop(void arg, dtrace_id_t id, void* *parg)
379	{
380	#pragma unused(arg, id, parg)
381	}
382
383
384	static dtrace_pops_t dtrace_provider_ops = {
385	.dtps_provide = dtrace_provide_nullop,
386	.dtps_provide_module = dtrace_provide_module_nullop,
387	.dtps_enable = dtrace_enable_nullop,
388	.dtps_disable = dtrace_disable_nullop,
389	.dtps_suspend = dtrace_suspend_nullop,
390	.dtps_resume = dtrace_resume_nullop,
391	.dtps_getargdesc = NULL,
392	.dtps_getargval = NULL,
393	.dtps_usermode = NULL,
394	.dtps_destroy = dtrace_destroy_nullop,
395	};
396
397	static dtrace_id_t dtrace_probeid_begin; / special BEGIN probe /
398	static dtrace_id_t dtrace_probeid_end; / special END probe /
399	dtrace_id_t dtrace_probeid_error; / special ERROR probe /
400
401	/*
402	* DTrace Helper Tracing Variables
403	*/
uint32_t dtrace_helptrace_next = 0;
uint32_t dtrace_helptrace_nlocals;
char	*dtrace_helptrace_buffer;
size_t	dtrace_helptrace_bufsize = 512 * 1024;

/* Helper tracing defaults to on only when DEBUG is set. */
#if DEBUG
int	dtrace_helptrace_enabled = 1;
#else
int	dtrace_helptrace_enabled = 0;
#endif
414
#if defined (__arm64__)
/*
 * The ioctl for adding helper DOF is based on the
 * size of a user_addr_t. We need to recognize both
 * U32 and U64 as the same action.
 */
#define DTRACEHIOC_ADDDOF_U32	_IOW('h', 4, user32_addr_t)
#define DTRACEHIOC_ADDDOF_U64	_IOW('h', 4, user64_addr_t)
#endif /* __arm64__ */
424
/*
 * DTrace Error Hashing
 *
 * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
 * table. This is very useful for checking coverage of tests that are
 * expected to induce DIF or DOF processing errors, and may be useful for
 * debugging problems in the DIF code generator or in DOF generation. The
 * error hash may be examined with the ::dtrace_errhash MDB dcmd.
 */
#if DEBUG
static dtrace_errhash_t	dtrace_errhash[DTRACE_ERRHASHSZ];
static const char *dtrace_errlast;
static kthread_t *dtrace_errthread;
static LCK_MTX_DECLARE_ATTR(dtrace_errlock, &dtrace_lck_grp, &dtrace_lck_attr);
#endif
440
441	/*
442	* DTrace Macros and Constants
443	*
444	* These are various macros that are useful in various spots in the
445	* implementation, along with a few random constants that have no meaning
446	* outside of the implementation. There is no real structure to this cpp
447	* mishmash -- but is there ever?
448	*/
449
/* Fetch the key string of `elm` through the hash's dth_getstr callback. */
#define	DTRACE_GETSTR(hash, elm)	\
	((hash)->dth_getstr((elm), (hash)->dth_stroffs))

/* Hash value of the key string of `elm`. */
#define	DTRACE_HASHSTR(hash, elm)	\
	dtrace_hash_str(DTRACE_GETSTR(hash, elm))

/* Address of the "next" link stored dth_nextoffs bytes into `elm`. */
#define	DTRACE_HASHNEXT(hash, elm)	\
	((void **)((uintptr_t)(elm) + (hash)->dth_nextoffs))

/* Address of the "prev" link stored dth_prevoffs bytes into `elm`. */
#define	DTRACE_HASHPREV(hash, elm)	\
	((void **)((uintptr_t)(elm) + (hash)->dth_prevoffs))

/* True when the key strings of `lhs` and `rhs` compare equal. */
#define	DTRACE_HASHEQ(hash, lhs, rhs)	\
	(strcmp(DTRACE_GETSTR(hash, lhs), \
	    DTRACE_GETSTR(hash, rhs)) == 0)

#define	DTRACE_AGGHASHSIZE_SLEW		17

#define	DTRACE_V4MAPPED_OFFSET		(sizeof (uint32_t) * 3)
469
470	/*
471	* The key for a thread-local variable needs to be unique to a single
472	* thread over the lifetime of the system, and not overlap with any variable
473	* IDs. So we take thread's thread_id, a unique 64-bit number that is never
474	* reused after the thread exits, and add DIF_VARIABLE_MAX to it, which
475	* guarantees that it won’t overlap any variable IDs. We also want to treat
476	* running in interrupt context as independent of thread-context. So if
477	* interrupts are active, we set the 63rd bit, otherwise it’s cleared.
478	*
479	* This is necessary (but not sufficient) to assure that global associative
480	* arrays never collide with thread-local variables. To guarantee that they
481	* cannot collide, we must also define the order for keying dynamic variables.
482	*
483	* That order is:
484	*
485	* [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
486	*
487	* Because the variable-key and the tls-key are in orthogonal spaces, there is
488	* no way for a global variable key signature to match a thread-local key
489	* signature.
490	*/
#if defined (__x86_64__) || defined(__arm64__)
/*
 * Store the thread-local key for the current thread into `where`:
 * thread_tid() plus DIF_VARIABLE_MAX, with bit 63 replaced by the value of
 * ml_at_interrupt_context().
 */
#define	DTRACE_TLS_THRKEY(where) { \
	uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
	uint64_t thr = thread_tid(current_thread()); \
	ASSERT(intr < 2); \
	(where) = ((thr + DIF_VARIABLE_MAX) & (~((uint64_t)1 << 63))) | \
	    ((uint64_t)intr << 63); \
}
#else
#error Unknown architecture
#endif
502
/*
 * Byte-swap helpers built up from the 8-bit case. Each macro evaluates its
 * argument more than once, so arguments must be free of side effects.
 */
#define	DT_BSWAP_8(x)	((x) & 0xff)
#define	DT_BSWAP_16(x)	((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
#define	DT_BSWAP_32(x)	((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
#define	DT_BSWAP_64(x)	((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))

/* Mask selecting the low 32 bits of a 64-bit value. */
#define	DT_MASK_LO	0x00000000FFFFFFFFULL
509
/*
 * Store `what`, converted to `type`, at byte offset `offset` from `tomax`.
 * The expansion already ends in a semicolon.
 */
#define	DTRACE_STORE(type, tomax, offset, what) \
	*((type *)((uintptr_t)(tomax) + (uintptr_t)(offset))) = (type)(what);
512
513
/*
 * If `addr` is not aligned to MIN(size, 4) bytes, set CPU_DTRACE_BADALIGN in
 * *flags, record the address as this CPU's illegal value, and return 0 from
 * the enclosing function.
 */
#define	DTRACE_ALIGNCHECK(addr, size, flags) \
	if (addr & (MIN(size,4) - 1)) { \
		*flags |= CPU_DTRACE_BADALIGN; \
		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr; \
		return (0); \
	}
520
/*
 * When `remp` is non-NULL, store through it the number of bytes between
 * `addr` and the end of the region [baseaddr, baseaddr + basesz).
 */
#define	DTRACE_RANGE_REMAIN(remp, addr, baseaddr, basesz) \
do { \
	if ((remp) != NULL) { \
		*(remp) = (uintptr_t)(baseaddr) + (basesz) - (addr); \
	} \
} while (0)
527
528
529	/*
530	* Test whether a range of memory starting at testaddr of size testsz falls
531	* within the range of memory described by addr, sz. We take care to avoid
532	* problems with overflow and underflow of the unsigned quantities, and
533	* disallow all negative sizes. Ranges of size 0 are allowed.
534	*/
/*
 * The three clauses check, in order: the start lies inside the base region,
 * the end does not pass the end of the base region, and testaddr + testsz
 * did not wrap around.
 */
#define	DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
	((testaddr) - (baseaddr) < (basesz) && \
	(testaddr) + (testsz) - (baseaddr) <= (basesz) && \
	(testaddr) + (testsz) >= (testaddr))
539
540	/*
541	* Test whether alloc_sz bytes will fit in the scratch region. We isolate
542	* alloc_sz on the righthand side of the comparison in order to avoid overflow
543	* or underflow in the comparison with it. This is simpler than the INRANGE
544	* check above, because we know that the dtms_scratch_ptr is valid in the
545	* range. Allocations of size zero are allowed.
546	*/
/* True when at least alloc_sz bytes remain between dtms_scratch_ptr and the end of scratch. */
#define	DTRACE_INSCRATCH(mstate, alloc_sz) \
	((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
	(mstate)->dtms_scratch_ptr >= (alloc_sz))
550
#if defined (__x86_64__) || defined (__arm64__)
/*
 * Generates dtrace_load<bits>(): a load of `bits` bits from `addr`.
 * The generated function returns 0 and sets a CPU_DTRACE_* error flag when
 * the address is misaligned, overlaps a range in dtrace_toxrange[], is not
 * backed by a valid page, or when dtrace_nofault_copy<bits>() fails.
 * CPU_DTRACE_NOFAULT is set around the copy and cleared only on the
 * success path.
 */
#define	DTRACE_LOADFUNC(bits)						\
/*CSTYLED*/								\
uint##bits##_t dtrace_load##bits(uintptr_t addr);			\
									\
extern int dtrace_nofault_copy##bits(uintptr_t, uint##bits##_t *);	\
									\
uint##bits##_t								\
dtrace_load##bits(uintptr_t addr)					\
{									\
	size_t size = bits / NBBY;					\
	/*CSTYLED*/							\
	uint##bits##_t rval = 0;					\
	int i;								\
	volatile uint16_t *flags = (volatile uint16_t *)		\
	    &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;			\
									\
	DTRACE_ALIGNCHECK(addr, size, flags);				\
									\
	for (i = 0; i < dtrace_toxranges; i++) {			\
		if (addr >= dtrace_toxrange[i].dtt_limit)		\
			continue;					\
									\
		if (addr + size <= dtrace_toxrange[i].dtt_base)		\
			continue;					\
									\
		/*							\
		 * This address falls within a toxic region; return 0.	\
		 */							\
		*flags |= CPU_DTRACE_BADADDR;				\
		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
		return (0);						\
	}								\
									\
	{								\
	*flags |= CPU_DTRACE_NOFAULT;					\
	/*CSTYLED*/							\
	/*								\
	 * PR6394061 - avoid device memory that is unpredictably	\
	 * mapped and unmapped						\
	 */								\
	if (!pmap_valid_page(pmap_find_phys(kernel_pmap, addr)) ||	\
	    dtrace_nofault_copy##bits(addr, &rval)) {			\
		*flags |= CPU_DTRACE_BADADDR;				\
		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
		return (0);						\
	}								\
									\
	*flags &= ~CPU_DTRACE_NOFAULT;					\
	}								\
									\
	return (rval);							\
}
#else /* all other architectures */
#error Unknown Architecture
#endif
607
/* Pointer-sized load: selects the 64-bit loader when __LP64__ is defined, the 32-bit one otherwise. */
#ifdef __LP64__
#define	dtrace_loadptr	dtrace_load64
#else
#define	dtrace_loadptr	dtrace_load32
#endif
613
#define	DTRACE_DYNHASH_FREE	0
#define	DTRACE_DYNHASH_SINK	1
#define	DTRACE_DYNHASH_VALID	2

/* Parenthesized so the negative value survives use inside larger expressions. */
#define	DTRACE_MATCH_FAIL	(-1)
#define	DTRACE_MATCH_NEXT	0
#define	DTRACE_MATCH_DONE	1
/* True when the probe's function name (dtpr_func) is a non-empty string. */
#define	DTRACE_ANCHORED(probe)	((probe)->dtpr_func[0] != '\0')
#define	DTRACE_STATE_ALIGN	64
623
/*
 * Map a CPU_DTRACE_* flag word to a DTRACEFLT_* fault code. The first
 * matching flag in the order listed wins; DTRACEFLT_UNKNOWN is the result
 * when none of the listed flags is set.
 */
#define	DTRACE_FLAGS2FLT(flags)						\
	(((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR :		\
	((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP :		\
	((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO :		\
	((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV :		\
	((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV :		\
	((flags) & CPU_DTRACE_TUPOFLOW) ?  DTRACEFLT_TUPOFLOW :		\
	((flags) & CPU_DTRACE_BADALIGN) ?  DTRACEFLT_BADALIGN :		\
	((flags) & CPU_DTRACE_NOSCRATCH) ?  DTRACEFLT_NOSCRATCH :	\
	((flags) & CPU_DTRACE_BADSTACK) ?  DTRACEFLT_BADSTACK :		\
	DTRACEFLT_UNKNOWN)
635
/* True when the action is a DIF expression whose return type is a string. */
#define	DTRACEACT_ISSTRING(act)						\
	((act)->dta_kind == DTRACEACT_DIFEXPR &&			\
	(act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
639
640
641	static size_t dtrace_strlen(const char *, size_t);
642	static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
643	static void dtrace_enabling_provide(dtrace_provider_t *);
644	static int dtrace_enabling_match(dtrace_enabling_t , int* , dtrace_match_cond_t cond);
645	static void dtrace_enabling_matchall_with_cond(dtrace_match_cond_t *cond);
646	static void dtrace_enabling_matchall(void);
647	static dtrace_state_t dtrace_anon_grab(void*);
648	static uint64_t dtrace_helper(int, dtrace_mstate_t *,
649	dtrace_state_t *, uint64_t, uint64_t);
650	static dtrace_helpers_t dtrace_helpers_create(proc_t );
651	static void dtrace_buffer_drop(dtrace_buffer_t *);
652	static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
653	dtrace_state_t , dtrace_mstate_t );
654	static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
655	dtrace_optval_t);
656	static int dtrace_ecb_create_enable(dtrace_probe_t , void* , void* *);
657	static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
658	static int dtrace_canload_remains(uint64_t, size_t, size_t *,
659	dtrace_mstate_t , dtrace_vstate_t );
660	static int dtrace_canstore_remains(uint64_t, size_t, size_t *,
661	dtrace_mstate_t , dtrace_vstate_t );
662
663
/*
 * DTrace sysctl handlers
 *
 * These declarations and functions are used for a deeper DTrace configuration.
 * Most of them do not apply on a per-consumer basis and may impact the other
 * DTrace consumers. Correctness may not be supported for all the variables,
 * so you should be careful about what values you are using.
 */
672
673	SYSCTL_DECL(_kern_dtrace);
674	SYSCTL_NODE(_kern, OID_AUTO, dtrace, CTLFLAG_RW \| CTLFLAG_LOCKED, `0`, "dtrace");
675
676	static int
677	sysctl_dtrace_err_verbose SYSCTL_HANDLER_ARGS
678	{
679	#pragma unused(oidp, arg2)
680	int changed, error;
681	int value = (int* *) arg1;
682
683	error = sysctl_io_number(req, bigValue: value, valueSize: sizeof(value), pValue: &value, changed: &changed);
684	if (error \|\| !changed)
685	return (error);
686
687	if (value != `0` && value != `1`)
688	return (ERANGE);
689
690	lck_mtx_lock(lck: &dtrace_lock);
691	dtrace_err_verbose = value;
692	lck_mtx_unlock(lck: &dtrace_lock);
693
694	return (`0`);
695	}
696
697	/*
698	* kern.dtrace.err_verbose
699	*
700	* Set DTrace verbosity when an error occured (0 = disabled, 1 = enabld).
701	* Errors are reported when a DIFO or a DOF has been rejected by the kernel.
702	*/
703	SYSCTL_PROC(_kern_dtrace, OID_AUTO, err_verbose,
704	CTLTYPE_INT \| CTLFLAG_RW \| CTLFLAG_LOCKED,
705	&dtrace_err_verbose, `0`,
706	sysctl_dtrace_err_verbose, "I", "dtrace error verbose");
707
708	static int
709	sysctl_dtrace_buffer_memory_maxsize SYSCTL_HANDLER_ARGS
710	{
711	#pragma unused(oidp, arg2, req)
712	int changed, error;
713	uint64_t value = (uint64_t ) arg1;
714
715	error = sysctl_io_number(req, bigValue: value, valueSize: sizeof(value), pValue: &value, changed: &changed);
716	if (error \|\| !changed)
717	return (error);
718
719	if (value <= dtrace_buffer_memory_inuse)
720	return (ERANGE);
721
722	lck_mtx_lock(lck: &dtrace_lock);
723	dtrace_buffer_memory_maxsize = value;
724	lck_mtx_unlock(lck: &dtrace_lock);
725
726	return (`0`);
727	}
728
729	/*
730	* kern.dtrace.buffer_memory_maxsize
731	*
732	* Set DTrace maximal size in bytes used by all the consumers' state buffers. By default
733	* the limit is PHYS_MEM / 3 for all consumers. Attempting to set a null, a negative value
734	* or a value <= to dtrace_buffer_memory_inuse will result in a failure.
735	*/
736	SYSCTL_PROC(_kern_dtrace, OID_AUTO, buffer_memory_maxsize,
737	CTLTYPE_QUAD \| CTLFLAG_RW \| CTLFLAG_LOCKED,
738	&dtrace_buffer_memory_maxsize, `0`,
739	sysctl_dtrace_buffer_memory_maxsize, "Q", "dtrace state buffer memory maxsize");
740
741	/*
742	* kern.dtrace.buffer_memory_inuse
743	*
744	* Current state buffer memory used, in bytes, by all the DTrace consumers.
745	* This value is read-only.
746	*/
747	SYSCTL_QUAD(_kern_dtrace, OID_AUTO, buffer_memory_inuse, CTLFLAG_RD \| CTLFLAG_LOCKED,
748	&dtrace_buffer_memory_inuse, "dtrace state buffer memory in-use");
749
750	static int
751	sysctl_dtrace_difo_maxsize SYSCTL_HANDLER_ARGS
752	{
753	#pragma unused(oidp, arg2, req)
754	int changed, error;
755	size_t value = (size_t) arg1;
756
757	error = sysctl_io_number(req, bigValue: value, valueSize: sizeof(value), pValue: &value, changed: &changed);
758	if (error \|\| !changed)
759	return (error);
760
761	if (value <= `0`)
762	return (ERANGE);
763
764	lck_mtx_lock(lck: &dtrace_lock);
765	dtrace_difo_maxsize = value;
766	lck_mtx_unlock(lck: &dtrace_lock);
767
768	return (`0`);
769	}
770
771	/*
772	* kern.dtrace.difo_maxsize
773	*
774	* Set the DIFO max size in bytes, check the definition of dtrace_difo_maxsize
775	* to get the default value. Attempting to set a null or negative size will
776	* result in a failure.
777	*/
778	SYSCTL_PROC(_kern_dtrace, OID_AUTO, difo_maxsize,
779	CTLTYPE_QUAD \| CTLFLAG_RW \| CTLFLAG_LOCKED,
780	&dtrace_difo_maxsize, `0`,
781	sysctl_dtrace_difo_maxsize, "Q", "dtrace difo maxsize");
782
783	static int
784	sysctl_dtrace_dof_maxsize SYSCTL_HANDLER_ARGS
785	{
786	#pragma unused(oidp, arg2, req)
787	int changed, error;
788	dtrace_optval_t value = (dtrace_optval_t ) arg1;
789
790	error = sysctl_io_number(req, bigValue: value, valueSize: sizeof(value), pValue: &value, changed: &changed);
791	if (error \|\| !changed)
792	return (error);
793
794	if (value <= `0`)
795	return (ERANGE);
796
797	if (value >= dtrace_copy_maxsize())
798	return (ERANGE);
799
800	lck_mtx_lock(lck: &dtrace_lock);
801	dtrace_dof_maxsize = value;
802	lck_mtx_unlock(lck: &dtrace_lock);
803
804	return (`0`);
805	}
806
807	/*
808	* kern.dtrace.dof_maxsize
809	*
810	* Set the DOF max size in bytes, check the definition of dtrace_dof_maxsize to
811	* get the default value. Attempting to set a null or negative size will result
812	* in a failure.
813	*/
814	SYSCTL_PROC(_kern_dtrace, OID_AUTO, dof_maxsize,
815	CTLTYPE_QUAD \| CTLFLAG_RW \| CTLFLAG_LOCKED,
816	&dtrace_dof_maxsize, `0`,
817	sysctl_dtrace_dof_maxsize, "Q", "dtrace dof maxsize");
818
819	static int
820	sysctl_dtrace_statvar_maxsize SYSCTL_HANDLER_ARGS
821	{
822	#pragma unused(oidp, arg2, req)
823	int changed, error;
824	dtrace_optval_t value = (dtrace_optval_t) arg1;
825
826	error = sysctl_io_number(req, bigValue: value, valueSize: sizeof(value), pValue: &value, changed: &changed);
827	if (error \|\| !changed)
828	return (error);
829
830	if (value <= `0`)
831	return (ERANGE);
832	if (value > dtrace_statvar_maxsize_max)
833	return (ERANGE);
834
835	lck_mtx_lock(lck: &dtrace_lock);
836	dtrace_statvar_maxsize = value;
837	lck_mtx_unlock(lck: &dtrace_lock);
838
839	return (`0`);
840	}
841
842	/*
843	* kern.dtrace.global_maxsize
844	*
845	* Set the variable max size in bytes, check the definition of
846	* dtrace_statvar_maxsize to get the default value. Attempting to set a null,
847	* too high or negative size will result in a failure.
848	*/
849	SYSCTL_PROC(_kern_dtrace, OID_AUTO, global_maxsize,
850	CTLTYPE_QUAD \| CTLFLAG_RW \| CTLFLAG_LOCKED,
851	&dtrace_statvar_maxsize, `0`,
852	sysctl_dtrace_statvar_maxsize, "Q", "dtrace statvar maxsize");
853
854
855	/*
856	* kern.dtrace.provide_private_probes
857	*
858	* Set whether the providers must provide the private probes. This is
859	* kept as compatibility as they are always provided.
860	*/
861	SYSCTL_INT(_kern_dtrace, OID_AUTO, provide_private_probes,
862	CTLFLAG_RD \| CTLFLAG_LOCKED,
863	(int *)NULL, `1`, "provider must provide the private probes");
864
865	/*
866	* kern.dtrace.dof_mode
867	*
868	* Returns the current DOF mode.
869	* This value is read-only.
870	*/
871	SYSCTL_INT(_kern_dtrace, OID_AUTO, dof_mode, CTLFLAG_RD \| CTLFLAG_LOCKED,
872	&dtrace_dof_mode, `0`, "dtrace dof mode");
873
/*
 * DTrace Probe Context Functions
 *
 * These functions are called from probe context.  Because probe context is
 * any context in which C may be called, arbitrary locks may be held,
 * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
 * As a result, functions called from probe context may only call other DTrace
 * support functions -- they may not interact at all with the system at large.
 * (Note that the ASSERT macro is made probe-context safe by redefining it in
 * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
 * loads are to be performed from probe context, they _must_ be in terms of
 * the safe dtrace_load*() variants.
 *
 * Some functions in this block are not actually called from probe context;
 * for these functions, there will be a comment above the function reading
 * "Note: not called from probe context."
 */
891
/*
 * Report a failed assertion by panicking.
 *
 * a: text of the assertion that failed
 * f: name of the source file containing the assertion
 * l: line number of the assertion
 *
 * The return statement exists only so that the function has a value the
 * compiler cannot fold away.
 */
int
dtrace_assfail(const char *a, const char *f, int l)
{
	panic("dtrace: assertion failed: %s, file: %s, line: %d", a, f, l);

	/*
	 * We just need something here that even the most clever compiler
	 * cannot optimize away.
	 */
	return (a[(uintptr_t)f]);
}
903
/*
 * Atomically increment a specified error counter from probe context.
 *
 * counter: the 32-bit counter to bump. The update is retried with
 * dtrace_cas32() until it succeeds; a counter that would wrap to zero is
 * set to 1 instead.
 */
static void
dtrace_error(uint32_t *counter)
{
	/*
	 * Most counters stored to in probe context are per-CPU counters.
	 * However, there are some error conditions that are sufficiently
	 * arcane that they don't merit per-CPU storage.  If these counters
	 * are incremented concurrently on different CPUs, scalability will be
	 * adversely affected -- but we don't expect them to be white-hot in a
	 * correctly constructed enabling...
	 */
	uint32_t oval, nval;

	do {
		oval = *counter;

		if ((nval = oval + 1) == 0) {
			/*
			 * If the counter would wrap, set it to 1 -- assuring
			 * that the counter is never zero when we have seen
			 * errors.  (The counter must be 32-bits because we
			 * aren't guaranteed a 64-bit compare&swap operation.)
			 * To save this code both the infamy of being fingered
			 * by a priggish news story and the indignity of being
			 * the target of a neo-puritan witch trial, we're
			 * carefully avoiding any colorful description of the
			 * likelihood of this condition -- but suffice it to
			 * say that it is only slightly more likely than the
			 * overflow of predicate cache IDs, as discussed in
			 * dtrace_predicate_create().
			 */
			nval = 1;
		}
	} while (dtrace_cas32(counter, oval, nval) != oval);
}
942
943	/*
944	* Use the DTRACE_LOADFUNC macro to define functions for each of loading a
945	* uint8_t, a uint16_t, a uint32_t and a uint64_t.
946	*/
947	DTRACE_LOADFUNC(`8`)
948	DTRACE_LOADFUNC(`16`)
949	DTRACE_LOADFUNC(`32`)
950	DTRACE_LOADFUNC(`64`)
951
952	static int
953	dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
954	{
955	if (dest < mstate->dtms_scratch_base)
956	return (`0`);
957
958	if (dest + size < dest)
959	return (`0`);
960
961	if (dest + size > mstate->dtms_scratch_ptr)
962	return (`0`);
963
964	return (`1`);
965	}
966
967	static int
968	dtrace_canstore_statvar(uint64_t addr, size_t sz, size_t *remain,
969	dtrace_statvar_t *svars, int* nsvars)
970	{
971	int i;
972
973	size_t maxglobalsize, maxlocalsize;
974
975	maxglobalsize = dtrace_statvar_maxsize + sizeof (uint64_t);
976	maxlocalsize = (maxglobalsize) * NCPU;
977
978	if (nsvars == `0`)
979	return (`0`);
980
981	for (i = `0`; i < nsvars; i++) {
982	dtrace_statvar_t *svar = svars[i];
983	uint8_t scope;
984	size_t size;
985
986	if (svar == NULL \|\| (size = svar->dtsv_size) == `0`)
987	continue;
988
989	scope = svar->dtsv_var.dtdv_scope;
990
991	/**
992	* We verify that our size is valid in the spirit of providing
993	* defense in depth: we want to prevent attackers from using
994	* DTrace to escalate an orthogonal kernel heap corruption bug
995	* into the ability to store to arbitrary locations in memory.
996	*/
997	VERIFY((scope == DIFV_SCOPE_GLOBAL && size <= maxglobalsize) \|\|
998	(scope == DIFV_SCOPE_LOCAL && size <= maxlocalsize));
999
1000	if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size)) {
1001	DTRACE_RANGE_REMAIN(remain, addr, svar->dtsv_data,
1002	svar->dtsv_size);
1003	return (`1`);
1004	}
1005	}
1006
1007	return (`0`);
1008	}
1009
1010	/*
1011	* Check to see if the address is within a memory region to which a store may
1012	* be issued. This includes the DTrace scratch areas, and any DTrace variable
1013	* region. The caller of dtrace_canstore() is responsible for performing any
1014	* alignment checks that are needed before stores are actually executed.
1015	*/
1016	static int
1017	dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
1018	dtrace_vstate_t *vstate)
1019	{
1020	return (dtrace_canstore_remains(addr, sz, NULL, mstate, vstate));
1021	}
1022	/*
1023	* Implementation of dtrace_canstore which communicates the upper bound of the
1024	* allowed memory region.
1025	*/
1026	static int
1027	dtrace_canstore_remains(uint64_t addr, size_t sz, size_t *remain,
1028	dtrace_mstate_t mstate, dtrace_vstate_t vstate)
1029	{
1030	/*
1031	* First, check to see if the address is in allocated scratch space...
1032	*/
1033	if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
1034	mstate->dtms_scratch_ptr - mstate->dtms_scratch_base)) {
1035	DTRACE_RANGE_REMAIN(remain, addr, mstate->dtms_scratch_base,
1036	mstate->dtms_scratch_ptr - mstate->dtms_scratch_base);
1037	return (`1`);
1038	}
1039	/*
1040	* Now check to see if it's a dynamic variable. This check will pick
1041	* up both thread-local variables and any global dynamically-allocated
1042	* variables.
1043	*/
1044	if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
1045	vstate->dtvs_dynvars.dtds_size)) {
1046	dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
1047	uintptr_t base = (uintptr_t)dstate->dtds_base +
1048	(dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
1049	uintptr_t chunkoffs;
1050	dtrace_dynvar_t *dvar;
1051
1052	/*
1053	* Before we assume that we can store here, we need to make
1054	* sure that it isn't in our metadata -- storing to our
1055	* dynamic variable metadata would corrupt our state. For
1056	* the range to not include any dynamic variable metadata,
1057	* it must:
1058	*
1059	* (1) Start above the hash table that is at the base of
1060	* the dynamic variable space
1061	*
1062	* (2) Have a starting chunk offset that is beyond the
1063	* dtrace_dynvar_t that is at the base of every chunk
1064	*
1065	* (3) Not span a chunk boundary
1066	*
1067	* (4) Not be in the tuple space of a dynamic variable
1068	*
1069	*/
1070	if (addr < base)
1071	return (`0`);
1072
1073	chunkoffs = (addr - base) % dstate->dtds_chunksize;
1074
1075	if (chunkoffs < sizeof (dtrace_dynvar_t))
1076	return (`0`);
1077
1078	if (chunkoffs + sz > dstate->dtds_chunksize)
1079	return (`0`);
1080
1081	dvar = (dtrace_dynvar_t *)((uintptr_t)addr - chunkoffs);
1082
1083	if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE)
1084	return (`0`);
1085
1086	if (chunkoffs < sizeof (dtrace_dynvar_t) +
1087	((dvar->dtdv_tuple.dtt_nkeys - `1`) * sizeof (dtrace_key_t)))
1088	return (`0`);
1089
1090	return (`1`);
1091	}
1092
1093	/*
1094	* Finally, check the static local and global variables. These checks
1095	* take the longest, so we perform them last.
1096	*/
1097	if (dtrace_canstore_statvar(addr, sz, remain,
1098	svars: vstate->dtvs_locals, nsvars: vstate->dtvs_nlocals))
1099	return (`1`);
1100
1101	if (dtrace_canstore_statvar(addr, sz, remain,
1102	svars: vstate->dtvs_globals, nsvars: vstate->dtvs_nglobals))
1103	return (`1`);
1104
1105	return (`0`);
1106	}
1107
1108
1109	/*
1110	* Convenience routine to check to see if the address is within a memory
1111	* region in which a load may be issued given the user's privilege level;
1112	* if not, it sets the appropriate error flags and loads 'addr' into the
1113	* illegal value slot.
1114	*
1115	* DTrace subroutines (DIF_SUBR_*) should use this helper to implement
1116	* appropriate memory access protection.
1117	*/
1118	int
1119	dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
1120	dtrace_vstate_t *vstate)
1121	{
1122	return (dtrace_canload_remains(addr, sz, NULL, mstate, vstate));
1123	}
1124
1125	/*
1126	* Implementation of dtrace_canload which communicates the upper bound of the
1127	* allowed memory region.
1128	*/
1129	static int
1130	dtrace_canload_remains(uint64_t addr, size_t sz, size_t *remain,
1131	dtrace_mstate_t mstate, dtrace_vstate_t vstate)
1132	{
1133	volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
1134
1135	/*
1136	* If we hold the privilege to read from kernel memory, then
1137	* everything is readable.
1138	*/
1139	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != `0`) {
1140	DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
1141	return (`1`);
1142	}
1143
1144	/*
1145	* You can obviously read that which you can store.
1146	*/
1147	if (dtrace_canstore_remains(addr, sz, remain, mstate, vstate))
1148	return (`1`);
1149
1150	/*
1151	* We're allowed to read from our own string table.
1152	*/
1153	if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
1154	mstate->dtms_difo->dtdo_strlen)) {
1155	DTRACE_RANGE_REMAIN(remain, addr,
1156	mstate->dtms_difo->dtdo_strtab,
1157	mstate->dtms_difo->dtdo_strlen);
1158	return (`1`);
1159	}
1160
1161	DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
1162	*illval = addr;
1163	return (`0`);
1164	}
1165
1166	/*
1167	* Convenience routine to check to see if a given string is within a memory
1168	* region in which a load may be issued given the user's privilege level;
1169	* this exists so that we don't need to issue unnecessary dtrace_strlen()
1170	* calls in the event that the user has all privileges.
1171	*/
1172	static int
1173	dtrace_strcanload(uint64_t addr, size_t sz, size_t *remain,
1174	dtrace_mstate_t mstate, dtrace_vstate_t vstate)
1175	{
1176	size_t rsize = `0`;
1177
1178	/*
1179	* If we hold the privilege to read from kernel memory, then
1180	* everything is readable.
1181	*/
1182	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != `0`) {
1183	DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
1184	return (`1`);
1185	}
1186
1187	/*
1188	* Even if the caller is uninterested in querying the remaining valid
1189	* range, it is required to ensure that the access is allowed.
1190	*/
1191	if (remain == NULL) {
1192	remain = &rsize;
1193	}
1194	if (dtrace_canload_remains(addr, sz: `0`, remain, mstate, vstate)) {
1195	size_t strsz;
1196	/*
1197	* Perform the strlen after determining the length of the
1198	* memory region which is accessible. This prevents timing
1199	* information from being used to find NULs in memory which is
1200	* not accessible to the caller.
1201	*/
1202	strsz = `1` + dtrace_strlen((char *)(uintptr_t)addr,
1203	MIN(sz, *remain));
1204	if (strsz <= *remain) {
1205	return (`1`);
1206	}
1207	}
1208
1209	return (`0`);
1210	}
1211
1212	/*
1213	* Convenience routine to check to see if a given variable is within a memory
1214	* region in which a load may be issued given the user's privilege level.
1215	*/
1216	static int
1217	dtrace_vcanload(void src, dtrace_diftype_t type, size_t *remain,
1218	dtrace_mstate_t mstate, dtrace_vstate_t vstate)
1219	{
1220	size_t sz;
1221	ASSERT(type->dtdt_flags & DIF_TF_BYREF);
1222
1223	/*
1224	* Calculate the max size before performing any checks since even
1225	* DTRACE_ACCESS_KERNEL-credentialed callers expect that this function
1226	* return the max length via 'remain'.
1227	*/
1228	if (type->dtdt_kind == DIF_TYPE_STRING) {
1229	dtrace_state_t *state = vstate->dtvs_state;
1230
1231	if (state != NULL) {
1232	sz = state->dts_options[DTRACEOPT_STRSIZE];
1233	} else {
1234	/*
1235	* In helper context, we have a NULL state; fall back
1236	* to using the system-wide default for the string size
1237	* in this case.
1238	*/
1239	sz = dtrace_strsize_default;
1240	}
1241	} else {
1242	sz = type->dtdt_size;
1243	}
1244
1245	/*
1246	* If we hold the privilege to read from kernel memory, then
1247	* everything is readable.
1248	*/
1249	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != `0`) {
1250	DTRACE_RANGE_REMAIN(remain, (uintptr_t)src, src, sz);
1251	return (`1`);
1252	}
1253
1254	if (type->dtdt_kind == DIF_TYPE_STRING) {
1255	return (dtrace_strcanload(addr: (uintptr_t)src, sz, remain, mstate,
1256	vstate));
1257	}
1258	return (dtrace_canload_remains(addr: (uintptr_t)src, sz, remain, mstate,
1259	vstate));
1260	}
1261
/*
 * Minimal, locale-free character classification helpers.  They are macros,
 * so each argument may be evaluated more than once: do not pass expressions
 * with side effects.
 */
#define	isdigit(ch)	((ch) >= '0' && (ch) <= '9')
#define	islower(ch)	((ch) >= 'a' && (ch) <= 'z')
#define	isspace(ch)	(((ch) == ' ') || ((ch) == '\r') || ((ch) == '\n') || \
			((ch) == '\t') || ((ch) == '\f'))
#define	isxdigit(ch)	(isdigit(ch) || ((ch) >= 'a' && (ch) <= 'f') || \
			((ch) >= 'A' && (ch) <= 'F'))
#define	lisalnum(x)	\
	(isdigit(x) || ((x) >= 'a' && (x) <= 'z') || ((x) >= 'A' && (x) <= 'Z'))

/*
 * Numeric value of an alphanumeric character: '0'-'9' map to 0-9, and
 * letters of either case map to 10 and up ('a'/'A' is 10).
 */
#define	DIGIT(x)	\
	(isdigit(x) ? (x) - '0' : islower(x) ? (x) + 10 - 'a' : (x) + 10 - 'A')
1273
1274	/*
1275	* Convert a string to a signed integer using safe loads.
1276	*/
1277	static int64_t
1278	dtrace_strtoll(char input, int* base, size_t limit)
1279	{
1280	uintptr_t pos = (uintptr_t)input;
1281	int64_t val = `0`;
1282	int x;
1283	boolean_t neg = B_FALSE;
1284	char c, cc, ccc;
1285	uintptr_t end = pos + limit;
1286
1287	/*
1288	* Consume any whitespace preceding digits.
1289	*/
1290	while ((c = dtrace_load8(addr: pos)) == `' '` \|\| c == `'\t'`)
1291	pos++;
1292
1293	/*
1294	* Handle an explicit sign if one is present.
1295	*/
1296	if (c == `'-'` \|\| c == `'+'`) {
1297	if (c == `'-'`)
1298	neg = B_TRUE;
1299	c = dtrace_load8(addr: ++pos);
1300	}
1301
1302	/*
1303	* Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it
1304	* if present.
1305	*/
1306	if (base == `16` && c == `'0'` && ((cc = dtrace_load8(addr: pos + `1`)) == `'x'` \|\|
1307	cc == `'X'`) && isxdigit(ccc = dtrace_load8(pos + `2`))) {
1308	pos += `2`;
1309	c = ccc;
1310	}
1311
1312	/*
1313	* Read in contiguous digits until the first non-digit character.
1314	*/
1315	for (; pos < end && c != `'\0'` && lisalnum(c) && (x = DIGIT(c)) < base;
1316	c = dtrace_load8(addr: ++pos))
1317	val = val * base + x;
1318
1319	return (neg ? -val : val);
1320	}
1321
1322
1323	/*
1324	* Compare two strings using safe loads.
1325	*/
1326	static int
1327	dtrace_strncmp(const char s1, const* char *s2, size_t limit)
1328	{
1329	uint8_t c1, c2;
1330	volatile uint16_t *flags;
1331
1332	if (s1 == s2 \|\| limit == `0`)
1333	return (`0`);
1334
1335	flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
1336
1337	do {
1338	if (s1 == NULL) {
1339	c1 = `'\0'`;
1340	} else {
1341	c1 = dtrace_load8(addr: (uintptr_t)s1++);
1342	}
1343
1344	if (s2 == NULL) {
1345	c2 = `'\0'`;
1346	} else {
1347	c2 = dtrace_load8(addr: (uintptr_t)s2++);
1348	}
1349
1350	if (c1 != c2)
1351	return (c1 - c2);
1352	} while (--limit && c1 != `'\0'` && !(*flags & CPU_DTRACE_FAULT));
1353
1354	return (`0`);
1355	}
1356
/*
 * Compute strlen(s) for a string using safe memory accesses.  The additional
 * lim parameter is used to specify a maximum length to ensure completion.
 *
 * Returns the number of bytes preceding the first NUL, or 'lim' if no NUL
 * is found within the first 'lim' bytes.
 */
static size_t
dtrace_strlen(const char *s, size_t lim)
{
	/*
	 * The counter has the same type as 'lim' so that the bound is
	 * honored even for limits that do not fit in an unsigned int.
	 */
	size_t len;

	for (len = 0; len != lim; len++) {
		if (dtrace_load8((uintptr_t)s++) == '\0')
			break;
	}

	return (len);
}
1373
1374	/*
1375	* Check if an address falls within a toxic region.
1376	*/
1377	static int
1378	dtrace_istoxic(uintptr_t kaddr, size_t size)
1379	{
1380	uintptr_t taddr, tsize;
1381	int i;
1382
1383	for (i = `0`; i < dtrace_toxranges; i++) {
1384	taddr = dtrace_toxrange[i].dtt_base;
1385	tsize = dtrace_toxrange[i].dtt_limit - taddr;
1386
1387	if (kaddr - taddr < tsize) {
1388	DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1389	cpu_core[CPU->cpu_id].cpuc_dtrace_illval = kaddr;
1390	return (`1`);
1391	}
1392
1393	if (taddr - kaddr < size) {
1394	DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1395	cpu_core[CPU->cpu_id].cpuc_dtrace_illval = taddr;
1396	return (`1`);
1397	}
1398	}
1399
1400	return (`0`);
1401	}
1402
/*
 * Copy src to dst using safe memory accesses.  The src is assumed to be unsafe
 * memory specified by the DIF program.  The dst is assumed to be safe memory
 * that we can store to directly because it is managed by DTrace.  As with
 * standard bcopy, overlapping copies are handled properly: the copy runs
 * forward when dst is at or below src, and backward otherwise.
 */
static void
dtrace_bcopy(const void *src, void *dst, size_t len)
{
	if (len != 0) {
		uint8_t *s1 = dst;
		const uint8_t *s2 = src;

		if (s1 <= s2) {
			do {
				*s1++ = dtrace_load8((uintptr_t)s2++);
			} while (--len != 0);
		} else {
			s2 += len;
			s1 += len;

			do {
				*--s1 = dtrace_load8((uintptr_t)--s2);
			} while (--len != 0);
		}
	}
}
1430
/*
 * Copy src to dst using safe memory accesses, up to either the specified
 * length, or the point that a nul byte is encountered.  The src is assumed to
 * be unsafe memory specified by the DIF program.  The dst is assumed to be
 * safe memory that we can store to directly because it is managed by DTrace.
 * Unlike dtrace_bcopy(), overlapping regions are not handled.
 *
 * The nul byte, when reached, is copied as well.  If 'len' bytes are copied
 * without reaching one, dst is left unterminated.
 */
static void
dtrace_strcpy(const void *src, void *dst, size_t len)
{
	if (len != 0) {
		uint8_t *s1 = dst, c;
		const uint8_t *s2 = src;

		do {
			*s1++ = c = dtrace_load8((uintptr_t)s2++);
		} while (--len != 0 && c != '\0');
	}
}
1450
1451	/*
1452	* Copy src to dst, deriving the size and type from the specified (BYREF)
1453	* variable type. The src is assumed to be unsafe memory specified by the DIF
1454	* program. The dst is assumed to be DTrace variable memory that is of the
1455	* specified type; we assume that we can store to directly.
1456	*/
1457	static void
1458	dtrace_vcopy(void src, void* dst, dtrace_diftype_t type, size_t limit)
1459	{
1460	ASSERT(type->dtdt_flags & DIF_TF_BYREF);
1461
1462	if (type->dtdt_kind == DIF_TYPE_STRING) {
1463	dtrace_strcpy(src, dst, MIN(type->dtdt_size, limit));
1464	} else {
1465	dtrace_bcopy(src, dst, MIN(type->dtdt_size, limit));
1466	}
1467	}
1468
1469	/*
1470	* Compare s1 to s2 using safe memory accesses. The s1 data is assumed to be
1471	* unsafe memory specified by the DIF program. The s2 data is assumed to be
1472	* safe memory that we can access directly because it is managed by DTrace.
1473	*/
1474	static int
1475	dtrace_bcmp(const void s1, const* void *s2, size_t len)
1476	{
1477	volatile uint16_t *flags;
1478
1479	flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
1480
1481	if (s1 == s2)
1482	return (`0`);
1483
1484	if (s1 == NULL \|\| s2 == NULL)
1485	return (`1`);
1486
1487	if (s1 != s2 && len != `0`) {
1488	const uint8_t *ps1 = s1;
1489	const uint8_t *ps2 = s2;
1490
1491	do {
1492	if (dtrace_load8(addr: (uintptr_t)ps1++) != *ps2++)
1493	return (`1`);
1494	} while (--len != `0` && !(*flags & CPU_DTRACE_FAULT));
1495	}
1496	return (`0`);
1497	}
1498
/*
 * Zero the specified region using a simple byte-by-byte loop.  Note that this
 * is for safe DTrace-managed memory only.
 *
 * dst: start of the region
 * len: number of bytes to clear; 0 is a no-op
 */
static void
dtrace_bzero(void *dst, size_t len)
{
	uint8_t *cp;

	for (cp = dst; len != 0; len--)
		*cp++ = 0;
}
1511
/*
 * Add two 128-bit values.  Each value is an array of two uint64_t words with
 * the low-order word at index 0 and the high-order word at index 1.
 *
 * The result is built in a temporary, so 'sum' may alias either addend.
 * A carry out of the high word is discarded.
 */
static void
dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
{
	uint64_t result[2];

	result[0] = addend1[0] + addend2[0];
	/* The low-word sum wrapped iff it is smaller than an operand. */
	result[1] = addend1[1] + addend2[1] +
	    (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);

	sum[0] = result[0];
	sum[1] = result[1];
}
1524
/*
 * Shift the 128-bit value in a by b.  If b is positive, shift left.
 * If b is negative, shift right.
 *
 * a: two uint64_t words, low-order word at index 0; modified in place
 * b: signed shift count; a magnitude of 128 or more yields zero
 *
 * Every native shift performed here has a count in [0, 63], so no shift is
 * undefined regardless of 'b'.
 */
static void
dtrace_shift_128(uint64_t *a, int b)
{
	unsigned int n;

	if (b == 0)
		return;

	if (b < 0) {
		/* Magnitude computed in unsigned arithmetic: safe for INT_MIN. */
		n = 0U - (unsigned int)b;

		if (n >= 128) {
			a[0] = 0;
			a[1] = 0;
		} else if (n >= 64) {
			a[0] = a[1] >> (n - 64);
			a[1] = 0;
		} else {
			/*
			 * The low n bits of the high word become the top n
			 * bits of the low word; the left shift discards the
			 * rest, so no mask is needed.
			 */
			a[0] = (a[0] >> n) | (a[1] << (64 - n));
			a[1] >>= n;
		}
	} else {
		n = (unsigned int)b;

		if (n >= 128) {
			a[0] = 0;
			a[1] = 0;
		} else if (n >= 64) {
			a[1] = a[0] << (n - 64);
			a[0] = 0;
		} else {
			a[1] = (a[1] << n) | (a[0] >> (64 - n));
			a[0] <<= n;
		}
	}
}
1561
1562	/*
1563	* The basic idea is to break the 2 64-bit values into 4 32-bit values,
1564	* use native multiplication on those, and then re-combine into the
1565	* resulting 128-bit value.
1566	*
1567	* (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
1568	* hi1 * hi2 << 64 +
1569	* hi1 * lo2 << 32 +
1570	* hi2 * lo1 << 32 +
1571	* lo1 * lo2
1572	*/
1573	static void
1574	dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
1575	{
1576	uint64_t hi1, hi2, lo1, lo2;
1577	uint64_t tmp[`2`];
1578
1579	hi1 = factor1 >> `32`;
1580	hi2 = factor2 >> `32`;
1581
1582	lo1 = factor1 & DT_MASK_LO;
1583	lo2 = factor2 & DT_MASK_LO;
1584
1585	product[`0`] = lo1 * lo2;
1586	product[`1`] = hi1 * hi2;
1587
1588	tmp[`0`] = hi1 * lo2;
1589	tmp[`1`] = `0`;
1590	dtrace_shift_128(a: tmp, b: `32`);
1591	dtrace_add_128(addend1: product, addend2: tmp, sum: product);
1592
1593	tmp[`0`] = hi2 * lo1;
1594	tmp[`1`] = `0`;
1595	dtrace_shift_128(a: tmp, b: `32`);
1596	dtrace_add_128(addend1: product, addend2: tmp, sum: product);
1597	}
1598
1599	/*
1600	* This privilege check should be used by actions and subroutines to
1601	* verify that the user credentials of the process that enabled the
1602	* invoking ECB match the target credentials
1603	*/
1604	static int
1605	dtrace_priv_proc_common_user(dtrace_state_t *state)
1606	{
1607	cred_t cr, s_cr = state->dts_cred.dcr_cred;
1608
1609	/*
1610	* We should always have a non-NULL state cred here, since if cred
1611	* is null (anonymous tracing), we fast-path bypass this routine.
1612	*/
1613	ASSERT(s_cr != NULL);
1614
1615	if ((cr = dtrace_CRED()) != NULL &&
1616	posix_cred_get(cred: s_cr)->cr_uid == posix_cred_get(cred: cr)->cr_uid &&
1617	posix_cred_get(cred: s_cr)->cr_uid == posix_cred_get(cred: cr)->cr_ruid &&
1618	posix_cred_get(cred: s_cr)->cr_uid == posix_cred_get(cred: cr)->cr_suid &&
1619	posix_cred_get(cred: s_cr)->cr_gid == posix_cred_get(cred: cr)->cr_gid &&
1620	posix_cred_get(cred: s_cr)->cr_gid == posix_cred_get(cred: cr)->cr_rgid &&
1621	posix_cred_get(cred: s_cr)->cr_gid == posix_cred_get(cred: cr)->cr_sgid)
1622	return (`1`);
1623
1624	return (`0`);
1625	}
1626
1627	/*
1628	* This privilege check should be used by actions and subroutines to
1629	* verify that the zone of the process that enabled the invoking ECB
1630	* matches the target credentials
1631	*/
1632	static int
1633	dtrace_priv_proc_common_zone(dtrace_state_t *state)
1634	{
1635	cred_t cr, s_cr = state->dts_cred.dcr_cred;
1636	#pragma unused(cr, s_cr, state) /* __APPLE__ */
1637
1638	/*
1639	* We should always have a non-NULL state cred here, since if cred
1640	* is null (anonymous tracing), we fast-path bypass this routine.
1641	*/
1642	ASSERT(s_cr != NULL);
1643
1644	return `1`; / APPLE NOTE: Darwin doesn't do zones. /
1645	}
1646
/*
 * This privilege check should be used by actions and subroutines to
 * verify that the process has not setuid or changed credentials.
 *
 * On Darwin this check always succeeds (returns 1).
 */
static int
dtrace_priv_proc_common_nocd(void)
{
	return 1; /* Darwin omits "No Core Dump" flag. */
}
1656
1657	static int
1658	dtrace_priv_proc_destructive(dtrace_state_t *state)
1659	{
1660	int action = state->dts_cred.dcr_action;
1661
1662	if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1663	goto bad;
1664
1665	if (dtrace_is_restricted() && !dtrace_can_attach_to_proc(current_proc()))
1666	goto bad;
1667
1668	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == `0`) &&
1669	dtrace_priv_proc_common_zone(state) == `0`)
1670	goto bad;
1671
1672	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == `0`) &&
1673	dtrace_priv_proc_common_user(state) == `0`)
1674	goto bad;
1675
1676	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == `0`) &&
1677	dtrace_priv_proc_common_nocd() == `0`)
1678	goto bad;
1679
1680	return (`1`);
1681
1682	bad:
1683	cpu_core[CPU->cpu_id].cpuc_dtrace_flags \|= CPU_DTRACE_UPRIV;
1684
1685	return (`0`);
1686	}
1687
1688	static int
1689	dtrace_priv_proc_control(dtrace_state_t *state)
1690	{
1691	if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1692	goto bad;
1693
1694	if (dtrace_is_restricted() && !dtrace_can_attach_to_proc(current_proc()))
1695	goto bad;
1696
1697	if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
1698	return (`1`);
1699
1700	if (dtrace_priv_proc_common_zone(state) &&
1701	dtrace_priv_proc_common_user(state) &&
1702	dtrace_priv_proc_common_nocd())
1703	return (`1`);
1704
1705	bad:
1706	cpu_core[CPU->cpu_id].cpuc_dtrace_flags \|= CPU_DTRACE_UPRIV;
1707
1708	return (`0`);
1709	}
1710
1711	static int
1712	dtrace_priv_proc(dtrace_state_t *state)
1713	{
1714	if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1715	goto bad;
1716
1717	if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed() && !dtrace_can_attach_to_proc(current_proc()))
1718	goto bad;
1719
1720	if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1721	return (`1`);
1722
1723	bad:
1724	cpu_core[CPU->cpu_id].cpuc_dtrace_flags \|= CPU_DTRACE_UPRIV;
1725
1726	return (`0`);
1727	}
1728
1729	/*
1730	* The P_LNOATTACH check is an Apple specific check.
1731	* We need a version of dtrace_priv_proc() that omits
1732	* that check for PID and EXECNAME accesses
1733	*/
1734	static int
1735	dtrace_priv_proc_relaxed(dtrace_state_t *state)
1736	{
1737
1738	if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1739	return (`1`);
1740
1741	cpu_core[CPU->cpu_id].cpuc_dtrace_flags \|= CPU_DTRACE_UPRIV;
1742
1743	return (`0`);
1744	}
1745
1746	static int
1747	dtrace_priv_kernel(dtrace_state_t *state)
1748	{
1749	if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed())
1750	goto bad;
1751
1752	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
1753	return (`1`);
1754
1755	bad:
1756	cpu_core[CPU->cpu_id].cpuc_dtrace_flags \|= CPU_DTRACE_KPRIV;
1757
1758	return (`0`);
1759	}
1760
1761	static int
1762	dtrace_priv_kernel_destructive(dtrace_state_t *state)
1763	{
1764	if (dtrace_is_restricted())
1765	goto bad;
1766
1767	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
1768	return (`1`);
1769
1770	bad:
1771	cpu_core[CPU->cpu_id].cpuc_dtrace_flags \|= CPU_DTRACE_KPRIV;
1772
1773	return (`0`);
1774	}
1775
1776	/*
1777	* Note: not called from probe context. This function is called
1778	* asynchronously (and at a regular interval) from outside of probe context to
1779	* clean the dirty dynamic variable lists on all CPUs. Dynamic variable
1780	* cleaning is explained in detail in <sys/dtrace_impl.h>.
1781	*/
1782	static void
1783	dtrace_dynvar_clean(dtrace_dstate_t *dstate)
1784	{
1785	dtrace_dynvar_t *dirty;
1786	int work = `0`;
1787
1788	zpercpu_foreach(dcpu, dstate->dtds_percpu) {
1789	ASSERT(dcpu->dtdsc_rinsing == NULL);
1790
1791	/*
1792	* If the dirty list is NULL, there is no dirty work to do.
1793	*/
1794	if (dcpu->dtdsc_dirty == NULL)
1795	continue;
1796
1797	/*
1798	* If the clean list is non-NULL, then we're not going to do
1799	* any work for this CPU -- it means that there has not been
1800	* a dtrace_dynvar() allocation on this CPU (or from this CPU)
1801	* since the last time we cleaned house.
1802	*/
1803	if (dcpu->dtdsc_clean != NULL)
1804	continue;
1805
1806	work = `1`;
1807
1808	/*
1809	* Atomically move the dirty list aside.
1810	*/
1811	do {
1812	dirty = dcpu->dtdsc_dirty;
1813
1814	/*
1815	* Before we zap the dirty list, set the rinsing list.
1816	* (This allows for a potential assertion in
1817	* dtrace_dynvar(): if a free dynamic variable appears
1818	* on a hash chain, either the dirty list or the
1819	* rinsing list for some CPU must be non-NULL.)
1820	*/
1821	dcpu->dtdsc_rinsing = dirty;
1822	dtrace_membar_producer();
1823	} while (dtrace_casptr(&dcpu->dtdsc_dirty,
1824	dirty, NULL) != dirty);
1825	}
1826
1827	if (!work) {
1828	/*
1829	* We have no work to do; we can simply return.
1830	*/
1831	return;
1832	}
1833
1834	dtrace_sync();
1835
1836	zpercpu_foreach(dcpu, dstate->dtds_percpu) {
1837	if (dcpu->dtdsc_rinsing == NULL)
1838	continue;
1839
1840	/*
1841	* We are now guaranteed that no hash chain contains a pointer
1842	* into this dirty list; we can make it clean.
1843	*/
1844	ASSERT(dcpu->dtdsc_clean == NULL);
1845	dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
1846	dcpu->dtdsc_rinsing = NULL;
1847	}
1848
1849	/*
1850	* Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
1851	* sure that all CPUs have seen all of the dtdsc_clean pointers.
1852	* This prevents a race whereby a CPU incorrectly decides that
1853	* the state should be something other than DTRACE_DSTATE_CLEAN
1854	* after dtrace_dynvar_clean() has completed.
1855	*/
1856	dtrace_sync();
1857
1858	dstate->dtds_state = DTRACE_DSTATE_CLEAN;
1859	}
1860
1861	/*
1862	* Depending on the value of the op parameter, this function looks-up,
1863	* allocates or deallocates an arbitrarily-keyed dynamic variable. If an
1864	* allocation is requested, this function will return a pointer to a
1865	* dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
1866	* variable can be allocated. If NULL is returned, the appropriate counter
1867	* will be incremented.
1868	*/
1869	static dtrace_dynvar_t *
1870	dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1871	dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1872	dtrace_mstate_t mstate, dtrace_vstate_t vstate)
1873	{
1874	uint64_t hashval = DTRACE_DYNHASH_VALID;
1875	dtrace_dynhash_t *hash = dstate->dtds_hash;
1876	dtrace_dynvar_t free, new_free, next, dvar, start, prev = NULL;
1877	processorid_t me = CPU->cpu_id, cpu = me;
1878	dtrace_dstate_percpu_t *dcpu = zpercpu_get_cpu(dstate->dtds_percpu, me);
1879	size_t bucket, ksize;
1880	size_t chunksize = dstate->dtds_chunksize;
1881	uintptr_t kdata, lock, nstate;
1882	uint_t i;
1883
1884	ASSERT(nkeys != `0`);
1885
1886	/*
1887	* Hash the key. As with aggregations, we use Jenkins' "One-at-a-time"
1888	* algorithm. For the by-value portions, we perform the algorithm in
1889	* 16-bit chunks (as opposed to 8-bit chunks). This speeds things up a
1890	* bit, and seems to have only a minute effect on distribution. For
1891	* the by-reference data, we perform "One-at-a-time" iterating (safely)
1892	* over each referenced byte. It's painful to do this, but it's much
1893	* better than pathological hash distribution. The efficacy of the
1894	* hashing algorithm (and a comparison with other algorithms) may be
1895	* found by running the ::dtrace_dynstat MDB dcmd.
1896	*/
1897	for (i = `0`; i < nkeys; i++) {
1898	if (key[i].dttk_size == `0`) {
1899	uint64_t val = key[i].dttk_value;
1900
1901	hashval += (val >> `48`) & `0xffff`;
1902	hashval += (hashval << `10`);
1903	hashval ^= (hashval >> `6`);
1904
1905	hashval += (val >> `32`) & `0xffff`;
1906	hashval += (hashval << `10`);
1907	hashval ^= (hashval >> `6`);
1908
1909	hashval += (val >> `16`) & `0xffff`;
1910	hashval += (hashval << `10`);
1911	hashval ^= (hashval >> `6`);
1912
1913	hashval += val & `0xffff`;
1914	hashval += (hashval << `10`);
1915	hashval ^= (hashval >> `6`);
1916	} else {
1917	/*
1918	* This is incredibly painful, but it beats the hell
1919	* out of the alternative.
1920	*/
1921	uint64_t j, size = key[i].dttk_size;
1922	uintptr_t base = (uintptr_t)key[i].dttk_value;
1923
1924	if (!dtrace_canload(addr: base, sz: size, mstate, vstate))
1925	break;
1926
1927	for (j = `0`; j < size; j++) {
1928	hashval += dtrace_load8(addr: base + j);
1929	hashval += (hashval << `10`);
1930	hashval ^= (hashval >> `6`);
1931	}
1932	}
1933	}
1934
1935	if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1936	return (NULL);
1937
1938	hashval += (hashval << `3`);
1939	hashval ^= (hashval >> `11`);
1940	hashval += (hashval << `15`);
1941
1942	/*
1943	* There is a remote chance (ideally, 1 in 2^31) that our hashval
1944	* comes out to be one of our two sentinel hash values. If this
1945	* actually happens, we set the hashval to be a value known to be a
1946	* non-sentinel value.
1947	*/
1948	if (hashval == DTRACE_DYNHASH_FREE \|\| hashval == DTRACE_DYNHASH_SINK)
1949	hashval = DTRACE_DYNHASH_VALID;
1950
1951	/*
1952	* Yes, it's painful to do a divide here. If the cycle count becomes
1953	* important here, tricks can be pulled to reduce it. (However, it's
1954	* critical that hash collisions be kept to an absolute minimum;
1955	* they're much more painful than a divide.) It's better to have a
1956	* solution that generates few collisions and still keeps things
1957	* relatively simple.
1958	*/
1959	bucket = hashval % dstate->dtds_hashsize;
1960
1961	if (op == DTRACE_DYNVAR_DEALLOC) {
1962	volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
1963
1964	for (;;) {
1965	while ((lock = *lockp) & `1`)
1966	continue;
1967
1968	if (dtrace_casptr((void *)(uintptr_t)lockp,
1969	(void )lock, (void* )(lock + `1`)) == (void* *)lock)
1970	break;
1971	}
1972
1973	dtrace_membar_producer();
1974	}
1975
1976	top:
1977	prev = NULL;
1978	lock = hash[bucket].dtdh_lock;
1979
1980	dtrace_membar_consumer();
1981
1982	start = hash[bucket].dtdh_chain;
1983	ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK \|\|
1984	start->dtdv_hashval != DTRACE_DYNHASH_FREE \|\|
1985	op != DTRACE_DYNVAR_DEALLOC));
1986
1987	for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
1988	dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
1989	dtrace_key_t *dkey = &dtuple->dtt_key[`0`];
1990
1991	if (dvar->dtdv_hashval != hashval) {
1992	if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
1993	/*
1994	* We've reached the sink, and therefore the
1995	* end of the hash chain; we can kick out of
1996	* the loop knowing that we have seen a valid
1997	* snapshot of state.
1998	*/
1999	ASSERT(dvar->dtdv_next == NULL);
2000	ASSERT(dvar == &dtrace_dynhash_sink);
2001	break;
2002	}
2003
2004	if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
2005	/*
2006	* We've gone off the rails: somewhere along
2007	* the line, one of the members of this hash
2008	* chain was deleted. Note that we could also
2009	* detect this by simply letting this loop run
2010	* to completion, as we would eventually hit
2011	* the end of the dirty list. However, we
2012	* want to avoid running the length of the
2013	* dirty list unnecessarily (it might be quite
2014	* long), so we catch this as early as
2015	* possible by detecting the hash marker. In
2016	* this case, we simply set dvar to NULL and
2017	* break; the conditional after the loop will
2018	* send us back to top.
2019	*/
2020	dvar = NULL;
2021	break;
2022	}
2023
2024	goto next;
2025	}
2026
2027	if (dtuple->dtt_nkeys != nkeys)
2028	goto next;
2029
2030	for (i = `0`; i < nkeys; i++, dkey++) {
2031	if (dkey->dttk_size != key[i].dttk_size)
2032	goto next; / size or type mismatch /
2033
2034	if (dkey->dttk_size != `0`) {
2035	if (dtrace_bcmp(
2036	s1: (void *)(uintptr_t)key[i].dttk_value,
2037	s2: (void *)(uintptr_t)dkey->dttk_value,
2038	len: dkey->dttk_size))
2039	goto next;
2040	} else {
2041	if (dkey->dttk_value != key[i].dttk_value)
2042	goto next;
2043	}
2044	}
2045
2046	if (op != DTRACE_DYNVAR_DEALLOC)
2047	return (dvar);
2048
2049	ASSERT(dvar->dtdv_next == NULL \|\|
2050	dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
2051
2052	if (prev != NULL) {
2053	ASSERT(hash[bucket].dtdh_chain != dvar);
2054	ASSERT(start != dvar);
2055	ASSERT(prev->dtdv_next == dvar);
2056	prev->dtdv_next = dvar->dtdv_next;
2057	} else {
2058	if (dtrace_casptr(&hash[bucket].dtdh_chain,
2059	start, dvar->dtdv_next) != start) {
2060	/*
2061	* We have failed to atomically swing the
2062	* hash table head pointer, presumably because
2063	* of a conflicting allocation on another CPU.
2064	* We need to reread the hash chain and try
2065	* again.
2066	*/
2067	goto top;
2068	}
2069	}
2070
2071	dtrace_membar_producer();
2072
2073	/*
2074	* Now set the hash value to indicate that it's free.
2075	*/
2076	ASSERT(hash[bucket].dtdh_chain != dvar);
2077	dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
2078
2079	dtrace_membar_producer();
2080
2081	/*
2082	* Set the next pointer to point at the dirty list, and
2083	* atomically swing the dirty pointer to the newly freed dvar.
2084	*/
2085	do {
2086	next = dcpu->dtdsc_dirty;
2087	dvar->dtdv_next = next;
2088	} while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
2089
2090	/*
2091	* Finally, unlock this hash bucket.
2092	*/
2093	ASSERT(hash[bucket].dtdh_lock == lock);
2094	ASSERT(lock & `1`);
2095	hash[bucket].dtdh_lock++;
2096
2097	return (NULL);
2098	next:
2099	prev = dvar;
2100	continue;
2101	}
2102
2103	if (dvar == NULL) {
2104	/*
2105	* If dvar is NULL, it is because we went off the rails:
2106	* one of the elements that we traversed in the hash chain
2107	* was deleted while we were traversing it. In this case,
2108	* we assert that we aren't doing a dealloc (deallocs lock
2109	* the hash bucket to prevent themselves from racing with
2110	* one another), and retry the hash chain traversal.
2111	*/
2112	ASSERT(op != DTRACE_DYNVAR_DEALLOC);
2113	goto top;
2114	}
2115
2116	if (op != DTRACE_DYNVAR_ALLOC) {
2117	/*
2118	* If we are not to allocate a new variable, we want to
2119	* return NULL now. Before we return, check that the value
2120	* of the lock word hasn't changed. If it has, we may have
2121	* seen an inconsistent snapshot.
2122	*/
2123	if (op == DTRACE_DYNVAR_NOALLOC) {
2124	if (hash[bucket].dtdh_lock != lock)
2125	goto top;
2126	} else {
2127	ASSERT(op == DTRACE_DYNVAR_DEALLOC);
2128	ASSERT(hash[bucket].dtdh_lock == lock);
2129	ASSERT(lock & `1`);
2130	hash[bucket].dtdh_lock++;
2131	}
2132
2133	return (NULL);
2134	}
2135
2136	/*
2137	* We need to allocate a new dynamic variable. The size we need is the
2138	* size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
2139	* size of any auxiliary key data (rounded up to 8-byte alignment) plus
2140	* the size of any referred-to data (dsize). We then round the final
2141	* size up to the chunksize for allocation.
2142	*/
2143	for (ksize = `0`, i = `0`; i < nkeys; i++)
2144	ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
2145
2146	/*
2147	* This should be pretty much impossible, but could happen if, say,
2148	* strange DIF specified the tuple. Ideally, this should be an
2149	* assertion and not an error condition -- but that requires that the
2150	* chunksize calculation in dtrace_difo_chunksize() be absolutely
2151	* bullet-proof. (That is, it must not be able to be fooled by
2152	* malicious DIF.) Given the lack of backwards branches in DIF,
2153	* solving this would presumably not amount to solving the Halting
2154	* Problem -- but it still seems awfully hard.
2155	*/
2156	if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - `1`) +
2157	ksize + dsize > chunksize) {
2158	dcpu->dtdsc_drops++;
2159	return (NULL);
2160	}
2161
2162	nstate = DTRACE_DSTATE_EMPTY;
2163
2164	do {
2165	retry:
2166	free = dcpu->dtdsc_free;
2167
2168	if (free == NULL) {
2169	dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
2170	void *rval;
2171
2172	if (clean == NULL) {
2173	/*
2174	* We're out of dynamic variable space on
2175	* this CPU. Unless we have tried all CPUs,
2176	* we'll try to allocate from a different
2177	* CPU.
2178	*/
2179	switch (dstate->dtds_state) {
2180	case DTRACE_DSTATE_CLEAN: {
2181	void *sp = &dstate->dtds_state;
2182
2183	if (++cpu >= (int)NCPU)
2184	cpu = `0`;
2185
2186	if (dcpu->dtdsc_dirty != NULL &&
2187	nstate == DTRACE_DSTATE_EMPTY)
2188	nstate = DTRACE_DSTATE_DIRTY;
2189
2190	if (dcpu->dtdsc_rinsing != NULL)
2191	nstate = DTRACE_DSTATE_RINSING;
2192
2193	dcpu = zpercpu_get_cpu(dstate->dtds_percpu, cpu);
2194
2195	if (cpu != me)
2196	goto retry;
2197
2198	(void) dtrace_cas32(sp,
2199	DTRACE_DSTATE_CLEAN, nstate);
2200
2201	/*
2202	* To increment the correct bean
2203	* counter, take another lap.
2204	*/
2205	goto retry;
2206	}
2207
2208	case DTRACE_DSTATE_DIRTY:
2209	dcpu->dtdsc_dirty_drops++;
2210	break;
2211
2212	case DTRACE_DSTATE_RINSING:
2213	dcpu->dtdsc_rinsing_drops++;
2214	break;
2215
2216	case DTRACE_DSTATE_EMPTY:
2217	dcpu->dtdsc_drops++;
2218	break;
2219	}
2220
2221	DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
2222	return (NULL);
2223	}
2224
2225	/*
2226	* The clean list appears to be non-empty. We want to
2227	* move the clean list to the free list; we start by
2228	* moving the clean pointer aside.
2229	*/
2230	if (dtrace_casptr(&dcpu->dtdsc_clean,
2231	clean, NULL) != clean) {
2232	/*
2233	* We are in one of two situations:
2234	*
2235	* (a) The clean list was switched to the
2236	* free list by another CPU.
2237	*
2238	* (b) The clean list was added to by the
2239	* cleansing cyclic.
2240	*
2241	* In either of these situations, we can
2242	* just reattempt the free list allocation.
2243	*/
2244	goto retry;
2245	}
2246
2247	ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
2248
2249	/*
2250	* Now we'll move the clean list to the free list.
2251	* It's impossible for this to fail: the only way
2252	* the free list can be updated is through this
2253	* code path, and only one CPU can own the clean list.
2254	* Thus, it would only be possible for this to fail if
2255	* this code were racing with dtrace_dynvar_clean().
2256	* (That is, if dtrace_dynvar_clean() updated the clean
2257	* list, and we ended up racing to update the free
2258	* list.) This race is prevented by the dtrace_sync()
2259	* in dtrace_dynvar_clean() -- which flushes the
2260	* owners of the clean lists out before resetting
2261	* the clean lists.
2262	*/
2263	rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
2264	ASSERT(rval == NULL);
2265	goto retry;
2266	}
2267
2268	dvar = free;
2269	new_free = dvar->dtdv_next;
2270	} while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
2271
2272	/*
2273	* We have now allocated a new chunk. We copy the tuple keys into the
2274	* tuple array and copy any referenced key data into the data space
2275	* following the tuple array. As we do this, we relocate dttk_value
2276	* in the final tuple to point to the key data address in the chunk.
2277	*/
2278	kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
2279	dvar->dtdv_data = (void *)(kdata + ksize);
2280	dvar->dtdv_tuple.dtt_nkeys = nkeys;
2281
2282	for (i = `0`; i < nkeys; i++) {
2283	dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
2284	size_t kesize = key[i].dttk_size;
2285
2286	if (kesize != `0`) {
2287	dtrace_bcopy(
2288	src: (const void *)(uintptr_t)key[i].dttk_value,
2289	dst: (void *)kdata, len: kesize);
2290	dkey->dttk_value = kdata;
2291	kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
2292	} else {
2293	dkey->dttk_value = key[i].dttk_value;
2294	}
2295
2296	dkey->dttk_size = kesize;
2297	}
2298
2299	ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
2300	dvar->dtdv_hashval = hashval;
2301	dvar->dtdv_next = start;
2302
2303	if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
2304	return (dvar);
2305
2306	/*
2307	* The cas has failed. Either another CPU is adding an element to
2308	* this hash chain, or another CPU is deleting an element from this
2309	* hash chain. The simplest way to deal with both of these cases
2310	* (though not necessarily the most efficient) is to free our
2311	* allocated block and tail-call ourselves. Note that the free is
2312	* to the dirty list and _not_ to the free list. This is to prevent
2313	* races with allocators, above.
2314	*/
2315	dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
2316
2317	dtrace_membar_producer();
2318
2319	do {
2320	free = dcpu->dtdsc_dirty;
2321	dvar->dtdv_next = free;
2322	} while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
2323
2324	return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
2325	}
2326
/*
 * min() aggregating action: keep the smaller of the stored value and the new
 * value.  Both are compared as signed 64-bit quantities.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
	if ((int64_t)nval < (int64_t)*oval) {
		*oval = nval;
	}
}
2335
/*
 * max() aggregating action: keep the larger of the stored value and the new
 * value.  Both are compared as signed 64-bit quantities.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
	if ((int64_t)nval > (int64_t)*oval) {
		*oval = nval;
	}
}
2344
2345	static void
2346	dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
2347	{
2348	int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
2349	int64_t val = (int64_t)nval;
2350
2351	if (val < `0`) {
2352	for (i = `0`; i < zero; i++) {
2353	if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
2354	quanta[i] += incr;
2355	return;
2356	}
2357	}
2358	} else {
2359	for (i = zero + `1`; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
2360	if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
2361	quanta[i - `1`] += incr;
2362	return;
2363	}
2364	}
2365
2366	quanta[DTRACE_QUANTIZE_NBUCKETS - `1`] += incr;
2367	return;
2368	}
2369
2370	ASSERT(`0`);
2371	}
2372
/*
 * lquantize() aggregating action.  The first word of lquanta[] encodes the
 * base, step and number of levels; the words after it are the underflow
 * bucket, one bucket per level, and the overflow bucket.
 */
static void
dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
{
	uint64_t arg = *lquanta++;
	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
	int32_t val = (int32_t)nval;
	int64_t level;

	ASSERT(step != 0);
	ASSERT(levels != 0);

	if (val < base) {
		/*
		 * This is an underflow.
		 */
		lquanta[0] += incr;
		return;
	}

	/*
	 * Take the difference in 64 bits: val - base can exceed INT32_MAX
	 * (e.g. a large positive val with a negative base), and a 32-bit
	 * signed overflow is undefined and could yield a negative level.
	 */
	level = ((int64_t)val - (int64_t)base) / step;

	if (level < levels) {
		lquanta[level + 1] += incr;
		return;
	}

	/*
	 * This is an overflow.
	 */
	lquanta[levels + 1] += incr;
}
2405
/*
 * Compute the llquantize() bucket index for value.
 *
 *   factor  the logarithmic base
 *   low     lowest order of magnitude covered
 *   high    highest order of magnitude covered
 *   nsteps  number of linear steps per order of magnitude
 *
 * Returns 0 for values below factor^low, the index of the linear step within
 * the matching order of magnitude otherwise, and the top bucket for values at
 * or above factor^(high + 1).
 */
static int
dtrace_aggregate_llquantize_bucket(int16_t factor, int16_t low, int16_t high,
    int16_t nsteps, int64_t value)
{
	int64_t this = 1, last;
	int base = 1, order;

	for (order = 0; order < low; ++order)
		this *= factor;

	/*
	 * If our value is less than our factor taken to the power of the
	 * low order of magnitude, it goes into the zeroth bucket.
	 */
	if (value < this)
		return 0;

	last = this;

	for (this *= factor; order <= high; ++order) {
		int nbuckets = this > nsteps ? nsteps : this;

		/*
		 * We should not generally get log/linear quantizations
		 * with a high magnitude that allows 64-bits to
		 * overflow, but we nonetheless protect against this
		 * by explicitly checking for overflow, and clamping
		 * our value accordingly.  The check is made before
		 * multiplying: a wrapped signed product is undefined, and
		 * it is not necessarily smaller than the multiplicand.
		 */
		if (factor > 0 && this > INT64_MAX / factor) {
			value = this - 1;
		}

		/*
		 * If our value lies within this order of magnitude,
		 * determine its position by taking the offset within
		 * the order of magnitude, dividing by the bucket
		 * width, and adding to our (accumulated) base.
		 */
		if (value < this) {
			return (base + (value - last) / (this / nbuckets));
		}

		base += nbuckets - (nbuckets / factor);
		last = this;

		/* Cannot overflow: the clamped case returned above. */
		this *= factor;
	}

	/*
	 * Our value is greater than or equal to our factor taken to the
	 * power of one plus the high magnitude -- return the top bucket.
	 */
	return base;
}
2461
/*
 * llquantize() aggregating action.  The first word of llquanta[] encodes the
 * factor, low and high magnitudes and the number of steps; incr is added to
 * the bucket selected by dtrace_aggregate_llquantize_bucket().
 */
static void
dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
{
	uint64_t arg = *llquanta++;
	uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
	uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
	uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
	uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);

	llquanta[dtrace_aggregate_llquantize_bucket(factor, low, high, nsteps, nval)] += incr;
}
2473
/*
 * avg() aggregating action: data[0] counts the samples and data[1]
 * accumulates their sum.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
	data[0]++;
	data[1] += nval;
}
2482
/*
 * stddev() aggregating action: data[0] counts the samples, data[1]
 * accumulates their sum, and data[2..3] accumulate the sum of squares as a
 * 128-bit quantity.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
	uint64_t magnitude = nval;
	uint64_t tmp[2];

	data[0]++;
	data[1] += nval;

	/*
	 * What we want to say here is:
	 *
	 * data[2] += nval * nval;
	 *
	 * But given that nval is 64-bit, we could easily overflow, so
	 * we do this as 128-bit arithmetic.  The absolute value is taken in
	 * unsigned arithmetic so that the most negative 64-bit value is
	 * handled without negating a signed integer.
	 */
	if ((int64_t)nval < 0)
		magnitude = 0 - nval;

	dtrace_multiply_128(magnitude, magnitude, tmp);
	dtrace_add_128(data + 2, tmp, data + 2);
}
2508
/*
 * count() aggregating action: increment the stored counter by one.  The new
 * value and the argument are ignored.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(nval, arg) /* __APPLE__ */
	/* Update the counter itself, not the local pointer to it. */
	*oval = *oval + 1;
}
2516
/*
 * sum() aggregating action: add the new value to the stored total.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
	*oval += nval;
}
2524
2525	/*
2526	* Aggregate given the tuple in the principal data buffer, and the aggregating
2527	* action denoted by the specified dtrace_aggregation_t. The aggregation
2528	* buffer is specified as the buf parameter. This routine does not return
2529	* failure; if there is no space in the aggregation buffer, the data will be
2530	* dropped, and a corresponding counter incremented.
2531	*/
2532	__attribute__((noinline))
2533	static void
2534	dtrace_aggregate(dtrace_aggregation_t agg, dtrace_buffer_t dbuf,
2535	intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
2536	{
2537	#pragma unused(arg)
2538	dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
2539	uint32_t i, ndx, size, fsize;
2540	uint32_t align = sizeof (uint64_t) - `1`;
2541	dtrace_aggbuffer_t *agb;
2542	dtrace_aggkey_t *key;
2543	uint32_t hashval = `0`, limit, isstr;
2544	caddr_t tomax, data, kdata;
2545	dtrace_actkind_t action;
2546	dtrace_action_t *act;
2547	uintptr_t offs;
2548
2549	if (buf == NULL)
2550	return;
2551
2552	if (!agg->dtag_hasarg) {
2553	/*
2554	* Currently, only quantize() and lquantize() take additional
2555	* arguments, and they have the same semantics: an increment
2556	* value that defaults to 1 when not present. If additional
2557	* aggregating actions take arguments, the setting of the
2558	* default argument value will presumably have to become more
2559	* sophisticated...
2560	*/
2561	arg = `1`;
2562	}
2563
2564	action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2565	size = rec->dtrd_offset - agg->dtag_base;
2566	fsize = size + rec->dtrd_size;
2567
2568	ASSERT(dbuf->dtb_tomax != NULL);
2569	data = dbuf->dtb_tomax + offset + agg->dtag_base;
2570
2571	if ((tomax = buf->dtb_tomax) == NULL) {
2572	dtrace_buffer_drop(buf);
2573	return;
2574	}
2575
2576	/*
2577	* The metastructure is always at the bottom of the buffer.
2578	*/
2579	agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2580	sizeof (dtrace_aggbuffer_t));
2581
2582	if (buf->dtb_offset == `0`) {
2583	/*
2584	* We just kludge up approximately 1/8th of the size to be
2585	* buckets. If this guess ends up being routinely
2586	* off-the-mark, we may need to dynamically readjust this
2587	* based on past performance.
2588	*/
2589	uintptr_t hashsize = (buf->dtb_size >> `3`) / sizeof (uintptr_t);
2590
2591	if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2592	(uintptr_t)tomax \|\| hashsize == `0`) {
2593	/*
2594	* We've been given a ludicrously small buffer;
2595	* increment our drop count and leave.
2596	*/
2597	dtrace_buffer_drop(buf);
2598	return;
2599	}
2600
2601	/*
2602	* And now, a pathetic attempt to try to get a an odd (or
2603	* perchance, a prime) hash size for better hash distribution.
2604	*/
2605	if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << `3`))
2606	hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2607
2608	agb->dtagb_hashsize = hashsize;
2609	agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2610	agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2611	agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2612
2613	for (i = `0`; i < agb->dtagb_hashsize; i++)
2614	agb->dtagb_hash[i] = NULL;
2615	}
2616
2617	ASSERT(agg->dtag_first != NULL);
2618	ASSERT(agg->dtag_first->dta_intuple);
2619
2620	/*
2621	* Calculate the hash value based on the key. Note that we _don't_
2622	* include the aggid in the hashing (but we will store it as part of
2623	* the key). The hashing algorithm is Bob Jenkins' "One-at-a-time"
2624	* algorithm: a simple, quick algorithm that has no known funnels, and
2625	* gets good distribution in practice. The efficacy of the hashing
2626	* algorithm (and a comparison with other algorithms) may be found by
2627	* running the ::dtrace_aggstat MDB dcmd.
2628	*/
2629	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2630	i = act->dta_rec.dtrd_offset - agg->dtag_base;
2631	limit = i + act->dta_rec.dtrd_size;
2632	ASSERT(limit <= size);
2633	isstr = DTRACEACT_ISSTRING(act);
2634
2635	for (; i < limit; i++) {
2636	hashval += data[i];
2637	hashval += (hashval << `10`);
2638	hashval ^= (hashval >> `6`);
2639
2640	if (isstr && data[i] == `'\0'`)
2641	break;
2642	}
2643	}
2644
2645	hashval += (hashval << `3`);
2646	hashval ^= (hashval >> `11`);
2647	hashval += (hashval << `15`);
2648
2649	/*
2650	* Yes, the divide here is expensive -- but it's generally the least
2651	* of the performance issues given the amount of data that we iterate
2652	* over to compute hash values, compare data, etc.
2653	*/
2654	ndx = hashval % agb->dtagb_hashsize;
2655
2656	for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2657	ASSERT((caddr_t)key >= tomax);
2658	ASSERT((caddr_t)key < tomax + buf->dtb_size);
2659
2660	if (hashval != key->dtak_hashval \|\| key->dtak_size != size)
2661	continue;
2662
2663	kdata = key->dtak_data;
2664	ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2665
2666	for (act = agg->dtag_first; act->dta_intuple;
2667	act = act->dta_next) {
2668	i = act->dta_rec.dtrd_offset - agg->dtag_base;
2669	limit = i + act->dta_rec.dtrd_size;
2670	ASSERT(limit <= size);
2671	isstr = DTRACEACT_ISSTRING(act);
2672
2673	for (; i < limit; i++) {
2674	if (kdata[i] != data[i])
2675	goto next;
2676
2677	if (isstr && data[i] == `'\0'`)
2678	break;
2679	}
2680	}
2681
2682	if (action != key->dtak_action) {
2683	/*
2684	* We are aggregating on the same value in the same
2685	* aggregation with two different aggregating actions.
2686	* (This should have been picked up in the compiler,
2687	* so we may be dealing with errant or devious DIF.)
2688	* This is an error condition; we indicate as much,
2689	* and return.
2690	*/
2691	DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2692	return;
2693	}
2694
2695	/*
2696	* This is a hit: we need to apply the aggregator to
2697	* the value at this key.
2698	*/
2699	agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2700	return;
2701	next:
2702	continue;
2703	}
2704
2705	/*
2706	* We didn't find it. We need to allocate some zero-filled space,
2707	* link it into the hash table appropriately, and apply the aggregator
2708	* to the (zero-filled) value.
2709	*/
2710	offs = buf->dtb_offset;
2711	while (offs & (align - `1`))
2712	offs += sizeof (uint32_t);
2713
2714	/*
2715	* If we don't have enough room to both allocate a new key _and_
2716	* its associated data, increment the drop count and return.
2717	*/
2718	if ((uintptr_t)tomax + offs + fsize >
2719	agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2720	dtrace_buffer_drop(buf);
2721	return;
2722	}
2723
2724	/CONSTCOND/
2725	ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - `1`)));
2726	key = (dtrace_aggkey_t )(agb->dtagb_free - sizeof* (dtrace_aggkey_t));
2727	agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2728
2729	key->dtak_data = kdata = tomax + offs;
2730	buf->dtb_offset = offs + fsize;
2731
2732	/*
2733	* Now copy the data across.
2734	*/
2735	((dtrace_aggid_t )kdata) = agg->dtag_id;
2736
2737	for (i = sizeof (dtrace_aggid_t); i < size; i++)
2738	kdata[i] = data[i];
2739
2740	/*
2741	* Because strings are not zeroed out by default, we need to iterate
2742	* looking for actions that store strings, and we need to explicitly
2743	* pad these strings out with zeroes.
2744	*/
2745	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2746	int nul;
2747
2748	if (!DTRACEACT_ISSTRING(act))
2749	continue;
2750
2751	i = act->dta_rec.dtrd_offset - agg->dtag_base;
2752	limit = i + act->dta_rec.dtrd_size;
2753	ASSERT(limit <= size);
2754
2755	for (nul = `0`; i < limit; i++) {
2756	if (nul) {
2757	kdata[i] = `'\0'`;
2758	continue;
2759	}
2760
2761	if (data[i] != `'\0'`)
2762	continue;
2763
2764	nul = `1`;
2765	}
2766	}
2767
2768	for (i = size; i < fsize; i++)
2769	kdata[i] = `0`;
2770
2771	key->dtak_hashval = hashval;
2772	key->dtak_size = size;
2773	key->dtak_action = action;
2774	key->dtak_next = agb->dtagb_hash[ndx];
2775	agb->dtagb_hash[ndx] = key;
2776
2777	/*
2778	* Finally, apply the aggregator.
2779	*/
2780	((uint64_t )(key->dtak_data + size)) = agg->dtag_initial;
2781	agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2782	}
2783
2784	/*
2785	* Given consumer state, this routine finds a speculation in the INACTIVE
2786	* state and transitions it into the ACTIVE state. If there is no speculation
2787	* in the INACTIVE state, 0 is returned. In this case, no error counter is
2788	* incremented -- it is up to the caller to take appropriate action.
2789	*/
2790	static int
2791	dtrace_speculation(dtrace_state_t *state)
2792	{
2793	int i = `0`;
2794	dtrace_speculation_state_t current;
2795	uint32_t *stat = &state->dts_speculations_unavail, count;
2796
2797	while (i < state->dts_nspeculations) {
2798	dtrace_speculation_t *spec = &state->dts_speculations[i];
2799
2800	current = spec->dtsp_state;
2801
2802	if (current != DTRACESPEC_INACTIVE) {
2803	if (current == DTRACESPEC_COMMITTINGMANY \|\|
2804	current == DTRACESPEC_COMMITTING \|\|
2805	current == DTRACESPEC_DISCARDING)
2806	stat = &state->dts_speculations_busy;
2807	i++;
2808	continue;
2809	}
2810
2811	if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2812	current, DTRACESPEC_ACTIVE) == current)
2813	return (i + `1`);
2814	}
2815
2816	/*
2817	* We couldn't find a speculation. If we found as much as a single
2818	* busy speculation buffer, we'll attribute this failure as "busy"
2819	* instead of "unavail".
2820	*/
2821	do {
2822	count = *stat;
2823	} while (dtrace_cas32(stat, count, count + `1`) != count);
2824
2825	return (`0`);
2826	}
2827
2828	/*
2829	* This routine commits an active speculation. If the specified speculation
2830	* is not in a valid state to perform a commit(), this routine will silently do
2831	* nothing. The state of the specified speculation is transitioned according
2832	* to the state transition diagram outlined in <sys/dtrace_impl.h>
2833	*/
2834	static void
2835	dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2836	dtrace_specid_t which)
2837	{
2838	dtrace_speculation_t *spec;
2839	dtrace_buffer_t src, dest;
2840	uintptr_t daddr, saddr, dlimit, slimit;
2841	dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
2842	intptr_t offs;
2843	uint64_t timestamp;
2844
2845	if (which == `0`)
2846	return;
2847
2848	if (which > (dtrace_specid_t)state->dts_nspeculations) {
2849	cpu_core[cpu].cpuc_dtrace_flags \|= CPU_DTRACE_ILLOP;
2850	return;
2851	}
2852
2853	spec = &state->dts_speculations[which - `1`];
2854	src = &spec->dtsp_buffer[cpu];
2855	dest = &state->dts_buffer[cpu];
2856
2857	do {
2858	current = spec->dtsp_state;
2859
2860	if (current == DTRACESPEC_COMMITTINGMANY)
2861	break;
2862
2863	switch (current) {
2864	case DTRACESPEC_INACTIVE:
2865	case DTRACESPEC_DISCARDING:
2866	return;
2867
2868	case DTRACESPEC_COMMITTING:
2869	/*
2870	* This is only possible if we are (a) commit()'ing
2871	* without having done a prior speculate() on this CPU
2872	* and (b) racing with another commit() on a different
2873	* CPU. There's nothing to do -- we just assert that
2874	* our offset is 0.
2875	*/
2876	ASSERT(src->dtb_offset == `0`);
2877	return;
2878
2879	case DTRACESPEC_ACTIVE:
2880	new = DTRACESPEC_COMMITTING;
2881	break;
2882
2883	case DTRACESPEC_ACTIVEONE:
2884	/*
2885	* This speculation is active on one CPU. If our
2886	* buffer offset is non-zero, we know that the one CPU
2887	* must be us. Otherwise, we are committing on a
2888	* different CPU from the speculate(), and we must
2889	* rely on being asynchronously cleaned.
2890	*/
2891	if (src->dtb_offset != `0`) {
2892	new = DTRACESPEC_COMMITTING;
2893	break;
2894	}
2895	OS_FALLTHROUGH;
2896
2897	case DTRACESPEC_ACTIVEMANY:
2898	new = DTRACESPEC_COMMITTINGMANY;
2899	break;
2900
2901	default:
2902	ASSERT(`0`);
2903	}
2904	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2905	current, new) != current);
2906
2907	/*
2908	* We have set the state to indicate that we are committing this
2909	* speculation. Now reserve the necessary space in the destination
2910	* buffer.
2911	*/
2912	if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2913	sizeof (uint64_t), state, NULL)) < `0`) {
2914	dtrace_buffer_drop(dest);
2915	goto out;
2916	}
2917
2918	/*
2919	* We have sufficient space to copy the speculative buffer into the
2920	* primary buffer. First, modify the speculative buffer, filling
2921	* in the timestamp of all entries with the current time. The data
2922	* must have the commit() time rather than the time it was traced,
2923	* so that all entries in the primary buffer are in timestamp order.
2924	*/
2925	timestamp = dtrace_gethrtime();
2926	saddr = (uintptr_t)src->dtb_tomax;
2927	slimit = saddr + src->dtb_offset;
2928	while (saddr < slimit) {
2929	size_t size;
2930	dtrace_rechdr_t dtrh = (dtrace_rechdr_t )saddr;
2931
2932	if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
2933	saddr += sizeof (dtrace_epid_t);
2934	continue;
2935	}
2936
2937	ASSERT(dtrh->dtrh_epid <= ((dtrace_epid_t) state->dts_necbs));
2938	size = state->dts_ecbs[dtrh->dtrh_epid - `1`]->dte_size;
2939
2940	ASSERT(saddr + size <= slimit);
2941	ASSERT(size >= sizeof(dtrace_rechdr_t));
2942	ASSERT(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh) == UINT64_MAX);
2943
2944	DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
2945
2946	saddr += size;
2947	}
2948
2949	/*
2950	* Copy the buffer across. (Note that this is a
2951	* highly subobtimal bcopy(); in the unlikely event that this becomes
2952	* a serious performance issue, a high-performance DTrace-specific
2953	* bcopy() should obviously be invented.)
2954	*/
2955	daddr = (uintptr_t)dest->dtb_tomax + offs;
2956	dlimit = daddr + src->dtb_offset;
2957	saddr = (uintptr_t)src->dtb_tomax;
2958
2959	/*
2960	* First, the aligned portion.
2961	*/
2962	while (dlimit - daddr >= sizeof (uint64_t)) {
2963	((uint64_t )daddr) = ((uint64_t )saddr);
2964
2965	daddr += sizeof (uint64_t);
2966	saddr += sizeof (uint64_t);
2967	}
2968
2969	/*
2970	* Now any left-over bit...
2971	*/
2972	while (dlimit - daddr)
2973	((uint8_t )daddr++) = ((uint8_t )saddr++);
2974
2975	/*
2976	* Finally, commit the reserved space in the destination buffer.
2977	*/
2978	dest->dtb_offset = offs + src->dtb_offset;
2979
2980	out:
2981	/*
2982	* If we're lucky enough to be the only active CPU on this speculation
2983	* buffer, we can just set the state back to DTRACESPEC_INACTIVE.
2984	*/
2985	if (current == DTRACESPEC_ACTIVE \|\|
2986	(current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
2987	uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
2988	DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
2989	#pragma unused(rval) /* __APPLE__ */
2990
2991	ASSERT(rval == DTRACESPEC_COMMITTING);
2992	}
2993
2994	src->dtb_offset = `0`;
2995	src->dtb_xamot_drops += src->dtb_drops;
2996	src->dtb_drops = `0`;
2997	}
2998
2999	/*
3000	* This routine discards an active speculation. If the specified speculation
3001	* is not in a valid state to perform a discard(), this routine will silently
3002	* do nothing. The state of the specified speculation is transitioned
3003	* according to the state transition diagram outlined in <sys/dtrace_impl.h>
3004	*/
3005	__attribute__((noinline))
3006	static void
3007	dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
3008	dtrace_specid_t which)
3009	{
3010	dtrace_speculation_t *spec;
3011	dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
3012	dtrace_buffer_t *buf;
3013
3014	if (which == `0`)
3015	return;
3016
3017	if (which > (dtrace_specid_t)state->dts_nspeculations) {
3018	cpu_core[cpu].cpuc_dtrace_flags \|= CPU_DTRACE_ILLOP;
3019	return;
3020	}
3021
3022	spec = &state->dts_speculations[which - `1`];
3023	buf = &spec->dtsp_buffer[cpu];
3024
3025	do {
3026	current = spec->dtsp_state;
3027
3028	switch (current) {
3029	case DTRACESPEC_INACTIVE:
3030	case DTRACESPEC_COMMITTINGMANY:
3031	case DTRACESPEC_COMMITTING:
3032	case DTRACESPEC_DISCARDING:
3033	return;
3034
3035	case DTRACESPEC_ACTIVE:
3036	case DTRACESPEC_ACTIVEMANY:
3037	new = DTRACESPEC_DISCARDING;
3038	break;
3039
3040	case DTRACESPEC_ACTIVEONE:
3041	if (buf->dtb_offset != `0`) {
3042	new = DTRACESPEC_INACTIVE;
3043	} else {
3044	new = DTRACESPEC_DISCARDING;
3045	}
3046	break;
3047
3048	default:
3049	ASSERT(`0`);
3050	}
3051	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
3052	current, new) != current);
3053
3054	buf->dtb_offset = `0`;
3055	buf->dtb_drops = `0`;
3056	}
3057
3058	/*
3059	* Note: not called from probe context. This function is called
3060	* asynchronously from cross call context to clean any speculations that are
3061	* in the COMMITTINGMANY or DISCARDING states. These speculations may not be
3062	* transitioned back to the INACTIVE state until all CPUs have cleaned the
3063	* speculation.
3064	*/
3065	static void
3066	dtrace_speculation_clean_here(dtrace_state_t *state)
3067	{
3068	dtrace_icookie_t cookie;
3069	processorid_t cpu = CPU->cpu_id;
3070	dtrace_buffer_t *dest = &state->dts_buffer[cpu];
3071	dtrace_specid_t i;
3072
3073	cookie = dtrace_interrupt_disable();
3074
3075	if (dest->dtb_tomax == NULL) {
3076	dtrace_interrupt_enable(cookie);
3077	return;
3078	}
3079
3080	for (i = `0`; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
3081	dtrace_speculation_t *spec = &state->dts_speculations[i];
3082	dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
3083
3084	if (src->dtb_tomax == NULL)
3085	continue;
3086
3087	if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
3088	src->dtb_offset = `0`;
3089	continue;
3090	}
3091
3092	if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
3093	continue;
3094
3095	if (src->dtb_offset == `0`)
3096	continue;
3097
3098	dtrace_speculation_commit(state, cpu, which: i + `1`);
3099	}
3100
3101	dtrace_interrupt_enable(cookie);
3102	}
3103
3104	/*
3105	* Note: not called from probe context. This function is called
3106	* asynchronously (and at a regular interval) to clean any speculations that
3107	* are in the COMMITTINGMANY or DISCARDING states. If it discovers that there
3108	* is work to be done, it cross calls all CPUs to perform that work;
3109	* COMMITMANY and DISCARDING speculations may not be transitioned back to the
3110	* INACTIVE state until they have been cleaned by all CPUs.
3111	*/
3112	static void
3113	dtrace_speculation_clean(dtrace_state_t *state)
3114	{
3115	int work = `0`;
3116	uint32_t rv;
3117	dtrace_specid_t i;
3118
3119	for (i = `0`; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
3120	dtrace_speculation_t *spec = &state->dts_speculations[i];
3121
3122	ASSERT(!spec->dtsp_cleaning);
3123
3124	if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
3125	spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
3126	continue;
3127
3128	work++;
3129	spec->dtsp_cleaning = `1`;
3130	}
3131
3132	if (!work)
3133	return;
3134
3135	dtrace_xcall(DTRACE_CPUALL,
3136	(dtrace_xcall_t)dtrace_speculation_clean_here, state);
3137
3138	/*
3139	* We now know that all CPUs have committed or discarded their
3140	* speculation buffers, as appropriate. We can now set the state
3141	* to inactive.
3142	*/
3143	for (i = `0`; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
3144	dtrace_speculation_t *spec = &state->dts_speculations[i];
3145	dtrace_speculation_state_t current, new;
3146
3147	if (!spec->dtsp_cleaning)
3148	continue;
3149
3150	current = spec->dtsp_state;
3151	ASSERT(current == DTRACESPEC_DISCARDING \|\|
3152	current == DTRACESPEC_COMMITTINGMANY);
3153
3154	new = DTRACESPEC_INACTIVE;
3155
3156	rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
3157	ASSERT(rv == current);
3158	spec->dtsp_cleaning = `0`;
3159	}
3160	}
3161
3162	/*
3163	* Called as part of a speculate() to get the speculative buffer associated
3164	* with a given speculation. Returns NULL if the specified speculation is not
3165	* in an ACTIVE state. If the speculation is in the ACTIVEONE state -- and
3166	* the active CPU is not the specified CPU -- the speculation will be
3167	* atomically transitioned into the ACTIVEMANY state.
3168	*/
3169	__attribute__((noinline))
3170	static dtrace_buffer_t *
3171	dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
3172	dtrace_specid_t which)
3173	{
3174	dtrace_speculation_t *spec;
3175	dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
3176	dtrace_buffer_t *buf;
3177
3178	if (which == `0`)
3179	return (NULL);
3180
3181	if (which > (dtrace_specid_t)state->dts_nspeculations) {
3182	cpu_core[cpuid].cpuc_dtrace_flags \|= CPU_DTRACE_ILLOP;
3183	return (NULL);
3184	}
3185
3186	spec = &state->dts_speculations[which - `1`];
3187	buf = &spec->dtsp_buffer[cpuid];
3188
3189	do {
3190	current = spec->dtsp_state;
3191
3192	switch (current) {
3193	case DTRACESPEC_INACTIVE:
3194	case DTRACESPEC_COMMITTINGMANY:
3195	case DTRACESPEC_DISCARDING:
3196	return (NULL);
3197
3198	case DTRACESPEC_COMMITTING:
3199	ASSERT(buf->dtb_offset == `0`);
3200	return (NULL);
3201
3202	case DTRACESPEC_ACTIVEONE:
3203	/*
3204	* This speculation is currently active on one CPU.
3205	* Check the offset in the buffer; if it's non-zero,
3206	* that CPU must be us (and we leave the state alone).
3207	* If it's zero, assume that we're starting on a new
3208	* CPU -- and change the state to indicate that the
3209	* speculation is active on more than one CPU.
3210	*/
3211	if (buf->dtb_offset != `0`)
3212	return (buf);
3213
3214	new = DTRACESPEC_ACTIVEMANY;
3215	break;
3216
3217	case DTRACESPEC_ACTIVEMANY:
3218	return (buf);
3219
3220	case DTRACESPEC_ACTIVE:
3221	new = DTRACESPEC_ACTIVEONE;
3222	break;
3223
3224	default:
3225	ASSERT(`0`);
3226	}
3227	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
3228	current, new) != current);
3229
3230	ASSERT(new == DTRACESPEC_ACTIVEONE \|\| new == DTRACESPEC_ACTIVEMANY);
3231	return (buf);
3232	}
3233
3234	/*
3235	* Return a string. In the event that the user lacks the privilege to access
3236	* arbitrary kernel memory, we copy the string out to scratch memory so that we
3237	* don't fail access checking.
3238	*
3239	* dtrace_dif_variable() uses this routine as a helper for various
3240	* builtin values such as 'execname' and 'probefunc.'
3241	*/
3242	static
3243	uintptr_t
3244	dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
3245	dtrace_mstate_t *mstate)
3246	{
3247	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3248	uintptr_t ret;
3249	size_t strsz;
3250
3251	/*
3252	* The easy case: this probe is allowed to read all of memory, so
3253	* we can just return this as a vanilla pointer.
3254	*/
3255	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != `0`)
3256	return (addr);
3257
3258	/*
3259	* This is the tougher case: we copy the string in question from
3260	* kernel memory into scratch memory and return it that way: this
3261	* ensures that we won't trip up when access checking tests the
3262	* BYREF return value.
3263	*/
3264	strsz = dtrace_strlen(s: (char *)addr, lim: size) + `1`;
3265
3266	if (mstate->dtms_scratch_ptr + strsz >
3267	mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
3268	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3269	return (`0`);
3270	}
3271
3272	dtrace_strcpy(src: (const void )addr, dst: (void* *)mstate->dtms_scratch_ptr,
3273	len: strsz);
3274	ret = mstate->dtms_scratch_ptr;
3275	mstate->dtms_scratch_ptr += strsz;
3276	return (ret);
3277	}
3278
3279	/*
3280	* This function implements the DIF emulator's variable lookups. The emulator
3281	* passes a reserved variable identifier and optional built-in array index.
3282	*/
3283	static uint64_t
3284	dtrace_dif_variable(dtrace_mstate_t mstate, dtrace_state_t state, uint64_t v,
3285	uint64_t ndx)
3286	{
3287	/*
3288	* If we're accessing one of the uncached arguments, we'll turn this
3289	* into a reference in the args array.
3290	*/
3291	if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
3292	ndx = v - DIF_VAR_ARG0;
3293	v = DIF_VAR_ARGS;
3294	}
3295
3296	switch (v) {
3297	case DIF_VAR_ARGS:
3298	ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
3299	if (ndx >= sizeof (mstate->dtms_arg) /
3300	sizeof (mstate->dtms_arg[`0`])) {
3301	int aframes = mstate->dtms_probe->dtpr_aframes + `2`;
3302	dtrace_vstate_t *vstate = &state->dts_vstate;
3303	dtrace_provider_t *pv;
3304	uint64_t val;
3305	int argndx = ndx;
3306
3307	if (argndx < `0`) {
3308	DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3309	return (`0`);
3310	}
3311
3312	pv = mstate->dtms_probe->dtpr_provider;
3313	if (pv->dtpv_pops.dtps_getargval != NULL)
3314	val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
3315	mstate->dtms_probe->dtpr_id,
3316	mstate->dtms_probe->dtpr_arg, argndx, aframes);
3317	/ Special case access of arg5 as passed to dtrace_probe_error() (which see.) /
3318	else if (mstate->dtms_probe->dtpr_id == dtrace_probeid_error && argndx == `5`) {
3319	return ((dtrace_state_t *)(uintptr_t)(mstate->dtms_arg[`0`]))->dts_arg_error_illval;
3320	}
3321
3322	else
3323	val = dtrace_getarg(argndx, aframes, mstate, vstate);
3324
3325	/*
3326	* This is regrettably required to keep the compiler
3327	* from tail-optimizing the call to dtrace_getarg().
3328	* The condition always evaluates to true, but the
3329	* compiler has no way of figuring that out a priori.
3330	* (None of this would be necessary if the compiler
3331	* could be relied upon to _always_ tail-optimize
3332	* the call to dtrace_getarg() -- but it can't.)
3333	*/
3334	if (mstate->dtms_probe != NULL)
3335	return (val);
3336
3337	ASSERT(`0`);
3338	}
3339
3340	return (mstate->dtms_arg[ndx]);
3341
3342	case DIF_VAR_UREGS: {
3343	thread_t thread;
3344
3345	if (!dtrace_priv_proc(state))
3346	return (`0`);
3347
3348	if ((thread = current_thread()) == NULL) {
3349	DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3350	cpu_core[CPU->cpu_id].cpuc_dtrace_illval = `0`;
3351	return (`0`);
3352	}
3353
3354	return (dtrace_getreg(find_user_regs(thread), ndx));
3355	}
3356
3357	case DIF_VAR_VMREGS: {
3358	uint64_t rval;
3359
3360	if (!dtrace_priv_kernel(state))
3361	return (`0`);
3362
3363	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3364
3365	rval = dtrace_getvmreg(ndx);
3366
3367	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3368
3369	return (rval);
3370	}
3371
3372	case DIF_VAR_CURTHREAD:
3373	if (!dtrace_priv_kernel(state))
3374	return (`0`);
3375
3376	return ((uint64_t)(uintptr_t)current_thread());
3377
3378	case DIF_VAR_TIMESTAMP:
3379	if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
3380	mstate->dtms_timestamp = dtrace_gethrtime();
3381	mstate->dtms_present \|= DTRACE_MSTATE_TIMESTAMP;
3382	}
3383	return (mstate->dtms_timestamp);
3384
3385	case DIF_VAR_VTIMESTAMP:
3386	ASSERT(dtrace_vtime_references != `0`);
3387	return (dtrace_get_thread_vtime(current_thread()));
3388
3389	case DIF_VAR_WALLTIMESTAMP:
3390	if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
3391	mstate->dtms_walltimestamp = dtrace_gethrestime();
3392	mstate->dtms_present \|= DTRACE_MSTATE_WALLTIMESTAMP;
3393	}
3394	return (mstate->dtms_walltimestamp);
3395
3396	case DIF_VAR_MACHTIMESTAMP:
3397	if (!(mstate->dtms_present & DTRACE_MSTATE_MACHTIMESTAMP)) {
3398	mstate->dtms_machtimestamp = mach_absolute_time();
3399	mstate->dtms_present \|= DTRACE_MSTATE_MACHTIMESTAMP;
3400	}
3401	return (mstate->dtms_machtimestamp);
3402
3403	case DIF_VAR_MACHCTIMESTAMP:
3404	if (!(mstate->dtms_present & DTRACE_MSTATE_MACHCTIMESTAMP)) {
3405	mstate->dtms_machctimestamp = mach_continuous_time();
3406	mstate->dtms_present \|= DTRACE_MSTATE_MACHCTIMESTAMP;
3407	}
3408	return (mstate->dtms_machctimestamp);
3409
3410
3411	case DIF_VAR_CPU:
3412	return ((uint64_t) dtrace_get_thread_last_cpu_id(current_thread()));
3413
3414	case DIF_VAR_IPL:
3415	if (!dtrace_priv_kernel(state))
3416	return (`0`);
3417	if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
3418	mstate->dtms_ipl = dtrace_getipl();
3419	mstate->dtms_present \|= DTRACE_MSTATE_IPL;
3420	}
3421	return (mstate->dtms_ipl);
3422
3423	case DIF_VAR_EPID:
3424	ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
3425	return (mstate->dtms_epid);
3426
3427	case DIF_VAR_ID:
3428	ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3429	return (mstate->dtms_probe->dtpr_id);
3430
3431	case DIF_VAR_STACKDEPTH:
3432	if (!dtrace_priv_kernel(state))
3433	return (`0`);
3434	if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
3435	int aframes = mstate->dtms_probe->dtpr_aframes + `2`;
3436
3437	mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
3438	mstate->dtms_present \|= DTRACE_MSTATE_STACKDEPTH;
3439	}
3440	return (mstate->dtms_stackdepth);
3441
3442	case DIF_VAR_USTACKDEPTH:
3443	if (!dtrace_priv_proc(state))
3444	return (`0`);
3445	if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
3446	/*
3447	* See comment in DIF_VAR_PID.
3448	*/
3449	if (DTRACE_ANCHORED(mstate->dtms_probe) &&
3450	CPU_ON_INTR(CPU)) {
3451	mstate->dtms_ustackdepth = `0`;
3452	} else {
3453	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3454	mstate->dtms_ustackdepth =
3455	dtrace_getustackdepth();
3456	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3457	}
3458	mstate->dtms_present \|= DTRACE_MSTATE_USTACKDEPTH;
3459	}
3460	return (mstate->dtms_ustackdepth);
3461
3462	case DIF_VAR_CALLER:
3463	if (!dtrace_priv_kernel(state))
3464	return (`0`);
3465	if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
3466	int aframes = mstate->dtms_probe->dtpr_aframes + `2`;
3467
3468	if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
3469	/*
3470	* If this is an unanchored probe, we are
3471	* required to go through the slow path:
3472	* dtrace_caller() only guarantees correct
3473	* results for anchored probes.
3474	*/
3475	pc_t caller[`2`];
3476
3477	dtrace_getpcstack(caller, `2`, aframes,
3478	(uint32_t *)(uintptr_t)mstate->dtms_arg[`0`]);
3479	mstate->dtms_caller = caller[`1`];
3480	} else if ((mstate->dtms_caller =
3481	dtrace_caller(aframes)) == (uintptr_t)-`1`) {
3482	/*
3483	* We have failed to do this the quick way;
3484	* we must resort to the slower approach of
3485	* calling dtrace_getpcstack().
3486	*/
3487	pc_t caller;
3488
3489	dtrace_getpcstack(&caller, `1`, aframes, NULL);
3490	mstate->dtms_caller = caller;
3491	}
3492
3493	mstate->dtms_present \|= DTRACE_MSTATE_CALLER;
3494	}
3495	return (mstate->dtms_caller);
3496
3497	case DIF_VAR_UCALLER:
3498	if (!dtrace_priv_proc(state))
3499	return (`0`);
3500
3501	if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
3502	uint64_t ustack[`3`];
3503
3504	/*
3505	* dtrace_getupcstack() fills in the first uint64_t
3506	* with the current PID. The second uint64_t will
3507	* be the program counter at user-level. The third
3508	* uint64_t will contain the caller, which is what
3509	* we're after.
3510	*/
3511	ustack[`2`] = `0`;
3512	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3513	dtrace_getupcstack(ustack, `3`);
3514	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3515	mstate->dtms_ucaller = ustack[`2`];
3516	mstate->dtms_present \|= DTRACE_MSTATE_UCALLER;
3517	}
3518
3519	return (mstate->dtms_ucaller);
3520
3521	case DIF_VAR_PROBEPROV:
3522	ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3523	return (dtrace_dif_varstr(
3524	addr: (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
3525	state, mstate));
3526
3527	case DIF_VAR_PROBEMOD:
3528	ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3529	return (dtrace_dif_varstr(
3530	addr: (uintptr_t)mstate->dtms_probe->dtpr_mod,
3531	state, mstate));
3532
3533	case DIF_VAR_PROBEFUNC:
3534	ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3535	return (dtrace_dif_varstr(
3536	addr: (uintptr_t)mstate->dtms_probe->dtpr_func,
3537	state, mstate));
3538
3539	case DIF_VAR_PROBENAME:
3540	ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3541	return (dtrace_dif_varstr(
3542	addr: (uintptr_t)mstate->dtms_probe->dtpr_name,
3543	state, mstate));
3544
3545	case DIF_VAR_PID:
3546	if (!dtrace_priv_proc_relaxed(state))
3547	return (`0`);
3548
3549	/*
3550	* Note that we are assuming that an unanchored probe is
3551	* always due to a high-level interrupt. (And we're assuming
3552	* that there is only a single high level interrupt.)
3553	*/
3554	if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3555	/ Anchored probe that fires while on an interrupt accrues to process 0 /
3556	return `0`;
3557
3558	return ((uint64_t)dtrace_proc_selfpid());
3559
3560	case DIF_VAR_PPID:
3561	if (!dtrace_priv_proc_relaxed(state))
3562	return (`0`);
3563
3564	/*
3565	* See comment in DIF_VAR_PID.
3566	*/
3567	if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3568	return (`0`);
3569
3570	return ((uint64_t)dtrace_proc_selfppid());
3571
3572	case DIF_VAR_TID:
3573	/ We do not need to check for null current_thread() /
3574	return thread_tid(thread: current_thread()); / globally unique /
3575
3576	case DIF_VAR_PTHREAD_SELF:
3577	if (!dtrace_priv_proc(state))
3578	return (`0`);
3579
3580	/ Not currently supported, but we should be able to delta the dispatchqaddr and dispatchqoffset to get pthread_self /
3581	return `0`;
3582
3583	case DIF_VAR_DISPATCHQADDR:
3584	if (!dtrace_priv_proc(state))
3585	return (`0`);
3586
3587	/ We do not need to check for null current_thread() /
3588	return thread_dispatchqaddr(thread: current_thread());
3589
3590	case DIF_VAR_EXECNAME:
3591	{
3592	char xname = (char* *)mstate->dtms_scratch_ptr;
3593	char *pname = proc_best_name(curproc);
3594	size_t scratch_size = sizeof(proc_name_t);
3595
3596	/ The scratch allocation's lifetime is that of the clause. /
3597	if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
3598	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3599	return `0`;
3600	}
3601
3602	if (!dtrace_priv_proc_relaxed(state))
3603	return (`0`);
3604
3605	mstate->dtms_scratch_ptr += scratch_size;
3606	strlcpy(dst: xname, src: pname, n: scratch_size);
3607
3608	return ((uint64_t)(uintptr_t)xname);
3609	}
3610
3611
3612	case DIF_VAR_ZONENAME:
3613	{
3614	/ scratch_size is equal to length('global') + 1 for the null-terminator. /
3615	char zname = (char* *)mstate->dtms_scratch_ptr;
3616	size_t scratch_size = `6` + `1`;
3617
3618	if (!dtrace_priv_proc(state))
3619	return (`0`);
3620
3621	/ The scratch allocation's lifetime is that of the clause. /
3622	if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
3623	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3624	return `0`;
3625	}
3626
3627	mstate->dtms_scratch_ptr += scratch_size;
3628
3629	/ The kernel does not provide zonename, it will always return 'global'. /
3630	strlcpy(dst: zname, src: "global", n: scratch_size);
3631
3632	return ((uint64_t)(uintptr_t)zname);
3633	}
3634
3635	#if CONFIG_PERVASIVE_CPI && CONFIG_CPU_COUNTERS
3636	case DIF_VAR_CPUINSTRS:
3637	return mt_cur_cpu_instrs();
3638
3639	case DIF_VAR_CPUCYCLES:
3640	return mt_cur_cpu_cycles();
3641
3642	case DIF_VAR_VINSTRS: {
3643	struct recount_usage usage = { `0` };
3644	recount_current_thread_usage(&usage);
3645	return recount_usage_instructions(&usage);
3646	}
3647
3648	case DIF_VAR_VCYCLES: {
3649	struct recount_usage usage = { `0` };
3650	recount_current_thread_usage(&usage);
3651	return recount_usage_cycles(&usage);
3652	}
3653
3654	#else /* CONFIG_PERVASIVE_CPI && CONFIG_CPU_COUNTERS */
3655	case DIF_VAR_CPUINSTRS:
3656	case DIF_VAR_CPUCYCLES:
3657	case DIF_VAR_VINSTRS:
3658	case DIF_VAR_VCYCLES:
3659	return `0`;
3660	#endif /* !CONFIG_PERVASIVE_CPI \|\| !CONFIG_CPU_COUNTERS */
3661
3662	case DIF_VAR_UID:
3663	if (!dtrace_priv_proc_relaxed(state))
3664	return (`0`);
3665
3666	/*
3667	* See comment in DIF_VAR_PID.
3668	*/
3669	if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3670	return (`0`);
3671
3672	return ((uint64_t) dtrace_proc_selfruid());
3673
3674	case DIF_VAR_GID:
3675	if (!dtrace_priv_proc(state))
3676	return (`0`);
3677
3678	/*
3679	* See comment in DIF_VAR_PID.
3680	*/
3681	if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3682	return (`0`);
3683
3684	if (dtrace_CRED() != NULL)
3685	/ Credential does not require lazy initialization. /
3686	return ((uint64_t)kauth_getgid());
3687	else {
3688	/ proc_lock would be taken under kauth_cred_proc_ref() in kauth_cred_get(). /
3689	DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3690	return -`1ULL`;
3691	}
3692
3693	case DIF_VAR_ERRNO: {
3694	uthread_t uthread = current_uthread();
3695	if (!dtrace_priv_proc(state))
3696	return (`0`);
3697
3698	/*
3699	* See comment in DIF_VAR_PID.
3700	*/
3701	if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3702	return (`0`);
3703
3704	if (uthread)
3705	return (uint64_t)uthread->t_dtrace_errno;
3706	else {
3707	DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3708	return -`1ULL`;
3709	}
3710	}
3711
3712	default:
3713	DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3714	return (`0`);
3715	}
3716	}
3717
/*
 * States of the character-at-a-time JSON state machine in dtrace_json().
 * Numbering starts at 1 and follows declaration order.
 */
typedef enum dtrace_json_state {
	DTRACE_JSON_REST = 1,			/* before the top-level object/array */
	DTRACE_JSON_OBJECT,			/* looking for the next key string */
	DTRACE_JSON_STRING,			/* inside a string literal */
	DTRACE_JSON_STRING_ESCAPE,		/* just after a backslash */
	DTRACE_JSON_STRING_ESCAPE_UNICODE,	/* inside a \u escape's hex digits */
	DTRACE_JSON_COLON,			/* expecting ':' after a key */
	DTRACE_JSON_COMMA,			/* expecting ',' after a value */
	DTRACE_JSON_VALUE,			/* at the start of a value */
	DTRACE_JSON_IDENTIFIER,			/* inside true / false / null */
	DTRACE_JSON_NUMBER,			/* integer part of a number */
	DTRACE_JSON_NUMBER_FRAC,		/* fractional part of a number */
	DTRACE_JSON_NUMBER_EXP,			/* exponent part of a number */
	DTRACE_JSON_COLLECT_OBJECT		/* collecting/skipping a nested value */
} dtrace_json_state_t;
3733
3734	/*
3735	* This function possesses just enough knowledge about JSON to extract a single
3736	* value from a JSON string and store it in the scratch buffer. It is able
3737	* to extract nested object values, and members of arrays by index.
3738	*
3739	* elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to
3740	* be looked up as we descend into the object tree. e.g.
3741	*
3742	* foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL
3743	* with nelems = 5.
3744	*
3745	* The run time of this function must be bounded above by strsize to limit the
3746	* amount of work done in probe context. As such, it is implemented as a
3747	* simple state machine, reading one character at a time using safe loads
3748	* until we find the requested element, hit a parsing error or run off the
3749	* end of the object or string.
3750	*
3751	* As there is no way for a subroutine to return an error without interrupting
3752	* clause execution, we simply return NULL in the event of a missing key or any
3753	* other error condition. Each NULL return in this function is commented with
3754	* the error condition it represents -- parsing or otherwise.
3755	*
3756	* The set of states for the state machine closely matches the JSON
3757	* specification (http://json.org/). Briefly:
3758	*
3759	* DTRACE_JSON_REST:
3760	* Skip whitespace until we find either a top-level Object, moving
3761	* to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE.
3762	*
3763	* DTRACE_JSON_OBJECT:
3764	* Locate the next key String in an Object. Sets a flag to denote
3765	* the next String as a key string and moves to DTRACE_JSON_STRING.
3766	*
3767	* DTRACE_JSON_COLON:
3768	* Skip whitespace until we find the colon that separates key Strings
3769	* from their values. Once found, move to DTRACE_JSON_VALUE.
3770	*
3771	* DTRACE_JSON_VALUE:
3772	* Detects the type of the next value (String, Number, Identifier, Object
3773	* or Array) and routes to the states that process that type. Here we also
3774	* deal with the element selector list if we are requested to traverse down
3775	* into the object tree.
3776	*
3777	* DTRACE_JSON_COMMA:
3778	* Skip whitespace until we find the comma that separates key-value pairs
3779	* in Objects (returning to DTRACE_JSON_OBJECT) or values in Arrays
3780	* (similarly DTRACE_JSON_VALUE). All following literal value processing
3781	* states return to this state at the end of their value, unless otherwise
3782	* noted.
3783	*
3784	* DTRACE_JSON_NUMBER, DTRACE_JSON_NUMBER_FRAC, DTRACE_JSON_NUMBER_EXP:
3785	* Processes a Number literal from the JSON, including any exponent
3786	* component that may be present. Numbers are returned as strings, which
3787	* may be passed to strtoll() if an integer is required.
3788	*
3789	* DTRACE_JSON_IDENTIFIER:
3790	* Processes a "true", "false" or "null" literal in the JSON.
3791	*
3792	* DTRACE_JSON_STRING, DTRACE_JSON_STRING_ESCAPE,
3793	* DTRACE_JSON_STRING_ESCAPE_UNICODE:
3794	* Processes a String literal from the JSON, whether the String denotes
3795	* a key, a value or part of a larger Object. Handles all escape sequences
3796	* present in the specification, including four-digit unicode characters,
3797	* but merely includes the escape sequence without converting it to the
3798	* actual escaped character. If the String is flagged as a key, we
3799	* move to DTRACE_JSON_COLON rather than DTRACE_JSON_COMMA.
3800	*
3801	* DTRACE_JSON_COLLECT_OBJECT:
3802	* This state collects an entire Object (or Array), correctly handling
3803	* embedded strings. If the full element selector list matches this nested
3804	* object, we return the Object in full as a string. If not, we use this
3805	* state to skip to the next value at this level and continue processing.
3806	*/
3807	static char *
3808	dtrace_json(uint64_t size, uintptr_t json, char elemlist, int* nelems,
3809	char *dest)
3810	{
3811	dtrace_json_state_t state = DTRACE_JSON_REST;
3812	int64_t array_elem = INT64_MIN;
3813	int64_t array_pos = `0`;
3814	uint8_t escape_unicount = `0`;
3815	boolean_t string_is_key = B_FALSE;
3816	boolean_t collect_object = B_FALSE;
3817	boolean_t found_key = B_FALSE;
3818	boolean_t in_array = B_FALSE;
3819	uint32_t braces = `0`, brackets = `0`;
3820	char *elem = elemlist;
3821	char *dd = dest;
3822	uintptr_t cur;
3823
3824	for (cur = json; cur < json + size; cur++) {
3825	char cc = dtrace_load8(addr: cur);
3826	if (cc == `'\0'`)
3827	return (NULL);
3828
3829	switch (state) {
3830	case DTRACE_JSON_REST:
3831	if (isspace(cc))
3832	break;
3833
3834	if (cc == `'{'`) {
3835	state = DTRACE_JSON_OBJECT;
3836	break;
3837	}
3838
3839	if (cc == `'['`) {
3840	in_array = B_TRUE;
3841	array_pos = `0`;
3842	array_elem = dtrace_strtoll(input: elem, base: `10`, limit: size);
3843	found_key = array_elem == `0` ? B_TRUE : B_FALSE;
3844	state = DTRACE_JSON_VALUE;
3845	break;
3846	}
3847
3848	/*
3849	* ERROR: expected to find a top-level object or array.
3850	*/
3851	return (NULL);
3852	case DTRACE_JSON_OBJECT:
3853	if (isspace(cc))
3854	break;
3855
3856	if (cc == `'"'`) {
3857	state = DTRACE_JSON_STRING;
3858	string_is_key = B_TRUE;
3859	break;
3860	}
3861
3862	/*
3863	* ERROR: either the object did not start with a key
3864	* string, or we've run off the end of the object
3865	* without finding the requested key.
3866	*/
3867	return (NULL);
3868	case DTRACE_JSON_STRING:
3869	if (cc == `'\\'`) {
3870	*dd++ = `'\\'`;
3871	state = DTRACE_JSON_STRING_ESCAPE;
3872	break;
3873	}
3874
3875	if (cc == `'"'`) {
3876	if (collect_object) {
3877	/*
3878	* We don't reset the dest here, as
3879	* the string is part of a larger
3880	* object being collected.
3881	*/
3882	*dd++ = cc;
3883	collect_object = B_FALSE;
3884	state = DTRACE_JSON_COLLECT_OBJECT;
3885	break;
3886	}
3887	*dd = `'\0'`;
3888	dd = dest; / reset string buffer /
3889	if (string_is_key) {
3890	if (dtrace_strncmp(s1: dest, s2: elem,
3891	limit: size) == `0`)
3892	found_key = B_TRUE;
3893	} else if (found_key) {
3894	if (nelems > `1`) {
3895	/*
3896	* We expected an object, not
3897	* this string.
3898	*/
3899	return (NULL);
3900	}
3901	return (dest);
3902	}
3903	state = string_is_key ? DTRACE_JSON_COLON :
3904	DTRACE_JSON_COMMA;
3905	string_is_key = B_FALSE;
3906	break;
3907	}
3908
3909	*dd++ = cc;
3910	break;
3911	case DTRACE_JSON_STRING_ESCAPE:
3912	*dd++ = cc;
3913	if (cc == `'u'`) {
3914	escape_unicount = `0`;
3915	state = DTRACE_JSON_STRING_ESCAPE_UNICODE;
3916	} else {
3917	state = DTRACE_JSON_STRING;
3918	}
3919	break;
3920	case DTRACE_JSON_STRING_ESCAPE_UNICODE:
3921	if (!isxdigit(cc)) {
3922	/*
3923	* ERROR: invalid unicode escape, expected
3924	* four valid hexadecimal digits.
3925	*/
3926	return (NULL);
3927	}
3928
3929	*dd++ = cc;
3930	if (++escape_unicount == `4`)
3931	state = DTRACE_JSON_STRING;
3932	break;
3933	case DTRACE_JSON_COLON:
3934	if (isspace(cc))
3935	break;
3936
3937	if (cc == `':'`) {
3938	state = DTRACE_JSON_VALUE;
3939	break;
3940	}
3941
3942	/*
3943	* ERROR: expected a colon.
3944	*/
3945	return (NULL);
3946	case DTRACE_JSON_COMMA:
3947	if (isspace(cc))
3948	break;
3949
3950	if (cc == `','`) {
3951	if (in_array) {
3952	state = DTRACE_JSON_VALUE;
3953	if (++array_pos == array_elem)
3954	found_key = B_TRUE;
3955	} else {
3956	state = DTRACE_JSON_OBJECT;
3957	}
3958	break;
3959	}
3960
3961	/*
3962	* ERROR: either we hit an unexpected character, or
3963	* we reached the end of the object or array without
3964	* finding the requested key.
3965	*/
3966	return (NULL);
3967	case DTRACE_JSON_IDENTIFIER:
3968	if (islower(cc)) {
3969	*dd++ = cc;
3970	break;
3971	}
3972
3973	*dd = `'\0'`;
3974	dd = dest; / reset string buffer /
3975
3976	if (dtrace_strncmp(s1: dest, s2: "true", limit: `5`) == `0` \|\|
3977	dtrace_strncmp(s1: dest, s2: "false", limit: `6`) == `0` \|\|
3978	dtrace_strncmp(s1: dest, s2: "null", limit: `5`) == `0`) {
3979	if (found_key) {
3980	if (nelems > `1`) {
3981	/*
3982	* ERROR: We expected an object,
3983	* not this identifier.
3984	*/
3985	return (NULL);
3986	}
3987	return (dest);
3988	} else {
3989	cur--;
3990	state = DTRACE_JSON_COMMA;
3991	break;
3992	}
3993	}
3994
3995	/*
3996	* ERROR: we did not recognise the identifier as one
3997	* of those in the JSON specification.
3998	*/
3999	return (NULL);
4000	case DTRACE_JSON_NUMBER:
4001	if (cc == `'.'`) {
4002	*dd++ = cc;
4003	state = DTRACE_JSON_NUMBER_FRAC;
4004	break;
4005	}
4006
4007	if (cc == `'x'` \|\| cc == `'X'`) {
4008	/*
4009	* ERROR: specification explicitly excludes
4010	* hexadecimal or octal numbers.
4011	*/
4012	return (NULL);
4013	}
4014
4015	OS_FALLTHROUGH;
4016	case DTRACE_JSON_NUMBER_FRAC:
4017	if (cc == `'e'` \|\| cc == `'E'`) {
4018	*dd++ = cc;
4019	state = DTRACE_JSON_NUMBER_EXP;
4020	break;
4021	}
4022
4023	if (cc == `'+'` \|\| cc == `'-'`) {
4024	/*
4025	* ERROR: expect sign as part of exponent only.
4026	*/
4027	return (NULL);
4028	}
4029	OS_FALLTHROUGH;
4030	case DTRACE_JSON_NUMBER_EXP:
4031	if (isdigit(cc) \|\| cc == `'+'` \|\| cc == `'-'`) {
4032	*dd++ = cc;
4033	break;
4034	}
4035
4036	*dd = `'\0'`;
4037	dd = dest; / reset string buffer /
4038	if (found_key) {
4039	if (nelems > `1`) {
4040	/*
4041	* ERROR: We expected an object, not
4042	* this number.
4043	*/
4044	return (NULL);
4045	}
4046	return (dest);
4047	}
4048
4049	cur--;
4050	state = DTRACE_JSON_COMMA;
4051	break;
4052	case DTRACE_JSON_VALUE:
4053	if (isspace(cc))
4054	break;
4055
4056	if (cc == `'{'` \|\| cc == `'['`) {
4057	if (nelems > `1` && found_key) {
4058	in_array = cc == `'['` ? B_TRUE : B_FALSE;
4059	/*
4060	* If our element selector directs us
4061	* to descend into this nested object,
4062	* then move to the next selector
4063	* element in the list and restart the
4064	* state machine.
4065	*/
4066	while (*elem != `'\0'`)
4067	elem++;
4068	elem++; / skip the inter-element NUL /
4069	nelems--;
4070	dd = dest;
4071	if (in_array) {
4072	state = DTRACE_JSON_VALUE;
4073	array_pos = `0`;
4074	array_elem = dtrace_strtoll(
4075	input: elem, base: `10`, limit: size);
4076	found_key = array_elem == `0` ?
4077	B_TRUE : B_FALSE;
4078	} else {
4079	found_key = B_FALSE;
4080	state = DTRACE_JSON_OBJECT;
4081	}
4082	break;
4083	}
4084
4085	/*
4086	* Otherwise, we wish to either skip this
4087	* nested object or return it in full.
4088	*/
4089	if (cc == `'['`)
4090	brackets = `1`;
4091	else
4092	braces = `1`;
4093	*dd++ = cc;
4094	state = DTRACE_JSON_COLLECT_OBJECT;
4095	break;
4096	}
4097
4098	if (cc == `'"'`) {
4099	state = DTRACE_JSON_STRING;
4100	break;
4101	}
4102
4103	if (islower(cc)) {
4104	/*
4105	* Here we deal with true, false and null.
4106	*/
4107	*dd++ = cc;
4108	state = DTRACE_JSON_IDENTIFIER;
4109	break;
4110	}
4111
4112	if (cc == `'-'` \|\| isdigit(cc)) {
4113	*dd++ = cc;
4114	state = DTRACE_JSON_NUMBER;
4115	break;
4116	}
4117
4118	/*
4119	* ERROR: unexpected character at start of value.
4120	*/
4121	return (NULL);
4122	case DTRACE_JSON_COLLECT_OBJECT:
4123	if (cc == `'\0'`)
4124	/*
4125	* ERROR: unexpected end of input.
4126	*/
4127	return (NULL);
4128
4129	*dd++ = cc;
4130	if (cc == `'"'`) {
4131	collect_object = B_TRUE;
4132	state = DTRACE_JSON_STRING;
4133	break;
4134	}
4135
4136	if (cc == `']'`) {
4137	if (brackets-- == `0`) {
4138	/*
4139	* ERROR: unbalanced brackets.
4140	*/
4141	return (NULL);
4142	}
4143	} else if (cc == `'}'`) {
4144	if (braces-- == `0`) {
4145	/*
4146	* ERROR: unbalanced braces.
4147	*/
4148	return (NULL);
4149	}
4150	} else if (cc == `'{'`) {
4151	braces++;
4152	} else if (cc == `'['`) {
4153	brackets++;
4154	}
4155
4156	if (brackets == `0` && braces == `0`) {
4157	if (found_key) {
4158	*dd = `'\0'`;
4159	return (dest);
4160	}
4161	dd = dest; / reset string buffer /
4162	state = DTRACE_JSON_COMMA;
4163	}
4164	break;
4165	}
4166	}
4167	return (NULL);
4168	}
4169
4170	/*
4171	* Emulate the execution of DTrace ID subroutines invoked by the call opcode.
4172	* Notice that we don't bother validating the proper number of arguments or
4173	* their types in the tuple stack. This isn't needed: all argument
4174	* interpretation is made safe by our load safety -- the worst that can
4175	* happen is that a bogus program can obtain bogus results.
4176	*/
4177	static void
4178	dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
4179	dtrace_key_t tupregs, int* nargs,
4180	dtrace_mstate_t mstate, dtrace_state_t state)
4181	{
4182	volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
4183	volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
4184	dtrace_vstate_t *vstate = &state->dts_vstate;
4185
4186	#if !defined(__APPLE__)
4187	union {
4188	mutex_impl_t mi;
4189	uint64_t mx;
4190	} m;
4191
4192	union {
4193	krwlock_t ri;
4194	uintptr_t rw;
4195	} r;
4196	#else
4197	/* FIXME: awaits lock/mutex work */
4198	#endif /* __APPLE__ */
4199
4200	switch (subr) {
4201	case DIF_SUBR_RAND:
4202	regs[rd] = dtrace_xoroshiro128_plus_next(
4203	state->dts_rstate[CPU->cpu_id]);
4204	break;
4205
4206	#if !defined(__APPLE__)
4207	case DIF_SUBR_MUTEX_OWNED:
4208	if (!dtrace_canload(tupregs[`0`].dttk_value, sizeof (kmutex_t),
4209	mstate, vstate)) {
4210	regs[rd] = `0`;
4211	break;
4212	}
4213
4214	m.mx = dtrace_load64(tupregs[`0`].dttk_value);
4215	if (MUTEX_TYPE_ADAPTIVE(&m.mi))
4216	regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
4217	else
4218	regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
4219	break;
4220
4221	case DIF_SUBR_MUTEX_OWNER:
4222	if (!dtrace_canload(tupregs[`0`].dttk_value, sizeof (kmutex_t),
4223	mstate, vstate)) {
4224	regs[rd] = `0`;
4225	break;
4226	}
4227
4228	m.mx = dtrace_load64(tupregs[`0`].dttk_value);
4229	if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
4230	MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
4231	regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
4232	else
4233	regs[rd] = `0`;
4234	break;
4235
4236	case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
4237	if (!dtrace_canload(tupregs[`0`].dttk_value, sizeof (kmutex_t),
4238	mstate, vstate)) {
4239	regs[rd] = `0`;
4240	break;
4241	}
4242
4243	m.mx = dtrace_load64(tupregs[`0`].dttk_value);
4244	regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
4245	break;
4246
4247	case DIF_SUBR_MUTEX_TYPE_SPIN:
4248	if (!dtrace_canload(tupregs[`0`].dttk_value, sizeof (kmutex_t),
4249	mstate, vstate)) {
4250	regs[rd] = `0`;
4251	break;
4252	}
4253
4254	m.mx = dtrace_load64(tupregs[`0`].dttk_value);
4255	regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
4256	break;
4257
4258	case DIF_SUBR_RW_READ_HELD: {
4259	uintptr_t tmp;
4260
4261	if (!dtrace_canload(tupregs[`0`].dttk_value, sizeof (uintptr_t),
4262	mstate, vstate)) {
4263	regs[rd] = `0`;
4264	break;
4265	}
4266
4267	r.rw = dtrace_loadptr(tupregs[`0`].dttk_value);
4268	regs[rd] = _RW_READ_HELD(&r.ri, tmp);
4269	break;
4270	}
4271
4272	case DIF_SUBR_RW_WRITE_HELD:
4273	if (!dtrace_canload(tupregs[`0`].dttk_value, sizeof (krwlock_t),
4274	mstate, vstate)) {
4275	regs[rd] = `0`;
4276	break;
4277	}
4278
4279	r.rw = dtrace_loadptr(tupregs[`0`].dttk_value);
4280	regs[rd] = _RW_WRITE_HELD(&r.ri);
4281	break;
4282
4283	case DIF_SUBR_RW_ISWRITER:
4284	if (!dtrace_canload(tupregs[`0`].dttk_value, sizeof (krwlock_t),
4285	mstate, vstate)) {
4286	regs[rd] = `0`;
4287	break;
4288	}
4289
4290	r.rw = dtrace_loadptr(tupregs[`0`].dttk_value);
4291	regs[rd] = _RW_ISWRITER(&r.ri);
4292	break;
4293	#else
4294	/* FIXME: awaits lock/mutex work */
4295	#endif /* __APPLE__ */
4296
4297	case DIF_SUBR_BCOPY: {
4298	/*
4299	* We need to be sure that the destination is in the scratch
4300	* region -- no other region is allowed.
4301	*/
4302	uintptr_t src = tupregs[`0`].dttk_value;
4303	uintptr_t dest = tupregs[`1`].dttk_value;
4304	size_t size = tupregs[`2`].dttk_value;
4305
4306	if (!dtrace_inscratch(dest, size, mstate)) {
4307	*flags \|= CPU_DTRACE_BADADDR;
4308	*illval = regs[rd];
4309	break;
4310	}
4311
4312	if (!dtrace_canload(addr: src, sz: size, mstate, vstate)) {
4313	regs[rd] = `0`;
4314	break;
4315	}
4316
4317	dtrace_bcopy(src: (void )src, dst: (void* *)dest, len: size);
4318	break;
4319	}
4320
4321	case DIF_SUBR_ALLOCA:
4322	case DIF_SUBR_COPYIN: {
4323	uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, `8`);
4324	uint64_t size =
4325	tupregs[subr == DIF_SUBR_ALLOCA ? `0` : `1`].dttk_value;
4326	size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
4327
4328	/*
4329	* Check whether the user can access kernel memory
4330	*/
4331	if (dtrace_priv_kernel(state) == `0`) {
4332	DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
4333	regs[rd] = `0`;
4334	break;
4335	}
4336	/*
4337	* This action doesn't require any credential checks since
4338	* probes will not activate in user contexts to which the
4339	* enabling user does not have permissions.
4340	*/
4341
4342	/*
4343	* Rounding up the user allocation size could have overflowed
4344	* a large, bogus allocation (like -1ULL) to 0.
4345	*/
4346	if (scratch_size < size \|\|
4347	!DTRACE_INSCRATCH(mstate, scratch_size)) {
4348	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4349	regs[rd] = `0`;
4350	break;
4351	}
4352
4353	if (subr == DIF_SUBR_COPYIN) {
4354	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4355	if (dtrace_priv_proc(state))
4356	dtrace_copyin(tupregs[`0`].dttk_value, dest, size, flags);
4357	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4358	}
4359
4360	mstate->dtms_scratch_ptr += scratch_size;
4361	regs[rd] = dest;
4362	break;
4363	}
4364
4365	case DIF_SUBR_COPYINTO: {
4366	uint64_t size = tupregs[`1`].dttk_value;
4367	uintptr_t dest = tupregs[`2`].dttk_value;
4368
4369	/*
4370	* This action doesn't require any credential checks since
4371	* probes will not activate in user contexts to which the
4372	* enabling user does not have permissions.
4373	*/
4374	if (!dtrace_inscratch(dest, size, mstate)) {
4375	*flags \|= CPU_DTRACE_BADADDR;
4376	*illval = regs[rd];
4377	break;
4378	}
4379
4380	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4381	if (dtrace_priv_proc(state))
4382	dtrace_copyin(tupregs[`0`].dttk_value, dest, size, flags);
4383	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4384	break;
4385	}
4386
4387	case DIF_SUBR_COPYINSTR: {
4388	uintptr_t dest = mstate->dtms_scratch_ptr;
4389	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4390
4391	if (nargs > `1` && tupregs[`1`].dttk_value < size)
4392	size = tupregs[`1`].dttk_value + `1`;
4393
4394	/*
4395	* This action doesn't require any credential checks since
4396	* probes will not activate in user contexts to which the
4397	* enabling user does not have permissions.
4398	*/
4399	if (!DTRACE_INSCRATCH(mstate, size)) {
4400	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4401	regs[rd] = `0`;
4402	break;
4403	}
4404
4405	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4406	if (dtrace_priv_proc(state))
4407	dtrace_copyinstr(tupregs[`0`].dttk_value, dest, size, flags);
4408	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4409
4410	((char *)dest)[size - `1`] = `'\0'`;
4411	mstate->dtms_scratch_ptr += size;
4412	regs[rd] = dest;
4413	break;
4414	}
4415
4416	case DIF_SUBR_MSGSIZE:
4417	case DIF_SUBR_MSGDSIZE: {
4418	/* Darwin does not implement SysV streams messages */
4419	DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4420	regs[rd] = `0`;
4421	break;
4422	}
4423
4424	case DIF_SUBR_PROGENYOF: {
4425	pid_t pid = tupregs[`0`].dttk_value;
4426	struct proc *p = current_proc();
4427	int rval = `0`, lim = nprocs;
4428
4429	while(p && (lim-- > `0`)) {
4430	pid_t ppid;
4431
4432	ppid = (pid_t)dtrace_load32(addr: (uintptr_t)&(p->p_pid));
4433	if (*flags & CPU_DTRACE_FAULT)
4434	break;
4435
4436	if (ppid == pid) {
4437	rval = `1`;
4438	break;
4439	}
4440
4441	if (ppid == `0`)
4442	break; / Can't climb process tree any further. /
4443
4444	p = (struct proc *)dtrace_loadptr(addr: (uintptr_t)&(p->p_pptr));
4445	#if __has_feature(ptrauth_calls)
4446	p = ptrauth_strip(p, ptrauth_key_process_independent_data);
4447	#endif
4448	if (*flags & CPU_DTRACE_FAULT)
4449	break;
4450	}
4451
4452	regs[rd] = rval;
4453	break;
4454	}
4455
4456	case DIF_SUBR_SPECULATION:
4457	regs[rd] = dtrace_speculation(state);
4458	break;
4459
4460
4461	case DIF_SUBR_COPYOUT: {
4462	uintptr_t kaddr = tupregs[`0`].dttk_value;
4463	user_addr_t uaddr = tupregs[`1`].dttk_value;
4464	uint64_t size = tupregs[`2`].dttk_value;
4465
4466	if (!dtrace_destructive_disallow &&
4467	dtrace_priv_proc_control(state) &&
4468	!dtrace_istoxic(kaddr, size) &&
4469	dtrace_canload(addr: kaddr, sz: size, mstate, vstate)) {
4470	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4471	dtrace_copyout(kaddr, uaddr, size, flags);
4472	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4473	}
4474	break;
4475	}
4476
4477	case DIF_SUBR_COPYOUTSTR: {
4478	uintptr_t kaddr = tupregs[`0`].dttk_value;
4479	user_addr_t uaddr = tupregs[`1`].dttk_value;
4480	uint64_t size = tupregs[`2`].dttk_value;
4481	size_t lim;
4482
4483	if (!dtrace_destructive_disallow &&
4484	dtrace_priv_proc_control(state) &&
4485	!dtrace_istoxic(kaddr, size) &&
4486	dtrace_strcanload(addr: kaddr, sz: size, remain: &lim, mstate, vstate)) {
4487	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4488	dtrace_copyoutstr(kaddr, uaddr, lim, flags);
4489	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4490	}
4491	break;
4492	}
4493
4494	case DIF_SUBR_STRLEN: {
4495	size_t size = state->dts_options[DTRACEOPT_STRSIZE];
4496	uintptr_t addr = (uintptr_t)tupregs[`0`].dttk_value;
4497	size_t lim;
4498
4499	if (!dtrace_strcanload(addr, sz: size, remain: &lim, mstate, vstate)) {
4500	regs[rd] = `0`;
4501	break;
4502	}
4503
4504	regs[rd] = dtrace_strlen(s: (char *)addr, lim);
4505
4506	break;
4507	}
4508
4509	case DIF_SUBR_STRCHR:
4510	case DIF_SUBR_STRRCHR: {
4511	/*
4512	* We're going to iterate over the string looking for the
4513	* specified character. We will iterate until we have reached
4514	* the string length or we have found the character. If this
4515	* is DIF_SUBR_STRRCHR, we will look for the last occurrence
4516	* of the specified character instead of the first.
4517	*/
4518	uintptr_t addr = tupregs[`0`].dttk_value;
4519	uintptr_t addr_limit;
4520	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4521	size_t lim;
4522	char c, target = (char)tupregs[`1`].dttk_value;
4523
4524	if (!dtrace_strcanload(addr, sz: size, remain: &lim, mstate, vstate)) {
4525	regs[rd] = `0`;
4526	break;
4527	}
4528	addr_limit = addr + lim;
4529
4530	for (regs[rd] = `0`; addr < addr_limit; addr++) {
4531	if ((c = dtrace_load8(addr)) == target) {
4532	regs[rd] = addr;
4533
4534	if (subr == DIF_SUBR_STRCHR)
4535	break;
4536	}
4537
4538	if (c == `'\0'`)
4539	break;
4540	}
4541
4542	break;
4543	}
4544
4545	case DIF_SUBR_STRSTR:
4546	case DIF_SUBR_INDEX:
4547	case DIF_SUBR_RINDEX: {
4548	/*
4549	* We're going to iterate over the string looking for the
4550	* specified string. We will iterate until we have reached
4551	* the string length or we have found the string. (Yes, this
4552	* is done in the most naive way possible -- but considering
4553	* that the string we're searching for is likely to be
4554	* relatively short, the complexity of Rabin-Karp or similar
4555	* hardly seems merited.)
4556	*/
4557	char addr = (char* *)(uintptr_t)tupregs[`0`].dttk_value;
4558	char substr = (char* *)(uintptr_t)tupregs[`1`].dttk_value;
4559	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4560	size_t len = dtrace_strlen(s: addr, lim: size);
4561	size_t sublen = dtrace_strlen(s: substr, lim: size);
4562	char limit = addr + len, orig = addr;
4563	int notfound = subr == DIF_SUBR_STRSTR ? `0` : -`1`;
4564	int inc = `1`;
4565
4566	regs[rd] = notfound;
4567
4568	if (!dtrace_canload(addr: (uintptr_t)addr, sz: len + `1`, mstate, vstate)) {
4569	regs[rd] = `0`;
4570	break;
4571	}
4572
4573	if (!dtrace_canload(addr: (uintptr_t)substr, sz: sublen + `1`, mstate,
4574	vstate)) {
4575	regs[rd] = `0`;
4576	break;
4577	}
4578
4579	/*
4580	* strstr() and index()/rindex() have similar semantics if
4581	* both strings are the empty string: strstr() returns a
4582	* pointer to the (empty) string, and index() and rindex()
4583	* both return index 0 (regardless of any position argument).
4584	*/
4585	if (sublen == `0` && len == `0`) {
4586	if (subr == DIF_SUBR_STRSTR)
4587	regs[rd] = (uintptr_t)addr;
4588	else
4589	regs[rd] = `0`;
4590	break;
4591	}
4592
4593	if (subr != DIF_SUBR_STRSTR) {
4594	if (subr == DIF_SUBR_RINDEX) {
4595	limit = orig - `1`;
4596	addr += len;
4597	inc = -`1`;
4598	}
4599
4600	/*
4601	* Both index() and rindex() take an optional position
4602	* argument that denotes the starting position.
4603	*/
4604	if (nargs == `3`) {
4605	int64_t pos = (int64_t)tupregs[`2`].dttk_value;
4606
4607	/*
4608	* If the position argument to index() is
4609	* negative, Perl implicitly clamps it at
4610	* zero. This semantic is a little surprising
4611	* given the special meaning of negative
4612	* positions to similar Perl functions like
4613	* substr(), but it appears to reflect a
4614	* notion that index() can start from a
4615	* negative index and increment its way up to
4616	* the string. Given this notion, Perl's
4617	* rindex() is at least self-consistent in
4618	* that it implicitly clamps positions greater
4619	* than the string length to be the string
4620	* length. Where Perl completely loses
4621	* coherence, however, is when the specified
4622	* substring is the empty string (""). In
4623	* this case, even if the position is
4624	* negative, rindex() returns 0 -- and even if
4625	* the position is greater than the length,
4626	* index() returns the string length. These
4627	* semantics violate the notion that index()
4628	* should never return a value less than the
4629	* specified position and that rindex() should
4630	* never return a value greater than the
4631	* specified position. (One assumes that
4632	* these semantics are artifacts of Perl's
4633	* implementation and not the results of
4634	* deliberate design -- it beggars belief that
4635	* even Larry Wall could desire such oddness.)
4636	* While in the abstract one would wish for
4637	* consistent position semantics across
4638	* substr(), index() and rindex() -- or at the
4639	* very least self-consistent position
4640	* semantics for index() and rindex() -- we
4641	* instead opt to keep with the extant Perl
4642	* semantics, in all their broken glory. (Do
4643	* we have more desire to maintain Perl's
4644	* semantics than Perl does? Probably.)
4645	*/
4646	if (subr == DIF_SUBR_RINDEX) {
4647	if (pos < `0`) {
4648	if (sublen == `0`)
4649	regs[rd] = `0`;
4650	break;
4651	}
4652
4653	if ((size_t)pos > len)
4654	pos = len;
4655	} else {
4656	if (pos < `0`)
4657	pos = `0`;
4658
4659	if ((size_t)pos >= len) {
4660	if (sublen == `0`)
4661	regs[rd] = len;
4662	break;
4663	}
4664	}
4665
4666	addr = orig + pos;
4667	}
4668	}
4669
4670	for (regs[rd] = notfound; addr != limit; addr += inc) {
4671	if (dtrace_strncmp(s1: addr, s2: substr, limit: sublen) == `0`) {
4672	if (subr != DIF_SUBR_STRSTR) {
4673	/*
4674	* As D index() and rindex() are
4675	* modeled on Perl (and not on awk),
4676	* we return a zero-based (and not a
4677	* one-based) index. (For you Perl
4678	* weenies: no, we're not going to add
4679	* $[ -- and shouldn't you be at a con
4680	* or something?)
4681	*/
4682	regs[rd] = (uintptr_t)(addr - orig);
4683	break;
4684	}
4685
4686	ASSERT(subr == DIF_SUBR_STRSTR);
4687	regs[rd] = (uintptr_t)addr;
4688	break;
4689	}
4690	}
4691
4692	break;
4693	}
4694
4695	case DIF_SUBR_STRTOK: {
4696	uintptr_t addr = tupregs[`0`].dttk_value;
4697	uintptr_t tokaddr = tupregs[`1`].dttk_value;
4698	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4699	uintptr_t limit, toklimit;
4700	size_t clim;
4701	char dest = (char* *)mstate->dtms_scratch_ptr;
4702	uint8_t c=`'\0'`, tokmap[`32`]; / 256 / 8 /
4703	uint64_t i = `0`;
4704
4705	/*
4706	* Check both the token buffer and (later) the input buffer,
4707	* since both could be non-scratch addresses.
4708	*/
4709	if (!dtrace_strcanload(addr: tokaddr, sz: size, remain: &clim, mstate, vstate)) {
4710	regs[rd] = `0`;
4711	break;
4712	}
4713	toklimit = tokaddr + clim;
4714
4715	if (!DTRACE_INSCRATCH(mstate, size)) {
4716	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4717	regs[rd] = `0`;
4718	break;
4719	}
4720
4721	if (addr == `0`) {
4722	/*
4723	* If the address specified is NULL, we use our saved
4724	* strtok pointer from the mstate. Note that this
4725	* means that the saved strtok pointer is _only_
4726	* valid within multiple enablings of the same probe --
4727	* it behaves like an implicit clause-local variable.
4728	*/
4729	addr = mstate->dtms_strtok;
4730	limit = mstate->dtms_strtok_limit;
4731	} else {
4732	/*
4733	* If the user-specified address is non-NULL we must
4734	* access check it. This is the only time we have
4735	* a chance to do so, since this address may reside
4736	* in the string table of this clause-- future calls
4737	* (when we fetch addr from mstate->dtms_strtok)
4738	* would fail this access check.
4739	*/
4740	if (!dtrace_strcanload(addr, sz: size, remain: &clim, mstate,
4741	vstate)) {
4742	regs[rd] = `0`;
4743	break;
4744	}
4745	limit = addr + clim;
4746	}
4747
4748	/*
4749	* First, zero the token map, and then process the token
4750	* string -- setting a bit in the map for every character
4751	* found in the token string.
4752	*/
4753	for (i = `0`; i < (int)sizeof (tokmap); i++)
4754	tokmap[i] = `0`;
4755
4756	for (; tokaddr < toklimit; tokaddr++) {
4757	if ((c = dtrace_load8(addr: tokaddr)) == `'\0'`)
4758	break;
4759
4760	ASSERT((c >> `3`) < sizeof (tokmap));
4761	tokmap[c >> `3`] \|= (`1` << (c & `0x7`));
4762	}
4763
4764	for (; addr < limit; addr++) {
4765	/*
4766	* We're looking for a character that is _not_
4767	* contained in the token string.
4768	*/
4769	if ((c = dtrace_load8(addr)) == `'\0'`)
4770	break;
4771
4772	if (!(tokmap[c >> `3`] & (`1` << (c & `0x7`))))
4773	break;
4774	}
4775
4776	if (c == `'\0'`) {
4777	/*
4778	* We reached the end of the string without finding
4779	* any character that was not in the token string.
4780	* We return NULL in this case, and we set the saved
4781	* address to NULL as well.
4782	*/
4783	regs[rd] = `0`;
4784	mstate->dtms_strtok = `0`;
4785	mstate->dtms_strtok_limit = `0`;
4786	break;
4787	}
4788
4789	/*
4790	* From here on, we're copying into the destination string.
4791	*/
4792	for (i = `0`; addr < limit && i < size - `1`; addr++) {
4793	if ((c = dtrace_load8(addr)) == `'\0'`)
4794	break;
4795
4796	if (tokmap[c >> `3`] & (`1` << (c & `0x7`)))
4797	break;
4798
4799	ASSERT(i < size);
4800	dest[i++] = c;
4801	}
4802
4803	ASSERT(i < size);
4804	dest[i] = `'\0'`;
4805	regs[rd] = (uintptr_t)dest;
4806	mstate->dtms_scratch_ptr += size;
4807	mstate->dtms_strtok = addr;
4808	mstate->dtms_strtok_limit = limit;
4809	break;
4810	}
4811
4812	case DIF_SUBR_SUBSTR: {
4813	uintptr_t s = tupregs[`0`].dttk_value;
4814	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4815	char d = (char* *)mstate->dtms_scratch_ptr;
4816	int64_t index = (int64_t)tupregs[`1`].dttk_value;
4817	int64_t remaining = (int64_t)tupregs[`2`].dttk_value;
4818	size_t len = dtrace_strlen(s: (char *)s, lim: size);
4819	int64_t i = `0`;
4820
4821	if (!dtrace_canload(addr: s, sz: len + `1`, mstate, vstate)) {
4822	regs[rd] = `0`;
4823	break;
4824	}
4825
4826	if (!DTRACE_INSCRATCH(mstate, size)) {
4827	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4828	regs[rd] = `0`;
4829	break;
4830	}
4831
4832	if (nargs <= `2`)
4833	remaining = (int64_t)size;
4834
4835	if (index < `0`) {
4836	index += len;
4837
4838	if (index < `0` && index + remaining > `0`) {
4839	remaining += index;
4840	index = `0`;
4841	}
4842	}
4843
4844	if ((size_t)index >= len \|\| index < `0`) {
4845	remaining = `0`;
4846	} else if (remaining < `0`) {
4847	remaining += len - index;
4848	} else if ((uint64_t)index + (uint64_t)remaining > size) {
4849	remaining = size - index;
4850	}
4851
4852	for (i = `0`; i < remaining; i++) {
4853	if ((d[i] = dtrace_load8(addr: s + index + i)) == `'\0'`)
4854	break;
4855	}
4856
4857	d[i] = `'\0'`;
4858
4859	mstate->dtms_scratch_ptr += size;
4860	regs[rd] = (uintptr_t)d;
4861	break;
4862	}
4863
4864	case DIF_SUBR_GETMAJOR:
4865	regs[rd] = (uintptr_t)major( (dev_t)tupregs[`0`].dttk_value );
4866	break;
4867
4868	case DIF_SUBR_GETMINOR:
4869	regs[rd] = (uintptr_t)minor( (dev_t)tupregs[`0`].dttk_value );
4870	break;
4871
4872	case DIF_SUBR_DDI_PATHNAME: {
4873	/* APPLE NOTE: currently unsupported on Darwin */
4874	DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4875	regs[rd] = `0`;
4876	break;
4877	}
4878
4879	case DIF_SUBR_STRJOIN: {
4880	char d = (char* *)mstate->dtms_scratch_ptr;
4881	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4882	uintptr_t s1 = tupregs[`0`].dttk_value;
4883	uintptr_t s2 = tupregs[`1`].dttk_value;
4884	uint64_t i = `0`, j = `0`;
4885	size_t lim1, lim2;
4886	char c;
4887
4888	if (!dtrace_strcanload(addr: s1, sz: size, remain: &lim1, mstate, vstate) \|\|
4889	!dtrace_strcanload(addr: s2, sz: size, remain: &lim2, mstate, vstate)) {
4890	regs[rd] = `0`;
4891	break;
4892	}
4893
4894	if (!DTRACE_INSCRATCH(mstate, size)) {
4895	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4896	regs[rd] = `0`;
4897	break;
4898	}
4899
4900	for (;;) {
4901	if (i >= size) {
4902	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4903	regs[rd] = `0`;
4904	break;
4905	}
4906	c = (i >= lim1) ? `'\0'` : dtrace_load8(addr: s1++);
4907	if ((d[i++] = c) == `'\0'`) {
4908	i--;
4909	break;
4910	}
4911	}
4912
4913	for (;;) {
4914	if (i >= size) {
4915	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4916	regs[rd] = `0`;
4917	break;
4918	}
4919	c = (j++ >= lim2) ? `'\0'` : dtrace_load8(addr: s2++);
4920	if ((d[i++] = c) == `'\0'`)
4921	break;
4922	}
4923
4924	if (i < size) {
4925	mstate->dtms_scratch_ptr += i;
4926	regs[rd] = (uintptr_t)d;
4927	}
4928
4929	break;
4930	}
4931
4932	case DIF_SUBR_STRTOLL: {
4933	uintptr_t s = tupregs[`0`].dttk_value;
4934	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4935	size_t lim;
4936	int base = `10`;
4937
4938	if (nargs > `1`) {
4939	if ((base = tupregs[`1`].dttk_value) <= `1` \|\|
4940	base > (`'z'` - `'a'` + `1`) + (`'9'` - `'0'` + `1`)) {
4941	*flags \|= CPU_DTRACE_ILLOP;
4942	break;
4943	}
4944	}
4945
4946	if (!dtrace_strcanload(addr: s, sz: size, remain: &lim, mstate, vstate)) {
4947	regs[rd] = INT64_MIN;
4948	break;
4949	}
4950
4951	regs[rd] = dtrace_strtoll(input: (char *)s, base, limit: lim);
4952	break;
4953	}
4954
4955	case DIF_SUBR_LLTOSTR: {
4956	int64_t i = (int64_t)tupregs[`0`].dttk_value;
4957	uint64_t val, digit;
4958	uint64_t size = `65`; / enough room for 2^64 in binary /
4959	char end = (char* *)mstate->dtms_scratch_ptr + size - `1`;
4960	int base = `10`;
4961
4962	if (nargs > `1`) {
4963	if ((base = tupregs[`1`].dttk_value) <= `1` \|\|
4964	base > (`'z'` - `'a'` + `1`) + (`'9'` - `'0'` + `1`)) {
4965	*flags \|= CPU_DTRACE_ILLOP;
4966	break;
4967	}
4968	}
4969
4970	val = (base == `10` && i < `0`) ? i * -`1` : i;
4971
4972	if (!DTRACE_INSCRATCH(mstate, size)) {
4973	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4974	regs[rd] = `0`;
4975	break;
4976	}
4977
4978	for (*end-- = `'\0'`; val; val /= base) {
4979	if ((digit = val % base) <= `'9'` - `'0'`) {
4980	*end-- = `'0'` + digit;
4981	} else {
4982	*end-- = `'a'` + (digit - (`'9'` - `'0'`) - `1`);
4983	}
4984	}
4985
4986	if (i == `0` && base == `16`)
4987	*end-- = `'0'`;
4988
4989	if (base == `16`)
4990	*end-- = `'x'`;
4991
4992	if (i == `0` \|\| base == `8` \|\| base == `16`)
4993	*end-- = `'0'`;
4994
4995	if (i < `0` && base == `10`)
4996	*end-- = `'-'`;
4997
4998	regs[rd] = (uintptr_t)end + `1`;
4999	mstate->dtms_scratch_ptr += size;
5000	break;
5001	}
5002
5003	case DIF_SUBR_HTONS:
5004	case DIF_SUBR_NTOHS:
5005	#ifdef _BIG_ENDIAN
5006	regs[rd] = (uint16_t)tupregs[`0`].dttk_value;
5007	#else
5008	regs[rd] = DT_BSWAP_16((uint16_t)tupregs[`0`].dttk_value);
5009	#endif
5010	break;
5011
5012
5013	case DIF_SUBR_HTONL:
5014	case DIF_SUBR_NTOHL:
5015	#ifdef _BIG_ENDIAN
5016	regs[rd] = (uint32_t)tupregs[`0`].dttk_value;
5017	#else
5018	regs[rd] = DT_BSWAP_32((uint32_t)tupregs[`0`].dttk_value);
5019	#endif
5020	break;
5021
5022
5023	case DIF_SUBR_HTONLL:
5024	case DIF_SUBR_NTOHLL:
5025	#ifdef _BIG_ENDIAN
5026	regs[rd] = (uint64_t)tupregs[`0`].dttk_value;
5027	#else
5028	regs[rd] = DT_BSWAP_64((uint64_t)tupregs[`0`].dttk_value);
5029	#endif
5030	break;
5031
5032
5033	case DIF_SUBR_DIRNAME:
5034	case DIF_SUBR_BASENAME: {
5035	char dest = (char* *)mstate->dtms_scratch_ptr;
5036	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5037	uintptr_t src = tupregs[`0`].dttk_value;
5038	int i, j, len = dtrace_strlen(s: (char *)src, lim: size);
5039	int lastbase = -`1`, firstbase = -`1`, lastdir = -`1`;
5040	int start, end;
5041
5042	if (!dtrace_canload(addr: src, sz: len + `1`, mstate, vstate)) {
5043	regs[rd] = `0`;
5044	break;
5045	}
5046
5047	if (!DTRACE_INSCRATCH(mstate, size)) {
5048	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5049	regs[rd] = `0`;
5050	break;
5051	}
5052
5053	/*
5054	* The basename and dirname for a zero-length string is
5055	* defined to be "."
5056	*/
5057	if (len == `0`) {
5058	len = `1`;
5059	src = (uintptr_t)".";
5060	}
5061
5062	/*
5063	* Start from the back of the string, moving back toward the
5064	* front until we see a character that isn't a slash. That
5065	* character is the last character in the basename.
5066	*/
5067	for (i = len - `1`; i >= `0`; i--) {
5068	if (dtrace_load8(addr: src + i) != `'/'`)
5069	break;
5070	}
5071
5072	if (i >= `0`)
5073	lastbase = i;
5074
5075	/*
5076	* Starting from the last character in the basename, move
5077	* towards the front until we find a slash. The character
5078	* that we processed immediately before that is the first
5079	* character in the basename.
5080	*/
5081	for (; i >= `0`; i--) {
5082	if (dtrace_load8(addr: src + i) == `'/'`)
5083	break;
5084	}
5085
5086	if (i >= `0`)
5087	firstbase = i + `1`;
5088
5089	/*
5090	* Now keep going until we find a non-slash character. That
5091	* character is the last character in the dirname.
5092	*/
5093	for (; i >= `0`; i--) {
5094	if (dtrace_load8(addr: src + i) != `'/'`)
5095	break;
5096	}
5097
5098	if (i >= `0`)
5099	lastdir = i;
5100
5101	ASSERT(!(lastbase == -`1` && firstbase != -`1`));
5102	ASSERT(!(firstbase == -`1` && lastdir != -`1`));
5103
5104	if (lastbase == -`1`) {
5105	/*
5106	* We didn't find a non-slash character. We know that
5107	* the length is non-zero, so the whole string must be
5108	* slashes. In either the dirname or the basename
5109	* case, we return '/'.
5110	*/
5111	ASSERT(firstbase == -`1`);
5112	firstbase = lastbase = lastdir = `0`;
5113	}
5114
5115	if (firstbase == -`1`) {
5116	/*
5117	* The entire string consists only of a basename
5118	* component. If we're looking for dirname, we need
5119	* to change our string to be just "."; if we're
5120	* looking for a basename, we'll just set the first
5121	* character of the basename to be 0.
5122	*/
5123	if (subr == DIF_SUBR_DIRNAME) {
5124	ASSERT(lastdir == -`1`);
5125	src = (uintptr_t)".";
5126	lastdir = `0`;
5127	} else {
5128	firstbase = `0`;
5129	}
5130	}
5131
5132	if (subr == DIF_SUBR_DIRNAME) {
5133	if (lastdir == -`1`) {
5134	/*
5135	* We know that we have a slash in the name --
5136	* or lastdir would be set to 0, above. And
5137	* because lastdir is -1, we know that this
5138	* slash must be the first character. (That
5139	* is, the full string must be of the form
5140	* "/basename".) In this case, the last
5141	* character of the directory name is 0.
5142	*/
5143	lastdir = `0`;
5144	}
5145
5146	start = `0`;
5147	end = lastdir;
5148	} else {
5149	ASSERT(subr == DIF_SUBR_BASENAME);
5150	ASSERT(firstbase != -`1` && lastbase != -`1`);
5151	start = firstbase;
5152	end = lastbase;
5153	}
5154
5155	for (i = start, j = `0`; i <= end && (uint64_t)j < size - `1`; i++, j++)
5156	dest[j] = dtrace_load8(addr: src + i);
5157
5158	dest[j] = `'\0'`;
5159	regs[rd] = (uintptr_t)dest;
5160	mstate->dtms_scratch_ptr += size;
5161	break;
5162	}
5163
5164	case DIF_SUBR_CLEANPATH: {
5165	char dest = (char* *)mstate->dtms_scratch_ptr, c;
5166	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5167	uintptr_t src = tupregs[`0`].dttk_value;
5168	size_t lim;
5169	size_t i = `0`, j = `0`;
5170
5171	if (!dtrace_strcanload(addr: src, sz: size, remain: &lim, mstate, vstate)) {
5172	regs[rd] = `0`;
5173	break;
5174	}
5175
5176	if (!DTRACE_INSCRATCH(mstate, size)) {
5177	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5178	regs[rd] = `0`;
5179	break;
5180	}
5181
5182	/*
5183	* Move forward, loading each character.
5184	*/
5185	do {
5186	c = (i >= lim) ? `'\0'` : dtrace_load8(addr: src + i++);
5187	next:
5188	if ((uint64_t)(j + `5`) >= size) / 5 = strlen("/..c\0") /
5189	break;
5190
5191	if (c != `'/'`) {
5192	dest[j++] = c;
5193	continue;
5194	}
5195
5196	c = (i >= lim) ? `'\0'` : dtrace_load8(addr: src + i++);
5197
5198	if (c == `'/'`) {
5199	/*
5200	* We have two slashes -- we can just advance
5201	* to the next character.
5202	*/
5203	goto next;
5204	}
5205
5206	if (c != `'.'`) {
5207	/*
5208	* This is not "." and it's not ".." -- we can
5209	* just store the "/" and this character and
5210	* drive on.
5211	*/
5212	dest[j++] = `'/'`;
5213	dest[j++] = c;
5214	continue;
5215	}
5216
5217	c = (i >= lim) ? `'\0'` : dtrace_load8(addr: src + i++);
5218
5219	if (c == `'/'`) {
5220	/*
5221	* This is a "/./" component. We're not going
5222	* to store anything in the destination buffer;
5223	* we're just going to go to the next component.
5224	*/
5225	goto next;
5226	}
5227
5228	if (c != `'.'`) {
5229	/*
5230	* This is not ".." -- we can just store the
5231	* "/." and this character and continue
5232	* processing.
5233	*/
5234	dest[j++] = `'/'`;
5235	dest[j++] = `'.'`;
5236	dest[j++] = c;
5237	continue;
5238	}
5239
5240	c = (i >= lim) ? `'\0'` : dtrace_load8(addr: src + i++);
5241
5242	if (c != `'/'` && c != `'\0'`) {
5243	/*
5244	* This is not ".." -- it's "..[mumble]".
5245	* We'll store the "/.." and this character
5246	* and continue processing.
5247	*/
5248	dest[j++] = `'/'`;
5249	dest[j++] = `'.'`;
5250	dest[j++] = `'.'`;
5251	dest[j++] = c;
5252	continue;
5253	}
5254
5255	/*
5256	* This is "/../" or "/..\0". We need to back up
5257	* our destination pointer until we find a "/".
5258	*/
5259	i--;
5260	while (j != `0` && dest[--j] != `'/'`)
5261	continue;
5262
5263	if (c == `'\0'`)
5264	dest[++j] = `'/'`;
5265	} while (c != `'\0'`);
5266
5267	dest[j] = `'\0'`;
5268	regs[rd] = (uintptr_t)dest;
5269	mstate->dtms_scratch_ptr += size;
5270	break;
5271	}
5272
5273	case DIF_SUBR_INET_NTOA:
5274	case DIF_SUBR_INET_NTOA6:
5275	case DIF_SUBR_INET_NTOP: {
5276	size_t size;
5277	int af, argi, i;
5278	char base, end;
5279
5280	if (subr == DIF_SUBR_INET_NTOP) {
5281	af = (int)tupregs[`0`].dttk_value;
5282	argi = `1`;
5283	} else {
5284	af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
5285	argi = `0`;
5286	}
5287
5288	if (af == AF_INET) {
5289	#if !defined(__APPLE__)
5290	ipaddr_t ip4;
5291	#else
5292	uint32_t ip4;
5293	#endif /* __APPLE__ */
5294	uint8_t *ptr8, val;
5295
5296	/*
5297	* Safely load the IPv4 address.
5298	*/
5299	#if !defined(__APPLE__)
5300	ip4 = dtrace_load32(tupregs[argi].dttk_value);
5301	#else
5302	if (!dtrace_canload(addr: tupregs[argi].dttk_value, sz: sizeof(ip4),
5303	mstate, vstate)) {
5304	regs[rd] = `0`;
5305	break;
5306	}
5307
5308	dtrace_bcopy(
5309	src: (void *)(uintptr_t)tupregs[argi].dttk_value,
5310	dst: (void )(uintptr_t)&ip4, len: sizeof* (ip4));
5311	#endif /* __APPLE__ */
5312	/*
5313	* Check an IPv4 string will fit in scratch.
5314	*/
5315	#if !defined(__APPLE__)
5316	size = INET_ADDRSTRLEN;
5317	#else
5318	size = MAX_IPv4_STR_LEN;
5319	#endif /* __APPLE__ */
5320	if (!DTRACE_INSCRATCH(mstate, size)) {
5321	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5322	regs[rd] = `0`;
5323	break;
5324	}
5325	base = (char *)mstate->dtms_scratch_ptr;
5326	end = (char *)mstate->dtms_scratch_ptr + size - `1`;
5327
5328	/*
5329	* Stringify as a dotted decimal quad.
5330	*/
5331	*end-- = `'\0'`;
5332	ptr8 = (uint8_t *)&ip4;
5333	for (i = `3`; i >= `0`; i--) {
5334	val = ptr8[i];
5335
5336	if (val == `0`) {
5337	*end-- = `'0'`;
5338	} else {
5339	for (; val; val /= `10`) {
5340	*end-- = `'0'` + (val % `10`);
5341	}
5342	}
5343
5344	if (i > `0`)
5345	*end-- = `'.'`;
5346	}
5347	ASSERT(end + `1` >= base);
5348
5349	} else if (af == AF_INET6) {
5350	#if defined(__APPLE__)
5351	#define _S6_un __u6_addr
5352	#define _S6_u8 __u6_addr8
5353	#endif /* __APPLE__ */
5354	struct in6_addr ip6;
5355	int firstzero, tryzero, numzero, v6end;
5356	uint16_t val;
5357	const char digits[] = "0123456789abcdef";
5358
5359	/*
5360	* Stringify using RFC 1884 convention 2 - 16 bit
5361	* hexadecimal values with a zero-run compression.
5362	* Lower case hexadecimal digits are used.
5363	* eg, fe80::214:4fff:fe0b:76c8.
5364	* The IPv4 embedded form is returned for inet_ntop,
5365	* just the IPv4 string is returned for inet_ntoa6.
5366	*/
5367
5368	if (!dtrace_canload(addr: tupregs[argi].dttk_value,
5369	sz: sizeof(struct in6_addr), mstate, vstate)) {
5370	regs[rd] = `0`;
5371	break;
5372	}
5373
5374	/*
5375	* Safely load the IPv6 address.
5376	*/
5377	dtrace_bcopy(
5378	src: (void *)(uintptr_t)tupregs[argi].dttk_value,
5379	dst: (void )(uintptr_t)&ip6, len: sizeof* (struct in6_addr));
5380
5381	/*
5382	* Check an IPv6 string will fit in scratch.
5383	*/
5384	size = INET6_ADDRSTRLEN;
5385	if (!DTRACE_INSCRATCH(mstate, size)) {
5386	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5387	regs[rd] = `0`;
5388	break;
5389	}
5390	base = (char *)mstate->dtms_scratch_ptr;
5391	end = (char *)mstate->dtms_scratch_ptr + size - `1`;
5392	*end-- = `'\0'`;
5393
5394	/*
5395	* Find the longest run of 16 bit zero values
5396	* for the single allowed zero compression - "::".
5397	*/
5398	firstzero = -`1`;
5399	tryzero = -`1`;
5400	numzero = `1`;
5401	for (i = `0`; i < (int)sizeof (struct in6_addr); i++) {
5402	if (ip6._S6_un._S6_u8[i] == `0` &&
5403	tryzero == -`1` && i % `2` == `0`) {
5404	tryzero = i;
5405	continue;
5406	}
5407
5408	if (tryzero != -`1` &&
5409	(ip6._S6_un._S6_u8[i] != `0` \|\|
5410	i == sizeof (struct in6_addr) - `1`)) {
5411
5412	if (i - tryzero <= numzero) {
5413	tryzero = -`1`;
5414	continue;
5415	}
5416
5417	firstzero = tryzero;
5418	numzero = i - i % `2` - tryzero;
5419	tryzero = -`1`;
5420
5421	if (ip6._S6_un._S6_u8[i] == `0` &&
5422	i == sizeof (struct in6_addr) - `1`)
5423	numzero += `2`;
5424	}
5425	}
5426	ASSERT(firstzero + numzero <= (int)sizeof (struct in6_addr));
5427
5428	/*
5429	* Check for an IPv4 embedded address.
5430	*/
5431	v6end = sizeof (struct in6_addr) - `2`;
5432	if (IN6_IS_ADDR_V4MAPPED(&ip6) \|\|
5433	IN6_IS_ADDR_V4COMPAT(&ip6)) {
5434	for (i = sizeof (struct in6_addr) - `1`;
5435	i >= (int)DTRACE_V4MAPPED_OFFSET; i--) {
5436	ASSERT(end >= base);
5437
5438	val = ip6._S6_un._S6_u8[i];
5439
5440	if (val == `0`) {
5441	*end-- = `'0'`;
5442	} else {
5443	for (; val; val /= `10`) {
5444	*end-- = `'0'` + val % `10`;
5445	}
5446	}
5447
5448	if (i > (int)DTRACE_V4MAPPED_OFFSET)
5449	*end-- = `'.'`;
5450	}
5451
5452	if (subr == DIF_SUBR_INET_NTOA6)
5453	goto inetout;
5454
5455	/*
5456	* Set v6end to skip the IPv4 address that
5457	* we have already stringified.
5458	*/
5459	v6end = `10`;
5460	}
5461
5462	/*
5463	* Build the IPv6 string by working through the
5464	* address in reverse.
5465	*/
5466	for (i = v6end; i >= `0`; i -= `2`) {
5467	ASSERT(end >= base);
5468
5469	if (i == firstzero + numzero - `2`) {
5470	*end-- = `':'`;
5471	*end-- = `':'`;
5472	i -= numzero - `2`;
5473	continue;
5474	}
5475
5476	if (i < `14` && i != firstzero - `2`)
5477	*end-- = `':'`;
5478
5479	val = (ip6._S6_un._S6_u8[i] << `8`) +
5480	ip6._S6_un._S6_u8[i + `1`];
5481
5482	if (val == `0`) {
5483	*end-- = `'0'`;
5484	} else {
5485	for (; val; val /= `16`) {
5486	*end-- = digits[val % `16`];
5487	}
5488	}
5489	}
5490	ASSERT(end + `1` >= base);
5491
5492	#if defined(__APPLE__)
5493	#undef _S6_un
5494	#undef _S6_u8
5495	#endif /* __APPLE__ */
5496	} else {
5497	/*
5498	* The user didn't use AF_INET or AF_INET6.
5499	*/
5500	DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5501	regs[rd] = `0`;
5502	break;
5503	}
5504
5505	inetout: regs[rd] = (uintptr_t)end + `1`;
5506	mstate->dtms_scratch_ptr += size;
5507	break;
5508	}
5509
5510	case DIF_SUBR_JSON: {
5511	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5512	uintptr_t json = tupregs[`0`].dttk_value;
5513	size_t jsonlen = dtrace_strlen(s: (char *)json, lim: size);
5514	uintptr_t elem = tupregs[`1`].dttk_value;
5515	size_t elemlen = dtrace_strlen(s: (char *)elem, lim: size);
5516
5517	char dest = (char* *)mstate->dtms_scratch_ptr;
5518	char elemlist = (char* *)mstate->dtms_scratch_ptr + jsonlen + `1`;
5519	char *ee = elemlist;
5520	int nelems = `1`;
5521	uintptr_t cur;
5522
5523	if (!dtrace_canload(addr: json, sz: jsonlen + `1`, mstate, vstate) \|\|
5524	!dtrace_canload(addr: elem, sz: elemlen + `1`, mstate, vstate)) {
5525	regs[rd] = `0`;
5526	break;
5527	}
5528
5529	if (!DTRACE_INSCRATCH(mstate, jsonlen + `1` + elemlen + `1`)) {
5530	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5531	regs[rd] = `0`;
5532	break;
5533	}
5534
5535	/*
5536	* Read the element selector and split it up into a packed list
5537	* of strings.
5538	*/
5539	for (cur = elem; cur < elem + elemlen; cur++) {
5540	char cc = dtrace_load8(addr: cur);
5541
5542	if (cur == elem && cc == `'['`) {
5543	/*
5544	* If the first element selector key is
5545	* actually an array index then ignore the
5546	* bracket.
5547	*/
5548	continue;
5549	}
5550
5551	if (cc == `']'`)
5552	continue;
5553
5554	if (cc == `'.'` \|\| cc == `'['`) {
5555	nelems++;
5556	cc = `'\0'`;
5557	}
5558
5559	*ee++ = cc;
5560	}
5561	*ee++ = `'\0'`;
5562
5563	if ((regs[rd] = (uintptr_t)dtrace_json(size, json, elemlist,
5564	nelems, dest)) != `0`)
5565	mstate->dtms_scratch_ptr += jsonlen + `1`;
5566	break;
5567	}
5568
5569	case DIF_SUBR_TOUPPER:
5570	case DIF_SUBR_TOLOWER: {
5571	uintptr_t src = tupregs[`0`].dttk_value;
5572	char dest = (char* *)mstate->dtms_scratch_ptr;
5573	char lower, upper, base, c;
5574	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5575	size_t len = dtrace_strlen(s: (char*) src, lim: size);
5576	size_t i = `0`;
5577
5578	lower = (subr == DIF_SUBR_TOUPPER) ? `'a'` : `'A'`;
5579	upper = (subr == DIF_SUBR_TOUPPER) ? `'z'` : `'Z'`;
5580	base = (subr == DIF_SUBR_TOUPPER) ? `'A'` : `'a'`;
5581
5582	if (!dtrace_canload(addr: src, sz: len + `1`, mstate, vstate)) {
5583	regs[rd] = `0`;
5584	break;
5585	}
5586
5587	if (!DTRACE_INSCRATCH(mstate, size)) {
5588	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5589	regs[rd] = `0`;
5590	break;
5591	}
5592
5593	for (i = `0`; i < size - `1`; ++i) {
5594	if ((c = dtrace_load8(addr: src + i)) == `'\0'`)
5595	break;
5596	if (c >= lower && c <= upper)
5597	c = base + (c - lower);
5598	dest[i] = c;
5599	}
5600
5601	ASSERT(i < size);
5602
5603	dest[i] = `'\0'`;
5604	regs[rd] = (uintptr_t) dest;
5605	mstate->dtms_scratch_ptr += size;
5606
5607	break;
5608	}
5609
5610	case DIF_SUBR_STRIP:
5611	if (!dtrace_is_valid_ptrauth_key(tupregs[`1`].dttk_value)) {
5612	DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5613	break;
5614	}
5615	regs[rd] = (uint64_t)dtrace_ptrauth_strip(
5616	(void*)tupregs[`0`].dttk_value, tupregs[`1`].dttk_value);
5617	break;
5618
5619	#if defined(__APPLE__)
5620	case DIF_SUBR_VM_KERNEL_ADDRPERM: {
5621	if (!dtrace_priv_kernel(state)) {
5622	regs[rd] = `0`;
5623	} else {
5624	regs[rd] = VM_KERNEL_ADDRPERM((vm_offset_t) tupregs[`0`].dttk_value);
5625	}
5626
5627	break;
5628	}
5629
5630	case DIF_SUBR_KDEBUG_TRACE: {
5631	uint32_t debugid;
5632	uintptr_t args[`4`] = {`0`};
5633	int i;
5634
5635	if (nargs < `2` \|\| nargs > `5`) {
5636	DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5637	break;
5638	}
5639
5640	if (dtrace_destructive_disallow \|\|
5641	!dtrace_priv_kernel_destructive(state)) {
5642	return;
5643	}
5644
5645	debugid = tupregs[`0`].dttk_value;
5646	for (i = `0`; i < nargs - `1`; i++)
5647	args[i] = tupregs[i + `1`].dttk_value;
5648
5649	kernel_debug(debugid, arg1: args[`0`], arg2: args[`1`], arg3: args[`2`], arg4: args[`3`], arg5: `0`);
5650
5651	break;
5652	}
5653
5654	case DIF_SUBR_KDEBUG_TRACE_STRING: {
5655	if (nargs != `3`) {
5656	break;
5657	}
5658
5659	if (dtrace_destructive_disallow \|\|
5660	!dtrace_priv_kernel_destructive(state)) {
5661	return;
5662	}
5663
5664	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5665	uint32_t debugid = tupregs[`0`].dttk_value;
5666	uint64_t str_id = tupregs[`1`].dttk_value;
5667	uintptr_t src = tupregs[`2`].dttk_value;
5668	size_t lim;
5669	char buf[size];
5670	char* str = NULL;
5671
5672	if (src != (uintptr_t)`0`) {
5673	str = buf;
5674	if (!dtrace_strcanload(addr: src, sz: size, remain: &lim, mstate, vstate)) {
5675	break;
5676	}
5677	dtrace_strcpy(src: (void*)src, dst: buf, len: size);
5678	}
5679
5680	(void)kernel_debug_string(debugid, str_id: &str_id, str);
5681	regs[rd] = str_id;
5682
5683	break;
5684	}
5685
5686	case DIF_SUBR_MTONS:
5687	absolutetime_to_nanoseconds(abstime: tupregs[`0`].dttk_value, result: &regs[rd]);
5688
5689	break;
5690	case DIF_SUBR_PHYSMEM_READ: {
5691	#if DEBUG \|\| DEVELOPMENT
5692	if (dtrace_destructive_disallow \|\|
5693	!dtrace_priv_kernel_destructive(state)) {
5694	return;
5695	}
5696	regs[rd] = dtrace_physmem_read(tupregs[`0`].dttk_value,
5697	tupregs[`1`].dttk_value);
5698	#else
5699	DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5700	#endif /* DEBUG \|\| DEVELOPMENT */
5701	break;
5702	}
5703	case DIF_SUBR_PHYSMEM_WRITE: {
5704	#if DEBUG \|\| DEVELOPMENT
5705	if (dtrace_destructive_disallow \|\|
5706	!dtrace_priv_kernel_destructive(state)) {
5707	return;
5708	}
5709
5710	dtrace_physmem_write(tupregs[`0`].dttk_value,
5711	tupregs[`1`].dttk_value, (size_t)tupregs[`2`].dttk_value);
5712	#else
5713	DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5714	#endif /* DEBUG \|\| DEVELOPMENT */
5715	break;
5716	}
5717
5718	case DIF_SUBR_KVTOPHYS: {
5719	#if DEBUG \|\| DEVELOPMENT
5720	regs[rd] = kvtophys(tupregs[`0`].dttk_value);
5721	#else
5722	DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5723	#endif /* DEBUG \|\| DEVELOPMENT */
5724	break;
5725	}
5726
5727	case DIF_SUBR_LIVEDUMP: {
5728	#if DEBUG \|\| DEVELOPMENT
5729	if (dtrace_destructive_disallow \|\|
5730	!dtrace_priv_kernel_destructive(state)) {
5731	break;
5732	}
5733
5734	/* For the moment, there is only one type of livedump. */
5735	if (nargs != `1` \|\| tupregs[`0`].dttk_value != `0`) {
5736	DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5737	break;
5738	}
5739
5740	char dest = (char* *)mstate->dtms_scratch_ptr;
5741	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5742
5743	if (!DTRACE_INSCRATCH(mstate, size)) {
5744	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5745	regs[rd] = `0`;
5746	break;
5747	}
5748
5749	dtrace_livedump(dest, size);
5750	regs[rd] = (uintptr_t) dest;
5751	mstate->dtms_scratch_ptr += strlen(dest) + `1`;
5752	#else
5753	DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5754	#endif /* DEBUG \|\| DEVELOPMENT */
5755	break;
5756	}
5757	#endif /* defined(__APPLE__) */
5758
5759	}
5760	}
5761
5762	/*
5763	* Emulate the execution of DTrace IR instructions specified by the given
5764	* DIF object. This function is deliberately void of assertions as all of
5765	* the necessary checks are handled by a call to dtrace_difo_validate().
5766	*/
5767	static uint64_t
5768	dtrace_dif_emulate(dtrace_difo_t difo, dtrace_mstate_t mstate,
5769	dtrace_vstate_t vstate, dtrace_state_t state)
5770	{
5771	const dif_instr_t *text = difo->dtdo_buf;
5772	const uint_t textlen = difo->dtdo_len;
5773	const char *strtab = difo->dtdo_strtab;
5774	const uint64_t *inttab = difo->dtdo_inttab;
5775
5776	uint64_t rval = `0`;
5777	dtrace_statvar_t *svar;
5778	dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
5779	dtrace_difv_t *v;
5780	volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
5781	volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
5782
5783	dtrace_key_t tupregs[DIF_DTR_NREGS + `2`]; / +2 for thread and id /
5784	uint64_t regs[DIF_DIR_NREGS];
5785	uint64_t *tmp;
5786
5787	uint8_t cc_n = `0`, cc_z = `0`, cc_v = `0`, cc_c = `0`;
5788	int64_t cc_r;
5789	uint_t pc = `0`, id, opc = `0`;
5790	uint8_t ttop = `0`;
5791	dif_instr_t instr;
5792	uint_t r1, r2, rd;
5793
5794	/*
5795	* We stash the current DIF object into the machine state: we need it
5796	* for subsequent access checking.
5797	*/
5798	mstate->dtms_difo = difo;
5799
5800	regs[DIF_REG_R0] = `0`; / %r0 is fixed at zero /
5801
5802	while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
5803	opc = pc;
5804
5805	instr = text[pc++];
5806	r1 = DIF_INSTR_R1(instr);
5807	r2 = DIF_INSTR_R2(instr);
5808	rd = DIF_INSTR_RD(instr);
5809
5810	switch (DIF_INSTR_OP(instr)) {
5811	case DIF_OP_OR:
5812	regs[rd] = regs[r1] \| regs[r2];
5813	break;
5814	case DIF_OP_XOR:
5815	regs[rd] = regs[r1] ^ regs[r2];
5816	break;
5817	case DIF_OP_AND:
5818	regs[rd] = regs[r1] & regs[r2];
5819	break;
5820	case DIF_OP_SLL:
5821	regs[rd] = regs[r1] << regs[r2];
5822	break;
5823	case DIF_OP_SRL:
5824	regs[rd] = regs[r1] >> regs[r2];
5825	break;
5826	case DIF_OP_SUB:
5827	regs[rd] = regs[r1] - regs[r2];
5828	break;
5829	case DIF_OP_ADD:
5830	regs[rd] = regs[r1] + regs[r2];
5831	break;
5832	case DIF_OP_MUL:
5833	regs[rd] = regs[r1] * regs[r2];
5834	break;
5835	case DIF_OP_SDIV:
5836	if (regs[r2] == `0`) {
5837	regs[rd] = `0`;
5838	*flags \|= CPU_DTRACE_DIVZERO;
5839	} else {
5840	regs[rd] = (int64_t)regs[r1] /
5841	(int64_t)regs[r2];
5842	}
5843	break;
5844
5845	case DIF_OP_UDIV:
5846	if (regs[r2] == `0`) {
5847	regs[rd] = `0`;
5848	*flags \|= CPU_DTRACE_DIVZERO;
5849	} else {
5850	regs[rd] = regs[r1] / regs[r2];
5851	}
5852	break;
5853
5854	case DIF_OP_SREM:
5855	if (regs[r2] == `0`) {
5856	regs[rd] = `0`;
5857	*flags \|= CPU_DTRACE_DIVZERO;
5858	} else {
5859	regs[rd] = (int64_t)regs[r1] %
5860	(int64_t)regs[r2];
5861	}
5862	break;
5863
5864	case DIF_OP_UREM:
5865	if (regs[r2] == `0`) {
5866	regs[rd] = `0`;
5867	*flags \|= CPU_DTRACE_DIVZERO;
5868	} else {
5869	regs[rd] = regs[r1] % regs[r2];
5870	}
5871	break;
5872
5873	case DIF_OP_NOT:
5874	regs[rd] = ~regs[r1];
5875	break;
5876	case DIF_OP_MOV:
5877	regs[rd] = regs[r1];
5878	break;
5879	case DIF_OP_CMP:
5880	cc_r = regs[r1] - regs[r2];
5881	cc_n = cc_r < `0`;
5882	cc_z = cc_r == `0`;
5883	cc_v = `0`;
5884	cc_c = regs[r1] < regs[r2];
5885	break;
5886	case DIF_OP_TST:
5887	cc_n = cc_v = cc_c = `0`;
5888	cc_z = regs[r1] == `0`;
5889	break;
5890	case DIF_OP_BA:
5891	pc = DIF_INSTR_LABEL(instr);
5892	break;
5893	case DIF_OP_BE:
5894	if (cc_z)
5895	pc = DIF_INSTR_LABEL(instr);
5896	break;
5897	case DIF_OP_BNE:
5898	if (cc_z == `0`)
5899	pc = DIF_INSTR_LABEL(instr);
5900	break;
5901	case DIF_OP_BG:
5902	if ((cc_z \| (cc_n ^ cc_v)) == `0`)
5903	pc = DIF_INSTR_LABEL(instr);
5904	break;
5905	case DIF_OP_BGU:
5906	if ((cc_c \| cc_z) == `0`)
5907	pc = DIF_INSTR_LABEL(instr);
5908	break;
5909	case DIF_OP_BGE:
5910	if ((cc_n ^ cc_v) == `0`)
5911	pc = DIF_INSTR_LABEL(instr);
5912	break;
5913	case DIF_OP_BGEU:
5914	if (cc_c == `0`)
5915	pc = DIF_INSTR_LABEL(instr);
5916	break;
5917	case DIF_OP_BL:
5918	if (cc_n ^ cc_v)
5919	pc = DIF_INSTR_LABEL(instr);
5920	break;
5921	case DIF_OP_BLU:
5922	if (cc_c)
5923	pc = DIF_INSTR_LABEL(instr);
5924	break;
5925	case DIF_OP_BLE:
5926	if (cc_z \| (cc_n ^ cc_v))
5927	pc = DIF_INSTR_LABEL(instr);
5928	break;
5929	case DIF_OP_BLEU:
5930	if (cc_c \| cc_z)
5931	pc = DIF_INSTR_LABEL(instr);
5932	break;
5933	case DIF_OP_RLDSB:
5934	if (!dtrace_canstore(regs[r1], sz: `1`, mstate, vstate)) {
5935	*flags \|= CPU_DTRACE_KPRIV;
5936	*illval = regs[r1];
5937	break;
5938	}
5939	OS_FALLTHROUGH;
5940	case DIF_OP_LDSB:
5941	regs[rd] = (int8_t)dtrace_load8(regs[r1]);
5942	break;
5943	case DIF_OP_RLDSH:
5944	if (!dtrace_canstore(regs[r1], sz: `2`, mstate, vstate)) {
5945	*flags \|= CPU_DTRACE_KPRIV;
5946	*illval = regs[r1];
5947	break;
5948	}
5949	OS_FALLTHROUGH;
5950	case DIF_OP_LDSH:
5951	regs[rd] = (int16_t)dtrace_load16(regs[r1]);
5952	break;
5953	case DIF_OP_RLDSW:
5954	if (!dtrace_canstore(regs[r1], sz: `4`, mstate, vstate)) {
5955	*flags \|= CPU_DTRACE_KPRIV;
5956	*illval = regs[r1];
5957	break;
5958	}
5959	OS_FALLTHROUGH;
5960	case DIF_OP_LDSW:
5961	regs[rd] = (int32_t)dtrace_load32(regs[r1]);
5962	break;
5963	case DIF_OP_RLDUB:
5964	if (!dtrace_canstore(regs[r1], sz: `1`, mstate, vstate)) {
5965	*flags \|= CPU_DTRACE_KPRIV;
5966	*illval = regs[r1];
5967	break;
5968	}
5969	OS_FALLTHROUGH;
5970	case DIF_OP_LDUB:
5971	regs[rd] = dtrace_load8(regs[r1]);
5972	break;
5973	case DIF_OP_RLDUH:
5974	if (!dtrace_canstore(regs[r1], sz: `2`, mstate, vstate)) {
5975	*flags \|= CPU_DTRACE_KPRIV;
5976	*illval = regs[r1];
5977	break;
5978	}
5979	OS_FALLTHROUGH;
5980	case DIF_OP_LDUH:
5981	regs[rd] = dtrace_load16(regs[r1]);
5982	break;
5983	case DIF_OP_RLDUW:
5984	if (!dtrace_canstore(regs[r1], sz: `4`, mstate, vstate)) {
5985	*flags \|= CPU_DTRACE_KPRIV;
5986	*illval = regs[r1];
5987	break;
5988	}
5989	OS_FALLTHROUGH;
5990	case DIF_OP_LDUW:
5991	regs[rd] = dtrace_load32(regs[r1]);
5992	break;
5993	case DIF_OP_RLDX:
5994	if (!dtrace_canstore(regs[r1], sz: `8`, mstate, vstate)) {
5995	*flags \|= CPU_DTRACE_KPRIV;
5996	*illval = regs[r1];
5997	break;
5998	}
5999	OS_FALLTHROUGH;
6000	case DIF_OP_LDX:
6001	regs[rd] = dtrace_load64(regs[r1]);
6002	break;
6003	/*
6004	* Darwin 32-bit kernel may fetch from 64-bit user.
6005	* Do not cast regs to uintptr_t
6006	* DIF_OP_ULDSB,DIF_OP_ULDSH, DIF_OP_ULDSW, DIF_OP_ULDUB
6007	* DIF_OP_ULDUH, DIF_OP_ULDUW, DIF_OP_ULDX
6008	*/
6009	case DIF_OP_ULDSB:
6010	regs[rd] = (int8_t)
6011	dtrace_fuword8(regs[r1]);
6012	break;
6013	case DIF_OP_ULDSH:
6014	regs[rd] = (int16_t)
6015	dtrace_fuword16(regs[r1]);
6016	break;
6017	case DIF_OP_ULDSW:
6018	regs[rd] = (int32_t)
6019	dtrace_fuword32(regs[r1]);
6020	break;
6021	case DIF_OP_ULDUB:
6022	regs[rd] =
6023	dtrace_fuword8(regs[r1]);
6024	break;
6025	case DIF_OP_ULDUH:
6026	regs[rd] =
6027	dtrace_fuword16(regs[r1]);
6028	break;
6029	case DIF_OP_ULDUW:
6030	regs[rd] =
6031	dtrace_fuword32(regs[r1]);
6032	break;
6033	case DIF_OP_ULDX:
6034	regs[rd] =
6035	dtrace_fuword64(regs[r1]);
6036	break;
6037	case DIF_OP_RET:
6038	rval = regs[rd];
6039	pc = textlen;
6040	break;
6041	case DIF_OP_NOP:
6042	break;
6043	case DIF_OP_SETX:
6044	regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
6045	break;
6046	case DIF_OP_SETS:
6047	regs[rd] = (uint64_t)(uintptr_t)
6048	(strtab + DIF_INSTR_STRING(instr));
6049	break;
6050	case DIF_OP_SCMP: {
6051	size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
6052	uintptr_t s1 = regs[r1];
6053	uintptr_t s2 = regs[r2];
6054	size_t lim1 = sz, lim2 = sz;
6055
6056	if (s1 != `0` &&
6057	!dtrace_strcanload(addr: s1, sz, remain: &lim1, mstate, vstate))
6058	break;
6059	if (s2 != `0` &&
6060	!dtrace_strcanload(addr: s2, sz, remain: &lim2, mstate, vstate))
6061	break;
6062
6063	cc_r = dtrace_strncmp(s1: (char )s1, s2: (char* *)s2,
6064	MIN(lim1, lim2));
6065
6066	cc_n = cc_r < `0`;
6067	cc_z = cc_r == `0`;
6068	cc_v = cc_c = `0`;
6069	break;
6070	}
6071	case DIF_OP_LDGA:
6072	regs[rd] = dtrace_dif_variable(mstate, state,
6073	v: r1, regs[r2]);
6074	break;
6075	case DIF_OP_LDGS:
6076	id = DIF_INSTR_VAR(instr);
6077
6078	if (id >= DIF_VAR_OTHER_UBASE) {
6079	uintptr_t a;
6080
6081	id -= DIF_VAR_OTHER_UBASE;
6082	svar = vstate->dtvs_globals[id];
6083	ASSERT(svar != NULL);
6084	v = &svar->dtsv_var;
6085
6086	if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
6087	regs[rd] = svar->dtsv_data;
6088	break;
6089	}
6090
6091	a = (uintptr_t)svar->dtsv_data;
6092
6093	if ((uint8_t )a == UINT8_MAX) {
6094	/*
6095	* If the 0th byte is set to UINT8_MAX
6096	* then this is to be treated as a
6097	* reference to a NULL variable.
6098	*/
6099	regs[rd] = `0`;
6100	} else {
6101	regs[rd] = a + sizeof (uint64_t);
6102	}
6103
6104	break;
6105	}
6106
6107	regs[rd] = dtrace_dif_variable(mstate, state, v: id, ndx: `0`);
6108	break;
6109
6110	case DIF_OP_STGS:
6111	id = DIF_INSTR_VAR(instr);
6112
6113	ASSERT(id >= DIF_VAR_OTHER_UBASE);
6114	id -= DIF_VAR_OTHER_UBASE;
6115
6116	VERIFY(id < (uint_t)vstate->dtvs_nglobals);
6117	svar = vstate->dtvs_globals[id];
6118	ASSERT(svar != NULL);
6119	v = &svar->dtsv_var;
6120
6121	if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6122	uintptr_t a = (uintptr_t)svar->dtsv_data;
6123	size_t lim = `0`;
6124
6125	ASSERT(a != `0`);
6126	ASSERT(svar->dtsv_size != `0`);
6127
6128	if (regs[rd] == `0`) {
6129	(uint8_t )a = UINT8_MAX;
6130	break;
6131	} else {
6132	(uint8_t )a = `0`;
6133	a += sizeof (uint64_t);
6134	}
6135	if (!dtrace_vcanload(
6136	src: (void *)(uintptr_t)regs[rd], type: &v->dtdv_type,
6137	remain: &lim, mstate, vstate))
6138	break;
6139
6140	dtrace_vcopy(src: (void *)(uintptr_t)regs[rd],
6141	dst: (void *)a, type: &v->dtdv_type, limit: lim);
6142	break;
6143	}
6144
6145	svar->dtsv_data = regs[rd];
6146	break;
6147
6148	case DIF_OP_LDTA:
6149	/*
6150	* There are no DTrace built-in thread-local arrays at
6151	* present. This opcode is saved for future work.
6152	*/
6153	*flags \|= CPU_DTRACE_ILLOP;
6154	regs[rd] = `0`;
6155	break;
6156
6157	case DIF_OP_LDLS:
6158	id = DIF_INSTR_VAR(instr);
6159
6160	if (id < DIF_VAR_OTHER_UBASE) {
6161	/*
6162	* For now, this has no meaning.
6163	*/
6164	regs[rd] = `0`;
6165	break;
6166	}
6167
6168	id -= DIF_VAR_OTHER_UBASE;
6169
6170	ASSERT(id < (uint_t)vstate->dtvs_nlocals);
6171	ASSERT(vstate->dtvs_locals != NULL);
6172	svar = vstate->dtvs_locals[id];
6173	ASSERT(svar != NULL);
6174	v = &svar->dtsv_var;
6175
6176	if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6177	uintptr_t a = (uintptr_t)svar->dtsv_data;
6178	size_t sz = v->dtdv_type.dtdt_size;
6179
6180	sz += sizeof (uint64_t);
6181	ASSERT(svar->dtsv_size == (int)NCPU * sz);
6182	a += CPU->cpu_id * sz;
6183
6184	if ((uint8_t )a == UINT8_MAX) {
6185	/*
6186	* If the 0th byte is set to UINT8_MAX
6187	* then this is to be treated as a
6188	* reference to a NULL variable.
6189	*/
6190	regs[rd] = `0`;
6191	} else {
6192	regs[rd] = a + sizeof (uint64_t);
6193	}
6194
6195	break;
6196	}
6197
6198	ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
6199	tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6200	regs[rd] = tmp[CPU->cpu_id];
6201	break;
6202
6203	case DIF_OP_STLS:
6204	id = DIF_INSTR_VAR(instr);
6205
6206	ASSERT(id >= DIF_VAR_OTHER_UBASE);
6207	id -= DIF_VAR_OTHER_UBASE;
6208	VERIFY(id < (uint_t)vstate->dtvs_nlocals);
6209	ASSERT(vstate->dtvs_locals != NULL);
6210	svar = vstate->dtvs_locals[id];
6211	ASSERT(svar != NULL);
6212	v = &svar->dtsv_var;
6213
6214	if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6215	uintptr_t a = (uintptr_t)svar->dtsv_data;
6216	size_t sz = v->dtdv_type.dtdt_size;
6217	size_t lim = `0`;
6218
6219	sz += sizeof (uint64_t);
6220	ASSERT(svar->dtsv_size == (int)NCPU * sz);
6221	a += CPU->cpu_id * sz;
6222
6223	if (regs[rd] == `0`) {
6224	(uint8_t )a = UINT8_MAX;
6225	break;
6226	} else {
6227	(uint8_t )a = `0`;
6228	a += sizeof (uint64_t);
6229	}
6230
6231	if (!dtrace_vcanload(
6232	src: (void *)(uintptr_t)regs[rd], type: &v->dtdv_type,
6233	remain: &lim, mstate, vstate))
6234	break;
6235
6236	dtrace_vcopy(src: (void *)(uintptr_t)regs[rd],
6237	dst: (void *)a, type: &v->dtdv_type, limit: lim);
6238	break;
6239	}
6240
6241	ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
6242	tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6243	tmp[CPU->cpu_id] = regs[rd];
6244	break;
6245
6246	case DIF_OP_LDTS: {
6247	dtrace_dynvar_t *dvar;
6248	dtrace_key_t *key;
6249
6250	id = DIF_INSTR_VAR(instr);
6251	ASSERT(id >= DIF_VAR_OTHER_UBASE);
6252	id -= DIF_VAR_OTHER_UBASE;
6253	v = &vstate->dtvs_tlocals[id];
6254
6255	key = &tupregs[DIF_DTR_NREGS];
6256	key[`0`].dttk_value = (uint64_t)id;
6257	key[`0`].dttk_size = `0`;
6258	DTRACE_TLS_THRKEY(key[`1`].dttk_value);
6259	key[`1`].dttk_size = `0`;
6260
6261	dvar = dtrace_dynvar(dstate, nkeys: `2`, key,
6262	dsize: sizeof (uint64_t), op: DTRACE_DYNVAR_NOALLOC,
6263	mstate, vstate);
6264
6265	if (dvar == NULL) {
6266	regs[rd] = `0`;
6267	break;
6268	}
6269
6270	if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6271	regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6272	} else {
6273	regs[rd] = ((uint64_t )dvar->dtdv_data);
6274	}
6275
6276	break;
6277	}
6278
6279	case DIF_OP_STTS: {
6280	dtrace_dynvar_t *dvar;
6281	dtrace_key_t *key;
6282
6283	id = DIF_INSTR_VAR(instr);
6284	ASSERT(id >= DIF_VAR_OTHER_UBASE);
6285	id -= DIF_VAR_OTHER_UBASE;
6286	VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
6287
6288	key = &tupregs[DIF_DTR_NREGS];
6289	key[`0`].dttk_value = (uint64_t)id;
6290	key[`0`].dttk_size = `0`;
6291	DTRACE_TLS_THRKEY(key[`1`].dttk_value);
6292	key[`1`].dttk_size = `0`;
6293	v = &vstate->dtvs_tlocals[id];
6294
6295	dvar = dtrace_dynvar(dstate, nkeys: `2`, key,
6296	dsize: v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6297	v->dtdv_type.dtdt_size : sizeof (uint64_t),
6298	regs[rd] ? DTRACE_DYNVAR_ALLOC :
6299	DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6300
6301	/*
6302	* Given that we're storing to thread-local data,
6303	* we need to flush our predicate cache.
6304	*/
6305	dtrace_set_thread_predcache(current_thread(), `0`);
6306
6307	if (dvar == NULL)
6308	break;
6309
6310	if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6311	size_t lim = `0`;
6312
6313	if (!dtrace_vcanload(
6314	src: (void *)(uintptr_t)regs[rd],
6315	type: &v->dtdv_type, remain: &lim, mstate, vstate))
6316	break;
6317
6318	dtrace_vcopy(src: (void *)(uintptr_t)regs[rd],
6319	dst: dvar->dtdv_data, type: &v->dtdv_type, limit: lim);
6320	} else {
6321	((uint64_t )dvar->dtdv_data) = regs[rd];
6322	}
6323
6324	break;
6325	}
6326
6327	case DIF_OP_SRA:
6328	regs[rd] = (int64_t)regs[r1] >> regs[r2];
6329	break;
6330
6331	case DIF_OP_CALL:
6332	dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
6333	regs, tupregs, nargs: ttop, mstate, state);
6334	break;
6335
6336	case DIF_OP_PUSHTR:
6337	if (ttop == DIF_DTR_NREGS) {
6338	*flags \|= CPU_DTRACE_TUPOFLOW;
6339	break;
6340	}
6341
6342	if (r1 == DIF_TYPE_STRING) {
6343	/*
6344	* If this is a string type and the size is 0,
6345	* we'll use the system-wide default string
6346	* size. Note that we are _not_ looking at
6347	* the value of the DTRACEOPT_STRSIZE option;
6348	* had this been set, we would expect to have
6349	* a non-zero size value in the "pushtr".
6350	*/
6351	tupregs[ttop].dttk_size =
6352	dtrace_strlen(s: (char *)(uintptr_t)regs[rd],
6353	regs[r2] ? regs[r2] :
6354	dtrace_strsize_default) + `1`;
6355	} else {
6356	if (regs[r2] > LONG_MAX) {
6357	*flags \|= CPU_DTRACE_ILLOP;
6358	break;
6359	}
6360	tupregs[ttop].dttk_size = regs[r2];
6361	}
6362
6363	tupregs[ttop++].dttk_value = regs[rd];
6364	break;
6365
6366	case DIF_OP_PUSHTV:
6367	if (ttop == DIF_DTR_NREGS) {
6368	*flags \|= CPU_DTRACE_TUPOFLOW;
6369	break;
6370	}
6371
6372	tupregs[ttop].dttk_value = regs[rd];
6373	tupregs[ttop++].dttk_size = `0`;
6374	break;
6375
6376	case DIF_OP_POPTS:
6377	if (ttop != `0`)
6378	ttop--;
6379	break;
6380
6381	case DIF_OP_FLUSHTS:
6382	ttop = `0`;
6383	break;
6384
6385	case DIF_OP_LDGAA:
6386	case DIF_OP_LDTAA: {
6387	dtrace_dynvar_t *dvar;
6388	dtrace_key_t *key = tupregs;
6389	uint_t nkeys = ttop;
6390
6391	id = DIF_INSTR_VAR(instr);
6392	ASSERT(id >= DIF_VAR_OTHER_UBASE);
6393	id -= DIF_VAR_OTHER_UBASE;
6394
6395	key[nkeys].dttk_value = (uint64_t)id;
6396	key[nkeys++].dttk_size = `0`;
6397
6398	if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
6399	DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6400	key[nkeys++].dttk_size = `0`;
6401	VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
6402	v = &vstate->dtvs_tlocals[id];
6403	} else {
6404	VERIFY(id < (uint_t)vstate->dtvs_nglobals);
6405	v = &vstate->dtvs_globals[id]->dtsv_var;
6406	}
6407
6408	dvar = dtrace_dynvar(dstate, nkeys, key,
6409	dsize: v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6410	v->dtdv_type.dtdt_size : sizeof (uint64_t),
6411	op: DTRACE_DYNVAR_NOALLOC, mstate, vstate);
6412
6413	if (dvar == NULL) {
6414	regs[rd] = `0`;
6415	break;
6416	}
6417
6418	if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6419	regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6420	} else {
6421	regs[rd] = ((uint64_t )dvar->dtdv_data);
6422	}
6423
6424	break;
6425	}
6426
6427	case DIF_OP_STGAA:
6428	case DIF_OP_STTAA: {
6429	dtrace_dynvar_t *dvar;
6430	dtrace_key_t *key = tupregs;
6431	uint_t nkeys = ttop;
6432
6433	id = DIF_INSTR_VAR(instr);
6434	ASSERT(id >= DIF_VAR_OTHER_UBASE);
6435	id -= DIF_VAR_OTHER_UBASE;
6436
6437	key[nkeys].dttk_value = (uint64_t)id;
6438	key[nkeys++].dttk_size = `0`;
6439
6440	if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
6441	DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6442	key[nkeys++].dttk_size = `0`;
6443	VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
6444	v = &vstate->dtvs_tlocals[id];
6445	} else {
6446	VERIFY(id < (uint_t)vstate->dtvs_nglobals);
6447	v = &vstate->dtvs_globals[id]->dtsv_var;
6448	}
6449
6450	dvar = dtrace_dynvar(dstate, nkeys, key,
6451	dsize: v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6452	v->dtdv_type.dtdt_size : sizeof (uint64_t),
6453	regs[rd] ? DTRACE_DYNVAR_ALLOC :
6454	DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6455
6456	if (dvar == NULL)
6457	break;
6458
6459	if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6460	size_t lim = `0`;
6461
6462	if (!dtrace_vcanload(
6463	src: (void *)(uintptr_t)regs[rd], type: &v->dtdv_type,
6464	remain: &lim, mstate, vstate))
6465	break;
6466
6467	dtrace_vcopy(src: (void *)(uintptr_t)regs[rd],
6468	dst: dvar->dtdv_data, type: &v->dtdv_type, limit: lim);
6469	} else {
6470	((uint64_t )dvar->dtdv_data) = regs[rd];
6471	}
6472
6473	break;
6474	}
6475
6476	case DIF_OP_ALLOCS: {
6477	uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, `8`);
6478	size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
6479
6480	/*
6481	* Rounding up the user allocation size could have
6482	* overflowed large, bogus allocations (like -1ULL) to
6483	* 0.
6484	*/
6485	if (size < regs[r1] \|\|
6486	!DTRACE_INSCRATCH(mstate, size)) {
6487	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6488	regs[rd] = `0`;
6489	break;
6490	}
6491
6492	dtrace_bzero(dst: (void *) mstate->dtms_scratch_ptr, len: size);
6493	mstate->dtms_scratch_ptr += size;
6494	regs[rd] = ptr;
6495	break;
6496	}
6497
6498	case DIF_OP_COPYS:
6499	if (!dtrace_canstore(regs[rd], regs[r2],
6500	mstate, vstate)) {
6501	*flags \|= CPU_DTRACE_BADADDR;
6502	*illval = regs[rd];
6503	break;
6504	}
6505
6506	if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
6507	break;
6508
6509	dtrace_bcopy(src: (void *)(uintptr_t)regs[r1],
6510	dst: (void *)(uintptr_t)regs[rd], len: (size_t)regs[r2]);
6511	break;
6512
6513	case DIF_OP_STB:
6514	if (!dtrace_canstore(regs[rd], sz: `1`, mstate, vstate)) {
6515	*flags \|= CPU_DTRACE_BADADDR;
6516	*illval = regs[rd];
6517	break;
6518	}
6519	((uint8_t )(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
6520	break;
6521
6522	case DIF_OP_STH:
6523	if (!dtrace_canstore(regs[rd], sz: `2`, mstate, vstate)) {
6524	*flags \|= CPU_DTRACE_BADADDR;
6525	*illval = regs[rd];
6526	break;
6527	}
6528	if (regs[rd] & `1`) {
6529	*flags \|= CPU_DTRACE_BADALIGN;
6530	*illval = regs[rd];
6531	break;
6532	}
6533	((uint16_t )(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
6534	break;
6535
6536	case DIF_OP_STW:
6537	if (!dtrace_canstore(regs[rd], sz: `4`, mstate, vstate)) {
6538	*flags \|= CPU_DTRACE_BADADDR;
6539	*illval = regs[rd];
6540	break;
6541	}
6542	if (regs[rd] & `3`) {
6543	*flags \|= CPU_DTRACE_BADALIGN;
6544	*illval = regs[rd];
6545	break;
6546	}
6547	((uint32_t )(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
6548	break;
6549
6550	case DIF_OP_STX:
6551	if (!dtrace_canstore(regs[rd], sz: `8`, mstate, vstate)) {
6552	*flags \|= CPU_DTRACE_BADADDR;
6553	*illval = regs[rd];
6554	break;
6555	}
6556
6557	/*
6558	* Darwin kmem_zalloc() called from
6559	* dtrace_difo_init() is 4-byte aligned.
6560	*/
6561	if (regs[rd] & `3`) {
6562	*flags \|= CPU_DTRACE_BADALIGN;
6563	*illval = regs[rd];
6564	break;
6565	}
6566	((uint64_t )(uintptr_t)regs[rd]) = regs[r1];
6567	break;
6568	case DIF_OP_STRIP:
6569	regs[rd] = (uint64_t)dtrace_ptrauth_strip(
6570	(void*)regs[r1], r2);
6571	break;
6572	}
6573	}
6574
6575	if (!(*flags & CPU_DTRACE_FAULT))
6576	return (rval);
6577
6578	mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
6579	mstate->dtms_present \|= DTRACE_MSTATE_FLTOFFS;
6580
6581	return (`0`);
6582	}
6583
6584	__attribute__((noinline))
6585	static void
6586	dtrace_action_breakpoint(dtrace_ecb_t *ecb)
6587	{
6588	dtrace_probe_t *probe = ecb->dte_probe;
6589	dtrace_provider_t *prov = probe->dtpr_provider;
6590	char c[DTRACE_FULLNAMELEN + `80`], *str;
6591	const char *msg = "dtrace: breakpoint action at probe ";
6592	const char *ecbmsg = " (ecb ";
6593	uintptr_t mask = (`0xf` << (sizeof (uintptr_t) * NBBY / `4`));
6594	uintptr_t val = (uintptr_t)ecb;
6595	int shift = (sizeof (uintptr_t) * NBBY) - `4`, i = `0`;
6596
6597	if (dtrace_destructive_disallow)
6598	return;
6599
6600	/*
6601	* It's impossible to be taking action on the NULL probe.
6602	*/
6603	ASSERT(probe != NULL);
6604
6605	/*
6606	* This is a poor man's (destitute man's?) sprintf(): we want to
6607	* print the provider name, module name, function name and name of
6608	* the probe, along with the hex address of the ECB with the breakpoint
6609	* action -- all of which we must place in the character buffer by
6610	* hand.
6611	*/
6612	while (*msg != `'\0'`)
6613	c[i++] = *msg++;
6614
6615	for (str = prov->dtpv_name; *str != `'\0'`; str++)
6616	c[i++] = *str;
6617	c[i++] = `':'`;
6618
6619	for (str = probe->dtpr_mod; *str != `'\0'`; str++)
6620	c[i++] = *str;
6621	c[i++] = `':'`;
6622
6623	for (str = probe->dtpr_func; *str != `'\0'`; str++)
6624	c[i++] = *str;
6625	c[i++] = `':'`;
6626
6627	for (str = probe->dtpr_name; *str != `'\0'`; str++)
6628	c[i++] = *str;
6629
6630	while (*ecbmsg != `'\0'`)
6631	c[i++] = *ecbmsg++;
6632
6633	while (shift >= `0`) {
6634	mask = (uintptr_t)`0xf` << shift;
6635
6636	if (val >= ((uintptr_t)`1` << shift))
6637	c[i++] = "0123456789abcdef"[(val & mask) >> shift];
6638	shift -= `4`;
6639	}
6640
6641	c[i++] = `')'`;
6642	c[i] = `'\0'`;
6643
6644	debug_enter(c);
6645	}
6646
6647	__attribute__((noinline))
6648	static void
6649	dtrace_action_panic(dtrace_ecb_t *ecb)
6650	{
6651	dtrace_probe_t *probe = ecb->dte_probe;
6652
6653	/*
6654	* It's impossible to be taking action on the NULL probe.
6655	*/
6656	ASSERT(probe != NULL);
6657
6658	if (dtrace_destructive_disallow)
6659	return;
6660
6661	if (dtrace_panicked != NULL)
6662	return;
6663
6664	if (dtrace_casptr(&dtrace_panicked, NULL, current_thread()) != NULL)
6665	return;
6666
6667	/*
6668	* We won the right to panic. (We want to be sure that only one
6669	* thread calls panic() from dtrace_probe(), and that panic() is
6670	* called exactly once.)
6671	*/
6672	panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
6673	probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
6674	probe->dtpr_func, probe->dtpr_name, (void *)ecb);
6675
6676	/*
6677	* APPLE NOTE: this was for an old Mac OS X debug feature
6678	* allowing a return from panic(). Revisit someday.
6679	*/
6680	dtrace_panicked = NULL;
6681	}
6682
6683	static void
6684	dtrace_action_raise(uint64_t sig)
6685	{
6686	if (dtrace_destructive_disallow)
6687	return;
6688
6689	if (sig >= NSIG) {
6690	DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6691	return;
6692	}
6693
6694	/*
6695	* raise() has a queue depth of 1 -- we ignore all subsequent
6696	* invocations of the raise() action.
6697	*/
6698
6699	uthread_t uthread = current_uthread();
6700
6701	if (uthread && uthread->t_dtrace_sig == `0`) {
6702	uthread->t_dtrace_sig = sig;
6703	act_set_astbsd(current_thread());
6704	}
6705	}
6706
6707	static void
6708	dtrace_action_stop(void)
6709	{
6710	if (dtrace_destructive_disallow)
6711	return;
6712
6713	uthread_t uthread = current_uthread();
6714	if (uthread) {
6715	/*
6716	* The currently running process will be set to task_suspend
6717	* when it next leaves the kernel.
6718	*/
6719	uthread->t_dtrace_stop = `1`;
6720	act_set_astbsd(current_thread());
6721	}
6722	}
6723
6724
6725	/*
6726	* APPLE NOTE: pidresume works in conjunction with the dtrace stop action.
6727	* Both activate only when the currently running process next leaves the
6728	* kernel.
6729	*/
6730	static void
6731	dtrace_action_pidresume(uint64_t pid)
6732	{
6733	if (dtrace_destructive_disallow)
6734	return;
6735
6736	if (kauth_cred_issuser(cred: kauth_cred_get()) == `0`) {
6737	DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6738	return;
6739	}
6740	uthread_t uthread = current_uthread();
6741
6742	/*
6743	* When the currently running process leaves the kernel, it attempts to
6744	* task_resume the process (denoted by pid), if that pid appears to have
6745	* been stopped by dtrace_action_stop().
6746	* The currently running process has a pidresume() queue depth of 1 --
6747	* subsequent invocations of the pidresume() action are ignored.
6748	*/
6749
6750	if (pid != `0` && uthread && uthread->t_dtrace_resumepid == `0`) {
6751	uthread->t_dtrace_resumepid = pid;
6752	act_set_astbsd(current_thread());
6753	}
6754	}
6755
6756	__attribute__((noinline))
6757	static void
6758	dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
6759	{
6760	hrtime_t now;
6761	volatile uint16_t *flags;
6762	dtrace_cpu_t *cpu = CPU;
6763
6764	if (dtrace_destructive_disallow)
6765	return;
6766
6767	flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
6768
6769	now = dtrace_gethrtime();
6770
6771	if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
6772	/*
6773	* We need to advance the mark to the current time.
6774	*/
6775	cpu->cpu_dtrace_chillmark = now;
6776	cpu->cpu_dtrace_chilled = `0`;
6777	}
6778
6779	/*
6780	* Now check to see if the requested chill time would take us over
6781	* the maximum amount of time allowed in the chill interval. (Or
6782	* worse, if the calculation itself induces overflow.)
6783	*/
6784	if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max \|\|
6785	cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
6786	*flags \|= CPU_DTRACE_ILLOP;
6787	return;
6788	}
6789
6790	while (dtrace_gethrtime() - now < val)
6791	continue;
6792
6793	/*
6794	* Normally, we assure that the value of the variable "timestamp" does
6795	* not change within an ECB. The presence of chill() represents an
6796	* exception to this rule, however.
6797	*/
6798	mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
6799	cpu->cpu_dtrace_chilled += val;
6800	}
6801
6802	__attribute__((noinline))
6803	static void
6804	dtrace_action_ustack(dtrace_mstate_t mstate, dtrace_state_t state,
6805	uint64_t *buf, uint64_t arg)
6806	{
6807	int nframes = DTRACE_USTACK_NFRAMES(arg);
6808	int strsize = DTRACE_USTACK_STRSIZE(arg);
6809	uint64_t pcs = &buf[`1`], fps;
6810	char str = (char* *)&pcs[nframes];
6811	int size, offs = `0`, i, j;
6812	uintptr_t old = mstate->dtms_scratch_ptr, saved;
6813	uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
6814	char *sym;
6815
6816	/*
6817	* Should be taking a faster path if string space has not been
6818	* allocated.
6819	*/
6820	ASSERT(strsize != `0`);
6821
6822	/*
6823	* We will first allocate some temporary space for the frame pointers.
6824	*/
6825	fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, `8`);
6826	size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
6827	(nframes * sizeof (uint64_t));
6828
6829	if (!DTRACE_INSCRATCH(mstate, (uintptr_t)size)) {
6830	/*
6831	* Not enough room for our frame pointers -- need to indicate
6832	* that we ran out of scratch space.
6833	*/
6834	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6835	return;
6836	}
6837
6838	mstate->dtms_scratch_ptr += size;
6839	saved = mstate->dtms_scratch_ptr;
6840
6841	/*
6842	* Now get a stack with both program counters and frame pointers.
6843	*/
6844	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6845	dtrace_getufpstack(buf, fps, nframes + `1`);
6846	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6847
6848	/*
6849	* If that faulted, we're cooked.
6850	*/
6851	if (*flags & CPU_DTRACE_FAULT)
6852	goto out;
6853
6854	/*
6855	* Now we want to walk up the stack, calling the USTACK helper. For
6856	* each iteration, we restore the scratch pointer.
6857	*/
6858	for (i = `0`; i < nframes; i++) {
6859	mstate->dtms_scratch_ptr = saved;
6860
6861	if (offs >= strsize)
6862	break;
6863
6864	sym = (char *)(uintptr_t)dtrace_helper(
6865	DTRACE_HELPER_ACTION_USTACK,
6866	mstate, state, pcs[i], fps[i]);
6867
6868	/*
6869	* If we faulted while running the helper, we're going to
6870	* clear the fault and null out the corresponding string.
6871	*/
6872	if (*flags & CPU_DTRACE_FAULT) {
6873	*flags &= ~CPU_DTRACE_FAULT;
6874	str[offs++] = `'\0'`;
6875	continue;
6876	}
6877
6878	if (sym == NULL) {
6879	str[offs++] = `'\0'`;
6880	continue;
6881	}
6882
6883	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6884
6885	/*
6886	* Now copy in the string that the helper returned to us.
6887	*/
6888	for (j = `0`; offs + j < strsize; j++) {
6889	if ((str[offs + j] = sym[j]) == `'\0'`)
6890	break;
6891	}
6892
6893	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6894
6895	offs += j + `1`;
6896	}
6897
6898	if (offs >= strsize) {
6899	/*
6900	* If we didn't have room for all of the strings, we don't
6901	* abort processing -- this needn't be a fatal error -- but we
6902	* still want to increment a counter (dts_stkstroverflows) to
6903	* allow this condition to be warned about. (If this is from
6904	* a jstack() action, it is easily tuned via jstackstrsize.)
6905	*/
6906	dtrace_error(counter: &state->dts_stkstroverflows);
6907	}
6908
6909	while (offs < strsize)
6910	str[offs++] = `'\0'`;
6911
6912	out:
6913	mstate->dtms_scratch_ptr = old;
6914	}
6915
6916	__attribute__((noinline))
6917	static void
6918	dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size,
6919	size_t valoffsp, uint64_t valp, uint64_t end, int intuple, int dtkind)
6920	{
6921	volatile uint16_t *flags;
6922	uint64_t val = *valp;
6923	size_t valoffs = *valoffsp;
6924
6925	flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
6926	ASSERT(dtkind == DIF_TF_BYREF \|\| dtkind == DIF_TF_BYUREF);
6927
6928	/*
6929	* If this is a string, we're going to only load until we find the zero
6930	* byte -- after which we'll store zero bytes.
6931	*/
6932	if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
6933	char c = `'\0'` + `1`;
6934	size_t s;
6935
6936	for (s = `0`; s < size; s++) {
6937	if (c != `'\0'` && dtkind == DIF_TF_BYREF) {
6938	c = dtrace_load8(addr: val++);
6939	} else if (c != `'\0'` && dtkind == DIF_TF_BYUREF) {
6940	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6941	c = dtrace_fuword8((user_addr_t)(uintptr_t)val++);
6942	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6943	if (*flags & CPU_DTRACE_FAULT)
6944	break;
6945	}
6946
6947	DTRACE_STORE(uint8_t, tomax, valoffs++, c);
6948
6949	if (c == `'\0'` && intuple)
6950	break;
6951	}
6952	} else {
6953	uint8_t c;
6954	while (valoffs < end) {
6955	if (dtkind == DIF_TF_BYREF) {
6956	c = dtrace_load8(addr: val++);
6957	} else if (dtkind == DIF_TF_BYUREF) {
6958	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6959	c = dtrace_fuword8((user_addr_t)(uintptr_t)val++);
6960	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6961	if (*flags & CPU_DTRACE_FAULT)
6962	break;
6963	}
6964
6965	DTRACE_STORE(uint8_t, tomax,
6966	valoffs++, c);
6967	}
6968	}
6969
6970	*valp = val;
6971	*valoffsp = valoffs;
6972	}
6973
6974	/*
6975	* Disables interrupts and sets the per-thread inprobe flag. When DEBUG is
6976	* defined, we also assert that we are not recursing unless the probe ID is an
6977	* error probe.
6978	*/
6979	static dtrace_icookie_t
6980	dtrace_probe_enter(dtrace_id_t id)
6981	{
6982	thread_t thread = current_thread();
6983	uint16_t inprobe;
6984
6985	dtrace_icookie_t cookie;
6986
6987	cookie = dtrace_interrupt_disable();
6988
6989	/*
6990	* Unless this is an ERROR probe, we are not allowed to recurse in
6991	* dtrace_probe(). Recursing into DTrace probe usually means that a
6992	* function is instrumented that should not have been instrumented or
6993	* that the ordering guarantee of the records will be violated,
6994	* resulting in unexpected output. If there is an exception to this
6995	* assertion, a new case should be added.
6996	*/
6997	inprobe = dtrace_get_thread_inprobe(thread);
6998	VERIFY(inprobe == `0` \|\|
6999	id == dtrace_probeid_error);
7000	ASSERT(inprobe < UINT16_MAX);
7001	dtrace_set_thread_inprobe(thread, inprobe + `1`);
7002
7003	return (cookie);
7004	}
7005
7006	/*
7007	* Clears the per-thread inprobe flag and enables interrupts.
7008	*/
7009	static void
7010	dtrace_probe_exit(dtrace_icookie_t cookie)
7011	{
7012	thread_t thread = current_thread();
7013	uint16_t inprobe = dtrace_get_thread_inprobe(thread);
7014
7015	ASSERT(inprobe > `0`);
7016	dtrace_set_thread_inprobe(thread, inprobe - `1`);
7017
7018	#if SCHED_HYGIENE_DEBUG
7019	/*
7020	* Probes can take a relatively long time depending on what the user has
7021	* requested be done in probe context.
7022	* Probes can fire from places where interrupts are already disabled
7023	* (like an interrupt handler) or where preemption has been disabled.
7024	* In order to not trip the interrupt or preemption thresholds, it is
7025	* important to reset timestamps when leaving probe context.
7026	*/
7027
7028	/ Interrupts were disabled for the duration of this probe. /
7029	ml_spin_debug_reset(thread);
7030
7031	/ May have been called from an interrupt handler. /
7032	ml_irq_debug_abandon();
7033
7034	/ May have been called with preemption disabled. /
7035	abandon_preemption_disable_measurement();
7036
7037	#endif /* SCHED_HYGIENE_DEBUG */
7038
7039	dtrace_interrupt_enable(cookie);
7040	}
7041
7042	/*
7043	* If you're looking for the epicenter of DTrace, you just found it. This
7044	* is the function called by the provider to fire a probe -- from which all
7045	* subsequent probe-context DTrace activity emanates.
7046	*/
7047	void
7048	dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
7049	uint64_t arg2, uint64_t arg3, uint64_t arg4)
7050	{
7051	processorid_t cpuid;
7052	dtrace_icookie_t cookie;
7053	dtrace_probe_t *probe;
7054	dtrace_mstate_t mstate;
7055	dtrace_ecb_t *ecb;
7056	dtrace_action_t *act;
7057	intptr_t offs;
7058	size_t size;
7059	int vtime, onintr;
7060	volatile uint16_t *flags;
7061	hrtime_t now;
7062
7063	cookie = dtrace_probe_enter(id);
7064
7065	/ Ensure that probe id is valid. /
7066	if (id - `1` >= (dtrace_id_t)dtrace_nprobes) {
7067	dtrace_probe_exit(cookie);
7068	return;
7069	}
7070
7071	probe = dtrace_probes[id - `1`];
7072	if (probe == NULL) {
7073	dtrace_probe_exit(cookie);
7074	return;
7075	}
7076
7077	cpuid = CPU->cpu_id;
7078	onintr = CPU_ON_INTR(CPU);
7079
7080	if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
7081	probe->dtpr_predcache == dtrace_get_thread_predcache(current_thread())) {
7082	/*
7083	* We have hit in the predicate cache; we know that
7084	* this predicate would evaluate to be false.
7085	*/
7086	dtrace_probe_exit(cookie);
7087	return;
7088	}
7089
7090	if (panic_quiesce) {
7091	/*
7092	* We don't trace anything if we're panicking.
7093	*/
7094	dtrace_probe_exit(cookie);
7095	return;
7096	}
7097
7098	#if !defined(__APPLE__)
7099	now = dtrace_gethrtime();
7100	vtime = dtrace_vtime_references != `0`;
7101
7102	if (vtime && curthread->t_dtrace_start)
7103	curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
7104	#else
7105	/*
7106	* APPLE NOTE: The time spent entering DTrace and arriving
7107	* to this point, is attributed to the current thread.
7108	* Instead it should accrue to DTrace. FIXME
7109	*/
7110	vtime = dtrace_vtime_references != `0`;
7111
7112	if (vtime)
7113	{
7114	int64_t dtrace_accum_time, recent_vtime;
7115	thread_t thread = current_thread();
7116
7117	dtrace_accum_time = dtrace_get_thread_tracing(thread); / Time spent inside DTrace so far (nanoseconds) /
7118
7119	if (dtrace_accum_time >= `0`) {
7120	recent_vtime = dtrace_abs_to_nano(dtrace_calc_thread_recent_vtime(thread)); / up to the moment thread vtime /
7121
7122	recent_vtime = recent_vtime - dtrace_accum_time; / Time without DTrace contribution /
7123
7124	dtrace_set_thread_vtime(thread, recent_vtime);
7125	}
7126	}
7127
7128	now = dtrace_gethrtime(); / must not precede dtrace_calc_thread_recent_vtime() call! /
7129	#endif /* __APPLE__ */
7130
7131	/*
7132	* APPLE NOTE: A provider may call dtrace_probe_error() in lieu of
7133	* dtrace_probe() in some circumstances. See, e.g. fasttrap_isa.c.
7134	* However the provider has no access to ECB context, so passes
7135	* 0 through "arg0" and the probe_id of the overridden probe as arg1.
7136	* Detect that here and cons up a viable state (from the probe_id).
7137	*/
7138	if (dtrace_probeid_error == id && `0` == arg0) {
7139	dtrace_id_t ftp_id = (dtrace_id_t)arg1;
7140	dtrace_probe_t *ftp_probe = dtrace_probes[ftp_id - `1`];
7141	dtrace_ecb_t *ftp_ecb = ftp_probe->dtpr_ecb;
7142
7143	if (NULL != ftp_ecb) {
7144	dtrace_state_t *ftp_state = ftp_ecb->dte_state;
7145
7146	arg0 = (uint64_t)(uintptr_t)ftp_state;
7147	arg1 = ftp_ecb->dte_epid;
7148	/*
7149	* args[2-4] established by caller.
7150	*/
7151	ftp_state->dts_arg_error_illval = -`1`; / arg5 /
7152	}
7153	}
7154
7155	mstate.dtms_difo = NULL;
7156	mstate.dtms_probe = probe;
7157	mstate.dtms_strtok = `0`;
7158	mstate.dtms_arg[`0`] = arg0;
7159	mstate.dtms_arg[`1`] = arg1;
7160	mstate.dtms_arg[`2`] = arg2;
7161	mstate.dtms_arg[`3`] = arg3;
7162	mstate.dtms_arg[`4`] = arg4;
7163
7164	flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
7165
7166	for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
7167	dtrace_predicate_t *pred = ecb->dte_predicate;
7168	dtrace_state_t *state = ecb->dte_state;
7169	dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
7170	dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
7171	dtrace_vstate_t *vstate = &state->dts_vstate;
7172	dtrace_provider_t *prov = probe->dtpr_provider;
7173	uint64_t tracememsize = `0`;
7174	int committed = `0`;
7175	caddr_t tomax;
7176
7177	/*
7178	* A little subtlety with the following (seemingly innocuous)
7179	* declaration of the automatic 'val': by looking at the
7180	* code, you might think that it could be declared in the
7181	* action processing loop, below. (That is, it's only used in
7182	* the action processing loop.) However, it must be declared
7183	* out of that scope because in the case of DIF expression
7184	* arguments to aggregating actions, one iteration of the
7185	* action loop will use the last iteration's value.
7186	*/
7187	#ifdef lint
7188	uint64_t val = `0`;
7189	#else
7190	uint64_t val = `0`;
7191	#endif
7192
7193	mstate.dtms_present = DTRACE_MSTATE_ARGS \| DTRACE_MSTATE_PROBE;
7194	*flags &= ~CPU_DTRACE_ERROR;
7195
7196	if (prov == dtrace_provider) {
7197	/*
7198	* If dtrace itself is the provider of this probe,
7199	* we're only going to continue processing the ECB if
7200	* arg0 (the dtrace_state_t) is equal to the ECB's
7201	* creating state. (This prevents disjoint consumers
7202	* from seeing one another's metaprobes.)
7203	*/
7204	if (arg0 != (uint64_t)(uintptr_t)state)
7205	continue;
7206	}
7207
7208	if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
7209	/*
7210	* We're not currently active. If our provider isn't
7211	* the dtrace pseudo provider, we're not interested.
7212	*/
7213	if (prov != dtrace_provider)
7214	continue;
7215
7216	/*
7217	* Now we must further check if we are in the BEGIN
7218	* probe. If we are, we will only continue processing
7219	* if we're still in WARMUP -- if one BEGIN enabling
7220	* has invoked the exit() action, we don't want to
7221	* evaluate subsequent BEGIN enablings.
7222	*/
7223	if (probe->dtpr_id == dtrace_probeid_begin &&
7224	state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
7225	ASSERT(state->dts_activity ==
7226	DTRACE_ACTIVITY_DRAINING);
7227	continue;
7228	}
7229	}
7230
7231	if (ecb->dte_cond) {
7232	/*
7233	* If the dte_cond bits indicate that this
7234	* consumer is only allowed to see user-mode firings
7235	* of this probe, call the provider's dtps_usermode()
7236	* entry point to check that the probe was fired
7237	* while in a user context. Skip this ECB if that's
7238	* not the case.
7239	*/
7240	if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
7241	prov->dtpv_pops.dtps_usermode &&
7242	prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
7243	probe->dtpr_id, probe->dtpr_arg) == `0`)
7244	continue;
7245
7246	/*
7247	* This is more subtle than it looks. We have to be
7248	* absolutely certain that CRED() isn't going to
7249	* change out from under us so it's only legit to
7250	* examine that structure if we're in constrained
7251	* situations. Currently, the only times we'll do this
7252	* check is if a non-super-user has enabled the
7253	* profile or syscall providers -- providers that
7254	* allow visibility of all processes. For the
7255	* profile case, the check above will ensure that
7256	* we're examining a user context.
7257	*/
7258	if (ecb->dte_cond & DTRACE_COND_OWNER) {
7259	cred_t *cr;
7260	cred_t *s_cr =
7261	ecb->dte_state->dts_cred.dcr_cred;
7262	proc_t *proc;
7263	#pragma unused(proc) /* __APPLE__ */
7264
7265	ASSERT(s_cr != NULL);
7266
7267	/*
7268	* XXX this is hackish, but so is setting a variable
7269	* XXX in a McCarthy OR...
7270	*/
7271	if ((cr = dtrace_CRED()) == NULL \|\|
7272	posix_cred_get(cred: s_cr)->cr_uid != posix_cred_get(cred: cr)->cr_uid \|\|
7273	posix_cred_get(cred: s_cr)->cr_uid != posix_cred_get(cred: cr)->cr_ruid \|\|
7274	posix_cred_get(cred: s_cr)->cr_uid != posix_cred_get(cred: cr)->cr_suid \|\|
7275	posix_cred_get(cred: s_cr)->cr_gid != posix_cred_get(cred: cr)->cr_gid \|\|
7276	posix_cred_get(cred: s_cr)->cr_gid != posix_cred_get(cred: cr)->cr_rgid \|\|
7277	posix_cred_get(cred: s_cr)->cr_gid != posix_cred_get(cred: cr)->cr_sgid \|\|
7278	#if !defined(__APPLE__)
7279	(proc = ttoproc(curthread)) == NULL \|\|
7280	(proc->p_flag & SNOCD))
7281	#else
7282	`1`) / APPLE NOTE: Darwin omits "No Core Dump" flag /
7283	#endif /* __APPLE__ */
7284	continue;
7285	}
7286
7287	if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
7288	cred_t *cr;
7289	cred_t *s_cr =
7290	ecb->dte_state->dts_cred.dcr_cred;
7291	#pragma unused(cr, s_cr) /* __APPLE__ */
7292
7293	ASSERT(s_cr != NULL);
7294
7295	#if !defined(__APPLE__)
7296	if ((cr = CRED()) == NULL \|\|
7297	s_cr->cr_zone->zone_id !=
7298	cr->cr_zone->zone_id)
7299	continue;
7300	#else
7301	/ APPLE NOTE: Darwin doesn't do zones. /
7302	#endif /* __APPLE__ */
7303	}
7304	}
7305
7306	if (now - state->dts_alive > dtrace_deadman_timeout) {
7307	/*
7308	* We seem to be dead. Unless we (a) have kernel
7309	* destructive permissions (b) have explicitly enabled
7310	* destructive actions and (c) destructive actions have
7311	* not been disabled, we're going to transition into
7312	* the KILLED state, from which no further processing
7313	* on this state will be performed.
7314	*/
7315	if (!dtrace_priv_kernel_destructive(state) \|\|
7316	!state->dts_cred.dcr_destructive \|\|
7317	dtrace_destructive_disallow) {
7318	void *activity = &state->dts_activity;
7319	dtrace_activity_t current;
7320
7321	do {
7322	current = state->dts_activity;
7323	} while (dtrace_cas32(activity, current,
7324	DTRACE_ACTIVITY_KILLED) != current);
7325
7326	continue;
7327	}
7328	}
7329
7330	if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
7331	ecb->dte_alignment, state, &mstate)) < `0`)
7332	continue;
7333
7334	tomax = buf->dtb_tomax;
7335	ASSERT(tomax != NULL);
7336
7337	/*
7338	* Build and store the record header corresponding to the ECB.
7339	*/
7340	if (ecb->dte_size != `0`) {
7341	dtrace_rechdr_t dtrh;
7342
7343	if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
7344	mstate.dtms_timestamp = dtrace_gethrtime();
7345	mstate.dtms_present \|= DTRACE_MSTATE_TIMESTAMP;
7346	}
7347
7348	ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t));
7349
7350	dtrh.dtrh_epid = ecb->dte_epid;
7351	DTRACE_RECORD_STORE_TIMESTAMP(&dtrh, mstate.dtms_timestamp);
7352	DTRACE_STORE(dtrace_rechdr_t, tomax, offs, dtrh);
7353	}
7354
7355	mstate.dtms_epid = ecb->dte_epid;
7356	mstate.dtms_present \|= DTRACE_MSTATE_EPID;
7357
7358	if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
7359	mstate.dtms_access = DTRACE_ACCESS_KERNEL;
7360	else
7361	mstate.dtms_access = `0`;
7362
7363	if (pred != NULL) {
7364	dtrace_difo_t *dp = pred->dtp_difo;
7365	uint64_t rval;
7366
7367	rval = dtrace_dif_emulate(difo: dp, mstate: &mstate, vstate, state);
7368
7369	if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
7370	dtrace_cacheid_t cid = probe->dtpr_predcache;
7371
7372	if (cid != DTRACE_CACHEIDNONE && !onintr) {
7373	/*
7374	* Update the predicate cache...
7375	*/
7376	ASSERT(cid == pred->dtp_cacheid);
7377
7378	dtrace_set_thread_predcache(current_thread(), cid);
7379	}
7380
7381	continue;
7382	}
7383	}
7384
7385	for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
7386	act != NULL; act = act->dta_next) {
7387	size_t valoffs;
7388	dtrace_difo_t *dp;
7389	dtrace_recdesc_t *rec = &act->dta_rec;
7390
7391	size = rec->dtrd_size;
7392	valoffs = offs + rec->dtrd_offset;
7393
7394	if (DTRACEACT_ISAGG(act->dta_kind)) {
7395	uint64_t v = `0xbad`;
7396	dtrace_aggregation_t *agg;
7397
7398	agg = (dtrace_aggregation_t *)act;
7399
7400	if ((dp = act->dta_difo) != NULL)
7401	v = dtrace_dif_emulate(difo: dp,
7402	mstate: &mstate, vstate, state);
7403
7404	if (*flags & CPU_DTRACE_ERROR)
7405	continue;
7406
7407	/*
7408	* Note that we always pass the expression
7409	* value from the previous iteration of the
7410	* action loop. This value will only be used
7411	* if there is an expression argument to the
7412	* aggregating action, denoted by the
7413	* dtag_hasarg field.
7414	*/
7415	dtrace_aggregate(agg, dbuf: buf,
7416	offset: offs, buf: aggbuf, expr: v, arg: val);
7417	continue;
7418	}
7419
7420	switch (act->dta_kind) {
7421	case DTRACEACT_STOP:
7422	if (dtrace_priv_proc_destructive(state))
7423	dtrace_action_stop();
7424	continue;
7425
7426	case DTRACEACT_BREAKPOINT:
7427	if (dtrace_priv_kernel_destructive(state))
7428	dtrace_action_breakpoint(ecb);
7429	continue;
7430
7431	case DTRACEACT_PANIC:
7432	if (dtrace_priv_kernel_destructive(state))
7433	dtrace_action_panic(ecb);
7434	continue;
7435
7436	case DTRACEACT_STACK:
7437	if (!dtrace_priv_kernel(state))
7438	continue;
7439
7440	dtrace_getpcstack((pc_t *)(tomax + valoffs),
7441	size / sizeof (pc_t), probe->dtpr_aframes,
7442	DTRACE_ANCHORED(probe) ? NULL :
7443	(uint32_t *)(uintptr_t)arg0);
7444	continue;
7445
7446	case DTRACEACT_JSTACK:
7447	case DTRACEACT_USTACK:
7448	if (!dtrace_priv_proc(state))
7449	continue;
7450
7451	/*
7452	* See comment in DIF_VAR_PID.
7453	*/
7454	if (DTRACE_ANCHORED(mstate.dtms_probe) &&
7455	CPU_ON_INTR(CPU)) {
7456	int depth = DTRACE_USTACK_NFRAMES(
7457	rec->dtrd_arg) + `1`;
7458
7459	dtrace_bzero(dst: (void *)(tomax + valoffs),
7460	DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
7461	+ depth * sizeof (uint64_t));
7462
7463	continue;
7464	}
7465
7466	if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != `0` &&
7467	curproc->p_dtrace_helpers != NULL) {
7468	/*
7469	* This is the slow path -- we have
7470	* allocated string space, and we're
7471	* getting the stack of a process that
7472	* has helpers. Call into a separate
7473	* routine to perform this processing.
7474	*/
7475	dtrace_action_ustack(mstate: &mstate, state,
7476	buf: (uint64_t *)(tomax + valoffs),
7477	arg: rec->dtrd_arg);
7478	continue;
7479	}
7480
7481	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
7482	dtrace_getupcstack((uint64_t *)
7483	(tomax + valoffs),
7484	DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + `1`);
7485	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
7486	continue;
7487
7488	default:
7489	break;
7490	}
7491
7492	dp = act->dta_difo;
7493	ASSERT(dp != NULL);
7494
7495	val = dtrace_dif_emulate(difo: dp, mstate: &mstate, vstate, state);
7496
7497	if (*flags & CPU_DTRACE_ERROR)
7498	continue;
7499
7500	switch (act->dta_kind) {
7501	case DTRACEACT_SPECULATE: {
7502	dtrace_rechdr_t *dtrh = NULL;
7503
7504	ASSERT(buf == &state->dts_buffer[cpuid]);
7505	buf = dtrace_speculation_buffer(state,
7506	cpuid, which: val);
7507
7508	if (buf == NULL) {
7509	*flags \|= CPU_DTRACE_DROP;
7510	continue;
7511	}
7512
7513	offs = dtrace_buffer_reserve(buf,
7514	ecb->dte_needed, ecb->dte_alignment,
7515	state, NULL);
7516
7517	if (offs < `0`) {
7518	*flags \|= CPU_DTRACE_DROP;
7519	continue;
7520	}
7521
7522	tomax = buf->dtb_tomax;
7523	ASSERT(tomax != NULL);
7524
7525	if (ecb->dte_size == `0`)
7526	continue;
7527
7528	ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t));
7529	dtrh = ((void *)(tomax + offs));
7530	dtrh->dtrh_epid = ecb->dte_epid;
7531
7532	/*
7533	* When the speculation is committed, all of
7534	* the records in the speculative buffer will
7535	* have their timestamps set to the commit
7536	* time. Until then, it is set to a sentinel
7537	* value, for debuggability.
7538	*/
7539	DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
7540
7541	continue;
7542	}
7543
7544	case DTRACEACT_CHILL:
7545	if (dtrace_priv_kernel_destructive(state))
7546	dtrace_action_chill(mstate: &mstate, val);
7547	continue;
7548
7549	case DTRACEACT_RAISE:
7550	if (dtrace_priv_proc_destructive(state))
7551	dtrace_action_raise(sig: val);
7552	continue;
7553
7554	case DTRACEACT_PIDRESUME: / __APPLE__ /
7555	if (dtrace_priv_proc_destructive(state))
7556	dtrace_action_pidresume(pid: val);
7557	continue;
7558
7559	case DTRACEACT_COMMIT:
7560	ASSERT(!committed);
7561
7562	/*
7563	* We need to commit our buffer state.
7564	*/
7565	if (ecb->dte_size)
7566	buf->dtb_offset = offs + ecb->dte_size;
7567	buf = &state->dts_buffer[cpuid];
7568	dtrace_speculation_commit(state, cpu: cpuid, which: val);
7569	committed = `1`;
7570	continue;
7571
7572	case DTRACEACT_DISCARD:
7573	dtrace_speculation_discard(state, cpu: cpuid, which: val);
7574	continue;
7575
7576	case DTRACEACT_DIFEXPR:
7577	case DTRACEACT_LIBACT:
7578	case DTRACEACT_PRINTF:
7579	case DTRACEACT_PRINTA:
7580	case DTRACEACT_SYSTEM:
7581	case DTRACEACT_FREOPEN:
7582	case DTRACEACT_APPLEBINARY: / __APPLE__ /
7583	case DTRACEACT_TRACEMEM:
7584	break;
7585
7586	case DTRACEACT_TRACEMEM_DYNSIZE:
7587	tracememsize = val;
7588	break;
7589
7590	case DTRACEACT_SYM:
7591	case DTRACEACT_MOD:
7592	if (!dtrace_priv_kernel(state))
7593	continue;
7594	break;
7595
7596	case DTRACEACT_USYM:
7597	case DTRACEACT_UMOD:
7598	case DTRACEACT_UADDR: {
7599	if (!dtrace_priv_proc(state))
7600	continue;
7601
7602	DTRACE_STORE(uint64_t, tomax,
7603	valoffs, (uint64_t)dtrace_proc_selfpid());
7604	DTRACE_STORE(uint64_t, tomax,
7605	valoffs + sizeof (uint64_t), val);
7606
7607	continue;
7608	}
7609
7610	case DTRACEACT_EXIT: {
7611	/*
7612	* For the exit action, we are going to attempt
7613	* to atomically set our activity to be
7614	* draining. If this fails (either because
7615	* another CPU has beat us to the exit action,
7616	* or because our current activity is something
7617	* other than ACTIVE or WARMUP), we will
7618	* continue. This assures that the exit action
7619	* can be successfully recorded at most once
7620	* when we're in the ACTIVE state. If we're
7621	* encountering the exit() action while in
7622	* COOLDOWN, however, we want to honor the new
7623	* status code. (We know that we're the only
7624	* thread in COOLDOWN, so there is no race.)
7625	*/
7626	void *activity = &state->dts_activity;
7627	dtrace_activity_t current = state->dts_activity;
7628
7629	if (current == DTRACE_ACTIVITY_COOLDOWN)
7630	break;
7631
7632	if (current != DTRACE_ACTIVITY_WARMUP)
7633	current = DTRACE_ACTIVITY_ACTIVE;
7634
7635	if (dtrace_cas32(activity, current,
7636	DTRACE_ACTIVITY_DRAINING) != current) {
7637	*flags \|= CPU_DTRACE_DROP;
7638	continue;
7639	}
7640
7641	break;
7642	}
7643
7644	default:
7645	ASSERT(`0`);
7646	}
7647
7648	if (dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF \| DIF_TF_BYUREF)) {
7649	uintptr_t end = valoffs + size;
7650
7651	if (tracememsize != `0` &&
7652	valoffs + tracememsize < end)
7653	{
7654	end = valoffs + tracememsize;
7655	tracememsize = `0`;
7656	}
7657
7658	if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF &&
7659	!dtrace_vcanload(src: (void *)(uintptr_t)val,
7660	type: &dp->dtdo_rtype, NULL, mstate: &mstate, vstate))
7661	{
7662	continue;
7663	}
7664
7665	dtrace_store_by_ref(dp, tomax, size, valoffsp: &valoffs,
7666	valp: &val, end, intuple: act->dta_intuple,
7667	dtkind: dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ?
7668	DIF_TF_BYREF: DIF_TF_BYUREF);
7669
7670	continue;
7671	}
7672
7673	switch (size) {
7674	case `0`:
7675	break;
7676
7677	case sizeof (uint8_t):
7678	DTRACE_STORE(uint8_t, tomax, valoffs, val);
7679	break;
7680	case sizeof (uint16_t):
7681	DTRACE_STORE(uint16_t, tomax, valoffs, val);
7682	break;
7683	case sizeof (uint32_t):
7684	DTRACE_STORE(uint32_t, tomax, valoffs, val);
7685	break;
7686	case sizeof (uint64_t):
7687	DTRACE_STORE(uint64_t, tomax, valoffs, val);
7688	break;
7689	default:
7690	/*
7691	* Any other size should have been returned by
7692	* reference, not by value.
7693	*/
7694	ASSERT(`0`);
7695	break;
7696	}
7697	}
7698
7699	if (*flags & CPU_DTRACE_DROP)
7700	continue;
7701
7702	if (*flags & CPU_DTRACE_FAULT) {
7703	int ndx;
7704	dtrace_action_t *err;
7705
7706	buf->dtb_errors++;
7707
7708	if (probe->dtpr_id == dtrace_probeid_error) {
7709	/*
7710	* There's nothing we can do -- we had an
7711	* error on the error probe. We bump an
7712	* error counter to at least indicate that
7713	* this condition happened.
7714	*/
7715	dtrace_error(counter: &state->dts_dblerrors);
7716	continue;
7717	}
7718
7719	if (vtime) {
7720	/*
7721	* Before recursing on dtrace_probe(), we
7722	* need to explicitly clear out our start
7723	* time to prevent it from being accumulated
7724	* into t_dtrace_vtime.
7725	*/
7726
7727	/*
7728	* Darwin sets the sign bit on t_dtrace_tracing
7729	* to suspend accumulation to it.
7730	*/
7731	dtrace_set_thread_tracing(current_thread(),
7732	(`1ULL`<<`63`) \| dtrace_get_thread_tracing(current_thread()));
7733	}
7734
7735	/*
7736	* Iterate over the actions to figure out which action
7737	* we were processing when we experienced the error.
7738	* Note that act points _past_ the faulting action; if
7739	* act is ecb->dte_action, the fault was in the
7740	* predicate, if it's ecb->dte_action->dta_next it's
7741	* in action #1, and so on.
7742	*/
7743	for (err = ecb->dte_action, ndx = `0`;
7744	err != act; err = err->dta_next, ndx++)
7745	continue;
7746
7747	dtrace_probe_error(state, ecb->dte_epid, ndx,
7748	(mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
7749	mstate.dtms_fltoffs : -`1`, DTRACE_FLAGS2FLT(*flags),
7750	cpu_core[cpuid].cpuc_dtrace_illval);
7751
7752	continue;
7753	}
7754
7755	if (!committed)
7756	buf->dtb_offset = offs + ecb->dte_size;
7757	}
7758
7759	/ FIXME: On Darwin the time spent leaving DTrace from this point to the rti is attributed*
7760	to the current thread. Instead it should accrue to DTrace. /*
7761	if (vtime) {
7762	thread_t thread = current_thread();
7763	int64_t t = dtrace_get_thread_tracing(thread);
7764
7765	if (t >= `0`) {
7766	/ Usual case, accumulate time spent here into t_dtrace_tracing /
7767	dtrace_set_thread_tracing(thread, t + (dtrace_gethrtime() - now));
7768	} else {
7769	/ Return from error recursion. No accumulation, just clear the sign bit on t_dtrace_tracing. /
7770	dtrace_set_thread_tracing(thread, (~(`1ULL`<<`63`)) & t);
7771	}
7772	}
7773
7774	dtrace_probe_exit(cookie);
7775	}
7776
7777	/*
7778	* DTrace Probe Hashing Functions
7779	*
7780	* The functions in this section (and indeed, the functions in remaining
7781	* sections) are not _called_ from probe context. (Any exceptions to this are
7782	* marked with a "Note:".) Rather, they are called from elsewhere in the
7783	* DTrace framework to look-up probes in, add probes to and remove probes from
7784	* the DTrace probe hashes. (Each probe is hashed by each element of the
7785	* probe tuple -- allowing for fast lookups, regardless of what was
7786	* specified.)
7787	*/
7788	static uint_t
7789	dtrace_hash_str(const char *p)
7790	{
7791	unsigned int g;
7792	uint_t hval = `0`;
7793
7794	while (*p) {
7795	hval = (hval << `4`) + *p++;
7796	if ((g = (hval & `0xf0000000`)) != `0`)
7797	hval ^= g >> `24`;
7798	hval &= ~g;
7799	}
7800	return (hval);
7801	}
7802
7803	static const char*
7804	dtrace_strkey_probe_provider(void *elm, uintptr_t offs)
7805	{
7806	#pragma unused(offs)
7807	dtrace_probe_t probe = (dtrace_probe_t)elm;
7808	return probe->dtpr_provider->dtpv_name;
7809	}
7810
/*
 * String-key accessor for elements that embed the key string itself:
 * returns the address offs bytes past the start of elm, viewed as a string.
 */
static const char*
dtrace_strkey_offset(void *elm, uintptr_t offs)
{
	return ((char *)((uintptr_t)(elm) + offs));
}
7816
/*
 * String-key accessor for elements that hold a pointer to the key string:
 * reads the char * stored offs bytes past the start of elm and returns it.
 */
static const char*
dtrace_strkey_deref_offset(void *elm, uintptr_t offs)
{
	return *((char **)((uintptr_t)(elm) + offs));
}
7822
7823	static dtrace_hash_t *
7824	dtrace_hash_create(dtrace_strkey_f func, uintptr_t arg, uintptr_t nextoffs, uintptr_t prevoffs)
7825	{
7826	dtrace_hash_t hash = kmem_zalloc(sizeof* (dtrace_hash_t), KM_SLEEP);
7827
7828	hash->dth_getstr = func;
7829	hash->dth_stroffs = arg;
7830	hash->dth_nextoffs = nextoffs;
7831	hash->dth_prevoffs = prevoffs;
7832
7833	hash->dth_size = `1`;
7834	hash->dth_mask = hash->dth_size - `1`;
7835
7836	hash->dth_tab = kmem_zalloc(hash->dth_size *
7837	sizeof (dtrace_hashbucket_t *), KM_SLEEP);
7838
7839	return (hash);
7840	}
7841
7842	/*
7843	* APPLE NOTE: dtrace_hash_destroy is not used.
7844	* It is called by dtrace_detach which is not
7845	* currently implemented. Revisit someday.
7846	*/
7847	#if !defined(__APPLE__)
7848	static void
7849	dtrace_hash_destroy(dtrace_hash_t *hash)
7850	{
7851	#if DEBUG
7852	int i;
7853
7854	for (i = `0`; i < hash->dth_size; i++)
7855	ASSERT(hash->dth_tab[i] == NULL);
7856	#endif
7857
7858	kmem_free(hash->dth_tab,
7859	hash->dth_size * sizeof (dtrace_hashbucket_t *));
7860	kmem_free(hash, sizeof (dtrace_hash_t));
7861	}
7862	#endif /* __APPLE__ */
7863
7864	static void
7865	dtrace_hash_resize(dtrace_hash_t *hash)
7866	{
7867	int size = hash->dth_size, i, ndx;
7868	int new_size = hash->dth_size << `1`;
7869	int new_mask = new_size - `1`;
7870	dtrace_hashbucket_t *new_tab, bucket, *next;
7871
7872	ASSERT((new_size & new_mask) == `0`);
7873
7874	new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
7875
7876	for (i = `0`; i < size; i++) {
7877	for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
7878	void *elm = bucket->dthb_chain;
7879
7880	ASSERT(elm != NULL);
7881	ndx = DTRACE_HASHSTR(hash, elm) & new_mask;
7882
7883	next = bucket->dthb_next;
7884	bucket->dthb_next = new_tab[ndx];
7885	new_tab[ndx] = bucket;
7886	}
7887	}
7888
7889	kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
7890	hash->dth_tab = new_tab;
7891	hash->dth_size = new_size;
7892	hash->dth_mask = new_mask;
7893	}
7894
7895	static void
7896	dtrace_hash_add(dtrace_hash_t hash, void* *new)
7897	{
7898	int hashval = DTRACE_HASHSTR(hash, new);
7899	int ndx = hashval & hash->dth_mask;
7900	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7901	void nextp, prevp;
7902
7903	for (; bucket != NULL; bucket = bucket->dthb_next) {
7904	if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
7905	goto add;
7906	}
7907
7908	if ((hash->dth_nbuckets >> `1`) > hash->dth_size) {
7909	dtrace_hash_resize(hash);
7910	dtrace_hash_add(hash, new);
7911	return;
7912	}
7913
7914	bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
7915	bucket->dthb_next = hash->dth_tab[ndx];
7916	hash->dth_tab[ndx] = bucket;
7917	hash->dth_nbuckets++;
7918
7919	add:
7920	nextp = DTRACE_HASHNEXT(hash, new);
7921	ASSERT(nextp == NULL && (DTRACE_HASHPREV(hash, new)) == NULL);
7922	*nextp = bucket->dthb_chain;
7923
7924	if (bucket->dthb_chain != NULL) {
7925	prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
7926	ASSERT(*prevp == NULL);
7927	*prevp = new;
7928	}
7929
7930	bucket->dthb_chain = new;
7931	bucket->dthb_len++;
7932	}
7933
7934	static void *
7935	dtrace_hash_lookup_string(dtrace_hash_t hash, const* char *str)
7936	{
7937	int hashval = dtrace_hash_str(p: str);
7938	int ndx = hashval & hash->dth_mask;
7939	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7940
7941	for (; bucket != NULL; bucket = bucket->dthb_next) {
7942	if (strcmp(s1: str, DTRACE_GETSTR(hash, bucket->dthb_chain)) == `0`)
7943	return (bucket->dthb_chain);
7944	}
7945
7946	return (NULL);
7947	}
7948
7949	static dtrace_probe_t *
7950	dtrace_hash_lookup(dtrace_hash_t hash, void* *template)
7951	{
7952	return dtrace_hash_lookup_string(hash, DTRACE_GETSTR(hash, template));
7953	}
7954
7955	static int
7956	dtrace_hash_collisions(dtrace_hash_t hash, void* *template)
7957	{
7958	int hashval = DTRACE_HASHSTR(hash, template);
7959	int ndx = hashval & hash->dth_mask;
7960	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7961
7962	for (; bucket != NULL; bucket = bucket->dthb_next) {
7963	if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
7964	return (bucket->dthb_len);
7965	}
7966
7967	return (`0`);
7968	}
7969
7970	static void
7971	dtrace_hash_remove(dtrace_hash_t hash, void* *elm)
7972	{
7973	int ndx = DTRACE_HASHSTR(hash, elm) & hash->dth_mask;
7974	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7975
7976	void **prevp = DTRACE_HASHPREV(hash, elm);
7977	void **nextp = DTRACE_HASHNEXT(hash, elm);
7978
7979	/*
7980	* Find the bucket that we're removing this elm from.
7981	*/
7982	for (; bucket != NULL; bucket = bucket->dthb_next) {
7983	if (DTRACE_HASHEQ(hash, bucket->dthb_chain, elm))
7984	break;
7985	}
7986
7987	ASSERT(bucket != NULL);
7988
7989	if (*prevp == NULL) {
7990	if (*nextp == NULL) {
7991	/*
7992	* The removed element was the only element on this
7993	* bucket; we need to remove the bucket.
7994	*/
7995	dtrace_hashbucket_t *b = hash->dth_tab[ndx];
7996
7997	ASSERT(bucket->dthb_chain == elm);
7998	ASSERT(b != NULL);
7999
8000	if (b == bucket) {
8001	hash->dth_tab[ndx] = bucket->dthb_next;
8002	} else {
8003	while (b->dthb_next != bucket)
8004	b = b->dthb_next;
8005	b->dthb_next = bucket->dthb_next;
8006	}
8007
8008	ASSERT(hash->dth_nbuckets > `0`);
8009	hash->dth_nbuckets--;
8010	kmem_free(bucket, sizeof (dtrace_hashbucket_t));
8011	return;
8012	}
8013
8014	bucket->dthb_chain = *nextp;
8015	} else {
8016	(DTRACE_HASHNEXT(hash, prevp)) = *nextp;
8017	}
8018
8019	if (*nextp != NULL)
8020	(DTRACE_HASHPREV(hash, nextp)) = *prevp;
8021	}
8022
8023	/*
8024	* DTrace Utility Functions
8025	*
8026	* These are random utility functions that are _not_ called from probe context.
8027	*/
8028	static int
8029	dtrace_badattr(const dtrace_attribute_t *a)
8030	{
8031	return (a->dtat_name > DTRACE_STABILITY_MAX \|\|
8032	a->dtat_data > DTRACE_STABILITY_MAX \|\|
8033	a->dtat_class > DTRACE_CLASS_MAX);
8034	}
8035
8036	/*
8037	* Returns a dtrace-managed copy of a string, and will
8038	* deduplicate copies of the same string.
8039	* If the specified string is NULL, returns an empty string
8040	*/
8041	static char *
8042	dtrace_strref(const char *str)
8043	{
8044	dtrace_string_t *s = NULL;
8045	size_t bufsize = (str != NULL ? strlen(s: str) : `0`) + `1`;
8046
8047	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8048
8049	if (str == NULL)
8050	str = "";
8051
8052	for (s = dtrace_hash_lookup_string(hash: dtrace_strings, str); s != NULL;
8053	s = *(DTRACE_HASHNEXT(dtrace_strings, s))) {
8054	if (strncmp(s1: str, s2: s->dtst_str, n: bufsize) != `0`) {
8055	continue;
8056	}
8057	ASSERT(s->dtst_refcount != UINT32_MAX);
8058	s->dtst_refcount++;
8059	return s->dtst_str;
8060	}
8061
8062	s = kmem_zalloc(sizeof(dtrace_string_t) + bufsize, KM_SLEEP);
8063	s->dtst_refcount = `1`;
8064	(void) strlcpy(dst: s->dtst_str, src: str, n: bufsize);
8065
8066	dtrace_hash_add(hash: dtrace_strings, new: s);
8067
8068	return s->dtst_str;
8069	}
8070
8071	static void
8072	dtrace_strunref(const char *str)
8073	{
8074	ASSERT(str != NULL);
8075	dtrace_string_t *s = NULL;
8076	size_t bufsize = strlen(s: str) + `1`;
8077
8078	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8079
8080	for (s = dtrace_hash_lookup_string(hash: dtrace_strings, str); s != NULL;
8081	s = *(DTRACE_HASHNEXT(dtrace_strings, s))) {
8082	if (strncmp(s1: str, s2: s->dtst_str, n: bufsize) != `0`) {
8083	continue;
8084	}
8085	ASSERT(s->dtst_refcount != `0`);
8086	s->dtst_refcount--;
8087	if (s->dtst_refcount == `0`) {
8088	dtrace_hash_remove(hash: dtrace_strings, elm: s);
8089	kmem_free(s, sizeof(dtrace_string_t) + bufsize);
8090	}
8091	return;
8092	}
8093	panic("attempt to unref non-existent string %s", str);
8094	}
8095
/* True for ASCII letters only; note that c is evaluated more than once. */
#define	DTRACE_ISALPHA(c)	\
	(((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))

/*
 * Return 1 if s is not acceptable as a name, 0 otherwise.
 *
 * NULL and the empty string are accepted (0).  The first character must be
 * an ASCII letter, '-', '_' or '.'.  Every later character must be one of
 * those, a decimal digit, or a backquote.
 */
static int
dtrace_badname(const char *s)
{
	char c;

	if (s == NULL || (c = *s++) == '\0')
		return (0);

	if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')
		return (1);

	while ((c = *s++) != '\0') {
		if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') &&
		    c != '-' && c != '_' && c != '.' && c != '`')
			return (1);
	}

	return (0);
}
8118
8119	static void
8120	dtrace_cred2priv(cred_t cr, uint32_t privp, uid_t uidp, zoneid_t zoneidp)
8121	{
8122	uint32_t priv;
8123
8124	if (cr == NULL \|\| PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
8125	if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
8126	priv = DTRACE_PRIV_USER \| DTRACE_PRIV_PROC \| DTRACE_PRIV_OWNER;
8127	}
8128	else {
8129	priv = DTRACE_PRIV_ALL;
8130	}
8131	*uidp = `0`;
8132	*zoneidp = `0`;
8133	} else {
8134	*uidp = crgetuid(cr);
8135	*zoneidp = crgetzoneid(cr);
8136
8137	priv = `0`;
8138	if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
8139	priv \|= DTRACE_PRIV_KERNEL \| DTRACE_PRIV_USER;
8140	else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
8141	priv \|= DTRACE_PRIV_USER;
8142	if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
8143	priv \|= DTRACE_PRIV_PROC;
8144	if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
8145	priv \|= DTRACE_PRIV_OWNER;
8146	if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
8147	priv \|= DTRACE_PRIV_ZONEOWNER;
8148	}
8149
8150	*privp = priv;
8151	}
8152
#ifdef DTRACE_ERRDEBUG
/*
 * Record an occurrence of the error message str in dtrace_errhash, an
 * open-addressed table probed linearly from the string's hash.  Entries are
 * matched by pointer identity, not by contents.  The message and the current
 * thread are also remembered in dtrace_errlast / dtrace_errthread.  Panics
 * if every slot is occupied by a different message.
 */
static void
dtrace_errdebug(const char *str)
{
	int hval = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;
	int occupied = 0;

	lck_mtx_lock(&dtrace_errlock);
	dtrace_errlast = str;
	dtrace_errthread = (kthread_t *)current_thread();

	while (occupied++ < DTRACE_ERRHASHSZ) {
		if (dtrace_errhash[hval].dter_msg == str) {
			dtrace_errhash[hval].dter_count++;
			goto out;
		}

		if (dtrace_errhash[hval].dter_msg != NULL) {
			/* Slot taken by another message; try the next one. */
			hval = (hval + 1) % DTRACE_ERRHASHSZ;
			continue;
		}

		dtrace_errhash[hval].dter_msg = str;
		dtrace_errhash[hval].dter_count = 1;
		goto out;
	}

	panic("dtrace: undersized error hash");
out:
	lck_mtx_unlock(&dtrace_errlock);
}
#endif
8185
8186	/*
8187	* DTrace Matching Functions
8188	*
8189	* These functions are used to match groups of probes, given some elements of
8190	* a probe tuple, or some globbed expressions for elements of a probe tuple.
8191	*/
8192	static int
8193	dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
8194	zoneid_t zoneid)
8195	{
8196	if (priv != DTRACE_PRIV_ALL) {
8197	uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
8198	uint32_t match = priv & ppriv;
8199
8200	/*
8201	* No PRIV_DTRACE_* privileges...
8202	*/
8203	if ((priv & (DTRACE_PRIV_PROC \| DTRACE_PRIV_USER \|
8204	DTRACE_PRIV_KERNEL)) == `0`)
8205	return (`0`);
8206
8207	/*
8208	* No matching bits, but there were bits to match...
8209	*/
8210	if (match == `0` && ppriv != `0`)
8211	return (`0`);
8212
8213	/*
8214	* Need to have permissions to the process, but don't...
8215	*/
8216	if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != `0` &&
8217	uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
8218	return (`0`);
8219	}
8220
8221	/*
8222	* Need to be in the same zone unless we possess the
8223	* privilege to examine all zones.
8224	*/
8225	if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != `0` &&
8226	zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
8227	return (`0`);
8228	}
8229	}
8230
8231	return (`1`);
8232	}
8233
8234	/*
8235	* dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
8236	* consists of input pattern strings and an ops-vector to evaluate them.
8237	* This function returns >0 for match, 0 for no match, and <0 for error.
8238	*/
8239	static int
8240	dtrace_match_probe(const dtrace_probe_t prp, const* dtrace_probekey_t *pkp,
8241	uint32_t priv, uid_t uid, zoneid_t zoneid)
8242	{
8243	dtrace_provider_t *pvp = prp->dtpr_provider;
8244	int rv;
8245
8246	if (pvp->dtpv_defunct)
8247	return (`0`);
8248
8249	if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, `0`)) <= `0`)
8250	return (rv);
8251
8252	if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, `0`)) <= `0`)
8253	return (rv);
8254
8255	if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, `0`)) <= `0`)
8256	return (rv);
8257
8258	if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, `0`)) <= `0`)
8259	return (rv);
8260
8261	if (dtrace_match_priv(prp, priv, uid, zoneid) == `0`)
8262	return (`0`);
8263
8264	return (rv);
8265	}
8266
8267	/*
8268	* dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
8269	* interface for matching a glob pattern 'p' to an input string 's'. Unlike
8270	* libc's version, the kernel version only applies to 8-bit ASCII strings.
8271	* In addition, all of the recursion cases except for '*' matching have been
8272	* unwound. For '*', we still implement recursive evaluation, but a depth
8273	* counter is maintained and matching is aborted if we recurse too deep.
8274	* The function returns 0 if no match, >0 if match, and <0 if recursion error.
8275	*/
8276	static int
8277	dtrace_match_glob(const char s, const* char p, int* depth)
8278	{
8279	const char *olds;
8280	char s1, c;
8281	int gs;
8282
8283	if (depth > DTRACE_PROBEKEY_MAXDEPTH)
8284	return (-`1`);
8285
8286	if (s == NULL)
8287	s = ""; / treat NULL as empty string /
8288
8289	top:
8290	olds = s;
8291	s1 = *s++;
8292
8293	if (p == NULL)
8294	return (`0`);
8295
8296	if ((c = *p++) == `'\0'`)
8297	return (s1 == `'\0'`);
8298
8299	switch (c) {
8300	case `'['`: {
8301	int ok = `0`, notflag = `0`;
8302	char lc = `'\0'`;
8303
8304	if (s1 == `'\0'`)
8305	return (`0`);
8306
8307	if (*p == `'!'`) {
8308	notflag = `1`;
8309	p++;
8310	}
8311
8312	if ((c = *p++) == `'\0'`)
8313	return (`0`);
8314
8315	do {
8316	if (c == `'-'` && lc != `'\0'` && *p != `']'`) {
8317	if ((c = *p++) == `'\0'`)
8318	return (`0`);
8319	if (c == `'\\'` && (c = *p++) == `'\0'`)
8320	return (`0`);
8321
8322	if (notflag) {
8323	if (s1 < lc \|\| s1 > c)
8324	ok++;
8325	else
8326	return (`0`);
8327	} else if (lc <= s1 && s1 <= c)
8328	ok++;
8329
8330	} else if (c == `'\\'` && (c = *p++) == `'\0'`)
8331	return (`0`);
8332
8333	lc = c; / save left-hand 'c' for next iteration /
8334
8335	if (notflag) {
8336	if (s1 != c)
8337	ok++;
8338	else
8339	return (`0`);
8340	} else if (s1 == c)
8341	ok++;
8342
8343	if ((c = *p++) == `'\0'`)
8344	return (`0`);
8345
8346	} while (c != `']'`);
8347
8348	if (ok)
8349	goto top;
8350
8351	return (`0`);
8352	}
8353
8354	case `'\\'`:
8355	if ((c = *p++) == `'\0'`)
8356	return (`0`);
8357	OS_FALLTHROUGH;
8358
8359	default:
8360	if (c != s1)
8361	return (`0`);
8362	OS_FALLTHROUGH;
8363
8364	case `'?'`:
8365	if (s1 != `'\0'`)
8366	goto top;
8367	return (`0`);
8368
8369	case `'*'`:
8370	while (p == `''`)
8371	p++; / consecutive 's are identical to a single one /*
8372
8373	if (*p == `'\0'`)
8374	return (`1`);
8375
8376	for (s = olds; *s != `'\0'`; s++) {
8377	if ((gs = dtrace_match_glob(s, p, depth: depth + `1`)) != `0`)
8378	return (gs);
8379	}
8380
8381	return (`0`);
8382	}
8383	}
8384
/*
 * Exact-match comparator.  The two strings match only if they are the very
 * same pointer (and that pointer is non-NULL); the contents are never
 * examined, so equal text at different addresses does not match.  depth is
 * unused.
 */
/*ARGSUSED*/
static int
dtrace_match_string(const char *s, const char *p, int depth)
{
#pragma unused(depth) /* __APPLE__ */
	return (s != NULL && s == p);
}
8392
/*
 * Module-name comparator: 's' matches 'p' when 's' begins with 'p' and the
 * character in 's' right after that prefix is '.' or the terminating NUL.
 * Returns 1 on match and 0 otherwise, including when either argument is
 * NULL.  depth is unused.
 */
/*ARGSUSED*/
static int
dtrace_match_module(const char *s, const char *p, int depth)
{
#pragma unused(depth) /* __APPLE__ */
	size_t len;
	if (s == NULL || p == NULL)
		return (0);

	len = strlen(p);

	if (strncmp(p, s, len) != 0)
		return (0);

	/* s has at least len characters here, so s[len] is in bounds. */
	if (s[len] == '.' || s[len] == '\0')
		return (1);

	return (0);
}
8412
/*
 * Comparator used for an empty pattern: matches everything.  All arguments
 * are ignored.
 */
/*ARGSUSED*/
static int
dtrace_match_nul(const char *s, const char *p, int depth)
{
#pragma unused(s, p, depth) /* __APPLE__ */
	return (1); /* always match the empty pattern */
}
8420
/*
 * Comparator that matches any non-NULL, non-empty input string.  The
 * pattern and depth are ignored.
 */
/*ARGSUSED*/
static int
dtrace_match_nonzero(const char *s, const char *p, int depth)
{
#pragma unused(p, depth) /* __APPLE__ */
	return (s != NULL && s[0] != '\0');
}
8428
8429	static int
8430	dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
8431	zoneid_t zoneid, int (matched)(dtrace_probe_t , void , void* ), void* arg1, void* *arg2)
8432	{
8433	dtrace_probe_t *probe;
8434	dtrace_provider_t prov_template = {
8435	.dtpv_name = (char *)(uintptr_t)pkp->dtpk_prov
8436	};
8437
8438	dtrace_probe_t template = {
8439	.dtpr_provider = &prov_template,
8440	.dtpr_mod = (char *)(uintptr_t)pkp->dtpk_mod,
8441	.dtpr_func = (char *)(uintptr_t)pkp->dtpk_func,
8442	.dtpr_name = (char *)(uintptr_t)pkp->dtpk_name
8443	};
8444
8445	dtrace_hash_t *hash = NULL;
8446	int len, rc, best = INT_MAX, nmatched = `0`;
8447	dtrace_id_t i;
8448
8449	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8450
8451	/*
8452	* If the probe ID is specified in the key, just lookup by ID and
8453	* invoke the match callback once if a matching probe is found.
8454	*/
8455	if (pkp->dtpk_id != DTRACE_IDNONE) {
8456	if ((probe = dtrace_probe_lookup_id(id: pkp->dtpk_id)) != NULL &&
8457	dtrace_match_probe(prp: probe, pkp, priv, uid, zoneid) > `0`) {
8458	if ((*matched)(probe, arg1, arg2) == DTRACE_MATCH_FAIL)
8459	return (DTRACE_MATCH_FAIL);
8460	nmatched++;
8461	}
8462	return (nmatched);
8463	}
8464
8465	/*
8466	* We want to find the most distinct of the provider name, module name,
8467	* function name, and name. So for each one that is not a glob
8468	* pattern or empty string, we perform a lookup in the corresponding
8469	* hash and use the hash table with the fewest collisions to do our
8470	* search.
8471	*/
8472	if (pkp->dtpk_pmatch == &dtrace_match_string &&
8473	(len = dtrace_hash_collisions(hash: dtrace_byprov, template: &template)) < best) {
8474	best = len;
8475	hash = dtrace_byprov;
8476	}
8477
8478	if (pkp->dtpk_mmatch == &dtrace_match_string &&
8479	(len = dtrace_hash_collisions(hash: dtrace_bymod, template: &template)) < best) {
8480	best = len;
8481	hash = dtrace_bymod;
8482	}
8483
8484	if (pkp->dtpk_fmatch == &dtrace_match_string &&
8485	(len = dtrace_hash_collisions(hash: dtrace_byfunc, template: &template)) < best) {
8486	best = len;
8487	hash = dtrace_byfunc;
8488	}
8489
8490	if (pkp->dtpk_nmatch == &dtrace_match_string &&
8491	(len = dtrace_hash_collisions(hash: dtrace_byname, template: &template)) < best) {
8492	best = len;
8493	hash = dtrace_byname;
8494	}
8495
8496	/*
8497	* If we did not select a hash table, iterate over every probe and
8498	* invoke our callback for each one that matches our input probe key.
8499	*/
8500	if (hash == NULL) {
8501	for (i = `0`; i < (dtrace_id_t)dtrace_nprobes; i++) {
8502	if ((probe = dtrace_probes[i]) == NULL \|\|
8503	dtrace_match_probe(prp: probe, pkp, priv, uid,
8504	zoneid) <= `0`)
8505	continue;
8506
8507	nmatched++;
8508
8509	if ((rc = (*matched)(probe, arg1, arg2)) != DTRACE_MATCH_NEXT) {
8510	if (rc == DTRACE_MATCH_FAIL)
8511	return (DTRACE_MATCH_FAIL);
8512	break;
8513	}
8514	}
8515
8516	return (nmatched);
8517	}
8518
8519	/*
8520	* If we selected a hash table, iterate over each probe of the same key
8521	* name and invoke the callback for every probe that matches the other
8522	* attributes of our input probe key.
8523	*/
8524	for (probe = dtrace_hash_lookup(hash, template: &template); probe != NULL;
8525	probe = *(DTRACE_HASHNEXT(hash, probe))) {
8526
8527	if (dtrace_match_probe(prp: probe, pkp, priv, uid, zoneid) <= `0`)
8528	continue;
8529
8530	nmatched++;
8531
8532	if ((rc = (*matched)(probe, arg1, arg2)) != DTRACE_MATCH_NEXT) {
8533	if (rc == DTRACE_MATCH_FAIL)
8534	return (DTRACE_MATCH_FAIL);
8535	break;
8536	}
8537	}
8538
8539	return (nmatched);
8540	}
8541
8542	/*
8543	* Return the function pointer dtrace_probecmp() should use to compare the
8544	* specified pattern with a string. For NULL or empty patterns, we select
8545	* dtrace_match_nul(). For glob pattern strings, we use dtrace_match_glob().
8546	* For non-empty non-glob strings, we use dtrace_match_string().
8547	*/
8548	static dtrace_probekey_f *
8549	dtrace_probekey_func(const char *p)
8550	{
8551	char c;
8552
8553	if (p == NULL \|\| *p == `'\0'`)
8554	return (&dtrace_match_nul);
8555
8556	while ((c = *p++) != `'\0'`) {
8557	if (c == `'['` \|\| c == `'?'` \|\| c == `'*'` \|\| c == `'\\'`)
8558	return (&dtrace_match_glob);
8559	}
8560
8561	return (&dtrace_match_string);
8562	}
8563
8564	static dtrace_probekey_f *
8565	dtrace_probekey_module_func(const char *p)
8566	{
8567	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8568
8569	dtrace_probekey_f *f = dtrace_probekey_func(p);
8570	if (f == &dtrace_match_string) {
8571	dtrace_probe_t template = {
8572	.dtpr_mod = (char *)(uintptr_t)p,
8573	};
8574	if (dtrace_hash_lookup(hash: dtrace_bymod, template: &template) == NULL) {
8575	return (&dtrace_match_module);
8576	}
8577	return (&dtrace_match_string);
8578	}
8579	return f;
8580	}
8581
8582	/*
8583	* Build a probe comparison key for use with dtrace_match_probe() from the
8584	* given probe description. By convention, a null key only matches anchored
8585	* probes: if each field is the empty string, reset dtpk_fmatch to
8586	* dtrace_match_nonzero().
8587	*/
8588	static void
8589	dtrace_probekey(const dtrace_probedesc_t pdp, dtrace_probekey_t pkp)
8590	{
8591
8592	pkp->dtpk_prov = dtrace_strref(str: pdp->dtpd_provider);
8593	pkp->dtpk_pmatch = dtrace_probekey_func(p: pdp->dtpd_provider);
8594
8595	pkp->dtpk_mod = dtrace_strref(str: pdp->dtpd_mod);
8596	pkp->dtpk_mmatch = dtrace_probekey_module_func(p: pdp->dtpd_mod);
8597
8598	pkp->dtpk_func = dtrace_strref(str: pdp->dtpd_func);
8599	pkp->dtpk_fmatch = dtrace_probekey_func(p: pdp->dtpd_func);
8600
8601	pkp->dtpk_name = dtrace_strref(str: pdp->dtpd_name);
8602	pkp->dtpk_nmatch = dtrace_probekey_func(p: pdp->dtpd_name);
8603
8604	pkp->dtpk_id = pdp->dtpd_id;
8605
8606	if (pkp->dtpk_id == DTRACE_IDNONE &&
8607	pkp->dtpk_pmatch == &dtrace_match_nul &&
8608	pkp->dtpk_mmatch == &dtrace_match_nul &&
8609	pkp->dtpk_fmatch == &dtrace_match_nul &&
8610	pkp->dtpk_nmatch == &dtrace_match_nul)
8611	pkp->dtpk_fmatch = &dtrace_match_nonzero;
8612	}
8613
8614	static void
8615	dtrace_probekey_release(dtrace_probekey_t *pkp)
8616	{
8617	dtrace_strunref(str: pkp->dtpk_prov);
8618	dtrace_strunref(str: pkp->dtpk_mod);
8619	dtrace_strunref(str: pkp->dtpk_func);
8620	dtrace_strunref(str: pkp->dtpk_name);
8621	}
8622
8623	static int
8624	dtrace_cond_provider_match(dtrace_probedesc_t desc, void* *data)
8625	{
8626	if (desc == NULL)
8627	return `1`;
8628
8629	dtrace_probekey_f *func = dtrace_probekey_func(p: desc->dtpd_provider);
8630
8631	return func((char*)data, desc->dtpd_provider, `0`);
8632	}
8633
8634	/*
8635	* DTrace Provider-to-Framework API Functions
8636	*
8637	* These functions implement much of the Provider-to-Framework API, as
8638	* described in <sys/dtrace.h>. The parts of the API not in this section are
8639	* the functions in the API for probe management (found below), and
8640	* dtrace_probe() itself (found above).
8641	*/
8642
8643	/*
8644	* Register the calling provider with the DTrace framework. This should
8645	* generally be called by DTrace providers in their attach(9E) entry point.
8646	*/
8647	int
8648	dtrace_register(const char name, const* dtrace_pattr_t *pap, uint32_t priv,
8649	cred_t cr, const* dtrace_pops_t pops, void* arg, dtrace_provider_id_t idp)
8650	{
8651	dtrace_provider_t *provider;
8652
8653	if (name == NULL \|\| pap == NULL \|\| pops == NULL \|\| idp == NULL) {
8654	cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8655	"arguments", name ? name : "<NULL>");
8656	return (EINVAL);
8657	}
8658
8659	if (name[`0`] == `'\0'` \|\| dtrace_badname(s: name)) {
8660	cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8661	"provider name", name);
8662	return (EINVAL);
8663	}
8664
8665	if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) \|\|
8666	pops->dtps_enable == NULL \|\| pops->dtps_disable == NULL \|\|
8667	pops->dtps_destroy == NULL \|\|
8668	((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
8669	cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8670	"provider ops", name);
8671	return (EINVAL);
8672	}
8673
8674	if (dtrace_badattr(a: &pap->dtpa_provider) \|\|
8675	dtrace_badattr(a: &pap->dtpa_mod) \|\|
8676	dtrace_badattr(a: &pap->dtpa_func) \|\|
8677	dtrace_badattr(a: &pap->dtpa_name) \|\|
8678	dtrace_badattr(a: &pap->dtpa_args)) {
8679	cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8680	"provider attributes", name);
8681	return (EINVAL);
8682	}
8683
8684	if (priv & ~DTRACE_PRIV_ALL) {
8685	cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8686	"privilege attributes", name);
8687	return (EINVAL);
8688	}
8689
8690	if ((priv & DTRACE_PRIV_KERNEL) &&
8691	(priv & (DTRACE_PRIV_USER \| DTRACE_PRIV_OWNER)) &&
8692	pops->dtps_usermode == NULL) {
8693	cmn_err(CE_WARN, "failed to register provider '%s': need "
8694	"dtps_usermode() op for given privilege attributes", name);
8695	return (EINVAL);
8696	}
8697
8698	provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
8699
8700	provider->dtpv_attr = *pap;
8701	provider->dtpv_priv.dtpp_flags = priv;
8702	if (cr != NULL) {
8703	provider->dtpv_priv.dtpp_uid = crgetuid(cr);
8704	provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
8705	}
8706	provider->dtpv_pops = *pops;
8707
8708	if (pops->dtps_provide == NULL) {
8709	ASSERT(pops->dtps_provide_module != NULL);
8710	provider->dtpv_pops.dtps_provide = dtrace_provide_nullop;
8711	}
8712
8713	if (pops->dtps_provide_module == NULL) {
8714	ASSERT(pops->dtps_provide != NULL);
8715	provider->dtpv_pops.dtps_provide_module =
8716	dtrace_provide_module_nullop;
8717	}
8718
8719	if (pops->dtps_suspend == NULL) {
8720	ASSERT(pops->dtps_resume == NULL);
8721	provider->dtpv_pops.dtps_suspend = dtrace_suspend_nullop;
8722	provider->dtpv_pops.dtps_resume = dtrace_resume_nullop;
8723	}
8724
8725	provider->dtpv_arg = arg;
8726	*idp = (dtrace_provider_id_t)provider;
8727
8728	if (pops == &dtrace_provider_ops) {
8729	LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
8730	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8731
8732	provider->dtpv_name = dtrace_strref(str: name);
8733
8734	ASSERT(dtrace_anon.dta_enabling == NULL);
8735
8736	/*
8737	* We make sure that the DTrace provider is at the head of
8738	* the provider chain.
8739	*/
8740	provider->dtpv_next = dtrace_provider;
8741	dtrace_provider = provider;
8742	return (`0`);
8743	}
8744
8745	lck_mtx_lock(lck: &dtrace_provider_lock);
8746	lck_mtx_lock(lck: &dtrace_lock);
8747
8748	provider->dtpv_name = dtrace_strref(str: name);
8749
8750	/*
8751	* If there is at least one provider registered, we'll add this
8752	* provider after the first provider.
8753	*/
8754	if (dtrace_provider != NULL) {
8755	provider->dtpv_next = dtrace_provider->dtpv_next;
8756	dtrace_provider->dtpv_next = provider;
8757	} else {
8758	dtrace_provider = provider;
8759	}
8760
8761	if (dtrace_retained != NULL) {
8762	dtrace_enabling_provide(provider);
8763
8764	/*
8765	* Now we need to call dtrace_enabling_matchall_with_cond() --
8766	* with a condition matching the provider name we just added,
8767	* which will acquire cpu_lock and dtrace_lock. We therefore need
8768	* to drop all of our locks before calling into it...
8769	*/
8770	lck_mtx_unlock(lck: &dtrace_lock);
8771	lck_mtx_unlock(lck: &dtrace_provider_lock);
8772
8773	dtrace_match_cond_t cond = {dtrace_cond_provider_match, provider->dtpv_name};
8774	dtrace_enabling_matchall_with_cond(cond: &cond);
8775
8776	return (`0`);
8777	}
8778
8779	lck_mtx_unlock(lck: &dtrace_lock);
8780	lck_mtx_unlock(lck: &dtrace_provider_lock);
8781
8782	return (`0`);
8783	}
8784
8785	/*
8786	* Unregister the specified provider from the DTrace framework. This should
8787	* generally be called by DTrace providers in their detach(9E) entry point.
8788	*/
8789	int
8790	dtrace_unregister(dtrace_provider_id_t id)
8791	{
8792	dtrace_provider_t old = (dtrace_provider_t )id;
8793	dtrace_provider_t *prev = NULL;
8794	int self = `0`;
8795	dtrace_probe_t probe, first = NULL, *next = NULL;
8796	dtrace_probe_t template = {
8797	.dtpr_provider = old
8798	};
8799
8800	if (old->dtpv_pops.dtps_enable ==
8801	(int ()(void* , dtrace_id_t, void* *))dtrace_enable_nullop) {
8802	/*
8803	* If DTrace itself is the provider, we're called with locks
8804	* already held.
8805	*/
8806	ASSERT(old == dtrace_provider);
8807	ASSERT(dtrace_devi != NULL);
8808	LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
8809	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8810	self = `1`;
8811
8812	if (dtrace_provider->dtpv_next != NULL) {
8813	/*
8814	* There's another provider here; return failure.
8815	*/
8816	return (EBUSY);
8817	}
8818	} else {
8819	lck_mtx_lock(lck: &dtrace_provider_lock);
8820	lck_mtx_lock(lck: &mod_lock);
8821	lck_mtx_lock(lck: &dtrace_lock);
8822	}
8823
8824	/*
8825	* If anyone has /dev/dtrace open, or if there are anonymous enabled
8826	* probes, we refuse to let providers slither away, unless this
8827	* provider has already been explicitly invalidated.
8828	*/
8829	if (!old->dtpv_defunct &&
8830	(dtrace_opens \|\| (dtrace_anon.dta_state != NULL &&
8831	dtrace_anon.dta_state->dts_necbs > `0`))) {
8832	if (!self) {
8833	lck_mtx_unlock(lck: &dtrace_lock);
8834	lck_mtx_unlock(lck: &mod_lock);
8835	lck_mtx_unlock(lck: &dtrace_provider_lock);
8836	}
8837	return (EBUSY);
8838	}
8839
8840	/*
8841	* Attempt to destroy the probes associated with this provider.
8842	*/
8843	if (old->dtpv_ecb_count!=`0`) {
8844	/*
8845	* We have at least one ECB; we can't remove this provider.
8846	*/
8847	if (!self) {
8848	lck_mtx_unlock(lck: &dtrace_lock);
8849	lck_mtx_unlock(lck: &mod_lock);
8850	lck_mtx_unlock(lck: &dtrace_provider_lock);
8851	}
8852	return (EBUSY);
8853	}
8854
8855	/*
8856	* All of the probes for this provider are disabled; we can safely
8857	* remove all of them from their hash chains and from the probe array.
8858	*/
8859	for (probe = dtrace_hash_lookup(hash: dtrace_byprov, template: &template); probe != NULL;
8860	probe = *(DTRACE_HASHNEXT(dtrace_byprov, probe))) {
8861	if (probe->dtpr_provider != old)
8862	continue;
8863
8864	dtrace_probes[probe->dtpr_id - `1`] = NULL;
8865	old->dtpv_probe_count--;
8866
8867	dtrace_hash_remove(hash: dtrace_bymod, elm: probe);
8868	dtrace_hash_remove(hash: dtrace_byfunc, elm: probe);
8869	dtrace_hash_remove(hash: dtrace_byname, elm: probe);
8870
8871	if (first == NULL) {
8872	first = probe;
8873	probe->dtpr_nextmod = NULL;
8874	} else {
8875	/*
8876	* Use nextmod as the chain of probes to remove
8877	*/
8878	probe->dtpr_nextmod = first;
8879	first = probe;
8880	}
8881	}
8882
8883	for (probe = first; probe != NULL; probe = next) {
8884	next = probe->dtpr_nextmod;
8885	dtrace_hash_remove(hash: dtrace_byprov, elm: probe);
8886	}
8887
8888	/*
8889	* The provider's probes have been removed from the hash chains and
8890	* from the probe array. Now issue a dtrace_sync() to be sure that
8891	* everyone has cleared out from any probe array processing.
8892	*/
8893	dtrace_sync();
8894
8895	for (probe = first; probe != NULL; probe = next) {
8896	next = probe->dtpr_nextmod;
8897
8898	old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
8899	probe->dtpr_arg);
8900	dtrace_strunref(str: probe->dtpr_mod);
8901	dtrace_strunref(str: probe->dtpr_func);
8902	dtrace_strunref(str: probe->dtpr_name);
8903	vmem_free(vmp: dtrace_arena, vaddr: (void *)(uintptr_t)(probe->dtpr_id), size: `1`);
8904	zfree(dtrace_probe_t_zone, probe);
8905	}
8906
8907	if ((prev = dtrace_provider) == old) {
8908	ASSERT(self \|\| dtrace_devi == NULL);
8909	ASSERT(old->dtpv_next == NULL \|\| dtrace_devi == NULL);
8910	dtrace_provider = old->dtpv_next;
8911	} else {
8912	while (prev != NULL && prev->dtpv_next != old)
8913	prev = prev->dtpv_next;
8914
8915	if (prev == NULL) {
8916	panic("attempt to unregister non-existent "
8917	"dtrace provider %p\n", (void *)id);
8918	}
8919
8920	prev->dtpv_next = old->dtpv_next;
8921	}
8922
8923	dtrace_strunref(str: old->dtpv_name);
8924
8925	if (!self) {
8926	lck_mtx_unlock(lck: &dtrace_lock);
8927	lck_mtx_unlock(lck: &mod_lock);
8928	lck_mtx_unlock(lck: &dtrace_provider_lock);
8929	}
8930
8931	kmem_free(old, sizeof (dtrace_provider_t));
8932
8933	return (`0`);
8934	}
8935
8936	/*
8937	* Invalidate the specified provider. All subsequent probe lookups for the
8938	* specified provider will fail, but its probes will not be removed.
8939	*/
8940	void
8941	dtrace_invalidate(dtrace_provider_id_t id)
8942	{
8943	dtrace_provider_t pvp = (dtrace_provider_t )id;
8944
8945	ASSERT(pvp->dtpv_pops.dtps_enable !=
8946	(int ()(void* , dtrace_id_t, void* *))dtrace_enable_nullop);
8947
8948	lck_mtx_lock(lck: &dtrace_provider_lock);
8949	lck_mtx_lock(lck: &dtrace_lock);
8950
8951	pvp->dtpv_defunct = `1`;
8952
8953	lck_mtx_unlock(lck: &dtrace_lock);
8954	lck_mtx_unlock(lck: &dtrace_provider_lock);
8955	}
8956
8957	/*
8958	* Indicate whether or not DTrace has attached.
8959	*/
8960	int
8961	dtrace_attached(void)
8962	{
8963	/*
8964	* dtrace_provider will be non-NULL iff the DTrace driver has
8965	* attached. (It's non-NULL because DTrace is always itself a
8966	* provider.)
8967	*/
8968	return (dtrace_provider != NULL);
8969	}
8970
8971	/*
8972	* Remove all the unenabled probes for the given provider. This function is
8973	* not unlike dtrace_unregister(), except that it doesn't remove the provider
8974	* -- just as many of its associated probes as it can.
8975	*/
8976	int
8977	dtrace_condense(dtrace_provider_id_t id)
8978	{
8979	dtrace_provider_t prov = (dtrace_provider_t )id;
8980	dtrace_probe_t probe, first = NULL;
8981	dtrace_probe_t template = {
8982	.dtpr_provider = prov
8983	};
8984
8985	/*
8986	* Make sure this isn't the dtrace provider itself.
8987	*/
8988	ASSERT(prov->dtpv_pops.dtps_enable !=
8989	(int ()(void* , dtrace_id_t, void* *))dtrace_enable_nullop);
8990
8991	lck_mtx_lock(lck: &dtrace_provider_lock);
8992	lck_mtx_lock(lck: &dtrace_lock);
8993
8994	/*
8995	* Attempt to destroy the probes associated with this provider.
8996	*/
8997	for (probe = dtrace_hash_lookup(hash: dtrace_byprov, template: &template); probe != NULL;
8998	probe = *(DTRACE_HASHNEXT(dtrace_byprov, probe))) {
8999
9000	if (probe->dtpr_provider != prov)
9001	continue;
9002
9003	if (probe->dtpr_ecb != NULL)
9004	continue;
9005
9006	dtrace_probes[probe->dtpr_id - `1`] = NULL;
9007	prov->dtpv_probe_count--;
9008
9009	dtrace_hash_remove(hash: dtrace_bymod, elm: probe);
9010	dtrace_hash_remove(hash: dtrace_byfunc, elm: probe);
9011	dtrace_hash_remove(hash: dtrace_byname, elm: probe);
9012
9013	prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
9014	probe->dtpr_arg);
9015	dtrace_strunref(str: probe->dtpr_mod);
9016	dtrace_strunref(str: probe->dtpr_func);
9017	dtrace_strunref(str: probe->dtpr_name);
9018	if (first == NULL) {
9019	first = probe;
9020	probe->dtpr_nextmod = NULL;
9021	} else {
9022	/*
9023	* Use nextmod as the chain of probes to remove
9024	*/
9025	probe->dtpr_nextmod = first;
9026	first = probe;
9027	}
9028	}
9029
9030	for (probe = first; probe != NULL; probe = first) {
9031	first = probe->dtpr_nextmod;
9032	dtrace_hash_remove(hash: dtrace_byprov, elm: probe);
9033	vmem_free(vmp: dtrace_arena, vaddr: (void *)((uintptr_t)probe->dtpr_id), size: `1`);
9034	zfree(dtrace_probe_t_zone, probe);
9035	}
9036
9037	lck_mtx_unlock(lck: &dtrace_lock);
9038	lck_mtx_unlock(lck: &dtrace_provider_lock);
9039
9040	return (`0`);
9041	}
9042
9043	/*
9044	* DTrace Probe Management Functions
9045	*
9046	* The functions in this section perform the DTrace probe management,
9047	* including functions to create probes, look-up probes, and call into the
9048	* providers to request that probes be provided. Some of these functions are
9049	* in the Provider-to-Framework API; these functions can be identified by the
9050	* fact that they are not declared "static".
9051	*/
9052
9053	/*
9054	* Create a probe with the specified module name, function name, and name.
9055	*/
9056	dtrace_id_t
9057	dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
9058	const char func, const* char name, int* aframes, void *arg)
9059	{
9060	dtrace_probe_t probe, *probes;
9061	dtrace_provider_t provider = (dtrace_provider_t )prov;
9062	dtrace_id_t id;
9063
9064	if (provider == dtrace_provider) {
9065	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9066	} else {
9067	lck_mtx_lock(lck: &dtrace_lock);
9068	}
9069
9070	id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, `1`,
9071	VM_BESTFIT \| VM_SLEEP);
9072
9073	probe = zalloc_flags(dtrace_probe_t_zone, Z_WAITOK \| Z_ZERO);
9074
9075	probe->dtpr_id = id;
9076	probe->dtpr_gen = dtrace_probegen++;
9077	probe->dtpr_mod = dtrace_strref(str: mod);
9078	probe->dtpr_func = dtrace_strref(str: func);
9079	probe->dtpr_name = dtrace_strref(str: name);
9080	probe->dtpr_arg = arg;
9081	probe->dtpr_aframes = aframes;
9082	probe->dtpr_provider = provider;
9083
9084	dtrace_hash_add(hash: dtrace_byprov, new: probe);
9085	dtrace_hash_add(hash: dtrace_bymod, new: probe);
9086	dtrace_hash_add(hash: dtrace_byfunc, new: probe);
9087	dtrace_hash_add(hash: dtrace_byname, new: probe);
9088
9089	if (id - `1` >= (dtrace_id_t)dtrace_nprobes) {
9090	size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
9091	size_t nsize = osize * `2`;
9092
9093	probes = kmem_zalloc(nsize, KM_SLEEP);
9094
9095	dtrace_probe_t **oprobes = dtrace_probes;
9096
9097	bcopy(src: oprobes, dst: probes, n: osize);
9098	dtrace_membar_producer();
9099	dtrace_probes = probes;
9100
9101	dtrace_sync();
9102
9103	/*
9104	* All CPUs are now seeing the new probes array; we can
9105	* safely free the old array.
9106	*/
9107	kmem_free(oprobes, osize);
9108	dtrace_nprobes *= `2`;
9109
9110	ASSERT(id - `1` < (dtrace_id_t)dtrace_nprobes);
9111	}
9112
9113	ASSERT(dtrace_probes[id - `1`] == NULL);
9114	dtrace_probes[id - `1`] = probe;
9115	provider->dtpv_probe_count++;
9116
9117	if (provider != dtrace_provider)
9118	lck_mtx_unlock(lck: &dtrace_lock);
9119
9120	return (id);
9121	}
9122
9123	static dtrace_probe_t *
9124	dtrace_probe_lookup_id(dtrace_id_t id)
9125	{
9126	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9127
9128	if (id == `0` \|\| id > (dtrace_id_t)dtrace_nprobes)
9129	return (NULL);
9130
9131	return (dtrace_probes[id - `1`]);
9132	}
9133
9134	static int
9135	dtrace_probe_lookup_match(dtrace_probe_t probe, void* arg1, void* *arg2)
9136	{
9137	#pragma unused(arg2)
9138	((dtrace_id_t )arg1) = probe->dtpr_id;
9139
9140	return (DTRACE_MATCH_DONE);
9141	}
9142
9143	/*
9144	* Look up a probe based on provider and one or more of module name, function
9145	* name and probe name.
9146	*/
9147	dtrace_id_t
9148	dtrace_probe_lookup(dtrace_provider_id_t prid, const char *mod,
9149	const char func, const* char *name)
9150	{
9151	dtrace_probekey_t pkey;
9152	dtrace_id_t id;
9153	int match;
9154
9155	lck_mtx_lock(lck: &dtrace_lock);
9156
9157	pkey.dtpk_prov = dtrace_strref(str: ((dtrace_provider_t *)prid)->dtpv_name);
9158	pkey.dtpk_pmatch = &dtrace_match_string;
9159	pkey.dtpk_mod = dtrace_strref(str: mod);
9160	pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
9161	pkey.dtpk_func = dtrace_strref(str: func);
9162	pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
9163	pkey.dtpk_name = dtrace_strref(str: name);
9164	pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
9165	pkey.dtpk_id = DTRACE_IDNONE;
9166
9167	match = dtrace_match(pkp: &pkey, DTRACE_PRIV_ALL, uid: `0`, zoneid: `0`,
9168	matched: dtrace_probe_lookup_match, arg1: &id, NULL);
9169
9170	dtrace_probekey_release(pkp: &pkey);
9171
9172	lck_mtx_unlock(lck: &dtrace_lock);
9173
9174	ASSERT(match == `1` \|\| match == `0`);
9175	return (match ? id : `0`);
9176	}
9177
9178	/*
9179	* Returns the probe argument associated with the specified probe.
9180	*/
9181	void *
9182	dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
9183	{
9184	dtrace_probe_t *probe;
9185	void *rval = NULL;
9186
9187	lck_mtx_lock(lck: &dtrace_lock);
9188
9189	if ((probe = dtrace_probe_lookup_id(id: pid)) != NULL &&
9190	probe->dtpr_provider == (dtrace_provider_t *)id)
9191	rval = probe->dtpr_arg;
9192
9193	lck_mtx_unlock(lck: &dtrace_lock);
9194
9195	return (rval);
9196	}
9197
9198	/*
9199	* Copy a probe into a probe description.
9200	*/
9201	static void
9202	dtrace_probe_description(const dtrace_probe_t prp, dtrace_probedesc_t pdp)
9203	{
9204	bzero(s: pdp, n: sizeof (dtrace_probedesc_t));
9205	pdp->dtpd_id = prp->dtpr_id;
9206
9207	/ APPLE NOTE: Darwin employs size bounded string operation. /
9208	(void) strlcpy(dst: pdp->dtpd_provider,
9209	src: prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN);
9210
9211	(void) strlcpy(dst: pdp->dtpd_mod, src: prp->dtpr_mod, DTRACE_MODNAMELEN);
9212	(void) strlcpy(dst: pdp->dtpd_func, src: prp->dtpr_func, DTRACE_FUNCNAMELEN);
9213	(void) strlcpy(dst: pdp->dtpd_name, src: prp->dtpr_name, DTRACE_NAMELEN);
9214	}
9215
9216	/*
9217	* Called to indicate that a probe -- or probes -- should be provided by a
9218	* specfied provider. If the specified description is NULL, the provider will
9219	* be told to provide all of its probes. (This is done whenever a new
9220	* consumer comes along, or whenever a retained enabling is to be matched.) If
9221	* the specified description is non-NULL, the provider is given the
9222	* opportunity to dynamically provide the specified probe, allowing providers
9223	* to support the creation of probes on-the-fly. (So-called _autocreated_
9224	* probes.) If the provider is NULL, the operations will be applied to all
9225	* providers; if the provider is non-NULL the operations will only be applied
9226	* to the specified provider. The dtrace_provider_lock must be held, and the
9227	* dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
9228	* will need to grab the dtrace_lock when it reenters the framework through
9229	* dtrace_probe_lookup(), dtrace_probe_create(), etc.
9230	*/
9231	static void
9232	dtrace_probe_provide(dtrace_probedesc_t desc, dtrace_provider_t prv)
9233	{
9234	struct modctl *ctl;
9235	int all = `0`;
9236
9237	LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
9238
9239	if (prv == NULL) {
9240	all = `1`;
9241	prv = dtrace_provider;
9242	}
9243
9244	do {
9245	/*
9246	* First, call the blanket provide operation.
9247	*/
9248	prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
9249
9250	/*
9251	* Now call the per-module provide operation. We will grab
9252	* mod_lock to prevent the list from being modified. Note
9253	* that this also prevents the mod_busy bits from changing.
9254	* (mod_busy can only be changed with mod_lock held.)
9255	*/
9256	lck_mtx_lock(lck: &mod_lock);
9257
9258	ctl = dtrace_modctl_list;
9259	while (ctl) {
9260	prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
9261	ctl = ctl->mod_next;
9262	}
9263
9264	lck_mtx_unlock(lck: &mod_lock);
9265	} while (all && (prv = prv->dtpv_next) != NULL);
9266	}
9267
9268	/*
9269	* Iterate over each probe, and call the Framework-to-Provider API function
9270	* denoted by offs.
9271	*/
9272	static void
9273	dtrace_probe_foreach(uintptr_t offs)
9274	{
9275	dtrace_provider_t *prov;
9276	void (func)(void* , dtrace_id_t, void* *);
9277	dtrace_probe_t *probe;
9278	dtrace_icookie_t cookie;
9279	int i;
9280
9281	/*
9282	* We disable interrupts to walk through the probe array. This is
9283	* safe -- the dtrace_sync() in dtrace_unregister() assures that we
9284	* won't see stale data.
9285	*/
9286	cookie = dtrace_interrupt_disable();
9287
9288	for (i = `0`; i < dtrace_nprobes; i++) {
9289	if ((probe = dtrace_probes[i]) == NULL)
9290	continue;
9291
9292	if (probe->dtpr_ecb == NULL) {
9293	/*
9294	* This probe isn't enabled -- don't call the function.
9295	*/
9296	continue;
9297	}
9298
9299	prov = probe->dtpr_provider;
9300	func = ((void()(void* , dtrace_id_t, void* *))
9301	((uintptr_t)&prov->dtpv_pops + offs));
9302
9303	func(prov->dtpv_arg, i + `1`, probe->dtpr_arg);
9304	}
9305
9306	dtrace_interrupt_enable(cookie);
9307	}
9308
9309	static int
9310	dtrace_probe_enable(const dtrace_probedesc_t desc, dtrace_enabling_t enab, dtrace_ecbdesc_t *ep)
9311	{
9312	dtrace_probekey_t pkey;
9313	uint32_t priv;
9314	uid_t uid;
9315	zoneid_t zoneid;
9316	int err;
9317
9318	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9319
9320	dtrace_ecb_create_cache = NULL;
9321
9322	if (desc == NULL) {
9323	/*
9324	* If we're passed a NULL description, we're being asked to
9325	* create an ECB with a NULL probe.
9326	*/
9327	(void) dtrace_ecb_create_enable(NULL, enab, ep);
9328	return (`0`);
9329	}
9330
9331	dtrace_probekey(pdp: desc, pkp: &pkey);
9332	dtrace_cred2priv(cr: enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
9333	privp: &priv, uidp: &uid, zoneidp: &zoneid);
9334
9335	err = dtrace_match(pkp: &pkey, priv, uid, zoneid, matched: dtrace_ecb_create_enable, arg1: enab, arg2: ep);
9336
9337	dtrace_probekey_release(pkp: &pkey);
9338
9339	return err;
9340	}
9341
9342	/*
9343	* DTrace Helper Provider Functions
9344	*/
9345	static void
9346	dtrace_dofattr2attr(dtrace_attribute_t attr, const* dof_attr_t dofattr)
9347	{
9348	attr->dtat_name = DOF_ATTR_NAME(dofattr);
9349	attr->dtat_data = DOF_ATTR_DATA(dofattr);
9350	attr->dtat_class = DOF_ATTR_CLASS(dofattr);
9351	}
9352
9353	static void
9354	dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
9355	const dof_provider_t dofprov, char* *strtab)
9356	{
9357	hprov->dthpv_provname = strtab + dofprov->dofpv_name;
9358	dtrace_dofattr2attr(attr: &hprov->dthpv_pattr.dtpa_provider,
9359	dofattr: dofprov->dofpv_provattr);
9360	dtrace_dofattr2attr(attr: &hprov->dthpv_pattr.dtpa_mod,
9361	dofattr: dofprov->dofpv_modattr);
9362	dtrace_dofattr2attr(attr: &hprov->dthpv_pattr.dtpa_func,
9363	dofattr: dofprov->dofpv_funcattr);
9364	dtrace_dofattr2attr(attr: &hprov->dthpv_pattr.dtpa_name,
9365	dofattr: dofprov->dofpv_nameattr);
9366	dtrace_dofattr2attr(attr: &hprov->dthpv_pattr.dtpa_args,
9367	dofattr: dofprov->dofpv_argsattr);
9368	}
9369
9370	static void
9371	dtrace_helper_provide_one(dof_helper_t dhp, dof_sec_t sec, proc_t *p)
9372	{
9373	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9374	dof_hdr_t dof = (dof_hdr_t )daddr;
9375	dof_sec_t str_sec, prb_sec, arg_sec, off_sec, *enoff_sec;
9376	dof_provider_t *provider;
9377	dof_probe_t *probe;
9378	uint32_t off, enoff;
9379	uint8_t *arg;
9380	char *strtab;
9381	uint_t i, nprobes;
9382	dtrace_helper_provdesc_t dhpv;
9383	dtrace_helper_probedesc_t dhpb;
9384	dtrace_meta_t *meta = dtrace_meta_pid;
9385	dtrace_mops_t *mops = &meta->dtm_mops;
9386	void *parg;
9387
9388	provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9389	str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9390	provider->dofpv_strtab * dof->dofh_secsize);
9391	prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9392	provider->dofpv_probes * dof->dofh_secsize);
9393	arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9394	provider->dofpv_prargs * dof->dofh_secsize);
9395	off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9396	provider->dofpv_proffs * dof->dofh_secsize);
9397
9398	strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9399	off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
9400	arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
9401	enoff = NULL;
9402
9403	/*
9404	* See dtrace_helper_provider_validate().
9405	*/
9406	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
9407	provider->dofpv_prenoffs != DOF_SECT_NONE) {
9408	enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9409	provider->dofpv_prenoffs * dof->dofh_secsize);
9410	enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
9411	}
9412
9413	nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
9414
9415	/*
9416	* Create the provider.
9417	*/
9418	dtrace_dofprov2hprov(hprov: &dhpv, dofprov: provider, strtab);
9419
9420	if ((parg = mops->dtms_provide_proc(meta->dtm_arg, &dhpv, p)) == NULL)
9421	return;
9422
9423	meta->dtm_count++;
9424
9425	/*
9426	* Create the probes.
9427	*/
9428	for (i = `0`; i < nprobes; i++) {
9429	probe = (dof_probe_t *)(uintptr_t)(daddr +
9430	prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
9431
9432	dhpb.dthpb_mod = dhp->dofhp_mod;
9433	dhpb.dthpb_func = strtab + probe->dofpr_func;
9434	dhpb.dthpb_name = strtab + probe->dofpr_name;
9435	#if !defined(__APPLE__)
9436	dhpb.dthpb_base = probe->dofpr_addr;
9437	#else
9438	dhpb.dthpb_base = dhp->dofhp_addr; / FIXME: James, why? /
9439	#endif
9440	dhpb.dthpb_offs = (int32_t *)(off + probe->dofpr_offidx);
9441	dhpb.dthpb_noffs = probe->dofpr_noffs;
9442	if (enoff != NULL) {
9443	dhpb.dthpb_enoffs = (int32_t *)(enoff + probe->dofpr_enoffidx);
9444	dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
9445	} else {
9446	dhpb.dthpb_enoffs = NULL;
9447	dhpb.dthpb_nenoffs = `0`;
9448	}
9449	dhpb.dthpb_args = arg + probe->dofpr_argidx;
9450	dhpb.dthpb_nargc = probe->dofpr_nargc;
9451	dhpb.dthpb_xargc = probe->dofpr_xargc;
9452	dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
9453	dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
9454
9455	mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
9456	}
9457
9458	/*
9459	* Since we just created probes, we need to match our enablings
9460	* against those, with a precondition knowing that we have only
9461	* added probes from this provider
9462	*/
9463	char *prov_name = mops->dtms_provider_name(parg);
9464	ASSERT(prov_name != NULL);
9465	dtrace_match_cond_t cond = {dtrace_cond_provider_match, (void*)prov_name};
9466
9467	dtrace_enabling_matchall_with_cond(cond: &cond);
9468	}
9469
9470	static void
9471	dtrace_helper_provide(dof_helper_t dhp, proc_t p)
9472	{
9473	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9474	dof_hdr_t dof = (dof_hdr_t )daddr;
9475	uint32_t i;
9476
9477	LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
9478
9479	for (i = `0`; i < dof->dofh_secnum; i++) {
9480	dof_sec_t sec = (dof_sec_t )(uintptr_t)(daddr +
9481	dof->dofh_secoff + i * dof->dofh_secsize);
9482
9483	if (sec->dofs_type != DOF_SECT_PROVIDER)
9484	continue;
9485
9486	dtrace_helper_provide_one(dhp, sec, p);
9487	}
9488	}
9489
9490	static void
9491	dtrace_helper_provider_remove_one(dof_helper_t dhp, dof_sec_t sec, proc_t *p)
9492	{
9493	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9494	dof_hdr_t dof = (dof_hdr_t )daddr;
9495	dof_sec_t *str_sec;
9496	dof_provider_t *provider;
9497	char *strtab;
9498	dtrace_helper_provdesc_t dhpv;
9499	dtrace_meta_t *meta = dtrace_meta_pid;
9500	dtrace_mops_t *mops = &meta->dtm_mops;
9501
9502	provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9503	str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9504	provider->dofpv_strtab * dof->dofh_secsize);
9505
9506	strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9507
9508	/*
9509	* Create the provider.
9510	*/
9511	dtrace_dofprov2hprov(hprov: &dhpv, dofprov: provider, strtab);
9512
9513	mops->dtms_remove_proc(meta->dtm_arg, &dhpv, p);
9514
9515	meta->dtm_count--;
9516	}
9517
9518	static void
9519	dtrace_helper_provider_remove(dof_helper_t dhp, proc_t p)
9520	{
9521	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9522	dof_hdr_t dof = (dof_hdr_t )daddr;
9523	uint32_t i;
9524
9525	LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
9526
9527	for (i = `0`; i < dof->dofh_secnum; i++) {
9528	dof_sec_t sec = (dof_sec_t )(uintptr_t)(daddr +
9529	dof->dofh_secoff + i * dof->dofh_secsize);
9530
9531	if (sec->dofs_type != DOF_SECT_PROVIDER)
9532	continue;
9533
9534	dtrace_helper_provider_remove_one(dhp, sec, p);
9535	}
9536	}
9537
9538	/*
9539	* DTrace Meta Provider-to-Framework API Functions
9540	*
9541	* These functions implement the Meta Provider-to-Framework API, as described
9542	* in <sys/dtrace.h>.
9543	*/
9544	int
9545	dtrace_meta_register(const char name, const* dtrace_mops_t mops, void* *arg,
9546	dtrace_meta_provider_id_t *idp)
9547	{
9548	dtrace_meta_t *meta;
9549	dtrace_helpers_t help, next;
9550	uint_t i;
9551
9552	*idp = DTRACE_METAPROVNONE;
9553
9554	/*
9555	* We strictly don't need the name, but we hold onto it for
9556	* debuggability. All hail error queues!
9557	*/
9558	if (name == NULL) {
9559	cmn_err(CE_WARN, "failed to register meta-provider: "
9560	"invalid name");
9561	return (EINVAL);
9562	}
9563
9564	if (mops == NULL \|\|
9565	mops->dtms_create_probe == NULL \|\|
9566	mops->dtms_provide_proc == NULL \|\|
9567	mops->dtms_remove_proc == NULL) {
9568	cmn_err(CE_WARN, "failed to register meta-register %s: "
9569	"invalid ops", name);
9570	return (EINVAL);
9571	}
9572
9573	meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
9574	meta->dtm_mops = *mops;
9575	meta->dtm_arg = arg;
9576
9577	lck_mtx_lock(lck: &dtrace_meta_lock);
9578	lck_mtx_lock(lck: &dtrace_lock);
9579
9580	if (dtrace_meta_pid != NULL) {
9581	lck_mtx_unlock(lck: &dtrace_lock);
9582	lck_mtx_unlock(lck: &dtrace_meta_lock);
9583	cmn_err(CE_WARN, "failed to register meta-register %s: "
9584	"user-land meta-provider exists", name);
9585	kmem_free(meta, sizeof (dtrace_meta_t));
9586	return (EINVAL);
9587	}
9588
9589	meta->dtm_name = dtrace_strref(str: name);
9590
9591	dtrace_meta_pid = meta;
9592	*idp = (dtrace_meta_provider_id_t)meta;
9593
9594	/*
9595	* If there are providers and probes ready to go, pass them
9596	* off to the new meta provider now.
9597	*/
9598
9599	help = dtrace_deferred_pid;
9600	dtrace_deferred_pid = NULL;
9601
9602	lck_mtx_unlock(lck: &dtrace_lock);
9603
9604	while (help != NULL) {
9605	for (i = `0`; i < help->dthps_nprovs; i++) {
9606	proc_t *p = proc_find(pid: help->dthps_pid);
9607	if (p == PROC_NULL)
9608	continue;
9609	dtrace_helper_provide(dhp: &help->dthps_provs[i]->dthp_prov,
9610	p);
9611	proc_rele(p);
9612	}
9613
9614	next = help->dthps_next;
9615	help->dthps_next = NULL;
9616	help->dthps_prev = NULL;
9617	help->dthps_deferred = `0`;
9618	help = next;
9619	}
9620
9621	lck_mtx_unlock(lck: &dtrace_meta_lock);
9622
9623	return (`0`);
9624	}
9625
9626	int
9627	dtrace_meta_unregister(dtrace_meta_provider_id_t id)
9628	{
9629	dtrace_meta_t *pp, old = (dtrace_meta_t *)id;
9630
9631	lck_mtx_lock(lck: &dtrace_meta_lock);
9632	lck_mtx_lock(lck: &dtrace_lock);
9633
9634	if (old == dtrace_meta_pid) {
9635	pp = &dtrace_meta_pid;
9636	} else {
9637	panic("attempt to unregister non-existent "
9638	"dtrace meta-provider %p\n", (void *)old);
9639	}
9640
9641	if (old->dtm_count != `0`) {
9642	lck_mtx_unlock(lck: &dtrace_lock);
9643	lck_mtx_unlock(lck: &dtrace_meta_lock);
9644	return (EBUSY);
9645	}
9646
9647	*pp = NULL;
9648
9649	dtrace_strunref(str: old->dtm_name);
9650
9651	lck_mtx_unlock(lck: &dtrace_lock);
9652	lck_mtx_unlock(lck: &dtrace_meta_lock);
9653
9654	kmem_free(old, sizeof (dtrace_meta_t));
9655
9656	return (`0`);
9657	}
9658
9659
9660	/*
9661	* DTrace DIF Object Functions
9662	*/
9663	static int
9664	dtrace_difo_err(uint_t pc, const char *format, ...)
9665	{
9666	if (dtrace_err_verbose) {
9667	va_list alist;
9668
9669	(void) uprintf("dtrace DIF object error: [%u]: ", pc);
9670	va_start(alist, format);
9671	(void) vuprintf(format, alist);
9672	va_end(alist);
9673	}
9674
9675	#ifdef DTRACE_ERRDEBUG
9676	dtrace_errdebug(format);
9677	#endif
9678	return (`1`);
9679	}
9680
9681	/*
9682	* Validate a DTrace DIF object by checking the IR instructions. The following
9683	* rules are currently enforced by dtrace_difo_validate():
9684	*
9685	* 1. Each instruction must have a valid opcode
9686	* 2. Each register, string, variable, or subroutine reference must be valid
9687	* 3. No instruction can modify register %r0 (must be zero)
9688	* 4. All instruction reserved bits must be set to zero
9689	* 5. The last instruction must be a "ret" instruction
9690	* 6. All branch targets must reference a valid instruction _after_ the branch
9691	*/
9692	static int
9693	dtrace_difo_validate(dtrace_difo_t dp, dtrace_vstate_t vstate, uint_t nregs,
9694	cred_t *cr)
9695	{
9696	int err = `0`;
9697	uint_t i;
9698
9699	int (efunc)(uint_t pc, const* char *, ...) = dtrace_difo_err;
9700	int kcheckload;
9701	uint_t pc;
9702	int maxglobal = -`1`, maxlocal = -`1`, maxtlocal = -`1`;
9703
9704	kcheckload = cr == NULL \|\|
9705	(vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == `0`;
9706
9707	dp->dtdo_destructive = `0`;
9708
9709	for (pc = `0`; pc < dp->dtdo_len && err == `0`; pc++) {
9710	dif_instr_t instr = dp->dtdo_buf[pc];
9711
9712	uint_t r1 = DIF_INSTR_R1(instr);
9713	uint_t r2 = DIF_INSTR_R2(instr);
9714	uint_t rd = DIF_INSTR_RD(instr);
9715	uint_t rs = DIF_INSTR_RS(instr);
9716	uint_t label = DIF_INSTR_LABEL(instr);
9717	uint_t v = DIF_INSTR_VAR(instr);
9718	uint_t subr = DIF_INSTR_SUBR(instr);
9719	uint_t type = DIF_INSTR_TYPE(instr);
9720	uint_t op = DIF_INSTR_OP(instr);
9721
9722	switch (op) {
9723	case DIF_OP_OR:
9724	case DIF_OP_XOR:
9725	case DIF_OP_AND:
9726	case DIF_OP_SLL:
9727	case DIF_OP_SRL:
9728	case DIF_OP_SRA:
9729	case DIF_OP_SUB:
9730	case DIF_OP_ADD:
9731	case DIF_OP_MUL:
9732	case DIF_OP_SDIV:
9733	case DIF_OP_UDIV:
9734	case DIF_OP_SREM:
9735	case DIF_OP_UREM:
9736	case DIF_OP_COPYS:
9737	if (r1 >= nregs)
9738	err += efunc(pc, "invalid register %u\n", r1);
9739	if (r2 >= nregs)
9740	err += efunc(pc, "invalid register %u\n", r2);
9741	if (rd >= nregs)
9742	err += efunc(pc, "invalid register %u\n", rd);
9743	if (rd == `0`)
9744	err += efunc(pc, "cannot write to %%r0\n");
9745	break;
9746	case DIF_OP_NOT:
9747	case DIF_OP_MOV:
9748	case DIF_OP_ALLOCS:
9749	if (r1 >= nregs)
9750	err += efunc(pc, "invalid register %u\n", r1);
9751	if (r2 != `0`)
9752	err += efunc(pc, "non-zero reserved bits\n");
9753	if (rd >= nregs)
9754	err += efunc(pc, "invalid register %u\n", rd);
9755	if (rd == `0`)
9756	err += efunc(pc, "cannot write to %%r0\n");
9757	break;
9758	case DIF_OP_LDSB:
9759	case DIF_OP_LDSH:
9760	case DIF_OP_LDSW:
9761	case DIF_OP_LDUB:
9762	case DIF_OP_LDUH:
9763	case DIF_OP_LDUW:
9764	case DIF_OP_LDX:
9765	if (r1 >= nregs)
9766	err += efunc(pc, "invalid register %u\n", r1);
9767	if (r2 != `0`)
9768	err += efunc(pc, "non-zero reserved bits\n");
9769	if (rd >= nregs)
9770	err += efunc(pc, "invalid register %u\n", rd);
9771	if (rd == `0`)
9772	err += efunc(pc, "cannot write to %%r0\n");
9773	if (kcheckload)
9774	dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
9775	DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
9776	break;
9777	case DIF_OP_RLDSB:
9778	case DIF_OP_RLDSH:
9779	case DIF_OP_RLDSW:
9780	case DIF_OP_RLDUB:
9781	case DIF_OP_RLDUH:
9782	case DIF_OP_RLDUW:
9783	case DIF_OP_RLDX:
9784	if (r1 >= nregs)
9785	err += efunc(pc, "invalid register %u\n", r1);
9786	if (r2 != `0`)
9787	err += efunc(pc, "non-zero reserved bits\n");
9788	if (rd >= nregs)
9789	err += efunc(pc, "invalid register %u\n", rd);
9790	if (rd == `0`)
9791	err += efunc(pc, "cannot write to %%r0\n");
9792	break;
9793	case DIF_OP_ULDSB:
9794	case DIF_OP_ULDSH:
9795	case DIF_OP_ULDSW:
9796	case DIF_OP_ULDUB:
9797	case DIF_OP_ULDUH:
9798	case DIF_OP_ULDUW:
9799	case DIF_OP_ULDX:
9800	if (r1 >= nregs)
9801	err += efunc(pc, "invalid register %u\n", r1);
9802	if (r2 != `0`)
9803	err += efunc(pc, "non-zero reserved bits\n");
9804	if (rd >= nregs)
9805	err += efunc(pc, "invalid register %u\n", rd);
9806	if (rd == `0`)
9807	err += efunc(pc, "cannot write to %%r0\n");
9808	break;
9809	case DIF_OP_STB:
9810	case DIF_OP_STH:
9811	case DIF_OP_STW:
9812	case DIF_OP_STX:
9813	if (r1 >= nregs)
9814	err += efunc(pc, "invalid register %u\n", r1);
9815	if (r2 != `0`)
9816	err += efunc(pc, "non-zero reserved bits\n");
9817	if (rd >= nregs)
9818	err += efunc(pc, "invalid register %u\n", rd);
9819	if (rd == `0`)
9820	err += efunc(pc, "cannot write to 0 address\n");
9821	break;
9822	case DIF_OP_CMP:
9823	case DIF_OP_SCMP:
9824	if (r1 >= nregs)
9825	err += efunc(pc, "invalid register %u\n", r1);
9826	if (r2 >= nregs)
9827	err += efunc(pc, "invalid register %u\n", r2);
9828	if (rd != `0`)
9829	err += efunc(pc, "non-zero reserved bits\n");
9830	break;
9831	case DIF_OP_TST:
9832	if (r1 >= nregs)
9833	err += efunc(pc, "invalid register %u\n", r1);
9834	if (r2 != `0` \|\| rd != `0`)
9835	err += efunc(pc, "non-zero reserved bits\n");
9836	break;
9837	case DIF_OP_BA:
9838	case DIF_OP_BE:
9839	case DIF_OP_BNE:
9840	case DIF_OP_BG:
9841	case DIF_OP_BGU:
9842	case DIF_OP_BGE:
9843	case DIF_OP_BGEU:
9844	case DIF_OP_BL:
9845	case DIF_OP_BLU:
9846	case DIF_OP_BLE:
9847	case DIF_OP_BLEU:
9848	if (label >= dp->dtdo_len) {
9849	err += efunc(pc, "invalid branch target %u\n",
9850	label);
9851	}
9852	if (label <= pc) {
9853	err += efunc(pc, "backward branch to %u\n",
9854	label);
9855	}
9856	break;
9857	case DIF_OP_RET:
9858	if (r1 != `0` \|\| r2 != `0`)
9859	err += efunc(pc, "non-zero reserved bits\n");
9860	if (rd >= nregs)
9861	err += efunc(pc, "invalid register %u\n", rd);
9862	break;
9863	case DIF_OP_NOP:
9864	case DIF_OP_POPTS:
9865	case DIF_OP_FLUSHTS:
9866	if (r1 != `0` \|\| r2 != `0` \|\| rd != `0`)
9867	err += efunc(pc, "non-zero reserved bits\n");
9868	break;
9869	case DIF_OP_SETX:
9870	if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
9871	err += efunc(pc, "invalid integer ref %u\n",
9872	DIF_INSTR_INTEGER(instr));
9873	}
9874	if (rd >= nregs)
9875	err += efunc(pc, "invalid register %u\n", rd);
9876	if (rd == `0`)
9877	err += efunc(pc, "cannot write to %%r0\n");
9878	break;
9879	case DIF_OP_SETS:
9880	if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
9881	err += efunc(pc, "invalid string ref %u\n",
9882	DIF_INSTR_STRING(instr));
9883	}
9884	if (rd >= nregs)
9885	err += efunc(pc, "invalid register %u\n", rd);
9886	if (rd == `0`)
9887	err += efunc(pc, "cannot write to %%r0\n");
9888	break;
9889	case DIF_OP_LDGA:
9890	case DIF_OP_LDTA:
9891	if (r1 > DIF_VAR_ARRAY_MAX)
9892	err += efunc(pc, "invalid array %u\n", r1);
9893	if (r2 >= nregs)
9894	err += efunc(pc, "invalid register %u\n", r2);
9895	if (rd >= nregs)
9896	err += efunc(pc, "invalid register %u\n", rd);
9897	if (rd == `0`)
9898	err += efunc(pc, "cannot write to %%r0\n");
9899	break;
9900	case DIF_OP_LDGS:
9901	case DIF_OP_LDTS:
9902	case DIF_OP_LDLS:
9903	case DIF_OP_LDGAA:
9904	case DIF_OP_LDTAA:
9905	if (v < DIF_VAR_OTHER_MIN \|\| v > DIF_VAR_OTHER_MAX)
9906	err += efunc(pc, "invalid variable %u\n", v);
9907	if (rd >= nregs)
9908	err += efunc(pc, "invalid register %u\n", rd);
9909	if (rd == `0`)
9910	err += efunc(pc, "cannot write to %%r0\n");
9911	break;
9912	case DIF_OP_STGS:
9913	case DIF_OP_STTS:
9914	case DIF_OP_STLS:
9915	case DIF_OP_STGAA:
9916	case DIF_OP_STTAA:
9917	if (v < DIF_VAR_OTHER_UBASE \|\| v > DIF_VAR_OTHER_MAX)
9918	err += efunc(pc, "invalid variable %u\n", v);
9919	if (rs >= nregs)
9920	err += efunc(pc, "invalid register %u\n", rd);
9921	break;
9922	case DIF_OP_CALL:
9923	if (subr > DIF_SUBR_MAX &&
9924	!(subr >= DIF_SUBR_APPLE_MIN && subr <= DIF_SUBR_APPLE_MAX))
9925	err += efunc(pc, "invalid subr %u\n", subr);
9926	if (rd >= nregs)
9927	err += efunc(pc, "invalid register %u\n", rd);
9928	if (rd == `0`)
9929	err += efunc(pc, "cannot write to %%r0\n");
9930
9931	switch (subr) {
9932	case DIF_SUBR_COPYOUT:
9933	case DIF_SUBR_COPYOUTSTR:
9934	case DIF_SUBR_KDEBUG_TRACE:
9935	case DIF_SUBR_KDEBUG_TRACE_STRING:
9936	case DIF_SUBR_PHYSMEM_READ:
9937	case DIF_SUBR_PHYSMEM_WRITE:
9938	case DIF_SUBR_LIVEDUMP:
9939	dp->dtdo_destructive = `1`;
9940	break;
9941	default:
9942	break;
9943	}
9944	break;
9945	case DIF_OP_PUSHTR:
9946	if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
9947	err += efunc(pc, "invalid ref type %u\n", type);
9948	if (r2 >= nregs)
9949	err += efunc(pc, "invalid register %u\n", r2);
9950	if (rs >= nregs)
9951	err += efunc(pc, "invalid register %u\n", rs);
9952	break;
9953	case DIF_OP_PUSHTV:
9954	if (type != DIF_TYPE_CTF)
9955	err += efunc(pc, "invalid val type %u\n", type);
9956	if (r2 >= nregs)
9957	err += efunc(pc, "invalid register %u\n", r2);
9958	if (rs >= nregs)
9959	err += efunc(pc, "invalid register %u\n", rs);
9960	break;
9961	case DIF_OP_STRIP:
9962	if (r1 >= nregs)
9963	err += efunc(pc, "invalid register %u\n", r1);
9964	if (!dtrace_is_valid_ptrauth_key(r2))
9965	err += efunc(pc, "invalid key\n");
9966	if (rd >= nregs)
9967	err += efunc(pc, "invalid register %u\n", rd);
9968	if (rd == `0`)
9969	err += efunc(pc, "cannot write to %%r0\n");
9970	break;
9971	default:
9972	err += efunc(pc, "invalid opcode %u\n",
9973	DIF_INSTR_OP(instr));
9974	}
9975	}
9976
9977	if (dp->dtdo_len != `0` &&
9978	DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - `1`]) != DIF_OP_RET) {
9979	err += efunc(dp->dtdo_len - `1`,
9980	"expected 'ret' as last DIF instruction\n");
9981	}
9982
9983	if (!(dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF \| DIF_TF_BYUREF))) {
9984	/*
9985	* If we're not returning by reference, the size must be either
9986	* 0 or the size of one of the base types.
9987	*/
9988	switch (dp->dtdo_rtype.dtdt_size) {
9989	case `0`:
9990	case sizeof (uint8_t):
9991	case sizeof (uint16_t):
9992	case sizeof (uint32_t):
9993	case sizeof (uint64_t):
9994	break;
9995
9996	default:
9997	err += efunc(dp->dtdo_len - `1`, "bad return size\n");
9998	}
9999	}
10000
10001	for (i = `0`; i < dp->dtdo_varlen && err == `0`; i++) {
10002	dtrace_difv_t v = &dp->dtdo_vartab[i], existing = NULL;
10003	dtrace_diftype_t vt, et;
10004	uint_t id;
10005	int ndx;
10006
10007	if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
10008	v->dtdv_scope != DIFV_SCOPE_THREAD &&
10009	v->dtdv_scope != DIFV_SCOPE_LOCAL) {
10010	err += efunc(i, "unrecognized variable scope %d\n",
10011	v->dtdv_scope);
10012	break;
10013	}
10014
10015	if (v->dtdv_kind != DIFV_KIND_ARRAY &&
10016	v->dtdv_kind != DIFV_KIND_SCALAR) {
10017	err += efunc(i, "unrecognized variable type %d\n",
10018	v->dtdv_kind);
10019	break;
10020	}
10021
10022	if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
10023	err += efunc(i, "%d exceeds variable id limit\n", id);
10024	break;
10025	}
10026
10027	if (id < DIF_VAR_OTHER_UBASE)
10028	continue;
10029
10030	/*
10031	* For user-defined variables, we need to check that this
10032	* definition is identical to any previous definition that we
10033	* encountered.
10034	*/
10035	ndx = id - DIF_VAR_OTHER_UBASE;
10036
10037	switch (v->dtdv_scope) {
10038	case DIFV_SCOPE_GLOBAL:
10039	if (maxglobal == -`1` \|\| ndx > maxglobal)
10040	maxglobal = ndx;
10041
10042	if (ndx < vstate->dtvs_nglobals) {
10043	dtrace_statvar_t *svar;
10044
10045	if ((svar = vstate->dtvs_globals[ndx]) != NULL)
10046	existing = &svar->dtsv_var;
10047	}
10048
10049	break;
10050
10051	case DIFV_SCOPE_THREAD:
10052	if (maxtlocal == -`1` \|\| ndx > maxtlocal)
10053	maxtlocal = ndx;
10054
10055	if (ndx < vstate->dtvs_ntlocals)
10056	existing = &vstate->dtvs_tlocals[ndx];
10057	break;
10058
10059	case DIFV_SCOPE_LOCAL:
10060	if (maxlocal == -`1` \|\| ndx > maxlocal)
10061	maxlocal = ndx;
10062	if (ndx < vstate->dtvs_nlocals) {
10063	dtrace_statvar_t *svar;
10064
10065	if ((svar = vstate->dtvs_locals[ndx]) != NULL)
10066	existing = &svar->dtsv_var;
10067	}
10068
10069	break;
10070	}
10071
10072	vt = &v->dtdv_type;
10073
10074	if (vt->dtdt_flags & DIF_TF_BYREF) {
10075	if (vt->dtdt_size == `0`) {
10076	err += efunc(i, "zero-sized variable\n");
10077	break;
10078	}
10079
10080	if ((v->dtdv_scope == DIFV_SCOPE_GLOBAL \|\|
10081	v->dtdv_scope == DIFV_SCOPE_LOCAL) &&
10082	vt->dtdt_size > dtrace_statvar_maxsize) {
10083	err += efunc(i, "oversized by-ref static\n");
10084	break;
10085	}
10086	}
10087
10088	if (existing == NULL \|\| existing->dtdv_id == `0`)
10089	continue;
10090
10091	ASSERT(existing->dtdv_id == v->dtdv_id);
10092	ASSERT(existing->dtdv_scope == v->dtdv_scope);
10093
10094	if (existing->dtdv_kind != v->dtdv_kind)
10095	err += efunc(i, "%d changed variable kind\n", id);
10096
10097	et = &existing->dtdv_type;
10098
10099	if (vt->dtdt_flags != et->dtdt_flags) {
10100	err += efunc(i, "%d changed variable type flags\n", id);
10101	break;
10102	}
10103
10104	if (vt->dtdt_size != `0` && vt->dtdt_size != et->dtdt_size) {
10105	err += efunc(i, "%d changed variable type size\n", id);
10106	break;
10107	}
10108	}
10109
10110	for (pc = `0`; pc < dp->dtdo_len && err == `0`; pc++) {
10111	dif_instr_t instr = dp->dtdo_buf[pc];
10112
10113	uint_t v = DIF_INSTR_VAR(instr);
10114	uint_t op = DIF_INSTR_OP(instr);
10115
10116	switch (op) {
10117	case DIF_OP_LDGS:
10118	case DIF_OP_LDGAA:
10119	case DIF_OP_STGS:
10120	case DIF_OP_STGAA:
10121	if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxglobal))
10122	err += efunc(pc, "invalid variable %u\n", v);
10123	break;
10124	case DIF_OP_LDTS:
10125	case DIF_OP_LDTAA:
10126	case DIF_OP_STTS:
10127	case DIF_OP_STTAA:
10128	if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxtlocal))
10129	err += efunc(pc, "invalid variable %u\n", v);
10130	break;
10131	case DIF_OP_LDLS:
10132	case DIF_OP_STLS:
10133	if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxlocal))
10134	err += efunc(pc, "invalid variable %u\n", v);
10135	break;
10136	default:
10137	break;
10138	}
10139	}
10140
10141	return (err);
10142	}
10143
10144	/*
10145	* Validate a DTrace DIF object that it is to be used as a helper. Helpers
10146	* are much more constrained than normal DIFOs. Specifically, they may
10147	* not:
10148	*
10149	* 1. Make calls to subroutines other than copyin(), copyinstr() or
10150	* miscellaneous string routines
10151	* 2. Access DTrace variables other than the args[] array, and the
10152	* curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
10153	* 3. Have thread-local variables.
10154	* 4. Have dynamic variables.
10155	*/
10156	static int
10157	dtrace_difo_validate_helper(dtrace_difo_t *dp)
10158	{
10159	int (efunc)(uint_t pc, const* char *, ...) = dtrace_difo_err;
10160	int err = `0`;
10161	uint_t pc;
10162
10163	for (pc = `0`; pc < dp->dtdo_len; pc++) {
10164	dif_instr_t instr = dp->dtdo_buf[pc];
10165
10166	uint_t v = DIF_INSTR_VAR(instr);
10167	uint_t subr = DIF_INSTR_SUBR(instr);
10168	uint_t op = DIF_INSTR_OP(instr);
10169
10170	switch (op) {
10171	case DIF_OP_OR:
10172	case DIF_OP_XOR:
10173	case DIF_OP_AND:
10174	case DIF_OP_SLL:
10175	case DIF_OP_SRL:
10176	case DIF_OP_SRA:
10177	case DIF_OP_SUB:
10178	case DIF_OP_ADD:
10179	case DIF_OP_MUL:
10180	case DIF_OP_SDIV:
10181	case DIF_OP_UDIV:
10182	case DIF_OP_SREM:
10183	case DIF_OP_UREM:
10184	case DIF_OP_COPYS:
10185	case DIF_OP_NOT:
10186	case DIF_OP_MOV:
10187	case DIF_OP_RLDSB:
10188	case DIF_OP_RLDSH:
10189	case DIF_OP_RLDSW:
10190	case DIF_OP_RLDUB:
10191	case DIF_OP_RLDUH:
10192	case DIF_OP_RLDUW:
10193	case DIF_OP_RLDX:
10194	case DIF_OP_ULDSB:
10195	case DIF_OP_ULDSH:
10196	case DIF_OP_ULDSW:
10197	case DIF_OP_ULDUB:
10198	case DIF_OP_ULDUH:
10199	case DIF_OP_ULDUW:
10200	case DIF_OP_ULDX:
10201	case DIF_OP_STB:
10202	case DIF_OP_STH:
10203	case DIF_OP_STW:
10204	case DIF_OP_STX:
10205	case DIF_OP_ALLOCS:
10206	case DIF_OP_CMP:
10207	case DIF_OP_SCMP:
10208	case DIF_OP_TST:
10209	case DIF_OP_BA:
10210	case DIF_OP_BE:
10211	case DIF_OP_BNE:
10212	case DIF_OP_BG:
10213	case DIF_OP_BGU:
10214	case DIF_OP_BGE:
10215	case DIF_OP_BGEU:
10216	case DIF_OP_BL:
10217	case DIF_OP_BLU:
10218	case DIF_OP_BLE:
10219	case DIF_OP_BLEU:
10220	case DIF_OP_RET:
10221	case DIF_OP_NOP:
10222	case DIF_OP_POPTS:
10223	case DIF_OP_FLUSHTS:
10224	case DIF_OP_SETX:
10225	case DIF_OP_SETS:
10226	case DIF_OP_LDGA:
10227	case DIF_OP_LDLS:
10228	case DIF_OP_STGS:
10229	case DIF_OP_STLS:
10230	case DIF_OP_PUSHTR:
10231	case DIF_OP_PUSHTV:
10232	break;
10233
10234	case DIF_OP_LDGS:
10235	if (v >= DIF_VAR_OTHER_UBASE)
10236	break;
10237
10238	if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
10239	break;
10240
10241	if (v == DIF_VAR_CURTHREAD \|\| v == DIF_VAR_PID \|\|
10242	v == DIF_VAR_PPID \|\| v == DIF_VAR_TID \|\|
10243	v == DIF_VAR_EXECNAME \|\| v == DIF_VAR_ZONENAME \|\|
10244	v == DIF_VAR_UID \|\| v == DIF_VAR_GID)
10245	break;
10246
10247	err += efunc(pc, "illegal variable %u\n", v);
10248	break;
10249
10250	case DIF_OP_LDTA:
10251	case DIF_OP_LDTS:
10252	case DIF_OP_LDGAA:
10253	case DIF_OP_LDTAA:
10254	err += efunc(pc, "illegal dynamic variable load\n");
10255	break;
10256
10257	case DIF_OP_STTS:
10258	case DIF_OP_STGAA:
10259	case DIF_OP_STTAA:
10260	err += efunc(pc, "illegal dynamic variable store\n");
10261	break;
10262
10263	case DIF_OP_CALL:
10264	switch (subr) {
10265	case DIF_SUBR_ALLOCA:
10266	case DIF_SUBR_BCOPY:
10267	case DIF_SUBR_COPYIN:
10268	case DIF_SUBR_COPYINTO:
10269	case DIF_SUBR_COPYINSTR:
10270	case DIF_SUBR_HTONS:
10271	case DIF_SUBR_HTONL:
10272	case DIF_SUBR_HTONLL:
10273	case DIF_SUBR_INDEX:
10274	case DIF_SUBR_INET_NTOA:
10275	case DIF_SUBR_INET_NTOA6:
10276	case DIF_SUBR_INET_NTOP:
10277	case DIF_SUBR_JSON:
10278	case DIF_SUBR_LLTOSTR:
10279	case DIF_SUBR_NTOHS:
10280	case DIF_SUBR_NTOHL:
10281	case DIF_SUBR_NTOHLL:
10282	case DIF_SUBR_RINDEX:
10283	case DIF_SUBR_STRCHR:
10284	case DIF_SUBR_STRTOLL:
10285	case DIF_SUBR_STRJOIN:
10286	case DIF_SUBR_STRRCHR:
10287	case DIF_SUBR_STRSTR:
10288	break;
10289	default:
10290	err += efunc(pc, "invalid subr %u\n", subr);
10291	}
10292	break;
10293
10294	default:
10295	err += efunc(pc, "invalid opcode %u\n",
10296	DIF_INSTR_OP(instr));
10297	}
10298	}
10299
10300	return (err);
10301	}
10302
10303	/*
10304	* Returns 1 if the expression in the DIF object can be cached on a per-thread
10305	* basis; 0 if not.
10306	*/
10307	static int
10308	dtrace_difo_cacheable(dtrace_difo_t *dp)
10309	{
10310	uint_t i;
10311
10312	if (dp == NULL)
10313	return (`0`);
10314
10315	for (i = `0`; i < dp->dtdo_varlen; i++) {
10316	dtrace_difv_t *v = &dp->dtdo_vartab[i];
10317
10318	if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
10319	continue;
10320
10321	switch (v->dtdv_id) {
10322	case DIF_VAR_CURTHREAD:
10323	case DIF_VAR_PID:
10324	case DIF_VAR_TID:
10325	case DIF_VAR_EXECNAME:
10326	case DIF_VAR_ZONENAME:
10327	break;
10328
10329	default:
10330	return (`0`);
10331	}
10332	}
10333
10334	/*
10335	* This DIF object may be cacheable. Now we need to look for any
10336	* array loading instructions, any memory loading instructions, or
10337	* any stores to thread-local variables.
10338	*/
10339	for (i = `0`; i < dp->dtdo_len; i++) {
10340	uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
10341
10342	if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) \|\|
10343	(op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) \|\|
10344	(op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) \|\|
10345	op == DIF_OP_LDGA \|\| op == DIF_OP_STTS)
10346	return (`0`);
10347	}
10348
10349	return (`1`);
10350	}
10351
10352	static void
10353	dtrace_difo_hold(dtrace_difo_t *dp)
10354	{
10355	uint_t i;
10356
10357	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10358
10359	dp->dtdo_refcnt++;
10360	ASSERT(dp->dtdo_refcnt != `0`);
10361
10362	/*
10363	* We need to check this DIF object for references to the variable
10364	* DIF_VAR_VTIMESTAMP.
10365	*/
10366	for (i = `0`; i < dp->dtdo_varlen; i++) {
10367	dtrace_difv_t *v = &dp->dtdo_vartab[i];
10368
10369	if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10370	continue;
10371
10372	if (dtrace_vtime_references++ == `0`)
10373	dtrace_vtime_enable();
10374	}
10375	}
10376
10377	/*
10378	* This routine calculates the dynamic variable chunksize for a given DIF
10379	* object. The calculation is not fool-proof, and can probably be tricked by
10380	* malicious DIF -- but it works for all compiler-generated DIF. Because this
10381	* calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
10382	* if a dynamic variable size exceeds the chunksize.
10383	*/
10384	static void
10385	dtrace_difo_chunksize(dtrace_difo_t dp, dtrace_vstate_t vstate)
10386	{
10387	uint64_t sval = `0`;
10388	dtrace_key_t tupregs[DIF_DTR_NREGS + `2`]; / +2 for thread and id /
10389	const dif_instr_t *text = dp->dtdo_buf;
10390	uint_t pc, srd = `0`;
10391	uint_t ttop = `0`;
10392	size_t size, ksize;
10393	uint_t id, i;
10394
10395	for (pc = `0`; pc < dp->dtdo_len; pc++) {
10396	dif_instr_t instr = text[pc];
10397	uint_t op = DIF_INSTR_OP(instr);
10398	uint_t rd = DIF_INSTR_RD(instr);
10399	uint_t r1 = DIF_INSTR_R1(instr);
10400	uint_t nkeys = `0`;
10401	uchar_t scope;
10402
10403	dtrace_key_t *key = tupregs;
10404
10405	switch (op) {
10406	case DIF_OP_SETX:
10407	sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
10408	srd = rd;
10409	continue;
10410
10411	case DIF_OP_STTS:
10412	key = &tupregs[DIF_DTR_NREGS];
10413	key[`0`].dttk_size = `0`;
10414	key[`1`].dttk_size = `0`;
10415	nkeys = `2`;
10416	scope = DIFV_SCOPE_THREAD;
10417	break;
10418
10419	case DIF_OP_STGAA:
10420	case DIF_OP_STTAA:
10421	nkeys = ttop;
10422
10423	if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
10424	key[nkeys++].dttk_size = `0`;
10425
10426	key[nkeys++].dttk_size = `0`;
10427
10428	if (op == DIF_OP_STTAA) {
10429	scope = DIFV_SCOPE_THREAD;
10430	} else {
10431	scope = DIFV_SCOPE_GLOBAL;
10432	}
10433
10434	break;
10435
10436	case DIF_OP_PUSHTR:
10437	if (ttop == DIF_DTR_NREGS)
10438	return;
10439
10440	if ((srd == `0` \|\| sval == `0`) && r1 == DIF_TYPE_STRING) {
10441	/*
10442	* If the register for the size of the "pushtr"
10443	* is %r0 (or the value is 0) and the type is
10444	* a string, we'll use the system-wide default
10445	* string size.
10446	*/
10447	tupregs[ttop++].dttk_size =
10448	dtrace_strsize_default;
10449	} else {
10450	if (srd == `0`)
10451	return;
10452
10453	if (sval > LONG_MAX)
10454	return;
10455
10456	tupregs[ttop++].dttk_size = sval;
10457	}
10458
10459	break;
10460
10461	case DIF_OP_PUSHTV:
10462	if (ttop == DIF_DTR_NREGS)
10463	return;
10464
10465	tupregs[ttop++].dttk_size = `0`;
10466	break;
10467
10468	case DIF_OP_FLUSHTS:
10469	ttop = `0`;
10470	break;
10471
10472	case DIF_OP_POPTS:
10473	if (ttop != `0`)
10474	ttop--;
10475	break;
10476	}
10477
10478	sval = `0`;
10479	srd = `0`;
10480
10481	if (nkeys == `0`)
10482	continue;
10483
10484	/*
10485	* We have a dynamic variable allocation; calculate its size.
10486	*/
10487	for (ksize = `0`, i = `0`; i < nkeys; i++)
10488	ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
10489
10490	size = sizeof (dtrace_dynvar_t);
10491	size += sizeof (dtrace_key_t) * (nkeys - `1`);
10492	size += ksize;
10493
10494	/*
10495	* Now we need to determine the size of the stored data.
10496	*/
10497	id = DIF_INSTR_VAR(instr);
10498
10499	for (i = `0`; i < dp->dtdo_varlen; i++) {
10500	dtrace_difv_t *v = &dp->dtdo_vartab[i];
10501
10502	if (v->dtdv_id == id && v->dtdv_scope == scope) {
10503	size += v->dtdv_type.dtdt_size;
10504	break;
10505	}
10506	}
10507
10508	if (i == dp->dtdo_varlen)
10509	return;
10510
10511	/*
10512	* We have the size. If this is larger than the chunk size
10513	* for our dynamic variable state, reset the chunk size.
10514	*/
10515	size = P2ROUNDUP(size, sizeof (uint64_t));
10516
10517	/*
10518	* Before setting the chunk size, check that we're not going
10519	* to set it to a negative value...
10520	*/
10521	if (size > LONG_MAX)
10522	return;
10523
10524	/*
10525	* ...and make certain that we didn't badly overflow.
10526	*/
10527	if (size < ksize \|\| size < sizeof (dtrace_dynvar_t))
10528	return;
10529
10530	if (size > vstate->dtvs_dynvars.dtds_chunksize)
10531	vstate->dtvs_dynvars.dtds_chunksize = size;
10532	}
10533	}
10534
10535	static void
10536	dtrace_difo_init(dtrace_difo_t dp, dtrace_vstate_t vstate)
10537	{
10538	int oldsvars, osz, nsz, otlocals, ntlocals;
10539	uint_t i, id;
10540
10541	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10542	ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != `0`);
10543
10544	for (i = `0`; i < dp->dtdo_varlen; i++) {
10545	dtrace_difv_t *v = &dp->dtdo_vartab[i];
10546	dtrace_statvar_t *svar;
10547	dtrace_statvar_t ***svarp = NULL;
10548	size_t dsize = `0`;
10549	uint8_t scope = v->dtdv_scope;
10550	int np = (int* *)NULL;
10551
10552	if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10553	continue;
10554
10555	id -= DIF_VAR_OTHER_UBASE;
10556
10557	switch (scope) {
10558	case DIFV_SCOPE_THREAD:
10559	while (id >= (uint_t)(otlocals = vstate->dtvs_ntlocals)) {
10560	dtrace_difv_t *tlocals;
10561
10562	if ((ntlocals = (otlocals << `1`)) == `0`)
10563	ntlocals = `1`;
10564
10565	osz = otlocals * sizeof (dtrace_difv_t);
10566	nsz = ntlocals * sizeof (dtrace_difv_t);
10567
10568	tlocals = kmem_zalloc(nsz, KM_SLEEP);
10569
10570	if (osz != `0`) {
10571	bcopy(src: vstate->dtvs_tlocals,
10572	dst: tlocals, n: osz);
10573	kmem_free(vstate->dtvs_tlocals, osz);
10574	}
10575
10576	vstate->dtvs_tlocals = tlocals;
10577	vstate->dtvs_ntlocals = ntlocals;
10578	}
10579
10580	vstate->dtvs_tlocals[id] = *v;
10581	continue;
10582
10583	case DIFV_SCOPE_LOCAL:
10584	np = &vstate->dtvs_nlocals;
10585	svarp = &vstate->dtvs_locals;
10586
10587	if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10588	dsize = (int)NCPU * (v->dtdv_type.dtdt_size +
10589	sizeof (uint64_t));
10590	else
10591	dsize = (int)NCPU * sizeof (uint64_t);
10592
10593	break;
10594
10595	case DIFV_SCOPE_GLOBAL:
10596	np = &vstate->dtvs_nglobals;
10597	svarp = &vstate->dtvs_globals;
10598
10599	if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10600	dsize = v->dtdv_type.dtdt_size +
10601	sizeof (uint64_t);
10602
10603	break;
10604
10605	default:
10606	ASSERT(`0`);
10607	}
10608
10609	while (id >= (uint_t)(oldsvars = *np)) {
10610	dtrace_statvar_t **statics;
10611	int newsvars, oldsize, newsize;
10612
10613	if ((newsvars = (oldsvars << `1`)) == `0`)
10614	newsvars = `1`;
10615
10616	oldsize = oldsvars * sizeof (dtrace_statvar_t *);
10617	newsize = newsvars * sizeof (dtrace_statvar_t *);
10618
10619	statics = kmem_zalloc(newsize, KM_SLEEP);
10620
10621	if (oldsize != `0`) {
10622	bcopy(src: *svarp, dst: statics, n: oldsize);
10623	kmem_free(*svarp, oldsize);
10624	}
10625
10626	*svarp = statics;
10627	*np = newsvars;
10628	}
10629
10630	if ((svar = (*svarp)[id]) == NULL) {
10631	svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
10632	svar->dtsv_var = *v;
10633
10634	if ((svar->dtsv_size = dsize) != `0`) {
10635	svar->dtsv_data = (uint64_t)(uintptr_t)
10636	kmem_zalloc(dsize, KM_SLEEP);
10637	}
10638
10639	(*svarp)[id] = svar;
10640	}
10641
10642	svar->dtsv_refcnt++;
10643	}
10644
10645	dtrace_difo_chunksize(dp, vstate);
10646	dtrace_difo_hold(dp);
10647	}
10648
10649	static dtrace_difo_t *
10650	dtrace_difo_duplicate(dtrace_difo_t dp, dtrace_vstate_t vstate)
10651	{
10652	dtrace_difo_t *new;
10653	size_t sz;
10654
10655	ASSERT(dp->dtdo_buf != NULL);
10656	ASSERT(dp->dtdo_refcnt != `0`);
10657
10658	new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
10659
10660	ASSERT(dp->dtdo_buf != NULL);
10661	sz = dp->dtdo_len * sizeof (dif_instr_t);
10662	new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
10663	bcopy(src: dp->dtdo_buf, dst: new->dtdo_buf, n: sz);
10664	new->dtdo_len = dp->dtdo_len;
10665
10666	if (dp->dtdo_strtab != NULL) {
10667	ASSERT(dp->dtdo_strlen != `0`);
10668	new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
10669	bcopy(src: dp->dtdo_strtab, dst: new->dtdo_strtab, n: dp->dtdo_strlen);
10670	new->dtdo_strlen = dp->dtdo_strlen;
10671	}
10672
10673	if (dp->dtdo_inttab != NULL) {
10674	ASSERT(dp->dtdo_intlen != `0`);
10675	sz = dp->dtdo_intlen * sizeof (uint64_t);
10676	new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
10677	bcopy(src: dp->dtdo_inttab, dst: new->dtdo_inttab, n: sz);
10678	new->dtdo_intlen = dp->dtdo_intlen;
10679	}
10680
10681	if (dp->dtdo_vartab != NULL) {
10682	ASSERT(dp->dtdo_varlen != `0`);
10683	sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
10684	new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
10685	bcopy(src: dp->dtdo_vartab, dst: new->dtdo_vartab, n: sz);
10686	new->dtdo_varlen = dp->dtdo_varlen;
10687	}
10688
10689	dtrace_difo_init(dp: new, vstate);
10690	return (new);
10691	}
10692
10693	static void
10694	dtrace_difo_destroy(dtrace_difo_t dp, dtrace_vstate_t vstate)
10695	{
10696	uint_t i;
10697
10698	ASSERT(dp->dtdo_refcnt == `0`);
10699
10700	for (i = `0`; i < dp->dtdo_varlen; i++) {
10701	dtrace_difv_t *v = &dp->dtdo_vartab[i];
10702	dtrace_statvar_t *svar;
10703	dtrace_statvar_t **svarp = NULL;
10704	uint_t id;
10705	uint8_t scope = v->dtdv_scope;
10706	int *np = NULL;
10707
10708	switch (scope) {
10709	case DIFV_SCOPE_THREAD:
10710	continue;
10711
10712	case DIFV_SCOPE_LOCAL:
10713	np = &vstate->dtvs_nlocals;
10714	svarp = vstate->dtvs_locals;
10715	break;
10716
10717	case DIFV_SCOPE_GLOBAL:
10718	np = &vstate->dtvs_nglobals;
10719	svarp = vstate->dtvs_globals;
10720	break;
10721
10722	default:
10723	ASSERT(`0`);
10724	}
10725
10726	if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10727	continue;
10728
10729	id -= DIF_VAR_OTHER_UBASE;
10730
10731	ASSERT(id < (uint_t)*np);
10732
10733	svar = svarp[id];
10734	ASSERT(svar != NULL);
10735	ASSERT(svar->dtsv_refcnt > `0`);
10736
10737	if (--svar->dtsv_refcnt > `0`)
10738	continue;
10739
10740	if (svar->dtsv_size != `0`) {
10741	ASSERT(svar->dtsv_data != `0`);
10742	kmem_free((void *)(uintptr_t)svar->dtsv_data,
10743	svar->dtsv_size);
10744	}
10745
10746	kmem_free(svar, sizeof (dtrace_statvar_t));
10747	svarp[id] = NULL;
10748	}
10749
10750	kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
10751	kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
10752	kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
10753	kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
10754
10755	kmem_free(dp, sizeof (dtrace_difo_t));
10756	}
10757
10758	static void
10759	dtrace_difo_release(dtrace_difo_t dp, dtrace_vstate_t vstate)
10760	{
10761	uint_t i;
10762
10763	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10764	ASSERT(dp->dtdo_refcnt != `0`);
10765
10766	for (i = `0`; i < dp->dtdo_varlen; i++) {
10767	dtrace_difv_t *v = &dp->dtdo_vartab[i];
10768
10769	if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10770	continue;
10771
10772	ASSERT(dtrace_vtime_references > `0`);
10773	if (--dtrace_vtime_references == `0`)
10774	dtrace_vtime_disable();
10775	}
10776
10777	if (--dp->dtdo_refcnt == `0`)
10778	dtrace_difo_destroy(dp, vstate);
10779	}
10780
10781	/*
10782	* DTrace Format Functions
10783	*/
10784
10785	static dtrace_format_t*
10786	dtrace_format_new(char *str)
10787	{
10788	dtrace_format_t *fmt = NULL;
10789	size_t bufsize = strlen(s: str) + `1`;
10790
10791	fmt = kmem_zalloc(sizeof(*fmt) + bufsize, KM_SLEEP);
10792
10793	fmt->dtf_refcount = `1`;
10794	(void) strlcpy(dst: fmt->dtf_str, src: str, n: bufsize);
10795
10796	return fmt;
10797	}
10798
10799	static uint16_t
10800	dtrace_format_add(dtrace_state_t state, char* *str)
10801	{
10802	dtrace_format_t **new;
10803	uint16_t ndx;
10804
10805	for (ndx = `0`; ndx < state->dts_nformats; ndx++) {
10806	if (state->dts_formats[ndx] == NULL) {
10807	state->dts_formats[ndx] = dtrace_format_new(str);
10808	return (ndx + `1`);
10809	}
10810	else if (strcmp(s1: state->dts_formats[ndx]->dtf_str, s2: str) == `0`) {
10811	VERIFY(state->dts_formats[ndx]->dtf_refcount < UINT64_MAX);
10812	state->dts_formats[ndx]->dtf_refcount++;
10813	return (ndx + `1`);
10814	}
10815	}
10816
10817	if (state->dts_nformats == USHRT_MAX) {
10818	/*
10819	* This is only likely if a denial-of-service attack is being
10820	* attempted. As such, it's okay to fail silently here.
10821	*/
10822	return (`0`);
10823	}
10824
10825	/*
10826	* For simplicity, we always resize the formats array to be exactly the
10827	* number of formats.
10828	*/
10829	ndx = state->dts_nformats++;
10830	new = kmem_alloc((ndx + `1`) * sizeof (*state->dts_formats), KM_SLEEP);
10831
10832	if (state->dts_formats != NULL) {
10833	ASSERT(ndx != `0`);
10834	bcopy(src: state->dts_formats, dst: new, n: ndx * sizeof (*state->dts_formats));
10835	kmem_free(state->dts_formats, ndx * sizeof (*state->dts_formats));
10836	}
10837
10838	state->dts_formats = new;
10839	state->dts_formats[ndx] = dtrace_format_new(str);
10840
10841	return (ndx + `1`);
10842	}
10843
10844	static void
10845	dtrace_format_remove(dtrace_state_t *state, uint16_t format)
10846	{
10847	dtrace_format_t *fmt;
10848
10849	ASSERT(state->dts_formats != NULL);
10850	ASSERT(format <= state->dts_nformats);
10851
10852	fmt = state->dts_formats[format - `1`];
10853
10854	ASSERT(fmt != NULL);
10855	VERIFY(fmt->dtf_refcount > `0`);
10856
10857	fmt->dtf_refcount--;
10858
10859	if (fmt->dtf_refcount == `0`) {
10860	kmem_free(fmt, DTRACE_FORMAT_SIZE(fmt));
10861	state->dts_formats[format - `1`] = NULL;
10862	}
10863	}
10864
10865	static void
10866	dtrace_format_destroy(dtrace_state_t *state)
10867	{
10868	int i;
10869
10870	if (state->dts_nformats == `0`) {
10871	ASSERT(state->dts_formats == NULL);
10872	return;
10873	}
10874
10875	ASSERT(state->dts_formats != NULL);
10876
10877	for (i = `0`; i < state->dts_nformats; i++) {
10878	dtrace_format_t *fmt = state->dts_formats[i];
10879
10880	if (fmt == NULL)
10881	continue;
10882
10883	kmem_free(fmt, DTRACE_FORMAT_SIZE(fmt));
10884	}
10885
10886	kmem_free(state->dts_formats, state->dts_nformats * sizeof (*state->dts_formats));
10887	state->dts_nformats = `0`;
10888	state->dts_formats = NULL;
10889	}
10890
10891	/*
10892	* DTrace Predicate Functions
10893	*/
10894	static dtrace_predicate_t *
10895	dtrace_predicate_create(dtrace_difo_t *dp)
10896	{
10897	dtrace_predicate_t *pred;
10898
10899	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10900	ASSERT(dp->dtdo_refcnt != `0`);
10901
10902	pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
10903	pred->dtp_difo = dp;
10904	pred->dtp_refcnt = `1`;
10905
10906	if (!dtrace_difo_cacheable(dp))
10907	return (pred);
10908
10909	if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
10910	/*
10911	* This is only theoretically possible -- we have had 2^32
10912	* cacheable predicates on this machine. We cannot allow any
10913	* more predicates to become cacheable: as unlikely as it is,
10914	* there may be a thread caching a (now stale) predicate cache
10915	* ID. (N.B.: the temptation is being successfully resisted to
10916	* have this cmn_err() "Holy shit -- we executed this code!")
10917	*/
10918	return (pred);
10919	}
10920
10921	pred->dtp_cacheid = dtrace_predcache_id++;
10922
10923	return (pred);
10924	}
10925
10926	static void
10927	dtrace_predicate_hold(dtrace_predicate_t *pred)
10928	{
10929	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10930	ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != `0`);
10931	ASSERT(pred->dtp_refcnt > `0`);
10932
10933	pred->dtp_refcnt++;
10934	}
10935
10936	static void
10937	dtrace_predicate_release(dtrace_predicate_t pred, dtrace_vstate_t vstate)
10938	{
10939	dtrace_difo_t *dp = pred->dtp_difo;
10940	#pragma unused(dp) /* __APPLE__ */
10941
10942	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10943	ASSERT(dp != NULL && dp->dtdo_refcnt != `0`);
10944	ASSERT(pred->dtp_refcnt > `0`);
10945
10946	if (--pred->dtp_refcnt == `0`) {
10947	dtrace_difo_release(dp: pred->dtp_difo, vstate);
10948	kmem_free(pred, sizeof (dtrace_predicate_t));
10949	}
10950	}
10951
10952	/*
10953	* DTrace Action Description Functions
10954	*/
10955	static dtrace_actdesc_t *
10956	dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
10957	uint64_t uarg, uint64_t arg)
10958	{
10959	dtrace_actdesc_t *act;
10960
10961	ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) \|\| (arg != `0` &&
10962	arg >= KERNELBASE) \|\| (arg == `0` && kind == DTRACEACT_PRINTA));
10963
10964	act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
10965	act->dtad_kind = kind;
10966	act->dtad_ntuple = ntuple;
10967	act->dtad_uarg = uarg;
10968	act->dtad_arg = arg;
10969	act->dtad_refcnt = `1`;
10970
10971	return (act);
10972	}
10973
10974	static void
10975	dtrace_actdesc_hold(dtrace_actdesc_t *act)
10976	{
10977	ASSERT(act->dtad_refcnt >= `1`);
10978	act->dtad_refcnt++;
10979	}
10980
10981	static void
10982	dtrace_actdesc_release(dtrace_actdesc_t act, dtrace_vstate_t vstate)
10983	{
10984	dtrace_actkind_t kind = act->dtad_kind;
10985	dtrace_difo_t *dp;
10986
10987	ASSERT(act->dtad_refcnt >= `1`);
10988
10989	if (--act->dtad_refcnt != `0`)
10990	return;
10991
10992	if ((dp = act->dtad_difo) != NULL)
10993	dtrace_difo_release(dp, vstate);
10994
10995	if (DTRACEACT_ISPRINTFLIKE(kind)) {
10996	char str = (char* *)(uintptr_t)act->dtad_arg;
10997
10998	ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) \|\|
10999	(str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
11000
11001	if (str != NULL)
11002	kmem_free(str, strlen(s: str) + `1`);
11003	}
11004
11005	kmem_free(act, sizeof (dtrace_actdesc_t));
11006	}
11007
11008	/*
11009	* DTrace ECB Functions
11010	*/
11011	static dtrace_ecb_t *
11012	dtrace_ecb_add(dtrace_state_t state, dtrace_probe_t probe)
11013	{
11014	dtrace_ecb_t *ecb;
11015	dtrace_epid_t epid;
11016
11017	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11018
11019	ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
11020	ecb->dte_predicate = NULL;
11021	ecb->dte_probe = probe;
11022
11023	/*
11024	* The default size is the size of the default action: recording
11025	* the header.
11026	*/
11027	ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
11028	ecb->dte_alignment = sizeof (dtrace_epid_t);
11029
11030	epid = state->dts_epid++;
11031
11032	if (epid - `1` >= (dtrace_epid_t)state->dts_necbs) {
11033	dtrace_ecb_t oecbs = state->dts_ecbs, ecbs;
11034	int necbs = state->dts_necbs << `1`;
11035
11036	ASSERT(epid == (dtrace_epid_t)state->dts_necbs + `1`);
11037
11038	if (necbs == `0`) {
11039	ASSERT(oecbs == NULL);
11040	necbs = `1`;
11041	}
11042
11043	ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
11044
11045	if (oecbs != NULL)
11046	bcopy(src: oecbs, dst: ecbs, n: state->dts_necbs * sizeof (*ecbs));
11047
11048	dtrace_membar_producer();
11049	state->dts_ecbs = ecbs;
11050
11051	if (oecbs != NULL) {
11052	/*
11053	* If this state is active, we must dtrace_sync()
11054	* before we can free the old dts_ecbs array: we're
11055	* coming in hot, and there may be active ring
11056	* buffer processing (which indexes into the dts_ecbs
11057	* array) on another CPU.
11058	*/
11059	if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
11060	dtrace_sync();
11061
11062	kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
11063	}
11064
11065	dtrace_membar_producer();
11066	state->dts_necbs = necbs;
11067	}
11068
11069	ecb->dte_state = state;
11070
11071	ASSERT(state->dts_ecbs[epid - `1`] == NULL);
11072	dtrace_membar_producer();
11073	state->dts_ecbs[(ecb->dte_epid = epid) - `1`] = ecb;
11074
11075	return (ecb);
11076	}
11077
11078	static int
11079	dtrace_ecb_enable(dtrace_ecb_t *ecb)
11080	{
11081	dtrace_probe_t *probe = ecb->dte_probe;
11082
11083	LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
11084	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11085	ASSERT(ecb->dte_next == NULL);
11086
11087	if (probe == NULL) {
11088	/*
11089	* This is the NULL probe -- there's nothing to do.
11090	*/
11091	return(`0`);
11092	}
11093
11094	probe->dtpr_provider->dtpv_ecb_count++;
11095	if (probe->dtpr_ecb == NULL) {
11096	dtrace_provider_t *prov = probe->dtpr_provider;
11097
11098	/*
11099	* We're the first ECB on this probe.
11100	*/
11101	probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
11102
11103	if (ecb->dte_predicate != NULL)
11104	probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
11105
11106	return (prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
11107	probe->dtpr_id, probe->dtpr_arg));
11108	} else {
11109	/*
11110	* This probe is already active. Swing the last pointer to
11111	* point to the new ECB, and issue a dtrace_sync() to assure
11112	* that all CPUs have seen the change.
11113	*/
11114	ASSERT(probe->dtpr_ecb_last != NULL);
11115	probe->dtpr_ecb_last->dte_next = ecb;
11116	probe->dtpr_ecb_last = ecb;
11117	probe->dtpr_predcache = `0`;
11118
11119	dtrace_sync();
11120	return(`0`);
11121	}
11122	}
11123
11124	static int
11125	dtrace_ecb_resize(dtrace_ecb_t *ecb)
11126	{
11127	dtrace_action_t *act;
11128	uint32_t curneeded = UINT32_MAX;
11129	uint32_t aggbase = UINT32_MAX;
11130
11131	/*
11132	* If we record anything, we always record the dtrace_rechdr_t. (And
11133	* we always record it first.)
11134	*/
11135	ecb->dte_size = sizeof (dtrace_rechdr_t);
11136	ecb->dte_alignment = sizeof (dtrace_epid_t);
11137
11138	for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
11139	dtrace_recdesc_t *rec = &act->dta_rec;
11140	ASSERT(rec->dtrd_size > `0` \|\| rec->dtrd_alignment == `1`);
11141
11142	ecb->dte_alignment = MAX(ecb->dte_alignment, rec->dtrd_alignment);
11143
11144	if (DTRACEACT_ISAGG(act->dta_kind)) {
11145	dtrace_aggregation_t agg = (dtrace_aggregation_t )act;
11146
11147	ASSERT(rec->dtrd_size != `0`);
11148	ASSERT(agg->dtag_first != NULL);
11149	ASSERT(act->dta_prev->dta_intuple);
11150	ASSERT(aggbase != UINT32_MAX);
11151	ASSERT(curneeded != UINT32_MAX);
11152
11153	agg->dtag_base = aggbase;
11154	curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
11155	rec->dtrd_offset = curneeded;
11156	if (curneeded + rec->dtrd_size < curneeded)
11157	return (EINVAL);
11158	curneeded += rec->dtrd_size;
11159	ecb->dte_needed = MAX(ecb->dte_needed, curneeded);
11160
11161	aggbase = UINT32_MAX;
11162	curneeded = UINT32_MAX;
11163	} else if (act->dta_intuple) {
11164	if (curneeded == UINT32_MAX) {
11165	/*
11166	* This is the first record in a tuple. Align
11167	* curneeded to be at offset 4 in an 8-byte
11168	* aligned block.
11169	*/
11170	ASSERT(act->dta_prev == NULL \|\| !act->dta_prev->dta_intuple);
11171	ASSERT(aggbase == UINT32_MAX);
11172
11173	curneeded = P2PHASEUP(ecb->dte_size,
11174	sizeof (uint64_t), sizeof (dtrace_aggid_t));
11175
11176	aggbase = curneeded - sizeof (dtrace_aggid_t);
11177	ASSERT(IS_P2ALIGNED(aggbase,
11178	sizeof (uint64_t)));
11179	}
11180
11181	curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
11182	rec->dtrd_offset = curneeded;
11183	curneeded += rec->dtrd_size;
11184	if (curneeded + rec->dtrd_size < curneeded)
11185	return (EINVAL);
11186	} else {
11187	/ tuples must be followed by an aggregation /
11188	ASSERT(act->dta_prev == NULL \|\| !act->dta_prev->dta_intuple);
11189	ecb->dte_size = P2ROUNDUP(ecb->dte_size, rec->dtrd_alignment);
11190	rec->dtrd_offset = ecb->dte_size;
11191	if (ecb->dte_size + rec->dtrd_size < ecb->dte_size)
11192	return (EINVAL);
11193	ecb->dte_size += rec->dtrd_size;
11194	ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
11195	}
11196	}
11197
11198	if ((act = ecb->dte_action) != NULL &&
11199	!(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
11200	ecb->dte_size == sizeof (dtrace_rechdr_t)) {
11201	/*
11202	* If the size is still sizeof (dtrace_rechdr_t), then all
11203	* actions store no data; set the size to 0.
11204	*/
11205	ecb->dte_size = `0`;
11206	}
11207
11208	ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
11209	ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
11210	ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed, ecb->dte_needed);
11211	return (`0`);
11212	}
11213
11214	static dtrace_action_t *
11215	dtrace_ecb_aggregation_create(dtrace_ecb_t ecb, dtrace_actdesc_t desc)
11216	{
11217	dtrace_aggregation_t *agg;
11218	size_t size = sizeof (uint64_t);
11219	int ntuple = desc->dtad_ntuple;
11220	dtrace_action_t *act;
11221	dtrace_recdesc_t *frec;
11222	dtrace_aggid_t aggid;
11223	dtrace_state_t *state = ecb->dte_state;
11224
11225	agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
11226	agg->dtag_ecb = ecb;
11227
11228	ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
11229
11230	switch (desc->dtad_kind) {
11231	case DTRACEAGG_MIN:
11232	agg->dtag_initial = INT64_MAX;
11233	agg->dtag_aggregate = dtrace_aggregate_min;
11234	break;
11235
11236	case DTRACEAGG_MAX:
11237	agg->dtag_initial = INT64_MIN;
11238	agg->dtag_aggregate = dtrace_aggregate_max;
11239	break;
11240
11241	case DTRACEAGG_COUNT:
11242	agg->dtag_aggregate = dtrace_aggregate_count;
11243	break;
11244
11245	case DTRACEAGG_QUANTIZE:
11246	agg->dtag_aggregate = dtrace_aggregate_quantize;
11247	size = (((sizeof (uint64_t) * NBBY) - `1`) * `2` + `1`) *
11248	sizeof (uint64_t);
11249	break;
11250
11251	case DTRACEAGG_LQUANTIZE: {
11252	uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
11253	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
11254
11255	agg->dtag_initial = desc->dtad_arg;
11256	agg->dtag_aggregate = dtrace_aggregate_lquantize;
11257
11258	if (step == `0` \|\| levels == `0`)
11259	goto err;
11260
11261	size = levels * sizeof (uint64_t) + `3` * sizeof (uint64_t);
11262	break;
11263	}
11264
11265	case DTRACEAGG_LLQUANTIZE: {
11266	uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
11267	uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
11268	uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
11269	uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
11270	int64_t v;
11271
11272	agg->dtag_initial = desc->dtad_arg;
11273	agg->dtag_aggregate = dtrace_aggregate_llquantize;
11274
11275	if (factor < `2` \|\| low >= high \|\| nsteps < factor)
11276	goto err;
11277
11278	/*
11279	* Now check that the number of steps evenly divides a power
11280	* of the factor. (This assures both integer bucket size and
11281	* linearity within each magnitude.)
11282	*/
11283	for (v = factor; v < nsteps; v *= factor)
11284	continue;
11285
11286	if ((v % nsteps) \|\| (nsteps % factor))
11287	goto err;
11288
11289	size = (dtrace_aggregate_llquantize_bucket(factor, low, high, nsteps, INT64_MAX) + `2`) * sizeof (uint64_t);
11290	break;
11291	}
11292
11293	case DTRACEAGG_AVG:
11294	agg->dtag_aggregate = dtrace_aggregate_avg;
11295	size = sizeof (uint64_t) * `2`;
11296	break;
11297
11298	case DTRACEAGG_STDDEV:
11299	agg->dtag_aggregate = dtrace_aggregate_stddev;
11300	size = sizeof (uint64_t) * `4`;
11301	break;
11302
11303	case DTRACEAGG_SUM:
11304	agg->dtag_aggregate = dtrace_aggregate_sum;
11305	break;
11306
11307	default:
11308	goto err;
11309	}
11310
11311	agg->dtag_action.dta_rec.dtrd_size = size;
11312
11313	if (ntuple == `0`)
11314	goto err;
11315
11316	/*
11317	* We must make sure that we have enough actions for the n-tuple.
11318	*/
11319	for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
11320	if (DTRACEACT_ISAGG(act->dta_kind))
11321	break;
11322
11323	if (--ntuple == `0`) {
11324	/*
11325	* This is the action with which our n-tuple begins.
11326	*/
11327	agg->dtag_first = act;
11328	goto success;
11329	}
11330	}
11331
11332	/*
11333	* This n-tuple is short by ntuple elements. Return failure.
11334	*/
11335	ASSERT(ntuple != `0`);
11336	err:
11337	kmem_free(agg, sizeof (dtrace_aggregation_t));
11338	return (NULL);
11339
11340	success:
11341	/*
11342	* If the last action in the tuple has a size of zero, it's actually
11343	* an expression argument for the aggregating action.
11344	*/
11345	ASSERT(ecb->dte_action_last != NULL);
11346	act = ecb->dte_action_last;
11347
11348	if (act->dta_kind == DTRACEACT_DIFEXPR) {
11349	ASSERT(act->dta_difo != NULL);
11350
11351	if (act->dta_difo->dtdo_rtype.dtdt_size == `0`)
11352	agg->dtag_hasarg = `1`;
11353	}
11354
11355	/*
11356	* We need to allocate an id for this aggregation.
11357	*/
11358	aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, `1`,
11359	VM_BESTFIT \| VM_SLEEP);
11360
11361	if (aggid - `1` >= (dtrace_aggid_t)state->dts_naggregations) {
11362	dtrace_aggregation_t **oaggs = state->dts_aggregations;
11363	dtrace_aggregation_t **aggs;
11364	int naggs = state->dts_naggregations << `1`;
11365	int onaggs = state->dts_naggregations;
11366
11367	ASSERT(aggid == (dtrace_aggid_t)state->dts_naggregations + `1`);
11368
11369	if (naggs == `0`) {
11370	ASSERT(oaggs == NULL);
11371	naggs = `1`;
11372	}
11373
11374	aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
11375
11376	if (oaggs != NULL) {
11377	bcopy(src: oaggs, dst: aggs, n: onaggs * sizeof (*aggs));
11378	kmem_free(oaggs, onaggs * sizeof (*aggs));
11379	}
11380
11381	state->dts_aggregations = aggs;
11382	state->dts_naggregations = naggs;
11383	}
11384
11385	ASSERT(state->dts_aggregations[aggid - `1`] == NULL);
11386	state->dts_aggregations[(agg->dtag_id = aggid) - `1`] = agg;
11387
11388	frec = &agg->dtag_first->dta_rec;
11389	if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
11390	frec->dtrd_alignment = sizeof (dtrace_aggid_t);
11391
11392	for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
11393	ASSERT(!act->dta_intuple);
11394	act->dta_intuple = `1`;
11395	}
11396
11397	return (&agg->dtag_action);
11398	}
11399
11400	static void
11401	dtrace_ecb_aggregation_destroy(dtrace_ecb_t ecb, dtrace_action_t act)
11402	{
11403	dtrace_aggregation_t agg = (dtrace_aggregation_t )act;
11404	dtrace_state_t *state = ecb->dte_state;
11405	dtrace_aggid_t aggid = agg->dtag_id;
11406
11407	ASSERT(DTRACEACT_ISAGG(act->dta_kind));
11408	vmem_free(vmp: state->dts_aggid_arena, vaddr: (void *)(uintptr_t)aggid, size: `1`);
11409
11410	ASSERT(state->dts_aggregations[aggid - `1`] == agg);
11411	state->dts_aggregations[aggid - `1`] = NULL;
11412
11413	kmem_free(agg, sizeof (dtrace_aggregation_t));
11414	}
11415
11416	static int
11417	dtrace_ecb_action_add(dtrace_ecb_t ecb, dtrace_actdesc_t desc)
11418	{
11419	dtrace_action_t action, last;
11420	dtrace_difo_t *dp = desc->dtad_difo;
11421	uint32_t size = `0`, align = sizeof (uint8_t), mask;
11422	uint16_t format = `0`;
11423	dtrace_recdesc_t *rec;
11424	dtrace_state_t *state = ecb->dte_state;
11425	dtrace_optval_t *opt = state->dts_options;
11426	dtrace_optval_t nframes=`0`, strsize;
11427	uint64_t arg = desc->dtad_arg;
11428
11429	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11430	ASSERT(ecb->dte_action == NULL \|\| ecb->dte_action->dta_refcnt == `1`);
11431
11432	if (DTRACEACT_ISAGG(desc->dtad_kind)) {
11433	/*
11434	* If this is an aggregating action, there must be neither
11435	* a speculate nor a commit on the action chain.
11436	*/
11437	dtrace_action_t *act;
11438
11439	for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
11440	if (act->dta_kind == DTRACEACT_COMMIT)
11441	return (EINVAL);
11442
11443	if (act->dta_kind == DTRACEACT_SPECULATE)
11444	return (EINVAL);
11445	}
11446
11447	action = dtrace_ecb_aggregation_create(ecb, desc);
11448
11449	if (action == NULL)
11450	return (EINVAL);
11451	} else {
11452	if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) \|\|
11453	(desc->dtad_kind == DTRACEACT_DIFEXPR &&
11454	dp != NULL && dp->dtdo_destructive)) {
11455	state->dts_destructive = `1`;
11456	}
11457
11458	switch (desc->dtad_kind) {
11459	case DTRACEACT_PRINTF:
11460	case DTRACEACT_PRINTA:
11461	case DTRACEACT_SYSTEM:
11462	case DTRACEACT_FREOPEN:
11463	case DTRACEACT_DIFEXPR:
11464	/*
11465	* We know that our arg is a string -- turn it into a
11466	* format.
11467	*/
11468	if (arg == `0`) {
11469	ASSERT(desc->dtad_kind == DTRACEACT_PRINTA \|\|
11470	desc->dtad_kind == DTRACEACT_DIFEXPR);
11471	format = `0`;
11472	} else {
11473	ASSERT(arg != `0`);
11474	ASSERT(arg > KERNELBASE);
11475	format = dtrace_format_add(state,
11476	str: (char *)(uintptr_t)arg);
11477	}
11478
11479	OS_FALLTHROUGH;
11480	case DTRACEACT_LIBACT:
11481	case DTRACEACT_TRACEMEM:
11482	case DTRACEACT_TRACEMEM_DYNSIZE:
11483	case DTRACEACT_APPLEBINARY: / __APPLE__ /
11484	if (dp == NULL)
11485	return (EINVAL);
11486
11487	if ((size = dp->dtdo_rtype.dtdt_size) != `0`)
11488	break;
11489
11490	if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
11491	if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11492	return (EINVAL);
11493
11494	size = opt[DTRACEOPT_STRSIZE];
11495	}
11496
11497	break;
11498
11499	case DTRACEACT_STACK:
11500	if ((nframes = arg) == `0`) {
11501	nframes = opt[DTRACEOPT_STACKFRAMES];
11502	ASSERT(nframes > `0`);
11503	arg = nframes;
11504	}
11505
11506	size = nframes * sizeof (pc_t);
11507	break;
11508
11509	case DTRACEACT_JSTACK:
11510	if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == `0`)
11511	strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
11512
11513	if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == `0`)
11514	nframes = opt[DTRACEOPT_JSTACKFRAMES];
11515
11516	arg = DTRACE_USTACK_ARG(nframes, strsize);
11517
11518	OS_FALLTHROUGH;
11519	case DTRACEACT_USTACK:
11520	if (desc->dtad_kind != DTRACEACT_JSTACK &&
11521	(nframes = DTRACE_USTACK_NFRAMES(arg)) == `0`) {
11522	strsize = DTRACE_USTACK_STRSIZE(arg);
11523	nframes = opt[DTRACEOPT_USTACKFRAMES];
11524	ASSERT(nframes > `0`);
11525	arg = DTRACE_USTACK_ARG(nframes, strsize);
11526	}
11527
11528	/*
11529	* Save a slot for the pid.
11530	*/
11531	size = (nframes + `1`) * sizeof (uint64_t);
11532	size += DTRACE_USTACK_STRSIZE(arg);
11533	size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
11534
11535	break;
11536
11537	case DTRACEACT_SYM:
11538	case DTRACEACT_MOD:
11539	if (dp == NULL \|\| ((size = dp->dtdo_rtype.dtdt_size) !=
11540	sizeof (uint64_t)) \|\|
11541	(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11542	return (EINVAL);
11543	break;
11544
11545	case DTRACEACT_USYM:
11546	case DTRACEACT_UMOD:
11547	case DTRACEACT_UADDR:
11548	if (dp == NULL \|\|
11549	(dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) \|\|
11550	(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11551	return (EINVAL);
11552
11553	/*
11554	* We have a slot for the pid, plus a slot for the
11555	* argument. To keep things simple (aligned with
11556	* bitness-neutral sizing), we store each as a 64-bit
11557	* quantity.
11558	*/
11559	size = `2` * sizeof (uint64_t);
11560	break;
11561
11562	case DTRACEACT_STOP:
11563	case DTRACEACT_BREAKPOINT:
11564	case DTRACEACT_PANIC:
11565	break;
11566
11567	case DTRACEACT_CHILL:
11568	case DTRACEACT_DISCARD:
11569	case DTRACEACT_RAISE:
11570	case DTRACEACT_PIDRESUME: / __APPLE__ /
11571	if (dp == NULL)
11572	return (EINVAL);
11573	break;
11574
11575	case DTRACEACT_EXIT:
11576	if (dp == NULL \|\|
11577	(size = dp->dtdo_rtype.dtdt_size) != sizeof (int) \|\|
11578	(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11579	return (EINVAL);
11580	break;
11581
11582	case DTRACEACT_SPECULATE:
11583	if (ecb->dte_size > sizeof (dtrace_rechdr_t))
11584	return (EINVAL);
11585
11586	if (dp == NULL)
11587	return (EINVAL);
11588
11589	state->dts_speculates = `1`;
11590	break;
11591
11592	case DTRACEACT_COMMIT: {
11593	dtrace_action_t *act = ecb->dte_action;
11594
11595	for (; act != NULL; act = act->dta_next) {
11596	if (act->dta_kind == DTRACEACT_COMMIT)
11597	return (EINVAL);
11598	}
11599
11600	if (dp == NULL)
11601	return (EINVAL);
11602	break;
11603	}
11604
11605	default:
11606	return (EINVAL);
11607	}
11608
11609	if (size != `0` \|\| desc->dtad_kind == DTRACEACT_SPECULATE) {
11610	/*
11611	* If this is a data-storing action or a speculate,
11612	* we must be sure that there isn't a commit on the
11613	* action chain.
11614	*/
11615	dtrace_action_t *act = ecb->dte_action;
11616
11617	for (; act != NULL; act = act->dta_next) {
11618	if (act->dta_kind == DTRACEACT_COMMIT)
11619	return (EINVAL);
11620	}
11621	}
11622
11623	action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
11624	action->dta_rec.dtrd_size = size;
11625	}
11626
11627	action->dta_refcnt = `1`;
11628	rec = &action->dta_rec;
11629	size = rec->dtrd_size;
11630
11631	for (mask = sizeof (uint64_t) - `1`; size != `0` && mask > `0`; mask >>= `1`) {
11632	if (!(size & mask)) {
11633	align = mask + `1`;
11634	break;
11635	}
11636	}
11637
11638	action->dta_kind = desc->dtad_kind;
11639
11640	if ((action->dta_difo = dp) != NULL)
11641	dtrace_difo_hold(dp);
11642
11643	rec->dtrd_action = action->dta_kind;
11644	rec->dtrd_arg = arg;
11645	rec->dtrd_uarg = desc->dtad_uarg;
11646	rec->dtrd_alignment = (uint16_t)align;
11647	rec->dtrd_format = format;
11648
11649	if ((last = ecb->dte_action_last) != NULL) {
11650	ASSERT(ecb->dte_action != NULL);
11651	action->dta_prev = last;
11652	last->dta_next = action;
11653	} else {
11654	ASSERT(ecb->dte_action == NULL);
11655	ecb->dte_action = action;
11656	}
11657
11658	ecb->dte_action_last = action;
11659
11660	return (`0`);
11661	}
11662
11663	static void
11664	dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
11665	{
11666	dtrace_action_t act = ecb->dte_action, next;
11667	dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
11668	dtrace_difo_t *dp;
11669	uint16_t format;
11670
11671	if (act != NULL && act->dta_refcnt > `1`) {
11672	ASSERT(act->dta_next == NULL \|\| act->dta_next->dta_refcnt == `1`);
11673	act->dta_refcnt--;
11674	} else {
11675	for (; act != NULL; act = next) {
11676	next = act->dta_next;
11677	ASSERT(next != NULL \|\| act == ecb->dte_action_last);
11678	ASSERT(act->dta_refcnt == `1`);
11679
11680	if ((format = act->dta_rec.dtrd_format) != `0`)
11681	dtrace_format_remove(state: ecb->dte_state, format);
11682
11683	if ((dp = act->dta_difo) != NULL)
11684	dtrace_difo_release(dp, vstate);
11685
11686	if (DTRACEACT_ISAGG(act->dta_kind)) {
11687	dtrace_ecb_aggregation_destroy(ecb, act);
11688	} else {
11689	kmem_free(act, sizeof (dtrace_action_t));
11690	}
11691	}
11692	}
11693
11694	ecb->dte_action = NULL;
11695	ecb->dte_action_last = NULL;
11696	ecb->dte_size = `0`;
11697	}
11698
11699	static void
11700	dtrace_ecb_disable(dtrace_ecb_t *ecb)
11701	{
11702	/*
11703	* We disable the ECB by removing it from its probe.
11704	*/
11705	dtrace_ecb_t pecb, prev = NULL;
11706	dtrace_probe_t *probe = ecb->dte_probe;
11707
11708	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11709
11710	if (probe == NULL) {
11711	/*
11712	* This is the NULL probe; there is nothing to disable.
11713	*/
11714	return;
11715	}
11716
11717	for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
11718	if (pecb == ecb)
11719	break;
11720	prev = pecb;
11721	}
11722
11723	ASSERT(pecb != NULL);
11724
11725	if (prev == NULL) {
11726	probe->dtpr_ecb = ecb->dte_next;
11727	} else {
11728	prev->dte_next = ecb->dte_next;
11729	}
11730
11731	if (ecb == probe->dtpr_ecb_last) {
11732	ASSERT(ecb->dte_next == NULL);
11733	probe->dtpr_ecb_last = prev;
11734	}
11735
11736	probe->dtpr_provider->dtpv_ecb_count--;
11737	/*
11738	* The ECB has been disconnected from the probe; now sync to assure
11739	* that all CPUs have seen the change before returning.
11740	*/
11741	dtrace_sync();
11742
11743	if (probe->dtpr_ecb == NULL) {
11744	/*
11745	* That was the last ECB on the probe; clear the predicate
11746	* cache ID for the probe, disable it and sync one more time
11747	* to assure that we'll never hit it again.
11748	*/
11749	dtrace_provider_t *prov = probe->dtpr_provider;
11750
11751	ASSERT(ecb->dte_next == NULL);
11752	ASSERT(probe->dtpr_ecb_last == NULL);
11753	probe->dtpr_predcache = DTRACE_CACHEIDNONE;
11754	prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
11755	probe->dtpr_id, probe->dtpr_arg);
11756	dtrace_sync();
11757	} else {
11758	/*
11759	* There is at least one ECB remaining on the probe. If there
11760	* is _exactly_ one, set the probe's predicate cache ID to be
11761	* the predicate cache ID of the remaining ECB.
11762	*/
11763	ASSERT(probe->dtpr_ecb_last != NULL);
11764	ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
11765
11766	if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
11767	dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
11768
11769	ASSERT(probe->dtpr_ecb->dte_next == NULL);
11770
11771	if (p != NULL)
11772	probe->dtpr_predcache = p->dtp_cacheid;
11773	}
11774
11775	ecb->dte_next = NULL;
11776	}
11777	}
11778
11779	static void
11780	dtrace_ecb_destroy(dtrace_ecb_t *ecb)
11781	{
11782	dtrace_state_t *state = ecb->dte_state;
11783	dtrace_vstate_t *vstate = &state->dts_vstate;
11784	dtrace_predicate_t *pred;
11785	dtrace_epid_t epid = ecb->dte_epid;
11786
11787	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11788	ASSERT(ecb->dte_next == NULL);
11789	ASSERT(ecb->dte_probe == NULL \|\| ecb->dte_probe->dtpr_ecb != ecb);
11790
11791	if ((pred = ecb->dte_predicate) != NULL)
11792	dtrace_predicate_release(pred, vstate);
11793
11794	dtrace_ecb_action_remove(ecb);
11795
11796	ASSERT(state->dts_ecbs[epid - `1`] == ecb);
11797	state->dts_ecbs[epid - `1`] = NULL;
11798
11799	kmem_free(ecb, sizeof (dtrace_ecb_t));
11800	}
11801
11802	static dtrace_ecb_t *
11803	dtrace_ecb_create(dtrace_state_t state, dtrace_probe_t probe,
11804	dtrace_enabling_t *enab)
11805	{
11806	dtrace_ecb_t *ecb;
11807	dtrace_predicate_t *pred;
11808	dtrace_actdesc_t *act;
11809	dtrace_provider_t *prov;
11810	dtrace_ecbdesc_t *desc = enab->dten_current;
11811
11812	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11813	ASSERT(state != NULL);
11814
11815	ecb = dtrace_ecb_add(state, probe);
11816	ecb->dte_uarg = desc->dted_uarg;
11817
11818	if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
11819	dtrace_predicate_hold(pred);
11820	ecb->dte_predicate = pred;
11821	}
11822
11823	if (probe != NULL) {
11824	/*
11825	* If the provider shows more leg than the consumer is old
11826	* enough to see, we need to enable the appropriate implicit
11827	* predicate bits to prevent the ecb from activating at
11828	* revealing times.
11829	*
11830	* Providers specifying DTRACE_PRIV_USER at register time
11831	* are stating that they need the /proc-style privilege
11832	* model to be enforced, and this is what DTRACE_COND_OWNER
11833	* and DTRACE_COND_ZONEOWNER will then do at probe time.
11834	*/
11835	prov = probe->dtpr_provider;
11836	if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
11837	(prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11838	ecb->dte_cond \|= DTRACE_COND_OWNER;
11839
11840	if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
11841	(prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11842	ecb->dte_cond \|= DTRACE_COND_ZONEOWNER;
11843
11844	/*
11845	* If the provider shows us kernel innards and the user
11846	* is lacking sufficient privilege, enable the
11847	* DTRACE_COND_USERMODE implicit predicate.
11848	*/
11849	if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
11850	(prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
11851	ecb->dte_cond \|= DTRACE_COND_USERMODE;
11852	}
11853
11854	if (dtrace_ecb_create_cache != NULL) {
11855	/*
11856	* If we have a cached ecb, we'll use its action list instead
11857	* of creating our own (saving both time and space).
11858	*/
11859	dtrace_ecb_t *cached = dtrace_ecb_create_cache;
11860	dtrace_action_t *act_if = cached->dte_action;
11861
11862	if (act_if != NULL) {
11863	ASSERT(act_if->dta_refcnt > `0`);
11864	act_if->dta_refcnt++;
11865	ecb->dte_action = act_if;
11866	ecb->dte_action_last = cached->dte_action_last;
11867	ecb->dte_needed = cached->dte_needed;
11868	ecb->dte_size = cached->dte_size;
11869	ecb->dte_alignment = cached->dte_alignment;
11870	}
11871
11872	return (ecb);
11873	}
11874
11875	for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
11876	if ((enab->dten_error = dtrace_ecb_action_add(ecb, desc: act)) != `0`) {
11877	dtrace_ecb_destroy(ecb);
11878	return (NULL);
11879	}
11880	}
11881
11882	if ((enab->dten_error = dtrace_ecb_resize(ecb)) != `0`) {
11883	dtrace_ecb_destroy(ecb);
11884	return (NULL);
11885	}
11886
11887	return (dtrace_ecb_create_cache = ecb);
11888	}
11889
11890	static int
11891	dtrace_ecb_create_enable(dtrace_probe_t probe, void* arg1, void* *arg2)
11892	{
11893	dtrace_ecb_t *ecb;
11894	dtrace_enabling_t *enab = arg1;
11895	dtrace_ecbdesc_t *ep = arg2;
11896	dtrace_state_t *state = enab->dten_vstate->dtvs_state;
11897
11898	ASSERT(state != NULL);
11899
11900	if (probe != NULL && ep != NULL && probe->dtpr_gen < ep->dted_probegen) {
11901	/*
11902	* This probe was created in a generation for which this
11903	* enabling has previously created ECBs; we don't want to
11904	* enable it again, so just kick out.
11905	*/
11906	return (DTRACE_MATCH_NEXT);
11907	}
11908
11909	if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
11910	return (DTRACE_MATCH_DONE);
11911
11912	if (dtrace_ecb_enable(ecb) < `0`)
11913	return (DTRACE_MATCH_FAIL);
11914
11915	return (DTRACE_MATCH_NEXT);
11916	}
11917
11918	static dtrace_ecb_t *
11919	dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
11920	{
11921	dtrace_ecb_t *ecb;
11922	#pragma unused(ecb) /* __APPLE__ */
11923
11924	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11925
11926	if (id == `0` \|\| id > (dtrace_epid_t)state->dts_necbs)
11927	return (NULL);
11928
11929	ASSERT(state->dts_necbs > `0` && state->dts_ecbs != NULL);
11930	ASSERT((ecb = state->dts_ecbs[id - `1`]) == NULL \|\| ecb->dte_epid == id);
11931
11932	return (state->dts_ecbs[id - `1`]);
11933	}
11934
11935	static dtrace_aggregation_t *
11936	dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
11937	{
11938	dtrace_aggregation_t *agg;
11939	#pragma unused(agg) /* __APPLE__ */
11940
11941	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11942
11943	if (id == `0` \|\| id > (dtrace_aggid_t)state->dts_naggregations)
11944	return (NULL);
11945
11946	ASSERT(state->dts_naggregations > `0` && state->dts_aggregations != NULL);
11947	ASSERT((agg = state->dts_aggregations[id - `1`]) == NULL \|\|
11948	agg->dtag_id == id);
11949
11950	return (state->dts_aggregations[id - `1`]);
11951	}
11952
11953	/*
11954	* DTrace Buffer Functions
11955	*
11956	* The following functions manipulate DTrace buffers. Most of these functions
11957	* are called in the context of establishing or processing consumer state;
11958	* exceptions are explicitly noted.
11959	*/
11960
11961	/*
11962	* Note: called from cross call context. This function switches the two
11963	* buffers on a given CPU. The atomicity of this operation is assured by
11964	* disabling interrupts while the actual switch takes place; the disabling of
11965	* interrupts serializes the execution with any execution of dtrace_probe() on
11966	* the same CPU.
11967	*/
11968	static void
11969	dtrace_buffer_switch(dtrace_buffer_t *buf)
11970	{
11971	caddr_t tomax = buf->dtb_tomax;
11972	caddr_t xamot = buf->dtb_xamot;
11973	dtrace_icookie_t cookie;
11974	hrtime_t now;
11975
11976	ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
11977	ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
11978
11979	cookie = dtrace_interrupt_disable();
11980	now = dtrace_gethrtime();
11981	buf->dtb_tomax = xamot;
11982	buf->dtb_xamot = tomax;
11983	buf->dtb_xamot_drops = buf->dtb_drops;
11984	buf->dtb_xamot_offset = buf->dtb_offset;
11985	buf->dtb_xamot_errors = buf->dtb_errors;
11986	buf->dtb_xamot_flags = buf->dtb_flags;
11987	buf->dtb_offset = `0`;
11988	buf->dtb_drops = `0`;
11989	buf->dtb_errors = `0`;
11990	buf->dtb_flags &= ~(DTRACEBUF_ERROR \| DTRACEBUF_DROPPED);
11991	buf->dtb_interval = now - buf->dtb_switched;
11992	buf->dtb_switched = now;
11993	buf->dtb_cur_limit = buf->dtb_limit;
11994
11995	dtrace_interrupt_enable(cookie);
11996	}
11997
11998	/*
11999	* Note: called from cross call context. This function activates a buffer
12000	* on a CPU. As with dtrace_buffer_switch(), the atomicity of the operation
12001	* is guaranteed by the disabling of interrupts.
12002	*/
12003	static void
12004	dtrace_buffer_activate(dtrace_state_t *state)
12005	{
12006	dtrace_buffer_t *buf;
12007	dtrace_icookie_t cookie = dtrace_interrupt_disable();
12008
12009	buf = &state->dts_buffer[CPU->cpu_id];
12010
12011	if (buf->dtb_tomax != NULL) {
12012	/*
12013	* We might like to assert that the buffer is marked inactive,
12014	* but this isn't necessarily true: the buffer for the CPU
12015	* that processes the BEGIN probe has its buffer activated
12016	* manually. In this case, we take the (harmless) action
12017	* re-clearing the bit INACTIVE bit.
12018	*/
12019	buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
12020	}
12021
12022	dtrace_interrupt_enable(cookie);
12023	}
12024
12025	static int
12026	dtrace_buffer_canalloc(size_t size)
12027	{
12028	if (size > (UINT64_MAX - dtrace_buffer_memory_inuse))
12029	return (B_FALSE);
12030	if ((size + dtrace_buffer_memory_inuse) > dtrace_buffer_memory_maxsize)
12031	return (B_FALSE);
12032
12033	return (B_TRUE);
12034	}
12035
12036	static int
12037	dtrace_buffer_alloc(dtrace_buffer_t bufs, size_t limit, size_t size, int* flags,
12038	processorid_t cpu)
12039	{
12040	dtrace_cpu_t *cp;
12041	dtrace_buffer_t *buf;
12042	size_t size_before_alloc = dtrace_buffer_memory_inuse;
12043
12044	LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
12045	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12046
12047	if (size > (size_t)dtrace_nonroot_maxsize &&
12048	!PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
12049	return (EFBIG);
12050
12051	cp = cpu_list;
12052
12053	do {
12054	if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
12055	continue;
12056
12057	buf = &bufs[cp->cpu_id];
12058
12059	/*
12060	* If there is already a buffer allocated for this CPU, it
12061	* is only possible that this is a DR event. In this case,
12062	* the buffer size must match our specified size.
12063	*/
12064	if (buf->dtb_tomax != NULL) {
12065	ASSERT(buf->dtb_size == size);
12066	continue;
12067	}
12068
12069	ASSERT(buf->dtb_xamot == NULL);
12070
12071	/ DTrace, please do not eat all the memory. /
12072	if (dtrace_buffer_canalloc(size) == B_FALSE)
12073	goto err;
12074	if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
12075	goto err;
12076	dtrace_buffer_memory_inuse += size;
12077
12078	/ Unsure that limit is always lower than size /
12079	limit = limit == size ? limit - `1` : limit;
12080	buf->dtb_cur_limit = limit;
12081	buf->dtb_limit = limit;
12082	buf->dtb_size = size;
12083	buf->dtb_flags = flags;
12084	buf->dtb_offset = `0`;
12085	buf->dtb_drops = `0`;
12086
12087	if (flags & DTRACEBUF_NOSWITCH)
12088	continue;
12089
12090	/ DTrace, please do not eat all the memory. /
12091	if (dtrace_buffer_canalloc(size) == B_FALSE)
12092	goto err;
12093	if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
12094	goto err;
12095	dtrace_buffer_memory_inuse += size;
12096	} while ((cp = cp->cpu_next) != cpu_list);
12097
12098	ASSERT(dtrace_buffer_memory_inuse <= dtrace_buffer_memory_maxsize);
12099
12100	return (`0`);
12101
12102	err:
12103	cp = cpu_list;
12104
12105	do {
12106	if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
12107	continue;
12108
12109	buf = &bufs[cp->cpu_id];
12110
12111	if (buf->dtb_xamot != NULL) {
12112	ASSERT(buf->dtb_tomax != NULL);
12113	ASSERT(buf->dtb_size == size);
12114	kmem_free(buf->dtb_xamot, size);
12115	}
12116
12117	if (buf->dtb_tomax != NULL) {
12118	ASSERT(buf->dtb_size == size);
12119	kmem_free(buf->dtb_tomax, size);
12120	}
12121
12122	buf->dtb_tomax = NULL;
12123	buf->dtb_xamot = NULL;
12124	buf->dtb_size = `0`;
12125	} while ((cp = cp->cpu_next) != cpu_list);
12126
12127	/ Restore the size saved before allocating memory /
12128	dtrace_buffer_memory_inuse = size_before_alloc;
12129
12130	return (ENOMEM);
12131	}
12132
12133	/*
12134	* Note: called from probe context. This function just increments the drop
12135	* count on a buffer. It has been made a function to allow for the
12136	* possibility of understanding the source of mysterious drop counts. (A
12137	* problem for which one may be particularly disappointed that DTrace cannot
12138	* be used to understand DTrace.)
12139	*/
12140	static void
12141	dtrace_buffer_drop(dtrace_buffer_t *buf)
12142	{
12143	buf->dtb_drops++;
12144	}
12145
12146	/*
12147	* Note: called from probe context. This function is called to reserve space
12148	* in a buffer. If mstate is non-NULL, sets the scratch base and size in the
12149	* mstate. Returns the new offset in the buffer, or a negative value if an
12150	* error has occurred.
12151	*/
12152	static intptr_t
12153	dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
12154	dtrace_state_t state, dtrace_mstate_t mstate)
12155	{
12156	intptr_t offs = buf->dtb_offset, soffs;
12157	intptr_t woffs;
12158	caddr_t tomax;
12159	size_t total_off;
12160
12161	if (buf->dtb_flags & DTRACEBUF_INACTIVE)
12162	return (-`1`);
12163
12164	if ((tomax = buf->dtb_tomax) == NULL) {
12165	dtrace_buffer_drop(buf);
12166	return (-`1`);
12167	}
12168
12169	if (!(buf->dtb_flags & (DTRACEBUF_RING \| DTRACEBUF_FILL))) {
12170	while (offs & (align - `1`)) {
12171	/*
12172	* Assert that our alignment is off by a number which
12173	* is itself sizeof (uint32_t) aligned.
12174	*/
12175	ASSERT(!((align - (offs & (align - `1`))) &
12176	(sizeof (uint32_t) - `1`)));
12177	DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
12178	offs += sizeof (uint32_t);
12179	}
12180
12181	if ((uint64_t)(soffs = offs + needed) > buf->dtb_cur_limit) {
12182	if (buf->dtb_cur_limit == buf->dtb_limit) {
12183	buf->dtb_cur_limit = buf->dtb_size;
12184
12185	os_atomic_inc(&state->dts_buf_over_limit, relaxed);
12186	/**
12187	* Set an AST on the current processor
12188	* so that we can wake up the process
12189	* outside of probe context, when we know
12190	* it is safe to do so
12191	*/
12192	minor_t minor = getminor(state->dts_dev);
12193	ASSERT(minor < `32`);
12194
12195	os_atomic_or(&dtrace_wake_clients, `1` << minor, relaxed);
12196	ast_dtrace_on();
12197	}
12198	if ((uint64_t)soffs > buf->dtb_size) {
12199	dtrace_buffer_drop(buf);
12200	return (-`1`);
12201	}
12202	}
12203
12204	if (mstate == NULL)
12205	return (offs);
12206
12207	mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
12208	mstate->dtms_scratch_size = buf->dtb_size - soffs;
12209	mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
12210
12211	return (offs);
12212	}
12213
12214	if (buf->dtb_flags & DTRACEBUF_FILL) {
12215	if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
12216	(buf->dtb_flags & DTRACEBUF_FULL))
12217	return (-`1`);
12218	goto out;
12219	}
12220
12221	total_off = needed + (offs & (align - `1`));
12222
12223	/*
12224	* For a ring buffer, life is quite a bit more complicated. Before
12225	* we can store any padding, we need to adjust our wrapping offset.
12226	* (If we've never before wrapped or we're not about to, no adjustment
12227	* is required.)
12228	*/
12229	if ((buf->dtb_flags & DTRACEBUF_WRAPPED) \|\|
12230	offs + total_off > buf->dtb_size) {
12231	woffs = buf->dtb_xamot_offset;
12232
12233	if (offs + total_off > buf->dtb_size) {
12234	/*
12235	* We can't fit in the end of the buffer. First, a
12236	* sanity check that we can fit in the buffer at all.
12237	*/
12238	if (total_off > buf->dtb_size) {
12239	dtrace_buffer_drop(buf);
12240	return (-`1`);
12241	}
12242
12243	/*
12244	* We're going to be storing at the top of the buffer,
12245	* so now we need to deal with the wrapped offset. We
12246	* only reset our wrapped offset to 0 if it is
12247	* currently greater than the current offset. If it
12248	* is less than the current offset, it is because a
12249	* previous allocation induced a wrap -- but the
12250	* allocation didn't subsequently take the space due
12251	* to an error or false predicate evaluation. In this
12252	* case, we'll just leave the wrapped offset alone: if
12253	* the wrapped offset hasn't been advanced far enough
12254	* for this allocation, it will be adjusted in the
12255	* lower loop.
12256	*/
12257	if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
12258	if (woffs >= offs)
12259	woffs = `0`;
12260	} else {
12261	woffs = `0`;
12262	}
12263
12264	/*
12265	* Now we know that we're going to be storing to the
12266	* top of the buffer and that there is room for us
12267	* there. We need to clear the buffer from the current
12268	* offset to the end (there may be old gunk there).
12269	*/
12270	while ((uint64_t)offs < buf->dtb_size)
12271	tomax[offs++] = `0`;
12272
12273	/*
12274	* We need to set our offset to zero. And because we
12275	* are wrapping, we need to set the bit indicating as
12276	* much. We can also adjust our needed space back
12277	* down to the space required by the ECB -- we know
12278	* that the top of the buffer is aligned.
12279	*/
12280	offs = `0`;
12281	total_off = needed;
12282	buf->dtb_flags \|= DTRACEBUF_WRAPPED;
12283	} else {
12284	/*
12285	* There is room for us in the buffer, so we simply
12286	* need to check the wrapped offset.
12287	*/
12288	if (woffs < offs) {
12289	/*
12290	* The wrapped offset is less than the offset.
12291	* This can happen if we allocated buffer space
12292	* that induced a wrap, but then we didn't
12293	* subsequently take the space due to an error
12294	* or false predicate evaluation. This is
12295	* okay; we know that _this_ allocation isn't
12296	* going to induce a wrap. We still can't
12297	* reset the wrapped offset to be zero,
12298	* however: the space may have been trashed in
12299	* the previous failed probe attempt. But at
12300	* least the wrapped offset doesn't need to
12301	* be adjusted at all...
12302	*/
12303	goto out;
12304	}
12305	}
12306
12307	while (offs + total_off > (size_t)woffs) {
12308	dtrace_epid_t epid = (uint32_t )(tomax + woffs);
12309	size_t size;
12310
12311	if (epid == DTRACE_EPIDNONE) {
12312	size = sizeof (uint32_t);
12313	} else {
12314	ASSERT(epid <= (dtrace_epid_t)state->dts_necbs);
12315	ASSERT(state->dts_ecbs[epid - `1`] != NULL);
12316
12317	size = state->dts_ecbs[epid - `1`]->dte_size;
12318	}
12319
12320	ASSERT(woffs + size <= buf->dtb_size);
12321	ASSERT(size != `0`);
12322
12323	if (woffs + size == buf->dtb_size) {
12324	/*
12325	* We've reached the end of the buffer; we want
12326	* to set the wrapped offset to 0 and break
12327	* out. However, if the offs is 0, then we're
12328	* in a strange edge-condition: the amount of
12329	* space that we want to reserve plus the size
12330	* of the record that we're overwriting is
12331	* greater than the size of the buffer. This
12332	* is problematic because if we reserve the
12333	* space but subsequently don't consume it (due
12334	* to a failed predicate or error) the wrapped
12335	* offset will be 0 -- yet the EPID at offset 0
12336	* will not be committed. This situation is
12337	* relatively easy to deal with: if we're in
12338	* this case, the buffer is indistinguishable
12339	* from one that hasn't wrapped; we need only
12340	* finish the job by clearing the wrapped bit,
12341	* explicitly setting the offset to be 0, and
12342	* zero'ing out the old data in the buffer.
12343	*/
12344	if (offs == `0`) {
12345	buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
12346	buf->dtb_offset = `0`;
12347	woffs = total_off;
12348
12349	while ((uint64_t)woffs < buf->dtb_size)
12350	tomax[woffs++] = `0`;
12351	}
12352
12353	woffs = `0`;
12354	break;
12355	}
12356
12357	woffs += size;
12358	}
12359
12360	/*
12361	* We have a wrapped offset. It may be that the wrapped offset
12362	* has become zero -- that's okay.
12363	*/
12364	buf->dtb_xamot_offset = woffs;
12365	}
12366
12367	out:
12368	/*
12369	* Now we can plow the buffer with any necessary padding.
12370	*/
12371	while (offs & (align - `1`)) {
12372	/*
12373	* Assert that our alignment is off by a number which
12374	* is itself sizeof (uint32_t) aligned.
12375	*/
12376	ASSERT(!((align - (offs & (align - `1`))) &
12377	(sizeof (uint32_t) - `1`)));
12378	DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
12379	offs += sizeof (uint32_t);
12380	}
12381
12382	if (buf->dtb_flags & DTRACEBUF_FILL) {
12383	if (offs + needed > buf->dtb_size - state->dts_reserve) {
12384	buf->dtb_flags \|= DTRACEBUF_FULL;
12385	return (-`1`);
12386	}
12387	}
12388
12389	if (mstate == NULL)
12390	return (offs);
12391
12392	/*
12393	* For ring buffers and fill buffers, the scratch space is always
12394	* the inactive buffer.
12395	*/
12396	mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
12397	mstate->dtms_scratch_size = buf->dtb_size;
12398	mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
12399
12400	return (offs);
12401	}
12402
12403	static void
12404	dtrace_buffer_polish(dtrace_buffer_t *buf)
12405	{
12406	ASSERT(buf->dtb_flags & DTRACEBUF_RING);
12407	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12408
12409	if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
12410	return;
12411
12412	/*
12413	* We need to polish the ring buffer. There are three cases:
12414	*
12415	* - The first (and presumably most common) is that there is no gap
12416	* between the buffer offset and the wrapped offset. In this case,
12417	* there is nothing in the buffer that isn't valid data; we can
12418	* mark the buffer as polished and return.
12419	*
12420	* - The second (less common than the first but still more common
12421	* than the third) is that there is a gap between the buffer offset
12422	* and the wrapped offset, and the wrapped offset is larger than the
12423	* buffer offset. This can happen because of an alignment issue, or
12424	* can happen because of a call to dtrace_buffer_reserve() that
12425	* didn't subsequently consume the buffer space. In this case,
12426	* we need to zero the data from the buffer offset to the wrapped
12427	* offset.
12428	*
12429	* - The third (and least common) is that there is a gap between the
12430	* buffer offset and the wrapped offset, but the wrapped offset is
12431	* _less_ than the buffer offset. This can only happen because a
12432	* call to dtrace_buffer_reserve() induced a wrap, but the space
12433	* was not subsequently consumed. In this case, we need to zero the
12434	* space from the offset to the end of the buffer _and_ from the
12435	* top of the buffer to the wrapped offset.
12436	*/
12437	if (buf->dtb_offset < buf->dtb_xamot_offset) {
12438	bzero(s: buf->dtb_tomax + buf->dtb_offset,
12439	n: buf->dtb_xamot_offset - buf->dtb_offset);
12440	}
12441
12442	if (buf->dtb_offset > buf->dtb_xamot_offset) {
12443	bzero(s: buf->dtb_tomax + buf->dtb_offset,
12444	n: buf->dtb_size - buf->dtb_offset);
12445	bzero(s: buf->dtb_tomax, n: buf->dtb_xamot_offset);
12446	}
12447	}
12448
12449	static void
12450	dtrace_buffer_free(dtrace_buffer_t *bufs)
12451	{
12452	int i;
12453
12454	for (i = `0`; i < (int)NCPU; i++) {
12455	dtrace_buffer_t *buf = &bufs[i];
12456
12457	if (buf->dtb_tomax == NULL) {
12458	ASSERT(buf->dtb_xamot == NULL);
12459	ASSERT(buf->dtb_size == `0`);
12460	continue;
12461	}
12462
12463	if (buf->dtb_xamot != NULL) {
12464	ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
12465	kmem_free(buf->dtb_xamot, buf->dtb_size);
12466
12467	ASSERT(dtrace_buffer_memory_inuse >= buf->dtb_size);
12468	dtrace_buffer_memory_inuse -= buf->dtb_size;
12469	}
12470
12471	kmem_free(buf->dtb_tomax, buf->dtb_size);
12472	ASSERT(dtrace_buffer_memory_inuse >= buf->dtb_size);
12473	dtrace_buffer_memory_inuse -= buf->dtb_size;
12474
12475	buf->dtb_size = `0`;
12476	buf->dtb_tomax = NULL;
12477	buf->dtb_xamot = NULL;
12478	}
12479	}
12480
12481	/*
12482	* DTrace Enabling Functions
12483	*/
12484	static dtrace_enabling_t *
12485	dtrace_enabling_create(dtrace_vstate_t *vstate)
12486	{
12487	dtrace_enabling_t *enab;
12488
12489	enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
12490	enab->dten_vstate = vstate;
12491
12492	return (enab);
12493	}
12494
12495	static void
12496	dtrace_enabling_add(dtrace_enabling_t enab, dtrace_ecbdesc_t ecb)
12497	{
12498	dtrace_ecbdesc_t **ndesc;
12499	size_t osize, nsize;
12500
12501	/*
12502	* We can't add to enablings after we've enabled them, or after we've
12503	* retained them.
12504	*/
12505	ASSERT(enab->dten_probegen == `0`);
12506	ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12507
12508	/ APPLE NOTE: this protects against gcc 4.0 botch on x86 /
12509	if (ecb == NULL) return;
12510
12511	if (enab->dten_ndesc < enab->dten_maxdesc) {
12512	enab->dten_desc[enab->dten_ndesc++] = ecb;
12513	return;
12514	}
12515
12516	osize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
12517
12518	if (enab->dten_maxdesc == `0`) {
12519	enab->dten_maxdesc = `1`;
12520	} else {
12521	enab->dten_maxdesc <<= `1`;
12522	}
12523
12524	ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
12525
12526	nsize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
12527	ndesc = kmem_zalloc(nsize, KM_SLEEP);
12528	bcopy(src: enab->dten_desc, dst: ndesc, n: osize);
12529	kmem_free(enab->dten_desc, osize);
12530
12531	enab->dten_desc = ndesc;
12532	enab->dten_desc[enab->dten_ndesc++] = ecb;
12533	}
12534
12535	static void
12536	dtrace_enabling_addlike(dtrace_enabling_t enab, dtrace_ecbdesc_t ecb,
12537	dtrace_probedesc_t *pd)
12538	{
12539	dtrace_ecbdesc_t *new;
12540	dtrace_predicate_t *pred;
12541	dtrace_actdesc_t *act;
12542
12543	/*
12544	* We're going to create a new ECB description that matches the
12545	* specified ECB in every way, but has the specified probe description.
12546	*/
12547	new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
12548
12549	if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
12550	dtrace_predicate_hold(pred);
12551
12552	for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
12553	dtrace_actdesc_hold(act);
12554
12555	new->dted_action = ecb->dted_action;
12556	new->dted_pred = ecb->dted_pred;
12557	new->dted_probe = *pd;
12558	new->dted_uarg = ecb->dted_uarg;
12559
12560	dtrace_enabling_add(enab, ecb: new);
12561	}
12562
12563	static void
12564	dtrace_enabling_dump(dtrace_enabling_t *enab)
12565	{
12566	int i;
12567
12568	for (i = `0`; i < enab->dten_ndesc; i++) {
12569	dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
12570
12571	cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
12572	desc->dtpd_provider, desc->dtpd_mod,
12573	desc->dtpd_func, desc->dtpd_name);
12574	}
12575	}
12576
12577	static void
12578	dtrace_enabling_destroy(dtrace_enabling_t *enab)
12579	{
12580	int i;
12581	dtrace_ecbdesc_t *ep;
12582	dtrace_vstate_t *vstate = enab->dten_vstate;
12583
12584	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12585
12586	for (i = `0`; i < enab->dten_ndesc; i++) {
12587	dtrace_actdesc_t act, next;
12588	dtrace_predicate_t *pred;
12589
12590	ep = enab->dten_desc[i];
12591
12592	if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
12593	dtrace_predicate_release(pred, vstate);
12594
12595	for (act = ep->dted_action; act != NULL; act = next) {
12596	next = act->dtad_next;
12597	dtrace_actdesc_release(act, vstate);
12598	}
12599
12600	kmem_free(ep, sizeof (dtrace_ecbdesc_t));
12601	}
12602
12603	kmem_free(enab->dten_desc,
12604	enab->dten_maxdesc * sizeof (dtrace_enabling_t *));
12605
12606	/*
12607	* If this was a retained enabling, decrement the dts_nretained count
12608	* and take it off of the dtrace_retained list.
12609	*/
12610	if (enab->dten_prev != NULL \|\| enab->dten_next != NULL \|\|
12611	dtrace_retained == enab) {
12612	ASSERT(enab->dten_vstate->dtvs_state != NULL);
12613	ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > `0`);
12614	enab->dten_vstate->dtvs_state->dts_nretained--;
12615	dtrace_retained_gen++;
12616	}
12617
12618	if (enab->dten_prev == NULL) {
12619	if (dtrace_retained == enab) {
12620	dtrace_retained = enab->dten_next;
12621
12622	if (dtrace_retained != NULL)
12623	dtrace_retained->dten_prev = NULL;
12624	}
12625	} else {
12626	ASSERT(enab != dtrace_retained);
12627	ASSERT(dtrace_retained != NULL);
12628	enab->dten_prev->dten_next = enab->dten_next;
12629	}
12630
12631	if (enab->dten_next != NULL) {
12632	ASSERT(dtrace_retained != NULL);
12633	enab->dten_next->dten_prev = enab->dten_prev;
12634	}
12635
12636	kmem_free(enab, sizeof (dtrace_enabling_t));
12637	}
12638
12639	static int
12640	dtrace_enabling_retain(dtrace_enabling_t *enab)
12641	{
12642	dtrace_state_t *state;
12643
12644	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12645	ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12646	ASSERT(enab->dten_vstate != NULL);
12647
12648	state = enab->dten_vstate->dtvs_state;
12649	ASSERT(state != NULL);
12650
12651	/*
12652	* We only allow each state to retain dtrace_retain_max enablings.
12653	*/
12654	if (state->dts_nretained >= dtrace_retain_max)
12655	return (ENOSPC);
12656
12657	state->dts_nretained++;
12658	dtrace_retained_gen++;
12659
12660	if (dtrace_retained == NULL) {
12661	dtrace_retained = enab;
12662	return (`0`);
12663	}
12664
12665	enab->dten_next = dtrace_retained;
12666	dtrace_retained->dten_prev = enab;
12667	dtrace_retained = enab;
12668
12669	return (`0`);
12670	}
12671
12672	static int
12673	dtrace_enabling_replicate(dtrace_state_t state, dtrace_probedesc_t match,
12674	dtrace_probedesc_t *create)
12675	{
12676	dtrace_enabling_t new, enab;
12677	int found = `0`, err = ENOENT;
12678
12679	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12680	ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
12681	ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
12682	ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
12683	ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
12684
12685	new = dtrace_enabling_create(vstate: &state->dts_vstate);
12686
12687	/*
12688	* Iterate over all retained enablings, looking for enablings that
12689	* match the specified state.
12690	*/
12691	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12692	int i;
12693
12694	/*
12695	* dtvs_state can only be NULL for helper enablings -- and
12696	* helper enablings can't be retained.
12697	*/
12698	ASSERT(enab->dten_vstate->dtvs_state != NULL);
12699
12700	if (enab->dten_vstate->dtvs_state != state)
12701	continue;
12702
12703	/*
12704	* Now iterate over each probe description; we're looking for
12705	* an exact match to the specified probe description.
12706	*/
12707	for (i = `0`; i < enab->dten_ndesc; i++) {
12708	dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12709	dtrace_probedesc_t *pd = &ep->dted_probe;
12710
12711	/ APPLE NOTE: Darwin employs size bounded string operation. /
12712	if (strncmp(s1: pd->dtpd_provider, s2: match->dtpd_provider, DTRACE_PROVNAMELEN))
12713	continue;
12714
12715	if (strncmp(s1: pd->dtpd_mod, s2: match->dtpd_mod, DTRACE_MODNAMELEN))
12716	continue;
12717
12718	if (strncmp(s1: pd->dtpd_func, s2: match->dtpd_func, DTRACE_FUNCNAMELEN))
12719	continue;
12720
12721	if (strncmp(s1: pd->dtpd_name, s2: match->dtpd_name, DTRACE_NAMELEN))
12722	continue;
12723
12724	/*
12725	* We have a winning probe! Add it to our growing
12726	* enabling.
12727	*/
12728	found = `1`;
12729	dtrace_enabling_addlike(enab: new, ecb: ep, pd: create);
12730	}
12731	}
12732
12733	if (!found \|\| (err = dtrace_enabling_retain(enab: new)) != `0`) {
12734	dtrace_enabling_destroy(enab: new);
12735	return (err);
12736	}
12737
12738	return (`0`);
12739	}
12740
12741	static void
12742	dtrace_enabling_retract(dtrace_state_t *state)
12743	{
12744	dtrace_enabling_t enab, next;
12745
12746	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12747
12748	/*
12749	* Iterate over all retained enablings, destroy the enablings retained
12750	* for the specified state.
12751	*/
12752	for (enab = dtrace_retained; enab != NULL; enab = next) {
12753	next = enab->dten_next;
12754
12755	/*
12756	* dtvs_state can only be NULL for helper enablings -- and
12757	* helper enablings can't be retained.
12758	*/
12759	ASSERT(enab->dten_vstate->dtvs_state != NULL);
12760
12761	if (enab->dten_vstate->dtvs_state == state) {
12762	ASSERT(state->dts_nretained > `0`);
12763	dtrace_enabling_destroy(enab);
12764	}
12765	}
12766
12767	ASSERT(state->dts_nretained == `0`);
12768	}
12769
12770	static int
12771	dtrace_enabling_match(dtrace_enabling_t enab, int* nmatched, dtrace_match_cond_t cond)
12772	{
12773	int i = `0`;
12774	int total_matched = `0`, matched = `0`;
12775
12776	LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
12777	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12778
12779	for (i = `0`; i < enab->dten_ndesc; i++) {
12780	dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12781
12782	enab->dten_current = ep;
12783	enab->dten_error = `0`;
12784
12785	/**
12786	* Before doing a dtrace_probe_enable, which is really
12787	* expensive, check that this enabling matches the matching precondition
12788	* if we have one
12789	*/
12790	if (cond && (cond->dmc_func(&ep->dted_probe, cond->dmc_data) == `0`)) {
12791	continue;
12792	}
12793	/*
12794	* If a provider failed to enable a probe then get out and
12795	* let the consumer know we failed.
12796	*/
12797	if ((matched = dtrace_probe_enable(desc: &ep->dted_probe, enab, ep)) < `0`)
12798	return (EBUSY);
12799
12800	total_matched += matched;
12801
12802	if (enab->dten_error != `0`) {
12803	/*
12804	* If we get an error half-way through enabling the
12805	* probes, we kick out -- perhaps with some number of
12806	* them enabled. Leaving enabled probes enabled may
12807	* be slightly confusing for user-level, but we expect
12808	* that no one will attempt to actually drive on in
12809	* the face of such errors. If this is an anonymous
12810	* enabling (indicated with a NULL nmatched pointer),
12811	* we cmn_err() a message. We aren't expecting to
12812	* get such an error -- such as it can exist at all,
12813	* it would be a result of corrupted DOF in the driver
12814	* properties.
12815	*/
12816	if (nmatched == NULL) {
12817	cmn_err(CE_WARN, "dtrace_enabling_match() "
12818	"error on %p: %d", (void *)ep,
12819	enab->dten_error);
12820	}
12821
12822	return (enab->dten_error);
12823	}
12824
12825	ep->dted_probegen = dtrace_probegen;
12826	}
12827
12828	if (nmatched != NULL)
12829	*nmatched = total_matched;
12830
12831	return (`0`);
12832	}
12833
12834	static void
12835	dtrace_enabling_matchall_with_cond(dtrace_match_cond_t *cond)
12836	{
12837	dtrace_enabling_t *enab;
12838
12839	lck_mtx_lock(lck: &cpu_lock);
12840	lck_mtx_lock(lck: &dtrace_lock);
12841
12842	/*
12843	* Iterate over all retained enablings to see if any probes match
12844	* against them. We only perform this operation on enablings for which
12845	* we have sufficient permissions by virtue of being in the global zone
12846	* or in the same zone as the DTrace client. Because we can be called
12847	* after dtrace_detach() has been called, we cannot assert that there
12848	* are retained enablings. We can safely load from dtrace_retained,
12849	* however: the taskq_destroy() at the end of dtrace_detach() will
12850	* block pending our completion.
12851	*/
12852
12853	/*
12854	* Darwin doesn't do zones.
12855	* Behave as if always in "global" zone."
12856	*/
12857	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12858	(void) dtrace_enabling_match(enab, NULL, cond);
12859	}
12860
12861	lck_mtx_unlock(lck: &dtrace_lock);
12862	lck_mtx_unlock(lck: &cpu_lock);
12863
12864	}
12865
12866	static void
12867	dtrace_enabling_matchall(void)
12868	{
12869	dtrace_enabling_matchall_with_cond(NULL);
12870	}
12871
12872
12873
12874	/*
12875	* If an enabling is to be enabled without having matched probes (that is, if
12876	* dtrace_state_go() is to be called on the underlying dtrace_state_t), the
12877	* enabling must be _primed_ by creating an ECB for every ECB description.
12878	* This must be done to assure that we know the number of speculations, the
12879	* number of aggregations, the minimum buffer size needed, etc. before we
12880	* transition out of DTRACE_ACTIVITY_INACTIVE. To do this without actually
12881	* enabling any probes, we create ECBs for every ECB decription, but with a
12882	* NULL probe -- which is exactly what this function does.
12883	*/
12884	static void
12885	dtrace_enabling_prime(dtrace_state_t *state)
12886	{
12887	dtrace_enabling_t *enab;
12888	int i;
12889
12890	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12891	ASSERT(enab->dten_vstate->dtvs_state != NULL);
12892
12893	if (enab->dten_vstate->dtvs_state != state)
12894	continue;
12895
12896	/*
12897	* We don't want to prime an enabling more than once, lest
12898	* we allow a malicious user to induce resource exhaustion.
12899	* (The ECBs that result from priming an enabling aren't
12900	* leaked -- but they also aren't deallocated until the
12901	* consumer state is destroyed.)
12902	*/
12903	if (enab->dten_primed)
12904	continue;
12905
12906	for (i = `0`; i < enab->dten_ndesc; i++) {
12907	enab->dten_current = enab->dten_desc[i];
12908	(void) dtrace_probe_enable(NULL, enab, NULL);
12909	}
12910
12911	enab->dten_primed = `1`;
12912	}
12913	}
12914
12915	/*
12916	* Called to indicate that probes should be provided due to retained
12917	* enablings. This is implemented in terms of dtrace_probe_provide(), but it
12918	* must take an initial lap through the enabling calling the dtps_provide()
12919	* entry point explicitly to allow for autocreated probes.
12920	*/
12921	static void
12922	dtrace_enabling_provide(dtrace_provider_t *prv)
12923	{
12924	int i, all = `0`;
12925	dtrace_probedesc_t desc;
12926	dtrace_genid_t gen;
12927
12928	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12929	LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
12930
12931	if (prv == NULL) {
12932	all = `1`;
12933	prv = dtrace_provider;
12934	}
12935
12936	do {
12937	dtrace_enabling_t *enab;
12938	void *parg = prv->dtpv_arg;
12939
12940	retry:
12941	gen = dtrace_retained_gen;
12942	for (enab = dtrace_retained; enab != NULL;
12943	enab = enab->dten_next) {
12944	for (i = `0`; i < enab->dten_ndesc; i++) {
12945	desc = enab->dten_desc[i]->dted_probe;
12946	lck_mtx_unlock(lck: &dtrace_lock);
12947	prv->dtpv_pops.dtps_provide(parg, &desc);
12948	lck_mtx_lock(lck: &dtrace_lock);
12949	/*
12950	* Process the retained enablings again if
12951	* they have changed while we weren't holding
12952	* dtrace_lock.
12953	*/
12954	if (gen != dtrace_retained_gen)
12955	goto retry;
12956	}
12957	}
12958	} while (all && (prv = prv->dtpv_next) != NULL);
12959
12960	lck_mtx_unlock(lck: &dtrace_lock);
12961	dtrace_probe_provide(NULL, prv: all ? NULL : prv);
12962	lck_mtx_lock(lck: &dtrace_lock);
12963	}
12964
12965	/*
12966	* DTrace DOF Functions
12967	*/
12968	/ARGSUSED/
12969	static void
12970	dtrace_dof_error(dof_hdr_t dof, const* char *str)
12971	{
12972	#pragma unused(dof) /* __APPLE__ */
12973	if (dtrace_err_verbose)
12974	cmn_err(CE_WARN, "failed to process DOF: %s", str);
12975
12976	#ifdef DTRACE_ERRDEBUG
12977	dtrace_errdebug(str);
12978	#endif
12979	}
12980
12981	/*
12982	* Create DOF out of a currently enabled state. Right now, we only create
12983	* DOF containing the run-time options -- but this could be expanded to create
12984	* complete DOF representing the enabled state.
12985	*/
12986	static dof_hdr_t *
12987	dtrace_dof_create(dtrace_state_t *state)
12988	{
12989	dof_hdr_t *dof;
12990	dof_sec_t *sec;
12991	dof_optdesc_t *opt;
12992	int i, len = sizeof (dof_hdr_t) +
12993	roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
12994	sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12995
12996	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12997
12998	dof = kmem_zalloc_aligned(len, `8`, KM_SLEEP);
12999	dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
13000	dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
13001	dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
13002	dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
13003
13004	dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
13005	dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
13006	dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
13007	dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
13008	dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
13009	dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
13010
13011	dof->dofh_flags = `0`;
13012	dof->dofh_hdrsize = sizeof (dof_hdr_t);
13013	dof->dofh_secsize = sizeof (dof_sec_t);
13014	dof->dofh_secnum = `1`; / only DOF_SECT_OPTDESC /
13015	dof->dofh_secoff = sizeof (dof_hdr_t);
13016	dof->dofh_loadsz = len;
13017	dof->dofh_filesz = len;
13018	dof->dofh_pad = `0`;
13019
13020	/*
13021	* Fill in the option section header...
13022	*/
13023	sec = (dof_sec_t )((uintptr_t)dof + sizeof* (dof_hdr_t));
13024	sec->dofs_type = DOF_SECT_OPTDESC;
13025	sec->dofs_align = sizeof (uint64_t);
13026	sec->dofs_flags = DOF_SECF_LOAD;
13027	sec->dofs_entsize = sizeof (dof_optdesc_t);
13028
13029	opt = (dof_optdesc_t *)((uintptr_t)sec +
13030	roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
13031
13032	sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
13033	sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
13034
13035	for (i = `0`; i < DTRACEOPT_MAX; i++) {
13036	opt[i].dofo_option = i;
13037	opt[i].dofo_strtab = DOF_SECIDX_NONE;
13038	opt[i].dofo_value = state->dts_options[i];
13039	}
13040
13041	return (dof);
13042	}
13043
13044	static dof_hdr_t *
13045	dtrace_dof_copyin(user_addr_t uarg, int *errp)
13046	{
13047	dof_hdr_t hdr, *dof;
13048
13049	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
13050
13051	/*
13052	* First, we're going to copyin() the sizeof (dof_hdr_t).
13053	*/
13054	if (copyin(uarg, &hdr, sizeof (hdr)) != `0`) {
13055	dtrace_dof_error(NULL, str: "failed to copyin DOF header");
13056	*errp = EFAULT;
13057	return (NULL);
13058	}
13059
13060	/*
13061	* Now we'll allocate the entire DOF and copy it in -- provided
13062	* that the length isn't outrageous.
13063	*/
13064	if (hdr.dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
13065	dtrace_dof_error(dof: &hdr, str: "load size exceeds maximum");
13066	*errp = E2BIG;
13067	return (NULL);
13068	}
13069
13070	if (hdr.dofh_loadsz < sizeof (hdr)) {
13071	dtrace_dof_error(dof: &hdr, str: "invalid load size");
13072	*errp = EINVAL;
13073	return (NULL);
13074	}
13075
13076	dof = kmem_alloc_aligned(hdr.dofh_loadsz, `8`, KM_SLEEP);
13077
13078	if (copyin(uarg, dof, hdr.dofh_loadsz) != `0` \|\|
13079	dof->dofh_loadsz != hdr.dofh_loadsz) {
13080	kmem_free_aligned(dof, hdr.dofh_loadsz);
13081	*errp = EFAULT;
13082	return (NULL);
13083	}
13084
13085	return (dof);
13086	}
13087
13088	static dof_hdr_t *
13089	dtrace_dof_copyin_from_proc(proc_t* p, user_addr_t uarg, int *errp)
13090	{
13091	dof_hdr_t hdr, *dof;
13092
13093	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
13094
13095	/*
13096	* First, we're going to copyin() the sizeof (dof_hdr_t).
13097	*/
13098	if (uread(p, buf: &hdr, len: sizeof(hdr), a: uarg) != KERN_SUCCESS) {
13099	dtrace_dof_error(NULL, str: "failed to copyin DOF header");
13100	*errp = EFAULT;
13101	return (NULL);
13102	}
13103
13104	/*
13105	* Now we'll allocate the entire DOF and copy it in -- provided
13106	* that the length isn't outrageous.
13107	*/
13108	if (hdr.dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
13109	dtrace_dof_error(dof: &hdr, str: "load size exceeds maximum");
13110	*errp = E2BIG;
13111	return (NULL);
13112	}
13113
13114	if (hdr.dofh_loadsz < sizeof (hdr)) {
13115	dtrace_dof_error(dof: &hdr, str: "invalid load size");
13116	*errp = EINVAL;
13117	return (NULL);
13118	}
13119
13120	dof = kmem_alloc_aligned(hdr.dofh_loadsz, `8`, KM_SLEEP);
13121
13122	if (uread(p, buf: dof, len: hdr.dofh_loadsz, a: uarg) != KERN_SUCCESS \|\|
13123	dof->dofh_loadsz != hdr.dofh_loadsz) {
13124	kmem_free_aligned(dof, hdr.dofh_loadsz);
13125	*errp = EFAULT;
13126	return (NULL);
13127	}
13128
13129	return (dof);
13130	}
13131
13132	static void
13133	dtrace_dof_destroy(dof_hdr_t *dof)
13134	{
13135	kmem_free_aligned(dof, dof->dofh_loadsz);
13136	}
13137
13138	static dof_hdr_t *
13139	dtrace_dof_property(const char *name)
13140	{
13141	unsigned int len = `0`;
13142	dof_hdr_t *dof;
13143
13144	if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
13145	return NULL;
13146	}
13147
13148	if (!PEReadNVRAMProperty(symbol: name, NULL, len: &len)) {
13149	return NULL;
13150	}
13151
13152	dof = kmem_alloc_aligned(len, `8`, KM_SLEEP);
13153
13154	if (!PEReadNVRAMProperty(symbol: name, value: dof, len: &len)) {
13155	dtrace_dof_destroy(dof);
13156	dtrace_dof_error(NULL, str: "unreadable DOF");
13157	return NULL;
13158	}
13159
13160	if (len < sizeof (dof_hdr_t)) {
13161	dtrace_dof_destroy(dof);
13162	dtrace_dof_error(NULL, str: "truncated header");
13163	return (NULL);
13164	}
13165
13166	if (len < dof->dofh_loadsz) {
13167	dtrace_dof_destroy(dof);
13168	dtrace_dof_error(NULL, str: "truncated DOF");
13169	return (NULL);
13170	}
13171
13172	if (len != dof->dofh_loadsz) {
13173	dtrace_dof_destroy(dof);
13174	dtrace_dof_error(NULL, str: "invalid DOF size");
13175	return (NULL);
13176	}
13177
13178	if (dof->dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
13179	dtrace_dof_destroy(dof);
13180	dtrace_dof_error(NULL, str: "oversized DOF");
13181	return (NULL);
13182	}
13183
13184	return (dof);
13185	}
13186
13187	/*
13188	* Return the dof_sec_t pointer corresponding to a given section index. If the
13189	* index is not valid, dtrace_dof_error() is called and NULL is returned. If
13190	* a type other than DOF_SECT_NONE is specified, the header is checked against
13191	* this type and NULL is returned if the types do not match.
13192	*/
13193	static dof_sec_t *
13194	dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
13195	{
13196	dof_sec_t sec = (dof_sec_t )(uintptr_t)
13197	((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
13198
13199	if (i >= dof->dofh_secnum) {
13200	dtrace_dof_error(dof, str: "referenced section index is invalid");
13201	return (NULL);
13202	}
13203
13204	if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
13205	dtrace_dof_error(dof, str: "referenced section is not loadable");
13206	return (NULL);
13207	}
13208
13209	if (type != DOF_SECT_NONE && type != sec->dofs_type) {
13210	dtrace_dof_error(dof, str: "referenced section is the wrong type");
13211	return (NULL);
13212	}
13213
13214	return (sec);
13215	}
13216
13217	static dtrace_probedesc_t *
13218	dtrace_dof_probedesc(dof_hdr_t dof, dof_sec_t sec, dtrace_probedesc_t *desc)
13219	{
13220	dof_probedesc_t *probe;
13221	dof_sec_t *strtab;
13222	uintptr_t daddr = (uintptr_t)dof;
13223	uintptr_t str;
13224	size_t size;
13225
13226	if (sec->dofs_type != DOF_SECT_PROBEDESC) {
13227	dtrace_dof_error(dof, str: "invalid probe section");
13228	return (NULL);
13229	}
13230
13231	if (sec->dofs_align != sizeof (dof_secidx_t)) {
13232	dtrace_dof_error(dof, str: "bad alignment in probe description");
13233	return (NULL);
13234	}
13235
13236	if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
13237	dtrace_dof_error(dof, str: "truncated probe description");
13238	return (NULL);
13239	}
13240
13241	probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
13242	strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, i: probe->dofp_strtab);
13243
13244	if (strtab == NULL)
13245	return (NULL);
13246
13247	str = daddr + strtab->dofs_offset;
13248	size = strtab->dofs_size;
13249
13250	if (probe->dofp_provider >= strtab->dofs_size) {
13251	dtrace_dof_error(dof, str: "corrupt probe provider");
13252	return (NULL);
13253	}
13254
13255	(void) strncpy(desc->dtpd_provider,
13256	(char *)(str + probe->dofp_provider),
13257	MIN(DTRACE_PROVNAMELEN - `1`, size - probe->dofp_provider));
13258
13259	/ APPLE NOTE: Darwin employs size bounded string operation. /
13260	desc->dtpd_provider[DTRACE_PROVNAMELEN - `1`] = `'\0'`;
13261
13262	if (probe->dofp_mod >= strtab->dofs_size) {
13263	dtrace_dof_error(dof, str: "corrupt probe module");
13264	return (NULL);
13265	}
13266
13267	(void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
13268	MIN(DTRACE_MODNAMELEN - `1`, size - probe->dofp_mod));
13269
13270	/ APPLE NOTE: Darwin employs size bounded string operation. /
13271	desc->dtpd_mod[DTRACE_MODNAMELEN - `1`] = `'\0'`;
13272
13273	if (probe->dofp_func >= strtab->dofs_size) {
13274	dtrace_dof_error(dof, str: "corrupt probe function");
13275	return (NULL);
13276	}
13277
13278	(void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
13279	MIN(DTRACE_FUNCNAMELEN - `1`, size - probe->dofp_func));
13280
13281	/ APPLE NOTE: Darwin employs size bounded string operation. /
13282	desc->dtpd_func[DTRACE_FUNCNAMELEN - `1`] = `'\0'`;
13283
13284	if (probe->dofp_name >= strtab->dofs_size) {
13285	dtrace_dof_error(dof, str: "corrupt probe name");
13286	return (NULL);
13287	}
13288
13289	(void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
13290	MIN(DTRACE_NAMELEN - `1`, size - probe->dofp_name));
13291
13292	/ APPLE NOTE: Darwin employs size bounded string operation. /
13293	desc->dtpd_name[DTRACE_NAMELEN - `1`] = `'\0'`;
13294
13295	return (desc);
13296	}
13297
13298	static dtrace_difo_t *
13299	dtrace_dof_difo(dof_hdr_t dof, dof_sec_t sec, dtrace_vstate_t *vstate,
13300	cred_t *cr)
13301	{
13302	dtrace_difo_t *dp;
13303	size_t ttl = `0`;
13304	dof_difohdr_t *dofd;
13305	uintptr_t daddr = (uintptr_t)dof;
13306	size_t max_size = dtrace_difo_maxsize;
13307	uint_t i;
13308	int l, n;
13309
13310
13311	static const struct {
13312	int section;
13313	int bufoffs;
13314	int lenoffs;
13315	int entsize;
13316	int align;
13317	const char *msg;
13318	} difo[] = {
13319	{ DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
13320	offsetof(dtrace_difo_t, dtdo_len), .lenoffs: sizeof (dif_instr_t),
13321	.entsize: sizeof (dif_instr_t), .align: "multiple DIF sections" },
13322
13323	{ DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
13324	offsetof(dtrace_difo_t, dtdo_intlen), .entsize: sizeof (uint64_t),
13325	.align: sizeof (uint64_t), .msg: "multiple integer tables" },
13326
13327	{ DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
13328	offsetof(dtrace_difo_t, dtdo_strlen), .entsize: `0`,
13329	.align: sizeof (char), .msg: "multiple string tables" },
13330
13331	{ DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
13332	offsetof(dtrace_difo_t, dtdo_varlen), .entsize: sizeof (dtrace_difv_t),
13333	.align: sizeof (uint_t), .msg: "multiple variable tables" },
13334
13335	{ DOF_SECT_NONE, .bufoffs: `0`, .lenoffs: `0`, .entsize: `0`, .align: `0`, NULL }
13336	};
13337
13338	if (sec->dofs_type != DOF_SECT_DIFOHDR) {
13339	dtrace_dof_error(dof, str: "invalid DIFO header section");
13340	return (NULL);
13341	}
13342
13343	if (sec->dofs_align != sizeof (dof_secidx_t)) {
13344	dtrace_dof_error(dof, str: "bad alignment in DIFO header");
13345	return (NULL);
13346	}
13347
13348	if (sec->dofs_size < sizeof (dof_difohdr_t) \|\|
13349	sec->dofs_size % sizeof (dof_secidx_t)) {
13350	dtrace_dof_error(dof, str: "bad size in DIFO header");
13351	return (NULL);
13352	}
13353
13354	dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
13355	n = (sec->dofs_size - sizeof (dofd)) / sizeof* (dof_secidx_t) + `1`;
13356
13357	dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
13358	dp->dtdo_rtype = dofd->dofd_rtype;
13359
13360	for (l = `0`; l < n; l++) {
13361	dof_sec_t *subsec;
13362	void **bufp;
13363	uint32_t *lenp;
13364
13365	if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
13366	i: dofd->dofd_links[l])) == NULL)
13367	goto err; / invalid section link /
13368
13369	if (ttl + subsec->dofs_size > max_size) {
13370	dtrace_dof_error(dof, str: "exceeds maximum size");
13371	goto err;
13372	}
13373
13374	ttl += subsec->dofs_size;
13375
13376	for (i = `0`; difo[i].section != DOF_SECT_NONE; i++) {
13377
13378	if (subsec->dofs_type != (uint32_t)difo[i].section)
13379	continue;
13380
13381	if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
13382	dtrace_dof_error(dof, str: "section not loaded");
13383	goto err;
13384	}
13385
13386	if (subsec->dofs_align != (uint32_t)difo[i].align) {
13387	dtrace_dof_error(dof, str: "bad alignment");
13388	goto err;
13389	}
13390
13391	bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
13392	lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
13393
13394	if (*bufp != NULL) {
13395	dtrace_dof_error(dof, str: difo[i].msg);
13396	goto err;
13397	}
13398
13399	if ((uint32_t)difo[i].entsize != subsec->dofs_entsize) {
13400	dtrace_dof_error(dof, str: "entry size mismatch");
13401	goto err;
13402	}
13403
13404	if (subsec->dofs_entsize != `0` &&
13405	(subsec->dofs_size % subsec->dofs_entsize) != `0`) {
13406	dtrace_dof_error(dof, str: "corrupt entry size");
13407	goto err;
13408	}
13409
13410	*lenp = subsec->dofs_size;
13411	*bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
13412	bcopy(src: (char *)(uintptr_t)(daddr + subsec->dofs_offset),
13413	dst: *bufp, n: subsec->dofs_size);
13414
13415	if (subsec->dofs_entsize != `0`)
13416	*lenp /= subsec->dofs_entsize;
13417
13418	break;
13419	}
13420
13421	/*
13422	* If we encounter a loadable DIFO sub-section that is not
13423	* known to us, assume this is a broken program and fail.
13424	*/
13425	if (difo[i].section == DOF_SECT_NONE &&
13426	(subsec->dofs_flags & DOF_SECF_LOAD)) {
13427	dtrace_dof_error(dof, str: "unrecognized DIFO subsection");
13428	goto err;
13429	}
13430	}
13431
13432	if (dp->dtdo_buf == NULL) {
13433	/*
13434	* We can't have a DIF object without DIF text.
13435	*/
13436	dtrace_dof_error(dof, str: "missing DIF text");
13437	goto err;
13438	}
13439
13440	/*
13441	* Before we validate the DIF object, run through the variable table
13442	* looking for the strings -- if any of their size are under, we'll set
13443	* their size to be the system-wide default string size. Note that
13444	* this should _not_ happen if the "strsize" option has been set --
13445	* in this case, the compiler should have set the size to reflect the
13446	* setting of the option.
13447	*/
13448	for (i = `0`; i < dp->dtdo_varlen; i++) {
13449	dtrace_difv_t *v = &dp->dtdo_vartab[i];
13450	dtrace_diftype_t *t = &v->dtdv_type;
13451
13452	if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
13453	continue;
13454
13455	if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == `0`)
13456	t->dtdt_size = dtrace_strsize_default;
13457	}
13458
13459	if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != `0`)
13460	goto err;
13461
13462	dtrace_difo_init(dp, vstate);
13463	return (dp);
13464
13465	err:
13466	kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
13467	kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
13468	kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
13469	kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
13470
13471	kmem_free(dp, sizeof (dtrace_difo_t));
13472	return (NULL);
13473	}
13474
13475	static dtrace_predicate_t *
13476	dtrace_dof_predicate(dof_hdr_t dof, dof_sec_t sec, dtrace_vstate_t *vstate,
13477	cred_t *cr)
13478	{
13479	dtrace_difo_t *dp;
13480
13481	if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
13482	return (NULL);
13483
13484	return (dtrace_predicate_create(dp));
13485	}
13486
13487	static dtrace_actdesc_t *
13488	dtrace_dof_actdesc(dof_hdr_t dof, dof_sec_t sec, dtrace_vstate_t *vstate,
13489	cred_t *cr)
13490	{
13491	dtrace_actdesc_t act, first = NULL, last = NULL, next;
13492	dof_actdesc_t *desc;
13493	dof_sec_t *difosec;
13494	size_t offs;
13495	uintptr_t daddr = (uintptr_t)dof;
13496	uint64_t arg;
13497	dtrace_actkind_t kind;
13498
13499	if (sec->dofs_type != DOF_SECT_ACTDESC) {
13500	dtrace_dof_error(dof, str: "invalid action section");
13501	return (NULL);
13502	}
13503
13504	if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
13505	dtrace_dof_error(dof, str: "truncated action description");
13506	return (NULL);
13507	}
13508
13509	if (sec->dofs_align != sizeof (uint64_t)) {
13510	dtrace_dof_error(dof, str: "bad alignment in action description");
13511	return (NULL);
13512	}
13513
13514	if (sec->dofs_size < sec->dofs_entsize) {
13515	dtrace_dof_error(dof, str: "section entry size exceeds total size");
13516	return (NULL);
13517	}
13518
13519	if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
13520	dtrace_dof_error(dof, str: "bad entry size in action description");
13521	return (NULL);
13522	}
13523
13524	if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
13525	dtrace_dof_error(dof, str: "actions exceed dtrace_actions_max");
13526	return (NULL);
13527	}
13528
13529	for (offs = `0`; offs < sec->dofs_size; offs += sec->dofs_entsize) {
13530	desc = (dof_actdesc_t *)(daddr +
13531	(uintptr_t)sec->dofs_offset + offs);
13532	kind = (dtrace_actkind_t)desc->dofa_kind;
13533
13534	if ((DTRACEACT_ISPRINTFLIKE(kind) &&
13535	(kind != DTRACEACT_PRINTA \|\| desc->dofa_strtab != DOF_SECIDX_NONE)) \|\|
13536	(kind == DTRACEACT_DIFEXPR && desc->dofa_strtab != DOF_SECIDX_NONE))
13537	{
13538	dof_sec_t *strtab;
13539	char str, fmt;
13540	uint64_t i;
13541
13542	/*
13543	* The argument to these actions is an index into the
13544	* DOF string table. For printf()-like actions, this
13545	* is the format string. For print(), this is the
13546	* CTF type of the expression result.
13547	*/
13548	if ((strtab = dtrace_dof_sect(dof,
13549	DOF_SECT_STRTAB, i: desc->dofa_strtab)) == NULL)
13550	goto err;
13551
13552	str = (char *)((uintptr_t)dof +
13553	(uintptr_t)strtab->dofs_offset);
13554
13555	for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
13556	if (str[i] == `'\0'`)
13557	break;
13558	}
13559
13560	if (i >= strtab->dofs_size) {
13561	dtrace_dof_error(dof, str: "bogus format string");
13562	goto err;
13563	}
13564
13565	if (i == desc->dofa_arg) {
13566	dtrace_dof_error(dof, str: "empty format string");
13567	goto err;
13568	}
13569
13570	i -= desc->dofa_arg;
13571	fmt = kmem_alloc(i + `1`, KM_SLEEP);
13572	bcopy(src: &str[desc->dofa_arg], dst: fmt, n: i + `1`);
13573	arg = (uint64_t)(uintptr_t)fmt;
13574	} else {
13575	if (kind == DTRACEACT_PRINTA) {
13576	ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
13577	arg = `0`;
13578	} else {
13579	arg = desc->dofa_arg;
13580	}
13581	}
13582
13583	act = dtrace_actdesc_create(kind, ntuple: desc->dofa_ntuple,
13584	uarg: desc->dofa_uarg, arg);
13585
13586	if (last != NULL) {
13587	last->dtad_next = act;
13588	} else {
13589	first = act;
13590	}
13591
13592	last = act;
13593
13594	if (desc->dofa_difo == DOF_SECIDX_NONE)
13595	continue;
13596
13597	if ((difosec = dtrace_dof_sect(dof,
13598	DOF_SECT_DIFOHDR, i: desc->dofa_difo)) == NULL)
13599	goto err;
13600
13601	act->dtad_difo = dtrace_dof_difo(dof, sec: difosec, vstate, cr);
13602
13603	if (act->dtad_difo == NULL)
13604	goto err;
13605	}
13606
13607	ASSERT(first != NULL);
13608	return (first);
13609
13610	err:
13611	for (act = first; act != NULL; act = next) {
13612	next = act->dtad_next;
13613	dtrace_actdesc_release(act, vstate);
13614	}
13615
13616	return (NULL);
13617	}
13618
13619	static dtrace_ecbdesc_t *
13620	dtrace_dof_ecbdesc(dof_hdr_t dof, dof_sec_t sec, dtrace_vstate_t *vstate,
13621	cred_t *cr)
13622	{
13623	dtrace_ecbdesc_t *ep;
13624	dof_ecbdesc_t *ecb;
13625	dtrace_probedesc_t *desc;
13626	dtrace_predicate_t *pred = NULL;
13627
13628	if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
13629	dtrace_dof_error(dof, str: "truncated ECB description");
13630	return (NULL);
13631	}
13632
13633	if (sec->dofs_align != sizeof (uint64_t)) {
13634	dtrace_dof_error(dof, str: "bad alignment in ECB description");
13635	return (NULL);
13636	}
13637
13638	ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
13639	sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, i: ecb->dofe_probes);
13640
13641	if (sec == NULL)
13642	return (NULL);
13643
13644	ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
13645	ep->dted_uarg = ecb->dofe_uarg;
13646	desc = &ep->dted_probe;
13647
13648	if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
13649	goto err;
13650
13651	if (ecb->dofe_pred != DOF_SECIDX_NONE) {
13652	if ((sec = dtrace_dof_sect(dof,
13653	DOF_SECT_DIFOHDR, i: ecb->dofe_pred)) == NULL)
13654	goto err;
13655
13656	if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
13657	goto err;
13658
13659	ep->dted_pred.dtpdd_predicate = pred;
13660	}
13661
13662	if (ecb->dofe_actions != DOF_SECIDX_NONE) {
13663	if ((sec = dtrace_dof_sect(dof,
13664	DOF_SECT_ACTDESC, i: ecb->dofe_actions)) == NULL)
13665	goto err;
13666
13667	ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
13668
13669	if (ep->dted_action == NULL)
13670	goto err;
13671	}
13672
13673	return (ep);
13674
13675	err:
13676	if (pred != NULL)
13677	dtrace_predicate_release(pred, vstate);
13678	kmem_free(ep, sizeof (dtrace_ecbdesc_t));
13679	return (NULL);
13680	}
13681
13682	/*
13683	* APPLE NOTE: dyld handles dof relocation.
13684	* Darwin does not need dtrace_dof_relocate()
13685	*/
13686
13687	/*
13688	* The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
13689	* header: it should be at the front of a memory region that is at least
13690	* sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
13691	* size. It need not be validated in any other way.
13692	*/
13693	static int
13694	dtrace_dof_slurp(dof_hdr_t dof, dtrace_vstate_t vstate, cred_t *cr,
13695	dtrace_enabling_t *enabp, uint64_t ubase, int* noprobes)
13696	{
13697	#pragma unused(ubase) /* __APPLE__ */
13698	uint64_t len = dof->dofh_loadsz, seclen;
13699	uintptr_t daddr = (uintptr_t)dof;
13700	dtrace_ecbdesc_t *ep;
13701	dtrace_enabling_t *enab;
13702	uint_t i;
13703
13704	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13705	ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
13706
13707	/*
13708	* Check the DOF header identification bytes. In addition to checking
13709	* valid settings, we also verify that unused bits/bytes are zeroed so
13710	* we can use them later without fear of regressing existing binaries.
13711	*/
13712	if (bcmp(s1: &dof->dofh_ident[DOF_ID_MAG0],
13713	DOF_MAG_STRING, DOF_MAG_STRLEN) != `0`) {
13714	dtrace_dof_error(dof, str: "DOF magic string mismatch");
13715	return (-`1`);
13716	}
13717
13718	if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
13719	dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
13720	dtrace_dof_error(dof, str: "DOF has invalid data model");
13721	return (-`1`);
13722	}
13723
13724	if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
13725	dtrace_dof_error(dof, str: "DOF encoding mismatch");
13726	return (-`1`);
13727	}
13728
13729	/*
13730	* APPLE NOTE: Darwin only supports DOF_VERSION_3 for now.
13731	*/
13732	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_3) {
13733	dtrace_dof_error(dof, str: "DOF version mismatch");
13734	return (-`1`);
13735	}
13736
13737	if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
13738	dtrace_dof_error(dof, str: "DOF uses unsupported instruction set");
13739	return (-`1`);
13740	}
13741
13742	if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
13743	dtrace_dof_error(dof, str: "DOF uses too many integer registers");
13744	return (-`1`);
13745	}
13746
13747	if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
13748	dtrace_dof_error(dof, str: "DOF uses too many tuple registers");
13749	return (-`1`);
13750	}
13751
13752	for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
13753	if (dof->dofh_ident[i] != `0`) {
13754	dtrace_dof_error(dof, str: "DOF has invalid ident byte set");
13755	return (-`1`);
13756	}
13757	}
13758
13759	if (dof->dofh_flags & ~DOF_FL_VALID) {
13760	dtrace_dof_error(dof, str: "DOF has invalid flag bits set");
13761	return (-`1`);
13762	}
13763
13764	if (dof->dofh_secsize < sizeof(dof_sec_t)) {
13765	dtrace_dof_error(dof, str: "invalid section header size");
13766	return (-`1`);
13767	}
13768
13769	/*
13770	* Check that the section headers don't exceed the amount of DOF
13771	* data. Note that we cast the section size and number of sections
13772	* to uint64_t's to prevent possible overflow in the multiplication.
13773	*/
13774	seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
13775
13776	if (dof->dofh_secoff > len \|\| seclen > len \|\|
13777	dof->dofh_secoff + seclen > len) {
13778	dtrace_dof_error(dof, str: "truncated section headers");
13779	return (-`1`);
13780	}
13781
13782	if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
13783	dtrace_dof_error(dof, str: "misaligned section headers");
13784	return (-`1`);
13785	}
13786
13787	if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
13788	dtrace_dof_error(dof, str: "misaligned section size");
13789	return (-`1`);
13790	}
13791
13792	/*
13793	* Take an initial pass through the section headers to be sure that
13794	* the headers don't have stray offsets. If the 'noprobes' flag is
13795	* set, do not permit sections relating to providers, probes, or args.
13796	*/
13797	for (i = `0`; i < dof->dofh_secnum; i++) {
13798	dof_sec_t sec = (dof_sec_t )(daddr +
13799	(uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13800
13801	if (noprobes) {
13802	switch (sec->dofs_type) {
13803	case DOF_SECT_PROVIDER:
13804	case DOF_SECT_PROBES:
13805	case DOF_SECT_PRARGS:
13806	case DOF_SECT_PROFFS:
13807	dtrace_dof_error(dof, str: "illegal sections "
13808	"for enabling");
13809	return (-`1`);
13810	}
13811	}
13812
13813	if (sec->dofs_align & (sec->dofs_align - `1`)) {
13814	dtrace_dof_error(dof, str: "bad section alignment");
13815	return (-`1`);
13816	}
13817
13818	if (sec->dofs_offset & (sec->dofs_align - `1`)) {
13819	dtrace_dof_error(dof, str: "misaligned section");
13820	return (-`1`);
13821	}
13822
13823	if (sec->dofs_flags & DOF_SECF_LOAD) {
13824	len = dof->dofh_loadsz;
13825	} else {
13826	len = dof->dofh_filesz;
13827	}
13828
13829	if (sec->dofs_offset > len \|\| sec->dofs_size > len \|\|
13830	sec->dofs_offset + sec->dofs_size > len) {
13831	dtrace_dof_error(dof, str: "corrupt section header");
13832	return (-`1`);
13833	}
13834
13835	if (sec->dofs_type == DOF_SECT_STRTAB && ((char* *)daddr +
13836	sec->dofs_offset + sec->dofs_size - `1`) != `'\0'`) {
13837	dtrace_dof_error(dof, str: "non-terminating string table");
13838	return (-`1`);
13839	}
13840	}
13841
13842	/*
13843	* APPLE NOTE: We have no further relocation to perform.
13844	* All dof values are relative offsets.
13845	*/
13846
13847	if ((enab = *enabp) == NULL)
13848	enab = *enabp = dtrace_enabling_create(vstate);
13849
13850	for (i = `0`; i < dof->dofh_secnum; i++) {
13851	dof_sec_t sec = (dof_sec_t )(daddr +
13852	(uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13853
13854	if (sec->dofs_type != DOF_SECT_ECBDESC)
13855	continue;
13856
13857	/*
13858	* APPLE NOTE: Defend against gcc 4.0 botch on x86.
13859	* not all paths out of inlined dtrace_dof_ecbdesc
13860	* are checked for the NULL return value.
13861	* Check for NULL explicitly here.
13862	*/
13863	ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr);
13864	if (ep == NULL) {
13865	dtrace_enabling_destroy(enab);
13866	*enabp = NULL;
13867	return (-`1`);
13868	}
13869
13870	dtrace_enabling_add(enab, ecb: ep);
13871	}
13872
13873	return (`0`);
13874	}
13875
13876	/*
13877	* Process DOF for any options. This routine assumes that the DOF has been
13878	* at least processed by dtrace_dof_slurp().
13879	*/
13880	static int
13881	dtrace_dof_options(dof_hdr_t dof, dtrace_state_t state)
13882	{
13883	uint_t i;
13884	int rval;
13885	uint32_t entsize;
13886	size_t offs;
13887	dof_optdesc_t *desc;
13888
13889	for (i = `0`; i < dof->dofh_secnum; i++) {
13890	dof_sec_t sec = (dof_sec_t )((uintptr_t)dof +
13891	(uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13892
13893	if (sec->dofs_type != DOF_SECT_OPTDESC)
13894	continue;
13895
13896	if (sec->dofs_align != sizeof (uint64_t)) {
13897	dtrace_dof_error(dof, str: "bad alignment in "
13898	"option description");
13899	return (EINVAL);
13900	}
13901
13902	if ((entsize = sec->dofs_entsize) == `0`) {
13903	dtrace_dof_error(dof, str: "zeroed option entry size");
13904	return (EINVAL);
13905	}
13906
13907	if (entsize < sizeof (dof_optdesc_t)) {
13908	dtrace_dof_error(dof, str: "bad option entry size");
13909	return (EINVAL);
13910	}
13911
13912	for (offs = `0`; offs < sec->dofs_size; offs += entsize) {
13913	desc = (dof_optdesc_t *)((uintptr_t)dof +
13914	(uintptr_t)sec->dofs_offset + offs);
13915
13916	if (desc->dofo_strtab != DOF_SECIDX_NONE) {
13917	dtrace_dof_error(dof, str: "non-zero option string");
13918	return (EINVAL);
13919	}
13920
13921	if (desc->dofo_value == (uint64_t)DTRACEOPT_UNSET) {
13922	dtrace_dof_error(dof, str: "unset option");
13923	return (EINVAL);
13924	}
13925
13926	if ((rval = dtrace_state_option(state,
13927	desc->dofo_option, desc->dofo_value)) != `0`) {
13928	dtrace_dof_error(dof, str: "rejected option");
13929	return (rval);
13930	}
13931	}
13932	}
13933
13934	return (`0`);
13935	}
13936
13937	/*
13938	* DTrace Consumer State Functions
13939	*/
13940	static int
13941	dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
13942	{
13943	size_t hashsize, maxper, min_size, chunksize = dstate->dtds_chunksize;
13944	void *base;
13945	uintptr_t limit;
13946	dtrace_dynvar_t dvar, next, *start;
13947
13948	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13949	ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
13950
13951	bzero(s: dstate, n: sizeof (dtrace_dstate_t));
13952
13953	if ((dstate->dtds_chunksize = chunksize) == `0`)
13954	dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
13955
13956	VERIFY(dstate->dtds_chunksize < (LONG_MAX - sizeof (dtrace_dynhash_t)));
13957
13958	if (size < (min_size = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
13959	size = min_size;
13960
13961	if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
13962	return (ENOMEM);
13963
13964	dstate->dtds_size = size;
13965	dstate->dtds_base = base;
13966	dstate->dtds_percpu = zalloc_percpu(dtrace_state_pcpu_zone, Z_WAITOK \| Z_ZERO);
13967
13968	hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
13969
13970	if (hashsize != `1` && (hashsize & `1`))
13971	hashsize--;
13972
13973	dstate->dtds_hashsize = hashsize;
13974	dstate->dtds_hash = dstate->dtds_base;
13975
13976	/*
13977	* Set all of our hash buckets to point to the single sink, and (if
13978	* it hasn't already been set), set the sink's hash value to be the
13979	* sink sentinel value. The sink is needed for dynamic variable
13980	* lookups to know that they have iterated over an entire, valid hash
13981	* chain.
13982	*/
13983	for (size_t i = `0`; i < hashsize; i++)
13984	dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
13985
13986	if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
13987	dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
13988
13989	/*
13990	* Determine number of active CPUs. Divide free list evenly among
13991	* active CPUs.
13992	*/
13993	start = (dtrace_dynvar_t *)
13994	((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
13995	limit = (uintptr_t)base + size;
13996
13997	VERIFY((uintptr_t)start < limit);
13998	VERIFY((uintptr_t)start >= (uintptr_t)base);
13999
14000	maxper = (limit - (uintptr_t)start) / (int)NCPU;
14001	maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
14002
14003	zpercpu_foreach_cpu(i) {
14004	dtrace_dstate_percpu_t *dcpu = zpercpu_get_cpu(dstate->dtds_percpu, i);
14005
14006	dcpu->dtdsc_free = dvar = start;
14007
14008	/*
14009	* If we don't even have enough chunks to make it once through
14010	* NCPUs, we're just going to allocate everything to the first
14011	* CPU. And if we're on the last CPU, we're going to allocate
14012	* whatever is left over. In either case, we set the limit to
14013	* be the limit of the dynamic variable space.
14014	*/
14015	if (maxper == `0` \|\| i == NCPU - `1`) {
14016	limit = (uintptr_t)base + size;
14017	start = NULL;
14018	} else {
14019	limit = (uintptr_t)start + maxper;
14020	start = (dtrace_dynvar_t *)limit;
14021	}
14022
14023	VERIFY(limit <= (uintptr_t)base + size);
14024
14025	for (;;) {
14026	next = (dtrace_dynvar_t *)((uintptr_t)dvar +
14027	dstate->dtds_chunksize);
14028
14029	if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
14030	break;
14031
14032	VERIFY((uintptr_t)dvar >= (uintptr_t)base &&
14033	(uintptr_t)dvar <= (uintptr_t)base + size);
14034	dvar->dtdv_next = next;
14035	dvar = next;
14036	}
14037
14038	if (maxper == `0`)
14039	break;
14040	}
14041
14042	return (`0`);
14043	}
14044
14045	static void
14046	dtrace_dstate_fini(dtrace_dstate_t *dstate)
14047	{
14048	LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14049
14050	if (dstate->dtds_base == NULL)
14051	return;
14052
14053	kmem_free(dstate->dtds_base, dstate->dtds_size);
14054	zfree_percpu(zone_or_view: dtrace_state_pcpu_zone, addr: dstate->dtds_percpu);
14055	}
14056
14057	static void
14058	dtrace_vstate_fini(dtrace_vstate_t *vstate)
14059	{
14060	/*
14061	* Logical XOR, where are you?
14062	*/
14063	ASSERT((vstate->dtvs_nglobals == `0`) ^ (vstate->dtvs_globals != NULL));
14064
14065	if (vstate->dtvs_nglobals > `0`) {
14066	kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
14067	sizeof (dtrace_statvar_t *));
14068	}
14069
14070	if (vstate->dtvs_ntlocals > `0`) {
14071	kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
14072	sizeof (dtrace_difv_t));
14073	}
14074
14075	ASSERT((vstate->dtvs_nlocals == `0`) ^ (vstate->dtvs_locals != NULL));
14076
14077	if (vstate->dtvs_nlocals > `0`) {
14078	kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
14079	sizeof (dtrace_statvar_t *));
14080	}
14081	}
14082
14083	static void
14084	dtrace_state_clean(dtrace_state_t *state)
14085	{
14086	if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
14087	return;
14088
14089	dtrace_dynvar_clean(dstate: &state->dts_vstate.dtvs_dynvars);
14090	dtrace_speculation_clean(state);
14091	}
14092
14093	static void
14094	dtrace_state_deadman(dtrace_state_t *state)
14095	{
14096	hrtime_t now;
14097
14098	dtrace_sync();
14099
14100	now = dtrace_gethrtime();
14101
14102	if (state != dtrace_anon.dta_state &&
14103	now - state->dts_laststatus >= dtrace_deadman_user)
14104	return;
14105
14106	/*
14107	* We must be sure that dts_alive never appears to be less than the
14108	* value upon entry to dtrace_state_deadman(), and because we lack a
14109	* dtrace_cas64(), we cannot store to it atomically. We thus instead
14110	* store INT64_MAX to it, followed by a memory barrier, followed by
14111	* the new value. This assures that dts_alive never appears to be
14112	* less than its true value, regardless of the order in which the
14113	* stores to the underlying storage are issued.
14114	*/
14115	state->dts_alive = INT64_MAX;
14116	dtrace_membar_producer();
14117	state->dts_alive = now;
14118	}
14119
14120	static int
14121	dtrace_state_create(dev_t devp, cred_t cr, dtrace_state_t **new_state)
14122	{
14123	minor_t minor;
14124	major_t major;
14125	char c[`30`];
14126	dtrace_state_t *state;
14127	dtrace_optval_t *opt;
14128	int bufsize = (int)NCPU * sizeof (dtrace_buffer_t), i;
14129	unsigned int cpu_it;
14130
14131	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14132	LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14133
14134	/ Cause restart /
14135	*new_state = NULL;
14136
14137	if (devp != NULL) {
14138	minor = getminor(*devp);
14139	}
14140	else {
14141	minor = DTRACE_NCLIENTS - `1`;
14142	}
14143
14144	state = dtrace_state_allocate(minor);
14145	if (NULL == state) {
14146	printf("dtrace_open: couldn't acquire minor number %d. This usually means that too many DTrace clients are in use at the moment", minor);
14147	return (ERESTART); / can't reacquire /
14148	}
14149
14150	state->dts_epid = DTRACE_EPIDNONE + `1`;
14151
14152	(void) snprintf(c, count: sizeof (c), "dtrace_aggid_%d", minor);
14153	state->dts_aggid_arena = vmem_create(c, (void *)`1`, INT32_MAX, `1`,
14154	NULL, NULL, NULL, `0`, VM_SLEEP \| VMC_IDENTIFIER);
14155
14156	if (devp != NULL) {
14157	major = getemajor(*devp);
14158	} else {
14159	major = ddi_driver_major(dtrace_devi);
14160	}
14161
14162	state->dts_dev = makedev(major, minor);
14163
14164	if (devp != NULL)
14165	*devp = state->dts_dev;
14166
14167	/*
14168	* We allocate NCPU buffers. On the one hand, this can be quite
14169	* a bit of memory per instance (nearly 36K on a Starcat). On the
14170	* other hand, it saves an additional memory reference in the probe
14171	* path.
14172	*/
14173	state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
14174	state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
14175	state->dts_buf_over_limit = `0`;
14176
14177	/*
14178	* Allocate and initialise the per-process per-CPU random state.
14179	* SI_SUB_RANDOM < SI_SUB_DTRACE_ANON therefore entropy device is
14180	* assumed to be seeded at this point (if from Fortuna seed file).
14181	*/
14182	state->dts_rstate = kmem_zalloc(NCPU * sizeof(uint64_t*), KM_SLEEP);
14183	state->dts_rstate[`0`] = kmem_zalloc(`2` * sizeof(uint64_t), KM_SLEEP);
14184	(void) read_random(buffer: state->dts_rstate[`0`], numBytes: `2` * sizeof(uint64_t));
14185	for (cpu_it = `1`; cpu_it < NCPU; cpu_it++) {
14186	state->dts_rstate[cpu_it] = kmem_zalloc(`2` * sizeof(uint64_t), KM_SLEEP);
14187	/*
14188	* Each CPU is assigned a 2^64 period, non-overlapping
14189	* subsequence.
14190	*/
14191	dtrace_xoroshiro128_plus_jump(state->dts_rstate[cpu_it-`1`],
14192	state->dts_rstate[cpu_it]);
14193	}
14194
14195	state->dts_cleaner = CYCLIC_NONE;
14196	state->dts_deadman = CYCLIC_NONE;
14197	state->dts_vstate.dtvs_state = state;
14198
14199	for (i = `0`; i < DTRACEOPT_MAX; i++)
14200	state->dts_options[i] = DTRACEOPT_UNSET;
14201
14202	/*
14203	* Set the default options.
14204	*/
14205	opt = state->dts_options;
14206	opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
14207	opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
14208	opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
14209	opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
14210	opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
14211	opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
14212	opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
14213	opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
14214	opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
14215	opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
14216	opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
14217	opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
14218	opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
14219	opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
14220	opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_default;
14221
14222	/*
14223	* Depending on the user credentials, we set flag bits which alter probe
14224	* visibility or the amount of destructiveness allowed. In the case of
14225	* actual anonymous tracing, or the possession of all privileges, all of
14226	* the normal checks are bypassed.
14227	*/
14228	#if defined(__APPLE__)
14229	if (cr != NULL) {
14230	kauth_cred_ref(cred: cr);
14231	state->dts_cred.dcr_cred = cr;
14232	}
14233	if (cr == NULL \|\| PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
14234	if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
14235	/*
14236	* Allow only proc credentials when DTrace is
14237	* restricted by the current security policy
14238	*/
14239	state->dts_cred.dcr_visible = DTRACE_CRV_ALLPROC;
14240	state->dts_cred.dcr_action = DTRACE_CRA_PROC \| DTRACE_CRA_PROC_CONTROL \| DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14241	}
14242	else {
14243	state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
14244	state->dts_cred.dcr_action = DTRACE_CRA_ALL;
14245	}
14246	}
14247
14248	#else
14249	if (cr == NULL \|\| PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
14250	state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
14251	state->dts_cred.dcr_action = DTRACE_CRA_ALL;
14252	}
14253	else {
14254	/*
14255	* Set up the credentials for this instantiation. We take a
14256	* hold on the credential to prevent it from disappearing on
14257	* us; this in turn prevents the zone_t referenced by this
14258	* credential from disappearing. This means that we can
14259	* examine the credential and the zone from probe context.
14260	*/
14261	crhold(cr);
14262	state->dts_cred.dcr_cred = cr;
14263
14264	/*
14265	* CRA_PROC means "we have some privilege for dtrace" and
14266	* unlocks the use of variables like pid, zonename, etc.
14267	*/
14268	if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) \|\|
14269	PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
14270	state->dts_cred.dcr_action \|= DTRACE_CRA_PROC;
14271	}
14272
14273	/*
14274	* dtrace_user allows use of syscall and profile providers.
14275	* If the user also has proc_owner and/or proc_zone, we
14276	* extend the scope to include additional visibility and
14277	* destructive power.
14278	*/
14279	if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
14280	if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
14281	state->dts_cred.dcr_visible \|=
14282	DTRACE_CRV_ALLPROC;
14283
14284	state->dts_cred.dcr_action \|=
14285	DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14286	}
14287
14288	if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
14289	state->dts_cred.dcr_visible \|=
14290	DTRACE_CRV_ALLZONE;
14291
14292	state->dts_cred.dcr_action \|=
14293	DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14294	}
14295
14296	/*
14297	* If we have all privs in whatever zone this is,
14298	* we can do destructive things to processes which
14299	* have altered credentials.
14300	*
14301	* APPLE NOTE: Darwin doesn't do zones.
14302	* Behave as if zone always has destructive privs.
14303	*/
14304
14305	state->dts_cred.dcr_action \|=
14306	DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
14307	}
14308
14309	/*
14310	* Holding the dtrace_kernel privilege also implies that
14311	* the user has the dtrace_user privilege from a visibility
14312	* perspective. But without further privileges, some
14313	* destructive actions are not available.
14314	*/
14315	if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
14316	/*
14317	* Make all probes in all zones visible. However,
14318	* this doesn't mean that all actions become available
14319	* to all zones.
14320	*/
14321	state->dts_cred.dcr_visible \|= DTRACE_CRV_KERNEL \|
14322	DTRACE_CRV_ALLPROC \| DTRACE_CRV_ALLZONE;
14323
14324	state->dts_cred.dcr_action \|= DTRACE_CRA_KERNEL \|
14325	DTRACE_CRA_PROC;
14326	/*
14327	* Holding proc_owner means that destructive actions
14328	* for this zone are allowed.
14329	*/
14330	if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
14331	state->dts_cred.dcr_action \|=
14332	DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14333
14334	/*
14335	* Holding proc_zone means that destructive actions
14336	* for this user/group ID in all zones is allowed.
14337	*/
14338	if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
14339	state->dts_cred.dcr_action \|=
14340	DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14341
14342	/*
14343	* If we have all privs in whatever zone this is,
14344	* we can do destructive things to processes which
14345	* have altered credentials.
14346	*
14347	* APPLE NOTE: Darwin doesn't do zones.
14348	* Behave as if zone always has destructive privs.
14349	*/
14350	state->dts_cred.dcr_action \|=
14351	DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
14352	}
14353
14354	/*
14355	* Holding the dtrace_proc privilege gives control over fasttrap
14356	* and pid providers. We need to grant wider destructive
14357	* privileges in the event that the user has proc_owner and/or
14358	* proc_zone.
14359	*/
14360	if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
14361	if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
14362	state->dts_cred.dcr_action \|=
14363	DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14364
14365	if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
14366	state->dts_cred.dcr_action \|=
14367	DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14368	}
14369	}
14370	#endif
14371
14372	*new_state = state;
14373	return(`0`); / Success /
14374	}
14375
14376	static int
14377	dtrace_state_buffer(dtrace_state_t state, dtrace_buffer_t buf, int which)
14378	{
14379	dtrace_optval_t *opt = state->dts_options, size;
14380	processorid_t cpu = `0`;
14381	size_t limit = buf->dtb_size;
14382	int flags = `0`, rval;
14383
14384	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14385	LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14386	ASSERT(which < DTRACEOPT_MAX);
14387	ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE \|\|
14388	(state == dtrace_anon.dta_state &&
14389	state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
14390
14391	if (opt[which] == DTRACEOPT_UNSET \|\| opt[which] == `0`)
14392	return (`0`);
14393
14394	if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
14395	cpu = opt[DTRACEOPT_CPU];
14396
14397	if (which == DTRACEOPT_SPECSIZE)
14398	flags \|= DTRACEBUF_NOSWITCH;
14399
14400	if (which == DTRACEOPT_BUFSIZE) {
14401	if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
14402	flags \|= DTRACEBUF_RING;
14403
14404	if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
14405	flags \|= DTRACEBUF_FILL;
14406
14407	if (state != dtrace_anon.dta_state \|\|
14408	state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
14409	flags \|= DTRACEBUF_INACTIVE;
14410	}
14411
14412	for (size = opt[which]; (size_t)size >= sizeof (uint64_t); size >>= `1`) {
14413	/*
14414	* The size must be 8-byte aligned. If the size is not 8-byte
14415	* aligned, drop it down by the difference.
14416	*/
14417	if (size & (sizeof (uint64_t) - `1`))
14418	size -= size & (sizeof (uint64_t) - `1`);
14419
14420	if (size < state->dts_reserve) {
14421	/*
14422	* Buffers always must be large enough to accommodate
14423	* their prereserved space. We return E2BIG instead
14424	* of ENOMEM in this case to allow for user-level
14425	* software to differentiate the cases.
14426	*/
14427	return (E2BIG);
14428	}
14429	limit = opt[DTRACEOPT_BUFLIMIT] * size / `100`;
14430	rval = dtrace_buffer_alloc(bufs: buf, limit, size, flags, cpu);
14431
14432	if (rval != ENOMEM) {
14433	opt[which] = size;
14434	return (rval);
14435	}
14436
14437	if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
14438	return (rval);
14439	}
14440
14441	return (ENOMEM);
14442	}
14443
14444	static int
14445	dtrace_state_buffers(dtrace_state_t *state)
14446	{
14447	dtrace_speculation_t *spec = state->dts_speculations;
14448	int rval, i;
14449
14450	if ((rval = dtrace_state_buffer(state, buf: state->dts_buffer,
14451	DTRACEOPT_BUFSIZE)) != `0`)
14452	return (rval);
14453
14454	if ((rval = dtrace_state_buffer(state, buf: state->dts_aggbuffer,
14455	DTRACEOPT_AGGSIZE)) != `0`)
14456	return (rval);
14457
14458	for (i = `0`; i < state->dts_nspeculations; i++) {
14459	if ((rval = dtrace_state_buffer(state,
14460	buf: spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != `0`)
14461	return (rval);
14462	}
14463
14464	return (`0`);
14465	}
14466
14467	static void
14468	dtrace_state_prereserve(dtrace_state_t *state)
14469	{
14470	dtrace_ecb_t *ecb;
14471	dtrace_probe_t *probe;
14472
14473	state->dts_reserve = `0`;
14474
14475	if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
14476	return;
14477
14478	/*
14479	* If our buffer policy is a "fill" buffer policy, we need to set the
14480	* prereserved space to be the space required by the END probes.
14481	*/
14482	probe = dtrace_probes[dtrace_probeid_end - `1`];
14483	ASSERT(probe != NULL);
14484
14485	for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
14486	if (ecb->dte_state != state)
14487	continue;
14488
14489	state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
14490	}
14491	}
14492
14493	static int
14494	dtrace_state_go(dtrace_state_t state, processorid_t cpu)
14495	{
14496	dtrace_optval_t *opt = state->dts_options, sz, nspec;
14497	dtrace_speculation_t *spec;
14498	dtrace_buffer_t *buf;
14499	cyc_handler_t hdlr;
14500	cyc_time_t when;
14501	int rval = `0`, i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
14502	dtrace_icookie_t cookie;
14503
14504	lck_mtx_lock(lck: &cpu_lock);
14505	lck_mtx_lock(lck: &dtrace_lock);
14506
14507	if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
14508	rval = EBUSY;
14509	goto out;
14510	}
14511
14512	/*
14513	* Before we can perform any checks, we must prime all of the
14514	* retained enablings that correspond to this state.
14515	*/
14516	dtrace_enabling_prime(state);
14517
14518	if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
14519	rval = EACCES;
14520	goto out;
14521	}
14522
14523	dtrace_state_prereserve(state);
14524
14525	/*
14526	* Now we want to do is try to allocate our speculations.
14527	* We do not automatically resize the number of speculations; if
14528	* this fails, we will fail the operation.
14529	*/
14530	nspec = opt[DTRACEOPT_NSPEC];
14531	ASSERT(nspec != DTRACEOPT_UNSET);
14532
14533	if (nspec > INT_MAX) {
14534	rval = ENOMEM;
14535	goto out;
14536	}
14537
14538	spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), KM_NOSLEEP);
14539
14540	if (spec == NULL) {
14541	rval = ENOMEM;
14542	goto out;
14543	}
14544
14545	state->dts_speculations = spec;
14546	state->dts_nspeculations = (int)nspec;
14547
14548	for (i = `0`; i < nspec; i++) {
14549	if ((buf = kmem_zalloc(bufsize, KM_NOSLEEP)) == NULL) {
14550	rval = ENOMEM;
14551	goto err;
14552	}
14553
14554	spec[i].dtsp_buffer = buf;
14555	}
14556
14557	if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
14558	if (dtrace_anon.dta_state == NULL) {
14559	rval = ENOENT;
14560	goto out;
14561	}
14562
14563	if (state->dts_necbs != `0`) {
14564	rval = EALREADY;
14565	goto out;
14566	}
14567
14568	state->dts_anon = dtrace_anon_grab();
14569	ASSERT(state->dts_anon != NULL);
14570	state = state->dts_anon;
14571
14572	/*
14573	* We want "grabanon" to be set in the grabbed state, so we'll
14574	* copy that option value from the grabbing state into the
14575	* grabbed state.
14576	*/
14577	state->dts_options[DTRACEOPT_GRABANON] =
14578	opt[DTRACEOPT_GRABANON];
14579
14580	*cpu = dtrace_anon.dta_beganon;
14581
14582	/*
14583	* If the anonymous state is active (as it almost certainly
14584	* is if the anonymous enabling ultimately matched anything),
14585	* we don't allow any further option processing -- but we
14586	* don't return failure.
14587	*/
14588	if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14589	goto out;
14590	}
14591
14592	if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
14593	opt[DTRACEOPT_AGGSIZE] != `0`) {
14594	if (state->dts_aggregations == NULL) {
14595	/*
14596	* We're not going to create an aggregation buffer
14597	* because we don't have any ECBs that contain
14598	* aggregations -- set this option to 0.
14599	*/
14600	opt[DTRACEOPT_AGGSIZE] = `0`;
14601	} else {
14602	/*
14603	* If we have an aggregation buffer, we must also have
14604	* a buffer to use as scratch.
14605	*/
14606	if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET \|\|
14607	(size_t)opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
14608	opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
14609	}
14610	}
14611	}
14612
14613	if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
14614	opt[DTRACEOPT_SPECSIZE] != `0`) {
14615	if (!state->dts_speculates) {
14616	/*
14617	* We're not going to create speculation buffers
14618	* because we don't have any ECBs that actually
14619	* speculate -- set the speculation size to 0.
14620	*/
14621	opt[DTRACEOPT_SPECSIZE] = `0`;
14622	}
14623	}
14624
14625	/*
14626	* The bare minimum size for any buffer that we're actually going to
14627	* do anything to is sizeof (uint64_t).
14628	*/
14629	sz = sizeof (uint64_t);
14630
14631	if ((state->dts_needed != `0` && opt[DTRACEOPT_BUFSIZE] < sz) \|\|
14632	(state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) \|\|
14633	(state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
14634	/*
14635	* A buffer size has been explicitly set to 0 (or to a size
14636	* that will be adjusted to 0) and we need the space -- we
14637	* need to return failure. We return ENOSPC to differentiate
14638	* it from failing to allocate a buffer due to failure to meet
14639	* the reserve (for which we return E2BIG).
14640	*/
14641	rval = ENOSPC;
14642	goto out;
14643	}
14644
14645	if ((rval = dtrace_state_buffers(state)) != `0`)
14646	goto err;
14647
14648	if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
14649	sz = dtrace_dstate_defsize;
14650
14651	do {
14652	rval = dtrace_dstate_init(dstate: &state->dts_vstate.dtvs_dynvars, size: sz);
14653
14654	if (rval == `0`)
14655	break;
14656
14657	if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
14658	goto err;
14659	} while (sz >>= `1`);
14660
14661	opt[DTRACEOPT_DYNVARSIZE] = sz;
14662
14663	if (rval != `0`)
14664	goto err;
14665
14666	if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
14667	opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
14668
14669	if (opt[DTRACEOPT_CLEANRATE] == `0`)
14670	opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
14671
14672	if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
14673	opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
14674
14675	if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
14676	opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
14677
14678	if (opt[DTRACEOPT_STRSIZE] > dtrace_strsize_max)
14679	opt[DTRACEOPT_STRSIZE] = dtrace_strsize_max;
14680
14681	if (opt[DTRACEOPT_STRSIZE] < dtrace_strsize_min)
14682	opt[DTRACEOPT_STRSIZE] = dtrace_strsize_min;
14683
14684	if (opt[DTRACEOPT_BUFLIMIT] > dtrace_buflimit_max)
14685	opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_max;
14686
14687	if (opt[DTRACEOPT_BUFLIMIT] < dtrace_buflimit_min)
14688	opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_min;
14689
14690	hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
14691	hdlr.cyh_arg = state;
14692	hdlr.cyh_level = CY_LOW_LEVEL;
14693
14694	when.cyt_when = `0`;
14695	when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
14696
14697	state->dts_cleaner = cyclic_add(&hdlr, &when);
14698
14699	hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
14700	hdlr.cyh_arg = state;
14701	hdlr.cyh_level = CY_LOW_LEVEL;
14702
14703	when.cyt_when = `0`;
14704	when.cyt_interval = dtrace_deadman_interval;
14705
14706	state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
14707	state->dts_deadman = cyclic_add(&hdlr, &when);
14708
14709	state->dts_activity = DTRACE_ACTIVITY_WARMUP;
14710
14711	/*
14712	* Now it's time to actually fire the BEGIN probe. We need to disable
14713	* interrupts here both to record the CPU on which we fired the BEGIN
14714	* probe (the data from this CPU will be processed first at user
14715	* level) and to manually activate the buffer for this CPU.
14716	*/
14717	cookie = dtrace_interrupt_disable();
14718	*cpu = CPU->cpu_id;
14719	ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
14720	state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
14721
14722	dtrace_probe(id: dtrace_probeid_begin,
14723	arg0: (uint64_t)(uintptr_t)state, arg1: `0`, arg2: `0`, arg3: `0`, arg4: `0`);
14724	dtrace_interrupt_enable(cookie);
14725	/*
14726	* We may have had an exit action from a BEGIN probe; only change our
14727	* state to ACTIVE if we're still in WARMUP.
14728	*/
14729	ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP \|\|
14730	state->dts_activity == DTRACE_ACTIVITY_DRAINING);
14731
14732	if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
14733	state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
14734
14735	/*
14736	* Regardless of whether or not now we're in ACTIVE or DRAINING, we
14737	* want each CPU to transition its principal buffer out of the
14738	* INACTIVE state. Doing this assures that no CPU will suddenly begin
14739	* processing an ECB halfway down a probe's ECB chain; all CPUs will
14740	* atomically transition from processing none of a state's ECBs to
14741	* processing all of them.
14742	*/
14743	dtrace_xcall(DTRACE_CPUALL,
14744	(dtrace_xcall_t)dtrace_buffer_activate, state);
14745	goto out;
14746
14747	err:
14748	dtrace_buffer_free(bufs: state->dts_buffer);
14749	dtrace_buffer_free(bufs: state->dts_aggbuffer);
14750
14751	if ((nspec = state->dts_nspeculations) == `0`) {
14752	ASSERT(state->dts_speculations == NULL);
14753	goto out;
14754	}
14755
14756	spec = state->dts_speculations;
14757	ASSERT(spec != NULL);
14758
14759	for (i = `0`; i < state->dts_nspeculations; i++) {
14760	if ((buf = spec[i].dtsp_buffer) == NULL)
14761	break;
14762
14763	dtrace_buffer_free(bufs: buf);
14764	kmem_free(buf, bufsize);
14765	}
14766
14767	kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
14768	state->dts_nspeculations = `0`;
14769	state->dts_speculations = NULL;
14770
14771	out:
14772	lck_mtx_unlock(lck: &dtrace_lock);
14773	lck_mtx_unlock(lck: &cpu_lock);
14774
14775	return (rval);
14776	}
14777
14778	static int
14779	dtrace_state_stop(dtrace_state_t state, processorid_t cpu)
14780	{
14781	dtrace_icookie_t cookie;
14782
14783	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14784
14785	if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
14786	state->dts_activity != DTRACE_ACTIVITY_DRAINING)
14787	return (EINVAL);
14788
14789	/*
14790	* We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
14791	* to be sure that every CPU has seen it. See below for the details
14792	* on why this is done.
14793	*/
14794	state->dts_activity = DTRACE_ACTIVITY_DRAINING;
14795	dtrace_sync();
14796
14797	/*
14798	* By this point, it is impossible for any CPU to be still processing
14799	* with DTRACE_ACTIVITY_ACTIVE. We can thus set our activity to
14800	* DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
14801	* other CPU in dtrace_buffer_reserve(). This allows dtrace_probe()
14802	* and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
14803	* iff we're in the END probe.
14804	*/
14805	state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
14806	dtrace_sync();
14807	ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
14808
14809	/*
14810	* Finally, we can release the reserve and call the END probe. We
14811	* disable interrupts across calling the END probe to allow us to
14812	* return the CPU on which we actually called the END probe. This
14813	* allows user-land to be sure that this CPU's principal buffer is
14814	* processed last.
14815	*/
14816	state->dts_reserve = `0`;
14817
14818	cookie = dtrace_interrupt_disable();
14819	*cpu = CPU->cpu_id;
14820	dtrace_probe(id: dtrace_probeid_end,
14821	arg0: (uint64_t)(uintptr_t)state, arg1: `0`, arg2: `0`, arg3: `0`, arg4: `0`);
14822	dtrace_interrupt_enable(cookie);
14823
14824	state->dts_activity = DTRACE_ACTIVITY_STOPPED;
14825	dtrace_sync();
14826
14827	return (`0`);
14828	}
14829
14830	static int
14831	dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
14832	dtrace_optval_t val)
14833	{
14834	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14835
14836	if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14837	return (EBUSY);
14838
14839	if (option >= DTRACEOPT_MAX)
14840	return (EINVAL);
14841
14842	if (option != DTRACEOPT_CPU && val < `0`)
14843	return (EINVAL);
14844
14845	switch (option) {
14846	case DTRACEOPT_DESTRUCTIVE:
14847	if (dtrace_destructive_disallow)
14848	return (EACCES);
14849
14850	state->dts_cred.dcr_destructive = `1`;
14851	break;
14852
14853	case DTRACEOPT_BUFSIZE:
14854	case DTRACEOPT_DYNVARSIZE:
14855	case DTRACEOPT_AGGSIZE:
14856	case DTRACEOPT_SPECSIZE:
14857	case DTRACEOPT_STRSIZE:
14858	if (val < `0`)
14859	return (EINVAL);
14860
14861	if (val >= LONG_MAX) {
14862	/*
14863	* If this is an otherwise negative value, set it to
14864	* the highest multiple of 128m less than LONG_MAX.
14865	* Technically, we're adjusting the size without
14866	* regard to the buffer resizing policy, but in fact,
14867	* this has no effect -- if we set the buffer size to
14868	* ~LONG_MAX and the buffer policy is ultimately set to
14869	* be "manual", the buffer allocation is guaranteed to
14870	* fail, if only because the allocation requires two
14871	* buffers. (We set the the size to the highest
14872	* multiple of 128m because it ensures that the size
14873	* will remain a multiple of a megabyte when
14874	* repeatedly halved -- all the way down to 15m.)
14875	*/
14876	val = LONG_MAX - (`1` << `27`) + `1`;
14877	}
14878	}
14879
14880	state->dts_options[option] = val;
14881
14882	return (`0`);
14883	}
14884
14885	static void
14886	dtrace_state_destroy(dtrace_state_t *state)
14887	{
14888	dtrace_ecb_t *ecb;
14889	dtrace_vstate_t *vstate = &state->dts_vstate;
14890	minor_t minor = getminor(state->dts_dev);
14891	int i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
14892	dtrace_speculation_t *spec = state->dts_speculations;
14893	int nspec = state->dts_nspeculations;
14894	uint32_t match;
14895
14896	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14897	LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14898
14899	/*
14900	* First, retract any retained enablings for this state.
14901	*/
14902	dtrace_enabling_retract(state);
14903	ASSERT(state->dts_nretained == `0`);
14904
14905	if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE \|\|
14906	state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
14907	/*
14908	* We have managed to come into dtrace_state_destroy() on a
14909	* hot enabling -- almost certainly because of a disorderly
14910	* shutdown of a consumer. (That is, a consumer that is
14911	* exiting without having called dtrace_stop().) In this case,
14912	* we're going to set our activity to be KILLED, and then
14913	* issue a sync to be sure that everyone is out of probe
14914	* context before we start blowing away ECBs.
14915	*/
14916	state->dts_activity = DTRACE_ACTIVITY_KILLED;
14917	dtrace_sync();
14918	}
14919
14920	/*
14921	* Release the credential hold we took in dtrace_state_create().
14922	*/
14923	if (state->dts_cred.dcr_cred != NULL)
14924	kauth_cred_unref(&state->dts_cred.dcr_cred);
14925
14926	/*
14927	* Now we can safely disable and destroy any enabled probes. Because
14928	* any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
14929	* (especially if they're all enabled), we take two passes through the
14930	* ECBs: in the first, we disable just DTRACE_PRIV_KERNEL probes, and
14931	* in the second we disable whatever is left over.
14932	*/
14933	for (match = DTRACE_PRIV_KERNEL; ; match = `0`) {
14934	for (i = `0`; i < state->dts_necbs; i++) {
14935	if ((ecb = state->dts_ecbs[i]) == NULL)
14936	continue;
14937
14938	if (match && ecb->dte_probe != NULL) {
14939	dtrace_probe_t *probe = ecb->dte_probe;
14940	dtrace_provider_t *prov = probe->dtpr_provider;
14941
14942	if (!(prov->dtpv_priv.dtpp_flags & match))
14943	continue;
14944	}
14945
14946	dtrace_ecb_disable(ecb);
14947	dtrace_ecb_destroy(ecb);
14948	}
14949
14950	if (!match)
14951	break;
14952	}
14953
14954	/*
14955	* Before we free the buffers, perform one more sync to assure that
14956	* every CPU is out of probe context.
14957	*/
14958	dtrace_sync();
14959
14960	dtrace_buffer_free(bufs: state->dts_buffer);
14961	dtrace_buffer_free(bufs: state->dts_aggbuffer);
14962
14963	for (i = `0`; i < (int)NCPU; i++) {
14964	kmem_free(state->dts_rstate[i], `2` * sizeof(uint64_t));
14965	}
14966	kmem_free(state->dts_rstate, NCPU * sizeof(uint64_t*));
14967
14968	for (i = `0`; i < nspec; i++)
14969	dtrace_buffer_free(bufs: spec[i].dtsp_buffer);
14970
14971	if (state->dts_cleaner != CYCLIC_NONE)
14972	cyclic_remove(state->dts_cleaner);
14973
14974	if (state->dts_deadman != CYCLIC_NONE)
14975	cyclic_remove(state->dts_deadman);
14976
14977	dtrace_dstate_fini(dstate: &vstate->dtvs_dynvars);
14978	dtrace_vstate_fini(vstate);
14979	kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
14980
14981	if (state->dts_aggregations != NULL) {
14982	#if DEBUG
14983	for (i = `0`; i < state->dts_naggregations; i++)
14984	ASSERT(state->dts_aggregations[i] == NULL);
14985	#endif
14986	ASSERT(state->dts_naggregations > `0`);
14987	kmem_free(state->dts_aggregations,
14988	state->dts_naggregations * sizeof (dtrace_aggregation_t *));
14989	}
14990
14991	kmem_free(state->dts_buffer, bufsize);
14992	kmem_free(state->dts_aggbuffer, bufsize);
14993
14994	for (i = `0`; i < nspec; i++)
14995	kmem_free(spec[i].dtsp_buffer, bufsize);
14996
14997	kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
14998
14999	dtrace_format_destroy(state);
15000
15001	vmem_destroy(state->dts_aggid_arena);
15002	dtrace_state_free(minor);
15003	}
15004
15005	/*
15006	* DTrace Anonymous Enabling Functions
15007	*/
15008
15009	int
15010	dtrace_keep_kernel_symbols(void)
15011	{
15012	if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
15013	return `0`;
15014	}
15015
15016	if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL)
15017	return `1`;
15018
15019	return `0`;
15020	}
15021
15022	static dtrace_state_t *
15023	dtrace_anon_grab(void)
15024	{
15025	dtrace_state_t *state;
15026
15027	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15028
15029	if ((state = dtrace_anon.dta_state) == NULL) {
15030	ASSERT(dtrace_anon.dta_enabling == NULL);
15031	return (NULL);
15032	}
15033
15034	ASSERT(dtrace_anon.dta_enabling != NULL);
15035	ASSERT(dtrace_retained != NULL);
15036
15037	dtrace_enabling_destroy(enab: dtrace_anon.dta_enabling);
15038	dtrace_anon.dta_enabling = NULL;
15039	dtrace_anon.dta_state = NULL;
15040
15041	return (state);
15042	}
15043
15044	static void
15045	dtrace_anon_property(void)
15046	{
15047	int i, rv;
15048	dtrace_state_t *state;
15049	dof_hdr_t *dof;
15050	char c[`32`]; / enough for "dof-data-" + digits /
15051
15052	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15053	LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
15054
15055	for (i = `0`; ; i++) {
15056	(void) snprintf(c, count: sizeof (c), "dof-data-%d", i);
15057
15058	dtrace_err_verbose = `1`;
15059
15060	if ((dof = dtrace_dof_property(name: c)) == NULL) {
15061	dtrace_err_verbose = `0`;
15062	break;
15063	}
15064
15065	#ifdef illumos
15066	/*
15067	* We want to create anonymous state, so we need to transition
15068	* the kernel debugger to indicate that DTrace is active. If
15069	* this fails (e.g. because the debugger has modified text in
15070	* some way), we won't continue with the processing.
15071	*/
15072	if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != `0`) {
15073	cmn_err(CE_NOTE, "kernel debugger active; anonymous "
15074	"enabling ignored.");
15075	dtrace_dof_destroy(dof);
15076	break;
15077	}
15078	#endif
15079
15080	/*
15081	* If we haven't allocated an anonymous state, we'll do so now.
15082	*/
15083	if ((state = dtrace_anon.dta_state) == NULL) {
15084	rv = dtrace_state_create(NULL, NULL, new_state: &state);
15085	dtrace_anon.dta_state = state;
15086	if (rv != `0` \|\| state == NULL) {
15087	/*
15088	* This basically shouldn't happen: the only
15089	* failure mode from dtrace_state_create() is a
15090	* failure of ddi_soft_state_zalloc() that
15091	* itself should never happen. Still, the
15092	* interface allows for a failure mode, and
15093	* we want to fail as gracefully as possible:
15094	* we'll emit an error message and cease
15095	* processing anonymous state in this case.
15096	*/
15097	cmn_err(CE_WARN, "failed to create "
15098	"anonymous state");
15099	dtrace_dof_destroy(dof);
15100	break;
15101	}
15102	}
15103
15104	rv = dtrace_dof_slurp(dof, vstate: &state->dts_vstate, CRED(),
15105	enabp: &dtrace_anon.dta_enabling, ubase: `0`, noprobes: B_TRUE);
15106
15107	if (rv == `0`)
15108	rv = dtrace_dof_options(dof, state);
15109
15110	dtrace_err_verbose = `0`;
15111	dtrace_dof_destroy(dof);
15112
15113	if (rv != `0`) {
15114	/*
15115	* This is malformed DOF; chuck any anonymous state
15116	* that we created.
15117	*/
15118	ASSERT(dtrace_anon.dta_enabling == NULL);
15119	dtrace_state_destroy(state);
15120	dtrace_anon.dta_state = NULL;
15121	break;
15122	}
15123
15124	ASSERT(dtrace_anon.dta_enabling != NULL);
15125	}
15126
15127	if (dtrace_anon.dta_enabling != NULL) {
15128	int rval;
15129
15130	/*
15131	* dtrace_enabling_retain() can only fail because we are
15132	* trying to retain more enablings than are allowed -- but
15133	* we only have one anonymous enabling, and we are guaranteed
15134	* to be allowed at least one retained enabling; we assert
15135	* that dtrace_enabling_retain() returns success.
15136	*/
15137	rval = dtrace_enabling_retain(enab: dtrace_anon.dta_enabling);
15138	ASSERT(rval == `0`);
15139
15140	dtrace_enabling_dump(enab: dtrace_anon.dta_enabling);
15141	}
15142	}
15143
15144	/*
15145	* DTrace Helper Functions
15146	*/
15147	static void
15148	dtrace_helper_trace(dtrace_helper_action_t *helper,
15149	dtrace_mstate_t mstate, dtrace_vstate_t vstate, int where)
15150	{
15151	uint32_t size, next, nnext;
15152	int i;
15153	dtrace_helptrace_t *ent;
15154	uint16_t flags = cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
15155
15156	if (!dtrace_helptrace_enabled)
15157	return;
15158
15159	ASSERT((uint32_t)vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
15160
15161	/*
15162	* What would a tracing framework be without its own tracing
15163	* framework? (Well, a hell of a lot simpler, for starters...)
15164	*/
15165	size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
15166	sizeof (uint64_t) - sizeof (uint64_t);
15167
15168	/*
15169	* Iterate until we can allocate a slot in the trace buffer.
15170	*/
15171	do {
15172	next = dtrace_helptrace_next;
15173
15174	if (next + size < dtrace_helptrace_bufsize) {
15175	nnext = next + size;
15176	} else {
15177	nnext = size;
15178	}
15179	} while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
15180
15181	/*
15182	* We have our slot; fill it in.
15183	*/
15184	if (nnext == size)
15185	next = `0`;
15186
15187	ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
15188	ent->dtht_helper = helper;
15189	ent->dtht_where = where;
15190	ent->dtht_nlocals = vstate->dtvs_nlocals;
15191
15192	ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
15193	mstate->dtms_fltoffs : -`1`;
15194	ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
15195	ent->dtht_illval = cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
15196
15197	for (i = `0`; i < vstate->dtvs_nlocals; i++) {
15198	dtrace_statvar_t *svar;
15199
15200	if ((svar = vstate->dtvs_locals[i]) == NULL)
15201	continue;
15202
15203	ASSERT(svar->dtsv_size >= (int)NCPU * sizeof (uint64_t));
15204	ent->dtht_locals[i] =
15205	((uint64_t *)(uintptr_t)svar->dtsv_data)[CPU->cpu_id];
15206	}
15207	}
15208
15209	__attribute__((noinline))
15210	static uint64_t
15211	dtrace_helper(int which, dtrace_mstate_t *mstate,
15212	dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
15213	{
15214	uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
15215	uint64_t sarg0 = mstate->dtms_arg[`0`];
15216	uint64_t sarg1 = mstate->dtms_arg[`1`];
15217	uint64_t rval = `0`;
15218	dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
15219	dtrace_helper_action_t *helper;
15220	dtrace_vstate_t *vstate;
15221	dtrace_difo_t *pred;
15222	int i, trace = dtrace_helptrace_enabled;
15223
15224	ASSERT(which >= `0` && which < DTRACE_NHELPER_ACTIONS);
15225
15226	if (helpers == NULL)
15227	return (`0`);
15228
15229	if ((helper = helpers->dthps_actions[which]) == NULL)
15230	return (`0`);
15231
15232	vstate = &helpers->dthps_vstate;
15233	mstate->dtms_arg[`0`] = arg0;
15234	mstate->dtms_arg[`1`] = arg1;
15235
15236	/*
15237	* Now iterate over each helper. If its predicate evaluates to 'true',
15238	* we'll call the corresponding actions. Note that the below calls
15239	* to dtrace_dif_emulate() may set faults in machine state. This is
15240	* okay: our caller (the outer dtrace_dif_emulate()) will simply plow
15241	* the stored DIF offset with its own (which is the desired behavior).
15242	* Also, note the calls to dtrace_dif_emulate() may allocate scratch
15243	* from machine state; this is okay, too.
15244	*/
15245	for (; helper != NULL; helper = helper->dtha_next) {
15246	if ((pred = helper->dtha_predicate) != NULL) {
15247	if (trace)
15248	dtrace_helper_trace(helper, mstate, vstate, where: `0`);
15249
15250	if (!dtrace_dif_emulate(difo: pred, mstate, vstate, state))
15251	goto next;
15252
15253	if (*flags & CPU_DTRACE_FAULT)
15254	goto err;
15255	}
15256
15257	for (i = `0`; i < helper->dtha_nactions; i++) {
15258	if (trace)
15259	dtrace_helper_trace(helper,
15260	mstate, vstate, where: i + `1`);
15261
15262	rval = dtrace_dif_emulate(difo: helper->dtha_actions[i],
15263	mstate, vstate, state);
15264
15265	if (*flags & CPU_DTRACE_FAULT)
15266	goto err;
15267	}
15268
15269	next:
15270	if (trace)
15271	dtrace_helper_trace(helper, mstate, vstate,
15272	DTRACE_HELPTRACE_NEXT);
15273	}
15274
15275	if (trace)
15276	dtrace_helper_trace(helper, mstate, vstate,
15277	DTRACE_HELPTRACE_DONE);
15278
15279	/*
15280	* Restore the arg0 that we saved upon entry.
15281	*/
15282	mstate->dtms_arg[`0`] = sarg0;
15283	mstate->dtms_arg[`1`] = sarg1;
15284
15285	return (rval);
15286
15287	err:
15288	if (trace)
15289	dtrace_helper_trace(helper, mstate, vstate,
15290	DTRACE_HELPTRACE_ERR);
15291
15292	/*
15293	* Restore the arg0 that we saved upon entry.
15294	*/
15295	mstate->dtms_arg[`0`] = sarg0;
15296	mstate->dtms_arg[`1`] = sarg1;
15297
15298	return (`0`);
15299	}
15300
15301	static void
15302	dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
15303	dtrace_vstate_t *vstate)
15304	{
15305	int i;
15306
15307	if (helper->dtha_predicate != NULL)
15308	dtrace_difo_release(dp: helper->dtha_predicate, vstate);
15309
15310	for (i = `0`; i < helper->dtha_nactions; i++) {
15311	ASSERT(helper->dtha_actions[i] != NULL);
15312	dtrace_difo_release(dp: helper->dtha_actions[i], vstate);
15313	}
15314
15315	kmem_free(helper->dtha_actions,
15316	helper->dtha_nactions * sizeof (dtrace_difo_t *));
15317	kmem_free(helper, sizeof (dtrace_helper_action_t));
15318	}
15319
15320	static int
15321	dtrace_helper_destroygen(proc_t* p, int gen)
15322	{
15323	dtrace_helpers_t *help = p->p_dtrace_helpers;
15324	dtrace_vstate_t *vstate;
15325	uint_t i;
15326
15327	LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
15328	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15329
15330	if (help == NULL \|\| gen > help->dthps_generation)
15331	return (EINVAL);
15332
15333	vstate = &help->dthps_vstate;
15334
15335	for (i = `0`; i < DTRACE_NHELPER_ACTIONS; i++) {
15336	dtrace_helper_action_t last = NULL, h, *next;
15337
15338	for (h = help->dthps_actions[i]; h != NULL; h = next) {
15339	next = h->dtha_next;
15340
15341	if (h->dtha_generation == gen) {
15342	if (last != NULL) {
15343	last->dtha_next = next;
15344	} else {
15345	help->dthps_actions[i] = next;
15346	}
15347
15348	dtrace_helper_action_destroy(helper: h, vstate);
15349	} else {
15350	last = h;
15351	}
15352	}
15353	}
15354
15355	/*
15356	* Interate until we've cleared out all helper providers with the
15357	* given generation number.
15358	*/
15359	for (;;) {
15360	dtrace_helper_provider_t *prov = NULL;
15361
15362	/*
15363	* Look for a helper provider with the right generation. We
15364	* have to start back at the beginning of the list each time
15365	* because we drop dtrace_lock. It's unlikely that we'll make
15366	* more than two passes.
15367	*/
15368	for (i = `0`; i < help->dthps_nprovs; i++) {
15369	prov = help->dthps_provs[i];
15370
15371	if (prov->dthp_generation == gen)
15372	break;
15373	}
15374
15375	/*
15376	* If there were no matches, we're done.
15377	*/
15378	if (i == help->dthps_nprovs)
15379	break;
15380
15381	/*
15382	* Move the last helper provider into this slot.
15383	*/
15384	help->dthps_nprovs--;
15385	help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
15386	help->dthps_provs[help->dthps_nprovs] = NULL;
15387
15388	lck_mtx_unlock(lck: &dtrace_lock);
15389
15390	/*
15391	* If we have a meta provider, remove this helper provider.
15392	*/
15393	if (dtrace_meta_pid != NULL) {
15394	ASSERT(dtrace_deferred_pid == NULL);
15395	dtrace_helper_provider_remove(dhp: &prov->dthp_prov,
15396	p);
15397	}
15398
15399	dtrace_helper_provider_destroy(prov);
15400
15401	lck_mtx_lock(lck: &dtrace_lock);
15402	}
15403
15404	return (`0`);
15405	}
15406
15407	static int
15408	dtrace_helper_validate(dtrace_helper_action_t *helper)
15409	{
15410	int err = `0`, i;
15411	dtrace_difo_t *dp;
15412
15413	if ((dp = helper->dtha_predicate) != NULL)
15414	err += dtrace_difo_validate_helper(dp);
15415
15416	for (i = `0`; i < helper->dtha_nactions; i++)
15417	err += dtrace_difo_validate_helper(dp: helper->dtha_actions[i]);
15418
15419	return (err == `0`);
15420	}
15421
15422	static int
15423	dtrace_helper_action_add(proc_t* p, int which, dtrace_ecbdesc_t *ep)
15424	{
15425	dtrace_helpers_t *help;
15426	dtrace_helper_action_t helper, last;
15427	dtrace_actdesc_t *act;
15428	dtrace_vstate_t *vstate;
15429	dtrace_predicate_t *pred;
15430	int count = `0`, nactions = `0`, i;
15431
15432	if (which < `0` \|\| which >= DTRACE_NHELPER_ACTIONS)
15433	return (EINVAL);
15434
15435	help = p->p_dtrace_helpers;
15436	last = help->dthps_actions[which];
15437	vstate = &help->dthps_vstate;
15438
15439	for (count = `0`; last != NULL; last = last->dtha_next) {
15440	count++;
15441	if (last->dtha_next == NULL)
15442	break;
15443	}
15444
15445	/*
15446	* If we already have dtrace_helper_actions_max helper actions for this
15447	* helper action type, we'll refuse to add a new one.
15448	*/
15449	if (count >= dtrace_helper_actions_max)
15450	return (ENOSPC);
15451
15452	helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
15453	helper->dtha_generation = help->dthps_generation;
15454
15455	if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
15456	ASSERT(pred->dtp_difo != NULL);
15457	dtrace_difo_hold(dp: pred->dtp_difo);
15458	helper->dtha_predicate = pred->dtp_difo;
15459	}
15460
15461	for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
15462	if (act->dtad_kind != DTRACEACT_DIFEXPR)
15463	goto err;
15464
15465	if (act->dtad_difo == NULL)
15466	goto err;
15467
15468	nactions++;
15469	}
15470
15471	helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t )
15472	(helper->dtha_nactions = nactions), KM_SLEEP);
15473
15474	for (act = ep->dted_action, i = `0`; act != NULL; act = act->dtad_next) {
15475	dtrace_difo_hold(dp: act->dtad_difo);
15476	helper->dtha_actions[i++] = act->dtad_difo;
15477	}
15478
15479	if (!dtrace_helper_validate(helper))
15480	goto err;
15481
15482	if (last == NULL) {
15483	help->dthps_actions[which] = helper;
15484	} else {
15485	last->dtha_next = helper;
15486	}
15487
15488	if ((uint32_t)vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
15489	dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
15490	dtrace_helptrace_next = `0`;
15491	}
15492
15493	return (`0`);
15494	err:
15495	dtrace_helper_action_destroy(helper, vstate);
15496	return (EINVAL);
15497	}
15498
15499	static void
15500	dtrace_helper_provider_register(proc_t p, dtrace_helpers_t help,
15501	dof_helper_t *dofhp)
15502	{
15503	LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
15504	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
15505
15506	lck_mtx_lock(lck: &dtrace_lock);
15507
15508	if (!dtrace_attached() \|\| dtrace_meta_pid == NULL) {
15509	/*
15510	* If the dtrace module is loaded but not attached, or if
15511	* there aren't isn't a meta provider registered to deal with
15512	* these provider descriptions, we need to postpone creating
15513	* the actual providers until later.
15514	*/
15515
15516	if (help->dthps_next == NULL && help->dthps_prev == NULL &&
15517	dtrace_deferred_pid != help) {
15518	help->dthps_deferred = `1`;
15519	help->dthps_pid = proc_getpid(p);
15520	help->dthps_next = dtrace_deferred_pid;
15521	help->dthps_prev = NULL;
15522	if (dtrace_deferred_pid != NULL)
15523	dtrace_deferred_pid->dthps_prev = help;
15524	dtrace_deferred_pid = help;
15525	}
15526
15527	lck_mtx_unlock(lck: &dtrace_lock);
15528
15529	} else if (dofhp != NULL) {
15530	/*
15531	* If the dtrace module is loaded and we have a particular
15532	* helper provider description, pass that off to the
15533	* meta provider.
15534	*/
15535
15536	lck_mtx_unlock(lck: &dtrace_lock);
15537
15538	dtrace_helper_provide(dhp: dofhp, p);
15539
15540	} else {
15541	/*
15542	* Otherwise, just pass all the helper provider descriptions
15543	* off to the meta provider.
15544	*/
15545
15546	uint_t i;
15547	lck_mtx_unlock(lck: &dtrace_lock);
15548
15549	for (i = `0`; i < help->dthps_nprovs; i++) {
15550	dtrace_helper_provide(dhp: &help->dthps_provs[i]->dthp_prov,
15551	p);
15552	}
15553	}
15554	}
15555
15556	static int
15557	dtrace_helper_provider_add(proc_t* p, dof_helper_t dofhp, int* gen)
15558	{
15559	dtrace_helpers_t *help;
15560	dtrace_helper_provider_t hprov, *tmp_provs;
15561	uint_t tmp_maxprovs, i;
15562
15563	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15564	help = p->p_dtrace_helpers;
15565	ASSERT(help != NULL);
15566
15567	/*
15568	* If we already have dtrace_helper_providers_max helper providers,
15569	* we're refuse to add a new one.
15570	*/
15571	if (help->dthps_nprovs >= dtrace_helper_providers_max)
15572	return (ENOSPC);
15573
15574	/*
15575	* Check to make sure this isn't a duplicate.
15576	*/
15577	for (i = `0`; i < help->dthps_nprovs; i++) {
15578	if (dofhp->dofhp_addr ==
15579	help->dthps_provs[i]->dthp_prov.dofhp_addr)
15580	return (EALREADY);
15581	}
15582
15583	hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
15584	hprov->dthp_prov = *dofhp;
15585	hprov->dthp_ref = `1`;
15586	hprov->dthp_generation = gen;
15587
15588	/*
15589	* Allocate a bigger table for helper providers if it's already full.
15590	*/
15591	if (help->dthps_maxprovs == help->dthps_nprovs) {
15592	tmp_maxprovs = help->dthps_maxprovs;
15593	tmp_provs = help->dthps_provs;
15594
15595	if (help->dthps_maxprovs == `0`)
15596	help->dthps_maxprovs = `2`;
15597	else
15598	help->dthps_maxprovs *= `2`;
15599	if (help->dthps_maxprovs > dtrace_helper_providers_max)
15600	help->dthps_maxprovs = dtrace_helper_providers_max;
15601
15602	ASSERT(tmp_maxprovs < help->dthps_maxprovs);
15603
15604	help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
15605	sizeof (dtrace_helper_provider_t *), KM_SLEEP);
15606
15607	if (tmp_provs != NULL) {
15608	bcopy(src: tmp_provs, dst: help->dthps_provs, n: tmp_maxprovs *
15609	sizeof (dtrace_helper_provider_t *));
15610	kmem_free(tmp_provs, tmp_maxprovs *
15611	sizeof (dtrace_helper_provider_t *));
15612	}
15613	}
15614
15615	help->dthps_provs[help->dthps_nprovs] = hprov;
15616	help->dthps_nprovs++;
15617
15618	return (`0`);
15619	}
15620
15621	static void
15622	dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
15623	{
15624	lck_mtx_lock(lck: &dtrace_lock);
15625
15626	if (--hprov->dthp_ref == `0`) {
15627	dof_hdr_t *dof;
15628	lck_mtx_unlock(lck: &dtrace_lock);
15629	dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
15630	dtrace_dof_destroy(dof);
15631	kmem_free(hprov, sizeof (dtrace_helper_provider_t));
15632	} else {
15633	lck_mtx_unlock(lck: &dtrace_lock);
15634	}
15635	}
15636
15637	static int
15638	dtrace_helper_provider_validate(dof_hdr_t dof, dof_sec_t sec)
15639	{
15640	uintptr_t daddr = (uintptr_t)dof;
15641	dof_sec_t str_sec, prb_sec, arg_sec, off_sec, *enoff_sec;
15642	dof_provider_t *provider;
15643	dof_probe_t *probe;
15644	uint8_t *arg;
15645	char strtab, typestr;
15646	dof_stridx_t typeidx;
15647	size_t typesz;
15648	uint_t nprobes, j, k;
15649
15650	ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
15651
15652	if (sec->dofs_offset & (sizeof (uint_t) - `1`)) {
15653	dtrace_dof_error(dof, str: "misaligned section offset");
15654	return (-`1`);
15655	}
15656
15657	/*
15658	* The section needs to be large enough to contain the DOF provider
15659	* structure appropriate for the given version.
15660	*/
15661	if (sec->dofs_size <
15662	((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
15663	offsetof(dof_provider_t, dofpv_prenoffs) :
15664	sizeof (dof_provider_t))) {
15665	dtrace_dof_error(dof, str: "provider section too small");
15666	return (-`1`);
15667	}
15668
15669	provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
15670	str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, i: provider->dofpv_strtab);
15671	prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, i: provider->dofpv_probes);
15672	arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, i: provider->dofpv_prargs);
15673	off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, i: provider->dofpv_proffs);
15674
15675	if (str_sec == NULL \|\| prb_sec == NULL \|\|
15676	arg_sec == NULL \|\| off_sec == NULL)
15677	return (-`1`);
15678
15679	enoff_sec = NULL;
15680
15681	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
15682	provider->dofpv_prenoffs != DOF_SECT_NONE &&
15683	(enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
15684	i: provider->dofpv_prenoffs)) == NULL)
15685	return (-`1`);
15686
15687	strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
15688
15689	if (provider->dofpv_name >= str_sec->dofs_size \|\|
15690	strlen(s: strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
15691	dtrace_dof_error(dof, str: "invalid provider name");
15692	return (-`1`);
15693	}
15694
15695	if (prb_sec->dofs_entsize == `0` \|\|
15696	prb_sec->dofs_entsize > prb_sec->dofs_size) {
15697	dtrace_dof_error(dof, str: "invalid entry size");
15698	return (-`1`);
15699	}
15700
15701	if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - `1`)) {
15702	dtrace_dof_error(dof, str: "misaligned entry size");
15703	return (-`1`);
15704	}
15705
15706	if (off_sec->dofs_entsize != sizeof (uint32_t)) {
15707	dtrace_dof_error(dof, str: "invalid entry size");
15708	return (-`1`);
15709	}
15710
15711	if (off_sec->dofs_offset & (sizeof (uint32_t) - `1`)) {
15712	dtrace_dof_error(dof, str: "misaligned section offset");
15713	return (-`1`);
15714	}
15715
15716	if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
15717	dtrace_dof_error(dof, str: "invalid entry size");
15718	return (-`1`);
15719	}
15720
15721	arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
15722
15723	nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
15724
15725	/*
15726	* Take a pass through the probes to check for errors.
15727	*/
15728	for (j = `0`; j < nprobes; j++) {
15729	probe = (dof_probe_t *)(uintptr_t)(daddr +
15730	prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
15731
15732	if (probe->dofpr_func >= str_sec->dofs_size) {
15733	dtrace_dof_error(dof, str: "invalid function name");
15734	return (-`1`);
15735	}
15736
15737	if (strlen(s: strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
15738	dtrace_dof_error(dof, str: "function name too long");
15739	return (-`1`);
15740	}
15741
15742	if (probe->dofpr_name >= str_sec->dofs_size \|\|
15743	strlen(s: strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
15744	dtrace_dof_error(dof, str: "invalid probe name");
15745	return (-`1`);
15746	}
15747
15748	/*
15749	* The offset count must not wrap the index, and the offsets
15750	* must also not overflow the section's data.
15751	*/
15752	if (probe->dofpr_offidx + probe->dofpr_noffs <
15753	probe->dofpr_offidx \|\|
15754	(probe->dofpr_offidx + probe->dofpr_noffs) *
15755	off_sec->dofs_entsize > off_sec->dofs_size) {
15756	dtrace_dof_error(dof, str: "invalid probe offset");
15757	return (-`1`);
15758	}
15759
15760	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
15761	/*
15762	* If there's no is-enabled offset section, make sure
15763	* there aren't any is-enabled offsets. Otherwise
15764	* perform the same checks as for probe offsets
15765	* (immediately above).
15766	*/
15767	if (enoff_sec == NULL) {
15768	if (probe->dofpr_enoffidx != `0` \|\|
15769	probe->dofpr_nenoffs != `0`) {
15770	dtrace_dof_error(dof, str: "is-enabled "
15771	"offsets with null section");
15772	return (-`1`);
15773	}
15774	} else if (probe->dofpr_enoffidx +
15775	probe->dofpr_nenoffs < probe->dofpr_enoffidx \|\|
15776	(probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
15777	enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
15778	dtrace_dof_error(dof, str: "invalid is-enabled "
15779	"offset");
15780	return (-`1`);
15781	}
15782
15783	if (probe->dofpr_noffs + probe->dofpr_nenoffs == `0`) {
15784	dtrace_dof_error(dof, str: "zero probe and "
15785	"is-enabled offsets");
15786	return (-`1`);
15787	}
15788	} else if (probe->dofpr_noffs == `0`) {
15789	dtrace_dof_error(dof, str: "zero probe offsets");
15790	return (-`1`);
15791	}
15792
15793	if (probe->dofpr_argidx + probe->dofpr_xargc <
15794	probe->dofpr_argidx \|\|
15795	(probe->dofpr_argidx + probe->dofpr_xargc) *
15796	arg_sec->dofs_entsize > arg_sec->dofs_size) {
15797	dtrace_dof_error(dof, str: "invalid args");
15798	return (-`1`);
15799	}
15800
15801	typeidx = probe->dofpr_nargv;
15802	typestr = strtab + probe->dofpr_nargv;
15803	for (k = `0`; k < probe->dofpr_nargc; k++) {
15804	if (typeidx >= str_sec->dofs_size) {
15805	dtrace_dof_error(dof, str: "bad "
15806	"native argument type");
15807	return (-`1`);
15808	}
15809
15810	typesz = strlen(s: typestr) + `1`;
15811	if (typesz > DTRACE_ARGTYPELEN) {
15812	dtrace_dof_error(dof, str: "native "
15813	"argument type too long");
15814	return (-`1`);
15815	}
15816	typeidx += typesz;
15817	typestr += typesz;
15818	}
15819
15820	typeidx = probe->dofpr_xargv;
15821	typestr = strtab + probe->dofpr_xargv;
15822	for (k = `0`; k < probe->dofpr_xargc; k++) {
15823	if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
15824	dtrace_dof_error(dof, str: "bad "
15825	"native argument index");
15826	return (-`1`);
15827	}
15828
15829	if (typeidx >= str_sec->dofs_size) {
15830	dtrace_dof_error(dof, str: "bad "
15831	"translated argument type");
15832	return (-`1`);
15833	}
15834
15835	typesz = strlen(s: typestr) + `1`;
15836	if (typesz > DTRACE_ARGTYPELEN) {
15837	dtrace_dof_error(dof, str: "translated argument "
15838	"type too long");
15839	return (-`1`);
15840	}
15841
15842	typeidx += typesz;
15843	typestr += typesz;
15844	}
15845	}
15846
15847	return (`0`);
15848	}
15849
15850	static int
15851	dtrace_helper_slurp(proc_t* p, dof_hdr_t dof, dof_helper_t dhp)
15852	{
15853	dtrace_helpers_t *help;
15854	dtrace_vstate_t *vstate;
15855	dtrace_enabling_t *enab = NULL;
15856	int i, gen, rv, nhelpers = `0`, nprovs = `0`, destroy = `1`;
15857	uintptr_t daddr = (uintptr_t)dof;
15858
15859	LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
15860	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15861
15862	if ((help = p->p_dtrace_helpers) == NULL)
15863	help = dtrace_helpers_create(p);
15864
15865	vstate = &help->dthps_vstate;
15866
15867	if ((rv = dtrace_dof_slurp(dof, vstate, NULL, enabp: &enab,
15868	ubase: dhp != NULL ? dhp->dofhp_addr : `0`, noprobes: B_FALSE)) != `0`) {
15869	dtrace_dof_destroy(dof);
15870	return (rv);
15871	}
15872
15873	/*
15874	* Look for helper providers and validate their descriptions.
15875	*/
15876	if (dhp != NULL) {
15877	for (i = `0`; (uint32_t)i < dof->dofh_secnum; i++) {
15878	dof_sec_t sec = (dof_sec_t )(uintptr_t)(daddr +
15879	dof->dofh_secoff + i * dof->dofh_secsize);
15880
15881	if (sec->dofs_type != DOF_SECT_PROVIDER)
15882	continue;
15883
15884	if (dtrace_helper_provider_validate(dof, sec) != `0`) {
15885	dtrace_enabling_destroy(enab);
15886	dtrace_dof_destroy(dof);
15887	return (-`1`);
15888	}
15889
15890	nprovs++;
15891	}
15892	}
15893
15894	/*
15895	* Now we need to walk through the ECB descriptions in the enabling.
15896	*/
15897	for (i = `0`; i < enab->dten_ndesc; i++) {
15898	dtrace_ecbdesc_t *ep = enab->dten_desc[i];
15899	dtrace_probedesc_t *desc = &ep->dted_probe;
15900
15901	/ APPLE NOTE: Darwin employs size bounded string operation. /
15902	if (!LIT_STRNEQL(desc->dtpd_provider, "dtrace"))
15903	continue;
15904
15905	if (!LIT_STRNEQL(desc->dtpd_mod, "helper"))
15906	continue;
15907
15908	if (!LIT_STRNEQL(desc->dtpd_func, "ustack"))
15909	continue;
15910
15911	if ((rv = dtrace_helper_action_add(p, DTRACE_HELPER_ACTION_USTACK,
15912	ep)) != `0`) {
15913	/*
15914	* Adding this helper action failed -- we are now going
15915	* to rip out the entire generation and return failure.
15916	*/
15917	(void) dtrace_helper_destroygen(p, gen: help->dthps_generation);
15918	dtrace_enabling_destroy(enab);
15919	dtrace_dof_destroy(dof);
15920	return (-`1`);
15921	}
15922
15923	nhelpers++;
15924	}
15925
15926	if (nhelpers < enab->dten_ndesc)
15927	dtrace_dof_error(dof, str: "unmatched helpers");
15928
15929	gen = help->dthps_generation++;
15930	dtrace_enabling_destroy(enab);
15931
15932	if (dhp != NULL && nprovs > `0`) {
15933	dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
15934	if (dtrace_helper_provider_add(p, dofhp: dhp, gen) == `0`) {
15935	lck_mtx_unlock(lck: &dtrace_lock);
15936	dtrace_helper_provider_register(p, help, dofhp: dhp);
15937	lck_mtx_lock(lck: &dtrace_lock);
15938
15939	destroy = `0`;
15940	}
15941	}
15942
15943	if (destroy)
15944	dtrace_dof_destroy(dof);
15945
15946	return (gen);
15947	}
15948
15949	/*
15950	* APPLE NOTE: DTrace lazy dof implementation
15951	*
15952	* DTrace user static probes (USDT probes) and helper actions are loaded
15953	* in a process by processing dof sections. The dof sections are passed
15954	* into the kernel by dyld, in a dof_ioctl_data_t block. It is rather
15955	* expensive to process dof for a process that will never use it. There
15956	* is a memory cost (allocating the providers/probes), and a cpu cost
15957	* (creating the providers/probes).
15958	*
15959	* To reduce this cost, we use "lazy dof". The normal procedure for
15960	* dof processing is to copyin the dof(s) pointed to by the dof_ioctl_data_t
15961	* block, and invoke dof_slurp_helper() on them. When "lazy dof" is
15962	* used, each process retains the dof_ioctl_data_t block, instead of
15963	* copying in the data it points to.
15964	*
15965	* The dof_ioctl_data_t blocks are managed as if they were the actual
15966	* processed dof; on fork the block is copied to the child, on exec and
15967	* exit the block is freed.
15968	*
15969	* If the process loads library(s) containing additional dof, the
15970	* new dof_ioctl_data_t is merged with the existing block.
15971	*
15972	* There are a few catches that make this slightly more difficult.
15973	* When dyld registers dof_ioctl_data_t blocks, it expects a unique
15974	* identifier value for each dof in the block. In non-lazy dof terms,
15975	* this is the generation that dof was loaded in. If we hand back
15976	* a UID for a lazy dof, that same UID must be able to unload the
15977	* dof once it has become non-lazy. To meet this requirement, the
15978	* code that loads lazy dof requires that the UID's for dof(s) in
15979	* the lazy dof be sorted, and in ascending order. It is okay to skip
15980	* UID's, I.E., 1 -> 5 -> 6 is legal.
15981	*
15982	* Once a process has become non-lazy, it will stay non-lazy. All
15983	* future dof operations for that process will be non-lazy, even
15984	* if the dof mode transitions back to lazy.
15985	*
15986	* Always do lazy dof checks before non-lazy (I.E. In fork, exit, exec.).
15987	* That way if the lazy check fails due to transitioning to non-lazy, the
15988	* right thing is done with the newly faulted in dof.
15989	*/
15990
15991	/*
15992	* This method is a bit squicky. It must handle:
15993	*
15994	* dof should not be lazy.
15995	* dof should have been handled lazily, but there was an error
15996	* dof was handled lazily, and needs to be freed.
15997	* dof was handled lazily, and must not be freed.
15998	*
15999	*
16000	* Returns EACCES if dof should be handled non-lazily.
16001	*
16002	* KERN_SUCCESS and all other return codes indicate lazy handling of dof.
16003	*
16004	* If the dofs data is claimed by this method, dofs_claimed will be set.
16005	* Callers should not free claimed dofs.
16006	*/
16007	static int
16008	dtrace_lazy_dofs_add(proc_t p, dof_ioctl_data_t incoming_dofs, int *dofs_claimed)
16009	{
16010	ASSERT(p);
16011	ASSERT(incoming_dofs && incoming_dofs->dofiod_count > `0`);
16012
16013	int rval = `0`;
16014	*dofs_claimed = `0`;
16015
16016	lck_rw_lock_shared(lck: &dtrace_dof_mode_lock);
16017
16018	ASSERT(p->p_dtrace_lazy_dofs == NULL \|\| p->p_dtrace_helpers == NULL);
16019	ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER);
16020
16021	/*
16022	* Any existing helpers force non-lazy behavior.
16023	*/
16024	if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
16025	dtrace_sprlock(p);
16026
16027	dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
16028	unsigned int existing_dofs_count = (existing_dofs) ? existing_dofs->dofiod_count : `0`;
16029	unsigned int i, merged_dofs_count = incoming_dofs->dofiod_count + existing_dofs_count;
16030
16031	/*
16032	* Range check...
16033	*/
16034	if (merged_dofs_count == `0` \|\| merged_dofs_count > `1024`) {
16035	dtrace_dof_error(NULL, str: "lazy_dofs_add merged_dofs_count out of range");
16036	rval = EINVAL;
16037	goto unlock;
16038	}
16039
16040	/*
16041	* Each dof being added must be assigned a unique generation.
16042	*/
16043	uint64_t generation = (existing_dofs) ? existing_dofs->dofiod_helpers[existing_dofs_count - `1`].dofhp_dof + `1` : `1`;
16044	for (i=`0`; i<incoming_dofs->dofiod_count; i++) {
16045	/*
16046	* We rely on these being the same so we can overwrite dofhp_dof and not lose info.
16047	*/
16048	ASSERT(incoming_dofs->dofiod_helpers[i].dofhp_dof == incoming_dofs->dofiod_helpers[i].dofhp_addr);
16049	incoming_dofs->dofiod_helpers[i].dofhp_dof = generation++;
16050	}
16051
16052
16053	if (existing_dofs) {
16054	/*
16055	* Merge the existing and incoming dofs
16056	*/
16057	size_t merged_dofs_size = DOF_IOCTL_DATA_T_SIZE(merged_dofs_count);
16058	dof_ioctl_data_t* merged_dofs = kmem_alloc(merged_dofs_size, KM_SLEEP);
16059
16060	bcopy(src: &existing_dofs->dofiod_helpers[`0`],
16061	dst: &merged_dofs->dofiod_helpers[`0`],
16062	n: sizeof(dof_helper_t) * existing_dofs_count);
16063	bcopy(src: &incoming_dofs->dofiod_helpers[`0`],
16064	dst: &merged_dofs->dofiod_helpers[existing_dofs_count],
16065	n: sizeof(dof_helper_t) * incoming_dofs->dofiod_count);
16066
16067	merged_dofs->dofiod_count = merged_dofs_count;
16068
16069	kmem_free(existing_dofs, DOF_IOCTL_DATA_T_SIZE(existing_dofs_count));
16070
16071	p->p_dtrace_lazy_dofs = merged_dofs;
16072	} else {
16073	/*
16074	* Claim the incoming dofs
16075	*/
16076	*dofs_claimed = `1`;
16077	p->p_dtrace_lazy_dofs = incoming_dofs;
16078	}
16079
16080	#if DEBUG
16081	dof_ioctl_data_t* all_dofs = p->p_dtrace_lazy_dofs;
16082	for (i=`0`; i<all_dofs->dofiod_count-`1`; i++) {
16083	ASSERT(all_dofs->dofiod_helpers[i].dofhp_dof < all_dofs->dofiod_helpers[i+`1`].dofhp_dof);
16084	}
16085	#endif /* DEBUG */
16086
16087	unlock:
16088	dtrace_sprunlock(p);
16089	} else {
16090	rval = EACCES;
16091	}
16092
16093	lck_rw_unlock_shared(lck: &dtrace_dof_mode_lock);
16094
16095	return rval;
16096	}
16097
16098	/*
16099	* Returns:
16100	*
16101	* EINVAL: lazy dof is enabled, but the requested generation was not found.
16102	* EACCES: This removal needs to be handled non-lazily.
16103	*/
16104	static int
16105	dtrace_lazy_dofs_remove(proc_t p, int* generation)
16106	{
16107	int rval = EINVAL;
16108
16109	lck_rw_lock_shared(lck: &dtrace_dof_mode_lock);
16110
16111	ASSERT(p->p_dtrace_lazy_dofs == NULL \|\| p->p_dtrace_helpers == NULL);
16112	ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER);
16113
16114	/*
16115	* Any existing helpers force non-lazy behavior.
16116	*/
16117	if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
16118	dtrace_sprlock(p);
16119
16120	dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
16121
16122	if (existing_dofs) {
16123	int index, existing_dofs_count = existing_dofs->dofiod_count;
16124	for (index=`0`; index<existing_dofs_count; index++) {
16125	if ((int)existing_dofs->dofiod_helpers[index].dofhp_dof == generation) {
16126	dof_ioctl_data_t* removed_dofs = NULL;
16127
16128	/*
16129	* If there is only 1 dof, we'll delete it and swap in NULL.
16130	*/
16131	if (existing_dofs_count > `1`) {
16132	int removed_dofs_count = existing_dofs_count - `1`;
16133	size_t removed_dofs_size = DOF_IOCTL_DATA_T_SIZE(removed_dofs_count);
16134
16135	removed_dofs = kmem_alloc(removed_dofs_size, KM_SLEEP);
16136	removed_dofs->dofiod_count = removed_dofs_count;
16137
16138	/*
16139	* copy the remaining data.
16140	*/
16141	if (index > `0`) {
16142	bcopy(src: &existing_dofs->dofiod_helpers[`0`],
16143	dst: &removed_dofs->dofiod_helpers[`0`],
16144	n: index * sizeof(dof_helper_t));
16145	}
16146
16147	if (index < existing_dofs_count-`1`) {
16148	bcopy(src: &existing_dofs->dofiod_helpers[index+`1`],
16149	dst: &removed_dofs->dofiod_helpers[index],
16150	n: (existing_dofs_count - index - `1`) * sizeof(dof_helper_t));
16151	}
16152	}
16153
16154	kmem_free(existing_dofs, DOF_IOCTL_DATA_T_SIZE(existing_dofs_count));
16155
16156	p->p_dtrace_lazy_dofs = removed_dofs;
16157
16158	rval = KERN_SUCCESS;
16159
16160	break;
16161	}
16162	}
16163
16164	#if DEBUG
16165	dof_ioctl_data_t* all_dofs = p->p_dtrace_lazy_dofs;
16166	if (all_dofs) {
16167	unsigned int i;
16168	for (i=`0`; i<all_dofs->dofiod_count-`1`; i++) {
16169	ASSERT(all_dofs->dofiod_helpers[i].dofhp_dof < all_dofs->dofiod_helpers[i+`1`].dofhp_dof);
16170	}
16171	}
16172	#endif
16173
16174	}
16175	dtrace_sprunlock(p);
16176	} else {
16177	rval = EACCES;
16178	}
16179
16180	lck_rw_unlock_shared(lck: &dtrace_dof_mode_lock);
16181
16182	return rval;
16183	}
16184
16185	void
16186	dtrace_lazy_dofs_destroy(proc_t *p)
16187	{
16188	lck_rw_lock_shared(lck: &dtrace_dof_mode_lock);
16189	dtrace_sprlock(p);
16190
16191	ASSERT(p->p_dtrace_lazy_dofs == NULL \|\| p->p_dtrace_helpers == NULL);
16192
16193	dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
16194	p->p_dtrace_lazy_dofs = NULL;
16195
16196	dtrace_sprunlock(p);
16197	lck_rw_unlock_shared(lck: &dtrace_dof_mode_lock);
16198
16199	if (lazy_dofs) {
16200	kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
16201	}
16202	}
16203
16204	static int
16205	dtrace_lazy_dofs_proc_iterate_filter(proc_t p, void** ignored)
16206	{
16207	#pragma unused(ignored)
16208	/*
16209	* Okay to NULL test without taking the sprlock.
16210	*/
16211	return p->p_dtrace_lazy_dofs != NULL;
16212	}
16213
16214	static void
16215	dtrace_lazy_dofs_process(proc_t *p) {
16216	/*
16217	* It is possible this process may exit during our attempt to
16218	* fault in the dof. We could fix this by holding locks longer,
16219	* but the errors are benign.
16220	*/
16221	dtrace_sprlock(p);
16222
16223
16224	ASSERT(p->p_dtrace_lazy_dofs == NULL \|\| p->p_dtrace_helpers == NULL);
16225	ASSERT(dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF);
16226
16227	dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
16228	p->p_dtrace_lazy_dofs = NULL;
16229
16230	dtrace_sprunlock(p);
16231	lck_mtx_lock(lck: &dtrace_meta_lock);
16232	/*
16233	* Process each dof_helper_t
16234	*/
16235	if (lazy_dofs != NULL) {
16236	unsigned int i;
16237	int rval;
16238
16239	for (i=`0`; i<lazy_dofs->dofiod_count; i++) {
16240	/*
16241	* When loading lazy dof, we depend on the generations being sorted in ascending order.
16242	*/
16243	ASSERT(i >= (lazy_dofs->dofiod_count - `1`) \|\| lazy_dofs->dofiod_helpers[i].dofhp_dof < lazy_dofs->dofiod_helpers[i+`1`].dofhp_dof);
16244
16245	dof_helper_t *dhp = &lazy_dofs->dofiod_helpers[i];
16246
16247	/*
16248	* We stored the generation in dofhp_dof. Save it, and restore the original value.
16249	*/
16250	int generation = dhp->dofhp_dof;
16251	dhp->dofhp_dof = dhp->dofhp_addr;
16252
16253	dof_hdr_t *dof = dtrace_dof_copyin_from_proc(p, uarg: dhp->dofhp_dof, errp: &rval);
16254
16255	if (dof != NULL) {
16256	dtrace_helpers_t *help;
16257
16258	lck_mtx_lock(lck: &dtrace_lock);
16259
16260	/*
16261	* This must be done with the dtrace_lock held
16262	*/
16263	if ((help = p->p_dtrace_helpers) == NULL)
16264	help = dtrace_helpers_create(p);
16265
16266	/*
16267	* If the generation value has been bumped, someone snuck in
16268	* when we released the dtrace lock. We have to dump this generation,
16269	* there is no safe way to load it.
16270	*/
16271	if (help->dthps_generation <= generation) {
16272	help->dthps_generation = generation;
16273
16274	/*
16275	* dtrace_helper_slurp() takes responsibility for the dof --
16276	* it may free it now or it may save it and free it later.
16277	*/
16278	if ((rval = dtrace_helper_slurp(p, dof, dhp)) != generation) {
16279	dtrace_dof_error(NULL, str: "returned value did not match expected generation");
16280	}
16281	}
16282
16283	lck_mtx_unlock(lck: &dtrace_lock);
16284	}
16285	}
16286	lck_mtx_unlock(lck: &dtrace_meta_lock);
16287	kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
16288	} else {
16289	lck_mtx_unlock(lck: &dtrace_meta_lock);
16290	}
16291	}
16292
16293	static int
16294	dtrace_lazy_dofs_proc_iterate_doit(proc_t p, void** ignored)
16295	{
16296	#pragma unused(ignored)
16297
16298	dtrace_lazy_dofs_process(p);
16299
16300	return PROC_RETURNED;
16301	}
16302
16303	#define DTRACE_LAZY_DOFS_DUPLICATED 1
16304
16305	static int
16306	dtrace_lazy_dofs_duplicate(proc_t parent, proc_t child)
16307	{
16308	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
16309	LCK_MTX_ASSERT(&parent->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
16310	LCK_MTX_ASSERT(&child->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
16311
16312	lck_rw_lock_shared(lck: &dtrace_dof_mode_lock);
16313	dtrace_sprlock(p: parent);
16314
16315	/*
16316	* We need to make sure that the transition to lazy dofs -> helpers
16317	* was atomic for our parent
16318	*/
16319	ASSERT(parent->p_dtrace_lazy_dofs == NULL \|\| parent->p_dtrace_helpers == NULL);
16320	/*
16321	* In theory we should hold the child sprlock, but this is safe...
16322	*/
16323	ASSERT(child->p_dtrace_lazy_dofs == NULL && child->p_dtrace_helpers == NULL);
16324
16325	dof_ioctl_data_t* parent_dofs = parent->p_dtrace_lazy_dofs;
16326	dof_ioctl_data_t* child_dofs = NULL;
16327	if (parent_dofs) {
16328	size_t parent_dofs_size = DOF_IOCTL_DATA_T_SIZE(parent_dofs->dofiod_count);
16329	child_dofs = kmem_alloc(parent_dofs_size, KM_SLEEP);
16330	bcopy(src: parent_dofs, dst: child_dofs, n: parent_dofs_size);
16331	}
16332
16333	dtrace_sprunlock(p: parent);
16334
16335	if (child_dofs) {
16336	dtrace_sprlock(p: child);
16337	child->p_dtrace_lazy_dofs = child_dofs;
16338	dtrace_sprunlock(p: child);
16339	/**
16340	* We process the DOF at this point if the mode is set to
16341	* LAZY_OFF. This can happen if DTrace is still processing the
16342	* DOF of other process (which can happen because the
16343	* protected pager can have a huge latency)
16344	* but has not processed our parent yet
16345	*/
16346	if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF) {
16347	dtrace_lazy_dofs_process(p: child);
16348	}
16349	lck_rw_unlock_shared(lck: &dtrace_dof_mode_lock);
16350
16351	return DTRACE_LAZY_DOFS_DUPLICATED;
16352	}
16353	lck_rw_unlock_shared(lck: &dtrace_dof_mode_lock);
16354
16355	return `0`;
16356	}
16357
16358	static dtrace_helpers_t *
16359	dtrace_helpers_create(proc_t *p)
16360	{
16361	dtrace_helpers_t *help;
16362
16363	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
16364	ASSERT(p->p_dtrace_helpers == NULL);
16365
16366	help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
16367	help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t )
16368	DTRACE_NHELPER_ACTIONS, KM_SLEEP);
16369
16370	p->p_dtrace_helpers = help;
16371	dtrace_helpers++;
16372
16373	return (help);
16374	}
16375
16376	static void
16377	dtrace_helpers_destroy(proc_t* p)
16378	{
16379	dtrace_helpers_t *help;
16380	dtrace_vstate_t *vstate;
16381	uint_t i;
16382
16383	lck_mtx_lock(lck: &dtrace_meta_lock);
16384	lck_mtx_lock(lck: &dtrace_lock);
16385
16386	ASSERT(p->p_dtrace_helpers != NULL);
16387	ASSERT(dtrace_helpers > `0`);
16388
16389	help = p->p_dtrace_helpers;
16390	vstate = &help->dthps_vstate;
16391
16392	/*
16393	* We're now going to lose the help from this process.
16394	*/
16395	p->p_dtrace_helpers = NULL;
16396	dtrace_sync();
16397
16398	/*
16399	* Destory the helper actions.
16400	*/
16401	for (i = `0`; i < DTRACE_NHELPER_ACTIONS; i++) {
16402	dtrace_helper_action_t h, next;
16403
16404	for (h = help->dthps_actions[i]; h != NULL; h = next) {
16405	next = h->dtha_next;
16406	dtrace_helper_action_destroy(helper: h, vstate);
16407	h = next;
16408	}
16409	}
16410
16411	lck_mtx_unlock(lck: &dtrace_lock);
16412
16413	/*
16414	* Destroy the helper providers.
16415	*/
16416	if (help->dthps_maxprovs > `0`) {
16417	if (dtrace_meta_pid != NULL) {
16418	ASSERT(dtrace_deferred_pid == NULL);
16419
16420	for (i = `0`; i < help->dthps_nprovs; i++) {
16421	dtrace_helper_provider_remove(
16422	dhp: &help->dthps_provs[i]->dthp_prov, p);
16423	}
16424	} else {
16425	lck_mtx_lock(lck: &dtrace_lock);
16426	ASSERT(help->dthps_deferred == `0` \|\|
16427	help->dthps_next != NULL \|\|
16428	help->dthps_prev != NULL \|\|
16429	help == dtrace_deferred_pid);
16430
16431	/*
16432	* Remove the helper from the deferred list.
16433	*/
16434	if (help->dthps_next != NULL)
16435	help->dthps_next->dthps_prev = help->dthps_prev;
16436	if (help->dthps_prev != NULL)
16437	help->dthps_prev->dthps_next = help->dthps_next;
16438	if (dtrace_deferred_pid == help) {
16439	dtrace_deferred_pid = help->dthps_next;
16440	ASSERT(help->dthps_prev == NULL);
16441	}
16442
16443	lck_mtx_unlock(lck: &dtrace_lock);
16444	}
16445
16446
16447	for (i = `0`; i < help->dthps_nprovs; i++) {
16448	dtrace_helper_provider_destroy(hprov: help->dthps_provs[i]);
16449	}
16450
16451	kmem_free(help->dthps_provs, help->dthps_maxprovs *
16452	sizeof (dtrace_helper_provider_t *));
16453	}
16454
16455	lck_mtx_lock(lck: &dtrace_lock);
16456
16457	dtrace_vstate_fini(vstate: &help->dthps_vstate);
16458	kmem_free(help->dthps_actions,
16459	sizeof (dtrace_helper_action_t ) DTRACE_NHELPER_ACTIONS);
16460	kmem_free(help, sizeof (dtrace_helpers_t));
16461
16462	--dtrace_helpers;
16463	lck_mtx_unlock(lck: &dtrace_lock);
16464	lck_mtx_unlock(lck: &dtrace_meta_lock);
16465	}
16466
16467	static void
16468	dtrace_helpers_duplicate(proc_t from, proc_t to)
16469	{
16470	dtrace_helpers_t help, newhelp;
16471	dtrace_helper_action_t helper, new, *last;
16472	dtrace_difo_t *dp;
16473	dtrace_vstate_t *vstate;
16474	uint_t i;
16475	int j, sz, hasprovs = `0`;
16476
16477	lck_mtx_lock(lck: &dtrace_meta_lock);
16478	lck_mtx_lock(lck: &dtrace_lock);
16479	ASSERT(from->p_dtrace_helpers != NULL);
16480	ASSERT(dtrace_helpers > `0`);
16481
16482	help = from->p_dtrace_helpers;
16483	newhelp = dtrace_helpers_create(p: to);
16484	ASSERT(to->p_dtrace_helpers != NULL);
16485
16486	newhelp->dthps_generation = help->dthps_generation;
16487	vstate = &newhelp->dthps_vstate;
16488
16489	/*
16490	* Duplicate the helper actions.
16491	*/
16492	for (i = `0`; i < DTRACE_NHELPER_ACTIONS; i++) {
16493	if ((helper = help->dthps_actions[i]) == NULL)
16494	continue;
16495
16496	for (last = NULL; helper != NULL; helper = helper->dtha_next) {
16497	new = kmem_zalloc(sizeof (dtrace_helper_action_t),
16498	KM_SLEEP);
16499	new->dtha_generation = helper->dtha_generation;
16500
16501	if ((dp = helper->dtha_predicate) != NULL) {
16502	dp = dtrace_difo_duplicate(dp, vstate);
16503	new->dtha_predicate = dp;
16504	}
16505
16506	new->dtha_nactions = helper->dtha_nactions;
16507	sz = sizeof (dtrace_difo_t ) new->dtha_nactions;
16508	new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
16509
16510	for (j = `0`; j < new->dtha_nactions; j++) {
16511	dtrace_difo_t *dpj = helper->dtha_actions[j];
16512
16513	ASSERT(dpj != NULL);
16514	dpj = dtrace_difo_duplicate(dp: dpj, vstate);
16515	new->dtha_actions[j] = dpj;
16516	}
16517
16518	if (last != NULL) {
16519	last->dtha_next = new;
16520	} else {
16521	newhelp->dthps_actions[i] = new;
16522	}
16523
16524	last = new;
16525	}
16526	}
16527
16528	/*
16529	* Duplicate the helper providers and register them with the
16530	* DTrace framework.
16531	*/
16532	if (help->dthps_nprovs > `0`) {
16533	newhelp->dthps_nprovs = help->dthps_nprovs;
16534	newhelp->dthps_maxprovs = help->dthps_nprovs;
16535	newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
16536	sizeof (dtrace_helper_provider_t *), KM_SLEEP);
16537	for (i = `0`; i < newhelp->dthps_nprovs; i++) {
16538	newhelp->dthps_provs[i] = help->dthps_provs[i];
16539	newhelp->dthps_provs[i]->dthp_ref++;
16540	}
16541
16542	hasprovs = `1`;
16543	}
16544
16545	lck_mtx_unlock(lck: &dtrace_lock);
16546
16547	if (hasprovs)
16548	dtrace_helper_provider_register(p: to, help: newhelp, NULL);
16549
16550	lck_mtx_unlock(lck: &dtrace_meta_lock);
16551	}
16552
16553	/**
16554	* DTrace Process functions
16555	*/
16556
16557	void
16558	dtrace_proc_fork(proc_t parent_proc, proc_t child_proc, int spawn)
16559	{
16560	/*
16561	* This code applies to new processes who are copying the task
16562	* and thread state and address spaces of their parent process.
16563	*/
16564	if (!spawn) {
16565	/*
16566	* APPLE NOTE: Solaris does a sprlock() and drops the
16567	* proc_lock here. We're cheating a bit and only taking
16568	* the p_dtrace_sprlock lock. A full sprlock would
16569	* task_suspend the parent.
16570	*/
16571	dtrace_sprlock(p: parent_proc);
16572
16573	/*
16574	* Remove all DTrace tracepoints from the child process. We
16575	* need to do this _before_ duplicating USDT providers since
16576	* any associated probes may be immediately enabled.
16577	*/
16578	if (parent_proc->p_dtrace_count > `0`) {
16579	dtrace_fasttrap_fork(parent_proc, child_proc);
16580	}
16581
16582	dtrace_sprunlock(p: parent_proc);
16583
16584	/*
16585	* Duplicate any lazy dof(s). This must be done while NOT
16586	* holding the parent sprlock! Lock ordering is
16587	* dtrace_dof_mode_lock, then sprlock. It is imperative we
16588	* always call dtrace_lazy_dofs_duplicate, rather than null
16589	* check and call if !NULL. If we NULL test, during lazy dof
16590	* faulting we can race with the faulting code and proceed
16591	* from here to beyond the helpers copy. The lazy dof
16592	* faulting will then fail to copy the helpers to the child
16593	* process. We return if we duplicated lazy dofs as a process
16594	* can only have one at the same time to avoid a race between
16595	* a dtrace client and dtrace_proc_fork where a process would
16596	* end up with both lazy dofs and helpers.
16597	*/
16598	if (dtrace_lazy_dofs_duplicate(parent: parent_proc, child: child_proc) == DTRACE_LAZY_DOFS_DUPLICATED) {
16599	return;
16600	}
16601
16602	/*
16603	* Duplicate any helper actions and providers if they haven't
16604	* already.
16605	*/
16606	#if !defined(__APPLE__)
16607	/*
16608	* The SFORKING
16609	* we set above informs the code to enable USDT probes that
16610	* sprlock() may fail because the child is being forked.
16611	*/
16612	#endif
16613	/*
16614	* APPLE NOTE: As best I can tell, Apple's sprlock() equivalent
16615	* never fails to find the child. We do not set SFORKING.
16616	*/
16617	if (parent_proc->p_dtrace_helpers != NULL && dtrace_helpers_fork) {
16618	(*dtrace_helpers_fork)(parent_proc, child_proc);
16619	}
16620	}
16621	}
16622
16623	void
16624	dtrace_proc_exec(proc_t *p)
16625	{
16626	/*
16627	* Invalidate any predicate evaluation already cached for this thread by DTrace.
16628	* That's because we've just stored to p_comm and DTrace refers to that when it
16629	* evaluates the "execname" special variable. uid and gid may have changed as well.
16630	*/
16631	dtrace_set_thread_predcache(current_thread(), `0`);
16632
16633	/*
16634	* Free any outstanding lazy dof entries. It is imperative we
16635	* always call dtrace_lazy_dofs_destroy, rather than null check
16636	* and call if !NULL. If we NULL test, during lazy dof faulting
16637	* we can race with the faulting code and proceed from here to
16638	* beyond the helpers cleanup. The lazy dof faulting will then
16639	* install new helpers which no longer belong to this process!
16640	*/
16641	dtrace_lazy_dofs_destroy(p);
16642
16643
16644	/*
16645	* Clean up any DTrace helpers for the process.
16646	*/
16647	if (p->p_dtrace_helpers != NULL && dtrace_helpers_cleanup) {
16648	(*dtrace_helpers_cleanup)(p);
16649	}
16650
16651	/*
16652	* Cleanup the DTrace provider associated with this process.
16653	*/
16654	proc_lock(p);
16655	if (p->p_dtrace_probes && dtrace_fasttrap_exec_ptr) {
16656	(*dtrace_fasttrap_exec_ptr)(p);
16657	}
16658	proc_unlock(p);
16659	}
16660
16661	void
16662	dtrace_proc_exit(proc_t *p)
16663	{
16664	/*
16665	* Free any outstanding lazy dof entries. It is imperative we
16666	* always call dtrace_lazy_dofs_destroy, rather than null check
16667	* and call if !NULL. If we NULL test, during lazy dof faulting
16668	* we can race with the faulting code and proceed from here to
16669	* beyond the helpers cleanup. The lazy dof faulting will then
16670	* install new helpers which will never be cleaned up, and leak.
16671	*/
16672	dtrace_lazy_dofs_destroy(p);
16673
16674	/*
16675	* Clean up any DTrace helper actions or probes for the process.
16676	*/
16677	if (p->p_dtrace_helpers != NULL) {
16678	(*dtrace_helpers_cleanup)(p);
16679	}
16680
16681	/*
16682	* Clean up any DTrace probes associated with this process.
16683	*/
16684	/*
16685	* APPLE NOTE: We release ptss pages/entries in dtrace_fasttrap_exit_ptr(),
16686	* call this after dtrace_helpers_cleanup()
16687	*/
16688	proc_lock(p);
16689	if (p->p_dtrace_probes && dtrace_fasttrap_exit_ptr) {
16690	(*dtrace_fasttrap_exit_ptr)(p);
16691	}
16692	proc_unlock(p);
16693	}
16694
16695	/*
16696	* DTrace Hook Functions
16697	*/
16698
16699	/*
16700	* APPLE NOTE: dtrace_modctl_* routines for kext support.
16701	* Used to manipulate the modctl list within dtrace xnu.
16702	*/
16703
16704	modctl_t *dtrace_modctl_list;
16705
16706	static void
16707	dtrace_modctl_add(struct modctl * newctl)
16708	{
16709	struct modctl nextp, prevp;
16710
16711	ASSERT(newctl != NULL);
16712	LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
16713
16714	// Insert new module at the front of the list,
16715
16716	newctl->mod_next = dtrace_modctl_list;
16717	dtrace_modctl_list = newctl;
16718
16719	/*
16720	* If a module exists with the same name, then that module
16721	* must have been unloaded with enabled probes. We will move
16722	* the unloaded module to the new module's stale chain and
16723	* then stop traversing the list.
16724	*/
16725
16726	prevp = newctl;
16727	nextp = newctl->mod_next;
16728
16729	while (nextp != NULL) {
16730	if (nextp->mod_loaded) {
16731	/ This is a loaded module. Keep traversing. /
16732	prevp = nextp;
16733	nextp = nextp->mod_next;
16734	continue;
16735	}
16736	else {
16737	/ Found an unloaded module /
16738	if (strncmp (s1: newctl->mod_modname, s2: nextp->mod_modname, KMOD_MAX_NAME)) {
16739	/ Names don't match. Keep traversing. /
16740	prevp = nextp;
16741	nextp = nextp->mod_next;
16742	continue;
16743	}
16744	else {
16745	/ We found a stale entry, move it. We're done. /
16746	prevp->mod_next = nextp->mod_next;
16747	newctl->mod_stale = nextp;
16748	nextp->mod_next = NULL;
16749	break;
16750	}
16751	}
16752	}
16753	}
16754
16755	static modctl_t *
16756	dtrace_modctl_lookup(struct kmod_info * kmod)
16757	{
16758	LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
16759
16760	struct modctl * ctl;
16761
16762	for (ctl = dtrace_modctl_list; ctl; ctl=ctl->mod_next) {
16763	if (ctl->mod_id == kmod->id)
16764	return(ctl);
16765	}
16766	return (NULL);
16767	}
16768
16769	/*
16770	* This routine is called from dtrace_module_unloaded().
16771	* It removes a modctl structure and its stale chain
16772	* from the kext shadow list.
16773	*/
16774	static void
16775	dtrace_modctl_remove(struct modctl * ctl)
16776	{
16777	ASSERT(ctl != NULL);
16778	LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
16779	modctl_t prevp, nextp, *curp;
16780
16781	// Remove stale chain first
16782	for (curp=ctl->mod_stale; curp != NULL; curp=nextp) {
16783	nextp = curp->mod_stale;
16784	/ There should NEVER be user symbols allocated at this point /
16785	ASSERT(curp->mod_user_symbols == NULL);
16786	kmem_free(curp, sizeof(modctl_t));
16787	}
16788
16789	prevp = NULL;
16790	curp = dtrace_modctl_list;
16791
16792	while (curp != ctl) {
16793	prevp = curp;
16794	curp = curp->mod_next;
16795	}
16796
16797	if (prevp != NULL) {
16798	prevp->mod_next = ctl->mod_next;
16799	}
16800	else {
16801	dtrace_modctl_list = ctl->mod_next;
16802	}
16803
16804	/ There should NEVER be user symbols allocated at this point /
16805	ASSERT(ctl->mod_user_symbols == NULL);
16806
16807	kmem_free (ctl, sizeof(modctl_t));
16808	}
16809
16810	/*
16811	* APPLE NOTE: The kext loader will call dtrace_module_loaded
16812	* when the kext is loaded in memory, but before calling the
16813	* kext's start routine.
16814	*
16815	* Return 0 on success
16816	* Return -1 on failure
16817	*/
16818
16819	static int
16820	dtrace_module_loaded(struct kmod_info *kmod, uint32_t flag)
16821	{
16822	dtrace_provider_t *prv;
16823
16824	/*
16825	* If kernel symbols have been disabled, return immediately
16826	* DTRACE_KERNEL_SYMBOLS_NEVER is a permanent mode, it is safe to test without holding locks
16827	*/
16828	if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER)
16829	return `0`;
16830
16831	#if CONFIG_SPTM
16832	/ Opt-out the SPTM/TXM fake kexts from being loaded by DTrace. /
16833	extern kmod_info_t g_sptm_kmod_info, g_txm_kmod_info;
16834	if ((kmod == &g_sptm_kmod_info) \|\| (kmod == &g_txm_kmod_info)) {
16835	return `0`;
16836	}
16837	#endif
16838
16839	struct modctl *ctl = NULL;
16840	if (!kmod \|\| kmod->address == `0` \|\| kmod->size == `0`)
16841	return(-`1`);
16842
16843	lck_mtx_lock(lck: &dtrace_provider_lock);
16844	lck_mtx_lock(lck: &mod_lock);
16845
16846	/*
16847	* Have we seen this kext before?
16848	*/
16849
16850	ctl = dtrace_modctl_lookup(kmod);
16851
16852	if (ctl != NULL) {
16853	/ bail... we already have this kext in the modctl list /
16854	lck_mtx_unlock(lck: &mod_lock);
16855	lck_mtx_unlock(lck: &dtrace_provider_lock);
16856	if (dtrace_err_verbose)
16857	cmn_err(CE_WARN, "dtrace load module already exists '%s %u' is failing against '%s %u'", kmod->name, (uint_t)kmod->id, ctl->mod_modname, ctl->mod_id);
16858	return(-`1`);
16859	}
16860	else {
16861	ctl = kmem_alloc(sizeof(struct modctl), KM_SLEEP);
16862	if (ctl == NULL) {
16863	if (dtrace_err_verbose)
16864	cmn_err(CE_WARN, "dtrace module load '%s %u' is failing ", kmod->name, (uint_t)kmod->id);
16865	lck_mtx_unlock(lck: &mod_lock);
16866	lck_mtx_unlock(lck: &dtrace_provider_lock);
16867	return (-`1`);
16868	}
16869	ctl->mod_next = NULL;
16870	ctl->mod_stale = NULL;
16871	strlcpy (dst: ctl->mod_modname, src: kmod->name, n: sizeof(ctl->mod_modname));
16872	ctl->mod_loadcnt = kmod->id;
16873	ctl->mod_nenabled = `0`;
16874	ctl->mod_address = kmod->address;
16875	ctl->mod_size = kmod->size;
16876	ctl->mod_id = kmod->id;
16877	ctl->mod_loaded = `1`;
16878	ctl->mod_flags = `0`;
16879	ctl->mod_user_symbols = NULL;
16880	ctl->mod_sdtprobecnt = `0`;
16881	ctl->mod_sdtdesc = NULL;
16882
16883	/*
16884	* Find the UUID for this module, if it has one
16885	*/
16886	kernel_mach_header_t* header = (kernel_mach_header_t *)ctl->mod_address;
16887	struct load_command* load_cmd = (struct load_command *)&header[`1`];
16888	uint32_t i;
16889	for (i = `0`; i < header->ncmds; i++) {
16890	if (load_cmd->cmd == LC_UUID) {
16891	struct uuid_command* uuid_cmd = (struct uuid_command *)load_cmd;
16892	memcpy(dst: ctl->mod_uuid, src: uuid_cmd->uuid, n: sizeof(uuid_cmd->uuid));
16893	ctl->mod_flags \|= MODCTL_HAS_UUID;
16894	break;
16895	}
16896	load_cmd = (struct load_command *)((caddr_t)load_cmd + load_cmd->cmdsize);
16897	}
16898
16899	if (ctl->mod_address == g_kernel_kmod_info.address) {
16900	ctl->mod_flags \|= MODCTL_IS_MACH_KERNEL;
16901	memcpy(dst: dtrace_kerneluuid, src: ctl->mod_uuid, n: sizeof(dtrace_kerneluuid));
16902	}
16903	/*
16904	* Static kexts have a UUID that is not used for symbolication, as all their
16905	* symbols are in kernel
16906	*/
16907	else if ((flag & KMOD_DTRACE_STATIC_KEXT) == KMOD_DTRACE_STATIC_KEXT) {
16908	memcpy(dst: ctl->mod_uuid, src: dtrace_kerneluuid, n: sizeof(dtrace_kerneluuid));
16909	ctl->mod_flags \|= MODCTL_IS_STATIC_KEXT;
16910	}
16911	}
16912	dtrace_modctl_add(newctl: ctl);
16913
16914	/*
16915	* We must hold the dtrace_lock to safely test non permanent dtrace_fbt_symbol_mode(s)
16916	*/
16917	lck_mtx_lock(lck: &dtrace_lock);
16918
16919	/*
16920	* DTrace must decide if it will instrument modules lazily via
16921	* userspace symbols (default mode), or instrument immediately via
16922	* kernel symbols (non-default mode)
16923	*
16924	* When in default/lazy mode, DTrace will only support modules
16925	* built with a valid UUID.
16926	*
16927	* Overriding the default can be done explicitly in one of
16928	* the following two ways.
16929	*
16930	* A module can force symbols from kernel space using the plist key,
16931	* OSBundleForceDTraceInit (see kmod.h). If this per kext state is set,
16932	* we fall through and instrument this module now.
16933	*
16934	* Or, the boot-arg, dtrace_kernel_symbol_mode, can be set to force symbols
16935	* from kernel space (see dtrace_impl.h). If this system state is set
16936	* to a non-userspace mode, we fall through and instrument the module now.
16937	*/
16938
16939	if ((dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) &&
16940	(!(flag & KMOD_DTRACE_FORCE_INIT)))
16941	{
16942	/ Load SDT section for module. Symbol related data will be handled lazily. /
16943	sdt_load_machsect(ctl);
16944
16945	/ We will instrument the module lazily -- this is the default /
16946	lck_mtx_unlock(lck: &dtrace_lock);
16947	lck_mtx_unlock(lck: &mod_lock);
16948	lck_mtx_unlock(lck: &dtrace_provider_lock);
16949	return `0`;
16950	}
16951
16952	/ We will instrument the module immediately using kernel symbols /
16953	if (!(flag & KMOD_DTRACE_NO_KERNEL_SYMS)) {
16954	ctl->mod_flags \|= MODCTL_HAS_KERNEL_SYMBOLS;
16955	}
16956
16957	/ Load SDT section for module. Symbol related data will be handled lazily. /
16958	sdt_load_machsect(ctl);
16959
16960	lck_mtx_unlock(lck: &dtrace_lock);
16961
16962	/*
16963	* We're going to call each providers per-module provide operation
16964	* specifying only this module.
16965	*/
16966	for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
16967	prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
16968
16969	/*
16970	* APPLE NOTE: The contract with the kext loader is that once this function
16971	* has completed, it may delete kernel symbols at will.
16972	* We must set this while still holding the mod_lock.
16973	*/
16974	ctl->mod_flags &= ~MODCTL_HAS_KERNEL_SYMBOLS;
16975
16976	lck_mtx_unlock(lck: &mod_lock);
16977	lck_mtx_unlock(lck: &dtrace_provider_lock);
16978
16979	/*
16980	* If we have any retained enablings, we need to match against them.
16981	* Enabling probes requires that cpu_lock be held, and we cannot hold
16982	* cpu_lock here -- it is legal for cpu_lock to be held when loading a
16983	* module. (In particular, this happens when loading scheduling
16984	* classes.) So if we have any retained enablings, we need to dispatch
16985	* our task queue to do the match for us.
16986	*/
16987	lck_mtx_lock(lck: &dtrace_lock);
16988
16989	if (dtrace_retained == NULL) {
16990	lck_mtx_unlock(lck: &dtrace_lock);
16991	return `0`;
16992	}
16993
16994	/ APPLE NOTE!*
16995	*
16996	* The cpu_lock mentioned above is only held by dtrace code, Apple's xnu never actually
16997	* holds it for any reason. Thus the comment above is invalid, we can directly invoke
16998	* dtrace_enabling_matchall without jumping through all the hoops, and we can avoid
16999	* the delay call as well.
17000	*/
17001	lck_mtx_unlock(lck: &dtrace_lock);
17002
17003	dtrace_enabling_matchall();
17004
17005	return `0`;
17006	}
17007
17008	/*
17009	* Return 0 on success
17010	* Return -1 on failure
17011	*/
17012	static int
17013	dtrace_module_unloaded(struct kmod_info *kmod)
17014	{
17015	dtrace_probe_t template, probe, first, *next;
17016	dtrace_provider_t *prov;
17017	struct modctl *ctl = NULL;
17018	struct modctl *syncctl = NULL;
17019	struct modctl *nextsyncctl = NULL;
17020	int syncmode = `0`;
17021
17022	lck_mtx_lock(lck: &dtrace_provider_lock);
17023	lck_mtx_lock(lck: &mod_lock);
17024	lck_mtx_lock(lck: &dtrace_lock);
17025
17026	if (kmod == NULL) {
17027	syncmode = `1`;
17028	}
17029	else {
17030	ctl = dtrace_modctl_lookup(kmod);
17031	if (ctl == NULL)
17032	{
17033	lck_mtx_unlock(lck: &dtrace_lock);
17034	lck_mtx_unlock(lck: &mod_lock);
17035	lck_mtx_unlock(lck: &dtrace_provider_lock);
17036	return (-`1`);
17037	}
17038	ctl->mod_loaded = `0`;
17039	ctl->mod_address = `0`;
17040	ctl->mod_size = `0`;
17041	}
17042
17043	if (dtrace_bymod == NULL) {
17044	/*
17045	* The DTrace module is loaded (obviously) but not attached;
17046	* we don't have any work to do.
17047	*/
17048	if (ctl != NULL)
17049	(void)dtrace_modctl_remove(ctl);
17050	lck_mtx_unlock(lck: &dtrace_lock);
17051	lck_mtx_unlock(lck: &mod_lock);
17052	lck_mtx_unlock(lck: &dtrace_provider_lock);
17053	return(`0`);
17054	}
17055
17056	/ Syncmode set means we target and traverse entire modctl list. /
17057	if (syncmode)
17058	nextsyncctl = dtrace_modctl_list;
17059
17060	syncloop:
17061	if (syncmode)
17062	{
17063	/ find a stale modctl struct /
17064	for (syncctl = nextsyncctl; syncctl != NULL; syncctl=syncctl->mod_next) {
17065	if (syncctl->mod_address == `0`)
17066	break;
17067	}
17068	if (syncctl==NULL)
17069	{
17070	/ We have no more work to do /
17071	lck_mtx_unlock(lck: &dtrace_lock);
17072	lck_mtx_unlock(lck: &mod_lock);
17073	lck_mtx_unlock(lck: &dtrace_provider_lock);
17074	return(`0`);
17075	}
17076	else {
17077	/ keep track of next syncctl in case this one is removed /
17078	nextsyncctl = syncctl->mod_next;
17079	ctl = syncctl;
17080	}
17081	}
17082
17083	template.dtpr_mod = ctl->mod_modname;
17084
17085	for (probe = first = dtrace_hash_lookup(hash: dtrace_bymod, template: &template);
17086	probe != NULL; probe = probe->dtpr_nextmod) {
17087	if (probe->dtpr_ecb != NULL) {
17088	/*
17089	* This shouldn't _actually_ be possible -- we're
17090	* unloading a module that has an enabled probe in it.
17091	* (It's normally up to the provider to make sure that
17092	* this can't happen.) However, because dtps_enable()
17093	* doesn't have a failure mode, there can be an
17094	* enable/unload race. Upshot: we don't want to
17095	* assert, but we're not going to disable the
17096	* probe, either.
17097	*/
17098
17099
17100	if (syncmode) {
17101	/ We're syncing, let's look at next in list /
17102	goto syncloop;
17103	}
17104
17105	lck_mtx_unlock(lck: &dtrace_lock);
17106	lck_mtx_unlock(lck: &mod_lock);
17107	lck_mtx_unlock(lck: &dtrace_provider_lock);
17108
17109	if (dtrace_err_verbose) {
17110	cmn_err(CE_WARN, "unloaded module '%s' had "
17111	"enabled probes", ctl->mod_modname);
17112	}
17113	return(-`1`);
17114	}
17115	}
17116
17117	probe = first;
17118
17119	for (first = NULL; probe != NULL; probe = next) {
17120	ASSERT(dtrace_probes[probe->dtpr_id - `1`] == probe);
17121
17122	dtrace_probes[probe->dtpr_id - `1`] = NULL;
17123	probe->dtpr_provider->dtpv_probe_count--;
17124
17125	next = probe->dtpr_nextmod;
17126	dtrace_hash_remove(hash: dtrace_byprov, elm: probe);
17127	dtrace_hash_remove(hash: dtrace_bymod, elm: probe);
17128	dtrace_hash_remove(hash: dtrace_byfunc, elm: probe);
17129	dtrace_hash_remove(hash: dtrace_byname, elm: probe);
17130
17131	if (first == NULL) {
17132	first = probe;
17133	probe->dtpr_nextmod = NULL;
17134	} else {
17135	probe->dtpr_nextmod = first;
17136	first = probe;
17137	}
17138	}
17139
17140	/*
17141	* We've removed all of the module's probes from the hash chains and
17142	* from the probe array. Now issue a dtrace_sync() to be sure that
17143	* everyone has cleared out from any probe array processing.
17144	*/
17145	dtrace_sync();
17146
17147	for (probe = first; probe != NULL; probe = first) {
17148	first = probe->dtpr_nextmod;
17149	prov = probe->dtpr_provider;
17150	prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
17151	probe->dtpr_arg);
17152	dtrace_strunref(str: probe->dtpr_mod);
17153	dtrace_strunref(str: probe->dtpr_func);
17154	dtrace_strunref(str: probe->dtpr_name);
17155	vmem_free(vmp: dtrace_arena, vaddr: (void *)(uintptr_t)probe->dtpr_id, size: `1`);
17156
17157	zfree(dtrace_probe_t_zone, probe);
17158	}
17159
17160	dtrace_modctl_remove(ctl);
17161
17162	if (syncmode)
17163	goto syncloop;
17164
17165	lck_mtx_unlock(lck: &dtrace_lock);
17166	lck_mtx_unlock(lck: &mod_lock);
17167	lck_mtx_unlock(lck: &dtrace_provider_lock);
17168
17169	return(`0`);
17170	}
17171
17172	void
17173	dtrace_suspend(void)
17174	{
17175	dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
17176	}
17177
17178	void
17179	dtrace_resume(void)
17180	{
17181	dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
17182	}
17183
17184	static int
17185	dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
17186	{
17187	LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
17188	lck_mtx_lock(lck: &dtrace_lock);
17189
17190	switch (what) {
17191	case CPU_CONFIG: {
17192	dtrace_state_t *state;
17193	dtrace_optval_t *opt, rs, c;
17194
17195	/*
17196	* For now, we only allocate a new buffer for anonymous state.
17197	*/
17198	if ((state = dtrace_anon.dta_state) == NULL)
17199	break;
17200
17201	if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
17202	break;
17203
17204	opt = state->dts_options;
17205	c = opt[DTRACEOPT_CPU];
17206
17207	if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
17208	break;
17209
17210	/*
17211	* Regardless of what the actual policy is, we're going to
17212	* temporarily set our resize policy to be manual. We're
17213	* also going to temporarily set our CPU option to denote
17214	* the newly configured CPU.
17215	*/
17216	rs = opt[DTRACEOPT_BUFRESIZE];
17217	opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
17218	opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
17219
17220	(void) dtrace_state_buffers(state);
17221
17222	opt[DTRACEOPT_BUFRESIZE] = rs;
17223	opt[DTRACEOPT_CPU] = c;
17224
17225	break;
17226	}
17227
17228	case CPU_UNCONFIG:
17229	/*
17230	* We don't free the buffer in the CPU_UNCONFIG case. (The
17231	* buffer will be freed when the consumer exits.)
17232	*/
17233	break;
17234
17235	default:
17236	break;
17237	}
17238
17239	lck_mtx_unlock(lck: &dtrace_lock);
17240	return (`0`);
17241	}
17242
17243	static void
17244	dtrace_cpu_setup_initial(processorid_t cpu)
17245	{
17246	(void) dtrace_cpu_setup(what: CPU_CONFIG, cpu);
17247	}
17248
17249	static void
17250	dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
17251	{
17252	if (dtrace_toxranges >= dtrace_toxranges_max) {
17253	int osize, nsize;
17254	dtrace_toxrange_t *range;
17255
17256	osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
17257
17258	if (osize == `0`) {
17259	ASSERT(dtrace_toxrange == NULL);
17260	ASSERT(dtrace_toxranges_max == `0`);
17261	dtrace_toxranges_max = `1`;
17262	} else {
17263	dtrace_toxranges_max <<= `1`;
17264	}
17265
17266	nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
17267	range = kmem_zalloc(nsize, KM_SLEEP);
17268
17269	if (dtrace_toxrange != NULL) {
17270	ASSERT(osize != `0`);
17271	bcopy(src: dtrace_toxrange, dst: range, n: osize);
17272	kmem_free(dtrace_toxrange, osize);
17273	}
17274
17275	dtrace_toxrange = range;
17276	}
17277
17278	ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == `0`);
17279	ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == `0`);
17280
17281	dtrace_toxrange[dtrace_toxranges].dtt_base = base;
17282	dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
17283	dtrace_toxranges++;
17284	}
17285
17286	/*
17287	* DTrace Driver Cookbook Functions
17288	*/
17289	/ARGSUSED/
17290	static int
17291	dtrace_attach(dev_info_t *devi)
17292	{
17293	dtrace_provider_id_t id;
17294	dtrace_state_t *state = NULL;
17295	dtrace_enabling_t *enab;
17296
17297	lck_mtx_lock(lck: &cpu_lock);
17298	lck_mtx_lock(lck: &dtrace_provider_lock);
17299	lck_mtx_lock(lck: &dtrace_lock);
17300
17301	/ Darwin uses BSD cloning device driver to automagically obtain minor device number. /
17302	dtrace_devi = devi;
17303
17304	dtrace_modload = dtrace_module_loaded;
17305	dtrace_modunload = dtrace_module_unloaded;
17306	dtrace_cpu_init = dtrace_cpu_setup_initial;
17307	dtrace_helpers_cleanup = dtrace_helpers_destroy;
17308	dtrace_helpers_fork = dtrace_helpers_duplicate;
17309	dtrace_cpustart_init = dtrace_suspend;
17310	dtrace_cpustart_fini = dtrace_resume;
17311	dtrace_debugger_init = dtrace_suspend;
17312	dtrace_debugger_fini = dtrace_resume;
17313
17314	register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
17315
17316	LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
17317
17318	dtrace_arena = vmem_create("dtrace", (void *)`1`, INT32_MAX, `1`,
17319	NULL, NULL, NULL, `0`, VM_SLEEP \| VMC_IDENTIFIER);
17320
17321	LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
17322
17323	dtrace_nprobes = dtrace_nprobes_default;
17324	dtrace_probes = kmem_zalloc(sizeof(dtrace_probe_t) dtrace_nprobes,
17325	KM_SLEEP);
17326
17327	dtrace_byprov = dtrace_hash_create(func: dtrace_strkey_probe_provider,
17328	arg: `0`, / unused /
17329	offsetof(dtrace_probe_t, dtpr_nextprov),
17330	offsetof(dtrace_probe_t, dtpr_prevprov));
17331
17332	dtrace_bymod = dtrace_hash_create(func: dtrace_strkey_deref_offset,
17333	offsetof(dtrace_probe_t, dtpr_mod),
17334	offsetof(dtrace_probe_t, dtpr_nextmod),
17335	offsetof(dtrace_probe_t, dtpr_prevmod));
17336
17337	dtrace_byfunc = dtrace_hash_create(func: dtrace_strkey_deref_offset,
17338	offsetof(dtrace_probe_t, dtpr_func),
17339	offsetof(dtrace_probe_t, dtpr_nextfunc),
17340	offsetof(dtrace_probe_t, dtpr_prevfunc));
17341
17342	dtrace_byname = dtrace_hash_create(func: dtrace_strkey_deref_offset,
17343	offsetof(dtrace_probe_t, dtpr_name),
17344	offsetof(dtrace_probe_t, dtpr_nextname),
17345	offsetof(dtrace_probe_t, dtpr_prevname));
17346
17347	if (dtrace_retain_max < `1`) {
17348	cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
17349	"setting to 1", dtrace_retain_max);
17350	dtrace_retain_max = `1`;
17351	}
17352
17353	/*
17354	* Now discover our toxic ranges.
17355	*/
17356	dtrace_toxic_ranges(dtrace_toxrange_add);
17357
17358	/*
17359	* Before we register ourselves as a provider to our own framework,
17360	* we would like to assert that dtrace_provider is NULL -- but that's
17361	* not true if we were loaded as a dependency of a DTrace provider.
17362	* Once we've registered, we can assert that dtrace_provider is our
17363	* pseudo provider.
17364	*/
17365	(void) dtrace_register(name: "dtrace", pap: &dtrace_provider_attr,
17366	DTRACE_PRIV_NONE, cr: `0`, pops: &dtrace_provider_ops, NULL, idp: &id);
17367
17368	ASSERT(dtrace_provider != NULL);
17369	ASSERT((dtrace_provider_id_t)dtrace_provider == id);
17370
17371	#if defined (__x86_64__)
17372	dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
17373	dtrace_provider, NULL, NULL, "BEGIN", `1`, NULL);
17374	dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
17375	dtrace_provider, NULL, NULL, "END", `0`, NULL);
17376	dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
17377	dtrace_provider, NULL, NULL, "ERROR", `3`, NULL);
17378	#elif defined(__arm64__)
17379	dtrace_probeid_begin = dtrace_probe_create(prov: (dtrace_provider_id_t)
17380	dtrace_provider, NULL, NULL, name: "BEGIN", aframes: `2`, NULL);
17381	dtrace_probeid_end = dtrace_probe_create(prov: (dtrace_provider_id_t)
17382	dtrace_provider, NULL, NULL, name: "END", aframes: `1`, NULL);
17383	dtrace_probeid_error = dtrace_probe_create(prov: (dtrace_provider_id_t)
17384	dtrace_provider, NULL, NULL, name: "ERROR", aframes: `4`, NULL);
17385	#else
17386	#error Unknown Architecture
17387	#endif
17388
17389	dtrace_anon_property();
17390	lck_mtx_unlock(lck: &cpu_lock);
17391
17392	/*
17393	* If DTrace helper tracing is enabled, we need to allocate the
17394	* trace buffer and initialize the values.
17395	*/
17396	if (dtrace_helptrace_enabled) {
17397	ASSERT(dtrace_helptrace_buffer == NULL);
17398	dtrace_helptrace_buffer =
17399	kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
17400	dtrace_helptrace_next = `0`;
17401	}
17402
17403	/*
17404	* If there are already providers, we must ask them to provide their
17405	* probes, and then match any anonymous enabling against them. Note
17406	* that there should be no other retained enablings at this time:
17407	* the only retained enablings at this time should be the anonymous
17408	* enabling.
17409	*/
17410	if (dtrace_anon.dta_enabling != NULL) {
17411	ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
17412
17413	/*
17414	* APPLE NOTE: if handling anonymous dof, switch symbol modes.
17415	*/
17416	if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) {
17417	dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL;
17418	}
17419
17420	dtrace_enabling_provide(NULL);
17421	state = dtrace_anon.dta_state;
17422
17423	/*
17424	* We couldn't hold cpu_lock across the above call to
17425	* dtrace_enabling_provide(), but we must hold it to actually
17426	* enable the probes. We have to drop all of our locks, pick
17427	* up cpu_lock, and regain our locks before matching the
17428	* retained anonymous enabling.
17429	*/
17430	lck_mtx_unlock(lck: &dtrace_lock);
17431	lck_mtx_unlock(lck: &dtrace_provider_lock);
17432
17433	lck_mtx_lock(lck: &cpu_lock);
17434	lck_mtx_lock(lck: &dtrace_provider_lock);
17435	lck_mtx_lock(lck: &dtrace_lock);
17436
17437	if ((enab = dtrace_anon.dta_enabling) != NULL)
17438	(void) dtrace_enabling_match(enab, NULL, NULL);
17439
17440	lck_mtx_unlock(lck: &cpu_lock);
17441	}
17442
17443	lck_mtx_unlock(lck: &dtrace_lock);
17444	lck_mtx_unlock(lck: &dtrace_provider_lock);
17445
17446	if (state != NULL) {
17447	/*
17448	* If we created any anonymous state, set it going now.
17449	*/
17450	(void) dtrace_state_go(state, cpu: &dtrace_anon.dta_beganon);
17451	}
17452
17453	return (DDI_SUCCESS);
17454	}
17455
17456	/ARGSUSED/
17457	static int
17458	dtrace_open(dev_t devp, int* flag, int otyp, cred_t *cred_p)
17459	{
17460	#pragma unused(flag, otyp)
17461	dtrace_state_t *state;
17462	uint32_t priv;
17463	uid_t uid;
17464	zoneid_t zoneid;
17465	int rv;
17466
17467	if (minor(devp) < `0` \|\| minor(devp) >= DTRACE_NCLIENTS)
17468	return (ENXIO);
17469
17470	/ APPLE: Darwin puts Helper on its own major device. /
17471
17472	/*
17473	* If no DTRACE_PRIV_* bits are set in the credential, then the
17474	* caller lacks sufficient permission to do anything with DTrace.
17475	*/
17476	dtrace_cred2priv(cr: cred_p, privp: &priv, uidp: &uid, zoneidp: &zoneid);
17477	if (priv == DTRACE_PRIV_NONE)
17478	return (EACCES);
17479
17480	/*
17481	* APPLE NOTE: We delay the initialization of fasttrap as late as possible.
17482	* It certainly can't be later than now!
17483	*/
17484	fasttrap_init();
17485
17486	/*
17487	* Ask all providers to provide all their probes.
17488	*/
17489	lck_mtx_lock(lck: &dtrace_provider_lock);
17490	dtrace_probe_provide(NULL, NULL);
17491	lck_mtx_unlock(lck: &dtrace_provider_lock);
17492
17493	lck_mtx_lock(lck: &cpu_lock);
17494	lck_mtx_lock(lck: &dtrace_lock);
17495	dtrace_opens++;
17496	dtrace_membar_producer();
17497
17498	#ifdef illumos
17499	/*
17500	* If the kernel debugger is active (that is, if the kernel debugger
17501	* modified text in some way), we won't allow the open.
17502	*/
17503	if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != `0`) {
17504	dtrace_opens--;
17505	lck_mtx_unlock(&dtrace_lock);
17506	lck_mtx_unlock(&cpu_lock);
17507	return (EBUSY);
17508	}
17509	#endif
17510
17511	rv = dtrace_state_create(devp, cr: cred_p, new_state: &state);
17512	lck_mtx_unlock(lck: &cpu_lock);
17513
17514	if (rv != `0` \|\| state == NULL) {
17515	if (--dtrace_opens == `0` && dtrace_anon.dta_enabling == NULL) {
17516	#ifdef illumos
17517	(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17518	#endif
17519	}
17520	lck_mtx_unlock(lck: &dtrace_lock);
17521	/ propagate EAGAIN or ERESTART /
17522	return (rv);
17523	}
17524
17525	lck_mtx_unlock(lck: &dtrace_lock);
17526
17527	lck_rw_lock_exclusive(lck: &dtrace_dof_mode_lock);
17528
17529	/*
17530	* If we are currently lazy, transition states.
17531	*
17532	* Unlike dtrace_close, we do not need to check the
17533	* value of dtrace_opens, as any positive value (and
17534	* we count as 1) means we transition states.
17535	*/
17536	if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON) {
17537	dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_OFF;
17538	/*
17539	* We do not need to hold the exclusive lock while processing
17540	* DOF on processes. We do need to make sure the mode does not get
17541	* changed to DTRACE_DOF_MODE_LAZY_ON during that stage though
17542	* (which should not happen anyway since it only happens in
17543	* dtrace_close). There is no way imcomplete USDT probes can be
17544	* activate by any DTrace clients here since they all have to
17545	* call dtrace_open and be blocked on dtrace_dof_mode_lock
17546	*/
17547	lck_rw_lock_exclusive_to_shared(lck: &dtrace_dof_mode_lock);
17548	/*
17549	* Iterate all existing processes and load lazy dofs.
17550	*/
17551	proc_iterate(PROC_ALLPROCLIST \| PROC_NOWAITTRANS,
17552	callout: dtrace_lazy_dofs_proc_iterate_doit,
17553	NULL,
17554	filterfn: dtrace_lazy_dofs_proc_iterate_filter,
17555	NULL);
17556
17557	lck_rw_unlock_shared(lck: &dtrace_dof_mode_lock);
17558	}
17559	else {
17560	lck_rw_unlock_exclusive(lck: &dtrace_dof_mode_lock);
17561	}
17562
17563
17564	/*
17565	* Update kernel symbol state.
17566	*
17567	* We must own the provider and dtrace locks.
17568	*
17569	* NOTE! It may appear there is a race by setting this value so late
17570	* after dtrace_probe_provide. However, any kext loaded after the
17571	* call to probe provide and before we set LAZY_OFF will be marked as
17572	* eligible for symbols from userspace. The same dtrace that is currently
17573	* calling dtrace_open() (this call!) will get a list of kexts needing
17574	* symbols and fill them in, thus closing the race window.
17575	*
17576	* We want to set this value only after it certain it will succeed, as
17577	* this significantly reduces the complexity of error exits.
17578	*/
17579	lck_mtx_lock(lck: &dtrace_lock);
17580	if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) {
17581	dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL;
17582	}
17583	lck_mtx_unlock(lck: &dtrace_lock);
17584
17585	/ Suspend cluster powerdown while DTrace device is opened. /
17586	suspend_cluster_powerdown();
17587	return (`0`);
17588	}
17589
17590	/ARGSUSED/
17591	static int
17592	dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
17593	{
17594	#pragma unused(flag, otyp, cred_p) /* __APPLE__ */
17595	minor_t minor = getminor(dev);
17596	dtrace_state_t *state;
17597
17598	/ APPLE NOTE: Darwin puts Helper on its own major device. /
17599	state = dtrace_state_get(minor);
17600
17601	lck_mtx_lock(lck: &cpu_lock);
17602	lck_mtx_lock(lck: &dtrace_lock);
17603
17604	if (state->dts_anon) {
17605	/*
17606	* There is anonymous state. Destroy that first.
17607	*/
17608	ASSERT(dtrace_anon.dta_state == NULL);
17609	dtrace_state_destroy(state: state->dts_anon);
17610	}
17611
17612	dtrace_state_destroy(state);
17613	ASSERT(dtrace_opens > `0`);
17614
17615	/*
17616	* Only relinquish control of the kernel debugger interface when there
17617	* are no consumers and no anonymous enablings.
17618	*/
17619	if (--dtrace_opens == `0` && dtrace_anon.dta_enabling == NULL) {
17620	#ifdef illumos
17621	(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17622	#endif
17623	}
17624
17625	lck_mtx_unlock(lck: &dtrace_lock);
17626	lck_mtx_unlock(lck: &cpu_lock);
17627
17628	/*
17629	* Lock ordering requires the dof mode lock be taken before
17630	* the dtrace_lock.
17631	*/
17632	lck_rw_lock_exclusive(lck: &dtrace_dof_mode_lock);
17633	lck_mtx_lock(lck: &dtrace_lock);
17634
17635	if (dtrace_opens == `0`) {
17636	/*
17637	* If we are currently lazy-off, and this is the last close, transition to
17638	* lazy state.
17639	*/
17640	if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF) {
17641	dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON;
17642	}
17643
17644	/*
17645	* If we are the last dtrace client, switch back to lazy (from userspace) symbols
17646	*/
17647	if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_KERNEL) {
17648	dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
17649	}
17650	}
17651
17652	lck_mtx_unlock(lck: &dtrace_lock);
17653	lck_rw_unlock_exclusive(lck: &dtrace_dof_mode_lock);
17654
17655	/*
17656	* Kext probes may be retained past the end of the kext's lifespan. The
17657	* probes are kept until the last reference to them has been removed.
17658	* Since closing an active dtrace context is likely to drop that last reference,
17659	* lets take a shot at cleaning out the orphaned probes now.
17660	*/
17661	dtrace_module_unloaded(NULL);
17662
17663	/ State is gone so resume cluster powerdown. /
17664	resume_cluster_powerdown();
17665	return (`0`);
17666	}
17667
17668	/ARGSUSED/
17669	static int
17670	dtrace_ioctl_helper(u_long cmd, caddr_t arg, int *rv)
17671	{
17672	#pragma unused(rv)
17673	/*
17674	* Safe to check this outside the dof mode lock
17675	*/
17676	if (dtrace_dof_mode == DTRACE_DOF_MODE_NEVER)
17677	return KERN_SUCCESS;
17678
17679	switch (cmd) {
17680	#if defined (__arm64__)
17681	case DTRACEHIOC_ADDDOF_U32:
17682	case DTRACEHIOC_ADDDOF_U64:
17683	#else
17684	case DTRACEHIOC_ADDDOF:
17685	#endif /* __arm64__*/
17686	{
17687	dof_helper_t *dhp = NULL;
17688	size_t dof_ioctl_data_size;
17689	dof_ioctl_data_t* multi_dof;
17690	unsigned int i;
17691	int rval = `0`;
17692	user_addr_t user_address = (user_addr_t)arg;
17693	uint64_t dof_count;
17694	int multi_dof_claimed = `0`;
17695	proc_t* p = current_proc();
17696
17697	/*
17698	* If this is a restricted process and dtrace is restricted,
17699	* do not allow DOFs to be registered
17700	*/
17701	if (dtrace_is_restricted() &&
17702	!dtrace_are_restrictions_relaxed() &&
17703	!dtrace_can_attach_to_proc(current_proc())) {
17704	return (EACCES);
17705	}
17706
17707	/*
17708	* Read the number of DOF sections being passed in.
17709	*/
17710	if (copyin(user_address + offsetof(dof_ioctl_data_t, dofiod_count),
17711	&dof_count,
17712	sizeof(dof_count))) {
17713	dtrace_dof_error(NULL, str: "failed to copyin dofiod_count");
17714	return (EFAULT);
17715	}
17716
17717	/*
17718	* Range check the count.
17719	*/
17720	if (dof_count == `0` \|\| dof_count > `1024`) {
17721	dtrace_dof_error(NULL, str: "dofiod_count is not valid");
17722	return (EINVAL);
17723	}
17724
17725	/*
17726	* Allocate a correctly sized structure and copyin the data.
17727	*/
17728	dof_ioctl_data_size = DOF_IOCTL_DATA_T_SIZE(dof_count);
17729	if ((multi_dof = kmem_alloc(dof_ioctl_data_size, KM_SLEEP)) == NULL)
17730	return (ENOMEM);
17731
17732	/ NOTE! We can no longer exit this method via return /
17733	if (copyin(user_address, multi_dof, dof_ioctl_data_size) != `0`) {
17734	dtrace_dof_error(NULL, str: "failed copyin of dof_ioctl_data_t");
17735	rval = EFAULT;
17736	goto cleanup;
17737	}
17738
17739	/*
17740	* Check that the count didn't change between the first copyin and the second.
17741	*/
17742	if (multi_dof->dofiod_count != dof_count) {
17743	rval = EINVAL;
17744	goto cleanup;
17745	}
17746
17747	/*
17748	* Try to process lazily first.
17749	*/
17750	rval = dtrace_lazy_dofs_add(p, incoming_dofs: multi_dof, dofs_claimed: &multi_dof_claimed);
17751
17752	/*
17753	* If rval is EACCES, we must be non-lazy.
17754	*/
17755	if (rval == EACCES) {
17756	rval = `0`;
17757	/*
17758	* Process each dof_helper_t
17759	*/
17760	i = `0`;
17761	do {
17762	dhp = &multi_dof->dofiod_helpers[i];
17763
17764	dof_hdr_t *dof = dtrace_dof_copyin(uarg: dhp->dofhp_dof, errp: &rval);
17765
17766	if (dof != NULL) {
17767	lck_mtx_lock(lck: &dtrace_meta_lock);
17768	lck_mtx_lock(lck: &dtrace_lock);
17769
17770	/*
17771	* dtrace_helper_slurp() takes responsibility for the dof --
17772	* it may free it now or it may save it and free it later.
17773	*/
17774	if ((dhp->dofhp_dof = (uint64_t)dtrace_helper_slurp(p, dof, dhp)) == -`1ULL`) {
17775	rval = EINVAL;
17776	}
17777
17778	lck_mtx_unlock(lck: &dtrace_lock);
17779	lck_mtx_unlock(lck: &dtrace_meta_lock);
17780	}
17781	} while (++i < multi_dof->dofiod_count && rval == `0`);
17782	}
17783
17784	/*
17785	* We need to copyout the multi_dof struct, because it contains
17786	* the generation (unique id) values needed to call DTRACEHIOC_REMOVE
17787	*
17788	* This could certainly be better optimized.
17789	*/
17790	if (copyout(multi_dof, user_address, dof_ioctl_data_size) != `0`) {
17791	dtrace_dof_error(NULL, str: "failed copyout of dof_ioctl_data_t");
17792	/ Don't overwrite pre-existing error code /
17793	if (rval == `0`) rval = EFAULT;
17794	}
17795
17796	cleanup:
17797	/*
17798	* If we had to allocate struct memory, free it.
17799	*/
17800	if (multi_dof != NULL && !multi_dof_claimed) {
17801	kmem_free(multi_dof, dof_ioctl_data_size);
17802	}
17803
17804	return rval;
17805	}
17806
17807	case DTRACEHIOC_REMOVE: {
17808	int generation = (int**)arg;
17809	proc_t* p = current_proc();
17810
17811	/*
17812	* Try lazy first.
17813	*/
17814	int rval = dtrace_lazy_dofs_remove(p, generation);
17815
17816	/*
17817	* EACCES means non-lazy
17818	*/
17819	if (rval == EACCES) {
17820	lck_mtx_lock(lck: &dtrace_meta_lock);
17821	lck_mtx_lock(lck: &dtrace_lock);
17822	rval = dtrace_helper_destroygen(p, gen: generation);
17823	lck_mtx_unlock(lck: &dtrace_lock);
17824	lck_mtx_unlock(lck: &dtrace_meta_lock);
17825	}
17826
17827	return (rval);
17828	}
17829
17830	default:
17831	break;
17832	}
17833
17834	return ENOTTY;
17835	}
17836
17837	/ARGSUSED/
17838	static int
17839	dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t cr, int* *rv)
17840	{
17841	#pragma unused(md)
17842	minor_t minor = getminor(dev);
17843	dtrace_state_t *state;
17844	int rval;
17845
17846	/ Darwin puts Helper on its own major device. /
17847
17848	state = dtrace_state_get(minor);
17849
17850	if (state->dts_anon) {
17851	ASSERT(dtrace_anon.dta_state == NULL);
17852	state = state->dts_anon;
17853	}
17854
17855	switch (cmd) {
17856	case DTRACEIOC_PROVIDER: {
17857	dtrace_providerdesc_t pvd;
17858	dtrace_provider_t *pvp;
17859
17860	if (copyin(arg, &pvd, sizeof (pvd)) != `0`)
17861	return (EFAULT);
17862
17863	pvd.dtvd_name[DTRACE_PROVNAMELEN - `1`] = `'\0'`;
17864	lck_mtx_lock(lck: &dtrace_provider_lock);
17865
17866	for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
17867	if (strncmp(s1: pvp->dtpv_name, s2: pvd.dtvd_name, DTRACE_PROVNAMELEN) == `0`)
17868	break;
17869	}
17870
17871	lck_mtx_unlock(lck: &dtrace_provider_lock);
17872
17873	if (pvp == NULL)
17874	return (ESRCH);
17875
17876	bcopy(src: &pvp->dtpv_priv, dst: &pvd.dtvd_priv, n: sizeof (dtrace_ppriv_t));
17877	bcopy(src: &pvp->dtpv_attr, dst: &pvd.dtvd_attr, n: sizeof (dtrace_pattr_t));
17878	if (copyout(&pvd, arg, sizeof (pvd)) != `0`)
17879	return (EFAULT);
17880
17881	return (`0`);
17882	}
17883
17884	case DTRACEIOC_EPROBE: {
17885	dtrace_eprobedesc_t epdesc;
17886	dtrace_ecb_t *ecb;
17887	dtrace_action_t *act;
17888	void *buf;
17889	size_t size;
17890	uintptr_t dest;
17891	int nrecs;
17892
17893	if (copyin(arg, &epdesc, sizeof (epdesc)) != `0`)
17894	return (EFAULT);
17895
17896	lck_mtx_lock(lck: &dtrace_lock);
17897
17898	if ((ecb = dtrace_epid2ecb(state, id: epdesc.dtepd_epid)) == NULL) {
17899	lck_mtx_unlock(lck: &dtrace_lock);
17900	return (EINVAL);
17901	}
17902
17903	if (ecb->dte_probe == NULL) {
17904	lck_mtx_unlock(lck: &dtrace_lock);
17905	return (EINVAL);
17906	}
17907
17908	epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
17909	epdesc.dtepd_uarg = ecb->dte_uarg;
17910	epdesc.dtepd_size = ecb->dte_size;
17911
17912	nrecs = epdesc.dtepd_nrecs;
17913	epdesc.dtepd_nrecs = `0`;
17914	for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17915	if (DTRACEACT_ISAGG(act->dta_kind) \|\| act->dta_intuple)
17916	continue;
17917
17918	epdesc.dtepd_nrecs++;
17919	}
17920
17921	/*
17922	* Now that we have the size, we need to allocate a temporary
17923	* buffer in which to store the complete description. We need
17924	* the temporary buffer to be able to drop dtrace_lock()
17925	* across the copyout(), below.
17926	*/
17927	size = sizeof (dtrace_eprobedesc_t) +
17928	(epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
17929
17930	buf = kmem_alloc(size, KM_SLEEP);
17931	dest = (uintptr_t)buf;
17932
17933	bcopy(src: &epdesc, dst: (void )dest, n: sizeof* (epdesc));
17934	dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[`0`]);
17935
17936	for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17937	if (DTRACEACT_ISAGG(act->dta_kind) \|\| act->dta_intuple)
17938	continue;
17939
17940	if (nrecs-- == `0`)
17941	break;
17942
17943	bcopy(src: &act->dta_rec, dst: (void *)dest,
17944	n: sizeof (dtrace_recdesc_t));
17945	dest += sizeof (dtrace_recdesc_t);
17946	}
17947
17948	lck_mtx_unlock(lck: &dtrace_lock);
17949
17950	if (copyout(buf, arg, dest - (uintptr_t)buf) != `0`) {
17951	kmem_free(buf, size);
17952	return (EFAULT);
17953	}
17954
17955	kmem_free(buf, size);
17956	return (`0`);
17957	}
17958
17959	case DTRACEIOC_AGGDESC: {
17960	dtrace_aggdesc_t aggdesc;
17961	dtrace_action_t *act;
17962	dtrace_aggregation_t *agg;
17963	int nrecs;
17964	uint32_t offs;
17965	dtrace_recdesc_t *lrec;
17966	void *buf;
17967	size_t size;
17968	uintptr_t dest;
17969
17970	if (copyin(arg, &aggdesc, sizeof (aggdesc)) != `0`)
17971	return (EFAULT);
17972
17973	lck_mtx_lock(lck: &dtrace_lock);
17974
17975	if ((agg = dtrace_aggid2agg(state, id: aggdesc.dtagd_id)) == NULL) {
17976	lck_mtx_unlock(lck: &dtrace_lock);
17977	return (EINVAL);
17978	}
17979
17980	aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
17981
17982	nrecs = aggdesc.dtagd_nrecs;
17983	aggdesc.dtagd_nrecs = `0`;
17984
17985	offs = agg->dtag_base;
17986	lrec = &agg->dtag_action.dta_rec;
17987	aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
17988
17989	for (act = agg->dtag_first; ; act = act->dta_next) {
17990	ASSERT(act->dta_intuple \|\|
17991	DTRACEACT_ISAGG(act->dta_kind));
17992
17993	/*
17994	* If this action has a record size of zero, it
17995	* denotes an argument to the aggregating action.
17996	* Because the presence of this record doesn't (or
17997	* shouldn't) affect the way the data is interpreted,
17998	* we don't copy it out to save user-level the
17999	* confusion of dealing with a zero-length record.
18000	*/
18001	if (act->dta_rec.dtrd_size == `0`) {
18002	ASSERT(agg->dtag_hasarg);
18003	continue;
18004	}
18005
18006	aggdesc.dtagd_nrecs++;
18007
18008	if (act == &agg->dtag_action)
18009	break;
18010	}
18011
18012	/*
18013	* Now that we have the size, we need to allocate a temporary
18014	* buffer in which to store the complete description. We need
18015	* the temporary buffer to be able to drop dtrace_lock()
18016	* across the copyout(), below.
18017	*/
18018	size = sizeof (dtrace_aggdesc_t) +
18019	(aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
18020
18021	buf = kmem_alloc(size, KM_SLEEP);
18022	dest = (uintptr_t)buf;
18023
18024	bcopy(src: &aggdesc, dst: (void )dest, n: sizeof* (aggdesc));
18025	dest += offsetof(dtrace_aggdesc_t, dtagd_rec[`0`]);
18026
18027	for (act = agg->dtag_first; ; act = act->dta_next) {
18028	dtrace_recdesc_t rec = act->dta_rec;
18029
18030	/*
18031	* See the comment in the above loop for why we pass
18032	* over zero-length records.
18033	*/
18034	if (rec.dtrd_size == `0`) {
18035	ASSERT(agg->dtag_hasarg);
18036	continue;
18037	}
18038
18039	if (nrecs-- == `0`)
18040	break;
18041
18042	rec.dtrd_offset -= offs;
18043	bcopy(src: &rec, dst: (void )dest, n: sizeof* (rec));
18044	dest += sizeof (dtrace_recdesc_t);
18045
18046	if (act == &agg->dtag_action)
18047	break;
18048	}
18049
18050	lck_mtx_unlock(lck: &dtrace_lock);
18051
18052	if (copyout(buf, arg, dest - (uintptr_t)buf) != `0`) {
18053	kmem_free(buf, size);
18054	return (EFAULT);
18055	}
18056
18057	kmem_free(buf, size);
18058	return (`0`);
18059	}
18060
18061	case DTRACEIOC_ENABLE: {
18062	dof_hdr_t *dof;
18063	dtrace_enabling_t *enab = NULL;
18064	dtrace_vstate_t *vstate;
18065	int err = `0`;
18066
18067	*rv = `0`;
18068
18069	/*
18070	* If a NULL argument has been passed, we take this as our
18071	* cue to reevaluate our enablings.
18072	*/
18073	if (arg == `0`) {
18074	dtrace_enabling_matchall();
18075
18076	return (`0`);
18077	}
18078
18079	if ((dof = dtrace_dof_copyin(uarg: arg, errp: &rval)) == NULL)
18080	return (rval);
18081
18082	lck_mtx_lock(lck: &cpu_lock);
18083	lck_mtx_lock(lck: &dtrace_lock);
18084	vstate = &state->dts_vstate;
18085
18086	if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
18087	lck_mtx_unlock(lck: &dtrace_lock);
18088	lck_mtx_unlock(lck: &cpu_lock);
18089	dtrace_dof_destroy(dof);
18090	return (EBUSY);
18091	}
18092
18093	if (dtrace_dof_slurp(dof, vstate, cr, enabp: &enab, ubase: `0`, noprobes: B_TRUE) != `0`) {
18094	lck_mtx_unlock(lck: &dtrace_lock);
18095	lck_mtx_unlock(lck: &cpu_lock);
18096	dtrace_dof_destroy(dof);
18097	return (EINVAL);
18098	}
18099
18100	if ((rval = dtrace_dof_options(dof, state)) != `0`) {
18101	dtrace_enabling_destroy(enab);
18102	lck_mtx_unlock(lck: &dtrace_lock);
18103	lck_mtx_unlock(lck: &cpu_lock);
18104	dtrace_dof_destroy(dof);
18105	return (rval);
18106	}
18107
18108	if ((err = dtrace_enabling_match(enab, nmatched: rv, NULL)) == `0`) {
18109	err = dtrace_enabling_retain(enab);
18110	} else {
18111	dtrace_enabling_destroy(enab);
18112	}
18113
18114	lck_mtx_unlock(lck: &dtrace_lock);
18115	lck_mtx_unlock(lck: &cpu_lock);
18116	dtrace_dof_destroy(dof);
18117
18118	return (err);
18119	}
18120
18121	case DTRACEIOC_REPLICATE: {
18122	dtrace_repldesc_t desc;
18123	dtrace_probedesc_t *match = &desc.dtrpd_match;
18124	dtrace_probedesc_t *create = &desc.dtrpd_create;
18125	int err;
18126
18127	if (copyin(arg, &desc, sizeof (desc)) != `0`)
18128	return (EFAULT);
18129
18130	match->dtpd_provider[DTRACE_PROVNAMELEN - `1`] = `'\0'`;
18131	match->dtpd_mod[DTRACE_MODNAMELEN - `1`] = `'\0'`;
18132	match->dtpd_func[DTRACE_FUNCNAMELEN - `1`] = `'\0'`;
18133	match->dtpd_name[DTRACE_NAMELEN - `1`] = `'\0'`;
18134
18135	create->dtpd_provider[DTRACE_PROVNAMELEN - `1`] = `'\0'`;
18136	create->dtpd_mod[DTRACE_MODNAMELEN - `1`] = `'\0'`;
18137	create->dtpd_func[DTRACE_FUNCNAMELEN - `1`] = `'\0'`;
18138	create->dtpd_name[DTRACE_NAMELEN - `1`] = `'\0'`;
18139
18140	lck_mtx_lock(lck: &dtrace_lock);
18141	err = dtrace_enabling_replicate(state, match, create);
18142	lck_mtx_unlock(lck: &dtrace_lock);
18143
18144	return (err);
18145	}
18146
18147	case DTRACEIOC_PROBEMATCH:
18148	case DTRACEIOC_PROBES: {
18149	dtrace_probe_t *probe = NULL;
18150	dtrace_probedesc_t desc;
18151	dtrace_probekey_t pkey;
18152	dtrace_id_t i;
18153	int m = `0`;
18154	uint32_t priv;
18155	uid_t uid;
18156	zoneid_t zoneid;
18157
18158	if (copyin(arg, &desc, sizeof (desc)) != `0`)
18159	return (EFAULT);
18160
18161	desc.dtpd_provider[DTRACE_PROVNAMELEN - `1`] = `'\0'`;
18162	desc.dtpd_mod[DTRACE_MODNAMELEN - `1`] = `'\0'`;
18163	desc.dtpd_func[DTRACE_FUNCNAMELEN - `1`] = `'\0'`;
18164	desc.dtpd_name[DTRACE_NAMELEN - `1`] = `'\0'`;
18165
18166	/*
18167	* Before we attempt to match this probe, we want to give
18168	* all providers the opportunity to provide it.
18169	*/
18170	if (desc.dtpd_id == DTRACE_IDNONE) {
18171	lck_mtx_lock(lck: &dtrace_provider_lock);
18172	dtrace_probe_provide(desc: &desc, NULL);
18173	lck_mtx_unlock(lck: &dtrace_provider_lock);
18174	desc.dtpd_id++;
18175	}
18176
18177	dtrace_cred2priv(cr, privp: &priv, uidp: &uid, zoneidp: &zoneid);
18178
18179	lck_mtx_lock(lck: &dtrace_lock);
18180
18181	if (cmd == DTRACEIOC_PROBEMATCH) {
18182	dtrace_probekey(pdp: &desc, pkp: &pkey);
18183	pkey.dtpk_id = DTRACE_IDNONE;
18184
18185	/ Quiet compiler warning /
18186	for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
18187	if ((probe = dtrace_probes[i - `1`]) != NULL &&
18188	(m = dtrace_match_probe(prp: probe, pkp: &pkey,
18189	priv, uid, zoneid)) != `0`)
18190	break;
18191	}
18192
18193	if (m < `0`) {
18194	lck_mtx_unlock(lck: &dtrace_lock);
18195	return (EINVAL);
18196	}
18197	dtrace_probekey_release(pkp: &pkey);
18198
18199	} else {
18200	/ Quiet compiler warning /
18201	for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
18202	if ((probe = dtrace_probes[i - `1`]) != NULL &&
18203	dtrace_match_priv(prp: probe, priv, uid, zoneid))
18204	break;
18205	}
18206	}
18207
18208	if (probe == NULL) {
18209	lck_mtx_unlock(lck: &dtrace_lock);
18210	return (ESRCH);
18211	}
18212
18213	dtrace_probe_description(prp: probe, pdp: &desc);
18214	lck_mtx_unlock(lck: &dtrace_lock);
18215
18216	if (copyout(&desc, arg, sizeof (desc)) != `0`)
18217	return (EFAULT);
18218
18219	return (`0`);
18220	}
18221
18222	case DTRACEIOC_PROBEARG: {
18223	dtrace_argdesc_t desc;
18224	dtrace_probe_t *probe;
18225	dtrace_provider_t *prov;
18226
18227	if (copyin(arg, &desc, sizeof (desc)) != `0`)
18228	return (EFAULT);
18229
18230	if (desc.dtargd_id == DTRACE_IDNONE)
18231	return (EINVAL);
18232
18233	if (desc.dtargd_ndx == DTRACE_ARGNONE)
18234	return (EINVAL);
18235
18236	lck_mtx_lock(lck: &dtrace_provider_lock);
18237	lck_mtx_lock(lck: &mod_lock);
18238	lck_mtx_lock(lck: &dtrace_lock);
18239
18240	/ Quiet compiler warning /
18241	if (desc.dtargd_id > (dtrace_id_t)dtrace_nprobes) {
18242	lck_mtx_unlock(lck: &dtrace_lock);
18243	lck_mtx_unlock(lck: &mod_lock);
18244	lck_mtx_unlock(lck: &dtrace_provider_lock);
18245	return (EINVAL);
18246	}
18247
18248	if ((probe = dtrace_probes[desc.dtargd_id - `1`]) == NULL) {
18249	lck_mtx_unlock(lck: &dtrace_lock);
18250	lck_mtx_unlock(lck: &mod_lock);
18251	lck_mtx_unlock(lck: &dtrace_provider_lock);
18252	return (EINVAL);
18253	}
18254
18255	lck_mtx_unlock(lck: &dtrace_lock);
18256
18257	prov = probe->dtpr_provider;
18258
18259	if (prov->dtpv_pops.dtps_getargdesc == NULL) {
18260	/*
18261	* There isn't any typed information for this probe.
18262	* Set the argument number to DTRACE_ARGNONE.
18263	*/
18264	desc.dtargd_ndx = DTRACE_ARGNONE;
18265	} else {
18266	desc.dtargd_native[`0`] = `'\0'`;
18267	desc.dtargd_xlate[`0`] = `'\0'`;
18268	desc.dtargd_mapping = desc.dtargd_ndx;
18269
18270	prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
18271	probe->dtpr_id, probe->dtpr_arg, &desc);
18272	}
18273
18274	lck_mtx_unlock(lck: &mod_lock);
18275	lck_mtx_unlock(lck: &dtrace_provider_lock);
18276
18277	if (copyout(&desc, arg, sizeof (desc)) != `0`)
18278	return (EFAULT);
18279
18280	return (`0`);
18281	}
18282
18283	case DTRACEIOC_GO: {
18284	processorid_t cpuid;
18285	rval = dtrace_state_go(state, cpu: &cpuid);
18286
18287	if (rval != `0`)
18288	return (rval);
18289
18290	if (copyout(&cpuid, arg, sizeof (cpuid)) != `0`)
18291	return (EFAULT);
18292
18293	return (`0`);
18294	}
18295
18296	case DTRACEIOC_STOP: {
18297	processorid_t cpuid;
18298
18299	lck_mtx_lock(lck: &dtrace_lock);
18300	rval = dtrace_state_stop(state, cpu: &cpuid);
18301	lck_mtx_unlock(lck: &dtrace_lock);
18302
18303	if (rval != `0`)
18304	return (rval);
18305
18306	if (copyout(&cpuid, arg, sizeof (cpuid)) != `0`)
18307	return (EFAULT);
18308
18309	return (`0`);
18310	}
18311
18312	case DTRACEIOC_DOFGET: {
18313	dof_hdr_t hdr, *dof;
18314	uint64_t len;
18315
18316	if (copyin(arg, &hdr, sizeof (hdr)) != `0`)
18317	return (EFAULT);
18318
18319	lck_mtx_lock(lck: &dtrace_lock);
18320	dof = dtrace_dof_create(state);
18321	lck_mtx_unlock(lck: &dtrace_lock);
18322
18323	len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
18324	rval = copyout(dof, arg, len);
18325	dtrace_dof_destroy(dof);
18326
18327	return (rval == `0` ? `0` : EFAULT);
18328	}
18329
18330	case DTRACEIOC_SLEEP: {
18331	int64_t time;
18332	uint64_t abstime;
18333	uint64_t rvalue = DTRACE_WAKE_TIMEOUT;
18334
18335	if (copyin(arg, &time, sizeof(time)) != `0`)
18336	return (EFAULT);
18337
18338	nanoseconds_to_absolutetime(nanoseconds: (uint64_t)time, result: &abstime);
18339	clock_absolutetime_interval_to_deadline(abstime, result: &abstime);
18340
18341	if (assert_wait_deadline(event: state, THREAD_ABORTSAFE, deadline: abstime) == THREAD_WAITING) {
18342	if (state->dts_buf_over_limit > `0`) {
18343	clear_wait(thread: current_thread(), THREAD_INTERRUPTED);
18344	rvalue = DTRACE_WAKE_BUF_LIMIT;
18345	} else {
18346	thread_block(THREAD_CONTINUE_NULL);
18347	if (state->dts_buf_over_limit > `0`) {
18348	rvalue = DTRACE_WAKE_BUF_LIMIT;
18349	}
18350	}
18351	}
18352
18353	if (copyout(&rvalue, arg, sizeof(rvalue)) != `0`)
18354	return (EFAULT);
18355
18356	return (`0`);
18357	}
18358
18359	case DTRACEIOC_SIGNAL: {
18360	wakeup(chan: state);
18361	return (`0`);
18362	}
18363
18364	case DTRACEIOC_AGGSNAP:
18365	case DTRACEIOC_BUFSNAP: {
18366	dtrace_bufdesc_t desc;
18367	caddr_t cached;
18368	boolean_t over_limit;
18369	dtrace_buffer_t *buf;
18370
18371	if (copyin(arg, &desc, sizeof (desc)) != `0`)
18372	return (EFAULT);
18373
18374	if ((int)desc.dtbd_cpu < `0` \|\| desc.dtbd_cpu >= NCPU)
18375	return (EINVAL);
18376
18377	lck_mtx_lock(lck: &dtrace_lock);
18378
18379	if (cmd == DTRACEIOC_BUFSNAP) {
18380	buf = &state->dts_buffer[desc.dtbd_cpu];
18381	} else {
18382	buf = &state->dts_aggbuffer[desc.dtbd_cpu];
18383	}
18384
18385	if (buf->dtb_flags & (DTRACEBUF_RING \| DTRACEBUF_FILL)) {
18386	size_t sz = buf->dtb_offset;
18387
18388	if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
18389	lck_mtx_unlock(lck: &dtrace_lock);
18390	return (EBUSY);
18391	}
18392
18393	/*
18394	* If this buffer has already been consumed, we're
18395	* going to indicate that there's nothing left here
18396	* to consume.
18397	*/
18398	if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
18399	lck_mtx_unlock(lck: &dtrace_lock);
18400
18401	desc.dtbd_size = `0`;
18402	desc.dtbd_drops = `0`;
18403	desc.dtbd_errors = `0`;
18404	desc.dtbd_oldest = `0`;
18405	sz = sizeof (desc);
18406
18407	if (copyout(&desc, arg, sz) != `0`)
18408	return (EFAULT);
18409
18410	return (`0`);
18411	}
18412
18413	/*
18414	* If this is a ring buffer that has wrapped, we want
18415	* to copy the whole thing out.
18416	*/
18417	if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
18418	dtrace_buffer_polish(buf);
18419	sz = buf->dtb_size;
18420	}
18421
18422	if (copyout(buf->dtb_tomax, (user_addr_t)desc.dtbd_data, sz) != `0`) {
18423	lck_mtx_unlock(lck: &dtrace_lock);
18424	return (EFAULT);
18425	}
18426
18427	desc.dtbd_size = sz;
18428	desc.dtbd_drops = buf->dtb_drops;
18429	desc.dtbd_errors = buf->dtb_errors;
18430	desc.dtbd_oldest = buf->dtb_xamot_offset;
18431	desc.dtbd_timestamp = dtrace_gethrtime();
18432
18433	lck_mtx_unlock(lck: &dtrace_lock);
18434
18435	if (copyout(&desc, arg, sizeof (desc)) != `0`)
18436	return (EFAULT);
18437
18438	buf->dtb_flags \|= DTRACEBUF_CONSUMED;
18439
18440	return (`0`);
18441	}
18442
18443	if (buf->dtb_tomax == NULL) {
18444	ASSERT(buf->dtb_xamot == NULL);
18445	lck_mtx_unlock(lck: &dtrace_lock);
18446	return (ENOENT);
18447	}
18448
18449	cached = buf->dtb_tomax;
18450	over_limit = buf->dtb_cur_limit == buf->dtb_size;
18451
18452	ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
18453
18454	dtrace_xcall(desc.dtbd_cpu,
18455	(dtrace_xcall_t)dtrace_buffer_switch, buf);
18456
18457	state->dts_errors += buf->dtb_xamot_errors;
18458
18459	/*
18460	* If the buffers did not actually switch, then the cross call
18461	* did not take place -- presumably because the given CPU is
18462	* not in the ready set. If this is the case, we'll return
18463	* ENOENT.
18464	*/
18465	if (buf->dtb_tomax == cached) {
18466	ASSERT(buf->dtb_xamot != cached);
18467	lck_mtx_unlock(lck: &dtrace_lock);
18468	return (ENOENT);
18469	}
18470
18471	ASSERT(cached == buf->dtb_xamot);
18472	/*
18473	* At this point we know the buffer have switched, so we
18474	* can decrement the over limit count if the buffer was over
18475	* its limit. The new buffer might already be over its limit
18476	* yet, but we don't care since we're guaranteed not to be
18477	* checking the buffer over limit count at this point.
18478	*/
18479	if (over_limit) {
18480	uint32_t old = os_atomic_dec_orig(&state->dts_buf_over_limit, relaxed);
18481	#pragma unused(old)
18482
18483	/*
18484	* Verify that we didn't underflow the value
18485	*/
18486	ASSERT(old != `0`);
18487	}
18488
18489	/*
18490	* We have our snapshot; now copy it out.
18491	*/
18492	if (dtrace_buffer_copyout(buf->dtb_xamot,
18493	(user_addr_t)desc.dtbd_data,
18494	buf->dtb_xamot_offset) != `0`) {
18495	lck_mtx_unlock(lck: &dtrace_lock);
18496	return (EFAULT);
18497	}
18498
18499	desc.dtbd_size = buf->dtb_xamot_offset;
18500	desc.dtbd_drops = buf->dtb_xamot_drops;
18501	desc.dtbd_errors = buf->dtb_xamot_errors;
18502	desc.dtbd_oldest = `0`;
18503	desc.dtbd_timestamp = buf->dtb_switched;
18504
18505	lck_mtx_unlock(lck: &dtrace_lock);
18506
18507	/*
18508	* Finally, copy out the buffer description.
18509	*/
18510	if (copyout(&desc, arg, sizeof (desc)) != `0`)
18511	return (EFAULT);
18512
18513	return (`0`);
18514	}
18515
18516	case DTRACEIOC_CONF: {
18517	dtrace_conf_t conf;
18518
18519	bzero(s: &conf, n: sizeof (conf));
18520	conf.dtc_difversion = DIF_VERSION;
18521	conf.dtc_difintregs = DIF_DIR_NREGS;
18522	conf.dtc_diftupregs = DIF_DTR_NREGS;
18523	conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
18524
18525	if (copyout(&conf, arg, sizeof (conf)) != `0`)
18526	return (EFAULT);
18527
18528	return (`0`);
18529	}
18530
18531	case DTRACEIOC_STATUS: {
18532	dtrace_status_t stat;
18533	dtrace_dstate_t *dstate;
18534	int j;
18535	uint64_t nerrs;
18536
18537	/*
18538	* See the comment in dtrace_state_deadman() for the reason
18539	* for setting dts_laststatus to INT64_MAX before setting
18540	* it to the correct value.
18541	*/
18542	state->dts_laststatus = INT64_MAX;
18543	dtrace_membar_producer();
18544	state->dts_laststatus = dtrace_gethrtime();
18545
18546	bzero(s: &stat, n: sizeof (stat));
18547
18548	lck_mtx_lock(lck: &dtrace_lock);
18549
18550	if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
18551	lck_mtx_unlock(lck: &dtrace_lock);
18552	return (ENOENT);
18553	}
18554
18555	if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
18556	stat.dtst_exiting = `1`;
18557
18558	nerrs = state->dts_errors;
18559	dstate = &state->dts_vstate.dtvs_dynvars;
18560
18561	zpercpu_foreach_cpu(i) {
18562	dtrace_dstate_percpu_t *dcpu = zpercpu_get_cpu(dstate->dtds_percpu, i);
18563
18564	stat.dtst_dyndrops += dcpu->dtdsc_drops;
18565	stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
18566	stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
18567
18568	if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
18569	stat.dtst_filled++;
18570
18571	nerrs += state->dts_buffer[i].dtb_errors;
18572
18573	for (j = `0`; j < state->dts_nspeculations; j++) {
18574	dtrace_speculation_t *spec;
18575	dtrace_buffer_t *buf;
18576
18577	spec = &state->dts_speculations[j];
18578	buf = &spec->dtsp_buffer[i];
18579	stat.dtst_specdrops += buf->dtb_xamot_drops;
18580	}
18581	}
18582
18583	stat.dtst_specdrops_busy = state->dts_speculations_busy;
18584	stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
18585	stat.dtst_stkstroverflows = state->dts_stkstroverflows;
18586	stat.dtst_dblerrors = state->dts_dblerrors;
18587	stat.dtst_killed =
18588	(state->dts_activity == DTRACE_ACTIVITY_KILLED);
18589	stat.dtst_errors = nerrs;
18590
18591	lck_mtx_unlock(lck: &dtrace_lock);
18592
18593	if (copyout(&stat, arg, sizeof (stat)) != `0`)
18594	return (EFAULT);
18595
18596	return (`0`);
18597	}
18598
18599	case DTRACEIOC_FORMAT: {
18600	dtrace_fmtdesc_t fmt;
18601	char *str;
18602	int len;
18603
18604	if (copyin(arg, &fmt, sizeof (fmt)) != `0`)
18605	return (EFAULT);
18606
18607	lck_mtx_lock(lck: &dtrace_lock);
18608
18609	if (fmt.dtfd_format == `0` \|\|
18610	fmt.dtfd_format > state->dts_nformats) {
18611	lck_mtx_unlock(lck: &dtrace_lock);
18612	return (EINVAL);
18613	}
18614
18615	/*
18616	* Format strings are allocated contiguously and they are
18617	* never freed; if a format index is less than the number
18618	* of formats, we can assert that the format map is non-NULL
18619	* and that the format for the specified index is non-NULL.
18620	*/
18621	ASSERT(state->dts_formats != NULL);
18622	str = state->dts_formats[fmt.dtfd_format - `1`]->dtf_str;
18623	ASSERT(str != NULL);
18624
18625	len = strlen(s: str) + `1`;
18626
18627	if (len > fmt.dtfd_length) {
18628	fmt.dtfd_length = len;
18629
18630	if (copyout(&fmt, arg, sizeof (fmt)) != `0`) {
18631	lck_mtx_unlock(lck: &dtrace_lock);
18632	return (EINVAL);
18633	}
18634	} else {
18635	if (copyout(str, (user_addr_t)fmt.dtfd_string, len) != `0`) {
18636	lck_mtx_unlock(lck: &dtrace_lock);
18637	return (EINVAL);
18638	}
18639	}
18640
18641	lck_mtx_unlock(lck: &dtrace_lock);
18642	return (`0`);
18643	}
18644
18645	case DTRACEIOC_MODUUIDSLIST: {
18646	size_t module_uuids_list_size;
18647	dtrace_module_uuids_list_t* uuids_list;
18648	uint64_t dtmul_count;
18649
18650	/*
18651	* Security restrictions make this operation illegal, if this is enabled DTrace
18652	* must refuse to provide any fbt probes.
18653	*/
18654	if (dtrace_fbt_probes_restricted()) {
18655	cmn_err(CE_WARN, "security restrictions disallow DTRACEIOC_MODUUIDSLIST");
18656	return (EPERM);
18657	}
18658
18659	/*
18660	* Fail if the kernel symbol mode makes this operation illegal.
18661	* Both NEVER & ALWAYS_FROM_KERNEL are permanent states, it is legal to check
18662	* for them without holding the dtrace_lock.
18663	*/
18664	if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER \|\|
18665	dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
18666	cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_MODUUIDSLIST", dtrace_kernel_symbol_mode);
18667	return (EPERM);
18668	}
18669
18670	/*
18671	* Read the number of symbolsdesc structs being passed in.
18672	*/
18673	if (copyin(arg + offsetof(dtrace_module_uuids_list_t, dtmul_count),
18674	&dtmul_count, sizeof(dtmul_count)) != `0`) {
18675	cmn_err(CE_WARN, "failed to copyin dtmul_count");
18676	return (EFAULT);
18677	}
18678
18679	/*
18680	* Range check the count. More than 2k kexts is probably an error.
18681	*/
18682	if (dtmul_count > `2048`) {
18683	cmn_err(CE_WARN, "dtmul_count is not valid");
18684	return (EINVAL);
18685	}
18686
18687	/*
18688	* For all queries, we return EINVAL when the user specified
18689	* count does not match the actual number of modules we find
18690	* available.
18691	*
18692	* If the user specified count is zero, then this serves as a
18693	* simple query to count the available modules in need of symbols.
18694	*/
18695
18696	rval = `0`;
18697
18698	if (dtmul_count == `0`)
18699	{
18700	lck_mtx_lock(lck: &mod_lock);
18701	struct modctl* ctl = dtrace_modctl_list;
18702	while (ctl) {
18703	ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
18704	if (!MOD_SYMBOLS_DONE(ctl) && !MOD_IS_STATIC_KEXT(ctl)) {
18705	dtmul_count++;
18706	rval = EINVAL;
18707	}
18708	ctl = ctl->mod_next;
18709	}
18710	lck_mtx_unlock(lck: &mod_lock);
18711
18712	if (copyout(&dtmul_count, arg, sizeof (dtmul_count)) != `0`)
18713	return (EFAULT);
18714	else
18715	return (rval);
18716	}
18717
18718	/*
18719	* If we reach this point, then we have a request for full list data.
18720	* Allocate a correctly sized structure and copyin the data.
18721	*/
18722	module_uuids_list_size = DTRACE_MODULE_UUIDS_LIST_SIZE(dtmul_count);
18723	if ((uuids_list = kmem_alloc(module_uuids_list_size, KM_SLEEP)) == NULL)
18724	return (ENOMEM);
18725
18726	/ NOTE! We can no longer exit this method via return /
18727	if (copyin(arg, uuids_list, module_uuids_list_size) != `0`) {
18728	cmn_err(CE_WARN, "failed copyin of dtrace_module_uuids_list_t");
18729	rval = EFAULT;
18730	goto moduuidslist_cleanup;
18731	}
18732
18733	/*
18734	* Check that the count didn't change between the first copyin and the second.
18735	*/
18736	if (uuids_list->dtmul_count != dtmul_count) {
18737	rval = EINVAL;
18738	goto moduuidslist_cleanup;
18739	}
18740
18741	/*
18742	* Build the list of UUID's that need symbols
18743	*/
18744	lck_mtx_lock(lck: &mod_lock);
18745
18746	dtmul_count = `0`;
18747
18748	struct modctl* ctl = dtrace_modctl_list;
18749	while (ctl) {
18750	/*
18751	* We assume that userspace symbols will be "better" than kernel level symbols,
18752	* as userspace can search for dSYM(s) and symbol'd binaries. Even if kernel syms
18753	* are available, add user syms if the module might use them.
18754	*/
18755	ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
18756	if (!MOD_SYMBOLS_DONE(ctl) && !MOD_IS_STATIC_KEXT(ctl)) {
18757	UUID* uuid = &uuids_list->dtmul_uuid[dtmul_count];
18758	if (dtmul_count++ < uuids_list->dtmul_count) {
18759	memcpy(dst: uuid, src: ctl->mod_uuid, n: sizeof(UUID));
18760	}
18761	}
18762	ctl = ctl->mod_next;
18763	}
18764
18765	lck_mtx_unlock(lck: &mod_lock);
18766
18767	if (uuids_list->dtmul_count < dtmul_count)
18768	rval = EINVAL;
18769
18770	uuids_list->dtmul_count = dtmul_count;
18771
18772	/*
18773	* Copyout the symbols list (or at least the count!)
18774	*/
18775	if (copyout(uuids_list, arg, module_uuids_list_size) != `0`) {
18776	cmn_err(CE_WARN, "failed copyout of dtrace_symbolsdesc_list_t");
18777	rval = EFAULT;
18778	}
18779
18780	moduuidslist_cleanup:
18781	/*
18782	* If we had to allocate struct memory, free it.
18783	*/
18784	if (uuids_list != NULL) {
18785	kmem_free(uuids_list, module_uuids_list_size);
18786	}
18787
18788	return rval;
18789	}
18790
18791	case DTRACEIOC_PROVMODSYMS: {
18792	size_t module_symbols_size;
18793	dtrace_module_symbols_t* module_symbols;
18794	uint64_t dtmodsyms_count;
18795
18796	/*
18797	* Security restrictions make this operation illegal, if this is enabled DTrace
18798	* must refuse to provide any fbt probes.
18799	*/
18800	if (dtrace_fbt_probes_restricted()) {
18801	cmn_err(CE_WARN, "security restrictions disallow DTRACEIOC_MODUUIDSLIST");
18802	return (EPERM);
18803	}
18804
18805	/*
18806	* Fail if the kernel symbol mode makes this operation illegal.
18807	* Both NEVER & ALWAYS_FROM_KERNEL are permanent states, it is legal to check
18808	* for them without holding the dtrace_lock.
18809	*/
18810	if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER \|\|
18811	dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
18812	cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_PROVMODSYMS", dtrace_kernel_symbol_mode);
18813	return (EPERM);
18814	}
18815
18816	/*
18817	* Read the number of module symbols structs being passed in.
18818	*/
18819	if (copyin(arg + offsetof(dtrace_module_symbols_t, dtmodsyms_count),
18820	&dtmodsyms_count, sizeof(dtmodsyms_count)) != `0`) {
18821	cmn_err(CE_WARN, "failed to copyin dtmodsyms_count");
18822	return (EFAULT);
18823	}
18824
18825	/ Ensure that we have at least one symbol. /
18826	if (dtmodsyms_count == `0`) {
18827	cmn_err(CE_WARN, "Invalid dtmodsyms_count value");
18828	return (EINVAL);
18829	}
18830
18831	/ Safely calculate size we need for copyin buffer. /
18832	module_symbols_size = DTRACE_MODULE_SYMBOLS_SIZE(dtmodsyms_count);
18833	if (module_symbols_size == `0` \|\| module_symbols_size > (size_t)dtrace_copy_maxsize()) {
18834	cmn_err(CE_WARN, "Invalid module_symbols_size %ld", module_symbols_size);
18835	return (EINVAL);
18836	}
18837
18838	if ((module_symbols = kmem_alloc(module_symbols_size, KM_SLEEP)) == NULL)
18839	return (ENOMEM);
18840
18841	rval = `0`;
18842
18843	/ NOTE! We can no longer exit this method via return /
18844	if (copyin(arg, module_symbols, module_symbols_size) != `0`) {
18845	cmn_err(CE_WARN, "failed copyin of dtrace_module_symbols_t");
18846	rval = EFAULT;
18847	goto module_symbols_cleanup;
18848	}
18849
18850	/*
18851	* Check that the count didn't change between the first copyin and the second.
18852	*/
18853	if (module_symbols->dtmodsyms_count != dtmodsyms_count) {
18854	rval = EINVAL;
18855	goto module_symbols_cleanup;
18856	}
18857
18858	/*
18859	* Find the modctl to add symbols to.
18860	*/
18861	lck_mtx_lock(lck: &dtrace_provider_lock);
18862	lck_mtx_lock(lck: &mod_lock);
18863
18864	struct modctl* ctl = dtrace_modctl_list;
18865	while (ctl) {
18866	ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
18867	if (MOD_HAS_UUID(ctl) && !MOD_SYMBOLS_DONE(ctl) && memcmp(s1: module_symbols->dtmodsyms_uuid, s2: ctl->mod_uuid, n: sizeof(UUID)) == `0`) {
18868	dtrace_provider_t *prv;
18869	ctl->mod_user_symbols = module_symbols;
18870
18871	/*
18872	* We're going to call each providers per-module provide operation
18873	* specifying only this module.
18874	*/
18875	for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
18876	prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
18877	/*
18878	* We gave every provider a chance to provide with the user syms, go ahead and clear them
18879	*/
18880	ctl->mod_user_symbols = NULL; / MUST reset this to clear HAS_USERSPACE_SYMBOLS /
18881	}
18882	ctl = ctl->mod_next;
18883	}
18884
18885	lck_mtx_unlock(lck: &mod_lock);
18886	lck_mtx_unlock(lck: &dtrace_provider_lock);
18887
18888	module_symbols_cleanup:
18889	/*
18890	* If we had to allocate struct memory, free it.
18891	*/
18892	if (module_symbols != NULL) {
18893	kmem_free(module_symbols, module_symbols_size);
18894	}
18895
18896	return rval;
18897	}
18898
18899	case DTRACEIOC_PROCWAITFOR: {
18900	dtrace_procdesc_t pdesc = {
18901	.p_name = {`0`},
18902	.p_pid = -`1`
18903	};
18904
18905	if ((rval = copyin(arg, &pdesc, sizeof(pdesc))) != `0`)
18906	goto proc_waitfor_error;
18907
18908	if ((rval = dtrace_proc_waitfor(&pdesc)) != `0`)
18909	goto proc_waitfor_error;
18910
18911	if ((rval = copyout(&pdesc, arg, sizeof(pdesc))) != `0`)
18912	goto proc_waitfor_error;
18913
18914	return `0`;
18915
18916	proc_waitfor_error:
18917	/ The process was suspended, revert this since the client will not do it. /
18918	if (pdesc.p_pid != -`1`) {
18919	proc_t *proc = proc_find(pid: pdesc.p_pid);
18920	if (proc != PROC_NULL) {
18921	task_pidresume(task: proc_task(proc));
18922	proc_rele(p: proc);
18923	}
18924	}
18925
18926	return rval;
18927	}
18928
18929	default:
18930	break;
18931	}
18932
18933	return (ENOTTY);
18934	}
18935
18936	/*
18937	* APPLE NOTE: dtrace_detach not implemented
18938	*/
18939	#if !defined(__APPLE__)
18940	/ARGSUSED/
18941	static int
18942	dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
18943	{
18944	dtrace_state_t *state;
18945
18946	switch (cmd) {
18947	case DDI_DETACH:
18948	break;
18949
18950	case DDI_SUSPEND:
18951	return (DDI_SUCCESS);
18952
18953	default:
18954	return (DDI_FAILURE);
18955	}
18956
18957	lck_mtx_lock(&cpu_lock);
18958	lck_mtx_lock(&dtrace_provider_lock);
18959	lck_mtx_lock(&dtrace_lock);
18960
18961	ASSERT(dtrace_opens == `0`);
18962
18963	if (dtrace_helpers > `0`) {
18964	lck_mtx_unlock(&dtrace_lock);
18965	lck_mtx_unlock(&dtrace_provider_lock);
18966	lck_mtx_unlock(&cpu_lock);
18967	return (DDI_FAILURE);
18968	}
18969
18970	if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != `0`) {
18971	lck_mtx_unlock(&dtrace_lock);
18972	lck_mtx_unlock(&dtrace_provider_lock);
18973	lck_mtx_unlock(&cpu_lock);
18974	return (DDI_FAILURE);
18975	}
18976
18977	dtrace_provider = NULL;
18978
18979	if ((state = dtrace_anon_grab()) != NULL) {
18980	/*
18981	* If there were ECBs on this state, the provider should
18982	* have not been allowed to detach; assert that there is
18983	* none.
18984	*/
18985	ASSERT(state->dts_necbs == `0`);
18986	dtrace_state_destroy(state);
18987
18988	/*
18989	* If we're being detached with anonymous state, we need to
18990	* indicate to the kernel debugger that DTrace is now inactive.
18991	*/
18992	(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
18993	}
18994
18995	bzero(&dtrace_anon, sizeof (dtrace_anon_t));
18996	unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
18997	dtrace_cpu_init = NULL;
18998	dtrace_helpers_cleanup = NULL;
18999	dtrace_helpers_fork = NULL;
19000	dtrace_cpustart_init = NULL;
19001	dtrace_cpustart_fini = NULL;
19002	dtrace_debugger_init = NULL;
19003	dtrace_debugger_fini = NULL;
19004	dtrace_kreloc_init = NULL;
19005	dtrace_kreloc_fini = NULL;
19006	dtrace_modload = NULL;
19007	dtrace_modunload = NULL;
19008
19009	lck_mtx_unlock(&cpu_lock);
19010
19011	if (dtrace_helptrace_enabled) {
19012	kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
19013	dtrace_helptrace_buffer = NULL;
19014	}
19015
19016	kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
19017	dtrace_probes = NULL;
19018	dtrace_nprobes = `0`;
19019
19020	dtrace_hash_destroy(dtrace_strings);
19021	dtrace_hash_destroy(dtrace_byprov);
19022	dtrace_hash_destroy(dtrace_bymod);
19023	dtrace_hash_destroy(dtrace_byfunc);
19024	dtrace_hash_destroy(dtrace_byname);
19025	dtrace_strings = NULL;
19026	dtrace_byprov = NULL;
19027	dtrace_bymod = NULL;
19028	dtrace_byfunc = NULL;
19029	dtrace_byname = NULL;
19030
19031	kmem_cache_destroy(dtrace_state_cache);
19032	vmem_destroy(dtrace_arena);
19033
19034	if (dtrace_toxrange != NULL) {
19035	kmem_free(dtrace_toxrange,
19036	dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
19037	dtrace_toxrange = NULL;
19038	dtrace_toxranges = `0`;
19039	dtrace_toxranges_max = `0`;
19040	}
19041
19042	ddi_remove_minor_node(dtrace_devi, NULL);
19043	dtrace_devi = NULL;
19044
19045	ddi_soft_state_fini(&dtrace_softstate);
19046
19047	ASSERT(dtrace_vtime_references == `0`);
19048	ASSERT(dtrace_opens == `0`);
19049	ASSERT(dtrace_retained == NULL);
19050
19051	lck_mtx_unlock(&dtrace_lock);
19052	lck_mtx_unlock(&dtrace_provider_lock);
19053
19054	#ifdef illumos
19055	/*
19056	* We don't destroy the task queue until after we have dropped our
19057	* locks (taskq_destroy() may block on running tasks). To prevent
19058	* attempting to do work after we have effectively detached but before
19059	* the task queue has been destroyed, all tasks dispatched via the
19060	* task queue must check that DTrace is still attached before
19061	* performing any operation.
19062	*/
19063	taskq_destroy(dtrace_taskq);
19064	dtrace_taskq = NULL;
19065	#endif
19066
19067	return (DDI_SUCCESS);
19068	}
19069	#endif /* __APPLE__ */
19070
19071	d_open_t _dtrace_open, helper_open;
19072	d_close_t _dtrace_close, helper_close;
19073	d_ioctl_t _dtrace_ioctl, helper_ioctl;
19074
/*
 * Character-device open entry point for the dtrace device.
 *
 * Forwards to dtrace_open(), which takes a pointer to the device number
 * rather than the number itself, so a local copy of dev is passed by
 * address.  The credential comes from CRED(); p is not used.
 *
 * Returns whatever dtrace_open() returns.
 */
int
_dtrace_open(dev_t dev, int flags, int devtype, struct proc *p)
{
#pragma unused(p)
	dev_t locdev = dev;

	return dtrace_open(&locdev, flags, devtype, CRED());
}
19083
/*
 * Character-device open entry point for the helper device.
 *
 * Opening the helper device requires no per-open work: every argument is
 * ignored and the call always succeeds.
 *
 * Returns 0.
 */
int
helper_open(dev_t dev, int flags, int devtype, struct proc *p)
{
#pragma unused(dev,flags,devtype,p)
	return 0;
}
19090
/*
 * Character-device close entry point for the dtrace device.
 *
 * Forwards dev, flags and devtype to dtrace_close() together with the
 * credential from CRED(); p is not used.
 *
 * Returns whatever dtrace_close() returns.
 */
int
_dtrace_close(dev_t dev, int flags, int devtype, struct proc *p)
{
#pragma unused(p)
	return dtrace_close(dev, flags, devtype, CRED());
}
19097
/*
 * Character-device close entry point for the helper device.
 *
 * There is nothing to release on close: every argument is ignored and the
 * call always succeeds.
 *
 * Returns 0.
 */
int
helper_close(dev_t dev, int flags, int devtype, struct proc *p)
{
#pragma unused(dev,flags,devtype,p)
	return 0;
}
19104
19105	int
19106	_dtrace_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
19107	{
19108	#pragma unused(p)
19109	int err, rv = `0`;
19110	user_addr_t uaddrp;
19111
19112	if (proc_is64bit(p))
19113	uaddrp = (user_addr_t )data;
19114	else
19115	uaddrp = (user_addr_t) (uint32_t )data;
19116
19117	err = dtrace_ioctl(dev, cmd, arg: uaddrp, md: fflag, CRED(), rv: &rv);
19118
19119	/ Darwin's BSD ioctls only return -1 or zero. Overload errno to mimic Solaris. 20 bits suffice. /
19120	if (err != `0`) {
19121	ASSERT( (err & `0xfffff000`) == `0` );
19122	return (err & `0xfff`); / ioctl will return -1 and will set errno to an error code < 4096 /
19123	} else if (rv != `0`) {
19124	ASSERT( (rv & `0xfff00000`) == `0` );
19125	return (((rv & `0xfffff`) << `12`)); / ioctl will return -1 and will set errno to a value >= 4096 /
19126	} else
19127	return `0`;
19128	}
19129
19130	int
19131	helper_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
19132	{
19133	#pragma unused(dev,fflag,p)
19134	int err, rv = `0`;
19135
19136	err = dtrace_ioctl_helper(cmd, arg: data, rv: &rv);
19137	/ Darwin's BSD ioctls only return -1 or zero. Overload errno to mimic Solaris. 20 bits suffice. /
19138	if (err != `0`) {
19139	ASSERT( (err & `0xfffff000`) == `0` );
19140	return (err & `0xfff`); / ioctl will return -1 and will set errno to an error code < 4096 /
19141	} else if (rv != `0`) {
19142	ASSERT( (rv & `0xfff00000`) == `0` );
19143	return (((rv & `0xfffff`) << `12`)); / ioctl will return -1 and will set errno to a value >= 4096 /
19144	} else
19145	return `0`;
19146	}
19147
19148	#define HELPER_MAJOR -24 /* let the kernel pick the device number */
19149
19150	#define nulldevfp (void (*)(void))&nulldev
19151
19152	const static struct cdevsw helper_cdevsw =
19153	{
19154	.d_open = helper_open,
19155	.d_close = helper_close,
19156	.d_read = eno_rdwrt,
19157	.d_write = eno_rdwrt,
19158	.d_ioctl = helper_ioctl,
19159	.d_stop = eno_stop,
19160	.d_reset = eno_reset,
19161	.d_select = eno_select,
19162	.d_mmap = eno_mmap,
19163	.d_strategy = eno_strat,
19164	.d_reserved_1 = eno_getc,
19165	.d_reserved_2 = eno_putc,
19166	};
19167
19168	static int helper_majdevno = `0`;
19169
19170	static int gDTraceInited = `0`;
19171
19172	void
19173	helper_init( void )
19174	{
19175	/*
19176	* Once the "helper" is initialized, it can take ioctl calls that use locks
19177	* and zones initialized in dtrace_init. Make certain dtrace_init was called
19178	* before us.
19179	*/
19180
19181	if (!gDTraceInited) {
19182	panic("helper_init before dtrace_init");
19183	}
19184
19185	if (`0` >= helper_majdevno)
19186	{
19187	helper_majdevno = cdevsw_add(HELPER_MAJOR, &helper_cdevsw);
19188
19189	if (helper_majdevno < `0`) {
19190	printf("helper_init: failed to allocate a major number!\n");
19191	return;
19192	}
19193
19194	if (NULL == devfs_make_node( makedev(helper_majdevno, `0`), DEVFS_CHAR, UID_ROOT, GID_WHEEL, perms: `0666`,
19195	DTRACEMNR_HELPER )) {
19196	printf("dtrace_init: failed to devfs_make_node for helper!\n");
19197	return;
19198	}
19199	} else
19200	panic("helper_init: called twice!");
19201	}
19202
19203	#undef HELPER_MAJOR
19204
19205	static int
19206	dtrace_clone_func(dev_t dev, int action)
19207	{
19208	#pragma unused(dev)
19209
19210	if (action == DEVFS_CLONE_ALLOC) {
19211	return dtrace_state_reserve();
19212	}
19213	else if (action == DEVFS_CLONE_FREE) {
19214	return `0`;
19215	}
19216	else return -`1`;
19217	}
19218
19219	void dtrace_ast(void);
19220
19221	void
19222	dtrace_ast(void)
19223	{
19224	int i;
19225	uint32_t clients = os_atomic_xchg(&dtrace_wake_clients, `0`, relaxed);
19226	if (clients == `0`)
19227	return;
19228	/**
19229	* We disable preemption here to be sure that we won't get
19230	* interrupted by a wakeup to a thread that is higher
19231	* priority than us, so that we do issue all wakeups
19232	*/
19233	disable_preemption();
19234	for (i = `0`; i < DTRACE_NCLIENTS; i++) {
19235	if (clients & (`1` << i)) {
19236	dtrace_state_t *state = dtrace_state_get(minor: i);
19237	if (state) {
19238	wakeup(chan: state);
19239	}
19240
19241	}
19242	}
19243	enable_preemption();
19244	}
19245
19246
19247	#define DTRACE_MAJOR -24 /* let the kernel pick the device number */
19248
19249	static const struct cdevsw dtrace_cdevsw =
19250	{
19251	.d_open = _dtrace_open,
19252	.d_close = _dtrace_close,
19253	.d_read = eno_rdwrt,
19254	.d_write = eno_rdwrt,
19255	.d_ioctl = _dtrace_ioctl,
19256	.d_stop = eno_stop,
19257	.d_reset = eno_reset,
19258	.d_select = eno_select,
19259	.d_mmap = eno_mmap,
19260	.d_strategy = eno_strat,
19261	.d_reserved_1 = eno_getc,
19262	.d_reserved_2 = eno_putc,
19263	};
19264
19265	LCK_ATTR_DECLARE(dtrace_lck_attr, `0`, `0`);
19266	LCK_GRP_DECLARE(dtrace_lck_grp, "dtrace");
19267
19268	static int gMajDevNo;
19269
19270	void dtrace_early_init (void)
19271	{
19272	dtrace_restriction_policy_load();
19273
19274	/*
19275	* See dtrace_impl.h for a description of kernel symbol modes.
19276	* The default is to wait for symbols from userspace (lazy symbols).
19277	*/
19278	if (!PE_parse_boot_argn(arg_string: "dtrace_kernel_symbol_mode", arg_ptr: &dtrace_kernel_symbol_mode, max_arg: sizeof (dtrace_kernel_symbol_mode))) {
19279	dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
19280	}
19281	}
19282
19283	void
19284	dtrace_init( void )
19285	{
19286	if (`0` == gDTraceInited) {
19287	unsigned int i, ncpu;
19288	size_t size = sizeof(dtrace_buffer_memory_maxsize);
19289
19290	/*
19291	* Disable destructive actions when dtrace is running
19292	* in a restricted environment
19293	*/
19294	dtrace_destructive_disallow = dtrace_is_restricted() &&
19295	!dtrace_are_restrictions_relaxed();
19296
19297	/*
19298	* DTrace allocates buffers based on the maximum number
19299	* of enabled cpus. This call avoids any race when finding
19300	* that count.
19301	*/
19302	ASSERT(dtrace_max_cpus == `0`);
19303	ncpu = dtrace_max_cpus = ml_wait_max_cpus();
19304
19305	/*
19306	* Retrieve the size of the physical memory in order to define
19307	* the state buffer memory maximal size. If we cannot retrieve
19308	* this value, we'll consider that we have 1Gb of memory per CPU, that's
19309	* still better than raising a kernel panic.
19310	*/
19311	if (`0` != kernel_sysctlbyname("hw.memsize", &dtrace_buffer_memory_maxsize,
19312	&size, NULL, `0`))
19313	{
19314	dtrace_buffer_memory_maxsize = ncpu * `1024` * `1024` * `1024`;
19315	printf("dtrace_init: failed to retrieve the hw.memsize, defaulted to %lld bytes\n",
19316	dtrace_buffer_memory_maxsize);
19317	}
19318
19319	/*
19320	* Finally, divide by three to prevent DTrace from eating too
19321	* much memory.
19322	*/
19323	dtrace_buffer_memory_maxsize /= `3`;
19324	ASSERT(dtrace_buffer_memory_maxsize > `0`);
19325
19326	gMajDevNo = cdevsw_add(DTRACE_MAJOR, &dtrace_cdevsw);
19327
19328	if (gMajDevNo < `0`) {
19329	printf("dtrace_init: failed to allocate a major number!\n");
19330	gDTraceInited = `0`;
19331	return;
19332	}
19333
19334	if (NULL == devfs_make_node_clone( makedev(gMajDevNo, `0`), DEVFS_CHAR, UID_ROOT, GID_WHEEL, perms: `0666`,
19335	clone: dtrace_clone_func, DTRACEMNR_DTRACE )) {
19336	printf("dtrace_init: failed to devfs_make_node_clone for dtrace!\n");
19337	gDTraceInited = `0`;
19338	return;
19339	}
19340
19341	/*
19342	* The cpu_core structure consists of per-CPU state available in any context.
19343	* On some architectures, this may mean that the page(s) containing the
19344	* NCPU-sized array of cpu_core structures must be locked in the TLB -- it
19345	* is up to the platform to assure that this is performed properly. Note that
19346	* the structure is sized to avoid false sharing.
19347	*/
19348
19349	dtrace_modctl_list = NULL;
19350
19351	cpu_core = (cpu_core_t )kmem_zalloc( ncpu sizeof(cpu_core_t), KM_SLEEP );
19352	for (i = `0`; i < ncpu; ++i) {
19353	lck_mtx_init(lck: &cpu_core[i].cpuc_pid_lock, grp: &dtrace_lck_grp, attr: &dtrace_lck_attr);
19354	}
19355
19356	cpu_list = (dtrace_cpu_t )kmem_zalloc( ncpu sizeof(dtrace_cpu_t), KM_SLEEP );
19357	for (i = `0`; i < ncpu; ++i) {
19358	cpu_list[i].cpu_id = (processorid_t)i;
19359	cpu_list[i].cpu_next = &(cpu_list[(i+`1`) % ncpu]);
19360	LIST_INIT(&cpu_list[i].cpu_cyc_list);
19361	lck_rw_init(lck: &cpu_list[i].cpu_ft_lock, grp: &dtrace_lck_grp, attr: &dtrace_lck_attr);
19362	}
19363
19364	/*
19365	* Initialize the CPU offline/online hooks.
19366	*/
19367	dtrace_install_cpu_hooks();
19368
19369	lck_mtx_lock(lck: &cpu_lock);
19370	for (i = `0`; i < ncpu; ++i)
19371	/ FIXME: track CPU configuration /
19372	dtrace_cpu_setup_initial( cpu: (processorid_t)i ); / In lieu of register_cpu_setup_func() callback /
19373	lck_mtx_unlock(lck: &cpu_lock);
19374
19375	(void)dtrace_abs_to_nano(`0LL`); / Force once only call to clock_timebase_info (which can take a lock) /
19376
19377	dtrace_strings = dtrace_hash_create(func: dtrace_strkey_offset,
19378	offsetof(dtrace_string_t, dtst_str),
19379	offsetof(dtrace_string_t, dtst_next),
19380	offsetof(dtrace_string_t, dtst_prev));
19381
19382	/*
19383	* See dtrace_impl.h for a description of dof modes.
19384	* The default is lazy dof.
19385	*
19386	* FIXME: Warn if state is LAZY_OFF? It won't break anything, but
19387	* makes no sense...
19388	*/
19389	if (!PE_parse_boot_argn(arg_string: "dtrace_dof_mode", arg_ptr: &dtrace_dof_mode, max_arg: sizeof (dtrace_dof_mode))) {
19390	#if defined(XNU_TARGET_OS_OSX)
19391	dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON;
19392	#else
19393	dtrace_dof_mode = DTRACE_DOF_MODE_NEVER;
19394	#endif
19395	}
19396
19397	/*
19398	* Sanity check of dof mode value.
19399	*/
19400	switch (dtrace_dof_mode) {
19401	case DTRACE_DOF_MODE_NEVER:
19402	case DTRACE_DOF_MODE_LAZY_ON:
19403	/ valid modes, but nothing else we need to do /
19404	break;
19405
19406	case DTRACE_DOF_MODE_LAZY_OFF:
19407	case DTRACE_DOF_MODE_NON_LAZY:
19408	/ Cannot wait for a dtrace_open to init fasttrap /
19409	fasttrap_init();
19410	break;
19411
19412	default:
19413	/ Invalid, clamp to non lazy /
19414	dtrace_dof_mode = DTRACE_DOF_MODE_NON_LAZY;
19415	fasttrap_init();
19416	break;
19417	}
19418
19419	#if CONFIG_DTRACE
19420	if (dtrace_dof_mode != DTRACE_DOF_MODE_NEVER)
19421	commpage_update_dof(true);
19422	#endif
19423
19424	gDTraceInited = `1`;
19425
19426	} else
19427	panic("dtrace_init: called twice!");
19428	}
19429
19430	void
19431	dtrace_postinit(void)
19432	{
19433	/*
19434	* Called from bsd_init after all provider's *_init() routines have been
19435	* run. That way, anonymous DOF enabled under dtrace_attach() is safe
19436	* to go.
19437	*/
19438	dtrace_attach( devi: (dev_info_t )(uintptr_t)makedev(gMajDevNo, `0`)); /* Punning a dev_t to a dev_info_t* /
19439
19440	/*
19441	* Add the mach_kernel to the module list for lazy processing
19442	*/
19443	struct kmod_info fake_kernel_kmod;
19444	memset(s: &fake_kernel_kmod, c: `0`, n: sizeof(fake_kernel_kmod));
19445
19446	strlcpy(dst: fake_kernel_kmod.name, src: "mach_kernel", n: sizeof(fake_kernel_kmod.name));
19447	fake_kernel_kmod.id = `1`;
19448	fake_kernel_kmod.address = g_kernel_kmod_info.address;
19449	fake_kernel_kmod.size = g_kernel_kmod_info.size;
19450
19451	/ Ensure we don't try to touch symbols if they are gone. /
19452	boolean_t keepsyms = false;
19453	PE_parse_boot_argn(arg_string: "keepsyms", arg_ptr: &keepsyms, max_arg: sizeof(keepsyms));
19454
19455	if (dtrace_module_loaded(kmod: &fake_kernel_kmod, flag: (keepsyms) ? `0` : KMOD_DTRACE_NO_KERNEL_SYMS) != `0`) {
19456	printf("dtrace_postinit: Could not register mach_kernel modctl\n");
19457	}
19458
19459	(void)OSKextRegisterKextsWithDTrace();
19460	}
19461	#undef DTRACE_MAJOR
19462
/*
 * Routines used to register interest in CPUs being added to or removed
 * from the system.
 */
19467	void
19468	register_cpu_setup_func(cpu_setup_func_t ignore1, void* *ignore2)
19469	{
19470	#pragma unused(ignore1,ignore2)
19471	}
19472
19473	void
19474	unregister_cpu_setup_func(cpu_setup_func_t ignore1, void* *ignore2)
19475	{
19476	#pragma unused(ignore1,ignore2)
19477	}
19478

Browse the source code of xnu/bsd/dev/dtrace/dtrace.c