| 1 | /* | 
| 2 |  * CDDL HEADER START | 
| 3 |  * | 
| 4 |  * The contents of this file are subject to the terms of the | 
| 5 |  * Common Development and Distribution License (the "License"). | 
| 6 |  * You may not use this file except in compliance with the License. | 
| 7 |  * | 
| 8 |  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | 
| 9 |  * or http://www.opensolaris.org/os/licensing. | 
| 10 |  * See the License for the specific language governing permissions | 
| 11 |  * and limitations under the License. | 
| 12 |  * | 
| 13 |  * When distributing Covered Code, include this CDDL HEADER in each | 
| 14 |  * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | 
| 15 |  * If applicable, add the following below this CDDL HEADER, with the | 
| 16 |  * fields enclosed by brackets "[]" replaced with your own identifying | 
| 17 |  * information: Portions Copyright [yyyy] [name of copyright owner] | 
| 18 |  * | 
| 19 |  * CDDL HEADER END | 
| 20 |  */ | 
| 21 |  | 
| 22 | /* | 
| 23 |  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved. | 
| 24 |  * Use is subject to license terms. | 
| 25 |  * | 
| 26 |  * Portions Copyright (c) 2012 by Delphix. All rights reserved. | 
| 27 |  * Portions Copyright (c) 2016 by Joyent, Inc. | 
| 28 |  */ | 
| 29 |  | 
| 30 | #ifndef _SYS_DTRACE_IMPL_H | 
| 31 | #define	_SYS_DTRACE_IMPL_H | 
| 32 |  | 
| 33 | #ifdef	__cplusplus | 
| 34 | extern "C"  { | 
| 35 | #endif | 
| 36 |  | 
| 37 | /* | 
| 38 |  * DTrace Dynamic Tracing Software: Kernel Implementation Interfaces | 
| 39 |  * | 
| 40 |  * Note: The contents of this file are private to the implementation of the | 
| 41 |  * Solaris system and DTrace subsystem and are subject to change at any time | 
| 42 |  * without notice.  Applications and drivers using these interfaces will fail | 
| 43 |  * to run on future releases.  These interfaces should not be used for any | 
| 44 |  * purpose except those expressly outlined in dtrace(7D) and libdtrace(3LIB). | 
| 45 |  * Please refer to the "Solaris Dynamic Tracing Guide" for more information. | 
| 46 |  */ | 
| 47 |  | 
| 48 | #include <sys/dtrace.h> | 
| 49 | #include <kern/kalloc.h> | 
| 50 |  | 
| 51 | /* | 
| 52 |  * DTrace Implementation Locks | 
| 53 |  */ | 
| 54 | extern lck_attr_t dtrace_lck_attr; | 
| 55 | extern lck_grp_t dtrace_lck_grp; | 
| 56 | extern lck_mtx_t dtrace_procwaitfor_lock; | 
| 57 |  | 
| 58 | /* | 
| 59 |  * DTrace Implementation Constants and Typedefs | 
| 60 |  */ | 
| 61 | #define	DTRACE_MAXPROPLEN		128 | 
| 62 | #define	DTRACE_DYNVAR_CHUNKSIZE		256 | 
| 63 |  | 
| 64 | struct dtrace_probe; | 
| 65 | struct dtrace_ecb; | 
| 66 | struct dtrace_predicate; | 
| 67 | struct dtrace_action; | 
| 68 | struct dtrace_provider; | 
| 69 | struct dtrace_state; | 
| 70 |  | 
| 71 | typedef struct dtrace_probe dtrace_probe_t; | 
| 72 | typedef struct dtrace_ecb dtrace_ecb_t; | 
| 73 | typedef struct dtrace_predicate dtrace_predicate_t; | 
| 74 | typedef struct dtrace_action dtrace_action_t; | 
| 75 | typedef struct dtrace_provider dtrace_provider_t; | 
| 76 | typedef struct dtrace_meta dtrace_meta_t; | 
| 77 | typedef struct dtrace_state dtrace_state_t; | 
| 78 | typedef uint32_t dtrace_optid_t; | 
| 79 | typedef uint32_t dtrace_specid_t; | 
| 80 | typedef uint64_t dtrace_genid_t; | 
| 81 |  | 
| 82 | /* | 
| 83 |  * DTrace Probes | 
| 84 |  * | 
| 85 |  * The probe is the fundamental unit of the DTrace architecture.  Probes are | 
| 86 |  * created by DTrace providers, and managed by the DTrace framework.  A probe | 
| 87 |  * is identified by a unique <provider, module, function, name> tuple, and has | 
| 88 |  * a unique probe identifier assigned to it.  (Some probes are not associated | 
| 89 |  * with a specific point in text; these are called _unanchored probes_ and have | 
| 90 |  * no module or function associated with them.)  Probes are represented as a | 
| 91 |  * dtrace_probe structure.  To allow quick lookups based on each element of the | 
| 92 |  * probe tuple, probes are hashed by each of provider, module, function and | 
| 93 |  * name.  (If a lookup is performed based on a regular expression, a | 
| 94 |  * dtrace_probekey is prepared, and a linear search is performed.) Each probe | 
| 95 |  * is additionally pointed to by a linear array indexed by its identifier.  The | 
| 96 |  * identifier is the provider's mechanism for indicating to the DTrace | 
| 97 |  * framework that a probe has fired:  the identifier is passed as the first | 
| 98 |  * argument to dtrace_probe(), where it is then mapped into the corresponding | 
| 99 |  * dtrace_probe structure.  From the dtrace_probe structure, dtrace_probe() can | 
| 100 |  * iterate over the probe's list of enabling control blocks; see "DTrace | 
 * Enabling Control Blocks", below.
| 102 |  */ | 
| 103 | struct dtrace_probe { | 
| 104 | 	dtrace_id_t dtpr_id;			/* probe identifier */ | 
| 105 | 	dtrace_ecb_t *dtpr_ecb;			/* ECB list; see below */ | 
| 106 | 	dtrace_ecb_t *dtpr_ecb_last;		/* last ECB in list */ | 
| 107 | 	void *dtpr_arg;				/* provider argument */ | 
| 108 | 	dtrace_cacheid_t dtpr_predcache;	/* predicate cache ID */ | 
| 109 | 	int dtpr_aframes;			/* artificial frames */ | 
| 110 | 	dtrace_provider_t *dtpr_provider;	/* pointer to provider */ | 
| 111 | 	char *dtpr_mod;				/* probe's module name */ | 
| 112 | 	char *dtpr_func;			/* probe's function name */ | 
| 113 | 	char *dtpr_name;			/* probe's name */ | 
| 114 | 	dtrace_probe_t *dtpr_nextprov;		/* next in provider hash */ | 
| 115 | 	dtrace_probe_t *dtpr_prevprov;		/* previous in provider hash */ | 
| 116 | 	dtrace_probe_t *dtpr_nextmod;		/* next in module hash */ | 
| 117 | 	dtrace_probe_t *dtpr_prevmod;		/* previous in module hash */ | 
| 118 | 	dtrace_probe_t *dtpr_nextfunc;		/* next in function hash */ | 
| 119 | 	dtrace_probe_t *dtpr_prevfunc;		/* previous in function hash */ | 
| 120 | 	dtrace_probe_t *dtpr_nextname;		/* next in name hash */ | 
| 121 | 	dtrace_probe_t *dtpr_prevname;		/* previous in name hash */ | 
| 122 | 	dtrace_genid_t dtpr_gen;		/* probe generation ID */ | 
| 123 | }; | 
| 124 |  | 
| 125 | typedef int dtrace_probekey_f(const char *, const char *, int); | 
| 126 |  | 
| 127 | typedef struct dtrace_probekey { | 
| 128 | 	const char *dtpk_prov;			/* provider name to match */ | 
| 129 | 	dtrace_probekey_f *dtpk_pmatch;		/* provider matching function */ | 
| 130 | 	const char *dtpk_mod;			/* module name to match */ | 
| 131 | 	dtrace_probekey_f *dtpk_mmatch;		/* module matching function */ | 
| 132 | 	const char *dtpk_func;			/* func name to match */ | 
| 133 | 	dtrace_probekey_f *dtpk_fmatch;		/* func matching function */ | 
| 134 | 	const char *dtpk_name;			/* name to match */ | 
| 135 | 	dtrace_probekey_f *dtpk_nmatch;		/* name matching function */ | 
| 136 | 	dtrace_id_t dtpk_id;			/* identifier to match */ | 
| 137 | } dtrace_probekey_t; | 
| 138 |  | 
| 139 | typedef struct dtrace_hashbucket { | 
| 140 | 	struct dtrace_hashbucket *dthb_next;	/* next on hash chain */ | 
| 141 | 	void *dthb_chain;			/* chain of elements */ | 
| 142 | 	int dthb_len;				/* number of probes here */ | 
| 143 | } dtrace_hashbucket_t; | 
| 144 |  | 
typedef const char *dtrace_strkey_f(void *, uintptr_t);
| 146 |  | 
| 147 | typedef struct dtrace_hash { | 
| 148 | 	dtrace_hashbucket_t **dth_tab;	/* hash table */ | 
| 149 | 	int dth_size;			/* size of hash table */ | 
| 150 | 	int dth_mask;			/* mask to index into table */ | 
| 151 | 	int dth_nbuckets;		/* total number of buckets */ | 
| 152 | 	uintptr_t dth_nextoffs;		/* offset of next in element */ | 
| 153 | 	uintptr_t dth_prevoffs;		/* offset of prev in element */ | 
| 154 | 	dtrace_strkey_f *dth_getstr;	/* func to retrieve str in element */ | 
| 155 | 	uintptr_t dth_stroffs;		/* offset of str in element */ | 
| 156 | } dtrace_hash_t; | 
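
/*
 * Because dtrace_hash_t records the offsets of the next/previous links and
 * of the string key within the hashed element, a single hash implementation
 * can serve the provider, module, function and name hashes alike.  A sketch
 * of hypothetical helpers (not the framework's actual accessors) that use
 * these offsets:
 *
 *	static void *
 *	hash_next(dtrace_hash_t *hash, void *elm)
 *	{
 *		return (*(void **)((uintptr_t)elm + hash->dth_nextoffs));
 *	}
 *
 *	static const char *
 *	hash_str(dtrace_hash_t *hash, void *elm)
 *	{
 *		if (hash->dth_getstr != NULL)
 *			return (hash->dth_getstr(elm, hash->dth_stroffs));
 *
 *		return (*(char **)((uintptr_t)elm + hash->dth_stroffs));
 *	}
 */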
| 157 |  | 
| 158 | /* | 
| 159 |  * DTrace Enabling Control Blocks | 
| 160 |  * | 
| 161 |  * When a provider wishes to fire a probe, it calls into dtrace_probe(), | 
| 162 |  * passing the probe identifier as the first argument.  As described above, | 
| 163 |  * dtrace_probe() maps the identifier into a pointer to a dtrace_probe_t | 
| 164 |  * structure.  This structure contains information about the probe, and a | 
| 165 |  * pointer to the list of Enabling Control Blocks (ECBs).  Each ECB points to | 
| 166 |  * DTrace consumer state, and contains an optional predicate, and a list of | 
| 167 |  * actions.  (Shown schematically below.)  The ECB abstraction allows a single | 
| 168 |  * probe to be multiplexed across disjoint consumers, or across disjoint | 
| 169 |  * enablings of a single probe within one consumer. | 
| 170 |  * | 
| 171 |  *   Enabling Control Block | 
| 172 |  *        dtrace_ecb_t | 
| 173 |  * +------------------------+ | 
| 174 |  * | dtrace_epid_t ---------+--------------> Enabled Probe ID (EPID) | 
| 175 |  * | dtrace_state_t * ------+--------------> State associated with this ECB | 
| 176 |  * | dtrace_predicate_t * --+---------+ | 
| 177 |  * | dtrace_action_t * -----+----+    | | 
| 178 |  * | dtrace_ecb_t * ---+    |    |    |       Predicate (if any) | 
| 179 |  * +-------------------+----+    |    |       dtrace_predicate_t | 
| 180 |  *                     |         |    +---> +--------------------+ | 
| 181 |  *                     |         |          | dtrace_difo_t * ---+----> DIFO | 
| 182 |  *                     |         |          +--------------------+ | 
| 183 |  *                     |         | | 
| 184 |  *            Next ECB |         |           Action | 
| 185 |  *            (if any) |         |       dtrace_action_t | 
| 186 |  *                     :         +--> +-------------------+ | 
| 187 |  *                     :              | dtrace_actkind_t -+------> kind | 
| 188 |  *                     v              | dtrace_difo_t * --+------> DIFO (if any) | 
| 189 |  *                                    | dtrace_recdesc_t -+------> record descr. | 
| 190 |  *                                    | dtrace_action_t * +------+ | 
| 191 |  *                                    +-------------------+      | | 
| 192 |  *                                                               | Next action | 
| 193 |  *                               +-------------------------------+  (if any) | 
| 194 |  *                               | | 
| 195 |  *                               |           Action | 
| 196 |  *                               |       dtrace_action_t | 
| 197 |  *                               +--> +-------------------+ | 
| 198 |  *                                    | dtrace_actkind_t -+------> kind | 
| 199 |  *                                    | dtrace_difo_t * --+------> DIFO (if any) | 
| 200 |  *                                    | dtrace_action_t * +------+ | 
| 201 |  *                                    +-------------------+      | | 
| 202 |  *                                                               | Next action | 
| 203 |  *                               +-------------------------------+  (if any) | 
| 204 |  *                               | | 
| 205 |  *                               : | 
| 206 |  *                               v | 
| 207 |  * | 
| 208 |  * | 
| 209 |  * dtrace_probe() iterates over the ECB list.  If the ECB needs less space | 
| 210 |  * than is available in the principal buffer, the ECB is processed:  if the | 
| 211 |  * predicate is non-NULL, the DIF object is executed.  If the result is | 
| 212 |  * non-zero, the action list is processed, with each action being executed | 
| 213 |  * accordingly.  When the action list has been completely executed, processing | 
 * advances to the next ECB.  The ECB abstraction allows disjoint consumers
 * to multiplex on a single probe.
| 216 |  * | 
| 217 |  * Execution of the ECB results in consuming dte_size bytes in the buffer | 
| 218 |  * to record data.  During execution, dte_needed bytes must be available in | 
| 219 |  * the buffer.  This space is used for both recorded data and tuple data. | 
| 220 |  */ | 
| 221 | struct dtrace_ecb { | 
| 222 | 	dtrace_epid_t dte_epid;			/* enabled probe ID */ | 
| 223 | 	uint32_t dte_alignment;			/* required alignment */ | 
| 224 | 	size_t dte_needed;			/* space needed for execution */ | 
| 225 | 	size_t dte_size;			/* size of recorded payload */ | 
| 226 | 	dtrace_predicate_t *dte_predicate;	/* predicate, if any */ | 
| 227 | 	dtrace_action_t *dte_action;		/* actions, if any */ | 
| 228 | 	dtrace_ecb_t *dte_next;			/* next ECB on probe */ | 
| 229 | 	dtrace_state_t *dte_state;		/* pointer to state */ | 
| 230 | 	uint32_t dte_cond;			/* security condition */ | 
| 231 | 	dtrace_probe_t *dte_probe;		/* pointer to probe */ | 
| 232 | 	dtrace_action_t *dte_action_last;	/* last action on ECB */ | 
| 233 | 	uint64_t dte_uarg;			/* library argument */ | 
| 234 | }; | 
| 235 |  | 
| 236 | struct dtrace_predicate { | 
| 237 | 	dtrace_difo_t *dtp_difo;		/* DIF object */ | 
| 238 | 	dtrace_cacheid_t dtp_cacheid;		/* cache identifier */ | 
| 239 | 	int dtp_refcnt;				/* reference count */ | 
| 240 | }; | 
| 241 |  | 
| 242 | struct dtrace_action { | 
| 243 | 	dtrace_actkind_t dta_kind;		/* kind of action */ | 
| 244 | 	uint16_t dta_intuple;			/* boolean:  in aggregation */ | 
| 245 | 	uint32_t dta_refcnt;			/* reference count */ | 
| 246 | 	dtrace_difo_t *dta_difo;		/* pointer to DIFO */ | 
| 247 | 	dtrace_recdesc_t dta_rec;		/* record description */ | 
| 248 | 	dtrace_action_t *dta_prev;		/* previous action */ | 
| 249 | 	dtrace_action_t *dta_next;		/* next action */ | 
| 250 | }; | 
| 251 |  | 
| 252 | typedef struct dtrace_aggregation { | 
| 253 | 	dtrace_action_t dtag_action;		/* action; must be first */ | 
| 254 | 	dtrace_aggid_t dtag_id;			/* identifier */ | 
| 255 | 	dtrace_ecb_t *dtag_ecb;			/* corresponding ECB */ | 
| 256 | 	dtrace_action_t *dtag_first;		/* first action in tuple */ | 
| 257 | 	uint32_t dtag_base;			/* base of aggregation */ | 
| 258 | 	uint8_t dtag_hasarg;			/* boolean:  has argument */ | 
| 259 | 	uint64_t dtag_initial;			/* initial value */ | 
| 260 | 	void (*dtag_aggregate)(uint64_t *, uint64_t, uint64_t); | 
| 261 | } dtrace_aggregation_t; | 
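
/*
 * dtag_aggregate points to the function that folds a new value into the
 * aggregation's current value.  A sketch of a plausible aggregator, modeled
 * on the count() action (the new value and argument are unused):
 *
 *	static void
 *	aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
 *	{
 *		(void) nval;
 *		(void) arg;
 *		*oval = *oval + 1;
 *	}
 */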
| 262 |  | 
| 263 | /* | 
| 264 |  * DTrace Buffers | 
| 265 |  * | 
| 266 |  * Principal buffers, aggregation buffers, and speculative buffers are all | 
| 267 |  * managed with the dtrace_buffer structure.  By default, this structure | 
| 268 |  * includes twin data buffers -- dtb_tomax and dtb_xamot -- that serve as the | 
| 269 |  * active and passive buffers, respectively.  For speculative buffers, | 
| 270 |  * dtb_xamot will be NULL; for "ring" and "fill" buffers, dtb_xamot will point | 
| 271 |  * to a scratch buffer.  For all buffer types, the dtrace_buffer structure is | 
| 272 |  * always allocated on a per-CPU basis; a single dtrace_buffer structure is | 
| 273 |  * never shared among CPUs.  (That is, there is never true sharing of the | 
| 274 |  * dtrace_buffer structure; to prevent false sharing of the structure, it must | 
| 275 |  * always be aligned to the coherence granularity -- generally 64 bytes.) | 
| 276 |  * | 
| 277 |  * One of the critical design decisions of DTrace is that a given ECB always | 
| 278 |  * stores the same quantity and type of data.  This is done to assure that the | 
| 279 |  * only metadata required for an ECB's traced data is the EPID.  That is, from | 
| 280 |  * the EPID, the consumer can determine the data layout.  (The data buffer | 
| 281 |  * layout is shown schematically below.)  By assuring that one can determine | 
| 282 |  * data layout from the EPID, the metadata stream can be separated from the | 
 * data stream -- simplifying the data stream enormously.  A record header
 * (dtrace_rechdr_t) always precedes the recorded data; it includes the EPID
 * and a high-resolution timestamp used for output ordering consistency.
| 287 |  * | 
| 288 |  *      base of data buffer --->  +--------+--------------------+--------+ | 
| 289 |  *                                | rechdr | data               | rechdr | | 
| 290 |  *                                +--------+------+--------+----+--------+ | 
| 291 |  *                                | data          | rechdr | data        | | 
| 292 |  *                                +---------------+--------+-------------+ | 
| 293 |  *                                | data, cont.                          | | 
| 294 |  *                                +--------+--------------------+--------+ | 
| 295 |  *                                | rechdr | data               |        | | 
| 296 |  *                                +--------+--------------------+        | | 
| 297 |  *                                |                ||                    | | 
| 298 |  *                                |                ||                    | | 
| 299 |  *                                |                \/                    | | 
| 300 |  *                                :                                      : | 
| 301 |  *                                .                                      . | 
| 302 |  *                                .                                      . | 
| 303 |  *                                .                                      . | 
| 304 |  *                                :                                      : | 
| 305 |  *                                |                                      | | 
| 306 |  *     limit of data buffer --->  +--------------------------------------+ | 
| 307 |  * | 
| 308 |  * When evaluating an ECB, dtrace_probe() determines if the ECB's needs of the | 
| 309 |  * principal buffer (both scratch and payload) exceed the available space.  If | 
| 310 |  * the ECB's needs exceed available space (and if the principal buffer policy | 
| 311 |  * is the default "switch" policy), the ECB is dropped, the buffer's drop count | 
| 312 |  * is incremented, and processing advances to the next ECB.  If the ECB's needs | 
| 313 |  * can be met with the available space, the ECB is processed, but the offset in | 
| 314 |  * the principal buffer is only advanced if the ECB completes processing | 
| 315 |  * without error. | 
| 316 |  * | 
| 317 |  * When a buffer is to be switched (either because the buffer is the principal | 
| 318 |  * buffer with a "switch" policy or because it is an aggregation buffer), a | 
| 319 |  * cross call is issued to the CPU associated with the buffer.  In the cross | 
| 320 |  * call context, interrupts are disabled, and the active and the inactive | 
| 321 |  * buffers are atomically switched.  This involves switching the data pointers, | 
| 322 |  * copying the various state fields (offset, drops, errors, etc.) into their | 
| 323 |  * inactive equivalents, and clearing the state fields.  Because interrupts are | 
| 324 |  * disabled during this procedure, the switch is guaranteed to appear atomic to | 
| 325 |  * dtrace_probe(). | 
| 326 |  * | 
| 327 |  * DTrace Ring Buffering | 
| 328 |  * | 
| 329 |  * To process a ring buffer correctly, one must know the oldest valid record. | 
| 330 |  * Processing starts at the oldest record in the buffer and continues until | 
| 331 |  * the end of the buffer is reached.  Processing then resumes starting with | 
| 332 |  * the record stored at offset 0 in the buffer, and continues until the | 
 * youngest record is processed.  If trace records are of a fixed length,
| 334 |  * determining the oldest record is trivial: | 
| 335 |  * | 
| 336 |  *   - If the ring buffer has not wrapped, the oldest record is the record | 
| 337 |  *     stored at offset 0. | 
| 338 |  * | 
| 339 |  *   - If the ring buffer has wrapped, the oldest record is the record stored | 
| 340 |  *     at the current offset. | 
| 341 |  * | 
| 342 |  * With variable length records, however, just knowing the current offset | 
| 343 |  * doesn't suffice for determining the oldest valid record:  assuming that one | 
| 344 |  * allows for arbitrary data, one has no way of searching forward from the | 
| 345 |  * current offset to find the oldest valid record.  (That is, one has no way | 
| 346 |  * of separating data from metadata.) It would be possible to simply refuse to | 
| 347 |  * process any data in the ring buffer between the current offset and the | 
| 348 |  * limit, but this leaves (potentially) an enormous amount of otherwise valid | 
| 349 |  * data unprocessed. | 
| 350 |  * | 
| 351 |  * To effect ring buffering, we track two offsets in the buffer:  the current | 
| 352 |  * offset and the _wrapped_ offset.  If a request is made to reserve some | 
| 353 |  * amount of data, and the buffer has wrapped, the wrapped offset is | 
| 354 |  * incremented until the wrapped offset minus the current offset is greater | 
| 355 |  * than or equal to the reserve request.  This is done by repeatedly looking | 
| 356 |  * up the ECB corresponding to the EPID at the current wrapped offset, and | 
| 357 |  * incrementing the wrapped offset by the size of the data payload | 
| 358 |  * corresponding to that ECB.  If this offset is greater than or equal to the | 
| 359 |  * limit of the data buffer, the wrapped offset is set to 0.  Thus, the | 
| 360 |  * current offset effectively "chases" the wrapped offset around the buffer. | 
| 361 |  * Schematically: | 
| 362 |  * | 
| 363 |  *      base of data buffer --->  +------+--------------------+------+ | 
| 364 |  *                                | EPID | data               | EPID | | 
| 365 |  *                                +------+--------+------+----+------+ | 
| 366 |  *                                | data          | EPID | data      | | 
| 367 |  *                                +---------------+------+-----------+ | 
| 368 |  *                                | data, cont.                      | | 
| 369 |  *                                +------+---------------------------+ | 
| 370 |  *                                | EPID | data                      | | 
| 371 |  *           current offset --->  +------+---------------------------+ | 
| 372 |  *                                | invalid data                     | | 
| 373 |  *           wrapped offset --->  +------+--------------------+------+ | 
| 374 |  *                                | EPID | data               | EPID | | 
| 375 |  *                                +------+--------+------+----+------+ | 
| 376 |  *                                | data          | EPID | data      | | 
| 377 |  *                                +---------------+------+-----------+ | 
| 378 |  *                                :                                  : | 
| 379 |  *                                .                                  . | 
| 380 |  *                                .        ... valid data ...        . | 
| 381 |  *                                .                                  . | 
| 382 |  *                                :                                  : | 
| 383 |  *                                +------+-------------+------+------+ | 
| 384 |  *                                | EPID | data        | EPID | data | | 
| 385 |  *                                +------+------------++------+------+ | 
| 386 |  *                                | data, cont.       | leftover     | | 
| 387 |  *     limit of data buffer --->  +-------------------+--------------+ | 
| 388 |  * | 
| 389 |  * If the amount of requested buffer space exceeds the amount of space | 
| 390 |  * available between the current offset and the end of the buffer: | 
| 391 |  * | 
| 392 |  *  (1)  all words in the data buffer between the current offset and the limit | 
| 393 |  *       of the data buffer (marked "leftover", above) are set to | 
| 394 |  *       DTRACE_EPIDNONE | 
| 395 |  * | 
| 396 |  *  (2)  the wrapped offset is set to zero | 
| 397 |  * | 
| 398 |  *  (3)  the iteration process described above occurs until the wrapped offset | 
| 399 |  *       is greater than the amount of desired space. | 
| 400 |  * | 
| 401 |  * The wrapped offset is implemented by (re-)using the inactive offset. | 
| 402 |  * In a "switch" buffer policy, the inactive offset stores the offset in | 
| 403 |  * the inactive buffer; in a "ring" buffer policy, it stores the wrapped | 
| 404 |  * offset. | 
| 405 |  * | 
| 406 |  * DTrace Scratch Buffering | 
| 407 |  * | 
| 408 |  * Some ECBs may wish to allocate dynamically-sized temporary scratch memory. | 
| 409 |  * To accommodate such requests easily, scratch memory may be allocated in | 
| 410 |  * the buffer beyond the current offset plus the needed memory of the current | 
| 411 |  * ECB.  If there isn't sufficient room in the buffer for the requested amount | 
| 412 |  * of scratch space, the allocation fails and an error is generated.  Scratch | 
| 413 |  * memory is tracked in the dtrace_mstate_t and is automatically freed when | 
| 414 |  * the ECB ceases processing.  Note that ring buffers cannot allocate their | 
| 415 |  * scratch from the principal buffer -- lest they needlessly overwrite older, | 
| 416 |  * valid data.  Ring buffers therefore have their own dedicated scratch buffer | 
| 417 |  * from which scratch is allocated. | 
| 418 |  */ | 
| 419 | #define	DTRACEBUF_RING		0x0001		/* bufpolicy set to "ring" */ | 
| 420 | #define	DTRACEBUF_FILL		0x0002		/* bufpolicy set to "fill" */ | 
| 421 | #define	DTRACEBUF_NOSWITCH	0x0004		/* do not switch buffer */ | 
| 422 | #define	DTRACEBUF_WRAPPED	0x0008		/* ring buffer has wrapped */ | 
| 423 | #define	DTRACEBUF_DROPPED	0x0010		/* drops occurred */ | 
| 424 | #define	DTRACEBUF_ERROR		0x0020		/* errors occurred */ | 
| 425 | #define	DTRACEBUF_FULL		0x0040		/* "fill" buffer is full */ | 
| 426 | #define	DTRACEBUF_CONSUMED	0x0080		/* buffer has been consumed */ | 
| 427 | #define	DTRACEBUF_INACTIVE	0x0100		/* buffer is not yet active */ | 
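
/*
 * A minimal sketch of the scratch allocation described above, using the
 * dtrace_mstate_t fields defined later in this file; fault reporting and
 * the alignment of the scratch pointer are elided:
 *
 *	uintptr_t
 *	scratch_alloc(dtrace_mstate_t *mstate, size_t size)
 *	{
 *		uintptr_t ptr = mstate->dtms_scratch_ptr;
 *
 *		if (ptr + size > mstate->dtms_scratch_base +
 *		    mstate->dtms_scratch_size)
 *			return (0);
 *
 *		mstate->dtms_scratch_ptr = ptr + size;
 *		return (ptr);
 *	}
 */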
| 428 |  | 
| 429 | typedef struct dtrace_buffer { | 
| 430 | 	uint64_t dtb_offset;			/* current offset in buffer */ | 
| 431 | 	uint64_t dtb_cur_limit;			/* current limit before signaling/dropping */ | 
| 432 | 	uint64_t dtb_limit;			/* limit before signaling */ | 
| 433 | 	uint64_t dtb_size;			/* size of buffer */ | 
| 434 | 	uint32_t dtb_flags;			/* flags */ | 
| 435 | 	uint32_t dtb_drops;			/* number of drops */ | 
| 436 | 	caddr_t dtb_tomax;			/* active buffer */ | 
| 437 | 	caddr_t dtb_xamot;			/* inactive buffer */ | 
| 438 | 	uint32_t dtb_xamot_flags;		/* inactive flags */ | 
| 439 | 	uint32_t dtb_xamot_drops;		/* drops in inactive buffer */ | 
| 440 | 	uint64_t dtb_xamot_offset;		/* offset in inactive buffer */ | 
| 441 | 	uint32_t dtb_errors;			/* number of errors */ | 
| 442 | 	uint32_t dtb_xamot_errors;		/* errors in inactive buffer */ | 
| 443 | #ifndef _LP64 | 
| 444 | 	uint64_t dtb_pad1; | 
| 445 | #endif | 
| 446 | 	uint64_t dtb_switched;			/* time of last switch */ | 
| 447 | 	uint64_t dtb_interval;			/* observed switch interval */ | 
| 448 | 	uint64_t dtb_pad2[4];			/* pad to avoid false sharing */ | 
| 449 | } dtrace_buffer_t; | 
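
/*
 * A schematic sketch of the ring-buffer reservation described above, which
 * applies only once DTRACEBUF_WRAPPED has been set.  As noted, the wrapped
 * offset reuses the inactive offset (dtb_xamot_offset); ecb_size_for() is a
 * hypothetical stand-in for the EPID-to-size lookup, and alignment and the
 * leftover-fill step are elided:
 *
 *	while (buf->dtb_xamot_offset - buf->dtb_offset < needed) {
 *		dtrace_epid_t epid = *(dtrace_epid_t *)
 *		    ((uintptr_t)buf->dtb_tomax + buf->dtb_xamot_offset);
 *
 *		buf->dtb_xamot_offset += ecb_size_for(epid);
 *
 *		if (buf->dtb_xamot_offset >= buf->dtb_size)
 *			buf->dtb_xamot_offset = 0;
 *	}
 */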
| 450 |  | 
| 451 | /* | 
| 452 |  * DTrace Aggregation Buffers | 
| 453 |  * | 
| 454 |  * Aggregation buffers use much of the same mechanism as described above | 
| 455 |  * ("DTrace Buffers").  However, because an aggregation is fundamentally a | 
| 456 |  * hash, there exists dynamic metadata associated with an aggregation buffer | 
| 457 |  * that is not associated with other kinds of buffers.  This aggregation | 
| 458 |  * metadata is _only_ relevant for the in-kernel implementation of | 
 * aggregations; it is not actually relevant to user-level consumers.  To keep
 * it out of the consumer-visible stream, we allocate the dynamic aggregation
 * metadata (hash keys and hash buckets) starting below the _limit_ of the
 * buffer, and we allocate data from the _base_ of the buffer.  When the
 * aggregation buffer is copied out, _only_ the
| 463 |  * data is copied out; the metadata is simply discarded.  Schematically, | 
| 464 |  * aggregation buffers look like: | 
| 465 |  * | 
| 466 |  *      base of data buffer --->  +-------+------+-----------+-------+ | 
| 467 |  *                                | aggid | key  | value     | aggid | | 
| 468 |  *                                +-------+------+-----------+-------+ | 
| 469 |  *                                | key                              | | 
| 470 |  *                                +-------+-------+-----+------------+ | 
| 471 |  *                                | value | aggid | key | value      | | 
| 472 |  *                                +-------+------++-----+------+-----+ | 
| 473 |  *                                | aggid | key  | value       |     | | 
| 474 |  *                                +-------+------+-------------+     | | 
| 475 |  *                                |                ||                | | 
| 476 |  *                                |                ||                | | 
| 477 |  *                                |                \/                | | 
| 478 |  *                                :                                  : | 
| 479 |  *                                .                                  . | 
| 480 |  *                                .                                  . | 
| 481 |  *                                .                                  . | 
| 482 |  *                                :                                  : | 
| 483 |  *                                |                /\                | | 
| 484 |  *                                |                ||   +------------+ | 
| 485 |  *                                |                ||   |            | | 
| 486 |  *                                +---------------------+            | | 
| 487 |  *                                | hash keys                        | | 
| 488 |  *                                | (dtrace_aggkey structures)       | | 
| 489 |  *                                |                                  | | 
| 490 |  *                                +----------------------------------+ | 
| 491 |  *                                | hash buckets                     | | 
| 492 |  *                                | (dtrace_aggbuffer structure)     | | 
| 493 |  *                                |                                  | | 
| 494 |  *     limit of data buffer --->  +----------------------------------+ | 
| 495 |  * | 
| 496 |  * | 
| 497 |  * As implied above, just as we assure that ECBs always store a constant | 
| 498 |  * amount of data, we assure that a given aggregation -- identified by its | 
| 499 |  * aggregation ID -- always stores data of a constant quantity and type. | 
| 500 |  * As with EPIDs, this allows the aggregation ID to serve as the metadata for a | 
| 501 |  * given record. | 
| 502 |  * | 
| 503 |  * Note that the size of the dtrace_aggkey structure must be sizeof (uintptr_t) | 
 * aligned.  (If the structure changes such that this becomes false, an
| 505 |  * assertion will fail in dtrace_aggregate().) | 
| 506 |  */ | 
| 507 | typedef struct dtrace_aggkey { | 
| 508 | 	uint32_t dtak_hashval;			/* hash value */ | 
| 509 | 	uint32_t dtak_action:4;			/* action -- 4 bits */ | 
| 510 | 	uint32_t dtak_size:28;			/* size -- 28 bits */ | 
| 511 | 	caddr_t dtak_data;			/* data pointer */ | 
| 512 | 	struct dtrace_aggkey *dtak_next;	/* next in hash chain */ | 
| 513 | } dtrace_aggkey_t; | 
| 514 |  | 
| 515 | typedef struct dtrace_aggbuffer { | 
| 516 | 	uintptr_t dtagb_hashsize;		/* number of buckets */ | 
| 517 | 	uintptr_t dtagb_free;			/* free list of keys */ | 
| 518 | 	dtrace_aggkey_t **dtagb_hash;		/* hash table */ | 
| 519 | } dtrace_aggbuffer_t; | 
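
/*
 * Because the aggregation metadata grows down from the buffer's limit, the
 * dtrace_aggbuffer structure itself sits at the very top of the buffer.  A
 * sketch of locating it (a real implementation would also align this
 * address appropriately):
 *
 *	dtrace_aggbuffer_t *
 *	agg_buffer(dtrace_buffer_t *buf)
 *	{
 *		uintptr_t limit = (uintptr_t)buf->dtb_tomax + buf->dtb_size;
 *
 *		return ((dtrace_aggbuffer_t *)(limit -
 *		    sizeof (dtrace_aggbuffer_t)));
 *	}
 */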
| 520 |  | 
| 521 | /* | 
| 522 |  * DTrace Speculations | 
| 523 |  * | 
| 524 |  * Speculations have a per-CPU buffer and a global state.  Once a speculation | 
 * buffer has been committed or discarded, it cannot be reused until all CPUs
| 526 |  * have taken the same action (commit or discard) on their respective | 
| 527 |  * speculative buffer.  However, because DTrace probes may execute in arbitrary | 
| 528 |  * context, other CPUs cannot simply be cross-called at probe firing time to | 
| 529 |  * perform the necessary commit or discard.  The speculation states thus | 
| 530 |  * optimize for the case that a speculative buffer is only active on one CPU at | 
| 531 |  * the time of a commit() or discard() -- for if this is the case, other CPUs | 
| 532 |  * need not take action, and the speculation is immediately available for | 
| 533 |  * reuse.  If the speculation is active on multiple CPUs, it must be | 
| 534 |  * asynchronously cleaned -- potentially leading to a higher rate of dirty | 
| 535 |  * speculative drops.  The speculation states are as follows: | 
| 536 |  * | 
| 537 |  *  DTRACESPEC_INACTIVE       <= Initial state; inactive speculation | 
| 538 |  *  DTRACESPEC_ACTIVE         <= Allocated, but not yet speculatively traced to | 
| 539 |  *  DTRACESPEC_ACTIVEONE      <= Speculatively traced to on one CPU | 
| 540 |  *  DTRACESPEC_ACTIVEMANY     <= Speculatively traced to on more than one CPU | 
 *  DTRACESPEC_COMMITTING     <= Currently being committed on one CPU
 *  DTRACESPEC_COMMITTINGMANY <= Currently being committed on many CPUs
| 543 |  *  DTRACESPEC_DISCARDING     <= Currently being discarded on many CPUs | 
| 544 |  * | 
| 545 |  * The state transition diagram is as follows: | 
| 546 |  * | 
| 547 |  *     +----------------------------------------------------------+ | 
| 548 |  *     |                                                          | | 
| 549 |  *     |                      +------------+                      | | 
| 550 |  *     |  +-------------------| COMMITTING |<-----------------+   | | 
| 551 |  *     |  |                   +------------+                  |   | | 
| 552 |  *     |  | copied spec.            ^             commit() on |   | discard() on | 
| 553 |  *     |  | into principal          |              active CPU |   | active CPU | 
| 554 |  *     |  |                         | commit()                |   | | 
| 555 |  *     V  V                         |                         |   | | 
| 556 |  * +----------+                 +--------+                +-----------+ | 
| 557 |  * | INACTIVE |---------------->| ACTIVE |--------------->| ACTIVEONE | | 
| 558 |  * +----------+  speculation()  +--------+  speculate()   +-----------+ | 
| 559 |  *     ^  ^                         |                         |   | | 
| 560 |  *     |  |                         | discard()               |   | | 
| 561 |  *     |  | asynchronously          |            discard() on |   | speculate() | 
| 562 |  *     |  | cleaned                 V            inactive CPU |   | on inactive | 
| 563 |  *     |  |                   +------------+                  |   | CPU | 
| 564 |  *     |  +-------------------| DISCARDING |<-----------------+   | | 
| 565 |  *     |                      +------------+                      | | 
| 566 |  *     | asynchronously             ^                             | | 
| 567 |  *     | copied spec.               |       discard()             | | 
| 568 |  *     | into principal             +------------------------+    | | 
| 569 |  *     |                                                     |    V | 
| 570 |  *  +----------------+             commit()              +------------+ | 
| 571 |  *  | COMMITTINGMANY |<----------------------------------| ACTIVEMANY | | 
| 572 |  *  +----------------+                                   +------------+ | 
| 573 |  */ | 
| 574 | typedef enum dtrace_speculation_state { | 
| 575 | 	DTRACESPEC_INACTIVE = 0, | 
| 576 | 	DTRACESPEC_ACTIVE, | 
| 577 | 	DTRACESPEC_ACTIVEONE, | 
| 578 | 	DTRACESPEC_ACTIVEMANY, | 
| 579 | 	DTRACESPEC_COMMITTING, | 
| 580 | 	DTRACESPEC_COMMITTINGMANY, | 
| 581 | 	DTRACESPEC_DISCARDING | 
| 582 | } dtrace_speculation_state_t; | 
| 583 |  | 
| 584 | typedef struct dtrace_speculation { | 
| 585 | 	dtrace_speculation_state_t dtsp_state;	/* current speculation state */ | 
| 586 | 	int dtsp_cleaning;			/* non-zero if being cleaned */ | 
| 587 | 	dtrace_buffer_t *dtsp_buffer;		/* speculative buffer */ | 
| 588 | } dtrace_speculation_t; | 
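
/*
 * A sketch of the speculate() transitions from the diagram above:  the state
 * is advanced with a compare-and-swap so that racing CPUs observe a
 * consistent state, retrying if the state changed underneath us.  cas32()
 * is a hypothetical stand-in for the platform primitive, and active_cpu a
 * hypothetical record of the one CPU that has speculatively traced:
 *
 *	do {
 *		current = spec->dtsp_state;
 *
 *		if (current == DTRACESPEC_ACTIVE)
 *			nstate = DTRACESPEC_ACTIVEONE;
 *		else if (current == DTRACESPEC_ACTIVEONE &&
 *		    cpu != active_cpu)
 *			nstate = DTRACESPEC_ACTIVEMANY;
 *		else
 *			break;
 *	} while (cas32(&spec->dtsp_state, current, nstate) != current);
 */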
| 589 |  | 
| 590 | /* | 
| 591 |  * DTrace Dynamic Variables | 
| 592 |  * | 
| 593 |  * The dynamic variable problem is obviously decomposed into two subproblems: | 
| 594 |  * allocating new dynamic storage, and freeing old dynamic storage.  The | 
| 595 |  * presence of the second problem makes the first much more complicated -- or | 
| 596 |  * rather, the absence of the second renders the first trivial.  This is the | 
| 597 |  * case with aggregations, for which there is effectively no deallocation of | 
| 598 |  * dynamic storage.  (Or more accurately, all dynamic storage is deallocated | 
| 599 |  * when a snapshot is taken of the aggregation.)  As DTrace dynamic variables | 
| 600 |  * allow for both dynamic allocation and dynamic deallocation, the | 
| 601 |  * implementation of dynamic variables is quite a bit more complicated than | 
| 602 |  * that of their aggregation kin. | 
| 603 |  * | 
| 604 |  * We observe that allocating new dynamic storage is tricky only because the | 
| 605 |  * size can vary -- the allocation problem is much easier if allocation sizes | 
| 606 |  * are uniform.  We further observe that in D, the size of dynamic variables is | 
| 607 |  * actually _not_ dynamic -- dynamic variable sizes may be determined by static | 
| 608 |  * analysis of DIF text.  (This is true even of putatively dynamically-sized | 
| 609 |  * objects like strings and stacks, the sizes of which are dictated by the | 
| 610 |  * "stringsize" and "stackframes" variables, respectively.)  We exploit this by | 
| 611 |  * performing this analysis on all DIF before enabling any probes.  For each | 
| 612 |  * dynamic load or store, we calculate the dynamically-allocated size plus the | 
| 613 |  * size of the dtrace_dynvar structure plus the storage required to key the | 
| 614 |  * data.  For all DIF, we take the largest value and dub it the _chunksize_. | 
| 615 |  * We then divide dynamic memory into two parts:  a hash table that is wide | 
| 616 |  * enough to have every chunk in its own bucket, and a larger region of equal | 
| 617 |  * chunksize units.  Whenever we wish to dynamically allocate a variable, we | 
| 618 |  * always allocate a single chunk of memory.  Depending on the uniformity of | 
| 619 |  * allocation, this will waste some amount of memory -- but it eliminates the | 
| 620 |  * non-determinism inherent in traditional heap fragmentation. | 
| 621 |  * | 
| 622 |  * Dynamic objects are allocated by storing a non-zero value to them; they are | 
| 623 |  * deallocated by storing a zero value to them.  Dynamic variables are | 
| 624 |  * complicated enormously by being shared between CPUs.  In particular, | 
| 625 |  * consider the following scenario: | 
| 626 |  * | 
| 627 |  *                 CPU A                                 CPU B | 
| 628 |  *  +---------------------------------+   +---------------------------------+ | 
| 629 |  *  |                                 |   |                                 | | 
| 630 |  *  | allocates dynamic object a[123] |   |                                 | | 
| 631 |  *  | by storing the value 345 to it  |   |                                 | | 
| 632 |  *  |                               --------->                              | | 
| 633 |  *  |                                 |   | wishing to load from object     | | 
| 634 |  *  |                                 |   | a[123], performs lookup in      | | 
| 635 |  *  |                                 |   | dynamic variable space          | | 
| 636 |  *  |                               <---------                              | | 
| 637 |  *  | deallocates object a[123] by    |   |                                 | | 
| 638 |  *  | storing 0 to it                 |   |                                 | | 
| 639 |  *  |                                 |   |                                 | | 
| 640 |  *  | allocates dynamic object b[567] |   | performs load from a[123]       | | 
| 641 |  *  | by storing the value 789 to it  |   |                                 | | 
| 642 |  *  :                                 :   :                                 : | 
| 643 |  *  .                                 .   .                                 . | 
| 644 |  * | 
| 645 |  * This is obviously a race in the D program, but there are nonetheless only | 
| 646 |  * two valid values for CPU B's load from a[123]:  345 or 0.  Most importantly, | 
| 647 |  * CPU B may _not_ see the value 789 for a[123]. | 
| 648 |  * | 
| 649 |  * There are essentially two ways to deal with this: | 
| 650 |  * | 
| 651 |  *  (1)  Explicitly spin-lock variables.  That is, if CPU B wishes to load | 
| 652 |  *       from a[123], it needs to lock a[123] and hold the lock for the | 
| 653 |  *       duration that it wishes to manipulate it. | 
| 654 |  * | 
| 655 |  *  (2)  Avoid reusing freed chunks until it is known that no CPU is referring | 
| 656 |  *       to them. | 
| 657 |  * | 
| 658 |  * The implementation of (1) is rife with complexity, because it requires the | 
| 659 |  * user of a dynamic variable to explicitly decree when they are done using it. | 
| 660 |  * Were all variables by value, this perhaps wouldn't be debilitating -- but | 
| 661 |  * dynamic variables of non-scalar types are tracked by reference.  That is, if | 
| 662 |  * a dynamic variable is, say, a string, and that variable is to be traced to, | 
| 663 |  * say, the principal buffer, the DIF emulation code returns to the main | 
| 664 |  * dtrace_probe() loop a pointer to the underlying storage, not the contents of | 
| 665 |  * the storage.  Further, code calling on DIF emulation would have to be aware | 
| 666 |  * that the DIF emulation has returned a reference to a dynamic variable that | 
| 667 |  * has been potentially locked.  The variable would have to be unlocked after | 
| 668 |  * the main dtrace_probe() loop is finished with the variable, and the main | 
 * dtrace_probe() loop would have to be careful not to call any further DIF
| 670 |  * emulation while the variable is locked to avoid deadlock.  More generally, | 
| 671 |  * if one were to implement (1), DIF emulation code dealing with dynamic | 
| 672 |  * variables could only deal with one dynamic variable at a time (lest deadlock | 
| 673 |  * result).  To sum, (1) exports too much subtlety to the users of dynamic | 
| 674 |  * variables -- increasing maintenance burden and imposing serious constraints | 
| 675 |  * on future DTrace development. | 
| 676 |  * | 
| 677 |  * The implementation of (2) is also complex, but the complexity is more | 
| 678 |  * manageable.  We need to be sure that when a variable is deallocated, it is | 
| 679 |  * not placed on a traditional free list, but rather on a _dirty_ list.  Once a | 
| 680 |  * variable is on a dirty list, it cannot be found by CPUs performing a | 
| 681 |  * subsequent lookup of the variable -- but it may still be in use by other | 
| 682 |  * CPUs.  To assure that all CPUs that may be seeing the old variable have | 
| 683 |  * cleared out of probe context, a dtrace_sync() can be issued.  Once the | 
| 684 |  * dtrace_sync() has completed, it can be known that all CPUs are done | 
| 685 |  * manipulating the dynamic variable -- the dirty list can be atomically | 
| 686 |  * appended to the free list.  Unfortunately, there's a slight hiccup in this | 
| 687 |  * mechanism:  dtrace_sync() may not be issued from probe context.  The | 
| 688 |  * dtrace_sync() must be therefore issued asynchronously from non-probe | 
| 689 |  * context.  For this we rely on the DTrace cleaner, a cyclic that runs at the | 
| 690 |  * "cleanrate" frequency.  To ease this implementation, we define several chunk | 
| 691 |  * lists: | 
| 692 |  * | 
| 693 |  *   - Dirty.  Deallocated chunks, not yet cleaned.  Not available. | 
| 694 |  * | 
| 695 |  *   - Rinsing.  Formerly dirty chunks that are currently being asynchronously | 
| 696 |  *     cleaned.  Not available, but will be shortly.  Dynamic variable | 
| 697 |  *     allocation may not spin or block for availability, however. | 
| 698 |  * | 
| 699 |  *   - Clean.  Clean chunks, ready for allocation -- but not on the free list. | 
| 700 |  * | 
| 701 |  *   - Free.  Available for allocation. | 
| 702 |  * | 
| 703 |  * Moreover, to avoid absurd contention, _each_ of these lists is implemented | 
| 704 |  * on a per-CPU basis.  This is only for performance, not correctness; chunks | 
| 705 |  * may be allocated from another CPU's free list.  The algorithm for allocation | 
| 706 |  * then is this: | 
| 707 |  * | 
| 708 |  *   (1)  Attempt to atomically allocate from current CPU's free list.  If list | 
| 709 |  *        is non-empty and allocation is successful, allocation is complete. | 
| 710 |  * | 
| 711 |  *   (2)  If the clean list is non-empty, atomically move it to the free list, | 
| 712 |  *        and reattempt (1). | 
| 713 |  * | 
| 714 |  *   (3)  If the dynamic variable space is in the CLEAN state, look for free | 
| 715 |  *        and clean lists on other CPUs by setting the current CPU to the next | 
| 716 |  *        CPU, and reattempting (1).  If the next CPU is the current CPU (that | 
| 717 |  *        is, if all CPUs have been checked), atomically switch the state of | 
| 718 |  *        the dynamic variable space based on the following: | 
| 719 |  * | 
| 720 |  *        - If no free chunks were found and no dirty chunks were found, | 
| 721 |  *          atomically set the state to EMPTY. | 
| 722 |  * | 
| 723 |  *        - If dirty chunks were found, atomically set the state to DIRTY. | 
| 724 |  * | 
| 725 |  *        - If rinsing chunks were found, atomically set the state to RINSING. | 
| 726 |  * | 
 *   (4)  Based on the state of the dynamic variable space, increment the
 *        appropriate counter to indicate dynamic drops (if in EMPTY state)
 *        vs. dynamic dirty drops (if in DIRTY state) vs. dynamic rinsing
 *        drops (if in RINSING state).  Fail the allocation.
| 731 |  * | 
| 732 |  * The cleaning cyclic operates with the following algorithm:  for all CPUs | 
| 733 |  * with a non-empty dirty list, atomically move the dirty list to the rinsing | 
| 734 |  * list.  Perform a dtrace_sync().  For all CPUs with a non-empty rinsing list, | 
| 735 |  * atomically move the rinsing list to the clean list.  Perform another | 
| 736 |  * dtrace_sync().  By this point, all CPUs have seen the new clean list; the | 
| 737 |  * state of the dynamic variable space can be restored to CLEAN. | 
| 738 |  * | 
| 739 |  * There exist two final races that merit explanation.  The first is a simple | 
| 740 |  * allocation race: | 
| 741 |  * | 
| 742 |  *                 CPU A                                 CPU B | 
| 743 |  *  +---------------------------------+   +---------------------------------+ | 
| 744 |  *  |                                 |   |                                 | | 
| 745 |  *  | allocates dynamic object a[123] |   | allocates dynamic object a[123] | | 
| 746 |  *  | by storing the value 345 to it  |   | by storing the value 567 to it  | | 
| 747 |  *  |                                 |   |                                 | | 
| 748 |  *  :                                 :   :                                 : | 
| 749 |  *  .                                 .   .                                 . | 
| 750 |  * | 
| 751 |  * Again, this is a race in the D program.  It can be resolved by having a[123] | 
| 752 |  * hold the value 345 or a[123] hold the value 567 -- but it must be true that | 
| 753 |  * a[123] have only _one_ of these values.  (That is, the racing CPUs may not | 
| 754 |  * put the same element twice on the same hash chain.)  This is resolved | 
| 755 |  * simply:  before the allocation is undertaken, the start of the new chunk's | 
| 756 |  * hash chain is noted.  Later, after the allocation is complete, the hash | 
| 757 |  * chain is atomically switched to point to the new element.  If this fails | 
| 758 |  * (because of either concurrent allocations or an allocation concurrent with a | 
| 759 |  * deletion), the newly allocated chunk is deallocated to the dirty list, and | 
| 760 |  * the whole process of looking up (and potentially allocating) the dynamic | 
| 761 |  * variable is reattempted. | 
| 762 |  * | 
| 763 |  * The final race is a simple deallocation race: | 
| 764 |  * | 
| 765 |  *                 CPU A                                 CPU B | 
| 766 |  *  +---------------------------------+   +---------------------------------+ | 
| 767 |  *  |                                 |   |                                 | | 
| 768 |  *  | deallocates dynamic object      |   | deallocates dynamic object      | | 
| 769 |  *  | a[123] by storing the value 0   |   | a[123] by storing the value 0   | | 
| 770 |  *  | to it                           |   | to it                           | | 
| 771 |  *  |                                 |   |                                 | | 
| 772 |  *  :                                 :   :                                 : | 
| 773 |  *  .                                 .   .                                 . | 
| 774 |  * | 
| 775 |  * Once again, this is a race in the D program, but it is one that we must | 
| 776 |  * handle without corrupting the underlying data structures.  Because | 
| 777 |  * deallocations require the deletion of a chunk from the middle of a hash | 
| 778 |  * chain, we cannot use a single-word atomic operation to remove it.  For this, | 
| 779 |  * we add a spin lock to the hash buckets that is _only_ used for deallocations | 
| 780 |  * (allocation races are handled as above).  Further, this spin lock is _only_ | 
| 781 |  * held for the duration of the delete; before control is returned to the DIF | 
| 782 |  * emulation code, the hash bucket is unlocked. | 
| 783 |  */ | 
| 784 | typedef struct dtrace_key { | 
| 785 | 	uint64_t dttk_value;			/* data value or data pointer */ | 
| 786 | 	uint64_t dttk_size;			/* 0 if by-val, >0 if by-ref */ | 
| 787 | } dtrace_key_t; | 
| 788 |  | 
| 789 | typedef struct dtrace_tuple { | 
| 790 | 	uint32_t dtt_nkeys;			/* number of keys in tuple */ | 
| 791 | 	uint32_t dtt_pad;			/* padding */ | 
| 792 | 	dtrace_key_t dtt_key[1];		/* array of tuple keys */ | 
| 793 | } dtrace_tuple_t; | 
| 794 |  | 
| 795 | typedef struct dtrace_dynvar { | 
| 796 | 	uint64_t dtdv_hashval;			/* hash value -- 0 if free */ | 
| 797 | 	struct dtrace_dynvar *dtdv_next;	/* next on list or hash chain */ | 
| 798 | 	void *dtdv_data;			/* pointer to data */ | 
| 799 | 	dtrace_tuple_t dtdv_tuple;		/* tuple key */ | 
| 800 | } dtrace_dynvar_t; | 
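
/*
 * The chunksize derivation described above follows directly from these
 * structures:  a chunk must hold the dtrace_dynvar structure, the tuple
 * keys beyond the first (dtt_key is declared with one element), any
 * by-reference key data, and the variable's data itself.  A sketch, with
 * rounding up to the chunk alignment elided:
 *
 *	size_t
 *	chunk_size(uint32_t nkeys, size_t keydata, size_t dsize)
 *	{
 *		return (sizeof (dtrace_dynvar_t) +
 *		    (nkeys - 1) * sizeof (dtrace_key_t) + keydata + dsize);
 *	}
 */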
| 801 |  | 
| 802 | typedef enum dtrace_dynvar_op { | 
| 803 | 	DTRACE_DYNVAR_ALLOC, | 
| 804 | 	DTRACE_DYNVAR_NOALLOC, | 
| 805 | 	DTRACE_DYNVAR_DEALLOC | 
| 806 | } dtrace_dynvar_op_t; | 
| 807 |  | 
| 808 | typedef struct dtrace_dynhash { | 
| 809 | 	dtrace_dynvar_t *dtdh_chain;		/* hash chain for this bucket */ | 
| 810 | 	uintptr_t dtdh_lock;			/* deallocation lock */ | 
| 811 | #ifdef _LP64 | 
| 812 | 	uintptr_t dtdh_pad[6];			/* pad to avoid false sharing */ | 
| 813 | #else | 
| 814 | 	uintptr_t dtdh_pad[14];			/* pad to avoid false sharing */ | 
| 815 | #endif | 
| 816 | } dtrace_dynhash_t; | 
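
/*
 * dtdh_lock is the per-bucket deallocation lock described above; lookups
 * and allocations never take it.  A sketch of a delete, where casptr() and
 * unlink_chunk() are hypothetical stand-ins for the atomic compare-and-swap
 * primitive and the chain manipulation, respectively:
 *
 *	while (casptr(&bucket->dtdh_lock, 0, 1) != 0)
 *		continue;
 *
 *	unlink_chunk(bucket, dvar);
 *	bucket->dtdh_lock = 0;
 */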
| 817 |  | 
| 818 | typedef struct dtrace_dstate_percpu { | 
| 819 | 	dtrace_dynvar_t *dtdsc_free;		/* free list for this CPU */ | 
| 820 | 	dtrace_dynvar_t *dtdsc_dirty;		/* dirty list for this CPU */ | 
| 821 | 	dtrace_dynvar_t *dtdsc_rinsing;		/* rinsing list for this CPU */ | 
| 822 | 	dtrace_dynvar_t *dtdsc_clean;		/* clean list for this CPU */ | 
| 823 | 	uint64_t dtdsc_drops;			/* number of capacity drops */ | 
| 824 | 	uint64_t dtdsc_dirty_drops;		/* number of dirty drops */ | 
| 825 | 	uint64_t dtdsc_rinsing_drops;		/* number of rinsing drops */ | 
| 826 | } dtrace_dstate_percpu_t; | 
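
/*
 * A sketch of steps (1) and (2) of the allocation algorithm above, operating
 * on this per-CPU state; casptr() is again a hypothetical compare-and-swap
 * stand-in, and the cross-CPU search of step (3) is elided:
 *
 *	for (;;) {
 *		if ((dvar = dcpu->dtdsc_free) != NULL) {
 *			if (casptr(&dcpu->dtdsc_free, dvar,
 *			    dvar->dtdv_next) == dvar)
 *				break;
 *			continue;
 *		}
 *
 *		if ((clean = dcpu->dtdsc_clean) == NULL)
 *			break;
 *
 *		if (casptr(&dcpu->dtdsc_clean, clean, NULL) == clean)
 *			dcpu->dtdsc_free = clean;
 *	}
 */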
| 827 |  | 
| 828 | typedef enum dtrace_dstate_state { | 
| 829 | 	DTRACE_DSTATE_CLEAN = 0, | 
| 830 | 	DTRACE_DSTATE_EMPTY, | 
| 831 | 	DTRACE_DSTATE_DIRTY, | 
| 832 | 	DTRACE_DSTATE_RINSING | 
| 833 | } dtrace_dstate_state_t; | 
| 834 |  | 
| 835 | typedef struct dtrace_dstate { | 
| 836 | 	void *dtds_base;			/* base of dynamic var. space */ | 
| 837 | 	size_t dtds_size;			/* size of dynamic var. space */ | 
| 838 | 	size_t dtds_hashsize;			/* number of buckets in hash */ | 
| 839 | 	size_t dtds_chunksize;			/* size of each chunk */ | 
| 840 | 	dtrace_dynhash_t *dtds_hash;		/* pointer to hash table */ | 
| 841 | 	dtrace_dstate_state_t dtds_state;	/* current dynamic var. state */ | 
| 842 | 	dtrace_dstate_percpu_t *__zpercpu dtds_percpu;	/* per-CPU dyn. var. state */ | 
| 843 | } dtrace_dstate_t; | 
| 844 |  | 
| 845 | /* | 
| 846 |  * DTrace Variable State | 
| 847 |  * | 
| 848 |  * The DTrace variable state tracks user-defined variables in its dtrace_vstate | 
| 849 |  * structure.  Each DTrace consumer has exactly one dtrace_vstate structure, | 
| 850 |  * but some dtrace_vstate structures may exist without a corresponding DTrace | 
| 851 |  * consumer (see "DTrace Helpers", below).  As described in <sys/dtrace.h>, | 
| 852 |  * user-defined variables can have one of three scopes: | 
| 853 |  * | 
| 854 |  *  DIFV_SCOPE_GLOBAL  =>  global scope | 
| 855 |  *  DIFV_SCOPE_THREAD  =>  thread-local scope (i.e. "self->" variables) | 
| 856 |  *  DIFV_SCOPE_LOCAL   =>  clause-local scope (i.e. "this->" variables) | 
| 857 |  * | 
| 858 |  * The variable state tracks variables by both their scope and their allocation | 
| 859 |  * type: | 
| 860 |  * | 
| 861 |  *  - The dtvs_globals and dtvs_locals members each point to an array of | 
| 862 |  *    dtrace_statvar structures.  These structures contain both the variable | 
| 863 |  *    metadata (dtrace_difv structures) and the underlying storage for all | 
| 864 |  *    statically allocated variables, including statically allocated | 
| 865 |  *    DIFV_SCOPE_GLOBAL variables and all DIFV_SCOPE_LOCAL variables. | 
| 866 |  * | 
| 867 |  *  - The dtvs_tlocals member points to an array of dtrace_difv structures for | 
| 868 |  *    DIFV_SCOPE_THREAD variables.  As such, this array tracks _only_ the | 
| 869 |  *    variable metadata for DIFV_SCOPE_THREAD variables; the underlying storage | 
| 870 |  *    is allocated out of the dynamic variable space. | 
| 871 |  * | 
| 872 |  *  - The dtvs_dynvars member is the dynamic variable state associated with the | 
| 873 |  *    variable state.  The dynamic variable state (described in "DTrace Dynamic | 
| 874 |  *    Variables", above) tracks all DIFV_SCOPE_THREAD variables and all | 
| 875 |  *    dynamically-allocated DIFV_SCOPE_GLOBAL variables. | 
| 876 |  */ | 
| 877 | typedef struct dtrace_statvar { | 
| 878 | 	uint64_t dtsv_data;			/* data or pointer to it */ | 
| 879 | 	size_t dtsv_size;			/* size of pointed-to data */ | 
| 880 | 	int dtsv_refcnt;			/* reference count */ | 
| 881 | 	dtrace_difv_t dtsv_var;			/* variable metadata */ | 
| 882 | } dtrace_statvar_t; | 
| 883 |  | 
| 884 | typedef struct dtrace_vstate { | 
| 885 | 	dtrace_state_t *dtvs_state;		/* back pointer to state */ | 
| 886 | 	dtrace_statvar_t **dtvs_globals;	/* statically-allocated glbls */ | 
| 887 | 	int dtvs_nglobals;			/* number of globals */ | 
| 888 | 	dtrace_difv_t *dtvs_tlocals;		/* thread-local metadata */ | 
| 889 | 	int dtvs_ntlocals;			/* number of thread-locals */ | 
| 890 | 	dtrace_statvar_t **dtvs_locals;		/* clause-local data */ | 
| 891 | 	int dtvs_nlocals;			/* number of clause-locals */ | 
| 892 | 	dtrace_dstate_t dtvs_dynvars;		/* dynamic variable state */ | 
| 893 | } dtrace_vstate_t; | 
| 894 |  | 
| 895 | /* | 
| 896 |  * DTrace Machine State | 
| 897 |  * | 
| 898 |  * In the process of processing a fired probe, DTrace needs to track and/or | 
| 899 |  * cache some per-CPU state associated with that particular firing.  This is | 
| 900 |  * state that is always discarded after the probe firing has completed, and | 
| 901 |  * much of it is not specific to any DTrace consumer, remaining valid across | 
| 902 |  * all ECBs.  This state is tracked in the dtrace_mstate structure. | 
| 903 |  */ | 
| 904 | #define	DTRACE_MSTATE_ARGS		0x00000001 | 
| 905 | #define	DTRACE_MSTATE_PROBE		0x00000002 | 
| 906 | #define	DTRACE_MSTATE_EPID		0x00000004 | 
| 907 | #define	DTRACE_MSTATE_TIMESTAMP		0x00000008 | 
| 908 | #define	DTRACE_MSTATE_STACKDEPTH	0x00000010 | 
| 909 | #define	DTRACE_MSTATE_CALLER		0x00000020 | 
| 910 | #define	DTRACE_MSTATE_IPL		0x00000040 | 
| 911 | #define	DTRACE_MSTATE_FLTOFFS		0x00000080 | 
| 912 | #define	DTRACE_MSTATE_WALLTIMESTAMP	0x00000100 | 
| 913 | #define	DTRACE_MSTATE_USTACKDEPTH	0x00000200 | 
| 914 | #define	DTRACE_MSTATE_UCALLER		0x00000400 | 
| 915 | #define	DTRACE_MSTATE_MACHTIMESTAMP	0x00000800 | 
| 916 | #define	DTRACE_MSTATE_MACHCTIMESTAMP	0x00001000 | 
| 917 |  | 
| 918 | typedef struct dtrace_mstate { | 
| 919 | 	uintptr_t dtms_scratch_base;		/* base of scratch space */ | 
| 920 | 	uintptr_t dtms_scratch_ptr;		/* current scratch pointer */ | 
| 921 | 	size_t dtms_scratch_size;		/* scratch size */ | 
| 922 | 	uint32_t dtms_present;			/* variables that are present */ | 
| 923 | 	uint64_t dtms_arg[5];			/* cached arguments */ | 
| 924 | 	dtrace_epid_t dtms_epid;		/* current EPID */ | 
| 925 | 	uint64_t dtms_timestamp;		/* cached timestamp */ | 
| 926 | 	hrtime_t dtms_walltimestamp;		/* cached wall timestamp */ | 
| 927 | 	uint64_t dtms_machtimestamp;		/* cached mach absolute timestamp */ | 
| 928 | 	uint64_t dtms_machctimestamp;		/* cached mach continuous timestamp */ | 
| 929 | 	int dtms_stackdepth;			/* cached stackdepth */ | 
| 930 | 	int dtms_ustackdepth;			/* cached ustackdepth */ | 
| 931 | 	struct dtrace_probe *dtms_probe;	/* current probe */ | 
| 932 | 	uintptr_t dtms_caller;			/* cached caller */ | 
| 933 | 	uint64_t dtms_ucaller;			/* cached user-level caller */ | 
| 934 | 	int dtms_ipl;				/* cached interrupt pri lev */ | 
| 935 | 	int dtms_fltoffs;			/* faulting DIFO offset */ | 
| 936 | 	uintptr_t dtms_strtok;			/* saved strtok() pointer */ | 
| 937 | 	uintptr_t dtms_strtok_limit;		/* upper bound of strtok ptr */ | 
| 938 | 	uint32_t dtms_access;			/* memory access rights */ | 
| 939 | 	dtrace_difo_t *dtms_difo;		/* current dif object */ | 
| 940 | } dtrace_mstate_t; | 
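
/*
 * Illustrative sketch: the dtms_present bits let expensive values be
 * computed at most once per probe firing.  The accessor below is a
 * hypothetical example of the caching pattern (dtrace_gethrtime() is the
 * real interface; the helper itself is invented for illustration):
 *
 *	static uint64_t
 *	mstate_timestamp(dtrace_mstate_t *mstate)
 *	{
 *		if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
 *			mstate->dtms_timestamp = dtrace_gethrtime();
 *			mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
 *		}
 *		return (mstate->dtms_timestamp);
 *	}
 */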
| 941 |  | 
| 942 | #define	DTRACE_COND_OWNER	0x1 | 
| 943 | #define	DTRACE_COND_USERMODE	0x2 | 
| 944 | #define	DTRACE_COND_ZONEOWNER	0x4 | 
| 945 |  | 
| 946 | #define	DTRACE_PROBEKEY_MAXDEPTH	8	/* max glob recursion depth */ | 
| 947 |  | 
| 948 | /* | 
| 949 |  * Access flag used by dtrace_mstate.dtms_access. | 
| 950 |  */ | 
| 951 | #define	DTRACE_ACCESS_KERNEL	0x1		/* the priv to read kmem */ | 
| 952 |  | 
| 953 |  | 
| 954 | /* | 
| 955 |  * DTrace Activity | 
| 956 |  * | 
| 957 |  * Each DTrace consumer is in one of several states, which (for purposes of | 
 * avoiding yet another overloading of the noun "state") we call the current
 * _activity_.  The activity transitions on dtrace_go() (from DTRACIOCGO), on
 * dtrace_stop() (from DTRACIOCSTOP) and on the exit() action.  Activities may
 * only transition in one direction; the activity transition diagram is a
 * directed acyclic graph, as follows:
| 963 |  * | 
| 964 |  * | 
| 965 |  * | 
| 966 |  * +----------+                   +--------+                   +--------+ | 
| 967 |  * | INACTIVE |------------------>| WARMUP |------------------>| ACTIVE | | 
| 968 |  * +----------+   dtrace_go(),    +--------+   dtrace_go(),    +--------+ | 
| 969 |  *                before BEGIN        |        after BEGIN       |  |  | | 
| 970 |  *                                    |                          |  |  | | 
| 971 |  *                      exit() action |                          |  |  | | 
| 972 |  *                     from BEGIN ECB |                          |  |  | | 
| 973 |  *                                    |                          |  |  | | 
| 974 |  *                                    v                          |  |  | | 
| 975 |  *                               +----------+     exit() action  |  |  | | 
| 976 |  * +-----------------------------| DRAINING |<-------------------+  |  | | 
| 977 |  * |                             +----------+                       |  | | 
| 978 |  * |                                  |                             |  | | 
| 979 |  * |                   dtrace_stop(), |                             |  | | 
| 980 |  * |                     before END   |                             |  | | 
| 981 |  * |                                  |                             |  | | 
| 982 |  * |                                  v                             |  | | 
| 983 |  * | +---------+                 +----------+                       |  | | 
| 984 |  * | | STOPPED |<----------------| COOLDOWN |<----------------------+  | | 
| 985 |  * | +---------+  dtrace_stop(), +----------+     dtrace_stop(),       | | 
| 986 |  * |                after END                       before END         | | 
| 987 |  * |                                                                   | | 
| 988 |  * |                              +--------+                           | | 
| 989 |  * +----------------------------->| KILLED |<--------------------------+ | 
| 990 |  *       deadman timeout or       +--------+     deadman timeout or | 
| 991 |  *        killed consumer                         killed consumer | 
| 992 |  * | 
| 993 |  * Note that once a DTrace consumer has stopped tracing, there is no way to | 
| 994 |  * restart it; if a DTrace consumer wishes to restart tracing, it must reopen | 
| 995 |  * the DTrace pseudodevice. | 
| 996 |  */ | 
| 997 | typedef enum dtrace_activity { | 
| 998 | 	DTRACE_ACTIVITY_INACTIVE = 0,		/* not yet running */ | 
| 999 | 	DTRACE_ACTIVITY_WARMUP,			/* while starting */ | 
| 1000 | 	DTRACE_ACTIVITY_ACTIVE,			/* running */ | 
| 1001 | 	DTRACE_ACTIVITY_DRAINING,		/* before stopping */ | 
| 1002 | 	DTRACE_ACTIVITY_COOLDOWN,		/* while stopping */ | 
| 1003 | 	DTRACE_ACTIVITY_STOPPED,		/* after stopping */ | 
| 1004 | 	DTRACE_ACTIVITY_KILLED			/* killed */ | 
| 1005 | } dtrace_activity_t; | 
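
/*
 * Illustrative sketch: because the diagram above is a DAG and the enum is
 * declared in diagram order, any legal transition moves strictly forward
 * through the enumeration (with KILLED reachable from several states).
 * A forward-motion check is therefore a necessary (though not sufficient)
 * condition for a legal transition; the helper below is hypothetical:
 *
 *	static int
 *	activity_moves_forward(dtrace_activity_t from, dtrace_activity_t to)
 *	{
 *		return (to > from);
 *	}
 */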
| 1006 |  | 
| 1007 |  | 
| 1008 | /* | 
| 1009 |  * APPLE NOTE:  DTrace dof modes implementation | 
| 1010 |  * | 
| 1011 |  * DTrace has four "dof modes". They are: | 
| 1012 |  * | 
 * DTRACE_DOF_MODE_NEVER	Never load any dof, period.
 * DTRACE_DOF_MODE_LAZY_ON	Defer loading dof until later.
 * DTRACE_DOF_MODE_LAZY_OFF	Load all deferred dof now, and any new dof.
 * DTRACE_DOF_MODE_NON_LAZY	Load all dof immediately.
| 1017 |  * | 
| 1018 |  * It is legal to transition between the two lazy modes. The NEVER and | 
| 1019 |  * NON_LAZY modes are permanent, and must not change once set. | 
| 1020 |  * | 
| 1021 |  * The current dof mode is kept in dtrace_dof_mode, which is protected by the | 
 * dtrace_dof_mode_lock.  This is an RW lock; reads require shared access, and
 * writes require exclusive access.  Because NEVER and NON_LAZY are permanent
 * states, it is legal to test for those modes without holding the dof mode
 * lock.
| 1025 |  * | 
| 1026 |  * Lock ordering is dof mode lock before any dtrace lock, and before the | 
| 1027 |  * process p_dtrace_sprlock. In general, other locks should not be held when | 
| 1028 |  * taking the dof mode lock. Acquiring the dof mode lock in exclusive mode | 
| 1029 |  * will block process fork, exec, and exit, so it should be held exclusive | 
| 1030 |  * for as short a time as possible. | 
| 1031 |  */ | 
| 1032 |  | 
| 1033 | #define DTRACE_DOF_MODE_NEVER 		0 | 
| 1034 | #define DTRACE_DOF_MODE_LAZY_ON		1 | 
| 1035 | #define DTRACE_DOF_MODE_LAZY_OFF	2 | 
| 1036 | #define DTRACE_DOF_MODE_NON_LAZY	3 | 
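
/*
 * Illustrative sketch of the locking rule above: the permanent modes may
 * be tested lock-free, while distinguishing the lazy modes requires the RW
 * lock held at least shared.  The extern declarations are assumptions for
 * illustration; the actual definitions live in the implementation.
 *
 *	extern int dtrace_dof_mode;
 *	extern lck_rw_t dtrace_dof_mode_lock;
 *
 *	static int
 *	dof_mode_is_lazy_on(void)
 *	{
 *		int lazy;
 *
 *		if (dtrace_dof_mode == DTRACE_DOF_MODE_NEVER ||
 *		    dtrace_dof_mode == DTRACE_DOF_MODE_NON_LAZY)
 *			return (0);	// permanent mode: no lock needed
 *
 *		lck_rw_lock_shared(&dtrace_dof_mode_lock);
 *		lazy = (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON);
 *		lck_rw_unlock_shared(&dtrace_dof_mode_lock);
 *		return (lazy);
 *	}
 */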
| 1037 |  | 
| 1038 | /* | 
 * DTrace kernel symbol modes control when the kernel may dispose of symbol
 * information used by the fbt/sdt providers. The kernel itself, as well as
| 1041 |  * every kext, has symbol table/nlist info that has historically been preserved | 
| 1042 |  * for dtrace's use. This allowed dtrace to be lazy about allocating fbt/sdt probes, | 
| 1043 |  * at the expense of keeping the symbol info in the kernel permanently. | 
| 1044 |  * | 
 * Starting in 10.7+, fbt probes may be created from userspace, in the same
 * fashion as pid probes. The kernel gives dtrace "first right of refusal"
| 1047 |  * whenever symbol data becomes available (such as a kext load). If dtrace is | 
| 1048 |  * active, it will immediately read/copy the needed data, and then the kernel | 
| 1049 |  * may free it. If dtrace is not active, it returns immediately, having done | 
| 1050 |  * no work or allocations, and the symbol data is freed. Should dtrace need | 
| 1051 |  * this data later, it is expected that the userspace client will push the | 
| 1052 |  * data into the kernel via ioctl calls. | 
| 1053 |  * | 
| 1054 |  * The kernel symbol modes are used to control what dtrace does with symbol data: | 
| 1055 |  * | 
| 1056 |  * DTRACE_KERNEL_SYMBOLS_NEVER			Effectively disables fbt/sdt | 
| 1057 |  * DTRACE_KERNEL_SYMBOLS_FROM_KERNEL		Immediately read/copy symbol data | 
| 1058 |  * DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE		Wait for symbols from userspace | 
| 1059 |  * DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL	Immediately read/copy symbol data | 
| 1060 |  * | 
 * It is legal to transition between DTRACE_KERNEL_SYMBOLS_FROM_KERNEL and
 * DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE. DTRACE_KERNEL_SYMBOLS_NEVER and
 * DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL are permanent modes, intended to
 * disable fbt probes entirely, or to prevent any symbols from being loaded
 * from userspace.
 *
| 1067 |  * The kernel symbol mode is kept in dtrace_kernel_symbol_mode, which is protected | 
| 1068 |  * by the dtrace_lock. | 
| 1069 |  */ | 
| 1070 |  | 
| 1071 | #define DTRACE_KERNEL_SYMBOLS_NEVER 			0 | 
| 1072 | #define DTRACE_KERNEL_SYMBOLS_FROM_KERNEL		1 | 
| 1073 | #define DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE		2 | 
| 1074 | #define DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL	3 | 
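
/*
 * Illustrative sketch of the transition rule above (hypothetical helper;
 * the actual enforcement may be open-coded where the mode is set):
 *
 *	static int
 *	symbol_mode_transition_ok(int from, int to)
 *	{
 *		// Only the two non-permanent modes may be exchanged.
 *		return ((from == DTRACE_KERNEL_SYMBOLS_FROM_KERNEL ||
 *		    from == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) &&
 *		    (to == DTRACE_KERNEL_SYMBOLS_FROM_KERNEL ||
 *		    to == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE));
 *	}
 */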
| 1076 |  | 
| 1077 | /* | 
| 1078 |  * DTrace Helper Implementation | 
| 1079 |  * | 
| 1080 |  * A description of the helper architecture may be found in <sys/dtrace.h>. | 
| 1081 |  * Each process contains a pointer to its helpers in its p_dtrace_helpers | 
| 1082 |  * member.  This is a pointer to a dtrace_helpers structure, which contains an | 
| 1083 |  * array of pointers to dtrace_helper structures, helper variable state (shared | 
| 1084 |  * among a process's helpers) and a generation count.  (The generation count is | 
| 1085 |  * used to provide an identifier when a helper is added so that it may be | 
| 1086 |  * subsequently removed.)  The dtrace_helper structure is self-explanatory, | 
| 1087 |  * containing pointers to the objects needed to execute the helper.  Note that | 
| 1088 |  * helpers are _duplicated_ across fork(2), and destroyed on exec(2).  No more | 
| 1089 |  * than dtrace_helpers_max are allowed per-process. | 
| 1090 |  */ | 
| 1091 | #define	DTRACE_HELPER_ACTION_USTACK	0 | 
| 1092 | #define	DTRACE_NHELPER_ACTIONS		1 | 
| 1093 |  | 
| 1094 | typedef struct dtrace_helper_action { | 
| 1095 | 	int dtha_generation;			/* helper action generation */ | 
| 1096 | 	int dtha_nactions;			/* number of actions */ | 
| 1097 | 	dtrace_difo_t *dtha_predicate;		/* helper action predicate */ | 
| 1098 | 	dtrace_difo_t **dtha_actions;		/* array of actions */ | 
| 1099 | 	struct dtrace_helper_action *dtha_next;	/* next helper action */ | 
| 1100 | } dtrace_helper_action_t; | 
| 1101 |  | 
| 1102 | typedef struct dtrace_helper_provider { | 
| 1103 | 	int dthp_generation;			/* helper provider generation */ | 
| 1104 | 	uint32_t dthp_ref;			/* reference count */ | 
| 1105 | 	dof_helper_t dthp_prov;			/* DOF w/ provider and probes */ | 
| 1106 | } dtrace_helper_provider_t; | 
| 1107 |  | 
| 1108 | typedef struct dtrace_helpers { | 
| 1109 | 	dtrace_helper_action_t **dthps_actions;	/* array of helper actions */ | 
| 1110 | 	dtrace_vstate_t dthps_vstate;		/* helper action var. state */ | 
| 1111 | 	dtrace_helper_provider_t **dthps_provs;	/* array of providers */ | 
| 1112 | 	uint_t dthps_nprovs;			/* count of providers */ | 
| 1113 | 	uint_t dthps_maxprovs;			/* provider array size */ | 
| 1114 | 	int dthps_generation;			/* current generation */ | 
| 1115 | 	pid_t dthps_pid;			/* pid of associated proc */ | 
| 1116 | 	int dthps_deferred;			/* helper in deferred list */ | 
| 1117 | 	struct dtrace_helpers *dthps_next;	/* next pointer */ | 
| 1118 | 	struct dtrace_helpers *dthps_prev;	/* prev pointer */ | 
| 1119 | } dtrace_helpers_t; | 
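
/*
 * Illustrative sketch: the generation count is the handle by which a
 * previously-added helper action is later removed.  A simplified unlink of
 * the first action matching a generation (locking and provider cleanup,
 * which the real code must also perform, are omitted):
 *
 *	static int
 *	helper_remove_gen(dtrace_helper_action_t **listp, int gen)
 *	{
 *		dtrace_helper_action_t *h, **prevp = listp;
 *
 *		for (h = *listp; h != NULL; h = h->dtha_next) {
 *			if (h->dtha_generation == gen) {
 *				*prevp = h->dtha_next;	// unlink it
 *				return (0);
 *			}
 *			prevp = &h->dtha_next;
 *		}
 *		return (ESRCH);	// no action with that generation
 *	}
 */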
| 1120 |  | 
| 1121 | /* | 
| 1122 |  * DTrace Helper Action Tracing | 
| 1123 |  * | 
| 1124 |  * Debugging helper actions can be arduous.  To ease the development and | 
| 1125 |  * debugging of helpers, DTrace contains a tracing-framework-within-a-tracing- | 
| 1126 |  * framework: helper tracing.  If dtrace_helptrace_enabled is non-zero (which | 
| 1127 |  * it is by default on DEBUG kernels), all helper activity will be traced to a | 
| 1128 |  * global, in-kernel ring buffer.  Each entry includes a pointer to the specific | 
| 1129 |  * helper, the location within the helper, and a trace of all local variables. | 
| 1130 |  * The ring buffer may be displayed in a human-readable format with the | 
| 1131 |  * ::dtrace_helptrace mdb(1) dcmd. | 
| 1132 |  */ | 
| 1133 | #define	DTRACE_HELPTRACE_NEXT	(-1) | 
| 1134 | #define	DTRACE_HELPTRACE_DONE	(-2) | 
| 1135 | #define	DTRACE_HELPTRACE_ERR	(-3) | 
| 1136 |  | 
| 1137 |  | 
| 1138 | typedef struct dtrace_helptrace { | 
| 1139 | 	dtrace_helper_action_t	*dtht_helper;	/* helper action */ | 
| 1140 | 	int dtht_where;				/* where in helper action */ | 
| 1141 | 	int dtht_nlocals;			/* number of locals */ | 
| 1142 | 	int dtht_fault;				/* type of fault (if any) */ | 
| 1143 | 	int dtht_fltoffs;			/* DIF offset */ | 
| 1144 | 	uint64_t dtht_illval;			/* faulting value */ | 
| 1145 | 	uint64_t dtht_locals[1];		/* local variables */ | 
| 1146 | } dtrace_helptrace_t; | 
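
/*
 * Because dtht_locals is a one-element array used as a variable-length
 * trailer, each ring buffer entry must be sized for all of the helper's
 * locals.  An illustrative size computation (the real buffer-advance
 * logic must additionally handle wrapping):
 *
 *	size_t size = sizeof (dtrace_helptrace_t) +
 *	    (nlocals - 1) * sizeof (uint64_t);
 */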
| 1147 |  | 
| 1148 | /* | 
| 1149 |  * DTrace Credentials | 
| 1150 |  * | 
| 1151 |  * In probe context, we have limited flexibility to examine the credentials | 
| 1152 |  * of the DTrace consumer that created a particular enabling.  We use | 
| 1153 |  * the Least Privilege interfaces to cache the consumer's cred pointer and | 
| 1154 |  * some facts about that credential in a dtrace_cred_t structure. These | 
| 1155 |  * can limit the consumer's breadth of visibility and what actions the | 
| 1156 |  * consumer may take. | 
| 1157 |  */ | 
| 1158 | #define	DTRACE_CRV_ALLPROC		0x01 | 
| 1159 | #define	DTRACE_CRV_KERNEL		0x02 | 
| 1160 | #define	DTRACE_CRV_ALLZONE		0x04 | 
| 1161 |  | 
| 1162 | #define	DTRACE_CRV_ALL		(DTRACE_CRV_ALLPROC | DTRACE_CRV_KERNEL | \ | 
| 1163 | 	DTRACE_CRV_ALLZONE) | 
| 1164 |  | 
| 1165 | #define	DTRACE_CRA_PROC				0x0001 | 
| 1166 | #define	DTRACE_CRA_PROC_CONTROL			0x0002 | 
| 1167 | #define	DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER	0x0004 | 
| 1168 | #define	DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE	0x0008 | 
| 1169 | #define	DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG	0x0010 | 
| 1170 | #define	DTRACE_CRA_KERNEL			0x0020 | 
| 1171 | #define	DTRACE_CRA_KERNEL_DESTRUCTIVE		0x0040 | 
| 1172 |  | 
| 1173 | #define	DTRACE_CRA_ALL		(DTRACE_CRA_PROC | \ | 
| 1174 | 	DTRACE_CRA_PROC_CONTROL | \ | 
| 1175 | 	DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER | \ | 
| 1176 | 	DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE | \ | 
| 1177 | 	DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG | \ | 
| 1178 | 	DTRACE_CRA_KERNEL | \ | 
| 1179 | 	DTRACE_CRA_KERNEL_DESTRUCTIVE) | 
| 1180 |  | 
| 1181 | typedef struct dtrace_cred { | 
| 1182 | 	cred_t			*dcr_cred; | 
| 1183 | 	uint8_t			dcr_destructive; | 
| 1184 | 	uint8_t			dcr_visible; | 
| 1185 | 	uint16_t		dcr_action; | 
| 1186 | } dtrace_cred_t; | 
| 1187 |  | 
| 1188 | typedef struct dtrace_format { | 
| 1189 | 	uint64_t dtf_refcount; | 
| 1190 | 	char dtf_str[]; | 
| 1191 | } dtrace_format_t; | 
| 1192 |  | 
| 1193 | #define DTRACE_FORMAT_SIZE(fmt) (strlen(fmt->dtf_str) + 1 + sizeof(dtrace_format_t)) | 
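
/*
 * Illustrative sketch: because dtf_str is a flexible array member, a
 * format record is allocated with its string stored inline.  The
 * allocator call below is an assumption for illustration; see the
 * implementation for the actual allocation path.
 *
 *	size_t len = strlen(str) + 1;
 *	dtrace_format_t *fmt =
 *	    kalloc_data(sizeof (dtrace_format_t) + len, Z_WAITOK);
 *	fmt->dtf_refcount = 1;
 *	strlcpy(fmt->dtf_str, str, len);
 */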
| 1194 |  | 
| 1195 | /* | 
| 1196 |  * DTrace Consumer State | 
| 1197 |  * | 
| 1198 |  * Each DTrace consumer has an associated dtrace_state structure that contains | 
| 1199 |  * its in-kernel DTrace state -- including options, credentials, statistics and | 
| 1200 |  * pointers to ECBs, buffers, speculations and formats.  A dtrace_state | 
| 1201 |  * structure is also allocated for anonymous enablings.  When anonymous state | 
 * is grabbed, the grabbing consumer's dts_anon pointer is set to the grabbed
| 1203 |  * dtrace_state structure. | 
| 1204 |  */ | 
| 1205 | struct dtrace_state { | 
| 1206 | 	dev_t dts_dev;				/* device */ | 
| 1207 | 	int dts_necbs;				/* total number of ECBs */ | 
| 1208 | 	dtrace_ecb_t **dts_ecbs;		/* array of ECBs */ | 
| 1209 | 	dtrace_epid_t dts_epid;			/* next EPID to allocate */ | 
| 1210 | 	size_t dts_needed;			/* greatest needed space */ | 
| 1211 | 	struct dtrace_state *dts_anon;		/* anon. state, if grabbed */ | 
| 1212 | 	dtrace_activity_t dts_activity;		/* current activity */ | 
| 1213 | 	dtrace_vstate_t dts_vstate;		/* variable state */ | 
| 1214 | 	dtrace_buffer_t *dts_buffer;		/* principal buffer */ | 
| 1215 | 	dtrace_buffer_t *dts_aggbuffer;		/* aggregation buffer */ | 
| 1216 | 	dtrace_speculation_t *dts_speculations;	/* speculation array */ | 
| 1217 | 	int dts_nspeculations;			/* number of speculations */ | 
| 1218 | 	int dts_naggregations;			/* number of aggregations */ | 
| 1219 | 	dtrace_aggregation_t **dts_aggregations; /* aggregation array */ | 
| 1220 | 	vmem_t *dts_aggid_arena;		/* arena for aggregation IDs */ | 
| 1221 | 	uint64_t dts_errors;			/* total number of errors */ | 
| 1222 | 	uint32_t dts_speculations_busy;		/* number of spec. busy */ | 
| 1223 | 	uint32_t dts_speculations_unavail;	/* number of spec unavail */ | 
| 1224 | 	uint32_t dts_stkstroverflows;		/* stack string tab overflows */ | 
| 1225 | 	uint32_t dts_dblerrors;			/* errors in ERROR probes */ | 
| 1226 | 	uint32_t dts_reserve;			/* space reserved for END */ | 
| 1227 | 	hrtime_t dts_laststatus;		/* time of last status */ | 
| 1228 | 	cyclic_id_t dts_cleaner;		/* cleaning cyclic */ | 
| 1229 | 	cyclic_id_t dts_deadman;		/* deadman cyclic */ | 
| 1230 | 	hrtime_t dts_alive;			/* time last alive */ | 
| 1231 | 	char dts_speculates;			/* boolean: has speculations */ | 
| 1232 | 	char dts_destructive;			/* boolean: has dest. actions */ | 
| 1233 | 	int dts_nformats;			/* number of formats */ | 
| 1234 | 	dtrace_format_t **dts_formats;		/* format string array */ | 
| 1235 | 	dtrace_optval_t dts_options[DTRACEOPT_MAX]; /* options */ | 
| 1236 | 	dtrace_cred_t dts_cred;			/* credentials */ | 
| 1237 | 	size_t dts_nretained;			/* number of retained enabs */ | 
| 1238 | 	uint64_t dts_arg_error_illval; | 
| 1239 | 	uint32_t dts_buf_over_limit;		/* number of bufs over dtb_limit */ | 
| 1240 | 	uint64_t **dts_rstate;			/* per-CPU random state */ | 
| 1241 | }; | 
| 1242 |  | 
| 1243 | struct dtrace_provider { | 
| 1244 | 	dtrace_pattr_t dtpv_attr;		/* provider attributes */ | 
| 1245 | 	dtrace_ppriv_t dtpv_priv;		/* provider privileges */ | 
| 1246 | 	dtrace_pops_t dtpv_pops;		/* provider operations */ | 
| 1247 | 	char *dtpv_name;			/* provider name */ | 
| 1248 | 	void *dtpv_arg;				/* provider argument */ | 
| 1249 | 	uint_t dtpv_defunct;			/* boolean: defunct provider */ | 
| 1250 | 	struct dtrace_provider *dtpv_next;	/* next provider */ | 
| 1251 | 	uint64_t dtpv_probe_count;		/* number of associated probes */ | 
| 1252 | 	uint64_t dtpv_ecb_count;		/* number of associated enabled ECBs */ | 
| 1253 | }; | 
| 1254 |  | 
| 1255 | struct dtrace_meta { | 
| 1256 | 	dtrace_mops_t dtm_mops;			/* meta provider operations */ | 
| 1257 | 	char *dtm_name;				/* meta provider name */ | 
| 1258 | 	void *dtm_arg;				/* meta provider user arg */ | 
| 1259 | 	uint64_t dtm_count;			/* number of associated providers */ | 
| 1260 | }; | 
| 1261 |  | 
| 1262 | /* | 
| 1263 |  * DTrace Enablings | 
| 1264 |  * | 
| 1265 |  * A dtrace_enabling structure is used to track a collection of ECB | 
| 1266 |  * descriptions -- before they have been turned into actual ECBs.  This is | 
| 1267 |  * created as a result of DOF processing, and is generally used to generate | 
| 1268 |  * ECBs immediately thereafter.  However, enablings are also generally | 
| 1269 |  * retained should the probes they describe be created at a later time; as | 
| 1270 |  * each new module or provider registers with the framework, the retained | 
| 1271 |  * enablings are reevaluated, with any new match resulting in new ECBs.  To | 
| 1272 |  * prevent probes from being matched more than once, the enabling tracks the | 
| 1273 |  * last probe generation matched, and only matches probes from subsequent | 
| 1274 |  * generations. | 
| 1275 |  */ | 
| 1276 | typedef struct dtrace_enabling { | 
| 1277 | 	dtrace_ecbdesc_t **dten_desc;		/* all ECB descriptions */ | 
| 1278 | 	int dten_ndesc;				/* number of ECB descriptions */ | 
| 1279 | 	int dten_maxdesc;			/* size of ECB array */ | 
| 1280 | 	dtrace_vstate_t *dten_vstate;		/* associated variable state */ | 
| 1281 | 	dtrace_genid_t dten_probegen;		/* matched probe generation */ | 
| 1282 | 	dtrace_ecbdesc_t *dten_current;		/* current ECB description */ | 
| 1283 | 	int dten_error;				/* current error value */ | 
| 1284 | 	int dten_primed;			/* boolean: set if primed */ | 
| 1285 | 	struct dtrace_enabling *dten_prev;	/* previous enabling */ | 
| 1286 | 	struct dtrace_enabling *dten_next;	/* next enabling */ | 
| 1287 | } dtrace_enabling_t; | 
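
/*
 * Illustrative sketch of generation-based matching: when a retained
 * enabling is reevaluated, probes already seen in a prior pass are skipped
 * and the enabling's watermark is then advanced.  The field names follow
 * the structures in this file, but the surrounding loop is schematic:
 *
 *	if (probe->dtpr_gen <= enab->dten_probegen)
 *		continue;	// matched in an earlier pass; skip
 *	// ... otherwise create ECBs from the enabling's descriptions ...
 */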
| 1288 |  | 
| 1289 | /* | 
| 1290 |  * DTrace Anonymous Enablings | 
| 1291 |  * | 
| 1292 |  * Anonymous enablings are DTrace enablings that are not associated with a | 
| 1293 |  * controlling process, but rather derive their enabling from DOF stored as | 
| 1294 |  * properties in the dtrace.conf file.  If there is an anonymous enabling, a | 
| 1295 |  * DTrace consumer state and enabling are created on attach.  The state may be | 
| 1296 |  * subsequently grabbed by the first consumer specifying the "grabanon" | 
| 1297 |  * option.  As long as an anonymous DTrace enabling exists, dtrace(7D) will | 
| 1298 |  * refuse to unload. | 
| 1299 |  */ | 
| 1300 | typedef struct dtrace_anon { | 
| 1301 | 	dtrace_state_t *dta_state;		/* DTrace consumer state */ | 
| 1302 | 	dtrace_enabling_t *dta_enabling;	/* pointer to enabling */ | 
| 1303 | 	processorid_t dta_beganon;		/* which CPU BEGIN ran on */ | 
| 1304 | } dtrace_anon_t; | 
| 1305 |  | 
| 1306 | /* | 
| 1307 |  * DTrace Error Debugging | 
| 1308 |  */ | 
| 1309 | #if DEBUG | 
| 1310 | #define	DTRACE_ERRDEBUG | 
| 1311 | #endif | 
| 1312 |  | 
| 1313 | #ifdef DTRACE_ERRDEBUG | 
| 1314 |  | 
| 1315 | typedef struct dtrace_errhash { | 
| 1316 | 	const char	*dter_msg;	/* error message */ | 
| 1317 | 	int		dter_count;	/* number of times seen */ | 
| 1318 | } dtrace_errhash_t; | 
| 1319 |  | 
| 1320 | #define	DTRACE_ERRHASHSZ	256	/* must be > number of err msgs */ | 
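
/*
 * Illustrative sketch of recording an error message in the hash (the hash
 * function and the probe-context synchronization of the real
 * implementation differ; this shows only the open-addressing idea, which
 * terminates because the table is larger than the set of messages):
 *
 *	extern dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ];
 *
 *	static void
 *	errhash_record(const char *msg)
 *	{
 *		uint_t h = (uint_t)((uintptr_t)msg % DTRACE_ERRHASHSZ);
 *
 *		while (dtrace_errhash[h].dter_msg != NULL &&
 *		    dtrace_errhash[h].dter_msg != msg)
 *			h = (h + 1) % DTRACE_ERRHASHSZ;	// linear probing
 *
 *		dtrace_errhash[h].dter_msg = msg;
 *		dtrace_errhash[h].dter_count++;
 *	}
 */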
| 1321 |  | 
| 1322 | #endif	/* DTRACE_ERRDEBUG */ | 
| 1323 |  | 
| 1324 | typedef struct dtrace_string dtrace_string_t; | 
| 1325 |  | 
| 1326 | typedef struct dtrace_string { | 
| 1327 | 	dtrace_string_t *dtst_next; | 
| 1328 | 	dtrace_string_t *dtst_prev; | 
| 1329 | 	uint32_t dtst_refcount; | 
| 1330 | 	char dtst_str[]; | 
| 1331 | } dtrace_string_t; | 
| 1332 |  | 
| 1333 | /** | 
| 1334 |  * DTrace Matching pre-conditions | 
| 1335 |  * | 
| 1336 |  * Used when matching new probes to discard matching of enablings that | 
| 1337 |  * doesn't match the condition tested by dmc_func | 
| 1338 |  */ | 
| 1339 | typedef struct dtrace_match_cond { | 
| 1340 | 	int (*dmc_func)(dtrace_probedesc_t*, void*); | 
| 1341 | 	void *dmc_data; | 
| 1342 | } dtrace_match_cond_t; | 
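
/*
 * Illustrative usage (hypothetical predicate; any test over the probe
 * description may serve as the condition):
 *
 *	static int
 *	match_only_provider(dtrace_probedesc_t *desc, void *data)
 *	{
 *		return (strncmp(desc->dtpd_provider, (const char *)data,
 *		    DTRACE_PROVNAMELEN) == 0);
 *	}
 *
 *	dtrace_match_cond_t cond = {
 *		.dmc_func = match_only_provider,
 *		.dmc_data = (void *)"fbt"
 *	};
 */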
| 1343 |  | 
| 1344 |  | 
| 1345 | /* | 
| 1346 |  * DTrace Toxic Ranges | 
| 1347 |  * | 
| 1348 |  * DTrace supports safe loads from probe context; if the address turns out to | 
| 1349 |  * be invalid, a bit will be set by the kernel indicating that DTrace | 
| 1350 |  * encountered a memory error, and DTrace will propagate the error to the user | 
| 1351 |  * accordingly.  However, there may exist some regions of memory in which an | 
| 1352 |  * arbitrary load can change system state, and from which it is impossible to | 
 * recover once such a load has been attempted.  Examples of this may
| 1354 |  * include memory in which programmable I/O registers are mapped (for which a | 
| 1355 |  * read may have some implications for the device) or (in the specific case of | 
| 1356 |  * UltraSPARC-I and -II) the virtual address hole.  The platform is required | 
| 1357 |  * to make DTrace aware of these toxic ranges; DTrace will then check that | 
| 1358 |  * target addresses are not in a toxic range before attempting to issue a | 
| 1359 |  * safe load. | 
| 1360 |  */ | 
| 1361 | typedef struct dtrace_toxrange { | 
| 1362 | 	uintptr_t	dtt_base;		/* base of toxic range */ | 
| 1363 | 	uintptr_t	dtt_limit;		/* limit of toxic range */ | 
| 1364 | } dtrace_toxrange_t; | 
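
/*
 * Illustrative sketch of the toxic-range check (the extern declarations
 * are assumptions for illustration; the real test sits in the safe-load
 * path):
 *
 *	extern dtrace_toxrange_t *dtrace_toxrange;
 *	extern int dtrace_toxranges;
 *
 *	static int
 *	addr_is_toxic(uintptr_t addr, size_t size)
 *	{
 *		int i;
 *
 *		for (i = 0; i < dtrace_toxranges; i++) {
 *			// overlap test against [dtt_base, dtt_limit)
 *			if (addr + size > dtrace_toxrange[i].dtt_base &&
 *			    addr < dtrace_toxrange[i].dtt_limit)
 *				return (1);
 *		}
 *		return (0);
 *	}
 */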
| 1365 |  | 
| 1366 | extern uint64_t dtrace_getarg(int, int, dtrace_mstate_t*, dtrace_vstate_t*); | 
| 1367 | extern int dtrace_getipl(void); | 
| 1368 | extern uintptr_t dtrace_caller(int); | 
| 1369 | extern uint32_t dtrace_cas32(uint32_t *, uint32_t, uint32_t); | 
| 1370 | extern void *dtrace_casptr(void *, void *, void *); | 
| 1371 | extern void dtrace_copyin(user_addr_t, uintptr_t, size_t, volatile uint16_t *); | 
| 1372 | extern void dtrace_copyinstr(user_addr_t, uintptr_t, size_t, volatile uint16_t *); | 
| 1373 | extern void dtrace_copyout(uintptr_t, user_addr_t, size_t, volatile uint16_t *); | 
| 1374 | extern void dtrace_copyoutstr(uintptr_t, user_addr_t, size_t, volatile uint16_t *); | 
| 1375 | extern void dtrace_getpcstack(pc_t *, int, int, uint32_t *); | 
| 1376 | extern uint64_t dtrace_load64(uintptr_t); | 
| 1377 | extern int dtrace_canload(uint64_t, size_t, dtrace_mstate_t*, dtrace_vstate_t*); | 
| 1378 |  | 
| 1379 | extern uint64_t dtrace_getreg(struct regs *, uint_t); | 
| 1380 | extern uint64_t dtrace_getvmreg(uint_t); | 
| 1381 | extern int dtrace_getstackdepth(int); | 
| 1382 | extern void dtrace_getupcstack(uint64_t *, int); | 
| 1383 | extern void dtrace_getufpstack(uint64_t *, uint64_t *, int); | 
| 1384 | extern int dtrace_getustackdepth(void); | 
| 1385 | extern uintptr_t dtrace_fulword(void *); | 
| 1386 | extern uint8_t dtrace_fuword8(user_addr_t); | 
| 1387 | extern uint16_t dtrace_fuword16(user_addr_t); | 
| 1388 | extern uint32_t dtrace_fuword32(user_addr_t); | 
| 1389 | extern uint64_t dtrace_fuword64(user_addr_t); | 
| 1390 | extern int dtrace_proc_waitfor(dtrace_procdesc_t*); | 
| 1391 | extern void dtrace_probe_error(dtrace_state_t *, dtrace_epid_t, int, int, | 
| 1392 |     int, uint64_t); | 
| 1393 | extern int dtrace_assfail(const char *, const char *, int); | 
| 1394 | extern int dtrace_attached(void); | 
| 1395 | extern hrtime_t dtrace_gethrestime(void); | 
| 1396 |  | 
| 1397 | extern void dtrace_flush_caches(void); | 
| 1398 |  | 
| 1399 | extern void dtrace_copy(uintptr_t, uintptr_t, size_t); | 
| 1400 | extern void dtrace_copystr(uintptr_t, uintptr_t, size_t, volatile uint16_t *); | 
| 1401 |  | 
| 1402 | extern void* dtrace_ptrauth_strip(void*, uint64_t); | 
| 1403 | extern int dtrace_is_valid_ptrauth_key(uint64_t); | 
| 1404 |  | 
| 1405 | extern uint64_t dtrace_physmem_read(uint64_t, size_t); | 
| 1406 | extern void dtrace_physmem_write(uint64_t, uint64_t, size_t); | 
| 1407 |  | 
| 1408 | extern void dtrace_livedump(char *, size_t); | 
| 1409 |  | 
| 1410 | /* | 
| 1411 |  * DTrace state handling | 
| 1412 |  */ | 
| 1413 | extern minor_t dtrace_state_reserve(void); | 
| 1414 | extern dtrace_state_t* dtrace_state_allocate(minor_t minor); | 
| 1415 | extern dtrace_state_t* dtrace_state_get(minor_t minor); | 
| 1416 | extern void dtrace_state_free(minor_t minor); | 
| 1417 |  | 
| 1418 | /* | 
| 1419 |  * DTrace restriction checks | 
| 1420 |  */ | 
| 1421 | extern void dtrace_restriction_policy_load(void); | 
| 1422 | extern boolean_t dtrace_is_restricted(void); | 
| 1423 | extern boolean_t dtrace_are_restrictions_relaxed(void); | 
| 1424 | extern boolean_t dtrace_fbt_probes_restricted(void); | 
| 1425 | extern boolean_t dtrace_sdt_probes_restricted(void); | 
| 1426 | extern boolean_t dtrace_can_attach_to_proc(proc_t); | 
| 1427 |  | 
| 1428 | /* | 
| 1429 |  * DTrace Assertions | 
| 1430 |  * | 
| 1431 |  * DTrace calls ASSERT and VERIFY from probe context.  To assure that a failed | 
 * ASSERT or VERIFY does not induce a markedly more catastrophic failure (e.g.,
| 1433 |  * one from which a dump cannot be gleaned), DTrace must define its own ASSERT | 
| 1434 |  * and VERIFY macros to be ones that may safely be called from probe context. | 
| 1435 |  * This header file must thus be included by any DTrace component that calls | 
| 1436 |  * ASSERT and/or VERIFY from probe context, and _only_ by those components. | 
| 1437 |  * (The only exception to this is kernel debugging infrastructure at user-level | 
| 1438 |  * that doesn't depend on calling ASSERT.) | 
| 1439 |  */ | 
| 1440 | #undef ASSERT | 
| 1441 | #undef VERIFY | 
| 1442 |  | 
| 1443 | #define	VERIFY(EX)	((void)((EX) || \ | 
| 1444 | 			dtrace_assfail(#EX, __FILE__, __LINE__))) | 
| 1445 |  | 
| 1446 | #if DEBUG | 
| 1447 | #define	ASSERT(EX)	((void)((EX) || \ | 
| 1448 | 			dtrace_assfail(#EX, __FILE__, __LINE__))) | 
| 1449 | #else | 
#define	ASSERT(EX)	((void)0)
| 1451 | #endif | 
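
/*
 * Example: VERIFY remains active on all kernels, while ASSERT compiles
 * away outside of DEBUG; both expand to an expression that invokes
 * dtrace_assfail() on failure, which is safe in probe context.
 *
 *	VERIFY(ptr != NULL);
 *	ASSERT(size <= limit);	// no-op on non-DEBUG kernels
 */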
| 1452 |  | 
| 1453 | #ifdef	__cplusplus | 
| 1454 | } | 
| 1455 | #endif | 
| 1456 |  | 
| 1457 | #endif /* _SYS_DTRACE_IMPL_H */ | 
| 1458 |  | 
| 1459 |  |