| 1 | /* | 
| 2 |  * CDDL HEADER START | 
| 3 |  * | 
| 4 |  * The contents of this file are subject to the terms of the | 
| 5 |  * Common Development and Distribution License (the "License"). | 
| 6 |  * You may not use this file except in compliance with the License. | 
| 7 |  * | 
| 8 |  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | 
| 9 |  * or http://www.opensolaris.org/os/licensing. | 
| 10 |  * See the License for the specific language governing permissions | 
| 11 |  * and limitations under the License. | 
| 12 |  * | 
| 13 |  * When distributing Covered Code, include this CDDL HEADER in each | 
| 14 |  * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | 
| 15 |  * If applicable, add the following below this CDDL HEADER, with the | 
| 16 |  * fields enclosed by brackets "[]" replaced with your own identifying | 
| 17 |  * information: Portions Copyright [yyyy] [name of copyright owner] | 
| 18 |  * | 
| 19 |  * CDDL HEADER END | 
| 20 |  */ | 
| 21 |  | 
| 22 | /* | 
| 23 |  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved. | 
| 24 |  * Use is subject to license terms. | 
| 25 |  * | 
| 26 |  * Portions Copyright (c) 2012 by Delphix. All rights reserved. | 
| 27 |  * Portions Copyright (c) 2016 by Joyent, Inc. | 
| 28 |  */ | 
| 29 |  | 
| 30 | #ifndef _SYS_DTRACE_IMPL_H | 
| 31 | #define	_SYS_DTRACE_IMPL_H | 
| 32 |  | 
| 33 | #ifdef	__cplusplus | 
| 34 | extern "C"  { | 
| 35 | #endif | 
| 36 |  | 
| 37 | /* | 
| 38 |  * DTrace Dynamic Tracing Software: Kernel Implementation Interfaces | 
| 39 |  * | 
| 40 |  * Note: The contents of this file are private to the implementation of the | 
| 41 |  * Solaris system and DTrace subsystem and are subject to change at any time | 
| 42 |  * without notice.  Applications and drivers using these interfaces will fail | 
| 43 |  * to run on future releases.  These interfaces should not be used for any | 
| 44 |  * purpose except those expressly outlined in dtrace(7D) and libdtrace(3LIB). | 
| 45 |  * Please refer to the "Solaris Dynamic Tracing Guide" for more information. | 
| 46 |  */ | 
| 47 |  | 
| 48 | #include <sys/dtrace.h> | 
| 49 | #include <kern/kalloc.h> | 
| 50 |  | 
| 51 | /* | 
| 52 |  * DTrace Implementation Locks | 
| 53 |  */ | 
| 54 | extern lck_attr_t dtrace_lck_attr; | 
| 55 | extern lck_grp_t dtrace_lck_grp; | 
| 56 | extern lck_mtx_t dtrace_procwaitfor_lock; | 
| 57 |  | 
| 58 | /* | 
| 59 |  * DTrace Implementation Constants and Typedefs | 
| 60 |  */ | 
| 61 | #define	DTRACE_MAXPROPLEN		128 | 
| 62 | #define	DTRACE_DYNVAR_CHUNKSIZE		256 | 
| 63 |  | 
| 64 | struct dtrace_probe; | 
| 65 | struct dtrace_ecb; | 
| 66 | struct dtrace_predicate; | 
| 67 | struct dtrace_action; | 
| 68 | struct dtrace_provider; | 
| 69 | struct dtrace_state; | 
| 70 |  | 
| 71 | typedef struct dtrace_probe dtrace_probe_t; | 
| 72 | typedef struct dtrace_ecb dtrace_ecb_t; | 
| 73 | typedef struct dtrace_predicate dtrace_predicate_t; | 
| 74 | typedef struct dtrace_action dtrace_action_t; | 
| 75 | typedef struct dtrace_provider dtrace_provider_t; | 
| 76 | typedef struct dtrace_meta dtrace_meta_t; | 
| 77 | typedef struct dtrace_state dtrace_state_t; | 
| 78 | typedef uint32_t dtrace_optid_t; | 
| 79 | typedef uint32_t dtrace_specid_t; | 
| 80 | typedef uint64_t dtrace_genid_t; | 
| 81 |  | 
| 82 | /* | 
| 83 |  * DTrace Probes | 
| 84 |  * | 
| 85 |  * The probe is the fundamental unit of the DTrace architecture.  Probes are | 
| 86 |  * created by DTrace providers, and managed by the DTrace framework.  A probe | 
| 87 |  * is identified by a unique <provider, module, function, name> tuple, and has | 
| 88 |  * a unique probe identifier assigned to it.  (Some probes are not associated | 
| 89 |  * with a specific point in text; these are called _unanchored probes_ and have | 
| 90 |  * no module or function associated with them.)  Probes are represented as a | 
| 91 |  * dtrace_probe structure.  To allow quick lookups based on each element of the | 
| 92 |  * probe tuple, probes are hashed by each of provider, module, function and | 
| 93 |  * name.  (If a lookup is performed based on a regular expression, a | 
| 94 |  * dtrace_probekey is prepared, and a linear search is performed.) Each probe | 
| 95 |  * is additionally pointed to by a linear array indexed by its identifier.  The | 
| 96 |  * identifier is the provider's mechanism for indicating to the DTrace | 
| 97 |  * framework that a probe has fired:  the identifier is passed as the first | 
| 98 |  * argument to dtrace_probe(), where it is then mapped into the corresponding | 
| 99 |  * dtrace_probe structure.  From the dtrace_probe structure, dtrace_probe() can | 
| 100 |  * iterate over the probe's list of enabling control blocks; see "DTrace | 
 * Enabling Control Blocks", below.
| 102 |  */ | 
| 103 | struct dtrace_probe { | 
| 104 | 	dtrace_id_t dtpr_id;			/* probe identifier */ | 
| 105 | 	dtrace_ecb_t *dtpr_ecb;			/* ECB list; see below */ | 
| 106 | 	dtrace_ecb_t *dtpr_ecb_last;		/* last ECB in list */ | 
| 107 | 	void *dtpr_arg;				/* provider argument */ | 
| 108 | 	dtrace_cacheid_t dtpr_predcache;	/* predicate cache ID */ | 
| 109 | 	int dtpr_aframes;			/* artificial frames */ | 
| 110 | 	dtrace_provider_t *dtpr_provider;	/* pointer to provider */ | 
| 111 | 	char *dtpr_mod;				/* probe's module name */ | 
| 112 | 	char *dtpr_func;			/* probe's function name */ | 
| 113 | 	char *dtpr_name;			/* probe's name */ | 
| 114 | 	dtrace_probe_t *dtpr_nextprov;		/* next in provider hash */ | 
| 115 | 	dtrace_probe_t *dtpr_prevprov;		/* previous in provider hash */ | 
| 116 | 	dtrace_probe_t *dtpr_nextmod;		/* next in module hash */ | 
| 117 | 	dtrace_probe_t *dtpr_prevmod;		/* previous in module hash */ | 
| 118 | 	dtrace_probe_t *dtpr_nextfunc;		/* next in function hash */ | 
| 119 | 	dtrace_probe_t *dtpr_prevfunc;		/* previous in function hash */ | 
| 120 | 	dtrace_probe_t *dtpr_nextname;		/* next in name hash */ | 
| 121 | 	dtrace_probe_t *dtpr_prevname;		/* previous in name hash */ | 
| 122 | 	dtrace_genid_t dtpr_gen;		/* probe generation ID */ | 
| 123 | }; | 
| 124 |  | 
| 125 | typedef int dtrace_probekey_f(const char *, const char *, int); | 
| 126 |  | 
| 127 | typedef struct dtrace_probekey { | 
| 128 | 	const char *dtpk_prov;			/* provider name to match */ | 
| 129 | 	dtrace_probekey_f *dtpk_pmatch;		/* provider matching function */ | 
| 130 | 	const char *dtpk_mod;			/* module name to match */ | 
| 131 | 	dtrace_probekey_f *dtpk_mmatch;		/* module matching function */ | 
| 132 | 	const char *dtpk_func;			/* func name to match */ | 
| 133 | 	dtrace_probekey_f *dtpk_fmatch;		/* func matching function */ | 
| 134 | 	const char *dtpk_name;			/* name to match */ | 
| 135 | 	dtrace_probekey_f *dtpk_nmatch;		/* name matching function */ | 
| 136 | 	dtrace_id_t dtpk_id;			/* identifier to match */ | 
| 137 | } dtrace_probekey_t; | 
| 138 |  | 
| 139 | typedef struct dtrace_hashbucket { | 
| 140 | 	struct dtrace_hashbucket *dthb_next;	/* next on hash chain */ | 
| 141 | 	void *dthb_chain;			/* chain of elements */ | 
| 142 | 	int dthb_len;				/* number of probes here */ | 
| 143 | } dtrace_hashbucket_t; | 
| 144 |  | 
typedef const char *dtrace_strkey_f(void *, uintptr_t);
| 146 |  | 
| 147 | typedef struct dtrace_hash { | 
| 148 | 	dtrace_hashbucket_t **dth_tab;	/* hash table */ | 
| 149 | 	int dth_size;			/* size of hash table */ | 
| 150 | 	int dth_mask;			/* mask to index into table */ | 
| 151 | 	int dth_nbuckets;		/* total number of buckets */ | 
| 152 | 	uintptr_t dth_nextoffs;		/* offset of next in element */ | 
| 153 | 	uintptr_t dth_prevoffs;		/* offset of prev in element */ | 
| 154 | 	dtrace_strkey_f *dth_getstr;	/* func to retrieve str in element */ | 
| 155 | 	uintptr_t dth_stroffs;		/* offset of str in element */ | 
| 156 | } dtrace_hash_t; | 
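
/*
 * Because dtrace_hash_t records the offsets of the next/previous links and
 * of the string key within the hashed element, a single hash implementation
 * can serve the provider, module, function and name hashes alike.  A sketch
 * of hypothetical helpers (not the framework's actual accessors) that use
 * these offsets:
 *
 *	static void *
 *	hash_next(dtrace_hash_t *hash, void *elm)
 *	{
 *		return (*(void **)((uintptr_t)elm + hash->dth_nextoffs));
 *	}
 *
 *	static const char *
 *	hash_str(dtrace_hash_t *hash, void *elm)
 *	{
 *		if (hash->dth_getstr != NULL)
 *			return (hash->dth_getstr(elm, hash->dth_stroffs));
 *
 *		return (*(char **)((uintptr_t)elm + hash->dth_stroffs));
 *	}
 */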
| 157 |  | 
| 158 | /* | 
| 159 |  * DTrace Enabling Control Blocks | 
| 160 |  * | 
| 161 |  * When a provider wishes to fire a probe, it calls into dtrace_probe(), | 
| 162 |  * passing the probe identifier as the first argument.  As described above, | 
| 163 |  * dtrace_probe() maps the identifier into a pointer to a dtrace_probe_t | 
| 164 |  * structure.  This structure contains information about the probe, and a | 
| 165 |  * pointer to the list of Enabling Control Blocks (ECBs).  Each ECB points to | 
| 166 |  * DTrace consumer state, and contains an optional predicate, and a list of | 
| 167 |  * actions.  (Shown schematically below.)  The ECB abstraction allows a single | 
| 168 |  * probe to be multiplexed across disjoint consumers, or across disjoint | 
| 169 |  * enablings of a single probe within one consumer. | 
| 170 |  * | 
| 171 |  *   Enabling Control Block | 
| 172 |  *        dtrace_ecb_t | 
| 173 |  * +------------------------+ | 
| 174 |  * | dtrace_epid_t ---------+--------------> Enabled Probe ID (EPID) | 
| 175 |  * | dtrace_state_t * ------+--------------> State associated with this ECB | 
| 176 |  * | dtrace_predicate_t * --+---------+ | 
| 177 |  * | dtrace_action_t * -----+----+    | | 
| 178 |  * | dtrace_ecb_t * ---+    |    |    |       Predicate (if any) | 
| 179 |  * +-------------------+----+    |    |       dtrace_predicate_t | 
| 180 |  *                     |         |    +---> +--------------------+ | 
| 181 |  *                     |         |          | dtrace_difo_t * ---+----> DIFO | 
| 182 |  *                     |         |          +--------------------+ | 
| 183 |  *                     |         | | 
| 184 |  *            Next ECB |         |           Action | 
| 185 |  *            (if any) |         |       dtrace_action_t | 
| 186 |  *                     :         +--> +-------------------+ | 
| 187 |  *                     :              | dtrace_actkind_t -+------> kind | 
| 188 |  *                     v              | dtrace_difo_t * --+------> DIFO (if any) | 
| 189 |  *                                    | dtrace_recdesc_t -+------> record descr. | 
| 190 |  *                                    | dtrace_action_t * +------+ | 
| 191 |  *                                    +-------------------+      | | 
| 192 |  *                                                               | Next action | 
| 193 |  *                               +-------------------------------+  (if any) | 
| 194 |  *                               | | 
| 195 |  *                               |           Action | 
| 196 |  *                               |       dtrace_action_t | 
| 197 |  *                               +--> +-------------------+ | 
| 198 |  *                                    | dtrace_actkind_t -+------> kind | 
| 199 |  *                                    | dtrace_difo_t * --+------> DIFO (if any) | 
| 200 |  *                                    | dtrace_action_t * +------+ | 
| 201 |  *                                    +-------------------+      | | 
| 202 |  *                                                               | Next action | 
| 203 |  *                               +-------------------------------+  (if any) | 
| 204 |  *                               | | 
| 205 |  *                               : | 
| 206 |  *                               v | 
| 207 |  * | 
| 208 |  * | 
| 209 |  * dtrace_probe() iterates over the ECB list.  If the ECB needs less space | 
| 210 |  * than is available in the principal buffer, the ECB is processed:  if the | 
| 211 |  * predicate is non-NULL, the DIF object is executed.  If the result is | 
| 212 |  * non-zero, the action list is processed, with each action being executed | 
| 213 |  * accordingly.  When the action list has been completely executed, processing | 
 * advances to the next ECB.  The ECB abstraction allows disjoint consumers
 * to multiplex on a single probe.
| 216 |  * | 
| 217 |  * Execution of the ECB results in consuming dte_size bytes in the buffer | 
| 218 |  * to record data.  During execution, dte_needed bytes must be available in | 
| 219 |  * the buffer.  This space is used for both recorded data and tuple data. | 
| 220 |  */ | 
| 221 | struct dtrace_ecb { | 
| 222 | 	dtrace_epid_t dte_epid;			/* enabled probe ID */ | 
| 223 | 	uint32_t dte_alignment;			/* required alignment */ | 
| 224 | 	size_t dte_needed;			/* space needed for execution */ | 
| 225 | 	size_t dte_size;			/* size of recorded payload */ | 
| 226 | 	dtrace_predicate_t *dte_predicate;	/* predicate, if any */ | 
| 227 | 	dtrace_action_t *dte_action;		/* actions, if any */ | 
| 228 | 	dtrace_ecb_t *dte_next;			/* next ECB on probe */ | 
| 229 | 	dtrace_state_t *dte_state;		/* pointer to state */ | 
| 230 | 	uint32_t dte_cond;			/* security condition */ | 
| 231 | 	dtrace_probe_t *dte_probe;		/* pointer to probe */ | 
| 232 | 	dtrace_action_t *dte_action_last;	/* last action on ECB */ | 
| 233 | 	uint64_t dte_uarg;			/* library argument */ | 
| 234 | }; | 
| 235 |  | 
| 236 | struct dtrace_predicate { | 
| 237 | 	dtrace_difo_t *dtp_difo;		/* DIF object */ | 
| 238 | 	dtrace_cacheid_t dtp_cacheid;		/* cache identifier */ | 
| 239 | 	int dtp_refcnt;				/* reference count */ | 
| 240 | }; | 
| 241 |  | 
| 242 | struct dtrace_action { | 
| 243 | 	dtrace_actkind_t dta_kind;		/* kind of action */ | 
| 244 | 	uint16_t dta_intuple;			/* boolean:  in aggregation */ | 
| 245 | 	uint32_t dta_refcnt;			/* reference count */ | 
| 246 | 	dtrace_difo_t *dta_difo;		/* pointer to DIFO */ | 
| 247 | 	dtrace_recdesc_t dta_rec;		/* record description */ | 
| 248 | 	dtrace_action_t *dta_prev;		/* previous action */ | 
| 249 | 	dtrace_action_t *dta_next;		/* next action */ | 
| 250 | }; | 
| 251 |  | 
| 252 | typedef struct dtrace_aggregation { | 
| 253 | 	dtrace_action_t dtag_action;		/* action; must be first */ | 
| 254 | 	dtrace_aggid_t dtag_id;			/* identifier */ | 
| 255 | 	dtrace_ecb_t *dtag_ecb;			/* corresponding ECB */ | 
| 256 | 	dtrace_action_t *dtag_first;		/* first action in tuple */ | 
| 257 | 	uint32_t dtag_base;			/* base of aggregation */ | 
| 258 | 	uint8_t dtag_hasarg;			/* boolean:  has argument */ | 
| 259 | 	uint64_t dtag_initial;			/* initial value */ | 
| 260 | 	void (*dtag_aggregate)(uint64_t *, uint64_t, uint64_t); | 
| 261 | } dtrace_aggregation_t; | 
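
/*
 * dtag_aggregate points to the function that folds a new value into the
 * aggregation's current value.  A sketch of a plausible aggregator, modeled
 * on the count() action (the new value and argument are unused):
 *
 *	static void
 *	aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
 *	{
 *		(void) nval;
 *		(void) arg;
 *		*oval = *oval + 1;
 *	}
 */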
| 262 |  | 
| 263 | /* | 
| 264 |  * DTrace Buffers | 
| 265 |  * | 
| 266 |  * Principal buffers, aggregation buffers, and speculative buffers are all | 
| 267 |  * managed with the dtrace_buffer structure.  By default, this structure | 
| 268 |  * includes twin data buffers -- dtb_tomax and dtb_xamot -- that serve as the | 
| 269 |  * active and passive buffers, respectively.  For speculative buffers, | 
| 270 |  * dtb_xamot will be NULL; for "ring" and "fill" buffers, dtb_xamot will point | 
| 271 |  * to a scratch buffer.  For all buffer types, the dtrace_buffer structure is | 
| 272 |  * always allocated on a per-CPU basis; a single dtrace_buffer structure is | 
| 273 |  * never shared among CPUs.  (That is, there is never true sharing of the | 
| 274 |  * dtrace_buffer structure; to prevent false sharing of the structure, it must | 
| 275 |  * always be aligned to the coherence granularity -- generally 64 bytes.) | 
| 276 |  * | 
| 277 |  * One of the critical design decisions of DTrace is that a given ECB always | 
| 278 |  * stores the same quantity and type of data.  This is done to assure that the | 
| 279 |  * only metadata required for an ECB's traced data is the EPID.  That is, from | 
| 280 |  * the EPID, the consumer can determine the data layout.  (The data buffer | 
| 281 |  * layout is shown schematically below.)  By assuring that one can determine | 
| 282 |  * data layout from the EPID, the metadata stream can be separated from the | 
 * data stream -- simplifying the data stream enormously.  A record header
 * (dtrace_rechdr_t) always precedes the recorded data; it includes the EPID
 * and a high-resolution timestamp used for output ordering consistency.
| 287 |  * | 
| 288 |  *      base of data buffer --->  +--------+--------------------+--------+ | 
| 289 |  *                                | rechdr | data               | rechdr | | 
| 290 |  *                                +--------+------+--------+----+--------+ | 
| 291 |  *                                | data          | rechdr | data        | | 
| 292 |  *                                +---------------+--------+-------------+ | 
| 293 |  *                                | data, cont.                          | | 
| 294 |  *                                +--------+--------------------+--------+ | 
| 295 |  *                                | rechdr | data               |        | | 
| 296 |  *                                +--------+--------------------+        | | 
| 297 |  *                                |                ||                    | | 
| 298 |  *                                |                ||                    | | 
| 299 |  *                                |                \/                    | | 
| 300 |  *                                :                                      : | 
| 301 |  *                                .                                      . | 
| 302 |  *                                .                                      . | 
| 303 |  *                                .                                      . | 
| 304 |  *                                :                                      : | 
| 305 |  *                                |                                      | | 
| 306 |  *     limit of data buffer --->  +--------------------------------------+ | 
| 307 |  * | 
| 308 |  * When evaluating an ECB, dtrace_probe() determines if the ECB's needs of the | 
| 309 |  * principal buffer (both scratch and payload) exceed the available space.  If | 
| 310 |  * the ECB's needs exceed available space (and if the principal buffer policy | 
| 311 |  * is the default "switch" policy), the ECB is dropped, the buffer's drop count | 
| 312 |  * is incremented, and processing advances to the next ECB.  If the ECB's needs | 
| 313 |  * can be met with the available space, the ECB is processed, but the offset in | 
| 314 |  * the principal buffer is only advanced if the ECB completes processing | 
| 315 |  * without error. | 
| 316 |  * | 
| 317 |  * When a buffer is to be switched (either because the buffer is the principal | 
| 318 |  * buffer with a "switch" policy or because it is an aggregation buffer), a | 
| 319 |  * cross call is issued to the CPU associated with the buffer.  In the cross | 
| 320 |  * call context, interrupts are disabled, and the active and the inactive | 
| 321 |  * buffers are atomically switched.  This involves switching the data pointers, | 
| 322 |  * copying the various state fields (offset, drops, errors, etc.) into their | 
| 323 |  * inactive equivalents, and clearing the state fields.  Because interrupts are | 
| 324 |  * disabled during this procedure, the switch is guaranteed to appear atomic to | 
| 325 |  * dtrace_probe(). | 
| 326 |  * | 
| 327 |  * DTrace Ring Buffering | 
| 328 |  * | 
| 329 |  * To process a ring buffer correctly, one must know the oldest valid record. | 
| 330 |  * Processing starts at the oldest record in the buffer and continues until | 
| 331 |  * the end of the buffer is reached.  Processing then resumes starting with | 
| 332 |  * the record stored at offset 0 in the buffer, and continues until the | 
 * youngest record is processed.  If trace records are of a fixed length,
| 334 |  * determining the oldest record is trivial: | 
| 335 |  * | 
| 336 |  *   - If the ring buffer has not wrapped, the oldest record is the record | 
| 337 |  *     stored at offset 0. | 
| 338 |  * | 
| 339 |  *   - If the ring buffer has wrapped, the oldest record is the record stored | 
| 340 |  *     at the current offset. | 
| 341 |  * | 
| 342 |  * With variable length records, however, just knowing the current offset | 
| 343 |  * doesn't suffice for determining the oldest valid record:  assuming that one | 
| 344 |  * allows for arbitrary data, one has no way of searching forward from the | 
| 345 |  * current offset to find the oldest valid record.  (That is, one has no way | 
| 346 |  * of separating data from metadata.) It would be possible to simply refuse to | 
| 347 |  * process any data in the ring buffer between the current offset and the | 
| 348 |  * limit, but this leaves (potentially) an enormous amount of otherwise valid | 
| 349 |  * data unprocessed. | 
| 350 |  * | 
| 351 |  * To effect ring buffering, we track two offsets in the buffer:  the current | 
| 352 |  * offset and the _wrapped_ offset.  If a request is made to reserve some | 
| 353 |  * amount of data, and the buffer has wrapped, the wrapped offset is | 
| 354 |  * incremented until the wrapped offset minus the current offset is greater | 
| 355 |  * than or equal to the reserve request.  This is done by repeatedly looking | 
| 356 |  * up the ECB corresponding to the EPID at the current wrapped offset, and | 
| 357 |  * incrementing the wrapped offset by the size of the data payload | 
| 358 |  * corresponding to that ECB.  If this offset is greater than or equal to the | 
| 359 |  * limit of the data buffer, the wrapped offset is set to 0.  Thus, the | 
| 360 |  * current offset effectively "chases" the wrapped offset around the buffer. | 
| 361 |  * Schematically: | 
| 362 |  * | 
| 363 |  *      base of data buffer --->  +------+--------------------+------+ | 
| 364 |  *                                | EPID | data               | EPID | | 
| 365 |  *                                +------+--------+------+----+------+ | 
| 366 |  *                                | data          | EPID | data      | | 
| 367 |  *                                +---------------+------+-----------+ | 
| 368 |  *                                | data, cont.                      | | 
| 369 |  *                                +------+---------------------------+ | 
| 370 |  *                                | EPID | data                      | | 
| 371 |  *           current offset --->  +------+---------------------------+ | 
| 372 |  *                                | invalid data                     | | 
| 373 |  *           wrapped offset --->  +------+--------------------+------+ | 
| 374 |  *                                | EPID | data               | EPID | | 
| 375 |  *                                +------+--------+------+----+------+ | 
| 376 |  *                                | data          | EPID | data      | | 
| 377 |  *                                +---------------+------+-----------+ | 
| 378 |  *                                :                                  : | 
| 379 |  *                                .                                  . | 
| 380 |  *                                .        ... valid data ...        . | 
| 381 |  *                                .                                  . | 
| 382 |  *                                :                                  : | 
| 383 |  *                                +------+-------------+------+------+ | 
| 384 |  *                                | EPID | data        | EPID | data | | 
| 385 |  *                                +------+------------++------+------+ | 
| 386 |  *                                | data, cont.       | leftover     | | 
| 387 |  *     limit of data buffer --->  +-------------------+--------------+ | 
| 388 |  * | 
| 389 |  * If the amount of requested buffer space exceeds the amount of space | 
| 390 |  * available between the current offset and the end of the buffer: | 
| 391 |  * | 
| 392 |  *  (1)  all words in the data buffer between the current offset and the limit | 
| 393 |  *       of the data buffer (marked "leftover", above) are set to | 
| 394 |  *       DTRACE_EPIDNONE | 
| 395 |  * | 
| 396 |  *  (2)  the wrapped offset is set to zero | 
| 397 |  * | 
| 398 |  *  (3)  the iteration process described above occurs until the wrapped offset | 
| 399 |  *       is greater than the amount of desired space. | 
| 400 |  * | 
| 401 |  * The wrapped offset is implemented by (re-)using the inactive offset. | 
| 402 |  * In a "switch" buffer policy, the inactive offset stores the offset in | 
| 403 |  * the inactive buffer; in a "ring" buffer policy, it stores the wrapped | 
| 404 |  * offset. | 
| 405 |  * | 
| 406 |  * DTrace Scratch Buffering | 
| 407 |  * | 
| 408 |  * Some ECBs may wish to allocate dynamically-sized temporary scratch memory. | 
| 409 |  * To accommodate such requests easily, scratch memory may be allocated in | 
| 410 |  * the buffer beyond the current offset plus the needed memory of the current | 
| 411 |  * ECB.  If there isn't sufficient room in the buffer for the requested amount | 
| 412 |  * of scratch space, the allocation fails and an error is generated.  Scratch | 
| 413 |  * memory is tracked in the dtrace_mstate_t and is automatically freed when | 
| 414 |  * the ECB ceases processing.  Note that ring buffers cannot allocate their | 
| 415 |  * scratch from the principal buffer -- lest they needlessly overwrite older, | 
| 416 |  * valid data.  Ring buffers therefore have their own dedicated scratch buffer | 
| 417 |  * from which scratch is allocated. | 
| 418 |  */ | 
| 419 | #define	DTRACEBUF_RING		0x0001		/* bufpolicy set to "ring" */ | 
| 420 | #define	DTRACEBUF_FILL		0x0002		/* bufpolicy set to "fill" */ | 
| 421 | #define	DTRACEBUF_NOSWITCH	0x0004		/* do not switch buffer */ | 
| 422 | #define	DTRACEBUF_WRAPPED	0x0008		/* ring buffer has wrapped */ | 
| 423 | #define	DTRACEBUF_DROPPED	0x0010		/* drops occurred */ | 
| 424 | #define	DTRACEBUF_ERROR		0x0020		/* errors occurred */ | 
| 425 | #define	DTRACEBUF_FULL		0x0040		/* "fill" buffer is full */ | 
| 426 | #define	DTRACEBUF_CONSUMED	0x0080		/* buffer has been consumed */ | 
| 427 | #define	DTRACEBUF_INACTIVE	0x0100		/* buffer is not yet active */ | 
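
/*
 * A minimal sketch of the scratch allocation described above, using the
 * dtrace_mstate_t fields defined later in this file; fault reporting and
 * the alignment of the scratch pointer are elided:
 *
 *	uintptr_t
 *	scratch_alloc(dtrace_mstate_t *mstate, size_t size)
 *	{
 *		uintptr_t ptr = mstate->dtms_scratch_ptr;
 *
 *		if (ptr + size > mstate->dtms_scratch_base +
 *		    mstate->dtms_scratch_size)
 *			return (0);
 *
 *		mstate->dtms_scratch_ptr = ptr + size;
 *		return (ptr);
 *	}
 */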
| 428 |  | 
| 429 | typedef struct dtrace_buffer { | 
| 430 | 	uint64_t dtb_offset;			/* current offset in buffer */ | 
| 431 | 	uint64_t dtb_cur_limit;			/* current limit before signaling/dropping */ | 
| 432 | 	uint64_t dtb_limit;			/* limit before signaling */ | 
| 433 | 	uint64_t dtb_size;			/* size of buffer */ | 
| 434 | 	uint32_t dtb_flags;			/* flags */ | 
| 435 | 	uint32_t dtb_drops;			/* number of drops */ | 
| 436 | 	caddr_t dtb_tomax;			/* active buffer */ | 
| 437 | 	caddr_t dtb_xamot;			/* inactive buffer */ | 
| 438 | 	uint32_t dtb_xamot_flags;		/* inactive flags */ | 
| 439 | 	uint32_t dtb_xamot_drops;		/* drops in inactive buffer */ | 
| 440 | 	uint64_t dtb_xamot_offset;		/* offset in inactive buffer */ | 
| 441 | 	uint32_t dtb_errors;			/* number of errors */ | 
| 442 | 	uint32_t dtb_xamot_errors;		/* errors in inactive buffer */ | 
| 443 | #ifndef _LP64 | 
| 444 | 	uint64_t dtb_pad1; | 
| 445 | #endif | 
| 446 | 	uint64_t dtb_switched;			/* time of last switch */ | 
| 447 | 	uint64_t dtb_interval;			/* observed switch interval */ | 
| 448 | 	uint64_t dtb_pad2[4];			/* pad to avoid false sharing */ | 
| 449 | } dtrace_buffer_t; | 
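
/*
 * A schematic sketch of the ring-buffer reservation described above, which
 * applies only once DTRACEBUF_WRAPPED has been set.  As noted, the wrapped
 * offset reuses the inactive offset (dtb_xamot_offset); ecb_size_for() is a
 * hypothetical stand-in for the EPID-to-size lookup, and alignment and the
 * leftover-fill step are elided:
 *
 *	while (buf->dtb_xamot_offset - buf->dtb_offset < needed) {
 *		dtrace_epid_t epid = *(dtrace_epid_t *)
 *		    ((uintptr_t)buf->dtb_tomax + buf->dtb_xamot_offset);
 *
 *		buf->dtb_xamot_offset += ecb_size_for(epid);
 *
 *		if (buf->dtb_xamot_offset >= buf->dtb_size)
 *			buf->dtb_xamot_offset = 0;
 *	}
 */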
| 450 |  | 
| 451 | /* | 
| 452 |  * DTrace Aggregation Buffers | 
| 453 |  * | 
| 454 |  * Aggregation buffers use much of the same mechanism as described above | 
| 455 |  * ("DTrace Buffers").  However, because an aggregation is fundamentally a | 
| 456 |  * hash, there exists dynamic metadata associated with an aggregation buffer | 
| 457 |  * that is not associated with other kinds of buffers.  This aggregation | 
| 458 |  * metadata is _only_ relevant for the in-kernel implementation of | 
 * aggregations; it is not actually relevant to user-level consumers.  To keep
 * it out of the consumer-visible stream, we allocate the dynamic aggregation
 * metadata (hash keys and hash buckets) starting below the _limit_ of the
 * buffer, and we allocate data from the _base_ of the buffer.  When the
 * aggregation buffer is copied out, _only_ the
| 463 |  * data is copied out; the metadata is simply discarded.  Schematically, | 
| 464 |  * aggregation buffers look like: | 
| 465 |  * | 
| 466 |  *      base of data buffer --->  +-------+------+-----------+-------+ | 
| 467 |  *                                | aggid | key  | value     | aggid | | 
| 468 |  *                                +-------+------+-----------+-------+ | 
| 469 |  *                                | key                              | | 
| 470 |  *                                +-------+-------+-----+------------+ | 
| 471 |  *                                | value | aggid | key | value      | | 
| 472 |  *                                +-------+------++-----+------+-----+ | 
| 473 |  *                                | aggid | key  | value       |     | | 
| 474 |  *                                +-------+------+-------------+     | | 
| 475 |  *                                |                ||                | | 
| 476 |  *                                |                ||                | | 
| 477 |  *                                |                \/                | | 
| 478 |  *                                :                                  : | 
| 479 |  *                                .                                  . | 
| 480 |  *                                .                                  . | 
| 481 |  *                                .                                  . | 
| 482 |  *                                :                                  : | 
| 483 |  *                                |                /\                | | 
| 484 |  *                                |                ||   +------------+ | 
| 485 |  *                                |                ||   |            | | 
| 486 |  *                                +---------------------+            | | 
| 487 |  *                                | hash keys                        | | 
| 488 |  *                                | (dtrace_aggkey structures)       | | 
| 489 |  *                                |                                  | | 
| 490 |  *                                +----------------------------------+ | 
| 491 |  *                                | hash buckets                     | | 
| 492 |  *                                | (dtrace_aggbuffer structure)     | | 
| 493 |  *                                |                                  | | 
| 494 |  *     limit of data buffer --->  +----------------------------------+ | 
| 495 |  * | 
| 496 |  * | 
| 497 |  * As implied above, just as we assure that ECBs always store a constant | 
| 498 |  * amount of data, we assure that a given aggregation -- identified by its | 
| 499 |  * aggregation ID -- always stores data of a constant quantity and type. | 
| 500 |  * As with EPIDs, this allows the aggregation ID to serve as the metadata for a | 
| 501 |  * given record. | 
| 502 |  * | 
| 503 |  * Note that the size of the dtrace_aggkey structure must be sizeof (uintptr_t) | 
 * aligned.  (If the structure changes such that this becomes false, an
| 505 |  * assertion will fail in dtrace_aggregate().) | 
| 506 |  */ | 
| 507 | typedef struct dtrace_aggkey { | 
| 508 | 	uint32_t dtak_hashval;			/* hash value */ | 
| 509 | 	uint32_t dtak_action:4;			/* action -- 4 bits */ | 
| 510 | 	uint32_t dtak_size:28;			/* size -- 28 bits */ | 
| 511 | 	caddr_t dtak_data;			/* data pointer */ | 
| 512 | 	struct dtrace_aggkey *dtak_next;	/* next in hash chain */ | 
| 513 | } dtrace_aggkey_t; | 
| 514 |  | 
| 515 | typedef struct dtrace_aggbuffer { | 
| 516 | 	uintptr_t dtagb_hashsize;		/* number of buckets */ | 
| 517 | 	uintptr_t dtagb_free;			/* free list of keys */ | 
| 518 | 	dtrace_aggkey_t **dtagb_hash;		/* hash table */ | 
| 519 | } dtrace_aggbuffer_t; | 
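
/*
 * Because the aggregation metadata grows down from the buffer's limit, the
 * dtrace_aggbuffer structure itself sits at the very top of the buffer.  A
 * sketch of locating it (a real implementation would also align this
 * address appropriately):
 *
 *	dtrace_aggbuffer_t *
 *	agg_buffer(dtrace_buffer_t *buf)
 *	{
 *		uintptr_t limit = (uintptr_t)buf->dtb_tomax + buf->dtb_size;
 *
 *		return ((dtrace_aggbuffer_t *)(limit -
 *		    sizeof (dtrace_aggbuffer_t)));
 *	}
 */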
| 520 |  | 
| 521 | /* | 
| 522 |  * DTrace Speculations | 
| 523 |  * | 
| 524 |  * Speculations have a per-CPU buffer and a global state.  Once a speculation | 
 * buffer has been committed or discarded, it cannot be reused until all CPUs
| 526 |  * have taken the same action (commit or discard) on their respective | 
| 527 |  * speculative buffer.  However, because DTrace probes may execute in arbitrary | 
| 528 |  * context, other CPUs cannot simply be cross-called at probe firing time to | 
| 529 |  * perform the necessary commit or discard.  The speculation states thus | 
| 530 |  * optimize for the case that a speculative buffer is only active on one CPU at | 
| 531 |  * the time of a commit() or discard() -- for if this is the case, other CPUs | 
| 532 |  * need not take action, and the speculation is immediately available for | 
| 533 |  * reuse.  If the speculation is active on multiple CPUs, it must be | 
| 534 |  * asynchronously cleaned -- potentially leading to a higher rate of dirty | 
| 535 |  * speculative drops.  The speculation states are as follows: | 
| 536 |  * | 
| 537 |  *  DTRACESPEC_INACTIVE       <= Initial state; inactive speculation | 
| 538 |  *  DTRACESPEC_ACTIVE         <= Allocated, but not yet speculatively traced to | 
| 539 |  *  DTRACESPEC_ACTIVEONE      <= Speculatively traced to on one CPU | 
| 540 |  *  DTRACESPEC_ACTIVEMANY     <= Speculatively traced to on more than one CPU | 
 *  DTRACESPEC_COMMITTING     <= Currently being committed on one CPU
 *  DTRACESPEC_COMMITTINGMANY <= Currently being committed on many CPUs
| 543 |  *  DTRACESPEC_DISCARDING     <= Currently being discarded on many CPUs | 
| 544 |  * | 
| 545 |  * The state transition diagram is as follows: | 
| 546 |  * | 
| 547 |  *     +----------------------------------------------------------+ | 
| 548 |  *     |                                                          | | 
| 549 |  *     |                      +------------+                      | | 
| 550 |  *     |  +-------------------| COMMITTING |<-----------------+   | | 
| 551 |  *     |  |                   +------------+                  |   | | 
| 552 |  *     |  | copied spec.            ^             commit() on |   | discard() on | 
| 553 |  *     |  | into principal          |              active CPU |   | active CPU | 
| 554 |  *     |  |                         | commit()                |   | | 
| 555 |  *     V  V                         |                         |   | | 
| 556 |  * +----------+                 +--------+                +-----------+ | 
| 557 |  * | INACTIVE |---------------->| ACTIVE |--------------->| ACTIVEONE | | 
| 558 |  * +----------+  speculation()  +--------+  speculate()   +-----------+ | 
| 559 |  *     ^  ^                         |                         |   | | 
| 560 |  *     |  |                         | discard()               |   | | 
| 561 |  *     |  | asynchronously          |            discard() on |   | speculate() | 
| 562 |  *     |  | cleaned                 V            inactive CPU |   | on inactive | 
| 563 |  *     |  |                   +------------+                  |   | CPU | 
| 564 |  *     |  +-------------------| DISCARDING |<-----------------+   | | 
| 565 |  *     |                      +------------+                      | | 
| 566 |  *     | asynchronously             ^                             | | 
| 567 |  *     | copied spec.               |       discard()             | | 
| 568 |  *     | into principal             +------------------------+    | | 
| 569 |  *     |                                                     |    V | 
| 570 |  *  +----------------+             commit()              +------------+ | 
| 571 |  *  | COMMITTINGMANY |<----------------------------------| ACTIVEMANY | | 
| 572 |  *  +----------------+                                   +------------+ | 
| 573 |  */ | 
| 574 | typedef enum dtrace_speculation_state { | 
| 575 | 	DTRACESPEC_INACTIVE = 0, | 
| 576 | 	DTRACESPEC_ACTIVE, | 
| 577 | 	DTRACESPEC_ACTIVEONE, | 
| 578 | 	DTRACESPEC_ACTIVEMANY, | 
| 579 | 	DTRACESPEC_COMMITTING, | 
| 580 | 	DTRACESPEC_COMMITTINGMANY, | 
| 581 | 	DTRACESPEC_DISCARDING | 
| 582 | } dtrace_speculation_state_t; | 
| 583 |  | 
| 584 | typedef struct dtrace_speculation { | 
| 585 | 	dtrace_speculation_state_t dtsp_state;	/* current speculation state */ | 
| 586 | 	int dtsp_cleaning;			/* non-zero if being cleaned */ | 
| 587 | 	dtrace_buffer_t *dtsp_buffer;		/* speculative buffer */ | 
| 588 | } dtrace_speculation_t; | 
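
/*
 * A sketch of the speculate() transitions from the diagram above:  the state
 * is advanced with a compare-and-swap so that racing CPUs observe a
 * consistent state, retrying if the state changed underneath us.  cas32()
 * is a hypothetical stand-in for the platform primitive, and active_cpu a
 * hypothetical record of the one CPU that has speculatively traced:
 *
 *	do {
 *		current = spec->dtsp_state;
 *
 *		if (current == DTRACESPEC_ACTIVE)
 *			nstate = DTRACESPEC_ACTIVEONE;
 *		else if (current == DTRACESPEC_ACTIVEONE &&
 *		    cpu != active_cpu)
 *			nstate = DTRACESPEC_ACTIVEMANY;
 *		else
 *			break;
 *	} while (cas32(&spec->dtsp_state, current, nstate) != current);
 */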
| 589 |  | 
| 590 | /* | 
| 591 |  * DTrace Dynamic Variables | 
| 592 |  * | 
| 593 |  * The dynamic variable problem is obviously decomposed into two subproblems: | 
| 594 |  * allocating new dynamic storage, and freeing old dynamic storage.  The | 
| 595 |  * presence of the second problem makes the first much more complicated -- or | 
| 596 |  * rather, the absence of the second renders the first trivial.  This is the | 
| 597 |  * case with aggregations, for which there is effectively no deallocation of | 
| 598 |  * dynamic storage.  (Or more accurately, all dynamic storage is deallocated | 
| 599 |  * when a snapshot is taken of the aggregation.)  As DTrace dynamic variables | 
| 600 |  * allow for both dynamic allocation and dynamic deallocation, the | 
| 601 |  * implementation of dynamic variables is quite a bit more complicated than | 
| 602 |  * that of their aggregation kin. | 
| 603 |  * | 
| 604 |  * We observe that allocating new dynamic storage is tricky only because the | 
| 605 |  * size can vary -- the allocation problem is much easier if allocation sizes | 
| 606 |  * are uniform.  We further observe that in D, the size of dynamic variables is | 
| 607 |  * actually _not_ dynamic -- dynamic variable sizes may be determined by static | 
| 608 |  * analysis of DIF text.  (This is true even of putatively dynamically-sized | 
| 609 |  * objects like strings and stacks, the sizes of which are dictated by the | 
| 610 |  * "stringsize" and "stackframes" variables, respectively.)  We exploit this by | 
| 611 |  * performing this analysis on all DIF before enabling any probes.  For each | 
| 612 |  * dynamic load or store, we calculate the dynamically-allocated size plus the | 
| 613 |  * size of the dtrace_dynvar structure plus the storage required to key the | 
| 614 |  * data.  For all DIF, we take the largest value and dub it the _chunksize_. | 
| 615 |  * We then divide dynamic memory into two parts:  a hash table that is wide | 
| 616 |  * enough to have every chunk in its own bucket, and a larger region of equal | 
| 617 |  * chunksize units.  Whenever we wish to dynamically allocate a variable, we | 
| 618 |  * always allocate a single chunk of memory.  Depending on the uniformity of | 
| 619 |  * allocation, this will waste some amount of memory -- but it eliminates the | 
| 620 |  * non-determinism inherent in traditional heap fragmentation. | 
| 621 |  * | 
| 622 |  * Dynamic objects are allocated by storing a non-zero value to them; they are | 
| 623 |  * deallocated by storing a zero value to them.  Dynamic variables are | 
| 624 |  * complicated enormously by being shared between CPUs.  In particular, | 
| 625 |  * consider the following scenario: | 
| 626 |  * | 
| 627 |  *                 CPU A                                 CPU B | 
| 628 |  *  +---------------------------------+   +---------------------------------+ | 
| 629 |  *  |                                 |   |                                 | | 
| 630 |  *  | allocates dynamic object a[123] |   |                                 | | 
| 631 |  *  | by storing the value 345 to it  |   |                                 | | 
| 632 |  *  |                               --------->                              | | 
| 633 |  *  |                                 |   | wishing to load from object     | | 
| 634 |  *  |                                 |   | a[123], performs lookup in      | | 
| 635 |  *  |                                 |   | dynamic variable space          | | 
| 636 |  *  |                               <---------                              | | 
| 637 |  *  | deallocates object a[123] by    |   |                                 | | 
| 638 |  *  | storing 0 to it                 |   |                                 | | 
| 639 |  *  |                                 |   |                                 | | 
| 640 |  *  | allocates dynamic object b[567] |   | performs load from a[123]       | | 
| 641 |  *  | by storing the value 789 to it  |   |                                 | | 
| 642 |  *  :                                 :   :                                 : | 
| 643 |  *  .                                 .   .                                 . | 
| 644 |  * | 
| 645 |  * This is obviously a race in the D program, but there are nonetheless only | 
| 646 |  * two valid values for CPU B's load from a[123]:  345 or 0.  Most importantly, | 
| 647 |  * CPU B may _not_ see the value 789 for a[123]. | 
| 648 |  * | 
| 649 |  * There are essentially two ways to deal with this: | 
| 650 |  * | 
| 651 |  *  (1)  Explicitly spin-lock variables.  That is, if CPU B wishes to load | 
| 652 |  *       from a[123], it needs to lock a[123] and hold the lock for the | 
| 653 |  *       duration that it wishes to manipulate it. | 
| 654 |  * | 
| 655 |  *  (2)  Avoid reusing freed chunks until it is known that no CPU is referring | 
| 656 |  *       to them. | 
| 657 |  * | 
| 658 |  * The implementation of (1) is rife with complexity, because it requires the | 
| 659 |  * user of a dynamic variable to explicitly decree when they are done using it. | 
| 660 |  * Were all variables by value, this perhaps wouldn't be debilitating -- but | 
| 661 |  * dynamic variables of non-scalar types are tracked by reference.  That is, if | 
| 662 |  * a dynamic variable is, say, a string, and that variable is to be traced to, | 
| 663 |  * say, the principal buffer, the DIF emulation code returns to the main | 
| 664 |  * dtrace_probe() loop a pointer to the underlying storage, not the contents of | 
| 665 |  * the storage.  Further, code calling on DIF emulation would have to be aware | 
| 666 |  * that the DIF emulation has returned a reference to a dynamic variable that | 
| 667 |  * has been potentially locked.  The variable would have to be unlocked after | 
| 668 |  * the main dtrace_probe() loop is finished with the variable, and the main | 
 * dtrace_probe() loop would have to be careful not to call any further DIF
| 670 |  * emulation while the variable is locked to avoid deadlock.  More generally, | 
| 671 |  * if one were to implement (1), DIF emulation code dealing with dynamic | 
| 672 |  * variables could only deal with one dynamic variable at a time (lest deadlock | 
| 673 |  * result).  To sum, (1) exports too much subtlety to the users of dynamic | 
| 674 |  * variables -- increasing maintenance burden and imposing serious constraints | 
| 675 |  * on future DTrace development. | 
| 676 |  * | 
| 677 |  * The implementation of (2) is also complex, but the complexity is more | 
| 678 |  * manageable.  We need to be sure that when a variable is deallocated, it is | 
| 679 |  * not placed on a traditional free list, but rather on a _dirty_ list.  Once a | 
| 680 |  * variable is on a dirty list, it cannot be found by CPUs performing a | 
| 681 |  * subsequent lookup of the variable -- but it may still be in use by other | 
| 682 |  * CPUs.  To assure that all CPUs that may be seeing the old variable have | 
| 683 |  * cleared out of probe context, a dtrace_sync() can be issued.  Once the | 
| 684 |  * dtrace_sync() has completed, it can be known that all CPUs are done | 
| 685 |  * manipulating the dynamic variable -- the dirty list can be atomically | 
| 686 |  * appended to the free list.  Unfortunately, there's a slight hiccup in this | 
| 687 |  * mechanism:  dtrace_sync() may not be issued from probe context.  The | 
| 688 |  * dtrace_sync() must be therefore issued asynchronously from non-probe | 
| 689 |  * context.  For this we rely on the DTrace cleaner, a cyclic that runs at the | 
| 690 |  * "cleanrate" frequency.  To ease this implementation, we define several chunk | 
| 691 |  * lists: | 
| 692 |  * | 
| 693 |  *   - Dirty.  Deallocated chunks, not yet cleaned.  Not available. | 
| 694 |  * | 
| 695 |  *   - Rinsing.  Formerly dirty chunks that are currently being asynchronously | 
| 696 |  *     cleaned.  Not available, but will be shortly.  Dynamic variable | 
| 697 |  *     allocation may not spin or block for availability, however. | 
| 698 |  * | 
| 699 |  *   - Clean.  Clean chunks, ready for allocation -- but not on the free list. | 
| 700 |  * | 
| 701 |  *   - Free.  Available for allocation. | 
| 702 |  * | 
| 703 |  * Moreover, to avoid absurd contention, _each_ of these lists is implemented | 
| 704 |  * on a per-CPU basis.  This is only for performance, not correctness; chunks | 
| 705 |  * may be allocated from another CPU's free list.  The algorithm for allocation | 
| 706 |  * then is this: | 
| 707 |  * | 
| 708 |  *   (1)  Attempt to atomically allocate from current CPU's free list.  If list | 
| 709 |  *        is non-empty and allocation is successful, allocation is complete. | 
| 710 |  * | 
| 711 |  *   (2)  If the clean list is non-empty, atomically move it to the free list, | 
| 712 |  *        and reattempt (1). | 
| 713 |  * | 
| 714 |  *   (3)  If the dynamic variable space is in the CLEAN state, look for free | 
| 715 |  *        and clean lists on other CPUs by setting the current CPU to the next | 
| 716 |  *        CPU, and reattempting (1).  If the next CPU is the current CPU (that | 
| 717 |  *        is, if all CPUs have been checked), atomically switch the state of | 
| 718 |  *        the dynamic variable space based on the following: | 
| 719 |  * | 
| 720 |  *        - If no free chunks were found and no dirty chunks were found, | 
| 721 |  *          atomically set the state to EMPTY. | 
| 722 |  * | 
| 723 |  *        - If dirty chunks were found, atomically set the state to DIRTY. | 
| 724 |  * | 
| 725 |  *        - If rinsing chunks were found, atomically set the state to RINSING. | 
| 726 |  * | 
 *   (4)  Based on the state of the dynamic variable space, increment the
 *        appropriate counter to indicate dynamic drops (if in EMPTY state)
 *        vs. dynamic dirty drops (if in DIRTY state) vs. dynamic rinsing
 *        drops (if in RINSING state).  Fail the allocation.
| 731 |  * | 
| 732 |  * The cleaning cyclic operates with the following algorithm:  for all CPUs | 
| 733 |  * with a non-empty dirty list, atomically move the dirty list to the rinsing | 
| 734 |  * list.  Perform a dtrace_sync().  For all CPUs with a non-empty rinsing list, | 
| 735 |  * atomically move the rinsing list to the clean list.  Perform another | 
| 736 |  * dtrace_sync().  By this point, all CPUs have seen the new clean list; the | 
| 737 |  * state of the dynamic variable space can be restored to CLEAN. | 
| 738 |  * | 
| 739 |  * There exist two final races that merit explanation.  The first is a simple | 
| 740 |  * allocation race: | 
| 741 |  * | 
| 742 |  *                 CPU A                                 CPU B | 
| 743 |  *  +---------------------------------+   +---------------------------------+ | 
| 744 |  *  |                                 |   |                                 | | 
| 745 |  *  | allocates dynamic object a[123] |   | allocates dynamic object a[123] | | 
| 746 |  *  | by storing the value 345 to it  |   | by storing the value 567 to it  | | 
| 747 |  *  |                                 |   |                                 | | 
| 748 |  *  :                                 :   :                                 : | 
| 749 |  *  .                                 .   .                                 . | 
| 750 |  * | 
| 751 |  * Again, this is a race in the D program.  It can be resolved by having a[123] | 
| 752 |  * hold the value 345 or a[123] hold the value 567 -- but it must be true that | 
| 753 |  * a[123] have only _one_ of these values.  (That is, the racing CPUs may not | 
| 754 |  * put the same element twice on the same hash chain.)  This is resolved | 
| 755 |  * simply:  before the allocation is undertaken, the start of the new chunk's | 
| 756 |  * hash chain is noted.  Later, after the allocation is complete, the hash | 
| 757 |  * chain is atomically switched to point to the new element.  If this fails | 
| 758 |  * (because of either concurrent allocations or an allocation concurrent with a | 
| 759 |  * deletion), the newly allocated chunk is deallocated to the dirty list, and | 
| 760 |  * the whole process of looking up (and potentially allocating) the dynamic | 
| 761 |  * variable is reattempted. | 
| 762 |  * | 
| 763 |  * The final race is a simple deallocation race: | 
| 764 |  * | 
| 765 |  *                 CPU A                                 CPU B | 
| 766 |  *  +---------------------------------+   +---------------------------------+ | 
| 767 |  *  |                                 |   |                                 | | 
| 768 |  *  | deallocates dynamic object      |   | deallocates dynamic object      | | 
| 769 |  *  | a[123] by storing the value 0   |   | a[123] by storing the value 0   | | 
| 770 |  *  | to it                           |   | to it                           | | 
| 771 |  *  |                                 |   |                                 | | 
| 772 |  *  :                                 :   :                                 : | 
| 773 |  *  .                                 .   .                                 . | 
| 774 |  * | 
| 775 |  * Once again, this is a race in the D program, but it is one that we must | 
| 776 |  * handle without corrupting the underlying data structures.  Because | 
| 777 |  * deallocations require the deletion of a chunk from the middle of a hash | 
| 778 |  * chain, we cannot use a single-word atomic operation to remove it.  For this, | 
| 779 |  * we add a spin lock to the hash buckets that is _only_ used for deallocations | 
| 780 |  * (allocation races are handled as above).  Further, this spin lock is _only_ | 
| 781 |  * held for the duration of the delete; before control is returned to the DIF | 
| 782 |  * emulation code, the hash bucket is unlocked. | 
| 783 |  */ | 
| 784 | typedef struct dtrace_key { | 
| 785 | 	uint64_t dttk_value;			/* data value or data pointer */ | 
| 786 | 	uint64_t dttk_size;			/* 0 if by-val, >0 if by-ref */ | 
| 787 | } dtrace_key_t; | 
| 788 |  | 
| 789 | typedef struct dtrace_tuple { | 
| 790 | 	uint32_t dtt_nkeys;			/* number of keys in tuple */ | 
| 791 | 	uint32_t dtt_pad;			/* padding */ | 
| 792 | 	dtrace_key_t dtt_key[1];		/* array of tuple keys */ | 
| 793 | } dtrace_tuple_t; | 
| 794 |  | 
| 795 | typedef struct dtrace_dynvar { | 
| 796 | 	uint64_t dtdv_hashval;			/* hash value -- 0 if free */ | 
| 797 | 	struct dtrace_dynvar *dtdv_next;	/* next on list or hash chain */ | 
| 798 | 	void *dtdv_data;			/* pointer to data */ | 
| 799 | 	dtrace_tuple_t dtdv_tuple;		/* tuple key */ | 
| 800 | } dtrace_dynvar_t; | 
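
/*
 * The chunksize derivation described above follows directly from these
 * structures:  a chunk must hold the dtrace_dynvar structure, the tuple
 * keys beyond the first (dtt_key is declared with one element), any
 * by-reference key data, and the variable's data itself.  A sketch, with
 * rounding up to the chunk alignment elided:
 *
 *	size_t
 *	chunk_size(uint32_t nkeys, size_t keydata, size_t dsize)
 *	{
 *		return (sizeof (dtrace_dynvar_t) +
 *		    (nkeys - 1) * sizeof (dtrace_key_t) + keydata + dsize);
 *	}
 */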
| 801 |  | 
| 802 | typedef enum dtrace_dynvar_op { | 
| 803 | 	DTRACE_DYNVAR_ALLOC, | 
| 804 | 	DTRACE_DYNVAR_NOALLOC, | 
| 805 | 	DTRACE_DYNVAR_DEALLOC | 
| 806 | } dtrace_dynvar_op_t; | 
| 807 |  | 
| 808 | typedef struct dtrace_dynhash { | 
| 809 | 	dtrace_dynvar_t *dtdh_chain;		/* hash chain for this bucket */ | 
| 810 | 	uintptr_t dtdh_lock;			/* deallocation lock */ | 
| 811 | #ifdef _LP64 | 
| 812 | 	uintptr_t dtdh_pad[6];			/* pad to avoid false sharing */ | 
| 813 | #else | 
| 814 | 	uintptr_t dtdh_pad[14];			/* pad to avoid false sharing */ | 
| 815 | #endif | 
| 816 | } dtrace_dynhash_t; | 
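
/*
 * dtdh_lock is the per-bucket deallocation lock described above; lookups
 * and allocations never take it.  A sketch of a delete, where casptr() and
 * unlink_chunk() are hypothetical stand-ins for the atomic compare-and-swap
 * primitive and the chain manipulation, respectively:
 *
 *	while (casptr(&bucket->dtdh_lock, 0, 1) != 0)
 *		continue;
 *
 *	unlink_chunk(bucket, dvar);
 *	bucket->dtdh_lock = 0;
 */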
| 817 |  | 
| 818 | typedef struct dtrace_dstate_percpu { | 
| 819 | 	dtrace_dynvar_t *dtdsc_free;		/* free list for this CPU */ | 
| 820 | 	dtrace_dynvar_t *dtdsc_dirty;		/* dirty list for this CPU */ | 
| 821 | 	dtrace_dynvar_t *dtdsc_rinsing;		/* rinsing list for this CPU */ | 
| 822 | 	dtrace_dynvar_t *dtdsc_clean;		/* clean list for this CPU */ | 
| 823 | 	uint64_t dtdsc_drops;			/* number of capacity drops */ | 
| 824 | 	uint64_t dtdsc_dirty_drops;		/* number of dirty drops */ | 
| 825 | 	uint64_t dtdsc_rinsing_drops;		/* number of rinsing drops */ | 
| 826 | } dtrace_dstate_percpu_t; | 
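
/*
 * A sketch of steps (1) and (2) of the allocation algorithm above, operating
 * on this per-CPU state; casptr() is again a hypothetical compare-and-swap
 * stand-in, and the cross-CPU search of step (3) is elided:
 *
 *	for (;;) {
 *		if ((dvar = dcpu->dtdsc_free) != NULL) {
 *			if (casptr(&dcpu->dtdsc_free, dvar,
 *			    dvar->dtdv_next) == dvar)
 *				break;
 *			continue;
 *		}
 *
 *		if ((clean = dcpu->dtdsc_clean) == NULL)
 *			break;
 *
 *		if (casptr(&dcpu->dtdsc_clean, clean, NULL) == clean)
 *			dcpu->dtdsc_free = clean;
 *	}
 */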
| 827 |  | 
| 828 | typedef enum dtrace_dstate_state { | 
| 829 | 	DTRACE_DSTATE_CLEAN = 0, | 
| 830 | 	DTRACE_DSTATE_EMPTY, | 
| 831 | 	DTRACE_DSTATE_DIRTY, | 
| 832 | 	DTRACE_DSTATE_RINSING | 
| 833 | } dtrace_dstate_state_t; | 
| 834 |  | 
| 835 | typedef struct dtrace_dstate { | 
| 836 | 	void *dtds_base;			/* base of dynamic var. space */ | 
| 837 | 	size_t dtds_size;			/* size of dynamic var. space */ | 
| 838 | 	size_t dtds_hashsize;			/* number of buckets in hash */ | 
| 839 | 	size_t dtds_chunksize;			/* size of each chunk */ | 
| 840 | 	dtrace_dynhash_t *dtds_hash;		/* pointer to hash table */ | 
| 841 | 	dtrace_dstate_state_t dtds_state;	/* current dynamic var. state */ | 
| 842 | 	dtrace_dstate_percpu_t *__zpercpu dtds_percpu;	/* per-CPU dyn. var. state */ | 
| 843 | } dtrace_dstate_t; | 
| 844 |  | 
| 845 | /* | 
| 846 |  * DTrace Variable State | 
| 847 |  * | 
| 848 |  * The DTrace variable state tracks user-defined variables in its dtrace_vstate | 
| 849 |  * structure.  Each DTrace consumer has exactly one dtrace_vstate structure, | 
| 850 |  * but some dtrace_vstate structures may exist without a corresponding DTrace | 
| 851 |  * consumer (see "DTrace Helpers", below).  As described in <sys/dtrace.h>, | 
| 852 |  * user-defined variables can have one of three scopes: | 
| 853 |  * | 
| 854 |  *  DIFV_SCOPE_GLOBAL  =>  global scope | 
| 855 |  *  DIFV_SCOPE_THREAD  =>  thread-local scope (i.e. "self->" variables) | 
| 856 |  *  DIFV_SCOPE_LOCAL   =>  clause-local scope (i.e. "this->" variables) | 
| 857 |  * | 
| 858 |  * The variable state tracks variables by both their scope and their allocation | 
| 859 |  * type: | 
| 860 |  * | 
| 861 |  *  - The dtvs_globals and dtvs_locals members each point to an array of | 
| 862 |  *    dtrace_statvar structures.  These structures contain both the variable | 
| 863 |  *    metadata (dtrace_difv structures) and the underlying storage for all | 
| 864 |  *    statically allocated variables, including statically allocated | 
| 865 |  *    DIFV_SCOPE_GLOBAL variables and all DIFV_SCOPE_LOCAL variables. | 
| 866 |  * | 
| 867 |  *  - The dtvs_tlocals member points to an array of dtrace_difv structures for | 
| 868 |  *    DIFV_SCOPE_THREAD variables.  As such, this array tracks _only_ the | 
| 869 |  *    variable metadata for DIFV_SCOPE_THREAD variables; the underlying storage | 
| 870 |  *    is allocated out of the dynamic variable space. | 
| 871 |  * | 
| 872 |  *  - The dtvs_dynvars member is the dynamic variable state associated with the | 
| 873 |  *    variable state.  The dynamic variable state (described in "DTrace Dynamic | 
| 874 |  *    Variables", above) tracks all DIFV_SCOPE_THREAD variables and all | 
| 875 |  *    dynamically-allocated DIFV_SCOPE_GLOBAL variables. | 
| 876 |  */ | 
| 877 | typedef struct dtrace_statvar { | 
| 878 | 	uint64_t dtsv_data;			/* data or pointer to it */ | 
| 879 | 	size_t dtsv_size;			/* size of pointed-to data */ | 
| 880 | 	int dtsv_refcnt;			/* reference count */ | 
| 881 | 	dtrace_difv_t dtsv_var;			/* variable metadata */ | 
| 882 | } dtrace_statvar_t; | 
| 883 |  | 
| 884 | typedef struct dtrace_vstate { | 
| 885 | 	dtrace_state_t *dtvs_state;		/* back pointer to state */ | 
| 886 | 	dtrace_statvar_t **dtvs_globals;	/* statically-allocated glbls */ | 
| 887 | 	int dtvs_nglobals;			/* number of globals */ | 
| 888 | 	dtrace_difv_t *dtvs_tlocals;		/* thread-local metadata */ | 
| 889 | 	int dtvs_ntlocals;			/* number of thread-locals */ | 
| 890 | 	dtrace_statvar_t **dtvs_locals;		/* clause-local data */ | 
| 891 | 	int dtvs_nlocals;			/* number of clause-locals */ | 
| 892 | 	dtrace_dstate_t dtvs_dynvars;		/* dynamic variable state */ | 
| 893 | } dtrace_vstate_t; | 
| 894 |  | 
| 895 | /* | 
| 896 |  * DTrace Machine State | 
| 897 |  * | 
| 898 |  * In the process of processing a fired probe, DTrace needs to track and/or | 
| 899 |  * cache some per-CPU state associated with that particular firing.  This is | 
| 900 |  * state that is always discarded after the probe firing has completed, and | 
| 901 |  * much of it is not specific to any DTrace consumer, remaining valid across | 
| 902 |  * all ECBs.  This state is tracked in the dtrace_mstate structure. | 
| 903 |  */ | 
| 904 | #define	DTRACE_MSTATE_ARGS		0x00000001 | 
| 905 | #define	DTRACE_MSTATE_PROBE		0x00000002 | 
| 906 | #define	DTRACE_MSTATE_EPID		0x00000004 | 
| 907 | #define	DTRACE_MSTATE_TIMESTAMP		0x00000008 | 
| 908 | #define	DTRACE_MSTATE_STACKDEPTH	0x00000010 | 
| 909 | #define	DTRACE_MSTATE_CALLER		0x00000020 | 
| 910 | #define	DTRACE_MSTATE_IPL		0x00000040 | 
| 911 | #define	DTRACE_MSTATE_FLTOFFS		0x00000080 | 
| 912 | #define	DTRACE_MSTATE_WALLTIMESTAMP	0x00000100 | 
| 913 | #define	DTRACE_MSTATE_USTACKDEPTH	0x00000200 | 
| 914 | #define	DTRACE_MSTATE_UCALLER		0x00000400 | 
| 915 | #define	DTRACE_MSTATE_MACHTIMESTAMP	0x00000800 | 
| 916 | #define	DTRACE_MSTATE_MACHCTIMESTAMP	0x00001000 | 
| 917 |  | 
| 918 | typedef struct dtrace_mstate { | 
| 919 | 	uintptr_t dtms_scratch_base;		/* base of scratch space */ | 
| 920 | 	uintptr_t dtms_scratch_ptr;		/* current scratch pointer */ | 
| 921 | 	size_t dtms_scratch_size;		/* scratch size */ | 
| 922 | 	uint32_t dtms_present;			/* variables that are present */ | 
| 923 | 	uint64_t dtms_arg[5];			/* cached arguments */ | 
| 924 | 	dtrace_epid_t dtms_epid;		/* current EPID */ | 
| 925 | 	uint64_t dtms_timestamp;		/* cached timestamp */ | 
| 926 | 	hrtime_t dtms_walltimestamp;		/* cached wall timestamp */ | 
| 927 | 	uint64_t dtms_machtimestamp;		/* cached mach absolute timestamp */ | 
| 928 | 	uint64_t dtms_machctimestamp;		/* cached mach continuous timestamp */ | 
| 929 | 	int dtms_stackdepth;			/* cached stackdepth */ | 
| 930 | 	int dtms_ustackdepth;			/* cached ustackdepth */ | 
| 931 | 	struct dtrace_probe *dtms_probe;	/* current probe */ | 
| 932 | 	uintptr_t dtms_caller;			/* cached caller */ | 
| 933 | 	uint64_t dtms_ucaller;			/* cached user-level caller */ | 
| 934 | 	int dtms_ipl;				/* cached interrupt pri lev */ | 
| 935 | 	int dtms_fltoffs;			/* faulting DIFO offset */ | 
| 936 | 	uintptr_t dtms_strtok;			/* saved strtok() pointer */ | 
| 937 | 	uintptr_t dtms_strtok_limit;		/* upper bound of strtok ptr */ | 
| 938 | 	uint32_t dtms_access;			/* memory access rights */ | 
| 939 | 	dtrace_difo_t *dtms_difo;		/* current dif object */ | 
| 940 | } dtrace_mstate_t; | 
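
/*
 * Illustrative sketch: the dtms_present bits let expensive values be
 * computed at most once per probe firing.  The accessor below is a
 * hypothetical example of the caching pattern (dtrace_gethrtime() is the
 * real interface; the helper itself is invented for illustration):
 *
 *	static uint64_t
 *	mstate_timestamp(dtrace_mstate_t *mstate)
 *	{
 *		if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
 *			mstate->dtms_timestamp = dtrace_gethrtime();
 *			mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
 *		}
 *		return (mstate->dtms_timestamp);
 *	}
 */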
| 941 |  | 
| 942 | #define	DTRACE_COND_OWNER	0x1 | 
| 943 | #define	DTRACE_COND_USERMODE	0x2 | 
| 944 | #define	DTRACE_COND_ZONEOWNER	0x4 | 
| 945 |  | 
| 946 | #define	DTRACE_PROBEKEY_MAXDEPTH	8	/* max glob recursion depth */ | 
| 947 |  | 
| 948 | /* | 
| 949 |  * Access flag used by dtrace_mstate.dtms_access. | 
| 950 |  */ | 
| 951 | #define	DTRACE_ACCESS_KERNEL	0x1		/* the priv to read kmem */ | 
| 952 |  | 
| 953 |  | 
| 954 | /* | 
| 955 |  * DTrace Activity | 
| 956 |  * | 
| 957 |  * Each DTrace consumer is in one of several states, which (for purposes of | 
 * avoiding yet another overloading of the noun "state") we call the current
 * _activity_.  The activity transitions on dtrace_go() (from DTRACIOCGO), on
 * dtrace_stop() (from DTRACIOCSTOP) and on the exit() action.  Activities may
 * only transition in one direction; the activity transition diagram is a
 * directed acyclic graph, as follows:
| 963 |  * | 
| 964 |  * | 
| 965 |  * | 
| 966 |  * +----------+                   +--------+                   +--------+ | 
| 967 |  * | INACTIVE |------------------>| WARMUP |------------------>| ACTIVE | | 
| 968 |  * +----------+   dtrace_go(),    +--------+   dtrace_go(),    +--------+ | 
| 969 |  *                before BEGIN        |        after BEGIN       |  |  | | 
| 970 |  *                                    |                          |  |  | | 
| 971 |  *                      exit() action |                          |  |  | | 
| 972 |  *                     from BEGIN ECB |                          |  |  | | 
| 973 |  *                                    |                          |  |  | | 
| 974 |  *                                    v                          |  |  | | 
| 975 |  *                               +----------+     exit() action  |  |  | | 
| 976 |  * +-----------------------------| DRAINING |<-------------------+  |  | | 
| 977 |  * |                             +----------+                       |  | | 
| 978 |  * |                                  |                             |  | | 
| 979 |  * |                   dtrace_stop(), |                             |  | | 
| 980 |  * |                     before END   |                             |  | | 
| 981 |  * |                                  |                             |  | | 
| 982 |  * |                                  v                             |  | | 
| 983 |  * | +---------+                 +----------+                       |  | | 
| 984 |  * | | STOPPED |<----------------| COOLDOWN |<----------------------+  | | 
| 985 |  * | +---------+  dtrace_stop(), +----------+     dtrace_stop(),       | | 
| 986 |  * |                after END                       before END         | | 
| 987 |  * |                                                                   | | 
| 988 |  * |                              +--------+                           | | 
| 989 |  * +----------------------------->| KILLED |<--------------------------+ | 
| 990 |  *       deadman timeout or       +--------+     deadman timeout or | 
| 991 |  *        killed consumer                         killed consumer | 
| 992 |  * | 
| 993 |  * Note that once a DTrace consumer has stopped tracing, there is no way to | 
| 994 |  * restart it; if a DTrace consumer wishes to restart tracing, it must reopen | 
| 995 |  * the DTrace pseudodevice. | 
| 996 |  */ | 
| 997 | typedef enum dtrace_activity { | 
| 998 | 	DTRACE_ACTIVITY_INACTIVE = 0,		/* not yet running */ | 
| 999 | 	DTRACE_ACTIVITY_WARMUP,			/* while starting */ | 
| 1000 | 	DTRACE_ACTIVITY_ACTIVE,			/* running */ | 
| 1001 | 	DTRACE_ACTIVITY_DRAINING,		/* before stopping */ | 
| 1002 | 	DTRACE_ACTIVITY_COOLDOWN,		/* while stopping */ | 
| 1003 | 	DTRACE_ACTIVITY_STOPPED,		/* after stopping */ | 
| 1004 | 	DTRACE_ACTIVITY_KILLED			/* killed */ | 
| 1005 | } dtrace_activity_t; | 
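
/*
 * Illustrative sketch: because the diagram above is a DAG and the enum is
 * declared in diagram order, any legal transition moves strictly forward
 * through the enumeration (with KILLED reachable from several states).
 * A forward-motion check is therefore a necessary (though not sufficient)
 * condition for a legal transition; the helper below is hypothetical:
 *
 *	static int
 *	activity_moves_forward(dtrace_activity_t from, dtrace_activity_t to)
 *	{
 *		return (to > from);
 *	}
 */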
| 1006 |  | 
| 1007 |  | 
| 1008 | /* | 
| 1009 |  * APPLE NOTE:  DTrace dof modes implementation | 
| 1010 |  * | 
| 1011 |  * DTrace has four "dof modes". They are: | 
| 1012 |  * | 
 * DTRACE_DOF_MODE_NEVER	Never load any dof, period.
 * DTRACE_DOF_MODE_LAZY_ON	Defer loading dof until later.
 * DTRACE_DOF_MODE_LAZY_OFF	Load all deferred dof now, and any new dof.
 * DTRACE_DOF_MODE_NON_LAZY	Load all dof immediately.
| 1017 |  * | 
| 1018 |  * It is legal to transition between the two lazy modes. The NEVER and | 
| 1019 |  * NON_LAZY modes are permanent, and must not change once set. | 
| 1020 |  * | 
| 1021 |  * The current dof mode is kept in dtrace_dof_mode, which is protected by the | 
 * dtrace_dof_mode_lock.  This is an RW lock; reads require shared access, and
 * writes require exclusive access.  Because NEVER and NON_LAZY are permanent
 * states, it is legal to test for those modes without holding the dof mode
 * lock.
| 1025 |  * | 
| 1026 |  * Lock ordering is dof mode lock before any dtrace lock, and before the | 
| 1027 |  * process p_dtrace_sprlock. In general, other locks should not be held when | 
| 1028 |  * taking the dof mode lock. Acquiring the dof mode lock in exclusive mode | 
| 1029 |  * will block process fork, exec, and exit, so it should be held exclusive | 
| 1030 |  * for as short a time as possible. | 
| 1031 |  */ | 
| 1032 |  | 
| 1033 | #define DTRACE_DOF_MODE_NEVER 		0 | 
| 1034 | #define DTRACE_DOF_MODE_LAZY_ON		1 | 
| 1035 | #define DTRACE_DOF_MODE_LAZY_OFF	2 | 
| 1036 | #define DTRACE_DOF_MODE_NON_LAZY	3 | 
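
/*
 * Illustrative sketch of the locking rule above: the permanent modes may
 * be tested lock-free, while distinguishing the lazy modes requires the RW
 * lock held at least shared.  The extern declarations are assumptions for
 * illustration; the actual definitions live in the implementation.
 *
 *	extern int dtrace_dof_mode;
 *	extern lck_rw_t dtrace_dof_mode_lock;
 *
 *	static int
 *	dof_mode_is_lazy_on(void)
 *	{
 *		int lazy;
 *
 *		if (dtrace_dof_mode == DTRACE_DOF_MODE_NEVER ||
 *		    dtrace_dof_mode == DTRACE_DOF_MODE_NON_LAZY)
 *			return (0);	// permanent mode: no lock needed
 *
 *		lck_rw_lock_shared(&dtrace_dof_mode_lock);
 *		lazy = (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON);
 *		lck_rw_unlock_shared(&dtrace_dof_mode_lock);
 *		return (lazy);
 *	}
 */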
| 1037 |  | 
| 1038 | /* | 
 * DTrace kernel symbol modes control when the kernel may dispose of symbol
 * information used by the fbt/sdt providers. The kernel itself, as well as
| 1041 |  * every kext, has symbol table/nlist info that has historically been preserved | 
| 1042 |  * for dtrace's use. This allowed dtrace to be lazy about allocating fbt/sdt probes, | 
| 1043 |  * at the expense of keeping the symbol info in the kernel permanently. | 
| 1044 |  * | 
 * Starting in 10.7+, fbt probes may be created from userspace, in the same
 * fashion as pid probes. The kernel gives dtrace "first right of refusal"
| 1047 |  * whenever symbol data becomes available (such as a kext load). If dtrace is | 
| 1048 |  * active, it will immediately read/copy the needed data, and then the kernel | 
| 1049 |  * may free it. If dtrace is not active, it returns immediately, having done | 
| 1050 |  * no work or allocations, and the symbol data is freed. Should dtrace need | 
| 1051 |  * this data later, it is expected that the userspace client will push the | 
| 1052 |  * data into the kernel via ioctl calls. | 
| 1053 |  * | 
| 1054 |  * The kernel symbol modes are used to control what dtrace does with symbol data: | 
| 1055 |  * | 
| 1056 |  * DTRACE_KERNEL_SYMBOLS_NEVER			Effectively disables fbt/sdt | 
| 1057 |  * DTRACE_KERNEL_SYMBOLS_FROM_KERNEL		Immediately read/copy symbol data | 
| 1058 |  * DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE		Wait for symbols from userspace | 
| 1059 |  * DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL	Immediately read/copy symbol data | 
| 1060 |  * | 
 * It is legal to transition between DTRACE_KERNEL_SYMBOLS_FROM_KERNEL and
 * DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE. DTRACE_KERNEL_SYMBOLS_NEVER and
 * DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL are permanent modes, intended to
 * disable fbt probes entirely, or to prevent any symbols from being loaded
 * from userspace.
 *
| 1067 |  * The kernel symbol mode is kept in dtrace_kernel_symbol_mode, which is protected | 
| 1068 |  * by the dtrace_lock. | 
| 1069 |  */ | 
| 1070 |  | 
| 1071 | #define DTRACE_KERNEL_SYMBOLS_NEVER 			0 | 
| 1072 | #define DTRACE_KERNEL_SYMBOLS_FROM_KERNEL		1 | 
| 1073 | #define DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE		2 | 
| 1074 | #define DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL	3 | 
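
/*
 * Illustrative sketch of the transition rule above (hypothetical helper;
 * the actual enforcement may be open-coded where the mode is set):
 *
 *	static int
 *	symbol_mode_transition_ok(int from, int to)
 *	{
 *		// Only the two non-permanent modes may be exchanged.
 *		return ((from == DTRACE_KERNEL_SYMBOLS_FROM_KERNEL ||
 *		    from == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) &&
 *		    (to == DTRACE_KERNEL_SYMBOLS_FROM_KERNEL ||
 *		    to == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE));
 *	}
 */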
| 1076 |  | 
| 1077 | /* | 
| 1078 |  * DTrace Helper Implementation | 
| 1079 |  * | 
| 1080 |  * A description of the helper architecture may be found in <sys/dtrace.h>. | 
| 1081 |  * Each process contains a pointer to its helpers in its p_dtrace_helpers | 
| 1082 |  * member.  This is a pointer to a dtrace_helpers structure, which contains an | 
| 1083 |  * array of pointers to dtrace_helper structures, helper variable state (shared | 
| 1084 |  * among a process's helpers) and a generation count.  (The generation count is | 
| 1085 |  * used to provide an identifier when a helper is added so that it may be | 
| 1086 |  * subsequently removed.)  The dtrace_helper structure is self-explanatory, | 
| 1087 |  * containing pointers to the objects needed to execute the helper.  Note that | 
| 1088 |  * helpers are _duplicated_ across fork(2), and destroyed on exec(2).  No more | 
| 1089 |  * than dtrace_helpers_max are allowed per-process. | 
| 1090 |  */ | 
| 1091 | #define	DTRACE_HELPER_ACTION_USTACK	0 | 
| 1092 | #define	DTRACE_NHELPER_ACTIONS		1 | 
| 1093 |  | 
| 1094 | typedef struct dtrace_helper_action { | 
| 1095 | 	int dtha_generation;			/* helper action generation */ | 
| 1096 | 	int dtha_nactions;			/* number of actions */ | 
| 1097 | 	dtrace_difo_t *dtha_predicate;		/* helper action predicate */ | 
| 1098 | 	dtrace_difo_t **dtha_actions;		/* array of actions */ | 
| 1099 | 	struct dtrace_helper_action *dtha_next;	/* next helper action */ | 
| 1100 | } dtrace_helper_action_t; | 
| 1101 |  | 
| 1102 | typedef struct dtrace_helper_provider { | 
| 1103 | 	int dthp_generation;			/* helper provider generation */ | 
| 1104 | 	uint32_t dthp_ref;			/* reference count */ | 
| 1105 | 	dof_helper_t dthp_prov;			/* DOF w/ provider and probes */ | 
| 1106 | } dtrace_helper_provider_t; | 
| 1107 |  | 
| 1108 | typedef struct dtrace_helpers { | 
| 1109 | 	dtrace_helper_action_t **dthps_actions;	/* array of helper actions */ | 
| 1110 | 	dtrace_vstate_t dthps_vstate;		/* helper action var. state */ | 
| 1111 | 	dtrace_helper_provider_t **dthps_provs;	/* array of providers */ | 
| 1112 | 	uint_t dthps_nprovs;			/* count of providers */ | 
| 1113 | 	uint_t dthps_maxprovs;			/* provider array size */ | 
| 1114 | 	int dthps_generation;			/* current generation */ | 
| 1115 | 	pid_t dthps_pid;			/* pid of associated proc */ | 
| 1116 | 	int dthps_deferred;			/* helper in deferred list */ | 
| 1117 | 	struct dtrace_helpers *dthps_next;	/* next pointer */ | 
| 1118 | 	struct dtrace_helpers *dthps_prev;	/* prev pointer */ | 
| 1119 | } dtrace_helpers_t; | 
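
/*
 * Illustrative sketch: the generation count is the handle by which a
 * previously-added helper action is later removed.  A simplified unlink of
 * the first action matching a generation (locking and provider cleanup,
 * which the real code must also perform, are omitted):
 *
 *	static int
 *	helper_remove_gen(dtrace_helper_action_t **listp, int gen)
 *	{
 *		dtrace_helper_action_t *h, **prevp = listp;
 *
 *		for (h = *listp; h != NULL; h = h->dtha_next) {
 *			if (h->dtha_generation == gen) {
 *				*prevp = h->dtha_next;	// unlink it
 *				return (0);
 *			}
 *			prevp = &h->dtha_next;
 *		}
 *		return (ESRCH);	// no action with that generation
 *	}
 */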
| 1120 |  | 
| 1121 | /* | 
| 1122 |  * DTrace Helper Action Tracing | 
| 1123 |  * | 
| 1124 |  * Debugging helper actions can be arduous.  To ease the development and | 
| 1125 |  * debugging of helpers, DTrace contains a tracing-framework-within-a-tracing- | 
| 1126 |  * framework: helper tracing.  If dtrace_helptrace_enabled is non-zero (which | 
| 1127 |  * it is by default on DEBUG kernels), all helper activity will be traced to a | 
| 1128 |  * global, in-kernel ring buffer.  Each entry includes a pointer to the specific | 
| 1129 |  * helper, the location within the helper, and a trace of all local variables. | 
| 1130 |  * The ring buffer may be displayed in a human-readable format with the | 
| 1131 |  * ::dtrace_helptrace mdb(1) dcmd. | 
| 1132 |  */ | 
| 1133 | #define	DTRACE_HELPTRACE_NEXT	(-1) | 
| 1134 | #define	DTRACE_HELPTRACE_DONE	(-2) | 
| 1135 | #define	DTRACE_HELPTRACE_ERR	(-3) | 
| 1136 |  | 
| 1137 |  | 
| 1138 | typedef struct dtrace_helptrace { | 
| 1139 | 	dtrace_helper_action_t	*dtht_helper;	/* helper action */ | 
| 1140 | 	int dtht_where;				/* where in helper action */ | 
| 1141 | 	int dtht_nlocals;			/* number of locals */ | 
| 1142 | 	int dtht_fault;				/* type of fault (if any) */ | 
| 1143 | 	int dtht_fltoffs;			/* DIF offset */ | 
| 1144 | 	uint64_t dtht_illval;			/* faulting value */ | 
| 1145 | 	uint64_t dtht_locals[1];		/* local variables */ | 
| 1146 | } dtrace_helptrace_t; | 
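
/*
 * Because dtht_locals is a one-element array used as a variable-length
 * trailer, each ring buffer entry must be sized for all of the helper's
 * locals.  An illustrative size computation (the real buffer-advance
 * logic must additionally handle wrapping):
 *
 *	size_t size = sizeof (dtrace_helptrace_t) +
 *	    (nlocals - 1) * sizeof (uint64_t);
 */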
| 1147 |  | 
| 1148 | /* | 
| 1149 |  * DTrace Credentials | 
| 1150 |  * | 
| 1151 |  * In probe context, we have limited flexibility to examine the credentials | 
| 1152 |  * of the DTrace consumer that created a particular enabling.  We use | 
| 1153 |  * the Least Privilege interfaces to cache the consumer's cred pointer and | 
| 1154 |  * some facts about that credential in a dtrace_cred_t structure. These | 
| 1155 |  * can limit the consumer's breadth of visibility and what actions the | 
| 1156 |  * consumer may take. | 
| 1157 |  */ | 
| 1158 | #define	DTRACE_CRV_ALLPROC		0x01 | 
| 1159 | #define	DTRACE_CRV_KERNEL		0x02 | 
| 1160 | #define	DTRACE_CRV_ALLZONE		0x04 | 
| 1161 |  | 
| 1162 | #define	DTRACE_CRV_ALL		(DTRACE_CRV_ALLPROC | DTRACE_CRV_KERNEL | \ | 
| 1163 | 	DTRACE_CRV_ALLZONE) | 
| 1164 |  | 
| 1165 | #define	DTRACE_CRA_PROC				0x0001 | 
| 1166 | #define	DTRACE_CRA_PROC_CONTROL			0x0002 | 
| 1167 | #define	DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER	0x0004 | 
| 1168 | #define	DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE	0x0008 | 
| 1169 | #define	DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG	0x0010 | 
| 1170 | #define	DTRACE_CRA_KERNEL			0x0020 | 
| 1171 | #define	DTRACE_CRA_KERNEL_DESTRUCTIVE		0x0040 | 
| 1172 |  | 
| 1173 | #define	DTRACE_CRA_ALL		(DTRACE_CRA_PROC | \ | 
| 1174 | 	DTRACE_CRA_PROC_CONTROL | \ | 
| 1175 | 	DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER | \ | 
| 1176 | 	DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE | \ | 
| 1177 | 	DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG | \ | 
| 1178 | 	DTRACE_CRA_KERNEL | \ | 
| 1179 | 	DTRACE_CRA_KERNEL_DESTRUCTIVE) | 
| 1180 |  | 
| 1181 | typedef struct dtrace_cred { | 
| 1182 | 	cred_t			*dcr_cred; | 
| 1183 | 	uint8_t			dcr_destructive; | 
| 1184 | 	uint8_t			dcr_visible; | 
| 1185 | 	uint16_t		dcr_action; | 
| 1186 | } dtrace_cred_t; | 
| 1187 |  | 
| 1188 | typedef struct dtrace_format { | 
| 1189 | 	uint64_t dtf_refcount; | 
| 1190 | 	char dtf_str[]; | 
| 1191 | } dtrace_format_t; | 
| 1192 |  | 
| 1193 | #define DTRACE_FORMAT_SIZE(fmt) (strlen(fmt->dtf_str) + 1 + sizeof(dtrace_format_t)) | 
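
/*
 * Illustrative sketch: because dtf_str is a flexible array member, a
 * format record is allocated with its string stored inline.  The
 * allocator call below is an assumption for illustration; see the
 * implementation for the actual allocation path.
 *
 *	size_t len = strlen(str) + 1;
 *	dtrace_format_t *fmt =
 *	    kalloc_data(sizeof (dtrace_format_t) + len, Z_WAITOK);
 *	fmt->dtf_refcount = 1;
 *	strlcpy(fmt->dtf_str, str, len);
 */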
| 1194 |  | 
| 1195 | /* | 
| 1196 |  * DTrace Consumer State | 
| 1197 |  * | 
| 1198 |  * Each DTrace consumer has an associated dtrace_state structure that contains | 
| 1199 |  * its in-kernel DTrace state -- including options, credentials, statistics and | 
| 1200 |  * pointers to ECBs, buffers, speculations and formats.  A dtrace_state | 
| 1201 |  * structure is also allocated for anonymous enablings.  When anonymous state | 
 * is grabbed, the grabbing consumer's dts_anon pointer is set to the grabbed
| 1203 |  * dtrace_state structure. | 
| 1204 |  */ | 
| 1205 | struct dtrace_state { | 
| 1206 | 	dev_t dts_dev;				/* device */ | 
| 1207 | 	int dts_necbs;				/* total number of ECBs */ | 
| 1208 | 	dtrace_ecb_t **dts_ecbs;		/* array of ECBs */ | 
| 1209 | 	dtrace_epid_t dts_epid;			/* next EPID to allocate */ | 
| 1210 | 	size_t dts_needed;			/* greatest needed space */ | 
| 1211 | 	struct dtrace_state *dts_anon;		/* anon. state, if grabbed */ | 
| 1212 | 	dtrace_activity_t dts_activity;		/* current activity */ | 
| 1213 | 	dtrace_vstate_t dts_vstate;		/* variable state */ | 
| 1214 | 	dtrace_buffer_t *dts_buffer;		/* principal buffer */ | 
| 1215 | 	dtrace_buffer_t *dts_aggbuffer;		/* aggregation buffer */ | 
| 1216 | 	dtrace_speculation_t *dts_speculations;	/* speculation array */ | 
| 1217 | 	int dts_nspeculations;			/* number of speculations */ | 
| 1218 | 	int dts_naggregations;			/* number of aggregations */ | 
| 1219 | 	dtrace_aggregation_t **dts_aggregations; /* aggregation array */ | 
| 1220 | 	vmem_t *dts_aggid_arena;		/* arena for aggregation IDs */ | 
| 1221 | 	uint64_t dts_errors;			/* total number of errors */ | 
| 1222 | 	uint32_t dts_speculations_busy;		/* number of spec. busy */ | 
| 1223 | 	uint32_t dts_speculations_unavail;	/* number of spec unavail */ | 
| 1224 | 	uint32_t dts_stkstroverflows;		/* stack string tab overflows */ | 
| 1225 | 	uint32_t dts_dblerrors;			/* errors in ERROR probes */ | 
| 1226 | 	uint32_t dts_reserve;			/* space reserved for END */ | 
| 1227 | 	hrtime_t dts_laststatus;		/* time of last status */ | 
| 1228 | 	cyclic_id_t dts_cleaner;		/* cleaning cyclic */ | 
| 1229 | 	cyclic_id_t dts_deadman;		/* deadman cyclic */ | 
| 1230 | 	hrtime_t dts_alive;			/* time last alive */ | 
| 1231 | 	char dts_speculates;			/* boolean: has speculations */ | 
| 1232 | 	char dts_destructive;			/* boolean: has dest. actions */ | 
| 1233 | 	int dts_nformats;			/* number of formats */ | 
| 1234 | 	dtrace_format_t **dts_formats;		/* format string array */ | 
| 1235 | 	dtrace_optval_t dts_options[DTRACEOPT_MAX]; /* options */ | 
| 1236 | 	dtrace_cred_t dts_cred;			/* credentials */ | 
| 1237 | 	size_t dts_nretained;			/* number of retained enabs */ | 
| 1238 | 	uint64_t dts_arg_error_illval; | 
| 1239 | 	uint32_t dts_buf_over_limit;		/* number of bufs over dtb_limit */ | 
| 1240 | 	uint64_t **dts_rstate;			/* per-CPU random state */ | 
| 1241 | }; | 
| 1242 |  | 
| 1243 | struct dtrace_provider { | 
| 1244 | 	dtrace_pattr_t dtpv_attr;		/* provider attributes */ | 
| 1245 | 	dtrace_ppriv_t dtpv_priv;		/* provider privileges */ | 
| 1246 | 	dtrace_pops_t dtpv_pops;		/* provider operations */ | 
| 1247 | 	char *dtpv_name;			/* provider name */ | 
| 1248 | 	void *dtpv_arg;				/* provider argument */ | 
| 1249 | 	uint_t dtpv_defunct;			/* boolean: defunct provider */ | 
| 1250 | 	struct dtrace_provider *dtpv_next;	/* next provider */ | 
| 1251 | 	uint64_t dtpv_probe_count;		/* number of associated probes */ | 
| 1252 | 	uint64_t dtpv_ecb_count;		/* number of associated enabled ECBs */ | 
| 1253 | }; | 
| 1254 |  | 
| 1255 | struct dtrace_meta { | 
| 1256 | 	dtrace_mops_t dtm_mops;			/* meta provider operations */ | 
| 1257 | 	char *dtm_name;				/* meta provider name */ | 
| 1258 | 	void *dtm_arg;				/* meta provider user arg */ | 
| 1259 | 	uint64_t dtm_count;			/* number of associated providers */ | 
| 1260 | }; | 
| 1261 |  | 
| 1262 | /* | 
| 1263 |  * DTrace Enablings | 
| 1264 |  * | 
| 1265 |  * A dtrace_enabling structure is used to track a collection of ECB | 
| 1266 |  * descriptions -- before they have been turned into actual ECBs.  This is | 
| 1267 |  * created as a result of DOF processing, and is generally used to generate | 
| 1268 |  * ECBs immediately thereafter.  However, enablings are also generally | 
| 1269 |  * retained should the probes they describe be created at a later time; as | 
| 1270 |  * each new module or provider registers with the framework, the retained | 
| 1271 |  * enablings are reevaluated, with any new match resulting in new ECBs.  To | 
| 1272 |  * prevent probes from being matched more than once, the enabling tracks the | 
| 1273 |  * last probe generation matched, and only matches probes from subsequent | 
| 1274 |  * generations. | 
| 1275 |  */ | 
| 1276 | typedef struct dtrace_enabling { | 
| 1277 | 	dtrace_ecbdesc_t **dten_desc;		/* all ECB descriptions */ | 
| 1278 | 	int dten_ndesc;				/* number of ECB descriptions */ | 
| 1279 | 	int dten_maxdesc;			/* size of ECB array */ | 
| 1280 | 	dtrace_vstate_t *dten_vstate;		/* associated variable state */ | 
| 1281 | 	dtrace_genid_t dten_probegen;		/* matched probe generation */ | 
| 1282 | 	dtrace_ecbdesc_t *dten_current;		/* current ECB description */ | 
| 1283 | 	int dten_error;				/* current error value */ | 
| 1284 | 	int dten_primed;			/* boolean: set if primed */ | 
| 1285 | 	struct dtrace_enabling *dten_prev;	/* previous enabling */ | 
| 1286 | 	struct dtrace_enabling *dten_next;	/* next enabling */ | 
| 1287 | } dtrace_enabling_t; | 
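
/*
 * Illustrative sketch of generation-based matching: when a retained
 * enabling is reevaluated, probes already seen in a prior pass are skipped
 * and the enabling's watermark is then advanced.  The field names follow
 * the structures in this file, but the surrounding loop is schematic:
 *
 *	if (probe->dtpr_gen <= enab->dten_probegen)
 *		continue;	// matched in an earlier pass; skip
 *	// ... otherwise create ECBs from the enabling's descriptions ...
 */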
| 1288 |  | 
| 1289 | /* | 
| 1290 |  * DTrace Anonymous Enablings | 
| 1291 |  * | 
| 1292 |  * Anonymous enablings are DTrace enablings that are not associated with a | 
| 1293 |  * controlling process, but rather derive their enabling from DOF stored as | 
| 1294 |  * properties in the dtrace.conf file.  If there is an anonymous enabling, a | 
| 1295 |  * DTrace consumer state and enabling are created on attach.  The state may be | 
| 1296 |  * subsequently grabbed by the first consumer specifying the "grabanon" | 
| 1297 |  * option.  As long as an anonymous DTrace enabling exists, dtrace(7D) will | 
| 1298 |  * refuse to unload. | 
| 1299 |  */ | 
| 1300 | typedef struct dtrace_anon { | 
| 1301 | 	dtrace_state_t *dta_state;		/* DTrace consumer state */ | 
| 1302 | 	dtrace_enabling_t *dta_enabling;	/* pointer to enabling */ | 
| 1303 | 	processorid_t dta_beganon;		/* which CPU BEGIN ran on */ | 
| 1304 | } dtrace_anon_t; | 
| 1305 |  | 
| 1306 | /* | 
| 1307 |  * DTrace Error Debugging | 
| 1308 |  */ | 
| 1309 | #if DEBUG | 
| 1310 | #define	DTRACE_ERRDEBUG | 
| 1311 | #endif | 
| 1312 |  | 
| 1313 | #ifdef DTRACE_ERRDEBUG | 
| 1314 |  | 
| 1315 | typedef struct dtrace_errhash { | 
| 1316 | 	const char	*dter_msg;	/* error message */ | 
| 1317 | 	int		dter_count;	/* number of times seen */ | 
| 1318 | } dtrace_errhash_t; | 
| 1319 |  | 
| 1320 | #define	DTRACE_ERRHASHSZ	256	/* must be > number of err msgs */ | 
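
/*
 * Illustrative sketch of recording an error message in the hash (the hash
 * function and the probe-context synchronization of the real
 * implementation differ; this shows only the open-addressing idea, which
 * terminates because the table is larger than the set of messages):
 *
 *	extern dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ];
 *
 *	static void
 *	errhash_record(const char *msg)
 *	{
 *		uint_t h = (uint_t)((uintptr_t)msg % DTRACE_ERRHASHSZ);
 *
 *		while (dtrace_errhash[h].dter_msg != NULL &&
 *		    dtrace_errhash[h].dter_msg != msg)
 *			h = (h + 1) % DTRACE_ERRHASHSZ;	// linear probing
 *
 *		dtrace_errhash[h].dter_msg = msg;
 *		dtrace_errhash[h].dter_count++;
 *	}
 */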
| 1321 |  | 
| 1322 | #endif	/* DTRACE_ERRDEBUG */ | 
| 1323 |  | 
| 1324 | typedef struct dtrace_string dtrace_string_t; | 
| 1325 |  | 
| 1326 | typedef struct dtrace_string { | 
| 1327 | 	dtrace_string_t *dtst_next; | 
| 1328 | 	dtrace_string_t *dtst_prev; | 
| 1329 | 	uint32_t dtst_refcount; | 
| 1330 | 	char dtst_str[]; | 
| 1331 | } dtrace_string_t; | 
| 1332 |  | 
| 1333 | /** | 
| 1334 |  * DTrace Matching pre-conditions | 
| 1335 |  * | 
| 1336 |  * Used when matching new probes to discard matching of enablings that | 
| 1337 |  * doesn't match the condition tested by dmc_func | 
| 1338 |  */ | 
| 1339 | typedef struct dtrace_match_cond { | 
| 1340 | 	int (*dmc_func)(dtrace_probedesc_t*, void*); | 
| 1341 | 	void *dmc_data; | 
| 1342 | } dtrace_match_cond_t; | 
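
/*
 * Illustrative usage (hypothetical predicate; any test over the probe
 * description may serve as the condition):
 *
 *	static int
 *	match_only_provider(dtrace_probedesc_t *desc, void *data)
 *	{
 *		return (strncmp(desc->dtpd_provider, (const char *)data,
 *		    DTRACE_PROVNAMELEN) == 0);
 *	}
 *
 *	dtrace_match_cond_t cond = {
 *		.dmc_func = match_only_provider,
 *		.dmc_data = (void *)"fbt"
 *	};
 */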
| 1343 |  | 
| 1344 |  | 
| 1345 | /* | 
| 1346 |  * DTrace Toxic Ranges | 
| 1347 |  * | 
| 1348 |  * DTrace supports safe loads from probe context; if the address turns out to | 
| 1349 |  * be invalid, a bit will be set by the kernel indicating that DTrace | 
| 1350 |  * encountered a memory error, and DTrace will propagate the error to the user | 
| 1351 |  * accordingly.  However, there may exist some regions of memory in which an | 
| 1352 |  * arbitrary load can change system state, and from which it is impossible to | 
 * recover once such a load has been attempted.  Examples of this may
| 1354 |  * include memory in which programmable I/O registers are mapped (for which a | 
| 1355 |  * read may have some implications for the device) or (in the specific case of | 
| 1356 |  * UltraSPARC-I and -II) the virtual address hole.  The platform is required | 
| 1357 |  * to make DTrace aware of these toxic ranges; DTrace will then check that | 
| 1358 |  * target addresses are not in a toxic range before attempting to issue a | 
| 1359 |  * safe load. | 
| 1360 |  */ | 
| 1361 | typedef struct dtrace_toxrange { | 
| 1362 | 	uintptr_t	dtt_base;		/* base of toxic range */ | 
| 1363 | 	uintptr_t	dtt_limit;		/* limit of toxic range */ | 
| 1364 | } dtrace_toxrange_t; | 
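
/*
 * Illustrative sketch of the toxic-range check (the extern declarations
 * are assumptions for illustration; the real test sits in the safe-load
 * path):
 *
 *	extern dtrace_toxrange_t *dtrace_toxrange;
 *	extern int dtrace_toxranges;
 *
 *	static int
 *	addr_is_toxic(uintptr_t addr, size_t size)
 *	{
 *		int i;
 *
 *		for (i = 0; i < dtrace_toxranges; i++) {
 *			// overlap test against [dtt_base, dtt_limit)
 *			if (addr + size > dtrace_toxrange[i].dtt_base &&
 *			    addr < dtrace_toxrange[i].dtt_limit)
 *				return (1);
 *		}
 *		return (0);
 *	}
 */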
| 1365 |  | 
| 1366 | extern uint64_t dtrace_getarg(int, int, dtrace_mstate_t*, dtrace_vstate_t*); | 
| 1367 | extern int dtrace_getipl(void); | 
| 1368 | extern uintptr_t dtrace_caller(int); | 
| 1369 | extern uint32_t dtrace_cas32(uint32_t *, uint32_t, uint32_t); | 
| 1370 | extern void *dtrace_casptr(void *, void *, void *); | 
| 1371 | extern void dtrace_copyin(user_addr_t, uintptr_t, size_t, volatile uint16_t *); | 
| 1372 | extern void dtrace_copyinstr(user_addr_t, uintptr_t, size_t, volatile uint16_t *); | 
| 1373 | extern void dtrace_copyout(uintptr_t, user_addr_t, size_t, volatile uint16_t *); | 
| 1374 | extern void dtrace_copyoutstr(uintptr_t, user_addr_t, size_t, volatile uint16_t *); | 
| 1375 | extern void dtrace_getpcstack(pc_t *, int, int, uint32_t *); | 
| 1376 | extern uint64_t dtrace_load64(uintptr_t); | 
| 1377 | extern int dtrace_canload(uint64_t, size_t, dtrace_mstate_t*, dtrace_vstate_t*); | 
| 1378 |  | 
| 1379 | extern uint64_t dtrace_getreg(struct regs *, uint_t); | 
| 1380 | extern uint64_t dtrace_getvmreg(uint_t); | 
| 1381 | extern int dtrace_getstackdepth(int); | 
| 1382 | extern void dtrace_getupcstack(uint64_t *, int); | 
| 1383 | extern void dtrace_getufpstack(uint64_t *, uint64_t *, int); | 
| 1384 | extern int dtrace_getustackdepth(void); | 
| 1385 | extern uintptr_t dtrace_fulword(void *); | 
| 1386 | extern uint8_t dtrace_fuword8(user_addr_t); | 
| 1387 | extern uint16_t dtrace_fuword16(user_addr_t); | 
| 1388 | extern uint32_t dtrace_fuword32(user_addr_t); | 
| 1389 | extern uint64_t dtrace_fuword64(user_addr_t); | 
| 1390 | extern int dtrace_proc_waitfor(dtrace_procdesc_t*); | 
| 1391 | extern void dtrace_probe_error(dtrace_state_t *, dtrace_epid_t, int, int, | 
| 1392 |     int, uint64_t); | 
| 1393 | extern int dtrace_assfail(const char *, const char *, int); | 
| 1394 | extern int dtrace_attached(void); | 
| 1395 | extern hrtime_t dtrace_gethrestime(void); | 
| 1396 |  | 
| 1397 | extern void dtrace_flush_caches(void); | 
| 1398 |  | 
| 1399 | extern void dtrace_copy(uintptr_t, uintptr_t, size_t); | 
| 1400 | extern void dtrace_copystr(uintptr_t, uintptr_t, size_t, volatile uint16_t *); | 
| 1401 |  | 
| 1402 | extern void* dtrace_ptrauth_strip(void*, uint64_t); | 
| 1403 | extern int dtrace_is_valid_ptrauth_key(uint64_t); | 
| 1404 |  | 
| 1405 | extern uint64_t dtrace_physmem_read(uint64_t, size_t); | 
| 1406 | extern void dtrace_physmem_write(uint64_t, uint64_t, size_t); | 
| 1407 |  | 
| 1408 | extern void dtrace_livedump(char *, size_t); | 
| 1409 |  | 
| 1410 | /* | 
| 1411 |  * DTrace state handling | 
| 1412 |  */ | 
| 1413 | extern minor_t dtrace_state_reserve(void); | 
| 1414 | extern dtrace_state_t* dtrace_state_allocate(minor_t minor); | 
| 1415 | extern dtrace_state_t* dtrace_state_get(minor_t minor); | 
| 1416 | extern void dtrace_state_free(minor_t minor); | 
| 1417 |  | 
| 1418 | /* | 
| 1419 |  * DTrace restriction checks | 
| 1420 |  */ | 
| 1421 | extern void dtrace_restriction_policy_load(void); | 
| 1422 | extern boolean_t dtrace_is_restricted(void); | 
| 1423 | extern boolean_t dtrace_are_restrictions_relaxed(void); | 
| 1424 | extern boolean_t dtrace_fbt_probes_restricted(void); | 
| 1425 | extern boolean_t dtrace_sdt_probes_restricted(void); | 
| 1426 | extern boolean_t dtrace_can_attach_to_proc(proc_t); | 
| 1427 |  | 
| 1428 | /* | 
| 1429 |  * DTrace Assertions | 
| 1430 |  * | 
| 1431 |  * DTrace calls ASSERT and VERIFY from probe context.  To assure that a failed | 
 * ASSERT or VERIFY does not induce a markedly more catastrophic failure (e.g.,
| 1433 |  * one from which a dump cannot be gleaned), DTrace must define its own ASSERT | 
| 1434 |  * and VERIFY macros to be ones that may safely be called from probe context. | 
| 1435 |  * This header file must thus be included by any DTrace component that calls | 
| 1436 |  * ASSERT and/or VERIFY from probe context, and _only_ by those components. | 
| 1437 |  * (The only exception to this is kernel debugging infrastructure at user-level | 
| 1438 |  * that doesn't depend on calling ASSERT.) | 
| 1439 |  */ | 
| 1440 | #undef ASSERT | 
| 1441 | #undef VERIFY | 
| 1442 |  | 
| 1443 | #define	VERIFY(EX)	((void)((EX) || \ | 
| 1444 | 			dtrace_assfail(#EX, __FILE__, __LINE__))) | 
| 1445 |  | 
| 1446 | #if DEBUG | 
| 1447 | #define	ASSERT(EX)	((void)((EX) || \ | 
| 1448 | 			dtrace_assfail(#EX, __FILE__, __LINE__))) | 
| 1449 | #else | 
#define	ASSERT(EX)	((void)0)
| 1451 | #endif | 
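
/*
 * Example: VERIFY remains active on all kernels, while ASSERT compiles
 * away outside of DEBUG; both expand to an expression that invokes
 * dtrace_assfail() on failure, which is safe in probe context.
 *
 *	VERIFY(ptr != NULL);
 *	ASSERT(size <= limit);	// no-op on non-DEBUG kernels
 */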
| 1452 |  | 
| 1453 | #ifdef	__cplusplus | 
| 1454 | } | 
| 1455 | #endif | 
| 1456 |  | 
| 1457 | #endif /* _SYS_DTRACE_IMPL_H */ | 
| 1458 |  | 
| 1459 |  |