1 | /* |
2 | * Copyright (c) 2011-2018 Apple Computer, Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | |
29 | /* |
30 | * Profile Every Thread (PET) provides a profile of all threads on the system |
31 | * when a timer fires. PET supports the "record waiting threads" mode in |
32 | * Instruments, and used to be called All Thread States (ATS). New tools should |
33 | * adopt the lightweight PET mode, which provides the same information, but with |
34 | * much less overhead. |
35 | * |
36 | * When traditional (non-lightweight) PET is active, a migrating timer call |
37 | * causes the PET thread to wake up. The timer handler also issues a broadcast |
38 | * IPI to the other CPUs, to provide a (somewhat) synchronized set of on-core |
39 | * samples. This is provided for backwards-compatibility with clients that |
40 | * expect on-core samples, when PET's timer was based off the on-core timers. |
41 | * Because PET sampling can take on the order of milliseconds, the PET thread |
 * will enter a new timer deadline after it finishes sampling.  This perturbs the
43 | * timer cadence by the duration of PET sampling, but it leaves the system to |
44 | * work on non-profiling tasks for the duration of the timer period. |
45 | * |
46 | * Lightweight PET samples the system less-intrusively than normal PET |
47 | * mode. Instead of iterating tasks and threads on each sample, it increments |
48 | * a global generation count, `kppet_gencount`, which is checked as threads are |
49 | * context switched on-core. If the thread's local generation count is older |
50 | * than the global generation, the thread samples itself. |
51 | * |
 *            |  |
 * thread A   +--+---------|
 *            |  |
 * thread B   |--+---------------|
 *            |  |
 * thread C   |  |         |-------------------------------------
 *            |  |         |
 * thread D   |  |         |     |-------------------------------
 *            |  |         |     |
 *            +--+---------+-----+--------------------------------> time
 *               |         |     |
 *               |         +-----+--- threads sampled when they come on-core in
 *               |                    kppet_on_cpu
 *               |
 *               +--- PET timer fire, sample on-core threads A and B,
 *                    increment kppet_gencount
68 | */ |
69 | |
70 | #include <mach/mach_types.h> |
71 | #include <sys/errno.h> |
72 | |
73 | #include <kperf/kperf.h> |
74 | #include <kperf/buffer.h> |
75 | #include <kperf/sample.h> |
76 | #include <kperf/context.h> |
77 | #include <kperf/action.h> |
78 | #include <kperf/pet.h> |
79 | #include <kperf/kptimer.h> |
80 | |
81 | #include <kern/task.h> |
82 | #include <kern/kalloc.h> |
83 | #if defined(__x86_64__) |
84 | #include <i386/mp.h> |
85 | #endif /* defined(__x86_64__) */ |
86 | |
/*
 * Mutex taken by kppet_config() around configuration changes and held by the
 * PET thread (kppet_thread()) around its sampling pass.
 */
static LCK_MTX_DECLARE(kppet_mtx, &kperf_lck_grp);

/*
 * Global PET state.  The address of this structure also serves as the wait
 * event the PET thread sleeps on (see kppet_wake_thread()).
 */
static struct {
	/*
	 * The kperf action to run for each PET sample; 0 means PET is not
	 * configured and nothing is sampled.
	 */
	unsigned int g_actionid;
	/*
	 * The idle rate controls how many sampling periods to skip if a thread
	 * is idle.
	 */
	uint32_t g_idle_rate;
	/* Set once kppet_setup() has created the PET thread. */
	bool g_setup:1;
	/* Whether lightweight PET was requested with kppet_set_lightweight_pet(). */
	bool g_lightweight:1;
	/* Sample buffer used by the PET thread; allocated in kppet_config(). */
	struct kperf_sample *g_sample;

	/* The PET thread created by kppet_setup(). */
	thread_t g_sample_thread;

	/*
	 * Used by the PET thread to manage which threads and tasks to sample.
	 *
	 * For each array, `g_n*` is the number of valid (referenced) entries and
	 * `g_*_count` is the number of elements the array was allocated with.
	 */
	thread_t *g_threads;
	unsigned int g_nthreads;
	size_t g_threads_count;

	task_t *g_tasks;
	unsigned int g_ntasks;
	size_t g_tasks_count;
} kppet = {
	.g_actionid = 0,
	.g_idle_rate = KPERF_PET_DEFAULT_IDLE_RATE,
};

/*
 * True while kperf is sampling and lightweight PET is enabled; recomputed by
 * kppet_lightweight_active_update().
 */
bool kppet_lightweight_active = false;
/*
 * Global PET generation; kppet_on_cpu() compares it against each thread's
 * `kperf_pet_gen` to decide whether the thread still needs to be sampled.
 */
_Atomic uint32_t kppet_gencount = 0;

static uint64_t kppet_sample_tasks(uint32_t idle_rate);
static void kppet_thread(void * param, wait_result_t wr);
122 | |
123 | static void |
124 | kppet_lock_assert_owned(void) |
125 | { |
126 | lck_mtx_assert(lck: &kppet_mtx, LCK_MTX_ASSERT_OWNED); |
127 | } |
128 | |
129 | static void |
130 | kppet_lock(void) |
131 | { |
132 | lck_mtx_lock(lck: &kppet_mtx); |
133 | } |
134 | |
135 | static void |
136 | kppet_unlock(void) |
137 | { |
138 | lck_mtx_unlock(lck: &kppet_mtx); |
139 | } |
140 | |
141 | void |
142 | kppet_on_cpu(thread_t thread, thread_continue_t continuation, |
143 | uintptr_t *starting_fp) |
144 | { |
145 | assert(thread != NULL); |
146 | assert(ml_get_interrupts_enabled() == FALSE); |
147 | |
148 | uint32_t actionid = kppet.g_actionid; |
149 | if (actionid == 0) { |
150 | return; |
151 | } |
152 | |
153 | if (thread->kperf_pet_gen != atomic_load(&kppet_gencount)) { |
154 | BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_START, |
155 | atomic_load_explicit(&kppet_gencount, |
156 | memory_order_relaxed), thread->kperf_pet_gen); |
157 | |
158 | task_t task = get_threadtask(thread); |
159 | struct kperf_context ctx = { |
160 | .cur_thread = thread, |
161 | .cur_task = task, |
162 | .cur_pid = task_pid(task), |
163 | .starting_fp = starting_fp, |
164 | }; |
165 | /* |
166 | * Use a per-CPU interrupt buffer, since this is only called |
167 | * while interrupts are disabled, from the scheduler. |
168 | */ |
169 | struct kperf_sample *sample = kperf_intr_sample_buffer(); |
170 | if (!sample) { |
171 | BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_END, 1); |
172 | return; |
173 | } |
174 | |
175 | unsigned int flags = SAMPLE_FLAG_NON_INTERRUPT | SAMPLE_FLAG_PEND_USER; |
176 | if (continuation != NULL) { |
177 | flags |= SAMPLE_FLAG_CONTINUATION; |
178 | } |
179 | kperf_sample(sbuf: sample, ctx: &ctx, actionid, sample_flags: flags); |
180 | |
181 | BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_END); |
182 | } else { |
183 | BUF_VERB(PERF_PET_SAMPLE_THREAD, |
184 | os_atomic_load(&kppet_gencount, relaxed), thread->kperf_pet_gen); |
185 | } |
186 | } |
187 | |
188 | #pragma mark - state transitions |
189 | |
190 | /* |
191 | * Lazily initialize PET. The PET thread never exits once PET has been used |
192 | * once. |
193 | */ |
194 | static void |
195 | kppet_setup(void) |
196 | { |
197 | if (kppet.g_setup) { |
198 | return; |
199 | } |
200 | |
201 | kern_return_t kr = kernel_thread_start(continuation: kppet_thread, NULL, |
202 | new_thread: &kppet.g_sample_thread); |
203 | if (kr != KERN_SUCCESS) { |
204 | panic("kperf: failed to create PET thread %d" , kr); |
205 | } |
206 | |
207 | thread_set_thread_name(th: kppet.g_sample_thread, name: "kperf-pet-sampling" ); |
208 | kppet.g_setup = true; |
209 | } |
210 | |
211 | void |
212 | kppet_config(unsigned int actionid) |
213 | { |
214 | /* |
215 | * Resetting kperf shouldn't get the PET thread started. |
216 | */ |
217 | if (actionid == 0 && !kppet.g_setup) { |
218 | return; |
219 | } |
220 | |
221 | kppet_setup(); |
222 | |
223 | kppet_lock(); |
224 | |
225 | kppet.g_actionid = actionid; |
226 | |
227 | if (actionid > 0) { |
228 | if (!kppet.g_sample) { |
229 | kppet.g_sample = kalloc_type_tag(struct kperf_sample, |
230 | Z_WAITOK | Z_NOFAIL, VM_KERN_MEMORY_DIAG); |
231 | kppet.g_sample->usample.usample_min = kalloc_type_tag( |
232 | struct kperf_usample_min, Z_WAITOK | Z_NOFAIL, VM_KERN_MEMORY_DIAG); |
233 | } |
234 | } else { |
235 | if (kppet.g_tasks) { |
236 | assert(kppet.g_tasks_count != 0); |
237 | kfree_type(task_t, kppet.g_tasks_count, kppet.g_tasks); |
238 | kppet.g_tasks = NULL; |
239 | kppet.g_tasks_count = 0; |
240 | kppet.g_ntasks = 0; |
241 | } |
242 | if (kppet.g_threads) { |
243 | assert(kppet.g_threads_count != 0); |
244 | void *g_tasks = (void *)kppet.g_tasks; |
245 | kfree_type(thread_t, kppet.g_threads_count, g_tasks); |
246 | kppet.g_tasks = NULL; |
247 | kppet.g_threads = NULL; |
248 | kppet.g_threads_count = 0; |
249 | kppet.g_nthreads = 0; |
250 | } |
251 | if (kppet.g_sample != NULL) { |
252 | kfree_type(struct kperf_usample_min, |
253 | kppet.g_sample->usample.usample_min); |
254 | kfree_type(struct kperf_sample, kppet.g_sample); |
255 | } |
256 | } |
257 | |
258 | kppet_unlock(); |
259 | } |
260 | |
261 | void |
262 | kppet_reset(void) |
263 | { |
264 | kppet_config(actionid: 0); |
265 | kppet_set_idle_rate(KPERF_PET_DEFAULT_IDLE_RATE); |
266 | kppet_set_lightweight_pet(on: 0); |
267 | } |
268 | |
/*
 * Wake the PET thread, which sleeps on the address of `kppet` in
 * kppet_thread().
 */
void
kppet_wake_thread(void)
{
	thread_wakeup(&kppet);
}
274 | |
275 | __attribute__((noreturn)) |
276 | static void |
277 | kppet_thread(void * __unused param, wait_result_t __unused wr) |
278 | { |
279 | kppet_lock(); |
280 | |
281 | for (;;) { |
282 | BUF_INFO(PERF_PET_IDLE); |
283 | |
284 | do { |
285 | (void)lck_mtx_sleep(lck: &kppet_mtx, lck_sleep_action: LCK_SLEEP_DEFAULT, event: &kppet, |
286 | THREAD_UNINT); |
287 | } while (kppet.g_actionid == 0); |
288 | |
289 | BUF_INFO(PERF_PET_RUN); |
290 | |
291 | uint64_t sampledur_abs = kppet_sample_tasks(idle_rate: kppet.g_idle_rate); |
292 | |
293 | kptimer_pet_enter(sampledur_abs); |
294 | } |
295 | } |
296 | |
297 | #pragma mark - sampling |
298 | |
299 | static void |
300 | kppet_sample_thread(int pid, task_t task, thread_t thread, uint32_t idle_rate) |
301 | { |
302 | kppet_lock_assert_owned(); |
303 | |
304 | uint32_t sample_flags = SAMPLE_FLAG_IDLE_THREADS | |
305 | SAMPLE_FLAG_THREAD_ONLY; |
306 | |
307 | BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_START); |
308 | |
309 | struct kperf_context ctx = { |
310 | .cur_thread = thread, |
311 | .cur_task = task, |
312 | .cur_pid = pid, |
313 | }; |
314 | |
315 | boolean_t thread_dirty = kperf_thread_get_dirty(thread); |
316 | |
317 | /* |
318 | * Clean a dirty thread and skip callstack sample if the thread was not |
319 | * dirty and thread had skipped less than `idle_rate` samples. |
320 | */ |
321 | if (thread_dirty) { |
322 | kperf_thread_set_dirty(thread, FALSE); |
323 | } else if ((thread->kperf_pet_cnt % idle_rate) != 0) { |
324 | sample_flags |= SAMPLE_FLAG_EMPTY_CALLSTACK; |
325 | } |
326 | thread->kperf_pet_cnt++; |
327 | |
328 | kperf_sample(sbuf: kppet.g_sample, ctx: &ctx, actionid: kppet.g_actionid, sample_flags); |
329 | kperf_sample_user(sbuf: &kppet.g_sample->usample, ctx: &ctx, actionid: kppet.g_actionid, |
330 | sample_flags); |
331 | |
332 | BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_END); |
333 | } |
334 | |
335 | static kern_return_t |
336 | kppet_threads_prepare(task_t task) |
337 | { |
338 | kppet_lock_assert_owned(); |
339 | |
340 | vm_size_t count_needed; |
341 | |
342 | for (;;) { |
343 | task_lock(task); |
344 | |
345 | if (!task->active) { |
346 | task_unlock(task); |
347 | return KERN_FAILURE; |
348 | } |
349 | |
350 | /* |
351 | * With the task locked, figure out if enough space has been allocated to |
352 | * contain all of the thread references. |
353 | */ |
354 | count_needed = task->thread_count; |
355 | if (count_needed <= kppet.g_threads_count) { |
356 | break; |
357 | } |
358 | |
359 | /* |
360 | * Otherwise, allocate more and try again. |
361 | */ |
362 | task_unlock(task); |
363 | |
364 | kfree_type(thread_t, kppet.g_threads_count, kppet.g_threads); |
365 | |
366 | assert(count_needed > 0); |
367 | kppet.g_threads_count = count_needed; |
368 | |
369 | kppet.g_threads = kalloc_type_tag(thread_t, kppet.g_threads_count, |
370 | Z_WAITOK | Z_ZERO, VM_KERN_MEMORY_DIAG); |
371 | if (kppet.g_threads == NULL) { |
372 | kppet.g_threads_count = 0; |
373 | return KERN_RESOURCE_SHORTAGE; |
374 | } |
375 | } |
376 | |
377 | thread_t thread; |
378 | kppet.g_nthreads = 0; |
379 | queue_iterate(&(task->threads), thread, thread_t, task_threads) { |
380 | thread_reference(thread); |
381 | kppet.g_threads[kppet.g_nthreads++] = thread; |
382 | } |
383 | |
384 | task_unlock(task); |
385 | |
386 | return (kppet.g_nthreads > 0) ? KERN_SUCCESS : KERN_FAILURE; |
387 | } |
388 | |
389 | /* |
390 | * Sample a `task`, using `idle_rate` to control whether idle threads need to be |
391 | * re-sampled. |
392 | * |
393 | * The task must be referenced. |
394 | */ |
395 | static void |
396 | kppet_sample_task(task_t task, uint32_t idle_rate) |
397 | { |
398 | kppet_lock_assert_owned(); |
399 | assert(task != kernel_task); |
400 | if (task == kernel_task) { |
401 | return; |
402 | } |
403 | |
404 | BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_START); |
405 | |
406 | int pid = task_pid(task); |
407 | if (kperf_action_has_task(actionid: kppet.g_actionid)) { |
408 | struct kperf_context ctx = { |
409 | .cur_task = task, |
410 | .cur_pid = pid, |
411 | }; |
412 | |
413 | kperf_sample(sbuf: kppet.g_sample, ctx: &ctx, actionid: kppet.g_actionid, |
414 | SAMPLE_FLAG_TASK_ONLY); |
415 | } |
416 | |
417 | if (!kperf_action_has_thread(actionid: kppet.g_actionid)) { |
418 | BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_END); |
419 | return; |
420 | } |
421 | |
422 | /* |
423 | * Suspend the task to see an atomic snapshot of all its threads. This |
424 | * is expensive and disruptive. |
425 | */ |
426 | kern_return_t kr = task_suspend_internal(task); |
427 | if (kr != KERN_SUCCESS) { |
428 | BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_END, 1); |
429 | return; |
430 | } |
431 | |
432 | kr = kppet_threads_prepare(task); |
433 | if (kr != KERN_SUCCESS) { |
434 | BUF_INFO(PERF_PET_ERROR, ERR_THREAD, kr); |
435 | goto out; |
436 | } |
437 | |
438 | for (unsigned int i = 0; i < kppet.g_nthreads; i++) { |
439 | thread_t thread = kppet.g_threads[i]; |
440 | assert(thread != THREAD_NULL); |
441 | |
442 | kppet_sample_thread(pid, task, thread, idle_rate); |
443 | |
444 | thread_deallocate(thread: kppet.g_threads[i]); |
445 | } |
446 | |
447 | out: |
448 | task_resume_internal(task); |
449 | |
450 | BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_END, kppet.g_nthreads); |
451 | } |
452 | |
453 | /* |
454 | * Store and reference all tasks on the system, so they can be safely inspected |
455 | * outside the `tasks_threads_lock`. |
456 | */ |
457 | static kern_return_t |
458 | kppet_tasks_prepare(void) |
459 | { |
460 | kppet_lock_assert_owned(); |
461 | |
462 | vm_size_t count_needed = 0; |
463 | |
464 | for (;;) { |
465 | lck_mtx_lock(lck: &tasks_threads_lock); |
466 | |
467 | /* |
468 | * With the lock held, break out of the lock/unlock loop if |
469 | * there's enough space to store all the tasks. |
470 | */ |
471 | count_needed = tasks_count; |
472 | if (count_needed <= kppet.g_tasks_count) { |
473 | break; |
474 | } |
475 | |
476 | /* |
477 | * Otherwise, allocate more memory outside of the lock. |
478 | */ |
479 | lck_mtx_unlock(lck: &tasks_threads_lock); |
480 | |
481 | if (count_needed > kppet.g_tasks_count) { |
482 | if (kppet.g_tasks_count != 0) { |
483 | kfree_type(task_t, kppet.g_tasks_count, kppet.g_tasks); |
484 | } |
485 | |
486 | assert(count_needed > 0); |
487 | kppet.g_tasks_count = count_needed; |
488 | |
489 | kppet.g_tasks = kalloc_type_tag(task_t, kppet.g_tasks_count, |
490 | Z_WAITOK | Z_ZERO, VM_KERN_MEMORY_DIAG); |
491 | if (!kppet.g_tasks) { |
492 | kppet.g_tasks_count = 0; |
493 | return KERN_RESOURCE_SHORTAGE; |
494 | } |
495 | } |
496 | } |
497 | |
498 | task_t task = TASK_NULL; |
499 | kppet.g_ntasks = 0; |
500 | queue_iterate(&tasks, task, task_t, tasks) { |
501 | bool eligible_task = task != kernel_task; |
502 | if (eligible_task) { |
503 | task_reference(task); |
504 | kppet.g_tasks[kppet.g_ntasks++] = task; |
505 | } |
506 | } |
507 | |
508 | lck_mtx_unlock(lck: &tasks_threads_lock); |
509 | |
510 | return KERN_SUCCESS; |
511 | } |
512 | |
513 | static uint64_t |
514 | kppet_sample_tasks(uint32_t idle_rate) |
515 | { |
516 | kppet_lock_assert_owned(); |
517 | assert(kppet.g_actionid > 0); |
518 | |
519 | uint64_t start_abs = mach_absolute_time(); |
520 | |
521 | BUF_INFO(PERF_PET_SAMPLE | DBG_FUNC_START); |
522 | |
523 | kern_return_t kr = kppet_tasks_prepare(); |
524 | if (kr != KERN_SUCCESS) { |
525 | BUF_INFO(PERF_PET_ERROR, ERR_TASK, kr); |
526 | BUF_INFO(PERF_PET_SAMPLE | DBG_FUNC_END); |
527 | return mach_absolute_time() - start_abs; |
528 | } |
529 | |
530 | for (unsigned int i = 0; i < kppet.g_ntasks; i++) { |
531 | task_t task = kppet.g_tasks[i]; |
532 | assert(task != TASK_NULL); |
533 | kppet_sample_task(task, idle_rate); |
534 | task_deallocate(task); |
535 | kppet.g_tasks[i] = TASK_NULL; |
536 | } |
537 | |
538 | BUF_INFO(PERF_PET_SAMPLE | DBG_FUNC_END, kppet.g_ntasks); |
539 | kppet.g_ntasks = 0; |
540 | return mach_absolute_time() - start_abs; |
541 | } |
542 | |
543 | #pragma mark - sysctl accessors |
544 | |
/*
 * Return the current idle rate (stored as a uint32_t, returned as an int).
 */
int
kppet_get_idle_rate(void)
{
	return kppet.g_idle_rate;
}
550 | |
551 | int |
552 | kppet_set_idle_rate(int new_idle_rate) |
553 | { |
554 | kppet.g_idle_rate = new_idle_rate; |
555 | return 0; |
556 | } |
557 | |
558 | void |
559 | kppet_lightweight_active_update(void) |
560 | { |
561 | kppet_lightweight_active = (kperf_is_sampling() && kppet.g_lightweight); |
562 | kperf_on_cpu_update(); |
563 | } |
564 | |
/*
 * Return the stored lightweight PET flag (a one-bit field) as an int.
 */
int
kppet_get_lightweight_pet(void)
{
	return kppet.g_lightweight;
}
570 | |
571 | int |
572 | kppet_set_lightweight_pet(int on) |
573 | { |
574 | if (kperf_is_sampling()) { |
575 | return EBUSY; |
576 | } |
577 | |
578 | kppet.g_lightweight = (on == 1); |
579 | kppet_lightweight_active_update(); |
580 | return 0; |
581 | } |
582 | |