/*
 * Copyright (c) 2011-2018 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Profile Every Thread (PET) provides a profile of all threads on the system
 * when a timer fires. PET supports the "record waiting threads" mode in
 * Instruments, and used to be called All Thread States (ATS). New tools should
 * adopt the lightweight PET mode, which provides the same information with
 * much less overhead.
 *
 * When traditional (non-lightweight) PET is active, a migrating timer call
 * causes the PET thread to wake up. The timer handler also issues a broadcast
 * IPI to the other CPUs, to provide a (somewhat) synchronized set of on-core
 * samples. This preserves backwards-compatibility with clients that expect
 * on-core samples, from when PET's timer was based on the on-core timers.
 * Because PET sampling can take on the order of milliseconds, the PET thread
 * only enters a new timer deadline after it finishes sampling. This perturbs
 * the timer cadence by the duration of PET sampling, but it leaves the system
 * free to work on non-profiling tasks for the duration of the timer period.
 *
 * Lightweight PET samples the system less intrusively than normal PET mode.
 * Instead of iterating tasks and threads on each sample, it increments a
 * global generation count, `kppet_gencount`, which is checked as threads are
 * context switched on-core. If the thread's local generation count is older
 * than the global generation, the thread samples itself.
 *
 *            |  |
 * thread A   +--+---------|
 *            |  |
 * thread B   |--+---------------|
 *            |  |
 * thread C   |  |         |-------------------------------------
 *            |  |         |
 * thread D   |  |         |     |-------------------------------
 *            |  |         |     |
 *            +--+---------+-----+--------------------------------> time
 *               |         |     |
 *               |         +-----+--- threads sampled when they come on-core in
 *                                    kperf_pet_switch_context
 *               |
 *               +--- PET timer fire, sample on-core threads A and B,
 *                    increment kppet_gencount
 */

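/*
 * A rough sketch of how the interfaces below fit together when configuring
 * PET (illustrative only -- in practice, user space drives this through
 * kperf's sysctl interface):
 *
 *	kppet_set_lightweight_pet(1);    // opt into lightweight mode (optional)
 *	kppet_set_idle_rate(KPERF_PET_DEFAULT_IDLE_RATE);
 *	kppet_config(actionid);          // attach an action, allocate buffers,
 *	                                 // and create the PET thread if needed
 *	kppet_lightweight_active_update();  // publish the fast-path flag once
 *	                                    // kperf sampling has been started
 */
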
#include <mach/mach_types.h>
#include <sys/errno.h>

#include <kperf/kperf.h>
#include <kperf/buffer.h>
#include <kperf/sample.h>
#include <kperf/context.h>
#include <kperf/action.h>
#include <kperf/pet.h>
#include <kperf/kptimer.h>

#include <kern/task.h>
#include <kern/kalloc.h>
#if defined(__x86_64__)
#include <i386/mp.h>
#endif /* defined(__x86_64__) */

static LCK_MTX_DECLARE(kppet_mtx, &kperf_lck_grp);

static struct {
	unsigned int g_actionid;
	/*
	 * The idle rate controls how many sampling periods to skip if a thread
	 * is idle.
	 */
	uint32_t g_idle_rate;
	bool g_setup:1;
	bool g_lightweight:1;
	struct kperf_sample *g_sample;

	thread_t g_sample_thread;

	/*
	 * Used by the PET thread to manage which threads and tasks to sample.
	 */
	thread_t *g_threads;
	unsigned int g_nthreads;
	size_t g_threads_count;

	task_t *g_tasks;
	unsigned int g_ntasks;
	size_t g_tasks_count;
} kppet = {
	.g_actionid = 0,
	.g_idle_rate = KPERF_PET_DEFAULT_IDLE_RATE,
};

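/*
 * `kppet_lightweight_active` tells the context-switch hook whether to compare
 * threads against `kppet_gencount`; it's only true while lightweight PET is
 * configured and kperf is actively sampling.
 */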
bool kppet_lightweight_active = false;
_Atomic uint32_t kppet_gencount = 0;

static uint64_t kppet_sample_tasks(uint32_t idle_rate);
static void kppet_thread(void * param, wait_result_t wr);

static void
kppet_lock_assert_owned(void)
{
	lck_mtx_assert(&kppet_mtx, LCK_MTX_ASSERT_OWNED);
}

static void
kppet_lock(void)
{
	lck_mtx_lock(&kppet_mtx);
}

static void
kppet_unlock(void)
{
	lck_mtx_unlock(&kppet_mtx);
}

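/*
 * Called from the scheduler, with interrupts disabled, as threads come
 * on-core. If an action is configured and `thread` hasn't yet been sampled
 * during the current `kppet_gencount` generation, sample it into the per-CPU
 * interrupt sample buffer.
 */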
void
kppet_on_cpu(thread_t thread, thread_continue_t continuation,
    uintptr_t *starting_fp)
{
	assert(thread != NULL);
	assert(ml_get_interrupts_enabled() == FALSE);

	uint32_t actionid = kppet.g_actionid;
	if (actionid == 0) {
		return;
	}

	if (thread->kperf_pet_gen != os_atomic_load(&kppet_gencount, relaxed)) {
		BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_START,
		    os_atomic_load(&kppet_gencount, relaxed), thread->kperf_pet_gen);

		task_t task = get_threadtask(thread);
		struct kperf_context ctx = {
			.cur_thread = thread,
			.cur_task = task,
			.cur_pid = task_pid(task),
			.starting_fp = starting_fp,
		};
		/*
		 * Use a per-CPU interrupt buffer, since this is only called
		 * while interrupts are disabled, from the scheduler.
		 */
		struct kperf_sample *sample = kperf_intr_sample_buffer();
		if (!sample) {
			BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_END, 1);
			return;
		}

		unsigned int flags = SAMPLE_FLAG_NON_INTERRUPT | SAMPLE_FLAG_PEND_USER;
		if (continuation != NULL) {
			flags |= SAMPLE_FLAG_CONTINUATION;
		}
		kperf_sample(sample, &ctx, actionid, flags);

		BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_END);
	} else {
		BUF_VERB(PERF_PET_SAMPLE_THREAD,
		    os_atomic_load(&kppet_gencount, relaxed), thread->kperf_pet_gen);
	}
}

#pragma mark - state transitions

/*
 * Lazily initialize PET. Once the PET thread has been started, it never
 * exits.
 */
static void
kppet_setup(void)
{
	if (kppet.g_setup) {
		return;
	}

	kern_return_t kr = kernel_thread_start(kppet_thread, NULL,
	    &kppet.g_sample_thread);
	if (kr != KERN_SUCCESS) {
		panic("kperf: failed to create PET thread %d", kr);
	}

	thread_set_thread_name(kppet.g_sample_thread, "kperf-pet-sampling");
	kppet.g_setup = true;
}

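/*
 * Point PET at action `actionid`, allocating its sample buffers on first use.
 * An action ID of 0 tears the configuration down instead, freeing the task,
 * thread, and sample buffers.
 */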
void
kppet_config(unsigned int actionid)
{
	/*
	 * Resetting kperf shouldn't get the PET thread started.
	 */
	if (actionid == 0 && !kppet.g_setup) {
		return;
	}

	kppet_setup();

	kppet_lock();

	kppet.g_actionid = actionid;

	if (actionid > 0) {
		if (!kppet.g_sample) {
			kppet.g_sample = kalloc_type_tag(struct kperf_sample,
			    Z_WAITOK | Z_NOFAIL, VM_KERN_MEMORY_DIAG);
			kppet.g_sample->usample.usample_min = kalloc_type_tag(
				struct kperf_usample_min, Z_WAITOK | Z_NOFAIL,
				VM_KERN_MEMORY_DIAG);
		}
	} else {
		if (kppet.g_tasks) {
			assert(kppet.g_tasks_count != 0);
			kfree_type(task_t, kppet.g_tasks_count, kppet.g_tasks);
			kppet.g_tasks = NULL;
			kppet.g_tasks_count = 0;
			kppet.g_ntasks = 0;
		}
		if (kppet.g_threads) {
			assert(kppet.g_threads_count != 0);
			kfree_type(thread_t, kppet.g_threads_count, kppet.g_threads);
			kppet.g_threads = NULL;
			kppet.g_threads_count = 0;
			kppet.g_nthreads = 0;
		}
		if (kppet.g_sample != NULL) {
			kfree_type(struct kperf_usample_min,
			    kppet.g_sample->usample.usample_min);
			kfree_type(struct kperf_sample, kppet.g_sample);
			kppet.g_sample = NULL;
		}
	}

	kppet_unlock();
}

void
kppet_reset(void)
{
	kppet_config(0);
	kppet_set_idle_rate(KPERF_PET_DEFAULT_IDLE_RATE);
	kppet_set_lightweight_pet(0);
}

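/*
 * Wake the PET thread to start a sampling pass; called when the PET timer
 * fires.
 */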
void
kppet_wake_thread(void)
{
	thread_wakeup(&kppet);
}

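/*
 * The main loop of the PET thread: sleep until an action is configured,
 * sample every eligible task on the system, and then report the sampling
 * duration to kptimer so the next deadline starts after sampling finished.
 */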
__attribute__((noreturn))
static void
kppet_thread(void * __unused param, wait_result_t __unused wr)
{
	kppet_lock();

	for (;;) {
		BUF_INFO(PERF_PET_IDLE);

		do {
			(void)lck_mtx_sleep(&kppet_mtx, LCK_SLEEP_DEFAULT, &kppet,
			    THREAD_UNINT);
		} while (kppet.g_actionid == 0);

		BUF_INFO(PERF_PET_RUN);

		uint64_t sampledur_abs = kppet_sample_tasks(kppet.g_idle_rate);

		kptimer_pet_enter(sampledur_abs);
	}
}

#pragma mark - sampling

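/*
 * Sample a single thread of a task, including its user space state. Threads
 * that haven't run (aren't marked dirty) since their last sample only get a
 * full callstack once every `idle_rate` passes; otherwise the callstack is
 * recorded as empty.
 */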
static void
kppet_sample_thread(int pid, task_t task, thread_t thread, uint32_t idle_rate)
{
	kppet_lock_assert_owned();

	uint32_t sample_flags = SAMPLE_FLAG_IDLE_THREADS |
	    SAMPLE_FLAG_THREAD_ONLY;

	BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_START);

	struct kperf_context ctx = {
		.cur_thread = thread,
		.cur_task = task,
		.cur_pid = pid,
	};

	boolean_t thread_dirty = kperf_thread_get_dirty(thread);

	/*
	 * Clean a dirty thread, and skip the callstack sample if the thread
	 * was not dirty and has skipped fewer than `idle_rate` samples.
	 */
	if (thread_dirty) {
		kperf_thread_set_dirty(thread, FALSE);
	} else if ((thread->kperf_pet_cnt % idle_rate) != 0) {
		sample_flags |= SAMPLE_FLAG_EMPTY_CALLSTACK;
	}
	thread->kperf_pet_cnt++;

	kperf_sample(kppet.g_sample, &ctx, kppet.g_actionid, sample_flags);
	kperf_sample_user(&kppet.g_sample->usample, &ctx, kppet.g_actionid,
	    sample_flags);

	BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_END);
}

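/*
 * Snapshot the threads of `task` into the `g_threads` array, growing it as
 * needed and taking a reference on each thread. The caller has already
 * suspended the task, so the set of threads can't change underneath us.
 */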
static kern_return_t
kppet_threads_prepare(task_t task)
{
	kppet_lock_assert_owned();

	vm_size_t count_needed;

	for (;;) {
		task_lock(task);

		if (!task->active) {
			task_unlock(task);
			return KERN_FAILURE;
		}

		/*
		 * With the task locked, figure out if enough space has been
		 * allocated to contain all of the thread references.
		 */
		count_needed = task->thread_count;
		if (count_needed <= kppet.g_threads_count) {
			break;
		}

		/*
		 * Otherwise, allocate more and try again.
		 */
		task_unlock(task);

		kfree_type(thread_t, kppet.g_threads_count, kppet.g_threads);

		assert(count_needed > 0);
		kppet.g_threads_count = count_needed;

		kppet.g_threads = kalloc_type_tag(thread_t, kppet.g_threads_count,
		    Z_WAITOK | Z_ZERO, VM_KERN_MEMORY_DIAG);
		if (kppet.g_threads == NULL) {
			kppet.g_threads_count = 0;
			return KERN_RESOURCE_SHORTAGE;
		}
	}

	thread_t thread;
	kppet.g_nthreads = 0;
	queue_iterate(&(task->threads), thread, thread_t, task_threads) {
		thread_reference(thread);
		kppet.g_threads[kppet.g_nthreads++] = thread;
	}

	task_unlock(task);

	return (kppet.g_nthreads > 0) ? KERN_SUCCESS : KERN_FAILURE;
}

/*
 * Sample a `task`, using `idle_rate` to control whether idle threads need to
 * be re-sampled.
 *
 * The task must be referenced.
 */
static void
kppet_sample_task(task_t task, uint32_t idle_rate)
{
	kppet_lock_assert_owned();
	assert(task != kernel_task);
	if (task == kernel_task) {
		return;
	}

	BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_START);

	int pid = task_pid(task);
	if (kperf_action_has_task(kppet.g_actionid)) {
		struct kperf_context ctx = {
			.cur_task = task,
			.cur_pid = pid,
		};

		kperf_sample(kppet.g_sample, &ctx, kppet.g_actionid,
		    SAMPLE_FLAG_TASK_ONLY);
	}

	if (!kperf_action_has_thread(kppet.g_actionid)) {
		BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_END);
		return;
	}

	/*
	 * Suspend the task to see an atomic snapshot of all its threads. This
	 * is expensive and disruptive.
	 */
	kern_return_t kr = task_suspend_internal(task);
	if (kr != KERN_SUCCESS) {
		BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_END, 1);
		return;
	}

	kr = kppet_threads_prepare(task);
	if (kr != KERN_SUCCESS) {
		BUF_INFO(PERF_PET_ERROR, ERR_THREAD, kr);
		goto out;
	}

	for (unsigned int i = 0; i < kppet.g_nthreads; i++) {
		thread_t thread = kppet.g_threads[i];
		assert(thread != THREAD_NULL);

		kppet_sample_thread(pid, task, thread, idle_rate);

		thread_deallocate(kppet.g_threads[i]);
	}

out:
	task_resume_internal(task);

	BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_END, kppet.g_nthreads);
}

/*
 * Store and reference all tasks on the system, so they can be safely inspected
 * outside the `tasks_threads_lock`.
 */
static kern_return_t
kppet_tasks_prepare(void)
{
	kppet_lock_assert_owned();

	vm_size_t count_needed = 0;

	for (;;) {
		lck_mtx_lock(&tasks_threads_lock);

		/*
		 * With the lock held, break out of the lock/unlock loop if
		 * there's enough space to store all the tasks.
		 */
		count_needed = tasks_count;
		if (count_needed <= kppet.g_tasks_count) {
			break;
		}

		/*
		 * Otherwise, allocate more memory outside of the lock.
		 */
		lck_mtx_unlock(&tasks_threads_lock);

		if (count_needed > kppet.g_tasks_count) {
			if (kppet.g_tasks_count != 0) {
				kfree_type(task_t, kppet.g_tasks_count, kppet.g_tasks);
			}

			assert(count_needed > 0);
			kppet.g_tasks_count = count_needed;

			kppet.g_tasks = kalloc_type_tag(task_t, kppet.g_tasks_count,
			    Z_WAITOK | Z_ZERO, VM_KERN_MEMORY_DIAG);
			if (!kppet.g_tasks) {
				kppet.g_tasks_count = 0;
				return KERN_RESOURCE_SHORTAGE;
			}
		}
	}

	task_t task = TASK_NULL;
	kppet.g_ntasks = 0;
	queue_iterate(&tasks, task, task_t, tasks) {
		bool eligible_task = task != kernel_task;
		if (eligible_task) {
			task_reference(task);
			kppet.g_tasks[kppet.g_ntasks++] = task;
		}
	}

	lck_mtx_unlock(&tasks_threads_lock);

	return KERN_SUCCESS;
}

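/*
 * Sample every eligible task on the system and return the duration of the
 * entire pass, in absolute time units, so the caller can push the next PET
 * timer deadline out past the end of sampling.
 */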
static uint64_t
kppet_sample_tasks(uint32_t idle_rate)
{
	kppet_lock_assert_owned();
	assert(kppet.g_actionid > 0);

	uint64_t start_abs = mach_absolute_time();

	BUF_INFO(PERF_PET_SAMPLE | DBG_FUNC_START);

	kern_return_t kr = kppet_tasks_prepare();
	if (kr != KERN_SUCCESS) {
		BUF_INFO(PERF_PET_ERROR, ERR_TASK, kr);
		BUF_INFO(PERF_PET_SAMPLE | DBG_FUNC_END);
		return mach_absolute_time() - start_abs;
	}

	for (unsigned int i = 0; i < kppet.g_ntasks; i++) {
		task_t task = kppet.g_tasks[i];
		assert(task != TASK_NULL);
		kppet_sample_task(task, idle_rate);
		task_deallocate(task);
		kppet.g_tasks[i] = TASK_NULL;
	}

	BUF_INFO(PERF_PET_SAMPLE | DBG_FUNC_END, kppet.g_ntasks);
	kppet.g_ntasks = 0;
	return mach_absolute_time() - start_abs;
}

#pragma mark - sysctl accessors

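/*
 * These accessors back kperf's sysctl interface. The idle rate is expressed
 * in PET sampling periods and is picked up at the start of the next sampling
 * pass; lightweight PET can only be toggled while kperf is not sampling.
 */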
int
kppet_get_idle_rate(void)
{
	return kppet.g_idle_rate;
}

int
kppet_set_idle_rate(int new_idle_rate)
{
	kppet.g_idle_rate = new_idle_rate;
	return 0;
}

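/*
 * Recompute `kppet_lightweight_active`, the flag consulted on the
 * context-switch path, from the current sampling state and lightweight
 * setting, and let the rest of kperf know the on-CPU hooks may have changed.
 */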
void
kppet_lightweight_active_update(void)
{
	kppet_lightweight_active = (kperf_is_sampling() && kppet.g_lightweight);
	kperf_on_cpu_update();
}

int
kppet_get_lightweight_pet(void)
{
	return kppet.g_lightweight;
}

int
kppet_set_lightweight_pet(int on)
{
	if (kperf_is_sampling()) {
		return EBUSY;
	}

	kppet.g_lightweight = (on == 1);
	kppet_lightweight_active_update();
	return 0;
}