sfi.c source code [xnu/osfmk/kern/sfi.c]

1	/*
2	* Copyright (c) 2013 Apple Inc. All rights reserved.
3	*
4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. The rights granted to you under the License
10	* may not be used to create, or enable the creation or redistribution of,
11	* unlawful or unlicensed copies of an Apple operating system, or to
12	* circumvent, violate, or enable the circumvention or violation of, any
13	* terms of an Apple operating system software license agreement.
14	*
15	* Please obtain a copy of the License at
16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
17	*
18	* The Original Code and all software distributed under the License are
19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23	* Please see the License for the specific language governing rights and
24	* limitations under the License.
25	*
26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27	*/
28	#include <mach/mach_types.h>
29	#include <kern/assert.h>
30	#include <kern/clock.h>
31	#include <kern/coalition.h>
32	#include <kern/debug.h>
33	#include <kern/host.h>
34	#include <kern/kalloc.h>
35	#include <kern/kern_types.h>
36	#include <kern/machine.h>
37	#include <kern/simple_lock.h>
38	#include <kern/misc_protos.h>
39	#include <kern/sched.h>
40	#include <kern/sched_prim.h>
41	#include <kern/sfi.h>
42	#include <kern/timer_call.h>
43	#include <kern/waitq.h>
44	#include <kern/ledger.h>
45	#include <kern/policy_internal.h>
46
47	#include <machine/atomic.h>
48
49	#include <pexpert/pexpert.h>
50
51	#include <libkern/kernel_mach_header.h>
52
53	#include <sys/kdebug.h>
54
55	#if CONFIG_SCHED_SFI
56
/* Compile-time switch: set to 1 to route SFI trace output to kprintf(). */
#define SFI_DEBUG 0

#if !SFI_DEBUG
/* Tracing disabled: dprintf() expands to an empty statement. */
#define dprintf(...) do { } while(0)
#else
/* Tracing enabled: forward the arguments unchanged to kprintf(). */
#define dprintf(...) kprintf(__VA_ARGS__)
#endif
64
65	/*
66	* SFI (Selective Forced Idle) operates by enabling a global
67	* timer on the SFI window interval. When it fires, all processors
68	* running a thread that should be SFI-ed are sent an AST.
69	* As threads become runnable while in their "off phase", they
70	* are placed on a deferred ready queue. When a per-class
71	* "on timer" fires, the ready threads for that class are
72	* re-enqueued for running. As an optimization to avoid spurious
73	* wakeups, the timer may be lazily programmed.
74	*/
75
76	/*
77	* The "sfi_lock" simple lock guards access to static configuration
78	* parameters (as specified by userspace), dynamic state changes
79	* (as updated by the timer event routine), and timer data structures.
80	* Since it can be taken with interrupts disabled in some cases, all
 * uses should be taken with interrupts disabled at splsched(). The
 * "sfi_lock" also guards the "sfi_wait_class" field of thread_t, which
 * must only be accessed with the lock held.
84	*
85	* When an "on timer" fires, we must deterministically be able to drain
86	* the wait queue, since if any threads are added to the queue afterwards,
87	* they may never get woken out of SFI wait. So sfi_lock must be
88	* taken before the wait queue's own spinlock.
89	*
90	* The wait queue will take the thread's scheduling lock. We may also take
91	* the thread_lock directly to update the "sfi_class" field and determine
92	* if the thread should block in the wait queue, but the lock will be
93	* released before doing so.
94	*
95	* The pset lock may also be taken, but not while any other locks are held.
96	*
97	* The task and thread mutex may also be held while reevaluating sfi state.
98	*
 *    splsched ---> sfi_lock ---> waitq ---> thread_lock
 *        \  \              \__ thread_lock (*)
 *         \  \__ pset_lock
 *          \
 *           \__ thread_lock
104	*/
105
/* Lock taken around SFI configuration, phase state and timer updates in this file. */
decl_simple_lock_data(static,sfi_lock);
/* Timer call for the global "off" timer; bound to sfi_timer_global_off() in sfi_init(). */
static timer_call_data_t sfi_timer_call_entry;
/* While TRUE, sfi_timer_global_off() re-arms itself and sfi_ast() may block threads. */
volatile boolean_t sfi_is_enabled;

/* Set to TRUE by sfi_set_window() and back to FALSE by sfi_window_cancel(). */
boolean_t sfi_window_is_set;
/* SFI window length in microseconds, as accepted by sfi_set_window(). */
uint64_t sfi_window_usecs;
/* SFI window length converted to an absolute-time interval. */
uint64_t sfi_window_interval;
/* Deadline most recently recorded for the global "off" timer. */
uint64_t sfi_next_off_deadline;
113	uint64_t sfi_next_off_deadline;
114
/*
 * One registration record per SFI class. Records are emitted into the
 * "__DATA,__sfi_class_reg" section by SFI_CLASS_REGISTER and read back
 * by sfi_get_registration_data().
 */
typedef struct {
	sfi_class_id_t class_id;              /* SFI_CLASS_* identifier of the class */
	thread_continue_t class_continuation; /* continuation copied into sfi_classes[] by sfi_early_init() */
	const char * class_name;              /* "SFI_CLASS_<class>" string */
	const char * class_ledger_name;       /* "SFI_CLASS_<ledger>" string; several classes may use the same one */
} sfi_class_registration_t;
121
122	/*
123	* To add a new SFI class:
124	*
125	* 1) Raise MAX_SFI_CLASS_ID in mach/sfi_class.h
126	* 2) Add a #define for it to mach/sfi_class.h. It need not be inserted in order of restrictiveness.
127	* 3) Add a call to SFI_CLASS_REGISTER below
128	* 4) Augment sfi_thread_classify to categorize threads as early as possible for as restrictive as possible.
129	* 5) Modify thermald to use the SFI class
130	*/
131
/* Defined later in this file; declared here because the macro below calls it. */
static inline void _sfi_wait_cleanup(void);

/*
 * SFI_CLASS_REGISTER(clsid, ledger_name)
 *
 * For the class SFI_CLASS_<clsid>, this macro:
 *   - defines a noinline, noreturn continuation named
 *     SFI_<clsid>_THREAD_IS_WAITING that calls _sfi_wait_cleanup() and
 *     then thread_exception_return();
 *   - statically asserts that SFI_CLASS_<clsid> is below MAX_SFI_CLASS_ID;
 *   - defines a static sfi_class_registration_t for the class, placed in
 *     the "__DATA,__sfi_class_reg" section and marked "used".
 *
 * ledger_name selects the "SFI_CLASS_<ledger_name>" ledger string, so
 * classes registered with the same ledger_name get the same string.
 * The expansion has no trailing semicolon; the invocation supplies it.
 */
#define SFI_CLASS_REGISTER(clsid, ledger_name) \
static void __attribute__((noinline, noreturn)) \
SFI_ ## clsid ## _THREAD_IS_WAITING(void *arg __unused, wait_result_t wret __unused) \
{ \
	_sfi_wait_cleanup(); \
	thread_exception_return(); \
} \
 \
_Static_assert(SFI_CLASS_ ## clsid < MAX_SFI_CLASS_ID, "Invalid ID"); \
 \
__attribute__((section("__DATA,__sfi_class_reg"), used)) \
static sfi_class_registration_t SFI_ ## clsid ## _registration = { \
	.class_id = SFI_CLASS_ ## clsid, \
	.class_continuation = SFI_ ## clsid ## _THREAD_IS_WAITING, \
	.class_name = "SFI_CLASS_" # clsid, \
	.class_ledger_name = "SFI_CLASS_" # ledger_name, \
}
151
/*
 * SFI_CLASS_UNSPECIFIED not included here.
 *
 * The first argument names the class, the second names its ledger.
 * Classes that pass the same second argument (for example the
 * FOCAL/NONFOCAL pairs, or KERNEL and OPTED_OUT) get the same ledger
 * name string.
 */
SFI_CLASS_REGISTER(MAINTENANCE, MAINTENANCE);
SFI_CLASS_REGISTER(DARWIN_BG, DARWIN_BG);
SFI_CLASS_REGISTER(APP_NAP, APP_NAP);
SFI_CLASS_REGISTER(MANAGED_FOCAL, MANAGED);
SFI_CLASS_REGISTER(MANAGED_NONFOCAL, MANAGED);
SFI_CLASS_REGISTER(UTILITY, UTILITY);
SFI_CLASS_REGISTER(DEFAULT_FOCAL, DEFAULT);
SFI_CLASS_REGISTER(DEFAULT_NONFOCAL, DEFAULT);
SFI_CLASS_REGISTER(LEGACY_FOCAL, LEGACY);
SFI_CLASS_REGISTER(LEGACY_NONFOCAL, LEGACY);
SFI_CLASS_REGISTER(USER_INITIATED_FOCAL, USER_INITIATED);
SFI_CLASS_REGISTER(USER_INITIATED_NONFOCAL, USER_INITIATED);
SFI_CLASS_REGISTER(USER_INTERACTIVE_FOCAL, USER_INTERACTIVE);
SFI_CLASS_REGISTER(USER_INTERACTIVE_NONFOCAL, USER_INTERACTIVE);
SFI_CLASS_REGISTER(KERNEL, OPTED_OUT);
SFI_CLASS_REGISTER(OPTED_OUT, OPTED_OUT);
169
/*
 * Per-class SFI state; one entry per class id in sfi_classes[].
 */
struct sfi_class_state {
	uint64_t off_time_usecs;               /* configured off-time in microseconds; 0 after a cancel */
	uint64_t off_time_interval;            /* the same off-time as an absolute-time interval */

	timer_call_data_t on_timer;            /* per-class "on" timer, bound to sfi_timer_per_class_on() */
	uint64_t on_timer_deadline;            /* deadline last programmed for on_timer */
	boolean_t on_timer_programmed;         /* TRUE between arming on_timer and its firing or cancellation */

	boolean_t class_sfi_is_enabled;        /* TRUE while an off-time is configured for the class */
	volatile boolean_t class_in_on_phase;  /* FALSE while the class is in its "off" phase */

	struct waitq waitq;                    /* threads in ready state */
	thread_continue_t continuation;        /* continuation passed to thread_block_reason() in sfi_ast() */

	const char * class_name;               /* copied from the class registration */
	const char * class_ledger_name;        /* copied from the class registration */
};
187
/* Static configuration performed in sfi_early_init(); indexed by SFI class id */
struct sfi_class_state sfi_classes[MAX_SFI_CLASS_ID];

/* Number of classes whose class_sfi_is_enabled flag is currently TRUE */
int sfi_enabled_class_count;
192
193	static void sfi_timer_global_off(
194	timer_call_param_t param0,
195	timer_call_param_t param1);
196
197	static void sfi_timer_per_class_on(
198	timer_call_param_t param0,
199	timer_call_param_t param1);
200
201	static sfi_class_registration_t *
202	sfi_get_registration_data(unsigned long *count)
203	{
204	unsigned long sectlen = `0`;
205	void *sectdata;
206
207	sectdata = getsectdatafromheader(&_mh_execute_header, "__DATA", "__sfi_class_reg", &sectlen);
208	if (sectdata) {
209
210	if (sectlen % sizeof(sfi_class_registration_t) != `0`) {
211	/ corrupt data? /
212	panic("__sfi_class_reg section has invalid size %lu", sectlen);
213	__builtin_unreachable();
214	}
215
216	count = sectlen / sizeof*(sfi_class_registration_t);
217	return (sfi_class_registration_t *)sectdata;
218	} else {
219	panic("__sfi_class_reg section not found");
220	__builtin_unreachable();
221	}
222	}
223
224	/ Called early in boot, when kernel is single-threaded /
225	void sfi_early_init(void)
226	{
227	unsigned long i, count;
228	sfi_class_registration_t *registrations;
229
230	registrations = sfi_get_registration_data(&count);
231	for (i=`0`; i < count; i++) {
232	sfi_class_id_t class_id = registrations[i].class_id;
233
234	assert(class_id < MAX_SFI_CLASS_ID); / should be caught at compile-time /
235	if (class_id < MAX_SFI_CLASS_ID) {
236	if (sfi_classes[class_id].continuation != NULL) {
237	panic("Duplicate SFI registration for class 0x%x", class_id);
238	}
239	sfi_classes[class_id].class_sfi_is_enabled = FALSE;
240	sfi_classes[class_id].class_in_on_phase = TRUE;
241	sfi_classes[class_id].continuation = registrations[i].class_continuation;
242	sfi_classes[class_id].class_name = registrations[i].class_name;
243	sfi_classes[class_id].class_ledger_name = registrations[i].class_ledger_name;
244	}
245	}
246	}
247
248	void sfi_init(void)
249	{
250	sfi_class_id_t i;
251	kern_return_t kret;
252
253	simple_lock_init(&sfi_lock, `0`);
254	timer_call_setup(&sfi_timer_call_entry, sfi_timer_global_off, NULL);
255	sfi_window_is_set = FALSE;
256	sfi_enabled_class_count = `0`;
257	sfi_is_enabled = FALSE;
258
259	for (i = `0`; i < MAX_SFI_CLASS_ID; i++) {
260	/ If the class was set up in sfi_early_init(), initialize remaining fields /
261	if (sfi_classes[i].continuation) {
262	timer_call_setup(&sfi_classes[i].on_timer, sfi_timer_per_class_on, (void *)(uintptr_t)i);
263	sfi_classes[i].on_timer_programmed = FALSE;
264
265	kret = waitq_init(&sfi_classes[i].waitq, SYNC_POLICY_FIFO\|SYNC_POLICY_DISABLE_IRQ);
266	assert(kret == KERN_SUCCESS);
267	} else {
268	/ The only allowed gap is for SFI_CLASS_UNSPECIFIED /
269	if(i != SFI_CLASS_UNSPECIFIED) {
270	panic("Gap in registered SFI classes");
271	}
272	}
273	}
274	}
275
276	/ Can be called before sfi_init() by task initialization, but after sfi_early_init() /
277	sfi_class_id_t
278	sfi_get_ledger_alias_for_class(sfi_class_id_t class_id)
279	{
280	sfi_class_id_t i;
281	const char *ledger_name = NULL;
282
283	ledger_name = sfi_classes[class_id].class_ledger_name;
284
285	/ Find the first class in the registration table with this ledger name /
286	if (ledger_name) {
287	for (i = SFI_CLASS_UNSPECIFIED + `1`; i < class_id; i++) {
288	if (`0` == strcmp(sfi_classes[i].class_ledger_name, ledger_name)) {
289	dprintf("sfi_get_ledger_alias_for_class(0x%x) -> 0x%x\n", class_id, i);
290	return i;
291	}
292	}
293
294	/ This class is the primary one for the ledger, so there is no alias /
295	dprintf("sfi_get_ledger_alias_for_class(0x%x) -> 0x%x\n", class_id, SFI_CLASS_UNSPECIFIED);
296	return SFI_CLASS_UNSPECIFIED;
297	}
298
299	/ We are permissive on SFI class lookup failures. In sfi_init(), we assert more /
300	return SFI_CLASS_UNSPECIFIED;
301	}
302
303	int
304	sfi_ledger_entry_add(ledger_template_t template, sfi_class_id_t class_id)
305	{
306	const char *ledger_name = NULL;
307
308	ledger_name = sfi_classes[class_id].class_ledger_name;
309
310	dprintf("sfi_ledger_entry_add(%p, 0x%x) -> %s\n", template, class_id, ledger_name);
311	return ledger_entry_add(template, ledger_name, "sfi", "MATUs");
312	}
313
314	static void sfi_timer_global_off(
315	timer_call_param_t param0 __unused,
316	timer_call_param_t param1 __unused)
317	{
318	uint64_t now = mach_absolute_time();
319	sfi_class_id_t i;
320	processor_set_t pset, nset;
321	processor_t processor;
322	uint32_t needs_cause_ast_mask = `0x0`;
323	spl_t s;
324
325	s = splsched();
326
327	simple_lock(&sfi_lock);
328	if (!sfi_is_enabled) {
329	/ If SFI has been disabled, let all "on" timers drain naturally /
330	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_OFF_TIMER) \| DBG_FUNC_NONE, `1`, `0`, `0`, `0`, `0`);
331
332	simple_unlock(&sfi_lock);
333	splx(s);
334	return;
335	}
336
337	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_OFF_TIMER) \| DBG_FUNC_START, `0`, `0`, `0`, `0`, `0`);
338
339	/ First set all configured classes into the off state, and program their "on" timer /
340	for (i = `0`; i < MAX_SFI_CLASS_ID; i++) {
341	if (sfi_classes[i].class_sfi_is_enabled) {
342	uint64_t on_timer_deadline;
343
344	sfi_classes[i].class_in_on_phase = FALSE;
345	sfi_classes[i].on_timer_programmed = TRUE;
346
347	/ Push out on-timer /
348	on_timer_deadline = now + sfi_classes[i].off_time_interval;
349	sfi_classes[i].on_timer_deadline = on_timer_deadline;
350
351	timer_call_enter1(&sfi_classes[i].on_timer, NULL, on_timer_deadline, TIMER_CALL_SYS_CRITICAL);
352	} else {
353	/ If this class no longer needs SFI, make sure the timer is cancelled /
354	sfi_classes[i].class_in_on_phase = TRUE;
355	if (sfi_classes[i].on_timer_programmed) {
356	sfi_classes[i].on_timer_programmed = FALSE;
357	sfi_classes[i].on_timer_deadline = ~`0ULL`;
358	timer_call_cancel(&sfi_classes[i].on_timer);
359	}
360	}
361	}
362	simple_unlock(&sfi_lock);
363
364	/ Iterate over processors, call cause_ast_check() on ones running a thread that should be in an off phase /
365	processor = processor_list;
366	pset = processor->processor_set;
367
368	pset_lock(pset);
369
370	do {
371	nset = processor->processor_set;
372	if (nset != pset) {
373	pset_unlock(pset);
374	pset = nset;
375	pset_lock(pset);
376	}
377
378	/ "processor" and its pset are locked /
379	if (processor->state == PROCESSOR_RUNNING) {
380	if (AST_NONE != sfi_processor_needs_ast(processor)) {
381	needs_cause_ast_mask \|= (`1U` << processor->cpu_id);
382	}
383	}
384	} while ((processor = processor->processor_list) != NULL);
385
386	pset_unlock(pset);
387
388	for (int cpuid = lsb_first(needs_cause_ast_mask); cpuid >= `0`; cpuid = lsb_next(needs_cause_ast_mask, cpuid)) {
389	processor = processor_array[cpuid];
390	if (processor == current_processor()) {
391	ast_on(AST_SFI);
392	} else {
393	cause_ast_check(processor);
394	}
395	}
396
397	/ Re-arm timer if still enabled /
398	simple_lock(&sfi_lock);
399	if (sfi_is_enabled) {
400	clock_deadline_for_periodic_event(sfi_window_interval,
401	now,
402	&sfi_next_off_deadline);
403	timer_call_enter1(&sfi_timer_call_entry,
404	NULL,
405	sfi_next_off_deadline,
406	TIMER_CALL_SYS_CRITICAL);
407	}
408
409	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_OFF_TIMER) \| DBG_FUNC_END, `0`, `0`, `0`, `0`, `0`);
410
411	simple_unlock(&sfi_lock);
412
413	splx(s);
414	}
415
416	static void sfi_timer_per_class_on(
417	timer_call_param_t param0,
418	timer_call_param_t param1 __unused)
419	{
420	sfi_class_id_t sfi_class_id = (sfi_class_id_t)(uintptr_t)param0;
421	struct sfi_class_state *sfi_class = &sfi_classes[sfi_class_id];
422	kern_return_t kret;
423	spl_t s;
424
425	s = splsched();
426
427	simple_lock(&sfi_lock);
428
429	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_ON_TIMER) \| DBG_FUNC_START, sfi_class_id, `0`, `0`, `0`, `0`);
430
431	/*
432	* Any threads that may have accumulated in the ready queue for this class should get re-enqueued.
433	* Since we have the sfi_lock held and have changed "class_in_on_phase", we expect
434	* no new threads to be put on this wait queue until the global "off timer" has fired.
435	*/
436
437	sfi_class->class_in_on_phase = TRUE;
438	sfi_class->on_timer_programmed = FALSE;
439
440	kret = waitq_wakeup64_all(&sfi_class->waitq,
441	CAST_EVENT64_T(sfi_class_id),
442	THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
443	assert(kret == KERN_SUCCESS \|\| kret == KERN_NOT_WAITING);
444
445	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_ON_TIMER) \| DBG_FUNC_END, `0`, `0`, `0`, `0`, `0`);
446
447	simple_unlock(&sfi_lock);
448
449	splx(s);
450	}
451
452
453	kern_return_t sfi_set_window(uint64_t window_usecs)
454	{
455	uint64_t interval, deadline;
456	uint64_t now = mach_absolute_time();
457	sfi_class_id_t i;
458	spl_t s;
459	uint64_t largest_class_off_interval = `0`;
460
461	if (window_usecs < MIN_SFI_WINDOW_USEC)
462	window_usecs = MIN_SFI_WINDOW_USEC;
463
464	if (window_usecs > UINT32_MAX)
465	return (KERN_INVALID_ARGUMENT);
466
467	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_SET_WINDOW), window_usecs, `0`, `0`, `0`, `0`);
468
469	clock_interval_to_absolutetime_interval((uint32_t)window_usecs, NSEC_PER_USEC, &interval);
470	deadline = now + interval;
471
472	s = splsched();
473
474	simple_lock(&sfi_lock);
475
476	/ Check that we are not bringing in the SFI window smaller than any class /
477	for (i = `0`; i < MAX_SFI_CLASS_ID; i++) {
478	if (sfi_classes[i].class_sfi_is_enabled) {
479	largest_class_off_interval = MAX(largest_class_off_interval, sfi_classes[i].off_time_interval);
480	}
481	}
482
483	/*
484	* Off window must be strictly greater than all enabled classes,
485	* otherwise threads would build up on ready queue and never be able to run.
486	*/
487	if (interval <= largest_class_off_interval) {
488	simple_unlock(&sfi_lock);
489	splx(s);
490	return (KERN_INVALID_ARGUMENT);
491	}
492
493	/*
494	* If the new "off" deadline is further out than the current programmed timer,
495	* just let the current one expire (and the new cadence will be established thereafter).
496	* If the new "off" deadline is nearer than the current one, bring it in, so we
497	* can start the new behavior sooner. Note that this may cause the "off" timer to
498	* fire before some of the class "on" timers have fired.
499	*/
500	sfi_window_usecs = window_usecs;
501	sfi_window_interval = interval;
502	sfi_window_is_set = TRUE;
503
504	if (sfi_enabled_class_count == `0`) {
505	/ Can't program timer yet /
506	} else if (!sfi_is_enabled) {
507	sfi_is_enabled = TRUE;
508	sfi_next_off_deadline = deadline;
509	timer_call_enter1(&sfi_timer_call_entry,
510	NULL,
511	sfi_next_off_deadline,
512	TIMER_CALL_SYS_CRITICAL);
513	} else if (deadline >= sfi_next_off_deadline) {
514	sfi_next_off_deadline = deadline;
515	} else {
516	sfi_next_off_deadline = deadline;
517	timer_call_enter1(&sfi_timer_call_entry,
518	NULL,
519	sfi_next_off_deadline,
520	TIMER_CALL_SYS_CRITICAL);
521	}
522
523	simple_unlock(&sfi_lock);
524	splx(s);
525
526	return (KERN_SUCCESS);
527	}
528
529	kern_return_t sfi_window_cancel(void)
530	{
531	spl_t s;
532
533	s = splsched();
534
535	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_CANCEL_WINDOW), `0`, `0`, `0`, `0`, `0`);
536
537	/ Disable globals so that global "off-timer" is not re-armed /
538	simple_lock(&sfi_lock);
539	sfi_window_is_set = FALSE;
540	sfi_window_usecs = `0`;
541	sfi_window_interval = `0`;
542	sfi_next_off_deadline = `0`;
543	sfi_is_enabled = FALSE;
544	simple_unlock(&sfi_lock);
545
546	splx(s);
547
548	return (KERN_SUCCESS);
549	}
550
551	/ Defers SFI off and per-class on timers (if live) by the specified interval*
552	* in Mach Absolute Time Units. Currently invoked to align with the global
553	* forced idle mechanism. Making some simplifying assumptions, the iterative GFI
554	* induced SFI on+off deferrals form a geometric series that converges to yield
555	* an effective SFI duty cycle that is scaled by the GFI duty cycle. Initial phase
556	* alignment and congruency of the SFI/GFI periods can distort this to some extent.
557	*/
558
559	kern_return_t sfi_defer(uint64_t sfi_defer_matus)
560	{
561	spl_t s;
562	kern_return_t kr = KERN_FAILURE;
563	s = splsched();
564
565	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_GLOBAL_DEFER), sfi_defer_matus, `0`, `0`, `0`, `0`);
566
567	simple_lock(&sfi_lock);
568	if (!sfi_is_enabled) {
569	goto sfi_defer_done;
570	}
571
572	assert(sfi_next_off_deadline != `0`);
573
574	sfi_next_off_deadline += sfi_defer_matus;
575	timer_call_enter1(&sfi_timer_call_entry, NULL, sfi_next_off_deadline, TIMER_CALL_SYS_CRITICAL);
576
577	int i;
578	for (i = `0`; i < MAX_SFI_CLASS_ID; i++) {
579	if (sfi_classes[i].class_sfi_is_enabled) {
580	if (sfi_classes[i].on_timer_programmed) {
581	uint64_t new_on_deadline = sfi_classes[i].on_timer_deadline + sfi_defer_matus;
582	sfi_classes[i].on_timer_deadline = new_on_deadline;
583	timer_call_enter1(&sfi_classes[i].on_timer, NULL, new_on_deadline, TIMER_CALL_SYS_CRITICAL);
584	}
585	}
586	}
587
588	kr = KERN_SUCCESS;
589	sfi_defer_done:
590	simple_unlock(&sfi_lock);
591
592	splx(s);
593
594	return (kr);
595	}
596
597
598	kern_return_t sfi_get_window(uint64_t *window_usecs)
599	{
600	spl_t s;
601	uint64_t off_window_us;
602
603	s = splsched();
604	simple_lock(&sfi_lock);
605
606	off_window_us = sfi_window_usecs;
607
608	simple_unlock(&sfi_lock);
609	splx(s);
610
611	*window_usecs = off_window_us;
612
613	return (KERN_SUCCESS);
614	}
615
616
617	kern_return_t sfi_set_class_offtime(sfi_class_id_t class_id, uint64_t offtime_usecs)
618	{
619	uint64_t interval;
620	spl_t s;
621	uint64_t off_window_interval;
622
623	if (offtime_usecs < MIN_SFI_WINDOW_USEC)
624	offtime_usecs = MIN_SFI_WINDOW_USEC;
625
626	if (class_id == SFI_CLASS_UNSPECIFIED \|\| class_id >= MAX_SFI_CLASS_ID)
627	return (KERN_INVALID_ARGUMENT);
628
629	if (offtime_usecs > UINT32_MAX)
630	return (KERN_INVALID_ARGUMENT);
631
632	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_SET_CLASS_OFFTIME), offtime_usecs, class_id, `0`, `0`, `0`);
633
634	clock_interval_to_absolutetime_interval((uint32_t)offtime_usecs, NSEC_PER_USEC, &interval);
635
636	s = splsched();
637
638	simple_lock(&sfi_lock);
639	off_window_interval = sfi_window_interval;
640
641	/ Check that we are not bringing in class off-time larger than the SFI window /
642	if (off_window_interval && (interval >= off_window_interval)) {
643	simple_unlock(&sfi_lock);
644	splx(s);
645	return (KERN_INVALID_ARGUMENT);
646	}
647
648	/ We never re-program the per-class on-timer, but rather just let it expire naturally /
649	if (!sfi_classes[class_id].class_sfi_is_enabled) {
650	sfi_enabled_class_count++;
651	}
652	sfi_classes[class_id].off_time_usecs = offtime_usecs;
653	sfi_classes[class_id].off_time_interval = interval;
654	sfi_classes[class_id].class_sfi_is_enabled = TRUE;
655
656	if (sfi_window_is_set && !sfi_is_enabled) {
657	/ start global off timer /
658	sfi_is_enabled = TRUE;
659	sfi_next_off_deadline = mach_absolute_time() + sfi_window_interval;
660	timer_call_enter1(&sfi_timer_call_entry,
661	NULL,
662	sfi_next_off_deadline,
663	TIMER_CALL_SYS_CRITICAL);
664	}
665
666	simple_unlock(&sfi_lock);
667
668	splx(s);
669
670	return (KERN_SUCCESS);
671	}
672
673	kern_return_t sfi_class_offtime_cancel(sfi_class_id_t class_id)
674	{
675	spl_t s;
676
677	if (class_id == SFI_CLASS_UNSPECIFIED \|\| class_id >= MAX_SFI_CLASS_ID)
678	return (KERN_INVALID_ARGUMENT);
679
680	s = splsched();
681
682	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_CANCEL_CLASS_OFFTIME), class_id, `0`, `0`, `0`, `0`);
683
684	simple_lock(&sfi_lock);
685
686	/ We never re-program the per-class on-timer, but rather just let it expire naturally /
687	if (sfi_classes[class_id].class_sfi_is_enabled) {
688	sfi_enabled_class_count--;
689	}
690	sfi_classes[class_id].off_time_usecs = `0`;
691	sfi_classes[class_id].off_time_interval = `0`;
692	sfi_classes[class_id].class_sfi_is_enabled = FALSE;
693
694	if (sfi_enabled_class_count == `0`) {
695	sfi_is_enabled = FALSE;
696	}
697
698	simple_unlock(&sfi_lock);
699
700	splx(s);
701
702	return (KERN_SUCCESS);
703	}
704
705	kern_return_t sfi_get_class_offtime(sfi_class_id_t class_id, uint64_t *offtime_usecs)
706	{
707	uint64_t off_time_us;
708	spl_t s;
709
710	if (class_id == SFI_CLASS_UNSPECIFIED \|\| class_id >= MAX_SFI_CLASS_ID)
711	return (`0`);
712
713	s = splsched();
714
715	simple_lock(&sfi_lock);
716	off_time_us = sfi_classes[class_id].off_time_usecs;
717	simple_unlock(&sfi_lock);
718
719	splx(s);
720
721	*offtime_usecs = off_time_us;
722
723	return (KERN_SUCCESS);
724	}
725
726	/*
727	* sfi_thread_classify and sfi_processor_active_thread_classify perform the critical
728	* role of quickly categorizing a thread into its SFI class so that an AST_SFI can be
729	* set. As the thread is unwinding to userspace, sfi_ast() performs full locking
730	* and determines whether the thread should enter an SFI wait state. Because of
731	* the inherent races between the time the AST is set and when it is evaluated,
732	* thread classification can be inaccurate (but should always be safe). This is
733	* especially the case for sfi_processor_active_thread_classify, which must
734	* classify the active thread on a remote processor without taking the thread lock.
735	* When in doubt, classification should err on the side of not classifying a
736	* thread at all, and wait for the thread itself to either hit a quantum expiration
737	* or block inside the kernel.
738	*/
739
740	/*
741	* Thread must be locked. Ultimately, the real decision to enter
742	* SFI wait happens at the AST boundary.
743	*/
744	sfi_class_id_t sfi_thread_classify(thread_t thread)
745	{
746	task_t task = thread->task;
747	boolean_t is_kernel_thread = (task == kernel_task);
748	sched_mode_t thmode = thread->sched_mode;
749	boolean_t focal = FALSE;
750
751	int task_role = proc_get_effective_task_policy(task, TASK_POLICY_ROLE);
752	int latency_qos = proc_get_effective_task_policy(task, TASK_POLICY_LATENCY_QOS);
753	int managed_task = proc_get_effective_task_policy(task, TASK_POLICY_SFI_MANAGED);
754
755	int thread_qos = proc_get_effective_thread_policy(thread, TASK_POLICY_QOS);
756	int thread_bg = proc_get_effective_thread_policy(thread, TASK_POLICY_DARWIN_BG);
757
758	/ kernel threads never reach the user AST boundary, and are in a separate world for SFI /
759	if (is_kernel_thread) {
760	return SFI_CLASS_KERNEL;
761	}
762
763	if (thread_qos == THREAD_QOS_MAINTENANCE)
764	return SFI_CLASS_MAINTENANCE;
765
766	if (thread_bg \|\| thread_qos == THREAD_QOS_BACKGROUND) {
767	return SFI_CLASS_DARWIN_BG;
768	}
769
770	if (latency_qos != `0`) {
771	int latency_qos_wtf = latency_qos - `1`;
772
773	if ((latency_qos_wtf >= `4`) && (latency_qos_wtf <= `5`)) {
774	return SFI_CLASS_APP_NAP;
775	}
776	}
777
778	/*
779	* Realtime and fixed priority threads express their duty cycle constraints
780	* via other mechanisms, and are opted out of (most) forms of SFI
781	*/
782	if (thmode == TH_MODE_REALTIME \|\| thmode == TH_MODE_FIXED \|\| task_role == TASK_GRAPHICS_SERVER) {
783	return SFI_CLASS_OPTED_OUT;
784	}
785
786	/*
787	* Threads with unspecified, legacy, or user-initiated QOS class can be individually managed.
788	*/
789	switch (task_role) {
790	case TASK_CONTROL_APPLICATION:
791	case TASK_FOREGROUND_APPLICATION:
792	focal = TRUE;
793	break;
794	case TASK_BACKGROUND_APPLICATION:
795	case TASK_DEFAULT_APPLICATION:
796	case TASK_UNSPECIFIED:
797	/ Focal if the task is in a coalition with a FG/focal app /
798	if (task_coalition_focal_count(thread->task) > `0`)
799	focal = TRUE;
800	break;
801	case TASK_THROTTLE_APPLICATION:
802	case TASK_DARWINBG_APPLICATION:
803	case TASK_NONUI_APPLICATION:
804	/ Definitely not focal /
805	default:
806	break;
807	}
808
809	if (managed_task) {
810	switch (thread_qos) {
811	case THREAD_QOS_UNSPECIFIED:
812	case THREAD_QOS_LEGACY:
813	case THREAD_QOS_USER_INITIATED:
814	if (focal)
815	return SFI_CLASS_MANAGED_FOCAL;
816	else
817	return SFI_CLASS_MANAGED_NONFOCAL;
818	default:
819	break;
820	}
821	}
822
823	if (thread_qos == THREAD_QOS_UTILITY)
824	return SFI_CLASS_UTILITY;
825
826	/*
827	* Classify threads in non-managed tasks
828	*/
829	if (focal) {
830	switch (thread_qos) {
831	case THREAD_QOS_USER_INTERACTIVE:
832	return SFI_CLASS_USER_INTERACTIVE_FOCAL;
833	case THREAD_QOS_USER_INITIATED:
834	return SFI_CLASS_USER_INITIATED_FOCAL;
835	case THREAD_QOS_LEGACY:
836	return SFI_CLASS_LEGACY_FOCAL;
837	default:
838	return SFI_CLASS_DEFAULT_FOCAL;
839	}
840	} else {
841	switch (thread_qos) {
842	case THREAD_QOS_USER_INTERACTIVE:
843	return SFI_CLASS_USER_INTERACTIVE_NONFOCAL;
844	case THREAD_QOS_USER_INITIATED:
845	return SFI_CLASS_USER_INITIATED_NONFOCAL;
846	case THREAD_QOS_LEGACY:
847	return SFI_CLASS_LEGACY_NONFOCAL;
848	default:
849	return SFI_CLASS_DEFAULT_NONFOCAL;
850	}
851	}
852	}
853
854	/*
855	* pset must be locked.
856	*/
857	sfi_class_id_t sfi_processor_active_thread_classify(processor_t processor)
858	{
859	return processor->current_sfi_class;
860	}
861
862	/*
863	* thread must be locked. This is inherently racy, with the intent that
864	* at the AST boundary, it will be fully evaluated whether we need to
865	* perform an AST wait
866	*/
867	ast_t sfi_thread_needs_ast(thread_t thread, sfi_class_id_t *out_class)
868	{
869	sfi_class_id_t class_id;
870
871	class_id = sfi_thread_classify(thread);
872
873	if (out_class)
874	*out_class = class_id;
875
876	/ No lock taken, so a stale value may be used. /
877	if (!sfi_classes[class_id].class_in_on_phase)
878	return AST_SFI;
879	else
880	return AST_NONE;
881	}
882
883	/*
884	* pset must be locked. We take the SFI class for
885	* the currently running thread which is cached on
886	* the processor_t, and assume it is accurate. In the
887	* worst case, the processor will get an IPI and be asked
888	* to evaluate if the current running thread at that
889	* later point in time should be in an SFI wait.
890	*/
891	ast_t sfi_processor_needs_ast(processor_t processor)
892	{
893	sfi_class_id_t class_id;
894
895	class_id = sfi_processor_active_thread_classify(processor);
896
897	/ No lock taken, so a stale value may be used. /
898	if (!sfi_classes[class_id].class_in_on_phase)
899	return AST_SFI;
900	else
901	return AST_NONE;
902
903	}
904
905	static inline void _sfi_wait_cleanup(void)
906	{
907	thread_t self = current_thread();
908
909	spl_t s = splsched();
910	simple_lock(&sfi_lock);
911
912	sfi_class_id_t current_sfi_wait_class = self->sfi_wait_class;
913
914	assert((SFI_CLASS_UNSPECIFIED < current_sfi_wait_class) &&
915	(current_sfi_wait_class < MAX_SFI_CLASS_ID));
916
917	self->sfi_wait_class = SFI_CLASS_UNSPECIFIED;
918
919	simple_unlock(&sfi_lock);
920	splx(s);
921
922	/*
923	* It's possible for the thread to be woken up due to the SFI period
924	* ending before it finishes blocking. In that case,
925	* wait_sfi_begin_time won't be set.
926	*
927	* Derive the time sacrificed to SFI by looking at when this thread was
928	* awoken by the on-timer, to avoid counting the time this thread spent
929	* waiting to get scheduled.
930	*
931	* Note that last_made_runnable_time could be reset if this thread
932	* gets preempted before we read the value. To fix that, we'd need to
933	* track wait time in a thread timer, sample the timer before blocking,
934	* pass the value through thread->parameter, and subtract that.
935	*/
936
937	if (self->wait_sfi_begin_time != `0`) {
938	#if !CONFIG_EMBEDDED
939	uint64_t made_runnable = os_atomic_load(&self->last_made_runnable_time, relaxed);
940	int64_t sfi_wait_time = made_runnable - self->wait_sfi_begin_time;
941	assert(sfi_wait_time >= `0`);
942
943	ledger_credit(self->task->ledger, task_ledgers.sfi_wait_times[current_sfi_wait_class],
944	sfi_wait_time);
945	#endif /* !CONFIG_EMBEDDED */
946
947	self->wait_sfi_begin_time = `0`;
948	}
949	}
950
951	/*
952	* Called at AST context to fully evaluate if the current thread
953	* (which is obviously running) should instead block in an SFI wait.
954	* We must take the sfi_lock to check whether we are in the "off" period
955	* for the class, and if so, block.
956	*/
957	void sfi_ast(thread_t thread)
958	{
959	sfi_class_id_t class_id;
960	spl_t s;
961	struct sfi_class_state *sfi_class;
962	wait_result_t waitret;
963	boolean_t did_wait = FALSE;
964	thread_continue_t continuation;
965
966	s = splsched();
967
968	simple_lock(&sfi_lock);
969
970	if (!sfi_is_enabled) {
971	/*
972	* SFI is not enabled, or has recently been disabled.
973	* There is no point putting this thread on a deferred ready
974	* queue, even if it were classified as needing it, since
975	* SFI will truly be off at the next global off timer
976	*/
977	simple_unlock(&sfi_lock);
978	splx(s);
979
980	return;
981	}
982
983	thread_lock(thread);
984	thread->sfi_class = class_id = sfi_thread_classify(thread);
985	thread_unlock(thread);
986
987	/*
988	* Once the sfi_lock is taken and the thread's ->sfi_class field is updated, we
989	* are committed to transitioning to whatever state is indicated by "->class_in_on_phase".
990	* If another thread tries to call sfi_reevaluate() after this point, it will take the
991	* sfi_lock and see the thread in this wait state. If another thread calls
992	* sfi_reevaluate() before this point, it would see a runnable thread and at most
993	* attempt to send an AST to this processor, but we would have the most accurate
994	* classification.
995	*/
996
997	sfi_class = &sfi_classes[class_id];
998	if (!sfi_class->class_in_on_phase) {
999	/ Need to block thread in wait queue /
1000	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_THREAD_DEFER),
1001	thread_tid(thread), class_id, `0`, `0`, `0`);
1002
1003	waitret = waitq_assert_wait64(&sfi_class->waitq,
1004	CAST_EVENT64_T(class_id),
1005	THREAD_INTERRUPTIBLE \| THREAD_WAIT_NOREPORT, `0`);
1006	if (waitret == THREAD_WAITING) {
1007	thread->sfi_wait_class = class_id;
1008	did_wait = TRUE;
1009	continuation = sfi_class->continuation;
1010	} else {
1011	/ thread may be exiting already, all other errors are unexpected /
1012	assert(waitret == THREAD_INTERRUPTED);
1013	}
1014	}
1015	simple_unlock(&sfi_lock);
1016
1017	splx(s);
1018
1019	if (did_wait) {
1020	assert(thread->wait_sfi_begin_time == `0`);
1021
1022	thread_block_reason(continuation, NULL, AST_SFI);
1023	}
1024	}
1025
1026	/ Thread must be unlocked /
1027	void sfi_reevaluate(thread_t thread)
1028	{
1029	kern_return_t kret;
1030	spl_t s;
1031	sfi_class_id_t class_id, current_class_id;
1032	ast_t sfi_ast;
1033
1034	s = splsched();
1035
1036	simple_lock(&sfi_lock);
1037
1038	thread_lock(thread);
1039	sfi_ast = sfi_thread_needs_ast(thread, &class_id);
1040	thread->sfi_class = class_id;
1041
1042	/*
1043	* This routine chiefly exists to boost threads out of an SFI wait
1044	* if their classification changes before the "on" timer fires.
1045	*
1046	* If we calculate that a thread is in a different ->sfi_wait_class
1047	* than we think it should be (including no-SFI-wait), we need to
1048	* correct that:
1049	*
1050	* If the thread is in SFI wait and should not be (or should be waiting
1051	* on a different class' "on" timer), we wake it up. If needed, the
1052	* thread may immediately block again in the different SFI wait state.
1053	*
1054	* If the thread is not in an SFI wait state and it should be, we need
1055	* to get that thread's attention, possibly by sending an AST to another
1056	* processor.
1057	*/
1058
1059	if ((current_class_id = thread->sfi_wait_class) != SFI_CLASS_UNSPECIFIED) {
1060
1061	thread_unlock(thread); / not needed anymore /
1062
1063	assert(current_class_id < MAX_SFI_CLASS_ID);
1064
1065	if ((sfi_ast == AST_NONE) \|\| (class_id != current_class_id)) {
1066	struct sfi_class_state *sfi_class = &sfi_classes[current_class_id];
1067
1068	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_WAIT_CANCELED), thread_tid(thread), current_class_id, class_id, `0`, `0`);
1069
1070	kret = waitq_wakeup64_thread(&sfi_class->waitq,
1071	CAST_EVENT64_T(current_class_id),
1072	thread,
1073	THREAD_AWAKENED);
1074	assert(kret == KERN_SUCCESS \|\| kret == KERN_NOT_WAITING);
1075	}
1076	} else {
1077	/*
1078	* Thread's current SFI wait class is not set, and because we
1079	* have the sfi_lock, it won't get set.
1080	*/
1081
1082	if ((thread->state & (TH_RUN \| TH_IDLE)) == TH_RUN) {
1083	if (sfi_ast != AST_NONE) {
1084	if (thread == current_thread())
1085	ast_on(sfi_ast);
1086	else {
1087	processor_t processor = thread->last_processor;
1088
1089	if (processor != PROCESSOR_NULL &&
1090	processor->state == PROCESSOR_RUNNING &&
1091	processor->active_thread == thread) {
1092	cause_ast_check(processor);
1093	} else {
1094	/*
1095	* Runnable thread that's not on a CPU currently. When a processor
1096	* does context switch to it, the AST will get set based on whether
1097	* the thread is in its "off time".
1098	*/
1099	}
1100	}
1101	}
1102	}
1103
1104	thread_unlock(thread);
1105	}
1106
1107	simple_unlock(&sfi_lock);
1108	splx(s);
1109	}
1110
1111	#else /* !CONFIG_SCHED_SFI */
1112
1113	kern_return_t sfi_set_window(uint64_t window_usecs __unused)
1114	{
1115	return (KERN_NOT_SUPPORTED);
1116	}
1117
1118	kern_return_t sfi_window_cancel(void)
1119	{
1120	return (KERN_NOT_SUPPORTED);
1121	}
1122
1123
1124	kern_return_t sfi_get_window(uint64_t *window_usecs __unused)
1125	{
1126	return (KERN_NOT_SUPPORTED);
1127	}
1128
1129
1130	kern_return_t sfi_set_class_offtime(sfi_class_id_t class_id __unused, uint64_t offtime_usecs __unused)
1131	{
1132	return (KERN_NOT_SUPPORTED);
1133	}
1134
1135	kern_return_t sfi_class_offtime_cancel(sfi_class_id_t class_id __unused)
1136	{
1137	return (KERN_NOT_SUPPORTED);
1138	}
1139
1140	kern_return_t sfi_get_class_offtime(sfi_class_id_t class_id __unused, uint64_t *offtime_usecs __unused)
1141	{
1142	return (KERN_NOT_SUPPORTED);
1143	}
1144
1145	void sfi_reevaluate(thread_t thread __unused)
1146	{
1147	return;
1148	}
1149
1150	sfi_class_id_t sfi_thread_classify(thread_t thread)
1151	{
1152	task_t task = thread->task;
1153	boolean_t is_kernel_thread = (task == kernel_task);
1154
1155	if (is_kernel_thread) {
1156	return SFI_CLASS_KERNEL;
1157	}
1158
1159	return SFI_CLASS_OPTED_OUT;
1160	}
1161
1162	#endif /* !CONFIG_SCHED_SFI */
1163

Browse the source code of xnu/osfmk/kern/sfi.c