1 | /* |
2 | * Copyright (c) 2000-2015 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | |
29 | #include <mach/mach_types.h> |
30 | #include <mach/thread_act_server.h> |
31 | |
32 | #include <kern/kern_types.h> |
33 | #include <kern/processor.h> |
34 | #include <kern/thread.h> |
35 | #include <kern/affinity.h> |
36 | #include <kern/work_interval.h> |
37 | #include <mach/task_policy.h> |
38 | #include <kern/sfi.h> |
39 | #include <kern/policy_internal.h> |
40 | #include <sys/errno.h> |
41 | #include <sys/ulock.h> |
42 | |
43 | #include <mach/machine/sdt.h> |
44 | |
45 | static KALLOC_TYPE_DEFINE(thread_qos_override_zone, |
46 | struct thread_qos_override, KT_DEFAULT); |
47 | |
48 | #ifdef MACH_BSD |
49 | extern int proc_selfpid(void); |
50 | extern char * proc_name_address(void *p); |
51 | extern void rethrottle_thread(void * uthread); |
52 | #endif /* MACH_BSD */ |
53 | |
#define QOS_EXTRACT(q) ((q) & 0xff)
55 | |
56 | #define QOS_OVERRIDE_MODE_OVERHANG_PEAK 0 |
57 | #define QOS_OVERRIDE_MODE_IGNORE_OVERRIDE 1 |
58 | #define QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE 2 |
59 | #define QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_SINGLE_MUTEX_OVERRIDE 3 |
60 | |
TUNABLE(uint32_t, qos_override_mode, "qos_override_mode",
62 | QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_SINGLE_MUTEX_OVERRIDE); |
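/*
 * Illustrative note: because qos_override_mode is a TUNABLE bound to the
 * "qos_override_mode" boot-arg, the override behavior can be chosen at boot.
 * For example, booting with "qos_override_mode=2" selects
 * QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE instead of the default
 * QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_SINGLE_MUTEX_OVERRIDE.
 */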
63 | |
64 | static void |
65 | proc_thread_qos_remove_override_internal(thread_t thread, user_addr_t resource, int resource_type, boolean_t reset); |
66 | |
67 | const int thread_default_iotier_override = THROTTLE_LEVEL_END; |
68 | |
69 | const struct thread_requested_policy default_thread_requested_policy = { |
70 | .thrp_iotier_kevent_override = thread_default_iotier_override |
71 | }; |
72 | |
73 | /* |
74 | * THREAD_QOS_UNSPECIFIED is assigned the highest tier available, so it does not provide a limit |
75 | * to threads that don't have a QoS class set. |
76 | */ |
77 | const qos_policy_params_t thread_qos_policy_params = { |
78 | /* |
79 | * This table defines the starting base priority of the thread, |
80 | * which will be modified by the thread importance and the task max priority |
81 | * before being applied. |
82 | */ |
83 | .qos_pri[THREAD_QOS_UNSPECIFIED] = 0, /* not consulted */ |
84 | .qos_pri[THREAD_QOS_USER_INTERACTIVE] = BASEPRI_BACKGROUND, /* i.e. 46 */ |
85 | .qos_pri[THREAD_QOS_USER_INITIATED] = BASEPRI_USER_INITIATED, |
86 | .qos_pri[THREAD_QOS_LEGACY] = BASEPRI_DEFAULT, |
87 | .qos_pri[THREAD_QOS_UTILITY] = BASEPRI_UTILITY, |
88 | .qos_pri[THREAD_QOS_BACKGROUND] = MAXPRI_THROTTLE, |
89 | .qos_pri[THREAD_QOS_MAINTENANCE] = MAXPRI_THROTTLE, |
90 | |
91 | /* |
92 | * This table defines the highest IO priority that a thread marked with this |
93 | * QoS class can have. |
94 | */ |
95 | .qos_iotier[THREAD_QOS_UNSPECIFIED] = THROTTLE_LEVEL_TIER0, |
96 | .qos_iotier[THREAD_QOS_USER_INTERACTIVE] = THROTTLE_LEVEL_TIER0, |
97 | .qos_iotier[THREAD_QOS_USER_INITIATED] = THROTTLE_LEVEL_TIER0, |
98 | .qos_iotier[THREAD_QOS_LEGACY] = THROTTLE_LEVEL_TIER0, |
99 | .qos_iotier[THREAD_QOS_UTILITY] = THROTTLE_LEVEL_TIER1, |
100 | .qos_iotier[THREAD_QOS_BACKGROUND] = THROTTLE_LEVEL_TIER2, /* possibly overridden by bg_iotier */ |
101 | .qos_iotier[THREAD_QOS_MAINTENANCE] = THROTTLE_LEVEL_TIER3, |
102 | |
103 | /* |
104 | * This table defines the highest QoS level that |
105 | * a thread marked with this QoS class can have. |
106 | */ |
107 | |
108 | .qos_through_qos[THREAD_QOS_UNSPECIFIED] = QOS_EXTRACT(THROUGHPUT_QOS_TIER_UNSPECIFIED), |
109 | .qos_through_qos[THREAD_QOS_USER_INTERACTIVE] = QOS_EXTRACT(THROUGHPUT_QOS_TIER_0), |
110 | .qos_through_qos[THREAD_QOS_USER_INITIATED] = QOS_EXTRACT(THROUGHPUT_QOS_TIER_1), |
111 | .qos_through_qos[THREAD_QOS_LEGACY] = QOS_EXTRACT(THROUGHPUT_QOS_TIER_1), |
112 | .qos_through_qos[THREAD_QOS_UTILITY] = QOS_EXTRACT(THROUGHPUT_QOS_TIER_2), |
113 | .qos_through_qos[THREAD_QOS_BACKGROUND] = QOS_EXTRACT(THROUGHPUT_QOS_TIER_5), |
114 | .qos_through_qos[THREAD_QOS_MAINTENANCE] = QOS_EXTRACT(THROUGHPUT_QOS_TIER_5), |
115 | |
116 | .qos_latency_qos[THREAD_QOS_UNSPECIFIED] = QOS_EXTRACT(LATENCY_QOS_TIER_UNSPECIFIED), |
117 | .qos_latency_qos[THREAD_QOS_USER_INTERACTIVE] = QOS_EXTRACT(LATENCY_QOS_TIER_0), |
118 | .qos_latency_qos[THREAD_QOS_USER_INITIATED] = QOS_EXTRACT(LATENCY_QOS_TIER_1), |
119 | .qos_latency_qos[THREAD_QOS_LEGACY] = QOS_EXTRACT(LATENCY_QOS_TIER_1), |
120 | .qos_latency_qos[THREAD_QOS_UTILITY] = QOS_EXTRACT(LATENCY_QOS_TIER_3), |
121 | .qos_latency_qos[THREAD_QOS_BACKGROUND] = QOS_EXTRACT(LATENCY_QOS_TIER_3), |
122 | .qos_latency_qos[THREAD_QOS_MAINTENANCE] = QOS_EXTRACT(LATENCY_QOS_TIER_3), |
123 | }; |
124 | |
125 | static void |
126 | thread_set_user_sched_mode_and_recompute_pri(thread_t thread, sched_mode_t mode); |
127 | |
128 | static int |
129 | thread_qos_scaled_relative_priority(int qos, int qos_relprio); |
130 | |
131 | static void |
132 | proc_get_thread_policy_bitfield(thread_t thread, thread_policy_state_t info); |
133 | |
134 | static void |
135 | proc_set_thread_policy_locked(thread_t thread, int category, int flavor, int value, int value2, task_pend_token_t pend_token); |
136 | |
137 | static void |
138 | proc_set_thread_policy_spinlocked(thread_t thread, int category, int flavor, int value, int value2, task_pend_token_t pend_token); |
139 | |
140 | static void |
141 | thread_set_requested_policy_spinlocked(thread_t thread, int category, int flavor, int value, int value2, task_pend_token_t pend_token); |
142 | |
143 | static int |
144 | thread_get_requested_policy_spinlocked(thread_t thread, int category, int flavor, int* value2); |
145 | |
146 | static int |
147 | proc_get_thread_policy_locked(thread_t thread, int category, int flavor, int* value2); |
148 | |
149 | static void |
150 | thread_policy_update_spinlocked(thread_t thread, bool recompute_priority, task_pend_token_t pend_token); |
151 | |
152 | static void |
153 | thread_policy_update_internal_spinlocked(thread_t thread, bool recompute_priority, task_pend_token_t pend_token); |
154 | |
155 | boolean_t |
156 | thread_has_qos_policy(thread_t thread) |
157 | { |
158 | return (proc_get_thread_policy(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS) != THREAD_QOS_UNSPECIFIED) ? TRUE : FALSE; |
159 | } |
160 | |
161 | |
162 | static void |
163 | thread_remove_qos_policy_locked(thread_t thread, |
164 | task_pend_token_t pend_token) |
165 | { |
166 | __unused int prev_qos = thread->requested_policy.thrp_qos; |
167 | |
168 | DTRACE_PROC2(qos__remove, thread_t, thread, int, prev_qos); |
169 | |
170 | proc_set_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_AND_RELPRIO, |
THREAD_QOS_UNSPECIFIED, 0, pend_token);
172 | } |
173 | |
174 | kern_return_t |
175 | thread_remove_qos_policy(thread_t thread) |
176 | { |
177 | struct task_pend_token pend_token = {}; |
178 | |
179 | thread_mtx_lock(thread); |
180 | if (!thread->active) { |
181 | thread_mtx_unlock(thread); |
182 | return KERN_TERMINATED; |
183 | } |
184 | |
thread_remove_qos_policy_locked(thread, &pend_token);
186 | |
187 | thread_mtx_unlock(thread); |
188 | |
thread_policy_update_complete_unlocked(thread, &pend_token);
190 | |
191 | return KERN_SUCCESS; |
192 | } |
193 | |
194 | |
195 | boolean_t |
196 | thread_is_static_param(thread_t thread) |
197 | { |
198 | if (thread->static_param) { |
199 | DTRACE_PROC1(qos__legacy__denied, thread_t, thread); |
200 | return TRUE; |
201 | } |
202 | return FALSE; |
203 | } |
204 | |
205 | /* |
206 | * Relative priorities can range between 0REL and -15REL. These |
207 | * map to QoS-specific ranges, to create non-overlapping priority |
208 | * ranges. |
209 | */ |
210 | static int |
211 | thread_qos_scaled_relative_priority(int qos, int qos_relprio) |
212 | { |
213 | int next_lower_qos; |
214 | |
215 | /* Fast path, since no validation or scaling is needed */ |
216 | if (qos_relprio == 0) { |
217 | return 0; |
218 | } |
219 | |
220 | switch (qos) { |
221 | case THREAD_QOS_USER_INTERACTIVE: |
222 | next_lower_qos = THREAD_QOS_USER_INITIATED; |
223 | break; |
224 | case THREAD_QOS_USER_INITIATED: |
225 | next_lower_qos = THREAD_QOS_LEGACY; |
226 | break; |
227 | case THREAD_QOS_LEGACY: |
228 | next_lower_qos = THREAD_QOS_UTILITY; |
229 | break; |
230 | case THREAD_QOS_UTILITY: |
231 | next_lower_qos = THREAD_QOS_BACKGROUND; |
232 | break; |
233 | case THREAD_QOS_MAINTENANCE: |
234 | case THREAD_QOS_BACKGROUND: |
235 | next_lower_qos = 0; |
236 | break; |
237 | default: |
panic("Unrecognized QoS %d", qos);
239 | return 0; |
240 | } |
241 | |
242 | int prio_range_max = thread_qos_policy_params.qos_pri[qos]; |
243 | int prio_range_min = next_lower_qos ? thread_qos_policy_params.qos_pri[next_lower_qos] : 0; |
244 | |
245 | /* |
246 | * We now have the valid range that the scaled relative priority can map to. Note |
247 | * that the lower bound is exclusive, but the upper bound is inclusive. If the |
248 | * range is (21,31], 0REL should map to 31 and -15REL should map to 22. We use the |
249 | * fact that the max relative priority is -15 and use ">>4" to divide by 16 and discard |
250 | * remainder. |
251 | */ |
252 | int scaled_relprio = -(((prio_range_max - prio_range_min) * (-qos_relprio)) >> 4); |
253 | |
254 | return scaled_relprio; |
255 | } |
256 | |
257 | /* |
258 | * flag set by -qos-policy-allow boot-arg to allow |
259 | * testing thread qos policy from userspace |
260 | */ |
static TUNABLE(bool, allow_qos_policy_set, "-qos-policy-allow", false);
262 | |
263 | kern_return_t |
264 | thread_policy_set( |
265 | thread_t thread, |
266 | thread_policy_flavor_t flavor, |
267 | thread_policy_t policy_info, |
268 | mach_msg_type_number_t count) |
269 | { |
270 | thread_qos_policy_data_t req_qos; |
271 | kern_return_t kr; |
272 | |
273 | req_qos.qos_tier = THREAD_QOS_UNSPECIFIED; |
274 | |
275 | if (thread == THREAD_NULL) { |
276 | return KERN_INVALID_ARGUMENT; |
277 | } |
278 | |
279 | if (!allow_qos_policy_set) { |
280 | if (thread_is_static_param(thread)) { |
281 | return KERN_POLICY_STATIC; |
282 | } |
283 | |
284 | if (flavor == THREAD_QOS_POLICY) { |
285 | return KERN_INVALID_ARGUMENT; |
286 | } |
287 | |
288 | if (flavor == THREAD_TIME_CONSTRAINT_WITH_PRIORITY_POLICY) { |
289 | if (count < THREAD_TIME_CONSTRAINT_WITH_PRIORITY_POLICY_COUNT) { |
290 | return KERN_INVALID_ARGUMENT; |
291 | } |
292 | thread_time_constraint_with_priority_policy_t info = (thread_time_constraint_with_priority_policy_t)policy_info; |
293 | if (info->priority != BASEPRI_RTQUEUES) { |
294 | return KERN_INVALID_ARGUMENT; |
295 | } |
296 | } |
297 | } |
298 | |
299 | if (flavor == THREAD_TIME_CONSTRAINT_POLICY || flavor == THREAD_TIME_CONSTRAINT_WITH_PRIORITY_POLICY) { |
300 | thread_work_interval_flags_t th_wi_flags = os_atomic_load( |
301 | &thread->th_work_interval_flags, relaxed); |
302 | if ((th_wi_flags & TH_WORK_INTERVAL_FLAGS_HAS_WORKLOAD_ID) && |
303 | !(th_wi_flags & TH_WORK_INTERVAL_FLAGS_RT_ALLOWED)) { |
/* Fail requests to become realtime for threads that have joined a
 * workload-ID work interval without the rt-allowed flag. */
306 | return KERN_INVALID_POLICY; |
307 | } |
308 | } |
309 | |
310 | /* Threads without static_param set reset their QoS when other policies are applied. */ |
311 | if (thread->requested_policy.thrp_qos != THREAD_QOS_UNSPECIFIED) { |
312 | /* Store the existing tier, if we fail this call it is used to reset back. */ |
313 | req_qos.qos_tier = thread->requested_policy.thrp_qos; |
314 | req_qos.tier_importance = thread->requested_policy.thrp_qos_relprio; |
315 | |
316 | kr = thread_remove_qos_policy(thread); |
317 | if (kr != KERN_SUCCESS) { |
318 | return kr; |
319 | } |
320 | } |
321 | |
322 | kr = thread_policy_set_internal(thread, flavor, policy_info, count); |
323 | |
324 | if (req_qos.qos_tier != THREAD_QOS_UNSPECIFIED) { |
325 | if (kr != KERN_SUCCESS) { |
326 | /* Reset back to our original tier as the set failed. */ |
(void)thread_policy_set_internal(thread, THREAD_QOS_POLICY, (thread_policy_t)&req_qos, THREAD_QOS_POLICY_COUNT);
328 | } |
329 | } |
330 | |
331 | return kr; |
332 | } |
333 | |
334 | static_assert(offsetof(thread_time_constraint_with_priority_policy_data_t, period) == offsetof(thread_time_constraint_policy_data_t, period)); |
335 | static_assert(offsetof(thread_time_constraint_with_priority_policy_data_t, computation) == offsetof(thread_time_constraint_policy_data_t, computation)); |
336 | static_assert(offsetof(thread_time_constraint_with_priority_policy_data_t, constraint) == offsetof(thread_time_constraint_policy_data_t, constraint)); |
337 | static_assert(offsetof(thread_time_constraint_with_priority_policy_data_t, preemptible) == offsetof(thread_time_constraint_policy_data_t, preemptible)); |
338 | |
339 | kern_return_t |
340 | thread_policy_set_internal( |
341 | thread_t thread, |
342 | thread_policy_flavor_t flavor, |
343 | thread_policy_t policy_info, |
344 | mach_msg_type_number_t count) |
345 | { |
346 | kern_return_t result = KERN_SUCCESS; |
347 | struct task_pend_token pend_token = {}; |
348 | |
349 | thread_mtx_lock(thread); |
350 | if (!thread->active) { |
351 | thread_mtx_unlock(thread); |
352 | |
353 | return KERN_TERMINATED; |
354 | } |
355 | |
356 | switch (flavor) { |
357 | case THREAD_EXTENDED_POLICY: |
358 | { |
359 | boolean_t timeshare = TRUE; |
360 | |
361 | if (count >= THREAD_EXTENDED_POLICY_COUNT) { |
362 | thread_extended_policy_t info; |
363 | |
364 | info = (thread_extended_policy_t)policy_info; |
365 | timeshare = info->timeshare; |
366 | } |
367 | |
368 | sched_mode_t mode = (timeshare == TRUE) ? TH_MODE_TIMESHARE : TH_MODE_FIXED; |
369 | |
370 | spl_t s = splsched(); |
371 | thread_lock(thread); |
372 | |
373 | thread_set_user_sched_mode_and_recompute_pri(thread, mode); |
374 | |
375 | thread_unlock(thread); |
376 | splx(s); |
377 | |
378 | /* |
379 | * The thread may be demoted with RT_DISALLOWED but has just |
380 | * changed its sched mode to TIMESHARE or FIXED. Make sure to |
381 | * undemote the thread so the new sched mode takes effect. |
382 | */ |
383 | thread_rt_evaluate(thread); |
384 | |
385 | pend_token.tpt_update_thread_sfi = 1; |
386 | |
387 | break; |
388 | } |
389 | |
390 | case THREAD_TIME_CONSTRAINT_POLICY: |
391 | case THREAD_TIME_CONSTRAINT_WITH_PRIORITY_POLICY: |
392 | { |
393 | thread_time_constraint_with_priority_policy_t info; |
394 | |
395 | mach_msg_type_number_t min_count = (flavor == THREAD_TIME_CONSTRAINT_POLICY ? |
396 | THREAD_TIME_CONSTRAINT_POLICY_COUNT : |
397 | THREAD_TIME_CONSTRAINT_WITH_PRIORITY_POLICY_COUNT); |
398 | |
399 | if (count < min_count) { |
400 | result = KERN_INVALID_ARGUMENT; |
401 | break; |
402 | } |
403 | |
404 | info = (thread_time_constraint_with_priority_policy_t)policy_info; |
405 | |
406 | |
407 | if (info->constraint < info->computation || |
408 | info->computation > max_rt_quantum || |
409 | info->computation < min_rt_quantum) { |
410 | result = KERN_INVALID_ARGUMENT; |
411 | break; |
412 | } |
413 | |
414 | if (info->computation < (info->constraint / 2)) { |
415 | info->computation = (info->constraint / 2); |
416 | if (info->computation > max_rt_quantum) { |
417 | info->computation = max_rt_quantum; |
418 | } |
419 | } |
420 | |
421 | if (flavor == THREAD_TIME_CONSTRAINT_WITH_PRIORITY_POLICY) { |
422 | if ((info->priority < BASEPRI_RTQUEUES) || (info->priority > MAXPRI)) { |
423 | result = KERN_INVALID_ARGUMENT; |
424 | break; |
425 | } |
426 | } |
427 | |
428 | spl_t s = splsched(); |
429 | thread_lock(thread); |
430 | |
431 | thread->realtime.period = info->period; |
432 | thread->realtime.computation = info->computation; |
433 | thread->realtime.constraint = info->constraint; |
434 | thread->realtime.preemptible = info->preemptible; |
435 | |
436 | /* |
437 | * If the thread has a work interval driven policy, the priority |
438 | * offset has been set by the work interval. |
439 | */ |
440 | if (!thread->requested_policy.thrp_wi_driven) { |
441 | if (flavor == THREAD_TIME_CONSTRAINT_WITH_PRIORITY_POLICY) { |
442 | thread->realtime.priority_offset = (uint8_t)(info->priority - BASEPRI_RTQUEUES); |
443 | } else { |
444 | thread->realtime.priority_offset = 0; |
445 | } |
446 | } |
447 | |
thread_set_user_sched_mode_and_recompute_pri(thread, TH_MODE_REALTIME);
449 | |
450 | thread_unlock(thread); |
451 | splx(s); |
452 | |
453 | thread_rt_evaluate(thread); |
454 | |
455 | pend_token.tpt_update_thread_sfi = 1; |
456 | |
457 | break; |
458 | } |
459 | |
460 | case THREAD_PRECEDENCE_POLICY: |
461 | { |
462 | thread_precedence_policy_t info; |
463 | |
464 | if (count < THREAD_PRECEDENCE_POLICY_COUNT) { |
465 | result = KERN_INVALID_ARGUMENT; |
466 | break; |
467 | } |
468 | info = (thread_precedence_policy_t)policy_info; |
469 | |
470 | spl_t s = splsched(); |
471 | thread_lock(thread); |
472 | |
473 | thread->importance = info->importance; |
474 | |
475 | thread_recompute_priority(thread); |
476 | |
477 | thread_unlock(thread); |
478 | splx(s); |
479 | |
480 | break; |
481 | } |
482 | |
483 | case THREAD_AFFINITY_POLICY: |
484 | { |
485 | extern boolean_t affinity_sets_enabled; |
486 | thread_affinity_policy_t info; |
487 | |
488 | if (!affinity_sets_enabled) { |
489 | result = KERN_INVALID_POLICY; |
490 | break; |
491 | } |
492 | |
493 | if (!thread_affinity_is_supported()) { |
494 | result = KERN_NOT_SUPPORTED; |
495 | break; |
496 | } |
497 | if (count < THREAD_AFFINITY_POLICY_COUNT) { |
498 | result = KERN_INVALID_ARGUMENT; |
499 | break; |
500 | } |
501 | |
502 | info = (thread_affinity_policy_t) policy_info; |
503 | /* |
504 | * Unlock the thread mutex here and |
505 | * return directly after calling thread_affinity_set(). |
506 | * This is necessary for correct lock ordering because |
507 | * thread_affinity_set() takes the task lock. |
508 | */ |
509 | thread_mtx_unlock(thread); |
return thread_affinity_set(thread, info->affinity_tag);
511 | } |
512 | |
513 | #if !defined(XNU_TARGET_OS_OSX) |
514 | case THREAD_BACKGROUND_POLICY: |
515 | { |
516 | thread_background_policy_t info; |
517 | |
518 | if (count < THREAD_BACKGROUND_POLICY_COUNT) { |
519 | result = KERN_INVALID_ARGUMENT; |
520 | break; |
521 | } |
522 | |
523 | if (get_threadtask(thread) != current_task()) { |
524 | result = KERN_PROTECTION_FAILURE; |
525 | break; |
526 | } |
527 | |
528 | info = (thread_background_policy_t) policy_info; |
529 | |
530 | int enable; |
531 | |
532 | if (info->priority == THREAD_BACKGROUND_POLICY_DARWIN_BG) { |
533 | enable = TASK_POLICY_ENABLE; |
534 | } else { |
535 | enable = TASK_POLICY_DISABLE; |
536 | } |
537 | |
538 | int category = (current_thread() == thread) ? TASK_POLICY_INTERNAL : TASK_POLICY_EXTERNAL; |
539 | |
540 | proc_set_thread_policy_locked(thread, category, TASK_POLICY_DARWIN_BG, enable, 0, &pend_token); |
541 | |
542 | break; |
543 | } |
544 | #endif /* !defined(XNU_TARGET_OS_OSX) */ |
545 | |
546 | case THREAD_THROUGHPUT_QOS_POLICY: |
547 | { |
548 | thread_throughput_qos_policy_t info = (thread_throughput_qos_policy_t) policy_info; |
549 | thread_throughput_qos_t tqos; |
550 | |
551 | if (count < THREAD_THROUGHPUT_QOS_POLICY_COUNT) { |
552 | result = KERN_INVALID_ARGUMENT; |
553 | break; |
554 | } |
555 | |
556 | if ((result = qos_throughput_policy_validate(info->thread_throughput_qos_tier)) != KERN_SUCCESS) { |
557 | break; |
558 | } |
559 | |
560 | tqos = qos_extract(info->thread_throughput_qos_tier); |
561 | |
562 | proc_set_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, |
TASK_POLICY_THROUGH_QOS, tqos, 0, &pend_token);
564 | |
565 | break; |
566 | } |
567 | |
568 | case THREAD_LATENCY_QOS_POLICY: |
569 | { |
570 | thread_latency_qos_policy_t info = (thread_latency_qos_policy_t) policy_info; |
571 | thread_latency_qos_t lqos; |
572 | |
573 | if (count < THREAD_LATENCY_QOS_POLICY_COUNT) { |
574 | result = KERN_INVALID_ARGUMENT; |
575 | break; |
576 | } |
577 | |
578 | if ((result = qos_latency_policy_validate(info->thread_latency_qos_tier)) != KERN_SUCCESS) { |
579 | break; |
580 | } |
581 | |
582 | lqos = qos_extract(info->thread_latency_qos_tier); |
583 | |
584 | proc_set_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, |
TASK_POLICY_LATENCY_QOS, lqos, 0, &pend_token);
586 | |
587 | break; |
588 | } |
589 | |
590 | case THREAD_QOS_POLICY: |
591 | { |
592 | thread_qos_policy_t info = (thread_qos_policy_t)policy_info; |
593 | |
594 | if (count < THREAD_QOS_POLICY_COUNT) { |
595 | result = KERN_INVALID_ARGUMENT; |
596 | break; |
597 | } |
598 | |
599 | if (info->qos_tier < 0 || info->qos_tier >= THREAD_QOS_LAST) { |
600 | result = KERN_INVALID_ARGUMENT; |
601 | break; |
602 | } |
603 | |
604 | if (info->tier_importance > 0 || info->tier_importance < THREAD_QOS_MIN_TIER_IMPORTANCE) { |
605 | result = KERN_INVALID_ARGUMENT; |
606 | break; |
607 | } |
608 | |
609 | if (info->qos_tier == THREAD_QOS_UNSPECIFIED && info->tier_importance != 0) { |
610 | result = KERN_INVALID_ARGUMENT; |
611 | break; |
612 | } |
613 | |
614 | proc_set_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_AND_RELPRIO, |
info->qos_tier, -info->tier_importance, &pend_token);
616 | |
617 | break; |
618 | } |
619 | |
620 | default: |
621 | result = KERN_INVALID_ARGUMENT; |
622 | break; |
623 | } |
624 | |
625 | thread_mtx_unlock(thread); |
626 | |
thread_policy_update_complete_unlocked(thread, &pend_token);
628 | |
629 | return result; |
630 | } |
631 | |
632 | /* |
633 | * Note that there is no implemented difference between POLICY_RR and POLICY_FIFO. |
634 | * Both result in FIXED mode scheduling. |
635 | */ |
636 | static sched_mode_t |
637 | convert_policy_to_sched_mode(integer_t policy) |
638 | { |
639 | switch (policy) { |
640 | case POLICY_TIMESHARE: |
641 | return TH_MODE_TIMESHARE; |
642 | case POLICY_RR: |
643 | case POLICY_FIFO: |
644 | return TH_MODE_FIXED; |
645 | default: |
panic("unexpected sched policy: %d", policy);
647 | return TH_MODE_NONE; |
648 | } |
649 | } |
650 | |
651 | /* |
652 | * Called either with the thread mutex locked |
653 | * or from the pthread kext in a 'safe place'. |
654 | */ |
655 | static kern_return_t |
656 | thread_set_mode_and_absolute_pri_internal(thread_t thread, |
657 | sched_mode_t mode, |
658 | integer_t priority, |
659 | task_pend_token_t pend_token) |
660 | { |
661 | kern_return_t kr = KERN_SUCCESS; |
662 | |
663 | spl_t s = splsched(); |
664 | thread_lock(thread); |
665 | |
666 | /* This path isn't allowed to change a thread out of realtime. */ |
667 | if ((thread->sched_mode == TH_MODE_REALTIME) || |
668 | (thread->saved_mode == TH_MODE_REALTIME)) { |
669 | kr = KERN_FAILURE; |
670 | goto unlock; |
671 | } |
672 | |
673 | if (thread->policy_reset) { |
674 | kr = KERN_SUCCESS; |
675 | goto unlock; |
676 | } |
677 | |
678 | sched_mode_t old_mode = thread->sched_mode; |
679 | integer_t old_base_pri = thread->base_pri; |
680 | integer_t old_sched_pri = thread->sched_pri; |
681 | |
682 | /* |
683 | * Reverse engineer and apply the correct importance value |
684 | * from the requested absolute priority value. |
685 | * |
686 | * TODO: Store the absolute priority value instead |
687 | */ |
688 | |
689 | if (priority >= thread->max_priority) { |
690 | priority = thread->max_priority - thread->task_priority; |
691 | } else if (priority >= MINPRI_KERNEL) { |
692 | priority -= MINPRI_KERNEL; |
693 | } else if (priority >= MINPRI_RESERVED) { |
694 | priority -= MINPRI_RESERVED; |
695 | } else { |
696 | priority -= BASEPRI_DEFAULT; |
697 | } |
698 | |
699 | priority += thread->task_priority; |
700 | |
701 | if (priority > thread->max_priority) { |
702 | priority = thread->max_priority; |
703 | } else if (priority < MINPRI) { |
704 | priority = MINPRI; |
705 | } |
706 | |
707 | thread->importance = priority - thread->task_priority; |
708 | |
709 | thread_set_user_sched_mode_and_recompute_pri(thread, mode); |
710 | |
711 | if (mode != old_mode) { |
712 | pend_token->tpt_update_thread_sfi = 1; |
713 | } |
714 | |
715 | if (thread->base_pri != old_base_pri || |
716 | thread->sched_pri != old_sched_pri) { |
717 | pend_token->tpt_update_turnstile = 1; |
718 | } |
719 | |
720 | unlock: |
721 | thread_unlock(thread); |
722 | splx(s); |
723 | |
724 | return kr; |
725 | } |
726 | |
727 | void |
728 | thread_freeze_base_pri(thread_t thread) |
729 | { |
730 | assert(thread == current_thread()); |
731 | |
732 | spl_t s = splsched(); |
733 | thread_lock(thread); |
734 | |
735 | assert((thread->sched_flags & TH_SFLAG_BASE_PRI_FROZEN) == 0); |
736 | thread->sched_flags |= TH_SFLAG_BASE_PRI_FROZEN; |
737 | |
738 | thread_unlock(thread); |
739 | splx(s); |
740 | } |
741 | |
742 | bool |
743 | thread_unfreeze_base_pri(thread_t thread) |
744 | { |
745 | assert(thread == current_thread()); |
746 | integer_t base_pri; |
747 | ast_t ast = 0; |
748 | |
749 | spl_t s = splsched(); |
750 | thread_lock(thread); |
751 | |
752 | assert(thread->sched_flags & TH_SFLAG_BASE_PRI_FROZEN); |
753 | thread->sched_flags &= ~TH_SFLAG_BASE_PRI_FROZEN; |
754 | |
755 | base_pri = thread->req_base_pri; |
756 | if (base_pri != thread->base_pri) { |
757 | /* |
758 | * This function returns "true" if the base pri change |
759 | * is the most likely cause for the preemption. |
760 | */ |
sched_set_thread_base_priority(thread, base_pri);
762 | ast = ast_peek(AST_PREEMPT); |
763 | } |
764 | |
765 | thread_unlock(thread); |
766 | splx(s); |
767 | |
768 | return ast != 0; |
769 | } |
770 | |
771 | uint8_t |
772 | thread_workq_pri_for_qos(thread_qos_t qos) |
773 | { |
774 | assert(qos < THREAD_QOS_LAST); |
775 | return (uint8_t)thread_qos_policy_params.qos_pri[qos]; |
776 | } |
777 | |
778 | thread_qos_t |
779 | thread_workq_qos_for_pri(int priority) |
780 | { |
781 | thread_qos_t qos; |
782 | if (priority > thread_qos_policy_params.qos_pri[THREAD_QOS_USER_INTERACTIVE]) { |
783 | // indicate that workq should map >UI threads to workq's |
784 | // internal notation for above-UI work. |
785 | return THREAD_QOS_UNSPECIFIED; |
786 | } |
787 | for (qos = THREAD_QOS_USER_INTERACTIVE; qos > THREAD_QOS_MAINTENANCE; qos--) { |
788 | // map a given priority up to the next nearest qos band. |
789 | if (thread_qos_policy_params.qos_pri[qos - 1] < priority) { |
790 | return qos; |
791 | } |
792 | } |
793 | return THREAD_QOS_MAINTENANCE; |
794 | } |
795 | |
796 | /* |
797 | * private interface for pthread workqueues |
798 | * |
799 | * Set scheduling policy & absolute priority for thread |
800 | * May be called with spinlocks held |
801 | * Thread mutex lock is not held |
802 | */ |
803 | void |
804 | thread_reset_workq_qos(thread_t thread, uint32_t qos) |
805 | { |
806 | struct task_pend_token pend_token = {}; |
807 | |
808 | assert(qos < THREAD_QOS_LAST); |
809 | |
810 | spl_t s = splsched(); |
811 | thread_lock(thread); |
812 | |
813 | proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, |
TASK_POLICY_QOS_AND_RELPRIO, qos, 0, &pend_token);
815 | proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, |
TASK_POLICY_QOS_WORKQ_OVERRIDE, THREAD_QOS_UNSPECIFIED, 0, &pend_token);
818 | |
819 | assert(pend_token.tpt_update_sockets == 0); |
820 | |
821 | thread_unlock(thread); |
822 | splx(s); |
823 | |
thread_policy_update_complete_unlocked(thread, &pend_token);
825 | } |
826 | |
827 | /* |
828 | * private interface for pthread workqueues |
829 | * |
830 | * Set scheduling policy & absolute priority for thread |
831 | * May be called with spinlocks held |
832 | * Thread mutex lock is held |
833 | */ |
834 | void |
835 | thread_set_workq_override(thread_t thread, uint32_t qos) |
836 | { |
837 | struct task_pend_token pend_token = {}; |
838 | |
839 | assert(qos < THREAD_QOS_LAST); |
840 | |
841 | spl_t s = splsched(); |
842 | thread_lock(thread); |
843 | |
844 | proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, |
TASK_POLICY_QOS_WORKQ_OVERRIDE, qos, 0, &pend_token);
846 | |
847 | assert(pend_token.tpt_update_sockets == 0); |
848 | |
849 | thread_unlock(thread); |
850 | splx(s); |
851 | |
thread_policy_update_complete_unlocked(thread, &pend_token);
853 | } |
854 | |
855 | /* |
856 | * private interface for pthread workqueues |
857 | * |
858 | * Set scheduling policy & absolute priority for thread |
859 | * May be called with spinlocks held |
860 | * Thread mutex lock is not held |
861 | */ |
862 | void |
863 | thread_set_workq_pri(thread_t thread, |
864 | thread_qos_t qos, |
865 | integer_t priority, |
866 | integer_t policy) |
867 | { |
868 | struct task_pend_token pend_token = {}; |
869 | sched_mode_t mode = convert_policy_to_sched_mode(policy); |
870 | |
871 | assert(qos < THREAD_QOS_LAST); |
872 | assert(thread->static_param); |
873 | |
874 | if (!thread->static_param || !thread->active) { |
875 | return; |
876 | } |
877 | |
878 | spl_t s = splsched(); |
879 | thread_lock(thread); |
880 | |
881 | proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, |
TASK_POLICY_QOS_AND_RELPRIO, qos, 0, &pend_token);
883 | proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, |
TASK_POLICY_QOS_WORKQ_OVERRIDE, THREAD_QOS_UNSPECIFIED, 0, &pend_token);
886 | |
887 | thread_unlock(thread); |
888 | splx(s); |
889 | |
890 | /* Concern: this doesn't hold the mutex... */ |
891 | |
892 | __assert_only kern_return_t kr; |
893 | kr = thread_set_mode_and_absolute_pri_internal(thread, mode, priority, |
&pend_token);
895 | assert(kr == KERN_SUCCESS); |
896 | |
897 | assert(pend_token.tpt_update_sockets == 0); |
898 | |
thread_policy_update_complete_unlocked(thread, &pend_token);
900 | } |
901 | |
902 | /* |
903 | * thread_set_mode_and_absolute_pri: |
904 | * |
905 | * Set scheduling policy & absolute priority for thread, for deprecated |
906 | * thread_set_policy and thread_policy interfaces. |
907 | * |
908 | * Called with nothing locked. |
909 | */ |
910 | kern_return_t |
911 | thread_set_mode_and_absolute_pri(thread_t thread, |
912 | integer_t policy, |
913 | integer_t priority) |
914 | { |
915 | kern_return_t kr = KERN_SUCCESS; |
916 | struct task_pend_token pend_token = {}; |
917 | |
918 | sched_mode_t mode = convert_policy_to_sched_mode(policy); |
919 | |
920 | thread_mtx_lock(thread); |
921 | |
922 | if (!thread->active) { |
923 | kr = KERN_TERMINATED; |
924 | goto unlock; |
925 | } |
926 | |
927 | if (thread_is_static_param(thread)) { |
928 | kr = KERN_POLICY_STATIC; |
929 | goto unlock; |
930 | } |
931 | |
932 | /* Setting legacy policies on threads kills the current QoS */ |
933 | if (thread->requested_policy.thrp_qos != THREAD_QOS_UNSPECIFIED) { |
thread_remove_qos_policy_locked(thread, &pend_token);
935 | } |
936 | |
kr = thread_set_mode_and_absolute_pri_internal(thread, mode, priority, &pend_token);
938 | |
939 | unlock: |
940 | thread_mtx_unlock(thread); |
941 | |
thread_policy_update_complete_unlocked(thread, &pend_token);
943 | |
944 | return kr; |
945 | } |
946 | |
947 | /* |
948 | * Set the thread's requested mode and recompute priority |
949 | * Called with thread mutex and thread locked |
950 | * |
951 | * TODO: Mitigate potential problems caused by moving thread to end of runq |
952 | * whenever its priority is recomputed |
953 | * Only remove when it actually changes? Attempt to re-insert at appropriate location? |
954 | */ |
955 | static void |
956 | thread_set_user_sched_mode_and_recompute_pri(thread_t thread, sched_mode_t mode) |
957 | { |
958 | if (thread->policy_reset) { |
959 | return; |
960 | } |
961 | |
962 | boolean_t removed = thread_run_queue_remove(thread); |
963 | |
964 | sched_set_thread_mode_user(thread, mode); |
965 | |
966 | thread_recompute_priority(thread); |
967 | |
968 | if (removed) { |
thread_run_queue_reinsert(thread, SCHED_TAILQ);
970 | } |
971 | } |
972 | |
973 | /* called at splsched with thread lock locked */ |
974 | static void |
975 | thread_update_qos_cpu_time_locked(thread_t thread) |
976 | { |
977 | task_t task = get_threadtask(thread); |
978 | uint64_t timer_sum, timer_delta; |
979 | |
980 | /* |
 * This is only as accurate as the thread's last context switch or user/kernel
982 | * transition (unless precise user/kernel time is disabled). |
983 | * |
984 | * TODO: Consider running an update operation here to update it first. |
985 | * Maybe doable with interrupts disabled from current thread. |
986 | * If the thread is on a different core, may not be easy to get right. |
987 | */ |
988 | |
989 | timer_sum = recount_thread_time_mach(thread); |
990 | timer_delta = timer_sum - thread->vtimer_qos_save; |
991 | |
992 | thread->vtimer_qos_save = timer_sum; |
993 | |
994 | uint64_t* task_counter = NULL; |
995 | |
996 | /* Update the task-level effective and requested qos stats atomically, because we don't have the task lock. */ |
997 | switch (thread->effective_policy.thep_qos) { |
998 | case THREAD_QOS_UNSPECIFIED: task_counter = &task->cpu_time_eqos_stats.cpu_time_qos_default; break; |
999 | case THREAD_QOS_MAINTENANCE: task_counter = &task->cpu_time_eqos_stats.cpu_time_qos_maintenance; break; |
1000 | case THREAD_QOS_BACKGROUND: task_counter = &task->cpu_time_eqos_stats.cpu_time_qos_background; break; |
1001 | case THREAD_QOS_UTILITY: task_counter = &task->cpu_time_eqos_stats.cpu_time_qos_utility; break; |
1002 | case THREAD_QOS_LEGACY: task_counter = &task->cpu_time_eqos_stats.cpu_time_qos_legacy; break; |
1003 | case THREAD_QOS_USER_INITIATED: task_counter = &task->cpu_time_eqos_stats.cpu_time_qos_user_initiated; break; |
1004 | case THREAD_QOS_USER_INTERACTIVE: task_counter = &task->cpu_time_eqos_stats.cpu_time_qos_user_interactive; break; |
1005 | default: |
panic("unknown effective QoS: %d", thread->effective_policy.thep_qos);
1007 | } |
1008 | |
1009 | OSAddAtomic64(timer_delta, task_counter); |
1010 | |
1011 | /* Update the task-level qos stats atomically, because we don't have the task lock. */ |
1012 | switch (thread->requested_policy.thrp_qos) { |
1013 | case THREAD_QOS_UNSPECIFIED: task_counter = &task->cpu_time_rqos_stats.cpu_time_qos_default; break; |
1014 | case THREAD_QOS_MAINTENANCE: task_counter = &task->cpu_time_rqos_stats.cpu_time_qos_maintenance; break; |
1015 | case THREAD_QOS_BACKGROUND: task_counter = &task->cpu_time_rqos_stats.cpu_time_qos_background; break; |
1016 | case THREAD_QOS_UTILITY: task_counter = &task->cpu_time_rqos_stats.cpu_time_qos_utility; break; |
1017 | case THREAD_QOS_LEGACY: task_counter = &task->cpu_time_rqos_stats.cpu_time_qos_legacy; break; |
1018 | case THREAD_QOS_USER_INITIATED: task_counter = &task->cpu_time_rqos_stats.cpu_time_qos_user_initiated; break; |
1019 | case THREAD_QOS_USER_INTERACTIVE: task_counter = &task->cpu_time_rqos_stats.cpu_time_qos_user_interactive; break; |
1020 | default: |
panic("unknown requested QoS: %d", thread->requested_policy.thrp_qos);
1022 | } |
1023 | |
1024 | OSAddAtomic64(timer_delta, task_counter); |
1025 | } |
1026 | |
1027 | /* |
1028 | * called with no thread locks held |
1029 | * may hold task lock |
1030 | */ |
1031 | void |
1032 | thread_update_qos_cpu_time(thread_t thread) |
1033 | { |
1034 | thread_mtx_lock(thread); |
1035 | |
1036 | spl_t s = splsched(); |
1037 | thread_lock(thread); |
1038 | |
1039 | thread_update_qos_cpu_time_locked(thread); |
1040 | |
1041 | thread_unlock(thread); |
1042 | splx(s); |
1043 | |
1044 | thread_mtx_unlock(thread); |
1045 | } |
1046 | |
1047 | /* |
1048 | * Calculate base priority from thread attributes, and set it on the thread |
1049 | * |
1050 | * Called with thread_lock and thread mutex held. |
1051 | */ |
1052 | void |
1053 | thread_recompute_priority( |
1054 | thread_t thread) |
1055 | { |
1056 | integer_t priority; |
1057 | integer_t adj_priority; |
1058 | bool wi_priority = false; |
1059 | |
1060 | if (thread->policy_reset) { |
1061 | return; |
1062 | } |
1063 | |
1064 | if (thread->sched_mode == TH_MODE_REALTIME) { |
1065 | uint8_t i = thread->realtime.priority_offset; |
1066 | assert((i >= 0) && (i < NRTQS)); |
1067 | priority = BASEPRI_RTQUEUES + i; |
1068 | |
1069 | sched_set_thread_base_priority(thread, priority); |
1070 | if (thread->realtime.deadline == RT_DEADLINE_NONE) { |
1071 | /* Make sure the thread has a valid deadline */ |
1072 | uint64_t ctime = mach_absolute_time(); |
1073 | thread->realtime.deadline = thread->realtime.constraint + ctime; |
1074 | KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SET_RT_DEADLINE) | DBG_FUNC_NONE, |
1075 | (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 1); |
1076 | } |
1077 | return; |
1078 | |
1079 | /* |
1080 | * A thread may have joined a RT work interval but then never |
1081 | * changed its sched mode or have been demoted. RT work |
1082 | * intervals will have RT priorities - ignore the priority if |
1083 | * the thread isn't RT. |
1084 | */ |
1085 | } else if (thread->effective_policy.thep_wi_driven && |
1086 | work_interval_get_priority(thread) < BASEPRI_RTQUEUES) { |
1087 | priority = work_interval_get_priority(thread); |
1088 | wi_priority = true; |
1089 | } else if (thread->effective_policy.thep_qos != THREAD_QOS_UNSPECIFIED) { |
1090 | int qos = thread->effective_policy.thep_qos; |
1091 | int qos_ui_is_urgent = thread->effective_policy.thep_qos_ui_is_urgent; |
1092 | int qos_relprio = -(thread->effective_policy.thep_qos_relprio); /* stored in task policy inverted */ |
1093 | int qos_scaled_relprio; |
1094 | |
1095 | assert(qos >= 0 && qos < THREAD_QOS_LAST); |
1096 | assert(qos_relprio <= 0 && qos_relprio >= THREAD_QOS_MIN_TIER_IMPORTANCE); |
1097 | |
1098 | priority = thread_qos_policy_params.qos_pri[qos]; |
1099 | qos_scaled_relprio = thread_qos_scaled_relative_priority(qos, qos_relprio); |
1100 | |
1101 | if (qos == THREAD_QOS_USER_INTERACTIVE && qos_ui_is_urgent == 1) { |
1102 | /* Bump priority 46 to 47 when in a frontmost app */ |
1103 | qos_scaled_relprio += 1; |
1104 | } |
1105 | |
1106 | /* TODO: factor in renice priority here? */ |
1107 | |
1108 | priority += qos_scaled_relprio; |
1109 | } else { |
1110 | if (thread->importance > MAXPRI) { |
1111 | priority = MAXPRI; |
1112 | } else if (thread->importance < -MAXPRI) { |
1113 | priority = -MAXPRI; |
1114 | } else { |
1115 | priority = thread->importance; |
1116 | } |
1117 | |
1118 | priority += thread->task_priority; |
1119 | } |
1120 | |
1121 | /* Boost the priority of threads which are RT demoted. */ |
1122 | if (sched_thread_mode_has_demotion(thread, TH_SFLAG_RT_DISALLOWED)) { |
1123 | priority = MAX(priority, MAXPRI_USER); |
1124 | } |
1125 | |
1126 | priority = MAX(priority, thread->user_promotion_basepri); |
1127 | |
1128 | /* |
1129 | * Clamp priority back into the allowed range for this task. |
1130 | * The initial priority value could be out of this range due to: |
1131 | * Task clamped to BG or Utility (max-pri is 4, or 20) |
1132 | * Task is user task (max-pri is 63) |
1133 | * Task is kernel task (max-pri is 95) |
1134 | * Note that thread->importance is user-settable to any integer |
1135 | * via THREAD_PRECEDENCE_POLICY. |
1136 | */ |
1137 | adj_priority = priority; |
1138 | adj_priority = MIN(adj_priority, thread->max_priority); |
1139 | adj_priority = MAX(adj_priority, MINPRI); |
1140 | |
1141 | /* Allow workload driven priorities to exceed max_priority. */ |
1142 | if (wi_priority) { |
1143 | adj_priority = MAX(adj_priority, priority); |
1144 | } |
1145 | |
1146 | /* Allow priority to exceed max_priority for promotions. */ |
1147 | if (thread->effective_policy.thep_promote_above_task) { |
1148 | adj_priority = MAX(adj_priority, thread->user_promotion_basepri); |
1149 | } |
1150 | priority = adj_priority; |
1151 | assert3u(priority, <=, MAXPRI); |
1152 | |
1153 | if (thread->saved_mode == TH_MODE_REALTIME && |
1154 | sched_thread_mode_has_demotion(thread, TH_SFLAG_FAILSAFE)) { |
1155 | priority = DEPRESSPRI; |
1156 | } |
1157 | |
1158 | if (thread->effective_policy.thep_terminated == TRUE) { |
1159 | /* |
1160 | * We temporarily want to override the expected priority to |
1161 | * ensure that the thread exits in a timely manner. |
1162 | * Note that this is allowed to exceed thread->max_priority |
1163 | * so that the thread is no longer clamped to background |
1164 | * during the final exit phase. |
1165 | */ |
1166 | if (priority < thread->task_priority) { |
1167 | priority = thread->task_priority; |
1168 | } |
1169 | if (priority < BASEPRI_DEFAULT) { |
1170 | priority = BASEPRI_DEFAULT; |
1171 | } |
1172 | } |
1173 | |
1174 | #if !defined(XNU_TARGET_OS_OSX) |
1175 | /* No one can have a base priority less than MAXPRI_THROTTLE */ |
1176 | if (priority < MAXPRI_THROTTLE) { |
1177 | priority = MAXPRI_THROTTLE; |
1178 | } |
1179 | #endif /* !defined(XNU_TARGET_OS_OSX) */ |
1180 | |
1181 | sched_set_thread_base_priority(thread, priority); |
1182 | } |
1183 | |
1184 | /* Called with the task lock held, but not the thread mutex or spinlock */ |
1185 | void |
1186 | thread_policy_update_tasklocked( |
1187 | thread_t thread, |
1188 | integer_t priority, |
1189 | integer_t max_priority, |
1190 | task_pend_token_t pend_token) |
1191 | { |
1192 | thread_mtx_lock(thread); |
1193 | |
1194 | if (!thread->active || thread->policy_reset) { |
1195 | thread_mtx_unlock(thread); |
1196 | return; |
1197 | } |
1198 | |
1199 | spl_t s = splsched(); |
1200 | thread_lock(thread); |
1201 | |
1202 | __unused |
1203 | integer_t old_max_priority = thread->max_priority; |
1204 | |
1205 | assert(priority >= INT16_MIN && priority <= INT16_MAX); |
1206 | thread->task_priority = (int16_t)priority; |
1207 | |
1208 | assert(max_priority >= INT16_MIN && max_priority <= INT16_MAX); |
1209 | thread->max_priority = (int16_t)max_priority; |
1210 | |
1211 | /* |
1212 | * When backgrounding a thread, realtime and fixed priority threads |
1213 | * should be demoted to timeshare background threads. |
1214 | * |
1215 | * TODO: Do this inside the thread policy update routine in order to avoid double |
1216 | * remove/reinsert for a runnable thread |
1217 | */ |
1218 | if ((max_priority <= MAXPRI_THROTTLE) && (old_max_priority > MAXPRI_THROTTLE)) { |
1219 | sched_thread_mode_demote(thread, TH_SFLAG_THROTTLED); |
1220 | } else if ((max_priority > MAXPRI_THROTTLE) && (old_max_priority <= MAXPRI_THROTTLE)) { |
1221 | sched_thread_mode_undemote(thread, TH_SFLAG_THROTTLED); |
1222 | } |
1223 | |
1224 | thread_policy_update_spinlocked(thread, true, pend_token); |
1225 | |
1226 | thread_unlock(thread); |
1227 | splx(s); |
1228 | |
1229 | thread_mtx_unlock(thread); |
1230 | } |
1231 | |
1232 | /* |
1233 | * Reset thread to default state in preparation for termination |
1234 | * Called with thread mutex locked |
1235 | * |
1236 | * Always called on current thread, so we don't need a run queue remove |
1237 | */ |
1238 | void |
1239 | thread_policy_reset( |
1240 | thread_t thread) |
1241 | { |
1242 | spl_t s; |
1243 | |
1244 | assert(thread == current_thread()); |
1245 | |
1246 | s = splsched(); |
1247 | thread_lock(thread); |
1248 | |
1249 | if (thread->sched_flags & TH_SFLAG_FAILSAFE) { |
1250 | sched_thread_mode_undemote(thread, TH_SFLAG_FAILSAFE); |
1251 | } |
1252 | |
1253 | if (thread->sched_flags & TH_SFLAG_THROTTLED) { |
1254 | sched_thread_mode_undemote(thread, TH_SFLAG_THROTTLED); |
1255 | } |
1256 | |
1257 | if (thread->sched_flags & TH_SFLAG_RT_DISALLOWED) { |
1258 | sched_thread_mode_undemote(thread, TH_SFLAG_RT_DISALLOWED); |
1259 | } |
1260 | |
1261 | /* At this point, the various demotions should be inactive */ |
1262 | assert(!(thread->sched_flags & TH_SFLAG_DEMOTED_MASK)); |
1263 | assert(!(thread->sched_flags & TH_SFLAG_DEPRESSED_MASK)); |
1264 | |
1265 | /* Reset thread back to task-default basepri and mode */ |
1266 | sched_mode_t newmode = SCHED(initial_thread_sched_mode)(get_threadtask(thread)); |
1267 | |
sched_set_thread_mode(thread, newmode);
1269 | |
1270 | thread->importance = 0; |
1271 | |
1272 | /* Prevent further changes to thread base priority or mode */ |
1273 | thread->policy_reset = 1; |
1274 | |
sched_set_thread_base_priority(thread, thread->task_priority);
1276 | |
1277 | thread_unlock(thread); |
1278 | splx(s); |
1279 | } |
1280 | |
1281 | kern_return_t |
1282 | thread_policy_get( |
1283 | thread_t thread, |
1284 | thread_policy_flavor_t flavor, |
1285 | thread_policy_t policy_info, |
1286 | mach_msg_type_number_t *count, |
1287 | boolean_t *get_default) |
1288 | { |
1289 | kern_return_t result = KERN_SUCCESS; |
1290 | |
1291 | if (thread == THREAD_NULL) { |
1292 | return KERN_INVALID_ARGUMENT; |
1293 | } |
1294 | |
1295 | thread_mtx_lock(thread); |
1296 | if (!thread->active) { |
1297 | thread_mtx_unlock(thread); |
1298 | |
1299 | return KERN_TERMINATED; |
1300 | } |
1301 | |
1302 | switch (flavor) { |
1303 | case THREAD_EXTENDED_POLICY: |
1304 | { |
1305 | boolean_t timeshare = TRUE; |
1306 | |
1307 | if (!(*get_default)) { |
1308 | spl_t s = splsched(); |
1309 | thread_lock(thread); |
1310 | |
1311 | if ((thread->sched_mode != TH_MODE_REALTIME) && |
1312 | (thread->saved_mode != TH_MODE_REALTIME)) { |
1313 | if (!(thread->sched_flags & TH_SFLAG_DEMOTED_MASK)) { |
1314 | timeshare = (thread->sched_mode == TH_MODE_TIMESHARE) != 0; |
1315 | } else { |
1316 | timeshare = (thread->saved_mode == TH_MODE_TIMESHARE) != 0; |
1317 | } |
1318 | } else { |
1319 | *get_default = TRUE; |
1320 | } |
1321 | |
1322 | thread_unlock(thread); |
1323 | splx(s); |
1324 | } |
1325 | |
1326 | if (*count >= THREAD_EXTENDED_POLICY_COUNT) { |
1327 | thread_extended_policy_t info; |
1328 | |
1329 | info = (thread_extended_policy_t)policy_info; |
1330 | info->timeshare = timeshare; |
1331 | } |
1332 | |
1333 | break; |
1334 | } |
1335 | |
1336 | case THREAD_TIME_CONSTRAINT_POLICY: |
1337 | case THREAD_TIME_CONSTRAINT_WITH_PRIORITY_POLICY: |
1338 | { |
1339 | thread_time_constraint_with_priority_policy_t info; |
1340 | |
1341 | mach_msg_type_number_t min_count = (flavor == THREAD_TIME_CONSTRAINT_POLICY ? |
1342 | THREAD_TIME_CONSTRAINT_POLICY_COUNT : |
1343 | THREAD_TIME_CONSTRAINT_WITH_PRIORITY_POLICY_COUNT); |
1344 | |
1345 | if (*count < min_count) { |
1346 | result = KERN_INVALID_ARGUMENT; |
1347 | break; |
1348 | } |
1349 | |
1350 | info = (thread_time_constraint_with_priority_policy_t)policy_info; |
1351 | |
1352 | if (!(*get_default)) { |
1353 | spl_t s = splsched(); |
1354 | thread_lock(thread); |
1355 | |
1356 | if ((thread->sched_mode == TH_MODE_REALTIME) || |
1357 | (thread->saved_mode == TH_MODE_REALTIME)) { |
1358 | info->period = thread->realtime.period; |
1359 | info->computation = thread->realtime.computation; |
1360 | info->constraint = thread->realtime.constraint; |
1361 | info->preemptible = thread->realtime.preemptible; |
1362 | if (flavor == THREAD_TIME_CONSTRAINT_WITH_PRIORITY_POLICY) { |
1363 | info->priority = thread->realtime.priority_offset + BASEPRI_RTQUEUES; |
1364 | } |
1365 | } else { |
1366 | *get_default = TRUE; |
1367 | } |
1368 | |
1369 | thread_unlock(thread); |
1370 | splx(s); |
1371 | } |
1372 | |
1373 | if (*get_default) { |
1374 | info->period = 0; |
1375 | info->computation = default_timeshare_computation; |
1376 | info->constraint = default_timeshare_constraint; |
1377 | info->preemptible = TRUE; |
1378 | if (flavor == THREAD_TIME_CONSTRAINT_WITH_PRIORITY_POLICY) { |
1379 | info->priority = BASEPRI_RTQUEUES; |
1380 | } |
1381 | } |
1382 | |
1383 | |
1384 | break; |
1385 | } |
1386 | |
1387 | case THREAD_PRECEDENCE_POLICY: |
1388 | { |
1389 | thread_precedence_policy_t info; |
1390 | |
1391 | if (*count < THREAD_PRECEDENCE_POLICY_COUNT) { |
1392 | result = KERN_INVALID_ARGUMENT; |
1393 | break; |
1394 | } |
1395 | |
1396 | info = (thread_precedence_policy_t)policy_info; |
1397 | |
1398 | if (!(*get_default)) { |
1399 | spl_t s = splsched(); |
1400 | thread_lock(thread); |
1401 | |
1402 | info->importance = thread->importance; |
1403 | |
1404 | thread_unlock(thread); |
1405 | splx(s); |
1406 | } else { |
1407 | info->importance = 0; |
1408 | } |
1409 | |
1410 | break; |
1411 | } |
1412 | |
1413 | case THREAD_AFFINITY_POLICY: |
1414 | { |
1415 | thread_affinity_policy_t info; |
1416 | |
1417 | if (!thread_affinity_is_supported()) { |
1418 | result = KERN_NOT_SUPPORTED; |
1419 | break; |
1420 | } |
1421 | if (*count < THREAD_AFFINITY_POLICY_COUNT) { |
1422 | result = KERN_INVALID_ARGUMENT; |
1423 | break; |
1424 | } |
1425 | |
1426 | info = (thread_affinity_policy_t)policy_info; |
1427 | |
1428 | if (!(*get_default)) { |
1429 | info->affinity_tag = thread_affinity_get(thread); |
1430 | } else { |
1431 | info->affinity_tag = THREAD_AFFINITY_TAG_NULL; |
1432 | } |
1433 | |
1434 | break; |
1435 | } |
1436 | |
1437 | case THREAD_POLICY_STATE: |
1438 | { |
1439 | thread_policy_state_t info; |
1440 | |
1441 | if (*count < THREAD_POLICY_STATE_COUNT) { |
1442 | result = KERN_INVALID_ARGUMENT; |
1443 | break; |
1444 | } |
1445 | |
1446 | /* Only root can get this info */ |
if (!task_is_privileged(current_task())) {
1448 | result = KERN_PROTECTION_FAILURE; |
1449 | break; |
1450 | } |
1451 | |
1452 | info = (thread_policy_state_t)(void*)policy_info; |
1453 | |
1454 | if (!(*get_default)) { |
1455 | info->flags = 0; |
1456 | |
1457 | spl_t s = splsched(); |
1458 | thread_lock(thread); |
1459 | |
1460 | info->flags |= (thread->static_param ? THREAD_POLICY_STATE_FLAG_STATIC_PARAM : 0); |
1461 | |
1462 | info->thps_requested_policy = *(uint64_t*)(void*)(&thread->requested_policy); |
1463 | info->thps_effective_policy = *(uint64_t*)(void*)(&thread->effective_policy); |
1464 | |
1465 | info->thps_user_promotions = 0; |
1466 | info->thps_user_promotion_basepri = thread->user_promotion_basepri; |
1467 | info->thps_ipc_overrides = thread->kevent_overrides; |
1468 | |
1469 | proc_get_thread_policy_bitfield(thread, info); |
1470 | |
1471 | thread_unlock(thread); |
1472 | splx(s); |
1473 | } else { |
1474 | info->requested = 0; |
1475 | info->effective = 0; |
1476 | info->pending = 0; |
1477 | } |
1478 | |
1479 | break; |
1480 | } |
1481 | |
1482 | case THREAD_REQUESTED_STATE_POLICY: |
1483 | { |
1484 | if (*count < THREAD_REQUESTED_STATE_POLICY_COUNT) { |
1485 | result = KERN_INVALID_ARGUMENT; |
1486 | break; |
1487 | } |
1488 | |
1489 | thread_requested_qos_policy_t info = (thread_requested_qos_policy_t) policy_info; |
1490 | struct thread_requested_policy *req_policy = &thread->requested_policy; |
1491 | |
1492 | info->thrq_base_qos = req_policy->thrp_qos; |
1493 | info->thrq_qos_relprio = req_policy->thrp_qos_relprio; |
1494 | info->thrq_qos_override = req_policy->thrp_qos_override; |
1495 | info->thrq_qos_promote = req_policy->thrp_qos_promote; |
1496 | info->thrq_qos_kevent_override = req_policy->thrp_qos_kevent_override; |
1497 | info->thrq_qos_workq_override = req_policy->thrp_qos_workq_override; |
1498 | info->thrq_qos_wlsvc_override = req_policy->thrp_qos_wlsvc_override; |
1499 | |
1500 | break; |
1501 | } |
1502 | |
1503 | case THREAD_LATENCY_QOS_POLICY: |
1504 | { |
1505 | thread_latency_qos_policy_t info = (thread_latency_qos_policy_t) policy_info; |
1506 | thread_latency_qos_t plqos; |
1507 | |
1508 | if (*count < THREAD_LATENCY_QOS_POLICY_COUNT) { |
1509 | result = KERN_INVALID_ARGUMENT; |
1510 | break; |
1511 | } |
1512 | |
1513 | if (*get_default) { |
1514 | plqos = 0; |
1515 | } else { |
1516 | plqos = proc_get_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_LATENCY_QOS, NULL); |
1517 | } |
1518 | |
1519 | info->thread_latency_qos_tier = qos_latency_policy_package(plqos); |
1520 | } |
1521 | break; |
1522 | |
1523 | case THREAD_THROUGHPUT_QOS_POLICY: |
1524 | { |
1525 | thread_throughput_qos_policy_t info = (thread_throughput_qos_policy_t) policy_info; |
1526 | thread_throughput_qos_t ptqos; |
1527 | |
1528 | if (*count < THREAD_THROUGHPUT_QOS_POLICY_COUNT) { |
1529 | result = KERN_INVALID_ARGUMENT; |
1530 | break; |
1531 | } |
1532 | |
1533 | if (*get_default) { |
1534 | ptqos = 0; |
1535 | } else { |
1536 | ptqos = proc_get_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_THROUGH_QOS, NULL); |
1537 | } |
1538 | |
1539 | info->thread_throughput_qos_tier = qos_throughput_policy_package(ptqos); |
1540 | } |
1541 | break; |
1542 | |
1543 | case THREAD_QOS_POLICY: |
1544 | { |
1545 | thread_qos_policy_t info = (thread_qos_policy_t)policy_info; |
1546 | |
1547 | if (*count < THREAD_QOS_POLICY_COUNT) { |
1548 | result = KERN_INVALID_ARGUMENT; |
1549 | break; |
1550 | } |
1551 | |
1552 | if (!(*get_default)) { |
1553 | int relprio_value = 0; |
1554 | info->qos_tier = proc_get_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, |
TASK_POLICY_QOS_AND_RELPRIO, &relprio_value);
1556 | |
1557 | info->tier_importance = -relprio_value; |
1558 | } else { |
1559 | info->qos_tier = THREAD_QOS_UNSPECIFIED; |
1560 | info->tier_importance = 0; |
1561 | } |
1562 | |
1563 | break; |
1564 | } |
1565 | |
1566 | default: |
1567 | result = KERN_INVALID_ARGUMENT; |
1568 | break; |
1569 | } |
1570 | |
1571 | thread_mtx_unlock(thread); |
1572 | |
1573 | return result; |
1574 | } |
1575 | |
1576 | void |
1577 | thread_policy_create(thread_t thread) |
1578 | { |
1579 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
1580 | (IMPORTANCE_CODE(IMP_UPDATE, (IMP_UPDATE_TASK_CREATE | TASK_POLICY_THREAD))) | DBG_FUNC_START, |
1581 | thread_tid(thread), theffective_0(thread), |
1582 | theffective_1(thread), thread->base_pri, 0); |
1583 | |
1584 | /* We pass a pend token but ignore it */ |
1585 | struct task_pend_token pend_token = {}; |
1586 | |
thread_policy_update_internal_spinlocked(thread, true, &pend_token);
1588 | |
1589 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
1590 | (IMPORTANCE_CODE(IMP_UPDATE, (IMP_UPDATE_TASK_CREATE | TASK_POLICY_THREAD))) | DBG_FUNC_END, |
1591 | thread_tid(thread), theffective_0(thread), |
1592 | theffective_1(thread), thread->base_pri, 0); |
1593 | } |
1594 | |
1595 | static void |
1596 | thread_policy_update_spinlocked(thread_t thread, bool recompute_priority, task_pend_token_t pend_token) |
1597 | { |
1598 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
1599 | (IMPORTANCE_CODE(IMP_UPDATE, TASK_POLICY_THREAD) | DBG_FUNC_START), |
1600 | thread_tid(thread), theffective_0(thread), |
1601 | theffective_1(thread), thread->base_pri, 0); |
1602 | |
1603 | thread_policy_update_internal_spinlocked(thread, recompute_priority, pend_token); |
1604 | |
1605 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
1606 | (IMPORTANCE_CODE(IMP_UPDATE, TASK_POLICY_THREAD)) | DBG_FUNC_END, |
1607 | thread_tid(thread), theffective_0(thread), |
1608 | theffective_1(thread), thread->base_pri, 0); |
1609 | } |
1610 | |
1611 | |
1612 | |
1613 | /* |
1614 | * One thread state update function TO RULE THEM ALL |
1615 | * |
1616 | * This function updates the thread effective policy fields |
1617 | * and pushes the results to the relevant subsystems. |
1618 | * |
1619 | * Called with thread spinlock locked, task may be locked, thread mutex may be locked |
1620 | */ |
1621 | static void |
1622 | thread_policy_update_internal_spinlocked(thread_t thread, bool recompute_priority, |
1623 | task_pend_token_t pend_token) |
1624 | { |
1625 | /* |
1626 | * Step 1: |
1627 | * Gather requested policy and effective task state |
1628 | */ |
1629 | |
1630 | const struct thread_requested_policy requested = thread->requested_policy; |
1631 | const struct task_effective_policy task_effective = get_threadtask(thread)->effective_policy; |
1632 | |
1633 | /* |
1634 | * Step 2: |
1635 | * Calculate new effective policies from requested policy, task and thread state |
1636 | * Rules: |
1637 | * Don't change requested, it won't take effect |
1638 | */ |
1639 | |
1640 | struct thread_effective_policy next = {}; |
1641 | |
1642 | next.thep_wi_driven = requested.thrp_wi_driven; |
1643 | |
1644 | next.thep_qos_ui_is_urgent = task_effective.tep_qos_ui_is_urgent; |
1645 | |
1646 | uint32_t next_qos = requested.thrp_qos; |
1647 | |
1648 | if (requested.thrp_qos != THREAD_QOS_UNSPECIFIED) { |
1649 | next_qos = MAX(requested.thrp_qos_override, next_qos); |
1650 | next_qos = MAX(requested.thrp_qos_promote, next_qos); |
1651 | next_qos = MAX(requested.thrp_qos_kevent_override, next_qos); |
1652 | next_qos = MAX(requested.thrp_qos_wlsvc_override, next_qos); |
1653 | next_qos = MAX(requested.thrp_qos_workq_override, next_qos); |
1654 | } |
1655 | |
1656 | if (task_effective.tep_darwinbg && task_effective.tep_adaptive_bg && |
1657 | requested.thrp_qos_promote > THREAD_QOS_BACKGROUND) { |
1658 | /* |
1659 | * This thread is turnstile-boosted higher than the adaptive clamp |
1660 | * by a synchronous waiter. Allow that to override the adaptive |
1661 | * clamp temporarily for this thread only. |
1662 | */ |
1663 | next.thep_promote_above_task = true; |
1664 | next_qos = requested.thrp_qos_promote; |
1665 | } |
1666 | |
1667 | next.thep_qos = next_qos; |
1668 | |
1669 | /* A task clamp will result in an effective QoS even when requested is UNSPECIFIED */ |
1670 | if (task_effective.tep_qos_clamp != THREAD_QOS_UNSPECIFIED) { |
1671 | if (next.thep_qos != THREAD_QOS_UNSPECIFIED) { |
1672 | next.thep_qos = MIN(task_effective.tep_qos_clamp, next.thep_qos); |
1673 | } else { |
1674 | next.thep_qos = task_effective.tep_qos_clamp; |
1675 | } |
1676 | next.thep_wi_driven = 0; |
1677 | } |
1678 | |
1679 | /* |
1680 | * Extract outbound-promotion QoS before applying task ceiling or BG clamp |
1681 | * This allows QoS promotions to work properly even after the process is unclamped. |
1682 | */ |
1683 | next.thep_qos_promote = next.thep_qos; |
1684 | |
1685 | /* The ceiling only applies to threads that are in the QoS world */ |
1686 | /* TODO: is it appropriate for this to limit a turnstile-boosted thread's QoS? */ |
1687 | if (task_effective.tep_qos_ceiling != THREAD_QOS_UNSPECIFIED && |
1688 | next.thep_qos != THREAD_QOS_UNSPECIFIED) { |
1689 | next.thep_qos = MIN(task_effective.tep_qos_ceiling, next.thep_qos); |
1690 | } |
1691 | |
1692 | /* |
1693 | * The QoS relative priority is only applicable when the original programmer's |
1694 | * intended (requested) QoS is in effect. When the QoS is clamped (e.g. |
1695 | * USER_INITIATED-13REL clamped to UTILITY), the relative priority is not honored, |
1696 | * since otherwise it would be lower than unclamped threads. Similarly, in the |
1697 | * presence of boosting, the programmer doesn't know what other actors |
1698 | * are boosting the thread. |
1699 | */ |
1700 | if ((requested.thrp_qos != THREAD_QOS_UNSPECIFIED) && |
1701 | (requested.thrp_qos == next.thep_qos) && |
1702 | (requested.thrp_qos_override == THREAD_QOS_UNSPECIFIED)) { |
1703 | next.thep_qos_relprio = requested.thrp_qos_relprio; |
1704 | } else { |
1705 | next.thep_qos_relprio = 0; |
1706 | } |
1707 | |
1708 | /* Calculate DARWIN_BG */ |
1709 | bool wants_darwinbg = false; |
1710 | bool wants_all_sockets_bg = false; /* Do I want my existing sockets to be bg */ |
1711 | |
1712 | if (task_effective.tep_darwinbg && !next.thep_promote_above_task) { |
1713 | wants_darwinbg = true; |
1714 | } |
1715 | |
1716 | /* |
1717 | * If DARWIN_BG has been requested at either level, it's engaged. |
1718 | * darwinbg threads always create bg sockets, |
1719 | * but only some types of darwinbg change the sockets |
1720 | * after they're created |
1721 | */ |
1722 | if (requested.thrp_int_darwinbg || requested.thrp_ext_darwinbg) { |
1723 | wants_all_sockets_bg = wants_darwinbg = true; |
1724 | } |
1725 | |
1726 | if (requested.thrp_pidbind_bg) { |
1727 | wants_all_sockets_bg = wants_darwinbg = true; |
1728 | } |
1729 | |
1730 | if (next.thep_qos == THREAD_QOS_BACKGROUND || |
1731 | next.thep_qos == THREAD_QOS_MAINTENANCE) { |
1732 | wants_darwinbg = true; |
1733 | } |
1734 | |
1735 | /* Calculate side effects of DARWIN_BG */ |
1736 | |
1737 | if (wants_darwinbg) { |
1738 | next.thep_darwinbg = 1; |
1739 | next.thep_wi_driven = 0; |
1740 | } |
1741 | |
1742 | if (next.thep_darwinbg || task_effective.tep_new_sockets_bg) { |
1743 | next.thep_new_sockets_bg = 1; |
1744 | } |
1745 | |
1746 | /* Don't use task_effective.tep_all_sockets_bg here */ |
1747 | if (wants_all_sockets_bg) { |
1748 | next.thep_all_sockets_bg = 1; |
1749 | } |
1750 | |
1751 | /* darwinbg implies background QOS (or lower) */ |
1752 | if (next.thep_darwinbg && |
1753 | (next.thep_qos > THREAD_QOS_BACKGROUND || next.thep_qos == THREAD_QOS_UNSPECIFIED)) { |
1754 | next.thep_qos = THREAD_QOS_BACKGROUND; |
1755 | next.thep_qos_relprio = 0; |
1756 | } |
1757 | |
1758 | /* Calculate IO policy */ |
1759 | |
1760 | int iopol = THROTTLE_LEVEL_TIER0; |
1761 | |
1762 | /* Factor in the task's IO policy */ |
1763 | if (next.thep_darwinbg) { |
1764 | iopol = MAX(iopol, task_effective.tep_bg_iotier); |
1765 | } |
1766 | |
1767 | if (!next.thep_promote_above_task) { |
1768 | iopol = MAX(iopol, task_effective.tep_io_tier); |
1769 | } |
1770 | |
1771 | /* Look up the associated IO tier value for the QoS class */ |
1772 | iopol = MAX(iopol, thread_qos_policy_params.qos_iotier[next.thep_qos]); |
1773 | |
1774 | iopol = MAX(iopol, requested.thrp_int_iotier); |
1775 | iopol = MAX(iopol, requested.thrp_ext_iotier); |
1776 | |
1777 | /* Apply the kevent iotier override */ |
1778 | iopol = MIN(iopol, requested.thrp_iotier_kevent_override); |
1779 | |
1780 | next.thep_io_tier = iopol; |
1781 | |
1782 | /* |
1783 | * If a QoS override is causing IO to go into a lower tier, we also set |
1784 | * the passive bit so that a thread doesn't end up stuck in its own throttle |
1785 | * window when the override goes away. |
1786 | */ |
1787 | |
1788 | int next_qos_iotier = thread_qos_policy_params.qos_iotier[next.thep_qos]; |
1789 | int req_qos_iotier = thread_qos_policy_params.qos_iotier[requested.thrp_qos]; |
1790 | bool qos_io_override_active = (next_qos_iotier < req_qos_iotier); |
1791 | |
1792 | /* Calculate Passive IO policy */ |
1793 | if (requested.thrp_ext_iopassive || |
1794 | requested.thrp_int_iopassive || |
1795 | qos_io_override_active || |
1796 | task_effective.tep_io_passive) { |
1797 | next.thep_io_passive = 1; |
1798 | } |
1799 | |
1800 | /* Calculate timer QOS */ |
1801 | uint32_t latency_qos = requested.thrp_latency_qos; |
1802 | |
1803 | if (!next.thep_promote_above_task) { |
1804 | latency_qos = MAX(latency_qos, task_effective.tep_latency_qos); |
1805 | } |
1806 | |
1807 | latency_qos = MAX(latency_qos, thread_qos_policy_params.qos_latency_qos[next.thep_qos]); |
1808 | |
1809 | next.thep_latency_qos = latency_qos; |
1810 | |
1811 | /* Calculate throughput QOS */ |
1812 | uint32_t through_qos = requested.thrp_through_qos; |
1813 | |
1814 | if (!next.thep_promote_above_task) { |
1815 | through_qos = MAX(through_qos, task_effective.tep_through_qos); |
1816 | } |
1817 | |
1818 | through_qos = MAX(through_qos, thread_qos_policy_params.qos_through_qos[next.thep_qos]); |
1819 | |
1820 | next.thep_through_qos = through_qos; |
1821 | |
1822 | if (task_effective.tep_terminated || requested.thrp_terminated) { |
1823 | /* Shoot down the throttles that slow down exit or response to SIGTERM */ |
1824 | next.thep_terminated = 1; |
1825 | next.thep_darwinbg = 0; |
1826 | next.thep_io_tier = THROTTLE_LEVEL_TIER0; |
1827 | next.thep_qos = THREAD_QOS_UNSPECIFIED; |
1828 | next.thep_latency_qos = LATENCY_QOS_TIER_UNSPECIFIED; |
1829 | next.thep_through_qos = THROUGHPUT_QOS_TIER_UNSPECIFIED; |
1830 | next.thep_wi_driven = 0; |
1831 | } |
1832 | |
1833 | /* |
1834 | * Step 3: |
1835 | * Swap out old policy for new policy |
1836 | */ |
1837 | |
1838 | struct thread_effective_policy prev = thread->effective_policy; |
1839 | |
1840 | thread_update_qos_cpu_time_locked(thread); |
1841 | |
1842 | /* This is the point where the new values become visible to other threads */ |
1843 | thread->effective_policy = next; |
1844 | |
1845 | /* |
1846 | * Step 4: |
1847 | * Pend updates that can't be done while holding the thread lock |
1848 | */ |
1849 | |
1850 | if (prev.thep_all_sockets_bg != next.thep_all_sockets_bg) { |
1851 | pend_token->tpt_update_sockets = 1; |
1852 | } |
1853 | |
1854 | /* TODO: Doesn't this only need to be done if the throttle went up? */ |
1855 | if (prev.thep_io_tier != next.thep_io_tier) { |
1856 | pend_token->tpt_update_throttle = 1; |
1857 | } |
1858 | |
1859 | /* |
1860 | * Check for the attributes that sfi_thread_classify() consults, |
1861 | * and trigger SFI re-evaluation. |
1862 | */ |
1863 | if (prev.thep_qos != next.thep_qos || |
1864 | prev.thep_darwinbg != next.thep_darwinbg) { |
1865 | pend_token->tpt_update_thread_sfi = 1; |
1866 | } |
1867 | |
1868 | integer_t old_base_pri = thread->base_pri; |
1869 | |
1870 | /* |
1871 | * Step 5: |
1872 | * Update other subsystems as necessary if something has changed |
1873 | */ |
1874 | |
1875 | /* Check for the attributes that thread_recompute_priority() consults */ |
1876 | if (prev.thep_qos != next.thep_qos || |
1877 | prev.thep_qos_relprio != next.thep_qos_relprio || |
1878 | prev.thep_qos_ui_is_urgent != next.thep_qos_ui_is_urgent || |
1879 | prev.thep_promote_above_task != next.thep_promote_above_task || |
1880 | prev.thep_terminated != next.thep_terminated || |
1881 | prev.thep_wi_driven != next.thep_wi_driven || |
1882 | pend_token->tpt_force_recompute_pri == 1 || |
1883 | recompute_priority) { |
1884 | thread_recompute_priority(thread); |
1885 | } |
1886 | |
1887 | /* |
1888 | * Check if the thread is waiting on a turnstile and needs priority propagation. |
1889 | */ |
1890 | if (pend_token->tpt_update_turnstile && |
1891 | ((old_base_pri == thread->base_pri) || |
1892 | !thread_get_waiting_turnstile(thread))) { |
1893 | /* |
1894 | * Reset update turnstile pend token since either |
1895 | * the thread priority did not change or thread is |
1896 | * not blocked on a turnstile. |
1897 | */ |
1898 | pend_token->tpt_update_turnstile = 0; |
1899 | } |
1900 | } |
1901 | |
1902 | |
1903 | /* |
1904 | * Initiate a thread policy state transition on a thread with its TID |
1905 | * Useful if you cannot guarantee the thread won't get terminated |
1906 | * Precondition: No locks are held |
 * Will take the task lock; the non-tid variant is faster
 * if you already have a thread ref.
1909 | */ |
1910 | void |
1911 | proc_set_thread_policy_with_tid(task_t task, |
1912 | uint64_t tid, |
1913 | int category, |
1914 | int flavor, |
1915 | int value) |
1916 | { |
1917 | /* takes task lock, returns ref'ed thread or NULL */ |
1918 | thread_t thread = task_findtid(task, tid); |
1919 | |
1920 | if (thread == THREAD_NULL) { |
1921 | return; |
1922 | } |
1923 | |
1924 | proc_set_thread_policy(thread, category, flavor, value); |
1925 | |
1926 | thread_deallocate(thread); |
1927 | } |
1928 | |
1929 | /* |
1930 | * Initiate a thread policy transition on a thread |
1931 | * This path supports networking transitions (i.e. darwinbg transitions) |
1932 | * Precondition: No locks are held |
1933 | */ |
1934 | void |
1935 | proc_set_thread_policy(thread_t thread, |
1936 | int category, |
1937 | int flavor, |
1938 | int value) |
1939 | { |
	proc_set_thread_policy_ext(thread, category, flavor, value, 0);
1941 | } |
1942 | |
1943 | void |
1944 | proc_set_thread_policy_ext(thread_t thread, |
1945 | int category, |
1946 | int flavor, |
1947 | int value, |
1948 | int value2) |
1949 | { |
1950 | struct task_pend_token pend_token = {}; |
1951 | |
1952 | thread_mtx_lock(thread); |
1953 | |
	proc_set_thread_policy_locked(thread, category, flavor, value, value2, &pend_token);
1955 | |
1956 | thread_mtx_unlock(thread); |
1957 | |
	thread_policy_update_complete_unlocked(thread, &pend_token);
1959 | } |
1960 | |
1961 | /* |
1962 | * Do the things that can't be done while holding a thread mutex. |
1963 | * These are set up to call back into thread policy to get the latest value, |
1964 | * so they don't have to be synchronized with the update. |
1965 | * The only required semantic is 'call this sometime after updating effective policy' |
1966 | * |
1967 | * Precondition: Thread mutex is not held |
1968 | * |
1969 | * This may be called with the task lock held, but in that case it won't be |
1970 | * called with tpt_update_sockets set. |
1971 | */ |
1972 | void |
1973 | thread_policy_update_complete_unlocked(thread_t thread, task_pend_token_t pend_token) |
1974 | { |
1975 | #ifdef MACH_BSD |
1976 | if (pend_token->tpt_update_sockets) { |
		proc_apply_task_networkbg(task_pid(get_threadtask(thread)), thread);
1978 | } |
1979 | #endif /* MACH_BSD */ |
1980 | |
1981 | if (pend_token->tpt_update_throttle) { |
		rethrottle_thread(get_bsdthread_info(thread));
1983 | } |
1984 | |
1985 | if (pend_token->tpt_update_thread_sfi) { |
1986 | sfi_reevaluate(thread); |
1987 | } |
1988 | |
1989 | if (pend_token->tpt_update_turnstile) { |
1990 | turnstile_update_thread_priority_chain(thread); |
1991 | } |
1992 | } |
1993 | |
1994 | /* |
1995 | * Set and update thread policy |
1996 | * Thread mutex might be held |
1997 | */ |
1998 | static void |
1999 | proc_set_thread_policy_locked(thread_t thread, |
2000 | int category, |
2001 | int flavor, |
2002 | int value, |
2003 | int value2, |
2004 | task_pend_token_t pend_token) |
2005 | { |
2006 | spl_t s = splsched(); |
2007 | thread_lock(thread); |
2008 | |
2009 | proc_set_thread_policy_spinlocked(thread, category, flavor, value, value2, pend_token); |
2010 | |
2011 | thread_unlock(thread); |
2012 | splx(s); |
2013 | } |
2014 | |
2015 | /* |
2016 | * Set and update thread policy |
2017 | * Thread spinlock is held |
2018 | */ |
2019 | static void |
2020 | proc_set_thread_policy_spinlocked(thread_t thread, |
2021 | int category, |
2022 | int flavor, |
2023 | int value, |
2024 | int value2, |
2025 | task_pend_token_t pend_token) |
2026 | { |
2027 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
2028 | (IMPORTANCE_CODE(flavor, (category | TASK_POLICY_THREAD))) | DBG_FUNC_START, |
2029 | thread_tid(thread), threquested_0(thread), |
2030 | threquested_1(thread), value, 0); |
2031 | |
2032 | thread_set_requested_policy_spinlocked(thread, category, flavor, value, value2, pend_token); |
2033 | |
2034 | thread_policy_update_spinlocked(thread, false, pend_token); |
2035 | |
2036 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
2037 | (IMPORTANCE_CODE(flavor, (category | TASK_POLICY_THREAD))) | DBG_FUNC_END, |
2038 | thread_tid(thread), threquested_0(thread), |
2039 | threquested_1(thread), tpending(pend_token), 0); |
2040 | } |
2041 | |
2042 | /* |
2043 | * Set the requested state for a specific flavor to a specific value. |
2044 | */ |
2045 | static void |
2046 | thread_set_requested_policy_spinlocked(thread_t thread, |
2047 | int category, |
2048 | int flavor, |
2049 | int value, |
2050 | int value2, |
2051 | task_pend_token_t pend_token) |
2052 | { |
2053 | int tier, passive; |
2054 | |
2055 | struct thread_requested_policy requested = thread->requested_policy; |
2056 | |
2057 | switch (flavor) { |
2058 | /* Category: EXTERNAL and INTERNAL, thread and task */ |
2059 | |
2060 | case TASK_POLICY_DARWIN_BG: |
2061 | if (category == TASK_POLICY_EXTERNAL) { |
2062 | requested.thrp_ext_darwinbg = value; |
2063 | } else { |
2064 | requested.thrp_int_darwinbg = value; |
2065 | } |
2066 | pend_token->tpt_update_turnstile = 1; |
2067 | break; |
2068 | |
2069 | case TASK_POLICY_IOPOL: |
		proc_iopol_to_tier(value, &tier, &passive);
2071 | if (category == TASK_POLICY_EXTERNAL) { |
2072 | requested.thrp_ext_iotier = tier; |
2073 | requested.thrp_ext_iopassive = passive; |
2074 | } else { |
2075 | requested.thrp_int_iotier = tier; |
2076 | requested.thrp_int_iopassive = passive; |
2077 | } |
2078 | break; |
2079 | |
2080 | case TASK_POLICY_IO: |
2081 | if (category == TASK_POLICY_EXTERNAL) { |
2082 | requested.thrp_ext_iotier = value; |
2083 | } else { |
2084 | requested.thrp_int_iotier = value; |
2085 | } |
2086 | break; |
2087 | |
2088 | case TASK_POLICY_PASSIVE_IO: |
2089 | if (category == TASK_POLICY_EXTERNAL) { |
2090 | requested.thrp_ext_iopassive = value; |
2091 | } else { |
2092 | requested.thrp_int_iopassive = value; |
2093 | } |
2094 | break; |
2095 | |
2096 | /* Category: ATTRIBUTE, thread only */ |
2097 | |
2098 | case TASK_POLICY_PIDBIND_BG: |
2099 | assert(category == TASK_POLICY_ATTRIBUTE); |
2100 | requested.thrp_pidbind_bg = value; |
2101 | pend_token->tpt_update_turnstile = 1; |
2102 | break; |
2103 | |
2104 | case TASK_POLICY_LATENCY_QOS: |
2105 | assert(category == TASK_POLICY_ATTRIBUTE); |
2106 | requested.thrp_latency_qos = value; |
2107 | break; |
2108 | |
2109 | case TASK_POLICY_THROUGH_QOS: |
2110 | assert(category == TASK_POLICY_ATTRIBUTE); |
2111 | requested.thrp_through_qos = value; |
2112 | break; |
2113 | |
2114 | case TASK_POLICY_QOS_OVERRIDE: |
2115 | assert(category == TASK_POLICY_ATTRIBUTE); |
2116 | requested.thrp_qos_override = value; |
2117 | pend_token->tpt_update_turnstile = 1; |
2118 | break; |
2119 | |
2120 | case TASK_POLICY_QOS_AND_RELPRIO: |
2121 | assert(category == TASK_POLICY_ATTRIBUTE); |
2122 | requested.thrp_qos = value; |
2123 | requested.thrp_qos_relprio = value2; |
2124 | pend_token->tpt_update_turnstile = 1; |
2125 | DTRACE_BOOST3(qos_set, uint64_t, thread->thread_id, int, requested.thrp_qos, int, requested.thrp_qos_relprio); |
2126 | break; |
2127 | |
2128 | case TASK_POLICY_QOS_WORKQ_OVERRIDE: |
2129 | assert(category == TASK_POLICY_ATTRIBUTE); |
2130 | requested.thrp_qos_workq_override = value; |
2131 | pend_token->tpt_update_turnstile = 1; |
2132 | break; |
2133 | |
2134 | case TASK_POLICY_QOS_PROMOTE: |
2135 | assert(category == TASK_POLICY_ATTRIBUTE); |
2136 | requested.thrp_qos_promote = value; |
2137 | break; |
2138 | |
2139 | case TASK_POLICY_QOS_KEVENT_OVERRIDE: |
2140 | assert(category == TASK_POLICY_ATTRIBUTE); |
2141 | requested.thrp_qos_kevent_override = value; |
2142 | pend_token->tpt_update_turnstile = 1; |
2143 | break; |
2144 | |
2145 | case TASK_POLICY_QOS_SERVICER_OVERRIDE: |
2146 | assert(category == TASK_POLICY_ATTRIBUTE); |
2147 | requested.thrp_qos_wlsvc_override = value; |
2148 | pend_token->tpt_update_turnstile = 1; |
2149 | break; |
2150 | |
2151 | case TASK_POLICY_TERMINATED: |
2152 | assert(category == TASK_POLICY_ATTRIBUTE); |
2153 | requested.thrp_terminated = value; |
2154 | break; |
2155 | |
2156 | case TASK_POLICY_IOTIER_KEVENT_OVERRIDE: |
2157 | assert(category == TASK_POLICY_ATTRIBUTE); |
2158 | requested.thrp_iotier_kevent_override = value; |
2159 | break; |
2160 | |
2161 | case TASK_POLICY_WI_DRIVEN: |
2162 | assert(category == TASK_POLICY_ATTRIBUTE); |
2163 | assert(thread == current_thread()); |
2164 | |
2165 | const bool set_policy = value; |
2166 | const sched_mode_t mode = value2; |
2167 | |
2168 | requested.thrp_wi_driven = set_policy ? 1 : 0; |
2169 | |
2170 | /* |
2171 | * No sched mode change for REALTIME (threads must explicitly |
2172 | * opt-in), however the priority_offset needs to be updated. |
2173 | */ |
2174 | if (mode == TH_MODE_REALTIME) { |
2175 | const int pri = work_interval_get_priority(thread); |
2176 | assert3u(pri, >=, BASEPRI_RTQUEUES); |
2177 | thread->realtime.priority_offset = set_policy ? |
2178 | (uint8_t)(pri - BASEPRI_RTQUEUES) : 0; |
2179 | } else { |
2180 | sched_set_thread_mode_user(thread, mode); |
2181 | if (set_policy) { |
2182 | thread->static_param = true; |
2183 | } |
2184 | } |
2185 | break; |
2186 | |
2187 | default: |
2188 | panic("unknown task policy: %d %d %d" , category, flavor, value); |
2189 | break; |
2190 | } |
2191 | |
2192 | thread->requested_policy = requested; |
2193 | } |
2194 | |
2195 | /* |
2196 | * Gets what you set. Effective values may be different. |
2197 | * Precondition: No locks are held |
2198 | */ |
2199 | int |
2200 | proc_get_thread_policy(thread_t thread, |
2201 | int category, |
2202 | int flavor) |
2203 | { |
2204 | int value = 0; |
2205 | thread_mtx_lock(thread); |
2206 | value = proc_get_thread_policy_locked(thread, category, flavor, NULL); |
2207 | thread_mtx_unlock(thread); |
2208 | return value; |
2209 | } |
2210 | |
2211 | static int |
2212 | proc_get_thread_policy_locked(thread_t thread, |
2213 | int category, |
2214 | int flavor, |
2215 | int* value2) |
2216 | { |
2217 | int value = 0; |
2218 | |
2219 | spl_t s = splsched(); |
2220 | thread_lock(thread); |
2221 | |
2222 | value = thread_get_requested_policy_spinlocked(thread, category, flavor, value2); |
2223 | |
2224 | thread_unlock(thread); |
2225 | splx(s); |
2226 | |
2227 | return value; |
2228 | } |
2229 | |
2230 | /* |
2231 | * Gets what you set. Effective values may be different. |
2232 | */ |
2233 | static int |
2234 | thread_get_requested_policy_spinlocked(thread_t thread, |
2235 | int category, |
2236 | int flavor, |
2237 | int* value2) |
2238 | { |
2239 | int value = 0; |
2240 | |
2241 | struct thread_requested_policy requested = thread->requested_policy; |
2242 | |
2243 | switch (flavor) { |
2244 | case TASK_POLICY_DARWIN_BG: |
2245 | if (category == TASK_POLICY_EXTERNAL) { |
2246 | value = requested.thrp_ext_darwinbg; |
2247 | } else { |
2248 | value = requested.thrp_int_darwinbg; |
2249 | } |
2250 | break; |
2251 | case TASK_POLICY_IOPOL: |
2252 | if (category == TASK_POLICY_EXTERNAL) { |
			value = proc_tier_to_iopol(requested.thrp_ext_iotier,
			    requested.thrp_ext_iopassive);
		} else {
			value = proc_tier_to_iopol(requested.thrp_int_iotier,
			    requested.thrp_int_iopassive);
2258 | } |
2259 | break; |
2260 | case TASK_POLICY_IO: |
2261 | if (category == TASK_POLICY_EXTERNAL) { |
2262 | value = requested.thrp_ext_iotier; |
2263 | } else { |
2264 | value = requested.thrp_int_iotier; |
2265 | } |
2266 | break; |
2267 | case TASK_POLICY_PASSIVE_IO: |
2268 | if (category == TASK_POLICY_EXTERNAL) { |
2269 | value = requested.thrp_ext_iopassive; |
2270 | } else { |
2271 | value = requested.thrp_int_iopassive; |
2272 | } |
2273 | break; |
2274 | case TASK_POLICY_QOS: |
2275 | assert(category == TASK_POLICY_ATTRIBUTE); |
2276 | value = requested.thrp_qos; |
2277 | break; |
2278 | case TASK_POLICY_QOS_OVERRIDE: |
2279 | assert(category == TASK_POLICY_ATTRIBUTE); |
2280 | value = requested.thrp_qos_override; |
2281 | break; |
2282 | case TASK_POLICY_LATENCY_QOS: |
2283 | assert(category == TASK_POLICY_ATTRIBUTE); |
2284 | value = requested.thrp_latency_qos; |
2285 | break; |
2286 | case TASK_POLICY_THROUGH_QOS: |
2287 | assert(category == TASK_POLICY_ATTRIBUTE); |
2288 | value = requested.thrp_through_qos; |
2289 | break; |
2290 | case TASK_POLICY_QOS_WORKQ_OVERRIDE: |
2291 | assert(category == TASK_POLICY_ATTRIBUTE); |
2292 | value = requested.thrp_qos_workq_override; |
2293 | break; |
2294 | case TASK_POLICY_QOS_AND_RELPRIO: |
2295 | assert(category == TASK_POLICY_ATTRIBUTE); |
2296 | assert(value2 != NULL); |
2297 | value = requested.thrp_qos; |
2298 | *value2 = requested.thrp_qos_relprio; |
2299 | break; |
2300 | case TASK_POLICY_QOS_PROMOTE: |
2301 | assert(category == TASK_POLICY_ATTRIBUTE); |
2302 | value = requested.thrp_qos_promote; |
2303 | break; |
2304 | case TASK_POLICY_QOS_KEVENT_OVERRIDE: |
2305 | assert(category == TASK_POLICY_ATTRIBUTE); |
2306 | value = requested.thrp_qos_kevent_override; |
2307 | break; |
2308 | case TASK_POLICY_QOS_SERVICER_OVERRIDE: |
2309 | assert(category == TASK_POLICY_ATTRIBUTE); |
2310 | value = requested.thrp_qos_wlsvc_override; |
2311 | break; |
2312 | case TASK_POLICY_TERMINATED: |
2313 | assert(category == TASK_POLICY_ATTRIBUTE); |
2314 | value = requested.thrp_terminated; |
2315 | break; |
2316 | case TASK_POLICY_IOTIER_KEVENT_OVERRIDE: |
2317 | assert(category == TASK_POLICY_ATTRIBUTE); |
2318 | value = requested.thrp_iotier_kevent_override; |
2319 | break; |
2320 | |
2321 | case TASK_POLICY_WI_DRIVEN: |
2322 | assert(category == TASK_POLICY_ATTRIBUTE); |
2323 | value = requested.thrp_wi_driven; |
2324 | break; |
2325 | |
2326 | default: |
2327 | panic("unknown policy_flavor %d" , flavor); |
2328 | break; |
2329 | } |
2330 | |
2331 | return value; |
2332 | } |
2333 | |
2334 | /* |
2335 | * Gets what is actually in effect, for subsystems which pull policy instead of receive updates. |
2336 | * |
2337 | * NOTE: This accessor does not take the task or thread lock. |
2338 | * Notifications of state updates need to be externally synchronized with state queries. |
2339 | * This routine *MUST* remain interrupt safe, as it is potentially invoked |
2340 | * within the context of a timer interrupt. |
2341 | * |
2342 | * TODO: I think we can get away with architecting this such that we don't need to look at the task ever. |
2343 | * Is that a good idea? Maybe it's best to avoid evaluate-all-the-threads updates. |
2344 | * I don't think that cost is worth not having the right answer. |
2345 | */ |
2346 | int |
2347 | proc_get_effective_thread_policy(thread_t thread, |
2348 | int flavor) |
2349 | { |
2350 | int value = 0; |
2351 | |
2352 | switch (flavor) { |
2353 | case TASK_POLICY_DARWIN_BG: |
2354 | /* |
2355 | * This call is used within the timer layer, as well as |
2356 | * prioritizing requests to the graphics system. |
2357 | * It also informs SFI and originator-bg-state. |
2358 | * Returns 1 for background mode, 0 for normal mode |
2359 | */ |
2360 | |
2361 | value = thread->effective_policy.thep_darwinbg ? 1 : 0; |
2362 | break; |
2363 | case TASK_POLICY_IO: |
2364 | /* |
2365 | * The I/O system calls here to find out what throttling tier to apply to an operation. |
2366 | * Returns THROTTLE_LEVEL_* values |
2367 | */ |
2368 | value = thread->effective_policy.thep_io_tier; |
2369 | if (thread->iotier_override != THROTTLE_LEVEL_NONE) { |
2370 | value = MIN(value, thread->iotier_override); |
2371 | } |
2372 | break; |
2373 | case TASK_POLICY_PASSIVE_IO: |
2374 | /* |
2375 | * The I/O system calls here to find out whether an operation should be passive. |
2376 | * (i.e. not cause operations with lower throttle tiers to be throttled) |
2377 | * Returns 1 for passive mode, 0 for normal mode |
2378 | * |
2379 | * If an override is causing IO to go into a lower tier, we also set |
2380 | * the passive bit so that a thread doesn't end up stuck in its own throttle |
2381 | * window when the override goes away. |
2382 | */ |
2383 | value = thread->effective_policy.thep_io_passive ? 1 : 0; |
2384 | if (thread->iotier_override != THROTTLE_LEVEL_NONE && |
2385 | thread->iotier_override < thread->effective_policy.thep_io_tier) { |
2386 | value = 1; |
2387 | } |
2388 | break; |
2389 | case TASK_POLICY_ALL_SOCKETS_BG: |
2390 | /* |
2391 | * do_background_socket() calls this to determine whether |
2392 | * it should change the thread's sockets |
2393 | * Returns 1 for background mode, 0 for normal mode |
2394 | * This consults both thread and task so un-DBGing a thread while the task is BG |
2395 | * doesn't get you out of the network throttle. |
2396 | */ |
2397 | value = (thread->effective_policy.thep_all_sockets_bg || |
2398 | get_threadtask(thread)->effective_policy.tep_all_sockets_bg) ? 1 : 0; |
2399 | break; |
2400 | case TASK_POLICY_NEW_SOCKETS_BG: |
2401 | /* |
2402 | * socreate() calls this to determine if it should mark a new socket as background |
2403 | * Returns 1 for background mode, 0 for normal mode |
2404 | */ |
2405 | value = thread->effective_policy.thep_new_sockets_bg ? 1 : 0; |
2406 | break; |
2407 | case TASK_POLICY_LATENCY_QOS: |
2408 | /* |
2409 | * timer arming calls into here to find out the timer coalescing level |
2410 | * Returns a latency QoS tier (0-6) |
2411 | */ |
2412 | value = thread->effective_policy.thep_latency_qos; |
2413 | break; |
2414 | case TASK_POLICY_THROUGH_QOS: |
2415 | /* |
2416 | * This value is passed into the urgency callout from the scheduler |
2417 | * to the performance management subsystem. |
2418 | * |
2419 | * Returns a throughput QoS tier (0-6) |
2420 | */ |
2421 | value = thread->effective_policy.thep_through_qos; |
2422 | break; |
2423 | case TASK_POLICY_QOS: |
2424 | /* |
2425 | * This is communicated to the performance management layer and SFI. |
2426 | * |
2427 | * Returns a QoS policy tier |
2428 | */ |
2429 | value = thread->effective_policy.thep_qos; |
2430 | break; |
2431 | default: |
2432 | panic("unknown thread policy flavor %d" , flavor); |
2433 | break; |
2434 | } |
2435 | |
2436 | return value; |
2437 | } |
2438 | |
2439 | |
2440 | /* |
2441 | * (integer_t) casts limit the number of bits we can fit here |
 * this interface is deprecated and replaced by the _EXT struct?
2443 | */ |
2444 | static void |
2445 | proc_get_thread_policy_bitfield(thread_t thread, thread_policy_state_t info) |
2446 | { |
2447 | uint64_t bits = 0; |
2448 | struct thread_requested_policy requested = thread->requested_policy; |
2449 | |
2450 | bits |= (requested.thrp_int_darwinbg ? POLICY_REQ_INT_DARWIN_BG : 0); |
2451 | bits |= (requested.thrp_ext_darwinbg ? POLICY_REQ_EXT_DARWIN_BG : 0); |
2452 | bits |= (requested.thrp_int_iotier ? (((uint64_t)requested.thrp_int_iotier) << POLICY_REQ_INT_IO_TIER_SHIFT) : 0); |
2453 | bits |= (requested.thrp_ext_iotier ? (((uint64_t)requested.thrp_ext_iotier) << POLICY_REQ_EXT_IO_TIER_SHIFT) : 0); |
2454 | bits |= (requested.thrp_int_iopassive ? POLICY_REQ_INT_PASSIVE_IO : 0); |
2455 | bits |= (requested.thrp_ext_iopassive ? POLICY_REQ_EXT_PASSIVE_IO : 0); |
2456 | |
2457 | bits |= (requested.thrp_qos ? (((uint64_t)requested.thrp_qos) << POLICY_REQ_TH_QOS_SHIFT) : 0); |
2458 | bits |= (requested.thrp_qos_override ? (((uint64_t)requested.thrp_qos_override) << POLICY_REQ_TH_QOS_OVER_SHIFT) : 0); |
2459 | |
2460 | bits |= (requested.thrp_pidbind_bg ? POLICY_REQ_PIDBIND_BG : 0); |
2461 | |
2462 | bits |= (requested.thrp_latency_qos ? (((uint64_t)requested.thrp_latency_qos) << POLICY_REQ_BASE_LATENCY_QOS_SHIFT) : 0); |
2463 | bits |= (requested.thrp_through_qos ? (((uint64_t)requested.thrp_through_qos) << POLICY_REQ_BASE_THROUGH_QOS_SHIFT) : 0); |
2464 | |
2465 | info->requested = (integer_t) bits; |
2466 | bits = 0; |
2467 | |
2468 | struct thread_effective_policy effective = thread->effective_policy; |
2469 | |
2470 | bits |= (effective.thep_darwinbg ? POLICY_EFF_DARWIN_BG : 0); |
2471 | |
2472 | bits |= (effective.thep_io_tier ? (((uint64_t)effective.thep_io_tier) << POLICY_EFF_IO_TIER_SHIFT) : 0); |
2473 | bits |= (effective.thep_io_passive ? POLICY_EFF_IO_PASSIVE : 0); |
2474 | bits |= (effective.thep_all_sockets_bg ? POLICY_EFF_ALL_SOCKETS_BG : 0); |
2475 | bits |= (effective.thep_new_sockets_bg ? POLICY_EFF_NEW_SOCKETS_BG : 0); |
2476 | |
2477 | bits |= (effective.thep_qos ? (((uint64_t)effective.thep_qos) << POLICY_EFF_TH_QOS_SHIFT) : 0); |
2478 | |
2479 | bits |= (effective.thep_latency_qos ? (((uint64_t)effective.thep_latency_qos) << POLICY_EFF_LATENCY_QOS_SHIFT) : 0); |
2480 | bits |= (effective.thep_through_qos ? (((uint64_t)effective.thep_through_qos) << POLICY_EFF_THROUGH_QOS_SHIFT) : 0); |
2481 | |
2482 | info->effective = (integer_t)bits; |
2483 | bits = 0; |
2484 | |
2485 | info->pending = 0; |
2486 | } |
2487 | |
2488 | /* |
2489 | * Sneakily trace either the task and thread requested |
2490 | * or just the thread requested, depending on if we have enough room. |
2491 | * We do have room on LP64. On LP32, we have to split it between two uintptr_t's. |
2492 | * |
2493 | * LP32 LP64 |
2494 | * threquested_0(thread) thread[0] task[0] |
2495 | * threquested_1(thread) thread[1] thread[0] |
2496 | * |
2497 | */ |
2498 | |
2499 | uintptr_t |
2500 | threquested_0(thread_t thread) |
2501 | { |
	static_assert(sizeof(struct thread_requested_policy) == sizeof(uint64_t), "size invariant violated");
2503 | |
2504 | uintptr_t* raw = (uintptr_t*)(void*)&thread->requested_policy; |
2505 | |
2506 | return raw[0]; |
2507 | } |
2508 | |
2509 | uintptr_t |
2510 | threquested_1(thread_t thread) |
2511 | { |
2512 | #if defined __LP64__ |
2513 | return *(uintptr_t*)&get_threadtask(thread)->requested_policy; |
2514 | #else |
2515 | uintptr_t* raw = (uintptr_t*)(void*)&thread->requested_policy; |
2516 | return raw[1]; |
2517 | #endif |
2518 | } |
2519 | |
2520 | uintptr_t |
2521 | theffective_0(thread_t thread) |
2522 | { |
	static_assert(sizeof(struct thread_effective_policy) == sizeof(uint64_t), "size invariant violated");
2524 | |
2525 | uintptr_t* raw = (uintptr_t*)(void*)&thread->effective_policy; |
2526 | return raw[0]; |
2527 | } |
2528 | |
2529 | uintptr_t |
2530 | theffective_1(thread_t thread) |
2531 | { |
2532 | #if defined __LP64__ |
2533 | return *(uintptr_t*)&get_threadtask(thread)->effective_policy; |
2534 | #else |
2535 | uintptr_t* raw = (uintptr_t*)(void*)&thread->effective_policy; |
2536 | return raw[1]; |
2537 | #endif |
2538 | } |
2539 | |
2540 | |
2541 | /* |
2542 | * Set an override on the thread which is consulted with a |
2543 | * higher priority than the task/thread policy. This should |
2544 | * only be set for temporary grants until the thread |
2545 | * returns to the userspace boundary |
2546 | * |
2547 | * We use atomic operations to swap in the override, with |
2548 | * the assumption that the thread itself can |
2549 | * read the override and clear it on return to userspace. |
2550 | * |
2551 | * No locking is performed, since it is acceptable to see |
2552 | * a stale override for one loop through throttle_lowpri_io(). |
2553 | * However a thread reference must be held on the thread. |
2554 | */ |
2555 | |
2556 | void |
2557 | set_thread_iotier_override(thread_t thread, int policy) |
2558 | { |
2559 | int current_override; |
2560 | |
2561 | /* Let most aggressive I/O policy win until user boundary */ |
2562 | do { |
2563 | current_override = thread->iotier_override; |
2564 | |
2565 | if (current_override != THROTTLE_LEVEL_NONE) { |
2566 | policy = MIN(current_override, policy); |
2567 | } |
2568 | |
2569 | if (current_override == policy) { |
2570 | /* no effective change */ |
2571 | return; |
2572 | } |
2573 | } while (!OSCompareAndSwap(current_override, policy, &thread->iotier_override)); |
2574 | |
2575 | /* |
2576 | * Since the thread may be currently throttled, |
2577 | * re-evaluate tiers and potentially break out |
2578 | * of an msleep |
2579 | */ |
	rethrottle_thread(get_bsdthread_info(thread));
2581 | } |
2582 | |
2583 | /* |
2584 | * Userspace synchronization routines (like pthread mutexes, pthread reader-writer locks, |
2585 | * semaphores, dispatch_sync) may result in priority inversions where a higher priority |
2586 | * (i.e. scheduler priority, I/O tier, QoS tier) is waiting on a resource owned by a lower |
2587 | * priority thread. In these cases, we attempt to propagate the priority token, as long |
2588 | * as the subsystem informs us of the relationships between the threads. The userspace |
2589 | * synchronization subsystem should maintain the information of owner->resource and |
2590 | * resource->waiters itself. |
2591 | */ |
2592 | |
2593 | /* |
2594 | * This helper canonicalizes the resource/resource_type given the current qos_override_mode |
2595 | * in effect. Note that wildcards (THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD) may need |
2596 | * to be handled specially in the future, but for now it's fine to slam |
2597 | * *resource to USER_ADDR_NULL even if it was previously a wildcard. |
2598 | */ |
2599 | static void |
2600 | canonicalize_resource_and_type(user_addr_t *resource, int *resource_type) |
2601 | { |
2602 | if (qos_override_mode == QOS_OVERRIDE_MODE_OVERHANG_PEAK || qos_override_mode == QOS_OVERRIDE_MODE_IGNORE_OVERRIDE) { |
2603 | /* Map all input resource/type to a single one */ |
2604 | *resource = USER_ADDR_NULL; |
2605 | *resource_type = THREAD_QOS_OVERRIDE_TYPE_UNKNOWN; |
2606 | } else if (qos_override_mode == QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE) { |
2607 | /* no transform */ |
2608 | } else if (qos_override_mode == QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_SINGLE_MUTEX_OVERRIDE) { |
2609 | /* Map all mutex overrides to a single one, to avoid memory overhead */ |
2610 | if (*resource_type == THREAD_QOS_OVERRIDE_TYPE_PTHREAD_MUTEX) { |
2611 | *resource = USER_ADDR_NULL; |
2612 | } |
2613 | } |
2614 | } |
2615 | |
2616 | /* This helper routine finds an existing override if known. Locking should be done by caller */ |
2617 | static struct thread_qos_override * |
2618 | find_qos_override(thread_t thread, |
2619 | user_addr_t resource, |
2620 | int resource_type) |
2621 | { |
2622 | struct thread_qos_override *override; |
2623 | |
2624 | override = thread->overrides; |
2625 | while (override) { |
2626 | if (override->override_resource == resource && |
2627 | override->override_resource_type == resource_type) { |
2628 | return override; |
2629 | } |
2630 | |
2631 | override = override->override_next; |
2632 | } |
2633 | |
2634 | return NULL; |
2635 | } |
2636 | |
2637 | static void |
2638 | find_and_decrement_qos_override(thread_t thread, |
2639 | user_addr_t resource, |
2640 | int resource_type, |
2641 | boolean_t reset, |
2642 | struct thread_qos_override **free_override_list) |
2643 | { |
2644 | struct thread_qos_override *override, *override_prev; |
2645 | |
2646 | override_prev = NULL; |
2647 | override = thread->overrides; |
2648 | while (override) { |
2649 | struct thread_qos_override *override_next = override->override_next; |
2650 | |
2651 | if ((THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD == resource || override->override_resource == resource) && |
2652 | (THREAD_QOS_OVERRIDE_TYPE_WILDCARD == resource_type || override->override_resource_type == resource_type)) { |
2653 | if (reset) { |
2654 | override->override_contended_resource_count = 0; |
2655 | } else { |
2656 | override->override_contended_resource_count--; |
2657 | } |
2658 | |
2659 | if (override->override_contended_resource_count == 0) { |
2660 | if (override_prev == NULL) { |
2661 | thread->overrides = override_next; |
2662 | } else { |
2663 | override_prev->override_next = override_next; |
2664 | } |
2665 | |
2666 | /* Add to out-param for later zfree */ |
2667 | override->override_next = *free_override_list; |
2668 | *free_override_list = override; |
2669 | } else { |
2670 | override_prev = override; |
2671 | } |
2672 | |
2673 | if (THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD != resource) { |
2674 | return; |
2675 | } |
2676 | } else { |
2677 | override_prev = override; |
2678 | } |
2679 | |
2680 | override = override_next; |
2681 | } |
2682 | } |
2683 | |
2684 | /* This helper recalculates the current requested override using the policy selected at boot */ |
2685 | static int |
2686 | calculate_requested_qos_override(thread_t thread) |
2687 | { |
2688 | if (qos_override_mode == QOS_OVERRIDE_MODE_IGNORE_OVERRIDE) { |
2689 | return THREAD_QOS_UNSPECIFIED; |
2690 | } |
2691 | |
2692 | /* iterate over all overrides and calculate MAX */ |
2693 | struct thread_qos_override *override; |
2694 | int qos_override = THREAD_QOS_UNSPECIFIED; |
2695 | |
2696 | override = thread->overrides; |
2697 | while (override) { |
2698 | qos_override = MAX(qos_override, override->override_qos); |
2699 | override = override->override_next; |
2700 | } |
2701 | |
2702 | return qos_override; |
2703 | } |
2704 | |
2705 | /* |
2706 | * Returns: |
2707 | * - 0 on success |
2708 | * - EINVAL if some invalid input was passed |
2709 | */ |
2710 | static int |
2711 | proc_thread_qos_add_override_internal(thread_t thread, |
2712 | int override_qos, |
2713 | boolean_t first_override_for_resource, |
2714 | user_addr_t resource, |
2715 | int resource_type) |
2716 | { |
2717 | struct task_pend_token pend_token = {}; |
2718 | int rc = 0; |
2719 | |
2720 | thread_mtx_lock(thread); |
2721 | |
2722 | KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_ADD_OVERRIDE)) | DBG_FUNC_START, |
2723 | thread_tid(thread), override_qos, first_override_for_resource ? 1 : 0, 0, 0); |
2724 | |
2725 | DTRACE_BOOST5(qos_add_override_pre, uint64_t, thread_tid(thread), |
2726 | uint64_t, thread->requested_policy.thrp_qos, |
2727 | uint64_t, thread->effective_policy.thep_qos, |
2728 | int, override_qos, boolean_t, first_override_for_resource); |
2729 | |
2730 | struct thread_qos_override *override; |
2731 | struct thread_qos_override *override_new = NULL; |
2732 | int new_qos_override, prev_qos_override; |
2733 | int new_effective_qos; |
2734 | |
	canonicalize_resource_and_type(&resource, &resource_type);
2736 | |
2737 | override = find_qos_override(thread, resource, resource_type); |
2738 | if (first_override_for_resource && !override) { |
		/* We need to allocate a new object. Drop the thread mutex and
		 * recheck afterwards in case someone else added the override
		 * in the meantime.
		 */
2742 | thread_mtx_unlock(thread); |
		override_new = zalloc(thread_qos_override_zone);
2744 | thread_mtx_lock(thread); |
2745 | override = find_qos_override(thread, resource, resource_type); |
2746 | } |
2747 | if (first_override_for_resource && override) { |
		/* Someone else already allocated while the thread mutex was dropped */
2749 | override->override_contended_resource_count++; |
2750 | } else if (!override && override_new) { |
2751 | override = override_new; |
2752 | override_new = NULL; |
2753 | override->override_next = thread->overrides; |
2754 | /* since first_override_for_resource was TRUE */ |
2755 | override->override_contended_resource_count = 1; |
2756 | override->override_resource = resource; |
2757 | override->override_resource_type = (int16_t)resource_type; |
2758 | override->override_qos = THREAD_QOS_UNSPECIFIED; |
2759 | thread->overrides = override; |
2760 | } |
2761 | |
2762 | if (override) { |
2763 | if (override->override_qos == THREAD_QOS_UNSPECIFIED) { |
2764 | override->override_qos = (int16_t)override_qos; |
2765 | } else { |
2766 | override->override_qos = MAX(override->override_qos, (int16_t)override_qos); |
2767 | } |
2768 | } |
2769 | |
2770 | /* Determine how to combine the various overrides into a single current |
2771 | * requested override |
2772 | */ |
2773 | new_qos_override = calculate_requested_qos_override(thread); |
2774 | |
2775 | prev_qos_override = proc_get_thread_policy_locked(thread, |
2776 | TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_OVERRIDE, NULL); |
2777 | |
2778 | if (new_qos_override != prev_qos_override) { |
2779 | proc_set_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, |
2780 | TASK_POLICY_QOS_OVERRIDE, |
		    new_qos_override, 0, &pend_token);
2782 | } |
2783 | |
2784 | new_effective_qos = proc_get_effective_thread_policy(thread, TASK_POLICY_QOS); |
2785 | |
2786 | thread_mtx_unlock(thread); |
2787 | |
	thread_policy_update_complete_unlocked(thread, &pend_token);
2789 | |
2790 | if (override_new) { |
2791 | zfree(thread_qos_override_zone, override_new); |
2792 | } |
2793 | |
2794 | DTRACE_BOOST4(qos_add_override_post, int, prev_qos_override, |
2795 | int, new_qos_override, int, new_effective_qos, int, rc); |
2796 | |
2797 | KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_ADD_OVERRIDE)) | DBG_FUNC_END, |
2798 | new_qos_override, resource, resource_type, 0, 0); |
2799 | |
2800 | return rc; |
2801 | } |
2802 | |
2803 | int |
2804 | proc_thread_qos_add_override(task_t task, |
2805 | thread_t thread, |
2806 | uint64_t tid, |
2807 | int override_qos, |
2808 | boolean_t first_override_for_resource, |
2809 | user_addr_t resource, |
2810 | int resource_type) |
2811 | { |
2812 | boolean_t has_thread_reference = FALSE; |
2813 | int rc = 0; |
2814 | |
2815 | if (thread == THREAD_NULL) { |
2816 | thread = task_findtid(task, tid); |
2817 | /* returns referenced thread */ |
2818 | |
2819 | if (thread == THREAD_NULL) { |
2820 | KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_ADD_OVERRIDE)) | DBG_FUNC_NONE, |
2821 | tid, 0, 0xdead, 0, 0); |
2822 | return ESRCH; |
2823 | } |
2824 | has_thread_reference = TRUE; |
2825 | } else { |
2826 | assert(get_threadtask(thread) == task); |
2827 | } |
2828 | rc = proc_thread_qos_add_override_internal(thread, override_qos, |
2829 | first_override_for_resource, resource, resource_type); |
2830 | if (has_thread_reference) { |
2831 | thread_deallocate(thread); |
2832 | } |
2833 | |
2834 | return rc; |
2835 | } |
2836 | |
2837 | static void |
2838 | proc_thread_qos_remove_override_internal(thread_t thread, |
2839 | user_addr_t resource, |
2840 | int resource_type, |
2841 | boolean_t reset) |
2842 | { |
2843 | struct task_pend_token pend_token = {}; |
2844 | |
2845 | struct thread_qos_override *deferred_free_override_list = NULL; |
2846 | int new_qos_override, prev_qos_override, new_effective_qos; |
2847 | |
2848 | thread_mtx_lock(thread); |
2849 | |
	canonicalize_resource_and_type(&resource, &resource_type);
2851 | |
	find_and_decrement_qos_override(thread, resource, resource_type, reset, &deferred_free_override_list);
2853 | |
2854 | KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_REMOVE_OVERRIDE)) | DBG_FUNC_START, |
2855 | thread_tid(thread), resource, reset, 0, 0); |
2856 | |
2857 | DTRACE_BOOST3(qos_remove_override_pre, uint64_t, thread_tid(thread), |
2858 | uint64_t, thread->requested_policy.thrp_qos, |
2859 | uint64_t, thread->effective_policy.thep_qos); |
2860 | |
2861 | /* Determine how to combine the various overrides into a single current requested override */ |
2862 | new_qos_override = calculate_requested_qos_override(thread); |
2863 | |
2864 | spl_t s = splsched(); |
2865 | thread_lock(thread); |
2866 | |
2867 | /* |
2868 | * The override chain and therefore the value of the current override is locked with thread mutex, |
2869 | * so we can do a get/set without races. However, the rest of thread policy is locked under the spinlock. |
2870 | * This means you can't change the current override from a spinlock-only setter. |
2871 | */ |
2872 | prev_qos_override = thread_get_requested_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_OVERRIDE, NULL); |
2873 | |
2874 | if (new_qos_override != prev_qos_override) { |
		proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_OVERRIDE, new_qos_override, 0, &pend_token);
2876 | } |
2877 | |
2878 | new_effective_qos = proc_get_effective_thread_policy(thread, TASK_POLICY_QOS); |
2879 | |
2880 | thread_unlock(thread); |
2881 | splx(s); |
2882 | |
2883 | thread_mtx_unlock(thread); |
2884 | |
	thread_policy_update_complete_unlocked(thread, &pend_token);
2886 | |
2887 | while (deferred_free_override_list) { |
2888 | struct thread_qos_override *override_next = deferred_free_override_list->override_next; |
2889 | |
2890 | zfree(thread_qos_override_zone, deferred_free_override_list); |
2891 | deferred_free_override_list = override_next; |
2892 | } |
2893 | |
2894 | DTRACE_BOOST3(qos_remove_override_post, int, prev_qos_override, |
2895 | int, new_qos_override, int, new_effective_qos); |
2896 | |
2897 | KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_REMOVE_OVERRIDE)) | DBG_FUNC_END, |
2898 | thread_tid(thread), 0, 0, 0, 0); |
2899 | } |
2900 | |
2901 | int |
2902 | proc_thread_qos_remove_override(task_t task, |
2903 | thread_t thread, |
2904 | uint64_t tid, |
2905 | user_addr_t resource, |
2906 | int resource_type) |
2907 | { |
2908 | boolean_t has_thread_reference = FALSE; |
2909 | |
2910 | if (thread == THREAD_NULL) { |
2911 | thread = task_findtid(task, tid); |
2912 | /* returns referenced thread */ |
2913 | |
2914 | if (thread == THREAD_NULL) { |
2915 | KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_REMOVE_OVERRIDE)) | DBG_FUNC_NONE, |
2916 | tid, 0, 0xdead, 0, 0); |
2917 | return ESRCH; |
2918 | } |
2919 | has_thread_reference = TRUE; |
2920 | } else { |
2921 | assert(task == get_threadtask(thread)); |
2922 | } |
2923 | |
2924 | proc_thread_qos_remove_override_internal(thread, resource, resource_type, FALSE); |
2925 | |
2926 | if (has_thread_reference) { |
2927 | thread_deallocate(thread); |
2928 | } |
2929 | |
2930 | return 0; |
2931 | } |
2932 | |
2933 | /* Deallocate before thread termination */ |
2934 | void |
2935 | proc_thread_qos_deallocate(thread_t thread) |
2936 | { |
2937 | /* This thread must have no more IPC overrides. */ |
2938 | assert(thread->kevent_overrides == 0); |
2939 | assert(thread->requested_policy.thrp_qos_kevent_override == THREAD_QOS_UNSPECIFIED); |
2940 | assert(thread->requested_policy.thrp_qos_wlsvc_override == THREAD_QOS_UNSPECIFIED); |
2941 | |
2942 | /* |
2943 | * Clear out any lingering override objects. |
2944 | */ |
2945 | struct thread_qos_override *override; |
2946 | |
2947 | thread_mtx_lock(thread); |
2948 | override = thread->overrides; |
2949 | thread->overrides = NULL; |
2950 | thread->requested_policy.thrp_qos_override = THREAD_QOS_UNSPECIFIED; |
2951 | /* We don't need to re-evaluate thread policy here because the thread has already exited */ |
2952 | thread_mtx_unlock(thread); |
2953 | |
2954 | while (override) { |
2955 | struct thread_qos_override *override_next = override->override_next; |
2956 | |
2957 | zfree(thread_qos_override_zone, override); |
2958 | override = override_next; |
2959 | } |
2960 | } |
2961 | |
2962 | /* |
2963 | * Set up the primordial thread's QoS |
2964 | */ |
2965 | void |
2966 | task_set_main_thread_qos(task_t task, thread_t thread) |
2967 | { |
2968 | struct task_pend_token pend_token = {}; |
2969 | |
2970 | assert(get_threadtask(thread) == task); |
2971 | |
2972 | thread_mtx_lock(thread); |
2973 | |
2974 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
2975 | (IMPORTANCE_CODE(IMP_MAIN_THREAD_QOS, 0)) | DBG_FUNC_START, |
2976 | thread_tid(thread), threquested_0(thread), threquested_1(thread), |
2977 | thread->requested_policy.thrp_qos, 0); |
2978 | |
2979 | thread_qos_t primordial_qos = task_compute_main_thread_qos(task); |
2980 | |
2981 | proc_set_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_AND_RELPRIO, |
	    primordial_qos, 0, &pend_token);
2983 | |
2984 | thread_mtx_unlock(thread); |
2985 | |
	thread_policy_update_complete_unlocked(thread, &pend_token);
2987 | |
2988 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
2989 | (IMPORTANCE_CODE(IMP_MAIN_THREAD_QOS, 0)) | DBG_FUNC_END, |
2990 | thread_tid(thread), threquested_0(thread), threquested_1(thread), |
2991 | primordial_qos, 0); |
2992 | } |
2993 | |
2994 | /* |
2995 | * KPI for pthread kext |
2996 | * |
2997 | * Return a good guess at what the initial manager QoS will be |
2998 | * Dispatch can override this in userspace if it so chooses |
2999 | */ |
3000 | thread_qos_t |
3001 | task_get_default_manager_qos(task_t task) |
3002 | { |
3003 | thread_qos_t primordial_qos = task_compute_main_thread_qos(task); |
3004 | |
3005 | if (primordial_qos == THREAD_QOS_LEGACY) { |
3006 | primordial_qos = THREAD_QOS_USER_INITIATED; |
3007 | } |
3008 | |
3009 | return primordial_qos; |
3010 | } |
3011 | |
3012 | /* |
3013 | * Check if the kernel promotion on thread has changed |
3014 | * and apply it. |
3015 | * |
3016 | * thread locked on entry and exit |
3017 | */ |
3018 | boolean_t |
3019 | thread_recompute_kernel_promotion_locked(thread_t thread) |
3020 | { |
3021 | boolean_t needs_update = FALSE; |
3022 | uint8_t kern_promotion_schedpri = (uint8_t)thread_get_inheritor_turnstile_sched_priority(thread); |
3023 | |
3024 | /* |
3025 | * For now just assert that kern_promotion_schedpri <= MAXPRI_PROMOTE. |
3026 | * TURNSTILE_KERNEL_PROMOTE adds threads on the waitq already capped to MAXPRI_PROMOTE |
3027 | * and propagates the priority through the chain with the same cap, because as of now it does |
 * not differentiate on the kernel primitive.
 *
 * If this assumption changes with the adoption of a kernel primitive that does not
 * cap the priority when adding/propagating, then this is the place to put a generic
 * cap for all kernel primitives
 * (convert the assert to kern_promotion_schedpri = MIN(priority, MAXPRI_PROMOTE)).
3034 | */ |
3035 | assert(kern_promotion_schedpri <= MAXPRI_PROMOTE); |
3036 | |
3037 | if (kern_promotion_schedpri != thread->kern_promotion_schedpri) { |
3038 | KDBG(MACHDBG_CODE( |
3039 | DBG_MACH_SCHED, MACH_TURNSTILE_KERNEL_CHANGE) | DBG_FUNC_NONE, |
3040 | thread_tid(thread), |
3041 | kern_promotion_schedpri, |
3042 | thread->kern_promotion_schedpri); |
3043 | |
3044 | needs_update = TRUE; |
3045 | thread->kern_promotion_schedpri = kern_promotion_schedpri; |
		thread_recompute_sched_pri(thread, SETPRI_DEFAULT);
3047 | } |
3048 | |
3049 | return needs_update; |
3050 | } |
3051 | |
3052 | /* |
3053 | * Check if the user promotion on thread has changed |
3054 | * and apply it. |
3055 | * |
3056 | * thread locked on entry, might drop the thread lock |
3057 | * and reacquire it. |
3058 | */ |
3059 | boolean_t |
3060 | thread_recompute_user_promotion_locked(thread_t thread) |
3061 | { |
3062 | boolean_t needs_update = FALSE; |
3063 | struct task_pend_token pend_token = {}; |
3064 | uint8_t user_promotion_basepri = MIN((uint8_t)thread_get_inheritor_turnstile_base_priority(thread), MAXPRI_USER); |
3065 | int old_base_pri = thread->base_pri; |
3066 | thread_qos_t qos_promotion; |
3067 | |
3068 | /* Check if user promotion has changed */ |
3069 | if (thread->user_promotion_basepri == user_promotion_basepri) { |
3070 | return needs_update; |
3071 | } else { |
3072 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
3073 | (TURNSTILE_CODE(TURNSTILE_PRIORITY_OPERATIONS, (THREAD_USER_PROMOTION_CHANGE))) | DBG_FUNC_NONE, |
3074 | thread_tid(thread), |
3075 | user_promotion_basepri, |
3076 | thread->user_promotion_basepri, |
3077 | 0, 0); |
3078 | KDBG(MACHDBG_CODE( |
3079 | DBG_MACH_SCHED, MACH_TURNSTILE_USER_CHANGE) | DBG_FUNC_NONE, |
3080 | thread_tid(thread), |
3081 | user_promotion_basepri, |
3082 | thread->user_promotion_basepri); |
3083 | } |
3084 | |
3085 | /* Update the user promotion base pri */ |
3086 | thread->user_promotion_basepri = user_promotion_basepri; |
3087 | pend_token.tpt_force_recompute_pri = 1; |
3088 | |
3089 | if (user_promotion_basepri <= MAXPRI_THROTTLE) { |
3090 | qos_promotion = THREAD_QOS_UNSPECIFIED; |
3091 | } else { |
		qos_promotion = thread_user_promotion_qos_for_pri(user_promotion_basepri);
3093 | } |
3094 | |
3095 | proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, |
	    TASK_POLICY_QOS_PROMOTE, qos_promotion, 0, &pend_token);
3097 | |
3098 | if (thread_get_waiting_turnstile(thread) && |
3099 | thread->base_pri != old_base_pri) { |
3100 | needs_update = TRUE; |
3101 | } |
3102 | |
3103 | thread_unlock(thread); |
3104 | |
	thread_policy_update_complete_unlocked(thread, &pend_token);
3106 | |
3107 | thread_lock(thread); |
3108 | |
3109 | return needs_update; |
3110 | } |
3111 | |
3112 | /* |
 * Convert the thread's user promotion base priority to a QoS for threads in the QoS world.
 * For a base priority above the UI QoS base priority, the QoS is capped at UI.
3115 | */ |
3116 | thread_qos_t |
3117 | thread_user_promotion_qos_for_pri(int priority) |
3118 | { |
3119 | thread_qos_t qos; |
3120 | for (qos = THREAD_QOS_USER_INTERACTIVE; qos > THREAD_QOS_MAINTENANCE; qos--) { |
3121 | if (thread_qos_policy_params.qos_pri[qos] <= priority) { |
3122 | return qos; |
3123 | } |
3124 | } |
3125 | return THREAD_QOS_MAINTENANCE; |
3126 | } |
3127 | |
3128 | /* |
3129 | * Set the thread's QoS Kevent override |
3130 | * Owned by the Kevent subsystem |
3131 | * |
3132 | * May be called with spinlocks held, but not spinlocks |
3133 | * that may deadlock against the thread lock, the throttle lock, or the SFI lock. |
3134 | * |
3135 | * One 'add' must be balanced by one 'drop'. |
 * Between 'add' and 'drop', the override QoS value may be updated with an 'update'.
3137 | * Before the thread is deallocated, there must be 0 remaining overrides. |
 */
static void
thread_kevent_override(thread_t thread,
    uint32_t qos_override,
    boolean_t is_new_override)
{
    struct task_pend_token pend_token = {};
    boolean_t needs_update;

    spl_t s = splsched();
    thread_lock(thread);

    uint32_t old_override = thread->requested_policy.thrp_qos_kevent_override;

    assert(qos_override > THREAD_QOS_UNSPECIFIED);
    assert(qos_override < THREAD_QOS_LAST);

    if (is_new_override) {
        if (thread->kevent_overrides++ == 0) {
            /* This add is the first override for this thread */
            assert(old_override == THREAD_QOS_UNSPECIFIED);
        } else {
            /* There are already other overrides in effect for this thread */
            assert(old_override > THREAD_QOS_UNSPECIFIED);
        }
    } else {
        /* There must be at least one override (the previous add call) in effect */
        assert(thread->kevent_overrides > 0);
        assert(old_override > THREAD_QOS_UNSPECIFIED);
    }

    /*
     * We can't allow lowering if there are several IPC overrides because
     * the caller can't possibly know the whole truth
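     * (e.g. with two outstanding adds at different QoS levels, an 'update'
     * from the lower one must not drag the effective override down, so only
     * raises are honored).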
     */
    if (thread->kevent_overrides == 1) {
        needs_update = qos_override != old_override;
    } else {
        needs_update = qos_override > old_override;
    }

    if (needs_update) {
        proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE,
            TASK_POLICY_QOS_KEVENT_OVERRIDE,
            qos_override, 0, &pend_token);
        assert(pend_token.tpt_update_sockets == 0);
    }

    thread_unlock(thread);
    splx(s);

    thread_policy_update_complete_unlocked(thread, &pend_token);
}

void
thread_add_kevent_override(thread_t thread, uint32_t qos_override)
{
    thread_kevent_override(thread, qos_override, TRUE);
}

void
thread_update_kevent_override(thread_t thread, uint32_t qos_override)
{
    thread_kevent_override(thread, qos_override, FALSE);
}

void
thread_drop_kevent_override(thread_t thread)
{
    struct task_pend_token pend_token = {};

    spl_t s = splsched();
    thread_lock(thread);

    assert(thread->kevent_overrides > 0);

    if (--thread->kevent_overrides == 0) {
        /*
         * There are no more overrides for this thread, so we should
         * clear out the saturated override value
         */

        proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE,
            TASK_POLICY_QOS_KEVENT_OVERRIDE, THREAD_QOS_UNSPECIFIED,
            0, &pend_token);
    }

    thread_unlock(thread);
    splx(s);

    thread_policy_update_complete_unlocked(thread, &pend_token);
}

/*
 * Set the thread's QoS Workloop Servicer override
 * Owned by the Kevent subsystem
 *
 * May be called with spinlocks held, but not spinlocks
 * that may deadlock against the thread lock, the throttle lock, or the SFI lock.
 *
 * One 'add' must be balanced by one 'drop'.
 * Between 'add' and 'drop', the override QoS value may be updated with an 'update'.
 * Before the thread is deallocated, there must be 0 remaining overrides.
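 *
 * Illustrative call sequence (not an actual call site):
 *   thread_add_servicer_override(thread, THREAD_QOS_UTILITY);
 *   ...
 *   thread_update_servicer_override(thread, THREAD_QOS_USER_INITIATED);
 *   ...
 *   thread_drop_servicer_override(thread);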
 */
static void
thread_servicer_override(thread_t thread,
    uint32_t qos_override,
    boolean_t is_new_override)
{
    struct task_pend_token pend_token = {};

    spl_t s = splsched();
    thread_lock(thread);

    if (is_new_override) {
        assert(!thread->requested_policy.thrp_qos_wlsvc_override);
    } else {
        assert(thread->requested_policy.thrp_qos_wlsvc_override);
    }

    proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE,
        TASK_POLICY_QOS_SERVICER_OVERRIDE,
        qos_override, 0, &pend_token);

    thread_unlock(thread);
    splx(s);

    assert(pend_token.tpt_update_sockets == 0);
    thread_policy_update_complete_unlocked(thread, &pend_token);
}

void
thread_add_servicer_override(thread_t thread, uint32_t qos_override)
{
    assert(qos_override > THREAD_QOS_UNSPECIFIED);
    assert(qos_override < THREAD_QOS_LAST);

    thread_servicer_override(thread, qos_override, TRUE);
}

void
thread_update_servicer_override(thread_t thread, uint32_t qos_override)
{
    assert(qos_override > THREAD_QOS_UNSPECIFIED);
    assert(qos_override < THREAD_QOS_LAST);

    thread_servicer_override(thread, qos_override, FALSE);
}

void
thread_drop_servicer_override(thread_t thread)
{
    thread_servicer_override(thread, THREAD_QOS_UNSPECIFIED, FALSE);
}

void
thread_update_servicer_iotier_override(thread_t thread, uint8_t iotier_override)
{
    struct task_pend_token pend_token = {};
    uint8_t current_iotier;

    /* Check if the update is needed */
    current_iotier = (uint8_t)thread_get_requested_policy_spinlocked(thread,
        TASK_POLICY_ATTRIBUTE, TASK_POLICY_IOTIER_KEVENT_OVERRIDE, NULL);

    if (iotier_override == current_iotier) {
        return;
    }

    spl_t s = splsched();
    thread_lock(thread);

    proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE,
        TASK_POLICY_IOTIER_KEVENT_OVERRIDE,
        iotier_override, 0, &pend_token);

    thread_unlock(thread);
    splx(s);

    assert(pend_token.tpt_update_sockets == 0);
    thread_policy_update_complete_unlocked(thread, &pend_token);
}

/* Get current requested qos / relpri, may be called from spinlock context */
thread_qos_t
thread_get_requested_qos(thread_t thread, int *relpri)
{
    int relprio_value = 0;
    thread_qos_t qos;

    qos = (thread_qos_t)proc_get_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE,
        TASK_POLICY_QOS_AND_RELPRIO, &relprio_value);
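    /* The relative priority is stored with its sign flipped; undo that for the caller. */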
    if (relpri) {
        *relpri = -relprio_value;
    }
    return qos;
}

/*
 * This function will promote the thread priority
 * since exec could block other threads calling
 * proc_find on the proc. This boost must be removed
 * via a call to thread_clear_exec_promotion.
 *
 * This should be replaced with a generic 'priority inheriting gate' mechanism (24194397)
 */
void
thread_set_exec_promotion(thread_t thread)
{
    spl_t s = splsched();
    thread_lock(thread);

    sched_thread_promote_reason(thread, TH_SFLAG_EXEC_PROMOTED, 0);

    thread_unlock(thread);
    splx(s);
}

/*
 * This function will clear the exec thread
 * promotion set on the thread by thread_set_exec_promotion.
 */
void
thread_clear_exec_promotion(thread_t thread)
{
    spl_t s = splsched();
    thread_lock(thread);

    sched_thread_unpromote_reason(thread, TH_SFLAG_EXEC_PROMOTED, 0);

    thread_unlock(thread);
    splx(s);
}

#if CONFIG_SCHED_RT_ALLOW

/*
 * flag set by -rt-allow-policy-enable boot-arg to restrict use of
 * THREAD_TIME_CONSTRAINT_POLICY and THREAD_TIME_CONSTRAINT_WITH_PRIORITY_POLICY
 * to threads that have joined a work interval with WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED.
 */
static TUNABLE(
    bool,
    rt_allow_policy_enabled,
    "-rt-allow_policy-enable",
    false);

/*
 * When the RT allow policy is enabled and a thread is allowed to become RT,
 * sometimes (if the process's RT allow policy is restricted) the thread will
 * have a CPU limit enforced. The following two tunables determine the
 * parameters for that CPU limit.
 */

/* % of the interval allowed to run. */
TUNABLE_DEV_WRITEABLE(uint8_t, rt_allow_limit_percent,
    "rt_allow_limit_percent", 70);

/* The length of the interval in milliseconds. */
TUNABLE_DEV_WRITEABLE(uint16_t, rt_allow_limit_interval_ms,
    "rt_allow_limit_interval", 10);

static bool
thread_has_rt(thread_t thread)
{
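    /*
     * Check saved_mode as well, so a realtime thread whose scheduling mode
     * is temporarily stashed by a demotion still counts as RT.
     */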
    return
        thread->sched_mode == TH_MODE_REALTIME ||
        thread->saved_mode == TH_MODE_REALTIME;
}

/*
 * Set a CPU limit on a thread based on the RT allow policy. This will be picked
 * up by the target thread via the ledger AST.
 */
static void
thread_rt_set_cpulimit(thread_t thread)
{
    /* Force reasonable values for the cpu limit. */
    const uint8_t percent = MAX(MIN(rt_allow_limit_percent, 99), 1);
    const uint16_t interval_ms = MAX(rt_allow_limit_interval_ms, 1);

    thread->t_ledger_req_percentage = percent;
    thread->t_ledger_req_interval_ms = interval_ms;
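    /* Block the thread for the rest of the interval once it exceeds its CPU allotment. */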
    thread->t_ledger_req_action = THREAD_CPULIMIT_BLOCK;

    thread->sched_flags |= TH_SFLAG_RT_CPULIMIT;
}

/* Similar to the above but removes any CPU limit. */
static void
thread_rt_clear_cpulimit(thread_t thread)
{
    thread->sched_flags &= ~TH_SFLAG_RT_CPULIMIT;

    thread->t_ledger_req_percentage = 0;
    thread->t_ledger_req_interval_ms = 0;
    thread->t_ledger_req_action = THREAD_CPULIMIT_DISABLE;
}

/*
 * Evaluate RT policy for a thread, demoting and undemoting as needed.
 */
void
thread_rt_evaluate(thread_t thread)
{
    task_t task = get_threadtask(thread);
    bool platform_binary = false;

    /* If the RT allow policy is not enabled, there is nothing to do. */
    if (!rt_allow_policy_enabled) {
        return;
    }

    /* User threads only. */
    if (task == kernel_task) {
        return;
    }

    /* Check for platform binary. */
    platform_binary = (task_ro_flags_get(task) & TFRO_PLATFORM) != 0;

    spl_t s = splsched();
    thread_lock(thread);

    const thread_work_interval_flags_t wi_flags =
        os_atomic_load(&thread->th_work_interval_flags, relaxed);

    /*
     * RT threads which are not joined to a work interval that allows RT
     * threads are demoted. Once those conditions no longer hold, the thread
     * is undemoted.
     */
    if (thread_has_rt(thread) && (wi_flags & TH_WORK_INTERVAL_FLAGS_RT_ALLOWED) == 0) {
        if (!sched_thread_mode_has_demotion(thread, TH_SFLAG_RT_DISALLOWED)) {
            KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_DISALLOWED_WORK_INTERVAL),
                thread_tid(thread));
            sched_thread_mode_demote(thread, TH_SFLAG_RT_DISALLOWED);
        }
    } else {
        if (sched_thread_mode_has_demotion(thread, TH_SFLAG_RT_DISALLOWED)) {
            sched_thread_mode_undemote(thread, TH_SFLAG_RT_DISALLOWED);
        }
    }

    /*
     * RT threads get a CPU limit unless they're part of a platform binary
     * task. If the thread is no longer RT, any existing CPU limit should be
     * removed.
     */
    bool set_ast = false;
    if (!platform_binary &&
        thread_has_rt(thread) &&
        (thread->sched_flags & TH_SFLAG_RT_CPULIMIT) == 0) {
        thread_rt_set_cpulimit(thread);
        set_ast = true;
    }

    if (!platform_binary &&
        !thread_has_rt(thread) &&
        (thread->sched_flags & TH_SFLAG_RT_CPULIMIT) != 0) {
        thread_rt_clear_cpulimit(thread);
        set_ast = true;
    }

    thread_unlock(thread);
    splx(s);

    if (set_ast) {
        /* Ensure the target thread picks up any CPU limit change. */
        act_set_astledger(thread);
    }
}

#else

void
thread_rt_evaluate(__unused thread_t thread)
{
}

#endif /* CONFIG_SCHED_RT_ALLOW */
