1 | /* |
2 | * Copyright (c) 2017 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | |
29 | |
30 | #include <sys/work_interval.h> |
31 | |
32 | #include <kern/work_interval.h> |
33 | |
34 | #include <kern/thread.h> |
35 | #include <kern/sched_prim.h> |
36 | #include <kern/machine.h> |
37 | #include <kern/thread_group.h> |
38 | #include <kern/ipc_kobject.h> |
39 | #include <kern/task.h> |
40 | #include <kern/coalition.h> |
41 | #include <kern/policy_internal.h> |
42 | #include <kern/mpsc_queue.h> |
43 | #include <kern/workload_config.h> |
44 | #include <kern/assert.h> |
45 | |
46 | #include <mach/kern_return.h> |
47 | #include <mach/notify.h> |
48 | #include <os/refcnt.h> |
49 | |
50 | #include <stdatomic.h> |
51 | |
52 | /* |
53 | * With the introduction of auto-join work intervals, it is possible |
54 | * to change the work interval (and related thread group) of a thread in a |
55 | * variety of contexts (thread termination, context switch, thread mode |
56 | * change etc.). In order to clearly specify the policy expectation and |
57 | * the locking behavior, all calls to thread_set_work_interval() pass |
58 | * in a set of flags. |
59 | */ |
60 | |
61 | __options_decl(thread_work_interval_options_t, uint32_t, { |
62 | /* Change the work interval using the explicit join rules */ |
63 | THREAD_WI_EXPLICIT_JOIN_POLICY = 0x1, |
64 | /* Change the work interval using the auto-join rules */ |
65 | THREAD_WI_AUTO_JOIN_POLICY = 0x2, |
66 | /* Caller already holds the thread lock */ |
67 | THREAD_WI_THREAD_LOCK_HELD = 0x4, |
68 | /* Caller does not hold the thread lock */ |
69 | THREAD_WI_THREAD_LOCK_NEEDED = 0x8, |
70 | /* Change the work interval from the context switch path (thread may not be running or on a runq) */ |
71 | THREAD_WI_THREAD_CTX_SWITCH = 0x10, |
72 | }); |
73 | |
74 | static kern_return_t thread_set_work_interval(thread_t, struct work_interval *, thread_work_interval_options_t); |
75 | static void work_interval_port_no_senders(ipc_port_t, mach_port_mscount_t); |
76 | |
77 | IPC_KOBJECT_DEFINE(IKOT_WORK_INTERVAL, |
78 | .iko_op_stable = true, |
79 | .iko_op_no_senders = work_interval_port_no_senders); |
80 | |
81 | #if CONFIG_SCHED_AUTO_JOIN |
/* MPSC queue used to defer deallocation of work intervals */
83 | static struct mpsc_daemon_queue work_interval_deallocate_queue; |
84 | |
85 | static void work_interval_deferred_release(struct work_interval *); |
86 | |
87 | /* |
88 | * Work Interval Auto-Join Status |
89 | * |
90 | * work_interval_auto_join_status_t represents the state of auto-join for a given work interval. |
91 | * It packs the following information: |
 * - A bit indicating whether a "finish" is deferred on the work interval
 * - A count of the threads currently auto-joined to the work interval
94 | */ |
95 | #define WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK ((uint32_t)(1 << 31)) |
96 | #define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK ((uint32_t)(WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK - 1)) |
97 | #define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK |
98 | typedef uint32_t work_interval_auto_join_status_t; |
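
/*
 * For example, a status value of
 * (WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK | 2) describes a work interval
 * with two auto-joined threads and a pending deferred finish, while a value
 * of 0 means no auto-joined threads and no deferred finish.
 */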
99 | |
100 | static inline bool __unused |
101 | work_interval_status_deferred_finish(work_interval_auto_join_status_t status) |
102 | { |
103 | return (status & WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) ? true : false; |
104 | } |
105 | |
106 | static inline uint32_t __unused |
107 | work_interval_status_auto_join_count(work_interval_auto_join_status_t status) |
108 | { |
109 | return (uint32_t)(status & WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK); |
110 | } |
111 | |
112 | /* |
113 | * struct work_interval_deferred_finish_state |
114 | * |
115 | * Contains the parameters of the finish operation which is being deferred. |
116 | */ |
117 | struct work_interval_deferred_finish_state { |
118 | uint64_t instance_id; |
119 | uint64_t start; |
120 | uint64_t deadline; |
121 | uint64_t complexity; |
122 | }; |
123 | |
124 | struct work_interval_auto_join_info { |
125 | struct work_interval_deferred_finish_state deferred_finish_state; |
126 | work_interval_auto_join_status_t _Atomic status; |
127 | }; |
128 | #endif /* CONFIG_SCHED_AUTO_JOIN */ |
129 | |
130 | #if CONFIG_THREAD_GROUPS |
/* Flags atomically set in wi_group_flags */
132 | #define WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED 0x1 |
133 | #endif |
134 | |
135 | /* |
136 | * Work Interval struct |
137 | * |
138 | * This struct represents a thread group and/or work interval context |
139 | * in a mechanism that is represented with a kobject. |
140 | * |
141 | * Every thread that has joined a WI has a +1 ref, and the port |
142 | * has a +1 ref as well. |
143 | * |
 * TODO: groups need to have an 'is for WI' flag
 * and they need a create-time flag that says 'for WI'
146 | * This would allow CLPC to avoid allocating WI support |
147 | * data unless it is needed |
148 | * |
149 | * TODO: Enforce not having more than one non-group joinable work |
150 | * interval per thread group. |
151 | * CLPC only wants to see one WI-notify callout per group. |
152 | */ |
153 | struct work_interval { |
154 | uint64_t wi_id; |
155 | struct os_refcnt wi_ref_count; |
156 | uint32_t wi_create_flags; |
157 | |
158 | /* for debugging purposes only, does not hold a ref on port */ |
159 | ipc_port_t wi_port; |
160 | |
161 | /* |
162 | * holds uniqueid and version of creating process, |
163 | * used to permission-gate notify |
164 | * TODO: you'd think there would be a better way to do this |
165 | */ |
166 | uint64_t wi_creator_uniqueid; |
167 | uint32_t wi_creator_pid; |
168 | int wi_creator_pidversion; |
169 | |
170 | /* flags set by work_interval_set_workload_id and reflected onto |
171 | * thread->th_work_interval_flags upon join */ |
172 | uint32_t wi_wlid_flags; |
173 | |
174 | #if CONFIG_THREAD_GROUPS |
175 | uint32_t wi_group_flags; |
176 | struct thread_group *wi_group; /* holds +1 ref on group */ |
177 | #endif /* CONFIG_THREAD_GROUPS */ |
178 | |
179 | #if CONFIG_SCHED_AUTO_JOIN |
180 | /* Information related to auto-join and deferred finish for work interval */ |
181 | struct work_interval_auto_join_info wi_auto_join_info; |
182 | |
183 | /* |
184 | * Since the deallocation of auto-join work intervals |
185 | * can happen in the scheduler when the last thread in |
186 | * the WI blocks and the thread lock is held, the deallocation |
187 | * might have to be done on a separate thread. |
188 | */ |
189 | struct mpsc_queue_chain wi_deallocate_link; |
190 | #endif /* CONFIG_SCHED_AUTO_JOIN */ |
191 | |
192 | /* |
193 | * Work interval class info - determines thread priority for threads |
194 | * with a work interval driven policy. |
195 | */ |
196 | wi_class_t wi_class; |
197 | uint8_t wi_class_offset; |
198 | |
199 | struct recount_work_interval wi_recount; |
200 | }; |
201 | |
202 | /* |
203 | * work_interval_telemetry_data_enabled() |
204 | * |
 * Helper routine to check whether the work interval has telemetry data collection enabled.
206 | */ |
207 | static inline bool |
208 | work_interval_telemetry_data_enabled(struct work_interval *work_interval) |
209 | { |
210 | return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_TELEMETRY_DATA) != 0; |
211 | } |
212 | |
213 | |
214 | /* |
215 | * work_interval_get_recount_tracks() |
216 | * |
217 | * Returns the recount tracks associated with a work interval, or NULL |
218 | * if the work interval is NULL or has telemetry disabled. |
219 | */ |
220 | inline struct recount_track * |
221 | work_interval_get_recount_tracks(struct work_interval *work_interval) |
222 | { |
223 | if (work_interval != NULL && work_interval_telemetry_data_enabled(work_interval)) { |
224 | return work_interval->wi_recount.rwi_current_instance; |
225 | } |
226 | return NULL; |
227 | } |
228 | |
229 | #if CONFIG_SCHED_AUTO_JOIN |
230 | |
231 | /* |
232 | * work_interval_perform_deferred_finish() |
233 | * |
234 | * Perform a deferred finish for a work interval. The routine accepts the deferred_finish_state as an |
235 | * argument rather than looking at the work_interval since the deferred finish can race with another |
236 | * start-finish cycle. To address that, the caller ensures that it gets a consistent snapshot of the |
237 | * deferred state before calling this routine. This allows the racing start-finish cycle to overwrite |
238 | * the deferred state without issues. |
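 *
 * For example, if a new start overwrites join_info->deferred_finish_state
 * while the last auto-joined thread is draining, that thread still operates
 * on the snapshot it copied under the atomic update in
 * work_interval_auto_join_decrement(), not on the overwritten state.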
239 | */ |
240 | static inline void |
241 | work_interval_perform_deferred_finish(__unused struct work_interval_deferred_finish_state *deferred_finish_state, |
242 | __unused struct work_interval *work_interval, __unused thread_t thread) |
243 | { |
244 | |
245 | KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_DEFERRED_FINISH), |
246 | thread_tid(thread), thread_group_get_id(work_interval->wi_group)); |
247 | } |
248 | |
249 | /* |
250 | * work_interval_auto_join_increment() |
251 | * |
 * Routine to increment the auto-join counter when a new thread is auto-joined to
253 | * the work interval. |
254 | */ |
255 | static void |
256 | work_interval_auto_join_increment(struct work_interval *work_interval) |
257 | { |
258 | struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info; |
259 | __assert_only work_interval_auto_join_status_t old_status = os_atomic_add_orig(&join_info->status, 1, relaxed); |
260 | assert(work_interval_status_auto_join_count(old_status) < WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX); |
261 | } |
262 | |
263 | /* |
264 | * work_interval_auto_join_decrement() |
265 | * |
266 | * Routine to decrement the auto-join counter when a thread unjoins the work interval (due to |
267 | * blocking or termination). If this was the last auto-joined thread in the work interval and |
268 | * there was a deferred finish, performs the finish operation for the work interval. |
269 | */ |
270 | static void |
271 | work_interval_auto_join_decrement(struct work_interval *work_interval, thread_t thread) |
272 | { |
273 | struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info; |
274 | work_interval_auto_join_status_t old_status, new_status; |
275 | struct work_interval_deferred_finish_state deferred_finish_state; |
276 | bool perform_finish; |
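
	/*
	 * For example, a pre-decrement status of
	 * (WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK | 1) drains to exactly
	 * WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK, meaning the last auto-joined
	 * thread is leaving while a finish is pending; the loop below then clears
	 * the status to 0 and the deferred finish is performed outside the atomic
	 * update.
	 */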
277 | |
278 | /* Update the auto-join count for the work interval atomically */ |
279 | os_atomic_rmw_loop(&join_info->status, old_status, new_status, acquire, { |
280 | perform_finish = false; |
281 | new_status = old_status; |
282 | assert(work_interval_status_auto_join_count(old_status) > 0); |
283 | new_status -= 1; |
284 | if (new_status == WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) { |
285 | /* No auto-joined threads remaining and finish is deferred */ |
286 | new_status = 0; |
287 | perform_finish = true; |
288 | /* |
			 * It's important to copy the deferred finish state here so that this works
290 | * when racing with another start-finish cycle. |
291 | */ |
292 | deferred_finish_state = join_info->deferred_finish_state; |
293 | } |
294 | }); |
295 | |
296 | if (perform_finish == true) { |
297 | /* |
298 | * Since work_interval_perform_deferred_finish() calls down to |
299 | * the machine layer callout for finish which gets the thread |
300 | * group from the thread passed in here, it is important to |
301 | * make sure that the thread still has the work interval thread |
302 | * group here. |
303 | */ |
304 | assert(thread->thread_group == work_interval->wi_group); |
		work_interval_perform_deferred_finish(&deferred_finish_state, work_interval, thread);
306 | } |
307 | } |
308 | |
309 | /* |
310 | * work_interval_auto_join_enabled() |
311 | * |
312 | * Helper routine to check if work interval has auto-join enabled. |
313 | */ |
314 | static inline bool |
315 | work_interval_auto_join_enabled(struct work_interval *work_interval) |
316 | { |
317 | return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) != 0; |
318 | } |
319 | |
320 | /* |
321 | * work_interval_deferred_finish_enabled() |
322 | * |
323 | * Helper routine to check if work interval has deferred finish enabled. |
324 | */ |
325 | static inline bool __unused |
326 | work_interval_deferred_finish_enabled(struct work_interval *work_interval) |
327 | { |
328 | return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) != 0; |
329 | } |
330 | |
331 | #endif /* CONFIG_SCHED_AUTO_JOIN */ |
332 | |
333 | static inline void |
334 | work_interval_retain(struct work_interval *work_interval) |
335 | { |
336 | /* |
337 | * Even though wi_retain is called under a port lock, we have |
338 | * to use os_ref_retain instead of os_ref_retain_locked |
339 | * because wi_release is not synchronized. wi_release calls |
340 | * os_ref_release which is unsafe to pair with os_ref_retain_locked. |
341 | */ |
	os_ref_retain(&work_interval->wi_ref_count);
343 | } |
344 | |
345 | static inline void |
346 | work_interval_deallocate(struct work_interval *work_interval) |
347 | { |
348 | KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_DESTROY), |
349 | work_interval->wi_id); |
350 | if (work_interval_telemetry_data_enabled(work_interval)) { |
		recount_work_interval_deinit(&work_interval->wi_recount);
352 | } |
353 | kfree_type(struct work_interval, work_interval); |
354 | } |
355 | |
356 | /* |
357 | * work_interval_release() |
358 | * |
359 | * Routine to release a ref count on the work interval. If the refcount goes down |
360 | * to zero, the work interval needs to be de-allocated. |
361 | * |
 * Non auto-join work intervals are de-allocated in this context.
363 | * |
364 | * For auto-join work intervals, the de-allocation cannot be done from this context |
365 | * since that might need the kernel memory allocator lock. In that case, the |
366 | * deallocation is done via a thread-call based mpsc queue. |
367 | */ |
368 | static void |
369 | work_interval_release(struct work_interval *work_interval, __unused thread_work_interval_options_t options) |
370 | { |
	if (os_ref_release(&work_interval->wi_ref_count) == 0) {
372 | #if CONFIG_SCHED_AUTO_JOIN |
373 | if (options & THREAD_WI_THREAD_LOCK_HELD) { |
374 | work_interval_deferred_release(work_interval); |
375 | } else { |
376 | work_interval_deallocate(work_interval); |
377 | } |
378 | #else /* CONFIG_SCHED_AUTO_JOIN */ |
379 | work_interval_deallocate(work_interval); |
380 | #endif /* CONFIG_SCHED_AUTO_JOIN */ |
381 | } |
382 | } |
383 | |
384 | #if CONFIG_SCHED_AUTO_JOIN |
385 | |
386 | /* |
387 | * work_interval_deferred_release() |
388 | * |
389 | * Routine to enqueue the work interval on the deallocation mpsc queue. |
390 | */ |
391 | static void |
392 | work_interval_deferred_release(struct work_interval *work_interval) |
393 | { |
	mpsc_daemon_enqueue(&work_interval_deallocate_queue,
	    &work_interval->wi_deallocate_link, MPSC_QUEUE_NONE);
396 | } |
397 | |
398 | /* |
399 | * work_interval_should_propagate() |
400 | * |
401 | * Main policy routine to decide if a thread should be auto-joined to |
402 | * another thread's work interval. The conditions are arranged such that |
 * the most common bailout conditions are checked earliest. This routine
 * is called from scheduler context, so it needs to be efficient and must
 * be careful when taking locks or performing wakeups.
406 | */ |
407 | inline bool |
408 | work_interval_should_propagate(thread_t cthread, thread_t thread) |
409 | { |
410 | /* Only allow propagation if the current thread has a work interval and the woken up thread does not */ |
411 | if ((cthread->th_work_interval == NULL) || (thread->th_work_interval != NULL)) { |
412 | return false; |
413 | } |
414 | |
415 | /* Only propagate work intervals which have auto-join enabled */ |
	if (work_interval_auto_join_enabled(cthread->th_work_interval) == false) {
417 | return false; |
418 | } |
419 | |
420 | /* Work interval propagation is enabled for realtime threads only */ |
421 | if ((cthread->sched_mode != TH_MODE_REALTIME) || (thread->sched_mode != TH_MODE_REALTIME)) { |
422 | return false; |
423 | } |
424 | |
425 | |
426 | /* Work interval propagation only works for threads with the same home thread group */ |
	struct thread_group *thread_home_tg = thread_group_get_home_group(thread);
	if (thread_group_get_home_group(cthread) != thread_home_tg) {
429 | return false; |
430 | } |
431 | |
	/* If the woken-up thread has adopted vouchers and other thread groups, it does not get propagation */
433 | if (thread->thread_group != thread_home_tg) { |
434 | return false; |
435 | } |
436 | |
437 | /* If either thread is inactive (in the termination path), do not propagate auto-join */ |
438 | if ((!cthread->active) || (!thread->active)) { |
439 | return false; |
440 | } |
441 | |
442 | return true; |
443 | } |
444 | |
445 | /* |
446 | * work_interval_auto_join_propagate() |
447 | * |
448 | * Routine to auto-join a thread into another thread's work interval |
449 | * |
450 | * Should only be invoked if work_interval_should_propagate() returns |
451 | * true. Also expects "from" thread to be current thread and "to" thread |
452 | * to be locked. |
453 | */ |
454 | void |
455 | work_interval_auto_join_propagate(thread_t from, thread_t to) |
456 | { |
457 | assert(from == current_thread()); |
	work_interval_retain(from->th_work_interval);
	work_interval_auto_join_increment(from->th_work_interval);
460 | __assert_only kern_return_t kr = thread_set_work_interval(to, from->th_work_interval, |
461 | THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH); |
462 | assert(kr == KERN_SUCCESS); |
463 | } |
464 | |
465 | /* |
466 | * work_interval_auto_join_unwind() |
467 | * |
468 | * Routine to un-join an auto-joined work interval for a thread that is blocking. |
469 | * |
470 | * Expects thread to be locked. |
471 | */ |
472 | void |
473 | work_interval_auto_join_unwind(thread_t thread) |
474 | { |
475 | __assert_only kern_return_t kr = thread_set_work_interval(thread, NULL, |
476 | THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH); |
477 | assert(kr == KERN_SUCCESS); |
478 | } |
479 | |
480 | /* |
481 | * work_interval_auto_join_demote() |
482 | * |
483 | * Routine to un-join an auto-joined work interval when a thread is changing from |
484 | * realtime to non-realtime scheduling mode. This could happen due to multiple |
485 | * reasons such as RT failsafe, thread backgrounding or thread termination. Also, |
486 | * the thread being demoted may not be the current thread. |
487 | * |
488 | * Expects thread to be locked. |
489 | */ |
490 | void |
491 | work_interval_auto_join_demote(thread_t thread) |
492 | { |
493 | __assert_only kern_return_t kr = thread_set_work_interval(thread, NULL, |
494 | THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD); |
495 | assert(kr == KERN_SUCCESS); |
496 | } |
497 | |
498 | static void |
499 | work_interval_deallocate_queue_invoke(mpsc_queue_chain_t e, |
500 | __assert_only mpsc_daemon_queue_t dq) |
501 | { |
502 | struct work_interval *work_interval = NULL; |
503 | work_interval = mpsc_queue_element(e, struct work_interval, wi_deallocate_link); |
504 | assert(dq == &work_interval_deallocate_queue); |
505 | assert(os_ref_get_count(&work_interval->wi_ref_count) == 0); |
506 | work_interval_deallocate(work_interval); |
507 | } |
508 | |
509 | #endif /* CONFIG_SCHED_AUTO_JOIN */ |
510 | |
511 | #if CONFIG_SCHED_AUTO_JOIN |
512 | __startup_func |
513 | static void |
514 | work_interval_subsystem_init(void) |
515 | { |
516 | /* |
517 | * The work interval deallocation queue must be a thread call based queue |
518 | * because it is woken up from contexts where the thread lock is held. The |
519 | * only way to perform wakeups safely in those contexts is to wakeup a |
520 | * thread call which is guaranteed to be on a different waitq and would |
521 | * not hash onto the same global waitq which might be currently locked. |
522 | */ |
	mpsc_daemon_queue_init_with_thread_call(&work_interval_deallocate_queue,
	    work_interval_deallocate_queue_invoke, THREAD_CALL_PRIORITY_KERNEL,
	    MPSC_DAEMON_INIT_NONE);
526 | } |
527 | STARTUP(THREAD_CALL, STARTUP_RANK_MIDDLE, work_interval_subsystem_init); |
528 | #endif /* CONFIG_SCHED_AUTO_JOIN */ |
529 | |
530 | /* |
531 | * work_interval_port_convert |
532 | * |
533 | * Called with port locked, returns reference to work interval |
534 | * if indeed the port is a work interval kobject port |
535 | */ |
536 | static struct work_interval * |
537 | work_interval_port_convert_locked(ipc_port_t port) |
538 | { |
539 | struct work_interval *work_interval = NULL; |
540 | |
541 | if (IP_VALID(port)) { |
		work_interval = ipc_kobject_get_stable(port, IKOT_WORK_INTERVAL);
543 | if (work_interval) { |
544 | work_interval_retain(work_interval); |
545 | } |
546 | } |
547 | |
548 | return work_interval; |
549 | } |
550 | |
551 | /* |
552 | * port_name_to_work_interval |
553 | * |
554 | * Description: Obtain a reference to the work_interval associated with a given port. |
555 | * |
556 | * Parameters: name A Mach port name to translate. |
557 | * |
 * Returns: KERN_SUCCESS and a +1 reference in *work_interval if the port
 * denotes a work_interval; otherwise an error, in which case
 * *work_interval is left untouched.
560 | */ |
561 | static kern_return_t |
562 | port_name_to_work_interval(mach_port_name_t name, |
563 | struct work_interval **work_interval) |
564 | { |
565 | if (!MACH_PORT_VALID(name)) { |
566 | return KERN_INVALID_NAME; |
567 | } |
568 | |
569 | ipc_port_t port = IP_NULL; |
570 | kern_return_t kr = KERN_SUCCESS; |
571 | |
	kr = ipc_port_translate_send(current_space(), name, &port);
573 | if (kr != KERN_SUCCESS) { |
574 | return kr; |
575 | } |
576 | /* port is locked */ |
577 | |
578 | assert(IP_VALID(port)); |
579 | |
580 | struct work_interval *converted_work_interval; |
581 | |
582 | converted_work_interval = work_interval_port_convert_locked(port); |
583 | |
584 | /* the port is valid, but doesn't denote a work_interval */ |
585 | if (converted_work_interval == NULL) { |
586 | kr = KERN_INVALID_CAPABILITY; |
587 | } |
588 | |
589 | ip_mq_unlock(port); |
590 | |
591 | if (kr == KERN_SUCCESS) { |
592 | *work_interval = converted_work_interval; |
593 | } |
594 | |
595 | return kr; |
596 | } |
597 | |
598 | |
599 | /* |
600 | * work_interval_port_no_senders |
601 | * |
602 | * Description: Handle a no-senders notification for a work interval port. |
603 | * Destroys the port and releases its reference on the work interval. |
604 | * |
 * Parameters: port     The work interval kobject port.
 *             mscount  The make-send count from the no-senders notification.
 *
 * Note: This assumes that there is only one create-right-from-work-interval point;
 * if the ability to extract another send right after creation is added,
 * this will have to change to handle make-send counts correctly.
610 | */ |
611 | static void |
612 | work_interval_port_no_senders(ipc_port_t port, mach_port_mscount_t mscount) |
613 | { |
614 | struct work_interval *work_interval = NULL; |
615 | |
616 | work_interval = ipc_kobject_dealloc_port(port, mscount, |
	    IKOT_WORK_INTERVAL);
618 | |
619 | work_interval->wi_port = MACH_PORT_NULL; |
620 | |
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
622 | } |
623 | |
624 | /* |
625 | * work_interval_port_type() |
626 | * |
627 | * Converts a port name into the work interval object and returns its type. |
628 | * |
629 | * For invalid ports, it returns WORK_INTERVAL_TYPE_LAST (which is not a |
630 | * valid type for work intervals). |
631 | */ |
632 | static uint32_t |
633 | work_interval_port_type(mach_port_name_t port_name) |
634 | { |
635 | struct work_interval *work_interval = NULL; |
636 | kern_return_t kr; |
637 | uint32_t work_interval_type; |
638 | |
639 | if (port_name == MACH_PORT_NULL) { |
640 | return WORK_INTERVAL_TYPE_LAST; |
641 | } |
642 | |
	kr = port_name_to_work_interval(port_name, &work_interval);
644 | if (kr != KERN_SUCCESS) { |
645 | return WORK_INTERVAL_TYPE_LAST; |
646 | } |
647 | /* work_interval has a +1 ref */ |
648 | |
649 | assert(work_interval != NULL); |
650 | work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK; |
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
652 | return work_interval_type; |
653 | } |
654 | |
655 | /* |
656 | * Sparse - not all work interval classes imply a scheduling policy change. |
657 | * The REALTIME_CRITICAL class *also* requires the thread to have explicitly |
658 | * adopted the REALTIME sched mode to take effect. |
659 | */ |
660 | static const struct { |
661 | int priority; |
662 | sched_mode_t sched_mode; |
663 | } work_interval_class_data[WI_CLASS_COUNT] = { |
664 | [WI_CLASS_BEST_EFFORT] = { |
665 | BASEPRI_DEFAULT, // 31 |
666 | TH_MODE_TIMESHARE, |
667 | }, |
668 | |
669 | [WI_CLASS_APP_SUPPORT] = { |
670 | BASEPRI_DEFAULT, // 31 |
671 | TH_MODE_TIMESHARE, |
672 | }, |
673 | |
674 | [WI_CLASS_SYSTEM] = { |
675 | BASEPRI_FOREGROUND + 1, // 48 |
676 | TH_MODE_FIXED, |
677 | }, |
678 | |
679 | [WI_CLASS_SYSTEM_CRITICAL] = { |
680 | MAXPRI_USER + 1, // 64 |
681 | TH_MODE_FIXED, |
682 | }, |
683 | |
684 | [WI_CLASS_REALTIME_CRITICAL] = { |
685 | BASEPRI_RTQUEUES + 1, // 98 |
686 | TH_MODE_REALTIME, |
687 | }, |
688 | }; |
689 | |
690 | /* |
691 | * Called when a thread gets its scheduling priority from its associated work |
692 | * interval. |
693 | */ |
694 | int |
695 | work_interval_get_priority(thread_t thread) |
696 | { |
697 | const struct work_interval *work_interval = thread->th_work_interval; |
698 | assert(work_interval != NULL); |
699 | |
700 | assert3u(work_interval->wi_class, !=, WI_CLASS_NONE); |
701 | assert3u(work_interval->wi_class, <, WI_CLASS_COUNT); |
702 | int priority = work_interval_class_data[work_interval->wi_class].priority; |
703 | assert(priority != 0); |
704 | |
705 | priority += work_interval->wi_class_offset; |
706 | assert3u(priority, <=, MAXPRI); |
707 | |
708 | return priority; |
709 | } |
710 | |
711 | #if CONFIG_THREAD_GROUPS |
712 | extern kern_return_t |
713 | kern_work_interval_get_policy_from_port(mach_port_name_t port_name, |
714 | integer_t *policy, |
715 | integer_t *priority, |
716 | struct thread_group **tg) |
717 | { |
718 | assert((priority != NULL) && (policy != NULL) && (tg != NULL)); |
719 | |
720 | kern_return_t kr; |
721 | struct work_interval *work_interval; |
722 | |
	kr = port_name_to_work_interval(port_name, &work_interval);
724 | if (kr != KERN_SUCCESS) { |
725 | return kr; |
726 | } |
727 | |
728 | /* work_interval has a +1 ref */ |
729 | assert(work_interval != NULL); |
730 | assert3u(work_interval->wi_class, <, WI_CLASS_COUNT); |
731 | |
732 | const sched_mode_t mode = work_interval_class_data[work_interval->wi_class].sched_mode; |
733 | |
734 | if ((mode == TH_MODE_TIMESHARE) || (mode == TH_MODE_FIXED)) { |
735 | *policy = ((mode == TH_MODE_TIMESHARE)? POLICY_TIMESHARE: POLICY_RR); |
736 | *priority = work_interval_class_data[work_interval->wi_class].priority; |
737 | assert(*priority != 0); |
738 | *priority += work_interval->wi_class_offset; |
739 | assert3u(*priority, <=, MAXPRI); |
740 | } /* No sched mode change for REALTIME (threads must explicitly opt-in) */ |
741 | |
742 | if (work_interval->wi_group) { |
		*tg = thread_group_retain(work_interval->wi_group);
744 | } |
745 | |
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
747 | return KERN_SUCCESS; |
748 | } |
749 | #endif /* CONFIG_THREAD_GROUPS */ |
750 | |
751 | /* |
752 | * Switch to a policy driven by the work interval (if applicable). |
753 | */ |
754 | static void |
755 | work_interval_set_policy(thread_t thread) |
756 | { |
757 | assert3p(thread, ==, current_thread()); |
758 | |
759 | /* |
760 | * Ignore policy changes if the workload context shouldn't affect the |
761 | * scheduling policy. |
762 | */ |
763 | workload_config_flags_t flags = WLC_F_NONE; |
764 | |
765 | /* There may be no config at all. That's ok. */ |
	if (workload_config_get_flags(&flags) != KERN_SUCCESS ||
767 | (flags & WLC_F_THREAD_POLICY) == 0) { |
768 | return; |
769 | } |
770 | |
771 | const struct work_interval *work_interval = thread->th_work_interval; |
772 | assert(work_interval != NULL); |
773 | |
774 | assert3u(work_interval->wi_class, <, WI_CLASS_COUNT); |
775 | const sched_mode_t mode = work_interval_class_data[work_interval->wi_class].sched_mode; |
776 | |
777 | /* |
778 | * A mode of TH_MODE_NONE implies that this work interval has no |
779 | * associated scheduler effects. |
780 | */ |
781 | if (mode == TH_MODE_NONE) { |
782 | return; |
783 | } |
784 | |
785 | proc_set_thread_policy_ext(thread, TASK_POLICY_ATTRIBUTE, |
	    TASK_POLICY_WI_DRIVEN, true, mode);
787 | assert(thread->requested_policy.thrp_wi_driven); |
788 | |
789 | return; |
790 | } |
791 | |
792 | /* |
793 | * Clear a work interval driven policy. |
794 | */ |
795 | static void |
796 | work_interval_clear_policy(thread_t thread) |
797 | { |
798 | assert3p(thread, ==, current_thread()); |
799 | |
800 | if (!thread->requested_policy.thrp_wi_driven) { |
801 | return; |
802 | } |
803 | |
804 | const sched_mode_t mode = sched_get_thread_mode_user(thread); |
805 | |
806 | proc_set_thread_policy_ext(thread, TASK_POLICY_ATTRIBUTE, |
807 | TASK_POLICY_WI_DRIVEN, false, |
	    mode == TH_MODE_REALTIME ? mode : TH_MODE_TIMESHARE);
809 | |
810 | assert(!thread->requested_policy.thrp_wi_driven); |
811 | |
812 | return; |
813 | } |
814 | |
815 | /* |
816 | * thread_set_work_interval() |
817 | * |
818 | * Change thread's bound work interval to the passed-in work interval |
819 | * Consumes +1 ref on work_interval upon success. |
820 | * |
821 | * May also pass NULL to un-set work_interval on the thread |
822 | * Will deallocate any old work interval on the thread |
823 | * Return error if thread does not satisfy requirements to join work interval |
824 | * |
825 | * For non auto-join work intervals, deallocate any old work interval on the thread |
826 | * For auto-join work intervals, the routine may wakeup the work interval deferred |
827 | * deallocation queue since thread locks might be currently held. |
828 | */ |
829 | static kern_return_t |
830 | thread_set_work_interval(thread_t thread, |
831 | struct work_interval *work_interval, thread_work_interval_options_t options) |
832 | { |
833 | /* All explicit work interval operations should always be from the current thread */ |
834 | if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) { |
835 | assert(thread == current_thread()); |
836 | } |
837 | |
838 | /* All cases of needing the thread lock should be from explicit join scenarios */ |
839 | if (options & THREAD_WI_THREAD_LOCK_NEEDED) { |
840 | assert((options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0); |
841 | } |
842 | |
843 | /* For all cases of auto join must come in with the thread lock held */ |
844 | if (options & THREAD_WI_AUTO_JOIN_POLICY) { |
845 | assert((options & THREAD_WI_THREAD_LOCK_HELD) != 0); |
846 | } |
847 | |
848 | #if CONFIG_THREAD_GROUPS |
849 | if (work_interval && !work_interval->wi_group) { |
850 | /* Reject join on work intervals with deferred thread group creation */ |
851 | return KERN_INVALID_ARGUMENT; |
852 | } |
853 | #endif /* CONFIG_THREAD_GROUPS */ |
854 | |
855 | if (work_interval) { |
856 | uint32_t work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK; |
857 | |
858 | if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) { |
859 | /* Ensure no kern_work_interval_set_workload_id can happen after this point */ |
860 | uint32_t wlid_flags; |
861 | (void)os_atomic_cmpxchgv(&work_interval->wi_wlid_flags, 0, |
862 | WORK_INTERVAL_WORKLOAD_ID_ALREADY_JOINED, &wlid_flags, relaxed); |
863 | if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED) { |
864 | /* For workload IDs with rt-allowed, neuter the check below to |
865 | * enable joining before the thread has become realtime for all |
866 | * work interval types */ |
867 | work_interval_type = WORK_INTERVAL_TYPE_DEFAULT; |
868 | } |
869 | } |
870 | |
871 | if ((work_interval_type == WORK_INTERVAL_TYPE_COREAUDIO) && |
872 | (thread->sched_mode != TH_MODE_REALTIME) && (thread->saved_mode != TH_MODE_REALTIME)) { |
873 | return KERN_INVALID_ARGUMENT; |
874 | } |
875 | } |
876 | |
877 | /* |
878 | * Ensure a work interval scheduling policy is not used if the thread is |
879 | * leaving the work interval. |
880 | */ |
881 | if (work_interval == NULL && |
882 | (options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0) { |
883 | work_interval_clear_policy(thread); |
884 | } |
885 | |
886 | struct work_interval *old_th_wi = thread->th_work_interval; |
887 | #if CONFIG_SCHED_AUTO_JOIN |
888 | spl_t s; |
889 | /* Take the thread lock if needed */ |
890 | if (options & THREAD_WI_THREAD_LOCK_NEEDED) { |
891 | s = splsched(); |
892 | thread_lock(thread); |
893 | } |
894 | |
895 | /* |
896 | * Work interval auto-join leak to non-RT threads. |
897 | * |
	 * If the thread might be running on a remote core and it's not in the context switch path (where
	 * the thread is neither running, blocked, nor in the runq), it's not possible to update the
	 * work interval & thread group remotely, since it's not possible to update CLPC for a remote
901 | * core. This situation might happen when a thread is transitioning from realtime to |
902 | * non-realtime due to backgrounding etc., which would mean that non-RT threads would now |
903 | * be part of the work interval. |
904 | * |
905 | * Since there is no immediate mitigation to this issue, the policy is to set a new |
906 | * flag on the thread which indicates that such a "leak" has happened. This flag will |
907 | * be cleared when the remote thread eventually blocks and unjoins from the work interval. |
908 | */ |
909 | bool thread_on_remote_core = ((thread != current_thread()) && (thread->state & TH_RUN) && (thread_get_runq(thread) == PROCESSOR_NULL)); |
910 | |
911 | if (thread_on_remote_core && ((options & THREAD_WI_THREAD_CTX_SWITCH) == 0)) { |
912 | assert((options & THREAD_WI_THREAD_LOCK_NEEDED) == 0); |
913 | os_atomic_or(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed); |
914 | return KERN_SUCCESS; |
915 | } |
916 | |
917 | const bool old_wi_auto_joined = ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0); |
918 | |
919 | if ((options & THREAD_WI_AUTO_JOIN_POLICY) || old_wi_auto_joined) { |
		__kdebug_only uint64_t old_tg_id = (old_th_wi && old_th_wi->wi_group) ? thread_group_get_id(old_th_wi->wi_group) : ~0;
		__kdebug_only uint64_t new_tg_id = (work_interval && work_interval->wi_group) ? thread_group_get_id(work_interval->wi_group) : ~0;
922 | KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_AUTO_JOIN), |
923 | thread_tid(thread), old_tg_id, new_tg_id, options); |
924 | } |
925 | |
926 | if (old_wi_auto_joined) { |
927 | /* |
		 * If the thread was auto-joined to a work interval and is not realtime, make sure it
929 | * happened due to the "leak" described above. |
930 | */ |
931 | if (thread->sched_mode != TH_MODE_REALTIME) { |
932 | assert((thread->th_work_interval_flags & TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK) != 0); |
933 | } |
934 | |
935 | os_atomic_andnot(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed); |
		work_interval_auto_join_decrement(old_th_wi, thread);
937 | thread->sched_flags &= ~TH_SFLAG_THREAD_GROUP_AUTO_JOIN; |
938 | } |
939 | |
940 | #endif /* CONFIG_SCHED_AUTO_JOIN */ |
941 | |
942 | KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CHANGE), |
943 | thread_tid(thread), (old_th_wi ? old_th_wi->wi_id : 0), (work_interval ? work_interval->wi_id : 0), !!(options & THREAD_WI_AUTO_JOIN_POLICY)); |
944 | |
945 | /* transfer +1 ref to thread */ |
946 | thread->th_work_interval = work_interval; |
947 | |
948 | #if CONFIG_SCHED_AUTO_JOIN |
949 | |
950 | if ((options & THREAD_WI_AUTO_JOIN_POLICY) && work_interval) { |
951 | assert(work_interval_auto_join_enabled(work_interval) == true); |
952 | thread->sched_flags |= TH_SFLAG_THREAD_GROUP_AUTO_JOIN; |
953 | } |
954 | |
955 | if (options & THREAD_WI_THREAD_LOCK_NEEDED) { |
956 | thread_unlock(thread); |
957 | splx(s); |
958 | } |
959 | #endif /* CONFIG_SCHED_AUTO_JOIN */ |
960 | |
961 | /* |
962 | * The thread got a new work interval. It may come with a work interval |
963 | * scheduling policy that needs to be applied. |
964 | */ |
965 | if (work_interval != NULL && |
966 | (options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0) { |
967 | work_interval_set_policy(thread); |
968 | } |
969 | |
970 | #if CONFIG_THREAD_GROUPS |
971 | if (work_interval) { |
		/* Prevent thread_group_set_name() from taking effect after CLPC may
		 * have already heard about the thread group */
974 | (void)os_atomic_cmpxchg(&work_interval->wi_group_flags, 0, |
975 | WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED, relaxed); |
976 | } |
977 | struct thread_group *new_tg = (work_interval) ? (work_interval->wi_group) : NULL; |
978 | |
979 | if (options & THREAD_WI_AUTO_JOIN_POLICY) { |
980 | #if CONFIG_SCHED_AUTO_JOIN |
		thread_set_autojoin_thread_group_locked(thread, new_tg);
982 | #endif |
983 | } else { |
		thread_set_work_interval_thread_group(thread, new_tg);
985 | } |
986 | #endif /* CONFIG_THREAD_GROUPS */ |
987 | |
988 | if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) { |
989 | /* Construct mask to XOR with th_work_interval_flags to clear the |
990 | * currently present flags and set the new flags in wlid_flags. */ |
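		/* For example, if the thread currently has only
		 * TH_WORK_INTERVAL_FLAGS_HAS_WORKLOAD_ID set and the new interval's
		 * wlid_flags carry both HAS_ID and RT_ALLOWED, the mask below reduces
		 * to TH_WORK_INTERVAL_FLAGS_RT_ALLOWED, so the XOR leaves
		 * HAS_WORKLOAD_ID set and additionally sets RT_ALLOWED. */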
991 | uint32_t wlid_flags = 0; |
992 | if (work_interval) { |
993 | wlid_flags = os_atomic_load(&work_interval->wi_wlid_flags, relaxed); |
994 | } |
995 | thread_work_interval_flags_t th_wi_xor_mask = os_atomic_load( |
996 | &thread->th_work_interval_flags, relaxed); |
997 | th_wi_xor_mask &= (TH_WORK_INTERVAL_FLAGS_HAS_WORKLOAD_ID | |
998 | TH_WORK_INTERVAL_FLAGS_RT_ALLOWED); |
999 | if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_HAS_ID) { |
1000 | th_wi_xor_mask ^= TH_WORK_INTERVAL_FLAGS_HAS_WORKLOAD_ID; |
1001 | if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED) { |
1002 | th_wi_xor_mask ^= TH_WORK_INTERVAL_FLAGS_RT_ALLOWED; |
1003 | } |
1004 | } |
1005 | if (th_wi_xor_mask) { |
1006 | os_atomic_xor(&thread->th_work_interval_flags, th_wi_xor_mask, relaxed); |
1007 | } |
1008 | |
1009 | /* |
1010 | * Now that the interval flags have been set, re-evaluate |
1011 | * whether the thread needs to be undemoted - the new work |
		 * interval may have the RT_ALLOWED flag, and the thread may
		 * have a realtime policy but be demoted.
1014 | */ |
1015 | thread_rt_evaluate(thread); |
1016 | } |
1017 | |
1018 | if (old_th_wi != NULL) { |
		work_interval_release(old_th_wi, options);
1020 | } |
1021 | |
1022 | return KERN_SUCCESS; |
1023 | } |
1024 | |
1025 | static kern_return_t |
1026 | thread_set_work_interval_explicit_join(thread_t thread, struct work_interval *work_interval) |
1027 | { |
1028 | assert(thread == current_thread()); |
	return thread_set_work_interval(thread, work_interval, THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
1030 | } |
1031 | |
1032 | kern_return_t |
1033 | work_interval_thread_terminate(thread_t thread) |
1034 | { |
1035 | assert(thread == current_thread()); |
1036 | if (thread->th_work_interval != NULL) { |
		return thread_set_work_interval(thread, NULL, THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
1038 | } |
1039 | return KERN_SUCCESS; |
1040 | } |
1041 | |
1042 | kern_return_t |
1043 | kern_work_interval_notify(thread_t thread, struct kern_work_interval_args* kwi_args) |
1044 | { |
1045 | assert(thread == current_thread()); |
1046 | assert(kwi_args->work_interval_id != 0); |
1047 | |
1048 | struct work_interval *work_interval = thread->th_work_interval; |
1049 | |
1050 | if (work_interval == NULL || |
1051 | work_interval->wi_id != kwi_args->work_interval_id) { |
1052 | /* This thread must have adopted the work interval to be able to notify */ |
1053 | return KERN_INVALID_ARGUMENT; |
1054 | } |
1055 | |
1056 | task_t notifying_task = current_task(); |
1057 | |
	if (work_interval->wi_creator_uniqueid != get_task_uniqueid(notifying_task) ||
	    work_interval->wi_creator_pidversion != get_task_version(notifying_task)) {
1060 | /* Only the creating task can do a notify */ |
1061 | return KERN_INVALID_ARGUMENT; |
1062 | } |
1063 | |
1064 | spl_t s = splsched(); |
1065 | |
1066 | #if CONFIG_THREAD_GROUPS |
1067 | assert(work_interval->wi_group == thread->thread_group); |
1068 | #endif /* CONFIG_THREAD_GROUPS */ |
1069 | |
1070 | uint64_t urgency_param1, urgency_param2; |
	kwi_args->urgency = (uint16_t)thread_get_urgency(thread, &urgency_param1, &urgency_param2);
1072 | |
1073 | splx(s); |
1074 | |
1075 | /* called without interrupts disabled */ |
1076 | machine_work_interval_notify(thread, kwi_args); |
1077 | |
1078 | return KERN_SUCCESS; |
1079 | } |
1080 | |
1081 | /* Start at 1, 0 is not a valid work interval ID */ |
1082 | static _Atomic uint64_t unique_work_interval_id = 1; |
1083 | |
1084 | kern_return_t |
1085 | kern_work_interval_create(thread_t thread, |
1086 | struct kern_work_interval_create_args *create_params) |
1087 | { |
1088 | assert(thread == current_thread()); |
1089 | |
1090 | uint32_t create_flags = create_params->wica_create_flags; |
1091 | |
1092 | if (((create_flags & WORK_INTERVAL_FLAG_JOINABLE) == 0) && |
1093 | thread->th_work_interval != NULL) { |
1094 | /* |
1095 | * If the thread is doing a legacy combined create and join, |
1096 | * it shouldn't already be part of a work interval. |
1097 | * |
1098 | * (Creating a joinable WI is allowed anytime.) |
1099 | */ |
1100 | return KERN_FAILURE; |
1101 | } |
1102 | |
1103 | /* |
1104 | * Check the validity of the create flags before allocating the work |
1105 | * interval. |
1106 | */ |
1107 | task_t creating_task = current_task(); |
1108 | if ((create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_CA_CLIENT) { |
1109 | /* |
1110 | * CA_CLIENT work intervals do not create new thread groups. |
1111 | * There can only be one CA_CLIENT work interval (created by UIKit or AppKit) |
		 * per application task.
1113 | */ |
1114 | if (create_flags & WORK_INTERVAL_FLAG_GROUP) { |
1115 | return KERN_FAILURE; |
1116 | } |
		if (!task_is_app(creating_task)) {
1118 | #if XNU_TARGET_OS_OSX |
1119 | /* |
1120 | * Soft-fail the case of a non-app pretending to be an |
1121 | * app, by allowing it to press the buttons, but they're |
1122 | * not actually connected to anything. |
1123 | */ |
1124 | create_flags |= WORK_INTERVAL_FLAG_IGNORED; |
1125 | #else |
1126 | /* |
1127 | * On iOS, it's a hard failure to get your apptype |
1128 | * wrong and then try to render something. |
1129 | */ |
1130 | return KERN_NOT_SUPPORTED; |
1131 | #endif /* XNU_TARGET_OS_OSX */ |
1132 | } |
		if (task_set_ca_client_wi(creating_task, true) == false) {
1134 | return KERN_FAILURE; |
1135 | } |
1136 | } |
1137 | |
1138 | #if CONFIG_SCHED_AUTO_JOIN |
1139 | if (create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) { |
1140 | uint32_t type = (create_flags & WORK_INTERVAL_TYPE_MASK); |
1141 | if (type != WORK_INTERVAL_TYPE_COREAUDIO) { |
1142 | return KERN_NOT_SUPPORTED; |
1143 | } |
1144 | if ((create_flags & WORK_INTERVAL_FLAG_GROUP) == 0) { |
1145 | return KERN_NOT_SUPPORTED; |
1146 | } |
1147 | } |
1148 | |
1149 | if (create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) { |
1150 | if ((create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) == 0) { |
1151 | return KERN_NOT_SUPPORTED; |
1152 | } |
1153 | } |
1154 | #endif /* CONFIG_SCHED_AUTO_JOIN */ |
1155 | |
1156 | struct work_interval *work_interval = kalloc_type(struct work_interval, |
1157 | Z_WAITOK | Z_ZERO | Z_NOFAIL); |
1158 | |
1159 | uint64_t work_interval_id = os_atomic_inc(&unique_work_interval_id, relaxed); |
1160 | |
1161 | *work_interval = (struct work_interval) { |
1162 | .wi_id = work_interval_id, |
1163 | .wi_ref_count = {}, |
1164 | .wi_create_flags = create_flags, |
		.wi_creator_pid = pid_from_task(creating_task),
		.wi_creator_uniqueid = get_task_uniqueid(creating_task),
		.wi_creator_pidversion = get_task_version(creating_task),
1168 | }; |
1169 | os_ref_init(&work_interval->wi_ref_count, NULL); |
1170 | |
1171 | if (work_interval_telemetry_data_enabled(work_interval)) { |
		recount_work_interval_init(&work_interval->wi_recount);
1173 | } |
1174 | |
1175 | __kdebug_only uint64_t tg_id = 0; |
1176 | #if CONFIG_THREAD_GROUPS |
1177 | struct thread_group *tg; |
1178 | if ((create_flags & |
1179 | (WORK_INTERVAL_FLAG_GROUP | WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) == |
1180 | (WORK_INTERVAL_FLAG_GROUP | WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) { |
1181 | /* defer creation of the thread group until the |
1182 | * kern_work_interval_set_workload_id() call */ |
1183 | work_interval->wi_group = NULL; |
1184 | } else if (create_flags & WORK_INTERVAL_FLAG_GROUP) { |
1185 | /* create a new group for the interval to represent */ |
		char name[THREAD_GROUP_MAXNAME] = "";
1187 | |
		snprintf(name, sizeof(name), "WI%lld (pid %d)", work_interval_id,
1189 | work_interval->wi_creator_pid); |
1190 | |
1191 | tg = thread_group_create_and_retain(THREAD_GROUP_FLAGS_DEFAULT); |
1192 | |
1193 | thread_group_set_name(tg, name); |
1194 | |
1195 | work_interval->wi_group = tg; |
1196 | } else { |
1197 | /* the interval represents the thread's home group */ |
		tg = thread_group_get_home_group(thread);
1199 | |
1200 | thread_group_retain(tg); |
1201 | |
1202 | work_interval->wi_group = tg; |
1203 | } |
1204 | |
1205 | /* Capture the tg_id for tracing purposes */ |
	tg_id = work_interval->wi_group ? thread_group_get_id(work_interval->wi_group) : ~0;
1207 | |
1208 | #endif /* CONFIG_THREAD_GROUPS */ |
1209 | |
1210 | if (create_flags & WORK_INTERVAL_FLAG_JOINABLE) { |
1211 | mach_port_name_t name = MACH_PORT_NULL; |
1212 | |
1213 | /* work_interval has a +1 ref, moves to the port */ |
		work_interval->wi_port = ipc_kobject_alloc_port(
		    (ipc_kobject_t)work_interval, IKOT_WORK_INTERVAL,
		    IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST);
1217 | |
1218 | |
		name = ipc_port_copyout_send(work_interval->wi_port, current_space());
1220 | |
1221 | if (!MACH_PORT_VALID(name)) { |
1222 | /* |
1223 | * copyout failed (port is already deallocated) |
1224 | * Because of the port-destroyed magic, |
1225 | * the work interval is already deallocated too. |
1226 | */ |
1227 | return KERN_RESOURCE_SHORTAGE; |
1228 | } |
1229 | |
1230 | create_params->wica_port = name; |
1231 | } else { |
1232 | /* work_interval has a +1 ref, moves to the thread */ |
1233 | kern_return_t kr = thread_set_work_interval_explicit_join(thread, work_interval); |
1234 | if (kr != KERN_SUCCESS) { |
1235 | /* No other thread can join this work interval since it isn't |
1236 | * JOINABLE so release the reference on work interval */ |
			work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1238 | return kr; |
1239 | } |
1240 | |
1241 | create_params->wica_port = MACH_PORT_NULL; |
1242 | } |
1243 | |
1244 | create_params->wica_id = work_interval_id; |
1245 | |
1246 | if (tg_id != ~0) { |
1247 | KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CREATE), |
1248 | work_interval_id, create_flags, pid_from_task(creating_task), tg_id); |
1249 | } |
1250 | return KERN_SUCCESS; |
1251 | } |
1252 | |
1253 | kern_return_t |
1254 | kern_work_interval_get_flags_from_port(mach_port_name_t port_name, uint32_t *flags) |
1255 | { |
1256 | assert(flags != NULL); |
1257 | |
1258 | kern_return_t kr; |
1259 | struct work_interval *work_interval; |
1260 | |
	kr = port_name_to_work_interval(port_name, &work_interval);
1262 | if (kr != KERN_SUCCESS) { |
1263 | return kr; |
1264 | } |
1265 | |
1266 | assert(work_interval != NULL); |
1267 | *flags = work_interval->wi_create_flags; |
1268 | |
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1270 | |
1271 | return KERN_SUCCESS; |
1272 | } |
1273 | |
1274 | #if CONFIG_THREAD_GROUPS |
1275 | _Static_assert(WORK_INTERVAL_NAME_MAX == THREAD_GROUP_MAXNAME, |
1276 | "WORK_INTERVAL_NAME_MAX does not match THREAD_GROUP_MAXNAME" ); |
1277 | #endif /* CONFIG_THREAD_GROUPS */ |
1278 | |
1279 | kern_return_t |
1280 | kern_work_interval_set_name(mach_port_name_t port_name, __unused char *name, |
1281 | size_t len) |
1282 | { |
1283 | kern_return_t kr; |
1284 | struct work_interval *work_interval; |
1285 | |
1286 | if (len > WORK_INTERVAL_NAME_MAX) { |
1287 | return KERN_INVALID_ARGUMENT; |
1288 | } |
	kr = port_name_to_work_interval(port_name, &work_interval);
1290 | if (kr != KERN_SUCCESS) { |
1291 | return kr; |
1292 | } |
1293 | |
1294 | assert(work_interval != NULL); |
1295 | |
1296 | #if CONFIG_THREAD_GROUPS |
1297 | uint32_t wi_group_flags = os_atomic_load( |
1298 | &work_interval->wi_group_flags, relaxed); |
1299 | if (wi_group_flags & WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED) { |
1300 | kr = KERN_INVALID_ARGUMENT; |
1301 | goto out; |
1302 | } |
1303 | if (!work_interval->wi_group) { |
1304 | kr = KERN_INVALID_ARGUMENT; |
1305 | goto out; |
1306 | } |
1307 | |
1308 | if (name[0] && (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP)) { |
1309 | char tgname[THREAD_GROUP_MAXNAME]; |
		snprintf(tgname, sizeof(tgname), "WI%lld %s", work_interval->wi_id,
		    name);
		thread_group_set_name(work_interval->wi_group, tgname);
1313 | } |
1314 | |
1315 | out: |
1316 | #endif /* CONFIG_THREAD_GROUPS */ |
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1318 | |
1319 | return kr; |
1320 | } |
1321 | |
1322 | kern_return_t |
1323 | kern_work_interval_set_workload_id(mach_port_name_t port_name, |
1324 | struct kern_work_interval_workload_id_args *workload_id_args) |
1325 | { |
1326 | kern_return_t kr; |
1327 | struct work_interval *work_interval; |
1328 | uint32_t wlida_flags = 0; |
1329 | uint32_t wlid_flags = 0; |
1330 | #if CONFIG_THREAD_GROUPS |
1331 | uint32_t tg_flags = 0; |
1332 | #endif |
1333 | bool from_workload_config = false; |
1334 | |
1335 | /* Ensure workload ID name is non-empty. */ |
1336 | if (!workload_id_args->wlida_name[0]) { |
1337 | return KERN_INVALID_ARGUMENT; |
1338 | } |
1339 | |
	kr = port_name_to_work_interval(port_name, &work_interval);
1341 | if (kr != KERN_SUCCESS) { |
1342 | return kr; |
1343 | } |
1344 | |
1345 | assert(work_interval != NULL); |
1346 | if (!(work_interval->wi_create_flags & WORK_INTERVAL_FLAG_JOINABLE)) { |
1347 | kr = KERN_INVALID_ARGUMENT; |
1348 | goto out; |
1349 | } |
1350 | |
1351 | if (!(work_interval->wi_create_flags & WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) { |
1352 | /* Reject work intervals that didn't indicate they will have a workload ID |
1353 | * at creation. In particular if the work interval has its own thread group, |
1354 | * its creation must have been deferred in kern_work_interval_create */ |
1355 | kr = KERN_INVALID_ARGUMENT; |
1356 | goto out; |
1357 | } |
1358 | |
1359 | workload_config_t wl_config = {}; |
	kr = workload_config_lookup_default(workload_id_args->wlida_name, &wl_config);
1361 | if (kr == KERN_SUCCESS) { |
1362 | if ((wl_config.wc_create_flags & WORK_INTERVAL_TYPE_MASK) != |
1363 | (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK)) { |
1364 | if ((wl_config.wc_create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_CA_RENDER_SERVER && |
1365 | (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_FRAME_COMPOSITOR) { |
1366 | /* WORK_INTERVAL_TYPE_FRAME_COMPOSITOR is a valid related type of WORK_INTERVAL_TYPE_CA_RENDER_SERVER */ |
1367 | } else { |
1368 | kr = KERN_INVALID_ARGUMENT; |
1369 | goto out; |
1370 | } |
1371 | } |
1372 | |
1373 | wlida_flags = wl_config.wc_flags; |
1374 | |
1375 | wlida_flags &= ~WORK_INTERVAL_WORKLOAD_ID_RT_CRITICAL; |
1376 | |
1377 | #if CONFIG_THREAD_GROUPS |
1378 | tg_flags = wl_config.wc_thread_group_flags; |
1379 | if (tg_flags != THREAD_GROUP_FLAGS_ABSENT && |
1380 | (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP) == 0) { |
1381 | kr = KERN_INVALID_ARGUMENT; |
1382 | goto out; |
1383 | } |
1384 | #endif /* CONFIG_THREAD_GROUPS */ |
1385 | |
1386 | from_workload_config = true; |
1387 | } else { |
1388 | /* If the workload is not present in the table, perform basic validation |
1389 | * that the create flags passed in match the ones used at work interval |
1390 | * create time */ |
1391 | if ((workload_id_args->wlida_wicreate_flags & WORK_INTERVAL_TYPE_MASK) != |
1392 | (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK)) { |
1393 | kr = KERN_INVALID_ARGUMENT; |
1394 | goto out; |
1395 | } |
1396 | |
1397 | const bool wc_avail = workload_config_available(); |
1398 | if (!wc_avail) { |
1399 | wlida_flags = WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED; |
1400 | } |
1401 | |
1402 | /* |
1403 | * If the workload config wasn't even loaded then fallback to |
1404 | * older behaviour where the new thread group gets the default |
1405 | * thread group flags (when WORK_INTERVAL_FLAG_GROUP is set). |
1406 | */ |
1407 | #if CONFIG_THREAD_GROUPS |
1408 | if (!wc_avail) { |
1409 | tg_flags = THREAD_GROUP_FLAGS_DEFAULT; |
1410 | } else { |
1411 | struct thread_group *home_group = |
			    thread_group_get_home_group(current_thread());
1413 | if (home_group != NULL) { |
1414 | tg_flags = thread_group_get_flags(home_group); |
1415 | } |
1416 | } |
1417 | #endif /* CONFIG_THREAD_GROUPS */ |
1418 | } |
1419 | |
1420 | workload_id_args->wlida_wicreate_flags = work_interval->wi_create_flags; |
1421 | |
1422 | /* cmpxchg a non-zero workload ID flags value (indicating that workload ID |
1423 | * has been set). */ |
1424 | wlida_flags |= WORK_INTERVAL_WORKLOAD_ID_HAS_ID; |
1425 | if (os_atomic_cmpxchgv(&work_interval->wi_wlid_flags, 0, wlida_flags, |
1426 | &wlid_flags, relaxed)) { |
1427 | if (from_workload_config) { |
1428 | work_interval->wi_class = wl_config.wc_class; |
1429 | work_interval->wi_class_offset = wl_config.wc_class_offset; |
1430 | } |
1431 | #if CONFIG_THREAD_GROUPS |
1432 | if (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP) { |
1433 | /* Perform deferred thread group creation, now that tgflags are known */ |
1434 | struct thread_group *tg; |
			tg = thread_group_create_and_retain(tg_flags == THREAD_GROUP_FLAGS_ABSENT ?
1436 | THREAD_GROUP_FLAGS_DEFAULT : tg_flags); |
1437 | |
			char tgname[THREAD_GROUP_MAXNAME] = "";
			snprintf(tgname, sizeof(tgname), "WI%lld %s", work_interval->wi_id,
			    workload_id_args->wlida_name);
			thread_group_set_name(tg, tgname);
1442 | |
1443 | assert(work_interval->wi_group == NULL); |
1444 | work_interval->wi_group = tg; |
1445 | KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CREATE), |
1446 | work_interval->wi_id, work_interval->wi_create_flags, |
1447 | work_interval->wi_creator_pid, thread_group_get_id(tg)); |
1448 | } |
1449 | #endif /* CONFIG_THREAD_GROUPS */ |
1450 | } else { |
1451 | /* Workload ID has previously been set (or a thread has already joined). */ |
1452 | if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_ALREADY_JOINED) { |
1453 | kr = KERN_INVALID_ARGUMENT; |
1454 | goto out; |
1455 | } |
1456 | /* Treat this request as a query for the out parameters of the ID */ |
1457 | workload_id_args->wlida_flags = wlid_flags; |
1458 | } |
1459 | |
1460 | /* |
1461 | * Emit tracepoints for successfully setting the workload ID. |
1462 | * |
1463 | * After rdar://89342390 has been fixed and a new work interval ktrace |
1464 | * provider has been added, it will be possible to associate a numeric |
1465 | * ID with an ID name. Thus, for those cases where the ID name has been |
1466 | * looked up successfully (`from_workload_config` is true) it will no |
1467 | * longer be necessary to emit a tracepoint with the full ID name. |
1468 | */ |
1469 | KDBG(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_SET_WORKLOAD_ID), |
1470 | work_interval->wi_id, from_workload_config); |
1471 | kernel_debug_string_simple( |
1472 | MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_SET_WORKLOAD_ID_NAME), |
	    workload_id_args->wlida_name);
1474 | |
1475 | kr = KERN_SUCCESS; |
1476 | |
1477 | out: |
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1479 | |
1480 | return kr; |
1481 | } |
1482 | |
1483 | |
1484 | kern_return_t |
1485 | kern_work_interval_destroy(thread_t thread, uint64_t work_interval_id) |
1486 | { |
1487 | if (work_interval_id == 0) { |
1488 | return KERN_INVALID_ARGUMENT; |
1489 | } |
1490 | |
1491 | if (thread->th_work_interval == NULL || |
1492 | thread->th_work_interval->wi_id != work_interval_id) { |
1493 | /* work ID isn't valid or doesn't match joined work interval ID */ |
1494 | return KERN_INVALID_ARGUMENT; |
1495 | } |
1496 | |
1497 | return thread_set_work_interval_explicit_join(thread, NULL); |
1498 | } |
1499 | |
1500 | kern_return_t |
1501 | kern_work_interval_join(thread_t thread, |
1502 | mach_port_name_t port_name) |
1503 | { |
1504 | struct work_interval *work_interval = NULL; |
1505 | kern_return_t kr; |
1506 | |
1507 | if (port_name == MACH_PORT_NULL) { |
1508 | /* 'Un-join' the current work interval */ |
1509 | return thread_set_work_interval_explicit_join(thread, NULL); |
1510 | } |
1511 | |
	kr = port_name_to_work_interval(port_name, &work_interval);
1513 | if (kr != KERN_SUCCESS) { |
1514 | return kr; |
1515 | } |
1516 | /* work_interval has a +1 ref */ |
1517 | |
1518 | assert(work_interval != NULL); |
1519 | |
1520 | kr = thread_set_work_interval_explicit_join(thread, work_interval); |
1521 | /* ref was consumed by passing it to the thread in the successful case */ |
1522 | if (kr != KERN_SUCCESS) { |
		work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1524 | } |
1525 | return kr; |
1526 | } |
1527 | |
1528 | /* |
1529 | * work_interval_port_type_render_server() |
1530 | * |
1531 | * Helper routine to determine if the port points to a |
1532 | * WORK_INTERVAL_TYPE_CA_RENDER_SERVER work interval. |
1533 | */ |
1534 | bool |
1535 | work_interval_port_type_render_server(mach_port_name_t port_name) |
1536 | { |
1537 | return work_interval_port_type(port_name) == WORK_INTERVAL_TYPE_CA_RENDER_SERVER; |
1538 | } |
1539 | |