/*
 * Copyright (c) 2017 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */


#include <sys/work_interval.h>

#include <kern/work_interval.h>

#include <kern/thread.h>
#include <kern/sched_prim.h>
#include <kern/machine.h>
#include <kern/thread_group.h>
#include <kern/ipc_kobject.h>
#include <kern/task.h>
#include <kern/coalition.h>
#include <kern/policy_internal.h>
#include <kern/mpsc_queue.h>
#include <kern/workload_config.h>
#include <kern/assert.h>

#include <mach/kern_return.h>
#include <mach/notify.h>
#include <os/refcnt.h>

#include <stdatomic.h>

/*
 * With the introduction of auto-join work intervals, it is possible
 * to change the work interval (and related thread group) of a thread in a
 * variety of contexts (thread termination, context switch, thread mode
 * change etc.). In order to clearly specify the policy expectation and
 * the locking behavior, all calls to thread_set_work_interval() pass
 * in a set of flags.
 */

__options_decl(thread_work_interval_options_t, uint32_t, {
	/* Change the work interval using the explicit join rules */
	THREAD_WI_EXPLICIT_JOIN_POLICY = 0x1,
	/* Change the work interval using the auto-join rules */
	THREAD_WI_AUTO_JOIN_POLICY = 0x2,
	/* Caller already holds the thread lock */
	THREAD_WI_THREAD_LOCK_HELD = 0x4,
	/* Caller does not hold the thread lock */
	THREAD_WI_THREAD_LOCK_NEEDED = 0x8,
	/* Change the work interval from the context switch path (thread may not be running or on a runq) */
	THREAD_WI_THREAD_CTX_SWITCH = 0x10,
});

static kern_return_t thread_set_work_interval(thread_t, struct work_interval *, thread_work_interval_options_t);
static void work_interval_port_no_senders(ipc_port_t, mach_port_mscount_t);

IPC_KOBJECT_DEFINE(IKOT_WORK_INTERVAL,
    .iko_op_stable = true,
    .iko_op_no_senders = work_interval_port_no_senders);

#if CONFIG_SCHED_AUTO_JOIN
/* MPSC queue used to defer deallocate work intervals */
static struct mpsc_daemon_queue work_interval_deallocate_queue;

static void work_interval_deferred_release(struct work_interval *);

/*
 * Work Interval Auto-Join Status
 *
 * work_interval_auto_join_status_t represents the state of auto-join for a given work interval.
 * It packs the following information:
 * - A bit representing if a "finish" is deferred on the work interval
 * - Count of number of threads auto-joined to the work interval
 */
#define WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK ((uint32_t)(1 << 31))
#define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK ((uint32_t)(WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK - 1))
#define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK
typedef uint32_t work_interval_auto_join_status_t;

static inline bool __unused
work_interval_status_deferred_finish(work_interval_auto_join_status_t status)
{
	return (status & WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) ? true : false;
}

static inline uint32_t __unused
work_interval_status_auto_join_count(work_interval_auto_join_status_t status)
{
	return (uint32_t)(status & WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK);
}

/*
 * struct work_interval_deferred_finish_state
 *
 * Contains the parameters of the finish operation which is being deferred.
 */
struct work_interval_deferred_finish_state {
	uint64_t instance_id;
	uint64_t start;
	uint64_t deadline;
	uint64_t complexity;
};

struct work_interval_auto_join_info {
	struct work_interval_deferred_finish_state deferred_finish_state;
	work_interval_auto_join_status_t _Atomic status;
};
#endif /* CONFIG_SCHED_AUTO_JOIN */
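
/*
 * Typical combinations (see the call sites below): explicit join/unjoin
 * requests from the current thread pass THREAD_WI_EXPLICIT_JOIN_POLICY |
 * THREAD_WI_THREAD_LOCK_NEEDED, while auto-join from the context switch
 * path passes THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD |
 * THREAD_WI_THREAD_CTX_SWITCH.
 */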
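
/*
 * For example, a status value of 0x80000003 decodes as "finish deferred"
 * with 3 auto-joined threads:
 *   work_interval_status_deferred_finish(0x80000003) == true
 *   work_interval_status_auto_join_count(0x80000003) == 3
 */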

#if CONFIG_THREAD_GROUPS
/* Flags atomically set in wi_group_flags */
#define WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED 0x1
#endif

/*
 * Work Interval struct
 *
 * This struct represents a thread group and/or work interval context
 * in a mechanism that is represented with a kobject.
 *
 * Every thread that has joined a WI has a +1 ref, and the port
 * has a +1 ref as well.
 *
 * TODO: groups need to have an 'is for WI' flag
 * and they need a flag to create that says 'for WI'
 * This would allow CLPC to avoid allocating WI support
 * data unless it is needed
 *
 * TODO: Enforce not having more than one non-group joinable work
 * interval per thread group.
 * CLPC only wants to see one WI-notify callout per group.
 */
struct work_interval {
	uint64_t wi_id;
	struct os_refcnt wi_ref_count;
	uint32_t wi_create_flags;

	/* for debugging purposes only, does not hold a ref on port */
	ipc_port_t wi_port;

	/*
	 * holds uniqueid and version of creating process,
	 * used to permission-gate notify
	 * TODO: you'd think there would be a better way to do this
	 */
	uint64_t wi_creator_uniqueid;
	uint32_t wi_creator_pid;
	int wi_creator_pidversion;

	/* flags set by work_interval_set_workload_id and reflected onto
	 * thread->th_work_interval_flags upon join */
	uint32_t wi_wlid_flags;

#if CONFIG_THREAD_GROUPS
	uint32_t wi_group_flags;
	struct thread_group *wi_group; /* holds +1 ref on group */
#endif /* CONFIG_THREAD_GROUPS */

#if CONFIG_SCHED_AUTO_JOIN
	/* Information related to auto-join and deferred finish for work interval */
	struct work_interval_auto_join_info wi_auto_join_info;

	/*
	 * Since the deallocation of auto-join work intervals
	 * can happen in the scheduler when the last thread in
	 * the WI blocks and the thread lock is held, the deallocation
	 * might have to be done on a separate thread.
	 */
	struct mpsc_queue_chain wi_deallocate_link;
#endif /* CONFIG_SCHED_AUTO_JOIN */

	/*
	 * Work interval class info - determines thread priority for threads
	 * with a work interval driven policy.
	 */
	wi_class_t wi_class;
	uint8_t wi_class_offset;

	struct recount_work_interval wi_recount;
};

/*
 * work_interval_telemetry_data_enabled()
 *
 * Helper routine to check if work interval has the collection of telemetry data enabled.
 */
static inline bool
work_interval_telemetry_data_enabled(struct work_interval *work_interval)
{
	return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_TELEMETRY_DATA) != 0;
}


/*
 * work_interval_get_recount_tracks()
 *
 * Returns the recount tracks associated with a work interval, or NULL
 * if the work interval is NULL or has telemetry disabled.
 */
inline struct recount_track *
work_interval_get_recount_tracks(struct work_interval *work_interval)
{
	if (work_interval != NULL && work_interval_telemetry_data_enabled(work_interval)) {
		return work_interval->wi_recount.rwi_current_instance;
	}
	return NULL;
}

#if CONFIG_SCHED_AUTO_JOIN

/*
 * work_interval_perform_deferred_finish()
 *
 * Perform a deferred finish for a work interval. The routine accepts the deferred_finish_state as an
 * argument rather than looking at the work_interval since the deferred finish can race with another
 * start-finish cycle. To address that, the caller ensures that it gets a consistent snapshot of the
 * deferred state before calling this routine. This allows the racing start-finish cycle to overwrite
 * the deferred state without issues.
 */
static inline void
work_interval_perform_deferred_finish(__unused struct work_interval_deferred_finish_state *deferred_finish_state,
    __unused struct work_interval *work_interval, __unused thread_t thread)
{

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_DEFERRED_FINISH),
	    thread_tid(thread), thread_group_get_id(work_interval->wi_group));
}

/*
 * work_interval_auto_join_increment()
 *
 * Routine to increment auto-join counter when a new thread is auto-joined to
 * the work interval.
 */
static void
work_interval_auto_join_increment(struct work_interval *work_interval)
{
	struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info;
	__assert_only work_interval_auto_join_status_t old_status = os_atomic_add_orig(&join_info->status, 1, relaxed);
	assert(work_interval_status_auto_join_count(old_status) < WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX);
}

/*
 * work_interval_auto_join_decrement()
 *
 * Routine to decrement the auto-join counter when a thread unjoins the work interval (due to
 * blocking or termination). If this was the last auto-joined thread in the work interval and
 * there was a deferred finish, performs the finish operation for the work interval.
 */
static void
work_interval_auto_join_decrement(struct work_interval *work_interval, thread_t thread)
{
	struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info;
	work_interval_auto_join_status_t old_status, new_status;
	struct work_interval_deferred_finish_state deferred_finish_state;
	bool perform_finish;

	/* Update the auto-join count for the work interval atomically */
	os_atomic_rmw_loop(&join_info->status, old_status, new_status, acquire, {
		perform_finish = false;
		new_status = old_status;
		assert(work_interval_status_auto_join_count(old_status) > 0);
		new_status -= 1;
		if (new_status == WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) {
		        /* No auto-joined threads remaining and finish is deferred */
		        new_status = 0;
		        perform_finish = true;
		        /*
		         * It's important to copy the deferred finish state here so that this works
		         * when racing with another start-finish cycle.
		         */
		        deferred_finish_state = join_info->deferred_finish_state;
		}
	});

	if (perform_finish == true) {
		/*
		 * Since work_interval_perform_deferred_finish() calls down to
		 * the machine layer callout for finish which gets the thread
		 * group from the thread passed in here, it is important to
		 * make sure that the thread still has the work interval thread
		 * group here.
		 */
		assert(thread->thread_group == work_interval->wi_group);
		work_interval_perform_deferred_finish(&deferred_finish_state, work_interval, thread);
	}
}

/*
 * work_interval_auto_join_enabled()
 *
 * Helper routine to check if work interval has auto-join enabled.
 */
static inline bool
work_interval_auto_join_enabled(struct work_interval *work_interval)
{
	return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) != 0;
}

/*
 * work_interval_deferred_finish_enabled()
 *
 * Helper routine to check if work interval has deferred finish enabled.
 */
static inline bool __unused
work_interval_deferred_finish_enabled(struct work_interval *work_interval)
{
	return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) != 0;
}

#endif /* CONFIG_SCHED_AUTO_JOIN */

static inline void
work_interval_retain(struct work_interval *work_interval)
{
	/*
	 * Even though wi_retain is called under a port lock, we have
	 * to use os_ref_retain instead of os_ref_retain_locked
	 * because wi_release is not synchronized. wi_release calls
	 * os_ref_release which is unsafe to pair with os_ref_retain_locked.
	 */
	os_ref_retain(&work_interval->wi_ref_count);
}

static inline void
work_interval_deallocate(struct work_interval *work_interval)
{
	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_DESTROY),
	    work_interval->wi_id);
	if (work_interval_telemetry_data_enabled(work_interval)) {
		recount_work_interval_deinit(&work_interval->wi_recount);
	}
	kfree_type(struct work_interval, work_interval);
}

/*
 * work_interval_release()
 *
 * Routine to release a ref count on the work interval. If the refcount goes down
 * to zero, the work interval needs to be de-allocated.
 *
 * Non auto-join work intervals are de-allocated in this context.
 *
 * For auto-join work intervals, the de-allocation cannot be done from this context
 * since that might need the kernel memory allocator lock. In that case, the
 * deallocation is done via a thread-call based mpsc queue.
 */
static void
work_interval_release(struct work_interval *work_interval, __unused thread_work_interval_options_t options)
{
	if (os_ref_release(&work_interval->wi_ref_count) == 0) {
#if CONFIG_SCHED_AUTO_JOIN
		if (options & THREAD_WI_THREAD_LOCK_HELD) {
			work_interval_deferred_release(work_interval);
		} else {
			work_interval_deallocate(work_interval);
		}
#else /* CONFIG_SCHED_AUTO_JOIN */
		work_interval_deallocate(work_interval);
#endif /* CONFIG_SCHED_AUTO_JOIN */
	}
}

#if CONFIG_SCHED_AUTO_JOIN

/*
 * work_interval_deferred_release()
 *
 * Routine to enqueue the work interval on the deallocation mpsc queue.
 */
static void
work_interval_deferred_release(struct work_interval *work_interval)
{
	mpsc_daemon_enqueue(&work_interval_deallocate_queue,
	    &work_interval->wi_deallocate_link, MPSC_QUEUE_NONE);
}

/*
 * work_interval_should_propagate()
 *
 * Main policy routine to decide if a thread should be auto-joined to
 * another thread's work interval. The conditions are arranged such that
 * the most common bailout conditions are checked earliest. This routine
 * is called from the scheduler context, so it needs to be efficient and
 * careful when taking locks or performing wakeups.
 */
inline bool
work_interval_should_propagate(thread_t cthread, thread_t thread)
{
	/* Only allow propagation if the current thread has a work interval and the woken up thread does not */
	if ((cthread->th_work_interval == NULL) || (thread->th_work_interval != NULL)) {
		return false;
	}

	/* Only propagate work intervals which have auto-join enabled */
	if (work_interval_auto_join_enabled(cthread->th_work_interval) == false) {
		return false;
	}

	/* Work interval propagation is enabled for realtime threads only */
	if ((cthread->sched_mode != TH_MODE_REALTIME) || (thread->sched_mode != TH_MODE_REALTIME)) {
		return false;
	}


	/* Work interval propagation only works for threads with the same home thread group */
	struct thread_group *thread_home_tg = thread_group_get_home_group(thread);
	if (thread_group_get_home_group(cthread) != thread_home_tg) {
		return false;
	}

	/* If woken up thread has adopted vouchers and other thread groups, it does not get propagation */
	if (thread->thread_group != thread_home_tg) {
		return false;
	}

	/* If either thread is inactive (in the termination path), do not propagate auto-join */
	if ((!cthread->active) || (!thread->active)) {
		return false;
	}

	return true;
}

/*
 * work_interval_auto_join_propagate()
 *
 * Routine to auto-join a thread into another thread's work interval
 *
 * Should only be invoked if work_interval_should_propagate() returns
 * true. Also expects "from" thread to be current thread and "to" thread
 * to be locked.
 */
void
work_interval_auto_join_propagate(thread_t from, thread_t to)
{
	assert(from == current_thread());
	work_interval_retain(from->th_work_interval);
	work_interval_auto_join_increment(from->th_work_interval);
	__assert_only kern_return_t kr = thread_set_work_interval(to, from->th_work_interval,
	    THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH);
	assert(kr == KERN_SUCCESS);
}

/*
 * work_interval_auto_join_unwind()
 *
 * Routine to un-join an auto-joined work interval for a thread that is blocking.
 *
 * Expects thread to be locked.
 */
void
work_interval_auto_join_unwind(thread_t thread)
{
	__assert_only kern_return_t kr = thread_set_work_interval(thread, NULL,
	    THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH);
	assert(kr == KERN_SUCCESS);
}

/*
 * work_interval_auto_join_demote()
 *
 * Routine to un-join an auto-joined work interval when a thread is changing from
 * realtime to non-realtime scheduling mode. This could happen due to multiple
 * reasons such as RT failsafe, thread backgrounding or thread termination. Also,
 * the thread being demoted may not be the current thread.
 *
 * Expects thread to be locked.
 */
void
work_interval_auto_join_demote(thread_t thread)
{
	__assert_only kern_return_t kr = thread_set_work_interval(thread, NULL,
	    THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD);
	assert(kr == KERN_SUCCESS);
}

static void
work_interval_deallocate_queue_invoke(mpsc_queue_chain_t e,
    __assert_only mpsc_daemon_queue_t dq)
{
	struct work_interval *work_interval = NULL;
	work_interval = mpsc_queue_element(e, struct work_interval, wi_deallocate_link);
	assert(dq == &work_interval_deallocate_queue);
	assert(os_ref_get_count(&work_interval->wi_ref_count) == 0);
	work_interval_deallocate(work_interval);
}

#endif /* CONFIG_SCHED_AUTO_JOIN */

#if CONFIG_SCHED_AUTO_JOIN
__startup_func
static void
work_interval_subsystem_init(void)
{
	/*
	 * The work interval deallocation queue must be a thread call based queue
	 * because it is woken up from contexts where the thread lock is held. The
	 * only way to perform wakeups safely in those contexts is to wake up a
	 * thread call which is guaranteed to be on a different waitq and would
	 * not hash onto the same global waitq which might be currently locked.
	 */
	mpsc_daemon_queue_init_with_thread_call(&work_interval_deallocate_queue,
	    work_interval_deallocate_queue_invoke, THREAD_CALL_PRIORITY_KERNEL,
	    MPSC_DAEMON_INIT_NONE);
}
STARTUP(THREAD_CALL, STARTUP_RANK_MIDDLE, work_interval_subsystem_init);
#endif /* CONFIG_SCHED_AUTO_JOIN */

/*
 * work_interval_port_convert
 *
 * Called with port locked, returns reference to work interval
 * if indeed the port is a work interval kobject port
 */
static struct work_interval *
work_interval_port_convert_locked(ipc_port_t port)
{
	struct work_interval *work_interval = NULL;

	if (IP_VALID(port)) {
		work_interval = ipc_kobject_get_stable(port, IKOT_WORK_INTERVAL);
		if (work_interval) {
			work_interval_retain(work_interval);
		}
	}

	return work_interval;
}

/*
 * port_name_to_work_interval
 *
 * Description: Obtain a reference to the work_interval associated with a given port.
 *
 * Parameters: name           A Mach port name to translate.
 *             work_interval  Out parameter that receives a +1 reference on success.
 *
 * Returns: KERN_SUCCESS  *work_interval holds the work_interval associated with the port.
 *          Otherwise     The port name did not translate to a work_interval.
 */
static kern_return_t
port_name_to_work_interval(mach_port_name_t name,
    struct work_interval **work_interval)
{
	if (!MACH_PORT_VALID(name)) {
		return KERN_INVALID_NAME;
	}

	ipc_port_t port = IP_NULL;
	kern_return_t kr = KERN_SUCCESS;

	kr = ipc_port_translate_send(current_space(), name, &port);
	if (kr != KERN_SUCCESS) {
		return kr;
	}
	/* port is locked */

	assert(IP_VALID(port));

	struct work_interval *converted_work_interval;

	converted_work_interval = work_interval_port_convert_locked(port);

	/* the port is valid, but doesn't denote a work_interval */
	if (converted_work_interval == NULL) {
		kr = KERN_INVALID_CAPABILITY;
	}

	ip_mq_unlock(port);

	if (kr == KERN_SUCCESS) {
		*work_interval = converted_work_interval;
	}

	return kr;
}


/*
 * work_interval_port_no_senders
 *
 * Description: Handle a no-senders notification for a work interval port.
 * Destroys the port and releases its reference on the work interval.
 *
 * Parameters: port     The work interval kobject port.
 *             mscount  The make-send count from the no-senders notification.
 *
 * Note: This assumes that there is only one create-right-from-work-interval point;
 * if the ability to extract another send right after creation is added,
 * this will have to change to handle make-send counts correctly.
 */
static void
work_interval_port_no_senders(ipc_port_t port, mach_port_mscount_t mscount)
{
	struct work_interval *work_interval = NULL;

	work_interval = ipc_kobject_dealloc_port(port, mscount,
	    IKOT_WORK_INTERVAL);

	work_interval->wi_port = MACH_PORT_NULL;

	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
}

/*
 * work_interval_port_type()
 *
 * Converts a port name into the work interval object and returns its type.
 *
 * For invalid ports, it returns WORK_INTERVAL_TYPE_LAST (which is not a
 * valid type for work intervals).
 */
static uint32_t
work_interval_port_type(mach_port_name_t port_name)
{
	struct work_interval *work_interval = NULL;
	kern_return_t kr;
	uint32_t work_interval_type;

	if (port_name == MACH_PORT_NULL) {
		return WORK_INTERVAL_TYPE_LAST;
	}

	kr = port_name_to_work_interval(port_name, &work_interval);
	if (kr != KERN_SUCCESS) {
		return WORK_INTERVAL_TYPE_LAST;
	}
	/* work_interval has a +1 ref */

	assert(work_interval != NULL);
	work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK;
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
	return work_interval_type;
}

/*
 * Sparse - not all work interval classes imply a scheduling policy change.
 * The REALTIME_CRITICAL class *also* requires the thread to have explicitly
 * adopted the REALTIME sched mode to take effect.
 */
static const struct {
	int priority;
	sched_mode_t sched_mode;
} work_interval_class_data[WI_CLASS_COUNT] = {
	[WI_CLASS_BEST_EFFORT] = {
		BASEPRI_DEFAULT,        // 31
		TH_MODE_TIMESHARE,
	},

	[WI_CLASS_APP_SUPPORT] = {
		BASEPRI_DEFAULT,        // 31
		TH_MODE_TIMESHARE,
	},

	[WI_CLASS_SYSTEM] = {
		BASEPRI_FOREGROUND + 1, // 48
		TH_MODE_FIXED,
	},

	[WI_CLASS_SYSTEM_CRITICAL] = {
		MAXPRI_USER + 1,        // 64
		TH_MODE_FIXED,
	},

	[WI_CLASS_REALTIME_CRITICAL] = {
		BASEPRI_RTQUEUES + 1,   // 98
		TH_MODE_REALTIME,
	},
};

/*
 * Called when a thread gets its scheduling priority from its associated work
 * interval.
 */
int
work_interval_get_priority(thread_t thread)
{
	const struct work_interval *work_interval = thread->th_work_interval;
	assert(work_interval != NULL);

	assert3u(work_interval->wi_class, !=, WI_CLASS_NONE);
	assert3u(work_interval->wi_class, <, WI_CLASS_COUNT);
	int priority = work_interval_class_data[work_interval->wi_class].priority;
	assert(priority != 0);

	priority += work_interval->wi_class_offset;
	assert3u(priority, <=, MAXPRI);

	return priority;
}
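
/*
 * For example, a work interval whose workload config resolved to
 * WI_CLASS_SYSTEM with a wi_class_offset of 2 yields a priority of
 * BASEPRI_FOREGROUND + 1 + 2 = 50.
 */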

#if CONFIG_THREAD_GROUPS
extern kern_return_t
kern_work_interval_get_policy_from_port(mach_port_name_t port_name,
    integer_t *policy,
    integer_t *priority,
    struct thread_group **tg)
{
	assert((priority != NULL) && (policy != NULL) && (tg != NULL));

	kern_return_t kr;
	struct work_interval *work_interval;

	kr = port_name_to_work_interval(port_name, &work_interval);
	if (kr != KERN_SUCCESS) {
		return kr;
	}

	/* work_interval has a +1 ref */
	assert(work_interval != NULL);
	assert3u(work_interval->wi_class, <, WI_CLASS_COUNT);

	const sched_mode_t mode = work_interval_class_data[work_interval->wi_class].sched_mode;

	if ((mode == TH_MODE_TIMESHARE) || (mode == TH_MODE_FIXED)) {
		*policy = ((mode == TH_MODE_TIMESHARE)? POLICY_TIMESHARE: POLICY_RR);
		*priority = work_interval_class_data[work_interval->wi_class].priority;
		assert(*priority != 0);
		*priority += work_interval->wi_class_offset;
		assert3u(*priority, <=, MAXPRI);
	} /* No sched mode change for REALTIME (threads must explicitly opt-in) */

	if (work_interval->wi_group) {
		*tg = thread_group_retain(work_interval->wi_group);
	}

	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
	return KERN_SUCCESS;
}
#endif /* CONFIG_THREAD_GROUPS */

/*
 * Switch to a policy driven by the work interval (if applicable).
 */
static void
work_interval_set_policy(thread_t thread)
{
	assert3p(thread, ==, current_thread());

	/*
	 * Ignore policy changes if the workload context shouldn't affect the
	 * scheduling policy.
	 */
	workload_config_flags_t flags = WLC_F_NONE;

	/* There may be no config at all. That's ok. */
	if (workload_config_get_flags(&flags) != KERN_SUCCESS ||
	    (flags & WLC_F_THREAD_POLICY) == 0) {
		return;
	}

	const struct work_interval *work_interval = thread->th_work_interval;
	assert(work_interval != NULL);

	assert3u(work_interval->wi_class, <, WI_CLASS_COUNT);
	const sched_mode_t mode = work_interval_class_data[work_interval->wi_class].sched_mode;

	/*
	 * A mode of TH_MODE_NONE implies that this work interval has no
	 * associated scheduler effects.
	 */
	if (mode == TH_MODE_NONE) {
		return;
	}

	proc_set_thread_policy_ext(thread, TASK_POLICY_ATTRIBUTE,
	    TASK_POLICY_WI_DRIVEN, true, mode);
	assert(thread->requested_policy.thrp_wi_driven);

	return;
}

/*
 * Clear a work interval driven policy.
 */
static void
work_interval_clear_policy(thread_t thread)
{
	assert3p(thread, ==, current_thread());

	if (!thread->requested_policy.thrp_wi_driven) {
		return;
	}

	const sched_mode_t mode = sched_get_thread_mode_user(thread);

	proc_set_thread_policy_ext(thread, TASK_POLICY_ATTRIBUTE,
	    TASK_POLICY_WI_DRIVEN, false,
	    mode == TH_MODE_REALTIME ? mode : TH_MODE_TIMESHARE);

	assert(!thread->requested_policy.thrp_wi_driven);

	return;
}

/*
 * thread_set_work_interval()
 *
 * Change thread's bound work interval to the passed-in work interval
 * Consumes +1 ref on work_interval upon success.
 *
 * May also pass NULL to un-set work_interval on the thread
 * Will deallocate any old work interval on the thread
 * Return error if thread does not satisfy requirements to join work interval
 *
 * For non auto-join work intervals, deallocate any old work interval on the thread
 * For auto-join work intervals, the routine may wake up the work interval deferred
 * deallocation queue since thread locks might be currently held.
 */
static kern_return_t
thread_set_work_interval(thread_t thread,
    struct work_interval *work_interval, thread_work_interval_options_t options)
{
	/* All explicit work interval operations should always be from the current thread */
	if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
		assert(thread == current_thread());
	}

	/* All cases of needing the thread lock should be from explicit join scenarios */
	if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
		assert((options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0);
	}

	/* All cases of auto-join must come in with the thread lock held */
	if (options & THREAD_WI_AUTO_JOIN_POLICY) {
		assert((options & THREAD_WI_THREAD_LOCK_HELD) != 0);
	}

#if CONFIG_THREAD_GROUPS
	if (work_interval && !work_interval->wi_group) {
		/* Reject join on work intervals with deferred thread group creation */
		return KERN_INVALID_ARGUMENT;
	}
#endif /* CONFIG_THREAD_GROUPS */

	if (work_interval) {
		uint32_t work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK;

		if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
			/* Ensure no kern_work_interval_set_workload_id can happen after this point */
			uint32_t wlid_flags;
			(void)os_atomic_cmpxchgv(&work_interval->wi_wlid_flags, 0,
			    WORK_INTERVAL_WORKLOAD_ID_ALREADY_JOINED, &wlid_flags, relaxed);
			if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED) {
				/* For workload IDs with rt-allowed, neuter the check below to
				 * enable joining before the thread has become realtime for all
				 * work interval types */
				work_interval_type = WORK_INTERVAL_TYPE_DEFAULT;
			}
		}

		if ((work_interval_type == WORK_INTERVAL_TYPE_COREAUDIO) &&
		    (thread->sched_mode != TH_MODE_REALTIME) && (thread->saved_mode != TH_MODE_REALTIME)) {
			return KERN_INVALID_ARGUMENT;
		}
	}

	/*
	 * Ensure a work interval scheduling policy is not used if the thread is
	 * leaving the work interval.
	 */
	if (work_interval == NULL &&
	    (options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0) {
		work_interval_clear_policy(thread);
	}

	struct work_interval *old_th_wi = thread->th_work_interval;
#if CONFIG_SCHED_AUTO_JOIN
	spl_t s;
	/* Take the thread lock if needed */
	if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
		s = splsched();
		thread_lock(thread);
	}

	/*
	 * Work interval auto-join leak to non-RT threads.
	 *
	 * If the thread might be running on a remote core and it's not in the context switch path (where
	 * the thread is neither running, blocked, nor on a runq), it's not possible to update the
	 * work interval & thread group remotely, since it's not possible to update CLPC for a remote
	 * core. This situation might happen when a thread is transitioning from realtime to
	 * non-realtime due to backgrounding etc., which would mean that non-RT threads would now
	 * be part of the work interval.
	 *
	 * Since there is no immediate mitigation to this issue, the policy is to set a new
	 * flag on the thread which indicates that such a "leak" has happened. This flag will
	 * be cleared when the remote thread eventually blocks and unjoins from the work interval.
	 */
	bool thread_on_remote_core = ((thread != current_thread()) && (thread->state & TH_RUN) && (thread_get_runq(thread) == PROCESSOR_NULL));

	if (thread_on_remote_core && ((options & THREAD_WI_THREAD_CTX_SWITCH) == 0)) {
		assert((options & THREAD_WI_THREAD_LOCK_NEEDED) == 0);
		os_atomic_or(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed);
		return KERN_SUCCESS;
	}

	const bool old_wi_auto_joined = ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0);

	if ((options & THREAD_WI_AUTO_JOIN_POLICY) || old_wi_auto_joined) {
		__kdebug_only uint64_t old_tg_id = (old_th_wi && old_th_wi->wi_group) ? thread_group_get_id(old_th_wi->wi_group) : ~0;
		__kdebug_only uint64_t new_tg_id = (work_interval && work_interval->wi_group) ? thread_group_get_id(work_interval->wi_group) : ~0;
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_AUTO_JOIN),
		    thread_tid(thread), old_tg_id, new_tg_id, options);
	}

	if (old_wi_auto_joined) {
		/*
		 * If thread was auto-joined to a work interval and is not realtime, make sure it
		 * happened due to the "leak" described above.
		 */
		if (thread->sched_mode != TH_MODE_REALTIME) {
			assert((thread->th_work_interval_flags & TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK) != 0);
		}

		os_atomic_andnot(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed);
		work_interval_auto_join_decrement(old_th_wi, thread);
		thread->sched_flags &= ~TH_SFLAG_THREAD_GROUP_AUTO_JOIN;
	}

#endif /* CONFIG_SCHED_AUTO_JOIN */

	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CHANGE),
	    thread_tid(thread), (old_th_wi ? old_th_wi->wi_id : 0), (work_interval ? work_interval->wi_id : 0), !!(options & THREAD_WI_AUTO_JOIN_POLICY));

	/* transfer +1 ref to thread */
	thread->th_work_interval = work_interval;

#if CONFIG_SCHED_AUTO_JOIN

	if ((options & THREAD_WI_AUTO_JOIN_POLICY) && work_interval) {
		assert(work_interval_auto_join_enabled(work_interval) == true);
		thread->sched_flags |= TH_SFLAG_THREAD_GROUP_AUTO_JOIN;
	}

	if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
		thread_unlock(thread);
		splx(s);
	}
#endif /* CONFIG_SCHED_AUTO_JOIN */

	/*
	 * The thread got a new work interval. It may come with a work interval
	 * scheduling policy that needs to be applied.
	 */
	if (work_interval != NULL &&
	    (options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0) {
		work_interval_set_policy(thread);
	}

#if CONFIG_THREAD_GROUPS
	if (work_interval) {
		/* Prevent thread_group_set_name after CLPC may have already heard
		 * about the thread group */
		(void)os_atomic_cmpxchg(&work_interval->wi_group_flags, 0,
		    WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED, relaxed);
	}
	struct thread_group *new_tg = (work_interval) ? (work_interval->wi_group) : NULL;

	if (options & THREAD_WI_AUTO_JOIN_POLICY) {
#if CONFIG_SCHED_AUTO_JOIN
		thread_set_autojoin_thread_group_locked(thread, new_tg);
#endif
	} else {
		thread_set_work_interval_thread_group(thread, new_tg);
	}
#endif /* CONFIG_THREAD_GROUPS */

	if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
		/* Construct mask to XOR with th_work_interval_flags to clear the
		 * currently present flags and set the new flags in wlid_flags. */
		uint32_t wlid_flags = 0;
		if (work_interval) {
			wlid_flags = os_atomic_load(&work_interval->wi_wlid_flags, relaxed);
		}
		thread_work_interval_flags_t th_wi_xor_mask = os_atomic_load(
			&thread->th_work_interval_flags, relaxed);
		th_wi_xor_mask &= (TH_WORK_INTERVAL_FLAGS_HAS_WORKLOAD_ID |
		    TH_WORK_INTERVAL_FLAGS_RT_ALLOWED);
		if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_HAS_ID) {
			th_wi_xor_mask ^= TH_WORK_INTERVAL_FLAGS_HAS_WORKLOAD_ID;
			if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED) {
				th_wi_xor_mask ^= TH_WORK_INTERVAL_FLAGS_RT_ALLOWED;
			}
		}
		if (th_wi_xor_mask) {
			os_atomic_xor(&thread->th_work_interval_flags, th_wi_xor_mask, relaxed);
		}

		/*
		 * Now that the interval flags have been set, re-evaluate
		 * whether the thread needs to be undemoted - the new work
		 * interval may have the RT_ALLOWED flag, and the thread may
		 * have a realtime policy but be demoted.
		 */
		thread_rt_evaluate(thread);
	}

	if (old_th_wi != NULL) {
		work_interval_release(old_th_wi, options);
	}

	return KERN_SUCCESS;
}
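		/*
		 * For example, if the thread currently has only
		 * TH_WORK_INTERVAL_FLAGS_HAS_WORKLOAD_ID set and the new work
		 * interval's workload ID is rt-allowed, the mask reduces to
		 * TH_WORK_INTERVAL_FLAGS_RT_ALLOWED, so the XOR below sets
		 * RT_ALLOWED while leaving HAS_WORKLOAD_ID set.
		 */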

static kern_return_t
thread_set_work_interval_explicit_join(thread_t thread, struct work_interval *work_interval)
{
	assert(thread == current_thread());
	return thread_set_work_interval(thread, work_interval, THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
}

kern_return_t
work_interval_thread_terminate(thread_t thread)
{
	assert(thread == current_thread());
	if (thread->th_work_interval != NULL) {
		return thread_set_work_interval(thread, NULL, THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
	}
	return KERN_SUCCESS;
}

kern_return_t
kern_work_interval_notify(thread_t thread, struct kern_work_interval_args* kwi_args)
{
	assert(thread == current_thread());
	assert(kwi_args->work_interval_id != 0);

	struct work_interval *work_interval = thread->th_work_interval;

	if (work_interval == NULL ||
	    work_interval->wi_id != kwi_args->work_interval_id) {
		/* This thread must have adopted the work interval to be able to notify */
		return KERN_INVALID_ARGUMENT;
	}

	task_t notifying_task = current_task();

	if (work_interval->wi_creator_uniqueid != get_task_uniqueid(notifying_task) ||
	    work_interval->wi_creator_pidversion != get_task_version(notifying_task)) {
		/* Only the creating task can do a notify */
		return KERN_INVALID_ARGUMENT;
	}

	spl_t s = splsched();

#if CONFIG_THREAD_GROUPS
	assert(work_interval->wi_group == thread->thread_group);
#endif /* CONFIG_THREAD_GROUPS */

	uint64_t urgency_param1, urgency_param2;
	kwi_args->urgency = (uint16_t)thread_get_urgency(thread, &urgency_param1, &urgency_param2);

	splx(s);

	/* called without interrupts disabled */
	machine_work_interval_notify(thread, kwi_args);

	return KERN_SUCCESS;
}

/* Start at 1, 0 is not a valid work interval ID */
static _Atomic uint64_t unique_work_interval_id = 1;

kern_return_t
kern_work_interval_create(thread_t thread,
    struct kern_work_interval_create_args *create_params)
{
	assert(thread == current_thread());

	uint32_t create_flags = create_params->wica_create_flags;

	if (((create_flags & WORK_INTERVAL_FLAG_JOINABLE) == 0) &&
	    thread->th_work_interval != NULL) {
		/*
		 * If the thread is doing a legacy combined create and join,
		 * it shouldn't already be part of a work interval.
		 *
		 * (Creating a joinable WI is allowed anytime.)
		 */
		return KERN_FAILURE;
	}

	/*
	 * Check the validity of the create flags before allocating the work
	 * interval.
	 */
	task_t creating_task = current_task();
	if ((create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_CA_CLIENT) {
		/*
		 * CA_CLIENT work intervals do not create new thread groups.
		 * There can only be one CA_CLIENT work interval (created by UIKit or AppKit)
		 * per application task.
		 */
		if (create_flags & WORK_INTERVAL_FLAG_GROUP) {
			return KERN_FAILURE;
		}
		if (!task_is_app(creating_task)) {
#if XNU_TARGET_OS_OSX
			/*
			 * Soft-fail the case of a non-app pretending to be an
			 * app, by allowing it to press the buttons, but they're
			 * not actually connected to anything.
			 */
			create_flags |= WORK_INTERVAL_FLAG_IGNORED;
#else
			/*
			 * On iOS, it's a hard failure to get your apptype
			 * wrong and then try to render something.
			 */
			return KERN_NOT_SUPPORTED;
#endif /* XNU_TARGET_OS_OSX */
		}
		if (task_set_ca_client_wi(creating_task, true) == false) {
			return KERN_FAILURE;
		}
	}

#if CONFIG_SCHED_AUTO_JOIN
	if (create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) {
		uint32_t type = (create_flags & WORK_INTERVAL_TYPE_MASK);
		if (type != WORK_INTERVAL_TYPE_COREAUDIO) {
			return KERN_NOT_SUPPORTED;
		}
		if ((create_flags & WORK_INTERVAL_FLAG_GROUP) == 0) {
			return KERN_NOT_SUPPORTED;
		}
	}

	if (create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) {
		if ((create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) == 0) {
			return KERN_NOT_SUPPORTED;
		}
	}
#endif /* CONFIG_SCHED_AUTO_JOIN */

	struct work_interval *work_interval = kalloc_type(struct work_interval,
	    Z_WAITOK | Z_ZERO | Z_NOFAIL);

	uint64_t work_interval_id = os_atomic_inc(&unique_work_interval_id, relaxed);

	*work_interval = (struct work_interval) {
		.wi_id = work_interval_id,
		.wi_ref_count = {},
		.wi_create_flags = create_flags,
		.wi_creator_pid = pid_from_task(creating_task),
		.wi_creator_uniqueid = get_task_uniqueid(creating_task),
		.wi_creator_pidversion = get_task_version(creating_task),
	};
	os_ref_init(&work_interval->wi_ref_count, NULL);

	if (work_interval_telemetry_data_enabled(work_interval)) {
		recount_work_interval_init(&work_interval->wi_recount);
	}

	__kdebug_only uint64_t tg_id = 0;
#if CONFIG_THREAD_GROUPS
	struct thread_group *tg;
	if ((create_flags &
	    (WORK_INTERVAL_FLAG_GROUP | WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) ==
	    (WORK_INTERVAL_FLAG_GROUP | WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) {
		/* defer creation of the thread group until the
		 * kern_work_interval_set_workload_id() call */
		work_interval->wi_group = NULL;
	} else if (create_flags & WORK_INTERVAL_FLAG_GROUP) {
		/* create a new group for the interval to represent */
		char name[THREAD_GROUP_MAXNAME] = "";

		snprintf(name, sizeof(name), "WI%lld (pid %d)", work_interval_id,
		    work_interval->wi_creator_pid);

		tg = thread_group_create_and_retain(THREAD_GROUP_FLAGS_DEFAULT);

		thread_group_set_name(tg, name);

		work_interval->wi_group = tg;
	} else {
		/* the interval represents the thread's home group */
		tg = thread_group_get_home_group(thread);

		thread_group_retain(tg);

		work_interval->wi_group = tg;
	}

	/* Capture the tg_id for tracing purposes */
	tg_id = work_interval->wi_group ? thread_group_get_id(work_interval->wi_group) : ~0;

#endif /* CONFIG_THREAD_GROUPS */

	if (create_flags & WORK_INTERVAL_FLAG_JOINABLE) {
		mach_port_name_t name = MACH_PORT_NULL;

		/* work_interval has a +1 ref, moves to the port */
		work_interval->wi_port = ipc_kobject_alloc_port(
			(ipc_kobject_t)work_interval, IKOT_WORK_INTERVAL,
			IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST);


		name = ipc_port_copyout_send(work_interval->wi_port, current_space());

		if (!MACH_PORT_VALID(name)) {
			/*
			 * copyout failed (port is already deallocated)
			 * Because of the port-destroyed magic,
			 * the work interval is already deallocated too.
			 */
			return KERN_RESOURCE_SHORTAGE;
		}

		create_params->wica_port = name;
	} else {
		/* work_interval has a +1 ref, moves to the thread */
		kern_return_t kr = thread_set_work_interval_explicit_join(thread, work_interval);
		if (kr != KERN_SUCCESS) {
			/* No other thread can join this work interval since it isn't
			 * JOINABLE so release the reference on work interval */
			work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
			return kr;
		}

		create_params->wica_port = MACH_PORT_NULL;
	}

	create_params->wica_id = work_interval_id;

	if (tg_id != ~0) {
		KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CREATE),
		    work_interval_id, create_flags, pid_from_task(creating_task), tg_id);
	}
	return KERN_SUCCESS;
}

kern_return_t
kern_work_interval_get_flags_from_port(mach_port_name_t port_name, uint32_t *flags)
{
	assert(flags != NULL);

	kern_return_t kr;
	struct work_interval *work_interval;

	kr = port_name_to_work_interval(port_name, &work_interval);
	if (kr != KERN_SUCCESS) {
		return kr;
	}

	assert(work_interval != NULL);
	*flags = work_interval->wi_create_flags;

	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);

	return KERN_SUCCESS;
}

#if CONFIG_THREAD_GROUPS
_Static_assert(WORK_INTERVAL_NAME_MAX == THREAD_GROUP_MAXNAME,
    "WORK_INTERVAL_NAME_MAX does not match THREAD_GROUP_MAXNAME");
#endif /* CONFIG_THREAD_GROUPS */

kern_return_t
kern_work_interval_set_name(mach_port_name_t port_name, __unused char *name,
    size_t len)
{
	kern_return_t kr;
	struct work_interval *work_interval;

	if (len > WORK_INTERVAL_NAME_MAX) {
		return KERN_INVALID_ARGUMENT;
	}
	kr = port_name_to_work_interval(port_name, &work_interval);
	if (kr != KERN_SUCCESS) {
		return kr;
	}

	assert(work_interval != NULL);

#if CONFIG_THREAD_GROUPS
	uint32_t wi_group_flags = os_atomic_load(
		&work_interval->wi_group_flags, relaxed);
	if (wi_group_flags & WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}
	if (!work_interval->wi_group) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	if (name[0] && (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP)) {
		char tgname[THREAD_GROUP_MAXNAME];
		snprintf(tgname, sizeof(tgname), "WI%lld %s", work_interval->wi_id,
		    name);
		thread_group_set_name(work_interval->wi_group, tgname);
	}

out:
#endif /* CONFIG_THREAD_GROUPS */
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);

	return kr;
}

kern_return_t
kern_work_interval_set_workload_id(mach_port_name_t port_name,
    struct kern_work_interval_workload_id_args *workload_id_args)
{
	kern_return_t kr;
	struct work_interval *work_interval;
	uint32_t wlida_flags = 0;
	uint32_t wlid_flags = 0;
#if CONFIG_THREAD_GROUPS
	uint32_t tg_flags = 0;
#endif
	bool from_workload_config = false;

	/* Ensure workload ID name is non-empty. */
	if (!workload_id_args->wlida_name[0]) {
		return KERN_INVALID_ARGUMENT;
	}

	kr = port_name_to_work_interval(port_name, &work_interval);
	if (kr != KERN_SUCCESS) {
		return kr;
	}

	assert(work_interval != NULL);
	if (!(work_interval->wi_create_flags & WORK_INTERVAL_FLAG_JOINABLE)) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	if (!(work_interval->wi_create_flags & WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) {
		/* Reject work intervals that didn't indicate they will have a workload ID
		 * at creation. In particular if the work interval has its own thread group,
		 * its creation must have been deferred in kern_work_interval_create */
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	workload_config_t wl_config = {};
	kr = workload_config_lookup_default(workload_id_args->wlida_name, &wl_config);
	if (kr == KERN_SUCCESS) {
		if ((wl_config.wc_create_flags & WORK_INTERVAL_TYPE_MASK) !=
		    (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK)) {
			if ((wl_config.wc_create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_CA_RENDER_SERVER &&
			    (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_FRAME_COMPOSITOR) {
				/* WORK_INTERVAL_TYPE_FRAME_COMPOSITOR is a valid related type of WORK_INTERVAL_TYPE_CA_RENDER_SERVER */
			} else {
				kr = KERN_INVALID_ARGUMENT;
				goto out;
			}
		}

		wlida_flags = wl_config.wc_flags;

		wlida_flags &= ~WORK_INTERVAL_WORKLOAD_ID_RT_CRITICAL;

#if CONFIG_THREAD_GROUPS
		tg_flags = wl_config.wc_thread_group_flags;
		if (tg_flags != THREAD_GROUP_FLAGS_ABSENT &&
		    (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP) == 0) {
			kr = KERN_INVALID_ARGUMENT;
			goto out;
		}
#endif /* CONFIG_THREAD_GROUPS */

		from_workload_config = true;
	} else {
		/* If the workload is not present in the table, perform basic validation
		 * that the create flags passed in match the ones used at work interval
		 * create time */
		if ((workload_id_args->wlida_wicreate_flags & WORK_INTERVAL_TYPE_MASK) !=
		    (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK)) {
			kr = KERN_INVALID_ARGUMENT;
			goto out;
		}

		const bool wc_avail = workload_config_available();
		if (!wc_avail) {
			wlida_flags = WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED;
		}

		/*
		 * If the workload config wasn't even loaded then fallback to
		 * older behaviour where the new thread group gets the default
		 * thread group flags (when WORK_INTERVAL_FLAG_GROUP is set).
		 */
#if CONFIG_THREAD_GROUPS
		if (!wc_avail) {
			tg_flags = THREAD_GROUP_FLAGS_DEFAULT;
		} else {
			struct thread_group *home_group =
			    thread_group_get_home_group(current_thread());
			if (home_group != NULL) {
				tg_flags = thread_group_get_flags(home_group);
			}
		}
#endif /* CONFIG_THREAD_GROUPS */
	}

	workload_id_args->wlida_wicreate_flags = work_interval->wi_create_flags;

	/* cmpxchg a non-zero workload ID flags value (indicating that workload ID
	 * has been set). */
	wlida_flags |= WORK_INTERVAL_WORKLOAD_ID_HAS_ID;
	if (os_atomic_cmpxchgv(&work_interval->wi_wlid_flags, 0, wlida_flags,
	    &wlid_flags, relaxed)) {
		if (from_workload_config) {
			work_interval->wi_class = wl_config.wc_class;
			work_interval->wi_class_offset = wl_config.wc_class_offset;
		}
#if CONFIG_THREAD_GROUPS
		if (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP) {
			/* Perform deferred thread group creation, now that tgflags are known */
			struct thread_group *tg;
			tg = thread_group_create_and_retain(tg_flags == THREAD_GROUP_FLAGS_ABSENT ?
			    THREAD_GROUP_FLAGS_DEFAULT : tg_flags);

			char tgname[THREAD_GROUP_MAXNAME] = "";
			snprintf(tgname, sizeof(tgname), "WI%lld %s", work_interval->wi_id,
			    workload_id_args->wlida_name);
			thread_group_set_name(tg, tgname);

			assert(work_interval->wi_group == NULL);
			work_interval->wi_group = tg;
			KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CREATE),
			    work_interval->wi_id, work_interval->wi_create_flags,
			    work_interval->wi_creator_pid, thread_group_get_id(tg));
		}
#endif /* CONFIG_THREAD_GROUPS */
	} else {
		/* Workload ID has previously been set (or a thread has already joined). */
		if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_ALREADY_JOINED) {
			kr = KERN_INVALID_ARGUMENT;
			goto out;
		}
		/* Treat this request as a query for the out parameters of the ID */
		workload_id_args->wlida_flags = wlid_flags;
	}

	/*
	 * Emit tracepoints for successfully setting the workload ID.
	 *
	 * After rdar://89342390 has been fixed and a new work interval ktrace
	 * provider has been added, it will be possible to associate a numeric
	 * ID with an ID name. Thus, for those cases where the ID name has been
	 * looked up successfully (`from_workload_config` is true) it will no
	 * longer be necessary to emit a tracepoint with the full ID name.
	 */
	KDBG(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_SET_WORKLOAD_ID),
	    work_interval->wi_id, from_workload_config);
	kernel_debug_string_simple(
	    MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_SET_WORKLOAD_ID_NAME),
	    workload_id_args->wlida_name);

	kr = KERN_SUCCESS;

out:
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);

	return kr;
}


kern_return_t
kern_work_interval_destroy(thread_t thread, uint64_t work_interval_id)
{
	if (work_interval_id == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	if (thread->th_work_interval == NULL ||
	    thread->th_work_interval->wi_id != work_interval_id) {
		/* work ID isn't valid or doesn't match joined work interval ID */
		return KERN_INVALID_ARGUMENT;
	}

	return thread_set_work_interval_explicit_join(thread, NULL);
}

kern_return_t
kern_work_interval_join(thread_t thread,
    mach_port_name_t port_name)
{
	struct work_interval *work_interval = NULL;
	kern_return_t kr;

	if (port_name == MACH_PORT_NULL) {
		/* 'Un-join' the current work interval */
		return thread_set_work_interval_explicit_join(thread, NULL);
	}

	kr = port_name_to_work_interval(port_name, &work_interval);
	if (kr != KERN_SUCCESS) {
		return kr;
	}
	/* work_interval has a +1 ref */

	assert(work_interval != NULL);

	kr = thread_set_work_interval_explicit_join(thread, work_interval);
	/* ref was consumed by passing it to the thread in the successful case */
	if (kr != KERN_SUCCESS) {
		work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
	}
	return kr;
}

/*
 * work_interval_port_type_render_server()
 *
 * Helper routine to determine if the port points to a
 * WORK_INTERVAL_TYPE_CA_RENDER_SERVER work interval.
 */
bool
work_interval_port_type_render_server(mach_port_name_t port_name)
{
	return work_interval_port_type(port_name) == WORK_INTERVAL_TYPE_CA_RENDER_SERVER;
}