1 | /* |
2 | * Copyright (c) 2017 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | |
29 | |
30 | #include <sys/work_interval.h> |
31 | |
32 | #include <kern/work_interval.h> |
33 | |
34 | #include <kern/thread.h> |
35 | #include <kern/sched_prim.h> |
36 | #include <kern/machine.h> |
37 | #include <kern/thread_group.h> |
38 | #include <kern/ipc_kobject.h> |
39 | #include <kern/task.h> |
40 | #include <kern/coalition.h> |
41 | #include <kern/policy_internal.h> |
42 | #include <kern/mpsc_queue.h> |
43 | #include <kern/workload_config.h> |
44 | #include <kern/assert.h> |
45 | |
46 | #include <mach/kern_return.h> |
47 | #include <mach/notify.h> |
48 | #include <os/refcnt.h> |
49 | |
50 | #include <stdatomic.h> |
51 | |
52 | /* |
53 | * With the introduction of auto-join work intervals, it is possible |
54 | * to change the work interval (and related thread group) of a thread in a |
55 | * variety of contexts (thread termination, context switch, thread mode |
56 | * change etc.). In order to clearly specify the policy expectation and |
57 | * the locking behavior, all calls to thread_set_work_interval() pass |
58 | * in a set of flags. |
59 | */ |
60 | |
61 | __options_decl(thread_work_interval_options_t, uint32_t, { |
62 | /* Change the work interval using the explicit join rules */ |
63 | THREAD_WI_EXPLICIT_JOIN_POLICY = 0x1, |
64 | /* Change the work interval using the auto-join rules */ |
65 | THREAD_WI_AUTO_JOIN_POLICY = 0x2, |
66 | /* Caller already holds the thread lock */ |
67 | THREAD_WI_THREAD_LOCK_HELD = 0x4, |
68 | /* Caller does not hold the thread lock */ |
69 | THREAD_WI_THREAD_LOCK_NEEDED = 0x8, |
70 | /* Change the work interval from the context switch path (thread may not be running or on a runq) */ |
71 | THREAD_WI_THREAD_CTX_SWITCH = 0x10, |
72 | }); |
73 | |
74 | static kern_return_t thread_set_work_interval(thread_t, struct work_interval *, thread_work_interval_options_t); |
75 | static void work_interval_port_no_senders(ipc_port_t, mach_port_mscount_t); |
76 | |
77 | IPC_KOBJECT_DEFINE(IKOT_WORK_INTERVAL, |
78 | .iko_op_stable = true, |
79 | .iko_op_no_senders = work_interval_port_no_senders); |
80 | |
81 | #if CONFIG_SCHED_AUTO_JOIN |
/* MPSC queue used to defer deallocation of work intervals */
83 | static struct mpsc_daemon_queue work_interval_deallocate_queue; |
84 | |
85 | static void work_interval_deferred_release(struct work_interval *); |
86 | |
87 | /* |
88 | * Work Interval Auto-Join Status |
89 | * |
90 | * work_interval_auto_join_status_t represents the state of auto-join for a given work interval. |
91 | * It packs the following information: |
 * - A bit indicating whether a "finish" is deferred on the work interval
 * - A count of the threads currently auto-joined to the work interval
94 | */ |
95 | #define WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK ((uint32_t)(1 << 31)) |
96 | #define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK ((uint32_t)(WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK - 1)) |
97 | #define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK |
98 | typedef uint32_t work_interval_auto_join_status_t; |
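
/*
 * For example, a status value of
 * (WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK | 2) describes a work interval
 * with two auto-joined threads and a pending deferred finish, while a value
 * of 0 means no auto-joined threads and no deferred finish.
 */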
99 | |
100 | static inline bool __unused |
101 | work_interval_status_deferred_finish(work_interval_auto_join_status_t status) |
102 | { |
103 | return (status & WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) ? true : false; |
104 | } |
105 | |
106 | static inline uint32_t __unused |
107 | work_interval_status_auto_join_count(work_interval_auto_join_status_t status) |
108 | { |
109 | return (uint32_t)(status & WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK); |
110 | } |
111 | |
112 | /* |
113 | * struct work_interval_deferred_finish_state |
114 | * |
115 | * Contains the parameters of the finish operation which is being deferred. |
116 | */ |
117 | struct work_interval_deferred_finish_state { |
118 | uint64_t instance_id; |
119 | uint64_t start; |
120 | uint64_t deadline; |
121 | uint64_t complexity; |
122 | }; |
123 | |
124 | struct work_interval_auto_join_info { |
125 | struct work_interval_deferred_finish_state deferred_finish_state; |
126 | work_interval_auto_join_status_t _Atomic status; |
127 | }; |
128 | #endif /* CONFIG_SCHED_AUTO_JOIN */ |
129 | |
130 | #if CONFIG_THREAD_GROUPS |
/* Flags atomically set in wi_group_flags */
132 | #define WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED 0x1 |
133 | #endif |
134 | |
135 | /* |
136 | * Work Interval struct |
137 | * |
138 | * This struct represents a thread group and/or work interval context |
139 | * in a mechanism that is represented with a kobject. |
140 | * |
141 | * Every thread that has joined a WI has a +1 ref, and the port |
142 | * has a +1 ref as well. |
143 | * |
 * TODO: groups need to have an 'is for WI' flag
 * and they need a create-time flag that says 'for WI'
146 | * This would allow CLPC to avoid allocating WI support |
147 | * data unless it is needed |
148 | * |
149 | * TODO: Enforce not having more than one non-group joinable work |
150 | * interval per thread group. |
151 | * CLPC only wants to see one WI-notify callout per group. |
152 | */ |
153 | struct work_interval { |
154 | uint64_t wi_id; |
155 | struct os_refcnt wi_ref_count; |
156 | uint32_t wi_create_flags; |
157 | |
158 | /* for debugging purposes only, does not hold a ref on port */ |
159 | ipc_port_t wi_port; |
160 | |
161 | /* |
162 | * holds uniqueid and version of creating process, |
163 | * used to permission-gate notify |
164 | * TODO: you'd think there would be a better way to do this |
165 | */ |
166 | uint64_t wi_creator_uniqueid; |
167 | uint32_t wi_creator_pid; |
168 | int wi_creator_pidversion; |
169 | |
170 | /* flags set by work_interval_set_workload_id and reflected onto |
171 | * thread->th_work_interval_flags upon join */ |
172 | uint32_t wi_wlid_flags; |
173 | |
174 | #if CONFIG_THREAD_GROUPS |
175 | uint32_t wi_group_flags; |
176 | struct thread_group *wi_group; /* holds +1 ref on group */ |
177 | #endif /* CONFIG_THREAD_GROUPS */ |
178 | |
179 | #if CONFIG_SCHED_AUTO_JOIN |
180 | /* Information related to auto-join and deferred finish for work interval */ |
181 | struct work_interval_auto_join_info wi_auto_join_info; |
182 | |
183 | /* |
184 | * Since the deallocation of auto-join work intervals |
185 | * can happen in the scheduler when the last thread in |
186 | * the WI blocks and the thread lock is held, the deallocation |
187 | * might have to be done on a separate thread. |
188 | */ |
189 | struct mpsc_queue_chain wi_deallocate_link; |
190 | #endif /* CONFIG_SCHED_AUTO_JOIN */ |
191 | |
192 | /* |
193 | * Work interval class info - determines thread priority for threads |
194 | * with a work interval driven policy. |
195 | */ |
196 | wi_class_t wi_class; |
197 | uint8_t wi_class_offset; |
198 | |
199 | struct recount_work_interval wi_recount; |
200 | }; |
201 | |
202 | /* |
203 | * work_interval_telemetry_data_enabled() |
204 | * |
 * Helper routine to check whether the work interval has telemetry data collection enabled.
206 | */ |
207 | static inline bool |
208 | work_interval_telemetry_data_enabled(struct work_interval *work_interval) |
209 | { |
210 | return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_TELEMETRY_DATA) != 0; |
211 | } |
212 | |
213 | |
214 | /* |
215 | * work_interval_get_recount_tracks() |
216 | * |
217 | * Returns the recount tracks associated with a work interval, or NULL |
218 | * if the work interval is NULL or has telemetry disabled. |
219 | */ |
220 | inline struct recount_track * |
221 | work_interval_get_recount_tracks(struct work_interval *work_interval) |
222 | { |
223 | if (work_interval != NULL && work_interval_telemetry_data_enabled(work_interval)) { |
224 | return work_interval->wi_recount.rwi_current_instance; |
225 | } |
226 | return NULL; |
227 | } |
228 | |
229 | #if CONFIG_SCHED_AUTO_JOIN |
230 | |
231 | /* |
232 | * work_interval_perform_deferred_finish() |
233 | * |
234 | * Perform a deferred finish for a work interval. The routine accepts the deferred_finish_state as an |
235 | * argument rather than looking at the work_interval since the deferred finish can race with another |
236 | * start-finish cycle. To address that, the caller ensures that it gets a consistent snapshot of the |
237 | * deferred state before calling this routine. This allows the racing start-finish cycle to overwrite |
238 | * the deferred state without issues. |
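 *
 * For example, if a new start overwrites join_info->deferred_finish_state
 * while the last auto-joined thread is draining, that thread still operates
 * on the snapshot it copied under the atomic update in
 * work_interval_auto_join_decrement(), not on the overwritten state.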
239 | */ |
240 | static inline void |
241 | work_interval_perform_deferred_finish(__unused struct work_interval_deferred_finish_state *deferred_finish_state, |
242 | __unused struct work_interval *work_interval, __unused thread_t thread) |
243 | { |
244 | |
245 | KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_DEFERRED_FINISH), |
246 | thread_tid(thread), thread_group_get_id(work_interval->wi_group)); |
247 | } |
248 | |
249 | /* |
250 | * work_interval_auto_join_increment() |
251 | * |
 * Routine to increment the auto-join counter when a new thread is auto-joined to
253 | * the work interval. |
254 | */ |
255 | static void |
256 | work_interval_auto_join_increment(struct work_interval *work_interval) |
257 | { |
258 | struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info; |
259 | __assert_only work_interval_auto_join_status_t old_status = os_atomic_add_orig(&join_info->status, 1, relaxed); |
260 | assert(work_interval_status_auto_join_count(old_status) < WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX); |
261 | } |
262 | |
263 | /* |
264 | * work_interval_auto_join_decrement() |
265 | * |
266 | * Routine to decrement the auto-join counter when a thread unjoins the work interval (due to |
267 | * blocking or termination). If this was the last auto-joined thread in the work interval and |
268 | * there was a deferred finish, performs the finish operation for the work interval. |
269 | */ |
270 | static void |
271 | work_interval_auto_join_decrement(struct work_interval *work_interval, thread_t thread) |
272 | { |
273 | struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info; |
274 | work_interval_auto_join_status_t old_status, new_status; |
275 | struct work_interval_deferred_finish_state deferred_finish_state; |
276 | bool perform_finish; |
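
	/*
	 * For example, a pre-decrement status of
	 * (WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK | 1) drains to exactly
	 * WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK, meaning the last auto-joined
	 * thread is leaving while a finish is pending; the loop below then clears
	 * the status to 0 and the deferred finish is performed outside the atomic
	 * update.
	 */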
277 | |
278 | /* Update the auto-join count for the work interval atomically */ |
279 | os_atomic_rmw_loop(&join_info->status, old_status, new_status, acquire, { |
280 | perform_finish = false; |
281 | new_status = old_status; |
282 | assert(work_interval_status_auto_join_count(old_status) > 0); |
283 | new_status -= 1; |
284 | if (new_status == WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) { |
285 | /* No auto-joined threads remaining and finish is deferred */ |
286 | new_status = 0; |
287 | perform_finish = true; |
288 | /* |
			 * It's important to copy the deferred finish state here so that this works
290 | * when racing with another start-finish cycle. |
291 | */ |
292 | deferred_finish_state = join_info->deferred_finish_state; |
293 | } |
294 | }); |
295 | |
296 | if (perform_finish == true) { |
297 | /* |
298 | * Since work_interval_perform_deferred_finish() calls down to |
299 | * the machine layer callout for finish which gets the thread |
300 | * group from the thread passed in here, it is important to |
301 | * make sure that the thread still has the work interval thread |
302 | * group here. |
303 | */ |
304 | assert(thread->thread_group == work_interval->wi_group); |
		work_interval_perform_deferred_finish(&deferred_finish_state, work_interval, thread);
306 | } |
307 | } |
308 | |
309 | /* |
310 | * work_interval_auto_join_enabled() |
311 | * |
312 | * Helper routine to check if work interval has auto-join enabled. |
313 | */ |
314 | static inline bool |
315 | work_interval_auto_join_enabled(struct work_interval *work_interval) |
316 | { |
317 | return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) != 0; |
318 | } |
319 | |
320 | /* |
321 | * work_interval_deferred_finish_enabled() |
322 | * |
323 | * Helper routine to check if work interval has deferred finish enabled. |
324 | */ |
325 | static inline bool __unused |
326 | work_interval_deferred_finish_enabled(struct work_interval *work_interval) |
327 | { |
328 | return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) != 0; |
329 | } |
330 | |
331 | #endif /* CONFIG_SCHED_AUTO_JOIN */ |
332 | |
333 | static inline void |
334 | work_interval_retain(struct work_interval *work_interval) |
335 | { |
336 | /* |
337 | * Even though wi_retain is called under a port lock, we have |
338 | * to use os_ref_retain instead of os_ref_retain_locked |
339 | * because wi_release is not synchronized. wi_release calls |
340 | * os_ref_release which is unsafe to pair with os_ref_retain_locked. |
341 | */ |
	os_ref_retain(&work_interval->wi_ref_count);
343 | } |
344 | |
345 | static inline void |
346 | work_interval_deallocate(struct work_interval *work_interval) |
347 | { |
348 | KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_DESTROY), |
349 | work_interval->wi_id); |
350 | if (work_interval_telemetry_data_enabled(work_interval)) { |
		recount_work_interval_deinit(&work_interval->wi_recount);
352 | } |
353 | kfree_type(struct work_interval, work_interval); |
354 | } |
355 | |
356 | /* |
357 | * work_interval_release() |
358 | * |
359 | * Routine to release a ref count on the work interval. If the refcount goes down |
360 | * to zero, the work interval needs to be de-allocated. |
361 | * |
 * Non auto-join work intervals are de-allocated in this context.
363 | * |
364 | * For auto-join work intervals, the de-allocation cannot be done from this context |
365 | * since that might need the kernel memory allocator lock. In that case, the |
366 | * deallocation is done via a thread-call based mpsc queue. |
367 | */ |
368 | static void |
369 | work_interval_release(struct work_interval *work_interval, __unused thread_work_interval_options_t options) |
370 | { |
	if (os_ref_release(&work_interval->wi_ref_count) == 0) {
372 | #if CONFIG_SCHED_AUTO_JOIN |
373 | if (options & THREAD_WI_THREAD_LOCK_HELD) { |
374 | work_interval_deferred_release(work_interval); |
375 | } else { |
376 | work_interval_deallocate(work_interval); |
377 | } |
378 | #else /* CONFIG_SCHED_AUTO_JOIN */ |
379 | work_interval_deallocate(work_interval); |
380 | #endif /* CONFIG_SCHED_AUTO_JOIN */ |
381 | } |
382 | } |
383 | |
384 | #if CONFIG_SCHED_AUTO_JOIN |
385 | |
386 | /* |
387 | * work_interval_deferred_release() |
388 | * |
389 | * Routine to enqueue the work interval on the deallocation mpsc queue. |
390 | */ |
391 | static void |
392 | work_interval_deferred_release(struct work_interval *work_interval) |
393 | { |
	mpsc_daemon_enqueue(&work_interval_deallocate_queue,
	    &work_interval->wi_deallocate_link, MPSC_QUEUE_NONE);
396 | } |
397 | |
398 | /* |
399 | * work_interval_should_propagate() |
400 | * |
401 | * Main policy routine to decide if a thread should be auto-joined to |
402 | * another thread's work interval. The conditions are arranged such that |
 * the most common bailout conditions are checked earliest. This routine
 * is called from scheduler context, so it needs to be efficient and must
 * be careful when taking locks or performing wakeups.
406 | */ |
407 | inline bool |
408 | work_interval_should_propagate(thread_t cthread, thread_t thread) |
409 | { |
410 | /* Only allow propagation if the current thread has a work interval and the woken up thread does not */ |
411 | if ((cthread->th_work_interval == NULL) || (thread->th_work_interval != NULL)) { |
412 | return false; |
413 | } |
414 | |
415 | /* Only propagate work intervals which have auto-join enabled */ |
	if (work_interval_auto_join_enabled(cthread->th_work_interval) == false) {
417 | return false; |
418 | } |
419 | |
420 | /* Work interval propagation is enabled for realtime threads only */ |
421 | if ((cthread->sched_mode != TH_MODE_REALTIME) || (thread->sched_mode != TH_MODE_REALTIME)) { |
422 | return false; |
423 | } |
424 | |
425 | |
426 | /* Work interval propagation only works for threads with the same home thread group */ |
	struct thread_group *thread_home_tg = thread_group_get_home_group(thread);
	if (thread_group_get_home_group(cthread) != thread_home_tg) {
429 | return false; |
430 | } |
431 | |
	/* If the woken-up thread has adopted vouchers and other thread groups, it does not get propagation */
433 | if (thread->thread_group != thread_home_tg) { |
434 | return false; |
435 | } |
436 | |
437 | /* If either thread is inactive (in the termination path), do not propagate auto-join */ |
438 | if ((!cthread->active) || (!thread->active)) { |
439 | return false; |
440 | } |
441 | |
442 | return true; |
443 | } |
444 | |
445 | /* |
446 | * work_interval_auto_join_propagate() |
447 | * |
448 | * Routine to auto-join a thread into another thread's work interval |
449 | * |
450 | * Should only be invoked if work_interval_should_propagate() returns |
451 | * true. Also expects "from" thread to be current thread and "to" thread |
452 | * to be locked. |
453 | */ |
454 | void |
455 | work_interval_auto_join_propagate(thread_t from, thread_t to) |
456 | { |
457 | assert(from == current_thread()); |
	work_interval_retain(from->th_work_interval);
	work_interval_auto_join_increment(from->th_work_interval);
460 | __assert_only kern_return_t kr = thread_set_work_interval(to, from->th_work_interval, |
461 | THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH); |
462 | assert(kr == KERN_SUCCESS); |
463 | } |
464 | |
465 | /* |
466 | * work_interval_auto_join_unwind() |
467 | * |
468 | * Routine to un-join an auto-joined work interval for a thread that is blocking. |
469 | * |
470 | * Expects thread to be locked. |
471 | */ |
472 | void |
473 | work_interval_auto_join_unwind(thread_t thread) |
474 | { |
475 | __assert_only kern_return_t kr = thread_set_work_interval(thread, NULL, |
476 | THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH); |
477 | assert(kr == KERN_SUCCESS); |
478 | } |
479 | |
480 | /* |
481 | * work_interval_auto_join_demote() |
482 | * |
483 | * Routine to un-join an auto-joined work interval when a thread is changing from |
484 | * realtime to non-realtime scheduling mode. This could happen due to multiple |
485 | * reasons such as RT failsafe, thread backgrounding or thread termination. Also, |
486 | * the thread being demoted may not be the current thread. |
487 | * |
488 | * Expects thread to be locked. |
489 | */ |
490 | void |
491 | work_interval_auto_join_demote(thread_t thread) |
492 | { |
493 | __assert_only kern_return_t kr = thread_set_work_interval(thread, NULL, |
494 | THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD); |
495 | assert(kr == KERN_SUCCESS); |
496 | } |
497 | |
498 | static void |
499 | work_interval_deallocate_queue_invoke(mpsc_queue_chain_t e, |
500 | __assert_only mpsc_daemon_queue_t dq) |
501 | { |
502 | struct work_interval *work_interval = NULL; |
503 | work_interval = mpsc_queue_element(e, struct work_interval, wi_deallocate_link); |
504 | assert(dq == &work_interval_deallocate_queue); |
505 | assert(os_ref_get_count(&work_interval->wi_ref_count) == 0); |
506 | work_interval_deallocate(work_interval); |
507 | } |
508 | |
509 | #endif /* CONFIG_SCHED_AUTO_JOIN */ |
510 | |
511 | #if CONFIG_SCHED_AUTO_JOIN |
512 | __startup_func |
513 | static void |
514 | work_interval_subsystem_init(void) |
515 | { |
516 | /* |
517 | * The work interval deallocation queue must be a thread call based queue |
518 | * because it is woken up from contexts where the thread lock is held. The |
519 | * only way to perform wakeups safely in those contexts is to wakeup a |
520 | * thread call which is guaranteed to be on a different waitq and would |
521 | * not hash onto the same global waitq which might be currently locked. |
522 | */ |
	mpsc_daemon_queue_init_with_thread_call(&work_interval_deallocate_queue,
	    work_interval_deallocate_queue_invoke, THREAD_CALL_PRIORITY_KERNEL,
	    MPSC_DAEMON_INIT_NONE);
526 | } |
527 | STARTUP(THREAD_CALL, STARTUP_RANK_MIDDLE, work_interval_subsystem_init); |
528 | #endif /* CONFIG_SCHED_AUTO_JOIN */ |
529 | |
530 | /* |
531 | * work_interval_port_convert |
532 | * |
533 | * Called with port locked, returns reference to work interval |
534 | * if indeed the port is a work interval kobject port |
535 | */ |
536 | static struct work_interval * |
537 | work_interval_port_convert_locked(ipc_port_t port) |
538 | { |
539 | struct work_interval *work_interval = NULL; |
540 | |
541 | if (IP_VALID(port)) { |
		work_interval = ipc_kobject_get_stable(port, IKOT_WORK_INTERVAL);
543 | if (work_interval) { |
544 | work_interval_retain(work_interval); |
545 | } |
546 | } |
547 | |
548 | return work_interval; |
549 | } |
550 | |
551 | /* |
552 | * port_name_to_work_interval |
553 | * |
554 | * Description: Obtain a reference to the work_interval associated with a given port. |
555 | * |
556 | * Parameters: name A Mach port name to translate. |
557 | * |
 * Returns: KERN_SUCCESS and a +1 reference in *work_interval if the port
 * denotes a work_interval; otherwise an error, in which case
 * *work_interval is left untouched.
560 | */ |
561 | static kern_return_t |
562 | port_name_to_work_interval(mach_port_name_t name, |
563 | struct work_interval **work_interval) |
564 | { |
565 | if (!MACH_PORT_VALID(name)) { |
566 | return KERN_INVALID_NAME; |
567 | } |
568 | |
569 | ipc_port_t port = IP_NULL; |
570 | kern_return_t kr = KERN_SUCCESS; |
571 | |
	kr = ipc_port_translate_send(current_space(), name, &port);
573 | if (kr != KERN_SUCCESS) { |
574 | return kr; |
575 | } |
576 | /* port is locked */ |
577 | |
578 | assert(IP_VALID(port)); |
579 | |
580 | struct work_interval *converted_work_interval; |
581 | |
582 | converted_work_interval = work_interval_port_convert_locked(port); |
583 | |
584 | /* the port is valid, but doesn't denote a work_interval */ |
585 | if (converted_work_interval == NULL) { |
586 | kr = KERN_INVALID_CAPABILITY; |
587 | } |
588 | |
589 | ip_mq_unlock(port); |
590 | |
591 | if (kr == KERN_SUCCESS) { |
592 | *work_interval = converted_work_interval; |
593 | } |
594 | |
595 | return kr; |
596 | } |
597 | |
598 | |
599 | /* |
600 | * work_interval_port_no_senders |
601 | * |
602 | * Description: Handle a no-senders notification for a work interval port. |
603 | * Destroys the port and releases its reference on the work interval. |
604 | * |
 * Parameters: port     The work interval kobject port.
 *             mscount  The make-send count from the no-senders notification.
 *
 * Note: This assumes that there is only one create-right-from-work-interval point;
 * if the ability to extract another send right after creation is added,
 * this will have to change to handle make-send counts correctly.
610 | */ |
611 | static void |
612 | work_interval_port_no_senders(ipc_port_t port, mach_port_mscount_t mscount) |
613 | { |
614 | struct work_interval *work_interval = NULL; |
615 | |
616 | work_interval = ipc_kobject_dealloc_port(port, mscount, |
	    IKOT_WORK_INTERVAL);
618 | |
619 | work_interval->wi_port = MACH_PORT_NULL; |
620 | |
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
622 | } |
623 | |
624 | /* |
625 | * work_interval_port_type() |
626 | * |
627 | * Converts a port name into the work interval object and returns its type. |
628 | * |
629 | * For invalid ports, it returns WORK_INTERVAL_TYPE_LAST (which is not a |
630 | * valid type for work intervals). |
631 | */ |
632 | static uint32_t |
633 | work_interval_port_type(mach_port_name_t port_name) |
634 | { |
635 | struct work_interval *work_interval = NULL; |
636 | kern_return_t kr; |
637 | uint32_t work_interval_type; |
638 | |
639 | if (port_name == MACH_PORT_NULL) { |
640 | return WORK_INTERVAL_TYPE_LAST; |
641 | } |
642 | |
	kr = port_name_to_work_interval(port_name, &work_interval);
644 | if (kr != KERN_SUCCESS) { |
645 | return WORK_INTERVAL_TYPE_LAST; |
646 | } |
647 | /* work_interval has a +1 ref */ |
648 | |
649 | assert(work_interval != NULL); |
650 | work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK; |
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
652 | return work_interval_type; |
653 | } |
654 | |
655 | /* |
656 | * Sparse - not all work interval classes imply a scheduling policy change. |
657 | * The REALTIME_CRITICAL class *also* requires the thread to have explicitly |
658 | * adopted the REALTIME sched mode to take effect. |
659 | */ |
660 | static const struct { |
661 | int priority; |
662 | sched_mode_t sched_mode; |
663 | } work_interval_class_data[WI_CLASS_COUNT] = { |
664 | [WI_CLASS_BEST_EFFORT] = { |
665 | BASEPRI_DEFAULT, // 31 |
666 | TH_MODE_TIMESHARE, |
667 | }, |
668 | |
669 | [WI_CLASS_APP_SUPPORT] = { |
670 | BASEPRI_DEFAULT, // 31 |
671 | TH_MODE_TIMESHARE, |
672 | }, |
673 | |
674 | [WI_CLASS_SYSTEM] = { |
675 | BASEPRI_FOREGROUND + 1, // 48 |
676 | TH_MODE_FIXED, |
677 | }, |
678 | |
679 | [WI_CLASS_SYSTEM_CRITICAL] = { |
680 | MAXPRI_USER + 1, // 64 |
681 | TH_MODE_FIXED, |
682 | }, |
683 | |
684 | [WI_CLASS_REALTIME_CRITICAL] = { |
685 | BASEPRI_RTQUEUES + 1, // 98 |
686 | TH_MODE_REALTIME, |
687 | }, |
688 | }; |
689 | |
690 | /* |
691 | * Called when a thread gets its scheduling priority from its associated work |
692 | * interval. |
693 | */ |
694 | int |
695 | work_interval_get_priority(thread_t thread) |
696 | { |
697 | const struct work_interval *work_interval = thread->th_work_interval; |
698 | assert(work_interval != NULL); |
699 | |
700 | assert3u(work_interval->wi_class, !=, WI_CLASS_NONE); |
701 | assert3u(work_interval->wi_class, <, WI_CLASS_COUNT); |
702 | int priority = work_interval_class_data[work_interval->wi_class].priority; |
703 | assert(priority != 0); |
704 | |
705 | priority += work_interval->wi_class_offset; |
706 | assert3u(priority, <=, MAXPRI); |
707 | |
708 | return priority; |
709 | } |
710 | |
711 | #if CONFIG_THREAD_GROUPS |
712 | extern kern_return_t |
713 | kern_work_interval_get_policy_from_port(mach_port_name_t port_name, |
714 | integer_t *policy, |
715 | integer_t *priority, |
716 | struct thread_group **tg) |
717 | { |
718 | assert((priority != NULL) && (policy != NULL) && (tg != NULL)); |
719 | |
720 | kern_return_t kr; |
721 | struct work_interval *work_interval; |
722 | |
	kr = port_name_to_work_interval(port_name, &work_interval);
724 | if (kr != KERN_SUCCESS) { |
725 | return kr; |
726 | } |
727 | |
728 | /* work_interval has a +1 ref */ |
729 | assert(work_interval != NULL); |
730 | assert3u(work_interval->wi_class, <, WI_CLASS_COUNT); |
731 | |
732 | const sched_mode_t mode = work_interval_class_data[work_interval->wi_class].sched_mode; |
733 | |
734 | if ((mode == TH_MODE_TIMESHARE) || (mode == TH_MODE_FIXED)) { |
735 | *policy = ((mode == TH_MODE_TIMESHARE)? POLICY_TIMESHARE: POLICY_RR); |
736 | *priority = work_interval_class_data[work_interval->wi_class].priority; |
737 | assert(*priority != 0); |
738 | *priority += work_interval->wi_class_offset; |
739 | assert3u(*priority, <=, MAXPRI); |
740 | } /* No sched mode change for REALTIME (threads must explicitly opt-in) */ |
741 | |
742 | if (work_interval->wi_group) { |
		*tg = thread_group_retain(work_interval->wi_group);
744 | } |
745 | |
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
747 | return KERN_SUCCESS; |
748 | } |
749 | #endif /* CONFIG_THREAD_GROUPS */ |
750 | |
751 | /* |
752 | * Switch to a policy driven by the work interval (if applicable). |
753 | */ |
754 | static void |
755 | work_interval_set_policy(thread_t thread) |
756 | { |
757 | assert3p(thread, ==, current_thread()); |
758 | |
759 | /* |
760 | * Ignore policy changes if the workload context shouldn't affect the |
761 | * scheduling policy. |
762 | */ |
763 | workload_config_flags_t flags = WLC_F_NONE; |
764 | |
765 | /* There may be no config at all. That's ok. */ |
	if (workload_config_get_flags(&flags) != KERN_SUCCESS ||
767 | (flags & WLC_F_THREAD_POLICY) == 0) { |
768 | return; |
769 | } |
770 | |
771 | const struct work_interval *work_interval = thread->th_work_interval; |
772 | assert(work_interval != NULL); |
773 | |
774 | assert3u(work_interval->wi_class, <, WI_CLASS_COUNT); |
775 | const sched_mode_t mode = work_interval_class_data[work_interval->wi_class].sched_mode; |
776 | |
777 | /* |
778 | * A mode of TH_MODE_NONE implies that this work interval has no |
779 | * associated scheduler effects. |
780 | */ |
781 | if (mode == TH_MODE_NONE) { |
782 | return; |
783 | } |
784 | |
785 | proc_set_thread_policy_ext(thread, TASK_POLICY_ATTRIBUTE, |
	    TASK_POLICY_WI_DRIVEN, true, mode);
787 | assert(thread->requested_policy.thrp_wi_driven); |
788 | |
789 | return; |
790 | } |
791 | |
792 | /* |
793 | * Clear a work interval driven policy. |
794 | */ |
795 | static void |
796 | work_interval_clear_policy(thread_t thread) |
797 | { |
798 | assert3p(thread, ==, current_thread()); |
799 | |
800 | if (!thread->requested_policy.thrp_wi_driven) { |
801 | return; |
802 | } |
803 | |
804 | const sched_mode_t mode = sched_get_thread_mode_user(thread); |
805 | |
806 | proc_set_thread_policy_ext(thread, TASK_POLICY_ATTRIBUTE, |
807 | TASK_POLICY_WI_DRIVEN, false, |
	    mode == TH_MODE_REALTIME ? mode : TH_MODE_TIMESHARE);
809 | |
810 | assert(!thread->requested_policy.thrp_wi_driven); |
811 | |
812 | return; |
813 | } |
814 | |
815 | /* |
816 | * thread_set_work_interval() |
817 | * |
818 | * Change thread's bound work interval to the passed-in work interval |
819 | * Consumes +1 ref on work_interval upon success. |
820 | * |
821 | * May also pass NULL to un-set work_interval on the thread |
822 | * Will deallocate any old work interval on the thread |
823 | * Return error if thread does not satisfy requirements to join work interval |
824 | * |
825 | * For non auto-join work intervals, deallocate any old work interval on the thread |
826 | * For auto-join work intervals, the routine may wakeup the work interval deferred |
827 | * deallocation queue since thread locks might be currently held. |
828 | */ |
829 | static kern_return_t |
830 | thread_set_work_interval(thread_t thread, |
831 | struct work_interval *work_interval, thread_work_interval_options_t options) |
832 | { |
833 | /* All explicit work interval operations should always be from the current thread */ |
834 | if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) { |
835 | assert(thread == current_thread()); |
836 | } |
837 | |
838 | /* All cases of needing the thread lock should be from explicit join scenarios */ |
839 | if (options & THREAD_WI_THREAD_LOCK_NEEDED) { |
840 | assert((options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0); |
841 | } |
842 | |
843 | /* For all cases of auto join must come in with the thread lock held */ |
844 | if (options & THREAD_WI_AUTO_JOIN_POLICY) { |
845 | assert((options & THREAD_WI_THREAD_LOCK_HELD) != 0); |
846 | } |
847 | |
848 | #if CONFIG_THREAD_GROUPS |
849 | if (work_interval && !work_interval->wi_group) { |
850 | /* Reject join on work intervals with deferred thread group creation */ |
851 | return KERN_INVALID_ARGUMENT; |
852 | } |
853 | #endif /* CONFIG_THREAD_GROUPS */ |
854 | |
855 | if (work_interval) { |
856 | uint32_t work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK; |
857 | |
858 | if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) { |
859 | /* Ensure no kern_work_interval_set_workload_id can happen after this point */ |
860 | uint32_t wlid_flags; |
861 | (void)os_atomic_cmpxchgv(&work_interval->wi_wlid_flags, 0, |
862 | WORK_INTERVAL_WORKLOAD_ID_ALREADY_JOINED, &wlid_flags, relaxed); |
863 | if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED) { |
864 | /* For workload IDs with rt-allowed, neuter the check below to |
865 | * enable joining before the thread has become realtime for all |
866 | * work interval types */ |
867 | work_interval_type = WORK_INTERVAL_TYPE_DEFAULT; |
868 | } |
869 | } |
870 | |
871 | if ((work_interval_type == WORK_INTERVAL_TYPE_COREAUDIO) && |
872 | (thread->sched_mode != TH_MODE_REALTIME) && (thread->saved_mode != TH_MODE_REALTIME)) { |
873 | return KERN_INVALID_ARGUMENT; |
874 | } |
875 | } |
876 | |
877 | /* |
878 | * Ensure a work interval scheduling policy is not used if the thread is |
879 | * leaving the work interval. |
880 | */ |
881 | if (work_interval == NULL && |
882 | (options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0) { |
883 | work_interval_clear_policy(thread); |
884 | } |
885 | |
886 | struct work_interval *old_th_wi = thread->th_work_interval; |
887 | #if CONFIG_SCHED_AUTO_JOIN |
888 | spl_t s; |
889 | /* Take the thread lock if needed */ |
890 | if (options & THREAD_WI_THREAD_LOCK_NEEDED) { |
891 | s = splsched(); |
892 | thread_lock(thread); |
893 | } |
894 | |
895 | /* |
896 | * Work interval auto-join leak to non-RT threads. |
897 | * |
	 * If the thread might be running on a remote core and it's not in the context switch path (where
	 * the thread is neither running, blocked, nor in the runq), it's not possible to update the
	 * work interval & thread group remotely, since it's not possible to update CLPC for a remote
901 | * core. This situation might happen when a thread is transitioning from realtime to |
902 | * non-realtime due to backgrounding etc., which would mean that non-RT threads would now |
903 | * be part of the work interval. |
904 | * |
905 | * Since there is no immediate mitigation to this issue, the policy is to set a new |
906 | * flag on the thread which indicates that such a "leak" has happened. This flag will |
907 | * be cleared when the remote thread eventually blocks and unjoins from the work interval. |
908 | */ |
909 | bool thread_on_remote_core = ((thread != current_thread()) && (thread->state & TH_RUN) && (thread_get_runq(thread) == PROCESSOR_NULL)); |
910 | |
911 | if (thread_on_remote_core && ((options & THREAD_WI_THREAD_CTX_SWITCH) == 0)) { |
912 | assert((options & THREAD_WI_THREAD_LOCK_NEEDED) == 0); |
913 | os_atomic_or(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed); |
914 | return KERN_SUCCESS; |
915 | } |
916 | |
917 | const bool old_wi_auto_joined = ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0); |
918 | |
919 | if ((options & THREAD_WI_AUTO_JOIN_POLICY) || old_wi_auto_joined) { |
		__kdebug_only uint64_t old_tg_id = (old_th_wi && old_th_wi->wi_group) ? thread_group_get_id(old_th_wi->wi_group) : ~0;
		__kdebug_only uint64_t new_tg_id = (work_interval && work_interval->wi_group) ? thread_group_get_id(work_interval->wi_group) : ~0;
922 | KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_AUTO_JOIN), |
923 | thread_tid(thread), old_tg_id, new_tg_id, options); |
924 | } |
925 | |
926 | if (old_wi_auto_joined) { |
927 | /* |
		 * If the thread was auto-joined to a work interval and is not realtime, make sure it
929 | * happened due to the "leak" described above. |
930 | */ |
931 | if (thread->sched_mode != TH_MODE_REALTIME) { |
932 | assert((thread->th_work_interval_flags & TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK) != 0); |
933 | } |
934 | |
935 | os_atomic_andnot(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed); |
		work_interval_auto_join_decrement(old_th_wi, thread);
937 | thread->sched_flags &= ~TH_SFLAG_THREAD_GROUP_AUTO_JOIN; |
938 | } |
939 | |
940 | #endif /* CONFIG_SCHED_AUTO_JOIN */ |
941 | |
942 | KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CHANGE), |
943 | thread_tid(thread), (old_th_wi ? old_th_wi->wi_id : 0), (work_interval ? work_interval->wi_id : 0), !!(options & THREAD_WI_AUTO_JOIN_POLICY)); |
944 | |
945 | /* transfer +1 ref to thread */ |
946 | thread->th_work_interval = work_interval; |
947 | |
948 | #if CONFIG_SCHED_AUTO_JOIN |
949 | |
950 | if ((options & THREAD_WI_AUTO_JOIN_POLICY) && work_interval) { |
951 | assert(work_interval_auto_join_enabled(work_interval) == true); |
952 | thread->sched_flags |= TH_SFLAG_THREAD_GROUP_AUTO_JOIN; |
953 | } |
954 | |
955 | if (options & THREAD_WI_THREAD_LOCK_NEEDED) { |
956 | thread_unlock(thread); |
957 | splx(s); |
958 | } |
959 | #endif /* CONFIG_SCHED_AUTO_JOIN */ |
960 | |
961 | /* |
962 | * The thread got a new work interval. It may come with a work interval |
963 | * scheduling policy that needs to be applied. |
964 | */ |
965 | if (work_interval != NULL && |
966 | (options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0) { |
967 | work_interval_set_policy(thread); |
968 | } |
969 | |
970 | #if CONFIG_THREAD_GROUPS |
971 | if (work_interval) { |
		/* Prevent thread_group_set_name() from taking effect after CLPC may
		 * have already heard about the thread group */
974 | (void)os_atomic_cmpxchg(&work_interval->wi_group_flags, 0, |
975 | WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED, relaxed); |
976 | } |
977 | struct thread_group *new_tg = (work_interval) ? (work_interval->wi_group) : NULL; |
978 | |
979 | if (options & THREAD_WI_AUTO_JOIN_POLICY) { |
980 | #if CONFIG_SCHED_AUTO_JOIN |
		thread_set_autojoin_thread_group_locked(thread, new_tg);
982 | #endif |
983 | } else { |
		thread_set_work_interval_thread_group(thread, new_tg);
985 | } |
986 | #endif /* CONFIG_THREAD_GROUPS */ |
987 | |
988 | if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) { |
989 | /* Construct mask to XOR with th_work_interval_flags to clear the |
990 | * currently present flags and set the new flags in wlid_flags. */ |
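		/* For example, if the thread currently has only
		 * TH_WORK_INTERVAL_FLAGS_HAS_WORKLOAD_ID set and the new interval's
		 * wlid_flags carry both HAS_ID and RT_ALLOWED, the mask below reduces
		 * to TH_WORK_INTERVAL_FLAGS_RT_ALLOWED, so the XOR leaves
		 * HAS_WORKLOAD_ID set and additionally sets RT_ALLOWED. */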
991 | uint32_t wlid_flags = 0; |
992 | if (work_interval) { |
993 | wlid_flags = os_atomic_load(&work_interval->wi_wlid_flags, relaxed); |
994 | } |
995 | thread_work_interval_flags_t th_wi_xor_mask = os_atomic_load( |
996 | &thread->th_work_interval_flags, relaxed); |
997 | th_wi_xor_mask &= (TH_WORK_INTERVAL_FLAGS_HAS_WORKLOAD_ID | |
998 | TH_WORK_INTERVAL_FLAGS_RT_ALLOWED); |
999 | if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_HAS_ID) { |
1000 | th_wi_xor_mask ^= TH_WORK_INTERVAL_FLAGS_HAS_WORKLOAD_ID; |
1001 | if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED) { |
1002 | th_wi_xor_mask ^= TH_WORK_INTERVAL_FLAGS_RT_ALLOWED; |
1003 | } |
1004 | } |
1005 | if (th_wi_xor_mask) { |
1006 | os_atomic_xor(&thread->th_work_interval_flags, th_wi_xor_mask, relaxed); |
1007 | } |
1008 | |
1009 | /* |
1010 | * Now that the interval flags have been set, re-evaluate |
1011 | * whether the thread needs to be undemoted - the new work |
		 * interval may have the RT_ALLOWED flag, and the thread may
		 * have a realtime policy but be demoted.
1014 | */ |
1015 | thread_rt_evaluate(thread); |
1016 | } |
1017 | |
1018 | if (old_th_wi != NULL) { |
		work_interval_release(old_th_wi, options);
1020 | } |
1021 | |
1022 | return KERN_SUCCESS; |
1023 | } |
1024 | |
1025 | static kern_return_t |
1026 | thread_set_work_interval_explicit_join(thread_t thread, struct work_interval *work_interval) |
1027 | { |
1028 | assert(thread == current_thread()); |
	return thread_set_work_interval(thread, work_interval, THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
1030 | } |
1031 | |
1032 | kern_return_t |
1033 | work_interval_thread_terminate(thread_t thread) |
1034 | { |
1035 | assert(thread == current_thread()); |
1036 | if (thread->th_work_interval != NULL) { |
		return thread_set_work_interval(thread, NULL, THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
1038 | } |
1039 | return KERN_SUCCESS; |
1040 | } |
1041 | |
1042 | kern_return_t |
1043 | kern_work_interval_notify(thread_t thread, struct kern_work_interval_args* kwi_args) |
1044 | { |
1045 | assert(thread == current_thread()); |
1046 | assert(kwi_args->work_interval_id != 0); |
1047 | |
1048 | struct work_interval *work_interval = thread->th_work_interval; |
1049 | |
1050 | if (work_interval == NULL || |
1051 | work_interval->wi_id != kwi_args->work_interval_id) { |
1052 | /* This thread must have adopted the work interval to be able to notify */ |
1053 | return KERN_INVALID_ARGUMENT; |
1054 | } |
1055 | |
1056 | task_t notifying_task = current_task(); |
1057 | |
	if (work_interval->wi_creator_uniqueid != get_task_uniqueid(notifying_task) ||
	    work_interval->wi_creator_pidversion != get_task_version(notifying_task)) {
1060 | /* Only the creating task can do a notify */ |
1061 | return KERN_INVALID_ARGUMENT; |
1062 | } |
1063 | |
1064 | spl_t s = splsched(); |
1065 | |
1066 | #if CONFIG_THREAD_GROUPS |
1067 | assert(work_interval->wi_group == thread->thread_group); |
1068 | #endif /* CONFIG_THREAD_GROUPS */ |
1069 | |
1070 | uint64_t urgency_param1, urgency_param2; |
	kwi_args->urgency = (uint16_t)thread_get_urgency(thread, &urgency_param1, &urgency_param2);
1072 | |
1073 | splx(s); |
1074 | |
1075 | /* called without interrupts disabled */ |
1076 | machine_work_interval_notify(thread, kwi_args); |
1077 | |
1078 | return KERN_SUCCESS; |
1079 | } |
1080 | |
1081 | /* Start at 1, 0 is not a valid work interval ID */ |
1082 | static _Atomic uint64_t unique_work_interval_id = 1; |
1083 | |
1084 | kern_return_t |
1085 | kern_work_interval_create(thread_t thread, |
1086 | struct kern_work_interval_create_args *create_params) |
1087 | { |
1088 | assert(thread == current_thread()); |
1089 | |
1090 | uint32_t create_flags = create_params->wica_create_flags; |
1091 | |
1092 | if (((create_flags & WORK_INTERVAL_FLAG_JOINABLE) == 0) && |
1093 | thread->th_work_interval != NULL) { |
1094 | /* |
1095 | * If the thread is doing a legacy combined create and join, |
1096 | * it shouldn't already be part of a work interval. |
1097 | * |
1098 | * (Creating a joinable WI is allowed anytime.) |
1099 | */ |
1100 | return KERN_FAILURE; |
1101 | } |
1102 | |
1103 | /* |
1104 | * Check the validity of the create flags before allocating the work |
1105 | * interval. |
1106 | */ |
1107 | task_t creating_task = current_task(); |
1108 | if ((create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_CA_CLIENT) { |
1109 | /* |
1110 | * CA_CLIENT work intervals do not create new thread groups. |
1111 | * There can only be one CA_CLIENT work interval (created by UIKit or AppKit) |
		 * per application task.
1113 | */ |
1114 | if (create_flags & WORK_INTERVAL_FLAG_GROUP) { |
1115 | return KERN_FAILURE; |
1116 | } |
		if (!task_is_app(creating_task)) {
1118 | #if XNU_TARGET_OS_OSX |
1119 | /* |
1120 | * Soft-fail the case of a non-app pretending to be an |
1121 | * app, by allowing it to press the buttons, but they're |
1122 | * not actually connected to anything. |
1123 | */ |
1124 | create_flags |= WORK_INTERVAL_FLAG_IGNORED; |
1125 | #else |
1126 | /* |
1127 | * On iOS, it's a hard failure to get your apptype |
1128 | * wrong and then try to render something. |
1129 | */ |
1130 | return KERN_NOT_SUPPORTED; |
1131 | #endif /* XNU_TARGET_OS_OSX */ |
1132 | } |
		if (task_set_ca_client_wi(creating_task, true) == false) {
1134 | return KERN_FAILURE; |
1135 | } |
1136 | } |
1137 | |
1138 | #if CONFIG_SCHED_AUTO_JOIN |
1139 | if (create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) { |
1140 | uint32_t type = (create_flags & WORK_INTERVAL_TYPE_MASK); |
1141 | if (type != WORK_INTERVAL_TYPE_COREAUDIO) { |
1142 | return KERN_NOT_SUPPORTED; |
1143 | } |
1144 | if ((create_flags & WORK_INTERVAL_FLAG_GROUP) == 0) { |
1145 | return KERN_NOT_SUPPORTED; |
1146 | } |
1147 | } |
1148 | |
1149 | if (create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) { |
1150 | if ((create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) == 0) { |
1151 | return KERN_NOT_SUPPORTED; |
1152 | } |
1153 | } |
1154 | #endif /* CONFIG_SCHED_AUTO_JOIN */ |
1155 | |
1156 | struct work_interval *work_interval = kalloc_type(struct work_interval, |
1157 | Z_WAITOK | Z_ZERO | Z_NOFAIL); |
1158 | |
1159 | uint64_t work_interval_id = os_atomic_inc(&unique_work_interval_id, relaxed); |
1160 | |
1161 | *work_interval = (struct work_interval) { |
1162 | .wi_id = work_interval_id, |
1163 | .wi_ref_count = {}, |
1164 | .wi_create_flags = create_flags, |
		.wi_creator_pid = pid_from_task(creating_task),
		.wi_creator_uniqueid = get_task_uniqueid(creating_task),
		.wi_creator_pidversion = get_task_version(creating_task),
1168 | }; |
1169 | os_ref_init(&work_interval->wi_ref_count, NULL); |
1170 | |
1171 | if (work_interval_telemetry_data_enabled(work_interval)) { |
		recount_work_interval_init(&work_interval->wi_recount);
1173 | } |
1174 | |
1175 | __kdebug_only uint64_t tg_id = 0; |
1176 | #if CONFIG_THREAD_GROUPS |
1177 | struct thread_group *tg; |
1178 | if ((create_flags & |
1179 | (WORK_INTERVAL_FLAG_GROUP | WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) == |
1180 | (WORK_INTERVAL_FLAG_GROUP | WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) { |
1181 | /* defer creation of the thread group until the |
1182 | * kern_work_interval_set_workload_id() call */ |
1183 | work_interval->wi_group = NULL; |
1184 | } else if (create_flags & WORK_INTERVAL_FLAG_GROUP) { |
1185 | /* create a new group for the interval to represent */ |
		char name[THREAD_GROUP_MAXNAME] = "";
1187 | |
		snprintf(name, sizeof(name), "WI%lld (pid %d)", work_interval_id,
1189 | work_interval->wi_creator_pid); |
1190 | |
1191 | tg = thread_group_create_and_retain(THREAD_GROUP_FLAGS_DEFAULT); |
1192 | |
1193 | thread_group_set_name(tg, name); |
1194 | |
1195 | work_interval->wi_group = tg; |
1196 | } else { |
1197 | /* the interval represents the thread's home group */ |
		tg = thread_group_get_home_group(thread);
1199 | |
1200 | thread_group_retain(tg); |
1201 | |
1202 | work_interval->wi_group = tg; |
1203 | } |
1204 | |
1205 | /* Capture the tg_id for tracing purposes */ |
	tg_id = work_interval->wi_group ? thread_group_get_id(work_interval->wi_group) : ~0;
1207 | |
1208 | #endif /* CONFIG_THREAD_GROUPS */ |
1209 | |
1210 | if (create_flags & WORK_INTERVAL_FLAG_JOINABLE) { |
1211 | mach_port_name_t name = MACH_PORT_NULL; |
1212 | |
1213 | /* work_interval has a +1 ref, moves to the port */ |
		work_interval->wi_port = ipc_kobject_alloc_port(
		    (ipc_kobject_t)work_interval, IKOT_WORK_INTERVAL,
		    IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST);
1217 | |
1218 | |
		name = ipc_port_copyout_send(work_interval->wi_port, current_space());
1220 | |
1221 | if (!MACH_PORT_VALID(name)) { |
1222 | /* |
1223 | * copyout failed (port is already deallocated) |
1224 | * Because of the port-destroyed magic, |
1225 | * the work interval is already deallocated too. |
1226 | */ |
1227 | return KERN_RESOURCE_SHORTAGE; |
1228 | } |
1229 | |
1230 | create_params->wica_port = name; |
1231 | } else { |
1232 | /* work_interval has a +1 ref, moves to the thread */ |
1233 | kern_return_t kr = thread_set_work_interval_explicit_join(thread, work_interval); |
1234 | if (kr != KERN_SUCCESS) { |
1235 | /* No other thread can join this work interval since it isn't |
1236 | * JOINABLE so release the reference on work interval */ |
			work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1238 | return kr; |
1239 | } |
1240 | |
1241 | create_params->wica_port = MACH_PORT_NULL; |
1242 | } |
1243 | |
1244 | create_params->wica_id = work_interval_id; |
1245 | |
1246 | if (tg_id != ~0) { |
1247 | KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CREATE), |
1248 | work_interval_id, create_flags, pid_from_task(creating_task), tg_id); |
1249 | } |
1250 | return KERN_SUCCESS; |
1251 | } |
1252 | |
1253 | kern_return_t |
1254 | kern_work_interval_get_flags_from_port(mach_port_name_t port_name, uint32_t *flags) |
1255 | { |
1256 | assert(flags != NULL); |
1257 | |
1258 | kern_return_t kr; |
1259 | struct work_interval *work_interval; |
1260 | |
	kr = port_name_to_work_interval(port_name, &work_interval);
1262 | if (kr != KERN_SUCCESS) { |
1263 | return kr; |
1264 | } |
1265 | |
1266 | assert(work_interval != NULL); |
1267 | *flags = work_interval->wi_create_flags; |
1268 | |
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1270 | |
1271 | return KERN_SUCCESS; |
1272 | } |
1273 | |
1274 | #if CONFIG_THREAD_GROUPS |
1275 | _Static_assert(WORK_INTERVAL_NAME_MAX == THREAD_GROUP_MAXNAME, |
1276 | "WORK_INTERVAL_NAME_MAX does not match THREAD_GROUP_MAXNAME" ); |
1277 | #endif /* CONFIG_THREAD_GROUPS */ |
1278 | |
1279 | kern_return_t |
1280 | kern_work_interval_set_name(mach_port_name_t port_name, __unused char *name, |
1281 | size_t len) |
1282 | { |
1283 | kern_return_t kr; |
1284 | struct work_interval *work_interval; |
1285 | |
1286 | if (len > WORK_INTERVAL_NAME_MAX) { |
1287 | return KERN_INVALID_ARGUMENT; |
1288 | } |
	kr = port_name_to_work_interval(port_name, &work_interval);
1290 | if (kr != KERN_SUCCESS) { |
1291 | return kr; |
1292 | } |
1293 | |
1294 | assert(work_interval != NULL); |
1295 | |
1296 | #if CONFIG_THREAD_GROUPS |
1297 | uint32_t wi_group_flags = os_atomic_load( |
1298 | &work_interval->wi_group_flags, relaxed); |
1299 | if (wi_group_flags & WORK_INTERVAL_GROUP_FLAGS_THREAD_JOINED) { |
1300 | kr = KERN_INVALID_ARGUMENT; |
1301 | goto out; |
1302 | } |
1303 | if (!work_interval->wi_group) { |
1304 | kr = KERN_INVALID_ARGUMENT; |
1305 | goto out; |
1306 | } |
1307 | |
1308 | if (name[0] && (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP)) { |
1309 | char tgname[THREAD_GROUP_MAXNAME]; |
		snprintf(tgname, sizeof(tgname), "WI%lld %s", work_interval->wi_id,
		    name);
		thread_group_set_name(work_interval->wi_group, tgname);
1313 | } |
1314 | |
1315 | out: |
1316 | #endif /* CONFIG_THREAD_GROUPS */ |
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1318 | |
1319 | return kr; |
1320 | } |
1321 | |
1322 | kern_return_t |
1323 | kern_work_interval_set_workload_id(mach_port_name_t port_name, |
1324 | struct kern_work_interval_workload_id_args *workload_id_args) |
1325 | { |
1326 | kern_return_t kr; |
1327 | struct work_interval *work_interval; |
1328 | uint32_t wlida_flags = 0; |
1329 | uint32_t wlid_flags = 0; |
1330 | #if CONFIG_THREAD_GROUPS |
1331 | uint32_t tg_flags = 0; |
1332 | #endif |
1333 | bool from_workload_config = false; |
1334 | |
1335 | /* Ensure workload ID name is non-empty. */ |
1336 | if (!workload_id_args->wlida_name[0]) { |
1337 | return KERN_INVALID_ARGUMENT; |
1338 | } |
1339 | |
	kr = port_name_to_work_interval(port_name, &work_interval);
1341 | if (kr != KERN_SUCCESS) { |
1342 | return kr; |
1343 | } |
1344 | |
1345 | assert(work_interval != NULL); |
1346 | if (!(work_interval->wi_create_flags & WORK_INTERVAL_FLAG_JOINABLE)) { |
1347 | kr = KERN_INVALID_ARGUMENT; |
1348 | goto out; |
1349 | } |
1350 | |
1351 | if (!(work_interval->wi_create_flags & WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID)) { |
1352 | /* Reject work intervals that didn't indicate they will have a workload ID |
1353 | * at creation. In particular if the work interval has its own thread group, |
1354 | * its creation must have been deferred in kern_work_interval_create */ |
1355 | kr = KERN_INVALID_ARGUMENT; |
1356 | goto out; |
1357 | } |
1358 | |
1359 | workload_config_t wl_config = {}; |
	kr = workload_config_lookup_default(workload_id_args->wlida_name, &wl_config);
1361 | if (kr == KERN_SUCCESS) { |
1362 | if ((wl_config.wc_create_flags & WORK_INTERVAL_TYPE_MASK) != |
1363 | (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK)) { |
1364 | if ((wl_config.wc_create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_CA_RENDER_SERVER && |
1365 | (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_FRAME_COMPOSITOR) { |
1366 | /* WORK_INTERVAL_TYPE_FRAME_COMPOSITOR is a valid related type of WORK_INTERVAL_TYPE_CA_RENDER_SERVER */ |
1367 | } else { |
1368 | kr = KERN_INVALID_ARGUMENT; |
1369 | goto out; |
1370 | } |
1371 | } |
1372 | |
1373 | wlida_flags = wl_config.wc_flags; |
1374 | |
1375 | wlida_flags &= ~WORK_INTERVAL_WORKLOAD_ID_RT_CRITICAL; |
1376 | |
1377 | #if CONFIG_THREAD_GROUPS |
1378 | tg_flags = wl_config.wc_thread_group_flags; |
1379 | if (tg_flags != THREAD_GROUP_FLAGS_ABSENT && |
1380 | (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP) == 0) { |
1381 | kr = KERN_INVALID_ARGUMENT; |
1382 | goto out; |
1383 | } |
1384 | #endif /* CONFIG_THREAD_GROUPS */ |
1385 | |
1386 | from_workload_config = true; |
1387 | } else { |
1388 | /* If the workload is not present in the table, perform basic validation |
1389 | * that the create flags passed in match the ones used at work interval |
1390 | * create time */ |
1391 | if ((workload_id_args->wlida_wicreate_flags & WORK_INTERVAL_TYPE_MASK) != |
1392 | (work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK)) { |
1393 | kr = KERN_INVALID_ARGUMENT; |
1394 | goto out; |
1395 | } |
1396 | |
1397 | const bool wc_avail = workload_config_available(); |
1398 | if (!wc_avail) { |
1399 | wlida_flags = WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED; |
1400 | } |
1401 | |
1402 | /* |
1403 | * If the workload config wasn't even loaded then fallback to |
1404 | * older behaviour where the new thread group gets the default |
1405 | * thread group flags (when WORK_INTERVAL_FLAG_GROUP is set). |
1406 | */ |
1407 | #if CONFIG_THREAD_GROUPS |
1408 | if (!wc_avail) { |
1409 | tg_flags = THREAD_GROUP_FLAGS_DEFAULT; |
1410 | } else { |
1411 | struct thread_group *home_group = |
			    thread_group_get_home_group(current_thread());
1413 | if (home_group != NULL) { |
1414 | tg_flags = thread_group_get_flags(home_group); |
1415 | } |
1416 | } |
1417 | #endif /* CONFIG_THREAD_GROUPS */ |
1418 | } |
1419 | |
1420 | workload_id_args->wlida_wicreate_flags = work_interval->wi_create_flags; |
1421 | |
1422 | /* cmpxchg a non-zero workload ID flags value (indicating that workload ID |
1423 | * has been set). */ |
1424 | wlida_flags |= WORK_INTERVAL_WORKLOAD_ID_HAS_ID; |
1425 | if (os_atomic_cmpxchgv(&work_interval->wi_wlid_flags, 0, wlida_flags, |
1426 | &wlid_flags, relaxed)) { |
1427 | if (from_workload_config) { |
1428 | work_interval->wi_class = wl_config.wc_class; |
1429 | work_interval->wi_class_offset = wl_config.wc_class_offset; |
1430 | } |
1431 | #if CONFIG_THREAD_GROUPS |
1432 | if (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_GROUP) { |
1433 | /* Perform deferred thread group creation, now that tgflags are known */ |
1434 | struct thread_group *tg; |
			tg = thread_group_create_and_retain(tg_flags == THREAD_GROUP_FLAGS_ABSENT ?
1436 | THREAD_GROUP_FLAGS_DEFAULT : tg_flags); |
1437 | |
			char tgname[THREAD_GROUP_MAXNAME] = "";
			snprintf(tgname, sizeof(tgname), "WI%lld %s", work_interval->wi_id,
			    workload_id_args->wlida_name);
			thread_group_set_name(tg, tgname);
1442 | |
1443 | assert(work_interval->wi_group == NULL); |
1444 | work_interval->wi_group = tg; |
1445 | KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CREATE), |
1446 | work_interval->wi_id, work_interval->wi_create_flags, |
1447 | work_interval->wi_creator_pid, thread_group_get_id(tg)); |
1448 | } |
1449 | #endif /* CONFIG_THREAD_GROUPS */ |
1450 | } else { |
1451 | /* Workload ID has previously been set (or a thread has already joined). */ |
1452 | if (wlid_flags & WORK_INTERVAL_WORKLOAD_ID_ALREADY_JOINED) { |
1453 | kr = KERN_INVALID_ARGUMENT; |
1454 | goto out; |
1455 | } |
1456 | /* Treat this request as a query for the out parameters of the ID */ |
1457 | workload_id_args->wlida_flags = wlid_flags; |
1458 | } |
1459 | |
1460 | /* |
1461 | * Emit tracepoints for successfully setting the workload ID. |
1462 | * |
1463 | * After rdar://89342390 has been fixed and a new work interval ktrace |
1464 | * provider has been added, it will be possible to associate a numeric |
1465 | * ID with an ID name. Thus, for those cases where the ID name has been |
1466 | * looked up successfully (`from_workload_config` is true) it will no |
1467 | * longer be necessary to emit a tracepoint with the full ID name. |
1468 | */ |
1469 | KDBG(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_SET_WORKLOAD_ID), |
1470 | work_interval->wi_id, from_workload_config); |
1471 | kernel_debug_string_simple( |
1472 | MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_SET_WORKLOAD_ID_NAME), |
	    workload_id_args->wlida_name);
1474 | |
1475 | kr = KERN_SUCCESS; |
1476 | |
1477 | out: |
	work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1479 | |
1480 | return kr; |
1481 | } |
1482 | |
1483 | |
1484 | kern_return_t |
1485 | kern_work_interval_destroy(thread_t thread, uint64_t work_interval_id) |
1486 | { |
1487 | if (work_interval_id == 0) { |
1488 | return KERN_INVALID_ARGUMENT; |
1489 | } |
1490 | |
1491 | if (thread->th_work_interval == NULL || |
1492 | thread->th_work_interval->wi_id != work_interval_id) { |
1493 | /* work ID isn't valid or doesn't match joined work interval ID */ |
1494 | return KERN_INVALID_ARGUMENT; |
1495 | } |
1496 | |
1497 | return thread_set_work_interval_explicit_join(thread, NULL); |
1498 | } |
1499 | |
1500 | kern_return_t |
1501 | kern_work_interval_join(thread_t thread, |
1502 | mach_port_name_t port_name) |
1503 | { |
1504 | struct work_interval *work_interval = NULL; |
1505 | kern_return_t kr; |
1506 | |
1507 | if (port_name == MACH_PORT_NULL) { |
1508 | /* 'Un-join' the current work interval */ |
1509 | return thread_set_work_interval_explicit_join(thread, NULL); |
1510 | } |
1511 | |
	kr = port_name_to_work_interval(port_name, &work_interval);
1513 | if (kr != KERN_SUCCESS) { |
1514 | return kr; |
1515 | } |
1516 | /* work_interval has a +1 ref */ |
1517 | |
1518 | assert(work_interval != NULL); |
1519 | |
1520 | kr = thread_set_work_interval_explicit_join(thread, work_interval); |
1521 | /* ref was consumed by passing it to the thread in the successful case */ |
1522 | if (kr != KERN_SUCCESS) { |
		work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1524 | } |
1525 | return kr; |
1526 | } |
1527 | |
1528 | /* |
1529 | * work_interval_port_type_render_server() |
1530 | * |
1531 | * Helper routine to determine if the port points to a |
1532 | * WORK_INTERVAL_TYPE_CA_RENDER_SERVER work interval. |
1533 | */ |
1534 | bool |
1535 | work_interval_port_type_render_server(mach_port_name_t port_name) |
1536 | { |
1537 | return work_interval_port_type(port_name) == WORK_INTERVAL_TYPE_CA_RENDER_SERVER; |
1538 | } |
1539 | |