1/*
2 * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_FREE_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58/*
59 * File: kern/thread.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young, David Golub
61 * Date: 1986
62 *
63 * Thread management primitives implementation.
64 */
65/*
66 * Copyright (c) 1993 The University of Utah and
67 * the Computer Systems Laboratory (CSL). All rights reserved.
68 *
69 * Permission to use, copy, modify and distribute this software and its
70 * documentation is hereby granted, provided that both the copyright
71 * notice and this permission notice appear in all copies of the
72 * software, derivative works or modified versions, and any portions
73 * thereof, and that both notices appear in supporting documentation.
74 *
75 * THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF THIS SOFTWARE IN ITS "AS
76 * IS" CONDITION. THE UNIVERSITY OF UTAH AND CSL DISCLAIM ANY LIABILITY OF
77 * ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
78 *
79 * CSL requests users of this software to return to csl-dist@cs.utah.edu any
80 * improvements that they make and grant CSL redistribution rights.
81 *
82 */
83
84#include <mach/mach_types.h>
85#include <mach/boolean.h>
86#include <mach/policy.h>
87#include <mach/thread_info.h>
88#include <mach/thread_special_ports.h>
89#include <mach/thread_status.h>
90#include <mach/time_value.h>
91#include <mach/vm_param.h>
92
93#include <machine/thread.h>
94#include <machine/pal_routines.h>
95#include <machine/limits.h>
96
97#include <kern/kern_types.h>
98#include <kern/kalloc.h>
99#include <kern/cpu_data.h>
100#include <kern/counters.h>
101#include <kern/extmod_statistics.h>
102#include <kern/ipc_mig.h>
103#include <kern/ipc_tt.h>
104#include <kern/mach_param.h>
105#include <kern/machine.h>
106#include <kern/misc_protos.h>
107#include <kern/processor.h>
108#include <kern/queue.h>
109#include <kern/sched.h>
110#include <kern/sched_prim.h>
111#include <kern/sync_lock.h>
112#include <kern/syscall_subr.h>
113#include <kern/task.h>
114#include <kern/thread.h>
115#include <kern/thread_group.h>
116#include <kern/coalition.h>
117#include <kern/host.h>
118#include <kern/zalloc.h>
119#include <kern/assert.h>
120#include <kern/exc_resource.h>
121#include <kern/exc_guard.h>
122#include <kern/telemetry.h>
123#include <kern/policy_internal.h>
124#include <kern/turnstile.h>
125
126#include <corpses/task_corpse.h>
127#if KPC
128#include <kern/kpc.h>
129#endif
130
131#if MONOTONIC
132#include <kern/monotonic.h>
133#include <machine/monotonic.h>
134#endif /* MONOTONIC */
135
136#include <ipc/ipc_kmsg.h>
137#include <ipc/ipc_port.h>
138#include <bank/bank_types.h>
139
140#include <vm/vm_kern.h>
141#include <vm/vm_pageout.h>
142
143#include <sys/kdebug.h>
144#include <sys/bsdtask_info.h>
145#include <mach/sdt.h>
146#include <san/kasan.h>
147
148#include <stdatomic.h>
149
150/*
151 * Exported interfaces
152 */
153#include <mach/task_server.h>
154#include <mach/thread_act_server.h>
155#include <mach/mach_host_server.h>
156#include <mach/host_priv_server.h>
157#include <mach/mach_voucher_server.h>
158#include <kern/policy_internal.h>
159
160static struct zone *thread_zone;
161static lck_grp_attr_t thread_lck_grp_attr;
162lck_attr_t thread_lck_attr;
163lck_grp_t thread_lck_grp;
164
165struct zone *thread_qos_override_zone;
166
167decl_simple_lock_data(static,thread_stack_lock)
168static queue_head_t thread_stack_queue;
169
170decl_simple_lock_data(static,thread_terminate_lock)
171static queue_head_t thread_terminate_queue;
172
173static queue_head_t thread_deallocate_queue;
174
175static queue_head_t turnstile_deallocate_queue;
176
177static queue_head_t crashed_threads_queue;
178
179static queue_head_t workq_deallocate_queue;
180
181decl_simple_lock_data(static,thread_exception_lock)
182static queue_head_t thread_exception_queue;
183
184struct thread_exception_elt {
185 queue_chain_t elt;
186 exception_type_t exception_type;
187 task_t exception_task;
188 thread_t exception_thread;
189};
190
191static struct thread thread_template, init_thread;
192static void thread_deallocate_enqueue(thread_t thread);
193static void thread_deallocate_complete(thread_t thread);
194
195#ifdef MACH_BSD
196extern void proc_exit(void *);
197extern mach_exception_data_type_t proc_encode_exit_exception_code(void *);
198extern uint64_t get_dispatchqueue_offset_from_proc(void *);
199extern uint64_t get_return_to_kernel_offset_from_proc(void *p);
200extern int proc_selfpid(void);
201extern void proc_name(int, char*, int);
202extern char * proc_name_address(void *p);
203#endif /* MACH_BSD */
204
205extern int disable_exc_resource;
206extern int audio_active;
207extern int debug_task;
208int thread_max = CONFIG_THREAD_MAX; /* Max number of threads */
209int task_threadmax = CONFIG_THREAD_MAX;
210
211static uint64_t thread_unique_id = 100;
212
213struct _thread_ledger_indices thread_ledgers = { -1 };
214static ledger_template_t thread_ledger_template = NULL;
215static void init_thread_ledgers(void);
216
217#if CONFIG_JETSAM
218void jetsam_on_ledger_cpulimit_exceeded(void);
219#endif
220
221extern int task_thread_soft_limit;
222extern int exc_via_corpse_forking;
223
224#if DEVELOPMENT || DEBUG
225extern int exc_resource_threads_enabled;
226#endif /* DEVELOPMENT || DEBUG */
227
228/*
229 * Level (in terms of percentage of the limit) at which the CPU usage monitor triggers telemetry.
230 *
231 * (i.e., when any thread's CPU consumption exceeds 70% of the limit, start taking user
232 * stacktraces, a.k.a. micro-stackshots)
233 */
234#define CPUMON_USTACKSHOTS_TRIGGER_DEFAULT_PCT 70
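/*
 * Worked example (illustrative numbers only, not a fixed configuration): with
 * a per-thread CPU limit of 50% over a 180s interval, a thread may consume up
 * to 90s of CPU time per interval; telemetry (micro-stackshots) begins once it
 * has used 70% of that allowance, i.e. after roughly 63s of CPU time.
 */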
235
236int cpumon_ustackshots_trigger_pct; /* Percentage. Level at which we start gathering telemetry. */
237void __attribute__((noinline)) SENDING_NOTIFICATION__THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU(void);
238#if DEVELOPMENT || DEBUG
239void __attribute__((noinline)) SENDING_NOTIFICATION__TASK_HAS_TOO_MANY_THREADS(task_t, int);
240#endif /* DEVELOPMENT || DEBUG */
241
242/*
243 * The smallest interval over which we support limiting CPU consumption is 1ms
244 */
245#define MINIMUM_CPULIMIT_INTERVAL_MS 1
246
247void
248thread_bootstrap(void)
249{
250 /*
251 * Fill in a template thread for fast initialization.
252 */
253
254#if MACH_ASSERT
255 thread_template.thread_magic = THREAD_MAGIC;
256#endif /* MACH_ASSERT */
257
258 thread_template.runq = PROCESSOR_NULL;
259
260 thread_template.ref_count = 2;
261
262 thread_template.reason = AST_NONE;
263 thread_template.at_safe_point = FALSE;
264 thread_template.wait_event = NO_EVENT64;
265 thread_template.waitq = NULL;
266 thread_template.wait_result = THREAD_WAITING;
267 thread_template.options = THREAD_ABORTSAFE;
268 thread_template.state = TH_WAIT | TH_UNINT;
269 thread_template.wake_active = FALSE;
270 thread_template.continuation = THREAD_CONTINUE_NULL;
271 thread_template.parameter = NULL;
272
273 thread_template.importance = 0;
274 thread_template.sched_mode = TH_MODE_NONE;
275 thread_template.sched_flags = 0;
276 thread_template.saved_mode = TH_MODE_NONE;
277 thread_template.safe_release = 0;
278 thread_template.th_sched_bucket = TH_BUCKET_RUN;
279
280 thread_template.sfi_class = SFI_CLASS_UNSPECIFIED;
281 thread_template.sfi_wait_class = SFI_CLASS_UNSPECIFIED;
282
283 thread_template.active = 0;
284 thread_template.started = 0;
285 thread_template.static_param = 0;
286 thread_template.policy_reset = 0;
287
288 thread_template.base_pri = BASEPRI_DEFAULT;
289 thread_template.sched_pri = 0;
290 thread_template.max_priority = 0;
291 thread_template.task_priority = 0;
292 thread_template.promotions = 0;
293 thread_template.rwlock_count = 0;
294 thread_template.waiting_for_mutex = NULL;
295
296
297 thread_template.realtime.deadline = UINT64_MAX;
298
299 thread_template.quantum_remaining = 0;
300 thread_template.last_run_time = 0;
301 thread_template.last_made_runnable_time = THREAD_NOT_RUNNABLE;
302 thread_template.last_basepri_change_time = THREAD_NOT_RUNNABLE;
303 thread_template.same_pri_latency = 0;
304
305 thread_template.computation_metered = 0;
306 thread_template.computation_epoch = 0;
307
308#if defined(CONFIG_SCHED_TIMESHARE_CORE)
309 thread_template.sched_stamp = 0;
310 thread_template.pri_shift = INT8_MAX;
311 thread_template.sched_usage = 0;
312 thread_template.cpu_usage = thread_template.cpu_delta = 0;
313#endif
314 thread_template.c_switch = thread_template.p_switch = thread_template.ps_switch = 0;
315
316#if MONOTONIC
317 memset(&thread_template.t_monotonic, 0,
318 sizeof(thread_template.t_monotonic));
319#endif /* MONOTONIC */
320
321 thread_template.bound_processor = PROCESSOR_NULL;
322 thread_template.last_processor = PROCESSOR_NULL;
323
324 thread_template.sched_call = NULL;
325
326 timer_init(&thread_template.user_timer);
327 timer_init(&thread_template.system_timer);
328 timer_init(&thread_template.ptime);
329 timer_init(&thread_template.runnable_timer);
330 thread_template.user_timer_save = 0;
331 thread_template.system_timer_save = 0;
332 thread_template.vtimer_user_save = 0;
333 thread_template.vtimer_prof_save = 0;
334 thread_template.vtimer_rlim_save = 0;
335 thread_template.vtimer_qos_save = 0;
336
337#if CONFIG_SCHED_SFI
338 thread_template.wait_sfi_begin_time = 0;
339#endif
340
341 thread_template.wait_timer_is_set = FALSE;
342 thread_template.wait_timer_active = 0;
343
344 thread_template.depress_timer_active = 0;
345
346 thread_template.recover = (vm_offset_t)NULL;
347
348 thread_template.map = VM_MAP_NULL;
349#if DEVELOPMENT || DEBUG
350 thread_template.pmap_footprint_suspended = FALSE;
351#endif /* DEVELOPMENT || DEBUG */
352
353#if CONFIG_DTRACE
354 thread_template.t_dtrace_predcache = 0;
355 thread_template.t_dtrace_vtime = 0;
356 thread_template.t_dtrace_tracing = 0;
357#endif /* CONFIG_DTRACE */
358
359#if KPERF
360 thread_template.kperf_flags = 0;
361 thread_template.kperf_pet_gen = 0;
362 thread_template.kperf_c_switch = 0;
363 thread_template.kperf_pet_cnt = 0;
364#endif
365
366#if KPC
367 thread_template.kpc_buf = NULL;
368#endif
369
370#if HYPERVISOR
371 thread_template.hv_thread_target = NULL;
372#endif /* HYPERVISOR */
373
374#if (DEVELOPMENT || DEBUG)
375 thread_template.t_page_creation_throttled_hard = 0;
376 thread_template.t_page_creation_throttled_soft = 0;
377#endif /* DEVELOPMENT || DEBUG */
378 thread_template.t_page_creation_throttled = 0;
379 thread_template.t_page_creation_count = 0;
380 thread_template.t_page_creation_time = 0;
381
382 thread_template.affinity_set = NULL;
383
384 thread_template.syscalls_unix = 0;
385 thread_template.syscalls_mach = 0;
386
387 thread_template.t_ledger = LEDGER_NULL;
388 thread_template.t_threadledger = LEDGER_NULL;
389 thread_template.t_bankledger = LEDGER_NULL;
390 thread_template.t_deduct_bank_ledger_time = 0;
391
392 thread_template.requested_policy = (struct thread_requested_policy) {};
393 thread_template.effective_policy = (struct thread_effective_policy) {};
394
395 bzero(&thread_template.overrides, sizeof(thread_template.overrides));
396 thread_template.sync_ipc_overrides = 0;
397
398 thread_template.iotier_override = THROTTLE_LEVEL_NONE;
399 thread_template.thread_io_stats = NULL;
400#if CONFIG_EMBEDDED
401 thread_template.taskwatch = NULL;
402#endif /* CONFIG_EMBEDDED */
403 thread_template.thread_callout_interrupt_wakeups = thread_template.thread_callout_platform_idle_wakeups = 0;
404
405 thread_template.thread_timer_wakeups_bin_1 = thread_template.thread_timer_wakeups_bin_2 = 0;
406 thread_template.callout_woken_from_icontext = thread_template.callout_woken_from_platform_idle = 0;
407
408 thread_template.thread_tag = 0;
409
410 thread_template.ith_voucher_name = MACH_PORT_NULL;
411 thread_template.ith_voucher = IPC_VOUCHER_NULL;
412
413 thread_template.th_work_interval = NULL;
414
415 init_thread = thread_template;
416 machine_set_current_thread(&init_thread);
417}
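
/*
 * Note: fields not explicitly initialized above rely on thread_template being
 * statically zero-initialized; thread_create_internal() below copies the whole
 * template into each newly allocated thread, so per-thread defaults only need
 * to be established once here.
 */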
418
419extern boolean_t allow_qos_policy_set;
420
421void
422thread_init(void)
423{
424 thread_zone = zinit(
425 sizeof(struct thread),
426 thread_max * sizeof(struct thread),
427 THREAD_CHUNK * sizeof(struct thread),
428 "threads");
429
430 thread_qos_override_zone = zinit(
431 sizeof(struct thread_qos_override),
432 4 * thread_max * sizeof(struct thread_qos_override),
433 PAGE_SIZE,
434 "thread qos override");
435 zone_change(thread_qos_override_zone, Z_EXPAND, TRUE);
436 zone_change(thread_qos_override_zone, Z_COLLECT, TRUE);
437 zone_change(thread_qos_override_zone, Z_CALLERACCT, FALSE);
438 zone_change(thread_qos_override_zone, Z_NOENCRYPT, TRUE);
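
 /*
  * A brief note on the calls above (summarizing the zone API as used here):
  * zinit() takes the element size, the maximum zone size in bytes, the
  * allocation chunk size in bytes, and a zone name; the zone_change() calls
  * then mark the override zone expandable and collectable, exempt it from
  * caller accounting, and flag it as not requiring encryption.
  */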
439
440 lck_grp_attr_setdefault(&thread_lck_grp_attr);
441 lck_grp_init(&thread_lck_grp, "thread", &thread_lck_grp_attr);
442 lck_attr_setdefault(&thread_lck_attr);
443
444 stack_init();
445
446 thread_policy_init();
447
448 /*
449 * Initialize any machine-dependent
450 * per-thread structures necessary.
451 */
452 machine_thread_init();
453
454 if (!PE_parse_boot_argn("cpumon_ustackshots_trigger_pct", &cpumon_ustackshots_trigger_pct,
455 sizeof (cpumon_ustackshots_trigger_pct))) {
456 cpumon_ustackshots_trigger_pct = CPUMON_USTACKSHOTS_TRIGGER_DEFAULT_PCT;
457 }
458
459 PE_parse_boot_argn("-qos-policy-allow", &allow_qos_policy_set, sizeof(allow_qos_policy_set));
460
461 init_thread_ledgers();
462}
463
464boolean_t
465thread_is_active(thread_t thread)
466{
467 return (thread->active);
468}
469
470void
471thread_corpse_continue(void)
472{
473 thread_t thread = current_thread();
474
475 thread_terminate_internal(thread);
476
477 /*
478 * Handle the thread termination directly
479 * here instead of returning to userspace.
480 */
481 assert(thread->active == FALSE);
482 thread_ast_clear(thread, AST_APC);
483 thread_apc_ast(thread);
484
485 panic("thread_corpse_continue");
486 /*NOTREACHED*/
487}
488
489static void
490thread_terminate_continue(void)
491{
492 panic("thread_terminate_continue");
493 /*NOTREACHED*/
494}
495
496/*
497 * thread_terminate_self:
498 */
499void
500thread_terminate_self(void)
501{
502 thread_t thread = current_thread();
503 task_t task;
504 int threadcnt;
505
506 pal_thread_terminate_self(thread);
507
508 DTRACE_PROC(lwp__exit);
509
510 thread_mtx_lock(thread);
511
512 ipc_thread_disable(thread);
513
514 thread_mtx_unlock(thread);
515
516 thread_sched_call(thread, NULL);
517
518 spl_t s = splsched();
519 thread_lock(thread);
520
521 thread_depress_abort_locked(thread);
522
523 thread_unlock(thread);
524 splx(s);
525
526#if CONFIG_EMBEDDED
527 thead_remove_taskwatch(thread);
528#endif /* CONFIG_EMBEDDED */
529
530 work_interval_thread_terminate(thread);
531
532 thread_mtx_lock(thread);
533
534 thread_policy_reset(thread);
535
536 thread_mtx_unlock(thread);
537
538 bank_swap_thread_bank_ledger(thread, NULL);
539
540 if (kdebug_enable && bsd_hasthreadname(thread->uthread)) {
541 char threadname[MAXTHREADNAMESIZE];
542 bsd_getthreadname(thread->uthread, threadname);
543 kernel_debug_string_simple(TRACE_STRING_THREADNAME_PREV, threadname);
544 }
545
546 task = thread->task;
547 uthread_cleanup(task, thread->uthread, task->bsd_info);
548
549 if (kdebug_enable && task->bsd_info && !task_is_exec_copy(task)) {
550 /* trace out pid before we sign off */
551 long dbg_arg1 = 0;
552 long dbg_arg2 = 0;
553
554 kdbg_trace_data(thread->task->bsd_info, &dbg_arg1, &dbg_arg2);
555 KDBG_RELEASE(TRACE_DATA_THREAD_TERMINATE_PID, dbg_arg1, dbg_arg2);
556 }
557
558 /*
559 * After this subtraction, this thread should never access
560 * task->bsd_info unless it got 0 back from the hw_atomic_sub. It
561 * could be racing with other threads to be the last thread in the
562 * process, and the last thread in the process will tear down the proc
563 * structure and zero-out task->bsd_info.
564 */
565 threadcnt = hw_atomic_sub(&task->active_thread_count, 1);
566
567 /*
568 * If we are the last thread to terminate and the task is
569 * associated with a BSD process, perform BSD process exit.
570 */
571 if (threadcnt == 0 && task->bsd_info != NULL && !task_is_exec_copy(task)) {
572 mach_exception_data_type_t subcode = 0;
573 if (kdebug_enable) {
574 /* since we're the last thread in this process, trace out the command name too */
575 long args[4] = {};
576 kdbg_trace_string(thread->task->bsd_info, &args[0], &args[1], &args[2], &args[3]);
577 KDBG_RELEASE(TRACE_STRING_PROC_EXIT, args[0], args[1], args[2], args[3]);
578 }
579
580 /* Get the exit reason before proc_exit */
581 subcode = proc_encode_exit_exception_code(task->bsd_info);
582 proc_exit(task->bsd_info);
583 /*
584 * If there is crash info in the task, deliver the crash
585 * notification now, since this is the last thread for
586 * this task.
587 */
588 if (task->corpse_info) {
589 task_deliver_crash_notification(task, current_thread(), EXC_RESOURCE, subcode);
590 }
591 }
592
593 if (threadcnt == 0) {
594 task_lock(task);
595 if (task_is_a_corpse_fork(task)) {
596 thread_wakeup((event_t)&task->active_thread_count);
597 }
598 task_unlock(task);
599 }
600
601 uthread_cred_free(thread->uthread);
602
603 s = splsched();
604 thread_lock(thread);
605
606 /*
607 * Ensure that the depress timer is no longer enqueued,
608 * so the timer (stored in the thread) can be safely deallocated
609 *
610 * TODO: build timer_call_cancel_wait
611 */
612
613 assert((thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) == 0);
614
615 uint32_t delay_us = 1;
616
617 while (thread->depress_timer_active > 0) {
618 thread_unlock(thread);
619 splx(s);
620
621 delay(delay_us++);
622
623 if (delay_us > USEC_PER_SEC)
624 panic("depress timer failed to inactivate! "
625 "thread: %p depress_timer_active: %d",
626 thread, thread->depress_timer_active);
627
628 s = splsched();
629 thread_lock(thread);
630 }
631
632 /*
633 * Cancel wait timer, and wait for
634 * concurrent expirations.
635 */
636 if (thread->wait_timer_is_set) {
637 thread->wait_timer_is_set = FALSE;
638
639 if (timer_call_cancel(&thread->wait_timer))
640 thread->wait_timer_active--;
641 }
642
643 delay_us = 1;
644
645 while (thread->wait_timer_active > 0) {
646 thread_unlock(thread);
647 splx(s);
648
649 delay(delay_us++);
650
651 if (delay_us > USEC_PER_SEC)
652 panic("wait timer failed to inactivate! "
653 "thread: %p wait_timer_active: %d",
654 thread, thread->wait_timer_active);
655
656 s = splsched();
657 thread_lock(thread);
658 }
659
660 /*
661 * If there is a reserved stack, release it.
662 */
663 if (thread->reserved_stack != 0) {
664 stack_free_reserved(thread);
665 thread->reserved_stack = 0;
666 }
667
668 /*
669 * Mark thread as terminating, and block.
670 */
671 thread->state |= TH_TERMINATE;
672 thread_mark_wait_locked(thread, THREAD_UNINT);
673
674 assert((thread->sched_flags & TH_SFLAG_WAITQ_PROMOTED) == 0);
675 assert((thread->sched_flags & TH_SFLAG_RW_PROMOTED) == 0);
676 assert((thread->sched_flags & TH_SFLAG_EXEC_PROMOTED) == 0);
677 assert((thread->sched_flags & TH_SFLAG_PROMOTED) == 0);
678 assert(thread->promotions == 0);
679 assert(thread->was_promoted_on_wakeup == 0);
680 assert(thread->waiting_for_mutex == NULL);
681 assert(thread->rwlock_count == 0);
682
683 thread_unlock(thread);
684 /* splsched */
685
686 thread_block((thread_continue_t)thread_terminate_continue);
687 /*NOTREACHED*/
688}
689
690/* Drop a thread refcount safely without triggering a zfree */
691void
692thread_deallocate_safe(thread_t thread)
693{
694 __assert_only uint32_t th_ref_count;
695
696 if (thread == THREAD_NULL)
697 return;
698
699 assert_thread_magic(thread);
700
701 if (__probable(atomic_fetch_sub_explicit(&thread->ref_count, 1,
702 memory_order_release) - 1 > 0)) {
703 return;
704 }
705
706 th_ref_count = atomic_load_explicit(&thread->ref_count, memory_order_acquire);
707 assert(th_ref_count == 0);
708
709 /* enqueue the thread for the thread deallocate daemon to call thread_deallocate_complete */
710 thread_deallocate_enqueue(thread);
711}
712
713void
714thread_deallocate(
715 thread_t thread)
716{
717 __assert_only uint32_t th_ref_count;
718
719 if (thread == THREAD_NULL)
720 return;
721
722 assert_thread_magic(thread);
723
724 if (__probable(atomic_fetch_sub_explicit(&thread->ref_count, 1,
725 memory_order_release) - 1 > 0)) {
726 return;
727 }
728
729 th_ref_count = atomic_load_explicit(&thread->ref_count, memory_order_acquire);
730 assert(th_ref_count == 0);
731
732 thread_deallocate_complete(thread);
733}
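
/*
 * Both deallocate paths above use the same reference-counting idiom: the
 * decrement is a release operation so that this thread's prior writes are
 * published before the count can reach zero, and the final acquire load
 * synchronizes with the releases performed by other droppers before teardown.
 * The "safe" variant defers the actual teardown to the terminate daemon so
 * that it can be called from contexts where a zfree() would be unsafe.
 */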
734
735void
736thread_deallocate_complete(
737 thread_t thread)
738{
739 task_t task;
740
741 assert_thread_magic(thread);
742
743 assert(thread->ref_count == 0);
744
745 assert(thread_owned_workloops_count(thread) == 0);
746
747 if (!(thread->state & TH_TERMINATE2))
748 panic("thread_deallocate: thread not properly terminated\n");
749
750 assert(thread->runq == PROCESSOR_NULL);
751
752#if KPC
753 kpc_thread_destroy(thread);
754#endif
755
756 ipc_thread_terminate(thread);
757
758 proc_thread_qos_deallocate(thread);
759
760 task = thread->task;
761
762#ifdef MACH_BSD
763 {
764 void *ut = thread->uthread;
765
766 thread->uthread = NULL;
767 uthread_zone_free(ut);
768 }
769#endif /* MACH_BSD */
770
771 if (thread->t_ledger)
772 ledger_dereference(thread->t_ledger);
773 if (thread->t_threadledger)
774 ledger_dereference(thread->t_threadledger);
775
776 assert(thread->turnstile != TURNSTILE_NULL);
777 if (thread->turnstile)
778 turnstile_deallocate(thread->turnstile);
779
780 if (IPC_VOUCHER_NULL != thread->ith_voucher)
781 ipc_voucher_release(thread->ith_voucher);
782
783 if (thread->thread_io_stats)
784 kfree(thread->thread_io_stats, sizeof(struct io_stat_info));
785
786 if (thread->kernel_stack != 0)
787 stack_free(thread);
788
789 lck_mtx_destroy(&thread->mutex, &thread_lck_grp);
790 machine_thread_destroy(thread);
791
792 task_deallocate(task);
793
794#if MACH_ASSERT
795 assert_thread_magic(thread);
796 thread->thread_magic = 0;
797#endif /* MACH_ASSERT */
798
799 zfree(thread_zone, thread);
800}
801
802void
803thread_starts_owning_workloop(thread_t thread)
804{
805 atomic_fetch_add_explicit(&thread->kqwl_owning_count, 1,
806 memory_order_relaxed);
807}
808
809void
810thread_ends_owning_workloop(thread_t thread)
811{
812 __assert_only uint32_t count;
813 count = atomic_fetch_sub_explicit(&thread->kqwl_owning_count, 1,
814 memory_order_relaxed);
815 assert(count > 0);
816}
817
818uint32_t
819thread_owned_workloops_count(thread_t thread)
820{
821 return atomic_load_explicit(&thread->kqwl_owning_count,
822 memory_order_relaxed);
823}
824
825/*
826 * thread_inspect_deallocate:
827 *
828 * Drop a thread inspection reference.
829 */
830void
831thread_inspect_deallocate(
832 thread_inspect_t thread_inspect)
833{
834 thread_deallocate((thread_t)thread_inspect);
835}
836
837/*
838 * thread_exception_daemon:
839 *
840 * Deliver EXC_{RESOURCE,GUARD} exception
841 */
842static void
843thread_exception_daemon(void)
844{
845 struct thread_exception_elt *elt;
846 task_t task;
847 thread_t thread;
848 exception_type_t etype;
849
850 simple_lock(&thread_exception_lock);
851 while ((elt = (struct thread_exception_elt *)dequeue_head(&thread_exception_queue)) != NULL) {
852 simple_unlock(&thread_exception_lock);
853
854 etype = elt->exception_type;
855 task = elt->exception_task;
856 thread = elt->exception_thread;
857 assert_thread_magic(thread);
858
859 kfree(elt, sizeof (*elt));
860
861 /* wait for all the threads in the task to terminate */
862 task_lock(task);
863 task_wait_till_threads_terminate_locked(task);
864 task_unlock(task);
865
866 /* Consumes the task ref returned by task_generate_corpse_internal */
867 task_deallocate(task);
868 /* Consumes the thread ref returned by task_generate_corpse_internal */
869 thread_deallocate(thread);
870
871 /* Deliver the notification, also clears the corpse. */
872 task_deliver_crash_notification(task, thread, etype, 0);
873
874 simple_lock(&thread_exception_lock);
875 }
876
877 assert_wait((event_t)&thread_exception_queue, THREAD_UNINT);
878 simple_unlock(&thread_exception_lock);
879
880 thread_block((thread_continue_t)thread_exception_daemon);
881}
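
/*
 * Like the other daemons in this file, thread_exception_daemon never returns:
 * it parks on its queue event with assert_wait() and passes itself as the
 * continuation to thread_block(), so each wakeup restarts the function from
 * the top rather than resuming a saved stack.
 */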
882
883/*
884 * thread_exception_enqueue:
885 *
886 * Enqueue a corpse port to be delivered an EXC_{RESOURCE,GUARD}.
887 */
888void
889thread_exception_enqueue(
890 task_t task,
891 thread_t thread,
892 exception_type_t etype)
893{
894 assert(EXC_RESOURCE == etype || EXC_GUARD == etype);
895 struct thread_exception_elt *elt = kalloc(sizeof (*elt));
896 elt->exception_type = etype;
897 elt->exception_task = task;
898 elt->exception_thread = thread;
899
900 simple_lock(&thread_exception_lock);
901 enqueue_tail(&thread_exception_queue, (queue_entry_t)elt);
902 simple_unlock(&thread_exception_lock);
903
904 thread_wakeup((event_t)&thread_exception_queue);
905}
906
907/*
908 * thread_copy_resource_info
909 *
910 * Copy the resource info counters from source
911 * thread to destination thread.
912 */
913void
914thread_copy_resource_info(
915 thread_t dst_thread,
916 thread_t src_thread)
917{
918 dst_thread->c_switch = src_thread->c_switch;
919 dst_thread->p_switch = src_thread->p_switch;
920 dst_thread->ps_switch = src_thread->ps_switch;
921 dst_thread->precise_user_kernel_time = src_thread->precise_user_kernel_time;
922 dst_thread->user_timer = src_thread->user_timer;
923 dst_thread->user_timer_save = src_thread->user_timer_save;
924 dst_thread->system_timer = src_thread->system_timer;
925 dst_thread->system_timer_save = src_thread->system_timer_save;
926 dst_thread->runnable_timer = src_thread->runnable_timer;
927 dst_thread->vtimer_user_save = src_thread->vtimer_user_save;
928 dst_thread->vtimer_prof_save = src_thread->vtimer_prof_save;
929 dst_thread->vtimer_rlim_save = src_thread->vtimer_rlim_save;
930 dst_thread->vtimer_qos_save = src_thread->vtimer_qos_save;
931 dst_thread->syscalls_unix = src_thread->syscalls_unix;
932 dst_thread->syscalls_mach = src_thread->syscalls_mach;
933 ledger_rollup(dst_thread->t_threadledger, src_thread->t_threadledger);
934 *dst_thread->thread_io_stats = *src_thread->thread_io_stats;
935}
936
937/*
938 * thread_terminate_daemon:
939 *
940 * Perform final clean up for terminating threads.
941 */
942static void
943thread_terminate_daemon(void)
944{
945 thread_t self, thread;
946 task_t task;
947
948 self = current_thread();
949 self->options |= TH_OPT_SYSTEM_CRITICAL;
950
951 (void)splsched();
952 simple_lock(&thread_terminate_lock);
953
954thread_terminate_start:
955 while ((thread = qe_dequeue_head(&thread_terminate_queue, struct thread, runq_links)) != THREAD_NULL) {
956 assert_thread_magic(thread);
957
958 /*
959 * If marked for crash reporting, skip reaping.
960 * The corpse delivery thread will clear the bit and re-enqueue
961 * the thread for reaping when done.
962 */
963 if (thread->inspection){
964 enqueue_tail(&crashed_threads_queue, &thread->runq_links);
965 continue;
966 }
967
968 simple_unlock(&thread_terminate_lock);
969 (void)spllo();
970
971 task = thread->task;
972
973 task_lock(task);
974 task->total_user_time += timer_grab(&thread->user_timer);
975 task->total_ptime += timer_grab(&thread->ptime);
976 task->total_runnable_time += timer_grab(&thread->runnable_timer);
977 if (thread->precise_user_kernel_time) {
978 task->total_system_time += timer_grab(&thread->system_timer);
979 } else {
980 task->total_user_time += timer_grab(&thread->system_timer);
981 }
982
983 task->c_switch += thread->c_switch;
984 task->p_switch += thread->p_switch;
985 task->ps_switch += thread->ps_switch;
986
987 task->syscalls_unix += thread->syscalls_unix;
988 task->syscalls_mach += thread->syscalls_mach;
989
990 task->task_timer_wakeups_bin_1 += thread->thread_timer_wakeups_bin_1;
991 task->task_timer_wakeups_bin_2 += thread->thread_timer_wakeups_bin_2;
992 task->task_gpu_ns += ml_gpu_stat(thread);
993 task->task_energy += ml_energy_stat(thread);
994
995#if MONOTONIC
996 mt_terminate_update(task, thread);
997#endif /* MONOTONIC */
998
999 thread_update_qos_cpu_time(thread);
1000
1001 queue_remove(&task->threads, thread, thread_t, task_threads);
1002 task->thread_count--;
1003
1004 /*
1005 * If the task is being halted, and there is only one thread
1006 * left in the task after this one, then wakeup that thread.
1007 */
1008 if (task->thread_count == 1 && task->halting)
1009 thread_wakeup((event_t)&task->halting);
1010
1011 task_unlock(task);
1012
1013 lck_mtx_lock(&tasks_threads_lock);
1014 queue_remove(&threads, thread, thread_t, threads);
1015 threads_count--;
1016 lck_mtx_unlock(&tasks_threads_lock);
1017
1018 thread_deallocate(thread);
1019
1020 (void)splsched();
1021 simple_lock(&thread_terminate_lock);
1022 }
1023
1024 while ((thread = qe_dequeue_head(&thread_deallocate_queue, struct thread, runq_links)) != THREAD_NULL) {
1025 assert_thread_magic(thread);
1026
1027 simple_unlock(&thread_terminate_lock);
1028 (void)spllo();
1029
1030 thread_deallocate_complete(thread);
1031
1032 (void)splsched();
1033 simple_lock(&thread_terminate_lock);
1034 }
1035
1036 struct turnstile *turnstile;
1037 while ((turnstile = qe_dequeue_head(&turnstile_deallocate_queue, struct turnstile, ts_deallocate_link)) != TURNSTILE_NULL) {
1038
1039 simple_unlock(&thread_terminate_lock);
1040 (void)spllo();
1041
1042 turnstile_destroy(turnstile);
1043
1044 (void)splsched();
1045 simple_lock(&thread_terminate_lock);
1046 }
1047
1048 queue_entry_t qe;
1049
1050 /*
1051 * see workq_deallocate_enqueue: struct workqueue is opaque to thread.c, so
1052 * this queue simply links the memory without interpreting it
1053 */
1054 while ((qe = dequeue_head(&workq_deallocate_queue))) {
1055 simple_unlock(&thread_terminate_lock);
1056 (void)spllo();
1057
1058 workq_destroy((struct workqueue *)qe);
1059
1060 (void)splsched();
1061 simple_lock(&thread_terminate_lock);
1062 }
1063
1064 /*
1065 * Check whether anything was enqueued on the thread terminate/deallocate
1066 * queues while the workq deallocate queue was being processed.
1067 */
1068 if (!queue_empty(&thread_terminate_queue) ||
1069 !queue_empty(&thread_deallocate_queue) ||
1070 !queue_empty(&turnstile_deallocate_queue))
1071 goto thread_terminate_start;
1072
1073 assert_wait((event_t)&thread_terminate_queue, THREAD_UNINT);
1074 simple_unlock(&thread_terminate_lock);
1075 /* splsched */
1076
1077 self->options &= ~TH_OPT_SYSTEM_CRITICAL;
1078 thread_block((thread_continue_t)thread_terminate_daemon);
1079 /*NOTREACHED*/
1080}
1081
1082/*
1083 * thread_terminate_enqueue:
1084 *
1085 * Enqueue a terminating thread for final disposition.
1086 *
1087 * Called at splsched.
1088 */
1089void
1090thread_terminate_enqueue(
1091 thread_t thread)
1092{
1093 KDBG_RELEASE(TRACE_DATA_THREAD_TERMINATE, thread->thread_id);
1094
1095 simple_lock(&thread_terminate_lock);
1096 enqueue_tail(&thread_terminate_queue, &thread->runq_links);
1097 simple_unlock(&thread_terminate_lock);
1098
1099 thread_wakeup((event_t)&thread_terminate_queue);
1100}
1101
1102/*
1103 * thread_deallocate_enqueue:
1104 *
1105 * Enqueue a thread for final deallocation.
1106 */
1107static void
1108thread_deallocate_enqueue(
1109 thread_t thread)
1110{
1111 spl_t s = splsched();
1112
1113 simple_lock(&thread_terminate_lock);
1114 enqueue_tail(&thread_deallocate_queue, &thread->runq_links);
1115 simple_unlock(&thread_terminate_lock);
1116
1117 thread_wakeup((event_t)&thread_terminate_queue);
1118 splx(s);
1119}
1120
1121/*
1122 * turnstile_deallocate_enqueue:
1123 *
1124 * Enqueue a turnstile for final deallocation.
1125 */
1126void
1127turnstile_deallocate_enqueue(
1128 struct turnstile *turnstile)
1129{
1130 spl_t s = splsched();
1131
1132 simple_lock(&thread_terminate_lock);
1133 enqueue_tail(&turnstile_deallocate_queue, &turnstile->ts_deallocate_link);
1134 simple_unlock(&thread_terminate_lock);
1135
1136 thread_wakeup((event_t)&thread_terminate_queue);
1137 splx(s);
1138}
1139
1140/*
1141 * workq_deallocate_enqueue:
1142 *
1143 * Enqueue a workqueue for final deallocation.
1144 */
1145void
1146workq_deallocate_enqueue(
1147 struct workqueue *wq)
1148{
1149 spl_t s = splsched();
1150
1151 simple_lock(&thread_terminate_lock);
1152 /*
1153 * this is just to delay a zfree(), so we link the memory without regard
1154 * for the struct's layout.
1155 */
1156 enqueue_tail(&workq_deallocate_queue, (queue_entry_t)wq);
1157 simple_unlock(&thread_terminate_lock);
1158
1159 thread_wakeup((event_t)&thread_terminate_queue);
1160 splx(s);
1161}
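
/*
 * Design note: the terminate, deallocate, turnstile and workq queues above all
 * share thread_terminate_lock and are drained by the same terminate daemon,
 * which is why every enqueue path issues its wakeup on the single
 * &thread_terminate_queue event rather than on a per-queue event.
 */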
1162
1163/*
1164 * thread_terminate_crashed_threads:
1165 * walk the list of crashed threads and move any that are no longer
1166 * being inspected back onto the terminate queue.
1167 */
1168void
1169thread_terminate_crashed_threads()
1170{
1171 thread_t th_remove;
1172 boolean_t should_wake_terminate_queue = FALSE;
1173 spl_t s = splsched();
1174
1175 simple_lock(&thread_terminate_lock);
1176 /*
1177 * loop through the crashed threads queue and re-queue any threads
1178 * that are no longer being inspected onto the terminate queue
1179 */
1180
1181 qe_foreach_element_safe(th_remove, &crashed_threads_queue, runq_links) {
1182 /* make sure current_thread is never in crashed queue */
1183 assert(th_remove != current_thread());
1184
1185 if (th_remove->inspection == FALSE) {
1186 re_queue_tail(&thread_terminate_queue, &th_remove->runq_links);
1187 should_wake_terminate_queue = TRUE;
1188 }
1189 }
1190
1191 simple_unlock(&thread_terminate_lock);
1192 splx(s);
1193 if (should_wake_terminate_queue == TRUE) {
1194 thread_wakeup((event_t)&thread_terminate_queue);
1195 }
1196}
1197
1198/*
1199 * thread_stack_daemon:
1200 *
1201 * Perform stack allocation for threads that could not be handed
1202 * a kernel stack at thread_invoke (context switch) time.
1203 */
1204static void
1205thread_stack_daemon(void)
1206{
1207 thread_t thread;
1208 spl_t s;
1209
1210 s = splsched();
1211 simple_lock(&thread_stack_lock);
1212
1213 while ((thread = qe_dequeue_head(&thread_stack_queue, struct thread, runq_links)) != THREAD_NULL) {
1214 assert_thread_magic(thread);
1215
1216 simple_unlock(&thread_stack_lock);
1217 splx(s);
1218
1219 /* allocate stack with interrupts enabled so that we can call into VM */
1220 stack_alloc(thread);
1221
1222 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED,MACH_STACK_WAIT) | DBG_FUNC_END, thread_tid(thread), 0, 0, 0, 0);
1223
1224 s = splsched();
1225 thread_lock(thread);
1226 thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
1227 thread_unlock(thread);
1228
1229 simple_lock(&thread_stack_lock);
1230 }
1231
1232 assert_wait((event_t)&thread_stack_queue, THREAD_UNINT);
1233 simple_unlock(&thread_stack_lock);
1234 splx(s);
1235
1236 thread_block((thread_continue_t)thread_stack_daemon);
1237 /*NOTREACHED*/
1238}
1239
1240/*
1241 * thread_stack_enqueue:
1242 *
1243 * Enqueue a thread for stack allocation.
1244 *
1245 * Called at splsched.
1246 */
1247void
1248thread_stack_enqueue(
1249 thread_t thread)
1250{
1251 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED,MACH_STACK_WAIT) | DBG_FUNC_START, thread_tid(thread), 0, 0, 0, 0);
1252 assert_thread_magic(thread);
1253
1254 simple_lock(&thread_stack_lock);
1255 enqueue_tail(&thread_stack_queue, &thread->runq_links);
1256 simple_unlock(&thread_stack_lock);
1257
1258 thread_wakeup((event_t)&thread_stack_queue);
1259}
1260
1261void
1262thread_daemon_init(void)
1263{
1264 kern_return_t result;
1265 thread_t thread = NULL;
1266
1267 simple_lock_init(&thread_terminate_lock, 0);
1268 queue_init(&thread_terminate_queue);
1269 queue_init(&thread_deallocate_queue);
1270 queue_init(&workq_deallocate_queue);
1271 queue_init(&turnstile_deallocate_queue);
1272 queue_init(&crashed_threads_queue);
1273
1274 result = kernel_thread_start_priority((thread_continue_t)thread_terminate_daemon, NULL, MINPRI_KERNEL, &thread);
1275 if (result != KERN_SUCCESS)
1276 panic("thread_daemon_init: thread_terminate_daemon");
1277
1278 thread_deallocate(thread);
1279
1280 simple_lock_init(&thread_stack_lock, 0);
1281 queue_init(&thread_stack_queue);
1282
1283 result = kernel_thread_start_priority((thread_continue_t)thread_stack_daemon, NULL, BASEPRI_PREEMPT_HIGH, &thread);
1284 if (result != KERN_SUCCESS)
1285 panic("thread_daemon_init: thread_stack_daemon");
1286
1287 thread_deallocate(thread);
1288
1289 simple_lock_init(&thread_exception_lock, 0);
1290 queue_init(&thread_exception_queue);
1291
1292 result = kernel_thread_start_priority((thread_continue_t)thread_exception_daemon, NULL, MINPRI_KERNEL, &thread);
1293 if (result != KERN_SUCCESS)
1294 panic("thread_daemon_init: thread_exception_daemon");
1295
1296 thread_deallocate(thread);
1297}
1298
1299#define TH_OPTION_NONE 0x00
1300#define TH_OPTION_NOCRED 0x01
1301#define TH_OPTION_NOSUSP 0x02
1302#define TH_OPTION_WORKQ 0x04
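
/*
 * A short summary of the options above, based on how they are consumed below:
 * TH_OPTION_NOCRED - do not inherit a credential reference in uthread_alloc()
 * TH_OPTION_NOSUSP - fail creation while the parent task is suspended
 * TH_OPTION_WORKQ  - initialize the thread as a parked workqueue thread
 */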
1303
1304/*
1305 * Create a new thread.
1306 * Doesn't start the thread running.
1307 *
1308 * Task and tasks_threads_lock are returned locked on success.
1309 */
1310static kern_return_t
1311thread_create_internal(
1312 task_t parent_task,
1313 integer_t priority,
1314 thread_continue_t continuation,
1315 void *parameter,
1316 int options,
1317 thread_t *out_thread)
1318{
1319 thread_t new_thread;
1320 static thread_t first_thread;
1321
1322 /*
1323 * Allocate a thread and initialize static fields
1324 */
1325 if (first_thread == THREAD_NULL)
1326 new_thread = first_thread = current_thread();
1327 else
1328 new_thread = (thread_t)zalloc(thread_zone);
1329 if (new_thread == THREAD_NULL)
1330 return (KERN_RESOURCE_SHORTAGE);
1331
1332 if (new_thread != first_thread)
1333 *new_thread = thread_template;
1334
1335#ifdef MACH_BSD
1336 new_thread->uthread = uthread_alloc(parent_task, new_thread, (options & TH_OPTION_NOCRED) != 0);
1337 if (new_thread->uthread == NULL) {
1338#if MACH_ASSERT
1339 new_thread->thread_magic = 0;
1340#endif /* MACH_ASSERT */
1341
1342 zfree(thread_zone, new_thread);
1343 return (KERN_RESOURCE_SHORTAGE);
1344 }
1345#endif /* MACH_BSD */
1346
1347 if (machine_thread_create(new_thread, parent_task) != KERN_SUCCESS) {
1348#ifdef MACH_BSD
1349 void *ut = new_thread->uthread;
1350
1351 new_thread->uthread = NULL;
1352 /* cred free may not be necessary */
1353 uthread_cleanup(parent_task, ut, parent_task->bsd_info);
1354 uthread_cred_free(ut);
1355 uthread_zone_free(ut);
1356#endif /* MACH_BSD */
1357
1358#if MACH_ASSERT
1359 new_thread->thread_magic = 0;
1360#endif /* MACH_ASSERT */
1361
1362 zfree(thread_zone, new_thread);
1363 return (KERN_FAILURE);
1364 }
1365
1366 new_thread->task = parent_task;
1367
1368 thread_lock_init(new_thread);
1369 wake_lock_init(new_thread);
1370
1371 lck_mtx_init(&new_thread->mutex, &thread_lck_grp, &thread_lck_attr);
1372
1373 ipc_thread_init(new_thread);
1374
1375 new_thread->continuation = continuation;
1376 new_thread->parameter = parameter;
1377 new_thread->inheritor_flags = TURNSTILE_UPDATE_FLAGS_NONE;
1378 priority_queue_init(&new_thread->inheritor_queue,
1379 PRIORITY_QUEUE_BUILTIN_MAX_HEAP);
1380
1381 /* Allocate I/O Statistics structure */
1382 new_thread->thread_io_stats = (io_stat_info_t)kalloc(sizeof(struct io_stat_info));
1383 assert(new_thread->thread_io_stats != NULL);
1384 bzero(new_thread->thread_io_stats, sizeof(struct io_stat_info));
1385 new_thread->sync_ipc_overrides = 0;
1386
1387#if KASAN
1388 kasan_init_thread(&new_thread->kasan_data);
1389#endif
1390
1391#if CONFIG_IOSCHED
1392 /* Clear out the I/O Scheduling info for AppleFSCompression */
1393 new_thread->decmp_upl = NULL;
1394#endif /* CONFIG_IOSCHED */
1395
1396#if DEVELOPMENT || DEBUG
1397 task_lock(parent_task);
1398 uint16_t thread_limit = parent_task->task_thread_limit;
1399 if (exc_resource_threads_enabled &&
1400 thread_limit > 0 &&
1401 parent_task->thread_count >= thread_limit &&
1402 !parent_task->task_has_crossed_thread_limit &&
1403 !(parent_task->t_flags & TF_CORPSE)) {
1404 int thread_count = parent_task->thread_count;
1405 parent_task->task_has_crossed_thread_limit = TRUE;
1406 task_unlock(parent_task);
1407 SENDING_NOTIFICATION__TASK_HAS_TOO_MANY_THREADS(parent_task, thread_count);
1408 }
1409 else {
1410 task_unlock(parent_task);
1411 }
1412#endif
1413
1414 lck_mtx_lock(&tasks_threads_lock);
1415 task_lock(parent_task);
1416
1417 /*
1418 * Fail thread creation if parent task is being torn down or has too many threads
1419 * If the caller asked for TH_OPTION_NOSUSP, also fail if the parent task is suspended
1420 */
1421 if (parent_task->active == 0 || parent_task->halting ||
1422 (parent_task->suspend_count > 0 && (options & TH_OPTION_NOSUSP) != 0) ||
1423 (parent_task->thread_count >= task_threadmax && parent_task != kernel_task)) {
1424 task_unlock(parent_task);
1425 lck_mtx_unlock(&tasks_threads_lock);
1426
1427#ifdef MACH_BSD
1428 {
1429 void *ut = new_thread->uthread;
1430
1431 new_thread->uthread = NULL;
1432 uthread_cleanup(parent_task, ut, parent_task->bsd_info);
1433 /* cred free may not be necessary */
1434 uthread_cred_free(ut);
1435 uthread_zone_free(ut);
1436 }
1437#endif /* MACH_BSD */
1438 ipc_thread_disable(new_thread);
1439 ipc_thread_terminate(new_thread);
1440 kfree(new_thread->thread_io_stats, sizeof(struct io_stat_info));
1441 lck_mtx_destroy(&new_thread->mutex, &thread_lck_grp);
1442 machine_thread_destroy(new_thread);
1443 zfree(thread_zone, new_thread);
1444 return (KERN_FAILURE);
1445 }
1446
1447 /* New threads inherit any default state on the task */
1448 machine_thread_inherit_taskwide(new_thread, parent_task);
1449
1450 task_reference_internal(parent_task);
1451
1452 if (new_thread->task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_PERTHR_LIMIT) {
1453 /*
1454 * This task has a per-thread CPU limit; make sure this new thread
1455 * gets its limit set too, before it gets out of the kernel.
1456 */
1457 act_set_astledger(new_thread);
1458 }
1459
1460 /* Instantiate a thread ledger. Do not fail thread creation if ledger creation fails. */
1461 if ((new_thread->t_threadledger = ledger_instantiate(thread_ledger_template,
1462 LEDGER_CREATE_INACTIVE_ENTRIES)) != LEDGER_NULL) {
1463
1464 ledger_entry_setactive(new_thread->t_threadledger, thread_ledgers.cpu_time);
1465 }
1466
1467 new_thread->t_bankledger = LEDGER_NULL;
1468 new_thread->t_deduct_bank_ledger_time = 0;
1469 new_thread->t_deduct_bank_ledger_energy = 0;
1470
1471 new_thread->t_ledger = new_thread->task->ledger;
1472 if (new_thread->t_ledger)
1473 ledger_reference(new_thread->t_ledger);
1474
1475#if defined(CONFIG_SCHED_MULTIQ)
1476 /* Cache the task's sched_group */
1477 new_thread->sched_group = parent_task->sched_group;
1478#endif /* defined(CONFIG_SCHED_MULTIQ) */
1479
1480 /* Cache the task's map */
1481 new_thread->map = parent_task->map;
1482
1483 timer_call_setup(&new_thread->wait_timer, thread_timer_expire, new_thread);
1484 timer_call_setup(&new_thread->depress_timer, thread_depress_expire, new_thread);
1485
1486#if KPC
1487 kpc_thread_create(new_thread);
1488#endif
1489
1490 /* Set the thread's scheduling parameters */
1491 new_thread->sched_mode = SCHED(initial_thread_sched_mode)(parent_task);
1492 new_thread->max_priority = parent_task->max_priority;
1493 new_thread->task_priority = parent_task->priority;
1494
1495 int new_priority = (priority < 0) ? parent_task->priority : priority;
1497 if (new_priority > new_thread->max_priority)
1498 new_priority = new_thread->max_priority;
1499#if CONFIG_EMBEDDED
1500 if (new_priority < MAXPRI_THROTTLE) {
1501 new_priority = MAXPRI_THROTTLE;
1502 }
1503#endif /* CONFIG_EMBEDDED */
1504
1505 new_thread->importance = new_priority - new_thread->task_priority;
1506
1507 sched_set_thread_base_priority(new_thread, new_priority);
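
 /*
  * Illustrative example of the priority selection above: with a parent task
  * priority of 31 and max_priority of 63, a caller passing -1 inherits 31
  * (importance 0), while a caller passing 80 is clamped to 63 (importance 32).
  * On embedded configurations the result is additionally floored at
  * MAXPRI_THROTTLE.
  */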
1508
1509#if defined(CONFIG_SCHED_TIMESHARE_CORE)
1510 new_thread->sched_stamp = sched_tick;
1511 new_thread->pri_shift = sched_pri_shifts[new_thread->th_sched_bucket];
1512#endif /* defined(CONFIG_SCHED_TIMESHARE_CORE) */
1513
1514#if CONFIG_EMBEDDED
1515 if (parent_task->max_priority <= MAXPRI_THROTTLE)
1516 sched_thread_mode_demote(new_thread, TH_SFLAG_THROTTLED);
1517#endif /* CONFIG_EMBEDDED */
1518
1519 thread_policy_create(new_thread);
1520
1521 /* Chain the thread onto the task's list */
1522 queue_enter(&parent_task->threads, new_thread, thread_t, task_threads);
1523 parent_task->thread_count++;
1524
1525 /* So terminating threads don't need to take the task lock to decrement */
1526 hw_atomic_add(&parent_task->active_thread_count, 1);
1527
1528 /* Protected by the tasks_threads_lock */
1529 new_thread->thread_id = ++thread_unique_id;
1530
1531
1532 queue_enter(&threads, new_thread, thread_t, threads);
1533 threads_count++;
1534
1535 new_thread->active = TRUE;
1536 if (task_is_a_corpse_fork(parent_task)) {
1537 /* Set the inspection bit if the task is a corpse fork */
1538 new_thread->inspection = TRUE;
1539 } else {
1540 new_thread->inspection = FALSE;
1541 }
1542 new_thread->corpse_dup = FALSE;
1543 new_thread->turnstile = turnstile_alloc();
1544 *out_thread = new_thread;
1545
1546 if (kdebug_enable) {
1547 long args[4] = {};
1548
1549 kdbg_trace_data(parent_task->bsd_info, &args[1], &args[3]);
1550
1551 /*
1552 * Starting with 26604425, exec'ing creates a new task/thread.
1553 *
1554 * NEWTHREAD in the current process has two possible meanings:
1555 *
1556 * 1) Create a new thread for this process.
1557 * 2) Create a new thread for the future process this will become in an
1558 * exec.
1559 *
1560 * To disambiguate these, arg3 will be set to TRUE for case #2.
1561 *
1562 * The value we need to find (TPF_EXEC_COPY) is stable in the case of a
1563 * task exec'ing. The read of t_procflags does not take the proc_lock.
1564 */
1565 args[2] = task_is_exec_copy(parent_task) ? 1 : 0;
1566
1567 KDBG_RELEASE(TRACE_DATA_NEWTHREAD, (uintptr_t)thread_tid(new_thread),
1568 args[1], args[2], args[3]);
1569
1570 kdbg_trace_string(parent_task->bsd_info, &args[0], &args[1],
1571 &args[2], &args[3]);
1572 KDBG_RELEASE(TRACE_STRING_NEWTHREAD, args[0], args[1], args[2],
1573 args[3]);
1574 }
1575
1576 DTRACE_PROC1(lwp__create, thread_t, *out_thread);
1577
1578 return (KERN_SUCCESS);
1579}
1580
1581static kern_return_t
1582thread_create_internal2(
1583 task_t task,
1584 thread_t *new_thread,
1585 boolean_t from_user,
1586 thread_continue_t continuation)
1587{
1588 kern_return_t result;
1589 thread_t thread;
1590
1591 if (task == TASK_NULL || task == kernel_task)
1592 return (KERN_INVALID_ARGUMENT);
1593
1594 result = thread_create_internal(task, -1, continuation, NULL, TH_OPTION_NONE, &thread);
1595 if (result != KERN_SUCCESS)
1596 return (result);
1597
1598 thread->user_stop_count = 1;
1599 thread_hold(thread);
1600 if (task->suspend_count > 0)
1601 thread_hold(thread);
1602
1603 if (from_user)
1604 extmod_statistics_incr_thread_create(task);
1605
1606 task_unlock(task);
1607 lck_mtx_unlock(&tasks_threads_lock);
1608
1609 *new_thread = thread;
1610
1611 return (KERN_SUCCESS);
1612}
1613
1614/* No prototype, since task_server.h has the _from_user version if KERNEL_SERVER */
1615kern_return_t
1616thread_create(
1617 task_t task,
1618 thread_t *new_thread);
1619
1620kern_return_t
1621thread_create(
1622 task_t task,
1623 thread_t *new_thread)
1624{
1625 return thread_create_internal2(task, new_thread, FALSE, (thread_continue_t)thread_bootstrap_return);
1626}
1627
1628kern_return_t
1629thread_create_from_user(
1630 task_t task,
1631 thread_t *new_thread)
1632{
1633 return thread_create_internal2(task, new_thread, TRUE, (thread_continue_t)thread_bootstrap_return);
1634}
1635
1636kern_return_t
1637thread_create_with_continuation(
1638 task_t task,
1639 thread_t *new_thread,
1640 thread_continue_t continuation)
1641{
1642 return thread_create_internal2(task, new_thread, FALSE, continuation);
1643}
1644
1645/*
1646 * Create a thread that is already started, but is waiting on an event
1647 */
1648static kern_return_t
1649thread_create_waiting_internal(
1650 task_t task,
1651 thread_continue_t continuation,
1652 event_t event,
1653 block_hint_t block_hint,
1654 int options,
1655 thread_t *new_thread)
1656{
1657 kern_return_t result;
1658 thread_t thread;
1659
1660 if (task == TASK_NULL || task == kernel_task)
1661 return (KERN_INVALID_ARGUMENT);
1662
1663 result = thread_create_internal(task, -1, continuation, NULL,
1664 options, &thread);
1665 if (result != KERN_SUCCESS)
1666 return (result);
1667
1668 /* note no user_stop_count or thread_hold here */
1669
1670 if (task->suspend_count > 0)
1671 thread_hold(thread);
1672
1673 thread_mtx_lock(thread);
1674 thread_set_pending_block_hint(thread, block_hint);
1675 if (options & TH_OPTION_WORKQ) {
1676 thread->static_param = true;
1677 event = workq_thread_init_and_wq_lock(task, thread);
1678 }
1679 thread_start_in_assert_wait(thread, event, THREAD_INTERRUPTIBLE);
1680 thread_mtx_unlock(thread);
1681
1682 task_unlock(task);
1683 lck_mtx_unlock(&tasks_threads_lock);
1684
1685 *new_thread = thread;
1686
1687 return (KERN_SUCCESS);
1688}
1689
1690kern_return_t
1691thread_create_waiting(
1692 task_t task,
1693 thread_continue_t continuation,
1694 event_t event,
1695 thread_t *new_thread)
1696{
1697 return thread_create_waiting_internal(task, continuation, event,
1698 kThreadWaitNone, TH_OPTION_NONE, new_thread);
1699}
1700
1701
1702static kern_return_t
1703thread_create_running_internal2(
1704 task_t task,
1705 int flavor,
1706 thread_state_t new_state,
1707 mach_msg_type_number_t new_state_count,
1708 thread_t *new_thread,
1709 boolean_t from_user)
1710{
1711 kern_return_t result;
1712 thread_t thread;
1713
1714 if (task == TASK_NULL || task == kernel_task)
1715 return (KERN_INVALID_ARGUMENT);
1716
1717 result = thread_create_internal(task, -1,
1718 (thread_continue_t)thread_bootstrap_return, NULL,
1719 TH_OPTION_NONE, &thread);
1720 if (result != KERN_SUCCESS)
1721 return (result);
1722
1723 if (task->suspend_count > 0)
1724 thread_hold(thread);
1725
1726 if (from_user) {
1727 result = machine_thread_state_convert_from_user(thread, flavor,
1728 new_state, new_state_count);
1729 }
1730 if (result == KERN_SUCCESS) {
1731 result = machine_thread_set_state(thread, flavor, new_state,
1732 new_state_count);
1733 }
1734 if (result != KERN_SUCCESS) {
1735 task_unlock(task);
1736 lck_mtx_unlock(&tasks_threads_lock);
1737
1738 thread_terminate(thread);
1739 thread_deallocate(thread);
1740 return (result);
1741 }
1742
1743 thread_mtx_lock(thread);
1744 thread_start(thread);
1745 thread_mtx_unlock(thread);
1746
1747 if (from_user)
1748 extmod_statistics_incr_thread_create(task);
1749
1750 task_unlock(task);
1751 lck_mtx_unlock(&tasks_threads_lock);
1752
1753 *new_thread = thread;
1754
1755 return (result);
1756}
1757
1758/* Prototype, see justification above */
1759kern_return_t
1760thread_create_running(
1761 task_t task,
1762 int flavor,
1763 thread_state_t new_state,
1764 mach_msg_type_number_t new_state_count,
1765 thread_t *new_thread);
1766
1767kern_return_t
1768thread_create_running(
1769 task_t task,
1770 int flavor,
1771 thread_state_t new_state,
1772 mach_msg_type_number_t new_state_count,
1773 thread_t *new_thread)
1774{
1775 return thread_create_running_internal2(
1776 task, flavor, new_state, new_state_count,
1777 new_thread, FALSE);
1778}
1779
1780kern_return_t
1781thread_create_running_from_user(
1782 task_t task,
1783 int flavor,
1784 thread_state_t new_state,
1785 mach_msg_type_number_t new_state_count,
1786 thread_t *new_thread)
1787{
1788 return thread_create_running_internal2(
1789 task, flavor, new_state, new_state_count,
1790 new_thread, TRUE);
1791}
1792
1793kern_return_t
1794thread_create_workq_waiting(
1795 task_t task,
1796 thread_continue_t continuation,
1797 thread_t *new_thread)
1798{
1799 int options = TH_OPTION_NOCRED | TH_OPTION_NOSUSP | TH_OPTION_WORKQ;
1800 return thread_create_waiting_internal(task, continuation, NULL,
1801 kThreadWaitParkedWorkQueue, options, new_thread);
1802}
1803
1804/*
1805 * kernel_thread_create:
1806 *
1807 * Create a thread in the kernel task
1808 * to execute in kernel context.
1809 */
1810kern_return_t
1811kernel_thread_create(
1812 thread_continue_t continuation,
1813 void *parameter,
1814 integer_t priority,
1815 thread_t *new_thread)
1816{
1817 kern_return_t result;
1818 thread_t thread;
1819 task_t task = kernel_task;
1820
1821 result = thread_create_internal(task, priority, continuation, parameter,
1822 TH_OPTION_NOCRED | TH_OPTION_NONE, &thread);
1823 if (result != KERN_SUCCESS)
1824 return (result);
1825
1826 task_unlock(task);
1827 lck_mtx_unlock(&tasks_threads_lock);
1828
1829 stack_alloc(thread);
1830 assert(thread->kernel_stack != 0);
1831#if CONFIG_EMBEDDED
1832 if (priority > BASEPRI_KERNEL)
1833#endif
1834 thread->reserved_stack = thread->kernel_stack;
1835
1836 if (debug_task & 1)
1837 kprintf("kernel_thread_create: thread = %p continuation = %p\n", thread, continuation);
1838 *new_thread = thread;
1839
1840 return (result);
1841}
1842
1843kern_return_t
1844kernel_thread_start_priority(
1845 thread_continue_t continuation,
1846 void *parameter,
1847 integer_t priority,
1848 thread_t *new_thread)
1849{
1850 kern_return_t result;
1851 thread_t thread;
1852
1853 result = kernel_thread_create(continuation, parameter, priority, &thread);
1854 if (result != KERN_SUCCESS)
1855 return (result);
1856
1857 *new_thread = thread;
1858
1859 thread_mtx_lock(thread);
1860 thread_start(thread);
1861 thread_mtx_unlock(thread);
1862
1863 return (result);
1864}
1865
1866kern_return_t
1867kernel_thread_start(
1868 thread_continue_t continuation,
1869 void *parameter,
1870 thread_t *new_thread)
1871{
1872 return kernel_thread_start_priority(continuation, parameter, -1, new_thread);
1873}
1874
1875/* Separated into helper function so it can be used by THREAD_BASIC_INFO and THREAD_EXTENDED_INFO */
1876/* it is assumed that the thread is locked by the caller */
1877static void
1878retrieve_thread_basic_info(thread_t thread, thread_basic_info_t basic_info)
1879{
1880 int state, flags;
1881
1882 /* fill in info */
1883
1884 thread_read_times(thread, &basic_info->user_time,
1885 &basic_info->system_time, NULL);
1886
1887 /*
1888 * Update lazy-evaluated scheduler info because someone wants it.
1889 */
1890 if (SCHED(can_update_priority)(thread))
1891 SCHED(update_priority)(thread);
1892
1893 basic_info->sleep_time = 0;
1894
1895 /*
1896 * To calculate cpu_usage, first correct for timer rate,
1897 * then for 5/8 ageing. The correction factor [3/5] is
1898 * (1/(5/8) - 1).
1899 */
1900 basic_info->cpu_usage = 0;
1901#if defined(CONFIG_SCHED_TIMESHARE_CORE)
1902 if (sched_tick_interval) {
1903 basic_info->cpu_usage = (integer_t)(((uint64_t)thread->cpu_usage
1904 * TH_USAGE_SCALE) / sched_tick_interval);
1905 basic_info->cpu_usage = (basic_info->cpu_usage * 3) / 5;
1906 }
1907#endif
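
 /*
  * Illustrative example of the scaling above: TH_USAGE_SCALE corresponds to
  * 100% usage, so a thread whose aged cpu_usage equals half of
  * sched_tick_interval reports roughly TH_USAGE_SCALE / 2 before the 3/5
  * ageing correction is applied, and the result is capped at TH_USAGE_SCALE
  * below.
  */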
1908
1909 if (basic_info->cpu_usage > TH_USAGE_SCALE)
1910 basic_info->cpu_usage = TH_USAGE_SCALE;
1911
1912 basic_info->policy = ((thread->sched_mode == TH_MODE_TIMESHARE)?
1913 POLICY_TIMESHARE: POLICY_RR);
1914
1915 flags = 0;
1916 if (thread->options & TH_OPT_IDLE_THREAD)
1917 flags |= TH_FLAGS_IDLE;
1918
1919 if (thread->options & TH_OPT_GLOBAL_FORCED_IDLE) {
1920 flags |= TH_FLAGS_GLOBAL_FORCED_IDLE;
1921 }
1922
1923 if (!thread->kernel_stack)
1924 flags |= TH_FLAGS_SWAPPED;
1925
1926 state = 0;
1927 if (thread->state & TH_TERMINATE)
1928 state = TH_STATE_HALTED;
1929 else
1930 if (thread->state & TH_RUN)
1931 state = TH_STATE_RUNNING;
1932 else
1933 if (thread->state & TH_UNINT)
1934 state = TH_STATE_UNINTERRUPTIBLE;
1935 else
1936 if (thread->state & TH_SUSP)
1937 state = TH_STATE_STOPPED;
1938 else
1939 if (thread->state & TH_WAIT)
1940 state = TH_STATE_WAITING;
1941
1942 basic_info->run_state = state;
1943 basic_info->flags = flags;
1944
1945 basic_info->suspend_count = thread->user_stop_count;
1946
1947 return;
1948}
1949
1950kern_return_t
1951thread_info_internal(
1952 thread_t thread,
1953 thread_flavor_t flavor,
1954 thread_info_t thread_info_out, /* ptr to OUT array */
1955 mach_msg_type_number_t *thread_info_count) /*IN/OUT*/
1956{
1957 spl_t s;
1958
1959 if (thread == THREAD_NULL)
1960 return (KERN_INVALID_ARGUMENT);
1961
1962 if (flavor == THREAD_BASIC_INFO) {
1963
1964 if (*thread_info_count < THREAD_BASIC_INFO_COUNT)
1965 return (KERN_INVALID_ARGUMENT);
1966
1967 s = splsched();
1968 thread_lock(thread);
1969
1970 retrieve_thread_basic_info(thread, (thread_basic_info_t) thread_info_out);
1971
1972 thread_unlock(thread);
1973 splx(s);
1974
1975 *thread_info_count = THREAD_BASIC_INFO_COUNT;
1976
1977 return (KERN_SUCCESS);
1978 }
1979 else
1980 if (flavor == THREAD_IDENTIFIER_INFO) {
1981 thread_identifier_info_t identifier_info;
1982
1983 if (*thread_info_count < THREAD_IDENTIFIER_INFO_COUNT)
1984 return (KERN_INVALID_ARGUMENT);
1985
1986 identifier_info = (thread_identifier_info_t) thread_info_out;
1987
1988 s = splsched();
1989 thread_lock(thread);
1990
1991 identifier_info->thread_id = thread->thread_id;
1992 identifier_info->thread_handle = thread->machine.cthread_self;
1993 identifier_info->dispatch_qaddr = thread_dispatchqaddr(thread);
1994
1995 thread_unlock(thread);
1996 splx(s);
1997 return KERN_SUCCESS;
1998 }
1999 else
2000 if (flavor == THREAD_SCHED_TIMESHARE_INFO) {
2001 policy_timeshare_info_t ts_info;
2002
2003 if (*thread_info_count < POLICY_TIMESHARE_INFO_COUNT)
2004 return (KERN_INVALID_ARGUMENT);
2005
2006 ts_info = (policy_timeshare_info_t)thread_info_out;
2007
2008 s = splsched();
2009 thread_lock(thread);
2010
2011 if (thread->sched_mode != TH_MODE_TIMESHARE) {
2012 thread_unlock(thread);
2013 splx(s);
2014 return (KERN_INVALID_POLICY);
2015 }
2016
2017 ts_info->depressed = (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) != 0;
2018 if (ts_info->depressed) {
2019 ts_info->base_priority = DEPRESSPRI;
2020 ts_info->depress_priority = thread->base_pri;
2021 }
2022 else {
2023 ts_info->base_priority = thread->base_pri;
2024 ts_info->depress_priority = -1;
2025 }
2026
2027 ts_info->cur_priority = thread->sched_pri;
2028 ts_info->max_priority = thread->max_priority;
2029
2030 thread_unlock(thread);
2031 splx(s);
2032
2033 *thread_info_count = POLICY_TIMESHARE_INFO_COUNT;
2034
2035 return (KERN_SUCCESS);
2036 }
2037 else
2038 if (flavor == THREAD_SCHED_FIFO_INFO) {
2039 if (*thread_info_count < POLICY_FIFO_INFO_COUNT)
2040 return (KERN_INVALID_ARGUMENT);
2041
2042 return (KERN_INVALID_POLICY);
2043 }
2044 else
2045 if (flavor == THREAD_SCHED_RR_INFO) {
2046 policy_rr_info_t rr_info;
2047 uint32_t quantum_time;
2048 uint64_t quantum_ns;
2049
2050 if (*thread_info_count < POLICY_RR_INFO_COUNT)
2051 return (KERN_INVALID_ARGUMENT);
2052
2053 rr_info = (policy_rr_info_t) thread_info_out;
2054
2055 s = splsched();
2056 thread_lock(thread);
2057
2058 if (thread->sched_mode == TH_MODE_TIMESHARE) {
2059 thread_unlock(thread);
2060 splx(s);
2061
2062 return (KERN_INVALID_POLICY);
2063 }
2064
2065 rr_info->depressed = (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) != 0;
2066 if (rr_info->depressed) {
2067 rr_info->base_priority = DEPRESSPRI;
2068 rr_info->depress_priority = thread->base_pri;
2069 }
2070 else {
2071 rr_info->base_priority = thread->base_pri;
2072 rr_info->depress_priority = -1;
2073 }
2074
2075 quantum_time = SCHED(initial_quantum_size)(THREAD_NULL);
2076 absolutetime_to_nanoseconds(quantum_time, &quantum_ns);
2077
2078 rr_info->max_priority = thread->max_priority;
2079 rr_info->quantum = (uint32_t)(quantum_ns / 1000 / 1000);
2080
2081 thread_unlock(thread);
2082 splx(s);
2083
2084 *thread_info_count = POLICY_RR_INFO_COUNT;
2085
2086 return (KERN_SUCCESS);
2087 }
2088 else
2089 if (flavor == THREAD_EXTENDED_INFO) {
2090 thread_basic_info_data_t basic_info;
2091 thread_extended_info_t extended_info = (thread_extended_info_t) thread_info_out;
2092
2093 if (*thread_info_count < THREAD_EXTENDED_INFO_COUNT) {
2094 return (KERN_INVALID_ARGUMENT);
2095 }
2096
2097 s = splsched();
2098 thread_lock(thread);
2099
2100 /* NOTE: This mimics fill_taskthreadinfo(), which is the function used by proc_pidinfo() for
2101 * the PROC_PIDTHREADINFO flavor (which can't be used on corpses)
2102 */
2103 retrieve_thread_basic_info(thread, &basic_info);
2104 extended_info->pth_user_time = ((basic_info.user_time.seconds * (integer_t)NSEC_PER_SEC) + (basic_info.user_time.microseconds * (integer_t)NSEC_PER_USEC));
2105 extended_info->pth_system_time = ((basic_info.system_time.seconds * (integer_t)NSEC_PER_SEC) + (basic_info.system_time.microseconds * (integer_t)NSEC_PER_USEC));
2106
2107 extended_info->pth_cpu_usage = basic_info.cpu_usage;
2108 extended_info->pth_policy = basic_info.policy;
2109 extended_info->pth_run_state = basic_info.run_state;
2110 extended_info->pth_flags = basic_info.flags;
2111 extended_info->pth_sleep_time = basic_info.sleep_time;
2112 extended_info->pth_curpri = thread->sched_pri;
2113 extended_info->pth_priority = thread->base_pri;
2114 extended_info->pth_maxpriority = thread->max_priority;
2115
		bsd_getthreadname(thread->uthread, extended_info->pth_name);
2117
2118 thread_unlock(thread);
2119 splx(s);
2120
2121 *thread_info_count = THREAD_EXTENDED_INFO_COUNT;
2122
2123 return (KERN_SUCCESS);
2124 }
2125 else
2126 if (flavor == THREAD_DEBUG_INFO_INTERNAL) {
2127#if DEVELOPMENT || DEBUG
2128 thread_debug_info_internal_t dbg_info;
2129 if (*thread_info_count < THREAD_DEBUG_INFO_INTERNAL_COUNT)
2130 return (KERN_NOT_SUPPORTED);
2131
2132 if (thread_info_out == NULL)
2133 return (KERN_INVALID_ARGUMENT);
2134
2135 dbg_info = (thread_debug_info_internal_t) thread_info_out;
2136 dbg_info->page_creation_count = thread->t_page_creation_count;
2137
2138 *thread_info_count = THREAD_DEBUG_INFO_INTERNAL_COUNT;
2139 return (KERN_SUCCESS);
2140#endif /* DEVELOPMENT || DEBUG */
2141 return (KERN_NOT_SUPPORTED);
2142 }
2143
2144 return (KERN_INVALID_ARGUMENT);
2145}
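/*
 * Illustrative usage (a minimal sketch, not part of this file): the
 * THREAD_BASIC_INFO branch above is what ultimately services a user-space
 * query such as:
 *
 *	thread_basic_info_data_t info;
 *	mach_msg_type_number_t count = THREAD_BASIC_INFO_COUNT;
 *	kern_return_t kr = thread_info(mach_thread_self(), THREAD_BASIC_INFO,
 *	    (thread_info_t)&info, &count);
 *
 * On success, count is updated to THREAD_BASIC_INFO_COUNT and info holds the
 * snapshot produced by retrieve_thread_basic_info().
 */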
2146
2147void
2148thread_read_times(
2149 thread_t thread,
2150 time_value_t *user_time,
2151 time_value_t *system_time,
2152 time_value_t *runnable_time)
2153{
2154 clock_sec_t secs;
2155 clock_usec_t usecs;
2156 uint64_t tval_user, tval_system;
2157
2158 tval_user = timer_grab(&thread->user_timer);
2159 tval_system = timer_grab(&thread->system_timer);
2160
2161 if (thread->precise_user_kernel_time) {
2162 absolutetime_to_microtime(tval_user, &secs, &usecs);
2163 user_time->seconds = (typeof(user_time->seconds))secs;
2164 user_time->microseconds = usecs;
2165
2166 absolutetime_to_microtime(tval_system, &secs, &usecs);
2167 system_time->seconds = (typeof(system_time->seconds))secs;
2168 system_time->microseconds = usecs;
2169 } else {
2170 /* system_timer may represent either sys or user */
2171 tval_user += tval_system;
2172 absolutetime_to_microtime(tval_user, &secs, &usecs);
2173 user_time->seconds = (typeof(user_time->seconds))secs;
2174 user_time->microseconds = usecs;
2175
2176 system_time->seconds = 0;
2177 system_time->microseconds = 0;
2178 }
2179
2180 if (runnable_time) {
2181 uint64_t tval_runnable = timer_grab(&thread->runnable_timer);
2182 absolutetime_to_microtime(tval_runnable, &secs, &usecs);
2183 runnable_time->seconds = (typeof(runnable_time->seconds))secs;
2184 runnable_time->microseconds = usecs;
2185 }
2186}
2187
2188uint64_t thread_get_runtime_self(void)
2189{
2190 boolean_t interrupt_state;
2191 uint64_t runtime;
2192 thread_t thread = NULL;
2193 processor_t processor = NULL;
2194
2195 thread = current_thread();
2196
2197 /* Not interrupt safe, as the scheduler may otherwise update timer values underneath us */
2198 interrupt_state = ml_set_interrupts_enabled(FALSE);
2199 processor = current_processor();
2200 timer_update(PROCESSOR_DATA(processor, thread_timer), mach_absolute_time());
2201 runtime = (timer_grab(&thread->user_timer) + timer_grab(&thread->system_timer));
2202 ml_set_interrupts_enabled(interrupt_state);
2203
2204 return runtime;
2205}
2206
2207kern_return_t
2208thread_assign(
2209 __unused thread_t thread,
2210 __unused processor_set_t new_pset)
2211{
2212 return (KERN_FAILURE);
2213}
2214
2215/*
2216 * thread_assign_default:
2217 *
2218 * Special version of thread_assign for assigning threads to default
2219 * processor set.
2220 */
2221kern_return_t
2222thread_assign_default(
2223 thread_t thread)
2224{
2225 return (thread_assign(thread, &pset0));
2226}
2227
2228/*
2229 * thread_get_assignment
2230 *
2231 * Return current assignment for this thread.
2232 */
2233kern_return_t
2234thread_get_assignment(
2235 thread_t thread,
2236 processor_set_t *pset)
2237{
2238 if (thread == NULL)
2239 return (KERN_INVALID_ARGUMENT);
2240
2241 *pset = &pset0;
2242
2243 return (KERN_SUCCESS);
2244}
2245
2246/*
2247 * thread_wire_internal:
2248 *
2249 * Specify that the target thread must always be able
2250 * to run and to allocate memory.
2251 */
2252kern_return_t
2253thread_wire_internal(
2254 host_priv_t host_priv,
2255 thread_t thread,
2256 boolean_t wired,
2257 boolean_t *prev_state)
2258{
2259 if (host_priv == NULL || thread != current_thread())
2260 return (KERN_INVALID_ARGUMENT);
2261
2262 assert(host_priv == &realhost);
2263
2264 if (prev_state)
2265 *prev_state = (thread->options & TH_OPT_VMPRIV) != 0;
2266
2267 if (wired) {
2268 if (!(thread->options & TH_OPT_VMPRIV))
2269 vm_page_free_reserve(1); /* XXX */
2270 thread->options |= TH_OPT_VMPRIV;
2271 }
2272 else {
2273 if (thread->options & TH_OPT_VMPRIV)
2274 vm_page_free_reserve(-1); /* XXX */
2275 thread->options &= ~TH_OPT_VMPRIV;
2276 }
2277
2278 return (KERN_SUCCESS);
2279}
2280
2281
2282/*
2283 * thread_wire:
2284 *
2285 * User-api wrapper for thread_wire_internal()
2286 */
2287kern_return_t
2288thread_wire(
2289 host_priv_t host_priv,
2290 thread_t thread,
2291 boolean_t wired)
2292{
2293 return (thread_wire_internal(host_priv, thread, wired, NULL));
2294}
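/*
 * Illustrative usage (a minimal sketch, not part of this file): a kernel
 * caller that must never block on memory wires itself, e.g.:
 *
 *	thread_wire(host_priv_self(), current_thread(), TRUE);
 *
 * Since thread_wire_internal() rejects any thread other than the caller,
 * the thread argument is always current_thread() in practice.
 */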
2295
2296
2297boolean_t
2298is_vm_privileged(void)
2299{
2300 return current_thread()->options & TH_OPT_VMPRIV ? TRUE : FALSE;
2301}
2302
2303boolean_t
2304set_vm_privilege(boolean_t privileged)
2305{
2306 boolean_t was_vmpriv;
2307
2308 if (current_thread()->options & TH_OPT_VMPRIV)
2309 was_vmpriv = TRUE;
2310 else
2311 was_vmpriv = FALSE;
2312
2313 if (privileged != FALSE)
2314 current_thread()->options |= TH_OPT_VMPRIV;
2315 else
2316 current_thread()->options &= ~TH_OPT_VMPRIV;
2317
2318 return (was_vmpriv);
2319}
2320
2321void
2322set_thread_rwlock_boost(void)
2323{
2324 current_thread()->rwlock_count++;
2325}
2326
2327void
2328clear_thread_rwlock_boost(void)
2329{
2330 thread_t thread = current_thread();
2331
	if ((thread->rwlock_count-- == 1) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
		lck_rw_clear_promotion(thread, 0);
	}
2336}
2337
2338
2339/*
2340 * XXX assuming current thread only, for now...
2341 */
2342void
2343thread_guard_violation(thread_t thread,
2344 mach_exception_data_type_t code, mach_exception_data_type_t subcode)
2345{
2346 assert(thread == current_thread());
2347
2348 /* don't set up the AST for kernel threads */
2349 if (thread->task == kernel_task)
2350 return;
2351
2352 spl_t s = splsched();
2353 /*
2354 * Use the saved state area of the thread structure
2355 * to store all info required to handle the AST when
2356 * returning to userspace
2357 */
2358 assert(EXC_GUARD_DECODE_GUARD_TYPE(code));
2359 thread->guard_exc_info.code = code;
2360 thread->guard_exc_info.subcode = subcode;
2361 thread_ast_set(thread, AST_GUARD);
2362 ast_propagate(thread);
2363
2364 splx(s);
2365}
2366
2367/*
2368 * guard_ast:
2369 *
2370 * Handle AST_GUARD for a thread. This routine looks at the
2371 * state saved in the thread structure to determine the cause
2372 * of this exception. Based on this value, it invokes the
2373 * appropriate routine which determines other exception related
2374 * info and raises the exception.
2375 */
2376void
2377guard_ast(thread_t t)
2378{
2379 const mach_exception_data_type_t
2380 code = t->guard_exc_info.code,
2381 subcode = t->guard_exc_info.subcode;
2382
2383 t->guard_exc_info.code = 0;
2384 t->guard_exc_info.subcode = 0;
2385
2386 switch (EXC_GUARD_DECODE_GUARD_TYPE(code)) {
2387 case GUARD_TYPE_NONE:
2388 /* lingering AST_GUARD on the processor? */
2389 break;
2390 case GUARD_TYPE_MACH_PORT:
2391 mach_port_guard_ast(t, code, subcode);
2392 break;
2393 case GUARD_TYPE_FD:
2394 fd_guard_ast(t, code, subcode);
2395 break;
2396#if CONFIG_VNGUARD
2397 case GUARD_TYPE_VN:
2398 vn_guard_ast(t, code, subcode);
2399 break;
2400#endif
2401 case GUARD_TYPE_VIRT_MEMORY:
2402 virt_memory_guard_ast(t, code, subcode);
2403 break;
2404 default:
2405 panic("guard_exc_info %llx %llx", code, subcode);
2406 }
2407}
2408
2409static void
2410thread_cputime_callback(int warning, __unused const void *arg0, __unused const void *arg1)
2411{
2412 if (warning == LEDGER_WARNING_ROSE_ABOVE) {
2413#if CONFIG_TELEMETRY
2414 /*
2415 * This thread is in danger of violating the CPU usage monitor. Enable telemetry
2416 * on the entire task so there are micro-stackshots available if and when
2417 * EXC_RESOURCE is triggered. We could have chosen to enable micro-stackshots
2418 * for this thread only; but now that this task is suspect, knowing what all of
2419 * its threads are up to will be useful.
2420 */
2421 telemetry_task_ctl(current_task(), TF_CPUMON_WARNING, 1);
2422#endif
2423 return;
2424 }
2425
2426#if CONFIG_TELEMETRY
2427 /*
2428 * If the balance has dipped below the warning level (LEDGER_WARNING_DIPPED_BELOW) or
2429 * exceeded the limit, turn telemetry off for the task.
2430 */
2431 telemetry_task_ctl(current_task(), TF_CPUMON_WARNING, 0);
2432#endif
2433
2434 if (warning == 0) {
2435 SENDING_NOTIFICATION__THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU();
2436 }
2437}
2438
2439void __attribute__((noinline))
2440SENDING_NOTIFICATION__THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU(void)
2441{
2442 int pid = 0;
2443 task_t task = current_task();
2444 thread_t thread = current_thread();
2445 uint64_t tid = thread->thread_id;
2446 const char *procname = "unknown";
2447 time_value_t thread_total_time = {0, 0};
2448 time_value_t thread_system_time;
2449 time_value_t thread_user_time;
2450 int action;
2451 uint8_t percentage;
2452 uint32_t usage_percent = 0;
2453 uint32_t interval_sec;
2454 uint64_t interval_ns;
2455 uint64_t balance_ns;
2456 boolean_t fatal = FALSE;
2457 boolean_t send_exc_resource = TRUE; /* in addition to RESOURCE_NOTIFY */
2458 kern_return_t kr;
2459
2460#ifdef EXC_RESOURCE_MONITORS
2461 mach_exception_data_type_t code[EXCEPTION_CODE_MAX];
2462#endif /* EXC_RESOURCE_MONITORS */
2463 struct ledger_entry_info lei;
2464
2465 assert(thread->t_threadledger != LEDGER_NULL);
2466
2467 /*
2468 * Extract the fatal bit and suspend the monitor (which clears the bit).
2469 */
2470 task_lock(task);
2471 if (task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_FATAL_CPUMON) {
2472 fatal = TRUE;
2473 send_exc_resource = TRUE;
2474 }
2475 /* Only one thread can be here at a time. Whichever makes it through
2476 first will successfully suspend the monitor and proceed to send the
2477 notification. Other threads will get an error trying to suspend the
2478 monitor and give up on sending the notification. In the first release,
2479 the monitor won't be resumed for a number of seconds, but we may
2480 eventually need to handle low-latency resume.
2481 */
2482 kr = task_suspend_cpumon(task);
2483 task_unlock(task);
	if (kr == KERN_INVALID_ARGUMENT)
		return;
2485
2486#ifdef MACH_BSD
2487 pid = proc_selfpid();
2488 if (task->bsd_info != NULL) {
2489 procname = proc_name_address(task->bsd_info);
2490 }
2491#endif
2492
2493 thread_get_cpulimit(&action, &percentage, &interval_ns);
2494
2495 interval_sec = (uint32_t)(interval_ns / NSEC_PER_SEC);
2496
2497 thread_read_times(thread, &thread_user_time, &thread_system_time, NULL);
2498 time_value_add(&thread_total_time, &thread_user_time);
2499 time_value_add(&thread_total_time, &thread_system_time);
2500 ledger_get_entry_info(thread->t_threadledger, thread_ledgers.cpu_time, &lei);
2501
2502 /* credit/debit/balance/limit are in absolute time units;
2503 the refill info is in nanoseconds. */
2504 absolutetime_to_nanoseconds(lei.lei_balance, &balance_ns);
2505 if (lei.lei_last_refill > 0) {
2506 usage_percent = (uint32_t)((balance_ns*100ULL) / lei.lei_last_refill);
2507 }
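	/*
	 * Worked example: a balance of 3,000,000,000 ns against a last refill
	 * of 6,000,000,000 ns yields (3e9 * 100) / 6e9 = 50 percent.
	 */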
2508
2509 /* TODO: show task total runtime (via TASK_ABSOLUTETIME_INFO)? */
2510 printf("process %s[%d] thread %llu caught burning CPU! "
2511 "It used more than %d%% CPU over %u seconds "
2512 "(actual recent usage: %d%% over ~%llu seconds). "
2513 "Thread lifetime cpu usage %d.%06ds, (%d.%06d user, %d.%06d sys) "
2514 "ledger balance: %lld mabs credit: %lld mabs debit: %lld mabs "
2515 "limit: %llu mabs period: %llu ns last refill: %llu ns%s.\n",
2516 procname, pid, tid,
2517 percentage, interval_sec,
2518 usage_percent,
2519 (lei.lei_last_refill + NSEC_PER_SEC/2) / NSEC_PER_SEC,
2520 thread_total_time.seconds, thread_total_time.microseconds,
2521 thread_user_time.seconds, thread_user_time.microseconds,
	    thread_system_time.seconds, thread_system_time.microseconds,
2523 lei.lei_balance, lei.lei_credit, lei.lei_debit,
2524 lei.lei_limit, lei.lei_refill_period, lei.lei_last_refill,
2525 (fatal ? " [fatal violation]" : ""));
2526
2527 /*
2528 For now, send RESOURCE_NOTIFY in parallel with EXC_RESOURCE. Once
2529 we have logging parity, we will stop sending EXC_RESOURCE (24508922).
2530 */
2531
2532 /* RESOURCE_NOTIFY MIG specifies nanoseconds of CPU time */
2533 lei.lei_balance = balance_ns;
2534 absolutetime_to_nanoseconds(lei.lei_limit, &lei.lei_limit);
2535 trace_resource_violation(RMON_CPUUSAGE_VIOLATED, &lei);
2536 kr = send_resource_violation(send_cpu_usage_violation, task, &lei,
2537 fatal ? kRNFatalLimitFlag : 0);
2538 if (kr) {
2539 printf("send_resource_violation(CPU usage, ...): error %#x\n", kr);
2540 }
2541
2542#ifdef EXC_RESOURCE_MONITORS
2543 if (send_exc_resource) {
2544 if (disable_exc_resource) {
2545 printf("process %s[%d] thread %llu caught burning CPU! "
2546 "EXC_RESOURCE%s supressed by a boot-arg\n",
2547 procname, pid, tid, fatal ? " (and termination)" : "");
2548 return;
2549 }
2550
2551 if (audio_active) {
2552 printf("process %s[%d] thread %llu caught burning CPU! "
2553 "EXC_RESOURCE & termination supressed due to audio playback\n",
2554 procname, pid, tid);
2555 return;
2556 }
2557 }
2558
2559
2560 if (send_exc_resource) {
2561 code[0] = code[1] = 0;
2562 EXC_RESOURCE_ENCODE_TYPE(code[0], RESOURCE_TYPE_CPU);
		if (fatal) {
			EXC_RESOURCE_ENCODE_FLAVOR(code[0], FLAVOR_CPU_MONITOR_FATAL);
		} else {
			EXC_RESOURCE_ENCODE_FLAVOR(code[0], FLAVOR_CPU_MONITOR);
		}
2568 EXC_RESOURCE_CPUMONITOR_ENCODE_INTERVAL(code[0], interval_sec);
2569 EXC_RESOURCE_CPUMONITOR_ENCODE_PERCENTAGE(code[0], percentage);
2570 EXC_RESOURCE_CPUMONITOR_ENCODE_PERCENTAGE(code[1], usage_percent);
2571 exception_triage(EXC_RESOURCE, code, EXCEPTION_CODE_MAX);
2572 }
2573#endif /* EXC_RESOURCE_MONITORS */
2574
2575 if (fatal) {
2576#if CONFIG_JETSAM
2577 jetsam_on_ledger_cpulimit_exceeded();
2578#else
2579 task_terminate_internal(task);
2580#endif
2581 }
2582}
2583
2584#if DEVELOPMENT || DEBUG
2585void __attribute__((noinline)) SENDING_NOTIFICATION__TASK_HAS_TOO_MANY_THREADS(task_t task, int thread_count)
2586{
2587 mach_exception_data_type_t code[EXCEPTION_CODE_MAX] = {0};
2588 int pid = task_pid(task);
2589 char procname[MAXCOMLEN+1] = "unknown";
2590
2591 if (pid == 1) {
2592 /*
2593 * Cannot suspend launchd
2594 */
2595 return;
2596 }
2597
2598 proc_name(pid, procname, sizeof(procname));
2599
2600 if (disable_exc_resource) {
2601 printf("process %s[%d] crossed thread count high watermark (%d), EXC_RESOURCE "
2602 "supressed by a boot-arg. \n", procname, pid, thread_count);
2603 return;
2604 }
2605
2606 if (audio_active) {
2607 printf("process %s[%d] crossed thread count high watermark (%d), EXC_RESOURCE "
2608 "supressed due to audio playback.\n", procname, pid, thread_count);
2609 return;
2610 }
2611
2612 if (exc_via_corpse_forking == 0) {
2613 printf("process %s[%d] crossed thread count high watermark (%d), EXC_RESOURCE "
2614 "supressed due to corpse forking being disabled.\n", procname, pid,
2615 thread_count);
2616 return;
2617 }
2618
2619 printf("process %s[%d] crossed thread count high watermark (%d), sending "
2620 "EXC_RESOURCE\n", procname, pid, thread_count);
2621
2622 EXC_RESOURCE_ENCODE_TYPE(code[0], RESOURCE_TYPE_THREADS);
2623 EXC_RESOURCE_ENCODE_FLAVOR(code[0], FLAVOR_THREADS_HIGH_WATERMARK);
2624 EXC_RESOURCE_THREADS_ENCODE_THREADS(code[0], thread_count);
2625
2626 task_enqueue_exception_with_corpse(task, EXC_RESOURCE, code, EXCEPTION_CODE_MAX, NULL);
2627}
2628#endif /* DEVELOPMENT || DEBUG */
2629
2630void thread_update_io_stats(thread_t thread, int size, int io_flags)
2631{
2632 int io_tier;
2633
2634 if (thread->thread_io_stats == NULL || thread->task->task_io_stats == NULL)
2635 return;
2636
2637 if (io_flags & DKIO_READ) {
2638 UPDATE_IO_STATS(thread->thread_io_stats->disk_reads, size);
2639 UPDATE_IO_STATS_ATOMIC(thread->task->task_io_stats->disk_reads, size);
2640 }
2641
2642 if (io_flags & DKIO_META) {
2643 UPDATE_IO_STATS(thread->thread_io_stats->metadata, size);
2644 UPDATE_IO_STATS_ATOMIC(thread->task->task_io_stats->metadata, size);
2645 }
2646
2647 if (io_flags & DKIO_PAGING) {
2648 UPDATE_IO_STATS(thread->thread_io_stats->paging, size);
2649 UPDATE_IO_STATS_ATOMIC(thread->task->task_io_stats->paging, size);
2650 }
2651
2652 io_tier = ((io_flags & DKIO_TIER_MASK) >> DKIO_TIER_SHIFT);
	assert(io_tier < IO_NUM_PRIORITIES);
2654
2655 UPDATE_IO_STATS(thread->thread_io_stats->io_priority[io_tier], size);
2656 UPDATE_IO_STATS_ATOMIC(thread->task->task_io_stats->io_priority[io_tier], size);
2657
2658 /* Update Total I/O Counts */
2659 UPDATE_IO_STATS(thread->thread_io_stats->total_io, size);
2660 UPDATE_IO_STATS_ATOMIC(thread->task->task_io_stats->total_io, size);
2661
2662 if (!(io_flags & DKIO_READ)) {
2663 DTRACE_IO3(physical_writes, struct task *, thread->task, uint32_t, size, int, io_flags);
2664 ledger_credit(thread->task->ledger, task_ledgers.physical_writes, size);
2665 }
2666}
2667
2668static void
2669init_thread_ledgers(void) {
2670 ledger_template_t t;
2671 int idx;
2672
2673 assert(thread_ledger_template == NULL);
2674
2675 if ((t = ledger_template_create("Per-thread ledger")) == NULL)
2676 panic("couldn't create thread ledger template");
2677
2678 if ((idx = ledger_entry_add(t, "cpu_time", "sched", "ns")) < 0) {
2679 panic("couldn't create cpu_time entry for thread ledger template");
2680 }
2681
2682 if (ledger_set_callback(t, idx, thread_cputime_callback, NULL, NULL) < 0) {
2683 panic("couldn't set thread ledger callback for cpu_time entry");
2684 }
2685
2686 thread_ledgers.cpu_time = idx;
2687
2688 ledger_template_complete(t);
2689 thread_ledger_template = t;
2690}
2691
2692/*
2693 * Returns currently applied CPU usage limit, or 0/0 if none is applied.
2694 */
2695int
2696thread_get_cpulimit(int *action, uint8_t *percentage, uint64_t *interval_ns)
2697{
2698 int64_t abstime = 0;
2699 uint64_t limittime = 0;
2700 thread_t thread = current_thread();
2701
2702 *percentage = 0;
2703 *interval_ns = 0;
2704 *action = 0;
2705
2706 if (thread->t_threadledger == LEDGER_NULL) {
2707 /*
2708 * This thread has no per-thread ledger, so it can't possibly
2709 * have a CPU limit applied.
2710 */
2711 return (KERN_SUCCESS);
2712 }
2713
2714 ledger_get_period(thread->t_threadledger, thread_ledgers.cpu_time, interval_ns);
2715 ledger_get_limit(thread->t_threadledger, thread_ledgers.cpu_time, &abstime);
2716
2717 if ((abstime == LEDGER_LIMIT_INFINITY) || (*interval_ns == 0)) {
2718 /*
2719 * This thread's CPU time ledger has no period or limit; so it
2720 * doesn't have a CPU limit applied.
2721 */
2722 return (KERN_SUCCESS);
2723 }
2724
2725 /*
2726 * This calculation is the converse to the one in thread_set_cpulimit().
2727 */
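	/*
	 * For example, a 500,000,000 ns (500 ms) limit over a 1,000,000,000 ns
	 * period reads back as (500,000,000 * 100) / 1,000,000,000 = 50 percent.
	 */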
2728 absolutetime_to_nanoseconds(abstime, &limittime);
2729 *percentage = (limittime * 100ULL) / *interval_ns;
2730 assert(*percentage <= 100);
2731
2732 if (thread->options & TH_OPT_PROC_CPULIMIT) {
2733 assert((thread->options & TH_OPT_PRVT_CPULIMIT) == 0);
2734
2735 *action = THREAD_CPULIMIT_BLOCK;
2736 } else if (thread->options & TH_OPT_PRVT_CPULIMIT) {
2737 assert((thread->options & TH_OPT_PROC_CPULIMIT) == 0);
2738
2739 *action = THREAD_CPULIMIT_EXCEPTION;
2740 } else {
2741 *action = THREAD_CPULIMIT_DISABLE;
2742 }
2743
2744 return (KERN_SUCCESS);
2745}
2746
2747/*
2748 * Set CPU usage limit on a thread.
2749 *
2750 * Calling with percentage of 0 will unset the limit for this thread.
2751 */
2752int
2753thread_set_cpulimit(int action, uint8_t percentage, uint64_t interval_ns)
2754{
2755 thread_t thread = current_thread();
2756 ledger_t l;
2757 uint64_t limittime = 0;
2758 uint64_t abstime = 0;
2759
2760 assert(percentage <= 100);
2761
2762 if (action == THREAD_CPULIMIT_DISABLE) {
2763 /*
2764 * Remove CPU limit, if any exists.
2765 */
2766 if (thread->t_threadledger != LEDGER_NULL) {
2767 l = thread->t_threadledger;
2768 ledger_set_limit(l, thread_ledgers.cpu_time, LEDGER_LIMIT_INFINITY, 0);
2769 ledger_set_action(l, thread_ledgers.cpu_time, LEDGER_ACTION_IGNORE);
2770 thread->options &= ~(TH_OPT_PROC_CPULIMIT | TH_OPT_PRVT_CPULIMIT);
2771 }
2772
2773 return (0);
2774 }
2775
2776 if (interval_ns < MINIMUM_CPULIMIT_INTERVAL_MS * NSEC_PER_MSEC) {
2777 return (KERN_INVALID_ARGUMENT);
2778 }
2779
2780 l = thread->t_threadledger;
2781 if (l == LEDGER_NULL) {
2782 /*
		 * This thread doesn't yet have a per-thread ledger, so create one and activate the CPU time entry.
2784 */
2785 if ((l = ledger_instantiate(thread_ledger_template, LEDGER_CREATE_INACTIVE_ENTRIES)) == LEDGER_NULL)
2786 return (KERN_RESOURCE_SHORTAGE);
2787
2788 /*
2789 * We are the first to create this thread's ledger, so only activate our entry.
2790 */
2791 ledger_entry_setactive(l, thread_ledgers.cpu_time);
2792 thread->t_threadledger = l;
2793 }
2794
2795 /*
2796 * The limit is specified as a percentage of CPU over an interval in nanoseconds.
2797 * Calculate the amount of CPU time that the thread needs to consume in order to hit the limit.
2798 */
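	/*
	 * e.g. percentage = 50 over interval_ns = 1,000,000,000 gives
	 * limittime = (1,000,000,000 * 50) / 100 = 500,000,000 ns per period.
	 */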
2799 limittime = (interval_ns * percentage) / 100;
2800 nanoseconds_to_absolutetime(limittime, &abstime);
2801 ledger_set_limit(l, thread_ledgers.cpu_time, abstime, cpumon_ustackshots_trigger_pct);
2802 /*
2803 * Refill the thread's allotted CPU time every interval_ns nanoseconds.
2804 */
2805 ledger_set_period(l, thread_ledgers.cpu_time, interval_ns);
2806
2807 if (action == THREAD_CPULIMIT_EXCEPTION) {
2808 /*
2809 * We don't support programming the CPU usage monitor on a task if any of its
2810 * threads have a per-thread blocking CPU limit configured.
2811 */
2812 if (thread->options & TH_OPT_PRVT_CPULIMIT) {
2813 panic("CPU usage monitor activated, but blocking thread limit exists");
2814 }
2815
2816 /*
2817 * Make a note that this thread's CPU limit is being used for the task-wide CPU
2818 * usage monitor. We don't have to arm the callback which will trigger the
2819 * exception, because that was done for us in ledger_instantiate (because the
2820 * ledger template used has a default callback).
2821 */
2822 thread->options |= TH_OPT_PROC_CPULIMIT;
2823 } else {
2824 /*
		 * We deliberately override any CPU limit imposed by a task-wide limit (e.g.
		 * the CPU usage monitor).
2827 */
2828 thread->options &= ~TH_OPT_PROC_CPULIMIT;
2829
2830 thread->options |= TH_OPT_PRVT_CPULIMIT;
2831 /* The per-thread ledger template by default has a callback for CPU time */
2832 ledger_disable_callback(l, thread_ledgers.cpu_time);
2833 ledger_set_action(l, thread_ledgers.cpu_time, LEDGER_ACTION_BLOCK);
2834 }
2835
2836 return (0);
2837}
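/*
 * Illustrative usage (a minimal sketch, not part of this file): a blocking
 * 50% limit over a 10 second window could be installed on the calling
 * thread with:
 *
 *	thread_set_cpulimit(THREAD_CPULIMIT_BLOCK, 50, 10 * NSEC_PER_SEC);
 */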
2838
2839void
2840thread_sched_call(
2841 thread_t thread,
2842 sched_call_t call)
2843{
2844 assert((thread->state & TH_WAIT_REPORT) == 0);
2845 thread->sched_call = call;
2846}
2847
2848uint64_t
2849thread_tid(
2850 thread_t thread)
2851{
2852 return (thread != THREAD_NULL? thread->thread_id: 0);
2853}
2854
2855uint16_t
2856thread_set_tag(thread_t th, uint16_t tag)
2857{
2858 return thread_set_tag_internal(th, tag);
2859}
2860
2861uint16_t
2862thread_get_tag(thread_t th)
2863{
2864 return thread_get_tag_internal(th);
2865}
2866
2867uint64_t
2868thread_last_run_time(thread_t th)
2869{
2870 return th->last_run_time;
2871}
2872
2873uint64_t
2874thread_dispatchqaddr(
2875 thread_t thread)
2876{
2877 uint64_t dispatchqueue_addr;
2878 uint64_t thread_handle;
2879
2880 if (thread == THREAD_NULL)
2881 return 0;
2882
2883 thread_handle = thread->machine.cthread_self;
2884 if (thread_handle == 0)
2885 return 0;
2886
2887 if (thread->inspection == TRUE)
2888 dispatchqueue_addr = thread_handle + get_task_dispatchqueue_offset(thread->task);
2889 else if (thread->task->bsd_info)
2890 dispatchqueue_addr = thread_handle + get_dispatchqueue_offset_from_proc(thread->task->bsd_info);
2891 else
2892 dispatchqueue_addr = 0;
2893
2894 return dispatchqueue_addr;
2895}
2896
2897uint64_t
2898thread_rettokern_addr(
2899 thread_t thread)
2900{
2901 uint64_t rettokern_addr;
2902 uint64_t rettokern_offset;
2903 uint64_t thread_handle;
2904
2905 if (thread == THREAD_NULL)
2906 return 0;
2907
2908 thread_handle = thread->machine.cthread_self;
2909 if (thread_handle == 0)
2910 return 0;
2911
2912 if (thread->task->bsd_info) {
2913 rettokern_offset = get_return_to_kernel_offset_from_proc(thread->task->bsd_info);
2914
2915 /* Return 0 if return to kernel offset is not initialized. */
2916 if (rettokern_offset == 0) {
2917 rettokern_addr = 0;
2918 } else {
2919 rettokern_addr = thread_handle + rettokern_offset;
2920 }
2921 } else {
2922 rettokern_addr = 0;
2923 }
2924
2925 return rettokern_addr;
2926}
2927
2928/*
2929 * Export routines to other components for things that are done as macros
2930 * within the osfmk component.
2931 */
2932
2933#undef thread_mtx_lock
2934void thread_mtx_lock(thread_t thread);
2935void
2936thread_mtx_lock(thread_t thread)
2937{
2938 lck_mtx_lock(&thread->mutex);
2939}
2940
2941#undef thread_mtx_unlock
2942void thread_mtx_unlock(thread_t thread);
2943void
2944thread_mtx_unlock(thread_t thread)
2945{
2946 lck_mtx_unlock(&thread->mutex);
2947}
2948
2949#undef thread_reference
2950void thread_reference(thread_t thread);
2951void
2952thread_reference(
2953 thread_t thread)
2954{
2955 if (thread != THREAD_NULL)
2956 thread_reference_internal(thread);
2957}
2958
2959#undef thread_should_halt
2960
2961boolean_t
2962thread_should_halt(
2963 thread_t th)
2964{
2965 return (thread_should_halt_fast(th));
2966}
2967
2968/*
2969 * thread_set_voucher_name - reset the voucher port name bound to this thread
2970 *
2971 * Conditions: nothing locked
2972 *
2973 * If we already converted the previous name to a cached voucher
2974 * reference, then we discard that reference here. The next lookup
2975 * will cache it again.
2976 */
2977
2978kern_return_t
2979thread_set_voucher_name(mach_port_name_t voucher_name)
2980{
2981 thread_t thread = current_thread();
2982 ipc_voucher_t new_voucher = IPC_VOUCHER_NULL;
2983 ipc_voucher_t voucher;
2984 ledger_t bankledger = NULL;
2985 struct thread_group *banktg = NULL;
2986
2987 if (MACH_PORT_DEAD == voucher_name)
2988 return KERN_INVALID_RIGHT;
2989
2990 /*
	 * aggressively convert to voucher reference
2992 */
2993 if (MACH_PORT_VALID(voucher_name)) {
2994 new_voucher = convert_port_name_to_voucher(voucher_name);
2995 if (IPC_VOUCHER_NULL == new_voucher)
2996 return KERN_INVALID_ARGUMENT;
2997 }
2998 bank_get_bank_ledger_and_thread_group(new_voucher, &bankledger, &banktg);
2999
3000 thread_mtx_lock(thread);
3001 voucher = thread->ith_voucher;
3002 thread->ith_voucher_name = voucher_name;
3003 thread->ith_voucher = new_voucher;
3004 thread_mtx_unlock(thread);
3005
3006 bank_swap_thread_bank_ledger(thread, bankledger);
3007
3008 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3009 MACHDBG_CODE(DBG_MACH_IPC,MACH_THREAD_SET_VOUCHER) | DBG_FUNC_NONE,
3010 (uintptr_t)thread_tid(thread),
3011 (uintptr_t)voucher_name,
3012 VM_KERNEL_ADDRPERM((uintptr_t)new_voucher),
3013 1, 0);
3014
3015 if (IPC_VOUCHER_NULL != voucher)
3016 ipc_voucher_release(voucher);
3017
3018 return KERN_SUCCESS;
3019}
3020
3021/*
3022 * thread_get_mach_voucher - return a voucher reference for the specified thread voucher
3023 *
3024 * Conditions: nothing locked
3025 *
3026 * A reference to the voucher may be lazily pending, if someone set the voucher name
3027 * but nobody has done a lookup yet. In that case, we'll have to do the equivalent
3028 * lookup here.
3029 *
 * NOTE: There is currently no distinction between the current and effective
 * vouchers because we only set them at the thread level.
3032 */
3033kern_return_t
3034thread_get_mach_voucher(
3035 thread_act_t thread,
3036 mach_voucher_selector_t __unused which,
3037 ipc_voucher_t *voucherp)
3038{
3039 ipc_voucher_t voucher;
3040 mach_port_name_t voucher_name;
3041
3042 if (THREAD_NULL == thread)
3043 return KERN_INVALID_ARGUMENT;
3044
3045 thread_mtx_lock(thread);
3046 voucher = thread->ith_voucher;
3047
3048 /* if already cached, just return a ref */
3049 if (IPC_VOUCHER_NULL != voucher) {
3050 ipc_voucher_reference(voucher);
3051 thread_mtx_unlock(thread);
3052 *voucherp = voucher;
3053 return KERN_SUCCESS;
3054 }
3055
3056 voucher_name = thread->ith_voucher_name;
3057
3058 /* convert the name to a port, then voucher reference */
3059 if (MACH_PORT_VALID(voucher_name)) {
3060 ipc_port_t port;
3061
3062 if (KERN_SUCCESS !=
3063 ipc_object_copyin(thread->task->itk_space, voucher_name,
3064 MACH_MSG_TYPE_COPY_SEND, (ipc_object_t *)&port)) {
3065 thread->ith_voucher_name = MACH_PORT_NULL;
3066 thread_mtx_unlock(thread);
3067 *voucherp = IPC_VOUCHER_NULL;
3068 return KERN_SUCCESS;
3069 }
3070
3071 /* convert to a voucher ref to return, and cache a ref on thread */
3072 voucher = convert_port_to_voucher(port);
3073 ipc_voucher_reference(voucher);
3074 thread->ith_voucher = voucher;
3075 thread_mtx_unlock(thread);
3076
3077 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3078 MACHDBG_CODE(DBG_MACH_IPC,MACH_THREAD_SET_VOUCHER) | DBG_FUNC_NONE,
3079 (uintptr_t)thread_tid(thread),
3080 (uintptr_t)port,
3081 VM_KERNEL_ADDRPERM((uintptr_t)voucher),
3082 2, 0);
3083
3084
3085 ipc_port_release_send(port);
3086 } else
3087 thread_mtx_unlock(thread);
3088
3089 *voucherp = voucher;
3090 return KERN_SUCCESS;
3091}
3092
3093/*
3094 * thread_set_mach_voucher - set a voucher reference for the specified thread voucher
3095 *
 * Conditions: caller holds a reference on the voucher.
3097 * nothing locked.
3098 *
3099 * We grab another reference to the voucher and bind it to the thread. Any lazy
3100 * binding is erased. The old voucher reference associated with the thread is
3101 * discarded.
3102 */
3103kern_return_t
3104thread_set_mach_voucher(
3105 thread_t thread,
3106 ipc_voucher_t voucher)
3107{
3108 ipc_voucher_t old_voucher;
3109 ledger_t bankledger = NULL;
3110 struct thread_group *banktg = NULL;
3111
3112 if (THREAD_NULL == thread)
3113 return KERN_INVALID_ARGUMENT;
3114
3115 if (thread != current_thread() && thread->started)
3116 return KERN_INVALID_ARGUMENT;
3117
3118 ipc_voucher_reference(voucher);
3119 bank_get_bank_ledger_and_thread_group(voucher, &bankledger, &banktg);
3120
3121 thread_mtx_lock(thread);
3122 old_voucher = thread->ith_voucher;
3123 thread->ith_voucher = voucher;
3124 thread->ith_voucher_name = MACH_PORT_NULL;
3125 thread_mtx_unlock(thread);
3126
3127 bank_swap_thread_bank_ledger(thread, bankledger);
3128
3129 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3130 MACHDBG_CODE(DBG_MACH_IPC,MACH_THREAD_SET_VOUCHER) | DBG_FUNC_NONE,
3131 (uintptr_t)thread_tid(thread),
3132 (uintptr_t)MACH_PORT_NULL,
3133 VM_KERNEL_ADDRPERM((uintptr_t)voucher),
3134 3, 0);
3135
3136 ipc_voucher_release(old_voucher);
3137
3138 return KERN_SUCCESS;
3139}
3140
3141/*
3142 * thread_swap_mach_voucher - swap a voucher reference for the specified thread voucher
3143 *
 * Conditions: caller holds a reference on the new and presumed old voucher(s).
3145 * nothing locked.
3146 *
3147 * This function is no longer supported.
3148 */
3149kern_return_t
3150thread_swap_mach_voucher(
3151 __unused thread_t thread,
3152 __unused ipc_voucher_t new_voucher,
3153 ipc_voucher_t *in_out_old_voucher)
3154{
3155 /*
3156 * Currently this function is only called from a MIG generated
3157 * routine which doesn't release the reference on the voucher
3158 * addressed by in_out_old_voucher. To avoid leaking this reference,
3159 * a call to release it has been added here.
3160 */
3161 ipc_voucher_release(*in_out_old_voucher);
3162 return KERN_NOT_SUPPORTED;
3163}
3164
3165/*
3166 * thread_get_current_voucher_origin_pid - get the pid of the originator of the current voucher.
3167 */
3168kern_return_t
3169thread_get_current_voucher_origin_pid(
3170 int32_t *pid)
3171{
3172 uint32_t buf_size;
3173 kern_return_t kr;
3174 thread_t thread = current_thread();
3175
3176 buf_size = sizeof(*pid);
3177 kr = mach_voucher_attr_command(thread->ith_voucher,
3178 MACH_VOUCHER_ATTR_KEY_BANK,
3179 BANK_ORIGINATOR_PID,
3180 NULL,
3181 0,
3182 (mach_voucher_attr_content_t)pid,
3183 &buf_size);
3184
3185 return kr;
3186}
3187
3188
3189boolean_t
3190thread_has_thread_name(thread_t th)
3191{
3192 if ((th) && (th->uthread)) {
3193 return bsd_hasthreadname(th->uthread);
3194 }
3195
3196 /*
3197 * This is an odd case; clients may set the thread name based on the lack of
3198 * a name, but in this context there is no uthread to attach the name to.
3199 */
3200 return FALSE;
3201}
3202
3203void
3204thread_set_thread_name(thread_t th, const char* name)
3205{
3206 if ((th) && (th->uthread) && name) {
3207 bsd_setthreadname(th->uthread, name);
3208 }
3209}
3210
3211void
3212thread_set_honor_qlimit(thread_t thread)
3213{
3214 thread->options |= TH_OPT_HONOR_QLIMIT;
3215}
3216
3217void
3218thread_clear_honor_qlimit(thread_t thread)
3219{
3220 thread->options &= (~TH_OPT_HONOR_QLIMIT);
3221}
3222
3223/*
3224 * thread_enable_send_importance - set/clear the SEND_IMPORTANCE thread option bit.
3225 */
3226void thread_enable_send_importance(thread_t thread, boolean_t enable)
3227{
3228 if (enable == TRUE)
3229 thread->options |= TH_OPT_SEND_IMPORTANCE;
3230 else
3231 thread->options &= ~TH_OPT_SEND_IMPORTANCE;
3232}
3233
/*
 * thread_set_allocation_name - set or clear the allocation name recorded in
 * the current thread's kernel state, returning the previous value.
 */
3237
3238kern_allocation_name_t thread_set_allocation_name(kern_allocation_name_t new_name)
3239{
3240 kern_allocation_name_t ret;
3241 thread_kernel_state_t kstate = thread_get_kernel_state(current_thread());
3242 ret = kstate->allocation_name;
	/* first setter wins: only install new_name if none is set, or clear with NULL */
	if (!new_name || !kstate->allocation_name)
		kstate->allocation_name = new_name;
3245 return ret;
3246}
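/*
 * Illustrative usage (a minimal sketch, not part of this file): callers
 * typically bracket an allocation site, restoring the previous name when
 * done; "site_name" is a hypothetical kern_allocation_name_t.
 *
 *	kern_allocation_name_t prior = thread_set_allocation_name(site_name);
 *	... perform the allocations to be attributed to site_name ...
 *	thread_set_allocation_name(prior);
 */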
3247
3248uint64_t
3249thread_get_last_wait_duration(thread_t thread)
3250{
3251 return thread->last_made_runnable_time - thread->last_run_time;
3252}
3253
3254#if CONFIG_DTRACE
3255uint32_t dtrace_get_thread_predcache(thread_t thread)
3256{
3257 if (thread != THREAD_NULL)
3258 return thread->t_dtrace_predcache;
3259 else
3260 return 0;
3261}
3262
3263int64_t dtrace_get_thread_vtime(thread_t thread)
3264{
3265 if (thread != THREAD_NULL)
3266 return thread->t_dtrace_vtime;
3267 else
3268 return 0;
3269}
3270
3271int dtrace_get_thread_last_cpu_id(thread_t thread)
3272{
3273 if ((thread != THREAD_NULL) && (thread->last_processor != PROCESSOR_NULL)) {
3274 return thread->last_processor->cpu_id;
3275 } else {
3276 return -1;
3277 }
3278}
3279
3280int64_t dtrace_get_thread_tracing(thread_t thread)
3281{
3282 if (thread != THREAD_NULL)
3283 return thread->t_dtrace_tracing;
3284 else
3285 return 0;
3286}
3287
3288boolean_t dtrace_get_thread_reentering(thread_t thread)
3289{
3290 if (thread != THREAD_NULL)
3291 return (thread->options & TH_OPT_DTRACE) ? TRUE : FALSE;
3292 else
3293 return 0;
3294}
3295
3296vm_offset_t dtrace_get_kernel_stack(thread_t thread)
3297{
3298 if (thread != THREAD_NULL)
3299 return thread->kernel_stack;
3300 else
3301 return 0;
3302}
3303
3304#if KASAN
3305struct kasan_thread_data *
3306kasan_get_thread_data(thread_t thread)
3307{
3308 return &thread->kasan_data;
3309}
3310#endif
3311
3312int64_t dtrace_calc_thread_recent_vtime(thread_t thread)
3313{
3314 if (thread != THREAD_NULL) {
3315 processor_t processor = current_processor();
3316 uint64_t abstime = mach_absolute_time();
3317 timer_t timer;
3318
3319 timer = PROCESSOR_DATA(processor, thread_timer);
3320
3321 return timer_grab(&(thread->system_timer)) + timer_grab(&(thread->user_timer)) +
3322 (abstime - timer->tstamp); /* XXX need interrupts off to prevent missed time? */
3323 } else
3324 return 0;
3325}
3326
3327void dtrace_set_thread_predcache(thread_t thread, uint32_t predcache)
3328{
3329 if (thread != THREAD_NULL)
3330 thread->t_dtrace_predcache = predcache;
3331}
3332
3333void dtrace_set_thread_vtime(thread_t thread, int64_t vtime)
3334{
3335 if (thread != THREAD_NULL)
3336 thread->t_dtrace_vtime = vtime;
3337}
3338
3339void dtrace_set_thread_tracing(thread_t thread, int64_t accum)
3340{
3341 if (thread != THREAD_NULL)
3342 thread->t_dtrace_tracing = accum;
3343}
3344
3345void dtrace_set_thread_reentering(thread_t thread, boolean_t vbool)
3346{
3347 if (thread != THREAD_NULL) {
3348 if (vbool)
3349 thread->options |= TH_OPT_DTRACE;
3350 else
3351 thread->options &= (~TH_OPT_DTRACE);
3352 }
3353}
3354
3355vm_offset_t dtrace_set_thread_recover(thread_t thread, vm_offset_t recover)
3356{
3357 vm_offset_t prev = 0;
3358
3359 if (thread != THREAD_NULL) {
3360 prev = thread->recover;
3361 thread->recover = recover;
3362 }
3363 return prev;
3364}
3365
3366void dtrace_thread_bootstrap(void)
3367{
3368 task_t task = current_task();
3369
3370 if (task->thread_count == 1) {
3371 thread_t thread = current_thread();
3372 if (thread->t_dtrace_flags & TH_DTRACE_EXECSUCCESS) {
3373 thread->t_dtrace_flags &= ~TH_DTRACE_EXECSUCCESS;
3374 DTRACE_PROC(exec__success);
3375 KDBG(BSDDBG_CODE(DBG_BSD_PROC,BSD_PROC_EXEC),
3376 task_pid(task));
3377 }
3378 DTRACE_PROC(start);
3379 }
	DTRACE_PROC(lwp__start);
}
3383
3384void
3385dtrace_thread_didexec(thread_t thread)
3386{
3387 thread->t_dtrace_flags |= TH_DTRACE_EXECSUCCESS;
3388}
3389#endif /* CONFIG_DTRACE */
3390