kern_event.c source code [xnu/bsd/kern/kern_event.c]

1	/*
2	* Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3	*
4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. The rights granted to you under the License
10	* may not be used to create, or enable the creation or redistribution of,
11	* unlawful or unlicensed copies of an Apple operating system, or to
12	* circumvent, violate, or enable the circumvention or violation of, any
13	* terms of an Apple operating system software license agreement.
14	*
15	* Please obtain a copy of the License at
16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
17	*
18	* The Original Code and all software distributed under the License are
19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23	* Please see the License for the specific language governing rights and
24	* limitations under the License.
25	*
26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27	*
28	*/
29	/-*
30	* Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
31	* All rights reserved.
32	*
33	* Redistribution and use in source and binary forms, with or without
34	* modification, are permitted provided that the following conditions
35	* are met:
36	* 1. Redistributions of source code must retain the above copyright
37	* notice, this list of conditions and the following disclaimer.
38	* 2. Redistributions in binary form must reproduce the above copyright
39	* notice, this list of conditions and the following disclaimer in the
40	* documentation and/or other materials provided with the distribution.
41	*
42	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45	* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52	* SUCH DAMAGE.
53	*/
54	/*
55	* @(#)kern_event.c 1.0 (3/31/2000)
56	*/
57	#include <stdint.h>
58	#include <machine/atomic.h>
59
60	#include <sys/param.h>
61	#include <sys/systm.h>
62	#include <sys/filedesc.h>
63	#include <sys/kernel.h>
64	#include <sys/proc_internal.h>
65	#include <sys/kauth.h>
66	#include <sys/malloc.h>
67	#include <sys/unistd.h>
68	#include <sys/file_internal.h>
69	#include <sys/fcntl.h>
70	#include <sys/select.h>
71	#include <sys/queue.h>
72	#include <sys/event.h>
73	#include <sys/eventvar.h>
74	#include <sys/protosw.h>
75	#include <sys/socket.h>
76	#include <sys/socketvar.h>
77	#include <sys/stat.h>
78	#include <sys/syscall.h> // SYS_* constants
79	#include <sys/sysctl.h>
80	#include <sys/uio.h>
81	#include <sys/sysproto.h>
82	#include <sys/user.h>
83	#include <sys/vnode_internal.h>
84	#include <string.h>
85	#include <sys/proc_info.h>
86	#include <sys/codesign.h>
87	#include <sys/pthread_shims.h>
88	#include <sys/kdebug.h>
89	#include <os/base.h>
90	#include <pexpert/pexpert.h>
91
92	#include <kern/thread_group.h>
93	#include <kern/locks.h>
94	#include <kern/clock.h>
95	#include <kern/cpu_data.h>
96	#include <kern/policy_internal.h>
97	#include <kern/thread_call.h>
98	#include <kern/sched_prim.h>
99	#include <kern/waitq.h>
100	#include <kern/zalloc.h>
101	#include <kern/kalloc.h>
102	#include <kern/assert.h>
103	#include <kern/ast.h>
104	#include <kern/thread.h>
105	#include <kern/kcdata.h>
106	#include <kern/work_interval.h>
107
108	#include <pthread/priority_private.h>
109	#include <pthread/workqueue_syscalls.h>
110	#include <pthread/workqueue_internal.h>
111	#include <libkern/libkern.h>
112
113	#include <os/log.h>
114
115	#include "net/net_str_id.h"
116
117	#if SKYWALK && defined(XNU_TARGET_OS_OSX)
118	#include <skywalk/lib/net_filter_event.h>
119
120	extern bool net_check_compatible_alf(void);
121	#endif /* SKYWALK && XNU_TARGET_OS_OSX */
122
123	#include <mach/task.h>
124	#include <libkern/section_keywords.h>
125
126	#if CONFIG_MEMORYSTATUS
127	#include <sys/kern_memorystatus.h>
128	#endif
129
130	#if DEVELOPMENT \|\| DEBUG
131	#define KEVENT_PANIC_ON_WORKLOOP_OWNERSHIP_LEAK (1U << 0)
132	#define KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS (1U << 1)
133	TUNABLE(uint32_t, kevent_debug_flags, "kevent_debug", `0`);
134	#endif
135
136	static LCK_GRP_DECLARE(kq_lck_grp, "kqueue");
137	SECURITY_READ_ONLY_EARLY(vm_packing_params_t) kn_kq_packing_params =
138	VM_PACKING_PARAMS(KNOTE_KQ_PACKED);
139
140	extern mach_port_name_t ipc_entry_name_mask(mach_port_name_t name); / osfmk/ipc/ipc_entry.h /
141	extern int cansignal(struct proc , kauth_cred_t, struct* proc , int); /* bsd/kern/kern_sig.c /
142
143	#define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))
144
145	static int kqueue_select(struct fileproc fp, int* which, void *wq_link_id,
146	vfs_context_t ctx);
147	static int kqueue_close(struct fileglob *fg, vfs_context_t ctx);
148	static int kqueue_kqfilter(struct fileproc fp, struct* knote *kn,
149	struct kevent_qos_s *kev);
150	static int kqueue_drain(struct fileproc *fp, vfs_context_t ctx);
151
152	static const struct fileops kqueueops = {
153	.fo_type = DTYPE_KQUEUE,
154	.fo_read = fo_no_read,
155	.fo_write = fo_no_write,
156	.fo_ioctl = fo_no_ioctl,
157	.fo_select = kqueue_select,
158	.fo_close = kqueue_close,
159	.fo_drain = kqueue_drain,
160	.fo_kqfilter = kqueue_kqfilter,
161	};
162
163	static inline int kevent_modern_copyout(struct kevent_qos_s , user_addr_t );
164	static int kevent_register_wait_prepare(struct knote kn, struct* kevent_qos_s kev, int* result);
165	static void kevent_register_wait_block(struct turnstile *ts, thread_t handoff_thread,
166	thread_continue_t cont, struct _kevent_register *cont_args) __dead2;
167	static void kevent_register_wait_return(struct _kevent_register *cont_args) __dead2;
168	static void kevent_register_wait_cleanup(struct knote *kn);
169
170	static struct kqtailq kqueue_get_suppressed_queue(kqueue_t kq, struct* knote *kn);
171	static void kqueue_threadreq_initiate(struct kqueue kq, workq_threadreq_t, kq_index_t qos, int* flags);
172
173	static void kqworkq_unbind(proc_t p, workq_threadreq_t);
174	static thread_qos_t kqworkq_unbind_locked(struct kqworkq *kqwq, workq_threadreq_t, thread_t thread);
175	static workq_threadreq_t kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index);
176	static void kqueue_update_iotier_override(kqueue_t kqu);
177
178	static void kqworkloop_unbind(struct kqworkloop *kwql);
179
180	enum kqwl_unbind_locked_mode {
181	KQWL_OVERRIDE_DROP_IMMEDIATELY,
182	KQWL_OVERRIDE_DROP_DELAYED,
183	};
184	static void kqworkloop_unbind_locked(struct kqworkloop *kwql, thread_t thread,
185	enum kqwl_unbind_locked_mode how);
186	static void kqworkloop_unbind_delayed_override_drop(thread_t thread);
187	static kq_index_t kqworkloop_override(struct kqworkloop *kqwl);
188	static void kqworkloop_set_overcommit(struct kqworkloop *kqwl);
189	enum {
190	KQWL_UTQ_NONE,
191	/*
192	* The wakeup qos is the qos of QUEUED knotes.
193	*
194	* This QoS is accounted for with the events override in the
195	* kqr_override_index field. It is raised each time a new knote is queued at
196	* a given QoS. The kqwl_wakeup_qos field is a superset of the non empty
197	* knote buckets and is recomputed after each event delivery.
198	*/
199	KQWL_UTQ_UPDATE_WAKEUP_QOS,
200	KQWL_UTQ_RECOMPUTE_WAKEUP_QOS,
201	KQWL_UTQ_UNBINDING, / attempt to rebind /
202	KQWL_UTQ_PARKING,
203	/*
204	* The wakeup override is for suppressed knotes that have fired again at
205	* a higher QoS than the one for which they are suppressed already.
206	* This override is cleared when the knote suppressed list becomes empty.
207	*/
208	KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
209	KQWL_UTQ_RESET_WAKEUP_OVERRIDE,
210	/*
211	* The QoS is the maximum QoS of an event enqueued on this workloop in
212	* userland. It is copied from the only EVFILT_WORKLOOP knote with
213	* a NOTE_WL_THREAD_REQUEST bit set allowed on this workloop. If there is no
214	* such knote, this QoS is 0.
215	*/
216	KQWL_UTQ_SET_QOS_INDEX,
217	KQWL_UTQ_REDRIVE_EVENTS,
218	};
219	static void kqworkloop_update_threads_qos(struct kqworkloop kqwl, int* op, kq_index_t qos);
220	static int kqworkloop_end_processing(struct kqworkloop kqwl, int* flags, int kevent_flags);
221
222	static struct knote knote_alloc(void*);
223	static void knote_free(struct knote *kn);
224	static int kq_add_knote(struct kqueue kq, struct* knote *kn,
225	struct knote_lock_ctx knlc, struct* proc *p);
226	static struct knote kq_find_knote_and_kq_lock(struct* kqueue *kq,
227	struct kevent_qos_s kev, bool is_fd, struct* proc *p);
228
229	static void knote_activate(kqueue_t kqu, struct knote kn, int* result);
230	static void knote_dequeue(kqueue_t kqu, struct knote *kn);
231
232	static void knote_apply_touch(kqueue_t kqu, struct knote *kn,
233	struct kevent_qos_s kev, int* result);
234	static void knote_suppress(kqueue_t kqu, struct knote *kn);
235	static void knote_unsuppress(kqueue_t kqu, struct knote *kn);
236	static void knote_drop(kqueue_t kqu, struct knote kn, struct* knote_lock_ctx *knlc);
237
238	// both these functions may dequeue the knote and it is up to the caller
239	// to enqueue the knote back
240	static void knote_adjust_qos(struct kqueue kq, struct* knote kn, int* result);
241	static void knote_reset_priority(kqueue_t kqu, struct knote *kn, pthread_priority_t pp);
242
243	static ZONE_DEFINE(knote_zone, "knote zone",
244	sizeof(struct knote), ZC_CACHING \| ZC_ZFREE_CLEARMEM);
245	static ZONE_DEFINE(kqfile_zone, "kqueue file zone",
246	sizeof(struct kqfile), ZC_ZFREE_CLEARMEM \| ZC_NOTBITAG);
247	static ZONE_DEFINE(kqworkq_zone, "kqueue workq zone",
248	sizeof(struct kqworkq), ZC_ZFREE_CLEARMEM \| ZC_NOTBITAG);
249	static ZONE_DEFINE(kqworkloop_zone, "kqueue workloop zone",
250	sizeof(struct kqworkloop), ZC_CACHING \| ZC_ZFREE_CLEARMEM \| ZC_NOTBITAG);
251
252	#define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
253
254	static int filt_no_attach(struct knote kn, struct* kevent_qos_s *kev);
255	static void filt_no_detach(struct knote *kn);
256	static int filt_bad_event(struct knote kn, long* hint);
257	static int filt_bad_touch(struct knote kn, struct* kevent_qos_s *kev);
258	static int filt_bad_process(struct knote kn, struct* kevent_qos_s *kev);
259
260	SECURITY_READ_ONLY_EARLY(static struct filterops) bad_filtops = {
261	.f_attach = filt_no_attach,
262	.f_detach = filt_no_detach,
263	.f_event = filt_bad_event,
264	.f_touch = filt_bad_touch,
265	.f_process = filt_bad_process,
266	};
267
268	#if CONFIG_MEMORYSTATUS
269	extern const struct filterops memorystatus_filtops;
270	#endif /* CONFIG_MEMORYSTATUS */
271	extern const struct filterops fs_filtops;
272	extern const struct filterops sig_filtops;
273	extern const struct filterops machport_attach_filtops;
274	extern const struct filterops mach_port_filtops;
275	extern const struct filterops mach_port_set_filtops;
276	extern const struct filterops pipe_nfiltops;
277	extern const struct filterops pipe_rfiltops;
278	extern const struct filterops pipe_wfiltops;
279	extern const struct filterops ptsd_kqops;
280	extern const struct filterops ptmx_kqops;
281	extern const struct filterops soread_filtops;
282	extern const struct filterops sowrite_filtops;
283	extern const struct filterops sock_filtops;
284	extern const struct filterops soexcept_filtops;
285	extern const struct filterops spec_filtops;
286	extern const struct filterops bpfread_filtops;
287	extern const struct filterops necp_fd_rfiltops;
288	#if SKYWALK
289	extern const struct filterops skywalk_channel_rfiltops;
290	extern const struct filterops skywalk_channel_wfiltops;
291	extern const struct filterops skywalk_channel_efiltops;
292	#endif /* SKYWALK */
293	extern const struct filterops fsevent_filtops;
294	extern const struct filterops vnode_filtops;
295	extern const struct filterops tty_filtops;
296
297	const static struct filterops file_filtops;
298	const static struct filterops kqread_filtops;
299	const static struct filterops proc_filtops;
300	const static struct filterops timer_filtops;
301	const static struct filterops user_filtops;
302	const static struct filterops workloop_filtops;
303	#if CONFIG_EXCLAVES
304	extern const struct filterops exclaves_notification_filtops;
305	#endif /* CONFIG_EXCLAVES */
306
307	/*
308	*
309	* Rules for adding new filters to the system:
310	* Public filters:
311	* - Add a new "EVFILT_" option value to bsd/sys/event.h (typically a negative value)
312	* in the exported section of the header
313	* - Update the EVFILT_SYSCOUNT value to reflect the new addition
314	* - Add a filterops to the sysfilt_ops array. Public filters should be added at the end
315	* of the Public Filters section in the array.
316	* Private filters:
317	* - Add a new "EVFILT_" value to bsd/sys/event_private.h (typically a positive value)
318	* - Update the EVFILTID_MAX value to reflect the new addition
319	* - Add a filterops to the sysfilt_ops. Private filters should be added at the end of
320	* the Private filters section of the array.
321	*/
322	static_assert(EVFILTID_MAX < UINT8_MAX, "kn_filtid expects this to be true");
323	static const struct filterops * const sysfilt_ops[EVFILTID_MAX] = {
324	/ Public Filters /
325	[~EVFILT_READ] = &file_filtops,
326	[~EVFILT_WRITE] = &file_filtops,
327	[~EVFILT_AIO] = &bad_filtops,
328	[~EVFILT_VNODE] = &file_filtops,
329	[~EVFILT_PROC] = &proc_filtops,
330	[~EVFILT_SIGNAL] = &sig_filtops,
331	[~EVFILT_TIMER] = &timer_filtops,
332	[~EVFILT_MACHPORT] = &machport_attach_filtops,
333	[~EVFILT_FS] = &fs_filtops,
334	[~EVFILT_USER] = &user_filtops,
335	[~EVFILT_UNUSED_11] = &bad_filtops,
336	[~EVFILT_VM] = &bad_filtops,
337	[~EVFILT_SOCK] = &file_filtops,
338	#if CONFIG_MEMORYSTATUS
339	[~EVFILT_MEMORYSTATUS] = &memorystatus_filtops,
340	#else
341	[~EVFILT_MEMORYSTATUS] = &bad_filtops,
342	#endif
343	[~EVFILT_EXCEPT] = &file_filtops,
344	#if SKYWALK
345	[~EVFILT_NW_CHANNEL] = &file_filtops,
346	#else /* !SKYWALK */
347	[~EVFILT_NW_CHANNEL] = &bad_filtops,
348	#endif /* !SKYWALK */
349	[~EVFILT_WORKLOOP] = &workloop_filtops,
350	#if CONFIG_EXCLAVES
351	[~EVFILT_EXCLAVES_NOTIFICATION] = &exclaves_notification_filtops,
352	#else /* !CONFIG_EXCLAVES */
353	[~EVFILT_EXCLAVES_NOTIFICATION] = &bad_filtops,
354	#endif /* CONFIG_EXCLAVES*/
355
356	/ Private filters /
357	[EVFILTID_KQREAD] = &kqread_filtops,
358	[EVFILTID_PIPE_N] = &pipe_nfiltops,
359	[EVFILTID_PIPE_R] = &pipe_rfiltops,
360	[EVFILTID_PIPE_W] = &pipe_wfiltops,
361	[EVFILTID_PTSD] = &ptsd_kqops,
362	[EVFILTID_SOREAD] = &soread_filtops,
363	[EVFILTID_SOWRITE] = &sowrite_filtops,
364	[EVFILTID_SCK] = &sock_filtops,
365	[EVFILTID_SOEXCEPT] = &soexcept_filtops,
366	[EVFILTID_SPEC] = &spec_filtops,
367	[EVFILTID_BPFREAD] = &bpfread_filtops,
368	[EVFILTID_NECP_FD] = &necp_fd_rfiltops,
369	#if SKYWALK
370	[EVFILTID_SKYWALK_CHANNEL_W] = &skywalk_channel_wfiltops,
371	[EVFILTID_SKYWALK_CHANNEL_R] = &skywalk_channel_rfiltops,
372	[EVFILTID_SKYWALK_CHANNEL_E] = &skywalk_channel_efiltops,
373	#else /* !SKYWALK */
374	[EVFILTID_SKYWALK_CHANNEL_W] = &bad_filtops,
375	[EVFILTID_SKYWALK_CHANNEL_R] = &bad_filtops,
376	[EVFILTID_SKYWALK_CHANNEL_E] = &bad_filtops,
377	#endif /* !SKYWALK */
378	[EVFILTID_FSEVENT] = &fsevent_filtops,
379	[EVFILTID_VN] = &vnode_filtops,
380	[EVFILTID_TTY] = &tty_filtops,
381	[EVFILTID_PTMX] = &ptmx_kqops,
382	[EVFILTID_MACH_PORT] = &mach_port_filtops,
383	[EVFILTID_MACH_PORT_SET] = &mach_port_set_filtops,
384
385	/ fake filter for detached knotes, keep last /
386	[EVFILTID_DETACHED] = &bad_filtops,
387	};
388
389	static inline bool
390	kqr_thread_bound(workq_threadreq_t kqr)
391	{
392	return kqr->tr_state == WORKQ_TR_STATE_BOUND;
393	}
394
395	static inline bool
396	kqr_thread_requested_pending(workq_threadreq_t kqr)
397	{
398	workq_tr_state_t tr_state = kqr->tr_state;
399	return tr_state > WORKQ_TR_STATE_IDLE && tr_state < WORKQ_TR_STATE_BOUND;
400	}
401
402	static inline bool
403	kqr_thread_requested(workq_threadreq_t kqr)
404	{
405	return kqr->tr_state != WORKQ_TR_STATE_IDLE;
406	}
407
408	static inline thread_t
409	kqr_thread_fast(workq_threadreq_t kqr)
410	{
411	assert(kqr_thread_bound(kqr));
412	return kqr->tr_thread;
413	}
414
415	static inline thread_t
416	kqr_thread(workq_threadreq_t kqr)
417	{
418	return kqr_thread_bound(kqr) ? kqr->tr_thread : THREAD_NULL;
419	}
420
421	static inline struct kqworkloop *
422	kqr_kqworkloop(workq_threadreq_t kqr)
423	{
424	if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
425	return __container_of(kqr, struct kqworkloop, kqwl_request);
426	}
427	return NULL;
428	}
429
430	static inline kqueue_t
431	kqr_kqueue(proc_t p, workq_threadreq_t kqr)
432	{
433	kqueue_t kqu;
434	if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
435	kqu.kqwl = kqr_kqworkloop(kqr);
436	} else {
437	kqu.kqwq = p->p_fd.fd_wqkqueue;
438	assert(kqr >= kqu.kqwq->kqwq_request &&
439	kqr < kqu.kqwq->kqwq_request + KQWQ_NBUCKETS);
440	}
441	return kqu;
442	}
443
444	#if CONFIG_PREADOPT_TG
445	/ There are no guarantees about which locks are held when this is called /
446	inline thread_group_qos_t
447	kqr_preadopt_thread_group(workq_threadreq_t req)
448	{
449	struct kqworkloop *kqwl = kqr_kqworkloop(kqr: req);
450	return kqwl ? os_atomic_load(&kqwl->kqwl_preadopt_tg, relaxed) : NULL;
451	}
452
453	/ There are no guarantees about which locks are held when this is called /
454	inline _Atomic(thread_group_qos_t) *
455	kqr_preadopt_thread_group_addr(workq_threadreq_t req)
456	{
457	struct kqworkloop *kqwl = kqr_kqworkloop(kqr: req);
458	return kqwl ? (&kqwl->kqwl_preadopt_tg) : NULL;
459	}
460	#endif
461
462	/*
463	* kqueue/note lock implementations
464	*
465	* The kqueue lock guards the kq state, the state of its queues,
466	* and the kqueue-aware status and locks of individual knotes.
467	*
468	* The kqueue workq lock is used to protect state guarding the
469	* interaction of the kqueue with the workq. This state cannot
470	* be guarded by the kq lock - as it needs to be taken when we
471	* already have the waitq set lock held (during the waitq hook
472	* callback). It might be better to use the waitq lock itself
473	* for this, but the IRQ requirements make that difficult).
474	*
475	* Knote flags, filter flags, and associated data are protected
476	* by the underlying object lock - and are only ever looked at
477	* by calling the filter to get a [consistent] snapshot of that
478	* data.
479	*/
480
481	static inline void
482	kqlock(kqueue_t kqu)
483	{
484	lck_spin_lock(lck: &kqu.kq->kq_lock);
485	}
486
487	static inline void
488	kqlock_held(__assert_only kqueue_t kqu)
489	{
490	LCK_SPIN_ASSERT(&kqu.kq->kq_lock, LCK_ASSERT_OWNED);
491	}
492
493	static inline void
494	kqunlock(kqueue_t kqu)
495	{
496	lck_spin_unlock(lck: &kqu.kq->kq_lock);
497	}
498
499	static inline void
500	knhash_lock(struct filedesc *fdp)
501	{
502	lck_mtx_lock(lck: &fdp->fd_knhashlock);
503	}
504
505	static inline void
506	knhash_unlock(struct filedesc *fdp)
507	{
508	lck_mtx_unlock(lck: &fdp->fd_knhashlock);
509	}
510
511	/ wait event for knote locks /
512	static inline event_t
513	knote_lock_wev(struct knote *kn)
514	{
515	return (event_t)(&kn->kn_hook);
516	}
517
518	/ wait event for kevent_register_wait_* /
519	static inline event64_t
520	knote_filt_wev64(struct knote *kn)
521	{
522	/ kdp_workloop_sync_wait_find_owner knows about this /
523	return CAST_EVENT64_T(kn);
524	}
525
526	/ wait event for knote_post/knote_drop /
527	static inline event_t
528	knote_post_wev(struct knote *kn)
529	{
530	return &kn->kn_kevent;
531	}
532
533	/!*
534	* @function knote_has_qos
535	*
536	* @brief
537	* Whether the knote has a regular QoS.
538	*
539	* @discussion
540	* kn_qos_override is:
541	* - 0 on kqfiles
542	* - THREAD_QOS_LAST for special buckets (manager)
543	*
544	* Other values mean the knote participates to QoS propagation.
545	*/
546	static inline bool
547	knote_has_qos(struct knote *kn)
548	{
549	return kn->kn_qos_override > `0` && kn->kn_qos_override < THREAD_QOS_LAST;
550	}
551
552	#pragma mark knote locks
553
554	/*
555	* Enum used by the knote_lock_* functions.
556	*
557	* KNOTE_KQ_LOCK_ALWAYS
558	* The function will always return with the kq lock held.
559	*
560	* KNOTE_KQ_LOCK_ON_SUCCESS
561	* The function will return with the kq lock held if it was successful
562	* (knote_lock() is the only function that can fail).
563	*
564	* KNOTE_KQ_LOCK_ON_FAILURE
565	* The function will return with the kq lock held if it was unsuccessful
566	* (knote_lock() is the only function that can fail).
567	*
568	* KNOTE_KQ_UNLOCK:
569	* The function returns with the kq unlocked.
570	*/
571	enum kqlocking {
572	KNOTE_KQ_LOCK_ALWAYS,
573	KNOTE_KQ_LOCK_ON_SUCCESS,
574	KNOTE_KQ_LOCK_ON_FAILURE,
575	KNOTE_KQ_UNLOCK,
576	};
577
578	static struct knote_lock_ctx *
579	knote_lock_ctx_find(kqueue_t kqu, struct knote *kn)
580	{
581	struct knote_lock_ctx *ctx;
582	LIST_FOREACH(ctx, &kqu.kq->kq_knlocks, knlc_link) {
583	if (ctx->knlc_knote == kn) {
584	return ctx;
585	}
586	}
587	panic("knote lock context not found: %p", kn);
588	__builtin_trap();
589	}
590
591	/ slowpath of knote_lock() /
592	__attribute__((noinline))
593	static bool __result_use_check
594	knote_lock_slow(kqueue_t kqu, struct knote *kn,
595	struct knote_lock_ctx knlc, int* kqlocking)
596	{
597	struct knote_lock_ctx *owner_lc;
598	struct uthread *uth = current_uthread();
599	wait_result_t wr;
600
601	kqlock_held(kqu);
602
603	owner_lc = knote_lock_ctx_find(kqu, kn);
604	#if DEBUG \|\| DEVELOPMENT
605	knlc->knlc_state = KNOTE_LOCK_CTX_WAITING;
606	#endif
607	owner_lc->knlc_waiters++;
608
609	/*
610	* Make our lock context visible to knote_unlock()
611	*/
612	uth->uu_knlock = knlc;
613
614	wr = lck_spin_sleep_with_inheritor(lock: &kqu.kq->kq_lock, lck_sleep_action: LCK_SLEEP_UNLOCK,
615	event: knote_lock_wev(kn), inheritor: owner_lc->knlc_thread,
616	THREAD_UNINT \| THREAD_WAIT_NOREPORT, TIMEOUT_WAIT_FOREVER);
617
618	if (wr == THREAD_RESTART) {
619	/*
620	* We haven't been woken up by knote_unlock() but knote_unlock_cancel.
621	* We need to cleanup the state since no one did.
622	*/
623	uth->uu_knlock = NULL;
624	#if DEBUG \|\| DEVELOPMENT
625	assert(knlc->knlc_state == KNOTE_LOCK_CTX_WAITING);
626	knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
627	#endif
628
629	if (kqlocking == KNOTE_KQ_LOCK_ALWAYS \|\|
630	kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
631	kqlock(kqu);
632	}
633	return false;
634	} else {
635	if (kqlocking == KNOTE_KQ_LOCK_ALWAYS \|\|
636	kqlocking == KNOTE_KQ_LOCK_ON_SUCCESS) {
637	kqlock(kqu);
638	#if DEBUG \|\| DEVELOPMENT
639	/*
640	* This state is set under the lock so we can't
641	* really assert this unless we hold the lock.
642	*/
643	assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
644	#endif
645	}
646	return true;
647	}
648	}
649
650	/*
651	* Attempts to take the "knote" lock.
652	*
653	* Called with the kqueue lock held.
654	*
655	* Returns true if the knote lock is acquired, false if it has been dropped
656	*/
657	static bool __result_use_check
658	knote_lock(kqueue_t kqu, struct knote kn, struct* knote_lock_ctx *knlc,
659	enum kqlocking kqlocking)
660	{
661	kqlock_held(kqu);
662
663	#if DEBUG \|\| DEVELOPMENT
664	assert(knlc->knlc_state == KNOTE_LOCK_CTX_UNLOCKED);
665	#endif
666	knlc->knlc_knote = kn;
667	knlc->knlc_thread = current_thread();
668	knlc->knlc_waiters = `0`;
669
670	if (__improbable(kn->kn_status & KN_LOCKED)) {
671	return knote_lock_slow(kqu, kn, knlc, kqlocking);
672	}
673
674	/*
675	* When the knote will be dropped, the knote lock is taken before
676	* KN_DROPPING is set, and then the knote will be removed from any
677	* hash table that references it before the lock is canceled.
678	*/
679	assert((kn->kn_status & KN_DROPPING) == `0`);
680	LIST_INSERT_HEAD(&kqu.kq->kq_knlocks, knlc, knlc_link);
681	kn->kn_status \|= KN_LOCKED;
682	#if DEBUG \|\| DEVELOPMENT
683	knlc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
684	#endif
685
686	if (kqlocking == KNOTE_KQ_UNLOCK \|\|
687	kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
688	kqunlock(kqu);
689	}
690	return true;
691	}
692
693	/*
694	* Unlocks a knote successfully locked with knote_lock().
695	*
696	* Called with the kqueue lock held.
697	*
698	* Returns with the kqueue lock held according to KNOTE_KQ_* mode.
699	*/
700	static void
701	knote_unlock(kqueue_t kqu, struct knote *kn,
702	struct knote_lock_ctx knlc, enum* kqlocking kqlocking)
703	{
704	kqlock_held(kqu);
705
706	assert(knlc->knlc_knote == kn);
707	assert(kn->kn_status & KN_LOCKED);
708	#if DEBUG \|\| DEVELOPMENT
709	assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
710	#endif
711
712	LIST_REMOVE(knlc, knlc_link);
713
714	if (knlc->knlc_waiters) {
715	thread_t thread = THREAD_NULL;
716
717	wakeup_one_with_inheritor(event: knote_lock_wev(kn), THREAD_AWAKENED,
718	action: LCK_WAKE_DEFAULT, thread_wokenup: &thread);
719
720	/*
721	* knote_lock_slow() publishes the lock context of waiters
722	* in uthread::uu_knlock.
723	*
724	* Reach out and make this context the new owner.
725	*/
726	struct uthread *ut = get_bsdthread_info(thread);
727	struct knote_lock_ctx *next_owner_lc = ut->uu_knlock;
728
729	assert(next_owner_lc->knlc_knote == kn);
730	next_owner_lc->knlc_waiters = knlc->knlc_waiters - `1`;
731	LIST_INSERT_HEAD(&kqu.kq->kq_knlocks, next_owner_lc, knlc_link);
732	#if DEBUG \|\| DEVELOPMENT
733	next_owner_lc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
734	#endif
735	ut->uu_knlock = NULL;
736	thread_deallocate_safe(thread);
737	} else {
738	kn->kn_status &= ~KN_LOCKED;
739	}
740
741	if ((kn->kn_status & KN_MERGE_QOS) && !(kn->kn_status & KN_POSTING)) {
742	/*
743	* No f_event() in flight anymore, we can leave QoS "Merge" mode
744	*
745	* See knote_adjust_qos()
746	*/
747	kn->kn_status &= ~KN_MERGE_QOS;
748	}
749	if (kqlocking == KNOTE_KQ_UNLOCK) {
750	kqunlock(kqu);
751	}
752	#if DEBUG \|\| DEVELOPMENT
753	knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
754	#endif
755	}
756
757	/*
758	* Aborts all waiters for a knote lock, and unlock the knote.
759	*
760	* Called with the kqueue lock held.
761	*
762	* Returns with the kqueue unlocked.
763	*/
764	static void
765	knote_unlock_cancel(struct kqueue kq, struct* knote *kn,
766	struct knote_lock_ctx *knlc)
767	{
768	kqlock_held(kqu: kq);
769
770	assert(knlc->knlc_knote == kn);
771	assert(kn->kn_status & KN_LOCKED);
772	assert(kn->kn_status & KN_DROPPING);
773
774	LIST_REMOVE(knlc, knlc_link);
775	kn->kn_status &= ~KN_LOCKED;
776	kqunlock(kqu: kq);
777
778	if (knlc->knlc_waiters) {
779	wakeup_all_with_inheritor(event: knote_lock_wev(kn), THREAD_RESTART);
780	}
781	#if DEBUG \|\| DEVELOPMENT
782	knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
783	#endif
784	}
785
786	/*
787	* Call the f_event hook of a given filter.
788	*
789	* Takes a use count to protect against concurrent drops.
790	* Called with the object lock held.
791	*/
792	static void
793	knote_post(struct knote kn, long* hint)
794	{
795	struct kqueue *kq = knote_get_kq(kn);
796	int dropping, result;
797
798	kqlock(kqu: kq);
799
800	if (__improbable(kn->kn_status & (KN_DROPPING \| KN_VANISHED))) {
801	return kqunlock(kqu: kq);
802	}
803
804	if (__improbable(kn->kn_status & KN_POSTING)) {
805	panic("KNOTE() called concurrently on knote %p", kn);
806	}
807
808	kn->kn_status \|= KN_POSTING;
809
810	kqunlock(kqu: kq);
811	result = filter_call(knote_fops(kn), f_event(kn, hint));
812	kqlock(kqu: kq);
813
814	/ Someone dropped the knote/the monitored object vanished while we*
815	* were in f_event, swallow the side effects of the post.
816	*/
817	dropping = (kn->kn_status & (KN_DROPPING \| KN_VANISHED));
818
819	if (!dropping && (result & FILTER_ADJUST_EVENT_IOTIER_BIT)) {
820	kqueue_update_iotier_override(kqu: kq);
821	}
822
823	if (!dropping && (result & FILTER_ACTIVE)) {
824	knote_activate(kqu: kq, kn, result);
825	}
826
827	if ((kn->kn_status & KN_LOCKED) == `0`) {
828	/*
829	* There's no other f_* call in flight, we can leave QoS "Merge" mode.
830	*
831	* See knote_adjust_qos()
832	*/
833	kn->kn_status &= ~(KN_POSTING \| KN_MERGE_QOS);
834	} else {
835	kn->kn_status &= ~KN_POSTING;
836	}
837
838	if (__improbable(dropping)) {
839	thread_wakeup(knote_post_wev(kn));
840	}
841
842	kqunlock(kqu: kq);
843	}
844
845	/*
846	* Called by knote_drop() and knote_fdclose() to wait for the last f_event()
847	* caller to be done.
848	*
849	* - kq locked at entry
850	* - kq unlocked at exit
851	*/
852	static void
853	knote_wait_for_post(struct kqueue kq, struct* knote *kn)
854	{
855	kqlock_held(kqu: kq);
856
857	assert(kn->kn_status & (KN_DROPPING \| KN_VANISHED));
858
859	if (kn->kn_status & KN_POSTING) {
860	lck_spin_sleep(lck: &kq->kq_lock, lck_sleep_action: LCK_SLEEP_UNLOCK, event: knote_post_wev(kn),
861	THREAD_UNINT \| THREAD_WAIT_NOREPORT);
862	} else {
863	kqunlock(kqu: kq);
864	}
865	}
866
867	#pragma mark knote helpers for filters
868
869	OS_ALWAYS_INLINE
870	void *
871	knote_kn_hook_get_raw(struct knote *kn)
872	{
873	uintptr_t *addr = &kn->kn_hook;
874
875	void hook = (void* ) addr;
876	#if __has_feature(ptrauth_calls)
877	if (hook) {
878	uint16_t blend = kn->kn_filter;
879	blend \|= (kn->kn_filtid << `8`);
880	blend ^= OS_PTRAUTH_DISCRIMINATOR("kn.kn_hook");
881
882	hook = ptrauth_auth_data(hook, ptrauth_key_process_independent_data,
883	ptrauth_blend_discriminator(addr, blend));
884	}
885	#endif
886
887	return hook;
888	}
889
890	OS_ALWAYS_INLINE void
891	knote_kn_hook_set_raw(struct knote kn, void* *kn_hook)
892	{
893	uintptr_t *addr = &kn->kn_hook;
894	#if __has_feature(ptrauth_calls)
895	if (kn_hook) {
896	uint16_t blend = kn->kn_filter;
897	blend \|= (kn->kn_filtid << `8`);
898	blend ^= OS_PTRAUTH_DISCRIMINATOR("kn.kn_hook");
899
900	kn_hook = ptrauth_sign_unauthenticated(kn_hook,
901	ptrauth_key_process_independent_data,
902	ptrauth_blend_discriminator(addr, blend));
903	}
904	#endif
905	*addr = (uintptr_t) kn_hook;
906	}
907
908	OS_ALWAYS_INLINE
909	void
910	knote_set_error(struct knote kn, int* error)
911	{
912	kn->kn_flags \|= EV_ERROR;
913	kn->kn_sdata = error;
914	}
915
916	OS_ALWAYS_INLINE
917	int64_t
918	knote_low_watermark(const struct knote *kn)
919	{
920	return (kn->kn_sfflags & NOTE_LOWAT) ? kn->kn_sdata : `1`;
921	}
922
923	/!*
924	* @function knote_fill_kevent_with_sdata
925	*
926	* @brief
927	* Fills in a kevent from the current content of a knote.
928	*
929	* @discussion
930	* This is meant to be called from filter's f_process hooks.
931	* The kevent data is filled with kn->kn_sdata.
932	*
933	* kn->kn_fflags is cleared if kn->kn_flags has EV_CLEAR set.
934	*
935	* Using knote_fill_kevent is typically preferred.
936	*/
937	OS_ALWAYS_INLINE
938	void
939	knote_fill_kevent_with_sdata(struct knote kn, struct* kevent_qos_s *kev)
940	{
941	#define knote_assert_aliases(name1, offs1, name2) \
942	static_assert(offsetof(struct kevent_qos_s, name1) + offs1 == \
943	offsetof(struct kevent_internal_s, name2), \
944	"kevent_qos_s::" #name1 " and kevent_internal_s::" #name2 "need to alias")
945	/*
946	* All the code makes assumptions on these aliasing,
947	* so make sure we fail the build if we ever ever ever break them.
948	*/
949	knote_assert_aliases(ident, `0`, kei_ident);
950	#ifdef __LITTLE_ENDIAN__
951	knote_assert_aliases(filter, `0`, kei_filter); // non trivial overlap
952	knote_assert_aliases(filter, `1`, kei_filtid); // non trivial overlap
953	#else
954	knote_assert_aliases(filter, `0`, kei_filtid); // non trivial overlap
955	knote_assert_aliases(filter, `1`, kei_filter); // non trivial overlap
956	#endif
957	knote_assert_aliases(flags, `0`, kei_flags);
958	knote_assert_aliases(qos, `0`, kei_qos);
959	knote_assert_aliases(udata, `0`, kei_udata);
960	knote_assert_aliases(fflags, `0`, kei_fflags);
961	knote_assert_aliases(xflags, `0`, kei_sfflags); // non trivial overlap
962	knote_assert_aliases(data, `0`, kei_sdata); // non trivial overlap
963	knote_assert_aliases(ext, `0`, kei_ext);
964	#undef knote_assert_aliases
965
966	/*
967	* Fix the differences between kevent_qos_s and kevent_internal_s:
968	* - xflags is where kn_sfflags lives, we need to zero it
969	* - fixup the high bits of `filter` where kn_filtid lives
970	*/
971	kev = (struct kevent_qos_s *)&kn->kn_kevent;
972	kev->xflags = `0`;
973	kev->filter \|= `0xff00`;
974	if (kn->kn_flags & EV_CLEAR) {
975	kn->kn_fflags = `0`;
976	}
977	}
978
979	/!*
980	* @function knote_fill_kevent
981	*
982	* @brief
983	* Fills in a kevent from the current content of a knote.
984	*
985	* @discussion
986	* This is meant to be called from filter's f_process hooks.
987	* The kevent data is filled with the passed in data.
988	*
989	* kn->kn_fflags is cleared if kn->kn_flags has EV_CLEAR set.
990	*/
991	OS_ALWAYS_INLINE
992	void
993	knote_fill_kevent(struct knote kn, struct* kevent_qos_s *kev, int64_t data)
994	{
995	knote_fill_kevent_with_sdata(kn, kev);
996	kev->filter = kn->kn_filter;
997	kev->data = data;
998	}
999
1000
1001	#pragma mark file_filtops
1002
1003	static int
1004	filt_fileattach(struct knote kn, struct* kevent_qos_s *kev)
1005	{
1006	return fo_kqfilter(fp: kn->kn_fp, kn, kev);
1007	}
1008
1009	SECURITY_READ_ONLY_EARLY(static struct filterops) file_filtops = {
1010	.f_isfd = `1`,
1011	.f_attach = filt_fileattach,
1012	};
1013
1014	#pragma mark kqread_filtops
1015
1016	#define f_flag fp_glob->fg_flag
1017	#define f_ops fp_glob->fg_ops
1018	#define f_lflags fp_glob->fg_lflags
1019
1020	static void
1021	filt_kqdetach(struct knote *kn)
1022	{
1023	struct kqfile kqf = (struct* kqfile *)fp_get_data(fp: kn->kn_fp);
1024	struct kqueue *kq = &kqf->kqf_kqueue;
1025
1026	kqlock(kqu: kq);
1027	KNOTE_DETACH(&kqf->kqf_sel.si_note, kn);
1028	kqunlock(kqu: kq);
1029	}
1030
1031	static int
1032	filt_kqueue(struct knote kn, __unused long* hint)
1033	{
1034	struct kqueue kq = (struct* kqueue *)fp_get_data(fp: kn->kn_fp);
1035
1036	return kq->kq_count > `0`;
1037	}
1038
1039	static int
1040	filt_kqtouch(struct knote kn, struct* kevent_qos_s *kev)
1041	{
1042	#pragma unused(kev)
1043	struct kqueue kq = (struct* kqueue *)fp_get_data(fp: kn->kn_fp);
1044	int res;
1045
1046	kqlock(kqu: kq);
1047	res = (kq->kq_count > `0`);
1048	kqunlock(kqu: kq);
1049
1050	return res;
1051	}
1052
1053	static int
1054	filt_kqprocess(struct knote kn, struct* kevent_qos_s *kev)
1055	{
1056	struct kqueue kq = (struct* kqueue *)fp_get_data(fp: kn->kn_fp);
1057	int res = `0`;
1058
1059	kqlock(kqu: kq);
1060	if (kq->kq_count) {
1061	knote_fill_kevent(kn, kev, data: kq->kq_count);
1062	res = `1`;
1063	}
1064	kqunlock(kqu: kq);
1065
1066	return res;
1067	}
1068
1069	SECURITY_READ_ONLY_EARLY(static struct filterops) kqread_filtops = {
1070	.f_isfd = `1`,
1071	.f_detach = filt_kqdetach,
1072	.f_event = filt_kqueue,
1073	.f_touch = filt_kqtouch,
1074	.f_process = filt_kqprocess,
1075	};
1076
1077	#pragma mark proc_filtops
1078
1079	static int
1080	filt_procattach(struct knote kn, __unused struct* kevent_qos_s *kev)
1081	{
1082	struct proc *p;
1083
1084	assert(PID_MAX < NOTE_PDATAMASK);
1085
1086	if ((kn->kn_sfflags & (NOTE_TRACK \| NOTE_TRACKERR \| NOTE_CHILD)) != `0`) {
1087	knote_set_error(kn, ENOTSUP);
1088	return `0`;
1089	}
1090
1091	p = proc_find(pid: (int)kn->kn_id);
1092	if (p == NULL) {
1093	knote_set_error(kn, ESRCH);
1094	return `0`;
1095	}
1096
1097	const uint32_t NoteExitStatusBits = NOTE_EXIT \| NOTE_EXITSTATUS;
1098
1099	if ((kn->kn_sfflags & NoteExitStatusBits) == NoteExitStatusBits) {
1100	do {
1101	pid_t selfpid = proc_selfpid();
1102
1103	if (p->p_ppid == selfpid) {
1104	break; / parent => ok /
1105	}
1106	if ((p->p_lflag & P_LTRACED) != `0` &&
1107	(p->p_oppid == selfpid)) {
1108	break; / parent-in-waiting => ok /
1109	}
1110	if (cansignal(current_proc(), kauth_cred_get(), p, SIGKILL)) {
1111	break; / allowed to signal => ok /
1112	}
1113	proc_rele(p);
1114	knote_set_error(kn, EACCES);
1115	return `0`;
1116	} while (`0`);
1117	}
1118
1119	kn->kn_proc = p;
1120	kn->kn_flags \|= EV_CLEAR; / automatically set /
1121	kn->kn_sdata = `0`; / incoming data is ignored /
1122
1123	proc_klist_lock();
1124
1125	KNOTE_ATTACH(&p->p_klist, kn);
1126
1127	proc_klist_unlock();
1128
1129	proc_rele(p);
1130
1131	/*
1132	* only captures edge-triggered events after this point
1133	* so it can't already be fired.
1134	*/
1135	return `0`;
1136	}
1137
1138
1139	/*
1140	* The knote may be attached to a different process, which may exit,
1141	* leaving nothing for the knote to be attached to. In that case,
1142	* the pointer to the process will have already been nulled out.
1143	*/
1144	static void
1145	filt_procdetach(struct knote *kn)
1146	{
1147	struct proc *p;
1148
1149	proc_klist_lock();
1150
1151	p = kn->kn_proc;
1152	if (p != PROC_NULL) {
1153	kn->kn_proc = PROC_NULL;
1154	KNOTE_DETACH(&p->p_klist, kn);
1155	}
1156
1157	proc_klist_unlock();
1158	}
1159
1160	static int
1161	filt_procevent(struct knote kn, long* hint)
1162	{
1163	u_int event;
1164
1165	/ ALWAYS CALLED WITH proc_klist_lock /
1166
1167	/*
1168	* Note: a lot of bits in hint may be obtained from the knote
1169	* To free some of those bits, see <rdar://problem/12592988> Freeing up
1170	* bits in hint for filt_procevent
1171	*
1172	* mask off extra data
1173	*/
1174	event = (u_int)hint & NOTE_PCTRLMASK;
1175
1176	/*
1177	* termination lifecycle events can happen while a debugger
1178	* has reparented a process, in which case notifications
1179	* should be quashed except to the tracing parent. When
1180	* the debugger reaps the child (either via wait4(2) or
1181	* process exit), the child will be reparented to the original
1182	* parent and these knotes re-fired.
1183	*/
1184	if (event & NOTE_EXIT) {
1185	if ((kn->kn_proc->p_oppid != `0`)
1186	&& (proc_getpid(knote_get_kq(kn)->kq_p) != kn->kn_proc->p_ppid)) {
1187	/*
1188	* This knote is not for the current ptrace(2) parent, ignore.
1189	*/
1190	return `0`;
1191	}
1192	}
1193
1194	/*
1195	* if the user is interested in this event, record it.
1196	*/
1197	if (kn->kn_sfflags & event) {
1198	kn->kn_fflags \|= event;
1199	}
1200
1201	#pragma clang diagnostic push
1202	#pragma clang diagnostic ignored "-Wdeprecated-declarations"
1203	if ((event == NOTE_REAP) \|\| ((event == NOTE_EXIT) && !(kn->kn_sfflags & NOTE_REAP))) {
1204	kn->kn_flags \|= (EV_EOF \| EV_ONESHOT);
1205	}
1206	#pragma clang diagnostic pop
1207
1208
1209	/*
1210	* The kernel has a wrapper in place that returns the same data
1211	* as is collected here, in kn_hook32. Any changes to how
1212	* NOTE_EXITSTATUS and NOTE_EXIT_DETAIL are collected
1213	* should also be reflected in the proc_pidnoteexit() wrapper.
1214	*/
1215	if (event == NOTE_EXIT) {
1216	kn->kn_hook32 = `0`;
1217	if ((kn->kn_sfflags & NOTE_EXITSTATUS) != `0`) {
1218	kn->kn_fflags \|= NOTE_EXITSTATUS;
1219	kn->kn_hook32 \|= (hint & NOTE_PDATAMASK);
1220	}
1221	if ((kn->kn_sfflags & NOTE_EXIT_DETAIL) != `0`) {
1222	kn->kn_fflags \|= NOTE_EXIT_DETAIL;
1223	if ((kn->kn_proc->p_lflag &
1224	P_LTERM_DECRYPTFAIL) != `0`) {
1225	kn->kn_hook32 \|= NOTE_EXIT_DECRYPTFAIL;
1226	}
1227	if ((kn->kn_proc->p_lflag &
1228	P_LTERM_JETSAM) != `0`) {
1229	kn->kn_hook32 \|= NOTE_EXIT_MEMORY;
1230	switch (kn->kn_proc->p_lflag & P_JETSAM_MASK) {
1231	case P_JETSAM_VMPAGESHORTAGE:
1232	kn->kn_hook32 \|= NOTE_EXIT_MEMORY_VMPAGESHORTAGE;
1233	break;
1234	case P_JETSAM_VMTHRASHING:
1235	kn->kn_hook32 \|= NOTE_EXIT_MEMORY_VMTHRASHING;
1236	break;
1237	case P_JETSAM_FCTHRASHING:
1238	kn->kn_hook32 \|= NOTE_EXIT_MEMORY_FCTHRASHING;
1239	break;
1240	case P_JETSAM_VNODE:
1241	kn->kn_hook32 \|= NOTE_EXIT_MEMORY_VNODE;
1242	break;
1243	case P_JETSAM_HIWAT:
1244	kn->kn_hook32 \|= NOTE_EXIT_MEMORY_HIWAT;
1245	break;
1246	case P_JETSAM_PID:
1247	kn->kn_hook32 \|= NOTE_EXIT_MEMORY_PID;
1248	break;
1249	case P_JETSAM_IDLEEXIT:
1250	kn->kn_hook32 \|= NOTE_EXIT_MEMORY_IDLE;
1251	break;
1252	}
1253	}
1254	if ((proc_getcsflags(kn->kn_proc) &
1255	CS_KILLED) != `0`) {
1256	kn->kn_hook32 \|= NOTE_EXIT_CSERROR;
1257	}
1258	}
1259	}
1260
1261	/ if we have any matching state, activate the knote /
1262	return kn->kn_fflags != `0`;
1263	}
1264
1265	static int
1266	filt_proctouch(struct knote kn, struct* kevent_qos_s *kev)
1267	{
1268	int res;
1269
1270	proc_klist_lock();
1271
1272	/ accept new filter flags and mask off output events no long interesting /
1273	kn->kn_sfflags = kev->fflags;
1274
1275	/ restrict the current results to the (smaller?) set of new interest /
1276	/*
1277	* For compatibility with previous implementations, we leave kn_fflags
1278	* as they were before.
1279	*/
1280	//kn->kn_fflags &= kn->kn_sfflags;
1281
1282	res = (kn->kn_fflags != `0`);
1283
1284	proc_klist_unlock();
1285
1286	return res;
1287	}
1288
1289	static int
1290	filt_procprocess(struct knote kn, struct* kevent_qos_s *kev)
1291	{
1292	int res = `0`;
1293
1294	proc_klist_lock();
1295	if (kn->kn_fflags) {
1296	knote_fill_kevent(kn, kev, data: kn->kn_hook32);
1297	kn->kn_hook32 = `0`;
1298	res = `1`;
1299	}
1300	proc_klist_unlock();
1301	return res;
1302	}
1303
1304	SECURITY_READ_ONLY_EARLY(static struct filterops) proc_filtops = {
1305	.f_attach = filt_procattach,
1306	.f_detach = filt_procdetach,
1307	.f_event = filt_procevent,
1308	.f_touch = filt_proctouch,
1309	.f_process = filt_procprocess,
1310	};
1311
1312	#pragma mark timer_filtops
1313
1314	struct filt_timer_params {
1315	uint64_t deadline; / deadline in abs/cont time*
1316	* (or 0 if NOTE_ABSOLUTE and deadline is in past) */
1317	uint64_t leeway; / leeway in abstime, or 0 if none /
1318	uint64_t interval; / interval in abstime or 0 if non-repeating timer /
1319	};
1320
1321	/*
1322	* Values stored in the knote at rest (using Mach absolute time units)
1323	*
1324	* kn->kn_thcall where the thread_call object is stored
1325	* kn->kn_ext[0] next deadline or 0 if immediate expiration
1326	* kn->kn_ext[1] leeway value
1327	* kn->kn_sdata interval timer: the interval
1328	* absolute/deadline timer: 0
1329	* kn->kn_hook32 timer state (with gencount)
1330	*
1331	* TIMER_IDLE:
1332	* The timer has either never been scheduled or been cancelled.
1333	* It is safe to schedule a new one in this state.
1334	*
1335	* TIMER_ARMED:
1336	* The timer has been scheduled
1337	*
1338	* TIMER_FIRED
1339	* The timer has fired and an event needs to be delivered.
1340	* When in this state, the callout may still be running.
1341	*
1342	* TIMER_IMMEDIATE
1343	* The timer has fired at registration time, and the callout was never
1344	* dispatched.
1345	*/
1346	#define TIMER_IDLE 0x0
1347	#define TIMER_ARMED 0x1
1348	#define TIMER_FIRED 0x2
1349	#define TIMER_IMMEDIATE 0x3
1350	#define TIMER_STATE_MASK 0x3
1351	#define TIMER_GEN_INC 0x4
1352
1353	static void
1354	filt_timer_set_params(struct knote kn, struct* filt_timer_params *params)
1355	{
1356	kn->kn_ext[`0`] = params->deadline;
1357	kn->kn_ext[`1`] = params->leeway;
1358	kn->kn_sdata = params->interval;
1359	}
1360
1361	/*
1362	* filt_timervalidate - process data from user
1363	*
1364	* Sets up the deadline, interval, and leeway from the provided user data
1365	*
1366	* Input:
1367	* kn_sdata timer deadline or interval time
1368	* kn_sfflags style of timer, unit of measurement
1369	*
1370	* Output:
1371	* struct filter_timer_params to apply to the filter with
1372	* filt_timer_set_params when changes are ready to be commited.
1373	*
1374	* Returns:
1375	* EINVAL Invalid user data parameters
1376	* ERANGE Various overflows with the parameters
1377	*
1378	* Called with timer filter lock held.
1379	*/
1380	static int
1381	filt_timervalidate(const struct kevent_qos_s *kev,
1382	struct filt_timer_params *params)
1383	{
1384	/*
1385	* There are 5 knobs that need to be chosen for a timer registration:
1386	*
1387	* A) Units of time (what is the time duration of the specified number)
1388	* Absolute and interval take:
1389	* NOTE_SECONDS, NOTE_USECONDS, NOTE_NSECONDS, NOTE_MACHTIME
1390	* Defaults to milliseconds if not specified
1391	*
1392	* B) Clock epoch (what is the zero point of the specified number)
1393	* For interval, there is none
1394	* For absolute, defaults to the gettimeofday/calendar epoch
1395	* With NOTE_MACHTIME, uses mach_absolute_time()
1396	* With NOTE_MACHTIME and NOTE_MACH_CONTINUOUS_TIME, uses mach_continuous_time()
1397	*
1398	* C) The knote's behavior on delivery
1399	* Interval timer causes the knote to arm for the next interval unless one-shot is set
1400	* Absolute is a forced one-shot timer which deletes on delivery
1401	* TODO: Add a way for absolute to be not forced one-shot
1402	*
1403	* D) Whether the time duration is relative to now or absolute
1404	* Interval fires at now + duration when it is set up
1405	* Absolute fires at now + difference between now walltime and passed in walltime
1406	* With NOTE_MACHTIME it fires at an absolute MAT or MCT.
1407	*
1408	* E) Whether the timer continues to tick across sleep
1409	* By default all three do not.
1410	* For interval and absolute, NOTE_MACH_CONTINUOUS_TIME causes them to tick across sleep
1411	* With NOTE_ABSOLUTE \| NOTE_MACHTIME \| NOTE_MACH_CONTINUOUS_TIME:
1412	* expires when mach_continuous_time() is > the passed in value.
1413	*/
1414
1415	uint64_t multiplier;
1416
1417	boolean_t use_abstime = FALSE;
1418
1419	switch (kev->fflags & (NOTE_SECONDS \| NOTE_USECONDS \| NOTE_NSECONDS \| NOTE_MACHTIME)) {
1420	case NOTE_SECONDS:
1421	multiplier = NSEC_PER_SEC;
1422	break;
1423	case NOTE_USECONDS:
1424	multiplier = NSEC_PER_USEC;
1425	break;
1426	case NOTE_NSECONDS:
1427	multiplier = `1`;
1428	break;
1429	case NOTE_MACHTIME:
1430	multiplier = `0`;
1431	use_abstime = TRUE;
1432	break;
1433	case `0`: / milliseconds (default) /
1434	multiplier = NSEC_PER_SEC / `1000`;
1435	break;
1436	default:
1437	return EINVAL;
1438	}
1439
1440	/ transform the leeway in kn_ext[1] to same time scale /
1441	if (kev->fflags & NOTE_LEEWAY) {
1442	uint64_t leeway_abs;
1443
1444	if (use_abstime) {
1445	leeway_abs = (uint64_t)kev->ext[`1`];
1446	} else {
1447	uint64_t leeway_ns;
1448	if (os_mul_overflow((uint64_t)kev->ext[`1`], multiplier, &leeway_ns)) {
1449	return ERANGE;
1450	}
1451
1452	nanoseconds_to_absolutetime(nanoseconds: leeway_ns, result: &leeway_abs);
1453	}
1454
1455	params->leeway = leeway_abs;
1456	} else {
1457	params->leeway = `0`;
1458	}
1459
1460	if (kev->fflags & NOTE_ABSOLUTE) {
1461	uint64_t deadline_abs;
1462
1463	if (use_abstime) {
1464	deadline_abs = (uint64_t)kev->data;
1465	} else {
1466	uint64_t calendar_deadline_ns;
1467
1468	if (os_mul_overflow((uint64_t)kev->data, multiplier, &calendar_deadline_ns)) {
1469	return ERANGE;
1470	}
1471
1472	/ calendar_deadline_ns is in nanoseconds since the epoch /
1473
1474	clock_sec_t seconds;
1475	clock_nsec_t nanoseconds;
1476
1477	/*
1478	* Note that the conversion through wall-time is only done once.
1479	*
1480	* If the relationship between MAT and gettimeofday changes,
1481	* the underlying timer does not update.
1482	*
1483	* TODO: build a wall-time denominated timer_call queue
1484	* and a flag to request DTRTing with wall-time timers
1485	*/
1486	clock_get_calendar_nanotime(secs: &seconds, nanosecs: &nanoseconds);
1487
1488	uint64_t calendar_now_ns = (uint64_t)seconds * NSEC_PER_SEC + nanoseconds;
1489
1490	/ if deadline is in the future /
1491	if (calendar_now_ns < calendar_deadline_ns) {
1492	uint64_t interval_ns = calendar_deadline_ns - calendar_now_ns;
1493	uint64_t interval_abs;
1494
1495	nanoseconds_to_absolutetime(nanoseconds: interval_ns, result: &interval_abs);
1496
1497	/*
1498	* Note that the NOTE_MACH_CONTINUOUS_TIME flag here only
1499	* causes the timer to keep ticking across sleep, but
1500	* it does not change the calendar timebase.
1501	*/
1502
1503	if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME) {
1504	clock_continuoustime_interval_to_deadline(abstime: interval_abs,
1505	result: &deadline_abs);
1506	} else {
1507	clock_absolutetime_interval_to_deadline(abstime: interval_abs,
1508	result: &deadline_abs);
1509	}
1510	} else {
1511	deadline_abs = `0`; / cause immediate expiration /
1512	}
1513	}
1514
1515	params->deadline = deadline_abs;
1516	params->interval = `0`; / NOTE_ABSOLUTE is non-repeating /
1517	} else if (kev->data < `0`) {
1518	/*
1519	* Negative interval timers fire immediately, once.
1520	*
1521	* Ideally a negative interval would be an error, but certain clients
1522	* pass negative values on accident, and expect an event back.
1523	*
1524	* In the old implementation the timer would repeat with no delay
1525	* N times until mach_absolute_time() + (N * interval) underflowed,
1526	* then it would wait ~forever by accidentally arming a timer for the far future.
1527	*
1528	* We now skip the power-wasting hot spin phase and go straight to the idle phase.
1529	*/
1530
1531	params->deadline = `0`; / expire immediately /
1532	params->interval = `0`; / non-repeating /
1533	} else {
1534	uint64_t interval_abs = `0`;
1535
1536	if (use_abstime) {
1537	interval_abs = (uint64_t)kev->data;
1538	} else {
1539	uint64_t interval_ns;
1540	if (os_mul_overflow((uint64_t)kev->data, multiplier, &interval_ns)) {
1541	return ERANGE;
1542	}
1543
1544	nanoseconds_to_absolutetime(nanoseconds: interval_ns, result: &interval_abs);
1545	}
1546
1547	uint64_t deadline = `0`;
1548
1549	if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME) {
1550	clock_continuoustime_interval_to_deadline(abstime: interval_abs, result: &deadline);
1551	} else {
1552	clock_absolutetime_interval_to_deadline(abstime: interval_abs, result: &deadline);
1553	}
1554
1555	params->deadline = deadline;
1556	params->interval = interval_abs;
1557	}
1558
1559	return `0`;
1560	}
1561
1562	/*
1563	* filt_timerexpire - the timer callout routine
1564	*/
1565	static void
1566	filt_timerexpire(void knx, void* *state_on_arm)
1567	{
1568	struct knote *kn = knx;
1569
1570	uint32_t state = (uint32_t)(uintptr_t)state_on_arm;
1571	uint32_t fired_state = state ^ TIMER_ARMED ^ TIMER_FIRED;
1572
1573	if (os_atomic_cmpxchg(&kn->kn_hook32, state, fired_state, relaxed)) {
1574	// our f_event always would say FILTER_ACTIVE,
1575	// so be leaner and just do it.
1576	struct kqueue *kq = knote_get_kq(kn);
1577	kqlock(kqu: kq);
1578	knote_activate(kqu: kq, kn, FILTER_ACTIVE);
1579	kqunlock(kqu: kq);
1580	} else {
1581	/*
1582	* The timer has been reprogrammed or canceled since it was armed,
1583	* and this is a late firing for the timer, just ignore it.
1584	*/
1585	}
1586	}
1587
1588	/*
1589	* Does this deadline needs a timer armed for it, or has it expired?
1590	*/
1591	static bool
1592	filt_timer_is_ready(struct knote *kn)
1593	{
1594	uint64_t now, deadline = kn->kn_ext[`0`];
1595
1596	if (deadline == `0`) {
1597	return true;
1598	}
1599
1600	if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
1601	now = mach_continuous_time();
1602	} else {
1603	now = mach_absolute_time();
1604	}
1605	return deadline <= now;
1606	}
1607
1608	/*
1609	* Arm a timer
1610	*
1611	* It is the responsibility of the caller to make sure the timer call
1612	* has completed or been cancelled properly prior to arming it.
1613	*/
1614	static void
1615	filt_timerarm(struct knote *kn)
1616	{
1617	uint64_t deadline = kn->kn_ext[`0`];
1618	uint64_t leeway = kn->kn_ext[`1`];
1619	uint32_t state;
1620
1621	int filter_flags = kn->kn_sfflags;
1622	unsigned int timer_flags = `0`;
1623
1624	if (filter_flags & NOTE_CRITICAL) {
1625	timer_flags \|= THREAD_CALL_DELAY_USER_CRITICAL;
1626	} else if (filter_flags & NOTE_BACKGROUND) {
1627	timer_flags \|= THREAD_CALL_DELAY_USER_BACKGROUND;
1628	} else {
1629	timer_flags \|= THREAD_CALL_DELAY_USER_NORMAL;
1630	}
1631
1632	if (filter_flags & NOTE_LEEWAY) {
1633	timer_flags \|= THREAD_CALL_DELAY_LEEWAY;
1634	}
1635
1636	if (filter_flags & NOTE_MACH_CONTINUOUS_TIME) {
1637	timer_flags \|= THREAD_CALL_CONTINUOUS;
1638	}
1639
1640	/*
1641	* Move to ARMED.
1642	*
1643	* We increase the gencount, and setup the thread call with this expected
1644	* state. It means that if there was a previous generation of the timer in
1645	* flight that needs to be ignored, then 3 things are possible:
1646	*
1647	* - the timer fires first, filt_timerexpire() and sets the state to FIRED
1648	* but we clobber it with ARMED and a new gencount. The knote will still
1649	* be activated, but filt_timerprocess() which is serialized with this
1650	* call will not see the FIRED bit set and will not deliver an event.
1651	*
1652	* - this code runs first, but filt_timerexpire() comes second. Because it
1653	* knows an old gencount, it will debounce and not activate the knote.
1654	*
1655	* - filt_timerexpire() wasn't in flight yet, and thread_call_enter below
1656	* will just cancel it properly.
1657	*
1658	* This is important as userspace expects to never be woken up for past
1659	* timers after filt_timertouch ran.
1660	*/
1661	state = os_atomic_load(&kn->kn_hook32, relaxed);
1662	state &= ~TIMER_STATE_MASK;
1663	state += TIMER_GEN_INC + TIMER_ARMED;
1664	os_atomic_store(&kn->kn_hook32, state, relaxed);
1665
1666	thread_call_enter_delayed_with_leeway(call: kn->kn_thcall,
1667	param1: (void *)(uintptr_t)state, deadline, leeway, flags: timer_flags);
1668	}
1669
1670	/*
1671	* Mark a timer as "already fired" when it is being reprogrammed
1672	*
1673	* If there is a timer in flight, this will do a best effort at canceling it,
1674	* but will not wait. If the thread call was in flight, having set the
1675	* TIMER_IMMEDIATE bit will debounce a filt_timerexpire() racing with this
1676	* cancelation.
1677	*/
1678	static void
1679	filt_timerfire_immediate(struct knote *kn)
1680	{
1681	uint32_t state;
1682
1683	static_assert(TIMER_IMMEDIATE == TIMER_STATE_MASK,
1684	"validate that this atomic or will transition to IMMEDIATE");
1685	state = os_atomic_or_orig(&kn->kn_hook32, TIMER_IMMEDIATE, relaxed);
1686
1687	if ((state & TIMER_STATE_MASK) == TIMER_ARMED) {
1688	thread_call_cancel(call: kn->kn_thcall);
1689	}
1690	}
1691
1692	/*
1693	* Allocate a thread call for the knote's lifetime, and kick off the timer.
1694	*/
1695	static int
1696	filt_timerattach(struct knote kn, struct* kevent_qos_s *kev)
1697	{
1698	thread_call_t callout;
1699	struct filt_timer_params params;
1700	int error;
1701
1702	if ((error = filt_timervalidate(kev, params: &params)) != `0`) {
1703	knote_set_error(kn, error);
1704	return `0`;
1705	}
1706
1707	callout = thread_call_allocate_with_options(func: filt_timerexpire,
1708	param0: (thread_call_param_t)kn, pri: THREAD_CALL_PRIORITY_HIGH,
1709	options: THREAD_CALL_OPTIONS_ONCE);
1710
1711	if (NULL == callout) {
1712	knote_set_error(kn, ENOMEM);
1713	return `0`;
1714	}
1715
1716	filt_timer_set_params(kn, params: &params);
1717	kn->kn_thcall = callout;
1718	kn->kn_flags \|= EV_CLEAR;
1719	os_atomic_store(&kn->kn_hook32, TIMER_IDLE, relaxed);
1720
1721	/ NOTE_ABSOLUTE implies EV_ONESHOT /
1722	if (kn->kn_sfflags & NOTE_ABSOLUTE) {
1723	kn->kn_flags \|= EV_ONESHOT;
1724	}
1725
1726	if (filt_timer_is_ready(kn)) {
1727	os_atomic_store(&kn->kn_hook32, TIMER_IMMEDIATE, relaxed);
1728	return FILTER_ACTIVE;
1729	} else {
1730	filt_timerarm(kn);
1731	return `0`;
1732	}
1733	}
1734
1735	/*
1736	* Shut down the timer if it's running, and free the callout.
1737	*/
1738	static void
1739	filt_timerdetach(struct knote *kn)
1740	{
1741	__assert_only boolean_t freed;
1742
1743	/*
1744	* Unconditionally cancel to make sure there can't be any filt_timerexpire()
1745	* running anymore.
1746	*/
1747	thread_call_cancel_wait(call: kn->kn_thcall);
1748	freed = thread_call_free(call: kn->kn_thcall);
1749	assert(freed);
1750	}
1751
1752	/*
1753	* filt_timertouch - update timer knote with new user input
1754	*
1755	* Cancel and restart the timer based on new user data. When
1756	* the user picks up a knote, clear the count of how many timer
1757	* pops have gone off (in kn_data).
1758	*/
1759	static int
1760	filt_timertouch(struct knote kn, struct* kevent_qos_s *kev)
1761	{
1762	struct filt_timer_params params;
1763	uint32_t changed_flags = (kn->kn_sfflags ^ kev->fflags);
1764	int error;
1765
1766	if (kev->qos && (knote_get_kq(kn)->kq_state & KQ_WORKLOOP) &&
1767	!_pthread_priority_thread_qos(pp: kev->qos)) {
1768	/ validate usage of FILTER_UPDATE_REQ_QOS /
1769	kev->flags \|= EV_ERROR;
1770	kev->data = ERANGE;
1771	return `0`;
1772	}
1773
1774	if (changed_flags & NOTE_ABSOLUTE) {
1775	kev->flags \|= EV_ERROR;
1776	kev->data = EINVAL;
1777	return `0`;
1778	}
1779
1780	if ((error = filt_timervalidate(kev, params: &params)) != `0`) {
1781	kev->flags \|= EV_ERROR;
1782	kev->data = error;
1783	return `0`;
1784	}
1785
1786	/ capture the new values used to compute deadline /
1787	filt_timer_set_params(kn, params: &params);
1788	kn->kn_sfflags = kev->fflags;
1789
1790	if (filt_timer_is_ready(kn)) {
1791	filt_timerfire_immediate(kn);
1792	return FILTER_ACTIVE \| FILTER_UPDATE_REQ_QOS;
1793	} else {
1794	filt_timerarm(kn);
1795	return FILTER_UPDATE_REQ_QOS;
1796	}
1797	}
1798
1799	/*
1800	* filt_timerprocess - query state of knote and snapshot event data
1801	*
1802	* Determine if the timer has fired in the past, snapshot the state
1803	* of the kevent for returning to user-space, and clear pending event
1804	* counters for the next time.
1805	*/
1806	static int
1807	filt_timerprocess(struct knote kn, struct* kevent_qos_s *kev)
1808	{
1809	uint32_t state = os_atomic_load(&kn->kn_hook32, relaxed);
1810
1811	/*
1812	* filt_timerprocess is serialized with any filter routine except for
1813	* filt_timerexpire which atomically does a TIMER_ARMED -> TIMER_FIRED
1814	* transition, and on success, activates the knote.
1815	*
1816	* Hence, we don't need atomic modifications of the state, only to peek at
1817	* whether we see any of the "FIRED" state, and if we do, it is safe to
1818	* do simple state machine transitions.
1819	*/
1820	switch (state & TIMER_STATE_MASK) {
1821	case TIMER_IDLE:
1822	case TIMER_ARMED:
1823	/*
1824	* This can happen if a touch resets a timer that had fired
1825	* without being processed
1826	*/
1827	return `0`;
1828	}
1829
1830	os_atomic_store(&kn->kn_hook32, state & ~TIMER_STATE_MASK, relaxed);
1831
1832	/*
1833	* Copy out the interesting kevent state,
1834	* but don't leak out the raw time calculations.
1835	*
1836	* TODO: potential enhancements - tell the user about:
1837	* - deadline to which this timer thought it was expiring
1838	* - return kn_sfflags in the fflags field so the client can know
1839	* under what flags the timer fired
1840	*/
1841	knote_fill_kevent(kn, kev, data: `1`);
1842	kev->ext[`0`] = `0`;
1843	/ kev->ext[1] = 0; JMM - shouldn't we hide this too? /
1844
1845	if (kn->kn_sdata != `0`) {
1846	/*
1847	* This is a 'repeating' timer, so we have to emit
1848	* how many intervals expired between the arm
1849	* and the process.
1850	*
1851	* A very strange style of interface, because
1852	* this could easily be done in the client...
1853	*/
1854
1855	uint64_t now;
1856
1857	if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
1858	now = mach_continuous_time();
1859	} else {
1860	now = mach_absolute_time();
1861	}
1862
1863	uint64_t first_deadline = kn->kn_ext[`0`];
1864	uint64_t interval_abs = kn->kn_sdata;
1865	uint64_t orig_arm_time = first_deadline - interval_abs;
1866
1867	assert(now > orig_arm_time);
1868	assert(now > first_deadline);
1869
1870	uint64_t elapsed = now - orig_arm_time;
1871
1872	uint64_t num_fired = elapsed / interval_abs;
1873
1874	/*
1875	* To reach this code, we must have seen the timer pop
1876	* and be in repeating mode, so therefore it must have been
1877	* more than 'interval' time since the attach or last
1878	* successful touch.
1879	*/
1880	assert(num_fired > `0`);
1881
1882	/ report how many intervals have elapsed to the user /
1883	kev->data = (int64_t)num_fired;
1884
1885	/ We only need to re-arm the timer if it's not about to be destroyed /
1886	if ((kn->kn_flags & EV_ONESHOT) == `0`) {
1887	/ fire at the end of the next interval /
1888	uint64_t new_deadline = first_deadline + num_fired * interval_abs;
1889
1890	assert(new_deadline > now);
1891
1892	kn->kn_ext[`0`] = new_deadline;
1893
1894	/*
1895	* This can't shortcut setting up the thread call, because
1896	* knote_process deactivates EV_CLEAR knotes unconditionnally.
1897	*/
1898	filt_timerarm(kn);
1899	}
1900	}
1901
1902	return FILTER_ACTIVE;
1903	}
1904
1905	SECURITY_READ_ONLY_EARLY(static struct filterops) timer_filtops = {
1906	.f_extended_codes = true,
1907	.f_attach = filt_timerattach,
1908	.f_detach = filt_timerdetach,
1909	.f_event = filt_bad_event,
1910	.f_touch = filt_timertouch,
1911	.f_process = filt_timerprocess,
1912	};
1913
1914	#pragma mark user_filtops
1915
1916	static int
1917	filt_userattach(struct knote kn, __unused struct* kevent_qos_s *kev)
1918	{
1919	if (kn->kn_sfflags & NOTE_TRIGGER) {
1920	kn->kn_hook32 = FILTER_ACTIVE;
1921	} else {
1922	kn->kn_hook32 = `0`;
1923	}
1924	return kn->kn_hook32;
1925	}
1926
1927	static int
1928	filt_usertouch(struct knote kn, struct* kevent_qos_s *kev)
1929	{
1930	uint32_t ffctrl;
1931	int fflags;
1932
1933	ffctrl = kev->fflags & NOTE_FFCTRLMASK;
1934	fflags = kev->fflags & NOTE_FFLAGSMASK;
1935	switch (ffctrl) {
1936	case NOTE_FFNOP:
1937	break;
1938	case NOTE_FFAND:
1939	kn->kn_sfflags &= fflags;
1940	break;
1941	case NOTE_FFOR:
1942	kn->kn_sfflags \|= fflags;
1943	break;
1944	case NOTE_FFCOPY:
1945	kn->kn_sfflags = fflags;
1946	break;
1947	}
1948	kn->kn_sdata = kev->data;
1949
1950	if (kev->fflags & NOTE_TRIGGER) {
1951	kn->kn_hook32 = FILTER_ACTIVE;
1952	}
1953	return (int)kn->kn_hook32;
1954	}
1955
1956	static int
1957	filt_userprocess(struct knote kn, struct* kevent_qos_s *kev)
1958	{
1959	int result = (int)kn->kn_hook32;
1960
1961	if (result) {
1962	/ EVFILT_USER returns the data that was passed in /
1963	knote_fill_kevent_with_sdata(kn, kev);
1964	kev->fflags = kn->kn_sfflags;
1965	if (kn->kn_flags & EV_CLEAR) {
1966	/ knote_fill_kevent cleared kn_fflags /
1967	kn->kn_hook32 = `0`;
1968	}
1969	}
1970
1971	return result;
1972	}
1973
1974	SECURITY_READ_ONLY_EARLY(static struct filterops) user_filtops = {
1975	.f_extended_codes = true,
1976	.f_attach = filt_userattach,
1977	.f_detach = filt_no_detach,
1978	.f_event = filt_bad_event,
1979	.f_touch = filt_usertouch,
1980	.f_process = filt_userprocess,
1981	};
1982
1983	#pragma mark workloop_filtops
1984
1985	#define EPREEMPTDISABLED (-1)
1986
1987	static inline void
1988	filt_wllock(struct kqworkloop *kqwl)
1989	{
1990	lck_spin_lock(lck: &kqwl->kqwl_statelock);
1991	}
1992
1993	static inline void
1994	filt_wlunlock(struct kqworkloop *kqwl)
1995	{
1996	lck_spin_unlock(lck: &kqwl->kqwl_statelock);
1997	}
1998
1999	/*
2000	* Returns true when the interlock for the turnstile is the workqueue lock
2001	*
2002	* When this is the case, all turnstiles operations are delegated
2003	* to the workqueue subsystem.
2004	*
2005	* This is required because kqueue_threadreq_bind_prepost only holds the
2006	* workqueue lock but needs to move the inheritor from the workloop turnstile
2007	* away from the creator thread, so that this now fulfilled request cannot be
2008	* picked anymore by other threads.
2009	*/
2010	static inline bool
2011	filt_wlturnstile_interlock_is_workq(struct kqworkloop *kqwl)
2012	{
2013	return kqr_thread_requested_pending(kqr: &kqwl->kqwl_request);
2014	}
2015
2016	static void
2017	filt_wlupdate_inheritor(struct kqworkloop kqwl, struct* turnstile *ts,
2018	turnstile_update_flags_t flags)
2019	{
2020	turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL;
2021	workq_threadreq_t kqr = &kqwl->kqwl_request;
2022
2023	/*
2024	* binding to the workq should always happen through
2025	* workq_kern_threadreq_update_inheritor()
2026	*/
2027	assert(!filt_wlturnstile_interlock_is_workq(kqwl));
2028
2029	if ((inheritor = kqwl->kqwl_owner)) {
2030	flags \|= TURNSTILE_INHERITOR_THREAD;
2031	} else if ((inheritor = kqr_thread(kqr))) {
2032	flags \|= TURNSTILE_INHERITOR_THREAD;
2033	}
2034
2035	turnstile_update_inheritor(turnstile: ts, new_inheritor: inheritor, flags);
2036	}
2037
2038	#define EVFILT_WORKLOOP_EFAULT_RETRY_COUNT 100
2039	#define FILT_WLATTACH 0
2040	#define FILT_WLTOUCH 1
2041	#define FILT_WLDROP 2
2042
2043	__result_use_check
2044	static int
2045	filt_wlupdate(struct kqworkloop kqwl, struct* knote *kn,
2046	struct kevent_qos_s kev, kq_index_t qos_index, int* op)
2047	{
2048	user_addr_t uaddr = CAST_USER_ADDR_T(kev->ext[EV_EXTIDX_WL_ADDR]);
2049	workq_threadreq_t kqr = &kqwl->kqwl_request;
2050	thread_t cur_owner, new_owner, extra_thread_ref = THREAD_NULL;
2051	kq_index_t cur_override = THREAD_QOS_UNSPECIFIED;
2052	int efault_retry = EVFILT_WORKLOOP_EFAULT_RETRY_COUNT;
2053	int action = KQWL_UTQ_NONE, error = `0`;
2054	bool wl_inheritor_updated = false, needs_wake = false;
2055	uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
2056	uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK];
2057	uint64_t udata = `0`;
2058	struct turnstile *ts = TURNSTILE_NULL;
2059
2060	filt_wllock(kqwl);
2061
2062	again:
2063	new_owner = cur_owner = kqwl->kqwl_owner;
2064
2065	/*
2066	* Phase 1:
2067	*
2068	* If asked, load the uint64 value at the user provided address and compare
2069	* it against the passed in mask and expected value.
2070	*
2071	* If NOTE_WL_DISCOVER_OWNER is specified, translate the loaded name as
2072	* a thread reference.
2073	*
2074	* If NOTE_WL_END_OWNERSHIP is specified and the currently known owner is
2075	* the current thread, then end ownership.
2076	*
2077	* Lastly decide whether we need to perform a QoS update.
2078	*/
2079	if (uaddr) {
2080	/*
2081	* Until <rdar://problem/24999882> exists,
2082	* disabling preemption copyin forces any
2083	* vm_fault we encounter to fail.
2084	*/
2085	error = copyin_atomic64(user_addr: uaddr, u64: &udata);
2086
2087	/*
2088	* If we get EFAULT, drop locks, and retry.
2089	* If we still get an error report it,
2090	* else assume the memory has been faulted
2091	* and attempt to copyin under lock again.
2092	*/
2093	switch (error) {
2094	case `0`:
2095	break;
2096	case EFAULT:
2097	if (efault_retry-- > `0`) {
2098	filt_wlunlock(kqwl);
2099	error = copyin_atomic64(user_addr: uaddr, u64: &udata);
2100	filt_wllock(kqwl);
2101	if (error == `0`) {
2102	goto again;
2103	}
2104	}
2105	OS_FALLTHROUGH;
2106	default:
2107	goto out;
2108	}
2109
2110	/ Update state as copied in. /
2111	kev->ext[EV_EXTIDX_WL_VALUE] = udata;
2112
2113	if ((udata & mask) != (kdata & mask)) {
2114	error = ESTALE;
2115	} else if (kev->fflags & NOTE_WL_DISCOVER_OWNER) {
2116	/*
2117	* Decipher the owner port name, and translate accordingly.
2118	* The low 2 bits were borrowed for other flags, so mask them off.
2119	*
2120	* Then attempt translation to a thread reference or fail.
2121	*/
2122	mach_port_name_t name = (mach_port_name_t)udata & ~`0x3`;
2123	if (name != MACH_PORT_NULL) {
2124	name = ipc_entry_name_mask(name);
2125	extra_thread_ref = port_name_to_thread(port_name: name,
2126	options: PORT_INTRANS_THREAD_IN_CURRENT_TASK);
2127	if (extra_thread_ref == THREAD_NULL) {
2128	error = EOWNERDEAD;
2129	goto out;
2130	}
2131	new_owner = extra_thread_ref;
2132	}
2133	}
2134	}
2135
2136	if ((kev->fflags & NOTE_WL_END_OWNERSHIP) && new_owner == current_thread()) {
2137	new_owner = THREAD_NULL;
2138	}
2139
2140	if (error == `0`) {
2141	if ((kev->fflags & NOTE_WL_THREAD_REQUEST) && (kev->flags & EV_DELETE)) {
2142	action = KQWL_UTQ_SET_QOS_INDEX;
2143	} else if (qos_index && kqr->tr_kq_qos_index != qos_index) {
2144	action = KQWL_UTQ_SET_QOS_INDEX;
2145	}
2146
2147	if (op == FILT_WLTOUCH) {
2148	/*
2149	* Save off any additional fflags/data we just accepted
2150	* But only keep the last round of "update" bits we acted on which helps
2151	* debugging a lot.
2152	*/
2153	kn->kn_sfflags &= ~NOTE_WL_UPDATES_MASK;
2154	kn->kn_sfflags \|= kev->fflags;
2155	if (kev->fflags & NOTE_WL_SYNC_WAKE) {
2156	needs_wake = (kn->kn_thread != THREAD_NULL);
2157	}
2158	} else if (op == FILT_WLDROP) {
2159	if ((kn->kn_sfflags & (NOTE_WL_SYNC_WAIT \| NOTE_WL_SYNC_WAKE)) ==
2160	NOTE_WL_SYNC_WAIT) {
2161	/*
2162	* When deleting a SYNC_WAIT knote that hasn't been woken up
2163	* explicitly, issue a wake up.
2164	*/
2165	kn->kn_sfflags \|= NOTE_WL_SYNC_WAKE;
2166	needs_wake = (kn->kn_thread != THREAD_NULL);
2167	}
2168	}
2169	}
2170
2171	/*
2172	* Phase 2:
2173	*
2174	* Commit ownership and QoS changes if any, possibly wake up waiters
2175	*/
2176
2177	if (cur_owner == new_owner && action == KQWL_UTQ_NONE && !needs_wake) {
2178	goto out;
2179	}
2180
2181	kqlock(kqu: kqwl);
2182
2183	/ If already tracked as servicer, don't track as owner /
2184	if (new_owner == kqr_thread(kqr)) {
2185	new_owner = THREAD_NULL;
2186	}
2187
2188	if (cur_owner != new_owner) {
2189	kqwl->kqwl_owner = new_owner;
2190	if (new_owner == extra_thread_ref) {
2191	/ we just transfered this ref to kqwl_owner /
2192	extra_thread_ref = THREAD_NULL;
2193	}
2194	cur_override = kqworkloop_override(kqwl);
2195
2196	if (new_owner) {
2197	/ override it before we drop the old /
2198	if (cur_override != THREAD_QOS_UNSPECIFIED) {
2199	thread_add_kevent_override(thread: new_owner, qos_override: cur_override);
2200	}
2201	if (kqr_thread_requested_pending(kqr)) {
2202	if (action == KQWL_UTQ_NONE) {
2203	action = KQWL_UTQ_REDRIVE_EVENTS;
2204	}
2205	}
2206	} else if (action == KQWL_UTQ_NONE &&
2207	!kqr_thread_requested(kqr) &&
2208	kqwl->kqwl_wakeup_qos) {
2209	action = KQWL_UTQ_REDRIVE_EVENTS;
2210	}
2211	}
2212
2213	if (action != KQWL_UTQ_NONE) {
2214	kqworkloop_update_threads_qos(kqwl, op: action, qos: qos_index);
2215	}
2216
2217	ts = kqwl->kqwl_turnstile;
2218	if (cur_owner != new_owner && ts) {
2219	if (action == KQWL_UTQ_REDRIVE_EVENTS) {
2220	/*
2221	* Note that when action is KQWL_UTQ_REDRIVE_EVENTS,
2222	* the code went through workq_kern_threadreq_initiate()
2223	* and the workqueue has set the inheritor already
2224	*/
2225	assert(filt_wlturnstile_interlock_is_workq(kqwl));
2226	} else if (filt_wlturnstile_interlock_is_workq(kqwl)) {
2227	workq_kern_threadreq_lock(p: kqwl->kqwl_p);
2228	workq_kern_threadreq_update_inheritor(p: kqwl->kqwl_p, kqr, owner: new_owner,
2229	ts, flags: TURNSTILE_IMMEDIATE_UPDATE);
2230	workq_kern_threadreq_unlock(p: kqwl->kqwl_p);
2231	if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
2232	/*
2233	* If the workq is no longer the interlock, then
2234	* workq_kern_threadreq_update_inheritor() has finished a bind
2235	* and we need to fallback to the regular path.
2236	*/
2237	filt_wlupdate_inheritor(kqwl, ts, flags: TURNSTILE_IMMEDIATE_UPDATE);
2238	}
2239	wl_inheritor_updated = true;
2240	} else {
2241	filt_wlupdate_inheritor(kqwl, ts, flags: TURNSTILE_IMMEDIATE_UPDATE);
2242	wl_inheritor_updated = true;
2243	}
2244
2245	/*
2246	* We need a turnstile reference because we are dropping the interlock
2247	* and the caller has not called turnstile_prepare.
2248	*/
2249	if (wl_inheritor_updated) {
2250	turnstile_reference(turnstile: ts);
2251	}
2252	}
2253
2254	if (needs_wake && ts) {
2255	waitq_wakeup64_thread(waitq: &ts->ts_waitq, wake_event: knote_filt_wev64(kn),
2256	thread: kn->kn_thread, THREAD_AWAKENED);
2257	if (op == FILT_WLATTACH \|\| op == FILT_WLTOUCH) {
2258	disable_preemption();
2259	error = EPREEMPTDISABLED;
2260	}
2261	}
2262
2263	kqunlock(kqu: kqwl);
2264
2265	out:
2266	/*
2267	* Phase 3:
2268	*
2269	* Unlock and cleanup various lingering references and things.
2270	*/
2271	filt_wlunlock(kqwl);
2272
2273	#if CONFIG_WORKLOOP_DEBUG
2274	KQWL_HISTORY_WRITE_ENTRY(kqwl, {
2275	.updater = current_thread(),
2276	.servicer = kqr_thread(kqr), / Note: racy /
2277	.old_owner = cur_owner,
2278	.new_owner = new_owner,
2279
2280	.kev_ident = kev->ident,
2281	.error = (int16_t)error,
2282	.kev_flags = kev->flags,
2283	.kev_fflags = kev->fflags,
2284
2285	.kev_mask = mask,
2286	.kev_value = kdata,
2287	.in_value = udata,
2288	});
2289	#endif // CONFIG_WORKLOOP_DEBUG
2290
2291	if (wl_inheritor_updated) {
2292	turnstile_update_inheritor_complete(turnstile: ts, flags: TURNSTILE_INTERLOCK_NOT_HELD);
2293	turnstile_deallocate_safe(turnstile: ts);
2294	}
2295
2296	if (cur_owner && new_owner != cur_owner) {
2297	if (cur_override != THREAD_QOS_UNSPECIFIED) {
2298	thread_drop_kevent_override(thread: cur_owner);
2299	}
2300	thread_deallocate_safe(thread: cur_owner);
2301	}
2302	if (extra_thread_ref) {
2303	thread_deallocate_safe(thread: extra_thread_ref);
2304	}
2305	return error;
2306	}
2307
2308	/*
2309	* Remembers the last updated that came in from userspace for debugging reasons.
2310	* - fflags is mirrored from the userspace kevent
2311	* - ext[i, i != VALUE] is mirrored from the userspace kevent
2312	* - ext[VALUE] is set to what the kernel loaded atomically
2313	* - data is set to the error if any
2314	*/
2315	static inline void
2316	filt_wlremember_last_update(struct knote kn, struct* kevent_qos_s *kev,
2317	int error)
2318	{
2319	kn->kn_fflags = kev->fflags;
2320	kn->kn_sdata = error;
2321	memcpy(dst: kn->kn_ext, src: kev->ext, n: sizeof(kev->ext));
2322	}
2323
2324	static int
2325	filt_wlupdate_sync_ipc(struct kqworkloop kqwl, struct* knote *kn,
2326	struct kevent_qos_s kev, int* op)
2327	{
2328	user_addr_t uaddr = (user_addr_t) kev->ext[EV_EXTIDX_WL_ADDR];
2329	uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
2330	uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK];
2331	uint64_t udata = `0`;
2332	int efault_retry = EVFILT_WORKLOOP_EFAULT_RETRY_COUNT;
2333	int error = `0`;
2334
2335	if (op == FILT_WLATTACH) {
2336	(void)kqueue_alloc_turnstile(&kqwl->kqwl_kqueue);
2337	} else if (uaddr == `0`) {
2338	return `0`;
2339	}
2340
2341	filt_wllock(kqwl);
2342
2343	again:
2344
2345	/*
2346	* Do the debounce thing, the lock serializing the state is the knote lock.
2347	*/
2348	if (uaddr) {
2349	/*
2350	* Until <rdar://problem/24999882> exists,
2351	* disabling preemption copyin forces any
2352	* vm_fault we encounter to fail.
2353	*/
2354	error = copyin_atomic64(user_addr: uaddr, u64: &udata);
2355
2356	/*
2357	* If we get EFAULT, drop locks, and retry.
2358	* If we still get an error report it,
2359	* else assume the memory has been faulted
2360	* and attempt to copyin under lock again.
2361	*/
2362	switch (error) {
2363	case `0`:
2364	break;
2365	case EFAULT:
2366	if (efault_retry-- > `0`) {
2367	filt_wlunlock(kqwl);
2368	error = copyin_atomic64(user_addr: uaddr, u64: &udata);
2369	filt_wllock(kqwl);
2370	if (error == `0`) {
2371	goto again;
2372	}
2373	}
2374	OS_FALLTHROUGH;
2375	default:
2376	goto out;
2377	}
2378
2379	kev->ext[EV_EXTIDX_WL_VALUE] = udata;
2380	kn->kn_ext[EV_EXTIDX_WL_VALUE] = udata;
2381
2382	if ((udata & mask) != (kdata & mask)) {
2383	error = ESTALE;
2384	goto out;
2385	}
2386	}
2387
2388	if (op == FILT_WLATTACH) {
2389	error = filt_wlattach_sync_ipc(kn);
2390	if (error == `0`) {
2391	disable_preemption();
2392	error = EPREEMPTDISABLED;
2393	}
2394	}
2395
2396	out:
2397	filt_wlunlock(kqwl);
2398	return error;
2399	}
2400
2401	static int
2402	filt_wlattach(struct knote kn, struct* kevent_qos_s *kev)
2403	{
2404	struct kqueue *kq = knote_get_kq(kn);
2405	struct kqworkloop kqwl = (struct* kqworkloop *)kq;
2406	int error = `0`, result = `0`;
2407	kq_index_t qos_index = `0`;
2408
2409	if (__improbable((kq->kq_state & KQ_WORKLOOP) == `0`)) {
2410	error = ENOTSUP;
2411	goto out;
2412	}
2413
2414	uint32_t command = (kn->kn_sfflags & NOTE_WL_COMMANDS_MASK);
2415	switch (command) {
2416	case NOTE_WL_THREAD_REQUEST:
2417	if (kn->kn_id != kqwl->kqwl_dynamicid) {
2418	error = EINVAL;
2419	goto out;
2420	}
2421	qos_index = _pthread_priority_thread_qos(pp: kn->kn_qos);
2422	if (qos_index == THREAD_QOS_UNSPECIFIED) {
2423	error = ERANGE;
2424	goto out;
2425	}
2426	if (kqwl->kqwl_request.tr_kq_qos_index) {
2427	/*
2428	* There already is a thread request, and well, you're only allowed
2429	* one per workloop, so fail the attach.
2430	*/
2431	error = EALREADY;
2432	goto out;
2433	}
2434	break;
2435	case NOTE_WL_SYNC_WAIT:
2436	case NOTE_WL_SYNC_WAKE:
2437	if (kn->kn_id == kqwl->kqwl_dynamicid) {
2438	error = EINVAL;
2439	goto out;
2440	}
2441	if ((kn->kn_flags & EV_DISABLE) == `0`) {
2442	error = EINVAL;
2443	goto out;
2444	}
2445	if (kn->kn_sfflags & NOTE_WL_END_OWNERSHIP) {
2446	error = EINVAL;
2447	goto out;
2448	}
2449	break;
2450
2451	case NOTE_WL_SYNC_IPC:
2452	if ((kn->kn_flags & EV_DISABLE) == `0`) {
2453	error = EINVAL;
2454	goto out;
2455	}
2456	if (kn->kn_sfflags & (NOTE_WL_UPDATE_QOS \| NOTE_WL_DISCOVER_OWNER)) {
2457	error = EINVAL;
2458	goto out;
2459	}
2460	break;
2461	default:
2462	error = EINVAL;
2463	goto out;
2464	}
2465
2466	if (command == NOTE_WL_SYNC_IPC) {
2467	error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLATTACH);
2468	} else {
2469	error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLATTACH);
2470	}
2471
2472	if (error == EPREEMPTDISABLED) {
2473	error = `0`;
2474	result = FILTER_THREADREQ_NODEFEER;
2475	}
2476	out:
2477	if (error) {
2478	/ If userland wants ESTALE to be hidden, fail the attach anyway /
2479	if (error == ESTALE && (kn->kn_sfflags & NOTE_WL_IGNORE_ESTALE)) {
2480	error = `0`;
2481	}
2482	knote_set_error(kn, error);
2483	return result;
2484	}
2485	if (command == NOTE_WL_SYNC_WAIT) {
2486	return kevent_register_wait_prepare(kn, kev, result);
2487	}
2488	/ Just attaching the thread request successfully will fire it /
2489	if (command == NOTE_WL_THREAD_REQUEST) {
2490	/*
2491	* Thread Request knotes need an explicit touch to be active again,
2492	* so delivering an event needs to also consume it.
2493	*/
2494	kn->kn_flags \|= EV_CLEAR;
2495	return result \| FILTER_ACTIVE;
2496	}
2497	return result;
2498	}
2499
2500	static void __dead2
2501	filt_wlwait_continue(void *parameter, wait_result_t wr)
2502	{
2503	struct _kevent_register *cont_args = parameter;
2504	struct kqworkloop *kqwl = cont_args->kqwl;
2505
2506	kqlock(kqu: kqwl);
2507	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
2508	workq_kern_threadreq_lock(p: kqwl->kqwl_p);
2509	turnstile_complete(proprietor: (uintptr_t)kqwl, tstore: &kqwl->kqwl_turnstile, NULL, type: TURNSTILE_WORKLOOPS);
2510	workq_kern_threadreq_unlock(p: kqwl->kqwl_p);
2511	} else {
2512	turnstile_complete(proprietor: (uintptr_t)kqwl, tstore: &kqwl->kqwl_turnstile, NULL, type: TURNSTILE_WORKLOOPS);
2513	}
2514	kqunlock(kqu: kqwl);
2515
2516	turnstile_cleanup();
2517
2518	if (wr == THREAD_INTERRUPTED) {
2519	cont_args->kev.flags \|= EV_ERROR;
2520	cont_args->kev.data = EINTR;
2521	} else if (wr != THREAD_AWAKENED) {
2522	panic("Unexpected wait result: %d", wr);
2523	}
2524
2525	kevent_register_wait_return(cont_args);
2526	}
2527
2528	/*
2529	* Called with the workloop mutex held, most of the time never returns as it
2530	* calls filt_wlwait_continue through a continuation.
2531	*/
2532	static void __dead2
2533	filt_wlpost_register_wait(struct uthread uth, struct* knote *kn,
2534	struct _kevent_register *cont_args)
2535	{
2536	struct kqworkloop *kqwl = cont_args->kqwl;
2537	workq_threadreq_t kqr = &kqwl->kqwl_request;
2538	struct turnstile *ts;
2539	bool workq_locked = false;
2540
2541	kqlock_held(kqu: kqwl);
2542
2543	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
2544	workq_kern_threadreq_lock(p: kqwl->kqwl_p);
2545	workq_locked = true;
2546	}
2547
2548	ts = turnstile_prepare(proprietor: (uintptr_t)kqwl, tstore: &kqwl->kqwl_turnstile,
2549	TURNSTILE_NULL, type: TURNSTILE_WORKLOOPS);
2550
2551	if (workq_locked) {
2552	workq_kern_threadreq_update_inheritor(p: kqwl->kqwl_p,
2553	kqr: &kqwl->kqwl_request, owner: kqwl->kqwl_owner, ts,
2554	flags: TURNSTILE_DELAYED_UPDATE);
2555	if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
2556	/*
2557	* if the interlock is no longer the workqueue lock,
2558	* then we don't need to hold it anymore.
2559	*/
2560	workq_kern_threadreq_unlock(p: kqwl->kqwl_p);
2561	workq_locked = false;
2562	}
2563	}
2564	if (!workq_locked) {
2565	/*
2566	* If the interlock is the workloop's, then it's our responsibility to
2567	* call update_inheritor, so just do it.
2568	*/
2569	filt_wlupdate_inheritor(kqwl, ts, flags: TURNSTILE_DELAYED_UPDATE);
2570	}
2571
2572	thread_set_pending_block_hint(thread: get_machthread(uth), block_hint: kThreadWaitWorkloopSyncWait);
2573	waitq_assert_wait64(waitq: &ts->ts_waitq, wait_event: knote_filt_wev64(kn),
2574	THREAD_ABORTSAFE, TIMEOUT_WAIT_FOREVER);
2575
2576	if (workq_locked) {
2577	workq_kern_threadreq_unlock(p: kqwl->kqwl_p);
2578	}
2579
2580	thread_t thread = kqwl->kqwl_owner ?: kqr_thread(kqr);
2581	if (thread) {
2582	thread_reference(thread);
2583	}
2584
2585	kevent_register_wait_block(ts, handoff_thread: thread, cont: filt_wlwait_continue, cont_args);
2586	}
2587
2588	/ called in stackshot context to report the thread responsible for blocking this thread /
2589	void
2590	kdp_workloop_sync_wait_find_owner(__assert_only thread_t thread,
2591	event64_t event, thread_waitinfo_t *waitinfo)
2592	{
2593	struct knote kn = (struct* knote *)event;
2594
2595	zone_require(zone: knote_zone, addr: kn);
2596
2597	assert(kn->kn_thread == thread);
2598
2599	struct kqueue *kq = knote_get_kq(kn);
2600
2601	zone_require(zone: kqworkloop_zone, addr: kq);
2602	assert(kq->kq_state & KQ_WORKLOOP);
2603
2604	struct kqworkloop kqwl = (struct* kqworkloop *)kq;
2605	workq_threadreq_t kqr = &kqwl->kqwl_request;
2606
2607	thread_t kqwl_owner = kqwl->kqwl_owner;
2608
2609	if (kqwl_owner != THREAD_NULL) {
2610	thread_require(thread: kqwl_owner);
2611	waitinfo->owner = thread_tid(thread: kqwl->kqwl_owner);
2612	} else if ((kqr->tr_state >= WORKQ_TR_STATE_BINDING) && (kqr->tr_thread != NULL)) {
2613	thread_require(thread: kqr->tr_thread);
2614	waitinfo->owner = thread_tid(thread: kqr->tr_thread);
2615	} else if (kqr_thread_requested_pending(kqr)) { / > idle, < bound /
2616	waitinfo->owner = STACKSHOT_WAITOWNER_THREQUESTED;
2617	} else {
2618	waitinfo->owner = `0`;
2619	}
2620
2621	waitinfo->context = kqwl->kqwl_dynamicid;
2622	}
2623
2624	static void
2625	filt_wldetach(struct knote *kn)
2626	{
2627	if (kn->kn_sfflags & NOTE_WL_SYNC_IPC) {
2628	filt_wldetach_sync_ipc(kn);
2629	} else if (kn->kn_thread) {
2630	kevent_register_wait_cleanup(kn);
2631	}
2632	}
2633
2634	static int
2635	filt_wlvalidate_kev_flags(struct knote kn, struct* kevent_qos_s *kev,
2636	thread_qos_t *qos_index)
2637	{
2638	uint32_t new_commands = kev->fflags & NOTE_WL_COMMANDS_MASK;
2639	uint32_t sav_commands = kn->kn_sfflags & NOTE_WL_COMMANDS_MASK;
2640
2641	if ((kev->fflags & NOTE_WL_DISCOVER_OWNER) && (kev->flags & EV_DELETE)) {
2642	return EINVAL;
2643	}
2644	if (kev->fflags & NOTE_WL_UPDATE_QOS) {
2645	if (kev->flags & EV_DELETE) {
2646	return EINVAL;
2647	}
2648	if (sav_commands != NOTE_WL_THREAD_REQUEST) {
2649	return EINVAL;
2650	}
2651	if (!(*qos_index = _pthread_priority_thread_qos(pp: kev->qos))) {
2652	return ERANGE;
2653	}
2654	}
2655
2656	switch (new_commands) {
2657	case NOTE_WL_THREAD_REQUEST:
2658	/ thread requests can only update themselves /
2659	if (sav_commands != NOTE_WL_THREAD_REQUEST) {
2660	return EINVAL;
2661	}
2662	break;
2663
2664	case NOTE_WL_SYNC_WAIT:
2665	if (kev->fflags & NOTE_WL_END_OWNERSHIP) {
2666	return EINVAL;
2667	}
2668	goto sync_checks;
2669
2670	case NOTE_WL_SYNC_WAKE:
2671	sync_checks:
2672	if (!(sav_commands & (NOTE_WL_SYNC_WAIT \| NOTE_WL_SYNC_WAKE))) {
2673	return EINVAL;
2674	}
2675	if ((kev->flags & (EV_ENABLE \| EV_DELETE)) == EV_ENABLE) {
2676	return EINVAL;
2677	}
2678	break;
2679
2680	case NOTE_WL_SYNC_IPC:
2681	if (sav_commands != NOTE_WL_SYNC_IPC) {
2682	return EINVAL;
2683	}
2684	if ((kev->flags & (EV_ENABLE \| EV_DELETE)) == EV_ENABLE) {
2685	return EINVAL;
2686	}
2687	break;
2688
2689	default:
2690	return EINVAL;
2691	}
2692	return `0`;
2693	}
2694
2695	static int
2696	filt_wltouch(struct knote kn, struct* kevent_qos_s *kev)
2697	{
2698	struct kqworkloop kqwl = (struct* kqworkloop *)knote_get_kq(kn);
2699	thread_qos_t qos_index = THREAD_QOS_UNSPECIFIED;
2700	int result = `0`;
2701
2702	int error = filt_wlvalidate_kev_flags(kn, kev, qos_index: &qos_index);
2703	if (error) {
2704	goto out;
2705	}
2706
2707	uint32_t command = kev->fflags & NOTE_WL_COMMANDS_MASK;
2708	if (command == NOTE_WL_SYNC_IPC) {
2709	error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLTOUCH);
2710	} else {
2711	error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLTOUCH);
2712	filt_wlremember_last_update(kn, kev, error);
2713	}
2714	if (error == EPREEMPTDISABLED) {
2715	error = `0`;
2716	result = FILTER_THREADREQ_NODEFEER;
2717	}
2718
2719	out:
2720	if (error) {
2721	if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
2722	/ If userland wants ESTALE to be hidden, do not activate /
2723	return result;
2724	}
2725	kev->flags \|= EV_ERROR;
2726	kev->data = error;
2727	return result;
2728	}
2729	if (command == NOTE_WL_SYNC_WAIT && !(kn->kn_sfflags & NOTE_WL_SYNC_WAKE)) {
2730	return kevent_register_wait_prepare(kn, kev, result);
2731	}
2732	/ Just touching the thread request successfully will fire it /
2733	if (command == NOTE_WL_THREAD_REQUEST) {
2734	if (kev->fflags & NOTE_WL_UPDATE_QOS) {
2735	result \|= FILTER_UPDATE_REQ_QOS;
2736	}
2737	result \|= FILTER_ACTIVE;
2738	}
2739	return result;
2740	}
2741
2742	static bool
2743	filt_wlallow_drop(struct knote kn, struct* kevent_qos_s *kev)
2744	{
2745	struct kqworkloop kqwl = (struct* kqworkloop *)knote_get_kq(kn);
2746
2747	int error = filt_wlvalidate_kev_flags(kn, kev, NULL);
2748	if (error) {
2749	goto out;
2750	}
2751
2752	uint32_t command = (kev->fflags & NOTE_WL_COMMANDS_MASK);
2753	if (command == NOTE_WL_SYNC_IPC) {
2754	error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLDROP);
2755	} else {
2756	error = filt_wlupdate(kqwl, kn, kev, qos_index: `0`, FILT_WLDROP);
2757	filt_wlremember_last_update(kn, kev, error);
2758	}
2759	assert(error != EPREEMPTDISABLED);
2760
2761	out:
2762	if (error) {
2763	if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
2764	return false;
2765	}
2766	kev->flags \|= EV_ERROR;
2767	kev->data = error;
2768	return false;
2769	}
2770	return true;
2771	}
2772
2773	static int
2774	filt_wlprocess(struct knote kn, struct* kevent_qos_s *kev)
2775	{
2776	struct kqworkloop kqwl = (struct* kqworkloop *)knote_get_kq(kn);
2777	int rc = `0`;
2778
2779	assert(kn->kn_sfflags & NOTE_WL_THREAD_REQUEST);
2780
2781	kqlock(kqu: kqwl);
2782
2783	if (kqwl->kqwl_owner) {
2784	/*
2785	* <rdar://problem/33584321> userspace sometimes due to events being
2786	* delivered but not triggering a drain session can cause a process
2787	* of the thread request knote.
2788	*
2789	* When that happens, the automatic deactivation due to process
2790	* would swallow the event, so we have to activate the knote again.
2791	*/
2792	knote_activate(kqu: kqwl, kn, FILTER_ACTIVE);
2793	} else {
2794	#if DEBUG \|\| DEVELOPMENT
2795	if (kevent_debug_flags & KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS) {
2796	/*
2797	* see src/queue_internal.h in libdispatch
2798	*/
2799	#define DISPATCH_QUEUE_ENQUEUED 0x1ull
2800	user_addr_t addr = CAST_USER_ADDR_T(kn->kn_ext[EV_EXTIDX_WL_ADDR]);
2801	task_t t = current_task();
2802	uint64_t val;
2803	if (addr && task_is_active(t) && !task_is_halting(t) &&
2804	copyin_atomic64(addr, &val) == `0` &&
2805	val && (val & DISPATCH_QUEUE_ENQUEUED) == `0` &&
2806	(val >> `48`) != `0xdead` && (val >> `48`) != `0` && (val >> `48`) != `0xffff`) {
2807	panic("kevent: workloop %#016llx is not enqueued "
2808	"(kn:%p dq_state:%#016llx kev.dq_state:%#016llx)",
2809	kn->kn_udata, kn, val, kn->kn_ext[EV_EXTIDX_WL_VALUE]);
2810	}
2811	}
2812	#endif
2813	knote_fill_kevent(kn, kev, data: `0`);
2814	kev->fflags = kn->kn_sfflags;
2815	rc \|= FILTER_ACTIVE;
2816	}
2817
2818	kqunlock(kqu: kqwl);
2819
2820	if (rc & FILTER_ACTIVE) {
2821	workq_thread_set_max_qos(p: kqwl->kqwl_p, kqr: &kqwl->kqwl_request);
2822	}
2823	return rc;
2824	}
2825
2826	SECURITY_READ_ONLY_EARLY(static struct filterops) workloop_filtops = {
2827	.f_extended_codes = true,
2828	.f_attach = filt_wlattach,
2829	.f_detach = filt_wldetach,
2830	.f_event = filt_bad_event,
2831	.f_touch = filt_wltouch,
2832	.f_process = filt_wlprocess,
2833	.f_allow_drop = filt_wlallow_drop,
2834	.f_post_register_wait = filt_wlpost_register_wait,
2835	};
2836
2837	#pragma mark - kqueues allocation and deallocation
2838
2839	OS_NOINLINE
2840	static void
2841	kqworkloop_dealloc(struct kqworkloop *, bool hash_remove);
2842
2843	static inline bool
2844	kqworkloop_try_retain(struct kqworkloop *kqwl)
2845	{
2846	return os_ref_retain_try_raw(&kqwl->kqwl_retains, NULL);
2847	}
2848
2849	static inline void
2850	kqworkloop_retain(struct kqworkloop *kqwl)
2851	{
2852	return os_ref_retain_raw(&kqwl->kqwl_retains, NULL);
2853	}
2854
2855	OS_ALWAYS_INLINE
2856	static inline void
2857	kqueue_retain(kqueue_t kqu)
2858	{
2859	if (kqu.kq->kq_state & KQ_DYNAMIC) {
2860	kqworkloop_retain(kqwl: kqu.kqwl);
2861	}
2862	}
2863
2864	OS_ALWAYS_INLINE
2865	static inline void
2866	kqworkloop_release_live(struct kqworkloop *kqwl)
2867	{
2868	os_ref_release_live_raw(&kqwl->kqwl_retains, NULL);
2869	}
2870
2871	OS_ALWAYS_INLINE
2872	static inline void
2873	kqueue_release_live(kqueue_t kqu)
2874	{
2875	if (kqu.kq->kq_state & KQ_DYNAMIC) {
2876	kqworkloop_release_live(kqwl: kqu.kqwl);
2877	}
2878	}
2879
2880	OS_ALWAYS_INLINE
2881	static inline void
2882	kqworkloop_release(struct kqworkloop *kqwl)
2883	{
2884	if (os_ref_release_raw(&kqwl->kqwl_retains, NULL) == `0`) {
2885	kqworkloop_dealloc(kqwl, true);
2886	}
2887	}
2888
2889	OS_ALWAYS_INLINE
2890	static inline void
2891	kqueue_release(kqueue_t kqu)
2892	{
2893	if (kqu.kq->kq_state & KQ_DYNAMIC) {
2894	kqworkloop_release(kqwl: kqu.kqwl);
2895	}
2896	}
2897
2898	/!*
2899	* @function kqueue_destroy
2900	*
2901	* @brief
2902	* Common part to all kqueue dealloc functions.
2903	*/
2904	OS_NOINLINE
2905	static void
2906	kqueue_destroy(kqueue_t kqu, zone_t zone)
2907	{
2908	lck_spin_destroy(lck: &kqu.kq->kq_lock, grp: &kq_lck_grp);
2909
2910	zfree(zone, kqu.kq);
2911	}
2912
2913	/!*
2914	* @function kqueue_init
2915	*
2916	* @brief
2917	* Common part to all kqueue alloc functions.
2918	*/
2919	static kqueue_t
2920	kqueue_init(kqueue_t kqu)
2921	{
2922	lck_spin_init(lck: &kqu.kq->kq_lock, grp: &kq_lck_grp, LCK_ATTR_NULL);
2923	return kqu;
2924	}
2925
2926	#pragma mark kqfile allocation and deallocation
2927
2928	/!*
2929	* @function kqueue_dealloc
2930	*
2931	* @brief
2932	* Detach all knotes from a kqfile and free it.
2933	*
2934	* @discussion
2935	* We walk each list looking for knotes referencing this
2936	* this kqueue. If we find one, we try to drop it. But
2937	* if we fail to get a drop reference, that will wait
2938	* until it is dropped. So, we can just restart again
2939	* safe in the assumption that the list will eventually
2940	* not contain any more references to this kqueue (either
2941	* we dropped them all, or someone else did).
2942	*
2943	* Assumes no new events are being added to the kqueue.
2944	* Nothing locked on entry or exit.
2945	*/
2946	void
2947	kqueue_dealloc(struct kqueue *kq)
2948	{
2949	KNOTE_LOCK_CTX(knlc);
2950	struct proc *p = kq->kq_p;
2951	struct filedesc *fdp = &p->p_fd;
2952	struct knote *kn;
2953
2954	assert(kq && (kq->kq_state & (KQ_WORKLOOP \| KQ_WORKQ)) == `0`);
2955
2956	proc_fdlock(p);
2957	for (int i = `0`; i < fdp->fd_knlistsize; i++) {
2958	kn = SLIST_FIRST(&fdp->fd_knlist[i]);
2959	while (kn != NULL) {
2960	if (kq == knote_get_kq(kn)) {
2961	kqlock(kqu: kq);
2962	proc_fdunlock(p);
2963	if (knote_lock(kqu: kq, kn, knlc: &knlc, kqlocking: KNOTE_KQ_LOCK_ON_SUCCESS)) {
2964	knote_drop(kqu: kq, kn, knlc: &knlc);
2965	}
2966	proc_fdlock(p);
2967	/ start over at beginning of list /
2968	kn = SLIST_FIRST(&fdp->fd_knlist[i]);
2969	continue;
2970	}
2971	kn = SLIST_NEXT(kn, kn_link);
2972	}
2973	}
2974
2975	knhash_lock(fdp);
2976	proc_fdunlock(p);
2977
2978	if (fdp->fd_knhashmask != `0`) {
2979	for (int i = `0`; i < (int)fdp->fd_knhashmask + `1`; i++) {
2980	kn = SLIST_FIRST(&fdp->fd_knhash[i]);
2981	while (kn != NULL) {
2982	if (kq == knote_get_kq(kn)) {
2983	kqlock(kqu: kq);
2984	knhash_unlock(fdp);
2985	if (knote_lock(kqu: kq, kn, knlc: &knlc, kqlocking: KNOTE_KQ_LOCK_ON_SUCCESS)) {
2986	knote_drop(kqu: kq, kn, knlc: &knlc);
2987	}
2988	knhash_lock(fdp);
2989	/ start over at beginning of list /
2990	kn = SLIST_FIRST(&fdp->fd_knhash[i]);
2991	continue;
2992	}
2993	kn = SLIST_NEXT(kn, kn_link);
2994	}
2995	}
2996	}
2997	knhash_unlock(fdp);
2998
2999	kqueue_destroy(kqu: kq, zone: kqfile_zone);
3000	}
3001
3002	/!*
3003	* @function kqueue_alloc
3004	*
3005	* @brief
3006	* Allocate a kqfile.
3007	*/
3008	struct kqueue *
3009	kqueue_alloc(struct proc *p)
3010	{
3011	struct kqfile *kqf;
3012
3013	/*
3014	* kqfiles are created with kqueue() so we need to wait for
3015	* the first kevent syscall to know which bit among
3016	* KQ_KEV_{32,64,QOS} will be set in kqf_state
3017	*/
3018	kqf = zalloc_flags(kqfile_zone, Z_WAITOK \| Z_ZERO);
3019	kqf->kqf_p = p;
3020	TAILQ_INIT_AFTER_BZERO(&kqf->kqf_queue);
3021	TAILQ_INIT_AFTER_BZERO(&kqf->kqf_suppressed);
3022
3023	return kqueue_init(kqu: kqf).kq;
3024	}
3025
3026	/!*
3027	* @function kqueue_internal
3028	*
3029	* @brief
3030	* Core implementation for kqueue and guarded_kqueue_np()
3031	*/
3032	int
3033	kqueue_internal(struct proc p, fp_initfn_t fp_init, void* initarg, int32_t retval)
3034	{
3035	struct kqueue *kq;
3036	struct fileproc *fp;
3037	int fd, error;
3038
3039	error = falloc_withinit(p, p_cred: current_cached_proc_cred(p),
3040	ctx: vfs_context_current(), resultfp: &fp, resultfd: &fd, fp_init, initarg);
3041	if (error) {
3042	return error;
3043	}
3044
3045	kq = kqueue_alloc(p);
3046	if (kq == NULL) {
3047	fp_free(p, fd, fp);
3048	return ENOMEM;
3049	}
3050
3051	fp->fp_flags \|= FP_CLOEXEC \| FP_CLOFORK;
3052	fp->f_flag = FREAD \| FWRITE;
3053	fp->f_ops = &kqueueops;
3054	fp_set_data(fp, fg_data: kq);
3055	fp->f_lflags \|= FG_CONFINED;
3056
3057	proc_fdlock(p);
3058	procfdtbl_releasefd(p, fd, NULL);
3059	fp_drop(p, fd, fp, locked: `1`);
3060	proc_fdunlock(p);
3061
3062	*retval = fd;
3063	return error;
3064	}
3065
3066	/!*
3067	* @function kqueue
3068	*
3069	* @brief
3070	* The kqueue syscall.
3071	*/
3072	int
3073	kqueue(struct proc p, __unused struct* kqueue_args uap, int32_t retval)
3074	{
3075	return kqueue_internal(p, NULL, NULL, retval);
3076	}
3077
3078	#pragma mark kqworkq allocation and deallocation
3079
3080	/!*
3081	* @function kqworkq_dealloc
3082	*
3083	* @brief
3084	* Deallocates a workqueue kqueue.
3085	*
3086	* @discussion
3087	* This only happens at process death, or for races with concurrent
3088	* kevent_get_kqwq calls, hence we don't have to care about knotes referencing
3089	* this kqueue, either there are none, or someone else took care of them.
3090	*/
3091	void
3092	kqworkq_dealloc(struct kqworkq *kqwq)
3093	{
3094	kqueue_destroy(kqu: kqwq, zone: kqworkq_zone);
3095	}
3096
3097	/!*
3098	* @function kqworkq_alloc
3099	*
3100	* @brief
3101	* Allocates a workqueue kqueue.
3102	*
3103	* @discussion
3104	* This is the slow path of kevent_get_kqwq.
3105	* This takes care of making sure procs have a single workq kqueue.
3106	*/
3107	OS_NOINLINE
3108	static struct kqworkq *
3109	kqworkq_alloc(struct proc p, unsigned* int flags)
3110	{
3111	struct kqworkq kqwq, tmp;
3112
3113	kqwq = zalloc_flags(kqworkq_zone, Z_WAITOK \| Z_ZERO);
3114
3115	assert((flags & KEVENT_FLAG_LEGACY32) == `0`);
3116	if (flags & KEVENT_FLAG_LEGACY64) {
3117	kqwq->kqwq_state = KQ_WORKQ \| KQ_KEV64;
3118	} else {
3119	kqwq->kqwq_state = KQ_WORKQ \| KQ_KEV_QOS;
3120	}
3121	kqwq->kqwq_p = p;
3122
3123	for (int i = `0`; i < KQWQ_NBUCKETS; i++) {
3124	TAILQ_INIT_AFTER_BZERO(&kqwq->kqwq_queue[i]);
3125	TAILQ_INIT_AFTER_BZERO(&kqwq->kqwq_suppressed[i]);
3126	}
3127	for (int i = `0`; i < KQWQ_NBUCKETS; i++) {
3128	/*
3129	* Because of how the bucketized system works, we mix overcommit
3130	* sources with not overcommit: each time we move a knote from
3131	* one bucket to the next due to overrides, we'd had to track
3132	* overcommitness, and it's really not worth it in the workloop
3133	* enabled world that track this faithfully.
3134	*
3135	* Incidentally, this behaves like the original manager-based
3136	* kqwq where event delivery always happened (hence is
3137	* "overcommit")
3138	*/
3139	kqwq->kqwq_request[i].tr_state = WORKQ_TR_STATE_IDLE;
3140	kqwq->kqwq_request[i].tr_flags = WORKQ_TR_FLAG_KEVENT;
3141	if (i != KQWQ_QOS_MANAGER) {
3142	kqwq->kqwq_request[i].tr_flags \|= WORKQ_TR_FLAG_OVERCOMMIT;
3143	}
3144	kqwq->kqwq_request[i].tr_kq_qos_index = (kq_index_t)i + `1`;
3145	}
3146
3147	kqueue_init(kqu: kqwq);
3148
3149	if (!os_atomic_cmpxchgv(&p->p_fd.fd_wqkqueue, NULL, kqwq, &tmp, release)) {
3150	kqworkq_dealloc(kqwq);
3151	return tmp;
3152	}
3153
3154	return kqwq;
3155	}
3156
3157	#pragma mark kqworkloop allocation and deallocation
3158
3159	#define KQ_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
3160	#define CONFIG_KQ_HASHSIZE CONFIG_KN_HASHSIZE
3161
3162	OS_ALWAYS_INLINE
3163	static inline void
3164	kqhash_lock(struct filedesc *fdp)
3165	{
3166	lck_mtx_lock_spin_always(lck: &fdp->fd_kqhashlock);
3167	}
3168
3169	OS_ALWAYS_INLINE
3170	static inline void
3171	kqhash_unlock(struct filedesc *fdp)
3172	{
3173	lck_mtx_unlock(lck: &fdp->fd_kqhashlock);
3174	}
3175
3176	OS_ALWAYS_INLINE
3177	static inline void
3178	kqworkloop_hash_insert_locked(struct filedesc *fdp, kqueue_id_t id,
3179	struct kqworkloop *kqwl)
3180	{
3181	struct kqwllist *list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
3182	LIST_INSERT_HEAD(list, kqwl, kqwl_hashlink);
3183	}
3184
3185	OS_ALWAYS_INLINE
3186	static inline struct kqworkloop *
3187	kqworkloop_hash_lookup_locked(struct filedesc *fdp, kqueue_id_t id)
3188	{
3189	struct kqwllist *list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
3190	struct kqworkloop *kqwl;
3191
3192	LIST_FOREACH(kqwl, list, kqwl_hashlink) {
3193	if (kqwl->kqwl_dynamicid == id) {
3194	return kqwl;
3195	}
3196	}
3197	return NULL;
3198	}
3199
3200	static struct kqworkloop *
3201	kqworkloop_hash_lookup_and_retain(struct filedesc *fdp, kqueue_id_t kq_id)
3202	{
3203	struct kqworkloop *kqwl = NULL;
3204
3205	kqhash_lock(fdp);
3206	if (__probable(fdp->fd_kqhash)) {
3207	kqwl = kqworkloop_hash_lookup_locked(fdp, id: kq_id);
3208	if (kqwl && !kqworkloop_try_retain(kqwl)) {
3209	kqwl = NULL;
3210	}
3211	}
3212	kqhash_unlock(fdp);
3213	return kqwl;
3214	}
3215
3216	OS_NOINLINE
3217	static void
3218	kqworkloop_hash_init(struct filedesc *fdp)
3219	{
3220	struct kqwllist *alloc_hash;
3221	u_long alloc_mask;
3222
3223	kqhash_unlock(fdp);
3224	alloc_hash = hashinit(CONFIG_KQ_HASHSIZE, M_KQUEUE, hashmask: &alloc_mask);
3225	kqhash_lock(fdp);
3226
3227	/ See if we won the race /
3228	if (__probable(fdp->fd_kqhashmask == `0`)) {
3229	fdp->fd_kqhash = alloc_hash;
3230	fdp->fd_kqhashmask = alloc_mask;
3231	} else {
3232	kqhash_unlock(fdp);
3233	hashdestroy(alloc_hash, M_KQUEUE, hashmask: alloc_mask);
3234	kqhash_lock(fdp);
3235	}
3236	}
3237
3238	/*
3239	* kqueue iotier override is only supported for kqueue that has
3240	* only one port as a mach port source. Updating the iotier
3241	* override on the mach port source will update the override
3242	* on kqueue as well. Since kqueue with iotier override will
3243	* only have one port attached, there is no logic for saturation
3244	* like qos override, the iotier override of mach port source
3245	* would be reflected in kevent iotier override.
3246	*/
3247	void
3248	kqueue_set_iotier_override(kqueue_t kqu, uint8_t iotier_override)
3249	{
3250	if (!(kqu.kq->kq_state & KQ_WORKLOOP)) {
3251	return;
3252	}
3253
3254	struct kqworkloop *kqwl = kqu.kqwl;
3255	os_atomic_store(&kqwl->kqwl_iotier_override, iotier_override, relaxed);
3256	}
3257
3258	uint8_t
3259	kqueue_get_iotier_override(kqueue_t kqu)
3260	{
3261	if (!(kqu.kq->kq_state & KQ_WORKLOOP)) {
3262	return THROTTLE_LEVEL_END;
3263	}
3264
3265	struct kqworkloop *kqwl = kqu.kqwl;
3266	return os_atomic_load(&kqwl->kqwl_iotier_override, relaxed);
3267	}
3268
3269	#if CONFIG_PREADOPT_TG
3270	/*
3271	* This function is called with a borrowed reference on the thread group without
3272	* kq lock held with the mqueue lock held. It may or may not have the knote lock
3273	* (called from both fevent as well as fattach/ftouch). Upon success, an
3274	* additional reference on the TG is taken
3275	*/
3276	void
3277	kqueue_set_preadopted_thread_group(kqueue_t kqu, struct thread_group *tg, thread_qos_t qos)
3278	{
3279	if (!(kqu.kq->kq_state & KQ_WORKLOOP)) {
3280	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_THREAD_GROUP, MACH_THREAD_GROUP_PREADOPT_NA),
3281	(uintptr_t)thread_tid(current_thread()), `0`, `0`, `0`);
3282	return;
3283	}
3284
3285	struct kqworkloop *kqwl = kqu.kqwl;
3286
3287	assert(qos < THREAD_QOS_LAST);
3288
3289	thread_group_retain(tg);
3290
3291	thread_group_qos_t old_tg; thread_group_qos_t new_tg;
3292	int ret = os_atomic_rmw_loop(&kqwl->kqwl_preadopt_tg, old_tg, new_tg, relaxed, {
3293	if (!KQWL_CAN_ADOPT_PREADOPT_TG(old_tg)) {
3294	os_atomic_rmw_loop_give_up(break);
3295	}
3296
3297	if (old_tg != KQWL_PREADOPTED_TG_NULL) {
3298	/*
3299	* Note that old_tg could be a NULL TG pointer but with a QoS
3300	* set. See also workq_thread_reset_pri.
3301	*
3302	* Compare the QoS of existing preadopted tg with new one and
3303	* only overwrite the thread group if we have one with a higher
3304	* QoS.
3305	*/
3306	thread_qos_t existing_qos = KQWL_GET_PREADOPTED_TG_QOS(old_tg);
3307	if (existing_qos >= qos) {
3308	os_atomic_rmw_loop_give_up(break);
3309	}
3310	}
3311
3312	// Transfer the ref taken earlier in the function to the kqwl
3313	new_tg = KQWL_ENCODE_PREADOPTED_TG_QOS(tg, qos);
3314	});
3315
3316	if (ret) {
3317	KQWL_PREADOPT_TG_HISTORY_WRITE_ENTRY(kqwl, KQWL_PREADOPT_OP_INCOMING_IPC, old_tg, tg);
3318
3319	if (KQWL_HAS_VALID_PREADOPTED_TG(old_tg)) {
3320	thread_group_deallocate_safe(KQWL_GET_PREADOPTED_TG(old_tg));
3321	}
3322
3323	os_atomic_store(&kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_NEEDS_REDRIVE, release);
3324	} else {
3325	// We failed to write to the kqwl_preadopt_tg, drop the ref we took
3326	// earlier in the function
3327	thread_group_deallocate_safe(tg);
3328	}
3329	}
3330
3331	/*
3332	* Called from fprocess of EVFILT_MACHPORT without the kqueue lock held.
3333	*/
3334	bool
3335	kqueue_process_preadopt_thread_group(thread_t thread, struct kqueue kq, struct* thread_group *tg)
3336	{
3337	bool success = false;
3338	if (kq->kq_state & KQ_WORKLOOP) {
3339	struct kqworkloop kqwl = (struct* kqworkloop *) kq;
3340	thread_group_qos_t old_tg;
3341	success = os_atomic_cmpxchgv(&kqwl->kqwl_preadopt_tg,
3342	KQWL_PREADOPTED_TG_SENTINEL, KQWL_PREADOPTED_TG_PROCESSED,
3343	&old_tg, relaxed);
3344	if (success) {
3345	thread_set_preadopt_thread_group(t: thread, tg);
3346	} else if (KQWL_HAS_PERMANENT_PREADOPTED_TG(old_tg)) {
3347	/*
3348	* Technically the following set_preadopt should be a no-op since this
3349	* servicer thread preadopts kqwl's permanent tg at bind time.
3350	* See kqueue_threadreq_bind.
3351	*/
3352	thread_set_preadopt_thread_group(t: thread, KQWL_GET_PREADOPTED_TG(old_tg));
3353	} else {
3354	assert(old_tg == KQWL_PREADOPTED_TG_PROCESSED \|\|
3355	old_tg == KQWL_PREADOPTED_TG_NEVER);
3356	}
3357	}
3358	return success;
3359	}
3360	#endif
3361
3362	/!*
3363	* @function kqworkloop_dealloc
3364	*
3365	* @brief
3366	* Deallocates a workloop kqueue.
3367	*
3368	* @discussion
3369	* Knotes hold references on the workloop, so we can't really reach this
3370	* function unless all of these are already gone.
3371	*
3372	* Nothing locked on entry or exit.
3373	*
3374	* @param hash_remove
3375	* Whether to remove the workloop from its hash table.
3376	*/
3377	static void
3378	kqworkloop_dealloc(struct kqworkloop *kqwl, bool hash_remove)
3379	{
3380	thread_t cur_owner;
3381
3382	cur_owner = kqwl->kqwl_owner;
3383	if (cur_owner) {
3384	if (kqworkloop_override(kqwl) != THREAD_QOS_UNSPECIFIED) {
3385	thread_drop_kevent_override(thread: cur_owner);
3386	}
3387	thread_deallocate(thread: cur_owner);
3388	kqwl->kqwl_owner = THREAD_NULL;
3389	}
3390
3391	if (kqwl->kqwl_state & KQ_HAS_TURNSTILE) {
3392	struct turnstile *ts;
3393	turnstile_complete(proprietor: (uintptr_t)kqwl, tstore: &kqwl->kqwl_turnstile,
3394	turnstile: &ts, type: TURNSTILE_WORKLOOPS);
3395	turnstile_cleanup();
3396	turnstile_deallocate(turnstile: ts);
3397	}
3398
3399	if (hash_remove) {
3400	struct filedesc *fdp = &kqwl->kqwl_p->p_fd;
3401
3402	kqhash_lock(fdp);
3403	LIST_REMOVE(kqwl, kqwl_hashlink);
3404	#if CONFIG_PROC_RESOURCE_LIMITS
3405	fdp->num_kqwls--;
3406	#endif
3407	kqhash_unlock(fdp);
3408	}
3409
3410	#if CONFIG_PREADOPT_TG
3411	thread_group_qos_t tg = os_atomic_load(&kqwl->kqwl_preadopt_tg, relaxed);
3412	if (KQWL_HAS_VALID_PREADOPTED_TG(tg)) {
3413	thread_group_release(KQWL_GET_PREADOPTED_TG(tg));
3414	}
3415	#endif
3416
3417	assert(TAILQ_EMPTY(&kqwl->kqwl_suppressed));
3418	assert(kqwl->kqwl_owner == THREAD_NULL);
3419	assert(kqwl->kqwl_turnstile == TURNSTILE_NULL);
3420
3421	lck_spin_destroy(lck: &kqwl->kqwl_statelock, grp: &kq_lck_grp);
3422	kqueue_destroy(kqu: kqwl, zone: kqworkloop_zone);
3423	}
3424
3425	/!*
3426	* @function kqworkloop_init
3427	*
3428	* @brief
3429	* Initializes an allocated kqworkloop.
3430	*/
3431	static void
3432	kqworkloop_init(struct kqworkloop *kqwl, proc_t p,
3433	kqueue_id_t id, workq_threadreq_param_t *trp
3434	#if CONFIG_PREADOPT_TG
3435	, struct thread_group *trp_permanent_preadopt_tg
3436	#endif
3437	)
3438	{
3439	kqwl->kqwl_state = KQ_WORKLOOP \| KQ_DYNAMIC \| KQ_KEV_QOS;
3440	os_ref_init_raw(&kqwl->kqwl_retains, NULL);
3441	kqwl->kqwl_dynamicid = id;
3442	kqwl->kqwl_p = p;
3443	if (trp) {
3444	kqwl->kqwl_params = trp->trp_value;
3445	}
3446
3447	workq_tr_flags_t tr_flags = WORKQ_TR_FLAG_WORKLOOP;
3448	if (trp) {
3449	if (trp->trp_flags & TRP_PRIORITY) {
3450	tr_flags \|= WORKQ_TR_FLAG_WL_OUTSIDE_QOS;
3451	}
3452	if (trp->trp_flags) {
3453	tr_flags \|= WORKQ_TR_FLAG_WL_PARAMS;
3454	}
3455	}
3456	kqwl->kqwl_request.tr_state = WORKQ_TR_STATE_IDLE;
3457	kqwl->kqwl_request.tr_flags = tr_flags;
3458	os_atomic_store(&kqwl->kqwl_iotier_override, (uint8_t)THROTTLE_LEVEL_END, relaxed);
3459	#if CONFIG_PREADOPT_TG
3460	if (trp_permanent_preadopt_tg) {
3461	/*
3462	* This kqwl is permanently configured with a thread group.
3463	* By using THREAD_QOS_LAST, we make sure kqueue_set_preadopted_thread_group
3464	* has no effect on kqwl_preadopt_tg. At this point, +1 ref on
3465	* trp_permanent_preadopt_tg is transferred to the kqwl.
3466	*/
3467	thread_group_qos_t kqwl_preadopt_tg;
3468	kqwl_preadopt_tg = KQWL_ENCODE_PERMANENT_PREADOPTED_TG(trp_permanent_preadopt_tg);
3469	os_atomic_store(&kqwl->kqwl_preadopt_tg, kqwl_preadopt_tg, relaxed);
3470	} else if (task_is_app(task: current_task())) {
3471	/*
3472	* Not a specially preconfigured kqwl so it is open to participate in sync IPC
3473	* thread group preadoption; but, apps will never adopt a thread group that
3474	* is not their own. This is a gross hack to simulate the post-process that
3475	* is done in the voucher subsystem today for thread groups.
3476	*/
3477	os_atomic_store(&kqwl->kqwl_preadopt_tg, KQWL_PREADOPTED_TG_NEVER, relaxed);
3478	}
3479	#endif
3480
3481	for (int i = `0`; i < KQWL_NBUCKETS; i++) {
3482	TAILQ_INIT_AFTER_BZERO(&kqwl->kqwl_queue[i]);
3483	}
3484	TAILQ_INIT_AFTER_BZERO(&kqwl->kqwl_suppressed);
3485
3486	lck_spin_init(lck: &kqwl->kqwl_statelock, grp: &kq_lck_grp, LCK_ATTR_NULL);
3487
3488	kqueue_init(kqu: kqwl);
3489	}
3490
3491	#if CONFIG_PROC_RESOURCE_LIMITS
3492	void
3493	kqworkloop_check_limit_exceeded(struct filedesc *fdp)
3494	{
3495	int num_kqwls = fdp->num_kqwls;
3496	if (!kqwl_above_soft_limit_notified(fdp) && fdp->kqwl_dyn_soft_limit > `0` &&
3497	num_kqwls > fdp->kqwl_dyn_soft_limit) {
3498	kqwl_above_soft_limit_send_notification(fdp);
3499	act_set_astproc_resource(current_thread());
3500	} else if (!kqwl_above_hard_limit_notified(fdp) && fdp->kqwl_dyn_hard_limit > `0`
3501	&& num_kqwls > fdp->kqwl_dyn_hard_limit) {
3502	kqwl_above_hard_limit_send_notification(fdp);
3503	act_set_astproc_resource(current_thread());
3504	}
3505	}
3506	#endif
3507
3508	/!*
3509	* @function kqworkloop_get_or_create
3510	*
3511	* @brief
3512	* Wrapper around kqworkloop_init that handles the uniquing of workloops.
3513	*
3514	* @returns
3515	* 0: success
3516	* EINVAL: invalid parameters
3517	* EEXIST: KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST is set and a collision exists.
3518	* ENOENT: KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST is set and the entry wasn't found.
3519	* ENOMEM: allocation failed
3520	*/
3521	static int
3522	kqworkloop_get_or_create(struct proc *p, kqueue_id_t id,
3523	workq_threadreq_param_t *trp,
3524	#if CONFIG_PREADOPT_TG
3525	struct thread_group *trp_permanent_preadopt_tg,
3526	#endif
3527	unsigned int flags, struct kqworkloop **kqwlp)
3528	{
3529	struct filedesc *fdp = &p->p_fd;
3530	struct kqworkloop *alloc_kqwl = NULL;
3531	struct kqworkloop *kqwl = NULL;
3532	int error = `0`;
3533
3534	assert(!trp \|\| (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST));
3535
3536	if (id == `0` \|\| id == (kqueue_id_t)-`1`) {
3537	return EINVAL;
3538	}
3539
3540	for (;;) {
3541	kqhash_lock(fdp);
3542	if (__improbable(fdp->fd_kqhash == NULL)) {
3543	kqworkloop_hash_init(fdp);
3544	}
3545
3546	kqwl = kqworkloop_hash_lookup_locked(fdp, id);
3547	if (kqwl) {
3548	if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) {
3549	/*
3550	* If MUST_NOT_EXIST was passed, even if we would have failed
3551	* the try_retain, it could have gone the other way, and
3552	* userspace can't tell. Let'em fix their race.
3553	*/
3554	error = EEXIST;
3555	break;
3556	}
3557
3558	if (__probable(kqworkloop_try_retain(kqwl))) {
3559	/*
3560	* This is a valid live workloop !
3561	*/
3562	*kqwlp = kqwl;
3563	error = `0`;
3564	break;
3565	}
3566	}
3567
3568	if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST)) {
3569	error = ENOENT;
3570	break;
3571	}
3572
3573	/*
3574	* We didn't find what we were looking for.
3575	*
3576	* If this is the second time we reach this point (alloc_kqwl != NULL),
3577	* then we're done.
3578	*
3579	* If this is the first time we reach this point (alloc_kqwl == NULL),
3580	* then try to allocate one without blocking.
3581	*/
3582	if (__probable(alloc_kqwl == NULL)) {
3583	alloc_kqwl = zalloc_flags(kqworkloop_zone, Z_NOWAIT \| Z_ZERO);
3584	}
3585	if (__probable(alloc_kqwl)) {
3586	#if CONFIG_PROC_RESOURCE_LIMITS
3587	fdp->num_kqwls++;
3588	kqworkloop_check_limit_exceeded(fdp);
3589	#endif
3590	kqworkloop_init(kqwl: alloc_kqwl, p, id, trp
3591	#if CONFIG_PREADOPT_TG
3592	, trp_permanent_preadopt_tg
3593	#endif
3594	);
3595	kqworkloop_hash_insert_locked(fdp, id, kqwl: alloc_kqwl);
3596	kqhash_unlock(fdp);
3597	*kqwlp = alloc_kqwl;
3598	return `0`;
3599	}
3600
3601	/*
3602	* We have to block to allocate a workloop, drop the lock,
3603	* allocate one, but then we need to retry lookups as someone
3604	* else could race with us.
3605	*/
3606	kqhash_unlock(fdp);
3607
3608	alloc_kqwl = zalloc_flags(kqworkloop_zone, Z_WAITOK \| Z_ZERO);
3609	}
3610
3611	kqhash_unlock(fdp);
3612
3613	if (__improbable(alloc_kqwl)) {
3614	zfree(kqworkloop_zone, alloc_kqwl);
3615	}
3616
3617	return error;
3618	}
3619
3620	#pragma mark - knotes
3621
3622	static int
3623	filt_no_attach(struct knote kn, __unused struct* kevent_qos_s *kev)
3624	{
3625	knote_set_error(kn, ENOTSUP);
3626	return `0`;
3627	}
3628
3629	static void
3630	filt_no_detach(__unused struct knote *kn)
3631	{
3632	}
3633
3634	static int __dead2
3635	filt_bad_event(struct knote kn, long* hint)
3636	{
3637	panic("%s[%d](%p, %ld)", __func__, kn->kn_filter, kn, hint);
3638	}
3639
3640	static int __dead2
3641	filt_bad_touch(struct knote kn, struct* kevent_qos_s *kev)
3642	{
3643	panic("%s[%d](%p, %p)", __func__, kn->kn_filter, kn, kev);
3644	}
3645
3646	static int __dead2
3647	filt_bad_process(struct knote kn, struct* kevent_qos_s *kev)
3648	{
3649	panic("%s[%d](%p, %p)", __func__, kn->kn_filter, kn, kev);
3650	}
3651
3652	/*
3653	* knotes_dealloc - detach all knotes for the process and drop them
3654	*
3655	* Process is in such a state that it will not try to allocate
3656	* any more knotes during this process (stopped for exit or exec).
3657	*/
3658	void
3659	knotes_dealloc(proc_t p)
3660	{
3661	struct filedesc *fdp = &p->p_fd;
3662	struct kqueue *kq;
3663	struct knote *kn;
3664	struct klist *kn_hash = NULL;
3665	u_long kn_hashmask;
3666	int i;
3667
3668	proc_fdlock(p);
3669
3670	/ Close all the fd-indexed knotes up front /
3671	if (fdp->fd_knlistsize > `0`) {
3672	for (i = `0`; i < fdp->fd_knlistsize; i++) {
3673	while ((kn = SLIST_FIRST(&fdp->fd_knlist[i])) != NULL) {
3674	kq = knote_get_kq(kn);
3675	kqlock(kqu: kq);
3676	proc_fdunlock(p);
3677	knote_drop(kqu: kq, kn, NULL);
3678	proc_fdlock(p);
3679	}
3680	}
3681	/ free the table /
3682	kfree_type(struct klist, fdp->fd_knlistsize, fdp->fd_knlist);
3683	}
3684	fdp->fd_knlistsize = `0`;
3685
3686	proc_fdunlock(p);
3687
3688	knhash_lock(fdp);
3689
3690	/ Clean out all the hashed knotes as well /
3691	if (fdp->fd_knhashmask != `0`) {
3692	for (i = `0`; i <= (int)fdp->fd_knhashmask; i++) {
3693	while ((kn = SLIST_FIRST(&fdp->fd_knhash[i])) != NULL) {
3694	kq = knote_get_kq(kn);
3695	kqlock(kqu: kq);
3696	knhash_unlock(fdp);
3697	knote_drop(kqu: kq, kn, NULL);
3698	knhash_lock(fdp);
3699	}
3700	}
3701	kn_hash = fdp->fd_knhash;
3702	kn_hashmask = fdp->fd_knhashmask;
3703	fdp->fd_knhashmask = `0`;
3704	fdp->fd_knhash = NULL;
3705	}
3706
3707	knhash_unlock(fdp);
3708
3709	if (kn_hash) {
3710	hashdestroy(kn_hash, M_KQUEUE, hashmask: kn_hashmask);
3711	}
3712	}
3713
3714	/*
3715	* kqworkloops_dealloc - rebalance retains on kqworkloops created with
3716	* scheduling parameters
3717	*
3718	* Process is in such a state that it will not try to allocate
3719	* any more kqs or knotes during this process (stopped for exit or exec).
3720	*/
3721	void
3722	kqworkloops_dealloc(proc_t p)
3723	{
3724	struct filedesc *fdp = &p->p_fd;
3725	struct kqworkloop kqwl, kqwln;
3726	struct kqwllist tofree;
3727
3728	if (!fdt_flag_test(fdp, FD_WORKLOOP)) {
3729	return;
3730	}
3731
3732	kqhash_lock(fdp);
3733
3734	if (fdp->fd_kqhashmask == `0`) {
3735	kqhash_unlock(fdp);
3736	return;
3737	}
3738
3739	LIST_INIT(&tofree);
3740
3741	for (size_t i = `0`; i <= fdp->fd_kqhashmask; i++) {
3742	LIST_FOREACH_SAFE(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink, kqwln) {
3743	#if CONFIG_PREADOPT_TG
3744	/*
3745	* kqworkloops that have scheduling parameters have an
3746	* implicit retain from kqueue_workloop_ctl that needs
3747	* to be balanced on process exit.
3748	*/
3749	__assert_only thread_group_qos_t preadopt_tg;
3750	preadopt_tg = os_atomic_load(&kqwl->kqwl_preadopt_tg, relaxed);
3751	#endif
3752	assert(kqwl->kqwl_params
3753	#if CONFIG_PREADOPT_TG
3754	\|\| KQWL_HAS_PERMANENT_PREADOPTED_TG(preadopt_tg)
3755	#endif
3756	);
3757
3758	LIST_REMOVE(kqwl, kqwl_hashlink);
3759	LIST_INSERT_HEAD(&tofree, kqwl, kqwl_hashlink);
3760	}
3761	}
3762	#if CONFIG_PROC_RESOURCE_LIMITS
3763	fdp->num_kqwls = `0`;
3764	#endif
3765	kqhash_unlock(fdp);
3766
3767	LIST_FOREACH_SAFE(kqwl, &tofree, kqwl_hashlink, kqwln) {
3768	uint32_t ref = os_ref_get_count_raw(rc: &kqwl->kqwl_retains);
3769	if (ref != `1`) {
3770	panic("kq(%p) invalid refcount %d", kqwl, ref);
3771	}
3772	kqworkloop_dealloc(kqwl, false);
3773	}
3774	}
3775
3776	static int
3777	kevent_register_validate_priority(struct kqueue kq, struct* knote *kn,
3778	struct kevent_qos_s *kev)
3779	{
3780	/ We don't care about the priority of a disabled or deleted knote /
3781	if (kev->flags & (EV_DISABLE \| EV_DELETE)) {
3782	return `0`;
3783	}
3784
3785	if (kq->kq_state & KQ_WORKLOOP) {
3786	/*
3787	* Workloops need valid priorities with a QOS (excluding manager) for
3788	* any enabled knote.
3789	*
3790	* When it is pre-existing, just make sure it has a valid QoS as
3791	* kevent_register() will not use the incoming priority (filters who do
3792	* have the responsibility to validate it again, see filt_wltouch).
3793	*
3794	* If the knote is being made, validate the incoming priority.
3795	*/
3796	if (!_pthread_priority_thread_qos(pp: kn ? kn->kn_qos : kev->qos)) {
3797	return ERANGE;
3798	}
3799	}
3800
3801	return `0`;
3802	}
3803
3804	/*
3805	* Prepare a filter for waiting after register.
3806	*
3807	* The f_post_register_wait hook will be called later by kevent_register()
3808	* and should call kevent_register_wait_block()
3809	*/
3810	static int
3811	kevent_register_wait_prepare(struct knote kn, struct* kevent_qos_s kev, int* rc)
3812	{
3813	thread_t thread = current_thread();
3814
3815	assert(knote_fops(kn)->f_extended_codes);
3816
3817	if (kn->kn_thread == NULL) {
3818	thread_reference(thread);
3819	kn->kn_thread = thread;
3820	} else if (kn->kn_thread != thread) {
3821	/*
3822	* kn_thread may be set from a previous aborted wait
3823	* However, it has to be from the same thread.
3824	*/
3825	kev->flags \|= EV_ERROR;
3826	kev->data = EXDEV;
3827	return `0`;
3828	}
3829
3830	return FILTER_REGISTER_WAIT \| rc;
3831	}
3832
3833	/*
3834	* Cleanup a kevent_register_wait_prepare() effect for threads that have been
3835	* aborted instead of properly woken up with thread_wakeup_thread().
3836	*/
3837	static void
3838	kevent_register_wait_cleanup(struct knote *kn)
3839	{
3840	thread_t thread = kn->kn_thread;
3841	kn->kn_thread = NULL;
3842	thread_deallocate(thread);
3843	}
3844
3845	/*
3846	* Must be called at the end of a f_post_register_wait call from a filter.
3847	*/
3848	static void
3849	kevent_register_wait_block(struct turnstile *ts, thread_t thread,
3850	thread_continue_t cont, struct _kevent_register *cont_args)
3851	{
3852	turnstile_update_inheritor_complete(turnstile: ts, flags: TURNSTILE_INTERLOCK_HELD);
3853	kqunlock(kqu: cont_args->kqwl);
3854	cont_args->handoff_thread = thread;
3855	thread_handoff_parameter(thread, continuation: cont, parameter: cont_args, THREAD_HANDOFF_NONE);
3856	}
3857
3858	/*
3859	* Called by Filters using a f_post_register_wait to return from their wait.
3860	*/
3861	static void
3862	kevent_register_wait_return(struct _kevent_register *cont_args)
3863	{
3864	struct kqworkloop *kqwl = cont_args->kqwl;
3865	struct kevent_qos_s *kev = &cont_args->kev;
3866	int error = `0`;
3867
3868	if (cont_args->handoff_thread) {
3869	thread_deallocate(thread: cont_args->handoff_thread);
3870	}
3871
3872	if (kev->flags & (EV_ERROR \| EV_RECEIPT)) {
3873	if ((kev->flags & EV_ERROR) == `0`) {
3874	kev->flags \|= EV_ERROR;
3875	kev->data = `0`;
3876	}
3877	error = kevent_modern_copyout(kev, &cont_args->ueventlist);
3878	if (error == `0`) {
3879	cont_args->eventout++;
3880	}
3881	}
3882
3883	kqworkloop_release(kqwl);
3884	if (error == `0`) {
3885	(int32_t )&current_uthread()->uu_rval = cont_args->eventout;
3886	}
3887	unix_syscall_return(error);
3888	}
3889
3890	/*
3891	* kevent_register - add a new event to a kqueue
3892	*
3893	* Creates a mapping between the event source and
3894	* the kqueue via a knote data structure.
3895	*
3896	* Because many/most the event sources are file
3897	* descriptor related, the knote is linked off
3898	* the filedescriptor table for quick access.
3899	*
3900	* called with nothing locked
3901	* caller holds a reference on the kqueue
3902	*/
3903
3904	int
3905	kevent_register(struct kqueue kq, struct* kevent_qos_s *kev,
3906	struct knote **kn_out)
3907	{
3908	struct proc *p = kq->kq_p;
3909	const struct filterops *fops;
3910	struct knote *kn = NULL;
3911	int result = `0`, error = `0`;
3912	unsigned short kev_flags = kev->flags;
3913	KNOTE_LOCK_CTX(knlc);
3914
3915	if (__probable(kev->filter < `0` && kev->filter + EVFILT_SYSCOUNT >= `0`)) {
3916	fops = sysfilt_ops[~kev->filter]; / to 0-base index /
3917	} else {
3918	error = EINVAL;
3919	goto out;
3920	}
3921
3922	/ restrict EV_VANISHED to adding udata-specific dispatch kevents /
3923	if (__improbable((kev->flags & EV_VANISHED) &&
3924	(kev->flags & (EV_ADD \| EV_DISPATCH2)) != (EV_ADD \| EV_DISPATCH2))) {
3925	error = EINVAL;
3926	goto out;
3927	}
3928
3929	/ Simplify the flags - delete and disable overrule /
3930	if (kev->flags & EV_DELETE) {
3931	kev->flags &= ~EV_ADD;
3932	}
3933	if (kev->flags & EV_DISABLE) {
3934	kev->flags &= ~EV_ENABLE;
3935	}
3936
3937	if (kq->kq_state & KQ_WORKLOOP) {
3938	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_REGISTER),
3939	((struct kqworkloop *)kq)->kqwl_dynamicid,
3940	kev->udata, kev->flags, kev->filter);
3941	} else if (kq->kq_state & KQ_WORKQ) {
3942	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_REGISTER),
3943	`0`, kev->udata, kev->flags, kev->filter);
3944	} else {
3945	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_REGISTER),
3946	VM_KERNEL_UNSLIDE_OR_PERM(kq),
3947	kev->udata, kev->flags, kev->filter);
3948	}
3949
3950	restart:
3951	/ find the matching knote from the fd tables/hashes /
3952	kn = kq_find_knote_and_kq_lock(kq, kev, is_fd: fops->f_isfd, p);
3953	error = kevent_register_validate_priority(kq, kn, kev);
3954	result = `0`;
3955	if (error) {
3956	if (kn) {
3957	kqunlock(kqu: kq);
3958	}
3959	goto out;
3960	}
3961
3962	if (kn == NULL && (kev->flags & EV_ADD) == `0`) {
3963	/*
3964	* No knote found, EV_ADD wasn't specified
3965	*/
3966
3967	if ((kev_flags & EV_ADD) && (kev_flags & EV_DELETE) &&
3968	(kq->kq_state & KQ_WORKLOOP)) {
3969	/*
3970	* For workloops, understand EV_ADD\|EV_DELETE as a "soft" delete
3971	* that doesn't care about ENOENT, so just pretend the deletion
3972	* happened.
3973	*/
3974	} else {
3975	error = ENOENT;
3976	}
3977	goto out;
3978	} else if (kn == NULL) {
3979	/*
3980	* No knote found, need to attach a new one (attach)
3981	*/
3982
3983	struct fileproc *knote_fp = NULL;
3984
3985	/ grab a file reference for the new knote /
3986	if (fops->f_isfd) {
3987	if ((error = fp_lookup(p, fd: (int)kev->ident, resultfp: &knote_fp, locked: `0`)) != `0`) {
3988	goto out;
3989	}
3990	}
3991
3992	kn = knote_alloc();
3993	kn->kn_fp = knote_fp;
3994	kn->kn_is_fd = fops->f_isfd;
3995	kn->kn_kq_packed = VM_PACK_POINTER((vm_offset_t)kq, KNOTE_KQ_PACKED);
3996	kn->kn_status = `0`;
3997
3998	/ was vanish support requested /
3999	if (kev->flags & EV_VANISHED) {
4000	kev->flags &= ~EV_VANISHED;
4001	kn->kn_status \|= KN_REQVANISH;
4002	}
4003
4004	/ snapshot matching/dispatching protocol flags into knote /
4005	if (kev->flags & EV_DISABLE) {
4006	kn->kn_status \|= KN_DISABLED;
4007	}
4008
4009	/*
4010	* copy the kevent state into knote
4011	* protocol is that fflags and data
4012	* are saved off, and cleared before
4013	* calling the attach routine.
4014	*
4015	* - kn->kn_sfflags aliases with kev->xflags
4016	* - kn->kn_sdata aliases with kev->data
4017	* - kn->kn_filter is the top 8 bits of kev->filter
4018	*/
4019	kn->kn_kevent = (struct* kevent_internal_s *)kev;
4020	kn->kn_sfflags = kev->fflags;
4021	kn->kn_filtid = (uint8_t)~kev->filter;
4022	kn->kn_fflags = `0`;
4023	knote_reset_priority(kqu: kq, kn, pp: kev->qos);
4024
4025	/ Add the knote for lookup thru the fd table /
4026	error = kq_add_knote(kq, kn, knlc: &knlc, p);
4027	if (error) {
4028	knote_free(kn);
4029	if (knote_fp != NULL) {
4030	fp_drop(p, fd: (int)kev->ident, fp: knote_fp, locked: `0`);
4031	}
4032
4033	if (error == ERESTART) {
4034	goto restart;
4035	}
4036	goto out;
4037	}
4038
4039	/ fp reference count now applies to knote /
4040
4041	/*
4042	* we can't use filter_call() because f_attach can change the filter ops
4043	* for a filter that supports f_extended_codes, so we need to reload
4044	* knote_fops() and not use `fops`.
4045	*/
4046	result = fops->f_attach(kn, kev);
4047	if (result && !knote_fops(kn)->f_extended_codes) {
4048	result = FILTER_ACTIVE;
4049	}
4050
4051	kqlock(kqu: kq);
4052
4053	if (result & FILTER_THREADREQ_NODEFEER) {
4054	enable_preemption();
4055	}
4056
4057	if (kn->kn_flags & EV_ERROR) {
4058	/*
4059	* Failed to attach correctly, so drop.
4060	*/
4061	kn->kn_filtid = EVFILTID_DETACHED;
4062	error = (int)kn->kn_sdata;
4063	knote_drop(kqu: kq, kn, knlc: &knlc);
4064	result = `0`;
4065	goto out;
4066	}
4067
4068	/*
4069	* end "attaching" phase - now just attached
4070	*
4071	* Mark the thread request overcommit, if appropos
4072	*
4073	* If the attach routine indicated that an
4074	* event is already fired, activate the knote.
4075	*/
4076	if ((kn->kn_qos & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) &&
4077	(kq->kq_state & KQ_WORKLOOP)) {
4078	kqworkloop_set_overcommit(kqwl: (struct kqworkloop *)kq);
4079	}
4080	} else if (!knote_lock(kqu: kq, kn, knlc: &knlc, kqlocking: KNOTE_KQ_LOCK_ON_SUCCESS)) {
4081	/*
4082	* The knote was dropped while we were waiting for the lock,
4083	* we need to re-evaluate entirely
4084	*/
4085
4086	goto restart;
4087	} else if (kev->flags & EV_DELETE) {
4088	/*
4089	* Deletion of a knote (drop)
4090	*
4091	* If the filter wants to filter drop events, let it do so.
4092	*
4093	* defer-delete: when trying to delete a disabled EV_DISPATCH2 knote,
4094	* we must wait for the knote to be re-enabled (unless it is being
4095	* re-enabled atomically here).
4096	*/
4097
4098	if (knote_fops(kn)->f_allow_drop) {
4099	bool drop;
4100
4101	kqunlock(kqu: kq);
4102	drop = knote_fops(kn)->f_allow_drop(kn, kev);
4103	kqlock(kqu: kq);
4104
4105	if (!drop) {
4106	goto out_unlock;
4107	}
4108	}
4109
4110	if ((kev->flags & EV_ENABLE) == `0` &&
4111	(kn->kn_flags & EV_DISPATCH2) == EV_DISPATCH2 &&
4112	(kn->kn_status & KN_DISABLED) != `0`) {
4113	kn->kn_status \|= KN_DEFERDELETE;
4114	error = EINPROGRESS;
4115	goto out_unlock;
4116	}
4117
4118	knote_drop(kqu: kq, kn, knlc: &knlc);
4119	goto out;
4120	} else {
4121	/*
4122	* Regular update of a knote (touch)
4123	*
4124	* Call touch routine to notify filter of changes in filter values
4125	* (and to re-determine if any events are fired).
4126	*
4127	* If the knote is in defer-delete, avoid calling the filter touch
4128	* routine (it has delivered its last event already).
4129	*
4130	* If the touch routine had no failure,
4131	* apply the requested side effects to the knote.
4132	*/
4133
4134	if (kn->kn_status & (KN_DEFERDELETE \| KN_VANISHED)) {
4135	if (kev->flags & EV_ENABLE) {
4136	result = FILTER_ACTIVE;
4137	}
4138	} else {
4139	kqunlock(kqu: kq);
4140	result = filter_call(knote_fops(kn), f_touch(kn, kev));
4141	kqlock(kqu: kq);
4142	if (result & FILTER_THREADREQ_NODEFEER) {
4143	enable_preemption();
4144	}
4145	}
4146
4147	if (kev->flags & EV_ERROR) {
4148	result = `0`;
4149	goto out_unlock;
4150	}
4151
4152	if ((kn->kn_flags & EV_UDATA_SPECIFIC) == `0` &&
4153	kn->kn_udata != kev->udata) {
4154	// this allows klist_copy_udata() not to take locks
4155	os_atomic_store_wide(&kn->kn_udata, kev->udata, relaxed);
4156	}
4157	if ((kev->flags & EV_DISABLE) && !(kn->kn_status & KN_DISABLED)) {
4158	kn->kn_status \|= KN_DISABLED;
4159	knote_dequeue(kqu: kq, kn);
4160	}
4161	}
4162
4163	/ accept new kevent state /
4164	knote_apply_touch(kqu: kq, kn, kev, result);
4165
4166	out_unlock:
4167	/*
4168	* When the filter asked for a post-register wait,
4169	* we leave the kqueue locked for kevent_register()
4170	* to call the filter's f_post_register_wait hook.
4171	*/
4172	if (result & FILTER_REGISTER_WAIT) {
4173	knote_unlock(kqu: kq, kn, knlc: &knlc, kqlocking: KNOTE_KQ_LOCK_ALWAYS);
4174	*kn_out = kn;
4175	} else {
4176	knote_unlock(kqu: kq, kn, knlc: &knlc, kqlocking: KNOTE_KQ_UNLOCK);
4177	}
4178
4179	out:
4180	/ output local errors through the kevent /
4181	if (error) {
4182	kev->flags \|= EV_ERROR;
4183	kev->data = error;
4184	}
4185	return result;
4186	}
4187
4188	/*
4189	* knote_process - process a triggered event
4190	*
4191	* Validate that it is really still a triggered event
4192	* by calling the filter routines (if necessary). Hold
4193	* a use reference on the knote to avoid it being detached.
4194	*
4195	* If it is still considered triggered, we will have taken
4196	* a copy of the state under the filter lock. We use that
4197	* snapshot to dispatch the knote for future processing (or
4198	* not, if this was a lost event).
4199	*
4200	* Our caller assures us that nobody else can be processing
4201	* events from this knote during the whole operation. But
4202	* others can be touching or posting events to the knote
4203	* interspersed with our processing it.
4204	*
4205	* caller holds a reference on the kqueue.
4206	* kqueue locked on entry and exit - but may be dropped
4207	*/
4208	static int
4209	knote_process(struct knote *kn, kevent_ctx_t kectx,
4210	kevent_callback_t callback)
4211	{
4212	struct kevent_qos_s kev;
4213	struct kqueue *kq = knote_get_kq(kn);
4214	KNOTE_LOCK_CTX(knlc);
4215	int result = FILTER_ACTIVE;
4216	int error = `0`;
4217	bool drop = false;
4218
4219	/*
4220	* Must be active
4221	* Must be queued and not disabled/suppressed or dropping
4222	*/
4223	assert(kn->kn_status & KN_QUEUED);
4224	assert(kn->kn_status & KN_ACTIVE);
4225	assert(!(kn->kn_status & (KN_DISABLED \| KN_SUPPRESSED \| KN_DROPPING)));
4226
4227	if (kq->kq_state & KQ_WORKLOOP) {
4228	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS),
4229	((struct kqworkloop *)kq)->kqwl_dynamicid,
4230	kn->kn_udata, kn->kn_status \| (kn->kn_id << `32`),
4231	kn->kn_filtid);
4232	} else if (kq->kq_state & KQ_WORKQ) {
4233	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS),
4234	`0`, kn->kn_udata, kn->kn_status \| (kn->kn_id << `32`),
4235	kn->kn_filtid);
4236	} else {
4237	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS),
4238	VM_KERNEL_UNSLIDE_OR_PERM(kq), kn->kn_udata,
4239	kn->kn_status \| (kn->kn_id << `32`), kn->kn_filtid);
4240	}
4241
4242	if (!knote_lock(kqu: kq, kn, knlc: &knlc, kqlocking: KNOTE_KQ_LOCK_ALWAYS)) {
4243	/*
4244	* When the knote is dropping or has dropped,
4245	* then there's nothing we want to process.
4246	*/
4247	return EJUSTRETURN;
4248	}
4249
4250	/*
4251	* While waiting for the knote lock, we may have dropped the kq lock.
4252	* and a touch may have disabled and dequeued the knote.
4253	*/
4254	if (!(kn->kn_status & KN_QUEUED)) {
4255	knote_unlock(kqu: kq, kn, knlc: &knlc, kqlocking: KNOTE_KQ_LOCK_ALWAYS);
4256	return EJUSTRETURN;
4257	}
4258
4259	/*
4260	* For deferred-drop or vanished events, we just create a fake
4261	* event to acknowledge end-of-life. Otherwise, we call the
4262	* filter's process routine to snapshot the kevent state under
4263	* the filter's locking protocol.
4264	*
4265	* suppress knotes to avoid returning the same event multiple times in
4266	* a single call.
4267	*/
4268	knote_suppress(kqu: kq, kn);
4269
4270	if (kn->kn_status & (KN_DEFERDELETE \| KN_VANISHED)) {
4271	uint16_t kev_flags = EV_DISPATCH2 \| EV_ONESHOT;
4272	if (kn->kn_status & KN_DEFERDELETE) {
4273	kev_flags \|= EV_DELETE;
4274	} else {
4275	kev_flags \|= EV_VANISHED;
4276	}
4277
4278	/ create fake event /
4279	kev = (struct kevent_qos_s){
4280	.filter = kn->kn_filter,
4281	.ident = kn->kn_id,
4282	.flags = kev_flags,
4283	.udata = kn->kn_udata,
4284	};
4285	} else {
4286	kqunlock(kqu: kq);
4287	kev = (struct kevent_qos_s) { };
4288	result = filter_call(knote_fops(kn), f_process(kn, &kev));
4289	kqlock(kqu: kq);
4290	}
4291
4292	/*
4293	* Determine how to dispatch the knote for future event handling.
4294	* not-fired: just return (do not callout, leave deactivated).
4295	* One-shot: If dispatch2, enter deferred-delete mode (unless this is
4296	* is the deferred delete event delivery itself). Otherwise,
4297	* drop it.
4298	* Dispatch: don't clear state, just mark it disabled.
4299	* Cleared: just leave it deactivated.
4300	* Others: re-activate as there may be more events to handle.
4301	* This will not wake up more handlers right now, but
4302	* at the completion of handling events it may trigger
4303	* more handler threads (TODO: optimize based on more than
4304	* just this one event being detected by the filter).
4305	*/
4306	if ((result & FILTER_ACTIVE) == `0`) {
4307	if ((kn->kn_status & KN_ACTIVE) == `0`) {
4308	/*
4309	* Some knotes (like EVFILT_WORKLOOP) can be reactivated from
4310	* within f_process() but that doesn't necessarily make them
4311	* ready to process, so we should leave them be.
4312	*
4313	* For other knotes, since we will not return an event,
4314	* there's no point keeping the knote suppressed.
4315	*/
4316	knote_unsuppress(kqu: kq, kn);
4317	}
4318	knote_unlock(kqu: kq, kn, knlc: &knlc, kqlocking: KNOTE_KQ_LOCK_ALWAYS);
4319	return EJUSTRETURN;
4320	}
4321
4322	if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
4323	knote_adjust_qos(kq, kn, result);
4324	}
4325
4326	if (result & FILTER_ADJUST_EVENT_IOTIER_BIT) {
4327	kqueue_update_iotier_override(kqu: kq);
4328	}
4329
4330	kev.qos = _pthread_priority_combine(base_pp: kn->kn_qos, qos: kn->kn_qos_override);
4331
4332	if (kev.flags & EV_ONESHOT) {
4333	if ((kn->kn_flags & EV_DISPATCH2) == EV_DISPATCH2 &&
4334	(kn->kn_status & KN_DEFERDELETE) == `0`) {
4335	/ defer dropping non-delete oneshot dispatch2 events /
4336	kn->kn_status \|= KN_DEFERDELETE \| KN_DISABLED;
4337	} else {
4338	drop = true;
4339	}
4340	} else if (kn->kn_flags & EV_DISPATCH) {
4341	/ disable all dispatch knotes /
4342	kn->kn_status \|= KN_DISABLED;
4343	} else if ((kn->kn_flags & EV_CLEAR) == `0`) {
4344	/ re-activate in case there are more events /
4345	knote_activate(kqu: kq, kn, FILTER_ACTIVE);
4346	}
4347
4348	/*
4349	* callback to handle each event as we find it.
4350	* If we have to detach and drop the knote, do
4351	* it while we have the kq unlocked.
4352	*/
4353	if (drop) {
4354	knote_drop(kqu: kq, kn, knlc: &knlc);
4355	} else {
4356	knote_unlock(kqu: kq, kn, knlc: &knlc, kqlocking: KNOTE_KQ_UNLOCK);
4357	}
4358
4359	if (kev.flags & EV_VANISHED) {
4360	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_VANISHED),
4361	kev.ident, kn->kn_udata, kn->kn_status \| (kn->kn_id << `32`),
4362	kn->kn_filtid);
4363	}
4364
4365	error = (callback)(&kev, kectx);
4366	kqlock(kqu: kq);
4367	return error;
4368	}
4369
4370	/*
4371	* Returns -1 if the kqueue was unbound and processing should not happen
4372	*/
4373	#define KQWQAE_BEGIN_PROCESSING 1
4374	#define KQWQAE_END_PROCESSING 2
4375	#define KQWQAE_UNBIND 3
4376	static int
4377	kqworkq_acknowledge_events(struct kqworkq *kqwq, workq_threadreq_t kqr,
4378	int kevent_flags, int kqwqae_op)
4379	{
4380	struct knote *kn;
4381	int rc = `0`;
4382	bool unbind;
4383	struct kqtailq *suppressq = &kqwq->kqwq_suppressed[kqr->tr_kq_qos_index - `1`];
4384	struct kqtailq *queue = &kqwq->kqwq_queue[kqr->tr_kq_qos_index - `1`];
4385
4386	kqlock_held(kqu: &kqwq->kqwq_kqueue);
4387
4388	/*
4389	* Return suppressed knotes to their original state.
4390	* For workq kqueues, suppressed ones that are still
4391	* truly active (not just forced into the queue) will
4392	* set flags we check below to see if anything got
4393	* woken up.
4394	*/
4395	while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
4396	knote_unsuppress(kqu: kqwq, kn);
4397	}
4398
4399	if (kqwqae_op == KQWQAE_UNBIND) {
4400	unbind = true;
4401	} else if ((kevent_flags & KEVENT_FLAG_PARKING) == `0`) {
4402	unbind = false;
4403	} else {
4404	unbind = TAILQ_EMPTY(queue);
4405	}
4406	if (unbind) {
4407	thread_t thread = kqr_thread_fast(kqr);
4408	thread_qos_t old_override;
4409
4410	#if DEBUG \|\| DEVELOPMENT
4411	thread_t self = current_thread();
4412	struct uthread *ut = get_bsdthread_info(self);
4413
4414	assert(thread == self);
4415	assert(ut->uu_kqr_bound == kqr);
4416	#endif // DEBUG \|\| DEVELOPMENT
4417
4418	old_override = kqworkq_unbind_locked(kqwq, kqr, thread);
4419	if (!TAILQ_EMPTY(queue)) {
4420	/*
4421	* Request a new thread if we didn't process the whole
4422	* queue.
4423	*/
4424	kqueue_threadreq_initiate(kq: &kqwq->kqwq_kqueue, kqr,
4425	qos: kqr->tr_kq_qos_index, flags: `0`);
4426	}
4427	if (old_override) {
4428	thread_drop_kevent_override(thread);
4429	}
4430	rc = -`1`;
4431	}
4432
4433	return rc;
4434	}
4435
4436	/*
4437	* Return 0 to indicate that processing should proceed,
4438	* -1 if there is nothing to process.
4439	*
4440	* Called with kqueue locked and returns the same way,
4441	* but may drop lock temporarily.
4442	*/
4443	static int
4444	kqworkq_begin_processing(struct kqworkq *kqwq, workq_threadreq_t kqr,
4445	int kevent_flags)
4446	{
4447	int rc = `0`;
4448
4449	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) \| DBG_FUNC_START,
4450	`0`, kqr->tr_kq_qos_index);
4451
4452	rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
4453	KQWQAE_BEGIN_PROCESSING);
4454
4455	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) \| DBG_FUNC_END,
4456	thread_tid(kqr_thread(kqr)),
4457	!TAILQ_EMPTY(&kqwq->kqwq_queue[kqr->tr_kq_qos_index - `1`]));
4458
4459	return rc;
4460	}
4461
4462	static thread_qos_t
4463	kqworkloop_acknowledge_events(struct kqworkloop *kqwl)
4464	{
4465	kq_index_t qos = THREAD_QOS_UNSPECIFIED;
4466	struct knote kn, tmp;
4467
4468	kqlock_held(kqu: kqwl);
4469
4470	TAILQ_FOREACH_SAFE(kn, &kqwl->kqwl_suppressed, kn_tqe, tmp) {
4471	/*
4472	* If a knote that can adjust QoS is disabled because of the automatic
4473	* behavior of EV_DISPATCH, the knotes should stay suppressed so that
4474	* further overrides keep pushing.
4475	*/
4476	if (knote_fops(kn)->f_adjusts_qos &&
4477	(kn->kn_status & KN_DISABLED) != `0` &&
4478	(kn->kn_status & KN_DROPPING) == `0` &&
4479	(kn->kn_flags & (EV_DISPATCH \| EV_DISABLE)) == EV_DISPATCH) {
4480	qos = MAX(qos, kn->kn_qos_override);
4481	continue;
4482	}
4483	knote_unsuppress(kqu: kqwl, kn);
4484	}
4485
4486	return qos;
4487	}
4488
4489	static int
4490	kqworkloop_begin_processing(struct kqworkloop kqwl, unsigned* int kevent_flags)
4491	{
4492	workq_threadreq_t kqr = &kqwl->kqwl_request;
4493	struct kqueue *kq = &kqwl->kqwl_kqueue;
4494	int rc = `0`, op = KQWL_UTQ_NONE;
4495
4496	kqlock_held(kqu: kq);
4497
4498	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) \| DBG_FUNC_START,
4499	kqwl->kqwl_dynamicid, `0`, `0`);
4500
4501	/ nobody else should still be processing /
4502	assert((kq->kq_state & KQ_PROCESSING) == `0`);
4503
4504	kq->kq_state \|= KQ_PROCESSING;
4505
4506	if (kevent_flags & KEVENT_FLAG_PARKING) {
4507	/*
4508	* When "parking" we want to process events and if no events are found
4509	* unbind.
4510	*
4511	* However, non overcommit threads sometimes park even when they have
4512	* more work so that the pool can narrow. For these, we need to unbind
4513	* early, so that calling kqworkloop_update_threads_qos() can ask the
4514	* workqueue subsystem whether the thread should park despite having
4515	* pending events.
4516	*/
4517	if (kqr->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
4518	op = KQWL_UTQ_PARKING;
4519	} else {
4520	op = KQWL_UTQ_UNBINDING;
4521	}
4522	} else if (!TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
4523	op = KQWL_UTQ_RESET_WAKEUP_OVERRIDE;
4524	}
4525
4526	if (op != KQWL_UTQ_NONE) {
4527	thread_qos_t qos_override;
4528	thread_t thread = kqr_thread_fast(kqr);
4529
4530	qos_override = kqworkloop_acknowledge_events(kqwl);
4531
4532	if (op == KQWL_UTQ_UNBINDING) {
4533	kqworkloop_unbind_locked(kwql: kqwl, thread,
4534	how: KQWL_OVERRIDE_DROP_IMMEDIATELY);
4535	kqworkloop_release_live(kqwl);
4536	}
4537	kqworkloop_update_threads_qos(kqwl, op, qos: qos_override);
4538	if (op == KQWL_UTQ_PARKING &&
4539	(!kqwl->kqwl_count \|\| kqwl->kqwl_owner)) {
4540	kqworkloop_unbind_locked(kwql: kqwl, thread,
4541	how: KQWL_OVERRIDE_DROP_DELAYED);
4542	kqworkloop_release_live(kqwl);
4543	rc = -`1`;
4544	} else if (op == KQWL_UTQ_UNBINDING &&
4545	kqr_thread(kqr) != thread) {
4546	rc = -`1`;
4547	}
4548
4549	if (rc == -`1`) {
4550	kq->kq_state &= ~KQ_PROCESSING;
4551	kqworkloop_unbind_delayed_override_drop(thread);
4552	}
4553	}
4554
4555	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) \| DBG_FUNC_END,
4556	kqwl->kqwl_dynamicid, `0`, `0`);
4557
4558	return rc;
4559	}
4560
4561	/*
4562	* Return 0 to indicate that processing should proceed,
4563	* -1 if there is nothing to process.
4564	* EBADF if the kqueue is draining
4565	*
4566	* Called with kqueue locked and returns the same way,
4567	* but may drop lock temporarily.
4568	* May block.
4569	*/
4570	static int
4571	kqfile_begin_processing(struct kqfile *kq)
4572	{
4573	kqlock_held(kqu: kq);
4574
4575	assert((kq->kqf_state & (KQ_WORKQ \| KQ_WORKLOOP)) == `0`);
4576	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) \| DBG_FUNC_START,
4577	VM_KERNEL_UNSLIDE_OR_PERM(kq), `0`);
4578
4579	/ wait to become the exclusive processing thread /
4580	while ((kq->kqf_state & (KQ_PROCESSING \| KQ_DRAIN)) == KQ_PROCESSING) {
4581	kq->kqf_state \|= KQ_PROCWAIT;
4582	lck_spin_sleep(lck: &kq->kqf_lock, lck_sleep_action: LCK_SLEEP_DEFAULT,
4583	event: &kq->kqf_suppressed, THREAD_UNINT \| THREAD_WAIT_NOREPORT);
4584	}
4585
4586	if (kq->kqf_state & KQ_DRAIN) {
4587	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) \| DBG_FUNC_END,
4588	VM_KERNEL_UNSLIDE_OR_PERM(kq), `2`);
4589	return EBADF;
4590	}
4591
4592	/ Nobody else processing /
4593
4594	/ anything left to process? /
4595	if (kq->kqf_count == `0`) {
4596	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) \| DBG_FUNC_END,
4597	VM_KERNEL_UNSLIDE_OR_PERM(kq), `1`);
4598	return -`1`;
4599	}
4600
4601	/ convert to processing mode /
4602	kq->kqf_state \|= KQ_PROCESSING;
4603
4604	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) \| DBG_FUNC_END,
4605	VM_KERNEL_UNSLIDE_OR_PERM(kq), `0`);
4606	return `0`;
4607	}
4608
4609	/*
4610	* Try to end the processing, only called when a workq thread is attempting to
4611	* park (KEVENT_FLAG_PARKING is set).
4612	*
4613	* When returning -1, the kqworkq is setup again so that it is ready to be
4614	* processed.
4615	*/
4616	static int
4617	kqworkq_end_processing(struct kqworkq *kqwq, workq_threadreq_t kqr,
4618	int kevent_flags)
4619	{
4620	if (kevent_flags & KEVENT_FLAG_PARKING) {
4621	/*
4622	* if acknowledge events "succeeds" it means there are events,
4623	* which is a failure condition for end_processing.
4624	*/
4625	int rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
4626	KQWQAE_END_PROCESSING);
4627	if (rc == `0`) {
4628	return -`1`;
4629	}
4630	}
4631
4632	return `0`;
4633	}
4634
4635	/*
4636	* Try to end the processing, only called when a workq thread is attempting to
4637	* park (KEVENT_FLAG_PARKING is set).
4638	*
4639	* When returning -1, the kqworkq is setup again so that it is ready to be
4640	* processed (as if kqworkloop_begin_processing had just been called).
4641	*
4642	* If successful and KEVENT_FLAG_PARKING was set in the kevent_flags,
4643	* the kqworkloop is unbound from its servicer as a side effect.
4644	*/
4645	static int
4646	kqworkloop_end_processing(struct kqworkloop kqwl, int* flags, int kevent_flags)
4647	{
4648	struct kqueue *kq = &kqwl->kqwl_kqueue;
4649	workq_threadreq_t kqr = &kqwl->kqwl_request;
4650	int rc = `0`;
4651
4652	kqlock_held(kqu: kq);
4653
4654	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) \| DBG_FUNC_START,
4655	kqwl->kqwl_dynamicid, `0`, `0`);
4656
4657	if (kevent_flags & KEVENT_FLAG_PARKING) {
4658	thread_t thread = kqr_thread_fast(kqr);
4659	thread_qos_t qos_override;
4660
4661	/*
4662	* When KEVENT_FLAG_PARKING is set, we need to attempt
4663	* an unbind while still under the lock.
4664	*
4665	* So we do everything kqworkloop_unbind() would do, but because
4666	* we're inside kqueue_process(), if the workloop actually
4667	* received events while our locks were dropped, we have
4668	* the opportunity to fail the end processing and loop again.
4669	*
4670	* This avoids going through the process-wide workqueue lock
4671	* hence scales better.
4672	*/
4673	assert(flags & KQ_PROCESSING);
4674	qos_override = kqworkloop_acknowledge_events(kqwl);
4675	kqworkloop_update_threads_qos(kqwl, op: KQWL_UTQ_PARKING, qos: qos_override);
4676
4677	if (kqwl->kqwl_wakeup_qos && !kqwl->kqwl_owner) {
4678	rc = -`1`;
4679	} else {
4680	kqworkloop_unbind_locked(kwql: kqwl, thread, how: KQWL_OVERRIDE_DROP_DELAYED);
4681	kqworkloop_release_live(kqwl);
4682	kq->kq_state &= ~flags;
4683	kqworkloop_unbind_delayed_override_drop(thread);
4684	}
4685	} else {
4686	kq->kq_state &= ~flags;
4687	kq->kq_state \|= KQ_R2K_ARMED;
4688	kqworkloop_update_threads_qos(kqwl, op: KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, qos: `0`);
4689	}
4690
4691	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) \| DBG_FUNC_END,
4692	kqwl->kqwl_dynamicid, `0`, `0`);
4693
4694	return rc;
4695	}
4696
4697	/*
4698	* Called with kqueue lock held.
4699	*
4700	* 0: no more events
4701	* -1: has more events
4702	* EBADF: kqueue is in draining mode
4703	*/
4704	static int
4705	kqfile_end_processing(struct kqfile *kq)
4706	{
4707	struct knote *kn;
4708	int procwait;
4709
4710	kqlock_held(kqu: kq);
4711
4712	assert((kq->kqf_state & (KQ_WORKQ \| KQ_WORKLOOP)) == `0`);
4713
4714	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_END),
4715	VM_KERNEL_UNSLIDE_OR_PERM(kq), `0`);
4716
4717	/*
4718	* Return suppressed knotes to their original state.
4719	*/
4720	while ((kn = TAILQ_FIRST(&kq->kqf_suppressed)) != NULL) {
4721	knote_unsuppress(kqu: kq, kn);
4722	}
4723
4724	procwait = (kq->kqf_state & KQ_PROCWAIT);
4725	kq->kqf_state &= ~(KQ_PROCESSING \| KQ_PROCWAIT);
4726
4727	if (procwait) {
4728	/ first wake up any thread already waiting to process /
4729	thread_wakeup(&kq->kqf_suppressed);
4730	}
4731
4732	if (kq->kqf_state & KQ_DRAIN) {
4733	return EBADF;
4734	}
4735	return kq->kqf_count != `0` ? -`1` : `0`;
4736	}
4737
4738	static int
4739	kqueue_workloop_ctl_internal(proc_t p, uintptr_t cmd, uint64_t __unused options,
4740	struct kqueue_workloop_params params, int* *retval)
4741	{
4742	int error = `0`;
4743	struct kqworkloop *kqwl;
4744	struct filedesc *fdp = &p->p_fd;
4745	workq_threadreq_param_t trp = { };
4746	#if CONFIG_PREADOPT_TG
4747	struct thread_group *trp_permanent_preadopt_tg = NULL;
4748	integer_t trp_preadopt_priority = `0`;
4749	integer_t trp_preadopt_policy = `0`;
4750	#endif /* CONFIG_PREADOPT_TG */
4751
4752	switch (cmd) {
4753	case KQ_WORKLOOP_CREATE:
4754	if (!params->kqwlp_flags) {
4755	error = EINVAL;
4756	break;
4757	}
4758
4759	if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) &&
4760	(params->kqwlp_sched_pri < `1` \|\|
4761	params->kqwlp_sched_pri > `63` / MAXPRI_USER /)) {
4762	error = EINVAL;
4763	break;
4764	}
4765
4766	if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) &&
4767	invalid_policy(params->kqwlp_sched_pol)) {
4768	error = EINVAL;
4769	break;
4770	}
4771
4772	if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) &&
4773	(params->kqwlp_cpu_percent <= `0` \|\|
4774	params->kqwlp_cpu_percent > `100` \|\|
4775	params->kqwlp_cpu_refillms <= `0` \|\|
4776	params->kqwlp_cpu_refillms > `0x00ffffff`)) {
4777	error = EINVAL;
4778	break;
4779	}
4780
4781	if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_WORK_INTERVAL) {
4782	#if CONFIG_PREADOPT_TG
4783	kern_return_t kr;
4784	kr = kern_work_interval_get_policy_from_port(port_name: params->kqwl_wi_port,
4785	policy: &trp_preadopt_policy,
4786	priority: &trp_preadopt_priority,
4787	tg: &trp_permanent_preadopt_tg);
4788	if (kr != KERN_SUCCESS) {
4789	error = EINVAL;
4790	break;
4791	}
4792	/ The work interval comes with scheduling policy. /
4793	if (trp_preadopt_policy) {
4794	trp.trp_flags \|= TRP_POLICY;
4795	trp.trp_pol = (uint8_t)trp_preadopt_policy;
4796
4797	trp.trp_flags \|= TRP_PRIORITY;
4798	trp.trp_pri = (uint8_t)trp_preadopt_priority;
4799	}
4800	/*
4801	* We take +1 ref on a thread group backing this work interval
4802	* via kern_work_interval_get_policy_from_port and pass it on to kqwl.
4803	* If, for whatever reasons, kqworkloop_get_or_create fails, we
4804	* get back this ref.
4805	*/
4806	#else
4807	error = ENOTSUP;
4808	break;
4809	#endif /* CONFIG_PREADOPT_TG */
4810	}
4811
4812	if (!(trp.trp_flags & (TRP_POLICY \| TRP_PRIORITY))) {
4813	/*
4814	* We always prefer scheduling policy + priority that comes with
4815	* a work interval. It it does not exist, we fallback to what the user
4816	* has asked.
4817	*/
4818	if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) {
4819	trp.trp_flags \|= TRP_PRIORITY;
4820	trp.trp_pri = (uint8_t)params->kqwlp_sched_pri;
4821	}
4822	if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) {
4823	trp.trp_flags \|= TRP_POLICY;
4824	trp.trp_pol = (uint8_t)params->kqwlp_sched_pol;
4825	}
4826	if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) {
4827	trp.trp_flags \|= TRP_CPUPERCENT;
4828	trp.trp_cpupercent = (uint8_t)params->kqwlp_cpu_percent;
4829	trp.trp_refillms = params->kqwlp_cpu_refillms;
4830	}
4831	}
4832
4833	error = kqworkloop_get_or_create(p, id: params->kqwlp_id, trp: &trp,
4834	#if CONFIG_PREADOPT_TG
4835	trp_permanent_preadopt_tg,
4836	#endif /* CONFIG_PREADOPT_TG */
4837	KEVENT_FLAG_DYNAMIC_KQUEUE \| KEVENT_FLAG_WORKLOOP \|
4838	KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST, kqwlp: &kqwl);
4839	if (error) {
4840	#if CONFIG_PREADOPT_TG
4841	/ In case of success, kqwl consumes this +1 ref. /
4842	if (trp_permanent_preadopt_tg) {
4843	thread_group_release(tg: trp_permanent_preadopt_tg);
4844	}
4845	#endif
4846	break;
4847	}
4848
4849	if (!fdt_flag_test(fdp, FD_WORKLOOP)) {
4850	/ FD_WORKLOOP indicates we've ever created a workloop*
4851	* via this syscall but its only ever added to a process, never
4852	* removed.
4853	*/
4854	proc_fdlock(p);
4855	fdt_flag_set(fdp, FD_WORKLOOP);
4856	proc_fdunlock(p);
4857	}
4858	break;
4859	case KQ_WORKLOOP_DESTROY:
4860	error = kqworkloop_get_or_create(p, id: params->kqwlp_id, NULL,
4861	#if CONFIG_PREADOPT_TG
4862	NULL,
4863	#endif /* CONFIG_PREADOPT_TG */
4864	KEVENT_FLAG_DYNAMIC_KQUEUE \| KEVENT_FLAG_WORKLOOP \|
4865	KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST, kqwlp: &kqwl);
4866	if (error) {
4867	break;
4868	}
4869	kqlock(kqu: kqwl);
4870	trp.trp_value = kqwl->kqwl_params;
4871	if (trp.trp_flags && !(trp.trp_flags & TRP_RELEASED)) {
4872	trp.trp_flags \|= TRP_RELEASED;
4873	kqwl->kqwl_params = trp.trp_value;
4874	kqworkloop_release_live(kqwl);
4875	} else {
4876	error = EINVAL;
4877	}
4878	kqunlock(kqu: kqwl);
4879	kqworkloop_release(kqwl);
4880	break;
4881	}
4882	*retval = `0`;
4883	return error;
4884	}
4885
4886	int
4887	kqueue_workloop_ctl(proc_t p, struct kqueue_workloop_ctl_args uap, int* *retval)
4888	{
4889	struct kqueue_workloop_params params = {
4890	.kqwlp_id = `0`,
4891	};
4892	if (uap->sz < sizeof(params.kqwlp_version)) {
4893	return EINVAL;
4894	}
4895
4896	size_t copyin_sz = MIN(sizeof(params), uap->sz);
4897	int rv = copyin(uap->addr, &params, copyin_sz);
4898	if (rv) {
4899	return rv;
4900	}
4901
4902	if (params.kqwlp_version != (int)uap->sz) {
4903	return EINVAL;
4904	}
4905
4906	return kqueue_workloop_ctl_internal(p, cmd: uap->cmd, options: uap->options, params: &params,
4907	retval);
4908	}
4909
4910	static int
4911	kqueue_select(struct fileproc fp, int* which, void *wql, __unused vfs_context_t ctx)
4912	{
4913	struct kqfile kq = (struct* kqfile *)fp_get_data(fp);
4914	int retnum = `0`;
4915
4916	assert((kq->kqf_state & (KQ_WORKLOOP \| KQ_WORKQ)) == `0`);
4917
4918	if (which == FREAD) {
4919	kqlock(kqu: kq);
4920	if (kqfile_begin_processing(kq) == `0`) {
4921	retnum = kq->kqf_count;
4922	kqfile_end_processing(kq);
4923	} else if ((kq->kqf_state & KQ_DRAIN) == `0`) {
4924	selrecord(selector: kq->kqf_p, &kq->kqf_sel, wql);
4925	}
4926	kqunlock(kqu: kq);
4927	}
4928	return retnum;
4929	}
4930
4931	/*
4932	* kqueue_close -
4933	*/
4934	static int
4935	kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx)
4936	{
4937	struct kqfile *kqf = fg_get_data(fg);
4938
4939	assert((kqf->kqf_state & (KQ_WORKLOOP \| KQ_WORKQ)) == `0`);
4940	kqlock(kqu: kqf);
4941	selthreadclear(&kqf->kqf_sel);
4942	kqunlock(kqu: kqf);
4943	kqueue_dealloc(kq: &kqf->kqf_kqueue);
4944	fg_set_data(fg, NULL);
4945	return `0`;
4946	}
4947
4948	/*
4949	* Max depth of the nested kq path that can be created.
4950	* Note that this has to be less than the size of kq_level
4951	* to avoid wrapping around and mislabeling the level. We also
4952	* want to be aggressive about this so that we don't overflow the
4953	* kernel stack while posting kevents
4954	*/
4955	#define MAX_NESTED_KQ 10
4956
4957	/*
4958	* The callers has taken a use-count reference on this kqueue and will donate it
4959	* to the kqueue we are being added to. This keeps the kqueue from closing until
4960	* that relationship is torn down.
4961	*/
4962	static int
4963	kqueue_kqfilter(struct fileproc fp, struct* knote *kn,
4964	__unused struct kevent_qos_s *kev)
4965	{
4966	struct kqfile kqf = (struct* kqfile *)fp_get_data(fp);
4967	struct kqueue *kq = &kqf->kqf_kqueue;
4968	struct kqueue *parentkq = knote_get_kq(kn);
4969
4970	assert((kqf->kqf_state & (KQ_WORKLOOP \| KQ_WORKQ)) == `0`);
4971
4972	if (parentkq == kq \|\| kn->kn_filter != EVFILT_READ) {
4973	knote_set_error(kn, EINVAL);
4974	return `0`;
4975	}
4976
4977	/*
4978	* We have to avoid creating a cycle when nesting kqueues
4979	* inside another. Rather than trying to walk the whole
4980	* potential DAG of nested kqueues, we just use a simple
4981	* ceiling protocol. When a kqueue is inserted into another,
4982	* we check that the (future) parent is not already nested
4983	* into another kqueue at a lower level than the potenial
4984	* child (because it could indicate a cycle). If that test
4985	* passes, we just mark the nesting levels accordingly.
4986	*
4987	* Only up to MAX_NESTED_KQ can be nested.
4988	*
4989	* Note: kqworkq and kqworkloop cannot be nested and have reused their
4990	* kq_level field, so ignore these as parent.
4991	*/
4992
4993	kqlock(kqu: parentkq);
4994
4995	if ((parentkq->kq_state & (KQ_WORKQ \| KQ_WORKLOOP)) == `0`) {
4996	if (parentkq->kq_level > `0` &&
4997	parentkq->kq_level < kq->kq_level) {
4998	kqunlock(kqu: parentkq);
4999	knote_set_error(kn, EINVAL);
5000	return `0`;
5001	}
5002
5003	/ set parent level appropriately /
5004	uint16_t plevel = (parentkq->kq_level == `0`)? `2`: parentkq->kq_level;
5005	if (plevel < kq->kq_level + `1`) {
5006	if (kq->kq_level + `1` > MAX_NESTED_KQ) {
5007	kqunlock(kqu: parentkq);
5008	knote_set_error(kn, EINVAL);
5009	return `0`;
5010	}
5011	plevel = kq->kq_level + `1`;
5012	}
5013
5014	parentkq->kq_level = plevel;
5015	}
5016
5017	kqunlock(kqu: parentkq);
5018
5019	kn->kn_filtid = EVFILTID_KQREAD;
5020	kqlock(kqu: kq);
5021	KNOTE_ATTACH(&kqf->kqf_sel.si_note, kn);
5022	/ indicate nesting in child, if needed /
5023	if (kq->kq_level == `0`) {
5024	kq->kq_level = `1`;
5025	}
5026
5027	int count = kq->kq_count;
5028	kqunlock(kqu: kq);
5029	return count > `0`;
5030	}
5031
5032	__attribute__((noinline))
5033	static void
5034	kqfile_wakeup(struct kqfile kqf, long* hint, wait_result_t wr)
5035	{
5036	/ wakeup a thread waiting on this queue /
5037	selwakeup(&kqf->kqf_sel);
5038
5039	/ wake up threads in kqueue_scan() /
5040	if (kqf->kqf_state & KQ_SLEEP) {
5041	kqf->kqf_state &= ~KQ_SLEEP;
5042	thread_wakeup_with_result(&kqf->kqf_count, wr);
5043	}
5044
5045	if (hint == NOTE_REVOKE) {
5046	/ wakeup threads waiting their turn to process /
5047	if (kqf->kqf_state & KQ_PROCWAIT) {
5048	assert(kqf->kqf_state & KQ_PROCESSING);
5049	kqf->kqf_state &= ~KQ_PROCWAIT;
5050	thread_wakeup(&kqf->kqf_suppressed);
5051	}
5052
5053	/ no need to KNOTE: knote_fdclose() takes care of it /
5054	} else {
5055	/ wakeup other kqueues/select sets we're inside /
5056	KNOTE(&kqf->kqf_sel.si_note, hint);
5057	}
5058	}
5059
5060	/*
5061	* kqueue_drain - called when kq is closed
5062	*/
5063	static int
5064	kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx)
5065	{
5066	struct kqfile kqf = (struct* kqfile *)fp_get_data(fp);
5067
5068	assert((kqf->kqf_state & (KQ_WORKLOOP \| KQ_WORKQ)) == `0`);
5069
5070	kqlock(kqu: kqf);
5071	kqf->kqf_state \|= KQ_DRAIN;
5072	kqfile_wakeup(kqf, NOTE_REVOKE, THREAD_RESTART);
5073	kqunlock(kqu: kqf);
5074	return `0`;
5075	}
5076
5077	int
5078	kqueue_stat(struct kqueue kq, void* ub, int* isstat64, proc_t p)
5079	{
5080	assert((kq->kq_state & (KQ_WORKLOOP \| KQ_WORKQ)) == `0`);
5081
5082	kqlock(kqu: kq);
5083	if (isstat64 != `0`) {
5084	struct stat64 sb64 = (struct* stat64 *)ub;
5085
5086	bzero(s: (void )sb64, n: sizeof(sb64));
5087	sb64->st_size = kq->kq_count;
5088	if (kq->kq_state & KQ_KEV_QOS) {
5089	sb64->st_blksize = sizeof(struct kevent_qos_s);
5090	} else if (kq->kq_state & KQ_KEV64) {
5091	sb64->st_blksize = sizeof(struct kevent64_s);
5092	} else if (IS_64BIT_PROCESS(p)) {
5093	sb64->st_blksize = sizeof(struct user64_kevent);
5094	} else {
5095	sb64->st_blksize = sizeof(struct user32_kevent);
5096	}
5097	sb64->st_mode = S_IFIFO;
5098	} else {
5099	struct stat sb = (struct* stat *)ub;
5100
5101	bzero(s: (void )sb, n: sizeof(sb));
5102	sb->st_size = kq->kq_count;
5103	if (kq->kq_state & KQ_KEV_QOS) {
5104	sb->st_blksize = sizeof(struct kevent_qos_s);
5105	} else if (kq->kq_state & KQ_KEV64) {
5106	sb->st_blksize = sizeof(struct kevent64_s);
5107	} else if (IS_64BIT_PROCESS(p)) {
5108	sb->st_blksize = sizeof(struct user64_kevent);
5109	} else {
5110	sb->st_blksize = sizeof(struct user32_kevent);
5111	}
5112	sb->st_mode = S_IFIFO;
5113	}
5114	kqunlock(kqu: kq);
5115	return `0`;
5116	}
5117
5118	static inline bool
5119	kqueue_threadreq_can_use_ast(struct kqueue *kq)
5120	{
5121	if (current_proc() == kq->kq_p) {
5122	/*
5123	* Setting an AST from a non BSD syscall is unsafe: mach_msg_trap() can
5124	* do combined send/receive and in the case of self-IPC, the AST may bet
5125	* set on a thread that will not return to userspace and needs the
5126	* thread the AST would create to unblock itself.
5127	*
5128	* At this time, we really want to target:
5129	*
5130	* - kevent variants that can cause thread creations, and dispatch
5131	* really only uses kevent_qos and kevent_id,
5132	*
5133	* - workq_kernreturn (directly about thread creations)
5134	*
5135	* - bsdthread_ctl which is used for qos changes and has direct impact
5136	* on the creator thread scheduling decisions.
5137	*/
5138	switch (current_uthread()->syscall_code) {
5139	case SYS_kevent_qos:
5140	case SYS_kevent_id:
5141	case SYS_workq_kernreturn:
5142	case SYS_bsdthread_ctl:
5143	return true;
5144	}
5145	}
5146	return false;
5147	}
5148
5149	/*
5150	* Interact with the pthread kext to request a servicing there at a specific QoS
5151	* level.
5152	*
5153	* - Caller holds the kqlock
5154	*
5155	* - May be called with the kqueue's wait queue set locked,
5156	* so cannot do anything that could recurse on that.
5157	*/
5158	static void
5159	kqueue_threadreq_initiate(kqueue_t kqu, workq_threadreq_t kqr,
5160	kq_index_t qos, int flags)
5161	{
5162	assert(kqr_thread(kqr) == THREAD_NULL);
5163	assert(!kqr_thread_requested(kqr));
5164	struct turnstile *ts = TURNSTILE_NULL;
5165
5166	if (workq_is_exiting(p: kqu.kq->kq_p)) {
5167	return;
5168	}
5169
5170	kqlock_held(kqu);
5171
5172	if (kqu.kq->kq_state & KQ_WORKLOOP) {
5173	struct kqworkloop *kqwl = kqu.kqwl;
5174
5175	assert(kqwl->kqwl_owner == THREAD_NULL);
5176	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THREQUEST),
5177	kqwl->kqwl_dynamicid, `0`, qos, kqwl->kqwl_wakeup_qos);
5178	ts = kqwl->kqwl_turnstile;
5179	/ Add a thread request reference on the kqueue. /
5180	kqworkloop_retain(kqwl);
5181
5182	#if CONFIG_PREADOPT_TG
5183	thread_group_qos_t kqwl_preadopt_tg = os_atomic_load(
5184	&kqwl->kqwl_preadopt_tg, relaxed);
5185	if (KQWL_HAS_PERMANENT_PREADOPTED_TG(kqwl_preadopt_tg)) {
5186	/*
5187	* This kqwl has been permanently configured with a thread group.
5188	* See kqworkloops with scheduling parameters.
5189	*/
5190	flags \|= WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG;
5191	} else {
5192	/*
5193	* This thread is the one which is ack-ing the thread group on the kqwl
5194	* under the kqlock and will take action accordingly, pairs with the
5195	* release barrier in kqueue_set_preadopted_thread_group
5196	*/
5197	uint16_t tg_acknowledged;
5198	if (os_atomic_cmpxchgv(&kqwl->kqwl_preadopt_tg_needs_redrive,
5199	KQWL_PREADOPT_TG_NEEDS_REDRIVE, KQWL_PREADOPT_TG_CLEAR_REDRIVE,
5200	&tg_acknowledged, acquire)) {
5201	flags \|= WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG;
5202	}
5203	}
5204	#endif
5205	} else {
5206	assert(kqu.kq->kq_state & KQ_WORKQ);
5207	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_THREQUEST), -`1`, `0`, qos,
5208	!TAILQ_EMPTY(&kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index - `1`]));
5209	}
5210
5211	/*
5212	* New-style thread request supported.
5213	* Provide the pthread kext a pointer to a workq_threadreq_s structure for
5214	* its use until a corresponding kqueue_threadreq_bind callback.
5215	*/
5216	if (kqueue_threadreq_can_use_ast(kq: kqu.kq)) {
5217	flags \|= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
5218	}
5219	if (qos == KQWQ_QOS_MANAGER) {
5220	qos = WORKQ_THREAD_QOS_MANAGER;
5221	}
5222
5223	if (!workq_kern_threadreq_initiate(p: kqu.kq->kq_p, kqr, ts, qos, flags)) {
5224	/*
5225	* Process is shutting down or exec'ing.
5226	* All the kqueues are going to be cleaned up
5227	* soon. Forget we even asked for a thread -
5228	* and make sure we don't ask for more.
5229	*/
5230	kqu.kq->kq_state &= ~KQ_R2K_ARMED;
5231	kqueue_release_live(kqu);
5232	}
5233	}
5234
5235	/*
5236	* kqueue_threadreq_bind_prepost - prepost the bind to kevent
5237	*
5238	* This is used when kqueue_threadreq_bind may cause a lock inversion.
5239	*/
5240	__attribute__((always_inline))
5241	void
5242	kqueue_threadreq_bind_prepost(struct proc *p __unused, workq_threadreq_t kqr,
5243	struct uthread *ut)
5244	{
5245	ut->uu_kqr_bound = kqr;
5246	kqr->tr_thread = get_machthread(ut);
5247	kqr->tr_state = WORKQ_TR_STATE_BINDING;
5248	}
5249
5250	/*
5251	* kqueue_threadreq_bind_commit - commit a bind prepost
5252	*
5253	* The workq code has to commit any binding prepost before the thread has
5254	* a chance to come back to userspace (and do kevent syscalls) or be aborted.
5255	*/
5256	void
5257	kqueue_threadreq_bind_commit(struct proc *p, thread_t thread)
5258	{
5259	struct uthread *ut = get_bsdthread_info(thread);
5260	workq_threadreq_t kqr = ut->uu_kqr_bound;
5261	kqueue_t kqu = kqr_kqueue(p, kqr);
5262
5263	kqlock(kqu);
5264	if (kqr->tr_state == WORKQ_TR_STATE_BINDING) {
5265	kqueue_threadreq_bind(p, req: kqr, thread, flags: `0`);
5266	}
5267	kqunlock(kqu);
5268	}
5269
5270	static void
5271	kqueue_threadreq_modify(kqueue_t kqu, workq_threadreq_t kqr, kq_index_t qos,
5272	workq_kern_threadreq_flags_t flags)
5273	{
5274	assert(kqr_thread_requested_pending(kqr));
5275
5276	kqlock_held(kqu);
5277
5278	if (kqueue_threadreq_can_use_ast(kq: kqu.kq)) {
5279	flags \|= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
5280	}
5281
5282	#if CONFIG_PREADOPT_TG
5283	if (kqu.kq->kq_state & KQ_WORKLOOP) {
5284	struct kqworkloop *kqwl = kqu.kqwl;
5285	thread_group_qos_t kqwl_preadopt_tg = os_atomic_load(
5286	&kqwl->kqwl_preadopt_tg, relaxed);
5287	if (KQWL_HAS_PERMANENT_PREADOPTED_TG(kqwl_preadopt_tg)) {
5288	/*
5289	* This kqwl has been permanently configured with a thread group.
5290	* See kqworkloops with scheduling parameters.
5291	*/
5292	flags \|= WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG;
5293	} else {
5294	uint16_t tg_ack_status;
5295	/*
5296	* This thread is the one which is ack-ing the thread group on the kqwl
5297	* under the kqlock and will take action accordingly, needs acquire
5298	* barrier.
5299	*/
5300	if (os_atomic_cmpxchgv(&kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_NEEDS_REDRIVE,
5301	KQWL_PREADOPT_TG_CLEAR_REDRIVE, &tg_ack_status, acquire)) {
5302	flags \|= WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG;
5303	}
5304	}
5305	}
5306	#endif
5307
5308	workq_kern_threadreq_modify(p: kqu.kq->kq_p, kqr, qos, flags);
5309	}
5310
5311	/*
5312	* kqueue_threadreq_bind - bind thread to processing kqrequest
5313	*
5314	* The provided thread will be responsible for delivering events
5315	* associated with the given kqrequest. Bind it and get ready for
5316	* the thread to eventually arrive.
5317	*/
5318	void
5319	kqueue_threadreq_bind(struct proc *p, workq_threadreq_t kqr, thread_t thread,
5320	unsigned int flags)
5321	{
5322	kqueue_t kqu = kqr_kqueue(p, kqr);
5323	struct uthread *ut = get_bsdthread_info(thread);
5324
5325	kqlock_held(kqu);
5326
5327	assert(ut->uu_kqueue_override == `0`);
5328
5329	if (kqr->tr_state == WORKQ_TR_STATE_BINDING) {
5330	assert(ut->uu_kqr_bound == kqr);
5331	assert(kqr->tr_thread == thread);
5332	} else {
5333	assert(kqr_thread_requested_pending(kqr));
5334	assert(kqr->tr_thread == THREAD_NULL);
5335	assert(ut->uu_kqr_bound == NULL);
5336	ut->uu_kqr_bound = kqr;
5337	kqr->tr_thread = thread;
5338	}
5339
5340	kqr->tr_state = WORKQ_TR_STATE_BOUND;
5341
5342	if (kqu.kq->kq_state & KQ_WORKLOOP) {
5343	struct turnstile *ts = kqu.kqwl->kqwl_turnstile;
5344
5345	if (__improbable(thread == kqu.kqwl->kqwl_owner)) {
5346	/*
5347	* <rdar://problem/38626999> shows that asserting here is not ok.
5348	*
5349	* This is not supposed to happen for correct use of the interface,
5350	* but it is sadly possible for userspace (with the help of memory
5351	* corruption, such as over-release of a dispatch queue) to make
5352	* the creator thread the "owner" of a workloop.
5353	*
5354	* Once that happens, and that creator thread picks up the same
5355	* workloop as a servicer, we trip this codepath. We need to fixup
5356	* the state to forget about this thread being the owner, as the
5357	* entire workloop state machine expects servicers to never be
5358	* owners and everything would basically go downhill from here.
5359	*/
5360	kqu.kqwl->kqwl_owner = THREAD_NULL;
5361	if (kqworkloop_override(kqwl: kqu.kqwl)) {
5362	thread_drop_kevent_override(thread);
5363	}
5364	}
5365
5366	if (ts && (flags & KQUEUE_THREADERQ_BIND_NO_INHERITOR_UPDATE) == `0`) {
5367	/*
5368	* Past this point, the interlock is the kq req lock again,
5369	* so we can fix the inheritor for good.
5370	*/
5371	filt_wlupdate_inheritor(kqwl: kqu.kqwl, ts, flags: TURNSTILE_IMMEDIATE_UPDATE);
5372	turnstile_update_inheritor_complete(turnstile: ts, flags: TURNSTILE_INTERLOCK_HELD);
5373	}
5374
5375	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_BIND), kqu.kqwl->kqwl_dynamicid,
5376	thread_tid(thread), kqr->tr_kq_qos_index,
5377	(kqr->tr_kq_override_index << `16`) \| kqwl->kqwl_wakeup_qos);
5378
5379	ut->uu_kqueue_override = kqr->tr_kq_override_index;
5380	if (kqr->tr_kq_override_index) {
5381	thread_add_servicer_override(thread, qos_override: kqr->tr_kq_override_index);
5382	}
5383
5384	#if CONFIG_PREADOPT_TG
5385	/ Remove reference from kqwl and mark it as bound with the SENTINEL /
5386	thread_group_qos_t old_tg;
5387	thread_group_qos_t new_tg;
5388	int ret = os_atomic_rmw_loop(kqr_preadopt_thread_group_addr(kqr), old_tg, new_tg, relaxed, {
5389	if ((old_tg == KQWL_PREADOPTED_TG_NEVER) \|\| KQWL_HAS_PERMANENT_PREADOPTED_TG(old_tg)) {
5390	/*
5391	* Either an app or a kqwl permanently configured with a thread group.
5392	* Nothing to do.
5393	*/
5394	os_atomic_rmw_loop_give_up(break);
5395	}
5396	assert(old_tg != KQWL_PREADOPTED_TG_PROCESSED);
5397	new_tg = KQWL_PREADOPTED_TG_SENTINEL;
5398	});
5399
5400	if (ret) {
5401	KQWL_PREADOPT_TG_HISTORY_WRITE_ENTRY(kqu.kqwl, KQWL_PREADOPT_OP_SERVICER_BIND, old_tg, new_tg);
5402
5403	if (KQWL_HAS_VALID_PREADOPTED_TG(old_tg)) {
5404	struct thread_group *tg = KQWL_GET_PREADOPTED_TG(old_tg);
5405	assert(tg != NULL);
5406
5407	thread_set_preadopt_thread_group(t: thread, tg);
5408	thread_group_release_live(tg); // The thread has a reference
5409	} else {
5410	/*
5411	* The thread may already have a preadopt thread group on it -
5412	* we need to make sure to clear that.
5413	*/
5414	thread_set_preadopt_thread_group(t: thread, NULL);
5415	}
5416
5417	/ We have taken action on the preadopted thread group set on the*
5418	* set on the kqwl, clear any redrive requests */
5419	os_atomic_store(&kqu.kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_CLEAR_REDRIVE, relaxed);
5420	} else {
5421	if (KQWL_HAS_PERMANENT_PREADOPTED_TG(old_tg)) {
5422	struct thread_group *tg = KQWL_GET_PREADOPTED_TG(old_tg);
5423	assert(tg != NULL);
5424	thread_set_preadopt_thread_group(t: thread, tg);
5425	/*
5426	* From this point on, kqwl and thread both have +1 ref on this tg.
5427	*/
5428	}
5429	}
5430	#endif
5431	kqueue_update_iotier_override(kqu);
5432	} else {
5433	assert(kqr->tr_kq_override_index == `0`);
5434
5435	#if CONFIG_PREADOPT_TG
5436	/*
5437	* The thread may have a preadopt thread group on it already because it
5438	* got tagged with it as a creator thread. So we need to make sure to
5439	* clear that since we don't have preadopt thread groups for non-kqwl
5440	* cases
5441	*/
5442	thread_set_preadopt_thread_group(t: thread, NULL);
5443	#endif
5444	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_BIND), -`1`,
5445	thread_tid(thread), kqr->tr_kq_qos_index,
5446	(kqr->tr_kq_override_index << `16`) \|
5447	!TAILQ_EMPTY(&kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index - `1`]));
5448	}
5449	}
5450
5451	/*
5452	* kqueue_threadreq_cancel - abort a pending thread request
5453	*
5454	* Called when exiting/exec'ing. Forget our pending request.
5455	*/
5456	void
5457	kqueue_threadreq_cancel(struct proc *p, workq_threadreq_t kqr)
5458	{
5459	kqueue_release(kqu: kqr_kqueue(p, kqr));
5460	}
5461
5462	workq_threadreq_param_t
5463	kqueue_threadreq_workloop_param(workq_threadreq_t kqr)
5464	{
5465	struct kqworkloop *kqwl;
5466	workq_threadreq_param_t trp;
5467
5468	assert(kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP);
5469	kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
5470	trp.trp_value = kqwl->kqwl_params;
5471	return trp;
5472	}
5473
5474	/*
5475	* kqueue_threadreq_unbind - unbind thread from processing kqueue
5476	*
5477	* End processing the per-QoS bucket of events and allow other threads
5478	* to be requested for future servicing.
5479	*
5480	* caller holds a reference on the kqueue.
5481	*/
5482	void
5483	kqueue_threadreq_unbind(struct proc *p, workq_threadreq_t kqr)
5484	{
5485	if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
5486	kqworkloop_unbind(kwql: kqr_kqworkloop(kqr));
5487	} else {
5488	kqworkq_unbind(p, kqr);
5489	}
5490	}
5491
5492	/*
5493	* If we aren't already busy processing events [for this QoS],
5494	* request workq thread support as appropriate.
5495	*
5496	* TBD - for now, we don't segregate out processing by QoS.
5497	*
5498	* - May be called with the kqueue's wait queue set locked,
5499	* so cannot do anything that could recurse on that.
5500	*/
5501	static void
5502	kqworkq_wakeup(struct kqworkq *kqwq, kq_index_t qos_index)
5503	{
5504	workq_threadreq_t kqr = kqworkq_get_request(kqwq, qos_index);
5505
5506	/ convert to thread qos value /
5507	assert(qos_index > `0` && qos_index <= KQWQ_NBUCKETS);
5508
5509	if (!kqr_thread_requested(kqr)) {
5510	kqueue_threadreq_initiate(kqu: &kqwq->kqwq_kqueue, kqr, qos: qos_index, flags: `0`);
5511	}
5512	}
5513
5514	/*
5515	* This represent the asynchronous QoS a given workloop contributes,
5516	* hence is the max of the current active knotes (override index)
5517	* and the workloop max qos (userspace async qos).
5518	*/
5519	static kq_index_t
5520	kqworkloop_override(struct kqworkloop *kqwl)
5521	{
5522	workq_threadreq_t kqr = &kqwl->kqwl_request;
5523	return MAX(kqr->tr_kq_qos_index, kqr->tr_kq_override_index);
5524	}
5525
5526	static inline void
5527	kqworkloop_request_fire_r2k_notification(struct kqworkloop *kqwl)
5528	{
5529	workq_threadreq_t kqr = &kqwl->kqwl_request;
5530
5531	kqlock_held(kqu: kqwl);
5532
5533	if (kqwl->kqwl_state & KQ_R2K_ARMED) {
5534	kqwl->kqwl_state &= ~KQ_R2K_ARMED;
5535	act_set_astkevent(thread: kqr_thread_fast(kqr), AST_KEVENT_RETURN_TO_KERNEL);
5536	}
5537	}
5538
5539	static void
5540	kqworkloop_update_threads_qos(struct kqworkloop kqwl, int* op, kq_index_t qos)
5541	{
5542	workq_threadreq_t kqr = &kqwl->kqwl_request;
5543	struct kqueue *kq = &kqwl->kqwl_kqueue;
5544	kq_index_t old_override = kqworkloop_override(kqwl);
5545
5546	kqlock_held(kqu: kqwl);
5547
5548	switch (op) {
5549	case KQWL_UTQ_UPDATE_WAKEUP_QOS:
5550	kqwl->kqwl_wakeup_qos = qos;
5551	kqworkloop_request_fire_r2k_notification(kqwl);
5552	goto recompute;
5553
5554	case KQWL_UTQ_RESET_WAKEUP_OVERRIDE:
5555	kqr->tr_kq_override_index = qos;
5556	goto recompute;
5557
5558	case KQWL_UTQ_PARKING:
5559	case KQWL_UTQ_UNBINDING:
5560	kqr->tr_kq_override_index = qos;
5561	OS_FALLTHROUGH;
5562
5563	case KQWL_UTQ_RECOMPUTE_WAKEUP_QOS:
5564	if (op == KQWL_UTQ_RECOMPUTE_WAKEUP_QOS) {
5565	assert(qos == THREAD_QOS_UNSPECIFIED);
5566	}
5567	if (TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
5568	kqr->tr_kq_override_index = THREAD_QOS_UNSPECIFIED;
5569	}
5570	kqwl->kqwl_wakeup_qos = `0`;
5571	for (kq_index_t i = KQWL_NBUCKETS; i > `0`; i--) {
5572	if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i - `1`])) {
5573	kqwl->kqwl_wakeup_qos = i;
5574	kqworkloop_request_fire_r2k_notification(kqwl);
5575	break;
5576	}
5577	}
5578	OS_FALLTHROUGH;
5579
5580	case KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE:
5581	recompute:
5582	/*
5583	* When modifying the wakeup QoS or the override QoS, we always need to
5584	* maintain our invariant that kqr_override_index is at least as large
5585	* as the highest QoS for which an event is fired.
5586	*
5587	* However this override index can be larger when there is an overriden
5588	* suppressed knote pushing on the kqueue.
5589	*/
5590	if (qos < kqwl->kqwl_wakeup_qos) {
5591	qos = kqwl->kqwl_wakeup_qos;
5592	}
5593	if (kqr->tr_kq_override_index < qos) {
5594	kqr->tr_kq_override_index = qos;
5595	}
5596	break;
5597
5598	case KQWL_UTQ_REDRIVE_EVENTS:
5599	break;
5600
5601	case KQWL_UTQ_SET_QOS_INDEX:
5602	kqr->tr_kq_qos_index = qos;
5603	break;
5604
5605	default:
5606	panic("unknown kqwl thread qos update operation: %d", op);
5607	}
5608
5609	thread_t kqwl_owner = kqwl->kqwl_owner;
5610	thread_t servicer = kqr_thread(kqr);
5611	boolean_t qos_changed = FALSE;
5612	kq_index_t new_override = kqworkloop_override(kqwl);
5613
5614	/*
5615	* Apply the diffs to the owner if applicable
5616	*/
5617	if (kqwl_owner) {
5618	#if 0
5619	/ JMM - need new trace hooks for owner overrides /
5620	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST),
5621	kqwl->kqwl_dynamicid, thread_tid(kqwl_owner), kqr->tr_kq_qos_index,
5622	(kqr->tr_kq_override_index << `16`) \| kqwl->kqwl_wakeup_qos);
5623	#endif
5624	if (new_override == old_override) {
5625	// nothing to do
5626	} else if (old_override == THREAD_QOS_UNSPECIFIED) {
5627	thread_add_kevent_override(thread: kqwl_owner, qos_override: new_override);
5628	} else if (new_override == THREAD_QOS_UNSPECIFIED) {
5629	thread_drop_kevent_override(thread: kqwl_owner);
5630	} else { / old_override != new_override /
5631	thread_update_kevent_override(thread: kqwl_owner, qos_override: new_override);
5632	}
5633	}
5634
5635	/*
5636	* apply the diffs to the servicer
5637	*/
5638
5639	if (!kqr_thread_requested(kqr)) {
5640	/*
5641	* No servicer, nor thread-request
5642	*
5643	* Make a new thread request, unless there is an owner (or the workloop
5644	* is suspended in userland) or if there is no asynchronous work in the
5645	* first place.
5646	*/
5647
5648	if (kqwl_owner == NULL && kqwl->kqwl_wakeup_qos) {
5649	int initiate_flags = `0`;
5650	if (op == KQWL_UTQ_UNBINDING) {
5651	initiate_flags = WORKQ_THREADREQ_ATTEMPT_REBIND;
5652	}
5653
5654	/ kqueue_threadreq_initiate handles the acknowledgement of the TG*
5655	* if needed */
5656	kqueue_threadreq_initiate(kqu: kq, kqr, qos: new_override, flags: initiate_flags);
5657	}
5658	} else if (servicer) {
5659	/*
5660	* Servicer in flight
5661	*
5662	* Just apply the diff to the servicer
5663	*/
5664
5665	#if CONFIG_PREADOPT_TG
5666	/ When there's a servicer for the kqwl already, then the servicer will*
5667	* adopt the thread group in the kqr, we don't need to poke the
5668	* workqueue subsystem to make different decisions due to the thread
5669	* group. Consider the current request ack-ed.
5670	*/
5671	os_atomic_store(&kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_CLEAR_REDRIVE, relaxed);
5672	#endif
5673
5674	struct uthread *ut = get_bsdthread_info(servicer);
5675	if (ut->uu_kqueue_override != new_override) {
5676	if (ut->uu_kqueue_override == THREAD_QOS_UNSPECIFIED) {
5677	thread_add_servicer_override(thread: servicer, qos_override: new_override);
5678	} else if (new_override == THREAD_QOS_UNSPECIFIED) {
5679	thread_drop_servicer_override(thread: servicer);
5680	} else { / ut->uu_kqueue_override != new_override /
5681	thread_update_servicer_override(thread: servicer, qos_override: new_override);
5682	}
5683	ut->uu_kqueue_override = new_override;
5684	qos_changed = TRUE;
5685	}
5686	} else if (new_override == THREAD_QOS_UNSPECIFIED) {
5687	/*
5688	* No events to deliver anymore.
5689	*
5690	* However canceling with turnstiles is challenging, so the fact that
5691	* the request isn't useful will be discovered by the servicer himself
5692	* later on.
5693	*/
5694	} else if (old_override != new_override) {
5695	/*
5696	* Request is in flight
5697	*
5698	* Apply the diff to the thread request.
5699	*/
5700	kqueue_threadreq_modify(kqu: kq, kqr, qos: new_override, flags: WORKQ_THREADREQ_NONE);
5701	qos_changed = TRUE;
5702	}
5703
5704	if (qos_changed) {
5705	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST), kqwl->kqwl_dynamicid,
5706	thread_tid(servicer), kqr->tr_kq_qos_index,
5707	(kqr->tr_kq_override_index << `16`) \| kqwl->kqwl_wakeup_qos);
5708	}
5709	}
5710
5711	static void
5712	kqworkloop_update_iotier_override(struct kqworkloop *kqwl)
5713	{
5714	workq_threadreq_t kqr = &kqwl->kqwl_request;
5715	thread_t servicer = kqr_thread(kqr);
5716	uint8_t iotier = os_atomic_load(&kqwl->kqwl_iotier_override, relaxed);
5717
5718	kqlock_held(kqu: kqwl);
5719
5720	if (servicer) {
5721	thread_update_servicer_iotier_override(thread: servicer, iotier_override: iotier);
5722	}
5723	}
5724
5725	static void
5726	kqworkloop_wakeup(struct kqworkloop *kqwl, kq_index_t qos)
5727	{
5728	if (qos <= kqwl->kqwl_wakeup_qos) {
5729	/*
5730	* Shortcut wakeups that really do nothing useful
5731	*/
5732	return;
5733	}
5734
5735	if ((kqwl->kqwl_state & KQ_PROCESSING) &&
5736	kqr_thread(kqr: &kqwl->kqwl_request) == current_thread()) {
5737	/*
5738	* kqworkloop_end_processing() will perform the required QoS
5739	* computations when it unsets the processing mode.
5740	*/
5741	return;
5742	}
5743
5744	kqworkloop_update_threads_qos(kqwl, op: KQWL_UTQ_UPDATE_WAKEUP_QOS, qos);
5745	}
5746
5747	static struct kqtailq *
5748	kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn)
5749	{
5750	if (kq.kq->kq_state & KQ_WORKLOOP) {
5751	return &kq.kqwl->kqwl_suppressed;
5752	} else if (kq.kq->kq_state & KQ_WORKQ) {
5753	return &kq.kqwq->kqwq_suppressed[kn->kn_qos_index - `1`];
5754	} else {
5755	return &kq.kqf->kqf_suppressed;
5756	}
5757	}
5758
5759	struct turnstile *
5760	kqueue_alloc_turnstile(kqueue_t kqu)
5761	{
5762	struct kqworkloop *kqwl = kqu.kqwl;
5763	kq_state_t kq_state;
5764
5765	kq_state = os_atomic_load(&kqu.kq->kq_state, dependency);
5766	if (kq_state & KQ_HAS_TURNSTILE) {
5767	/ force a dependency to pair with the atomic or with release below /
5768	return os_atomic_load_with_dependency_on(&kqwl->kqwl_turnstile,
5769	(uintptr_t)kq_state);
5770	}
5771
5772	if (!(kq_state & KQ_WORKLOOP)) {
5773	return TURNSTILE_NULL;
5774	}
5775
5776	struct turnstile ts = turnstile_alloc(), free_ts = TURNSTILE_NULL;
5777	bool workq_locked = false;
5778
5779	kqlock(kqu);
5780
5781	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
5782	workq_locked = true;
5783	workq_kern_threadreq_lock(p: kqwl->kqwl_p);
5784	}
5785
5786	if (kqwl->kqwl_state & KQ_HAS_TURNSTILE) {
5787	free_ts = ts;
5788	ts = kqwl->kqwl_turnstile;
5789	} else {
5790	ts = turnstile_prepare(proprietor: (uintptr_t)kqwl, tstore: &kqwl->kqwl_turnstile,
5791	turnstile: ts, type: TURNSTILE_WORKLOOPS);
5792
5793	/ release-barrier to pair with the unlocked load of kqwl_turnstile above /
5794	os_atomic_or(&kqwl->kqwl_state, KQ_HAS_TURNSTILE, release);
5795
5796	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
5797	workq_kern_threadreq_update_inheritor(p: kqwl->kqwl_p,
5798	kqr: &kqwl->kqwl_request, owner: kqwl->kqwl_owner,
5799	ts, flags: TURNSTILE_IMMEDIATE_UPDATE);
5800	/*
5801	* The workq may no longer be the interlock after this.
5802	* In which case the inheritor wasn't updated.
5803	*/
5804	}
5805	if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
5806	filt_wlupdate_inheritor(kqwl, ts, flags: TURNSTILE_IMMEDIATE_UPDATE);
5807	}
5808	}
5809
5810	if (workq_locked) {
5811	workq_kern_threadreq_unlock(p: kqwl->kqwl_p);
5812	}
5813
5814	kqunlock(kqu);
5815
5816	if (free_ts) {
5817	turnstile_deallocate(turnstile: free_ts);
5818	} else {
5819	turnstile_update_inheritor_complete(turnstile: ts, flags: TURNSTILE_INTERLOCK_NOT_HELD);
5820	}
5821	return ts;
5822	}
5823
5824	__attribute__((always_inline))
5825	struct turnstile *
5826	kqueue_turnstile(kqueue_t kqu)
5827	{
5828	kq_state_t kq_state = os_atomic_load(&kqu.kq->kq_state, relaxed);
5829	if (kq_state & KQ_WORKLOOP) {
5830	return os_atomic_load(&kqu.kqwl->kqwl_turnstile, relaxed);
5831	}
5832	return TURNSTILE_NULL;
5833	}
5834
5835	__attribute__((always_inline))
5836	struct turnstile *
5837	kqueue_threadreq_get_turnstile(workq_threadreq_t kqr)
5838	{
5839	struct kqworkloop *kqwl = kqr_kqworkloop(kqr);
5840	if (kqwl) {
5841	return os_atomic_load(&kqwl->kqwl_turnstile, relaxed);
5842	}
5843	return TURNSTILE_NULL;
5844	}
5845
5846	static void
5847	kqworkloop_set_overcommit(struct kqworkloop *kqwl)
5848	{
5849	workq_threadreq_t kqr = &kqwl->kqwl_request;
5850
5851	/*
5852	* This test is racy, but since we never remove this bit,
5853	* it allows us to avoid taking a lock.
5854	*/
5855	if (kqr->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
5856	return;
5857	}
5858
5859	kqlock_held(kqu: kqwl);
5860
5861	if (kqr_thread_requested_pending(kqr)) {
5862	kqueue_threadreq_modify(kqu: kqwl, kqr, qos: kqr->tr_qos,
5863	flags: WORKQ_THREADREQ_MAKE_OVERCOMMIT);
5864	} else {
5865	kqr->tr_flags \|= WORKQ_TR_FLAG_OVERCOMMIT;
5866	}
5867	}
5868
5869	static void
5870	kqworkq_update_override(struct kqworkq kqwq, struct* knote *kn,
5871	kq_index_t override_index)
5872	{
5873	workq_threadreq_t kqr;
5874	kq_index_t old_override_index;
5875	kq_index_t queue_index = kn->kn_qos_index;
5876
5877	if (override_index <= queue_index) {
5878	return;
5879	}
5880
5881	kqr = kqworkq_get_request(kqwq, qos_index: queue_index);
5882
5883	kqlock_held(kqu: kqwq);
5884
5885	old_override_index = kqr->tr_kq_override_index;
5886	if (override_index > MAX(kqr->tr_kq_qos_index, old_override_index)) {
5887	thread_t servicer = kqr_thread(kqr);
5888	kqr->tr_kq_override_index = override_index;
5889
5890	/ apply the override to [incoming?] servicing thread /
5891	if (servicer) {
5892	if (old_override_index) {
5893	thread_update_kevent_override(thread: servicer, qos_override: override_index);
5894	} else {
5895	thread_add_kevent_override(thread: servicer, qos_override: override_index);
5896	}
5897	}
5898	}
5899	}
5900
5901	static void
5902	kqueue_update_iotier_override(kqueue_t kqu)
5903	{
5904	if (kqu.kq->kq_state & KQ_WORKLOOP) {
5905	kqworkloop_update_iotier_override(kqwl: kqu.kqwl);
5906	}
5907	}
5908
5909	static void
5910	kqueue_update_override(kqueue_t kqu, struct knote *kn, thread_qos_t qos)
5911	{
5912	if (kqu.kq->kq_state & KQ_WORKLOOP) {
5913	kqworkloop_update_threads_qos(kqwl: kqu.kqwl, op: KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
5914	qos);
5915	} else {
5916	kqworkq_update_override(kqwq: kqu.kqwq, kn, override_index: qos);
5917	}
5918	}
5919
5920	static void
5921	kqworkloop_unbind_locked(struct kqworkloop *kqwl, thread_t thread,
5922	enum kqwl_unbind_locked_mode how)
5923	{
5924	struct uthread *ut = get_bsdthread_info(thread);
5925	workq_threadreq_t kqr = &kqwl->kqwl_request;
5926
5927	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_UNBIND), kqwl->kqwl_dynamicid,
5928	thread_tid(thread), `0`, `0`);
5929
5930	kqlock_held(kqu: kqwl);
5931
5932	assert(ut->uu_kqr_bound == kqr);
5933	ut->uu_kqr_bound = NULL;
5934	if (how == KQWL_OVERRIDE_DROP_IMMEDIATELY &&
5935	ut->uu_kqueue_override != THREAD_QOS_UNSPECIFIED) {
5936	thread_drop_servicer_override(thread);
5937	ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED;
5938	}
5939
5940	if (kqwl->kqwl_owner == NULL && kqwl->kqwl_turnstile) {
5941	turnstile_update_inheritor(turnstile: kqwl->kqwl_turnstile,
5942	TURNSTILE_INHERITOR_NULL, flags: TURNSTILE_IMMEDIATE_UPDATE);
5943	turnstile_update_inheritor_complete(turnstile: kqwl->kqwl_turnstile,
5944	flags: TURNSTILE_INTERLOCK_HELD);
5945	}
5946
5947	#if CONFIG_PREADOPT_TG
5948	/ The kqueue is able to adopt a thread group again /
5949
5950	thread_group_qos_t old_tg, new_tg = NULL;
5951	int ret = os_atomic_rmw_loop(kqr_preadopt_thread_group_addr(kqr), old_tg, new_tg, relaxed, {
5952	new_tg = old_tg;
5953	if (old_tg == KQWL_PREADOPTED_TG_SENTINEL \|\| old_tg == KQWL_PREADOPTED_TG_PROCESSED) {
5954	new_tg = KQWL_PREADOPTED_TG_NULL;
5955	}
5956	});
5957
5958	if (ret) {
5959	KQWL_PREADOPT_TG_HISTORY_WRITE_ENTRY(kqwl, KQWL_PREADOPT_OP_SERVICER_UNBIND, old_tg, KQWL_PREADOPTED_TG_NULL);
5960	// Servicer can drop any preadopt thread group it has since it has
5961	// unbound.
5962	thread_set_preadopt_thread_group(t: thread, NULL);
5963	}
5964	#endif
5965	thread_update_servicer_iotier_override(thread, THROTTLE_LEVEL_END);
5966
5967	kqr->tr_thread = THREAD_NULL;
5968	kqr->tr_state = WORKQ_TR_STATE_IDLE;
5969	kqwl->kqwl_state &= ~KQ_R2K_ARMED;
5970	}
5971
5972	static void
5973	kqworkloop_unbind_delayed_override_drop(thread_t thread)
5974	{
5975	struct uthread *ut = get_bsdthread_info(thread);
5976	assert(ut->uu_kqr_bound == NULL);
5977	if (ut->uu_kqueue_override != THREAD_QOS_UNSPECIFIED) {
5978	thread_drop_servicer_override(thread);
5979	ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED;
5980	}
5981	}
5982
5983	/*
5984	* kqworkloop_unbind - Unbind the servicer thread of a workloop kqueue
5985	*
5986	* It will acknowledge events, and possibly request a new thread if:
5987	* - there were active events left
5988	* - we pended waitq hook callouts during processing
5989	* - we pended wakeups while processing (or unsuppressing)
5990	*
5991	* Called with kqueue lock held.
5992	*/
5993	static void
5994	kqworkloop_unbind(struct kqworkloop *kqwl)
5995	{
5996	struct kqueue *kq = &kqwl->kqwl_kqueue;
5997	workq_threadreq_t kqr = &kqwl->kqwl_request;
5998	thread_t thread = kqr_thread_fast(kqr);
5999	int op = KQWL_UTQ_PARKING;
6000	kq_index_t qos_override = THREAD_QOS_UNSPECIFIED;
6001
6002	assert(thread == current_thread());
6003
6004	kqlock(kqu: kqwl);
6005
6006	/*
6007	* Forcing the KQ_PROCESSING flag allows for QoS updates because of
6008	* unsuppressing knotes not to be applied until the eventual call to
6009	* kqworkloop_update_threads_qos() below.
6010	*/
6011	assert((kq->kq_state & KQ_PROCESSING) == `0`);
6012	if (!TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
6013	kq->kq_state \|= KQ_PROCESSING;
6014	qos_override = kqworkloop_acknowledge_events(kqwl);
6015	kq->kq_state &= ~KQ_PROCESSING;
6016	}
6017
6018	kqworkloop_unbind_locked(kqwl, thread, how: KQWL_OVERRIDE_DROP_DELAYED);
6019	kqworkloop_update_threads_qos(kqwl, op, qos: qos_override);
6020
6021	kqunlock(kqu: kqwl);
6022
6023	/*
6024	* Drop the override on the current thread last, after the call to
6025	* kqworkloop_update_threads_qos above.
6026	*/
6027	kqworkloop_unbind_delayed_override_drop(thread);
6028
6029	/ If last reference, dealloc the workloop kq /
6030	kqworkloop_release(kqwl);
6031	}
6032
6033	static thread_qos_t
6034	kqworkq_unbind_locked(struct kqworkq *kqwq,
6035	workq_threadreq_t kqr, thread_t thread)
6036	{
6037	struct uthread *ut = get_bsdthread_info(thread);
6038	kq_index_t old_override = kqr->tr_kq_override_index;
6039
6040	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_UNBIND), -`1`,
6041	thread_tid(kqr_thread(kqr)), kqr->tr_kq_qos_index, `0`);
6042
6043	kqlock_held(kqu: kqwq);
6044
6045	assert(ut->uu_kqr_bound == kqr);
6046	ut->uu_kqr_bound = NULL;
6047	kqr->tr_thread = THREAD_NULL;
6048	kqr->tr_state = WORKQ_TR_STATE_IDLE;
6049	kqr->tr_kq_override_index = THREAD_QOS_UNSPECIFIED;
6050	kqwq->kqwq_state &= ~KQ_R2K_ARMED;
6051
6052	return old_override;
6053	}
6054
6055	/*
6056	* kqworkq_unbind - unbind of a workq kqueue from a thread
6057	*
6058	* We may have to request new threads.
6059	* This can happen there are no waiting processing threads and:
6060	* - there were active events we never got to (count > 0)
6061	* - we pended waitq hook callouts during processing
6062	* - we pended wakeups while processing (or unsuppressing)
6063	*/
6064	static void
6065	kqworkq_unbind(proc_t p, workq_threadreq_t kqr)
6066	{
6067	struct kqworkq kqwq = (struct* kqworkq *)p->p_fd.fd_wqkqueue;
6068	__assert_only int rc;
6069
6070	kqlock(kqu: kqwq);
6071	rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags: `0`, KQWQAE_UNBIND);
6072	assert(rc == -`1`);
6073	kqunlock(kqu: kqwq);
6074	}
6075
6076	workq_threadreq_t
6077	kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index)
6078	{
6079	assert(qos_index > `0` && qos_index <= KQWQ_NBUCKETS);
6080	return &kqwq->kqwq_request[qos_index - `1`];
6081	}
6082
6083	static void
6084	knote_reset_priority(kqueue_t kqu, struct knote *kn, pthread_priority_t pp)
6085	{
6086	kq_index_t qos = _pthread_priority_thread_qos(pp);
6087
6088	if (kqu.kq->kq_state & KQ_WORKLOOP) {
6089	assert((pp & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) == `0`);
6090	pp = _pthread_priority_normalize(pp);
6091	} else if (kqu.kq->kq_state & KQ_WORKQ) {
6092	if (qos == THREAD_QOS_UNSPECIFIED) {
6093	/ On workqueues, outside of QoS means MANAGER /
6094	qos = KQWQ_QOS_MANAGER;
6095	pp = _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
6096	} else {
6097	pp = _pthread_priority_normalize(pp);
6098	}
6099	} else {
6100	pp = _pthread_unspecified_priority();
6101	qos = THREAD_QOS_UNSPECIFIED;
6102	}
6103
6104	kn->kn_qos = (int32_t)pp;
6105
6106	if ((kn->kn_status & KN_MERGE_QOS) == `0` \|\| qos > kn->kn_qos_override) {
6107	/ Never lower QoS when in "Merge" mode /
6108	kn->kn_qos_override = qos;
6109	}
6110
6111	/ only adjust in-use qos index when not suppressed /
6112	if (kn->kn_status & KN_SUPPRESSED) {
6113	kqueue_update_override(kqu, kn, qos);
6114	} else if (kn->kn_qos_index != qos) {
6115	knote_dequeue(kqu, kn);
6116	kn->kn_qos_index = qos;
6117	}
6118	}
6119
6120	static void
6121	knote_adjust_qos(struct kqueue kq, struct* knote kn, int* result)
6122	{
6123	thread_qos_t qos_index = (result >> FILTER_ADJUST_EVENT_QOS_SHIFT) & `7`;
6124
6125	kqlock_held(kqu: kq);
6126
6127	assert(result & FILTER_ADJUST_EVENT_QOS_BIT);
6128	assert(qos_index < THREAD_QOS_LAST);
6129
6130	/*
6131	* Early exit for knotes that should not change QoS
6132	*/
6133	if (__improbable(!knote_fops(kn)->f_adjusts_qos)) {
6134	panic("filter %d cannot change QoS", kn->kn_filtid);
6135	} else if (__improbable(!knote_has_qos(kn))) {
6136	return;
6137	}
6138
6139	/*
6140	* knotes with the FALLBACK flag will only use their registration QoS if the
6141	* incoming event has no QoS, else, the registration QoS acts as a floor.
6142	*/
6143	thread_qos_t req_qos = _pthread_priority_thread_qos_fast(pp: kn->kn_qos);
6144	if (kn->kn_qos & _PTHREAD_PRIORITY_FALLBACK_FLAG) {
6145	if (qos_index == THREAD_QOS_UNSPECIFIED) {
6146	qos_index = req_qos;
6147	}
6148	} else {
6149	if (qos_index < req_qos) {
6150	qos_index = req_qos;
6151	}
6152	}
6153	if ((kn->kn_status & KN_MERGE_QOS) && (qos_index < kn->kn_qos_override)) {
6154	/ Never lower QoS when in "Merge" mode /
6155	return;
6156	}
6157
6158	if ((kn->kn_status & KN_LOCKED) && (kn->kn_status & KN_POSTING)) {
6159	/*
6160	* When we're trying to update the QoS override and that both an
6161	* f_event() and other f_* calls are running concurrently, any of these
6162	* in flight calls may want to perform overrides that aren't properly
6163	* serialized with each other.
6164	*
6165	* The first update that observes this racy situation enters a "Merge"
6166	* mode which causes subsequent override requests to saturate the
6167	* override instead of replacing its value.
6168	*
6169	* This mode is left when knote_unlock() or knote_post()
6170	* observe that no other f_* routine is in flight.
6171	*/
6172	kn->kn_status \|= KN_MERGE_QOS;
6173	}
6174
6175	/*
6176	* Now apply the override if it changed.
6177	*/
6178
6179	if (kn->kn_qos_override == qos_index) {
6180	return;
6181	}
6182
6183	kn->kn_qos_override = qos_index;
6184
6185	if (kn->kn_status & KN_SUPPRESSED) {
6186	/*
6187	* For suppressed events, the kn_qos_index field cannot be touched as it
6188	* allows us to know on which supress queue the knote is for a kqworkq.
6189	*
6190	* Also, there's no natural push applied on the kqueues when this field
6191	* changes anyway. We hence need to apply manual overrides in this case,
6192	* which will be cleared when the events are later acknowledged.
6193	*/
6194	kqueue_update_override(kqu: kq, kn, qos: qos_index);
6195	} else if (kn->kn_qos_index != qos_index) {
6196	knote_dequeue(kqu: kq, kn);
6197	kn->kn_qos_index = qos_index;
6198	}
6199	}
6200
6201	void
6202	klist_init(struct klist *list)
6203	{
6204	SLIST_INIT(list);
6205	}
6206
6207
6208	/*
6209	* Query/Post each knote in the object's list
6210	*
6211	* The object lock protects the list. It is assumed that the filter/event
6212	* routine for the object can determine that the object is already locked (via
6213	* the hint) and not deadlock itself.
6214	*
6215	* Autodetach is a specific contract which will detach all knotes from the
6216	* object prior to posting the final event for that knote. This is done while
6217	* under the object lock. A breadcrumb is left in the knote's next pointer to
6218	* indicate to future calls to f_detach routines that they need not reattempt
6219	* to knote_detach from the object's klist again. This is currently used by
6220	* EVFILTID_SPEC, EVFILTID_TTY, EVFILTID_PTMX
6221	*
6222	*/
6223	void
6224	knote(struct klist list, long* hint, bool autodetach)
6225	{
6226	struct knote *kn;
6227	struct knote *tmp_kn;
6228	SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmp_kn) {
6229	/*
6230	* We can modify the knote's next pointer since since we are holding the
6231	* object lock and the list can't be concurrently modified. Anyone
6232	* determining auto-detached-ness of a knote should take the primitive lock
6233	* to synchronize.
6234	*
6235	* Note that we do this here instead of the filter's f_event since we may
6236	* not even post the event if the knote is being dropped.
6237	*/
6238	if (autodetach) {
6239	kn->kn_selnext.sle_next = KNOTE_AUTODETACHED;
6240	}
6241	knote_post(kn, hint);
6242	}
6243
6244	/ Blast away the entire klist /
6245	if (autodetach) {
6246	klist_init(list);
6247	}
6248	}
6249
6250	/*
6251	* attach a knote to the specified list. Return true if this is the first entry.
6252	* The list is protected by whatever lock the object it is associated with uses.
6253	*/
6254	int
6255	knote_attach(struct klist list, struct* knote *kn)
6256	{
6257	int ret = SLIST_EMPTY(list);
6258	SLIST_INSERT_HEAD(list, kn, kn_selnext);
6259	return ret;
6260	}
6261
6262	/*
6263	* detach a knote from the specified list. Return true if that was the last
6264	* entry. The list is protected by whatever lock the object it is associated
6265	* with uses.
6266	*/
6267	int
6268	knote_detach(struct klist list, struct* knote *kn)
6269	{
6270	assert(!KNOTE_IS_AUTODETACHED(kn));
6271
6272	SLIST_REMOVE(list, kn, knote, kn_selnext);
6273	return SLIST_EMPTY(list);
6274	}
6275
6276	/*
6277	* knote_vanish - Indicate that the source has vanished
6278	*
6279	* Used only for vanishing ports - vanishing fds go
6280	* through knote_fdclose()
6281	*
6282	* If the knote has requested EV_VANISHED delivery,
6283	* arrange for that. Otherwise, deliver a NOTE_REVOKE
6284	* event for backward compatibility.
6285	*
6286	* The knote is marked as having vanished. The source's
6287	* reference to the knote is dropped by caller, but the knote's
6288	* source reference is only cleaned up later when the knote is dropped.
6289	*
6290	* Our caller already has the object lock held. Calling
6291	* the detach routine would try to take that lock
6292	* recursively - which likely is not supported.
6293	*/
6294	void
6295	knote_vanish(struct klist *list, bool make_active)
6296	{
6297	struct knote *kn;
6298	struct knote *kn_next;
6299
6300	SLIST_FOREACH_SAFE(kn, list, kn_selnext, kn_next) {
6301	struct kqueue *kq = knote_get_kq(kn);
6302
6303	kqlock(kqu: kq);
6304	if (__probable(kn->kn_status & KN_REQVANISH)) {
6305	/*
6306	* If EV_VANISH supported - prepare to deliver one
6307	*/
6308	kn->kn_status \|= KN_VANISHED;
6309	} else {
6310	/*
6311	* Handle the legacy way to indicate that the port/portset was
6312	* deallocated or left the current Mach portspace (modern technique
6313	* is with an EV_VANISHED protocol).
6314	*
6315	* Deliver an EV_EOF event for these changes (hopefully it will get
6316	* delivered before the port name recycles to the same generation
6317	* count and someone tries to re-register a kevent for it or the
6318	* events are udata-specific - avoiding a conflict).
6319	*/
6320	kn->kn_flags \|= EV_EOF \| EV_ONESHOT;
6321	}
6322	if (make_active) {
6323	knote_activate(kqu: kq, kn, FILTER_ACTIVE);
6324	}
6325	kqunlock(kqu: kq);
6326	}
6327	}
6328
6329	/*
6330	* remove all knotes referencing a specified fd
6331	*
6332	* Entered with the proc_fd lock already held.
6333	* It returns the same way, but may drop it temporarily.
6334	*/
6335	void
6336	knote_fdclose(struct proc p, int* fd)
6337	{
6338	struct filedesc *fdt = &p->p_fd;
6339	struct klist *list;
6340	struct knote *kn;
6341	KNOTE_LOCK_CTX(knlc);
6342
6343	restart:
6344	list = &fdt->fd_knlist[fd];
6345	SLIST_FOREACH(kn, list, kn_link) {
6346	struct kqueue *kq = knote_get_kq(kn);
6347
6348	kqlock(kqu: kq);
6349
6350	if (kq->kq_p != p) {
6351	panic("%s: proc mismatch (kq->kq_p=%p != p=%p)",
6352	__func__, kq->kq_p, p);
6353	}
6354
6355	/*
6356	* If the knote supports EV_VANISHED delivery,
6357	* transition it to vanished mode (or skip over
6358	* it if already vanished).
6359	*/
6360	if (kn->kn_status & KN_VANISHED) {
6361	kqunlock(kqu: kq);
6362	continue;
6363	}
6364
6365	proc_fdunlock(p);
6366	if (!knote_lock(kqu: kq, kn, knlc: &knlc, kqlocking: KNOTE_KQ_LOCK_ON_SUCCESS)) {
6367	/ the knote was dropped by someone, nothing to do /
6368	} else if (kn->kn_status & KN_REQVANISH) {
6369	/*
6370	* Since we have REQVANISH for this knote, we need to notify clients about
6371	* the EV_VANISHED.
6372	*
6373	* But unlike mach ports, we want to do the detach here as well and not
6374	* defer it so that we can release the iocount that is on the knote and
6375	* close the fp.
6376	*/
6377	kn->kn_status \|= KN_VANISHED;
6378
6379	/*
6380	* There may be a concurrent post happening, make sure to wait for it
6381	* before we detach. knote_wait_for_post() unlocks on kq on exit
6382	*/
6383	knote_wait_for_post(kq, kn);
6384
6385	knote_fops(kn)->f_detach(kn);
6386	if (kn->kn_is_fd) {
6387	fp_drop(p, fd: (int)kn->kn_id, fp: kn->kn_fp, locked: `0`);
6388	}
6389	kn->kn_filtid = EVFILTID_DETACHED;
6390	kqlock(kqu: kq);
6391
6392	knote_activate(kqu: kq, kn, FILTER_ACTIVE);
6393	knote_unlock(kqu: kq, kn, knlc: &knlc, kqlocking: KNOTE_KQ_UNLOCK);
6394	} else {
6395	knote_drop(kqu: kq, kn, knlc: &knlc);
6396	}
6397
6398	proc_fdlock(p);
6399	goto restart;
6400	}
6401	}
6402
6403	/*
6404	* knote_fdfind - lookup a knote in the fd table for process
6405	*
6406	* If the filter is file-based, lookup based on fd index.
6407	* Otherwise use a hash based on the ident.
6408	*
6409	* Matching is based on kq, filter, and ident. Optionally,
6410	* it may also be based on the udata field in the kevent -
6411	* allowing multiple event registration for the file object
6412	* per kqueue.
6413	*
6414	* fd_knhashlock or fdlock held on entry (and exit)
6415	*/
6416	static struct knote *
6417	knote_fdfind(struct kqueue *kq,
6418	const struct kevent_internal_s *kev,
6419	bool is_fd,
6420	struct proc *p)
6421	{
6422	struct filedesc *fdp = &p->p_fd;
6423	struct klist *list = NULL;
6424	struct knote *kn = NULL;
6425
6426	/*
6427	* determine where to look for the knote
6428	*/
6429	if (is_fd) {
6430	/ fd-based knotes are linked off the fd table /
6431	if (kev->kei_ident < (u_int)fdp->fd_knlistsize) {
6432	list = &fdp->fd_knlist[kev->kei_ident];
6433	}
6434	} else if (fdp->fd_knhashmask != `0`) {
6435	/ hash non-fd knotes here too /
6436	list = &fdp->fd_knhash[KN_HASH((u_long)kev->kei_ident, fdp->fd_knhashmask)];
6437	}
6438
6439	/*
6440	* scan the selected list looking for a match
6441	*/
6442	if (list != NULL) {
6443	SLIST_FOREACH(kn, list, kn_link) {
6444	if (kq == knote_get_kq(kn) &&
6445	kev->kei_ident == kn->kn_id &&
6446	kev->kei_filter == kn->kn_filter) {
6447	if (kev->kei_flags & EV_UDATA_SPECIFIC) {
6448	if ((kn->kn_flags & EV_UDATA_SPECIFIC) &&
6449	kev->kei_udata == kn->kn_udata) {
6450	break; / matching udata-specific knote /
6451	}
6452	} else if ((kn->kn_flags & EV_UDATA_SPECIFIC) == `0`) {
6453	break; / matching non-udata-specific knote /
6454	}
6455	}
6456	}
6457	}
6458	return kn;
6459	}
6460
6461	/*
6462	* kq_add_knote- Add knote to the fd table for process
6463	* while checking for duplicates.
6464	*
6465	* All file-based filters associate a list of knotes by file
6466	* descriptor index. All other filters hash the knote by ident.
6467	*
6468	* May have to grow the table of knote lists to cover the
6469	* file descriptor index presented.
6470	*
6471	* fd_knhashlock and fdlock unheld on entry (and exit).
6472	*
6473	* Takes a rwlock boost if inserting the knote is successful.
6474	*/
6475	static int
6476	kq_add_knote(struct kqueue kq, struct* knote kn, struct* knote_lock_ctx *knlc,
6477	struct proc *p)
6478	{
6479	struct filedesc *fdp = &p->p_fd;
6480	struct klist *list = NULL;
6481	int ret = `0`;
6482	bool is_fd = kn->kn_is_fd;
6483
6484	if (is_fd) {
6485	proc_fdlock(p);
6486	} else {
6487	knhash_lock(fdp);
6488	}
6489
6490	if (knote_fdfind(kq, kev: &kn->kn_kevent, is_fd, p) != NULL) {
6491	/ found an existing knote: we can't add this one /
6492	ret = ERESTART;
6493	goto out_locked;
6494	}
6495
6496	/ knote was not found: add it now /
6497	if (!is_fd) {
6498	if (fdp->fd_knhashmask == `0`) {
6499	u_long size = `0`;
6500
6501	list = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE, hashmask: &size);
6502	if (list == NULL) {
6503	ret = ENOMEM;
6504	goto out_locked;
6505	}
6506
6507	fdp->fd_knhash = list;
6508	fdp->fd_knhashmask = size;
6509	}
6510
6511	list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
6512	SLIST_INSERT_HEAD(list, kn, kn_link);
6513	ret = `0`;
6514	goto out_locked;
6515	} else {
6516	/ knote is fd based /
6517
6518	if ((u_int)fdp->fd_knlistsize <= kn->kn_id) {
6519	u_int size = `0`;
6520
6521	/ Make sure that fd stays below current process's soft limit AND system allowed per-process limits /
6522	if (kn->kn_id >= (uint64_t)proc_limitgetcur_nofile(p)) {
6523	ret = EINVAL;
6524	goto out_locked;
6525	}
6526	/ have to grow the fd_knlist /
6527	size = fdp->fd_knlistsize;
6528	while (size <= kn->kn_id) {
6529	size += KQEXTENT;
6530	}
6531
6532	if (size >= (UINT_MAX / sizeof(struct klist))) {
6533	ret = EINVAL;
6534	goto out_locked;
6535	}
6536
6537	list = kalloc_type(struct klist, size, Z_WAITOK \| Z_ZERO);
6538	if (list == NULL) {
6539	ret = ENOMEM;
6540	goto out_locked;
6541	}
6542
6543	bcopy(src: fdp->fd_knlist, dst: list,
6544	n: fdp->fd_knlistsize * sizeof(struct klist));
6545	kfree_type(struct klist, fdp->fd_knlistsize, fdp->fd_knlist);
6546	fdp->fd_knlist = list;
6547	fdp->fd_knlistsize = size;
6548	}
6549
6550	list = &fdp->fd_knlist[kn->kn_id];
6551	SLIST_INSERT_HEAD(list, kn, kn_link);
6552	ret = `0`;
6553	goto out_locked;
6554	}
6555
6556	out_locked:
6557	if (ret == `0`) {
6558	kqlock(kqu: kq);
6559	assert((kn->kn_status & KN_LOCKED) == `0`);
6560	(void)knote_lock(kqu: kq, kn, knlc, kqlocking: KNOTE_KQ_UNLOCK);
6561	kqueue_retain(kqu: kq); / retain a kq ref /
6562	}
6563	if (is_fd) {
6564	proc_fdunlock(p);
6565	} else {
6566	knhash_unlock(fdp);
6567	}
6568
6569	return ret;
6570	}
6571
6572	/*
6573	* kq_remove_knote - remove a knote from the fd table for process
6574	*
6575	* If the filter is file-based, remove based on fd index.
6576	* Otherwise remove from the hash based on the ident.
6577	*
6578	* fd_knhashlock and fdlock unheld on entry (and exit).
6579	*/
6580	static void
6581	kq_remove_knote(struct kqueue kq, struct* knote kn, struct* proc *p,
6582	struct knote_lock_ctx *knlc)
6583	{
6584	struct filedesc *fdp = &p->p_fd;
6585	struct klist *list = NULL;
6586	uint16_t kq_state;
6587	bool is_fd = kn->kn_is_fd;
6588
6589	if (is_fd) {
6590	proc_fdlock(p);
6591	} else {
6592	knhash_lock(fdp);
6593	}
6594
6595	if (is_fd) {
6596	assert((u_int)fdp->fd_knlistsize > kn->kn_id);
6597	list = &fdp->fd_knlist[kn->kn_id];
6598	} else {
6599	list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
6600	}
6601	SLIST_REMOVE(list, kn, knote, kn_link);
6602
6603	kqlock(kqu: kq);
6604
6605	/ Update the servicer iotier override /
6606	kqueue_update_iotier_override(kqu: kq);
6607
6608	kq_state = kq->kq_state;
6609	if (knlc) {
6610	knote_unlock_cancel(kq, kn, knlc);
6611	} else {
6612	kqunlock(kqu: kq);
6613	}
6614	if (is_fd) {
6615	proc_fdunlock(p);
6616	} else {
6617	knhash_unlock(fdp);
6618	}
6619
6620	if (kq_state & KQ_DYNAMIC) {
6621	kqworkloop_release(kqwl: (struct kqworkloop *)kq);
6622	}
6623	}
6624
6625	/*
6626	* kq_find_knote_and_kq_lock - lookup a knote in the fd table for process
6627	* and, if the knote is found, acquires the kqlock while holding the fd table lock/spinlock.
6628	*
6629	* fd_knhashlock or fdlock unheld on entry (and exit)
6630	*/
6631
6632	static struct knote *
6633	kq_find_knote_and_kq_lock(struct kqueue kq, struct* kevent_qos_s *kev,
6634	bool is_fd, struct proc *p)
6635	{
6636	struct filedesc *fdp = &p->p_fd;
6637	struct knote *kn;
6638
6639	if (is_fd) {
6640	proc_fdlock(p);
6641	} else {
6642	knhash_lock(fdp);
6643	}
6644
6645	/*
6646	* Temporary horrible hack:
6647	* this cast is gross and will go away in a future change.
6648	* It is OK to do because we don't look at xflags/s_fflags,
6649	* and that when we cast down the kev this way,
6650	* the truncated filter field works.
6651	*/
6652	kn = knote_fdfind(kq, kev: (struct kevent_internal_s *)kev, is_fd, p);
6653
6654	if (kn) {
6655	kqlock(kqu: kq);
6656	assert(knote_get_kq(kn) == kq);
6657	}
6658
6659	if (is_fd) {
6660	proc_fdunlock(p);
6661	} else {
6662	knhash_unlock(fdp);
6663	}
6664
6665	return kn;
6666	}
6667
6668	static struct kqtailq *
6669	knote_get_tailq(kqueue_t kqu, struct knote *kn)
6670	{
6671	kq_index_t qos_index = kn->kn_qos_index;
6672
6673	if (kqu.kq->kq_state & KQ_WORKLOOP) {
6674	assert(qos_index > `0` && qos_index <= KQWL_NBUCKETS);
6675	return &kqu.kqwl->kqwl_queue[qos_index - `1`];
6676	} else if (kqu.kq->kq_state & KQ_WORKQ) {
6677	assert(qos_index > `0` && qos_index <= KQWQ_NBUCKETS);
6678	return &kqu.kqwq->kqwq_queue[qos_index - `1`];
6679	} else {
6680	assert(qos_index == QOS_INDEX_KQFILE);
6681	return &kqu.kqf->kqf_queue;
6682	}
6683	}
6684
6685	static void
6686	knote_enqueue(kqueue_t kqu, struct knote *kn)
6687	{
6688	kqlock_held(kqu);
6689
6690	if ((kn->kn_status & KN_ACTIVE) == `0`) {
6691	return;
6692	}
6693
6694	if (kn->kn_status & (KN_DISABLED \| KN_SUPPRESSED \| KN_DROPPING \| KN_QUEUED)) {
6695	return;
6696	}
6697
6698	struct kqtailq *queue = knote_get_tailq(kqu, kn);
6699	bool wakeup = TAILQ_EMPTY(queue);
6700
6701	TAILQ_INSERT_TAIL(queue, kn, kn_tqe);
6702	kn->kn_status \|= KN_QUEUED;
6703	kqu.kq->kq_count++;
6704
6705	if (wakeup) {
6706	if (kqu.kq->kq_state & KQ_WORKLOOP) {
6707	kqworkloop_wakeup(kqwl: kqu.kqwl, qos: kn->kn_qos_index);
6708	} else if (kqu.kq->kq_state & KQ_WORKQ) {
6709	kqworkq_wakeup(kqwq: kqu.kqwq, qos_index: kn->kn_qos_index);
6710	} else {
6711	kqfile_wakeup(kqf: kqu.kqf, hint: `0`, THREAD_AWAKENED);
6712	}
6713	}
6714	}
6715
6716	__attribute__((always_inline))
6717	static inline void
6718	knote_dequeue(kqueue_t kqu, struct knote *kn)
6719	{
6720	if (kn->kn_status & KN_QUEUED) {
6721	struct kqtailq *queue = knote_get_tailq(kqu, kn);
6722
6723	// attaching the knote calls knote_reset_priority() without
6724	// the kqlock which is fine, so we can't call kqlock_held()
6725	// if we're not queued.
6726	kqlock_held(kqu);
6727
6728	TAILQ_REMOVE(queue, kn, kn_tqe);
6729	kn->kn_status &= ~KN_QUEUED;
6730	kqu.kq->kq_count--;
6731	if ((kqu.kq->kq_state & (KQ_WORKQ \| KQ_WORKLOOP)) == `0`) {
6732	assert((kqu.kq->kq_count == `0`) ==
6733	(bool)TAILQ_EMPTY(queue));
6734	}
6735	}
6736	}
6737
6738	/ called with kqueue lock held /
6739	static void
6740	knote_suppress(kqueue_t kqu, struct knote *kn)
6741	{
6742	struct kqtailq *suppressq;
6743
6744	kqlock_held(kqu);
6745
6746	assert((kn->kn_status & KN_SUPPRESSED) == `0`);
6747	assert(kn->kn_status & KN_QUEUED);
6748
6749	knote_dequeue(kqu, kn);
6750	/ deactivate - so new activations indicate a wakeup /
6751	kn->kn_status &= ~KN_ACTIVE;
6752	kn->kn_status \|= KN_SUPPRESSED;
6753	suppressq = kqueue_get_suppressed_queue(kq: kqu, kn);
6754	TAILQ_INSERT_TAIL(suppressq, kn, kn_tqe);
6755	}
6756
6757	__attribute__((always_inline))
6758	static inline void
6759	knote_unsuppress_noqueue(kqueue_t kqu, struct knote *kn)
6760	{
6761	struct kqtailq *suppressq;
6762
6763	kqlock_held(kqu);
6764
6765	assert(kn->kn_status & KN_SUPPRESSED);
6766
6767	kn->kn_status &= ~KN_SUPPRESSED;
6768	suppressq = kqueue_get_suppressed_queue(kq: kqu, kn);
6769	TAILQ_REMOVE(suppressq, kn, kn_tqe);
6770
6771	/*
6772	* If the knote is no longer active, reset its push,
6773	* and resynchronize kn_qos_index with kn_qos_override
6774	* for knotes with a real qos.
6775	*/
6776	if ((kn->kn_status & KN_ACTIVE) == `0` && knote_has_qos(kn)) {
6777	kn->kn_qos_override = _pthread_priority_thread_qos_fast(pp: kn->kn_qos);
6778	}
6779	kn->kn_qos_index = kn->kn_qos_override;
6780	}
6781
6782	/ called with kqueue lock held /
6783	static void
6784	knote_unsuppress(kqueue_t kqu, struct knote *kn)
6785	{
6786	knote_unsuppress_noqueue(kqu, kn);
6787	knote_enqueue(kqu, kn);
6788	}
6789
6790	__attribute__((always_inline))
6791	static inline void
6792	knote_mark_active(struct knote *kn)
6793	{
6794	if ((kn->kn_status & KN_ACTIVE) == `0`) {
6795	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_ACTIVATE),
6796	kn->kn_udata, kn->kn_status \| (kn->kn_id << `32`),
6797	kn->kn_filtid);
6798	}
6799
6800	kn->kn_status \|= KN_ACTIVE;
6801	}
6802
6803	/ called with kqueue lock held /
6804	static void
6805	knote_activate(kqueue_t kqu, struct knote kn, int* result)
6806	{
6807	assert(result & FILTER_ACTIVE);
6808	if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
6809	// may dequeue the knote
6810	knote_adjust_qos(kq: kqu.kq, kn, result);
6811	}
6812	knote_mark_active(kn);
6813	knote_enqueue(kqu, kn);
6814	}
6815
6816	/*
6817	* This function applies changes requested by f_attach or f_touch for
6818	* a given filter. It proceeds in a carefully chosen order to help
6819	* every single transition do the minimal amount of work possible.
6820	*/
6821	static void
6822	knote_apply_touch(kqueue_t kqu, struct knote kn, struct* kevent_qos_s *kev,
6823	int result)
6824	{
6825	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
6826	kn->kn_status &= ~KN_DISABLED;
6827
6828	/*
6829	* it is possible for userland to have knotes registered for a given
6830	* workloop `wl_orig` but really handled on another workloop `wl_new`.
6831	*
6832	* In that case, rearming will happen from the servicer thread of
6833	* `wl_new` which if `wl_orig` is no longer being serviced, would cause
6834	* this knote to stay suppressed forever if we only relied on
6835	* kqworkloop_acknowledge_events to be called by `wl_orig`.
6836	*
6837	* However if we see the KQ_PROCESSING bit on `wl_orig` set, we can't
6838	* unsuppress because that would mess with the processing phase of
6839	* `wl_orig`, however it also means kqworkloop_acknowledge_events()
6840	* will be called.
6841	*/
6842	if (__improbable(kn->kn_status & KN_SUPPRESSED)) {
6843	if ((kqu.kq->kq_state & KQ_PROCESSING) == `0`) {
6844	knote_unsuppress_noqueue(kqu, kn);
6845	}
6846	}
6847	}
6848
6849	if (result & FILTER_ADJUST_EVENT_IOTIER_BIT) {
6850	kqueue_update_iotier_override(kqu);
6851	}
6852
6853	if ((result & FILTER_UPDATE_REQ_QOS) && kev->qos && kev->qos != kn->kn_qos) {
6854	// may dequeue the knote
6855	knote_reset_priority(kqu, kn, pp: kev->qos);
6856	}
6857
6858	/*
6859	* When we unsuppress above, or because of knote_reset_priority(),
6860	* the knote may have been dequeued, we need to restore the invariant
6861	* that if the knote is active it needs to be queued now that
6862	* we're done applying changes.
6863	*/
6864	if (result & FILTER_ACTIVE) {
6865	knote_activate(kqu, kn, result);
6866	} else {
6867	knote_enqueue(kqu, kn);
6868	}
6869
6870	if ((result & FILTER_THREADREQ_NODEFEER) &&
6871	act_clear_astkevent(thread: current_thread(), AST_KEVENT_REDRIVE_THREADREQ)) {
6872	workq_kern_threadreq_redrive(p: kqu.kq->kq_p, flags: WORKQ_THREADREQ_NONE);
6873	}
6874	}
6875
6876	/*
6877	* knote_drop - disconnect and drop the knote
6878	*
6879	* Called with the kqueue locked, returns with the kqueue unlocked.
6880	*
6881	* If a knote locking context is passed, it is canceled.
6882	*
6883	* The knote may have already been detached from
6884	* (or not yet attached to) its source object.
6885	*/
6886	static void
6887	knote_drop(struct kqueue kq, struct* knote kn, struct* knote_lock_ctx *knlc)
6888	{
6889	struct proc *p = kq->kq_p;
6890
6891	kqlock_held(kqu: kq);
6892
6893	assert((kn->kn_status & KN_DROPPING) == `0`);
6894	if (knlc == NULL) {
6895	assert((kn->kn_status & KN_LOCKED) == `0`);
6896	}
6897	kn->kn_status \|= KN_DROPPING;
6898
6899	if (kn->kn_status & KN_SUPPRESSED) {
6900	knote_unsuppress_noqueue(kqu: kq, kn);
6901	} else {
6902	knote_dequeue(kqu: kq, kn);
6903	}
6904	knote_wait_for_post(kq, kn);
6905
6906	/ Even if we are autodetached, the filter may need to do cleanups of any*
6907	* stuff stashed on the knote so always make the call and let each filter
6908	* handle the possibility of autodetached-ness */
6909	knote_fops(kn)->f_detach(kn);
6910
6911	/ kq may be freed when kq_remove_knote() returns /
6912	kq_remove_knote(kq, kn, p, knlc);
6913	if (kn->kn_is_fd && ((kn->kn_status & KN_VANISHED) == `0`)) {
6914	fp_drop(p, fd: (int)kn->kn_id, fp: kn->kn_fp, locked: `0`);
6915	}
6916
6917	knote_free(kn);
6918	}
6919
6920	void
6921	knote_init(void)
6922	{
6923	#if CONFIG_MEMORYSTATUS
6924	/ Initialize the memorystatus list lock /
6925	memorystatus_kevent_init(grp: &kq_lck_grp, LCK_ATTR_NULL);
6926	#endif
6927	}
6928	SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
6929
6930	const struct filterops *
6931	knote_fops(struct knote *kn)
6932	{
6933	return sysfilt_ops[kn->kn_filtid];
6934	}
6935
6936	static struct knote *
6937	knote_alloc(void)
6938	{
6939	return zalloc_flags(knote_zone, Z_WAITOK \| Z_ZERO \| Z_NOFAIL);
6940	}
6941
6942	static void
6943	knote_free(struct knote *kn)
6944	{
6945	assert((kn->kn_status & (KN_LOCKED \| KN_POSTING)) == `0`);
6946	zfree(knote_zone, kn);
6947	}
6948
6949	#pragma mark - syscalls: kevent, kevent64, kevent_qos, kevent_id
6950
6951	kevent_ctx_t
6952	kevent_get_context(thread_t thread)
6953	{
6954	uthread_t ut = get_bsdthread_info(thread);
6955	return &ut->uu_save.uus_kevent;
6956	}
6957
6958	static inline bool
6959	kevent_args_requesting_events(unsigned int flags, int nevents)
6960	{
6961	return !(flags & KEVENT_FLAG_ERROR_EVENTS) && nevents > `0`;
6962	}
6963
6964	static inline int
6965	kevent_adjust_flags_for_proc(proc_t p, int flags)
6966	{
6967	__builtin_assume(p);
6968	return flags \| (IS_64BIT_PROCESS(p) ? KEVENT_FLAG_PROC64 : `0`);
6969	}
6970
6971	/!*
6972	* @function kevent_get_kqfile
6973	*
6974	* @brief
6975	* Lookup a kqfile by fd.
6976	*
6977	* @discussion
6978	* Callers: kevent, kevent64, kevent_qos
6979	*
6980	* This is not assumed to be a fastpath (kqfile interfaces are legacy)
6981	*/
6982	OS_NOINLINE
6983	static int
6984	kevent_get_kqfile(struct proc p, int* fd, int flags,
6985	struct fileproc fpp, struct kqueue kqp)
6986	{
6987	int error = `0`;
6988	struct kqueue *kq;
6989
6990	error = fp_get_ftype(p, fd, ftype: DTYPE_KQUEUE, EBADF, fpp);
6991	if (__improbable(error)) {
6992	return error;
6993	}
6994	kq = (struct kqueue )fp_get_data(fp: (fpp));
6995
6996	uint16_t kq_state = os_atomic_load(&kq->kq_state, relaxed);
6997	if (__improbable((kq_state & (KQ_KEV32 \| KQ_KEV64 \| KQ_KEV_QOS)) == `0`)) {
6998	kqlock(kqu: kq);
6999	kq_state = kq->kq_state;
7000	if (!(kq_state & (KQ_KEV32 \| KQ_KEV64 \| KQ_KEV_QOS))) {
7001	if (flags & KEVENT_FLAG_LEGACY32) {
7002	kq_state \|= KQ_KEV32;
7003	} else if (flags & KEVENT_FLAG_LEGACY64) {
7004	kq_state \|= KQ_KEV64;
7005	} else {
7006	kq_state \|= KQ_KEV_QOS;
7007	}
7008	kq->kq_state = kq_state;
7009	}
7010	kqunlock(kqu: kq);
7011	}
7012
7013	/*
7014	* kqfiles can't be used through the legacy kevent()
7015	* and other interfaces at the same time.
7016	*/
7017	if (__improbable((bool)(flags & KEVENT_FLAG_LEGACY32) !=
7018	(bool)(kq_state & KQ_KEV32))) {
7019	fp_drop(p, fd, fp: *fpp, locked: `0`);
7020	return EINVAL;
7021	}
7022
7023	*kqp = kq;
7024	return `0`;
7025	}
7026
7027	/!*
7028	* @function kevent_get_kqwq
7029	*
7030	* @brief
7031	* Lookup or create the process kqwq (faspath).
7032	*
7033	* @discussion
7034	* Callers: kevent64, kevent_qos
7035	*/
7036	OS_ALWAYS_INLINE
7037	static int
7038	kevent_get_kqwq(proc_t p, int flags, int nevents, struct kqueue **kqp)
7039	{
7040	struct kqworkq *kqwq = p->p_fd.fd_wqkqueue;
7041
7042	if (__improbable(kevent_args_requesting_events(flags, nevents))) {
7043	return EINVAL;
7044	}
7045	if (__improbable(kqwq == NULL)) {
7046	kqwq = kqworkq_alloc(p, flags);
7047	if (__improbable(kqwq == NULL)) {
7048	return ENOMEM;
7049	}
7050	}
7051
7052	*kqp = &kqwq->kqwq_kqueue;
7053	return `0`;
7054	}
7055
7056	#pragma mark kevent copyio
7057
7058	/!*
7059	* @function kevent_get_data_size
7060	*
7061	* @brief
7062	* Copies in the extra data size from user-space.
7063	*/
7064	static int
7065	kevent_get_data_size(int flags, user_addr_t data_avail, user_addr_t data_out,
7066	kevent_ctx_t kectx)
7067	{
7068	if (!data_avail \|\| !data_out) {
7069	kectx->kec_data_size = `0`;
7070	kectx->kec_data_resid = `0`;
7071	} else if (flags & KEVENT_FLAG_PROC64) {
7072	user64_size_t usize = `0`;
7073	int error = copyin((user_addr_t)data_avail, &usize, sizeof(usize));
7074	if (__improbable(error)) {
7075	return error;
7076	}
7077	kectx->kec_data_resid = kectx->kec_data_size = (user_size_t)usize;
7078	} else {
7079	user32_size_t usize = `0`;
7080	int error = copyin((user_addr_t)data_avail, &usize, sizeof(usize));
7081	if (__improbable(error)) {
7082	return error;
7083	}
7084	kectx->kec_data_avail = data_avail;
7085	kectx->kec_data_resid = kectx->kec_data_size = (user_size_t)usize;
7086	}
7087	kectx->kec_data_out = data_out;
7088	kectx->kec_data_avail = data_avail;
7089	return `0`;
7090	}
7091
7092	/!*
7093	* @function kevent_put_data_size
7094	*
7095	* @brief
7096	* Copies out the residual data size to user-space if any has been used.
7097	*/
7098	static int
7099	kevent_put_data_size(unsigned int flags, kevent_ctx_t kectx)
7100	{
7101	if (kectx->kec_data_resid == kectx->kec_data_size) {
7102	return `0`;
7103	}
7104	if (flags & KEVENT_FLAG_KERNEL) {
7105	(user_size_t )(uintptr_t)kectx->kec_data_avail = kectx->kec_data_resid;
7106	return `0`;
7107	}
7108	if (flags & KEVENT_FLAG_PROC64) {
7109	user64_size_t usize = (user64_size_t)kectx->kec_data_resid;
7110	return copyout(&usize, (user_addr_t)kectx->kec_data_avail, sizeof(usize));
7111	} else {
7112	user32_size_t usize = (user32_size_t)kectx->kec_data_resid;
7113	return copyout(&usize, (user_addr_t)kectx->kec_data_avail, sizeof(usize));
7114	}
7115	}
7116
7117	/!*
7118	* @function kevent_legacy_copyin
7119	*
7120	* @brief
7121	* Handles the copyin of a kevent/kevent64 event.
7122	*/
7123	static int
7124	kevent_legacy_copyin(user_addr_t addrp, struct* kevent_qos_s kevp, unsigned* int flags)
7125	{
7126	int error;
7127
7128	assert((flags & (KEVENT_FLAG_LEGACY32 \| KEVENT_FLAG_LEGACY64)) != `0`);
7129
7130	if (flags & KEVENT_FLAG_LEGACY64) {
7131	struct kevent64_s kev64;
7132
7133	error = copyin(addrp, (caddr_t)&kev64, sizeof*(kev64));
7134	if (__improbable(error)) {
7135	return error;
7136	}
7137	addrp += sizeof*(kev64);
7138	kevp = (struct* kevent_qos_s){
7139	.ident = kev64.ident,
7140	.filter = kev64.filter,
7141	/ Make sure user doesn't pass in any system flags /
7142	.flags = kev64.flags & ~EV_SYSFLAGS,
7143	.udata = kev64.udata,
7144	.fflags = kev64.fflags,
7145	.data = kev64.data,
7146	.ext[`0`] = kev64.ext[`0`],
7147	.ext[`1`] = kev64.ext[`1`],
7148	};
7149	} else if (flags & KEVENT_FLAG_PROC64) {
7150	struct user64_kevent kev64;
7151
7152	error = copyin(addrp, (caddr_t)&kev64, sizeof*(kev64));
7153	if (__improbable(error)) {
7154	return error;
7155	}
7156	addrp += sizeof*(kev64);
7157	kevp = (struct* kevent_qos_s){
7158	.ident = kev64.ident,
7159	.filter = kev64.filter,
7160	/ Make sure user doesn't pass in any system flags /
7161	.flags = kev64.flags & ~EV_SYSFLAGS,
7162	.udata = kev64.udata,
7163	.fflags = kev64.fflags,
7164	.data = kev64.data,
7165	};
7166	} else {
7167	struct user32_kevent kev32;
7168
7169	error = copyin(addrp, (caddr_t)&kev32, sizeof*(kev32));
7170	if (__improbable(error)) {
7171	return error;
7172	}
7173	addrp += sizeof*(kev32);
7174	kevp = (struct* kevent_qos_s){
7175	.ident = (uintptr_t)kev32.ident,
7176	.filter = kev32.filter,
7177	/ Make sure user doesn't pass in any system flags /
7178	.flags = kev32.flags & ~EV_SYSFLAGS,
7179	.udata = CAST_USER_ADDR_T(kev32.udata),
7180	.fflags = kev32.fflags,
7181	.data = (intptr_t)kev32.data,
7182	};
7183	}
7184
7185	return `0`;
7186	}
7187
7188	/!*
7189	* @function kevent_modern_copyin
7190	*
7191	* @brief
7192	* Handles the copyin of a kevent_qos/kevent_id event.
7193	*/
7194	static int
7195	kevent_modern_copyin(user_addr_t addrp, struct* kevent_qos_s *kevp)
7196	{
7197	int error = copyin(addrp, (caddr_t)kevp, sizeof(struct* kevent_qos_s));
7198	if (__probable(!error)) {
7199	/ Make sure user doesn't pass in any system flags /
7200	addrp += sizeof(struct* kevent_qos_s);
7201	kevp->flags &= ~EV_SYSFLAGS;
7202	}
7203	return error;
7204	}
7205
7206	/!*
7207	* @function kevent_legacy_copyout
7208	*
7209	* @brief
7210	* Handles the copyout of a kevent/kevent64 event.
7211	*/
7212	static int
7213	kevent_legacy_copyout(struct kevent_qos_s kevp, user_addr_t addrp, unsigned int flags)
7214	{
7215	int advance;
7216	int error;
7217
7218	assert((flags & (KEVENT_FLAG_LEGACY32 \| KEVENT_FLAG_LEGACY64)) != `0`);
7219
7220	/*
7221	* fully initialize the differnt output event structure
7222	* types from the internal kevent (and some universal
7223	* defaults for fields not represented in the internal
7224	* form).
7225	*
7226	* Note: these structures have no padding hence the C99
7227	* initializers below do not leak kernel info.
7228	*/
7229	if (flags & KEVENT_FLAG_LEGACY64) {
7230	struct kevent64_s kev64 = {
7231	.ident = kevp->ident,
7232	.filter = kevp->filter,
7233	.flags = kevp->flags,
7234	.fflags = kevp->fflags,
7235	.data = (int64_t)kevp->data,
7236	.udata = kevp->udata,
7237	.ext[`0`] = kevp->ext[`0`],
7238	.ext[`1`] = kevp->ext[`1`],
7239	};
7240	advance = sizeof(struct kevent64_s);
7241	error = copyout((caddr_t)&kev64, *addrp, advance);
7242	} else if (flags & KEVENT_FLAG_PROC64) {
7243	/*
7244	* deal with the special case of a user-supplied
7245	* value of (uintptr_t)-1.
7246	*/
7247	uint64_t ident = (kevp->ident == (uintptr_t)-`1`) ?
7248	(uint64_t)-`1LL` : (uint64_t)kevp->ident;
7249	struct user64_kevent kev64 = {
7250	.ident = ident,
7251	.filter = kevp->filter,
7252	.flags = kevp->flags,
7253	.fflags = kevp->fflags,
7254	.data = (int64_t) kevp->data,
7255	.udata = (user_addr_t) kevp->udata,
7256	};
7257	advance = sizeof(kev64);
7258	error = copyout((caddr_t)&kev64, *addrp, advance);
7259	} else {
7260	struct user32_kevent kev32 = {
7261	.ident = (uint32_t)kevp->ident,
7262	.filter = kevp->filter,
7263	.flags = kevp->flags,
7264	.fflags = kevp->fflags,
7265	.data = (int32_t)kevp->data,
7266	.udata = (uint32_t)kevp->udata,
7267	};
7268	advance = sizeof(kev32);
7269	error = copyout((caddr_t)&kev32, *addrp, advance);
7270	}
7271	if (__probable(!error)) {
7272	*addrp += advance;
7273	}
7274	return error;
7275	}
7276
7277	/!*
7278	* @function kevent_modern_copyout
7279	*
7280	* @brief
7281	* Handles the copyout of a kevent_qos/kevent_id event.
7282	*/
7283	OS_ALWAYS_INLINE
7284	static inline int
7285	kevent_modern_copyout(struct kevent_qos_s kevp, user_addr_t addrp)
7286	{
7287	int error = copyout((caddr_t)kevp, addrp, sizeof(struct* kevent_qos_s));
7288	if (__probable(!error)) {
7289	addrp += sizeof(struct* kevent_qos_s);
7290	}
7291	return error;
7292	}
7293
7294	#pragma mark kevent core implementation
7295
7296	/!*
7297	* @function kevent_callback_inline
7298	*
7299	* @brief
7300	* Callback for each individual event
7301	*
7302	* @discussion
7303	* This is meant to be inlined in kevent_modern_callback and
7304	* kevent_legacy_callback.
7305	*/
7306	OS_ALWAYS_INLINE
7307	static inline int
7308	kevent_callback_inline(struct kevent_qos_s *kevp, kevent_ctx_t kectx, bool legacy)
7309	{
7310	int error;
7311
7312	assert(kectx->kec_process_noutputs < kectx->kec_process_nevents);
7313
7314	/*
7315	* Copy out the appropriate amount of event data for this user.
7316	*/
7317	if (legacy) {
7318	error = kevent_legacy_copyout(kevp, addrp: &kectx->kec_process_eventlist,
7319	flags: kectx->kec_process_flags);
7320	} else {
7321	error = kevent_modern_copyout(kevp, addrp: &kectx->kec_process_eventlist);
7322	}
7323
7324	/*
7325	* If there isn't space for additional events, return
7326	* a harmless error to stop the processing here
7327	*/
7328	if (error == `0` && ++kectx->kec_process_noutputs == kectx->kec_process_nevents) {
7329	error = EWOULDBLOCK;
7330	}
7331	return error;
7332	}
7333
7334	/!*
7335	* @function kevent_modern_callback
7336	*
7337	* @brief
7338	* Callback for each individual modern event.
7339	*
7340	* @discussion
7341	* This callback handles kevent_qos/kevent_id events.
7342	*/
7343	static int
7344	kevent_modern_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
7345	{
7346	return kevent_callback_inline(kevp, kectx, /legacy/ false);
7347	}
7348
7349	/!*
7350	* @function kevent_legacy_callback
7351	*
7352	* @brief
7353	* Callback for each individual legacy event.
7354	*
7355	* @discussion
7356	* This callback handles kevent/kevent64 events.
7357	*/
7358	static int
7359	kevent_legacy_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
7360	{
7361	return kevent_callback_inline(kevp, kectx, /legacy/ true);
7362	}
7363
7364	/!*
7365	* @function kevent_cleanup
7366	*
7367	* @brief
7368	* Handles the cleanup returning from a kevent call.
7369	*
7370	* @discussion
7371	* kevent entry points will take a reference on workloops,
7372	* and a usecount on the fileglob of kqfiles.
7373	*
7374	* This function undoes this on the exit paths of kevents.
7375	*
7376	* @returns
7377	* The error to return to userspace.
7378	*/
7379	static int
7380	kevent_cleanup(kqueue_t kqu, int flags, int error, kevent_ctx_t kectx)
7381	{
7382	// poll should not call any codepath leading to this
7383	assert((flags & KEVENT_FLAG_POLL) == `0`);
7384
7385	if (flags & KEVENT_FLAG_WORKLOOP) {
7386	kqworkloop_release(kqwl: kqu.kqwl);
7387	} else if (flags & KEVENT_FLAG_WORKQ) {
7388	/ nothing held /
7389	} else {
7390	fp_drop(p: kqu.kqf->kqf_p, fd: kectx->kec_fd, fp: kectx->kec_fp, locked: `0`);
7391	}
7392
7393	/ don't restart after signals... /
7394	if (error == ERESTART) {
7395	error = EINTR;
7396	} else if (error == `0`) {
7397	/ don't abandon other output just because of residual copyout failures /
7398	(void)kevent_put_data_size(flags, kectx);
7399	}
7400
7401	if (flags & KEVENT_FLAG_PARKING) {
7402	thread_t th = current_thread();
7403	struct uthread *uth = get_bsdthread_info(th);
7404	if (uth->uu_kqr_bound) {
7405	thread_unfreeze_base_pri(thread: th);
7406	}
7407	}
7408	return error;
7409	}
7410
7411	/!*
7412	* @function kqueue_process
7413	*
7414	* @brief
7415	* Process the triggered events in a kqueue.
7416	*
7417	* @discussion
7418	* Walk the queued knotes and validate that they are really still triggered
7419	* events by calling the filter routines (if necessary).
7420	*
7421	* For each event that is still considered triggered, invoke the callback
7422	* routine provided.
7423	*
7424	* caller holds a reference on the kqueue.
7425	* kqueue locked on entry and exit - but may be dropped
7426	* kqueue list locked (held for duration of call)
7427	*
7428	* This is only called by kqueue_scan() so that the compiler can inline it.
7429	*
7430	* @returns
7431	* - 0: no event was returned, no other error occured
7432	* - EBADF: the kqueue is being destroyed (KQ_DRAIN is set)
7433	* - EWOULDBLOCK: (not an error) events have been found and we should return
7434	* - EFAULT: copyout failed
7435	* - filter specific errors
7436	*/
7437	static int
7438	kqueue_process(kqueue_t kqu, int flags, kevent_ctx_t kectx,
7439	kevent_callback_t callback)
7440	{
7441	workq_threadreq_t kqr = current_uthread()->uu_kqr_bound;
7442	struct knote *kn;
7443	int error = `0`, rc = `0`;
7444	struct kqtailq base_queue, queue;
7445	uint16_t kq_type = (kqu.kq->kq_state & (KQ_WORKQ \| KQ_WORKLOOP));
7446
7447	if (kq_type & KQ_WORKQ) {
7448	rc = kqworkq_begin_processing(kqwq: kqu.kqwq, kqr, kevent_flags: flags);
7449	} else if (kq_type & KQ_WORKLOOP) {
7450	rc = kqworkloop_begin_processing(kqwl: kqu.kqwl, kevent_flags: flags);
7451	} else {
7452	kqfile_retry:
7453	rc = kqfile_begin_processing(kq: kqu.kqf);
7454	if (rc == EBADF) {
7455	return EBADF;
7456	}
7457	}
7458
7459	if (rc == -`1`) {
7460	/ Nothing to process /
7461	return `0`;
7462	}
7463
7464	/*
7465	* loop through the enqueued knotes associated with this request,
7466	* processing each one. Each request may have several queues
7467	* of knotes to process (depending on the type of kqueue) so we
7468	* have to loop through all the queues as long as we have additional
7469	* space.
7470	*/
7471
7472	process_again:
7473	if (kq_type & KQ_WORKQ) {
7474	base_queue = queue = &kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index - `1`];
7475	} else if (kq_type & KQ_WORKLOOP) {
7476	base_queue = &kqu.kqwl->kqwl_queue[`0`];
7477	queue = &kqu.kqwl->kqwl_queue[KQWL_NBUCKETS - `1`];
7478	} else {
7479	base_queue = queue = &kqu.kqf->kqf_queue;
7480	}
7481
7482	do {
7483	while ((kn = TAILQ_FIRST(queue)) != NULL) {
7484	error = knote_process(kn, kectx, callback);
7485	if (error == EJUSTRETURN) {
7486	error = `0`;
7487	} else if (__improbable(error)) {
7488	/ error is EWOULDBLOCK when the out event array is full /
7489	goto stop_processing;
7490	}
7491	}
7492	} while (queue-- > base_queue);
7493
7494	if (kectx->kec_process_noutputs) {
7495	/ callers will transform this into no error /
7496	error = EWOULDBLOCK;
7497	}
7498
7499	stop_processing:
7500	/*
7501	* If KEVENT_FLAG_PARKING is set, and no kevents have been returned,
7502	* we want to unbind the kqrequest from the thread.
7503	*
7504	* However, because the kq locks are dropped several times during process,
7505	* new knotes may have fired again, in which case, we want to fail the end
7506	* processing and process again, until it converges.
7507	*
7508	* If we have an error or returned events, end processing never fails.
7509	*/
7510	if (error) {
7511	flags &= ~KEVENT_FLAG_PARKING;
7512	}
7513	if (kq_type & KQ_WORKQ) {
7514	rc = kqworkq_end_processing(kqwq: kqu.kqwq, kqr, kevent_flags: flags);
7515	} else if (kq_type & KQ_WORKLOOP) {
7516	rc = kqworkloop_end_processing(kqwl: kqu.kqwl, flags: KQ_PROCESSING, kevent_flags: flags);
7517	} else {
7518	rc = kqfile_end_processing(kq: kqu.kqf);
7519	}
7520
7521	if (__probable(error)) {
7522	return error;
7523	}
7524
7525	if (__probable(rc >= `0`)) {
7526	assert(rc == `0` \|\| rc == EBADF);
7527	return rc;
7528	}
7529
7530	if (kq_type & (KQ_WORKQ \| KQ_WORKLOOP)) {
7531	assert(flags & KEVENT_FLAG_PARKING);
7532	goto process_again;
7533	} else {
7534	goto kqfile_retry;
7535	}
7536	}
7537
7538	/!*
7539	* @function kqueue_scan_continue
7540	*
7541	* @brief
7542	* The continuation used by kqueue_scan for kevent entry points.
7543	*
7544	* @discussion
7545	* Assumes we inherit a use/ref count on the kq or its fileglob.
7546	*
7547	* This is called by kqueue_scan if neither KEVENT_FLAG_POLL nor
7548	* KEVENT_FLAG_KERNEL was set, and the caller had to wait.
7549	*/
7550	OS_NORETURN OS_NOINLINE
7551	static void
7552	kqueue_scan_continue(void *data, wait_result_t wait_result)
7553	{
7554	uthread_t ut = current_uthread();
7555	kevent_ctx_t kectx = &ut->uu_save.uus_kevent;
7556	int error = `0`, flags = kectx->kec_process_flags;
7557	struct kqueue *kq = data;
7558
7559	/*
7560	* only kevent variants call in here, so we know the callback is
7561	* kevent_legacy_callback or kevent_modern_callback.
7562	*/
7563	assert((flags & (KEVENT_FLAG_POLL \| KEVENT_FLAG_KERNEL)) == `0`);
7564
7565	switch (wait_result) {
7566	case THREAD_AWAKENED:
7567	if (__improbable(flags & (KEVENT_FLAG_LEGACY32 \| KEVENT_FLAG_LEGACY64))) {
7568	error = kqueue_scan(kq, flags, kectx, kevent_legacy_callback);
7569	} else {
7570	error = kqueue_scan(kq, flags, kectx, kevent_modern_callback);
7571	}
7572	break;
7573	case THREAD_TIMED_OUT:
7574	error = `0`;
7575	break;
7576	case THREAD_INTERRUPTED:
7577	error = EINTR;
7578	break;
7579	case THREAD_RESTART:
7580	error = EBADF;
7581	break;
7582	default:
7583	panic("%s: - invalid wait_result (%d)", __func__, wait_result);
7584	}
7585
7586
7587	error = kevent_cleanup(kqu: kq, flags, error, kectx);
7588	(int32_t )&ut->uu_rval = kectx->kec_process_noutputs;
7589	unix_syscall_return(error);
7590	}
7591
7592	/!*
7593	* @function kqueue_scan
7594	*
7595	* @brief
7596	* Scan and wait for events in a kqueue (used by poll & kevent).
7597	*
7598	* @discussion
7599	* Process the triggered events in a kqueue.
7600	*
7601	* If there are no events triggered arrange to wait for them:
7602	* - unless KEVENT_FLAG_IMMEDIATE is set in kectx->kec_process_flags
7603	* - possibly until kectx->kec_deadline expires
7604	*
7605	* When it waits, and that neither KEVENT_FLAG_POLL nor KEVENT_FLAG_KERNEL
7606	* are set, then it will wait in the kqueue_scan_continue continuation.
7607	*
7608	* poll() will block in place, and KEVENT_FLAG_KERNEL calls
7609	* all pass KEVENT_FLAG_IMMEDIATE and will not wait.
7610	*
7611	* @param kqu
7612	* The kqueue being scanned.
7613	*
7614	* @param flags
7615	* The KEVENT_FLAG_* flags for this call.
7616	*
7617	* @param kectx
7618	* The context used for this scan.
7619	* The uthread_t::uu_save.uus_kevent storage is used for this purpose.
7620	*
7621	* @param callback
7622	* The callback to be called on events sucessfully processed.
7623	* (Either kevent_legacy_callback, kevent_modern_callback or poll_callback)
7624	*/
7625	int
7626	kqueue_scan(kqueue_t kqu, int flags, kevent_ctx_t kectx,
7627	kevent_callback_t callback)
7628	{
7629	int error;
7630
7631	for (;;) {
7632	kqlock(kqu);
7633	error = kqueue_process(kqu, flags, kectx, callback);
7634
7635	/*
7636	* If we got an error, events returned (EWOULDBLOCK)
7637	* or blocking was disallowed (KEVENT_FLAG_IMMEDIATE),
7638	* just return.
7639	*/
7640	if (__probable(error \|\| (flags & KEVENT_FLAG_IMMEDIATE))) {
7641	kqunlock(kqu);
7642	return error == EWOULDBLOCK ? `0` : error;
7643	}
7644
7645	assert((kqu.kq->kq_state & (KQ_WORKQ \| KQ_WORKLOOP)) == `0`);
7646
7647	kqu.kqf->kqf_state \|= KQ_SLEEP;
7648	assert_wait_deadline(event: &kqu.kqf->kqf_count, THREAD_ABORTSAFE,
7649	deadline: kectx->kec_deadline);
7650	kqunlock(kqu);
7651
7652	if (__probable((flags & (KEVENT_FLAG_POLL \| KEVENT_FLAG_KERNEL)) == `0`)) {
7653	thread_block_parameter(continuation: kqueue_scan_continue, parameter: kqu.kqf);
7654	__builtin_unreachable();
7655	}
7656
7657	wait_result_t wr = thread_block(THREAD_CONTINUE_NULL);
7658	switch (wr) {
7659	case THREAD_AWAKENED:
7660	break;
7661	case THREAD_TIMED_OUT:
7662	return `0`;
7663	case THREAD_INTERRUPTED:
7664	return EINTR;
7665	case THREAD_RESTART:
7666	return EBADF;
7667	default:
7668	panic("%s: - bad wait_result (%d)", __func__, wr);
7669	}
7670	}
7671	}
7672
7673	/!*
7674	* @function kevent_internal
7675	*
7676	* @brief
7677	* Common kevent code.
7678	*
7679	* @discussion
7680	* Needs to be inlined to specialize for legacy or modern and
7681	* eliminate dead code.
7682	*
7683	* This is the core logic of kevent entry points, that will:
7684	* - register kevents
7685	* - optionally scan the kqueue for events
7686	*
7687	* The caller is giving kevent_internal a reference on the kqueue
7688	* or its fileproc that needs to be cleaned up by kevent_cleanup().
7689	*/
7690	OS_ALWAYS_INLINE
7691	static inline int
7692	kevent_internal(kqueue_t kqu,
7693	user_addr_t changelist, int nchanges,
7694	user_addr_t ueventlist, int nevents,
7695	int flags, kevent_ctx_t kectx, int32_t *retval,
7696	bool legacy)
7697	{
7698	int error = `0`, noutputs = `0`, register_rc;
7699
7700	/ only bound threads can receive events on workloops /
7701	if (!legacy && (flags & KEVENT_FLAG_WORKLOOP)) {
7702	#if CONFIG_WORKLOOP_DEBUG
7703	UU_KEVENT_HISTORY_WRITE_ENTRY(current_uthread(), {
7704	.uu_kqid = kqu.kqwl->kqwl_dynamicid,
7705	.uu_kq = error ? NULL : kqu.kq,
7706	.uu_error = error,
7707	.uu_nchanges = nchanges,
7708	.uu_nevents = nevents,
7709	.uu_flags = flags,
7710	});
7711	#endif // CONFIG_WORKLOOP_DEBUG
7712
7713	if (flags & KEVENT_FLAG_KERNEL) {
7714	/ see kevent_workq_internal /
7715	error = copyout(&kqu.kqwl->kqwl_dynamicid,
7716	ueventlist - sizeof(kqueue_id_t), sizeof(kqueue_id_t));
7717	kectx->kec_data_resid -= sizeof(kqueue_id_t);
7718	if (__improbable(error)) {
7719	goto out;
7720	}
7721	}
7722
7723	if (kevent_args_requesting_events(flags, nevents)) {
7724	/*
7725	* Disable the R2K notification while doing a register, if the
7726	* caller wants events too, we don't want the AST to be set if we
7727	* will process these events soon.
7728	*/
7729	kqlock(kqu);
7730	kqu.kq->kq_state &= ~KQ_R2K_ARMED;
7731	kqunlock(kqu);
7732	flags \|= KEVENT_FLAG_NEEDS_END_PROCESSING;
7733	}
7734	}
7735
7736	/ register all the change requests the user provided... /
7737	while (nchanges > `0` && error == `0`) {
7738	struct kevent_qos_s kev;
7739	struct knote *kn = NULL;
7740
7741	if (legacy) {
7742	error = kevent_legacy_copyin(addrp: &changelist, kevp: &kev, flags);
7743	} else {
7744	error = kevent_modern_copyin(addrp: &changelist, kevp: &kev);
7745	}
7746	if (error) {
7747	break;
7748	}
7749
7750	register_rc = kevent_register(kq: kqu.kq, kev: &kev, kn_out: &kn);
7751	if (__improbable(!legacy && (register_rc & FILTER_REGISTER_WAIT))) {
7752	thread_t thread = current_thread();
7753
7754	kqlock_held(kqu);
7755
7756	if (act_clear_astkevent(thread, AST_KEVENT_REDRIVE_THREADREQ)) {
7757	workq_kern_threadreq_redrive(p: kqu.kq->kq_p, flags: WORKQ_THREADREQ_NONE);
7758	}
7759
7760	// f_post_register_wait is meant to call a continuation and not to
7761	// return, which is why we don't support FILTER_REGISTER_WAIT if
7762	// KEVENT_FLAG_ERROR_EVENTS is not passed, or if the event that
7763	// waits isn't the last.
7764	//
7765	// It is implementable, but not used by any userspace code at the
7766	// moment, so for now return ENOTSUP if someone tries to do it.
7767	if (nchanges == `1` && noutputs < nevents &&
7768	(flags & KEVENT_FLAG_KERNEL) == `0` &&
7769	(flags & KEVENT_FLAG_PARKING) == `0` &&
7770	(flags & KEVENT_FLAG_ERROR_EVENTS) &&
7771	(flags & KEVENT_FLAG_WORKLOOP)) {
7772	uthread_t ut = get_bsdthread_info(thread);
7773
7774	/*
7775	* store the continuation/completion data in the uthread
7776	*
7777	* Note: the kectx aliases with this,
7778	* and is destroyed in the process.
7779	*/
7780	ut->uu_save.uus_kevent_register = (struct _kevent_register){
7781	.kev = kev,
7782	.kqwl = kqu.kqwl,
7783	.eventout = noutputs,
7784	.ueventlist = ueventlist,
7785	};
7786	knote_fops(kn)->f_post_register_wait(ut, kn,
7787	&ut->uu_save.uus_kevent_register);
7788	__builtin_unreachable();
7789	}
7790	kqunlock(kqu);
7791
7792	kev.flags \|= EV_ERROR;
7793	kev.data = ENOTSUP;
7794	} else {
7795	assert((register_rc & FILTER_REGISTER_WAIT) == `0`);
7796	}
7797
7798	// keep in sync with kevent_register_wait_return()
7799	if (noutputs < nevents && (kev.flags & (EV_ERROR \| EV_RECEIPT))) {
7800	if ((kev.flags & EV_ERROR) == `0`) {
7801	kev.flags \|= EV_ERROR;
7802	kev.data = `0`;
7803	}
7804	if (legacy) {
7805	error = kevent_legacy_copyout(kevp: &kev, addrp: &ueventlist, flags);
7806	} else {
7807	error = kevent_modern_copyout(kevp: &kev, addrp: &ueventlist);
7808	}
7809	if (error == `0`) {
7810	noutputs++;
7811	}
7812	} else if (kev.flags & EV_ERROR) {
7813	error = (int)kev.data;
7814	}
7815	nchanges--;
7816	}
7817
7818	if ((flags & KEVENT_FLAG_ERROR_EVENTS) == `0` &&
7819	nevents > `0` && noutputs == `0` && error == `0`) {
7820	kectx->kec_process_flags = flags;
7821	kectx->kec_process_nevents = nevents;
7822	kectx->kec_process_noutputs = `0`;
7823	kectx->kec_process_eventlist = ueventlist;
7824
7825	if (legacy) {
7826	error = kqueue_scan(kqu: kqu.kq, flags, kectx, callback: kevent_legacy_callback);
7827	} else {
7828	error = kqueue_scan(kqu: kqu.kq, flags, kectx, callback: kevent_modern_callback);
7829	}
7830
7831	noutputs = kectx->kec_process_noutputs;
7832	} else if (!legacy && (flags & KEVENT_FLAG_NEEDS_END_PROCESSING)) {
7833	/*
7834	* If we didn't through kqworkloop_end_processing(),
7835	* we need to do it here.
7836	*
7837	* kqueue_scan will call kqworkloop_end_processing(),
7838	* so we only need to do it if we didn't scan.
7839	*/
7840	kqlock(kqu);
7841	kqworkloop_end_processing(kqwl: kqu.kqwl, flags: `0`, kevent_flags: `0`);
7842	kqunlock(kqu);
7843	}
7844
7845	*retval = noutputs;
7846	out:
7847	return kevent_cleanup(kqu: kqu.kq, flags, error, kectx);
7848	}
7849
7850	#pragma mark modern syscalls: kevent_qos, kevent_id, kevent_workq_internal
7851
7852	/!*
7853	* @function kevent_modern_internal
7854	*
7855	* @brief
7856	* The backend of the kevent_id and kevent_workq_internal entry points.
7857	*
7858	* @discussion
7859	* Needs to be inline due to the number of arguments.
7860	*/
7861	OS_NOINLINE
7862	static int
7863	kevent_modern_internal(kqueue_t kqu,
7864	user_addr_t changelist, int nchanges,
7865	user_addr_t ueventlist, int nevents,
7866	int flags, kevent_ctx_t kectx, int32_t *retval)
7867	{
7868	return kevent_internal(kqu: kqu.kq, changelist, nchanges,
7869	ueventlist, nevents, flags, kectx, retval, /legacy/ false);
7870	}
7871
7872	/!*
7873	* @function kevent_id
7874	*
7875	* @brief
7876	* The kevent_id() syscall.
7877	*/
7878	int
7879	kevent_id(struct proc p, struct* kevent_id_args uap, int32_t retval)
7880	{
7881	int error, flags = uap->flags & KEVENT_FLAG_USER;
7882	uthread_t uth = current_uthread();
7883	workq_threadreq_t kqr = uth->uu_kqr_bound;
7884	kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
7885	kqueue_t kqu;
7886
7887	flags = kevent_adjust_flags_for_proc(p, flags);
7888	flags \|= KEVENT_FLAG_DYNAMIC_KQUEUE;
7889
7890	if (__improbable((flags & (KEVENT_FLAG_WORKQ \| KEVENT_FLAG_WORKLOOP)) !=
7891	KEVENT_FLAG_WORKLOOP)) {
7892	return EINVAL;
7893	}
7894
7895	error = kevent_get_data_size(flags, data_avail: uap->data_available, data_out: uap->data_out, kectx);
7896	if (__improbable(error)) {
7897	return error;
7898	}
7899
7900	kectx->kec_deadline = `0`;
7901	kectx->kec_fp = NULL;
7902	kectx->kec_fd = -`1`;
7903	/ the kec_process_* fields are filled if kqueue_scann is called only /
7904
7905	/*
7906	* Get the kq we are going to be working on
7907	* As a fastpath, look at the currently bound workloop.
7908	*/
7909	kqu.kqwl = kqr ? kqr_kqworkloop(kqr) : NULL;
7910	if (kqu.kqwl && kqu.kqwl->kqwl_dynamicid == uap->id) {
7911	if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) {
7912	return EEXIST;
7913	}
7914	kqworkloop_retain(kqwl: kqu.kqwl);
7915	} else if (__improbable(kevent_args_requesting_events(flags, uap->nevents))) {
7916	return EXDEV;
7917	} else {
7918	error = kqworkloop_get_or_create(p, id: uap->id, NULL,
7919	#if CONFIG_PREADOPT_TG
7920	NULL,
7921	#endif /* CONFIG_PREADOPT_TG */
7922	flags, kqwlp: &kqu.kqwl);
7923	if (__improbable(error)) {
7924	return error;
7925	}
7926	}
7927
7928	return kevent_modern_internal(kqu, changelist: uap->changelist, nchanges: uap->nchanges,
7929	ueventlist: uap->eventlist, nevents: uap->nevents, flags, kectx, retval);
7930	}
7931
7932	/!
7933	* @function kevent_workq_internal
7934	*
7935	* @discussion
7936	* This function is exported for the sake of the workqueue subsystem.
7937	*
7938	* It is called in two ways:
7939	* - when a thread is about to go to userspace to ask for pending event
7940	* - when a thread is returning from userspace with events back
7941	*
7942	* the workqueue subsystem will only use the following flags:
7943	* - KEVENT_FLAG_STACK_DATA (always)
7944	* - KEVENT_FLAG_IMMEDIATE (always)
7945	* - KEVENT_FLAG_PARKING (depending on whether it is going to or returning from
7946	* userspace).
7947	*
7948	* It implicitly acts on the bound kqueue, and for the case of workloops
7949	* will copyout the kqueue ID before anything else.
7950	*
7951	*
7952	* Pthread will have setup the various arguments to fit this stack layout:
7953	*
7954	* +-------....----+--------------+-----------+--------------------+
7955	* \| user stack \| data avail \| nevents \| pthread_self() \|
7956	* +-------....----+--------------+-----------+--------------------+
7957	* ^ ^
7958	* data_out eventlist
7959	*
7960	* When a workloop is used, the workloop ID is copied out right before
7961	* the eventlist and is taken from the data buffer.
7962	*
7963	* @warning
7964	* This function is carefuly tailored to not make any call except the final tail
7965	* call into kevent_modern_internal. (LTO inlines current_uthread()).
7966	*
7967	* This function is performance sensitive due to the workq subsystem.
7968	*/
7969	int
7970	kevent_workq_internal(struct proc *p,
7971	user_addr_t changelist, int nchanges,
7972	user_addr_t eventlist, int nevents,
7973	user_addr_t data_out, user_size_t *data_available,
7974	unsigned int flags, int32_t *retval)
7975	{
7976	uthread_t uth = current_uthread();
7977	workq_threadreq_t kqr = uth->uu_kqr_bound;
7978	kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
7979	kqueue_t kqu;
7980
7981	assert(flags == (KEVENT_FLAG_STACK_DATA \| KEVENT_FLAG_IMMEDIATE) \|\|
7982	flags == (KEVENT_FLAG_STACK_DATA \| KEVENT_FLAG_IMMEDIATE \| KEVENT_FLAG_PARKING));
7983
7984	kectx->kec_data_out = data_out;
7985	kectx->kec_data_avail = (uint64_t)data_available;
7986	kectx->kec_data_size = *data_available;
7987	kectx->kec_data_resid = *data_available;
7988	kectx->kec_deadline = `0`;
7989	kectx->kec_fp = NULL;
7990	kectx->kec_fd = -`1`;
7991	/ the kec_process_* fields are filled if kqueue_scann is called only /
7992
7993	flags = kevent_adjust_flags_for_proc(p, flags);
7994
7995	if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
7996	kqu.kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
7997	kqworkloop_retain(kqwl: kqu.kqwl);
7998
7999	flags \|= KEVENT_FLAG_WORKLOOP \| KEVENT_FLAG_DYNAMIC_KQUEUE \|
8000	KEVENT_FLAG_KERNEL;
8001	} else {
8002	kqu.kqwq = p->p_fd.fd_wqkqueue;
8003
8004	flags \|= KEVENT_FLAG_WORKQ \| KEVENT_FLAG_KERNEL;
8005	}
8006
8007	return kevent_modern_internal(kqu, changelist, nchanges,
8008	ueventlist: eventlist, nevents, flags, kectx, retval);
8009	}
8010
8011	/!*
8012	* @function kevent_qos
8013	*
8014	* @brief
8015	* The kevent_qos() syscall.
8016	*/
8017	int
8018	kevent_qos(struct proc p, struct* kevent_qos_args uap, int32_t retval)
8019	{
8020	uthread_t uth = current_uthread();
8021	kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
8022	int error, flags = uap->flags & KEVENT_FLAG_USER;
8023	struct kqueue *kq;
8024
8025	if (__improbable(flags & KEVENT_ID_FLAG_USER)) {
8026	return EINVAL;
8027	}
8028
8029	flags = kevent_adjust_flags_for_proc(p, flags);
8030
8031	error = kevent_get_data_size(flags, data_avail: uap->data_available, data_out: uap->data_out, kectx);
8032	if (__improbable(error)) {
8033	return error;
8034	}
8035
8036	kectx->kec_deadline = `0`;
8037	kectx->kec_fp = NULL;
8038	kectx->kec_fd = uap->fd;
8039	/ the kec_process_* fields are filled if kqueue_scann is called only /
8040
8041	/ get the kq we are going to be working on /
8042	if (__probable(flags & KEVENT_FLAG_WORKQ)) {
8043	error = kevent_get_kqwq(p, flags, nevents: uap->nevents, kqp: &kq);
8044	} else {
8045	error = kevent_get_kqfile(p, fd: uap->fd, flags, fpp: &kectx->kec_fp, kqp: &kq);
8046	}
8047	if (__improbable(error)) {
8048	return error;
8049	}
8050
8051	return kevent_modern_internal(kqu: kq, changelist: uap->changelist, nchanges: uap->nchanges,
8052	ueventlist: uap->eventlist, nevents: uap->nevents, flags, kectx, retval);
8053	}
8054
8055	#pragma mark legacy syscalls: kevent, kevent64
8056
8057	/!*
8058	* @function kevent_legacy_get_deadline
8059	*
8060	* @brief
8061	* Compute the deadline for the legacy kevent syscalls.
8062	*
8063	* @discussion
8064	* This is not necessary if KEVENT_FLAG_IMMEDIATE is specified,
8065	* as this takes precedence over the deadline.
8066	*
8067	* This function will fail if utimeout is USER_ADDR_NULL
8068	* (the caller should check).
8069	*/
8070	static int
8071	kevent_legacy_get_deadline(int flags, user_addr_t utimeout, uint64_t *deadline)
8072	{
8073	struct timespec ts;
8074
8075	if (flags & KEVENT_FLAG_PROC64) {
8076	struct user64_timespec ts64;
8077	int error = copyin(utimeout, &ts64, sizeof(ts64));
8078	if (__improbable(error)) {
8079	return error;
8080	}
8081	ts.tv_sec = (unsigned long)ts64.tv_sec;
8082	ts.tv_nsec = (long)ts64.tv_nsec;
8083	} else {
8084	struct user32_timespec ts32;
8085	int error = copyin(utimeout, &ts32, sizeof(ts32));
8086	if (__improbable(error)) {
8087	return error;
8088	}
8089	ts.tv_sec = ts32.tv_sec;
8090	ts.tv_nsec = ts32.tv_nsec;
8091	}
8092	if (!timespec_is_valid(&ts)) {
8093	return EINVAL;
8094	}
8095
8096	clock_absolutetime_interval_to_deadline(abstime: tstoabstime(&ts), result: deadline);
8097	return `0`;
8098	}
8099
8100	/!*
8101	* @function kevent_legacy_internal
8102	*
8103	* @brief
8104	* The core implementation for kevent and kevent64
8105	*/
8106	OS_NOINLINE
8107	static int
8108	kevent_legacy_internal(struct proc p, struct* kevent64_args *uap,
8109	int32_t retval, int* flags)
8110	{
8111	uthread_t uth = current_uthread();
8112	kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
8113	struct kqueue *kq;
8114	int error;
8115
8116	if (__improbable(uap->flags & KEVENT_ID_FLAG_USER)) {
8117	return EINVAL;
8118	}
8119
8120	flags = kevent_adjust_flags_for_proc(p, flags);
8121
8122	kectx->kec_data_out = `0`;
8123	kectx->kec_data_avail = `0`;
8124	kectx->kec_data_size = `0`;
8125	kectx->kec_data_resid = `0`;
8126	kectx->kec_deadline = `0`;
8127	kectx->kec_fp = NULL;
8128	kectx->kec_fd = uap->fd;
8129	/ the kec_process_* fields are filled if kqueue_scann is called only /
8130
8131	/ convert timeout to absolute - if we have one (and not immediate) /
8132	if (__improbable(uap->timeout && !(flags & KEVENT_FLAG_IMMEDIATE))) {
8133	error = kevent_legacy_get_deadline(flags, utimeout: uap->timeout,
8134	deadline: &kectx->kec_deadline);
8135	if (__improbable(error)) {
8136	return error;
8137	}
8138	}
8139
8140	/ get the kq we are going to be working on /
8141	if (flags & KEVENT_FLAG_WORKQ) {
8142	error = kevent_get_kqwq(p, flags, nevents: uap->nevents, kqp: &kq);
8143	} else {
8144	error = kevent_get_kqfile(p, fd: uap->fd, flags, fpp: &kectx->kec_fp, kqp: &kq);
8145	}
8146	if (__improbable(error)) {
8147	return error;
8148	}
8149
8150	return kevent_internal(kqu: kq, changelist: uap->changelist, nchanges: uap->nchanges,
8151	ueventlist: uap->eventlist, nevents: uap->nevents, flags, kectx, retval,
8152	/legacy/ true);
8153	}
8154
8155	/!*
8156	* @function kevent
8157	*
8158	* @brief
8159	* The legacy kevent() syscall.
8160	*/
8161	int
8162	kevent(struct proc p, struct* kevent_args uap, int32_t retval)
8163	{
8164	struct kevent64_args args = {
8165	.fd = uap->fd,
8166	.changelist = uap->changelist,
8167	.nchanges = uap->nchanges,
8168	.eventlist = uap->eventlist,
8169	.nevents = uap->nevents,
8170	.timeout = uap->timeout,
8171	};
8172
8173	return kevent_legacy_internal(p, uap: &args, retval, KEVENT_FLAG_LEGACY32);
8174	}
8175
8176	/!*
8177	* @function kevent64
8178	*
8179	* @brief
8180	* The legacy kevent64() syscall.
8181	*/
8182	int
8183	kevent64(struct proc p, struct* kevent64_args uap, int32_t retval)
8184	{
8185	int flags = (uap->flags & KEVENT_FLAG_USER) \| KEVENT_FLAG_LEGACY64;
8186	return kevent_legacy_internal(p, uap, retval, flags);
8187	}
8188
8189	#pragma mark - socket interface
8190
8191	#if SOCKETS
8192	#include <sys/param.h>
8193	#include <sys/socket.h>
8194	#include <sys/protosw.h>
8195	#include <sys/domain.h>
8196	#include <sys/mbuf.h>
8197	#include <sys/kern_event.h>
8198	#include <sys/malloc.h>
8199	#include <sys/sys_domain.h>
8200	#include <sys/syslog.h>
8201
8202	#ifndef ROUNDUP64
8203	#define ROUNDUP64(x) P2ROUNDUP((x), sizeof (u_int64_t))
8204	#endif
8205
8206	#ifndef ADVANCE64
8207	#define ADVANCE64(p, n) (void)((char )(p) + ROUNDUP64(n))
8208	#endif
8209
8210	static LCK_GRP_DECLARE(kev_lck_grp, "Kernel Event Protocol");
8211	static LCK_RW_DECLARE(kev_rwlock, &kev_lck_grp);
8212
8213	static int kev_attach(struct socket so, int* proto, struct proc *p);
8214	static int kev_detach(struct socket *so);
8215	static int kev_control(struct socket *so, u_long cmd, caddr_t data,
8216	struct ifnet ifp, struct* proc *p);
8217	static lck_mtx_t * event_getlock(struct socket , int*);
8218	static int event_lock(struct socket , int, void* *);
8219	static int event_unlock(struct socket , int, void* *);
8220
8221	static int event_sofreelastref(struct socket *);
8222	static void kev_delete(struct kern_event_pcb *);
8223
8224	static struct pr_usrreqs event_usrreqs = {
8225	.pru_attach = kev_attach,
8226	.pru_control = kev_control,
8227	.pru_detach = kev_detach,
8228	.pru_soreceive = soreceive,
8229	};
8230
8231	static struct protosw eventsw[] = {
8232	{
8233	.pr_type = SOCK_RAW,
8234	.pr_protocol = SYSPROTO_EVENT,
8235	.pr_flags = PR_ATOMIC,
8236	.pr_usrreqs = &event_usrreqs,
8237	.pr_lock = event_lock,
8238	.pr_unlock = event_unlock,
8239	.pr_getlock = event_getlock,
8240	}
8241	};
8242
8243	__private_extern__ int kevt_getstat SYSCTL_HANDLER_ARGS;
8244	__private_extern__ int kevt_pcblist SYSCTL_HANDLER_ARGS;
8245
8246	SYSCTL_NODE(_net_systm, OID_AUTO, kevt,
8247	CTLFLAG_RW \| CTLFLAG_LOCKED, `0`, "Kernel event family");
8248
8249	struct kevtstat kevtstat;
8250	SYSCTL_PROC(_net_systm_kevt, OID_AUTO, stats,
8251	CTLTYPE_STRUCT \| CTLFLAG_RD \| CTLFLAG_LOCKED, `0`, `0`,
8252	kevt_getstat, "S,kevtstat", "");
8253
8254	SYSCTL_PROC(_net_systm_kevt, OID_AUTO, pcblist,
8255	CTLTYPE_STRUCT \| CTLFLAG_RD \| CTLFLAG_LOCKED, `0`, `0`,
8256	kevt_pcblist, "S,xkevtpcb", "");
8257
8258	static lck_mtx_t *
8259	event_getlock(struct socket so, int* flags)
8260	{
8261	#pragma unused(flags)
8262	struct kern_event_pcb ev_pcb = (struct* kern_event_pcb *)so->so_pcb;
8263
8264	if (so->so_pcb != NULL) {
8265	if (so->so_usecount < `0`) {
8266	panic("%s: so=%p usecount=%d lrh= %s", __func__,
8267	so, so->so_usecount, solockhistory_nr(so));
8268	}
8269	/ NOTREACHED /
8270	} else {
8271	panic("%s: so=%p NULL NO so_pcb %s", __func__,
8272	so, solockhistory_nr(so));
8273	/ NOTREACHED /
8274	}
8275	return &ev_pcb->evp_mtx;
8276	}
8277
8278	static int
8279	event_lock(struct socket so, int* refcount, void *lr)
8280	{
8281	void *lr_saved;
8282
8283	if (lr == NULL) {
8284	lr_saved = __builtin_return_address(`0`);
8285	} else {
8286	lr_saved = lr;
8287	}
8288
8289	if (so->so_pcb != NULL) {
8290	lck_mtx_lock(lck: &((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
8291	} else {
8292	panic("%s: so=%p NO PCB! lr=%p lrh= %s", __func__,
8293	so, lr_saved, solockhistory_nr(so));
8294	/ NOTREACHED /
8295	}
8296
8297	if (so->so_usecount < `0`) {
8298	panic("%s: so=%p so_pcb=%p lr=%p ref=%d lrh= %s", __func__,
8299	so, so->so_pcb, lr_saved, so->so_usecount,
8300	solockhistory_nr(so));
8301	/ NOTREACHED /
8302	}
8303
8304	if (refcount) {
8305	so->so_usecount++;
8306	}
8307
8308	so->lock_lr[so->next_lock_lr] = lr_saved;
8309	so->next_lock_lr = (so->next_lock_lr + `1`) % SO_LCKDBG_MAX;
8310	return `0`;
8311	}
8312
8313	static int
8314	event_unlock(struct socket so, int* refcount, void *lr)
8315	{
8316	void *lr_saved;
8317	lck_mtx_t *mutex_held;
8318
8319	if (lr == NULL) {
8320	lr_saved = __builtin_return_address(`0`);
8321	} else {
8322	lr_saved = lr;
8323	}
8324
8325	if (refcount) {
8326	so->so_usecount--;
8327	}
8328	if (so->so_usecount < `0`) {
8329	panic("%s: so=%p usecount=%d lrh= %s", __func__,
8330	so, so->so_usecount, solockhistory_nr(so));
8331	/ NOTREACHED /
8332	}
8333	if (so->so_pcb == NULL) {
8334	panic("%s: so=%p NO PCB usecount=%d lr=%p lrh= %s", __func__,
8335	so, so->so_usecount, (void *)lr_saved,
8336	solockhistory_nr(so));
8337	/ NOTREACHED /
8338	}
8339	mutex_held = (&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
8340
8341	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
8342	so->unlock_lr[so->next_unlock_lr] = lr_saved;
8343	so->next_unlock_lr = (so->next_unlock_lr + `1`) % SO_LCKDBG_MAX;
8344
8345	if (so->so_usecount == `0`) {
8346	VERIFY(so->so_flags & SOF_PCBCLEARING);
8347	event_sofreelastref(so);
8348	} else {
8349	lck_mtx_unlock(lck: mutex_held);
8350	}
8351
8352	return `0`;
8353	}
8354
8355	static int
8356	event_sofreelastref(struct socket *so)
8357	{
8358	struct kern_event_pcb ev_pcb = (struct* kern_event_pcb *)so->so_pcb;
8359
8360	LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_OWNED);
8361
8362	so->so_pcb = NULL;
8363
8364	/*
8365	* Disable upcall in the event another thread is in kev_post_msg()
8366	* appending record to the receive socket buffer, since sbwakeup()
8367	* may release the socket lock otherwise.
8368	*/
8369	so->so_rcv.sb_flags &= ~SB_UPCALL;
8370	so->so_snd.sb_flags &= ~SB_UPCALL;
8371	so->so_event = sonullevent;
8372	lck_mtx_unlock(lck: &(ev_pcb->evp_mtx));
8373
8374	LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_NOTOWNED);
8375	lck_rw_lock_exclusive(lck: &kev_rwlock);
8376	LIST_REMOVE(ev_pcb, evp_link);
8377	kevtstat.kes_pcbcount--;
8378	kevtstat.kes_gencnt++;
8379	lck_rw_done(lck: &kev_rwlock);
8380	kev_delete(ev_pcb);
8381
8382	sofreelastref(so, `1`);
8383	return `0`;
8384	}
8385
8386	static int event_proto_count = (sizeof(eventsw) / sizeof(struct protosw));
8387
8388	static
8389	struct kern_event_head kern_event_head;
8390
8391	static u_int32_t static_event_id = `0`;
8392
8393	static KALLOC_TYPE_DEFINE(ev_pcb_zone, struct kern_event_pcb, NET_KT_DEFAULT);
8394
8395	/*
8396	* Install the protosw's for the NKE manager. Invoked at extension load time
8397	*/
8398	void
8399	kern_event_init(struct domain *dp)
8400	{
8401	struct protosw *pr;
8402	int i;
8403
8404	VERIFY(!(dp->dom_flags & DOM_INITIALIZED));
8405	VERIFY(dp == systemdomain);
8406
8407	for (i = `0`, pr = &eventsw[`0`]; i < event_proto_count; i++, pr++) {
8408	net_add_proto(pr, dp, `1`);
8409	}
8410	}
8411
8412	static int
8413	kev_attach(struct socket so, __unused int* proto, __unused struct proc *p)
8414	{
8415	int error = `0`;
8416	struct kern_event_pcb *ev_pcb;
8417
8418	error = soreserve(so, KEV_SNDSPACE, KEV_RECVSPACE);
8419	if (error != `0`) {
8420	return error;
8421	}
8422
8423	ev_pcb = zalloc_flags(ev_pcb_zone, Z_WAITOK \| Z_ZERO);
8424	lck_mtx_init(lck: &ev_pcb->evp_mtx, grp: &kev_lck_grp, LCK_ATTR_NULL);
8425
8426	ev_pcb->evp_socket = so;
8427	ev_pcb->evp_vendor_code_filter = `0xffffffff`;
8428
8429	so->so_pcb = (caddr_t) ev_pcb;
8430	lck_rw_lock_exclusive(lck: &kev_rwlock);
8431	LIST_INSERT_HEAD(&kern_event_head, ev_pcb, evp_link);
8432	kevtstat.kes_pcbcount++;
8433	kevtstat.kes_gencnt++;
8434	lck_rw_done(lck: &kev_rwlock);
8435
8436	return error;
8437	}
8438
8439	static void
8440	kev_delete(struct kern_event_pcb *ev_pcb)
8441	{
8442	VERIFY(ev_pcb != NULL);
8443	lck_mtx_destroy(lck: &ev_pcb->evp_mtx, grp: &kev_lck_grp);
8444	zfree(ev_pcb_zone, ev_pcb);
8445	}
8446
8447	static int
8448	kev_detach(struct socket *so)
8449	{
8450	struct kern_event_pcb ev_pcb = (struct* kern_event_pcb *) so->so_pcb;
8451
8452	if (ev_pcb != NULL) {
8453	soisdisconnected(so);
8454	so->so_flags \|= SOF_PCBCLEARING;
8455	}
8456
8457	return `0`;
8458	}
8459
8460	/*
8461	* For now, kev_vendor_code and mbuf_tags use the same
8462	* mechanism.
8463	*/
8464	errno_t
8465	kev_vendor_code_find(
8466	const char *string,
8467	u_int32_t *out_vendor_code)
8468	{
8469	if (strlen(s: string) >= KEV_VENDOR_CODE_MAX_STR_LEN) {
8470	return EINVAL;
8471	}
8472	return net_str_id_find_internal(string, out_vendor_code,
8473	NSI_VENDOR_CODE, `1`);
8474	}
8475
8476	errno_t
8477	kev_msg_post(struct kev_msg *event_msg)
8478	{
8479	mbuf_tag_id_t min_vendor, max_vendor;
8480
8481	net_str_id_first_last(&min_vendor, &max_vendor, NSI_VENDOR_CODE);
8482
8483	if (event_msg == NULL) {
8484	return EINVAL;
8485	}
8486
8487	/*
8488	* Limit third parties to posting events for registered vendor codes
8489	* only
8490	*/
8491	if (event_msg->vendor_code < min_vendor \|\|
8492	event_msg->vendor_code > max_vendor) {
8493	os_atomic_inc(&kevtstat.kes_badvendor, relaxed);
8494	return EINVAL;
8495	}
8496	return kev_post_msg(event: event_msg);
8497	}
8498
8499	static int
8500	kev_post_msg_internal(struct kev_msg event_msg, int* wait)
8501	{
8502	struct mbuf m, m2;
8503	struct kern_event_pcb *ev_pcb;
8504	struct kern_event_msg *ev;
8505	char *tmp;
8506	u_int32_t total_size;
8507	int i;
8508
8509	#if SKYWALK && defined(XNU_TARGET_OS_OSX)
8510	/*
8511	* Special hook for ALF state updates
8512	*/
8513	if (event_msg->vendor_code == KEV_VENDOR_APPLE &&
8514	event_msg->kev_class == KEV_NKE_CLASS &&
8515	event_msg->kev_subclass == KEV_NKE_ALF_SUBCLASS &&
8516	event_msg->event_code == KEV_NKE_ALF_STATE_CHANGED) {
8517	#if (DEBUG \|\| DEVELOPMENT)
8518	os_log_info(OS_LOG_DEFAULT, "KEV_NKE_ALF_STATE_CHANGED posted");
8519	#endif /* DEBUG \|\| DEVELOPMENT */
8520	net_filter_event_mark(subsystem: NET_FILTER_EVENT_ALF,
8521	compatible: net_check_compatible_alf());
8522	}
8523	#endif /* SKYWALK && XNU_TARGET_OS_OSX */
8524
8525	/ Verify the message is small enough to fit in one mbuf w/o cluster /
8526	total_size = KEV_MSG_HEADER_SIZE;
8527
8528	for (i = `0`; i < `5`; i++) {
8529	if (event_msg->dv[i].data_length == `0`) {
8530	break;
8531	}
8532	total_size += event_msg->dv[i].data_length;
8533	}
8534
8535	if (total_size > MLEN) {
8536	os_atomic_inc(&kevtstat.kes_toobig, relaxed);
8537	return EMSGSIZE;
8538	}
8539
8540	m = m_get(wait, MT_DATA);
8541	if (m == `0`) {
8542	os_atomic_inc(&kevtstat.kes_nomem, relaxed);
8543	return ENOMEM;
8544	}
8545	ev = mtod(m, struct kern_event_msg *);
8546	total_size = KEV_MSG_HEADER_SIZE;
8547
8548	tmp = (char *) &ev->event_data[`0`];
8549	for (i = `0`; i < `5`; i++) {
8550	if (event_msg->dv[i].data_length == `0`) {
8551	break;
8552	}
8553
8554	total_size += event_msg->dv[i].data_length;
8555	bcopy(src: event_msg->dv[i].data_ptr, dst: tmp,
8556	n: event_msg->dv[i].data_length);
8557	tmp += event_msg->dv[i].data_length;
8558	}
8559
8560	ev->id = ++static_event_id;
8561	ev->total_size = total_size;
8562	ev->vendor_code = event_msg->vendor_code;
8563	ev->kev_class = event_msg->kev_class;
8564	ev->kev_subclass = event_msg->kev_subclass;
8565	ev->event_code = event_msg->event_code;
8566
8567	m->m_len = total_size;
8568	lck_rw_lock_shared(lck: &kev_rwlock);
8569	for (ev_pcb = LIST_FIRST(&kern_event_head);
8570	ev_pcb;
8571	ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
8572	lck_mtx_lock(lck: &ev_pcb->evp_mtx);
8573	if (ev_pcb->evp_socket->so_pcb == NULL) {
8574	lck_mtx_unlock(lck: &ev_pcb->evp_mtx);
8575	continue;
8576	}
8577	if (ev_pcb->evp_vendor_code_filter != KEV_ANY_VENDOR) {
8578	if (ev_pcb->evp_vendor_code_filter != ev->vendor_code) {
8579	lck_mtx_unlock(lck: &ev_pcb->evp_mtx);
8580	continue;
8581	}
8582
8583	if (ev_pcb->evp_class_filter != KEV_ANY_CLASS) {
8584	if (ev_pcb->evp_class_filter != ev->kev_class) {
8585	lck_mtx_unlock(lck: &ev_pcb->evp_mtx);
8586	continue;
8587	}
8588
8589	if ((ev_pcb->evp_subclass_filter !=
8590	KEV_ANY_SUBCLASS) &&
8591	(ev_pcb->evp_subclass_filter !=
8592	ev->kev_subclass)) {
8593	lck_mtx_unlock(lck: &ev_pcb->evp_mtx);
8594	continue;
8595	}
8596	}
8597	}
8598
8599	m2 = m_copym(m, `0`, m->m_len, wait);
8600	if (m2 == `0`) {
8601	os_atomic_inc(&kevtstat.kes_nomem, relaxed);
8602	m_free(m);
8603	lck_mtx_unlock(lck: &ev_pcb->evp_mtx);
8604	lck_rw_done(lck: &kev_rwlock);
8605	return ENOMEM;
8606	}
8607	if (sbappendrecord(sb: &ev_pcb->evp_socket->so_rcv, m0: m2)) {
8608	/*
8609	* We use "m" for the socket stats as it would be
8610	* unsafe to use "m2"
8611	*/
8612	so_inc_recv_data_stat(ev_pcb->evp_socket,
8613	`1`, m->m_len, MBUF_TC_BE);
8614
8615	sorwakeup(so: ev_pcb->evp_socket);
8616	os_atomic_inc(&kevtstat.kes_posted, relaxed);
8617	} else {
8618	os_atomic_inc(&kevtstat.kes_fullsock, relaxed);
8619	}
8620	lck_mtx_unlock(lck: &ev_pcb->evp_mtx);
8621	}
8622	m_free(m);
8623	lck_rw_done(lck: &kev_rwlock);
8624
8625	return `0`;
8626	}
8627
8628	int
8629	kev_post_msg(struct kev_msg *event_msg)
8630	{
8631	return kev_post_msg_internal(event_msg, M_WAIT);
8632	}
8633
8634	int
8635	kev_post_msg_nowait(struct kev_msg *event_msg)
8636	{
8637	return kev_post_msg_internal(event_msg, M_NOWAIT);
8638	}
8639
8640	static int
8641	kev_control(struct socket *so,
8642	u_long cmd,
8643	caddr_t data,
8644	__unused struct ifnet *ifp,
8645	__unused struct proc *p)
8646	{
8647	struct kev_request kev_req = (struct* kev_request *) data;
8648	struct kern_event_pcb *ev_pcb;
8649	struct kev_vendor_code *kev_vendor;
8650	u_int32_t id_value = (u_int32_t ) data;
8651
8652	switch (cmd) {
8653	case SIOCGKEVID:
8654	*id_value = static_event_id;
8655	break;
8656	case SIOCSKEVFILT:
8657	ev_pcb = (struct kern_event_pcb *) so->so_pcb;
8658	ev_pcb->evp_vendor_code_filter = kev_req->vendor_code;
8659	ev_pcb->evp_class_filter = kev_req->kev_class;
8660	ev_pcb->evp_subclass_filter = kev_req->kev_subclass;
8661	break;
8662	case SIOCGKEVFILT:
8663	ev_pcb = (struct kern_event_pcb *) so->so_pcb;
8664	kev_req->vendor_code = ev_pcb->evp_vendor_code_filter;
8665	kev_req->kev_class = ev_pcb->evp_class_filter;
8666	kev_req->kev_subclass = ev_pcb->evp_subclass_filter;
8667	break;
8668	case SIOCGKEVVENDOR:
8669	kev_vendor = (struct kev_vendor_code *)data;
8670	/ Make sure string is NULL terminated /
8671	kev_vendor->vendor_string[KEV_VENDOR_CODE_MAX_STR_LEN - `1`] = `0`;
8672	return net_str_id_find_internal(kev_vendor->vendor_string,
8673	&kev_vendor->vendor_code, NSI_VENDOR_CODE, `0`);
8674	default:
8675	return ENOTSUP;
8676	}
8677
8678	return `0`;
8679	}
8680
8681	int
8682	kevt_getstat SYSCTL_HANDLER_ARGS
8683	{
8684	#pragma unused(oidp, arg1, arg2)
8685	int error = `0`;
8686
8687	lck_rw_lock_shared(lck: &kev_rwlock);
8688
8689	if (req->newptr != USER_ADDR_NULL) {
8690	error = EPERM;
8691	goto done;
8692	}
8693	if (req->oldptr == USER_ADDR_NULL) {
8694	req->oldidx = sizeof(struct kevtstat);
8695	goto done;
8696	}
8697
8698	error = SYSCTL_OUT(req, &kevtstat,
8699	MIN(sizeof(struct kevtstat), req->oldlen));
8700	done:
8701	lck_rw_done(lck: &kev_rwlock);
8702
8703	return error;
8704	}
8705
8706	__private_extern__ int
8707	kevt_pcblist SYSCTL_HANDLER_ARGS
8708	{
8709	#pragma unused(oidp, arg1, arg2)
8710	int error = `0`;
8711	uint64_t n, i;
8712	struct xsystmgen xsg;
8713	void *buf = NULL;
8714	size_t item_size = ROUNDUP64(sizeof(struct xkevtpcb)) +
8715	ROUNDUP64(sizeof(struct xsocket_n)) +
8716	`2` * ROUNDUP64(sizeof(struct xsockbuf_n)) +
8717	ROUNDUP64(sizeof(struct xsockstat_n));
8718	struct kern_event_pcb *ev_pcb;
8719
8720	buf = kalloc_data(item_size, Z_WAITOK \| Z_ZERO);
8721	if (buf == NULL) {
8722	return ENOMEM;
8723	}
8724
8725	lck_rw_lock_shared(lck: &kev_rwlock);
8726
8727	n = kevtstat.kes_pcbcount;
8728
8729	if (req->oldptr == USER_ADDR_NULL) {
8730	req->oldidx = (size_t) ((n + n / `8`) * item_size);
8731	goto done;
8732	}
8733	if (req->newptr != USER_ADDR_NULL) {
8734	error = EPERM;
8735	goto done;
8736	}
8737	bzero(s: &xsg, n: sizeof(xsg));
8738	xsg.xg_len = sizeof(xsg);
8739	xsg.xg_count = n;
8740	xsg.xg_gen = kevtstat.kes_gencnt;
8741	xsg.xg_sogen = so_gencnt;
8742	error = SYSCTL_OUT(req, &xsg, sizeof(xsg));
8743	if (error) {
8744	goto done;
8745	}
8746	/*
8747	* We are done if there is no pcb
8748	*/
8749	if (n == `0`) {
8750	goto done;
8751	}
8752
8753	i = `0`;
8754	for (i = `0`, ev_pcb = LIST_FIRST(&kern_event_head);
8755	i < n && ev_pcb != NULL;
8756	i++, ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
8757	struct xkevtpcb xk = (struct* xkevtpcb *)buf;
8758	struct xsocket_n xso = (struct* xsocket_n *)
8759	ADVANCE64(xk, sizeof(*xk));
8760	struct xsockbuf_n xsbrcv = (struct* xsockbuf_n *)
8761	ADVANCE64(xso, sizeof(*xso));
8762	struct xsockbuf_n xsbsnd = (struct* xsockbuf_n *)
8763	ADVANCE64(xsbrcv, sizeof(*xsbrcv));
8764	struct xsockstat_n xsostats = (struct* xsockstat_n *)
8765	ADVANCE64(xsbsnd, sizeof(*xsbsnd));
8766
8767	bzero(s: buf, n: item_size);
8768
8769	lck_mtx_lock(lck: &ev_pcb->evp_mtx);
8770
8771	xk->kep_len = sizeof(struct xkevtpcb);
8772	xk->kep_kind = XSO_EVT;
8773	xk->kep_evtpcb = (uint64_t)VM_KERNEL_ADDRHASH(ev_pcb);
8774	xk->kep_vendor_code_filter = ev_pcb->evp_vendor_code_filter;
8775	xk->kep_class_filter = ev_pcb->evp_class_filter;
8776	xk->kep_subclass_filter = ev_pcb->evp_subclass_filter;
8777
8778	sotoxsocket_n(ev_pcb->evp_socket, xso);
8779	sbtoxsockbuf_n(ev_pcb->evp_socket ?
8780	&ev_pcb->evp_socket->so_rcv : NULL, xsbrcv);
8781	sbtoxsockbuf_n(ev_pcb->evp_socket ?
8782	&ev_pcb->evp_socket->so_snd : NULL, xsbsnd);
8783	sbtoxsockstat_n(ev_pcb->evp_socket, xsostats);
8784
8785	lck_mtx_unlock(lck: &ev_pcb->evp_mtx);
8786
8787	error = SYSCTL_OUT(req, buf, item_size);
8788	}
8789
8790	if (error == `0`) {
8791	/*
8792	* Give the user an updated idea of our state.
8793	* If the generation differs from what we told
8794	* her before, she knows that something happened
8795	* while we were processing this request, and it
8796	* might be necessary to retry.
8797	*/
8798	bzero(s: &xsg, n: sizeof(xsg));
8799	xsg.xg_len = sizeof(xsg);
8800	xsg.xg_count = n;
8801	xsg.xg_gen = kevtstat.kes_gencnt;
8802	xsg.xg_sogen = so_gencnt;
8803	error = SYSCTL_OUT(req, &xsg, sizeof(xsg));
8804	if (error) {
8805	goto done;
8806	}
8807	}
8808
8809	done:
8810	lck_rw_done(lck: &kev_rwlock);
8811
8812	kfree_data(buf, item_size);
8813	return error;
8814	}
8815
8816	#endif /* SOCKETS */
8817
8818
8819	int
8820	fill_kqueueinfo(kqueue_t kqu, struct kqueue_info * kinfo)
8821	{
8822	struct vinfo_stat * st;
8823
8824	st = &kinfo->kq_stat;
8825
8826	st->vst_size = kqu.kq->kq_count;
8827	if (kqu.kq->kq_state & KQ_KEV_QOS) {
8828	st->vst_blksize = sizeof(struct kevent_qos_s);
8829	} else if (kqu.kq->kq_state & KQ_KEV64) {
8830	st->vst_blksize = sizeof(struct kevent64_s);
8831	} else {
8832	st->vst_blksize = sizeof(struct kevent);
8833	}
8834	st->vst_mode = S_IFIFO;
8835	st->vst_ino = (kqu.kq->kq_state & KQ_DYNAMIC) ?
8836	kqu.kqwl->kqwl_dynamicid : `0`;
8837
8838	/ flags exported to libproc as PROC_KQUEUE_* (sys/proc_info.h) /
8839	#define PROC_KQUEUE_MASK (KQ_SLEEP\|KQ_KEV32\|KQ_KEV64\|KQ_KEV_QOS\|KQ_WORKQ\|KQ_WORKLOOP)
8840	static_assert(PROC_KQUEUE_SLEEP == KQ_SLEEP);
8841	static_assert(PROC_KQUEUE_32 == KQ_KEV32);
8842	static_assert(PROC_KQUEUE_64 == KQ_KEV64);
8843	static_assert(PROC_KQUEUE_QOS == KQ_KEV_QOS);
8844	static_assert(PROC_KQUEUE_WORKQ == KQ_WORKQ);
8845	static_assert(PROC_KQUEUE_WORKLOOP == KQ_WORKLOOP);
8846	kinfo->kq_state = kqu.kq->kq_state & PROC_KQUEUE_MASK;
8847	if ((kqu.kq->kq_state & (KQ_WORKLOOP \| KQ_WORKQ)) == `0`) {
8848	if (kqu.kqf->kqf_sel.si_flags & SI_RECORDED) {
8849	kinfo->kq_state \|= PROC_KQUEUE_SELECT;
8850	}
8851	}
8852
8853	return `0`;
8854	}
8855
8856	static int
8857	fill_kqueue_dyninfo(struct kqworkloop kqwl, struct* kqueue_dyninfo *kqdi)
8858	{
8859	workq_threadreq_t kqr = &kqwl->kqwl_request;
8860	workq_threadreq_param_t trp = {};
8861	int err;
8862
8863	if ((kqwl->kqwl_state & KQ_WORKLOOP) == `0`) {
8864	return EINVAL;
8865	}
8866
8867	if ((err = fill_kqueueinfo(kqu: &kqwl->kqwl_kqueue, kinfo: &kqdi->kqdi_info))) {
8868	return err;
8869	}
8870
8871	kqlock(kqu: kqwl);
8872
8873	kqdi->kqdi_servicer = thread_tid(thread: kqr_thread(kqr));
8874	kqdi->kqdi_owner = thread_tid(thread: kqwl->kqwl_owner);
8875	kqdi->kqdi_request_state = kqr->tr_state;
8876	kqdi->kqdi_async_qos = kqr->tr_kq_qos_index;
8877	kqdi->kqdi_events_qos = kqr->tr_kq_override_index;
8878	kqdi->kqdi_sync_waiters = `0`;
8879	kqdi->kqdi_sync_waiter_qos = `0`;
8880
8881	trp.trp_value = kqwl->kqwl_params;
8882	if (trp.trp_flags & TRP_PRIORITY) {
8883	kqdi->kqdi_pri = trp.trp_pri;
8884	} else {
8885	kqdi->kqdi_pri = `0`;
8886	}
8887
8888	if (trp.trp_flags & TRP_POLICY) {
8889	kqdi->kqdi_pol = trp.trp_pol;
8890	} else {
8891	kqdi->kqdi_pol = `0`;
8892	}
8893
8894	if (trp.trp_flags & TRP_CPUPERCENT) {
8895	kqdi->kqdi_cpupercent = trp.trp_cpupercent;
8896	} else {
8897	kqdi->kqdi_cpupercent = `0`;
8898	}
8899
8900	kqunlock(kqu: kqwl);
8901
8902	return `0`;
8903	}
8904
8905
8906	static unsigned long
8907	kevent_extinfo_emit(struct kqueue kq, struct* knote kn, struct* kevent_extinfo *buf,
8908	unsigned long buflen, unsigned long nknotes)
8909	{
8910	for (; kn; kn = SLIST_NEXT(kn, kn_link)) {
8911	if (kq == knote_get_kq(kn)) {
8912	if (nknotes < buflen) {
8913	struct kevent_extinfo *info = &buf[nknotes];
8914
8915	kqlock(kqu: kq);
8916
8917	if (knote_fops(kn)->f_sanitized_copyout) {
8918	knote_fops(kn)->f_sanitized_copyout(kn, &info->kqext_kev);
8919	} else {
8920	info->kqext_kev = (struct* kevent_qos_s *)&kn->kn_kevent;
8921	}
8922
8923	if (knote_has_qos(kn)) {
8924	info->kqext_kev.qos =
8925	_pthread_priority_thread_qos_fast(pp: kn->kn_qos);
8926	} else {
8927	info->kqext_kev.qos = kn->kn_qos_override;
8928	}
8929	info->kqext_kev.filter \|= `0xff00`; / sign extend filter /
8930	info->kqext_kev.xflags = `0`; / this is where sfflags lives /
8931	info->kqext_kev.data = `0`; / this is where sdata lives /
8932	info->kqext_sdata = kn->kn_sdata;
8933	info->kqext_status = kn->kn_status;
8934	info->kqext_sfflags = kn->kn_sfflags;
8935
8936	kqunlock(kqu: kq);
8937	}
8938
8939	/ we return total number of knotes, which may be more than requested /
8940	nknotes++;
8941	}
8942	}
8943
8944	return nknotes;
8945	}
8946
8947	int
8948	kevent_copyout_proc_dynkqids(void *proc, user_addr_t ubuf, uint32_t ubufsize,
8949	int32_t *nkqueues_out)
8950	{
8951	proc_t p = (proc_t)proc;
8952	struct filedesc *fdp = &p->p_fd;
8953	unsigned int nkqueues = `0`;
8954	unsigned long ubuflen = ubufsize / sizeof(kqueue_id_t);
8955	size_t buflen, bufsize;
8956	kqueue_id_t *kq_ids = NULL;
8957	int err = `0`;
8958
8959	assert(p != NULL);
8960
8961	if (ubuf == USER_ADDR_NULL && ubufsize != `0`) {
8962	err = EINVAL;
8963	goto out;
8964	}
8965
8966	buflen = MIN(ubuflen, PROC_PIDDYNKQUEUES_MAX);
8967
8968	if (ubuflen != `0`) {
8969	if (os_mul_overflow(sizeof(kqueue_id_t), buflen, &bufsize)) {
8970	err = ERANGE;
8971	goto out;
8972	}
8973	kq_ids = (kqueue_id_t *)kalloc_data(bufsize, Z_WAITOK \| Z_ZERO);
8974	if (!kq_ids) {
8975	err = ENOMEM;
8976	goto out;
8977	}
8978	}
8979
8980	kqhash_lock(fdp);
8981
8982	u_long kqhashmask = fdp->fd_kqhashmask;
8983	if (kqhashmask > `0`) {
8984	for (uint32_t i = `0`; i < kqhashmask + `1`; i++) {
8985	struct kqworkloop *kqwl;
8986
8987	LIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
8988	/ report the number of kqueues, even if they don't all fit /
8989	if (nkqueues < buflen) {
8990	kq_ids[nkqueues] = kqwl->kqwl_dynamicid;
8991	}
8992	nkqueues++;
8993	}
8994
8995	/*
8996	* Drop the kqhash lock and take it again to give some breathing room
8997	*/
8998	kqhash_unlock(fdp);
8999	kqhash_lock(fdp);
9000
9001	/*
9002	* Reevaluate to see if we have raced with someone who changed this -
9003	* if we have, we should bail out with the set of info captured so far
9004	*/
9005	if (fdp->fd_kqhashmask != kqhashmask) {
9006	break;
9007	}
9008	}
9009	}
9010
9011	kqhash_unlock(fdp);
9012
9013	if (kq_ids) {
9014	size_t copysize;
9015	if (os_mul_overflow(sizeof(kqueue_id_t), MIN(buflen, nkqueues), &copysize)) {
9016	err = ERANGE;
9017	goto out;
9018	}
9019
9020	assert(ubufsize >= copysize);
9021	err = copyout(kq_ids, ubuf, copysize);
9022	}
9023
9024	out:
9025	if (kq_ids) {
9026	kfree_data(kq_ids, bufsize);
9027	}
9028
9029	if (!err) {
9030	nkqueues_out = (int*)min(a: nkqueues, PROC_PIDDYNKQUEUES_MAX);
9031	}
9032	return err;
9033	}
9034
9035	int
9036	kevent_copyout_dynkqinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
9037	uint32_t ubufsize, int32_t *size_out)
9038	{
9039	proc_t p = (proc_t)proc;
9040	struct kqworkloop *kqwl;
9041	int err = `0`;
9042	struct kqueue_dyninfo kqdi = { };
9043
9044	assert(p != NULL);
9045
9046	if (ubufsize < sizeof(struct kqueue_info)) {
9047	return ENOBUFS;
9048	}
9049
9050	kqwl = kqworkloop_hash_lookup_and_retain(fdp: &p->p_fd, kq_id);
9051	if (!kqwl) {
9052	return ESRCH;
9053	}
9054
9055	/*
9056	* backward compatibility: allow the argument to this call to only be
9057	* a struct kqueue_info
9058	*/
9059	if (ubufsize >= sizeof(struct kqueue_dyninfo)) {
9060	ubufsize = sizeof(struct kqueue_dyninfo);
9061	err = fill_kqueue_dyninfo(kqwl, kqdi: &kqdi);
9062	} else {
9063	ubufsize = sizeof(struct kqueue_info);
9064	err = fill_kqueueinfo(kqu: &kqwl->kqwl_kqueue, kinfo: &kqdi.kqdi_info);
9065	}
9066	if (err == `0` && (err = copyout(&kqdi, ubuf, ubufsize)) == `0`) {
9067	*size_out = ubufsize;
9068	}
9069	kqworkloop_release(kqwl);
9070	return err;
9071	}
9072
9073	int
9074	kevent_copyout_dynkqextinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
9075	uint32_t ubufsize, int32_t *nknotes_out)
9076	{
9077	proc_t p = (proc_t)proc;
9078	struct kqworkloop *kqwl;
9079	int err;
9080
9081	kqwl = kqworkloop_hash_lookup_and_retain(fdp: &p->p_fd, kq_id);
9082	if (!kqwl) {
9083	return ESRCH;
9084	}
9085
9086	err = pid_kqueue_extinfo(p, kq: &kqwl->kqwl_kqueue, buffer: ubuf, buffersize: ubufsize, retval: nknotes_out);
9087	kqworkloop_release(kqwl);
9088	return err;
9089	}
9090
9091	int
9092	pid_kqueue_extinfo(proc_t p, struct kqueue *kq, user_addr_t ubuf,
9093	uint32_t bufsize, int32_t *retval)
9094	{
9095	struct knote *kn;
9096	int i;
9097	int err = `0`;
9098	struct filedesc *fdp = &p->p_fd;
9099	unsigned long nknotes = `0`;
9100	unsigned long buflen = bufsize / sizeof(struct kevent_extinfo);
9101	struct kevent_extinfo *kqext = NULL;
9102
9103	/ arbitrary upper limit to cap kernel memory usage, copyout size, etc. /
9104	buflen = MIN(buflen, PROC_PIDFDKQUEUE_KNOTES_MAX);
9105
9106	kqext = (struct kevent_extinfo )kalloc_data(buflen sizeof(struct kevent_extinfo), Z_WAITOK \| Z_ZERO);
9107	if (kqext == NULL) {
9108	err = ENOMEM;
9109	goto out;
9110	}
9111
9112	proc_fdlock(p);
9113	u_long fd_knlistsize = fdp->fd_knlistsize;
9114	struct klist *fd_knlist = fdp->fd_knlist;
9115
9116	for (i = `0`; i < fd_knlistsize; i++) {
9117	kn = SLIST_FIRST(&fd_knlist[i]);
9118	nknotes = kevent_extinfo_emit(kq, kn, buf: kqext, buflen, nknotes);
9119
9120	proc_fdunlock(p);
9121	proc_fdlock(p);
9122	/*
9123	* Reevaluate to see if we have raced with someone who changed this -
9124	* if we have, we return the set of info for fd_knlistsize we knew
9125	* in the beginning except if knotes_dealloc interleaves with us.
9126	* In that case, we bail out early with the set of info captured so far.
9127	*/
9128	if (fd_knlistsize != fdp->fd_knlistsize) {
9129	if (fdp->fd_knlistsize) {
9130	/ kq_add_knote might grow fdp->fd_knlist. /
9131	fd_knlist = fdp->fd_knlist;
9132	} else {
9133	break;
9134	}
9135	}
9136	}
9137	proc_fdunlock(p);
9138
9139	knhash_lock(fdp);
9140	u_long knhashmask = fdp->fd_knhashmask;
9141
9142	if (knhashmask != `0`) {
9143	for (i = `0`; i < (int)knhashmask + `1`; i++) {
9144	kn = SLIST_FIRST(&fdp->fd_knhash[i]);
9145	nknotes = kevent_extinfo_emit(kq, kn, buf: kqext, buflen, nknotes);
9146
9147	knhash_unlock(fdp);
9148	knhash_lock(fdp);
9149
9150	/*
9151	* Reevaluate to see if we have raced with someone who changed this -
9152	* if we have, we should bail out with the set of info captured so far
9153	*/
9154	if (fdp->fd_knhashmask != knhashmask) {
9155	break;
9156	}
9157	}
9158	}
9159	knhash_unlock(fdp);
9160
9161	assert(bufsize >= sizeof(struct kevent_extinfo) * MIN(buflen, nknotes));
9162	err = copyout(kqext, ubuf, sizeof(struct kevent_extinfo) * MIN(buflen, nknotes));
9163
9164	out:
9165	kfree_data(kqext, buflen * sizeof(struct kevent_extinfo));
9166
9167	if (!err) {
9168	*retval = (int32_t)MIN(nknotes, PROC_PIDFDKQUEUE_KNOTES_MAX);
9169	}
9170	return err;
9171	}
9172
9173	static unsigned int
9174	klist_copy_udata(struct klist list, uint64_t buf,
9175	unsigned int buflen, unsigned int nknotes)
9176	{
9177	struct knote *kn;
9178	SLIST_FOREACH(kn, list, kn_link) {
9179	if (nknotes < buflen) {
9180	/*
9181	* kevent_register will always set kn_udata atomically
9182	* so that we don't have to take any kqlock here.
9183	*/
9184	buf[nknotes] = os_atomic_load_wide(&kn->kn_udata, relaxed);
9185	}
9186	/ we return total number of knotes, which may be more than requested /
9187	nknotes++;
9188	}
9189
9190	return nknotes;
9191	}
9192
9193	int
9194	kevent_proc_copy_uptrs(void proc, uint64_t buf, uint32_t bufsize)
9195	{
9196	proc_t p = (proc_t)proc;
9197	struct filedesc *fdp = &p->p_fd;
9198	unsigned int nuptrs = `0`;
9199	unsigned int buflen = bufsize / sizeof(uint64_t);
9200	struct kqworkloop *kqwl;
9201	u_long size = `0`;
9202	struct klist *fd_knlist = NULL;
9203
9204	if (buflen > `0`) {
9205	assert(buf != NULL);
9206	}
9207
9208	/*
9209	* Copyout the uptrs as much as possible but make sure to drop the respective
9210	* locks and take them again periodically so that we don't blow through
9211	* preemption disabled timeouts. Always reevaluate to see if we have raced
9212	* with someone who changed size of the hash - if we have, we return info for
9213	* the size of the hash we knew in the beginning except if it drops to 0.
9214	* In that case, we bail out with the set of info captured so far
9215	*/
9216	proc_fdlock(p);
9217	size = fdp->fd_knlistsize;
9218	fd_knlist = fdp->fd_knlist;
9219
9220	for (int i = `0`; i < size; i++) {
9221	nuptrs = klist_copy_udata(list: &fd_knlist[i], buf, buflen, nknotes: nuptrs);
9222
9223	proc_fdunlock(p);
9224	proc_fdlock(p);
9225	if (size != fdp->fd_knlistsize) {
9226	if (fdp->fd_knlistsize) {
9227	/ kq_add_knote might grow fdp->fd_knlist. /
9228	fd_knlist = fdp->fd_knlist;
9229	} else {
9230	break;
9231	}
9232	}
9233	}
9234	proc_fdunlock(p);
9235
9236	knhash_lock(fdp);
9237	size = fdp->fd_knhashmask;
9238
9239	if (size != `0`) {
9240	for (size_t i = `0`; i < size + `1`; i++) {
9241	nuptrs = klist_copy_udata(list: &fdp->fd_knhash[i], buf, buflen, nknotes: nuptrs);
9242
9243	knhash_unlock(fdp);
9244	knhash_lock(fdp);
9245	/ The only path that can interleave with us today is knotes_dealloc. /
9246	if (size != fdp->fd_knhashmask) {
9247	break;
9248	}
9249	}
9250	}
9251	knhash_unlock(fdp);
9252
9253	kqhash_lock(fdp);
9254	size = fdp->fd_kqhashmask;
9255
9256	if (size != `0`) {
9257	for (size_t i = `0`; i < size + `1`; i++) {
9258	LIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
9259	if (nuptrs < buflen) {
9260	buf[nuptrs] = kqwl->kqwl_dynamicid;
9261	}
9262	nuptrs++;
9263	}
9264
9265	kqhash_unlock(fdp);
9266	kqhash_lock(fdp);
9267	if (size != fdp->fd_kqhashmask) {
9268	break;
9269	}
9270	}
9271	}
9272	kqhash_unlock(fdp);
9273
9274	return (int)nuptrs;
9275	}
9276
9277	static void
9278	kevent_set_return_to_kernel_user_tsd(proc_t p, thread_t thread)
9279	{
9280	uint64_t ast_addr;
9281	bool proc_is_64bit = !!(p->p_flag & P_LP64);
9282	size_t user_addr_size = proc_is_64bit ? `8` : `4`;
9283	uint32_t ast_flags32 = `0`;
9284	uint64_t ast_flags64 = `0`;
9285	struct uthread *ut = get_bsdthread_info(thread);
9286
9287	if (ut->uu_kqr_bound != NULL) {
9288	ast_flags64 \|= R2K_WORKLOOP_PENDING_EVENTS;
9289	}
9290
9291	if (ast_flags64 == `0`) {
9292	return;
9293	}
9294
9295	if (!(p->p_flag & P_LP64)) {
9296	ast_flags32 = (uint32_t)ast_flags64;
9297	assert(ast_flags64 < `0x100000000ull`);
9298	}
9299
9300	ast_addr = thread_rettokern_addr(thread);
9301	if (ast_addr == `0`) {
9302	return;
9303	}
9304
9305	if (copyout((proc_is_64bit ? (void )&ast_flags64 : (void* *)&ast_flags32),
9306	(user_addr_t)ast_addr,
9307	user_addr_size) != `0`) {
9308	printf("pid %d (tid:%llu): copyout of return_to_kernel ast flags failed with "
9309	"ast_addr = %llu\n", proc_getpid(p), thread_tid(thread: current_thread()), ast_addr);
9310	}
9311	}
9312
9313	/*
9314	* Semantics of writing to TSD value:
9315	*
9316	* 1. It is written to by the kernel and cleared by userspace.
9317	* 2. When the userspace code clears the TSD field, it takes responsibility for
9318	* taking action on the quantum expiry action conveyed by kernel.
9319	* 3. The TSD value is always cleared upon entry into userspace and upon exit of
9320	* userspace back to kernel to make sure that it is never leaked across thread
9321	* requests.
9322	*/
9323	void
9324	kevent_set_workq_quantum_expiry_user_tsd(proc_t p, thread_t thread,
9325	uint64_t flags)
9326	{
9327	uint64_t ast_addr;
9328	bool proc_is_64bit = !!(p->p_flag & P_LP64);
9329	uint32_t ast_flags32 = `0`;
9330	uint64_t ast_flags64 = flags;
9331
9332	if (ast_flags64 == `0`) {
9333	return;
9334	}
9335
9336	if (!(p->p_flag & P_LP64)) {
9337	ast_flags32 = (uint32_t)ast_flags64;
9338	assert(ast_flags64 < `0x100000000ull`);
9339	}
9340
9341	ast_addr = thread_wqquantum_addr(thread);
9342	assert(ast_addr != `0`);
9343
9344	if (proc_is_64bit) {
9345	if (copyout_atomic64(u64: ast_flags64, user_addr: (user_addr_t) ast_addr)) {
9346	#if DEBUG \|\| DEVELOPMENT
9347	printf("pid %d (tid:%llu): copyout of workq quantum ast flags failed with "
9348	"ast_addr = %llu\n", proc_getpid(p), thread_tid(thread), ast_addr);
9349	#endif
9350	}
9351	} else {
9352	if (copyout_atomic32(u32: ast_flags32, user_addr: (user_addr_t) ast_addr)) {
9353	#if DEBUG \|\| DEVELOPMENT
9354	printf("pid %d (tid:%llu): copyout of workq quantum ast flags failed with "
9355	"ast_addr = %llu\n", proc_getpid(p), thread_tid(thread), ast_addr);
9356	#endif
9357	}
9358	}
9359	}
9360
9361	void
9362	kevent_ast(thread_t thread, uint16_t bits)
9363	{
9364	proc_t p = current_proc();
9365
9366
9367	if (bits & AST_KEVENT_REDRIVE_THREADREQ) {
9368	workq_kern_threadreq_redrive(p, flags: WORKQ_THREADREQ_CAN_CREATE_THREADS);
9369	}
9370	if (bits & AST_KEVENT_RETURN_TO_KERNEL) {
9371	kevent_set_return_to_kernel_user_tsd(p, thread);
9372	}
9373
9374	if (bits & AST_KEVENT_WORKQ_QUANTUM_EXPIRED) {
9375	workq_kern_quantum_expiry_reevaluate(p, thread);
9376	}
9377	}
9378
9379	#if DEVELOPMENT \|\| DEBUG
9380
9381	#define KEVENT_SYSCTL_BOUND_ID 1
9382
9383	static int
9384	kevent_sysctl SYSCTL_HANDLER_ARGS
9385	{
9386	#pragma unused(oidp, arg2)
9387	uintptr_t type = (uintptr_t)arg1;
9388	uint64_t bound_id = `0`;
9389
9390	if (type != KEVENT_SYSCTL_BOUND_ID) {
9391	return EINVAL;
9392	}
9393
9394	if (req->newptr) {
9395	return EINVAL;
9396	}
9397
9398	struct uthread *ut = current_uthread();
9399	if (!ut) {
9400	return EFAULT;
9401	}
9402
9403	workq_threadreq_t kqr = ut->uu_kqr_bound;
9404	if (kqr) {
9405	if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
9406	bound_id = kqr_kqworkloop(kqr)->kqwl_dynamicid;
9407	} else {
9408	bound_id = -`1`;
9409	}
9410	}
9411
9412	return sysctl_io_number(req, bound_id, sizeof(bound_id), NULL, NULL);
9413	}
9414
9415	SYSCTL_NODE(_kern, OID_AUTO, kevent, CTLFLAG_RW \| CTLFLAG_LOCKED, `0`,
9416	"kevent information");
9417
9418	SYSCTL_PROC(_kern_kevent, OID_AUTO, bound_id,
9419	CTLTYPE_QUAD \| CTLFLAG_RD \| CTLFLAG_LOCKED \| CTLFLAG_MASKED,
9420	(void *)KEVENT_SYSCTL_BOUND_ID,
9421	sizeof(kqueue_id_t), kevent_sysctl, "Q",
9422	"get the ID of the bound kqueue");
9423
9424	#endif /* DEVELOPMENT \|\| DEBUG */
9425

Browse the source code of xnu/bsd/kern/kern_event.c