kern_event.c source code [xnu/bsd/kern/kern_event.c]

1	/*
2	* Copyright (c) 2000-2017 Apple Inc. All rights reserved.
3	*
4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. The rights granted to you under the License
10	* may not be used to create, or enable the creation or redistribution of,
11	* unlawful or unlicensed copies of an Apple operating system, or to
12	* circumvent, violate, or enable the circumvention or violation of, any
13	* terms of an Apple operating system software license agreement.
14	*
15	* Please obtain a copy of the License at
16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
17	*
18	* The Original Code and all software distributed under the License are
19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23	* Please see the License for the specific language governing rights and
24	* limitations under the License.
25	*
26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27	*
28	*/
29	/*-
30	* Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
31	* All rights reserved.
32	*
33	* Redistribution and use in source and binary forms, with or without
34	* modification, are permitted provided that the following conditions
35	* are met:
36	* 1. Redistributions of source code must retain the above copyright
37	* notice, this list of conditions and the following disclaimer.
38	* 2. Redistributions in binary form must reproduce the above copyright
39	* notice, this list of conditions and the following disclaimer in the
40	* documentation and/or other materials provided with the distribution.
41	*
42	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45	* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52	* SUCH DAMAGE.
53	*/
54	/*
55	* @(#)kern_event.c 1.0 (3/31/2000)
56	*/
57	#include <stdint.h>
58	#include <machine/atomic.h>
59
60	#include <sys/param.h>
61	#include <sys/systm.h>
62	#include <sys/filedesc.h>
63	#include <sys/kernel.h>
64	#include <sys/proc_internal.h>
65	#include <sys/kauth.h>
66	#include <sys/malloc.h>
67	#include <sys/unistd.h>
68	#include <sys/file_internal.h>
69	#include <sys/fcntl.h>
70	#include <sys/select.h>
71	#include <sys/queue.h>
72	#include <sys/event.h>
73	#include <sys/eventvar.h>
74	#include <sys/protosw.h>
75	#include <sys/socket.h>
76	#include <sys/socketvar.h>
77	#include <sys/stat.h>
78	#include <sys/sysctl.h>
79	#include <sys/uio.h>
80	#include <sys/sysproto.h>
81	#include <sys/user.h>
82	#include <sys/vnode_internal.h>
83	#include <string.h>
84	#include <sys/proc_info.h>
85	#include <sys/codesign.h>
86	#include <sys/pthread_shims.h>
87	#include <sys/kdebug.h>
88	#include <sys/reason.h>
89	#include <os/reason_private.h>
90	#include <pexpert/pexpert.h>
91
92	#include <kern/locks.h>
93	#include <kern/clock.h>
94	#include <kern/cpu_data.h>
95	#include <kern/policy_internal.h>
96	#include <kern/thread_call.h>
97	#include <kern/sched_prim.h>
98	#include <kern/waitq.h>
99	#include <kern/zalloc.h>
100	#include <kern/kalloc.h>
101	#include <kern/assert.h>
102	#include <kern/ast.h>
103	#include <kern/thread.h>
104	#include <kern/kcdata.h>
105
106	#include <pthread/priority_private.h>
107	#include <pthread/workqueue_syscalls.h>
108	#include <pthread/workqueue_internal.h>
109	#include <libkern/libkern.h>
110	#include <libkern/OSAtomic.h>
111
112	#include "net/net_str_id.h"
113
114	#include <mach/task.h>
115	#include <libkern/section_keywords.h>
116
117	#if CONFIG_MEMORYSTATUS
118	#include <sys/kern_memorystatus.h>
119	#endif
120
121	extern thread_t port_name_to_thread(mach_port_name_t port_name); / osfmk/kern/ipc_tt.h /
122	extern mach_port_name_t ipc_entry_name_mask(mach_port_name_t name); / osfmk/ipc/ipc_entry.h /
123
124	#define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))
125
126	MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
127
128	#define KQ_EVENT NO_EVENT64
129
130	static int kqueue_read(struct fileproc fp, struct* uio *uio,
131	int flags, vfs_context_t ctx);
132	static int kqueue_write(struct fileproc fp, struct* uio *uio,
133	int flags, vfs_context_t ctx);
134	static int kqueue_ioctl(struct fileproc *fp, u_long com, caddr_t data,
135	vfs_context_t ctx);
136	static int kqueue_select(struct fileproc fp, int* which, void *wq_link_id,
137	vfs_context_t ctx);
138	static int kqueue_close(struct fileglob *fg, vfs_context_t ctx);
139	static int kqueue_kqfilter(struct fileproc fp, struct* knote *kn,
140	struct kevent_internal_s *kev, vfs_context_t ctx);
141	static int kqueue_drain(struct fileproc *fp, vfs_context_t ctx);
142
143	static const struct fileops kqueueops = {
144	.fo_type = DTYPE_KQUEUE,
145	.fo_read = kqueue_read,
146	.fo_write = kqueue_write,
147	.fo_ioctl = kqueue_ioctl,
148	.fo_select = kqueue_select,
149	.fo_close = kqueue_close,
150	.fo_kqfilter = kqueue_kqfilter,
151	.fo_drain = kqueue_drain,
152	};
153
154	static void kevent_put_kq(struct proc p, kqueue_id_t id, struct* fileproc fp, struct* kqueue *kq);
155	static int kevent_internal(struct proc *p,
156	kqueue_id_t id, kqueue_id_t *id_out,
157	user_addr_t changelist, int nchanges,
158	user_addr_t eventlist, int nevents,
159	user_addr_t data_out, uint64_t data_available,
160	unsigned int flags, user_addr_t utimeout,
161	kqueue_continue_t continuation,
162	int32_t *retval);
163	static int kevent_copyin(user_addr_t addrp, struct* kevent_internal_s *kevp,
164	struct proc p, unsigned* int flags);
165	static int kevent_copyout(struct kevent_internal_s kevp, user_addr_t addrp,
166	struct proc p, unsigned* int flags);
167	char * kevent_description(struct kevent_internal_s kevp, char* *s, size_t n);
168
169	static int kevent_register_wait_prepare(struct knote kn, struct* kevent_internal_s *kev);
170	static void kevent_register_wait_block(struct turnstile *ts, thread_t handoff_thread,
171	struct knote_lock_ctx *knlc, thread_continue_t cont,
172	struct _kevent_register *cont_args) __dead2;
173	static void kevent_register_wait_return(struct _kevent_register *cont_args) __dead2;
174	static void kevent_register_wait_cleanup(struct knote *kn);
175	static inline void kqueue_release_last(struct proc *p, kqueue_t kqu);
176	static void kqueue_interrupt(struct kqueue *kq);
177	static int kevent_callback(struct kqueue kq, struct* kevent_internal_s *kevp,
178	void *data);
179	static void kevent_continue(struct kqueue kq, void* data, int* error);
180	static void kqueue_scan_continue(void *contp, wait_result_t wait_result);
181	static int kqueue_process(struct kqueue kq, kevent_callback_t callback, void* *callback_data,
182	struct filt_process_s process_data, int* *countp);
183	static int kqueue_queue_empty(struct kqueue *kq, kq_index_t qos_index);
184
185	static struct kqtailq kqueue_get_suppressed_queue(kqueue_t kq, struct* knote *kn);
186	static void kqueue_threadreq_initiate(struct kqueue kq, struct* kqrequest kqr, kq_index_t qos, int* flags);
187
188	static void kqworkq_update_override(struct kqworkq kqwq, struct* knote *kn, kq_index_t qos);
189	static void kqworkq_unbind(proc_t p, struct kqrequest *kqr);
190	static thread_qos_t kqworkq_unbind_locked(struct kqworkq kqwq, struct* kqrequest *kqr, thread_t thread);
191	static struct kqrequest kqworkq_get_request(struct* kqworkq *kqwq, kq_index_t qos_index);
192
193	static void kqworkloop_update_override(struct kqworkloop *kqwl, kq_index_t override_index);
194	static void kqworkloop_unbind(proc_t p, struct kqworkloop *kwql);
195	static thread_qos_t kqworkloop_unbind_locked(struct kqworkloop *kwql, thread_t thread);
196	static kq_index_t kqworkloop_owner_override(struct kqworkloop *kqwl);
/*
 * Operation codes passed as the `op` argument of
 * kqworkloop_update_threads_qos().
 */
enum {
	KQWL_UTQ_NONE,
	/*
	 * The wakeup qos is the qos of QUEUED knotes.
	 *
	 * This QoS is accounted for with the events override in the
	 * kqr_override_index field. It is raised each time a new knote is queued at
	 * a given QoS. The kqr_wakeup_indexes field is a superset of the non empty
	 * knote buckets and is recomputed after each event delivery.
	 */
	KQWL_UTQ_UPDATE_WAKEUP_QOS,
	KQWL_UTQ_UPDATE_STAYACTIVE_QOS,
	KQWL_UTQ_RECOMPUTE_WAKEUP_QOS,
	KQWL_UTQ_UNBINDING, /* attempt to rebind */
	KQWL_UTQ_PARKING,
	/*
	 * The wakeup override is for suppressed knotes that have fired again at
	 * a higher QoS than the one for which they are suppressed already.
	 * This override is cleared when the knote suppressed list becomes empty.
	 */
	KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
	KQWL_UTQ_RESET_WAKEUP_OVERRIDE,
	/*
	 * The QoS is the maximum QoS of an event enqueued on this workloop in
	 * userland. It is copied from the only EVFILT_WORKLOOP knote with
	 * a NOTE_WL_THREAD_REQUEST bit set allowed on this workloop. If there is no
	 * such knote, this QoS is 0.
	 */
	KQWL_UTQ_SET_QOS_INDEX,
	KQWL_UTQ_REDRIVE_EVENTS,
};
228	static void kqworkloop_update_threads_qos(struct kqworkloop kqwl, int* op, kq_index_t qos);
229	static void kqworkloop_request_help(struct kqworkloop *kqwl, kq_index_t qos_index);
230	static int kqworkloop_end_processing(struct kqworkloop kqwl, int* flags, int kevent_flags);
231
232	static int knote_process(struct knote kn, kevent_callback_t callback, void* *callback_data,
233	struct filt_process_s *process_data);
234
235	static int kq_add_knote(struct kqueue kq, struct* knote *kn,
236	struct knote_lock_ctx knlc, struct* proc *p);
237	static struct knote kq_find_knote_and_kq_lock(struct* kqueue kq, struct* kevent_internal_s kev, bool is_fd, struct* proc *p);
238
239	static void knote_drop(struct kqueue kq, struct* knote kn, struct* knote_lock_ctx *knlc);
240	static struct knote knote_alloc(void*);
241	static void knote_free(struct knote *kn);
242
243	static void knote_activate(struct knote *kn);
244	static void knote_deactivate(struct knote *kn);
245
246	static void knote_enable(struct knote *kn);
247	static void knote_disable(struct knote *kn);
248
249	static int knote_enqueue(struct knote *kn);
250	static void knote_dequeue(struct knote *kn);
251
252	static void knote_suppress(struct knote *kn);
253	static void knote_unsuppress(struct knote *kn);
254	static void knote_wakeup(struct knote *kn);
255
256	static bool knote_should_apply_qos_override(struct kqueue kq, struct* knote *kn,
257	int result, thread_qos_t *qos_out);
258	static void knote_apply_qos_override(struct knote *kn, kq_index_t qos_index);
259	static void knote_adjust_qos(struct kqueue kq, struct* knote kn, int* result);
260	static void knote_reset_priority(struct knote *kn, pthread_priority_t pp);
261	static kq_index_t knote_get_qos_override_index(struct knote *kn);
262	static void knote_set_qos_overcommit(struct knote *kn);
263
264	static zone_t knote_zone;
265	static zone_t kqfile_zone;
266	static zone_t kqworkq_zone;
267	static zone_t kqworkloop_zone;
#if DEVELOPMENT || DEBUG
#define KEVENT_PANIC_ON_WORKLOOP_OWNERSHIP_LEAK  (1U << 0)
#define KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS     (1U << 1)
#define KEVENT_PANIC_BOOT_ARG_INITIALIZED        (1U << 31)

#define KEVENT_PANIC_DEFAULT_VALUE (0)

/*
 * Return the kevent debug flags, lazily initialized from the
 * "kevent_debug" boot-arg.
 *
 * The KEVENT_PANIC_BOOT_ARG_INITIALIZED bit doubles as the "already
 * parsed" marker: while it is clear the boot-arg is (re)parsed, falling
 * back to KEVENT_PANIC_DEFAULT_VALUE when it is absent. The returned
 * value therefore always has that bit set.
 */
static uint32_t
kevent_debug_flags(void)
{
	static uint32_t flags = KEVENT_PANIC_DEFAULT_VALUE;

	if ((flags & KEVENT_PANIC_BOOT_ARG_INITIALIZED) == 0) {
		uint32_t value = 0;
		if (!PE_parse_boot_argn("kevent_debug", &value, sizeof(value))) {
			value = KEVENT_PANIC_DEFAULT_VALUE;
		}
		value |= KEVENT_PANIC_BOOT_ARG_INITIALIZED;
		os_atomic_store(&flags, value, relaxed);
	}
	return flags;
}
#endif
290
/*
 * Fold `val` with itself shifted right by 8 bits and mask the result.
 * Every use of an argument is parenthesized so that expression arguments
 * (e.g. `a | b`) hash as a whole. `val` is evaluated twice: do not pass
 * expressions with side effects.
 */
#define KN_HASH(val, mask) (((val) ^ ((val) >> 8)) & (mask))
292
293	/ placeholder for not-yet-implemented filters /
294	static int filt_badattach(struct knote kn, struct* kevent_internal_s *kev);
295	static int filt_badevent(struct knote kn, long* hint);
296	SECURITY_READ_ONLY_EARLY(static struct filterops) bad_filtops = {
297	.f_attach = filt_badattach,
298	};
299
300	#if CONFIG_MEMORYSTATUS
301	extern const struct filterops memorystatus_filtops;
302	#endif /* CONFIG_MEMORYSTATUS */
303	extern const struct filterops fs_filtops;
304	extern const struct filterops sig_filtops;
305	extern const struct filterops machport_filtops;
306	extern const struct filterops pipe_rfiltops;
307	extern const struct filterops pipe_wfiltops;
308	extern const struct filterops ptsd_kqops;
309	extern const struct filterops ptmx_kqops;
310	extern const struct filterops soread_filtops;
311	extern const struct filterops sowrite_filtops;
312	extern const struct filterops sock_filtops;
313	extern const struct filterops soexcept_filtops;
314	extern const struct filterops spec_filtops;
315	extern const struct filterops bpfread_filtops;
316	extern const struct filterops necp_fd_rfiltops;
317	extern const struct filterops fsevent_filtops;
318	extern const struct filterops vnode_filtops;
319	extern const struct filterops tty_filtops;
320
321	const static struct filterops file_filtops;
322	const static struct filterops kqread_filtops;
323	const static struct filterops proc_filtops;
324	const static struct filterops timer_filtops;
325	const static struct filterops user_filtops;
326	const static struct filterops workloop_filtops;
327
328	/*
329	*
330	* Rules for adding new filters to the system:
331	* Public filters:
332	* - Add a new "EVFILT_" option value to bsd/sys/event.h (typically a negative value)
333	* in the exported section of the header
334	* - Update the EVFILT_SYSCOUNT value to reflect the new addition
335	* - Add a filterops to the sysfilt_ops array. Public filters should be added at the end
336	* of the Public Filters section in the array.
337	* Private filters:
338	* - Add a new "EVFILT_" value to bsd/sys/event.h (typically a positive value)
339	* in the XNU_KERNEL_PRIVATE section of the header
340	* - Update the EVFILTID_MAX value to reflect the new addition
341	* - Add a filterops to the sysfilt_ops. Private filters should be added at the end of
342	* the Private filters section of the array.
343	*/
344	SECURITY_READ_ONLY_EARLY(static struct filterops *) sysfilt_ops[EVFILTID_MAX] = {
345	/ Public Filters /
346	[~EVFILT_READ] = &file_filtops,
347	[~EVFILT_WRITE] = &file_filtops,
348	[~EVFILT_AIO] = &bad_filtops,
349	[~EVFILT_VNODE] = &file_filtops,
350	[~EVFILT_PROC] = &proc_filtops,
351	[~EVFILT_SIGNAL] = &sig_filtops,
352	[~EVFILT_TIMER] = &timer_filtops,
353	[~EVFILT_MACHPORT] = &machport_filtops,
354	[~EVFILT_FS] = &fs_filtops,
355	[~EVFILT_USER] = &user_filtops,
356	&bad_filtops,
357	[~EVFILT_VM] = &bad_filtops,
358	[~EVFILT_SOCK] = &file_filtops,
359	#if CONFIG_MEMORYSTATUS
360	[~EVFILT_MEMORYSTATUS] = &memorystatus_filtops,
361	#else
362	[~EVFILT_MEMORYSTATUS] = &bad_filtops,
363	#endif
364	[~EVFILT_EXCEPT] = &file_filtops,
365	[~EVFILT_WORKLOOP] = &workloop_filtops,
366
367	/ Private filters /
368	[EVFILTID_KQREAD] = &kqread_filtops,
369	[EVFILTID_PIPE_R] = &pipe_rfiltops,
370	[EVFILTID_PIPE_W] = &pipe_wfiltops,
371	[EVFILTID_PTSD] = &ptsd_kqops,
372	[EVFILTID_SOREAD] = &soread_filtops,
373	[EVFILTID_SOWRITE] = &sowrite_filtops,
374	[EVFILTID_SCK] = &sock_filtops,
375	[EVFILTID_SOEXCEPT] = &soexcept_filtops,
376	[EVFILTID_SPEC] = &spec_filtops,
377	[EVFILTID_BPFREAD] = &bpfread_filtops,
378	[EVFILTID_NECP_FD] = &necp_fd_rfiltops,
379	[EVFILTID_FSEVENT] = &fsevent_filtops,
380	[EVFILTID_VN] = &vnode_filtops,
381	[EVFILTID_TTY] = &tty_filtops,
382	[EVFILTID_PTMX] = &ptmx_kqops,
383	};
384
/* waitq prepost callback */
void waitq_set__CALLING_PREPOST_HOOK__(void *kq_hook, void *knote_hook, int qos);
387
388	static inline struct kqworkloop *
389	kqr_kqworkloop(struct kqrequest *kqr)
390	{
391	if (kqr->kqr_state & KQR_WORKLOOP) {
392	return __container_of(kqr, struct kqworkloop, kqwl_request);
393	}
394	return NULL;
395	}
396
397	static inline kqueue_t
398	kqr_kqueue(proc_t p, struct kqrequest *kqr)
399	{
400	kqueue_t kqu;
401	if (kqr->kqr_state & KQR_WORKLOOP) {
402	kqu.kqwl = kqr_kqworkloop(kqr);
403	} else {
404	kqu.kqwq = (struct kqworkq *)p->p_fd->fd_wqkqueue;
405	assert(kqr >= kqu.kqwq->kqwq_request &&
406	kqr < kqu.kqwq->kqwq_request + KQWQ_NBUCKETS);
407	}
408	return kqu;
409	}
410
411	static inline boolean_t
412	is_workqueue_thread(thread_t thread)
413	{
414	return (thread_get_tag(thread) & THREAD_TAG_WORKQUEUE);
415	}
416
417	/*
418	* kqueue/note lock implementations
419	*
420	* The kqueue lock guards the kq state, the state of its queues,
421	* and the kqueue-aware status and locks of individual knotes.
422	*
423	* The kqueue workq lock is used to protect state guarding the
424	* interaction of the kqueue with the workq. This state cannot
425	* be guarded by the kq lock - as it needs to be taken when we
426	* already have the waitq set lock held (during the waitq hook
427	* callback). It might be better to use the waitq lock itself
428	* for this, but the IRQ requirements make that difficult).
429	*
430	* Knote flags, filter flags, and associated data are protected
431	* by the underlying object lock - and are only ever looked at
432	* by calling the filter to get a [consistent] snapshot of that
433	* data.
434	*/
435	static lck_grp_attr_t *kq_lck_grp_attr;
436	static lck_grp_t *kq_lck_grp;
437	static lck_attr_t *kq_lck_attr;
438
439	static inline void
440	kqlock(kqueue_t kqu)
441	{
442	lck_spin_lock(&kqu.kq->kq_lock);
443	}
444
445	static inline void
446	kqlock_held(__assert_only kqueue_t kqu)
447	{
448	LCK_SPIN_ASSERT(&kqu.kq->kq_lock, LCK_ASSERT_OWNED);
449	}
450
451	static inline void
452	kqunlock(kqueue_t kqu)
453	{
454	lck_spin_unlock(&kqu.kq->kq_lock);
455	}
456
457	static inline void
458	kq_req_lock(kqueue_t kqu)
459	{
460	assert(kqu.kq->kq_state & (KQ_WORKLOOP \| KQ_WORKQ));
461	lck_spin_lock(&kqu.kq->kq_reqlock);
462	}
463
464	static inline void
465	kq_req_unlock(kqueue_t kqu)
466	{
467	assert(kqu.kq->kq_state & (KQ_WORKLOOP \| KQ_WORKQ));
468	lck_spin_unlock(&kqu.kq->kq_reqlock);
469	}
470
471	static inline void
472	kq_req_held(__assert_only kqueue_t kqu)
473	{
474	assert(kqu.kq->kq_state & (KQ_WORKLOOP \| KQ_WORKQ));
475	LCK_SPIN_ASSERT(&kqu.kq->kq_reqlock, LCK_ASSERT_OWNED);
476	}
477
478	static inline void
479	knhash_lock(proc_t p)
480	{
481	lck_mtx_lock(&p->p_fd->fd_knhashlock);
482	}
483
484	static inline void
485	knhash_unlock(proc_t p)
486	{
487	lck_mtx_unlock(&p->p_fd->fd_knhashlock);
488	}
489
#pragma mark knote locks

/*
 * Values used by the knote_lock_* functions to select the state of the
 * kq lock on return. They are distinct values, not a bitmask: compare
 * with ==, never with &.
 *
 * KNOTE_KQ_LOCK_ALWAYS
 *   The function will always return with the kq lock held.
 *
 * KNOTE_KQ_LOCK_ON_SUCCESS
 *   The function will return with the kq lock held if it was successful
 *   (knote_lock() is the only function that can fail).
 *
 * KNOTE_KQ_LOCK_ON_FAILURE
 *   The function will return with the kq lock held if it was unsuccessful
 *   (knote_lock() is the only function that can fail).
 *
 * KNOTE_KQ_UNLOCK:
 *   The function returns with the kq unlocked.
 */
#define KNOTE_KQ_LOCK_ALWAYS      0x0
#define KNOTE_KQ_LOCK_ON_SUCCESS  0x1
#define KNOTE_KQ_LOCK_ON_FAILURE  0x2
#define KNOTE_KQ_UNLOCK           0x3
513
#if DEBUG || DEVELOPMENT
/*
 * Debug-only check that a knote lock context is in the
 * KNOTE_LOCK_CTX_UNLOCKED state.
 */
__attribute__((noinline, not_tail_called, disable_tail_calls))
void knote_lock_ctx_chk(struct knote_lock_ctx *knlc)
{
	/* evil hackery to make sure no one forgets to unlock */
	assert(knlc->knlc_state == KNOTE_LOCK_CTX_UNLOCKED);
}
#endif
522
523	static struct knote_lock_ctx *
524	knote_lock_ctx_find(struct kqueue kq, struct* knote *kn)
525	{
526	struct knote_lock_ctx *ctx;
527	LIST_FOREACH(ctx, &kq->kq_knlocks, knlc_le) {
528	if (ctx->knlc_knote == kn) return ctx;
529	}
530	panic("knote lock context not found: %p", kn);
531	__builtin_trap();
532	}
533
534	/ slowpath of knote_lock() /
535	__attribute__((noinline))
536	static bool __result_use_check
537	knote_lock_slow(struct kqueue kq, struct* knote *kn,
538	struct knote_lock_ctx knlc, int* kqlocking)
539	{
540	kqlock_held(kq);
541
542	struct knote_lock_ctx *owner_lc = knote_lock_ctx_find(kq, kn);
543	thread_t owner_thread = owner_lc->knlc_thread;
544
545	#if DEBUG \|\| DEVELOPMENT
546	knlc->knlc_state = KNOTE_LOCK_CTX_WAITING;
547	#endif
548
549	thread_reference(owner_thread);
550	TAILQ_INSERT_TAIL(&owner_lc->knlc_head, knlc, knlc_tqe);
551	assert_wait(&kn->kn_status, THREAD_UNINT \| THREAD_WAIT_NOREPORT);
552	kqunlock(kq);
553
554	if (thread_handoff_deallocate(owner_thread) == THREAD_RESTART) {
555	if (kqlocking == KNOTE_KQ_LOCK_ALWAYS \|\|
556	kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
557	kqlock(kq);
558	}
559	#if DEBUG \|\| DEVELOPMENT
560	assert(knlc->knlc_state == KNOTE_LOCK_CTX_WAITING);
561	knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
562	#endif
563	return false;
564	}
565	#if DEBUG \|\| DEVELOPMENT
566	assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
567	#endif
568	if (kqlocking == KNOTE_KQ_LOCK_ALWAYS \|\|
569	kqlocking == KNOTE_KQ_LOCK_ON_SUCCESS) {
570	kqlock(kq);
571	}
572	return true;
573	}
574
575	/*
576	* Attempts to take the "knote" lock.
577	*
578	* Called with the kqueue lock held.
579	*
580	* Returns true if the knote lock is acquired, false if it has been dropped
581	*/
582	static bool __result_use_check
583	knote_lock(struct kqueue kq, struct* knote kn, struct* knote_lock_ctx *knlc,
584	int kqlocking)
585	{
586	kqlock_held(kq);
587
588	#if DEBUG \|\| DEVELOPMENT
589	assert(knlc->knlc_state == KNOTE_LOCK_CTX_UNLOCKED);
590	#endif
591	knlc->knlc_knote = kn;
592	knlc->knlc_thread = current_thread();
593	TAILQ_INIT(&knlc->knlc_head);
594
595	if (__improbable(kn->kn_status & KN_LOCKED)) {
596	return knote_lock_slow(kq, kn, knlc, kqlocking);
597	}
598
599	/*
600	* When the knote will be dropped, the knote lock is taken before
601	* KN_DROPPING is set, and then the knote will be removed from any
602	* hash table that references it before the lock is canceled.
603	*/
604	assert((kn->kn_status & KN_DROPPING) == `0`);
605	LIST_INSERT_HEAD(&kq->kq_knlocks, knlc, knlc_le);
606	kn->kn_status \|= KN_LOCKED;
607	#if DEBUG \|\| DEVELOPMENT
608	knlc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
609	#endif
610
611	if (kqlocking == KNOTE_KQ_UNLOCK \|\|
612	kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
613	kqunlock(kq);
614	}
615	return true;
616	}
617
618	/*
619	* Unlocks a knote successfully locked with knote_lock().
620	*
621	* Called with the kqueue lock held.
622	*
623	* Returns with the kqueue lock held according to KNOTE_KQ_* flags
624	*/
625	static void
626	knote_unlock(struct kqueue kq, struct* knote *kn,
627	struct knote_lock_ctx knlc, int* flags)
628	{
629	kqlock_held(kq);
630
631	assert(knlc->knlc_knote == kn);
632	assert(kn->kn_status & KN_LOCKED);
633	#if DEBUG \|\| DEVELOPMENT
634	assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
635	#endif
636
637	struct knote_lock_ctx *next_owner_lc = TAILQ_FIRST(&knlc->knlc_head);
638
639	LIST_REMOVE(knlc, knlc_le);
640
641	if (next_owner_lc) {
642	assert(next_owner_lc->knlc_knote == kn);
643	TAILQ_REMOVE(&knlc->knlc_head, next_owner_lc, knlc_tqe);
644
645	assert(TAILQ_EMPTY(&next_owner_lc->knlc_head));
646	TAILQ_CONCAT(&next_owner_lc->knlc_head, &knlc->knlc_head, knlc_tqe);
647	LIST_INSERT_HEAD(&kq->kq_knlocks, next_owner_lc, knlc_le);
648	#if DEBUG \|\| DEVELOPMENT
649	next_owner_lc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
650	#endif
651	} else {
652	kn->kn_status &= ~KN_LOCKED;
653	}
654	if (kn->kn_inuse == `0`) {
655	/*
656	* No f_event() in flight anymore, we can leave QoS "Merge" mode
657	*
658	* See knote_should_apply_qos_override()
659	*/
660	kn->kn_status &= ~KN_MERGE_QOS;
661	}
662	if (flags & KNOTE_KQ_UNLOCK) {
663	kqunlock(kq);
664	}
665	if (next_owner_lc) {
666	thread_wakeup_thread(&kn->kn_status, next_owner_lc->knlc_thread);
667	}
668	#if DEBUG \|\| DEVELOPMENT
669	knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
670	#endif
671	}
672
673	/*
674	* Aborts all waiters for a knote lock, and unlock the knote.
675	*
676	* Called with the kqueue lock held.
677	*
678	* Returns with the kqueue lock held according to KNOTE_KQ_* flags
679	*/
680	static void
681	knote_unlock_cancel(struct kqueue kq, struct* knote *kn,
682	struct knote_lock_ctx knlc, int* kqlocking)
683	{
684	kqlock_held(kq);
685
686	assert(knlc->knlc_knote == kn);
687	assert(kn->kn_status & KN_LOCKED);
688	assert(kn->kn_status & KN_DROPPING);
689
690	LIST_REMOVE(knlc, knlc_le);
691	kn->kn_status &= ~KN_LOCKED;
692
693	if (kqlocking == KNOTE_KQ_UNLOCK \|\|
694	kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
695	kqunlock(kq);
696	}
697	if (!TAILQ_EMPTY(&knlc->knlc_head)) {
698	thread_wakeup_with_result(&kn->kn_status, THREAD_RESTART);
699	}
700	#if DEBUG \|\| DEVELOPMENT
701	knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
702	#endif
703	}
704
705	/*
706	* Call the f_event hook of a given filter.
707	*
708	* Takes a use count to protect against concurrent drops.
709	*/
710	static void
711	knote_call_filter_event(struct kqueue kq, struct* knote kn, long* hint)
712	{
713	int result, dropping = `0`;
714
715	kqlock_held(kq);
716
717	if (kn->kn_status & (KN_DROPPING \| KN_VANISHED))
718	return;
719
720	kn->kn_inuse++;
721	kqunlock(kq);
722	result = filter_call(knote_fops(kn), f_event(kn, hint));
723	kqlock(kq);
724
725	dropping = (kn->kn_status & KN_DROPPING);
726
727	if (!dropping && (result & FILTER_ACTIVE)) {
728	if (result & FILTER_ADJUST_EVENT_QOS_BIT)
729	knote_adjust_qos(kq, kn, result);
730	knote_activate(kn);
731	}
732
733	if (--kn->kn_inuse == `0`) {
734	if ((kn->kn_status & KN_LOCKED) == `0`) {
735	/*
736	* We're the last f_event() call and there's no other f_* call in
737	* flight, we can leave QoS "Merge" mode.
738	*
739	* See knote_should_apply_qos_override()
740	*/
741	kn->kn_status &= ~KN_MERGE_QOS;
742	}
743	if (dropping) {
744	waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
745	CAST_EVENT64_T(&kn->kn_inuse),
746	THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
747	}
748	}
749	}
750
751	/*
752	* Called by knote_drop() to wait for the last f_event() caller to be done.
753	*
754	* - kq locked at entry
755	* - kq unlocked at exit
756	*/
757	static void
758	knote_wait_for_filter_events(struct kqueue kq, struct* knote *kn)
759	{
760	wait_result_t wr = THREAD_NOT_WAITING;
761
762	kqlock_held(kq);
763
764	assert(kn->kn_status & KN_DROPPING);
765
766	if (kn->kn_inuse) {
767	wr = waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
768	CAST_EVENT64_T(&kn->kn_inuse),
769	THREAD_UNINT \| THREAD_WAIT_NOREPORT, TIMEOUT_WAIT_FOREVER);
770	}
771	kqunlock(kq);
772	if (wr == THREAD_WAITING) {
773	thread_block(THREAD_CONTINUE_NULL);
774	}
775	}
776
777	#pragma mark file_filtops
778
779	static int
780	filt_fileattach(struct knote kn, struct* kevent_internal_s *kev)
781	{
782	return fo_kqfilter(kn->kn_fp, kn, kev, vfs_context_current());
783	}
784
785	SECURITY_READ_ONLY_EARLY(static struct filterops) file_filtops = {
786	.f_isfd = `1`,
787	.f_attach = filt_fileattach,
788	};
789
#pragma mark kqread_filtops

/* Shorthands to reach fileglob fields through a fileproc's f_fglob. */
#define f_flag f_fglob->fg_flag
#define f_ops f_fglob->fg_ops
#define f_data f_fglob->fg_data
795
796	static void
797	filt_kqdetach(struct knote *kn)
798	{
799	struct kqfile kqf = (struct* kqfile *)kn->kn_fp->f_data;
800	struct kqueue *kq = &kqf->kqf_kqueue;
801
802	kqlock(kq);
803	KNOTE_DETACH(&kqf->kqf_sel.si_note, kn);
804	kqunlock(kq);
805	}
806
807	static int
808	filt_kqueue(struct knote kn, __unused long* hint)
809	{
810	struct kqueue kq = (struct* kqueue *)kn->kn_fp->f_data;
811
812	return (kq->kq_count > `0`);
813	}
814
815	static int
816	filt_kqtouch(struct knote kn, struct* kevent_internal_s *kev)
817	{
818	#pragma unused(kev)
819	struct kqueue kq = (struct* kqueue *)kn->kn_fp->f_data;
820	int res;
821
822	kqlock(kq);
823	kn->kn_data = kq->kq_count;
824	res = (kn->kn_data > `0`);
825
826	kqunlock(kq);
827
828	return res;
829	}
830
831	static int
832	filt_kqprocess(struct knote kn, struct* filt_process_s data, struct* kevent_internal_s *kev)
833	{
834	#pragma unused(data)
835	struct kqueue kq = (struct* kqueue *)kn->kn_fp->f_data;
836	int res;
837
838	kqlock(kq);
839	kn->kn_data = kq->kq_count;
840	res = (kn->kn_data > `0`);
841	if (res) {
842	*kev = kn->kn_kevent;
843	if (kn->kn_flags & EV_CLEAR)
844	kn->kn_data = `0`;
845	}
846	kqunlock(kq);
847
848	return res;
849	}
850
851	SECURITY_READ_ONLY_EARLY(static struct filterops) kqread_filtops = {
852	.f_isfd = `1`,
853	.f_detach = filt_kqdetach,
854	.f_event = filt_kqueue,
855	.f_touch = filt_kqtouch,
856	.f_process = filt_kqprocess,
857	};
858
859	#pragma mark proc_filtops
860
861	static int
862	filt_procattach(struct knote kn, __unused struct* kevent_internal_s *kev)
863	{
864	struct proc *p;
865
866	assert(PID_MAX < NOTE_PDATAMASK);
867
868	if ((kn->kn_sfflags & (NOTE_TRACK \| NOTE_TRACKERR \| NOTE_CHILD)) != `0`) {
869	knote_set_error(kn, ENOTSUP);
870	return `0`;
871	}
872
873	p = proc_find(kn->kn_id);
874	if (p == NULL) {
875	knote_set_error(kn, ESRCH);
876	return `0`;
877	}
878
879	const int NoteExitStatusBits = NOTE_EXIT \| NOTE_EXITSTATUS;
880
881	if ((kn->kn_sfflags & NoteExitStatusBits) == NoteExitStatusBits)
882	do {
883	pid_t selfpid = proc_selfpid();
884
885	if (p->p_ppid == selfpid)
886	break; / parent => ok /
887
888	if ((p->p_lflag & P_LTRACED) != `0` &&
889	(p->p_oppid == selfpid))
890	break; / parent-in-waiting => ok /
891
892	proc_rele(p);
893	knote_set_error(kn, EACCES);
894	return `0`;
895	} while (`0`);
896
897	proc_klist_lock();
898
899	kn->kn_ptr.p_proc = p; / store the proc handle /
900
901	KNOTE_ATTACH(&p->p_klist, kn);
902
903	proc_klist_unlock();
904
905	proc_rele(p);
906
907	/*
908	* only captures edge-triggered events after this point
909	* so it can't already be fired.
910	*/
911	return (`0`);
912	}
913
914
915	/*
916	* The knote may be attached to a different process, which may exit,
917	* leaving nothing for the knote to be attached to. In that case,
918	* the pointer to the process will have already been nulled out.
919	*/
920	static void
921	filt_procdetach(struct knote *kn)
922	{
923	struct proc *p;
924
925	proc_klist_lock();
926
927	p = kn->kn_ptr.p_proc;
928	if (p != PROC_NULL) {
929	kn->kn_ptr.p_proc = PROC_NULL;
930	KNOTE_DETACH(&p->p_klist, kn);
931	}
932
933	proc_klist_unlock();
934	}
935
936	static int
937	filt_proc(struct knote kn, long* hint)
938	{
939	u_int event;
940
941	/ ALWAYS CALLED WITH proc_klist_lock /
942
943	/*
944	* Note: a lot of bits in hint may be obtained from the knote
945	* To free some of those bits, see <rdar://problem/12592988> Freeing up
946	* bits in hint for filt_proc
947	*
948	* mask off extra data
949	*/
950	event = (u_int)hint & NOTE_PCTRLMASK;
951
952	/*
953	* termination lifecycle events can happen while a debugger
954	* has reparented a process, in which case notifications
955	* should be quashed except to the tracing parent. When
956	* the debugger reaps the child (either via wait4(2) or
957	* process exit), the child will be reparented to the original
958	* parent and these knotes re-fired.
959	*/
960	if (event & NOTE_EXIT) {
961	if ((kn->kn_ptr.p_proc->p_oppid != `0`)
962	&& (knote_get_kq(kn)->kq_p->p_pid != kn->kn_ptr.p_proc->p_ppid)) {
963	/*
964	* This knote is not for the current ptrace(2) parent, ignore.
965	*/
966	return `0`;
967	}
968	}
969
970	/*
971	* if the user is interested in this event, record it.
972	*/
973	if (kn->kn_sfflags & event)
974	kn->kn_fflags \|= event;
975
976	#pragma clang diagnostic push
977	#pragma clang diagnostic ignored "-Wdeprecated-declarations"
978	if ((event == NOTE_REAP) \|\| ((event == NOTE_EXIT) && !(kn->kn_sfflags & NOTE_REAP))) {
979	kn->kn_flags \|= (EV_EOF \| EV_ONESHOT);
980	}
981	#pragma clang diagnostic pop
982
983
984	/*
985	* The kernel has a wrapper in place that returns the same data
986	* as is collected here, in kn_data. Any changes to how
987	* NOTE_EXITSTATUS and NOTE_EXIT_DETAIL are collected
988	* should also be reflected in the proc_pidnoteexit() wrapper.
989	*/
990	if (event == NOTE_EXIT) {
991	kn->kn_data = `0`;
992	if ((kn->kn_sfflags & NOTE_EXITSTATUS) != `0`) {
993	kn->kn_fflags \|= NOTE_EXITSTATUS;
994	kn->kn_data \|= (hint & NOTE_PDATAMASK);
995	}
996	if ((kn->kn_sfflags & NOTE_EXIT_DETAIL) != `0`) {
997	kn->kn_fflags \|= NOTE_EXIT_DETAIL;
998	if ((kn->kn_ptr.p_proc->p_lflag &
999	P_LTERM_DECRYPTFAIL) != `0`) {
1000	kn->kn_data \|= NOTE_EXIT_DECRYPTFAIL;
1001	}
1002	if ((kn->kn_ptr.p_proc->p_lflag &
1003	P_LTERM_JETSAM) != `0`) {
1004	kn->kn_data \|= NOTE_EXIT_MEMORY;
1005	switch (kn->kn_ptr.p_proc->p_lflag & P_JETSAM_MASK) {
1006	case P_JETSAM_VMPAGESHORTAGE:
1007	kn->kn_data \|= NOTE_EXIT_MEMORY_VMPAGESHORTAGE;
1008	break;
1009	case P_JETSAM_VMTHRASHING:
1010	kn->kn_data \|= NOTE_EXIT_MEMORY_VMTHRASHING;
1011	break;
1012	case P_JETSAM_FCTHRASHING:
1013	kn->kn_data \|= NOTE_EXIT_MEMORY_FCTHRASHING;
1014	break;
1015	case P_JETSAM_VNODE:
1016	kn->kn_data \|= NOTE_EXIT_MEMORY_VNODE;
1017	break;
1018	case P_JETSAM_HIWAT:
1019	kn->kn_data \|= NOTE_EXIT_MEMORY_HIWAT;
1020	break;
1021	case P_JETSAM_PID:
1022	kn->kn_data \|= NOTE_EXIT_MEMORY_PID;
1023	break;
1024	case P_JETSAM_IDLEEXIT:
1025	kn->kn_data \|= NOTE_EXIT_MEMORY_IDLE;
1026	break;
1027	}
1028	}
1029	if ((kn->kn_ptr.p_proc->p_csflags &
1030	CS_KILLED) != `0`) {
1031	kn->kn_data \|= NOTE_EXIT_CSERROR;
1032	}
1033	}
1034	}
1035
1036	/ if we have any matching state, activate the knote /
1037	return (kn->kn_fflags != `0`);
1038	}
1039
1040	static int
1041	filt_proctouch(struct knote kn, struct* kevent_internal_s *kev)
1042	{
1043	int res;
1044
1045	proc_klist_lock();
1046
1047	/ accept new filter flags and mask off output events no long interesting /
1048	kn->kn_sfflags = kev->fflags;
1049
1050	/ restrict the current results to the (smaller?) set of new interest /
1051	/*
1052	* For compatibility with previous implementations, we leave kn_fflags
1053	* as they were before.
1054	*/
1055	//kn->kn_fflags &= kn->kn_sfflags;
1056
1057	res = (kn->kn_fflags != `0`);
1058
1059	proc_klist_unlock();
1060
1061	return res;
1062	}
1063
1064	static int
1065	filt_procprocess(struct knote kn, struct* filt_process_s data, struct* kevent_internal_s *kev)
1066	{
1067	#pragma unused(data)
1068	int res;
1069
1070	proc_klist_lock();
1071	res = (kn->kn_fflags != `0`);
1072	if (res) {
1073	*kev = kn->kn_kevent;
1074	kn->kn_flags \|= EV_CLEAR; / automatically set /
1075	kn->kn_fflags = `0`;
1076	kn->kn_data = `0`;
1077	}
1078	proc_klist_unlock();
1079	return res;
1080	}
1081
1082	SECURITY_READ_ONLY_EARLY(static struct filterops) proc_filtops = {
1083	.f_attach = filt_procattach,
1084	.f_detach = filt_procdetach,
1085	.f_event = filt_proc,
1086	.f_touch = filt_proctouch,
1087	.f_process = filt_procprocess,
1088	};
1089
1090	#pragma mark timer_filtops
1091
/*
 * Timer parameters computed by filt_timervalidate() and committed to a
 * knote by filt_timer_set_params().
 */
struct filt_timer_params {
	uint64_t deadline; /* deadline in abs/cont time
	                      (or 0 if NOTE_ABSOLUTE and deadline is in past) */
	uint64_t leeway;   /* leeway in abstime, or 0 if none */
	uint64_t interval; /* interval in abstime or 0 if non-repeating timer */
};
1098
/*
 * State kept in a timer knote while at rest (Mach absolute time units):
 *
 * kn->kn_hook          the thread_call object
 * kn->kn_ext[0]        next deadline, or 0 for immediate expiration
 * kn->kn_ext[1]        leeway value
 * kn->kn_sdata         interval timer: the interval
 *                      absolute/deadline timer: 0
 * kn->kn_hookid        timer state, one of the values below
 *
 * TIMER_IDLE:
 *   The timer has either never been scheduled or been cancelled.
 *   It is safe to schedule a new one in this state.
 *
 * TIMER_ARMED:
 *   The timer has been scheduled.
 *
 * TIMER_FIRED:
 *   The timer has fired and an event needs to be delivered.
 *   When in this state, the callout may still be running.
 *
 * TIMER_IMMEDIATE:
 *   The timer has fired at registration time, and the callout was never
 *   dispatched.
 */
#define TIMER_IDLE       0x0
#define TIMER_ARMED      0x1
#define TIMER_FIRED      0x2
#define TIMER_IMMEDIATE  0x3
1128
1129	static void
1130	filt_timer_set_params(struct knote kn, struct* filt_timer_params *params)
1131	{
1132	kn->kn_ext[`0`] = params->deadline;
1133	kn->kn_ext[`1`] = params->leeway;
1134	kn->kn_sdata = params->interval;
1135	}
1136
1137	/*
1138	* filt_timervalidate - process data from user
1139	*
1140	* Sets up the deadline, interval, and leeway from the provided user data
1141	*
1142	* Input:
1143	* kn_sdata timer deadline or interval time
1144	* kn_sfflags style of timer, unit of measurement
1145	*
1146	* Output:
1147	* struct filter_timer_params to apply to the filter with
1148	* filt_timer_set_params when changes are ready to be commited.
1149	*
1150	* Returns:
1151	* EINVAL Invalid user data parameters
1152	* ERANGE Various overflows with the parameters
1153	*
1154	* Called with timer filter lock held.
1155	*/
1156	static int
1157	filt_timervalidate(const struct kevent_internal_s *kev,
1158	struct filt_timer_params *params)
1159	{
1160	/*
1161	* There are 5 knobs that need to be chosen for a timer registration:
1162	*
1163	* A) Units of time (what is the time duration of the specified number)
1164	* Absolute and interval take:
1165	* NOTE_SECONDS, NOTE_USECONDS, NOTE_NSECONDS, NOTE_MACHTIME
1166	* Defaults to milliseconds if not specified
1167	*
1168	* B) Clock epoch (what is the zero point of the specified number)
1169	* For interval, there is none
1170	* For absolute, defaults to the gettimeofday/calendar epoch
1171	* With NOTE_MACHTIME, uses mach_absolute_time()
1172	* With NOTE_MACHTIME and NOTE_MACH_CONTINUOUS_TIME, uses mach_continuous_time()
1173	*
1174	* C) The knote's behavior on delivery
1175	* Interval timer causes the knote to arm for the next interval unless one-shot is set
1176	* Absolute is a forced one-shot timer which deletes on delivery
1177	* TODO: Add a way for absolute to be not forced one-shot
1178	*
1179	* D) Whether the time duration is relative to now or absolute
1180	* Interval fires at now + duration when it is set up
1181	* Absolute fires at now + difference between now walltime and passed in walltime
1182	* With NOTE_MACHTIME it fires at an absolute MAT or MCT.
1183	*
1184	* E) Whether the timer continues to tick across sleep
1185	* By default all three do not.
1186	* For interval and absolute, NOTE_MACH_CONTINUOUS_TIME causes them to tick across sleep
1187	* With NOTE_ABSOLUTE \| NOTE_MACHTIME \| NOTE_MACH_CONTINUOUS_TIME:
1188	* expires when mach_continuous_time() is > the passed in value.
1189	*/
1190
1191	uint64_t multiplier;
1192
1193	boolean_t use_abstime = FALSE;
1194
1195	switch (kev->fflags & (NOTE_SECONDS\|NOTE_USECONDS\|NOTE_NSECONDS\|NOTE_MACHTIME)) {
1196	case NOTE_SECONDS:
1197	multiplier = NSEC_PER_SEC;
1198	break;
1199	case NOTE_USECONDS:
1200	multiplier = NSEC_PER_USEC;
1201	break;
1202	case NOTE_NSECONDS:
1203	multiplier = `1`;
1204	break;
1205	case NOTE_MACHTIME:
1206	multiplier = `0`;
1207	use_abstime = TRUE;
1208	break;
1209	case `0`: / milliseconds (default) /
1210	multiplier = NSEC_PER_SEC / `1000`;
1211	break;
1212	default:
1213	return (EINVAL);
1214	}
1215
1216	/ transform the leeway in kn_ext[1] to same time scale /
1217	if (kev->fflags & NOTE_LEEWAY) {
1218	uint64_t leeway_abs;
1219
1220	if (use_abstime) {
1221	leeway_abs = (uint64_t)kev->ext[`1`];
1222	} else {
1223	uint64_t leeway_ns;
1224	if (os_mul_overflow((uint64_t)kev->ext[`1`], multiplier, &leeway_ns))
1225	return (ERANGE);
1226
1227	nanoseconds_to_absolutetime(leeway_ns, &leeway_abs);
1228	}
1229
1230	params->leeway = leeway_abs;
1231	} else {
1232	params->leeway = `0`;
1233	}
1234
1235	if (kev->fflags & NOTE_ABSOLUTE) {
1236	uint64_t deadline_abs;
1237
1238	if (use_abstime) {
1239	deadline_abs = (uint64_t)kev->data;
1240	} else {
1241	uint64_t calendar_deadline_ns;
1242
1243	if (os_mul_overflow((uint64_t)kev->data, multiplier, &calendar_deadline_ns))
1244	return (ERANGE);
1245
1246	/ calendar_deadline_ns is in nanoseconds since the epoch /
1247
1248	clock_sec_t seconds;
1249	clock_nsec_t nanoseconds;
1250
1251	/*
1252	* Note that the conversion through wall-time is only done once.
1253	*
1254	* If the relationship between MAT and gettimeofday changes,
1255	* the underlying timer does not update.
1256	*
1257	* TODO: build a wall-time denominated timer_call queue
1258	* and a flag to request DTRTing with wall-time timers
1259	*/
1260	clock_get_calendar_nanotime(&seconds, &nanoseconds);
1261
1262	uint64_t calendar_now_ns = (uint64_t)seconds * NSEC_PER_SEC + nanoseconds;
1263
1264	/ if deadline is in the future /
1265	if (calendar_now_ns < calendar_deadline_ns) {
1266	uint64_t interval_ns = calendar_deadline_ns - calendar_now_ns;
1267	uint64_t interval_abs;
1268
1269	nanoseconds_to_absolutetime(interval_ns, &interval_abs);
1270
1271	/*
1272	* Note that the NOTE_MACH_CONTINUOUS_TIME flag here only
1273	* causes the timer to keep ticking across sleep, but
1274	* it does not change the calendar timebase.
1275	*/
1276
1277	if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME)
1278	clock_continuoustime_interval_to_deadline(interval_abs,
1279	&deadline_abs);
1280	else
1281	clock_absolutetime_interval_to_deadline(interval_abs,
1282	&deadline_abs);
1283	} else {
1284	deadline_abs = `0`; / cause immediate expiration /
1285	}
1286	}
1287
1288	params->deadline = deadline_abs;
1289	params->interval = `0`; / NOTE_ABSOLUTE is non-repeating /
1290	} else if (kev->data < `0`) {
1291	/*
1292	* Negative interval timers fire immediately, once.
1293	*
1294	* Ideally a negative interval would be an error, but certain clients
1295	* pass negative values on accident, and expect an event back.
1296	*
1297	* In the old implementation the timer would repeat with no delay
1298	* N times until mach_absolute_time() + (N * interval) underflowed,
1299	* then it would wait ~forever by accidentally arming a timer for the far future.
1300	*
1301	* We now skip the power-wasting hot spin phase and go straight to the idle phase.
1302	*/
1303
1304	params->deadline = `0`; / expire immediately /
1305	params->interval = `0`; / non-repeating /
1306	} else {
1307	uint64_t interval_abs = `0`;
1308
1309	if (use_abstime) {
1310	interval_abs = (uint64_t)kev->data;
1311	} else {
1312	uint64_t interval_ns;
1313	if (os_mul_overflow((uint64_t)kev->data, multiplier, &interval_ns))
1314	return (ERANGE);
1315
1316	nanoseconds_to_absolutetime(interval_ns, &interval_abs);
1317	}
1318
1319	uint64_t deadline = `0`;
1320
1321	if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME)
1322	clock_continuoustime_interval_to_deadline(interval_abs, &deadline);
1323	else
1324	clock_absolutetime_interval_to_deadline(interval_abs, &deadline);
1325
1326	params->deadline = deadline;
1327	params->interval = interval_abs;
1328	}
1329
1330	return (`0`);
1331	}
1332
1333	/*
1334	* filt_timerexpire - the timer callout routine
1335	*/
1336	static void
1337	filt_timerexpire(void knx, __unused void* *spare)
1338	{
1339	struct knote *kn = knx;
1340	int v;
1341
1342	if (os_atomic_cmpxchgv(&kn->kn_hookid, TIMER_ARMED, TIMER_FIRED,
1343	&v, relaxed)) {
1344	// our f_event always would say FILTER_ACTIVE,
1345	// so be leaner and just do it.
1346	struct kqueue *kq = knote_get_kq(kn);
1347	kqlock(kq);
1348	knote_activate(kn);
1349	kqunlock(kq);
1350	} else {
1351	/*
1352	* From TIMER_ARMED, the only allowed transition are:
1353	* - to TIMER_FIRED through the timer callout just above
1354	* - to TIMER_IDLE due to filt_timercancel() which will wait for the
1355	* timer callout (and any possible invocation of filt_timerexpire) to
1356	* have finished before the state is changed again.
1357	*/
1358	assert(v == TIMER_IDLE);
1359	}
1360	}
1361
1362	static void
1363	filt_timercancel(struct knote *kn)
1364	{
1365	if (os_atomic_xchg(&kn->kn_hookid, TIMER_IDLE, relaxed) == TIMER_ARMED) {
1366	/ cancel the thread call and wait for any filt_timerexpire in flight /
1367	thread_call_cancel_wait((thread_call_t)kn->kn_hook);
1368	}
1369	}
1370
1371	/*
1372	* Does this deadline needs a timer armed for it, or has it expired?
1373	*/
1374	static bool
1375	filt_timer_is_ready(struct knote *kn)
1376	{
1377	uint64_t now, deadline = kn->kn_ext[`0`];
1378
1379	if (deadline == `0`) {
1380	return true;
1381	}
1382
1383	if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
1384	now = mach_continuous_time();
1385	} else {
1386	now = mach_absolute_time();
1387	}
1388	return deadline <= now;
1389	}
1390
1391	/*
1392	* Arm a timer
1393	*
1394	* It is the responsibility of the caller to make sure the timer call
1395	* has completed or been cancelled properly prior to arming it.
1396	*/
1397	static void
1398	filt_timerarm(struct knote *kn)
1399	{
1400	uint64_t deadline = kn->kn_ext[`0`];
1401	uint64_t leeway = kn->kn_ext[`1`];
1402
1403	int filter_flags = kn->kn_sfflags;
1404	unsigned int timer_flags = `0`;
1405
1406	assert(os_atomic_load(&kn->kn_hookid, relaxed) == TIMER_IDLE);
1407
1408	if (filter_flags & NOTE_CRITICAL)
1409	timer_flags \|= THREAD_CALL_DELAY_USER_CRITICAL;
1410	else if (filter_flags & NOTE_BACKGROUND)
1411	timer_flags \|= THREAD_CALL_DELAY_USER_BACKGROUND;
1412	else
1413	timer_flags \|= THREAD_CALL_DELAY_USER_NORMAL;
1414
1415	if (filter_flags & NOTE_LEEWAY)
1416	timer_flags \|= THREAD_CALL_DELAY_LEEWAY;
1417
1418	if (filter_flags & NOTE_MACH_CONTINUOUS_TIME)
1419	timer_flags \|= THREAD_CALL_CONTINUOUS;
1420
1421	os_atomic_store(&kn->kn_hookid, TIMER_ARMED, relaxed);
1422	thread_call_enter_delayed_with_leeway((thread_call_t)kn->kn_hook, NULL,
1423	deadline, leeway, timer_flags);
1424	}
1425
1426	/*
1427	* Allocate a thread call for the knote's lifetime, and kick off the timer.
1428	*/
1429	static int
1430	filt_timerattach(struct knote kn, struct* kevent_internal_s *kev)
1431	{
1432	thread_call_t callout;
1433	struct filt_timer_params params;
1434	int error;
1435
1436	if ((error = filt_timervalidate(kev, &params)) != `0`) {
1437	knote_set_error(kn, error);
1438	return `0`;
1439	}
1440
1441	callout = thread_call_allocate_with_options(filt_timerexpire,
1442	(thread_call_param_t)kn, THREAD_CALL_PRIORITY_HIGH,
1443	THREAD_CALL_OPTIONS_ONCE);
1444
1445	if (NULL == callout) {
1446	knote_set_error(kn, ENOMEM);
1447	return `0`;
1448	}
1449
1450	filt_timer_set_params(kn, &params);
1451	kn->kn_hook = callout;
1452	kn->kn_flags \|= EV_CLEAR;
1453	os_atomic_store(&kn->kn_hookid, TIMER_IDLE, relaxed);
1454
1455	/ NOTE_ABSOLUTE implies EV_ONESHOT /
1456	if (kn->kn_sfflags & NOTE_ABSOLUTE)
1457	kn->kn_flags \|= EV_ONESHOT;
1458
1459	if (filt_timer_is_ready(kn)) {
1460	os_atomic_store(&kn->kn_hookid, TIMER_IMMEDIATE, relaxed);
1461	return FILTER_ACTIVE;
1462	} else {
1463	filt_timerarm(kn);
1464	return `0`;
1465	}
1466	}
1467
1468	/*
1469	* Shut down the timer if it's running, and free the callout.
1470	*/
1471	static void
1472	filt_timerdetach(struct knote *kn)
1473	{
1474	__assert_only boolean_t freed;
1475
1476	/*
1477	* Unconditionally cancel to make sure there can't be any filt_timerexpire()
1478	* running anymore.
1479	*/
1480	thread_call_cancel_wait((thread_call_t)kn->kn_hook);
1481	freed = thread_call_free((thread_call_t)kn->kn_hook);
1482	assert(freed);
1483	}
1484
1485	/*
1486	* filt_timertouch - update timer knote with new user input
1487	*
1488	* Cancel and restart the timer based on new user data. When
1489	* the user picks up a knote, clear the count of how many timer
1490	* pops have gone off (in kn_data).
1491	*/
1492	static int
1493	filt_timertouch(struct knote kn, struct* kevent_internal_s *kev)
1494	{
1495	struct filt_timer_params params;
1496	uint32_t changed_flags = (kn->kn_sfflags ^ kev->fflags);
1497	int error;
1498
1499	if (changed_flags & NOTE_ABSOLUTE) {
1500	kev->flags \|= EV_ERROR;
1501	kev->data = EINVAL;
1502	return `0`;
1503	}
1504
1505	if ((error = filt_timervalidate(kev, &params)) != `0`) {
1506	kev->flags \|= EV_ERROR;
1507	kev->data = error;
1508	return `0`;
1509	}
1510
1511	/ capture the new values used to compute deadline /
1512	filt_timercancel(kn);
1513	filt_timer_set_params(kn, &params);
1514	kn->kn_sfflags = kev->fflags;
1515
1516	if (filt_timer_is_ready(kn)) {
1517	os_atomic_store(&kn->kn_hookid, TIMER_IMMEDIATE, relaxed);
1518	return FILTER_ACTIVE \| FILTER_UPDATE_REQ_QOS;
1519	} else {
1520	filt_timerarm(kn);
1521	return FILTER_UPDATE_REQ_QOS;
1522	}
1523	}
1524
1525	/*
1526	* filt_timerprocess - query state of knote and snapshot event data
1527	*
1528	* Determine if the timer has fired in the past, snapshot the state
1529	* of the kevent for returning to user-space, and clear pending event
1530	* counters for the next time.
1531	*/
1532	static int
1533	filt_timerprocess(
1534	struct knote *kn,
1535	__unused struct filt_process_s *data,
1536	struct kevent_internal_s *kev)
1537	{
1538	/*
1539	* filt_timerprocess is serialized with any filter routine except for
1540	* filt_timerexpire which atomically does a TIMER_ARMED -> TIMER_FIRED
1541	* transition, and on success, activates the knote.
1542	*
1543	* Hence, we don't need atomic modifications of the state, only to peek at
1544	* whether we see any of the "FIRED" state, and if we do, it is safe to
1545	* do simple state machine transitions.
1546	*/
1547	switch (os_atomic_load(&kn->kn_hookid, relaxed)) {
1548	case TIMER_IDLE:
1549	case TIMER_ARMED:
1550	/*
1551	* This can happen if a touch resets a timer that had fired
1552	* without being processed
1553	*/
1554	return `0`;
1555	}
1556
1557	os_atomic_store(&kn->kn_hookid, TIMER_IDLE, relaxed);
1558
1559	/*
1560	* Copy out the interesting kevent state,
1561	* but don't leak out the raw time calculations.
1562	*
1563	* TODO: potential enhancements - tell the user about:
1564	* - deadline to which this timer thought it was expiring
1565	* - return kn_sfflags in the fflags field so the client can know
1566	* under what flags the timer fired
1567	*/
1568	*kev = kn->kn_kevent;
1569	kev->ext[`0`] = `0`;
1570	/ kev->ext[1] = 0; JMM - shouldn't we hide this too? /
1571
1572	if (kn->kn_sdata == `0`) {
1573	kev->data = `1`;
1574	} else {
1575	/*
1576	* This is a 'repeating' timer, so we have to emit
1577	* how many intervals expired between the arm
1578	* and the process.
1579	*
1580	* A very strange style of interface, because
1581	* this could easily be done in the client...
1582	*/
1583
1584	uint64_t now;
1585
1586	if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME)
1587	now = mach_continuous_time();
1588	else
1589	now = mach_absolute_time();
1590
1591	uint64_t first_deadline = kn->kn_ext[`0`];
1592	uint64_t interval_abs = kn->kn_sdata;
1593	uint64_t orig_arm_time = first_deadline - interval_abs;
1594
1595	assert(now > orig_arm_time);
1596	assert(now > first_deadline);
1597
1598	uint64_t elapsed = now - orig_arm_time;
1599
1600	uint64_t num_fired = elapsed / interval_abs;
1601
1602	/*
1603	* To reach this code, we must have seen the timer pop
1604	* and be in repeating mode, so therefore it must have been
1605	* more than 'interval' time since the attach or last
1606	* successful touch.
1607	*/
1608	assert(num_fired > `0`);
1609
1610	/ report how many intervals have elapsed to the user /
1611	kev->data = (int64_t)num_fired;
1612
1613	/ We only need to re-arm the timer if it's not about to be destroyed /
1614	if ((kn->kn_flags & EV_ONESHOT) == `0`) {
1615	/ fire at the end of the next interval /
1616	uint64_t new_deadline = first_deadline + num_fired * interval_abs;
1617
1618	assert(new_deadline > now);
1619
1620	kn->kn_ext[`0`] = new_deadline;
1621
1622	/*
1623	* This can't shortcut setting up the thread call, because
1624	* knote_process deactivates EV_CLEAR knotes unconditionnally.
1625	*/
1626	filt_timerarm(kn);
1627	}
1628	}
1629
1630	return FILTER_ACTIVE;
1631	}
1632
1633	SECURITY_READ_ONLY_EARLY(static struct filterops) timer_filtops = {
1634	.f_extended_codes = true,
1635	.f_attach = filt_timerattach,
1636	.f_detach = filt_timerdetach,
1637	.f_event = filt_badevent,
1638	.f_touch = filt_timertouch,
1639	.f_process = filt_timerprocess,
1640	};
1641
1642	#pragma mark user_filtops
1643
1644	static int
1645	filt_userattach(struct knote kn, __unused struct* kevent_internal_s *kev)
1646	{
1647	if (kn->kn_sfflags & NOTE_TRIGGER) {
1648	kn->kn_hookid = FILTER_ACTIVE;
1649	} else {
1650	kn->kn_hookid = `0`;
1651	}
1652	return (kn->kn_hookid);
1653	}
1654
1655	static void
1656	filt_userdetach(__unused struct knote *kn)
1657	{
1658	/ EVFILT_USER knotes are not attached to anything in the kernel /
1659	}
1660
1661	static int
1662	filt_usertouch(struct knote kn, struct* kevent_internal_s *kev)
1663	{
1664	uint32_t ffctrl;
1665	int fflags;
1666
1667	ffctrl = kev->fflags & NOTE_FFCTRLMASK;
1668	fflags = kev->fflags & NOTE_FFLAGSMASK;
1669	switch (ffctrl) {
1670	case NOTE_FFNOP:
1671	break;
1672	case NOTE_FFAND:
1673	kn->kn_sfflags &= fflags;
1674	break;
1675	case NOTE_FFOR:
1676	kn->kn_sfflags \|= fflags;
1677	break;
1678	case NOTE_FFCOPY:
1679	kn->kn_sfflags = fflags;
1680	break;
1681	}
1682	kn->kn_sdata = kev->data;
1683
1684	if (kev->fflags & NOTE_TRIGGER) {
1685	kn->kn_hookid = FILTER_ACTIVE;
1686	}
1687	return (int)kn->kn_hookid;
1688	}
1689
1690	static int
1691	filt_userprocess(
1692	struct knote *kn,
1693	__unused struct filt_process_s *data,
1694	struct kevent_internal_s *kev)
1695	{
1696	int result = (int)kn->kn_hookid;
1697
1698	if (result) {
1699	*kev = kn->kn_kevent;
1700	kev->fflags = kn->kn_sfflags;
1701	kev->data = kn->kn_sdata;
1702	if (kn->kn_flags & EV_CLEAR) {
1703	kn->kn_hookid = `0`;
1704	kn->kn_data = `0`;
1705	kn->kn_fflags = `0`;
1706	}
1707	}
1708
1709	return result;
1710	}
1711
1712	SECURITY_READ_ONLY_EARLY(static struct filterops) user_filtops = {
1713	.f_extended_codes = true,
1714	.f_attach = filt_userattach,
1715	.f_detach = filt_userdetach,
1716	.f_event = filt_badevent,
1717	.f_touch = filt_usertouch,
1718	.f_process = filt_userprocess,
1719	};
1720
1721	#pragma mark workloop_filtops
1722
1723	static inline void
1724	filt_wllock(struct kqworkloop *kqwl)
1725	{
1726	lck_mtx_lock(&kqwl->kqwl_statelock);
1727	}
1728
1729	static inline void
1730	filt_wlunlock(struct kqworkloop *kqwl)
1731	{
1732	lck_mtx_unlock(&kqwl->kqwl_statelock);
1733	}
1734
1735	/*
1736	* Returns true when the interlock for the turnstile is the workqueue lock
1737	*
1738	* When this is the case, all turnstiles operations are delegated
1739	* to the workqueue subsystem.
1740	*
1741	* This is required because kqueue_threadreq_bind_prepost only holds the
1742	* workqueue lock but needs to move the inheritor from the workloop turnstile
1743	* away from the creator thread, so that this now fulfilled request cannot be
1744	* picked anymore by other threads.
1745	*/
1746	static inline bool
1747	filt_wlturnstile_interlock_is_workq(struct kqworkloop *kqwl)
1748	{
1749	struct kqrequest *kqr = &kqwl->kqwl_request;
1750	return (kqr->kqr_state & KQR_THREQUESTED) &&
1751	(kqr->kqr_thread == THREAD_NULL);
1752	}
1753
1754	static void
1755	filt_wlupdate_inheritor(struct kqworkloop kqwl, struct* turnstile *ts,
1756	turnstile_update_flags_t flags)
1757	{
1758	turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL;
1759	struct kqrequest *kqr = &kqwl->kqwl_request;
1760
1761	/*
1762	* binding to the workq should always happen through
1763	* workq_kern_threadreq_update_inheritor()
1764	*/
1765	assert(!filt_wlturnstile_interlock_is_workq(kqwl));
1766
1767	if ((inheritor = kqwl->kqwl_owner)) {
1768	flags \|= TURNSTILE_INHERITOR_THREAD;
1769	} else if ((inheritor = kqr->kqr_thread)) {
1770	flags \|= TURNSTILE_INHERITOR_THREAD;
1771	}
1772
1773	turnstile_update_inheritor(ts, inheritor, flags);
1774	}
1775
/* Operation codes for the `op` argument of filt_wlupdate(). */
#define FILT_WLATTACH  0
#define FILT_WLTOUCH   1
#define FILT_WLDROP    2
1779
1780	__result_use_check
1781	static int
1782	filt_wlupdate(struct kqworkloop kqwl, struct* knote *kn,
1783	struct kevent_internal_s kev, kq_index_t qos_index, int* op)
1784	{
1785	user_addr_t uaddr = CAST_USER_ADDR_T(kev->ext[EV_EXTIDX_WL_ADDR]);
1786	struct kqrequest *kqr = &kqwl->kqwl_request;
1787	thread_t cur_owner, new_owner, extra_thread_ref = THREAD_NULL;
1788	kq_index_t cur_owner_override = THREAD_QOS_UNSPECIFIED;
1789	int action = KQWL_UTQ_NONE, error = `0`;
1790	bool needs_wake = false, needs_wllock = false;
1791	uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
1792	uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK];
1793	uint64_t udata = `0`;
1794
1795	if (kev->fflags & (NOTE_WL_END_OWNERSHIP \| NOTE_WL_DISCOVER_OWNER)) {
1796	/*
1797	* If we're maybe going to change the kqwl_owner,
1798	* then we need to hold the filt_wllock().
1799	*/
1800	needs_wllock = true;
1801	} else if (kqr->kqr_thread == current_thread()) {
1802	/*
1803	* <rdar://problem/41531764> Servicer updates need to be serialized with
1804	* any ownership change too, as the kqr_thread value influences the
1805	* outcome of handling NOTE_WL_DISCOVER_OWNER.
1806	*/
1807	needs_wllock = true;
1808	}
1809
1810	if (needs_wllock) {
1811	filt_wllock(kqwl);
1812	/*
1813	* The kqwl owner is set under both the req and filter lock,
1814	* meaning it's fine to look at it under any.
1815	*/
1816	new_owner = cur_owner = kqwl->kqwl_owner;
1817	} else {
1818	new_owner = cur_owner = THREAD_NULL;
1819	}
1820
1821	/*
1822	* Phase 1:
1823	*
1824	* If asked, load the uint64 value at the user provided address and compare
1825	* it against the passed in mask and expected value.
1826	*
1827	* If NOTE_WL_DISCOVER_OWNER is specified, translate the loaded name as
1828	* a thread reference.
1829	*
1830	* If NOTE_WL_END_OWNERSHIP is specified and the currently known owner is
1831	* the current thread, then end ownership.
1832	*
1833	* Lastly decide whether we need to perform a QoS update.
1834	*/
1835	if (uaddr) {
1836	error = copyin_word(uaddr, &udata, sizeof(udata));
1837	if (error) {
1838	goto out;
1839	}
1840
1841	/ Update state as copied in. /
1842	kev->ext[EV_EXTIDX_WL_VALUE] = udata;
1843
1844	if ((udata & mask) != (kdata & mask)) {
1845	error = ESTALE;
1846	} else if (kev->fflags & NOTE_WL_DISCOVER_OWNER) {
1847	/*
1848	* Decipher the owner port name, and translate accordingly.
1849	* The low 2 bits were borrowed for other flags, so mask them off.
1850	*
1851	* Then attempt translation to a thread reference or fail.
1852	*/
1853	mach_port_name_t name = (mach_port_name_t)udata & ~`0x3`;
1854	if (name != MACH_PORT_NULL) {
1855	name = ipc_entry_name_mask(name);
1856	extra_thread_ref = port_name_to_thread(name);
1857	if (extra_thread_ref == THREAD_NULL) {
1858	error = EOWNERDEAD;
1859	goto out;
1860	}
1861	new_owner = extra_thread_ref;
1862	}
1863	}
1864	}
1865
1866	if ((kev->fflags & NOTE_WL_END_OWNERSHIP) && new_owner == current_thread()) {
1867	new_owner = THREAD_NULL;
1868	}
1869
1870	if (error == `0`) {
1871	if ((kev->fflags & NOTE_WL_THREAD_REQUEST) && (kev->flags & EV_DELETE)) {
1872	action = KQWL_UTQ_SET_QOS_INDEX;
1873	} else if (qos_index && kqr->kqr_qos_index != qos_index) {
1874	action = KQWL_UTQ_SET_QOS_INDEX;
1875	}
1876
1877	if (op == FILT_WLTOUCH) {
1878	/*
1879	* Save off any additional fflags/data we just accepted
1880	* But only keep the last round of "update" bits we acted on which helps
1881	* debugging a lot.
1882	*/
1883	kn->kn_sfflags &= ~NOTE_WL_UPDATES_MASK;
1884	kn->kn_sfflags \|= kev->fflags;
1885	kn->kn_sdata = kev->data;
1886	if (kev->fflags & NOTE_WL_SYNC_WAKE) {
1887	needs_wake = (kn->kn_hook != THREAD_NULL);
1888	}
1889	} else if (op == FILT_WLDROP) {
1890	if ((kn->kn_sfflags & (NOTE_WL_SYNC_WAIT \| NOTE_WL_SYNC_WAKE)) ==
1891	NOTE_WL_SYNC_WAIT) {
1892	/*
1893	* When deleting a SYNC_WAIT knote that hasn't been woken up
1894	* explicitly, issue a wake up.
1895	*/
1896	kn->kn_sfflags \|= NOTE_WL_SYNC_WAKE;
1897	needs_wake = (kn->kn_hook != THREAD_NULL);
1898	}
1899	}
1900	}
1901
1902	/*
1903	* Phase 2:
1904	*
1905	* Commit ownership and QoS changes if any, possibly wake up waiters
1906	*/
1907
1908	if (cur_owner == new_owner && action == KQWL_UTQ_NONE && !needs_wake) {
1909	goto out;
1910	}
1911
1912	kq_req_lock(kqwl);
1913
1914	/ If already tracked as servicer, don't track as owner /
1915	if (new_owner == kqr->kqr_thread) {
1916	new_owner = THREAD_NULL;
1917	}
1918
1919	if (cur_owner != new_owner) {
1920	kqwl->kqwl_owner = new_owner;
1921	if (new_owner == extra_thread_ref) {
1922	/ we just transfered this ref to kqwl_owner /
1923	extra_thread_ref = THREAD_NULL;
1924	}
1925	cur_owner_override = kqworkloop_owner_override(kqwl);
1926
1927	if (cur_owner) {
1928	thread_ends_owning_workloop(cur_owner);
1929	}
1930
1931	if (new_owner) {
1932	/ override it before we drop the old /
1933	if (cur_owner_override != THREAD_QOS_UNSPECIFIED) {
1934	thread_add_ipc_override(new_owner, cur_owner_override);
1935	}
1936	thread_starts_owning_workloop(new_owner);
1937	if ((kqr->kqr_state & KQR_THREQUESTED) && !kqr->kqr_thread) {
1938	if (action == KQWL_UTQ_NONE) {
1939	action = KQWL_UTQ_REDRIVE_EVENTS;
1940	}
1941	}
1942	} else {
1943	if ((kqr->kqr_state & (KQR_THREQUESTED \| KQR_WAKEUP)) == KQR_WAKEUP) {
1944	if (action == KQWL_UTQ_NONE) {
1945	action = KQWL_UTQ_REDRIVE_EVENTS;
1946	}
1947	}
1948	}
1949	}
1950
1951	struct turnstile *ts = kqwl->kqwl_turnstile;
1952	bool wl_inheritor_updated = false;
1953
1954	if (action != KQWL_UTQ_NONE) {
1955	kqworkloop_update_threads_qos(kqwl, action, qos_index);
1956	}
1957
1958	if (cur_owner != new_owner && ts) {
1959	if (action == KQWL_UTQ_REDRIVE_EVENTS) {
1960	/*
1961	* Note that when action is KQWL_UTQ_REDRIVE_EVENTS,
1962	* the code went through workq_kern_threadreq_initiate()
1963	* and the workqueue has set the inheritor already
1964	*/
1965	assert(filt_wlturnstile_interlock_is_workq(kqwl));
1966	} else if (filt_wlturnstile_interlock_is_workq(kqwl)) {
1967	workq_kern_threadreq_lock(kqwl->kqwl_p);
1968	workq_kern_threadreq_update_inheritor(kqwl->kqwl_p, kqr, new_owner,
1969	ts, TURNSTILE_IMMEDIATE_UPDATE);
1970	workq_kern_threadreq_unlock(kqwl->kqwl_p);
1971	if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
1972	/*
1973	* If the workq is no longer the interlock, then
1974	* workq_kern_threadreq_update_inheritor() has finished a bind
1975	* and we need to fallback to the regular path.
1976	*/
1977	filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
1978	}
1979	wl_inheritor_updated = true;
1980	} else {
1981	filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
1982	wl_inheritor_updated = true;
1983	}
1984
1985	/*
1986	* We need a turnstile reference because we are dropping the interlock
1987	* and the caller has not called turnstile_prepare.
1988	*/
1989	if (wl_inheritor_updated) {
1990	turnstile_reference(ts);
1991	}
1992	}
1993
1994	if (needs_wake && ts) {
1995	waitq_wakeup64_thread(&ts->ts_waitq, CAST_EVENT64_T((event_t)kn),
1996	(thread_t)kn->kn_hook, THREAD_AWAKENED);
1997	}
1998
1999	kq_req_unlock(kqwl);
2000
2001	if (wl_inheritor_updated) {
2002	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
2003	turnstile_deallocate(ts);
2004	}
2005
2006	out:
2007	/*
2008	* Phase 3:
2009	*
2010	* Unlock and cleanup various lingering references and things.
2011	*/
2012	if (needs_wllock) {
2013	filt_wlunlock(kqwl);
2014	}
2015
2016	#if CONFIG_WORKLOOP_DEBUG
2017	KQWL_HISTORY_WRITE_ENTRY(kqwl, {
2018	.updater = current_thread(),
2019	.servicer = kqr->kqr_thread, / Note: racy /
2020	.old_owner = cur_owner,
2021	.new_owner = new_owner,
2022
2023	.kev_ident = kev->ident,
2024	.error = (int16_t)error,
2025	.kev_flags = kev->flags,
2026	.kev_fflags = kev->fflags,
2027
2028	.kev_mask = mask,
2029	.kev_value = kdata,
2030	.in_value = udata,
2031	});
2032	#endif // CONFIG_WORKLOOP_DEBUG
2033
2034	if (cur_owner && new_owner != cur_owner) {
2035	if (cur_owner_override != THREAD_QOS_UNSPECIFIED) {
2036	thread_drop_ipc_override(cur_owner);
2037	}
2038	thread_deallocate(cur_owner);
2039	}
2040
2041	if (extra_thread_ref) {
2042	thread_deallocate(extra_thread_ref);
2043	}
2044	return error;
2045	}
2046
2047	/*
2048	* Remembers the last updated that came in from userspace for debugging reasons.
2049	* - fflags is mirrored from the userspace kevent
2050	* - ext[i, i != VALUE] is mirrored from the userspace kevent
2051	* - ext[VALUE] is set to what the kernel loaded atomically
2052	* - data is set to the error if any
2053	*/
2054	static inline void
2055	filt_wlremember_last_update(struct knote kn, struct* kevent_internal_s *kev,
2056	int error)
2057	{
2058	kn->kn_fflags = kev->fflags;
2059	kn->kn_data = error;
2060	memcpy(kn->kn_ext, kev->ext, sizeof(kev->ext));
2061	}
2062
2063	static int
2064	filt_wlattach(struct knote kn, struct* kevent_internal_s *kev)
2065	{
2066	struct kqueue *kq = knote_get_kq(kn);
2067	struct kqworkloop kqwl = (struct* kqworkloop *)kq;
2068	int error = `0`;
2069	kq_index_t qos_index = `0`;
2070
2071	if ((kq->kq_state & KQ_WORKLOOP) == `0`) {
2072	error = ENOTSUP;
2073	goto out;
2074	}
2075
2076	#if DEVELOPMENT \|\| DEBUG
2077	if (kev->ident == `0` && kev->udata == `0` && kev->fflags == `0`) {
2078	struct kqrequest *kqr = &kqwl->kqwl_request;
2079
2080	kq_req_lock(kqwl);
2081	kev->fflags = `0`;
2082	if (kqr->kqr_dsync_waiters) {
2083	kev->fflags \|= NOTE_WL_SYNC_WAIT;
2084	}
2085	if (kqr->kqr_qos_index) {
2086	kev->fflags \|= NOTE_WL_THREAD_REQUEST;
2087	}
2088	kev->ext[`0`] = thread_tid(kqwl->kqwl_owner);
2089	kev->ext[`1`] = thread_tid(kqwl->kqwl_request.kqr_thread);
2090	kev->ext[`2`] = thread_owned_workloops_count(current_thread());
2091	kev->ext[`3`] = kn->kn_kevent.ext[`3`];
2092	kq_req_unlock(kqwl);
2093	error = EBUSY;
2094	goto out;
2095	}
2096	#endif
2097
2098	int command = (kn->kn_sfflags & NOTE_WL_COMMANDS_MASK);
2099	switch (command) {
2100	case NOTE_WL_THREAD_REQUEST:
2101	if (kn->kn_id != kqwl->kqwl_dynamicid) {
2102	error = EINVAL;
2103	goto out;
2104	}
2105	qos_index = _pthread_priority_thread_qos(kn->kn_qos);
2106	if (qos_index == THREAD_QOS_UNSPECIFIED) {
2107	error = ERANGE;
2108	goto out;
2109	}
2110	if (kqwl->kqwl_request.kqr_qos_index) {
2111	/*
2112	* There already is a thread request, and well, you're only allowed
2113	* one per workloop, so fail the attach.
2114	*/
2115	error = EALREADY;
2116	goto out;
2117	}
2118	break;
2119	case NOTE_WL_SYNC_WAIT:
2120	case NOTE_WL_SYNC_WAKE:
2121	if (kn->kn_id == kqwl->kqwl_dynamicid) {
2122	error = EINVAL;
2123	goto out;
2124	}
2125	if ((kn->kn_flags & EV_DISABLE) == `0`) {
2126	error = EINVAL;
2127	goto out;
2128	}
2129	if (kn->kn_sfflags & NOTE_WL_END_OWNERSHIP) {
2130	error = EINVAL;
2131	goto out;
2132	}
2133	break;
2134	default:
2135	error = EINVAL;
2136	goto out;
2137	}
2138
2139	error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLATTACH);
2140
2141	out:
2142	if (error) {
2143	/ If userland wants ESTALE to be hidden, fail the attach anyway /
2144	if (error == ESTALE && (kn->kn_sfflags & NOTE_WL_IGNORE_ESTALE)) {
2145	error = `0`;
2146	}
2147	knote_set_error(kn, error);
2148	return `0`;
2149	}
2150	if (command == NOTE_WL_SYNC_WAIT) {
2151	return kevent_register_wait_prepare(kn, kev);
2152	}
2153	/ Just attaching the thread request successfully will fire it /
2154	if (command == NOTE_WL_THREAD_REQUEST) {
2155	/*
2156	* Thread Request knotes need an explicit touch to be active again,
2157	* so delivering an event needs to also consume it.
2158	*/
2159	kn->kn_flags \|= EV_CLEAR;
2160	return FILTER_ACTIVE;
2161	}
2162	return `0`;
2163	}
2164
2165	static void __dead2
2166	filt_wlwait_continue(void *parameter, wait_result_t wr)
2167	{
2168	struct _kevent_register *cont_args = parameter;
2169	struct kqworkloop kqwl = (struct* kqworkloop *)cont_args->kq;
2170	struct kqrequest *kqr = &kqwl->kqwl_request;
2171
2172	kq_req_lock(kqwl);
2173	kqr->kqr_dsync_waiters--;
2174	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
2175	workq_kern_threadreq_lock(kqwl->kqwl_p);
2176	turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL);
2177	workq_kern_threadreq_unlock(kqwl->kqwl_p);
2178	} else {
2179	turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL);
2180	}
2181	kq_req_unlock(kqwl);
2182
2183	turnstile_cleanup();
2184
2185	if (wr == THREAD_INTERRUPTED) {
2186	cont_args->kev.flags \|= EV_ERROR;
2187	cont_args->kev.data = EINTR;
2188	} else if (wr != THREAD_AWAKENED) {
2189	panic("Unexpected wait result: %d", wr);
2190	}
2191
2192	kevent_register_wait_return(cont_args);
2193	}
2194
2195	/*
2196	* Called with the workloop mutex held, most of the time never returns as it
2197	* calls filt_wlwait_continue through a continuation.
2198	*/
2199	static void __dead2
2200	filt_wlpost_register_wait(struct uthread uth, struct* knote_lock_ctx *knlc,
2201	struct _kevent_register *cont_args)
2202	{
2203	struct kqworkloop kqwl = (struct* kqworkloop *)cont_args->kq;
2204	struct kqrequest *kqr = &kqwl->kqwl_request;
2205	struct turnstile *ts;
2206	bool workq_locked = false;
2207
2208	kq_req_lock(kqwl);
2209
2210	kqr->kqr_dsync_waiters++;
2211
2212	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
2213	workq_kern_threadreq_lock(kqwl->kqwl_p);
2214	workq_locked = true;
2215	}
2216
2217	ts = turnstile_prepare((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
2218	TURNSTILE_NULL, TURNSTILE_WORKLOOPS);
2219
2220	if (workq_locked) {
2221	workq_kern_threadreq_update_inheritor(kqwl->kqwl_p,
2222	&kqwl->kqwl_request, kqwl->kqwl_owner, ts,
2223	TURNSTILE_DELAYED_UPDATE);
2224	if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
2225	/*
2226	* if the interlock is no longer the workqueue lock,
2227	* then we don't need to hold it anymore.
2228	*/
2229	workq_kern_threadreq_unlock(kqwl->kqwl_p);
2230	workq_locked = false;
2231	}
2232	}
2233	if (!workq_locked) {
2234	/*
2235	* If the interlock is the workloop's, then it's our responsibility to
2236	* call update_inheritor, so just do it.
2237	*/
2238	filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_DELAYED_UPDATE);
2239	}
2240
2241	thread_set_pending_block_hint(uth->uu_thread, kThreadWaitWorkloopSyncWait);
2242	waitq_assert_wait64(&ts->ts_waitq, CAST_EVENT64_T(cont_args->knote),
2243	THREAD_ABORTSAFE, TIMEOUT_WAIT_FOREVER);
2244
2245	if (workq_locked) {
2246	workq_kern_threadreq_unlock(kqwl->kqwl_p);
2247	}
2248
2249	thread_t thread = kqwl->kqwl_owner ?: kqr->kqr_thread;
2250	if (thread) {
2251	thread_reference(thread);
2252	}
2253	kq_req_unlock(kqwl);
2254
2255	kevent_register_wait_block(ts, thread, knlc, filt_wlwait_continue, cont_args);
2256	}
2257
2258	/ called in stackshot context to report the thread responsible for blocking this thread /
2259	void
2260	kdp_workloop_sync_wait_find_owner(__assert_only thread_t thread,
2261	event64_t event, thread_waitinfo_t *waitinfo)
2262	{
2263	struct knote kn = (struct* knote *)event;
2264	assert(kdp_is_in_zone(kn, "knote zone"));
2265
2266	assert(kn->kn_hook == thread);
2267
2268	struct kqueue *kq = knote_get_kq(kn);
2269	assert(kdp_is_in_zone(kq, "kqueue workloop zone"));
2270	assert(kq->kq_state & KQ_WORKLOOP);
2271
2272	struct kqworkloop kqwl = (struct* kqworkloop *)kq;
2273	struct kqrequest *kqr = &kqwl->kqwl_request;
2274
2275	thread_t kqwl_owner = kqwl->kqwl_owner;
2276	thread_t servicer = kqr->kqr_thread;
2277
2278	if (kqwl_owner != THREAD_NULL) {
2279	assert(kdp_is_in_zone(kqwl_owner, "threads"));
2280
2281	waitinfo->owner = thread_tid(kqwl->kqwl_owner);
2282	} else if (servicer != THREAD_NULL) {
2283	assert(kdp_is_in_zone(servicer, "threads"));
2284
2285	waitinfo->owner = thread_tid(servicer);
2286	} else if (kqr->kqr_state & KQR_THREQUESTED) {
2287	waitinfo->owner = STACKSHOT_WAITOWNER_THREQUESTED;
2288	} else {
2289	waitinfo->owner = `0`;
2290	}
2291
2292	waitinfo->context = kqwl->kqwl_dynamicid;
2293	}
2294
2295	static void
2296	filt_wldetach(__assert_only struct knote *kn)
2297	{
2298	assert(knote_get_kq(kn)->kq_state & KQ_WORKLOOP);
2299	if (kn->kn_hook) {
2300	kevent_register_wait_cleanup(kn);
2301	}
2302	}
2303
2304	static int
2305	filt_wlvalidate_kev_flags(struct knote kn, struct* kevent_internal_s *kev,
2306	thread_qos_t *qos_index)
2307	{
2308	int new_commands = kev->fflags & NOTE_WL_COMMANDS_MASK;
2309	int sav_commands = kn->kn_sfflags & NOTE_WL_COMMANDS_MASK;
2310
2311	if ((kev->fflags & NOTE_WL_DISCOVER_OWNER) && (kev->flags & EV_DELETE)) {
2312	return EINVAL;
2313	}
2314	if (kev->fflags & NOTE_WL_UPDATE_QOS) {
2315	if (kev->flags & EV_DELETE) {
2316	return EINVAL;
2317	}
2318	if (sav_commands != NOTE_WL_THREAD_REQUEST) {
2319	return EINVAL;
2320	}
2321	if (!(*qos_index = _pthread_priority_thread_qos(kev->qos))) {
2322	return ERANGE;
2323	}
2324	}
2325
2326	switch (new_commands) {
2327	case NOTE_WL_THREAD_REQUEST:
2328	/ thread requests can only update themselves /
2329	if (sav_commands != NOTE_WL_THREAD_REQUEST)
2330	return EINVAL;
2331	break;
2332
2333	case NOTE_WL_SYNC_WAIT:
2334	if (kev->fflags & NOTE_WL_END_OWNERSHIP)
2335	return EINVAL;
2336	goto sync_checks;
2337
2338	case NOTE_WL_SYNC_WAKE:
2339	sync_checks:
2340	if (!(sav_commands & (NOTE_WL_SYNC_WAIT \| NOTE_WL_SYNC_WAKE)))
2341	return EINVAL;
2342	if ((kev->flags & (EV_ENABLE \| EV_DELETE)) == EV_ENABLE)
2343	return EINVAL;
2344	break;
2345
2346	default:
2347	return EINVAL;
2348	}
2349	return `0`;
2350	}
2351
2352	static int
2353	filt_wltouch(struct knote kn, struct* kevent_internal_s *kev)
2354	{
2355	struct kqworkloop kqwl = (struct* kqworkloop *)knote_get_kq(kn);
2356	thread_qos_t qos_index = THREAD_QOS_UNSPECIFIED;
2357
2358	int error = filt_wlvalidate_kev_flags(kn, kev, &qos_index);
2359	if (error) {
2360	goto out;
2361	}
2362
2363	error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLTOUCH);
2364	filt_wlremember_last_update(kn, kev, error);
2365	if (error) {
2366	goto out;
2367	}
2368
2369	out:
2370	if (error) {
2371	if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
2372	/ If userland wants ESTALE to be hidden, do not activate /
2373	return `0`;
2374	}
2375	kev->flags \|= EV_ERROR;
2376	kev->data = error;
2377	return `0`;
2378	}
2379	int command = kev->fflags & NOTE_WL_COMMANDS_MASK;
2380	if (command == NOTE_WL_SYNC_WAIT && !(kn->kn_sfflags & NOTE_WL_SYNC_WAKE)) {
2381	return kevent_register_wait_prepare(kn, kev);
2382	}
2383	/ Just touching the thread request successfully will fire it /
2384	if (command == NOTE_WL_THREAD_REQUEST) {
2385	if (kev->fflags & NOTE_WL_UPDATE_QOS) {
2386	return FILTER_ACTIVE \| FILTER_UPDATE_REQ_QOS;
2387	}
2388	return FILTER_ACTIVE;
2389	}
2390	return `0`;
2391	}
2392
2393	static bool
2394	filt_wlallow_drop(struct knote kn, struct* kevent_internal_s *kev)
2395	{
2396	struct kqworkloop kqwl = (struct* kqworkloop *)knote_get_kq(kn);
2397
2398	int error = filt_wlvalidate_kev_flags(kn, kev, NULL);
2399	if (error) {
2400	goto out;
2401	}
2402
2403	error = filt_wlupdate(kqwl, kn, kev, `0`, FILT_WLDROP);
2404	filt_wlremember_last_update(kn, kev, error);
2405	if (error) {
2406	goto out;
2407	}
2408
2409	out:
2410	if (error) {
2411	if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
2412	return false;
2413	}
2414	kev->flags \|= EV_ERROR;
2415	kev->data = error;
2416	return false;
2417	}
2418	return true;
2419	}
2420
2421	static int
2422	filt_wlprocess(
2423	struct knote *kn,
2424	__unused struct filt_process_s *data,
2425	struct kevent_internal_s *kev)
2426	{
2427	struct kqworkloop kqwl = (struct* kqworkloop *)knote_get_kq(kn);
2428	int rc = `0`;
2429
2430	assert(kn->kn_sfflags & NOTE_WL_THREAD_REQUEST);
2431
2432	filt_wllock(kqwl);
2433
2434	if (kqwl->kqwl_owner) {
2435	/*
2436	* <rdar://problem/33584321> userspace sometimes due to events being
2437	* delivered but not triggering a drain session can cause a process
2438	* of the thread request knote.
2439	*
2440	* When that happens, the automatic deactivation due to process
2441	* would swallow the event, so we have to activate the knote again.
2442	*/
2443	kqlock(kqwl);
2444	knote_activate(kn);
2445	kqunlock(kqwl);
2446	} else {
2447	#if DEBUG \|\| DEVELOPMENT
2448	if (kevent_debug_flags() & KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS) {
2449	/*
2450	* see src/queue_internal.h in libdispatch
2451	*/
2452	#define DISPATCH_QUEUE_ENQUEUED 0x1ull
2453	user_addr_t addr = CAST_USER_ADDR_T(kn->kn_ext[EV_EXTIDX_WL_ADDR]);
2454	task_t t = current_task();
2455	uint64_t val;
2456	if (addr && task_is_active(t) && !task_is_halting(t) &&
2457	copyin_word(addr, &val, sizeof(val)) == `0` &&
2458	val && (val & DISPATCH_QUEUE_ENQUEUED) == `0` &&
2459	(val >> `48`) != `0xdead` && (val >> `48`) != `0` && (val >> `48`) != `0xffff`) {
2460	panic("kevent: workloop %#016llx is not enqueued "
2461	"(kn:%p dq_state:%#016llx kev.dq_state:%#016llx)",
2462	kn->kn_udata, kn, val, kn->kn_ext[EV_EXTIDX_WL_VALUE]);
2463	}
2464	}
2465	#endif
2466	*kev = kn->kn_kevent;
2467	kev->fflags = kn->kn_sfflags;
2468	kev->data = kn->kn_sdata;
2469	kev->qos = kn->kn_qos;
2470	rc \|= FILTER_ACTIVE;
2471	}
2472
2473	filt_wlunlock(kqwl);
2474
2475	if (rc & FILTER_ACTIVE) {
2476	workq_thread_set_max_qos(kqwl->kqwl_p, &kqwl->kqwl_request);
2477	}
2478	return rc;
2479	}
2480
2481	SECURITY_READ_ONLY_EARLY(static struct filterops) workloop_filtops = {
2482	.f_extended_codes = true,
2483	.f_attach = filt_wlattach,
2484	.f_detach = filt_wldetach,
2485	.f_event = filt_badevent,
2486	.f_touch = filt_wltouch,
2487	.f_process = filt_wlprocess,
2488	.f_allow_drop = filt_wlallow_drop,
2489	.f_post_register_wait = filt_wlpost_register_wait,
2490	};
2491
2492	#pragma mark kevent / knotes
2493
2494	/*
2495	* JMM - placeholder for not-yet-implemented filters
2496	*/
2497	static int
2498	filt_badevent(struct knote kn, long* hint)
2499	{
2500	panic("%s[%d](%p, %ld)", __func__, kn->kn_filter, kn, hint);
2501	return `0`;
2502	}
2503
2504	static int
2505	filt_badattach(__unused struct knote kn, __unused struct* kevent_internal_s *kev)
2506	{
2507	knote_set_error(kn, ENOTSUP);
2508	return `0`;
2509	}
2510
2511	struct kqueue *
2512	kqueue_alloc(struct proc p, unsigned* int flags)
2513	{
2514	struct filedesc *fdp = p->p_fd;
2515	struct kqueue *kq = NULL;
2516	int policy;
2517	void *hook = NULL;
2518
2519	if (flags & KEVENT_FLAG_WORKQ) {
2520	struct kqworkq *kqwq;
2521	int i;
2522
2523	kqwq = (struct kqworkq *)zalloc(kqworkq_zone);
2524	if (kqwq == NULL)
2525	return NULL;
2526
2527	kq = &kqwq->kqwq_kqueue;
2528	bzero(kqwq, sizeof (struct kqworkq));
2529
2530	kqwq->kqwq_state = KQ_WORKQ;
2531
2532	for (i = `0`; i < KQWQ_NBUCKETS; i++) {
2533	TAILQ_INIT(&kqwq->kqwq_queue[i]);
2534	}
2535	for (i = `0`; i < KQWQ_NBUCKETS; i++) {
2536	if (i != KQWQ_QOS_MANAGER) {
2537	/*
2538	* Because of how the bucketized system works, we mix overcommit
2539	* sources with not overcommit: each time we move a knote from
2540	* one bucket to the next due to overrides, we'd had to track
2541	* overcommitness, and it's really not worth it in the workloop
2542	* enabled world that track this faithfully.
2543	*
2544	* Incidentally, this behaves like the original manager-based
2545	* kqwq where event delivery always happened (hence is
2546	* "overcommit")
2547	*/
2548	kqwq->kqwq_request[i].kqr_state \|= KQR_THOVERCOMMIT;
2549	}
2550	kqwq->kqwq_request[i].kqr_qos_index = i;
2551	TAILQ_INIT(&kqwq->kqwq_request[i].kqr_suppressed);
2552	}
2553
2554	policy = SYNC_POLICY_FIFO;
2555	hook = (void *)kqwq;
2556	} else if (flags & KEVENT_FLAG_WORKLOOP) {
2557	struct kqworkloop *kqwl;
2558	int i;
2559
2560	kqwl = (struct kqworkloop *)zalloc(kqworkloop_zone);
2561	if (kqwl == NULL)
2562	return NULL;
2563
2564	bzero(kqwl, sizeof (struct kqworkloop));
2565
2566	kqwl->kqwl_state = KQ_WORKLOOP \| KQ_DYNAMIC;
2567	kqwl->kqwl_retains = `1`; / donate a retain to creator /
2568	kqwl->kqwl_request.kqr_state = KQR_WORKLOOP;
2569
2570	kq = &kqwl->kqwl_kqueue;
2571	for (i = `0`; i < KQWL_NBUCKETS; i++) {
2572	TAILQ_INIT(&kqwl->kqwl_queue[i]);
2573	}
2574	TAILQ_INIT(&kqwl->kqwl_request.kqr_suppressed);
2575
2576	lck_mtx_init(&kqwl->kqwl_statelock, kq_lck_grp, kq_lck_attr);
2577
2578	policy = SYNC_POLICY_FIFO;
2579	hook = (void *)kqwl;
2580	} else {
2581	struct kqfile *kqf;
2582
2583	kqf = (struct kqfile *)zalloc(kqfile_zone);
2584	if (kqf == NULL)
2585	return NULL;
2586
2587	kq = &kqf->kqf_kqueue;
2588	bzero(kqf, sizeof (struct kqfile));
2589	TAILQ_INIT(&kqf->kqf_queue);
2590	TAILQ_INIT(&kqf->kqf_suppressed);
2591
2592	policy = SYNC_POLICY_FIFO \| SYNC_POLICY_PREPOST;
2593	}
2594
2595	waitq_set_init(&kq->kq_wqs, policy, NULL, hook);
2596	lck_spin_init(&kq->kq_lock, kq_lck_grp, kq_lck_attr);
2597	lck_spin_init(&kq->kq_reqlock, kq_lck_grp, kq_lck_attr);
2598	kq->kq_p = p;
2599
2600	if (fdp->fd_knlistsize < `0`) {
2601	proc_fdlock(p);
2602	if (fdp->fd_knlistsize < `0`)
2603	fdp->fd_knlistsize = `0`; / this process has had a kq /
2604	proc_fdunlock(p);
2605	}
2606
2607	return (kq);
2608	}
2609
2610	/*
2611	* knotes_dealloc - detach all knotes for the process and drop them
2612	*
2613	* Called with proc_fdlock held.
2614	* Returns with it locked.
2615	* May drop it temporarily.
2616	* Process is in such a state that it will not try to allocate
2617	* any more knotes during this process (stopped for exit or exec).
2618	*/
2619	void
2620	knotes_dealloc(proc_t p)
2621	{
2622	struct filedesc *fdp = p->p_fd;
2623	struct kqueue *kq;
2624	struct knote *kn;
2625	struct klist *kn_hash = NULL;
2626	int i;
2627
2628	/ Close all the fd-indexed knotes up front /
2629	if (fdp->fd_knlistsize > `0`) {
2630	for (i = `0`; i < fdp->fd_knlistsize; i++) {
2631	while ((kn = SLIST_FIRST(&fdp->fd_knlist[i])) != NULL) {
2632	kq = knote_get_kq(kn);
2633	kqlock(kq);
2634	proc_fdunlock(p);
2635	knote_drop(kq, kn, NULL);
2636	proc_fdlock(p);
2637	}
2638	}
2639	/ free the table /
2640	FREE(fdp->fd_knlist, M_KQUEUE);
2641	fdp->fd_knlist = NULL;
2642	}
2643	fdp->fd_knlistsize = -`1`;
2644
2645	knhash_lock(p);
2646	proc_fdunlock(p);
2647
2648	/ Clean out all the hashed knotes as well /
2649	if (fdp->fd_knhashmask != `0`) {
2650	for (i = `0`; i <= (int)fdp->fd_knhashmask; i++) {
2651	while ((kn = SLIST_FIRST(&fdp->fd_knhash[i])) != NULL) {
2652	kq = knote_get_kq(kn);
2653	kqlock(kq);
2654	knhash_unlock(p);
2655	knote_drop(kq, kn, NULL);
2656	knhash_lock(p);
2657	}
2658	}
2659	kn_hash = fdp->fd_knhash;
2660	fdp->fd_knhashmask = `0`;
2661	fdp->fd_knhash = NULL;
2662	}
2663
2664	knhash_unlock(p);
2665
2666	/ free the kn_hash table /
2667	if (kn_hash)
2668	FREE(kn_hash, M_KQUEUE);
2669
2670	proc_fdlock(p);
2671	}
2672
2673	/*
2674	* kqworkloop_invalidate
2675	*
2676	* Invalidate ownership of a workloop.
2677	*
2678	* This is meant to be used so that any remnant of overrides and ownership
2679	* information is dropped before a kqworkloop can no longer be found in the
2680	* global hash table and have ghost workloop ownership left over.
2681	*
2682	* Possibly returns a thread to deallocate in a safe context.
2683	*/
2684	static thread_t
2685	kqworkloop_invalidate(struct kqworkloop *kqwl)
2686	{
2687	thread_t cur_owner = kqwl->kqwl_owner;
2688
2689	assert(TAILQ_EMPTY(&kqwl->kqwl_request.kqr_suppressed));
2690	if (cur_owner) {
2691	/*
2692	* If the kqueue had an owner that prevented the thread request to
2693	* go through, then no unbind happened, and we may have lingering
2694	* overrides to drop.
2695	*/
2696	if (kqworkloop_owner_override(kqwl) != THREAD_QOS_UNSPECIFIED) {
2697	thread_drop_ipc_override(cur_owner);
2698	}
2699	thread_ends_owning_workloop(cur_owner);
2700	kqwl->kqwl_owner = THREAD_NULL;
2701	}
2702
2703	return cur_owner;
2704	}
2705
2706	/*
2707	* kqueue_dealloc - detach all knotes from a kqueue and free it
2708	*
2709	* We walk each list looking for knotes referencing this
2710	* this kqueue. If we find one, we try to drop it. But
2711	* if we fail to get a drop reference, that will wait
2712	* until it is dropped. So, we can just restart again
2713	* safe in the assumption that the list will eventually
2714	* not contain any more references to this kqueue (either
2715	* we dropped them all, or someone else did).
2716	*
2717	* Assumes no new events are being added to the kqueue.
2718	* Nothing locked on entry or exit.
2719	*
2720	* Workloop kqueues cant get here unless all the knotes
2721	* are already gone and all requested threads have come
2722	* and gone (cancelled or arrived).
2723	*/
2724	void
2725	kqueue_dealloc(struct kqueue *kq)
2726	{
2727	struct proc *p;
2728	struct filedesc *fdp;
2729	struct knote *kn;
2730	int i;
2731
2732	if (kq == NULL)
2733	return;
2734
2735	p = kq->kq_p;
2736	fdp = p->p_fd;
2737
2738	/*
2739	* Workloops are refcounted by their knotes, so there's no point
2740	* spending a lot of time under these locks just to deallocate one.
2741	*/
2742	if ((kq->kq_state & KQ_WORKLOOP) == `0`) {
2743	KNOTE_LOCK_CTX(knlc);
2744
2745	proc_fdlock(p);
2746	for (i = `0`; i < fdp->fd_knlistsize; i++) {
2747	kn = SLIST_FIRST(&fdp->fd_knlist[i]);
2748	while (kn != NULL) {
2749	if (kq == knote_get_kq(kn)) {
2750	kqlock(kq);
2751	proc_fdunlock(p);
2752	if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
2753	knote_drop(kq, kn, &knlc);
2754	}
2755	proc_fdlock(p);
2756	/ start over at beginning of list /
2757	kn = SLIST_FIRST(&fdp->fd_knlist[i]);
2758	continue;
2759	}
2760	kn = SLIST_NEXT(kn, kn_link);
2761	}
2762	}
2763
2764	knhash_lock(p);
2765	proc_fdunlock(p);
2766
2767	if (fdp->fd_knhashmask != `0`) {
2768	for (i = `0`; i < (int)fdp->fd_knhashmask + `1`; i++) {
2769	kn = SLIST_FIRST(&fdp->fd_knhash[i]);
2770	while (kn != NULL) {
2771	if (kq == knote_get_kq(kn)) {
2772	kqlock(kq);
2773	knhash_unlock(p);
2774	if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
2775	knote_drop(kq, kn, &knlc);
2776	}
2777	knhash_lock(p);
2778	/ start over at beginning of list /
2779	kn = SLIST_FIRST(&fdp->fd_knhash[i]);
2780	continue;
2781	}
2782	kn = SLIST_NEXT(kn, kn_link);
2783	}
2784	}
2785	}
2786	knhash_unlock(p);
2787	}
2788
2789	if (kq->kq_state & KQ_WORKLOOP) {
2790	struct kqworkloop kqwl = (struct* kqworkloop *)kq;
2791	thread_t cur_owner = kqworkloop_invalidate(kqwl);
2792
2793	if (cur_owner) thread_deallocate(cur_owner);
2794
2795	if (kqwl->kqwl_request.kqr_state & KQR_ALLOCATED_TURNSTILE) {
2796	struct turnstile *ts;
2797	turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, &ts);
2798	turnstile_cleanup();
2799	turnstile_deallocate(ts);
2800	} else {
2801	assert(kqwl->kqwl_turnstile == NULL);
2802	}
2803	}
2804
2805	/*
2806	* waitq_set_deinit() remove the KQ's waitq set from
2807	* any select sets to which it may belong.
2808	*/
2809	waitq_set_deinit(&kq->kq_wqs);
2810	lck_spin_destroy(&kq->kq_lock, kq_lck_grp);
2811	lck_spin_destroy(&kq->kq_reqlock, kq_lck_grp);
2812
2813	if (kq->kq_state & KQ_WORKQ) {
2814	zfree(kqworkq_zone, (struct kqworkq *)kq);
2815	} else if (kq->kq_state & KQ_WORKLOOP) {
2816	struct kqworkloop kqwl = (struct* kqworkloop *)kq;
2817
2818	assert(kqwl->kqwl_retains == `0`);
2819	lck_mtx_destroy(&kqwl->kqwl_statelock, kq_lck_grp);
2820	zfree(kqworkloop_zone, kqwl);
2821	} else {
2822	zfree(kqfile_zone, (struct kqfile *)kq);
2823	}
2824	}
2825
2826	static inline void
2827	kqueue_retain(struct kqueue *kq)
2828	{
2829	struct kqworkloop kqwl = (struct* kqworkloop *)kq;
2830	uint32_t previous;
2831
2832	if ((kq->kq_state & KQ_DYNAMIC) == `0`)
2833	return;
2834
2835	previous = OSIncrementAtomic(&kqwl->kqwl_retains);
2836	if (previous == KQ_WORKLOOP_RETAINS_MAX)
2837	panic("kq(%p) retain overflow", kq);
2838
2839	if (previous == `0`)
2840	panic("kq(%p) resurrection", kq);
2841	}
2842
2843	#define KQUEUE_CANT_BE_LAST_REF 0
2844	#define KQUEUE_MIGHT_BE_LAST_REF 1
2845
2846	static inline int
2847	kqueue_release(kqueue_t kqu, __assert_only int possibly_last)
2848	{
2849	if ((kqu.kq->kq_state & KQ_DYNAMIC) == `0`) {
2850	return `0`;
2851	}
2852
2853	assert(kqu.kq->kq_state & KQ_WORKLOOP); / for now /
2854	uint32_t refs = OSDecrementAtomic(&kqu.kqwl->kqwl_retains);
2855	if (__improbable(refs == `0`)) {
2856	panic("kq(%p) over-release", kqu.kq);
2857	}
2858	if (refs == `1`) {
2859	assert(possibly_last);
2860	}
2861	return refs == `1`;
2862	}
2863
2864	int
2865	kqueue_body(struct proc p, fp_allocfn_t fp_zalloc, void* cra, int32_t retval)
2866	{
2867	struct kqueue *kq;
2868	struct fileproc *fp;
2869	int fd, error;
2870
2871	error = falloc_withalloc(p,
2872	&fp, &fd, vfs_context_current(), fp_zalloc, cra);
2873	if (error) {
2874	return (error);
2875	}
2876
2877	kq = kqueue_alloc(p, `0`);
2878	if (kq == NULL) {
2879	fp_free(p, fd, fp);
2880	return (ENOMEM);
2881	}
2882
2883	fp->f_flag = FREAD \| FWRITE;
2884	fp->f_ops = &kqueueops;
2885	fp->f_data = kq;
2886
2887	proc_fdlock(p);
2888	*fdflags(p, fd) \|= UF_EXCLOSE;
2889	procfdtbl_releasefd(p, fd, NULL);
2890	fp_drop(p, fd, fp, `1`);
2891	proc_fdunlock(p);
2892
2893	*retval = fd;
2894	return (error);
2895	}
2896
2897	int
2898	kqueue(struct proc p, __unused struct* kqueue_args uap, int32_t retval)
2899	{
2900	return (kqueue_body(p, fileproc_alloc_init, NULL, retval));
2901	}
2902
2903	static int
2904	kevent_copyin(user_addr_t addrp, struct* kevent_internal_s kevp, struct* proc *p,
2905	unsigned int flags)
2906	{
2907	int advance;
2908	int error;
2909
2910	if (flags & KEVENT_FLAG_LEGACY32) {
2911	bzero(kevp, sizeof (*kevp));
2912
2913	if (IS_64BIT_PROCESS(p)) {
2914	struct user64_kevent kev64;
2915
2916	advance = sizeof (kev64);
2917	error = copyin(*addrp, (caddr_t)&kev64, advance);
2918	if (error)
2919	return (error);
2920	kevp->ident = kev64.ident;
2921	kevp->filter = kev64.filter;
2922	kevp->flags = kev64.flags;
2923	kevp->udata = kev64.udata;
2924	kevp->fflags = kev64.fflags;
2925	kevp->data = kev64.data;
2926	} else {
2927	struct user32_kevent kev32;
2928
2929	advance = sizeof (kev32);
2930	error = copyin(*addrp, (caddr_t)&kev32, advance);
2931	if (error)
2932	return (error);
2933	kevp->ident = (uintptr_t)kev32.ident;
2934	kevp->filter = kev32.filter;
2935	kevp->flags = kev32.flags;
2936	kevp->udata = CAST_USER_ADDR_T(kev32.udata);
2937	kevp->fflags = kev32.fflags;
2938	kevp->data = (intptr_t)kev32.data;
2939	}
2940	} else if (flags & KEVENT_FLAG_LEGACY64) {
2941	struct kevent64_s kev64;
2942
2943	bzero(kevp, sizeof (*kevp));
2944
2945	advance = sizeof (struct kevent64_s);
2946	error = copyin(*addrp, (caddr_t)&kev64, advance);
2947	if (error)
2948	return(error);
2949	kevp->ident = kev64.ident;
2950	kevp->filter = kev64.filter;
2951	kevp->flags = kev64.flags;
2952	kevp->udata = kev64.udata;
2953	kevp->fflags = kev64.fflags;
2954	kevp->data = kev64.data;
2955	kevp->ext[`0`] = kev64.ext[`0`];
2956	kevp->ext[`1`] = kev64.ext[`1`];
2957
2958	} else {
2959	struct kevent_qos_s kevqos;
2960
2961	bzero(kevp, sizeof (*kevp));
2962
2963	advance = sizeof (struct kevent_qos_s);
2964	error = copyin(*addrp, (caddr_t)&kevqos, advance);
2965	if (error)
2966	return error;
2967	kevp->ident = kevqos.ident;
2968	kevp->filter = kevqos.filter;
2969	kevp->flags = kevqos.flags;
2970	kevp->qos = kevqos.qos;
2971	// kevp->xflags = kevqos.xflags;
2972	kevp->udata = kevqos.udata;
2973	kevp->fflags = kevqos.fflags;
2974	kevp->data = kevqos.data;
2975	kevp->ext[`0`] = kevqos.ext[`0`];
2976	kevp->ext[`1`] = kevqos.ext[`1`];
2977	kevp->ext[`2`] = kevqos.ext[`2`];
2978	kevp->ext[`3`] = kevqos.ext[`3`];
2979	}
2980	if (!error)
2981	*addrp += advance;
2982	return (error);
2983	}
2984
2985	static int
2986	kevent_copyout(struct kevent_internal_s kevp, user_addr_t addrp, struct proc *p,
2987	unsigned int flags)
2988	{
2989	user_addr_t addr = *addrp;
2990	int advance;
2991	int error;
2992
2993	/*
2994	* fully initialize the differnt output event structure
2995	* types from the internal kevent (and some universal
2996	* defaults for fields not represented in the internal
2997	* form).
2998	*/
2999	if (flags & KEVENT_FLAG_LEGACY32) {
3000	assert((flags & KEVENT_FLAG_STACK_EVENTS) == `0`);
3001
3002	if (IS_64BIT_PROCESS(p)) {
3003	struct user64_kevent kev64;
3004
3005	advance = sizeof (kev64);
3006	bzero(&kev64, advance);
3007
3008	/*
3009	* deal with the special case of a user-supplied
3010	* value of (uintptr_t)-1.
3011	*/
3012	kev64.ident = (kevp->ident == (uintptr_t)-`1`) ?
3013	(uint64_t)-`1LL` : (uint64_t)kevp->ident;
3014
3015	kev64.filter = kevp->filter;
3016	kev64.flags = kevp->flags;
3017	kev64.fflags = kevp->fflags;
3018	kev64.data = (int64_t) kevp->data;
3019	kev64.udata = kevp->udata;
3020	error = copyout((caddr_t)&kev64, addr, advance);
3021	} else {
3022	struct user32_kevent kev32;
3023
3024	advance = sizeof (kev32);
3025	bzero(&kev32, advance);
3026	kev32.ident = (uint32_t)kevp->ident;
3027	kev32.filter = kevp->filter;
3028	kev32.flags = kevp->flags;
3029	kev32.fflags = kevp->fflags;
3030	kev32.data = (int32_t)kevp->data;
3031	kev32.udata = kevp->udata;
3032	error = copyout((caddr_t)&kev32, addr, advance);
3033	}
3034	} else if (flags & KEVENT_FLAG_LEGACY64) {
3035	struct kevent64_s kev64;
3036
3037	advance = sizeof (struct kevent64_s);
3038	if (flags & KEVENT_FLAG_STACK_EVENTS) {
3039	addr -= advance;
3040	}
3041	bzero(&kev64, advance);
3042	kev64.ident = kevp->ident;
3043	kev64.filter = kevp->filter;
3044	kev64.flags = kevp->flags;
3045	kev64.fflags = kevp->fflags;
3046	kev64.data = (int64_t) kevp->data;
3047	kev64.udata = kevp->udata;
3048	kev64.ext[`0`] = kevp->ext[`0`];
3049	kev64.ext[`1`] = kevp->ext[`1`];
3050	error = copyout((caddr_t)&kev64, addr, advance);
3051	} else {
3052	struct kevent_qos_s kevqos;
3053
3054	advance = sizeof (struct kevent_qos_s);
3055	if (flags & KEVENT_FLAG_STACK_EVENTS) {
3056	addr -= advance;
3057	}
3058	bzero(&kevqos, advance);
3059	kevqos.ident = kevp->ident;
3060	kevqos.filter = kevp->filter;
3061	kevqos.flags = kevp->flags;
3062	kevqos.qos = kevp->qos;
3063	kevqos.udata = kevp->udata;
3064	kevqos.fflags = kevp->fflags;
3065	kevqos.xflags = `0`;
3066	kevqos.data = (int64_t) kevp->data;
3067	kevqos.ext[`0`] = kevp->ext[`0`];
3068	kevqos.ext[`1`] = kevp->ext[`1`];
3069	kevqos.ext[`2`] = kevp->ext[`2`];
3070	kevqos.ext[`3`] = kevp->ext[`3`];
3071	error = copyout((caddr_t)&kevqos, addr, advance);
3072	}
3073	if (!error) {
3074	if (flags & KEVENT_FLAG_STACK_EVENTS)
3075	*addrp = addr;
3076	else
3077	*addrp = addr + advance;
3078	}
3079	return (error);
3080	}
3081
3082	static int
3083	kevent_get_data_size(
3084	struct proc *p,
3085	uint64_t data_available,
3086	unsigned int flags,
3087	user_size_t *residp)
3088	{
3089	user_size_t resid;
3090	int error = `0`;
3091
3092	if (data_available != USER_ADDR_NULL) {
3093	if (flags & KEVENT_FLAG_KERNEL) {
3094	resid = (user_size_t )(uintptr_t)data_available;
3095	} else if (IS_64BIT_PROCESS(p)) {
3096	user64_size_t usize;
3097	error = copyin((user_addr_t)data_available, &usize, sizeof(usize));
3098	resid = (user_size_t)usize;
3099	} else {
3100	user32_size_t usize;
3101	error = copyin((user_addr_t)data_available, &usize, sizeof(usize));
3102	resid = (user_size_t)usize;
3103	}
3104	if (error)
3105	return(error);
3106	} else {
3107	resid = `0`;
3108	}
3109	*residp = resid;
3110	return `0`;
3111	}
3112
3113	static int
3114	kevent_put_data_size(
3115	struct proc *p,
3116	uint64_t data_available,
3117	unsigned int flags,
3118	user_size_t resid)
3119	{
3120	int error = `0`;
3121
3122	if (data_available) {
3123	if (flags & KEVENT_FLAG_KERNEL) {
3124	(user_size_t )(uintptr_t)data_available = resid;
3125	} else if (IS_64BIT_PROCESS(p)) {
3126	user64_size_t usize = (user64_size_t)resid;
3127	error = copyout(&usize, (user_addr_t)data_available, sizeof(usize));
3128	} else {
3129	user32_size_t usize = (user32_size_t)resid;
3130	error = copyout(&usize, (user_addr_t)data_available, sizeof(usize));
3131	}
3132	}
3133	return error;
3134	}
3135
3136	/*
3137	* kevent_continue - continue a kevent syscall after blocking
3138	*
3139	* assume we inherit a use count on the kq fileglob.
3140	*/
3141	__attribute__((noreturn))
3142	static void
3143	kevent_continue(__unused struct kqueue kq, void* data, int* error)
3144	{
3145	struct _kevent *cont_args;
3146	struct fileproc *fp;
3147	uint64_t data_available;
3148	user_size_t data_size;
3149	user_size_t data_resid;
3150	unsigned int flags;
3151	int32_t *retval;
3152	int noutputs;
3153	int fd;
3154	struct proc *p = current_proc();
3155
3156	cont_args = (struct _kevent *)data;
3157	data_available = cont_args->data_available;
3158	flags = cont_args->process_data.fp_flags;
3159	data_size = cont_args->process_data.fp_data_size;
3160	data_resid = cont_args->process_data.fp_data_resid;
3161	noutputs = cont_args->eventout;
3162	retval = cont_args->retval;
3163	fd = cont_args->fd;
3164	fp = cont_args->fp;
3165
3166	kevent_put_kq(p, fd, fp, kq);
3167
3168	/ don't abandon other output just because of residual copyout failures /
3169	if (error == `0` && data_available && data_resid != data_size) {
3170	(void)kevent_put_data_size(p, data_available, flags, data_resid);
3171	}
3172
3173	/ don't restart after signals... /
3174	if (error == ERESTART)
3175	error = EINTR;
3176	else if (error == EWOULDBLOCK)
3177	error = `0`;
3178	if (error == `0`)
3179	*retval = noutputs;
3180	unix_syscall_return(error);
3181	}
3182
3183	/*
3184	* kevent - [syscall] register and wait for kernel events
3185	*
3186	*/
3187	int
3188	kevent(struct proc p, struct* kevent_args uap, int32_t retval)
3189	{
3190	unsigned int flags = KEVENT_FLAG_LEGACY32;
3191
3192	return kevent_internal(p,
3193	(kqueue_id_t)uap->fd, NULL,
3194	uap->changelist, uap->nchanges,
3195	uap->eventlist, uap->nevents,
3196	`0ULL`, `0ULL`,
3197	flags,
3198	uap->timeout,
3199	kevent_continue,
3200	retval);
3201	}
3202
3203	int
3204	kevent64(struct proc p, struct* kevent64_args uap, int32_t retval)
3205	{
3206	unsigned int flags;
3207
3208	/ restrict to user flags and set legacy64 /
3209	flags = uap->flags & KEVENT_FLAG_USER;
3210	flags \|= KEVENT_FLAG_LEGACY64;
3211
3212	return kevent_internal(p,
3213	(kqueue_id_t)uap->fd, NULL,
3214	uap->changelist, uap->nchanges,
3215	uap->eventlist, uap->nevents,
3216	`0ULL`, `0ULL`,
3217	flags,
3218	uap->timeout,
3219	kevent_continue,
3220	retval);
3221	}
3222
3223	int
3224	kevent_qos(struct proc p, struct* kevent_qos_args uap, int32_t retval)
3225	{
3226	/ restrict to user flags /
3227	uap->flags &= KEVENT_FLAG_USER;
3228
3229	return kevent_internal(p,
3230	(kqueue_id_t)uap->fd, NULL,
3231	uap->changelist, uap->nchanges,
3232	uap->eventlist, uap->nevents,
3233	uap->data_out, (uint64_t)uap->data_available,
3234	uap->flags,
3235	`0ULL`,
3236	kevent_continue,
3237	retval);
3238	}
3239
3240	int
3241	kevent_qos_internal(struct proc p, int* fd,
3242	user_addr_t changelist, int nchanges,
3243	user_addr_t eventlist, int nevents,
3244	user_addr_t data_out, user_size_t *data_available,
3245	unsigned int flags,
3246	int32_t *retval)
3247	{
3248	return kevent_internal(p,
3249	(kqueue_id_t)fd, NULL,
3250	changelist, nchanges,
3251	eventlist, nevents,
3252	data_out, (uint64_t)data_available,
3253	(flags \| KEVENT_FLAG_KERNEL),
3254	`0ULL`,
3255	NULL,
3256	retval);
3257	}
3258
3259	int
3260	kevent_id(struct proc p, struct* kevent_id_args uap, int32_t retval)
3261	{
3262	/ restrict to user flags /
3263	uap->flags &= KEVENT_FLAG_USER;
3264
3265	return kevent_internal(p,
3266	(kqueue_id_t)uap->id, NULL,
3267	uap->changelist, uap->nchanges,
3268	uap->eventlist, uap->nevents,
3269	uap->data_out, (uint64_t)uap->data_available,
3270	(uap->flags \| KEVENT_FLAG_DYNAMIC_KQUEUE),
3271	`0ULL`,
3272	kevent_continue,
3273	retval);
3274	}
3275
3276	int
3277	kevent_id_internal(struct proc p, kqueue_id_t id,
3278	user_addr_t changelist, int nchanges,
3279	user_addr_t eventlist, int nevents,
3280	user_addr_t data_out, user_size_t *data_available,
3281	unsigned int flags,
3282	int32_t *retval)
3283	{
3284	return kevent_internal(p,
3285	*id, id,
3286	changelist, nchanges,
3287	eventlist, nevents,
3288	data_out, (uint64_t)data_available,
3289	(flags \| KEVENT_FLAG_KERNEL \| KEVENT_FLAG_DYNAMIC_KQUEUE),
3290	`0ULL`,
3291	NULL,
3292	retval);
3293	}
3294
3295	static int
3296	kevent_get_timeout(struct proc *p,
3297	user_addr_t utimeout,
3298	unsigned int flags,
3299	struct timeval *atvp)
3300	{
3301	struct timeval atv;
3302	int error = `0`;
3303
3304	if (flags & KEVENT_FLAG_IMMEDIATE) {
3305	getmicrouptime(&atv);
3306	} else if (utimeout != USER_ADDR_NULL) {
3307	struct timeval rtv;
3308	if (flags & KEVENT_FLAG_KERNEL) {
3309	struct timespec tsp = (struct* timespec *)utimeout;
3310	TIMESPEC_TO_TIMEVAL(&rtv, tsp);
3311	} else if (IS_64BIT_PROCESS(p)) {
3312	struct user64_timespec ts;
3313	error = copyin(utimeout, &ts, sizeof(ts));
3314	if ((ts.tv_sec & `0xFFFFFFFF00000000ull`) != `0`)
3315	error = EINVAL;
3316	else
3317	TIMESPEC_TO_TIMEVAL(&rtv, &ts);
3318	} else {
3319	struct user32_timespec ts;
3320	error = copyin(utimeout, &ts, sizeof(ts));
3321	TIMESPEC_TO_TIMEVAL(&rtv, &ts);
3322	}
3323	if (error)
3324	return (error);
3325	if (itimerfix(&rtv))
3326	return (EINVAL);
3327	getmicrouptime(&atv);
3328	timevaladd(&atv, &rtv);
3329	} else {
3330	/ wait forever value /
3331	atv.tv_sec = `0`;
3332	atv.tv_usec = `0`;
3333	}
3334	*atvp = atv;
3335	return `0`;
3336	}
3337
3338	static int
3339	kevent_set_kq_mode(struct kqueue kq, unsigned* int flags)
3340	{
3341	/ each kq should only be used for events of one type /
3342	kqlock(kq);
3343	if (kq->kq_state & (KQ_KEV32 \| KQ_KEV64 \| KQ_KEV_QOS)) {
3344	if (flags & KEVENT_FLAG_LEGACY32) {
3345	if ((kq->kq_state & KQ_KEV32) == `0`) {
3346	kqunlock(kq);
3347	return EINVAL;
3348	}
3349	} else if (kq->kq_state & KQ_KEV32) {
3350	kqunlock(kq);
3351	return EINVAL;
3352	}
3353	} else if (flags & KEVENT_FLAG_LEGACY32) {
3354	kq->kq_state \|= KQ_KEV32;
3355	} else if (flags & KEVENT_FLAG_LEGACY64) {
3356	kq->kq_state \|= KQ_KEV64;
3357	} else {
3358	kq->kq_state \|= KQ_KEV_QOS;
3359	}
3360	kqunlock(kq);
3361	return `0`;
3362	}
3363
/*
 * Bucket index for a dynamic kqueue id: fold the id with itself shifted
 * right by 8 and mask to the table size.  Every use of an argument is
 * parenthesised so expression arguments (e.g. "a | b") hash as a whole.
 * Note that 'val' is evaluated twice.
 */
#define KQ_HASH(val, mask)	(((val) ^ ((val) >> 8)) & (mask))
#define CONFIG_KQ_HASHSIZE	CONFIG_KN_HASHSIZE
3366
3367	static inline void
3368	kqhash_lock(proc_t p)
3369	{
3370	lck_mtx_lock_spin_always(&p->p_fd->fd_kqhashlock);
3371	}
3372
3373	static inline void
3374	kqhash_lock_held(__assert_only proc_t p)
3375	{
3376	LCK_MTX_ASSERT(&p->p_fd->fd_kqhashlock, LCK_MTX_ASSERT_OWNED);
3377	}
3378
3379	static inline void
3380	kqhash_unlock(proc_t p)
3381	{
3382	lck_mtx_unlock(&p->p_fd->fd_kqhashlock);
3383	}
3384
3385	static void
3386	kqueue_hash_init_if_needed(proc_t p)
3387	{
3388	struct filedesc *fdp = p->p_fd;
3389
3390	kqhash_lock_held(p);
3391
3392	if (__improbable(fdp->fd_kqhash == NULL)) {
3393	struct kqlist *alloc_hash;
3394	u_long alloc_mask;
3395
3396	kqhash_unlock(p);
3397	alloc_hash = hashinit(CONFIG_KQ_HASHSIZE, M_KQUEUE, &alloc_mask);
3398	kqhash_lock(p);
3399
3400	/ See if we won the race /
3401	if (fdp->fd_kqhashmask == `0`) {
3402	fdp->fd_kqhash = alloc_hash;
3403	fdp->fd_kqhashmask = alloc_mask;
3404	} else {
3405	kqhash_unlock(p);
3406	FREE(alloc_hash, M_KQUEUE);
3407	kqhash_lock(p);
3408	}
3409	}
3410	}
3411
3412	/*
3413	* Called with the kqhash_lock() held
3414	*/
3415	static void
3416	kqueue_hash_insert(
3417	struct proc *p,
3418	kqueue_id_t id,
3419	struct kqueue *kq)
3420	{
3421	struct kqworkloop kqwl = (struct* kqworkloop *)kq;
3422	struct filedesc *fdp = p->p_fd;
3423	struct kqlist *list;
3424
3425	/ should hold the kq hash lock /
3426	kqhash_lock_held(p);
3427
3428	if ((kq->kq_state & KQ_DYNAMIC) == `0`) {
3429	assert(kq->kq_state & KQ_DYNAMIC);
3430	return;
3431	}
3432
3433	/ only dynamically allocate workloop kqs for now /
3434	assert(kq->kq_state & KQ_WORKLOOP);
3435	assert(fdp->fd_kqhash);
3436
3437	kqwl->kqwl_dynamicid = id;
3438
3439	list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
3440	SLIST_INSERT_HEAD(list, kqwl, kqwl_hashlink);
3441	}
3442
3443	/ Called with kqhash_lock held /
3444	static void
3445	kqueue_hash_remove(
3446	struct proc *p,
3447	struct kqueue *kq)
3448	{
3449	struct kqworkloop kqwl = (struct* kqworkloop *)kq;
3450	struct filedesc *fdp = p->p_fd;
3451	struct kqlist *list;
3452
3453	/ should hold the kq hash lock /
3454	kqhash_lock_held(p);
3455
3456	if ((kq->kq_state & KQ_DYNAMIC) == `0`) {
3457	assert(kq->kq_state & KQ_DYNAMIC);
3458	return;
3459	}
3460	assert(kq->kq_state & KQ_WORKLOOP); / for now /
3461	list = &fdp->fd_kqhash[KQ_HASH(kqwl->kqwl_dynamicid, fdp->fd_kqhashmask)];
3462	SLIST_REMOVE(list, kqwl, kqworkloop, kqwl_hashlink);
3463	}
3464
3465	/ Called with kqhash_lock held /
3466	static struct kqueue *
3467	kqueue_hash_lookup(struct proc *p, kqueue_id_t id)
3468	{
3469	struct filedesc *fdp = p->p_fd;
3470	struct kqlist *list;
3471	struct kqworkloop *kqwl;
3472
3473	/ should hold the kq hash lock /
3474	kqhash_lock_held(p);
3475
3476	if (fdp->fd_kqhashmask == `0`) return NULL;
3477
3478	list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
3479	SLIST_FOREACH(kqwl, list, kqwl_hashlink) {
3480	if (kqwl->kqwl_dynamicid == id) {
3481	struct kqueue kq = (struct* kqueue *)kqwl;
3482
3483	assert(kq->kq_state & KQ_DYNAMIC);
3484	assert(kq->kq_state & KQ_WORKLOOP); / for now /
3485	return kq;
3486	}
3487	}
3488	return NULL;
3489	}
3490
3491	static inline void
3492	kqueue_release_last(struct proc *p, kqueue_t kqu)
3493	{
3494	struct kqueue *kq = kqu.kq;
3495	if (kq->kq_state & KQ_DYNAMIC) {
3496	kqhash_lock(p);
3497	if (kqueue_release(kq, KQUEUE_MIGHT_BE_LAST_REF)) {
3498	thread_t cur_owner = kqworkloop_invalidate(kqu.kqwl);
3499	kqueue_hash_remove(p, kq);
3500	kqhash_unlock(p);
3501	if (cur_owner) thread_deallocate(cur_owner);
3502	kqueue_dealloc(kq);
3503	} else {
3504	kqhash_unlock(p);
3505	}
3506	}
3507	}
3508
3509	/*
3510	* kqworkloops_dealloc - rebalance retains on kqworkloops created with
3511	* scheduling parameters
3512	*
3513	* Called with proc_fdlock held.
3514	* Returns with it locked.
3515	* Process is in such a state that it will not try to allocate
3516	* any more knotes during this process (stopped for exit or exec).
3517	*/
3518	void
3519	kqworkloops_dealloc(proc_t p)
3520	{
3521	struct filedesc *fdp = p->p_fd;
3522	struct kqlist *list;
3523	struct kqworkloop kqwl, kqwln;
3524	struct kqlist tofree;
3525	int i;
3526
3527	if (!(fdp->fd_flags & FD_WORKLOOP)) {
3528	return;
3529	}
3530
3531	SLIST_INIT(&tofree);
3532
3533	kqhash_lock(p);
3534	assert(fdp->fd_kqhashmask != `0`);
3535
3536	for (i = `0`; i <= (int)fdp->fd_kqhashmask; i++) {
3537	list = &fdp->fd_kqhash[i];
3538	SLIST_FOREACH_SAFE(kqwl, list, kqwl_hashlink, kqwln) {
3539	/*
3540	* kqworkloops that have scheduling parameters have an
3541	* implicit retain from kqueue_workloop_ctl that needs
3542	* to be balanced on process exit.
3543	*/
3544	assert(kqwl->kqwl_params);
3545	SLIST_REMOVE(list, kqwl, kqworkloop, kqwl_hashlink);
3546	SLIST_INSERT_HEAD(&tofree, kqwl, kqwl_hashlink);
3547	}
3548	}
3549
3550	kqhash_unlock(p);
3551
3552	SLIST_FOREACH_SAFE(kqwl, &tofree, kqwl_hashlink, kqwln) {
3553	struct kqueue kq = (struct* kqueue *)kqwl;
3554	__assert_only bool released;
3555	released = kqueue_release(kq, KQUEUE_MIGHT_BE_LAST_REF);
3556	assert(released);
3557	kqueue_dealloc(kq);
3558	}
3559	}
3560
3561	static struct kqueue *
3562	kevent_get_bound_kqworkloop(thread_t thread)
3563	{
3564	struct uthread *ut = get_bsdthread_info(thread);
3565	struct kqrequest *kqr = ut->uu_kqr_bound;
3566
3567	return kqr ? (struct kqueue *)kqr_kqworkloop(kqr) : NULL;
3568	}
3569
3570	static int
3571	kevent_get_kq(struct proc p, kqueue_id_t id, workq_threadreq_param_t trp,
3572	unsigned int flags, struct fileproc *fpp, int* *fdp,
3573	struct kqueue **kqp)
3574	{
3575	struct filedesc *descp = p->p_fd;
3576	struct fileproc *fp = NULL;
3577	struct kqueue *kq = NULL;
3578	int fd = `0`;
3579	int error = `0`;
3580	thread_t th = current_thread();
3581
3582	assert(!trp \|\| (flags & KEVENT_FLAG_WORKLOOP));
3583
3584	/ Was the workloop flag passed? Then it is for sure only a workloop /
3585	if (flags & KEVENT_FLAG_DYNAMIC_KQUEUE) {
3586	assert(flags & KEVENT_FLAG_WORKLOOP);
3587	assert(!trp \|\| (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST));
3588	kq = kevent_get_bound_kqworkloop(th);
3589
3590	/*
3591	* when kevent_id_internal is called from within the
3592	* kernel, and the passed 'id' value is '-1' then we
3593	* look for the currently bound workloop kq.
3594	*/
3595	if (id == (kqueue_id_t)-`1` &&
3596	(flags & KEVENT_FLAG_KERNEL) &&
3597	(flags & KEVENT_FLAG_WORKLOOP)) {
3598
3599	if (!is_workqueue_thread(th) \|\| !kq) {
3600	return EINVAL;
3601	}
3602
3603	kqueue_retain(kq);
3604	goto out;
3605	}
3606
3607	if (id == `0` \|\| id == (kqueue_id_t)-`1`) {
3608	return EINVAL;
3609	}
3610
3611	/ try shortcut on kq lookup for bound threads /
3612	if (kq != NULL && ((struct kqworkloop *)kq)->kqwl_dynamicid == id) {
3613
3614	if (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST) {
3615	return EEXIST;
3616	}
3617
3618	/ retain a reference while working with this kq. /
3619	assert(kq->kq_state & KQ_DYNAMIC);
3620	kqueue_retain(kq);
3621	goto out;
3622	}
3623
3624	/ look for the kq on the hash table /
3625	kqhash_lock(p);
3626	kq = kqueue_hash_lookup(p, id);
3627	if (kq == NULL) {
3628	kqhash_unlock(p);
3629
3630	if (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST) {
3631	return ENOENT;
3632	}
3633
3634	struct kqueue *alloc_kq;
3635	alloc_kq = kqueue_alloc(p, flags);
3636	if (!alloc_kq) {
3637	return ENOMEM;
3638	}
3639
3640	kqhash_lock(p);
3641	kqueue_hash_init_if_needed(p);
3642	kq = kqueue_hash_lookup(p, id);
3643	if (kq == NULL) {
3644	/ insert our new one /
3645	kq = alloc_kq;
3646	if (trp) {
3647	struct kqworkloop kqwl = (struct* kqworkloop *)kq;
3648	kqwl->kqwl_params = trp->trp_value;
3649	}
3650	kqueue_hash_insert(p, id, kq);
3651	kqhash_unlock(p);
3652	} else if (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST) {
3653	/ lost race and caller wants an error /
3654	kqhash_unlock(p);
3655	kqueue_release(alloc_kq, KQUEUE_MIGHT_BE_LAST_REF);
3656	kqueue_dealloc(alloc_kq);
3657	return EEXIST;
3658	} else {
3659	/ lost race, retain existing workloop /
3660	kqueue_retain(kq);
3661	kqhash_unlock(p);
3662	kqueue_release(alloc_kq, KQUEUE_MIGHT_BE_LAST_REF);
3663	kqueue_dealloc(alloc_kq);
3664	}
3665	} else {
3666
3667	if (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST) {
3668	kqhash_unlock(p);
3669	return EEXIST;
3670	}
3671
3672	/ retain a reference while working with this kq. /
3673	assert(kq->kq_state & KQ_DYNAMIC);
3674	kqueue_retain(kq);
3675	kqhash_unlock(p);
3676	}
3677
3678	} else if (flags & KEVENT_FLAG_WORKQ) {
3679	/ must already exist for bound threads. /
3680	if (flags & KEVENT_FLAG_KERNEL) {
3681	assert(descp->fd_wqkqueue != NULL);
3682	}
3683
3684	/*
3685	* use the private kq associated with the proc workq.
3686	* Just being a thread within the process (and not
3687	* being the exit/exec thread) is enough to hold a
3688	* reference on this special kq.
3689	*/
3690	kq = descp->fd_wqkqueue;
3691	if (kq == NULL) {
3692	struct kqueue *alloc_kq = kqueue_alloc(p, KEVENT_FLAG_WORKQ);
3693	if (alloc_kq == NULL) {
3694	return ENOMEM;
3695	}
3696
3697	knhash_lock(p);
3698	if (descp->fd_wqkqueue == NULL) {
3699	kq = descp->fd_wqkqueue = alloc_kq;
3700	knhash_unlock(p);
3701	} else {
3702	knhash_unlock(p);
3703	kq = descp->fd_wqkqueue;
3704	kqueue_dealloc(alloc_kq);
3705	}
3706	}
3707	} else {
3708	/ get a usecount for the kq itself /
3709	fd = (int)id;
3710	if ((error = fp_getfkq(p, fd, &fp, &kq)) != `0`)
3711	return (error);
3712	}
3713	if ((error = kevent_set_kq_mode(kq, flags)) != `0`) {
3714	/ drop the usecount /
3715	if (fp != NULL)
3716	fp_drop(p, fd, fp, `0`);
3717	return error;
3718	}
3719
3720	out:
3721	*fpp = fp;
3722	*fdp = fd;
3723	*kqp = kq;
3724
3725	return error;
3726	}
3727
3728	static void
3729	kevent_put_kq(
3730	struct proc *p,
3731	kqueue_id_t id,
3732	struct fileproc *fp,
3733	struct kqueue *kq)
3734	{
3735	kqueue_release_last(p, kq);
3736	if (fp != NULL) {
3737	assert((kq->kq_state & KQ_WORKQ) == `0`);
3738	fp_drop(p, (int)id, fp, `0`);
3739	}
3740	}
3741
3742	static uint64_t
3743	kevent_workloop_serial_no_copyin(proc_t p, uint64_t workloop_id)
3744	{
3745	uint64_t serial_no = `0`;
3746	user_addr_t addr;
3747	int rc;
3748
3749	if (workloop_id == `0` \|\| p->p_dispatchqueue_serialno_offset == `0`) {
3750	return `0`;
3751	}
3752	addr = (user_addr_t)(workloop_id + p->p_dispatchqueue_serialno_offset);
3753
3754	if (proc_is64bit(p)) {
3755	rc = copyin(addr, (caddr_t)&serial_no, sizeof(serial_no));
3756	} else {
3757	uint32_t serial_no32 = `0`;
3758	rc = copyin(addr, (caddr_t)&serial_no32, sizeof(serial_no32));
3759	serial_no = serial_no32;
3760	}
3761	return rc == `0` ? serial_no : `0`;
3762	}
3763
3764	int
3765	kevent_exit_on_workloop_ownership_leak(thread_t thread)
3766	{
3767	proc_t p = current_proc();
3768	struct filedesc *fdp = p->p_fd;
3769	kqueue_id_t workloop_id = `0`;
3770	os_reason_t reason = OS_REASON_NULL;
3771	mach_vm_address_t addr;
3772	uint32_t reason_size;
3773
3774	kqhash_lock(p);
3775	if (fdp->fd_kqhashmask > `0`) {
3776	for (uint32_t i = `0`; i < fdp->fd_kqhashmask + `1`; i++) {
3777	struct kqworkloop *kqwl;
3778
3779	SLIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
3780	struct kqueue *kq = &kqwl->kqwl_kqueue;
3781	if ((kq->kq_state & KQ_DYNAMIC) && kqwl->kqwl_owner == thread) {
3782	workloop_id = kqwl->kqwl_dynamicid;
3783	break;
3784	}
3785	}
3786	}
3787	}
3788	kqhash_unlock(p);
3789
3790	reason = os_reason_create(OS_REASON_LIBSYSTEM,
3791	OS_REASON_LIBSYSTEM_CODE_WORKLOOP_OWNERSHIP_LEAK);
3792	if (reason == OS_REASON_NULL) {
3793	goto out;
3794	}
3795
3796	reason->osr_flags \|= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
3797	reason_size = `2` * sizeof(uint64_t);
3798	reason_size = kcdata_estimate_required_buffer_size(`2`, reason_size);
3799	if (os_reason_alloc_buffer(reason, reason_size) != `0`) {
3800	goto out;
3801	}
3802
3803	if (workloop_id) {
3804	struct kcdata_descriptor *kcd = &reason->osr_kcd_descriptor;
3805
3806	if (kcdata_get_memory_addr(kcd, EXIT_REASON_WORKLOOP_ID,
3807	sizeof(workloop_id), &addr) == KERN_SUCCESS) {
3808	kcdata_memcpy(kcd, addr, &workloop_id, sizeof(workloop_id));
3809	}
3810
3811	uint64_t serial_no = kevent_workloop_serial_no_copyin(p, workloop_id);
3812	if (serial_no && kcdata_get_memory_addr(kcd, EXIT_REASON_DISPATCH_QUEUE_NO,
3813	sizeof(serial_no), &addr) == KERN_SUCCESS) {
3814	kcdata_memcpy(kcd, addr, &serial_no, sizeof(serial_no));
3815	}
3816	}
3817	out:
3818	#if DEVELOPMENT \|\| DEBUG
3819	if (kevent_debug_flags() & KEVENT_PANIC_ON_WORKLOOP_OWNERSHIP_LEAK) {
3820	panic("thread %p in task %p is leaked workloop 0x%016llx ownership",
3821	thread, p->task, workloop_id);
3822	}
3823	psignal_try_thread_with_reason(p, thread, SIGABRT, reason);
3824	return `0`;
3825	#else
3826	return exit_with_reason(p, W_EXITCODE(`0`, SIGKILL), (int *)NULL,
3827	FALSE, FALSE, `0`, reason);
3828	#endif
3829	}
3830
3831	static inline boolean_t
3832	kevent_args_requesting_events(unsigned int flags, int nevents)
3833	{
3834	return (!(flags & KEVENT_FLAG_ERROR_EVENTS) && nevents > `0`);
3835	}
3836
3837	static int
3838	kevent_internal(struct proc *p,
3839	kqueue_id_t id, kqueue_id_t *id_out,
3840	user_addr_t changelist, int nchanges,
3841	user_addr_t ueventlist, int nevents,
3842	user_addr_t data_out, uint64_t data_available,
3843	unsigned int flags,
3844	user_addr_t utimeout,
3845	kqueue_continue_t continuation,
3846	int32_t *retval)
3847	{
3848	uthread_t ut;
3849	struct kqueue *kq;
3850	struct fileproc *fp = NULL;
3851	int fd = `0`;
3852	struct kevent_internal_s kev;
3853	int error, noutputs, register_rc;
3854	bool needs_end_processing = false;
3855	struct timeval atv;
3856	user_size_t data_size;
3857	user_size_t data_resid;
3858	thread_t thread = current_thread();
3859	KNOTE_LOCK_CTX(knlc);
3860
3861	/ Don't allow user-space threads to process output events from the workq kqs /
3862	if (((flags & (KEVENT_FLAG_WORKQ \| KEVENT_FLAG_KERNEL)) == KEVENT_FLAG_WORKQ) &&
3863	kevent_args_requesting_events(flags, nevents))
3864	return EINVAL;
3865
3866	if (flags & KEVENT_FLAG_PARKING) {
3867	if (!kevent_args_requesting_events(flags, nevents) \|\| id != (kqueue_id_t)-`1`)
3868	return EINVAL;
3869	}
3870
3871	/ restrict dynamic kqueue allocation to workloops (for now) /
3872	if ((flags & (KEVENT_FLAG_DYNAMIC_KQUEUE \| KEVENT_FLAG_WORKLOOP)) == KEVENT_FLAG_DYNAMIC_KQUEUE)
3873	return EINVAL;
3874
3875	if ((flags & (KEVENT_FLAG_WORKLOOP)) && (flags & (KEVENT_FLAG_WORKQ)))
3876	return EINVAL;
3877
3878	if (flags & (KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST \| KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) {
3879
3880	/ allowed only on workloops when calling kevent_id from user-space /
3881	if (!(flags & KEVENT_FLAG_WORKLOOP) \|\| (flags & KEVENT_FLAG_KERNEL) \|\| !(flags & KEVENT_FLAG_DYNAMIC_KQUEUE))
3882	return EINVAL;
3883	}
3884
3885	/ prepare to deal with stack-wise allocation of out events /
3886	if (flags & KEVENT_FLAG_STACK_EVENTS) {
3887	int scale = ((flags & KEVENT_FLAG_LEGACY32) ?
3888	(IS_64BIT_PROCESS(p) ? sizeof(struct user64_kevent) :
3889	sizeof(struct user32_kevent)) :
3890	((flags & KEVENT_FLAG_LEGACY64) ? sizeof(struct kevent64_s) :
3891	sizeof(struct kevent_qos_s)));
3892	ueventlist += nevents * scale;
3893	}
3894
3895	/ convert timeout to absolute - if we have one (and not immediate) /
3896	error = kevent_get_timeout(p, utimeout, flags, &atv);
3897	if (error)
3898	return error;
3899
3900	/ copyin initial value of data residual from data_available /
3901	error = kevent_get_data_size(p, data_available, flags, &data_size);
3902	if (error)
3903	return error;
3904
3905	/ get the kq we are going to be working on /
3906	error = kevent_get_kq(p, id, NULL, flags, &fp, &fd, &kq);
3907	#if CONFIG_WORKLOOP_DEBUG
3908	ut = (uthread_t)get_bsdthread_info(thread);
3909	UU_KEVENT_HISTORY_WRITE_ENTRY(ut, {
3910	.uu_kqid = id,
3911	.uu_kq = error ? NULL : kq,
3912	.uu_error = error,
3913	.uu_nchanges = nchanges,
3914	.uu_nevents = nevents,
3915	.uu_flags = flags,
3916	});
3917	#endif // CONFIG_WORKLOOP_DEBUG
3918	if (error)
3919	return error;
3920
3921	/ only bound threads can receive events on workloops /
3922	if (flags & KEVENT_FLAG_WORKLOOP) {
3923	struct kqworkloop kqwl = (struct* kqworkloop *)kq;
3924	struct kqrequest *kqr = &kqwl->kqwl_request;
3925
3926	assert(kq->kq_state & KQ_WORKLOOP);
3927
3928	if (kevent_args_requesting_events(flags, nevents)) {
3929	if (kq != kevent_get_bound_kqworkloop(thread)) {
3930	error = EXDEV;
3931	goto out;
3932	}
3933
3934	kq_req_lock(kqwl);
3935	/*
3936	* Disable the R2K notification while doing a register, if the
3937	* caller wants events too, we don't want the AST to be set if we
3938	* will process these events soon.
3939	*/
3940	kqr->kqr_state &= ~KQR_R2K_NOTIF_ARMED;
3941	needs_end_processing = true;
3942	kq_req_unlock(kq);
3943	}
3944
3945	if (id_out) {
3946	*id_out = kqwl->kqwl_dynamicid;
3947	}
3948
3949	}
3950
3951	/ register all the change requests the user provided... /
3952	noutputs = `0`;
3953	while (nchanges > `0` && error == `0`) {
3954	error = kevent_copyin(&changelist, &kev, p, flags);
3955	if (error)
3956	break;
3957
3958	/ Make sure user doesn't pass in any system flags /
3959	kev.flags &= ~EV_SYSFLAGS;
3960
3961	register_rc = kevent_register(kq, &kev, &knlc);
3962	if (register_rc & FILTER_REGISTER_WAIT) {
3963	kqlock_held(kq);
3964
3965	// f_post_register_wait is meant to call a continuation and not to
3966	// return, which is why we don't support FILTER_REGISTER_WAIT if
3967	// KEVENT_FLAG_ERROR_EVENTS is not passed, or if the event that
3968	// waits isn't the last.
3969	//
3970	// It is implementable, but not used by any userspace code at the
3971	// moment, so for now return ENOTSUP if someone tries to do it.
3972	if (nchanges == `1` && nevents >= `1` && (flags & KEVENT_FLAG_ERROR_EVENTS)) {
3973	struct _kevent_register *cont_args;
3974	/ store the continuation/completion data in the uthread /
3975	ut = (uthread_t)get_bsdthread_info(thread);
3976	cont_args = &ut->uu_save.uus_kevent_register;
3977	cont_args->kev = kev;
3978	cont_args->kq = kq;
3979	cont_args->fp = fp;
3980	cont_args->fd = fd;
3981	cont_args->ueventlist = ueventlist;
3982	cont_args->flags = flags;
3983	cont_args->retval = retval;
3984	cont_args->eventcount = nevents;
3985	cont_args->eventout = noutputs;
3986	knote_fops(cont_args->knote)->f_post_register_wait(ut, &knlc, cont_args);
3987	panic("f_post_register_wait returned (kev: %p)", &kev);
3988	}
3989
3990	kev.flags \|= EV_ERROR;
3991	kev.data = ENOTSUP;
3992	knote_unlock(kq, knlc.knlc_knote, &knlc, KNOTE_KQ_UNLOCK);
3993	}
3994
3995	// keep in sync with kevent_register_wait_return()
3996	if (nevents > `0` && (kev.flags & (EV_ERROR\|EV_RECEIPT))) {
3997	if ((kev.flags & EV_ERROR) == `0`) {
3998	kev.flags \|= EV_ERROR;
3999	kev.data = `0`;
4000	}
4001	error = kevent_copyout(&kev, &ueventlist, p, flags);
4002	if (error == `0`) {
4003	nevents--;
4004	noutputs++;
4005	}
4006	} else if (kev.flags & EV_ERROR) {
4007	error = kev.data;
4008	}
4009	nchanges--;
4010	}
4011
4012	/ short-circuit the scan if we only want error events /
4013	if (flags & KEVENT_FLAG_ERROR_EVENTS)
4014	nevents = `0`;
4015
4016	/ process pending events /
4017	if (nevents > `0` && noutputs == `0` && error == `0`) {
4018	struct _kevent *cont_args;
4019	/ store the continuation/completion data in the uthread /
4020	ut = (uthread_t)get_bsdthread_info(thread);
4021	cont_args = &ut->uu_save.uus_kevent;
4022	cont_args->fp = fp;
4023	cont_args->fd = fd;
4024	cont_args->retval = retval;
4025	cont_args->eventlist = ueventlist;
4026	cont_args->eventcount = nevents;
4027	cont_args->eventout = noutputs;
4028	cont_args->data_available = data_available;
4029	cont_args->process_data.fp_fd = (int)id;
4030	cont_args->process_data.fp_flags = flags;
4031	cont_args->process_data.fp_data_out = data_out;
4032	cont_args->process_data.fp_data_size = data_size;
4033	cont_args->process_data.fp_data_resid = data_size;
4034
4035	/*
4036	* kqworkloop_end_processing() will happen at the end of kqueue_scan()
4037	*/
4038	needs_end_processing = false;
4039
4040	error = kqueue_scan(kq, kevent_callback,
4041	continuation, cont_args,
4042	&cont_args->process_data,
4043	&atv, p);
4044
4045	/ process remaining outputs /
4046	noutputs = cont_args->eventout;
4047	data_resid = cont_args->process_data.fp_data_resid;
4048
4049	/ copyout residual data size value (if it needs to be copied out) /
4050	/ don't abandon other output just because of residual copyout failures /
4051	if (error == `0` && data_available && data_resid != data_size) {
4052	(void)kevent_put_data_size(p, data_available, flags, data_resid);
4053	}
4054	}
4055
4056	out:
4057	if (__improbable(needs_end_processing)) {
4058	/*
4059	* If we didn't through kqworkloop_end_processing(),
4060	* we need to do it here.
4061	*/
4062	kqlock(kq);
4063	kqworkloop_end_processing((struct kqworkloop *)kq, `0`, `0`);
4064	kqunlock(kq);
4065	}
4066	kevent_put_kq(p, id, fp, kq);
4067
4068	/ don't restart after signals... /
4069	if (error == ERESTART)
4070	error = EINTR;
4071	else if (error == EWOULDBLOCK)
4072	error = `0`;
4073	if (error == `0`)
4074	*retval = noutputs;
4075	return (error);
4076	}
4077
4078
4079	/*
4080	* kevent_callback - callback for each individual event
4081	*
4082	* called with nothing locked
4083	* caller holds a reference on the kqueue
4084	*/
4085	static int
4086	kevent_callback(__unused struct kqueue kq, struct* kevent_internal_s *kevp,
4087	void *data)
4088	{
4089	struct _kevent *cont_args;
4090	int error;
4091
4092	cont_args = (struct _kevent *)data;
4093	assert(cont_args->eventout < cont_args->eventcount);
4094
4095	/*
4096	* Copy out the appropriate amount of event data for this user.
4097	*/
4098	error = kevent_copyout(kevp, &cont_args->eventlist, current_proc(),
4099	cont_args->process_data.fp_flags);
4100
4101	/*
4102	* If there isn't space for additional events, return
4103	* a harmless error to stop the processing here
4104	*/
4105	if (error == `0` && ++cont_args->eventout == cont_args->eventcount)
4106	error = EWOULDBLOCK;
4107	return (error);
4108	}
4109
4110	/*
4111	* kevent_description - format a description of a kevent for diagnostic output
4112	*
4113	* called with a 256-byte string buffer
4114	*/
4115
4116	char *
4117	kevent_description(struct kevent_internal_s kevp, char* *s, size_t n)
4118	{
4119	snprintf(s, n,
4120	"kevent="
4121	"{.ident=%#llx, .filter=%d, .flags=%#x, .udata=%#llx, .fflags=%#x, .data=%#llx, .ext[0]=%#llx, .ext[1]=%#llx}",
4122	kevp->ident,
4123	kevp->filter,
4124	kevp->flags,
4125	kevp->udata,
4126	kevp->fflags,
4127	kevp->data,
4128	kevp->ext[`0`],
4129	kevp->ext[`1`] );
4130
4131	return (s);
4132	}
4133
4134	static int
4135	kevent_register_validate_priority(struct kqueue kq, struct* knote *kn,
4136	struct kevent_internal_s *kev)
4137	{
4138	/ We don't care about the priority of a disabled or deleted knote /
4139	if (kev->flags & (EV_DISABLE \| EV_DELETE)) {
4140	return `0`;
4141	}
4142
4143	if (kq->kq_state & KQ_WORKLOOP) {
4144	/*
4145	* Workloops need valid priorities with a QOS (excluding manager) for
4146	* any enabled knote.
4147	*
4148	* When it is pre-existing, just make sure it has a valid QoS as
4149	* kevent_register() will not use the incoming priority (filters who do
4150	* have the responsibility to validate it again, see filt_wltouch).
4151	*
4152	* If the knote is being made, validate the incoming priority.
4153	*/
4154	if (!_pthread_priority_thread_qos(kn ? kn->kn_qos : kev->qos)) {
4155	return ERANGE;
4156	}
4157	}
4158
4159	return `0`;
4160	}
4161
4162	/*
4163	* Prepare a filter for waiting after register.
4164	*
4165	* The f_post_register_wait hook will be called later by kevent_register()
4166	* and should call kevent_register_wait_block()
4167	*/
4168	static int
4169	kevent_register_wait_prepare(struct knote kn, struct* kevent_internal_s *kev)
4170	{
4171	thread_t thread = current_thread();
4172	struct uthread *uth = get_bsdthread_info(thread);
4173
4174	assert(knote_fops(kn)->f_extended_codes);
4175
4176	if (kn->kn_hook == NULL) {
4177	thread_reference(thread);
4178	kn->kn_hook = thread;
4179	} else if (kn->kn_hook != thread) {
4180	/*
4181	* kn_hook may be set from a previous aborted wait
4182	* However, it has to be from the same thread.
4183	*/
4184	kev->flags \|= EV_ERROR;
4185	kev->data = EXDEV;
4186	return `0`;
4187	}
4188
4189	uth->uu_save.uus_kevent_register.knote = kn;
4190	return FILTER_REGISTER_WAIT;
4191	}
4192
4193	/*
4194	* Cleanup a kevent_register_wait_prepare() effect for threads that have been
4195	* aborted instead of properly woken up with thread_wakeup_thread().
4196	*/
4197	static void
4198	kevent_register_wait_cleanup(struct knote *kn)
4199	{
4200	thread_t thread = kn->kn_hook;
4201	kn->kn_hook = NULL;
4202	thread_deallocate(thread);
4203	}
4204
4205	/*
4206	* Must be called at the end of a f_post_register_wait call from a filter.
4207	*/
4208	static void
4209	kevent_register_wait_block(struct turnstile *ts, thread_t thread,
4210	struct knote_lock_ctx *knlc, thread_continue_t cont,
4211	struct _kevent_register *cont_args)
4212	{
4213	knote_unlock(cont_args->kq, cont_args->knote, knlc, KNOTE_KQ_UNLOCK);
4214	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
4215	cont_args->handoff_thread = thread;
4216	thread_handoff_parameter(thread, cont, cont_args);
4217	}
4218
4219	/*
4220	* Called by Filters using a f_post_register_wait to return from their wait.
4221	*/
4222	static void
4223	kevent_register_wait_return(struct _kevent_register *cont_args)
4224	{
4225	struct kqueue *kq = cont_args->kq;
4226	proc_t p = kq->kq_p;
4227	struct kevent_internal_s *kev = &cont_args->kev;
4228	int error = `0`;
4229
4230	if (cont_args->handoff_thread) {
4231	thread_deallocate(cont_args->handoff_thread);
4232	}
4233
4234	if (kev->flags & (EV_ERROR\|EV_RECEIPT)) {
4235	if ((kev->flags & EV_ERROR) == `0`) {
4236	kev->flags \|= EV_ERROR;
4237	kev->data = `0`;
4238	}
4239	error = kevent_copyout(kev, &cont_args->ueventlist, p, cont_args->flags);
4240	if (error == `0`) cont_args->eventout++;
4241	}
4242
4243	kevent_put_kq(p, cont_args->fd, cont_args->fp, kq);
4244	if (error == `0`) {
4245	*cont_args->retval = cont_args->eventout;
4246	}
4247	unix_syscall_return(error);
4248	}
4249
4250	/*
4251	* kevent_register - add a new event to a kqueue
4252	*
4253	* Creates a mapping between the event source and
4254	* the kqueue via a knote data structure.
4255	*
4256	* Because many/most the event sources are file
4257	* descriptor related, the knote is linked off
4258	* the filedescriptor table for quick access.
4259	*
4260	* called with nothing locked
4261	* caller holds a reference on the kqueue
4262	*/
4263
4264	int
4265	kevent_register(struct kqueue kq, struct* kevent_internal_s *kev,
4266	struct knote_lock_ctx *knlc)
4267	{
4268	struct proc *p = kq->kq_p;
4269	const struct filterops *fops;
4270	struct knote *kn = NULL;
4271	int result = `0`, error = `0`;
4272	unsigned short kev_flags = kev->flags;
4273
4274	if (kev->filter < `0`) {
4275	if (kev->filter + EVFILT_SYSCOUNT < `0`) {
4276	error = EINVAL;
4277	goto out;
4278	}
4279	fops = sysfilt_ops[~kev->filter]; / to 0-base index /
4280	} else {
4281	error = EINVAL;
4282	goto out;
4283	}
4284
4285	/ restrict EV_VANISHED to adding udata-specific dispatch kevents /
4286	if ((kev->flags & EV_VANISHED) &&
4287	(kev->flags & (EV_ADD \| EV_DISPATCH2)) != (EV_ADD \| EV_DISPATCH2)) {
4288	error = EINVAL;
4289	goto out;
4290	}
4291
4292	/ Simplify the flags - delete and disable overrule /
4293	if (kev->flags & EV_DELETE)
4294	kev->flags &= ~EV_ADD;
4295	if (kev->flags & EV_DISABLE)
4296	kev->flags &= ~EV_ENABLE;
4297
4298	if (kq->kq_state & KQ_WORKLOOP) {
4299	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_REGISTER),
4300	((struct kqworkloop *)kq)->kqwl_dynamicid,
4301	kev->udata, kev->flags, kev->filter);
4302	} else if (kq->kq_state & KQ_WORKQ) {
4303	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_REGISTER),
4304	`0`, kev->udata, kev->flags, kev->filter);
4305	} else {
4306	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_REGISTER),
4307	VM_KERNEL_UNSLIDE_OR_PERM(kq),
4308	kev->udata, kev->flags, kev->filter);
4309	}
4310
4311	restart:
4312	/ find the matching knote from the fd tables/hashes /
4313	kn = kq_find_knote_and_kq_lock(kq, kev, fops->f_isfd, p);
4314	error = kevent_register_validate_priority(kq, kn, kev);
4315	result = `0`;
4316	if (error) {
4317	goto out;
4318	}
4319
4320	if (kn == NULL && (kev->flags & EV_ADD) == `0`) {
4321	/*
4322	* No knote found, EV_ADD wasn't specified
4323	*/
4324
4325	if ((kev_flags & EV_ADD) && (kev_flags & EV_DELETE) &&
4326	(kq->kq_state & KQ_WORKLOOP)) {
4327	/*
4328	* For workloops, understand EV_ADD\|EV_DELETE as a "soft" delete
4329	* that doesn't care about ENOENT, so just pretend the deletion
4330	* happened.
4331	*/
4332	} else {
4333	error = ENOENT;
4334	}
4335	goto out;
4336
4337	} else if (kn == NULL) {
4338	/*
4339	* No knote found, need to attach a new one (attach)
4340	*/
4341
4342	struct fileproc *knote_fp = NULL;
4343
4344	/ grab a file reference for the new knote /
4345	if (fops->f_isfd) {
4346	if ((error = fp_lookup(p, kev->ident, &knote_fp, `0`)) != `0`) {
4347	goto out;
4348	}
4349	}
4350
4351	kn = knote_alloc();
4352	if (kn == NULL) {
4353	error = ENOMEM;
4354	if (knote_fp != NULL)
4355	fp_drop(p, kev->ident, knote_fp, `0`);
4356	goto out;
4357	}
4358
4359	kn->kn_fp = knote_fp;
4360	kn->kn_kq_packed = (intptr_t)(struct kqueue *)kq;
4361	kqueue_retain(kq); / retain a kq ref /
4362	kn->kn_filtid = ~kev->filter;
4363	kn->kn_status = KN_ATTACHING \| KN_ATTACHED;
4364
4365	/ was vanish support requested /
4366	if (kev->flags & EV_VANISHED) {
4367	kev->flags &= ~EV_VANISHED;
4368	kn->kn_status \|= KN_REQVANISH;
4369	}
4370
4371	/ snapshot matching/dispatching protcol flags into knote /
4372	if (kev->flags & EV_DISPATCH)
4373	kn->kn_status \|= KN_DISPATCH;
4374	if (kev->flags & EV_UDATA_SPECIFIC)
4375	kn->kn_status \|= KN_UDATA_SPECIFIC;
4376	if (kev->flags & EV_DISABLE)
4377	kn->kn_status \|= KN_DISABLED;
4378
4379	/*
4380	* copy the kevent state into knote
4381	* protocol is that fflags and data
4382	* are saved off, and cleared before
4383	* calling the attach routine.
4384	*/
4385	kn->kn_kevent = *kev;
4386	kn->kn_sfflags = kev->fflags;
4387	kn->kn_sdata = kev->data;
4388	kn->kn_fflags = `0`;
4389	kn->kn_data = `0`;
4390	knote_reset_priority(kn, kev->qos);
4391
4392	/ Add the knote for lookup thru the fd table /
4393	error = kq_add_knote(kq, kn, knlc, p);
4394	if (error) {
4395	(void)kqueue_release(kq, KQUEUE_CANT_BE_LAST_REF);
4396	knote_free(kn);
4397	if (knote_fp != NULL)
4398	fp_drop(p, kev->ident, knote_fp, `0`);
4399
4400	if (error == ERESTART) {
4401	goto restart;
4402	}
4403	goto out;
4404	}
4405
4406	/ fp reference count now applies to knote /
4407
4408	/*
4409	* we can't use filter_call() because f_attach can change the filter ops
4410	* for a filter that supports f_extended_codes, so we need to reload
4411	* knote_fops() and not use `fops`.
4412	*/
4413	result = fops->f_attach(kn, kev);
4414	if (result && !knote_fops(kn)->f_extended_codes) {
4415	result = FILTER_ACTIVE;
4416	}
4417
4418	kqlock(kq);
4419
4420	if (kn->kn_flags & EV_ERROR) {
4421	/*
4422	* Failed to attach correctly, so drop.
4423	*/
4424	kn->kn_status &= ~(KN_ATTACHED \| KN_ATTACHING);
4425	error = kn->kn_data;
4426	knote_drop(kq, kn, knlc);
4427	result = `0`;
4428	goto out;
4429	}
4430
4431	/*
4432	* end "attaching" phase - now just attached
4433	*
4434	* Mark the thread request overcommit, if appropos
4435	*
4436	* If the attach routine indicated that an
4437	* event is already fired, activate the knote.
4438	*/
4439	kn->kn_status &= ~KN_ATTACHING;
4440	knote_set_qos_overcommit(kn);
4441
4442	if (result & FILTER_ACTIVE) {
4443	if (result & FILTER_ADJUST_EVENT_QOS_BIT)
4444	knote_adjust_qos(kq, kn, result);
4445	knote_activate(kn);
4446	}
4447
4448	} else if (!knote_lock(kq, kn, knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
4449
4450	/*
4451	* The knote was dropped while we were waiting for the lock,
4452	* we need to re-evaluate entirely
4453	*/
4454
4455	goto restart;
4456
4457	} else if (kev->flags & EV_DELETE) {
4458	/*
4459	* Deletion of a knote (drop)
4460	*
4461	* If the filter wants to filter drop events, let it do so.
4462	*
4463	* defer-delete: when trying to delete a disabled EV_DISPATCH2 knote,
4464	* we must wait for the knote to be re-enabled (unless it is being
4465	* re-enabled atomically here).
4466	*/
4467
4468	if (knote_fops(kn)->f_allow_drop) {
4469	bool drop;
4470
4471	kqunlock(kq);
4472	drop = knote_fops(kn)->f_allow_drop(kn, kev);
4473	kqlock(kq);
4474
4475	if (!drop) goto out_unlock;
4476	}
4477
4478	if ((kev->flags & EV_ENABLE) == `0` &&
4479	(kn->kn_status & (KN_DISPATCH2 \| KN_DISABLED)) ==
4480	(KN_DISPATCH2 \| KN_DISABLED)) {
4481	kn->kn_status \|= KN_DEFERDELETE;
4482	error = EINPROGRESS;
4483	goto out_unlock;
4484	}
4485
4486	knote_drop(kq, kn, knlc);
4487	goto out;
4488
4489	} else {
4490	/*
4491	* Regular update of a knote (touch)
4492	*
4493	* Call touch routine to notify filter of changes in filter values
4494	* (and to re-determine if any events are fired).
4495	*
4496	* If the knote is in defer-delete, avoid calling the filter touch
4497	* routine (it has delivered its last event already).
4498	*
4499	* If the touch routine had no failure,
4500	* apply the requested side effects to the knote.
4501	*/
4502
4503	if (kn->kn_status & (KN_DEFERDELETE \| KN_VANISHED)) {
4504	if (kev->flags & EV_ENABLE) {
4505	result = FILTER_ACTIVE;
4506	}
4507	} else {
4508	kqunlock(kq);
4509	result = filter_call(knote_fops(kn), f_touch(kn, kev));
4510	kqlock(kq);
4511	}
4512
4513	if (kev->flags & EV_ERROR) {
4514	result = `0`;
4515	} else {
4516	/ accept new kevent state /
4517	if ((kn->kn_status & KN_UDATA_SPECIFIC) == `0`)
4518	kn->kn_udata = kev->udata;
4519	if (kev->flags & EV_DISABLE)
4520	knote_disable(kn);
4521	if (result & (FILTER_UPDATE_REQ_QOS \| FILTER_ADJUST_EVENT_QOS_BIT))
4522	knote_dequeue(kn);
4523	if ((result & FILTER_UPDATE_REQ_QOS) &&
4524	kev->qos && kev->qos != kn->kn_qos) {
4525	knote_reset_priority(kn, kev->qos);
4526	}
4527	if (result & FILTER_ACTIVE) {
4528	thread_qos_t qos;
4529	if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
4530	if (knote_should_apply_qos_override(kq, kn, result, &qos)) {
4531	knote_apply_qos_override(kn, qos);
4532	}
4533	}
4534	knote_activate(kn);
4535	}
4536	if (result & (FILTER_UPDATE_REQ_QOS \| FILTER_ADJUST_EVENT_QOS_BIT)) {
4537	if (knote_enqueue(kn) && (kn->kn_status & KN_ACTIVE)) {
4538	knote_wakeup(kn);
4539	}
4540	}
4541	if (kev->flags & EV_ENABLE)
4542	knote_enable(kn);
4543	}
4544	}
4545
4546	out_unlock:
4547	if ((result & FILTER_REGISTER_WAIT) == `0`) {
4548	/*
4549	* When the filter asked for a post-register wait,
4550	* we leave the knote and kqueue locked for kevent_register()
4551	* to call the filter's f_post_register_wait hook.
4552	*/
4553	knote_unlock(kq, kn, knlc, KNOTE_KQ_UNLOCK);
4554	}
4555
4556	out:
4557	/ output local errors through the kevent /
4558	if (error) {
4559	kev->flags \|= EV_ERROR;
4560	kev->data = error;
4561	}
4562	return result;
4563	}
4564
4565	/*
4566	* knote_process - process a triggered event
4567	*
4568	* Validate that it is really still a triggered event
4569	* by calling the filter routines (if necessary). Hold
4570	* a use reference on the knote to avoid it being detached.
4571	*
4572	* If it is still considered triggered, we will have taken
4573	* a copy of the state under the filter lock. We use that
4574	* snapshot to dispatch the knote for future processing (or
4575	* not, if this was a lost event).
4576	*
4577	* Our caller assures us that nobody else can be processing
4578	* events from this knote during the whole operation. But
4579	* others can be touching or posting events to the knote
4580	* interspersed with our processing it.
4581	*
4582	* caller holds a reference on the kqueue.
4583	* kqueue locked on entry and exit - but may be dropped
4584	*/
4585	static int
4586	knote_process(struct knote *kn,
4587	kevent_callback_t callback,
4588	void *callback_data,
4589	struct filt_process_s *process_data)
4590	{
4591	struct kevent_internal_s kev;
4592	struct kqueue *kq = knote_get_kq(kn);
4593	KNOTE_LOCK_CTX(knlc);
4594	int result = FILTER_ACTIVE;
4595	int error = `0`;
4596	bool drop = false;
4597
4598	bzero(&kev, sizeof(kev));
4599
4600	/*
4601	* Must be active or stayactive
4602	* Must be queued and not disabled/suppressed
4603	*/
4604	assert(kn->kn_status & KN_QUEUED);
4605	assert(kn->kn_status & (KN_ACTIVE\|KN_STAYACTIVE));
4606	assert(!(kn->kn_status & (KN_DISABLED\|KN_SUPPRESSED\|KN_DROPPING)));
4607
4608	if (kq->kq_state & KQ_WORKLOOP) {
4609	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS),
4610	((struct kqworkloop *)kq)->kqwl_dynamicid,
4611	kn->kn_udata, kn->kn_status \| (kn->kn_id << `32`),
4612	kn->kn_filtid);
4613	} else if (kq->kq_state & KQ_WORKQ) {
4614	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS),
4615	`0`, kn->kn_udata, kn->kn_status \| (kn->kn_id << `32`),
4616	kn->kn_filtid);
4617	} else {
4618	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS),
4619	VM_KERNEL_UNSLIDE_OR_PERM(kq), kn->kn_udata,
4620	kn->kn_status \| (kn->kn_id << `32`), kn->kn_filtid);
4621	}
4622
4623	if ((kn->kn_status & KN_DROPPING) \|\|
4624	!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS)) {
4625	/*
4626	* When the knote is dropping or has dropped,
4627	* then there's nothing we want to process.
4628	*/
4629	return EJUSTRETURN;
4630	}
4631
4632	/*
4633	* For deferred-drop or vanished events, we just create a fake
4634	* event to acknowledge end-of-life. Otherwise, we call the
4635	* filter's process routine to snapshot the kevent state under
4636	* the filter's locking protocol.
4637	*
4638	* suppress knotes to avoid returning the same event multiple times in
4639	* a single call.
4640	*/
4641	knote_suppress(kn);
4642
4643	if (kn->kn_status & (KN_DEFERDELETE \| KN_VANISHED)) {
4644	/ create fake event /
4645	kev.filter = kn->kn_filter;
4646	kev.ident = kn->kn_id;
4647	kev.flags = (kn->kn_status & KN_DEFERDELETE) ? EV_DELETE : EV_VANISHED;
4648	kev.flags \|= (EV_DISPATCH2 \| EV_ONESHOT);
4649	kev.udata = kn->kn_udata;
4650	} else {
4651	/ deactivate - so new activations indicate a wakeup /
4652	knote_deactivate(kn);
4653
4654	kqunlock(kq);
4655	result = filter_call(knote_fops(kn), f_process(kn, process_data, &kev));
4656	kqlock(kq);
4657	}
4658
4659	/*
4660	* Determine how to dispatch the knote for future event handling.
4661	* not-fired: just return (do not callout, leave deactivated).
4662	* One-shot: If dispatch2, enter deferred-delete mode (unless this is
4663	* is the deferred delete event delivery itself). Otherwise,
4664	* drop it.
4665	* Dispatch: don't clear state, just mark it disabled.
4666	* Cleared: just leave it deactivated.
4667	* Others: re-activate as there may be more events to handle.
4668	* This will not wake up more handlers right now, but
4669	* at the completion of handling events it may trigger
4670	* more handler threads (TODO: optimize based on more than
4671	* just this one event being detected by the filter).
4672	*/
4673	if ((result & FILTER_ACTIVE) == `0`) {
4674	if ((kn->kn_status & (KN_ACTIVE \| KN_STAYACTIVE)) == `0`) {
4675	/*
4676	* Stay active knotes should not be unsuppressed or we'd create an
4677	* infinite loop.
4678	*
4679	* Some knotes (like EVFILT_WORKLOOP) can be reactivated from
4680	* within f_process() but that doesn't necessarily make them
4681	* ready to process, so we should leave them be.
4682	*
4683	* For other knotes, since we will not return an event,
4684	* there's no point keeping the knote suppressed.
4685	*/
4686	knote_unsuppress(kn);
4687	}
4688	knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
4689	return EJUSTRETURN;
4690	}
4691
4692	if (result & FILTER_ADJUST_EVENT_QOS_BIT)
4693	knote_adjust_qos(kq, kn, result);
4694	kev.qos = _pthread_priority_combine(kn->kn_qos, kn->kn_qos_override);
4695
4696	if (kev.flags & EV_ONESHOT) {
4697	if ((kn->kn_status & (KN_DISPATCH2 \| KN_DEFERDELETE)) == KN_DISPATCH2) {
4698	/ defer dropping non-delete oneshot dispatch2 events /
4699	kn->kn_status \|= KN_DEFERDELETE;
4700	knote_disable(kn);
4701	} else {
4702	drop = true;
4703	}
4704	} else if (kn->kn_status & KN_DISPATCH) {
4705	/ disable all dispatch knotes /
4706	knote_disable(kn);
4707	} else if ((kev.flags & EV_CLEAR) == `0`) {
4708	/ re-activate in case there are more events /
4709	knote_activate(kn);
4710	}
4711
4712	/*
4713	* callback to handle each event as we find it.
4714	* If we have to detach and drop the knote, do
4715	* it while we have the kq unlocked.
4716	*/
4717	if (drop) {
4718	knote_drop(kq, kn, &knlc);
4719	} else {
4720	knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
4721	}
4722
4723	if (kev.flags & EV_VANISHED) {
4724	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KNOTE_VANISHED),
4725	kev.ident, kn->kn_udata, kn->kn_status \| (kn->kn_id << `32`),
4726	kn->kn_filtid);
4727	}
4728
4729	error = (callback)(kq, &kev, callback_data);
4730	kqlock(kq);
4731	return error;
4732	}
4733
4734	/*
4735	* Returns -1 if the kqueue was unbound and processing should not happen
4736	*/
4737	#define KQWQAE_BEGIN_PROCESSING 1
4738	#define KQWQAE_END_PROCESSING 2
4739	#define KQWQAE_UNBIND 3
4740	static int
4741	kqworkq_acknowledge_events(struct kqworkq kqwq, struct* kqrequest *kqr,
4742	int kevent_flags, int kqwqae_op)
4743	{
4744	thread_qos_t old_override = THREAD_QOS_UNSPECIFIED;
4745	thread_t thread = kqr->kqr_thread;
4746	struct knote *kn;
4747	int rc = `0`;
4748	bool seen_stayactive = false, unbind;
4749
4750	kqlock_held(&kqwq->kqwq_kqueue);
4751
4752	if (!TAILQ_EMPTY(&kqr->kqr_suppressed)) {
4753	/*
4754	* Return suppressed knotes to their original state.
4755	* For workq kqueues, suppressed ones that are still
4756	* truly active (not just forced into the queue) will
4757	* set flags we check below to see if anything got
4758	* woken up.
4759	*/
4760	while ((kn = TAILQ_FIRST(&kqr->kqr_suppressed)) != NULL) {
4761	assert(kn->kn_status & KN_SUPPRESSED);
4762	knote_unsuppress(kn);
4763	if (kn->kn_status & KN_STAYACTIVE) {
4764	seen_stayactive = true;
4765	}
4766	}
4767	}
4768
4769	kq_req_lock(kqwq);
4770
4771	#if DEBUG \|\| DEVELOPMENT
4772	thread_t self = current_thread();
4773	struct uthread *ut = get_bsdthread_info(self);
4774
4775	assert(kqr->kqr_state & KQR_THREQUESTED);
4776	assert(kqr->kqr_thread == self);
4777	assert(ut->uu_kqr_bound == kqr);
4778	#endif // DEBUG \|\| DEVELOPMENT
4779
4780	if (kqwqae_op == KQWQAE_UNBIND) {
4781	unbind = true;
4782	} else if ((kevent_flags & KEVENT_FLAG_PARKING) == `0`) {
4783	unbind = false;
4784	} else if (kqwqae_op == KQWQAE_BEGIN_PROCESSING && seen_stayactive) {
4785	/*
4786	* When we unsuppress stayactive knotes, for the kind that are hooked
4787	* through select, we need to process once before we can assert there's
4788	* no event pending. Hence we can't unbind during BEGIN PROCESSING.
4789	*/
4790	unbind = false;
4791	} else {
4792	unbind = ((kqr->kqr_state & KQR_WAKEUP) == `0`);
4793	}
4794	if (unbind) {
4795	old_override = kqworkq_unbind_locked(kqwq, kqr, thread);
4796	rc = -`1`;
4797	/*
4798	* request a new thread if we didn't process the whole queue or real events
4799	* have happened (not just putting stay-active events back).
4800	*/
4801	if (kqr->kqr_state & KQR_WAKEUP) {
4802	kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr,
4803	kqr->kqr_qos_index, `0`);
4804	}
4805	}
4806
4807	if (rc == `0`) {
4808	/*
4809	* Reset wakeup bit to notice events firing while we are processing,
4810	* as we cannot rely on the bucket queue emptiness because of stay
4811	* active knotes.
4812	*/
4813	kqr->kqr_state &= ~KQR_WAKEUP;
4814	}
4815
4816	kq_req_unlock(kqwq);
4817
4818	if (old_override) {
4819	thread_drop_ipc_override(thread);
4820	}
4821
4822	return rc;
4823	}
4824
4825	/*
4826	* Return 0 to indicate that processing should proceed,
4827	* -1 if there is nothing to process.
4828	*
4829	* Called with kqueue locked and returns the same way,
4830	* but may drop lock temporarily.
4831	*/
4832	static int
4833	kqworkq_begin_processing(struct kqworkq kqwq, struct* kqrequest *kqr,
4834	int kevent_flags)
4835	{
4836	int rc = `0`;
4837
4838	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) \| DBG_FUNC_START,
4839	`0`, kqr->kqr_qos_index);
4840
4841	rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
4842	KQWQAE_BEGIN_PROCESSING);
4843
4844	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) \| DBG_FUNC_END,
4845	thread_tid(kqr->kqr_thread), kqr->kqr_state);
4846
4847	return rc;
4848	}
4849
4850	static inline bool
4851	kqworkloop_is_processing_on_current_thread(struct kqworkloop *kqwl)
4852	{
4853	struct kqueue *kq = &kqwl->kqwl_kqueue;
4854
4855	kqlock_held(kq);
4856
4857	if (kq->kq_state & KQ_PROCESSING) {
4858	/*
4859	* KQ_PROCESSING is unset with the kqlock held, and the kqr thread is
4860	* never modified while KQ_PROCESSING is set, meaning that peeking at
4861	* its value is safe from this context.
4862	*/
4863	return kqwl->kqwl_request.kqr_thread == current_thread();
4864	}
4865	return false;
4866	}
4867
4868	static thread_qos_t
4869	kqworkloop_acknowledge_events(struct kqworkloop *kqwl)
4870	{
4871	struct kqrequest *kqr = &kqwl->kqwl_request;
4872	kq_index_t qos = THREAD_QOS_UNSPECIFIED;
4873	struct knote kn, tmp;
4874
4875	kqlock_held(&kqwl->kqwl_kqueue);
4876
4877	TAILQ_FOREACH_SAFE(kn, &kqr->kqr_suppressed, kn_tqe, tmp) {
4878	/*
4879	* If a knote that can adjust QoS is disabled because of the automatic
4880	* behavior of EV_DISPATCH, the knotes should stay suppressed so that
4881	* further overrides keep pushing.
4882	*/
4883	if (knote_fops(kn)->f_adjusts_qos && (kn->kn_status & KN_DISABLED) &&
4884	(kn->kn_status & (KN_STAYACTIVE \| KN_DROPPING)) == `0` &&
4885	(kn->kn_flags & (EV_DISPATCH \| EV_DISABLE)) == EV_DISPATCH) {
4886	qos = MAX(qos, knote_get_qos_override_index(kn));
4887	continue;
4888	}
4889	knote_unsuppress(kn);
4890	}
4891
4892	return qos;
4893	}
4894
4895	static int
4896	kqworkloop_begin_processing(struct kqworkloop kqwl, unsigned* int kevent_flags)
4897	{
4898	struct kqrequest *kqr = &kqwl->kqwl_request;
4899	struct kqueue *kq = &kqwl->kqwl_kqueue;
4900	thread_qos_t old_override = THREAD_QOS_UNSPECIFIED, qos_override;
4901	thread_t thread = kqr->kqr_thread;
4902	int rc = `0`, op = KQWL_UTQ_NONE;
4903
4904	kqlock_held(kq);
4905
4906	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) \| DBG_FUNC_START,
4907	kqwl->kqwl_dynamicid, `0`, `0`);
4908
4909	/ nobody else should still be processing /
4910	assert((kq->kq_state & KQ_PROCESSING) == `0`);
4911
4912	kq->kq_state \|= KQ_PROCESSING;
4913
4914	if (!TAILQ_EMPTY(&kqr->kqr_suppressed)) {
4915	op = KQWL_UTQ_RESET_WAKEUP_OVERRIDE;
4916	}
4917
4918	if (kevent_flags & KEVENT_FLAG_PARKING) {
4919	/*
4920	* When "parking" we want to process events and if no events are found
4921	* unbind.
4922	*
4923	* However, non overcommit threads sometimes park even when they have
4924	* more work so that the pool can narrow. For these, we need to unbind
4925	* early, so that calling kqworkloop_update_threads_qos() can ask the
4926	* workqueue subsystem whether the thread should park despite having
4927	* pending events.
4928	*/
4929	if (kqr->kqr_state & KQR_THOVERCOMMIT) {
4930	op = KQWL_UTQ_PARKING;
4931	} else {
4932	op = KQWL_UTQ_UNBINDING;
4933	}
4934	}
4935	if (op == KQWL_UTQ_NONE) {
4936	goto done;
4937	}
4938
4939	qos_override = kqworkloop_acknowledge_events(kqwl);
4940
4941	kq_req_lock(kqwl);
4942
4943	if (op == KQWL_UTQ_UNBINDING) {
4944	old_override = kqworkloop_unbind_locked(kqwl, thread);
4945	(void)kqueue_release(kqwl, KQUEUE_CANT_BE_LAST_REF);
4946	}
4947	kqworkloop_update_threads_qos(kqwl, op, qos_override);
4948	if (op == KQWL_UTQ_PARKING) {
4949	if (!TAILQ_EMPTY(&kqwl->kqwl_queue[KQWL_BUCKET_STAYACTIVE])) {
4950	/*
4951	* We cannot trust KQR_WAKEUP when looking at stay active knotes.
4952	* We need to process once, and kqworkloop_end_processing will
4953	* handle the unbind.
4954	*/
4955	} else if ((kqr->kqr_state & KQR_WAKEUP) == `0` \|\| kqwl->kqwl_owner) {
4956	old_override = kqworkloop_unbind_locked(kqwl, thread);
4957	(void)kqueue_release(kqwl, KQUEUE_CANT_BE_LAST_REF);
4958	rc = -`1`;
4959	}
4960	} else if (op == KQWL_UTQ_UNBINDING) {
4961	if (kqr->kqr_thread == thread) {
4962	/*
4963	* The thread request fired again, passed the admission check and
4964	* got bound to the current thread again.
4965	*/
4966	} else {
4967	rc = -`1`;
4968	}
4969	}
4970
4971	if (rc == `0`) {
4972	/*
4973	* Reset wakeup bit to notice stay active events firing while we are
4974	* processing, as we cannot rely on the stayactive bucket emptiness.
4975	*/
4976	kqr->kqr_wakeup_indexes &= ~KQWL_STAYACTIVE_FIRED_BIT;
4977	} else {
4978	kq->kq_state &= ~KQ_PROCESSING;
4979	}
4980
4981	kq_req_unlock(kqwl);
4982
4983	if (old_override) {
4984	thread_drop_ipc_override(thread);
4985	}
4986
4987	done:
4988	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) \| DBG_FUNC_END,
4989	kqwl->kqwl_dynamicid, `0`, `0`);
4990
4991	return rc;
4992	}
4993
4994	/*
4995	* Return 0 to indicate that processing should proceed,
4996	* -1 if there is nothing to process.
4997	*
4998	* Called with kqueue locked and returns the same way,
4999	* but may drop lock temporarily.
5000	* May block.
5001	*/
5002	static int
5003	kqfile_begin_processing(struct kqueue *kq)
5004	{
5005	struct kqtailq *suppressq;
5006
5007	kqlock_held(kq);
5008
5009	assert((kq->kq_state & (KQ_WORKQ \| KQ_WORKLOOP)) == `0`);
5010	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) \| DBG_FUNC_START,
5011	VM_KERNEL_UNSLIDE_OR_PERM(kq), `0`);
5012
5013	/ wait to become the exclusive processing thread /
5014	for (;;) {
5015	if (kq->kq_state & KQ_DRAIN) {
5016	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) \| DBG_FUNC_END,
5017	VM_KERNEL_UNSLIDE_OR_PERM(kq), `2`);
5018	return -`1`;
5019	}
5020
5021	if ((kq->kq_state & KQ_PROCESSING) == `0`)
5022	break;
5023
5024	/ if someone else is processing the queue, wait /
5025	kq->kq_state \|= KQ_PROCWAIT;
5026	suppressq = kqueue_get_suppressed_queue(kq, NULL);
5027	waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
5028	CAST_EVENT64_T(suppressq), THREAD_UNINT \| THREAD_WAIT_NOREPORT,
5029	TIMEOUT_WAIT_FOREVER);
5030
5031	kqunlock(kq);
5032	thread_block(THREAD_CONTINUE_NULL);
5033	kqlock(kq);
5034	}
5035
5036	/ Nobody else processing /
5037
5038	/ clear pre-posts and KQ_WAKEUP now, in case we bail early /
5039	waitq_set_clear_preposts(&kq->kq_wqs);
5040	kq->kq_state &= ~KQ_WAKEUP;
5041
5042	/ anything left to process? /
5043	if (kqueue_queue_empty(kq, QOS_INDEX_KQFILE)) {
5044	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) \| DBG_FUNC_END,
5045	VM_KERNEL_UNSLIDE_OR_PERM(kq), `1`);
5046	return -`1`;
5047	}
5048
5049	/ convert to processing mode /
5050	kq->kq_state \|= KQ_PROCESSING;
5051
5052	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) \| DBG_FUNC_END,
5053	VM_KERNEL_UNSLIDE_OR_PERM(kq));
5054
5055	return `0`;
5056	}
5057
5058	/*
5059	* Try to end the processing, only called when a workq thread is attempting to
5060	* park (KEVENT_FLAG_PARKING is set).
5061	*
5062	* When returning -1, the kqworkq is setup again so that it is ready to be
5063	* processed.
5064	*/
5065	static int
5066	kqworkq_end_processing(struct kqworkq kqwq, struct* kqrequest *kqr,
5067	int kevent_flags)
5068	{
5069	if (!kqueue_queue_empty(&kqwq->kqwq_kqueue, kqr->kqr_qos_index)) {
5070	/ remember we didn't process everything /
5071	kq_req_lock(kqwq);
5072	kqr->kqr_state \|= KQR_WAKEUP;
5073	kq_req_unlock(kqwq);
5074	}
5075
5076	if (kevent_flags & KEVENT_FLAG_PARKING) {
5077	/*
5078	* if acknowledge events "succeeds" it means there are events,
5079	* which is a failure condition for end_processing.
5080	*/
5081	int rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
5082	KQWQAE_END_PROCESSING);
5083	if (rc == `0`) {
5084	return -`1`;
5085	}
5086	}
5087
5088	return `0`;
5089	}
5090
5091	/*
5092	* Try to end the processing, only called when a workq thread is attempting to
5093	* park (KEVENT_FLAG_PARKING is set).
5094	*
5095	* When returning -1, the kqworkq is setup again so that it is ready to be
5096	* processed (as if kqworkloop_begin_processing had just been called).
5097	*
5098	* If successful and KEVENT_FLAG_PARKING was set in the kevent_flags,
5099	* the kqworkloop is unbound from its servicer as a side effect.
5100	*/
5101	static int
5102	kqworkloop_end_processing(struct kqworkloop kqwl, int* flags, int kevent_flags)
5103	{
5104	struct kqueue *kq = &kqwl->kqwl_kqueue;
5105	struct kqrequest *kqr = &kqwl->kqwl_request;
5106	thread_qos_t old_override = THREAD_QOS_UNSPECIFIED, qos_override;
5107	thread_t thread = kqr->kqr_thread;
5108	int rc = `0`;
5109
5110	kqlock_held(kq);
5111
5112	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) \| DBG_FUNC_START,
5113	kqwl->kqwl_dynamicid, `0`, `0`);
5114
5115	if (flags & KQ_PROCESSING) {
5116	assert(kq->kq_state & KQ_PROCESSING);
5117
5118	/*
5119	* If we still have queued stayactive knotes, remember we didn't finish
5120	* processing all of them. This should be extremely rare and would
5121	* require to have a lot of them registered and fired.
5122	*/
5123	if (!TAILQ_EMPTY(&kqwl->kqwl_queue[KQWL_BUCKET_STAYACTIVE])) {
5124	kq_req_lock(kqwl);
5125	kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS,
5126	KQWL_BUCKET_STAYACTIVE);
5127	kq_req_unlock(kqwl);
5128	}
5129
5130	/*
5131	* When KEVENT_FLAG_PARKING is set, we need to attempt an unbind while
5132	* still under the lock.
5133	*
5134	* So we do everything kqworkloop_unbind() would do, but because we're
5135	* inside kqueue_process(), if the workloop actually received events
5136	* while our locks were dropped, we have the opportunity to fail the end
5137	* processing and loop again.
5138	*
5139	* This avoids going through the process-wide workqueue lock hence
5140	* scales better.
5141	*/
5142	if (kevent_flags & KEVENT_FLAG_PARKING) {
5143	qos_override = kqworkloop_acknowledge_events(kqwl);
5144	}
5145	}
5146
5147	kq_req_lock(kqwl);
5148
5149	if (kevent_flags & KEVENT_FLAG_PARKING) {
5150	kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_PARKING, qos_override);
5151	if ((kqr->kqr_state & KQR_WAKEUP) && !kqwl->kqwl_owner) {
5152	/*
5153	* Reset wakeup bit to notice stay active events firing while we are
5154	* processing, as we cannot rely on the stayactive bucket emptiness.
5155	*/
5156	kqr->kqr_wakeup_indexes &= ~KQWL_STAYACTIVE_FIRED_BIT;
5157	rc = -`1`;
5158	} else {
5159	old_override = kqworkloop_unbind_locked(kqwl, thread);
5160	(void)kqueue_release(kqwl, KQUEUE_CANT_BE_LAST_REF);
5161	kq->kq_state &= ~flags;
5162	}
5163	} else {
5164	kq->kq_state &= ~flags;
5165	kqr->kqr_state \|= KQR_R2K_NOTIF_ARMED;
5166	kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, `0`);
5167	}
5168
5169	kq_req_unlock(kqwl);
5170
5171	if (old_override) {
5172	thread_drop_ipc_override(thread);
5173	}
5174
5175	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) \| DBG_FUNC_END,
5176	kqwl->kqwl_dynamicid, `0`, `0`);
5177
5178	return rc;
5179	}
5180
5181	/*
5182	* Called with kqueue lock held.
5183	*/
5184	static void
5185	kqfile_end_processing(struct kqueue *kq)
5186	{
5187	struct knote *kn;
5188	struct kqtailq *suppressq;
5189	int procwait;
5190
5191	kqlock_held(kq);
5192
5193	assert((kq->kq_state & (KQ_WORKQ\|KQ_WORKLOOP)) == `0`);
5194
5195	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_END),
5196	VM_KERNEL_UNSLIDE_OR_PERM(kq), `0`);
5197
5198	/*
5199	* Return suppressed knotes to their original state.
5200	*/
5201	suppressq = kqueue_get_suppressed_queue(kq, NULL);
5202	while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
5203	assert(kn->kn_status & KN_SUPPRESSED);
5204	knote_unsuppress(kn);
5205	}
5206
5207	procwait = (kq->kq_state & KQ_PROCWAIT);
5208	kq->kq_state &= ~(KQ_PROCESSING \| KQ_PROCWAIT);
5209
5210	if (procwait) {
5211	/ first wake up any thread already waiting to process /
5212	waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
5213	CAST_EVENT64_T(suppressq),
5214	THREAD_AWAKENED,
5215	WAITQ_ALL_PRIORITIES);
5216	}
5217	}
5218
5219	static int
5220	kqueue_workloop_ctl_internal(proc_t p, uintptr_t cmd, uint64_t __unused options,
5221	struct kqueue_workloop_params params, int* *retval)
5222	{
5223	int error = `0`;
5224	int fd;
5225	struct fileproc *fp;
5226	struct kqueue *kq;
5227	struct kqworkloop *kqwl;
5228	struct filedesc *fdp = p->p_fd;
5229	workq_threadreq_param_t trp = { };
5230
5231	switch (cmd) {
5232	case KQ_WORKLOOP_CREATE:
5233	if (!params->kqwlp_flags) {
5234	error = EINVAL;
5235	break;
5236	}
5237
5238	if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) &&
5239	(params->kqwlp_sched_pri < `1` \|\|
5240	params->kqwlp_sched_pri > `63` / MAXPRI_USER /)) {
5241	error = EINVAL;
5242	break;
5243	}
5244
5245	if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) &&
5246	invalid_policy(params->kqwlp_sched_pol)) {
5247	error = EINVAL;
5248	break;
5249	}
5250
5251	if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) &&
5252	(params->kqwlp_cpu_percent <= `0` \|\|
5253	params->kqwlp_cpu_percent > `100` \|\|
5254	params->kqwlp_cpu_refillms <= `0` \|\|
5255	params->kqwlp_cpu_refillms > `0x00ffffff`)) {
5256	error = EINVAL;
5257	break;
5258	}
5259
5260	if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) {
5261	trp.trp_flags \|= TRP_PRIORITY;
5262	trp.trp_pri = params->kqwlp_sched_pri;
5263	}
5264	if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) {
5265	trp.trp_flags \|= TRP_POLICY;
5266	trp.trp_pol = params->kqwlp_sched_pol;
5267	}
5268	if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) {
5269	trp.trp_flags \|= TRP_CPUPERCENT;
5270	trp.trp_cpupercent = (uint8_t)params->kqwlp_cpu_percent;
5271	trp.trp_refillms = params->kqwlp_cpu_refillms;
5272	}
5273
5274	error = kevent_get_kq(p, params->kqwlp_id, &trp,
5275	KEVENT_FLAG_DYNAMIC_KQUEUE \| KEVENT_FLAG_WORKLOOP \|
5276	KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST , &fp, &fd, &kq);
5277	if (error) {
5278	break;
5279	}
5280
5281	if (!(fdp->fd_flags & FD_WORKLOOP)) {
5282	/ FD_WORKLOOP indicates we've ever created a workloop*
5283	* via this syscall but its only ever added to a process, never
5284	* removed.
5285	*/
5286	proc_fdlock(p);
5287	fdp->fd_flags \|= FD_WORKLOOP;
5288	proc_fdunlock(p);
5289	}
5290	break;
5291	case KQ_WORKLOOP_DESTROY:
5292	error = kevent_get_kq(p, params->kqwlp_id, NULL,
5293	KEVENT_FLAG_DYNAMIC_KQUEUE \| KEVENT_FLAG_WORKLOOP \|
5294	KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST , &fp, &fd, &kq);
5295	if (error) {
5296	break;
5297	}
5298	kqlock(kq);
5299	kqwl = (struct kqworkloop *)kq;
5300	trp.trp_value = kqwl->kqwl_params;
5301	if (trp.trp_flags && !(trp.trp_flags & TRP_RELEASED)) {
5302	trp.trp_flags \|= TRP_RELEASED;
5303	kqueue_release(kq, KQUEUE_CANT_BE_LAST_REF);
5304	} else {
5305	error = EINVAL;
5306	}
5307	kqunlock(kq);
5308	kqueue_release_last(p, kq);
5309	break;
5310	}
5311	*retval = `0`;
5312	return error;
5313	}
5314
5315	int
5316	kqueue_workloop_ctl(proc_t p, struct kqueue_workloop_ctl_args uap, int* *retval)
5317	{
5318	struct kqueue_workloop_params params = {
5319	.kqwlp_id = `0`,
5320	};
5321	if (uap->sz < sizeof(params.kqwlp_version)) {
5322	return EINVAL;
5323	}
5324
5325	size_t copyin_sz = MIN(sizeof(params), uap->sz);
5326	int rv = copyin(uap->addr, &params, copyin_sz);
5327	if (rv) {
5328	return rv;
5329	}
5330
5331	if (params.kqwlp_version != (int)uap->sz) {
5332	return EINVAL;
5333	}
5334
5335	return kqueue_workloop_ctl_internal(p, uap->cmd, uap->options, &params,
5336	retval);
5337	}
5338
5339	/*
5340	* kqueue_process - process the triggered events in a kqueue
5341	*
5342	* Walk the queued knotes and validate that they are really still triggered
5343	* events by calling the filter routines (if necessary).
5344	*
5345	* For each event that is still considered triggered, invoke the callback
5346	* routine provided.
5347	*
5348	* caller holds a reference on the kqueue.
5349	* kqueue locked on entry and exit - but may be dropped
5350	* kqueue list locked (held for duration of call)
5351	*/
5352	static int
5353	kqueue_process(struct kqueue *kq,
5354	kevent_callback_t callback,
5355	void *callback_data,
5356	struct filt_process_s *process_data,
5357	int *countp)
5358	{
5359	struct uthread *ut = get_bsdthread_info(current_thread());
5360	struct kqrequest *kqr = ut->uu_kqr_bound;
5361	struct knote *kn;
5362	unsigned int flags = process_data ? process_data->fp_flags : `0`;
5363	int nevents = `0`, error = `0`, rc = `0`;
5364	struct kqtailq base_queue, queue;
5365	kqueue_t kqu = { .kq = kq };
5366	#if DEBUG \|\| DEVELOPMENT
5367	int retries = `64`;
5368	#endif
5369
5370	if (kq->kq_state & KQ_WORKQ) {
5371	if (kqr == NULL \|\| (kqr->kqr_state & KQR_WORKLOOP)) {
5372	return EJUSTRETURN;
5373	}
5374	rc = kqworkq_begin_processing(kqu.kqwq, kqr, flags);
5375	} else if (kq->kq_state & KQ_WORKLOOP) {
5376	if (ut->uu_kqr_bound != &kqu.kqwl->kqwl_request) {
5377	return EJUSTRETURN;
5378	}
5379	rc = kqworkloop_begin_processing(kqu.kqwl, flags);
5380	} else {
5381	rc = kqfile_begin_processing(kq);
5382	}
5383
5384	if (rc == -`1`) {
5385	/ Nothing to process /
5386	*countp = `0`;
5387	return `0`;
5388	}
5389
5390	/*
5391	* loop through the enqueued knotes associated with this request,
5392	* processing each one. Each request may have several queues
5393	* of knotes to process (depending on the type of kqueue) so we
5394	* have to loop through all the queues as long as we have additional
5395	* space.
5396	*/
5397
5398	process_again:
5399	if (kq->kq_state & KQ_WORKQ) {
5400	base_queue = queue = &kqu.kqwq->kqwq_queue[kqr->kqr_qos_index];
5401	} else if (kq->kq_state & KQ_WORKLOOP) {
5402	base_queue = &kqu.kqwl->kqwl_queue[`0`];
5403	queue = &kqu.kqwl->kqwl_queue[KQWL_NBUCKETS - `1`];
5404	} else {
5405	base_queue = queue = &kq->kq_queue[QOS_INDEX_KQFILE];
5406	}
5407
5408	do {
5409	while (error == `0` && (kn = TAILQ_FIRST(queue)) != NULL) {
5410	error = knote_process(kn, callback, callback_data, process_data);
5411	if (error == EJUSTRETURN) {
5412	error = `0`;
5413	} else {
5414	nevents++;
5415	}
5416	/ error is EWOULDBLOCK when the out event array is full /
5417	}
5418
5419	if (error == EWOULDBLOCK) {
5420	/ break out if no more space for additional events /
5421	error = `0`;
5422	break;
5423	}
5424	} while (queue-- > base_queue);
5425
5426	*countp = nevents;
5427
5428	/*
5429	* If KEVENT_FLAG_PARKING is set, and no kevents have been returned,
5430	* we want to unbind the kqrequest from the thread.
5431	*
5432	* However, because the kq locks are dropped several times during process,
5433	* new knotes may have fired again, in which case, we want to fail the end
5434	* processing and process again, until it converges.
5435	*
5436	* If we returned events however, end processing never fails.
5437	*/
5438	if (error \|\| nevents) flags &= ~KEVENT_FLAG_PARKING;
5439	if (kq->kq_state & KQ_WORKQ) {
5440	rc = kqworkq_end_processing(kqu.kqwq, kqr, flags);
5441	} else if (kq->kq_state & KQ_WORKLOOP) {
5442	rc = kqworkloop_end_processing(kqu.kqwl, KQ_PROCESSING, flags);
5443	} else {
5444	kqfile_end_processing(kq);
5445	rc = `0`;
5446	}
5447	if (rc == -`1`) {
5448	assert(flags & KEVENT_FLAG_PARKING);
5449	#if DEBUG \|\| DEVELOPMENT
5450	if (retries-- == `0`) {
5451	panic("kevent: way too many knote_process retries, kq: %p (0x%02x)",
5452	kq, kq->kq_state);
5453	}
5454	#endif
5455	goto process_again;
5456	}
5457	return error;
5458	}
5459
5460	static void
5461	kqueue_scan_continue(void *data, wait_result_t wait_result)
5462	{
5463	thread_t self = current_thread();
5464	uthread_t ut = (uthread_t)get_bsdthread_info(self);
5465	struct _kqueue_scan * cont_args = &ut->uu_save.uus_kqueue_scan;
5466	struct kqueue kq = (struct* kqueue *)data;
5467	struct filt_process_s *process_data = cont_args->process_data;
5468	int error;
5469	int count;
5470
5471	/ convert the (previous) wait_result to a proper error /
5472	switch (wait_result) {
5473	case THREAD_AWAKENED: {
5474	kqlock(kq);
5475	retry:
5476	error = kqueue_process(kq, cont_args->call, cont_args->data,
5477	process_data, &count);
5478	if (error == `0` && count == `0`) {
5479	if (kq->kq_state & KQ_DRAIN) {
5480	kqunlock(kq);
5481	goto drain;
5482	}
5483
5484	if (kq->kq_state & KQ_WAKEUP)
5485	goto retry;
5486
5487	waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
5488	KQ_EVENT, THREAD_ABORTSAFE,
5489	cont_args->deadline);
5490	kq->kq_state \|= KQ_SLEEP;
5491	kqunlock(kq);
5492	thread_block_parameter(kqueue_scan_continue, kq);
5493	/ NOTREACHED /
5494	}
5495	kqunlock(kq);
5496	} break;
5497	case THREAD_TIMED_OUT:
5498	error = EWOULDBLOCK;
5499	break;
5500	case THREAD_INTERRUPTED:
5501	error = EINTR;
5502	break;
5503	case THREAD_RESTART:
5504	drain:
5505	error = EBADF;
5506	break;
5507	default:
5508	panic("%s: - invalid wait_result (%d)", __func__,
5509	wait_result);
5510	error = `0`;
5511	}
5512
5513	/ call the continuation with the results /
5514	assert(cont_args->cont != NULL);
5515	(cont_args->cont)(kq, cont_args->data, error);
5516	}
5517
5518
5519	/*
5520	* kqueue_scan - scan and wait for events in a kqueue
5521	*
5522	* Process the triggered events in a kqueue.
5523	*
5524	* If there are no events triggered arrange to
5525	* wait for them. If the caller provided a
5526	* continuation routine, then kevent_scan will
5527	* also.
5528	*
5529	* The callback routine must be valid.
5530	* The caller must hold a use-count reference on the kq.
5531	*/
5532	int
5533	kqueue_scan(struct kqueue *kq,
5534	kevent_callback_t callback,
5535	kqueue_continue_t continuation,
5536	void *callback_data,
5537	struct filt_process_s *process_data,
5538	struct timeval *atvp,
5539	__unused struct proc *p)
5540	{
5541	thread_continue_t cont = THREAD_CONTINUE_NULL;
5542	unsigned int flags;
5543	uint64_t deadline;
5544	int error;
5545	int first;
5546	int fd;
5547
5548	assert(callback != NULL);
5549
5550	/*
5551	* Determine which QoS index we are servicing
5552	*/
5553	flags = (process_data) ? process_data->fp_flags : `0`;
5554	fd = (process_data) ? process_data->fp_fd : -`1`;
5555
5556	first = `1`;
5557	for (;;) {
5558	wait_result_t wait_result;
5559	int count;
5560
5561	/*
5562	* Make a pass through the kq to find events already
5563	* triggered.
5564	*/
5565	kqlock(kq);
5566	error = kqueue_process(kq, callback, callback_data,
5567	process_data, &count);
5568	if (error \|\| count)
5569	break; / lock still held /
5570
5571	/ looks like we have to consider blocking /
5572	if (first) {
5573	first = `0`;
5574	/ convert the timeout to a deadline once /
5575	if (atvp->tv_sec \|\| atvp->tv_usec) {
5576	uint64_t now;
5577
5578	clock_get_uptime(&now);
5579	nanoseconds_to_absolutetime((uint64_t)atvp->tv_sec * NSEC_PER_SEC +
5580	atvp->tv_usec * (long)NSEC_PER_USEC,
5581	&deadline);
5582	if (now >= deadline) {
5583	/ non-blocking call /
5584	error = EWOULDBLOCK;
5585	break; / lock still held /
5586	}
5587	deadline -= now;
5588	clock_absolutetime_interval_to_deadline(deadline, &deadline);
5589	} else {
5590	deadline = `0`; / block forever /
5591	}
5592
5593	if (continuation) {
5594	uthread_t ut = (uthread_t)get_bsdthread_info(current_thread());
5595	struct _kqueue_scan *cont_args = &ut->uu_save.uus_kqueue_scan;
5596
5597	cont_args->call = callback;
5598	cont_args->cont = continuation;
5599	cont_args->deadline = deadline;
5600	cont_args->data = callback_data;
5601	cont_args->process_data = process_data;
5602	cont = kqueue_scan_continue;
5603	}
5604	}
5605
5606	if (kq->kq_state & KQ_DRAIN) {
5607	kqunlock(kq);
5608	return EBADF;
5609	}
5610
5611	/ If awakened during processing, try again /
5612	if (kq->kq_state & KQ_WAKEUP) {
5613	kqunlock(kq);
5614	continue;
5615	}
5616
5617	/ go ahead and wait /
5618	waitq_assert_wait64_leeway((struct waitq *)&kq->kq_wqs,
5619	KQ_EVENT, THREAD_ABORTSAFE,
5620	TIMEOUT_URGENCY_USER_NORMAL,
5621	deadline, TIMEOUT_NO_LEEWAY);
5622	kq->kq_state \|= KQ_SLEEP;
5623	kqunlock(kq);
5624	wait_result = thread_block_parameter(cont, kq);
5625	/ NOTREACHED if (continuation != NULL) /
5626
5627	switch (wait_result) {
5628	case THREAD_AWAKENED:
5629	continue;
5630	case THREAD_TIMED_OUT:
5631	return EWOULDBLOCK;
5632	case THREAD_INTERRUPTED:
5633	return EINTR;
5634	case THREAD_RESTART:
5635	return EBADF;
5636	default:
5637	panic("%s: - bad wait_result (%d)", __func__,
5638	wait_result);
5639	error = `0`;
5640	}
5641	}
5642	kqunlock(kq);
5643	return (error);
5644	}
5645
5646
5647	/*
5648	* XXX
5649	* This could be expanded to call kqueue_scan, if desired.
5650	*/
5651	/ARGSUSED/
5652	static int
5653	kqueue_read(__unused struct fileproc *fp,
5654	__unused struct uio *uio,
5655	__unused int flags,
5656	__unused vfs_context_t ctx)
5657	{
5658	return (ENXIO);
5659	}
5660
5661	/ARGSUSED/
5662	static int
5663	kqueue_write(__unused struct fileproc *fp,
5664	__unused struct uio *uio,
5665	__unused int flags,
5666	__unused vfs_context_t ctx)
5667	{
5668	return (ENXIO);
5669	}
5670
5671	/ARGSUSED/
5672	static int
5673	kqueue_ioctl(__unused struct fileproc *fp,
5674	__unused u_long com,
5675	__unused caddr_t data,
5676	__unused vfs_context_t ctx)
5677	{
5678	return (ENOTTY);
5679	}
5680
5681	/ARGSUSED/
5682	static int
5683	kqueue_select(struct fileproc fp, int* which, void *wq_link_id,
5684	__unused vfs_context_t ctx)
5685	{
5686	struct kqueue kq = (struct* kqueue *)fp->f_data;
5687	struct kqtailq *queue;
5688	struct kqtailq *suppressq;
5689	struct knote *kn;
5690	int retnum = `0`;
5691
5692	if (which != FREAD)
5693	return (`0`);
5694
5695	kqlock(kq);
5696
5697	assert((kq->kq_state & KQ_WORKQ) == `0`);
5698
5699	/*
5700	* If this is the first pass, link the wait queue associated with the
5701	* the kqueue onto the wait queue set for the select(). Normally we
5702	* use selrecord() for this, but it uses the wait queue within the
5703	* selinfo structure and we need to use the main one for the kqueue to
5704	* catch events from KN_STAYQUEUED sources. So we do the linkage manually.
5705	* (The select() call will unlink them when it ends).
5706	*/
5707	if (wq_link_id != NULL) {
5708	thread_t cur_act = current_thread();
5709	struct uthread * ut = get_bsdthread_info(cur_act);
5710
5711	kq->kq_state \|= KQ_SEL;
5712	waitq_link((struct waitq *)&kq->kq_wqs, ut->uu_wqset,
5713	WAITQ_SHOULD_LOCK, (uint64_t *)wq_link_id);
5714
5715	/ always consume the reserved link object /
5716	waitq_link_release((uint64_t )wq_link_id);
5717	(uint64_t )wq_link_id = `0`;
5718
5719	/*
5720	* selprocess() is expecting that we send it back the waitq
5721	* that was just added to the thread's waitq set. In order
5722	* to not change the selrecord() API (which is exported to
5723	* kexts), we pass this value back through the
5724	* void *wq_link_id pointer we were passed. We need to use
5725	* memcpy here because the pointer may not be properly aligned
5726	* on 32-bit systems.
5727	*/
5728	void *wqptr = &kq->kq_wqs;
5729	memcpy(wq_link_id, (void )&wqptr, sizeof(void* *));
5730	}
5731
5732	if (kqfile_begin_processing(kq) == -`1`) {
5733	kqunlock(kq);
5734	return (`0`);
5735	}
5736
5737	queue = &kq->kq_queue[QOS_INDEX_KQFILE];
5738	if (!TAILQ_EMPTY(queue)) {
5739	/*
5740	* there is something queued - but it might be a
5741	* KN_STAYACTIVE knote, which may or may not have
5742	* any events pending. Otherwise, we have to walk
5743	* the list of knotes to see, and peek at the
5744	* (non-vanished) stay-active ones to be really sure.
5745	*/
5746	while ((kn = (struct knote *)TAILQ_FIRST(queue)) != NULL) {
5747	if (kn->kn_status & KN_ACTIVE) {
5748	retnum = `1`;
5749	goto out;
5750	}
5751	assert(kn->kn_status & KN_STAYACTIVE);
5752	knote_suppress(kn);
5753	}
5754
5755	/*
5756	* There were no regular events on the queue, so take
5757	* a deeper look at the stay-queued ones we suppressed.
5758	*/
5759	suppressq = kqueue_get_suppressed_queue(kq, NULL);
5760	while ((kn = (struct knote *)TAILQ_FIRST(suppressq)) != NULL) {
5761	KNOTE_LOCK_CTX(knlc);
5762	int result = `0`;
5763
5764	/ If didn't vanish while suppressed - peek at it /
5765	if ((kn->kn_status & KN_DROPPING) \|\| !knote_lock(kq, kn, &knlc,
5766	KNOTE_KQ_LOCK_ON_FAILURE)) {
5767	continue;
5768	}
5769
5770	result = filter_call(knote_fops(kn), f_peek(kn));
5771
5772	kqlock(kq);
5773	knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
5774
5775	/ unsuppress it /
5776	knote_unsuppress(kn);
5777
5778	/ has data or it has to report a vanish /
5779	if (result & FILTER_ACTIVE) {
5780	retnum = `1`;
5781	goto out;
5782	}
5783	}
5784	}
5785
5786	out:
5787	kqfile_end_processing(kq);
5788	kqunlock(kq);
5789	return (retnum);
5790	}
5791
5792	/*
5793	* kqueue_close -
5794	*/
5795	/ARGSUSED/
5796	static int
5797	kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx)
5798	{
5799	struct kqfile kqf = (struct* kqfile *)fg->fg_data;
5800
5801	assert((kqf->kqf_state & KQ_WORKQ) == `0`);
5802	kqueue_dealloc(&kqf->kqf_kqueue);
5803	fg->fg_data = NULL;
5804	return (`0`);
5805	}
5806
/*
 * Max depth of the nested kq path that can be created.
 * Note that this has to be less than the largest value kq_level
 * can hold, to avoid wrapping around and mislabeling the level.
 */
#define MAX_NESTED_KQ 1000
5813
5814	/ARGSUSED/
5815	/*
5816	* The callers has taken a use-count reference on this kqueue and will donate it
5817	* to the kqueue we are being added to. This keeps the kqueue from closing until
5818	* that relationship is torn down.
5819	*/
5820	static int
5821	kqueue_kqfilter(__unused struct fileproc fp, struct* knote *kn,
5822	__unused struct kevent_internal_s *kev, __unused vfs_context_t ctx)
5823	{
5824	struct kqfile kqf = (struct* kqfile *)kn->kn_fp->f_data;
5825	struct kqueue *kq = &kqf->kqf_kqueue;
5826	struct kqueue *parentkq = knote_get_kq(kn);
5827	uint16_t plevel = `0`;
5828
5829	assert((kqf->kqf_state & KQ_WORKQ) == `0`);
5830
5831	if (parentkq == kq \|\| kn->kn_filter != EVFILT_READ) {
5832	knote_set_error(kn, EINVAL);
5833	return `0`;
5834	}
5835
5836	/*
5837	* We have to avoid creating a cycle when nesting kqueues
5838	* inside another. Rather than trying to walk the whole
5839	* potential DAG of nested kqueues, we just use a simple
5840	* ceiling protocol. When a kqueue is inserted into another,
5841	* we check that the (future) parent is not already nested
5842	* into another kqueue at a lower level than the potenial
5843	* child (because it could indicate a cycle). If that test
5844	* passes, we just mark the nesting levels accordingly.
5845	*
5846	* Only up to MAX_NESTED_KQ can be nested.
5847	*/
5848
5849	kqlock(parentkq);
5850	if (parentkq->kq_level > `0` &&
5851	parentkq->kq_level < kq->kq_level)
5852	{
5853	kqunlock(parentkq);
5854	knote_set_error(kn, EINVAL);
5855	return `0`;
5856	} else {
5857	/ set parent level appropriately /
5858	plevel = (parentkq->kq_level == `0`)? `2`: parentkq->kq_level;
5859	if (plevel < kq->kq_level + `1`) {
5860	if (kq->kq_level + `1` > MAX_NESTED_KQ) {
5861	kqunlock(parentkq);
5862	knote_set_error(kn, EINVAL);
5863	return `0`;
5864	}
5865	plevel = kq->kq_level + `1`;
5866	}
5867
5868	parentkq->kq_level = plevel;
5869	kqunlock(parentkq);
5870
5871	kn->kn_filtid = EVFILTID_KQREAD;
5872	kqlock(kq);
5873	KNOTE_ATTACH(&kqf->kqf_sel.si_note, kn);
5874	/ indicate nesting in child, if needed /
5875	if (kq->kq_level == `0`)
5876	kq->kq_level = `1`;
5877
5878	int count = kq->kq_count;
5879	kqunlock(kq);
5880	return (count > `0`);
5881	}
5882	}
5883
5884	/*
5885	* kqueue_drain - called when kq is closed
5886	*/
5887	/ARGSUSED/
5888	static int
5889	kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx)
5890	{
5891	struct kqueue kq = (struct* kqueue *)fp->f_fglob->fg_data;
5892
5893	assert((kq->kq_state & KQ_WORKQ) == `0`);
5894
5895	kqlock(kq);
5896	kq->kq_state \|= KQ_DRAIN;
5897	kqueue_interrupt(kq);
5898	kqunlock(kq);
5899	return (`0`);
5900	}
5901
5902	/ARGSUSED/
5903	int
5904	kqueue_stat(struct kqueue kq, void* ub, int* isstat64, proc_t p)
5905	{
5906	assert((kq->kq_state & KQ_WORKQ) == `0`);
5907
5908	kqlock(kq);
5909	if (isstat64 != `0`) {
5910	struct stat64 sb64 = (struct* stat64 *)ub;
5911
5912	bzero((void )sb64, sizeof(sb64));
5913	sb64->st_size = kq->kq_count;
5914	if (kq->kq_state & KQ_KEV_QOS)
5915	sb64->st_blksize = sizeof(struct kevent_qos_s);
5916	else if (kq->kq_state & KQ_KEV64)
5917	sb64->st_blksize = sizeof(struct kevent64_s);
5918	else if (IS_64BIT_PROCESS(p))
5919	sb64->st_blksize = sizeof(struct user64_kevent);
5920	else
5921	sb64->st_blksize = sizeof(struct user32_kevent);
5922	sb64->st_mode = S_IFIFO;
5923	} else {
5924	struct stat sb = (struct* stat *)ub;
5925
5926	bzero((void )sb, sizeof(sb));
5927	sb->st_size = kq->kq_count;
5928	if (kq->kq_state & KQ_KEV_QOS)
5929	sb->st_blksize = sizeof(struct kevent_qos_s);
5930	else if (kq->kq_state & KQ_KEV64)
5931	sb->st_blksize = sizeof(struct kevent64_s);
5932	else if (IS_64BIT_PROCESS(p))
5933	sb->st_blksize = sizeof(struct user64_kevent);
5934	else
5935	sb->st_blksize = sizeof(struct user32_kevent);
5936	sb->st_mode = S_IFIFO;
5937	}
5938	kqunlock(kq);
5939	return (`0`);
5940	}
5941
5942	/*
5943	* Interact with the pthread kext to request a servicing there at a specific QoS
5944	* level.
5945	*
5946	* - Caller holds the workq request lock
5947	*
5948	* - May be called with the kqueue's wait queue set locked,
5949	* so cannot do anything that could recurse on that.
5950	*/
5951	static void
5952	kqueue_threadreq_initiate(struct kqueue kq, struct* kqrequest *kqr,
5953	kq_index_t qos, int flags)
5954	{
5955	assert(kqr->kqr_state & KQR_WAKEUP);
5956	assert(kqr->kqr_thread == THREAD_NULL);
5957	assert((kqr->kqr_state & KQR_THREQUESTED) == `0`);
5958	struct turnstile *ts = TURNSTILE_NULL;
5959
5960	if (workq_is_exiting(kq->kq_p)) {
5961	return;
5962	}
5963
5964	/ Add a thread request reference on the kqueue. /
5965	kqueue_retain(kq);
5966
5967	kq_req_held(kq);
5968
5969	if (kq->kq_state & KQ_WORKLOOP) {
5970	__assert_only struct kqworkloop kqwl = (struct* kqworkloop *)kq;
5971
5972	assert(kqwl->kqwl_owner == THREAD_NULL);
5973	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THREQUEST),
5974	kqwl->kqwl_dynamicid, `0`, qos, kqr->kqr_state);
5975	ts = kqwl->kqwl_turnstile;
5976	} else {
5977	assert(kq->kq_state & KQ_WORKQ);
5978	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_THREQUEST),
5979	-`1`, `0`, qos, kqr->kqr_state);
5980	}
5981
5982	kqr->kqr_state \|= KQR_THREQUESTED;
5983
5984	/*
5985	* New-style thread request supported.
5986	* Provide the pthread kext a pointer to a workq_threadreq_s structure for
5987	* its use until a corresponding kqueue_threadreq_bind callback.
5988	*/
5989	if ((kq->kq_state & KQ_WORKLOOP) && current_proc() == kq->kq_p) {
5990	flags \|= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
5991	}
5992	if (qos == KQWQ_QOS_MANAGER) {
5993	qos = WORKQ_THREAD_QOS_MANAGER;
5994	}
5995	if (!workq_kern_threadreq_initiate(kq->kq_p, kqr, ts, qos, flags)) {
5996	/*
5997	* Process is shutting down or exec'ing.
5998	* All the kqueues are going to be cleaned up
5999	* soon. Forget we even asked for a thread -
6000	* and make sure we don't ask for more.
6001	*/
6002	kqr->kqr_state &= ~(KQR_THREQUESTED \| KQR_R2K_NOTIF_ARMED);
6003	kqueue_release(kq, KQUEUE_CANT_BE_LAST_REF);
6004	}
6005	}
6006
6007	/*
6008	* kqueue_threadreq_bind_prepost - prepost the bind to kevent
6009	*
6010	* This is used when kqueue_threadreq_bind may cause a lock inversion.
6011	*/
6012	void
6013	kqueue_threadreq_bind_prepost(struct proc *p __unused, workq_threadreq_t req,
6014	thread_t thread)
6015	{
6016	struct kqrequest kqr = __container_of(req, struct* kqrequest, kqr_req);
6017	struct uthread *ut = get_bsdthread_info(thread);
6018
6019	req->tr_binding_thread = thread;
6020	ut->uu_kqr_bound = kqr;
6021	req->tr_state = TR_STATE_BINDING;
6022
6023	struct kqworkloop *kqwl = kqr_kqworkloop(kqr);
6024	if (kqwl && kqwl->kqwl_turnstile) {
6025	struct turnstile *ts = kqwl->kqwl_turnstile;
6026	/*
6027	* While a thread request is in flight, the workqueue
6028	* is the interlock for the turnstile and can update the inheritor.
6029	*/
6030	turnstile_update_inheritor(ts, thread, TURNSTILE_IMMEDIATE_UPDATE \|
6031	TURNSTILE_INHERITOR_THREAD);
6032	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
6033	}
6034	}
6035
6036	/*
6037	* kqueue_threadreq_bind_commit - commit a bind prepost
6038	*
6039	* The workq code has to commit any binding prepost before the thread has
6040	* a chance to come back to userspace (and do kevent syscalls) or be aborted.
6041	*/
6042	void
6043	kqueue_threadreq_bind_commit(struct proc *p, thread_t thread)
6044	{
6045	struct uthread *ut = get_bsdthread_info(thread);
6046	struct kqrequest *kqr = ut->uu_kqr_bound;
6047	kqueue_t kqu = kqr_kqueue(p, kqr);
6048
6049	kq_req_lock(kqu);
6050	if (kqr->kqr_req.tr_state == TR_STATE_BINDING) {
6051	kqueue_threadreq_bind(p, &kqr->kqr_req, thread, `0`);
6052	}
6053	kq_req_unlock(kqu);
6054	}
6055
6056	static void
6057	kqueue_threadreq_modify(struct kqueue kq, struct* kqrequest *kqr, kq_index_t qos)
6058	{
6059	assert(kqr->kqr_state & KQR_THREQUESTED);
6060	assert(kqr->kqr_thread == THREAD_NULL);
6061
6062	kq_req_held(kq);
6063
6064	int flags = `0`;
6065	if ((kq->kq_state & KQ_WORKLOOP) && kq->kq_p == current_proc()) {
6066	flags \|= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
6067	}
6068	workq_kern_threadreq_modify(kq->kq_p, kqr, qos, flags);
6069	}
6070
6071	/*
6072	* kqueue_threadreq_bind - bind thread to processing kqrequest
6073	*
6074	* The provided thread will be responsible for delivering events
6075	* associated with the given kqrequest. Bind it and get ready for
6076	* the thread to eventually arrive.
6077	*/
6078	void
6079	kqueue_threadreq_bind(struct proc *p, workq_threadreq_t req, thread_t thread,
6080	unsigned int flags)
6081	{
6082	struct kqrequest kqr = __container_of(req, struct* kqrequest, kqr_req);
6083	kqueue_t kqu = kqr_kqueue(p, kqr);
6084	struct uthread *ut = get_bsdthread_info(thread);
6085
6086	kq_req_held(kqu);
6087
6088	assert(kqr->kqr_state & KQR_THREQUESTED);
6089	assert(kqr->kqr_thread == THREAD_NULL);
6090	assert(ut->uu_kqueue_override == `0`);
6091
6092	if (kqr->kqr_req.tr_state == TR_STATE_BINDING) {
6093	assert(ut->uu_kqr_bound == kqr);
6094	assert(kqr->kqr_req.tr_binding_thread == thread);
6095	kqr->kqr_req.tr_state = TR_STATE_IDLE;
6096	kqr->kqr_req.tr_binding_thread = NULL;
6097	} else {
6098	assert(ut->uu_kqr_bound == NULL);
6099	}
6100
6101	ut->uu_kqr_bound = kqr;
6102	kqr->kqr_thread = thread;
6103
6104	if (kqu.kq->kq_state & KQ_WORKLOOP) {
6105	struct turnstile *ts = kqu.kqwl->kqwl_turnstile;
6106
6107	if (__improbable(thread == kqu.kqwl->kqwl_owner)) {
6108	/*
6109	* <rdar://problem/38626999> shows that asserting here is not ok.
6110	*
6111	* This is not supposed to happen for correct use of the interface,
6112	* but it is sadly possible for userspace (with the help of memory
6113	* corruption, such as over-release of a dispatch queue) to make
6114	* the creator thread the "owner" of a workloop.
6115	*
6116	* Once that happens, and that creator thread picks up the same
6117	* workloop as a servicer, we trip this codepath. We need to fixup
6118	* the state to forget about this thread being the owner, as the
6119	* entire workloop state machine expects servicers to never be
6120	* owners and everything would basically go downhill from here.
6121	*/
6122	kqu.kqwl->kqwl_owner = THREAD_NULL;
6123	if (kqworkloop_owner_override(kqu.kqwl)) {
6124	thread_drop_ipc_override(thread);
6125	}
6126	thread_ends_owning_workloop(thread);
6127	}
6128
6129	if (ts && (flags & KQUEUE_THREADERQ_BIND_NO_INHERITOR_UPDATE) == `0`) {
6130	/*
6131	* Past this point, the interlock is the kq req lock again,
6132	* so we can fix the inheritor for good.
6133	*/
6134	filt_wlupdate_inheritor(kqu.kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
6135	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
6136	}
6137
6138	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_BIND), kqu.kqwl->kqwl_dynamicid,
6139	thread_tid(thread), kqr->kqr_qos_index,
6140	(kqr->kqr_override_index << `16`) \| kqr->kqr_state);
6141
6142	ut->uu_kqueue_override = kqr->kqr_override_index;
6143	if (kqr->kqr_override_index) {
6144	thread_add_ipc_override(thread, kqr->kqr_override_index);
6145	}
6146	} else {
6147	assert(kqr->kqr_override_index == `0`);
6148
6149	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_BIND), -`1`,
6150	thread_tid(thread), kqr->kqr_qos_index,
6151	(kqr->kqr_override_index << `16`) \| kqr->kqr_state);
6152	}
6153	}
6154
6155	/*
6156	* kqueue_threadreq_cancel - abort a pending thread request
6157	*
6158	* Called when exiting/exec'ing. Forget our pending request.
6159	*/
6160	void
6161	kqueue_threadreq_cancel(struct proc *p, workq_threadreq_t req)
6162	{
6163	struct kqrequest kqr = __container_of(req, struct* kqrequest, kqr_req);
6164	kqueue_t kqu = kqr_kqueue(p, kqr);
6165
6166	kq_req_lock(kqu);
6167
6168	assert(kqr->kqr_thread == THREAD_NULL);
6169	assert(kqr->kqr_state & KQR_THREQUESTED);
6170	kqr->kqr_state &= ~(KQR_THREQUESTED \| KQR_R2K_NOTIF_ARMED);
6171
6172	kq_req_unlock(kqu);
6173
6174	kqueue_release_last(p, kqu); / may dealloc kqu /
6175	}
6176
6177	workq_threadreq_param_t
6178	kqueue_threadreq_workloop_param(workq_threadreq_t req)
6179	{
6180	struct kqrequest kqr = __container_of(req, struct* kqrequest, kqr_req);
6181	struct kqworkloop *kqwl;
6182	workq_threadreq_param_t trp;
6183
6184	assert(kqr->kqr_state & KQR_WORKLOOP);
6185	kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
6186	trp.trp_value = kqwl->kqwl_params;
6187	return trp;
6188	}
6189
6190	/*
6191	* kqueue_threadreq_unbind - unbind thread from processing kqueue
6192	*
6193	* End processing the per-QoS bucket of events and allow other threads
6194	* to be requested for future servicing.
6195	*
6196	* caller holds a reference on the kqueue.
6197	*/
6198	void
6199	kqueue_threadreq_unbind(struct proc p, struct* kqrequest *kqr)
6200	{
6201	if (kqr->kqr_state & KQR_WORKLOOP) {
6202	kqworkloop_unbind(p, kqr_kqworkloop(kqr));
6203	} else {
6204	kqworkq_unbind(p, kqr);
6205	}
6206	}
6207
6208	/*
6209	* If we aren't already busy processing events [for this QoS],
6210	* request workq thread support as appropriate.
6211	*
6212	* TBD - for now, we don't segregate out processing by QoS.
6213	*
6214	* - May be called with the kqueue's wait queue set locked,
6215	* so cannot do anything that could recurse on that.
6216	*/
6217	static void
6218	kqworkq_request_help(struct kqworkq *kqwq, kq_index_t qos_index)
6219	{
6220	struct kqrequest *kqr;
6221
6222	/ convert to thread qos value /
6223	assert(qos_index < KQWQ_NBUCKETS);
6224
6225	kq_req_lock(kqwq);
6226	kqr = kqworkq_get_request(kqwq, qos_index);
6227
6228	if ((kqr->kqr_state & KQR_WAKEUP) == `0`) {
6229	kqr->kqr_state \|= KQR_WAKEUP;
6230	if ((kqr->kqr_state & KQR_THREQUESTED) == `0`) {
6231	kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr, qos_index, `0`);
6232	}
6233	}
6234	kq_req_unlock(kqwq);
6235	}
6236
6237	static kq_index_t
6238	kqworkloop_owner_override(struct kqworkloop *kqwl)
6239	{
6240	struct kqrequest *kqr = &kqwl->kqwl_request;
6241	return MAX(kqr->kqr_qos_index, kqr->kqr_override_index);
6242	}
6243
6244	static inline void
6245	kqworkloop_request_fire_r2k_notification(struct kqworkloop *kqwl)
6246	{
6247	struct kqrequest *kqr = &kqwl->kqwl_request;
6248
6249	kq_req_held(kqwl);
6250
6251	if (kqr->kqr_state & KQR_R2K_NOTIF_ARMED) {
6252	assert(kqr->kqr_thread);
6253	kqr->kqr_state &= ~KQR_R2K_NOTIF_ARMED;
6254	act_set_astkevent(kqr->kqr_thread, AST_KEVENT_RETURN_TO_KERNEL);
6255	}
6256	}
6257
6258	static void
6259	kqworkloop_update_threads_qos(struct kqworkloop kqwl, int* op, kq_index_t qos)
6260	{
6261	struct kqrequest *kqr = &kqwl->kqwl_request;
6262	struct kqueue *kq = &kqwl->kqwl_kqueue;
6263	kq_index_t old_owner_override = kqworkloop_owner_override(kqwl);
6264	kq_index_t i;
6265
6266	/ must hold the kqr lock /
6267	kq_req_held(kqwl);
6268
6269	switch (op) {
6270	case KQWL_UTQ_UPDATE_WAKEUP_QOS:
6271	if (qos == KQWL_BUCKET_STAYACTIVE) {
6272	/*
6273	* the KQWL_BUCKET_STAYACTIVE is not a QoS bucket, we only remember
6274	* a high watermark (kqr_stayactive_qos) of any stay active knote
6275	* that was ever registered with this workloop.
6276	*
6277	* When waitq_set__CALLING_PREPOST_HOOK__() wakes up any stay active
6278	* knote, we use this high-watermark as a wakeup-index, and also set
6279	* the magic KQWL_BUCKET_STAYACTIVE bit to make sure we remember
6280	* there is at least one stay active knote fired until the next full
6281	* processing of this bucket.
6282	*/
6283	kqr->kqr_wakeup_indexes \|= KQWL_STAYACTIVE_FIRED_BIT;
6284	qos = kqr->kqr_stayactive_qos;
6285	assert(qos);
6286	}
6287	if (kqr->kqr_wakeup_indexes & (`1` << qos)) {
6288	assert(kqr->kqr_state & KQR_WAKEUP);
6289	break;
6290	}
6291
6292	kqr->kqr_wakeup_indexes \|= (`1` << qos);
6293	kqr->kqr_state \|= KQR_WAKEUP;
6294	kqworkloop_request_fire_r2k_notification(kqwl);
6295	goto recompute;
6296
6297	case KQWL_UTQ_UPDATE_STAYACTIVE_QOS:
6298	assert(qos);
6299	if (kqr->kqr_stayactive_qos < qos) {
6300	kqr->kqr_stayactive_qos = qos;
6301	if (kqr->kqr_wakeup_indexes & KQWL_STAYACTIVE_FIRED_BIT) {
6302	assert(kqr->kqr_state & KQR_WAKEUP);
6303	kqr->kqr_wakeup_indexes \|= (`1` << qos);
6304	goto recompute;
6305	}
6306	}
6307	break;
6308
6309	case KQWL_UTQ_PARKING:
6310	case KQWL_UTQ_UNBINDING:
6311	kqr->kqr_override_index = qos;
6312	/ FALLTHROUGH /
6313	case KQWL_UTQ_RECOMPUTE_WAKEUP_QOS:
6314	if (op == KQWL_UTQ_RECOMPUTE_WAKEUP_QOS) {
6315	assert(qos == THREAD_QOS_UNSPECIFIED);
6316	}
6317	kqlock_held(kqwl); // to look at kq_queues
6318	i = KQWL_BUCKET_STAYACTIVE;
6319	if (TAILQ_EMPTY(&kqr->kqr_suppressed)) {
6320	kqr->kqr_override_index = THREAD_QOS_UNSPECIFIED;
6321	}
6322	if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i]) &&
6323	(kqr->kqr_wakeup_indexes & KQWL_STAYACTIVE_FIRED_BIT)) {
6324	/*
6325	* If the KQWL_STAYACTIVE_FIRED_BIT is set, it means a stay active
6326	* knote may have fired, so we need to merge in kqr_stayactive_qos.
6327	*
6328	* Unlike other buckets, this one is never empty but could be idle.
6329	*/
6330	kqr->kqr_wakeup_indexes &= KQWL_STAYACTIVE_FIRED_BIT;
6331	kqr->kqr_wakeup_indexes \|= (`1` << kqr->kqr_stayactive_qos);
6332	} else {
6333	kqr->kqr_wakeup_indexes = `0`;
6334	}
6335	for (i = THREAD_QOS_UNSPECIFIED + `1`; i < KQWL_BUCKET_STAYACTIVE; i++) {
6336	if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i])) {
6337	kqr->kqr_wakeup_indexes \|= (`1` << i);
6338	}
6339	}
6340	if (kqr->kqr_wakeup_indexes) {
6341	kqr->kqr_state \|= KQR_WAKEUP;
6342	kqworkloop_request_fire_r2k_notification(kqwl);
6343	} else {
6344	kqr->kqr_state &= ~KQR_WAKEUP;
6345	}
6346	goto recompute;
6347
6348	case KQWL_UTQ_RESET_WAKEUP_OVERRIDE:
6349	kqr->kqr_override_index = qos;
6350	goto recompute;
6351
6352	case KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE:
6353	recompute:
6354	/*
6355	* When modifying the wakeup QoS or the override QoS, we always need to
6356	* maintain our invariant that kqr_override_index is at least as large
6357	* as the highest QoS for which an event is fired.
6358	*
6359	* However this override index can be larger when there is an overriden
6360	* suppressed knote pushing on the kqueue.
6361	*/
6362	if (kqr->kqr_wakeup_indexes > (`1` << qos)) {
6363	qos = fls(kqr->kqr_wakeup_indexes) - `1`; / fls is 1-based /
6364	}
6365	if (kqr->kqr_override_index < qos) {
6366	kqr->kqr_override_index = qos;
6367	}
6368	break;
6369
6370	case KQWL_UTQ_REDRIVE_EVENTS:
6371	break;
6372
6373	case KQWL_UTQ_SET_QOS_INDEX:
6374	kqr->kqr_qos_index = qos;
6375	break;
6376
6377	default:
6378	panic("unknown kqwl thread qos update operation: %d", op);
6379	}
6380
6381	thread_t kqwl_owner = kqwl->kqwl_owner;
6382	thread_t servicer = kqr->kqr_thread;
6383	boolean_t qos_changed = FALSE;
6384	kq_index_t new_owner_override = kqworkloop_owner_override(kqwl);
6385
6386	/*
6387	* Apply the diffs to the owner if applicable
6388	*/
6389	if (kqwl_owner) {
6390	#if 0
6391	/ JMM - need new trace hooks for owner overrides /
6392	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST),
6393	kqwl->kqwl_dynamicid, thread_tid(kqwl_owner), kqr->kqr_qos_index,
6394	(kqr->kqr_override_index << `16`) \| kqr->kqr_state);
6395	#endif
6396	if (new_owner_override == old_owner_override) {
6397	// nothing to do
6398	} else if (old_owner_override == THREAD_QOS_UNSPECIFIED) {
6399	thread_add_ipc_override(kqwl_owner, new_owner_override);
6400	} else if (new_owner_override == THREAD_QOS_UNSPECIFIED) {
6401	thread_drop_ipc_override(kqwl_owner);
6402	} else / old_owner_override != new_owner_override / {
6403	thread_update_ipc_override(kqwl_owner, new_owner_override);
6404	}
6405	}
6406
6407	/*
6408	* apply the diffs to the servicer
6409	*/
6410	if ((kqr->kqr_state & KQR_THREQUESTED) == `0`) {
6411	/*
6412	* No servicer, nor thread-request
6413	*
6414	* Make a new thread request, unless there is an owner (or the workloop
6415	* is suspended in userland) or if there is no asynchronous work in the
6416	* first place.
6417	*/
6418
6419	if (kqwl_owner == NULL && (kqr->kqr_state & KQR_WAKEUP)) {
6420	int initiate_flags = `0`;
6421	if (op == KQWL_UTQ_UNBINDING) {
6422	initiate_flags = WORKQ_THREADREQ_ATTEMPT_REBIND;
6423	}
6424	kqueue_threadreq_initiate(kq, kqr, new_owner_override,
6425	initiate_flags);
6426	}
6427	} else if (servicer) {
6428	/*
6429	* Servicer in flight
6430	*
6431	* Just apply the diff to the servicer
6432	*/
6433	struct uthread *ut = get_bsdthread_info(servicer);
6434	if (ut->uu_kqueue_override != kqr->kqr_override_index) {
6435	if (ut->uu_kqueue_override == THREAD_QOS_UNSPECIFIED) {
6436	thread_add_ipc_override(servicer, kqr->kqr_override_index);
6437	} else if (kqr->kqr_override_index == THREAD_QOS_UNSPECIFIED) {
6438	thread_drop_ipc_override(servicer);
6439	} else / ut->uu_kqueue_override != kqr->kqr_override_index / {
6440	thread_update_ipc_override(servicer, kqr->kqr_override_index);
6441	}
6442	ut->uu_kqueue_override = kqr->kqr_override_index;
6443	qos_changed = TRUE;
6444	}
6445	} else if (new_owner_override == THREAD_QOS_UNSPECIFIED) {
6446	/*
6447	* No events to deliver anymore.
6448	*
6449	* However canceling with turnstiles is challenging, so the fact that
6450	* the request isn't useful will be discovered by the servicer himself
6451	* later on.
6452	*/
6453	} else if (old_owner_override != new_owner_override) {
6454	/*
6455	* Request is in flight
6456	*
6457	* Apply the diff to the thread request
6458	*/
6459	kqueue_threadreq_modify(kq, kqr, new_owner_override);
6460	qos_changed = TRUE;
6461	}
6462
6463	if (qos_changed) {
6464	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST), kqwl->kqwl_dynamicid,
6465	thread_tid(kqr->kqr_thread), kqr->kqr_qos_index,
6466	(kqr->kqr_override_index << `16`) \| kqr->kqr_state);
6467	}
6468	}
6469
6470	static void
6471	kqworkloop_request_help(struct kqworkloop *kqwl, kq_index_t qos_index)
6472	{
6473	/ convert to thread qos value /
6474	assert(qos_index < KQWL_NBUCKETS);
6475
6476	kq_req_lock(kqwl);
6477	kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS, qos_index);
6478	kq_req_unlock(kqwl);
6479	}
6480
6481	static struct kqtailq *
6482	kqueue_get_queue(struct kqueue *kq, kq_index_t qos_index)
6483	{
6484	if (kq->kq_state & KQ_WORKQ) {
6485	assert(qos_index < KQWQ_NBUCKETS);
6486	} else if (kq->kq_state & KQ_WORKLOOP) {
6487	assert(qos_index < KQWL_NBUCKETS);
6488	} else {
6489	assert(qos_index == QOS_INDEX_KQFILE);
6490	}
6491	static_assert(offsetof(struct kqueue, kq_queue) == sizeof(struct kqueue),
6492	"struct kqueue::kq_queue must be exactly at the end");
6493	return &kq->kq_queue[qos_index];
6494	}
6495
6496	static int
6497	kqueue_queue_empty(struct kqueue *kq, kq_index_t qos_index)
6498	{
6499	return TAILQ_EMPTY(kqueue_get_queue(kq, qos_index));
6500	}
6501
6502	static struct kqtailq *
6503	kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn)
6504	{
6505	if (kq.kq->kq_state & KQ_WORKQ) {
6506	return &kqworkq_get_request(kq.kqwq, kn->kn_qos_index)->kqr_suppressed;
6507	} else if (kq.kq->kq_state & KQ_WORKLOOP) {
6508	return &kq.kqwl->kqwl_request.kqr_suppressed;
6509	} else {
6510	return &kq.kqf->kqf_suppressed;
6511	}
6512	}
6513
6514	static struct turnstile *
6515	kqueue_get_turnstile(kqueue_t kqu, bool can_alloc)
6516	{
6517	uint8_t kqr_state;
6518
6519	if ((kqu.kq->kq_state & KQ_WORKLOOP) == `0`) {
6520	return TURNSTILE_NULL;
6521	}
6522
6523	kqr_state = os_atomic_load(&kqu.kqwl->kqwl_request.kqr_state, relaxed);
6524	if (kqr_state & KQR_ALLOCATED_TURNSTILE) {
6525	/ force a dependency to pair with the atomic or with release below /
6526	return os_atomic_load_with_dependency_on(&kqu.kqwl->kqwl_turnstile,
6527	kqr_state);
6528	}
6529
6530	if (!can_alloc) {
6531	return TURNSTILE_NULL;
6532	}
6533
6534	struct turnstile ts = turnstile_alloc(), free_ts = TURNSTILE_NULL;
6535
6536	kq_req_lock(kqu);
6537	if (filt_wlturnstile_interlock_is_workq(kqu.kqwl)) {
6538	workq_kern_threadreq_lock(kqu.kqwl->kqwl_p);
6539	}
6540
6541	if (kqu.kqwl->kqwl_request.kqr_state & KQR_ALLOCATED_TURNSTILE) {
6542	free_ts = ts;
6543	ts = kqu.kqwl->kqwl_turnstile;
6544	} else {
6545	ts = turnstile_prepare((uintptr_t)kqu.kqwl, &kqu.kqwl->kqwl_turnstile,
6546	ts, TURNSTILE_WORKLOOPS);
6547
6548	/ release-barrier to pair with the unlocked load of kqwl_turnstile above /
6549	os_atomic_or(&kqu.kqwl->kqwl_request.kqr_state,
6550	KQR_ALLOCATED_TURNSTILE, release);
6551	}
6552
6553	if (filt_wlturnstile_interlock_is_workq(kqu.kqwl)) {
6554	workq_kern_threadreq_unlock(kqu.kqwl->kqwl_p);
6555	}
6556	kq_req_unlock(kqu.kqwl);
6557
6558	if (free_ts) {
6559	turnstile_deallocate(free_ts);
6560	}
6561	return ts;
6562	}
6563
6564	struct turnstile *
6565	kqueue_turnstile(struct kqueue *kq)
6566	{
6567	return kqueue_get_turnstile(kq, false);
6568	}
6569
6570	struct turnstile *
6571	kqueue_alloc_turnstile(struct kqueue *kq)
6572	{
6573	return kqueue_get_turnstile(kq, true);
6574	}
6575
6576	static struct kqtailq *
6577	knote_get_queue(struct knote *kn)
6578	{
6579	return kqueue_get_queue(knote_get_kq(kn), kn->kn_qos_index);
6580	}
6581
6582	static void
6583	knote_reset_priority(struct knote *kn, pthread_priority_t pp)
6584	{
6585	struct kqueue *kq = knote_get_kq(kn);
6586	kq_index_t qos = _pthread_priority_thread_qos(pp);
6587
6588	assert((kn->kn_status & KN_QUEUED) == `0`);
6589
6590	if (kq->kq_state & KQ_WORKQ) {
6591	if (qos == THREAD_QOS_UNSPECIFIED) {
6592	/ On workqueues, outside of QoS means MANAGER /
6593	qos = KQWQ_QOS_MANAGER;
6594	pp = _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
6595	} else {
6596	pp = _pthread_priority_normalize(pp);
6597	}
6598	} else if (kq->kq_state & KQ_WORKLOOP) {
6599	assert((pp & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) == `0`);
6600	pp = _pthread_priority_normalize(pp);
6601	} else {
6602	pp = _pthread_unspecified_priority();
6603	qos = THREAD_QOS_UNSPECIFIED;
6604	}
6605
6606	kn->kn_qos = pp;
6607	kn->kn_req_index = qos;
6608
6609	if ((kn->kn_status & KN_MERGE_QOS) == `0` \|\| qos > kn->kn_qos_override) {
6610	/ Never lower QoS when in "Merge" mode /
6611	kn->kn_qos_override = qos;
6612	}
6613
6614	/ only adjust in-use qos index when not suppressed /
6615	if ((kn->kn_status & KN_SUPPRESSED) == `0`) {
6616	kn->kn_qos_index = qos;
6617	} else if (kq->kq_state & KQ_WORKQ) {
6618	kqworkq_update_override((struct kqworkq *)kq, kn, qos);
6619	} else if (kq->kq_state & KQ_WORKLOOP) {
6620	kqworkloop_update_override((struct kqworkloop *)kq, qos);
6621	}
6622	}
6623
6624	static void
6625	knote_set_qos_overcommit(struct knote *kn)
6626	{
6627	struct kqueue *kq = knote_get_kq(kn);
6628
6629	/ turn overcommit on for the appropriate thread request? /
6630	if ((kn->kn_qos & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) &&
6631	(kq->kq_state & KQ_WORKLOOP)) {
6632	struct kqworkloop kqwl = (struct* kqworkloop *)kq;
6633	struct kqrequest *kqr = &kqwl->kqwl_request;
6634
6635	/*
6636	* This test is racy, but since we never remove this bit,
6637	* it allows us to avoid taking a lock.
6638	*/
6639	if (kqr->kqr_state & KQR_THOVERCOMMIT) {
6640	return;
6641	}
6642
6643	kq_req_lock(kqwl);
6644	kqr->kqr_state \|= KQR_THOVERCOMMIT;
6645	if (!kqr->kqr_thread && (kqr->kqr_state & KQR_THREQUESTED)) {
6646	kqueue_threadreq_modify(kq, kqr, kqr->kqr_req.tr_qos);
6647	}
6648	kq_req_unlock(kqwl);
6649	}
6650	}
6651
6652	static kq_index_t
6653	knote_get_qos_override_index(struct knote *kn)
6654	{
6655	return kn->kn_qos_override;
6656	}
6657
6658	static void
6659	kqworkq_update_override(struct kqworkq kqwq, struct* knote *kn,
6660	kq_index_t override_index)
6661	{
6662	struct kqrequest *kqr;
6663	kq_index_t old_override_index;
6664	kq_index_t queue_index = kn->kn_qos_index;
6665
6666	if (override_index <= queue_index) {
6667	return;
6668	}
6669
6670	kqr = kqworkq_get_request(kqwq, queue_index);
6671
6672	kq_req_lock(kqwq);
6673	old_override_index = kqr->kqr_override_index;
6674	if (override_index > MAX(kqr->kqr_qos_index, old_override_index)) {
6675	kqr->kqr_override_index = override_index;
6676
6677	/ apply the override to [incoming?] servicing thread /
6678	if (kqr->kqr_thread) {
6679	if (old_override_index)
6680	thread_update_ipc_override(kqr->kqr_thread, override_index);
6681	else
6682	thread_add_ipc_override(kqr->kqr_thread, override_index);
6683	}
6684	}
6685	kq_req_unlock(kqwq);
6686	}
6687
6688	static void
6689	kqworkloop_update_override(struct kqworkloop *kqwl, kq_index_t override_index)
6690	{
6691	kq_req_lock(kqwl);
6692	kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
6693	override_index);
6694	kq_req_unlock(kqwl);
6695	}
6696
6697	static thread_qos_t
6698	kqworkloop_unbind_locked(struct kqworkloop *kqwl, thread_t thread)
6699	{
6700	struct uthread *ut = get_bsdthread_info(thread);
6701	struct kqrequest *kqr = &kqwl->kqwl_request;
6702	kq_index_t ipc_override = ut->uu_kqueue_override;
6703
6704	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_UNBIND), kqwl->kqwl_dynamicid,
6705	thread_tid(thread), `0`, `0`);
6706
6707	kq_req_held(kqwl);
6708	assert(ut->uu_kqr_bound == kqr);
6709	ut->uu_kqr_bound = NULL;
6710	ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED;
6711
6712	if (kqwl->kqwl_owner == NULL && kqwl->kqwl_turnstile) {
6713	turnstile_update_inheritor(kqwl->kqwl_turnstile,
6714	TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE);
6715	turnstile_update_inheritor_complete(kqwl->kqwl_turnstile,
6716	TURNSTILE_INTERLOCK_HELD);
6717	}
6718
6719	kqr->kqr_thread = NULL;
6720	kqr->kqr_state &= ~(KQR_THREQUESTED \| KQR_R2K_NOTIF_ARMED);
6721	return ipc_override;
6722	}
6723
6724	/*
6725	* kqworkloop_unbind - Unbind the servicer thread of a workloop kqueue
6726	*
6727	* It will acknowledge events, and possibly request a new thread if:
6728	* - there were active events left
6729	* - we pended waitq hook callouts during processing
6730	* - we pended wakeups while processing (or unsuppressing)
6731	*
6732	* Called with kqueue lock held.
6733	*/
6734	static void
6735	kqworkloop_unbind(proc_t p, struct kqworkloop *kqwl)
6736	{
6737	struct kqueue *kq = &kqwl->kqwl_kqueue;
6738	struct kqrequest *kqr = &kqwl->kqwl_request;
6739	thread_t thread = kqr->kqr_thread;
6740	int op = KQWL_UTQ_PARKING;
6741	kq_index_t ipc_override, qos_override = THREAD_QOS_UNSPECIFIED;
6742
6743	assert(thread == current_thread());
6744
6745	kqlock(kqwl);
6746
6747	/*
6748	* Forcing the KQ_PROCESSING flag allows for QoS updates because of
6749	* unsuppressing knotes not to be applied until the eventual call to
6750	* kqworkloop_update_threads_qos() below.
6751	*/
6752	assert((kq->kq_state & KQ_PROCESSING) == `0`);
6753	if (!TAILQ_EMPTY(&kqr->kqr_suppressed)) {
6754	kq->kq_state \|= KQ_PROCESSING;
6755	qos_override = kqworkloop_acknowledge_events(kqwl);
6756	kq->kq_state &= ~KQ_PROCESSING;
6757	}
6758
6759	kq_req_lock(kqwl);
6760
6761	ipc_override = kqworkloop_unbind_locked(kqwl, thread);
6762	kqworkloop_update_threads_qos(kqwl, op, qos_override);
6763
6764	kq_req_unlock(kqwl);
6765
6766	kqunlock(kqwl);
6767
6768	/*
6769	* Drop the override on the current thread last, after the call to
6770	* kqworkloop_update_threads_qos above.
6771	*/
6772	if (ipc_override) {
6773	thread_drop_ipc_override(thread);
6774	}
6775
6776	/ If last reference, dealloc the workloop kq /
6777	kqueue_release_last(p, kqwl);
6778	}
6779
6780	static thread_qos_t
6781	kqworkq_unbind_locked(__assert_only struct kqworkq *kqwq,
6782	struct kqrequest *kqr, thread_t thread)
6783	{
6784	struct uthread *ut = get_bsdthread_info(thread);
6785	kq_index_t old_override = kqr->kqr_override_index;
6786
6787	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_UNBIND), -`1`,
6788	thread_tid(kqr->kqr_thread), kqr->kqr_qos_index, `0`);
6789
6790	kq_req_held(kqwq);
6791	assert(ut->uu_kqr_bound == kqr);
6792	ut->uu_kqr_bound = NULL;
6793	kqr->kqr_thread = NULL;
6794	kqr->kqr_state &= ~(KQR_THREQUESTED \| KQR_R2K_NOTIF_ARMED);
6795	kqr->kqr_override_index = THREAD_QOS_UNSPECIFIED;
6796
6797	return old_override;
6798	}
6799
6800	/*
6801	* kqworkq_unbind - unbind of a workq kqueue from a thread
6802	*
6803	* We may have to request new threads.
6804	* This can happen there are no waiting processing threads and:
6805	* - there were active events we never got to (count > 0)
6806	* - we pended waitq hook callouts during processing
6807	* - we pended wakeups while processing (or unsuppressing)
6808	*/
6809	static void
6810	kqworkq_unbind(proc_t p, struct kqrequest *kqr)
6811	{
6812	struct kqworkq kqwq = (struct* kqworkq *)p->p_fd->fd_wqkqueue;
6813	__assert_only int rc;
6814
6815	kqlock(kqwq);
6816	rc = kqworkq_acknowledge_events(kqwq, kqr, `0`, KQWQAE_UNBIND);
6817	assert(rc == -`1`);
6818	kqunlock(kqwq);
6819	}
6820
6821	struct kqrequest *
6822	kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index)
6823	{
6824	assert(qos_index < KQWQ_NBUCKETS);
6825	return &kqwq->kqwq_request[qos_index];
6826	}
6827
6828	static void
6829	knote_apply_qos_override(struct knote *kn, kq_index_t qos_index)
6830	{
6831	assert((kn->kn_status & KN_QUEUED) == `0`);
6832
6833	kn->kn_qos_override = qos_index;
6834
6835	if (kn->kn_status & KN_SUPPRESSED) {
6836	struct kqueue *kq = knote_get_kq(kn);
6837	/*
6838	* For suppressed events, the kn_qos_index field cannot be touched as it
6839	* allows us to know on which supress queue the knote is for a kqworkq.
6840	*
6841	* Also, there's no natural push applied on the kqueues when this field
6842	* changes anyway. We hence need to apply manual overrides in this case,
6843	* which will be cleared when the events are later acknowledged.
6844	*/
6845	if (kq->kq_state & KQ_WORKQ) {
6846	kqworkq_update_override((struct kqworkq *)kq, kn, qos_index);
6847	} else {
6848	kqworkloop_update_override((struct kqworkloop *)kq, qos_index);
6849	}
6850	} else {
6851	kn->kn_qos_index = qos_index;
6852	}
6853	}
6854
6855	static bool
6856	knote_should_apply_qos_override(struct kqueue kq, struct* knote kn, int* result,
6857	thread_qos_t *qos_out)
6858	{
6859	thread_qos_t qos_index = (result >> FILTER_ADJUST_EVENT_QOS_SHIFT) & `7`;
6860
6861	kqlock_held(kq);
6862
6863	assert(result & FILTER_ADJUST_EVENT_QOS_BIT);
6864	assert(qos_index < THREAD_QOS_LAST);
6865
6866	/*
6867	* Early exit for knotes that should not change QoS
6868	*
6869	* It is safe to test kn_req_index against MANAGER / STAYACTIVE because
6870	* knotes with such kn_req_index values never change for their entire
6871	* lifetime.
6872	*/
6873	if (__improbable(!knote_fops(kn)->f_adjusts_qos)) {
6874	panic("filter %d cannot change QoS", kn->kn_filtid);
6875	} else if (kq->kq_state & KQ_WORKLOOP) {
6876	if (kn->kn_req_index == KQWL_BUCKET_STAYACTIVE) {
6877	return false;
6878	}
6879	} else if (kq->kq_state & KQ_WORKQ) {
6880	if (kn->kn_req_index == KQWQ_QOS_MANAGER) {
6881	return false;
6882	}
6883	} else {
6884	return false;
6885	}
6886
6887	/*
6888	* knotes with the FALLBACK flag will only use their registration QoS if the
6889	* incoming event has no QoS, else, the registration QoS acts as a floor.
6890	*/
6891	if (kn->kn_qos & _PTHREAD_PRIORITY_FALLBACK_FLAG) {
6892	if (qos_index == THREAD_QOS_UNSPECIFIED)
6893	qos_index = kn->kn_req_index;
6894	} else {
6895	if (qos_index < kn->kn_req_index)
6896	qos_index = kn->kn_req_index;
6897	}
6898	if ((kn->kn_status & KN_MERGE_QOS) && (qos_index < kn->kn_qos_override)) {
6899	/ Never lower QoS when in "Merge" mode /
6900	return false;
6901	}
6902
6903	if ((kn->kn_status & KN_LOCKED) && kn->kn_inuse) {
6904	/*
6905	* When we're trying to update the QoS override and that both an
6906	* f_event() and other f_* calls are running concurrently, any of these
6907	* in flight calls may want to perform overrides that aren't properly
6908	* serialized with each other.
6909	*
6910	* The first update that observes this racy situation enters a "Merge"
6911	* mode which causes subsequent override requests to saturate the
6912	* override instead of replacing its value.
6913	*
6914	* This mode is left when knote_unlock() or knote_call_filter_event()
6915	* observe that no other f_* routine is in flight.
6916	*/
6917	kn->kn_status \|= KN_MERGE_QOS;
6918	}
6919
6920	if (kn->kn_qos_override == qos_index) {
6921	return false;
6922	}
6923
6924	*qos_out = qos_index;
6925	return true;
6926	}
6927
6928	static void
6929	knote_adjust_qos(struct kqueue kq, struct* knote kn, int* result)
6930	{
6931	thread_qos_t qos;
6932	if (knote_should_apply_qos_override(kq, kn, result, &qos)) {
6933	knote_dequeue(kn);
6934	knote_apply_qos_override(kn, qos);
6935	if (knote_enqueue(kn) && (kn->kn_status & KN_ACTIVE)) {
6936	knote_wakeup(kn);
6937	}
6938	}
6939	}
6940
6941	static void
6942	knote_wakeup(struct knote *kn)
6943	{
6944	struct kqueue *kq = knote_get_kq(kn);
6945
6946	kqlock_held(kq);
6947
6948	if (kq->kq_state & KQ_WORKQ) {
6949	struct kqworkq kqwq = (struct* kqworkq *)kq;
6950
6951	kqworkq_request_help(kqwq, kn->kn_qos_index);
6952	} else if (kq->kq_state & KQ_WORKLOOP) {
6953	struct kqworkloop kqwl = (struct* kqworkloop *)kq;
6954
6955	/*
6956	* kqworkloop_end_processing() will perform the required QoS
6957	* computations when it unsets the processing mode.
6958	*/
6959	if (!kqworkloop_is_processing_on_current_thread(kqwl)) {
6960	kqworkloop_request_help(kqwl, kn->kn_qos_index);
6961	}
6962	} else {
6963	struct kqfile kqf = (struct* kqfile *)kq;
6964
6965	/ flag wakeups during processing /
6966	if (kq->kq_state & KQ_PROCESSING)
6967	kq->kq_state \|= KQ_WAKEUP;
6968
6969	/ wakeup a thread waiting on this queue /
6970	if (kq->kq_state & (KQ_SLEEP \| KQ_SEL)) {
6971	kq->kq_state &= ~(KQ_SLEEP \| KQ_SEL);
6972	waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, KQ_EVENT,
6973	THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
6974	}
6975
6976	/ wakeup other kqueues/select sets we're inside /
6977	KNOTE(&kqf->kqf_sel.si_note, `0`);
6978	}
6979	}
6980
6981	/*
6982	* Called with the kqueue locked
6983	*/
6984	static void
6985	kqueue_interrupt(struct kqueue *kq)
6986	{
6987	assert((kq->kq_state & KQ_WORKQ) == `0`);
6988
6989	/ wakeup sleeping threads /
6990	if ((kq->kq_state & (KQ_SLEEP \| KQ_SEL)) != `0`) {
6991	kq->kq_state &= ~(KQ_SLEEP \| KQ_SEL);
6992	(void)waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
6993	KQ_EVENT,
6994	THREAD_RESTART,
6995	WAITQ_ALL_PRIORITIES);
6996	}
6997
6998	/ wakeup threads waiting their turn to process /
6999	if (kq->kq_state & KQ_PROCWAIT) {
7000	struct kqtailq *suppressq;
7001
7002	assert(kq->kq_state & KQ_PROCESSING);
7003
7004	kq->kq_state &= ~KQ_PROCWAIT;
7005	suppressq = kqueue_get_suppressed_queue(kq, NULL);
7006	(void)waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
7007	CAST_EVENT64_T(suppressq),
7008	THREAD_RESTART,
7009	WAITQ_ALL_PRIORITIES);
7010	}
7011	}
7012
7013	/*
7014	* Called back from waitq code when no threads waiting and the hook was set.
7015	*
7016	* Interrupts are likely disabled and spin locks are held - minimal work
7017	* can be done in this context!!!
7018	*
7019	* JMM - in the future, this will try to determine which knotes match the
7020	* wait queue wakeup and apply these wakeups against those knotes themselves.
7021	* For now, all the events dispatched this way are dispatch-manager handled,
7022	* so hard-code that for now.
7023	*/
7024	void
7025	waitq_set__CALLING_PREPOST_HOOK__(void kq_hook, void* knote_hook, int* qos)
7026	{
7027	#pragma unused(knote_hook, qos)
7028
7029	struct kqueue kq = (struct* kqueue *)kq_hook;
7030
7031	if (kq->kq_state & KQ_WORKQ) {
7032	struct kqworkq kqwq = (struct* kqworkq *)kq;
7033
7034	kqworkq_request_help(kqwq, KQWQ_QOS_MANAGER);
7035	} else if (kq->kq_state & KQ_WORKLOOP) {
7036	struct kqworkloop kqwl = (struct* kqworkloop *)kq;
7037
7038	kqworkloop_request_help(kqwl, KQWL_BUCKET_STAYACTIVE);
7039	}
7040	}
7041
/*
 * klist_init - set up a knote list as an empty singly-linked list
 */
void
klist_init(struct klist *list)
{
	struct klist *head = list;

	SLIST_INIT(head);
}
7047
7048
7049	/*
7050	* Query/Post each knote in the object's list
7051	*
7052	* The object lock protects the list. It is assumed
7053	* that the filter/event routine for the object can
7054	* determine that the object is already locked (via
7055	* the hint) and not deadlock itself.
7056	*
7057	* The object lock should also hold off pending
7058	* detach/drop operations.
7059	*/
7060	void
7061	knote(struct klist list, long* hint)
7062	{
7063	struct knote *kn;
7064
7065	SLIST_FOREACH(kn, list, kn_selnext) {
7066	struct kqueue *kq = knote_get_kq(kn);
7067	kqlock(kq);
7068	knote_call_filter_event(kq, kn, hint);
7069	kqunlock(kq);
7070	}
7071	}
7072
7073	/*
7074	* attach a knote to the specified list. Return true if this is the first entry.
7075	* The list is protected by whatever lock the object it is associated with uses.
7076	*/
7077	int
7078	knote_attach(struct klist list, struct* knote *kn)
7079	{
7080	int ret = SLIST_EMPTY(list);
7081	SLIST_INSERT_HEAD(list, kn, kn_selnext);
7082	return (ret);
7083	}
7084
7085	/*
7086	* detach a knote from the specified list. Return true if that was the last entry.
7087	* The list is protected by whatever lock the object it is associated with uses.
7088	*/
7089	int
7090	knote_detach(struct klist list, struct* knote *kn)
7091	{
7092	SLIST_REMOVE(list, kn, knote, kn_selnext);
7093	return (SLIST_EMPTY(list));
7094	}
7095
7096	/*
7097	* knote_vanish - Indicate that the source has vanished
7098	*
7099	* If the knote has requested EV_VANISHED delivery,
7100	* arrange for that. Otherwise, deliver a NOTE_REVOKE
7101	* event for backward compatibility.
7102	*
7103	* The knote is marked as having vanished, but is not
7104	* actually detached from the source in this instance.
7105	* The actual detach is deferred until the knote drop.
7106	*
7107	* Our caller already has the object lock held. Calling
7108	* the detach routine would try to take that lock
7109	* recursively - which likely is not supported.
7110	*/
7111	void
7112	knote_vanish(struct klist *list)
7113	{
7114	struct knote *kn;
7115	struct knote *kn_next;
7116
7117	SLIST_FOREACH_SAFE(kn, list, kn_selnext, kn_next) {
7118	struct kqueue *kq = knote_get_kq(kn);
7119
7120	kqlock(kq);
7121	if (kn->kn_status & KN_REQVANISH) {
7122	/ If EV_VANISH supported - prepare to deliver one /
7123	kn->kn_status \|= KN_VANISHED;
7124	knote_activate(kn);
7125	} else {
7126	knote_call_filter_event(kq, kn, NOTE_REVOKE);
7127	}
7128	kqunlock(kq);
7129	}
7130	}
7131
7132	/*
7133	* Force a lazy allocation of the waitqset link
7134	* of the kq_wqs associated with the kn
7135	* if it wasn't already allocated.
7136	*
7137	* This allows knote_link_waitq to never block
7138	* if reserved_link is not NULL.
7139	*/
7140	void
7141	knote_link_waitqset_lazy_alloc(struct knote *kn)
7142	{
7143	struct kqueue *kq = knote_get_kq(kn);
7144	waitq_set_lazy_init_link(&kq->kq_wqs);
7145	}
7146
7147	/*
7148	* Check if a lazy allocation for the waitqset link
7149	* of the kq_wqs is needed.
7150	*/
7151	boolean_t
7152	knote_link_waitqset_should_lazy_alloc(struct knote *kn)
7153	{
7154	struct kqueue *kq = knote_get_kq(kn);
7155	return waitq_set_should_lazy_init_link(&kq->kq_wqs);
7156	}
7157
7158	/*
7159	* For a given knote, link a provided wait queue directly with the kqueue.
7160	* Wakeups will happen via recursive wait queue support. But nothing will move
7161	* the knote to the active list at wakeup (nothing calls knote()). Instead,
7162	* we permanently enqueue them here.
7163	*
7164	* kqueue and knote references are held by caller.
7165	* waitq locked by caller.
7166	*
7167	* caller provides the wait queue link structure and insures that the kq->kq_wqs
7168	* is linked by previously calling knote_link_waitqset_lazy_alloc.
7169	*/
7170	int
7171	knote_link_waitq(struct knote kn, struct* waitq wq, uint64_t reserved_link)
7172	{
7173	struct kqueue *kq = knote_get_kq(kn);
7174	kern_return_t kr;
7175
7176	kr = waitq_link(wq, &kq->kq_wqs, WAITQ_ALREADY_LOCKED, reserved_link);
7177	if (kr == KERN_SUCCESS) {
7178	knote_markstayactive(kn);
7179	return (`0`);
7180	} else {
7181	return (EINVAL);
7182	}
7183	}
7184
7185	/*
7186	* Unlink the provided wait queue from the kqueue associated with a knote.
7187	* Also remove it from the magic list of directly attached knotes.
7188	*
7189	* Note that the unlink may have already happened from the other side, so
7190	* ignore any failures to unlink and just remove it from the kqueue list.
7191	*
7192	* On success, caller is responsible for the link structure
7193	*/
7194	int
7195	knote_unlink_waitq(struct knote kn, struct* waitq *wq)
7196	{
7197	struct kqueue *kq = knote_get_kq(kn);
7198	kern_return_t kr;
7199
7200	kr = waitq_unlink(wq, &kq->kq_wqs);
7201	knote_clearstayactive(kn);
7202	return ((kr != KERN_SUCCESS) ? EINVAL : `0`);
7203	}
7204
7205	/*
7206	* remove all knotes referencing a specified fd
7207	*
7208	* Entered with the proc_fd lock already held.
7209	* It returns the same way, but may drop it temporarily.
7210	*/
7211	void
7212	knote_fdclose(struct proc p, int* fd)
7213	{
7214	struct klist *list;
7215	struct knote *kn;
7216	KNOTE_LOCK_CTX(knlc);
7217
7218	restart:
7219	list = &p->p_fd->fd_knlist[fd];
7220	SLIST_FOREACH(kn, list, kn_link) {
7221	struct kqueue *kq = knote_get_kq(kn);
7222
7223	kqlock(kq);
7224
7225	if (kq->kq_p != p)
7226	panic("%s: proc mismatch (kq->kq_p=%p != p=%p)",
7227	__func__, kq->kq_p, p);
7228
7229	/*
7230	* If the knote supports EV_VANISHED delivery,
7231	* transition it to vanished mode (or skip over
7232	* it if already vanished).
7233	*/
7234	if (kn->kn_status & KN_VANISHED) {
7235	kqunlock(kq);
7236	continue;
7237	}
7238
7239	proc_fdunlock(p);
7240	if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
7241	/ the knote was dropped by someone, nothing to do /
7242	} else if (kn->kn_status & KN_REQVANISH) {
7243	kn->kn_status \|= KN_VANISHED;
7244	kn->kn_status &= ~KN_ATTACHED;
7245
7246	kqunlock(kq);
7247	knote_fops(kn)->f_detach(kn);
7248	if (knote_fops(kn)->f_isfd)
7249	fp_drop(p, kn->kn_id, kn->kn_fp, `0`);
7250	kqlock(kq);
7251
7252	knote_activate(kn);
7253	knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
7254	} else {
7255	knote_drop(kq, kn, &knlc);
7256	}
7257
7258	proc_fdlock(p);
7259	goto restart;
7260	}
7261	}
7262
7263	/*
7264	* knote_fdfind - lookup a knote in the fd table for process
7265	*
7266	* If the filter is file-based, lookup based on fd index.
7267	* Otherwise use a hash based on the ident.
7268	*
7269	* Matching is based on kq, filter, and ident. Optionally,
7270	* it may also be based on the udata field in the kevent -
7271	* allowing multiple event registration for the file object
7272	* per kqueue.
7273	*
7274	* fd_knhashlock or fdlock held on entry (and exit)
7275	*/
7276	static struct knote *
7277	knote_fdfind(struct kqueue *kq,
7278	struct kevent_internal_s *kev,
7279	bool is_fd,
7280	struct proc *p)
7281	{
7282	struct filedesc *fdp = p->p_fd;
7283	struct klist *list = NULL;
7284	struct knote *kn = NULL;
7285
7286	/*
7287	* determine where to look for the knote
7288	*/
7289	if (is_fd) {
7290	/ fd-based knotes are linked off the fd table /
7291	if (kev->ident < (u_int)fdp->fd_knlistsize) {
7292	list = &fdp->fd_knlist[kev->ident];
7293	}
7294	} else if (fdp->fd_knhashmask != `0`) {
7295	/ hash non-fd knotes here too /
7296	list = &fdp->fd_knhash[KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
7297	}
7298
7299	/*
7300	* scan the selected list looking for a match
7301	*/
7302	if (list != NULL) {
7303	SLIST_FOREACH(kn, list, kn_link) {
7304	if (kq == knote_get_kq(kn) &&
7305	kev->ident == kn->kn_id &&
7306	kev->filter == kn->kn_filter) {
7307	if (kev->flags & EV_UDATA_SPECIFIC) {
7308	if ((kn->kn_status & KN_UDATA_SPECIFIC) &&
7309	kev->udata == kn->kn_udata) {
7310	break; / matching udata-specific knote /
7311	}
7312	} else if ((kn->kn_status & KN_UDATA_SPECIFIC) == `0`) {
7313	break; / matching non-udata-specific knote /
7314	}
7315	}
7316	}
7317	}
7318	return kn;
7319	}
7320
7321	/*
7322	* kq_add_knote- Add knote to the fd table for process
7323	* while checking for duplicates.
7324	*
7325	* All file-based filters associate a list of knotes by file
7326	* descriptor index. All other filters hash the knote by ident.
7327	*
7328	* May have to grow the table of knote lists to cover the
7329	* file descriptor index presented.
7330	*
7331	* fd_knhashlock and fdlock unheld on entry (and exit).
7332	*
7333	* Takes a rwlock boost if inserting the knote is successful.
7334	*/
7335	static int
7336	kq_add_knote(struct kqueue kq, struct* knote kn, struct* knote_lock_ctx *knlc,
7337	struct proc *p)
7338	{
7339	struct filedesc *fdp = p->p_fd;
7340	struct klist *list = NULL;
7341	int ret = `0`;
7342	bool is_fd = knote_fops(kn)->f_isfd;
7343
7344	if (is_fd)
7345	proc_fdlock(p);
7346	else
7347	knhash_lock(p);
7348
7349	if (knote_fdfind(kq, &kn->kn_kevent, is_fd, p) != NULL) {
7350	/ found an existing knote: we can't add this one /
7351	ret = ERESTART;
7352	goto out_locked;
7353	}
7354
7355	/ knote was not found: add it now /
7356	if (!is_fd) {
7357	if (fdp->fd_knhashmask == `0`) {
7358	u_long size = `0`;
7359
7360	list = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE, &size);
7361	if (list == NULL) {
7362	ret = ENOMEM;
7363	goto out_locked;
7364	}
7365
7366	fdp->fd_knhash = list;
7367	fdp->fd_knhashmask = size;
7368	}
7369
7370	list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
7371	SLIST_INSERT_HEAD(list, kn, kn_link);
7372	ret = `0`;
7373	goto out_locked;
7374
7375	} else {
7376	/ knote is fd based /
7377
7378	if ((u_int)fdp->fd_knlistsize <= kn->kn_id) {
7379	u_int size = `0`;
7380
7381	if (kn->kn_id >= (uint64_t)p->p_rlimit[RLIMIT_NOFILE].rlim_cur
7382	\|\| kn->kn_id >= (uint64_t)maxfiles) {
7383	ret = EINVAL;
7384	goto out_locked;
7385	}
7386	/ have to grow the fd_knlist /
7387	size = fdp->fd_knlistsize;
7388	while (size <= kn->kn_id)
7389	size += KQEXTENT;
7390
7391	if (size >= (UINT_MAX/sizeof(struct klist *))) {
7392	ret = EINVAL;
7393	goto out_locked;
7394	}
7395
7396	MALLOC(list, struct klist *,
7397	size * sizeof(struct klist *), M_KQUEUE, M_WAITOK);
7398	if (list == NULL) {
7399	ret = ENOMEM;
7400	goto out_locked;
7401	}
7402
7403	bcopy((caddr_t)fdp->fd_knlist, (caddr_t)list,
7404	fdp->fd_knlistsize * sizeof(struct klist *));
7405	bzero((caddr_t)list +
7406	fdp->fd_knlistsize * sizeof(struct klist *),
7407	(size - fdp->fd_knlistsize) * sizeof(struct klist *));
7408	FREE(fdp->fd_knlist, M_KQUEUE);
7409	fdp->fd_knlist = list;
7410	fdp->fd_knlistsize = size;
7411	}
7412
7413	list = &fdp->fd_knlist[kn->kn_id];
7414	SLIST_INSERT_HEAD(list, kn, kn_link);
7415	ret = `0`;
7416	goto out_locked;
7417
7418	}
7419
7420	out_locked:
7421	if (ret == `0`) {
7422	kqlock(kq);
7423	assert((kn->kn_status & KN_LOCKED) == `0`);
7424	(void)knote_lock(kq, kn, knlc, KNOTE_KQ_UNLOCK);
7425	}
7426	if (is_fd)
7427	proc_fdunlock(p);
7428	else
7429	knhash_unlock(p);
7430
7431	return ret;
7432	}
7433
7434	/*
7435	* kq_remove_knote - remove a knote from the fd table for process
7436	*
7437	* If the filter is file-based, remove based on fd index.
7438	* Otherwise remove from the hash based on the ident.
7439	*
7440	* fd_knhashlock and fdlock unheld on entry (and exit).
7441	*/
7442	static void
7443	kq_remove_knote(struct kqueue kq, struct* knote kn, struct* proc *p,
7444	struct knote_lock_ctx *knlc)
7445	{
7446	struct filedesc *fdp = p->p_fd;
7447	struct klist *list = NULL;
7448	uint16_t kq_state;
7449	bool is_fd;
7450
7451	is_fd = knote_fops(kn)->f_isfd;
7452
7453	if (is_fd)
7454	proc_fdlock(p);
7455	else
7456	knhash_lock(p);
7457
7458	if (is_fd) {
7459	assert ((u_int)fdp->fd_knlistsize > kn->kn_id);
7460	list = &fdp->fd_knlist[kn->kn_id];
7461	} else {
7462	list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
7463	}
7464	SLIST_REMOVE(list, kn, knote, kn_link);
7465
7466	kqlock(kq);
7467	kq_state = kq->kq_state;
7468	if (knlc) {
7469	knote_unlock_cancel(kq, kn, knlc, KNOTE_KQ_UNLOCK);
7470	} else {
7471	kqunlock(kq);
7472	}
7473	if (is_fd)
7474	proc_fdunlock(p);
7475	else
7476	knhash_unlock(p);
7477
7478	if (kq_state & KQ_DYNAMIC)
7479	kqueue_release_last(p, kq);
7480	}
7481
7482	/*
7483	* kq_find_knote_and_kq_lock - lookup a knote in the fd table for process
7484	* and, if the knote is found, acquires the kqlock while holding the fd table lock/spinlock.
7485	*
7486	* fd_knhashlock or fdlock unheld on entry (and exit)
7487	*/
7488
7489	static struct knote *
7490	kq_find_knote_and_kq_lock(struct kqueue kq, struct* kevent_internal_s *kev,
7491	bool is_fd, struct proc *p)
7492	{
7493	struct knote * ret;
7494
7495	if (is_fd)
7496	proc_fdlock(p);
7497	else
7498	knhash_lock(p);
7499
7500	ret = knote_fdfind(kq, kev, is_fd, p);
7501
7502	if (ret) {
7503	kqlock(kq);
7504	}
7505
7506	if (is_fd)
7507	proc_fdunlock(p);
7508	else
7509	knhash_unlock(p);
7510
7511	return ret;
7512	}
7513	/*
7514	* knote_drop - disconnect and drop the knote
7515	*
7516	* Called with the kqueue locked, returns with the kqueue unlocked.
7517	*
7518	* If a knote locking context is passed, it is canceled.
7519	*
7520	* The knote may have already been detached from
7521	* (or not yet attached to) its source object.
7522	*/
7523	static void
7524	knote_drop(struct kqueue kq, struct* knote kn, struct* knote_lock_ctx *knlc)
7525	{
7526	struct proc *p = kq->kq_p;
7527
7528	kqlock_held(kq);
7529
7530	assert((kn->kn_status & KN_DROPPING) == `0`);
7531	if (knlc == NULL) {
7532	assert((kn->kn_status & KN_LOCKED) == `0`);
7533	}
7534	kn->kn_status \|= KN_DROPPING;
7535
7536	knote_unsuppress(kn);
7537	knote_dequeue(kn);
7538	knote_wait_for_filter_events(kq, kn);
7539
7540	/ If we are attached, disconnect from the source first /
7541	if (kn->kn_status & KN_ATTACHED) {
7542	knote_fops(kn)->f_detach(kn);
7543	}
7544
7545	/ kq may be freed when kq_remove_knote() returns /
7546	kq_remove_knote(kq, kn, p, knlc);
7547	if (knote_fops(kn)->f_isfd && ((kn->kn_status & KN_VANISHED) == `0`))
7548	fp_drop(p, kn->kn_id, kn->kn_fp, `0`);
7549
7550	knote_free(kn);
7551	}
7552
7553	/ called with kqueue lock held /
7554	static void
7555	knote_activate(struct knote *kn)
7556	{
7557	if (kn->kn_status & KN_ACTIVE)
7558	return;
7559
7560	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KNOTE_ACTIVATE),
7561	kn->kn_udata, kn->kn_status \| (kn->kn_id << `32`),
7562	kn->kn_filtid);
7563
7564	kn->kn_status \|= KN_ACTIVE;
7565	if (knote_enqueue(kn))
7566	knote_wakeup(kn);
7567	}
7568
7569	/ called with kqueue lock held /
7570	static void
7571	knote_deactivate(struct knote *kn)
7572	{
7573	kn->kn_status &= ~KN_ACTIVE;
7574	if ((kn->kn_status & KN_STAYACTIVE) == `0`)
7575	knote_dequeue(kn);
7576	}
7577
7578	/ called with kqueue lock held /
7579	static void
7580	knote_enable(struct knote *kn)
7581	{
7582	if ((kn->kn_status & KN_DISABLED) == `0`)
7583	return;
7584
7585	kn->kn_status &= ~KN_DISABLED;
7586
7587	if (kn->kn_status & KN_SUPPRESSED) {
7588	/*
7589	* it is possible for userland to have knotes registered for a given
7590	* workloop `wl_orig` but really handled on another workloop `wl_new`.
7591	*
7592	* In that case, rearming will happen from the servicer thread of
7593	* `wl_new` which if `wl_orig` is no longer being serviced, would cause
7594	* this knote to stay suppressed forever if we only relied on
7595	* kqworkloop_acknowledge_events to be called by `wl_orig`.
7596	*
7597	* However if we see the KQ_PROCESSING bit on `wl_orig` set, we can't
7598	* unsuppress because that would mess with the processing phase of
7599	* `wl_orig`, however it also means kqworkloop_acknowledge_events()
7600	* will be called.
7601	*/
7602	struct kqueue *kq = knote_get_kq(kn);
7603	if ((kq->kq_state & KQ_PROCESSING) == `0`) {
7604	knote_unsuppress(kn);
7605	}
7606	} else if (knote_enqueue(kn)) {
7607	knote_wakeup(kn);
7608	}
7609	}
7610
7611	/ called with kqueue lock held /
7612	static void
7613	knote_disable(struct knote *kn)
7614	{
7615	if (kn->kn_status & KN_DISABLED)
7616	return;
7617
7618	kn->kn_status \|= KN_DISABLED;
7619	knote_dequeue(kn);
7620	}
7621
7622	/ called with kqueue lock held /
7623	static void
7624	knote_suppress(struct knote *kn)
7625	{
7626	struct kqtailq *suppressq;
7627	struct kqueue *kq = knote_get_kq(kn);
7628
7629	kqlock_held(kq);
7630
7631	if (kn->kn_status & KN_SUPPRESSED)
7632	return;
7633
7634	knote_dequeue(kn);
7635	kn->kn_status \|= KN_SUPPRESSED;
7636	suppressq = kqueue_get_suppressed_queue(kq, kn);
7637	TAILQ_INSERT_TAIL(suppressq, kn, kn_tqe);
7638	}
7639
7640	/ called with kqueue lock held /
7641	static void
7642	knote_unsuppress(struct knote *kn)
7643	{
7644	struct kqtailq *suppressq;
7645	struct kqueue *kq = knote_get_kq(kn);
7646
7647	kqlock_held(kq);
7648
7649	if ((kn->kn_status & KN_SUPPRESSED) == `0`)
7650	return;
7651
7652	kn->kn_status &= ~KN_SUPPRESSED;
7653	suppressq = kqueue_get_suppressed_queue(kq, kn);
7654	TAILQ_REMOVE(suppressq, kn, kn_tqe);
7655
7656	/*
7657	* If the knote is no longer active, reset its push,
7658	* and resynchronize kn_qos_index with kn_qos_override
7659	*/
7660	if ((kn->kn_status & KN_ACTIVE) == `0`) {
7661	kn->kn_qos_override = kn->kn_req_index;
7662	}
7663	kn->kn_qos_index = kn->kn_qos_override;
7664
7665	/ don't wakeup if unsuppressing just a stay-active knote /
7666	if (knote_enqueue(kn) && (kn->kn_status & KN_ACTIVE)) {
7667	knote_wakeup(kn);
7668	}
7669
7670	if ((kq->kq_state & KQ_WORKLOOP) && TAILQ_EMPTY(suppressq)) {
7671	struct kqworkloop kqwl = (struct* kqworkloop *)kq;
7672
7673	if (kqworkloop_is_processing_on_current_thread(kqwl)) {
7674	/*
7675	* kqworkloop_end_processing() or kqworkloop_begin_processing()
7676	* will perform the required QoS computations when it unsets the
7677	* processing mode.
7678	*/
7679	} else {
7680	kq_req_lock(kqwl);
7681	kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RESET_WAKEUP_OVERRIDE, `0`);
7682	kq_req_unlock(kqwl);
7683	}
7684	}
7685	}
7686
7687	/ called with kqueue lock held /
7688	static int
7689	knote_enqueue(struct knote *kn)
7690	{
7691	if ((kn->kn_status & (KN_ACTIVE \| KN_STAYACTIVE)) == `0` \|\|
7692	(kn->kn_status & (KN_DISABLED \| KN_SUPPRESSED \| KN_DROPPING)))
7693	return `0`;
7694
7695	if ((kn->kn_status & KN_QUEUED) == `0`) {
7696	struct kqtailq *queue = knote_get_queue(kn);
7697	struct kqueue *kq = knote_get_kq(kn);
7698
7699	kqlock_held(kq);
7700	TAILQ_INSERT_TAIL(queue, kn, kn_tqe);
7701	kn->kn_status \|= KN_QUEUED;
7702	kq->kq_count++;
7703	return `1`;
7704	}
7705	return ((kn->kn_status & KN_STAYACTIVE) != `0`);
7706	}
7707
7708
7709	/ called with kqueue lock held /
7710	static void
7711	knote_dequeue(struct knote *kn)
7712	{
7713	struct kqueue *kq = knote_get_kq(kn);
7714	struct kqtailq *queue;
7715
7716	kqlock_held(kq);
7717
7718	if ((kn->kn_status & KN_QUEUED) == `0`)
7719	return;
7720
7721	queue = knote_get_queue(kn);
7722	TAILQ_REMOVE(queue, kn, kn_tqe);
7723	kn->kn_status &= ~KN_QUEUED;
7724	kq->kq_count--;
7725	}
7726
7727	void
7728	knote_init(void)
7729	{
7730	knote_zone = zinit(sizeof(struct knote), `8192`*sizeof(struct knote),
7731	`8192`, "knote zone");
7732
7733	kqfile_zone = zinit(sizeof(struct kqfile), `8192`*sizeof(struct kqfile),
7734	`8192`, "kqueue file zone");
7735
7736	kqworkq_zone = zinit(sizeof(struct kqworkq), `8192`*sizeof(struct kqworkq),
7737	`8192`, "kqueue workq zone");
7738
7739	kqworkloop_zone = zinit(sizeof(struct kqworkloop), `8192`*sizeof(struct kqworkloop),
7740	`8192`, "kqueue workloop zone");
7741
7742	/ allocate kq lock group attribute and group /
7743	kq_lck_grp_attr = lck_grp_attr_alloc_init();
7744
7745	kq_lck_grp = lck_grp_alloc_init("kqueue", kq_lck_grp_attr);
7746
7747	/ Allocate kq lock attribute /
7748	kq_lck_attr = lck_attr_alloc_init();
7749
7750	#if CONFIG_MEMORYSTATUS
7751	/ Initialize the memorystatus list lock /
7752	memorystatus_kevent_init(kq_lck_grp, kq_lck_attr);
7753	#endif
7754	}
7755	SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL)
7756
7757	const struct filterops *
7758	knote_fops(struct knote *kn)
7759	{
7760	return sysfilt_ops[kn->kn_filtid];
7761	}
7762
7763	static struct knote *
7764	knote_alloc(void)
7765	{
7766	struct knote kn = ((struct* knote *)zalloc(knote_zone));
7767	bzero(kn, sizeof(struct knote));
7768	return kn;
7769	}
7770
7771	static void
7772	knote_free(struct knote *kn)
7773	{
7774	assert(kn->kn_inuse == `0`);
7775	assert((kn->kn_status & KN_LOCKED) == `0`);
7776	zfree(knote_zone, kn);
7777	}
7778
7779	#if SOCKETS
7780	#include <sys/param.h>
7781	#include <sys/socket.h>
7782	#include <sys/protosw.h>
7783	#include <sys/domain.h>
7784	#include <sys/mbuf.h>
7785	#include <sys/kern_event.h>
7786	#include <sys/malloc.h>
7787	#include <sys/sys_domain.h>
7788	#include <sys/syslog.h>
7789
/* Round x up to a multiple of sizeof (u_int64_t). */
#ifndef ROUNDUP64
#define	ROUNDUP64(x) P2ROUNDUP((x), sizeof (u_int64_t))
#endif

/*
 * Address of the item that follows an n-byte item at p, with n rounded up
 * by ROUNDUP64.  Yields a void * so it can initialize any object pointer.
 */
#ifndef ADVANCE64
#define	ADVANCE64(p, n) (void *)((char *)(p) + ROUNDUP64(n))
#endif
7797
7798	static lck_grp_attr_t *kev_lck_grp_attr;
7799	static lck_attr_t *kev_lck_attr;
7800	static lck_grp_t *kev_lck_grp;
7801	static decl_lck_rw_data(,kev_lck_data);
7802	static lck_rw_t *kev_rwlock = &kev_lck_data;
7803
7804	static int kev_attach(struct socket so, int* proto, struct proc *p);
7805	static int kev_detach(struct socket *so);
7806	static int kev_control(struct socket *so, u_long cmd, caddr_t data,
7807	struct ifnet ifp, struct* proc *p);
7808	static lck_mtx_t * event_getlock(struct socket , int*);
7809	static int event_lock(struct socket , int, void* *);
7810	static int event_unlock(struct socket , int, void* *);
7811
7812	static int event_sofreelastref(struct socket *);
7813	static void kev_delete(struct kern_event_pcb *);
7814
/* User-request entry points for kernel event sockets. */
static struct pr_usrreqs event_usrreqs = {
	.pru_attach =		kev_attach,
	.pru_control =		kev_control,
	.pru_detach =		kev_detach,
	.pru_soreceive =	soreceive,
};

/*
 * Protocol switch table for the kernel event protocol: a single raw
 * SYSPROTO_EVENT entry using the per-pcb lock handlers defined in this file.
 */
static struct protosw eventsw[] = {
{
	.pr_type =		SOCK_RAW,
	.pr_protocol =		SYSPROTO_EVENT,
	.pr_flags =		PR_ATOMIC,
	.pr_usrreqs =		&event_usrreqs,
	.pr_lock =		event_lock,
	.pr_unlock =		event_unlock,
	.pr_getlock =		event_getlock,
}
};
7833
7834	__private_extern__ int kevt_getstat SYSCTL_HANDLER_ARGS;
7835	__private_extern__ int kevt_pcblist SYSCTL_HANDLER_ARGS;
7836
7837	SYSCTL_NODE(_net_systm, OID_AUTO, kevt,
7838	CTLFLAG_RW\|CTLFLAG_LOCKED, `0`, "Kernel event family");
7839
7840	struct kevtstat kevtstat;
7841	SYSCTL_PROC(_net_systm_kevt, OID_AUTO, stats,
7842	CTLTYPE_STRUCT \| CTLFLAG_RD \| CTLFLAG_LOCKED, `0`, `0`,
7843	kevt_getstat, "S,kevtstat", "");
7844
7845	SYSCTL_PROC(_net_systm_kevt, OID_AUTO, pcblist,
7846	CTLTYPE_STRUCT \| CTLFLAG_RD \| CTLFLAG_LOCKED, `0`, `0`,
7847	kevt_pcblist, "S,xkevtpcb", "");
7848
7849	static lck_mtx_t *
7850	event_getlock(struct socket so, int* flags)
7851	{
7852	#pragma unused(flags)
7853	struct kern_event_pcb ev_pcb = (struct* kern_event_pcb *)so->so_pcb;
7854
7855	if (so->so_pcb != NULL) {
7856	if (so->so_usecount < `0`)
7857	panic("%s: so=%p usecount=%d lrh= %s\n", __func__,
7858	so, so->so_usecount, solockhistory_nr(so));
7859	/ NOTREACHED /
7860	} else {
7861	panic("%s: so=%p NULL NO so_pcb %s\n", __func__,
7862	so, solockhistory_nr(so));
7863	/ NOTREACHED /
7864	}
7865	return (&ev_pcb->evp_mtx);
7866	}
7867
7868	static int
7869	event_lock(struct socket so, int* refcount, void *lr)
7870	{
7871	void *lr_saved;
7872
7873	if (lr == NULL)
7874	lr_saved = __builtin_return_address(`0`);
7875	else
7876	lr_saved = lr;
7877
7878	if (so->so_pcb != NULL) {
7879	lck_mtx_lock(&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
7880	} else {
7881	panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
7882	so, lr_saved, solockhistory_nr(so));
7883	/ NOTREACHED /
7884	}
7885
7886	if (so->so_usecount < `0`) {
7887	panic("%s: so=%p so_pcb=%p lr=%p ref=%d lrh= %s\n", __func__,
7888	so, so->so_pcb, lr_saved, so->so_usecount,
7889	solockhistory_nr(so));
7890	/ NOTREACHED /
7891	}
7892
7893	if (refcount)
7894	so->so_usecount++;
7895
7896	so->lock_lr[so->next_lock_lr] = lr_saved;
7897	so->next_lock_lr = (so->next_lock_lr+`1`) % SO_LCKDBG_MAX;
7898	return (`0`);
7899	}
7900
7901	static int
7902	event_unlock(struct socket so, int* refcount, void *lr)
7903	{
7904	void *lr_saved;
7905	lck_mtx_t *mutex_held;
7906
7907	if (lr == NULL)
7908	lr_saved = __builtin_return_address(`0`);
7909	else
7910	lr_saved = lr;
7911
7912	if (refcount) {
7913	so->so_usecount--;
7914	}
7915	if (so->so_usecount < `0`) {
7916	panic("%s: so=%p usecount=%d lrh= %s\n", __func__,
7917	so, so->so_usecount, solockhistory_nr(so));
7918	/ NOTREACHED /
7919	}
7920	if (so->so_pcb == NULL) {
7921	panic("%s: so=%p NO PCB usecount=%d lr=%p lrh= %s\n", __func__,
7922	so, so->so_usecount, (void *)lr_saved,
7923	solockhistory_nr(so));
7924	/ NOTREACHED /
7925	}
7926	mutex_held = (&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
7927
7928	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7929	so->unlock_lr[so->next_unlock_lr] = lr_saved;
7930	so->next_unlock_lr = (so->next_unlock_lr+`1`) % SO_LCKDBG_MAX;
7931
7932	if (so->so_usecount == `0`) {
7933	VERIFY(so->so_flags & SOF_PCBCLEARING);
7934	event_sofreelastref(so);
7935	} else {
7936	lck_mtx_unlock(mutex_held);
7937	}
7938
7939	return (`0`);
7940	}
7941
7942	static int
7943	event_sofreelastref(struct socket *so)
7944	{
7945	struct kern_event_pcb ev_pcb = (struct* kern_event_pcb *)so->so_pcb;
7946
7947	LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_OWNED);
7948
7949	so->so_pcb = NULL;
7950
7951	/*
7952	* Disable upcall in the event another thread is in kev_post_msg()
7953	* appending record to the receive socket buffer, since sbwakeup()
7954	* may release the socket lock otherwise.
7955	*/
7956	so->so_rcv.sb_flags &= ~SB_UPCALL;
7957	so->so_snd.sb_flags &= ~SB_UPCALL;
7958	so->so_event = sonullevent;
7959	lck_mtx_unlock(&(ev_pcb->evp_mtx));
7960
7961	LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_NOTOWNED);
7962	lck_rw_lock_exclusive(kev_rwlock);
7963	LIST_REMOVE(ev_pcb, evp_link);
7964	kevtstat.kes_pcbcount--;
7965	kevtstat.kes_gencnt++;
7966	lck_rw_done(kev_rwlock);
7967	kev_delete(ev_pcb);
7968
7969	sofreelastref(so, `1`);
7970	return (`0`);
7971	}
7972
7973	static int event_proto_count = (sizeof (eventsw) / sizeof (struct protosw));
7974
7975	static
7976	struct kern_event_head kern_event_head;
7977
7978	static u_int32_t static_event_id = `0`;
7979
7980	#define EVPCB_ZONE_MAX 65536
7981	#define EVPCB_ZONE_NAME "kerneventpcb"
7982	static struct zone *ev_pcb_zone;
7983
7984	/*
7985	* Install the protosw's for the NKE manager. Invoked at extension load time
7986	*/
7987	void
7988	kern_event_init(struct domain *dp)
7989	{
7990	struct protosw *pr;
7991	int i;
7992
7993	VERIFY(!(dp->dom_flags & DOM_INITIALIZED));
7994	VERIFY(dp == systemdomain);
7995
7996	kev_lck_grp_attr = lck_grp_attr_alloc_init();
7997	if (kev_lck_grp_attr == NULL) {
7998	panic("%s: lck_grp_attr_alloc_init failed\n", __func__);
7999	/ NOTREACHED /
8000	}
8001
8002	kev_lck_grp = lck_grp_alloc_init("Kernel Event Protocol",
8003	kev_lck_grp_attr);
8004	if (kev_lck_grp == NULL) {
8005	panic("%s: lck_grp_alloc_init failed\n", __func__);
8006	/ NOTREACHED /
8007	}
8008
8009	kev_lck_attr = lck_attr_alloc_init();
8010	if (kev_lck_attr == NULL) {
8011	panic("%s: lck_attr_alloc_init failed\n", __func__);
8012	/ NOTREACHED /
8013	}
8014
8015	lck_rw_init(kev_rwlock, kev_lck_grp, kev_lck_attr);
8016	if (kev_rwlock == NULL) {
8017	panic("%s: lck_mtx_alloc_init failed\n", __func__);
8018	/ NOTREACHED /
8019	}
8020
8021	for (i = `0`, pr = &eventsw[`0`]; i < event_proto_count; i++, pr++)
8022	net_add_proto(pr, dp, `1`);
8023
8024	ev_pcb_zone = zinit(sizeof(struct kern_event_pcb),
8025	EVPCB_ZONE_MAX * sizeof(struct kern_event_pcb), `0`, EVPCB_ZONE_NAME);
8026	if (ev_pcb_zone == NULL) {
8027	panic("%s: failed allocating ev_pcb_zone", __func__);
8028	/ NOTREACHED /
8029	}
8030	zone_change(ev_pcb_zone, Z_EXPAND, TRUE);
8031	zone_change(ev_pcb_zone, Z_CALLERACCT, TRUE);
8032	}
8033
8034	static int
8035	kev_attach(struct socket so, __unused int* proto, __unused struct proc *p)
8036	{
8037	int error = `0`;
8038	struct kern_event_pcb *ev_pcb;
8039
8040	error = soreserve(so, KEV_SNDSPACE, KEV_RECVSPACE);
8041	if (error != `0`)
8042	return (error);
8043
8044	if ((ev_pcb = (struct kern_event_pcb *)zalloc(ev_pcb_zone)) == NULL) {
8045	return (ENOBUFS);
8046	}
8047	bzero(ev_pcb, sizeof(struct kern_event_pcb));
8048	lck_mtx_init(&ev_pcb->evp_mtx, kev_lck_grp, kev_lck_attr);
8049
8050	ev_pcb->evp_socket = so;
8051	ev_pcb->evp_vendor_code_filter = `0xffffffff`;
8052
8053	so->so_pcb = (caddr_t) ev_pcb;
8054	lck_rw_lock_exclusive(kev_rwlock);
8055	LIST_INSERT_HEAD(&kern_event_head, ev_pcb, evp_link);
8056	kevtstat.kes_pcbcount++;
8057	kevtstat.kes_gencnt++;
8058	lck_rw_done(kev_rwlock);
8059
8060	return (error);
8061	}
8062
8063	static void
8064	kev_delete(struct kern_event_pcb *ev_pcb)
8065	{
8066	VERIFY(ev_pcb != NULL);
8067	lck_mtx_destroy(&ev_pcb->evp_mtx, kev_lck_grp);
8068	zfree(ev_pcb_zone, ev_pcb);
8069	}
8070
8071	static int
8072	kev_detach(struct socket *so)
8073	{
8074	struct kern_event_pcb ev_pcb = (struct* kern_event_pcb *) so->so_pcb;
8075
8076	if (ev_pcb != NULL) {
8077	soisdisconnected(so);
8078	so->so_flags \|= SOF_PCBCLEARING;
8079	}
8080
8081	return (`0`);
8082	}
8083
8084	/*
8085	* For now, kev_vendor_code and mbuf_tags use the same
8086	* mechanism.
8087	*/
8088	errno_t kev_vendor_code_find(
8089	const char *string,
8090	u_int32_t *out_vendor_code)
8091	{
8092	if (strlen(string) >= KEV_VENDOR_CODE_MAX_STR_LEN) {
8093	return (EINVAL);
8094	}
8095	return (net_str_id_find_internal(string, out_vendor_code,
8096	NSI_VENDOR_CODE, `1`));
8097	}
8098
8099	errno_t
8100	kev_msg_post(struct kev_msg *event_msg)
8101	{
8102	mbuf_tag_id_t min_vendor, max_vendor;
8103
8104	net_str_id_first_last(&min_vendor, &max_vendor, NSI_VENDOR_CODE);
8105
8106	if (event_msg == NULL)
8107	return (EINVAL);
8108
8109	/*
8110	* Limit third parties to posting events for registered vendor codes
8111	* only
8112	*/
8113	if (event_msg->vendor_code < min_vendor \|\|
8114	event_msg->vendor_code > max_vendor) {
8115	OSIncrementAtomic64((SInt64 *)&kevtstat.kes_badvendor);
8116	return (EINVAL);
8117	}
8118	return (kev_post_msg(event_msg));
8119	}
8120
8121	int
8122	kev_post_msg(struct kev_msg *event_msg)
8123	{
8124	struct mbuf m, m2;
8125	struct kern_event_pcb *ev_pcb;
8126	struct kern_event_msg *ev;
8127	char *tmp;
8128	u_int32_t total_size;
8129	int i;
8130
8131	/ Verify the message is small enough to fit in one mbuf w/o cluster /
8132	total_size = KEV_MSG_HEADER_SIZE;
8133
8134	for (i = `0`; i < `5`; i++) {
8135	if (event_msg->dv[i].data_length == `0`)
8136	break;
8137	total_size += event_msg->dv[i].data_length;
8138	}
8139
8140	if (total_size > MLEN) {
8141	OSIncrementAtomic64((SInt64 *)&kevtstat.kes_toobig);
8142	return (EMSGSIZE);
8143	}
8144
8145	m = m_get(M_WAIT, MT_DATA);
8146	if (m == `0`) {
8147	OSIncrementAtomic64((SInt64 *)&kevtstat.kes_nomem);
8148	return (ENOMEM);
8149	}
8150	ev = mtod(m, struct kern_event_msg *);
8151	total_size = KEV_MSG_HEADER_SIZE;
8152
8153	tmp = (char *) &ev->event_data[`0`];
8154	for (i = `0`; i < `5`; i++) {
8155	if (event_msg->dv[i].data_length == `0`)
8156	break;
8157
8158	total_size += event_msg->dv[i].data_length;
8159	bcopy(event_msg->dv[i].data_ptr, tmp,
8160	event_msg->dv[i].data_length);
8161	tmp += event_msg->dv[i].data_length;
8162	}
8163
8164	ev->id = ++static_event_id;
8165	ev->total_size = total_size;
8166	ev->vendor_code = event_msg->vendor_code;
8167	ev->kev_class = event_msg->kev_class;
8168	ev->kev_subclass = event_msg->kev_subclass;
8169	ev->event_code = event_msg->event_code;
8170
8171	m->m_len = total_size;
8172	lck_rw_lock_shared(kev_rwlock);
8173	for (ev_pcb = LIST_FIRST(&kern_event_head);
8174	ev_pcb;
8175	ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
8176	lck_mtx_lock(&ev_pcb->evp_mtx);
8177	if (ev_pcb->evp_socket->so_pcb == NULL) {
8178	lck_mtx_unlock(&ev_pcb->evp_mtx);
8179	continue;
8180	}
8181	if (ev_pcb->evp_vendor_code_filter != KEV_ANY_VENDOR) {
8182	if (ev_pcb->evp_vendor_code_filter != ev->vendor_code) {
8183	lck_mtx_unlock(&ev_pcb->evp_mtx);
8184	continue;
8185	}
8186
8187	if (ev_pcb->evp_class_filter != KEV_ANY_CLASS) {
8188	if (ev_pcb->evp_class_filter != ev->kev_class) {
8189	lck_mtx_unlock(&ev_pcb->evp_mtx);
8190	continue;
8191	}
8192
8193	if ((ev_pcb->evp_subclass_filter !=
8194	KEV_ANY_SUBCLASS) &&
8195	(ev_pcb->evp_subclass_filter !=
8196	ev->kev_subclass)) {
8197	lck_mtx_unlock(&ev_pcb->evp_mtx);
8198	continue;
8199	}
8200	}
8201	}
8202
8203	m2 = m_copym(m, `0`, m->m_len, M_WAIT);
8204	if (m2 == `0`) {
8205	OSIncrementAtomic64((SInt64 *)&kevtstat.kes_nomem);
8206	m_free(m);
8207	lck_mtx_unlock(&ev_pcb->evp_mtx);
8208	lck_rw_done(kev_rwlock);
8209	return (ENOMEM);
8210	}
8211	if (sbappendrecord(&ev_pcb->evp_socket->so_rcv, m2)) {
8212	/*
8213	* We use "m" for the socket stats as it would be
8214	* unsafe to use "m2"
8215	*/
8216	so_inc_recv_data_stat(ev_pcb->evp_socket,
8217	`1`, m->m_len, MBUF_TC_BE);
8218
8219	sorwakeup(ev_pcb->evp_socket);
8220	OSIncrementAtomic64((SInt64 *)&kevtstat.kes_posted);
8221	} else {
8222	OSIncrementAtomic64((SInt64 *)&kevtstat.kes_fullsock);
8223	}
8224	lck_mtx_unlock(&ev_pcb->evp_mtx);
8225	}
8226	m_free(m);
8227	lck_rw_done(kev_rwlock);
8228
8229	return (`0`);
8230	}
8231
8232	static int
8233	kev_control(struct socket *so,
8234	u_long cmd,
8235	caddr_t data,
8236	__unused struct ifnet *ifp,
8237	__unused struct proc *p)
8238	{
8239	struct kev_request kev_req = (struct* kev_request *) data;
8240	struct kern_event_pcb *ev_pcb;
8241	struct kev_vendor_code *kev_vendor;
8242	u_int32_t id_value = (u_int32_t ) data;
8243
8244	switch (cmd) {
8245	case SIOCGKEVID:
8246	*id_value = static_event_id;
8247	break;
8248	case SIOCSKEVFILT:
8249	ev_pcb = (struct kern_event_pcb *) so->so_pcb;
8250	ev_pcb->evp_vendor_code_filter = kev_req->vendor_code;
8251	ev_pcb->evp_class_filter = kev_req->kev_class;
8252	ev_pcb->evp_subclass_filter = kev_req->kev_subclass;
8253	break;
8254	case SIOCGKEVFILT:
8255	ev_pcb = (struct kern_event_pcb *) so->so_pcb;
8256	kev_req->vendor_code = ev_pcb->evp_vendor_code_filter;
8257	kev_req->kev_class = ev_pcb->evp_class_filter;
8258	kev_req->kev_subclass = ev_pcb->evp_subclass_filter;
8259	break;
8260	case SIOCGKEVVENDOR:
8261	kev_vendor = (struct kev_vendor_code *)data;
8262	/ Make sure string is NULL terminated /
8263	kev_vendor->vendor_string[KEV_VENDOR_CODE_MAX_STR_LEN-`1`] = `0`;
8264	return (net_str_id_find_internal(kev_vendor->vendor_string,
8265	&kev_vendor->vendor_code, NSI_VENDOR_CODE, `0`));
8266	default:
8267	return (ENOTSUP);
8268	}
8269
8270	return (`0`);
8271	}
8272
8273	int
8274	kevt_getstat SYSCTL_HANDLER_ARGS
8275	{
8276	#pragma unused(oidp, arg1, arg2)
8277	int error = `0`;
8278
8279	lck_rw_lock_shared(kev_rwlock);
8280
8281	if (req->newptr != USER_ADDR_NULL) {
8282	error = EPERM;
8283	goto done;
8284	}
8285	if (req->oldptr == USER_ADDR_NULL) {
8286	req->oldidx = sizeof(struct kevtstat);
8287	goto done;
8288	}
8289
8290	error = SYSCTL_OUT(req, &kevtstat,
8291	MIN(sizeof(struct kevtstat), req->oldlen));
8292	done:
8293	lck_rw_done(kev_rwlock);
8294
8295	return (error);
8296	}
8297
8298	__private_extern__ int
8299	kevt_pcblist SYSCTL_HANDLER_ARGS
8300	{
8301	#pragma unused(oidp, arg1, arg2)
8302	int error = `0`;
8303	int n, i;
8304	struct xsystmgen xsg;
8305	void *buf = NULL;
8306	size_t item_size = ROUNDUP64(sizeof (struct xkevtpcb)) +
8307	ROUNDUP64(sizeof (struct xsocket_n)) +
8308	`2` * ROUNDUP64(sizeof (struct xsockbuf_n)) +
8309	ROUNDUP64(sizeof (struct xsockstat_n));
8310	struct kern_event_pcb *ev_pcb;
8311
8312	buf = _MALLOC(item_size, M_TEMP, M_WAITOK \| M_ZERO);
8313	if (buf == NULL)
8314	return (ENOMEM);
8315
8316	lck_rw_lock_shared(kev_rwlock);
8317
8318	n = kevtstat.kes_pcbcount;
8319
8320	if (req->oldptr == USER_ADDR_NULL) {
8321	req->oldidx = (n + n/`8`) * item_size;
8322	goto done;
8323	}
8324	if (req->newptr != USER_ADDR_NULL) {
8325	error = EPERM;
8326	goto done;
8327	}
8328	bzero(&xsg, sizeof (xsg));
8329	xsg.xg_len = sizeof (xsg);
8330	xsg.xg_count = n;
8331	xsg.xg_gen = kevtstat.kes_gencnt;
8332	xsg.xg_sogen = so_gencnt;
8333	error = SYSCTL_OUT(req, &xsg, sizeof (xsg));
8334	if (error) {
8335	goto done;
8336	}
8337	/*
8338	* We are done if there is no pcb
8339	*/
8340	if (n == `0`) {
8341	goto done;
8342	}
8343
8344	i = `0`;
8345	for (i = `0`, ev_pcb = LIST_FIRST(&kern_event_head);
8346	i < n && ev_pcb != NULL;
8347	i++, ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
8348	struct xkevtpcb xk = (struct* xkevtpcb *)buf;
8349	struct xsocket_n xso = (struct* xsocket_n *)
8350	ADVANCE64(xk, sizeof (*xk));
8351	struct xsockbuf_n xsbrcv = (struct* xsockbuf_n *)
8352	ADVANCE64(xso, sizeof (*xso));
8353	struct xsockbuf_n xsbsnd = (struct* xsockbuf_n *)
8354	ADVANCE64(xsbrcv, sizeof (*xsbrcv));
8355	struct xsockstat_n xsostats = (struct* xsockstat_n *)
8356	ADVANCE64(xsbsnd, sizeof (*xsbsnd));
8357
8358	bzero(buf, item_size);
8359
8360	lck_mtx_lock(&ev_pcb->evp_mtx);
8361
8362	xk->kep_len = sizeof(struct xkevtpcb);
8363	xk->kep_kind = XSO_EVT;
8364	xk->kep_evtpcb = (uint64_t)VM_KERNEL_ADDRPERM(ev_pcb);
8365	xk->kep_vendor_code_filter = ev_pcb->evp_vendor_code_filter;
8366	xk->kep_class_filter = ev_pcb->evp_class_filter;
8367	xk->kep_subclass_filter = ev_pcb->evp_subclass_filter;
8368
8369	sotoxsocket_n(ev_pcb->evp_socket, xso);
8370	sbtoxsockbuf_n(ev_pcb->evp_socket ?
8371	&ev_pcb->evp_socket->so_rcv : NULL, xsbrcv);
8372	sbtoxsockbuf_n(ev_pcb->evp_socket ?
8373	&ev_pcb->evp_socket->so_snd : NULL, xsbsnd);
8374	sbtoxsockstat_n(ev_pcb->evp_socket, xsostats);
8375
8376	lck_mtx_unlock(&ev_pcb->evp_mtx);
8377
8378	error = SYSCTL_OUT(req, buf, item_size);
8379	}
8380
8381	if (error == `0`) {
8382	/*
8383	* Give the user an updated idea of our state.
8384	* If the generation differs from what we told
8385	* her before, she knows that something happened
8386	* while we were processing this request, and it
8387	* might be necessary to retry.
8388	*/
8389	bzero(&xsg, sizeof (xsg));
8390	xsg.xg_len = sizeof (xsg);
8391	xsg.xg_count = n;
8392	xsg.xg_gen = kevtstat.kes_gencnt;
8393	xsg.xg_sogen = so_gencnt;
8394	error = SYSCTL_OUT(req, &xsg, sizeof (xsg));
8395	if (error) {
8396	goto done;
8397	}
8398	}
8399
8400	done:
8401	lck_rw_done(kev_rwlock);
8402
8403	return (error);
8404	}
8405
8406	#endif /* SOCKETS */
8407
8408
8409	int
8410	fill_kqueueinfo(struct kqueue kq, struct* kqueue_info * kinfo)
8411	{
8412	struct vinfo_stat * st;
8413
8414	st = &kinfo->kq_stat;
8415
8416	st->vst_size = kq->kq_count;
8417	if (kq->kq_state & KQ_KEV_QOS)
8418	st->vst_blksize = sizeof(struct kevent_qos_s);
8419	else if (kq->kq_state & KQ_KEV64)
8420	st->vst_blksize = sizeof(struct kevent64_s);
8421	else
8422	st->vst_blksize = sizeof(struct kevent);
8423	st->vst_mode = S_IFIFO;
8424	st->vst_ino = (kq->kq_state & KQ_DYNAMIC) ?
8425	((struct kqworkloop *)kq)->kqwl_dynamicid : `0`;
8426
8427	/ flags exported to libproc as PROC_KQUEUE_* (sys/proc_info.h) /
8428	#define PROC_KQUEUE_MASK (KQ_SEL\|KQ_SLEEP\|KQ_KEV32\|KQ_KEV64\|KQ_KEV_QOS\|KQ_WORKQ\|KQ_WORKLOOP)
8429	kinfo->kq_state = kq->kq_state & PROC_KQUEUE_MASK;
8430
8431	return (`0`);
8432	}
8433
8434	static int
8435	fill_kqueue_dyninfo(struct kqueue kq, struct* kqueue_dyninfo *kqdi)
8436	{
8437	struct kqworkloop kqwl = (struct* kqworkloop *)kq;
8438	struct kqrequest *kqr = &kqwl->kqwl_request;
8439	workq_threadreq_param_t trp = {};
8440	int err;
8441
8442	if ((kq->kq_state & KQ_WORKLOOP) == `0`) {
8443	return EINVAL;
8444	}
8445
8446	if ((err = fill_kqueueinfo(kq, &kqdi->kqdi_info))) {
8447	return err;
8448	}
8449
8450	kq_req_lock(kqwl);
8451
8452	kqdi->kqdi_servicer = thread_tid(kqr->kqr_thread);
8453	kqdi->kqdi_owner = thread_tid(kqwl->kqwl_owner);
8454	kqdi->kqdi_request_state = kqr->kqr_state;
8455	kqdi->kqdi_async_qos = kqr->kqr_qos_index;
8456	kqdi->kqdi_events_qos = kqr->kqr_override_index;
8457	kqdi->kqdi_sync_waiters = kqr->kqr_dsync_waiters;
8458	kqdi->kqdi_sync_waiter_qos = `0`;
8459
8460	trp.trp_value = kqwl->kqwl_params;
8461	if (trp.trp_flags & TRP_PRIORITY)
8462	kqdi->kqdi_pri = trp.trp_pri;
8463	else
8464	kqdi->kqdi_pri = `0`;
8465
8466	if (trp.trp_flags & TRP_POLICY)
8467	kqdi->kqdi_pol = trp.trp_pol;
8468	else
8469	kqdi->kqdi_pol = `0`;
8470
8471	if (trp.trp_flags & TRP_CPUPERCENT)
8472	kqdi->kqdi_cpupercent = trp.trp_cpupercent;
8473	else
8474	kqdi->kqdi_cpupercent = `0`;
8475
8476	kq_req_unlock(kqwl);
8477
8478	return `0`;
8479	}
8480
8481
8482	void
8483	knote_markstayactive(struct knote *kn)
8484	{
8485	struct kqueue *kq = knote_get_kq(kn);
8486	kq_index_t qos;
8487
8488	kqlock(kq);
8489	kn->kn_status \|= KN_STAYACTIVE;
8490
8491	/*
8492	* Making a knote stay active is a property of the knote that must be
8493	* established before it is fully attached.
8494	*/
8495	assert(kn->kn_status & KN_ATTACHING);
8496	assert((kn->kn_status & (KN_QUEUED \| KN_SUPPRESSED)) == `0`);
8497
8498	/ handle all stayactive knotes on the (appropriate) manager /
8499	if (kq->kq_state & KQ_WORKQ) {
8500	qos = KQWQ_QOS_MANAGER;
8501	} else if (kq->kq_state & KQ_WORKLOOP) {
8502	struct kqworkloop kqwl = (struct* kqworkloop *)kq;
8503
8504	qos = _pthread_priority_thread_qos(kn->kn_qos);
8505	assert(qos && qos < THREAD_QOS_LAST);
8506	kq_req_lock(kq);
8507	kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_STAYACTIVE_QOS, qos);
8508	kq_req_unlock(kq);
8509	qos = KQWL_BUCKET_STAYACTIVE;
8510	} else {
8511	qos = THREAD_QOS_UNSPECIFIED;
8512	}
8513
8514	kn->kn_req_index = qos;
8515	kn->kn_qos_override = qos;
8516	kn->kn_qos_index = qos;
8517
8518	knote_activate(kn);
8519	kqunlock(kq);
8520	}
8521
8522	void
8523	knote_clearstayactive(struct knote *kn)
8524	{
8525	kqlock(knote_get_kq(kn));
8526	kn->kn_status &= ~KN_STAYACTIVE;
8527	knote_deactivate(kn);
8528	kqunlock(knote_get_kq(kn));
8529	}
8530
8531	static unsigned long
8532	kevent_extinfo_emit(struct kqueue kq, struct* knote kn, struct* kevent_extinfo *buf,
8533	unsigned long buflen, unsigned long nknotes)
8534	{
8535	for (; kn; kn = SLIST_NEXT(kn, kn_link)) {
8536	if (kq == knote_get_kq(kn)) {
8537	if (nknotes < buflen) {
8538	struct kevent_extinfo *info = &buf[nknotes];
8539	struct kevent_internal_s *kevp = &kn->kn_kevent;
8540
8541	kqlock(kq);
8542
8543	info->kqext_kev = (struct kevent_qos_s){
8544	.ident = kevp->ident,
8545	.filter = kevp->filter,
8546	.flags = kevp->flags,
8547	.fflags = kevp->fflags,
8548	.data = (int64_t)kevp->data,
8549	.udata = kevp->udata,
8550	.ext[`0`] = kevp->ext[`0`],
8551	.ext[`1`] = kevp->ext[`1`],
8552	.ext[`2`] = kevp->ext[`2`],
8553	.ext[`3`] = kevp->ext[`3`],
8554	.qos = kn->kn_req_index,
8555	};
8556	info->kqext_sdata = kn->kn_sdata;
8557	info->kqext_status = kn->kn_status;
8558	info->kqext_sfflags = kn->kn_sfflags;
8559
8560	kqunlock(kq);
8561	}
8562
8563	/ we return total number of knotes, which may be more than requested /
8564	nknotes++;
8565	}
8566	}
8567
8568	return nknotes;
8569	}
8570
8571	int
8572	kevent_copyout_proc_dynkqids(void *proc, user_addr_t ubuf, uint32_t ubufsize,
8573	int32_t *nkqueues_out)
8574	{
8575	proc_t p = (proc_t)proc;
8576	struct filedesc *fdp = p->p_fd;
8577	unsigned int nkqueues = `0`;
8578	unsigned long ubuflen = ubufsize / sizeof(kqueue_id_t);
8579	size_t buflen, bufsize;
8580	kqueue_id_t *kq_ids = NULL;
8581	int err = `0`;
8582
8583	assert(p != NULL);
8584
8585	if (ubuf == USER_ADDR_NULL && ubufsize != `0`) {
8586	err = EINVAL;
8587	goto out;
8588	}
8589
8590	buflen = min(ubuflen, PROC_PIDDYNKQUEUES_MAX);
8591
8592	if (ubuflen != `0`) {
8593	if (os_mul_overflow(sizeof(kqueue_id_t), buflen, &bufsize)) {
8594	err = ERANGE;
8595	goto out;
8596	}
8597	kq_ids = kalloc(bufsize);
8598	assert(kq_ids != NULL);
8599	}
8600
8601	kqhash_lock(p);
8602
8603	if (fdp->fd_kqhashmask > `0`) {
8604	for (uint32_t i = `0`; i < fdp->fd_kqhashmask + `1`; i++) {
8605	struct kqworkloop *kqwl;
8606
8607	SLIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
8608	/ report the number of kqueues, even if they don't all fit /
8609	if (nkqueues < buflen) {
8610	kq_ids[nkqueues] = kqwl->kqwl_dynamicid;
8611	}
8612	nkqueues++;
8613	}
8614	}
8615	}
8616
8617	kqhash_unlock(p);
8618
8619	if (kq_ids) {
8620	size_t copysize;
8621	if (os_mul_overflow(sizeof(kqueue_id_t), min(ubuflen, nkqueues), &copysize)) {
8622	err = ERANGE;
8623	goto out;
8624	}
8625
8626	assert(ubufsize >= copysize);
8627	err = copyout(kq_ids, ubuf, copysize);
8628	}
8629
8630	out:
8631	if (kq_ids) {
8632	kfree(kq_ids, bufsize);
8633	}
8634
8635	if (!err) {
8636	nkqueues_out = (int*)min(nkqueues, PROC_PIDDYNKQUEUES_MAX);
8637	}
8638	return err;
8639	}
8640
8641	int
8642	kevent_copyout_dynkqinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
8643	uint32_t ubufsize, int32_t *size_out)
8644	{
8645	proc_t p = (proc_t)proc;
8646	struct kqueue *kq;
8647	int err = `0`;
8648	struct kqueue_dyninfo kqdi = { };
8649
8650	assert(p != NULL);
8651
8652	if (ubufsize < sizeof(struct kqueue_info)) {
8653	return ENOBUFS;
8654	}
8655
8656	kqhash_lock(p);
8657	kq = kqueue_hash_lookup(p, kq_id);
8658	if (!kq) {
8659	kqhash_unlock(p);
8660	return ESRCH;
8661	}
8662	kqueue_retain(kq);
8663	kqhash_unlock(p);
8664
8665	/*
8666	* backward compatibility: allow the argument to this call to only be
8667	* a struct kqueue_info
8668	*/
8669	if (ubufsize >= sizeof(struct kqueue_dyninfo)) {
8670	ubufsize = sizeof(struct kqueue_dyninfo);
8671	err = fill_kqueue_dyninfo(kq, &kqdi);
8672	} else {
8673	ubufsize = sizeof(struct kqueue_info);
8674	err = fill_kqueueinfo(kq, &kqdi.kqdi_info);
8675	}
8676	if (err == `0` && (err = copyout(&kqdi, ubuf, ubufsize)) == `0`) {
8677	*size_out = ubufsize;
8678	}
8679	kqueue_release_last(p, kq);
8680	return err;
8681	}
8682
8683	int
8684	kevent_copyout_dynkqextinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
8685	uint32_t ubufsize, int32_t *nknotes_out)
8686	{
8687	proc_t p = (proc_t)proc;
8688	struct kqueue *kq;
8689	int err;
8690
8691	assert(p != NULL);
8692
8693	kqhash_lock(p);
8694	kq = kqueue_hash_lookup(p, kq_id);
8695	if (!kq) {
8696	kqhash_unlock(p);
8697	return ESRCH;
8698	}
8699	kqueue_retain(kq);
8700	kqhash_unlock(p);
8701
8702	err = pid_kqueue_extinfo(p, kq, ubuf, ubufsize, nknotes_out);
8703	kqueue_release_last(p, kq);
8704	return err;
8705	}
8706
8707	int
8708	pid_kqueue_extinfo(proc_t p, struct kqueue *kq, user_addr_t ubuf,
8709	uint32_t bufsize, int32_t *retval)
8710	{
8711	struct knote *kn;
8712	int i;
8713	int err = `0`;
8714	struct filedesc *fdp = p->p_fd;
8715	unsigned long nknotes = `0`;
8716	unsigned long buflen = bufsize / sizeof(struct kevent_extinfo);
8717	struct kevent_extinfo *kqext = NULL;
8718
8719	/ arbitrary upper limit to cap kernel memory usage, copyout size, etc. /
8720	buflen = min(buflen, PROC_PIDFDKQUEUE_KNOTES_MAX);
8721
8722	kqext = kalloc(buflen * sizeof(struct kevent_extinfo));
8723	if (kqext == NULL) {
8724	err = ENOMEM;
8725	goto out;
8726	}
8727	bzero(kqext, buflen * sizeof(struct kevent_extinfo));
8728
8729	proc_fdlock(p);
8730	for (i = `0`; i < fdp->fd_knlistsize; i++) {
8731	kn = SLIST_FIRST(&fdp->fd_knlist[i]);
8732	nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
8733	}
8734	proc_fdunlock(p);
8735
8736	if (fdp->fd_knhashmask != `0`) {
8737	for (i = `0`; i < (int)fdp->fd_knhashmask + `1`; i++) {
8738	kqhash_lock(p);
8739	kn = SLIST_FIRST(&fdp->fd_knhash[i]);
8740	nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
8741	kqhash_unlock(p);
8742	}
8743	}
8744
8745	assert(bufsize >= sizeof(struct kevent_extinfo) * min(buflen, nknotes));
8746	err = copyout(kqext, ubuf, sizeof(struct kevent_extinfo) * min(buflen, nknotes));
8747
8748	out:
8749	if (kqext) {
8750	kfree(kqext, buflen * sizeof(struct kevent_extinfo));
8751	kqext = NULL;
8752	}
8753
8754	if (!err) {
8755	*retval = min(nknotes, PROC_PIDFDKQUEUE_KNOTES_MAX);
8756	}
8757	return err;
8758	}
8759
8760	static unsigned int
8761	klist_copy_udata(struct klist list, uint64_t buf,
8762	unsigned int buflen, unsigned int nknotes)
8763	{
8764	struct kevent_internal_s *kev;
8765	struct knote *kn;
8766	SLIST_FOREACH(kn, list, kn_link) {
8767	if (nknotes < buflen) {
8768	struct kqueue *kq = knote_get_kq(kn);
8769	kqlock(kq);
8770	kev = &(kn->kn_kevent);
8771	buf[nknotes] = kev->udata;
8772	kqunlock(kq);
8773	}
8774	/ we return total number of knotes, which may be more than requested /
8775	nknotes++;
8776	}
8777
8778	return nknotes;
8779	}
8780
8781	static unsigned int
8782	kqlist_copy_dynamicids(__assert_only proc_t p, struct kqlist *list,
8783	uint64_t buf, unsigned* int buflen, unsigned int nids)
8784	{
8785	kqhash_lock_held(p);
8786	struct kqworkloop *kqwl;
8787	SLIST_FOREACH(kqwl, list, kqwl_hashlink) {
8788	if (nids < buflen) {
8789	buf[nids] = kqwl->kqwl_dynamicid;
8790	}
8791	nids++;
8792	}
8793	return nids;
8794	}
8795
8796	int
8797	kevent_proc_copy_uptrs(void proc, uint64_t buf, int bufsize)
8798	{
8799	proc_t p = (proc_t)proc;
8800	struct filedesc *fdp = p->p_fd;
8801	unsigned int nuptrs = `0`;
8802	unsigned long buflen = bufsize / sizeof(uint64_t);
8803
8804	if (buflen > `0`) {
8805	assert(buf != NULL);
8806	}
8807
8808	proc_fdlock(p);
8809	for (int i = `0`; i < fdp->fd_knlistsize; i++) {
8810	nuptrs = klist_copy_udata(&fdp->fd_knlist[i], buf, buflen, nuptrs);
8811	}
8812	knhash_lock(p);
8813	proc_fdunlock(p);
8814	if (fdp->fd_knhashmask != `0`) {
8815	for (int i = `0`; i < (int)fdp->fd_knhashmask + `1`; i++) {
8816	nuptrs = klist_copy_udata(&fdp->fd_knhash[i], buf, buflen, nuptrs);
8817	}
8818	}
8819	knhash_unlock(p);
8820
8821	kqhash_lock(p);
8822	if (fdp->fd_kqhashmask != `0`) {
8823	for (int i = `0`; i < (int)fdp->fd_kqhashmask + `1`; i++) {
8824	nuptrs = kqlist_copy_dynamicids(p, &fdp->fd_kqhash[i], buf, buflen,
8825	nuptrs);
8826	}
8827	}
8828	kqhash_unlock(p);
8829
8830	return (int)nuptrs;
8831	}
8832
8833	static void
8834	kevent_set_return_to_kernel_user_tsd(proc_t p, thread_t thread)
8835	{
8836	uint64_t ast_addr;
8837	bool proc_is_64bit = !!(p->p_flag & P_LP64);
8838	size_t user_addr_size = proc_is_64bit ? `8` : `4`;
8839	uint32_t ast_flags32 = `0`;
8840	uint64_t ast_flags64 = `0`;
8841	struct uthread *ut = get_bsdthread_info(thread);
8842
8843	if (ut->uu_kqr_bound != NULL) {
8844	ast_flags64 \|= R2K_WORKLOOP_PENDING_EVENTS;
8845	}
8846
8847	if (ast_flags64 == `0`) {
8848	return;
8849	}
8850
8851	if (!(p->p_flag & P_LP64)) {
8852	ast_flags32 = (uint32_t)ast_flags64;
8853	assert(ast_flags64 < `0x100000000ull`);
8854	}
8855
8856	ast_addr = thread_rettokern_addr(thread);
8857	if (ast_addr == `0`) {
8858	return;
8859	}
8860
8861	if (copyout((proc_is_64bit ? (void )&ast_flags64 : (void* *)&ast_flags32),
8862	(user_addr_t)ast_addr,
8863	user_addr_size) != `0`) {
8864	printf("pid %d (tid:%llu): copyout of return_to_kernel ast flags failed with "
8865	"ast_addr = %llu\n", p->p_pid, thread_tid(current_thread()), ast_addr);
8866	}
8867	}
8868
8869	void
8870	kevent_ast(thread_t thread, uint16_t bits)
8871	{
8872	proc_t p = current_proc();
8873
8874	if (bits & AST_KEVENT_REDRIVE_THREADREQ) {
8875	workq_kern_threadreq_redrive(p, WORKQ_THREADREQ_CAN_CREATE_THREADS);
8876	}
8877	if (bits & AST_KEVENT_RETURN_TO_KERNEL) {
8878	kevent_set_return_to_kernel_user_tsd(p, thread);
8879	}
8880	}
8881
#if DEVELOPMENT || DEBUG

#define KEVENT_SYSCTL_BOUND_ID 1

/*
 * Read-only sysctl handler for kern.kevent.*.
 *
 * The only supported selector (passed in arg1) is KEVENT_SYSCTL_BOUND_ID,
 * which reports a 64-bit value for the calling thread:
 *   - 0 when the thread's uthread has no bound kqrequest,
 *   - the workloop's dynamic ID when the bound request is a workloop,
 *   - (uint64_t)-1 for any other bound request.
 *
 * Returns EINVAL for an unknown selector or an attempted write, EFAULT
 * when the current thread has no uthread, otherwise the result of
 * sysctl_io_number().
 */
static int
kevent_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	uintptr_t selector = (uintptr_t)arg1;
	uint64_t bound_id = 0;
	struct uthread *ut;
	struct kqrequest *kqr;

	if (selector != KEVENT_SYSCTL_BOUND_ID) {
		return EINVAL;
	}

	/* Writes are not supported. */
	if (req->newptr) {
		return EINVAL;
	}

	ut = get_bsdthread_info(current_thread());
	if (ut == NULL) {
		return EFAULT;
	}

	kqr = ut->uu_kqr_bound;
	if (kqr != NULL) {
		bound_id = (kqr->kqr_state & KQR_WORKLOOP)
		    ? kqr_kqworkloop(kqr)->kqwl_dynamicid
		    : (uint64_t)-1;
	}

	return sysctl_io_number(req, bound_id, sizeof(bound_id), NULL, NULL);
}

SYSCTL_NODE(_kern, OID_AUTO, kevent, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
    "kevent information");

SYSCTL_PROC(_kern_kevent, OID_AUTO, bound_id,
    CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    (void *)KEVENT_SYSCTL_BOUND_ID,
    sizeof(kqueue_id_t), kevent_sysctl, "Q",
    "get the ID of the bound kqueue");

#endif /* DEVELOPMENT || DEBUG */
8928

Browse the source code of xnu/bsd/kern/kern_event.c