1/*
2 * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1982, 1986, 1989, 1993
31 * The Regents of the University of California. All rights reserved.
32 * (c) UNIX System Laboratories, Inc.
33 * All or some portions of this file are derived from material licensed
34 * to the University of California by American Telephone and Telegraph
35 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
36 * the permission of UNIX System Laboratories, Inc.
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 * 3. All advertising materials mentioning features or use of this software
47 * must display the following acknowledgement:
48 * This product includes software developed by the University of
49 * California, Berkeley and its contributors.
50 * 4. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)sys_generic.c 8.9 (Berkeley) 2/14/95
67 */
68/*
69 * NOTICE: This file was modified by SPARTA, Inc. in 2006 to introduce
70 * support for mandatory and extensible security protections. This notice
71 * is included in support of clause 2.2 (b) of the Apple Public License,
72 * Version 2.0.
73 */
74
75#include <sys/param.h>
76#include <sys/systm.h>
77#include <sys/filedesc.h>
78#include <sys/ioctl.h>
79#include <sys/file_internal.h>
80#include <sys/proc_internal.h>
81#include <sys/socketvar.h>
82#include <sys/uio_internal.h>
83#include <sys/kernel.h>
84#include <sys/guarded.h>
85#include <sys/stat.h>
86#include <sys/malloc.h>
87#include <sys/sysproto.h>
88
89#include <sys/mount_internal.h>
90#include <sys/protosw.h>
91#include <sys/ev.h>
92#include <sys/user.h>
93#include <sys/kdebug.h>
94#include <sys/poll.h>
95#include <sys/event.h>
96#include <sys/eventvar.h>
97#include <sys/proc.h>
98#include <sys/kauth.h>
99
100#include <machine/smp.h>
101#include <mach/mach_types.h>
102#include <kern/kern_types.h>
103#include <kern/assert.h>
104#include <kern/kalloc.h>
105#include <kern/thread.h>
106#include <kern/clock.h>
107#include <kern/ledger.h>
108#include <kern/task.h>
109#include <kern/telemetry.h>
110#include <kern/waitq.h>
111#include <kern/sched_prim.h>
112
113#include <sys/mbuf.h>
114#include <sys/domain.h>
115#include <sys/socket.h>
116#include <sys/socketvar.h>
117#include <sys/errno.h>
118#include <sys/syscall.h>
119#include <sys/pipe.h>
120
121#include <security/audit/audit.h>
122
123#include <net/if.h>
124#include <net/route.h>
125
126#include <netinet/in.h>
127#include <netinet/in_systm.h>
128#include <netinet/ip.h>
129#include <netinet/in_pcb.h>
130#include <netinet/ip_var.h>
131#include <netinet/ip6.h>
132#include <netinet/tcp.h>
133#include <netinet/tcp_fsm.h>
134#include <netinet/tcp_seq.h>
135#include <netinet/tcp_timer.h>
136#include <netinet/tcp_var.h>
137#include <netinet/tcpip.h>
138#include <netinet/tcp_debug.h>
139/* for wait queue based select */
140#include <kern/waitq.h>
141#include <kern/kalloc.h>
142#include <sys/vnode_internal.h>
143
144#if CONFIG_MACF
145#include <security/mac_framework.h>
146#endif
147
148/* XXX should be in a header file somewhere */
149void evsofree(struct socket *);
150void evpipefree(struct pipe *);
151void postpipeevent(struct pipe *, int);
152void postevent(struct socket *, struct sockbuf *, int);
153extern kern_return_t IOBSDGetPlatformUUID(__darwin_uuid_t uuid, mach_timespec_t timeoutp);
154
155int rd_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval);
156int wr_uio(struct proc *p, struct fileproc *fp, uio_t uio, user_ssize_t *retval);
157
158__private_extern__ int dofileread(vfs_context_t ctx, struct fileproc *fp,
159 user_addr_t bufp, user_size_t nbyte,
160 off_t offset, int flags, user_ssize_t *retval);
161__private_extern__ int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
162 user_addr_t bufp, user_size_t nbyte,
163 off_t offset, int flags, user_ssize_t *retval);
164__private_extern__ int preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_vnode);
165__private_extern__ void donefileread(struct proc *p, struct fileproc *fp_ret, int fd);
166
167/* Conflict wait queue for when selects collide (opaque type) */
168struct waitq select_conflict_queue;
169
170/*
171 * Init routine called from bsd_init.c
172 */
173void select_waitq_init(void);
174void
175select_waitq_init(void)
176{
177 waitq_init(&select_conflict_queue, SYNC_POLICY_FIFO);
178}
179
180#define f_flag f_fglob->fg_flag
181#define f_type f_fglob->fg_ops->fo_type
182#define f_msgcount f_fglob->fg_msgcount
183#define f_cred f_fglob->fg_cred
184#define f_ops f_fglob->fg_ops
185#define f_offset f_fglob->fg_offset
186#define f_data f_fglob->fg_data
187
188/*
189 * Read system call.
190 *
191 * Returns: 0 Success
192 * preparefileread:EBADF
193 * preparefileread:ESPIPE
194 * preparefileread:ENXIO
195 * preparefileread:EBADF
196 * dofileread:???
197 */
198int
199read(struct proc *p, struct read_args *uap, user_ssize_t *retval)
200{
201 __pthread_testcancel(1);
202 return(read_nocancel(p, (struct read_nocancel_args *)uap, retval));
203}
204
205int
206read_nocancel(struct proc *p, struct read_nocancel_args *uap, user_ssize_t *retval)
207{
208 struct fileproc *fp;
209 int error;
210 int fd = uap->fd;
211 struct vfs_context context;
212
213 if ( (error = preparefileread(p, &fp, fd, 0)) )
214 return (error);
215
216 context = *(vfs_context_current());
217 context.vc_ucred = fp->f_fglob->fg_cred;
218
219 error = dofileread(&context, fp, uap->cbuf, uap->nbyte,
220 (off_t)-1, 0, retval);
221
222 donefileread(p, fp, fd);
223
224 return (error);
225}
226
227/*
228 * Pread system call
229 *
230 * Returns: 0 Success
231 * preparefileread:EBADF
232 * preparefileread:ESPIPE
233 * preparefileread:ENXIO
234 * preparefileread:EBADF
235 * dofileread:???
236 */
237int
238pread(struct proc *p, struct pread_args *uap, user_ssize_t *retval)
239{
240 __pthread_testcancel(1);
241 return(pread_nocancel(p, (struct pread_nocancel_args *)uap, retval));
242}
243
244int
245pread_nocancel(struct proc *p, struct pread_nocancel_args *uap, user_ssize_t *retval)
246{
247 struct fileproc *fp = NULL; /* fp set by preparefileread() */
248 int fd = uap->fd;
249 int error;
250 struct vfs_context context;
251
252 if ( (error = preparefileread(p, &fp, fd, 1)) )
253 goto out;
254
255 context = *(vfs_context_current());
256 context.vc_ucred = fp->f_fglob->fg_cred;
257
258 error = dofileread(&context, fp, uap->buf, uap->nbyte,
259 uap->offset, FOF_OFFSET, retval);
260
261 donefileread(p, fp, fd);
262
263 KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pread) | DBG_FUNC_NONE),
264 uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
265
266out:
267 return (error);
268}
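/*
 * Illustrative userspace sketch (not kernel code): pread(2) reads at an
 * explicit offset (FOF_OFFSET above) and leaves the descriptor's file
 * offset untouched, and it fails with ESPIPE on non-seekable objects
 * such as pipes and sockets (see preparefileread() below).  The record
 * type and index are hypothetical.
 *
 *     struct record rec;
 *     off_t where = (off_t)idx * (off_t)sizeof(rec);
 *     ssize_t n = pread(fd, &rec, sizeof(rec), where);
 *     if (n < 0 && errno == ESPIPE)
 *         ;  // fd refers to a pipe/FIFO/socket: not seekable
 */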
269
270/*
271 * Code common for read and pread
272 */
273
274void
275donefileread(struct proc *p, struct fileproc *fp, int fd)
276{
277 proc_fdlock_spin(p);
278 fp_drop(p, fd, fp, 1);
279 proc_fdunlock(p);
280}
281
282/*
283 * Returns: 0 Success
284 * EBADF
285 * ESPIPE
286 * ENXIO
287 * fp_lookup:EBADF
288 * fo_read:???
289 */
290int
291preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_pread)
292{
293 vnode_t vp;
294 int error;
295 struct fileproc *fp;
296
297 AUDIT_ARG(fd, fd);
298
299 proc_fdlock_spin(p);
300
301 error = fp_lookup(p, fd, &fp, 1);
302
303 if (error) {
304 proc_fdunlock(p);
305 return (error);
306 }
307 if ((fp->f_flag & FREAD) == 0) {
308 error = EBADF;
309 goto out;
310 }
311 if (check_for_pread && (fp->f_type != DTYPE_VNODE)) {
312 error = ESPIPE;
313 goto out;
314 }
315 if (fp->f_type == DTYPE_VNODE) {
316 vp = (struct vnode *)fp->f_fglob->fg_data;
317
318 if (check_for_pread && (vnode_isfifo(vp))) {
319 error = ESPIPE;
320 goto out;
321 }
322 if (check_for_pread && (vp->v_flag & VISTTY)) {
323 error = ENXIO;
324 goto out;
325 }
326 }
327
328 *fp_ret = fp;
329
330 proc_fdunlock(p);
331 return (0);
332
333out:
334 fp_drop(p, fd, fp, 1);
335 proc_fdunlock(p);
336 return (error);
337}
338
339
340/*
341 * Returns: 0 Success
342 * EINVAL
343 * fo_read:???
344 */
345__private_extern__ int
346dofileread(vfs_context_t ctx, struct fileproc *fp,
347 user_addr_t bufp, user_size_t nbyte, off_t offset, int flags,
348 user_ssize_t *retval)
349{
350 uio_t auio;
351 user_ssize_t bytecnt;
352 long error = 0;
353 char uio_buf[ UIO_SIZEOF(1) ];
354
355 if (nbyte > INT_MAX)
356 return (EINVAL);
357
358 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
359 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_READ,
360 &uio_buf[0], sizeof(uio_buf));
361 } else {
362 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_READ,
363 &uio_buf[0], sizeof(uio_buf));
364 }
365 uio_addiov(auio, bufp, nbyte);
366
367 bytecnt = nbyte;
368
369 if ((error = fo_read(fp, auio, flags, ctx))) {
370 if (uio_resid(auio) != bytecnt && (error == ERESTART ||
371 error == EINTR || error == EWOULDBLOCK))
372 error = 0;
373 }
374 bytecnt -= uio_resid(auio);
375
376 *retval = bytecnt;
377
378 return (error);
379}
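/*
 * Illustrative userspace sketch (not kernel code): because the code
 * above reports a short transfer as success when fo_read() is
 * interrupted (ERESTART/EINTR/EWOULDBLOCK after some bytes have moved),
 * callers of read(2) generally loop until they have all the bytes they
 * asked for.  The helper name is hypothetical.
 *
 *     #include <unistd.h>
 *     #include <errno.h>
 *
 *     static ssize_t
 *     read_fully(int fd, void *buf, size_t len)
 *     {
 *         size_t done = 0;
 *         while (done < len) {
 *             ssize_t n = read(fd, (char *)buf + done, len - done);
 *             if (n == 0)
 *                 break;              // EOF
 *             if (n < 0) {
 *                 if (errno == EINTR)
 *                     continue;       // retry after a signal
 *                 return -1;          // real error
 *             }
 *             done += (size_t)n;      // short read: keep going
 *         }
 *         return (ssize_t)done;
 *     }
 */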
380
381/*
382 * Scatter read system call.
383 *
384 * Returns: 0 Success
385 * EINVAL
386 * ENOMEM
387 * copyin:EFAULT
388 * rd_uio:???
389 */
390int
391readv(struct proc *p, struct readv_args *uap, user_ssize_t *retval)
392{
393 __pthread_testcancel(1);
394 return(readv_nocancel(p, (struct readv_nocancel_args *)uap, retval));
395}
396
397int
398readv_nocancel(struct proc *p, struct readv_nocancel_args *uap, user_ssize_t *retval)
399{
400 uio_t auio = NULL;
401 int error;
402 struct user_iovec *iovp;
403
404 /* Verify range before calling uio_create() */
405 if (uap->iovcnt <= 0 || uap->iovcnt > UIO_MAXIOV)
406 return (EINVAL);
407
408 /* allocate a uio large enough to hold the number of iovecs passed */
409 auio = uio_create(uap->iovcnt, 0,
410 (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
411 UIO_READ);
412
413 /* get location of iovecs within the uio. then copyin the iovecs from
414 * user space.
415 */
416 iovp = uio_iovsaddr(auio);
417 if (iovp == NULL) {
418 error = ENOMEM;
419 goto ExitThisRoutine;
420 }
421 error = copyin_user_iovec_array(uap->iovp,
422 IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
423 uap->iovcnt, iovp);
424 if (error) {
425 goto ExitThisRoutine;
426 }
427
428 /* finalize uio_t for use and do the IO
429 */
430 error = uio_calculateresid(auio);
431 if (error) {
432 goto ExitThisRoutine;
433 }
434 error = rd_uio(p, uap->fd, auio, retval);
435
436ExitThisRoutine:
437 if (auio != NULL) {
438 uio_free(auio);
439 }
440 return (error);
441}
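/*
 * Illustrative userspace sketch (not kernel code): readv(2) fills each
 * iovec in order, which is exactly what the uio built above describes
 * once copyin_user_iovec_array() and uio_calculateresid() have run.
 * The buffers are hypothetical.
 *
 *     char hdr[16], body[4096];
 *     struct iovec iov[2] = {
 *         { .iov_base = hdr,  .iov_len = sizeof(hdr)  },
 *         { .iov_base = body, .iov_len = sizeof(body) },
 *     };
 *     ssize_t n = readv(fd, iov, 2);   // may still return a short count
 */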
442
443/*
444 * Write system call
445 *
446 * Returns: 0 Success
447 * EBADF
448 * fp_lookup:EBADF
449 * dofilewrite:???
450 */
451int
452write(struct proc *p, struct write_args *uap, user_ssize_t *retval)
453{
454 __pthread_testcancel(1);
455 return(write_nocancel(p, (struct write_nocancel_args *)uap, retval));
456
457}
458
459int
460write_nocancel(struct proc *p, struct write_nocancel_args *uap, user_ssize_t *retval)
461{
462 struct fileproc *fp;
463 int error;
464 int fd = uap->fd;
465 bool wrote_some = false;
466
467 AUDIT_ARG(fd, fd);
468
469 error = fp_lookup(p,fd,&fp,0);
470 if (error)
471 return(error);
472 if ((fp->f_flag & FWRITE) == 0) {
473 error = EBADF;
474 } else if (FP_ISGUARDED(fp, GUARD_WRITE)) {
475 proc_fdlock(p);
476 error = fp_guard_exception(p, fd, fp, kGUARD_EXC_WRITE);
477 proc_fdunlock(p);
478 } else {
479 struct vfs_context context = *(vfs_context_current());
480 context.vc_ucred = fp->f_fglob->fg_cred;
481
482 error = dofilewrite(&context, fp, uap->cbuf, uap->nbyte,
483 (off_t)-1, 0, retval);
484
485 wrote_some = *retval > 0;
486 }
487 if (wrote_some)
488 fp_drop_written(p, fd, fp);
489 else
490 fp_drop(p, fd, fp, 0);
491 return(error);
492}
493
494/*
495 * pwrite system call
496 *
497 * Returns: 0 Success
498 * EBADF
499 * ESPIPE
500 * ENXIO
501 * EINVAL
502 * fp_lookup:EBADF
503 * dofilewrite:???
504 */
505int
506pwrite(struct proc *p, struct pwrite_args *uap, user_ssize_t *retval)
507{
508 __pthread_testcancel(1);
509 return(pwrite_nocancel(p, (struct pwrite_nocancel_args *)uap, retval));
510}
511
512int
513pwrite_nocancel(struct proc *p, struct pwrite_nocancel_args *uap, user_ssize_t *retval)
514{
515 struct fileproc *fp;
516 int error;
517 int fd = uap->fd;
518 vnode_t vp = (vnode_t)0;
519 bool wrote_some = false;
520
521 AUDIT_ARG(fd, fd);
522
523 error = fp_lookup(p,fd,&fp,0);
524 if (error)
525 return(error);
526
527 if ((fp->f_flag & FWRITE) == 0) {
528 error = EBADF;
529 } else if (FP_ISGUARDED(fp, GUARD_WRITE)) {
530 proc_fdlock(p);
531 error = fp_guard_exception(p, fd, fp, kGUARD_EXC_WRITE);
532 proc_fdunlock(p);
533 } else {
534 struct vfs_context context = *vfs_context_current();
535 context.vc_ucred = fp->f_fglob->fg_cred;
536
537 if (fp->f_type != DTYPE_VNODE) {
538 error = ESPIPE;
539 goto errout;
540 }
541 vp = (vnode_t)fp->f_fglob->fg_data;
542 if (vnode_isfifo(vp)) {
543 error = ESPIPE;
544 goto errout;
545 }
546 if ((vp->v_flag & VISTTY)) {
547 error = ENXIO;
548 goto errout;
549 }
550 if (uap->offset == (off_t)-1) {
551 error = EINVAL;
552 goto errout;
553 }
554
555 error = dofilewrite(&context, fp, uap->buf, uap->nbyte,
556 uap->offset, FOF_OFFSET, retval);
557 wrote_some = *retval > 0;
558 }
559errout:
560 if (wrote_some)
561 fp_drop_written(p, fd, fp);
562 else
563 fp_drop(p, fd, fp, 0);
564
565 KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pwrite) | DBG_FUNC_NONE),
566 uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
567
568 return(error);
569}
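/*
 * Illustrative userspace sketch (not kernel code): like pread(2),
 * pwrite(2) writes at an explicit offset (FOF_OFFSET) without moving
 * the descriptor's file offset; an offset of -1 is rejected with EINVAL
 * above.  The values are made up.
 *
 *     ssize_t n = pwrite(fd, &rec, sizeof(rec), (off_t)idx * (off_t)sizeof(rec));
 */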
570
571/*
572 * Returns: 0 Success
573 * EINVAL
574 * <fo_write>:EPIPE
575 * <fo_write>:??? [indirect through struct fileops]
576 */
577__private_extern__ int
578dofilewrite(vfs_context_t ctx, struct fileproc *fp,
579 user_addr_t bufp, user_size_t nbyte, off_t offset, int flags,
580 user_ssize_t *retval)
581{
582 uio_t auio;
583 long error = 0;
584 user_ssize_t bytecnt;
585 char uio_buf[ UIO_SIZEOF(1) ];
586
587 if (nbyte > INT_MAX) {
588 *retval = 0;
589 return (EINVAL);
590 }
591
592 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
593 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_WRITE,
594 &uio_buf[0], sizeof(uio_buf));
595 } else {
596 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_WRITE,
597 &uio_buf[0], sizeof(uio_buf));
598 }
599 uio_addiov(auio, bufp, nbyte);
600
601 bytecnt = nbyte;
602 if ((error = fo_write(fp, auio, flags, ctx))) {
603 if (uio_resid(auio) != bytecnt && (error == ERESTART ||
604 error == EINTR || error == EWOULDBLOCK))
605 error = 0;
606 /* The socket layer handles SIGPIPE */
607 if (error == EPIPE && fp->f_type != DTYPE_SOCKET &&
608 (fp->f_fglob->fg_lflags & FG_NOSIGPIPE) == 0) {
609 /* XXX Raise the signal on the thread? */
610 psignal(vfs_context_proc(ctx), SIGPIPE);
611 }
612 }
613 bytecnt -= uio_resid(auio);
614 *retval = bytecnt;
615
616 return (error);
617}
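/*
 * Illustrative userspace sketch (not kernel code): a write to a pipe
 * with no reader delivers SIGPIPE (raised just above for non-socket
 * descriptors), which terminates a process that has not handled it.
 * Callers that prefer to see EPIPE as a plain error typically either
 * ignore the signal or mark the descriptor with F_SETNOSIGPIPE, which
 * is assumed here to map to the FG_NOSIGPIPE flag checked above.
 *
 *     signal(SIGPIPE, SIG_IGN);           // process-wide
 *     fcntl(fd, F_SETNOSIGPIPE, 1);       // or per descriptor
 *     if (write(fd, buf, len) < 0 && errno == EPIPE)
 *         ;  // reader went away
 */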
618
619/*
620 * Gather write system call
621 */
622int
623writev(struct proc *p, struct writev_args *uap, user_ssize_t *retval)
624{
625 __pthread_testcancel(1);
626 return(writev_nocancel(p, (struct writev_nocancel_args *)uap, retval));
627}
628
629int
630writev_nocancel(struct proc *p, struct writev_nocancel_args *uap, user_ssize_t *retval)
631{
632 uio_t auio = NULL;
633 int error;
634 struct fileproc *fp;
635 struct user_iovec *iovp;
636 bool wrote_some = false;
637
638 AUDIT_ARG(fd, uap->fd);
639
640 /* Verify range before calling uio_create() */
641 if (uap->iovcnt <= 0 || uap->iovcnt > UIO_MAXIOV)
642 return (EINVAL);
643
644 /* allocate a uio large enough to hold the number of iovecs passed */
645 auio = uio_create(uap->iovcnt, 0,
646 (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
647 UIO_WRITE);
648
649 /* get location of iovecs within the uio. then copyin the iovecs from
650 * user space.
651 */
652 iovp = uio_iovsaddr(auio);
653 if (iovp == NULL) {
654 error = ENOMEM;
655 goto ExitThisRoutine;
656 }
657 error = copyin_user_iovec_array(uap->iovp,
658 IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
659 uap->iovcnt, iovp);
660 if (error) {
661 goto ExitThisRoutine;
662 }
663
664 /* finalize uio_t for use and do the IO
665 */
666 error = uio_calculateresid(auio);
667 if (error) {
668 goto ExitThisRoutine;
669 }
670
671 error = fp_lookup(p, uap->fd, &fp, 0);
672 if (error)
673 goto ExitThisRoutine;
674
675 if ((fp->f_flag & FWRITE) == 0) {
676 error = EBADF;
677 } else if (FP_ISGUARDED(fp, GUARD_WRITE)) {
678 proc_fdlock(p);
679 error = fp_guard_exception(p, uap->fd, fp, kGUARD_EXC_WRITE);
680 proc_fdunlock(p);
681 } else {
682 error = wr_uio(p, fp, auio, retval);
683 wrote_some = *retval > 0;
684 }
685
686 if (wrote_some)
687 fp_drop_written(p, uap->fd, fp);
688 else
689 fp_drop(p, uap->fd, fp, 0);
690
691ExitThisRoutine:
692 if (auio != NULL) {
693 uio_free(auio);
694 }
695 return (error);
696}
697
698
699int
700wr_uio(struct proc *p, struct fileproc *fp, uio_t uio, user_ssize_t *retval)
701{
702 int error;
703 user_ssize_t count;
704 struct vfs_context context = *vfs_context_current();
705
706 count = uio_resid(uio);
707
708 context.vc_ucred = fp->f_cred;
709 error = fo_write(fp, uio, 0, &context);
710 if (error) {
711 if (uio_resid(uio) != count && (error == ERESTART ||
712 error == EINTR || error == EWOULDBLOCK))
713 error = 0;
714 /* The socket layer handles SIGPIPE */
715 if (error == EPIPE && fp->f_type != DTYPE_SOCKET &&
716 (fp->f_fglob->fg_lflags & FG_NOSIGPIPE) == 0)
717 psignal(p, SIGPIPE);
718 }
719 *retval = count - uio_resid(uio);
720
721 return(error);
722}
723
724
725int
726rd_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval)
727{
728 struct fileproc *fp;
729 int error;
730 user_ssize_t count;
731 struct vfs_context context = *vfs_context_current();
732
733 if ( (error = preparefileread(p, &fp, fdes, 0)) )
734 return (error);
735
736 count = uio_resid(uio);
737
738 context.vc_ucred = fp->f_cred;
739
740 error = fo_read(fp, uio, 0, &context);
741
742 if (error) {
743 if (uio_resid(uio) != count && (error == ERESTART ||
744 error == EINTR || error == EWOULDBLOCK))
745 error = 0;
746 }
747 *retval = count - uio_resid(uio);
748
749 donefileread(p, fp, fdes);
750
751 return (error);
752}
753
754/*
755 * Ioctl system call
756 *
757 * Returns: 0 Success
758 * EBADF
759 * ENOTTY
760 * ENOMEM
761 * ESRCH
762 * copyin:EFAULT
763 * copyout:EFAULT
764 * fp_lookup:EBADF Bad file descriptor
765 * fo_ioctl:???
766 */
767int
768ioctl(struct proc *p, struct ioctl_args *uap, __unused int32_t *retval)
769{
770 struct fileproc *fp = NULL;
771 int error = 0;
772 u_int size = 0;
773 caddr_t datap = NULL, memp = NULL;
774 boolean_t is64bit = FALSE;
775 int tmp = 0;
776#define STK_PARAMS 128
777 char stkbuf[STK_PARAMS] = {};
778 int fd = uap->fd;
779 u_long com = uap->com;
780 struct vfs_context context = *vfs_context_current();
781
782 AUDIT_ARG(fd, uap->fd);
783 AUDIT_ARG(addr, uap->data);
784
785 is64bit = proc_is64bit(p);
786#if CONFIG_AUDIT
787 if (is64bit)
788 AUDIT_ARG(value64, com);
789 else
790 AUDIT_ARG(cmd, CAST_DOWN_EXPLICIT(int, com));
791#endif /* CONFIG_AUDIT */
792
793 /*
794 * Interpret high order word to find amount of data to be
795 * copied to/from the user's address space.
796 */
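 /*
 * For illustration (a sketch of the usual BSD _IO* encoding from
 * sys/ioccom.h): the top bits of the command word carry the direction
 * and the next 13 bits carry the parameter length, so for a command
 * defined as, e.g., _IOW('f', 126, int) (FIONBIO):
 *
 *     IOCPARM_LEN(com) == sizeof(int)   // bytes to copy in
 *     (com & IOC_IN)   != 0             // data flows user -> kernel
 *     (com & IOC_OUT)  == 0             // nothing copied back out
 */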
797 size = IOCPARM_LEN(com);
798 if (size > IOCPARM_MAX)
799 return ENOTTY;
800 if (size > sizeof (stkbuf)) {
801 if ((memp = (caddr_t)kalloc(size)) == 0)
802 return ENOMEM;
803 datap = memp;
804 } else
805 datap = &stkbuf[0];
806 if (com & IOC_IN) {
807 if (size) {
808 error = copyin(uap->data, datap, size);
809 if (error)
810 goto out_nofp;
811 } else {
812 /* XXX - IOC_IN and no size? we should probably return an error here!! */
813 if (is64bit) {
814 *(user_addr_t *)datap = uap->data;
815 }
816 else {
817 *(uint32_t *)datap = (uint32_t)uap->data;
818 }
819 }
820 } else if ((com & IOC_OUT) && size)
821 /*
822 * Zero the buffer so the user always
823 * gets back something deterministic.
824 */
825 bzero(datap, size);
826 else if (com & IOC_VOID) {
827 /* XXX - this is odd since IOC_VOID means no parameters */
828 if (is64bit) {
829 *(user_addr_t *)datap = uap->data;
830 }
831 else {
832 *(uint32_t *)datap = (uint32_t)uap->data;
833 }
834 }
835
836 proc_fdlock(p);
837 error = fp_lookup(p,fd,&fp,1);
838 if (error) {
839 proc_fdunlock(p);
840 goto out_nofp;
841 }
842
843 AUDIT_ARG(file, p, fp);
844
845 if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
846 error = EBADF;
847 goto out;
848 }
849
850 context.vc_ucred = fp->f_fglob->fg_cred;
851
852#if CONFIG_MACF
853 error = mac_file_check_ioctl(context.vc_ucred, fp->f_fglob, com);
854 if (error)
855 goto out;
856#endif
857
858 switch (com) {
859 case FIONCLEX:
860 *fdflags(p, fd) &= ~UF_EXCLOSE;
861 break;
862
863 case FIOCLEX:
864 *fdflags(p, fd) |= UF_EXCLOSE;
865 break;
866
867 case FIONBIO:
868 if ( (tmp = *(int *)datap) )
869 fp->f_flag |= FNONBLOCK;
870 else
871 fp->f_flag &= ~FNONBLOCK;
872 error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, &context);
873 break;
874
875 case FIOASYNC:
876 if ( (tmp = *(int *)datap) )
877 fp->f_flag |= FASYNC;
878 else
879 fp->f_flag &= ~FASYNC;
880 error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, &context);
881 break;
882
883 case FIOSETOWN:
884 tmp = *(int *)datap;
885 if (fp->f_type == DTYPE_SOCKET) {
886 ((struct socket *)fp->f_data)->so_pgid = tmp;
887 break;
888 }
889 if (fp->f_type == DTYPE_PIPE) {
890 error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, &context);
891 break;
892 }
893 if (tmp <= 0) {
894 tmp = -tmp;
895 } else {
896 struct proc *p1 = proc_find(tmp);
897 if (p1 == 0) {
898 error = ESRCH;
899 break;
900 }
901 tmp = p1->p_pgrpid;
902 proc_rele(p1);
903 }
904 error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, &context);
905 break;
906
907 case FIOGETOWN:
908 if (fp->f_type == DTYPE_SOCKET) {
909 *(int *)datap = ((struct socket *)fp->f_data)->so_pgid;
910 break;
911 }
912 error = fo_ioctl(fp, TIOCGPGRP, datap, &context);
913 *(int *)datap = -*(int *)datap;
914 break;
915
916 default:
917 error = fo_ioctl(fp, com, datap, &context);
918 /*
919 * Copy any data to user, size was
920 * already set and checked above.
921 */
922 if (error == 0 && (com & IOC_OUT) && size)
923 error = copyout(datap, uap->data, (u_int)size);
924 break;
925 }
926out:
927 fp_drop(p, fd, fp, 1);
928 proc_fdunlock(p);
929
930out_nofp:
931 if (memp)
932 kfree(memp, size);
933 return(error);
934}
935
936int selwait, nselcoll;
937#define SEL_FIRSTPASS 1
938#define SEL_SECONDPASS 2
939extern int selcontinue(int error);
940extern int selprocess(int error, int sel_pass);
941static int selscan(struct proc *p, struct _select * sel, struct _select_data * seldata,
942 int nfd, int32_t *retval, int sel_pass, struct waitq_set *wqset);
943static int selcount(struct proc *p, u_int32_t *ibits, int nfd, int *count);
944static int seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wakeup, int fromselcount);
945static int seldrop(struct proc *p, u_int32_t *ibits, int nfd);
946static int select_internal(struct proc *p, struct select_nocancel_args *uap, uint64_t timeout, int32_t *retval);
947
948/*
949 * Select system call.
950 *
951 * Returns: 0 Success
952 * EINVAL Invalid argument
953 * EAGAIN Nonconformant error if allocation fails
954 */
955int
956select(struct proc *p, struct select_args *uap, int32_t *retval)
957{
958 __pthread_testcancel(1);
959 return select_nocancel(p, (struct select_nocancel_args *)uap, retval);
960}
961
962int
963select_nocancel(struct proc *p, struct select_nocancel_args *uap, int32_t *retval)
964{
965 uint64_t timeout = 0;
966
967 if (uap->tv) {
968 int err;
969 struct timeval atv;
970 if (IS_64BIT_PROCESS(p)) {
971 struct user64_timeval atv64;
972 err = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64));
973 /* Loses resolution - assume timeout < 68 years */
974 atv.tv_sec = atv64.tv_sec;
975 atv.tv_usec = atv64.tv_usec;
976 } else {
977 struct user32_timeval atv32;
978 err = copyin(uap->tv, (caddr_t)&atv32, sizeof(atv32));
979 atv.tv_sec = atv32.tv_sec;
980 atv.tv_usec = atv32.tv_usec;
981 }
982 if (err)
983 return err;
984
985 if (itimerfix(&atv)) {
986 err = EINVAL;
987 return err;
988 }
989
990 clock_absolutetime_interval_to_deadline(tvtoabstime(&atv), &timeout);
991 }
992
993 return select_internal(p, uap, timeout, retval);
994}
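/*
 * Illustrative userspace sketch (not kernel code): the timeval copied
 * in above is converted to an absolute deadline before select_internal()
 * runs, so a zero-valued (but non-NULL) timeout polls while a NULL
 * timeout blocks indefinitely.  The descriptor is hypothetical.
 *
 *     fd_set rfds;
 *     struct timeval tv = { .tv_sec = 2, .tv_usec = 0 };
 *     FD_ZERO(&rfds);
 *     FD_SET(sock, &rfds);
 *     int n = select(sock + 1, &rfds, NULL, NULL, &tv);
 *     if (n > 0 && FD_ISSET(sock, &rfds))
 *         ;  // sock is readable
 */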
995
996int
997pselect(struct proc *p, struct pselect_args *uap, int32_t *retval)
998{
999 __pthread_testcancel(1);
1000 return pselect_nocancel(p, (struct pselect_nocancel_args *)uap, retval);
1001}
1002
1003int
1004pselect_nocancel(struct proc *p, struct pselect_nocancel_args *uap, int32_t *retval)
1005{
1006 int err;
1007 struct uthread *ut;
1008 uint64_t timeout = 0;
1009
1010 if (uap->ts) {
1011 struct timespec ts;
1012
1013 if (IS_64BIT_PROCESS(p)) {
1014 struct user64_timespec ts64;
1015 err = copyin(uap->ts, (caddr_t)&ts64, sizeof(ts64));
1016 ts.tv_sec = ts64.tv_sec;
1017 ts.tv_nsec = ts64.tv_nsec;
1018 } else {
1019 struct user32_timespec ts32;
1020 err = copyin(uap->ts, (caddr_t)&ts32, sizeof(ts32));
1021 ts.tv_sec = ts32.tv_sec;
1022 ts.tv_nsec = ts32.tv_nsec;
1023 }
1024 if (err) {
1025 return err;
1026 }
1027
1028 if (!timespec_is_valid(&ts)) {
1029 return EINVAL;
1030 }
1031 clock_absolutetime_interval_to_deadline(tstoabstime(&ts), &timeout);
1032 }
1033
1034 ut = get_bsdthread_info(current_thread());
1035
1036 if (uap->mask != USER_ADDR_NULL) {
1037 /* save current mask, then copyin and set new mask */
1038 sigset_t newset;
1039 err = copyin(uap->mask, &newset, sizeof(sigset_t));
1040 if (err) {
1041 return err;
1042 }
1043 ut->uu_oldmask = ut->uu_sigmask;
1044 ut->uu_flag |= UT_SAS_OLDMASK;
1045 ut->uu_sigmask = (newset & ~sigcantmask);
1046 }
1047
1048 err = select_internal(p, (struct select_nocancel_args *)uap, timeout, retval);
1049
1050 if (err != EINTR && ut->uu_flag & UT_SAS_OLDMASK) {
1051 /*
1052 * Restore old mask (direct return case). NOTE: EINTR can also be returned
1053 * if the thread is cancelled. In that case, we don't reset the signal
1054 * mask to its original value (which usually happens in the signal
1055 * delivery path). This behavior is permitted by POSIX.
1056 */
1057 ut->uu_sigmask = ut->uu_oldmask;
1058 ut->uu_oldmask = 0;
1059 ut->uu_flag &= ~UT_SAS_OLDMASK;
1060 }
1061
1062 return err;
1063}
1064
1065/*
1066 * Generic implementation of {,p}select. Care: we type-pun uap across the two
1067 * syscalls, which differ slightly. The first 4 arguments (nfds and the fd sets)
1068 * are identical. The 5th (timeout) argument points to different types, so we
1069 * unpack in the syscall-specific code, but the generic code still does a null
1070 * check on this argument to determine if a timeout was specified.
1071 */
1072static int
1073select_internal(struct proc *p, struct select_nocancel_args *uap, uint64_t timeout, int32_t *retval)
1074{
1075 int error = 0;
1076 u_int ni, nw;
1077 thread_t th_act;
1078 struct uthread *uth;
1079 struct _select *sel;
1080 struct _select_data *seldata;
1081 int needzerofill = 1;
1082 int count = 0;
1083 size_t sz = 0;
1084
1085 th_act = current_thread();
1086 uth = get_bsdthread_info(th_act);
1087 sel = &uth->uu_select;
1088 seldata = &uth->uu_save.uus_select_data;
1089 *retval = 0;
1090
1091 seldata->args = uap;
1092 seldata->retval = retval;
1093 seldata->wqp = NULL;
1094 seldata->count = 0;
1095
1096 if (uap->nd < 0) {
1097 return (EINVAL);
1098 }
1099
1100 /* select on thread of process that already called proc_exit() */
1101 if (p->p_fd == NULL) {
1102 return (EBADF);
1103 }
1104
1105 if (uap->nd > p->p_fd->fd_nfiles)
1106 uap->nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */
1107
1108 nw = howmany(uap->nd, NFDBITS);
1109 ni = nw * sizeof(fd_mask);
1110
1111 /*
1112 * if the previously allocated space for the bits is smaller than
1113 * what is requested or no space has yet been allocated for this
1114 * thread, allocate enough space now.
1115 *
1116 * Note: If this allocation fails, select() will return EAGAIN; this
1117 * is the same thing poll() returns in a no-memory situation, but
1118 * it is not a POSIX-compliant error code for select().
1119 */
1120 if (sel->nbytes < (3 * ni)) {
1121 int nbytes = 3 * ni;
1122
1123 /* Free previous allocation, if any */
1124 if (sel->ibits != NULL)
1125 FREE(sel->ibits, M_TEMP);
1126 if (sel->obits != NULL) {
1127 FREE(sel->obits, M_TEMP);
1128 /* NULL out; subsequent ibits allocation may fail */
1129 sel->obits = NULL;
1130 }
1131
1132 MALLOC(sel->ibits, u_int32_t *, nbytes, M_TEMP, M_WAITOK | M_ZERO);
1133 if (sel->ibits == NULL)
1134 return (EAGAIN);
1135 MALLOC(sel->obits, u_int32_t *, nbytes, M_TEMP, M_WAITOK | M_ZERO);
1136 if (sel->obits == NULL) {
1137 FREE(sel->ibits, M_TEMP);
1138 sel->ibits = NULL;
1139 return (EAGAIN);
1140 }
1141 sel->nbytes = nbytes;
1142 needzerofill = 0;
1143 }
1144
1145 if (needzerofill) {
1146 bzero((caddr_t)sel->ibits, sel->nbytes);
1147 bzero((caddr_t)sel->obits, sel->nbytes);
1148 }
1149
1150 /*
1151 * get the bits from the user address space
1152 */
1153#define getbits(name, x) \
1154 do { \
1155 if (uap->name && (error = copyin(uap->name, \
1156 (caddr_t)&sel->ibits[(x) * nw], ni))) \
1157 goto continuation; \
1158 } while (0)
1159
1160 getbits(in, 0);
1161 getbits(ou, 1);
1162 getbits(ex, 2);
1163#undef getbits
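 /*
 * Rough sketch of the bit-vector layout used from here on (nw 32-bit
 * words per set, laid out back to back):
 *
 *     ibits: [ read set | write set | except set ]   copied in above
 *     obits: [ read set | write set | except set ]   copied out on return
 */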
1164
1165 seldata->abstime = timeout;
1166
1167 if ( (error = selcount(p, sel->ibits, uap->nd, &count)) ) {
1168 goto continuation;
1169 }
1170
1171 /*
1172 * We need an array of waitq pointers. This is due to the new way
1173 * in which waitqs are linked to sets. When a thread selects on a
1174 * file descriptor, a waitq (embedded in a selinfo structure) is
1175 * added to the thread's local waitq set. There is no longer any
1176 * way to directly iterate over all members of a given waitq set.
1177 * The process of linking a waitq into a set may allocate a link
1178 * table object. Because we can't iterate over all the waitqs to
1179 * which our thread waitq set belongs, we need a way of removing
1180 * this link object!
1181 *
1182 * Thus we need a buffer which will hold one waitq pointer
1183 * per FD being selected. During the tear-down phase we can use
1184 * these pointers to dis-associate the underlying selinfo's waitq
1185 * from our thread's waitq set.
1186 *
1187 * Because we also need to allocate a waitq set for this thread,
1188 * we use a bare buffer pointer to hold all the memory. Note that
1189 * this memory is cached in the thread pointer and not reaped until
1190 * the thread exists. This is generally OK because threads that
1191 * the thread exits. This is generally OK because threads that
1192 */
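 /*
 * A rough sketch of the single buffer carved out below (cached on the
 * uthread and reused across calls):
 *
 *     uth->uu_wqset --> +------------------------------+
 *                       | struct waitq_set (ALIGNed)   |
 *     seldata->wqp  --> +------------------------------+
 *                       | uint64_t wqp[0 .. count-1]   |  prepost IDs, one
 *                       +------------------------------+  per selected fd
 */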
1193 sz = ALIGN(sizeof(struct waitq_set)) + (count * sizeof(uint64_t));
1194 if (sz > uth->uu_wqstate_sz) {
1195 /* (re)allocate a buffer to hold waitq pointers */
1196 if (uth->uu_wqset) {
1197 if (waitq_set_is_valid(uth->uu_wqset))
1198 waitq_set_deinit(uth->uu_wqset);
1199 FREE(uth->uu_wqset, M_SELECT);
1200 } else if (uth->uu_wqstate_sz && !uth->uu_wqset)
1201 panic("select: thread structure corrupt! "
1202 "uu_wqstate_sz:%ld, wqstate_buf == NULL",
1203 uth->uu_wqstate_sz);
1204 uth->uu_wqstate_sz = sz;
1205 MALLOC(uth->uu_wqset, struct waitq_set *, sz, M_SELECT, M_WAITOK);
1206 if (!uth->uu_wqset)
1207 panic("can't allocate %ld bytes for wqstate buffer",
1208 uth->uu_wqstate_sz);
1209 waitq_set_init(uth->uu_wqset,
1210 SYNC_POLICY_FIFO|SYNC_POLICY_PREPOST, NULL, NULL);
1211 }
1212
1213 if (!waitq_set_is_valid(uth->uu_wqset))
1214 waitq_set_init(uth->uu_wqset,
1215 SYNC_POLICY_FIFO|SYNC_POLICY_PREPOST, NULL, NULL);
1216
1217 /* the last chunk of our buffer is an array of waitq pointers */
1218 seldata->wqp = (uint64_t *)((char *)(uth->uu_wqset) + ALIGN(sizeof(struct waitq_set)));
1219 bzero(seldata->wqp, sz - ALIGN(sizeof(struct waitq_set)));
1220
1221 seldata->count = count;
1222
1223continuation:
1224
1225 if (error) {
1226 /*
1227 * We have already cleaned up any state we established,
1228 * either locally or as a result of selcount(). We don't
1229 * need to wait_subqueue_unlink_all(), since we haven't set
1230 * anything at this point.
1231 */
1232 return (error);
1233 }
1234
1235 return selprocess(0, SEL_FIRSTPASS);
1236}
1237
1238int
1239selcontinue(int error)
1240{
1241 return selprocess(error, SEL_SECONDPASS);
1242}
1243
1244
1245/*
1246 * selprocess
1247 *
1248 * Parameters: error The error code from our caller
1249 * sel_pass The pass we are on
1250 */
1251int
1252selprocess(int error, int sel_pass)
1253{
1254 int ncoll;
1255 u_int ni, nw;
1256 thread_t th_act;
1257 struct uthread *uth;
1258 struct proc *p;
1259 struct select_nocancel_args *uap;
1260 int *retval;
1261 struct _select *sel;
1262 struct _select_data *seldata;
1263 int unwind = 1;
1264 int prepost = 0;
1265 int somewakeup = 0;
1266 int doretry = 0;
1267 wait_result_t wait_result;
1268
1269 p = current_proc();
1270 th_act = current_thread();
1271 uth = get_bsdthread_info(th_act);
1272 sel = &uth->uu_select;
1273 seldata = &uth->uu_save.uus_select_data;
1274 uap = seldata->args;
1275 retval = seldata->retval;
1276
1277 if ((error != 0) && (sel_pass == SEL_FIRSTPASS))
1278 unwind = 0;
1279 if (seldata->count == 0)
1280 unwind = 0;
1281retry:
1282 if (error != 0)
1283 goto done;
1284
1285 ncoll = nselcoll;
1286 OSBitOrAtomic(P_SELECT, &p->p_flag);
1287
1288 /* skip scans if the select is just for timeouts */
1289 if (seldata->count) {
1290 error = selscan(p, sel, seldata, uap->nd, retval, sel_pass, uth->uu_wqset);
1291 if (error || *retval) {
1292 goto done;
1293 }
1294 if (prepost || somewakeup) {
1295 /*
1296 * if the select was woken early, we may discover that someone
1297 * else has already consumed the data; go back and select
1298 * again if time permits
1299 */
1300 prepost = 0;
1301 somewakeup = 0;
1302 doretry = 1;
1303 }
1304 }
1305
1306 if (uap->tv) {
1307 uint64_t now;
1308
1309 clock_get_uptime(&now);
1310 if (now >= seldata->abstime)
1311 goto done;
1312 }
1313
1314 if (doretry) {
1315 /* cleanup obits and try again */
1316 doretry = 0;
1317 sel_pass = SEL_FIRSTPASS;
1318 goto retry;
1319 }
1320
1321 /*
1322 * To effect a poll, the timeout argument should be
1323 * non-nil, pointing to a zero-valued timeval structure.
1324 */
1325 if (uap->tv && seldata->abstime == 0) {
1326 goto done;
1327 }
1328
1329 /* No spurious wakeups due to collisions, no need to check for them */
1330 if ((sel_pass == SEL_SECONDPASS) || ((p->p_flag & P_SELECT) == 0)) {
1331 sel_pass = SEL_FIRSTPASS;
1332 goto retry;
1333 }
1334
1335 OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
1336
1337 /* if the select is just for timeout skip check */
1338 if (seldata->count && (sel_pass == SEL_SECONDPASS))
1339 panic("selprocess: 2nd pass assertwaiting");
1340
1341 /* waitq_set has waitqueue as first element */
1342 wait_result = waitq_assert_wait64_leeway((struct waitq *)uth->uu_wqset,
1343 NO_EVENT64, THREAD_ABORTSAFE,
1344 TIMEOUT_URGENCY_USER_NORMAL,
1345 seldata->abstime,
1346 TIMEOUT_NO_LEEWAY);
1347 if (wait_result != THREAD_AWAKENED) {
1348 /* there are no preposted events */
1349 error = tsleep1(NULL, PSOCK | PCATCH,
1350 "select", 0, selcontinue);
1351 } else {
1352 prepost = 1;
1353 error = 0;
1354 }
1355
1356 if (error == 0) {
1357 sel_pass = SEL_SECONDPASS;
1358 if (!prepost)
1359 somewakeup = 1;
1360 goto retry;
1361 }
1362done:
1363 if (unwind) {
1364 seldrop(p, sel->ibits, uap->nd);
1365 waitq_set_deinit(uth->uu_wqset);
1366 /*
1367 * zero out the waitq pointer array to avoid use-after-free
1368 * errors in the selcount error path (seldrop_locked) if/when
1369 * the thread re-calls select().
1370 */
1371 bzero((void *)uth->uu_wqset, uth->uu_wqstate_sz);
1372 }
1373 OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
1374 /* select is not restarted after signals... */
1375 if (error == ERESTART)
1376 error = EINTR;
1377 if (error == EWOULDBLOCK)
1378 error = 0;
1379 nw = howmany(uap->nd, NFDBITS);
1380 ni = nw * sizeof(fd_mask);
1381
1382#define putbits(name, x) \
1383 do { \
1384 if (uap->name && (error2 = \
1385 copyout((caddr_t)&sel->obits[(x) * nw], uap->name, ni))) \
1386 error = error2; \
1387 } while (0)
1388
1389 if (error == 0) {
1390 int error2;
1391
1392 putbits(in, 0);
1393 putbits(ou, 1);
1394 putbits(ex, 2);
1395#undef putbits
1396 }
1397
1398 if (error != EINTR && sel_pass == SEL_SECONDPASS && uth->uu_flag & UT_SAS_OLDMASK) {
1399 /* restore signal mask - continuation case */
1400 uth->uu_sigmask = uth->uu_oldmask;
1401 uth->uu_oldmask = 0;
1402 uth->uu_flag &= ~UT_SAS_OLDMASK;
1403 }
1404
1405 return(error);
1406}
1407
1408
1409/**
1410 * remove the fileproc's underlying waitq from the supplied waitq set;
1411 * clear FP_INSELECT when appropriate
1412 *
1413 * Parameters:
1414 * fp File proc that is potentially currently in select
1415 * wqset Waitq set to which the fileproc may belong
1416 * (usually this is the thread's private waitq set)
1417 * Conditions:
1418 * proc_fdlock is held
1419 */
1420static void selunlinkfp(struct fileproc *fp, uint64_t wqp_id, struct waitq_set *wqset)
1421{
1422 int valid_set = waitq_set_is_valid(wqset);
1423 int valid_q = !!wqp_id;
1424
1425 /*
1426 * This could be called (from selcount error path) before we setup
1427 * the thread's wqset. Check the wqset passed in, and only unlink if
1428 * the set is valid.
1429 */
1430
1431 /* unlink the underlying waitq from the input set (thread waitq set) */
1432 if (valid_q && valid_set)
1433 waitq_unlink_by_prepost_id(wqp_id, wqset);
1434
1435 /* allow passing a NULL/invalid fp for seldrop unwind */
1436 if (!fp || !(fp->f_flags & (FP_INSELECT|FP_SELCONFLICT)))
1437 return;
1438
1439 /*
1440 * We can always remove the conflict queue from our thread's set: this
1441 * will not affect other threads that potentially need to be awoken on
1442 * the conflict queue during a fileproc_drain - those sets will still
1443 * be linked with the global conflict queue, and the last waiter
1444 * on the fp clears the CONFLICT marker.
1445 */
1446 if (valid_set && (fp->f_flags & FP_SELCONFLICT))
1447 waitq_unlink(&select_conflict_queue, wqset);
1448
1449 /* jca: TODO:
1450 * This isn't quite right - we don't actually know if this
1451 * fileproc is in another select or not! Here we just assume
1452 * that if we were the first thread to select on the FD, then
1453 * we'll be the one to clear this flag...
1454 */
1455 if (valid_set && fp->f_wset == (void *)wqset) {
1456 fp->f_flags &= ~FP_INSELECT;
1457 fp->f_wset = NULL;
1458 }
1459}
1460
1461/**
1462 * connect a fileproc to the given wqset, potentially bridging to a waitq
1463 * pointed to indirectly by wq_data
1464 *
1465 * Parameters:
1466 * fp File proc potentially currently in select
1467 * wq_data Pointer to a pointer to a waitq (could be NULL)
1468 * wqset Waitq set to which the fileproc should now belong
1469 * (usually this is the thread's private waitq set)
1470 *
1471 * Conditions:
1472 * proc_fdlock is held
1473 */
1474static uint64_t sellinkfp(struct fileproc *fp, void **wq_data, struct waitq_set *wqset)
1475{
1476 struct waitq *f_wq = NULL;
1477
1478 if ((fp->f_flags & FP_INSELECT) != FP_INSELECT) {
1479 if (wq_data)
1480 panic("non-null data:%p on fp:%p not in select?!"
1481 "(wqset:%p)", wq_data, fp, wqset);
1482 return 0;
1483 }
1484
1485 if ((fp->f_flags & FP_SELCONFLICT) == FP_SELCONFLICT) {
1486 waitq_link(&select_conflict_queue, wqset, WAITQ_SHOULD_LOCK, NULL);
1487 }
1488
1489 /*
1490 * The wq_data parameter has potentially been set by selrecord called
1491 * from a subsystem's fo_select() function. If the subsystem does not
1492 * call selrecord(), then wq_data will be NULL.
1493 *
1494 * Use memcpy to get the value into a proper pointer because
1495 * wq_data most likely points to a stack variable that could be
1496 * unaligned on 32-bit systems.
1497 */
1498 if (wq_data) {
1499 memcpy(&f_wq, wq_data, sizeof(f_wq));
1500 if (!waitq_is_valid(f_wq))
1501 f_wq = NULL;
1502 }
1503
1504 /* record the first thread's wqset in the fileproc structure */
1505 if (!fp->f_wset)
1506 fp->f_wset = (void *)wqset;
1507
1508 /* handles NULL f_wq */
1509 return waitq_get_prepost_id(f_wq);
1510}
1511
1512
1513/*
1514 * selscan
1515 *
1516 * Parameters: p Process performing the select
1517 * sel The per-thread select context structure
1518 * nfd The number of file descriptors to scan
1519 * retval The per thread system call return area
1520 * sel_pass Which pass this is; allowed values are
1521 * SEL_FIRSTPASS and SEL_SECONDPASS
1522 * wqset The per thread wait queue set
1523 *
1524 * Returns: 0 Success
1525 * EIO Invalid p->p_fd field XXX Obsolete?
1526 * EBADF One of the files in the bit vector is
1527 * invalid.
1528 */
1529static int
1530selscan(struct proc *p, struct _select *sel, struct _select_data * seldata,
1531 int nfd, int32_t *retval, int sel_pass, struct waitq_set *wqset)
1532{
1533 struct filedesc *fdp = p->p_fd;
1534 int msk, i, j, fd;
1535 u_int32_t bits;
1536 struct fileproc *fp;
1537 int n = 0; /* count of bits */
1538 int nc = 0; /* bit vector offset (nc'th bit) */
1539 static int flag[3] = { FREAD, FWRITE, 0 };
1540 u_int32_t *iptr, *optr;
1541 u_int nw;
1542 u_int32_t *ibits, *obits;
1543 uint64_t reserved_link, *rl_ptr = NULL;
1544 int count;
1545 struct vfs_context context = *vfs_context_current();
1546
1547 /*
1548 * Problems were seen at reboot due to Mac OS X signal handling
1549 * problems (Beaker1C); verify that p->p_fd is valid
1550 */
1551 if (fdp == NULL) {
1552 *retval=0;
1553 return(EIO);
1554 }
1555 ibits = sel->ibits;
1556 obits = sel->obits;
1557
1558 nw = howmany(nfd, NFDBITS);
1559
1560 count = seldata->count;
1561
1562 nc = 0;
1563 if (!count) {
1564 *retval = 0;
1565 return 0;
1566 }
1567
1568 proc_fdlock(p);
1569 for (msk = 0; msk < 3; msk++) {
1570 iptr = (u_int32_t *)&ibits[msk * nw];
1571 optr = (u_int32_t *)&obits[msk * nw];
1572
1573 for (i = 0; i < nfd; i += NFDBITS) {
1574 bits = iptr[i/NFDBITS];
1575
1576 while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
1577 bits &= ~(1 << j);
1578
1579 if (fd < fdp->fd_nfiles)
1580 fp = fdp->fd_ofiles[fd];
1581 else
1582 fp = NULL;
1583
1584 if (fp == NULL || (fdp->fd_ofileflags[fd] & UF_RESERVED)) {
1585 /*
1586 * If we abort because of a bad
1587 * fd, let the caller unwind...
1588 */
1589 proc_fdunlock(p);
1590 return(EBADF);
1591 }
1592 if (sel_pass == SEL_SECONDPASS) {
1593 reserved_link = 0;
1594 rl_ptr = NULL;
1595 selunlinkfp(fp, seldata->wqp[nc], wqset);
1596 } else {
1597 reserved_link = waitq_link_reserve((struct waitq *)wqset);
1598 rl_ptr = &reserved_link;
1599 if (fp->f_flags & FP_INSELECT)
1600 /* someone is already in select on this fp */
1601 fp->f_flags |= FP_SELCONFLICT;
1602 else
1603 fp->f_flags |= FP_INSELECT;
1604
1605 waitq_set_lazy_init_link(wqset);
1606 }
1607
1608 context.vc_ucred = fp->f_cred;
1609
1610 /*
1611 * stash this value b/c fo_select may replace
1612 * reserved_link with a pointer to a waitq object
1613 */
1614 uint64_t rsvd = reserved_link;
1615
1616 /* The select; set the bit, if true */
1617 if (fp->f_ops && fp->f_type
1618 && fo_select(fp, flag[msk], rl_ptr, &context)) {
1619 optr[fd/NFDBITS] |= (1 << (fd % NFDBITS));
1620 n++;
1621 }
1622 if (sel_pass == SEL_FIRSTPASS) {
1623 waitq_link_release(rsvd);
1624 /*
1625 * If the fp's supporting selinfo structure was linked
1626 * to this thread's waitq set, then 'reserved_link'
1627 * will have been updated by selrecord to be a pointer
1628 * to the selinfo's waitq.
1629 */
1630 if (reserved_link == rsvd)
1631 rl_ptr = NULL; /* fo_select never called selrecord() */
1632 /*
1633 * Hook up the thread's waitq set either to
1634 * the fileproc structure, or to the global
1635 * conflict queue: but only on the first
1636 * select pass.
1637 */
1638 seldata->wqp[nc] = sellinkfp(fp, (void **)rl_ptr, wqset);
1639 }
1640 nc++;
1641 }
1642 }
1643 }
1644 proc_fdunlock(p);
1645
1646 *retval = n;
1647 return (0);
1648}
1649
1650int poll_callback(struct kqueue *, struct kevent_internal_s *, void *);
1651
1652struct poll_continue_args {
1653 user_addr_t pca_fds;
1654 u_int pca_nfds;
1655 u_int pca_rfds;
1656};
1657
1658int
1659poll(struct proc *p, struct poll_args *uap, int32_t *retval)
1660{
1661 __pthread_testcancel(1);
1662 return(poll_nocancel(p, (struct poll_nocancel_args *)uap, retval));
1663}
1664
1665
1666int
1667poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval)
1668{
1669 struct poll_continue_args *cont;
1670 struct pollfd *fds;
1671 struct kqueue *kq;
1672 struct timeval atv;
1673 int ncoll, error = 0;
1674 u_int nfds = uap->nfds;
1675 u_int rfds = 0;
1676 u_int i;
1677 size_t ni;
1678
1679 /*
1680 * This is kinda bogus. We have fd limits, but that is not
1681 * really related to the size of the pollfd array. Make sure
1682 * we let the process use at least FD_SETSIZE entries and at
1683 * least enough for the current limits. We want to be reasonably
1684 * safe, but not overly restrictive.
1685 */
1686 if (nfds > OPEN_MAX ||
1687 (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && (proc_suser(p) || nfds > FD_SETSIZE)))
1688 return (EINVAL);
1689
1690 kq = kqueue_alloc(p, 0);
1691 if (kq == NULL)
1692 return (EAGAIN);
1693
1694 ni = nfds * sizeof(struct pollfd) + sizeof(struct poll_continue_args);
1695 MALLOC(cont, struct poll_continue_args *, ni, M_TEMP, M_WAITOK);
1696 if (NULL == cont) {
1697 error = EAGAIN;
1698 goto out;
1699 }
1700
1701 fds = (struct pollfd *)&cont[1];
1702 error = copyin(uap->fds, fds, nfds * sizeof(struct pollfd));
1703 if (error)
1704 goto out;
1705
1706 if (uap->timeout != -1) {
1707 struct timeval rtv;
1708
1709 atv.tv_sec = uap->timeout / 1000;
1710 atv.tv_usec = (uap->timeout % 1000) * 1000;
1711 if (itimerfix(&atv)) {
1712 error = EINVAL;
1713 goto out;
1714 }
1715 getmicrouptime(&rtv);
1716 timevaladd(&atv, &rtv);
1717 } else {
1718 atv.tv_sec = 0;
1719 atv.tv_usec = 0;
1720 }
1721
1722 /* JMM - all this P_SELECT stuff is bogus */
1723 ncoll = nselcoll;
1724 OSBitOrAtomic(P_SELECT, &p->p_flag);
1725 for (i = 0; i < nfds; i++) {
1726 short events = fds[i].events;
1727 KNOTE_LOCK_CTX(knlc);
1728 __assert_only int rc;
1729
1730 /* per spec, ignore fd values below zero */
1731 if (fds[i].fd < 0) {
1732 fds[i].revents = 0;
1733 continue;
1734 }
1735
1736 /* convert the poll event into a kqueue kevent */
1737 struct kevent_internal_s kev = {
1738 .ident = fds[i].fd,
1739 .flags = EV_ADD | EV_ONESHOT | EV_POLL,
1740 .udata = CAST_USER_ADDR_T(&fds[i]) };
1741
1742 /* Handle input events */
1743 if (events & ( POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND | POLLHUP )) {
1744 kev.filter = EVFILT_READ;
1745 if (events & ( POLLPRI | POLLRDBAND ))
1746 kev.flags |= EV_OOBAND;
1747 rc = kevent_register(kq, &kev, &knlc);
1748 assert((rc & FILTER_REGISTER_WAIT) == 0);
1749 }
1750
1751 /* Handle output events */
1752 if ((kev.flags & EV_ERROR) == 0 &&
1753 (events & ( POLLOUT | POLLWRNORM | POLLWRBAND ))) {
1754 kev.filter = EVFILT_WRITE;
1755 rc = kevent_register(kq, &kev, &knlc);
1756 assert((rc & FILTER_REGISTER_WAIT) == 0);
1757 }
1758
1759 /* Handle BSD extension vnode events */
1760 if ((kev.flags & EV_ERROR) == 0 &&
1761 (events & ( POLLEXTEND | POLLATTRIB | POLLNLINK | POLLWRITE ))) {
1762 kev.filter = EVFILT_VNODE;
1763 kev.fflags = 0;
1764 if (events & POLLEXTEND)
1765 kev.fflags |= NOTE_EXTEND;
1766 if (events & POLLATTRIB)
1767 kev.fflags |= NOTE_ATTRIB;
1768 if (events & POLLNLINK)
1769 kev.fflags |= NOTE_LINK;
1770 if (events & POLLWRITE)
1771 kev.fflags |= NOTE_WRITE;
1772 rc = kevent_register(kq, &kev, &knlc);
1773 assert((rc & FILTER_REGISTER_WAIT) == 0);
1774 }
1775
1776 if (kev.flags & EV_ERROR) {
1777 fds[i].revents = POLLNVAL;
1778 rfds++;
1779 } else
1780 fds[i].revents = 0;
1781 }
1782
1783 /*
1784 * Did we have any trouble registering?
1785 * If user space passed 0 FDs, then respect any timeout value passed.
1786 * This is an extremely inefficient sleep. If user space passed one or
1787 * more FDs, and we had trouble registering _all_ of them, then bail
1788 * out. If a subset of the provided FDs failed to register, then we
1789 * will still call the kqueue_scan function.
1790 */
1791 if (nfds && (rfds == nfds))
1792 goto done;
1793
1794 /*
1795 * If any events have trouble registering, an event has fired and we
1796 * shouldn't wait for events in kqueue_scan -- use the current time as
1797 * the deadline.
1798 */
1799 if (rfds)
1800 getmicrouptime(&atv);
1801
1802 /* scan for, and possibly wait for, the kevents to trigger */
1803 cont->pca_fds = uap->fds;
1804 cont->pca_nfds = nfds;
1805 cont->pca_rfds = rfds;
1806 error = kqueue_scan(kq, poll_callback, NULL, cont, NULL, &atv, p);
1807 rfds = cont->pca_rfds;
1808
1809 done:
1810 OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
1811 /* poll is not restarted after signals... */
1812 if (error == ERESTART)
1813 error = EINTR;
1814 if (error == EWOULDBLOCK)
1815 error = 0;
1816 if (error == 0) {
1817 error = copyout(fds, uap->fds, nfds * sizeof(struct pollfd));
1818 *retval = rfds;
1819 }
1820
1821 out:
1822 if (NULL != cont)
1823 FREE(cont, M_TEMP);
1824
1825 kqueue_dealloc(kq);
1826 return (error);
1827}
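/*
 * Illustrative userspace sketch (not kernel code): each pollfd above is
 * translated into one-shot kevents (EVFILT_READ / EVFILT_WRITE /
 * EVFILT_VNODE), so a call like the one below ends up as a kqueue_scan()
 * with a deadline.  The descriptors are hypothetical.
 *
 *     struct pollfd pfd[2] = {
 *         { .fd = sock, .events = POLLIN  },
 *         { .fd = tty,  .events = POLLOUT },
 *     };
 *     int n = poll(pfd, 2, 1000);          // 1000 ms timeout
 *     if (n > 0 && (pfd[0].revents & POLLIN))
 *         ;  // sock has data to read
 */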
1828
1829int
1830poll_callback(__unused struct kqueue *kq, struct kevent_internal_s *kevp, void *data)
1831{
1832 struct poll_continue_args *cont = (struct poll_continue_args *)data;
1833 struct pollfd *fds = CAST_DOWN(struct pollfd *, kevp->udata);
1834 short prev_revents = fds->revents;
1835 short mask = 0;
1836
1837 /* convert the results back into revents */
1838 if (kevp->flags & EV_EOF)
1839 fds->revents |= POLLHUP;
1840 if (kevp->flags & EV_ERROR)
1841 fds->revents |= POLLERR;
1842
1843 switch (kevp->filter) {
1844 case EVFILT_READ:
1845 if (fds->revents & POLLHUP)
1846 mask = (POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND );
1847 else {
1848 mask = (POLLIN | POLLRDNORM);
1849 if (kevp->flags & EV_OOBAND)
1850 mask |= (POLLPRI | POLLRDBAND);
1851 }
1852 fds->revents |= (fds->events & mask);
1853 break;
1854
1855 case EVFILT_WRITE:
1856 if (!(fds->revents & POLLHUP))
1857 fds->revents |= (fds->events & ( POLLOUT | POLLWRNORM | POLLWRBAND ));
1858 break;
1859
1860 case EVFILT_VNODE:
1861 if (kevp->fflags & NOTE_EXTEND)
1862 fds->revents |= (fds->events & POLLEXTEND);
1863 if (kevp->fflags & NOTE_ATTRIB)
1864 fds->revents |= (fds->events & POLLATTRIB);
1865 if (kevp->fflags & NOTE_LINK)
1866 fds->revents |= (fds->events & POLLNLINK);
1867 if (kevp->fflags & NOTE_WRITE)
1868 fds->revents |= (fds->events & POLLWRITE);
1869 break;
1870 }
1871
1872 if (fds->revents != 0 && prev_revents == 0)
1873 cont->pca_rfds++;
1874
1875 return 0;
1876}
1877
1878int
1879seltrue(__unused dev_t dev, __unused int flag, __unused struct proc *p)
1880{
1881
1882 return (1);
1883}
1884
1885/*
1886 * selcount
1887 *
1888 * Count the number of bits set in the input bit vector, and establish an
1889 * outstanding fp->f_iocount for each of the descriptors which will be in
1890 * use in the select operation.
1891 *
1892 * Parameters: p The process doing the select
1893 * ibits The input bit vector
1894 * nfd The number of fd's in the vector
1895 * countp Pointer to where to store the bit count
1896 *
1897 * Returns: 0 Success
1898 * EIO Bad per process open file table
1899 * EBADF One of the bits in the input bit vector
1900 * references an invalid fd
1901 *
1902 * Implicit: *countp (modified) Count of fd's
1903 *
1904 * Notes: This function is the first pass under the proc_fdlock() that
1905 * permits us to recognize invalid descriptors in the bit vector;
1906 * they may, however, not remain valid through the drop and
1907 * later reacquisition of the proc_fdlock().
1908 */
1909static int
1910selcount(struct proc *p, u_int32_t *ibits, int nfd, int *countp)
1911{
1912 struct filedesc *fdp = p->p_fd;
1913 int msk, i, j, fd;
1914 u_int32_t bits;
1915 struct fileproc *fp;
1916 int n = 0;
1917 u_int32_t *iptr;
1918 u_int nw;
1919 int error=0;
1920 int dropcount;
1921 int need_wakeup = 0;
1922
1923 /*
1924 * Problems were seen at reboot due to Mac OS X signal handling
1925 * problems (Beaker1C); verify that p->p_fd is valid
1926 */
1927 if (fdp == NULL) {
1928 *countp = 0;
1929 return(EIO);
1930 }
1931 nw = howmany(nfd, NFDBITS);
1932
1933 proc_fdlock(p);
1934 for (msk = 0; msk < 3; msk++) {
1935 iptr = (u_int32_t *)&ibits[msk * nw];
1936 for (i = 0; i < nfd; i += NFDBITS) {
1937 bits = iptr[i/NFDBITS];
1938 while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
1939 bits &= ~(1 << j);
1940
1941 if (fd < fdp->fd_nfiles)
1942 fp = fdp->fd_ofiles[fd];
1943 else
1944 fp = NULL;
1945
1946 if (fp == NULL ||
1947 (fdp->fd_ofileflags[fd] & UF_RESERVED)) {
1948 *countp = 0;
1949 error = EBADF;
1950 goto bad;
1951 }
1952 fp->f_iocount++;
1953 n++;
1954 }
1955 }
1956 }
1957 proc_fdunlock(p);
1958
1959 *countp = n;
1960 return (0);
1961
1962bad:
1963 dropcount = 0;
1964
1965 if (n == 0)
1966 goto out;
1967 /* Ignore error return; it's already EBADF */
1968 (void)seldrop_locked(p, ibits, nfd, n, &need_wakeup, 1);
1969
1970out:
1971 proc_fdunlock(p);
1972 if (need_wakeup) {
1973 wakeup(&p->p_fpdrainwait);
1974 }
1975 return(error);
1976}
1977
1978
1979/*
1980 * seldrop_locked
1981 *
1982 * Drop outstanding wait queue references set up during selscan(); drop the
1983 * outstanding per fileproc f_iocount() picked up during the selcount().
1984 *
1985 * Parameters: p Process performing the select
1986 * ibits Input bit vector of fd's
1987 * nfd Number of fd's
1988 * lim Limit to number of vector entries to
1989 * consider, or -1 for "all"
1990 * fromselcount True if called from selcount(); drops are limited to 'lim'
1991 * need_wakeup Pointer to flag to set to do a wakeup
1992 * if f_iocount on any descriptor goes to 0
1993 *
1994 * Returns: 0 Success
1995 * EBADF One or more fds in the bit vector
1996 * were invalid, but the rest
1997 * were successfully dropped
1998 *
1999 * Notes: An fd may become bad while the proc_fdlock() is not held,
2000 * if a multithreaded application closes the fd out from under
2001 * the in progress select. In this case, we still have to
2002 * clean up after the set up on the remaining fds.
2003 */
2004static int
2005seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wakeup, int fromselcount)
2006{
2007 struct filedesc *fdp = p->p_fd;
2008 int msk, i, j, nc, fd;
2009 u_int32_t bits;
2010 struct fileproc *fp;
2011 u_int32_t *iptr;
2012 u_int nw;
2013 int error = 0;
2014 int dropcount = 0;
2015 uthread_t uth = get_bsdthread_info(current_thread());
2016 struct _select_data *seldata;
2017
2018 *need_wakeup = 0;
2019
2020 /*
	 * Problems seen during reboot due to MacOSX signal handling
	 * issues (Beaker1C); verify that p->p_fd is valid
2023 */
2024 if (fdp == NULL) {
2025 return(EIO);
2026 }
2027
2028 nw = howmany(nfd, NFDBITS);
2029 seldata = &uth->uu_save.uus_select_data;
2030
2031 nc = 0;
2032 for (msk = 0; msk < 3; msk++) {
2033 iptr = (u_int32_t *)&ibits[msk * nw];
2034 for (i = 0; i < nfd; i += NFDBITS) {
2035 bits = iptr[i/NFDBITS];
2036 while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
2037 bits &= ~(1 << j);
2038 fp = fdp->fd_ofiles[fd];
2039 /*
2040 * If we've already dropped as many as were
2041 * counted/scanned, then we are done.
2042 */
2043 if ((fromselcount != 0) && (++dropcount > lim))
2044 goto done;
2045
2046 /*
2047 * unlink even potentially NULL fileprocs.
2048 * If the FD was closed from under us, we
				 * still need to clean up the waitq links!
2050 */
2051 selunlinkfp(fp,
2052 seldata->wqp ? seldata->wqp[nc] : 0,
2053 uth->uu_wqset);
2054
2055 nc++;
2056
2057 if (fp == NULL) {
2058 /* skip (now) bad fds */
2059 error = EBADF;
2060 continue;
2061 }
2062
2063 fp->f_iocount--;
2064 if (fp->f_iocount < 0)
2065 panic("f_iocount overdecrement!");
2066
2067 if (fp->f_iocount == 0) {
2068 /*
2069 * The last iocount is responsible for clearing
					 * selconflict flag - even if we didn't set it -
2071 * and is also responsible for waking up anyone
2072 * waiting on iocounts to drain.
2073 */
2074 if (fp->f_flags & FP_SELCONFLICT)
2075 fp->f_flags &= ~FP_SELCONFLICT;
2076 if (p->p_fpdrainwait) {
2077 p->p_fpdrainwait = 0;
2078 *need_wakeup = 1;
2079 }
2080 }
2081 }
2082 }
2083 }
2084done:
2085 return (error);
2086}
2087
2088
2089static int
2090seldrop(struct proc *p, u_int32_t *ibits, int nfd)
2091{
2092 int error;
2093 int need_wakeup = 0;
2094
2095 proc_fdlock(p);
2096 error = seldrop_locked(p, ibits, nfd, nfd, &need_wakeup, 0);
2097 proc_fdunlock(p);
2098 if (need_wakeup) {
2099 wakeup(&p->p_fpdrainwait);
2100 }
2101 return (error);
2102}
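/*
 * Illustrative lifecycle sketch (derived from the routines above, not a new
 * interface): a select(2) pass is bracketed roughly as
 *
 *	selcount(p, ibits, nfd, &count);	// take f_iocount refs under proc_fdlock
 *	...selscan() registers interest; the thread may then block...
 *	seldrop(p, ibits, nfd);			// release refs; wake f_iocount waiters
 *
 * seldrop_locked() is the shared core used both by seldrop() and by the
 * selcount() error path.
 */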
2103
2104/*
2105 * Record a select request.
2106 */
2107void
2108selrecord(__unused struct proc *selector, struct selinfo *sip, void *s_data)
2109{
2110 thread_t cur_act = current_thread();
2111 struct uthread * ut = get_bsdthread_info(cur_act);
2112 /* on input, s_data points to the 64-bit ID of a reserved link object */
2113 uint64_t *reserved_link = (uint64_t *)s_data;
2114
2115 /* need to look at collisions */
2116
	/* do not record if this is the second pass of select */
2118 if (!s_data)
2119 return;
2120
2121 if ((sip->si_flags & SI_INITED) == 0) {
2122 waitq_init(&sip->si_waitq, SYNC_POLICY_FIFO);
2123 sip->si_flags |= SI_INITED;
2124 sip->si_flags &= ~SI_CLEAR;
2125 }
2126
2127 if (sip->si_flags & SI_RECORDED)
2128 sip->si_flags |= SI_COLL;
2129 else
2130 sip->si_flags &= ~SI_COLL;
2131
2132 sip->si_flags |= SI_RECORDED;
2133 /* note: this checks for pre-existing linkage */
2134 waitq_link(&sip->si_waitq, ut->uu_wqset,
2135 WAITQ_SHOULD_LOCK, reserved_link);
2136
2137 /*
2138 * Always consume the reserved link.
2139 * We can always call waitq_link_release() safely because if
2140 * waitq_link is successful, it consumes the link and resets the
2141 * value to 0, in which case our call to release becomes a no-op.
2142 * If waitq_link fails, then the following release call will actually
2143 * release the reserved link object.
2144 */
2145 waitq_link_release(*reserved_link);
2146 *reserved_link = 0;
2147
2148 /*
	 * Use the s_data pointer as an output parameter as well.
	 * This avoids changing the prototype for this function which is
2151 * used by many kexts. We need to surface the waitq object
2152 * associated with the selinfo we just added to the thread's select
2153 * set. New waitq sets do not have back-pointers to set members, so
2154 * the only way to clear out set linkage objects is to go from the
2155 * waitq to the set. We use a memcpy because s_data could be
2156 * pointing to an unaligned value on the stack
2157 * (especially on 32-bit systems)
2158 */
2159 void *wqptr = (void *)&sip->si_waitq;
2160 memcpy((void *)s_data, (void *)&wqptr, sizeof(void *));
2161
2162 return;
2163}
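/*
 * Illustrative driver-side sketch (hypothetical device, not from this file):
 * a character driver's select entry point typically forwards the opaque wql
 * token it is handed straight into selrecord(), and the data-producing path
 * later calls selwakeup() on the same selinfo:
 *
 *	int
 *	mydev_select(dev_t dev, int which, void *wql, struct proc *p)
 *	{
 *		struct mydev_softc *sc = mydev_lookup(dev);	// hypothetical helpers
 *
 *		if (which == FREAD) {
 *			if (sc->bytes_ready)
 *				return (1);			// already readable
 *			selrecord(p, &sc->read_si, wql);	// sc->read_si is a struct selinfo
 *		}
 *		return (0);
 *	}
 *
 *	// producer side, holding the same lock the select path held:
 *	sc->bytes_ready = 1;
 *	selwakeup(&sc->read_si);
 */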
2164
2165void
2166selwakeup(struct selinfo *sip)
2167{
2168
2169 if ((sip->si_flags & SI_INITED) == 0) {
2170 return;
2171 }
2172
2173 if (sip->si_flags & SI_COLL) {
2174 nselcoll++;
2175 sip->si_flags &= ~SI_COLL;
2176#if 0
2177 /* will not support */
2178 //wakeup((caddr_t)&selwait);
2179#endif
2180 }
2181
2182 if (sip->si_flags & SI_RECORDED) {
2183 waitq_wakeup64_all(&sip->si_waitq, NO_EVENT64,
2184 THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
2185 sip->si_flags &= ~SI_RECORDED;
2186 }
2187
2188}
2189
2190void
2191selthreadclear(struct selinfo *sip)
2192{
2193 struct waitq *wq;
2194
2195 if ((sip->si_flags & SI_INITED) == 0) {
2196 return;
2197 }
2198 if (sip->si_flags & SI_RECORDED) {
2199 selwakeup(sip);
2200 sip->si_flags &= ~(SI_RECORDED | SI_COLL);
2201 }
2202 sip->si_flags |= SI_CLEAR;
2203 sip->si_flags &= ~SI_INITED;
2204
2205 wq = &sip->si_waitq;
2206
2207 /*
2208 * Higher level logic may have a handle on this waitq's prepost ID,
2209 * but that's OK because the waitq_deinit will remove/invalidate the
2210 * prepost object (as well as mark the waitq invalid). This de-couples
2211 * us from any callers that may have a handle to this waitq via the
2212 * prepost ID.
2213 */
2214 waitq_deinit(wq);
2215}
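/*
 * Usage note (not original documentation): selthreadclear() is what an
 * object's teardown path is expected to call before freeing the memory that
 * embeds the selinfo - it wakes any recorded selectors via selwakeup() and
 * then invalidates the waitq, so no stale linkage can reach the freed
 * structure afterwards.
 */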
2216
2217
2218
2219
2220#define DBG_POST 0x10
2221#define DBG_WATCH 0x11
2222#define DBG_WAIT 0x12
2223#define DBG_MOD 0x13
2224#define DBG_EWAKEUP 0x14
2225#define DBG_ENQUEUE 0x15
2226#define DBG_DEQUEUE 0x16
2227
2228#define DBG_MISC_POST MISCDBG_CODE(DBG_EVENT,DBG_POST)
2229#define DBG_MISC_WATCH MISCDBG_CODE(DBG_EVENT,DBG_WATCH)
2230#define DBG_MISC_WAIT MISCDBG_CODE(DBG_EVENT,DBG_WAIT)
2231#define DBG_MISC_MOD MISCDBG_CODE(DBG_EVENT,DBG_MOD)
2232#define DBG_MISC_EWAKEUP MISCDBG_CODE(DBG_EVENT,DBG_EWAKEUP)
2233#define DBG_MISC_ENQUEUE MISCDBG_CODE(DBG_EVENT,DBG_ENQUEUE)
2234#define DBG_MISC_DEQUEUE MISCDBG_CODE(DBG_EVENT,DBG_DEQUEUE)
2235
2236
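/*
 * EVPROCDEQUE: remove 'evq' from the owning proc's pending-event list if it
 * is currently queued, taking and dropping the proc lock around the check.
 * The do/while(0) wrapper lets the macro be used like a single statement.
 */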
2237#define EVPROCDEQUE(p, evq) do { \
2238 proc_lock(p); \
2239 if (evq->ee_flags & EV_QUEUED) { \
2240 TAILQ_REMOVE(&p->p_evlist, evq, ee_plist); \
2241 evq->ee_flags &= ~EV_QUEUED; \
2242 } \
2243 proc_unlock(p); \
} while (0)
2245
2246
2247/*
 * called upon socket close. dequeue and free all events for
2249 * the socket... socket must be locked by caller.
2250 */
2251void
2252evsofree(struct socket *sp)
2253{
2254 struct eventqelt *evq, *next;
2255 proc_t p;
2256
2257 if (sp == NULL)
2258 return;
2259
2260 for (evq = sp->so_evlist.tqh_first; evq != NULL; evq = next) {
2261 next = evq->ee_slist.tqe_next;
2262 p = evq->ee_proc;
2263
2264 if (evq->ee_flags & EV_QUEUED) {
2265 EVPROCDEQUE(p, evq);
2266 }
2267 TAILQ_REMOVE(&sp->so_evlist, evq, ee_slist); // remove from socket q
2268 FREE(evq, M_TEMP);
2269 }
2270}
2271
2272
2273/*
 * called upon pipe close. dequeue and free all events for
2275 * the pipe... pipe must be locked by caller
2276 */
2277void
2278evpipefree(struct pipe *cpipe)
2279{
2280 struct eventqelt *evq, *next;
2281 proc_t p;
2282
2283 for (evq = cpipe->pipe_evlist.tqh_first; evq != NULL; evq = next) {
2284 next = evq->ee_slist.tqe_next;
2285 p = evq->ee_proc;
2286
2287 EVPROCDEQUE(p, evq);
2288
2289 TAILQ_REMOVE(&cpipe->pipe_evlist, evq, ee_slist); // remove from pipe q
2290 FREE(evq, M_TEMP);
2291 }
2292}
2293
2294
2295/*
 * enqueue this event if it's not already queued. wakeup
 * the proc if we do queue this event to it...
 * the proc lock is taken and dropped internally; we drop it
 * before doing the wakeup, so we return without it held
2300 */
2301static void
2302evprocenque(struct eventqelt *evq)
2303{
2304 proc_t p;
2305
2306 assert(evq);
2307 p = evq->ee_proc;
2308
2309 KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_START, (uint32_t)evq, evq->ee_flags, evq->ee_eventmask,0,0);
2310
2311 proc_lock(p);
2312
2313 if (evq->ee_flags & EV_QUEUED) {
2314 proc_unlock(p);
2315
2316 KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_END, 0,0,0,0,0);
2317 return;
2318 }
2319 evq->ee_flags |= EV_QUEUED;
2320
2321 TAILQ_INSERT_TAIL(&p->p_evlist, evq, ee_plist);
2322
2323 proc_unlock(p);
2324
2325 wakeup(&p->p_evlist);
2326
2327 KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_END, 0,0,0,0,0);
2328}
2329
2330
2331/*
2332 * pipe lock must be taken by the caller
2333 */
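/*
 * Worked example for the EV_RWBYTES case below (illustrative numbers,
 * assuming the typical PIPE_SIZE of 16384 and PIPE_BUF of 512): with 1000
 * bytes buffered, a reader sees EV_RE with er_rcnt = 1000, and a writer sees
 * 16384 - 1000 = 15384 bytes of space, which is >= PIPE_BUF, so EV_WR is
 * reported with er_wcnt = 15384.
 */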
2334void
2335postpipeevent(struct pipe *pipep, int event)
2336{
2337 int mask;
2338 struct eventqelt *evq;
2339
2340 if (pipep == NULL)
2341 return;
2342 KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_START, event,0,0,1,0);
2343
2344 for (evq = pipep->pipe_evlist.tqh_first;
2345 evq != NULL; evq = evq->ee_slist.tqe_next) {
2346
2347 if (evq->ee_eventmask == 0)
2348 continue;
2349 mask = 0;
2350
2351 switch (event & (EV_RWBYTES | EV_RCLOSED | EV_WCLOSED)) {
2352
2353 case EV_RWBYTES:
2354 if ((evq->ee_eventmask & EV_RE) && pipep->pipe_buffer.cnt) {
2355 mask |= EV_RE;
2356 evq->ee_req.er_rcnt = pipep->pipe_buffer.cnt;
2357 }
2358 if ((evq->ee_eventmask & EV_WR) &&
2359 (MAX(pipep->pipe_buffer.size,PIPE_SIZE) - pipep->pipe_buffer.cnt) >= PIPE_BUF) {
2360
2361 if (pipep->pipe_state & PIPE_EOF) {
2362 mask |= EV_WR|EV_RESET;
2363 break;
2364 }
2365 mask |= EV_WR;
2366 evq->ee_req.er_wcnt = MAX(pipep->pipe_buffer.size, PIPE_SIZE) - pipep->pipe_buffer.cnt;
2367 }
2368 break;
2369
2370 case EV_WCLOSED:
2371 case EV_RCLOSED:
2372 if ((evq->ee_eventmask & EV_RE)) {
2373 mask |= EV_RE|EV_RCLOSED;
2374 }
2375 if ((evq->ee_eventmask & EV_WR)) {
2376 mask |= EV_WR|EV_WCLOSED;
2377 }
2378 break;
2379
2380 default:
2381 return;
2382 }
2383 if (mask) {
2384 /*
2385 * disarm... postevents are nops until this event is 'read' via
2386 * waitevent and then re-armed via modwatch
2387 */
2388 evq->ee_eventmask = 0;
2389
2390 /*
2391 * since events are disarmed until after the waitevent
2392 * the ee_req.er_xxxx fields can't change once we've
2393 * inserted this event into the proc queue...
2394 * therefore, the waitevent will see a 'consistent'
2395 * snapshot of the event, even though it won't hold
2396 * the pipe lock, and we're updating the event outside
2397 * of the proc lock, which it will hold
2398 */
2399 evq->ee_req.er_eventbits |= mask;
2400
2401 KERNEL_DEBUG(DBG_MISC_POST, (uint32_t)evq, evq->ee_req.er_eventbits, mask, 1,0);
2402
2403 evprocenque(evq);
2404 }
2405 }
2406 KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, 0,0,0,1,0);
2407}
2408
2409#if SOCKETS
2410/*
 * given either a sockbuf or a socket, run down the
 * event list and queue any ready events found...
2413 * the socket must be locked by the caller
2414 */
2415void
2416postevent(struct socket *sp, struct sockbuf *sb, int event)
2417{
2418 int mask;
2419 struct eventqelt *evq;
2420 struct tcpcb *tp;
2421
2422 if (sb)
2423 sp = sb->sb_so;
2424 if (sp == NULL)
2425 return;
2426
2427 KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_START, (int)sp, event, 0, 0, 0);
2428
2429 for (evq = sp->so_evlist.tqh_first;
2430 evq != NULL; evq = evq->ee_slist.tqe_next) {
2431
2432 if (evq->ee_eventmask == 0)
2433 continue;
2434 mask = 0;
2435
2436 /* ready for reading:
2437 - byte cnt >= receive low water mark
2438 - read-half of conn closed
2439 - conn pending for listening sock
2440 - socket error pending
2441
2442 ready for writing
2443 - byte cnt avail >= send low water mark
2444 - write half of conn closed
2445 - socket error pending
2446 - non-blocking conn completed successfully
2447
2448 exception pending
2449 - out of band data
2450 - sock at out of band mark
2451 */
2452
2453 switch (event & EV_DMASK) {
2454
2455 case EV_OOB:
2456 if ((evq->ee_eventmask & EV_EX)) {
2457 if (sp->so_oobmark || ((sp->so_state & SS_RCVATMARK)))
2458 mask |= EV_EX|EV_OOB;
2459 }
2460 break;
2461
2462 case EV_RWBYTES|EV_OOB:
2463 if ((evq->ee_eventmask & EV_EX)) {
2464 if (sp->so_oobmark || ((sp->so_state & SS_RCVATMARK)))
2465 mask |= EV_EX|EV_OOB;
2466 }
2467 /*
2468 * fall into the next case
2469 */
2470 case EV_RWBYTES:
2471 if ((evq->ee_eventmask & EV_RE) && soreadable(sp)) {
2472 /* for AFP/OT purposes; may go away in future */
2473 if ((SOCK_DOM(sp) == PF_INET ||
2474 SOCK_DOM(sp) == PF_INET6) &&
2475 SOCK_PROTO(sp) == IPPROTO_TCP &&
2476 (sp->so_error == ECONNREFUSED ||
2477 sp->so_error == ECONNRESET)) {
2478 if (sp->so_pcb == NULL ||
2479 sotoinpcb(sp)->inp_state ==
2480 INPCB_STATE_DEAD ||
2481 (tp = sototcpcb(sp)) == NULL ||
2482 tp->t_state == TCPS_CLOSED) {
2483 mask |= EV_RE|EV_RESET;
2484 break;
2485 }
2486 }
2487 mask |= EV_RE;
2488 evq->ee_req.er_rcnt = sp->so_rcv.sb_cc;
2489
2490 if (sp->so_state & SS_CANTRCVMORE) {
2491 mask |= EV_FIN;
2492 break;
2493 }
2494 }
2495 if ((evq->ee_eventmask & EV_WR) && sowriteable(sp)) {
2496 /* for AFP/OT purposes; may go away in future */
2497 if ((SOCK_DOM(sp) == PF_INET ||
2498 SOCK_DOM(sp) == PF_INET6) &&
2499 SOCK_PROTO(sp) == IPPROTO_TCP &&
2500 (sp->so_error == ECONNREFUSED ||
2501 sp->so_error == ECONNRESET)) {
2502 if (sp->so_pcb == NULL ||
2503 sotoinpcb(sp)->inp_state ==
2504 INPCB_STATE_DEAD ||
2505 (tp = sototcpcb(sp)) == NULL ||
2506 tp->t_state == TCPS_CLOSED) {
2507 mask |= EV_WR|EV_RESET;
2508 break;
2509 }
2510 }
2511 mask |= EV_WR;
2512 evq->ee_req.er_wcnt = sbspace(&sp->so_snd);
2513 }
2514 break;
2515
2516 case EV_RCONN:
2517 if ((evq->ee_eventmask & EV_RE)) {
2518 mask |= EV_RE|EV_RCONN;
2519 evq->ee_req.er_rcnt = sp->so_qlen + 1; // incl this one
2520 }
2521 break;
2522
2523 case EV_WCONN:
2524 if ((evq->ee_eventmask & EV_WR)) {
2525 mask |= EV_WR|EV_WCONN;
2526 }
2527 break;
2528
2529 case EV_RCLOSED:
2530 if ((evq->ee_eventmask & EV_RE)) {
2531 mask |= EV_RE|EV_RCLOSED;
2532 }
2533 break;
2534
2535 case EV_WCLOSED:
2536 if ((evq->ee_eventmask & EV_WR)) {
2537 mask |= EV_WR|EV_WCLOSED;
2538 }
2539 break;
2540
2541 case EV_FIN:
2542 if (evq->ee_eventmask & EV_RE) {
2543 mask |= EV_RE|EV_FIN;
2544 }
2545 break;
2546
2547 case EV_RESET:
2548 case EV_TIMEOUT:
2549 if (evq->ee_eventmask & EV_RE) {
2550 mask |= EV_RE | event;
2551 }
2552 if (evq->ee_eventmask & EV_WR) {
2553 mask |= EV_WR | event;
2554 }
2555 break;
2556
2557 default:
2558 KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, (int)sp, -1, 0, 0, 0);
2559 return;
2560 } /* switch */
2561
2562 KERNEL_DEBUG(DBG_MISC_POST, (int)evq, evq->ee_eventmask, evq->ee_req.er_eventbits, mask, 0);
2563
2564 if (mask) {
2565 /*
2566 * disarm... postevents are nops until this event is 'read' via
2567 * waitevent and then re-armed via modwatch
2568 */
2569 evq->ee_eventmask = 0;
2570
2571 /*
2572 * since events are disarmed until after the waitevent
2573 * the ee_req.er_xxxx fields can't change once we've
2574 * inserted this event into the proc queue...
2575 * since waitevent can't see this event until we
2576 * enqueue it, waitevent will see a 'consistent'
2577 * snapshot of the event, even though it won't hold
2578 * the socket lock, and we're updating the event outside
2579 * of the proc lock, which it will hold
2580 */
2581 evq->ee_req.er_eventbits |= mask;
2582
2583 evprocenque(evq);
2584 }
2585 }
2586 KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, (int)sp, 0, 0, 0, 0);
2587}
2588#endif /* SOCKETS */
2589
2590
2591/*
2592 * watchevent system call. user passes us an event to watch
2593 * for. we malloc an event object, initialize it, and queue
2594 * it to the open socket. when the event occurs, postevent()
 * will enqueue it back to our proc where we can retrieve it
2596 * via waitevent().
2597 *
2598 * should this prevent duplicate events on same socket?
2599 *
2600 * Returns:
2601 * ENOMEM No memory for operation
2602 * copyin:EFAULT
2603 */
2604int
2605watchevent(proc_t p, struct watchevent_args *uap, __unused int *retval)
2606{
2607 struct eventqelt *evq = (struct eventqelt *)0;
2608 struct eventqelt *np = NULL;
2609 struct eventreq64 *erp;
2610 struct fileproc *fp = NULL;
2611 int error;
2612
2613 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_START, 0,0,0,0,0);
2614
2615 // get a qelt and fill with users req
2616 MALLOC(evq, struct eventqelt *, sizeof(struct eventqelt), M_TEMP, M_WAITOK);
2617
2618 if (evq == NULL)
2619 return (ENOMEM);
2620 erp = &evq->ee_req;
2621
2622 // get users request pkt
2623
2624 if (IS_64BIT_PROCESS(p)) {
2625 error = copyin(uap->u_req, (caddr_t)erp, sizeof(struct eventreq64));
2626 } else {
2627 struct eventreq32 er32;
2628
2629 error = copyin(uap->u_req, (caddr_t)&er32, sizeof(struct eventreq32));
2630 if (error == 0) {
2631 /*
2632 * the user only passes in the
2633 * er_type, er_handle and er_data...
2634 * the other fields are initialized
2635 * below, so don't bother to copy
2636 */
2637 erp->er_type = er32.er_type;
2638 erp->er_handle = er32.er_handle;
2639 erp->er_data = (user_addr_t)er32.er_data;
2640 }
2641 }
2642 if (error) {
2643 FREE(evq, M_TEMP);
2644 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, error,0,0,0,0);
2645
2646 return(error);
2647 }
2648 KERNEL_DEBUG(DBG_MISC_WATCH, erp->er_handle,uap->u_eventmask,(uint32_t)evq,0,0);
2649
2650 // validate, freeing qelt if errors
2651 error = 0;
2652 proc_fdlock(p);
2653
2654 if (erp->er_type != EV_FD) {
2655 error = EINVAL;
2656 } else if ((error = fp_lookup(p, erp->er_handle, &fp, 1)) != 0) {
2657 error = EBADF;
2658#if SOCKETS
2659 } else if (fp->f_type == DTYPE_SOCKET) {
2660 socket_lock((struct socket *)fp->f_data, 1);
2661 np = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
2662#endif /* SOCKETS */
2663 } else if (fp->f_type == DTYPE_PIPE) {
2664 PIPE_LOCK((struct pipe *)fp->f_data);
2665 np = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
2666 } else {
2667 fp_drop(p, erp->er_handle, fp, 1);
2668 error = EINVAL;
2669 }
2670 proc_fdunlock(p);
2671
2672 if (error) {
2673 FREE(evq, M_TEMP);
2674
2675 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, error,0,0,0,0);
2676 return(error);
2677 }
2678
2679 /*
2680 * only allow one watch per file per proc
2681 */
2682 for ( ; np != NULL; np = np->ee_slist.tqe_next) {
2683 if (np->ee_proc == p) {
2684#if SOCKETS
2685 if (fp->f_type == DTYPE_SOCKET)
2686 socket_unlock((struct socket *)fp->f_data, 1);
2687 else
2688#endif /* SOCKETS */
2689 PIPE_UNLOCK((struct pipe *)fp->f_data);
2690 fp_drop(p, erp->er_handle, fp, 0);
2691 FREE(evq, M_TEMP);
2692
2693 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, EINVAL,0,0,0,0);
2694 return(EINVAL);
2695 }
2696 }
2697 erp->er_ecnt = erp->er_rcnt = erp->er_wcnt = erp->er_eventbits = 0;
2698 evq->ee_proc = p;
2699 evq->ee_eventmask = uap->u_eventmask & EV_MASK;
2700 evq->ee_flags = 0;
2701
2702#if SOCKETS
2703 if (fp->f_type == DTYPE_SOCKET) {
2704 TAILQ_INSERT_TAIL(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
2705 postevent((struct socket *)fp->f_data, 0, EV_RWBYTES); // catch existing events
2706
2707 socket_unlock((struct socket *)fp->f_data, 1);
2708 } else
2709#endif /* SOCKETS */
2710 {
2711 TAILQ_INSERT_TAIL(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
2712 postpipeevent((struct pipe *)fp->f_data, EV_RWBYTES);
2713
2714 PIPE_UNLOCK((struct pipe *)fp->f_data);
2715 }
2716 fp_drop_event(p, erp->er_handle, fp);
2717
2718 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, 0,0,0,0,0);
2719 return(0);
2720}
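/*
 * Illustrative user-level sketch (based on the argument handling above; the
 * exact userspace declarations live in <sys/ev.h> on systems that still ship
 * this legacy interface):
 *
 *	struct eventreq er = { 0 };
 *	er.er_type   = EV_FD;		// only EV_FD is accepted
 *	er.er_handle = sock_fd;		// socket or pipe descriptor
 *	er.er_data   = (void *)ctx;	// opaque cookie returned with the event
 *
 *	watchevent(&er, EV_RE | EV_WR);	// arm read/write watches
 *	...
 *	waitevent(&er, NULL);		// block until an event is posted
 *	// er.er_eventbits now holds the posted EV_* bits
 *	modwatch(&er, EV_RE);		// re-arm for read only
 *	modwatch(&er, EV_RM);		// or tear the watch down
 */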
2721
2722
2723
2724/*
2725 * waitevent system call.
2726 * grabs the next waiting event for this proc and returns
 * it. if no events are pending, the user can request to sleep
 * (with or without a timeout) or to poll; poll mode is selected when
 * ((tv != NULL && interval == 0) || tv == -1)
2730 */
2731int
2732waitevent(proc_t p, struct waitevent_args *uap, int *retval)
2733{
2734 int error = 0;
2735 struct eventqelt *evq;
2736 struct eventreq64 *erp;
2737 uint64_t abstime, interval;
2738 boolean_t fast_poll = FALSE;
2739 union {
2740 struct eventreq64 er64;
2741 struct eventreq32 er32;
2742 } uer = {};
2743
2744 interval = 0;
2745
2746 if (uap->tv) {
2747 struct timeval atv;
2748 /*
2749 * check for fast poll method
2750 */
2751 if (IS_64BIT_PROCESS(p)) {
2752 if (uap->tv == (user_addr_t)-1)
2753 fast_poll = TRUE;
2754 } else if (uap->tv == (user_addr_t)((uint32_t)-1))
2755 fast_poll = TRUE;
2756
2757 if (fast_poll == TRUE) {
2758 if (p->p_evlist.tqh_first == NULL) {
2759 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_NONE, -1,0,0,0,0);
2760 /*
2761 * poll failed
2762 */
2763 *retval = 1;
2764 return (0);
2765 }
2766 proc_lock(p);
2767 goto retry;
2768 }
2769 if (IS_64BIT_PROCESS(p)) {
2770 struct user64_timeval atv64;
2771 error = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64));
2772 /* Loses resolution - assume timeout < 68 years */
2773 atv.tv_sec = atv64.tv_sec;
2774 atv.tv_usec = atv64.tv_usec;
2775 } else {
2776 struct user32_timeval atv32;
2777 error = copyin(uap->tv, (caddr_t)&atv32, sizeof(atv32));
2778 atv.tv_sec = atv32.tv_sec;
2779 atv.tv_usec = atv32.tv_usec;
2780 }
2781
2782 if (error)
2783 return(error);
2784 if (itimerfix(&atv)) {
2785 error = EINVAL;
2786 return(error);
2787 }
2788 interval = tvtoabstime(&atv);
2789 }
2790 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_START, 0,0,0,0,0);
2791
2792 proc_lock(p);
2793retry:
2794 if ((evq = p->p_evlist.tqh_first) != NULL) {
2795 /*
2796 * found one... make a local copy while it's still on the queue
2797 * to prevent it from changing while in the midst of copying
2798 * don't want to hold the proc lock across a copyout because
2799 * it might block on a page fault at the target in user space
2800 */
2801 erp = &evq->ee_req;
2802
2803 if (IS_64BIT_PROCESS(p))
2804 bcopy((caddr_t)erp, (caddr_t)&uer.er64, sizeof (struct eventreq64));
2805 else {
2806 uer.er32.er_type = erp->er_type;
2807 uer.er32.er_handle = erp->er_handle;
2808 uer.er32.er_data = (uint32_t)erp->er_data;
2809 uer.er32.er_ecnt = erp->er_ecnt;
2810 uer.er32.er_rcnt = erp->er_rcnt;
2811 uer.er32.er_wcnt = erp->er_wcnt;
2812 uer.er32.er_eventbits = erp->er_eventbits;
2813 }
2814 TAILQ_REMOVE(&p->p_evlist, evq, ee_plist);
2815
2816 evq->ee_flags &= ~EV_QUEUED;
2817
2818 proc_unlock(p);
2819
2820 if (IS_64BIT_PROCESS(p))
2821 error = copyout((caddr_t)&uer.er64, uap->u_req, sizeof(struct eventreq64));
2822 else
2823 error = copyout((caddr_t)&uer.er32, uap->u_req, sizeof(struct eventreq32));
2824
2825 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, error,
2826 evq->ee_req.er_handle,evq->ee_req.er_eventbits,(uint32_t)evq,0);
2827 return (error);
2828 }
2829 else {
2830 if (uap->tv && interval == 0) {
2831 proc_unlock(p);
2832 *retval = 1; // poll failed
2833
2834 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, error,0,0,0,0);
2835 return (error);
2836 }
2837 if (interval != 0)
2838 clock_absolutetime_interval_to_deadline(interval, &abstime);
2839 else
2840 abstime = 0;
2841
2842 KERNEL_DEBUG(DBG_MISC_WAIT, 1,(uint32_t)&p->p_evlist,0,0,0);
2843
2844 error = msleep1(&p->p_evlist, &p->p_mlock, (PSOCK | PCATCH), "waitevent", abstime);
2845
2846 KERNEL_DEBUG(DBG_MISC_WAIT, 2,(uint32_t)&p->p_evlist,0,0,0);
2847
2848 if (error == 0)
2849 goto retry;
2850 if (error == ERESTART)
2851 error = EINTR;
2852 if (error == EWOULDBLOCK) {
2853 *retval = 1;
2854 error = 0;
2855 }
2856 }
2857 proc_unlock(p);
2858
2859 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, 0,0,0,0,0);
2860 return (error);
2861}
2862
2863
2864/*
2865 * modwatch system call. user passes in event to modify.
 * if we find it we reset the event bits and queue/dequeue the event
 * as needed.
2868 */
2869int
2870modwatch(proc_t p, struct modwatch_args *uap, __unused int *retval)
2871{
2872 struct eventreq64 er;
2873 struct eventreq64 *erp = &er;
2874 struct eventqelt *evq = NULL; /* protected by error return */
2875 int error;
2876 struct fileproc *fp;
2877 int flag;
2878
2879 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_START, 0,0,0,0,0);
2880
2881 /*
2882 * get user's request pkt
2883 * just need the er_type and er_handle which sit above the
2884 * problematic er_data (32/64 issue)... so only copy in
2885 * those 2 fields
2886 */
2887 if ((error = copyin(uap->u_req, (caddr_t)erp, sizeof(er.er_type) + sizeof(er.er_handle)))) {
2888 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, error,0,0,0,0);
2889 return(error);
2890 }
2891 proc_fdlock(p);
2892
2893 if (erp->er_type != EV_FD) {
2894 error = EINVAL;
2895 } else if ((error = fp_lookup(p, erp->er_handle, &fp, 1)) != 0) {
2896 error = EBADF;
2897#if SOCKETS
2898 } else if (fp->f_type == DTYPE_SOCKET) {
2899 socket_lock((struct socket *)fp->f_data, 1);
2900 evq = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
2901#endif /* SOCKETS */
2902 } else if (fp->f_type == DTYPE_PIPE) {
2903 PIPE_LOCK((struct pipe *)fp->f_data);
2904 evq = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
2905 } else {
2906 fp_drop(p, erp->er_handle, fp, 1);
2907 error = EINVAL;
2908 }
2909
2910 if (error) {
2911 proc_fdunlock(p);
2912 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, error,0,0,0,0);
2913 return(error);
2914 }
2915
2916 if ((uap->u_eventmask == EV_RM) && (fp->f_flags & FP_WAITEVENT)) {
2917 fp->f_flags &= ~FP_WAITEVENT;
2918 }
2919 proc_fdunlock(p);
2920
2921 // locate event if possible
2922 for ( ; evq != NULL; evq = evq->ee_slist.tqe_next) {
2923 if (evq->ee_proc == p)
2924 break;
2925 }
2926 if (evq == NULL) {
2927#if SOCKETS
2928 if (fp->f_type == DTYPE_SOCKET)
2929 socket_unlock((struct socket *)fp->f_data, 1);
2930 else
2931#endif /* SOCKETS */
2932 PIPE_UNLOCK((struct pipe *)fp->f_data);
2933 fp_drop(p, erp->er_handle, fp, 0);
2934 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, EINVAL,0,0,0,0);
2935 return(EINVAL);
2936 }
2937 KERNEL_DEBUG(DBG_MISC_MOD, erp->er_handle,uap->u_eventmask,(uint32_t)evq,0,0);
2938
2939 if (uap->u_eventmask == EV_RM) {
2940 EVPROCDEQUE(p, evq);
2941
2942#if SOCKETS
2943 if (fp->f_type == DTYPE_SOCKET) {
2944 TAILQ_REMOVE(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
2945 socket_unlock((struct socket *)fp->f_data, 1);
2946 } else
2947#endif /* SOCKETS */
2948 {
2949 TAILQ_REMOVE(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
2950 PIPE_UNLOCK((struct pipe *)fp->f_data);
2951 }
2952 fp_drop(p, erp->er_handle, fp, 0);
2953 FREE(evq, M_TEMP);
2954 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, 0,0,0,0,0);
2955 return(0);
2956 }
2957 switch (uap->u_eventmask & EV_MASK) {
2958
2959 case 0:
2960 flag = 0;
2961 break;
2962
2963 case EV_RE:
2964 case EV_WR:
2965 case EV_RE|EV_WR:
2966 flag = EV_RWBYTES;
2967 break;
2968
2969 case EV_EX:
2970 flag = EV_OOB;
2971 break;
2972
2973 case EV_EX|EV_RE:
2974 case EV_EX|EV_WR:
2975 case EV_EX|EV_RE|EV_WR:
2976 flag = EV_OOB|EV_RWBYTES;
2977 break;
2978
2979 default:
2980#if SOCKETS
2981 if (fp->f_type == DTYPE_SOCKET)
2982 socket_unlock((struct socket *)fp->f_data, 1);
2983 else
2984#endif /* SOCKETS */
2985 PIPE_UNLOCK((struct pipe *)fp->f_data);
2986 fp_drop(p, erp->er_handle, fp, 0);
2987 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, EINVAL,0,0,0,0);
2988 return(EINVAL);
2989 }
2990 /*
2991 * since we're holding the socket/pipe lock, the event
2992 * cannot go from the unqueued state to the queued state
2993 * however, it can go from the queued state to the unqueued state
2994 * since that direction is protected by the proc_lock...
2995 * so do a quick check for EV_QUEUED w/o holding the proc lock
2996 * since by far the common case will be NOT EV_QUEUED, this saves
2997 * us taking the proc_lock the majority of the time
2998 */
2999 if (evq->ee_flags & EV_QUEUED) {
3000 /*
3001 * EVPROCDEQUE will recheck the state after it grabs the proc_lock
3002 */
3003 EVPROCDEQUE(p, evq);
3004 }
3005 /*
3006 * while the event is off the proc queue and
3007 * we're holding the socket/pipe lock
3008 * it's safe to update these fields...
3009 */
3010 evq->ee_req.er_eventbits = 0;
3011 evq->ee_eventmask = uap->u_eventmask & EV_MASK;
3012
3013#if SOCKETS
3014 if (fp->f_type == DTYPE_SOCKET) {
3015 postevent((struct socket *)fp->f_data, 0, flag);
3016 socket_unlock((struct socket *)fp->f_data, 1);
3017 } else
3018#endif /* SOCKETS */
3019 {
3020 postpipeevent((struct pipe *)fp->f_data, flag);
3021 PIPE_UNLOCK((struct pipe *)fp->f_data);
3022 }
3023 fp_drop(p, erp->er_handle, fp, 0);
3024 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, evq->ee_req.er_handle,evq->ee_eventmask,(uint32_t)fp->f_data,flag,0);
3025 return(0);
3026}
3027
3028/* this routine is called from the close of fd with proc_fdlock held */
3029int
3030waitevent_close(struct proc *p, struct fileproc *fp)
3031{
3032 struct eventqelt *evq;
3033
3034
3035 fp->f_flags &= ~FP_WAITEVENT;
3036
3037#if SOCKETS
3038 if (fp->f_type == DTYPE_SOCKET) {
3039 socket_lock((struct socket *)fp->f_data, 1);
3040 evq = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
3041 } else
3042#endif /* SOCKETS */
3043 if (fp->f_type == DTYPE_PIPE) {
3044 PIPE_LOCK((struct pipe *)fp->f_data);
3045 evq = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
3046 }
3047 else {
3048 return(EINVAL);
3049 }
3050 proc_fdunlock(p);
3051
3052
3053 // locate event if possible
3054 for ( ; evq != NULL; evq = evq->ee_slist.tqe_next) {
3055 if (evq->ee_proc == p)
3056 break;
3057 }
3058 if (evq == NULL) {
3059#if SOCKETS
3060 if (fp->f_type == DTYPE_SOCKET)
3061 socket_unlock((struct socket *)fp->f_data, 1);
3062 else
3063#endif /* SOCKETS */
3064 PIPE_UNLOCK((struct pipe *)fp->f_data);
3065
3066 proc_fdlock(p);
3067
3068 return(EINVAL);
3069 }
3070 EVPROCDEQUE(p, evq);
3071
3072#if SOCKETS
3073 if (fp->f_type == DTYPE_SOCKET) {
3074 TAILQ_REMOVE(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
3075 socket_unlock((struct socket *)fp->f_data, 1);
3076 } else
3077#endif /* SOCKETS */
3078 {
3079 TAILQ_REMOVE(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
3080 PIPE_UNLOCK((struct pipe *)fp->f_data);
3081 }
3082 FREE(evq, M_TEMP);
3083
3084 proc_fdlock(p);
3085
3086 return(0);
3087}
3088
3089
3090/*
3091 * gethostuuid
3092 *
3093 * Description: Get the host UUID from IOKit and return it to user space.
3094 *
3095 * Parameters: uuid_buf Pointer to buffer to receive UUID
 *		timeout				Timespec for timeout
3097 * spi SPI, skip sandbox check (temporary)
3098 *
3099 * Returns: 0 Success
3100 * EWOULDBLOCK Timeout is too short
3101 * copyout:EFAULT Bad user buffer
3102 * mac_system_check_info:EPERM Client not allowed to perform this operation
3103 *
3104 * Notes: A timeout seems redundant, since if it's tolerable to not
3105 * have a system UUID in hand, then why ask for one?
3106 */
3107int
3108gethostuuid(struct proc *p, struct gethostuuid_args *uap, __unused int32_t *retval)
3109{
3110 kern_return_t kret;
3111 int error;
3112 mach_timespec_t mach_ts; /* for IOKit call */
3113 __darwin_uuid_t uuid_kern = {}; /* for IOKit call */
3114
3115 if (!uap->spi) {
3116#if CONFIG_EMBEDDED
3117#if CONFIG_MACF
3118 if ((error = mac_system_check_info(kauth_cred_get(), "hw.uuid")) != 0) {
3119 /* EPERM invokes userspace upcall if present */
3120 return (error);
3121 }
3122#endif
3123#endif
3124 }
3125
3126 /* Convert the 32/64 bit timespec into a mach_timespec_t */
3127 if ( proc_is64bit(p) ) {
3128 struct user64_timespec ts;
3129 error = copyin(uap->timeoutp, &ts, sizeof(ts));
3130 if (error)
3131 return (error);
3132 mach_ts.tv_sec = ts.tv_sec;
3133 mach_ts.tv_nsec = ts.tv_nsec;
3134 } else {
3135 struct user32_timespec ts;
3136 error = copyin(uap->timeoutp, &ts, sizeof(ts) );
3137 if (error)
3138 return (error);
3139 mach_ts.tv_sec = ts.tv_sec;
3140 mach_ts.tv_nsec = ts.tv_nsec;
3141 }
3142
3143 /* Call IOKit with the stack buffer to get the UUID */
3144 kret = IOBSDGetPlatformUUID(uuid_kern, mach_ts);
3145
3146 /*
3147 * If we get it, copy out the data to the user buffer; note that a
3148 * uuid_t is an array of characters, so this is size invariant for
3149 * 32 vs. 64 bit.
3150 */
3151 if (kret == KERN_SUCCESS) {
3152 error = copyout(uuid_kern, uap->uuid_buf, sizeof(uuid_kern));
3153 } else {
3154 error = EWOULDBLOCK;
3155 }
3156
3157 return (error);
3158}
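/*
 * Illustrative user-level sketch (the wrapper is declared in <unistd.h> as
 * int gethostuuid(uuid_t, const struct timespec *) on platforms that expose
 * it; shown only to clarify the timeout argument handled above):
 *
 *	uuid_t uuid;
 *	struct timespec timeout = { 5, 0 };	// give IOKit up to 5 seconds
 *
 *	if (gethostuuid(uuid, &timeout) == 0) {
 *		uuid_string_t str;
 *		uuid_unparse(uuid, str);
 *		printf("host UUID: %s\n", str);
 *	}
 */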
3159
3160/*
3161 * ledger
3162 *
3163 * Description: Omnibus system call for ledger operations
3164 */
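/*
 * Argument conventions, as implemented below (summarized here for clarity;
 * this is a private interface):
 *
 *	LEDGER_INFO:		arg1 = pid, arg2 = user buffer for struct ledger_info
 *	LEDGER_ENTRY_INFO:	arg1 = pid, arg2 = entry buffer, arg3 = in/out count
 *	LEDGER_TEMPLATE_INFO:	arg1 = template buffer, arg2 = in/out count
 *	LEDGER_LIMIT:		arg1 = pid, arg2 = limit args; LEDGER_DEBUG builds only
 */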
3165int
3166ledger(struct proc *p, struct ledger_args *args, __unused int32_t *retval)
3167{
3168#if !CONFIG_MACF
3169#pragma unused(p)
3170#endif
3171 int rval, pid, len, error;
3172#ifdef LEDGER_DEBUG
3173 struct ledger_limit_args lla;
3174#endif
3175 task_t task;
3176 proc_t proc;
3177
3178 /* Finish copying in the necessary args before taking the proc lock */
3179 error = 0;
3180 len = 0;
3181 if (args->cmd == LEDGER_ENTRY_INFO)
3182 error = copyin(args->arg3, (char *)&len, sizeof (len));
3183 else if (args->cmd == LEDGER_TEMPLATE_INFO)
3184 error = copyin(args->arg2, (char *)&len, sizeof (len));
3185 else if (args->cmd == LEDGER_LIMIT)
3186#ifdef LEDGER_DEBUG
3187 error = copyin(args->arg2, (char *)&lla, sizeof (lla));
3188#else
3189 return (EINVAL);
3190#endif
3191 else if ((args->cmd < 0) || (args->cmd > LEDGER_MAX_CMD))
3192 return (EINVAL);
3193
3194 if (error)
3195 return (error);
3196 if (len < 0)
3197 return (EINVAL);
3198
3199 rval = 0;
3200 if (args->cmd != LEDGER_TEMPLATE_INFO) {
3201 pid = args->arg1;
3202 proc = proc_find(pid);
3203 if (proc == NULL)
3204 return (ESRCH);
3205
3206#if CONFIG_MACF
3207 error = mac_proc_check_ledger(p, proc, args->cmd);
3208 if (error) {
3209 proc_rele(proc);
3210 return (error);
3211 }
3212#endif
3213
3214 task = proc->task;
3215 }
3216
3217 switch (args->cmd) {
3218#ifdef LEDGER_DEBUG
3219 case LEDGER_LIMIT: {
		if (!kauth_cred_issuser(kauth_cred_get()))
			rval = EPERM;
		else
			rval = ledger_limit(task, &lla);
3223 proc_rele(proc);
3224 break;
3225 }
3226#endif
3227 case LEDGER_INFO: {
3228 struct ledger_info info = {};
3229
3230 rval = ledger_info(task, &info);
3231 proc_rele(proc);
3232 if (rval == 0)
3233 rval = copyout(&info, args->arg2,
3234 sizeof (info));
3235 break;
3236 }
3237
3238 case LEDGER_ENTRY_INFO: {
3239 void *buf;
3240 int sz;
3241
3242 rval = ledger_get_task_entry_info_multiple(task, &buf, &len);
3243 proc_rele(proc);
3244 if ((rval == 0) && (len >= 0)) {
3245 sz = len * sizeof (struct ledger_entry_info);
3246 rval = copyout(buf, args->arg2, sz);
3247 kfree(buf, sz);
3248 }
3249 if (rval == 0)
3250 rval = copyout(&len, args->arg3, sizeof (len));
3251 break;
3252 }
3253
3254 case LEDGER_TEMPLATE_INFO: {
3255 void *buf;
3256 int sz;
3257
3258 rval = ledger_template_info(&buf, &len);
3259 if ((rval == 0) && (len >= 0)) {
3260 sz = len * sizeof (struct ledger_template_info);
3261 rval = copyout(buf, args->arg1, sz);
3262 kfree(buf, sz);
3263 }
3264 if (rval == 0)
3265 rval = copyout(&len, args->arg2, sizeof (len));
3266 break;
3267 }
3268
3269 default:
3270 panic("ledger syscall logic error -- command type %d", args->cmd);
3271 proc_rele(proc);
3272 rval = EINVAL;
3273 }
3274
3275 return (rval);
3276}
3277
3278int
3279telemetry(__unused struct proc *p, struct telemetry_args *args, __unused int32_t *retval)
3280{
3281 int error = 0;
3282
3283 switch (args->cmd) {
3284#if CONFIG_TELEMETRY
3285 case TELEMETRY_CMD_TIMER_EVENT:
3286 error = telemetry_timer_event(args->deadline, args->interval, args->leeway);
3287 break;
3288 case TELEMETRY_CMD_PMI_SETUP:
3289 error = telemetry_pmi_setup((enum telemetry_pmi)args->deadline, args->interval);
3290 break;
3291#endif /* CONFIG_TELEMETRY */
3292 case TELEMETRY_CMD_VOUCHER_NAME:
3293 if (thread_set_voucher_name((mach_port_name_t)args->deadline))
3294 error = EINVAL;
3295 break;
3296
3297 default:
3298 error = EINVAL;
3299 break;
3300 }
3301
3302 return (error);
3303}
3304
3305#if DEVELOPMENT || DEBUG
3306#if CONFIG_WAITQ_DEBUG
3307static uint64_t g_wqset_num = 0;
3308struct g_wqset {
3309 queue_chain_t link;
3310 struct waitq_set *wqset;
3311};
3312
3313static queue_head_t g_wqset_list;
3314static struct waitq_set *g_waitq_set = NULL;
3315
3316static inline struct waitq_set *sysctl_get_wqset(int idx)
3317{
3318 struct g_wqset *gwqs;
3319
3320 if (!g_wqset_num)
3321 queue_init(&g_wqset_list);
3322
3323 /* don't bother with locks: this is test-only code! */
3324 qe_foreach_element(gwqs, &g_wqset_list, link) {
3325 if ((int)(wqset_id(gwqs->wqset) & 0xffffffff) == idx)
3326 return gwqs->wqset;
3327 }
3328
3329 /* allocate a new one */
3330 ++g_wqset_num;
3331 gwqs = (struct g_wqset *)kalloc(sizeof(*gwqs));
3332 assert(gwqs != NULL);
3333
3334 gwqs->wqset = waitq_set_alloc(SYNC_POLICY_FIFO|SYNC_POLICY_PREPOST, NULL);
3335 enqueue_tail(&g_wqset_list, &gwqs->link);
3336 printf("[WQ]: created new waitq set 0x%llx\n", wqset_id(gwqs->wqset));
3337
3338 return gwqs->wqset;
3339}
3340
3341#define MAX_GLOBAL_TEST_QUEUES 64
3342static int g_wq_init = 0;
3343static struct waitq g_wq[MAX_GLOBAL_TEST_QUEUES];
3344
3345static inline struct waitq *global_test_waitq(int idx)
3346{
3347 if (idx < 0)
3348 return NULL;
3349
3350 if (!g_wq_init) {
3351 g_wq_init = 1;
3352 for (int i = 0; i < MAX_GLOBAL_TEST_QUEUES; i++)
3353 waitq_init(&g_wq[i], SYNC_POLICY_FIFO);
3354 }
3355
3356 return &g_wq[idx % MAX_GLOBAL_TEST_QUEUES];
3357}
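/*
 * Encoding convention used by the test sysctl handlers below (derived from
 * their input parsing): writing a non-negative value N targets global test
 * waitq N (mod MAX_GLOBAL_TEST_QUEUES), while writing a negative value -N
 * selects an existing test waitq set whose ID's low 32 bits equal N, or
 * allocates a new one.  For example, from the shell on a DEVELOPMENT kernel:
 *
 *	sysctl -w kern.waitq_wait=3		# current thread waits on test waitq 3
 *	sysctl -w kern.waitq_wakeup_one=3	# wake one waiter on test waitq 3
 */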
3358
3359static int sysctl_waitq_wakeup_one SYSCTL_HANDLER_ARGS
3360{
3361#pragma unused(oidp, arg1, arg2)
3362 int error;
3363 int index;
3364 struct waitq *waitq;
3365 kern_return_t kr;
3366 int64_t event64 = 0;
3367
3368 error = SYSCTL_IN(req, &event64, sizeof(event64));
3369 if (error)
3370 return error;
3371
3372 if (!req->newptr)
3373 return SYSCTL_OUT(req, &event64, sizeof(event64));
3374
3375 if (event64 < 0) {
3376 index = (int)((-event64) & 0xffffffff);
3377 waitq = wqset_waitq(sysctl_get_wqset(index));
3378 index = -index;
3379 } else {
3380 index = (int)event64;
3381 waitq = global_test_waitq(index);
3382 }
3383
3384 event64 = 0;
3385
3386 printf("[WQ]: Waking one thread on waitq [%d] event:0x%llx\n",
3387 index, event64);
3388 kr = waitq_wakeup64_one(waitq, (event64_t)event64, THREAD_AWAKENED,
3389 WAITQ_ALL_PRIORITIES);
3390 printf("[WQ]: \tkr=%d\n", kr);
3391
3392 return SYSCTL_OUT(req, &kr, sizeof(kr));
3393}
3394SYSCTL_PROC(_kern, OID_AUTO, waitq_wakeup_one, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3395 0, 0, sysctl_waitq_wakeup_one, "Q", "wakeup one thread waiting on given event");
3396
3397
3398static int sysctl_waitq_wakeup_all SYSCTL_HANDLER_ARGS
3399{
3400#pragma unused(oidp, arg1, arg2)
3401 int error;
3402 int index;
3403 struct waitq *waitq;
3404 kern_return_t kr;
3405 int64_t event64 = 0;
3406
3407 error = SYSCTL_IN(req, &event64, sizeof(event64));
3408 if (error)
3409 return error;
3410
3411 if (!req->newptr)
3412 return SYSCTL_OUT(req, &event64, sizeof(event64));
3413
3414 if (event64 < 0) {
3415 index = (int)((-event64) & 0xffffffff);
3416 waitq = wqset_waitq(sysctl_get_wqset(index));
3417 index = -index;
3418 } else {
3419 index = (int)event64;
3420 waitq = global_test_waitq(index);
3421 }
3422
3423 event64 = 0;
3424
3425 printf("[WQ]: Waking all threads on waitq [%d] event:0x%llx\n",
3426 index, event64);
3427 kr = waitq_wakeup64_all(waitq, (event64_t)event64,
3428 THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
3429 printf("[WQ]: \tkr=%d\n", kr);
3430
3431 return SYSCTL_OUT(req, &kr, sizeof(kr));
3432}
3433SYSCTL_PROC(_kern, OID_AUTO, waitq_wakeup_all, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3434 0, 0, sysctl_waitq_wakeup_all, "Q", "wakeup all threads waiting on given event");
3435
3436
3437static int sysctl_waitq_wait SYSCTL_HANDLER_ARGS
3438{
3439#pragma unused(oidp, arg1, arg2)
3440 int error;
3441 int index;
3442 struct waitq *waitq;
3443 kern_return_t kr;
3444 int64_t event64 = 0;
3445
3446 error = SYSCTL_IN(req, &event64, sizeof(event64));
3447 if (error)
3448 return error;
3449
3450 if (!req->newptr)
3451 return SYSCTL_OUT(req, &event64, sizeof(event64));
3452
3453 if (event64 < 0) {
3454 index = (int)((-event64) & 0xffffffff);
3455 waitq = wqset_waitq(sysctl_get_wqset(index));
3456 index = -index;
3457 } else {
3458 index = (int)event64;
3459 waitq = global_test_waitq(index);
3460 }
3461
3462 event64 = 0;
3463
3464 printf("[WQ]: Current thread waiting on waitq [%d] event:0x%llx\n",
3465 index, event64);
3466 kr = waitq_assert_wait64(waitq, (event64_t)event64, THREAD_INTERRUPTIBLE, 0);
3467 if (kr == THREAD_WAITING)
3468 thread_block(THREAD_CONTINUE_NULL);
3469 printf("[WQ]: \tWoke Up: kr=%d\n", kr);
3470
3471 return SYSCTL_OUT(req, &kr, sizeof(kr));
3472}
3473SYSCTL_PROC(_kern, OID_AUTO, waitq_wait, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3474 0, 0, sysctl_waitq_wait, "Q", "start waiting on given event");
3475
3476
3477static int sysctl_wqset_select SYSCTL_HANDLER_ARGS
3478{
3479#pragma unused(oidp, arg1, arg2)
3480 int error;
3481 struct waitq_set *wqset;
3482 uint64_t event64 = 0;
3483
3484 error = SYSCTL_IN(req, &event64, sizeof(event64));
3485 if (error)
3486 return error;
3487
3488 if (!req->newptr)
3489 goto out;
3490
3491 wqset = sysctl_get_wqset((int)(event64 & 0xffffffff));
3492 g_waitq_set = wqset;
3493
3494 event64 = wqset_id(wqset);
3495 printf("[WQ]: selected wqset 0x%llx\n", event64);
3496
3497out:
3498 if (g_waitq_set)
3499 event64 = wqset_id(g_waitq_set);
3500 else
3501 event64 = (uint64_t)(-1);
3502
3503 return SYSCTL_OUT(req, &event64, sizeof(event64));
3504}
3505SYSCTL_PROC(_kern, OID_AUTO, wqset_select, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3506 0, 0, sysctl_wqset_select, "Q", "select/create a global waitq set");
3507
3508
3509static int sysctl_waitq_link SYSCTL_HANDLER_ARGS
3510{
3511#pragma unused(oidp, arg1, arg2)
3512 int error;
3513 int index;
3514 struct waitq *waitq;
3515 struct waitq_set *wqset;
	kern_return_t kr = KERN_SUCCESS;	/* reported via 'out:' if no link is attempted */
3517 uint64_t reserved_link = 0;
3518 int64_t event64 = 0;
3519
3520 error = SYSCTL_IN(req, &event64, sizeof(event64));
3521 if (error)
3522 return error;
3523
3524 if (!req->newptr)
3525 return SYSCTL_OUT(req, &event64, sizeof(event64));
3526
3527 if (!g_waitq_set)
3528 g_waitq_set = sysctl_get_wqset(1);
3529 wqset = g_waitq_set;
3530
3531 if (event64 < 0) {
3532 struct waitq_set *tmp;
3533 index = (int)((-event64) & 0xffffffff);
3534 tmp = sysctl_get_wqset(index);
3535 if (tmp == wqset)
3536 goto out;
3537 waitq = wqset_waitq(tmp);
3538 index = -index;
3539 } else {
3540 index = (int)event64;
3541 waitq = global_test_waitq(index);
3542 }
3543
3544 printf("[WQ]: linking waitq [%d] to global wqset (0x%llx)\n",
3545 index, wqset_id(wqset));
3546 reserved_link = waitq_link_reserve(waitq);
3547 kr = waitq_link(waitq, wqset, WAITQ_SHOULD_LOCK, &reserved_link);
3548 waitq_link_release(reserved_link);
3549
3550 printf("[WQ]: \tkr=%d\n", kr);
3551
3552out:
3553 return SYSCTL_OUT(req, &kr, sizeof(kr));
3554}
3555SYSCTL_PROC(_kern, OID_AUTO, waitq_link, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3556 0, 0, sysctl_waitq_link, "Q", "link global waitq to test waitq set");
3557
3558
3559static int sysctl_waitq_unlink SYSCTL_HANDLER_ARGS
3560{
3561#pragma unused(oidp, arg1, arg2)
3562 int error;
3563 int index;
3564 struct waitq *waitq;
3565 struct waitq_set *wqset;
3566 kern_return_t kr;
3567 uint64_t event64 = 0;
3568
3569 error = SYSCTL_IN(req, &event64, sizeof(event64));
3570 if (error)
3571 return error;
3572
3573 if (!req->newptr)
3574 return SYSCTL_OUT(req, &event64, sizeof(event64));
3575
3576 if (!g_waitq_set)
3577 g_waitq_set = sysctl_get_wqset(1);
3578 wqset = g_waitq_set;
3579
3580 index = (int)event64;
3581 waitq = global_test_waitq(index);
3582
3583 printf("[WQ]: unlinking waitq [%d] from global wqset (0x%llx)\n",
3584 index, wqset_id(wqset));
3585
3586 kr = waitq_unlink(waitq, wqset);
3587 printf("[WQ]: \tkr=%d\n", kr);
3588
3589 return SYSCTL_OUT(req, &kr, sizeof(kr));
3590}
3591SYSCTL_PROC(_kern, OID_AUTO, waitq_unlink, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3592 0, 0, sysctl_waitq_unlink, "Q", "unlink global waitq from test waitq set");
3593
3594
3595static int sysctl_waitq_clear_prepost SYSCTL_HANDLER_ARGS
3596{
3597#pragma unused(oidp, arg1, arg2)
3598 struct waitq *waitq;
3599 uint64_t event64 = 0;
3600 int error, index;
3601
3602 error = SYSCTL_IN(req, &event64, sizeof(event64));
3603 if (error)
3604 return error;
3605
3606 if (!req->newptr)
3607 return SYSCTL_OUT(req, &event64, sizeof(event64));
3608
3609 index = (int)event64;
3610 waitq = global_test_waitq(index);
3611
3612 printf("[WQ]: clearing prepost on waitq [%d]\n", index);
3613 waitq_clear_prepost(waitq);
3614
3615 return SYSCTL_OUT(req, &event64, sizeof(event64));
3616}
3617SYSCTL_PROC(_kern, OID_AUTO, waitq_clear_prepost, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3618 0, 0, sysctl_waitq_clear_prepost, "Q", "clear prepost on given waitq");
3619
3620
3621static int sysctl_wqset_unlink_all SYSCTL_HANDLER_ARGS
3622{
3623#pragma unused(oidp, arg1, arg2)
3624 int error;
3625 struct waitq_set *wqset;
3626 kern_return_t kr;
3627 uint64_t event64 = 0;
3628
3629 error = SYSCTL_IN(req, &event64, sizeof(event64));
3630 if (error)
3631 return error;
3632
3633 if (!req->newptr)
3634 return SYSCTL_OUT(req, &event64, sizeof(event64));
3635
3636 if (!g_waitq_set)
3637 g_waitq_set = sysctl_get_wqset(1);
3638 wqset = g_waitq_set;
3639
3640 printf("[WQ]: unlinking all queues from global wqset (0x%llx)\n",
3641 wqset_id(wqset));
3642
3643 kr = waitq_set_unlink_all(wqset);
3644 printf("[WQ]: \tkr=%d\n", kr);
3645
3646 return SYSCTL_OUT(req, &kr, sizeof(kr));
3647}
3648SYSCTL_PROC(_kern, OID_AUTO, wqset_unlink_all, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3649 0, 0, sysctl_wqset_unlink_all, "Q", "unlink all queues from test waitq set");
3650
3651
3652static int sysctl_wqset_clear_preposts SYSCTL_HANDLER_ARGS
3653{
3654#pragma unused(oidp, arg1, arg2)
3655 struct waitq_set *wqset = NULL;
3656 uint64_t event64 = 0;
3657 int error, index;
3658
3659 error = SYSCTL_IN(req, &event64, sizeof(event64));
3660 if (error)
3661 return error;
3662
3663 if (!req->newptr)
3664 goto out;
3665
3666 index = (int)((event64) & 0xffffffff);
3667 wqset = sysctl_get_wqset(index);
3668 assert(wqset != NULL);
3669
3670 printf("[WQ]: clearing preposts on wqset 0x%llx\n", wqset_id(wqset));
3671 waitq_set_clear_preposts(wqset);
3672
3673out:
3674 if (wqset)
3675 event64 = wqset_id(wqset);
3676 else
3677 event64 = (uint64_t)(-1);
3678
3679 return SYSCTL_OUT(req, &event64, sizeof(event64));
3680}
3681SYSCTL_PROC(_kern, OID_AUTO, wqset_clear_preposts, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3682 0, 0, sysctl_wqset_clear_preposts, "Q", "clear preposts on given waitq set");
3683
3684#endif /* CONFIG_WAITQ_DEBUG */
3685
3686static int
3687sysctl_waitq_set_nelem SYSCTL_HANDLER_ARGS
3688{
3689#pragma unused(oidp, arg1, arg2)
3690 int nelem;
3691
3692 /* Read only */
3693 if (req->newptr != USER_ADDR_NULL)
3694 return (EPERM);
3695
3696 nelem = sysctl_helper_waitq_set_nelem();
3697
3698 return SYSCTL_OUT(req, &nelem, sizeof(nelem));
3699}
3700
3701SYSCTL_PROC(_kern, OID_AUTO, n_ltable_entries, CTLFLAG_RD | CTLFLAG_LOCKED,
	    0, 0, sysctl_waitq_set_nelem, "I", "ltable elements currently in use");
3703
3704
3705#endif /* DEVELOPMENT || DEBUG */
3706
3707
3708