1/*
2 * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1982, 1986, 1989, 1993
31 * The Regents of the University of California. All rights reserved.
32 * (c) UNIX System Laboratories, Inc.
33 * All or some portions of this file are derived from material licensed
34 * to the University of California by American Telephone and Telegraph
35 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
36 * the permission of UNIX System Laboratories, Inc.
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 * 3. All advertising materials mentioning features or use of this software
47 * must display the following acknowledgement:
48 * This product includes software developed by the University of
49 * California, Berkeley and its contributors.
50 * 4. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)sys_generic.c 8.9 (Berkeley) 2/14/95
67 */
68/*
69 * NOTICE: This file was modified by SPARTA, Inc. in 2006 to introduce
70 * support for mandatory and extensible security protections. This notice
71 * is included in support of clause 2.2 (b) of the Apple Public License,
72 * Version 2.0.
73 */
74
75#include <sys/param.h>
76#include <sys/systm.h>
77#include <sys/filedesc.h>
78#include <sys/ioctl.h>
79#include <sys/file_internal.h>
80#include <sys/proc_internal.h>
81#include <sys/socketvar.h>
82#include <sys/uio_internal.h>
83#include <sys/kernel.h>
84#include <sys/guarded.h>
85#include <sys/stat.h>
86#include <sys/malloc.h>
87#include <sys/sysproto.h>
88
89#include <sys/mount_internal.h>
90#include <sys/protosw.h>
91#include <sys/ev.h>
92#include <sys/user.h>
93#include <sys/kdebug.h>
94#include <sys/poll.h>
95#include <sys/event.h>
96#include <sys/eventvar.h>
97#include <sys/proc.h>
98#include <sys/kauth.h>
99
100#include <machine/smp.h>
101#include <mach/mach_types.h>
102#include <kern/kern_types.h>
103#include <kern/assert.h>
104#include <kern/kalloc.h>
105#include <kern/thread.h>
106#include <kern/clock.h>
107#include <kern/ledger.h>
108#include <kern/task.h>
109#include <kern/telemetry.h>
110#include <kern/waitq.h>
111#include <kern/sched_prim.h>
112
113#include <sys/mbuf.h>
114#include <sys/domain.h>
115#include <sys/socket.h>
116#include <sys/socketvar.h>
117#include <sys/errno.h>
118#include <sys/syscall.h>
119#include <sys/pipe.h>
120
121#include <security/audit/audit.h>
122
123#include <net/if.h>
124#include <net/route.h>
125
126#include <netinet/in.h>
127#include <netinet/in_systm.h>
128#include <netinet/ip.h>
129#include <netinet/in_pcb.h>
130#include <netinet/ip_var.h>
131#include <netinet/ip6.h>
132#include <netinet/tcp.h>
133#include <netinet/tcp_fsm.h>
134#include <netinet/tcp_seq.h>
135#include <netinet/tcp_timer.h>
136#include <netinet/tcp_var.h>
137#include <netinet/tcpip.h>
138#include <netinet/tcp_debug.h>
139/* for wait queue based select */
140#include <kern/waitq.h>
141#include <kern/kalloc.h>
142#include <sys/vnode_internal.h>
143
144#if CONFIG_MACF
145#include <security/mac_framework.h>
146#endif
147
148/* XXX should be in a header file somewhere */
149void evsofree(struct socket *);
150void evpipefree(struct pipe *);
151void postpipeevent(struct pipe *, int);
152void postevent(struct socket *, struct sockbuf *, int);
153extern kern_return_t IOBSDGetPlatformUUID(__darwin_uuid_t uuid, mach_timespec_t timeoutp);
154
155int rd_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval);
156int wr_uio(struct proc *p, struct fileproc *fp, uio_t uio, user_ssize_t *retval);
157
158__private_extern__ int dofileread(vfs_context_t ctx, struct fileproc *fp,
159 user_addr_t bufp, user_size_t nbyte,
160 off_t offset, int flags, user_ssize_t *retval);
161__private_extern__ int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
162 user_addr_t bufp, user_size_t nbyte,
163 off_t offset, int flags, user_ssize_t *retval);
164__private_extern__ int preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_vnode);
165__private_extern__ void donefileread(struct proc *p, struct fileproc *fp_ret, int fd);
166
167/* Conflict wait queue for when selects collide (opaque type) */
168struct waitq select_conflict_queue;
169
170/*
171 * Init routine called from bsd_init.c
172 */
173void select_waitq_init(void);
174void
175select_waitq_init(void)
176{
177 waitq_init(&select_conflict_queue, SYNC_POLICY_FIFO);
178}
179
180#define f_flag f_fglob->fg_flag
181#define f_type f_fglob->fg_ops->fo_type
182#define f_msgcount f_fglob->fg_msgcount
183#define f_cred f_fglob->fg_cred
184#define f_ops f_fglob->fg_ops
185#define f_offset f_fglob->fg_offset
186#define f_data f_fglob->fg_data
187
188/*
189 * Read system call.
190 *
191 * Returns: 0 Success
192 * preparefileread:EBADF
193 * preparefileread:ESPIPE
194 * preparefileread:ENXIO
195 * preparefileread:EBADF
196 * dofileread:???
197 */
198int
199read(struct proc *p, struct read_args *uap, user_ssize_t *retval)
200{
201 __pthread_testcancel(1);
202 return(read_nocancel(p, (struct read_nocancel_args *)uap, retval));
203}
204
205int
206read_nocancel(struct proc *p, struct read_nocancel_args *uap, user_ssize_t *retval)
207{
208 struct fileproc *fp;
209 int error;
210 int fd = uap->fd;
211 struct vfs_context context;
212
213 if ( (error = preparefileread(p, &fp, fd, 0)) )
214 return (error);
215
216 context = *(vfs_context_current());
217 context.vc_ucred = fp->f_fglob->fg_cred;
218
219 error = dofileread(&context, fp, uap->cbuf, uap->nbyte,
220 (off_t)-1, 0, retval);
221
222 donefileread(p, fp, fd);
223
224 return (error);
225}
226
227/*
228 * Pread system call
229 *
230 * Returns: 0 Success
231 * preparefileread:EBADF
232 * preparefileread:ESPIPE
233 * preparefileread:ENXIO
234 * preparefileread:EBADF
235 * dofileread:???
236 */
237int
238pread(struct proc *p, struct pread_args *uap, user_ssize_t *retval)
239{
240 __pthread_testcancel(1);
241 return(pread_nocancel(p, (struct pread_nocancel_args *)uap, retval));
242}
243
244int
245pread_nocancel(struct proc *p, struct pread_nocancel_args *uap, user_ssize_t *retval)
246{
247 struct fileproc *fp = NULL; /* fp set by preparefileread() */
248 int fd = uap->fd;
249 int error;
250 struct vfs_context context;
251
252 if ( (error = preparefileread(p, &fp, fd, 1)) )
253 goto out;
254
255 context = *(vfs_context_current());
256 context.vc_ucred = fp->f_fglob->fg_cred;
257
258 error = dofileread(&context, fp, uap->buf, uap->nbyte,
259 uap->offset, FOF_OFFSET, retval);
260
261 donefileread(p, fp, fd);
262
263 KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pread) | DBG_FUNC_NONE),
264 uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
265
266out:
267 return (error);
268}
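/*
 * Illustrative userspace sketch (not kernel code): pread(2) reads at an
 * explicit offset (FOF_OFFSET above) and leaves the descriptor's file
 * offset untouched, and it fails with ESPIPE on non-seekable objects
 * such as pipes and sockets (see preparefileread() below).  The record
 * type and index are hypothetical.
 *
 *     struct record rec;
 *     off_t where = (off_t)idx * (off_t)sizeof(rec);
 *     ssize_t n = pread(fd, &rec, sizeof(rec), where);
 *     if (n < 0 && errno == ESPIPE)
 *         ;  // fd refers to a pipe/FIFO/socket: not seekable
 */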
269
270/*
271 * Code common for read and pread
272 */
273
274void
275donefileread(struct proc *p, struct fileproc *fp, int fd)
276{
277 proc_fdlock_spin(p);
278 fp_drop(p, fd, fp, 1);
279 proc_fdunlock(p);
280}
281
282/*
283 * Returns: 0 Success
284 * EBADF
285 * ESPIPE
286 * ENXIO
287 * fp_lookup:EBADF
288 * fo_read:???
289 */
290int
291preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_pread)
292{
293 vnode_t vp;
294 int error;
295 struct fileproc *fp;
296
297 AUDIT_ARG(fd, fd);
298
299 proc_fdlock_spin(p);
300
301 error = fp_lookup(p, fd, &fp, 1);
302
303 if (error) {
304 proc_fdunlock(p);
305 return (error);
306 }
307 if ((fp->f_flag & FREAD) == 0) {
308 error = EBADF;
309 goto out;
310 }
311 if (check_for_pread && (fp->f_type != DTYPE_VNODE)) {
312 error = ESPIPE;
313 goto out;
314 }
315 if (fp->f_type == DTYPE_VNODE) {
316 vp = (struct vnode *)fp->f_fglob->fg_data;
317
318 if (check_for_pread && (vnode_isfifo(vp))) {
319 error = ESPIPE;
320 goto out;
321 }
322 if (check_for_pread && (vp->v_flag & VISTTY)) {
323 error = ENXIO;
324 goto out;
325 }
326 }
327
328 *fp_ret = fp;
329
330 proc_fdunlock(p);
331 return (0);
332
333out:
334 fp_drop(p, fd, fp, 1);
335 proc_fdunlock(p);
336 return (error);
337}
338
339
340/*
341 * Returns: 0 Success
342 * EINVAL
343 * fo_read:???
344 */
345__private_extern__ int
346dofileread(vfs_context_t ctx, struct fileproc *fp,
347 user_addr_t bufp, user_size_t nbyte, off_t offset, int flags,
348 user_ssize_t *retval)
349{
350 uio_t auio;
351 user_ssize_t bytecnt;
352 long error = 0;
353 char uio_buf[ UIO_SIZEOF(1) ];
354
355 if (nbyte > INT_MAX)
356 return (EINVAL);
357
358 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
359 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_READ,
360 &uio_buf[0], sizeof(uio_buf));
361 } else {
362 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_READ,
363 &uio_buf[0], sizeof(uio_buf));
364 }
365 uio_addiov(auio, bufp, nbyte);
366
367 bytecnt = nbyte;
368
369 if ((error = fo_read(fp, auio, flags, ctx))) {
370 if (uio_resid(auio) != bytecnt && (error == ERESTART ||
371 error == EINTR || error == EWOULDBLOCK))
372 error = 0;
373 }
374 bytecnt -= uio_resid(auio);
375
376 *retval = bytecnt;
377
378 return (error);
379}
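/*
 * Illustrative userspace sketch (not kernel code): because the code
 * above reports a short transfer as success when fo_read() is
 * interrupted (ERESTART/EINTR/EWOULDBLOCK after some bytes have moved),
 * callers of read(2) generally loop until they have all the bytes they
 * asked for.  The helper name is hypothetical.
 *
 *     #include <unistd.h>
 *     #include <errno.h>
 *
 *     static ssize_t
 *     read_fully(int fd, void *buf, size_t len)
 *     {
 *         size_t done = 0;
 *         while (done < len) {
 *             ssize_t n = read(fd, (char *)buf + done, len - done);
 *             if (n == 0)
 *                 break;              // EOF
 *             if (n < 0) {
 *                 if (errno == EINTR)
 *                     continue;       // retry after a signal
 *                 return -1;          // real error
 *             }
 *             done += (size_t)n;      // short read: keep going
 *         }
 *         return (ssize_t)done;
 *     }
 */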
380
381/*
382 * Scatter read system call.
383 *
384 * Returns: 0 Success
385 * EINVAL
386 * ENOMEM
387 * copyin:EFAULT
388 * rd_uio:???
389 */
390int
391readv(struct proc *p, struct readv_args *uap, user_ssize_t *retval)
392{
393 __pthread_testcancel(1);
394 return(readv_nocancel(p, (struct readv_nocancel_args *)uap, retval));
395}
396
397int
398readv_nocancel(struct proc *p, struct readv_nocancel_args *uap, user_ssize_t *retval)
399{
400 uio_t auio = NULL;
401 int error;
402 struct user_iovec *iovp;
403
404 /* Verify range before calling uio_create() */
405 if (uap->iovcnt <= 0 || uap->iovcnt > UIO_MAXIOV)
406 return (EINVAL);
407
408 /* allocate a uio large enough to hold the number of iovecs passed */
409 auio = uio_create(uap->iovcnt, 0,
410 (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
411 UIO_READ);
412
413 /* get location of iovecs within the uio. then copyin the iovecs from
414 * user space.
415 */
416 iovp = uio_iovsaddr(auio);
417 if (iovp == NULL) {
418 error = ENOMEM;
419 goto ExitThisRoutine;
420 }
421 error = copyin_user_iovec_array(uap->iovp,
422 IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
423 uap->iovcnt, iovp);
424 if (error) {
425 goto ExitThisRoutine;
426 }
427
428 /* finalize uio_t for use and do the IO
429 */
430 error = uio_calculateresid(auio);
431 if (error) {
432 goto ExitThisRoutine;
433 }
434 error = rd_uio(p, uap->fd, auio, retval);
435
436ExitThisRoutine:
437 if (auio != NULL) {
438 uio_free(auio);
439 }
440 return (error);
441}
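/*
 * Illustrative userspace sketch (not kernel code): readv(2) fills each
 * iovec in order, which is exactly what the uio built above describes
 * once copyin_user_iovec_array() and uio_calculateresid() have run.
 * The buffers are hypothetical.
 *
 *     char hdr[16], body[4096];
 *     struct iovec iov[2] = {
 *         { .iov_base = hdr,  .iov_len = sizeof(hdr)  },
 *         { .iov_base = body, .iov_len = sizeof(body) },
 *     };
 *     ssize_t n = readv(fd, iov, 2);   // may still return a short count
 */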
442
443/*
444 * Write system call
445 *
446 * Returns: 0 Success
447 * EBADF
448 * fp_lookup:EBADF
449 * dofilewrite:???
450 */
451int
452write(struct proc *p, struct write_args *uap, user_ssize_t *retval)
453{
454 __pthread_testcancel(1);
455 return(write_nocancel(p, (struct write_nocancel_args *)uap, retval));
456
457}
458
459int
460write_nocancel(struct proc *p, struct write_nocancel_args *uap, user_ssize_t *retval)
461{
462 struct fileproc *fp;
463 int error;
464 int fd = uap->fd;
465 bool wrote_some = false;
466
467 AUDIT_ARG(fd, fd);
468
469 error = fp_lookup(p,fd,&fp,0);
470 if (error)
471 return(error);
472 if ((fp->f_flag & FWRITE) == 0) {
473 error = EBADF;
474 } else if (FP_ISGUARDED(fp, GUARD_WRITE)) {
475 proc_fdlock(p);
476 error = fp_guard_exception(p, fd, fp, kGUARD_EXC_WRITE);
477 proc_fdunlock(p);
478 } else {
479 struct vfs_context context = *(vfs_context_current());
480 context.vc_ucred = fp->f_fglob->fg_cred;
481
482 error = dofilewrite(&context, fp, uap->cbuf, uap->nbyte,
483 (off_t)-1, 0, retval);
484
485 wrote_some = *retval > 0;
486 }
487 if (wrote_some)
488 fp_drop_written(p, fd, fp);
489 else
490 fp_drop(p, fd, fp, 0);
491 return(error);
492}
493
494/*
495 * pwrite system call
496 *
497 * Returns: 0 Success
498 * EBADF
499 * ESPIPE
500 * ENXIO
501 * EINVAL
502 * fp_lookup:EBADF
503 * dofilewrite:???
504 */
505int
506pwrite(struct proc *p, struct pwrite_args *uap, user_ssize_t *retval)
507{
508 __pthread_testcancel(1);
509 return(pwrite_nocancel(p, (struct pwrite_nocancel_args *)uap, retval));
510}
511
512int
513pwrite_nocancel(struct proc *p, struct pwrite_nocancel_args *uap, user_ssize_t *retval)
514{
515 struct fileproc *fp;
516 int error;
517 int fd = uap->fd;
518 vnode_t vp = (vnode_t)0;
519 bool wrote_some = false;
520
521 AUDIT_ARG(fd, fd);
522
523 error = fp_lookup(p,fd,&fp,0);
524 if (error)
525 return(error);
526
527 if ((fp->f_flag & FWRITE) == 0) {
528 error = EBADF;
529 } else if (FP_ISGUARDED(fp, GUARD_WRITE)) {
530 proc_fdlock(p);
531 error = fp_guard_exception(p, fd, fp, kGUARD_EXC_WRITE);
532 proc_fdunlock(p);
533 } else {
534 struct vfs_context context = *vfs_context_current();
535 context.vc_ucred = fp->f_fglob->fg_cred;
536
537 if (fp->f_type != DTYPE_VNODE) {
538 error = ESPIPE;
539 goto errout;
540 }
541 vp = (vnode_t)fp->f_fglob->fg_data;
542 if (vnode_isfifo(vp)) {
543 error = ESPIPE;
544 goto errout;
545 }
546 if ((vp->v_flag & VISTTY)) {
547 error = ENXIO;
548 goto errout;
549 }
550 if (uap->offset == (off_t)-1) {
551 error = EINVAL;
552 goto errout;
553 }
554
555 error = dofilewrite(&context, fp, uap->buf, uap->nbyte,
556 uap->offset, FOF_OFFSET, retval);
557 wrote_some = *retval > 0;
558 }
559errout:
560 if (wrote_some)
561 fp_drop_written(p, fd, fp);
562 else
563 fp_drop(p, fd, fp, 0);
564
565 KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pwrite) | DBG_FUNC_NONE),
566 uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
567
568 return(error);
569}
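/*
 * Illustrative userspace sketch (not kernel code): like pread(2),
 * pwrite(2) writes at an explicit offset (FOF_OFFSET) without moving
 * the descriptor's file offset; an offset of -1 is rejected with EINVAL
 * above.  The values are made up.
 *
 *     ssize_t n = pwrite(fd, &rec, sizeof(rec), (off_t)idx * (off_t)sizeof(rec));
 */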
570
571/*
572 * Returns: 0 Success
573 * EINVAL
574 * <fo_write>:EPIPE
575 * <fo_write>:??? [indirect through struct fileops]
576 */
577__private_extern__ int
578dofilewrite(vfs_context_t ctx, struct fileproc *fp,
579 user_addr_t bufp, user_size_t nbyte, off_t offset, int flags,
580 user_ssize_t *retval)
581{
582 uio_t auio;
583 long error = 0;
584 user_ssize_t bytecnt;
585 char uio_buf[ UIO_SIZEOF(1) ];
586
587 if (nbyte > INT_MAX) {
588 *retval = 0;
589 return (EINVAL);
590 }
591
592 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
593 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_WRITE,
594 &uio_buf[0], sizeof(uio_buf));
595 } else {
596 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_WRITE,
597 &uio_buf[0], sizeof(uio_buf));
598 }
599 uio_addiov(auio, bufp, nbyte);
600
601 bytecnt = nbyte;
602 if ((error = fo_write(fp, auio, flags, ctx))) {
603 if (uio_resid(auio) != bytecnt && (error == ERESTART ||
604 error == EINTR || error == EWOULDBLOCK))
605 error = 0;
606 /* The socket layer handles SIGPIPE */
607 if (error == EPIPE && fp->f_type != DTYPE_SOCKET &&
608 (fp->f_fglob->fg_lflags & FG_NOSIGPIPE) == 0) {
609 /* XXX Raise the signal on the thread? */
610 psignal(vfs_context_proc(ctx), SIGPIPE);
611 }
612 }
613 bytecnt -= uio_resid(auio);
614 *retval = bytecnt;
615
616 return (error);
617}
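/*
 * Illustrative userspace sketch (not kernel code): a write to a pipe
 * with no reader delivers SIGPIPE (raised just above for non-socket
 * descriptors), which terminates a process that has not handled it.
 * Callers that prefer to see EPIPE as a plain error typically either
 * ignore the signal or mark the descriptor with F_SETNOSIGPIPE, which
 * is assumed here to map to the FG_NOSIGPIPE flag checked above.
 *
 *     signal(SIGPIPE, SIG_IGN);           // process-wide
 *     fcntl(fd, F_SETNOSIGPIPE, 1);       // or per descriptor
 *     if (write(fd, buf, len) < 0 && errno == EPIPE)
 *         ;  // reader went away
 */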
618
619/*
620 * Gather write system call
621 */
622int
623writev(struct proc *p, struct writev_args *uap, user_ssize_t *retval)
624{
625 __pthread_testcancel(1);
626 return(writev_nocancel(p, (struct writev_nocancel_args *)uap, retval));
627}
628
629int
630writev_nocancel(struct proc *p, struct writev_nocancel_args *uap, user_ssize_t *retval)
631{
632 uio_t auio = NULL;
633 int error;
634 struct fileproc *fp;
635 struct user_iovec *iovp;
636 bool wrote_some = false;
637
638 AUDIT_ARG(fd, uap->fd);
639
640 /* Verify range before calling uio_create() */
641 if (uap->iovcnt <= 0 || uap->iovcnt > UIO_MAXIOV)
642 return (EINVAL);
643
644 /* allocate a uio large enough to hold the number of iovecs passed */
645 auio = uio_create(uap->iovcnt, 0,
646 (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
647 UIO_WRITE);
648
649 /* get location of iovecs within the uio. then copyin the iovecs from
650 * user space.
651 */
652 iovp = uio_iovsaddr(auio);
653 if (iovp == NULL) {
654 error = ENOMEM;
655 goto ExitThisRoutine;
656 }
657 error = copyin_user_iovec_array(uap->iovp,
658 IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
659 uap->iovcnt, iovp);
660 if (error) {
661 goto ExitThisRoutine;
662 }
663
664 /* finalize uio_t for use and do the IO
665 */
666 error = uio_calculateresid(auio);
667 if (error) {
668 goto ExitThisRoutine;
669 }
670
671 error = fp_lookup(p, uap->fd, &fp, 0);
672 if (error)
673 goto ExitThisRoutine;
674
675 if ((fp->f_flag & FWRITE) == 0) {
676 error = EBADF;
677 } else if (FP_ISGUARDED(fp, GUARD_WRITE)) {
678 proc_fdlock(p);
679 error = fp_guard_exception(p, uap->fd, fp, kGUARD_EXC_WRITE);
680 proc_fdunlock(p);
681 } else {
682 error = wr_uio(p, fp, auio, retval);
683 wrote_some = *retval > 0;
684 }
685
686 if (wrote_some)
687 fp_drop_written(p, uap->fd, fp);
688 else
689 fp_drop(p, uap->fd, fp, 0);
690
691ExitThisRoutine:
692 if (auio != NULL) {
693 uio_free(auio);
694 }
695 return (error);
696}
697
698
699int
700wr_uio(struct proc *p, struct fileproc *fp, uio_t uio, user_ssize_t *retval)
701{
702 int error;
703 user_ssize_t count;
704 struct vfs_context context = *vfs_context_current();
705
706 count = uio_resid(uio);
707
708 context.vc_ucred = fp->f_cred;
709 error = fo_write(fp, uio, 0, &context);
710 if (error) {
711 if (uio_resid(uio) != count && (error == ERESTART ||
712 error == EINTR || error == EWOULDBLOCK))
713 error = 0;
714 /* The socket layer handles SIGPIPE */
715 if (error == EPIPE && fp->f_type != DTYPE_SOCKET &&
716 (fp->f_fglob->fg_lflags & FG_NOSIGPIPE) == 0)
717 psignal(p, SIGPIPE);
718 }
719 *retval = count - uio_resid(uio);
720
721 return(error);
722}
723
724
725int
726rd_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval)
727{
728 struct fileproc *fp;
729 int error;
730 user_ssize_t count;
731 struct vfs_context context = *vfs_context_current();
732
733 if ( (error = preparefileread(p, &fp, fdes, 0)) )
734 return (error);
735
736 count = uio_resid(uio);
737
738 context.vc_ucred = fp->f_cred;
739
740 error = fo_read(fp, uio, 0, &context);
741
742 if (error) {
743 if (uio_resid(uio) != count && (error == ERESTART ||
744 error == EINTR || error == EWOULDBLOCK))
745 error = 0;
746 }
747 *retval = count - uio_resid(uio);
748
749 donefileread(p, fp, fdes);
750
751 return (error);
752}
753
754/*
755 * Ioctl system call
756 *
757 * Returns: 0 Success
758 * EBADF
759 * ENOTTY
760 * ENOMEM
761 * ESRCH
762 * copyin:EFAULT
763 * copyout:EFAULT
764 * fp_lookup:EBADF Bad file descriptor
765 * fo_ioctl:???
766 */
767int
768ioctl(struct proc *p, struct ioctl_args *uap, __unused int32_t *retval)
769{
770 struct fileproc *fp = NULL;
771 int error = 0;
772 u_int size = 0;
773 caddr_t datap = NULL, memp = NULL;
774 boolean_t is64bit = FALSE;
775 int tmp = 0;
776#define STK_PARAMS 128
777 char stkbuf[STK_PARAMS] = {};
778 int fd = uap->fd;
779 u_long com = uap->com;
780 struct vfs_context context = *vfs_context_current();
781
782 AUDIT_ARG(fd, uap->fd);
783 AUDIT_ARG(addr, uap->data);
784
785 is64bit = proc_is64bit(p);
786#if CONFIG_AUDIT
787 if (is64bit)
788 AUDIT_ARG(value64, com);
789 else
790 AUDIT_ARG(cmd, CAST_DOWN_EXPLICIT(int, com));
791#endif /* CONFIG_AUDIT */
792
793 /*
794 * Interpret high order word to find amount of data to be
795 * copied to/from the user's address space.
796 */
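 /*
 * For illustration (a sketch of the usual BSD _IO* encoding from
 * sys/ioccom.h): the top bits of the command word carry the direction
 * and the next 13 bits carry the parameter length, so for a command
 * defined as, e.g., _IOW('f', 126, int) (FIONBIO):
 *
 *     IOCPARM_LEN(com) == sizeof(int)   // bytes to copy in
 *     (com & IOC_IN)   != 0             // data flows user -> kernel
 *     (com & IOC_OUT)  == 0             // nothing copied back out
 */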
797 size = IOCPARM_LEN(com);
798 if (size > IOCPARM_MAX)
799 return ENOTTY;
800 if (size > sizeof (stkbuf)) {
801 if ((memp = (caddr_t)kalloc(size)) == 0)
802 return ENOMEM;
803 datap = memp;
804 } else
805 datap = &stkbuf[0];
806 if (com & IOC_IN) {
807 if (size) {
808 error = copyin(uap->data, datap, size);
809 if (error)
810 goto out_nofp;
811 } else {
812 /* XXX - IOC_IN and no size? we should probably return an error here!! */
813 if (is64bit) {
814 *(user_addr_t *)datap = uap->data;
815 }
816 else {
817 *(uint32_t *)datap = (uint32_t)uap->data;
818 }
819 }
820 } else if ((com & IOC_OUT) && size)
821 /*
822 * Zero the buffer so the user always
823 * gets back something deterministic.
824 */
825 bzero(datap, size);
826 else if (com & IOC_VOID) {
827 /* XXX - this is odd since IOC_VOID means no parameters */
828 if (is64bit) {
829 *(user_addr_t *)datap = uap->data;
830 }
831 else {
832 *(uint32_t *)datap = (uint32_t)uap->data;
833 }
834 }
835
836 proc_fdlock(p);
837 error = fp_lookup(p,fd,&fp,1);
838 if (error) {
839 proc_fdunlock(p);
840 goto out_nofp;
841 }
842
843 AUDIT_ARG(file, p, fp);
844
845 if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
846 error = EBADF;
847 goto out;
848 }
849
850 context.vc_ucred = fp->f_fglob->fg_cred;
851
852#if CONFIG_MACF
853 error = mac_file_check_ioctl(context.vc_ucred, fp->f_fglob, com);
854 if (error)
855 goto out;
856#endif
857
858 switch (com) {
859 case FIONCLEX:
860 *fdflags(p, fd) &= ~UF_EXCLOSE;
861 break;
862
863 case FIOCLEX:
864 *fdflags(p, fd) |= UF_EXCLOSE;
865 break;
866
867 case FIONBIO:
868 if ( (tmp = *(int *)datap) )
869 fp->f_flag |= FNONBLOCK;
870 else
871 fp->f_flag &= ~FNONBLOCK;
872 error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, &context);
873 break;
874
875 case FIOASYNC:
876 if ( (tmp = *(int *)datap) )
877 fp->f_flag |= FASYNC;
878 else
879 fp->f_flag &= ~FASYNC;
880 error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, &context);
881 break;
882
883 case FIOSETOWN:
884 tmp = *(int *)datap;
885 if (fp->f_type == DTYPE_SOCKET) {
886 ((struct socket *)fp->f_data)->so_pgid = tmp;
887 break;
888 }
889 if (fp->f_type == DTYPE_PIPE) {
890 error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, &context);
891 break;
892 }
893 if (tmp <= 0) {
894 tmp = -tmp;
895 } else {
896 struct proc *p1 = proc_find(tmp);
897 if (p1 == 0) {
898 error = ESRCH;
899 break;
900 }
901 tmp = p1->p_pgrpid;
902 proc_rele(p1);
903 }
904 error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, &context);
905 break;
906
907 case FIOGETOWN:
908 if (fp->f_type == DTYPE_SOCKET) {
909 *(int *)datap = ((struct socket *)fp->f_data)->so_pgid;
910 break;
911 }
912 error = fo_ioctl(fp, TIOCGPGRP, datap, &context);
913 *(int *)datap = -*(int *)datap;
914 break;
915
916 default:
917 error = fo_ioctl(fp, com, datap, &context);
918 /*
919 * Copy any data to user, size was
920 * already set and checked above.
921 */
922 if (error == 0 && (com & IOC_OUT) && size)
923 error = copyout(datap, uap->data, (u_int)size);
924 break;
925 }
926out:
927 fp_drop(p, fd, fp, 1);
928 proc_fdunlock(p);
929
930out_nofp:
931 if (memp)
932 kfree(memp, size);
933 return(error);
934}
935
936int selwait, nselcoll;
937#define SEL_FIRSTPASS 1
938#define SEL_SECONDPASS 2
939extern int selcontinue(int error);
940extern int selprocess(int error, int sel_pass);
941static int selscan(struct proc *p, struct _select * sel, struct _select_data * seldata,
942 int nfd, int32_t *retval, int sel_pass, struct waitq_set *wqset);
943static int selcount(struct proc *p, u_int32_t *ibits, int nfd, int *count);
944static int seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wakeup, int fromselcount);
945static int seldrop(struct proc *p, u_int32_t *ibits, int nfd);
946static int select_internal(struct proc *p, struct select_nocancel_args *uap, uint64_t timeout, int32_t *retval);
947
948/*
949 * Select system call.
950 *
951 * Returns: 0 Success
952 * EINVAL Invalid argument
953 * EAGAIN Nonconformant error if allocation fails
954 */
955int
956select(struct proc *p, struct select_args *uap, int32_t *retval)
957{
958 __pthread_testcancel(1);
959 return select_nocancel(p, (struct select_nocancel_args *)uap, retval);
960}
961
962int
963select_nocancel(struct proc *p, struct select_nocancel_args *uap, int32_t *retval)
964{
965 uint64_t timeout = 0;
966
967 if (uap->tv) {
968 int err;
969 struct timeval atv;
970 if (IS_64BIT_PROCESS(p)) {
971 struct user64_timeval atv64;
972 err = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64));
973 /* Loses resolution - assume timeout < 68 years */
974 atv.tv_sec = atv64.tv_sec;
975 atv.tv_usec = atv64.tv_usec;
976 } else {
977 struct user32_timeval atv32;
978 err = copyin(uap->tv, (caddr_t)&atv32, sizeof(atv32));
979 atv.tv_sec = atv32.tv_sec;
980 atv.tv_usec = atv32.tv_usec;
981 }
982 if (err)
983 return err;
984
985 if (itimerfix(&atv)) {
986 err = EINVAL;
987 return err;
988 }
989
990 clock_absolutetime_interval_to_deadline(tvtoabstime(&atv), &timeout);
991 }
992
993 return select_internal(p, uap, timeout, retval);
994}
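/*
 * Illustrative userspace sketch (not kernel code): the timeval copied
 * in above is converted to an absolute deadline before select_internal()
 * runs, so a zero-valued (but non-NULL) timeout polls while a NULL
 * timeout blocks indefinitely.  The descriptor is hypothetical.
 *
 *     fd_set rfds;
 *     struct timeval tv = { .tv_sec = 2, .tv_usec = 0 };
 *     FD_ZERO(&rfds);
 *     FD_SET(sock, &rfds);
 *     int n = select(sock + 1, &rfds, NULL, NULL, &tv);
 *     if (n > 0 && FD_ISSET(sock, &rfds))
 *         ;  // sock is readable
 */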
995
996int
997pselect(struct proc *p, struct pselect_args *uap, int32_t *retval)
998{
999 __pthread_testcancel(1);
1000 return pselect_nocancel(p, (struct pselect_nocancel_args *)uap, retval);
1001}
1002
1003int
1004pselect_nocancel(struct proc *p, struct pselect_nocancel_args *uap, int32_t *retval)
1005{
1006 int err;
1007 struct uthread *ut;
1008 uint64_t timeout = 0;
1009
1010 if (uap->ts) {
1011 struct timespec ts;
1012
1013 if (IS_64BIT_PROCESS(p)) {
1014 struct user64_timespec ts64;
1015 err = copyin(uap->ts, (caddr_t)&ts64, sizeof(ts64));
1016 ts.tv_sec = ts64.tv_sec;
1017 ts.tv_nsec = ts64.tv_nsec;
1018 } else {
1019 struct user32_timespec ts32;
1020 err = copyin(uap->ts, (caddr_t)&ts32, sizeof(ts32));
1021 ts.tv_sec = ts32.tv_sec;
1022 ts.tv_nsec = ts32.tv_nsec;
1023 }
1024 if (err) {
1025 return err;
1026 }
1027
1028 if (!timespec_is_valid(&ts)) {
1029 return EINVAL;
1030 }
1031 clock_absolutetime_interval_to_deadline(tstoabstime(&ts), &timeout);
1032 }
1033
1034 ut = get_bsdthread_info(current_thread());
1035
1036 if (uap->mask != USER_ADDR_NULL) {
1037 /* save current mask, then copyin and set new mask */
1038 sigset_t newset;
1039 err = copyin(uap->mask, &newset, sizeof(sigset_t));
1040 if (err) {
1041 return err;
1042 }
1043 ut->uu_oldmask = ut->uu_sigmask;
1044 ut->uu_flag |= UT_SAS_OLDMASK;
1045 ut->uu_sigmask = (newset & ~sigcantmask);
1046 }
1047
1048 err = select_internal(p, (struct select_nocancel_args *)uap, timeout, retval);
1049
1050 if (err != EINTR && ut->uu_flag & UT_SAS_OLDMASK) {
1051 /*
1052 * Restore old mask (direct return case). NOTE: EINTR can also be returned
1053 * if the thread is cancelled. In that case, we don't reset the signal
1054 * mask to its original value (which usually happens in the signal
1055 * delivery path). This behavior is permitted by POSIX.
1056 */
1057 ut->uu_sigmask = ut->uu_oldmask;
1058 ut->uu_oldmask = 0;
1059 ut->uu_flag &= ~UT_SAS_OLDMASK;
1060 }
1061
1062 return err;
1063}
1064
1065/*
1066 * Generic implementation of {,p}select. Care: we type-pun uap across the two
1067 * syscalls, which differ slightly. The first 4 arguments (nfds and the fd sets)
1068 * are identical. The 5th (timeout) argument points to different types, so we
1069 * unpack in the syscall-specific code, but the generic code still does a null
1070 * check on this argument to determine if a timeout was specified.
1071 */
1072static int
1073select_internal(struct proc *p, struct select_nocancel_args *uap, uint64_t timeout, int32_t *retval)
1074{
1075 int error = 0;
1076 u_int ni, nw;
1077 thread_t th_act;
1078 struct uthread *uth;
1079 struct _select *sel;
1080 struct _select_data *seldata;
1081 int needzerofill = 1;
1082 int count = 0;
1083 size_t sz = 0;
1084
1085 th_act = current_thread();
1086 uth = get_bsdthread_info(th_act);
1087 sel = &uth->uu_select;
1088 seldata = &uth->uu_save.uus_select_data;
1089 *retval = 0;
1090
1091 seldata->args = uap;
1092 seldata->retval = retval;
1093 seldata->wqp = NULL;
1094 seldata->count = 0;
1095
1096 if (uap->nd < 0) {
1097 return (EINVAL);
1098 }
1099
1100 /* select on thread of process that already called proc_exit() */
1101 if (p->p_fd == NULL) {
1102 return (EBADF);
1103 }
1104
1105 if (uap->nd > p->p_fd->fd_nfiles)
1106 uap->nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */
1107
1108 nw = howmany(uap->nd, NFDBITS);
1109 ni = nw * sizeof(fd_mask);
1110
1111 /*
1112 * if the previously allocated space for the bits is smaller than
1113 * what is requested or no space has yet been allocated for this
1114 * thread, allocate enough space now.
1115 *
1116 * Note: If this allocation fails, select() will return EAGAIN; this
1117 * is the same thing poll() returns in a no-memory situation, but
1118 * it is not a POSIX-compliant error code for select().
1119 */
1120 if (sel->nbytes < (3 * ni)) {
1121 int nbytes = 3 * ni;
1122
1123 /* Free previous allocation, if any */
1124 if (sel->ibits != NULL)
1125 FREE(sel->ibits, M_TEMP);
1126 if (sel->obits != NULL) {
1127 FREE(sel->obits, M_TEMP);
1128 /* NULL out; subsequent ibits allocation may fail */
1129 sel->obits = NULL;
1130 }
1131
1132 MALLOC(sel->ibits, u_int32_t *, nbytes, M_TEMP, M_WAITOK | M_ZERO);
1133 if (sel->ibits == NULL)
1134 return (EAGAIN);
1135 MALLOC(sel->obits, u_int32_t *, nbytes, M_TEMP, M_WAITOK | M_ZERO);
1136 if (sel->obits == NULL) {
1137 FREE(sel->ibits, M_TEMP);
1138 sel->ibits = NULL;
1139 return (EAGAIN);
1140 }
1141 sel->nbytes = nbytes;
1142 needzerofill = 0;
1143 }
1144
1145 if (needzerofill) {
1146 bzero((caddr_t)sel->ibits, sel->nbytes);
1147 bzero((caddr_t)sel->obits, sel->nbytes);
1148 }
1149
1150 /*
1151 * get the bits from the user address space
1152 */
1153#define getbits(name, x) \
1154 do { \
1155 if (uap->name && (error = copyin(uap->name, \
1156 (caddr_t)&sel->ibits[(x) * nw], ni))) \
1157 goto continuation; \
1158 } while (0)
1159
1160 getbits(in, 0);
1161 getbits(ou, 1);
1162 getbits(ex, 2);
1163#undef getbits
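 /*
 * Rough sketch of the bit-vector layout used from here on (nw 32-bit
 * words per set, laid out back to back):
 *
 *     ibits: [ read set | write set | except set ]   copied in above
 *     obits: [ read set | write set | except set ]   copied out on return
 */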
1164
1165 seldata->abstime = timeout;
1166
1167 if ( (error = selcount(p, sel->ibits, uap->nd, &count)) ) {
1168 goto continuation;
1169 }
1170
1171 /*
1172 * We need an array of waitq pointers. This is due to the new way
1173 * in which waitqs are linked to sets. When a thread selects on a
1174 * file descriptor, a waitq (embedded in a selinfo structure) is
1175 * added to the thread's local waitq set. There is no longer any
1176 * way to directly iterate over all members of a given waitq set.
1177 * The process of linking a waitq into a set may allocate a link
1178 * table object. Because we can't iterate over all the waitqs to
1179 * which our thread waitq set belongs, we need a way of removing
1180 * this link object!
1181 *
1182 * Thus we need a buffer which will hold one waitq pointer
1183 * per FD being selected. During the tear-down phase we can use
1184 * these pointers to dis-associate the underlying selinfo's waitq
1185 * from our thread's waitq set.
1186 *
1187 * Because we also need to allocate a waitq set for this thread,
1188 * we use a bare buffer pointer to hold all the memory. Note that
1189 * this memory is cached in the thread pointer and not reaped until
1190 * the thread exists. This is generally OK because threads that
1191 * the thread exits. This is generally OK because threads that
1192 */
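 /*
 * A rough sketch of the single buffer carved out below (cached on the
 * uthread and reused across calls):
 *
 *     uth->uu_wqset --> +------------------------------+
 *                       | struct waitq_set (ALIGNed)   |
 *     seldata->wqp  --> +------------------------------+
 *                       | uint64_t wqp[0 .. count-1]   |  prepost IDs, one
 *                       +------------------------------+  per selected fd
 */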
1193 sz = ALIGN(sizeof(struct waitq_set)) + (count * sizeof(uint64_t));
1194 if (sz > uth->uu_wqstate_sz) {
1195 /* (re)allocate a buffer to hold waitq pointers */
1196 if (uth->uu_wqset) {
1197 if (waitq_set_is_valid(uth->uu_wqset))
1198 waitq_set_deinit(uth->uu_wqset);
1199 FREE(uth->uu_wqset, M_SELECT);
1200 } else if (uth->uu_wqstate_sz && !uth->uu_wqset)
1201 panic("select: thread structure corrupt! "
1202 "uu_wqstate_sz:%ld, wqstate_buf == NULL",
1203 uth->uu_wqstate_sz);
1204 uth->uu_wqstate_sz = sz;
1205 MALLOC(uth->uu_wqset, struct waitq_set *, sz, M_SELECT, M_WAITOK);
1206 if (!uth->uu_wqset)
1207 panic("can't allocate %ld bytes for wqstate buffer",
1208 uth->uu_wqstate_sz);
1209 waitq_set_init(uth->uu_wqset,
1210 SYNC_POLICY_FIFO|SYNC_POLICY_PREPOST, NULL, NULL);
1211 }
1212
1213 if (!waitq_set_is_valid(uth->uu_wqset))
1214 waitq_set_init(uth->uu_wqset,
1215 SYNC_POLICY_FIFO|SYNC_POLICY_PREPOST, NULL, NULL);
1216
1217 /* the last chunk of our buffer is an array of waitq pointers */
1218 seldata->wqp = (uint64_t *)((char *)(uth->uu_wqset) + ALIGN(sizeof(struct waitq_set)));
1219 bzero(seldata->wqp, sz - ALIGN(sizeof(struct waitq_set)));
1220
1221 seldata->count = count;
1222
1223continuation:
1224
1225 if (error) {
1226 /*
1227 * We have already cleaned up any state we established,
1228 * either locally or as a result of selcount(). We don't
1229 * need to wait_subqueue_unlink_all(), since we haven't set
1230 * anything at this point.
1231 */
1232 return (error);
1233 }
1234
1235 return selprocess(0, SEL_FIRSTPASS);
1236}
1237
1238int
1239selcontinue(int error)
1240{
1241 return selprocess(error, SEL_SECONDPASS);
1242}
1243
1244
1245/*
1246 * selprocess
1247 *
1248 * Parameters: error The error code from our caller
1249 * sel_pass The pass we are on
1250 */
1251int
1252selprocess(int error, int sel_pass)
1253{
1254 int ncoll;
1255 u_int ni, nw;
1256 thread_t th_act;
1257 struct uthread *uth;
1258 struct proc *p;
1259 struct select_nocancel_args *uap;
1260 int *retval;
1261 struct _select *sel;
1262 struct _select_data *seldata;
1263 int unwind = 1;
1264 int prepost = 0;
1265 int somewakeup = 0;
1266 int doretry = 0;
1267 wait_result_t wait_result;
1268
1269 p = current_proc();
1270 th_act = current_thread();
1271 uth = get_bsdthread_info(th_act);
1272 sel = &uth->uu_select;
1273 seldata = &uth->uu_save.uus_select_data;
1274 uap = seldata->args;
1275 retval = seldata->retval;
1276
1277 if ((error != 0) && (sel_pass == SEL_FIRSTPASS))
1278 unwind = 0;
1279 if (seldata->count == 0)
1280 unwind = 0;
1281retry:
1282 if (error != 0)
1283 goto done;
1284
1285 ncoll = nselcoll;
1286 OSBitOrAtomic(P_SELECT, &p->p_flag);
1287
1288 /* skip scans if the select is just for timeouts */
1289 if (seldata->count) {
1290 error = selscan(p, sel, seldata, uap->nd, retval, sel_pass, uth->uu_wqset);
1291 if (error || *retval) {
1292 goto done;
1293 }
1294 if (prepost || somewakeup) {
1295 /*
1296 * if the select was woken early, we may discover that someone
1297 * else has already consumed the data; go back and select
1298 * again if time permits
1299 */
1300 prepost = 0;
1301 somewakeup = 0;
1302 doretry = 1;
1303 }
1304 }
1305
1306 if (uap->tv) {
1307 uint64_t now;
1308
1309 clock_get_uptime(&now);
1310 if (now >= seldata->abstime)
1311 goto done;
1312 }
1313
1314 if (doretry) {
1315 /* cleanup obits and try again */
1316 doretry = 0;
1317 sel_pass = SEL_FIRSTPASS;
1318 goto retry;
1319 }
1320
1321 /*
1322 * To effect a poll, the timeout argument should be
1323 * non-nil, pointing to a zero-valued timeval structure.
1324 */
1325 if (uap->tv && seldata->abstime == 0) {
1326 goto done;
1327 }
1328
1329 /* No spurious wakeups due to collisions, no need to check for them */
1330 if ((sel_pass == SEL_SECONDPASS) || ((p->p_flag & P_SELECT) == 0)) {
1331 sel_pass = SEL_FIRSTPASS;
1332 goto retry;
1333 }
1334
1335 OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
1336
1337 /* if the select is just for timeout skip check */
1338 if (seldata->count && (sel_pass == SEL_SECONDPASS))
1339 panic("selprocess: 2nd pass assertwaiting");
1340
1341 /* waitq_set has waitqueue as first element */
1342 wait_result = waitq_assert_wait64_leeway((struct waitq *)uth->uu_wqset,
1343 NO_EVENT64, THREAD_ABORTSAFE,
1344 TIMEOUT_URGENCY_USER_NORMAL,
1345 seldata->abstime,
1346 TIMEOUT_NO_LEEWAY);
1347 if (wait_result != THREAD_AWAKENED) {
1348 /* there are no preposted events */
1349 error = tsleep1(NULL, PSOCK | PCATCH,
1350 "select", 0, selcontinue);
1351 } else {
1352 prepost = 1;
1353 error = 0;
1354 }
1355
1356 if (error == 0) {
1357 sel_pass = SEL_SECONDPASS;
1358 if (!prepost)
1359 somewakeup = 1;
1360 goto retry;
1361 }
1362done:
1363 if (unwind) {
1364 seldrop(p, sel->ibits, uap->nd);
1365 waitq_set_deinit(uth->uu_wqset);
1366 /*
1367 * zero out the waitq pointer array to avoid use-after-free
1368 * errors in the selcount error path (seldrop_locked) if/when
1369 * the thread re-calls select().
1370 */
1371 bzero((void *)uth->uu_wqset, uth->uu_wqstate_sz);
1372 }
1373 OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
1374 /* select is not restarted after signals... */
1375 if (error == ERESTART)
1376 error = EINTR;
1377 if (error == EWOULDBLOCK)
1378 error = 0;
1379 nw = howmany(uap->nd, NFDBITS);
1380 ni = nw * sizeof(fd_mask);
1381
1382#define putbits(name, x) \
1383 do { \
1384 if (uap->name && (error2 = \
1385 copyout((caddr_t)&sel->obits[(x) * nw], uap->name, ni))) \
1386 error = error2; \
1387 } while (0)
1388
1389 if (error == 0) {
1390 int error2;
1391
1392 putbits(in, 0);
1393 putbits(ou, 1);
1394 putbits(ex, 2);
1395#undef putbits
1396 }
1397
1398 if (error != EINTR && sel_pass == SEL_SECONDPASS && uth->uu_flag & UT_SAS_OLDMASK) {
1399 /* restore signal mask - continuation case */
1400 uth->uu_sigmask = uth->uu_oldmask;
1401 uth->uu_oldmask = 0;
1402 uth->uu_flag &= ~UT_SAS_OLDMASK;
1403 }
1404
1405 return(error);
1406}
1407
1408
1409/**
1410 * remove the fileproc's underlying waitq from the supplied waitq set;
1411 * clear FP_INSELECT when appropriate
1412 *
1413 * Parameters:
1414 * fp File proc that is potentially currently in select
1415 * wqset Waitq set to which the fileproc may belong
1416 * (usually this is the thread's private waitq set)
1417 * Conditions:
1418 * proc_fdlock is held
1419 */
1420static void selunlinkfp(struct fileproc *fp, uint64_t wqp_id, struct waitq_set *wqset)
1421{
1422 int valid_set = waitq_set_is_valid(wqset);
1423 int valid_q = !!wqp_id;
1424
1425 /*
1426 * This could be called (from selcount error path) before we setup
1427 * the thread's wqset. Check the wqset passed in, and only unlink if
1428 * the set is valid.
1429 */
1430
1431 /* unlink the underlying waitq from the input set (thread waitq set) */
1432 if (valid_q && valid_set)
1433 waitq_unlink_by_prepost_id(wqp_id, wqset);
1434
1435 /* allow passing a NULL/invalid fp for seldrop unwind */
1436 if (!fp || !(fp->f_flags & (FP_INSELECT|FP_SELCONFLICT)))
1437 return;
1438
1439 /*
1440 * We can always remove the conflict queue from our thread's set: this
1441 * will not affect other threads that potentially need to be awoken on
1442 * the conflict queue during a fileproc_drain - those sets will still
1443 * be linked with the global conflict queue, and the last waiter
1444 * on the fp clears the CONFLICT marker.
1445 */
1446 if (valid_set && (fp->f_flags & FP_SELCONFLICT))
1447 waitq_unlink(&select_conflict_queue, wqset);
1448
1449 /* jca: TODO:
1450 * This isn't quite right - we don't actually know if this
1451 * fileproc is in another select or not! Here we just assume
1452 * that if we were the first thread to select on the FD, then
1453 * we'll be the one to clear this flag...
1454 */
1455 if (valid_set && fp->f_wset == (void *)wqset) {
1456 fp->f_flags &= ~FP_INSELECT;
1457 fp->f_wset = NULL;
1458 }
1459}
1460
1461/**
1462 * connect a fileproc to the given wqset, potentially bridging to a waitq
1463 * pointed to indirectly by wq_data
1464 *
1465 * Parameters:
1466 * fp File proc potentially currently in select
1467 * wq_data Pointer to a pointer to a waitq (could be NULL)
1468 * wqset Waitq set to which the fileproc should now belong
1469 * (usually this is the thread's private waitq set)
1470 *
1471 * Conditions:
1472 * proc_fdlock is held
1473 */
1474static uint64_t sellinkfp(struct fileproc *fp, void **wq_data, struct waitq_set *wqset)
1475{
1476 struct waitq *f_wq = NULL;
1477
1478 if ((fp->f_flags & FP_INSELECT) != FP_INSELECT) {
1479 if (wq_data)
1480 panic("non-null data:%p on fp:%p not in select?!"
1481 "(wqset:%p)", wq_data, fp, wqset);
1482 return 0;
1483 }
1484
1485 if ((fp->f_flags & FP_SELCONFLICT) == FP_SELCONFLICT) {
1486 waitq_link(&select_conflict_queue, wqset, WAITQ_SHOULD_LOCK, NULL);
1487 }
1488
1489 /*
1490 * The wq_data parameter has potentially been set by selrecord called
1491 * from a subsystem's fo_select() function. If the subsystem does not
1492 * call selrecord(), then wq_data will be NULL.
1493 *
1494 * Use memcpy to get the value into a proper pointer because
1495 * wq_data most likely points to a stack variable that could be
1496 * unaligned on 32-bit systems.
1497 */
1498 if (wq_data) {
1499 memcpy(&f_wq, wq_data, sizeof(f_wq));
1500 if (!waitq_is_valid(f_wq))
1501 f_wq = NULL;
1502 }
1503
1504 /* record the first thread's wqset in the fileproc structure */
1505 if (!fp->f_wset)
1506 fp->f_wset = (void *)wqset;
1507
1508 /* handles NULL f_wq */
1509 return waitq_get_prepost_id(f_wq);
1510}
1511
1512
1513/*
1514 * selscan
1515 *
1516 * Parameters: p Process performing the select
1517 * sel The per-thread select context structure
1518 * nfd The number of file descriptors to scan
1519 * retval The per thread system call return area
1520 * sel_pass Which pass this is; allowed values are
1521 * SEL_FIRSTPASS and SEL_SECONDPASS
1522 * wqset The per thread wait queue set
1523 *
1524 * Returns: 0 Success
1525 * EIO Invalid p->p_fd field XXX Obsolete?
1526 * EBADF One of the files in the bit vector is
1527 * invalid.
1528 */
1529static int
1530selscan(struct proc *p, struct _select *sel, struct _select_data * seldata,
1531 int nfd, int32_t *retval, int sel_pass, struct waitq_set *wqset)
1532{
1533 struct filedesc *fdp = p->p_fd;
1534 int msk, i, j, fd;
1535 u_int32_t bits;
1536 struct fileproc *fp;
1537 int n = 0; /* count of bits */
1538 int nc = 0; /* bit vector offset (nc'th bit) */
1539 static int flag[3] = { FREAD, FWRITE, 0 };
1540 u_int32_t *iptr, *optr;
1541 u_int nw;
1542 u_int32_t *ibits, *obits;
1543 uint64_t reserved_link, *rl_ptr = NULL;
1544 int count;
1545 struct vfs_context context = *vfs_context_current();
1546
1547 /*
1548 * Problems were seen at reboot due to Mac OS X signal handling
1549 * problems (Beaker1C); verify that p->p_fd is valid
1550 */
1551 if (fdp == NULL) {
1552 *retval=0;
1553 return(EIO);
1554 }
1555 ibits = sel->ibits;
1556 obits = sel->obits;
1557
1558 nw = howmany(nfd, NFDBITS);
1559
1560 count = seldata->count;
1561
1562 nc = 0;
1563 if (!count) {
1564 *retval = 0;
1565 return 0;
1566 }
1567
1568 proc_fdlock(p);
1569 for (msk = 0; msk < 3; msk++) {
1570 iptr = (u_int32_t *)&ibits[msk * nw];
1571 optr = (u_int32_t *)&obits[msk * nw];
1572
1573 for (i = 0; i < nfd; i += NFDBITS) {
1574 bits = iptr[i/NFDBITS];
1575
1576 while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
1577 bits &= ~(1 << j);
1578
1579 if (fd < fdp->fd_nfiles)
1580 fp = fdp->fd_ofiles[fd];
1581 else
1582 fp = NULL;
1583
1584 if (fp == NULL || (fdp->fd_ofileflags[fd] & UF_RESERVED)) {
1585 /*
1586 * If we abort because of a bad
1587 * fd, let the caller unwind...
1588 */
1589 proc_fdunlock(p);
1590 return(EBADF);
1591 }
1592 if (sel_pass == SEL_SECONDPASS) {
1593 reserved_link = 0;
1594 rl_ptr = NULL;
1595 selunlinkfp(fp, seldata->wqp[nc], wqset);
1596 } else {
1597 reserved_link = waitq_link_reserve((struct waitq *)wqset);
1598 rl_ptr = &reserved_link;
1599 if (fp->f_flags & FP_INSELECT)
1600 /* someone is already in select on this fp */
1601 fp->f_flags |= FP_SELCONFLICT;
1602 else
1603 fp->f_flags |= FP_INSELECT;
1604
1605 waitq_set_lazy_init_link(wqset);
1606 }
1607
1608 context.vc_ucred = fp->f_cred;
1609
1610 /*
1611 * stash this value b/c fo_select may replace
1612 * reserved_link with a pointer to a waitq object
1613 */
1614 uint64_t rsvd = reserved_link;
1615
1616 /* The select; set the bit, if true */
1617 if (fp->f_ops && fp->f_type
1618 && fo_select(fp, flag[msk], rl_ptr, &context)) {
1619 optr[fd/NFDBITS] |= (1 << (fd % NFDBITS));
1620 n++;
1621 }
1622 if (sel_pass == SEL_FIRSTPASS) {
1623 waitq_link_release(rsvd);
1624 /*
1625 * If the fp's supporting selinfo structure was linked
1626 * to this thread's waitq set, then 'reserved_link'
1627 * will have been updated by selrecord to be a pointer
1628 * to the selinfo's waitq.
1629 */
1630 if (reserved_link == rsvd)
1631 rl_ptr = NULL; /* fo_select never called selrecord() */
1632 /*
1633 * Hook up the thread's waitq set either to
1634 * the fileproc structure, or to the global
1635 * conflict queue: but only on the first
1636 * select pass.
1637 */
1638 seldata->wqp[nc] = sellinkfp(fp, (void **)rl_ptr, wqset);
1639 }
1640 nc++;
1641 }
1642 }
1643 }
1644 proc_fdunlock(p);
1645
1646 *retval = n;
1647 return (0);
1648}
1649
1650int poll_callback(struct kqueue *, struct kevent_internal_s *, void *);
1651
1652struct poll_continue_args {
1653 user_addr_t pca_fds;
1654 u_int pca_nfds;
1655 u_int pca_rfds;
1656};
1657
1658int
1659poll(struct proc *p, struct poll_args *uap, int32_t *retval)
1660{
1661 __pthread_testcancel(1);
1662 return(poll_nocancel(p, (struct poll_nocancel_args *)uap, retval));
1663}
1664
1665
1666int
1667poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval)
1668{
1669 struct poll_continue_args *cont;
1670 struct pollfd *fds;
1671 struct kqueue *kq;
1672 struct timeval atv;
1673 int ncoll, error = 0;
1674 u_int nfds = uap->nfds;
1675 u_int rfds = 0;
1676 u_int i;
1677 size_t ni;
1678
1679 /*
1680 * This is kinda bogus. We have fd limits, but that is not
1681 * really related to the size of the pollfd array. Make sure
1682 * we let the process use at least FD_SETSIZE entries and at
1683 * least enough for the current limits. We want to be reasonably
1684 * safe, but not overly restrictive.
1685 */
1686 if (nfds > OPEN_MAX ||
1687 (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && (proc_suser(p) || nfds > FD_SETSIZE)))
1688 return (EINVAL);
1689
1690 kq = kqueue_alloc(p, 0);
1691 if (kq == NULL)
1692 return (EAGAIN);
1693
1694 ni = nfds * sizeof(struct pollfd) + sizeof(struct poll_continue_args);
1695 MALLOC(cont, struct poll_continue_args *, ni, M_TEMP, M_WAITOK);
1696 if (NULL == cont) {
1697 error = EAGAIN;
1698 goto out;
1699 }
1700
1701 fds = (struct pollfd *)&cont[1];
1702 error = copyin(uap->fds, fds, nfds * sizeof(struct pollfd));
1703 if (error)
1704 goto out;
1705
1706 if (uap->timeout != -1) {
1707 struct timeval rtv;
1708
1709 atv.tv_sec = uap->timeout / 1000;
1710 atv.tv_usec = (uap->timeout % 1000) * 1000;
1711 if (itimerfix(&atv)) {
1712 error = EINVAL;
1713 goto out;
1714 }
1715 getmicrouptime(&rtv);
1716 timevaladd(&atv, &rtv);
1717 } else {
1718 atv.tv_sec = 0;
1719 atv.tv_usec = 0;
1720 }
1721
1722 /* JMM - all this P_SELECT stuff is bogus */
1723 ncoll = nselcoll;
1724 OSBitOrAtomic(P_SELECT, &p->p_flag);
1725 for (i = 0; i < nfds; i++) {
1726 short events = fds[i].events;
1727 KNOTE_LOCK_CTX(knlc);
1728 __assert_only int rc;
1729
1730 /* per spec, ignore fd values below zero */
1731 if (fds[i].fd < 0) {
1732 fds[i].revents = 0;
1733 continue;
1734 }
1735
1736 /* convert the poll event into a kqueue kevent */
1737 struct kevent_internal_s kev = {
1738 .ident = fds[i].fd,
1739 .flags = EV_ADD | EV_ONESHOT | EV_POLL,
1740 .udata = CAST_USER_ADDR_T(&fds[i]) };
1741
1742 /* Handle input events */
1743 if (events & ( POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND | POLLHUP )) {
1744 kev.filter = EVFILT_READ;
1745 if (events & ( POLLPRI | POLLRDBAND ))
1746 kev.flags |= EV_OOBAND;
1747 rc = kevent_register(kq, &kev, &knlc);
1748 assert((rc & FILTER_REGISTER_WAIT) == 0);
1749 }
1750
1751 /* Handle output events */
1752 if ((kev.flags & EV_ERROR) == 0 &&
1753 (events & ( POLLOUT | POLLWRNORM | POLLWRBAND ))) {
1754 kev.filter = EVFILT_WRITE;
1755 rc = kevent_register(kq, &kev, &knlc);
1756 assert((rc & FILTER_REGISTER_WAIT) == 0);
1757 }
1758
1759 /* Handle BSD extension vnode events */
1760 if ((kev.flags & EV_ERROR) == 0 &&
1761 (events & ( POLLEXTEND | POLLATTRIB | POLLNLINK | POLLWRITE ))) {
1762 kev.filter = EVFILT_VNODE;
1763 kev.fflags = 0;
1764 if (events & POLLEXTEND)
1765 kev.fflags |= NOTE_EXTEND;
1766 if (events & POLLATTRIB)
1767 kev.fflags |= NOTE_ATTRIB;
1768 if (events & POLLNLINK)
1769 kev.fflags |= NOTE_LINK;
1770 if (events & POLLWRITE)
1771 kev.fflags |= NOTE_WRITE;
1772 rc = kevent_register(kq, &kev, &knlc);
1773 assert((rc & FILTER_REGISTER_WAIT) == 0);
1774 }
1775
1776 if (kev.flags & EV_ERROR) {
1777 fds[i].revents = POLLNVAL;
1778 rfds++;
1779 } else
1780 fds[i].revents = 0;
1781 }
1782
1783 /*
1784 * Did we have any trouble registering?
1785 * If user space passed 0 FDs, then respect any timeout value passed.
1786 * This is an extremely inefficient sleep. If user space passed one or
1787 * more FDs, and we had trouble registering _all_ of them, then bail
1788 * out. If a subset of the provided FDs failed to register, then we
1789 * will still call the kqueue_scan function.
1790 */
1791 if (nfds && (rfds == nfds))
1792 goto done;
1793
1794 /*
1795 * If any events have trouble registering, an event has fired and we
1796 * shouldn't wait for events in kqueue_scan -- use the current time as
1797 * the deadline.
1798 */
1799 if (rfds)
1800 getmicrouptime(&atv);
1801
1802 /* scan for, and possibly wait for, the kevents to trigger */
1803 cont->pca_fds = uap->fds;
1804 cont->pca_nfds = nfds;
1805 cont->pca_rfds = rfds;
1806 error = kqueue_scan(kq, poll_callback, NULL, cont, NULL, &atv, p);
1807 rfds = cont->pca_rfds;
1808
1809 done:
1810 OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
1811 /* poll is not restarted after signals... */
1812 if (error == ERESTART)
1813 error = EINTR;
1814 if (error == EWOULDBLOCK)
1815 error = 0;
1816 if (error == 0) {
1817 error = copyout(fds, uap->fds, nfds * sizeof(struct pollfd));
1818 *retval = rfds;
1819 }
1820
1821 out:
1822 if (NULL != cont)
1823 FREE(cont, M_TEMP);
1824
1825 kqueue_dealloc(kq);
1826 return (error);
1827}
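/*
 * Illustrative userspace sketch (not kernel code): each pollfd above is
 * translated into one-shot kevents (EVFILT_READ / EVFILT_WRITE /
 * EVFILT_VNODE), so a call like the one below ends up as a kqueue_scan()
 * with a deadline.  The descriptors are hypothetical.
 *
 *     struct pollfd pfd[2] = {
 *         { .fd = sock, .events = POLLIN  },
 *         { .fd = tty,  .events = POLLOUT },
 *     };
 *     int n = poll(pfd, 2, 1000);          // 1000 ms timeout
 *     if (n > 0 && (pfd[0].revents & POLLIN))
 *         ;  // sock has data to read
 */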
1828
1829int
1830poll_callback(__unused struct kqueue *kq, struct kevent_internal_s *kevp, void *data)
1831{
1832 struct poll_continue_args *cont = (struct poll_continue_args *)data;
1833 struct pollfd *fds = CAST_DOWN(struct pollfd *, kevp->udata);
1834 short prev_revents = fds->revents;
1835 short mask = 0;
1836
1837 /* convert the results back into revents */
1838 if (kevp->flags & EV_EOF)
1839 fds->revents |= POLLHUP;
1840 if (kevp->flags & EV_ERROR)
1841 fds->revents |= POLLERR;
1842
1843 switch (kevp->filter) {
1844 case EVFILT_READ:
1845 if (fds->revents & POLLHUP)
1846 mask = (POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND );
1847 else {
1848 mask = (POLLIN | POLLRDNORM);
1849 if (kevp->flags & EV_OOBAND)
1850 mask |= (POLLPRI | POLLRDBAND);
1851 }
1852 fds->revents |= (fds->events & mask);
1853 break;
1854
1855 case EVFILT_WRITE:
1856 if (!(fds->revents & POLLHUP))
1857 fds->revents |= (fds->events & ( POLLOUT | POLLWRNORM | POLLWRBAND ));
1858 break;
1859
1860 case EVFILT_VNODE:
1861 if (kevp->fflags & NOTE_EXTEND)
1862 fds->revents |= (fds->events & POLLEXTEND);
1863 if (kevp->fflags & NOTE_ATTRIB)
1864 fds->revents |= (fds->events & POLLATTRIB);
1865 if (kevp->fflags & NOTE_LINK)
1866 fds->revents |= (fds->events & POLLNLINK);
1867 if (kevp->fflags & NOTE_WRITE)
1868 fds->revents |= (fds->events & POLLWRITE);
1869 break;
1870 }
1871
1872 if (fds->revents != 0 && prev_revents == 0)
1873 cont->pca_rfds++;
1874
1875 return 0;
1876}
1877
1878int
1879seltrue(__unused dev_t dev, __unused int flag, __unused struct proc *p)
1880{
1881
1882 return (1);
1883}
1884
1885/*
1886 * selcount
1887 *
1888 * Count the number of bits set in the input bit vector, and establish an
1889 * outstanding fp->f_iocount for each of the descriptors which will be in
1890 * use in the select operation.
1891 *
1892 * Parameters: p The process doing the select
1893 * ibits The input bit vector
1894 * nfd The number of fd's in the vector
1895 * countp Pointer to where to store the bit count
1896 *
1897 * Returns: 0 Success
1898 * EIO Bad per process open file table
1899 * EBADF One of the bits in the input bit vector
1900 * references an invalid fd
1901 *
1902 * Implicit: *countp (modified) Count of fd's
1903 *
1904 * Notes: This function is the first pass under the proc_fdlock() that
1905 * permits us to recognize invalid descriptors in the bit vector;
1906 * they may, however, not remain valid through the drop and
1907 * later reacquisition of the proc_fdlock().
1908 */
1909static int
1910selcount(struct proc *p, u_int32_t *ibits, int nfd, int *countp)
1911{
1912 struct filedesc *fdp = p->p_fd;
1913 int msk, i, j, fd;
1914 u_int32_t bits;
1915 struct fileproc *fp;
1916 int n = 0;
1917 u_int32_t *iptr;
1918 u_int nw;
1919 int error=0;
1920 int dropcount;
1921 int need_wakeup = 0;
1922
1923 /*
1924 * Problems were seen at reboot due to Mac OS X signal handling
1925 * problems (Beaker1C); verify that p->p_fd is valid
1926 */
1927 if (fdp == NULL) {
1928 *countp = 0;
1929 return(EIO);
1930 }
1931 nw = howmany(nfd, NFDBITS);
1932
1933 proc_fdlock(p);
1934 for (msk = 0; msk < 3; msk++) {
1935 iptr = (u_int32_t *)&ibits[msk * nw];
1936 for (i = 0; i < nfd; i += NFDBITS) {
1937 bits = iptr[i/NFDBITS];
1938 while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
1939 bits &= ~(1 << j);
1940
1941 if (fd < fdp->fd_nfiles)
1942 fp = fdp->fd_ofiles[fd];
1943 else
1944 fp = NULL;
1945
1946 if (fp == NULL ||
1947 (fdp->fd_ofileflags[fd] & UF_RESERVED)) {
1948 *countp = 0;
1949 error = EBADF;
1950 goto bad;
1951 }
1952 fp->f_iocount++;
1953 n++;
1954 }
1955 }
1956 }
1957 proc_fdunlock(p);
1958
1959 *countp = n;
1960 return (0);
1961
1962bad:
1963 dropcount = 0;
1964
1965 if (n == 0)
1966 goto out;
1967 /* Ignore error return; it's already EBADF */
1968 (void)seldrop_locked(p, ibits, nfd, n, &need_wakeup, 1);
1969
1970out:
1971 proc_fdunlock(p);
1972 if (need_wakeup) {
1973 wakeup(&p->p_fpdrainwait);
1974 }
1975 return(error);
1976}
1977
1978
1979/*
1980 * seldrop_locked
1981 *
1982 * Drop outstanding wait queue references set up during selscan(); drop the
1983 * outstanding per fileproc f_iocount() picked up during the selcount().
1984 *
1985 * Parameters: p Process performing the select
1986 * ibits Input bit vector of fd's
1987 * nfd Number of fd's
1988 * lim Limit to number of vector entries to
1989 * consider, or -1 for "all"
1990 * fromselcount True if called from selcount(); drops are limited to 'lim'
1991 * need_wakeup Pointer to flag to set to do a wakeup
1992 * if f_iocount on any descriptor goes to 0
1993 *
1994 * Returns: 0 Success
1995 * EBADF One or more fds in the bit vector
1996 * were invalid, but the rest
1997 * were successfully dropped
1998 *
1999 * Notes: An fd may become bad while the proc_fdlock() is not held,
2000 * if a multithreaded application closes the fd out from under
2001 * the in progress select. In this case, we still have to
2002 * clean up after the set up on the remaining fds.
2003 */
2004static int
2005seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wakeup, int fromselcount)
2006{
2007 struct filedesc *fdp = p->p_fd;
2008 int msk, i, j, nc, fd;
2009 u_int32_t bits;
2010 struct fileproc *fp;
2011 u_int32_t *iptr;
2012 u_int nw;
2013 int error = 0;
2014 int dropcount = 0;
2015 uthread_t uth = get_bsdthread_info(current_thread());
2016 struct _select_data *seldata;
2017
2018 *need_wakeup = 0;
2019
2020 /*
	 * Problems seen during reboot due to MacOSX signal handling
	 * issues (Beaker1C); verify that p->p_fd is valid
2023 */
2024 if (fdp == NULL) {
2025 return(EIO);
2026 }
2027
2028 nw = howmany(nfd, NFDBITS);
2029 seldata = &uth->uu_save.uus_select_data;
2030
2031 nc = 0;
2032 for (msk = 0; msk < 3; msk++) {
2033 iptr = (u_int32_t *)&ibits[msk * nw];
2034 for (i = 0; i < nfd; i += NFDBITS) {
2035 bits = iptr[i/NFDBITS];
2036 while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
2037 bits &= ~(1 << j);
2038 fp = fdp->fd_ofiles[fd];
2039 /*
2040 * If we've already dropped as many as were
2041 * counted/scanned, then we are done.
2042 */
2043 if ((fromselcount != 0) && (++dropcount > lim))
2044 goto done;
2045
2046 /*
2047 * unlink even potentially NULL fileprocs.
2048 * If the FD was closed from under us, we
				 * still need to clean up the waitq links!
2050 */
2051 selunlinkfp(fp,
2052 seldata->wqp ? seldata->wqp[nc] : 0,
2053 uth->uu_wqset);
2054
2055 nc++;
2056
2057 if (fp == NULL) {
2058 /* skip (now) bad fds */
2059 error = EBADF;
2060 continue;
2061 }
2062
2063 fp->f_iocount--;
2064 if (fp->f_iocount < 0)
2065 panic("f_iocount overdecrement!");
2066
2067 if (fp->f_iocount == 0) {
2068 /*
2069 * The last iocount is responsible for clearing
					 * selconflict flag - even if we didn't set it -
2071 * and is also responsible for waking up anyone
2072 * waiting on iocounts to drain.
2073 */
2074 if (fp->f_flags & FP_SELCONFLICT)
2075 fp->f_flags &= ~FP_SELCONFLICT;
2076 if (p->p_fpdrainwait) {
2077 p->p_fpdrainwait = 0;
2078 *need_wakeup = 1;
2079 }
2080 }
2081 }
2082 }
2083 }
2084done:
2085 return (error);
2086}
2087
2088
2089static int
2090seldrop(struct proc *p, u_int32_t *ibits, int nfd)
2091{
2092 int error;
2093 int need_wakeup = 0;
2094
2095 proc_fdlock(p);
2096 error = seldrop_locked(p, ibits, nfd, nfd, &need_wakeup, 0);
2097 proc_fdunlock(p);
2098 if (need_wakeup) {
2099 wakeup(&p->p_fpdrainwait);
2100 }
2101 return (error);
2102}
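/*
 * Illustrative lifecycle sketch (derived from the routines above, not a new
 * interface): a select(2) pass is bracketed roughly as
 *
 *	selcount(p, ibits, nfd, &count);	// take f_iocount refs under proc_fdlock
 *	...selscan() registers interest; the thread may then block...
 *	seldrop(p, ibits, nfd);			// release refs; wake f_iocount waiters
 *
 * seldrop_locked() is the shared core used both by seldrop() and by the
 * selcount() error path.
 */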
2103
2104/*
2105 * Record a select request.
2106 */
2107void
2108selrecord(__unused struct proc *selector, struct selinfo *sip, void *s_data)
2109{
2110 thread_t cur_act = current_thread();
2111 struct uthread * ut = get_bsdthread_info(cur_act);
2112 /* on input, s_data points to the 64-bit ID of a reserved link object */
2113 uint64_t *reserved_link = (uint64_t *)s_data;
2114
2115 /* need to look at collisions */
2116
	/* do not record if this is the second pass of select */
2118 if (!s_data)
2119 return;
2120
2121 if ((sip->si_flags & SI_INITED) == 0) {
2122 waitq_init(&sip->si_waitq, SYNC_POLICY_FIFO);
2123 sip->si_flags |= SI_INITED;
2124 sip->si_flags &= ~SI_CLEAR;
2125 }
2126
2127 if (sip->si_flags & SI_RECORDED)
2128 sip->si_flags |= SI_COLL;
2129 else
2130 sip->si_flags &= ~SI_COLL;
2131
2132 sip->si_flags |= SI_RECORDED;
2133 /* note: this checks for pre-existing linkage */
2134 waitq_link(&sip->si_waitq, ut->uu_wqset,
2135 WAITQ_SHOULD_LOCK, reserved_link);
2136
2137 /*
2138 * Always consume the reserved link.
2139 * We can always call waitq_link_release() safely because if
2140 * waitq_link is successful, it consumes the link and resets the
2141 * value to 0, in which case our call to release becomes a no-op.
2142 * If waitq_link fails, then the following release call will actually
2143 * release the reserved link object.
2144 */
2145 waitq_link_release(*reserved_link);
2146 *reserved_link = 0;
2147
2148 /*
	 * Use the s_data pointer as an output parameter as well.
	 * This avoids changing the prototype for this function which is
2151 * used by many kexts. We need to surface the waitq object
2152 * associated with the selinfo we just added to the thread's select
2153 * set. New waitq sets do not have back-pointers to set members, so
2154 * the only way to clear out set linkage objects is to go from the
2155 * waitq to the set. We use a memcpy because s_data could be
2156 * pointing to an unaligned value on the stack
2157 * (especially on 32-bit systems)
2158 */
2159 void *wqptr = (void *)&sip->si_waitq;
2160 memcpy((void *)s_data, (void *)&wqptr, sizeof(void *));
2161
2162 return;
2163}
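/*
 * Illustrative driver-side sketch (hypothetical device, not from this file):
 * a character driver's select entry point typically forwards the opaque wql
 * token it is handed straight into selrecord(), and the data-producing path
 * later calls selwakeup() on the same selinfo:
 *
 *	int
 *	mydev_select(dev_t dev, int which, void *wql, struct proc *p)
 *	{
 *		struct mydev_softc *sc = mydev_lookup(dev);	// hypothetical helpers
 *
 *		if (which == FREAD) {
 *			if (sc->bytes_ready)
 *				return (1);			// already readable
 *			selrecord(p, &sc->read_si, wql);	// sc->read_si is a struct selinfo
 *		}
 *		return (0);
 *	}
 *
 *	// producer side, holding the same lock the select path held:
 *	sc->bytes_ready = 1;
 *	selwakeup(&sc->read_si);
 */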
2164
2165void
2166selwakeup(struct selinfo *sip)
2167{
2168
2169 if ((sip->si_flags & SI_INITED) == 0) {
2170 return;
2171 }
2172
2173 if (sip->si_flags & SI_COLL) {
2174 nselcoll++;
2175 sip->si_flags &= ~SI_COLL;
2176#if 0
2177 /* will not support */
2178 //wakeup((caddr_t)&selwait);
2179#endif
2180 }
2181
2182 if (sip->si_flags & SI_RECORDED) {
2183 waitq_wakeup64_all(&sip->si_waitq, NO_EVENT64,
2184 THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
2185 sip->si_flags &= ~SI_RECORDED;
2186 }
2187
2188}
2189
2190void
2191selthreadclear(struct selinfo *sip)
2192{
2193 struct waitq *wq;
2194
2195 if ((sip->si_flags & SI_INITED) == 0) {
2196 return;
2197 }
2198 if (sip->si_flags & SI_RECORDED) {
2199 selwakeup(sip);
2200 sip->si_flags &= ~(SI_RECORDED | SI_COLL);
2201 }
2202 sip->si_flags |= SI_CLEAR;
2203 sip->si_flags &= ~SI_INITED;
2204
2205 wq = &sip->si_waitq;
2206
2207 /*
2208 * Higher level logic may have a handle on this waitq's prepost ID,
2209 * but that's OK because the waitq_deinit will remove/invalidate the
2210 * prepost object (as well as mark the waitq invalid). This de-couples
2211 * us from any callers that may have a handle to this waitq via the
2212 * prepost ID.
2213 */
2214 waitq_deinit(wq);
2215}
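/*
 * Usage note (not original documentation): selthreadclear() is what an
 * object's teardown path is expected to call before freeing the memory that
 * embeds the selinfo - it wakes any recorded selectors via selwakeup() and
 * then invalidates the waitq, so no stale linkage can reach the freed
 * structure afterwards.
 */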
2216
2217
2218
2219
2220#define DBG_POST 0x10
2221#define DBG_WATCH 0x11
2222#define DBG_WAIT 0x12
2223#define DBG_MOD 0x13
2224#define DBG_EWAKEUP 0x14
2225#define DBG_ENQUEUE 0x15
2226#define DBG_DEQUEUE 0x16
2227
2228#define DBG_MISC_POST MISCDBG_CODE(DBG_EVENT,DBG_POST)
2229#define DBG_MISC_WATCH MISCDBG_CODE(DBG_EVENT,DBG_WATCH)
2230#define DBG_MISC_WAIT MISCDBG_CODE(DBG_EVENT,DBG_WAIT)
2231#define DBG_MISC_MOD MISCDBG_CODE(DBG_EVENT,DBG_MOD)
2232#define DBG_MISC_EWAKEUP MISCDBG_CODE(DBG_EVENT,DBG_EWAKEUP)
2233#define DBG_MISC_ENQUEUE MISCDBG_CODE(DBG_EVENT,DBG_ENQUEUE)
2234#define DBG_MISC_DEQUEUE MISCDBG_CODE(DBG_EVENT,DBG_DEQUEUE)
2235
2236
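/*
 * EVPROCDEQUE: remove 'evq' from the owning proc's pending-event list if it
 * is currently queued, taking and dropping the proc lock around the check.
 * The do/while(0) wrapper lets the macro be used like a single statement.
 */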
2237#define EVPROCDEQUE(p, evq) do { \
2238 proc_lock(p); \
2239 if (evq->ee_flags & EV_QUEUED) { \
2240 TAILQ_REMOVE(&p->p_evlist, evq, ee_plist); \
2241 evq->ee_flags &= ~EV_QUEUED; \
2242 } \
2243 proc_unlock(p); \
} while (0)
2245
2246
2247/*
 * called upon socket close. dequeue and free all events for
2249 * the socket... socket must be locked by caller.
2250 */
2251void
2252evsofree(struct socket *sp)
2253{
2254 struct eventqelt *evq, *next;
2255 proc_t p;
2256
2257 if (sp == NULL)
2258 return;
2259
2260 for (evq = sp->so_evlist.tqh_first; evq != NULL; evq = next) {
2261 next = evq->ee_slist.tqe_next;
2262 p = evq->ee_proc;
2263
2264 if (evq->ee_flags & EV_QUEUED) {
2265 EVPROCDEQUE(p, evq);
2266 }
2267 TAILQ_REMOVE(&sp->so_evlist, evq, ee_slist); // remove from socket q
2268 FREE(evq, M_TEMP);
2269 }
2270}
2271
2272
2273/*
 * called upon pipe close. dequeue and free all events for
2275 * the pipe... pipe must be locked by caller
2276 */
2277void
2278evpipefree(struct pipe *cpipe)
2279{
2280 struct eventqelt *evq, *next;
2281 proc_t p;
2282
2283 for (evq = cpipe->pipe_evlist.tqh_first; evq != NULL; evq = next) {
2284 next = evq->ee_slist.tqe_next;
2285 p = evq->ee_proc;
2286
2287 EVPROCDEQUE(p, evq);
2288
2289 TAILQ_REMOVE(&cpipe->pipe_evlist, evq, ee_slist); // remove from pipe q
2290 FREE(evq, M_TEMP);
2291 }
2292}
2293
2294
2295/*
 * enqueue this event if it's not already queued. wakeup
 * the proc if we do queue this event to it...
 * the proc lock is taken and dropped internally; we drop it
 * before doing the wakeup, so we return without it held
2300 */
2301static void
2302evprocenque(struct eventqelt *evq)
2303{
2304 proc_t p;
2305
2306 assert(evq);
2307 p = evq->ee_proc;
2308
2309 KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_START, (uint32_t)evq, evq->ee_flags, evq->ee_eventmask,0,0);
2310
2311 proc_lock(p);
2312
2313 if (evq->ee_flags & EV_QUEUED) {
2314 proc_unlock(p);
2315
2316 KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_END, 0,0,0,0,0);
2317 return;
2318 }
2319 evq->ee_flags |= EV_QUEUED;
2320
2321 TAILQ_INSERT_TAIL(&p->p_evlist, evq, ee_plist);
2322
2323 proc_unlock(p);
2324
2325 wakeup(&p->p_evlist);
2326
2327 KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_END, 0,0,0,0,0);
2328}
2329
2330
2331/*
2332 * pipe lock must be taken by the caller
2333 */
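/*
 * Worked example for the EV_RWBYTES case below (illustrative numbers,
 * assuming the typical PIPE_SIZE of 16384 and PIPE_BUF of 512): with 1000
 * bytes buffered, a reader sees EV_RE with er_rcnt = 1000, and a writer sees
 * 16384 - 1000 = 15384 bytes of space, which is >= PIPE_BUF, so EV_WR is
 * reported with er_wcnt = 15384.
 */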
2334void
2335postpipeevent(struct pipe *pipep, int event)
2336{
2337 int mask;
2338 struct eventqelt *evq;
2339
2340 if (pipep == NULL)
2341 return;
2342 KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_START, event,0,0,1,0);
2343
2344 for (evq = pipep->pipe_evlist.tqh_first;
2345 evq != NULL; evq = evq->ee_slist.tqe_next) {
2346
2347 if (evq->ee_eventmask == 0)
2348 continue;
2349 mask = 0;
2350
2351 switch (event & (EV_RWBYTES | EV_RCLOSED | EV_WCLOSED)) {
2352
2353 case EV_RWBYTES:
2354 if ((evq->ee_eventmask & EV_RE) && pipep->pipe_buffer.cnt) {
2355 mask |= EV_RE;
2356 evq->ee_req.er_rcnt = pipep->pipe_buffer.cnt;
2357 }
2358 if ((evq->ee_eventmask & EV_WR) &&
2359 (MAX(pipep->pipe_buffer.size,PIPE_SIZE) - pipep->pipe_buffer.cnt) >= PIPE_BUF) {
2360
2361 if (pipep->pipe_state & PIPE_EOF) {
2362 mask |= EV_WR|EV_RESET;
2363 break;
2364 }
2365 mask |= EV_WR;
2366 evq->ee_req.er_wcnt = MAX(pipep->pipe_buffer.size, PIPE_SIZE) - pipep->pipe_buffer.cnt;
2367 }
2368 break;
2369
2370 case EV_WCLOSED:
2371 case EV_RCLOSED:
2372 if ((evq->ee_eventmask & EV_RE)) {
2373 mask |= EV_RE|EV_RCLOSED;
2374 }
2375 if ((evq->ee_eventmask & EV_WR)) {
2376 mask |= EV_WR|EV_WCLOSED;
2377 }
2378 break;
2379
2380 default:
2381 return;
2382 }
2383 if (mask) {
2384 /*
2385 * disarm... postevents are nops until this event is 'read' via
2386 * waitevent and then re-armed via modwatch
2387 */
2388 evq->ee_eventmask = 0;
2389
2390 /*
2391 * since events are disarmed until after the waitevent
2392 * the ee_req.er_xxxx fields can't change once we've
2393 * inserted this event into the proc queue...
2394 * therefore, the waitevent will see a 'consistent'
2395 * snapshot of the event, even though it won't hold
2396 * the pipe lock, and we're updating the event outside
2397 * of the proc lock, which it will hold
2398 */
2399 evq->ee_req.er_eventbits |= mask;
2400
2401 KERNEL_DEBUG(DBG_MISC_POST, (uint32_t)evq, evq->ee_req.er_eventbits, mask, 1,0);
2402
2403 evprocenque(evq);
2404 }
2405 }
2406 KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, 0,0,0,1,0);
2407}
2408
2409#if SOCKETS
2410/*
 * given either a sockbuf or a socket, run down the
 * event list and queue any ready events found...
2413 * the socket must be locked by the caller
2414 */
2415void
2416postevent(struct socket *sp, struct sockbuf *sb, int event)
2417{
2418 int mask;
2419 struct eventqelt *evq;
2420 struct tcpcb *tp;
2421
2422 if (sb)
2423 sp = sb->sb_so;
2424 if (sp == NULL)
2425 return;
2426
2427 KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_START, (int)sp, event, 0, 0, 0);
2428
2429 for (evq = sp->so_evlist.tqh_first;
2430 evq != NULL; evq = evq->ee_slist.tqe_next) {
2431
2432 if (evq->ee_eventmask == 0)
2433 continue;
2434 mask = 0;
2435
2436 /* ready for reading:
2437 - byte cnt >= receive low water mark
2438 - read-half of conn closed
2439 - conn pending for listening sock
2440 - socket error pending
2441
2442 ready for writing
2443 - byte cnt avail >= send low water mark
2444 - write half of conn closed
2445 - socket error pending
2446 - non-blocking conn completed successfully
2447
2448 exception pending
2449 - out of band data
2450 - sock at out of band mark
2451 */
2452
2453 switch (event & EV_DMASK) {
2454
2455 case EV_OOB:
2456 if ((evq->ee_eventmask & EV_EX)) {
2457 if (sp->so_oobmark || ((sp->so_state & SS_RCVATMARK)))
2458 mask |= EV_EX|EV_OOB;
2459 }
2460 break;
2461
2462 case EV_RWBYTES|EV_OOB:
2463 if ((evq->ee_eventmask & EV_EX)) {
2464 if (sp->so_oobmark || ((sp->so_state & SS_RCVATMARK)))
2465 mask |= EV_EX|EV_OOB;
2466 }
2467 /*
2468 * fall into the next case
2469 */
2470 case EV_RWBYTES:
2471 if ((evq->ee_eventmask & EV_RE) && soreadable(sp)) {
2472 /* for AFP/OT purposes; may go away in future */
2473 if ((SOCK_DOM(sp) == PF_INET ||
2474 SOCK_DOM(sp) == PF_INET6) &&
2475 SOCK_PROTO(sp) == IPPROTO_TCP &&
2476 (sp->so_error == ECONNREFUSED ||
2477 sp->so_error == ECONNRESET)) {
2478 if (sp->so_pcb == NULL ||
2479 sotoinpcb(sp)->inp_state ==
2480 INPCB_STATE_DEAD ||
2481 (tp = sototcpcb(sp)) == NULL ||
2482 tp->t_state == TCPS_CLOSED) {
2483 mask |= EV_RE|EV_RESET;
2484 break;
2485 }
2486 }
2487 mask |= EV_RE;
2488 evq->ee_req.er_rcnt = sp->so_rcv.sb_cc;
2489
2490 if (sp->so_state & SS_CANTRCVMORE) {
2491 mask |= EV_FIN;
2492 break;
2493 }
2494 }
2495 if ((evq->ee_eventmask & EV_WR) && sowriteable(sp)) {
2496 /* for AFP/OT purposes; may go away in future */
2497 if ((SOCK_DOM(sp) == PF_INET ||
2498 SOCK_DOM(sp) == PF_INET6) &&
2499 SOCK_PROTO(sp) == IPPROTO_TCP &&
2500 (sp->so_error == ECONNREFUSED ||
2501 sp->so_error == ECONNRESET)) {
2502 if (sp->so_pcb == NULL ||
2503 sotoinpcb(sp)->inp_state ==
2504 INPCB_STATE_DEAD ||
2505 (tp = sototcpcb(sp)) == NULL ||
2506 tp->t_state == TCPS_CLOSED) {
2507 mask |= EV_WR|EV_RESET;
2508 break;
2509 }
2510 }
2511 mask |= EV_WR;
2512 evq->ee_req.er_wcnt = sbspace(&sp->so_snd);
2513 }
2514 break;
2515
2516 case EV_RCONN:
2517 if ((evq->ee_eventmask & EV_RE)) {
2518 mask |= EV_RE|EV_RCONN;
2519 evq->ee_req.er_rcnt = sp->so_qlen + 1; // incl this one
2520 }
2521 break;
2522
2523 case EV_WCONN:
2524 if ((evq->ee_eventmask & EV_WR)) {
2525 mask |= EV_WR|EV_WCONN;
2526 }
2527 break;
2528
2529 case EV_RCLOSED:
2530 if ((evq->ee_eventmask & EV_RE)) {
2531 mask |= EV_RE|EV_RCLOSED;
2532 }
2533 break;
2534
2535 case EV_WCLOSED:
2536 if ((evq->ee_eventmask & EV_WR)) {
2537 mask |= EV_WR|EV_WCLOSED;
2538 }
2539 break;
2540
2541 case EV_FIN:
2542 if (evq->ee_eventmask & EV_RE) {
2543 mask |= EV_RE|EV_FIN;
2544 }
2545 break;
2546
2547 case EV_RESET:
2548 case EV_TIMEOUT:
2549 if (evq->ee_eventmask & EV_RE) {
2550 mask |= EV_RE | event;
2551 }
2552 if (evq->ee_eventmask & EV_WR) {
2553 mask |= EV_WR | event;
2554 }
2555 break;
2556
2557 default:
2558 KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, (int)sp, -1, 0, 0, 0);
2559 return;
2560 } /* switch */
2561
2562 KERNEL_DEBUG(DBG_MISC_POST, (int)evq, evq->ee_eventmask, evq->ee_req.er_eventbits, mask, 0);
2563
2564 if (mask) {
2565 /*
2566 * disarm... postevents are nops until this event is 'read' via
2567 * waitevent and then re-armed via modwatch
2568 */
2569 evq->ee_eventmask = 0;
2570
2571 /*
2572 * since events are disarmed until after the waitevent
2573 * the ee_req.er_xxxx fields can't change once we've
2574 * inserted this event into the proc queue...
2575 * since waitevent can't see this event until we
2576 * enqueue it, waitevent will see a 'consistent'
2577 * snapshot of the event, even though it won't hold
2578 * the socket lock, and we're updating the event outside
2579 * of the proc lock, which it will hold
2580 */
2581 evq->ee_req.er_eventbits |= mask;
2582
2583 evprocenque(evq);
2584 }
2585 }
2586 KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, (int)sp, 0, 0, 0, 0);
2587}
2588#endif /* SOCKETS */
2589
2590
2591/*
2592 * watchevent system call. user passes us an event to watch
2593 * for. we malloc an event object, initialize it, and queue
2594 * it to the open socket. when the event occurs, postevent()
 * will enqueue it back to our proc where we can retrieve it
2596 * via waitevent().
2597 *
2598 * should this prevent duplicate events on same socket?
2599 *
2600 * Returns:
2601 * ENOMEM No memory for operation
2602 * copyin:EFAULT
2603 */
2604int
2605watchevent(proc_t p, struct watchevent_args *uap, __unused int *retval)
2606{
2607 struct eventqelt *evq = (struct eventqelt *)0;
2608 struct eventqelt *np = NULL;
2609 struct eventreq64 *erp;
2610 struct fileproc *fp = NULL;
2611 int error;
2612
2613 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_START, 0,0,0,0,0);
2614
2615 // get a qelt and fill with users req
2616 MALLOC(evq, struct eventqelt *, sizeof(struct eventqelt), M_TEMP, M_WAITOK);
2617
2618 if (evq == NULL)
2619 return (ENOMEM);
2620 erp = &evq->ee_req;
2621
2622 // get users request pkt
2623
2624 if (IS_64BIT_PROCESS(p)) {
2625 error = copyin(uap->u_req, (caddr_t)erp, sizeof(struct eventreq64));
2626 } else {
2627 struct eventreq32 er32;
2628
2629 error = copyin(uap->u_req, (caddr_t)&er32, sizeof(struct eventreq32));
2630 if (error == 0) {
2631 /*
2632 * the user only passes in the
2633 * er_type, er_handle and er_data...
2634 * the other fields are initialized
2635 * below, so don't bother to copy
2636 */
2637 erp->er_type = er32.er_type;
2638 erp->er_handle = er32.er_handle;
2639 erp->er_data = (user_addr_t)er32.er_data;
2640 }
2641 }
2642 if (error) {
2643 FREE(evq, M_TEMP);
2644 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, error,0,0,0,0);
2645
2646 return(error);
2647 }
2648 KERNEL_DEBUG(DBG_MISC_WATCH, erp->er_handle,uap->u_eventmask,(uint32_t)evq,0,0);
2649
2650 // validate, freeing qelt if errors
2651 error = 0;
2652 proc_fdlock(p);
2653
2654 if (erp->er_type != EV_FD) {
2655 error = EINVAL;
2656 } else if ((error = fp_lookup(p, erp->er_handle, &fp, 1)) != 0) {
2657 error = EBADF;
2658#if SOCKETS
2659 } else if (fp->f_type == DTYPE_SOCKET) {
2660 socket_lock((struct socket *)fp->f_data, 1);
2661 np = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
2662#endif /* SOCKETS */
2663 } else if (fp->f_type == DTYPE_PIPE) {
2664 PIPE_LOCK((struct pipe *)fp->f_data);
2665 np = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
2666 } else {
2667 fp_drop(p, erp->er_handle, fp, 1);
2668 error = EINVAL;
2669 }
2670 proc_fdunlock(p);
2671
2672 if (error) {
2673 FREE(evq, M_TEMP);
2674
2675 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, error,0,0,0,0);
2676 return(error);
2677 }
2678
2679 /*
2680 * only allow one watch per file per proc
2681 */
2682 for ( ; np != NULL; np = np->ee_slist.tqe_next) {
2683 if (np->ee_proc == p) {
2684#if SOCKETS
2685 if (fp->f_type == DTYPE_SOCKET)
2686 socket_unlock((struct socket *)fp->f_data, 1);
2687 else
2688#endif /* SOCKETS */
2689 PIPE_UNLOCK((struct pipe *)fp->f_data);
2690 fp_drop(p, erp->er_handle, fp, 0);
2691 FREE(evq, M_TEMP);
2692
2693 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, EINVAL,0,0,0,0);
2694 return(EINVAL);
2695 }
2696 }
2697 erp->er_ecnt = erp->er_rcnt = erp->er_wcnt = erp->er_eventbits = 0;
2698 evq->ee_proc = p;
2699 evq->ee_eventmask = uap->u_eventmask & EV_MASK;
2700 evq->ee_flags = 0;
2701
2702#if SOCKETS
2703 if (fp->f_type == DTYPE_SOCKET) {
2704 TAILQ_INSERT_TAIL(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
2705 postevent((struct socket *)fp->f_data, 0, EV_RWBYTES); // catch existing events
2706
2707 socket_unlock((struct socket *)fp->f_data, 1);
2708 } else
2709#endif /* SOCKETS */
2710 {
2711 TAILQ_INSERT_TAIL(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
2712 postpipeevent((struct pipe *)fp->f_data, EV_RWBYTES);
2713
2714 PIPE_UNLOCK((struct pipe *)fp->f_data);
2715 }
2716 fp_drop_event(p, erp->er_handle, fp);
2717
2718 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, 0,0,0,0,0);
2719 return(0);
2720}
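/*
 * Illustrative user-level sketch (based on the argument handling above; the
 * exact userspace declarations live in <sys/ev.h> on systems that still ship
 * this legacy interface):
 *
 *	struct eventreq er = { 0 };
 *	er.er_type   = EV_FD;		// only EV_FD is accepted
 *	er.er_handle = sock_fd;		// socket or pipe descriptor
 *	er.er_data   = (void *)ctx;	// opaque cookie returned with the event
 *
 *	watchevent(&er, EV_RE | EV_WR);	// arm read/write watches
 *	...
 *	waitevent(&er, NULL);		// block until an event is posted
 *	// er.er_eventbits now holds the posted EV_* bits
 *	modwatch(&er, EV_RE);		// re-arm for read only
 *	modwatch(&er, EV_RM);		// or tear the watch down
 */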
2721
2722
2723
2724/*
2725 * waitevent system call.
2726 * grabs the next waiting event for this proc and returns
 * it. if no events are pending, the user can request to sleep
 * (with or without a timeout) or to poll; poll mode is selected when
 * ((tv != NULL && interval == 0) || tv == -1)
2730 */
2731int
2732waitevent(proc_t p, struct waitevent_args *uap, int *retval)
2733{
2734 int error = 0;
2735 struct eventqelt *evq;
2736 struct eventreq64 *erp;
2737 uint64_t abstime, interval;
2738 boolean_t fast_poll = FALSE;
2739 union {
2740 struct eventreq64 er64;
2741 struct eventreq32 er32;
2742 } uer = {};
2743
2744 interval = 0;
2745
2746 if (uap->tv) {
2747 struct timeval atv;
2748 /*
2749 * check for fast poll method
2750 */
2751 if (IS_64BIT_PROCESS(p)) {
2752 if (uap->tv == (user_addr_t)-1)
2753 fast_poll = TRUE;
2754 } else if (uap->tv == (user_addr_t)((uint32_t)-1))
2755 fast_poll = TRUE;
2756
2757 if (fast_poll == TRUE) {
2758 if (p->p_evlist.tqh_first == NULL) {
2759 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_NONE, -1,0,0,0,0);
2760 /*
2761 * poll failed
2762 */
2763 *retval = 1;
2764 return (0);
2765 }
2766 proc_lock(p);
2767 goto retry;
2768 }
2769 if (IS_64BIT_PROCESS(p)) {
2770 struct user64_timeval atv64;
2771 error = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64));
2772 /* Loses resolution - assume timeout < 68 years */
2773 atv.tv_sec = atv64.tv_sec;
2774 atv.tv_usec = atv64.tv_usec;
2775 } else {
2776 struct user32_timeval atv32;
2777 error = copyin(uap->tv, (caddr_t)&atv32, sizeof(atv32));
2778 atv.tv_sec = atv32.tv_sec;
2779 atv.tv_usec = atv32.tv_usec;
2780 }
2781
2782 if (error)
2783 return(error);
2784 if (itimerfix(&atv)) {
2785 error = EINVAL;
2786 return(error);
2787 }
2788 interval = tvtoabstime(&atv);
2789 }
2790 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_START, 0,0,0,0,0);
2791
2792 proc_lock(p);
2793retry:
2794 if ((evq = p->p_evlist.tqh_first) != NULL) {
2795 /*
2796 * found one... make a local copy while it's still on the queue
2797 * to prevent it from changing while in the midst of copying
2798 * don't want to hold the proc lock across a copyout because
2799 * it might block on a page fault at the target in user space
2800 */
2801 erp = &evq->ee_req;
2802
2803 if (IS_64BIT_PROCESS(p))
2804 bcopy((caddr_t)erp, (caddr_t)&uer.er64, sizeof (struct eventreq64));
2805 else {
2806 uer.er32.er_type = erp->er_type;
2807 uer.er32.er_handle = erp->er_handle;
2808 uer.er32.er_data = (uint32_t)erp->er_data;
2809 uer.er32.er_ecnt = erp->er_ecnt;
2810 uer.er32.er_rcnt = erp->er_rcnt;
2811 uer.er32.er_wcnt = erp->er_wcnt;
2812 uer.er32.er_eventbits = erp->er_eventbits;
2813 }
2814 TAILQ_REMOVE(&p->p_evlist, evq, ee_plist);
2815
2816 evq->ee_flags &= ~EV_QUEUED;
2817
2818 proc_unlock(p);
2819
2820 if (IS_64BIT_PROCESS(p))
2821 error = copyout((caddr_t)&uer.er64, uap->u_req, sizeof(struct eventreq64));
2822 else
2823 error = copyout((caddr_t)&uer.er32, uap->u_req, sizeof(struct eventreq32));
2824
2825 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, error,
2826 evq->ee_req.er_handle,evq->ee_req.er_eventbits,(uint32_t)evq,0);
2827 return (error);
2828 }
2829 else {
2830 if (uap->tv && interval == 0) {
2831 proc_unlock(p);
2832 *retval = 1; // poll failed
2833
2834 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, error,0,0,0,0);
2835 return (error);
2836 }
2837 if (interval != 0)
2838 clock_absolutetime_interval_to_deadline(interval, &abstime);
2839 else
2840 abstime = 0;
2841
2842 KERNEL_DEBUG(DBG_MISC_WAIT, 1,(uint32_t)&p->p_evlist,0,0,0);
2843
2844 error = msleep1(&p->p_evlist, &p->p_mlock, (PSOCK | PCATCH), "waitevent", abstime);
2845
2846 KERNEL_DEBUG(DBG_MISC_WAIT, 2,(uint32_t)&p->p_evlist,0,0,0);
2847
2848 if (error == 0)
2849 goto retry;
2850 if (error == ERESTART)
2851 error = EINTR;
2852 if (error == EWOULDBLOCK) {
2853 *retval = 1;
2854 error = 0;
2855 }
2856 }
2857 proc_unlock(p);
2858
2859 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, 0,0,0,0,0);
2860 return (error);
2861}
2862
2863
2864/*
2865 * modwatch system call. user passes in event to modify.
 * if we find it we reset the event bits and queue/dequeue the event
 * as needed.
2868 */
2869int
2870modwatch(proc_t p, struct modwatch_args *uap, __unused int *retval)
2871{
2872 struct eventreq64 er;
2873 struct eventreq64 *erp = &er;
2874 struct eventqelt *evq = NULL; /* protected by error return */
2875 int error;
2876 struct fileproc *fp;
2877 int flag;
2878
2879 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_START, 0,0,0,0,0);
2880
2881 /*
2882 * get user's request pkt
2883 * just need the er_type and er_handle which sit above the
2884 * problematic er_data (32/64 issue)... so only copy in
2885 * those 2 fields
2886 */
2887 if ((error = copyin(uap->u_req, (caddr_t)erp, sizeof(er.er_type) + sizeof(er.er_handle)))) {
2888 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, error,0,0,0,0);
2889 return(error);
2890 }
2891 proc_fdlock(p);
2892
2893 if (erp->er_type != EV_FD) {
2894 error = EINVAL;
2895 } else if ((error = fp_lookup(p, erp->er_handle, &fp, 1)) != 0) {
2896 error = EBADF;
2897#if SOCKETS
2898 } else if (fp->f_type == DTYPE_SOCKET) {
2899 socket_lock((struct socket *)fp->f_data, 1);
2900 evq = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
2901#endif /* SOCKETS */
2902 } else if (fp->f_type == DTYPE_PIPE) {
2903 PIPE_LOCK((struct pipe *)fp->f_data);
2904 evq = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
2905 } else {
2906 fp_drop(p, erp->er_handle, fp, 1);
2907 error = EINVAL;
2908 }
2909
2910 if (error) {
2911 proc_fdunlock(p);
2912 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, error,0,0,0,0);
2913 return(error);
2914 }
2915
2916 if ((uap->u_eventmask == EV_RM) && (fp->f_flags & FP_WAITEVENT)) {
2917 fp->f_flags &= ~FP_WAITEVENT;
2918 }
2919 proc_fdunlock(p);
2920
2921 // locate event if possible
2922 for ( ; evq != NULL; evq = evq->ee_slist.tqe_next) {
2923 if (evq->ee_proc == p)
2924 break;
2925 }
2926 if (evq == NULL) {
2927#if SOCKETS
2928 if (fp->f_type == DTYPE_SOCKET)
2929 socket_unlock((struct socket *)fp->f_data, 1);
2930 else
2931#endif /* SOCKETS */
2932 PIPE_UNLOCK((struct pipe *)fp->f_data);
2933 fp_drop(p, erp->er_handle, fp, 0);
2934 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, EINVAL,0,0,0,0);
2935 return(EINVAL);
2936 }
2937 KERNEL_DEBUG(DBG_MISC_MOD, erp->er_handle,uap->u_eventmask,(uint32_t)evq,0,0);
2938
2939 if (uap->u_eventmask == EV_RM) {
2940 EVPROCDEQUE(p, evq);
2941
2942#if SOCKETS
2943 if (fp->f_type == DTYPE_SOCKET) {
2944 TAILQ_REMOVE(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
2945 socket_unlock((struct socket *)fp->f_data, 1);
2946 } else
2947#endif /* SOCKETS */
2948 {
2949 TAILQ_REMOVE(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
2950 PIPE_UNLOCK((struct pipe *)fp->f_data);
2951 }
2952 fp_drop(p, erp->er_handle, fp, 0);
2953 FREE(evq, M_TEMP);
2954 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, 0,0,0,0,0);
2955 return(0);
2956 }
2957 switch (uap->u_eventmask & EV_MASK) {
2958
2959 case 0:
2960 flag = 0;
2961 break;
2962
2963 case EV_RE:
2964 case EV_WR:
2965 case EV_RE|EV_WR:
2966 flag = EV_RWBYTES;
2967 break;
2968
2969 case EV_EX:
2970 flag = EV_OOB;
2971 break;
2972
2973 case EV_EX|EV_RE:
2974 case EV_EX|EV_WR:
2975 case EV_EX|EV_RE|EV_WR:
2976 flag = EV_OOB|EV_RWBYTES;
2977 break;
2978
2979 default:
2980#if SOCKETS
2981 if (fp->f_type == DTYPE_SOCKET)
2982 socket_unlock((struct socket *)fp->f_data, 1);
2983 else
2984#endif /* SOCKETS */
2985 PIPE_UNLOCK((struct pipe *)fp->f_data);
2986 fp_drop(p, erp->er_handle, fp, 0);
2987 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, EINVAL,0,0,0,0);
2988 return(EINVAL);
2989 }
2990 /*
2991 * since we're holding the socket/pipe lock, the event
2992 * cannot go from the unqueued state to the queued state
2993 * however, it can go from the queued state to the unqueued state
2994 * since that direction is protected by the proc_lock...
2995 * so do a quick check for EV_QUEUED w/o holding the proc lock
2996 * since by far the common case will be NOT EV_QUEUED, this saves
2997 * us taking the proc_lock the majority of the time
2998 */
2999 if (evq->ee_flags & EV_QUEUED) {
3000 /*
3001 * EVPROCDEQUE will recheck the state after it grabs the proc_lock
3002 */
3003 EVPROCDEQUE(p, evq);
3004 }
3005 /*
3006 * while the event is off the proc queue and
3007 * we're holding the socket/pipe lock
3008 * it's safe to update these fields...
3009 */
3010 evq->ee_req.er_eventbits = 0;
3011 evq->ee_eventmask = uap->u_eventmask & EV_MASK;
3012
3013#if SOCKETS
3014 if (fp->f_type == DTYPE_SOCKET) {
3015 postevent((struct socket *)fp->f_data, 0, flag);
3016 socket_unlock((struct socket *)fp->f_data, 1);
3017 } else
3018#endif /* SOCKETS */
3019 {
3020 postpipeevent((struct pipe *)fp->f_data, flag);
3021 PIPE_UNLOCK((struct pipe *)fp->f_data);
3022 }
3023 fp_drop(p, erp->er_handle, fp, 0);
3024 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, evq->ee_req.er_handle,evq->ee_eventmask,(uint32_t)fp->f_data,flag,0);
3025 return(0);
3026}
3027
3028/* this routine is called from the close of fd with proc_fdlock held */
3029int
3030waitevent_close(struct proc *p, struct fileproc *fp)
3031{
3032 struct eventqelt *evq;
3033
3034
3035 fp->f_flags &= ~FP_WAITEVENT;
3036
3037#if SOCKETS
3038 if (fp->f_type == DTYPE_SOCKET) {
3039 socket_lock((struct socket *)fp->f_data, 1);
3040 evq = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
3041 } else
3042#endif /* SOCKETS */
3043 if (fp->f_type == DTYPE_PIPE) {
3044 PIPE_LOCK((struct pipe *)fp->f_data);
3045 evq = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
3046 }
3047 else {
3048 return(EINVAL);
3049 }
3050 proc_fdunlock(p);
3051
3052
3053 // locate event if possible
3054 for ( ; evq != NULL; evq = evq->ee_slist.tqe_next) {
3055 if (evq->ee_proc == p)
3056 break;
3057 }
3058 if (evq == NULL) {
3059#if SOCKETS
3060 if (fp->f_type == DTYPE_SOCKET)
3061 socket_unlock((struct socket *)fp->f_data, 1);
3062 else
3063#endif /* SOCKETS */
3064 PIPE_UNLOCK((struct pipe *)fp->f_data);
3065
3066 proc_fdlock(p);
3067
3068 return(EINVAL);
3069 }
3070 EVPROCDEQUE(p, evq);
3071
3072#if SOCKETS
3073 if (fp->f_type == DTYPE_SOCKET) {
3074 TAILQ_REMOVE(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
3075 socket_unlock((struct socket *)fp->f_data, 1);
3076 } else
3077#endif /* SOCKETS */
3078 {
3079 TAILQ_REMOVE(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
3080 PIPE_UNLOCK((struct pipe *)fp->f_data);
3081 }
3082 FREE(evq, M_TEMP);
3083
3084 proc_fdlock(p);
3085
3086 return(0);
3087}
3088
3089
3090/*
3091 * gethostuuid
3092 *
3093 * Description: Get the host UUID from IOKit and return it to user space.
3094 *
3095 * Parameters: uuid_buf Pointer to buffer to receive UUID
 *		timeout				Timespec for timeout
3097 * spi SPI, skip sandbox check (temporary)
3098 *
3099 * Returns: 0 Success
3100 * EWOULDBLOCK Timeout is too short
3101 * copyout:EFAULT Bad user buffer
3102 * mac_system_check_info:EPERM Client not allowed to perform this operation
3103 *
3104 * Notes: A timeout seems redundant, since if it's tolerable to not
3105 * have a system UUID in hand, then why ask for one?
3106 */
3107int
3108gethostuuid(struct proc *p, struct gethostuuid_args *uap, __unused int32_t *retval)
3109{
3110 kern_return_t kret;
3111 int error;
3112 mach_timespec_t mach_ts; /* for IOKit call */
3113 __darwin_uuid_t uuid_kern = {}; /* for IOKit call */
3114
3115 if (!uap->spi) {
3116#if CONFIG_EMBEDDED
3117#if CONFIG_MACF
3118 if ((error = mac_system_check_info(kauth_cred_get(), "hw.uuid")) != 0) {
3119 /* EPERM invokes userspace upcall if present */
3120 return (error);
3121 }
3122#endif
3123#endif
3124 }
3125
3126 /* Convert the 32/64 bit timespec into a mach_timespec_t */
3127 if ( proc_is64bit(p) ) {
3128 struct user64_timespec ts;
3129 error = copyin(uap->timeoutp, &ts, sizeof(ts));
3130 if (error)
3131 return (error);
3132 mach_ts.tv_sec = ts.tv_sec;
3133 mach_ts.tv_nsec = ts.tv_nsec;
3134 } else {
3135 struct user32_timespec ts;
3136 error = copyin(uap->timeoutp, &ts, sizeof(ts) );
3137 if (error)
3138 return (error);
3139 mach_ts.tv_sec = ts.tv_sec;
3140 mach_ts.tv_nsec = ts.tv_nsec;
3141 }
3142
3143 /* Call IOKit with the stack buffer to get the UUID */
3144 kret = IOBSDGetPlatformUUID(uuid_kern, mach_ts);
3145
3146 /*
3147 * If we get it, copy out the data to the user buffer; note that a
3148 * uuid_t is an array of characters, so this is size invariant for
3149 * 32 vs. 64 bit.
3150 */
3151 if (kret == KERN_SUCCESS) {
3152 error = copyout(uuid_kern, uap->uuid_buf, sizeof(uuid_kern));
3153 } else {
3154 error = EWOULDBLOCK;
3155 }
3156
3157 return (error);
3158}
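/*
 * Illustrative user-level sketch (the wrapper is declared in <unistd.h> as
 * int gethostuuid(uuid_t, const struct timespec *) on platforms that expose
 * it; shown only to clarify the timeout argument handled above):
 *
 *	uuid_t uuid;
 *	struct timespec timeout = { 5, 0 };	// give IOKit up to 5 seconds
 *
 *	if (gethostuuid(uuid, &timeout) == 0) {
 *		uuid_string_t str;
 *		uuid_unparse(uuid, str);
 *		printf("host UUID: %s\n", str);
 *	}
 */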
3159
3160/*
3161 * ledger
3162 *
3163 * Description: Omnibus system call for ledger operations
3164 */
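/*
 * Argument conventions, as implemented below (summarized here for clarity;
 * this is a private interface):
 *
 *	LEDGER_INFO:		arg1 = pid, arg2 = user buffer for struct ledger_info
 *	LEDGER_ENTRY_INFO:	arg1 = pid, arg2 = entry buffer, arg3 = in/out count
 *	LEDGER_TEMPLATE_INFO:	arg1 = template buffer, arg2 = in/out count
 *	LEDGER_LIMIT:		arg1 = pid, arg2 = limit args; LEDGER_DEBUG builds only
 */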
3165int
3166ledger(struct proc *p, struct ledger_args *args, __unused int32_t *retval)
3167{
3168#if !CONFIG_MACF
3169#pragma unused(p)
3170#endif
3171 int rval, pid, len, error;
3172#ifdef LEDGER_DEBUG
3173 struct ledger_limit_args lla;
3174#endif
3175 task_t task;
3176 proc_t proc;
3177
3178 /* Finish copying in the necessary args before taking the proc lock */
3179 error = 0;
3180 len = 0;
3181 if (args->cmd == LEDGER_ENTRY_INFO)
3182 error = copyin(args->arg3, (char *)&len, sizeof (len));
3183 else if (args->cmd == LEDGER_TEMPLATE_INFO)
3184 error = copyin(args->arg2, (char *)&len, sizeof (len));
3185 else if (args->cmd == LEDGER_LIMIT)
3186#ifdef LEDGER_DEBUG
3187 error = copyin(args->arg2, (char *)&lla, sizeof (lla));
3188#else
3189 return (EINVAL);
3190#endif
3191 else if ((args->cmd < 0) || (args->cmd > LEDGER_MAX_CMD))
3192 return (EINVAL);
3193
3194 if (error)
3195 return (error);
3196 if (len < 0)
3197 return (EINVAL);
3198
3199 rval = 0;
3200 if (args->cmd != LEDGER_TEMPLATE_INFO) {
3201 pid = args->arg1;
3202 proc = proc_find(pid);
3203 if (proc == NULL)
3204 return (ESRCH);
3205
3206#if CONFIG_MACF
3207 error = mac_proc_check_ledger(p, proc, args->cmd);
3208 if (error) {
3209 proc_rele(proc);
3210 return (error);
3211 }
3212#endif
3213
3214 task = proc->task;
3215 }
3216
3217 switch (args->cmd) {
3218#ifdef LEDGER_DEBUG
3219 case LEDGER_LIMIT: {
		if (!kauth_cred_issuser(kauth_cred_get()))
			rval = EPERM;
		else
			rval = ledger_limit(task, &lla);
3223 proc_rele(proc);
3224 break;
3225 }
3226#endif
3227 case LEDGER_INFO: {
3228 struct ledger_info info = {};
3229
3230 rval = ledger_info(task, &info);
3231 proc_rele(proc);
3232 if (rval == 0)
3233 rval = copyout(&info, args->arg2,
3234 sizeof (info));
3235 break;
3236 }
3237
3238 case LEDGER_ENTRY_INFO: {
3239 void *buf;
3240 int sz;
3241
3242 rval = ledger_get_task_entry_info_multiple(task, &buf, &len);
3243 proc_rele(proc);
3244 if ((rval == 0) && (len >= 0)) {
3245 sz = len * sizeof (struct ledger_entry_info);
3246 rval = copyout(buf, args->arg2, sz);
3247 kfree(buf, sz);
3248 }
3249 if (rval == 0)
3250 rval = copyout(&len, args->arg3, sizeof (len));
3251 break;
3252 }
3253
3254 case LEDGER_TEMPLATE_INFO: {
3255 void *buf;
3256 int sz;
3257
3258 rval = ledger_template_info(&buf, &len);
3259 if ((rval == 0) && (len >= 0)) {
3260 sz = len * sizeof (struct ledger_template_info);
3261 rval = copyout(buf, args->arg1, sz);
3262 kfree(buf, sz);
3263 }
3264 if (rval == 0)
3265 rval = copyout(&len, args->arg2, sizeof (len));
3266 break;
3267 }
3268
3269 default:
3270 panic("ledger syscall logic error -- command type %d", args->cmd);
3271 proc_rele(proc);
3272 rval = EINVAL;
3273 }
3274
3275 return (rval);
3276}
3277
3278int
3279telemetry(__unused struct proc *p, struct telemetry_args *args, __unused int32_t *retval)
3280{
3281 int error = 0;
3282
3283 switch (args->cmd) {
3284#if CONFIG_TELEMETRY
3285 case TELEMETRY_CMD_TIMER_EVENT:
3286 error = telemetry_timer_event(args->deadline, args->interval, args->leeway);
3287 break;
3288 case TELEMETRY_CMD_PMI_SETUP:
3289 error = telemetry_pmi_setup((enum telemetry_pmi)args->deadline, args->interval);
3290 break;
3291#endif /* CONFIG_TELEMETRY */
3292 case TELEMETRY_CMD_VOUCHER_NAME:
3293 if (thread_set_voucher_name((mach_port_name_t)args->deadline))
3294 error = EINVAL;
3295 break;
3296
3297 default:
3298 error = EINVAL;
3299 break;
3300 }
3301
3302 return (error);
3303}
3304
3305#if DEVELOPMENT || DEBUG
3306#if CONFIG_WAITQ_DEBUG
3307static uint64_t g_wqset_num = 0;
3308struct g_wqset {
3309 queue_chain_t link;
3310 struct waitq_set *wqset;
3311};
3312
3313static queue_head_t g_wqset_list;
3314static struct waitq_set *g_waitq_set = NULL;
3315
3316static inline struct waitq_set *sysctl_get_wqset(int idx)
3317{
3318 struct g_wqset *gwqs;
3319
3320 if (!g_wqset_num)
3321 queue_init(&g_wqset_list);
3322
3323 /* don't bother with locks: this is test-only code! */
3324 qe_foreach_element(gwqs, &g_wqset_list, link) {
3325 if ((int)(wqset_id(gwqs->wqset) & 0xffffffff) == idx)
3326 return gwqs->wqset;
3327 }
3328
3329 /* allocate a new one */
3330 ++g_wqset_num;
3331 gwqs = (struct g_wqset *)kalloc(sizeof(*gwqs));
3332 assert(gwqs != NULL);
3333
3334 gwqs->wqset = waitq_set_alloc(SYNC_POLICY_FIFO|SYNC_POLICY_PREPOST, NULL);
3335 enqueue_tail(&g_wqset_list, &gwqs->link);
3336 printf("[WQ]: created new waitq set 0x%llx\n", wqset_id(gwqs->wqset));
3337
3338 return gwqs->wqset;
3339}
3340
3341#define MAX_GLOBAL_TEST_QUEUES 64
3342static int g_wq_init = 0;
3343static struct waitq g_wq[MAX_GLOBAL_TEST_QUEUES];
3344
3345static inline struct waitq *global_test_waitq(int idx)
3346{
3347 if (idx < 0)
3348 return NULL;
3349
3350 if (!g_wq_init) {
3351 g_wq_init = 1;
3352 for (int i = 0; i < MAX_GLOBAL_TEST_QUEUES; i++)
3353 waitq_init(&g_wq[i], SYNC_POLICY_FIFO);
3354 }
3355
3356 return &g_wq[idx % MAX_GLOBAL_TEST_QUEUES];
3357}
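/*
 * Encoding convention used by the test sysctl handlers below (derived from
 * their input parsing): writing a non-negative value N targets global test
 * waitq N (mod MAX_GLOBAL_TEST_QUEUES), while writing a negative value -N
 * selects an existing test waitq set whose ID's low 32 bits equal N, or
 * allocates a new one.  For example, from the shell on a DEVELOPMENT kernel:
 *
 *	sysctl -w kern.waitq_wait=3		# current thread waits on test waitq 3
 *	sysctl -w kern.waitq_wakeup_one=3	# wake one waiter on test waitq 3
 */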
3358
3359static int sysctl_waitq_wakeup_one SYSCTL_HANDLER_ARGS
3360{
3361#pragma unused(oidp, arg1, arg2)
3362 int error;
3363 int index;
3364 struct waitq *waitq;
3365 kern_return_t kr;
3366 int64_t event64 = 0;
3367
3368 error = SYSCTL_IN(req, &event64, sizeof(event64));
3369 if (error)
3370 return error;
3371
3372 if (!req->newptr)
3373 return SYSCTL_OUT(req, &event64, sizeof(event64));
3374
3375 if (event64 < 0) {
3376 index = (int)((-event64) & 0xffffffff);
3377 waitq = wqset_waitq(sysctl_get_wqset(index));
3378 index = -index;
3379 } else {
3380 index = (int)event64;
3381 waitq = global_test_waitq(index);
3382 }
3383
3384 event64 = 0;
3385
3386 printf("[WQ]: Waking one thread on waitq [%d] event:0x%llx\n",
3387 index, event64);
3388 kr = waitq_wakeup64_one(waitq, (event64_t)event64, THREAD_AWAKENED,
3389 WAITQ_ALL_PRIORITIES);
3390 printf("[WQ]: \tkr=%d\n", kr);
3391
3392 return SYSCTL_OUT(req, &kr, sizeof(kr));
3393}
3394SYSCTL_PROC(_kern, OID_AUTO, waitq_wakeup_one, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3395 0, 0, sysctl_waitq_wakeup_one, "Q", "wakeup one thread waiting on given event");
3396
3397
3398static int sysctl_waitq_wakeup_all SYSCTL_HANDLER_ARGS
3399{
3400#pragma unused(oidp, arg1, arg2)
3401 int error;
3402 int index;
3403 struct waitq *waitq;
3404 kern_return_t kr;
3405 int64_t event64 = 0;
3406
3407 error = SYSCTL_IN(req, &event64, sizeof(event64));
3408 if (error)
3409 return error;
3410
3411 if (!req->newptr)
3412 return SYSCTL_OUT(req, &event64, sizeof(event64));
3413
3414 if (event64 < 0) {
3415 index = (int)((-event64) & 0xffffffff);
3416 waitq = wqset_waitq(sysctl_get_wqset(index));
3417 index = -index;
3418 } else {
3419 index = (int)event64;
3420 waitq = global_test_waitq(index);
3421 }
3422
3423 event64 = 0;
3424
3425 printf("[WQ]: Waking all threads on waitq [%d] event:0x%llx\n",
3426 index, event64);
3427 kr = waitq_wakeup64_all(waitq, (event64_t)event64,
3428 THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
3429 printf("[WQ]: \tkr=%d\n", kr);
3430
3431 return SYSCTL_OUT(req, &kr, sizeof(kr));
3432}
3433SYSCTL_PROC(_kern, OID_AUTO, waitq_wakeup_all, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3434 0, 0, sysctl_waitq_wakeup_all, "Q", "wakeup all threads waiting on given event");
3435
3436
3437static int sysctl_waitq_wait SYSCTL_HANDLER_ARGS
3438{
3439#pragma unused(oidp, arg1, arg2)
3440 int error;
3441 int index;
3442 struct waitq *waitq;
3443 kern_return_t kr;
3444 int64_t event64 = 0;
3445
3446 error = SYSCTL_IN(req, &event64, sizeof(event64));
3447 if (error)
3448 return error;
3449
3450 if (!req->newptr)
3451 return SYSCTL_OUT(req, &event64, sizeof(event64));
3452
3453 if (event64 < 0) {
3454 index = (int)((-event64) & 0xffffffff);
3455 waitq = wqset_waitq(sysctl_get_wqset(index));
3456 index = -index;
3457 } else {
3458 index = (int)event64;
3459 waitq = global_test_waitq(index);
3460 }
3461
3462 event64 = 0;
3463
3464 printf("[WQ]: Current thread waiting on waitq [%d] event:0x%llx\n",
3465 index, event64);
3466 kr = waitq_assert_wait64(waitq, (event64_t)event64, THREAD_INTERRUPTIBLE, 0);
3467 if (kr == THREAD_WAITING)
3468 thread_block(THREAD_CONTINUE_NULL);
3469 printf("[WQ]: \tWoke Up: kr=%d\n", kr);
3470
3471 return SYSCTL_OUT(req, &kr, sizeof(kr));
3472}
3473SYSCTL_PROC(_kern, OID_AUTO, waitq_wait, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3474 0, 0, sysctl_waitq_wait, "Q", "start waiting on given event");
3475
3476
3477static int sysctl_wqset_select SYSCTL_HANDLER_ARGS
3478{
3479#pragma unused(oidp, arg1, arg2)
3480 int error;
3481 struct waitq_set *wqset;
3482 uint64_t event64 = 0;
3483
3484 error = SYSCTL_IN(req, &event64, sizeof(event64));
3485 if (error)
3486 return error;
3487
3488 if (!req->newptr)
3489 goto out;
3490
3491 wqset = sysctl_get_wqset((int)(event64 & 0xffffffff));
3492 g_waitq_set = wqset;
3493
3494 event64 = wqset_id(wqset);
3495 printf("[WQ]: selected wqset 0x%llx\n", event64);
3496
3497out:
3498 if (g_waitq_set)
3499 event64 = wqset_id(g_waitq_set);
3500 else
3501 event64 = (uint64_t)(-1);
3502
3503 return SYSCTL_OUT(req, &event64, sizeof(event64));
3504}
3505SYSCTL_PROC(_kern, OID_AUTO, wqset_select, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3506 0, 0, sysctl_wqset_select, "Q", "select/create a global waitq set");
3507
3508
3509static int sysctl_waitq_link SYSCTL_HANDLER_ARGS
3510{
3511#pragma unused(oidp, arg1, arg2)
3512 int error;
3513 int index;
3514 struct waitq *waitq;
3515 struct waitq_set *wqset;
	kern_return_t kr = KERN_SUCCESS;	/* reported via 'out:' if no link is attempted */
3517 uint64_t reserved_link = 0;
3518 int64_t event64 = 0;
3519
3520 error = SYSCTL_IN(req, &event64, sizeof(event64));
3521 if (error)
3522 return error;
3523
3524 if (!req->newptr)
3525 return SYSCTL_OUT(req, &event64, sizeof(event64));
3526
3527 if (!g_waitq_set)
3528 g_waitq_set = sysctl_get_wqset(1);
3529 wqset = g_waitq_set;
3530
3531 if (event64 < 0) {
3532 struct waitq_set *tmp;
3533 index = (int)((-event64) & 0xffffffff);
3534 tmp = sysctl_get_wqset(index);
3535 if (tmp == wqset)
3536 goto out;
3537 waitq = wqset_waitq(tmp);
3538 index = -index;
3539 } else {
3540 index = (int)event64;
3541 waitq = global_test_waitq(index);
3542 }
3543
3544 printf("[WQ]: linking waitq [%d] to global wqset (0x%llx)\n",
3545 index, wqset_id(wqset));
3546 reserved_link = waitq_link_reserve(waitq);
3547 kr = waitq_link(waitq, wqset, WAITQ_SHOULD_LOCK, &reserved_link);
3548 waitq_link_release(reserved_link);
3549
3550 printf("[WQ]: \tkr=%d\n", kr);
3551
3552out:
3553 return SYSCTL_OUT(req, &kr, sizeof(kr));
3554}
3555SYSCTL_PROC(_kern, OID_AUTO, waitq_link, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3556 0, 0, sysctl_waitq_link, "Q", "link global waitq to test waitq set");
3557
3558
3559static int sysctl_waitq_unlink SYSCTL_HANDLER_ARGS
3560{
3561#pragma unused(oidp, arg1, arg2)
3562 int error;
3563 int index;
3564 struct waitq *waitq;
3565 struct waitq_set *wqset;
3566 kern_return_t kr;
3567 uint64_t event64 = 0;
3568
3569 error = SYSCTL_IN(req, &event64, sizeof(event64));
3570 if (error)
3571 return error;
3572
3573 if (!req->newptr)
3574 return SYSCTL_OUT(req, &event64, sizeof(event64));
3575
3576 if (!g_waitq_set)
3577 g_waitq_set = sysctl_get_wqset(1);
3578 wqset = g_waitq_set;
3579
3580 index = (int)event64;
3581 waitq = global_test_waitq(index);
3582
3583 printf("[WQ]: unlinking waitq [%d] from global wqset (0x%llx)\n",
3584 index, wqset_id(wqset));
3585
3586 kr = waitq_unlink(waitq, wqset);
3587 printf("[WQ]: \tkr=%d\n", kr);
3588
3589 return SYSCTL_OUT(req, &kr, sizeof(kr));
3590}
3591SYSCTL_PROC(_kern, OID_AUTO, waitq_unlink, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3592 0, 0, sysctl_waitq_unlink, "Q", "unlink global waitq from test waitq set");
3593
3594
3595static int sysctl_waitq_clear_prepost SYSCTL_HANDLER_ARGS
3596{
3597#pragma unused(oidp, arg1, arg2)
3598 struct waitq *waitq;
3599 uint64_t event64 = 0;
3600 int error, index;
3601
3602 error = SYSCTL_IN(req, &event64, sizeof(event64));
3603 if (error)
3604 return error;
3605
3606 if (!req->newptr)
3607 return SYSCTL_OUT(req, &event64, sizeof(event64));
3608
3609 index = (int)event64;
3610 waitq = global_test_waitq(index);
3611
3612 printf("[WQ]: clearing prepost on waitq [%d]\n", index);
3613 waitq_clear_prepost(waitq);
3614
3615 return SYSCTL_OUT(req, &event64, sizeof(event64));
3616}
3617SYSCTL_PROC(_kern, OID_AUTO, waitq_clear_prepost, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3618 0, 0, sysctl_waitq_clear_prepost, "Q", "clear prepost on given waitq");
3619
3620
3621static int sysctl_wqset_unlink_all SYSCTL_HANDLER_ARGS
3622{
3623#pragma unused(oidp, arg1, arg2)
3624 int error;
3625 struct waitq_set *wqset;
3626 kern_return_t kr;
3627 uint64_t event64 = 0;
3628
3629 error = SYSCTL_IN(req, &event64, sizeof(event64));
3630 if (error)
3631 return error;
3632
3633 if (!req->newptr)
3634 return SYSCTL_OUT(req, &event64, sizeof(event64));
3635
3636 if (!g_waitq_set)
3637 g_waitq_set = sysctl_get_wqset(1);
3638 wqset = g_waitq_set;
3639
3640 printf("[WQ]: unlinking all queues from global wqset (0x%llx)\n",
3641 wqset_id(wqset));
3642
3643 kr = waitq_set_unlink_all(wqset);
3644 printf("[WQ]: \tkr=%d\n", kr);
3645
3646 return SYSCTL_OUT(req, &kr, sizeof(kr));
3647}
3648SYSCTL_PROC(_kern, OID_AUTO, wqset_unlink_all, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3649 0, 0, sysctl_wqset_unlink_all, "Q", "unlink all queues from test waitq set");
3650
3651
3652static int sysctl_wqset_clear_preposts SYSCTL_HANDLER_ARGS
3653{
3654#pragma unused(oidp, arg1, arg2)
3655 struct waitq_set *wqset = NULL;
3656 uint64_t event64 = 0;
3657 int error, index;
3658
3659 error = SYSCTL_IN(req, &event64, sizeof(event64));
3660 if (error)
3661 return error;
3662
3663 if (!req->newptr)
3664 goto out;
3665
3666 index = (int)((event64) & 0xffffffff);
3667 wqset = sysctl_get_wqset(index);
3668 assert(wqset != NULL);
3669
3670 printf("[WQ]: clearing preposts on wqset 0x%llx\n", wqset_id(wqset));
3671 waitq_set_clear_preposts(wqset);
3672
3673out:
3674 if (wqset)
3675 event64 = wqset_id(wqset);
3676 else
3677 event64 = (uint64_t)(-1);
3678
3679 return SYSCTL_OUT(req, &event64, sizeof(event64));
3680}
3681SYSCTL_PROC(_kern, OID_AUTO, wqset_clear_preposts, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3682 0, 0, sysctl_wqset_clear_preposts, "Q", "clear preposts on given waitq set");
3683
3684#endif /* CONFIG_WAITQ_DEBUG */
3685
3686static int
3687sysctl_waitq_set_nelem SYSCTL_HANDLER_ARGS
3688{
3689#pragma unused(oidp, arg1, arg2)
3690 int nelem;
3691
3692 /* Read only */
3693 if (req->newptr != USER_ADDR_NULL)
3694 return (EPERM);
3695
3696 nelem = sysctl_helper_waitq_set_nelem();
3697
3698 return SYSCTL_OUT(req, &nelem, sizeof(nelem));
3699}
3700
3701SYSCTL_PROC(_kern, OID_AUTO, n_ltable_entries, CTLFLAG_RD | CTLFLAG_LOCKED,
	    0, 0, sysctl_waitq_set_nelem, "I", "ltable elements currently in use");
3703
3704
3705#endif /* DEVELOPMENT || DEBUG */
3706
3707
3708