kern_aio.c source code [xnu/bsd/kern/kern_aio.c]

1	/*
2	* Copyright (c) 2003-2020 Apple Inc. All rights reserved.
3	*
4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. The rights granted to you under the License
10	* may not be used to create, or enable the creation or redistribution of,
11	* unlawful or unlicensed copies of an Apple operating system, or to
12	* circumvent, violate, or enable the circumvention or violation of, any
13	* terms of an Apple operating system software license agreement.
14	*
15	* Please obtain a copy of the License at
16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
17	*
18	* The Original Code and all software distributed under the License are
19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23	* Please see the License for the specific language governing rights and
24	* limitations under the License.
25	*
26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27	*/
28
29
30	/*
31	* todo:
32	* 1) ramesh is looking into how to replace taking a reference on
33	* the user's map (vm_map_reference()) since it is believed that
34	* would not hold the process for us.
35	* 2) david is looking into a way for us to set the priority of the
36	* worker threads to match that of the user's thread when the
37	* async IO was queued.
38	*/
39
40
41	/*
42	* This file contains support for the POSIX 1003.1B AIO/LIO facility.
43	*/
44
45	#include <sys/systm.h>
46	#include <sys/fcntl.h>
47	#include <sys/file_internal.h>
48	#include <sys/filedesc.h>
49	#include <sys/kernel.h>
50	#include <sys/vnode_internal.h>
51	#include <sys/kauth.h>
52	#include <sys/mount_internal.h>
53	#include <sys/param.h>
54	#include <sys/proc_internal.h>
55	#include <sys/sysctl.h>
56	#include <sys/unistd.h>
57	#include <sys/user.h>
58
59	#include <sys/aio_kern.h>
60	#include <sys/sysproto.h>
61
62	#include <machine/limits.h>
63
64	#include <mach/mach_types.h>
65	#include <kern/kern_types.h>
66	#include <kern/waitq.h>
67	#include <kern/zalloc.h>
68	#include <kern/task.h>
69	#include <kern/sched_prim.h>
70
71	#include <vm/vm_map.h>
72
73	#include <os/refcnt.h>
74
75	#include <sys/kdebug.h>
/*
 * kdebug sub-codes for the AIO subsystem.  Each value is combined with
 * DBG_BSD_AIO through BSDDBG_CODE() at the trace points in this file.
 */

/* work queue / completion events */
#define AIO_work_queued                 1
#define AIO_worker_wake                 2
#define AIO_completion_sig              3
#define AIO_completion_cleanup_wait     4
#define AIO_completion_cleanup_wake     5
#define AIO_completion_suspend_wake     6
#define AIO_fsync_delay                 7

/* aio_cancel() and the queue it found the request on */
#define AIO_cancel                      10
#define AIO_cancel_async_workq          11
#define AIO_cancel_sync_workq           12
#define AIO_cancel_activeq              13
#define AIO_cancel_doneq                14

/* request submission */
#define AIO_fsync                       20
#define AIO_read                        30
#define AIO_write                       40
#define AIO_listio                      50

/* aio_error() */
#define AIO_error                       60
#define AIO_error_val                   61
#define AIO_error_activeq               62
#define AIO_error_workq                 63

/* aio_return() */
#define AIO_return                      70
#define AIO_return_val                  71
#define AIO_return_activeq              72
#define AIO_return_workq                73

/* process lifecycle and descriptor close */
#define AIO_exec                        80
#define AIO_exit                        90
#define AIO_exit_sleep                  91
#define AIO_close                       100
#define AIO_close_sleep                 101

/* aio_suspend() and worker threads */
#define AIO_suspend                     110
#define AIO_suspend_sleep               111
#define AIO_worker_thread               120
108
109	__options_decl(aio_entry_flags_t, uint32_t, {
110	AIO_READ = `0x00000001`, / a read /
111	AIO_WRITE = `0x00000002`, / a write /
112	AIO_FSYNC = `0x00000004`, / aio_fsync with op = O_SYNC /
113	AIO_DSYNC = `0x00000008`, / aio_fsync with op = O_DSYNC (not supported yet) /
114	AIO_LIO = `0x00000010`, / lio_listio generated IO /
115	AIO_LIO_WAIT = `0x00000020`, / lio_listio is waiting on the leader /
116
117	/*
118	* These flags mean that this entry is blocking either:
119	* - close (AIO_CLOSE_WAIT)
120	* - exit or exec (AIO_EXIT_WAIT)
121	*
122	* These flags are mutually exclusive, and the AIO_EXIT_WAIT variant
123	* will also neuter notifications in do_aio_completion_and_unlock().
124	*/
125	AIO_CLOSE_WAIT = `0x00004000`,
126	AIO_EXIT_WAIT = `0x00008000`,
127	});
128
129	/! @struct aio_workq_entry*
130	*
131	* @discussion
132	* This represents a piece of aio/lio work.
133	*
134	* The ownership rules go as follows:
135	*
136	* - the "proc" owns one refcount on the entry (from creation), while it is
137	* enqueued on the aio_activeq and then the aio_doneq.
138	*
139	* either aio_return() (user read the status) or _aio_exit() (the process
140	* died) will dequeue the entry and consume this ref.
141	*
142	* - the async workqueue owns one refcount once the work is submitted,
143	* which is consumed in do_aio_completion_and_unlock().
144	*
145	* This ref protects the entry for the the end of
146	* do_aio_completion_and_unlock() (when signal delivery happens).
147	*
148	* - lio_listio() for batches picks one of the entries to be the "leader"
149	* of the batch. Each work item will have a refcount on its leader
150	* so that the accounting of the batch completion can be done on the leader
151	* (to be able to decrement lio_pending).
152	*
153	* This ref is consumed in do_aio_completion_and_unlock() as well.
154	*
155	* - lastly, in lio_listio() when the LIO_WAIT behavior is requested,
156	* an extra ref is taken in this syscall as it needs to keep accessing
157	* the leader "lio_pending" field until it hits 0.
158	*/
159	struct aio_workq_entry {
160	/ queue lock /
161	TAILQ_ENTRY(aio_workq_entry) aio_workq_link;
162
163	/ Proc lock /
164	TAILQ_ENTRY(aio_workq_entry) aio_proc_link; / p_aio_activeq or p_aio_doneq /
165	user_ssize_t returnval; / return value from read / write request /
166	errno_t errorval; / error value from read / write request /
167	os_refcnt_t aio_refcount;
168	aio_entry_flags_t flags;
169
170	int lio_pending; / pending I/Os in lio group, only on leader /
171	struct aio_workq_entry lio_leader; /* pointer to the lio leader, can be self /
172
173	/ Initialized and never changed, safe to access /
174	struct proc procp; /* user proc that queued this request /
175	user_addr_t uaiocbp; / pointer passed in from user land /
176	struct user_aiocb aiocb; / copy of aiocb from user land /
177	struct vfs_context context; / context which enqueued the request /
178
179	/ Initialized, and possibly freed by aio_work_thread() or at free if cancelled /
180	vm_map_t aio_map; / user land map we have a reference to /
181	};
182
183	/*
184	* aio requests queue up on the aio_async_workq or lio_sync_workq (for
185	* lio_listio LIO_WAIT). Requests then move to the per process aio_activeq
186	* (proc.aio_activeq) when one of our worker threads start the IO.
187	* And finally, requests move to the per process aio_doneq (proc.aio_doneq)
188	* when the IO request completes. The request remains on aio_doneq until
189	* user process calls aio_return or the process exits, either way that is our
190	* trigger to release aio resources.
191	*/
192	typedef struct aio_workq {
193	TAILQ_HEAD(, aio_workq_entry) aioq_entries;
194	lck_spin_t aioq_lock;
195	struct waitq aioq_waitq;
196	} *aio_workq_t;
197
198	#define AIO_NUM_WORK_QUEUES 1
199	struct aio_anchor_cb {
200	os_atomic(int) aio_total_count; / total extant entries /
201
202	/ Hash table of queues here /
203	int aio_num_workqs;
204	struct aio_workq aio_async_workqs[AIO_NUM_WORK_QUEUES];
205	};
206	typedef struct aio_anchor_cb aio_anchor_cb;
207
/*
 * Notes on aio sleep / wake channels.
 * We currently pick a couple fields within the proc structure that will allow
 * us sleep channels that currently do not collide with any other kernel routines.
 * At this time, for binary compatibility reasons, we cannot create new proc fields.
 */
#define AIO_SUSPEND_SLEEP_CHAN  p_aio_activeq
#define AIO_CLEANUP_SLEEP_CHAN  p_aio_total_count

/*
 * Panic if an entry found on a proc's AIO list was not queued by that proc.
 *
 * Wrapped in do { } while (0) so the macro expands to a single statement
 * and cannot capture a following "else" when used inside an unbraced if.
 */
#define ASSERT_AIO_FROM_PROC(aiop, theproc)     \
	do { \
	        if ((aiop)->procp != (theproc)) { \
	                panic("AIO on a proc list that does not belong to that proc."); \
	        } \
	} while (0)
221
222	/*
223	* LOCAL PROTOTYPES
224	*/
225	static void aio_proc_lock(proc_t procp);
226	static void aio_proc_lock_spin(proc_t procp);
227	static void aio_proc_unlock(proc_t procp);
228	static lck_mtx_t *aio_proc_mutex(proc_t procp);
229	static bool aio_has_active_requests_for_process(proc_t procp);
230	static bool aio_proc_has_active_requests_for_file(proc_t procp, int fd);
231	static boolean_t is_already_queued(proc_t procp, user_addr_t aiocbp);
232
233	static aio_workq_t aio_entry_workq(aio_workq_entry *entryp);
234	static void aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
235	static void aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
236	static void aio_entry_ref(aio_workq_entry *entryp);
237	static void aio_entry_unref(aio_workq_entry *entryp);
238	static bool aio_entry_try_workq_remove(aio_workq_entry *entryp);
239	static boolean_t aio_delay_fsync_request(aio_workq_entry *entryp);
240	static void aio_free_request(aio_workq_entry *entryp);
241
242	static void aio_workq_init(aio_workq_t wq);
243	static void aio_workq_lock_spin(aio_workq_t wq);
244	static void aio_workq_unlock(aio_workq_t wq);
245	static lck_spin_t *aio_workq_lock(aio_workq_t wq);
246
247	static void aio_work_thread(void *arg, wait_result_t wr);
248	static aio_workq_entry aio_get_some_work(void*);
249
250	static int aio_queue_async_request(proc_t procp, user_addr_t aiocbp, aio_entry_flags_t);
251	static int aio_validate(proc_t, aio_workq_entry *entryp);
252
253	static int do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp, aio_entry_flags_t);
254	static void do_aio_completion_and_unlock(proc_t p, aio_workq_entry *entryp);
255	static int do_aio_fsync(aio_workq_entry *entryp);
256	static int do_aio_read(aio_workq_entry *entryp);
257	static int do_aio_write(aio_workq_entry *entryp);
258	static void do_munge_aiocb_user32_to_user(struct user32_aiocb my_aiocbp, struct* user_aiocb *the_user_aiocbp);
259	static void do_munge_aiocb_user64_to_user(struct user64_aiocb my_aiocbp, struct* user_aiocb *the_user_aiocbp);
260	static aio_workq_entry *aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, aio_entry_flags_t);
261	static int aio_copy_in_list(proc_t, user_addr_t, user_addr_t , int*);
262
263	#define ASSERT_AIO_PROC_LOCK_OWNED(p) LCK_MTX_ASSERT(aio_proc_mutex(p), LCK_MTX_ASSERT_OWNED)
264	#define ASSERT_AIO_WORKQ_LOCK_OWNED(q) LCK_SPIN_ASSERT(aio_workq_lock(q), LCK_ASSERT_OWNED)
265
266	/*
267	* EXTERNAL PROTOTYPES
268	*/
269
270	/ in ...bsd/kern/sys_generic.c /
271	extern int dofileread(vfs_context_t ctx, struct fileproc *fp,
272	user_addr_t bufp, user_size_t nbyte,
273	off_t offset, int flags, user_ssize_t *retval);
274	extern int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
275	user_addr_t bufp, user_size_t nbyte, off_t offset,
276	int flags, user_ssize_t *retval);
277
278	/*
279	* aio external global variables.
280	*/
281	extern int aio_max_requests; / AIO_MAX - configurable /
282	extern int aio_max_requests_per_process; / AIO_PROCESS_MAX - configurable /
283	extern int aio_worker_threads; / AIO_THREAD_COUNT - configurable /
284
285
286	/*
287	* aio static variables.
288	*/
289	static aio_anchor_cb aio_anchor = {
290	.aio_num_workqs = AIO_NUM_WORK_QUEUES,
291	};
292	os_refgrp_decl(static, aio_refgrp, "aio", NULL);
293	static LCK_GRP_DECLARE(aio_proc_lock_grp, "aio_proc");
294	static LCK_GRP_DECLARE(aio_queue_lock_grp, "aio_queue");
295	static LCK_MTX_DECLARE(aio_proc_mtx, &aio_proc_lock_grp);
296
297	static KALLOC_TYPE_DEFINE(aio_workq_zonep, aio_workq_entry, KT_DEFAULT);
298
299	/ Hash /
300	static aio_workq_t
301	aio_entry_workq(__unused aio_workq_entry *entryp)
302	{
303	return &aio_anchor.aio_async_workqs[`0`];
304	}
305
306	static void
307	aio_workq_init(aio_workq_t wq)
308	{
309	TAILQ_INIT(&wq->aioq_entries);
310	lck_spin_init(lck: &wq->aioq_lock, grp: &aio_queue_lock_grp, LCK_ATTR_NULL);
311	waitq_init(waitq: &wq->aioq_waitq, type: WQT_QUEUE, SYNC_POLICY_FIFO);
312	}
313
314
315	/*
316	* Can be passed a queue which is locked spin.
317	*/
318	static void
319	aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
320	{
321	ASSERT_AIO_WORKQ_LOCK_OWNED(queue);
322
323	if (entryp->aio_workq_link.tqe_prev == NULL) {
324	panic("Trying to remove an entry from a work queue, but it is not on a queue");
325	}
326
327	TAILQ_REMOVE(&queue->aioq_entries, entryp, aio_workq_link);
328	entryp->aio_workq_link.tqe_prev = NULL; / Not on a workq /
329	}
330
331	static void
332	aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
333	{
334	ASSERT_AIO_WORKQ_LOCK_OWNED(queue);
335
336	TAILQ_INSERT_TAIL(&queue->aioq_entries, entryp, aio_workq_link);
337	}
338
339	static void
340	aio_proc_lock(proc_t procp)
341	{
342	lck_mtx_lock(lck: aio_proc_mutex(procp));
343	}
344
345	static void
346	aio_proc_lock_spin(proc_t procp)
347	{
348	lck_mtx_lock_spin(lck: aio_proc_mutex(procp));
349	}
350
351	static bool
352	aio_has_any_work(void)
353	{
354	return os_atomic_load(&aio_anchor.aio_total_count, relaxed) != `0`;
355	}
356
357	static bool
358	aio_try_proc_insert_active_locked(proc_t procp, aio_workq_entry *entryp)
359	{
360	int old, new;
361
362	ASSERT_AIO_PROC_LOCK_OWNED(procp);
363
364	if (procp->p_aio_total_count >= aio_max_requests_per_process) {
365	return false;
366	}
367
368	if (is_already_queued(procp, aiocbp: entryp->uaiocbp)) {
369	return false;
370	}
371
372	os_atomic_rmw_loop(&aio_anchor.aio_total_count, old, new, relaxed, {
373	if (old >= aio_max_requests) {
374	os_atomic_rmw_loop_give_up(return false);
375	}
376	new = old + `1`;
377	});
378
379	TAILQ_INSERT_TAIL(&procp->p_aio_activeq, entryp, aio_proc_link);
380	procp->p_aio_total_count++;
381	return true;
382	}
383
384	static void
385	aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp)
386	{
387	TAILQ_REMOVE(&procp->p_aio_activeq, entryp, aio_proc_link);
388	TAILQ_INSERT_TAIL(&procp->p_aio_doneq, entryp, aio_proc_link);
389	}
390
391	static void
392	aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp)
393	{
394	TAILQ_REMOVE(&procp->p_aio_doneq, entryp, aio_proc_link);
395	entryp->aio_proc_link.tqe_prev = NULL;
396	if (os_atomic_dec_orig(&aio_anchor.aio_total_count, relaxed) <= `0`) {
397	panic("Negative total AIO count!");
398	}
399	if (procp->p_aio_total_count-- <= `0`) {
400	panic("proc %p: p_aio_total_count accounting mismatch", procp);
401	}
402	}
403
404	static void
405	aio_proc_unlock(proc_t procp)
406	{
407	lck_mtx_unlock(lck: aio_proc_mutex(procp));
408	}
409
410	static lck_mtx_t*
411	aio_proc_mutex(proc_t procp)
412	{
413	return &procp->p_mlock;
414	}
415
416	static void
417	aio_entry_ref(aio_workq_entry *entryp)
418	{
419	os_ref_retain(rc: &entryp->aio_refcount);
420	}
421
422	static void
423	aio_entry_unref(aio_workq_entry *entryp)
424	{
425	if (os_ref_release(rc: &entryp->aio_refcount) == `0`) {
426	aio_free_request(entryp);
427	}
428	}
429
430	static bool
431	aio_entry_try_workq_remove(aio_workq_entry *entryp)
432	{
433	/ Can only be cancelled if it's still on a work queue /
434	if (entryp->aio_workq_link.tqe_prev != NULL) {
435	aio_workq_t queue;
436
437	/ Will have to check again under the lock /
438	queue = aio_entry_workq(entryp);
439	aio_workq_lock_spin(wq: queue);
440	if (entryp->aio_workq_link.tqe_prev != NULL) {
441	aio_workq_remove_entry_locked(queue, entryp);
442	aio_workq_unlock(wq: queue);
443	return true;
444	} else {
445	aio_workq_unlock(wq: queue);
446	}
447	}
448
449	return false;
450	}
451
452	static void
453	aio_workq_lock_spin(aio_workq_t wq)
454	{
455	lck_spin_lock(lck: aio_workq_lock(wq));
456	}
457
458	static void
459	aio_workq_unlock(aio_workq_t wq)
460	{
461	lck_spin_unlock(lck: aio_workq_lock(wq));
462	}
463
464	static lck_spin_t*
465	aio_workq_lock(aio_workq_t wq)
466	{
467	return &wq->aioq_lock;
468	}
469
470	/*
471	* aio_cancel - attempt to cancel one or more async IO requests currently
472	* outstanding against file descriptor uap->fd. If uap->aiocbp is not
473	* NULL then only one specific IO is cancelled (if possible). If uap->aiocbp
474	* is NULL then all outstanding async IO request for the given file
475	* descriptor are cancelled (if possible).
476	*/
477	int
478	aio_cancel(proc_t p, struct aio_cancel_args uap, int* *retval)
479	{
480	struct user_aiocb my_aiocb;
481	int result;
482
483	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel) \| DBG_FUNC_START,
484	VM_KERNEL_ADDRPERM(p), uap->aiocbp, `0`, `0`, `0`);
485
486	/ quick check to see if there are any async IO requests queued up /
487	if (!aio_has_any_work()) {
488	result = `0`;
489	*retval = AIO_ALLDONE;
490	goto ExitRoutine;
491	}
492
493	*retval = -`1`;
494	if (uap->aiocbp != USER_ADDR_NULL) {
495	if (proc_is64bit(p)) {
496	struct user64_aiocb aiocb64;
497
498	result = copyin(uap->aiocbp, &aiocb64, sizeof(aiocb64));
499	if (result == `0`) {
500	do_munge_aiocb_user64_to_user(my_aiocbp: &aiocb64, the_user_aiocbp: &my_aiocb);
501	}
502	} else {
503	struct user32_aiocb aiocb32;
504
505	result = copyin(uap->aiocbp, &aiocb32, sizeof(aiocb32));
506	if (result == `0`) {
507	do_munge_aiocb_user32_to_user(my_aiocbp: &aiocb32, the_user_aiocbp: &my_aiocb);
508	}
509	}
510
511	if (result != `0`) {
512	result = EAGAIN;
513	goto ExitRoutine;
514	}
515
516	/ NOTE - POSIX standard says a mismatch between the file /
517	/ descriptor passed in and the file descriptor embedded in /
518	/ the aiocb causes unspecified results. We return EBADF in /
519	/ that situation. /
520	if (uap->fd != my_aiocb.aio_fildes) {
521	result = EBADF;
522	goto ExitRoutine;
523	}
524	}
525
526	aio_proc_lock(procp: p);
527	result = do_aio_cancel_locked(p, fd: uap->fd, aiocbp: uap->aiocbp, `0`);
528	ASSERT_AIO_PROC_LOCK_OWNED(p);
529	aio_proc_unlock(procp: p);
530
531	if (result != -`1`) {
532	*retval = result;
533	result = `0`;
534	goto ExitRoutine;
535	}
536
537	result = EBADF;
538
539	ExitRoutine:
540	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel) \| DBG_FUNC_END,
541	VM_KERNEL_ADDRPERM(p), uap->aiocbp, result, `0`, `0`);
542
543	return result;
544	}
545
546
547	/*
548	* _aio_close - internal function used to clean up async IO requests for
549	* a file descriptor that is closing.
550	* THIS MAY BLOCK.
551	*/
552	__private_extern__ void
553	_aio_close(proc_t p, int fd)
554	{
555	int error;
556
557	/ quick check to see if there are any async IO requests queued up /
558	if (!aio_has_any_work()) {
559	return;
560	}
561
562	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_close) \| DBG_FUNC_START,
563	VM_KERNEL_ADDRPERM(p), fd, `0`, `0`, `0`);
564
565	/ cancel all async IO requests on our todo queues for this file descriptor /
566	aio_proc_lock(procp: p);
567	error = do_aio_cancel_locked(p, fd, USER_ADDR_NULL, AIO_CLOSE_WAIT);
568	ASSERT_AIO_PROC_LOCK_OWNED(p);
569	if (error == AIO_NOTCANCELED) {
570	/*
571	* AIO_NOTCANCELED is returned when we find an aio request for this process
572	* and file descriptor on the active async IO queue. Active requests cannot
573	* be cancelled so we must wait for them to complete. We will get a special
574	* wake up call on our channel used to sleep for ALL active requests to
575	* complete. This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
576	* when we must wait for all active aio requests.
577	*/
578
579	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep) \| DBG_FUNC_NONE,
580	VM_KERNEL_ADDRPERM(p), fd, `0`, `0`, `0`);
581
582	while (aio_proc_has_active_requests_for_file(procp: p, fd)) {
583	msleep(chan: &p->AIO_CLEANUP_SLEEP_CHAN, mtx: aio_proc_mutex(procp: p), PRIBIO, wmesg: "aio_close", ts: `0`);
584	}
585	}
586
587	aio_proc_unlock(procp: p);
588
589	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_close) \| DBG_FUNC_END,
590	VM_KERNEL_ADDRPERM(p), fd, `0`, `0`, `0`);
591	}
592
593
594	/*
595	* aio_error - return the error status associated with the async IO
596	* request referred to by uap->aiocbp. The error status is the errno
597	* value that would be set by the corresponding IO request (read, wrtie,
598	* fdatasync, or sync).
599	*/
600	int
601	aio_error(proc_t p, struct aio_error_args uap, int* *retval)
602	{
603	aio_workq_entry *entryp;
604	int error;
605
606	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_error) \| DBG_FUNC_START,
607	VM_KERNEL_ADDRPERM(p), uap->aiocbp, `0`, `0`, `0`);
608
609	/ see if there are any aios to check /
610	if (!aio_has_any_work()) {
611	return EINVAL;
612	}
613
614	aio_proc_lock(procp: p);
615
616	/ look for a match on our queue of async IO requests that have completed /
617	TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
618	if (entryp->uaiocbp == uap->aiocbp) {
619	ASSERT_AIO_FROM_PROC(entryp, p);
620
621	*retval = entryp->errorval;
622	error = `0`;
623
624	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val) \| DBG_FUNC_NONE,
625	VM_KERNEL_ADDRPERM(p), uap->aiocbp, *retval, `0`, `0`);
626	goto ExitRoutine;
627	}
628	}
629
630	/ look for a match on our queue of active async IO requests /
631	TAILQ_FOREACH(entryp, &p->p_aio_activeq, aio_proc_link) {
632	if (entryp->uaiocbp == uap->aiocbp) {
633	ASSERT_AIO_FROM_PROC(entryp, p);
634	*retval = EINPROGRESS;
635	error = `0`;
636	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq) \| DBG_FUNC_NONE,
637	VM_KERNEL_ADDRPERM(p), uap->aiocbp, *retval, `0`, `0`);
638	goto ExitRoutine;
639	}
640	}
641
642	error = EINVAL;
643
644	ExitRoutine:
645	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_error) \| DBG_FUNC_END,
646	VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, `0`, `0`);
647	aio_proc_unlock(procp: p);
648
649	return error;
650	}
651
652
653	/*
654	* aio_fsync - asynchronously force all IO operations associated
655	* with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
656	* queued at the time of the call to the synchronized completion state.
657	* NOTE - we do not support op O_DSYNC at this point since we do not support the
658	* fdatasync() call.
659	*/
660	int
661	aio_fsync(proc_t p, struct aio_fsync_args uap, int* *retval)
662	{
663	aio_entry_flags_t fsync_kind;
664	int error;
665
666	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync) \| DBG_FUNC_START,
667	VM_KERNEL_ADDRPERM(p), uap->aiocbp, uap->op, `0`, `0`);
668
669	*retval = `0`;
670	/ 0 := O_SYNC for binary backward compatibility with Panther /
671	if (uap->op == O_SYNC \|\| uap->op == `0`) {
672	fsync_kind = AIO_FSYNC;
673	} else if (uap->op == O_DSYNC) {
674	fsync_kind = AIO_DSYNC;
675	} else {
676	*retval = -`1`;
677	error = EINVAL;
678	goto ExitRoutine;
679	}
680
681	error = aio_queue_async_request(procp: p, aiocbp: uap->aiocbp, fsync_kind);
682	if (error != `0`) {
683	*retval = -`1`;
684	}
685
686	ExitRoutine:
687	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync) \| DBG_FUNC_END,
688	VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, `0`, `0`);
689
690	return error;
691	}
692
693
694	/ aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the*
695	* file descriptor (uap->aiocbp->aio_fildes) into the buffer
696	* (uap->aiocbp->aio_buf).
697	*/
698	int
699	aio_read(proc_t p, struct aio_read_args uap, int* *retval)
700	{
701	int error;
702
703	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_read) \| DBG_FUNC_START,
704	VM_KERNEL_ADDRPERM(p), uap->aiocbp, `0`, `0`, `0`);
705
706	*retval = `0`;
707
708	error = aio_queue_async_request(procp: p, aiocbp: uap->aiocbp, AIO_READ);
709	if (error != `0`) {
710	*retval = -`1`;
711	}
712
713	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_read) \| DBG_FUNC_END,
714	VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, `0`, `0`);
715
716	return error;
717	}
718
719
720	/*
721	* aio_return - return the return status associated with the async IO
722	* request referred to by uap->aiocbp. The return status is the value
723	* that would be returned by corresponding IO request (read, write,
724	* fdatasync, or sync). This is where we release kernel resources
725	* held for async IO call associated with the given aiocb pointer.
726	*/
727	int
728	aio_return(proc_t p, struct aio_return_args uap, user_ssize_t retval)
729	{
730	aio_workq_entry *entryp;
731	int error = EINVAL;
732
733	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_return) \| DBG_FUNC_START,
734	VM_KERNEL_ADDRPERM(p), uap->aiocbp, `0`, `0`, `0`);
735
736	/ See if there are any entries to check /
737	if (!aio_has_any_work()) {
738	goto ExitRoutine;
739	}
740
741	aio_proc_lock(procp: p);
742	*retval = `0`;
743
744	/ look for a match on our queue of async IO requests that have completed /
745	TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
746	ASSERT_AIO_FROM_PROC(entryp, p);
747	if (entryp->uaiocbp == uap->aiocbp) {
748	/ Done and valid for aio_return(), pull it off the list /
749	aio_proc_remove_done_locked(procp: p, entryp);
750
751	*retval = entryp->returnval;
752	error = `0`;
753	aio_proc_unlock(procp: p);
754
755	aio_entry_unref(entryp);
756
757	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val) \| DBG_FUNC_NONE,
758	VM_KERNEL_ADDRPERM(p), uap->aiocbp, *retval, `0`, `0`);
759	goto ExitRoutine;
760	}
761	}
762
763	/ look for a match on our queue of active async IO requests /
764	TAILQ_FOREACH(entryp, &p->p_aio_activeq, aio_proc_link) {
765	ASSERT_AIO_FROM_PROC(entryp, p);
766	if (entryp->uaiocbp == uap->aiocbp) {
767	error = EINPROGRESS;
768	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq) \| DBG_FUNC_NONE,
769	VM_KERNEL_ADDRPERM(p), uap->aiocbp, *retval, `0`, `0`);
770	break;
771	}
772	}
773
774	aio_proc_unlock(procp: p);
775
776	ExitRoutine:
777	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_return) \| DBG_FUNC_END,
778	VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, `0`, `0`);
779
780	return error;
781	}
782
783
784	/*
785	* _aio_exec - internal function used to clean up async IO requests for
786	* a process that is going away due to exec(). We cancel any async IOs
787	* we can and wait for those already active. We also disable signaling
788	* for cancelled or active aio requests that complete.
789	* This routine MAY block!
790	*/
791	__private_extern__ void
792	_aio_exec(proc_t p)
793	{
794	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exec) \| DBG_FUNC_START,
795	VM_KERNEL_ADDRPERM(p), `0`, `0`, `0`, `0`);
796
797	_aio_exit(p);
798
799	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exec) \| DBG_FUNC_END,
800	VM_KERNEL_ADDRPERM(p), `0`, `0`, `0`, `0`);
801	}
802
803
804	/*
805	* _aio_exit - internal function used to clean up async IO requests for
806	* a process that is terminating (via exit() or exec()). We cancel any async IOs
807	* we can and wait for those already active. We also disable signaling
808	* for cancelled or active aio requests that complete. This routine MAY block!
809	*/
810	__private_extern__ void
811	_aio_exit(proc_t p)
812	{
813	TAILQ_HEAD(, aio_workq_entry) tofree = TAILQ_HEAD_INITIALIZER(tofree);
814	aio_workq_entry entryp, tmp;
815	int error;
816
817	/ quick check to see if there are any async IO requests queued up /
818	if (!aio_has_any_work()) {
819	return;
820	}
821
822	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exit) \| DBG_FUNC_START,
823	VM_KERNEL_ADDRPERM(p), `0`, `0`, `0`, `0`);
824
825	aio_proc_lock(procp: p);
826
827	/*
828	* cancel async IO requests on the todo work queue and wait for those
829	* already active to complete.
830	*/
831	error = do_aio_cancel_locked(p, fd: -`1`, USER_ADDR_NULL, AIO_EXIT_WAIT);
832	ASSERT_AIO_PROC_LOCK_OWNED(p);
833	if (error == AIO_NOTCANCELED) {
834	/*
835	* AIO_NOTCANCELED is returned when we find an aio request for this process
836	* on the active async IO queue. Active requests cannot be cancelled so we
837	* must wait for them to complete. We will get a special wake up call on
838	* our channel used to sleep for ALL active requests to complete. This sleep
839	* channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
840	* active aio requests.
841	*/
842
843	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep) \| DBG_FUNC_NONE,
844	VM_KERNEL_ADDRPERM(p), `0`, `0`, `0`, `0`);
845
846	while (aio_has_active_requests_for_process(procp: p)) {
847	msleep(chan: &p->AIO_CLEANUP_SLEEP_CHAN, mtx: aio_proc_mutex(procp: p), PRIBIO, wmesg: "aio_exit", ts: `0`);
848	}
849	}
850
851	assert(!aio_has_active_requests_for_process(p));
852
853	/ release all aio resources used by this process /
854	TAILQ_FOREACH_SAFE(entryp, &p->p_aio_doneq, aio_proc_link, tmp) {
855	ASSERT_AIO_FROM_PROC(entryp, p);
856
857	aio_proc_remove_done_locked(procp: p, entryp);
858	TAILQ_INSERT_TAIL(&tofree, entryp, aio_proc_link);
859	}
860
861	aio_proc_unlock(procp: p);
862
863	/ free all the entries outside of the aio_proc_lock() /
864	TAILQ_FOREACH_SAFE(entryp, &tofree, aio_proc_link, tmp) {
865	entryp->aio_proc_link.tqe_prev = NULL;
866	aio_entry_unref(entryp);
867	}
868
869	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exit) \| DBG_FUNC_END,
870	VM_KERNEL_ADDRPERM(p), `0`, `0`, `0`, `0`);
871	}
872
873
874	static bool
875	should_cancel(aio_workq_entry entryp, int* fd, user_addr_t aiocbp,
876	aio_entry_flags_t reason)
877	{
878	if (reason & AIO_EXIT_WAIT) {
879	/ caller is _aio_exit() /
880	return true;
881	}
882	if (fd != entryp->aiocb.aio_fildes) {
883	/ not the file we're looking for /
884	return false;
885	}
886	/*
887	* aio_cancel() or _aio_close() cancel
888	* everything for a given fd when aiocbp is NULL
889	*/
890	return aiocbp == USER_ADDR_NULL \|\| entryp->uaiocbp == aiocbp;
891	}
892
893	/*
894	* do_aio_cancel_locked - cancel async IO requests (if possible). We get called by
895	* aio_cancel, close, and at exit.
896	* There are three modes of operation: 1) cancel all async IOs for a process -
897	* fd is 0 and aiocbp is NULL 2) cancel all async IOs for file descriptor - fd
898	* is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
899	* aiocbp.
900	* Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
901	* target async IO requests, AIO_NOTCANCELED if we could not cancel all
902	* target async IO requests, and AIO_ALLDONE if all target async IO requests
903	* were already complete.
904	* WARNING - do not deference aiocbp in this routine, it may point to user
905	* land data that has not been copied in (when called from aio_cancel())
906	*
907	* Called with proc locked, and returns the same way.
908	*/
909	static int
910	do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp,
911	aio_entry_flags_t reason)
912	{
913	bool multiple_matches = (aiocbp == USER_ADDR_NULL);
914	aio_workq_entry entryp, tmp;
915	int result;
916
917	ASSERT_AIO_PROC_LOCK_OWNED(p);
918
919	/ look for a match on our queue of async todo work. /
920	again:
921	result = -`1`;
922	TAILQ_FOREACH_SAFE(entryp, &p->p_aio_activeq, aio_proc_link, tmp) {
923	ASSERT_AIO_FROM_PROC(entryp, p);
924
925	if (!should_cancel(entryp, fd, aiocbp, reason)) {
926	continue;
927	}
928
929	if (reason) {
930	/ mark the entry as blocking close or exit/exec /
931	entryp->flags \|= reason;
932	if ((entryp->flags & AIO_EXIT_WAIT) && (entryp->flags & AIO_CLOSE_WAIT)) {
933	panic("Close and exit flags set at the same time");
934	}
935	}
936
937	/ Can only be cancelled if it's still on a work queue /
938	if (aio_entry_try_workq_remove(entryp)) {
939	entryp->errorval = ECANCELED;
940	entryp->returnval = -`1`;
941
942	/ Now it's officially cancelled. Do the completion /
943	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq) \| DBG_FUNC_NONE,
944	VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
945	fd, `0`, `0`);
946	do_aio_completion_and_unlock(p, entryp);
947
948	aio_proc_lock(procp: p);
949
950	if (multiple_matches) {
951	/*
952	* Restart from the head of the proc active queue since it
953	* may have been changed while we were away doing completion
954	* processing.
955	*
956	* Note that if we found an uncancellable AIO before, we will
957	* either find it again or discover that it's been completed,
958	* so resetting the result will not cause us to return success
959	* despite outstanding AIOs.
960	*/
961	goto again;
962	}
963
964	return AIO_CANCELED;
965	}
966
967	/*
968	* It's been taken off the active queue already, i.e. is in flight.
969	* All we can do is ask for notification.
970	*/
971	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq) \| DBG_FUNC_NONE,
972	VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
973	fd, `0`, `0`);
974
975	result = AIO_NOTCANCELED;
976	if (!multiple_matches) {
977	return result;
978	}
979	}
980
981	/*
982	* if we didn't find any matches on the todo or active queues then look for a
983	* match on our queue of async IO requests that have completed and if found
984	* return AIO_ALLDONE result.
985	*
986	* Proc AIO lock is still held.
987	*/
988	if (result == -`1`) {
989	TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
990	ASSERT_AIO_FROM_PROC(entryp, p);
991	if (should_cancel(entryp, fd, aiocbp, reason)) {
992	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq) \| DBG_FUNC_NONE,
993	VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
994	fd, `0`, `0`);
995
996	result = AIO_ALLDONE;
997	if (!multiple_matches) {
998	return result;
999	}
1000	}
1001	}
1002	}
1003
1004	return result;
1005	}
1006
1007
1008	/*
1009	* aio_suspend - suspend the calling thread until at least one of the async
1010	* IO operations referenced by uap->aiocblist has completed, until a signal
1011	* interrupts the function, or uap->timeoutp time interval (optional) has
1012	* passed.
1013	* Returns 0 if one or more async IOs have completed else -1 and errno is
1014	* set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
1015	* woke us up.
1016	*/
1017	int
1018	aio_suspend(proc_t p, struct aio_suspend_args uap, int* *retval)
1019	{
1020	__pthread_testcancel(presyscall: `1`);
1021	return aio_suspend_nocancel(p, (struct aio_suspend_nocancel_args *)uap, retval);
1022	}
1023
1024
1025	int
1026	aio_suspend_nocancel(proc_t p, struct aio_suspend_nocancel_args uap, int* *retval)
1027	{
1028	int error;
1029	int i;
1030	uint64_t abstime;
1031	struct user_timespec ts;
1032	aio_workq_entry *entryp;
1033	user_addr_t *aiocbpp;
1034	size_t aiocbpp_size;
1035
1036	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend) \| DBG_FUNC_START,
1037	VM_KERNEL_ADDRPERM(p), uap->nent, `0`, `0`, `0`);
1038
1039	*retval = -`1`;
1040	abstime = `0`;
1041	aiocbpp = NULL;
1042
1043	if (!aio_has_any_work()) {
1044	error = EINVAL;
1045	goto ExitThisRoutine;
1046	}
1047
1048	if (uap->nent < `1` \|\| uap->nent > aio_max_requests_per_process \|\|
1049	os_mul_overflow(sizeof(user_addr_t), uap->nent, &aiocbpp_size)) {
1050	error = EINVAL;
1051	goto ExitThisRoutine;
1052	}
1053
1054	if (uap->timeoutp != USER_ADDR_NULL) {
1055	if (proc_is64bit(p)) {
1056	struct user64_timespec temp;
1057	error = copyin(uap->timeoutp, &temp, sizeof(temp));
1058	if (error == `0`) {
1059	ts.tv_sec = (user_time_t)temp.tv_sec;
1060	ts.tv_nsec = (user_long_t)temp.tv_nsec;
1061	}
1062	} else {
1063	struct user32_timespec temp;
1064	error = copyin(uap->timeoutp, &temp, sizeof(temp));
1065	if (error == `0`) {
1066	ts.tv_sec = temp.tv_sec;
1067	ts.tv_nsec = temp.tv_nsec;
1068	}
1069	}
1070	if (error != `0`) {
1071	error = EAGAIN;
1072	goto ExitThisRoutine;
1073	}
1074
1075	if (ts.tv_sec < `0` \|\| ts.tv_nsec < `0` \|\| ts.tv_nsec >= `1000000000`) {
1076	error = EINVAL;
1077	goto ExitThisRoutine;
1078	}
1079
1080	nanoseconds_to_absolutetime(nanoseconds: (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
1081	result: &abstime);
1082	clock_absolutetime_interval_to_deadline(abstime, result: &abstime);
1083	}
1084
1085	aiocbpp = (user_addr_t *)kalloc_data(aiocbpp_size, Z_WAITOK);
1086	if (aiocbpp == NULL \|\| aio_copy_in_list(p, uap->aiocblist, aiocbpp, uap->nent)) {
1087	error = EAGAIN;
1088	goto ExitThisRoutine;
1089	}
1090
1091	/ check list of aio requests to see if any have completed /
1092	check_for_our_aiocbp:
1093	aio_proc_lock_spin(procp: p);
1094	for (i = `0`; i < uap->nent; i++) {
1095	user_addr_t aiocbp;
1096
1097	/ NULL elements are legal so check for 'em /
1098	aiocbp = *(aiocbpp + i);
1099	if (aiocbp == USER_ADDR_NULL) {
1100	continue;
1101	}
1102
1103	/ return immediately if any aio request in the list is done /
1104	TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
1105	ASSERT_AIO_FROM_PROC(entryp, p);
1106	if (entryp->uaiocbp == aiocbp) {
1107	aio_proc_unlock(procp: p);
1108	*retval = `0`;
1109	error = `0`;
1110	goto ExitThisRoutine;
1111	}
1112	}
1113	}
1114
1115	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep) \| DBG_FUNC_NONE,
1116	VM_KERNEL_ADDRPERM(p), uap->nent, `0`, `0`, `0`);
1117
1118	/*
1119	* wait for an async IO to complete or a signal fires or timeout expires.
1120	* we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
1121	* interrupts us. If an async IO completes before a signal fires or our
1122	* timeout expires, we get a wakeup call from aio_work_thread().
1123	*/
1124
1125	error = msleep1(chan: &p->AIO_SUSPEND_SLEEP_CHAN, mtx: aio_proc_mutex(procp: p),
1126	PCATCH \| PWAIT \| PDROP, wmesg: "aio_suspend", timo: abstime);
1127	if (error == `0`) {
1128	/*
1129	* got our wakeup call from aio_work_thread().
1130	* Since we can get a wakeup on this channel from another thread in the
1131	* same process we head back up to make sure this is for the correct aiocbp.
1132	* If it is the correct aiocbp we will return from where we do the check
1133	* (see entryp->uaiocbp == aiocbp after check_for_our_aiocbp label)
1134	* else we will fall out and just sleep again.
1135	*/
1136	goto check_for_our_aiocbp;
1137	} else if (error == EWOULDBLOCK) {
1138	/ our timeout expired /
1139	error = EAGAIN;
1140	} else {
1141	/ we were interrupted /
1142	error = EINTR;
1143	}
1144
1145	ExitThisRoutine:
1146	if (aiocbpp != NULL) {
1147	kfree_data(aiocbpp, aiocbpp_size);
1148	}
1149
1150	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend) \| DBG_FUNC_END,
1151	VM_KERNEL_ADDRPERM(p), uap->nent, error, `0`, `0`);
1152
1153	return error;
1154	}
1155
1156
1157	/ aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the*
1158	* file descriptor (uap->aiocbp->aio_fildes) from the buffer
1159	* (uap->aiocbp->aio_buf).
1160	*/
1161
1162	int
1163	aio_write(proc_t p, struct aio_write_args uap, int* *retval __unused)
1164	{
1165	int error;
1166
1167	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_write) \| DBG_FUNC_START,
1168	VM_KERNEL_ADDRPERM(p), uap->aiocbp, `0`, `0`, `0`);
1169
1170	error = aio_queue_async_request(procp: p, aiocbp: uap->aiocbp, AIO_WRITE);
1171
1172	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_write) \| DBG_FUNC_END,
1173	VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, `0`, `0`);
1174
1175	return error;
1176	}
1177
1178
1179	static int
1180	aio_copy_in_list(proc_t procp, user_addr_t aiocblist, user_addr_t *aiocbpp,
1181	int nent)
1182	{
1183	int result;
1184
1185	/ copyin our aiocb pointers from list /
1186	result = copyin(aiocblist, aiocbpp,
1187	proc_is64bit(procp) ? (nent * sizeof(user64_addr_t))
1188	: (nent * sizeof(user32_addr_t)));
1189	if (result) {
1190	return result;
1191	}
1192
1193	/*
1194	* We depend on a list of user_addr_t's so we need to
1195	* munge and expand when these pointers came from a
1196	* 32-bit process
1197	*/
1198	if (!proc_is64bit(procp)) {
1199	/ copy from last to first to deal with overlap /
1200	user32_addr_t my_ptrp = ((user32_addr_t )aiocbpp) + (nent - `1`);
1201	user_addr_t *my_addrp = aiocbpp + (nent - `1`);
1202
1203	for (int i = `0`; i < nent; i++, my_ptrp--, my_addrp--) {
1204	my_addrp = (user_addr_t) (my_ptrp);
1205	}
1206	}
1207
1208	return `0`;
1209	}
1210
1211
1212	static int
1213	aio_copy_in_sigev(proc_t procp, user_addr_t sigp, struct user_sigevent *sigev)
1214	{
1215	int result = `0`;
1216
1217	if (sigp == USER_ADDR_NULL) {
1218	goto out;
1219	}
1220
1221	/*
1222	* We need to munge aio_sigevent since it contains pointers.
1223	* Since we do not know if sigev_value is an int or a ptr we do
1224	* NOT cast the ptr to a user_addr_t. This means if we send
1225	* this info back to user space we need to remember sigev_value
1226	* was not expanded for the 32-bit case.
1227	*
1228	* Notes: This does NOT affect us since we don't support
1229	* sigev_value yet in the aio context.
1230	*/
1231	if (proc_is64bit(procp)) {
1232	#if __LP64__
1233	struct user64_sigevent sigevent64;
1234
1235	result = copyin(sigp, &sigevent64, sizeof(sigevent64));
1236	if (result == `0`) {
1237	sigev->sigev_notify = sigevent64.sigev_notify;
1238	sigev->sigev_signo = sigevent64.sigev_signo;
1239	sigev->sigev_value.size_equivalent.sival_int = sigevent64.sigev_value.size_equivalent.sival_int;
1240	sigev->sigev_notify_function = sigevent64.sigev_notify_function;
1241	sigev->sigev_notify_attributes = sigevent64.sigev_notify_attributes;
1242	}
1243	#else
1244	panic("64bit process on 32bit kernel is not supported");
1245	#endif
1246	} else {
1247	struct user32_sigevent sigevent32;
1248
1249	result = copyin(sigp, &sigevent32, sizeof(sigevent32));
1250	if (result == `0`) {
1251	sigev->sigev_notify = sigevent32.sigev_notify;
1252	sigev->sigev_signo = sigevent32.sigev_signo;
1253	sigev->sigev_value.size_equivalent.sival_int = sigevent32.sigev_value.sival_int;
1254	sigev->sigev_notify_function = CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
1255	sigev->sigev_notify_attributes = CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
1256	}
1257	}
1258
1259	if (result != `0`) {
1260	result = EAGAIN;
1261	}
1262
1263	out:
1264	return result;
1265	}
1266
1267	/*
1268	* validate user_sigevent. at this point we only support
1269	* sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE. this means
1270	* sigev_value, sigev_notify_function, and sigev_notify_attributes
1271	* are ignored, since SIGEV_THREAD is unsupported. This is consistent
1272	* with no [RTS] (RalTime Signal) option group support.
1273	*/
1274	static int
1275	aio_sigev_validate(const struct user_sigevent *sigev)
1276	{
1277	switch (sigev->sigev_notify) {
1278	case SIGEV_SIGNAL:
1279	{
1280	int signum;
1281
1282	/ make sure we have a valid signal number /
1283	signum = sigev->sigev_signo;
1284	if (signum <= `0` \|\| signum >= NSIG \|\|
1285	signum == SIGKILL \|\| signum == SIGSTOP) {
1286	return EINVAL;
1287	}
1288	}
1289	break;
1290
1291	case SIGEV_NONE:
1292	break;
1293
1294	case SIGEV_THREAD:
1295	/ Unsupported [RTS] /
1296
1297	default:
1298	return EINVAL;
1299	}
1300
1301	return `0`;
1302	}
1303
1304
1305	/*
1306	* aio_try_enqueue_work_locked
1307	*
1308	* Queue up the entry on the aio asynchronous work queue in priority order
1309	* based on the relative priority of the request. We calculate the relative
1310	* priority using the nice value of the caller and the value
1311	*
1312	* Parameters: procp Process queueing the I/O
1313	* entryp The work queue entry being queued
1314	* leader The work leader if any
1315	*
1316	* Returns: Wether the enqueue was successful
1317	*
1318	* Notes: This function is used for both lio_listio and aio
1319	*
1320	* XXX: At some point, we may have to consider thread priority
1321	* rather than process priority, but we don't maintain the
1322	* adjusted priority for threads the POSIX way.
1323	*
1324	* Called with proc locked.
1325	*/
1326	static bool
1327	aio_try_enqueue_work_locked(proc_t procp, aio_workq_entry *entryp,
1328	aio_workq_entry *leader)
1329	{
1330	aio_workq_t queue = aio_entry_workq(entryp);
1331
1332	ASSERT_AIO_PROC_LOCK_OWNED(procp);
1333
1334	/ Onto proc queue /
1335	if (!aio_try_proc_insert_active_locked(procp, entryp)) {
1336	return false;
1337	}
1338
1339	if (leader) {
1340	aio_entry_ref(entryp: leader); / consumed in do_aio_completion_and_unlock /
1341	leader->lio_pending++;
1342	entryp->lio_leader = leader;
1343	}
1344
1345	/ And work queue /
1346	aio_entry_ref(entryp); / consumed in do_aio_completion_and_unlock /
1347	aio_workq_lock_spin(wq: queue);
1348	aio_workq_add_entry_locked(queue, entryp);
1349	waitq_wakeup64_one(waitq: &queue->aioq_waitq, CAST_EVENT64_T(queue),
1350	THREAD_AWAKENED, flags: WAITQ_WAKEUP_DEFAULT);
1351	aio_workq_unlock(wq: queue);
1352
1353	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued) \| DBG_FUNC_START,
1354	VM_KERNEL_ADDRPERM(procp), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1355	entryp->flags, entryp->aiocb.aio_fildes, `0`);
1356	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued) \| DBG_FUNC_END,
1357	entryp->aiocb.aio_offset, `0`, entryp->aiocb.aio_nbytes, `0`, `0`);
1358	return true;
1359	}
1360
1361
1362	/*
1363	* lio_listio - initiate a list of IO requests. We process the list of
1364	* aiocbs either synchronously (mode == LIO_WAIT) or asynchronously
1365	* (mode == LIO_NOWAIT).
1366	*
1367	* The caller gets error and return status for each aiocb in the list
1368	* via aio_error and aio_return. We must keep completed requests until
1369	* released by the aio_return call.
1370	*/
1371	int
1372	lio_listio(proc_t p, struct lio_listio_args uap, int* *retval __unused)
1373	{
1374	aio_workq_entry *entries[AIO_LISTIO_MAX] = { };
1375	user_addr_t aiocbpp[AIO_LISTIO_MAX];
1376	struct user_sigevent aiosigev = { };
1377	int result = `0`;
1378	int lio_count = `0`;
1379
1380	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_listio) \| DBG_FUNC_START,
1381	VM_KERNEL_ADDRPERM(p), uap->nent, uap->mode, `0`, `0`);
1382
1383	if (!(uap->mode == LIO_NOWAIT \|\| uap->mode == LIO_WAIT)) {
1384	result = EINVAL;
1385	goto ExitRoutine;
1386	}
1387
1388	if (uap->nent < `1` \|\| uap->nent > AIO_LISTIO_MAX) {
1389	result = EINVAL;
1390	goto ExitRoutine;
1391	}
1392
1393	/*
1394	* Use sigevent passed in to lio_listio for each of our calls, but
1395	* only do completion notification after the last request completes.
1396	*/
1397	if (uap->sigp != USER_ADDR_NULL) {
1398	result = aio_copy_in_sigev(procp: p, sigp: uap->sigp, sigev: &aiosigev);
1399	if (result) {
1400	goto ExitRoutine;
1401	}
1402	result = aio_sigev_validate(sigev: &aiosigev);
1403	if (result) {
1404	goto ExitRoutine;
1405	}
1406	}
1407
1408	if (aio_copy_in_list(procp: p, aiocblist: uap->aiocblist, aiocbpp, nent: uap->nent)) {
1409	result = EAGAIN;
1410	goto ExitRoutine;
1411	}
1412
1413	/*
1414	* allocate/parse all entries
1415	*/
1416	for (int i = `0`; i < uap->nent; i++) {
1417	aio_workq_entry *entryp;
1418
1419	/ NULL elements are legal so check for 'em /
1420	if (aiocbpp[i] == USER_ADDR_NULL) {
1421	continue;
1422	}
1423
1424	entryp = aio_create_queue_entry(procp: p, aiocbp: aiocbpp[i], AIO_LIO);
1425	if (entryp == NULL) {
1426	result = EAGAIN;
1427	goto ExitRoutine;
1428	}
1429
1430	/*
1431	* This refcount is cleaned up on exit if the entry
1432	* isn't submitted
1433	*/
1434	entries[lio_count++] = entryp;
1435	if (uap->mode == LIO_NOWAIT) {
1436	/ Set signal hander, if any /
1437	entryp->aiocb.aio_sigevent = aiosigev;
1438	}
1439	}
1440
1441	if (lio_count == `0`) {
1442	/ There's nothing to submit /
1443	goto ExitRoutine;
1444	}
1445
1446	/*
1447	* Past this point we're commited and will not bail out
1448	*
1449	* - keep a reference on the leader for LIO_WAIT
1450	* - perform the submissions and optionally wait
1451	*/
1452
1453	aio_workq_entry *leader = entries[`0`];
1454	if (uap->mode == LIO_WAIT) {
1455	aio_entry_ref(entryp: leader); / consumed below /
1456	}
1457
1458	aio_proc_lock_spin(procp: p);
1459
1460	for (int i = `0`; i < lio_count; i++) {
1461	if (aio_try_enqueue_work_locked(procp: p, entryp: entries[i], leader)) {
1462	entries[i] = NULL; / the entry was submitted /
1463	} else {
1464	result = EAGAIN;
1465	}
1466	}
1467
1468	if (uap->mode == LIO_WAIT && result == `0`) {
1469	leader->flags \|= AIO_LIO_WAIT;
1470
1471	while (leader->lio_pending) {
1472	/ If we were interrupted, fail out (even if all finished) /
1473	if (msleep(chan: leader, mtx: aio_proc_mutex(procp: p),
1474	PCATCH \| PRIBIO \| PSPIN, wmesg: "lio_listio", ts: `0`) != `0`) {
1475	result = EINTR;
1476	break;
1477	}
1478	}
1479
1480	leader->flags &= ~AIO_LIO_WAIT;
1481	}
1482
1483	aio_proc_unlock(procp: p);
1484
1485	if (uap->mode == LIO_WAIT) {
1486	aio_entry_unref(entryp: leader);
1487	}
1488
1489	ExitRoutine:
1490	/ Consume unsubmitted entries /
1491	for (int i = `0`; i < lio_count; i++) {
1492	if (entries[i]) {
1493	aio_entry_unref(entryp: entries[i]);
1494	}
1495	}
1496
1497	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_listio) \| DBG_FUNC_END,
1498	VM_KERNEL_ADDRPERM(p), result, `0`, `0`, `0`);
1499
1500	return result;
1501	}
1502
1503
1504	/*
1505	* aio worker thread. this is where all the real work gets done.
1506	* we get a wake up call on sleep channel &aio_anchor.aio_async_workq
1507	* after new work is queued up.
1508	*/
1509	__attribute__((noreturn))
1510	static void
1511	aio_work_thread(void *arg __unused, wait_result_t wr __unused)
1512	{
1513	aio_workq_entry *entryp;
1514	int error;
1515	vm_map_t currentmap;
1516	vm_map_t oldmap = VM_MAP_NULL;
1517	task_t oldaiotask = TASK_NULL;
1518	struct uthread *uthreadp = NULL;
1519	proc_t p = NULL;
1520
1521	for (;;) {
1522	/*
1523	* returns with the entry ref'ed.
1524	* sleeps until work is available.
1525	*/
1526	entryp = aio_get_some_work();
1527	p = entryp->procp;
1528
1529	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread) \| DBG_FUNC_START,
1530	VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1531	entryp->flags, `0`, `0`);
1532
1533	/*
1534	* Assume the target's address space identity for the duration
1535	* of the IO. Note: don't need to have the entryp locked,
1536	* because the proc and map don't change until it's freed.
1537	*/
1538	currentmap = get_task_map(proc_task(current_proc()));
1539	if (currentmap != entryp->aio_map) {
1540	uthreadp = (struct uthread *) current_uthread();
1541	oldaiotask = uthreadp->uu_aio_task;
1542	/*
1543	* workq entries at this stage cause _aio_exec() and _aio_exit() to
1544	* block until we hit `do_aio_completion_and_unlock()` below,
1545	* which means that it is safe to dereference p->task without
1546	* holding a lock or taking references.
1547	*/
1548	uthreadp->uu_aio_task = proc_task(p);
1549	oldmap = vm_map_switch(map: entryp->aio_map);
1550	}
1551
1552	if ((entryp->flags & AIO_READ) != `0`) {
1553	error = do_aio_read(entryp);
1554	} else if ((entryp->flags & AIO_WRITE) != `0`) {
1555	uthreadp = (struct uthread *)current_uthread();
1556	uthread_t context_uthreadp = get_bsdthread_info(vfs_context_thread(ctx: &entryp->context));
1557
1558	if ((context_uthreadp && (context_uthreadp->uu_flag & UT_FS_BLKSIZE_NOCACHE_WRITES)) \|\|
1559	os_atomic_load(&p->p_vfs_iopolicy, relaxed) & P_VFS_IOPOLICY_NOCACHE_WRITE_FS_BLKSIZE) {
1560	uthreadp->uu_flag \|= UT_FS_BLKSIZE_NOCACHE_WRITES;
1561	}
1562
1563	error = do_aio_write(entryp);
1564
1565	uthreadp->uu_flag &= ~UT_FS_BLKSIZE_NOCACHE_WRITES;
1566	} else if ((entryp->flags & (AIO_FSYNC \| AIO_DSYNC)) != `0`) {
1567	error = do_aio_fsync(entryp);
1568	} else {
1569	error = EINVAL;
1570	}
1571
1572	/ Restore old map /
1573	if (currentmap != entryp->aio_map) {
1574	vm_map_switch(map: oldmap);
1575	uthreadp->uu_aio_task = oldaiotask;
1576	}
1577
1578	/ liberate unused map /
1579	vm_map_deallocate(map: entryp->aio_map);
1580	entryp->aio_map = VM_MAP_NULL;
1581
1582	KERNEL_DEBUG(SDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread) \| DBG_FUNC_END,
1583	VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1584	entryp->errorval, entryp->returnval, `0`);
1585
1586	/ we're done with the IO request so pop it off the active queue and /
1587	/ push it on the done queue /
1588	aio_proc_lock(procp: p);
1589	entryp->errorval = error;
1590	do_aio_completion_and_unlock(p, entryp);
1591	}
1592	}
1593
1594
1595	/*
1596	* aio_get_some_work - get the next async IO request that is ready to be executed.
1597	* aio_fsync complicates matters a bit since we cannot do the fsync until all async
1598	* IO requests at the time the aio_fsync call came in have completed.
1599	* NOTE - AIO_LOCK must be held by caller
1600	*/
1601	static aio_workq_entry *
1602	aio_get_some_work(void)
1603	{
1604	aio_workq_entry *entryp = NULL;
1605	aio_workq_t queue = NULL;
1606
1607	/ Just one queue for the moment. In the future there will be many. /
1608	queue = &aio_anchor.aio_async_workqs[`0`];
1609	aio_workq_lock_spin(wq: queue);
1610
1611	/*
1612	* Hold the queue lock.
1613	*
1614	* pop some work off the work queue and add to our active queue
1615	* Always start with the queue lock held.
1616	*/
1617	while ((entryp = TAILQ_FIRST(&queue->aioq_entries))) {
1618	/*
1619	* Pull of of work queue. Once it's off, it can't be cancelled,
1620	* so we can take our ref once we drop the queue lock.
1621	*/
1622
1623	aio_workq_remove_entry_locked(queue, entryp);
1624
1625	aio_workq_unlock(wq: queue);
1626
1627	/*
1628	* Check if it's an fsync that must be delayed. No need to lock the entry;
1629	* that flag would have been set at initialization.
1630	*/
1631	if ((entryp->flags & AIO_FSYNC) != `0`) {
1632	/*
1633	* Check for unfinished operations on the same file
1634	* in this proc's queue.
1635	*/
1636	aio_proc_lock_spin(procp: entryp->procp);
1637	if (aio_delay_fsync_request(entryp)) {
1638	/ It needs to be delayed. Put it back on the end of the work queue /
1639	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay) \| DBG_FUNC_NONE,
1640	VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1641	`0`, `0`, `0`);
1642
1643	aio_proc_unlock(procp: entryp->procp);
1644
1645	aio_workq_lock_spin(wq: queue);
1646	aio_workq_add_entry_locked(queue, entryp);
1647	continue;
1648	}
1649	aio_proc_unlock(procp: entryp->procp);
1650	}
1651
1652	return entryp;
1653	}
1654
1655	/ We will wake up when someone enqueues something /
1656	waitq_assert_wait64(waitq: &queue->aioq_waitq, CAST_EVENT64_T(queue), THREAD_UNINT, deadline: `0`);
1657	aio_workq_unlock(wq: queue);
1658	thread_block(continuation: aio_work_thread);
1659
1660	__builtin_unreachable();
1661	}
1662
1663	/*
1664	* aio_delay_fsync_request - look to see if this aio_fsync request should be delayed.
1665	* A big, simple hammer: only send it off if it's the most recently filed IO which has
1666	* not been completed.
1667	*/
1668	static boolean_t
1669	aio_delay_fsync_request(aio_workq_entry *entryp)
1670	{
1671	if (proc_in_teardown(entryp->procp)) {
1672	/*
1673	* we can't delay FSYNCS when in teardown as it will confuse _aio_exit,
1674	* if it was dequeued, then we must now commit to it
1675	*/
1676	return FALSE;
1677	}
1678
1679	if (entryp == TAILQ_FIRST(&entryp->procp->p_aio_activeq)) {
1680	return FALSE;
1681	}
1682
1683	return TRUE;
1684	}
1685
1686	static aio_workq_entry *
1687	aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, aio_entry_flags_t flags)
1688	{
1689	aio_workq_entry *entryp;
1690
1691	entryp = zalloc_flags(aio_workq_zonep, Z_WAITOK \| Z_ZERO);
1692	entryp->procp = procp;
1693	entryp->uaiocbp = aiocbp;
1694	entryp->flags = flags;
1695	/ consumed in aio_return or _aio_exit /
1696	os_ref_init(&entryp->aio_refcount, &aio_refgrp);
1697
1698	if (proc_is64bit(procp)) {
1699	struct user64_aiocb aiocb64;
1700
1701	if (copyin(aiocbp, &aiocb64, sizeof(aiocb64)) != `0`) {
1702	goto error_exit;
1703	}
1704	do_munge_aiocb_user64_to_user(my_aiocbp: &aiocb64, the_user_aiocbp: &entryp->aiocb);
1705	} else {
1706	struct user32_aiocb aiocb32;
1707
1708	if (copyin(aiocbp, &aiocb32, sizeof(aiocb32)) != `0`) {
1709	goto error_exit;
1710	}
1711	do_munge_aiocb_user32_to_user(my_aiocbp: &aiocb32, the_user_aiocbp: &entryp->aiocb);
1712	}
1713
1714	/ do some more validation on the aiocb and embedded file descriptor /
1715	if (aio_validate(procp, entryp) != `0`) {
1716	goto error_exit;
1717	}
1718
1719	/ get a reference to the user land map in order to keep it around /
1720	entryp->aio_map = get_task_map(proc_task(procp));
1721	vm_map_reference(map: entryp->aio_map);
1722
1723	/ get a reference on the current_thread, which is passed in vfs_context. /
1724	entryp->context = *vfs_context_current();
1725	thread_reference(thread: entryp->context.vc_thread);
1726	kauth_cred_ref(cred: entryp->context.vc_ucred);
1727	return entryp;
1728
1729	error_exit:
1730	zfree(aio_workq_zonep, entryp);
1731	return NULL;
1732	}
1733
1734
1735	/*
1736	* aio_queue_async_request - queue up an async IO request on our work queue then
1737	* wake up one of our worker threads to do the actual work. We get a reference
1738	* to our caller's user land map in order to keep it around while we are
1739	* processing the request.
1740	*/
1741	static int
1742	aio_queue_async_request(proc_t procp, user_addr_t aiocbp,
1743	aio_entry_flags_t flags)
1744	{
1745	aio_workq_entry *entryp;
1746	int result;
1747
1748	entryp = aio_create_queue_entry(procp, aiocbp, flags);
1749	if (entryp == NULL) {
1750	result = EAGAIN;
1751	goto error_noalloc;
1752	}
1753
1754	aio_proc_lock_spin(procp);
1755	if (!aio_try_enqueue_work_locked(procp, entryp, NULL)) {
1756	result = EAGAIN;
1757	goto error_exit;
1758	}
1759	aio_proc_unlock(procp);
1760	return `0`;
1761
1762	error_exit:
1763	/*
1764	* This entry has not been queued up so no worries about
1765	* unlocked state and aio_map
1766	*/
1767	aio_proc_unlock(procp);
1768	aio_free_request(entryp);
1769	error_noalloc:
1770	return result;
1771	}
1772
1773
1774	/*
1775	* aio_free_request - remove our reference on the user land map and
1776	* free the work queue entry resources. The entry is off all lists
1777	* and has zero refcount, so no one can have a pointer to it.
1778	*/
1779	static void
1780	aio_free_request(aio_workq_entry *entryp)
1781	{
1782	if (entryp->aio_proc_link.tqe_prev \|\| entryp->aio_workq_link.tqe_prev) {
1783	panic("aio_workq_entry %p being freed while still enqueued", entryp);
1784	}
1785
1786	/ remove our reference to the user land map. /
1787	if (VM_MAP_NULL != entryp->aio_map) {
1788	vm_map_deallocate(map: entryp->aio_map);
1789	}
1790
1791	/ remove our reference to thread which enqueued the request /
1792	if (entryp->context.vc_thread) {
1793	thread_deallocate(thread: entryp->context.vc_thread);
1794	}
1795	kauth_cred_unref(&entryp->context.vc_ucred);
1796
1797	zfree(aio_workq_zonep, entryp);
1798	}
1799
1800
1801	/*
1802	* aio_validate
1803	*
1804	* validate the aiocb passed in by one of the aio syscalls.
1805	*/
1806	static int
1807	aio_validate(proc_t p, aio_workq_entry *entryp)
1808	{
1809	struct fileproc *fp;
1810	int flag;
1811	int result;
1812
1813	result = `0`;
1814
1815	if ((entryp->flags & AIO_LIO) != `0`) {
1816	if (entryp->aiocb.aio_lio_opcode == LIO_READ) {
1817	entryp->flags \|= AIO_READ;
1818	} else if (entryp->aiocb.aio_lio_opcode == LIO_WRITE) {
1819	entryp->flags \|= AIO_WRITE;
1820	} else if (entryp->aiocb.aio_lio_opcode == LIO_NOP) {
1821	return `0`;
1822	} else {
1823	return EINVAL;
1824	}
1825	}
1826
1827	flag = FREAD;
1828	if ((entryp->flags & (AIO_WRITE \| AIO_FSYNC \| AIO_DSYNC)) != `0`) {
1829	flag = FWRITE;
1830	}
1831
1832	if ((entryp->flags & (AIO_READ \| AIO_WRITE)) != `0`) {
1833	if (entryp->aiocb.aio_nbytes > INT_MAX \|\|
1834	entryp->aiocb.aio_buf == USER_ADDR_NULL \|\|
1835	entryp->aiocb.aio_offset < `0`) {
1836	return EINVAL;
1837	}
1838	}
1839
1840	result = aio_sigev_validate(sigev: &entryp->aiocb.aio_sigevent);
1841	if (result) {
1842	return result;
1843	}
1844
1845	/ validate the file descriptor and that the file was opened*
1846	* for the appropriate read / write access.
1847	*/
1848	proc_fdlock(p);
1849
1850	fp = fp_get_noref_locked(p, fd: entryp->aiocb.aio_fildes);
1851	if (fp == NULL) {
1852	result = EBADF;
1853	} else if ((fp->fp_glob->fg_flag & flag) == `0`) {
1854	/ we don't have read or write access /
1855	result = EBADF;
1856	} else if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_VNODE) {
1857	/ this is not a file /
1858	result = ESPIPE;
1859	} else {
1860	fp->fp_flags \|= FP_AIOISSUED;
1861	}
1862
1863	proc_fdunlock(p);
1864
1865	return result;
1866	}
1867
1868	/*
1869	* do_aio_completion_and_unlock. Handle async IO completion.
1870	*/
1871	static void
1872	do_aio_completion_and_unlock(proc_t p, aio_workq_entry *entryp)
1873	{
1874	aio_workq_entry *leader = entryp->lio_leader;
1875	int lio_pending = `0`;
1876	bool do_signal = false;
1877
1878	ASSERT_AIO_PROC_LOCK_OWNED(p);
1879
1880	aio_proc_move_done_locked(procp: p, entryp);
1881
1882	if (leader) {
1883	lio_pending = --leader->lio_pending;
1884	if (lio_pending < `0`) {
1885	panic("lio_pending accounting mistake");
1886	}
1887	if (lio_pending == `0` && (leader->flags & AIO_LIO_WAIT)) {
1888	wakeup(chan: leader);
1889	}
1890	entryp->lio_leader = NULL; / no dangling pointers please /
1891	}
1892
1893	/*
1894	* need to handle case where a process is trying to exit, exec, or
1895	* close and is currently waiting for active aio requests to complete.
1896	* If AIO_CLEANUP_WAIT is set then we need to look to see if there are any
1897	* other requests in the active queue for this process. If there are
1898	* none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.
1899	* If there are some still active then do nothing - we only want to
1900	* wakeup when all active aio requests for the process are complete.
1901	*/
1902	if (__improbable(entryp->flags & AIO_EXIT_WAIT)) {
1903	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait) \| DBG_FUNC_NONE,
1904	VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1905	`0`, `0`, `0`);
1906
1907	if (!aio_has_active_requests_for_process(procp: p)) {
1908	/*
1909	* no active aio requests for this process, continue exiting. In this
1910	* case, there should be no one else waiting ont he proc in AIO...
1911	*/
1912	wakeup_one(chan: (caddr_t)&p->AIO_CLEANUP_SLEEP_CHAN);
1913
1914	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake) \| DBG_FUNC_NONE,
1915	VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1916	`0`, `0`, `0`);
1917	}
1918	} else if (entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
1919	/*
1920	* If this was the last request in the group, or not part of
1921	* a group, and that a signal is desired, send one.
1922	*/
1923	do_signal = (lio_pending == `0`);
1924	}
1925
1926	if (__improbable(entryp->flags & AIO_CLOSE_WAIT)) {
1927	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait) \| DBG_FUNC_NONE,
1928	VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1929	`0`, `0`, `0`);
1930
1931	if (!aio_proc_has_active_requests_for_file(procp: p, fd: entryp->aiocb.aio_fildes)) {
1932	/ Can't wakeup_one(); multiple closes might be in progress. /
1933	wakeup(chan: &p->AIO_CLEANUP_SLEEP_CHAN);
1934
1935	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake) \| DBG_FUNC_NONE,
1936	VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1937	`0`, `0`, `0`);
1938	}
1939	}
1940
1941	aio_proc_unlock(procp: p);
1942
1943	if (do_signal) {
1944	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig) \| DBG_FUNC_NONE,
1945	VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1946	entryp->aiocb.aio_sigevent.sigev_signo, `0`, `0`);
1947
1948	psignal(p, sig: entryp->aiocb.aio_sigevent.sigev_signo);
1949	}
1950
1951	/*
1952	* A thread in aio_suspend() wants to known about completed IOs. If it checked
1953	* the done list before we moved our AIO there, then it already asserted its wait,
1954	* and we can wake it up without holding the lock. If it checked the list after
1955	* we did our move, then it already has seen the AIO that we moved. Herego, we
1956	* can do our wakeup without holding the lock.
1957	*/
1958	wakeup(chan: &p->AIO_SUSPEND_SLEEP_CHAN);
1959	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake) \| DBG_FUNC_NONE,
1960	VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp), `0`, `0`, `0`);
1961
1962	aio_entry_unref(entryp); / see aio_try_enqueue_work_locked /
1963	if (leader) {
1964	aio_entry_unref(entryp: leader); / see lio_listio /
1965	}
1966	}
1967
1968
1969	/*
1970	* do_aio_read
1971	*/
1972	static int
1973	do_aio_read(aio_workq_entry *entryp)
1974	{
1975	struct proc *p = entryp->procp;
1976	struct fileproc *fp;
1977	int error;
1978
1979	if ((error = fp_lookup(p, fd: entryp->aiocb.aio_fildes, resultfp: &fp, locked: `0`))) {
1980	return error;
1981	}
1982
1983	if (fp->fp_glob->fg_flag & FREAD) {
1984	error = dofileread(ctx: &entryp->context, fp,
1985	bufp: entryp->aiocb.aio_buf,
1986	nbyte: entryp->aiocb.aio_nbytes,
1987	offset: entryp->aiocb.aio_offset, FOF_OFFSET,
1988	retval: &entryp->returnval);
1989	} else {
1990	error = EBADF;
1991	}
1992
1993	fp_drop(p, fd: entryp->aiocb.aio_fildes, fp, locked: `0`);
1994	return error;
1995	}
1996
1997
1998	/*
1999	* do_aio_write
2000	*/
2001	static int
2002	do_aio_write(aio_workq_entry *entryp)
2003	{
2004	struct proc *p = entryp->procp;
2005	struct fileproc *fp;
2006	int error;
2007
2008	if ((error = fp_lookup(p, fd: entryp->aiocb.aio_fildes, resultfp: &fp, locked: `0`))) {
2009	return error;
2010	}
2011
2012	if (fp->fp_glob->fg_flag & FWRITE) {
2013	int flags = `0`;
2014
2015	if ((fp->fp_glob->fg_flag & O_APPEND) == `0`) {
2016	flags \|= FOF_OFFSET;
2017	}
2018
2019	/ NB: tell dofilewrite the offset, and to use the proc cred /
2020	error = dofilewrite(ctx: &entryp->context,
2021	fp,
2022	bufp: entryp->aiocb.aio_buf,
2023	nbyte: entryp->aiocb.aio_nbytes,
2024	offset: entryp->aiocb.aio_offset,
2025	flags,
2026	retval: &entryp->returnval);
2027	} else {
2028	error = EBADF;
2029	}
2030
2031	fp_drop(p, fd: entryp->aiocb.aio_fildes, fp, locked: `0`);
2032	return error;
2033	}
2034
2035
2036	/*
2037	* aio_has_active_requests_for_process - return whether the process has active
2038	* requests pending.
2039	*/
2040	static bool
2041	aio_has_active_requests_for_process(proc_t procp)
2042	{
2043	return !TAILQ_EMPTY(&procp->p_aio_activeq);
2044	}
2045
2046	/*
2047	* Called with the proc locked.
2048	*/
2049	static bool
2050	aio_proc_has_active_requests_for_file(proc_t procp, int fd)
2051	{
2052	aio_workq_entry *entryp;
2053
2054	TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
2055	if (entryp->aiocb.aio_fildes == fd) {
2056	return true;
2057	}
2058	}
2059
2060	return false;
2061	}
2062
2063
2064	/*
2065	* do_aio_fsync
2066	*/
2067	static int
2068	do_aio_fsync(aio_workq_entry *entryp)
2069	{
2070	struct proc *p = entryp->procp;
2071	struct vnode *vp;
2072	struct fileproc *fp;
2073	int sync_flag;
2074	int error;
2075
2076	/*
2077	* We are never called unless either AIO_FSYNC or AIO_DSYNC are set.
2078	*
2079	* If AIO_DSYNC is set, we can tell the lower layers that it is OK
2080	* to mark for update the metadata not strictly necessary for data
2081	* retrieval, rather than forcing it to disk.
2082	*
2083	* If AIO_FSYNC is set, we have to also wait for metadata not really
2084	* necessary to data retrival are committed to stable storage (e.g.
2085	* atime, mtime, ctime, etc.).
2086	*
2087	* Metadata necessary for data retrieval ust be committed to stable
2088	* storage in either case (file length, etc.).
2089	*/
2090	if (entryp->flags & AIO_FSYNC) {
2091	sync_flag = MNT_WAIT;
2092	} else {
2093	sync_flag = MNT_DWAIT;
2094	}
2095
2096	error = fp_get_ftype(p, fd: entryp->aiocb.aio_fildes, ftype: DTYPE_VNODE, ENOTSUP, fpp: &fp);
2097	if (error != `0`) {
2098	entryp->returnval = -`1`;
2099	return error;
2100	}
2101	vp = fp_get_data(fp);
2102
2103	if ((error = vnode_getwithref(vp)) == `0`) {
2104	error = VNOP_FSYNC(vp, waitfor: sync_flag, ctx: &entryp->context);
2105
2106	(void)vnode_put(vp);
2107	} else {
2108	entryp->returnval = -`1`;
2109	}
2110
2111	fp_drop(p, fd: entryp->aiocb.aio_fildes, fp, locked: `0`);
2112	return error;
2113	}
2114
2115
2116	/*
2117	* is_already_queued - runs through our queues to see if the given
2118	* aiocbp / process is there. Returns TRUE if there is a match
2119	* on any of our aio queues.
2120	*
2121	* Called with proc aio lock held (can be held spin)
2122	*/
2123	static boolean_t
2124	is_already_queued(proc_t procp, user_addr_t aiocbp)
2125	{
2126	aio_workq_entry *entryp;
2127	boolean_t result;
2128
2129	result = FALSE;
2130
2131	/ look for matches on our queue of async IO requests that have completed /
2132	TAILQ_FOREACH(entryp, &procp->p_aio_doneq, aio_proc_link) {
2133	if (aiocbp == entryp->uaiocbp) {
2134	result = TRUE;
2135	goto ExitThisRoutine;
2136	}
2137	}
2138
2139	/ look for matches on our queue of active async IO requests /
2140	TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
2141	if (aiocbp == entryp->uaiocbp) {
2142	result = TRUE;
2143	goto ExitThisRoutine;
2144	}
2145	}
2146
2147	ExitThisRoutine:
2148	return result;
2149	}
2150
2151
2152	/*
2153	* aio initialization
2154	*/
2155	__private_extern__ void
2156	aio_init(void)
2157	{
2158	for (int i = `0`; i < AIO_NUM_WORK_QUEUES; i++) {
2159	aio_workq_init(wq: &aio_anchor.aio_async_workqs[i]);
2160	}
2161
2162	_aio_create_worker_threads(num: aio_worker_threads);
2163	}
2164
2165
2166	/*
2167	* aio worker threads created here.
2168	*/
2169	__private_extern__ void
2170	_aio_create_worker_threads(int num)
2171	{
2172	int i;
2173
2174	/ create some worker threads to handle the async IO requests /
2175	for (i = `0`; i < num; i++) {
2176	thread_t myThread;
2177
2178	if (KERN_SUCCESS != kernel_thread_start(continuation: aio_work_thread, NULL, new_thread: &myThread)) {
2179	printf("%s - failed to create a work thread \n", __FUNCTION__);
2180	} else {
2181	thread_deallocate(thread: myThread);
2182	}
2183	}
2184	}
2185
2186	/*
2187	* Return the current activation utask
2188	*/
2189	task_t
2190	get_aiotask(void)
2191	{
2192	return current_uthread()->uu_aio_task;
2193	}
2194
2195
2196	/*
2197	* In the case of an aiocb from a
2198	* 32-bit process we need to expand some longs and pointers to the correct
2199	* sizes in order to let downstream code always work on the same type of
2200	* aiocb (in our case that is a user_aiocb)
2201	*/
2202	static void
2203	do_munge_aiocb_user32_to_user(struct user32_aiocb my_aiocbp, struct* user_aiocb *the_user_aiocbp)
2204	{
2205	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2206	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2207	the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
2208	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2209	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2210	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2211
2212	/ special case here. since we do not know if sigev_value is an /
2213	/ int or a ptr we do NOT cast the ptr to a user_addr_t. This /
2214	/ means if we send this info back to user space we need to remember /
2215	/ sigev_value was not expanded for the 32-bit case. /
2216	/ NOTE - this does NOT affect us since we don't support sigev_value /
2217	/ yet in the aio context. /
2218	//LP64
2219	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2220	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2221	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2222	my_aiocbp->aio_sigevent.sigev_value.sival_int;
2223	the_user_aiocbp->aio_sigevent.sigev_notify_function =
2224	CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
2225	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2226	CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
2227	}
2228
2229	/ Similar for 64-bit user process, so that we don't need to satisfy*
2230	* the alignment constraints of the original user64_aiocb
2231	*/
2232	#if !__LP64__
2233	__dead2
2234	#endif
2235	static void
2236	do_munge_aiocb_user64_to_user(struct user64_aiocb my_aiocbp, struct* user_aiocb *the_user_aiocbp)
2237	{
2238	#if __LP64__
2239	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2240	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2241	the_user_aiocbp->aio_buf = my_aiocbp->aio_buf;
2242	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2243	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2244	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2245
2246	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2247	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2248	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2249	my_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int;
2250	the_user_aiocbp->aio_sigevent.sigev_notify_function =
2251	my_aiocbp->aio_sigevent.sigev_notify_function;
2252	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2253	my_aiocbp->aio_sigevent.sigev_notify_attributes;
2254	#else
2255	#pragma unused(my_aiocbp, the_user_aiocbp)
2256	panic("64bit process on 32bit kernel is not supported");
2257	#endif
2258	}
2259

Browse the source code of xnu/bsd/kern/kern_aio.c