kern_aio.c source code [xnu/bsd/kern/kern_aio.c]

1	/*
2	* Copyright (c) 2003-2016 Apple Inc. All rights reserved.
3	*
4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. The rights granted to you under the License
10	* may not be used to create, or enable the creation or redistribution of,
11	* unlawful or unlicensed copies of an Apple operating system, or to
12	* circumvent, violate, or enable the circumvention or violation of, any
13	* terms of an Apple operating system software license agreement.
14	*
15	* Please obtain a copy of the License at
16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
17	*
18	* The Original Code and all software distributed under the License are
19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23	* Please see the License for the specific language governing rights and
24	* limitations under the License.
25	*
26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27	*/
28
29
30	/*
31	* todo:
32	* 1) ramesh is looking into how to replace taking a reference on
33	* the user's map (vm_map_reference()) since it is believed that
34	* would not hold the process for us.
35	* 2) david is looking into a way for us to set the priority of the
36	* worker threads to match that of the user's thread when the
37	* async IO was queued.
38	*/
39
40
41	/*
42	* This file contains support for the POSIX 1003.1B AIO/LIO facility.
43	*/
44
45	#include <sys/systm.h>
46	#include <sys/fcntl.h>
47	#include <sys/file_internal.h>
48	#include <sys/filedesc.h>
49	#include <sys/kernel.h>
50	#include <sys/vnode_internal.h>
51	#include <sys/malloc.h>
52	#include <sys/mount_internal.h>
53	#include <sys/param.h>
54	#include <sys/proc_internal.h>
55	#include <sys/sysctl.h>
56	#include <sys/unistd.h>
57	#include <sys/user.h>
58
59	#include <sys/aio_kern.h>
60	#include <sys/sysproto.h>
61
62	#include <machine/limits.h>
63
64	#include <mach/mach_types.h>
65	#include <kern/kern_types.h>
66	#include <kern/waitq.h>
67	#include <kern/zalloc.h>
68	#include <kern/task.h>
69	#include <kern/sched_prim.h>
70
71	#include <vm/vm_map.h>
72
73	#include <libkern/OSAtomic.h>
74
75	#include <sys/kdebug.h>
/*
 * kdebug sub-codes for the AIO subsystem.  In this file they are passed as
 * the code argument of BSDDBG_CODE(DBG_BSD_AIO, ...) at KERNEL_DEBUG sites.
 */

/* work queue / completion events */
#define AIO_work_queued				1
#define AIO_worker_wake				2
#define AIO_completion_sig			3
#define AIO_completion_cleanup_wait		4
#define AIO_completion_cleanup_wake		5
#define AIO_completion_suspend_wake		6
#define AIO_fsync_delay				7

/* cancellation */
#define AIO_cancel				10
#define AIO_cancel_async_workq			11
#define AIO_cancel_sync_workq			12
#define AIO_cancel_activeq			13
#define AIO_cancel_doneq			14

/* request submission */
#define AIO_fsync				20
#define AIO_read				30
#define AIO_write				40
#define AIO_listio				50

/* aio_error() */
#define AIO_error				60
#define AIO_error_val				61
#define AIO_error_activeq			62
#define AIO_error_workq				63

/* aio_return() */
#define AIO_return				70
#define AIO_return_val				71
#define AIO_return_activeq			72
#define AIO_return_workq			73

/* process / descriptor teardown and waiting */
#define AIO_exec				80
#define AIO_exit				90
#define AIO_exit_sleep				91
#define AIO_close				100
#define AIO_close_sleep				101
#define AIO_suspend				110
#define AIO_suspend_sleep			111
#define AIO_worker_thread			120

/* Flip to 1 to route every KERNEL_DEBUG in this file to KERNEL_DEBUG_CONSTANT. */
#if 0
#undef KERNEL_DEBUG
#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
#endif
113
114	/*
115	* aio requests queue up on the aio_async_workq or lio_sync_workq (for
116	* lio_listio LIO_WAIT). Requests then move to the per process aio_activeq
117	* (proc.aio_activeq) when one of our worker threads start the IO.
118	* And finally, requests move to the per process aio_doneq (proc.aio_doneq)
119	* when the IO request completes. The request remains on aio_doneq until
120	* user process calls aio_return or the process exits, either way that is our
121	* trigger to release aio resources.
122	*/
123	typedef struct aio_workq {
124	TAILQ_HEAD(, aio_workq_entry) aioq_entries;
125	int aioq_count;
126	lck_mtx_t aioq_mtx;
127	struct waitq aioq_waitq;
128	} *aio_workq_t;
129
130	#define AIO_NUM_WORK_QUEUES 1
131	struct aio_anchor_cb
132	{
133	volatile int32_t aio_inflight_count; / entries that have been taken from a workq /
134	volatile int32_t aio_done_count; / entries on all done queues (proc.aio_doneq) /
135	volatile int32_t aio_total_count; / total extant entries /
136
137	/ Hash table of queues here /
138	int aio_num_workqs;
139	struct aio_workq aio_async_workqs[AIO_NUM_WORK_QUEUES];
140	};
141	typedef struct aio_anchor_cb aio_anchor_cb;
142
/*
 * Context record for lio requests.
 * NOTE(review): the code that maintains these fields is outside this
 * section; the per-field notes are inferred from the names -- confirm.
 */
struct aio_lio_context
{
	int	io_waiter;	/* presumably set while a caller is waiting on the group */
	int	io_issued;	/* presumably the count of requests issued */
	int	io_completed;	/* presumably the count of requests completed */
};
typedef struct aio_lio_context aio_lio_context;
150
151
/*
 * Notes on aio sleep / wake channels.
 * The address of an existing proc structure field serves as the sleep
 * channel (e.g. msleep(&p->AIO_CLEANUP_SLEEP_CHAN, ...)).  The fields picked
 * below currently do not collide with any other kernel routines.
 * At this time, for binary compatibility reasons, we cannot create new proc
 * fields.
 */
#define AIO_SUSPEND_SLEEP_CHAN	p_aio_active_count
#define AIO_CLEANUP_SLEEP_CHAN	p_aio_total_count
160
/*
 * Panic if the entry `aiop` found on a list of `theproc` records a different
 * owner in its procp field.
 *
 * Wrapped in do { } while (0) so the macro expands to exactly one statement
 * and cannot capture a following `else`.  Every use must end with ';'.
 */
#define ASSERT_AIO_FROM_PROC(aiop, theproc)	\
	do {	\
		if ((aiop)->procp != (theproc)) {	\
			panic("AIO on a proc list that does not belong to that proc.\n");	\
		}	\
	} while (0)
165
166	/*
167	* LOCAL PROTOTYPES
168	*/
169	static void aio_proc_lock(proc_t procp);
170	static void aio_proc_lock_spin(proc_t procp);
171	static void aio_proc_unlock(proc_t procp);
172	static lck_mtx_t* aio_proc_mutex(proc_t procp);
173	static void aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp);
174	static void aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp);
175	static int aio_get_process_count(proc_t procp );
176	static int aio_active_requests_for_process(proc_t procp );
177	static int aio_proc_active_requests_for_file(proc_t procp, int fd);
178	static boolean_t is_already_queued(proc_t procp, user_addr_t aiocbp );
179	static boolean_t should_cancel(aio_workq_entry entryp, user_addr_t aiocbp, int* fd);
180
181	static void aio_entry_lock(aio_workq_entry *entryp);
182	static void aio_entry_lock_spin(aio_workq_entry *entryp);
183	static aio_workq_t aio_entry_workq(aio_workq_entry *entryp);
184	static lck_mtx_t* aio_entry_mutex(__unused aio_workq_entry *entryp);
185	static void aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
186	static void aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
187	static void aio_entry_ref_locked(aio_workq_entry *entryp);
188	static void aio_entry_unref_locked(aio_workq_entry *entryp);
189	static void aio_entry_ref(aio_workq_entry *entryp);
190	static void aio_entry_unref(aio_workq_entry *entryp);
191	static void aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled,
192	int wait_for_completion, boolean_t disable_notification);
193	static int aio_entry_try_workq_remove(aio_workq_entry *entryp);
194	static boolean_t aio_delay_fsync_request( aio_workq_entry *entryp );
195	static int aio_free_request(aio_workq_entry *entryp);
196
197	static void aio_workq_init(aio_workq_t wq);
198	static void aio_workq_lock_spin(aio_workq_t wq);
199	static void aio_workq_unlock(aio_workq_t wq);
200	static lck_mtx_t* aio_workq_mutex(aio_workq_t wq);
201
202	static void aio_work_thread( void );
203	static aio_workq_entry aio_get_some_work( void* );
204
205	static int aio_get_all_queues_count( void );
206	static int aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO );
207	static int aio_validate( aio_workq_entry *entryp );
208	static int aio_increment_total_count(void);
209	static int aio_decrement_total_count(void);
210
211	static int do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp, int wait_for_completion, boolean_t disable_notification );
212	static void do_aio_completion( aio_workq_entry *entryp );
213	static int do_aio_fsync( aio_workq_entry *entryp );
214	static int do_aio_read( aio_workq_entry *entryp );
215	static int do_aio_write( aio_workq_entry *entryp );
216	static void do_munge_aiocb_user32_to_user( struct user32_aiocb my_aiocbp, struct* user_aiocb *the_user_aiocbp );
217	static void do_munge_aiocb_user64_to_user( struct user64_aiocb my_aiocbp, struct* user_aiocb *the_user_aiocbp );
218	static int lio_create_entry(proc_t procp,
219	user_addr_t aiocbp,
220	void *group_tag,
221	aio_workq_entry **entrypp );
222	static aio_workq_entry *aio_create_queue_entry(proc_t procp,
223	user_addr_t aiocbp,
224	void *group_tag,
225	int kindOfIO);
226	static user_addr_t aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int* nent);
227	static void free_lio_context(aio_lio_context* context);
228	static void aio_enqueue_work( proc_t procp, aio_workq_entry entryp, int* proc_locked);
229
/*
 * Lock ownership checks: each hands the mutex returned by the matching
 * aio_*_mutex() accessor to lck_mtx_assert() with LCK_MTX_ASSERT_OWNED.
 */
#define ASSERT_AIO_PROC_LOCK_OWNED(p)	lck_mtx_assert(aio_proc_mutex((p)), LCK_MTX_ASSERT_OWNED)
#define ASSERT_AIO_WORKQ_LOCK_OWNED(q)	lck_mtx_assert(aio_workq_mutex((q)), LCK_MTX_ASSERT_OWNED)
#define ASSERT_AIO_ENTRY_LOCK_OWNED(e)	lck_mtx_assert(aio_entry_mutex((e)), LCK_MTX_ASSERT_OWNED)
233
234	/*
235	* EXTERNAL PROTOTYPES
236	*/
237
238	/ in ...bsd/kern/sys_generic.c /
239	extern int dofileread(vfs_context_t ctx, struct fileproc *fp,
240	user_addr_t bufp, user_size_t nbyte,
241	off_t offset, int flags, user_ssize_t *retval );
242	extern int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
243	user_addr_t bufp, user_size_t nbyte, off_t offset,
244	int flags, user_ssize_t *retval );
245	#if DEBUG
246	static uint32_t lio_contexts_alloced = `0`;
247	#endif /* DEBUG */
248
/*
 * aio external global variables.
 */
extern int	aio_max_requests;		/* AIO_MAX - configurable */
extern int	aio_max_requests_per_process;	/* AIO_PROCESS_MAX - configurable */
extern int	aio_worker_threads;		/* AIO_THREAD_COUNT - configurable */
255
256
257	/*
258	* aio static variables.
259	*/
260	static aio_anchor_cb aio_anchor;
261	static lck_grp_t *aio_proc_lock_grp;
262	static lck_grp_t *aio_entry_lock_grp;
263	static lck_grp_t *aio_queue_lock_grp;
264	static lck_attr_t *aio_lock_attr;
265	static lck_grp_attr_t *aio_lock_grp_attr;
266	static struct zone *aio_workq_zonep;
267	static lck_mtx_t aio_entry_mtx;
268	static lck_mtx_t aio_proc_mtx;
269
270	static void
271	aio_entry_lock(__unused aio_workq_entry *entryp)
272	{
273	lck_mtx_lock(&aio_entry_mtx);
274	}
275
276	static void
277	aio_entry_lock_spin(__unused aio_workq_entry *entryp)
278	{
279	lck_mtx_lock_spin(&aio_entry_mtx);
280	}
281
282	static void
283	aio_entry_unlock(__unused aio_workq_entry *entryp)
284	{
285	lck_mtx_unlock(&aio_entry_mtx);
286	}
287
288	/ Hash /
289	static aio_workq_t
290	aio_entry_workq(__unused aio_workq_entry *entryp)
291	{
292	return &aio_anchor.aio_async_workqs[`0`];
293	}
294
295	static lck_mtx_t*
296	aio_entry_mutex(__unused aio_workq_entry *entryp)
297	{
298	return &aio_entry_mtx;
299	}
300
301	static void
302	aio_workq_init(aio_workq_t wq)
303	{
304	TAILQ_INIT(&wq->aioq_entries);
305	wq->aioq_count = `0`;
306	lck_mtx_init(&wq->aioq_mtx, aio_queue_lock_grp, aio_lock_attr);
307	waitq_init(&wq->aioq_waitq, SYNC_POLICY_FIFO);
308	}
309
310
311	/*
312	* Can be passed a queue which is locked spin.
313	*/
314	static void
315	aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
316	{
317	ASSERT_AIO_WORKQ_LOCK_OWNED(queue);
318
319	if (entryp->aio_workq_link.tqe_prev == NULL) {
320	panic("Trying to remove an entry from a work queue, but it is not on a queue\n");
321	}
322
323	TAILQ_REMOVE(&queue->aioq_entries, entryp, aio_workq_link);
324	queue->aioq_count--;
325	entryp->aio_workq_link.tqe_prev = NULL; / Not on a workq /
326
327	if (queue->aioq_count < `0`) {
328	panic("Negative count on a queue.\n");
329	}
330	}
331
332	static void
333	aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
334	{
335	ASSERT_AIO_WORKQ_LOCK_OWNED(queue);
336
337	TAILQ_INSERT_TAIL(&queue->aioq_entries, entryp, aio_workq_link);
338	if (queue->aioq_count < `0`) {
339	panic("Negative count on a queue.\n");
340	}
341	queue->aioq_count++;
342	}
343
344	static void
345	aio_proc_lock(proc_t procp)
346	{
347	lck_mtx_lock(aio_proc_mutex(procp));
348	}
349
350	static void
351	aio_proc_lock_spin(proc_t procp)
352	{
353	lck_mtx_lock_spin(aio_proc_mutex(procp));
354	}
355
356	static void
357	aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp)
358	{
359	ASSERT_AIO_PROC_LOCK_OWNED(procp);
360
361	TAILQ_REMOVE(&procp->p_aio_activeq, entryp, aio_proc_link );
362	TAILQ_INSERT_TAIL( &procp->p_aio_doneq, entryp, aio_proc_link);
363	procp->p_aio_active_count--;
364	OSIncrementAtomic(&aio_anchor.aio_done_count);
365	}
366
367	static void
368	aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp)
369	{
370	TAILQ_REMOVE(&procp->p_aio_doneq, entryp, aio_proc_link);
371	OSDecrementAtomic(&aio_anchor.aio_done_count);
372	aio_decrement_total_count();
373	procp->p_aio_total_count--;
374	}
375
376	static void
377	aio_proc_unlock(proc_t procp)
378	{
379	lck_mtx_unlock(aio_proc_mutex(procp));
380	}
381
382	static lck_mtx_t*
383	aio_proc_mutex(proc_t procp)
384	{
385	return &procp->p_mlock;
386	}
387
388	static void
389	aio_entry_ref_locked(aio_workq_entry *entryp)
390	{
391	ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);
392
393	if (entryp->aio_refcount < `0`) {
394	panic("AIO workq entry with a negative refcount.\n");
395	}
396	entryp->aio_refcount++;
397	}
398
399
400	/ Return 1 if you've freed it /
401	static void
402	aio_entry_unref_locked(aio_workq_entry *entryp)
403	{
404	ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);
405
406	entryp->aio_refcount--;
407	if (entryp->aio_refcount < `0`) {
408	panic("AIO workq entry with a negative refcount.\n");
409	}
410	}
411
412	static void
413	aio_entry_ref(aio_workq_entry *entryp)
414	{
415	aio_entry_lock_spin(entryp);
416	aio_entry_ref_locked(entryp);
417	aio_entry_unlock(entryp);
418	}
419	static void
420	aio_entry_unref(aio_workq_entry *entryp)
421	{
422	aio_entry_lock_spin(entryp);
423	aio_entry_unref_locked(entryp);
424
425	if ((entryp->aio_refcount == `0`) && ((entryp->flags & AIO_DO_FREE) != `0`)) {
426	aio_entry_unlock(entryp);
427	aio_free_request(entryp);
428	} else {
429	aio_entry_unlock(entryp);
430	}
431
432	return;
433	}
434
435	static void
436	aio_entry_update_for_cancel(aio_workq_entry entryp, boolean_t cancelled, int* wait_for_completion, boolean_t disable_notification)
437	{
438	aio_entry_lock_spin(entryp);
439
440	if (cancelled) {
441	aio_entry_ref_locked(entryp);
442	entryp->errorval = ECANCELED;
443	entryp->returnval = -`1`;
444	}
445
446	if ( wait_for_completion ) {
447	entryp->flags \|= wait_for_completion; / flag for special completion processing /
448	}
449
450	if ( disable_notification ) {
451	entryp->flags \|= AIO_DISABLE; / Don't want a signal /
452	}
453
454	aio_entry_unlock(entryp);
455	}
456
457	static int
458	aio_entry_try_workq_remove(aio_workq_entry *entryp)
459	{
460	/ Can only be cancelled if it's still on a work queue /
461	if (entryp->aio_workq_link.tqe_prev != NULL) {
462	aio_workq_t queue;
463
464	/ Will have to check again under the lock /
465	queue = aio_entry_workq(entryp);
466	aio_workq_lock_spin(queue);
467	if (entryp->aio_workq_link.tqe_prev != NULL) {
468	aio_workq_remove_entry_locked(queue, entryp);
469	aio_workq_unlock(queue);
470	return `1`;
471	} else {
472	aio_workq_unlock(queue);
473	}
474	}
475
476	return `0`;
477	}
478
479	static void
480	aio_workq_lock_spin(aio_workq_t wq)
481	{
482	lck_mtx_lock_spin(aio_workq_mutex(wq));
483	}
484
485	static void
486	aio_workq_unlock(aio_workq_t wq)
487	{
488	lck_mtx_unlock(aio_workq_mutex(wq));
489	}
490
491	static lck_mtx_t*
492	aio_workq_mutex(aio_workq_t wq)
493	{
494	return &wq->aioq_mtx;
495	}
496
497	/*
498	* aio_cancel - attempt to cancel one or more async IO requests currently
499	* outstanding against file descriptor uap->fd. If uap->aiocbp is not
500	* NULL then only one specific IO is cancelled (if possible). If uap->aiocbp
501	* is NULL then all outstanding async IO request for the given file
502	* descriptor are cancelled (if possible).
503	*/
504	int
505	aio_cancel(proc_t p, struct aio_cancel_args uap, int* *retval )
506	{
507	struct user_aiocb my_aiocb;
508	int result;
509
510	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) \| DBG_FUNC_START,
511	(int)p, (int)uap->aiocbp, `0`, `0`, `0` );
512
513	/ quick check to see if there are any async IO requests queued up /
514	if (aio_get_all_queues_count() < `1`) {
515	result = `0`;
516	*retval = AIO_ALLDONE;
517	goto ExitRoutine;
518	}
519
520	*retval = -`1`;
521	if ( uap->aiocbp != USER_ADDR_NULL ) {
522	if ( proc_is64bit(p) ) {
523	struct user64_aiocb aiocb64;
524
525	result = copyin( uap->aiocbp, &aiocb64, sizeof(aiocb64) );
526	if (result == `0` )
527	do_munge_aiocb_user64_to_user(&aiocb64, &my_aiocb);
528
529	} else {
530	struct user32_aiocb aiocb32;
531
532	result = copyin( uap->aiocbp, &aiocb32, sizeof(aiocb32) );
533	if ( result == `0` )
534	do_munge_aiocb_user32_to_user( &aiocb32, &my_aiocb );
535	}
536
537	if ( result != `0` ) {
538	result = EAGAIN;
539	goto ExitRoutine;
540	}
541
542	/ NOTE - POSIX standard says a mismatch between the file /
543	/ descriptor passed in and the file descriptor embedded in /
544	/ the aiocb causes unspecified results. We return EBADF in /
545	/ that situation. /
546	if ( uap->fd != my_aiocb.aio_fildes ) {
547	result = EBADF;
548	goto ExitRoutine;
549	}
550	}
551
552	aio_proc_lock(p);
553	result = do_aio_cancel_locked( p, uap->fd, uap->aiocbp, `0`, FALSE );
554	ASSERT_AIO_PROC_LOCK_OWNED(p);
555	aio_proc_unlock(p);
556
557	if ( result != -`1` ) {
558	*retval = result;
559	result = `0`;
560	goto ExitRoutine;
561	}
562
563	result = EBADF;
564
565	ExitRoutine:
566	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) \| DBG_FUNC_END,
567	(int)p, (int)uap->aiocbp, result, `0`, `0` );
568
569	return( result );
570
571	} / aio_cancel /
572
573
574	/*
575	* _aio_close - internal function used to clean up async IO requests for
576	* a file descriptor that is closing.
577	* THIS MAY BLOCK.
578	*/
579	__private_extern__ void
580	_aio_close(proc_t p, int fd )
581	{
582	int error;
583
584	/ quick check to see if there are any async IO requests queued up /
585	if (aio_get_all_queues_count() < `1`) {
586	return;
587	}
588
589	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) \| DBG_FUNC_START,
590	(int)p, fd, `0`, `0`, `0` );
591
592	/ cancel all async IO requests on our todo queues for this file descriptor /
593	aio_proc_lock(p);
594	error = do_aio_cancel_locked( p, fd, `0`, AIO_CLOSE_WAIT, FALSE );
595	ASSERT_AIO_PROC_LOCK_OWNED(p);
596	if ( error == AIO_NOTCANCELED ) {
597	/*
598	* AIO_NOTCANCELED is returned when we find an aio request for this process
599	* and file descriptor on the active async IO queue. Active requests cannot
600	* be cancelled so we must wait for them to complete. We will get a special
601	* wake up call on our channel used to sleep for ALL active requests to
602	* complete. This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
603	* when we must wait for all active aio requests.
604	*/
605
606	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) \| DBG_FUNC_NONE,
607	(int)p, fd, `0`, `0`, `0` );
608
609	while (aio_proc_active_requests_for_file(p, fd) > `0`) {
610	msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_close", `0` );
611	}
612
613	}
614
615	aio_proc_unlock(p);
616
617	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) \| DBG_FUNC_END,
618	(int)p, fd, `0`, `0`, `0` );
619
620	return;
621
622	} / _aio_close /
623
624
625	/*
626	* aio_error - return the error status associated with the async IO
627	* request referred to by uap->aiocbp. The error status is the errno
628	* value that would be set by the corresponding IO request (read, wrtie,
629	* fdatasync, or sync).
630	*/
631	int
632	aio_error(proc_t p, struct aio_error_args uap, int* *retval )
633	{
634	aio_workq_entry *entryp;
635	int error;
636
637	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) \| DBG_FUNC_START,
638	(int)p, (int)uap->aiocbp, `0`, `0`, `0` );
639
640	/ see if there are any aios to check /
641	if (aio_get_all_queues_count() < `1`) {
642	return EINVAL;
643	}
644
645	aio_proc_lock(p);
646
647	/ look for a match on our queue of async IO requests that have completed /
648	TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
649	if ( entryp->uaiocbp == uap->aiocbp ) {
650	ASSERT_AIO_FROM_PROC(entryp, p);
651
652	aio_entry_lock_spin(entryp);
653	*retval = entryp->errorval;
654	error = `0`;
655	aio_entry_unlock(entryp);
656	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) \| DBG_FUNC_NONE,
657	(int)p, (int)uap->aiocbp, *retval, `0`, `0` );
658	goto ExitRoutine;
659	}
660	}
661
662	/ look for a match on our queue of active async IO requests /
663	TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link) {
664	if ( entryp->uaiocbp == uap->aiocbp ) {
665	ASSERT_AIO_FROM_PROC(entryp, p);
666	*retval = EINPROGRESS;
667	error = `0`;
668	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) \| DBG_FUNC_NONE,
669	(int)p, (int)uap->aiocbp, *retval, `0`, `0` );
670	goto ExitRoutine;
671	}
672	}
673
674	error = EINVAL;
675
676	ExitRoutine:
677	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) \| DBG_FUNC_END,
678	(int)p, (int)uap->aiocbp, error, `0`, `0` );
679	aio_proc_unlock(p);
680
681	return( error );
682
683	} / aio_error /
684
685
686	/*
687	* aio_fsync - asynchronously force all IO operations associated
688	* with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
689	* queued at the time of the call to the synchronized completion state.
690	* NOTE - we do not support op O_DSYNC at this point since we do not support the
691	* fdatasync() call.
692	*/
693	int
694	aio_fsync(proc_t p, struct aio_fsync_args uap, int* *retval )
695	{
696	int error;
697	int fsync_kind;
698
699	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) \| DBG_FUNC_START,
700	(int)p, (int)uap->aiocbp, uap->op, `0`, `0` );
701
702	*retval = `0`;
703	/ 0 := O_SYNC for binary backward compatibility with Panther /
704	if (uap->op == O_SYNC \|\| uap->op == `0`)
705	fsync_kind = AIO_FSYNC;
706	else if ( uap->op == O_DSYNC )
707	fsync_kind = AIO_DSYNC;
708	else {
709	*retval = -`1`;
710	error = EINVAL;
711	goto ExitRoutine;
712	}
713
714	error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
715	if ( error != `0` )
716	*retval = -`1`;
717
718	ExitRoutine:
719	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) \| DBG_FUNC_END,
720	(int)p, (int)uap->aiocbp, error, `0`, `0` );
721
722	return( error );
723
724	} / aio_fsync /
725
726
727	/ aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the*
728	* file descriptor (uap->aiocbp->aio_fildes) into the buffer
729	* (uap->aiocbp->aio_buf).
730	*/
731	int
732	aio_read(proc_t p, struct aio_read_args uap, int* *retval )
733	{
734	int error;
735
736	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) \| DBG_FUNC_START,
737	(int)p, (int)uap->aiocbp, `0`, `0`, `0` );
738
739	*retval = `0`;
740
741	error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
742	if ( error != `0` )
743	*retval = -`1`;
744
745	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) \| DBG_FUNC_END,
746	(int)p, (int)uap->aiocbp, error, `0`, `0` );
747
748	return( error );
749
750	} / aio_read /
751
752
753	/*
754	* aio_return - return the return status associated with the async IO
755	* request referred to by uap->aiocbp. The return status is the value
756	* that would be returned by corresponding IO request (read, write,
757	* fdatasync, or sync). This is where we release kernel resources
758	* held for async IO call associated with the given aiocb pointer.
759	*/
760	int
761	aio_return(proc_t p, struct aio_return_args uap, user_ssize_t retval )
762	{
763	aio_workq_entry *entryp;
764	int error;
765	boolean_t proc_lock_held = FALSE;
766
767	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) \| DBG_FUNC_START,
768	(int)p, (int)uap->aiocbp, `0`, `0`, `0` );
769
770	/ See if there are any entries to check /
771	if (aio_get_all_queues_count() < `1`) {
772	error = EINVAL;
773	goto ExitRoutine;
774	}
775
776	aio_proc_lock(p);
777	proc_lock_held = TRUE;
778	*retval = `0`;
779
780	/ look for a match on our queue of async IO requests that have completed /
781	TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
782	ASSERT_AIO_FROM_PROC(entryp, p);
783	if ( entryp->uaiocbp == uap->aiocbp ) {
784	/ Done and valid for aio_return(), pull it off the list /
785	aio_proc_remove_done_locked(p, entryp);
786
787	/ Drop the proc lock, but keep the entry locked /
788	aio_entry_lock(entryp);
789	aio_proc_unlock(p);
790	proc_lock_held = FALSE;
791
792	*retval = entryp->returnval;
793	error = `0`;
794
795	/ No references and off all lists, safe to free /
796	if (entryp->aio_refcount == `0`) {
797	aio_entry_unlock(entryp);
798	aio_free_request(entryp);
799	}
800	else {
801	/ Whoever has the refcount will have to free it /
802	entryp->flags \|= AIO_DO_FREE;
803	aio_entry_unlock(entryp);
804	}
805
806
807	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) \| DBG_FUNC_NONE,
808	(int)p, (int)uap->aiocbp, *retval, `0`, `0` );
809	goto ExitRoutine;
810	}
811	}
812
813	/ look for a match on our queue of active async IO requests /
814	TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link) {
815	ASSERT_AIO_FROM_PROC(entryp, p);
816	if ( entryp->uaiocbp == uap->aiocbp ) {
817	error = EINPROGRESS;
818	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) \| DBG_FUNC_NONE,
819	(int)p, (int)uap->aiocbp, *retval, `0`, `0` );
820	goto ExitRoutine;
821	}
822	}
823
824	error = EINVAL;
825
826	ExitRoutine:
827	if (proc_lock_held)
828	aio_proc_unlock(p);
829	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) \| DBG_FUNC_END,
830	(int)p, (int)uap->aiocbp, error, `0`, `0` );
831
832	return( error );
833
834	} / aio_return /
835
836
837	/*
838	* _aio_exec - internal function used to clean up async IO requests for
839	* a process that is going away due to exec(). We cancel any async IOs
840	* we can and wait for those already active. We also disable signaling
841	* for cancelled or active aio requests that complete.
842	* This routine MAY block!
843	*/
844	__private_extern__ void
845	_aio_exec(proc_t p )
846	{
847
848	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) \| DBG_FUNC_START,
849	(int)p, `0`, `0`, `0`, `0` );
850
851	_aio_exit( p );
852
853	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) \| DBG_FUNC_END,
854	(int)p, `0`, `0`, `0`, `0` );
855
856	return;
857
858	} / _aio_exec /
859
860
861	/*
862	* _aio_exit - internal function used to clean up async IO requests for
863	* a process that is terminating (via exit() or exec() ). We cancel any async IOs
864	* we can and wait for those already active. We also disable signaling
865	* for cancelled or active aio requests that complete. This routine MAY block!
866	*/
867	__private_extern__ void
868	_aio_exit(proc_t p )
869	{
870	int error;
871	aio_workq_entry *entryp;
872
873
874	/ quick check to see if there are any async IO requests queued up /
875	if (aio_get_all_queues_count() < `1`) {
876	return;
877	}
878
879	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) \| DBG_FUNC_START,
880	(int)p, `0`, `0`, `0`, `0` );
881
882	aio_proc_lock(p);
883
884	/*
885	* cancel async IO requests on the todo work queue and wait for those
886	* already active to complete.
887	*/
888	error = do_aio_cancel_locked( p, `0`, `0`, AIO_EXIT_WAIT, TRUE );
889	ASSERT_AIO_PROC_LOCK_OWNED(p);
890	if ( error == AIO_NOTCANCELED ) {
891	/*
892	* AIO_NOTCANCELED is returned when we find an aio request for this process
893	* on the active async IO queue. Active requests cannot be cancelled so we
894	* must wait for them to complete. We will get a special wake up call on
895	* our channel used to sleep for ALL active requests to complete. This sleep
896	* channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
897	* active aio requests.
898	*/
899
900	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) \| DBG_FUNC_NONE,
901	(int)p, `0`, `0`, `0`, `0` );
902
903	while (p->p_aio_active_count != `0`) {
904	msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_exit", `0` );
905	}
906	}
907
908	if (p->p_aio_active_count != `0`) {
909	panic("Exiting process has %d active AIOs after cancellation has completed.\n", p->p_aio_active_count);
910	}
911
912	/ release all aio resources used by this process /
913	entryp = TAILQ_FIRST( &p->p_aio_doneq );
914	while ( entryp != NULL ) {
915	ASSERT_AIO_FROM_PROC(entryp, p);
916	aio_workq_entry *next_entryp;
917
918	next_entryp = TAILQ_NEXT( entryp, aio_proc_link);
919	aio_proc_remove_done_locked(p, entryp);
920
921	/ we cannot free requests that are still completing /
922	aio_entry_lock_spin(entryp);
923	if (entryp->aio_refcount == `0`) {
924	aio_proc_unlock(p);
925	aio_entry_unlock(entryp);
926	aio_free_request(entryp);
927
928	/ need to start over since aio_doneq may have been /
929	/ changed while we were away. /
930	aio_proc_lock(p);
931	entryp = TAILQ_FIRST( &p->p_aio_doneq );
932	continue;
933	}
934	else {
935	/ whoever has the reference will have to do the free /
936	entryp->flags \|= AIO_DO_FREE;
937	}
938
939	aio_entry_unlock(entryp);
940	entryp = next_entryp;
941	}
942
943	aio_proc_unlock(p);
944
945	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) \| DBG_FUNC_END,
946	(int)p, `0`, `0`, `0`, `0` );
947	return;
948
949	} / _aio_exit /
950
951
952	static boolean_t
953	should_cancel(aio_workq_entry entryp, user_addr_t aiocbp, int* fd)
954	{
955	if ( (aiocbp == USER_ADDR_NULL && fd == `0`) \|\|
956	(aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) \|\|
957	(aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
958	return TRUE;
959	}
960
961	return FALSE;
962	}
963
/*
 * do_aio_cancel_locked - cancel async IO requests (if possible).  We get called by
 * aio_cancel, close, and at exit.
 * There are three modes of operation: 1) cancel all async IOs for a process -
 * fd is 0 and aiocbp is NULL 2) cancel all async IOs for file descriptor - fd
 * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
 * aiocbp.
 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
 * target async IO requests, and AIO_ALLDONE if all target async IO requests
 * were already complete.
 * WARNING - do not dereference aiocbp in this routine, it may point to user
 * land data that has not been copied in (when called from aio_cancel() )
 *
 * Called with proc locked, and returns the same way.
 */
980	static int
981	do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp,
982	int wait_for_completion, boolean_t disable_notification )
983	{
984	ASSERT_AIO_PROC_LOCK_OWNED(p);
985
986	aio_workq_entry *entryp;
987	int result;
988
989	result = -`1`;
990
991	/ look for a match on our queue of async todo work. /
992	entryp = TAILQ_FIRST(&p->p_aio_activeq);
993	while ( entryp != NULL ) {
994	ASSERT_AIO_FROM_PROC(entryp, p);
995	aio_workq_entry *next_entryp;
996
997	next_entryp = TAILQ_NEXT( entryp, aio_proc_link);
998	if (!should_cancel(entryp, aiocbp, fd)) {
999	entryp = next_entryp;
1000	continue;
1001	}
1002
1003	/ Can only be cancelled if it's still on a work queue /
1004	if (aio_entry_try_workq_remove(entryp) != `0`) {
1005	/ Have removed from workq. Update entry state and take a ref /
1006	aio_entry_update_for_cancel(entryp, TRUE, `0`, disable_notification);
1007
1008	/ Put on the proc done queue and update counts, then unlock the proc /
1009	aio_proc_move_done_locked(p, entryp);
1010	aio_proc_unlock(p);
1011
1012	/ Now it's officially cancelled. Do the completion /
1013	result = AIO_CANCELED;
1014	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) \| DBG_FUNC_NONE,
1015	(int)entryp->procp, (int)entryp->uaiocbp, fd, `0`, `0` );
1016	do_aio_completion(entryp);
1017
1018	/ This will free if the aio_return() has already happened ... /
1019	aio_entry_unref(entryp);
1020	aio_proc_lock(p);
1021
1022	if ( aiocbp != USER_ADDR_NULL ) {
1023	return( result );
1024	}
1025
1026	/*
1027	* Restart from the head of the proc active queue since it
1028	* may have been changed while we were away doing completion
1029	* processing.
1030	*
1031	* Note that if we found an uncancellable AIO before, we will
1032	* either find it again or discover that it's been completed,
1033	* so resetting the result will not cause us to return success
1034	* despite outstanding AIOs.
1035	*/
1036	entryp = TAILQ_FIRST(&p->p_aio_activeq);
1037	result = -`1`; / As if beginning anew /
1038	} else {
1039	/*
1040	* It's been taken off the active queue already, i.e. is in flight.
1041	* All we can do is ask for notification.
1042	*/
1043	result = AIO_NOTCANCELED;
1044
1045	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) \| DBG_FUNC_NONE,
1046	(int)entryp->procp, (int)entryp->uaiocbp, fd, `0`, `0` );
1047
1048	/ Mark for waiting and such; will not take a ref if "cancelled" arg is FALSE /
1049	aio_entry_update_for_cancel(entryp, FALSE, wait_for_completion, disable_notification);
1050
1051	if ( aiocbp != USER_ADDR_NULL ) {
1052	return( result );
1053	}
1054	entryp = next_entryp;
1055	}
1056	} / while... /
1057
1058	/*
1059	* if we didn't find any matches on the todo or active queues then look for a
1060	* match on our queue of async IO requests that have completed and if found
1061	* return AIO_ALLDONE result.
1062	*
1063	* Proc AIO lock is still held.
1064	*/
1065	if ( result == -`1` ) {
1066	TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
1067	ASSERT_AIO_FROM_PROC(entryp, p);
1068	if (should_cancel(entryp, aiocbp, fd)) {
1069	result = AIO_ALLDONE;
1070	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) \| DBG_FUNC_NONE,
1071	(int)entryp->procp, (int)entryp->uaiocbp, fd, `0`, `0` );
1072
1073	if ( aiocbp != USER_ADDR_NULL ) {
1074	return( result );
1075	}
1076	}
1077	}
1078	}
1079
1080	return( result );
1081
1082	}
1083	/ do_aio_cancel_locked /
1084
1085
1086	/*
1087	* aio_suspend - suspend the calling thread until at least one of the async
1088	* IO operations referenced by uap->aiocblist has completed, until a signal
1089	* interrupts the function, or uap->timeoutp time interval (optional) has
1090	* passed.
1091	* Returns 0 if one or more async IOs have completed else -1 and errno is
1092	* set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
1093	* woke us up.
1094	*/
1095	int
1096	aio_suspend(proc_t p, struct aio_suspend_args uap, int* *retval )
1097	{
1098	__pthread_testcancel(`1`);
1099	return(aio_suspend_nocancel(p, (struct aio_suspend_nocancel_args *)uap, retval));
1100	}
1101
1102
1103	int
1104	aio_suspend_nocancel(proc_t p, struct aio_suspend_nocancel_args uap, int* *retval )
1105	{
1106	int error;
1107	int i, count;
1108	uint64_t abstime;
1109	struct user_timespec ts;
1110	aio_workq_entry *entryp;
1111	user_addr_t *aiocbpp;
1112
1113	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) \| DBG_FUNC_START,
1114	(int)p, uap->nent, `0`, `0`, `0` );
1115
1116	*retval = -`1`;
1117	abstime = `0`;
1118	aiocbpp = NULL;
1119
1120	count = aio_get_all_queues_count( );
1121	if ( count < `1` ) {
1122	error = EINVAL;
1123	goto ExitThisRoutine;
1124	}
1125
1126	if ( uap->nent < `1` \|\| uap->nent > aio_max_requests_per_process ) {
1127	error = EINVAL;
1128	goto ExitThisRoutine;
1129	}
1130
1131	if ( uap->timeoutp != USER_ADDR_NULL ) {
1132	if ( proc_is64bit(p) ) {
1133	struct user64_timespec temp;
1134	error = copyin( uap->timeoutp, &temp, sizeof(temp) );
1135	if ( error == `0` ) {
1136	ts.tv_sec = temp.tv_sec;
1137	ts.tv_nsec = temp.tv_nsec;
1138	}
1139	}
1140	else {
1141	struct user32_timespec temp;
1142	error = copyin( uap->timeoutp, &temp, sizeof(temp) );
1143	if ( error == `0` ) {
1144	ts.tv_sec = temp.tv_sec;
1145	ts.tv_nsec = temp.tv_nsec;
1146	}
1147	}
1148	if ( error != `0` ) {
1149	error = EAGAIN;
1150	goto ExitThisRoutine;
1151	}
1152
1153	if ( ts.tv_sec < `0` \|\| ts.tv_nsec < `0` \|\| ts.tv_nsec >= `1000000000` ) {
1154	error = EINVAL;
1155	goto ExitThisRoutine;
1156	}
1157
1158	nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
1159	&abstime );
1160	clock_absolutetime_interval_to_deadline( abstime, &abstime );
1161	}
1162
1163	aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
1164	if ( aiocbpp == NULL ) {
1165	error = EAGAIN;
1166	goto ExitThisRoutine;
1167	}
1168
1169	/ check list of aio requests to see if any have completed /
1170	check_for_our_aiocbp:
1171	aio_proc_lock_spin(p);
1172	for ( i = `0`; i < uap->nent; i++ ) {
1173	user_addr_t aiocbp;
1174
1175	/ NULL elements are legal so check for 'em /
1176	aiocbp = *(aiocbpp + i);
1177	if ( aiocbp == USER_ADDR_NULL )
1178	continue;
1179
1180	/ return immediately if any aio request in the list is done /
1181	TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
1182	ASSERT_AIO_FROM_PROC(entryp, p);
1183	if ( entryp->uaiocbp == aiocbp ) {
1184	aio_proc_unlock(p);
1185	*retval = `0`;
1186	error = `0`;
1187	goto ExitThisRoutine;
1188	}
1189	}
1190	} / for ( ; i < uap->nent; ) /
1191
1192	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) \| DBG_FUNC_NONE,
1193	(int)p, uap->nent, `0`, `0`, `0` );
1194
1195	/*
1196	* wait for an async IO to complete or a signal fires or timeout expires.
1197	* we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
1198	* interrupts us. If an async IO completes before a signal fires or our
1199	* timeout expires, we get a wakeup call from aio_work_thread().
1200	*/
1201
1202	error = msleep1(&p->AIO_SUSPEND_SLEEP_CHAN, aio_proc_mutex(p), PCATCH \| PWAIT \| PDROP, "aio_suspend", abstime); / XXX better priority? /
1203	if ( error == `0` ) {
1204	/*
1205	* got our wakeup call from aio_work_thread().
1206	* Since we can get a wakeup on this channel from another thread in the
1207	* same process we head back up to make sure this is for the correct aiocbp.
1208	* If it is the correct aiocbp we will return from where we do the check
1209	* (see entryp->uaiocbp == aiocbp after check_for_our_aiocbp label)
1210	* else we will fall out and just sleep again.
1211	*/
1212	goto check_for_our_aiocbp;
1213	}
1214	else if ( error == EWOULDBLOCK ) {
1215	/ our timeout expired /
1216	error = EAGAIN;
1217	}
1218	else {
1219	/ we were interrupted /
1220	error = EINTR;
1221	}
1222
1223	ExitThisRoutine:
1224	if ( aiocbpp != NULL )
1225	FREE( aiocbpp, M_TEMP );
1226
1227	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) \| DBG_FUNC_END,
1228	(int)p, uap->nent, error, `0`, `0` );
1229
1230	return( error );
1231
1232	} / aio_suspend /
1233
1234
1235	/ aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the*
1236	* file descriptor (uap->aiocbp->aio_fildes) from the buffer
1237	* (uap->aiocbp->aio_buf).
1238	*/
1239
1240	int
1241	aio_write(proc_t p, struct aio_write_args uap, int* *retval )
1242	{
1243	int error;
1244
1245	*retval = `0`;
1246
1247	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) \| DBG_FUNC_START,
1248	(int)p, (int)uap->aiocbp, `0`, `0`, `0` );
1249
1250	error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
1251	if ( error != `0` )
1252	*retval = -`1`;
1253
1254	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) \| DBG_FUNC_END,
1255	(int)p, (int)uap->aiocbp, error, `0`, `0` );
1256
1257	return( error );
1258
1259	} / aio_write /
1260
1261
1262	static user_addr_t *
1263	aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent)
1264	{
1265	user_addr_t *aiocbpp;
1266	int i, result;
1267
1268	/ we reserve enough space for largest possible pointer size /
1269	MALLOC( aiocbpp, user_addr_t , (nent sizeof(user_addr_t)), M_TEMP, M_WAITOK );
1270	if ( aiocbpp == NULL )
1271	goto err;
1272
1273	/ copyin our aiocb pointers from list /
1274	result = copyin( aiocblist, aiocbpp,
1275	proc_is64bit(procp) ? (nent * sizeof(user64_addr_t))
1276	: (nent * sizeof(user32_addr_t)) );
1277	if ( result) {
1278	FREE( aiocbpp, M_TEMP );
1279	aiocbpp = NULL;
1280	goto err;
1281	}
1282
1283	/*
1284	* We depend on a list of user_addr_t's so we need to
1285	* munge and expand when these pointers came from a
1286	* 32-bit process
1287	*/
1288	if ( !proc_is64bit(procp) ) {
1289	/ copy from last to first to deal with overlap /
1290	user32_addr_t my_ptrp = ((user32_addr_t )aiocbpp) + (nent - `1`);
1291	user_addr_t *my_addrp = aiocbpp + (nent - `1`);
1292
1293	for (i = `0`; i < nent; i++, my_ptrp--, my_addrp--) {
1294	my_addrp = (user_addr_t) (my_ptrp);
1295	}
1296	}
1297
1298	err:
1299	return (aiocbpp);
1300	}
1301
1302
1303	static int
1304	aio_copy_in_sigev(proc_t procp, user_addr_t sigp, struct user_sigevent *sigev)
1305	{
1306	int result = `0`;
1307
1308	if (sigp == USER_ADDR_NULL)
1309	goto out;
1310
1311	/*
1312	* We need to munge aio_sigevent since it contains pointers.
1313	* Since we do not know if sigev_value is an int or a ptr we do
1314	* NOT cast the ptr to a user_addr_t. This means if we send
1315	* this info back to user space we need to remember sigev_value
1316	* was not expanded for the 32-bit case.
1317	*
1318	* Notes: This does NOT affect us since we don't support
1319	* sigev_value yet in the aio context.
1320	*/
1321	if ( proc_is64bit(procp) ) {
1322	struct user64_sigevent sigevent64;
1323
1324	result = copyin( sigp, &sigevent64, sizeof(sigevent64) );
1325	if ( result == `0` ) {
1326	sigev->sigev_notify = sigevent64.sigev_notify;
1327	sigev->sigev_signo = sigevent64.sigev_signo;
1328	sigev->sigev_value.size_equivalent.sival_int = sigevent64.sigev_value.size_equivalent.sival_int;
1329	sigev->sigev_notify_function = sigevent64.sigev_notify_function;
1330	sigev->sigev_notify_attributes = sigevent64.sigev_notify_attributes;
1331	}
1332
1333	} else {
1334	struct user32_sigevent sigevent32;
1335
1336	result = copyin( sigp, &sigevent32, sizeof(sigevent32) );
1337	if ( result == `0` ) {
1338	sigev->sigev_notify = sigevent32.sigev_notify;
1339	sigev->sigev_signo = sigevent32.sigev_signo;
1340	sigev->sigev_value.size_equivalent.sival_int = sigevent32.sigev_value.sival_int;
1341	sigev->sigev_notify_function = CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
1342	sigev->sigev_notify_attributes = CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
1343	}
1344	}
1345
1346	if ( result != `0` ) {
1347	result = EAGAIN;
1348	}
1349
1350	out:
1351	return (result);
1352	}
1353
1354	/*
1355	* aio_enqueue_work
1356	*
1357	* Queue up the entry on the aio asynchronous work queue in priority order
1358	* based on the relative priority of the request. We calculate the relative
1359	* priority using the nice value of the caller and the value
1360	*
1361	* Parameters: procp Process queueing the I/O
1362	* entryp The work queue entry being queued
1363	*
1364	* Returns: (void) No failure modes
1365	*
1366	* Notes: This function is used for both lio_listio and aio
1367	*
1368	* XXX: At some point, we may have to consider thread priority
1369	* rather than process priority, but we don't maintain the
1370	* adjusted priority for threads the POSIX way.
1371	*
1372	*
1373	* Called with proc locked.
1374	*/
1375	static void
1376	aio_enqueue_work( proc_t procp, aio_workq_entry entryp, int* proc_locked)
1377	{
1378	#if 0
1379	aio_workq_entry my_entryp; /* used for insertion sort /
1380	#endif /* 0 */
1381	aio_workq_t queue = aio_entry_workq(entryp);
1382
1383	if (proc_locked == `0`) {
1384	aio_proc_lock(procp);
1385	}
1386
1387	ASSERT_AIO_PROC_LOCK_OWNED(procp);
1388
1389	/ Onto proc queue /
1390	TAILQ_INSERT_TAIL(&procp->p_aio_activeq, entryp, aio_proc_link);
1391	procp->p_aio_active_count++;
1392	procp->p_aio_total_count++;
1393
1394	/ And work queue /
1395	aio_workq_lock_spin(queue);
1396	aio_workq_add_entry_locked(queue, entryp);
1397	waitq_wakeup64_one(&queue->aioq_waitq, CAST_EVENT64_T(queue),
1398	THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
1399	aio_workq_unlock(queue);
1400
1401	if (proc_locked == `0`) {
1402	aio_proc_unlock(procp);
1403	}
1404
1405	#if 0
1406	/*
1407	* Procedure:
1408	*
1409	* (1) The nice value is in the range PRIO_MIN..PRIO_MAX [-20..20]
1410	* (2) The normalized nice value is in the range 0..((2 * NZERO) - 1)
1411	* which is [0..39], with 0 not being used. In nice values, the
1412	* lower the nice value, the higher the priority.
1413	* (3) The normalized scheduling prioritiy is the highest nice value
1414	* minus the current nice value. In I/O scheduling priority, the
1415	* higher the value the lower the priority, so it is the inverse
1416	* of the nice value (the higher the number, the higher the I/O
1417	* priority).
1418	* (4) From the normalized scheduling priority, we subtract the
1419	* request priority to get the request priority value number;
1420	* this means that requests are only capable of depressing their
1421	* priority relative to other requests,
1422	*/
1423	entryp->priority = (((`2` * NZERO) - `1`) - procp->p_nice);
1424
1425	/ only premit depressing the priority /
1426	if (entryp->aiocb.aio_reqprio < `0`)
1427	entryp->aiocb.aio_reqprio = `0`;
1428	if (entryp->aiocb.aio_reqprio > `0`) {
1429	entryp->priority -= entryp->aiocb.aio_reqprio;
1430	if (entryp->priority < `0`)
1431	entryp->priority = `0`;
1432	}
1433
1434	/ Insertion sort the entry; lowest ->priority to highest /
1435	TAILQ_FOREACH(my_entryp, &aio_anchor.aio_async_workq, aio_workq_link) {
1436	if ( entryp->priority <= my_entryp->priority) {
1437	TAILQ_INSERT_BEFORE(my_entryp, entryp, aio_workq_link);
1438	break;
1439	}
1440	}
1441	if (my_entryp == NULL)
1442	TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1443	#endif /* 0 */
1444	}
1445
1446
1447	/*
1448	* lio_listio - initiate a list of IO requests. We process the list of
1449	* aiocbs either synchronously (mode == LIO_WAIT) or asynchronously
1450	* (mode == LIO_NOWAIT).
1451	*
1452	* The caller gets error and return status for each aiocb in the list
1453	* via aio_error and aio_return. We must keep completed requests until
1454	* released by the aio_return call.
1455	*/
1456	int
1457	lio_listio(proc_t p, struct lio_listio_args uap, int* *retval )
1458	{
1459	int i;
1460	int call_result;
1461	int result;
1462	int old_count;
1463	aio_workq_entry **entryp_listp;
1464	user_addr_t *aiocbpp;
1465	struct user_sigevent aiosigev;
1466	aio_lio_context *lio_context;
1467	boolean_t free_context = FALSE;
1468	uint32_t *paio_offset;
1469	uint32_t *paio_nbytes;
1470
1471	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) \| DBG_FUNC_START,
1472	(int)p, uap->nent, uap->mode, `0`, `0` );
1473
1474	entryp_listp = NULL;
1475	lio_context = NULL;
1476	aiocbpp = NULL;
1477	call_result = -`1`;
1478	*retval = -`1`;
1479	if ( !(uap->mode == LIO_NOWAIT \|\| uap->mode == LIO_WAIT) ) {
1480	call_result = EINVAL;
1481	goto ExitRoutine;
1482	}
1483
1484	if ( uap->nent < `1` \|\| uap->nent > AIO_LISTIO_MAX ) {
1485	call_result = EINVAL;
1486	goto ExitRoutine;
1487	}
1488
1489	/*
1490	* allocate a list of aio_workq_entry pointers that we will use
1491	* to queue up all our requests at once while holding our lock.
1492	*/
1493	MALLOC( entryp_listp, void , (uap->nent sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK );
1494	if ( entryp_listp == NULL ) {
1495	call_result = EAGAIN;
1496	goto ExitRoutine;
1497	}
1498
1499	MALLOC( lio_context, aio_lio_context, sizeof*(aio_lio_context), M_TEMP, M_WAITOK );
1500	if ( lio_context == NULL ) {
1501	call_result = EAGAIN;
1502	goto ExitRoutine;
1503	}
1504
1505	#if DEBUG
1506	OSIncrementAtomic(&lio_contexts_alloced);
1507	#endif /* DEBUG */
1508
1509	free_context = TRUE;
1510	bzero(lio_context, sizeof(aio_lio_context));
1511
1512	aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
1513	if ( aiocbpp == NULL ) {
1514	call_result = EAGAIN;
1515	goto ExitRoutine;
1516	}
1517
1518	/*
1519	* Use sigevent passed in to lio_listio for each of our calls, but
1520	* only do completion notification after the last request completes.
1521	*/
1522	bzero(&aiosigev, sizeof(aiosigev));
1523	/ Only copy in an sigev if the user supplied one /
1524	if (uap->sigp != USER_ADDR_NULL) {
1525	call_result = aio_copy_in_sigev(p, uap->sigp, &aiosigev);
1526	if ( call_result)
1527	goto ExitRoutine;
1528	}
1529
1530	/ process list of aio requests /
1531	free_context = FALSE;
1532	lio_context->io_issued = uap->nent;
1533	lio_context->io_waiter = uap->mode == LIO_WAIT ? `1` : `0`; / Should it be freed by last AIO /
1534	for ( i = `0`; i < uap->nent; i++ ) {
1535	user_addr_t my_aiocbp;
1536	aio_workq_entry *entryp;
1537
1538	*(entryp_listp + i) = NULL;
1539	my_aiocbp = *(aiocbpp + i);
1540
1541	/ NULL elements are legal so check for 'em /
1542	if ( my_aiocbp == USER_ADDR_NULL ) {
1543	aio_proc_lock_spin(p);
1544	lio_context->io_issued--;
1545	aio_proc_unlock(p);
1546	continue;
1547	}
1548
1549	/*
1550	* We use lio_context to mark IO requests for delayed completion
1551	* processing which means we wait until all IO requests in the
1552	* group have completed before we either return to the caller
1553	* when mode is LIO_WAIT or signal user when mode is LIO_NOWAIT.
1554	*
1555	* We use the address of the lio_context for this, since it is
1556	* unique in the address space.
1557	*/
1558	result = lio_create_entry( p, my_aiocbp, lio_context, (entryp_listp + i) );
1559	if ( result != `0` && call_result == -`1` )
1560	call_result = result;
1561
1562	/ NULL elements are legal so check for 'em /
1563	entryp = *(entryp_listp + i);
1564	if ( entryp == NULL ) {
1565	aio_proc_lock_spin(p);
1566	lio_context->io_issued--;
1567	aio_proc_unlock(p);
1568	continue;
1569	}
1570
1571	if ( uap->mode == LIO_NOWAIT ) {
1572	/ Set signal hander, if any /
1573	entryp->aiocb.aio_sigevent = aiosigev;
1574	} else {
1575	/ flag that this thread blocks pending completion /
1576	entryp->flags \|= AIO_LIO_NOTIFY;
1577	}
1578
1579	/ check our aio limits to throttle bad or rude user land behavior /
1580	old_count = aio_increment_total_count();
1581
1582	aio_proc_lock_spin(p);
1583	if ( old_count >= aio_max_requests \|\|
1584	aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process \|\|
1585	is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1586
1587	lio_context->io_issued--;
1588	aio_proc_unlock(p);
1589
1590	aio_decrement_total_count();
1591
1592	if ( call_result == -`1` )
1593	call_result = EAGAIN;
1594	aio_free_request(entryp);
1595	entryp_listp[i] = NULL;
1596	continue;
1597	}
1598
1599	lck_mtx_convert_spin(aio_proc_mutex(p));
1600	aio_enqueue_work(p, entryp, `1`);
1601	aio_proc_unlock(p);
1602
1603	KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) \| DBG_FUNC_START,
1604	(int)p, (int)entryp->uaiocbp, entryp->flags, entryp->aiocb.aio_fildes, `0` );
1605	paio_offset = (uint32_t*) &entryp->aiocb.aio_offset;
1606	paio_nbytes = (uint32_t*) &entryp->aiocb.aio_nbytes;
1607	KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) \| DBG_FUNC_END,
1608	paio_offset[`0`], (sizeof(entryp->aiocb.aio_offset) == sizeof(uint64_t) ? paio_offset[`1`] : `0`),
1609	paio_nbytes[`0`], (sizeof(entryp->aiocb.aio_nbytes) == sizeof(uint64_t) ? paio_nbytes[`1`] : `0`),
1610	`0` );
1611	}
1612
1613	switch(uap->mode) {
1614	case LIO_WAIT:
1615	aio_proc_lock_spin(p);
1616	while (lio_context->io_completed < lio_context->io_issued) {
1617	result = msleep(lio_context, aio_proc_mutex(p), PCATCH \| PRIBIO \| PSPIN, "lio_listio", `0`);
1618
1619	/ If we were interrupted, fail out (even if all finished) /
1620	if (result != `0`) {
1621	call_result = EINTR;
1622	lio_context->io_waiter = `0`;
1623	break;
1624	}
1625	}
1626
1627	/ If all IOs have finished must free it /
1628	if (lio_context->io_completed == lio_context->io_issued) {
1629	free_context = TRUE;
1630	}
1631
1632	aio_proc_unlock(p);
1633	break;
1634
1635	case LIO_NOWAIT:
1636	break;
1637	}
1638
1639	/ call_result == -1 means we had no trouble queueing up requests /
1640	if ( call_result == -`1` ) {
1641	call_result = `0`;
1642	*retval = `0`;
1643	}
1644
1645	ExitRoutine:
1646	if ( entryp_listp != NULL )
1647	FREE( entryp_listp, M_TEMP );
1648	if ( aiocbpp != NULL )
1649	FREE( aiocbpp, M_TEMP );
1650	if (free_context) {
1651	free_lio_context(lio_context);
1652	}
1653
1654	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) \| DBG_FUNC_END,
1655	(int)p, call_result, `0`, `0`, `0` );
1656
1657	return( call_result );
1658
1659	} / lio_listio /
1660
1661
1662	/*
1663	* aio worker thread. this is where all the real work gets done.
1664	* we get a wake up call on sleep channel &aio_anchor.aio_async_workq
1665	* after new work is queued up.
1666	*/
1667	__attribute__((noreturn))
1668	static void
1669	aio_work_thread(void)
1670	{
1671	aio_workq_entry *entryp;
1672	int error;
1673	vm_map_t currentmap;
1674	vm_map_t oldmap = VM_MAP_NULL;
1675	task_t oldaiotask = TASK_NULL;
1676	struct uthread *uthreadp = NULL;
1677
1678	for( ;; ) {
1679	/*
1680	* returns with the entry ref'ed.
1681	* sleeps until work is available.
1682	*/
1683	entryp = aio_get_some_work();
1684
1685	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) \| DBG_FUNC_START,
1686	(int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, `0`, `0` );
1687
1688	/*
1689	* Assume the target's address space identity for the duration
1690	* of the IO. Note: don't need to have the entryp locked,
1691	* because the proc and map don't change until it's freed.
1692	*/
1693	currentmap = get_task_map( (current_proc())->task );
1694	if ( currentmap != entryp->aio_map ) {
1695	uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
1696	oldaiotask = uthreadp->uu_aio_task;
1697	uthreadp->uu_aio_task = entryp->procp->task;
1698	oldmap = vm_map_switch( entryp->aio_map );
1699	}
1700
1701	if ( (entryp->flags & AIO_READ) != `0` ) {
1702	error = do_aio_read( entryp );
1703	}
1704	else if ( (entryp->flags & AIO_WRITE) != `0` ) {
1705	error = do_aio_write( entryp );
1706	}
1707	else if ( (entryp->flags & (AIO_FSYNC \| AIO_DSYNC)) != `0` ) {
1708	error = do_aio_fsync( entryp );
1709	}
1710	else {
1711	printf( "%s - unknown aio request - flags 0x%02X \n",
1712	__FUNCTION__, entryp->flags );
1713	error = EINVAL;
1714	}
1715
1716	/ Restore old map /
1717	if ( currentmap != entryp->aio_map ) {
1718	(void) vm_map_switch( oldmap );
1719	uthreadp->uu_aio_task = oldaiotask;
1720	}
1721
1722	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) \| DBG_FUNC_END,
1723	(int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
1724	entryp->returnval, `0` );
1725
1726
1727	/ XXX COUNTS /
1728	aio_entry_lock_spin(entryp);
1729	entryp->errorval = error;
1730	aio_entry_unlock(entryp);
1731
1732	/ we're done with the IO request so pop it off the active queue and /
1733	/ push it on the done queue /
1734	aio_proc_lock(entryp->procp);
1735	aio_proc_move_done_locked(entryp->procp, entryp);
1736	aio_proc_unlock(entryp->procp);
1737
1738	OSDecrementAtomic(&aio_anchor.aio_inflight_count);
1739
1740	/ remove our reference to the user land map. /
1741	if ( VM_MAP_NULL != entryp->aio_map ) {
1742	vm_map_t my_map;
1743
1744	my_map = entryp->aio_map;
1745	entryp->aio_map = VM_MAP_NULL;
1746	vm_map_deallocate( my_map );
1747	}
1748
1749	/ Provide notifications /
1750	do_aio_completion( entryp );
1751
1752	/ Will free if needed /
1753	aio_entry_unref(entryp);
1754
1755	} / for ( ;; ) /
1756
1757	/ NOT REACHED /
1758
1759	} / aio_work_thread /
1760
1761
1762	/*
1763	* aio_get_some_work - get the next async IO request that is ready to be executed.
1764	* aio_fsync complicates matters a bit since we cannot do the fsync until all async
1765	* IO requests at the time the aio_fsync call came in have completed.
1766	* NOTE - AIO_LOCK must be held by caller
1767	*/
1768	static aio_workq_entry *
1769	aio_get_some_work( void )
1770	{
1771	aio_workq_entry *entryp = NULL;
1772	aio_workq_t queue = NULL;
1773
1774	/ Just one queue for the moment. In the future there will be many. /
1775	queue = &aio_anchor.aio_async_workqs[`0`];
1776	aio_workq_lock_spin(queue);
1777	if (queue->aioq_count == `0`) {
1778	goto nowork;
1779	}
1780
1781	/*
1782	* Hold the queue lock.
1783	*
1784	* pop some work off the work queue and add to our active queue
1785	* Always start with the queue lock held.
1786	*/
1787	for(;;) {
1788	/*
1789	* Pull of of work queue. Once it's off, it can't be cancelled,
1790	* so we can take our ref once we drop the queue lock.
1791	*/
1792	entryp = TAILQ_FIRST(&queue->aioq_entries);
1793
1794	/*
1795	* If there's no work or only fsyncs that need delay, go to sleep
1796	* and then start anew from aio_work_thread
1797	*/
1798	if (entryp == NULL) {
1799	goto nowork;
1800	}
1801
1802	aio_workq_remove_entry_locked(queue, entryp);
1803
1804	aio_workq_unlock(queue);
1805
1806	/*
1807	* Check if it's an fsync that must be delayed. No need to lock the entry;
1808	* that flag would have been set at initialization.
1809	*/
1810	if ( (entryp->flags & AIO_FSYNC) != `0` ) {
1811	/*
1812	* Check for unfinished operations on the same file
1813	* in this proc's queue.
1814	*/
1815	aio_proc_lock_spin(entryp->procp);
1816	if ( aio_delay_fsync_request( entryp ) ) {
1817	/ It needs to be delayed. Put it back on the end of the work queue /
1818	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) \| DBG_FUNC_NONE,
1819	(int)entryp->procp, (int)entryp->uaiocbp, `0`, `0`, `0` );
1820
1821	aio_proc_unlock(entryp->procp);
1822
1823	aio_workq_lock_spin(queue);
1824	aio_workq_add_entry_locked(queue, entryp);
1825	continue;
1826	}
1827	aio_proc_unlock(entryp->procp);
1828	}
1829
1830	break;
1831	}
1832
1833	aio_entry_ref(entryp);
1834
1835	OSIncrementAtomic(&aio_anchor.aio_inflight_count);
1836	return( entryp );
1837
1838	nowork:
1839	/ We will wake up when someone enqueues something /
1840	waitq_assert_wait64(&queue->aioq_waitq, CAST_EVENT64_T(queue), THREAD_UNINT, `0`);
1841	aio_workq_unlock(queue);
1842	thread_block( (thread_continue_t)aio_work_thread );
1843
1844	// notreached
1845	return NULL;
1846	}
1847
1848	/*
1849	* aio_delay_fsync_request - look to see if this aio_fsync request should be delayed.
1850	* A big, simple hammer: only send it off if it's the most recently filed IO which has
1851	* not been completed.
1852	*/
1853	static boolean_t
1854	aio_delay_fsync_request( aio_workq_entry *entryp )
1855	{
1856	if (entryp == TAILQ_FIRST(&entryp->procp->p_aio_activeq)) {
1857	return FALSE;
1858	}
1859
1860	return TRUE;
1861	} / aio_delay_fsync_request /
1862
1863	static aio_workq_entry *
1864	aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, void group_tag, int* kindOfIO)
1865	{
1866	aio_workq_entry *entryp;
1867	int result = `0`;
1868
1869	entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1870	if ( entryp == NULL ) {
1871	result = EAGAIN;
1872	goto error_exit;
1873	}
1874
1875	bzero( entryp, sizeof(*entryp) );
1876
1877	/ fill in the rest of the aio_workq_entry /
1878	entryp->procp = procp;
1879	entryp->uaiocbp = aiocbp;
1880	entryp->flags \|= kindOfIO;
1881	entryp->group_tag = group_tag;
1882	entryp->aio_map = VM_MAP_NULL;
1883	entryp->aio_refcount = `0`;
1884
1885	if ( proc_is64bit(procp) ) {
1886	struct user64_aiocb aiocb64;
1887
1888	result = copyin( aiocbp, &aiocb64, sizeof(aiocb64) );
1889	if (result == `0` )
1890	do_munge_aiocb_user64_to_user(&aiocb64, &entryp->aiocb);
1891
1892	} else {
1893	struct user32_aiocb aiocb32;
1894
1895	result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1896	if ( result == `0` )
1897	do_munge_aiocb_user32_to_user( &aiocb32, &entryp->aiocb );
1898	}
1899
1900	if ( result != `0` ) {
1901	result = EAGAIN;
1902	goto error_exit;
1903	}
1904
1905	/ get a reference to the user land map in order to keep it around /
1906	entryp->aio_map = get_task_map( procp->task );
1907	vm_map_reference( entryp->aio_map );
1908
1909	/ do some more validation on the aiocb and embedded file descriptor /
1910	result = aio_validate( entryp );
1911	if ( result != `0` )
1912	goto error_exit_with_ref;
1913
1914	/ get a reference on the current_thread, which is passed in vfs_context. /
1915	entryp->thread = current_thread();
1916	thread_reference( entryp->thread );
1917	return ( entryp );
1918
1919	error_exit_with_ref:
1920	if ( VM_MAP_NULL != entryp->aio_map ) {
1921	vm_map_deallocate( entryp->aio_map );
1922	}
1923	error_exit:
1924	if ( result && entryp != NULL ) {
1925	zfree( aio_workq_zonep, entryp );
1926	entryp = NULL;
1927	}
1928
1929	return ( entryp );
1930	}
1931
1932
1933	/*
1934	* aio_queue_async_request - queue up an async IO request on our work queue then
1935	* wake up one of our worker threads to do the actual work. We get a reference
1936	* to our caller's user land map in order to keep it around while we are
1937	* processing the request.
1938	*/
1939	static int
1940	aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO )
1941	{
1942	aio_workq_entry *entryp;
1943	int result;
1944	int old_count;
1945	uint32_t *paio_offset;
1946	uint32_t *paio_nbytes;
1947
1948	old_count = aio_increment_total_count();
1949	if (old_count >= aio_max_requests) {
1950	result = EAGAIN;
1951	goto error_noalloc;
1952	}
1953
1954	entryp = aio_create_queue_entry( procp, aiocbp, `0`, kindOfIO);
1955	if ( entryp == NULL ) {
1956	result = EAGAIN;
1957	goto error_noalloc;
1958	}
1959
1960
1961	aio_proc_lock_spin(procp);
1962
1963	if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1964	result = EAGAIN;
1965	goto error_exit;
1966	}
1967
1968	/ check our aio limits to throttle bad or rude user land behavior /
1969	if (aio_get_process_count( procp ) >= aio_max_requests_per_process) {
1970	printf("aio_queue_async_request(): too many in flight for proc: %d.\n", procp->p_aio_total_count);
1971	result = EAGAIN;
1972	goto error_exit;
1973	}
1974
1975	/ Add the IO to proc and work queues, wake up threads as appropriate /
1976	lck_mtx_convert_spin(aio_proc_mutex(procp));
1977	aio_enqueue_work(procp, entryp, `1`);
1978
1979	aio_proc_unlock(procp);
1980
1981	paio_offset = (uint32_t*) &entryp->aiocb.aio_offset;
1982	paio_nbytes = (uint32_t*) &entryp->aiocb.aio_nbytes;
1983	KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) \| DBG_FUNC_START,
1984	(int)procp, (int)aiocbp, entryp->flags, entryp->aiocb.aio_fildes, `0` );
1985	KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) \| DBG_FUNC_END,
1986	paio_offset[`0`], (sizeof(entryp->aiocb.aio_offset) == sizeof(uint64_t) ? paio_offset[`1`] : `0`),
1987	paio_nbytes[`0`], (sizeof(entryp->aiocb.aio_nbytes) == sizeof(uint64_t) ? paio_nbytes[`1`] : `0`),
1988	`0` );
1989
1990	return( `0` );
1991
1992	error_exit:
1993	/*
1994	* This entry has not been queued up so no worries about
1995	* unlocked state and aio_map
1996	*/
1997	aio_proc_unlock(procp);
1998	aio_free_request(entryp);
1999
2000	error_noalloc:
2001	aio_decrement_total_count();
2002
2003	return( result );
2004
2005	} / aio_queue_async_request /
2006
2007
2008	/*
2009	* lio_create_entry
2010	*
2011	* Allocate an aio_workq_entry and fill it in. If all goes well return 0
2012	* and pass the aio_workq_entry pointer back to our caller.
2013	*
2014	* Parameters: procp The process makign the request
2015	* aiocbp The aio context buffer pointer
2016	* group_tag The group tag used to indicate a
2017	* group of operations has completed
2018	* entrypp Pointer to the pointer to receive the
2019	* address of the created aio_workq_entry
2020	*
2021	* Returns: 0 Successfully created
2022	* EAGAIN Try again (usually resource shortage)
2023	*
2024	*
2025	* Notes: We get a reference to our caller's user land map in order
2026	* to keep it around while we are processing the request.
2027	*
2028	* lio_listio calls behave differently at completion they do
2029	* completion notification when all async IO requests have
2030	* completed. We use group_tag to tag IO requests that behave
2031	* in the delay notification manner.
2032	*
2033	* All synchronous operations are considered to not have a
2034	* signal routine associated with them (sigp == USER_ADDR_NULL).
2035	*/
2036	static int
2037	lio_create_entry(proc_t procp, user_addr_t aiocbp, void *group_tag,
2038	aio_workq_entry **entrypp )
2039	{
2040	aio_workq_entry *entryp;
2041	int result;
2042
2043	entryp = aio_create_queue_entry( procp, aiocbp, group_tag, AIO_LIO);
2044	if ( entryp == NULL ) {
2045	result = EAGAIN;
2046	goto error_exit;
2047	}
2048
2049	/*
2050	* Look for lio_listio LIO_NOP requests and ignore them; this is
2051	* not really an error, but we need to free our aio_workq_entry.
2052	*/
2053	if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
2054	result = `0`;
2055	goto error_exit;
2056	}
2057
2058	*entrypp = entryp;
2059	return( `0` );
2060
2061	error_exit:
2062
2063	if ( entryp != NULL ) {
2064	/*
2065	* This entry has not been queued up so no worries about
2066	* unlocked state and aio_map
2067	*/
2068	aio_free_request(entryp);
2069	}
2070
2071	return( result );
2072
2073	} / lio_create_entry /
2074
2075
2076	/*
2077	* aio_free_request - remove our reference on the user land map and
2078	* free the work queue entry resources. The entry is off all lists
2079	* and has zero refcount, so no one can have a pointer to it.
2080	*/
2081
2082	static int
2083	aio_free_request(aio_workq_entry *entryp)
2084	{
2085	/ remove our reference to the user land map. /
2086	if ( VM_MAP_NULL != entryp->aio_map) {
2087	vm_map_deallocate(entryp->aio_map);
2088	}
2089
2090	/ remove our reference to thread which enqueued the request /
2091	if ( NULL != entryp->thread ) {
2092	thread_deallocate( entryp->thread );
2093	}
2094
2095	entryp->aio_refcount = -`1`; / A bit of poisoning in case of bad refcounting. /
2096
2097	zfree( aio_workq_zonep, entryp );
2098
2099	return( `0` );
2100
2101	} / aio_free_request /
2102
2103
2104	/*
2105	* aio_validate
2106	*
2107	* validate the aiocb passed in by one of the aio syscalls.
2108	*/
2109	static int
2110	aio_validate( aio_workq_entry *entryp )
2111	{
2112	struct fileproc *fp;
2113	int flag;
2114	int result;
2115
2116	result = `0`;
2117
2118	if ( (entryp->flags & AIO_LIO) != `0` ) {
2119	if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
2120	entryp->flags \|= AIO_READ;
2121	else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
2122	entryp->flags \|= AIO_WRITE;
2123	else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
2124	return( `0` );
2125	else
2126	return( EINVAL );
2127	}
2128
2129	flag = FREAD;
2130	if ( (entryp->flags & (AIO_WRITE \| AIO_FSYNC \| AIO_DSYNC)) != `0` ) {
2131	flag = FWRITE;
2132	}
2133
2134	if ( (entryp->flags & (AIO_READ \| AIO_WRITE)) != `0` ) {
2135	if ( entryp->aiocb.aio_nbytes > INT_MAX \|\|
2136	entryp->aiocb.aio_buf == USER_ADDR_NULL \|\|
2137	entryp->aiocb.aio_offset < `0` )
2138	return( EINVAL );
2139	}
2140
2141	/*
2142	* validate aiocb.aio_sigevent. at this point we only support
2143	* sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE. this means
2144	* sigev_value, sigev_notify_function, and sigev_notify_attributes
2145	* are ignored, since SIGEV_THREAD is unsupported. This is consistent
2146	* with no [RTS] (RalTime Signal) option group support.
2147	*/
2148	switch ( entryp->aiocb.aio_sigevent.sigev_notify ) {
2149	case SIGEV_SIGNAL:
2150	{
2151	int signum;
2152
2153	/ make sure we have a valid signal number /
2154	signum = entryp->aiocb.aio_sigevent.sigev_signo;
2155	if ( signum <= `0` \|\| signum >= NSIG \|\|
2156	signum == SIGKILL \|\| signum == SIGSTOP )
2157	return (EINVAL);
2158	}
2159	break;
2160
2161	case SIGEV_NONE:
2162	break;
2163
2164	case SIGEV_THREAD:
2165	/ Unsupported [RTS] /
2166
2167	default:
2168	return (EINVAL);
2169	}
2170
2171	/ validate the file descriptor and that the file was opened*
2172	* for the appropriate read / write access.
2173	*/
2174	proc_fdlock(entryp->procp);
2175
2176	result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp , `1`);
2177	if ( result == `0` ) {
2178	if ( (fp->f_fglob->fg_flag & flag) == `0` ) {
2179	/ we don't have read or write access /
2180	result = EBADF;
2181	}
2182	else if ( FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_VNODE ) {
2183	/ this is not a file /
2184	result = ESPIPE;
2185	} else
2186	fp->f_flags \|= FP_AIOISSUED;
2187
2188	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp , `1`);
2189	}
2190	else {
2191	result = EBADF;
2192	}
2193
2194	proc_fdunlock(entryp->procp);
2195
2196	return( result );
2197
2198	} / aio_validate /
2199
2200	static int
2201	aio_increment_total_count()
2202	{
2203	return OSIncrementAtomic(&aio_anchor.aio_total_count);
2204	}
2205
2206	static int
2207	aio_decrement_total_count()
2208	{
2209	int old = OSDecrementAtomic(&aio_anchor.aio_total_count);
2210	if (old <= `0`) {
2211	panic("Negative total AIO count!\n");
2212	}
2213
2214	return old;
2215	}
2216
2217	static int
2218	aio_get_process_count(proc_t procp )
2219	{
2220	return procp->p_aio_total_count;
2221
2222	} / aio_get_process_count /
2223
2224	static int
2225	aio_get_all_queues_count( void )
2226	{
2227	return aio_anchor.aio_total_count;
2228
2229	} / aio_get_all_queues_count /
2230
2231
2232	/*
2233	* do_aio_completion. Handle async IO completion.
2234	*/
2235	static void
2236	do_aio_completion( aio_workq_entry *entryp )
2237	{
2238
2239	boolean_t lastLioCompleted = FALSE;
2240	aio_lio_context *lio_context = NULL;
2241	int waiter = `0`;
2242
2243	lio_context = (aio_lio_context *)entryp->group_tag;
2244
2245	if (lio_context != NULL) {
2246
2247	aio_proc_lock_spin(entryp->procp);
2248
2249	/ Account for this I/O completing. /
2250	lio_context->io_completed++;
2251
2252	/ Are we done with this lio context? /
2253	if (lio_context->io_issued == lio_context->io_completed) {
2254	lastLioCompleted = TRUE;
2255	}
2256
2257	waiter = lio_context->io_waiter;
2258
2259	/ explicit wakeup of lio_listio() waiting in LIO_WAIT /
2260	if ((entryp->flags & AIO_LIO_NOTIFY) && (lastLioCompleted) && (waiter != `0`)) {
2261	/ wake up the waiter /
2262	wakeup(lio_context);
2263	}
2264
2265	aio_proc_unlock(entryp->procp);
2266	}
2267
2268	if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
2269	(entryp->flags & AIO_DISABLE) == `0` ) {
2270
2271	boolean_t performSignal = FALSE;
2272	if (lio_context == NULL) {
2273	performSignal = TRUE;
2274	}
2275	else {
2276	/*
2277	* If this was the last request in the group and a signal
2278	* is desired, send one.
2279	*/
2280	performSignal = lastLioCompleted;
2281	}
2282
2283	if (performSignal) {
2284
2285	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) \| DBG_FUNC_NONE,
2286	(int)entryp->procp, (int)entryp->uaiocbp,
2287	entryp->aiocb.aio_sigevent.sigev_signo, `0`, `0` );
2288
2289	psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
2290	}
2291	}
2292
2293	if ((entryp->flags & AIO_EXIT_WAIT) && (entryp->flags & AIO_CLOSE_WAIT)) {
2294	panic("Close and exit flags set at the same time\n");
2295	}
2296
2297	/*
2298	* need to handle case where a process is trying to exit, exec, or
2299	* close and is currently waiting for active aio requests to complete.
2300	* If AIO_CLEANUP_WAIT is set then we need to look to see if there are any
2301	* other requests in the active queue for this process. If there are
2302	* none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.
2303	* If there are some still active then do nothing - we only want to
2304	* wakeup when all active aio requests for the process are complete.
2305	*
2306	* Don't need to lock the entry or proc to check the cleanup flag. It can only be
2307	* set for cancellation, while the entryp is still on a proc list; now it's
2308	* off, so that flag is already set if it's going to be.
2309	*/
2310	if ( (entryp->flags & AIO_EXIT_WAIT) != `0` ) {
2311	int active_requests;
2312
2313	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) \| DBG_FUNC_NONE,
2314	(int)entryp->procp, (int)entryp->uaiocbp, `0`, `0`, `0` );
2315
2316	aio_proc_lock_spin(entryp->procp);
2317	active_requests = aio_active_requests_for_process( entryp->procp );
2318	if ( active_requests < `1` ) {
2319	/*
2320	* no active aio requests for this process, continue exiting. In this
2321	* case, there should be no one else waiting ont he proc in AIO...
2322	*/
2323	wakeup_one((caddr_t)&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
2324	aio_proc_unlock(entryp->procp);
2325
2326	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) \| DBG_FUNC_NONE,
2327	(int)entryp->procp, (int)entryp->uaiocbp, `0`, `0`, `0` );
2328	} else {
2329	aio_proc_unlock(entryp->procp);
2330	}
2331	}
2332
2333	if ( (entryp->flags & AIO_CLOSE_WAIT) != `0` ) {
2334	int active_requests;
2335
2336	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) \| DBG_FUNC_NONE,
2337	(int)entryp->procp, (int)entryp->uaiocbp, `0`, `0`, `0` );
2338
2339	aio_proc_lock_spin(entryp->procp);
2340	active_requests = aio_proc_active_requests_for_file( entryp->procp, entryp->aiocb.aio_fildes);
2341	if ( active_requests < `1` ) {
2342	/ Can't wakeup_one(); multiple closes might be in progress. /
2343	wakeup(&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
2344	aio_proc_unlock(entryp->procp);
2345
2346	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) \| DBG_FUNC_NONE,
2347	(int)entryp->procp, (int)entryp->uaiocbp, `0`, `0`, `0` );
2348	} else {
2349	aio_proc_unlock(entryp->procp);
2350	}
2351	}
2352	/*
2353	* A thread in aio_suspend() wants to known about completed IOs. If it checked
2354	* the done list before we moved our AIO there, then it already asserted its wait,
2355	* and we can wake it up without holding the lock. If it checked the list after
2356	* we did our move, then it already has seen the AIO that we moved. Herego, we
2357	* can do our wakeup without holding the lock.
2358	*/
2359	wakeup( (caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
2360	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) \| DBG_FUNC_NONE,
2361	(int)entryp->procp, (int)entryp->uaiocbp, `0`, `0`, `0` );
2362
2363	/*
2364	* free the LIO context if the last lio completed and no thread is
2365	* waiting
2366	*/
2367	if (lastLioCompleted && (waiter == `0`))
2368	free_lio_context (lio_context);
2369
2370
2371	} / do_aio_completion /
2372
2373
2374	/*
2375	* do_aio_read
2376	*/
2377	static int
2378	do_aio_read( aio_workq_entry *entryp )
2379	{
2380	struct fileproc *fp;
2381	int error;
2382	struct vfs_context context;
2383
2384	if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , `0`)) )
2385	return(error);
2386	if ( (fp->f_fglob->fg_flag & FREAD) == `0` ) {
2387	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, `0`);
2388	return(EBADF);
2389	}
2390
2391	context.vc_thread = entryp->thread; / XXX /
2392	context.vc_ucred = fp->f_fglob->fg_cred;
2393
2394	error = dofileread(&context, fp,
2395	entryp->aiocb.aio_buf,
2396	entryp->aiocb.aio_nbytes,
2397	entryp->aiocb.aio_offset, FOF_OFFSET,
2398	&entryp->returnval);
2399	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, `0`);
2400
2401	return( error );
2402
2403	} / do_aio_read /
2404
2405
2406	/*
2407	* do_aio_write
2408	*/
2409	static int
2410	do_aio_write( aio_workq_entry *entryp )
2411	{
2412	struct fileproc *fp;
2413	int error, flags;
2414	struct vfs_context context;
2415
2416	if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , `0`)) )
2417	return(error);
2418	if ( (fp->f_fglob->fg_flag & FWRITE) == `0` ) {
2419	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, `0`);
2420	return(EBADF);
2421	}
2422
2423	flags = FOF_PCRED;
2424	if ( (fp->f_fglob->fg_flag & O_APPEND) == `0` ) {
2425	flags \|= FOF_OFFSET;
2426	}
2427
2428	context.vc_thread = entryp->thread; / XXX /
2429	context.vc_ucred = fp->f_fglob->fg_cred;
2430
2431	/ NB: tell dofilewrite the offset, and to use the proc cred /
2432	error = dofilewrite(&context,
2433	fp,
2434	entryp->aiocb.aio_buf,
2435	entryp->aiocb.aio_nbytes,
2436	entryp->aiocb.aio_offset,
2437	flags,
2438	&entryp->returnval);
2439
2440	if (entryp->returnval)
2441	fp_drop_written(entryp->procp, entryp->aiocb.aio_fildes, fp);
2442	else
2443	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, `0`);
2444
2445	return( error );
2446
2447	} / do_aio_write /
2448
2449
2450	/*
2451	* aio_active_requests_for_process - return number of active async IO
2452	* requests for the given process.
2453	*/
2454	static int
2455	aio_active_requests_for_process(proc_t procp )
2456	{
2457	return( procp->p_aio_active_count );
2458
2459	} / aio_active_requests_for_process /
2460
2461	/*
2462	* Called with the proc locked.
2463	*/
2464	static int
2465	aio_proc_active_requests_for_file(proc_t procp, int fd)
2466	{
2467	int count = `0`;
2468	aio_workq_entry *entryp;
2469	TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
2470	if (entryp->aiocb.aio_fildes == fd) {
2471	count++;
2472	}
2473	}
2474
2475	return count;
2476	} / aio_active_requests_for_process /
2477
2478
2479
2480	/*
2481	* do_aio_fsync
2482	*/
2483	static int
2484	do_aio_fsync( aio_workq_entry *entryp )
2485	{
2486	struct vfs_context context;
2487	struct vnode *vp;
2488	struct fileproc *fp;
2489	int sync_flag;
2490	int error;
2491
2492	/*
2493	* We are never called unless either AIO_FSYNC or AIO_DSYNC are set.
2494	*
2495	* If AIO_DSYNC is set, we can tell the lower layers that it is OK
2496	* to mark for update the metadata not strictly necessary for data
2497	* retrieval, rather than forcing it to disk.
2498	*
2499	* If AIO_FSYNC is set, we have to also wait for metadata not really
2500	* necessary to data retrival are committed to stable storage (e.g.
2501	* atime, mtime, ctime, etc.).
2502	*
2503	* Metadata necessary for data retrieval ust be committed to stable
2504	* storage in either case (file length, etc.).
2505	*/
2506	if (entryp->flags & AIO_FSYNC)
2507	sync_flag = MNT_WAIT;
2508	else
2509	sync_flag = MNT_DWAIT;
2510
2511	error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp);
2512	if ( error == `0` ) {
2513	if ( (error = vnode_getwithref(vp)) ) {
2514	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, `0`);
2515	entryp->returnval = -`1`;
2516	return(error);
2517	}
2518	context.vc_thread = current_thread();
2519	context.vc_ucred = fp->f_fglob->fg_cred;
2520
2521	error = VNOP_FSYNC( vp, sync_flag, &context);
2522
2523	(void)vnode_put(vp);
2524
2525	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, `0`);
2526	}
2527	if ( error != `0` )
2528	entryp->returnval = -`1`;
2529
2530	return( error );
2531
2532	} / do_aio_fsync /
2533
2534
2535	/*
2536	* is_already_queued - runs through our queues to see if the given
2537	* aiocbp / process is there. Returns TRUE if there is a match
2538	* on any of our aio queues.
2539	*
2540	* Called with proc aio lock held (can be held spin)
2541	*/
2542	static boolean_t
2543	is_already_queued(proc_t procp,
2544	user_addr_t aiocbp )
2545	{
2546	aio_workq_entry *entryp;
2547	boolean_t result;
2548
2549	result = FALSE;
2550
2551	/ look for matches on our queue of async IO requests that have completed /
2552	TAILQ_FOREACH( entryp, &procp->p_aio_doneq, aio_proc_link ) {
2553	if ( aiocbp == entryp->uaiocbp ) {
2554	result = TRUE;
2555	goto ExitThisRoutine;
2556	}
2557	}
2558
2559	/ look for matches on our queue of active async IO requests /
2560	TAILQ_FOREACH( entryp, &procp->p_aio_activeq, aio_proc_link ) {
2561	if ( aiocbp == entryp->uaiocbp ) {
2562	result = TRUE;
2563	goto ExitThisRoutine;
2564	}
2565	}
2566
2567	ExitThisRoutine:
2568	return( result );
2569
2570	} / is_already_queued /
2571
2572
2573	static void
2574	free_lio_context(aio_lio_context* context)
2575	{
2576
2577	#if DEBUG
2578	OSDecrementAtomic(&lio_contexts_alloced);
2579	#endif /* DEBUG */
2580
2581	FREE( context, M_TEMP );
2582
2583	} / free_lio_context /
2584
2585
2586	/*
2587	* aio initialization
2588	*/
2589	__private_extern__ void
2590	aio_init( void )
2591	{
2592	int i;
2593
2594	aio_lock_grp_attr = lck_grp_attr_alloc_init();
2595	aio_proc_lock_grp = lck_grp_alloc_init("aio_proc", aio_lock_grp_attr);;
2596	aio_entry_lock_grp = lck_grp_alloc_init("aio_entry", aio_lock_grp_attr);;
2597	aio_queue_lock_grp = lck_grp_alloc_init("aio_queue", aio_lock_grp_attr);;
2598	aio_lock_attr = lck_attr_alloc_init();
2599
2600	lck_mtx_init(&aio_entry_mtx, aio_entry_lock_grp, aio_lock_attr);
2601	lck_mtx_init(&aio_proc_mtx, aio_proc_lock_grp, aio_lock_attr);
2602
2603	aio_anchor.aio_inflight_count = `0`;
2604	aio_anchor.aio_done_count = `0`;
2605	aio_anchor.aio_total_count = `0`;
2606	aio_anchor.aio_num_workqs = AIO_NUM_WORK_QUEUES;
2607
2608	for (i = `0`; i < AIO_NUM_WORK_QUEUES; i++) {
2609	aio_workq_init(&aio_anchor.aio_async_workqs[i]);
2610	}
2611
2612
2613	i = sizeof( aio_workq_entry );
2614	aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );
2615
2616	_aio_create_worker_threads( aio_worker_threads );
2617
2618	} / aio_init /
2619
2620
2621	/*
2622	* aio worker threads created here.
2623	*/
2624	__private_extern__ void
2625	_aio_create_worker_threads( int num )
2626	{
2627	int i;
2628
2629	/ create some worker threads to handle the async IO requests /
2630	for ( i = `0`; i < num; i++ ) {
2631	thread_t myThread;
2632
2633	if ( KERN_SUCCESS != kernel_thread_start((thread_continue_t)aio_work_thread, NULL, &myThread) ) {
2634	printf( "%s - failed to create a work thread \n", __FUNCTION__ );
2635	}
2636	else
2637	thread_deallocate(myThread);
2638	}
2639
2640	return;
2641
2642	} / _aio_create_worker_threads /
2643
2644	/*
2645	* Return the current activation utask
2646	*/
2647	task_t
2648	get_aiotask(void)
2649	{
2650	return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
2651	}
2652
2653
2654	/*
2655	* In the case of an aiocb from a
2656	* 32-bit process we need to expand some longs and pointers to the correct
2657	* sizes in order to let downstream code always work on the same type of
2658	* aiocb (in our case that is a user_aiocb)
2659	*/
2660	static void
2661	do_munge_aiocb_user32_to_user( struct user32_aiocb my_aiocbp, struct* user_aiocb *the_user_aiocbp )
2662	{
2663	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2664	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2665	the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
2666	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2667	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2668	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2669
2670	/ special case here. since we do not know if sigev_value is an /
2671	/ int or a ptr we do NOT cast the ptr to a user_addr_t. This /
2672	/ means if we send this info back to user space we need to remember /
2673	/ sigev_value was not expanded for the 32-bit case. /
2674	/ NOTE - this does NOT affect us since we don't support sigev_value /
2675	/ yet in the aio context. /
2676	//LP64
2677	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2678	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2679	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2680	my_aiocbp->aio_sigevent.sigev_value.sival_int;
2681	the_user_aiocbp->aio_sigevent.sigev_notify_function =
2682	CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
2683	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2684	CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
2685	}
2686
2687	/ Similar for 64-bit user process, so that we don't need to satisfy*
2688	* the alignment constraints of the original user64_aiocb
2689	*/
2690	static void
2691	do_munge_aiocb_user64_to_user( struct user64_aiocb my_aiocbp, struct* user_aiocb *the_user_aiocbp )
2692	{
2693	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2694	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2695	the_user_aiocbp->aio_buf = my_aiocbp->aio_buf;
2696	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2697	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2698	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2699
2700	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2701	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2702	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2703	my_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int;
2704	the_user_aiocbp->aio_sigevent.sigev_notify_function =
2705	my_aiocbp->aio_sigevent.sigev_notify_function;
2706	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2707	my_aiocbp->aio_sigevent.sigev_notify_attributes;
2708	}
2709

Browse the source code of xnu/bsd/kern/kern_aio.c