1/*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
62 */
63
64#include <sys/param.h>
65#include <sys/proc_internal.h>
66#include <sys/buf_internal.h>
67#include <sys/mount_internal.h>
68#include <sys/vnode_internal.h>
69#include <sys/trace.h>
70#include <kern/kalloc.h>
71#include <sys/time.h>
72#include <sys/kernel.h>
73#include <sys/resourcevar.h>
74#include <miscfs/specfs/specdev.h>
75#include <sys/uio_internal.h>
76#include <libkern/libkern.h>
77#include <machine/machine_routines.h>
78
79#include <sys/ubc_internal.h>
80#include <vm/vnode_pager.h>
81
82#include <mach/mach_types.h>
83#include <mach/memory_object_types.h>
84#include <mach/vm_map.h>
85#include <mach/upl.h>
86#include <kern/task.h>
87#include <kern/policy_internal.h>
88
89#include <vm/vm_kern.h>
90#include <vm/vm_map.h>
91#include <vm/vm_pageout.h>
92#include <vm/vm_fault.h>
93
94#include <sys/kdebug.h>
95#include <sys/kdebug_triage.h>
96#include <libkern/OSAtomic.h>
97
98#include <sys/sdt.h>
99
100#include <stdbool.h>
101
102#include <vfs/vfs_disk_conditioner.h>
103
104#if 0
105#undef KERNEL_DEBUG
106#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
107#endif
108
109
110#define CL_READ 0x01
111#define CL_WRITE 0x02
112#define CL_ASYNC 0x04
113#define CL_COMMIT 0x08
114#define CL_PAGEOUT 0x10
115#define CL_AGE 0x20
116#define CL_NOZERO 0x40
117#define CL_PAGEIN 0x80
118#define CL_DEV_MEMORY 0x100
119#define CL_PRESERVE 0x200
120#define CL_THROTTLE 0x400
121#define CL_KEEPCACHED 0x800
122#define CL_DIRECT_IO 0x1000
123#define CL_PASSIVE 0x2000
124#define CL_IOSTREAMING 0x4000
125#define CL_CLOSE 0x8000
126#define CL_ENCRYPTED 0x10000
127#define CL_RAW_ENCRYPTED 0x20000
128#define CL_NOCACHE 0x40000
129
130#define MAX_VECTOR_UPL_SIZE (2 * MAX_UPL_SIZE_BYTES)
131
132#define CLUSTER_IO_WAITING ((buf_t)1)
133
134extern upl_t vector_upl_create(vm_offset_t, uint32_t);
135extern uint32_t vector_upl_max_upls(upl_t);
136extern boolean_t vector_upl_is_valid(upl_t);
137extern boolean_t vector_upl_set_subupl(upl_t, upl_t, u_int32_t);
138extern void vector_upl_set_pagelist(upl_t);
139extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t);
140
141struct clios {
142 lck_mtx_t io_mtxp;
143 u_int io_completed; /* amount of io that has currently completed */
144 u_int io_issued; /* amount of io that was successfully issued */
145 int io_error; /* error code of first error encountered */
146 int io_wanted; /* someone is sleeping waiting for a change in state */
147};
148
149struct cl_direct_read_lock {
150 LIST_ENTRY(cl_direct_read_lock) chain;
151 int32_t ref_count;
152 vnode_t vp;
153 lck_rw_t rw_lock;
154};
155
156#define CL_DIRECT_READ_LOCK_BUCKETS 61
157
158static LIST_HEAD(cl_direct_read_locks, cl_direct_read_lock)
159cl_direct_read_locks[CL_DIRECT_READ_LOCK_BUCKETS];
160
161static LCK_GRP_DECLARE(cl_mtx_grp, "cluster I/O");
162static LCK_MTX_DECLARE(cl_transaction_mtxp, &cl_mtx_grp);
163static LCK_SPIN_DECLARE(cl_direct_read_spin_lock, &cl_mtx_grp);
164
165static ZONE_DEFINE(cl_rd_zone, "cluster_read",
166 sizeof(struct cl_readahead), ZC_ZFREE_CLEARMEM);
167
168static ZONE_DEFINE(cl_wr_zone, "cluster_write",
169 sizeof(struct cl_writebehind), ZC_ZFREE_CLEARMEM);
170
171#define IO_UNKNOWN 0
172#define IO_DIRECT 1
173#define IO_CONTIG 2
174#define IO_COPY 3
175
176#define PUSH_DELAY 0x01
177#define PUSH_ALL 0x02
178#define PUSH_SYNC 0x04
179
180
181static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset, size_t verify_block_size);
182static void cluster_wait_IO(buf_t cbp_head, int async);
183static void cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait);
184
185static int cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length);
186
187static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
188 int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg);
189static int cluster_iodone(buf_t bp, void *callback_arg);
190static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp);
191static int cluster_is_throttled(vnode_t vp);
192
193static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name);
194
195static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg, int flags);
196
197static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference);
198static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference);
199
200static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags,
201 int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
202static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
203 int flags, int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
204static int cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
205 int (*)(buf_t, void *), void *callback_arg, int flags) __attribute__((noinline));
206
207static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
208 off_t headOff, off_t tailOff, int flags, int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
209static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
210 int *write_type, u_int32_t *write_length, int flags, int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
211static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
212 int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag) __attribute__((noinline));
213
214static void cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes, boolean_t *first_pass,
215 off_t write_off, int write_cnt, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
216
217static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg);
218
219static int cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
220static void cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra,
221 int (*callback)(buf_t, void *), void *callback_arg, int bflag);
222
223static int cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_ioitiated);
224
225static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *),
226 void *callback_arg, int *err, boolean_t vm_initiated);
227
228static int sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
229static int sparse_cluster_push(struct cl_writebehind *, void **cmapp, vnode_t vp, off_t EOF, int push_flag,
230 int io_flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
231static int sparse_cluster_add(struct cl_writebehind *, void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF,
232 int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
233
234static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp);
235static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
236static kern_return_t vfs_drt_control(void **cmapp, int op_type);
237static kern_return_t vfs_get_scmap_push_behavior_internal(void **cmapp, int *push_flag);
238
239
240/*
241 * For throttled IO to check whether
242 * a block is cached by the boot cache
243 * and thus it can avoid delaying the IO.
244 *
245 * bootcache_contains_block is initially
246 * NULL. The BootCache will set it while
247 * the cache is active and clear it when
248 * the cache is jettisoned.
249 *
250 * Returns 0 if the block is not
251 * contained in the cache, 1 if it is
252 * contained.
253 *
254 * The function pointer remains valid
255 * after the cache has been evicted even
256 * if bootcache_contains_block has been
257 * cleared.
258 *
259 * See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs
260 */
261int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL;
262
263
264/*
265 * limit the internal I/O size so that we
266 * can represent it in a 32 bit int
267 */
268#define MAX_IO_REQUEST_SIZE (1024 * 1024 * 512)
269#define MAX_IO_CONTIG_SIZE MAX_UPL_SIZE_BYTES
270#define MAX_VECTS 16
271/*
272 * The MIN_DIRECT_WRITE_SIZE governs how much I/O should be issued before we consider
273 * allowing the caller to bypass the buffer cache. For small I/Os (less than 16k),
274 * we have not historically allowed the write to bypass the UBC.
275 */
276#define MIN_DIRECT_WRITE_SIZE (16384)
277
278#define WRITE_THROTTLE 6
279#define WRITE_THROTTLE_SSD 2
280#define WRITE_BEHIND 1
281#define WRITE_BEHIND_SSD 1
282
283#if !defined(XNU_TARGET_OS_OSX)
284#define PREFETCH 1
285#define PREFETCH_SSD 1
286uint32_t speculative_prefetch_max = (2048 * 1024); /* maximum bytes in a specluative read-ahead */
287uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use in a specluative read-ahead */
288#else /* XNU_TARGET_OS_OSX */
289#define PREFETCH 3
290#define PREFETCH_SSD 2
291uint32_t speculative_prefetch_max = (MAX_UPL_SIZE_BYTES * 3); /* maximum bytes in a specluative read-ahead */
292uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use in a specluative read-ahead on SSDs*/
293#endif /* ! XNU_TARGET_OS_OSX */
294
295/* maximum bytes for read-ahead */
296uint32_t prefetch_max = (1024 * 1024 * 1024);
297/* maximum bytes for outstanding reads */
298uint32_t overlapping_read_max = (1024 * 1024 * 1024);
299/* maximum bytes for outstanding writes */
300uint32_t overlapping_write_max = (1024 * 1024 * 1024);
301
302#define IO_SCALE(vp, base) (vp->v_mount->mnt_ioscale * (base))
303#define MAX_CLUSTER_SIZE(vp) (cluster_max_io_size(vp->v_mount, CL_WRITE))
304
305int speculative_reads_disabled = 0;
306
307/*
308 * throttle the number of async writes that
309 * can be outstanding on a single vnode
310 * before we issue a synchronous write
311 */
312#define THROTTLE_MAXCNT 0
313
314uint32_t throttle_max_iosize = (128 * 1024);
315
316#define THROTTLE_MAX_IOSIZE (throttle_max_iosize)
317
318SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_max_iosize, 0, "");
319
320
321void
322cluster_init(void)
323{
324 for (int i = 0; i < CL_DIRECT_READ_LOCK_BUCKETS; ++i) {
325 LIST_INIT(&cl_direct_read_locks[i]);
326 }
327}
328
329
330uint32_t
331cluster_max_io_size(mount_t mp, int type)
332{
333 uint32_t max_io_size;
334 uint32_t segcnt;
335 uint32_t maxcnt;
336
337 switch (type) {
338 case CL_READ:
339 segcnt = mp->mnt_segreadcnt;
340 maxcnt = mp->mnt_maxreadcnt;
341 break;
342 case CL_WRITE:
343 segcnt = mp->mnt_segwritecnt;
344 maxcnt = mp->mnt_maxwritecnt;
345 break;
346 default:
347 segcnt = min(a: mp->mnt_segreadcnt, b: mp->mnt_segwritecnt);
348 maxcnt = min(a: mp->mnt_maxreadcnt, b: mp->mnt_maxwritecnt);
349 break;
350 }
351 if (segcnt > (MAX_UPL_SIZE_BYTES >> PAGE_SHIFT)) {
352 /*
353 * don't allow a size beyond the max UPL size we can create
354 */
355 segcnt = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
356 }
357 max_io_size = min(a: (segcnt * PAGE_SIZE), b: maxcnt);
358
359 if (max_io_size < MAX_UPL_TRANSFER_BYTES) {
360 /*
361 * don't allow a size smaller than the old fixed limit
362 */
363 max_io_size = MAX_UPL_TRANSFER_BYTES;
364 } else {
365 /*
366 * make sure the size specified is a multiple of PAGE_SIZE
367 */
368 max_io_size &= ~PAGE_MASK;
369 }
370 return max_io_size;
371}
372
373/*
374 * Returns max prefetch value. If the value overflows or exceeds the specified
375 * 'prefetch_limit', it will be capped at 'prefetch_limit' value.
376 */
377static inline uint32_t
378cluster_max_prefetch(vnode_t vp, uint32_t max_io_size, uint32_t prefetch_limit)
379{
380 bool is_ssd = disk_conditioner_mount_is_ssd(vp->v_mount);
381 uint32_t io_scale = IO_SCALE(vp, is_ssd ? PREFETCH_SSD : PREFETCH);
382 uint32_t prefetch = 0;
383
384 if (__improbable(os_mul_overflow(max_io_size, io_scale, &prefetch) ||
385 (prefetch > prefetch_limit))) {
386 prefetch = prefetch_limit;
387 }
388
389 return prefetch;
390}
391
392static inline uint32_t
393calculate_max_throttle_size(vnode_t vp)
394{
395 bool is_ssd = disk_conditioner_mount_is_ssd(vp->v_mount);
396 uint32_t io_scale = IO_SCALE(vp, is_ssd ? 2 : 1);
397
398 return MIN(io_scale * THROTTLE_MAX_IOSIZE, MAX_UPL_TRANSFER_BYTES);
399}
400
401static inline uint32_t
402calculate_max_throttle_cnt(vnode_t vp)
403{
404 bool is_ssd = disk_conditioner_mount_is_ssd(vp->v_mount);
405 uint32_t io_scale = IO_SCALE(vp, 1);
406
407 return is_ssd ? MIN(io_scale, 4) : THROTTLE_MAXCNT;
408}
409
410#define CLW_ALLOCATE 0x01
411#define CLW_RETURNLOCKED 0x02
412#define CLW_IONOCACHE 0x04
413#define CLW_IOPASSIVE 0x08
414
415/*
416 * if the read ahead context doesn't yet exist,
417 * allocate and initialize it...
418 * the vnode lock serializes multiple callers
419 * during the actual assignment... first one
420 * to grab the lock wins... the other callers
421 * will release the now unnecessary storage
422 *
423 * once the context is present, try to grab (but don't block on)
424 * the lock associated with it... if someone
425 * else currently owns it, than the read
426 * will run without read-ahead. this allows
427 * multiple readers to run in parallel and
428 * since there's only 1 read ahead context,
429 * there's no real loss in only allowing 1
430 * reader to have read-ahead enabled.
431 */
432static struct cl_readahead *
433cluster_get_rap(vnode_t vp)
434{
435 struct ubc_info *ubc;
436 struct cl_readahead *rap;
437
438 ubc = vp->v_ubcinfo;
439
440 if ((rap = ubc->cl_rahead) == NULL) {
441 rap = zalloc_flags(cl_rd_zone, Z_WAITOK | Z_ZERO);
442 rap->cl_lastr = -1;
443 lck_mtx_init(lck: &rap->cl_lockr, grp: &cl_mtx_grp, LCK_ATTR_NULL);
444
445 vnode_lock(vp);
446
447 if (ubc->cl_rahead == NULL) {
448 ubc->cl_rahead = rap;
449 } else {
450 lck_mtx_destroy(lck: &rap->cl_lockr, grp: &cl_mtx_grp);
451 zfree(cl_rd_zone, rap);
452 rap = ubc->cl_rahead;
453 }
454 vnode_unlock(vp);
455 }
456 if (lck_mtx_try_lock(lck: &rap->cl_lockr) == TRUE) {
457 return rap;
458 }
459
460 return (struct cl_readahead *)NULL;
461}
462
463
464/*
465 * if the write behind context doesn't yet exist,
466 * and CLW_ALLOCATE is specified, allocate and initialize it...
467 * the vnode lock serializes multiple callers
468 * during the actual assignment... first one
469 * to grab the lock wins... the other callers
470 * will release the now unnecessary storage
471 *
472 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
473 * the lock associated with the write behind context before
474 * returning
475 */
476
477static struct cl_writebehind *
478cluster_get_wbp(vnode_t vp, int flags)
479{
480 struct ubc_info *ubc;
481 struct cl_writebehind *wbp;
482
483 ubc = vp->v_ubcinfo;
484
485 if ((wbp = ubc->cl_wbehind) == NULL) {
486 if (!(flags & CLW_ALLOCATE)) {
487 return (struct cl_writebehind *)NULL;
488 }
489
490 wbp = zalloc_flags(cl_wr_zone, Z_WAITOK | Z_ZERO);
491
492 lck_mtx_init(lck: &wbp->cl_lockw, grp: &cl_mtx_grp, LCK_ATTR_NULL);
493
494 vnode_lock(vp);
495
496 if (ubc->cl_wbehind == NULL) {
497 ubc->cl_wbehind = wbp;
498 } else {
499 lck_mtx_destroy(lck: &wbp->cl_lockw, grp: &cl_mtx_grp);
500 zfree(cl_wr_zone, wbp);
501 wbp = ubc->cl_wbehind;
502 }
503 vnode_unlock(vp);
504 }
505 if (flags & CLW_RETURNLOCKED) {
506 lck_mtx_lock(lck: &wbp->cl_lockw);
507 }
508
509 return wbp;
510}
511
512
513static void
514cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, int flags)
515{
516 struct cl_writebehind *wbp;
517
518 if ((wbp = cluster_get_wbp(vp, flags: 0)) != NULL) {
519 if (wbp->cl_number) {
520 lck_mtx_lock(lck: &wbp->cl_lockw);
521
522 cluster_try_push(wbp, vp, EOF: newEOF, PUSH_ALL | flags, flags: 0, callback, callback_arg, NULL, FALSE);
523
524 lck_mtx_unlock(lck: &wbp->cl_lockw);
525 }
526 }
527}
528
529
530static int
531cluster_io_present_in_BC(vnode_t vp, off_t f_offset)
532{
533 daddr64_t blkno;
534 size_t io_size;
535 int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block;
536
537 if (bootcache_check_fn && vp->v_mount && vp->v_mount->mnt_devvp) {
538 if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ | VNODE_BLOCKMAP_NO_TRACK, NULL)) {
539 return 0;
540 }
541
542 if (io_size == 0) {
543 return 0;
544 }
545
546 if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno)) {
547 return 1;
548 }
549 }
550 return 0;
551}
552
553
554static int
555cluster_is_throttled(vnode_t vp)
556{
557 return throttle_io_will_be_throttled(lowpri_window_msecs: -1, mp: vp->v_mount);
558}
559
560
561static void
562cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name)
563{
564 lck_mtx_lock(lck: &iostate->io_mtxp);
565
566 while ((iostate->io_issued - iostate->io_completed) > target) {
567 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
568 iostate->io_issued, iostate->io_completed, target, 0, 0);
569
570 iostate->io_wanted = 1;
571 msleep(chan: (caddr_t)&iostate->io_wanted, mtx: &iostate->io_mtxp, PRIBIO + 1, wmesg: wait_name, NULL);
572
573 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
574 iostate->io_issued, iostate->io_completed, target, 0, 0);
575 }
576 lck_mtx_unlock(lck: &iostate->io_mtxp);
577}
578
579static void
580cluster_handle_associated_upl(struct clios *iostate, upl_t upl,
581 upl_offset_t upl_offset, upl_size_t size)
582{
583 if (!size) {
584 return;
585 }
586
587 upl_t associated_upl = upl_associated_upl(upl);
588
589 if (!associated_upl) {
590 return;
591 }
592
593#if 0
594 printf("1: %d %d\n", upl_offset, upl_offset + size);
595#endif
596
597 /*
598 * The associated UPL is page aligned to file offsets whereas the
599 * UPL it's attached to has different alignment requirements. The
600 * upl_offset that we have refers to @upl. The code that follows
601 * has to deal with the first and last pages in this transaction
602 * which might straddle pages in the associated UPL. To keep
603 * track of these pages, we use the mark bits: if the mark bit is
604 * set, we know another transaction has completed its part of that
605 * page and so we can unlock that page here.
606 *
607 * The following illustrates what we have to deal with:
608 *
609 * MEM u <------------ 1 PAGE ------------> e
610 * +-------------+----------------------+-----------------
611 * | |######################|#################
612 * +-------------+----------------------+-----------------
613 * FILE | <--- a ---> o <------------ 1 PAGE ------------>
614 *
615 * So here we show a write to offset @o. The data that is to be
616 * written is in a buffer that is not page aligned; it has offset
617 * @a in the page. The upl that carries the data starts in memory
618 * at @u. The associated upl starts in the file at offset @o. A
619 * transaction will always end on a page boundary (like @e above)
620 * except for the very last transaction in the group. We cannot
621 * unlock the page at @o in the associated upl until both the
622 * transaction ending at @e and the following transaction (that
623 * starts at @e) has completed.
624 */
625
626 /*
627 * We record whether or not the two UPLs are aligned as the mark
628 * bit in the first page of @upl.
629 */
630 upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
631 bool is_unaligned = upl_page_get_mark(upl: pl, index: 0);
632
633 if (is_unaligned) {
634 upl_page_info_t *assoc_pl = UPL_GET_INTERNAL_PAGE_LIST(associated_upl);
635
636 upl_offset_t upl_end = upl_offset + size;
637 assert(upl_end >= PAGE_SIZE);
638
639 upl_size_t assoc_upl_size = upl_get_size(upl: associated_upl);
640
641 /*
642 * In the very first transaction in the group, upl_offset will
643 * not be page aligned, but after that it will be and in that
644 * case we want the preceding page in the associated UPL hence
645 * the minus one.
646 */
647 assert(upl_offset);
648 if (upl_offset) {
649 upl_offset = trunc_page_32(upl_offset - 1);
650 }
651
652 lck_mtx_lock_spin(lck: &iostate->io_mtxp);
653
654 // Look at the first page...
655 if (upl_offset
656 && !upl_page_get_mark(upl: assoc_pl, index: upl_offset >> PAGE_SHIFT)) {
657 /*
658 * The first page isn't marked so let another transaction
659 * completion handle it.
660 */
661 upl_page_set_mark(upl: assoc_pl, index: upl_offset >> PAGE_SHIFT, true);
662 upl_offset += PAGE_SIZE;
663 }
664
665 // And now the last page...
666
667 /*
668 * This needs to be > rather than >= because if it's equal, it
669 * means there's another transaction that is sharing the last
670 * page.
671 */
672 if (upl_end > assoc_upl_size) {
673 upl_end = assoc_upl_size;
674 } else {
675 upl_end = trunc_page_32(upl_end);
676 const int last_pg = (upl_end >> PAGE_SHIFT) - 1;
677
678 if (!upl_page_get_mark(upl: assoc_pl, index: last_pg)) {
679 /*
680 * The last page isn't marked so mark the page and let another
681 * transaction completion handle it.
682 */
683 upl_page_set_mark(upl: assoc_pl, index: last_pg, true);
684 upl_end -= PAGE_SIZE;
685 }
686 }
687
688 lck_mtx_unlock(lck: &iostate->io_mtxp);
689
690#if 0
691 printf("2: %d %d\n", upl_offset, upl_end);
692#endif
693
694 if (upl_end <= upl_offset) {
695 return;
696 }
697
698 size = upl_end - upl_offset;
699 } else {
700 assert(!(upl_offset & PAGE_MASK));
701 assert(!(size & PAGE_MASK));
702 }
703
704 boolean_t empty;
705
706 /*
707 * We can unlock these pages now and as this is for a
708 * direct/uncached write, we want to dump the pages too.
709 */
710 kern_return_t kr = upl_abort_range(upl_object: associated_upl, offset: upl_offset, size,
711 UPL_ABORT_DUMP_PAGES, empty: &empty);
712
713 assert(!kr);
714
715 if (!kr && empty) {
716 upl_set_associated_upl(upl, NULL);
717 upl_deallocate(upl: associated_upl);
718 }
719}
720
721static int
722cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp)
723{
724 int upl_abort_code = 0;
725 int page_in = 0;
726 int page_out = 0;
727
728 if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE)) {
729 /*
730 * direct write of any flavor, or a direct read that wasn't aligned
731 */
732 ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
733 } else {
734 if (io_flags & B_PAGEIO) {
735 if (io_flags & B_READ) {
736 page_in = 1;
737 } else {
738 page_out = 1;
739 }
740 }
741 if (io_flags & B_CACHE) {
742 /*
743 * leave pages in the cache unchanged on error
744 */
745 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
746 } else if (((io_flags & B_READ) == 0) && ((error != ENXIO) || vnode_isswap(vp))) {
747 /*
748 * transient error on pageout/write path... leave pages unchanged
749 */
750 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
751 } else if (page_in) {
752 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
753 } else {
754 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
755 }
756
757 ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
758 }
759 return upl_abort_code;
760}
761
762
763static int
764cluster_iodone(buf_t bp, void *callback_arg)
765{
766 int b_flags;
767 int error;
768 int total_size;
769 int total_resid;
770 int upl_offset;
771 int zero_offset;
772 int pg_offset = 0;
773 int commit_size = 0;
774 int upl_flags = 0;
775 int transaction_size = 0;
776 upl_t upl;
777 buf_t cbp;
778 buf_t cbp_head;
779 buf_t cbp_next;
780 buf_t real_bp;
781 vnode_t vp;
782 struct clios *iostate;
783 void *verify_ctx;
784 boolean_t transaction_complete = FALSE;
785
786 __IGNORE_WCASTALIGN(cbp_head = (buf_t)(bp->b_trans_head));
787
788 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
789 cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
790
791 if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) {
792 lck_mtx_lock_spin(lck: &cl_transaction_mtxp);
793
794 bp->b_flags |= B_TDONE;
795
796 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
797 /*
798 * all I/O requests that are part of this transaction
799 * have to complete before we can process it
800 */
801 if (!(cbp->b_flags & B_TDONE)) {
802 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
803 cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
804
805 lck_mtx_unlock(lck: &cl_transaction_mtxp);
806
807 return 0;
808 }
809
810 if (cbp->b_trans_next == CLUSTER_IO_WAITING) {
811 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
812 cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
813
814 lck_mtx_unlock(lck: &cl_transaction_mtxp);
815 wakeup(chan: cbp);
816
817 return 0;
818 }
819
820 if (cbp->b_flags & B_EOT) {
821 transaction_complete = TRUE;
822 }
823 }
824 lck_mtx_unlock(lck: &cl_transaction_mtxp);
825
826 if (transaction_complete == FALSE) {
827 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
828 cbp_head, 0, 0, 0, 0);
829 return 0;
830 }
831 }
832 error = 0;
833 total_size = 0;
834 total_resid = 0;
835
836 cbp = cbp_head;
837 vp = cbp->b_vp;
838 upl_offset = cbp->b_uploffset;
839 upl = cbp->b_upl;
840 b_flags = cbp->b_flags;
841 real_bp = cbp->b_real_bp;
842 zero_offset = cbp->b_validend;
843 iostate = (struct clios *)cbp->b_iostate;
844
845 if (real_bp) {
846 real_bp->b_dev = cbp->b_dev;
847 }
848
849 while (cbp) {
850 if ((cbp->b_flags & B_ERROR) && error == 0) {
851 error = cbp->b_error;
852 }
853
854 total_resid += cbp->b_resid;
855 total_size += cbp->b_bcount;
856
857 cbp_next = cbp->b_trans_next;
858
859 if (cbp_next == NULL) {
860 /*
861 * compute the overall size of the transaction
862 * in case we created one that has 'holes' in it
863 * 'total_size' represents the amount of I/O we
864 * did, not the span of the transaction w/r to the UPL
865 */
866 transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;
867 }
868
869 if (cbp != cbp_head) {
870 free_io_buf(cbp);
871 }
872
873 cbp = cbp_next;
874 }
875
876 if (ISSET(b_flags, B_COMMIT_UPL)) {
877 cluster_handle_associated_upl(iostate,
878 upl: cbp_head->b_upl,
879 upl_offset,
880 size: transaction_size);
881 }
882
883 if (error == 0 && total_resid) {
884 error = EIO;
885 }
886
887 if (error == 0) {
888 int (*cliodone_func)(buf_t, void *) = (int (*)(buf_t, void *))(cbp_head->b_cliodone);
889
890 if (cliodone_func != NULL) {
891 cbp_head->b_bcount = transaction_size;
892
893 error = (*cliodone_func)(cbp_head, callback_arg);
894 }
895 }
896 if (zero_offset) {
897 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
898 }
899
900 verify_ctx = cbp_head->b_attr.ba_verify_ctx;
901 cbp_head->b_attr.ba_verify_ctx = NULL;
902 if (verify_ctx) {
903 vnode_verify_flags_t verify_flags = VNODE_VERIFY_CONTEXT_FREE;
904 caddr_t verify_buf = NULL;
905 off_t start_off = cbp_head->b_lblkno * cbp_head->b_lblksize;
906 size_t verify_length = transaction_size;
907 vm_offset_t vaddr;
908
909 if (!error) {
910 verify_flags |= VNODE_VERIFY_WITH_CONTEXT;
911 error = ubc_upl_map_range(upl, upl_offset, round_page(x: transaction_size), VM_PROT_DEFAULT, &vaddr); /* Map it in */
912 if (error) {
913 panic("ubc_upl_map_range returned error %d, upl = %p, upl_offset = %d, size = %d",
914 error, upl, (int)upl_offset, (int)round_page(transaction_size));
915 } else {
916 verify_buf = (caddr_t)vaddr;
917 }
918 }
919
920 error = VNOP_VERIFY(vp, start_off, (uint8_t *)verify_buf, verify_length, 0, &verify_ctx, verify_flags, NULL);
921
922 if (verify_buf) {
923 (void)ubc_upl_unmap_range(upl, upl_offset, round_page(x: transaction_size));
924 verify_buf = NULL;
925 }
926 } else if (cbp_head->b_attr.ba_flags & BA_WILL_VERIFY) {
927 error = EBADMSG;
928 }
929
930 free_io_buf(cbp_head);
931
932 if (iostate) {
933 int need_wakeup = 0;
934
935 /*
936 * someone has issued multiple I/Os asynchrounsly
937 * and is waiting for them to complete (streaming)
938 */
939 lck_mtx_lock_spin(lck: &iostate->io_mtxp);
940
941 if (error && iostate->io_error == 0) {
942 iostate->io_error = error;
943 }
944
945 iostate->io_completed += total_size;
946
947 if (iostate->io_wanted) {
948 /*
949 * someone is waiting for the state of
950 * this io stream to change
951 */
952 iostate->io_wanted = 0;
953 need_wakeup = 1;
954 }
955 lck_mtx_unlock(lck: &iostate->io_mtxp);
956
957 if (need_wakeup) {
958 wakeup(chan: (caddr_t)&iostate->io_wanted);
959 }
960 }
961
962 if (b_flags & B_COMMIT_UPL) {
963 pg_offset = upl_offset & PAGE_MASK;
964 commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
965
966 if (error) {
967 upl_set_iodone_error(upl, error);
968
969 upl_flags = cluster_ioerror(upl, upl_offset: upl_offset - pg_offset, abort_size: commit_size, error, io_flags: b_flags, vp);
970 } else {
971 upl_flags = UPL_COMMIT_FREE_ON_EMPTY;
972
973 if ((b_flags & B_PHYS) && (b_flags & B_READ)) {
974 upl_flags |= UPL_COMMIT_SET_DIRTY;
975 }
976
977 if (b_flags & B_AGE) {
978 upl_flags |= UPL_COMMIT_INACTIVATE;
979 }
980
981 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
982 }
983 }
984 if (real_bp) {
985 if (error) {
986 real_bp->b_flags |= B_ERROR;
987 real_bp->b_error = error;
988 }
989 real_bp->b_resid = total_resid;
990
991 buf_biodone(bp: real_bp);
992 }
993 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
994 upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0);
995
996 return error;
997}
998
999
1000uint32_t
1001cluster_throttle_io_limit(vnode_t vp, uint32_t *limit)
1002{
1003 if (cluster_is_throttled(vp)) {
1004 *limit = calculate_max_throttle_size(vp);
1005 return 1;
1006 }
1007 return 0;
1008}
1009
1010
1011void
1012cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp)
1013{
1014 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
1015 upl_offset, size, bp, 0, 0);
1016
1017 if (bp == NULL || bp->b_datap == 0) {
1018 upl_page_info_t *pl;
1019 addr64_t zero_addr;
1020
1021 pl = ubc_upl_pageinfo(upl);
1022
1023 if (upl_device_page(upl: pl) == TRUE) {
1024 zero_addr = ((addr64_t)upl_phys_page(upl: pl, index: 0) << PAGE_SHIFT) + upl_offset;
1025
1026 bzero_phys_nc(src64: zero_addr, bytes: size);
1027 } else {
1028 while (size) {
1029 int page_offset;
1030 int page_index;
1031 int zero_cnt;
1032
1033 page_index = upl_offset / PAGE_SIZE;
1034 page_offset = upl_offset & PAGE_MASK;
1035
1036 zero_addr = ((addr64_t)upl_phys_page(upl: pl, index: page_index) << PAGE_SHIFT) + page_offset;
1037 zero_cnt = min(PAGE_SIZE - page_offset, b: size);
1038
1039 bzero_phys(phys_address: zero_addr, length: zero_cnt);
1040
1041 size -= zero_cnt;
1042 upl_offset += zero_cnt;
1043 }
1044 }
1045 } else {
1046 bzero(s: (caddr_t)((vm_offset_t)bp->b_datap + upl_offset), n: size);
1047 }
1048
1049 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
1050 upl_offset, size, 0, 0, 0);
1051}
1052
1053
1054static void
1055cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset, size_t verify_block_size)
1056{
1057 /*
1058 * We will assign a verification context to cbp_head.
1059 * This will be passed back to the filesystem when
1060 * verifying (in cluster_iodone).
1061 */
1062 if (verify_block_size) {
1063 off_t start_off = cbp_head->b_lblkno * cbp_head->b_lblksize;
1064 size_t length;
1065 void *verify_ctx = NULL;
1066 int error = 0;
1067 vnode_t vp = buf_vnode(bp: cbp_head);
1068
1069 if (cbp_head == cbp_tail) {
1070 length = cbp_head->b_bcount;
1071 } else {
1072 length = ((cbp_tail->b_lblkno * cbp_tail->b_lblksize) + cbp_tail->b_bcount) - start_off;
1073 }
1074
1075 /*
1076 * zero_offset is non zero for the transaction containing the EOF
1077 * (if the filesize is not page aligned). In that case we might
1078 * have the transaction size not be page/verify block size aligned
1079 */
1080 if ((zero_offset == 0) &&
1081 ((length < verify_block_size) || (length % verify_block_size)) != 0) {
1082 panic("%s length = %zu, verify_block_size = %zu",
1083 __FUNCTION__, length, verify_block_size);
1084 }
1085
1086 error = VNOP_VERIFY(vp, start_off, NULL, length,
1087 &verify_block_size, &verify_ctx, VNODE_VERIFY_CONTEXT_ALLOC, NULL);
1088
1089 cbp_head->b_attr.ba_verify_ctx = verify_ctx;
1090 } else {
1091 cbp_head->b_attr.ba_verify_ctx = NULL;
1092 }
1093
1094 cbp_head->b_validend = zero_offset;
1095 cbp_tail->b_flags |= B_EOT;
1096}
1097
1098static void
1099cluster_wait_IO(buf_t cbp_head, int async)
1100{
1101 buf_t cbp;
1102
1103 if (async) {
1104 /*
1105 * Async callback completion will not normally generate a
1106 * wakeup upon I/O completion. To get woken up, we set
1107 * b_trans_next (which is safe for us to modify) on the last
1108 * buffer to CLUSTER_IO_WAITING so that cluster_iodone knows
1109 * to wake us up when all buffers as part of this transaction
1110 * are completed. This is done under the umbrella of
1111 * cl_transaction_mtxp which is also taken in cluster_iodone.
1112 */
1113 bool done = true;
1114 buf_t last = NULL;
1115
1116 lck_mtx_lock_spin(lck: &cl_transaction_mtxp);
1117
1118 for (cbp = cbp_head; cbp; last = cbp, cbp = cbp->b_trans_next) {
1119 if (!ISSET(cbp->b_flags, B_TDONE)) {
1120 done = false;
1121 }
1122 }
1123
1124 if (!done) {
1125 last->b_trans_next = CLUSTER_IO_WAITING;
1126
1127 DTRACE_IO1(wait__start, buf_t, last);
1128 do {
1129 msleep(chan: last, mtx: &cl_transaction_mtxp, PSPIN | (PRIBIO + 1), wmesg: "cluster_wait_IO", NULL);
1130
1131 /*
1132 * We should only have been woken up if all the
1133 * buffers are completed, but just in case...
1134 */
1135 done = true;
1136 for (cbp = cbp_head; cbp != CLUSTER_IO_WAITING; cbp = cbp->b_trans_next) {
1137 if (!ISSET(cbp->b_flags, B_TDONE)) {
1138 done = false;
1139 break;
1140 }
1141 }
1142 } while (!done);
1143 DTRACE_IO1(wait__done, buf_t, last);
1144
1145 last->b_trans_next = NULL;
1146 }
1147
1148 lck_mtx_unlock(lck: &cl_transaction_mtxp);
1149 } else { // !async
1150 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
1151 buf_biowait(bp: cbp);
1152 }
1153 }
1154}
1155
1156static void
1157cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait)
1158{
1159 buf_t cbp;
1160 int error;
1161 boolean_t isswapout = FALSE;
1162
1163 /*
1164 * cluster_complete_transaction will
1165 * only be called if we've issued a complete chain in synchronous mode
1166 * or, we've already done a cluster_wait_IO on an incomplete chain
1167 */
1168 if (needwait) {
1169 for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
1170 buf_biowait(bp: cbp);
1171 }
1172 }
1173 /*
1174 * we've already waited on all of the I/Os in this transaction,
1175 * so mark all of the buf_t's in this transaction as B_TDONE
1176 * so that cluster_iodone sees the transaction as completed
1177 */
1178 for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
1179 cbp->b_flags |= B_TDONE;
1180 }
1181 cbp = *cbp_head;
1182
1183 if ((flags & (CL_ASYNC | CL_PAGEOUT)) == CL_PAGEOUT && vnode_isswap(vp: cbp->b_vp)) {
1184 isswapout = TRUE;
1185 }
1186
1187 error = cluster_iodone(bp: cbp, callback_arg);
1188
1189 if (!(flags & CL_ASYNC) && error && *retval == 0) {
1190 if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO)) {
1191 *retval = error;
1192 } else if (isswapout == TRUE) {
1193 *retval = error;
1194 }
1195 }
1196 *cbp_head = (buf_t)NULL;
1197}
1198
1199
1200static int
1201cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
1202 int flags, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
1203{
1204 buf_t cbp;
1205 u_int size;
1206 u_int io_size;
1207 int io_flags;
1208 int bmap_flags;
1209 int error = 0;
1210 int retval = 0;
1211 buf_t cbp_head = NULL;
1212 buf_t cbp_tail = NULL;
1213 int trans_count = 0;
1214 int max_trans_count;
1215 u_int pg_count;
1216 int pg_offset;
1217 u_int max_iosize;
1218 u_int max_vectors;
1219 int priv;
1220 int zero_offset = 0;
1221 int async_throttle = 0;
1222 mount_t mp;
1223 vm_offset_t upl_end_offset;
1224 boolean_t need_EOT = FALSE;
1225 size_t verify_block_size = 0;
1226
1227 /*
1228 * we currently don't support buffers larger than a page
1229 */
1230 if (real_bp && non_rounded_size > PAGE_SIZE) {
1231 panic("%s(): Called with real buffer of size %d bytes which "
1232 "is greater than the maximum allowed size of "
1233 "%d bytes (the system PAGE_SIZE).\n",
1234 __FUNCTION__, non_rounded_size, PAGE_SIZE);
1235 }
1236
1237 mp = vp->v_mount;
1238
1239 /*
1240 * we don't want to do any funny rounding of the size for IO requests
1241 * coming through the DIRECT or CONTIGUOUS paths... those pages don't
1242 * belong to us... we can't extend (nor do we need to) the I/O to fill
1243 * out a page
1244 */
1245 if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY | CL_DIRECT_IO))) {
1246 /*
1247 * round the requested size up so that this I/O ends on a
1248 * page boundary in case this is a 'write'... if the filesystem
1249 * has blocks allocated to back the page beyond the EOF, we want to
1250 * make sure to write out the zero's that are sitting beyond the EOF
1251 * so that in case the filesystem doesn't explicitly zero this area
1252 * if a hole is created via a lseek/write beyond the current EOF,
1253 * it will return zeros when it's read back from the disk. If the
1254 * physical allocation doesn't extend for the whole page, we'll
1255 * only write/read from the disk up to the end of this allocation
1256 * via the extent info returned from the VNOP_BLOCKMAP call.
1257 */
1258 pg_offset = upl_offset & PAGE_MASK;
1259
1260 size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
1261 } else {
1262 /*
1263 * anyone advertising a blocksize of 1 byte probably
1264 * can't deal with us rounding up the request size
1265 * AFP is one such filesystem/device
1266 */
1267 size = non_rounded_size;
1268 }
1269 upl_end_offset = upl_offset + size;
1270
1271 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);
1272
1273 /*
1274 * Set the maximum transaction size to the maximum desired number of
1275 * buffers.
1276 */
1277 max_trans_count = 8;
1278 if (flags & CL_DEV_MEMORY) {
1279 max_trans_count = 16;
1280 }
1281
1282 if (flags & CL_READ) {
1283 io_flags = B_READ;
1284 bmap_flags = VNODE_READ;
1285
1286 max_iosize = mp->mnt_maxreadcnt;
1287 max_vectors = mp->mnt_segreadcnt;
1288
1289 if ((flags & CL_PAGEIN) && /* Cluster layer verification will be limited to pagein for now */
1290 !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
1291 (VNOP_VERIFY(vp, f_offset, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL) == 0) &&
1292 verify_block_size) {
1293 if (verify_block_size != PAGE_SIZE) {
1294 verify_block_size = 0;
1295 }
1296 if (real_bp && verify_block_size) {
1297 panic("%s(): Called with real buffer and needs verification ",
1298 __FUNCTION__);
1299 }
1300 }
1301 } else {
1302 io_flags = B_WRITE;
1303 bmap_flags = VNODE_WRITE;
1304
1305 max_iosize = mp->mnt_maxwritecnt;
1306 max_vectors = mp->mnt_segwritecnt;
1307 }
1308 if (verify_block_size) {
1309 bmap_flags |= VNODE_CLUSTER_VERIFY;
1310 }
1311 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);
1312
1313 /*
1314 * make sure the maximum iosize is a
1315 * multiple of the page size
1316 */
1317 max_iosize &= ~PAGE_MASK;
1318
1319 /*
1320 * Ensure the maximum iosize is sensible.
1321 */
1322 if (!max_iosize) {
1323 max_iosize = PAGE_SIZE;
1324 }
1325
1326 if (flags & CL_THROTTLE) {
1327 if (!(flags & CL_PAGEOUT) && cluster_is_throttled(vp)) {
1328 uint32_t max_throttle_size = calculate_max_throttle_size(vp);
1329
1330 if (max_iosize > max_throttle_size) {
1331 max_iosize = max_throttle_size;
1332 }
1333 async_throttle = calculate_max_throttle_cnt(vp);
1334 } else {
1335 if ((flags & CL_DEV_MEMORY)) {
1336 async_throttle = IO_SCALE(vp, VNODE_ASYNC_THROTTLE);
1337 } else {
1338 u_int max_cluster;
1339 u_int max_cluster_size;
1340 u_int scale;
1341
1342 if (vp->v_mount->mnt_minsaturationbytecount) {
1343 max_cluster_size = vp->v_mount->mnt_minsaturationbytecount;
1344
1345 scale = 1;
1346 } else {
1347 max_cluster_size = MAX_CLUSTER_SIZE(vp);
1348
1349 if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
1350 scale = WRITE_THROTTLE_SSD;
1351 } else {
1352 scale = WRITE_THROTTLE;
1353 }
1354 }
1355 if (max_iosize > max_cluster_size) {
1356 max_cluster = max_cluster_size;
1357 } else {
1358 max_cluster = max_iosize;
1359 }
1360
1361 if (size < max_cluster) {
1362 max_cluster = size;
1363 }
1364
1365 if (flags & CL_CLOSE) {
1366 scale += MAX_CLUSTERS;
1367 }
1368
1369 async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), b: ((scale * max_cluster_size) / max_cluster) - 1);
1370 }
1371 }
1372 }
1373 if (flags & CL_AGE) {
1374 io_flags |= B_AGE;
1375 }
1376 if (flags & (CL_PAGEIN | CL_PAGEOUT)) {
1377 io_flags |= B_PAGEIO;
1378 }
1379 if (flags & (CL_IOSTREAMING)) {
1380 io_flags |= B_IOSTREAMING;
1381 }
1382 if (flags & CL_COMMIT) {
1383 io_flags |= B_COMMIT_UPL;
1384 }
1385 if (flags & CL_DIRECT_IO) {
1386 io_flags |= B_PHYS;
1387 }
1388 if (flags & (CL_PRESERVE | CL_KEEPCACHED)) {
1389 io_flags |= B_CACHE;
1390 }
1391 if (flags & CL_PASSIVE) {
1392 io_flags |= B_PASSIVE;
1393 }
1394 if (flags & CL_ENCRYPTED) {
1395 io_flags |= B_ENCRYPTED_IO;
1396 }
1397
1398 if (vp->v_flag & VSYSTEM) {
1399 io_flags |= B_META;
1400 }
1401
1402 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
1403 /*
1404 * then we are going to end up
1405 * with a page that we can't complete (the file size wasn't a multiple
1406 * of PAGE_SIZE and we're trying to read to the end of the file
1407 * so we'll go ahead and zero out the portion of the page we can't
1408 * read in from the file
1409 */
1410 zero_offset = (int)(upl_offset + non_rounded_size);
1411 } else if (!ISSET(flags, CL_READ) && ISSET(flags, CL_DIRECT_IO)) {
1412 assert(ISSET(flags, CL_COMMIT));
1413
1414 // For a direct/uncached write, we need to lock pages...
1415
1416 upl_t cached_upl;
1417
1418 /*
1419 * Create a UPL to lock the pages in the cache whilst the
1420 * write is in progress.
1421 */
1422 ubc_create_upl_kernel(vp, f_offset, non_rounded_size, &cached_upl,
1423 NULL, UPL_SET_LITE, VM_KERN_MEMORY_FILE);
1424
1425 /*
1426 * Attach this UPL to the other UPL so that we can find it
1427 * later.
1428 */
1429 upl_set_associated_upl(upl, associated_upl: cached_upl);
1430
1431 if (upl_offset & PAGE_MASK) {
1432 /*
1433 * The two UPLs are not aligned, so mark the first page in
1434 * @upl so that cluster_handle_associated_upl can handle
1435 * it accordingly.
1436 */
1437 upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1438 upl_page_set_mark(upl: pl, index: 0, true);
1439 }
1440 }
1441
1442 while (size) {
1443 daddr64_t blkno;
1444 daddr64_t lblkno;
1445 size_t io_size_tmp;
1446 u_int io_size_wanted;
1447 uint32_t lblksize;
1448
1449 if (size > max_iosize) {
1450 io_size = max_iosize;
1451 } else {
1452 io_size = size;
1453 }
1454
1455 io_size_wanted = io_size;
1456 io_size_tmp = (size_t)io_size;
1457
1458 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, &io_size_tmp, NULL, bmap_flags, NULL))) {
1459 break;
1460 }
1461
1462 if (io_size_tmp > io_size_wanted) {
1463 io_size = io_size_wanted;
1464 } else {
1465 io_size = (u_int)io_size_tmp;
1466 }
1467
1468 if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
1469 real_bp->b_blkno = blkno;
1470 }
1471
1472 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
1473 (int)f_offset, (int)(blkno >> 32), (int)blkno, io_size, 0);
1474
1475 if (io_size == 0) {
1476 /*
1477 * vnop_blockmap didn't return an error... however, it did
1478 * return an extent size of 0 which means we can't
1479 * make forward progress on this I/O... a hole in the
1480 * file would be returned as a blkno of -1 with a non-zero io_size
1481 * a real extent is returned with a blkno != -1 and a non-zero io_size
1482 */
1483 error = EINVAL;
1484 break;
1485 }
1486 if (!(flags & CL_READ) && blkno == -1) {
1487 off_t e_offset;
1488 int pageout_flags;
1489
1490 if (upl_get_internal_vectorupl(upl)) {
1491 panic("Vector UPLs should not take this code-path");
1492 }
1493 /*
1494 * we're writing into a 'hole'
1495 */
1496 if (flags & CL_PAGEOUT) {
1497 /*
1498 * if we got here via cluster_pageout
1499 * then just error the request and return
1500 * the 'hole' should already have been covered
1501 */
1502 error = EINVAL;
1503 break;
1504 }
1505 /*
1506 * we can get here if the cluster code happens to
1507 * pick up a page that was dirtied via mmap vs
1508 * a 'write' and the page targets a 'hole'...
1509 * i.e. the writes to the cluster were sparse
1510 * and the file was being written for the first time
1511 *
1512 * we can also get here if the filesystem supports
1513 * 'holes' that are less than PAGE_SIZE.... because
1514 * we can't know if the range in the page that covers
1515 * the 'hole' has been dirtied via an mmap or not,
1516 * we have to assume the worst and try to push the
1517 * entire page to storage.
1518 *
1519 * Try paging out the page individually before
1520 * giving up entirely and dumping it (the pageout
1521 * path will insure that the zero extent accounting
1522 * has been taken care of before we get back into cluster_io)
1523 *
1524 * go direct to vnode_pageout so that we don't have to
1525 * unbusy the page from the UPL... we used to do this
1526 * so that we could call ubc_msync, but that results
1527 * in a potential deadlock if someone else races us to acquire
1528 * that page and wins and in addition needs one of the pages
1529 * we're continuing to hold in the UPL
1530 */
1531 pageout_flags = UPL_MSYNC | UPL_VNODE_PAGER | UPL_NESTED_PAGEOUT;
1532
1533 if (!(flags & CL_ASYNC)) {
1534 pageout_flags |= UPL_IOSYNC;
1535 }
1536 if (!(flags & CL_COMMIT)) {
1537 pageout_flags |= UPL_NOCOMMIT;
1538 }
1539
1540 if (cbp_head) {
1541 buf_t prev_cbp;
1542 uint32_t bytes_in_last_page;
1543
1544 /*
1545 * first we have to wait for the the current outstanding I/Os
1546 * to complete... EOT hasn't been set yet on this transaction
1547 * so the pages won't be released
1548 */
1549 cluster_wait_IO(cbp_head, async: (flags & CL_ASYNC));
1550
1551 bytes_in_last_page = cbp_head->b_uploffset & PAGE_MASK;
1552 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
1553 bytes_in_last_page += cbp->b_bcount;
1554 }
1555 bytes_in_last_page &= PAGE_MASK;
1556
1557 while (bytes_in_last_page) {
1558 /*
1559 * we've got a transcation that
1560 * includes the page we're about to push out through vnode_pageout...
1561 * find the bp's in the list which intersect this page and either
1562 * remove them entirely from the transaction (there could be multiple bp's), or
1563 * round it's iosize down to the page boundary (there can only be one)...
1564 *
1565 * find the last bp in the list and act on it
1566 */
1567 for (prev_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next) {
1568 prev_cbp = cbp;
1569 }
1570
1571 if (bytes_in_last_page >= cbp->b_bcount) {
1572 /*
1573 * this buf no longer has any I/O associated with it
1574 */
1575 bytes_in_last_page -= cbp->b_bcount;
1576 cbp->b_bcount = 0;
1577
1578 free_io_buf(cbp);
1579
1580 if (cbp == cbp_head) {
1581 assert(bytes_in_last_page == 0);
1582 /*
1583 * the buf we just freed was the only buf in
1584 * this transaction... so there's no I/O to do
1585 */
1586 cbp_head = NULL;
1587 cbp_tail = NULL;
1588 } else {
1589 /*
1590 * remove the buf we just freed from
1591 * the transaction list
1592 */
1593 prev_cbp->b_trans_next = NULL;
1594 cbp_tail = prev_cbp;
1595 }
1596 } else {
1597 /*
1598 * this is the last bp that has I/O
1599 * intersecting the page of interest
1600 * only some of the I/O is in the intersection
1601 * so clip the size but keep it in the transaction list
1602 */
1603 cbp->b_bcount -= bytes_in_last_page;
1604 cbp_tail = cbp;
1605 bytes_in_last_page = 0;
1606 }
1607 }
1608 if (cbp_head) {
1609 /*
1610 * there was more to the current transaction
1611 * than just the page we are pushing out via vnode_pageout...
1612 * mark it as finished and complete it... we've already
1613 * waited for the I/Os to complete above in the call to cluster_wait_IO
1614 */
1615 cluster_EOT(cbp_head, cbp_tail, zero_offset: 0, verify_block_size: 0);
1616
1617 cluster_complete_transaction(cbp_head: &cbp_head, callback_arg, retval: &retval, flags, needwait: 0);
1618
1619 trans_count = 0;
1620 }
1621 }
1622 if (vnode_pageout(vp, upl, (upl_offset_t)trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) {
1623 error = EINVAL;
1624 }
1625 e_offset = round_page_64(x: f_offset + 1);
1626 io_size = (u_int)(e_offset - f_offset);
1627
1628 f_offset += io_size;
1629 upl_offset += io_size;
1630
1631 if (size >= io_size) {
1632 size -= io_size;
1633 } else {
1634 size = 0;
1635 }
1636 /*
1637 * keep track of how much of the original request
1638 * that we've actually completed... non_rounded_size
1639 * may go negative due to us rounding the request
1640 * to a page size multiple (i.e. size > non_rounded_size)
1641 */
1642 non_rounded_size -= io_size;
1643
1644 if (non_rounded_size <= 0) {
1645 /*
1646 * we've transferred all of the data in the original
1647 * request, but we were unable to complete the tail
1648 * of the last page because the file didn't have
1649 * an allocation to back that portion... this is ok.
1650 */
1651 size = 0;
1652 }
1653 if (error) {
1654 if (size == 0) {
1655 flags &= ~CL_COMMIT;
1656 }
1657 break;
1658 }
1659 continue;
1660 }
1661
1662 lblksize = CLUSTER_IO_BLOCK_SIZE;
1663 lblkno = (daddr64_t)(f_offset / lblksize);
1664
1665 /*
1666 * we have now figured out how much I/O we can do - this is in 'io_size'
1667 * pg_offset is the starting point in the first page for the I/O
1668 * pg_count is the number of full and partial pages that 'io_size' encompasses
1669 */
1670 pg_offset = upl_offset & PAGE_MASK;
1671
1672 if (flags & CL_DEV_MEMORY) {
1673 /*
1674 * treat physical requests as one 'giant' page
1675 */
1676 pg_count = 1;
1677 } else {
1678 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
1679 }
1680
1681 if ((flags & CL_READ) && blkno == -1) {
1682 vm_offset_t commit_offset;
1683 int bytes_to_zero;
1684 int complete_transaction_now = 0;
1685
1686 /*
1687 * if we're reading and blkno == -1, then we've got a
1688 * 'hole' in the file that we need to deal with by zeroing
1689 * out the affected area in the upl
1690 */
1691 if (io_size >= (u_int)non_rounded_size) {
1692 /*
1693 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
1694 * than 'zero_offset' will be non-zero
1695 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
1696 * (indicated by the io_size finishing off the I/O request for this UPL)
1697 * than we're not going to issue an I/O for the
1698 * last page in this upl... we need to zero both the hole and the tail
1699 * of the page beyond the EOF, since the delayed zero-fill won't kick in
1700 */
1701 bytes_to_zero = non_rounded_size;
1702 if (!(flags & CL_NOZERO)) {
1703 bytes_to_zero = (int)((((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset);
1704 }
1705
1706 zero_offset = 0;
1707 } else {
1708 bytes_to_zero = io_size;
1709 }
1710
1711 pg_count = 0;
1712
1713 cluster_zero(upl, upl_offset: (upl_offset_t)upl_offset, size: bytes_to_zero, bp: real_bp);
1714
1715 if (cbp_head) {
1716 int pg_resid;
1717
1718 /*
1719 * if there is a current I/O chain pending
1720 * then the first page of the group we just zero'd
1721 * will be handled by the I/O completion if the zero
1722 * fill started in the middle of the page
1723 */
1724 commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1725
1726 pg_resid = (int)(commit_offset - upl_offset);
1727
1728 if (bytes_to_zero >= pg_resid) {
1729 /*
1730 * the last page of the current I/O
1731 * has been completed...
1732 * compute the number of fully zero'd
1733 * pages that are beyond it
1734 * plus the last page if its partial
1735 * and we have no more I/O to issue...
1736 * otherwise a partial page is left
1737 * to begin the next I/O
1738 */
1739 if ((int)io_size >= non_rounded_size) {
1740 pg_count = (bytes_to_zero - pg_resid + (PAGE_SIZE - 1)) / PAGE_SIZE;
1741 } else {
1742 pg_count = (bytes_to_zero - pg_resid) / PAGE_SIZE;
1743 }
1744
1745 complete_transaction_now = 1;
1746 }
1747 } else {
1748 /*
1749 * no pending I/O to deal with
1750 * so, commit all of the fully zero'd pages
1751 * plus the last page if its partial
1752 * and we have no more I/O to issue...
1753 * otherwise a partial page is left
1754 * to begin the next I/O
1755 */
1756 if ((int)io_size >= non_rounded_size) {
1757 pg_count = (pg_offset + bytes_to_zero + (PAGE_SIZE - 1)) / PAGE_SIZE;
1758 } else {
1759 pg_count = (pg_offset + bytes_to_zero) / PAGE_SIZE;
1760 }
1761
1762 commit_offset = upl_offset & ~PAGE_MASK;
1763 }
1764
1765 // Associated UPL is currently only used in the direct write path
1766 assert(!upl_associated_upl(upl));
1767
1768 if ((flags & CL_COMMIT) && pg_count) {
1769 ubc_upl_commit_range(upl, (upl_offset_t)commit_offset,
1770 pg_count * PAGE_SIZE,
1771 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
1772 }
1773 upl_offset += io_size;
1774 f_offset += io_size;
1775 size -= io_size;
1776
1777 /*
1778 * keep track of how much of the original request
1779 * we've actually completed... non_rounded_size
1780 * may go negative due to us rounding the request
1781 * to a page size multiple (i.e. size > non_rounded_size)
1782 */
1783 non_rounded_size -= io_size;
1784
1785 if (non_rounded_size <= 0) {
1786 /*
1787 * we've transferred all of the data in the original
1788 * request, but we were unable to complete the tail
1789 * of the last page because the file didn't have
1790 * an allocation to back that portion... this is ok.
1791 */
1792 size = 0;
1793 }
1794 if (cbp_head && (complete_transaction_now || size == 0)) {
1795 cluster_wait_IO(cbp_head, async: (flags & CL_ASYNC));
1796
1797 cluster_EOT(cbp_head, cbp_tail, zero_offset: size == 0 ? zero_offset : 0, verify_block_size);
1798
1799 cluster_complete_transaction(cbp_head: &cbp_head, callback_arg, retval: &retval, flags, needwait: 0);
1800
1801 trans_count = 0;
1802 }
1803 continue;
1804 }
1805 if (pg_count > max_vectors) {
1806 if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
1807 io_size = PAGE_SIZE - pg_offset;
1808 pg_count = 1;
1809 } else {
1810 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
1811 pg_count = max_vectors;
1812 }
1813 }
1814 /*
1815 * If the transaction is going to reach the maximum number of
1816 * desired elements, truncate the i/o to the nearest page so
1817 * that the actual i/o is initiated after this buffer is
1818 * created and added to the i/o chain.
1819 *
1820 * I/O directed to physically contiguous memory
1821 * doesn't have a requirement to make sure we 'fill' a page
1822 */
1823 if (!(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count &&
1824 ((upl_offset + io_size) & PAGE_MASK)) {
1825 vm_offset_t aligned_ofs;
1826
1827 aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK;
1828 /*
1829 * If the io_size does not actually finish off even a
1830 * single page we have to keep adding buffers to the
1831 * transaction despite having reached the desired limit.
1832 *
1833 * Eventually we get here with the page being finished
1834 * off (and exceeded) and then we truncate the size of
1835 * this i/o request so that it is page aligned and
1836 * we can finally issue the i/o on the transaction.
1837 */
1838 if (aligned_ofs > upl_offset) {
1839 io_size = (u_int)(aligned_ofs - upl_offset);
1840 pg_count--;
1841 }
1842 }
1843
1844 if (!(mp->mnt_kern_flag & MNTK_VIRTUALDEV)) {
1845 /*
1846 * if we're not targeting a virtual device i.e. a disk image
1847 * it's safe to dip into the reserve pool since real devices
1848 * can complete this I/O request without requiring additional
1849 * bufs from the alloc_io_buf pool
1850 */
1851 priv = 1;
1852 } else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT) && !cbp_head) {
1853 /*
1854 * Throttle the speculative IO
1855 *
1856 * We can only throttle this if it is the first iobuf
1857 * for the transaction. alloc_io_buf implements
1858 * additional restrictions for disk images anyway.
1859 */
1860 priv = 0;
1861 } else {
1862 priv = 1;
1863 }
1864
1865 cbp = alloc_io_buf(vp, priv);
1866
1867 if (flags & CL_PAGEOUT) {
1868 u_int i;
1869
1870 /*
1871 * since blocks are addressed in units of lblksize (CLUSTER_IO_BLOCK_SIZE), scale
1872 * the iteration to cover (PAGE_SIZE * pg_count) bytes worth of blocks.
1873 */
1874 for (i = 0; i < (PAGE_SIZE * pg_count) / lblksize; i++) {
1875 if (buf_invalblkno(vp, lblkno: lblkno + i, flags: 0) == EBUSY) {
1876 panic("BUSY bp found in cluster_io");
1877 }
1878 }
1879 }
1880 if (flags & CL_ASYNC) {
1881 if (buf_setcallback(bp: cbp, callback: (void *)cluster_iodone, transaction: callback_arg)) {
1882 panic("buf_setcallback failed");
1883 }
1884 }
1885 cbp->b_cliodone = (void *)callback;
1886 cbp->b_flags |= io_flags;
1887 if (flags & CL_NOCACHE) {
1888 cbp->b_attr.ba_flags |= BA_NOCACHE;
1889 }
1890 if (verify_block_size) {
1891 cbp->b_attr.ba_flags |= BA_WILL_VERIFY;
1892 }
1893
1894 cbp->b_lblkno = lblkno;
1895 cbp->b_lblksize = lblksize;
1896 cbp->b_blkno = blkno;
1897 cbp->b_bcount = io_size;
1898
1899 if (buf_setupl(bp: cbp, upl, offset: (uint32_t)upl_offset)) {
1900 panic("buf_setupl failed");
1901 }
1902#if CONFIG_IOSCHED
1903 upl_set_blkno(upl, upl_offset, size: io_size, blkno);
1904#endif
1905 cbp->b_trans_next = (buf_t)NULL;
1906
1907 if ((cbp->b_iostate = (void *)iostate)) {
1908 /*
1909 * caller wants to track the state of this
1910 * io... bump the amount issued against this stream
1911 */
1912 iostate->io_issued += io_size;
1913 }
1914
1915 if (flags & CL_READ) {
1916 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
1917 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
1918 } else {
1919 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
1920 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
1921 }
1922
1923 if (cbp_head) {
1924 cbp_tail->b_trans_next = cbp;
1925 cbp_tail = cbp;
1926 } else {
1927 cbp_head = cbp;
1928 cbp_tail = cbp;
1929
1930 if ((cbp_head->b_real_bp = real_bp)) {
1931 real_bp = (buf_t)NULL;
1932 }
1933 }
1934 *(buf_t *)(&cbp->b_trans_head) = cbp_head;
1935
1936 trans_count++;
1937
1938 upl_offset += io_size;
1939 f_offset += io_size;
1940 size -= io_size;
1941 /*
1942 * keep track of how much of the original request
1943 * we've actually completed... non_rounded_size
1944 * may go negative due to us rounding the request
1945 * to a page size multiple (i.e. size > non_rounded_size)
1946 */
1947 non_rounded_size -= io_size;
1948
1949 if (non_rounded_size <= 0) {
1950 /*
1951 * we've transferred all of the data in the original
1952 * request, but we were unable to complete the tail
1953 * of the last page because the file didn't have
1954 * an allocation to back that portion... this is ok.
1955 */
1956 size = 0;
1957 }
1958 if (size == 0) {
1959 /*
1960 * we have no more I/O to issue, so go
1961 * finish the final transaction
1962 */
1963 need_EOT = TRUE;
1964 } else if (((flags & CL_DEV_MEMORY) || (upl_offset & PAGE_MASK) == 0) &&
1965 ((flags & CL_ASYNC) || trans_count > max_trans_count)) {
1966 /*
1967 * I/O directed to physically contiguous memory...
1968 * which doesn't have a requirement to make sure we 'fill' a page
1969 * or...
1970 * the current I/O we've prepared fully
1971 * completes the last page in this request
1972 * and ...
1973 * it's either an ASYNC request or
1974 * we've already accumulated more than max_trans_count I/Os into
1975 * this transaction so mark it as complete so that
1976 * it can finish asynchronously or via the cluster_complete_transaction
1977 * below if the request is synchronous
1978 */
1979 need_EOT = TRUE;
1980 }
1981 if (need_EOT == TRUE) {
1982 cluster_EOT(cbp_head, cbp_tail, zero_offset: size == 0 ? zero_offset : 0, verify_block_size);
1983 }
1984
1985 if (flags & CL_THROTTLE) {
1986 (void)vnode_waitforwrites(vp, output_target: async_throttle, slpflag: 0, slptimeout: 0, msg: "cluster_io");
1987 }
1988
1989 if (!(io_flags & B_READ)) {
1990 vnode_startwrite(vp);
1991 }
1992
1993 if (flags & CL_RAW_ENCRYPTED) {
1994 /*
1995 * User requested raw encrypted bytes.
1996 * Twiddle the bit in the ba_flags for the buffer
1997 */
1998 cbp->b_attr.ba_flags |= BA_RAW_ENCRYPTED_IO;
1999 }
2000
2001 (void) VNOP_STRATEGY(bp: cbp);
2002
2003 if (need_EOT == TRUE) {
2004 if (!(flags & CL_ASYNC)) {
2005 cluster_complete_transaction(cbp_head: &cbp_head, callback_arg, retval: &retval, flags, needwait: 1);
2006 }
2007
2008 need_EOT = FALSE;
2009 trans_count = 0;
2010 cbp_head = NULL;
2011 }
2012 }
2013 if (error) {
2014 int abort_size;
2015
2016 io_size = 0;
2017
2018 if (cbp_head) {
2019 /*
2020 * Wait until all of the outstanding I/O
2021 * for this partial transaction has completed
2022 */
2023 cluster_wait_IO(cbp_head, async: (flags & CL_ASYNC));
2024
2025 /*
2026 * Rewind the upl offset to the beginning of the
2027 * transaction.
2028 */
2029 upl_offset = cbp_head->b_uploffset;
2030 }
2031
2032 if (ISSET(flags, CL_COMMIT)) {
2033 cluster_handle_associated_upl(iostate, upl,
2034 upl_offset: (upl_offset_t)upl_offset,
2035 size: (upl_size_t)(upl_end_offset - upl_offset));
2036 }
2037
2038 // Free all the IO buffers in this transaction
2039 for (cbp = cbp_head; cbp;) {
2040 buf_t cbp_next;
2041
2042 size += cbp->b_bcount;
2043 io_size += cbp->b_bcount;
2044
2045 cbp_next = cbp->b_trans_next;
2046 free_io_buf(cbp);
2047 cbp = cbp_next;
2048 }
2049
2050 if (iostate) {
2051 int need_wakeup = 0;
2052
2053 /*
2054 * update the error condition for this stream...
2055 * since we never really issued the I/O,
2056 * just go ahead and adjust io_issued back down
2057 */
2058 lck_mtx_lock_spin(lck: &iostate->io_mtxp);
2059
2060 if (iostate->io_error == 0) {
2061 iostate->io_error = error;
2062 }
2063 iostate->io_issued -= io_size;
2064
2065 if (iostate->io_wanted) {
2066 /*
2067 * someone is waiting for the state of
2068 * this io stream to change
2069 */
2070 iostate->io_wanted = 0;
2071 need_wakeup = 1;
2072 }
2073 lck_mtx_unlock(lck: &iostate->io_mtxp);
2074
2075 if (need_wakeup) {
2076 wakeup(chan: (caddr_t)&iostate->io_wanted);
2077 }
2078 }
2079
2080 if (flags & CL_COMMIT) {
2081 int upl_flags;
2082
2083 pg_offset = upl_offset & PAGE_MASK;
2084 abort_size = (int)((upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK);
2085
2086 upl_flags = cluster_ioerror(upl, upl_offset: (int)(upl_offset - pg_offset),
2087 abort_size, error, io_flags, vp);
2088
2089 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
2090 upl, upl_offset - pg_offset, abort_size, (error << 24) | upl_flags, 0);
2091 }
2092 if (retval == 0) {
2093 retval = error;
2094 }
2095 } else if (cbp_head) {
2096 panic("%s(): cbp_head is not NULL.", __FUNCTION__);
2097 }
2098
2099 if (real_bp) {
2100 /*
2101 * can get here if we either encountered an error
2102 * or we completely zero-filled the request and
2103 * no I/O was issued
2104 */
2105 if (error) {
2106 real_bp->b_flags |= B_ERROR;
2107 real_bp->b_error = error;
2108 }
2109 buf_biodone(bp: real_bp);
2110 }
2111 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END, (int)f_offset, size, upl_offset, retval, 0);
2112
2113 return retval;
2114}
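
/*
 * Illustrative sketch of the page accounting used in cluster_io above:
 * 'upl_offset' is reduced to the starting offset within its page, and the
 * transfer size is converted into the number of full and partial pages it
 * touches (physical CL_DEV_MEMORY requests are treated as one 'giant' page
 * and bypass this math).  The helper name is hypothetical.
 */
static __unused int
cluster_io_example_pg_count(vm_offset_t upl_offset, u_int io_size)
{
	vm_offset_t pg_offset;

	/* offset of the first byte of the transfer within its page */
	pg_offset = upl_offset & PAGE_MASK;

	/* number of full and partial pages that 'io_size' encompasses */
	return (int)((io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE);
}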
2115
2116#define reset_vector_run_state() \
2117 issueVectorUPL = vector_upl_offset = vector_upl_index = vector_upl_iosize = vector_upl_size = 0;
2118
2119static int
2120vector_cluster_io(vnode_t vp, upl_t vector_upl, vm_offset_t vector_upl_offset, off_t v_upl_uio_offset, int vector_upl_iosize,
2121 int io_flag, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
2122{
2123 vector_upl_set_pagelist(vector_upl);
2124
2125 if (io_flag & CL_READ) {
2126 if (vector_upl_offset == 0 && ((vector_upl_iosize & PAGE_MASK) == 0)) {
2127 io_flag &= ~CL_PRESERVE; /* don't zero fill */
2128 } else {
2129 io_flag |= CL_PRESERVE; /* zero fill */
2130 }
2131 }
2132 return cluster_io(vp, upl: vector_upl, upl_offset: vector_upl_offset, f_offset: v_upl_uio_offset, non_rounded_size: vector_upl_iosize, flags: io_flag, real_bp, iostate, callback, callback_arg);
2133}
2134
2135static int
2136cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
2137{
2138 int pages_in_prefetch;
2139
2140 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
2141 (int)f_offset, size, (int)filesize, 0, 0);
2142
2143 if (f_offset >= filesize) {
2144 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
2145 (int)f_offset, 0, 0, 0, 0);
2146 return 0;
2147 }
2148 if ((off_t)size > (filesize - f_offset)) {
2149 size = (u_int)(filesize - f_offset);
2150 }
2151 pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
2152
2153 advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag);
2154
2155 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
2156 (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
2157
2158 return pages_in_prefetch;
2159}
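
/*
 * Illustrative sketch of the sizing done in cluster_read_prefetch above:
 * the requested prefetch is clipped at end-of-file and then expressed as a
 * whole number of pages.  The helper name is hypothetical.
 */
static __unused int
cluster_read_prefetch_example_pages(off_t f_offset, u_int size, off_t filesize)
{
	if (f_offset >= filesize) {
		/* nothing to prefetch beyond EOF */
		return 0;
	}
	if ((off_t)size > (filesize - f_offset)) {
		/* clip the request at EOF */
		size = (u_int)(filesize - f_offset);
	}
	/* round up to a whole number of pages */
	return (int)((size + (PAGE_SIZE - 1)) / PAGE_SIZE);
}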
2160
2161
2162
2163static void
2164cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap, int (*callback)(buf_t, void *), void *callback_arg,
2165 int bflag)
2166{
2167 daddr64_t r_addr;
2168 off_t f_offset;
2169 int size_of_prefetch;
2170 u_int max_prefetch;
2171
2172
2173 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
2174 (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);
2175
2176 if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
2177 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2178 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
2179 return;
2180 }
2181 if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) {
2182 rap->cl_ralen = 0;
2183 rap->cl_maxra = 0;
2184
2185 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2186 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
2187
2188 return;
2189 }
2190
2191 max_prefetch = cluster_max_prefetch(vp,
2192 max_io_size: cluster_max_io_size(mp: vp->v_mount, CL_READ), prefetch_limit: speculative_prefetch_max);
2193
2194 if (max_prefetch <= PAGE_SIZE) {
2195 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2196 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0);
2197 return;
2198 }
2199 if (extent->e_addr < rap->cl_maxra && rap->cl_ralen >= 4) {
2200 if ((rap->cl_maxra - extent->e_addr) > (rap->cl_ralen / 4)) {
2201 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2202 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
2203 return;
2204 }
2205 }
2206 r_addr = MAX(extent->e_addr, rap->cl_maxra) + 1;
2207 f_offset = (off_t)(r_addr * PAGE_SIZE_64);
2208
2209 size_of_prefetch = 0;
2210
2211 ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
2212
2213 if (size_of_prefetch) {
2214 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2215 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
2216 return;
2217 }
2218 if (f_offset < filesize) {
2219 daddr64_t read_size;
2220
2221 rap->cl_ralen = rap->cl_ralen ? min(a: max_prefetch / PAGE_SIZE, b: rap->cl_ralen << 1) : 1;
2222
2223 read_size = (extent->e_addr + 1) - extent->b_addr;
2224
2225 if (read_size > rap->cl_ralen) {
2226 if (read_size > max_prefetch / PAGE_SIZE) {
2227 rap->cl_ralen = max_prefetch / PAGE_SIZE;
2228 } else {
2229 rap->cl_ralen = (int)read_size;
2230 }
2231 }
2232 size_of_prefetch = cluster_read_prefetch(vp, f_offset, size: rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag);
2233
2234 if (size_of_prefetch) {
2235 rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
2236 }
2237 }
2238 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2239 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
2240}
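
/*
 * Illustrative sketch of the read-ahead window growth used in
 * cluster_read_ahead above: the window starts at a single page on the
 * first sequential hit and doubles on each subsequent one, capped at the
 * prefetch maximum.  The helper name and parameters are hypothetical.
 */
static __unused int
cluster_read_ahead_example_window(int cur_ralen, u_int max_prefetch)
{
	int max_pages = (int)(max_prefetch / PAGE_SIZE);

	if (cur_ralen == 0) {
		/* first sequential read detected... start with one page */
		return 1;
	}
	if ((cur_ralen << 1) > max_pages) {
		/* clamp the doubled window to the prefetch ceiling */
		return max_pages;
	}
	return cur_ralen << 1;
}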
2241
2242
2243int
2244cluster_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2245 int size, off_t filesize, int flags)
2246{
2247 return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
2248}
2249
2250
2251int
2252cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2253 int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
2254{
2255 int io_size;
2256 int rounded_size;
2257 off_t max_size;
2258 int local_flags;
2259
2260 local_flags = CL_PAGEOUT | CL_THROTTLE;
2261
2262 if ((flags & UPL_IOSYNC) == 0) {
2263 local_flags |= CL_ASYNC;
2264 }
2265 if ((flags & UPL_NOCOMMIT) == 0) {
2266 local_flags |= CL_COMMIT;
2267 }
2268 if ((flags & UPL_KEEPCACHED)) {
2269 local_flags |= CL_KEEPCACHED;
2270 }
2271 if (flags & UPL_PAGING_ENCRYPTED) {
2272 local_flags |= CL_ENCRYPTED;
2273 }
2274
2275
2276 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
2277 (int)f_offset, size, (int)filesize, local_flags, 0);
2278
2279 /*
2280 * If they didn't specify any I/O, then we are done...
2281 * we can't issue an abort because we don't know how
2282 * big the upl really is
2283 */
2284 if (size <= 0) {
2285 return EINVAL;
2286 }
2287
2288 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
2289 if (local_flags & CL_COMMIT) {
2290 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
2291 }
2292 return EROFS;
2293 }
2294 /*
2295 * can't page-out from a negative offset
2296 * or if we're starting beyond the EOF
2297 * or if the file offset isn't page aligned
2298 * or the size requested isn't a multiple of PAGE_SIZE
2299 */
2300 if (f_offset < 0 || f_offset >= filesize ||
2301 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
2302 if (local_flags & CL_COMMIT) {
2303 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
2304 }
2305 return EINVAL;
2306 }
2307 max_size = filesize - f_offset;
2308
2309 if (size < max_size) {
2310 io_size = size;
2311 } else {
2312 io_size = (int)max_size;
2313 }
2314
2315 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2316
2317 if (size > rounded_size) {
2318 if (local_flags & CL_COMMIT) {
2319 ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
2320 UPL_ABORT_FREE_ON_EMPTY);
2321 }
2322 }
2323 return cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size: io_size,
2324 flags: local_flags, real_bp: (buf_t)NULL, iostate: (struct clios *)NULL, callback, callback_arg);
2325}
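
/*
 * Illustrative sketch of the EOF clipping performed in cluster_pageout_ext
 * above: only the portion of the request that lies within the file is
 * issued, that portion is rounded up to a page boundary, and any pages of
 * the upl beyond the rounded size carry no file data and can be aborted.
 * The helper name is hypothetical.
 */
static __unused int
cluster_pageout_example_size(int size, off_t f_offset, off_t filesize, int *abort_bytes)
{
	int io_size;
	int rounded_size;
	off_t max_size = filesize - f_offset;

	io_size = (size < max_size) ? size : (int)max_size;

	/* round the I/O up to a whole number of pages */
	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	/* pages past 'rounded_size' hold nothing the file needs */
	*abort_bytes = (size > rounded_size) ? (size - rounded_size) : 0;

	return io_size;
}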
2326
2327
2328int
2329cluster_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2330 int size, off_t filesize, int flags)
2331{
2332 return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
2333}
2334
2335
2336int
2337cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2338 int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
2339{
2340 u_int io_size;
2341 int rounded_size;
2342 off_t max_size;
2343 int retval;
2344 int local_flags = 0;
2345
2346 if (upl == NULL || size < 0) {
2347 panic("cluster_pagein: NULL upl passed in");
2348 }
2349
2350 if ((flags & UPL_IOSYNC) == 0) {
2351 local_flags |= CL_ASYNC;
2352 }
2353 if ((flags & UPL_NOCOMMIT) == 0) {
2354 local_flags |= CL_COMMIT;
2355 }
2356 if (flags & UPL_IOSTREAMING) {
2357 local_flags |= CL_IOSTREAMING;
2358 }
2359 if (flags & UPL_PAGING_ENCRYPTED) {
2360 local_flags |= CL_ENCRYPTED;
2361 }
2362
2363
2364 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
2365 (int)f_offset, size, (int)filesize, local_flags, 0);
2366
2367 /*
2368 * can't page-in from a negative offset
2369 * or if we're starting beyond the EOF
2370 * or if the file offset isn't page aligned
2371 * or the size requested isn't a multiple of PAGE_SIZE
2372 */
2373 if (f_offset < 0 || f_offset >= filesize ||
2374 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
2375 if (local_flags & CL_COMMIT) {
2376 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
2377 }
2378
2379 if (f_offset >= filesize) {
2380 ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_CLUSTER, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_CL_PGIN_PAST_EOF), arg: 0 /* arg */);
2381 }
2382
2383 return EINVAL;
2384 }
2385 max_size = filesize - f_offset;
2386
2387 if (size < max_size) {
2388 io_size = size;
2389 } else {
2390 io_size = (int)max_size;
2391 }
2392
2393 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2394
2395 if (size > rounded_size && (local_flags & CL_COMMIT)) {
2396 ubc_upl_abort_range(upl, upl_offset + rounded_size,
2397 size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
2398 }
2399
2400 retval = cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size: io_size,
2401 flags: local_flags | CL_READ | CL_PAGEIN, real_bp: (buf_t)NULL, iostate: (struct clios *)NULL, callback, callback_arg);
2402
2403 return retval;
2404}
2405
2406
2407int
2408cluster_bp(buf_t bp)
2409{
2410 return cluster_bp_ext(bp, NULL, NULL);
2411}
2412
2413
2414int
2415cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), void *callback_arg)
2416{
2417 off_t f_offset;
2418 int flags;
2419
2420 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
2421 bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
2422
2423 if (bp->b_flags & B_READ) {
2424 flags = CL_ASYNC | CL_READ;
2425 } else {
2426 flags = CL_ASYNC;
2427 }
2428 if (bp->b_flags & B_PASSIVE) {
2429 flags |= CL_PASSIVE;
2430 }
2431
2432 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
2433
2434 return cluster_io(vp: bp->b_vp, upl: bp->b_upl, upl_offset: 0, f_offset, non_rounded_size: bp->b_bcount, flags, real_bp: bp, iostate: (struct clios *)NULL, callback, callback_arg);
2435}
2436
2437
2438
2439int
2440cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
2441{
2442 return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL);
2443}
2444
2445
2446int
2447cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff,
2448 int xflags, int (*callback)(buf_t, void *), void *callback_arg)
2449{
2450 user_ssize_t cur_resid;
2451 int retval = 0;
2452 int flags;
2453 int zflags;
2454 int bflag;
2455 int write_type = IO_COPY;
2456 u_int32_t write_length;
2457
2458 flags = xflags;
2459
2460 if (flags & IO_PASSIVE) {
2461 bflag = CL_PASSIVE;
2462 } else {
2463 bflag = 0;
2464 }
2465
2466 if (vp->v_flag & VNOCACHE_DATA) {
2467 flags |= IO_NOCACHE;
2468 bflag |= CL_NOCACHE;
2469 }
2470 if (uio == NULL) {
2471 /*
2472 * no user data...
2473 * this call is being made to zero-fill some range in the file
2474 */
2475 retval = cluster_write_copy(vp, NULL, io_req_size: (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg);
2476
2477 return retval;
2478 }
2479 /*
2480 * do a write through the cache if one of the following is true:
2481 * NOCACHE is not set, or NODIRECT is set
2482 * the uio request doesn't target USERSPACE
2483 * otherwise, find out whether we want the direct or contig variant for
2484 * the first vector in the uio request
2485 */
2486 if (((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
2487 retval = cluster_io_type(uio, io_type: &write_type, io_length: &write_length, MIN_DIRECT_WRITE_SIZE);
2488 }
2489
2490 if ((flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT) {
2491 /*
2492 * must go through the cached variant in this case
2493 */
2494 write_type = IO_COPY;
2495 }
2496
2497 while ((cur_resid = uio_resid(a_uio: uio)) && uio->uio_offset < newEOF && retval == 0) {
2498 switch (write_type) {
2499 case IO_COPY:
2500 /*
2501 * make sure the uio_resid isn't too big...
2502 * internally, we want to handle all of the I/O in
2503 * chunk sizes that fit in a 32 bit int
2504 */
2505 if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
2506 /*
2507 * we're going to have to call cluster_write_copy
2508 * more than once...
2509 *
2510 * only want the last call to cluster_write_copy to
2511 * have the IO_TAILZEROFILL flag set and only the
2512 * first call should have IO_HEADZEROFILL
2513 */
2514 zflags = flags & ~IO_TAILZEROFILL;
2515 flags &= ~IO_HEADZEROFILL;
2516
2517 write_length = MAX_IO_REQUEST_SIZE;
2518 } else {
2519 /*
2520 * last call to cluster_write_copy
2521 */
2522 zflags = flags;
2523
2524 write_length = (u_int32_t)cur_resid;
2525 }
2526 retval = cluster_write_copy(vp, uio, io_req_size: write_length, oldEOF, newEOF, headOff, tailOff, flags: zflags, callback, callback_arg);
2527 break;
2528
2529 case IO_CONTIG:
2530 zflags = flags & ~(IO_TAILZEROFILL | IO_HEADZEROFILL);
2531
2532 if (flags & IO_HEADZEROFILL) {
2533 /*
2534 * only do this once per request
2535 */
2536 flags &= ~IO_HEADZEROFILL;
2537
2538 retval = cluster_write_copy(vp, uio: (struct uio *)0, io_req_size: (u_int32_t)0, oldEOF: (off_t)0, newEOF: uio->uio_offset,
2539 headOff, tailOff: (off_t)0, flags: zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
2540 if (retval) {
2541 break;
2542 }
2543 }
2544 retval = cluster_write_contig(vp, uio, newEOF, write_type: &write_type, write_length: &write_length, callback, callback_arg, bflag);
2545
2546 if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(a_uio: uio) == 0) {
2547 /*
2548 * we're done with the data from the user specified buffer(s)
2549 * and we've been requested to zero fill at the tail
2550 * treat this as an IO_HEADZEROFILL which doesn't require a uio
2551 * by rearranging the args and passing in IO_HEADZEROFILL
2552 */
2553
2554 /*
2555 * Update the oldEOF to reflect the current EOF. If the UPL page
2556 * to zero-fill is not valid (when F_NOCACHE is set),
2557 * cluster_write_copy() will perform RMW on the UPL page when
2558 * the oldEOF is not aligned on a page boundary due to an
2559 * unaligned write.
2560 */
2561 if (uio->uio_offset > oldEOF) {
2562 oldEOF = uio->uio_offset;
2563 }
2564 retval = cluster_write_copy(vp, uio: (struct uio *)0, io_req_size: (u_int32_t)0, oldEOF: (off_t)oldEOF, newEOF: tailOff, headOff: uio->uio_offset,
2565 tailOff: (off_t)0, flags: zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
2566 }
2567 break;
2568
2569 case IO_DIRECT:
2570 /*
2571 * cluster_write_direct is never called with IO_TAILZEROFILL || IO_HEADZEROFILL
2572 */
2573 retval = cluster_write_direct(vp, uio, oldEOF, newEOF, write_type: &write_type, write_length: &write_length, flags, callback, callback_arg);
2574 break;
2575
2576 case IO_UNKNOWN:
2577 retval = cluster_io_type(uio, io_type: &write_type, io_length: &write_length, MIN_DIRECT_WRITE_SIZE);
2578 break;
2579 }
2580 /*
2581 * in case we end up calling cluster_write_copy (from cluster_write_direct)
2582 * multiple times to service a multi-vector request that is not aligned properly
2583 * we need to update the oldEOF so that we
2584 * don't zero-fill the head of a page if we've successfully written
2585 * data to that area... 'cluster_write_copy' will zero-fill the head of a
2586 * page that is beyond the oldEOF if the write is unaligned... we only
2587 * want that to happen for the very first page of the cluster_write,
2588 * NOT the first page of each vector making up a multi-vector write.
2589 */
2590 if (uio->uio_offset > oldEOF) {
2591 oldEOF = uio->uio_offset;
2592 }
2593 }
2594 return retval;
2595}
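
/*
 * Illustrative sketch of the chunking policy used in cluster_write_ext
 * above for the IO_COPY case: a large cached write is handed to
 * cluster_write_copy in chunks no larger than MAX_IO_REQUEST_SIZE, with
 * IO_TAILZEROFILL honored only on the final chunk (and IO_HEADZEROFILL
 * stripped after the first).  The helper name is hypothetical.
 */
static __unused u_int32_t
cluster_write_example_chunk(user_ssize_t cur_resid, int flags, int *zflags)
{
	if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
		/* more calls to follow... keep the tail zero-fill for the last one */
		*zflags = flags & ~IO_TAILZEROFILL;
		return (u_int32_t)MAX_IO_REQUEST_SIZE;
	}
	/* final chunk... all remaining flags apply */
	*zflags = flags;
	return (u_int32_t)cur_resid;
}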
2596
2597
2598static int
2599cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length,
2600 int flags, int (*callback)(buf_t, void *), void *callback_arg)
2601{
2602 upl_t upl = NULL;
2603 upl_page_info_t *pl;
2604 vm_offset_t upl_offset;
2605 vm_offset_t vector_upl_offset = 0;
2606 u_int32_t io_req_size;
2607 u_int32_t offset_in_file;
2608 u_int32_t offset_in_iovbase;
2609 u_int32_t io_size;
2610 int io_flag = 0;
2611 upl_size_t upl_size = 0, vector_upl_size = 0;
2612 vm_size_t upl_needed_size;
2613 mach_msg_type_number_t pages_in_pl = 0;
2614 upl_control_flags_t upl_flags;
2615 kern_return_t kret = KERN_SUCCESS;
2616 mach_msg_type_number_t i = 0;
2617 int force_data_sync;
2618 int retval = 0;
2619 int first_IO = 1;
2620 struct clios iostate;
2621 user_addr_t iov_base;
2622 u_int32_t mem_alignment_mask;
2623 u_int32_t devblocksize;
2624 u_int32_t max_io_size;
2625 u_int32_t max_upl_size;
2626 u_int32_t max_vector_size;
2627 u_int32_t bytes_outstanding_limit;
2628 boolean_t io_throttled = FALSE;
2629
2630 u_int32_t vector_upl_iosize = 0;
2631 int issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
2632 off_t v_upl_uio_offset = 0;
2633 int vector_upl_index = 0;
2634 upl_t vector_upl = NULL;
2635
2636
2637 /*
2638 * When we enter this routine, we know
2639 * -- the resid will not exceed iov_len
2640 */
2641 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
2642 (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
2643
2644 assert(vm_map_page_shift(current_map()) >= PAGE_SHIFT);
2645
2646 max_upl_size = cluster_max_io_size(mp: vp->v_mount, CL_WRITE);
2647
2648 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO;
2649
2650 if (flags & IO_PASSIVE) {
2651 io_flag |= CL_PASSIVE;
2652 }
2653
2654 if (flags & IO_NOCACHE) {
2655 io_flag |= CL_NOCACHE;
2656 }
2657
2658 if (flags & IO_SKIP_ENCRYPTION) {
2659 io_flag |= CL_ENCRYPTED;
2660 }
2661
2662 iostate.io_completed = 0;
2663 iostate.io_issued = 0;
2664 iostate.io_error = 0;
2665 iostate.io_wanted = 0;
2666
2667 lck_mtx_init(lck: &iostate.io_mtxp, grp: &cl_mtx_grp, LCK_ATTR_NULL);
2668
2669 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
2670 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
2671
2672 if (devblocksize == 1) {
2673 /*
2674 * the AFP client advertises a devblocksize of 1;
2675 * however, its BLOCKMAP routine maps to physical
2676 * blocks that are PAGE_SIZE in size...
2677 * therefore we can't ask for I/Os that aren't page aligned
2678 * or aren't multiples of PAGE_SIZE in size...
2679 * by setting devblocksize to PAGE_SIZE, we reinstate
2680 * the old behavior we had before the mem_alignment_mask
2681 * changes went in
2682 */
2683 devblocksize = PAGE_SIZE;
2684 }
2685
2686next_dwrite:
2687 io_req_size = *write_length;
2688 iov_base = uio_curriovbase(a_uio: uio);
2689
2690 offset_in_file = (u_int32_t)uio->uio_offset & PAGE_MASK;
2691 offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
2692
2693 if (offset_in_file || offset_in_iovbase) {
2694 /*
2695 * one of the 2 important offsets is misaligned
2696 * so fire an I/O through the cache for this entire vector
2697 */
2698 goto wait_for_dwrites;
2699 }
2700 if (iov_base & (devblocksize - 1)) {
2701 /*
2702 * the offset in memory must be on a device block boundary
2703 * so that we can guarantee that we can generate an
2704 * I/O that ends on a page boundary in cluster_io
2705 */
2706 goto wait_for_dwrites;
2707 }
2708
2709 task_update_logical_writes(task: current_task(), io_size: (io_req_size & ~PAGE_MASK), TASK_WRITE_IMMEDIATE, vp);
2710 while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) {
2711 int throttle_type;
2712
2713 if ((throttle_type = cluster_is_throttled(vp))) {
2714 uint32_t max_throttle_size = calculate_max_throttle_size(vp);
2715
2716 /*
2717 * we're in the throttle window... at the very least
2718 * we want to limit the size of the I/O we're about
2719 * to issue
2720 */
2721 if ((flags & IO_RETURN_ON_THROTTLE) && throttle_type == THROTTLE_NOW) {
2722 /*
2723 * we're in the throttle window and at least 1 I/O
2724 * has already been issued by a throttleable thread
2725 * in this window, so return with EAGAIN to indicate
2726 * to the FS issuing the cluster_write call that it
2727 * should now throttle after dropping any locks
2728 */
2729 throttle_info_update_by_mount(mp: vp->v_mount);
2730
2731 io_throttled = TRUE;
2732 goto wait_for_dwrites;
2733 }
2734 max_vector_size = max_throttle_size;
2735 max_io_size = max_throttle_size;
2736 } else {
2737 max_vector_size = MAX_VECTOR_UPL_SIZE;
2738 max_io_size = max_upl_size;
2739 }
2740
2741 if (first_IO) {
2742 cluster_syncup(vp, newEOF, callback, callback_arg, flags: callback ? PUSH_SYNC : 0);
2743 first_IO = 0;
2744 }
2745 io_size = io_req_size & ~PAGE_MASK;
2746 iov_base = uio_curriovbase(a_uio: uio);
2747
2748 if (io_size > max_io_size) {
2749 io_size = max_io_size;
2750 }
2751
2752 if (useVectorUPL && (iov_base & PAGE_MASK)) {
2753 /*
2754 * We have an iov_base that's not page-aligned.
2755 * Issue all I/O's that have been collected within
2756 * this Vectored UPL.
2757 */
2758 if (vector_upl_index) {
2759 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp: (buf_t)NULL, iostate: &iostate, callback, callback_arg);
2760 reset_vector_run_state();
2761 }
2762
2763 /*
2764 * After this point, if we are using the Vector UPL path and the base is
2765 * not page-aligned then the UPL with that base will be the first in the vector UPL.
2766 */
2767 }
2768
2769 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
2770 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2771
2772 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
2773 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
2774
2775 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
2776 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
2777 pages_in_pl = 0;
2778 upl_size = (upl_size_t)upl_needed_size;
2779 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
2780 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
2781
2782 kret = vm_map_get_upl(target_map: map,
2783 map_offset: (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2784 size: &upl_size,
2785 upl: &upl,
2786 NULL,
2787 page_infoCnt: &pages_in_pl,
2788 flags: &upl_flags,
2789 VM_KERN_MEMORY_FILE,
2790 force_data_sync);
2791
2792 if (kret != KERN_SUCCESS) {
2793 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2794 0, 0, 0, kret, 0);
2795 /*
2796 * failed to get pagelist
2797 *
2798 * we may have already spun some portion of this request
2799 * off as async requests... we need to wait for the I/O
2800 * to complete before returning
2801 */
2802 goto wait_for_dwrites;
2803 }
2804 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2805 pages_in_pl = upl_size / PAGE_SIZE;
2806
2807 for (i = 0; i < pages_in_pl; i++) {
2808 if (!upl_valid_page(upl: pl, index: i)) {
2809 break;
2810 }
2811 }
2812 if (i == pages_in_pl) {
2813 break;
2814 }
2815
2816 /*
2817 * didn't get all the pages back that we
2818 * needed... release this upl and try again
2819 */
2820 ubc_upl_abort(upl, 0);
2821 }
2822 if (force_data_sync >= 3) {
2823 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2824 i, pages_in_pl, upl_size, kret, 0);
2825 /*
2826 * for some reason, we couldn't acquire a hold on all
2827 * the pages needed in the user's address space
2828 *
2829 * we may have already spun some portion of this request
2830 * off as async requests... we need to wait for the I/O
2831 * to complete before returning
2832 */
2833 goto wait_for_dwrites;
2834 }
2835
2836 /*
2837 * Consider the possibility that upl_size wasn't satisfied.
2838 */
2839 if (upl_size < upl_needed_size) {
2840 if (upl_size && upl_offset == 0) {
2841 io_size = upl_size;
2842 } else {
2843 io_size = 0;
2844 }
2845 }
2846 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2847 (int)upl_offset, upl_size, (int)iov_base, io_size, 0);
2848
2849 if (io_size == 0) {
2850 ubc_upl_abort(upl, 0);
2851 /*
2852 * we may have already spun some portion of this request
2853 * off as async requests... we need to wait for the I/O
2854 * to complete before returning
2855 */
2856 goto wait_for_dwrites;
2857 }
2858
2859 if (useVectorUPL) {
2860 vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
2861 if (end_off) {
2862 issueVectorUPL = 1;
2863 }
2864 /*
2865 * After this point, if we are using a vector UPL, then
2866 * either all the UPL elements end on a page boundary OR
2867 * this UPL is the last element because it does not end
2868 * on a page boundary.
2869 */
2870 }
2871
2872 /*
2873 * we want to push out these writes asynchronously so that we can overlap
2874 * the preparation of the next I/O
2875 * if there are already too many outstanding writes
2876 * wait until some complete before issuing the next
2877 */
2878 if (vp->v_mount->mnt_minsaturationbytecount) {
2879 bytes_outstanding_limit = vp->v_mount->mnt_minsaturationbytecount;
2880 } else {
2881 if (__improbable(os_mul_overflow(max_upl_size, IO_SCALE(vp, 2),
2882 &bytes_outstanding_limit) ||
2883 (bytes_outstanding_limit > overlapping_write_max))) {
2884 bytes_outstanding_limit = overlapping_write_max;
2885 }
2886 }
2887
2888 cluster_iostate_wait(iostate: &iostate, target: bytes_outstanding_limit, wait_name: "cluster_write_direct");
2889
2890 if (iostate.io_error) {
2891 /*
2892 * one of the earlier writes we issued ran into a hard error
2893 * don't issue any more writes... clean up the UPL
2894 * that was just created but not used, then
2895 * go wait for all writes that are part of this stream
2896 * to complete before returning the error to the caller
2897 */
2898 ubc_upl_abort(upl, 0);
2899
2900 goto wait_for_dwrites;
2901 }
2902
2903 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
2904 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
2905
2906 if (!useVectorUPL) {
2907 retval = cluster_io(vp, upl, upl_offset, f_offset: uio->uio_offset,
2908 non_rounded_size: io_size, flags: io_flag, real_bp: (buf_t)NULL, iostate: &iostate, callback, callback_arg);
2909 } else {
2910 if (!vector_upl_index) {
2911 vector_upl = vector_upl_create(upl_offset, uio->uio_iovcnt);
2912 v_upl_uio_offset = uio->uio_offset;
2913 vector_upl_offset = upl_offset;
2914 }
2915
2916 vector_upl_set_subupl(vector_upl, upl, upl_size);
2917 vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
2918 vector_upl_index++;
2919 vector_upl_iosize += io_size;
2920 vector_upl_size += upl_size;
2921
2922 if (issueVectorUPL || vector_upl_index == vector_upl_max_upls(vector_upl) || vector_upl_size >= max_vector_size) {
2923 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp: (buf_t)NULL, iostate: &iostate, callback, callback_arg);
2924 reset_vector_run_state();
2925 }
2926 }
2927
2928 /*
2929 * update the uio structure to
2930 * reflect the I/O that we just issued
2931 */
2932 uio_update(a_uio: uio, a_count: (user_size_t)io_size);
2933
2934 /*
2935 * in case we end up calling through to cluster_write_copy to finish
2936 * the tail of this request, we need to update the oldEOF so that we
2937 * don't zero-fill the head of a page if we've successfully written
2938 * data to that area... 'cluster_write_copy' will zero-fill the head of a
2939 * page that is beyond the oldEOF if the write is unaligned... we only
2940 * want that to happen for the very first page of the cluster_write,
2941 * NOT the first page of each vector making up a multi-vector write.
2942 */
2943 if (uio->uio_offset > oldEOF) {
2944 oldEOF = uio->uio_offset;
2945 }
2946
2947 io_req_size -= io_size;
2948
2949 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
2950 (int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0);
2951 } /* end while */
2952
2953 if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) {
2954 retval = cluster_io_type(uio, io_type: write_type, io_length: write_length, MIN_DIRECT_WRITE_SIZE);
2955
2956 if (retval == 0 && *write_type == IO_DIRECT) {
2957 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_NONE,
2958 (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
2959
2960 goto next_dwrite;
2961 }
2962 }
2963
2964wait_for_dwrites:
2965
2966 if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
2967 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp: (buf_t)NULL, iostate: &iostate, callback, callback_arg);
2968 reset_vector_run_state();
2969 }
2970 /*
2971 * make sure all async writes issued as part of this stream
2972 * have completed before we return
2973 */
2974 cluster_iostate_wait(iostate: &iostate, target: 0, wait_name: "cluster_write_direct");
2975
2976 if (iostate.io_error) {
2977 retval = iostate.io_error;
2978 }
2979
2980 lck_mtx_destroy(lck: &iostate.io_mtxp, grp: &cl_mtx_grp);
2981
2982 if (io_throttled == TRUE && retval == 0) {
2983 retval = EAGAIN;
2984 }
2985
2986 if (io_req_size && retval == 0) {
2987 /*
2988 * we couldn't handle the tail of this request in DIRECT mode
2989 * so fire it through the copy path
2990 *
2991 * note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set
2992 * so we can just pass 0 in for the headOff and tailOff
2993 */
2994 if (uio->uio_offset > oldEOF) {
2995 oldEOF = uio->uio_offset;
2996 }
2997
2998 retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, headOff: (off_t)0, tailOff: (off_t)0, flags, callback, callback_arg);
2999
3000 *write_type = IO_UNKNOWN;
3001 }
3002 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
3003 (int)uio->uio_offset, io_req_size, retval, 4, 0);
3004
3005 return retval;
3006}
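
/*
 * Illustrative sketch of the alignment screen applied in
 * cluster_write_direct above before a vector is written directly: the file
 * offset must be page aligned and the user buffer must satisfy both the
 * device's DMA alignment mask and its block size; any miss sends the
 * vector through the cached copy path instead.  The helper name is
 * hypothetical.
 */
static __unused boolean_t
cluster_write_direct_example_aligned(off_t f_offset, user_addr_t iov_base,
    u_int32_t mem_alignment_mask, u_int32_t devblocksize)
{
	if ((u_int32_t)f_offset & PAGE_MASK) {
		/* file offset not page aligned */
		return FALSE;
	}
	if ((u_int32_t)iov_base & mem_alignment_mask) {
		/* buffer misaligned for the DMA engine */
		return FALSE;
	}
	if (iov_base & (devblocksize - 1)) {
		/* buffer not on a device block boundary */
		return FALSE;
	}
	return TRUE;
}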
3007
3008
3009static int
3010cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length,
3011 int (*callback)(buf_t, void *), void *callback_arg, int bflag)
3012{
3013 upl_page_info_t *pl;
3014 addr64_t src_paddr = 0;
3015 upl_t upl[MAX_VECTS];
3016 vm_offset_t upl_offset;
3017 u_int32_t tail_size = 0;
3018 u_int32_t io_size;
3019 u_int32_t xsize;
3020 upl_size_t upl_size;
3021 vm_size_t upl_needed_size;
3022 mach_msg_type_number_t pages_in_pl;
3023 upl_control_flags_t upl_flags;
3024 kern_return_t kret;
3025 struct clios iostate;
3026 int error = 0;
3027 int cur_upl = 0;
3028 int num_upl = 0;
3029 int n;
3030 user_addr_t iov_base;
3031 u_int32_t devblocksize;
3032 u_int32_t mem_alignment_mask;
3033
3034 /*
3035 * When we enter this routine, we know
3036 * -- the io_req_size will not exceed iov_len
3037 * -- the target address is physically contiguous
3038 */
3039 cluster_syncup(vp, newEOF, callback, callback_arg, flags: callback ? PUSH_SYNC : 0);
3040
3041 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
3042 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
3043
3044 iostate.io_completed = 0;
3045 iostate.io_issued = 0;
3046 iostate.io_error = 0;
3047 iostate.io_wanted = 0;
3048
3049 lck_mtx_init(lck: &iostate.io_mtxp, grp: &cl_mtx_grp, LCK_ATTR_NULL);
3050
3051next_cwrite:
3052 io_size = *write_length;
3053
3054 iov_base = uio_curriovbase(a_uio: uio);
3055
3056 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
3057 upl_needed_size = upl_offset + io_size;
3058
3059 pages_in_pl = 0;
3060 upl_size = (upl_size_t)upl_needed_size;
3061 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
3062 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
3063
3064 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
3065 kret = vm_map_get_upl(target_map: map,
3066 vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
3067 size: &upl_size, upl: &upl[cur_upl], NULL, page_infoCnt: &pages_in_pl, flags: &upl_flags, VM_KERN_MEMORY_FILE, force_data_sync: 0);
3068
3069 if (kret != KERN_SUCCESS) {
3070 /*
3071 * failed to get pagelist
3072 */
3073 error = EINVAL;
3074 goto wait_for_cwrites;
3075 }
3076 num_upl++;
3077
3078 /*
3079 * Consider the possibility that upl_size wasn't satisfied.
3080 */
3081 if (upl_size < upl_needed_size) {
3082 /*
3083 * This is a failure in the physical memory case.
3084 */
3085 error = EINVAL;
3086 goto wait_for_cwrites;
3087 }
3088 pl = ubc_upl_pageinfo(upl[cur_upl]);
3089
3090 src_paddr = ((addr64_t)upl_phys_page(upl: pl, index: 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
3091
3092 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
3093 u_int32_t head_size;
3094
3095 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
3096
3097 if (head_size > io_size) {
3098 head_size = io_size;
3099 }
3100
3101 error = cluster_align_phys_io(vp, uio, usr_paddr: src_paddr, xsize: head_size, flags: 0, callback, callback_arg);
3102
3103 if (error) {
3104 goto wait_for_cwrites;
3105 }
3106
3107 upl_offset += head_size;
3108 src_paddr += head_size;
3109 io_size -= head_size;
3110
3111 iov_base += head_size;
3112 }
3113 if ((u_int32_t)iov_base & mem_alignment_mask) {
3114 /*
3115 * the request isn't aligned to a memory boundary
3116 * that the underlying DMA engine can handle...
3117 * return an error instead of going through
3118 * the slow copy path since the intent of this
3119 * path is direct I/O from device memory
3120 */
3121 error = EINVAL;
3122 goto wait_for_cwrites;
3123 }
3124
3125 tail_size = io_size & (devblocksize - 1);
3126 io_size -= tail_size;
3127
3128 while (io_size && error == 0) {
3129 if (io_size > MAX_IO_CONTIG_SIZE) {
3130 xsize = MAX_IO_CONTIG_SIZE;
3131 } else {
3132 xsize = io_size;
3133 }
3134 /*
3135 * request asynchronously so that we can overlap
3136 * the preparation of the next I/O... we'll do
3137 * the commit after all the I/O has completed
3138 * since it's all issued against the same UPL
3139 * if there are already too many outstanding writes
3140 * wait until some have completed before issuing the next
3141 */
3142 cluster_iostate_wait(iostate: &iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), wait_name: "cluster_write_contig");
3143
3144 if (iostate.io_error) {
3145 /*
3146 * one of the earlier writes we issued ran into a hard error
3147 * don't issue any more writes...
3148 * go wait for all writes that are part of this stream
3149 * to complete before returning the error to the caller
3150 */
3151 goto wait_for_cwrites;
3152 }
3153 /*
3154 * issue an asynchronous write to cluster_io
3155 */
3156 error = cluster_io(vp, upl: upl[cur_upl], upl_offset, f_offset: uio->uio_offset,
3157 non_rounded_size: xsize, CL_DEV_MEMORY | CL_ASYNC | bflag, real_bp: (buf_t)NULL, iostate: (struct clios *)&iostate, callback, callback_arg);
3158
3159 if (error == 0) {
3160 /*
3161 * The cluster_io write was issued successfully,
3162 * update the uio structure
3163 */
3164 uio_update(a_uio: uio, a_count: (user_size_t)xsize);
3165
3166 upl_offset += xsize;
3167 src_paddr += xsize;
3168 io_size -= xsize;
3169 }
3170 }
3171 if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) {
3172 error = cluster_io_type(uio, io_type: write_type, io_length: write_length, min_length: 0);
3173
3174 if (error == 0 && *write_type == IO_CONTIG) {
3175 cur_upl++;
3176 goto next_cwrite;
3177 }
3178 } else {
3179 *write_type = IO_UNKNOWN;
3180 }
3181
3182wait_for_cwrites:
3183 /*
3184 * make sure all async writes that are part of this stream
3185 * have completed before we proceed
3186 */
3187 cluster_iostate_wait(iostate: &iostate, target: 0, wait_name: "cluster_write_contig");
3188
3189 if (iostate.io_error) {
3190 error = iostate.io_error;
3191 }
3192
3193 lck_mtx_destroy(lck: &iostate.io_mtxp, grp: &cl_mtx_grp);
3194
3195 if (error == 0 && tail_size) {
3196 error = cluster_align_phys_io(vp, uio, usr_paddr: src_paddr, xsize: tail_size, flags: 0, callback, callback_arg);
3197 }
3198
3199 for (n = 0; n < num_upl; n++) {
3200 /*
3201 * just release our hold on each physically contiguous
3202 * region without changing any state
3203 */
3204 ubc_upl_abort(upl[n], 0);
3205 }
3206
3207 return error;
3208}
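
/*
 * Illustrative sketch of the head/tail split performed in
 * cluster_write_contig above: a head fragment brings the file offset up to
 * a device block boundary, a tail fragment is whatever fraction of a
 * device block remains at the end, and only the aligned middle is issued
 * as direct I/O (the fragments go through cluster_align_phys_io).  All
 * names are hypothetical.
 */
static __unused void
cluster_write_contig_example_split(off_t f_offset, u_int32_t io_size, u_int32_t devblocksize,
    u_int32_t *head_size, u_int32_t *body_size, u_int32_t *tail_size)
{
	u_int32_t head = 0;

	if (f_offset & (devblocksize - 1)) {
		/* bytes needed to reach the next device block boundary */
		head = devblocksize - (u_int32_t)(f_offset & (devblocksize - 1));
		if (head > io_size) {
			head = io_size;
		}
	}
	*head_size = head;
	*tail_size = (io_size - head) & (devblocksize - 1);
	*body_size = io_size - head - *tail_size;
}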
3209
3210
3211/*
3212 * need to avoid a race between an msync of a range of pages dirtied via mmap
3213 * vs a filesystem such as HFS deciding to write a 'hole' to disk via cluster_write's
3214 * zerofill mechanism before it has seen the VNOP_PAGEOUTs for the pages being msync'd
3215 *
3216 * we should never force-zero-fill pages that are already valid in the cache...
3217 * the entire page contains valid data (either from disk, zero-filled or dirtied
3218 * via an mmap) so we can only do damage by trying to zero-fill
3219 *
3220 */
3221static int
3222cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off_t zero_off, off_t upl_f_offset, int bytes_to_zero)
3223{
3224 int zero_pg_index;
3225 boolean_t need_cluster_zero = TRUE;
3226
3227 if ((flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
3228 bytes_to_zero = min(a: bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
3229 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
3230
3231 if (upl_valid_page(upl: pl, index: zero_pg_index)) {
3232 /*
3233 * never force zero valid pages - dirty or clean
3234 * we'll leave these in the UPL for cluster_write_copy to deal with
3235 */
3236 need_cluster_zero = FALSE;
3237 }
3238 }
3239 if (need_cluster_zero == TRUE) {
3240 cluster_zero(upl, upl_offset: io_offset, size: bytes_to_zero, NULL);
3241 }
3242
3243 return bytes_to_zero;
3244}
3245
3246
3247void
3248cluster_update_state(vnode_t vp, vm_object_offset_t s_offset, vm_object_offset_t e_offset, boolean_t vm_initiated)
3249{
3250 struct cl_extent cl;
3251 boolean_t first_pass = TRUE;
3252
3253 assert(s_offset < e_offset);
3254 assert((s_offset & PAGE_MASK_64) == 0);
3255 assert((e_offset & PAGE_MASK_64) == 0);
3256
3257 cl.b_addr = (daddr64_t)(s_offset / PAGE_SIZE_64);
3258 cl.e_addr = (daddr64_t)(e_offset / PAGE_SIZE_64);
3259
3260 cluster_update_state_internal(vp, cl: &cl, flags: 0, TRUE, first_pass: &first_pass, write_off: s_offset, write_cnt: (int)(e_offset - s_offset),
3261 newEOF: vp->v_un.vu_ubcinfo->ui_size, NULL, NULL, vm_initiated);
3262}
3263
3264
3265static void
3266cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes,
3267 boolean_t *first_pass, off_t write_off, int write_cnt, off_t newEOF,
3268 int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
3269{
3270 struct cl_writebehind *wbp;
3271 int cl_index;
3272 int ret_cluster_try_push;
3273 u_int max_cluster_pgcount;
3274
3275
3276 max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
3277
3278 /*
3279 * take the lock to protect our accesses
3280 * of the writebehind and sparse cluster state
3281 */
3282 wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
3283
3284 if (wbp->cl_scmap) {
3285 if (!(flags & IO_NOCACHE)) {
3286 /*
3287 * we've fallen into the sparse
3288 * cluster method of delaying dirty pages
3289 */
3290 sparse_cluster_add(wbp, cmapp: &(wbp->cl_scmap), vp, cl, EOF: newEOF, callback, callback_arg, vm_initiated);
3291
3292 lck_mtx_unlock(lck: &wbp->cl_lockw);
3293 return;
3294 }
3295 /*
3296 * must have done cached writes that fell into
3297 * the sparse cluster mechanism... we've switched
3298 * to uncached writes on the file, so go ahead
3299 * and push whatever's in the sparse map
3300 * and switch back to normal clustering
3301 */
3302 wbp->cl_number = 0;
3303
3304 sparse_cluster_push(wbp, cmapp: &(wbp->cl_scmap), vp, EOF: newEOF, PUSH_ALL, io_flags: 0, callback, callback_arg, vm_initiated);
3305 /*
3306 * no clusters of either type present at this point
3307 * so just go directly to start_new_cluster since
3308 * we know we need to delay this I/O because we've
3309 * already released the pages back into the cache
3310 * to avoid the deadlock with sparse_cluster_push
3311 */
3312 goto start_new_cluster;
3313 }
3314 if (*first_pass == TRUE) {
3315 if (write_off == wbp->cl_last_write) {
3316 wbp->cl_seq_written += write_cnt;
3317 } else {
3318 wbp->cl_seq_written = write_cnt;
3319 }
3320
3321 wbp->cl_last_write = write_off + write_cnt;
3322
3323 *first_pass = FALSE;
3324 }
3325 if (wbp->cl_number == 0) {
3326 /*
3327 * no clusters currently present
3328 */
3329 goto start_new_cluster;
3330 }
3331
3332 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
3333 /*
3334 * check each cluster that we currently hold
3335 * try to merge some or all of this write into
3336 * one or more of the existing clusters... if
3337 * any portion of the write remains, start a
3338 * new cluster
3339 */
3340 if (cl->b_addr >= wbp->cl_clusters[cl_index].b_addr) {
3341 /*
3342 * the current write starts at or after the current cluster
3343 */
3344 if (cl->e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
3345 /*
3346 * we have a write that fits entirely
3347 * within the existing cluster limits
3348 */
3349 if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
3350 /*
3351 * update our idea of where the cluster ends
3352 */
3353 wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
3354 }
3355 break;
3356 }
3357 if (cl->b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
3358 /*
3359 * we have a write that starts in the middle of the current cluster
3360 * but extends beyond the cluster's limit... we know this because
3361 * of the previous checks
3362 * we'll extend the current cluster to the max
3363 * and update the b_addr for the current write to reflect that
3364 * the head of it was absorbed into this cluster...
3365 * note that we'll always have a leftover tail in this case since
3366 * full absorption would have occurred in the clause above
3367 */
3368 wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount;
3369
3370 cl->b_addr = wbp->cl_clusters[cl_index].e_addr;
3371 }
3372 /*
3373 * we come here for the case where the current write starts
3374 * beyond the limit of the existing cluster or we have a leftover
3375 * tail after a partial absorption
3376 *
3377 * in either case, we'll check the remaining clusters before
3378 * starting a new one
3379 */
3380 } else {
3381 /*
3382 * the current write starts in front of the cluster we're currently considering
3383 */
3384 if ((wbp->cl_clusters[cl_index].e_addr - cl->b_addr) <= max_cluster_pgcount) {
3385 /*
3386 * we can just merge the new request into
3387 * this cluster and leave it in the cache
3388 * since the resulting cluster is still
3389 * less than the maximum allowable size
3390 */
3391 wbp->cl_clusters[cl_index].b_addr = cl->b_addr;
3392
3393 if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
3394 /*
3395 * the current write completely
3396 * envelops the existing cluster and since
3397 * each write is limited to at most max_cluster_pgcount pages
3398 * we can just use the start and last blocknos of the write
3399 * to generate the cluster limits
3400 */
3401 wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
3402 }
3403 break;
3404 }
3405 /*
3406 * if we were to combine this write with the current cluster
3407 * we would exceed the cluster size limit.... so,
3408 * let's see if there's any overlap of the new I/O with
3409 * the cluster we're currently considering... in fact, we'll
3410 * stretch the cluster out to its full limit and see if we
3411 * get an intersection with the current write
3412 *
3413 */
3414 if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) {
3415 /*
3416 * the current write extends into the proposed cluster
3417 * clip the length of the current write after first combining its
3418 * tail with the newly shaped cluster
3419 */
3420 wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount;
3421
3422 cl->e_addr = wbp->cl_clusters[cl_index].b_addr;
3423 }
3424 /*
3425 * if we get here, there was no way to merge
3426 * any portion of this write with this cluster
3427 * or we could only merge part of it which
3428 * will leave a tail...
3429 * we'll check the remaining clusters before starting a new one
3430 */
3431 }
3432 }
3433 if (cl_index < wbp->cl_number) {
3434 /*
3435 * we found one or more existing clusters that we
3436 * could entirely merge this I/O into
3437 */
3438 goto delay_io;
3439 }
3440
3441 if (defer_writes == FALSE &&
3442 wbp->cl_number == MAX_CLUSTERS &&
3443 wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) {
3444 uint32_t n;
3445
3446 if (vp->v_mount->mnt_minsaturationbytecount) {
3447 n = vp->v_mount->mnt_minsaturationbytecount / MAX_CLUSTER_SIZE(vp);
3448
3449 if (n > MAX_CLUSTERS) {
3450 n = MAX_CLUSTERS;
3451 }
3452 } else {
3453 n = 0;
3454 }
3455
3456 if (n == 0) {
3457 if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
3458 n = WRITE_BEHIND_SSD;
3459 } else {
3460 n = WRITE_BEHIND;
3461 }
3462 }
3463 while (n--) {
3464 cluster_try_push(wbp, vp, EOF: newEOF, push_flag: 0, flags: 0, callback, callback_arg, NULL, vm_initiated);
3465 }
3466 }
3467 if (wbp->cl_number < MAX_CLUSTERS) {
3468 /*
3469 * we didn't find an existing cluster to
3470 * merge into, but there's room to start
3471 * a new one
3472 */
3473 goto start_new_cluster;
3474 }
3475 /*
3476 * no existing cluster to merge with and no
3477 * room to start a new one... we'll try
3478 * pushing one of the existing ones... if none of
3479 * them are able to be pushed, we'll switch
3480 * to the sparse cluster mechanism
3481 * cluster_try_push updates cl_number to the
3482 * number of remaining clusters... and
3483 * returns the number of currently unused clusters
3484 */
3485 ret_cluster_try_push = 0;
3486
3487 /*
3488 * if writes are not deferred, call cluster push immediately
3489 */
3490 if (defer_writes == FALSE) {
3491 ret_cluster_try_push = cluster_try_push(wbp, vp, EOF: newEOF, push_flag: (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, flags: 0, callback, callback_arg, NULL, vm_initiated);
3492 }
3493 /*
3494 * execute the following regardless of whether writes are deferred
3495 */
3496 if (ret_cluster_try_push == 0) {
3497 /*
3498 * no more room in the normal cluster mechanism
3499 * so let's switch to the more expansive but expensive
3500 * sparse mechanism....
3501 */
3502 sparse_cluster_switch(wbp, vp, EOF: newEOF, callback, callback_arg, vm_initiated);
3503 sparse_cluster_add(wbp, cmapp: &(wbp->cl_scmap), vp, cl, EOF: newEOF, callback, callback_arg, vm_initiated);
3504
3505 lck_mtx_unlock(lck: &wbp->cl_lockw);
3506 return;
3507 }
3508start_new_cluster:
3509 wbp->cl_clusters[wbp->cl_number].b_addr = cl->b_addr;
3510 wbp->cl_clusters[wbp->cl_number].e_addr = cl->e_addr;
3511
3512 wbp->cl_clusters[wbp->cl_number].io_flags = 0;
3513
3514 if (flags & IO_NOCACHE) {
3515 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE;
3516 }
3517
3518 if (flags & IO_PASSIVE) {
3519 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE;
3520 }
3521
3522 wbp->cl_number++;
3523delay_io:
3524 lck_mtx_unlock(&wbp->cl_lockw);
3525 return;
3526}
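/*
 * Illustrative sketch (not compiled): the size rule applied above when deciding
 * whether a new write extent can be fully folded into an existing write-behind
 * cluster.  Per the comments above, a cluster may cover at most
 * max_cluster_pgcount pages, so a full merge is only possible when the combined
 * extent stays within that limit.  The helper name below is hypothetical and
 * exists only for this example; extent addresses are page indices.
 */
#if 0
static boolean_t
example_extent_fits_cluster(daddr64_t wr_b, daddr64_t wr_e,
    daddr64_t cl_b, daddr64_t cl_e, int max_cluster_pgcount)
{
	/* extent the cluster would have to cover if the write were merged in */
	daddr64_t new_b = MIN(wr_b, cl_b);
	daddr64_t new_e = MAX(wr_e, cl_e);

	/* the combined extent must not exceed max_cluster_pgcount pages */
	return (new_e - new_b) <= max_cluster_pgcount;
}
#endif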
3527
3528
3529static int
3530cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff,
3531 off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg)
3532{
3533 upl_page_info_t *pl;
3534 upl_t upl;
3535 vm_offset_t upl_offset = 0;
3536 vm_size_t upl_size;
3537 off_t upl_f_offset;
3538 int pages_in_upl;
3539 int start_offset;
3540 int xfer_resid;
3541 int io_size;
3542 int io_offset;
3543 int bytes_to_zero;
3544 int bytes_to_move;
3545 kern_return_t kret;
3546 int retval = 0;
3547 int io_resid;
3548 long long total_size;
3549 long long zero_cnt;
3550 off_t zero_off;
3551 long long zero_cnt1;
3552 off_t zero_off1;
3553 off_t write_off = 0;
3554 int write_cnt = 0;
3555 boolean_t first_pass = FALSE;
3556 struct cl_extent cl;
3557 int bflag;
3558 u_int max_io_size;
3559
3560 if (uio) {
3561 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
3562 (int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0);
3563
3564 io_resid = io_req_size;
3565 } else {
3566 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
3567 0, 0, (int)oldEOF, (int)newEOF, 0);
3568
3569 io_resid = 0;
3570 }
3571 if (flags & IO_PASSIVE) {
3572 bflag = CL_PASSIVE;
3573 } else {
3574 bflag = 0;
3575 }
3576 if (flags & IO_NOCACHE) {
3577 bflag |= CL_NOCACHE;
3578 }
3579
3580 if (flags & IO_SKIP_ENCRYPTION) {
3581 bflag |= CL_ENCRYPTED;
3582 }
3583
3584 zero_cnt = 0;
3585 zero_cnt1 = 0;
3586 zero_off = 0;
3587 zero_off1 = 0;
3588
3589 max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
3590
3591 if (flags & IO_HEADZEROFILL) {
3592 /*
3593 * some filesystems (HFS is one) don't support unallocated holes within a file...
3594 * so we zero fill the intervening space between the old EOF and the offset
3595 * where the next chunk of real data begins.... ftruncate will also use this
3596 * routine to zero fill to the new EOF when growing a file... in this case, the
3597 * uio structure will not be provided
3598 */
3599 if (uio) {
3600 if (headOff < uio->uio_offset) {
3601 zero_cnt = uio->uio_offset - headOff;
3602 zero_off = headOff;
3603 }
3604 } else if (headOff < newEOF) {
3605 zero_cnt = newEOF - headOff;
3606 zero_off = headOff;
3607 }
3608 } else {
3609 if (uio && uio->uio_offset > oldEOF) {
3610 zero_off = uio->uio_offset & ~PAGE_MASK_64;
3611
3612 if (zero_off >= oldEOF) {
3613 zero_cnt = uio->uio_offset - zero_off;
3614
3615 flags |= IO_HEADZEROFILL;
3616 }
3617 }
3618 }
3619 if (flags & IO_TAILZEROFILL) {
3620 if (uio) {
3621 zero_off1 = uio->uio_offset + io_req_size;
3622
3623 if (zero_off1 < tailOff) {
3624 zero_cnt1 = tailOff - zero_off1;
3625 }
3626 }
3627 } else {
3628 if (uio && newEOF > oldEOF) {
3629 zero_off1 = uio->uio_offset + io_req_size;
3630
3631 if (zero_off1 == newEOF && (zero_off1 & PAGE_MASK_64)) {
3632 zero_cnt1 = PAGE_SIZE_64 - (zero_off1 & PAGE_MASK_64);
3633
3634 flags |= IO_TAILZEROFILL;
3635 }
3636 }
3637 }
3638 if (zero_cnt == 0 && uio == (struct uio *) 0) {
3639 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
3640 retval, 0, 0, 0, 0);
3641 return 0;
3642 }
3643 if (uio) {
3644 write_off = uio->uio_offset;
3645 write_cnt = (int)uio_resid(uio);
3646 /*
3647 * delay updating the sequential write info
3648 * in the control block until we've obtained
3649 * the lock for it
3650 */
3651 first_pass = TRUE;
3652 }
3653 while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
3654 /*
3655 * for this iteration of the loop, figure out where our starting point is
3656 */
3657 if (zero_cnt) {
3658 start_offset = (int)(zero_off & PAGE_MASK_64);
3659 upl_f_offset = zero_off - start_offset;
3660 } else if (io_resid) {
3661 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3662 upl_f_offset = uio->uio_offset - start_offset;
3663 } else {
3664 start_offset = (int)(zero_off1 & PAGE_MASK_64);
3665 upl_f_offset = zero_off1 - start_offset;
3666 }
3667 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
3668 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
3669
3670 if (total_size > max_io_size) {
3671 total_size = max_io_size;
3672 }
3673
3674 cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
3675
3676 if (uio && ((flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
3677 /*
3678 * assumption... total_size <= io_resid
3679 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
3680 */
3681 if ((start_offset + total_size) > max_io_size) {
3682 total_size = max_io_size - start_offset;
3683 }
3684 xfer_resid = (int)total_size;
3685
3686 retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1);
3687
3688 if (retval) {
3689 break;
3690 }
3691
3692 io_resid -= (total_size - xfer_resid);
3693 total_size = xfer_resid;
3694 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3695 upl_f_offset = uio->uio_offset - start_offset;
3696
3697 if (total_size == 0) {
3698 if (start_offset) {
3699 /*
3700 * the write did not finish on a page boundary
3701 * which will leave upl_f_offset pointing to the
3702 * beginning of the last page written instead of
3703 * the page beyond it... bump it in this case
3704 * so that the cluster code records the last page
3705 * written as dirty
3706 */
3707 upl_f_offset += PAGE_SIZE_64;
3708 }
3709 upl_size = 0;
3710
3711 goto check_cluster;
3712 }
3713 }
3714 /*
3715 * compute the size of the upl needed to encompass
3716 * the requested write... limit each call to cluster_io
3717 * to the maximum UPL size... cluster_io will clip if
3718 * this exceeds the maximum io_size for the device...
3719 * also make sure to account for
3720 * a starting offset that's not page aligned
3721 */
3722 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3723
3724 if (upl_size > max_io_size) {
3725 upl_size = max_io_size;
3726 }
3727
3728 pages_in_upl = (int)(upl_size / PAGE_SIZE);
3729 io_size = (int)(upl_size - start_offset);
3730
3731 if ((long long)io_size > total_size) {
3732 io_size = (int)total_size;
3733 }
3734
3735 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
3736
3737
3738 /*
3739 * Gather the pages from the buffer cache.
3740 * The UPL_WILL_MODIFY flag lets the UPL subsystem know
3741 * that we intend to modify these pages.
3742 */
3743 kret = ubc_create_upl_kernel(vp,
3744 upl_f_offset,
3745 (int)upl_size,
3746 &upl,
3747 &pl,
3748 UPL_SET_LITE | ((uio != NULL && (uio->uio_flags & UIO_FLAGS_IS_COMPRESSED_FILE)) ? 0 : UPL_WILL_MODIFY),
3749 VM_KERN_MEMORY_FILE);
3750 if (kret != KERN_SUCCESS) {
3751 panic("cluster_write_copy: failed to get pagelist");
3752 }
3753
3754 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
3755 upl, (int)upl_f_offset, start_offset, 0, 0);
3756
3757 if (start_offset && upl_f_offset < oldEOF && !upl_valid_page(pl, 0)) {
3758 int read_size;
3759
3760 /*
3761 * we're starting in the middle of the first page of the upl
3762 * and the page isn't currently valid, so we're going to have
3763 * to read it in first... this is a synchronous operation
3764 */
3765 read_size = PAGE_SIZE;
3766
3767 if ((upl_f_offset + read_size) > oldEOF) {
3768 read_size = (int)(oldEOF - upl_f_offset);
3769 }
3770
3771 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
3772 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
3773 if (retval) {
3774 /*
3775 * we had an error during the read which causes us to abort
3776 * the current cluster_write request... before we do, we need
3777 * to release the rest of the pages in the upl without modifying
3778 * their state and mark the failed page in error
3779 */
3780 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3781
3782 if (upl_size > PAGE_SIZE) {
3783 ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size,
3784 UPL_ABORT_FREE_ON_EMPTY);
3785 }
3786
3787 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3788 upl, 0, 0, retval, 0);
3789 break;
3790 }
3791 }
3792 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
3793 /*
3794 * the last offset we're writing to in this upl does not end on a page
3795 * boundary... if it's not beyond the old EOF, then we'll also need to
3796 * pre-read this page in if it isn't already valid
3797 */
3798 upl_offset = upl_size - PAGE_SIZE;
3799
3800 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
3801 !upl_valid_page(pl, (int)(upl_offset / PAGE_SIZE))) {
3802 int read_size;
3803
3804 read_size = PAGE_SIZE;
3805
3806 if ((off_t)(upl_f_offset + upl_offset + read_size) > oldEOF) {
3807 read_size = (int)(oldEOF - (upl_f_offset + upl_offset));
3808 }
3809
3810 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
3811 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
3812 if (retval) {
3813 /*
3814 * we had an error during the read which causes us to abort
3815 * the current cluster_write request... before we do, we
3816 * need to release the rest of the pages in the upl without
3817 * modifying their state and mark the failed page in error
3818 */
3819 ubc_upl_abort_range(upl, (upl_offset_t)upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3820
3821 if (upl_size > PAGE_SIZE) {
3822 ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size, UPL_ABORT_FREE_ON_EMPTY);
3823 }
3824
3825 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3826 upl, 0, 0, retval, 0);
3827 break;
3828 }
3829 }
3830 }
3831 xfer_resid = io_size;
3832 io_offset = start_offset;
3833
3834 while (zero_cnt && xfer_resid) {
3835 if (zero_cnt < (long long)xfer_resid) {
3836 bytes_to_zero = (int)zero_cnt;
3837 } else {
3838 bytes_to_zero = xfer_resid;
3839 }
3840
3841 bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off, upl_f_offset, bytes_to_zero);
3842
3843 xfer_resid -= bytes_to_zero;
3844 zero_cnt -= bytes_to_zero;
3845 zero_off += bytes_to_zero;
3846 io_offset += bytes_to_zero;
3847 }
3848 if (xfer_resid && io_resid) {
3849 u_int32_t io_requested;
3850
3851 bytes_to_move = min(io_resid, xfer_resid);
3852 io_requested = bytes_to_move;
3853
3854 retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested);
3855
3856 if (retval) {
3857 ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size, UPL_ABORT_FREE_ON_EMPTY);
3858
3859 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3860 upl, 0, 0, retval, 0);
3861 } else {
3862 io_resid -= bytes_to_move;
3863 xfer_resid -= bytes_to_move;
3864 io_offset += bytes_to_move;
3865 }
3866 }
3867 while (xfer_resid && zero_cnt1 && retval == 0) {
3868 if (zero_cnt1 < (long long)xfer_resid) {
3869 bytes_to_zero = (int)zero_cnt1;
3870 } else {
3871 bytes_to_zero = xfer_resid;
3872 }
3873
3874 bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off1, upl_f_offset, bytes_to_zero);
3875
3876 xfer_resid -= bytes_to_zero;
3877 zero_cnt1 -= bytes_to_zero;
3878 zero_off1 += bytes_to_zero;
3879 io_offset += bytes_to_zero;
3880 }
3881 if (retval == 0) {
3882 int do_zeroing = 1;
3883
3884 io_size += start_offset;
3885
3886 /* Force more restrictive zeroing behavior only on APFS */
3887 if ((vnode_tag(vp) == VT_APFS) && (newEOF < oldEOF)) {
3888 do_zeroing = 0;
3889 }
3890
3891 if (do_zeroing && (upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) {
3892 /*
3893 * if we're extending the file with this write
3894 * we'll zero fill the rest of the page so that
3895 * if the file gets extended again in such a way as to leave a
3896 * hole starting at this EOF, we'll have zeros in the correct spot
3897 */
3898 cluster_zero(upl, io_size, (int)(upl_size - io_size), NULL);
3899 }
3900 /*
3901 * release the upl now if we hold one since...
3902 * 1) pages in it may be present in the sparse cluster map
3903 * and may span 2 separate buckets there... if they do and
3904 * we happen to have to flush a bucket to make room and it intersects
3905 * this upl, a deadlock may result on page BUSY
3906 * 2) we're delaying the I/O... from this point forward we're just updating
3907 * the cluster state... no need to hold the pages, so commit them
3908 * 3) IO_SYNC is set...
3909 * because we had to ask for a UPL that provides currently non-present pages, the
3910 * UPL has been automatically set to clear the dirty flags (both software and hardware)
3911 * upon committing it... this is not the behavior we want since it's possible for
3912 * pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
3913 * we'll pick these pages back up later with the correct behavior specified.
3914 * 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... if a flush
3915 * of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages
3916 * we hold since the flushing context is holding the cluster lock.
3917 */
3918 ubc_upl_commit_range(upl, 0, (upl_size_t)upl_size,
3919 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
3920check_cluster:
3921 /*
3922 * calculate the last logical block number
3923 * that this delayed I/O encompassed
3924 */
3925 cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
3926
3927 if (flags & IO_SYNC) {
3928 /*
3929 * if the IO_SYNC flag is set then we need to bypass
3930 * any clustering and immediately issue the I/O
3931 *
3932 * we don't hold the lock at this point
3933 *
3934 * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set
3935 * so that we correctly deal with a change in state of the hardware modify bit...
3936 * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force
3937 * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also
3938 * responsible for generating the correct sized I/O(s)
3939 */
3940 retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg, FALSE);
3941 } else {
3942 boolean_t defer_writes = FALSE;
3943
3944 if (vfs_flags(vp->v_mount) & MNT_DEFWRITE) {
3945 defer_writes = TRUE;
3946 }
3947
3948 cluster_update_state_internal(vp, &cl, flags, defer_writes, &first_pass,
3949 write_off, write_cnt, newEOF, callback, callback_arg, FALSE);
3950 }
3951 }
3952 }
3953 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0);
3954
3955 return retval;
3956}
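/*
 * Illustrative sketch (not compiled): how cluster_write_copy derives the head
 * and tail zero-fill ranges when the caller did not pass IO_HEADZEROFILL /
 * IO_TAILZEROFILL explicitly, i.e. for a write that starts beyond the old EOF
 * and/or ends mid-page exactly at the new EOF.  Variable names mirror the
 * locals in cluster_write_copy; the wrapper itself is hypothetical.
 */
#if 0
static void
example_zero_ranges(off_t write_offset, u_int32_t write_len, off_t oldEOF, off_t newEOF,
    off_t *zero_off, long long *zero_cnt, off_t *zero_off1, long long *zero_cnt1)
{
	*zero_cnt = *zero_cnt1 = 0;

	if (write_offset > oldEOF) {
		/* head: zero from the start of the page containing the write, if that page lies past the old EOF */
		*zero_off = write_offset & ~PAGE_MASK_64;
		if (*zero_off >= oldEOF) {
			*zero_cnt = write_offset - *zero_off;
		}
	}
	if (newEOF > oldEOF) {
		/* tail: zero out to the end of the last page if the write ends exactly at the new EOF mid-page */
		*zero_off1 = write_offset + write_len;
		if (*zero_off1 == newEOF && (*zero_off1 & PAGE_MASK_64)) {
			*zero_cnt1 = PAGE_SIZE_64 - (*zero_off1 & PAGE_MASK_64);
		}
	}
}
#endif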
3957
3958
3959
3960int
3961cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
3962{
3963 return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL);
3964}
3965
3966
3967int
3968cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (*callback)(buf_t, void *), void *callback_arg)
3969{
3970 int retval = 0;
3971 int flags;
3972 user_ssize_t cur_resid;
3973 u_int32_t io_size;
3974 u_int32_t read_length = 0;
3975 int read_type = IO_COPY;
3976
3977 flags = xflags;
3978
3979 if (vp->v_flag & VNOCACHE_DATA) {
3980 flags |= IO_NOCACHE;
3981 }
3982 if ((vp->v_flag & VRAOFF) || speculative_reads_disabled) {
3983 flags |= IO_RAOFF;
3984 }
3985
3986 if (flags & IO_SKIP_ENCRYPTION) {
3987 flags |= IO_ENCRYPTED;
3988 }
3989
3990 /*
3991 * do a read through the cache if one of the following is true....
3992 * NOCACHE is not true
3993 * the uio request doesn't target USERSPACE
3994 * Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well.
3995 * Reading encrypted data from a CP filesystem should never result in the data touching
3996 * the UBC.
3997 *
3998 * otherwise, find out if we want the direct or contig variant for
3999 * the first vector in the uio request
4000 */
4001 if (((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) || (flags & IO_ENCRYPTED)) {
4002 retval = cluster_io_type(uio, &read_type, &read_length, 0);
4003 }
4004
4005 while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) {
4006 switch (read_type) {
4007 case IO_COPY:
4008 /*
4009 * make sure the uio_resid isn't too big...
4010 * internally, we want to handle all of the I/O in
4011 * chunk sizes that fit in a 32 bit int
4012 */
4013 if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
4014 io_size = MAX_IO_REQUEST_SIZE;
4015 } else {
4016 io_size = (u_int32_t)cur_resid;
4017 }
4018
4019 retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg);
4020 break;
4021
4022 case IO_DIRECT:
4023 retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg);
4024 break;
4025
4026 case IO_CONTIG:
4027 retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags);
4028 break;
4029
4030 case IO_UNKNOWN:
4031 retval = cluster_io_type(uio, &read_type, &read_length, 0);
4032 break;
4033 }
4034 }
4035 return retval;
4036}
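/*
 * Illustrative sketch (not compiled): the typical way a filesystem's read vnop
 * hands a read off to cluster_read().  The wrapper function, its parameters,
 * and the use of ubc_getsize() to obtain the current file size are assumptions
 * made for this example only; the cluster_read() signature is the one defined
 * above.
 */
#if 0
static int
example_fs_read(vnode_t vp, struct uio *uio, int ioflag)
{
	off_t filesize = ubc_getsize(vp);   /* assumed: the FS tracks its EOF via the UBC */

	/* copies through the UBC, or takes the direct/contig path as appropriate */
	return cluster_read(vp, uio, filesize, ioflag);
}
#endif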
4037
4038
4039
4040static void
4041cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference)
4042{
4043 int range;
4044 int abort_flags = UPL_ABORT_FREE_ON_EMPTY;
4045
4046 if ((range = last_pg - start_pg)) {
4047 if (take_reference) {
4048 abort_flags |= UPL_ABORT_REFERENCE;
4049 }
4050
4051 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags);
4052 }
4053}
4054
4055
4056static int
4057cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
4058{
4059 upl_page_info_t *pl;
4060 upl_t upl = NULL;
4061 vm_offset_t upl_offset;
4062 u_int32_t upl_size;
4063 off_t upl_f_offset;
4064 int start_offset;
4065 int start_pg;
4066 int last_pg;
4067 int uio_last = 0;
4068 int pages_in_upl;
4069 off_t max_size;
4070 off_t last_ioread_offset;
4071 off_t last_request_offset;
4072 kern_return_t kret;
4073 int error = 0;
4074 int retval = 0;
4075 u_int32_t size_of_prefetch;
4076 u_int32_t xsize;
4077 u_int32_t io_size;
4078 u_int32_t max_rd_size;
4079 u_int32_t max_io_size;
4080 u_int32_t max_prefetch;
4081 u_int rd_ahead_enabled = 1;
4082 u_int prefetch_enabled = 1;
4083 struct cl_readahead * rap;
4084 struct clios iostate;
4085 struct cl_extent extent;
4086 int bflag;
4087 int take_reference = 1;
4088 int policy = IOPOL_DEFAULT;
4089 boolean_t iolock_inited = FALSE;
4090
4091 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
4092 (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0);
4093
4094 if (flags & IO_ENCRYPTED) {
4095 panic("encrypted blocks will hit UBC!");
4096 }
4097
4098 policy = throttle_get_io_policy(NULL);
4099
4100 if (policy == THROTTLE_LEVEL_TIER3 || policy == THROTTLE_LEVEL_TIER2 || (flags & IO_NOCACHE)) {
4101 take_reference = 0;
4102 }
4103
4104 if (flags & IO_PASSIVE) {
4105 bflag = CL_PASSIVE;
4106 } else {
4107 bflag = 0;
4108 }
4109
4110 if (flags & IO_NOCACHE) {
4111 bflag |= CL_NOCACHE;
4112 }
4113
4114 if (flags & IO_SKIP_ENCRYPTION) {
4115 bflag |= CL_ENCRYPTED;
4116 }
4117
4118 max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
4119 max_prefetch = cluster_max_prefetch(vp, max_io_size, prefetch_max);
4120 max_rd_size = max_prefetch;
4121
4122 last_request_offset = uio->uio_offset + io_req_size;
4123
4124 if (last_request_offset > filesize) {
4125 last_request_offset = filesize;
4126 }
4127
4128 if ((flags & (IO_RAOFF | IO_NOCACHE)) || ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
4129 rd_ahead_enabled = 0;
4130 rap = NULL;
4131 } else {
4132 if (cluster_is_throttled(vp)) {
4133 /*
4134 * we're in the throttle window, at the very least
4135 * we want to limit the size of the I/O we're about
4136 * to issue
4137 */
4138 rd_ahead_enabled = 0;
4139 prefetch_enabled = 0;
4140
4141 max_rd_size = calculate_max_throttle_size(vp);
4142 }
4143 if ((rap = cluster_get_rap(vp)) == NULL) {
4144 rd_ahead_enabled = 0;
4145 } else {
4146 extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
4147 extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
4148 }
4149 }
4150 if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
4151 /*
4152 * determine if we already have a read-ahead in the pipe courtesy of the
4153 * last read system call that was issued...
4154 * if so, pick up its extent to determine where we should start
4155 * with respect to any read-ahead that might be necessary to
4156 * garner all the data needed to complete this read systemcall
4157 */
4158 last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
4159
4160 if (last_ioread_offset < uio->uio_offset) {
4161 last_ioread_offset = (off_t)0;
4162 } else if (last_ioread_offset > last_request_offset) {
4163 last_ioread_offset = last_request_offset;
4164 }
4165 } else {
4166 last_ioread_offset = (off_t)0;
4167 }
4168
4169 while (io_req_size && uio->uio_offset < filesize && retval == 0) {
4170 max_size = filesize - uio->uio_offset;
4171 bool leftover_upl_aborted = false;
4172
4173 if ((off_t)(io_req_size) < max_size) {
4174 io_size = io_req_size;
4175 } else {
4176 io_size = (u_int32_t)max_size;
4177 }
4178
4179 if (!(flags & IO_NOCACHE)) {
4180 while (io_size) {
4181 u_int32_t io_resid;
4182 u_int32_t io_requested;
4183
4184 /*
4185 * if we keep finding the pages we need already in the cache, then
4186 * don't bother to call cluster_read_prefetch since it costs CPU cycles
4187 * to determine that we have all the pages we need... once we miss in
4188 * the cache and have issued an I/O, then we'll assume that we're likely
4189 * to continue to miss in the cache and it's to our advantage to try and prefetch
4190 */
4191 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (u_int32_t)(last_request_offset - last_ioread_offset))) {
4192 if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
4193 /*
4194 * we've already issued I/O for this request and
4195 * there's still work to do and
4196 * our prefetch stream is running dry, so issue a
4197 * pre-fetch I/O... the I/O latency will overlap
4198 * with the copying of the data
4199 */
4200 if (size_of_prefetch > max_rd_size) {
4201 size_of_prefetch = max_rd_size;
4202 }
4203
4204 size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
4205
4206 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
4207
4208 if (last_ioread_offset > last_request_offset) {
4209 last_ioread_offset = last_request_offset;
4210 }
4211 }
4212 }
4213 /*
4214 * limit the size of the copy we're about to do so that
4215 * we can notice that our I/O pipe is running dry and
4216 * get the next I/O issued before it does go dry
4217 */
4218 if (last_ioread_offset && io_size > (max_io_size / 4)) {
4219 io_resid = (max_io_size / 4);
4220 } else {
4221 io_resid = io_size;
4222 }
4223
4224 io_requested = io_resid;
4225
4226 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference);
4227
4228 xsize = io_requested - io_resid;
4229
4230 io_size -= xsize;
4231 io_req_size -= xsize;
4232
4233 if (retval || io_resid) {
4234 /*
4235 * if we run into a real error or
4236 * a page that is not in the cache
4237 * we need to leave streaming mode
4238 */
4239 break;
4240 }
4241
4242 if (rd_ahead_enabled && (io_size == 0 || last_ioread_offset == last_request_offset)) {
4243 /*
4244 * we've already finished the I/O for this read request
4245 * let's see if we should do a read-ahead
4246 */
4247 cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
4248 }
4249 }
4250 if (retval) {
4251 break;
4252 }
4253 if (io_size == 0) {
4254 if (rap != NULL) {
4255 if (extent.e_addr < rap->cl_lastr) {
4256 rap->cl_maxra = 0;
4257 }
4258 rap->cl_lastr = extent.e_addr;
4259 }
4260 break;
4261 }
4262 /*
4263 * recompute max_size since cluster_copy_ubc_data_internal
4264 * may have advanced uio->uio_offset
4265 */
4266 max_size = filesize - uio->uio_offset;
4267 }
4268
4269 iostate.io_completed = 0;
4270 iostate.io_issued = 0;
4271 iostate.io_error = 0;
4272 iostate.io_wanted = 0;
4273
4274 if ((flags & IO_RETURN_ON_THROTTLE)) {
4275 if (cluster_is_throttled(vp) == THROTTLE_NOW) {
4276 if (!cluster_io_present_in_BC(vp, uio->uio_offset)) {
4277 /*
4278 * we're in the throttle window and at least 1 I/O
4279 * has already been issued by a throttleable thread
4280 * in this window, so return with EAGAIN to indicate
4281 * to the FS issuing the cluster_read call that it
4282 * should now throttle after dropping any locks
4283 */
4284 throttle_info_update_by_mount(vp->v_mount);
4285
4286 retval = EAGAIN;
4287 break;
4288 }
4289 }
4290 }
4291
4292 /*
4293 * compute the size of the upl needed to encompass
4294 * the requested read... limit each call to cluster_io
4295 * to the maximum UPL size... cluster_io will clip if
4296 * this exceeds the maximum io_size for the device...
4297 * also make sure to account for
4298 * a starting offset that's not page aligned
4299 */
4300 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
4301 upl_f_offset = uio->uio_offset - (off_t)start_offset;
4302
4303 if (io_size > max_rd_size) {
4304 io_size = max_rd_size;
4305 }
4306
4307 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
4308
4309 if (flags & IO_NOCACHE) {
4310 if (upl_size > max_io_size) {
4311 upl_size = max_io_size;
4312 }
4313 } else {
4314 if (upl_size > max_io_size / 4) {
4315 upl_size = max_io_size / 4;
4316 upl_size &= ~PAGE_MASK;
4317
4318 if (upl_size == 0) {
4319 upl_size = PAGE_SIZE;
4320 }
4321 }
4322 }
4323 pages_in_upl = upl_size / PAGE_SIZE;
4324
4325 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
4326 upl, (int)upl_f_offset, upl_size, start_offset, 0);
4327
4328 kret = ubc_create_upl_kernel(vp,
4329 upl_f_offset,
4330 upl_size,
4331 &upl,
4332 &pl,
4333 UPL_FILE_IO | UPL_SET_LITE,
4334 VM_KERN_MEMORY_FILE);
4335 if (kret != KERN_SUCCESS) {
4336 panic("cluster_read_copy: failed to get pagelist");
4337 }
4338
4339 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
4340 upl, (int)upl_f_offset, upl_size, start_offset, 0);
4341
4342 /*
4343 * scan from the beginning of the upl looking for the first
4344 * non-valid page.... this will become the first page in
4345 * the request we're going to make to 'cluster_io'... if all
4346 * of the pages are valid, we won't call through to 'cluster_io'
4347 */
4348 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
4349 if (!upl_valid_page(pl, start_pg)) {
4350 break;
4351 }
4352 }
4353
4354 /*
4355 * scan from the starting invalid page looking for a valid
4356 * page before the end of the upl is reached, if we
4357 * find one, then it will be the last page of the request to
4358 * 'cluster_io'
4359 */
4360 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
4361 if (upl_valid_page(pl, last_pg)) {
4362 break;
4363 }
4364 }
4365
4366 if (start_pg < last_pg) {
4367 /*
4368 * we found a range of 'invalid' pages that must be filled
4369 * if the last page in this range is the last page of the file
4370 * we may have to clip the size of it to keep from reading past
4371 * the end of the last physical block associated with the file
4372 */
4373 if (iolock_inited == FALSE) {
4374 lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
4375
4376 iolock_inited = TRUE;
4377 }
4378 upl_offset = start_pg * PAGE_SIZE;
4379 io_size = (last_pg - start_pg) * PAGE_SIZE;
4380
4381 if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) {
4382 io_size = (u_int32_t)(filesize - (upl_f_offset + upl_offset));
4383 }
4384
4385 /*
4386 * Find out if this needs verification, we'll have to manage the UPL
4387 * differently if so. Note that this call only lets us know if
4388 * verification is enabled on this mount point, the actual verification
4389 * is performed in the File system.
4390 */
4391 size_t verify_block_size = 0;
4392 if ((VNOP_VERIFY(vp, start_offset, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL) == 0) /* && verify_block_size */) {
4393 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
4394 if (!upl_valid_page(pl, uio_last)) {
4395 break;
4396 }
4397 }
4398 if (uio_last < pages_in_upl) {
4399 /*
4400 * there were some invalid pages beyond the valid pages
4401 * that we didn't issue an I/O for, just release them
4402 * unchanged now, so that any prefetch/readahead can
4403 * include them
4404 */
4405 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
4406 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
4407 leftover_upl_aborted = true;
4408 }
4409 }
4410
4411 /*
4412 * issue an asynchronous read to cluster_io
4413 */
4414
4415 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
4416 io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg);
4417
4418 if (rap) {
4419 if (extent.e_addr < rap->cl_maxra) {
4420 /*
4421 * we've just issued a read for a block that should have been
4422 * in the cache courtesy of the read-ahead engine... something
4423 * has gone wrong with the pipeline, so reset the read-ahead
4424 * logic which will cause us to restart from scratch
4425 */
4426 rap->cl_maxra = 0;
4427 }
4428 }
4429 }
4430 if (error == 0) {
4431 /*
4432 * if the read completed successfully, or there was no I/O request
4433 * issued, then copy the data into user land via 'cluster_copy_upl_data'
4434 * we'll first add on any 'valid'
4435 * pages that were present in the upl when we acquired it.
4436 */
4437 u_int val_size;
4438
4439 if (!leftover_upl_aborted) {
4440 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
4441 if (!upl_valid_page(pl, uio_last)) {
4442 break;
4443 }
4444 }
4445 if (uio_last < pages_in_upl) {
4446 /*
4447 * there were some invalid pages beyond the valid pages
4448 * that we didn't issue an I/O for, just release them
4449 * unchanged now, so that any prefetch/readahead can
4450 * include them
4451 */
4452 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
4453 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
4454 }
4455 }
4456
4457 /*
4458 * compute size to transfer this round, if io_req_size is
4459 * still non-zero after this attempt, we'll loop around and
4460 * set up for another I/O.
4461 */
4462 val_size = (uio_last * PAGE_SIZE) - start_offset;
4463
4464 if (val_size > max_size) {
4465 val_size = (u_int)max_size;
4466 }
4467
4468 if (val_size > io_req_size) {
4469 val_size = io_req_size;
4470 }
4471
4472 if ((uio->uio_offset + val_size) > last_ioread_offset) {
4473 last_ioread_offset = uio->uio_offset + val_size;
4474 }
4475
4476 if ((size_of_prefetch = (u_int32_t)(last_request_offset - last_ioread_offset)) && prefetch_enabled) {
4477 if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) {
4478 /*
4479 * if there's still I/O left to do for this request, and...
4480 * we're not in hard throttle mode, and...
4481 * we're close to using up the previous prefetch, then issue a
4482 * new pre-fetch I/O... the I/O latency will overlap
4483 * with the copying of the data
4484 */
4485 if (size_of_prefetch > max_rd_size) {
4486 size_of_prefetch = max_rd_size;
4487 }
4488
4489 size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
4490
4491 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
4492
4493 if (last_ioread_offset > last_request_offset) {
4494 last_ioread_offset = last_request_offset;
4495 }
4496 }
4497 } else if ((uio->uio_offset + val_size) == last_request_offset) {
4498 /*
4499 * this transfer will finish this request, so...
4500 * let's try to read ahead if we're in
4501 * a sequential access pattern and we haven't
4502 * explicitly disabled it
4503 */
4504 if (rd_ahead_enabled) {
4505 cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
4506 }
4507
4508 if (rap != NULL) {
4509 if (extent.e_addr < rap->cl_lastr) {
4510 rap->cl_maxra = 0;
4511 }
4512 rap->cl_lastr = extent.e_addr;
4513 }
4514 }
4515 if (iolock_inited == TRUE) {
4516 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4517 }
4518
4519 if (iostate.io_error) {
4520 error = iostate.io_error;
4521 } else {
4522 u_int32_t io_requested;
4523
4524 io_requested = val_size;
4525
4526 retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested);
4527
4528 io_req_size -= (val_size - io_requested);
4529 }
4530 } else {
4531 if (iolock_inited == TRUE) {
4532 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4533 }
4534 }
4535 if (start_pg < last_pg) {
4536 /*
4537 * compute the range of pages that we actually issued an I/O for
4538 * and either commit them as valid if the I/O succeeded
4539 * or abort them if the I/O failed or we're not supposed to
4540 * keep them in the cache
4541 */
4542 io_size = (last_pg - start_pg) * PAGE_SIZE;
4543
4544 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, upl, start_pg * PAGE_SIZE, io_size, error, 0);
4545
4546 if (error || (flags & IO_NOCACHE)) {
4547 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
4548 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
4549 } else {
4550 int commit_flags = UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY;
4551
4552 if (take_reference) {
4553 commit_flags |= UPL_COMMIT_INACTIVATE;
4554 } else {
4555 commit_flags |= UPL_COMMIT_SPECULATE;
4556 }
4557
4558 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, commit_flags);
4559 }
4560 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, start_pg * PAGE_SIZE, io_size, error, 0);
4561 }
4562 if ((last_pg - start_pg) < pages_in_upl) {
4563 /*
4564 * the set of pages that we issued an I/O for did not encompass
4565 * the entire upl... so just release these without modifying
4566 * their state
4567 */
4568 if (error) {
4569 if (leftover_upl_aborted) {
4570 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, (uio_last - start_pg) * PAGE_SIZE,
4571 UPL_ABORT_FREE_ON_EMPTY);
4572 } else {
4573 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
4574 }
4575 } else {
4576 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
4577 upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
4578
4579 /*
4580 * handle any valid pages at the beginning of
4581 * the upl... release these appropriately
4582 */
4583 cluster_read_upl_release(upl, 0, start_pg, take_reference);
4584
4585 /*
4586 * handle any valid pages immediately after the
4587 * pages we issued I/O for... release these appropriately
4588 */
4589 cluster_read_upl_release(upl, last_pg, uio_last, take_reference);
4590
4591 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, -1, -1, 0, 0);
4592 }
4593 }
4594 if (retval == 0) {
4595 retval = error;
4596 }
4597
4598 if (io_req_size) {
4599 uint32_t max_throttle_size = calculate_max_throttle_size(vp);
4600
4601 if (cluster_is_throttled(vp)) {
4602 /*
4603 * we're in the throttle window, at the very least
4604 * we want to limit the size of the I/O we're about
4605 * to issue
4606 */
4607 rd_ahead_enabled = 0;
4608 prefetch_enabled = 0;
4609 max_rd_size = max_throttle_size;
4610 } else {
4611 if (max_rd_size == max_throttle_size) {
4612 /*
4613 * coming out of throttled state
4614 */
4615 if (policy != THROTTLE_LEVEL_TIER3 && policy != THROTTLE_LEVEL_TIER2) {
4616 if (rap != NULL) {
4617 rd_ahead_enabled = 1;
4618 }
4619 prefetch_enabled = 1;
4620 }
4621 max_rd_size = max_prefetch;
4622 last_ioread_offset = 0;
4623 }
4624 }
4625 }
4626 }
4627 if (iolock_inited == TRUE) {
4628 /*
4629 * cluster_io returned an error after it
4630 * had already issued some I/O. we need
4631 * to wait for that I/O to complete before
4632 * we can destroy the iostate mutex...
4633 * 'retval' already contains the early error
4634 * so no need to pick it up from iostate.io_error
4635 */
4636 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4637
4638 lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
4639 }
4640 if (rap != NULL) {
4641 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
4642 (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0);
4643
4644 lck_mtx_unlock(&rap->cl_lockr);
4645 } else {
4646 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
4647 (int)uio->uio_offset, io_req_size, 0, retval, 0);
4648 }
4649
4650 return retval;
4651}
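/*
 * Illustrative sketch (not compiled): the valid-page scan cluster_read_copy
 * performs on a freshly created UPL.  start_pg is the first page that is not
 * already valid in the cache and last_pg is the next valid page after it, so
 * [start_pg, last_pg) is the run of pages that actually needs a cluster_io()
 * read.  The helper name is hypothetical.
 */
#if 0
static void
example_find_invalid_run(upl_page_info_t *pl, int pages_in_upl, int *start_pg, int *last_pg)
{
	int pg;

	/* first non-valid page becomes the start of the I/O */
	for (pg = 0; pg < pages_in_upl; pg++) {
		if (!upl_valid_page(pl, pg)) {
			break;
		}
	}
	*start_pg = pg;

	/* the run ends at the next valid page (or the end of the UPL) */
	for (; pg < pages_in_upl; pg++) {
		if (upl_valid_page(pl, pg)) {
			break;
		}
	}
	*last_pg = pg;
}
#endif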
4652
4653/*
4654 * We don't want another read/write lock for every vnode in the system
4655 * so we keep a hash of them here. There should never be very many of
4656 * these around at any point in time.
4657 */
4658cl_direct_read_lock_t *
4659cluster_lock_direct_read(vnode_t vp, lck_rw_type_t type)
4660{
4661 struct cl_direct_read_locks *head
4662 = &cl_direct_read_locks[(uintptr_t)vp / sizeof(*vp)
4663 % CL_DIRECT_READ_LOCK_BUCKETS];
4664
4665 struct cl_direct_read_lock *lck, *new_lck = NULL;
4666
4667 for (;;) {
4668 lck_spin_lock(&cl_direct_read_spin_lock);
4669
4670 LIST_FOREACH(lck, head, chain) {
4671 if (lck->vp == vp) {
4672 ++lck->ref_count;
4673 lck_spin_unlock(&cl_direct_read_spin_lock);
4674 if (new_lck) {
4675 // Someone beat us to it, ditch the allocation
4676 lck_rw_destroy(&new_lck->rw_lock, &cl_mtx_grp);
4677 kfree_type(cl_direct_read_lock_t, new_lck);
4678 }
4679 lck_rw_lock(&lck->rw_lock, type);
4680 return lck;
4681 }
4682 }
4683
4684 if (new_lck) {
4685 // Use the lock we allocated
4686 LIST_INSERT_HEAD(head, new_lck, chain);
4687 lck_spin_unlock(&cl_direct_read_spin_lock);
4688 lck_rw_lock(&new_lck->rw_lock, type);
4689 return new_lck;
4690 }
4691
4692 lck_spin_unlock(&cl_direct_read_spin_lock);
4693
4694 // Allocate a new lock
4695 new_lck = kalloc_type(cl_direct_read_lock_t, Z_WAITOK);
4696 lck_rw_init(&new_lck->rw_lock, &cl_mtx_grp, LCK_ATTR_NULL);
4697 new_lck->vp = vp;
4698 new_lck->ref_count = 1;
4699
4700 // Got to go round again
4701 }
4702}
4703
4704void
4705cluster_unlock_direct_read(cl_direct_read_lock_t *lck)
4706{
4707 lck_rw_done(&lck->rw_lock);
4708
4709 lck_spin_lock(&cl_direct_read_spin_lock);
4710 if (lck->ref_count == 1) {
4711 LIST_REMOVE(lck, chain);
4712 lck_spin_unlock(&cl_direct_read_spin_lock);
4713 lck_rw_destroy(&lck->rw_lock, &cl_mtx_grp);
4714 kfree_type(cl_direct_read_lock_t, lck);
4715 } else {
4716 --lck->ref_count;
4717 lck_spin_unlock(&cl_direct_read_spin_lock);
4718 }
4719}
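/*
 * Illustrative sketch (not compiled): how the direct-read path below uses the
 * per-vnode hashed read/write lock.  A shared hold is taken between checking
 * the UBC and issuing the I/O and is dropped as soon as the request has been
 * handed to cluster_io(); a caller that needed the pages to stay stable until
 * the I/O is issued would take the lock exclusively instead.
 */
#if 0
static void
example_direct_read_locking(vnode_t vp)
{
	cl_direct_read_lock_t *lock;

	lock = cluster_lock_direct_read(vp, LCK_RW_TYPE_SHARED);
	/* ... check ubc_range_op() and issue cluster_io() here ... */
	cluster_unlock_direct_read(lock);
}
#endif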
4720
4721static int
4722cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
4723 int flags, int (*callback)(buf_t, void *), void *callback_arg)
4724{
4725 upl_t upl = NULL;
4726 upl_page_info_t *pl;
4727 off_t max_io_size;
4728 vm_offset_t upl_offset, vector_upl_offset = 0;
4729 upl_size_t upl_size = 0, vector_upl_size = 0;
4730 vm_size_t upl_needed_size;
4731 unsigned int pages_in_pl;
4732 upl_control_flags_t upl_flags;
4733 kern_return_t kret = KERN_SUCCESS;
4734 unsigned int i;
4735 int force_data_sync;
4736 int retval = 0;
4737 int no_zero_fill = 0;
4738 int io_flag = 0;
4739 int misaligned = 0;
4740 struct clios iostate;
4741 user_addr_t iov_base;
4742 u_int32_t io_req_size;
4743 u_int32_t offset_in_file;
4744 u_int32_t offset_in_iovbase;
4745 u_int32_t io_size;
4746 u_int32_t io_min;
4747 u_int32_t xsize;
4748 u_int32_t devblocksize;
4749 u_int32_t mem_alignment_mask;
4750 u_int32_t max_upl_size;
4751 u_int32_t max_rd_size;
4752 u_int32_t max_rd_ahead;
4753 u_int32_t max_vector_size;
4754 boolean_t io_throttled = FALSE;
4755
4756 u_int32_t vector_upl_iosize = 0;
4757 int issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
4758 off_t v_upl_uio_offset = 0;
4759 int vector_upl_index = 0;
4760 upl_t vector_upl = NULL;
4761 cl_direct_read_lock_t *lock = NULL;
4762
4763 assert(vm_map_page_shift(current_map()) >= PAGE_SHIFT);
4764
4765 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
4766 (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
4767
4768 max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ);
4769
4770 max_rd_size = max_upl_size;
4771
4772 if (__improbable(os_mul_overflow(max_rd_size, IO_SCALE(vp, 2),
4773 &max_rd_ahead) || (max_rd_ahead > overlapping_read_max))) {
4774 max_rd_ahead = overlapping_read_max;
4775 }
4776
4777 io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO;
4778
4779 if (flags & IO_PASSIVE) {
4780 io_flag |= CL_PASSIVE;
4781 }
4782
4783 if (flags & IO_ENCRYPTED) {
4784 io_flag |= CL_RAW_ENCRYPTED;
4785 }
4786
4787 if (flags & IO_NOCACHE) {
4788 io_flag |= CL_NOCACHE;
4789 }
4790
4791 if (flags & IO_SKIP_ENCRYPTION) {
4792 io_flag |= CL_ENCRYPTED;
4793 }
4794
4795 iostate.io_completed = 0;
4796 iostate.io_issued = 0;
4797 iostate.io_error = 0;
4798 iostate.io_wanted = 0;
4799
4800 lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
4801
4802 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
4803 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
4804
4805 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
4806 (int)devblocksize, (int)mem_alignment_mask, 0, 0, 0);
4807
4808 if (devblocksize == 1) {
4809 /*
4810 * the AFP client advertises a devblocksize of 1
4811 * however, its BLOCKMAP routine maps to physical
4812 * blocks that are PAGE_SIZE in size...
4813 * therefore we can't ask for I/Os that aren't page aligned
4814 * or aren't multiples of PAGE_SIZE in size...
4815 * by setting devblocksize to PAGE_SIZE, we re-instate
4816 * the old behavior we had before the mem_alignment_mask
4817 * changes went in...
4818 */
4819 devblocksize = PAGE_SIZE;
4820 }
4821
4822 /*
4823 * We are going to need this uio for the prefaulting later
4824 * especially for the cases where multiple non-contiguous
4825 * iovs are passed into this routine.
4826 */
4827 uio_t uio_acct = uio_duplicate(uio);
4828
4829next_dread:
4830 io_req_size = *read_length;
4831 iov_base = uio_curriovbase(uio);
4832
4833 offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1);
4834 offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
4835
4836 if (vm_map_page_mask(current_map()) < PAGE_MASK) {
4837 /*
4838 * XXX TODO4K
4839 * Direct I/O might not work as expected from a 16k kernel space
4840 * to a 4k user space because each 4k chunk might point to
4841 * a different 16k physical page...
4842 * Let's go the "misaligned" way.
4843 */
4844 if (!misaligned) {
4845 DEBUG4K_VFS("forcing misaligned\n");
4846 }
4847 misaligned = 1;
4848 }
4849
4850 if (offset_in_file || offset_in_iovbase) {
4851 /*
4852 * one of the 2 important offsets is misaligned
4853 * so fire an I/O through the cache for this entire vector
4854 */
4855 misaligned = 1;
4856 }
4857 if (iov_base & (devblocksize - 1)) {
4858 /*
4859 * the offset in memory must be on a device block boundary
4860 * so that we can guarantee that we can generate an
4861 * I/O that ends on a page boundary in cluster_io
4862 */
4863 misaligned = 1;
4864 }
4865
4866 max_io_size = filesize - uio->uio_offset;
4867
4868 /*
4869 * The user must request IO in aligned chunks. If the
4870 * offset into the file is bad, or the userland pointer
4871 * is non-aligned, then we cannot service the encrypted IO request.
4872 */
4873 if (flags & IO_ENCRYPTED) {
4874 if (misaligned || (io_req_size & (devblocksize - 1))) {
4875 retval = EINVAL;
4876 }
4877
4878 max_io_size = roundup(max_io_size, devblocksize);
4879 }
4880
4881 if ((off_t)io_req_size > max_io_size) {
4882 io_req_size = (u_int32_t)max_io_size;
4883 }
4884
4885 /*
4886 * When we get to this point, we know...
4887 * -- the offset into the file is on a devblocksize boundary
4888 */
4889
4890 while (io_req_size && retval == 0) {
4891 u_int32_t io_start;
4892
4893 if (cluster_is_throttled(vp)) {
4894 uint32_t max_throttle_size = calculate_max_throttle_size(vp);
4895
4896 /*
4897 * we're in the throttle window, at the very least
4898 * we want to limit the size of the I/O we're about
4899 * to issue
4900 */
4901 max_rd_size = max_throttle_size;
4902 max_rd_ahead = max_throttle_size - 1;
4903 max_vector_size = max_throttle_size;
4904 } else {
4905 max_rd_size = max_upl_size;
4906 max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
4907 max_vector_size = MAX_VECTOR_UPL_SIZE;
4908 }
4909 io_start = io_size = io_req_size;
4910
4911 /*
4912 * First look for pages already in the cache
4913 * and move them to user space. But only do this
4914 * check if we are not retrieving encrypted data directly
4915 * from the filesystem; those blocks should never
4916 * be in the UBC.
4917 *
4918 * cluster_copy_ubc_data returns the resid
4919 * in io_size
4920 */
4921 if ((flags & IO_ENCRYPTED) == 0) {
4922 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0);
4923 }
4924 /*
4925 * calculate the number of bytes actually copied
4926 * starting size - residual
4927 */
4928 xsize = io_start - io_size;
4929
4930 io_req_size -= xsize;
4931
4932 if (useVectorUPL && (xsize || (iov_base & PAGE_MASK))) {
4933 /*
4934 * We found something in the cache or we have an iov_base that's not
4935 * page-aligned.
4936 *
4937 * Issue all I/O's that have been collected within this Vectored UPL.
4938 */
4939 if (vector_upl_index) {
4940 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4941 reset_vector_run_state();
4942 }
4943
4944 if (xsize) {
4945 useVectorUPL = 0;
4946 }
4947
4948 /*
4949 * After this point, if we are using the Vector UPL path and the base is
4950 * not page-aligned then the UPL with that base will be the first in the vector UPL.
4951 */
4952 }
4953
4954 /*
4955 * check to see if we are finished with this request.
4956 *
4957 * If we satisfied this IO already, then io_req_size will be 0.
4958 * Otherwise, see if the IO was mis-aligned and needs to go through
4959 * the UBC to deal with the 'tail'.
4960 *
4961 */
4962 if (io_req_size == 0 || (misaligned)) {
4963 /*
4964 * see if there's another uio vector to
4965 * process that's of type IO_DIRECT
4966 *
4967 * break out of while loop to get there
4968 */
4969 break;
4970 }
4971 /*
4972 * assume the request ends on a device block boundary
4973 */
4974 io_min = devblocksize;
4975
4976 /*
4977 * we can handle I/O's in multiples of the device block size
4978 * however, if io_size isn't a multiple of devblocksize we
4979 * want to clip it back to the nearest page boundary since
4980 * we are going to have to go through cluster_read_copy to
4981 * deal with the 'overhang'... by clipping it to a PAGE_SIZE
4982 * multiple, we avoid asking the drive for the same physical
4983 * blocks twice.. once for the partial page at the end of the
4984 * request and a 2nd time for the page we read into the cache
4985 * (which overlaps the end of the direct read) in order to
4986 * get at the overhang bytes
4987 */
4988 if (io_size & (devblocksize - 1)) {
4989 assert(!(flags & IO_ENCRYPTED));
4990 /*
4991 * Clip the request to the previous page size boundary
4992 * since request does NOT end on a device block boundary
4993 */
4994 io_size &= ~PAGE_MASK;
4995 io_min = PAGE_SIZE;
4996 }
4997 if (retval || io_size < io_min) {
4998 /*
4999 * either an error or we only have the tail left to
5000 * complete via the copy path...
5001 * we may have already spun some portion of this request
5002 * off as async requests... we need to wait for the I/O
5003 * to complete before returning
5004 */
5005 goto wait_for_dreads;
5006 }
5007
5008 /*
5009 * Don't re-check the UBC data if we are looking for uncached IO
5010 * or asking for encrypted blocks.
5011 */
5012 if ((flags & IO_ENCRYPTED) == 0) {
5013 if ((xsize = io_size) > max_rd_size) {
5014 xsize = max_rd_size;
5015 }
5016
5017 io_size = 0;
5018
5019 if (!lock) {
5020 /*
5021 * We hold a lock here between the time we check the
5022 * cache and the time we issue I/O. This saves us
5023 * from having to lock the pages in the cache. Not
5024 * all clients will care about this lock but some
5025 * clients may want to guarantee stability between
5026 * here and when the I/O is issued in which case they
5027 * will take the lock exclusively.
5028 */
5029 lock = cluster_lock_direct_read(vp, LCK_RW_TYPE_SHARED);
5030 }
5031
5032 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size);
5033
5034 if (io_size == 0) {
5035 /*
5036 * a page must have just come into the cache
5037 * since the first page in this range is no
5038 * longer absent, go back and re-evaluate
5039 */
5040 continue;
5041 }
5042 }
5043 if ((flags & IO_RETURN_ON_THROTTLE)) {
5044 if (cluster_is_throttled(vp) == THROTTLE_NOW) {
5045 if (!cluster_io_present_in_BC(vp, uio->uio_offset)) {
5046 /*
5047 * we're in the throttle window and at least 1 I/O
5048 * has already been issued by a throttleable thread
5049 * in this window, so return with EAGAIN to indicate
5050 * to the FS issuing the cluster_read call that it
5051 * should now throttle after dropping any locks
5052 */
5053 throttle_info_update_by_mount(vp->v_mount);
5054
5055 io_throttled = TRUE;
5056 goto wait_for_dreads;
5057 }
5058 }
5059 }
5060 if (io_size > max_rd_size) {
5061 io_size = max_rd_size;
5062 }
5063
5064 iov_base = uio_curriovbase(uio);
5065
5066 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
5067 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
5068
5069 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
5070 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
5071
5072 if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) {
5073 no_zero_fill = 1;
5074 } else {
5075 no_zero_fill = 0;
5076 }
5077
5078 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
5079 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
5080 pages_in_pl = 0;
5081 upl_size = (upl_size_t)upl_needed_size;
5082 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
5083 if (no_zero_fill) {
5084 upl_flags |= UPL_NOZEROFILL;
5085 }
5086 if (force_data_sync) {
5087 upl_flags |= UPL_FORCE_DATA_SYNC;
5088 }
5089
5090 kret = vm_map_create_upl(map,
5091 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
5092 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE);
5093
5094 if (kret != KERN_SUCCESS) {
5095 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
5096 (int)upl_offset, upl_size, io_size, kret, 0);
5097 /*
5098 * failed to get pagelist
5099 *
5100 * we may have already spun some portion of this request
5101 * off as async requests... we need to wait for the I/O
5102 * to complete before returning
5103 */
5104 goto wait_for_dreads;
5105 }
5106 pages_in_pl = upl_size / PAGE_SIZE;
5107 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
5108
5109 for (i = 0; i < pages_in_pl; i++) {
5110 if (!upl_page_present(pl, i)) {
5111 break;
5112 }
5113 }
5114 if (i == pages_in_pl) {
5115 break;
5116 }
5117
5118 ubc_upl_abort(upl, 0);
5119 }
5120 if (force_data_sync >= 3) {
5121 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
5122 (int)upl_offset, upl_size, io_size, kret, 0);
5123
5124 goto wait_for_dreads;
5125 }
5126 /*
5127 * Consider the possibility that upl_size wasn't satisfied.
5128 */
5129 if (upl_size < upl_needed_size) {
5130 if (upl_size && upl_offset == 0) {
5131 io_size = upl_size;
5132 } else {
5133 io_size = 0;
5134 }
5135 }
5136 if (io_size == 0) {
5137 ubc_upl_abort(upl, 0);
5138 goto wait_for_dreads;
5139 }
5140 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
5141 (int)upl_offset, upl_size, io_size, kret, 0);
5142
5143 if (useVectorUPL) {
5144 vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
5145 if (end_off) {
5146 issueVectorUPL = 1;
5147 }
5148 /*
5149 * After this point, if we are using a vector UPL, then
5150 * either all the UPL elements end on a page boundary OR
5151 * this UPL is the last element because it does not end
5152 * on a page boundary.
5153 */
5154 }
5155
5156 /*
5157 * request asynchronously so that we can overlap
5158 * the preparation of the next I/O
5159 * if there are already too many outstanding reads
5160 * wait until some have completed before issuing the next read
5161 */
5162 cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct");
5163
5164 if (iostate.io_error) {
5165 /*
5166 * one of the earlier reads we issued ran into a hard error
5167 * don't issue any more reads, cleanup the UPL
5168 * that was just created but not used, then
5169 * go wait for any other reads to complete before
5170 * returning the error to the caller
5171 */
5172 ubc_upl_abort(upl, 0);
5173
5174 goto wait_for_dreads;
5175 }
5176 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
5177 upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
5178
5179 if (!useVectorUPL) {
5180 if (no_zero_fill) {
5181 io_flag &= ~CL_PRESERVE;
5182 } else {
5183 io_flag |= CL_PRESERVE;
5184 }
5185
5186 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
5187 } else {
5188 if (!vector_upl_index) {
5189 vector_upl = vector_upl_create(upl_offset, uio->uio_iovcnt);
5190 v_upl_uio_offset = uio->uio_offset;
5191 vector_upl_offset = upl_offset;
5192 }
5193
5194 vector_upl_set_subupl(vector_upl, upl, upl_size);
5195 vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
5196 vector_upl_index++;
5197 vector_upl_size += upl_size;
5198 vector_upl_iosize += io_size;
5199
5200 if (issueVectorUPL || vector_upl_index == vector_upl_max_upls(vector_upl) || vector_upl_size >= max_vector_size) {
5201 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
5202 reset_vector_run_state();
5203 }
5204 }
5205
5206 if (lock) {
5207 // We don't need to wait for the I/O to complete
5208 cluster_unlock_direct_read(lock);
5209 lock = NULL;
5210 }
5211
5212 /*
5213 * update the uio structure
5214 */
5215 if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) {
5216 uio_update(uio, (user_size_t)max_io_size);
5217 } else {
5218 uio_update(uio, (user_size_t)io_size);
5219 }
5220
5221 io_req_size -= io_size;
5222
5223 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
5224 upl, (int)uio->uio_offset, io_req_size, retval, 0);
5225 } /* end while */
5226
5227 if (retval == 0 && iostate.io_error == 0 && io_req_size == 0 && uio->uio_offset < filesize) {
5228 retval = cluster_io_type(uio, read_type, read_length, 0);
5229
5230 if (retval == 0 && *read_type == IO_DIRECT) {
5231 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
5232 (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
5233
5234 goto next_dread;
5235 }
5236 }
5237
5238wait_for_dreads:
5239
5240 if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
5241 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
5242 reset_vector_run_state();
5243 }
5244
5245 // We don't need to wait for the I/O to complete
5246 if (lock) {
5247 cluster_unlock_direct_read(lock);
5248 }
5249
5250 /*
5251 * make sure all async reads that are part of this stream
5252 * have completed before we return
5253 */
5254 cluster_iostate_wait(&iostate, 0, "cluster_read_direct");
5255
5256 if (iostate.io_error) {
5257 retval = iostate.io_error;
5258 }
5259
5260 lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
5261
5262 if (io_throttled == TRUE && retval == 0) {
5263 retval = EAGAIN;
5264 }
5265
5266 vm_map_offset_t current_page_size, current_page_mask;
5267 current_page_size = vm_map_page_size(current_map());
5268 current_page_mask = vm_map_page_mask(current_map());
5269 if (uio_acct) {
5270 off_t bytes_to_prefault = 0, bytes_prefaulted = 0;
5271 user_addr_t curr_iov_base = 0;
5272 user_addr_t curr_iov_end = 0;
5273 user_size_t curr_iov_len = 0;
5274
5275 bytes_to_prefault = uio_offset(a_uio: uio) - uio_offset(a_uio: uio_acct);
5276
5277 for (; bytes_prefaulted < bytes_to_prefault;) {
5278 curr_iov_base = uio_curriovbase(a_uio: uio_acct);
5279 curr_iov_len = MIN(uio_curriovlen(uio_acct), bytes_to_prefault - bytes_prefaulted);
5280 curr_iov_end = curr_iov_base + curr_iov_len;
5281
5282 for (; curr_iov_base < curr_iov_end;) {
5283 /*
5284 * This is specifically done for pmap accounting purposes.
5285 * vm_pre_fault() will call vm_fault() to enter the page into
5286 * the pmap if there isn't _a_ physical page for that VA already.
5287 */
5288 vm_pre_fault(vm_map_trunc_page(curr_iov_base, current_page_mask), VM_PROT_READ);
5289 curr_iov_base += current_page_size;
5290 bytes_prefaulted += current_page_size;
5291 }
5292 /*
5293 * Use update instead of advance so we can see how many iovs we processed.
5294 */
5295 uio_update(a_uio: uio_acct, a_count: curr_iov_len);
5296 }
5297 uio_free(a_uio: uio_acct);
5298 uio_acct = NULL;
5299 }
5300
5301 if (io_req_size && retval == 0) {
5302 /*
5303 * we couldn't handle the tail of this request in DIRECT mode
5304 * so fire it through the copy path
5305 */
5306 if (flags & IO_ENCRYPTED) {
5307 /*
5308 * We cannot fall back to the copy path for encrypted I/O. If this
5309 * happens, there is something wrong with the user buffer passed
5310 * down.
5311 */
5312 retval = EFAULT;
5313 } else {
5314 retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg);
5315 }
5316
5317 *read_type = IO_UNKNOWN;
5318 }
5319 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
5320 (int)uio->uio_offset, (int)uio_resid(uio), io_req_size, retval, 0);
5321
5322 return retval;
5323}
5324
5325
5326static int
5327cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
5328 int (*callback)(buf_t, void *), void *callback_arg, int flags)
5329{
5330 upl_page_info_t *pl;
5331 upl_t upl[MAX_VECTS];
5332 vm_offset_t upl_offset;
5333 addr64_t dst_paddr = 0;
5334 user_addr_t iov_base;
5335 off_t max_size;
5336 upl_size_t upl_size;
5337 vm_size_t upl_needed_size;
5338 mach_msg_type_number_t pages_in_pl;
5339 upl_control_flags_t upl_flags;
5340 kern_return_t kret;
5341 struct clios iostate;
5342 int error = 0;
5343 int cur_upl = 0;
5344 int num_upl = 0;
5345 int n;
5346 u_int32_t xsize;
5347 u_int32_t io_size;
5348 u_int32_t devblocksize;
5349 u_int32_t mem_alignment_mask;
5350 u_int32_t tail_size = 0;
5351 int bflag;
5352
5353 if (flags & IO_PASSIVE) {
5354 bflag = CL_PASSIVE;
5355 } else {
5356 bflag = 0;
5357 }
5358
5359 if (flags & IO_NOCACHE) {
5360 bflag |= CL_NOCACHE;
5361 }
5362
5363 /*
5364 * When we enter this routine, we know
5365 * -- the read_length will not exceed the current iov_len
5366 * -- the target address is physically contiguous for read_length
5367 */
5368 cluster_syncup(vp, newEOF: filesize, callback, callback_arg, PUSH_SYNC);
5369
5370 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
5371 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
5372
5373 iostate.io_completed = 0;
5374 iostate.io_issued = 0;
5375 iostate.io_error = 0;
5376 iostate.io_wanted = 0;
5377
5378 lck_mtx_init(lck: &iostate.io_mtxp, grp: &cl_mtx_grp, LCK_ATTR_NULL);
5379
5380next_cread:
5381 io_size = *read_length;
5382
5383 max_size = filesize - uio->uio_offset;
5384
5385 if (io_size > max_size) {
5386 io_size = (u_int32_t)max_size;
5387 }
5388
5389 iov_base = uio_curriovbase(a_uio: uio);
5390
5391 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
5392 upl_needed_size = upl_offset + io_size;
5393
5394 pages_in_pl = 0;
5395 upl_size = (upl_size_t)upl_needed_size;
5396 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
5397
5398
5399 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_START,
5400 (int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0);
5401
5402 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
5403 kret = vm_map_get_upl(target_map: map,
5404 vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
5405 size: &upl_size, upl: &upl[cur_upl], NULL, page_infoCnt: &pages_in_pl, flags: &upl_flags, VM_KERN_MEMORY_FILE, force_data_sync: 0);
5406
5407 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_END,
5408 (int)upl_offset, upl_size, io_size, kret, 0);
5409
5410 if (kret != KERN_SUCCESS) {
5411 /*
5412 * failed to get pagelist
5413 */
5414 error = EINVAL;
5415 goto wait_for_creads;
5416 }
5417 num_upl++;
5418
5419 if (upl_size < upl_needed_size) {
5420 /*
5421 * The upl_size wasn't satisfied.
5422 */
5423 error = EINVAL;
5424 goto wait_for_creads;
5425 }
5426 pl = ubc_upl_pageinfo(upl[cur_upl]);
5427
5428 dst_paddr = ((addr64_t)upl_phys_page(upl: pl, index: 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
5429
5430 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
5431 u_int32_t head_size;
5432
5433 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
5434
5435 if (head_size > io_size) {
5436 head_size = io_size;
5437 }
5438
5439 error = cluster_align_phys_io(vp, uio, usr_paddr: dst_paddr, xsize: head_size, CL_READ, callback, callback_arg);
5440
5441 if (error) {
5442 goto wait_for_creads;
5443 }
5444
5445 upl_offset += head_size;
5446 dst_paddr += head_size;
5447 io_size -= head_size;
5448
5449 iov_base += head_size;
5450 }
5451 if ((u_int32_t)iov_base & mem_alignment_mask) {
5452 /*
5453	 * request isn't aligned to a memory boundary
5454	 * that the underlying DMA engine can handle...
5455 * return an error instead of going through
5456 * the slow copy path since the intent of this
5457 * path is direct I/O to device memory
5458 */
5459 error = EINVAL;
5460 goto wait_for_creads;
5461 }
5462
5463 tail_size = io_size & (devblocksize - 1);
5464
5465 io_size -= tail_size;
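	/*
	 * Illustrative sketch, not part of the original source: with an assumed
	 * devblocksize of 4096, a uio_offset of 10240 and an io_size of 20480,
	 * the head loop above copies 2048 bytes (4096 - (10240 & 4095)) through
	 * cluster_align_phys_io() to reach the next device block boundary; the
	 * remaining 18432 bytes then leave a tail_size of 18432 & 4095 == 2048,
	 * which is copied the same way once the async reads complete, so only
	 * the middle 16384 bytes are issued through cluster_io() below.
	 */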
5466
5467 while (io_size && error == 0) {
5468 if (io_size > MAX_IO_CONTIG_SIZE) {
5469 xsize = MAX_IO_CONTIG_SIZE;
5470 } else {
5471 xsize = io_size;
5472 }
5473 /*
5474 * request asynchronously so that we can overlap
5475 * the preparation of the next I/O... we'll do
5476 * the commit after all the I/O has completed
5477	 * since it's all issued against the same UPL...
5478	 * if there are already too many outstanding reads,
5479 * wait until some have completed before issuing the next
5480 */
5481 cluster_iostate_wait(iostate: &iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), wait_name: "cluster_read_contig");
5482
5483 if (iostate.io_error) {
5484 /*
5485	 * one of the earlier reads we issued ran into a hard error...
5486 * don't issue any more reads...
5487 * go wait for any other reads to complete before
5488 * returning the error to the caller
5489 */
5490 goto wait_for_creads;
5491 }
5492 error = cluster_io(vp, upl: upl[cur_upl], upl_offset, f_offset: uio->uio_offset, non_rounded_size: xsize,
5493 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC | bflag,
5494 real_bp: (buf_t)NULL, iostate: &iostate, callback, callback_arg);
5495 /*
5496 * The cluster_io read was issued successfully,
5497 * update the uio structure
5498 */
5499 if (error == 0) {
5500 uio_update(a_uio: uio, a_count: (user_size_t)xsize);
5501
5502 dst_paddr += xsize;
5503 upl_offset += xsize;
5504 io_size -= xsize;
5505 }
5506 }
5507 if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS && uio->uio_offset < filesize) {
5508 error = cluster_io_type(uio, io_type: read_type, io_length: read_length, min_length: 0);
5509
5510 if (error == 0 && *read_type == IO_CONTIG) {
5511 cur_upl++;
5512 goto next_cread;
5513 }
5514 } else {
5515 *read_type = IO_UNKNOWN;
5516 }
5517
5518wait_for_creads:
5519 /*
5520 * make sure all async reads that are part of this stream
5521 * have completed before we proceed
5522 */
5523 cluster_iostate_wait(iostate: &iostate, target: 0, wait_name: "cluster_read_contig");
5524
5525 if (iostate.io_error) {
5526 error = iostate.io_error;
5527 }
5528
5529 lck_mtx_destroy(lck: &iostate.io_mtxp, grp: &cl_mtx_grp);
5530
5531 if (error == 0 && tail_size) {
5532 error = cluster_align_phys_io(vp, uio, usr_paddr: dst_paddr, xsize: tail_size, CL_READ, callback, callback_arg);
5533 }
5534
5535 for (n = 0; n < num_upl; n++) {
5536 /*
5537 * just release our hold on each physically contiguous
5538 * region without changing any state
5539 */
5540 ubc_upl_abort(upl[n], 0);
5541 }
5542
5543 return error;
5544}
5545
5546
5547static int
5548cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length)
5549{
5550 user_size_t iov_len;
5551 user_addr_t iov_base = 0;
5552 upl_t upl;
5553 upl_size_t upl_size;
5554 upl_control_flags_t upl_flags;
5555 int retval = 0;
5556
5557 /*
5558	 * skip over any empty vectors
5559 */
5560 uio_update(a_uio: uio, a_count: (user_size_t)0);
5561
5562 iov_len = uio_curriovlen(a_uio: uio);
5563
5564 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_START, uio, (int)iov_len, 0, 0, 0);
5565
5566 if (iov_len) {
5567 iov_base = uio_curriovbase(a_uio: uio);
5568 /*
5569 * make sure the size of the vector isn't too big...
5570 * internally, we want to handle all of the I/O in
5571 * chunk sizes that fit in a 32 bit int
5572 */
5573 if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE) {
5574 upl_size = MAX_IO_REQUEST_SIZE;
5575 } else {
5576 upl_size = (u_int32_t)iov_len;
5577 }
5578
5579 upl_flags = UPL_QUERY_OBJECT_TYPE;
5580
5581 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
5582 if ((vm_map_get_upl(target_map: map,
5583 vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
5584 size: &upl_size, upl: &upl, NULL, NULL, flags: &upl_flags, VM_KERN_MEMORY_FILE, force_data_sync: 0)) != KERN_SUCCESS) {
5585 /*
5586 * the user app must have passed in an invalid address
5587 */
5588 retval = EFAULT;
5589 }
5590 if (upl_size == 0) {
5591 retval = EFAULT;
5592 }
5593
5594 *io_length = upl_size;
5595
5596 if (upl_flags & UPL_PHYS_CONTIG) {
5597 *io_type = IO_CONTIG;
5598 } else if (iov_len >= min_length) {
5599 *io_type = IO_DIRECT;
5600 } else {
5601 *io_type = IO_COPY;
5602 }
5603 } else {
5604 /*
5605 * nothing left to do for this uio
5606 */
5607 *io_length = 0;
5608 *io_type = IO_UNKNOWN;
5609 }
5610 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, iov_base, *io_type, *io_length, retval, 0);
5611
5612 if (*io_type == IO_DIRECT &&
5613 vm_map_page_shift(map: current_map()) < PAGE_SHIFT) {
5614 /* no direct I/O for sub-page-size address spaces */
5615 DEBUG4K_VFS("io_type IO_DIRECT -> IO_COPY\n");
5616 *io_type = IO_COPY;
5617 }
5618
5619 return retval;
5620}
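/*
 * Classification sketch (illustrative, not from the original source): given a
 * non-empty current iovec, cluster_io_type() asks the VM for the backing
 * object type via UPL_QUERY_OBJECT_TYPE and then picks one of three paths:
 *
 *	physically contiguous backing (UPL_PHYS_CONTIG)  -> IO_CONTIG
 *	iov_len >= min_length                            -> IO_DIRECT
 *	otherwise                                        -> IO_COPY
 *
 * so, for example, an ordinary malloc'd user buffer large enough to meet
 * min_length would normally come back IO_DIRECT (unless the sub-page-size
 * address-space check above downgrades it to IO_COPY), while a buffer backed
 * by physically contiguous device memory comes back IO_CONTIG.  The callers
 * then route the request to the contig, direct, or copy read/write paths.
 */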
5621
5622
5623/*
5624 * generate advisory I/O's in the largest chunks possible
5625 * the completed pages will be released into the VM cache
5626 */
5627int
5628advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
5629{
5630 return advisory_read_ext(vp, filesize, f_offset, resid, NULL, NULL, CL_PASSIVE);
5631}
5632
5633int
5634advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
5635{
5636 upl_page_info_t *pl;
5637 upl_t upl = NULL;
5638 vm_offset_t upl_offset;
5639 int upl_size;
5640 off_t upl_f_offset;
5641 int start_offset;
5642 int start_pg;
5643 int last_pg;
5644 int pages_in_upl;
5645 off_t max_size;
5646 int io_size;
5647 kern_return_t kret;
5648 int retval = 0;
5649 int issued_io;
5650 int skip_range;
5651 uint32_t max_io_size;
5652
5653
5654 if (!UBCINFOEXISTS(vp)) {
5655 return EINVAL;
5656 }
5657
5658 if (f_offset < 0 || resid < 0) {
5659 return EINVAL;
5660 }
5661
5662 max_io_size = cluster_max_io_size(mp: vp->v_mount, CL_READ);
5663
5664 if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
5665 if (max_io_size > speculative_prefetch_max_iosize) {
5666 max_io_size = speculative_prefetch_max_iosize;
5667 }
5668 }
5669
5670 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
5671 (int)f_offset, resid, (int)filesize, 0, 0);
5672
5673 while (resid && f_offset < filesize && retval == 0) {
5674 /*
5675 * compute the size of the upl needed to encompass
5676 * the requested read... limit each call to cluster_io
5677 * to the maximum UPL size... cluster_io will clip if
5678	 * this exceeds the maximum io_size for the device;
5679 * make sure to account for
5680 * a starting offset that's not page aligned
5681 */
5682 start_offset = (int)(f_offset & PAGE_MASK_64);
5683 upl_f_offset = f_offset - (off_t)start_offset;
5684 max_size = filesize - f_offset;
5685
5686 if (resid < max_size) {
5687 io_size = resid;
5688 } else {
5689 io_size = (int)max_size;
5690 }
5691
5692 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
5693 if ((uint32_t)upl_size > max_io_size) {
5694 upl_size = max_io_size;
5695 }
5696
5697 skip_range = 0;
5698 /*
5699 * return the number of contiguously present pages in the cache
5700 * starting at upl_f_offset within the file
5701 */
5702 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
5703
5704 if (skip_range) {
5705 /*
5706 * skip over pages already present in the cache
5707 */
5708 io_size = skip_range - start_offset;
5709
5710 f_offset += io_size;
5711 resid -= io_size;
5712
5713 if (skip_range == upl_size) {
5714 continue;
5715 }
5716 /*
5717 * have to issue some real I/O
5718 * at this point, we know it's starting on a page boundary
5719 * because we've skipped over at least the first page in the request
5720 */
5721 start_offset = 0;
5722 upl_f_offset += skip_range;
5723 upl_size -= skip_range;
5724 }
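		/*
		 * Worked example (illustrative): with 4KB pages, suppose f_offset is
		 * 0x1800 (start_offset 0x800, upl_f_offset 0x1000), upl_size is 0x10000,
		 * and ubc_range_op() reports that the first 0x4000 bytes at upl_f_offset
		 * are already resident (skip_range == 0x4000).  The block above advances
		 * f_offset by 0x3800 (skip_range - start_offset) to the page-aligned
		 * 0x5000, trims resid by the same amount, and retries the remaining
		 * 0xC000 bytes of the upl with start_offset reset to 0.
		 */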
5725 pages_in_upl = upl_size / PAGE_SIZE;
5726
5727 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
5728 upl, (int)upl_f_offset, upl_size, start_offset, 0);
5729
5730 kret = ubc_create_upl_kernel(vp,
5731 upl_f_offset,
5732 upl_size,
5733 &upl,
5734 &pl,
5735 UPL_RET_ONLY_ABSENT | UPL_SET_LITE,
5736 VM_KERN_MEMORY_FILE);
5737 if (kret != KERN_SUCCESS) {
5738 return retval;
5739 }
5740 issued_io = 0;
5741
5742 /*
5743 * before we start marching forward, we must make sure we end on
5744 * a present page, otherwise we will be working with a freed
5745 * upl
5746 */
5747 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
5748 if (upl_page_present(upl: pl, index: last_pg)) {
5749 break;
5750 }
5751 }
5752 pages_in_upl = last_pg + 1;
5753
5754
5755 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
5756 upl, (int)upl_f_offset, upl_size, start_offset, 0);
5757
5758
5759 for (last_pg = 0; last_pg < pages_in_upl;) {
5760 /*
5761 * scan from the beginning of the upl looking for the first
5762 * page that is present.... this will become the first page in
5763 * the request we're going to make to 'cluster_io'... if all
5764 * of the pages are absent, we won't call through to 'cluster_io'
5765 */
5766 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
5767 if (upl_page_present(upl: pl, index: start_pg)) {
5768 break;
5769 }
5770 }
5771
5772 /*
5773 * scan from the starting present page looking for an absent
5774 * page before the end of the upl is reached, if we
5775 * find one, then it will terminate the range of pages being
5776 * presented to 'cluster_io'
5777 */
5778 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
5779 if (!upl_page_present(upl: pl, index: last_pg)) {
5780 break;
5781 }
5782 }
5783
5784 if (last_pg > start_pg) {
5785 /*
5786 * we found a range of pages that must be filled
5787 * if the last page in this range is the last page of the file
5788 * we may have to clip the size of it to keep from reading past
5789 * the end of the last physical block associated with the file
5790 */
5791 upl_offset = start_pg * PAGE_SIZE;
5792 io_size = (last_pg - start_pg) * PAGE_SIZE;
5793
5794 if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) {
5795 io_size = (int)(filesize - (upl_f_offset + upl_offset));
5796 }
5797
5798 /*
5799 * issue an asynchronous read to cluster_io
5800 */
5801 retval = cluster_io(vp, upl, upl_offset, f_offset: upl_f_offset + upl_offset, non_rounded_size: io_size,
5802 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE | bflag, real_bp: (buf_t)NULL, iostate: (struct clios *)NULL, callback, callback_arg);
5803
5804 issued_io = 1;
5805 }
5806 }
5807 if (issued_io == 0) {
5808 ubc_upl_abort(upl, 0);
5809 }
5810
5811 io_size = upl_size - start_offset;
5812
5813 if (io_size > resid) {
5814 io_size = resid;
5815 }
5816 f_offset += io_size;
5817 resid -= io_size;
5818 }
5819
5820 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
5821 (int)f_offset, resid, retval, 0, 0);
5822
5823 return retval;
5824}
5825
5826
5827int
5828cluster_push(vnode_t vp, int flags)
5829{
5830 return cluster_push_ext(vp, flags, NULL, NULL);
5831}
5832
5833
5834int
5835cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg)
5836{
5837 return cluster_push_err(vp, flags, callback, callback_arg, NULL);
5838}
5839
5840/* write errors via err, but return the number of clusters written */
5841extern uint32_t system_inshutdown;
5842uint32_t cl_sparse_push_error = 0;
5843int
5844cluster_push_err(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg, int *err)
5845{
5846 int retval;
5847 int my_sparse_wait = 0;
5848 struct cl_writebehind *wbp;
5849 int local_err = 0;
5850
5851 if (err) {
5852 *err = 0;
5853 }
5854
5855 if (!UBCINFOEXISTS(vp)) {
5856 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -1, 0);
5857 return 0;
5858 }
5859 /* return if deferred write is set */
5860 if (((unsigned int)vfs_flags(mp: vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
5861 return 0;
5862 }
5863 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
5864 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -2, 0);
5865 return 0;
5866 }
5867 if (!ISSET(flags, IO_SYNC) && wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
5868 lck_mtx_unlock(lck: &wbp->cl_lockw);
5869
5870 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -3, 0);
5871 return 0;
5872 }
5873 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
5874 wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
5875
5876 /*
5877 * if we have an fsync in progress, we don't want to allow any additional
5878 * sync/fsync/close(s) to occur until it finishes.
5879	 * note that it's possible for writes to continue to occur to this file
5880 * while we're waiting and also once the fsync starts to clean if we're
5881 * in the sparse map case
5882 */
5883 while (wbp->cl_sparse_wait) {
5884 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);
5885
5886 msleep(chan: (caddr_t)&wbp->cl_sparse_wait, mtx: &wbp->cl_lockw, PRIBIO + 1, wmesg: "cluster_push_ext", NULL);
5887
5888 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
5889 }
5890 if (flags & IO_SYNC) {
5891 my_sparse_wait = 1;
5892 wbp->cl_sparse_wait = 1;
5893
5894 /*
5895 * this is an fsync (or equivalent)... we must wait for any existing async
5896	 * cleaning operations to complete before we evaluate the current state
5897	 * and finish cleaning... this ensures that all writes issued before this
5898 * fsync actually get cleaned to the disk before this fsync returns
5899 */
5900 while (wbp->cl_sparse_pushes) {
5901 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);
5902
5903 msleep(chan: (caddr_t)&wbp->cl_sparse_pushes, mtx: &wbp->cl_lockw, PRIBIO + 1, wmesg: "cluster_push_ext", NULL);
5904
5905 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
5906 }
5907 }
5908 if (wbp->cl_scmap) {
5909 void *scmap;
5910
5911 if (wbp->cl_sparse_pushes < SPARSE_PUSH_LIMIT) {
5912 scmap = wbp->cl_scmap;
5913 wbp->cl_scmap = NULL;
5914
5915 wbp->cl_sparse_pushes++;
5916
5917 lck_mtx_unlock(lck: &wbp->cl_lockw);
5918
5919 retval = sparse_cluster_push(wbp, cmapp: &scmap, vp, EOF: ubc_getsize(vp), PUSH_ALL, io_flags: flags, callback, callback_arg, FALSE);
5920
5921 lck_mtx_lock(lck: &wbp->cl_lockw);
5922
5923 wbp->cl_sparse_pushes--;
5924
5925 if (retval) {
5926 if (wbp->cl_scmap != NULL) {
5927 /*
5928 * panic("cluster_push_err: Expected NULL cl_scmap\n");
5929 *
5930 * This can happen if we get an error from the underlying FS
5931 * e.g. ENOSPC, EPERM or EIO etc. We hope that these errors
5932 * are transient and the I/Os will succeed at a later point.
5933 *
5934	 * The tricky part here is that a new sparse cluster has been
5935	 * allocated and is tracking a different set of dirty pages. So these
5936 * pages are not going to be pushed out with the next sparse_cluster_push.
5937 * An explicit msync or file close will, however, push the pages out.
5938 *
5939	 * If even those calls don't work, then during shutdown we keep
5940	 * trying until we succeed...
5941 */
5942
5943 if (system_inshutdown) {
5944 if ((retval == ENOSPC) && (vp->v_mount->mnt_flag & (MNT_LOCAL | MNT_REMOVABLE)) == MNT_LOCAL) {
5945 os_atomic_inc(&cl_sparse_push_error, relaxed);
5946 }
5947 } else {
5948 vfs_drt_control(cmapp: &scmap, op_type: 0); /* emit stats and free this memory. Dirty pages stay intact. */
5949 scmap = NULL;
5950 }
5951 } else {
5952 wbp->cl_scmap = scmap;
5953 }
5954 }
5955
5956 if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0) {
5957 wakeup(chan: (caddr_t)&wbp->cl_sparse_pushes);
5958 }
5959 } else {
5960 retval = sparse_cluster_push(wbp, cmapp: &(wbp->cl_scmap), vp, EOF: ubc_getsize(vp), PUSH_ALL, io_flags: flags, callback, callback_arg, FALSE);
5961 }
5962
5963 local_err = retval;
5964
5965 if (err) {
5966 *err = retval;
5967 }
5968 retval = 1;
5969 } else {
5970 retval = cluster_try_push(wbp, vp, EOF: ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, err: &local_err, FALSE);
5971 if (err) {
5972 *err = local_err;
5973 }
5974 }
5975 lck_mtx_unlock(lck: &wbp->cl_lockw);
5976
5977 if (flags & IO_SYNC) {
5978 (void)vnode_waitforwrites(vp, output_target: 0, slpflag: 0, slptimeout: 0, msg: "cluster_push");
5979 }
5980
5981 if (my_sparse_wait) {
5982 /*
5983 * I'm the owner of the serialization token
5984 * clear it and wakeup anyone that is waiting
5985 * for me to finish
5986 */
5987 lck_mtx_lock(lck: &wbp->cl_lockw);
5988
5989 wbp->cl_sparse_wait = 0;
5990 wakeup(chan: (caddr_t)&wbp->cl_sparse_wait);
5991
5992 lck_mtx_unlock(lck: &wbp->cl_lockw);
5993 }
5994 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
5995 wbp->cl_scmap, wbp->cl_number, retval, local_err, 0);
5996
5997 return retval;
5998}
5999
6000
6001__private_extern__ void
6002cluster_release(struct ubc_info *ubc)
6003{
6004 struct cl_writebehind *wbp;
6005 struct cl_readahead *rap;
6006
6007 if ((wbp = ubc->cl_wbehind)) {
6008 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, wbp->cl_scmap, 0, 0, 0);
6009
6010 if (wbp->cl_scmap) {
6011 vfs_drt_control(cmapp: &(wbp->cl_scmap), op_type: 0);
6012 }
6013 lck_mtx_destroy(lck: &wbp->cl_lockw, grp: &cl_mtx_grp);
6014 zfree(cl_wr_zone, wbp);
6015 ubc->cl_wbehind = NULL;
6016 } else {
6017 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, 0, 0, 0, 0);
6018 }
6019
6020 if ((rap = ubc->cl_rahead)) {
6021 lck_mtx_destroy(lck: &rap->cl_lockr, grp: &cl_mtx_grp);
6022 zfree(cl_rd_zone, rap);
6023 ubc->cl_rahead = NULL;
6024 }
6025
6026 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, ubc, rap, wbp, 0, 0);
6027}
6028
6029
6030static int
6031cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg, int *err, boolean_t vm_initiated)
6032{
6033 int cl_index;
6034 int cl_index1;
6035 int min_index;
6036 int cl_len;
6037 int cl_pushed = 0;
6038 struct cl_wextent l_clusters[MAX_CLUSTERS];
6039 u_int max_cluster_pgcount;
6040 int error = 0;
6041
6042 max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
6043 /*
6044 * the write behind context exists and has
6045 * already been locked...
6046 */
6047 if (wbp->cl_number == 0) {
6048 /*
6049 * no clusters to push
6050 * return number of empty slots
6051 */
6052 return MAX_CLUSTERS;
6053 }
6054
6055 /*
6056 * make a local 'sorted' copy of the clusters
6057 * and clear wbp->cl_number so that new clusters can
6058 * be developed
6059 */
6060 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
6061 for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
6062 if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr) {
6063 continue;
6064 }
6065 if (min_index == -1) {
6066 min_index = cl_index1;
6067 } else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr) {
6068 min_index = cl_index1;
6069 }
6070 }
6071 if (min_index == -1) {
6072 break;
6073 }
6074
6075 l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
6076 l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
6077 l_clusters[cl_index].io_flags = wbp->cl_clusters[min_index].io_flags;
6078
6079 wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
6080 }
6081 wbp->cl_number = 0;
6082
6083 cl_len = cl_index;
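	/*
	 * Illustrative sketch, not from the original source: the loop above is a
	 * simple selection sort.  If wbp->cl_clusters held extents starting at
	 * pages { 300, 100, 200 }, the first pass copies the cluster at 100 into
	 * l_clusters[0] and empties its source slot by setting b_addr == e_addr,
	 * the second pass picks 200 and the third picks 300, leaving l_clusters
	 * ordered by b_addr and cl_len == 3.
	 */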
6084
6085 /* skip switching to the sparse cluster mechanism if on diskimage */
6086 if (((push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS) &&
6087 !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)) {
6088 int i;
6089
6090 /*
6091 * determine if we appear to be writing the file sequentially
6092 * if not, by returning without having pushed any clusters
6093 * we will cause this vnode to be pushed into the sparse cluster mechanism
6094 * used for managing more random I/O patterns
6095 *
6096 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
6097 * that's why we're in try_push with PUSH_DELAY...
6098 *
6099 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
6100	 * is adjacent to the next (i.e. we're looking for sequential writes)... they were sorted above,
6101	 * so we can just make a simple pass through, up to, but not including, the last one...
6102 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
6103 * are sequential
6104 *
6105 * we let the last one be partial as long as it was adjacent to the previous one...
6106	 * we need to do this to deal with multi-threaded servers that might write an I/O or two out
6107 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
6108 */
6109 for (i = 0; i < MAX_CLUSTERS - 1; i++) {
6110 if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != max_cluster_pgcount) {
6111 goto dont_try;
6112 }
6113 if (l_clusters[i].e_addr != l_clusters[i + 1].b_addr) {
6114 goto dont_try;
6115 }
6116 }
6117 }
6118 if (vm_initiated == TRUE) {
6119 lck_mtx_unlock(lck: &wbp->cl_lockw);
6120 }
6121
6122 for (cl_index = 0; cl_index < cl_len; cl_index++) {
6123 int flags;
6124 struct cl_extent cl;
6125 int retval;
6126
6127 flags = io_flags & (IO_PASSIVE | IO_CLOSE);
6128
6129 /*
6130 * try to push each cluster in turn...
6131 */
6132 if (l_clusters[cl_index].io_flags & CLW_IONOCACHE) {
6133 flags |= IO_NOCACHE;
6134 }
6135
6136 if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE) {
6137 flags |= IO_PASSIVE;
6138 }
6139
6140 if (push_flag & PUSH_SYNC) {
6141 flags |= IO_SYNC;
6142 }
6143
6144 cl.b_addr = l_clusters[cl_index].b_addr;
6145 cl.e_addr = l_clusters[cl_index].e_addr;
6146
6147 retval = cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg, vm_ioitiated: vm_initiated);
6148
6149 if (retval == 0) {
6150 cl_pushed++;
6151
6152 l_clusters[cl_index].b_addr = 0;
6153 l_clusters[cl_index].e_addr = 0;
6154 } else if (error == 0) {
6155 error = retval;
6156 }
6157
6158 if (!(push_flag & PUSH_ALL)) {
6159 break;
6160 }
6161 }
6162 if (vm_initiated == TRUE) {
6163 lck_mtx_lock(lck: &wbp->cl_lockw);
6164 }
6165
6166 if (err) {
6167 *err = error;
6168 }
6169
6170dont_try:
6171 if (cl_len > cl_pushed) {
6172 /*
6173 * we didn't push all of the clusters, so
6174 * lets try to merge them back in to the vnode
6175 */
6176 if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
6177 /*
6178 * we picked up some new clusters while we were trying to
6179 * push the old ones... this can happen because I've dropped
6180 * the vnode lock... the sum of the
6181 * leftovers plus the new cluster count exceeds our ability
6182 * to represent them, so switch to the sparse cluster mechanism
6183 *
6184 * collect the active public clusters...
6185 */
6186 sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);
6187
6188 for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
6189 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) {
6190 continue;
6191 }
6192 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
6193 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
6194 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
6195
6196 cl_index1++;
6197 }
6198 /*
6199 * update the cluster count
6200 */
6201 wbp->cl_number = cl_index1;
6202
6203 /*
6204 * and collect the original clusters that were moved into the
6205 * local storage for sorting purposes
6206 */
6207 sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);
6208 } else {
6209 /*
6210 * we've got room to merge the leftovers back in
6211 * just append them starting at the next 'hole'
6212 * represented by wbp->cl_number
6213 */
6214 for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
6215 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) {
6216 continue;
6217 }
6218
6219 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
6220 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
6221 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
6222
6223 cl_index1++;
6224 }
6225 /*
6226 * update the cluster count
6227 */
6228 wbp->cl_number = cl_index1;
6229 }
6230 }
6231 return MAX_CLUSTERS - wbp->cl_number;
6232}
6233
6234
6235
6236static int
6237cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags,
6238 int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6239{
6240 upl_page_info_t *pl;
6241 upl_t upl;
6242 vm_offset_t upl_offset;
6243 int upl_size;
6244 off_t upl_f_offset;
6245 int pages_in_upl;
6246 int start_pg;
6247 int last_pg;
6248 int io_size;
6249 int io_flags;
6250 int upl_flags;
6251 int bflag;
6252 int size;
6253 int error = 0;
6254 int retval;
6255 kern_return_t kret;
6256
6257 if (flags & IO_PASSIVE) {
6258 bflag = CL_PASSIVE;
6259 } else {
6260 bflag = 0;
6261 }
6262
6263 if (flags & IO_SKIP_ENCRYPTION) {
6264 bflag |= CL_ENCRYPTED;
6265 }
6266
6267 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
6268 (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
6269
6270 if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
6271 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
6272
6273 return 0;
6274 }
6275 upl_size = pages_in_upl * PAGE_SIZE;
6276 upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
6277
6278 if (upl_f_offset + upl_size >= EOF) {
6279 if (upl_f_offset >= EOF) {
6280 /*
6281 * must have truncated the file and missed
6282 * clearing a dangling cluster (i.e. it's completely
6283	 * beyond the new EOF)
6284 */
6285 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
6286
6287 return 0;
6288 }
6289 size = (int)(EOF - upl_f_offset);
6290
6291 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
6292 pages_in_upl = upl_size / PAGE_SIZE;
6293 } else {
6294 size = upl_size;
6295 }
6296
6297
6298 if (vm_initiated) {
6299 vnode_pageout(vp, NULL, (upl_offset_t)0, upl_f_offset, (upl_size_t)upl_size,
6300 UPL_MSYNC | UPL_VNODE_PAGER | UPL_KEEPCACHED, &error);
6301
6302 return error;
6303 }
6304 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
6305
6306 /*
6307 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
6308 *
6309 * - only pages that are currently dirty are returned... these are the ones we need to clean
6310 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
6311 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
6312 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
6313 * someone dirties this page while the I/O is in progress, we don't lose track of the new state
6314 *
6315 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
6316 */
6317
6318 if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE)) {
6319 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
6320 } else {
6321 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
6322 }
6323
6324 kret = ubc_create_upl_kernel(vp,
6325 upl_f_offset,
6326 upl_size,
6327 &upl,
6328 &pl,
6329 upl_flags,
6330 VM_KERN_MEMORY_FILE);
6331 if (kret != KERN_SUCCESS) {
6332 panic("cluster_push: failed to get pagelist");
6333 }
6334
6335 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, upl, upl_f_offset, 0, 0, 0);
6336
6337 /*
6338 * since we only asked for the dirty pages back
6339 * it's possible that we may only get a few or even none, so...
6340 * before we start marching forward, we must make sure we know
6341 * where the last present page is in the UPL, otherwise we could
6342 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
6343 * employed by commit_range and abort_range.
6344 */
6345 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
6346 if (upl_page_present(upl: pl, index: last_pg)) {
6347 break;
6348 }
6349 }
6350 pages_in_upl = last_pg + 1;
6351
6352 if (pages_in_upl == 0) {
6353 ubc_upl_abort(upl, 0);
6354
6355 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
6356 return 0;
6357 }
6358
6359 for (last_pg = 0; last_pg < pages_in_upl;) {
6360 /*
6361 * find the next dirty page in the UPL
6362 * this will become the first page in the
6363 * next I/O to generate
6364 */
6365 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
6366 if (upl_dirty_page(upl: pl, index: start_pg)) {
6367 break;
6368 }
6369 if (upl_page_present(upl: pl, index: start_pg)) {
6370 /*
6371 * RET_ONLY_DIRTY will return non-dirty 'precious' pages
6372 * just release these unchanged since we're not going
6373 * to steal them or change their state
6374 */
6375 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
6376 }
6377 }
6378 if (start_pg >= pages_in_upl) {
6379 /*
6380 * done... no more dirty pages to push
6381 */
6382 break;
6383 }
6384 if (start_pg > last_pg) {
6385 /*
6386 * skipped over some non-dirty pages
6387 */
6388 size -= ((start_pg - last_pg) * PAGE_SIZE);
6389 }
6390
6391 /*
6392 * find a range of dirty pages to write
6393 */
6394 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
6395 if (!upl_dirty_page(upl: pl, index: last_pg)) {
6396 break;
6397 }
6398 }
6399 upl_offset = start_pg * PAGE_SIZE;
6400
6401 io_size = min(a: size, b: (last_pg - start_pg) * PAGE_SIZE);
6402
6403 io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | bflag;
6404
6405 if (!(flags & IO_SYNC)) {
6406 io_flags |= CL_ASYNC;
6407 }
6408
6409 if (flags & IO_CLOSE) {
6410 io_flags |= CL_CLOSE;
6411 }
6412
6413 if (flags & IO_NOCACHE) {
6414 io_flags |= CL_NOCACHE;
6415 }
6416
6417 retval = cluster_io(vp, upl, upl_offset, f_offset: upl_f_offset + upl_offset, non_rounded_size: io_size,
6418 flags: io_flags, real_bp: (buf_t)NULL, iostate: (struct clios *)NULL, callback, callback_arg);
6419
6420 if (error == 0 && retval) {
6421 error = retval;
6422 }
6423
6424 size -= io_size;
6425 }
6426 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, error, 0, 0);
6427
6428 return error;
6429}
6430
6431
6432/*
6433 * sparse_cluster_switch is called with the write behind lock held
6434 */
6435static int
6436sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6437{
6438 int cl_index;
6439 int error = 0;
6440
6441 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, 0, 0);
6442
6443 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
6444 int flags;
6445 struct cl_extent cl;
6446
6447 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
6448 if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) {
6449 if (flags & UPL_POP_DIRTY) {
6450 cl.e_addr = cl.b_addr + 1;
6451
6452 error = sparse_cluster_add(wbp, cmapp: &(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg, vm_initiated);
6453
6454 if (error) {
6455 break;
6456 }
6457 }
6458 }
6459 }
6460 }
6461 wbp->cl_number -= cl_index;
6462
6463 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, error, 0);
6464
6465 return error;
6466}
6467
6468
6469/*
6470 * sparse_cluster_push must be called with the write-behind lock held if the scmap is
6471 * still associated with the write-behind context... however, if the scmap has been disassociated
6472 * from the write-behind context (the cluster_push case), the wb lock is not held
6473 */
6474static int
6475sparse_cluster_push(struct cl_writebehind *wbp, void **scmap, vnode_t vp, off_t EOF, int push_flag,
6476 int io_flags, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6477{
6478 struct cl_extent cl;
6479 off_t offset;
6480 u_int length;
6481 void *l_scmap;
6482 int error = 0;
6483
6484 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, kdebug_vnode(vp), (*scmap), 0, push_flag, 0);
6485
6486 if (push_flag & PUSH_ALL) {
6487 vfs_drt_control(cmapp: scmap, op_type: 1);
6488 }
6489
6490 l_scmap = *scmap;
6491
6492 for (;;) {
6493 int retval;
6494
6495 if (vfs_drt_get_cluster(cmapp: scmap, offsetp: &offset, lengthp: &length) != KERN_SUCCESS) {
6496 /*
6497 * Not finding anything to push will return KERN_FAILURE.
6498	 * This is confusing, since it isn't really a failure; that's the
6499	 * reason we don't set 'error' here like we do below.
6500 */
6501 break;
6502 }
6503
6504 if (vm_initiated == TRUE) {
6505 lck_mtx_unlock(lck: &wbp->cl_lockw);
6506 }
6507
6508 cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
6509 cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
6510
6511 retval = cluster_push_now(vp, cl: &cl, EOF, flags: io_flags, callback, callback_arg, vm_initiated);
6512 if (error == 0 && retval) {
6513 error = retval;
6514 }
6515
6516 if (vm_initiated == TRUE) {
6517 lck_mtx_lock(lck: &wbp->cl_lockw);
6518
6519 if (*scmap != l_scmap) {
6520 break;
6521 }
6522 }
6523
6524 if (error) {
6525 if (vfs_drt_mark_pages(cmapp: scmap, offset, length, NULL) != KERN_SUCCESS) {
6526 panic("Failed to restore dirty state on failure");
6527 }
6528
6529 break;
6530 }
6531
6532 if (!(push_flag & PUSH_ALL)) {
6533 break;
6534 }
6535 }
6536 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);
6537
6538 return error;
6539}
6540
6541
6542/*
6543 * sparse_cluster_add is called with the write behind lock held
6544 */
6545static int
6546sparse_cluster_add(struct cl_writebehind *wbp, void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF,
6547 int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6548{
6549 u_int new_dirty;
6550 u_int length;
6551 off_t offset;
6552 int error = 0;
6553 int push_flag = 0; /* Is this a valid value? */
6554
6555 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0);
6556
6557 offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
6558 length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
6559
6560 while (vfs_drt_mark_pages(cmapp: scmap, offset, length, setcountp: &new_dirty) != KERN_SUCCESS) {
6561 /*
6562 * no room left in the map
6563 * only a partial update was done
6564 * push out some pages and try again
6565 */
6566
6567 if (vfs_get_scmap_push_behavior_internal(cmapp: scmap, push_flag: &push_flag)) {
6568 push_flag = 0;
6569 }
6570
6571 error = sparse_cluster_push(wbp, scmap, vp, EOF, push_flag, io_flags: 0, callback, callback_arg, vm_initiated);
6572
6573 if (error) {
6574 break;
6575 }
6576
6577 offset += (new_dirty * PAGE_SIZE_64);
6578 length -= (new_dirty * PAGE_SIZE);
6579 }
6580 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);
6581
6582 return error;
6583}
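/*
 * Worked example (illustrative, assuming 4KB pages): a cl_extent with
 * b_addr == 100 and e_addr == 164 maps to offset == 409600 and
 * length == 262144 (64 pages).  If vfs_drt_mark_pages() runs out of room after
 * processing only the first 32 pages (new_dirty == 32), sparse_cluster_push()
 * is called above to drain part of the map, offset advances by 131072, length
 * drops to 131072, and the loop retries the remaining pages.
 */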
6584
6585
6586static int
6587cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
6588{
6589 upl_page_info_t *pl;
6590 upl_t upl;
6591 addr64_t ubc_paddr;
6592 kern_return_t kret;
6593 int error = 0;
6594 int did_read = 0;
6595 int abort_flags;
6596 int upl_flags;
6597 int bflag;
6598
6599 if (flags & IO_PASSIVE) {
6600 bflag = CL_PASSIVE;
6601 } else {
6602 bflag = 0;
6603 }
6604
6605 if (flags & IO_NOCACHE) {
6606 bflag |= CL_NOCACHE;
6607 }
6608
6609 upl_flags = UPL_SET_LITE;
6610
6611 if (!(flags & CL_READ)) {
6612 /*
6613 * "write" operation: let the UPL subsystem know
6614 * that we intend to modify the buffer cache pages
6615 * we're gathering.
6616 */
6617 upl_flags |= UPL_WILL_MODIFY;
6618 } else {
6619 /*
6620 * indicate that there is no need to pull the
6621 * mapping for this page... we're only going
6622 * to read from it, not modify it.
6623 */
6624 upl_flags |= UPL_FILE_IO;
6625 }
6626 kret = ubc_create_upl_kernel(vp,
6627 uio->uio_offset & ~PAGE_MASK_64,
6628 PAGE_SIZE,
6629 &upl,
6630 &pl,
6631 upl_flags,
6632 VM_KERN_MEMORY_FILE);
6633
6634 if (kret != KERN_SUCCESS) {
6635 return EINVAL;
6636 }
6637
6638 if (!upl_valid_page(upl: pl, index: 0)) {
6639 /*
6640 * issue a synchronous read to cluster_io
6641 */
6642 error = cluster_io(vp, upl, upl_offset: 0, f_offset: uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
6643 CL_READ | bflag, real_bp: (buf_t)NULL, iostate: (struct clios *)NULL, callback, callback_arg);
6644 if (error) {
6645 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
6646
6647 return error;
6648 }
6649 did_read = 1;
6650 }
6651 ubc_paddr = ((addr64_t)upl_phys_page(upl: pl, index: 0) << PAGE_SHIFT) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
6652
6653/*
6654 * NOTE: There is no prototype for the following in BSD. It, and the definitions
6655 * of cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc, can be found in
6656 * osfmk/ppc/mappings.h. They are not included here because there appears to be no
6657 * way to do so without exporting them to kexts as well.
6658 */
6659 if (flags & CL_READ) {
6660// copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */
6661 copypv(source: ubc_paddr, sink: usr_paddr, size: xsize, which: 2 | 1 | 4); /* Copy physical to physical and flush the destination */
6662 } else {
6663// copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */
6664 copypv(source: usr_paddr, sink: ubc_paddr, size: xsize, which: 2 | 1 | 8); /* Copy physical to physical and flush the source */
6665 }
6666 if (!(flags & CL_READ) || (upl_valid_page(upl: pl, index: 0) && upl_dirty_page(upl: pl, index: 0))) {
6667 /*
6668 * issue a synchronous write to cluster_io
6669 */
6670 error = cluster_io(vp, upl, upl_offset: 0, f_offset: uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
6671 flags: bflag, real_bp: (buf_t)NULL, iostate: (struct clios *)NULL, callback, callback_arg);
6672 }
6673 if (error == 0) {
6674 uio_update(a_uio: uio, a_count: (user_size_t)xsize);
6675 }
6676
6677 if (did_read) {
6678 abort_flags = UPL_ABORT_FREE_ON_EMPTY;
6679 } else {
6680 abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
6681 }
6682
6683 ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
6684
6685 return error;
6686}
6687
6688int
6689cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid)
6690{
6691 int pg_offset;
6692 int pg_index;
6693 int csize;
6694 int segflg;
6695 int retval = 0;
6696 int xsize;
6697 upl_page_info_t *pl;
6698 int dirty_count;
6699
6700 xsize = *io_resid;
6701
6702 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
6703 (int)uio->uio_offset, upl_offset, xsize, 0, 0);
6704
6705 segflg = uio->uio_segflg;
6706
6707 switch (segflg) {
6708 case UIO_USERSPACE32:
6709 case UIO_USERISPACE32:
6710 uio->uio_segflg = UIO_PHYS_USERSPACE32;
6711 break;
6712
6713 case UIO_USERSPACE:
6714 case UIO_USERISPACE:
6715 uio->uio_segflg = UIO_PHYS_USERSPACE;
6716 break;
6717
6718 case UIO_USERSPACE64:
6719 case UIO_USERISPACE64:
6720 uio->uio_segflg = UIO_PHYS_USERSPACE64;
6721 break;
6722
6723 case UIO_SYSSPACE:
6724 uio->uio_segflg = UIO_PHYS_SYSSPACE;
6725 break;
6726 }
6727 pl = ubc_upl_pageinfo(upl);
6728
6729 pg_index = upl_offset / PAGE_SIZE;
6730 pg_offset = upl_offset & PAGE_MASK;
6731 csize = min(PAGE_SIZE - pg_offset, b: xsize);
6732
6733 dirty_count = 0;
6734 while (xsize && retval == 0) {
6735 addr64_t paddr;
6736
6737 paddr = ((addr64_t)upl_phys_page(upl: pl, index: pg_index) << PAGE_SHIFT) + pg_offset;
6738 if ((uio->uio_rw == UIO_WRITE) && (upl_dirty_page(upl: pl, index: pg_index) == FALSE)) {
6739 dirty_count++;
6740 }
6741
6742 retval = uiomove64(cp: paddr, n: csize, uio);
6743
6744 pg_index += 1;
6745 pg_offset = 0;
6746 xsize -= csize;
6747 csize = min(PAGE_SIZE, b: xsize);
6748 }
6749 *io_resid = xsize;
6750
6751 uio->uio_segflg = segflg;
6752
6753 if (dirty_count) {
6754 task_update_logical_writes(task: current_task(), io_size: (dirty_count * PAGE_SIZE), TASK_WRITE_DEFERRED, vp: upl_lookup_vnode(upl));
6755 }
6756
6757 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6758 (int)uio->uio_offset, xsize, retval, segflg, 0);
6759
6760 return retval;
6761}
6762
6763
6764int
6765cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
6766{
6767 return cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, take_reference: 1);
6768}
6769
6770
6771static int
6772cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference)
6773{
6774 int segflg;
6775 int io_size;
6776 int xsize;
6777 int start_offset;
6778 int retval = 0;
6779 memory_object_control_t control;
6780
6781 io_size = *io_resid;
6782
6783 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
6784 (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0);
6785
6786 control = ubc_getobject(vp, UBC_FLAGS_NONE);
6787
6788 if (control == MEMORY_OBJECT_CONTROL_NULL) {
6789 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6790 (int)uio->uio_offset, io_size, retval, 3, 0);
6791
6792 return 0;
6793 }
6794 segflg = uio->uio_segflg;
6795
6796 switch (segflg) {
6797 case UIO_USERSPACE32:
6798 case UIO_USERISPACE32:
6799 uio->uio_segflg = UIO_PHYS_USERSPACE32;
6800 break;
6801
6802 case UIO_USERSPACE64:
6803 case UIO_USERISPACE64:
6804 uio->uio_segflg = UIO_PHYS_USERSPACE64;
6805 break;
6806
6807 case UIO_USERSPACE:
6808 case UIO_USERISPACE:
6809 uio->uio_segflg = UIO_PHYS_USERSPACE;
6810 break;
6811
6812 case UIO_SYSSPACE:
6813 uio->uio_segflg = UIO_PHYS_SYSSPACE;
6814 break;
6815 }
6816
6817 if ((io_size = *io_resid)) {
6818 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
6819 xsize = (int)uio_resid(a_uio: uio);
6820
6821 retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio,
6822 start_offset, io_size, mark_dirty, take_reference);
6823 xsize -= uio_resid(a_uio: uio);
6824
6825 int num_bytes_copied = xsize;
6826 if (num_bytes_copied && uio_rw(a_uio: uio)) {
6827 task_update_logical_writes(task: current_task(), io_size: num_bytes_copied, TASK_WRITE_DEFERRED, vp);
6828 }
6829 io_size -= xsize;
6830 }
6831 uio->uio_segflg = segflg;
6832 *io_resid = io_size;
6833
6834 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6835 (int)uio->uio_offset, io_size, retval, 0x80000000 | segflg, 0);
6836
6837 return retval;
6838}
6839
6840
6841int
6842is_file_clean(vnode_t vp, off_t filesize)
6843{
6844 off_t f_offset;
6845 int flags;
6846 int total_dirty = 0;
6847
6848 for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
6849 if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) {
6850 if (flags & UPL_POP_DIRTY) {
6851 total_dirty++;
6852 }
6853 }
6854 }
6855 if (total_dirty) {
6856 return EINVAL;
6857 }
6858
6859 return 0;
6860}
6861
6862
6863
6864/*
6865 * Dirty region tracking/clustering mechanism.
6866 *
6867 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
6868 * dirty regions within a larger space (file). It is primarily intended to
6869 * support clustering in large files with many dirty areas.
6870 *
6871 * The implementation assumes that the dirty regions are pages.
6872 *
6873 * To represent dirty pages within the file, we store bit vectors in a
6874 * variable-size circular hash.
6875 */
6876
6877/*
6878 * Bitvector size. This determines the number of pages we group in a
6879 * single hashtable entry. Each hashtable entry is aligned to this
6880 * size within the file.
6881 */
6882#define DRT_BITVECTOR_PAGES ((1024 * 256) / PAGE_SIZE)
6883
6884/*
6885 * File offset handling.
6886 *
6887 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
6888 * the correct formula is (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
6889 */
6890#define DRT_ADDRESS_MASK (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
6891#define DRT_ALIGN_ADDRESS(addr) ((addr) & DRT_ADDRESS_MASK)
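/*
 * Worked example (illustrative): with 4KB pages, DRT_BITVECTOR_PAGES is 64,
 * so DRT_BITVECTOR_PAGES * PAGE_SIZE is 0x40000 (256KB) and DRT_ADDRESS_MASK
 * is ~0x3ffff.  DRT_ALIGN_ADDRESS(0x123456) therefore yields 0x100000, the
 * start of the 256KB window covered by the corresponding hash entry's
 * bitvector.
 */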
6892
6893/*
6894 * Hashtable address field handling.
6895 *
6896 * The low-order bits of the hashtable address are used to conserve
6897 * space.
6898 *
6899 * DRT_HASH_COUNT_MASK must be large enough to store the range
6900 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
6901 * to indicate that the bucket is actually unoccupied.
6902 */
6903#define DRT_HASH_GET_ADDRESS(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
6904#define DRT_HASH_SET_ADDRESS(scm, i, a) \
6905 do { \
6906 (scm)->scm_hashtable[(i)].dhe_control = \
6907 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
6908 } while (0)
6909#define DRT_HASH_COUNT_MASK 0x1ff
6910#define DRT_HASH_GET_COUNT(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
6911#define DRT_HASH_SET_COUNT(scm, i, c) \
6912 do { \
6913 (scm)->scm_hashtable[(i)].dhe_control = \
6914 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \
6915 } while (0)
6916#define DRT_HASH_CLEAR(scm, i) \
6917 do { \
6918 (scm)->scm_hashtable[(i)].dhe_control = 0; \
6919 } while (0)
6920#define DRT_HASH_VACATE(scm, i) DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
6921#define DRT_HASH_VACANT(scm, i) (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
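/*
 * Layout sketch (illustrative): dhe_control packs two fields.  The high-order
 * bits (selected by DRT_ADDRESS_MASK) hold the 256KB-aligned file offset the
 * entry covers, while the low 9 bits (DRT_HASH_COUNT_MASK == 0x1ff) hold the
 * count of dirty pages in the entry, 0..DRT_BITVECTOR_PAGES.  The reserved
 * count value 0x1ff marks a vacant bucket, which is why DRT_HASH_VACATE()
 * simply stores DRT_HASH_COUNT_MASK as the count and DRT_HASH_VACANT() tests
 * for it.
 */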
6922#define DRT_HASH_COPY(oscm, oi, scm, i) \
6923 do { \
6924 (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control; \
6925 DRT_BITVECTOR_COPY(oscm, oi, scm, i); \
6926 } while(0);
6927
6928
6929#if !defined(XNU_TARGET_OS_OSX)
6930/*
6931 * Hash table moduli.
6932 *
6933 * Since the hashtable entry's size is dependent on the size of
6934 * the bitvector, and since the hashtable size is constrained to
6935 * both being prime and fitting within the desired allocation
6936 * size, these values need to be manually determined.
6937 *
6938 * For DRT_BITVECTOR_SIZE = 64, the entry size is 16 bytes.
6939 *
6940 * The small hashtable allocation is 4096 bytes, so the modulus is 251.
6941 * The large hashtable allocation is 32768 bytes, so the modulus is 2039.
6942 * The xlarge hashtable allocation is 131072 bytes, so the modulus is 8179.
6943 */
6944
6945#define DRT_HASH_SMALL_MODULUS 251
6946#define DRT_HASH_LARGE_MODULUS 2039
6947#define DRT_HASH_XLARGE_MODULUS 8179
6948
6949/*
6950 * Physical memory required before the large hash modulus is permitted.
6951 *
6952 * On small memory systems, the large hash modulus can lead to physical
6953 * memory starvation, so we avoid using it there.
6954 */
6955#define DRT_HASH_LARGE_MEMORY_REQUIRED (1024LL * 1024LL * 1024LL) /* 1GiB */
6956#define DRT_HASH_XLARGE_MEMORY_REQUIRED (8 * 1024LL * 1024LL * 1024LL) /* 8GiB */
6957
6958#define DRT_SMALL_ALLOCATION 4096 /* 80 bytes spare */
6959#define DRT_LARGE_ALLOCATION 32768 /* 144 bytes spare */
6960#define DRT_XLARGE_ALLOCATION 131072 /* 208 bytes spare */
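/*
 * Sizing sketch (illustrative): each vfs_drt_hashentry is 16 bytes (an 8-byte
 * dhe_control plus an 8-byte bitvector covering 64 pages), so the 4096-byte
 * small allocation holds 251 entries (251 * 16 == 4016, 80 bytes left for the
 * vfs_drt_clustermap header), the 32768-byte large allocation holds 2039
 * entries (32624 bytes, 144 spare), and the 131072-byte xlarge allocation
 * holds 8179 entries (130864 bytes, 208 spare), matching the spare-byte notes
 * above.
 */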
6961
6962#else /* XNU_TARGET_OS_OSX */
6963/*
6964 * Hash table moduli.
6965 *
6966 * Since the hashtable entry's size is dependent on the size of
6967 * the bitvector, and since the hashtable size is constrained to
6968 * both being prime and fitting within the desired allocation
6969 * size, these values need to be manually determined.
6970 *
6971 * For DRT_BITVECTOR_SIZE = 64, the entry size is 16 bytes.
6972 *
6973 * The small hashtable allocation is 16384 bytes, so the modulus is 1019.
6974 * The large hashtable allocation is 131072 bytes, so the modulus is 8179.
6975 * The xlarge hashtable allocation is 524288 bytes, so the modulus is 32749.
6976 */
6977
6978#define DRT_HASH_SMALL_MODULUS 1019
6979#define DRT_HASH_LARGE_MODULUS 8179
6980#define DRT_HASH_XLARGE_MODULUS 32749
6981
6982/*
6983 * Physical memory required before the large hash modulus is permitted.
6984 *
6985 * On small memory systems, the large hash modulus can lead to physical
6986 * memory starvation, so we avoid using it there.
6987 */
6988#define DRT_HASH_LARGE_MEMORY_REQUIRED (4 * 1024LL * 1024LL * 1024LL) /* 4GiB */
6989#define DRT_HASH_XLARGE_MEMORY_REQUIRED (32 * 1024LL * 1024LL * 1024LL) /* 32GiB */
6990
6991#define DRT_SMALL_ALLOCATION 16384 /* 80 bytes spare */
6992#define DRT_LARGE_ALLOCATION 131072 /* 208 bytes spare */
6993#define DRT_XLARGE_ALLOCATION 524288 /* 304 bytes spare */
6994
6995#endif /* ! XNU_TARGET_OS_OSX */
6996
6997/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
6998
6999/*
7000 * Hashtable entry.
7001 */
7002struct vfs_drt_hashentry {
7003 u_int64_t dhe_control;
7004/*
7005 * dhe_bitvector was declared as dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
7006 * DRT_BITVECTOR_PAGES is defined as ((1024 * 256) / PAGE_SIZE)
7007 * Since PAGE_SIZE is only known at boot time,
7008 * -define MAX_DRT_BITVECTOR_PAGES for smallest supported page size (4k)
7009 * -declare dhe_bitvector array for largest possible length
7010 */
7011#define MAX_DRT_BITVECTOR_PAGES ((1024 * 256) / (4 * 1024))
7012 u_int32_t dhe_bitvector[MAX_DRT_BITVECTOR_PAGES / 32];
7013};
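/*
 * Size check (illustrative): MAX_DRT_BITVECTOR_PAGES is 64, so dhe_bitvector
 * is two u_int32_t words (8 bytes) and the whole entry is 16 bytes, which is
 * the entry size the modulus/allocation comments above assume.  On a 16KB-page
 * configuration only the first 16 bits of the bitvector are actually used
 * (DRT_BITVECTOR_PAGES == 16), but the entry keeps its worst-case layout.
 */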
7014
7015/*
7016 * Hashtable bitvector handling.
7017 *
7018 * Bitvector fields are 32 bits long.
7019 */
7020
7021#define DRT_HASH_SET_BIT(scm, i, bit) \
7022 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))
7023
7024#define DRT_HASH_CLEAR_BIT(scm, i, bit) \
7025 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))
7026
7027#define DRT_HASH_TEST_BIT(scm, i, bit) \
7028 ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))
7029
7030#define DRT_BITVECTOR_CLEAR(scm, i) \
7031 bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
7032
7033#define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \
7034 bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \
7035 &(scm)->scm_hashtable[(i)].dhe_bitvector[0], \
7036 (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
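/*
 * Indexing sketch (illustrative): the 'bit' argument is the page index within
 * the entry's 256KB window, i.e. (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE.
 * With 4KB pages, page 37 of a window lives in word 37 / 32 == 1 at bit
 * position 37 % 32 == 5, so DRT_HASH_SET_BIT(scm, i, 37) ORs 0x20 into
 * scm_hashtable[i].dhe_bitvector[1].
 */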
7037
7038/*
7039 * Dirty Region Tracking structure.
7040 *
7041 * The hashtable is allocated entirely inside the DRT structure.
7042 *
7043 * The hash is a simple circular prime modulus arrangement; the structure
7044 * is resized from small to large if it overflows.
7045 */
7046
7047struct vfs_drt_clustermap {
7048 u_int32_t scm_magic; /* sanity/detection */
7049#define DRT_SCM_MAGIC 0x12020003
7050 u_int32_t scm_modulus; /* current ring size */
7051 u_int32_t scm_buckets; /* number of occupied buckets */
7052 u_int32_t scm_lastclean; /* last entry we cleaned */
7053 u_int32_t scm_iskips; /* number of slot skips */
7054
7055 struct vfs_drt_hashentry scm_hashtable[0];
7056};
7057
7058
7059#define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus)
7060#define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus)
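/*
 * Probing sketch (illustrative): DRT_HASH() reduces an address modulo the
 * current ring size and collisions are resolved by walking DRT_HASH_NEXT()
 * circularly.  For example, with the small modulus of 251 an address that
 * hashes to bucket 250 probes buckets 250, 0, 1, ... until a matching or
 * vacant bucket is found; the scm_iskips counter ("number of slot skips")
 * records these extra hops.
 */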

/*
 * Debugging codes and arguments.
 */
#define DRT_DEBUG_EMPTYFREE     (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
#define DRT_DEBUG_RETCLUSTER    (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
#define DRT_DEBUG_ALLOC         (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
#define DRT_DEBUG_INSERT        (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
#define DRT_DEBUG_MARK          (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length, dirty */
                                                           /* 0, setcount */
                                                           /* 1 (clean, no map) */
                                                           /* 2 (map alloc fail) */
                                                           /* 3, resid (partial) */
#define DRT_DEBUG_6             (FSDBG_CODE(DBG_FSRW, 87))
#define DRT_DEBUG_SCMDATA       (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets, lastclean, iskips */


static kern_return_t    vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
static kern_return_t    vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
static kern_return_t    vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
    u_int64_t offset, int *indexp);
static kern_return_t    vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
    u_int64_t offset,
    int *indexp,
    int recursed);
static kern_return_t    vfs_drt_do_mark_pages(
    void **cmapp,
    u_int64_t offset,
    u_int length,
    u_int *setcountp,
    int dirty);
static void             vfs_drt_trace(
    struct vfs_drt_clustermap *cmap,
    int code,
    int arg1,
    int arg2,
    int arg3,
    int arg4);


/*
 * Allocate and initialise a sparse cluster map.
 *
 * Will allocate a new map, or resize or compact an existing map.
 *
 * XXX we should probably have at least one intermediate map size,
 * as the 1:16 ratio seems a bit drastic.
 */
static kern_return_t
vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
{
    struct vfs_drt_clustermap *cmap = NULL, *ocmap = NULL;
    kern_return_t kret = KERN_SUCCESS;
    u_int64_t offset = 0;
    u_int32_t i = 0;
    int modulus_size = 0, map_size = 0, active_buckets = 0, index = 0, copycount = 0;

    ocmap = NULL;
    if (cmapp != NULL) {
        ocmap = *cmapp;
    }

    /*
     * Decide on the size of the new map.
     */
    if (ocmap == NULL) {
        modulus_size = DRT_HASH_SMALL_MODULUS;
        map_size = DRT_SMALL_ALLOCATION;
    } else {
        /* count the number of active buckets in the old map */
        active_buckets = 0;
        for (i = 0; i < ocmap->scm_modulus; i++) {
            if (!DRT_HASH_VACANT(ocmap, i) &&
                (DRT_HASH_GET_COUNT(ocmap, i) != 0)) {
                active_buckets++;
            }
        }
        /*
         * If we're currently using the small allocation, check to
         * see whether we should grow to the large one.
         */
        if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
            /*
             * If the ring is nearly full and we are allowed to
             * use the large modulus, upgrade.
             */
            if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) &&
                (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) {
                modulus_size = DRT_HASH_LARGE_MODULUS;
                map_size = DRT_LARGE_ALLOCATION;
            } else {
                modulus_size = DRT_HASH_SMALL_MODULUS;
                map_size = DRT_SMALL_ALLOCATION;
            }
        } else if (ocmap->scm_modulus == DRT_HASH_LARGE_MODULUS) {
            if ((active_buckets > (DRT_HASH_LARGE_MODULUS - 5)) &&
                (max_mem >= DRT_HASH_XLARGE_MEMORY_REQUIRED)) {
                modulus_size = DRT_HASH_XLARGE_MODULUS;
                map_size = DRT_XLARGE_ALLOCATION;
            } else {
                /*
                 * If the ring is completely full and we can't
                 * expand, there's nothing useful for us to do.
                 * Behave as though we had compacted into the new
                 * array and return.
                 */
                return KERN_SUCCESS;
            }
        } else {
            /* already using the xlarge modulus */
            modulus_size = DRT_HASH_XLARGE_MODULUS;
            map_size = DRT_XLARGE_ALLOCATION;

            /*
             * If the ring is completely full, there's
             * nothing useful for us to do.  Behave as
             * though we had compacted into the new
             * array and return.
             */
            if (active_buckets >= DRT_HASH_XLARGE_MODULUS) {
                return KERN_SUCCESS;
            }
        }
    }

    /*
     * Allocate and initialise the new map.
     */

    kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap, map_size,
        KMA_DATA, VM_KERN_MEMORY_FILE);
    if (kret != KERN_SUCCESS) {
        return kret;
    }
    cmap->scm_magic = DRT_SCM_MAGIC;
    cmap->scm_modulus = modulus_size;
    cmap->scm_buckets = 0;
    cmap->scm_lastclean = 0;
    cmap->scm_iskips = 0;
    for (i = 0; i < cmap->scm_modulus; i++) {
        DRT_HASH_CLEAR(cmap, i);
        DRT_HASH_VACATE(cmap, i);
        DRT_BITVECTOR_CLEAR(cmap, i);
    }

    /*
     * If there's an old map, re-hash entries from it into the new map.
     */
    copycount = 0;
    if (ocmap != NULL) {
        for (i = 0; i < ocmap->scm_modulus; i++) {
            /* skip empty buckets */
            if (DRT_HASH_VACANT(ocmap, i) ||
                (DRT_HASH_GET_COUNT(ocmap, i) == 0)) {
                continue;
            }
            /* get new index */
            offset = DRT_HASH_GET_ADDRESS(ocmap, i);
            kret = vfs_drt_get_index(&cmap, offset, &index, 1);
            if (kret != KERN_SUCCESS) {
                /* XXX need to bail out gracefully here */
                panic("vfs_drt: new cluster map mysteriously too small");
                index = 0;
            }
            /* copy */
            DRT_HASH_COPY(ocmap, i, cmap, index);
            copycount++;
        }
    }

    /* log what we've done */
    vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);

    /*
     * It's important to ensure that *cmapp always points to
     * a valid map, so we must overwrite it before freeing
     * the old map.
     */
    *cmapp = cmap;
    if (ocmap != NULL) {
        /* emit stats into trace buffer */
        vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
            ocmap->scm_modulus,
            ocmap->scm_buckets,
            ocmap->scm_lastclean,
            ocmap->scm_iskips);

        vfs_drt_free_map(ocmap);
    }
    return KERN_SUCCESS;
}


/*
 * Free a sparse cluster map.
 */
static kern_return_t
vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
{
    vm_size_t map_size = 0;

    if (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
        map_size = DRT_SMALL_ALLOCATION;
    } else if (cmap->scm_modulus == DRT_HASH_LARGE_MODULUS) {
        map_size = DRT_LARGE_ALLOCATION;
    } else if (cmap->scm_modulus == DRT_HASH_XLARGE_MODULUS) {
        map_size = DRT_XLARGE_ALLOCATION;
    } else {
        panic("vfs_drt_free_map: Invalid modulus %d", cmap->scm_modulus);
    }

    kmem_free(kernel_map, (vm_offset_t)cmap, map_size);
    return KERN_SUCCESS;
}


/*
 * Find the hashtable slot currently occupied by an entry for the supplied offset.
 */
static kern_return_t
vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
{
    int index;
    u_int32_t i;

    offset = DRT_ALIGN_ADDRESS(offset);
    index = DRT_HASH(cmap, offset);

    /* traverse the hashtable */
    for (i = 0; i < cmap->scm_modulus; i++) {
        /*
         * If the slot is vacant, we can stop.
         */
        if (DRT_HASH_VACANT(cmap, index)) {
            break;
        }

        /*
         * If the address matches our offset, we have success.
         */
        if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
            *indexp = index;
            return KERN_SUCCESS;
        }

        /*
         * Move to the next slot, try again.
         */
        index = DRT_HASH_NEXT(cmap, index);
    }
    /*
     * It's not there.
     */
    return KERN_FAILURE;
}

/*
 * Find the hashtable slot for the supplied offset.  If we haven't allocated
 * one yet, allocate one and populate the address field.  Note that the new
 * entry will have a zero page count and thus will still technically be free,
 * so in the case where we are called to clean pages, the slot will remain
 * free.
 */
static kern_return_t
vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
{
    struct vfs_drt_clustermap *cmap;
    kern_return_t kret;
    u_int32_t index;
    u_int32_t i;

    cmap = *cmapp;

    /* look for an existing entry */
    kret = vfs_drt_search_index(cmap, offset, indexp);
    if (kret == KERN_SUCCESS) {
        return kret;
    }

    /* need to allocate an entry */
    offset = DRT_ALIGN_ADDRESS(offset);
    index = DRT_HASH(cmap, offset);

    /* scan from the index forwards looking for a vacant slot */
    for (i = 0; i < cmap->scm_modulus; i++) {
        /* slot vacant? */
        if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap, index) == 0) {
            cmap->scm_buckets++;
            if (index < cmap->scm_lastclean) {
                cmap->scm_lastclean = index;
            }
            DRT_HASH_SET_ADDRESS(cmap, index, offset);
            DRT_HASH_SET_COUNT(cmap, index, 0);
            DRT_BITVECTOR_CLEAR(cmap, index);
            *indexp = index;
            vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
            return KERN_SUCCESS;
        }
        cmap->scm_iskips += i;
        index = DRT_HASH_NEXT(cmap, index);
    }

    /*
     * We haven't found a vacant slot, so the map is full.  If we're not
     * already recursed, try reallocating/compacting it.
     */
    if (recursed) {
        return KERN_FAILURE;
    }
    kret = vfs_drt_alloc_map(cmapp);
    if (kret == KERN_SUCCESS) {
        /* now try to insert again */
        kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
    }
    return kret;
}

/*
 * Implementation of set dirty/clean.
 *
 * In the 'clean' case, not finding a map is OK.
 */
static kern_return_t
vfs_drt_do_mark_pages(
    void **private,
    u_int64_t offset,
    u_int length,
    u_int *setcountp,
    int dirty)
{
    struct vfs_drt_clustermap *cmap, **cmapp;
    kern_return_t kret;
    int i, index, pgoff, pgcount, setcount, ecount;

    cmapp = (struct vfs_drt_clustermap **)private;
    cmap = *cmapp;

    vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);

    if (setcountp != NULL) {
        *setcountp = 0;
    }

    /* allocate a cluster map if we don't already have one */
    if (cmap == NULL) {
        /* no cluster map, nothing to clean */
        if (!dirty) {
            vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
            return KERN_SUCCESS;
        }
        kret = vfs_drt_alloc_map(cmapp);
        if (kret != KERN_SUCCESS) {
            vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
            return kret;
        }
    }
    setcount = 0;

    /*
     * Iterate over the length of the region.
     */
    while (length > 0) {
        /*
         * Get the hashtable index for this offset.
         *
         * XXX this will add blank entries if we are clearing a range
         * that hasn't been dirtied.
         */
        kret = vfs_drt_get_index(cmapp, offset, &index, 0);
        cmap = *cmapp;  /* may have changed! */
        /* this may be a partial-success return */
        if (kret != KERN_SUCCESS) {
            if (setcountp != NULL) {
                *setcountp = setcount;
            }
            vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);

            return kret;
        }

        /*
         * Work out how many pages we're modifying in this
         * hashtable entry.
         */
        pgoff = (int)((offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE);
        pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));

        /*
         * Iterate over pages, dirty/clearing as we go.
         */
        ecount = DRT_HASH_GET_COUNT(cmap, index);
        for (i = 0; i < pgcount; i++) {
            if (dirty) {
                if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
                    if (ecount >= DRT_BITVECTOR_PAGES) {
                        panic("ecount >= DRT_BITVECTOR_PAGES, cmap = %p, index = %d, bit = %d", cmap, index, pgoff + i);
                    }
                    DRT_HASH_SET_BIT(cmap, index, pgoff + i);
                    ecount++;
                    setcount++;
                }
            } else {
                if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
                    if (ecount <= 0) {
                        panic("ecount <= 0, cmap = %p, index = %d, bit = %d", cmap, index, pgoff + i);
                    }
                    assert(ecount > 0);
                    DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
                    ecount--;
                    setcount++;
                }
            }
        }
        DRT_HASH_SET_COUNT(cmap, index, ecount);

        offset += pgcount * PAGE_SIZE;
        length -= pgcount * PAGE_SIZE;
    }
    if (setcountp != NULL) {
        *setcountp = setcount;
    }

    vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);

    return KERN_SUCCESS;
}
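
/*
 * Illustrative sketch (not part of the original source): an example of the
 * pgoff/pgcount arithmetic above, assuming 4K pages and 64-page (256KB)
 * clusters, with DRT_ALIGN_ADDRESS rounding down to the cluster base.  An
 * offset 3 pages into a cluster gives pgoff = 3, and a 1MB length is clipped
 * to the 61 pages remaining in that cluster.  Kept under #if 0.
 */
#if 0
static void
drt_example_span(void)
{
    u_int64_t offset = 0x43000;      /* 3 pages past the 0x40000 cluster base */
    u_int     length = 1024 * 1024;  /* 1MB of dirty data */
    int       pgoff, pgcount, remaining;

    /* page offset within the 64-page cluster: (0x43000 % 0x40000) / 0x1000 == 3 */
    pgoff = (int)((offset % (64 * 4096)) / 4096);

    /* pages handled from this entry: min(256, 64 - 3) == 61 */
    remaining = 64 - pgoff;
    pgcount = ((int)(length / 4096) < remaining) ? (int)(length / 4096) : remaining;
    (void)pgcount;
}
#endif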

/*
 * Mark a set of pages as dirty/clean.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage suitable for holding a pointer.  Note that
 *	this must either be NULL or a value set by this function.
 *
 * offset
 *	Offset of the first page to be marked as dirty, in bytes.  Must be
 *	page-aligned.
 *
 * length
 *	Length of the dirty region, in bytes.  Must be a multiple of PAGE_SIZE.
 *
 * setcountp
 *	Number of pages newly marked dirty by this call (optional).
 *
 * Returns KERN_SUCCESS if all the pages were successfully marked.
 */
static kern_return_t
vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp)
{
    return vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1);
}
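
/*
 * Illustrative sketch (not part of the original source): a caller keeps a
 * single opaque pointer (initially NULL) and lets vfs_drt_mark_pages()
 * allocate and grow the map on demand.  The names here are hypothetical and
 * the block is kept under #if 0.
 */
#if 0
static void
drt_example_mark(void **scmapp, off_t offset, u_int length)
{
    u_int new_dirty = 0;

    /* mark [offset, offset + length) dirty; *scmapp may be updated */
    if (vfs_drt_mark_pages(scmapp, offset, length, &new_dirty) != KERN_SUCCESS) {
        printf("drt_example_mark: only some pages could be tracked\n");
    }
}
#endif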

#if 0
static kern_return_t
vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
{
    return vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
}
#endif

/*
 * Get a cluster of dirty pages.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage managed by vfs_drt_mark_pages.  Note that this must
 *	be NULL or a value set by vfs_drt_mark_pages.
 *
 * offsetp
 *	Returns the byte offset into the file of the first page in the cluster.
 *
 * lengthp
 *	Returns the length in bytes of the cluster of dirty pages.
 *
 * Returns KERN_SUCCESS if a cluster was found.  If KERN_FAILURE is returned,
 * there are no dirty pages meeting the minimum size criteria.  Private storage
 * will be released if there are no more dirty pages left in the map.
 */
static kern_return_t
vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
{
    struct vfs_drt_clustermap *cmap;
    u_int64_t offset;
    u_int length;
    u_int32_t j;
    int index, i, fs, ls;

    /* sanity */
    if ((cmapp == NULL) || (*cmapp == NULL)) {
        return KERN_FAILURE;
    }
    cmap = *cmapp;

    /* walk the hashtable */
    for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
        index = DRT_HASH(cmap, offset);

        if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0)) {
            continue;
        }

        /* scan the bitfield for a string of bits */
        fs = -1;

        for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
            if (DRT_HASH_TEST_BIT(cmap, index, i)) {
                fs = i;
                break;
            }
        }
        if (fs == -1) {
            /* didn't find any bits set */
            panic("vfs_drt: entry summary count > 0 but no bits set in map, cmap = %p, index = %d, count = %lld",
                cmap, index, DRT_HASH_GET_COUNT(cmap, index));
        }
        for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
            if (!DRT_HASH_TEST_BIT(cmap, index, i)) {
                break;
            }
        }

        /* compute offset and length, mark pages clean */
        offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
        length = ls * PAGE_SIZE;
        vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
        cmap->scm_lastclean = index;

        /* return successful */
        *offsetp = (off_t)offset;
        *lengthp = length;

        vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
        return KERN_SUCCESS;
    }
    /*
     * We didn't find anything; the hashtable is empty.  Emit stats into
     * the trace buffer and then free it.
     */
    vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
        cmap->scm_modulus,
        cmap->scm_buckets,
        cmap->scm_lastclean,
        cmap->scm_iskips);

    vfs_drt_free_map(cmap);
    *cmapp = NULL;

    return KERN_FAILURE;
}
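
/*
 * Illustrative sketch (not part of the original source): dirty clusters are
 * typically drained by calling vfs_drt_get_cluster() until it returns
 * KERN_FAILURE, at which point the map has been freed and *scmapp reset to
 * NULL.  The push callback and names are hypothetical; kept under #if 0.
 */
#if 0
static void
drt_example_drain(void **scmapp, void (*push)(off_t, u_int))
{
    off_t offset;
    u_int length;

    while (vfs_drt_get_cluster(scmapp, &offset, &length) == KERN_SUCCESS) {
        /* each returned run of pages has already been marked clean */
        push(offset, length);
    }
}
#endif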
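
/*
 * Perform control operations on a cluster map.
 *
 * This is a public interface.
 *
 * op_type 0 emits the map's statistics into the trace buffer, frees the map
 * and clears *cmapp; op_type 1 resets the clean-scan position (scm_lastclean).
 *
 * Returns KERN_FAILURE if no map is present, KERN_SUCCESS otherwise.
 */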
static kern_return_t
vfs_drt_control(void **cmapp, int op_type)
{
    struct vfs_drt_clustermap *cmap;

    /* sanity */
    if ((cmapp == NULL) || (*cmapp == NULL)) {
        return KERN_FAILURE;
    }
    cmap = *cmapp;

    switch (op_type) {
    case 0:
        /* emit stats into trace buffer */
        vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
            cmap->scm_modulus,
            cmap->scm_buckets,
            cmap->scm_lastclean,
            cmap->scm_iskips);

        vfs_drt_free_map(cmap);
        *cmapp = NULL;
        break;

    case 1:
        cmap->scm_lastclean = 0;
        break;
    }
    return KERN_SUCCESS;
}


/*
 * Emit a summary of the state of the clustermap into the trace buffer
 * along with some caller-provided data.
 */
#if KDEBUG
static void
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
{
    KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
}
#else
static void
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
    __unused int arg1, __unused int arg2, __unused int arg3,
    __unused int arg4)
{
}
#endif

#if 0
/*
 * Perform basic sanity check on the hash entry summary count
 * vs. the actual bits set in the entry.
 */
static void
vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
{
    int index, i;
    int bits_on;

    for (index = 0; index < cmap->scm_modulus; index++) {
        if (DRT_HASH_VACANT(cmap, index)) {
            continue;
        }

        for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
            if (DRT_HASH_TEST_BIT(cmap, index, i)) {
                bits_on++;
            }
        }
        if (bits_on != DRT_HASH_GET_COUNT(cmap, index)) {
            panic("bits_on = %d, index = %d", bits_on, index);
        }
    }
}
#endif

/*
 * Internal interface only.
 */
static kern_return_t
vfs_get_scmap_push_behavior_internal(void **cmapp, int *push_flag)
{
    struct vfs_drt_clustermap *cmap;

    /* sanity */
    if ((cmapp == NULL) || (*cmapp == NULL) || (push_flag == NULL)) {
        return KERN_FAILURE;
    }
    cmap = *cmapp;

    if (cmap->scm_modulus == DRT_HASH_XLARGE_MODULUS) {
        /*
         * If we have a full xlarge sparse cluster,
         * we push it out all at once so the cluster
         * map can be available to absorb more I/Os.
         * This is done on large-memory configs so
         * the small I/Os don't interfere with the
         * pro workloads.
         */
        *push_flag = PUSH_ALL;
    }
    return KERN_SUCCESS;
}