1/*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
62 */
63
64#include <sys/param.h>
65#include <sys/proc_internal.h>
66#include <sys/buf_internal.h>
67#include <sys/mount_internal.h>
68#include <sys/vnode_internal.h>
69#include <sys/trace.h>
70#include <kern/kalloc.h>
71#include <sys/time.h>
72#include <sys/kernel.h>
73#include <sys/resourcevar.h>
74#include <miscfs/specfs/specdev.h>
75#include <sys/uio_internal.h>
76#include <libkern/libkern.h>
77#include <machine/machine_routines.h>
78
79#include <sys/ubc_internal.h>
80#include <vm/vnode_pager.h>
81
82#include <mach/mach_types.h>
83#include <mach/memory_object_types.h>
84#include <mach/vm_map.h>
85#include <mach/upl.h>
86#include <kern/task.h>
87#include <kern/policy_internal.h>
88
89#include <vm/vm_kern.h>
90#include <vm/vm_map.h>
91#include <vm/vm_pageout.h>
92#include <vm/vm_fault.h>
93
94#include <sys/kdebug.h>
95#include <sys/kdebug_triage.h>
96#include <libkern/OSAtomic.h>
97
98#include <sys/sdt.h>
99
100#include <stdbool.h>
101
102#include <vfs/vfs_disk_conditioner.h>
103
104#if 0
105#undef KERNEL_DEBUG
106#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
107#endif
108
109
110#define CL_READ 0x01
111#define CL_WRITE 0x02
112#define CL_ASYNC 0x04
113#define CL_COMMIT 0x08
114#define CL_PAGEOUT 0x10
115#define CL_AGE 0x20
116#define CL_NOZERO 0x40
117#define CL_PAGEIN 0x80
118#define CL_DEV_MEMORY 0x100
119#define CL_PRESERVE 0x200
120#define CL_THROTTLE 0x400
121#define CL_KEEPCACHED 0x800
122#define CL_DIRECT_IO 0x1000
123#define CL_PASSIVE 0x2000
124#define CL_IOSTREAMING 0x4000
125#define CL_CLOSE 0x8000
126#define CL_ENCRYPTED 0x10000
127#define CL_RAW_ENCRYPTED 0x20000
128#define CL_NOCACHE 0x40000
129
130#define MAX_VECTOR_UPL_SIZE (2 * MAX_UPL_SIZE_BYTES)
131
132#define CLUSTER_IO_WAITING ((buf_t)1)
133
134extern upl_t vector_upl_create(vm_offset_t, uint32_t);
135extern uint32_t vector_upl_max_upls(upl_t);
136extern boolean_t vector_upl_is_valid(upl_t);
137extern boolean_t vector_upl_set_subupl(upl_t, upl_t, u_int32_t);
138extern void vector_upl_set_pagelist(upl_t);
139extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t);
140
141struct clios {
142 lck_mtx_t io_mtxp;
143 u_int io_completed; /* amount of io that has currently completed */
144 u_int io_issued; /* amount of io that was successfully issued */
145 int io_error; /* error code of first error encountered */
146 int io_wanted; /* someone is sleeping waiting for a change in state */
147};
148
149struct cl_direct_read_lock {
150 LIST_ENTRY(cl_direct_read_lock) chain;
151 int32_t ref_count;
152 vnode_t vp;
153 lck_rw_t rw_lock;
154};
155
156#define CL_DIRECT_READ_LOCK_BUCKETS 61
157
158static LIST_HEAD(cl_direct_read_locks, cl_direct_read_lock)
159cl_direct_read_locks[CL_DIRECT_READ_LOCK_BUCKETS];
160
161static LCK_GRP_DECLARE(cl_mtx_grp, "cluster I/O");
162static LCK_MTX_DECLARE(cl_transaction_mtxp, &cl_mtx_grp);
163static LCK_SPIN_DECLARE(cl_direct_read_spin_lock, &cl_mtx_grp);
164
165static ZONE_DEFINE(cl_rd_zone, "cluster_read",
166 sizeof(struct cl_readahead), ZC_ZFREE_CLEARMEM);
167
168static ZONE_DEFINE(cl_wr_zone, "cluster_write",
169 sizeof(struct cl_writebehind), ZC_ZFREE_CLEARMEM);
170
171#define IO_UNKNOWN 0
172#define IO_DIRECT 1
173#define IO_CONTIG 2
174#define IO_COPY 3
175
176#define PUSH_DELAY 0x01
177#define PUSH_ALL 0x02
178#define PUSH_SYNC 0x04
179
180
181static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset, size_t verify_block_size);
182static void cluster_wait_IO(buf_t cbp_head, int async);
183static void cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait);
184
185static int cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length);
186
187static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
188 int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg);
189static int cluster_iodone(buf_t bp, void *callback_arg);
190static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp);
191static int cluster_is_throttled(vnode_t vp);
192
193static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name);
194
195static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg, int flags);
196
197static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference);
198static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference);
199
200static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags,
201 int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
202static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
203 int flags, int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
204static int cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
205 int (*)(buf_t, void *), void *callback_arg, int flags) __attribute__((noinline));
206
207static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
208 off_t headOff, off_t tailOff, int flags, int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
209static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
210 int *write_type, u_int32_t *write_length, int flags, int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
211static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
212 int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag) __attribute__((noinline));
213
214static void cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes, boolean_t *first_pass,
215 off_t write_off, int write_cnt, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
216
217static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg);
218
219static int cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
220static void cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra,
221 int (*callback)(buf_t, void *), void *callback_arg, int bflag);
222
223static int cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_ioitiated);
224
225static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *),
226 void *callback_arg, int *err, boolean_t vm_initiated);
227
228static int sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
229static int sparse_cluster_push(struct cl_writebehind *, void **cmapp, vnode_t vp, off_t EOF, int push_flag,
230 int io_flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
231static int sparse_cluster_add(struct cl_writebehind *, void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF,
232 int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
233
234static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp);
235static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
236static kern_return_t vfs_drt_control(void **cmapp, int op_type);
237static kern_return_t vfs_get_scmap_push_behavior_internal(void **cmapp, int *push_flag);
238
239
240/*
241 * For throttled IO to check whether
242 * a block is cached by the boot cache
243 * and thus it can avoid delaying the IO.
244 *
245 * bootcache_contains_block is initially
246 * NULL. The BootCache will set it while
247 * the cache is active and clear it when
248 * the cache is jettisoned.
249 *
250 * Returns 0 if the block is not
251 * contained in the cache, 1 if it is
252 * contained.
253 *
254 * The function pointer remains valid
255 * after the cache has been evicted even
256 * if bootcache_contains_block has been
257 * cleared.
258 *
259 * See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs
260 */
261int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL;
262
263
264/*
265 * limit the internal I/O size so that we
266 * can represent it in a 32 bit int
267 */
268#define MAX_IO_REQUEST_SIZE (1024 * 1024 * 512)
269#define MAX_IO_CONTIG_SIZE MAX_UPL_SIZE_BYTES
270#define MAX_VECTS 16
271/*
272 * The MIN_DIRECT_WRITE_SIZE governs how much I/O should be issued before we consider
273 * allowing the caller to bypass the buffer cache. For small I/Os (less than 16k),
274 * we have not historically allowed the write to bypass the UBC.
275 */
276#define MIN_DIRECT_WRITE_SIZE (16384)
277
278#define WRITE_THROTTLE 6
279#define WRITE_THROTTLE_SSD 2
280#define WRITE_BEHIND 1
281#define WRITE_BEHIND_SSD 1
282
283#if !defined(XNU_TARGET_OS_OSX)
284#define PREFETCH 1
285#define PREFETCH_SSD 1
286uint32_t speculative_prefetch_max = (2048 * 1024); /* maximum bytes in a specluative read-ahead */
287uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use in a specluative read-ahead */
288#else /* XNU_TARGET_OS_OSX */
289#define PREFETCH 3
290#define PREFETCH_SSD 2
291uint32_t speculative_prefetch_max = (MAX_UPL_SIZE_BYTES * 3); /* maximum bytes in a specluative read-ahead */
292uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use in a specluative read-ahead on SSDs*/
293#endif /* ! XNU_TARGET_OS_OSX */
294
295/* maximum bytes for read-ahead */
296uint32_t prefetch_max = (1024 * 1024 * 1024);
297/* maximum bytes for outstanding reads */
298uint32_t overlapping_read_max = (1024 * 1024 * 1024);
299/* maximum bytes for outstanding writes */
300uint32_t overlapping_write_max = (1024 * 1024 * 1024);
301
302#define IO_SCALE(vp, base) (vp->v_mount->mnt_ioscale * (base))
303#define MAX_CLUSTER_SIZE(vp) (cluster_max_io_size(vp->v_mount, CL_WRITE))
304
305int speculative_reads_disabled = 0;
306
307/*
308 * throttle the number of async writes that
309 * can be outstanding on a single vnode
310 * before we issue a synchronous write
311 */
312#define THROTTLE_MAXCNT 0
313
314uint32_t throttle_max_iosize = (128 * 1024);
315
316#define THROTTLE_MAX_IOSIZE (throttle_max_iosize)
317
318SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_max_iosize, 0, "");
319
320
321void
322cluster_init(void)
323{
324 for (int i = 0; i < CL_DIRECT_READ_LOCK_BUCKETS; ++i) {
325 LIST_INIT(&cl_direct_read_locks[i]);
326 }
327}
328
329
330uint32_t
331cluster_max_io_size(mount_t mp, int type)
332{
333 uint32_t max_io_size;
334 uint32_t segcnt;
335 uint32_t maxcnt;
336
337 switch (type) {
338 case CL_READ:
339 segcnt = mp->mnt_segreadcnt;
340 maxcnt = mp->mnt_maxreadcnt;
341 break;
342 case CL_WRITE:
343 segcnt = mp->mnt_segwritecnt;
344 maxcnt = mp->mnt_maxwritecnt;
345 break;
346 default:
347 segcnt = min(a: mp->mnt_segreadcnt, b: mp->mnt_segwritecnt);
348 maxcnt = min(a: mp->mnt_maxreadcnt, b: mp->mnt_maxwritecnt);
349 break;
350 }
351 if (segcnt > (MAX_UPL_SIZE_BYTES >> PAGE_SHIFT)) {
352 /*
353 * don't allow a size beyond the max UPL size we can create
354 */
355 segcnt = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
356 }
357 max_io_size = min(a: (segcnt * PAGE_SIZE), b: maxcnt);
358
359 if (max_io_size < MAX_UPL_TRANSFER_BYTES) {
360 /*
361 * don't allow a size smaller than the old fixed limit
362 */
363 max_io_size = MAX_UPL_TRANSFER_BYTES;
364 } else {
365 /*
366 * make sure the size specified is a multiple of PAGE_SIZE
367 */
368 max_io_size &= ~PAGE_MASK;
369 }
370 return max_io_size;
371}
372
373/*
374 * Returns max prefetch value. If the value overflows or exceeds the specified
375 * 'prefetch_limit', it will be capped at 'prefetch_limit' value.
376 */
377static inline uint32_t
378cluster_max_prefetch(vnode_t vp, uint32_t max_io_size, uint32_t prefetch_limit)
379{
380 bool is_ssd = disk_conditioner_mount_is_ssd(vp->v_mount);
381 uint32_t io_scale = IO_SCALE(vp, is_ssd ? PREFETCH_SSD : PREFETCH);
382 uint32_t prefetch = 0;
383
384 if (__improbable(os_mul_overflow(max_io_size, io_scale, &prefetch) ||
385 (prefetch > prefetch_limit))) {
386 prefetch = prefetch_limit;
387 }
388
389 return prefetch;
390}
391
392static inline uint32_t
393calculate_max_throttle_size(vnode_t vp)
394{
395 bool is_ssd = disk_conditioner_mount_is_ssd(vp->v_mount);
396 uint32_t io_scale = IO_SCALE(vp, is_ssd ? 2 : 1);
397
398 return MIN(io_scale * THROTTLE_MAX_IOSIZE, MAX_UPL_TRANSFER_BYTES);
399}
400
401static inline uint32_t
402calculate_max_throttle_cnt(vnode_t vp)
403{
404 bool is_ssd = disk_conditioner_mount_is_ssd(vp->v_mount);
405 uint32_t io_scale = IO_SCALE(vp, 1);
406
407 return is_ssd ? MIN(io_scale, 4) : THROTTLE_MAXCNT;
408}
409
410#define CLW_ALLOCATE 0x01
411#define CLW_RETURNLOCKED 0x02
412#define CLW_IONOCACHE 0x04
413#define CLW_IOPASSIVE 0x08
414
415/*
416 * if the read ahead context doesn't yet exist,
417 * allocate and initialize it...
418 * the vnode lock serializes multiple callers
419 * during the actual assignment... first one
420 * to grab the lock wins... the other callers
421 * will release the now unnecessary storage
422 *
423 * once the context is present, try to grab (but don't block on)
424 * the lock associated with it... if someone
425 * else currently owns it, than the read
426 * will run without read-ahead. this allows
427 * multiple readers to run in parallel and
428 * since there's only 1 read ahead context,
429 * there's no real loss in only allowing 1
430 * reader to have read-ahead enabled.
431 */
432static struct cl_readahead *
433cluster_get_rap(vnode_t vp)
434{
435 struct ubc_info *ubc;
436 struct cl_readahead *rap;
437
438 ubc = vp->v_ubcinfo;
439
440 if ((rap = ubc->cl_rahead) == NULL) {
441 rap = zalloc_flags(cl_rd_zone, Z_WAITOK | Z_ZERO);
442 rap->cl_lastr = -1;
443 lck_mtx_init(lck: &rap->cl_lockr, grp: &cl_mtx_grp, LCK_ATTR_NULL);
444
445 vnode_lock(vp);
446
447 if (ubc->cl_rahead == NULL) {
448 ubc->cl_rahead = rap;
449 } else {
450 lck_mtx_destroy(lck: &rap->cl_lockr, grp: &cl_mtx_grp);
451 zfree(cl_rd_zone, rap);
452 rap = ubc->cl_rahead;
453 }
454 vnode_unlock(vp);
455 }
456 if (lck_mtx_try_lock(lck: &rap->cl_lockr) == TRUE) {
457 return rap;
458 }
459
460 return (struct cl_readahead *)NULL;
461}
462
463
464/*
465 * if the write behind context doesn't yet exist,
466 * and CLW_ALLOCATE is specified, allocate and initialize it...
467 * the vnode lock serializes multiple callers
468 * during the actual assignment... first one
469 * to grab the lock wins... the other callers
470 * will release the now unnecessary storage
471 *
472 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
473 * the lock associated with the write behind context before
474 * returning
475 */
476
477static struct cl_writebehind *
478cluster_get_wbp(vnode_t vp, int flags)
479{
480 struct ubc_info *ubc;
481 struct cl_writebehind *wbp;
482
483 ubc = vp->v_ubcinfo;
484
485 if ((wbp = ubc->cl_wbehind) == NULL) {
486 if (!(flags & CLW_ALLOCATE)) {
487 return (struct cl_writebehind *)NULL;
488 }
489
490 wbp = zalloc_flags(cl_wr_zone, Z_WAITOK | Z_ZERO);
491
492 lck_mtx_init(lck: &wbp->cl_lockw, grp: &cl_mtx_grp, LCK_ATTR_NULL);
493
494 vnode_lock(vp);
495
496 if (ubc->cl_wbehind == NULL) {
497 ubc->cl_wbehind = wbp;
498 } else {
499 lck_mtx_destroy(lck: &wbp->cl_lockw, grp: &cl_mtx_grp);
500 zfree(cl_wr_zone, wbp);
501 wbp = ubc->cl_wbehind;
502 }
503 vnode_unlock(vp);
504 }
505 if (flags & CLW_RETURNLOCKED) {
506 lck_mtx_lock(lck: &wbp->cl_lockw);
507 }
508
509 return wbp;
510}
511
512
513static void
514cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, int flags)
515{
516 struct cl_writebehind *wbp;
517
518 if ((wbp = cluster_get_wbp(vp, flags: 0)) != NULL) {
519 if (wbp->cl_number) {
520 lck_mtx_lock(lck: &wbp->cl_lockw);
521
522 cluster_try_push(wbp, vp, EOF: newEOF, PUSH_ALL | flags, flags: 0, callback, callback_arg, NULL, FALSE);
523
524 lck_mtx_unlock(lck: &wbp->cl_lockw);
525 }
526 }
527}
528
529
530static int
531cluster_io_present_in_BC(vnode_t vp, off_t f_offset)
532{
533 daddr64_t blkno;
534 size_t io_size;
535 int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block;
536
537 if (bootcache_check_fn && vp->v_mount && vp->v_mount->mnt_devvp) {
538 if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ | VNODE_BLOCKMAP_NO_TRACK, NULL)) {
539 return 0;
540 }
541
542 if (io_size == 0) {
543 return 0;
544 }
545
546 if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno)) {
547 return 1;
548 }
549 }
550 return 0;
551}
552
553
554static int
555cluster_is_throttled(vnode_t vp)
556{
557 return throttle_io_will_be_throttled(lowpri_window_msecs: -1, mp: vp->v_mount);
558}
559
560
561static void
562cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name)
563{
564 lck_mtx_lock(lck: &iostate->io_mtxp);
565
566 while ((iostate->io_issued - iostate->io_completed) > target) {
567 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
568 iostate->io_issued, iostate->io_completed, target, 0, 0);
569
570 iostate->io_wanted = 1;
571 msleep(chan: (caddr_t)&iostate->io_wanted, mtx: &iostate->io_mtxp, PRIBIO + 1, wmesg: wait_name, NULL);
572
573 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
574 iostate->io_issued, iostate->io_completed, target, 0, 0);
575 }
576 lck_mtx_unlock(lck: &iostate->io_mtxp);
577}
578
579static void
580cluster_handle_associated_upl(struct clios *iostate, upl_t upl,
581 upl_offset_t upl_offset, upl_size_t size)
582{
583 if (!size) {
584 return;
585 }
586
587 upl_t associated_upl = upl_associated_upl(upl);
588
589 if (!associated_upl) {
590 return;
591 }
592
593#if 0
594 printf("1: %d %d\n", upl_offset, upl_offset + size);
595#endif
596
597 /*
598 * The associated UPL is page aligned to file offsets whereas the
599 * UPL it's attached to has different alignment requirements. The
600 * upl_offset that we have refers to @upl. The code that follows
601 * has to deal with the first and last pages in this transaction
602 * which might straddle pages in the associated UPL. To keep
603 * track of these pages, we use the mark bits: if the mark bit is
604 * set, we know another transaction has completed its part of that
605 * page and so we can unlock that page here.
606 *
607 * The following illustrates what we have to deal with:
608 *
609 * MEM u <------------ 1 PAGE ------------> e
610 * +-------------+----------------------+-----------------
611 * | |######################|#################
612 * +-------------+----------------------+-----------------
613 * FILE | <--- a ---> o <------------ 1 PAGE ------------>
614 *
615 * So here we show a write to offset @o. The data that is to be
616 * written is in a buffer that is not page aligned; it has offset
617 * @a in the page. The upl that carries the data starts in memory
618 * at @u. The associated upl starts in the file at offset @o. A
619 * transaction will always end on a page boundary (like @e above)
620 * except for the very last transaction in the group. We cannot
621 * unlock the page at @o in the associated upl until both the
622 * transaction ending at @e and the following transaction (that
623 * starts at @e) has completed.
624 */
625
626 /*
627 * We record whether or not the two UPLs are aligned as the mark
628 * bit in the first page of @upl.
629 */
630 upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
631 bool is_unaligned = upl_page_get_mark(upl: pl, index: 0);
632
633 if (is_unaligned) {
634 upl_page_info_t *assoc_pl = UPL_GET_INTERNAL_PAGE_LIST(associated_upl);
635
636 upl_offset_t upl_end = upl_offset + size;
637 assert(upl_end >= PAGE_SIZE);
638
639 upl_size_t assoc_upl_size = upl_get_size(upl: associated_upl);
640
641 /*
642 * In the very first transaction in the group, upl_offset will
643 * not be page aligned, but after that it will be and in that
644 * case we want the preceding page in the associated UPL hence
645 * the minus one.
646 */
647 assert(upl_offset);
648 if (upl_offset) {
649 upl_offset = trunc_page_32(upl_offset - 1);
650 }
651
652 lck_mtx_lock_spin(lck: &iostate->io_mtxp);
653
654 // Look at the first page...
655 if (upl_offset
656 && !upl_page_get_mark(upl: assoc_pl, index: upl_offset >> PAGE_SHIFT)) {
657 /*
658 * The first page isn't marked so let another transaction
659 * completion handle it.
660 */
661 upl_page_set_mark(upl: assoc_pl, index: upl_offset >> PAGE_SHIFT, true);
662 upl_offset += PAGE_SIZE;
663 }
664
665 // And now the last page...
666
667 /*
668 * This needs to be > rather than >= because if it's equal, it
669 * means there's another transaction that is sharing the last
670 * page.
671 */
672 if (upl_end > assoc_upl_size) {
673 upl_end = assoc_upl_size;
674 } else {
675 upl_end = trunc_page_32(upl_end);
676 const int last_pg = (upl_end >> PAGE_SHIFT) - 1;
677
678 if (!upl_page_get_mark(upl: assoc_pl, index: last_pg)) {
679 /*
680 * The last page isn't marked so mark the page and let another
681 * transaction completion handle it.
682 */
683 upl_page_set_mark(upl: assoc_pl, index: last_pg, true);
684 upl_end -= PAGE_SIZE;
685 }
686 }
687
688 lck_mtx_unlock(lck: &iostate->io_mtxp);
689
690#if 0
691 printf("2: %d %d\n", upl_offset, upl_end);
692#endif
693
694 if (upl_end <= upl_offset) {
695 return;
696 }
697
698 size = upl_end - upl_offset;
699 } else {
700 assert(!(upl_offset & PAGE_MASK));
701 assert(!(size & PAGE_MASK));
702 }
703
704 boolean_t empty;
705
706 /*
707 * We can unlock these pages now and as this is for a
708 * direct/uncached write, we want to dump the pages too.
709 */
710 kern_return_t kr = upl_abort_range(upl_object: associated_upl, offset: upl_offset, size,
711 UPL_ABORT_DUMP_PAGES, empty: &empty);
712
713 assert(!kr);
714
715 if (!kr && empty) {
716 upl_set_associated_upl(upl, NULL);
717 upl_deallocate(upl: associated_upl);
718 }
719}
720
721static int
722cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp)
723{
724 int upl_abort_code = 0;
725 int page_in = 0;
726 int page_out = 0;
727
728 if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE)) {
729 /*
730 * direct write of any flavor, or a direct read that wasn't aligned
731 */
732 ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
733 } else {
734 if (io_flags & B_PAGEIO) {
735 if (io_flags & B_READ) {
736 page_in = 1;
737 } else {
738 page_out = 1;
739 }
740 }
741 if (io_flags & B_CACHE) {
742 /*
743 * leave pages in the cache unchanged on error
744 */
745 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
746 } else if (((io_flags & B_READ) == 0) && ((error != ENXIO) || vnode_isswap(vp))) {
747 /*
748 * transient error on pageout/write path... leave pages unchanged
749 */
750 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
751 } else if (page_in) {
752 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
753 } else {
754 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
755 }
756
757 ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
758 }
759 return upl_abort_code;
760}
761
762
763static int
764cluster_iodone(buf_t bp, void *callback_arg)
765{
766 int b_flags;
767 int error;
768 int total_size;
769 int total_resid;
770 int upl_offset;
771 int zero_offset;
772 int pg_offset = 0;
773 int commit_size = 0;
774 int upl_flags = 0;
775 int transaction_size = 0;
776 upl_t upl;
777 buf_t cbp;
778 buf_t cbp_head;
779 buf_t cbp_next;
780 buf_t real_bp;
781 vnode_t vp;
782 struct clios *iostate;
783 void *verify_ctx;
784 boolean_t transaction_complete = FALSE;
785
786 __IGNORE_WCASTALIGN(cbp_head = (buf_t)(bp->b_trans_head));
787
788 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
789 cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
790
791 if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) {
792 lck_mtx_lock_spin(lck: &cl_transaction_mtxp);
793
794 bp->b_flags |= B_TDONE;
795
796 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
797 /*
798 * all I/O requests that are part of this transaction
799 * have to complete before we can process it
800 */
801 if (!(cbp->b_flags & B_TDONE)) {
802 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
803 cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
804
805 lck_mtx_unlock(lck: &cl_transaction_mtxp);
806
807 return 0;
808 }
809
810 if (cbp->b_trans_next == CLUSTER_IO_WAITING) {
811 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
812 cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
813
814 lck_mtx_unlock(lck: &cl_transaction_mtxp);
815 wakeup(chan: cbp);
816
817 return 0;
818 }
819
820 if (cbp->b_flags & B_EOT) {
821 transaction_complete = TRUE;
822 }
823 }
824 lck_mtx_unlock(lck: &cl_transaction_mtxp);
825
826 if (transaction_complete == FALSE) {
827 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
828 cbp_head, 0, 0, 0, 0);
829 return 0;
830 }
831 }
832 error = 0;
833 total_size = 0;
834 total_resid = 0;
835
836 cbp = cbp_head;
837 vp = cbp->b_vp;
838 upl_offset = cbp->b_uploffset;
839 upl = cbp->b_upl;
840 b_flags = cbp->b_flags;
841 real_bp = cbp->b_real_bp;
842 zero_offset = cbp->b_validend;
843 iostate = (struct clios *)cbp->b_iostate;
844
845 if (real_bp) {
846 real_bp->b_dev = cbp->b_dev;
847 }
848
849 while (cbp) {
850 if ((cbp->b_flags & B_ERROR) && error == 0) {
851 error = cbp->b_error;
852 }
853
854 total_resid += cbp->b_resid;
855 total_size += cbp->b_bcount;
856
857 cbp_next = cbp->b_trans_next;
858
859 if (cbp_next == NULL) {
860 /*
861 * compute the overall size of the transaction
862 * in case we created one that has 'holes' in it
863 * 'total_size' represents the amount of I/O we
864 * did, not the span of the transaction w/r to the UPL
865 */
866 transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;
867 }
868
869 if (cbp != cbp_head) {
870 free_io_buf(cbp);
871 }
872
873 cbp = cbp_next;
874 }
875
876 if (ISSET(b_flags, B_COMMIT_UPL)) {
877 cluster_handle_associated_upl(iostate,
878 upl: cbp_head->b_upl,
879 upl_offset,
880 size: transaction_size);
881 }
882
883 if (error == 0 && total_resid) {
884 error = EIO;
885 }
886
887 if (error == 0) {
888 int (*cliodone_func)(buf_t, void *) = (int (*)(buf_t, void *))(cbp_head->b_cliodone);
889
890 if (cliodone_func != NULL) {
891 cbp_head->b_bcount = transaction_size;
892
893 error = (*cliodone_func)(cbp_head, callback_arg);
894 }
895 }
896 if (zero_offset) {
897 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
898 }
899
900 verify_ctx = cbp_head->b_attr.ba_verify_ctx;
901 cbp_head->b_attr.ba_verify_ctx = NULL;
902 if (verify_ctx) {
903 vnode_verify_flags_t verify_flags = VNODE_VERIFY_CONTEXT_FREE;
904 caddr_t verify_buf = NULL;
905 off_t start_off = cbp_head->b_lblkno * cbp_head->b_lblksize;
906 size_t verify_length = transaction_size;
907 vm_offset_t vaddr;
908
909 if (!error) {
910 verify_flags |= VNODE_VERIFY_WITH_CONTEXT;
911 error = ubc_upl_map_range(upl, upl_offset, round_page(x: transaction_size), VM_PROT_DEFAULT, &vaddr); /* Map it in */
912 if (error) {
913 panic("ubc_upl_map_range returned error %d, upl = %p, upl_offset = %d, size = %d",
914 error, upl, (int)upl_offset, (int)round_page(transaction_size));
915 } else {
916 verify_buf = (caddr_t)vaddr;
917 }
918 }
919
920 error = VNOP_VERIFY(vp, start_off, (uint8_t *)verify_buf, verify_length, 0, &verify_ctx, verify_flags, NULL);
921
922 if (verify_buf) {
923 (void)ubc_upl_unmap_range(upl, upl_offset, round_page(x: transaction_size));
924 verify_buf = NULL;
925 }
926 } else if (cbp_head->b_attr.ba_flags & BA_WILL_VERIFY) {
927 error = EBADMSG;
928 }
929
930 free_io_buf(cbp_head);
931
932 if (iostate) {
933 int need_wakeup = 0;
934
935 /*
936 * someone has issued multiple I/Os asynchrounsly
937 * and is waiting for them to complete (streaming)
938 */
939 lck_mtx_lock_spin(lck: &iostate->io_mtxp);
940
941 if (error && iostate->io_error == 0) {
942 iostate->io_error = error;
943 }
944
945 iostate->io_completed += total_size;
946
947 if (iostate->io_wanted) {
948 /*
949 * someone is waiting for the state of
950 * this io stream to change
951 */
952 iostate->io_wanted = 0;
953 need_wakeup = 1;
954 }
955 lck_mtx_unlock(lck: &iostate->io_mtxp);
956
957 if (need_wakeup) {
958 wakeup(chan: (caddr_t)&iostate->io_wanted);
959 }
960 }
961
962 if (b_flags & B_COMMIT_UPL) {
963 pg_offset = upl_offset & PAGE_MASK;
964 commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
965
966 if (error) {
967 upl_set_iodone_error(upl, error);
968
969 upl_flags = cluster_ioerror(upl, upl_offset: upl_offset - pg_offset, abort_size: commit_size, error, io_flags: b_flags, vp);
970 } else {
971 upl_flags = UPL_COMMIT_FREE_ON_EMPTY;
972
973 if ((b_flags & B_PHYS) && (b_flags & B_READ)) {
974 upl_flags |= UPL_COMMIT_SET_DIRTY;
975 }
976
977 if (b_flags & B_AGE) {
978 upl_flags |= UPL_COMMIT_INACTIVATE;
979 }
980
981 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
982 }
983 }
984 if (real_bp) {
985 if (error) {
986 real_bp->b_flags |= B_ERROR;
987 real_bp->b_error = error;
988 }
989 real_bp->b_resid = total_resid;
990
991 buf_biodone(bp: real_bp);
992 }
993 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
994 upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0);
995
996 return error;
997}
998
999
1000uint32_t
1001cluster_throttle_io_limit(vnode_t vp, uint32_t *limit)
1002{
1003 if (cluster_is_throttled(vp)) {
1004 *limit = calculate_max_throttle_size(vp);
1005 return 1;
1006 }
1007 return 0;
1008}
1009
1010
1011void
1012cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp)
1013{
1014 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
1015 upl_offset, size, bp, 0, 0);
1016
1017 if (bp == NULL || bp->b_datap == 0) {
1018 upl_page_info_t *pl;
1019 addr64_t zero_addr;
1020
1021 pl = ubc_upl_pageinfo(upl);
1022
1023 if (upl_device_page(upl: pl) == TRUE) {
1024 zero_addr = ((addr64_t)upl_phys_page(upl: pl, index: 0) << PAGE_SHIFT) + upl_offset;
1025
1026 bzero_phys_nc(src64: zero_addr, bytes: size);
1027 } else {
1028 while (size) {
1029 int page_offset;
1030 int page_index;
1031 int zero_cnt;
1032
1033 page_index = upl_offset / PAGE_SIZE;
1034 page_offset = upl_offset & PAGE_MASK;
1035
1036 zero_addr = ((addr64_t)upl_phys_page(upl: pl, index: page_index) << PAGE_SHIFT) + page_offset;
1037 zero_cnt = min(PAGE_SIZE - page_offset, b: size);
1038
1039 bzero_phys(phys_address: zero_addr, length: zero_cnt);
1040
1041 size -= zero_cnt;
1042 upl_offset += zero_cnt;
1043 }
1044 }
1045 } else {
1046 bzero(s: (caddr_t)((vm_offset_t)bp->b_datap + upl_offset), n: size);
1047 }
1048
1049 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
1050 upl_offset, size, 0, 0, 0);
1051}
1052
1053
1054static void
1055cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset, size_t verify_block_size)
1056{
1057 /*
1058 * We will assign a verification context to cbp_head.
1059 * This will be passed back to the filesystem when
1060 * verifying (in cluster_iodone).
1061 */
1062 if (verify_block_size) {
1063 off_t start_off = cbp_head->b_lblkno * cbp_head->b_lblksize;
1064 size_t length;
1065 void *verify_ctx = NULL;
1066 int error = 0;
1067 vnode_t vp = buf_vnode(bp: cbp_head);
1068
1069 if (cbp_head == cbp_tail) {
1070 length = cbp_head->b_bcount;
1071 } else {
1072 length = ((cbp_tail->b_lblkno * cbp_tail->b_lblksize) + cbp_tail->b_bcount) - start_off;
1073 }
1074
1075 /*
1076 * zero_offset is non zero for the transaction containing the EOF
1077 * (if the filesize is not page aligned). In that case we might
1078 * have the transaction size not be page/verify block size aligned
1079 */
1080 if ((zero_offset == 0) &&
1081 ((length < verify_block_size) || (length % verify_block_size)) != 0) {
1082 panic("%s length = %zu, verify_block_size = %zu",
1083 __FUNCTION__, length, verify_block_size);
1084 }
1085
1086 error = VNOP_VERIFY(vp, start_off, NULL, length,
1087 &verify_block_size, &verify_ctx, VNODE_VERIFY_CONTEXT_ALLOC, NULL);
1088
1089 cbp_head->b_attr.ba_verify_ctx = verify_ctx;
1090 } else {
1091 cbp_head->b_attr.ba_verify_ctx = NULL;
1092 }
1093
1094 cbp_head->b_validend = zero_offset;
1095 cbp_tail->b_flags |= B_EOT;
1096}
1097
1098static void
1099cluster_wait_IO(buf_t cbp_head, int async)
1100{
1101 buf_t cbp;
1102
1103 if (async) {
1104 /*
1105 * Async callback completion will not normally generate a
1106 * wakeup upon I/O completion. To get woken up, we set
1107 * b_trans_next (which is safe for us to modify) on the last
1108 * buffer to CLUSTER_IO_WAITING so that cluster_iodone knows
1109 * to wake us up when all buffers as part of this transaction
1110 * are completed. This is done under the umbrella of
1111 * cl_transaction_mtxp which is also taken in cluster_iodone.
1112 */
1113 bool done = true;
1114 buf_t last = NULL;
1115
1116 lck_mtx_lock_spin(lck: &cl_transaction_mtxp);
1117
1118 for (cbp = cbp_head; cbp; last = cbp, cbp = cbp->b_trans_next) {
1119 if (!ISSET(cbp->b_flags, B_TDONE)) {
1120 done = false;
1121 }
1122 }
1123
1124 if (!done) {
1125 last->b_trans_next = CLUSTER_IO_WAITING;
1126
1127 DTRACE_IO1(wait__start, buf_t, last);
1128 do {
1129 msleep(chan: last, mtx: &cl_transaction_mtxp, PSPIN | (PRIBIO + 1), wmesg: "cluster_wait_IO", NULL);
1130
1131 /*
1132 * We should only have been woken up if all the
1133 * buffers are completed, but just in case...
1134 */
1135 done = true;
1136 for (cbp = cbp_head; cbp != CLUSTER_IO_WAITING; cbp = cbp->b_trans_next) {
1137 if (!ISSET(cbp->b_flags, B_TDONE)) {
1138 done = false;
1139 break;
1140 }
1141 }
1142 } while (!done);
1143 DTRACE_IO1(wait__done, buf_t, last);
1144
1145 last->b_trans_next = NULL;
1146 }
1147
1148 lck_mtx_unlock(lck: &cl_transaction_mtxp);
1149 } else { // !async
1150 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
1151 buf_biowait(bp: cbp);
1152 }
1153 }
1154}
1155
1156static void
1157cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait)
1158{
1159 buf_t cbp;
1160 int error;
1161 boolean_t isswapout = FALSE;
1162
1163 /*
1164 * cluster_complete_transaction will
1165 * only be called if we've issued a complete chain in synchronous mode
1166 * or, we've already done a cluster_wait_IO on an incomplete chain
1167 */
1168 if (needwait) {
1169 for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
1170 buf_biowait(bp: cbp);
1171 }
1172 }
1173 /*
1174 * we've already waited on all of the I/Os in this transaction,
1175 * so mark all of the buf_t's in this transaction as B_TDONE
1176 * so that cluster_iodone sees the transaction as completed
1177 */
1178 for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
1179 cbp->b_flags |= B_TDONE;
1180 }
1181 cbp = *cbp_head;
1182
1183 if ((flags & (CL_ASYNC | CL_PAGEOUT)) == CL_PAGEOUT && vnode_isswap(vp: cbp->b_vp)) {
1184 isswapout = TRUE;
1185 }
1186
1187 error = cluster_iodone(bp: cbp, callback_arg);
1188
1189 if (!(flags & CL_ASYNC) && error && *retval == 0) {
1190 if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO)) {
1191 *retval = error;
1192 } else if (isswapout == TRUE) {
1193 *retval = error;
1194 }
1195 }
1196 *cbp_head = (buf_t)NULL;
1197}
1198
1199
1200static int
1201cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
1202 int flags, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
1203{
1204 buf_t cbp;
1205 u_int size;
1206 u_int io_size;
1207 int io_flags;
1208 int bmap_flags;
1209 int error = 0;
1210 int retval = 0;
1211 buf_t cbp_head = NULL;
1212 buf_t cbp_tail = NULL;
1213 int trans_count = 0;
1214 int max_trans_count;
1215 u_int pg_count;
1216 int pg_offset;
1217 u_int max_iosize;
1218 u_int max_vectors;
1219 int priv;
1220 int zero_offset = 0;
1221 int async_throttle = 0;
1222 mount_t mp;
1223 vm_offset_t upl_end_offset;
1224 boolean_t need_EOT = FALSE;
1225 size_t verify_block_size = 0;
1226
1227 /*
1228 * we currently don't support buffers larger than a page
1229 */
1230 if (real_bp && non_rounded_size > PAGE_SIZE) {
1231 panic("%s(): Called with real buffer of size %d bytes which "
1232 "is greater than the maximum allowed size of "
1233 "%d bytes (the system PAGE_SIZE).\n",
1234 __FUNCTION__, non_rounded_size, PAGE_SIZE);
1235 }
1236
1237 mp = vp->v_mount;
1238
1239 /*
1240 * we don't want to do any funny rounding of the size for IO requests
1241 * coming through the DIRECT or CONTIGUOUS paths... those pages don't
1242 * belong to us... we can't extend (nor do we need to) the I/O to fill
1243 * out a page
1244 */
1245 if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY | CL_DIRECT_IO))) {
1246 /*
1247 * round the requested size up so that this I/O ends on a
1248 * page boundary in case this is a 'write'... if the filesystem
1249 * has blocks allocated to back the page beyond the EOF, we want to
1250 * make sure to write out the zero's that are sitting beyond the EOF
1251 * so that in case the filesystem doesn't explicitly zero this area
1252 * if a hole is created via a lseek/write beyond the current EOF,
1253 * it will return zeros when it's read back from the disk. If the
1254 * physical allocation doesn't extend for the whole page, we'll
1255 * only write/read from the disk up to the end of this allocation
1256 * via the extent info returned from the VNOP_BLOCKMAP call.
1257 */
1258 pg_offset = upl_offset & PAGE_MASK;
1259
1260 size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
1261 } else {
1262 /*
1263 * anyone advertising a blocksize of 1 byte probably
1264 * can't deal with us rounding up the request size
1265 * AFP is one such filesystem/device
1266 */
1267 size = non_rounded_size;
1268 }
1269 upl_end_offset = upl_offset + size;
1270
1271 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);
1272
1273 /*
1274 * Set the maximum transaction size to the maximum desired number of
1275 * buffers.
1276 */
1277 max_trans_count = 8;
1278 if (flags & CL_DEV_MEMORY) {
1279 max_trans_count = 16;
1280 }
1281
1282 if (flags & CL_READ) {
1283 io_flags = B_READ;
1284 bmap_flags = VNODE_READ;
1285
1286 max_iosize = mp->mnt_maxreadcnt;
1287 max_vectors = mp->mnt_segreadcnt;
1288
1289 if ((flags & CL_PAGEIN) && /* Cluster layer verification will be limited to pagein for now */
1290 !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
1291 (VNOP_VERIFY(vp, f_offset, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL) == 0) &&
1292 verify_block_size) {
1293 if (verify_block_size != PAGE_SIZE) {
1294 verify_block_size = 0;
1295 }
1296 if (real_bp && verify_block_size) {
1297 panic("%s(): Called with real buffer and needs verification ",
1298 __FUNCTION__);
1299 }
1300 }
1301 } else {
1302 io_flags = B_WRITE;
1303 bmap_flags = VNODE_WRITE;
1304
1305 max_iosize = mp->mnt_maxwritecnt;
1306 max_vectors = mp->mnt_segwritecnt;
1307 }
1308 if (verify_block_size) {
1309 bmap_flags |= VNODE_CLUSTER_VERIFY;
1310 }
1311 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);
1312
1313 /*
1314 * make sure the maximum iosize is a
1315 * multiple of the page size
1316 */
1317 max_iosize &= ~PAGE_MASK;
1318
1319 /*
1320 * Ensure the maximum iosize is sensible.
1321 */
1322 if (!max_iosize) {
1323 max_iosize = PAGE_SIZE;
1324 }
1325
1326 if (flags & CL_THROTTLE) {
1327 if (!(flags & CL_PAGEOUT) && cluster_is_throttled(vp)) {
1328 uint32_t max_throttle_size = calculate_max_throttle_size(vp);
1329
1330 if (max_iosize > max_throttle_size) {
1331 max_iosize = max_throttle_size;
1332 }
1333 async_throttle = calculate_max_throttle_cnt(vp);
1334 } else {
1335 if ((flags & CL_DEV_MEMORY)) {
1336 async_throttle = IO_SCALE(vp, VNODE_ASYNC_THROTTLE);
1337 } else {
1338 u_int max_cluster;
1339 u_int max_cluster_size;
1340 u_int scale;
1341
1342 if (vp->v_mount->mnt_minsaturationbytecount) {
1343 max_cluster_size = vp->v_mount->mnt_minsaturationbytecount;
1344
1345 scale = 1;
1346 } else {
1347 max_cluster_size = MAX_CLUSTER_SIZE(vp);
1348
1349 if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
1350 scale = WRITE_THROTTLE_SSD;
1351 } else {
1352 scale = WRITE_THROTTLE;
1353 }
1354 }
1355 if (max_iosize > max_cluster_size) {
1356 max_cluster = max_cluster_size;
1357 } else {
1358 max_cluster = max_iosize;
1359 }
1360
1361 if (size < max_cluster) {
1362 max_cluster = size;
1363 }
1364
1365 if (flags & CL_CLOSE) {
1366 scale += MAX_CLUSTERS;
1367 }
1368
1369 async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), b: ((scale * max_cluster_size) / max_cluster) - 1);
1370 }
1371 }
1372 }
1373 if (flags & CL_AGE) {
1374 io_flags |= B_AGE;
1375 }
1376 if (flags & (CL_PAGEIN | CL_PAGEOUT)) {
1377 io_flags |= B_PAGEIO;
1378 }
1379 if (flags & (CL_IOSTREAMING)) {
1380 io_flags |= B_IOSTREAMING;
1381 }
1382 if (flags & CL_COMMIT) {
1383 io_flags |= B_COMMIT_UPL;
1384 }
1385 if (flags & CL_DIRECT_IO) {
1386 io_flags |= B_PHYS;
1387 }
1388 if (flags & (CL_PRESERVE | CL_KEEPCACHED)) {
1389 io_flags |= B_CACHE;
1390 }
1391 if (flags & CL_PASSIVE) {
1392 io_flags |= B_PASSIVE;
1393 }
1394 if (flags & CL_ENCRYPTED) {
1395 io_flags |= B_ENCRYPTED_IO;
1396 }
1397
1398 if (vp->v_flag & VSYSTEM) {
1399 io_flags |= B_META;
1400 }
1401
1402 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
1403 /*
1404 * then we are going to end up
1405 * with a page that we can't complete (the file size wasn't a multiple
1406 * of PAGE_SIZE and we're trying to read to the end of the file
1407 * so we'll go ahead and zero out the portion of the page we can't
1408 * read in from the file
1409 */
1410 zero_offset = (int)(upl_offset + non_rounded_size);
1411 } else if (!ISSET(flags, CL_READ) && ISSET(flags, CL_DIRECT_IO)) {
1412 assert(ISSET(flags, CL_COMMIT));
1413
1414 // For a direct/uncached write, we need to lock pages...
1415
1416 upl_t cached_upl;
1417
1418 /*
1419 * Create a UPL to lock the pages in the cache whilst the
1420 * write is in progress.
1421 */
1422 ubc_create_upl_kernel(vp, f_offset, non_rounded_size, &cached_upl,
1423 NULL, UPL_SET_LITE, VM_KERN_MEMORY_FILE);
1424
1425 /*
1426 * Attach this UPL to the other UPL so that we can find it
1427 * later.
1428 */
1429 upl_set_associated_upl(upl, associated_upl: cached_upl);
1430
1431 if (upl_offset & PAGE_MASK) {
1432 /*
1433 * The two UPLs are not aligned, so mark the first page in
1434 * @upl so that cluster_handle_associated_upl can handle
1435 * it accordingly.
1436 */
1437 upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1438 upl_page_set_mark(upl: pl, index: 0, true);
1439 }
1440 }
1441
1442 while (size) {
1443 daddr64_t blkno;
1444 daddr64_t lblkno;
1445 size_t io_size_tmp;
1446 u_int io_size_wanted;
1447 uint32_t lblksize;
1448
1449 if (size > max_iosize) {
1450 io_size = max_iosize;
1451 } else {
1452 io_size = size;
1453 }
1454
1455 io_size_wanted = io_size;
1456 io_size_tmp = (size_t)io_size;
1457
1458 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, &io_size_tmp, NULL, bmap_flags, NULL))) {
1459 break;
1460 }
1461
1462 if (io_size_tmp > io_size_wanted) {
1463 io_size = io_size_wanted;
1464 } else {
1465 io_size = (u_int)io_size_tmp;
1466 }
1467
1468 if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
1469 real_bp->b_blkno = blkno;
1470 }
1471
1472 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
1473 (int)f_offset, (int)(blkno >> 32), (int)blkno, io_size, 0);
1474
1475 if (io_size == 0) {
1476 /*
1477 * vnop_blockmap didn't return an error... however, it did
1478 * return an extent size of 0 which means we can't
1479 * make forward progress on this I/O... a hole in the
1480 * file would be returned as a blkno of -1 with a non-zero io_size
1481 * a real extent is returned with a blkno != -1 and a non-zero io_size
1482 */
1483 error = EINVAL;
1484 break;
1485 }
1486 if (!(flags & CL_READ) && blkno == -1) {
1487 off_t e_offset;
1488 int pageout_flags;
1489
1490 if (upl_get_internal_vectorupl(upl)) {
1491 panic("Vector UPLs should not take this code-path");
1492 }
1493 /*
1494 * we're writing into a 'hole'
1495 */
1496 if (flags & CL_PAGEOUT) {
1497 /*
1498 * if we got here via cluster_pageout
1499 * then just error the request and return
1500 * the 'hole' should already have been covered
1501 */
1502 error = EINVAL;
1503 break;
1504 }
1505 /*
1506 * we can get here if the cluster code happens to
1507 * pick up a page that was dirtied via mmap vs
1508 * a 'write' and the page targets a 'hole'...
1509 * i.e. the writes to the cluster were sparse
1510 * and the file was being written for the first time
1511 *
1512 * we can also get here if the filesystem supports
1513 * 'holes' that are less than PAGE_SIZE.... because
1514 * we can't know if the range in the page that covers
1515 * the 'hole' has been dirtied via an mmap or not,
1516 * we have to assume the worst and try to push the
1517 * entire page to storage.
1518 *
1519 * Try paging out the page individually before
1520 * giving up entirely and dumping it (the pageout
1521 * path will insure that the zero extent accounting
1522 * has been taken care of before we get back into cluster_io)
1523 *
1524 * go direct to vnode_pageout so that we don't have to
1525 * unbusy the page from the UPL... we used to do this
1526 * so that we could call ubc_msync, but that results
1527 * in a potential deadlock if someone else races us to acquire
1528 * that page and wins and in addition needs one of the pages
1529 * we're continuing to hold in the UPL
1530 */
1531 pageout_flags = UPL_MSYNC | UPL_VNODE_PAGER | UPL_NESTED_PAGEOUT;
1532
1533 if (!(flags & CL_ASYNC)) {
1534 pageout_flags |= UPL_IOSYNC;
1535 }
1536 if (!(flags & CL_COMMIT)) {
1537 pageout_flags |= UPL_NOCOMMIT;
1538 }
1539
1540 if (cbp_head) {
1541 buf_t prev_cbp;
1542 uint32_t bytes_in_last_page;
1543
1544 /*
1545 * first we have to wait for the the current outstanding I/Os
1546 * to complete... EOT hasn't been set yet on this transaction
1547 * so the pages won't be released
1548 */
1549 cluster_wait_IO(cbp_head, async: (flags & CL_ASYNC));
1550
1551 bytes_in_last_page = cbp_head->b_uploffset & PAGE_MASK;
1552 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
1553 bytes_in_last_page += cbp->b_bcount;
1554 }
1555 bytes_in_last_page &= PAGE_MASK;
1556
1557 while (bytes_in_last_page) {
1558 /*
1559 * we've got a transcation that
1560 * includes the page we're about to push out through vnode_pageout...
1561 * find the bp's in the list which intersect this page and either
1562 * remove them entirely from the transaction (there could be multiple bp's), or
1563 * round it's iosize down to the page boundary (there can only be one)...
1564 *
1565 * find the last bp in the list and act on it
1566 */
1567 for (prev_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next) {
1568 prev_cbp = cbp;
1569 }
1570
1571 if (bytes_in_last_page >= cbp->b_bcount) {
1572 /*
1573 * this buf no longer has any I/O associated with it
1574 */
1575 bytes_in_last_page -= cbp->b_bcount;
1576 cbp->b_bcount = 0;
1577
1578 free_io_buf(cbp);
1579
1580 if (cbp == cbp_head) {
1581 assert(bytes_in_last_page == 0);
1582 /*
1583 * the buf we just freed was the only buf in
1584 * this transaction... so there's no I/O to do
1585 */
1586 cbp_head = NULL;
1587 cbp_tail = NULL;
1588 } else {
1589 /*
1590 * remove the buf we just freed from
1591 * the transaction list
1592 */
1593 prev_cbp->b_trans_next = NULL;
1594 cbp_tail = prev_cbp;
1595 }
1596 } else {
1597 /*
1598 * this is the last bp that has I/O
1599 * intersecting the page of interest
1600 * only some of the I/O is in the intersection
1601 * so clip the size but keep it in the transaction list
1602 */
1603 cbp->b_bcount -= bytes_in_last_page;
1604 cbp_tail = cbp;
1605 bytes_in_last_page = 0;
1606 }
1607 }
1608 if (cbp_head) {
1609 /*
1610 * there was more to the current transaction
1611 * than just the page we are pushing out via vnode_pageout...
1612 * mark it as finished and complete it... we've already
1613 * waited for the I/Os to complete above in the call to cluster_wait_IO
1614 */
1615 cluster_EOT(cbp_head, cbp_tail, zero_offset: 0, verify_block_size: 0);
1616
1617 cluster_complete_transaction(cbp_head: &cbp_head, callback_arg, retval: &retval, flags, needwait: 0);
1618
1619 trans_count = 0;
1620 }
1621 }
1622 if (vnode_pageout(vp, upl, (upl_offset_t)trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) {
1623 error = EINVAL;
1624 }
1625 e_offset = round_page_64(x: f_offset + 1);
1626 io_size = (u_int)(e_offset - f_offset);
1627
1628 f_offset += io_size;
1629 upl_offset += io_size;
1630
1631 if (size >= io_size) {
1632 size -= io_size;
1633 } else {
1634 size = 0;
1635 }
1636 /*
1637 * keep track of how much of the original request
1638 * that we've actually completed... non_rounded_size
1639 * may go negative due to us rounding the request
1640 * to a page size multiple (i.e. size > non_rounded_size)
1641 */
1642 non_rounded_size -= io_size;
1643
1644 if (non_rounded_size <= 0) {
1645 /*
1646 * we've transferred all of the data in the original
1647 * request, but we were unable to complete the tail
1648 * of the last page because the file didn't have
1649 * an allocation to back that portion... this is ok.
1650 */
1651 size = 0;
1652 }
1653 if (error) {
1654 if (size == 0) {
1655 flags &= ~CL_COMMIT;
1656 }
1657 break;
1658 }
1659 continue;
1660 }
1661
1662 lblksize = CLUSTER_IO_BLOCK_SIZE;
1663 lblkno = (daddr64_t)(f_offset / lblksize);
1664
1665 /*
1666 * we have now figured out how much I/O we can do - this is in 'io_size'
1667 * pg_offset is the starting point in the first page for the I/O
1668 * pg_count is the number of full and partial pages that 'io_size' encompasses
1669 */
1670 pg_offset = upl_offset & PAGE_MASK;
1671
1672 if (flags & CL_DEV_MEMORY) {
1673 /*
1674 * treat physical requests as one 'giant' page
1675 */
1676 pg_count = 1;
1677 } else {
1678 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
1679 }
1680
1681 if ((flags & CL_READ) && blkno == -1) {
1682 vm_offset_t commit_offset;
1683 int bytes_to_zero;
1684 int complete_transaction_now = 0;
1685
1686 /*
1687 * if we're reading and blkno == -1, then we've got a
1688 * 'hole' in the file that we need to deal with by zeroing
1689 * out the affected area in the upl
1690 */
1691 if (io_size >= (u_int)non_rounded_size) {
1692 /*
1693 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
1694 * than 'zero_offset' will be non-zero
1695 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
1696 * (indicated by the io_size finishing off the I/O request for this UPL)
1697 * than we're not going to issue an I/O for the
1698 * last page in this upl... we need to zero both the hole and the tail
1699 * of the page beyond the EOF, since the delayed zero-fill won't kick in
1700 */
1701 bytes_to_zero = non_rounded_size;
1702 if (!(flags & CL_NOZERO)) {
1703 bytes_to_zero = (int)((((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset);
1704 }
1705
1706 zero_offset = 0;
1707 } else {
1708 bytes_to_zero = io_size;
1709 }
1710
1711 pg_count = 0;
1712
1713 cluster_zero(upl, upl_offset: (upl_offset_t)upl_offset, size: bytes_to_zero, bp: real_bp);
1714
1715 if (cbp_head) {
1716 int pg_resid;
1717
1718 /*
1719 * if there is a current I/O chain pending
1720 * then the first page of the group we just zero'd
1721 * will be handled by the I/O completion if the zero
1722 * fill started in the middle of the page
1723 */
1724 commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1725
1726 pg_resid = (int)(commit_offset - upl_offset);
1727
1728 if (bytes_to_zero >= pg_resid) {
1729 /*
1730 * the last page of the current I/O
1731 * has been completed...
1732 * compute the number of fully zero'd
1733 * pages that are beyond it
1734 * plus the last page if its partial
1735 * and we have no more I/O to issue...
1736 * otherwise a partial page is left
1737 * to begin the next I/O
1738 */
1739 if ((int)io_size >= non_rounded_size) {
1740 pg_count = (bytes_to_zero - pg_resid + (PAGE_SIZE - 1)) / PAGE_SIZE;
1741 } else {
1742 pg_count = (bytes_to_zero - pg_resid) / PAGE_SIZE;
1743 }
1744
1745 complete_transaction_now = 1;
1746 }
1747 } else {
1748 /*
1749 * no pending I/O to deal with
1750 * so, commit all of the fully zero'd pages
1751 * plus the last page if its partial
1752 * and we have no more I/O to issue...
1753 * otherwise a partial page is left
1754 * to begin the next I/O
1755 */
1756 if ((int)io_size >= non_rounded_size) {
1757 pg_count = (pg_offset + bytes_to_zero + (PAGE_SIZE - 1)) / PAGE_SIZE;
1758 } else {
1759 pg_count = (pg_offset + bytes_to_zero) / PAGE_SIZE;
1760 }
1761
1762 commit_offset = upl_offset & ~PAGE_MASK;
1763 }
1764
1765 // Associated UPL is currently only used in the direct write path
1766 assert(!upl_associated_upl(upl));
1767
1768 if ((flags & CL_COMMIT) && pg_count) {
1769 ubc_upl_commit_range(upl, (upl_offset_t)commit_offset,
1770 pg_count * PAGE_SIZE,
1771 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
1772 }
1773 upl_offset += io_size;
1774 f_offset += io_size;
1775 size -= io_size;
1776
1777 /*
1778 * keep track of how much of the original request
1779 * we've actually completed... non_rounded_size
1780 * may go negative due to us rounding the request
1781 * to a page size multiple (i.e. size > non_rounded_size)
1782 */
1783 non_rounded_size -= io_size;
1784
1785 if (non_rounded_size <= 0) {
1786 /*
1787 * we've transferred all of the data in the original
1788 * request, but we were unable to complete the tail
1789 * of the last page because the file didn't have
1790 * an allocation to back that portion... this is ok.
1791 */
1792 size = 0;
1793 }
1794 if (cbp_head && (complete_transaction_now || size == 0)) {
1795 cluster_wait_IO(cbp_head, async: (flags & CL_ASYNC));
1796
1797 cluster_EOT(cbp_head, cbp_tail, zero_offset: size == 0 ? zero_offset : 0, verify_block_size);
1798
1799 cluster_complete_transaction(cbp_head: &cbp_head, callback_arg, retval: &retval, flags, needwait: 0);
1800
1801 trans_count = 0;
1802 }
1803 continue;
1804 }
1805 if (pg_count > max_vectors) {
1806 if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
1807 io_size = PAGE_SIZE - pg_offset;
1808 pg_count = 1;
1809 } else {
1810 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
1811 pg_count = max_vectors;
1812 }
1813 }
1814 /*
1815 * If the transaction is going to reach the maximum number of
1816 * desired elements, truncate the i/o to the nearest page so
1817 * that the actual i/o is initiated after this buffer is
1818 * created and added to the i/o chain.
1819 *
1820 * I/O directed to physically contiguous memory
1821 * doesn't have a requirement to make sure we 'fill' a page
1822 */
1823 if (!(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count &&
1824 ((upl_offset + io_size) & PAGE_MASK)) {
1825 vm_offset_t aligned_ofs;
1826
1827 aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK;
1828 /*
1829 * If the io_size does not actually finish off even a
1830 * single page we have to keep adding buffers to the
1831 * transaction despite having reached the desired limit.
1832 *
1833 * Eventually we get here with the page being finished
1834 * off (and exceeded) and then we truncate the size of
1835 * this i/o request so that it is page aligned and
1836 * we can finally issue the i/o on the transaction.
1837 */
1838 if (aligned_ofs > upl_offset) {
1839 io_size = (u_int)(aligned_ofs - upl_offset);
1840 pg_count--;
1841 }
1842 }
1843
1844 if (!(mp->mnt_kern_flag & MNTK_VIRTUALDEV)) {
1845 /*
1846 * if we're not targeting a virtual device i.e. a disk image
1847 * it's safe to dip into the reserve pool since real devices
1848 * can complete this I/O request without requiring additional
1849 * bufs from the alloc_io_buf pool
1850 */
1851 priv = 1;
1852 } else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT) && !cbp_head) {
1853 /*
1854 * Throttle the speculative IO
1855 *
1856 * We can only throttle this if it is the first iobuf
1857 * for the transaction. alloc_io_buf implements
1858 * additional restrictions for disk images anyway.
1859 */
1860 priv = 0;
1861 } else {
1862 priv = 1;
1863 }
1864
1865 cbp = alloc_io_buf(vp, priv);
1866
1867 if (flags & CL_PAGEOUT) {
1868 u_int i;
1869
1870 /*
1871 * since blocks are addressed in units of lblksize (CLUSTER_IO_BLOCK_SIZE), scale
1872 * the iteration to cover (PAGE_SIZE * pg_count) bytes worth of blocks.
1873 */
1874 for (i = 0; i < (PAGE_SIZE * pg_count) / lblksize; i++) {
1875 if (buf_invalblkno(vp, lblkno: lblkno + i, flags: 0) == EBUSY) {
1876 panic("BUSY bp found in cluster_io");
1877 }
1878 }
1879 }
1880 if (flags & CL_ASYNC) {
1881 if (buf_setcallback(bp: cbp, callback: (void *)cluster_iodone, transaction: callback_arg)) {
1882 panic("buf_setcallback failed");
1883 }
1884 }
1885 cbp->b_cliodone = (void *)callback;
1886 cbp->b_flags |= io_flags;
1887 if (flags & CL_NOCACHE) {
1888 cbp->b_attr.ba_flags |= BA_NOCACHE;
1889 }
1890 if (verify_block_size) {
1891 cbp->b_attr.ba_flags |= BA_WILL_VERIFY;
1892 }
1893
1894 cbp->b_lblkno = lblkno;
1895 cbp->b_lblksize = lblksize;
1896 cbp->b_blkno = blkno;
1897 cbp->b_bcount = io_size;
1898
1899 if (buf_setupl(bp: cbp, upl, offset: (uint32_t)upl_offset)) {
1900 panic("buf_setupl failed");
1901 }
1902#if CONFIG_IOSCHED
1903 upl_set_blkno(upl, upl_offset, size: io_size, blkno);
1904#endif
1905 cbp->b_trans_next = (buf_t)NULL;
1906
1907 if ((cbp->b_iostate = (void *)iostate)) {
1908 /*
1909 * caller wants to track the state of this
1910 * io... bump the amount issued against this stream
1911 */
1912 iostate->io_issued += io_size;
1913 }
1914
1915 if (flags & CL_READ) {
1916 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
1917 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
1918 } else {
1919 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
1920 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
1921 }
1922
1923 if (cbp_head) {
1924 cbp_tail->b_trans_next = cbp;
1925 cbp_tail = cbp;
1926 } else {
1927 cbp_head = cbp;
1928 cbp_tail = cbp;
1929
1930 if ((cbp_head->b_real_bp = real_bp)) {
1931 real_bp = (buf_t)NULL;
1932 }
1933 }
1934 *(buf_t *)(&cbp->b_trans_head) = cbp_head;
1935
1936 trans_count++;
1937
1938 upl_offset += io_size;
1939 f_offset += io_size;
1940 size -= io_size;
1941 /*
1942 * keep track of how much of the original request
1943 * we've actually completed... non_rounded_size
1944 * may go negative due to us rounding the request
1945 * to a page size multiple (i.e. size > non_rounded_size)
1946 */
1947 non_rounded_size -= io_size;
1948
1949 if (non_rounded_size <= 0) {
1950 /*
1951 * we've transferred all of the data in the original
1952 * request, but we were unable to complete the tail
1953 * of the last page because the file didn't have
1954 * an allocation to back that portion... this is ok.
1955 */
1956 size = 0;
1957 }
1958 if (size == 0) {
1959 /*
1960 * we have no more I/O to issue, so go
1961 * finish the final transaction
1962 */
1963 need_EOT = TRUE;
1964 } else if (((flags & CL_DEV_MEMORY) || (upl_offset & PAGE_MASK) == 0) &&
1965 ((flags & CL_ASYNC) || trans_count > max_trans_count)) {
1966 /*
1967 * I/O directed to physically contiguous memory...
1968 * which doesn't have a requirement to make sure we 'fill' a page
1969 * or...
1970 * the current I/O we've prepared fully
1971 * completes the last page in this request
1972 * and ...
1973 * it's either an ASYNC request or
1974 * we've already accumulated more than max_trans_count I/Os into
1975 * this transaction so mark it as complete so that
1976 * it can finish asynchronously or via the cluster_complete_transaction
1977 * below if the request is synchronous
1978 */
1979 need_EOT = TRUE;
1980 }
1981 if (need_EOT == TRUE) {
1982 cluster_EOT(cbp_head, cbp_tail, zero_offset: size == 0 ? zero_offset : 0, verify_block_size);
1983 }
1984
1985 if (flags & CL_THROTTLE) {
1986 (void)vnode_waitforwrites(vp, output_target: async_throttle, slpflag: 0, slptimeout: 0, msg: "cluster_io");
1987 }
1988
1989 if (!(io_flags & B_READ)) {
1990 vnode_startwrite(vp);
1991 }
1992
1993 if (flags & CL_RAW_ENCRYPTED) {
1994 /*
1995 * User requested raw encrypted bytes.
1996 * Twiddle the bit in the ba_flags for the buffer
1997 */
1998 cbp->b_attr.ba_flags |= BA_RAW_ENCRYPTED_IO;
1999 }
2000
2001 (void) VNOP_STRATEGY(bp: cbp);
2002
2003 if (need_EOT == TRUE) {
2004 if (!(flags & CL_ASYNC)) {
2005 cluster_complete_transaction(cbp_head: &cbp_head, callback_arg, retval: &retval, flags, needwait: 1);
2006 }
2007
2008 need_EOT = FALSE;
2009 trans_count = 0;
2010 cbp_head = NULL;
2011 }
2012 }
2013 if (error) {
2014 int abort_size;
2015
2016 io_size = 0;
2017
2018 if (cbp_head) {
2019 /*
2020 * Wait until all of the outstanding I/O
2021 * for this partial transaction has completed
2022 */
2023 cluster_wait_IO(cbp_head, async: (flags & CL_ASYNC));
2024
2025 /*
2026 * Rewind the upl offset to the beginning of the
2027 * transaction.
2028 */
2029 upl_offset = cbp_head->b_uploffset;
2030 }
2031
2032 if (ISSET(flags, CL_COMMIT)) {
2033 cluster_handle_associated_upl(iostate, upl,
2034 upl_offset: (upl_offset_t)upl_offset,
2035 size: (upl_size_t)(upl_end_offset - upl_offset));
2036 }
2037
2038 // Free all the IO buffers in this transaction
2039 for (cbp = cbp_head; cbp;) {
2040 buf_t cbp_next;
2041
2042 size += cbp->b_bcount;
2043 io_size += cbp->b_bcount;
2044
2045 cbp_next = cbp->b_trans_next;
2046 free_io_buf(cbp);
2047 cbp = cbp_next;
2048 }
2049
2050 if (iostate) {
2051 int need_wakeup = 0;
2052
2053 /*
2054 * update the error condition for this stream...
2055 * since we never really issued the I/O,
2056 * just go ahead and adjust io_issued back down
2057 */
2058 lck_mtx_lock_spin(lck: &iostate->io_mtxp);
2059
2060 if (iostate->io_error == 0) {
2061 iostate->io_error = error;
2062 }
2063 iostate->io_issued -= io_size;
2064
2065 if (iostate->io_wanted) {
2066 /*
2067 * someone is waiting for the state of
2068 * this io stream to change
2069 */
2070 iostate->io_wanted = 0;
2071 need_wakeup = 1;
2072 }
2073 lck_mtx_unlock(lck: &iostate->io_mtxp);
2074
2075 if (need_wakeup) {
2076 wakeup(chan: (caddr_t)&iostate->io_wanted);
2077 }
2078 }
2079
2080 if (flags & CL_COMMIT) {
2081 int upl_flags;
2082
2083 pg_offset = upl_offset & PAGE_MASK;
2084 abort_size = (int)((upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK);
2085
2086 upl_flags = cluster_ioerror(upl, upl_offset: (int)(upl_offset - pg_offset),
2087 abort_size, error, io_flags, vp);
2088
2089 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
2090 upl, upl_offset - pg_offset, abort_size, (error << 24) | upl_flags, 0);
2091 }
2092 if (retval == 0) {
2093 retval = error;
2094 }
2095 } else if (cbp_head) {
2096 panic("%s(): cbp_head is not NULL.", __FUNCTION__);
2097 }
2098
2099 if (real_bp) {
2100 /*
2101 * can get here if we either encountered an error
2102 * or we completely zero-filled the request and
2103 * no I/O was issued
2104 */
2105 if (error) {
2106 real_bp->b_flags |= B_ERROR;
2107 real_bp->b_error = error;
2108 }
2109 buf_biodone(bp: real_bp);
2110 }
2111 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END, (int)f_offset, size, upl_offset, retval, 0);
2112
2113 return retval;
2114}
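
/*
 * Illustrative sketch of the page accounting used in cluster_io above:
 * 'upl_offset' is reduced to the starting offset within its page, and the
 * transfer size is converted into the number of full and partial pages it
 * touches (physical CL_DEV_MEMORY requests are treated as one 'giant' page
 * and bypass this math).  The helper name is hypothetical.
 */
static __unused int
cluster_io_example_pg_count(vm_offset_t upl_offset, u_int io_size)
{
	vm_offset_t pg_offset;

	/* offset of the first byte of the transfer within its page */
	pg_offset = upl_offset & PAGE_MASK;

	/* number of full and partial pages that 'io_size' encompasses */
	return (int)((io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE);
}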
2115
2116#define reset_vector_run_state() \
2117 issueVectorUPL = vector_upl_offset = vector_upl_index = vector_upl_iosize = vector_upl_size = 0;
2118
2119static int
2120vector_cluster_io(vnode_t vp, upl_t vector_upl, vm_offset_t vector_upl_offset, off_t v_upl_uio_offset, int vector_upl_iosize,
2121 int io_flag, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
2122{
2123 vector_upl_set_pagelist(vector_upl);
2124
2125 if (io_flag & CL_READ) {
2126 if (vector_upl_offset == 0 && ((vector_upl_iosize & PAGE_MASK) == 0)) {
2127 io_flag &= ~CL_PRESERVE; /* don't zero fill */
2128 } else {
2129 io_flag |= CL_PRESERVE; /* zero fill */
2130 }
2131 }
2132 return cluster_io(vp, upl: vector_upl, upl_offset: vector_upl_offset, f_offset: v_upl_uio_offset, non_rounded_size: vector_upl_iosize, flags: io_flag, real_bp, iostate, callback, callback_arg);
2133}
2134
2135static int
2136cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
2137{
2138 int pages_in_prefetch;
2139
2140 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
2141 (int)f_offset, size, (int)filesize, 0, 0);
2142
2143 if (f_offset >= filesize) {
2144 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
2145 (int)f_offset, 0, 0, 0, 0);
2146 return 0;
2147 }
2148 if ((off_t)size > (filesize - f_offset)) {
2149 size = (u_int)(filesize - f_offset);
2150 }
2151 pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
2152
2153 advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag);
2154
2155 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
2156 (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
2157
2158 return pages_in_prefetch;
2159}
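
/*
 * Illustrative sketch of the sizing done in cluster_read_prefetch above:
 * the requested prefetch is clipped at end-of-file and then expressed as a
 * whole number of pages.  The helper name is hypothetical.
 */
static __unused int
cluster_read_prefetch_example_pages(off_t f_offset, u_int size, off_t filesize)
{
	if (f_offset >= filesize) {
		/* nothing to prefetch beyond EOF */
		return 0;
	}
	if ((off_t)size > (filesize - f_offset)) {
		/* clip the request at EOF */
		size = (u_int)(filesize - f_offset);
	}
	/* round up to a whole number of pages */
	return (int)((size + (PAGE_SIZE - 1)) / PAGE_SIZE);
}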
2160
2161
2162
2163static void
2164cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap, int (*callback)(buf_t, void *), void *callback_arg,
2165 int bflag)
2166{
2167 daddr64_t r_addr;
2168 off_t f_offset;
2169 int size_of_prefetch;
2170 u_int max_prefetch;
2171
2172
2173 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
2174 (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);
2175
2176 if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
2177 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2178 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
2179 return;
2180 }
2181 if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) {
2182 rap->cl_ralen = 0;
2183 rap->cl_maxra = 0;
2184
2185 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2186 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
2187
2188 return;
2189 }
2190
2191 max_prefetch = cluster_max_prefetch(vp,
2192 max_io_size: cluster_max_io_size(mp: vp->v_mount, CL_READ), prefetch_limit: speculative_prefetch_max);
2193
2194 if (max_prefetch <= PAGE_SIZE) {
2195 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2196 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0);
2197 return;
2198 }
2199 if (extent->e_addr < rap->cl_maxra && rap->cl_ralen >= 4) {
2200 if ((rap->cl_maxra - extent->e_addr) > (rap->cl_ralen / 4)) {
2201 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2202 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
2203 return;
2204 }
2205 }
2206 r_addr = MAX(extent->e_addr, rap->cl_maxra) + 1;
2207 f_offset = (off_t)(r_addr * PAGE_SIZE_64);
2208
2209 size_of_prefetch = 0;
2210
2211 ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
2212
2213 if (size_of_prefetch) {
2214 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2215 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
2216 return;
2217 }
2218 if (f_offset < filesize) {
2219 daddr64_t read_size;
2220
2221 rap->cl_ralen = rap->cl_ralen ? min(a: max_prefetch / PAGE_SIZE, b: rap->cl_ralen << 1) : 1;
2222
2223 read_size = (extent->e_addr + 1) - extent->b_addr;
2224
2225 if (read_size > rap->cl_ralen) {
2226 if (read_size > max_prefetch / PAGE_SIZE) {
2227 rap->cl_ralen = max_prefetch / PAGE_SIZE;
2228 } else {
2229 rap->cl_ralen = (int)read_size;
2230 }
2231 }
2232 size_of_prefetch = cluster_read_prefetch(vp, f_offset, size: rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag);
2233
2234 if (size_of_prefetch) {
2235 rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
2236 }
2237 }
2238 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2239 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
2240}
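
/*
 * Illustrative sketch of the read-ahead window growth used in
 * cluster_read_ahead above: the window starts at a single page on the
 * first sequential hit and doubles on each subsequent one, capped at the
 * prefetch maximum.  The helper name and parameters are hypothetical.
 */
static __unused int
cluster_read_ahead_example_window(int cur_ralen, u_int max_prefetch)
{
	int max_pages = (int)(max_prefetch / PAGE_SIZE);

	if (cur_ralen == 0) {
		/* first sequential read detected... start with one page */
		return 1;
	}
	if ((cur_ralen << 1) > max_pages) {
		/* clamp the doubled window to the prefetch ceiling */
		return max_pages;
	}
	return cur_ralen << 1;
}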
2241
2242
2243int
2244cluster_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2245 int size, off_t filesize, int flags)
2246{
2247 return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
2248}
2249
2250
2251int
2252cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2253 int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
2254{
2255 int io_size;
2256 int rounded_size;
2257 off_t max_size;
2258 int local_flags;
2259
2260 local_flags = CL_PAGEOUT | CL_THROTTLE;
2261
2262 if ((flags & UPL_IOSYNC) == 0) {
2263 local_flags |= CL_ASYNC;
2264 }
2265 if ((flags & UPL_NOCOMMIT) == 0) {
2266 local_flags |= CL_COMMIT;
2267 }
2268 if ((flags & UPL_KEEPCACHED)) {
2269 local_flags |= CL_KEEPCACHED;
2270 }
2271 if (flags & UPL_PAGING_ENCRYPTED) {
2272 local_flags |= CL_ENCRYPTED;
2273 }
2274
2275
2276 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
2277 (int)f_offset, size, (int)filesize, local_flags, 0);
2278
2279 /*
2280 * If they didn't specify any I/O, then we are done...
2281 * we can't issue an abort because we don't know how
2282 * big the upl really is
2283 */
2284 if (size <= 0) {
2285 return EINVAL;
2286 }
2287
2288 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
2289 if (local_flags & CL_COMMIT) {
2290 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
2291 }
2292 return EROFS;
2293 }
2294 /*
2295 * can't page-out from a negative offset
2296 * or if we're starting beyond the EOF
2297 * or if the file offset isn't page aligned
2298 * or the size requested isn't a multiple of PAGE_SIZE
2299 */
2300 if (f_offset < 0 || f_offset >= filesize ||
2301 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
2302 if (local_flags & CL_COMMIT) {
2303 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
2304 }
2305 return EINVAL;
2306 }
2307 max_size = filesize - f_offset;
2308
2309 if (size < max_size) {
2310 io_size = size;
2311 } else {
2312 io_size = (int)max_size;
2313 }
2314
2315 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2316
2317 if (size > rounded_size) {
2318 if (local_flags & CL_COMMIT) {
2319 ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
2320 UPL_ABORT_FREE_ON_EMPTY);
2321 }
2322 }
2323 return cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size: io_size,
2324 flags: local_flags, real_bp: (buf_t)NULL, iostate: (struct clios *)NULL, callback, callback_arg);
2325}
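
/*
 * Illustrative sketch of the EOF clipping performed in cluster_pageout_ext
 * above: only the portion of the request that lies within the file is
 * issued, that portion is rounded up to a page boundary, and any pages of
 * the upl beyond the rounded size carry no file data and can be aborted.
 * The helper name is hypothetical.
 */
static __unused int
cluster_pageout_example_size(int size, off_t f_offset, off_t filesize, int *abort_bytes)
{
	int io_size;
	int rounded_size;
	off_t max_size = filesize - f_offset;

	io_size = (size < max_size) ? size : (int)max_size;

	/* round the I/O up to a whole number of pages */
	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	/* pages past 'rounded_size' hold nothing the file needs */
	*abort_bytes = (size > rounded_size) ? (size - rounded_size) : 0;

	return io_size;
}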
2326
2327
2328int
2329cluster_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2330 int size, off_t filesize, int flags)
2331{
2332 return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
2333}
2334
2335
2336int
2337cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2338 int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
2339{
2340 u_int io_size;
2341 int rounded_size;
2342 off_t max_size;
2343 int retval;
2344 int local_flags = 0;
2345
2346 if (upl == NULL || size < 0) {
2347 panic("cluster_pagein: NULL upl passed in");
2348 }
2349
2350 if ((flags & UPL_IOSYNC) == 0) {
2351 local_flags |= CL_ASYNC;
2352 }
2353 if ((flags & UPL_NOCOMMIT) == 0) {
2354 local_flags |= CL_COMMIT;
2355 }
2356 if (flags & UPL_IOSTREAMING) {
2357 local_flags |= CL_IOSTREAMING;
2358 }
2359 if (flags & UPL_PAGING_ENCRYPTED) {
2360 local_flags |= CL_ENCRYPTED;
2361 }
2362
2363
2364 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
2365 (int)f_offset, size, (int)filesize, local_flags, 0);
2366
2367 /*
2368 * can't page-in from a negative offset
2369 * or if we're starting beyond the EOF
2370 * or if the file offset isn't page aligned
2371 * or the size requested isn't a multiple of PAGE_SIZE
2372 */
2373 if (f_offset < 0 || f_offset >= filesize ||
2374 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
2375 if (local_flags & CL_COMMIT) {
2376 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
2377 }
2378
2379 if (f_offset >= filesize) {
2380 ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_CLUSTER, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_CL_PGIN_PAST_EOF), arg: 0 /* arg */);
2381 }
2382
2383 return EINVAL;
2384 }
2385 max_size = filesize - f_offset;
2386
2387 if (size < max_size) {
2388 io_size = size;
2389 } else {
2390 io_size = (int)max_size;
2391 }
2392
2393 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2394
2395 if (size > rounded_size && (local_flags & CL_COMMIT)) {
2396 ubc_upl_abort_range(upl, upl_offset + rounded_size,
2397 size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
2398 }
2399
2400 retval = cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size: io_size,
2401 flags: local_flags | CL_READ | CL_PAGEIN, real_bp: (buf_t)NULL, iostate: (struct clios *)NULL, callback, callback_arg);
2402
2403 return retval;
2404}
2405
2406
2407int
2408cluster_bp(buf_t bp)
2409{
2410 return cluster_bp_ext(bp, NULL, NULL);
2411}
2412
2413
2414int
2415cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), void *callback_arg)
2416{
2417 off_t f_offset;
2418 int flags;
2419
2420 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
2421 bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
2422
2423 if (bp->b_flags & B_READ) {
2424 flags = CL_ASYNC | CL_READ;
2425 } else {
2426 flags = CL_ASYNC;
2427 }
2428 if (bp->b_flags & B_PASSIVE) {
2429 flags |= CL_PASSIVE;
2430 }
2431
2432 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
2433
2434 return cluster_io(vp: bp->b_vp, upl: bp->b_upl, upl_offset: 0, f_offset, non_rounded_size: bp->b_bcount, flags, real_bp: bp, iostate: (struct clios *)NULL, callback, callback_arg);
2435}
2436
2437
2438
2439int
2440cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
2441{
2442 return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL);
2443}
2444
2445
2446int
2447cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff,
2448 int xflags, int (*callback)(buf_t, void *), void *callback_arg)
2449{
2450 user_ssize_t cur_resid;
2451 int retval = 0;
2452 int flags;
2453 int zflags;
2454 int bflag;
2455 int write_type = IO_COPY;
2456 u_int32_t write_length;
2457
2458 flags = xflags;
2459
2460 if (flags & IO_PASSIVE) {
2461 bflag = CL_PASSIVE;
2462 } else {
2463 bflag = 0;
2464 }
2465
2466 if (vp->v_flag & VNOCACHE_DATA) {
2467 flags |= IO_NOCACHE;
2468 bflag |= CL_NOCACHE;
2469 }
2470 if (uio == NULL) {
2471 /*
2472 * no user data...
2473 * this call is being made to zero-fill some range in the file
2474 */
2475 retval = cluster_write_copy(vp, NULL, io_req_size: (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg);
2476
2477 return retval;
2478 }
2479 /*
2480 * do a write through the cache if one of the following is true:
2481 * NOCACHE is not set, or NODIRECT is set
2482 * the uio request doesn't target USERSPACE
2483 * otherwise, find out whether we want the direct or contig variant for
2484 * the first vector in the uio request
2485 */
2486 if (((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
2487 retval = cluster_io_type(uio, io_type: &write_type, io_length: &write_length, MIN_DIRECT_WRITE_SIZE);
2488 }
2489
2490 if ((flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT) {
2491 /*
2492 * must go through the cached variant in this case
2493 */
2494 write_type = IO_COPY;
2495 }
2496
2497 while ((cur_resid = uio_resid(a_uio: uio)) && uio->uio_offset < newEOF && retval == 0) {
2498 switch (write_type) {
2499 case IO_COPY:
2500 /*
2501 * make sure the uio_resid isn't too big...
2502 * internally, we want to handle all of the I/O in
2503 * chunk sizes that fit in a 32 bit int
2504 */
2505 if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
2506 /*
2507 * we're going to have to call cluster_write_copy
2508 * more than once...
2509 *
2510 * only want the last call to cluster_write_copy to
2511 * have the IO_TAILZEROFILL flag set and only the
2512 * first call should have IO_HEADZEROFILL
2513 */
2514 zflags = flags & ~IO_TAILZEROFILL;
2515 flags &= ~IO_HEADZEROFILL;
2516
2517 write_length = MAX_IO_REQUEST_SIZE;
2518 } else {
2519 /*
2520 * last call to cluster_write_copy
2521 */
2522 zflags = flags;
2523
2524 write_length = (u_int32_t)cur_resid;
2525 }
2526 retval = cluster_write_copy(vp, uio, io_req_size: write_length, oldEOF, newEOF, headOff, tailOff, flags: zflags, callback, callback_arg);
2527 break;
2528
2529 case IO_CONTIG:
2530 zflags = flags & ~(IO_TAILZEROFILL | IO_HEADZEROFILL);
2531
2532 if (flags & IO_HEADZEROFILL) {
2533 /*
2534 * only do this once per request
2535 */
2536 flags &= ~IO_HEADZEROFILL;
2537
2538 retval = cluster_write_copy(vp, uio: (struct uio *)0, io_req_size: (u_int32_t)0, oldEOF: (off_t)0, newEOF: uio->uio_offset,
2539 headOff, tailOff: (off_t)0, flags: zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
2540 if (retval) {
2541 break;
2542 }
2543 }
2544 retval = cluster_write_contig(vp, uio, newEOF, write_type: &write_type, write_length: &write_length, callback, callback_arg, bflag);
2545
2546 if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(a_uio: uio) == 0) {
2547 /*
2548 * we're done with the data from the user specified buffer(s)
2549 * and we've been requested to zero fill at the tail
2550 * treat this as an IO_HEADZEROFILL which doesn't require a uio
2551 * by rearranging the args and passing in IO_HEADZEROFILL
2552 */
2553
2554 /*
2555 * Update the oldEOF to reflect the current EOF. If the UPL page
2556 * to zero-fill is not valid (when F_NOCACHE is set),
2557 * cluster_write_copy() will perform RMW on the UPL page when
2558 * the oldEOF is not aligned on a page boundary due to an
2559 * unaligned write.
2560 */
2561 if (uio->uio_offset > oldEOF) {
2562 oldEOF = uio->uio_offset;
2563 }
2564 retval = cluster_write_copy(vp, uio: (struct uio *)0, io_req_size: (u_int32_t)0, oldEOF: (off_t)oldEOF, newEOF: tailOff, headOff: uio->uio_offset,
2565 tailOff: (off_t)0, flags: zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
2566 }
2567 break;
2568
2569 case IO_DIRECT:
2570 /*
2571 * cluster_write_direct is never called with IO_TAILZEROFILL || IO_HEADZEROFILL
2572 */
2573 retval = cluster_write_direct(vp, uio, oldEOF, newEOF, write_type: &write_type, write_length: &write_length, flags, callback, callback_arg);
2574 break;
2575
2576 case IO_UNKNOWN:
2577 retval = cluster_io_type(uio, io_type: &write_type, io_length: &write_length, MIN_DIRECT_WRITE_SIZE);
2578 break;
2579 }
2580 /*
2581 * in case we end up calling cluster_write_copy (from cluster_write_direct)
2582 * multiple times to service a multi-vector request that is not aligned properly
2583 * we need to update the oldEOF so that we
2584 * don't zero-fill the head of a page if we've successfully written
2585 * data to that area... 'cluster_write_copy' will zero-fill the head of a
2586 * page that is beyond the oldEOF if the write is unaligned... we only
2587 * want that to happen for the very first page of the cluster_write,
2588 * NOT the first page of each vector making up a multi-vector write.
2589 */
2590 if (uio->uio_offset > oldEOF) {
2591 oldEOF = uio->uio_offset;
2592 }
2593 }
2594 return retval;
2595}
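
/*
 * Illustrative sketch of the chunking policy used in cluster_write_ext
 * above for the IO_COPY case: a large cached write is handed to
 * cluster_write_copy in chunks no larger than MAX_IO_REQUEST_SIZE, with
 * IO_TAILZEROFILL honored only on the final chunk (and IO_HEADZEROFILL
 * stripped after the first).  The helper name is hypothetical.
 */
static __unused u_int32_t
cluster_write_example_chunk(user_ssize_t cur_resid, int flags, int *zflags)
{
	if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
		/* more calls to follow... keep the tail zero-fill for the last one */
		*zflags = flags & ~IO_TAILZEROFILL;
		return (u_int32_t)MAX_IO_REQUEST_SIZE;
	}
	/* final chunk... all remaining flags apply */
	*zflags = flags;
	return (u_int32_t)cur_resid;
}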
2596
2597
2598static int
2599cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length,
2600 int flags, int (*callback)(buf_t, void *), void *callback_arg)
2601{
2602 upl_t upl = NULL;
2603 upl_page_info_t *pl;
2604 vm_offset_t upl_offset;
2605 vm_offset_t vector_upl_offset = 0;
2606 u_int32_t io_req_size;
2607 u_int32_t offset_in_file;
2608 u_int32_t offset_in_iovbase;
2609 u_int32_t io_size;
2610 int io_flag = 0;
2611 upl_size_t upl_size = 0, vector_upl_size = 0;
2612 vm_size_t upl_needed_size;
2613 mach_msg_type_number_t pages_in_pl = 0;
2614 upl_control_flags_t upl_flags;
2615 kern_return_t kret = KERN_SUCCESS;
2616 mach_msg_type_number_t i = 0;
2617 int force_data_sync;
2618 int retval = 0;
2619 int first_IO = 1;
2620 struct clios iostate;
2621 user_addr_t iov_base;
2622 u_int32_t mem_alignment_mask;
2623 u_int32_t devblocksize;
2624 u_int32_t max_io_size;
2625 u_int32_t max_upl_size;
2626 u_int32_t max_vector_size;
2627 u_int32_t bytes_outstanding_limit;
2628 boolean_t io_throttled = FALSE;
2629
2630 u_int32_t vector_upl_iosize = 0;
2631 int issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
2632 off_t v_upl_uio_offset = 0;
2633 int vector_upl_index = 0;
2634 upl_t vector_upl = NULL;
2635
2636
2637 /*
2638 * When we enter this routine, we know
2639 * -- the resid will not exceed iov_len
2640 */
2641 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
2642 (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
2643
2644 assert(vm_map_page_shift(current_map()) >= PAGE_SHIFT);
2645
2646 max_upl_size = cluster_max_io_size(mp: vp->v_mount, CL_WRITE);
2647
2648 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO;
2649
2650 if (flags & IO_PASSIVE) {
2651 io_flag |= CL_PASSIVE;
2652 }
2653
2654 if (flags & IO_NOCACHE) {
2655 io_flag |= CL_NOCACHE;
2656 }
2657
2658 if (flags & IO_SKIP_ENCRYPTION) {
2659 io_flag |= CL_ENCRYPTED;
2660 }
2661
2662 iostate.io_completed = 0;
2663 iostate.io_issued = 0;
2664 iostate.io_error = 0;
2665 iostate.io_wanted = 0;
2666
2667 lck_mtx_init(lck: &iostate.io_mtxp, grp: &cl_mtx_grp, LCK_ATTR_NULL);
2668
2669 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
2670 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
2671
2672 if (devblocksize == 1) {
2673 /*
2674 * the AFP client advertises a devblocksize of 1;
2675 * however, its BLOCKMAP routine maps to physical
2676 * blocks that are PAGE_SIZE in size...
2677 * therefore we can't ask for I/Os that aren't page aligned
2678 * or aren't multiples of PAGE_SIZE in size...
2679 * by setting devblocksize to PAGE_SIZE, we reinstate
2680 * the old behavior we had before the mem_alignment_mask
2681 * changes went in
2682 */
2683 devblocksize = PAGE_SIZE;
2684 }
2685
2686next_dwrite:
2687 io_req_size = *write_length;
2688 iov_base = uio_curriovbase(a_uio: uio);
2689
2690 offset_in_file = (u_int32_t)uio->uio_offset & PAGE_MASK;
2691 offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
2692
2693 if (offset_in_file || offset_in_iovbase) {
2694 /*
2695 * one of the 2 important offsets is misaligned
2696 * so fire an I/O through the cache for this entire vector
2697 */
2698 goto wait_for_dwrites;
2699 }
2700 if (iov_base & (devblocksize - 1)) {
2701 /*
2702 * the offset in memory must be on a device block boundary
2703 * so that we can guarantee that we can generate an
2704 * I/O that ends on a page boundary in cluster_io
2705 */
2706 goto wait_for_dwrites;
2707 }
2708
2709 task_update_logical_writes(task: current_task(), io_size: (io_req_size & ~PAGE_MASK), TASK_WRITE_IMMEDIATE, vp);
2710 while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) {
2711 int throttle_type;
2712
2713 if ((throttle_type = cluster_is_throttled(vp))) {
2714 uint32_t max_throttle_size = calculate_max_throttle_size(vp);
2715
2716 /*
2717 * we're in the throttle window... at the very least
2718 * we want to limit the size of the I/O we're about
2719 * to issue
2720 */
2721 if ((flags & IO_RETURN_ON_THROTTLE) && throttle_type == THROTTLE_NOW) {
2722 /*
2723 * we're in the throttle window and at least 1 I/O
2724 * has already been issued by a throttleable thread
2725 * in this window, so return with EAGAIN to indicate
2726 * to the FS issuing the cluster_write call that it
2727 * should now throttle after dropping any locks
2728 */
2729 throttle_info_update_by_mount(mp: vp->v_mount);
2730
2731 io_throttled = TRUE;
2732 goto wait_for_dwrites;
2733 }
2734 max_vector_size = max_throttle_size;
2735 max_io_size = max_throttle_size;
2736 } else {
2737 max_vector_size = MAX_VECTOR_UPL_SIZE;
2738 max_io_size = max_upl_size;
2739 }
2740
2741 if (first_IO) {
2742 cluster_syncup(vp, newEOF, callback, callback_arg, flags: callback ? PUSH_SYNC : 0);
2743 first_IO = 0;
2744 }
2745 io_size = io_req_size & ~PAGE_MASK;
2746 iov_base = uio_curriovbase(a_uio: uio);
2747
2748 if (io_size > max_io_size) {
2749 io_size = max_io_size;
2750 }
2751
2752 if (useVectorUPL && (iov_base & PAGE_MASK)) {
2753 /*
2754 * We have an iov_base that's not page-aligned.
2755 * Issue all I/O's that have been collected within
2756 * this Vectored UPL.
2757 */
2758 if (vector_upl_index) {
2759 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp: (buf_t)NULL, iostate: &iostate, callback, callback_arg);
2760 reset_vector_run_state();
2761 }
2762
2763 /*
2764 * After this point, if we are using the Vector UPL path and the base is
2765 * not page-aligned then the UPL with that base will be the first in the vector UPL.
2766 */
2767 }
2768
2769 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
2770 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2771
2772 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
2773 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
2774
2775 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
2776 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
2777 pages_in_pl = 0;
2778 upl_size = (upl_size_t)upl_needed_size;
2779 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
2780 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
2781
2782 kret = vm_map_get_upl(target_map: map,
2783 map_offset: (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2784 size: &upl_size,
2785 upl: &upl,
2786 NULL,
2787 page_infoCnt: &pages_in_pl,
2788 flags: &upl_flags,
2789 VM_KERN_MEMORY_FILE,
2790 force_data_sync);
2791
2792 if (kret != KERN_SUCCESS) {
2793 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2794 0, 0, 0, kret, 0);
2795 /*
2796 * failed to get pagelist
2797 *
2798 * we may have already spun some portion of this request
2799 * off as async requests... we need to wait for the I/O
2800 * to complete before returning
2801 */
2802 goto wait_for_dwrites;
2803 }
2804 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2805 pages_in_pl = upl_size / PAGE_SIZE;
2806
2807 for (i = 0; i < pages_in_pl; i++) {
2808 if (!upl_valid_page(upl: pl, index: i)) {
2809 break;
2810 }
2811 }
2812 if (i == pages_in_pl) {
2813 break;
2814 }
2815
2816 /*
2817 * didn't get all the pages back that we
2818 * needed... release this upl and try again
2819 */
2820 ubc_upl_abort(upl, 0);
2821 }
2822 if (force_data_sync >= 3) {
2823 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2824 i, pages_in_pl, upl_size, kret, 0);
2825 /*
2826 * for some reason, we couldn't acquire a hold on all
2827 * the pages needed in the user's address space
2828 *
2829 * we may have already spun some portion of this request
2830 * off as async requests... we need to wait for the I/O
2831 * to complete before returning
2832 */
2833 goto wait_for_dwrites;
2834 }
2835
2836 /*
2837 * Consider the possibility that upl_size wasn't satisfied.
2838 */
2839 if (upl_size < upl_needed_size) {
2840 if (upl_size && upl_offset == 0) {
2841 io_size = upl_size;
2842 } else {
2843 io_size = 0;
2844 }
2845 }
2846 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2847 (int)upl_offset, upl_size, (int)iov_base, io_size, 0);
2848
2849 if (io_size == 0) {
2850 ubc_upl_abort(upl, 0);
2851 /*
2852 * we may have already spun some portion of this request
2853 * off as async requests... we need to wait for the I/O
2854 * to complete before returning
2855 */
2856 goto wait_for_dwrites;
2857 }
2858
2859 if (useVectorUPL) {
2860 vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
2861 if (end_off) {
2862 issueVectorUPL = 1;
2863 }
2864 /*
2865 * After this point, if we are using a vector UPL, then
2866 * either all the UPL elements end on a page boundary OR
2867 * this UPL is the last element because it does not end
2868 * on a page boundary.
2869 */
2870 }
2871
2872 /*
2873 * we want to push out these writes asynchronously so that we can overlap
2874 * the preparation of the next I/O
2875 * if there are already too many outstanding writes
2876 * wait until some complete before issuing the next
2877 */
2878 if (vp->v_mount->mnt_minsaturationbytecount) {
2879 bytes_outstanding_limit = vp->v_mount->mnt_minsaturationbytecount;
2880 } else {
2881 if (__improbable(os_mul_overflow(max_upl_size, IO_SCALE(vp, 2),
2882 &bytes_outstanding_limit) ||
2883 (bytes_outstanding_limit > overlapping_write_max))) {
2884 bytes_outstanding_limit = overlapping_write_max;
2885 }
2886 }
2887
2888 cluster_iostate_wait(iostate: &iostate, target: bytes_outstanding_limit, wait_name: "cluster_write_direct");
2889
2890 if (iostate.io_error) {
2891 /*
2892 * one of the earlier writes we issued ran into a hard error
2893 * don't issue any more writes... clean up the UPL
2894 * that was just created but not used, then
2895 * go wait for all writes that are part of this stream
2896 * to complete before returning the error to the caller
2897 */
2898 ubc_upl_abort(upl, 0);
2899
2900 goto wait_for_dwrites;
2901 }
2902
2903 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
2904 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
2905
2906 if (!useVectorUPL) {
2907 retval = cluster_io(vp, upl, upl_offset, f_offset: uio->uio_offset,
2908 non_rounded_size: io_size, flags: io_flag, real_bp: (buf_t)NULL, iostate: &iostate, callback, callback_arg);
2909 } else {
2910 if (!vector_upl_index) {
2911 vector_upl = vector_upl_create(upl_offset, uio->uio_iovcnt);
2912 v_upl_uio_offset = uio->uio_offset;
2913 vector_upl_offset = upl_offset;
2914 }
2915
2916 vector_upl_set_subupl(vector_upl, upl, upl_size);
2917 vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
2918 vector_upl_index++;
2919 vector_upl_iosize += io_size;
2920 vector_upl_size += upl_size;
2921
2922 if (issueVectorUPL || vector_upl_index == vector_upl_max_upls(vector_upl) || vector_upl_size >= max_vector_size) {
2923 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp: (buf_t)NULL, iostate: &iostate, callback, callback_arg);
2924 reset_vector_run_state();
2925 }
2926 }
2927
2928 /*
2929 * update the uio structure to
2930 * reflect the I/O that we just issued
2931 */
2932 uio_update(a_uio: uio, a_count: (user_size_t)io_size);
2933
2934 /*
2935 * in case we end up calling through to cluster_write_copy to finish
2936 * the tail of this request, we need to update the oldEOF so that we
2937 * don't zero-fill the head of a page if we've successfully written
2938 * data to that area... 'cluster_write_copy' will zero-fill the head of a
2939 * page that is beyond the oldEOF if the write is unaligned... we only
2940 * want that to happen for the very first page of the cluster_write,
2941 * NOT the first page of each vector making up a multi-vector write.
2942 */
2943 if (uio->uio_offset > oldEOF) {
2944 oldEOF = uio->uio_offset;
2945 }
2946
2947 io_req_size -= io_size;
2948
2949 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
2950 (int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0);
2951 } /* end while */
2952
2953 if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) {
2954 retval = cluster_io_type(uio, io_type: write_type, io_length: write_length, MIN_DIRECT_WRITE_SIZE);
2955
2956 if (retval == 0 && *write_type == IO_DIRECT) {
2957 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_NONE,
2958 (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
2959
2960 goto next_dwrite;
2961 }
2962 }
2963
2964wait_for_dwrites:
2965
2966 if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
2967 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp: (buf_t)NULL, iostate: &iostate, callback, callback_arg);
2968 reset_vector_run_state();
2969 }
2970 /*
2971 * make sure all async writes issued as part of this stream
2972 * have completed before we return
2973 */
2974 cluster_iostate_wait(iostate: &iostate, target: 0, wait_name: "cluster_write_direct");
2975
2976 if (iostate.io_error) {
2977 retval = iostate.io_error;
2978 }
2979
2980 lck_mtx_destroy(lck: &iostate.io_mtxp, grp: &cl_mtx_grp);
2981
2982 if (io_throttled == TRUE && retval == 0) {
2983 retval = EAGAIN;
2984 }
2985
2986 if (io_req_size && retval == 0) {
2987 /*
2988 * we couldn't handle the tail of this request in DIRECT mode
2989 * so fire it through the copy path
2990 *
2991 * note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set
2992 * so we can just pass 0 in for the headOff and tailOff
2993 */
2994 if (uio->uio_offset > oldEOF) {
2995 oldEOF = uio->uio_offset;
2996 }
2997
2998 retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, headOff: (off_t)0, tailOff: (off_t)0, flags, callback, callback_arg);
2999
3000 *write_type = IO_UNKNOWN;
3001 }
3002 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
3003 (int)uio->uio_offset, io_req_size, retval, 4, 0);
3004
3005 return retval;
3006}
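
/*
 * Illustrative sketch of the alignment screen applied in
 * cluster_write_direct above before a vector is written directly: the file
 * offset must be page aligned and the user buffer must satisfy both the
 * device's DMA alignment mask and its block size; any miss sends the
 * vector through the cached copy path instead.  The helper name is
 * hypothetical.
 */
static __unused boolean_t
cluster_write_direct_example_aligned(off_t f_offset, user_addr_t iov_base,
    u_int32_t mem_alignment_mask, u_int32_t devblocksize)
{
	if ((u_int32_t)f_offset & PAGE_MASK) {
		/* file offset not page aligned */
		return FALSE;
	}
	if ((u_int32_t)iov_base & mem_alignment_mask) {
		/* buffer misaligned for the DMA engine */
		return FALSE;
	}
	if (iov_base & (devblocksize - 1)) {
		/* buffer not on a device block boundary */
		return FALSE;
	}
	return TRUE;
}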
3007
3008
3009static int
3010cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length,
3011 int (*callback)(buf_t, void *), void *callback_arg, int bflag)
3012{
3013 upl_page_info_t *pl;
3014 addr64_t src_paddr = 0;
3015 upl_t upl[MAX_VECTS];
3016 vm_offset_t upl_offset;
3017 u_int32_t tail_size = 0;
3018 u_int32_t io_size;
3019 u_int32_t xsize;
3020 upl_size_t upl_size;
3021 vm_size_t upl_needed_size;
3022 mach_msg_type_number_t pages_in_pl;
3023 upl_control_flags_t upl_flags;
3024 kern_return_t kret;
3025 struct clios iostate;
3026 int error = 0;
3027 int cur_upl = 0;
3028 int num_upl = 0;
3029 int n;
3030 user_addr_t iov_base;
3031 u_int32_t devblocksize;
3032 u_int32_t mem_alignment_mask;
3033
3034 /*
3035 * When we enter this routine, we know
3036 * -- the io_req_size will not exceed iov_len
3037 * -- the target address is physically contiguous
3038 */
3039 cluster_syncup(vp, newEOF, callback, callback_arg, flags: callback ? PUSH_SYNC : 0);
3040
3041 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
3042 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
3043
3044 iostate.io_completed = 0;
3045 iostate.io_issued = 0;
3046 iostate.io_error = 0;
3047 iostate.io_wanted = 0;
3048
3049 lck_mtx_init(lck: &iostate.io_mtxp, grp: &cl_mtx_grp, LCK_ATTR_NULL);
3050
3051next_cwrite:
3052 io_size = *write_length;
3053
3054 iov_base = uio_curriovbase(a_uio: uio);
3055
3056 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
3057 upl_needed_size = upl_offset + io_size;
3058
3059 pages_in_pl = 0;
3060 upl_size = (upl_size_t)upl_needed_size;
3061 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
3062 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
3063
3064 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
3065 kret = vm_map_get_upl(target_map: map,
3066 vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
3067 size: &upl_size, upl: &upl[cur_upl], NULL, page_infoCnt: &pages_in_pl, flags: &upl_flags, VM_KERN_MEMORY_FILE, force_data_sync: 0);
3068
3069 if (kret != KERN_SUCCESS) {
3070 /*
3071 * failed to get pagelist
3072 */
3073 error = EINVAL;
3074 goto wait_for_cwrites;
3075 }
3076 num_upl++;
3077
3078 /*
3079 * Consider the possibility that upl_size wasn't satisfied.
3080 */
3081 if (upl_size < upl_needed_size) {
3082 /*
3083 * This is a failure in the physical memory case.
3084 */
3085 error = EINVAL;
3086 goto wait_for_cwrites;
3087 }
3088 pl = ubc_upl_pageinfo(upl[cur_upl]);
3089
3090 src_paddr = ((addr64_t)upl_phys_page(upl: pl, index: 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
3091
3092 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
3093 u_int32_t head_size;
3094
3095 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
3096
3097 if (head_size > io_size) {
3098 head_size = io_size;
3099 }
3100
3101 error = cluster_align_phys_io(vp, uio, usr_paddr: src_paddr, xsize: head_size, flags: 0, callback, callback_arg);
3102
3103 if (error) {
3104 goto wait_for_cwrites;
3105 }
3106
3107 upl_offset += head_size;
3108 src_paddr += head_size;
3109 io_size -= head_size;
3110
3111 iov_base += head_size;
3112 }
3113 if ((u_int32_t)iov_base & mem_alignment_mask) {
3114 /*
3115 * the request isn't aligned to a memory boundary
3116 * that the underlying DMA engine can handle...
3117 * return an error instead of going through
3118 * the slow copy path since the intent of this
3119 * path is direct I/O from device memory
3120 */
3121 error = EINVAL;
3122 goto wait_for_cwrites;
3123 }
3124
3125 tail_size = io_size & (devblocksize - 1);
3126 io_size -= tail_size;
3127
3128 while (io_size && error == 0) {
3129 if (io_size > MAX_IO_CONTIG_SIZE) {
3130 xsize = MAX_IO_CONTIG_SIZE;
3131 } else {
3132 xsize = io_size;
3133 }
3134 /*
3135 * request asynchronously so that we can overlap
3136 * the preparation of the next I/O... we'll do
3137 * the commit after all the I/O has completed
3138 * since it's all issued against the same UPL
3139 * if there are already too many outstanding writes
3140 * wait until some have completed before issuing the next
3141 */
3142 cluster_iostate_wait(iostate: &iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), wait_name: "cluster_write_contig");
3143
3144 if (iostate.io_error) {
3145 /*
3146 * one of the earlier writes we issued ran into a hard error
3147 * don't issue any more writes...
3148 * go wait for all writes that are part of this stream
3149 * to complete before returning the error to the caller
3150 */
3151 goto wait_for_cwrites;
3152 }
3153 /*
3154 * issue an asynchronous write to cluster_io
3155 */
3156 error = cluster_io(vp, upl: upl[cur_upl], upl_offset, f_offset: uio->uio_offset,
3157 non_rounded_size: xsize, CL_DEV_MEMORY | CL_ASYNC | bflag, real_bp: (buf_t)NULL, iostate: (struct clios *)&iostate, callback, callback_arg);
3158
3159 if (error == 0) {
3160 /*
3161 * The cluster_io write was issued successfully,
3162 * update the uio structure
3163 */
3164 uio_update(a_uio: uio, a_count: (user_size_t)xsize);
3165
3166 upl_offset += xsize;
3167 src_paddr += xsize;
3168 io_size -= xsize;
3169 }
3170 }
3171 if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) {
3172 error = cluster_io_type(uio, io_type: write_type, io_length: write_length, min_length: 0);
3173
3174 if (error == 0 && *write_type == IO_CONTIG) {
3175 cur_upl++;
3176 goto next_cwrite;
3177 }
3178 } else {
3179 *write_type = IO_UNKNOWN;
3180 }
3181
3182wait_for_cwrites:
3183 /*
3184 * make sure all async writes that are part of this stream
3185 * have completed before we proceed
3186 */
3187 cluster_iostate_wait(iostate: &iostate, target: 0, wait_name: "cluster_write_contig");
3188
3189 if (iostate.io_error) {
3190 error = iostate.io_error;
3191 }
3192
3193 lck_mtx_destroy(lck: &iostate.io_mtxp, grp: &cl_mtx_grp);
3194
3195 if (error == 0 && tail_size) {
3196 error = cluster_align_phys_io(vp, uio, usr_paddr: src_paddr, xsize: tail_size, flags: 0, callback, callback_arg);
3197 }
3198
3199 for (n = 0; n < num_upl; n++) {
3200 /*
3201 * just release our hold on each physically contiguous
3202 * region without changing any state
3203 */
3204 ubc_upl_abort(upl[n], 0);
3205 }
3206
3207 return error;
3208}
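
/*
 * Illustrative sketch of the head/tail split performed in
 * cluster_write_contig above: a head fragment brings the file offset up to
 * a device block boundary, a tail fragment is whatever fraction of a
 * device block remains at the end, and only the aligned middle is issued
 * as direct I/O (the fragments go through cluster_align_phys_io).  All
 * names are hypothetical.
 */
static __unused void
cluster_write_contig_example_split(off_t f_offset, u_int32_t io_size, u_int32_t devblocksize,
    u_int32_t *head_size, u_int32_t *body_size, u_int32_t *tail_size)
{
	u_int32_t head = 0;

	if (f_offset & (devblocksize - 1)) {
		/* bytes needed to reach the next device block boundary */
		head = devblocksize - (u_int32_t)(f_offset & (devblocksize - 1));
		if (head > io_size) {
			head = io_size;
		}
	}
	*head_size = head;
	*tail_size = (io_size - head) & (devblocksize - 1);
	*body_size = io_size - head - *tail_size;
}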
3209
3210
3211/*
3212 * need to avoid a race between an msync of a range of pages dirtied via mmap
3213 * vs a filesystem such as HFS deciding to write a 'hole' to disk via cluster_write's
3214 * zerofill mechanism before it has seen the VNOP_PAGEOUTs for the pages being msync'd
3215 *
3216 * we should never force-zero-fill pages that are already valid in the cache...
3217 * the entire page contains valid data (either from disk, zero-filled or dirtied
3218 * via an mmap) so we can only do damage by trying to zero-fill
3219 *
3220 */
3221static int
3222cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off_t zero_off, off_t upl_f_offset, int bytes_to_zero)
3223{
3224 int zero_pg_index;
3225 boolean_t need_cluster_zero = TRUE;
3226
3227 if ((flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
3228 bytes_to_zero = min(a: bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
3229 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
3230
3231 if (upl_valid_page(upl: pl, index: zero_pg_index)) {
3232 /*
3233 * never force zero valid pages - dirty or clean
3234 * we'll leave these in the UPL for cluster_write_copy to deal with
3235 */
3236 need_cluster_zero = FALSE;
3237 }
3238 }
3239 if (need_cluster_zero == TRUE) {
3240 cluster_zero(upl, upl_offset: io_offset, size: bytes_to_zero, NULL);
3241 }
3242
3243 return bytes_to_zero;
3244}
3245
3246
3247void
3248cluster_update_state(vnode_t vp, vm_object_offset_t s_offset, vm_object_offset_t e_offset, boolean_t vm_initiated)
3249{
3250 struct cl_extent cl;
3251 boolean_t first_pass = TRUE;
3252
3253 assert(s_offset < e_offset);
3254 assert((s_offset & PAGE_MASK_64) == 0);
3255 assert((e_offset & PAGE_MASK_64) == 0);
3256
3257 cl.b_addr = (daddr64_t)(s_offset / PAGE_SIZE_64);
3258 cl.e_addr = (daddr64_t)(e_offset / PAGE_SIZE_64);
3259
3260 cluster_update_state_internal(vp, cl: &cl, flags: 0, TRUE, first_pass: &first_pass, write_off: s_offset, write_cnt: (int)(e_offset - s_offset),
3261 newEOF: vp->v_un.vu_ubcinfo->ui_size, NULL, NULL, vm_initiated);
3262}
3263
3264
3265static void
3266cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes,
3267 boolean_t *first_pass, off_t write_off, int write_cnt, off_t newEOF,
3268 int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
3269{
3270 struct cl_writebehind *wbp;
3271 int cl_index;
3272 int ret_cluster_try_push;
3273 u_int max_cluster_pgcount;
3274
3275
3276 max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
3277
3278 /*
3279 * take the lock to protect our accesses
3280 * of the writebehind and sparse cluster state
3281 */
3282 wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
3283
3284 if (wbp->cl_scmap) {
3285 if (!(flags & IO_NOCACHE)) {
3286 /*
3287 * we've fallen into the sparse
3288 * cluster method of delaying dirty pages
3289 */
3290 sparse_cluster_add(wbp, cmapp: &(wbp->cl_scmap), vp, cl, EOF: newEOF, callback, callback_arg, vm_initiated);
3291
3292 lck_mtx_unlock(lck: &wbp->cl_lockw);
3293 return;
3294 }
3295 /*
3296 * must have done cached writes that fell into
3297 * the sparse cluster mechanism... we've switched
3298 * to uncached writes on the file, so go ahead
3299 * and push whatever's in the sparse map
3300 * and switch back to normal clustering
3301 */
3302 wbp->cl_number = 0;
3303
3304 sparse_cluster_push(wbp, cmapp: &(wbp->cl_scmap), vp, EOF: newEOF, PUSH_ALL, io_flags: 0, callback, callback_arg, vm_initiated);
3305 /*
3306 * no clusters of either type present at this point
3307 * so just go directly to start_new_cluster since
3308 * we know we need to delay this I/O because we've
3309 * already released the pages back into the cache
3310 * to avoid the deadlock with sparse_cluster_push
3311 */
3312 goto start_new_cluster;
3313 }
3314 if (*first_pass == TRUE) {
3315 if (write_off == wbp->cl_last_write) {
3316 wbp->cl_seq_written += write_cnt;
3317 } else {
3318 wbp->cl_seq_written = write_cnt;
3319 }
3320
3321 wbp->cl_last_write = write_off + write_cnt;
3322
3323 *first_pass = FALSE;
3324 }
3325 if (wbp->cl_number == 0) {
3326 /*
3327 * no clusters currently present
3328 */
3329 goto start_new_cluster;
3330 }
3331
3332 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
3333 /*
3334 * check each cluster that we currently hold
3335 * try to merge some or all of this write into
3336 * one or more of the existing clusters... if
3337 * any portion of the write remains, start a
3338 * new cluster
3339 */
3340 if (cl->b_addr >= wbp->cl_clusters[cl_index].b_addr) {
3341 /*
3342 * the current write starts at or after the current cluster
3343 */
3344 if (cl->e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
3345 /*
3346 * we have a write that fits entirely
3347 * within the existing cluster limits
3348 */
3349 if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
3350 /*
3351 * update our idea of where the cluster ends
3352 */
3353 wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
3354 }
3355 break;
3356 }
3357 if (cl->b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
3358 /*
3359 * we have a write that starts in the middle of the current cluster
3360 * but extends beyond the cluster's limit... we know this because
3361 * of the previous checks
3362 * we'll extend the current cluster to the max
3363 * and update the b_addr for the current write to reflect that
3364 * the head of it was absorbed into this cluster...
3365 * note that we'll always have a leftover tail in this case since
3366 * full absorption would have occurred in the clause above
3367 */
3368 wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount;
3369
3370 cl->b_addr = wbp->cl_clusters[cl_index].e_addr;
3371 }
3372 /*
3373 * we come here for the case where the current write starts
3374 * beyond the limit of the existing cluster or we have a leftover
3375 * tail after a partial absorption
3376 *
3377 * in either case, we'll check the remaining clusters before
3378 * starting a new one
3379 */
3380 } else {
3381 /*
3382 * the current write starts in front of the cluster we're currently considering
3383 */
3384 if ((wbp->cl_clusters[cl_index].e_addr - cl->b_addr) <= max_cluster_pgcount) {
3385 /*
3386 * we can just merge the new request into
3387 * this cluster and leave it in the cache
3388 * since the resulting cluster is still
3389 * less than the maximum allowable size
3390 */
3391 wbp->cl_clusters[cl_index].b_addr = cl->b_addr;
3392
3393 if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
3394 /*
3395 * the current write completely
3396 * envelops the existing cluster and since
3397 * each write is limited to at most max_cluster_pgcount pages
3398 * we can just use the start and last blocknos of the write
3399 * to generate the cluster limits
3400 */
3401 wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
3402 }
3403 break;
3404 }
3405 /*
3406 * if we were to combine this write with the current cluster
3407 * we would exceed the cluster size limit.... so,
3408 * let's see if there's any overlap of the new I/O with
3409 * the cluster we're currently considering... in fact, we'll
3410 * stretch the cluster out to its full limit and see if we
3411 * get an intersection with the current write
3412 *
3413 */
3414 if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) {
3415 /*
3416 * the current write extends into the proposed cluster
3417 * clip the length of the current write after first combining its
3418 * tail with the newly shaped cluster
3419 */
3420 wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount;
3421
3422 cl->e_addr = wbp->cl_clusters[cl_index].b_addr;
3423 }
3424 /*
3425 * if we get here, there was no way to merge
3426 * any portion of this write with this cluster
3427 * or we could only merge part of it which
3428 * will leave a tail...
3429 * we'll check the remaining clusters before starting a new one
3430 */
3431 }
3432 }
3433 if (cl_index < wbp->cl_number) {
3434 /*
3435 * we found one or more existing clusters that we
3436 * could entirely merge this I/O into
3437 */
3438 goto delay_io;
3439 }
3440
3441 if (defer_writes == FALSE &&
3442 wbp->cl_number == MAX_CLUSTERS &&
3443 wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) {
3444 uint32_t n;
3445
3446 if (vp->v_mount->mnt_minsaturationbytecount) {
3447 n = vp->v_mount->mnt_minsaturationbytecount / MAX_CLUSTER_SIZE(vp);
3448
3449 if (n > MAX_CLUSTERS) {
3450 n = MAX_CLUSTERS;
3451 }
3452 } else {
3453 n = 0;
3454 }
3455
3456 if (n == 0) {
3457 if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
3458 n = WRITE_BEHIND_SSD;
3459 } else {
3460 n = WRITE_BEHIND;
3461 }
3462 }
3463 while (n--) {
3464 cluster_try_push(wbp, vp, EOF: newEOF, push_flag: 0, flags: 0, callback, callback_arg, NULL, vm_initiated);
3465 }
3466 }
3467 if (wbp->cl_number < MAX_CLUSTERS) {
3468 /*
3469 * we didn't find an existing cluster to
3470 * merge into, but there's room to start
3471 * a new one
3472 */
3473 goto start_new_cluster;
3474 }
3475 /*
3476 * no existing cluster to merge with and no
3477 * room to start a new one... we'll try
3478 * pushing one of the existing ones... if none of
3479 * them are able to be pushed, we'll switch
3480 * to the sparse cluster mechanism
3481 * cluster_try_push updates cl_number to the
3482 * number of remaining clusters... and
3483 * returns the number of currently unused clusters
3484 */
3485 ret_cluster_try_push = 0;
3486
3487 /*
3488 * if writes are not deferred, call cluster push immediately
3489 */
3490 if (defer_writes == FALSE) {
3491 ret_cluster_try_push = cluster_try_push(wbp, vp, EOF: newEOF, push_flag: (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, flags: 0, callback, callback_arg, NULL, vm_initiated);
3492 }
3493 /*
3494 * execute the following regardless of whether writes are deferred
3495 */
3496 if (ret_cluster_try_push == 0) {
3497 /*
3498 * no more room in the normal cluster mechanism
3499 * so let's switch to the more expansive but expensive
3500 * sparse mechanism....
3501 */
3502 sparse_cluster_switch(wbp, vp, EOF: newEOF, callback, callback_arg, vm_initiated);
3503 sparse_cluster_add(wbp, cmapp: &(wbp->cl_scmap), vp, cl, EOF: newEOF, callback, callback_arg, vm_initiated);
3504
3505 lck_mtx_unlock(lck: &wbp->cl_lockw);
3506 return;
3507 }
3508start_new_cluster:
3509 wbp->cl_clusters[wbp->cl_number].b_addr = cl->b_addr;
3510 wbp->cl_clusters[wbp->cl_number].e_addr = cl->e_addr;
3511
3512 wbp->cl_clusters[wbp->cl_number].io_flags = 0;
3513
3514 if (flags & IO_NOCACHE) {
3515 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE;
3516 }
3517
3518 if (flags & IO_PASSIVE) {
3519 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE;
3520 }
3521
3522 wbp->cl_number++;
3523delay_io:
3524 lck_mtx_unlock(&wbp->cl_lockw);
3525 return;
3526}
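/*
 * Illustrative sketch (not compiled): the size rule applied above when deciding
 * whether a new write extent can be fully folded into an existing write-behind
 * cluster.  Per the comments above, a cluster may cover at most
 * max_cluster_pgcount pages, so a full merge is only possible when the combined
 * extent stays within that limit.  The helper name below is hypothetical and
 * exists only for this example; extent addresses are page indices.
 */
#if 0
static boolean_t
example_extent_fits_cluster(daddr64_t wr_b, daddr64_t wr_e,
    daddr64_t cl_b, daddr64_t cl_e, int max_cluster_pgcount)
{
	/* extent the cluster would have to cover if the write were merged in */
	daddr64_t new_b = MIN(wr_b, cl_b);
	daddr64_t new_e = MAX(wr_e, cl_e);

	/* the combined extent must not exceed max_cluster_pgcount pages */
	return (new_e - new_b) <= max_cluster_pgcount;
}
#endif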
3527
3528
3529static int
3530cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff,
3531 off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg)
3532{
3533 upl_page_info_t *pl;
3534 upl_t upl;
3535 vm_offset_t upl_offset = 0;
3536 vm_size_t upl_size;
3537 off_t upl_f_offset;
3538 int pages_in_upl;
3539 int start_offset;
3540 int xfer_resid;
3541 int io_size;
3542 int io_offset;
3543 int bytes_to_zero;
3544 int bytes_to_move;
3545 kern_return_t kret;
3546 int retval = 0;
3547 int io_resid;
3548 long long total_size;
3549 long long zero_cnt;
3550 off_t zero_off;
3551 long long zero_cnt1;
3552 off_t zero_off1;
3553 off_t write_off = 0;
3554 int write_cnt = 0;
3555 boolean_t first_pass = FALSE;
3556 struct cl_extent cl;
3557 int bflag;
3558 u_int max_io_size;
3559
3560 if (uio) {
3561 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
3562 (int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0);
3563
3564 io_resid = io_req_size;
3565 } else {
3566 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
3567 0, 0, (int)oldEOF, (int)newEOF, 0);
3568
3569 io_resid = 0;
3570 }
3571 if (flags & IO_PASSIVE) {
3572 bflag = CL_PASSIVE;
3573 } else {
3574 bflag = 0;
3575 }
3576 if (flags & IO_NOCACHE) {
3577 bflag |= CL_NOCACHE;
3578 }
3579
3580 if (flags & IO_SKIP_ENCRYPTION) {
3581 bflag |= CL_ENCRYPTED;
3582 }
3583
3584 zero_cnt = 0;
3585 zero_cnt1 = 0;
3586 zero_off = 0;
3587 zero_off1 = 0;
3588
3589 max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
3590
3591 if (flags & IO_HEADZEROFILL) {
3592 /*
3593 * some filesystems (HFS is one) don't support unallocated holes within a file...
3594 * so we zero fill the intervening space between the old EOF and the offset
3595 * where the next chunk of real data begins.... ftruncate will also use this
3596 * routine to zero fill to the new EOF when growing a file... in this case, the
3597 * uio structure will not be provided
3598 */
3599 if (uio) {
3600 if (headOff < uio->uio_offset) {
3601 zero_cnt = uio->uio_offset - headOff;
3602 zero_off = headOff;
3603 }
3604 } else if (headOff < newEOF) {
3605 zero_cnt = newEOF - headOff;
3606 zero_off = headOff;
3607 }
3608 } else {
3609 if (uio && uio->uio_offset > oldEOF) {
3610 zero_off = uio->uio_offset & ~PAGE_MASK_64;
3611
3612 if (zero_off >= oldEOF) {
3613 zero_cnt = uio->uio_offset - zero_off;
3614
3615 flags |= IO_HEADZEROFILL;
3616 }
3617 }
3618 }
3619 if (flags & IO_TAILZEROFILL) {
3620 if (uio) {
3621 zero_off1 = uio->uio_offset + io_req_size;
3622
3623 if (zero_off1 < tailOff) {
3624 zero_cnt1 = tailOff - zero_off1;
3625 }
3626 }
3627 } else {
3628 if (uio && newEOF > oldEOF) {
3629 zero_off1 = uio->uio_offset + io_req_size;
3630
3631 if (zero_off1 == newEOF && (zero_off1 & PAGE_MASK_64)) {
3632 zero_cnt1 = PAGE_SIZE_64 - (zero_off1 & PAGE_MASK_64);
3633
3634 flags |= IO_TAILZEROFILL;
3635 }
3636 }
3637 }
3638 if (zero_cnt == 0 && uio == (struct uio *) 0) {
3639 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
3640 retval, 0, 0, 0, 0);
3641 return 0;
3642 }
3643 if (uio) {
3644 write_off = uio->uio_offset;
3645 write_cnt = (int)uio_resid(uio);
3646 /*
3647 * delay updating the sequential write info
3648 * in the control block until we've obtained
3649 * the lock for it
3650 */
3651 first_pass = TRUE;
3652 }
3653 while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
3654 /*
3655 * for this iteration of the loop, figure out where our starting point is
3656 */
3657 if (zero_cnt) {
3658 start_offset = (int)(zero_off & PAGE_MASK_64);
3659 upl_f_offset = zero_off - start_offset;
3660 } else if (io_resid) {
3661 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3662 upl_f_offset = uio->uio_offset - start_offset;
3663 } else {
3664 start_offset = (int)(zero_off1 & PAGE_MASK_64);
3665 upl_f_offset = zero_off1 - start_offset;
3666 }
3667 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
3668 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
3669
3670 if (total_size > max_io_size) {
3671 total_size = max_io_size;
3672 }
3673
3674 cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
3675
3676 if (uio && ((flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
3677 /*
3678 * assumption... total_size <= io_resid
3679 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
3680 */
3681 if ((start_offset + total_size) > max_io_size) {
3682 total_size = max_io_size - start_offset;
3683 }
3684 xfer_resid = (int)total_size;
3685
3686 retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1);
3687
3688 if (retval) {
3689 break;
3690 }
3691
3692 io_resid -= (total_size - xfer_resid);
3693 total_size = xfer_resid;
3694 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3695 upl_f_offset = uio->uio_offset - start_offset;
3696
3697 if (total_size == 0) {
3698 if (start_offset) {
3699 /*
3700 * the write did not finish on a page boundary
3701 * which will leave upl_f_offset pointing to the
3702 * beginning of the last page written instead of
3703 * the page beyond it... bump it in this case
3704 * so that the cluster code records the last page
3705 * written as dirty
3706 */
3707 upl_f_offset += PAGE_SIZE_64;
3708 }
3709 upl_size = 0;
3710
3711 goto check_cluster;
3712 }
3713 }
3714 /*
3715 * compute the size of the upl needed to encompass
3716 * the requested write... limit each call to cluster_io
3717 * to the maximum UPL size... cluster_io will clip if
3718 * this exceeds the maximum io_size for the device...
3719 * also make sure to account for
3720 * a starting offset that's not page aligned
3721 */
3722 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3723
3724 if (upl_size > max_io_size) {
3725 upl_size = max_io_size;
3726 }
3727
3728 pages_in_upl = (int)(upl_size / PAGE_SIZE);
3729 io_size = (int)(upl_size - start_offset);
3730
3731 if ((long long)io_size > total_size) {
3732 io_size = (int)total_size;
3733 }
3734
3735 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
3736
3737
3738 /*
3739 * Gather the pages from the buffer cache.
3740 * The UPL_WILL_MODIFY flag lets the UPL subsystem know
3741 * that we intend to modify these pages.
3742 */
3743 kret = ubc_create_upl_kernel(vp,
3744 upl_f_offset,
3745 (int)upl_size,
3746 &upl,
3747 &pl,
3748 UPL_SET_LITE | ((uio != NULL && (uio->uio_flags & UIO_FLAGS_IS_COMPRESSED_FILE)) ? 0 : UPL_WILL_MODIFY),
3749 VM_KERN_MEMORY_FILE);
3750 if (kret != KERN_SUCCESS) {
3751 panic("cluster_write_copy: failed to get pagelist");
3752 }
3753
3754 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
3755 upl, (int)upl_f_offset, start_offset, 0, 0);
3756
3757 if (start_offset && upl_f_offset < oldEOF && !upl_valid_page(pl, 0)) {
3758 int read_size;
3759
3760 /*
3761 * we're starting in the middle of the first page of the upl
3762 * and the page isn't currently valid, so we're going to have
3763 * to read it in first... this is a synchronous operation
3764 */
3765 read_size = PAGE_SIZE;
3766
3767 if ((upl_f_offset + read_size) > oldEOF) {
3768 read_size = (int)(oldEOF - upl_f_offset);
3769 }
3770
3771 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
3772 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
3773 if (retval) {
3774 /*
3775 * we had an error during the read which causes us to abort
3776 * the current cluster_write request... before we do, we need
3777 * to release the rest of the pages in the upl without modifying
3778 * their state and mark the failed page in error
3779 */
3780 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3781
3782 if (upl_size > PAGE_SIZE) {
3783 ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size,
3784 UPL_ABORT_FREE_ON_EMPTY);
3785 }
3786
3787 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3788 upl, 0, 0, retval, 0);
3789 break;
3790 }
3791 }
3792 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
3793 /*
3794 * the last offset we're writing to in this upl does not end on a page
3795 * boundary... if it's not beyond the old EOF, then we'll also need to
3796 * pre-read this page in if it isn't already valid
3797 */
3798 upl_offset = upl_size - PAGE_SIZE;
3799
3800 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
3801 !upl_valid_page(pl, (int)(upl_offset / PAGE_SIZE))) {
3802 int read_size;
3803
3804 read_size = PAGE_SIZE;
3805
3806 if ((off_t)(upl_f_offset + upl_offset + read_size) > oldEOF) {
3807 read_size = (int)(oldEOF - (upl_f_offset + upl_offset));
3808 }
3809
3810 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
3811 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
3812 if (retval) {
3813 /*
3814 * we had an error during the read which causes us to abort
3815 * the current cluster_write request... before we do, we
3816 * need to release the rest of the pages in the upl without
3817 * modifying their state and mark the failed page in error
3818 */
3819 ubc_upl_abort_range(upl, (upl_offset_t)upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3820
3821 if (upl_size > PAGE_SIZE) {
3822 ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size, UPL_ABORT_FREE_ON_EMPTY);
3823 }
3824
3825 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3826 upl, 0, 0, retval, 0);
3827 break;
3828 }
3829 }
3830 }
3831 xfer_resid = io_size;
3832 io_offset = start_offset;
3833
3834 while (zero_cnt && xfer_resid) {
3835 if (zero_cnt < (long long)xfer_resid) {
3836 bytes_to_zero = (int)zero_cnt;
3837 } else {
3838 bytes_to_zero = xfer_resid;
3839 }
3840
3841 bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off, upl_f_offset, bytes_to_zero);
3842
3843 xfer_resid -= bytes_to_zero;
3844 zero_cnt -= bytes_to_zero;
3845 zero_off += bytes_to_zero;
3846 io_offset += bytes_to_zero;
3847 }
3848 if (xfer_resid && io_resid) {
3849 u_int32_t io_requested;
3850
3851 bytes_to_move = min(io_resid, xfer_resid);
3852 io_requested = bytes_to_move;
3853
3854 retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested);
3855
3856 if (retval) {
3857 ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size, UPL_ABORT_FREE_ON_EMPTY);
3858
3859 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3860 upl, 0, 0, retval, 0);
3861 } else {
3862 io_resid -= bytes_to_move;
3863 xfer_resid -= bytes_to_move;
3864 io_offset += bytes_to_move;
3865 }
3866 }
3867 while (xfer_resid && zero_cnt1 && retval == 0) {
3868 if (zero_cnt1 < (long long)xfer_resid) {
3869 bytes_to_zero = (int)zero_cnt1;
3870 } else {
3871 bytes_to_zero = xfer_resid;
3872 }
3873
3874 bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off1, upl_f_offset, bytes_to_zero);
3875
3876 xfer_resid -= bytes_to_zero;
3877 zero_cnt1 -= bytes_to_zero;
3878 zero_off1 += bytes_to_zero;
3879 io_offset += bytes_to_zero;
3880 }
3881 if (retval == 0) {
3882 int do_zeroing = 1;
3883
3884 io_size += start_offset;
3885
3886 /* Force more restrictive zeroing behavior only on APFS */
3887 if ((vnode_tag(vp) == VT_APFS) && (newEOF < oldEOF)) {
3888 do_zeroing = 0;
3889 }
3890
3891 if (do_zeroing && (upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) {
3892 /*
3893 * if we're extending the file with this write
3894 * we'll zero fill the rest of the page so that
3895 * if the file gets extended again in such a way as to leave a
3896 * hole starting at this EOF, we'll have zeros in the correct spot
3897 */
3898 cluster_zero(upl, io_size, (int)(upl_size - io_size), NULL);
3899 }
3900 /*
3901 * release the upl now if we hold one since...
3902 * 1) pages in it may be present in the sparse cluster map
3903 * and may span 2 separate buckets there... if they do and
3904 * we happen to have to flush a bucket to make room and it intersects
3905 * this upl, a deadlock may result on page BUSY
3906 * 2) we're delaying the I/O... from this point forward we're just updating
3907 * the cluster state... no need to hold the pages, so commit them
3908 * 3) IO_SYNC is set...
3909 * because we had to ask for a UPL that provides currently non-present pages, the
3910 * UPL has been automatically set to clear the dirty flags (both software and hardware)
3911 * upon committing it... this is not the behavior we want since it's possible for
3912 * pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
3913 * we'll pick these pages back up later with the correct behavior specified.
3914 * 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... if a flush
3915 * of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages
3916 * we hold since the flushing context is holding the cluster lock.
3917 */
3918 ubc_upl_commit_range(upl, 0, (upl_size_t)upl_size,
3919 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
3920check_cluster:
3921 /*
3922 * calculate the last logical block number
3923 * that this delayed I/O encompassed
3924 */
3925 cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
3926
3927 if (flags & IO_SYNC) {
3928 /*
3929 * if the IO_SYNC flag is set then we need to bypass
3930 * any clustering and immediately issue the I/O
3931 *
3932 * we don't hold the lock at this point
3933 *
3934 * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set
3935 * so that we correctly deal with a change in state of the hardware modify bit...
3936 * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force
3937 * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also
3938 * responsible for generating the correct sized I/O(s)
3939 */
3940 retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg, FALSE);
3941 } else {
3942 boolean_t defer_writes = FALSE;
3943
3944 if (vfs_flags(vp->v_mount) & MNT_DEFWRITE) {
3945 defer_writes = TRUE;
3946 }
3947
3948 cluster_update_state_internal(vp, &cl, flags, defer_writes, &first_pass,
3949 write_off, write_cnt, newEOF, callback, callback_arg, FALSE);
3950 }
3951 }
3952 }
3953 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0);
3954
3955 return retval;
3956}
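/*
 * Illustrative sketch (not compiled): how cluster_write_copy derives the head
 * and tail zero-fill ranges when the caller did not pass IO_HEADZEROFILL /
 * IO_TAILZEROFILL explicitly, i.e. for a write that starts beyond the old EOF
 * and/or ends mid-page exactly at the new EOF.  Variable names mirror the
 * locals in cluster_write_copy; the wrapper itself is hypothetical.
 */
#if 0
static void
example_zero_ranges(off_t write_offset, u_int32_t write_len, off_t oldEOF, off_t newEOF,
    off_t *zero_off, long long *zero_cnt, off_t *zero_off1, long long *zero_cnt1)
{
	*zero_cnt = *zero_cnt1 = 0;

	if (write_offset > oldEOF) {
		/* head: zero from the start of the page containing the write, if that page lies past the old EOF */
		*zero_off = write_offset & ~PAGE_MASK_64;
		if (*zero_off >= oldEOF) {
			*zero_cnt = write_offset - *zero_off;
		}
	}
	if (newEOF > oldEOF) {
		/* tail: zero out to the end of the last page if the write ends exactly at the new EOF mid-page */
		*zero_off1 = write_offset + write_len;
		if (*zero_off1 == newEOF && (*zero_off1 & PAGE_MASK_64)) {
			*zero_cnt1 = PAGE_SIZE_64 - (*zero_off1 & PAGE_MASK_64);
		}
	}
}
#endif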
3957
3958
3959
3960int
3961cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
3962{
3963 return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL);
3964}
3965
3966
3967int
3968cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (*callback)(buf_t, void *), void *callback_arg)
3969{
3970 int retval = 0;
3971 int flags;
3972 user_ssize_t cur_resid;
3973 u_int32_t io_size;
3974 u_int32_t read_length = 0;
3975 int read_type = IO_COPY;
3976
3977 flags = xflags;
3978
3979 if (vp->v_flag & VNOCACHE_DATA) {
3980 flags |= IO_NOCACHE;
3981 }
3982 if ((vp->v_flag & VRAOFF) || speculative_reads_disabled) {
3983 flags |= IO_RAOFF;
3984 }
3985
3986 if (flags & IO_SKIP_ENCRYPTION) {
3987 flags |= IO_ENCRYPTED;
3988 }
3989
3990 /*
3991 * do a read through the cache if one of the following is true....
3992 * NOCACHE is not true
3993 * the uio request doesn't target USERSPACE
3994 * Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well.
3995 * Reading encrypted data from a CP filesystem should never result in the data touching
3996 * the UBC.
3997 *
3998 * otherwise, find out if we want the direct or contig variant for
3999 * the first vector in the uio request
4000 */
4001 if (((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) || (flags & IO_ENCRYPTED)) {
4002 retval = cluster_io_type(uio, &read_type, &read_length, 0);
4003 }
4004
4005 while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) {
4006 switch (read_type) {
4007 case IO_COPY:
4008 /*
4009 * make sure the uio_resid isn't too big...
4010 * internally, we want to handle all of the I/O in
4011 * chunk sizes that fit in a 32 bit int
4012 */
4013 if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
4014 io_size = MAX_IO_REQUEST_SIZE;
4015 } else {
4016 io_size = (u_int32_t)cur_resid;
4017 }
4018
4019 retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg);
4020 break;
4021
4022 case IO_DIRECT:
4023 retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg);
4024 break;
4025
4026 case IO_CONTIG:
4027 retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags);
4028 break;
4029
4030 case IO_UNKNOWN:
4031 retval = cluster_io_type(uio, &read_type, &read_length, 0);
4032 break;
4033 }
4034 }
4035 return retval;
4036}
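/*
 * Illustrative sketch (not compiled): the typical way a filesystem's read vnop
 * hands a read off to cluster_read().  The wrapper function, its parameters,
 * and the use of ubc_getsize() to obtain the current file size are assumptions
 * made for this example only; the cluster_read() signature is the one defined
 * above.
 */
#if 0
static int
example_fs_read(vnode_t vp, struct uio *uio, int ioflag)
{
	off_t filesize = ubc_getsize(vp);   /* assumed: the FS tracks its EOF via the UBC */

	/* copies through the UBC, or takes the direct/contig path as appropriate */
	return cluster_read(vp, uio, filesize, ioflag);
}
#endif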
4037
4038
4039
4040static void
4041cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference)
4042{
4043 int range;
4044 int abort_flags = UPL_ABORT_FREE_ON_EMPTY;
4045
4046 if ((range = last_pg - start_pg)) {
4047 if (take_reference) {
4048 abort_flags |= UPL_ABORT_REFERENCE;
4049 }
4050
4051 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags);
4052 }
4053}
4054
4055
4056static int
4057cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
4058{
4059 upl_page_info_t *pl;
4060 upl_t upl = NULL;
4061 vm_offset_t upl_offset;
4062 u_int32_t upl_size;
4063 off_t upl_f_offset;
4064 int start_offset;
4065 int start_pg;
4066 int last_pg;
4067 int uio_last = 0;
4068 int pages_in_upl;
4069 off_t max_size;
4070 off_t last_ioread_offset;
4071 off_t last_request_offset;
4072 kern_return_t kret;
4073 int error = 0;
4074 int retval = 0;
4075 u_int32_t size_of_prefetch;
4076 u_int32_t xsize;
4077 u_int32_t io_size;
4078 u_int32_t max_rd_size;
4079 u_int32_t max_io_size;
4080 u_int32_t max_prefetch;
4081 u_int rd_ahead_enabled = 1;
4082 u_int prefetch_enabled = 1;
4083 struct cl_readahead * rap;
4084 struct clios iostate;
4085 struct cl_extent extent;
4086 int bflag;
4087 int take_reference = 1;
4088 int policy = IOPOL_DEFAULT;
4089 boolean_t iolock_inited = FALSE;
4090
4091 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
4092 (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0);
4093
4094 if (flags & IO_ENCRYPTED) {
4095 panic("encrypted blocks will hit UBC!");
4096 }
4097
4098 policy = throttle_get_io_policy(NULL);
4099
4100 if (policy == THROTTLE_LEVEL_TIER3 || policy == THROTTLE_LEVEL_TIER2 || (flags & IO_NOCACHE)) {
4101 take_reference = 0;
4102 }
4103
4104 if (flags & IO_PASSIVE) {
4105 bflag = CL_PASSIVE;
4106 } else {
4107 bflag = 0;
4108 }
4109
4110 if (flags & IO_NOCACHE) {
4111 bflag |= CL_NOCACHE;
4112 }
4113
4114 if (flags & IO_SKIP_ENCRYPTION) {
4115 bflag |= CL_ENCRYPTED;
4116 }
4117
4118 max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
4119 max_prefetch = cluster_max_prefetch(vp, max_io_size, prefetch_max);
4120 max_rd_size = max_prefetch;
4121
4122 last_request_offset = uio->uio_offset + io_req_size;
4123
4124 if (last_request_offset > filesize) {
4125 last_request_offset = filesize;
4126 }
4127
4128 if ((flags & (IO_RAOFF | IO_NOCACHE)) || ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
4129 rd_ahead_enabled = 0;
4130 rap = NULL;
4131 } else {
4132 if (cluster_is_throttled(vp)) {
4133 /*
4134 * we're in the throttle window, at the very least
4135 * we want to limit the size of the I/O we're about
4136 * to issue
4137 */
4138 rd_ahead_enabled = 0;
4139 prefetch_enabled = 0;
4140
4141 max_rd_size = calculate_max_throttle_size(vp);
4142 }
4143 if ((rap = cluster_get_rap(vp)) == NULL) {
4144 rd_ahead_enabled = 0;
4145 } else {
4146 extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
4147 extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
4148 }
4149 }
4150 if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
4151 /*
4152 * determine if we already have a read-ahead in the pipe courtesy of the
4153 * last read system call that was issued...
4154 * if so, pick up its extent to determine where we should start
4155 * with respect to any read-ahead that might be necessary to
4156 * garner all the data needed to complete this read systemcall
4157 */
4158 last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
4159
4160 if (last_ioread_offset < uio->uio_offset) {
4161 last_ioread_offset = (off_t)0;
4162 } else if (last_ioread_offset > last_request_offset) {
4163 last_ioread_offset = last_request_offset;
4164 }
4165 } else {
4166 last_ioread_offset = (off_t)0;
4167 }
4168
4169 while (io_req_size && uio->uio_offset < filesize && retval == 0) {
4170 max_size = filesize - uio->uio_offset;
4171 bool leftover_upl_aborted = false;
4172
4173 if ((off_t)(io_req_size) < max_size) {
4174 io_size = io_req_size;
4175 } else {
4176 io_size = (u_int32_t)max_size;
4177 }
4178
4179 if (!(flags & IO_NOCACHE)) {
4180 while (io_size) {
4181 u_int32_t io_resid;
4182 u_int32_t io_requested;
4183
4184 /*
4185 * if we keep finding the pages we need already in the cache, then
4186 * don't bother to call cluster_read_prefetch since it costs CPU cycles
4187 * to determine that we have all the pages we need... once we miss in
4188 * the cache and have issued an I/O, then we'll assume that we're likely
4189 * to continue to miss in the cache and it's to our advantage to try and prefetch
4190 */
4191 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (u_int32_t)(last_request_offset - last_ioread_offset))) {
4192 if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
4193 /*
4194 * we've already issued I/O for this request and
4195 * there's still work to do and
4196 * our prefetch stream is running dry, so issue a
4197 * pre-fetch I/O... the I/O latency will overlap
4198 * with the copying of the data
4199 */
4200 if (size_of_prefetch > max_rd_size) {
4201 size_of_prefetch = max_rd_size;
4202 }
4203
4204 size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
4205
4206 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
4207
4208 if (last_ioread_offset > last_request_offset) {
4209 last_ioread_offset = last_request_offset;
4210 }
4211 }
4212 }
4213 /*
4214 * limit the size of the copy we're about to do so that
4215 * we can notice that our I/O pipe is running dry and
4216 * get the next I/O issued before it does go dry
4217 */
4218 if (last_ioread_offset && io_size > (max_io_size / 4)) {
4219 io_resid = (max_io_size / 4);
4220 } else {
4221 io_resid = io_size;
4222 }
4223
4224 io_requested = io_resid;
4225
4226 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference);
4227
4228 xsize = io_requested - io_resid;
4229
4230 io_size -= xsize;
4231 io_req_size -= xsize;
4232
4233 if (retval || io_resid) {
4234 /*
4235 * if we run into a real error or
4236 * a page that is not in the cache
4237 * we need to leave streaming mode
4238 */
4239 break;
4240 }
4241
4242 if (rd_ahead_enabled && (io_size == 0 || last_ioread_offset == last_request_offset)) {
4243 /*
4244 * we've already finished the I/O for this read request
4245 * let's see if we should do a read-ahead
4246 */
4247 cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
4248 }
4249 }
4250 if (retval) {
4251 break;
4252 }
4253 if (io_size == 0) {
4254 if (rap != NULL) {
4255 if (extent.e_addr < rap->cl_lastr) {
4256 rap->cl_maxra = 0;
4257 }
4258 rap->cl_lastr = extent.e_addr;
4259 }
4260 break;
4261 }
4262 /*
4263 * recompute max_size since cluster_copy_ubc_data_internal
4264 * may have advanced uio->uio_offset
4265 */
4266 max_size = filesize - uio->uio_offset;
4267 }
4268
4269 iostate.io_completed = 0;
4270 iostate.io_issued = 0;
4271 iostate.io_error = 0;
4272 iostate.io_wanted = 0;
4273
4274 if ((flags & IO_RETURN_ON_THROTTLE)) {
4275 if (cluster_is_throttled(vp) == THROTTLE_NOW) {
4276 if (!cluster_io_present_in_BC(vp, uio->uio_offset)) {
4277 /*
4278 * we're in the throttle window and at least 1 I/O
4279 * has already been issued by a throttleable thread
4280 * in this window, so return with EAGAIN to indicate
4281 * to the FS issuing the cluster_read call that it
4282 * should now throttle after dropping any locks
4283 */
4284 throttle_info_update_by_mount(vp->v_mount);
4285
4286 retval = EAGAIN;
4287 break;
4288 }
4289 }
4290 }
4291
4292 /*
4293 * compute the size of the upl needed to encompass
4294 * the requested read... limit each call to cluster_io
4295 * to the maximum UPL size... cluster_io will clip if
4296 * this exceeds the maximum io_size for the device...
4297 * also make sure to account for
4298 * a starting offset that's not page aligned
4299 */
4300 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
4301 upl_f_offset = uio->uio_offset - (off_t)start_offset;
4302
4303 if (io_size > max_rd_size) {
4304 io_size = max_rd_size;
4305 }
4306
4307 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
4308
4309 if (flags & IO_NOCACHE) {
4310 if (upl_size > max_io_size) {
4311 upl_size = max_io_size;
4312 }
4313 } else {
4314 if (upl_size > max_io_size / 4) {
4315 upl_size = max_io_size / 4;
4316 upl_size &= ~PAGE_MASK;
4317
4318 if (upl_size == 0) {
4319 upl_size = PAGE_SIZE;
4320 }
4321 }
4322 }
4323 pages_in_upl = upl_size / PAGE_SIZE;
4324
4325 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
4326 upl, (int)upl_f_offset, upl_size, start_offset, 0);
4327
4328 kret = ubc_create_upl_kernel(vp,
4329 upl_f_offset,
4330 upl_size,
4331 &upl,
4332 &pl,
4333 UPL_FILE_IO | UPL_SET_LITE,
4334 VM_KERN_MEMORY_FILE);
4335 if (kret != KERN_SUCCESS) {
4336 panic("cluster_read_copy: failed to get pagelist");
4337 }
4338
4339 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
4340 upl, (int)upl_f_offset, upl_size, start_offset, 0);
4341
4342 /*
4343 * scan from the beginning of the upl looking for the first
4344 * non-valid page.... this will become the first page in
4345 * the request we're going to make to 'cluster_io'... if all
4346 * of the pages are valid, we won't call through to 'cluster_io'
4347 */
4348 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
4349 if (!upl_valid_page(pl, start_pg)) {
4350 break;
4351 }
4352 }
4353
4354 /*
4355 * scan from the starting invalid page looking for a valid
4356 * page before the end of the upl is reached, if we
4357 * find one, then it will be the last page of the request to
4358 * 'cluster_io'
4359 */
4360 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
4361 if (upl_valid_page(pl, last_pg)) {
4362 break;
4363 }
4364 }
4365
4366 if (start_pg < last_pg) {
4367 /*
4368 * we found a range of 'invalid' pages that must be filled
4369 * if the last page in this range is the last page of the file
4370 * we may have to clip the size of it to keep from reading past
4371 * the end of the last physical block associated with the file
4372 */
4373 if (iolock_inited == FALSE) {
4374 lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
4375
4376 iolock_inited = TRUE;
4377 }
4378 upl_offset = start_pg * PAGE_SIZE;
4379 io_size = (last_pg - start_pg) * PAGE_SIZE;
4380
4381 if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) {
4382 io_size = (u_int32_t)(filesize - (upl_f_offset + upl_offset));
4383 }
4384
4385 /*
4386 * Find out if this needs verification, we'll have to manage the UPL
4387 * differently if so. Note that this call only lets us know if
4388 * verification is enabled on this mount point, the actual verification
4389 * is performed in the File system.
4390 */
4391 size_t verify_block_size = 0;
4392 if ((VNOP_VERIFY(vp, start_offset, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL) == 0) /* && verify_block_size */) {
4393 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
4394 if (!upl_valid_page(pl, uio_last)) {
4395 break;
4396 }
4397 }
4398 if (uio_last < pages_in_upl) {
4399 /*
4400 * there were some invalid pages beyond the valid pages
4401 * that we didn't issue an I/O for, just release them
4402 * unchanged now, so that any prefetch/readahead can
4403 * include them
4404 */
4405 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
4406 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
4407 leftover_upl_aborted = true;
4408 }
4409 }
4410
4411 /*
4412 * issue an asynchronous read to cluster_io
4413 */
4414
4415 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
4416 io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg);
4417
4418 if (rap) {
4419 if (extent.e_addr < rap->cl_maxra) {
4420 /*
4421 * we've just issued a read for a block that should have been
4422 * in the cache courtesy of the read-ahead engine... something
4423 * has gone wrong with the pipeline, so reset the read-ahead
4424 * logic which will cause us to restart from scratch
4425 */
4426 rap->cl_maxra = 0;
4427 }
4428 }
4429 }
4430 if (error == 0) {
4431 /*
4432 * if the read completed successfully, or there was no I/O request
4433 * issued, then copy the data into user land via 'cluster_copy_upl_data'
4434 * we'll first add on any 'valid'
4435 * pages that were present in the upl when we acquired it.
4436 */
4437 u_int val_size;
4438
4439 if (!leftover_upl_aborted) {
4440 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
4441 if (!upl_valid_page(pl, uio_last)) {
4442 break;
4443 }
4444 }
4445 if (uio_last < pages_in_upl) {
4446 /*
4447 * there were some invalid pages beyond the valid pages
4448 * that we didn't issue an I/O for, just release them
4449 * unchanged now, so that any prefetch/readahead can
4450 * include them
4451 */
4452 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
4453 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
4454 }
4455 }
4456
4457 /*
4458 * compute size to transfer this round, if io_req_size is
4459 * still non-zero after this attempt, we'll loop around and
4460 * set up for another I/O.
4461 */
4462 val_size = (uio_last * PAGE_SIZE) - start_offset;
4463
4464 if (val_size > max_size) {
4465 val_size = (u_int)max_size;
4466 }
4467
4468 if (val_size > io_req_size) {
4469 val_size = io_req_size;
4470 }
4471
4472 if ((uio->uio_offset + val_size) > last_ioread_offset) {
4473 last_ioread_offset = uio->uio_offset + val_size;
4474 }
4475
4476 if ((size_of_prefetch = (u_int32_t)(last_request_offset - last_ioread_offset)) && prefetch_enabled) {
4477 if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) {
4478 /*
4479 * if there's still I/O left to do for this request, and...
4480 * we're not in hard throttle mode, and...
4481 * we're close to using up the previous prefetch, then issue a
4482 * new pre-fetch I/O... the I/O latency will overlap
4483 * with the copying of the data
4484 */
4485 if (size_of_prefetch > max_rd_size) {
4486 size_of_prefetch = max_rd_size;
4487 }
4488
4489 size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
4490
4491 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
4492
4493 if (last_ioread_offset > last_request_offset) {
4494 last_ioread_offset = last_request_offset;
4495 }
4496 }
4497 } else if ((uio->uio_offset + val_size) == last_request_offset) {
4498 /*
4499 * this transfer will finish this request, so...
4500 * let's try to read ahead if we're in
4501 * a sequential access pattern and we haven't
4502 * explicitly disabled it
4503 */
4504 if (rd_ahead_enabled) {
4505 cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
4506 }
4507
4508 if (rap != NULL) {
4509 if (extent.e_addr < rap->cl_lastr) {
4510 rap->cl_maxra = 0;
4511 }
4512 rap->cl_lastr = extent.e_addr;
4513 }
4514 }
4515 if (iolock_inited == TRUE) {
4516 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4517 }
4518
4519 if (iostate.io_error) {
4520 error = iostate.io_error;
4521 } else {
4522 u_int32_t io_requested;
4523
4524 io_requested = val_size;
4525
4526 retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested);
4527
4528 io_req_size -= (val_size - io_requested);
4529 }
4530 } else {
4531 if (iolock_inited == TRUE) {
4532 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4533 }
4534 }
4535 if (start_pg < last_pg) {
4536 /*
4537 * compute the range of pages that we actually issued an I/O for
4538 * and either commit them as valid if the I/O succeeded
4539 * or abort them if the I/O failed or we're not supposed to
4540 * keep them in the cache
4541 */
4542 io_size = (last_pg - start_pg) * PAGE_SIZE;
4543
4544 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, upl, start_pg * PAGE_SIZE, io_size, error, 0);
4545
4546 if (error || (flags & IO_NOCACHE)) {
4547 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
4548 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
4549 } else {
4550 int commit_flags = UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY;
4551
4552 if (take_reference) {
4553 commit_flags |= UPL_COMMIT_INACTIVATE;
4554 } else {
4555 commit_flags |= UPL_COMMIT_SPECULATE;
4556 }
4557
4558 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, commit_flags);
4559 }
4560 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, start_pg * PAGE_SIZE, io_size, error, 0);
4561 }
4562 if ((last_pg - start_pg) < pages_in_upl) {
4563 /*
4564 * the set of pages that we issued an I/O for did not encompass
4565 * the entire upl... so just release these without modifying
4566 * their state
4567 */
4568 if (error) {
4569 if (leftover_upl_aborted) {
4570 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, (uio_last - start_pg) * PAGE_SIZE,
4571 UPL_ABORT_FREE_ON_EMPTY);
4572 } else {
4573 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
4574 }
4575 } else {
4576 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
4577 upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
4578
4579 /*
4580 * handle any valid pages at the beginning of
4581 * the upl... release these appropriately
4582 */
4583 cluster_read_upl_release(upl, 0, start_pg, take_reference);
4584
4585 /*
4586 * handle any valid pages immediately after the
4587 * pages we issued I/O for... release these appropriately
4588 */
4589 cluster_read_upl_release(upl, last_pg, uio_last, take_reference);
4590
4591 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, -1, -1, 0, 0);
4592 }
4593 }
4594 if (retval == 0) {
4595 retval = error;
4596 }
4597
4598 if (io_req_size) {
4599 uint32_t max_throttle_size = calculate_max_throttle_size(vp);
4600
4601 if (cluster_is_throttled(vp)) {
4602 /*
4603 * we're in the throttle window, at the very least
4604 * we want to limit the size of the I/O we're about
4605 * to issue
4606 */
4607 rd_ahead_enabled = 0;
4608 prefetch_enabled = 0;
4609 max_rd_size = max_throttle_size;
4610 } else {
4611 if (max_rd_size == max_throttle_size) {
4612 /*
4613 * coming out of throttled state
4614 */
4615 if (policy != THROTTLE_LEVEL_TIER3 && policy != THROTTLE_LEVEL_TIER2) {
4616 if (rap != NULL) {
4617 rd_ahead_enabled = 1;
4618 }
4619 prefetch_enabled = 1;
4620 }
4621 max_rd_size = max_prefetch;
4622 last_ioread_offset = 0;
4623 }
4624 }
4625 }
4626 }
4627 if (iolock_inited == TRUE) {
4628 /*
4629 * cluster_io returned an error after it
4630 * had already issued some I/O. we need
4631 * to wait for that I/O to complete before
4632 * we can destroy the iostate mutex...
4633 * 'retval' already contains the early error
4634 * so no need to pick it up from iostate.io_error
4635 */
4636 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4637
4638 lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
4639 }
4640 if (rap != NULL) {
4641 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
4642 (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0);
4643
4644 lck_mtx_unlock(&rap->cl_lockr);
4645 } else {
4646 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
4647 (int)uio->uio_offset, io_req_size, 0, retval, 0);
4648 }
4649
4650 return retval;
4651}
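/*
 * Illustrative sketch (not compiled): the valid-page scan cluster_read_copy
 * performs on a freshly created UPL.  start_pg is the first page that is not
 * already valid in the cache and last_pg is the next valid page after it, so
 * [start_pg, last_pg) is the run of pages that actually needs a cluster_io()
 * read.  The helper name is hypothetical.
 */
#if 0
static void
example_find_invalid_run(upl_page_info_t *pl, int pages_in_upl, int *start_pg, int *last_pg)
{
	int pg;

	/* first non-valid page becomes the start of the I/O */
	for (pg = 0; pg < pages_in_upl; pg++) {
		if (!upl_valid_page(pl, pg)) {
			break;
		}
	}
	*start_pg = pg;

	/* the run ends at the next valid page (or the end of the UPL) */
	for (; pg < pages_in_upl; pg++) {
		if (upl_valid_page(pl, pg)) {
			break;
		}
	}
	*last_pg = pg;
}
#endif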
4652
4653/*
4654 * We don't want another read/write lock for every vnode in the system
4655 * so we keep a hash of them here. There should never be very many of
4656 * these around at any point in time.
4657 */
4658cl_direct_read_lock_t *
4659cluster_lock_direct_read(vnode_t vp, lck_rw_type_t type)
4660{
4661 struct cl_direct_read_locks *head
4662 = &cl_direct_read_locks[(uintptr_t)vp / sizeof(*vp)
4663 % CL_DIRECT_READ_LOCK_BUCKETS];
4664
4665 struct cl_direct_read_lock *lck, *new_lck = NULL;
4666
4667 for (;;) {
4668 lck_spin_lock(&cl_direct_read_spin_lock);
4669
4670 LIST_FOREACH(lck, head, chain) {
4671 if (lck->vp == vp) {
4672 ++lck->ref_count;
4673 lck_spin_unlock(&cl_direct_read_spin_lock);
4674 if (new_lck) {
4675 // Someone beat us to it, ditch the allocation
4676 lck_rw_destroy(&new_lck->rw_lock, &cl_mtx_grp);
4677 kfree_type(cl_direct_read_lock_t, new_lck);
4678 }
4679 lck_rw_lock(&lck->rw_lock, type);
4680 return lck;
4681 }
4682 }
4683
4684 if (new_lck) {
4685 // Use the lock we allocated
4686 LIST_INSERT_HEAD(head, new_lck, chain);
4687 lck_spin_unlock(&cl_direct_read_spin_lock);
4688 lck_rw_lock(&new_lck->rw_lock, type);
4689 return new_lck;
4690 }
4691
4692 lck_spin_unlock(&cl_direct_read_spin_lock);
4693
4694 // Allocate a new lock
4695 new_lck = kalloc_type(cl_direct_read_lock_t, Z_WAITOK);
4696 lck_rw_init(&new_lck->rw_lock, &cl_mtx_grp, LCK_ATTR_NULL);
4697 new_lck->vp = vp;
4698 new_lck->ref_count = 1;
4699
4700 // Got to go round again
4701 }
4702}
4703
4704void
4705cluster_unlock_direct_read(cl_direct_read_lock_t *lck)
4706{
4707 lck_rw_done(&lck->rw_lock);
4708
4709 lck_spin_lock(&cl_direct_read_spin_lock);
4710 if (lck->ref_count == 1) {
4711 LIST_REMOVE(lck, chain);
4712 lck_spin_unlock(&cl_direct_read_spin_lock);
4713 lck_rw_destroy(&lck->rw_lock, &cl_mtx_grp);
4714 kfree_type(cl_direct_read_lock_t, lck);
4715 } else {
4716 --lck->ref_count;
4717 lck_spin_unlock(&cl_direct_read_spin_lock);
4718 }
4719}
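/*
 * Illustrative sketch (not compiled): how the direct-read path below uses the
 * per-vnode hashed read/write lock.  A shared hold is taken between checking
 * the UBC and issuing the I/O and is dropped as soon as the request has been
 * handed to cluster_io(); a caller that needed the pages to stay stable until
 * the I/O is issued would take the lock exclusively instead.
 */
#if 0
static void
example_direct_read_locking(vnode_t vp)
{
	cl_direct_read_lock_t *lock;

	lock = cluster_lock_direct_read(vp, LCK_RW_TYPE_SHARED);
	/* ... check ubc_range_op() and issue cluster_io() here ... */
	cluster_unlock_direct_read(lock);
}
#endif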
4720
4721static int
4722cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
4723 int flags, int (*callback)(buf_t, void *), void *callback_arg)
4724{
4725 upl_t upl = NULL;
4726 upl_page_info_t *pl;
4727 off_t max_io_size;
4728 vm_offset_t upl_offset, vector_upl_offset = 0;
4729 upl_size_t upl_size = 0, vector_upl_size = 0;
4730 vm_size_t upl_needed_size;
4731 unsigned int pages_in_pl;
4732 upl_control_flags_t upl_flags;
4733 kern_return_t kret = KERN_SUCCESS;
4734 unsigned int i;
4735 int force_data_sync;
4736 int retval = 0;
4737 int no_zero_fill = 0;
4738 int io_flag = 0;
4739 int misaligned = 0;
4740 struct clios iostate;
4741 user_addr_t iov_base;
4742 u_int32_t io_req_size;
4743 u_int32_t offset_in_file;
4744 u_int32_t offset_in_iovbase;
4745 u_int32_t io_size;
4746 u_int32_t io_min;
4747 u_int32_t xsize;
4748 u_int32_t devblocksize;
4749 u_int32_t mem_alignment_mask;
4750 u_int32_t max_upl_size;
4751 u_int32_t max_rd_size;
4752 u_int32_t max_rd_ahead;
4753 u_int32_t max_vector_size;
4754 boolean_t io_throttled = FALSE;
4755
4756 u_int32_t vector_upl_iosize = 0;
4757 int issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
4758 off_t v_upl_uio_offset = 0;
4759 int vector_upl_index = 0;
4760 upl_t vector_upl = NULL;
4761 cl_direct_read_lock_t *lock = NULL;
4762
4763 assert(vm_map_page_shift(current_map()) >= PAGE_SHIFT);
4764
4765 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
4766 (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
4767
4768 max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ);
4769
4770 max_rd_size = max_upl_size;
4771
4772 if (__improbable(os_mul_overflow(max_rd_size, IO_SCALE(vp, 2),
4773 &max_rd_ahead) || (max_rd_ahead > overlapping_read_max))) {
4774 max_rd_ahead = overlapping_read_max;
4775 }
4776
4777 io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO;
4778
4779 if (flags & IO_PASSIVE) {
4780 io_flag |= CL_PASSIVE;
4781 }
4782
4783 if (flags & IO_ENCRYPTED) {
4784 io_flag |= CL_RAW_ENCRYPTED;
4785 }
4786
4787 if (flags & IO_NOCACHE) {
4788 io_flag |= CL_NOCACHE;
4789 }
4790
4791 if (flags & IO_SKIP_ENCRYPTION) {
4792 io_flag |= CL_ENCRYPTED;
4793 }
4794
4795 iostate.io_completed = 0;
4796 iostate.io_issued = 0;
4797 iostate.io_error = 0;
4798 iostate.io_wanted = 0;
4799
4800 lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
4801
4802 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
4803 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
4804
4805 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
4806 (int)devblocksize, (int)mem_alignment_mask, 0, 0, 0);
4807
4808 if (devblocksize == 1) {
4809 /*
4810 * the AFP client advertises a devblocksize of 1
4811 * however, its BLOCKMAP routine maps to physical
4812 * blocks that are PAGE_SIZE in size...
4813 * therefore we can't ask for I/Os that aren't page aligned
4814 * or aren't multiples of PAGE_SIZE in size...
4815 * by setting devblocksize to PAGE_SIZE, we re-instate
4816 * the old behavior we had before the mem_alignment_mask
4817 * changes went in...
4818 */
4819 devblocksize = PAGE_SIZE;
4820 }
4821
4822 /*
4823 * We are going to need this uio for the prefaulting later
4824 * especially for the cases where multiple non-contiguous
4825 * iovs are passed into this routine.
4826 */
4827 uio_t uio_acct = uio_duplicate(uio);
4828
4829next_dread:
4830 io_req_size = *read_length;
4831 iov_base = uio_curriovbase(uio);
4832
4833 offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1);
4834 offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
4835
4836 if (vm_map_page_mask(current_map()) < PAGE_MASK) {
4837 /*
4838 * XXX TODO4K
4839 * Direct I/O might not work as expected from a 16k kernel space
4840 * to a 4k user space because each 4k chunk might point to
4841 * a different 16k physical page...
4842 * Let's go the "misaligned" way.
4843 */
4844 if (!misaligned) {
4845 DEBUG4K_VFS("forcing misaligned\n");
4846 }
4847 misaligned = 1;
4848 }
4849
4850 if (offset_in_file || offset_in_iovbase) {
4851 /*
4852 * one of the 2 important offsets is misaligned
4853 * so fire an I/O through the cache for this entire vector
4854 */
4855 misaligned = 1;
4856 }
4857 if (iov_base & (devblocksize - 1)) {
4858 /*
4859 * the offset in memory must be on a device block boundary
4860 * so that we can guarantee that we can generate an
4861 * I/O that ends on a page boundary in cluster_io
4862 */
4863 misaligned = 1;
4864 }
4865
4866 max_io_size = filesize - uio->uio_offset;
4867
4868 /*
4869 * The user must request IO in aligned chunks. If the
4870 * offset into the file is bad, or the userland pointer
4871 * is non-aligned, then we cannot service the encrypted IO request.
4872 */
4873 if (flags & IO_ENCRYPTED) {
4874 if (misaligned || (io_req_size & (devblocksize - 1))) {
4875 retval = EINVAL;
4876 }
4877
4878 max_io_size = roundup(max_io_size, devblocksize);
4879 }
4880
4881 if ((off_t)io_req_size > max_io_size) {
4882 io_req_size = (u_int32_t)max_io_size;
4883 }
4884
4885 /*
4886 * When we get to this point, we know...
4887 * -- the offset into the file is on a devblocksize boundary
4888 */
4889
4890 while (io_req_size && retval == 0) {
4891 u_int32_t io_start;
4892
4893 if (cluster_is_throttled(vp)) {
4894 uint32_t max_throttle_size = calculate_max_throttle_size(vp);
4895
4896 /*
4897 * we're in the throttle window, at the very least
4898 * we want to limit the size of the I/O we're about
4899 * to issue
4900 */
4901 max_rd_size = max_throttle_size;
4902 max_rd_ahead = max_throttle_size - 1;
4903 max_vector_size = max_throttle_size;
4904 } else {
4905 max_rd_size = max_upl_size;
4906 max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
4907 max_vector_size = MAX_VECTOR_UPL_SIZE;
4908 }
4909 io_start = io_size = io_req_size;
4910
4911 /*
4912 * First look for pages already in the cache
4913 * and move them to user space. But only do this
4914 * check if we are not retrieving encrypted data directly
4915 * from the filesystem; those blocks should never
4916 * be in the UBC.
4917 *
4918 * cluster_copy_ubc_data returns the resid
4919 * in io_size
4920 */
4921 if ((flags & IO_ENCRYPTED) == 0) {
4922 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0);
4923 }
4924 /*
4925 * calculate the number of bytes actually copied
4926 * starting size - residual
4927 */
4928 xsize = io_start - io_size;
4929
4930 io_req_size -= xsize;
4931
4932 if (useVectorUPL && (xsize || (iov_base & PAGE_MASK))) {
4933 /*
4934 * We found something in the cache or we have an iov_base that's not
4935 * page-aligned.
4936 *
4937 * Issue all I/O's that have been collected within this Vectored UPL.
4938 */
4939 if (vector_upl_index) {
4940 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4941 reset_vector_run_state();
4942 }
4943
4944 if (xsize) {
4945 useVectorUPL = 0;
4946 }
4947
4948 /*
4949 * After this point, if we are using the Vector UPL path and the base is
4950 * not page-aligned then the UPL with that base will be the first in the vector UPL.
4951 */
4952 }
4953
4954 /*
4955 * check to see if we are finished with this request.
4956 *
4957 * If we satisfied this IO already, then io_req_size will be 0.
4958 * Otherwise, see if the IO was mis-aligned and needs to go through
4959 * the UBC to deal with the 'tail'.
4960 *
4961 */
4962 if (io_req_size == 0 || (misaligned)) {
4963 /*
4964 * see if there's another uio vector to
4965 * process that's of type IO_DIRECT
4966 *
4967 * break out of while loop to get there
4968 */
4969 break;
4970 }
4971 /*
4972 * assume the request ends on a device block boundary
4973 */
4974 io_min = devblocksize;
4975
4976 /*
4977 * we can handle I/O's in multiples of the device block size
4978 * however, if io_size isn't a multiple of devblocksize we
4979 * want to clip it back to the nearest page boundary since
4980 * we are going to have to go through cluster_read_copy to
4981 * deal with the 'overhang'... by clipping it to a PAGE_SIZE
4982 * multiple, we avoid asking the drive for the same physical
4983 * blocks twice.. once for the partial page at the end of the
4984 * request and a 2nd time for the page we read into the cache
4985 * (which overlaps the end of the direct read) in order to
4986 * get at the overhang bytes
4987 */
4988 if (io_size & (devblocksize - 1)) {
4989 assert(!(flags & IO_ENCRYPTED));
4990 /*
4991 * Clip the request to the previous page size boundary
4992 * since request does NOT end on a device block boundary
4993 */
4994 io_size &= ~PAGE_MASK;
4995 io_min = PAGE_SIZE;
4996 }
4997 if (retval || io_size < io_min) {
4998 /*
4999 * either an error or we only have the tail left to
5000 * complete via the copy path...
5001 * we may have already spun some portion of this request
5002 * off as async requests... we need to wait for the I/O
5003 * to complete before returning
5004 */
5005 goto wait_for_dreads;
5006 }
5007
5008 /*
5009 * Don't re-check the UBC data if we are looking for uncached IO
5010 * or asking for encrypted blocks.
5011 */
5012 if ((flags & IO_ENCRYPTED) == 0) {
5013 if ((xsize = io_size) > max_rd_size) {
5014 xsize = max_rd_size;
5015 }
5016
5017 io_size = 0;
5018
5019 if (!lock) {
5020 /*
5021 * We hold a lock here between the time we check the
5022 * cache and the time we issue I/O. This saves us
5023 * from having to lock the pages in the cache. Not
5024 * all clients will care about this lock but some
5025 * clients may want to guarantee stability between
5026 * here and when the I/O is issued in which case they
5027 * will take the lock exclusively.
5028 */
5029 lock = cluster_lock_direct_read(vp, LCK_RW_TYPE_SHARED);
5030 }
5031
5032 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size);
5033
5034 if (io_size == 0) {
5035 /*
5036 * a page must have just come into the cache
5037 * since the first page in this range is no
5038 * longer absent, go back and re-evaluate
5039 */
5040 continue;
5041 }
5042 }
5043 if ((flags & IO_RETURN_ON_THROTTLE)) {
5044 if (cluster_is_throttled(vp) == THROTTLE_NOW) {
5045 if (!cluster_io_present_in_BC(vp, uio->uio_offset)) {
5046 /*
5047 * we're in the throttle window and at least 1 I/O
5048 * has already been issued by a throttleable thread
5049 * in this window, so return with EAGAIN to indicate
5050 * to the FS issuing the cluster_read call that it
5051 * should now throttle after dropping any locks
5052 */
5053 throttle_info_update_by_mount(vp->v_mount);
5054
5055 io_throttled = TRUE;
5056 goto wait_for_dreads;
5057 }
5058 }
5059 }
5060 if (io_size > max_rd_size) {
5061 io_size = max_rd_size;
5062 }
5063
5064 iov_base = uio_curriovbase(uio);
5065
5066 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
5067 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
5068
5069 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
5070 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
5071
5072 if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) {
5073 no_zero_fill = 1;
5074 } else {
5075 no_zero_fill = 0;
5076 }
5077
5078 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
5079 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
5080 pages_in_pl = 0;
5081 upl_size = (upl_size_t)upl_needed_size;
5082 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
5083 if (no_zero_fill) {
5084 upl_flags |= UPL_NOZEROFILL;
5085 }
5086 if (force_data_sync) {
5087 upl_flags |= UPL_FORCE_DATA_SYNC;
5088 }
5089
5090 kret = vm_map_create_upl(map,
5091 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
5092 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE);
5093
5094 if (kret != KERN_SUCCESS) {
5095 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
5096 (int)upl_offset, upl_size, io_size, kret, 0);
5097 /*
5098 * failed to get pagelist
5099 *
5100 * we may have already spun some portion of this request
5101 * off as async requests... we need to wait for the I/O
5102 * to complete before returning
5103 */
5104 goto wait_for_dreads;
5105 }
5106 pages_in_pl = upl_size / PAGE_SIZE;
5107 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
5108
5109 for (i = 0; i < pages_in_pl; i++) {
5110 if (!upl_page_present(pl, i)) {
5111 break;
5112 }
5113 }
5114 if (i == pages_in_pl) {
5115 break;
5116 }
5117
5118 ubc_upl_abort(upl, 0);
5119 }
5120 if (force_data_sync >= 3) {
5121 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
5122 (int)upl_offset, upl_size, io_size, kret, 0);
5123
5124 goto wait_for_dreads;
5125 }
5126 /*
5127 * Consider the possibility that upl_size wasn't satisfied.
5128 */
5129 if (upl_size < upl_needed_size) {
5130 if (upl_size && upl_offset == 0) {
5131 io_size = upl_size;
5132 } else {
5133 io_size = 0;
5134 }
5135 }
5136 if (io_size == 0) {
5137 ubc_upl_abort(upl, 0);
5138 goto wait_for_dreads;
5139 }
5140 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
5141 (int)upl_offset, upl_size, io_size, kret, 0);
5142
5143 if (useVectorUPL) {
5144 vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
5145 if (end_off) {
5146 issueVectorUPL = 1;
5147 }
5148 /*
5149 * After this point, if we are using a vector UPL, then
5150 * either all the UPL elements end on a page boundary OR
5151 * this UPL is the last element because it does not end
5152 * on a page boundary.
5153 */
5154 }
5155
5156 /*
5157 * request asynchronously so that we can overlap
5158 * the preparation of the next I/O
5159 * if there are already too many outstanding reads
5160 * wait until some have completed before issuing the next read
5161 */
5162 cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct");
5163
5164 if (iostate.io_error) {
5165 /*
5166 * one of the earlier reads we issued ran into a hard error
5167 * don't issue any more reads, cleanup the UPL
5168 * that was just created but not used, then
5169 * go wait for any other reads to complete before
5170 * returning the error to the caller
5171 */
5172 ubc_upl_abort(upl, 0);
5173
5174 goto wait_for_dreads;
5175 }
5176 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
5177 upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
5178
5179 if (!useVectorUPL) {
5180 if (no_zero_fill) {
5181 io_flag &= ~CL_PRESERVE;
5182 } else {
5183 io_flag |= CL_PRESERVE;
5184 }
5185
5186 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
5187 } else {
5188 if (!vector_upl_index) {
5189 vector_upl = vector_upl_create(upl_offset, uio->uio_iovcnt);
5190 v_upl_uio_offset = uio->uio_offset;
5191 vector_upl_offset = upl_offset;
5192 }
5193
5194 vector_upl_set_subupl(vector_upl, upl, upl_size);
5195 vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
5196 vector_upl_index++;
5197 vector_upl_size += upl_size;
5198 vector_upl_iosize += io_size;
5199
5200 if (issueVectorUPL || vector_upl_index == vector_upl_max_upls(vector_upl) || vector_upl_size >= max_vector_size) {
5201 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
5202 reset_vector_run_state();
5203 }
5204 }
5205
5206 if (lock) {
5207 // We don't need to wait for the I/O to complete
5208 cluster_unlock_direct_read(lock);
5209 lock = NULL;
5210 }
5211
5212 /*
5213 * update the uio structure
5214 */
5215 if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) {
5216 uio_update(uio, (user_size_t)max_io_size);
5217 } else {
5218 uio_update(uio, (user_size_t)io_size);
5219 }
5220
5221 io_req_size -= io_size;
5222
5223 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
5224 upl, (int)uio->uio_offset, io_req_size, retval, 0);
5225 } /* end while */
5226
5227 if (retval == 0 && iostate.io_error == 0 && io_req_size == 0 && uio->uio_offset < filesize) {
5228 retval = cluster_io_type(uio, read_type, read_length, 0);
5229
5230 if (retval == 0 && *read_type == IO_DIRECT) {
5231 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
5232 (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
5233
5234 goto next_dread;
5235 }
5236 }
5237
5238wait_for_dreads:
5239
5240 if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
5241 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
5242 reset_vector_run_state();
5243 }
5244
5245 // We don't need to wait for the I/O to complete
5246 if (lock) {
5247 cluster_unlock_direct_read(lock);
5248 }
5249
5250 /*
5251 * make sure all async reads that are part of this stream
5252 * have completed before we return
5253 */
5254 cluster_iostate_wait(&iostate, 0, "cluster_read_direct");
5255
5256 if (iostate.io_error) {
5257 retval = iostate.io_error;
5258 }
5259
5260 lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
5261
5262 if (io_throttled == TRUE && retval == 0) {
5263 retval = EAGAIN;
5264 }
5265
5266 vm_map_offset_t current_page_size, current_page_mask;
5267 current_page_size = vm_map_page_size(current_map());
5268 current_page_mask = vm_map_page_mask(current_map());
5269 if (uio_acct) {
5270 off_t bytes_to_prefault = 0, bytes_prefaulted = 0;
5271 user_addr_t curr_iov_base = 0;
5272 user_addr_t curr_iov_end = 0;
5273 user_size_t curr_iov_len = 0;
5274
5275 bytes_to_prefault = uio_offset(a_uio: uio) - uio_offset(a_uio: uio_acct);
5276
5277 for (; bytes_prefaulted < bytes_to_prefault;) {
5278 curr_iov_base = uio_curriovbase(a_uio: uio_acct);
5279 curr_iov_len = MIN(uio_curriovlen(uio_acct), bytes_to_prefault - bytes_prefaulted);
5280 curr_iov_end = curr_iov_base + curr_iov_len;
5281
5282 for (; curr_iov_base < curr_iov_end;) {
5283 /*
5284 * This is specifically done for pmap accounting purposes.
5285 * vm_pre_fault() will call vm_fault() to enter the page into
5286 * the pmap if there isn't _a_ physical page for that VA already.
5287 */
5288 vm_pre_fault(vm_map_trunc_page(curr_iov_base, current_page_mask), VM_PROT_READ);
5289 curr_iov_base += current_page_size;
5290 bytes_prefaulted += current_page_size;
5291 }
5292 /*
5293 * Use update instead of advance so we can see how many iovs we processed.
5294 */
5295 uio_update(a_uio: uio_acct, a_count: curr_iov_len);
5296 }
5297 uio_free(a_uio: uio_acct);
5298 uio_acct = NULL;
5299 }
5300
5301 if (io_req_size && retval == 0) {
5302 /*
5303 * we couldn't handle the tail of this request in DIRECT mode
5304 * so fire it through the copy path
5305 */
5306 if (flags & IO_ENCRYPTED) {
5307 /*
5308 * We cannot fall back to the copy path for encrypted I/O. If this
5309 * happens, there is something wrong with the user buffer passed
5310 * down.
5311 */
5312 retval = EFAULT;
5313 } else {
5314 retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg);
5315 }
5316
5317 *read_type = IO_UNKNOWN;
5318 }
5319 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
5320 (int)uio->uio_offset, (int)uio_resid(uio), io_req_size, retval, 0);
5321
5322 return retval;
5323}
5324
5325
5326static int
5327cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
5328 int (*callback)(buf_t, void *), void *callback_arg, int flags)
5329{
5330 upl_page_info_t *pl;
5331 upl_t upl[MAX_VECTS];
5332 vm_offset_t upl_offset;
5333 addr64_t dst_paddr = 0;
5334 user_addr_t iov_base;
5335 off_t max_size;
5336 upl_size_t upl_size;
5337 vm_size_t upl_needed_size;
5338 mach_msg_type_number_t pages_in_pl;
5339 upl_control_flags_t upl_flags;
5340 kern_return_t kret;
5341 struct clios iostate;
5342 int error = 0;
5343 int cur_upl = 0;
5344 int num_upl = 0;
5345 int n;
5346 u_int32_t xsize;
5347 u_int32_t io_size;
5348 u_int32_t devblocksize;
5349 u_int32_t mem_alignment_mask;
5350 u_int32_t tail_size = 0;
5351 int bflag;
5352
5353 if (flags & IO_PASSIVE) {
5354 bflag = CL_PASSIVE;
5355 } else {
5356 bflag = 0;
5357 }
5358
5359 if (flags & IO_NOCACHE) {
5360 bflag |= CL_NOCACHE;
5361 }
5362
5363 /*
5364 * When we enter this routine, we know
5365 * -- the read_length will not exceed the current iov_len
5366 * -- the target address is physically contiguous for read_length
5367 */
5368 cluster_syncup(vp, newEOF: filesize, callback, callback_arg, PUSH_SYNC);
5369
5370 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
5371 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
5372
5373 iostate.io_completed = 0;
5374 iostate.io_issued = 0;
5375 iostate.io_error = 0;
5376 iostate.io_wanted = 0;
5377
5378 lck_mtx_init(lck: &iostate.io_mtxp, grp: &cl_mtx_grp, LCK_ATTR_NULL);
5379
5380next_cread:
5381 io_size = *read_length;
5382
5383 max_size = filesize - uio->uio_offset;
5384
5385 if (io_size > max_size) {
5386 io_size = (u_int32_t)max_size;
5387 }
5388
5389 iov_base = uio_curriovbase(a_uio: uio);
5390
5391 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
5392 upl_needed_size = upl_offset + io_size;
5393
5394 pages_in_pl = 0;
5395 upl_size = (upl_size_t)upl_needed_size;
5396 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
5397
5398
5399 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_START,
5400 (int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0);
5401
5402 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
5403 kret = vm_map_get_upl(target_map: map,
5404 vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
5405 size: &upl_size, upl: &upl[cur_upl], NULL, page_infoCnt: &pages_in_pl, flags: &upl_flags, VM_KERN_MEMORY_FILE, force_data_sync: 0);
5406
5407 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_END,
5408 (int)upl_offset, upl_size, io_size, kret, 0);
5409
5410 if (kret != KERN_SUCCESS) {
5411 /*
5412 * failed to get pagelist
5413 */
5414 error = EINVAL;
5415 goto wait_for_creads;
5416 }
5417 num_upl++;
5418
5419 if (upl_size < upl_needed_size) {
5420 /*
5421 * The upl_size wasn't satisfied.
5422 */
5423 error = EINVAL;
5424 goto wait_for_creads;
5425 }
5426 pl = ubc_upl_pageinfo(upl[cur_upl]);
5427
5428 dst_paddr = ((addr64_t)upl_phys_page(upl: pl, index: 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
5429
5430 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
5431 u_int32_t head_size;
5432
5433 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
5434
5435 if (head_size > io_size) {
5436 head_size = io_size;
5437 }
5438
5439 error = cluster_align_phys_io(vp, uio, usr_paddr: dst_paddr, xsize: head_size, CL_READ, callback, callback_arg);
5440
5441 if (error) {
5442 goto wait_for_creads;
5443 }
5444
5445 upl_offset += head_size;
5446 dst_paddr += head_size;
5447 io_size -= head_size;
5448
5449 iov_base += head_size;
5450 }
5451 if ((u_int32_t)iov_base & mem_alignment_mask) {
5452 /*
5453	 * request isn't aligned to a memory boundary
5454	 * that the underlying DMA engine can handle...
5455 * return an error instead of going through
5456 * the slow copy path since the intent of this
5457 * path is direct I/O to device memory
5458 */
5459 error = EINVAL;
5460 goto wait_for_creads;
5461 }
5462
5463 tail_size = io_size & (devblocksize - 1);
5464
5465 io_size -= tail_size;
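	/*
	 * Illustrative sketch, not part of the original source: with an assumed
	 * devblocksize of 4096, a uio_offset of 10240 and an io_size of 20480,
	 * the head loop above copies 2048 bytes (4096 - (10240 & 4095)) through
	 * cluster_align_phys_io() to reach the next device block boundary; the
	 * remaining 18432 bytes then leave a tail_size of 18432 & 4095 == 2048,
	 * which is copied the same way once the async reads complete, so only
	 * the middle 16384 bytes are issued through cluster_io() below.
	 */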
5466
5467 while (io_size && error == 0) {
5468 if (io_size > MAX_IO_CONTIG_SIZE) {
5469 xsize = MAX_IO_CONTIG_SIZE;
5470 } else {
5471 xsize = io_size;
5472 }
5473 /*
5474 * request asynchronously so that we can overlap
5475 * the preparation of the next I/O... we'll do
5476 * the commit after all the I/O has completed
5477	 * since it's all issued against the same UPL...
5478	 * if there are already too many outstanding reads,
5479 * wait until some have completed before issuing the next
5480 */
5481 cluster_iostate_wait(iostate: &iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), wait_name: "cluster_read_contig");
5482
5483 if (iostate.io_error) {
5484 /*
5485	 * one of the earlier reads we issued ran into a hard error...
5486 * don't issue any more reads...
5487 * go wait for any other reads to complete before
5488 * returning the error to the caller
5489 */
5490 goto wait_for_creads;
5491 }
5492 error = cluster_io(vp, upl: upl[cur_upl], upl_offset, f_offset: uio->uio_offset, non_rounded_size: xsize,
5493 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC | bflag,
5494 real_bp: (buf_t)NULL, iostate: &iostate, callback, callback_arg);
5495 /*
5496 * The cluster_io read was issued successfully,
5497 * update the uio structure
5498 */
5499 if (error == 0) {
5500 uio_update(a_uio: uio, a_count: (user_size_t)xsize);
5501
5502 dst_paddr += xsize;
5503 upl_offset += xsize;
5504 io_size -= xsize;
5505 }
5506 }
5507 if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS && uio->uio_offset < filesize) {
5508 error = cluster_io_type(uio, io_type: read_type, io_length: read_length, min_length: 0);
5509
5510 if (error == 0 && *read_type == IO_CONTIG) {
5511 cur_upl++;
5512 goto next_cread;
5513 }
5514 } else {
5515 *read_type = IO_UNKNOWN;
5516 }
5517
5518wait_for_creads:
5519 /*
5520 * make sure all async reads that are part of this stream
5521 * have completed before we proceed
5522 */
5523 cluster_iostate_wait(iostate: &iostate, target: 0, wait_name: "cluster_read_contig");
5524
5525 if (iostate.io_error) {
5526 error = iostate.io_error;
5527 }
5528
5529 lck_mtx_destroy(lck: &iostate.io_mtxp, grp: &cl_mtx_grp);
5530
5531 if (error == 0 && tail_size) {
5532 error = cluster_align_phys_io(vp, uio, usr_paddr: dst_paddr, xsize: tail_size, CL_READ, callback, callback_arg);
5533 }
5534
5535 for (n = 0; n < num_upl; n++) {
5536 /*
5537 * just release our hold on each physically contiguous
5538 * region without changing any state
5539 */
5540 ubc_upl_abort(upl[n], 0);
5541 }
5542
5543 return error;
5544}
5545
5546
5547static int
5548cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length)
5549{
5550 user_size_t iov_len;
5551 user_addr_t iov_base = 0;
5552 upl_t upl;
5553 upl_size_t upl_size;
5554 upl_control_flags_t upl_flags;
5555 int retval = 0;
5556
5557 /*
5558	 * skip over any empty vectors
5559 */
5560 uio_update(a_uio: uio, a_count: (user_size_t)0);
5561
5562 iov_len = uio_curriovlen(a_uio: uio);
5563
5564 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_START, uio, (int)iov_len, 0, 0, 0);
5565
5566 if (iov_len) {
5567 iov_base = uio_curriovbase(a_uio: uio);
5568 /*
5569 * make sure the size of the vector isn't too big...
5570 * internally, we want to handle all of the I/O in
5571 * chunk sizes that fit in a 32 bit int
5572 */
5573 if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE) {
5574 upl_size = MAX_IO_REQUEST_SIZE;
5575 } else {
5576 upl_size = (u_int32_t)iov_len;
5577 }
5578
5579 upl_flags = UPL_QUERY_OBJECT_TYPE;
5580
5581 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
5582 if ((vm_map_get_upl(target_map: map,
5583 vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
5584 size: &upl_size, upl: &upl, NULL, NULL, flags: &upl_flags, VM_KERN_MEMORY_FILE, force_data_sync: 0)) != KERN_SUCCESS) {
5585 /*
5586 * the user app must have passed in an invalid address
5587 */
5588 retval = EFAULT;
5589 }
5590 if (upl_size == 0) {
5591 retval = EFAULT;
5592 }
5593
5594 *io_length = upl_size;
5595
5596 if (upl_flags & UPL_PHYS_CONTIG) {
5597 *io_type = IO_CONTIG;
5598 } else if (iov_len >= min_length) {
5599 *io_type = IO_DIRECT;
5600 } else {
5601 *io_type = IO_COPY;
5602 }
5603 } else {
5604 /*
5605 * nothing left to do for this uio
5606 */
5607 *io_length = 0;
5608 *io_type = IO_UNKNOWN;
5609 }
5610 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, iov_base, *io_type, *io_length, retval, 0);
5611
5612 if (*io_type == IO_DIRECT &&
5613 vm_map_page_shift(map: current_map()) < PAGE_SHIFT) {
5614 /* no direct I/O for sub-page-size address spaces */
5615 DEBUG4K_VFS("io_type IO_DIRECT -> IO_COPY\n");
5616 *io_type = IO_COPY;
5617 }
5618
5619 return retval;
5620}
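/*
 * Classification sketch (illustrative, not from the original source): given a
 * non-empty current iovec, cluster_io_type() asks the VM for the backing
 * object type via UPL_QUERY_OBJECT_TYPE and then picks one of three paths:
 *
 *	physically contiguous backing (UPL_PHYS_CONTIG)  -> IO_CONTIG
 *	iov_len >= min_length                            -> IO_DIRECT
 *	otherwise                                        -> IO_COPY
 *
 * so, for example, an ordinary malloc'd user buffer large enough to meet
 * min_length would normally come back IO_DIRECT (unless the sub-page-size
 * address-space check above downgrades it to IO_COPY), while a buffer backed
 * by physically contiguous device memory comes back IO_CONTIG.  The callers
 * then route the request to the contig, direct, or copy read/write paths.
 */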
5621
5622
5623/*
5624 * generate advisory I/O's in the largest chunks possible
5625 * the completed pages will be released into the VM cache
5626 */
5627int
5628advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
5629{
5630 return advisory_read_ext(vp, filesize, f_offset, resid, NULL, NULL, CL_PASSIVE);
5631}
5632
5633int
5634advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
5635{
5636 upl_page_info_t *pl;
5637 upl_t upl = NULL;
5638 vm_offset_t upl_offset;
5639 int upl_size;
5640 off_t upl_f_offset;
5641 int start_offset;
5642 int start_pg;
5643 int last_pg;
5644 int pages_in_upl;
5645 off_t max_size;
5646 int io_size;
5647 kern_return_t kret;
5648 int retval = 0;
5649 int issued_io;
5650 int skip_range;
5651 uint32_t max_io_size;
5652
5653
5654 if (!UBCINFOEXISTS(vp)) {
5655 return EINVAL;
5656 }
5657
5658 if (f_offset < 0 || resid < 0) {
5659 return EINVAL;
5660 }
5661
5662 max_io_size = cluster_max_io_size(mp: vp->v_mount, CL_READ);
5663
5664 if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
5665 if (max_io_size > speculative_prefetch_max_iosize) {
5666 max_io_size = speculative_prefetch_max_iosize;
5667 }
5668 }
5669
5670 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
5671 (int)f_offset, resid, (int)filesize, 0, 0);
5672
5673 while (resid && f_offset < filesize && retval == 0) {
5674 /*
5675 * compute the size of the upl needed to encompass
5676 * the requested read... limit each call to cluster_io
5677 * to the maximum UPL size... cluster_io will clip if
5678	 * this exceeds the maximum io_size for the device;
5679 * make sure to account for
5680 * a starting offset that's not page aligned
5681 */
5682 start_offset = (int)(f_offset & PAGE_MASK_64);
5683 upl_f_offset = f_offset - (off_t)start_offset;
5684 max_size = filesize - f_offset;
5685
5686 if (resid < max_size) {
5687 io_size = resid;
5688 } else {
5689 io_size = (int)max_size;
5690 }
5691
5692 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
5693 if ((uint32_t)upl_size > max_io_size) {
5694 upl_size = max_io_size;
5695 }
5696
5697 skip_range = 0;
5698 /*
5699 * return the number of contiguously present pages in the cache
5700 * starting at upl_f_offset within the file
5701 */
5702 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
5703
5704 if (skip_range) {
5705 /*
5706 * skip over pages already present in the cache
5707 */
5708 io_size = skip_range - start_offset;
5709
5710 f_offset += io_size;
5711 resid -= io_size;
5712
5713 if (skip_range == upl_size) {
5714 continue;
5715 }
5716 /*
5717 * have to issue some real I/O
5718 * at this point, we know it's starting on a page boundary
5719 * because we've skipped over at least the first page in the request
5720 */
5721 start_offset = 0;
5722 upl_f_offset += skip_range;
5723 upl_size -= skip_range;
5724 }
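		/*
		 * Worked example (illustrative): with 4KB pages, suppose f_offset is
		 * 0x1800 (start_offset 0x800, upl_f_offset 0x1000), upl_size is 0x10000,
		 * and ubc_range_op() reports that the first 0x4000 bytes at upl_f_offset
		 * are already resident (skip_range == 0x4000).  The block above advances
		 * f_offset by 0x3800 (skip_range - start_offset) to the page-aligned
		 * 0x5000, trims resid by the same amount, and retries the remaining
		 * 0xC000 bytes of the upl with start_offset reset to 0.
		 */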
5725 pages_in_upl = upl_size / PAGE_SIZE;
5726
5727 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
5728 upl, (int)upl_f_offset, upl_size, start_offset, 0);
5729
5730 kret = ubc_create_upl_kernel(vp,
5731 upl_f_offset,
5732 upl_size,
5733 &upl,
5734 &pl,
5735 UPL_RET_ONLY_ABSENT | UPL_SET_LITE,
5736 VM_KERN_MEMORY_FILE);
5737 if (kret != KERN_SUCCESS) {
5738 return retval;
5739 }
5740 issued_io = 0;
5741
5742 /*
5743 * before we start marching forward, we must make sure we end on
5744 * a present page, otherwise we will be working with a freed
5745 * upl
5746 */
5747 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
5748 if (upl_page_present(upl: pl, index: last_pg)) {
5749 break;
5750 }
5751 }
5752 pages_in_upl = last_pg + 1;
5753
5754
5755 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
5756 upl, (int)upl_f_offset, upl_size, start_offset, 0);
5757
5758
5759 for (last_pg = 0; last_pg < pages_in_upl;) {
5760 /*
5761 * scan from the beginning of the upl looking for the first
5762 * page that is present.... this will become the first page in
5763 * the request we're going to make to 'cluster_io'... if all
5764 * of the pages are absent, we won't call through to 'cluster_io'
5765 */
5766 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
5767 if (upl_page_present(upl: pl, index: start_pg)) {
5768 break;
5769 }
5770 }
5771
5772 /*
5773 * scan from the starting present page looking for an absent
5774 * page before the end of the upl is reached, if we
5775 * find one, then it will terminate the range of pages being
5776 * presented to 'cluster_io'
5777 */
5778 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
5779 if (!upl_page_present(upl: pl, index: last_pg)) {
5780 break;
5781 }
5782 }
5783
5784 if (last_pg > start_pg) {
5785 /*
5786 * we found a range of pages that must be filled
5787 * if the last page in this range is the last page of the file
5788 * we may have to clip the size of it to keep from reading past
5789 * the end of the last physical block associated with the file
5790 */
5791 upl_offset = start_pg * PAGE_SIZE;
5792 io_size = (last_pg - start_pg) * PAGE_SIZE;
5793
5794 if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) {
5795 io_size = (int)(filesize - (upl_f_offset + upl_offset));
5796 }
5797
5798 /*
5799 * issue an asynchronous read to cluster_io
5800 */
5801 retval = cluster_io(vp, upl, upl_offset, f_offset: upl_f_offset + upl_offset, non_rounded_size: io_size,
5802 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE | bflag, real_bp: (buf_t)NULL, iostate: (struct clios *)NULL, callback, callback_arg);
5803
5804 issued_io = 1;
5805 }
5806 }
5807 if (issued_io == 0) {
5808 ubc_upl_abort(upl, 0);
5809 }
5810
5811 io_size = upl_size - start_offset;
5812
5813 if (io_size > resid) {
5814 io_size = resid;
5815 }
5816 f_offset += io_size;
5817 resid -= io_size;
5818 }
5819
5820 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
5821 (int)f_offset, resid, retval, 0, 0);
5822
5823 return retval;
5824}
5825
5826
5827int
5828cluster_push(vnode_t vp, int flags)
5829{
5830 return cluster_push_ext(vp, flags, NULL, NULL);
5831}
5832
5833
5834int
5835cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg)
5836{
5837 return cluster_push_err(vp, flags, callback, callback_arg, NULL);
5838}
5839
5840/* write errors via err, but return the number of clusters written */
5841extern uint32_t system_inshutdown;
5842uint32_t cl_sparse_push_error = 0;
5843int
5844cluster_push_err(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg, int *err)
5845{
5846 int retval;
5847 int my_sparse_wait = 0;
5848 struct cl_writebehind *wbp;
5849 int local_err = 0;
5850
5851 if (err) {
5852 *err = 0;
5853 }
5854
5855 if (!UBCINFOEXISTS(vp)) {
5856 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -1, 0);
5857 return 0;
5858 }
5859 /* return if deferred write is set */
5860 if (((unsigned int)vfs_flags(mp: vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
5861 return 0;
5862 }
5863 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
5864 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -2, 0);
5865 return 0;
5866 }
5867 if (!ISSET(flags, IO_SYNC) && wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
5868 lck_mtx_unlock(lck: &wbp->cl_lockw);
5869
5870 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -3, 0);
5871 return 0;
5872 }
5873 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
5874 wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
5875
5876 /*
5877 * if we have an fsync in progress, we don't want to allow any additional
5878 * sync/fsync/close(s) to occur until it finishes.
5879	 * note that it's possible for writes to continue to occur to this file
5880 * while we're waiting and also once the fsync starts to clean if we're
5881 * in the sparse map case
5882 */
5883 while (wbp->cl_sparse_wait) {
5884 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);
5885
5886 msleep(chan: (caddr_t)&wbp->cl_sparse_wait, mtx: &wbp->cl_lockw, PRIBIO + 1, wmesg: "cluster_push_ext", NULL);
5887
5888 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
5889 }
5890 if (flags & IO_SYNC) {
5891 my_sparse_wait = 1;
5892 wbp->cl_sparse_wait = 1;
5893
5894 /*
5895 * this is an fsync (or equivalent)... we must wait for any existing async
5896	 * cleaning operations to complete before we evaluate the current state
5897	 * and finish cleaning... this ensures that all writes issued before this
5898 * fsync actually get cleaned to the disk before this fsync returns
5899 */
5900 while (wbp->cl_sparse_pushes) {
5901 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);
5902
5903 msleep(chan: (caddr_t)&wbp->cl_sparse_pushes, mtx: &wbp->cl_lockw, PRIBIO + 1, wmesg: "cluster_push_ext", NULL);
5904
5905 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
5906 }
5907 }
5908 if (wbp->cl_scmap) {
5909 void *scmap;
5910
5911 if (wbp->cl_sparse_pushes < SPARSE_PUSH_LIMIT) {
5912 scmap = wbp->cl_scmap;
5913 wbp->cl_scmap = NULL;
5914
5915 wbp->cl_sparse_pushes++;
5916
5917 lck_mtx_unlock(lck: &wbp->cl_lockw);
5918
5919 retval = sparse_cluster_push(wbp, cmapp: &scmap, vp, EOF: ubc_getsize(vp), PUSH_ALL, io_flags: flags, callback, callback_arg, FALSE);
5920
5921 lck_mtx_lock(lck: &wbp->cl_lockw);
5922
5923 wbp->cl_sparse_pushes--;
5924
5925 if (retval) {
5926 if (wbp->cl_scmap != NULL) {
5927 /*
5928 * panic("cluster_push_err: Expected NULL cl_scmap\n");
5929 *
5930 * This can happen if we get an error from the underlying FS
5931 * e.g. ENOSPC, EPERM or EIO etc. We hope that these errors
5932 * are transient and the I/Os will succeed at a later point.
5933 *
5934	 * The tricky part here is that a new sparse cluster has been
5935	 * allocated and is tracking a different set of dirty pages. So these
5936 * pages are not going to be pushed out with the next sparse_cluster_push.
5937 * An explicit msync or file close will, however, push the pages out.
5938 *
5939	 * If even those calls don't work, then during shutdown we keep
5940	 * trying until we succeed...
5941 */
5942
5943 if (system_inshutdown) {
5944 if ((retval == ENOSPC) && (vp->v_mount->mnt_flag & (MNT_LOCAL | MNT_REMOVABLE)) == MNT_LOCAL) {
5945 os_atomic_inc(&cl_sparse_push_error, relaxed);
5946 }
5947 } else {
5948 vfs_drt_control(cmapp: &scmap, op_type: 0); /* emit stats and free this memory. Dirty pages stay intact. */
5949 scmap = NULL;
5950 }
5951 } else {
5952 wbp->cl_scmap = scmap;
5953 }
5954 }
5955
5956 if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0) {
5957 wakeup(chan: (caddr_t)&wbp->cl_sparse_pushes);
5958 }
5959 } else {
5960 retval = sparse_cluster_push(wbp, cmapp: &(wbp->cl_scmap), vp, EOF: ubc_getsize(vp), PUSH_ALL, io_flags: flags, callback, callback_arg, FALSE);
5961 }
5962
5963 local_err = retval;
5964
5965 if (err) {
5966 *err = retval;
5967 }
5968 retval = 1;
5969 } else {
5970 retval = cluster_try_push(wbp, vp, EOF: ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, err: &local_err, FALSE);
5971 if (err) {
5972 *err = local_err;
5973 }
5974 }
5975 lck_mtx_unlock(lck: &wbp->cl_lockw);
5976
5977 if (flags & IO_SYNC) {
5978 (void)vnode_waitforwrites(vp, output_target: 0, slpflag: 0, slptimeout: 0, msg: "cluster_push");
5979 }
5980
5981 if (my_sparse_wait) {
5982 /*
5983 * I'm the owner of the serialization token
5984 * clear it and wakeup anyone that is waiting
5985 * for me to finish
5986 */
5987 lck_mtx_lock(lck: &wbp->cl_lockw);
5988
5989 wbp->cl_sparse_wait = 0;
5990 wakeup(chan: (caddr_t)&wbp->cl_sparse_wait);
5991
5992 lck_mtx_unlock(lck: &wbp->cl_lockw);
5993 }
5994 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
5995 wbp->cl_scmap, wbp->cl_number, retval, local_err, 0);
5996
5997 return retval;
5998}
5999
6000
6001__private_extern__ void
6002cluster_release(struct ubc_info *ubc)
6003{
6004 struct cl_writebehind *wbp;
6005 struct cl_readahead *rap;
6006
6007 if ((wbp = ubc->cl_wbehind)) {
6008 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, wbp->cl_scmap, 0, 0, 0);
6009
6010 if (wbp->cl_scmap) {
6011 vfs_drt_control(cmapp: &(wbp->cl_scmap), op_type: 0);
6012 }
6013 lck_mtx_destroy(lck: &wbp->cl_lockw, grp: &cl_mtx_grp);
6014 zfree(cl_wr_zone, wbp);
6015 ubc->cl_wbehind = NULL;
6016 } else {
6017 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, 0, 0, 0, 0);
6018 }
6019
6020 if ((rap = ubc->cl_rahead)) {
6021 lck_mtx_destroy(lck: &rap->cl_lockr, grp: &cl_mtx_grp);
6022 zfree(cl_rd_zone, rap);
6023 ubc->cl_rahead = NULL;
6024 }
6025
6026 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, ubc, rap, wbp, 0, 0);
6027}
6028
6029
6030static int
6031cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg, int *err, boolean_t vm_initiated)
6032{
6033 int cl_index;
6034 int cl_index1;
6035 int min_index;
6036 int cl_len;
6037 int cl_pushed = 0;
6038 struct cl_wextent l_clusters[MAX_CLUSTERS];
6039 u_int max_cluster_pgcount;
6040 int error = 0;
6041
6042 max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
6043 /*
6044 * the write behind context exists and has
6045 * already been locked...
6046 */
6047 if (wbp->cl_number == 0) {
6048 /*
6049 * no clusters to push
6050 * return number of empty slots
6051 */
6052 return MAX_CLUSTERS;
6053 }
6054
6055 /*
6056 * make a local 'sorted' copy of the clusters
6057 * and clear wbp->cl_number so that new clusters can
6058 * be developed
6059 */
6060 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
6061 for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
6062 if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr) {
6063 continue;
6064 }
6065 if (min_index == -1) {
6066 min_index = cl_index1;
6067 } else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr) {
6068 min_index = cl_index1;
6069 }
6070 }
6071 if (min_index == -1) {
6072 break;
6073 }
6074
6075 l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
6076 l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
6077 l_clusters[cl_index].io_flags = wbp->cl_clusters[min_index].io_flags;
6078
6079 wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
6080 }
6081 wbp->cl_number = 0;
6082
6083 cl_len = cl_index;
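	/*
	 * Illustrative sketch, not from the original source: the loop above is a
	 * simple selection sort.  If wbp->cl_clusters held extents starting at
	 * pages { 300, 100, 200 }, the first pass copies the cluster at 100 into
	 * l_clusters[0] and empties its source slot by setting b_addr == e_addr,
	 * the second pass picks 200 and the third picks 300, leaving l_clusters
	 * ordered by b_addr and cl_len == 3.
	 */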
6084
6085 /* skip switching to the sparse cluster mechanism if on diskimage */
6086 if (((push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS) &&
6087 !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)) {
6088 int i;
6089
6090 /*
6091 * determine if we appear to be writing the file sequentially
6092 * if not, by returning without having pushed any clusters
6093 * we will cause this vnode to be pushed into the sparse cluster mechanism
6094 * used for managing more random I/O patterns
6095 *
6096 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
6097 * that's why we're in try_push with PUSH_DELAY...
6098 *
6099 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
6100	 * is adjacent to the next (i.e. we're looking for sequential writes)... they were sorted above,
6101	 * so we can just make a simple pass through, up to, but not including, the last one...
6102 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
6103 * are sequential
6104 *
6105 * we let the last one be partial as long as it was adjacent to the previous one...
6106	 * we need to do this to deal with multi-threaded servers that might write an I/O or two out
6107 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
6108 */
6109 for (i = 0; i < MAX_CLUSTERS - 1; i++) {
6110 if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != max_cluster_pgcount) {
6111 goto dont_try;
6112 }
6113 if (l_clusters[i].e_addr != l_clusters[i + 1].b_addr) {
6114 goto dont_try;
6115 }
6116 }
6117 }
6118 if (vm_initiated == TRUE) {
6119 lck_mtx_unlock(lck: &wbp->cl_lockw);
6120 }
6121
6122 for (cl_index = 0; cl_index < cl_len; cl_index++) {
6123 int flags;
6124 struct cl_extent cl;
6125 int retval;
6126
6127 flags = io_flags & (IO_PASSIVE | IO_CLOSE);
6128
6129 /*
6130 * try to push each cluster in turn...
6131 */
6132 if (l_clusters[cl_index].io_flags & CLW_IONOCACHE) {
6133 flags |= IO_NOCACHE;
6134 }
6135
6136 if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE) {
6137 flags |= IO_PASSIVE;
6138 }
6139
6140 if (push_flag & PUSH_SYNC) {
6141 flags |= IO_SYNC;
6142 }
6143
6144 cl.b_addr = l_clusters[cl_index].b_addr;
6145 cl.e_addr = l_clusters[cl_index].e_addr;
6146
6147 retval = cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg, vm_ioitiated: vm_initiated);
6148
6149 if (retval == 0) {
6150 cl_pushed++;
6151
6152 l_clusters[cl_index].b_addr = 0;
6153 l_clusters[cl_index].e_addr = 0;
6154 } else if (error == 0) {
6155 error = retval;
6156 }
6157
6158 if (!(push_flag & PUSH_ALL)) {
6159 break;
6160 }
6161 }
6162 if (vm_initiated == TRUE) {
6163 lck_mtx_lock(lck: &wbp->cl_lockw);
6164 }
6165
6166 if (err) {
6167 *err = error;
6168 }
6169
6170dont_try:
6171 if (cl_len > cl_pushed) {
6172 /*
6173 * we didn't push all of the clusters, so
6174 * lets try to merge them back in to the vnode
6175 */
6176 if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
6177 /*
6178 * we picked up some new clusters while we were trying to
6179 * push the old ones... this can happen because I've dropped
6180 * the vnode lock... the sum of the
6181 * leftovers plus the new cluster count exceeds our ability
6182 * to represent them, so switch to the sparse cluster mechanism
6183 *
6184 * collect the active public clusters...
6185 */
6186 sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);
6187
6188 for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
6189 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) {
6190 continue;
6191 }
6192 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
6193 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
6194 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
6195
6196 cl_index1++;
6197 }
6198 /*
6199 * update the cluster count
6200 */
6201 wbp->cl_number = cl_index1;
6202
6203 /*
6204 * and collect the original clusters that were moved into the
6205 * local storage for sorting purposes
6206 */
6207 sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);
6208 } else {
6209 /*
6210 * we've got room to merge the leftovers back in
6211 * just append them starting at the next 'hole'
6212 * represented by wbp->cl_number
6213 */
6214 for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
6215 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) {
6216 continue;
6217 }
6218
6219 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
6220 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
6221 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
6222
6223 cl_index1++;
6224 }
6225 /*
6226 * update the cluster count
6227 */
6228 wbp->cl_number = cl_index1;
6229 }
6230 }
6231 return MAX_CLUSTERS - wbp->cl_number;
6232}
6233
6234
6235
6236static int
6237cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags,
6238 int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6239{
6240 upl_page_info_t *pl;
6241 upl_t upl;
6242 vm_offset_t upl_offset;
6243 int upl_size;
6244 off_t upl_f_offset;
6245 int pages_in_upl;
6246 int start_pg;
6247 int last_pg;
6248 int io_size;
6249 int io_flags;
6250 int upl_flags;
6251 int bflag;
6252 int size;
6253 int error = 0;
6254 int retval;
6255 kern_return_t kret;
6256
6257 if (flags & IO_PASSIVE) {
6258 bflag = CL_PASSIVE;
6259 } else {
6260 bflag = 0;
6261 }
6262
6263 if (flags & IO_SKIP_ENCRYPTION) {
6264 bflag |= CL_ENCRYPTED;
6265 }
6266
6267 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
6268 (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
6269
6270 if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
6271 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
6272
6273 return 0;
6274 }
6275 upl_size = pages_in_upl * PAGE_SIZE;
6276 upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
6277
6278 if (upl_f_offset + upl_size >= EOF) {
6279 if (upl_f_offset >= EOF) {
6280 /*
6281 * must have truncated the file and missed
6282 * clearing a dangling cluster (i.e. it's completely
6283	 * beyond the new EOF)
6284 */
6285 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
6286
6287 return 0;
6288 }
6289 size = (int)(EOF - upl_f_offset);
6290
6291 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
6292 pages_in_upl = upl_size / PAGE_SIZE;
6293 } else {
6294 size = upl_size;
6295 }
6296
6297
6298 if (vm_initiated) {
6299 vnode_pageout(vp, NULL, (upl_offset_t)0, upl_f_offset, (upl_size_t)upl_size,
6300 UPL_MSYNC | UPL_VNODE_PAGER | UPL_KEEPCACHED, &error);
6301
6302 return error;
6303 }
6304 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
6305
6306 /*
6307 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
6308 *
6309 * - only pages that are currently dirty are returned... these are the ones we need to clean
6310 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
6311 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
6312 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
6313 * someone dirties this page while the I/O is in progress, we don't lose track of the new state
6314 *
6315 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
6316 */
6317
6318 if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE)) {
6319 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
6320 } else {
6321 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
6322 }
6323
6324 kret = ubc_create_upl_kernel(vp,
6325 upl_f_offset,
6326 upl_size,
6327 &upl,
6328 &pl,
6329 upl_flags,
6330 VM_KERN_MEMORY_FILE);
6331 if (kret != KERN_SUCCESS) {
6332 panic("cluster_push: failed to get pagelist");
6333 }
6334
6335 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, upl, upl_f_offset, 0, 0, 0);
6336
6337 /*
6338 * since we only asked for the dirty pages back
6339 * it's possible that we may only get a few or even none, so...
6340 * before we start marching forward, we must make sure we know
6341 * where the last present page is in the UPL, otherwise we could
6342 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
6343 * employed by commit_range and abort_range.
6344 */
6345 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
6346 if (upl_page_present(upl: pl, index: last_pg)) {
6347 break;
6348 }
6349 }
6350 pages_in_upl = last_pg + 1;
6351
6352 if (pages_in_upl == 0) {
6353 ubc_upl_abort(upl, 0);
6354
6355 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
6356 return 0;
6357 }
6358
6359 for (last_pg = 0; last_pg < pages_in_upl;) {
6360 /*
6361 * find the next dirty page in the UPL
6362 * this will become the first page in the
6363 * next I/O to generate
6364 */
6365 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
6366 if (upl_dirty_page(upl: pl, index: start_pg)) {
6367 break;
6368 }
6369 if (upl_page_present(upl: pl, index: start_pg)) {
6370 /*
6371 * RET_ONLY_DIRTY will return non-dirty 'precious' pages
6372 * just release these unchanged since we're not going
6373 * to steal them or change their state
6374 */
6375 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
6376 }
6377 }
6378 if (start_pg >= pages_in_upl) {
6379 /*
6380 * done... no more dirty pages to push
6381 */
6382 break;
6383 }
6384 if (start_pg > last_pg) {
6385 /*
6386 * skipped over some non-dirty pages
6387 */
6388 size -= ((start_pg - last_pg) * PAGE_SIZE);
6389 }
6390
6391 /*
6392 * find a range of dirty pages to write
6393 */
6394 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
6395 if (!upl_dirty_page(upl: pl, index: last_pg)) {
6396 break;
6397 }
6398 }
6399 upl_offset = start_pg * PAGE_SIZE;
6400
6401 io_size = min(a: size, b: (last_pg - start_pg) * PAGE_SIZE);
6402
6403 io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | bflag;
6404
6405 if (!(flags & IO_SYNC)) {
6406 io_flags |= CL_ASYNC;
6407 }
6408
6409 if (flags & IO_CLOSE) {
6410 io_flags |= CL_CLOSE;
6411 }
6412
6413 if (flags & IO_NOCACHE) {
6414 io_flags |= CL_NOCACHE;
6415 }
6416
6417 retval = cluster_io(vp, upl, upl_offset, f_offset: upl_f_offset + upl_offset, non_rounded_size: io_size,
6418 flags: io_flags, real_bp: (buf_t)NULL, iostate: (struct clios *)NULL, callback, callback_arg);
6419
6420 if (error == 0 && retval) {
6421 error = retval;
6422 }
6423
6424 size -= io_size;
6425 }
6426 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, error, 0, 0);
6427
6428 return error;
6429}
6430
6431
6432/*
6433 * sparse_cluster_switch is called with the write behind lock held
6434 */
6435static int
6436sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6437{
6438 int cl_index;
6439 int error = 0;
6440
6441 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, 0, 0);
6442
6443 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
6444 int flags;
6445 struct cl_extent cl;
6446
6447 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
6448 if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) {
6449 if (flags & UPL_POP_DIRTY) {
6450 cl.e_addr = cl.b_addr + 1;
6451
6452 error = sparse_cluster_add(wbp, cmapp: &(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg, vm_initiated);
6453
6454 if (error) {
6455 break;
6456 }
6457 }
6458 }
6459 }
6460 }
6461 wbp->cl_number -= cl_index;
6462
6463 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, error, 0);
6464
6465 return error;
6466}
6467
6468
6469/*
6470 * sparse_cluster_push must be called with the write-behind lock held if the scmap is
6471 * still associated with the write-behind context... however, if the scmap has been disassociated
6472 * from the write-behind context (the cluster_push case), the wb lock is not held
6473 */
6474static int
6475sparse_cluster_push(struct cl_writebehind *wbp, void **scmap, vnode_t vp, off_t EOF, int push_flag,
6476 int io_flags, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6477{
6478 struct cl_extent cl;
6479 off_t offset;
6480 u_int length;
6481 void *l_scmap;
6482 int error = 0;
6483
6484 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, kdebug_vnode(vp), (*scmap), 0, push_flag, 0);
6485
6486 if (push_flag & PUSH_ALL) {
6487 vfs_drt_control(cmapp: scmap, op_type: 1);
6488 }
6489
6490 l_scmap = *scmap;
6491
6492 for (;;) {
6493 int retval;
6494
6495 if (vfs_drt_get_cluster(cmapp: scmap, offsetp: &offset, lengthp: &length) != KERN_SUCCESS) {
6496 /*
6497 * Not finding anything to push will return KERN_FAILURE.
6498	 * This is confusing, since it isn't really a failure; that's the
6499	 * reason we don't set 'error' here like we do below.
6500 */
6501 break;
6502 }
6503
6504 if (vm_initiated == TRUE) {
6505 lck_mtx_unlock(lck: &wbp->cl_lockw);
6506 }
6507
6508 cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
6509 cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
6510
6511 retval = cluster_push_now(vp, cl: &cl, EOF, flags: io_flags, callback, callback_arg, vm_initiated);
6512 if (error == 0 && retval) {
6513 error = retval;
6514 }
6515
6516 if (vm_initiated == TRUE) {
6517 lck_mtx_lock(lck: &wbp->cl_lockw);
6518
6519 if (*scmap != l_scmap) {
6520 break;
6521 }
6522 }
6523
6524 if (error) {
6525 if (vfs_drt_mark_pages(cmapp: scmap, offset, length, NULL) != KERN_SUCCESS) {
6526 panic("Failed to restore dirty state on failure");
6527 }
6528
6529 break;
6530 }
6531
6532 if (!(push_flag & PUSH_ALL)) {
6533 break;
6534 }
6535 }
6536 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);
6537
6538 return error;
6539}
6540
6541
6542/*
6543 * sparse_cluster_add is called with the write behind lock held
6544 */
6545static int
6546sparse_cluster_add(struct cl_writebehind *wbp, void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF,
6547 int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6548{
6549 u_int new_dirty;
6550 u_int length;
6551 off_t offset;
6552 int error = 0;
6553 int push_flag = 0; /* Is this a valid value? */
6554
6555 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0);
6556
6557 offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
6558 length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
6559
6560 while (vfs_drt_mark_pages(cmapp: scmap, offset, length, setcountp: &new_dirty) != KERN_SUCCESS) {
6561 /*
6562 * no room left in the map
6563 * only a partial update was done
6564 * push out some pages and try again
6565 */
6566
6567 if (vfs_get_scmap_push_behavior_internal(cmapp: scmap, push_flag: &push_flag)) {
6568 push_flag = 0;
6569 }
6570
6571 error = sparse_cluster_push(wbp, scmap, vp, EOF, push_flag, io_flags: 0, callback, callback_arg, vm_initiated);
6572
6573 if (error) {
6574 break;
6575 }
6576
6577 offset += (new_dirty * PAGE_SIZE_64);
6578 length -= (new_dirty * PAGE_SIZE);
6579 }
6580 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);
6581
6582 return error;
6583}
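/*
 * Worked example (illustrative, assuming 4KB pages): a cl_extent with
 * b_addr == 100 and e_addr == 164 maps to offset == 409600 and
 * length == 262144 (64 pages).  If vfs_drt_mark_pages() runs out of room after
 * processing only the first 32 pages (new_dirty == 32), sparse_cluster_push()
 * is called above to drain part of the map, offset advances by 131072, length
 * drops to 131072, and the loop retries the remaining pages.
 */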
6584
6585
6586static int
6587cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
6588{
6589 upl_page_info_t *pl;
6590 upl_t upl;
6591 addr64_t ubc_paddr;
6592 kern_return_t kret;
6593 int error = 0;
6594 int did_read = 0;
6595 int abort_flags;
6596 int upl_flags;
6597 int bflag;
6598
6599 if (flags & IO_PASSIVE) {
6600 bflag = CL_PASSIVE;
6601 } else {
6602 bflag = 0;
6603 }
6604
6605 if (flags & IO_NOCACHE) {
6606 bflag |= CL_NOCACHE;
6607 }
6608
6609 upl_flags = UPL_SET_LITE;
6610
6611 if (!(flags & CL_READ)) {
6612 /*
6613 * "write" operation: let the UPL subsystem know
6614 * that we intend to modify the buffer cache pages
6615 * we're gathering.
6616 */
6617 upl_flags |= UPL_WILL_MODIFY;
6618 } else {
6619 /*
6620 * indicate that there is no need to pull the
6621 * mapping for this page... we're only going
6622 * to read from it, not modify it.
6623 */
6624 upl_flags |= UPL_FILE_IO;
6625 }
6626 kret = ubc_create_upl_kernel(vp,
6627 uio->uio_offset & ~PAGE_MASK_64,
6628 PAGE_SIZE,
6629 &upl,
6630 &pl,
6631 upl_flags,
6632 VM_KERN_MEMORY_FILE);
6633
6634 if (kret != KERN_SUCCESS) {
6635 return EINVAL;
6636 }
6637
6638 if (!upl_valid_page(upl: pl, index: 0)) {
6639 /*
6640 * issue a synchronous read to cluster_io
6641 */
6642 error = cluster_io(vp, upl, upl_offset: 0, f_offset: uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
6643 CL_READ | bflag, real_bp: (buf_t)NULL, iostate: (struct clios *)NULL, callback, callback_arg);
6644 if (error) {
6645 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
6646
6647 return error;
6648 }
6649 did_read = 1;
6650 }
6651 ubc_paddr = ((addr64_t)upl_phys_page(upl: pl, index: 0) << PAGE_SHIFT) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
6652
6653/*
6654 * NOTE: There is no prototype for the following in BSD. It, and the definitions
6655 * of cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc, can be found in
6656 * osfmk/ppc/mappings.h. They are not included here because there appears to be no
6657 * way to do so without exporting them to kexts as well.
6658 */
6659 if (flags & CL_READ) {
6660// copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */
6661 copypv(source: ubc_paddr, sink: usr_paddr, size: xsize, which: 2 | 1 | 4); /* Copy physical to physical and flush the destination */
6662 } else {
6663// copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */
6664 copypv(source: usr_paddr, sink: ubc_paddr, size: xsize, which: 2 | 1 | 8); /* Copy physical to physical and flush the source */
6665 }
6666 if (!(flags & CL_READ) || (upl_valid_page(upl: pl, index: 0) && upl_dirty_page(upl: pl, index: 0))) {
6667 /*
6668 * issue a synchronous write to cluster_io
6669 */
6670 error = cluster_io(vp, upl, upl_offset: 0, f_offset: uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
6671 flags: bflag, real_bp: (buf_t)NULL, iostate: (struct clios *)NULL, callback, callback_arg);
6672 }
6673 if (error == 0) {
6674 uio_update(a_uio: uio, a_count: (user_size_t)xsize);
6675 }
6676
6677 if (did_read) {
6678 abort_flags = UPL_ABORT_FREE_ON_EMPTY;
6679 } else {
6680 abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
6681 }
6682
6683 ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
6684
6685 return error;
6686}
6687
6688int
6689cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid)
6690{
6691 int pg_offset;
6692 int pg_index;
6693 int csize;
6694 int segflg;
6695 int retval = 0;
6696 int xsize;
6697 upl_page_info_t *pl;
6698 int dirty_count;
6699
6700 xsize = *io_resid;
6701
6702 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
6703 (int)uio->uio_offset, upl_offset, xsize, 0, 0);
6704
6705 segflg = uio->uio_segflg;
6706
6707 switch (segflg) {
6708 case UIO_USERSPACE32:
6709 case UIO_USERISPACE32:
6710 uio->uio_segflg = UIO_PHYS_USERSPACE32;
6711 break;
6712
6713 case UIO_USERSPACE:
6714 case UIO_USERISPACE:
6715 uio->uio_segflg = UIO_PHYS_USERSPACE;
6716 break;
6717
6718 case UIO_USERSPACE64:
6719 case UIO_USERISPACE64:
6720 uio->uio_segflg = UIO_PHYS_USERSPACE64;
6721 break;
6722
6723 case UIO_SYSSPACE:
6724 uio->uio_segflg = UIO_PHYS_SYSSPACE;
6725 break;
6726 }
6727 pl = ubc_upl_pageinfo(upl);
6728
6729 pg_index = upl_offset / PAGE_SIZE;
6730 pg_offset = upl_offset & PAGE_MASK;
6731 csize = min(PAGE_SIZE - pg_offset, b: xsize);
6732
6733 dirty_count = 0;
6734 while (xsize && retval == 0) {
6735 addr64_t paddr;
6736
6737 paddr = ((addr64_t)upl_phys_page(upl: pl, index: pg_index) << PAGE_SHIFT) + pg_offset;
6738 if ((uio->uio_rw == UIO_WRITE) && (upl_dirty_page(upl: pl, index: pg_index) == FALSE)) {
6739 dirty_count++;
6740 }
6741
6742 retval = uiomove64(cp: paddr, n: csize, uio);
6743
6744 pg_index += 1;
6745 pg_offset = 0;
6746 xsize -= csize;
6747 csize = min(PAGE_SIZE, b: xsize);
6748 }
6749 *io_resid = xsize;
6750
6751 uio->uio_segflg = segflg;
6752
6753 if (dirty_count) {
6754 task_update_logical_writes(task: current_task(), io_size: (dirty_count * PAGE_SIZE), TASK_WRITE_DEFERRED, vp: upl_lookup_vnode(upl));
6755 }
6756
6757 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6758 (int)uio->uio_offset, xsize, retval, segflg, 0);
6759
6760 return retval;
6761}
6762
6763
6764int
6765cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
6766{
6767 return cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, take_reference: 1);
6768}
6769
6770
6771static int
6772cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference)
6773{
6774 int segflg;
6775 int io_size;
6776 int xsize;
6777 int start_offset;
6778 int retval = 0;
6779 memory_object_control_t control;
6780
6781 io_size = *io_resid;
6782
6783 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
6784 (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0);
6785
6786 control = ubc_getobject(vp, UBC_FLAGS_NONE);
6787
6788 if (control == MEMORY_OBJECT_CONTROL_NULL) {
6789 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6790 (int)uio->uio_offset, io_size, retval, 3, 0);
6791
6792 return 0;
6793 }
6794 segflg = uio->uio_segflg;
6795
6796 switch (segflg) {
6797 case UIO_USERSPACE32:
6798 case UIO_USERISPACE32:
6799 uio->uio_segflg = UIO_PHYS_USERSPACE32;
6800 break;
6801
6802 case UIO_USERSPACE64:
6803 case UIO_USERISPACE64:
6804 uio->uio_segflg = UIO_PHYS_USERSPACE64;
6805 break;
6806
6807 case UIO_USERSPACE:
6808 case UIO_USERISPACE:
6809 uio->uio_segflg = UIO_PHYS_USERSPACE;
6810 break;
6811
6812 case UIO_SYSSPACE:
6813 uio->uio_segflg = UIO_PHYS_SYSSPACE;
6814 break;
6815 }
6816
6817 if ((io_size = *io_resid)) {
6818 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
6819 xsize = (int)uio_resid(a_uio: uio);
6820
6821 retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio,
6822 start_offset, io_size, mark_dirty, take_reference);
6823 xsize -= uio_resid(a_uio: uio);
6824
6825 int num_bytes_copied = xsize;
6826 if (num_bytes_copied && uio_rw(a_uio: uio)) {
6827 task_update_logical_writes(task: current_task(), io_size: num_bytes_copied, TASK_WRITE_DEFERRED, vp);
6828 }
6829 io_size -= xsize;
6830 }
6831 uio->uio_segflg = segflg;
6832 *io_resid = io_size;
6833
6834 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6835 (int)uio->uio_offset, io_size, retval, 0x80000000 | segflg, 0);
6836
6837 return retval;
6838}
6839
6840
6841int
6842is_file_clean(vnode_t vp, off_t filesize)
6843{
6844 off_t f_offset;
6845 int flags;
6846 int total_dirty = 0;
6847
6848 for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
6849 if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) {
6850 if (flags & UPL_POP_DIRTY) {
6851 total_dirty++;
6852 }
6853 }
6854 }
6855 if (total_dirty) {
6856 return EINVAL;
6857 }
6858
6859 return 0;
6860}
6861
6862
6863
6864/*
6865 * Dirty region tracking/clustering mechanism.
6866 *
6867 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
6868 * dirty regions within a larger space (file). It is primarily intended to
6869 * support clustering in large files with many dirty areas.
6870 *
6871 * The implementation assumes that the dirty regions are pages.
6872 *
6873 * To represent dirty pages within the file, we store bit vectors in a
6874 * variable-size circular hash.
6875 */
6876
6877/*
6878 * Bitvector size. This determines the number of pages we group in a
6879 * single hashtable entry. Each hashtable entry is aligned to this
6880 * size within the file.
6881 */
6882#define DRT_BITVECTOR_PAGES ((1024 * 256) / PAGE_SIZE)
6883
6884/*
6885 * File offset handling.
6886 *
6887 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
6888 * the correct formula is (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
6889 */
6890#define DRT_ADDRESS_MASK (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
6891#define DRT_ALIGN_ADDRESS(addr) ((addr) & DRT_ADDRESS_MASK)
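/*
 * Worked example (illustrative): with 4KB pages, DRT_BITVECTOR_PAGES is 64,
 * so DRT_BITVECTOR_PAGES * PAGE_SIZE is 0x40000 (256KB) and DRT_ADDRESS_MASK
 * is ~0x3ffff.  DRT_ALIGN_ADDRESS(0x123456) therefore yields 0x100000, the
 * start of the 256KB window covered by the corresponding hash entry's
 * bitvector.
 */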
6892
6893/*
6894 * Hashtable address field handling.
6895 *
6896 * The low-order bits of the hashtable address are used to conserve
6897 * space.
6898 *
6899 * DRT_HASH_COUNT_MASK must be large enough to store the range
6900 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
6901 * to indicate that the bucket is actually unoccupied.
6902 */
6903#define DRT_HASH_GET_ADDRESS(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
6904#define DRT_HASH_SET_ADDRESS(scm, i, a) \
6905 do { \
6906 (scm)->scm_hashtable[(i)].dhe_control = \
6907 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
6908 } while (0)
6909#define DRT_HASH_COUNT_MASK 0x1ff
6910#define DRT_HASH_GET_COUNT(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
6911#define DRT_HASH_SET_COUNT(scm, i, c) \
6912 do { \
6913 (scm)->scm_hashtable[(i)].dhe_control = \
6914 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \
6915 } while (0)
6916#define DRT_HASH_CLEAR(scm, i) \
6917 do { \
6918 (scm)->scm_hashtable[(i)].dhe_control = 0; \
6919 } while (0)
6920#define DRT_HASH_VACATE(scm, i) DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
6921#define DRT_HASH_VACANT(scm, i) (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
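/*
 * Layout sketch (illustrative): dhe_control packs two fields.  The high-order
 * bits (selected by DRT_ADDRESS_MASK) hold the 256KB-aligned file offset the
 * entry covers, while the low 9 bits (DRT_HASH_COUNT_MASK == 0x1ff) hold the
 * count of dirty pages in the entry, 0..DRT_BITVECTOR_PAGES.  The reserved
 * count value 0x1ff marks a vacant bucket, which is why DRT_HASH_VACATE()
 * simply stores DRT_HASH_COUNT_MASK as the count and DRT_HASH_VACANT() tests
 * for it.
 */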
6922#define DRT_HASH_COPY(oscm, oi, scm, i) \
6923 do { \
6924 (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control; \
6925 DRT_BITVECTOR_COPY(oscm, oi, scm, i); \
6926 } while(0);
6927
6928
6929#if !defined(XNU_TARGET_OS_OSX)
6930/*
6931 * Hash table moduli.
6932 *
6933 * Since the hashtable entry's size is dependent on the size of
6934 * the bitvector, and since the hashtable size is constrained to
6935 * both being prime and fitting within the desired allocation
6936 * size, these values need to be manually determined.
6937 *
6938 * For DRT_BITVECTOR_SIZE = 64, the entry size is 16 bytes.
6939 *
6940 * The small hashtable allocation is 4096 bytes, so the modulus is 251.
6941 * The large hashtable allocation is 32768 bytes, so the modulus is 2039.
6942 * The xlarge hashtable allocation is 131072 bytes, so the modulus is 8179.
6943 */
6944
6945#define DRT_HASH_SMALL_MODULUS 251
6946#define DRT_HASH_LARGE_MODULUS 2039
6947#define DRT_HASH_XLARGE_MODULUS 8179
6948
6949/*
6950 * Physical memory required before the large hash modulus is permitted.
6951 *
6952 * On small memory systems, the large hash modulus can lead to physical
6953 * memory starvation, so we avoid using it there.
6954 */
6955#define DRT_HASH_LARGE_MEMORY_REQUIRED (1024LL * 1024LL * 1024LL) /* 1GiB */
6956#define DRT_HASH_XLARGE_MEMORY_REQUIRED (8 * 1024LL * 1024LL * 1024LL) /* 8GiB */
6957
6958#define DRT_SMALL_ALLOCATION 4096 /* 80 bytes spare */
6959#define DRT_LARGE_ALLOCATION 32768 /* 144 bytes spare */
6960#define DRT_XLARGE_ALLOCATION 131072 /* 208 bytes spare */
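/*
 * Sizing sketch (illustrative): each vfs_drt_hashentry is 16 bytes (an 8-byte
 * dhe_control plus an 8-byte bitvector covering 64 pages), so the 4096-byte
 * small allocation holds 251 entries (251 * 16 == 4016, 80 bytes left for the
 * vfs_drt_clustermap header), the 32768-byte large allocation holds 2039
 * entries (32624 bytes, 144 spare), and the 131072-byte xlarge allocation
 * holds 8179 entries (130864 bytes, 208 spare), matching the spare-byte notes
 * above.
 */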
6961
6962#else /* XNU_TARGET_OS_OSX */
6963/*
6964 * Hash table moduli.
6965 *
6966 * Since the hashtable entry's size is dependent on the size of
6967 * the bitvector, and since the hashtable size is constrained to
6968 * both being prime and fitting within the desired allocation
6969 * size, these values need to be manually determined.
6970 *
6971 * For DRT_BITVECTOR_SIZE = 64, the entry size is 16 bytes.
6972 *
6973 * The small hashtable allocation is 16384 bytes, so the modulus is 1019.
6974 * The large hashtable allocation is 131072 bytes, so the modulus is 8179.
6975 * The xlarge hashtable allocation is 524288 bytes, so the modulus is 32749.
6976 */
6977
6978#define DRT_HASH_SMALL_MODULUS 1019
6979#define DRT_HASH_LARGE_MODULUS 8179
6980#define DRT_HASH_XLARGE_MODULUS 32749
6981
6982/*
6983 * Physical memory required before the large hash modulus is permitted.
6984 *
6985 * On small memory systems, the large hash modulus can lead to physical
6986 * memory starvation, so we avoid using it there.
6987 */
6988#define DRT_HASH_LARGE_MEMORY_REQUIRED (4 * 1024LL * 1024LL * 1024LL) /* 4GiB */
6989#define DRT_HASH_XLARGE_MEMORY_REQUIRED (32 * 1024LL * 1024LL * 1024LL) /* 32GiB */
6990
6991#define DRT_SMALL_ALLOCATION 16384 /* 80 bytes spare */
6992#define DRT_LARGE_ALLOCATION 131072 /* 208 bytes spare */
6993#define DRT_XLARGE_ALLOCATION 524288 /* 304 bytes spare */
6994
6995#endif /* ! XNU_TARGET_OS_OSX */
6996
6997/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
6998
6999/*
7000 * Hashtable entry.
7001 */
7002struct vfs_drt_hashentry {
7003 u_int64_t dhe_control;
7004/*
7005 * dhe_bitvector was declared as dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
7006 * DRT_BITVECTOR_PAGES is defined as ((1024 * 256) / PAGE_SIZE)
7007 * Since PAGE_SIZE is only known at boot time,
7008 * -define MAX_DRT_BITVECTOR_PAGES for smallest supported page size (4k)
7009 * -declare dhe_bitvector array for largest possible length
7010 */
7011#define MAX_DRT_BITVECTOR_PAGES ((1024 * 256) / (4 * 1024))
7012 u_int32_t dhe_bitvector[MAX_DRT_BITVECTOR_PAGES / 32];
7013};
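/*
 * Size check (illustrative): MAX_DRT_BITVECTOR_PAGES is 64, so dhe_bitvector
 * is two u_int32_t words (8 bytes) and the whole entry is 16 bytes, which is
 * the entry size the modulus/allocation comments above assume.  On a 16KB-page
 * configuration only the first 16 bits of the bitvector are actually used
 * (DRT_BITVECTOR_PAGES == 16), but the entry keeps its worst-case layout.
 */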
7014
7015/*
7016 * Hashtable bitvector handling.
7017 *
7018 * Bitvector fields are 32 bits long.
7019 */
7020
7021#define DRT_HASH_SET_BIT(scm, i, bit) \
7022 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))
7023
7024#define DRT_HASH_CLEAR_BIT(scm, i, bit) \
7025 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))
7026
7027#define DRT_HASH_TEST_BIT(scm, i, bit) \
7028 ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))
7029
7030#define DRT_BITVECTOR_CLEAR(scm, i) \
7031 bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
7032
7033#define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \
7034 bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \
7035 &(scm)->scm_hashtable[(i)].dhe_bitvector[0], \
7036 (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
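/*
 * Indexing sketch (illustrative): the 'bit' argument is the page index within
 * the entry's 256KB window, i.e. (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE.
 * With 4KB pages, page 37 of a window lives in word 37 / 32 == 1 at bit
 * position 37 % 32 == 5, so DRT_HASH_SET_BIT(scm, i, 37) ORs 0x20 into
 * scm_hashtable[i].dhe_bitvector[1].
 */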
7037
7038/*
7039 * Dirty Region Tracking structure.
7040 *
7041 * The hashtable is allocated entirely inside the DRT structure.
7042 *
7043 * The hash is a simple circular prime modulus arrangement; the structure
7044 * is resized from small to large if it overflows.
7045 */
7046
7047struct vfs_drt_clustermap {
7048 u_int32_t scm_magic; /* sanity/detection */
7049#define DRT_SCM_MAGIC 0x12020003
7050 u_int32_t scm_modulus; /* current ring size */
7051 u_int32_t scm_buckets; /* number of occupied buckets */
7052 u_int32_t scm_lastclean; /* last entry we cleaned */
7053 u_int32_t scm_iskips; /* number of slot skips */
7054
7055 struct vfs_drt_hashentry scm_hashtable[0];
7056};
7057
7058
7059#define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus)
7060#define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus)
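/*
 * Probing sketch (illustrative): DRT_HASH() reduces an address modulo the
 * current ring size and collisions are resolved by walking DRT_HASH_NEXT()
 * circularly.  For example, with the small modulus of 251 an address that
 * hashes to bucket 250 probes buckets 250, 0, 1, ... until a matching or
 * vacant bucket is found; the scm_iskips counter ("number of slot skips")
 * records these extra hops.
 */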

/*
 * Debugging codes and arguments.
 */
#define DRT_DEBUG_EMPTYFREE     (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
#define DRT_DEBUG_RETCLUSTER    (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
#define DRT_DEBUG_ALLOC         (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
#define DRT_DEBUG_INSERT        (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
#define DRT_DEBUG_MARK          (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length, dirty */
                                                           /* 0, setcount */
                                                           /* 1 (clean, no map) */
                                                           /* 2 (map alloc fail) */
                                                           /* 3, resid (partial) */
#define DRT_DEBUG_6             (FSDBG_CODE(DBG_FSRW, 87))
#define DRT_DEBUG_SCMDATA       (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets, lastclean, iskips */


static kern_return_t    vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
static kern_return_t    vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
static kern_return_t    vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
    u_int64_t offset, int *indexp);
static kern_return_t    vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
    u_int64_t offset,
    int *indexp,
    int recursed);
static kern_return_t    vfs_drt_do_mark_pages(
    void **cmapp,
    u_int64_t offset,
    u_int length,
    u_int *setcountp,
    int dirty);
static void             vfs_drt_trace(
    struct vfs_drt_clustermap *cmap,
    int code,
    int arg1,
    int arg2,
    int arg3,
    int arg4);


/*
 * Allocate and initialise a sparse cluster map.
 *
 * Will allocate a new map, or resize or compact an existing map.
 *
 * XXX we should probably have at least one intermediate map size,
 * as the 1:16 ratio seems a bit drastic.
 */
static kern_return_t
vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
{
    struct vfs_drt_clustermap *cmap = NULL, *ocmap = NULL;
    kern_return_t kret = KERN_SUCCESS;
    u_int64_t offset = 0;
    u_int32_t i = 0;
    int modulus_size = 0, map_size = 0, active_buckets = 0, index = 0, copycount = 0;

    ocmap = NULL;
    if (cmapp != NULL) {
        ocmap = *cmapp;
    }

    /*
     * Decide on the size of the new map.
     */
    if (ocmap == NULL) {
        modulus_size = DRT_HASH_SMALL_MODULUS;
        map_size = DRT_SMALL_ALLOCATION;
    } else {
        /* count the number of active buckets in the old map */
        active_buckets = 0;
        for (i = 0; i < ocmap->scm_modulus; i++) {
            if (!DRT_HASH_VACANT(ocmap, i) &&
                (DRT_HASH_GET_COUNT(ocmap, i) != 0)) {
                active_buckets++;
            }
        }
        /*
         * If we're currently using the small allocation, check to
         * see whether we should grow to the large one.
         */
        if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
            /*
             * If the ring is nearly full and we are allowed to
             * use the large modulus, upgrade.
             */
            if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) &&
                (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) {
                modulus_size = DRT_HASH_LARGE_MODULUS;
                map_size = DRT_LARGE_ALLOCATION;
            } else {
                modulus_size = DRT_HASH_SMALL_MODULUS;
                map_size = DRT_SMALL_ALLOCATION;
            }
        } else if (ocmap->scm_modulus == DRT_HASH_LARGE_MODULUS) {
            if ((active_buckets > (DRT_HASH_LARGE_MODULUS - 5)) &&
                (max_mem >= DRT_HASH_XLARGE_MEMORY_REQUIRED)) {
                modulus_size = DRT_HASH_XLARGE_MODULUS;
                map_size = DRT_XLARGE_ALLOCATION;
            } else {
                /*
                 * If the ring is completely full and we can't
                 * expand, there's nothing useful for us to do.
                 * Behave as though we had compacted into the new
                 * array and return.
                 */
                return KERN_SUCCESS;
            }
        } else {
            /* already using the xlarge modulus */
            modulus_size = DRT_HASH_XLARGE_MODULUS;
            map_size = DRT_XLARGE_ALLOCATION;

            /*
             * If the ring is completely full, there's
             * nothing useful for us to do.  Behave as
             * though we had compacted into the new
             * array and return.
             */
            if (active_buckets >= DRT_HASH_XLARGE_MODULUS) {
                return KERN_SUCCESS;
            }
        }
    }

    /*
     * Allocate and initialise the new map.
     */

    kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap, map_size,
        KMA_DATA, VM_KERN_MEMORY_FILE);
    if (kret != KERN_SUCCESS) {
        return kret;
    }
    cmap->scm_magic = DRT_SCM_MAGIC;
    cmap->scm_modulus = modulus_size;
    cmap->scm_buckets = 0;
    cmap->scm_lastclean = 0;
    cmap->scm_iskips = 0;
    for (i = 0; i < cmap->scm_modulus; i++) {
        DRT_HASH_CLEAR(cmap, i);
        DRT_HASH_VACATE(cmap, i);
        DRT_BITVECTOR_CLEAR(cmap, i);
    }

    /*
     * If there's an old map, re-hash entries from it into the new map.
     */
    copycount = 0;
    if (ocmap != NULL) {
        for (i = 0; i < ocmap->scm_modulus; i++) {
            /* skip empty buckets */
            if (DRT_HASH_VACANT(ocmap, i) ||
                (DRT_HASH_GET_COUNT(ocmap, i) == 0)) {
                continue;
            }
            /* get new index */
            offset = DRT_HASH_GET_ADDRESS(ocmap, i);
            kret = vfs_drt_get_index(&cmap, offset, &index, 1);
            if (kret != KERN_SUCCESS) {
                /* XXX need to bail out gracefully here */
                panic("vfs_drt: new cluster map mysteriously too small");
                index = 0;
            }
            /* copy */
            DRT_HASH_COPY(ocmap, i, cmap, index);
            copycount++;
        }
    }

    /* log what we've done */
    vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);

    /*
     * It's important to ensure that *cmapp always points to
     * a valid map, so we must overwrite it before freeing
     * the old map.
     */
    *cmapp = cmap;
    if (ocmap != NULL) {
        /* emit stats into trace buffer */
        vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
            ocmap->scm_modulus,
            ocmap->scm_buckets,
            ocmap->scm_lastclean,
            ocmap->scm_iskips);

        vfs_drt_free_map(ocmap);
    }
    return KERN_SUCCESS;
}


/*
 * Free a sparse cluster map.
 */
static kern_return_t
vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
{
    vm_size_t map_size = 0;

    if (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
        map_size = DRT_SMALL_ALLOCATION;
    } else if (cmap->scm_modulus == DRT_HASH_LARGE_MODULUS) {
        map_size = DRT_LARGE_ALLOCATION;
    } else if (cmap->scm_modulus == DRT_HASH_XLARGE_MODULUS) {
        map_size = DRT_XLARGE_ALLOCATION;
    } else {
        panic("vfs_drt_free_map: Invalid modulus %d", cmap->scm_modulus);
    }

    kmem_free(kernel_map, (vm_offset_t)cmap, map_size);
    return KERN_SUCCESS;
}


/*
 * Find the hashtable slot currently occupied by an entry for the supplied offset.
 */
static kern_return_t
vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
{
    int index;
    u_int32_t i;

    offset = DRT_ALIGN_ADDRESS(offset);
    index = DRT_HASH(cmap, offset);

    /* traverse the hashtable */
    for (i = 0; i < cmap->scm_modulus; i++) {
        /*
         * If the slot is vacant, we can stop.
         */
        if (DRT_HASH_VACANT(cmap, index)) {
            break;
        }

        /*
         * If the address matches our offset, we have success.
         */
        if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
            *indexp = index;
            return KERN_SUCCESS;
        }

        /*
         * Move to the next slot, try again.
         */
        index = DRT_HASH_NEXT(cmap, index);
    }
    /*
     * It's not there.
     */
    return KERN_FAILURE;
}

/*
 * Find the hashtable slot for the supplied offset.  If we haven't allocated
 * one yet, allocate one and populate the address field.  Note that the new
 * entry will have a zero page count and thus will still technically be free,
 * so in the case where we are called to clean pages, the slot will remain
 * free.
 */
static kern_return_t
vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
{
    struct vfs_drt_clustermap *cmap;
    kern_return_t kret;
    u_int32_t index;
    u_int32_t i;

    cmap = *cmapp;

    /* look for an existing entry */
    kret = vfs_drt_search_index(cmap, offset, indexp);
    if (kret == KERN_SUCCESS) {
        return kret;
    }

    /* need to allocate an entry */
    offset = DRT_ALIGN_ADDRESS(offset);
    index = DRT_HASH(cmap, offset);

    /* scan from the index forwards looking for a vacant slot */
    for (i = 0; i < cmap->scm_modulus; i++) {
        /* slot vacant? */
        if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap, index) == 0) {
            cmap->scm_buckets++;
            if (index < cmap->scm_lastclean) {
                cmap->scm_lastclean = index;
            }
            DRT_HASH_SET_ADDRESS(cmap, index, offset);
            DRT_HASH_SET_COUNT(cmap, index, 0);
            DRT_BITVECTOR_CLEAR(cmap, index);
            *indexp = index;
            vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
            return KERN_SUCCESS;
        }
        cmap->scm_iskips += i;
        index = DRT_HASH_NEXT(cmap, index);
    }

    /*
     * We haven't found a vacant slot, so the map is full.  If we're not
     * already recursed, try reallocating/compacting it.
     */
    if (recursed) {
        return KERN_FAILURE;
    }
    kret = vfs_drt_alloc_map(cmapp);
    if (kret == KERN_SUCCESS) {
        /* now try to insert again */
        kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
    }
    return kret;
}

/*
 * Implementation of set dirty/clean.
 *
 * In the 'clean' case, not finding a map is OK.
 */
static kern_return_t
vfs_drt_do_mark_pages(
    void **private,
    u_int64_t offset,
    u_int length,
    u_int *setcountp,
    int dirty)
{
    struct vfs_drt_clustermap *cmap, **cmapp;
    kern_return_t kret;
    int i, index, pgoff, pgcount, setcount, ecount;

    cmapp = (struct vfs_drt_clustermap **)private;
    cmap = *cmapp;

    vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);

    if (setcountp != NULL) {
        *setcountp = 0;
    }

    /* allocate a cluster map if we don't already have one */
    if (cmap == NULL) {
        /* no cluster map, nothing to clean */
        if (!dirty) {
            vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
            return KERN_SUCCESS;
        }
        kret = vfs_drt_alloc_map(cmapp);
        if (kret != KERN_SUCCESS) {
            vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
            return kret;
        }
    }
    setcount = 0;

    /*
     * Iterate over the length of the region.
     */
    while (length > 0) {
        /*
         * Get the hashtable index for this offset.
         *
         * XXX this will add blank entries if we are clearing a range
         * that hasn't been dirtied.
         */
        kret = vfs_drt_get_index(cmapp, offset, &index, 0);
        cmap = *cmapp;  /* may have changed! */
        /* this may be a partial-success return */
        if (kret != KERN_SUCCESS) {
            if (setcountp != NULL) {
                *setcountp = setcount;
            }
            vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);

            return kret;
        }

        /*
         * Work out how many pages we're modifying in this
         * hashtable entry.
         */
        pgoff = (int)((offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE);
        pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));

        /*
         * Iterate over pages, dirty/clearing as we go.
         */
        ecount = DRT_HASH_GET_COUNT(cmap, index);
        for (i = 0; i < pgcount; i++) {
            if (dirty) {
                if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
                    if (ecount >= DRT_BITVECTOR_PAGES) {
                        panic("ecount >= DRT_BITVECTOR_PAGES, cmap = %p, index = %d, bit = %d", cmap, index, pgoff + i);
                    }
                    DRT_HASH_SET_BIT(cmap, index, pgoff + i);
                    ecount++;
                    setcount++;
                }
            } else {
                if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
                    if (ecount <= 0) {
                        panic("ecount <= 0, cmap = %p, index = %d, bit = %d", cmap, index, pgoff + i);
                    }
                    assert(ecount > 0);
                    DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
                    ecount--;
                    setcount++;
                }
            }
        }
        DRT_HASH_SET_COUNT(cmap, index, ecount);

        offset += pgcount * PAGE_SIZE;
        length -= pgcount * PAGE_SIZE;
    }
    if (setcountp != NULL) {
        *setcountp = setcount;
    }

    vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);

    return KERN_SUCCESS;
}
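
/*
 * Illustrative sketch (not part of the original source): an example of the
 * pgoff/pgcount arithmetic above, assuming 4K pages and 64-page (256KB)
 * clusters, with DRT_ALIGN_ADDRESS rounding down to the cluster base.  An
 * offset 3 pages into a cluster gives pgoff = 3, and a 1MB length is clipped
 * to the 61 pages remaining in that cluster.  Kept under #if 0.
 */
#if 0
static void
drt_example_span(void)
{
    u_int64_t offset = 0x43000;      /* 3 pages past the 0x40000 cluster base */
    u_int     length = 1024 * 1024;  /* 1MB of dirty data */
    int       pgoff, pgcount, remaining;

    /* page offset within the 64-page cluster: (0x43000 % 0x40000) / 0x1000 == 3 */
    pgoff = (int)((offset % (64 * 4096)) / 4096);

    /* pages handled from this entry: min(256, 64 - 3) == 61 */
    remaining = 64 - pgoff;
    pgcount = ((int)(length / 4096) < remaining) ? (int)(length / 4096) : remaining;
    (void)pgcount;
}
#endif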

/*
 * Mark a set of pages as dirty/clean.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage suitable for holding a pointer.  Note that
 *	this must either be NULL or a value set by this function.
 *
 * offset
 *	Offset of the first page to be marked as dirty, in bytes.  Must be
 *	page-aligned.
 *
 * length
 *	Length of the dirty region, in bytes.  Must be a multiple of PAGE_SIZE.
 *
 * setcountp
 *	Number of pages newly marked dirty by this call (optional).
 *
 * Returns KERN_SUCCESS if all the pages were successfully marked.
 */
static kern_return_t
vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp)
{
    return vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1);
}
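
/*
 * Illustrative sketch (not part of the original source): a caller keeps a
 * single opaque pointer (initially NULL) and lets vfs_drt_mark_pages()
 * allocate and grow the map on demand.  The names here are hypothetical and
 * the block is kept under #if 0.
 */
#if 0
static void
drt_example_mark(void **scmapp, off_t offset, u_int length)
{
    u_int new_dirty = 0;

    /* mark [offset, offset + length) dirty; *scmapp may be updated */
    if (vfs_drt_mark_pages(scmapp, offset, length, &new_dirty) != KERN_SUCCESS) {
        printf("drt_example_mark: only some pages could be tracked\n");
    }
}
#endif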

#if 0
static kern_return_t
vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
{
    return vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
}
#endif

/*
 * Get a cluster of dirty pages.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage managed by vfs_drt_mark_pages.  Note that this must
 *	be NULL or a value set by vfs_drt_mark_pages.
 *
 * offsetp
 *	Returns the byte offset into the file of the first page in the cluster.
 *
 * lengthp
 *	Returns the length in bytes of the cluster of dirty pages.
 *
 * Returns KERN_SUCCESS if a cluster was found.  If KERN_FAILURE is returned,
 * there are no dirty pages meeting the minimum size criteria.  Private storage
 * will be released if there are no more dirty pages left in the map.
 */
static kern_return_t
vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
{
    struct vfs_drt_clustermap *cmap;
    u_int64_t offset;
    u_int length;
    u_int32_t j;
    int index, i, fs, ls;

    /* sanity */
    if ((cmapp == NULL) || (*cmapp == NULL)) {
        return KERN_FAILURE;
    }
    cmap = *cmapp;

    /* walk the hashtable */
    for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
        index = DRT_HASH(cmap, offset);

        if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0)) {
            continue;
        }

        /* scan the bitfield for a string of bits */
        fs = -1;

        for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
            if (DRT_HASH_TEST_BIT(cmap, index, i)) {
                fs = i;
                break;
            }
        }
        if (fs == -1) {
            /* didn't find any bits set */
            panic("vfs_drt: entry summary count > 0 but no bits set in map, cmap = %p, index = %d, count = %lld",
                cmap, index, DRT_HASH_GET_COUNT(cmap, index));
        }
        for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
            if (!DRT_HASH_TEST_BIT(cmap, index, i)) {
                break;
            }
        }

        /* compute offset and length, mark pages clean */
        offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
        length = ls * PAGE_SIZE;
        vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
        cmap->scm_lastclean = index;

        /* return successful */
        *offsetp = (off_t)offset;
        *lengthp = length;

        vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
        return KERN_SUCCESS;
    }
    /*
     * We didn't find anything; the hashtable is empty.  Emit stats into
     * the trace buffer and then free it.
     */
    vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
        cmap->scm_modulus,
        cmap->scm_buckets,
        cmap->scm_lastclean,
        cmap->scm_iskips);

    vfs_drt_free_map(cmap);
    *cmapp = NULL;

    return KERN_FAILURE;
}
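
/*
 * Illustrative sketch (not part of the original source): dirty clusters are
 * typically drained by calling vfs_drt_get_cluster() until it returns
 * KERN_FAILURE, at which point the map has been freed and *scmapp reset to
 * NULL.  The push callback and names are hypothetical; kept under #if 0.
 */
#if 0
static void
drt_example_drain(void **scmapp, void (*push)(off_t, u_int))
{
    off_t offset;
    u_int length;

    while (vfs_drt_get_cluster(scmapp, &offset, &length) == KERN_SUCCESS) {
        /* each returned run of pages has already been marked clean */
        push(offset, length);
    }
}
#endif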
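
/*
 * Perform control operations on a cluster map.
 *
 * This is a public interface.
 *
 * op_type 0 emits the map's statistics into the trace buffer, frees the map
 * and clears *cmapp; op_type 1 resets the clean-scan position (scm_lastclean).
 *
 * Returns KERN_FAILURE if no map is present, KERN_SUCCESS otherwise.
 */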
static kern_return_t
vfs_drt_control(void **cmapp, int op_type)
{
    struct vfs_drt_clustermap *cmap;

    /* sanity */
    if ((cmapp == NULL) || (*cmapp == NULL)) {
        return KERN_FAILURE;
    }
    cmap = *cmapp;

    switch (op_type) {
    case 0:
        /* emit stats into trace buffer */
        vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
            cmap->scm_modulus,
            cmap->scm_buckets,
            cmap->scm_lastclean,
            cmap->scm_iskips);

        vfs_drt_free_map(cmap);
        *cmapp = NULL;
        break;

    case 1:
        cmap->scm_lastclean = 0;
        break;
    }
    return KERN_SUCCESS;
}


/*
 * Emit a summary of the state of the clustermap into the trace buffer
 * along with some caller-provided data.
 */
#if KDEBUG
static void
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
{
    KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
}
#else
static void
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
    __unused int arg1, __unused int arg2, __unused int arg3,
    __unused int arg4)
{
}
#endif

#if 0
/*
 * Perform basic sanity check on the hash entry summary count
 * vs. the actual bits set in the entry.
 */
static void
vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
{
    int index, i;
    int bits_on;

    for (index = 0; index < cmap->scm_modulus; index++) {
        if (DRT_HASH_VACANT(cmap, index)) {
            continue;
        }

        for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
            if (DRT_HASH_TEST_BIT(cmap, index, i)) {
                bits_on++;
            }
        }
        if (bits_on != DRT_HASH_GET_COUNT(cmap, index)) {
            panic("bits_on = %d, index = %d", bits_on, index);
        }
    }
}
#endif

/*
 * Internal interface only.
 */
static kern_return_t
vfs_get_scmap_push_behavior_internal(void **cmapp, int *push_flag)
{
    struct vfs_drt_clustermap *cmap;

    /* sanity */
    if ((cmapp == NULL) || (*cmapp == NULL) || (push_flag == NULL)) {
        return KERN_FAILURE;
    }
    cmap = *cmapp;

    if (cmap->scm_modulus == DRT_HASH_XLARGE_MODULUS) {
        /*
         * If we have a full xlarge sparse cluster,
         * we push it out all at once so the cluster
         * map can be available to absorb more I/Os.
         * This is done on large-memory configs so
         * the small I/Os don't interfere with the
         * pro workloads.
         */
        *push_flag = PUSH_ALL;
    }
    return KERN_SUCCESS;
}