1/*
2 * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
62 */
63
64#include <sys/param.h>
65#include <sys/proc_internal.h>
66#include <sys/buf_internal.h>
67#include <sys/mount_internal.h>
68#include <sys/vnode_internal.h>
69#include <sys/trace.h>
70#include <sys/malloc.h>
71#include <sys/time.h>
72#include <sys/kernel.h>
73#include <sys/resourcevar.h>
74#include <miscfs/specfs/specdev.h>
75#include <sys/uio_internal.h>
76#include <libkern/libkern.h>
77#include <machine/machine_routines.h>
78
79#include <sys/ubc_internal.h>
80#include <vm/vnode_pager.h>
81
82#include <mach/mach_types.h>
83#include <mach/memory_object_types.h>
84#include <mach/vm_map.h>
85#include <mach/upl.h>
86#include <kern/task.h>
87#include <kern/policy_internal.h>
88
89#include <vm/vm_kern.h>
90#include <vm/vm_map.h>
91#include <vm/vm_pageout.h>
92#include <vm/vm_fault.h>
93
94#include <sys/kdebug.h>
95#include <libkern/OSAtomic.h>
96
97#include <sys/sdt.h>
98
99#include <stdbool.h>
100
101#include <vfs/vfs_disk_conditioner.h>
102
103#if 0
104#undef KERNEL_DEBUG
105#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
106#endif
107
108
109#define CL_READ 0x01
110#define CL_WRITE 0x02
111#define CL_ASYNC 0x04
112#define CL_COMMIT 0x08
113#define CL_PAGEOUT 0x10
114#define CL_AGE 0x20
115#define CL_NOZERO 0x40
116#define CL_PAGEIN 0x80
117#define CL_DEV_MEMORY 0x100
118#define CL_PRESERVE 0x200
119#define CL_THROTTLE 0x400
120#define CL_KEEPCACHED 0x800
121#define CL_DIRECT_IO 0x1000
122#define CL_PASSIVE 0x2000
123#define CL_IOSTREAMING 0x4000
124#define CL_CLOSE 0x8000
125#define CL_ENCRYPTED 0x10000
126#define CL_RAW_ENCRYPTED 0x20000
127#define CL_NOCACHE 0x40000
128
129#define MAX_VECTOR_UPL_ELEMENTS 8
130#define MAX_VECTOR_UPL_SIZE (2 * MAX_UPL_SIZE_BYTES)
131
132#define CLUSTER_IO_WAITING ((buf_t)1)
133
134extern upl_t vector_upl_create(vm_offset_t);
135extern boolean_t vector_upl_is_valid(upl_t);
136extern boolean_t vector_upl_set_subupl(upl_t,upl_t, u_int32_t);
137extern void vector_upl_set_pagelist(upl_t);
138extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t);
139
140struct clios {
141 lck_mtx_t io_mtxp;
142 u_int io_completed; /* amount of io that has currently completed */
143 u_int io_issued; /* amount of io that was successfully issued */
144 int io_error; /* error code of first error encountered */
145 int io_wanted; /* someone is sleeping waiting for a change in state */
146};
147
148struct cl_direct_read_lock {
149 LIST_ENTRY(cl_direct_read_lock) chain;
150 int32_t ref_count;
151 vnode_t vp;
152 lck_rw_t rw_lock;
153};
154
155#define CL_DIRECT_READ_LOCK_BUCKETS 61
156
157static LIST_HEAD(cl_direct_read_locks, cl_direct_read_lock)
158 cl_direct_read_locks[CL_DIRECT_READ_LOCK_BUCKETS];
159
160static lck_spin_t cl_direct_read_spin_lock;
161
162static lck_grp_t *cl_mtx_grp;
163static lck_attr_t *cl_mtx_attr;
164static lck_grp_attr_t *cl_mtx_grp_attr;
165static lck_mtx_t *cl_transaction_mtxp;
166
167#define IO_UNKNOWN 0
168#define IO_DIRECT 1
169#define IO_CONTIG 2
170#define IO_COPY 3
171
172#define PUSH_DELAY 0x01
173#define PUSH_ALL 0x02
174#define PUSH_SYNC 0x04
175
176
177static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset);
178static void cluster_wait_IO(buf_t cbp_head, int async);
179static void cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait);
180
181static int cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length);
182
183static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
184 int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg);
185static int cluster_iodone(buf_t bp, void *callback_arg);
186static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp);
187static int cluster_is_throttled(vnode_t vp);
188
189static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name);
190
191static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg, int flags);
192
193static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference);
194static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference);
195
196static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags,
197 int (*)(buf_t, void *), void *callback_arg);
198static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
199 int flags, int (*)(buf_t, void *), void *callback_arg);
200static int cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
201 int (*)(buf_t, void *), void *callback_arg, int flags);
202
203static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
204 off_t headOff, off_t tailOff, int flags, int (*)(buf_t, void *), void *callback_arg);
205static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
206 int *write_type, u_int32_t *write_length, int flags, int (*)(buf_t, void *), void *callback_arg);
207static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
208 int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag);
209
210static void cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes, boolean_t *first_pass,
211 off_t write_off, int write_cnt, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
212
213static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg);
214
215static int cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
216static void cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra,
217 int (*callback)(buf_t, void *), void *callback_arg, int bflag);
218
219static int cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_ioitiated);
220
221static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *),
222 void *callback_arg, int *err, boolean_t vm_initiated);
223
224static int sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
225static int sparse_cluster_push(struct cl_writebehind *, void **cmapp, vnode_t vp, off_t EOF, int push_flag,
226 int io_flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
227static int sparse_cluster_add(struct cl_writebehind *, void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF,
228 int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
229
230static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp);
231static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
232static kern_return_t vfs_drt_control(void **cmapp, int op_type);
233
234
235/*
236 * For throttled IO to check whether
237 * a block is cached by the boot cache
238 * and thus it can avoid delaying the IO.
239 *
240 * bootcache_contains_block is initially
241 * NULL. The BootCache will set it while
242 * the cache is active and clear it when
243 * the cache is jettisoned.
244 *
245 * Returns 0 if the block is not
246 * contained in the cache, 1 if it is
247 * contained.
248 *
249 * The function pointer remains valid
250 * after the cache has been evicted even
251 * if bootcache_contains_block has been
252 * cleared.
253 *
254 * See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs
255 */
256int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL;
257
258
259/*
260 * limit the internal I/O size so that we
261 * can represent it in a 32 bit int
262 */
263#define MAX_IO_REQUEST_SIZE (1024 * 1024 * 512)
264#define MAX_IO_CONTIG_SIZE MAX_UPL_SIZE_BYTES
265#define MAX_VECTS 16
266/*
267 * The MIN_DIRECT_WRITE_SIZE governs how much I/O should be issued before we consider
268 * allowing the caller to bypass the buffer cache. For small I/Os (less than 16k),
269 * we have not historically allowed the write to bypass the UBC.
270 */
271#define MIN_DIRECT_WRITE_SIZE (16384)
272
273#define WRITE_THROTTLE 6
274#define WRITE_THROTTLE_SSD 2
275#define WRITE_BEHIND 1
276#define WRITE_BEHIND_SSD 1
277
278#if CONFIG_EMBEDDED
279#define PREFETCH 1
280#define PREFETCH_SSD 1
281uint32_t speculative_prefetch_max = (2048 * 1024); /* maximum bytes in a specluative read-ahead */
282uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use in a specluative read-ahead */
283#else
284#define PREFETCH 3
285#define PREFETCH_SSD 2
286uint32_t speculative_prefetch_max = (MAX_UPL_SIZE_BYTES * 3); /* maximum bytes in a specluative read-ahead */
287uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use in a specluative read-ahead on SSDs*/
288#endif
289
290
291#define IO_SCALE(vp, base) (vp->v_mount->mnt_ioscale * (base))
292#define MAX_CLUSTER_SIZE(vp) (cluster_max_io_size(vp->v_mount, CL_WRITE))
293#define MAX_PREFETCH(vp, size, is_ssd) (size * IO_SCALE(vp, ((is_ssd) ? PREFETCH_SSD : PREFETCH)))
294
295int speculative_reads_disabled = 0;
296
297/*
298 * throttle the number of async writes that
299 * can be outstanding on a single vnode
300 * before we issue a synchronous write
301 */
302#define THROTTLE_MAXCNT 0
303
304uint32_t throttle_max_iosize = (128 * 1024);
305
306#define THROTTLE_MAX_IOSIZE (throttle_max_iosize)
307
308SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_max_iosize, 0, "");
309
310
311void
312cluster_init(void) {
313 /*
314 * allocate lock group attribute and group
315 */
316 cl_mtx_grp_attr = lck_grp_attr_alloc_init();
317 cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);
318
319 /*
320 * allocate the lock attribute
321 */
322 cl_mtx_attr = lck_attr_alloc_init();
323
324 cl_transaction_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);
325
326 if (cl_transaction_mtxp == NULL)
327 panic("cluster_init: failed to allocate cl_transaction_mtxp");
328
329 lck_spin_init(&cl_direct_read_spin_lock, cl_mtx_grp, cl_mtx_attr);
330
331 for (int i = 0; i < CL_DIRECT_READ_LOCK_BUCKETS; ++i)
332 LIST_INIT(&cl_direct_read_locks[i]);
333}
334
335
336uint32_t
337cluster_max_io_size(mount_t mp, int type)
338{
339 uint32_t max_io_size;
340 uint32_t segcnt;
341 uint32_t maxcnt;
342
343 switch(type) {
344
345 case CL_READ:
346 segcnt = mp->mnt_segreadcnt;
347 maxcnt = mp->mnt_maxreadcnt;
348 break;
349 case CL_WRITE:
350 segcnt = mp->mnt_segwritecnt;
351 maxcnt = mp->mnt_maxwritecnt;
352 break;
353 default:
354 segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt);
355 maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
356 break;
357 }
358 if (segcnt > (MAX_UPL_SIZE_BYTES >> PAGE_SHIFT)) {
359 /*
360 * don't allow a size beyond the max UPL size we can create
361 */
362 segcnt = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
363 }
364 max_io_size = min((segcnt * PAGE_SIZE), maxcnt);
365
366 if (max_io_size < MAX_UPL_TRANSFER_BYTES) {
367 /*
368 * don't allow a size smaller than the old fixed limit
369 */
370 max_io_size = MAX_UPL_TRANSFER_BYTES;
371 } else {
372 /*
373 * make sure the size specified is a multiple of PAGE_SIZE
374 */
375 max_io_size &= ~PAGE_MASK;
376 }
377 return (max_io_size);
378}
379
380
381
382
383#define CLW_ALLOCATE 0x01
384#define CLW_RETURNLOCKED 0x02
385#define CLW_IONOCACHE 0x04
386#define CLW_IOPASSIVE 0x08
387
388/*
389 * if the read ahead context doesn't yet exist,
390 * allocate and initialize it...
391 * the vnode lock serializes multiple callers
392 * during the actual assignment... first one
393 * to grab the lock wins... the other callers
394 * will release the now unnecessary storage
395 *
396 * once the context is present, try to grab (but don't block on)
397 * the lock associated with it... if someone
398 * else currently owns it, than the read
399 * will run without read-ahead. this allows
400 * multiple readers to run in parallel and
401 * since there's only 1 read ahead context,
402 * there's no real loss in only allowing 1
403 * reader to have read-ahead enabled.
404 */
405static struct cl_readahead *
406cluster_get_rap(vnode_t vp)
407{
408 struct ubc_info *ubc;
409 struct cl_readahead *rap;
410
411 ubc = vp->v_ubcinfo;
412
413 if ((rap = ubc->cl_rahead) == NULL) {
414 MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);
415
416 bzero(rap, sizeof *rap);
417 rap->cl_lastr = -1;
418 lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);
419
420 vnode_lock(vp);
421
422 if (ubc->cl_rahead == NULL)
423 ubc->cl_rahead = rap;
424 else {
425 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
426 FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
427 rap = ubc->cl_rahead;
428 }
429 vnode_unlock(vp);
430 }
431 if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
432 return(rap);
433
434 return ((struct cl_readahead *)NULL);
435}
436
437
438/*
439 * if the write behind context doesn't yet exist,
440 * and CLW_ALLOCATE is specified, allocate and initialize it...
441 * the vnode lock serializes multiple callers
442 * during the actual assignment... first one
443 * to grab the lock wins... the other callers
444 * will release the now unnecessary storage
445 *
446 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
447 * the lock associated with the write behind context before
448 * returning
449 */
450
451static struct cl_writebehind *
452cluster_get_wbp(vnode_t vp, int flags)
453{
454 struct ubc_info *ubc;
455 struct cl_writebehind *wbp;
456
457 ubc = vp->v_ubcinfo;
458
459 if ((wbp = ubc->cl_wbehind) == NULL) {
460
461 if ( !(flags & CLW_ALLOCATE))
462 return ((struct cl_writebehind *)NULL);
463
464 MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);
465
466 bzero(wbp, sizeof *wbp);
467 lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);
468
469 vnode_lock(vp);
470
471 if (ubc->cl_wbehind == NULL)
472 ubc->cl_wbehind = wbp;
473 else {
474 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
475 FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
476 wbp = ubc->cl_wbehind;
477 }
478 vnode_unlock(vp);
479 }
480 if (flags & CLW_RETURNLOCKED)
481 lck_mtx_lock(&wbp->cl_lockw);
482
483 return (wbp);
484}
485
486
487static void
488cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, int flags)
489{
490 struct cl_writebehind *wbp;
491
492 if ((wbp = cluster_get_wbp(vp, 0)) != NULL) {
493
494 if (wbp->cl_number) {
495 lck_mtx_lock(&wbp->cl_lockw);
496
497 cluster_try_push(wbp, vp, newEOF, PUSH_ALL | flags, 0, callback, callback_arg, NULL, FALSE);
498
499 lck_mtx_unlock(&wbp->cl_lockw);
500 }
501 }
502}
503
504
505static int
506cluster_io_present_in_BC(vnode_t vp, off_t f_offset)
507{
508 daddr64_t blkno;
509 size_t io_size;
510 int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block;
511
512 if (bootcache_check_fn && vp->v_mount && vp->v_mount->mnt_devvp) {
513 if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ | VNODE_BLOCKMAP_NO_TRACK, NULL))
514 return(0);
515
516 if (io_size == 0)
517 return (0);
518
519 if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno))
520 return(1);
521 }
522 return(0);
523}
524
525
526static int
527cluster_is_throttled(vnode_t vp)
528{
529 return (throttle_io_will_be_throttled(-1, vp->v_mount));
530}
531
532
533static void
534cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name)
535{
536
537 lck_mtx_lock(&iostate->io_mtxp);
538
539 while ((iostate->io_issued - iostate->io_completed) > target) {
540
541 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
542 iostate->io_issued, iostate->io_completed, target, 0, 0);
543
544 iostate->io_wanted = 1;
545 msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL);
546
547 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
548 iostate->io_issued, iostate->io_completed, target, 0, 0);
549 }
550 lck_mtx_unlock(&iostate->io_mtxp);
551}
552
553static void cluster_handle_associated_upl(struct clios *iostate, upl_t upl,
554 upl_offset_t upl_offset, upl_size_t size)
555{
556 if (!size)
557 return;
558
559 upl_t associated_upl = upl_associated_upl(upl);
560
561 if (!associated_upl)
562 return;
563
564#if 0
565 printf("1: %d %d\n", upl_offset, upl_offset + size);
566#endif
567
568 /*
569 * The associated UPL is page aligned to file offsets whereas the
570 * UPL it's attached to has different alignment requirements. The
571 * upl_offset that we have refers to @upl. The code that follows
572 * has to deal with the first and last pages in this transaction
573 * which might straddle pages in the associated UPL. To keep
574 * track of these pages, we use the mark bits: if the mark bit is
575 * set, we know another transaction has completed its part of that
576 * page and so we can unlock that page here.
577 *
578 * The following illustrates what we have to deal with:
579 *
580 * MEM u <------------ 1 PAGE ------------> e
581 * +-------------+----------------------+-----------------
582 * | |######################|#################
583 * +-------------+----------------------+-----------------
584 * FILE | <--- a ---> o <------------ 1 PAGE ------------>
585 *
586 * So here we show a write to offset @o. The data that is to be
587 * written is in a buffer that is not page aligned; it has offset
588 * @a in the page. The upl that carries the data starts in memory
589 * at @u. The associated upl starts in the file at offset @o. A
590 * transaction will always end on a page boundary (like @e above)
591 * except for the very last transaction in the group. We cannot
592 * unlock the page at @o in the associated upl until both the
593 * transaction ending at @e and the following transaction (that
594 * starts at @e) has completed.
595 */
596
597 /*
598 * We record whether or not the two UPLs are aligned as the mark
599 * bit in the first page of @upl.
600 */
601 upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
602 bool is_unaligned = upl_page_get_mark(pl, 0);
603
604 if (is_unaligned) {
605 upl_page_info_t *assoc_pl = UPL_GET_INTERNAL_PAGE_LIST(associated_upl);
606
607 upl_offset_t upl_end = upl_offset + size;
608 assert(upl_end >= PAGE_SIZE);
609
610 upl_size_t assoc_upl_size = upl_get_size(associated_upl);
611
612 /*
613 * In the very first transaction in the group, upl_offset will
614 * not be page aligned, but after that it will be and in that
615 * case we want the preceding page in the associated UPL hence
616 * the minus one.
617 */
618 assert(upl_offset);
619 if (upl_offset)
620 upl_offset = trunc_page_32(upl_offset - 1);
621
622 lck_mtx_lock_spin(&iostate->io_mtxp);
623
624 // Look at the first page...
625 if (upl_offset
626 && !upl_page_get_mark(assoc_pl, upl_offset >> PAGE_SHIFT)) {
627 /*
628 * The first page isn't marked so let another transaction
629 * completion handle it.
630 */
631 upl_page_set_mark(assoc_pl, upl_offset >> PAGE_SHIFT, true);
632 upl_offset += PAGE_SIZE;
633 }
634
635 // And now the last page...
636
637 /*
638 * This needs to be > rather than >= because if it's equal, it
639 * means there's another transaction that is sharing the last
640 * page.
641 */
642 if (upl_end > assoc_upl_size)
643 upl_end = assoc_upl_size;
644 else {
645 upl_end = trunc_page_32(upl_end);
646 const int last_pg = (upl_end >> PAGE_SHIFT) - 1;
647
648 if (!upl_page_get_mark(assoc_pl, last_pg)) {
649 /*
650 * The last page isn't marked so mark the page and let another
651 * transaction completion handle it.
652 */
653 upl_page_set_mark(assoc_pl, last_pg, true);
654 upl_end -= PAGE_SIZE;
655 }
656 }
657
658 lck_mtx_unlock(&iostate->io_mtxp);
659
660#if 0
661 printf("2: %d %d\n", upl_offset, upl_end);
662#endif
663
664 if (upl_end <= upl_offset)
665 return;
666
667 size = upl_end - upl_offset;
668 } else {
669 assert(!(upl_offset & PAGE_MASK));
670 assert(!(size & PAGE_MASK));
671 }
672
673 boolean_t empty;
674
675 /*
676 * We can unlock these pages now and as this is for a
677 * direct/uncached write, we want to dump the pages too.
678 */
679 kern_return_t kr = upl_abort_range(associated_upl, upl_offset, size,
680 UPL_ABORT_DUMP_PAGES, &empty);
681
682 assert(!kr);
683
684 if (!kr && empty) {
685 upl_set_associated_upl(upl, NULL);
686 upl_deallocate(associated_upl);
687 }
688}
689
690static int
691cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp)
692{
693 int upl_abort_code = 0;
694 int page_in = 0;
695 int page_out = 0;
696
697 if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE))
698 /*
699 * direct write of any flavor, or a direct read that wasn't aligned
700 */
701 ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
702 else {
703 if (io_flags & B_PAGEIO) {
704 if (io_flags & B_READ)
705 page_in = 1;
706 else
707 page_out = 1;
708 }
709 if (io_flags & B_CACHE)
710 /*
711 * leave pages in the cache unchanged on error
712 */
713 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
714 else if (((io_flags & B_READ) == 0) && ((error != ENXIO) || vnode_isswap(vp)))
715 /*
716 * transient error on pageout/write path... leave pages unchanged
717 */
718 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
719 else if (page_in)
720 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
721 else
722 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
723
724 ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
725 }
726 return (upl_abort_code);
727}
728
729
730static int
731cluster_iodone(buf_t bp, void *callback_arg)
732{
733 int b_flags;
734 int error;
735 int total_size;
736 int total_resid;
737 int upl_offset;
738 int zero_offset;
739 int pg_offset = 0;
740 int commit_size = 0;
741 int upl_flags = 0;
742 int transaction_size = 0;
743 upl_t upl;
744 buf_t cbp;
745 buf_t cbp_head;
746 buf_t cbp_next;
747 buf_t real_bp;
748 vnode_t vp;
749 struct clios *iostate;
750 boolean_t transaction_complete = FALSE;
751
752 __IGNORE_WCASTALIGN(cbp_head = (buf_t)(bp->b_trans_head));
753
754 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
755 cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
756
757 if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) {
758 lck_mtx_lock_spin(cl_transaction_mtxp);
759
760 bp->b_flags |= B_TDONE;
761
762 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
763 /*
764 * all I/O requests that are part of this transaction
765 * have to complete before we can process it
766 */
767 if ( !(cbp->b_flags & B_TDONE)) {
768
769 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
770 cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
771
772 lck_mtx_unlock(cl_transaction_mtxp);
773
774 return 0;
775 }
776
777 if (cbp->b_trans_next == CLUSTER_IO_WAITING) {
778 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
779 cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
780
781 lck_mtx_unlock(cl_transaction_mtxp);
782 wakeup(cbp);
783
784 return 0;
785 }
786
787 if (cbp->b_flags & B_EOT)
788 transaction_complete = TRUE;
789 }
790 lck_mtx_unlock(cl_transaction_mtxp);
791
792 if (transaction_complete == FALSE) {
793 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
794 cbp_head, 0, 0, 0, 0);
795 return 0;
796 }
797 }
798 error = 0;
799 total_size = 0;
800 total_resid = 0;
801
802 cbp = cbp_head;
803 vp = cbp->b_vp;
804 upl_offset = cbp->b_uploffset;
805 upl = cbp->b_upl;
806 b_flags = cbp->b_flags;
807 real_bp = cbp->b_real_bp;
808 zero_offset= cbp->b_validend;
809 iostate = (struct clios *)cbp->b_iostate;
810
811 if (real_bp)
812 real_bp->b_dev = cbp->b_dev;
813
814 while (cbp) {
815 if ((cbp->b_flags & B_ERROR) && error == 0)
816 error = cbp->b_error;
817
818 total_resid += cbp->b_resid;
819 total_size += cbp->b_bcount;
820
821 cbp_next = cbp->b_trans_next;
822
823 if (cbp_next == NULL)
824 /*
825 * compute the overall size of the transaction
826 * in case we created one that has 'holes' in it
827 * 'total_size' represents the amount of I/O we
828 * did, not the span of the transaction w/r to the UPL
829 */
830 transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;
831
832 if (cbp != cbp_head)
833 free_io_buf(cbp);
834
835 cbp = cbp_next;
836 }
837
838 if (ISSET(b_flags, B_COMMIT_UPL)) {
839 cluster_handle_associated_upl(iostate,
840 cbp_head->b_upl,
841 upl_offset,
842 transaction_size);
843 }
844
845 if (error == 0 && total_resid)
846 error = EIO;
847
848 if (error == 0) {
849 int (*cliodone_func)(buf_t, void *) = (int (*)(buf_t, void *))(cbp_head->b_cliodone);
850
851 if (cliodone_func != NULL) {
852 cbp_head->b_bcount = transaction_size;
853
854 error = (*cliodone_func)(cbp_head, callback_arg);
855 }
856 }
857 if (zero_offset)
858 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
859
860 free_io_buf(cbp_head);
861
862 if (iostate) {
863 int need_wakeup = 0;
864
865 /*
866 * someone has issued multiple I/Os asynchrounsly
867 * and is waiting for them to complete (streaming)
868 */
869 lck_mtx_lock_spin(&iostate->io_mtxp);
870
871 if (error && iostate->io_error == 0)
872 iostate->io_error = error;
873
874 iostate->io_completed += total_size;
875
876 if (iostate->io_wanted) {
877 /*
878 * someone is waiting for the state of
879 * this io stream to change
880 */
881 iostate->io_wanted = 0;
882 need_wakeup = 1;
883 }
884 lck_mtx_unlock(&iostate->io_mtxp);
885
886 if (need_wakeup)
887 wakeup((caddr_t)&iostate->io_wanted);
888 }
889
890 if (b_flags & B_COMMIT_UPL) {
891
892 pg_offset = upl_offset & PAGE_MASK;
893 commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
894
895 if (error) {
896 upl_set_iodone_error(upl, error);
897
898 upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags, vp);
899 } else {
900 upl_flags = UPL_COMMIT_FREE_ON_EMPTY;
901
902 if ((b_flags & B_PHYS) && (b_flags & B_READ))
903 upl_flags |= UPL_COMMIT_SET_DIRTY;
904
905 if (b_flags & B_AGE)
906 upl_flags |= UPL_COMMIT_INACTIVATE;
907
908 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
909 }
910 }
911 if (real_bp) {
912 if (error) {
913 real_bp->b_flags |= B_ERROR;
914 real_bp->b_error = error;
915 }
916 real_bp->b_resid = total_resid;
917
918 buf_biodone(real_bp);
919 }
920 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
921 upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0);
922
923 return (error);
924}
925
926
927uint32_t
928cluster_throttle_io_limit(vnode_t vp, uint32_t *limit)
929{
930 if (cluster_is_throttled(vp)) {
931 *limit = THROTTLE_MAX_IOSIZE;
932 return 1;
933 }
934 return 0;
935}
936
937
938void
939cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp)
940{
941
942 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
943 upl_offset, size, bp, 0, 0);
944
945 if (bp == NULL || bp->b_datap == 0) {
946 upl_page_info_t *pl;
947 addr64_t zero_addr;
948
949 pl = ubc_upl_pageinfo(upl);
950
951 if (upl_device_page(pl) == TRUE) {
952 zero_addr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + upl_offset;
953
954 bzero_phys_nc(zero_addr, size);
955 } else {
956 while (size) {
957 int page_offset;
958 int page_index;
959 int zero_cnt;
960
961 page_index = upl_offset / PAGE_SIZE;
962 page_offset = upl_offset & PAGE_MASK;
963
964 zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << PAGE_SHIFT) + page_offset;
965 zero_cnt = min(PAGE_SIZE - page_offset, size);
966
967 bzero_phys(zero_addr, zero_cnt);
968
969 size -= zero_cnt;
970 upl_offset += zero_cnt;
971 }
972 }
973 } else
974 bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);
975
976 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
977 upl_offset, size, 0, 0, 0);
978}
979
980
981static void
982cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset)
983{
984 cbp_head->b_validend = zero_offset;
985 cbp_tail->b_flags |= B_EOT;
986}
987
988static void
989cluster_wait_IO(buf_t cbp_head, int async)
990{
991 buf_t cbp;
992
993 if (async) {
994 /*
995 * Async callback completion will not normally generate a
996 * wakeup upon I/O completion. To get woken up, we set
997 * b_trans_next (which is safe for us to modify) on the last
998 * buffer to CLUSTER_IO_WAITING so that cluster_iodone knows
999 * to wake us up when all buffers as part of this transaction
1000 * are completed. This is done under the umbrella of
1001 * cl_transaction_mtxp which is also taken in cluster_iodone.
1002 */
1003 bool done = true;
1004 buf_t last = NULL;
1005
1006 lck_mtx_lock_spin(cl_transaction_mtxp);
1007
1008 for (cbp = cbp_head; cbp; last = cbp, cbp = cbp->b_trans_next) {
1009 if (!ISSET(cbp->b_flags, B_TDONE))
1010 done = false;
1011 }
1012
1013 if (!done) {
1014 last->b_trans_next = CLUSTER_IO_WAITING;
1015
1016 DTRACE_IO1(wait__start, buf_t, last);
1017 do {
1018 msleep(last, cl_transaction_mtxp, PSPIN | (PRIBIO+1), "cluster_wait_IO", NULL);
1019
1020 /*
1021 * We should only have been woken up if all the
1022 * buffers are completed, but just in case...
1023 */
1024 done = true;
1025 for (cbp = cbp_head; cbp != CLUSTER_IO_WAITING; cbp = cbp->b_trans_next) {
1026 if (!ISSET(cbp->b_flags, B_TDONE)) {
1027 done = false;
1028 break;
1029 }
1030 }
1031 } while (!done);
1032 DTRACE_IO1(wait__done, buf_t, last);
1033
1034 last->b_trans_next = NULL;
1035 }
1036
1037 lck_mtx_unlock(cl_transaction_mtxp);
1038 } else { // !async
1039 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
1040 buf_biowait(cbp);
1041 }
1042}
1043
1044static void
1045cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait)
1046{
1047 buf_t cbp;
1048 int error;
1049 boolean_t isswapout = FALSE;
1050
1051 /*
1052 * cluster_complete_transaction will
1053 * only be called if we've issued a complete chain in synchronous mode
1054 * or, we've already done a cluster_wait_IO on an incomplete chain
1055 */
1056 if (needwait) {
1057 for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next)
1058 buf_biowait(cbp);
1059 }
1060 /*
1061 * we've already waited on all of the I/Os in this transaction,
1062 * so mark all of the buf_t's in this transaction as B_TDONE
1063 * so that cluster_iodone sees the transaction as completed
1064 */
1065 for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next)
1066 cbp->b_flags |= B_TDONE;
1067 cbp = *cbp_head;
1068
1069 if ((flags & (CL_ASYNC | CL_PAGEOUT)) == CL_PAGEOUT && vnode_isswap(cbp->b_vp))
1070 isswapout = TRUE;
1071
1072 error = cluster_iodone(cbp, callback_arg);
1073
1074 if ( !(flags & CL_ASYNC) && error && *retval == 0) {
1075 if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO))
1076 *retval = error;
1077 else if (isswapout == TRUE)
1078 *retval = error;
1079 }
1080 *cbp_head = (buf_t)NULL;
1081}
1082
1083
1084static int
1085cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
1086 int flags, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
1087{
1088 buf_t cbp;
1089 u_int size;
1090 u_int io_size;
1091 int io_flags;
1092 int bmap_flags;
1093 int error = 0;
1094 int retval = 0;
1095 buf_t cbp_head = NULL;
1096 buf_t cbp_tail = NULL;
1097 int trans_count = 0;
1098 int max_trans_count;
1099 u_int pg_count;
1100 int pg_offset;
1101 u_int max_iosize;
1102 u_int max_vectors;
1103 int priv;
1104 int zero_offset = 0;
1105 int async_throttle = 0;
1106 mount_t mp;
1107 vm_offset_t upl_end_offset;
1108 boolean_t need_EOT = FALSE;
1109
1110 /*
1111 * we currently don't support buffers larger than a page
1112 */
1113 if (real_bp && non_rounded_size > PAGE_SIZE)
1114 panic("%s(): Called with real buffer of size %d bytes which "
1115 "is greater than the maximum allowed size of "
1116 "%d bytes (the system PAGE_SIZE).\n",
1117 __FUNCTION__, non_rounded_size, PAGE_SIZE);
1118
1119 mp = vp->v_mount;
1120
1121 /*
1122 * we don't want to do any funny rounding of the size for IO requests
1123 * coming through the DIRECT or CONTIGUOUS paths... those pages don't
1124 * belong to us... we can't extend (nor do we need to) the I/O to fill
1125 * out a page
1126 */
1127 if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY | CL_DIRECT_IO))) {
1128 /*
1129 * round the requested size up so that this I/O ends on a
1130 * page boundary in case this is a 'write'... if the filesystem
1131 * has blocks allocated to back the page beyond the EOF, we want to
1132 * make sure to write out the zero's that are sitting beyond the EOF
1133 * so that in case the filesystem doesn't explicitly zero this area
1134 * if a hole is created via a lseek/write beyond the current EOF,
1135 * it will return zeros when it's read back from the disk. If the
1136 * physical allocation doesn't extend for the whole page, we'll
1137 * only write/read from the disk up to the end of this allocation
1138 * via the extent info returned from the VNOP_BLOCKMAP call.
1139 */
1140 pg_offset = upl_offset & PAGE_MASK;
1141
1142 size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
1143 } else {
1144 /*
1145 * anyone advertising a blocksize of 1 byte probably
1146 * can't deal with us rounding up the request size
1147 * AFP is one such filesystem/device
1148 */
1149 size = non_rounded_size;
1150 }
1151 upl_end_offset = upl_offset + size;
1152
1153 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);
1154
1155 /*
1156 * Set the maximum transaction size to the maximum desired number of
1157 * buffers.
1158 */
1159 max_trans_count = 8;
1160 if (flags & CL_DEV_MEMORY)
1161 max_trans_count = 16;
1162
1163 if (flags & CL_READ) {
1164 io_flags = B_READ;
1165 bmap_flags = VNODE_READ;
1166
1167 max_iosize = mp->mnt_maxreadcnt;
1168 max_vectors = mp->mnt_segreadcnt;
1169 } else {
1170 io_flags = B_WRITE;
1171 bmap_flags = VNODE_WRITE;
1172
1173 max_iosize = mp->mnt_maxwritecnt;
1174 max_vectors = mp->mnt_segwritecnt;
1175 }
1176 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);
1177
1178 /*
1179 * make sure the maximum iosize is a
1180 * multiple of the page size
1181 */
1182 max_iosize &= ~PAGE_MASK;
1183
1184 /*
1185 * Ensure the maximum iosize is sensible.
1186 */
1187 if (!max_iosize)
1188 max_iosize = PAGE_SIZE;
1189
1190 if (flags & CL_THROTTLE) {
1191 if ( !(flags & CL_PAGEOUT) && cluster_is_throttled(vp)) {
1192 if (max_iosize > THROTTLE_MAX_IOSIZE)
1193 max_iosize = THROTTLE_MAX_IOSIZE;
1194 async_throttle = THROTTLE_MAXCNT;
1195 } else {
1196 if ( (flags & CL_DEV_MEMORY) )
1197 async_throttle = IO_SCALE(vp, VNODE_ASYNC_THROTTLE);
1198 else {
1199 u_int max_cluster;
1200 u_int max_cluster_size;
1201 u_int scale;
1202
1203 if (vp->v_mount->mnt_minsaturationbytecount) {
1204 max_cluster_size = vp->v_mount->mnt_minsaturationbytecount;
1205
1206 scale = 1;
1207 } else {
1208 max_cluster_size = MAX_CLUSTER_SIZE(vp);
1209
1210 if (disk_conditioner_mount_is_ssd(vp->v_mount))
1211 scale = WRITE_THROTTLE_SSD;
1212 else
1213 scale = WRITE_THROTTLE;
1214 }
1215 if (max_iosize > max_cluster_size)
1216 max_cluster = max_cluster_size;
1217 else
1218 max_cluster = max_iosize;
1219
1220 if (size < max_cluster)
1221 max_cluster = size;
1222
1223 if (flags & CL_CLOSE)
1224 scale += MAX_CLUSTERS;
1225
1226 async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1);
1227 }
1228 }
1229 }
1230 if (flags & CL_AGE)
1231 io_flags |= B_AGE;
1232 if (flags & (CL_PAGEIN | CL_PAGEOUT))
1233 io_flags |= B_PAGEIO;
1234 if (flags & (CL_IOSTREAMING))
1235 io_flags |= B_IOSTREAMING;
1236 if (flags & CL_COMMIT)
1237 io_flags |= B_COMMIT_UPL;
1238 if (flags & CL_DIRECT_IO)
1239 io_flags |= B_PHYS;
1240 if (flags & (CL_PRESERVE | CL_KEEPCACHED))
1241 io_flags |= B_CACHE;
1242 if (flags & CL_PASSIVE)
1243 io_flags |= B_PASSIVE;
1244 if (flags & CL_ENCRYPTED)
1245 io_flags |= B_ENCRYPTED_IO;
1246
1247 if (vp->v_flag & VSYSTEM)
1248 io_flags |= B_META;
1249
1250 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
1251 /*
1252 * then we are going to end up
1253 * with a page that we can't complete (the file size wasn't a multiple
1254 * of PAGE_SIZE and we're trying to read to the end of the file
1255 * so we'll go ahead and zero out the portion of the page we can't
1256 * read in from the file
1257 */
1258 zero_offset = upl_offset + non_rounded_size;
1259 } else if (!ISSET(flags, CL_READ) && ISSET(flags, CL_DIRECT_IO)) {
1260 assert(ISSET(flags, CL_COMMIT));
1261
1262 // For a direct/uncached write, we need to lock pages...
1263
1264 upl_t cached_upl;
1265
1266 /*
1267 * Create a UPL to lock the pages in the cache whilst the
1268 * write is in progress.
1269 */
1270 ubc_create_upl_kernel(vp, f_offset, non_rounded_size, &cached_upl,
1271 NULL, UPL_SET_LITE, VM_KERN_MEMORY_FILE);
1272
1273 /*
1274 * Attach this UPL to the other UPL so that we can find it
1275 * later.
1276 */
1277 upl_set_associated_upl(upl, cached_upl);
1278
1279 if (upl_offset & PAGE_MASK) {
1280 /*
1281 * The two UPLs are not aligned, so mark the first page in
1282 * @upl so that cluster_handle_associated_upl can handle
1283 * it accordingly.
1284 */
1285 upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1286 upl_page_set_mark(pl, 0, true);
1287 }
1288 }
1289
1290 while (size) {
1291 daddr64_t blkno;
1292 daddr64_t lblkno;
1293 u_int io_size_wanted;
1294 size_t io_size_tmp;
1295
1296 if (size > max_iosize)
1297 io_size = max_iosize;
1298 else
1299 io_size = size;
1300
1301 io_size_wanted = io_size;
1302 io_size_tmp = (size_t)io_size;
1303
1304 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, &io_size_tmp, NULL, bmap_flags, NULL)))
1305 break;
1306
1307 if (io_size_tmp > io_size_wanted)
1308 io_size = io_size_wanted;
1309 else
1310 io_size = (u_int)io_size_tmp;
1311
1312 if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno))
1313 real_bp->b_blkno = blkno;
1314
1315 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
1316 (int)f_offset, (int)(blkno>>32), (int)blkno, io_size, 0);
1317
1318 if (io_size == 0) {
1319 /*
1320 * vnop_blockmap didn't return an error... however, it did
1321 * return an extent size of 0 which means we can't
1322 * make forward progress on this I/O... a hole in the
1323 * file would be returned as a blkno of -1 with a non-zero io_size
1324 * a real extent is returned with a blkno != -1 and a non-zero io_size
1325 */
1326 error = EINVAL;
1327 break;
1328 }
1329 if ( !(flags & CL_READ) && blkno == -1) {
1330 off_t e_offset;
1331 int pageout_flags;
1332
1333 if (upl_get_internal_vectorupl(upl))
1334 panic("Vector UPLs should not take this code-path\n");
1335 /*
1336 * we're writing into a 'hole'
1337 */
1338 if (flags & CL_PAGEOUT) {
1339 /*
1340 * if we got here via cluster_pageout
1341 * then just error the request and return
1342 * the 'hole' should already have been covered
1343 */
1344 error = EINVAL;
1345 break;
1346 }
1347 /*
1348 * we can get here if the cluster code happens to
1349 * pick up a page that was dirtied via mmap vs
1350 * a 'write' and the page targets a 'hole'...
1351 * i.e. the writes to the cluster were sparse
1352 * and the file was being written for the first time
1353 *
1354 * we can also get here if the filesystem supports
1355 * 'holes' that are less than PAGE_SIZE.... because
1356 * we can't know if the range in the page that covers
1357 * the 'hole' has been dirtied via an mmap or not,
1358 * we have to assume the worst and try to push the
1359 * entire page to storage.
1360 *
1361 * Try paging out the page individually before
1362 * giving up entirely and dumping it (the pageout
1363 * path will insure that the zero extent accounting
1364 * has been taken care of before we get back into cluster_io)
1365 *
1366 * go direct to vnode_pageout so that we don't have to
1367 * unbusy the page from the UPL... we used to do this
1368 * so that we could call ubc_msync, but that results
1369 * in a potential deadlock if someone else races us to acquire
1370 * that page and wins and in addition needs one of the pages
1371 * we're continuing to hold in the UPL
1372 */
1373 pageout_flags = UPL_MSYNC | UPL_VNODE_PAGER | UPL_NESTED_PAGEOUT;
1374
1375 if ( !(flags & CL_ASYNC))
1376 pageout_flags |= UPL_IOSYNC;
1377 if ( !(flags & CL_COMMIT))
1378 pageout_flags |= UPL_NOCOMMIT;
1379
1380 if (cbp_head) {
1381 buf_t prev_cbp;
1382 int bytes_in_last_page;
1383
1384 /*
1385 * first we have to wait for the the current outstanding I/Os
1386 * to complete... EOT hasn't been set yet on this transaction
1387 * so the pages won't be released
1388 */
1389 cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1390
1391 bytes_in_last_page = cbp_head->b_uploffset & PAGE_MASK;
1392 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
1393 bytes_in_last_page += cbp->b_bcount;
1394 bytes_in_last_page &= PAGE_MASK;
1395
1396 while (bytes_in_last_page) {
1397 /*
1398 * we've got a transcation that
1399 * includes the page we're about to push out through vnode_pageout...
1400 * find the bp's in the list which intersect this page and either
1401 * remove them entirely from the transaction (there could be multiple bp's), or
1402 * round it's iosize down to the page boundary (there can only be one)...
1403 *
1404 * find the last bp in the list and act on it
1405 */
1406 for (prev_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next)
1407 prev_cbp = cbp;
1408
1409 if (bytes_in_last_page >= cbp->b_bcount) {
1410 /*
1411 * this buf no longer has any I/O associated with it
1412 */
1413 bytes_in_last_page -= cbp->b_bcount;
1414 cbp->b_bcount = 0;
1415
1416 free_io_buf(cbp);
1417
1418 if (cbp == cbp_head) {
1419 assert(bytes_in_last_page == 0);
1420 /*
1421 * the buf we just freed was the only buf in
1422 * this transaction... so there's no I/O to do
1423 */
1424 cbp_head = NULL;
1425 cbp_tail = NULL;
1426 } else {
1427 /*
1428 * remove the buf we just freed from
1429 * the transaction list
1430 */
1431 prev_cbp->b_trans_next = NULL;
1432 cbp_tail = prev_cbp;
1433 }
1434 } else {
1435 /*
1436 * this is the last bp that has I/O
1437 * intersecting the page of interest
1438 * only some of the I/O is in the intersection
1439 * so clip the size but keep it in the transaction list
1440 */
1441 cbp->b_bcount -= bytes_in_last_page;
1442 cbp_tail = cbp;
1443 bytes_in_last_page = 0;
1444 }
1445 }
1446 if (cbp_head) {
1447 /*
1448 * there was more to the current transaction
1449 * than just the page we are pushing out via vnode_pageout...
1450 * mark it as finished and complete it... we've already
1451 * waited for the I/Os to complete above in the call to cluster_wait_IO
1452 */
1453 cluster_EOT(cbp_head, cbp_tail, 0);
1454
1455 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
1456
1457 trans_count = 0;
1458 }
1459 }
1460 if (vnode_pageout(vp, upl, trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) {
1461 error = EINVAL;
1462 }
1463 e_offset = round_page_64(f_offset + 1);
1464 io_size = e_offset - f_offset;
1465
1466 f_offset += io_size;
1467 upl_offset += io_size;
1468
1469 if (size >= io_size)
1470 size -= io_size;
1471 else
1472 size = 0;
1473 /*
1474 * keep track of how much of the original request
1475 * that we've actually completed... non_rounded_size
1476 * may go negative due to us rounding the request
1477 * to a page size multiple (i.e. size > non_rounded_size)
1478 */
1479 non_rounded_size -= io_size;
1480
1481 if (non_rounded_size <= 0) {
1482 /*
1483 * we've transferred all of the data in the original
1484 * request, but we were unable to complete the tail
1485 * of the last page because the file didn't have
1486 * an allocation to back that portion... this is ok.
1487 */
1488 size = 0;
1489 }
1490 if (error) {
1491 if (size == 0)
1492 flags &= ~CL_COMMIT;
1493 break;
1494 }
1495 continue;
1496 }
1497 lblkno = (daddr64_t)(f_offset / 0x1000);
1498 /*
1499 * we have now figured out how much I/O we can do - this is in 'io_size'
1500 * pg_offset is the starting point in the first page for the I/O
1501 * pg_count is the number of full and partial pages that 'io_size' encompasses
1502 */
1503 pg_offset = upl_offset & PAGE_MASK;
1504
1505 if (flags & CL_DEV_MEMORY) {
1506 /*
1507 * treat physical requests as one 'giant' page
1508 */
1509 pg_count = 1;
1510 } else
1511 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
1512
1513 if ((flags & CL_READ) && blkno == -1) {
1514 vm_offset_t commit_offset;
1515 int bytes_to_zero;
1516 int complete_transaction_now = 0;
1517
1518 /*
1519 * if we're reading and blkno == -1, then we've got a
1520 * 'hole' in the file that we need to deal with by zeroing
1521 * out the affected area in the upl
1522 */
1523 if (io_size >= (u_int)non_rounded_size) {
1524 /*
1525 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
1526 * than 'zero_offset' will be non-zero
1527 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
1528 * (indicated by the io_size finishing off the I/O request for this UPL)
1529 * than we're not going to issue an I/O for the
1530 * last page in this upl... we need to zero both the hole and the tail
1531 * of the page beyond the EOF, since the delayed zero-fill won't kick in
1532 */
1533 bytes_to_zero = non_rounded_size;
1534 if (!(flags & CL_NOZERO))
1535 bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
1536
1537 zero_offset = 0;
1538 } else
1539 bytes_to_zero = io_size;
1540
1541 pg_count = 0;
1542
1543 cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
1544
1545 if (cbp_head) {
1546 int pg_resid;
1547
1548 /*
1549 * if there is a current I/O chain pending
1550 * then the first page of the group we just zero'd
1551 * will be handled by the I/O completion if the zero
1552 * fill started in the middle of the page
1553 */
1554 commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1555
1556 pg_resid = commit_offset - upl_offset;
1557
1558 if (bytes_to_zero >= pg_resid) {
1559 /*
1560 * the last page of the current I/O
1561 * has been completed...
1562 * compute the number of fully zero'd
1563 * pages that are beyond it
1564 * plus the last page if its partial
1565 * and we have no more I/O to issue...
1566 * otherwise a partial page is left
1567 * to begin the next I/O
1568 */
1569 if ((int)io_size >= non_rounded_size)
1570 pg_count = (bytes_to_zero - pg_resid + (PAGE_SIZE - 1)) / PAGE_SIZE;
1571 else
1572 pg_count = (bytes_to_zero - pg_resid) / PAGE_SIZE;
1573
1574 complete_transaction_now = 1;
1575 }
1576 } else {
1577 /*
1578 * no pending I/O to deal with
1579 * so, commit all of the fully zero'd pages
1580 * plus the last page if its partial
1581 * and we have no more I/O to issue...
1582 * otherwise a partial page is left
1583 * to begin the next I/O
1584 */
1585 if ((int)io_size >= non_rounded_size)
1586 pg_count = (pg_offset + bytes_to_zero + (PAGE_SIZE - 1)) / PAGE_SIZE;
1587 else
1588 pg_count = (pg_offset + bytes_to_zero) / PAGE_SIZE;
1589
1590 commit_offset = upl_offset & ~PAGE_MASK;
1591 }
1592
1593 // Associated UPL is currently only used in the direct write path
1594 assert(!upl_associated_upl(upl));
1595
1596 if ( (flags & CL_COMMIT) && pg_count) {
1597 ubc_upl_commit_range(upl, commit_offset, pg_count * PAGE_SIZE,
1598 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
1599 }
1600 upl_offset += io_size;
1601 f_offset += io_size;
1602 size -= io_size;
1603
1604 /*
1605 * keep track of how much of the original request
1606 * that we've actually completed... non_rounded_size
1607 * may go negative due to us rounding the request
1608 * to a page size multiple (i.e. size > non_rounded_size)
1609 */
1610 non_rounded_size -= io_size;
1611
1612 if (non_rounded_size <= 0) {
1613 /*
1614 * we've transferred all of the data in the original
1615 * request, but we were unable to complete the tail
1616 * of the last page because the file didn't have
1617 * an allocation to back that portion... this is ok.
1618 */
1619 size = 0;
1620 }
1621 if (cbp_head && (complete_transaction_now || size == 0)) {
1622 cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1623
1624 cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);
1625
1626 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
1627
1628 trans_count = 0;
1629 }
1630 continue;
1631 }
1632 if (pg_count > max_vectors) {
1633 if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
1634 io_size = PAGE_SIZE - pg_offset;
1635 pg_count = 1;
1636 } else {
1637 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
1638 pg_count = max_vectors;
1639 }
1640 }
1641 /*
1642 * If the transaction is going to reach the maximum number of
1643 * desired elements, truncate the i/o to the nearest page so
1644 * that the actual i/o is initiated after this buffer is
1645 * created and added to the i/o chain.
1646 *
1647 * I/O directed to physically contiguous memory
1648 * doesn't have a requirement to make sure we 'fill' a page
1649 */
1650 if ( !(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count &&
1651 ((upl_offset + io_size) & PAGE_MASK)) {
1652 vm_offset_t aligned_ofs;
1653
1654 aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK;
1655 /*
1656 * If the io_size does not actually finish off even a
1657 * single page we have to keep adding buffers to the
1658 * transaction despite having reached the desired limit.
1659 *
1660 * Eventually we get here with the page being finished
1661 * off (and exceeded) and then we truncate the size of
1662 * this i/o request so that it is page aligned so that
1663 * we can finally issue the i/o on the transaction.
1664 */
1665 if (aligned_ofs > upl_offset) {
1666 io_size = aligned_ofs - upl_offset;
1667 pg_count--;
1668 }
1669 }
1670
1671 if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV))
1672 /*
1673 * if we're not targeting a virtual device i.e. a disk image
1674 * it's safe to dip into the reserve pool since real devices
1675 * can complete this I/O request without requiring additional
1676 * bufs from the alloc_io_buf pool
1677 */
1678 priv = 1;
1679 else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
1680 /*
1681 * Throttle the speculative IO
1682 */
1683 priv = 0;
1684 else
1685 priv = 1;
1686
1687 cbp = alloc_io_buf(vp, priv);
1688
1689 if (flags & CL_PAGEOUT) {
1690 u_int i;
1691
1692 /*
1693 * since blocks are in offsets of 0x1000, scale
1694 * iteration to (PAGE_SIZE * pg_count) of blks.
1695 */
1696 for (i = 0; i < (PAGE_SIZE * pg_count)/0x1000; i++) {
1697 if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY)
1698 panic("BUSY bp found in cluster_io");
1699 }
1700 }
1701 if (flags & CL_ASYNC) {
1702 if (buf_setcallback(cbp, (void *)cluster_iodone, callback_arg))
1703 panic("buf_setcallback failed\n");
1704 }
1705 cbp->b_cliodone = (void *)callback;
1706 cbp->b_flags |= io_flags;
1707 if (flags & CL_NOCACHE)
1708 cbp->b_attr.ba_flags |= BA_NOCACHE;
1709
1710 cbp->b_lblkno = lblkno;
1711 cbp->b_blkno = blkno;
1712 cbp->b_bcount = io_size;
1713
1714 if (buf_setupl(cbp, upl, upl_offset))
1715 panic("buf_setupl failed\n");
1716#if CONFIG_IOSCHED
1717 upl_set_blkno(upl, upl_offset, io_size, blkno);
1718#endif
1719 cbp->b_trans_next = (buf_t)NULL;
1720
1721 if ((cbp->b_iostate = (void *)iostate))
1722 /*
1723 * caller wants to track the state of this
1724 * io... bump the amount issued against this stream
1725 */
1726 iostate->io_issued += io_size;
1727
1728 if (flags & CL_READ) {
1729 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
1730 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
1731 }
1732 else {
1733 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
1734 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
1735 }
1736
1737 if (cbp_head) {
1738 cbp_tail->b_trans_next = cbp;
1739 cbp_tail = cbp;
1740 } else {
1741 cbp_head = cbp;
1742 cbp_tail = cbp;
1743
1744 if ( (cbp_head->b_real_bp = real_bp) )
1745 real_bp = (buf_t)NULL;
1746 }
1747 *(buf_t *)(&cbp->b_trans_head) = cbp_head;
1748
1749 trans_count++;
1750
1751 upl_offset += io_size;
1752 f_offset += io_size;
1753 size -= io_size;
1754 /*
1755 * keep track of how much of the original request
1756 * that we've actually completed... non_rounded_size
1757 * may go negative due to us rounding the request
1758 * to a page size multiple (i.e. size > non_rounded_size)
1759 */
1760 non_rounded_size -= io_size;
1761
1762 if (non_rounded_size <= 0) {
1763 /*
1764 * we've transferred all of the data in the original
1765 * request, but we were unable to complete the tail
1766 * of the last page because the file didn't have
1767 * an allocation to back that portion... this is ok.
1768 */
1769 size = 0;
1770 }
1771 if (size == 0) {
1772 /*
1773 * we have no more I/O to issue, so go
1774 * finish the final transaction
1775 */
1776 need_EOT = TRUE;
1777 } else if ( ((flags & CL_DEV_MEMORY) || (upl_offset & PAGE_MASK) == 0) &&
1778 ((flags & CL_ASYNC) || trans_count > max_trans_count) ) {
1779 /*
1780 * I/O directed to physically contiguous memory...
1781 * which doesn't have a requirement to make sure we 'fill' a page
1782 * or...
1783 * the current I/O we've prepared fully
1784 * completes the last page in this request
1785 * and ...
1786 * it's either an ASYNC request or
1787				 * we've already accumulated more than max_trans_count I/Os into
1788 * this transaction so mark it as complete so that
1789 * it can finish asynchronously or via the cluster_complete_transaction
1790 * below if the request is synchronous
1791 */
1792 need_EOT = TRUE;
1793 }
1794 if (need_EOT == TRUE)
1795 cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);
1796
1797 if (flags & CL_THROTTLE)
1798 (void)vnode_waitforwrites(vp, async_throttle, 0, 0, "cluster_io");
1799
1800 if ( !(io_flags & B_READ))
1801 vnode_startwrite(vp);
1802
1803 if (flags & CL_RAW_ENCRYPTED) {
1804 /*
1805 * User requested raw encrypted bytes.
1806 * Twiddle the bit in the ba_flags for the buffer
1807 */
1808 cbp->b_attr.ba_flags |= BA_RAW_ENCRYPTED_IO;
1809 }
1810
1811 (void) VNOP_STRATEGY(cbp);
1812
1813 if (need_EOT == TRUE) {
1814 if ( !(flags & CL_ASYNC))
1815 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 1);
1816
1817 need_EOT = FALSE;
1818 trans_count = 0;
1819 cbp_head = NULL;
1820 }
1821 }
1822 if (error) {
1823 int abort_size;
1824
1825 io_size = 0;
1826
1827 if (cbp_head) {
1828 /*
1829 * Wait until all of the outstanding I/O
1830 * for this partial transaction has completed
1831 */
1832 cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1833
1834 /*
1835 * Rewind the upl offset to the beginning of the
1836 * transaction.
1837 */
1838 upl_offset = cbp_head->b_uploffset;
1839 }
1840
1841 if (ISSET(flags, CL_COMMIT)) {
1842 cluster_handle_associated_upl(iostate, upl, upl_offset,
1843 upl_end_offset - upl_offset);
1844 }
1845
1846 // Free all the IO buffers in this transaction
1847 for (cbp = cbp_head; cbp;) {
1848 buf_t cbp_next;
1849
1850 size += cbp->b_bcount;
1851 io_size += cbp->b_bcount;
1852
1853 cbp_next = cbp->b_trans_next;
1854 free_io_buf(cbp);
1855 cbp = cbp_next;
1856 }
1857
1858 if (iostate) {
1859 int need_wakeup = 0;
1860
1861 /*
1862			 * update the error condition for this stream...
1863			 * since we never really issued the I/O,
1864			 * just go ahead and adjust io_issued back down
1865 */
1866 lck_mtx_lock_spin(&iostate->io_mtxp);
1867
1868 if (iostate->io_error == 0)
1869 iostate->io_error = error;
1870 iostate->io_issued -= io_size;
1871
1872 if (iostate->io_wanted) {
1873 /*
1874 * someone is waiting for the state of
1875 * this io stream to change
1876 */
1877 iostate->io_wanted = 0;
1878 need_wakeup = 1;
1879 }
1880 lck_mtx_unlock(&iostate->io_mtxp);
1881
1882 if (need_wakeup)
1883 wakeup((caddr_t)&iostate->io_wanted);
1884 }
1885
1886 if (flags & CL_COMMIT) {
1887 int upl_flags;
1888
1889 pg_offset = upl_offset & PAGE_MASK;
1890 abort_size = (upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK;
1891
1892 upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, abort_size, error, io_flags, vp);
1893
1894 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
1895 upl, upl_offset - pg_offset, abort_size, (error << 24) | upl_flags, 0);
1896 }
1897 if (retval == 0)
1898 retval = error;
1899 } else if (cbp_head)
1900 panic("%s(): cbp_head is not NULL.\n", __FUNCTION__);
1901
1902 if (real_bp) {
1903 /*
1904 * can get here if we either encountered an error
1905 * or we completely zero-filled the request and
1906 * no I/O was issued
1907 */
1908 if (error) {
1909 real_bp->b_flags |= B_ERROR;
1910 real_bp->b_error = error;
1911 }
1912 buf_biodone(real_bp);
1913 }
1914 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END, (int)f_offset, size, upl_offset, retval, 0);
1915
1916 return (retval);
1917}
1918
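/*
 * reset the per-run vector UPL accumulation state (offset, index, aggregate
 * io size / upl size and the 'issue' flag) so that a new run of sub-UPLs can
 * be collected for the next vector_cluster_io call
 */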
1919#define reset_vector_run_state() \
1920 issueVectorUPL = vector_upl_offset = vector_upl_index = vector_upl_iosize = vector_upl_size = 0;
1921
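/*
 * issue an aggregated (vector) UPL through cluster_io as a single request...
 * for reads, CL_PRESERVE is cleared when the vectored range is page aligned
 * in both offset and size (no partial pages to zero fill), otherwise it is
 * set so that the partial pages get zero filled
 */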
1922static int
1923vector_cluster_io(vnode_t vp, upl_t vector_upl, vm_offset_t vector_upl_offset, off_t v_upl_uio_offset, int vector_upl_iosize,
1924 int io_flag, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
1925{
1926 vector_upl_set_pagelist(vector_upl);
1927
1928	if (io_flag & CL_READ) {
1929		if (vector_upl_offset == 0 && ((vector_upl_iosize & PAGE_MASK) == 0))
1930			io_flag &= ~CL_PRESERVE; /* don't zero fill */
1931		else
1932			io_flag |= CL_PRESERVE; /* zero fill */
1933 }
1934 return (cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp, iostate, callback, callback_arg));
1935
1936}
1937
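/*
 * issue a speculative (advisory) read of up to 'size' bytes starting at
 * f_offset, clipped to the current EOF... returns the number of pages the
 * prefetch covers, or 0 if f_offset is at or beyond the EOF
 */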
1938static int
1939cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
1940{
1941 int pages_in_prefetch;
1942
1943 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
1944 (int)f_offset, size, (int)filesize, 0, 0);
1945
1946 if (f_offset >= filesize) {
1947 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1948 (int)f_offset, 0, 0, 0, 0);
1949 return(0);
1950 }
1951 if ((off_t)size > (filesize - f_offset))
1952 size = filesize - f_offset;
1953 pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
1954
1955 advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag);
1956
1957 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1958 (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
1959
1960 return (pages_in_prefetch);
1961}
1962
1963
1964
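/*
 * sequential read-ahead heuristic... using the per-vnode read-ahead state
 * (cl_lastr, cl_ralen, cl_maxra) this detects sequential access, grows the
 * read-ahead window (roughly doubling it, capped at max_prefetch) and kicks
 * off a prefetch beyond the highest page already read ahead... it bails out
 * for non-sequential access, when max_prefetch is too small to be useful,
 * or when enough read-ahead is already resident or pending
 */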
1965static void
1966cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap, int (*callback)(buf_t, void *), void *callback_arg,
1967 int bflag)
1968{
1969 daddr64_t r_addr;
1970 off_t f_offset;
1971 int size_of_prefetch;
1972 u_int max_prefetch;
1973
1974
1975 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
1976 (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);
1977
1978 if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
1979 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1980 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
1981 return;
1982 }
1983 if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) {
1984 rap->cl_ralen = 0;
1985 rap->cl_maxra = 0;
1986
1987 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1988 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
1989
1990 return;
1991 }
1992 max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ), disk_conditioner_mount_is_ssd(vp->v_mount));
1993
1994 if (max_prefetch > speculative_prefetch_max)
1995 max_prefetch = speculative_prefetch_max;
1996
1997 if (max_prefetch <= PAGE_SIZE) {
1998 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1999 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0);
2000 return;
2001 }
2002 if (extent->e_addr < rap->cl_maxra && rap->cl_ralen >= 4) {
2003 if ((rap->cl_maxra - extent->e_addr) > (rap->cl_ralen / 4)) {
2004
2005 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2006 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
2007 return;
2008 }
2009 }
2010 r_addr = max(extent->e_addr, rap->cl_maxra) + 1;
2011 f_offset = (off_t)(r_addr * PAGE_SIZE_64);
2012
2013 size_of_prefetch = 0;
2014
2015 ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
2016
2017 if (size_of_prefetch) {
2018 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2019 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
2020 return;
2021 }
2022 if (f_offset < filesize) {
2023 daddr64_t read_size;
2024
2025 rap->cl_ralen = rap->cl_ralen ? min(max_prefetch / PAGE_SIZE, rap->cl_ralen << 1) : 1;
2026
2027 read_size = (extent->e_addr + 1) - extent->b_addr;
2028
2029 if (read_size > rap->cl_ralen) {
2030 if (read_size > max_prefetch / PAGE_SIZE)
2031 rap->cl_ralen = max_prefetch / PAGE_SIZE;
2032 else
2033 rap->cl_ralen = read_size;
2034 }
2035 size_of_prefetch = cluster_read_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag);
2036
2037 if (size_of_prefetch)
2038 rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
2039 }
2040 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2041 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
2042}
2043
2044
2045int
2046cluster_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2047 int size, off_t filesize, int flags)
2048{
2049 return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
2050
2051}
2052
2053
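/*
 * cluster_pageout_ext is normally reached from a filesystem's VNOP_PAGEOUT
 * handler... a minimal sketch of such a caller (illustrative only, the
 * argument names are not taken from any particular filesystem) looks
 * roughly like:
 *
 *	error = cluster_pageout_ext(vp, upl, upl_offset, f_offset,
 *	                            size, (off_t)filesize, flags, NULL, NULL);
 *
 * the routine validates and clips the request against the EOF, aborts any
 * pages beyond the rounded i/o size when committing, and hands the rest to
 * cluster_io with CL_PAGEOUT | CL_THROTTLE (plus CL_ASYNC / CL_COMMIT /
 * CL_KEEPCACHED / CL_ENCRYPTED derived from the UPL flags passed in)
 */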
2054int
2055cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2056 int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
2057{
2058 int io_size;
2059 int rounded_size;
2060 off_t max_size;
2061 int local_flags;
2062
2063 local_flags = CL_PAGEOUT | CL_THROTTLE;
2064
2065 if ((flags & UPL_IOSYNC) == 0)
2066 local_flags |= CL_ASYNC;
2067 if ((flags & UPL_NOCOMMIT) == 0)
2068 local_flags |= CL_COMMIT;
2069 if ((flags & UPL_KEEPCACHED))
2070 local_flags |= CL_KEEPCACHED;
2071 if (flags & UPL_PAGING_ENCRYPTED)
2072 local_flags |= CL_ENCRYPTED;
2073
2074
2075 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
2076 (int)f_offset, size, (int)filesize, local_flags, 0);
2077
2078 /*
2079 * If they didn't specify any I/O, then we are done...
2080 * we can't issue an abort because we don't know how
2081 * big the upl really is
2082 */
2083 if (size <= 0)
2084 return (EINVAL);
2085
2086 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
2087 if (local_flags & CL_COMMIT)
2088 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
2089 return (EROFS);
2090 }
2091 /*
2092	 * can't page-out to a negative offset
2093 * or if we're starting beyond the EOF
2094 * or if the file offset isn't page aligned
2095 * or the size requested isn't a multiple of PAGE_SIZE
2096 */
2097 if (f_offset < 0 || f_offset >= filesize ||
2098 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
2099 if (local_flags & CL_COMMIT)
2100 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
2101 return (EINVAL);
2102 }
2103 max_size = filesize - f_offset;
2104
2105 if (size < max_size)
2106 io_size = size;
2107 else
2108 io_size = max_size;
2109
2110 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2111
2112 if (size > rounded_size) {
2113 if (local_flags & CL_COMMIT)
2114 ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
2115 UPL_ABORT_FREE_ON_EMPTY);
2116 }
2117 return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
2118 local_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg));
2119}
2120
2121
2122int
2123cluster_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2124 int size, off_t filesize, int flags)
2125{
2126 return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
2127}
2128
2129
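/*
 * page-in analog of cluster_pageout_ext... validates the request (page
 * aligned, non-negative, within the file), clips it to the EOF, aborts any
 * tail pages beyond the rounded size when committing, then issues the rest
 * through cluster_io with CL_READ | CL_PAGEIN
 */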
2130int
2131cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2132 int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
2133{
2134 u_int io_size;
2135 int rounded_size;
2136 off_t max_size;
2137 int retval;
2138 int local_flags = 0;
2139
2140 if (upl == NULL || size < 0)
2141		panic("cluster_pagein: NULL upl or negative size passed in");
2142
2143 if ((flags & UPL_IOSYNC) == 0)
2144 local_flags |= CL_ASYNC;
2145 if ((flags & UPL_NOCOMMIT) == 0)
2146 local_flags |= CL_COMMIT;
2147 if (flags & UPL_IOSTREAMING)
2148 local_flags |= CL_IOSTREAMING;
2149 if (flags & UPL_PAGING_ENCRYPTED)
2150 local_flags |= CL_ENCRYPTED;
2151
2152
2153 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
2154 (int)f_offset, size, (int)filesize, local_flags, 0);
2155
2156 /*
2157 * can't page-in from a negative offset
2158 * or if we're starting beyond the EOF
2159 * or if the file offset isn't page aligned
2160 * or the size requested isn't a multiple of PAGE_SIZE
2161 */
2162 if (f_offset < 0 || f_offset >= filesize ||
2163 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
2164 if (local_flags & CL_COMMIT)
2165 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
2166 return (EINVAL);
2167 }
2168 max_size = filesize - f_offset;
2169
2170 if (size < max_size)
2171 io_size = size;
2172 else
2173 io_size = max_size;
2174
2175 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2176
2177 if (size > rounded_size && (local_flags & CL_COMMIT))
2178 ubc_upl_abort_range(upl, upl_offset + rounded_size,
2179 size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
2180
2181 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
2182 local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
2183
2184 return (retval);
2185}
2186
2187
2188int
2189cluster_bp(buf_t bp)
2190{
2191 return cluster_bp_ext(bp, NULL, NULL);
2192}
2193
2194
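/*
 * run a conventional buf_t through the cluster layer... the buffer's logical
 * block number is translated to a file offset via ubc_blktooff and an async
 * cluster_io is issued against the buffer's UPL, with 'bp' attached as the
 * real_bp so it is completed (buf_biodone) when the transaction finishes
 */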
2195int
2196cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), void *callback_arg)
2197{
2198 off_t f_offset;
2199 int flags;
2200
2201 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
2202 bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
2203
2204 if (bp->b_flags & B_READ)
2205 flags = CL_ASYNC | CL_READ;
2206 else
2207 flags = CL_ASYNC;
2208 if (bp->b_flags & B_PASSIVE)
2209 flags |= CL_PASSIVE;
2210
2211 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
2212
2213 return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL, callback, callback_arg));
2214}
2215
2216
2217
2218int
2219cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
2220{
2221 return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL);
2222}
2223
2224
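/*
 * top level cluster write entry point... for each vector in the uio this
 * chooses between the cached copy path (cluster_write_copy), the direct
 * path (cluster_write_direct) and the physically contiguous path
 * (cluster_write_contig), taking care that only the first chunk zero fills
 * the head and only the last chunk zero fills the tail, and loops until the
 * uio is drained, the new EOF is reached or an error occurs...
 *
 * a filesystem's VNOP_WRITE handler typically ends up forwarding to the
 * non-_ext wrapper along these lines (sketch only, 'lflag' is illustrative):
 *
 *	retval = cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, lflag);
 */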
2225int
2226cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff,
2227 int xflags, int (*callback)(buf_t, void *), void *callback_arg)
2228{
2229 user_ssize_t cur_resid;
2230 int retval = 0;
2231 int flags;
2232 int zflags;
2233 int bflag;
2234 int write_type = IO_COPY;
2235 u_int32_t write_length;
2236
2237 flags = xflags;
2238
2239 if (flags & IO_PASSIVE)
2240 bflag = CL_PASSIVE;
2241 else
2242 bflag = 0;
2243
2244 if (vp->v_flag & VNOCACHE_DATA){
2245 flags |= IO_NOCACHE;
2246 bflag |= CL_NOCACHE;
2247 }
2248 if (uio == NULL) {
2249 /*
2250 * no user data...
2251 * this call is being made to zero-fill some range in the file
2252 */
2253 retval = cluster_write_copy(vp, NULL, (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg);
2254
2255 return(retval);
2256 }
2257 /*
2258	 * do a write through the cache if one of the following is true:
2259	 *   NOCACHE is not true or NODIRECT is true, or
2260	 *   the uio request doesn't target USERSPACE;
2261 * otherwise, find out if we want the direct or contig variant for
2262 * the first vector in the uio request
2263 */
2264 if ( ((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) )
2265 retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
2266
2267 if ( (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT)
2268 /*
2269 * must go through the cached variant in this case
2270 */
2271 write_type = IO_COPY;
2272
2273 while ((cur_resid = uio_resid(uio)) && uio->uio_offset < newEOF && retval == 0) {
2274
2275 switch (write_type) {
2276
2277 case IO_COPY:
2278 /*
2279 * make sure the uio_resid isn't too big...
2280 * internally, we want to handle all of the I/O in
2281 * chunk sizes that fit in a 32 bit int
2282 */
2283 if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
2284 /*
2285 * we're going to have to call cluster_write_copy
2286 * more than once...
2287 *
2288 * only want the last call to cluster_write_copy to
2289 * have the IO_TAILZEROFILL flag set and only the
2290 * first call should have IO_HEADZEROFILL
2291 */
2292 zflags = flags & ~IO_TAILZEROFILL;
2293 flags &= ~IO_HEADZEROFILL;
2294
2295 write_length = MAX_IO_REQUEST_SIZE;
2296 } else {
2297 /*
2298 * last call to cluster_write_copy
2299 */
2300 zflags = flags;
2301
2302 write_length = (u_int32_t)cur_resid;
2303 }
2304 retval = cluster_write_copy(vp, uio, write_length, oldEOF, newEOF, headOff, tailOff, zflags, callback, callback_arg);
2305 break;
2306
2307 case IO_CONTIG:
2308 zflags = flags & ~(IO_TAILZEROFILL | IO_HEADZEROFILL);
2309
2310 if (flags & IO_HEADZEROFILL) {
2311 /*
2312 * only do this once per request
2313 */
2314 flags &= ~IO_HEADZEROFILL;
2315
2316 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, uio->uio_offset,
2317 headOff, (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
2318 if (retval)
2319 break;
2320 }
2321 retval = cluster_write_contig(vp, uio, newEOF, &write_type, &write_length, callback, callback_arg, bflag);
2322
2323 if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(uio) == 0) {
2324 /*
2325 * we're done with the data from the user specified buffer(s)
2326				 * and we've been requested to zero fill at the tail...
2327 * treat this as an IO_HEADZEROFILL which doesn't require a uio
2328 * by rearranging the args and passing in IO_HEADZEROFILL
2329 */
2330 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, tailOff, uio->uio_offset,
2331 (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
2332 }
2333 break;
2334
2335 case IO_DIRECT:
2336 /*
2337 * cluster_write_direct is never called with IO_TAILZEROFILL || IO_HEADZEROFILL
2338 */
2339 retval = cluster_write_direct(vp, uio, oldEOF, newEOF, &write_type, &write_length, flags, callback, callback_arg);
2340 break;
2341
2342 case IO_UNKNOWN:
2343 retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
2344 break;
2345 }
2346 /*
2347 * in case we end up calling cluster_write_copy (from cluster_write_direct)
2348 * multiple times to service a multi-vector request that is not aligned properly
2349 * we need to update the oldEOF so that we
2350 * don't zero-fill the head of a page if we've successfully written
2351 * data to that area... 'cluster_write_copy' will zero-fill the head of a
2352 * page that is beyond the oldEOF if the write is unaligned... we only
2353 * want that to happen for the very first page of the cluster_write,
2354 * NOT the first page of each vector making up a multi-vector write.
2355 */
2356 if (uio->uio_offset > oldEOF)
2357 oldEOF = uio->uio_offset;
2358 }
2359 return (retval);
2360}
2361
2362
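/*
 * direct (uncached) write path... the user buffer is wired via
 * vm_map_get_upl and the I/O is issued straight from it in page-multiple
 * chunks, optionally batching the sub-UPLs of a multi-vector uio into a
 * vector UPL... if the file offset or buffer address is misaligned, or a
 * non page-multiple tail remains, the residual work falls through to
 * cluster_write_copy; when the vnode is inside a throttle window the I/O
 * size is clamped (or EAGAIN is returned if IO_RETURN_ON_THROTTLE is set
 * and an I/O has already been issued in this window)
 */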
2363static int
2364cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length,
2365 int flags, int (*callback)(buf_t, void *), void *callback_arg)
2366{
2367 upl_t upl;
2368 upl_page_info_t *pl;
2369 vm_offset_t upl_offset;
2370 vm_offset_t vector_upl_offset = 0;
2371 u_int32_t io_req_size;
2372 u_int32_t offset_in_file;
2373 u_int32_t offset_in_iovbase;
2374 u_int32_t io_size;
2375 int io_flag = 0;
2376 upl_size_t upl_size, vector_upl_size = 0;
2377 vm_size_t upl_needed_size;
2378 mach_msg_type_number_t pages_in_pl;
2379 upl_control_flags_t upl_flags;
2380 kern_return_t kret;
2381 mach_msg_type_number_t i;
2382 int force_data_sync;
2383 int retval = 0;
2384 int first_IO = 1;
2385 struct clios iostate;
2386 user_addr_t iov_base;
2387 u_int32_t mem_alignment_mask;
2388 u_int32_t devblocksize;
2389 u_int32_t max_io_size;
2390 u_int32_t max_upl_size;
2391 u_int32_t max_vector_size;
2392 u_int32_t bytes_outstanding_limit;
2393 boolean_t io_throttled = FALSE;
2394
2395 u_int32_t vector_upl_iosize = 0;
2396 int issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1);
2397 off_t v_upl_uio_offset = 0;
2398 int vector_upl_index=0;
2399 upl_t vector_upl = NULL;
2400
2401
2402 /*
2403 * When we enter this routine, we know
2404 * -- the resid will not exceed iov_len
2405 */
2406 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
2407 (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
2408
2409 max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
2410
2411 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO;
2412
2413 if (flags & IO_PASSIVE)
2414 io_flag |= CL_PASSIVE;
2415
2416 if (flags & IO_NOCACHE)
2417 io_flag |= CL_NOCACHE;
2418
2419 if (flags & IO_SKIP_ENCRYPTION)
2420 io_flag |= CL_ENCRYPTED;
2421
2422 iostate.io_completed = 0;
2423 iostate.io_issued = 0;
2424 iostate.io_error = 0;
2425 iostate.io_wanted = 0;
2426
2427 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
2428
2429 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
2430 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
2431
2432 if (devblocksize == 1) {
2433 /*
2434 * the AFP client advertises a devblocksize of 1
2435 * however, its BLOCKMAP routine maps to physical
2436 * blocks that are PAGE_SIZE in size...
2437 * therefore we can't ask for I/Os that aren't page aligned
2438		 * or aren't multiples of PAGE_SIZE in size...
2439 * by setting devblocksize to PAGE_SIZE, we re-instate
2440 * the old behavior we had before the mem_alignment_mask
2441 * changes went in...
2442 */
2443 devblocksize = PAGE_SIZE;
2444 }
2445
2446next_dwrite:
2447 io_req_size = *write_length;
2448 iov_base = uio_curriovbase(uio);
2449
2450 offset_in_file = (u_int32_t)uio->uio_offset & PAGE_MASK;
2451 offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
2452
2453 if (offset_in_file || offset_in_iovbase) {
2454 /*
2455 * one of the 2 important offsets is misaligned
2456 * so fire an I/O through the cache for this entire vector
2457 */
2458 goto wait_for_dwrites;
2459 }
2460 if (iov_base & (devblocksize - 1)) {
2461 /*
2462 * the offset in memory must be on a device block boundary
2463 * so that we can guarantee that we can generate an
2464 * I/O that ends on a page boundary in cluster_io
2465 */
2466 goto wait_for_dwrites;
2467 }
2468
2469 task_update_logical_writes(current_task(), (io_req_size & ~PAGE_MASK), TASK_WRITE_IMMEDIATE, vp);
2470 while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) {
2471 int throttle_type;
2472
2473 if ( (throttle_type = cluster_is_throttled(vp)) ) {
2474 /*
2475 * we're in the throttle window, at the very least
2476 * we want to limit the size of the I/O we're about
2477 * to issue
2478 */
2479 if ( (flags & IO_RETURN_ON_THROTTLE) && throttle_type == THROTTLE_NOW) {
2480 /*
2481 * we're in the throttle window and at least 1 I/O
2482 * has already been issued by a throttleable thread
2483 * in this window, so return with EAGAIN to indicate
2484 * to the FS issuing the cluster_write call that it
2485 * should now throttle after dropping any locks
2486 */
2487 throttle_info_update_by_mount(vp->v_mount);
2488
2489 io_throttled = TRUE;
2490 goto wait_for_dwrites;
2491 }
2492 max_vector_size = THROTTLE_MAX_IOSIZE;
2493 max_io_size = THROTTLE_MAX_IOSIZE;
2494 } else {
2495 max_vector_size = MAX_VECTOR_UPL_SIZE;
2496 max_io_size = max_upl_size;
2497 }
2498
2499 if (first_IO) {
2500 cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
2501 first_IO = 0;
2502 }
2503 io_size = io_req_size & ~PAGE_MASK;
2504 iov_base = uio_curriovbase(uio);
2505
2506 if (io_size > max_io_size)
2507 io_size = max_io_size;
2508
2509 if(useVectorUPL && (iov_base & PAGE_MASK)) {
2510 /*
2511 * We have an iov_base that's not page-aligned.
2512 * Issue all I/O's that have been collected within
2513 * this Vectored UPL.
2514 */
2515 if(vector_upl_index) {
2516 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2517 reset_vector_run_state();
2518 }
2519
2520 /*
2521 * After this point, if we are using the Vector UPL path and the base is
2522 * not page-aligned then the UPL with that base will be the first in the vector UPL.
2523 */
2524 }
2525
2526 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
2527 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
2528
2529 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
2530 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
2531
2532 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
2533 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
2534 pages_in_pl = 0;
2535 upl_size = upl_needed_size;
2536 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
2537 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
2538
2539 kret = vm_map_get_upl(map,
2540 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2541 &upl_size,
2542 &upl,
2543 NULL,
2544 &pages_in_pl,
2545 &upl_flags,
2546 VM_KERN_MEMORY_FILE,
2547 force_data_sync);
2548
2549 if (kret != KERN_SUCCESS) {
2550 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2551 0, 0, 0, kret, 0);
2552 /*
2553 * failed to get pagelist
2554 *
2555 * we may have already spun some portion of this request
2556 * off as async requests... we need to wait for the I/O
2557 * to complete before returning
2558 */
2559 goto wait_for_dwrites;
2560 }
2561 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2562 pages_in_pl = upl_size / PAGE_SIZE;
2563
2564 for (i = 0; i < pages_in_pl; i++) {
2565 if (!upl_valid_page(pl, i))
2566 break;
2567 }
2568 if (i == pages_in_pl)
2569 break;
2570
2571 /*
2572 * didn't get all the pages back that we
2573 * needed... release this upl and try again
2574 */
2575 ubc_upl_abort(upl, 0);
2576 }
2577 if (force_data_sync >= 3) {
2578 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2579 i, pages_in_pl, upl_size, kret, 0);
2580 /*
2581 * for some reason, we couldn't acquire a hold on all
2582 * the pages needed in the user's address space
2583 *
2584 * we may have already spun some portion of this request
2585 * off as async requests... we need to wait for the I/O
2586 * to complete before returning
2587 */
2588 goto wait_for_dwrites;
2589 }
2590
2591 /*
2592 * Consider the possibility that upl_size wasn't satisfied.
2593 */
2594 if (upl_size < upl_needed_size) {
2595 if (upl_size && upl_offset == 0)
2596 io_size = upl_size;
2597 else
2598 io_size = 0;
2599 }
2600 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2601 (int)upl_offset, upl_size, (int)iov_base, io_size, 0);
2602
2603 if (io_size == 0) {
2604 ubc_upl_abort(upl, 0);
2605 /*
2606 * we may have already spun some portion of this request
2607 * off as async requests... we need to wait for the I/O
2608 * to complete before returning
2609 */
2610 goto wait_for_dwrites;
2611 }
2612
2613 if(useVectorUPL) {
2614 vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
2615 if(end_off)
2616 issueVectorUPL = 1;
2617 /*
2618 * After this point, if we are using a vector UPL, then
2619 * either all the UPL elements end on a page boundary OR
2620 * this UPL is the last element because it does not end
2621 * on a page boundary.
2622 */
2623 }
2624
2625 /*
2626		 * we want to push out these writes asynchronously so that we can overlap
2627		 * the preparation of the next I/O...
2628		 * if there are already too many outstanding writes,
2629		 * wait until some complete before issuing the next
2630 */
2631 if (vp->v_mount->mnt_minsaturationbytecount)
2632 bytes_outstanding_limit = vp->v_mount->mnt_minsaturationbytecount;
2633 else
2634 bytes_outstanding_limit = max_upl_size * IO_SCALE(vp, 2);
2635
2636 cluster_iostate_wait(&iostate, bytes_outstanding_limit, "cluster_write_direct");
2637
2638 if (iostate.io_error) {
2639 /*
2640 * one of the earlier writes we issued ran into a hard error
2641 * don't issue any more writes, cleanup the UPL
2642 * that was just created but not used, then
2643 * go wait for all writes that are part of this stream
2644 * to complete before returning the error to the caller
2645 */
2646 ubc_upl_abort(upl, 0);
2647
2648 goto wait_for_dwrites;
2649 }
2650
2651 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
2652 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
2653
2654 if(!useVectorUPL)
2655 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
2656 io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2657
2658 else {
2659 if(!vector_upl_index) {
2660 vector_upl = vector_upl_create(upl_offset);
2661 v_upl_uio_offset = uio->uio_offset;
2662 vector_upl_offset = upl_offset;
2663 }
2664
2665 vector_upl_set_subupl(vector_upl,upl,upl_size);
2666 vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
2667 vector_upl_index++;
2668 vector_upl_iosize += io_size;
2669 vector_upl_size += upl_size;
2670
2671 if(issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
2672 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2673 reset_vector_run_state();
2674 }
2675 }
2676
2677 /*
2678 * update the uio structure to
2679 * reflect the I/O that we just issued
2680 */
2681 uio_update(uio, (user_size_t)io_size);
2682
2683 /*
2684 * in case we end up calling through to cluster_write_copy to finish
2685 * the tail of this request, we need to update the oldEOF so that we
2686 * don't zero-fill the head of a page if we've successfully written
2687 * data to that area... 'cluster_write_copy' will zero-fill the head of a
2688 * page that is beyond the oldEOF if the write is unaligned... we only
2689 * want that to happen for the very first page of the cluster_write,
2690 * NOT the first page of each vector making up a multi-vector write.
2691 */
2692 if (uio->uio_offset > oldEOF)
2693 oldEOF = uio->uio_offset;
2694
2695 io_req_size -= io_size;
2696
2697 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
2698 (int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0);
2699
2700 } /* end while */
2701
2702 if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) {
2703
2704 retval = cluster_io_type(uio, write_type, write_length, MIN_DIRECT_WRITE_SIZE);
2705
2706 if (retval == 0 && *write_type == IO_DIRECT) {
2707
2708 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_NONE,
2709 (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
2710
2711 goto next_dwrite;
2712 }
2713 }
2714
2715wait_for_dwrites:
2716
2717 if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
2718 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2719 reset_vector_run_state();
2720 }
2721 /*
2722 * make sure all async writes issued as part of this stream
2723 * have completed before we return
2724 */
2725 cluster_iostate_wait(&iostate, 0, "cluster_write_direct");
2726
2727 if (iostate.io_error)
2728 retval = iostate.io_error;
2729
2730 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
2731
2732 if (io_throttled == TRUE && retval == 0)
2733 retval = EAGAIN;
2734
2735 if (io_req_size && retval == 0) {
2736 /*
2737 * we couldn't handle the tail of this request in DIRECT mode
2738 * so fire it through the copy path
2739 *
2740 * note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set
2741 * so we can just pass 0 in for the headOff and tailOff
2742 */
2743 if (uio->uio_offset > oldEOF)
2744 oldEOF = uio->uio_offset;
2745
2746 retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg);
2747
2748 *write_type = IO_UNKNOWN;
2749 }
2750 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
2751 (int)uio->uio_offset, io_req_size, retval, 4, 0);
2752
2753 return (retval);
2754}
2755
2756
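/*
 * write path for a source buffer that is physically contiguous (e.g. device
 * memory)... any head or tail that isn't device-block aligned is handled via
 * cluster_align_phys_io, and the aligned middle is issued asynchronously in
 * MAX_IO_CONTIG_SIZE chunks with CL_DEV_MEMORY set, against at most
 * MAX_VECTS wired UPLs
 */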
2757static int
2758cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length,
2759 int (*callback)(buf_t, void *), void *callback_arg, int bflag)
2760{
2761 upl_page_info_t *pl;
2762 addr64_t src_paddr = 0;
2763 upl_t upl[MAX_VECTS];
2764 vm_offset_t upl_offset;
2765 u_int32_t tail_size = 0;
2766 u_int32_t io_size;
2767 u_int32_t xsize;
2768 upl_size_t upl_size;
2769 vm_size_t upl_needed_size;
2770 mach_msg_type_number_t pages_in_pl;
2771 upl_control_flags_t upl_flags;
2772 kern_return_t kret;
2773 struct clios iostate;
2774 int error = 0;
2775 int cur_upl = 0;
2776 int num_upl = 0;
2777 int n;
2778 user_addr_t iov_base;
2779 u_int32_t devblocksize;
2780 u_int32_t mem_alignment_mask;
2781
2782 /*
2783 * When we enter this routine, we know
2784 * -- the io_req_size will not exceed iov_len
2785 * -- the target address is physically contiguous
2786 */
2787 cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
2788
2789 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
2790 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
2791
2792 iostate.io_completed = 0;
2793 iostate.io_issued = 0;
2794 iostate.io_error = 0;
2795 iostate.io_wanted = 0;
2796
2797 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
2798
2799next_cwrite:
2800 io_size = *write_length;
2801
2802 iov_base = uio_curriovbase(uio);
2803
2804 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
2805 upl_needed_size = upl_offset + io_size;
2806
2807 pages_in_pl = 0;
2808 upl_size = upl_needed_size;
2809 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
2810 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
2811
2812 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
2813 kret = vm_map_get_upl(map,
2814 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2815 &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);
2816
2817 if (kret != KERN_SUCCESS) {
2818 /*
2819 * failed to get pagelist
2820 */
2821 error = EINVAL;
2822 goto wait_for_cwrites;
2823 }
2824 num_upl++;
2825
2826 /*
2827 * Consider the possibility that upl_size wasn't satisfied.
2828 */
2829 if (upl_size < upl_needed_size) {
2830 /*
2831 * This is a failure in the physical memory case.
2832 */
2833 error = EINVAL;
2834 goto wait_for_cwrites;
2835 }
2836 pl = ubc_upl_pageinfo(upl[cur_upl]);
2837
2838 src_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
2839
2840 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
2841 u_int32_t head_size;
2842
2843 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
2844
2845 if (head_size > io_size)
2846 head_size = io_size;
2847
2848 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0, callback, callback_arg);
2849
2850 if (error)
2851 goto wait_for_cwrites;
2852
2853 upl_offset += head_size;
2854 src_paddr += head_size;
2855 io_size -= head_size;
2856
2857 iov_base += head_size;
2858 }
2859 if ((u_int32_t)iov_base & mem_alignment_mask) {
2860 /*
2861		 * the request isn't aligned to a memory boundary
2862		 * that the underlying DMA engine can handle...
2863 * return an error instead of going through
2864 * the slow copy path since the intent of this
2865 * path is direct I/O from device memory
2866 */
2867 error = EINVAL;
2868 goto wait_for_cwrites;
2869 }
2870
2871 tail_size = io_size & (devblocksize - 1);
2872 io_size -= tail_size;
2873
2874 while (io_size && error == 0) {
2875
2876 if (io_size > MAX_IO_CONTIG_SIZE)
2877 xsize = MAX_IO_CONTIG_SIZE;
2878 else
2879 xsize = io_size;
2880 /*
2881 * request asynchronously so that we can overlap
2882 * the preparation of the next I/O... we'll do
2883 * the commit after all the I/O has completed
2884			 * since it's all issued against the same UPL...
2885			 * if there are already too many outstanding writes,
2886			 * wait until some have completed before issuing the next
2887 */
2888 cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig");
2889
2890 if (iostate.io_error) {
2891 /*
2892 * one of the earlier writes we issued ran into a hard error
2893 * don't issue any more writes...
2894 * go wait for all writes that are part of this stream
2895 * to complete before returning the error to the caller
2896 */
2897 goto wait_for_cwrites;
2898 }
2899 /*
2900 * issue an asynchronous write to cluster_io
2901 */
2902 error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset,
2903 xsize, CL_DEV_MEMORY | CL_ASYNC | bflag, (buf_t)NULL, (struct clios *)&iostate, callback, callback_arg);
2904
2905 if (error == 0) {
2906 /*
2907 * The cluster_io write completed successfully,
2908 * update the uio structure
2909 */
2910 uio_update(uio, (user_size_t)xsize);
2911
2912 upl_offset += xsize;
2913 src_paddr += xsize;
2914 io_size -= xsize;
2915 }
2916 }
2917 if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) {
2918
2919 error = cluster_io_type(uio, write_type, write_length, 0);
2920
2921 if (error == 0 && *write_type == IO_CONTIG) {
2922 cur_upl++;
2923 goto next_cwrite;
2924 }
2925 } else
2926 *write_type = IO_UNKNOWN;
2927
2928wait_for_cwrites:
2929 /*
2930 * make sure all async writes that are part of this stream
2931 * have completed before we proceed
2932 */
2933 cluster_iostate_wait(&iostate, 0, "cluster_write_contig");
2934
2935 if (iostate.io_error)
2936 error = iostate.io_error;
2937
2938 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
2939
2940 if (error == 0 && tail_size)
2941 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg);
2942
2943 for (n = 0; n < num_upl; n++)
2944 /*
2945 * just release our hold on each physically contiguous
2946 * region without changing any state
2947 */
2948 ubc_upl_abort(upl[n], 0);
2949
2950 return (error);
2951}
2952
2953
2954/*
2955 * need to avoid a race between an msync of a range of pages dirtied via mmap
2956 * vs a filesystem such as HFS deciding to write a 'hole' to disk via cluster_write's
2957 * zerofill mechanism before it has seen the VNOP_PAGEOUTs for the pages being msync'd
2958 *
2959 * we should never force-zero-fill pages that are already valid in the cache...
2960 * the entire page contains valid data (either from disk, zero-filled or dirtied
2961 * via an mmap) so we can only do damage by trying to zero-fill
2962 *
2963 */
2964static int
2965cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off_t zero_off, off_t upl_f_offset, int bytes_to_zero)
2966{
2967 int zero_pg_index;
2968 boolean_t need_cluster_zero = TRUE;
2969
2970 if ((flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
2971
2972 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
2973 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
2974
2975 if (upl_valid_page(pl, zero_pg_index)) {
2976 /*
2977 * never force zero valid pages - dirty or clean
2978 * we'll leave these in the UPL for cluster_write_copy to deal with
2979 */
2980 need_cluster_zero = FALSE;
2981 }
2982 }
2983 if (need_cluster_zero == TRUE)
2984 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2985
2986 return (bytes_to_zero);
2987}
2988
2989
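/*
 * record a page-aligned range of dirtied pages in the vnode's write-behind
 * clustering state... this is the externally visible wrapper around
 * cluster_update_state_internal; callers supply page-aligned byte offsets
 * and indicate whether the update is VM initiated
 */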
2990void
2991cluster_update_state(vnode_t vp, vm_object_offset_t s_offset, vm_object_offset_t e_offset, boolean_t vm_initiated)
2992{
2993 struct cl_extent cl;
2994 boolean_t first_pass = TRUE;
2995
2996 assert(s_offset < e_offset);
2997 assert((s_offset & PAGE_MASK_64) == 0);
2998 assert((e_offset & PAGE_MASK_64) == 0);
2999
3000 cl.b_addr = (daddr64_t)(s_offset / PAGE_SIZE_64);
3001 cl.e_addr = (daddr64_t)(e_offset / PAGE_SIZE_64);
3002
3003 cluster_update_state_internal(vp, &cl, 0, TRUE, &first_pass, s_offset, (int)(e_offset - s_offset),
3004 vp->v_un.vu_ubcinfo->ui_size, NULL, NULL, vm_initiated);
3005}
3006
3007
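/*
 * core of the delayed-write (write-behind) clustering... under the
 * write-behind lock the extent is either added to the sparse cluster map,
 * merged into one of the existing clusters (growing or clipping it as
 * needed), recorded as a new cluster if a slot is free, or, when all the
 * cluster slots are full and none can be pushed, the state is switched
 * over to the sparse cluster mechanism... also maintains the sequential
 * write bookkeeping (cl_last_write / cl_seq_written) used to decide when
 * to start pushing clusters early
 */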
3008static void
3009cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes,
3010 boolean_t *first_pass, off_t write_off, int write_cnt, off_t newEOF,
3011 int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
3012{
3013 struct cl_writebehind *wbp;
3014 int cl_index;
3015 int ret_cluster_try_push;
3016 u_int max_cluster_pgcount;
3017
3018
3019 max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
3020
3021 /*
3022 * take the lock to protect our accesses
3023 * of the writebehind and sparse cluster state
3024 */
3025 wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
3026
3027 if (wbp->cl_scmap) {
3028
3029 if ( !(flags & IO_NOCACHE)) {
3030 /*
3031 * we've fallen into the sparse
3032 * cluster method of delaying dirty pages
3033 */
3034 sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);
3035
3036 lck_mtx_unlock(&wbp->cl_lockw);
3037 return;
3038 }
3039 /*
3040 * must have done cached writes that fell into
3041 * the sparse cluster mechanism... we've switched
3042 * to uncached writes on the file, so go ahead
3043 * and push whatever's in the sparse map
3044 * and switch back to normal clustering
3045 */
3046 wbp->cl_number = 0;
3047
3048 sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg, vm_initiated);
3049 /*
3050 * no clusters of either type present at this point
3051 * so just go directly to start_new_cluster since
3052 * we know we need to delay this I/O since we've
3053 * already released the pages back into the cache
3054 * to avoid the deadlock with sparse_cluster_push
3055 */
3056 goto start_new_cluster;
3057 }
3058 if (*first_pass == TRUE) {
3059 if (write_off == wbp->cl_last_write)
3060 wbp->cl_seq_written += write_cnt;
3061 else
3062 wbp->cl_seq_written = write_cnt;
3063
3064 wbp->cl_last_write = write_off + write_cnt;
3065
3066 *first_pass = FALSE;
3067 }
3068 if (wbp->cl_number == 0)
3069 /*
3070 * no clusters currently present
3071 */
3072 goto start_new_cluster;
3073
3074 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
3075 /*
3076 * check each cluster that we currently hold
3077 * try to merge some or all of this write into
3078 * one or more of the existing clusters... if
3079 * any portion of the write remains, start a
3080 * new cluster
3081 */
3082 if (cl->b_addr >= wbp->cl_clusters[cl_index].b_addr) {
3083 /*
3084 * the current write starts at or after the current cluster
3085 */
3086 if (cl->e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
3087 /*
3088 * we have a write that fits entirely
3089 * within the existing cluster limits
3090 */
3091 if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr)
3092 /*
3093 * update our idea of where the cluster ends
3094 */
3095 wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
3096 break;
3097 }
3098 if (cl->b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
3099 /*
3100 * we have a write that starts in the middle of the current cluster
3101 * but extends beyond the cluster's limit... we know this because
3102 * of the previous checks
3103 * we'll extend the current cluster to the max
3104 * and update the b_addr for the current write to reflect that
3105 * the head of it was absorbed into this cluster...
3106 * note that we'll always have a leftover tail in this case since
3107				 * full absorption would have occurred in the clause above
3108 */
3109 wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount;
3110
3111 cl->b_addr = wbp->cl_clusters[cl_index].e_addr;
3112 }
3113 /*
3114 * we come here for the case where the current write starts
3115 * beyond the limit of the existing cluster or we have a leftover
3116			 * tail after a partial absorption
3117 *
3118 * in either case, we'll check the remaining clusters before
3119 * starting a new one
3120 */
3121 } else {
3122 /*
3123 * the current write starts in front of the cluster we're currently considering
3124 */
3125 if ((wbp->cl_clusters[cl_index].e_addr - cl->b_addr) <= max_cluster_pgcount) {
3126 /*
3127 * we can just merge the new request into
3128 * this cluster and leave it in the cache
3129 * since the resulting cluster is still
3130 * less than the maximum allowable size
3131 */
3132 wbp->cl_clusters[cl_index].b_addr = cl->b_addr;
3133
3134 if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
3135 /*
3136 * the current write completely
3137 * envelops the existing cluster and since
3138 * each write is limited to at most max_cluster_pgcount pages
3139 * we can just use the start and last blocknos of the write
3140 * to generate the cluster limits
3141 */
3142 wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
3143 }
3144 break;
3145 }
3146 /*
3147 * if we were to combine this write with the current cluster
3148 * we would exceed the cluster size limit.... so,
3149 * let's see if there's any overlap of the new I/O with
3150 * the cluster we're currently considering... in fact, we'll
3151			 * stretch the cluster out to its full limit and see if we
3152 * get an intersection with the current write
3153 *
3154 */
3155 if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) {
3156 /*
3157				 * the current write extends into the proposed cluster...
3158				 * clip the length of the current write after first combining its
3159 * tail with the newly shaped cluster
3160 */
3161 wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount;
3162
3163 cl->e_addr = wbp->cl_clusters[cl_index].b_addr;
3164 }
3165 /*
3166 * if we get here, there was no way to merge
3167 * any portion of this write with this cluster
3168 * or we could only merge part of it which
3169 * will leave a tail...
3170 * we'll check the remaining clusters before starting a new one
3171 */
3172 }
3173 }
3174 if (cl_index < wbp->cl_number)
3175 /*
3176		 * we found an existing cluster (or clusters) into which
3177		 * this I/O could be entirely merged
3178 */
3179 goto delay_io;
3180
3181 if (defer_writes == FALSE &&
3182 wbp->cl_number == MAX_CLUSTERS &&
3183 wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) {
3184 uint32_t n;
3185
3186 if (vp->v_mount->mnt_minsaturationbytecount) {
3187 n = vp->v_mount->mnt_minsaturationbytecount / MAX_CLUSTER_SIZE(vp);
3188
3189 if (n > MAX_CLUSTERS)
3190 n = MAX_CLUSTERS;
3191 } else
3192 n = 0;
3193
3194 if (n == 0) {
3195 if (disk_conditioner_mount_is_ssd(vp->v_mount))
3196 n = WRITE_BEHIND_SSD;
3197 else
3198 n = WRITE_BEHIND;
3199 }
3200 while (n--)
3201 cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg, NULL, vm_initiated);
3202 }
3203 if (wbp->cl_number < MAX_CLUSTERS) {
3204 /*
3205 * we didn't find an existing cluster to
3206 * merge into, but there's room to start
3207 * a new one
3208 */
3209 goto start_new_cluster;
3210 }
3211 /*
3212	 * no existing cluster to merge with and no
3213 * room to start a new one... we'll try
3214 * pushing one of the existing ones... if none of
3215 * them are able to be pushed, we'll switch
3216 * to the sparse cluster mechanism
3217 * cluster_try_push updates cl_number to the
3218 * number of remaining clusters... and
3219 * returns the number of currently unused clusters
3220 */
3221 ret_cluster_try_push = 0;
3222
3223 /*
3224 * if writes are not deferred, call cluster push immediately
3225 */
3226 if (defer_writes == FALSE) {
3227
3228 ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg, NULL, vm_initiated);
3229 }
3230 /*
3231 * execute following regardless of writes being deferred or not
3232 */
3233 if (ret_cluster_try_push == 0) {
3234 /*
3235 * no more room in the normal cluster mechanism
3236 * so let's switch to the more expansive but expensive
3237 * sparse mechanism....
3238 */
3239 sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg, vm_initiated);
3240 sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);
3241
3242 lck_mtx_unlock(&wbp->cl_lockw);
3243 return;
3244 }
3245start_new_cluster:
3246 wbp->cl_clusters[wbp->cl_number].b_addr = cl->b_addr;
3247 wbp->cl_clusters[wbp->cl_number].e_addr = cl->e_addr;
3248
3249 wbp->cl_clusters[wbp->cl_number].io_flags = 0;
3250
3251 if (flags & IO_NOCACHE)
3252 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE;
3253
3254 if (flags & IO_PASSIVE)
3255 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE;
3256
3257 wbp->cl_number++;
3258delay_io:
3259 lck_mtx_unlock(&wbp->cl_lockw);
3260 return;
3261}
3262
3263
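/*
 * buffered write path... builds a UPL over the affected range of the file,
 * pre-reads any partially valid edge pages the write doesn't fully cover,
 * zero fills the requested head/tail ranges, copies the uio data into the
 * pages and then either pushes the I/O or records the dirtied extent with
 * the write-behind machinery (see check_cluster below)... also called with
 * a NULL uio purely to zero fill a range (IO_HEADZEROFILL / IO_TAILZEROFILL)
 */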
3264static int
3265cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff,
3266 off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg)
3267{
3268 upl_page_info_t *pl;
3269 upl_t upl;
3270 vm_offset_t upl_offset = 0;
3271 vm_size_t upl_size;
3272 off_t upl_f_offset;
3273 int pages_in_upl;
3274 int start_offset;
3275 int xfer_resid;
3276 int io_size;
3277 int io_offset;
3278 int bytes_to_zero;
3279 int bytes_to_move;
3280 kern_return_t kret;
3281 int retval = 0;
3282 int io_resid;
3283 long long total_size;
3284 long long zero_cnt;
3285 off_t zero_off;
3286 long long zero_cnt1;
3287 off_t zero_off1;
3288 off_t write_off = 0;
3289 int write_cnt = 0;
3290 boolean_t first_pass = FALSE;
3291 struct cl_extent cl;
3292 int bflag;
3293 u_int max_io_size;
3294
3295 if (uio) {
3296 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
3297 (int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0);
3298
3299 io_resid = io_req_size;
3300 } else {
3301 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
3302 0, 0, (int)oldEOF, (int)newEOF, 0);
3303
3304 io_resid = 0;
3305 }
3306 if (flags & IO_PASSIVE)
3307 bflag = CL_PASSIVE;
3308 else
3309 bflag = 0;
3310 if (flags & IO_NOCACHE)
3311 bflag |= CL_NOCACHE;
3312
3313 if (flags & IO_SKIP_ENCRYPTION)
3314 bflag |= CL_ENCRYPTED;
3315
3316 zero_cnt = 0;
3317 zero_cnt1 = 0;
3318 zero_off = 0;
3319 zero_off1 = 0;
3320
3321 max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
3322
3323 if (flags & IO_HEADZEROFILL) {
3324 /*
3325 * some filesystems (HFS is one) don't support unallocated holes within a file...
3326 * so we zero fill the intervening space between the old EOF and the offset
3327 * where the next chunk of real data begins.... ftruncate will also use this
3328 * routine to zero fill to the new EOF when growing a file... in this case, the
3329 * uio structure will not be provided
3330 */
3331 if (uio) {
3332 if (headOff < uio->uio_offset) {
3333 zero_cnt = uio->uio_offset - headOff;
3334 zero_off = headOff;
3335 }
3336 } else if (headOff < newEOF) {
3337 zero_cnt = newEOF - headOff;
3338 zero_off = headOff;
3339 }
3340 } else {
3341 if (uio && uio->uio_offset > oldEOF) {
3342 zero_off = uio->uio_offset & ~PAGE_MASK_64;
3343
3344 if (zero_off >= oldEOF) {
3345 zero_cnt = uio->uio_offset - zero_off;
3346
3347 flags |= IO_HEADZEROFILL;
3348 }
3349 }
3350 }
3351 if (flags & IO_TAILZEROFILL) {
3352 if (uio) {
3353 zero_off1 = uio->uio_offset + io_req_size;
3354
3355 if (zero_off1 < tailOff)
3356 zero_cnt1 = tailOff - zero_off1;
3357 }
3358 } else {
3359 if (uio && newEOF > oldEOF) {
3360 zero_off1 = uio->uio_offset + io_req_size;
3361
3362 if (zero_off1 == newEOF && (zero_off1 & PAGE_MASK_64)) {
3363 zero_cnt1 = PAGE_SIZE_64 - (zero_off1 & PAGE_MASK_64);
3364
3365 flags |= IO_TAILZEROFILL;
3366 }
3367 }
3368 }
3369 if (zero_cnt == 0 && uio == (struct uio *) 0) {
3370 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
3371 retval, 0, 0, 0, 0);
3372 return (0);
3373 }
3374 if (uio) {
3375 write_off = uio->uio_offset;
3376 write_cnt = uio_resid(uio);
3377 /*
3378 * delay updating the sequential write info
3379 * in the control block until we've obtained
3380 * the lock for it
3381 */
3382 first_pass = TRUE;
3383 }
3384 while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
3385 /*
3386 * for this iteration of the loop, figure out where our starting point is
3387 */
3388 if (zero_cnt) {
3389 start_offset = (int)(zero_off & PAGE_MASK_64);
3390 upl_f_offset = zero_off - start_offset;
3391 } else if (io_resid) {
3392 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3393 upl_f_offset = uio->uio_offset - start_offset;
3394 } else {
3395 start_offset = (int)(zero_off1 & PAGE_MASK_64);
3396 upl_f_offset = zero_off1 - start_offset;
3397 }
3398 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
3399 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
3400
3401 if (total_size > max_io_size)
3402 total_size = max_io_size;
3403
3404 cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
3405
3406 if (uio && ((flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
3407 /*
3408 * assumption... total_size <= io_resid
3409 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
3410 */
3411 if ((start_offset + total_size) > max_io_size)
3412 total_size = max_io_size - start_offset;
3413 xfer_resid = total_size;
3414
3415 retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1);
3416
3417 if (retval)
3418 break;
3419
3420 io_resid -= (total_size - xfer_resid);
3421 total_size = xfer_resid;
3422 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3423 upl_f_offset = uio->uio_offset - start_offset;
3424
3425 if (total_size == 0) {
3426 if (start_offset) {
3427 /*
3428 * the write did not finish on a page boundary
3429 * which will leave upl_f_offset pointing to the
3430 * beginning of the last page written instead of
3431 * the page beyond it... bump it in this case
3432 * so that the cluster code records the last page
3433 * written as dirty
3434 */
3435 upl_f_offset += PAGE_SIZE_64;
3436 }
3437 upl_size = 0;
3438
3439 goto check_cluster;
3440 }
3441 }
3442 /*
3443 * compute the size of the upl needed to encompass
3444 * the requested write... limit each call to cluster_io
3445 * to the maximum UPL size... cluster_io will clip if
3446		 * this exceeds the maximum io_size for the device...
3447 * make sure to account for
3448 * a starting offset that's not page aligned
3449 */
3450 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3451
3452 if (upl_size > max_io_size)
3453 upl_size = max_io_size;
3454
3455 pages_in_upl = upl_size / PAGE_SIZE;
3456 io_size = upl_size - start_offset;
3457
3458 if ((long long)io_size > total_size)
3459 io_size = total_size;
3460
3461 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
3462
3463
3464 /*
3465 * Gather the pages from the buffer cache.
3466 * The UPL_WILL_MODIFY flag lets the UPL subsystem know
3467 * that we intend to modify these pages.
3468 */
3469 kret = ubc_create_upl_kernel(vp,
3470 upl_f_offset,
3471 upl_size,
3472 &upl,
3473 &pl,
3474 UPL_SET_LITE | (( uio!=NULL && (uio->uio_flags & UIO_FLAGS_IS_COMPRESSED_FILE)) ? 0 : UPL_WILL_MODIFY),
3475 VM_KERN_MEMORY_FILE);
3476 if (kret != KERN_SUCCESS)
3477 panic("cluster_write_copy: failed to get pagelist");
3478
3479 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
3480 upl, (int)upl_f_offset, start_offset, 0, 0);
3481
3482 if (start_offset && upl_f_offset < oldEOF && !upl_valid_page(pl, 0)) {
3483 int read_size;
3484
3485 /*
3486 * we're starting in the middle of the first page of the upl
3487 * and the page isn't currently valid, so we're going to have
3488 * to read it in first... this is a synchronous operation
3489 */
3490 read_size = PAGE_SIZE;
3491
3492 if ((upl_f_offset + read_size) > oldEOF)
3493 read_size = oldEOF - upl_f_offset;
3494
3495 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
3496 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
3497 if (retval) {
3498 /*
3499 * we had an error during the read which causes us to abort
3500 * the current cluster_write request... before we do, we need
3501 * to release the rest of the pages in the upl without modifying
3502				 * their state and mark the failed page in error
3503 */
3504 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY);
3505
3506 if (upl_size > PAGE_SIZE)
3507 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3508
3509 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3510 upl, 0, 0, retval, 0);
3511 break;
3512 }
3513 }
3514 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
3515 /*
3516 * the last offset we're writing to in this upl does not end on a page
3517 * boundary... if it's not beyond the old EOF, then we'll also need to
3518 * pre-read this page in if it isn't already valid
3519 */
3520 upl_offset = upl_size - PAGE_SIZE;
3521
3522 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
3523 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
3524 int read_size;
3525
3526 read_size = PAGE_SIZE;
3527
3528 if ((off_t)(upl_f_offset + upl_offset + read_size) > oldEOF)
3529 read_size = oldEOF - (upl_f_offset + upl_offset);
3530
3531 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
3532 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
3533 if (retval) {
3534 /*
3535 * we had an error during the read which causes us to abort
3536 * the current cluster_write request... before we do, we
3537 * need to release the rest of the pages in the upl without
3538					 * modifying their state and mark the failed page in error
3539 */
3540 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY);
3541
3542 if (upl_size > PAGE_SIZE)
3543 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3544
3545 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3546 upl, 0, 0, retval, 0);
3547 break;
3548 }
3549 }
3550 }
3551 xfer_resid = io_size;
3552 io_offset = start_offset;
3553
3554 while (zero_cnt && xfer_resid) {
3555
3556 if (zero_cnt < (long long)xfer_resid)
3557 bytes_to_zero = zero_cnt;
3558 else
3559 bytes_to_zero = xfer_resid;
3560
3561 bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off, upl_f_offset, bytes_to_zero);
3562
3563 xfer_resid -= bytes_to_zero;
3564 zero_cnt -= bytes_to_zero;
3565 zero_off += bytes_to_zero;
3566 io_offset += bytes_to_zero;
3567 }
3568 if (xfer_resid && io_resid) {
3569 u_int32_t io_requested;
3570
3571 bytes_to_move = min(io_resid, xfer_resid);
3572 io_requested = bytes_to_move;
3573
3574 retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested);
3575
3576 if (retval) {
3577 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3578
3579 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3580 upl, 0, 0, retval, 0);
3581 } else {
3582 io_resid -= bytes_to_move;
3583 xfer_resid -= bytes_to_move;
3584 io_offset += bytes_to_move;
3585 }
3586 }
3587 while (xfer_resid && zero_cnt1 && retval == 0) {
3588
3589 if (zero_cnt1 < (long long)xfer_resid)
3590 bytes_to_zero = zero_cnt1;
3591 else
3592 bytes_to_zero = xfer_resid;
3593
3594 bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off1, upl_f_offset, bytes_to_zero);
3595
3596 xfer_resid -= bytes_to_zero;
3597 zero_cnt1 -= bytes_to_zero;
3598 zero_off1 += bytes_to_zero;
3599 io_offset += bytes_to_zero;
3600 }
3601 if (retval == 0) {
3602 int do_zeroing = 1;
3603
3604 io_size += start_offset;
3605
3606 /* Force more restrictive zeroing behavior only on APFS */
3607 if ((vnode_tag(vp) == VT_APFS) && (newEOF < oldEOF)) {
3608 do_zeroing = 0;
3609 }
3610
3611 if (do_zeroing && (upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) {
3612
3613 /*
3614 * if we're extending the file with this write
3615 * we'll zero fill the rest of the page so that
3616 * if the file gets extended again in such a way as to leave a
 * hole starting at this EOF, we'll have zeros in the correct spot
3618 */
3619 cluster_zero(upl, io_size, upl_size - io_size, NULL);
3620 }
3621 /*
3622 * release the upl now if we hold one since...
3623 * 1) pages in it may be present in the sparse cluster map
3624 * and may span 2 separate buckets there... if they do and
3625 * we happen to have to flush a bucket to make room and it intersects
3626 * this upl, a deadlock may result on page BUSY
3627 * 2) we're delaying the I/O... from this point forward we're just updating
3628 * the cluster state... no need to hold the pages, so commit them
3629 * 3) IO_SYNC is set...
 * because we had to ask for a UPL that provides currently non-present pages, the
3631 * UPL has been automatically set to clear the dirty flags (both software and hardware)
3632 * upon committing it... this is not the behavior we want since it's possible for
3633 * pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
3634 * we'll pick these pages back up later with the correct behavior specified.
3635 * 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... if a flush
3636 * of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages
3637 * we hold since the flushing context is holding the cluster lock.
3638 */
3639 ubc_upl_commit_range(upl, 0, upl_size,
3640 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
3641check_cluster:
3642 /*
3643 * calculate the last logical block number
3644 * that this delayed I/O encompassed
3645 */
3646 cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
3647
3648 if (flags & IO_SYNC) {
3649 /*
 * if the IO_SYNC flag is set then we need to bypass
3651 * any clustering and immediately issue the I/O
3652 *
3653 * we don't hold the lock at this point
3654 *
3655 * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set
3656 * so that we correctly deal with a change in state of the hardware modify bit...
3657 * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force
3658 * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also
3659 * responsible for generating the correct sized I/O(s)
3660 */
3661 retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg, FALSE);
3662 } else {
3663 boolean_t defer_writes = FALSE;
3664
3665 if (vfs_flags(vp->v_mount) & MNT_DEFWRITE)
3666 defer_writes = TRUE;
3667
3668 cluster_update_state_internal(vp, &cl, flags, defer_writes, &first_pass,
3669 write_off, write_cnt, newEOF, callback, callback_arg, FALSE);
3670 }
3671 }
3672 }
3673 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0);
3674
3675 return (retval);
3676}
3677
3678
3679
3680int
3681cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
3682{
3683 return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL);
3684}
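
/*
 * illustrative caller sketch (hypothetical filesystem code, not part of
 * this file): a filesystem's read path would typically hand its uio, the
 * vnode's current EOF and any IO_* flags straight through, e.g.
 *
 *	error = cluster_read(vp, uio, examplefs_get_eof(vp), ioflag);
 *
 * where examplefs_get_eof() is a made-up helper returning the file's
 * current size
 */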
3685
3686
3687int
3688cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (*callback)(buf_t, void *), void *callback_arg)
3689{
3690 int retval = 0;
3691 int flags;
3692 user_ssize_t cur_resid;
3693 u_int32_t io_size;
3694 u_int32_t read_length = 0;
3695 int read_type = IO_COPY;
3696
3697 flags = xflags;
3698
3699 if (vp->v_flag & VNOCACHE_DATA)
3700 flags |= IO_NOCACHE;
3701 if ((vp->v_flag & VRAOFF) || speculative_reads_disabled)
3702 flags |= IO_RAOFF;
3703
3704 if (flags & IO_SKIP_ENCRYPTION)
3705 flags |= IO_ENCRYPTED;
3706
3707 /*
3708 * do a read through the cache if one of the following is true....
3709 * NOCACHE is not true
3710 * the uio request doesn't target USERSPACE
3711 * Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well.
3712 * Reading encrypted data from a CP filesystem should never result in the data touching
3713 * the UBC.
3714 *
3715 * otherwise, find out if we want the direct or contig variant for
3716 * the first vector in the uio request
3717 */
3718 if ( ((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) || (flags & IO_ENCRYPTED) ) {
3719
3720 retval = cluster_io_type(uio, &read_type, &read_length, 0);
3721 }
3722
3723 while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) {
3724
3725 switch (read_type) {
3726
3727 case IO_COPY:
3728 /*
3729 * make sure the uio_resid isn't too big...
3730 * internally, we want to handle all of the I/O in
3731 * chunk sizes that fit in a 32 bit int
3732 */
3733 if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE))
3734 io_size = MAX_IO_REQUEST_SIZE;
3735 else
3736 io_size = (u_int32_t)cur_resid;
3737
3738 retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg);
3739 break;
3740
3741 case IO_DIRECT:
3742 retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg);
3743 break;
3744
3745 case IO_CONTIG:
3746 retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags);
3747 break;
3748
3749 case IO_UNKNOWN:
3750 retval = cluster_io_type(uio, &read_type, &read_length, 0);
3751 break;
3752 }
3753 }
3754 return (retval);
3755}
3756
3757
3758
3759static void
3760cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference)
3761{
3762 int range;
3763 int abort_flags = UPL_ABORT_FREE_ON_EMPTY;
3764
3765 if ((range = last_pg - start_pg)) {
3766 if (take_reference)
3767 abort_flags |= UPL_ABORT_REFERENCE;
3768
3769 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags);
3770 }
3771}
3772
3773
3774static int
3775cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
3776{
3777 upl_page_info_t *pl;
3778 upl_t upl;
3779 vm_offset_t upl_offset;
3780 u_int32_t upl_size;
3781 off_t upl_f_offset;
3782 int start_offset;
3783 int start_pg;
3784 int last_pg;
3785 int uio_last = 0;
3786 int pages_in_upl;
3787 off_t max_size;
3788 off_t last_ioread_offset;
3789 off_t last_request_offset;
3790 kern_return_t kret;
3791 int error = 0;
3792 int retval = 0;
3793 u_int32_t size_of_prefetch;
3794 u_int32_t xsize;
3795 u_int32_t io_size;
3796 u_int32_t max_rd_size;
3797 u_int32_t max_io_size;
3798 u_int32_t max_prefetch;
3799 u_int rd_ahead_enabled = 1;
3800 u_int prefetch_enabled = 1;
3801 struct cl_readahead * rap;
3802 struct clios iostate;
3803 struct cl_extent extent;
3804 int bflag;
3805 int take_reference = 1;
3806 int policy = IOPOL_DEFAULT;
3807 boolean_t iolock_inited = FALSE;
3808
3809 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
3810 (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0);
3811
3812 if (flags & IO_ENCRYPTED) {
3813 panic ("encrypted blocks will hit UBC!");
3814 }
3815
3816 policy = throttle_get_io_policy(NULL);
3817
3818 if (policy == THROTTLE_LEVEL_TIER3 || policy == THROTTLE_LEVEL_TIER2 || (flags & IO_NOCACHE))
3819 take_reference = 0;
3820
3821 if (flags & IO_PASSIVE)
3822 bflag = CL_PASSIVE;
3823 else
3824 bflag = 0;
3825
3826 if (flags & IO_NOCACHE)
3827 bflag |= CL_NOCACHE;
3828
3829 if (flags & IO_SKIP_ENCRYPTION)
3830 bflag |= CL_ENCRYPTED;
3831
3832 max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
3833 max_prefetch = MAX_PREFETCH(vp, max_io_size, disk_conditioner_mount_is_ssd(vp->v_mount));
3834 max_rd_size = max_prefetch;
3835
3836 last_request_offset = uio->uio_offset + io_req_size;
3837
3838 if (last_request_offset > filesize)
3839 last_request_offset = filesize;
3840
3841 if ((flags & (IO_RAOFF|IO_NOCACHE)) || ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
3842 rd_ahead_enabled = 0;
3843 rap = NULL;
3844 } else {
3845 if (cluster_is_throttled(vp)) {
3846 /*
3847 * we're in the throttle window, at the very least
3848 * we want to limit the size of the I/O we're about
3849 * to issue
3850 */
3851 rd_ahead_enabled = 0;
3852 prefetch_enabled = 0;
3853
3854 max_rd_size = THROTTLE_MAX_IOSIZE;
3855 }
3856 if ((rap = cluster_get_rap(vp)) == NULL)
3857 rd_ahead_enabled = 0;
3858 else {
3859 extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
3860 extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
3861 }
3862 }
3863 if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
3864 /*
3865 * determine if we already have a read-ahead in the pipe courtesy of the
 * last read system call that was issued...
 * if so, pick up its extent to determine where we should start
 * with respect to any read-ahead that might be necessary to
 * garner all the data needed to complete this read system call
3870 */
3871 last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
3872
3873 if (last_ioread_offset < uio->uio_offset)
3874 last_ioread_offset = (off_t)0;
3875 else if (last_ioread_offset > last_request_offset)
3876 last_ioread_offset = last_request_offset;
3877 } else
3878 last_ioread_offset = (off_t)0;
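/*
 * illustrative example (assumed values): if the previous read-ahead
 * reached page 9 (rap->cl_maxra == 9), last_ioread_offset starts out
 * at 10 * PAGE_SIZE... the first byte past the data already asked
 * for... it is then reset to 0 if that falls behind uio_offset, or
 * capped at last_request_offset
 */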
3879
3880 while (io_req_size && uio->uio_offset < filesize && retval == 0) {
3881
3882 max_size = filesize - uio->uio_offset;
3883
3884 if ((off_t)(io_req_size) < max_size)
3885 io_size = io_req_size;
3886 else
3887 io_size = max_size;
3888
3889 if (!(flags & IO_NOCACHE)) {
3890
3891 while (io_size) {
3892 u_int32_t io_resid;
3893 u_int32_t io_requested;
3894
3895 /*
3896 * if we keep finding the pages we need already in the cache, then
3897 * don't bother to call cluster_read_prefetch since it costs CPU cycles
3898 * to determine that we have all the pages we need... once we miss in
 * the cache and have issued an I/O, then we'll assume that we're likely
3900 * to continue to miss in the cache and it's to our advantage to try and prefetch
3901 */
3902 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
3903 if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
3904 /*
3905 * we've already issued I/O for this request and
3906 * there's still work to do and
3907 * our prefetch stream is running dry, so issue a
3908 * pre-fetch I/O... the I/O latency will overlap
3909 * with the copying of the data
3910 */
3911 if (size_of_prefetch > max_rd_size)
3912 size_of_prefetch = max_rd_size;
3913
3914 size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
3915
3916 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
3917
3918 if (last_ioread_offset > last_request_offset)
3919 last_ioread_offset = last_request_offset;
3920 }
3921 }
3922 /*
3923 * limit the size of the copy we're about to do so that
3924 * we can notice that our I/O pipe is running dry and
3925 * get the next I/O issued before it does go dry
3926 */
3927 if (last_ioread_offset && io_size > (max_io_size / 4))
3928 io_resid = (max_io_size / 4);
3929 else
3930 io_resid = io_size;
3931
3932 io_requested = io_resid;
3933
3934 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference);
3935
3936 xsize = io_requested - io_resid;
3937
3938 io_size -= xsize;
3939 io_req_size -= xsize;
3940
3941 if (retval || io_resid)
3942 /*
3943 * if we run into a real error or
3944 * a page that is not in the cache
3945 * we need to leave streaming mode
3946 */
3947 break;
3948
3949 if (rd_ahead_enabled && (io_size == 0 || last_ioread_offset == last_request_offset)) {
3950 /*
 * we've already finished the I/O for this read request
3952 * let's see if we should do a read-ahead
3953 */
3954 cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
3955 }
3956 }
3957 if (retval)
3958 break;
3959 if (io_size == 0) {
3960 if (rap != NULL) {
3961 if (extent.e_addr < rap->cl_lastr)
3962 rap->cl_maxra = 0;
3963 rap->cl_lastr = extent.e_addr;
3964 }
3965 break;
3966 }
3967 /*
3968 * recompute max_size since cluster_copy_ubc_data_internal
3969 * may have advanced uio->uio_offset
3970 */
3971 max_size = filesize - uio->uio_offset;
3972 }
3973
3974 iostate.io_completed = 0;
3975 iostate.io_issued = 0;
3976 iostate.io_error = 0;
3977 iostate.io_wanted = 0;
3978
3979 if ( (flags & IO_RETURN_ON_THROTTLE) ) {
3980 if (cluster_is_throttled(vp) == THROTTLE_NOW) {
3981 if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) {
3982 /*
3983 * we're in the throttle window and at least 1 I/O
3984 * has already been issued by a throttleable thread
3985 * in this window, so return with EAGAIN to indicate
3986 * to the FS issuing the cluster_read call that it
3987 * should now throttle after dropping any locks
3988 */
3989 throttle_info_update_by_mount(vp->v_mount);
3990
3991 retval = EAGAIN;
3992 break;
3993 }
3994 }
3995 }
3996
3997 /*
3998 * compute the size of the upl needed to encompass
3999 * the requested read... limit each call to cluster_io
4000 * to the maximum UPL size... cluster_io will clip if
 * this exceeds the maximum io_size for the device...
4002 * make sure to account for
4003 * a starting offset that's not page aligned
4004 */
4005 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
4006 upl_f_offset = uio->uio_offset - (off_t)start_offset;
4007
4008 if (io_size > max_rd_size)
4009 io_size = max_rd_size;
4010
4011 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
4012
4013 if (flags & IO_NOCACHE) {
4014 if (upl_size > max_io_size)
4015 upl_size = max_io_size;
4016 } else {
4017 if (upl_size > max_io_size / 4) {
4018 upl_size = max_io_size / 4;
4019 upl_size &= ~PAGE_MASK;
4020
4021 if (upl_size == 0)
4022 upl_size = PAGE_SIZE;
4023 }
4024 }
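/*
 * illustrative example (assumed values): with a 1 MB max_io_size, a
 * cached read builds UPLs of at most 256 KB (max_io_size / 4, trimmed
 * to a page multiple)... keeping the UPL small presumably lets the
 * copy-out of one chunk overlap the read-ahead of the next (see the
 * io_resid limiting above)
 */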
4025 pages_in_upl = upl_size / PAGE_SIZE;
4026
4027 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
4028 upl, (int)upl_f_offset, upl_size, start_offset, 0);
4029
4030 kret = ubc_create_upl_kernel(vp,
4031 upl_f_offset,
4032 upl_size,
4033 &upl,
4034 &pl,
4035 UPL_FILE_IO | UPL_SET_LITE,
4036 VM_KERN_MEMORY_FILE);
4037 if (kret != KERN_SUCCESS)
4038 panic("cluster_read_copy: failed to get pagelist");
4039
4040 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
4041 upl, (int)upl_f_offset, upl_size, start_offset, 0);
4042
4043 /*
4044 * scan from the beginning of the upl looking for the first
4045 * non-valid page.... this will become the first page in
4046 * the request we're going to make to 'cluster_io'... if all
4047 * of the pages are valid, we won't call through to 'cluster_io'
4048 */
4049 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
4050 if (!upl_valid_page(pl, start_pg))
4051 break;
4052 }
4053
4054 /*
4055 * scan from the starting invalid page looking for a valid
4056 * page before the end of the upl is reached, if we
4057 * find one, then it will be the last page of the request to
4058 * 'cluster_io'
4059 */
4060 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
4061 if (upl_valid_page(pl, last_pg))
4062 break;
4063 }
4064
4065 if (start_pg < last_pg) {
4066 /*
 * we found a range of 'invalid' pages that must be filled...
4068 * if the last page in this range is the last page of the file
4069 * we may have to clip the size of it to keep from reading past
4070 * the end of the last physical block associated with the file
4071 */
4072 if (iolock_inited == FALSE) {
4073 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
4074
4075 iolock_inited = TRUE;
4076 }
4077 upl_offset = start_pg * PAGE_SIZE;
4078 io_size = (last_pg - start_pg) * PAGE_SIZE;
4079
4080 if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize)
4081 io_size = filesize - (upl_f_offset + upl_offset);
4082
4083 /*
4084 * issue an asynchronous read to cluster_io
4085 */
4086
4087 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
4088 io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg);
4089
4090 if (rap) {
4091 if (extent.e_addr < rap->cl_maxra) {
4092 /*
4093 * we've just issued a read for a block that should have been
4094 * in the cache courtesy of the read-ahead engine... something
4095 * has gone wrong with the pipeline, so reset the read-ahead
4096 * logic which will cause us to restart from scratch
4097 */
4098 rap->cl_maxra = 0;
4099 }
4100 }
4101 }
4102 if (error == 0) {
4103 /*
4104 * if the read completed successfully, or there was no I/O request
 * issued, then copy the data into user land via 'cluster_copy_upl_data'...
 * we'll first add on any 'valid'
4107 * pages that were present in the upl when we acquired it.
4108 */
4109 u_int val_size;
4110
4111 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
4112 if (!upl_valid_page(pl, uio_last))
4113 break;
4114 }
4115 if (uio_last < pages_in_upl) {
4116 /*
4117 * there were some invalid pages beyond the valid pages
4118 * that we didn't issue an I/O for, just release them
 * unchanged now, so that any prefetch/read-ahead can
4120 * include them
4121 */
4122 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
4123 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
4124 }
4125
4126 /*
4127 * compute size to transfer this round, if io_req_size is
4128 * still non-zero after this attempt, we'll loop around and
4129 * set up for another I/O.
4130 */
4131 val_size = (uio_last * PAGE_SIZE) - start_offset;
4132
4133 if (val_size > max_size)
4134 val_size = max_size;
4135
4136 if (val_size > io_req_size)
4137 val_size = io_req_size;
4138
4139 if ((uio->uio_offset + val_size) > last_ioread_offset)
4140 last_ioread_offset = uio->uio_offset + val_size;
4141
4142 if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
4143
4144 if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) {
4145 /*
4146 * if there's still I/O left to do for this request, and...
4147 * we're not in hard throttle mode, and...
4148 * we're close to using up the previous prefetch, then issue a
4149 * new pre-fetch I/O... the I/O latency will overlap
4150 * with the copying of the data
4151 */
4152 if (size_of_prefetch > max_rd_size)
4153 size_of_prefetch = max_rd_size;
4154
4155 size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
4156
4157 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
4158
4159 if (last_ioread_offset > last_request_offset)
4160 last_ioread_offset = last_request_offset;
4161 }
4162
4163 } else if ((uio->uio_offset + val_size) == last_request_offset) {
4164 /*
4165 * this transfer will finish this request, so...
4166 * let's try to read ahead if we're in
4167 * a sequential access pattern and we haven't
4168 * explicitly disabled it
4169 */
4170 if (rd_ahead_enabled)
4171 cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
4172
4173 if (rap != NULL) {
4174 if (extent.e_addr < rap->cl_lastr)
4175 rap->cl_maxra = 0;
4176 rap->cl_lastr = extent.e_addr;
4177 }
4178 }
4179 if (iolock_inited == TRUE)
4180 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4181
4182 if (iostate.io_error)
4183 error = iostate.io_error;
4184 else {
4185 u_int32_t io_requested;
4186
4187 io_requested = val_size;
4188
4189 retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested);
4190
4191 io_req_size -= (val_size - io_requested);
4192 }
4193 } else {
4194 if (iolock_inited == TRUE)
4195 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4196 }
4197 if (start_pg < last_pg) {
4198 /*
4199 * compute the range of pages that we actually issued an I/O for
4200 * and either commit them as valid if the I/O succeeded
4201 * or abort them if the I/O failed or we're not supposed to
4202 * keep them in the cache
4203 */
4204 io_size = (last_pg - start_pg) * PAGE_SIZE;
4205
4206 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, upl, start_pg * PAGE_SIZE, io_size, error, 0);
4207
4208 if (error || (flags & IO_NOCACHE))
4209 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
4210 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
4211 else {
4212 int commit_flags = UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY;
4213
4214 if (take_reference)
4215 commit_flags |= UPL_COMMIT_INACTIVATE;
4216 else
4217 commit_flags |= UPL_COMMIT_SPECULATE;
4218
4219 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, commit_flags);
4220 }
4221 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, start_pg * PAGE_SIZE, io_size, error, 0);
4222 }
4223 if ((last_pg - start_pg) < pages_in_upl) {
4224 /*
4225 * the set of pages that we issued an I/O for did not encompass
4226 * the entire upl... so just release these without modifying
4227 * their state
4228 */
4229 if (error)
4230 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
4231 else {
4232
4233 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
4234 upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
4235
4236 /*
4237 * handle any valid pages at the beginning of
4238 * the upl... release these appropriately
4239 */
4240 cluster_read_upl_release(upl, 0, start_pg, take_reference);
4241
4242 /*
4243 * handle any valid pages immediately after the
 * pages we issued I/O for... release these appropriately
4245 */
4246 cluster_read_upl_release(upl, last_pg, uio_last, take_reference);
4247
4248 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, -1, -1, 0, 0);
4249 }
4250 }
4251 if (retval == 0)
4252 retval = error;
4253
4254 if (io_req_size) {
4255 if (cluster_is_throttled(vp)) {
4256 /*
4257 * we're in the throttle window, at the very least
4258 * we want to limit the size of the I/O we're about
4259 * to issue
4260 */
4261 rd_ahead_enabled = 0;
4262 prefetch_enabled = 0;
4263 max_rd_size = THROTTLE_MAX_IOSIZE;
4264 } else {
4265 if (max_rd_size == THROTTLE_MAX_IOSIZE) {
4266 /*
4267 * coming out of throttled state
4268 */
4269 if (policy != THROTTLE_LEVEL_TIER3 && policy != THROTTLE_LEVEL_TIER2) {
4270 if (rap != NULL)
4271 rd_ahead_enabled = 1;
4272 prefetch_enabled = 1;
4273 }
4274 max_rd_size = max_prefetch;
4275 last_ioread_offset = 0;
4276 }
4277 }
4278 }
4279 }
4280 if (iolock_inited == TRUE) {
4281 /*
4282 * cluster_io returned an error after it
4283 * had already issued some I/O. we need
4284 * to wait for that I/O to complete before
4285 * we can destroy the iostate mutex...
4286 * 'retval' already contains the early error
4287 * so no need to pick it up from iostate.io_error
4288 */
4289 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4290
4291 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
4292 }
4293 if (rap != NULL) {
4294 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
4295 (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0);
4296
4297 lck_mtx_unlock(&rap->cl_lockr);
4298 } else {
4299 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
4300 (int)uio->uio_offset, io_req_size, 0, retval, 0);
4301 }
4302
4303 return (retval);
4304}
4305
4306/*
4307 * We don't want another read/write lock for every vnode in the system
4308 * so we keep a hash of them here. There should never be very many of
4309 * these around at any point in time.
4310 */
4311cl_direct_read_lock_t *cluster_lock_direct_read(vnode_t vp, lck_rw_type_t type)
4312{
4313 struct cl_direct_read_locks *head
4314 = &cl_direct_read_locks[(uintptr_t)vp / sizeof(*vp)
4315 % CL_DIRECT_READ_LOCK_BUCKETS];
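	/*
	 * note: the bucket index is derived from the vnode's address, scaled
	 * down by sizeof(*vp) and reduced modulo CL_DIRECT_READ_LOCK_BUCKETS...
	 * a cheap hash that spreads vnodes across the lock buckets
	 */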
4316
4317 struct cl_direct_read_lock *lck, *new_lck = NULL;
4318
4319 for (;;) {
4320 lck_spin_lock(&cl_direct_read_spin_lock);
4321
4322 LIST_FOREACH(lck, head, chain) {
4323 if (lck->vp == vp) {
4324 ++lck->ref_count;
4325 lck_spin_unlock(&cl_direct_read_spin_lock);
4326 if (new_lck) {
4327 // Someone beat us to it, ditch the allocation
4328 lck_rw_destroy(&new_lck->rw_lock, cl_mtx_grp);
4329 FREE(new_lck, M_TEMP);
4330 }
4331 lck_rw_lock(&lck->rw_lock, type);
4332 return lck;
4333 }
4334 }
4335
4336 if (new_lck) {
4337 // Use the lock we allocated
4338 LIST_INSERT_HEAD(head, new_lck, chain);
4339 lck_spin_unlock(&cl_direct_read_spin_lock);
4340 lck_rw_lock(&new_lck->rw_lock, type);
4341 return new_lck;
4342 }
4343
4344 lck_spin_unlock(&cl_direct_read_spin_lock);
4345
4346 // Allocate a new lock
4347 MALLOC(new_lck, cl_direct_read_lock_t *, sizeof(*new_lck),
4348 M_TEMP, M_WAITOK);
4349 lck_rw_init(&new_lck->rw_lock, cl_mtx_grp, cl_mtx_attr);
4350 new_lck->vp = vp;
4351 new_lck->ref_count = 1;
4352
4353 // Got to go round again
4354 }
4355}
4356
4357void cluster_unlock_direct_read(cl_direct_read_lock_t *lck)
4358{
4359 lck_rw_done(&lck->rw_lock);
4360
4361 lck_spin_lock(&cl_direct_read_spin_lock);
4362 if (lck->ref_count == 1) {
4363 LIST_REMOVE(lck, chain);
4364 lck_spin_unlock(&cl_direct_read_spin_lock);
4365 lck_rw_destroy(&lck->rw_lock, cl_mtx_grp);
4366 FREE(lck, M_TEMP);
4367 } else {
4368 --lck->ref_count;
4369 lck_spin_unlock(&cl_direct_read_spin_lock);
4370 }
4371}
4372
4373static int
4374cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
4375 int flags, int (*callback)(buf_t, void *), void *callback_arg)
4376{
4377 upl_t upl;
4378 upl_page_info_t *pl;
4379 off_t max_io_size;
4380 vm_offset_t upl_offset, vector_upl_offset = 0;
4381 upl_size_t upl_size, vector_upl_size = 0;
4382 vm_size_t upl_needed_size;
4383 unsigned int pages_in_pl;
4384 upl_control_flags_t upl_flags;
4385 kern_return_t kret;
4386 unsigned int i;
4387 int force_data_sync;
4388 int retval = 0;
4389 int no_zero_fill = 0;
4390 int io_flag = 0;
4391 int misaligned = 0;
4392 struct clios iostate;
4393 user_addr_t iov_base;
4394 u_int32_t io_req_size;
4395 u_int32_t offset_in_file;
4396 u_int32_t offset_in_iovbase;
4397 u_int32_t io_size;
4398 u_int32_t io_min;
4399 u_int32_t xsize;
4400 u_int32_t devblocksize;
4401 u_int32_t mem_alignment_mask;
4402 u_int32_t max_upl_size;
4403 u_int32_t max_rd_size;
4404 u_int32_t max_rd_ahead;
4405 u_int32_t max_vector_size;
4406 boolean_t io_throttled = FALSE;
4407
4408 u_int32_t vector_upl_iosize = 0;
4409 int issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1);
4410 off_t v_upl_uio_offset = 0;
4411 int vector_upl_index=0;
4412 upl_t vector_upl = NULL;
4413 cl_direct_read_lock_t *lock = NULL;
4414
4415 user_addr_t orig_iov_base = 0;
4416 user_addr_t last_iov_base = 0;
4417 user_addr_t next_iov_base = 0;
4418
4419 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
4420 (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
4421
4422 max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ);
4423
4424 max_rd_size = max_upl_size;
4425 max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
4426
4427 io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO;
4428
4429 if (flags & IO_PASSIVE)
4430 io_flag |= CL_PASSIVE;
4431
4432 if (flags & IO_ENCRYPTED) {
4433 io_flag |= CL_RAW_ENCRYPTED;
4434 }
4435
4436 if (flags & IO_NOCACHE) {
4437 io_flag |= CL_NOCACHE;
4438 }
4439
4440 if (flags & IO_SKIP_ENCRYPTION)
4441 io_flag |= CL_ENCRYPTED;
4442
4443 iostate.io_completed = 0;
4444 iostate.io_issued = 0;
4445 iostate.io_error = 0;
4446 iostate.io_wanted = 0;
4447
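/*
 * iostate tracks the async I/O issued through cluster_io against what has
 * completed so far... cluster_iostate_wait() is used below to block while
 * too much of it is still outstanding, and to drain everything by passing
 * a limit of 0 before we return
 */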
4448 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
4449
4450 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
4451 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
4452
4453 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
4454 (int)devblocksize, (int)mem_alignment_mask, 0, 0, 0);
4455
4456 if (devblocksize == 1) {
4457 /*
 * the AFP client advertises a devblocksize of 1...
 * however, its BLOCKMAP routine maps to physical
 * blocks that are PAGE_SIZE in size...
 * therefore we can't ask for I/Os that aren't page aligned
 * or aren't multiples of PAGE_SIZE in size...
 * by setting devblocksize to PAGE_SIZE, we reinstate
4464 * the old behavior we had before the mem_alignment_mask
4465 * changes went in...
4466 */
4467 devblocksize = PAGE_SIZE;
4468 }
4469
4470 orig_iov_base = uio_curriovbase(uio);
4471 last_iov_base = orig_iov_base;
4472
4473next_dread:
4474 io_req_size = *read_length;
4475 iov_base = uio_curriovbase(uio);
4476
4477 offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1);
4478 offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
4479
4480 if (offset_in_file || offset_in_iovbase) {
4481 /*
4482 * one of the 2 important offsets is misaligned
4483 * so fire an I/O through the cache for this entire vector
4484 */
4485 misaligned = 1;
4486 }
4487 if (iov_base & (devblocksize - 1)) {
4488 /*
4489 * the offset in memory must be on a device block boundary
4490 * so that we can guarantee that we can generate an
4491 * I/O that ends on a page boundary in cluster_io
4492 */
4493 misaligned = 1;
4494 }
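/*
 * illustrative example (assumed values, devblocksize == 512): a file
 * offset of 0x1200 is acceptable (0x1200 & 0x1ff == 0), but an iov_base
 * ending in 0x003 is not, so the whole vector is flagged as misaligned
 * and ends up being serviced through the cached copy path
 */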
4495
4496 max_io_size = filesize - uio->uio_offset;
4497
4498 /*
4499 * The user must request IO in aligned chunks. If the
4500 * offset into the file is bad, or the userland pointer
4501 * is non-aligned, then we cannot service the encrypted IO request.
4502 */
4503 if (flags & IO_ENCRYPTED) {
4504 if (misaligned || (io_req_size & (devblocksize - 1)))
4505 retval = EINVAL;
4506
4507 max_io_size = roundup(max_io_size, devblocksize);
4508 }
4509
4510 if ((off_t)io_req_size > max_io_size)
4511 io_req_size = max_io_size;
4512
4513 /*
4514 * When we get to this point, we know...
4515 * -- the offset into the file is on a devblocksize boundary
4516 */
4517
4518 while (io_req_size && retval == 0) {
4519 u_int32_t io_start;
4520
4521 if (cluster_is_throttled(vp)) {
4522 /*
4523 * we're in the throttle window, at the very least
4524 * we want to limit the size of the I/O we're about
4525 * to issue
4526 */
4527 max_rd_size = THROTTLE_MAX_IOSIZE;
4528 max_rd_ahead = THROTTLE_MAX_IOSIZE - 1;
4529 max_vector_size = THROTTLE_MAX_IOSIZE;
4530 } else {
4531 max_rd_size = max_upl_size;
4532 max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
4533 max_vector_size = MAX_VECTOR_UPL_SIZE;
4534 }
4535 io_start = io_size = io_req_size;
4536
4537 /*
4538 * First look for pages already in the cache
4539 * and move them to user space. But only do this
4540 * check if we are not retrieving encrypted data directly
4541 * from the filesystem; those blocks should never
4542 * be in the UBC.
4543 *
4544 * cluster_copy_ubc_data returns the resid
4545 * in io_size
4546 */
4547 if ((flags & IO_ENCRYPTED) == 0) {
4548 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0);
4549 }
4550 /*
4551 * calculate the number of bytes actually copied
4552 * starting size - residual
4553 */
4554 xsize = io_start - io_size;
4555
4556 io_req_size -= xsize;
4557
4558 if(useVectorUPL && (xsize || (iov_base & PAGE_MASK))) {
4559 /*
4560 * We found something in the cache or we have an iov_base that's not
4561 * page-aligned.
4562 *
4563 * Issue all I/O's that have been collected within this Vectored UPL.
4564 */
4565 if(vector_upl_index) {
4566 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4567 reset_vector_run_state();
4568 }
4569
4570 if(xsize)
4571 useVectorUPL = 0;
4572
4573 /*
4574 * After this point, if we are using the Vector UPL path and the base is
4575 * not page-aligned then the UPL with that base will be the first in the vector UPL.
4576 */
4577 }
4578
4579 /*
4580 * check to see if we are finished with this request.
4581 *
4582 * If we satisfied this IO already, then io_req_size will be 0.
4583 * Otherwise, see if the IO was mis-aligned and needs to go through
4584 * the UBC to deal with the 'tail'.
4585 *
4586 */
4587 if (io_req_size == 0 || (misaligned)) {
4588 /*
4589 * see if there's another uio vector to
4590 * process that's of type IO_DIRECT
4591 *
4592 * break out of while loop to get there
4593 */
4594 break;
4595 }
4596 /*
4597 * assume the request ends on a device block boundary
4598 */
4599 io_min = devblocksize;
4600
4601 /*
 * we can handle I/O's in multiples of the device block size...
4603 * however, if io_size isn't a multiple of devblocksize we
4604 * want to clip it back to the nearest page boundary since
4605 * we are going to have to go through cluster_read_copy to
4606 * deal with the 'overhang'... by clipping it to a PAGE_SIZE
4607 * multiple, we avoid asking the drive for the same physical
 * blocks twice... once for the partial page at the end of the
4609 * request and a 2nd time for the page we read into the cache
4610 * (which overlaps the end of the direct read) in order to
4611 * get at the overhang bytes
4612 */
4613 if (io_size & (devblocksize - 1)) {
4614 assert(!(flags & IO_ENCRYPTED));
4615 /*
4616 * Clip the request to the previous page size boundary
 * since the request does NOT end on a device block boundary
4618 */
4619 io_size &= ~PAGE_MASK;
4620 io_min = PAGE_SIZE;
4621 }
4622 if (retval || io_size < io_min) {
4623 /*
4624 * either an error or we only have the tail left to
4625 * complete via the copy path...
4626 * we may have already spun some portion of this request
4627 * off as async requests... we need to wait for the I/O
4628 * to complete before returning
4629 */
4630 goto wait_for_dreads;
4631 }
4632
4633 /*
4634 * Don't re-check the UBC data if we are looking for uncached IO
4635 * or asking for encrypted blocks.
4636 */
4637 if ((flags & IO_ENCRYPTED) == 0) {
4638
4639 if ((xsize = io_size) > max_rd_size)
4640 xsize = max_rd_size;
4641
4642 io_size = 0;
4643
4644 if (!lock) {
4645 /*
4646 * We hold a lock here between the time we check the
4647 * cache and the time we issue I/O. This saves us
4648 * from having to lock the pages in the cache. Not
4649 * all clients will care about this lock but some
4650 * clients may want to guarantee stability between
4651 * here and when the I/O is issued in which case they
4652 * will take the lock exclusively.
4653 */
4654 lock = cluster_lock_direct_read(vp, LCK_RW_TYPE_SHARED);
4655 }
4656
4657 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size);
4658
4659 if (io_size == 0) {
4660 /*
4661 * a page must have just come into the cache
4662 * since the first page in this range is no
4663 * longer absent, go back and re-evaluate
4664 */
4665 continue;
4666 }
4667 }
4668 if ( (flags & IO_RETURN_ON_THROTTLE) ) {
4669 if (cluster_is_throttled(vp) == THROTTLE_NOW) {
4670 if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) {
4671 /*
4672 * we're in the throttle window and at least 1 I/O
4673 * has already been issued by a throttleable thread
4674 * in this window, so return with EAGAIN to indicate
4675 * to the FS issuing the cluster_read call that it
4676 * should now throttle after dropping any locks
4677 */
4678 throttle_info_update_by_mount(vp->v_mount);
4679
4680 io_throttled = TRUE;
4681 goto wait_for_dreads;
4682 }
4683 }
4684 }
4685 if (io_size > max_rd_size)
4686 io_size = max_rd_size;
4687
4688 iov_base = uio_curriovbase(uio);
4689
4690 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
4691 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
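/*
 * illustrative example (assumed values, 4K pages): an iov_base whose low
 * bits are 0x380 together with io_size == 0x4000 gives upl_offset == 0x380
 * and upl_needed_size == 0x5000... five pages are needed once the partial
 * first and last pages of the user buffer are included
 */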
4692
4693 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
4694 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
4695
4696 if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0))
4697 no_zero_fill = 1;
4698 else
4699 no_zero_fill = 0;
4700
4701 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
4702 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
4703 pages_in_pl = 0;
4704 upl_size = upl_needed_size;
4705 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
4706 if (no_zero_fill)
4707 upl_flags |= UPL_NOZEROFILL;
4708 if (force_data_sync)
4709 upl_flags |= UPL_FORCE_DATA_SYNC;
4710
4711 kret = vm_map_create_upl(map,
4712 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
4713 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE);
4714
4715 if (kret != KERN_SUCCESS) {
4716 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
4717 (int)upl_offset, upl_size, io_size, kret, 0);
4718 /*
4719 * failed to get pagelist
4720 *
4721 * we may have already spun some portion of this request
4722 * off as async requests... we need to wait for the I/O
4723 * to complete before returning
4724 */
4725 goto wait_for_dreads;
4726 }
4727 pages_in_pl = upl_size / PAGE_SIZE;
4728 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
4729
4730 for (i = 0; i < pages_in_pl; i++) {
4731 if (!upl_page_present(pl, i))
4732 break;
4733 }
4734 if (i == pages_in_pl)
4735 break;
4736
4737 ubc_upl_abort(upl, 0);
4738 }
4739 if (force_data_sync >= 3) {
4740 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
4741 (int)upl_offset, upl_size, io_size, kret, 0);
4742
4743 goto wait_for_dreads;
4744 }
4745 /*
4746 * Consider the possibility that upl_size wasn't satisfied.
4747 */
4748 if (upl_size < upl_needed_size) {
4749 if (upl_size && upl_offset == 0)
4750 io_size = upl_size;
4751 else
4752 io_size = 0;
4753 }
4754 if (io_size == 0) {
4755 ubc_upl_abort(upl, 0);
4756 goto wait_for_dreads;
4757 }
4758 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
4759 (int)upl_offset, upl_size, io_size, kret, 0);
4760
4761 if(useVectorUPL) {
4762 vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
4763 if(end_off)
4764 issueVectorUPL = 1;
4765 /*
4766 * After this point, if we are using a vector UPL, then
4767 * either all the UPL elements end on a page boundary OR
4768 * this UPL is the last element because it does not end
4769 * on a page boundary.
4770 */
4771 }
4772
4773 /*
4774 * request asynchronously so that we can overlap
 * the preparation of the next I/O...
 * if there are already too many outstanding reads,
4777 * wait until some have completed before issuing the next read
4778 */
4779 cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct");
4780
4781 if (iostate.io_error) {
4782 /*
4783 * one of the earlier reads we issued ran into a hard error
4784 * don't issue any more reads, cleanup the UPL
4785 * that was just created but not used, then
4786 * go wait for any other reads to complete before
4787 * returning the error to the caller
4788 */
4789 ubc_upl_abort(upl, 0);
4790
4791 goto wait_for_dreads;
4792 }
4793 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
4794 upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
4795
4796 if(!useVectorUPL) {
4797 if (no_zero_fill)
4798 io_flag &= ~CL_PRESERVE;
4799 else
4800 io_flag |= CL_PRESERVE;
4801
4802 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4803
4804 } else {
4805
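/*
 * accumulate this UPL into the current vector UPL... the vector is
 * issued once a sub-UPL ends on a non page boundary (issueVectorUPL),
 * we reach MAX_VECTOR_UPL_ELEMENTS, or the accumulated size reaches
 * max_vector_size
 */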
4806 if(!vector_upl_index) {
4807 vector_upl = vector_upl_create(upl_offset);
4808 v_upl_uio_offset = uio->uio_offset;
4809 vector_upl_offset = upl_offset;
4810 }
4811
4812 vector_upl_set_subupl(vector_upl,upl, upl_size);
4813 vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
4814 vector_upl_index++;
4815 vector_upl_size += upl_size;
4816 vector_upl_iosize += io_size;
4817
4818 if(issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
4819 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4820 reset_vector_run_state();
4821 }
4822 }
4823 last_iov_base = iov_base + io_size;
4824
4825 if (lock) {
4826 // We don't need to wait for the I/O to complete
4827 cluster_unlock_direct_read(lock);
4828 lock = NULL;
4829 }
4830
4831 /*
4832 * update the uio structure
4833 */
4834 if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) {
4835 uio_update(uio, (user_size_t)max_io_size);
4836 }
4837 else {
4838 uio_update(uio, (user_size_t)io_size);
4839 }
4840
4841 io_req_size -= io_size;
4842
4843 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
4844 upl, (int)uio->uio_offset, io_req_size, retval, 0);
4845
4846 } /* end while */
4847
4848 if (retval == 0 && iostate.io_error == 0 && io_req_size == 0 && uio->uio_offset < filesize) {
4849
4850 retval = cluster_io_type(uio, read_type, read_length, 0);
4851
4852 if (retval == 0 && *read_type == IO_DIRECT) {
4853
4854 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
4855 (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
4856
4857 goto next_dread;
4858 }
4859 }
4860
4861wait_for_dreads:
4862
4863 if(retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
4864 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4865 reset_vector_run_state();
4866 }
4867
4868 // We don't need to wait for the I/O to complete
4869 if (lock)
4870 cluster_unlock_direct_read(lock);
4871
4872 /*
4873 * make sure all async reads that are part of this stream
4874 * have completed before we return
4875 */
4876 cluster_iostate_wait(&iostate, 0, "cluster_read_direct");
4877
4878 if (iostate.io_error)
4879 retval = iostate.io_error;
4880
4881 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
4882
4883 if (io_throttled == TRUE && retval == 0)
4884 retval = EAGAIN;
4885
4886 for (next_iov_base = orig_iov_base; next_iov_base < last_iov_base; next_iov_base += PAGE_SIZE) {
4887 /*
4888 * This is specifically done for pmap accounting purposes.
4889 * vm_pre_fault() will call vm_fault() to enter the page into
4890 * the pmap if there isn't _a_ physical page for that VA already.
4891 */
4892 vm_pre_fault(vm_map_trunc_page(next_iov_base, PAGE_MASK));
4893 }
4894
4895 if (io_req_size && retval == 0) {
4896 /*
4897 * we couldn't handle the tail of this request in DIRECT mode
4898 * so fire it through the copy path
4899 */
4900 if (flags & IO_ENCRYPTED) {
4901 /*
4902 * We cannot fall back to the copy path for encrypted I/O. If this
4903 * happens, there is something wrong with the user buffer passed
4904 * down.
4905 */
4906 retval = EFAULT;
4907 } else {
4908 retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg);
4909 }
4910
4911 *read_type = IO_UNKNOWN;
4912 }
4913 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
4914 (int)uio->uio_offset, (int)uio_resid(uio), io_req_size, retval, 0);
4915
4916 return (retval);
4917}
4918
4919
4920static int
4921cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
4922 int (*callback)(buf_t, void *), void *callback_arg, int flags)
4923{
4924 upl_page_info_t *pl;
4925 upl_t upl[MAX_VECTS];
4926 vm_offset_t upl_offset;
4927 addr64_t dst_paddr = 0;
4928 user_addr_t iov_base;
4929 off_t max_size;
4930 upl_size_t upl_size;
4931 vm_size_t upl_needed_size;
4932 mach_msg_type_number_t pages_in_pl;
4933 upl_control_flags_t upl_flags;
4934 kern_return_t kret;
4935 struct clios iostate;
4936 int error= 0;
4937 int cur_upl = 0;
4938 int num_upl = 0;
4939 int n;
4940 u_int32_t xsize;
4941 u_int32_t io_size;
4942 u_int32_t devblocksize;
4943 u_int32_t mem_alignment_mask;
4944 u_int32_t tail_size = 0;
4945 int bflag;
4946
4947 if (flags & IO_PASSIVE)
4948 bflag = CL_PASSIVE;
4949 else
4950 bflag = 0;
4951
4952 if (flags & IO_NOCACHE)
4953 bflag |= CL_NOCACHE;
4954
4955 /*
4956 * When we enter this routine, we know
4957 * -- the read_length will not exceed the current iov_len
4958 * -- the target address is physically contiguous for read_length
4959 */
4960 cluster_syncup(vp, filesize, callback, callback_arg, PUSH_SYNC);
4961
4962 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
4963 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
4964
4965 iostate.io_completed = 0;
4966 iostate.io_issued = 0;
4967 iostate.io_error = 0;
4968 iostate.io_wanted = 0;
4969
4970 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
4971
4972next_cread:
4973 io_size = *read_length;
4974
4975 max_size = filesize - uio->uio_offset;
4976
4977 if (io_size > max_size)
4978 io_size = max_size;
4979
4980 iov_base = uio_curriovbase(uio);
4981
4982 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
4983 upl_needed_size = upl_offset + io_size;
4984
4985 pages_in_pl = 0;
4986 upl_size = upl_needed_size;
4987 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
4988
4989
4990 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_START,
4991 (int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0);
4992
4993 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
4994 kret = vm_map_get_upl(map,
4995 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
4996 &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);
4997
4998 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_END,
4999 (int)upl_offset, upl_size, io_size, kret, 0);
5000
5001 if (kret != KERN_SUCCESS) {
5002 /*
5003 * failed to get pagelist
5004 */
5005 error = EINVAL;
5006 goto wait_for_creads;
5007 }
5008 num_upl++;
5009
5010 if (upl_size < upl_needed_size) {
5011 /*
5012 * The upl_size wasn't satisfied.
5013 */
5014 error = EINVAL;
5015 goto wait_for_creads;
5016 }
5017 pl = ubc_upl_pageinfo(upl[cur_upl]);
5018
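/*
 * dst_paddr is the physical address of the start of the user buffer:
 * the physical page number of the first page of the (physically
 * contiguous) region shifted into a byte address, plus the offset into
 * that page
 */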
5019 dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
5020
5021 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
5022 u_int32_t head_size;
5023
5024 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
5025
5026 if (head_size > io_size)
5027 head_size = io_size;
5028
5029 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ, callback, callback_arg);
5030
5031 if (error)
5032 goto wait_for_creads;
5033
5034 upl_offset += head_size;
5035 dst_paddr += head_size;
5036 io_size -= head_size;
5037
5038 iov_base += head_size;
5039 }
5040 if ((u_int32_t)iov_base & mem_alignment_mask) {
5041 /*
 * request isn't set up on a memory boundary that
5043 * the underlying DMA engine can handle...
5044 * return an error instead of going through
5045 * the slow copy path since the intent of this
5046 * path is direct I/O to device memory
5047 */
5048 error = EINVAL;
5049 goto wait_for_creads;
5050 }
5051
5052 tail_size = io_size & (devblocksize - 1);
5053
5054 io_size -= tail_size;
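/*
 * the head loop above dealt with any misaligned start via
 * cluster_align_phys_io()... tail_size now holds the sub-devblocksize
 * remainder, which is transferred the same way once the device-memory
 * I/O issued below has completed
 */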
5055
5056 while (io_size && error == 0) {
5057
5058 if (io_size > MAX_IO_CONTIG_SIZE)
5059 xsize = MAX_IO_CONTIG_SIZE;
5060 else
5061 xsize = io_size;
5062 /*
5063 * request asynchronously so that we can overlap
5064 * the preparation of the next I/O... we'll do
5065 * the commit after all the I/O has completed
 * since it's all issued against the same UPL...
 * if there are already too many outstanding reads,
5068 * wait until some have completed before issuing the next
5069 */
5070 cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig");
5071
5072 if (iostate.io_error) {
5073 /*
5074 * one of the earlier reads we issued ran into a hard error
5075 * don't issue any more reads...
5076 * go wait for any other reads to complete before
5077 * returning the error to the caller
5078 */
5079 goto wait_for_creads;
5080 }
5081 error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, xsize,
5082 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC | bflag,
5083 (buf_t)NULL, &iostate, callback, callback_arg);
5084 /*
5085 * The cluster_io read was issued successfully,
5086 * update the uio structure
5087 */
5088 if (error == 0) {
5089 uio_update(uio, (user_size_t)xsize);
5090
5091 dst_paddr += xsize;
5092 upl_offset += xsize;
5093 io_size -= xsize;
5094 }
5095 }
5096 if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS && uio->uio_offset < filesize) {
5097
5098 error = cluster_io_type(uio, read_type, read_length, 0);
5099
5100 if (error == 0 && *read_type == IO_CONTIG) {
5101 cur_upl++;
5102 goto next_cread;
5103 }
5104 } else
5105 *read_type = IO_UNKNOWN;
5106
5107wait_for_creads:
5108 /*
5109 * make sure all async reads that are part of this stream
5110 * have completed before we proceed
5111 */
5112 cluster_iostate_wait(&iostate, 0, "cluster_read_contig");
5113
5114 if (iostate.io_error)
5115 error = iostate.io_error;
5116
5117 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
5118
5119 if (error == 0 && tail_size)
5120 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg);
5121
5122 for (n = 0; n < num_upl; n++)
5123 /*
5124 * just release our hold on each physically contiguous
5125 * region without changing any state
5126 */
5127 ubc_upl_abort(upl[n], 0);
5128
5129 return (error);
5130}
5131
5132
5133static int
5134cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length)
5135{
5136 user_size_t iov_len;
5137 user_addr_t iov_base = 0;
5138 upl_t upl;
5139 upl_size_t upl_size;
5140 upl_control_flags_t upl_flags;
5141 int retval = 0;
5142
5143 /*
 * skip over any empty vectors
5145 */
5146 uio_update(uio, (user_size_t)0);
5147
5148 iov_len = uio_curriovlen(uio);
5149
5150 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_START, uio, (int)iov_len, 0, 0, 0);
5151
5152 if (iov_len) {
5153 iov_base = uio_curriovbase(uio);
5154 /*
5155 * make sure the size of the vector isn't too big...
5156 * internally, we want to handle all of the I/O in
5157 * chunk sizes that fit in a 32 bit int
5158 */
5159 if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE)
5160 upl_size = MAX_IO_REQUEST_SIZE;
5161 else
5162 upl_size = (u_int32_t)iov_len;
5163
5164 upl_flags = UPL_QUERY_OBJECT_TYPE;
5165
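/*
 * note: with UPL_QUERY_OBJECT_TYPE, vm_map_get_upl() is only being used
 * here to classify the backing memory... the UPL_PHYS_CONTIG bit coming
 * back in upl_flags is what selects IO_CONTIG below
 */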
5166 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
5167 if ((vm_map_get_upl(map,
5168 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
5169 &upl_size, &upl, NULL, NULL, &upl_flags, VM_KERN_MEMORY_FILE, 0)) != KERN_SUCCESS) {
5170 /*
5171 * the user app must have passed in an invalid address
5172 */
5173 retval = EFAULT;
5174 }
5175 if (upl_size == 0)
5176 retval = EFAULT;
5177
5178 *io_length = upl_size;
5179
5180 if (upl_flags & UPL_PHYS_CONTIG)
5181 *io_type = IO_CONTIG;
5182 else if (iov_len >= min_length)
5183 *io_type = IO_DIRECT;
5184 else
5185 *io_type = IO_COPY;
5186 } else {
5187 /*
5188 * nothing left to do for this uio
5189 */
5190 *io_length = 0;
5191 *io_type = IO_UNKNOWN;
5192 }
5193 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, iov_base, *io_type, *io_length, retval, 0);
5194
5195 return (retval);
5196}
5197
5198
5199/*
 * generate advisory I/O's in the largest chunks possible...
5201 * the completed pages will be released into the VM cache
5202 */
5203int
5204advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
5205{
5206 return advisory_read_ext(vp, filesize, f_offset, resid, NULL, NULL, CL_PASSIVE);
5207}
5208
5209int
5210advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
5211{
5212 upl_page_info_t *pl;
5213 upl_t upl;
5214 vm_offset_t upl_offset;
5215 int upl_size;
5216 off_t upl_f_offset;
5217 int start_offset;
5218 int start_pg;
5219 int last_pg;
5220 int pages_in_upl;
5221 off_t max_size;
5222 int io_size;
5223 kern_return_t kret;
5224 int retval = 0;
5225 int issued_io;
5226 int skip_range;
5227 uint32_t max_io_size;
5228
5229
5230 if ( !UBCINFOEXISTS(vp))
5231 return(EINVAL);
5232
5233 if (resid < 0)
5234 return(EINVAL);
5235
5236 max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
5237
5238#if CONFIG_EMBEDDED
5239 if (max_io_size > speculative_prefetch_max_iosize)
5240 max_io_size = speculative_prefetch_max_iosize;
5241#else
5242 if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
5243 if (max_io_size > speculative_prefetch_max_iosize)
5244 max_io_size = speculative_prefetch_max_iosize;
5245 }
5246#endif
5247
5248 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
5249 (int)f_offset, resid, (int)filesize, 0, 0);
5250
5251 while (resid && f_offset < filesize && retval == 0) {
5252 /*
5253 * compute the size of the upl needed to encompass
5254 * the requested read... limit each call to cluster_io
5255 * to the maximum UPL size... cluster_io will clip if
 * this exceeds the maximum io_size for the device...
5257 * make sure to account for
5258 * a starting offset that's not page aligned
5259 */
5260 start_offset = (int)(f_offset & PAGE_MASK_64);
5261 upl_f_offset = f_offset - (off_t)start_offset;
5262 max_size = filesize - f_offset;
5263
5264 if (resid < max_size)
5265 io_size = resid;
5266 else
5267 io_size = max_size;
5268
5269 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
5270 if ((uint32_t)upl_size > max_io_size)
5271 upl_size = max_io_size;
5272
5273 skip_range = 0;
5274 /*
5275 * return the number of contiguously present pages in the cache
5276 * starting at upl_f_offset within the file
5277 */
5278 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
5279
5280 if (skip_range) {
5281 /*
5282 * skip over pages already present in the cache
5283 */
5284 io_size = skip_range - start_offset;
5285
5286 f_offset += io_size;
5287 resid -= io_size;
5288
5289 if (skip_range == upl_size)
5290 continue;
5291 /*
5292 * have to issue some real I/O
5293 * at this point, we know it's starting on a page boundary
5294 * because we've skipped over at least the first page in the request
5295 */
5296 start_offset = 0;
5297 upl_f_offset += skip_range;
5298 upl_size -= skip_range;
5299 }
5300 pages_in_upl = upl_size / PAGE_SIZE;
5301
5302 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
5303 upl, (int)upl_f_offset, upl_size, start_offset, 0);
5304
5305 kret = ubc_create_upl_kernel(vp,
5306 upl_f_offset,
5307 upl_size,
5308 &upl,
5309 &pl,
5310 UPL_RET_ONLY_ABSENT | UPL_SET_LITE,
5311 VM_KERN_MEMORY_FILE);
5312 if (kret != KERN_SUCCESS)
5313 return(retval);
5314 issued_io = 0;
5315
5316 /*
5317 * before we start marching forward, we must make sure we end on
5318 * a present page, otherwise we will be working with a freed
5319 * upl
5320 */
5321 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
5322 if (upl_page_present(pl, last_pg))
5323 break;
5324 }
5325 pages_in_upl = last_pg + 1;
5326
5327
5328 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
5329 upl, (int)upl_f_offset, upl_size, start_offset, 0);
5330
5331
5332 for (last_pg = 0; last_pg < pages_in_upl; ) {
5333 /*
5334 * scan from the beginning of the upl looking for the first
5335 * page that is present.... this will become the first page in
5336 * the request we're going to make to 'cluster_io'... if all
5337 * of the pages are absent, we won't call through to 'cluster_io'
5338 */
5339 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
5340 if (upl_page_present(pl, start_pg))
5341 break;
5342 }
5343
5344 /*
5345 * scan from the starting present page looking for an absent
5346 * page before the end of the upl is reached, if we
5347 * find one, then it will terminate the range of pages being
5348 * presented to 'cluster_io'
5349 */
5350 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
5351 if (!upl_page_present(pl, last_pg))
5352 break;
5353 }
5354
5355 if (last_pg > start_pg) {
5356 /*
				 * we found a range of pages that must be filled...
				 * if the last page in this range is the last page of the file,
5359 * we may have to clip the size of it to keep from reading past
5360 * the end of the last physical block associated with the file
5361 */
5362 upl_offset = start_pg * PAGE_SIZE;
5363 io_size = (last_pg - start_pg) * PAGE_SIZE;
5364
5365 if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize)
5366 io_size = filesize - (upl_f_offset + upl_offset);
5367
5368 /*
5369 * issue an asynchronous read to cluster_io
5370 */
5371 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
5372 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
5373
5374 issued_io = 1;
5375 }
5376 }
5377 if (issued_io == 0)
5378 ubc_upl_abort(upl, 0);
5379
5380 io_size = upl_size - start_offset;
5381
5382 if (io_size > resid)
5383 io_size = resid;
5384 f_offset += io_size;
5385 resid -= io_size;
5386 }
5387
5388 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
5389 (int)f_offset, resid, retval, 0, 0);
5390
5391 return(retval);
5392}
5393
5394
5395int
5396cluster_push(vnode_t vp, int flags)
5397{
5398 return cluster_push_ext(vp, flags, NULL, NULL);
5399}
5400
5401
5402int
5403cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg)
5404{
5405 return cluster_push_err(vp, flags, callback, callback_arg, NULL);
5406}
5407
5408/* write errors via err, but return the number of clusters written */
5409int
5410cluster_push_err(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg, int *err)
5411{
5412 int retval;
5413 int my_sparse_wait = 0;
5414 struct cl_writebehind *wbp;
5415 int local_err = 0;
5416
5417 if (err)
5418 *err = 0;
5419
5420 if ( !UBCINFOEXISTS(vp)) {
5421 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -1, 0);
5422 return (0);
5423 }
5424 /* return if deferred write is set */
5425 if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
5426 return (0);
5427 }
5428 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
5429 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -2, 0);
5430 return (0);
5431 }
5432 if (!ISSET(flags, IO_SYNC) && wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
5433 lck_mtx_unlock(&wbp->cl_lockw);
5434
5435 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -3, 0);
5436 return(0);
5437 }
5438 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
5439 wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
5440
5441 /*
5442 * if we have an fsync in progress, we don't want to allow any additional
5443 * sync/fsync/close(s) to occur until it finishes.
	 * note that it's possible for writes to continue to occur to this file
5445 * while we're waiting and also once the fsync starts to clean if we're
5446 * in the sparse map case
5447 */
5448 while (wbp->cl_sparse_wait) {
5449 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);
5450
5451 msleep((caddr_t)&wbp->cl_sparse_wait, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);
5452
5453 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
5454 }
5455 if (flags & IO_SYNC) {
5456 my_sparse_wait = 1;
5457 wbp->cl_sparse_wait = 1;
5458
5459 /*
5460 * this is an fsync (or equivalent)... we must wait for any existing async
		 * cleaning operations to complete before we evaluate the current state
		 * and finish cleaning... this ensures that all writes issued before this
5463 * fsync actually get cleaned to the disk before this fsync returns
5464 */
5465 while (wbp->cl_sparse_pushes) {
5466 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);
5467
5468 msleep((caddr_t)&wbp->cl_sparse_pushes, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);
5469
5470 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
5471 }
5472 }
5473 if (wbp->cl_scmap) {
5474 void *scmap;
5475
5476 if (wbp->cl_sparse_pushes < SPARSE_PUSH_LIMIT) {
5477
5478 scmap = wbp->cl_scmap;
5479 wbp->cl_scmap = NULL;
5480
5481 wbp->cl_sparse_pushes++;
5482
5483 lck_mtx_unlock(&wbp->cl_lockw);
5484
5485 retval = sparse_cluster_push(wbp, &scmap, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, FALSE);
5486
5487 lck_mtx_lock(&wbp->cl_lockw);
5488
5489 wbp->cl_sparse_pushes--;
5490
5491 if (retval) {
5492 if (wbp->cl_scmap != NULL) {
5493 panic("cluster_push_err: Expected NULL cl_scmap\n");
5494 }
5495
5496 wbp->cl_scmap = scmap;
5497 }
5498
5499 if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0)
5500 wakeup((caddr_t)&wbp->cl_sparse_pushes);
5501 } else {
5502 retval = sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, FALSE);
5503 }
5504
5505 local_err = retval;
5506
5507 if (err)
5508 *err = retval;
5509 retval = 1;
5510 } else {
5511 retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, &local_err, FALSE);
5512 if (err)
5513 *err = local_err;
5514 }
5515 lck_mtx_unlock(&wbp->cl_lockw);
5516
5517 if (flags & IO_SYNC)
5518 (void)vnode_waitforwrites(vp, 0, 0, 0, "cluster_push");
5519
5520 if (my_sparse_wait) {
5521 /*
5522 * I'm the owner of the serialization token
5523 * clear it and wakeup anyone that is waiting
5524 * for me to finish
5525 */
5526 lck_mtx_lock(&wbp->cl_lockw);
5527
5528 wbp->cl_sparse_wait = 0;
5529 wakeup((caddr_t)&wbp->cl_sparse_wait);
5530
5531 lck_mtx_unlock(&wbp->cl_lockw);
5532 }
5533 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
5534 wbp->cl_scmap, wbp->cl_number, retval, local_err, 0);
5535
5536 return (retval);
5537}
5538
5539
5540__private_extern__ void
5541cluster_release(struct ubc_info *ubc)
5542{
5543 struct cl_writebehind *wbp;
5544 struct cl_readahead *rap;
5545
5546 if ((wbp = ubc->cl_wbehind)) {
5547
5548 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, wbp->cl_scmap, 0, 0, 0);
5549
5550 if (wbp->cl_scmap)
5551 vfs_drt_control(&(wbp->cl_scmap), 0);
5552 } else {
5553 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, 0, 0, 0, 0);
5554 }
5555
5556 rap = ubc->cl_rahead;
5557
5558 if (wbp != NULL) {
5559 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
5560 FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
5561 }
5562 if ((rap = ubc->cl_rahead)) {
5563 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
5564 FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
5565 }
5566 ubc->cl_rahead = NULL;
5567 ubc->cl_wbehind = NULL;
5568
5569 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, ubc, rap, wbp, 0, 0);
5570}
5571
5572
5573static int
5574cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg, int *err, boolean_t vm_initiated)
5575{
5576 int cl_index;
5577 int cl_index1;
5578 int min_index;
5579 int cl_len;
5580 int cl_pushed = 0;
5581 struct cl_wextent l_clusters[MAX_CLUSTERS];
5582 u_int max_cluster_pgcount;
5583 int error = 0;
5584
5585 max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
5586 /*
5587 * the write behind context exists and has
5588 * already been locked...
5589 */
5590 if (wbp->cl_number == 0)
5591 /*
5592 * no clusters to push
5593 * return number of empty slots
5594 */
5595 return (MAX_CLUSTERS);
5596
5597 /*
5598 * make a local 'sorted' copy of the clusters
5599 * and clear wbp->cl_number so that new clusters can
5600 * be developed
5601 */
5602 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
5603 for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
5604 if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr)
5605 continue;
5606 if (min_index == -1)
5607 min_index = cl_index1;
5608 else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr)
5609 min_index = cl_index1;
5610 }
5611 if (min_index == -1)
5612 break;
5613
5614 l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
5615 l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
5616 l_clusters[cl_index].io_flags = wbp->cl_clusters[min_index].io_flags;
5617
5618 wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
5619 }
5620 wbp->cl_number = 0;
5621
5622 cl_len = cl_index;
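	/*
	 * Example of the sort above (illustrative): with cl_number == 3 and clusters
	 * beginning at pages 40, 10 and 25, l_clusters ends up ordered {10, 25, 40};
	 * each source slot is emptied (b_addr == e_addr) as it is consumed, and
	 * cl_len == 3.
	 */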
5623
	/* skip switching to the sparse cluster mechanism if on a disk image */
5625 if ( ((push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS ) &&
5626 !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) ) {
5627 int i;
5628
5629 /*
		 * determine if we appear to be writing the file sequentially...
5631 * if not, by returning without having pushed any clusters
5632 * we will cause this vnode to be pushed into the sparse cluster mechanism
5633 * used for managing more random I/O patterns
5634 *
5635 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
5636 * that's why we're in try_push with PUSH_DELAY...
5637 *
5638 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
5639 * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
5640 * so we can just make a simple pass through, up to, but not including the last one...
5641 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
5642 * are sequential
5643 *
5644 * we let the last one be partial as long as it was adjacent to the previous one...
5645 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
5646 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
5647 */
5648 for (i = 0; i < MAX_CLUSTERS - 1; i++) {
5649 if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != max_cluster_pgcount)
5650 goto dont_try;
5651 if (l_clusters[i].e_addr != l_clusters[i+1].b_addr)
5652 goto dont_try;
5653 }
5654 }
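	/*
	 * Example of the sequential-write test above (illustrative): if
	 * max_cluster_pgcount were 32, every cluster but the last would have to span
	 * exactly 32 pages and begin where its predecessor ended, e.g.
	 * [0,32) [32,64) [64,96)...; a short or non-adjacent cluster anywhere before
	 * the last one jumps to dont_try, leaving the clusters unpushed so that this
	 * vnode falls into the sparse cluster mechanism.
	 */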
5655 if (vm_initiated == TRUE)
5656 lck_mtx_unlock(&wbp->cl_lockw);
5657
5658 for (cl_index = 0; cl_index < cl_len; cl_index++) {
5659 int flags;
5660 struct cl_extent cl;
5661 int retval;
5662
5663 flags = io_flags & (IO_PASSIVE|IO_CLOSE);
5664
5665 /*
5666 * try to push each cluster in turn...
5667 */
5668 if (l_clusters[cl_index].io_flags & CLW_IONOCACHE)
5669 flags |= IO_NOCACHE;
5670
5671 if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE)
5672 flags |= IO_PASSIVE;
5673
5674 if (push_flag & PUSH_SYNC)
5675 flags |= IO_SYNC;
5676
5677 cl.b_addr = l_clusters[cl_index].b_addr;
5678 cl.e_addr = l_clusters[cl_index].e_addr;
5679
5680 retval = cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg, vm_initiated);
5681
5682 if (retval == 0) {
5683 cl_pushed++;
5684
5685 l_clusters[cl_index].b_addr = 0;
5686 l_clusters[cl_index].e_addr = 0;
5687 } else if (error == 0) {
5688 error = retval;
5689 }
5690
5691 if ( !(push_flag & PUSH_ALL) )
5692 break;
5693 }
5694 if (vm_initiated == TRUE)
5695 lck_mtx_lock(&wbp->cl_lockw);
5696
5697 if (err)
5698 *err = error;
5699
5700dont_try:
5701 if (cl_len > cl_pushed) {
5702 /*
5703 * we didn't push all of the clusters, so
		 * let's try to merge them back into the vnode
5705 */
5706 if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
5707 /*
5708 * we picked up some new clusters while we were trying to
5709 * push the old ones... this can happen because I've dropped
5710 * the vnode lock... the sum of the
5711 * leftovers plus the new cluster count exceeds our ability
5712 * to represent them, so switch to the sparse cluster mechanism
5713 *
5714 * collect the active public clusters...
5715 */
5716 sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);
5717
5718 for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
5719 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
5720 continue;
5721 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
5722 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
5723 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
5724
5725 cl_index1++;
5726 }
5727 /*
5728 * update the cluster count
5729 */
5730 wbp->cl_number = cl_index1;
5731
5732 /*
5733 * and collect the original clusters that were moved into the
5734 * local storage for sorting purposes
5735 */
5736 sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);
5737
5738 } else {
5739 /*
5740 * we've got room to merge the leftovers back in
5741 * just append them starting at the next 'hole'
5742 * represented by wbp->cl_number
5743 */
5744 for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
5745 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
5746 continue;
5747
5748 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
5749 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
5750 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
5751
5752 cl_index1++;
5753 }
5754 /*
5755 * update the cluster count
5756 */
5757 wbp->cl_number = cl_index1;
5758 }
5759 }
5760 return (MAX_CLUSTERS - wbp->cl_number);
5761}
5762
5763
5764
5765static int
5766cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags,
5767 int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
5768{
5769 upl_page_info_t *pl;
5770 upl_t upl;
5771 vm_offset_t upl_offset;
5772 int upl_size;
5773 off_t upl_f_offset;
5774 int pages_in_upl;
5775 int start_pg;
5776 int last_pg;
5777 int io_size;
5778 int io_flags;
5779 int upl_flags;
5780 int bflag;
5781 int size;
5782 int error = 0;
5783 int retval;
5784 kern_return_t kret;
5785
5786 if (flags & IO_PASSIVE)
5787 bflag = CL_PASSIVE;
5788 else
5789 bflag = 0;
5790
5791 if (flags & IO_SKIP_ENCRYPTION)
5792 bflag |= CL_ENCRYPTED;
5793
5794 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
5795 (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
5796
5797 if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
5798 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
5799
5800 return (0);
5801 }
5802 upl_size = pages_in_upl * PAGE_SIZE;
5803 upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
5804
5805 if (upl_f_offset + upl_size >= EOF) {
5806
5807 if (upl_f_offset >= EOF) {
5808 /*
5809 * must have truncated the file and missed
5810 * clearing a dangling cluster (i.e. it's completely
			 * beyond the new EOF)
5812 */
5813 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
5814
5815 return(0);
5816 }
5817 size = EOF - upl_f_offset;
5818
5819 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
5820 pages_in_upl = upl_size / PAGE_SIZE;
5821 } else
5822 size = upl_size;
5823
5824
5825 if (vm_initiated) {
5826 vnode_pageout(vp, NULL, (upl_offset_t)0, upl_f_offset, (upl_size_t)upl_size,
5827 UPL_MSYNC | UPL_VNODE_PAGER | UPL_KEEPCACHED, &error);
5828
5829 return (error);
5830 }
5831 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
5832
5833 /*
5834 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
5835 *
5836 * - only pages that are currently dirty are returned... these are the ones we need to clean
5837 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
5838 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
5839 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
5840 * someone dirties this page while the I/O is in progress, we don't lose track of the new state
5841 *
5842 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
5843 */
5844
5845 if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE))
5846 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
5847 else
5848 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
5849
5850 kret = ubc_create_upl_kernel(vp,
5851 upl_f_offset,
5852 upl_size,
5853 &upl,
5854 &pl,
5855 upl_flags,
5856 VM_KERN_MEMORY_FILE);
5857 if (kret != KERN_SUCCESS)
5858 panic("cluster_push: failed to get pagelist");
5859
5860 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, upl, upl_f_offset, 0, 0, 0);
5861
5862 /*
5863 * since we only asked for the dirty pages back
5864 * it's possible that we may only get a few or even none, so...
5865 * before we start marching forward, we must make sure we know
5866 * where the last present page is in the UPL, otherwise we could
5867 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
5868 * employed by commit_range and abort_range.
5869 */
5870 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
5871 if (upl_page_present(pl, last_pg))
5872 break;
5873 }
5874 pages_in_upl = last_pg + 1;
5875
5876 if (pages_in_upl == 0) {
5877 ubc_upl_abort(upl, 0);
5878
5879 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
5880 return(0);
5881 }
5882
5883 for (last_pg = 0; last_pg < pages_in_upl; ) {
5884 /*
5885 * find the next dirty page in the UPL
5886 * this will become the first page in the
5887 * next I/O to generate
5888 */
5889 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
5890 if (upl_dirty_page(pl, start_pg))
5891 break;
5892 if (upl_page_present(pl, start_pg))
5893 /*
				 * RET_ONLY_DIRTY will return non-dirty 'precious' pages...
5895 * just release these unchanged since we're not going
5896 * to steal them or change their state
5897 */
5898 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
5899 }
5900 if (start_pg >= pages_in_upl)
5901 /*
5902 * done... no more dirty pages to push
5903 */
5904 break;
5905 if (start_pg > last_pg)
5906 /*
5907 * skipped over some non-dirty pages
5908 */
5909 size -= ((start_pg - last_pg) * PAGE_SIZE);
5910
5911 /*
5912 * find a range of dirty pages to write
5913 */
5914 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
5915 if (!upl_dirty_page(pl, last_pg))
5916 break;
5917 }
5918 upl_offset = start_pg * PAGE_SIZE;
5919
5920 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
5921
5922 io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | bflag;
5923
5924 if ( !(flags & IO_SYNC))
5925 io_flags |= CL_ASYNC;
5926
5927 if (flags & IO_CLOSE)
5928 io_flags |= CL_CLOSE;
5929
5930 if (flags & IO_NOCACHE)
5931 io_flags |= CL_NOCACHE;
5932
5933 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
5934 io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
5935
5936 if (error == 0 && retval)
5937 error = retval;
5938
5939 size -= io_size;
5940 }
5941 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, error, 0, 0);
5942
5943 return(error);
5944}
5945
5946
5947/*
5948 * sparse_cluster_switch is called with the write behind lock held
5949 */
5950static int
5951sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
5952{
5953 int cl_index;
	int	error = 0;
5955
5956 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, 0, 0);
5957
5958 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
5959 int flags;
5960 struct cl_extent cl;
5961
5962 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
5963
5964 if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) {
5965 if (flags & UPL_POP_DIRTY) {
5966 cl.e_addr = cl.b_addr + 1;
5967
5968 error = sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg, vm_initiated);
5969
5970 if (error) {
5971 break;
5972 }
5973 }
5974 }
5975 }
5976 }
5977 wbp->cl_number -= cl_index;
5978
5979 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, error, 0);
5980
5981 return error;
5982}
5983
5984
5985/*
5986 * sparse_cluster_push must be called with the write-behind lock held if the scmap is
5987 * still associated with the write-behind context... however, if the scmap has been disassociated
5988 * from the write-behind context (the cluster_push case), the wb lock is not held
5989 */
5990static int
5991sparse_cluster_push(struct cl_writebehind *wbp, void **scmap, vnode_t vp, off_t EOF, int push_flag,
5992 int io_flags, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
5993{
5994 struct cl_extent cl;
5995 off_t offset;
5996 u_int length;
5997 void *l_scmap;
5998 int error = 0;
5999
6000 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, kdebug_vnode(vp), (*scmap), 0, push_flag, 0);
6001
6002 if (push_flag & PUSH_ALL)
6003 vfs_drt_control(scmap, 1);
6004
6005 l_scmap = *scmap;
6006
6007 for (;;) {
6008 int retval;
6009
6010 if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS)
6011 break;
6012
6013 if (vm_initiated == TRUE)
6014 lck_mtx_unlock(&wbp->cl_lockw);
6015
6016 cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
6017 cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
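		/*
		 * Illustrative example (assuming PAGE_SIZE == 4096): a dirty cluster at
		 * offset == 0x2a000 with length == 0x3000 becomes the page extent
		 * cl.b_addr == 42, cl.e_addr == 45 (e_addr is exclusive).
		 */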
6018
6019 retval = cluster_push_now(vp, &cl, EOF, io_flags, callback, callback_arg, vm_initiated);
6020 if (error == 0 && retval)
6021 error = retval;
6022
6023 if (vm_initiated == TRUE) {
6024 lck_mtx_lock(&wbp->cl_lockw);
6025
6026 if (*scmap != l_scmap)
6027 break;
6028 }
6029
6030 if (error) {
6031 if (vfs_drt_mark_pages(scmap, offset, length, NULL) != KERN_SUCCESS) {
6032 panic("Failed to restore dirty state on failure\n");
6033 }
6034
6035 break;
6036 }
6037
6038 if ( !(push_flag & PUSH_ALL)) {
6039 break;
6040 }
6041 }
6042 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);
6043
6044 return error;
6045}
6046
6047
6048/*
6049 * sparse_cluster_add is called with the write behind lock held
6050 */
6051static int
6052sparse_cluster_add(struct cl_writebehind *wbp, void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF,
6053 int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6054{
6055 u_int new_dirty;
6056 u_int length;
6057 off_t offset;
	int	error = 0;
6059
6060 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0);
6061
6062 offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
6063 length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
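	/*
	 * Illustrative example (assuming PAGE_SIZE == 4096): for an extent with
	 * b_addr == 100 and e_addr == 103, offset == 0x64000 and length == 0x3000,
	 * i.e. three pages' worth of the file get marked dirty in the sparse map.
	 */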
6064
6065 while (vfs_drt_mark_pages(scmap, offset, length, &new_dirty) != KERN_SUCCESS) {
6066 /*
6067 * no room left in the map
6068 * only a partial update was done
6069 * push out some pages and try again
6070 */
6071 error = sparse_cluster_push(wbp, scmap, vp, EOF, 0, 0, callback, callback_arg, vm_initiated);
6072
6073 if (error) {
6074 break;
6075 }
6076
6077 offset += (new_dirty * PAGE_SIZE_64);
6078 length -= (new_dirty * PAGE_SIZE);
6079 }
6080 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);
6081
6082 return error;
6083}
6084
6085
6086static int
6087cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
6088{
6089 upl_page_info_t *pl;
6090 upl_t upl;
6091 addr64_t ubc_paddr;
6092 kern_return_t kret;
6093 int error = 0;
6094 int did_read = 0;
6095 int abort_flags;
6096 int upl_flags;
6097 int bflag;
6098
6099 if (flags & IO_PASSIVE)
6100 bflag = CL_PASSIVE;
6101 else
6102 bflag = 0;
6103
6104 if (flags & IO_NOCACHE)
6105 bflag |= CL_NOCACHE;
6106
6107 upl_flags = UPL_SET_LITE;
6108
6109 if ( !(flags & CL_READ) ) {
6110 /*
6111 * "write" operation: let the UPL subsystem know
6112 * that we intend to modify the buffer cache pages
6113 * we're gathering.
6114 */
6115 upl_flags |= UPL_WILL_MODIFY;
6116 } else {
6117 /*
6118 * indicate that there is no need to pull the
6119 * mapping for this page... we're only going
6120 * to read from it, not modify it.
6121 */
6122 upl_flags |= UPL_FILE_IO;
6123 }
6124 kret = ubc_create_upl_kernel(vp,
6125 uio->uio_offset & ~PAGE_MASK_64,
6126 PAGE_SIZE,
6127 &upl,
6128 &pl,
6129 upl_flags,
6130 VM_KERN_MEMORY_FILE);
6131
6132 if (kret != KERN_SUCCESS)
6133 return(EINVAL);
6134
6135 if (!upl_valid_page(pl, 0)) {
6136 /*
6137 * issue a synchronous read to cluster_io
6138 */
6139 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
6140 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
6141 if (error) {
6142 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
6143
6144 return(error);
6145 }
6146 did_read = 1;
6147 }
6148 ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
6149
6150/*
 * NOTE: There is no prototype for the following in BSD.  The prototype, along with the
 * definitions of cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc, can be found in
 * osfmk/ppc/mappings.h.  They are not included here because there appears to be no
 * way to do so without exporting them to kexts as well.
6155 */
6156 if (flags & CL_READ)
6157// copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */
6158 copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4); /* Copy physical to physical and flush the destination */
6159 else
6160// copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */
6161 copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8); /* Copy physical to physical and flush the source */
6162
6163 if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
6164 /*
6165 * issue a synchronous write to cluster_io
6166 */
6167 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
6168 bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
6169 }
6170 if (error == 0)
6171 uio_update(uio, (user_size_t)xsize);
6172
6173 if (did_read)
6174 abort_flags = UPL_ABORT_FREE_ON_EMPTY;
6175 else
6176 abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
6177
6178 ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
6179
6180 return (error);
6181}
6182
6183int
6184cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid)
6185{
6186 int pg_offset;
6187 int pg_index;
6188 int csize;
6189 int segflg;
6190 int retval = 0;
6191 int xsize;
6192 upl_page_info_t *pl;
6193 int dirty_count;
6194
6195 xsize = *io_resid;
6196
6197 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
6198 (int)uio->uio_offset, upl_offset, xsize, 0, 0);
6199
6200 segflg = uio->uio_segflg;
6201
6202 switch(segflg) {
6203
6204 case UIO_USERSPACE32:
6205 case UIO_USERISPACE32:
6206 uio->uio_segflg = UIO_PHYS_USERSPACE32;
6207 break;
6208
6209 case UIO_USERSPACE:
6210 case UIO_USERISPACE:
6211 uio->uio_segflg = UIO_PHYS_USERSPACE;
6212 break;
6213
6214 case UIO_USERSPACE64:
6215 case UIO_USERISPACE64:
6216 uio->uio_segflg = UIO_PHYS_USERSPACE64;
6217 break;
6218
6219 case UIO_SYSSPACE:
6220 uio->uio_segflg = UIO_PHYS_SYSSPACE;
6221 break;
6222
6223 }
6224 pl = ubc_upl_pageinfo(upl);
6225
6226 pg_index = upl_offset / PAGE_SIZE;
6227 pg_offset = upl_offset & PAGE_MASK;
6228 csize = min(PAGE_SIZE - pg_offset, xsize);
6229
6230 dirty_count = 0;
6231 while (xsize && retval == 0) {
6232 addr64_t paddr;
6233
6234 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << PAGE_SHIFT) + pg_offset;
6235 if ((uio->uio_rw == UIO_WRITE) && (upl_dirty_page(pl, pg_index) == FALSE))
6236 dirty_count++;
6237
6238 retval = uiomove64(paddr, csize, uio);
6239
6240 pg_index += 1;
6241 pg_offset = 0;
6242 xsize -= csize;
6243 csize = min(PAGE_SIZE, xsize);
6244 }
6245 *io_resid = xsize;
6246
6247 uio->uio_segflg = segflg;
6248
6249 task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_DEFERRED, upl_lookup_vnode(upl));
6250 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6251 (int)uio->uio_offset, xsize, retval, segflg, 0);
6252
6253 return (retval);
6254}
6255
6256
6257int
6258cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
6259{
6260
6261 return (cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1));
6262}
6263
6264
6265static int
6266cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference)
6267{
6268 int segflg;
6269 int io_size;
6270 int xsize;
6271 int start_offset;
6272 int retval = 0;
6273 memory_object_control_t control;
6274
6275 io_size = *io_resid;
6276
6277 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
6278 (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0);
6279
6280 control = ubc_getobject(vp, UBC_FLAGS_NONE);
6281
6282 if (control == MEMORY_OBJECT_CONTROL_NULL) {
6283 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6284 (int)uio->uio_offset, io_size, retval, 3, 0);
6285
6286 return(0);
6287 }
6288 segflg = uio->uio_segflg;
6289
6290 switch(segflg) {
6291
6292 case UIO_USERSPACE32:
6293 case UIO_USERISPACE32:
6294 uio->uio_segflg = UIO_PHYS_USERSPACE32;
6295 break;
6296
6297 case UIO_USERSPACE64:
6298 case UIO_USERISPACE64:
6299 uio->uio_segflg = UIO_PHYS_USERSPACE64;
6300 break;
6301
6302 case UIO_USERSPACE:
6303 case UIO_USERISPACE:
6304 uio->uio_segflg = UIO_PHYS_USERSPACE;
6305 break;
6306
6307 case UIO_SYSSPACE:
6308 uio->uio_segflg = UIO_PHYS_SYSSPACE;
6309 break;
6310 }
6311
6312 if ( (io_size = *io_resid) ) {
6313 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
6314 xsize = uio_resid(uio);
6315
6316 retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio,
6317 start_offset, io_size, mark_dirty, take_reference);
6318 xsize -= uio_resid(uio);
6319 io_size -= xsize;
6320 }
6321 uio->uio_segflg = segflg;
6322 *io_resid = io_size;
6323
6324 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6325 (int)uio->uio_offset, io_size, retval, 0x80000000 | segflg, 0);
6326
6327 return(retval);
6328}
6329
6330
6331int
6332is_file_clean(vnode_t vp, off_t filesize)
6333{
6334 off_t f_offset;
6335 int flags;
6336 int total_dirty = 0;
6337
6338 for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
6339 if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) {
6340 if (flags & UPL_POP_DIRTY) {
6341 total_dirty++;
6342 }
6343 }
6344 }
6345 if (total_dirty)
6346 return(EINVAL);
6347
6348 return (0);
6349}
6350
6351
6352
6353/*
6354 * Dirty region tracking/clustering mechanism.
6355 *
6356 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
6357 * dirty regions within a larger space (file). It is primarily intended to
6358 * support clustering in large files with many dirty areas.
6359 *
6360 * The implementation assumes that the dirty regions are pages.
6361 *
6362 * To represent dirty pages within the file, we store bit vectors in a
6363 * variable-size circular hash.
6364 */
6365
6366/*
6367 * Bitvector size. This determines the number of pages we group in a
6368 * single hashtable entry. Each hashtable entry is aligned to this
6369 * size within the file.
6370 */
6371#define DRT_BITVECTOR_PAGES ((1024 * 256) / PAGE_SIZE)
6372
6373/*
6374 * File offset handling.
6375 *
6376 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
6377 * the correct formula is (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
6378 */
6379#define DRT_ADDRESS_MASK (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
6380#define DRT_ALIGN_ADDRESS(addr) ((addr) & DRT_ADDRESS_MASK)
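/*
 * Illustrative example (assuming PAGE_SIZE == 4096, so DRT_BITVECTOR_PAGES == 64):
 * DRT_ADDRESS_MASK == ~0x3ffff, and DRT_ALIGN_ADDRESS(0x12345000) == 0x12340000,
 * i.e. offsets are aligned down to a 256KB (64 page) bucket boundary.
 */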
6381
6382/*
6383 * Hashtable address field handling.
6384 *
 * The low-order bits of each hashtable entry's control field are used to
 * store the entry's page count; since stored addresses are bucket-aligned,
 * those bits would otherwise always be zero, so this conserves space.
6387 *
6388 * DRT_HASH_COUNT_MASK must be large enough to store the range
6389 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
6390 * to indicate that the bucket is actually unoccupied.
6391 */
6392#define DRT_HASH_GET_ADDRESS(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
6393#define DRT_HASH_SET_ADDRESS(scm, i, a) \
6394 do { \
6395 (scm)->scm_hashtable[(i)].dhe_control = \
6396 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
6397 } while (0)
6398#define DRT_HASH_COUNT_MASK 0x1ff
6399#define DRT_HASH_GET_COUNT(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
6400#define DRT_HASH_SET_COUNT(scm, i, c) \
6401 do { \
6402 (scm)->scm_hashtable[(i)].dhe_control = \
6403 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \
6404 } while (0)
6405#define DRT_HASH_CLEAR(scm, i) \
6406 do { \
6407 (scm)->scm_hashtable[(i)].dhe_control = 0; \
6408 } while (0)
6409#define DRT_HASH_VACATE(scm, i) DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
6410#define DRT_HASH_VACANT(scm, i) (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
6411#define DRT_HASH_COPY(oscm, oi, scm, i) \
6412 do { \
6413 (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control; \
6414 DRT_BITVECTOR_COPY(oscm, oi, scm, i); \
	} while (0)
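/*
 * Illustrative example of the count field handling (with 4K pages the bitvector
 * covers 64 pages, so legitimate counts are 0-64): DRT_HASH_SET_COUNT(scm, i, 10)
 * followed by DRT_HASH_GET_COUNT(scm, i) yields 10, while a vacated bucket reads
 * back DRT_HASH_COUNT_MASK (0x1ff), which is what DRT_HASH_VACANT tests for.
 */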
6416
6417
6418#if CONFIG_EMBEDDED
6419/*
6420 * Hash table moduli.
6421 *
6422 * Since the hashtable entry's size is dependent on the size of
6423 * the bitvector, and since the hashtable size is constrained to
6424 * both being prime and fitting within the desired allocation
6425 * size, these values need to be manually determined.
6426 *
 * For a 64-page bitvector (MAX_DRT_BITVECTOR_PAGES = 64), the entry size is 16 bytes.
6428 *
6429 * The small hashtable allocation is 4096 bytes, so the modulus is 251.
6430 * The large hashtable allocation is 32768 bytes, so the modulus is 2039.
6431 */
6432
6433#define DRT_HASH_SMALL_MODULUS 251
6434#define DRT_HASH_LARGE_MODULUS 2039
6435
6436/*
6437 * Physical memory required before the large hash modulus is permitted.
6438 *
 * On small memory systems, the large hash modulus can lead to physical
6440 * memory starvation, so we avoid using it there.
6441 */
6442#define DRT_HASH_LARGE_MEMORY_REQUIRED (1024LL * 1024LL * 1024LL) /* 1GiB */
6443
6444#define DRT_SMALL_ALLOCATION 4096 /* 80 bytes spare */
6445#define DRT_LARGE_ALLOCATION 32768 /* 144 bytes spare */
6446
6447#else
6448/*
6449 * Hash table moduli.
6450 *
6451 * Since the hashtable entry's size is dependent on the size of
6452 * the bitvector, and since the hashtable size is constrained to
6453 * both being prime and fitting within the desired allocation
6454 * size, these values need to be manually determined.
6455 *
 * For a 64-page bitvector (MAX_DRT_BITVECTOR_PAGES = 64), the entry size is 16 bytes.
6457 *
6458 * The small hashtable allocation is 16384 bytes, so the modulus is 1019.
6459 * The large hashtable allocation is 131072 bytes, so the modulus is 8179.
6460 */
6461
6462#define DRT_HASH_SMALL_MODULUS 1019
6463#define DRT_HASH_LARGE_MODULUS 8179
6464
6465/*
6466 * Physical memory required before the large hash modulus is permitted.
6467 *
 * On small memory systems, the large hash modulus can lead to physical
6469 * memory starvation, so we avoid using it there.
6470 */
6471#define DRT_HASH_LARGE_MEMORY_REQUIRED (4 * 1024LL * 1024LL * 1024LL) /* 4GiB */
6472
6473#define DRT_SMALL_ALLOCATION 16384 /* 80 bytes spare */
6474#define DRT_LARGE_ALLOCATION 131072 /* 208 bytes spare */
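/*
 * Sanity arithmetic for the values above (16 byte entries, see the hashtable
 * entry definition below): 1019 * 16 == 16304 <= 16384 and 8179 * 16 == 130864
 * <= 131072, which is where the "80 bytes spare" and "208 bytes spare" figures
 * come from.
 */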
6475
6476#endif
6477
6478/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
6479
6480/*
6481 * Hashtable entry.
6482 */
6483struct vfs_drt_hashentry {
6484 u_int64_t dhe_control;
6485/*
* dhe_bitvector was declared as dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
* DRT_BITVECTOR_PAGES is defined as ((1024 * 256) / PAGE_SIZE).
* Since PAGE_SIZE is only known at boot time, we:
*  - define MAX_DRT_BITVECTOR_PAGES for the smallest supported page size (4K)
*  - declare the dhe_bitvector array for the largest possible length
6491*/
6492#define MAX_DRT_BITVECTOR_PAGES (1024 * 256)/( 4 * 1024)
6493 u_int32_t dhe_bitvector[MAX_DRT_BITVECTOR_PAGES/32];
6494};
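/*
 * Size check (illustrative): MAX_DRT_BITVECTOR_PAGES == (1024 * 256) / (4 * 1024) == 64,
 * so dhe_bitvector holds 64 / 32 == 2 u_int32_t's (8 bytes); together with the 8 byte
 * dhe_control that gives the 16 byte entry size assumed by the modulus comments above.
 */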
6495
6496/*
6497 * Hashtable bitvector handling.
6498 *
6499 * Bitvector fields are 32 bits long.
6500 */
6501
6502#define DRT_HASH_SET_BIT(scm, i, bit) \
6503 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))
6504
6505#define DRT_HASH_CLEAR_BIT(scm, i, bit) \
6506 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))
6507
6508#define DRT_HASH_TEST_BIT(scm, i, bit) \
6509 ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))
6510
6511#define DRT_BITVECTOR_CLEAR(scm, i) \
6512 bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
6513
6514#define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \
6515 bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \
6516 &(scm)->scm_hashtable[(i)].dhe_bitvector[0], \
6517 (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
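/*
 * Illustrative example of the bit indexing above: bit 37 lives in
 * dhe_bitvector[37 / 32] == dhe_bitvector[1], under the mask (1 << (37 % 32)) == (1 << 5).
 */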
6518
6519/*
6520 * Dirty Region Tracking structure.
6521 *
6522 * The hashtable is allocated entirely inside the DRT structure.
6523 *
6524 * The hash is a simple circular prime modulus arrangement, the structure
6525 * is resized from small to large if it overflows.
6526 */
6527
6528struct vfs_drt_clustermap {
6529 u_int32_t scm_magic; /* sanity/detection */
6530#define DRT_SCM_MAGIC 0x12020003
6531 u_int32_t scm_modulus; /* current ring size */
6532 u_int32_t scm_buckets; /* number of occupied buckets */
6533 u_int32_t scm_lastclean; /* last entry we cleaned */
6534 u_int32_t scm_iskips; /* number of slot skips */
6535
6536 struct vfs_drt_hashentry scm_hashtable[0];
6537};
6538
6539
6540#define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus)
6541#define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus)
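/*
 * Example (illustrative): DRT_HASH simply takes the bucket-aligned byte offset
 * modulo the (prime) ring size, and DRT_HASH_NEXT wraps around the ring, e.g.
 * with scm_modulus == 1019, DRT_HASH_NEXT(scm, 1018) == 0.
 */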
6542
6543/*
6544 * Debugging codes and arguments.
6545 */
6546#define DRT_DEBUG_EMPTYFREE (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
6547#define DRT_DEBUG_RETCLUSTER (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
6548#define DRT_DEBUG_ALLOC (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
6549#define DRT_DEBUG_INSERT (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
6550#define DRT_DEBUG_MARK (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
6551 * dirty */
6552 /* 0, setcount */
6553 /* 1 (clean, no map) */
6554 /* 2 (map alloc fail) */
6555 /* 3, resid (partial) */
6556#define DRT_DEBUG_6 (FSDBG_CODE(DBG_FSRW, 87))
6557#define DRT_DEBUG_SCMDATA (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
6558 * lastclean, iskips */
6559
6560
6561static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
6562static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
6563static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
6564 u_int64_t offset, int *indexp);
6565static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
6566 u_int64_t offset,
6567 int *indexp,
6568 int recursed);
6569static kern_return_t vfs_drt_do_mark_pages(
6570 void **cmapp,
6571 u_int64_t offset,
6572 u_int length,
6573 u_int *setcountp,
6574 int dirty);
6575static void vfs_drt_trace(
6576 struct vfs_drt_clustermap *cmap,
6577 int code,
6578 int arg1,
6579 int arg2,
6580 int arg3,
6581 int arg4);
6582
6583
6584/*
6585 * Allocate and initialise a sparse cluster map.
6586 *
6587 * Will allocate a new map, resize or compact an existing map.
6588 *
6589 * XXX we should probably have at least one intermediate map size,
 * as the 1:8 ratio seems a bit drastic.
6591 */
6592static kern_return_t
6593vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
6594{
6595 struct vfs_drt_clustermap *cmap, *ocmap;
6596 kern_return_t kret;
6597 u_int64_t offset;
6598 u_int32_t i;
6599 int nsize, active_buckets, index, copycount;
6600
6601 ocmap = NULL;
6602 if (cmapp != NULL)
6603 ocmap = *cmapp;
6604
6605 /*
6606 * Decide on the size of the new map.
6607 */
6608 if (ocmap == NULL) {
6609 nsize = DRT_HASH_SMALL_MODULUS;
6610 } else {
6611 /* count the number of active buckets in the old map */
6612 active_buckets = 0;
6613 for (i = 0; i < ocmap->scm_modulus; i++) {
6614 if (!DRT_HASH_VACANT(ocmap, i) &&
6615 (DRT_HASH_GET_COUNT(ocmap, i) != 0))
6616 active_buckets++;
6617 }
6618 /*
6619 * If we're currently using the small allocation, check to
6620 * see whether we should grow to the large one.
6621 */
6622 if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
6623 /*
6624 * If the ring is nearly full and we are allowed to
6625 * use the large modulus, upgrade.
6626 */
6627 if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) &&
6628 (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) {
6629 nsize = DRT_HASH_LARGE_MODULUS;
6630 } else {
6631 nsize = DRT_HASH_SMALL_MODULUS;
6632 }
6633 } else {
6634 /* already using the large modulus */
6635 nsize = DRT_HASH_LARGE_MODULUS;
6636 /*
6637 * If the ring is completely full, there's
6638 * nothing useful for us to do. Behave as
6639 * though we had compacted into the new
6640 * array and return.
6641 */
6642 if (active_buckets >= DRT_HASH_LARGE_MODULUS)
6643 return(KERN_SUCCESS);
6644 }
6645 }
6646
6647 /*
6648 * Allocate and initialise the new map.
6649 */
6650
6651 kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
6652 (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION, VM_KERN_MEMORY_FILE);
6653 if (kret != KERN_SUCCESS)
6654 return(kret);
6655 cmap->scm_magic = DRT_SCM_MAGIC;
6656 cmap->scm_modulus = nsize;
6657 cmap->scm_buckets = 0;
6658 cmap->scm_lastclean = 0;
6659 cmap->scm_iskips = 0;
6660 for (i = 0; i < cmap->scm_modulus; i++) {
6661 DRT_HASH_CLEAR(cmap, i);
6662 DRT_HASH_VACATE(cmap, i);
6663 DRT_BITVECTOR_CLEAR(cmap, i);
6664 }
6665
6666 /*
6667 * If there's an old map, re-hash entries from it into the new map.
6668 */
6669 copycount = 0;
6670 if (ocmap != NULL) {
6671 for (i = 0; i < ocmap->scm_modulus; i++) {
6672 /* skip empty buckets */
6673 if (DRT_HASH_VACANT(ocmap, i) ||
6674 (DRT_HASH_GET_COUNT(ocmap, i) == 0))
6675 continue;
6676 /* get new index */
6677 offset = DRT_HASH_GET_ADDRESS(ocmap, i);
6678 kret = vfs_drt_get_index(&cmap, offset, &index, 1);
6679 if (kret != KERN_SUCCESS) {
6680 /* XXX need to bail out gracefully here */
6681 panic("vfs_drt: new cluster map mysteriously too small");
6682 index = 0;
6683 }
6684 /* copy */
6685 DRT_HASH_COPY(ocmap, i, cmap, index);
6686 copycount++;
6687 }
6688 }
6689
6690 /* log what we've done */
6691 vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
6692
6693 /*
6694 * It's important to ensure that *cmapp always points to
6695 * a valid map, so we must overwrite it before freeing
6696 * the old map.
6697 */
6698 *cmapp = cmap;
6699 if (ocmap != NULL) {
6700 /* emit stats into trace buffer */
6701 vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
6702 ocmap->scm_modulus,
6703 ocmap->scm_buckets,
6704 ocmap->scm_lastclean,
6705 ocmap->scm_iskips);
6706
6707 vfs_drt_free_map(ocmap);
6708 }
6709 return(KERN_SUCCESS);
6710}
6711
6712
6713/*
6714 * Free a sparse cluster map.
6715 */
6716static kern_return_t
6717vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
6718{
6719 kmem_free(kernel_map, (vm_offset_t)cmap,
6720 (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
6721 return(KERN_SUCCESS);
6722}
6723
6724
6725/*
6726 * Find the hashtable slot currently occupied by an entry for the supplied offset.
6727 */
6728static kern_return_t
6729vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
6730{
6731 int index;
6732 u_int32_t i;
6733
6734 offset = DRT_ALIGN_ADDRESS(offset);
6735 index = DRT_HASH(cmap, offset);
6736
6737 /* traverse the hashtable */
6738 for (i = 0; i < cmap->scm_modulus; i++) {
6739
6740 /*
6741 * If the slot is vacant, we can stop.
6742 */
6743 if (DRT_HASH_VACANT(cmap, index))
6744 break;
6745
6746 /*
6747 * If the address matches our offset, we have success.
6748 */
6749 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
6750 *indexp = index;
6751 return(KERN_SUCCESS);
6752 }
6753
6754 /*
6755 * Move to the next slot, try again.
6756 */
6757 index = DRT_HASH_NEXT(cmap, index);
6758 }
6759 /*
6760 * It's not there.
6761 */
6762 return(KERN_FAILURE);
6763}
6764
6765/*
6766 * Find the hashtable slot for the supplied offset. If we haven't allocated
6767 * one yet, allocate one and populate the address field. Note that it will
6768 * not have a nonzero page count and thus will still technically be free, so
6769 * in the case where we are called to clean pages, the slot will remain free.
6770 */
6771static kern_return_t
6772vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
6773{
6774 struct vfs_drt_clustermap *cmap;
6775 kern_return_t kret;
6776 u_int32_t index;
6777 u_int32_t i;
6778
6779 cmap = *cmapp;
6780
6781 /* look for an existing entry */
6782 kret = vfs_drt_search_index(cmap, offset, indexp);
6783 if (kret == KERN_SUCCESS)
6784 return(kret);
6785
6786 /* need to allocate an entry */
6787 offset = DRT_ALIGN_ADDRESS(offset);
6788 index = DRT_HASH(cmap, offset);
6789
6790 /* scan from the index forwards looking for a vacant slot */
6791 for (i = 0; i < cmap->scm_modulus; i++) {
6792 /* slot vacant? */
6793 if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap,index) == 0) {
6794 cmap->scm_buckets++;
6795 if (index < cmap->scm_lastclean)
6796 cmap->scm_lastclean = index;
6797 DRT_HASH_SET_ADDRESS(cmap, index, offset);
6798 DRT_HASH_SET_COUNT(cmap, index, 0);
6799 DRT_BITVECTOR_CLEAR(cmap, index);
6800 *indexp = index;
6801 vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
6802 return(KERN_SUCCESS);
6803 }
6804 cmap->scm_iskips += i;
6805 index = DRT_HASH_NEXT(cmap, index);
6806 }
6807
6808 /*
6809 * We haven't found a vacant slot, so the map is full. If we're not
6810 * already recursed, try reallocating/compacting it.
6811 */
6812 if (recursed)
6813 return(KERN_FAILURE);
6814 kret = vfs_drt_alloc_map(cmapp);
6815 if (kret == KERN_SUCCESS) {
6816 /* now try to insert again */
6817 kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
6818 }
6819 return(kret);
6820}
6821
6822/*
6823 * Implementation of set dirty/clean.
6824 *
6825 * In the 'clean' case, not finding a map is OK.
6826 */
6827static kern_return_t
6828vfs_drt_do_mark_pages(
6829 void **private,
6830 u_int64_t offset,
6831 u_int length,
6832 u_int *setcountp,
6833 int dirty)
6834{
6835 struct vfs_drt_clustermap *cmap, **cmapp;
6836 kern_return_t kret;
6837 int i, index, pgoff, pgcount, setcount, ecount;
6838
6839 cmapp = (struct vfs_drt_clustermap **)private;
6840 cmap = *cmapp;
6841
6842 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
6843
6844 if (setcountp != NULL)
6845 *setcountp = 0;
6846
6847 /* allocate a cluster map if we don't already have one */
6848 if (cmap == NULL) {
6849 /* no cluster map, nothing to clean */
6850 if (!dirty) {
6851 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
6852 return(KERN_SUCCESS);
6853 }
6854 kret = vfs_drt_alloc_map(cmapp);
6855 if (kret != KERN_SUCCESS) {
6856 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
6857 return(kret);
6858 }
6859 }
6860 setcount = 0;
6861
6862 /*
6863 * Iterate over the length of the region.
6864 */
6865 while (length > 0) {
6866 /*
6867 * Get the hashtable index for this offset.
6868 *
6869 * XXX this will add blank entries if we are clearing a range
6870 * that hasn't been dirtied.
6871 */
6872 kret = vfs_drt_get_index(cmapp, offset, &index, 0);
6873 cmap = *cmapp; /* may have changed! */
6874 /* this may be a partial-success return */
6875 if (kret != KERN_SUCCESS) {
6876 if (setcountp != NULL)
6877 *setcountp = setcount;
6878 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);
6879
6880 return(kret);
6881 }
6882
6883 /*
6884 * Work out how many pages we're modifying in this
6885 * hashtable entry.
6886 */
6887 pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
6888 pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
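		/*
		 * Worked example (illustrative, assuming PAGE_SIZE == 4096 and
		 * DRT_BITVECTOR_PAGES == 64): for offset == 0x41000 and length == 0x5000,
		 * DRT_ALIGN_ADDRESS(offset) == 0x40000, so pgoff == 1 and
		 * pgcount == min(5, 63) == 5.
		 */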
6889
6890 /*
6891 * Iterate over pages, dirty/clearing as we go.
6892 */
6893 ecount = DRT_HASH_GET_COUNT(cmap, index);
6894 for (i = 0; i < pgcount; i++) {
6895 if (dirty) {
6896 if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
6897 if (ecount >= DRT_BITVECTOR_PAGES)
6898 panic("ecount >= DRT_BITVECTOR_PAGES, cmap = %p, index = %d, bit = %d", cmap, index, pgoff+i);
6899 DRT_HASH_SET_BIT(cmap, index, pgoff + i);
6900 ecount++;
6901 setcount++;
6902 }
6903 } else {
6904 if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
6905 if (ecount <= 0)
6906 panic("ecount <= 0, cmap = %p, index = %d, bit = %d", cmap, index, pgoff+i);
6907 assert(ecount > 0);
6908 DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
6909 ecount--;
6910 setcount++;
6911 }
6912 }
6913 }
6914 DRT_HASH_SET_COUNT(cmap, index, ecount);
6915
6916 offset += pgcount * PAGE_SIZE;
6917 length -= pgcount * PAGE_SIZE;
6918 }
6919 if (setcountp != NULL)
6920 *setcountp = setcount;
6921
6922 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);
6923
6924 return(KERN_SUCCESS);
6925}
6926
6927/*
6928 * Mark a set of pages as dirty/clean.
6929 *
6930 * This is a public interface.
6931 *
6932 * cmapp
6933 * Pointer to storage suitable for holding a pointer. Note that
6934 * this must either be NULL or a value set by this function.
6935 *
6936 * size
6937 * Current file size in bytes.
6938 *
6939 * offset
6940 * Offset of the first page to be marked as dirty, in bytes. Must be
6941 * page-aligned.
6942 *
6943 * length
6944 * Length of dirty region, in bytes. Must be a multiple of PAGE_SIZE.
6945 *
6946 * setcountp
6947 * Number of pages newly marked dirty by this call (optional).
6948 *
6949 * Returns KERN_SUCCESS if all the pages were successfully marked.
6950 */
6951static kern_return_t
6952vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp)
6953{
6954 /* XXX size unused, drop from interface */
6955 return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
6956}
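/*
 * Typical usage (sketch only; it mirrors sparse_cluster_add() and
 * sparse_cluster_push() above): a NULL-initialized map pointer is passed by
 * reference, pages are marked dirty as they are written, and clusters are
 * later drained and cleaned:
 *
 *	void	*scmap = NULL;
 *	off_t	off;
 *	u_int	len, new_dirty;
 *
 *	(void) vfs_drt_mark_pages(&scmap, offset, length, &new_dirty);
 *	while (vfs_drt_get_cluster(&scmap, &off, &len) == KERN_SUCCESS)
 *		... write out the pages covering [off, off + len) ...
 */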
6957
6958#if 0
6959static kern_return_t
6960vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
6961{
6962 return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
6963}
6964#endif
6965
6966/*
6967 * Get a cluster of dirty pages.
6968 *
6969 * This is a public interface.
6970 *
6971 * cmapp
6972 * Pointer to storage managed by drt_mark_pages. Note that this must
6973 * be NULL or a value set by drt_mark_pages.
6974 *
6975 * offsetp
6976 * Returns the byte offset into the file of the first page in the cluster.
6977 *
6978 * lengthp
6979 * Returns the length in bytes of the cluster of dirty pages.
6980 *
6981 * Returns success if a cluster was found. If KERN_FAILURE is returned, there
6982 * are no dirty pages meeting the minmum size criteria. Private storage will
 * are no dirty pages meeting the minimum size criteria.  Private storage will
 * be released if there are no more dirty pages left in the map.
6985 */
6986static kern_return_t
6987vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
6988{
6989 struct vfs_drt_clustermap *cmap;
6990 u_int64_t offset;
6991 u_int length;
6992 u_int32_t j;
6993 int index, i, fs, ls;
6994
6995 /* sanity */
6996 if ((cmapp == NULL) || (*cmapp == NULL))
6997 return(KERN_FAILURE);
6998 cmap = *cmapp;
6999
7000 /* walk the hashtable */
7001 for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
7002 index = DRT_HASH(cmap, offset);
7003
7004 if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
7005 continue;
7006
7007 /* scan the bitfield for a string of bits */
7008 fs = -1;
7009
7010 for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
7011 if (DRT_HASH_TEST_BIT(cmap, index, i)) {
7012 fs = i;
7013 break;
7014 }
7015 }
7016 if (fs == -1) {
7017 /* didn't find any bits set */
7018 panic("vfs_drt: entry summary count > 0 but no bits set in map, cmap = %p, index = %d, count = %lld",
7019 cmap, index, DRT_HASH_GET_COUNT(cmap, index));
7020 }
7021 for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
7022 if (!DRT_HASH_TEST_BIT(cmap, index, i))
7023 break;
7024 }
7025
7026 /* compute offset and length, mark pages clean */
7027 offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
7028 length = ls * PAGE_SIZE;
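		/*
		 * Illustrative example (assuming PAGE_SIZE == 4096): for a bucket based at
		 * 0x40000 with bits 3-6 set, fs == 3 and ls == 4, so we hand back
		 * offset == 0x43000 and length == 0x4000 and mark those pages clean.
		 */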
7029 vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
7030 cmap->scm_lastclean = index;
7031
7032 /* return successful */
7033 *offsetp = (off_t)offset;
7034 *lengthp = length;
7035
7036 vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
7037 return(KERN_SUCCESS);
7038 }
7039 /*
7040 * We didn't find anything... hashtable is empty
7041 * emit stats into trace buffer and
7042 * then free it
7043 */
7044 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
7045 cmap->scm_modulus,
7046 cmap->scm_buckets,
7047 cmap->scm_lastclean,
7048 cmap->scm_iskips);
7049
7050 vfs_drt_free_map(cmap);
7051 *cmapp = NULL;
7052
7053 return(KERN_FAILURE);
7054}
7055
7056
7057static kern_return_t
7058vfs_drt_control(void **cmapp, int op_type)
7059{
7060 struct vfs_drt_clustermap *cmap;
7061
7062 /* sanity */
7063 if ((cmapp == NULL) || (*cmapp == NULL))
7064 return(KERN_FAILURE);
7065 cmap = *cmapp;
7066
7067 switch (op_type) {
7068 case 0:
7069 /* emit stats into trace buffer */
7070 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
7071 cmap->scm_modulus,
7072 cmap->scm_buckets,
7073 cmap->scm_lastclean,
7074 cmap->scm_iskips);
7075
7076 vfs_drt_free_map(cmap);
7077 *cmapp = NULL;
7078 break;
7079
7080 case 1:
7081 cmap->scm_lastclean = 0;
7082 break;
7083 }
7084 return(KERN_SUCCESS);
7085}
7086
7087
7088
7089/*
7090 * Emit a summary of the state of the clustermap into the trace buffer
7091 * along with some caller-provided data.
7092 */
7093#if KDEBUG
7094static void
7095vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
7096{
7097 KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
7098}
7099#else
7100static void
7101vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
7102 __unused int arg1, __unused int arg2, __unused int arg3,
7103 __unused int arg4)
7104{
7105}
7106#endif
7107
7108#if 0
7109/*
7110 * Perform basic sanity check on the hash entry summary count
7111 * vs. the actual bits set in the entry.
7112 */
7113static void
7114vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
7115{
7116 int index, i;
7117 int bits_on;
7118
7119 for (index = 0; index < cmap->scm_modulus; index++) {
7120 if (DRT_HASH_VACANT(cmap, index))
7121 continue;
7122
7123 for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
7124 if (DRT_HASH_TEST_BIT(cmap, index, i))
7125 bits_on++;
7126 }
7127 if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
7128 panic("bits_on = %d, index = %d\n", bits_on, index);
7129 }
7130}
7131#endif
7132