1 | /* |
2 | * Copyright (c) 2000-2020 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ |
29 | /*- |
30 | * Copyright (c) 1994 Christopher G. Demetriou |
31 | * Copyright (c) 1982, 1986, 1989, 1993 |
32 | * The Regents of the University of California. All rights reserved. |
33 | * (c) UNIX System Laboratories, Inc. |
34 | * All or some portions of this file are derived from material licensed |
35 | * to the University of California by American Telephone and Telegraph |
36 | * Co. or Unix System Laboratories, Inc. and are reproduced herein with |
37 | * the permission of UNIX System Laboratories, Inc. |
38 | * |
39 | * Redistribution and use in source and binary forms, with or without |
40 | * modification, are permitted provided that the following conditions |
41 | * are met: |
42 | * 1. Redistributions of source code must retain the above copyright |
43 | * notice, this list of conditions and the following disclaimer. |
44 | * 2. Redistributions in binary form must reproduce the above copyright |
45 | * notice, this list of conditions and the following disclaimer in the |
46 | * documentation and/or other materials provided with the distribution. |
47 | * 3. All advertising materials mentioning features or use of this software |
48 | * must display the following acknowledgement: |
49 | * This product includes software developed by the University of |
50 | * California, Berkeley and its contributors. |
51 | * 4. Neither the name of the University nor the names of its contributors |
52 | * may be used to endorse or promote products derived from this software |
53 | * without specific prior written permission. |
54 | * |
55 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
56 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
57 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
58 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
59 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
60 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
61 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
62 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
63 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
64 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
65 | * SUCH DAMAGE. |
66 | * |
67 | * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94 |
68 | */ |
69 | |
70 | /* |
71 | * Some references: |
72 | * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986) |
73 | * Leffler, et al.: The Design and Implementation of the 4.3BSD |
74 | * UNIX Operating System (Addison Wesley, 1989) |
75 | */ |
76 | |
77 | #include <sys/param.h> |
78 | #include <sys/systm.h> |
79 | #include <sys/proc_internal.h> |
80 | #include <sys/buf_internal.h> |
81 | #include <sys/vnode_internal.h> |
82 | #include <sys/mount_internal.h> |
83 | #include <sys/trace.h> |
84 | #include <kern/kalloc.h> |
85 | #include <sys/resourcevar.h> |
86 | #include <miscfs/specfs/specdev.h> |
87 | #include <sys/ubc.h> |
88 | #include <sys/kauth.h> |
89 | #if DIAGNOSTIC |
90 | #include <kern/assert.h> |
91 | #endif /* DIAGNOSTIC */ |
92 | #include <kern/task.h> |
93 | #include <kern/zalloc.h> |
94 | #include <kern/locks.h> |
95 | #include <kern/thread.h> |
96 | |
97 | #include <sys/fslog.h> /* fslog_io_error() */ |
98 | #include <sys/disk.h> /* dk_error_description_t */ |
99 | |
100 | #include <mach/mach_types.h> |
101 | #include <mach/memory_object_types.h> |
102 | #include <kern/sched_prim.h> /* thread_block() */ |
103 | |
104 | #include <vm/vm_kern.h> |
105 | #include <vm/vm_pageout.h> |
106 | |
107 | #include <sys/kdebug.h> |
108 | |
109 | #include <libkern/OSAtomic.h> |
110 | #include <libkern/OSDebug.h> |
111 | #include <sys/ubc_internal.h> |
112 | |
113 | #include <sys/sdt.h> |
114 | |
115 | int bcleanbuf(buf_t bp, boolean_t discard); |
116 | static int brecover_data(buf_t bp); |
117 | static boolean_t incore(vnode_t vp, daddr64_t blkno); |
118 | /* timeout is in msecs */ |
119 | static buf_t getnewbuf(int slpflag, int slptimeo, int *queue); |
120 | static void bremfree_locked(buf_t bp); |
121 | static void buf_reassign(buf_t bp, vnode_t newvp); |
122 | static errno_t buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo); |
123 | static int buf_iterprepare(vnode_t vp, struct buflists *, int flags); |
124 | static void buf_itercomplete(vnode_t vp, struct buflists *, int flags); |
125 | static boolean_t buffer_cache_gc(int); |
126 | static buf_t buf_brelse_shadow(buf_t bp); |
127 | static void buf_free_meta_store(buf_t bp); |
128 | |
129 | static buf_t buf_create_shadow_internal(buf_t bp, boolean_t force_copy, |
130 | uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv); |
131 | |
132 | |
133 | int bdwrite_internal(buf_t, int); |
134 | |
135 | extern void disk_conditioner_delay(buf_t, int, int, uint64_t); |
136 | |
137 | /* zone allocated buffer headers */ |
138 | static void bcleanbuf_thread_init(void); |
139 | static void bcleanbuf_thread(void); |
140 | |
141 | static ZONE_DEFINE_TYPE(buf_hdr_zone, "buf headers", struct buf, ZC_NONE); |
142 | static int buf_hdr_count; |
143 | |
144 | |
145 | /* |
146 | * Definitions for the buffer hash lists. |
147 | */ |
148 | #define BUFHASH(dvp, lbn) \ |
149 | (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash]) |
150 | LIST_HEAD(bufhashhdr, buf) * bufhashtbl, invalhash; |
151 | u_long bufhash; |
152 | |
153 | static buf_t incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp); |
154 | |
155 | /* Definitions for the buffer stats. */ |
156 | struct bufstats bufstats; |
157 | |
158 | /* Number of delayed write buffers */ |
159 | long nbdwrite = 0; |
160 | int blaundrycnt = 0; |
161 | static int boot_nbuf_headers = 0; |
162 | |
163 | static TAILQ_HEAD(delayqueue, buf) delaybufqueue; |
164 | |
165 | static TAILQ_HEAD(ioqueue, buf) iobufqueue; |
166 | static TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES]; |
167 | static int needbuffer; |
168 | static int need_iobuffer; |
169 | |
170 | static LCK_GRP_DECLARE(buf_mtx_grp, "buffer cache"); |
171 | static LCK_ATTR_DECLARE(buf_mtx_attr, 0, 0); |
172 | static LCK_MTX_DECLARE_ATTR(iobuffer_mtxp, &buf_mtx_grp, &buf_mtx_attr); |
173 | static LCK_MTX_DECLARE_ATTR(buf_mtx, &buf_mtx_grp, &buf_mtx_attr); |
174 | static LCK_MTX_DECLARE_ATTR(buf_gc_callout, &buf_mtx_grp, &buf_mtx_attr); |
175 | |
176 | static uint32_t buf_busycount; |
177 | |
178 | #define FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE 16 |
179 | typedef struct { |
180 | void (* callout)(int, void *); |
181 | void *context; |
182 | } fs_buffer_cache_gc_callout_t; |
183 | |
184 | fs_buffer_cache_gc_callout_t fs_callouts[FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE] = { {NULL, NULL} }; |
185 | |
186 | static __inline__ int |
187 | buf_timestamp(void) |
188 | { |
189 | struct timeval t; |
190 | microuptime(&t); |
191 | return (int)t.tv_sec; |
192 | } |
193 | |
194 | /* |
195 | * Insq/Remq for the buffer free lists. |
196 | */ |
197 | #define binsheadfree(bp, dp, whichq) do { \ |
198 | TAILQ_INSERT_HEAD(dp, bp, b_freelist); \ |
199 | } while (0) |
200 | |
201 | #define binstailfree(bp, dp, whichq) do { \ |
202 | TAILQ_INSERT_TAIL(dp, bp, b_freelist); \ |
203 | } while (0) |
204 | |
205 | #define BHASHENTCHECK(bp) \ |
206 | if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef) \ |
207 | panic("%p: b_hash.le_prev is not deadbeef", (bp)); |
208 | |
209 | #define BLISTNONE(bp) \ |
210 | (bp)->b_hash.le_next = (struct buf *)0; \ |
211 | (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef; |
212 | |
213 | /* |
214 | * Insq/Remq for the vnode usage lists. |
215 | */ |
216 | #define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs) |
217 | #define bufremvn(bp) { \ |
218 | LIST_REMOVE(bp, b_vnbufs); \ |
219 | (bp)->b_vnbufs.le_next = NOLIST; \ |
220 | } |
221 | |
222 | /* |
223 | * Time in seconds before a buffer on a list is |
224 | * considered as a stale buffer |
225 | */ |
226 | #define LRU_IS_STALE 120 /* default value for the LRU */ |
227 | #define AGE_IS_STALE 60 /* default value for the AGE */ |
228 | #define META_IS_STALE 180 /* default value for the BQ_META */ |
229 | |
230 | int lru_is_stale = LRU_IS_STALE; |
231 | int age_is_stale = AGE_IS_STALE; |
232 | int meta_is_stale = META_IS_STALE; |
233 | |
234 | #define MAXLAUNDRY 10 |
235 | |
236 | /* LIST_INSERT_HEAD() with assertions */ |
237 | static __inline__ void |
238 | blistenterhead(struct bufhashhdr * head, buf_t bp) |
239 | { |
240 | if ((bp->b_hash.le_next = (head)->lh_first) != NULL) { |
241 | (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next; |
242 | } |
243 | (head)->lh_first = bp; |
244 | bp->b_hash.le_prev = &(head)->lh_first; |
245 | if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef) { |
246 | panic("blistenterhead: le_prev is deadbeef"); |
247 | } |
248 | } |
249 | |
250 | static __inline__ void |
251 | binshash(buf_t bp, struct bufhashhdr *dp) |
252 | { |
253 | #if DIAGNOSTIC |
254 | buf_t nbp; |
255 | #endif /* DIAGNOSTIC */ |
256 | |
257 | BHASHENTCHECK(bp); |
258 | |
259 | #if DIAGNOSTIC |
260 | nbp = dp->lh_first; |
261 | for (; nbp != NULL; nbp = nbp->b_hash.le_next) { |
262 | if (nbp == bp) { |
263 | panic("buf already in hashlist"); |
264 | } |
265 | } |
266 | #endif /* DIAGNOSTIC */ |
267 | |
268 | blistenterhead(dp, bp); |
269 | } |
270 | |
271 | static __inline__ void |
272 | bremhash(buf_t bp) |
273 | { |
274 | if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef) { |
275 | panic("bremhash le_prev is deadbeef"); |
276 | } |
277 | if (bp->b_hash.le_next == bp) { |
278 | panic("bremhash: next points to self"); |
279 | } |
280 | |
281 | if (bp->b_hash.le_next != NULL) { |
282 | bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev; |
283 | } |
284 | *bp->b_hash.le_prev = (bp)->b_hash.le_next; |
285 | } |
286 | |
287 | /* |
288 | * buf_mtx held. |
289 | */ |
290 | static __inline__ void |
291 | bmovelaundry(buf_t bp) |
292 | { |
293 | bp->b_whichq = BQ_LAUNDRY; |
294 | bp->b_timestamp = buf_timestamp(); |
295 | binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY); |
296 | blaundrycnt++; |
297 | } |
298 | |
299 | static __inline__ void |
300 | buf_release_credentials(buf_t bp) |
301 | { |
302 | if (IS_VALID_CRED(bp->b_rcred)) { |
303 | kauth_cred_unref(&bp->b_rcred); |
304 | } |
305 | if (IS_VALID_CRED(bp->b_wcred)) { |
306 | kauth_cred_unref(&bp->b_wcred); |
307 | } |
308 | } |
309 | |
310 | |
311 | int |
312 | buf_valid(buf_t bp) |
313 | { |
314 | if ((bp->b_flags & (B_DONE | B_DELWRI))) { |
315 | return 1; |
316 | } |
317 | return 0; |
318 | } |
319 | |
320 | int |
321 | buf_fromcache(buf_t bp) |
322 | { |
323 | if ((bp->b_flags & B_CACHE)) { |
324 | return 1; |
325 | } |
326 | return 0; |
327 | } |
328 | |
329 | void |
330 | buf_markinvalid(buf_t bp) |
331 | { |
332 | SET(bp->b_flags, B_INVAL); |
333 | } |
334 | |
335 | void |
336 | buf_markdelayed(buf_t bp) |
337 | { |
338 | if (!ISSET(bp->b_flags, B_DELWRI)) { |
339 | SET(bp->b_flags, B_DELWRI); |
340 | |
341 | OSAddAtomicLong(1, &nbdwrite); |
342 | buf_reassign(bp, bp->b_vp); |
343 | } |
344 | SET(bp->b_flags, B_DONE); |
345 | } |
346 | |
347 | void |
348 | buf_markclean(buf_t bp) |
349 | { |
350 | if (ISSET(bp->b_flags, B_DELWRI)) { |
351 | CLR(bp->b_flags, B_DELWRI); |
352 | |
353 | OSAddAtomicLong(-1, &nbdwrite); |
354 | buf_reassign(bp, bp->b_vp); |
355 | } |
356 | } |
357 | |
358 | void |
359 | buf_markeintr(buf_t bp) |
360 | { |
361 | SET(bp->b_flags, B_EINTR); |
362 | } |
363 | |
364 | |
365 | void |
366 | buf_markaged(buf_t bp) |
367 | { |
368 | SET(bp->b_flags, B_AGE); |
369 | } |
370 | |
371 | int |
372 | buf_fua(buf_t bp) |
373 | { |
374 | if ((bp->b_flags & B_FUA) == B_FUA) { |
375 | return 1; |
376 | } |
377 | return 0; |
378 | } |
379 | |
380 | void |
381 | buf_markfua(buf_t bp) |
382 | { |
383 | SET(bp->b_flags, B_FUA); |
384 | } |
385 | |
386 | #if CONFIG_PROTECT |
387 | cpx_t |
388 | bufattr_cpx(bufattr_t bap) |
389 | { |
390 | return bap->ba_cpx; |
391 | } |
392 | |
393 | void |
394 | bufattr_setcpx(bufattr_t bap, cpx_t cpx) |
395 | { |
396 | bap->ba_cpx = cpx; |
397 | } |
398 | |
399 | void |
400 | buf_setcpoff(buf_t bp, uint64_t foffset) |
401 | { |
402 | bp->b_attr.ba_cp_file_off = foffset; |
403 | } |
404 | |
405 | uint64_t |
406 | bufattr_cpoff(bufattr_t bap) |
407 | { |
408 | return bap->ba_cp_file_off; |
409 | } |
410 | |
411 | void |
412 | bufattr_setcpoff(bufattr_t bap, uint64_t foffset) |
413 | { |
414 | bap->ba_cp_file_off = foffset; |
415 | } |
416 | |
417 | #else /* !CONFIG_PROTECT */ |
418 | |
419 | uint64_t |
420 | bufattr_cpoff(bufattr_t bap __unused) |
421 | { |
422 | return 0; |
423 | } |
424 | |
425 | void |
426 | bufattr_setcpoff(__unused bufattr_t bap, __unused uint64_t foffset) |
427 | { |
428 | return; |
429 | } |
430 | |
431 | struct cpx * |
432 | bufattr_cpx(__unused bufattr_t bap) |
433 | { |
434 | return NULL; |
435 | } |
436 | |
437 | void |
438 | bufattr_setcpx(__unused bufattr_t bap, __unused struct cpx *cpx) |
439 | { |
440 | } |
441 | |
442 | #endif /* !CONFIG_PROTECT */ |
443 | |
444 | bufattr_t |
445 | bufattr_alloc(void) |
446 | { |
447 | return kalloc_type(struct bufattr, Z_WAITOK | Z_ZERO); |
448 | } |
449 | |
450 | void |
451 | bufattr_free(bufattr_t bap) |
452 | { |
453 | kfree_type(struct bufattr, bap); |
454 | } |
455 | |
456 | bufattr_t |
457 | bufattr_dup(bufattr_t bap) |
458 | { |
459 | bufattr_t new_bufattr; |
460 | new_bufattr = kalloc_type(struct bufattr, Z_WAITOK | Z_NOFAIL); |
461 | |
462 | /* Copy the provided one into the new copy */ |
463 | memcpy(new_bufattr, bap, sizeof(struct bufattr)); |
464 | return new_bufattr; |
465 | } |
466 | |
467 | int |
468 | bufattr_rawencrypted(bufattr_t bap) |
469 | { |
470 | if ((bap->ba_flags & BA_RAW_ENCRYPTED_IO)) { |
471 | return 1; |
472 | } |
473 | return 0; |
474 | } |
475 | |
476 | int |
477 | bufattr_throttled(bufattr_t bap) |
478 | { |
479 | return GET_BUFATTR_IO_TIER(bap); |
480 | } |
481 | |
482 | int |
483 | bufattr_passive(bufattr_t bap) |
484 | { |
485 | if ((bap->ba_flags & BA_PASSIVE)) { |
486 | return 1; |
487 | } |
488 | return 0; |
489 | } |
490 | |
491 | int |
492 | bufattr_nocache(bufattr_t bap) |
493 | { |
494 | if ((bap->ba_flags & BA_NOCACHE)) { |
495 | return 1; |
496 | } |
497 | return 0; |
498 | } |
499 | |
500 | int |
501 | bufattr_meta(bufattr_t bap) |
502 | { |
503 | if ((bap->ba_flags & BA_META)) { |
504 | return 1; |
505 | } |
506 | return 0; |
507 | } |
508 | |
509 | void |
510 | bufattr_markmeta(bufattr_t bap) |
511 | { |
512 | SET(bap->ba_flags, BA_META); |
513 | } |
514 | |
515 | int |
516 | bufattr_delayidlesleep(bufattr_t bap) |
517 | { |
518 | if ((bap->ba_flags & BA_DELAYIDLESLEEP)) { |
519 | return 1; |
520 | } |
521 | return 0; |
522 | } |
523 | |
524 | bufattr_t |
525 | buf_attr(buf_t bp) |
526 | { |
527 | return &bp->b_attr; |
528 | } |
529 | |
530 | void |
531 | buf_markstatic(buf_t bp __unused) |
532 | { |
533 | SET(bp->b_flags, B_STATICCONTENT); |
534 | } |
535 | |
536 | int |
537 | buf_static(buf_t bp) |
538 | { |
539 | if ((bp->b_flags & B_STATICCONTENT)) { |
540 | return 1; |
541 | } |
542 | return 0; |
543 | } |
544 | |
545 | void |
546 | bufattr_markgreedymode(bufattr_t bap) |
547 | { |
548 | SET(bap->ba_flags, BA_GREEDY_MODE); |
549 | } |
550 | |
551 | int |
552 | bufattr_greedymode(bufattr_t bap) |
553 | { |
554 | if ((bap->ba_flags & BA_GREEDY_MODE)) { |
555 | return 1; |
556 | } |
557 | return 0; |
558 | } |
559 | |
560 | void |
561 | bufattr_markisochronous(bufattr_t bap) |
562 | { |
563 | SET(bap->ba_flags, BA_ISOCHRONOUS); |
564 | } |
565 | |
566 | int |
567 | bufattr_isochronous(bufattr_t bap) |
568 | { |
569 | if ((bap->ba_flags & BA_ISOCHRONOUS)) { |
570 | return 1; |
571 | } |
572 | return 0; |
573 | } |
574 | |
575 | void |
576 | bufattr_markquickcomplete(bufattr_t bap) |
577 | { |
578 | SET(bap->ba_flags, BA_QUICK_COMPLETE); |
579 | } |
580 | |
581 | int |
582 | bufattr_quickcomplete(bufattr_t bap) |
583 | { |
584 | if ((bap->ba_flags & BA_QUICK_COMPLETE)) { |
585 | return 1; |
586 | } |
587 | return 0; |
588 | } |
589 | |
590 | void |
591 | bufattr_markioscheduled(bufattr_t bap) |
592 | { |
593 | SET(bap->ba_flags, BA_IO_SCHEDULED); |
594 | } |
595 | |
596 | |
597 | int |
598 | bufattr_ioscheduled(bufattr_t bap) |
599 | { |
600 | if ((bap->ba_flags & BA_IO_SCHEDULED)) { |
601 | return 1; |
602 | } |
603 | return 0; |
604 | } |
605 | |
606 | void |
607 | bufattr_markexpeditedmeta(bufattr_t bap) |
608 | { |
609 | SET(bap->ba_flags, BA_EXPEDITED_META_IO); |
610 | } |
611 | |
612 | int |
613 | bufattr_expeditedmeta(bufattr_t bap) |
614 | { |
615 | if ((bap->ba_flags & BA_EXPEDITED_META_IO)) { |
616 | return 1; |
617 | } |
618 | return 0; |
619 | } |
620 | |
621 | int |
622 | bufattr_willverify(bufattr_t bap) |
623 | { |
624 | if ((bap->ba_flags & BA_WILL_VERIFY)) { |
625 | return 1; |
626 | } |
627 | return 0; |
628 | } |
629 | |
630 | errno_t |
631 | buf_error(buf_t bp) |
632 | { |
633 | return bp->b_error; |
634 | } |
635 | |
636 | void |
637 | buf_seterror(buf_t bp, errno_t error) |
638 | { |
639 | if ((bp->b_error = error)) { |
640 | SET(bp->b_flags, B_ERROR); |
641 | } else { |
642 | CLR(bp->b_flags, B_ERROR); |
643 | } |
644 | } |
645 | |
646 | void |
647 | buf_setflags(buf_t bp, int32_t flags) |
648 | { |
649 | SET(bp->b_flags, (flags & BUF_X_WRFLAGS)); |
650 | } |
651 | |
652 | void |
653 | buf_clearflags(buf_t bp, int32_t flags) |
654 | { |
655 | CLR(bp->b_flags, (flags & BUF_X_WRFLAGS)); |
656 | } |
657 | |
658 | int32_t |
659 | buf_flags(buf_t bp) |
660 | { |
661 | return bp->b_flags & BUF_X_RDFLAGS; |
662 | } |
663 | |
664 | void |
665 | buf_reset(buf_t bp, int32_t io_flags) |
666 | { |
667 | CLR(bp->b_flags, (B_READ | B_WRITE | B_ERROR | B_DONE | B_INVAL | B_ASYNC | B_NOCACHE | B_FUA)); |
668 | SET(bp->b_flags, (io_flags & (B_ASYNC | B_READ | B_WRITE | B_NOCACHE))); |
669 | |
670 | bp->b_error = 0; |
671 | } |
672 | |
673 | uint32_t |
674 | buf_count(buf_t bp) |
675 | { |
676 | return bp->b_bcount; |
677 | } |
678 | |
679 | void |
680 | buf_setcount(buf_t bp, uint32_t bcount) |
681 | { |
682 | bp->b_bcount = bcount; |
683 | } |
684 | |
685 | uint32_t |
686 | buf_size(buf_t bp) |
687 | { |
688 | return bp->b_bufsize; |
689 | } |
690 | |
691 | void |
692 | buf_setsize(buf_t bp, uint32_t bufsize) |
693 | { |
694 | bp->b_bufsize = bufsize; |
695 | } |
696 | |
697 | uint32_t |
698 | buf_resid(buf_t bp) |
699 | { |
700 | return bp->b_resid; |
701 | } |
702 | |
703 | void |
704 | buf_setresid(buf_t bp, uint32_t resid) |
705 | { |
706 | bp->b_resid = resid; |
707 | } |
708 | |
709 | uint32_t |
710 | buf_dirtyoff(buf_t bp) |
711 | { |
712 | return bp->b_dirtyoff; |
713 | } |
714 | |
715 | uint32_t |
716 | buf_dirtyend(buf_t bp) |
717 | { |
718 | return bp->b_dirtyend; |
719 | } |
720 | |
721 | void |
722 | buf_setdirtyoff(buf_t bp, uint32_t dirtyoff) |
723 | { |
724 | bp->b_dirtyoff = dirtyoff; |
725 | } |
726 | |
727 | void |
728 | buf_setdirtyend(buf_t bp, uint32_t dirtyend) |
729 | { |
730 | bp->b_dirtyend = dirtyend; |
731 | } |
732 | |
733 | uintptr_t |
734 | buf_dataptr(buf_t bp) |
735 | { |
736 | return bp->b_datap; |
737 | } |
738 | |
739 | void |
740 | buf_setdataptr(buf_t bp, uintptr_t data) |
741 | { |
742 | bp->b_datap = data; |
743 | } |
744 | |
745 | vnode_t |
746 | buf_vnode(buf_t bp) |
747 | { |
748 | return bp->b_vp; |
749 | } |
750 | |
751 | void |
752 | buf_setvnode(buf_t bp, vnode_t vp) |
753 | { |
754 | bp->b_vp = vp; |
755 | } |
756 | |
757 | vnode_t |
758 | buf_vnop_vnode(buf_t bp) |
759 | { |
760 | return bp->b_vnop_vp ? bp->b_vnop_vp : bp->b_vp; |
761 | } |
762 | |
763 | void * |
764 | buf_callback(buf_t bp) |
765 | { |
766 | if (!(bp->b_flags & B_CALL)) { |
767 | return (void *) NULL; |
768 | } |
769 | |
770 | return (void *)bp->b_iodone; |
771 | } |
772 | |
773 | |
774 | errno_t |
775 | buf_setcallback(buf_t bp, void (*callback)(buf_t, void *), void *transaction) |
776 | { |
777 | assert(!ISSET(bp->b_flags, B_FILTER) && ISSET(bp->b_lflags, BL_BUSY)); |
778 | |
779 | if (callback) { |
780 | bp->b_flags |= (B_CALL | B_ASYNC); |
781 | } else { |
782 | bp->b_flags &= ~B_CALL; |
783 | } |
784 | bp->b_transaction = transaction; |
785 | bp->b_iodone = callback; |
786 | |
787 | return 0; |
788 | } |
789 | |
790 | errno_t |
791 | buf_setupl(buf_t bp, upl_t upl, uint32_t offset) |
792 | { |
793 | if (!(bp->b_lflags & BL_IOBUF)) { |
794 | return EINVAL; |
795 | } |
796 | |
797 | if (upl) { |
798 | bp->b_flags |= B_CLUSTER; |
799 | } else { |
800 | bp->b_flags &= ~B_CLUSTER; |
801 | } |
802 | bp->b_upl = upl; |
803 | bp->b_uploffset = offset; |
804 | |
805 | return 0; |
806 | } |
807 | |
808 | buf_t |
809 | buf_clone(buf_t bp, int io_offset, int io_size, void (*iodone)(buf_t, void *), void *arg) |
810 | { |
811 | buf_t io_bp; |
812 | int add1, add2; |
813 | |
814 | if (io_offset < 0 || io_size < 0) { |
815 | return NULL; |
816 | } |
817 | |
818 | if ((unsigned)(io_offset + io_size) > (unsigned)bp->b_bcount) { |
819 | return NULL; |
820 | } |
821 | |
822 | if (bp->b_flags & B_CLUSTER) { |
823 | if (io_offset && ((bp->b_uploffset + io_offset) & PAGE_MASK)) { |
824 | return NULL; |
825 | } |
826 | |
827 | if (os_add_overflow(io_offset, io_size, &add1) || os_add_overflow(add1, bp->b_uploffset, &add2)) { |
828 | return NULL; |
829 | } |
830 | if ((add2 & PAGE_MASK) && ((uint32_t)add1 < (uint32_t)bp->b_bcount)) { |
831 | return NULL; |
832 | } |
833 | } |
834 | io_bp = alloc_io_buf(bp->b_vp, 0); |
835 | |
836 | io_bp->b_flags = bp->b_flags & (B_COMMIT_UPL | B_META | B_PAGEIO | B_CLUSTER | B_PHYS | B_RAW | B_ASYNC | B_READ | B_FUA); |
837 | |
838 | if (iodone) { |
839 | io_bp->b_transaction = arg; |
840 | io_bp->b_iodone = iodone; |
841 | io_bp->b_flags |= B_CALL; |
842 | } |
843 | if (bp->b_flags & B_CLUSTER) { |
844 | io_bp->b_upl = bp->b_upl; |
845 | io_bp->b_uploffset = bp->b_uploffset + io_offset; |
846 | } else { |
847 | io_bp->b_datap = (uintptr_t)(((char *)bp->b_datap) + io_offset); |
848 | } |
849 | io_bp->b_bcount = io_size; |
850 | |
851 | return io_bp; |
852 | } |
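/*
 * Illustrative sketch (assumption, not part of this file): a caller that
 * needs to split a transfer, e.g. at a device or partition boundary, can use
 * buf_clone() to carve out a sub-range and issue it on its own.  The
 * completion callback split_done and context ctx are hypothetical names.
 *
 *	buf_t sub_bp = buf_clone(bp, 0, 4096, split_done, ctx);
 *	if (sub_bp != NULL) {
 *		buf_setblkno(sub_bp, buf_blkno(bp));	/ * physical block for this piece * /
 *		VNOP_STRATEGY(sub_bp);			/ * issue just the first 4KB * /
 *	}
 */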
853 | |
854 | |
855 | int |
856 | buf_shadow(buf_t bp) |
857 | { |
858 | if (bp->b_lflags & BL_SHADOW) { |
859 | return 1; |
860 | } |
861 | return 0; |
862 | } |
863 | |
864 | |
865 | buf_t |
866 | buf_create_shadow_priv(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg) |
867 | { |
868 | return buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 1); |
869 | } |
870 | |
871 | buf_t |
872 | buf_create_shadow(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg) |
873 | { |
874 | return buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 0); |
875 | } |
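/*
 * Illustrative sketch (assumption, not part of this file): a journaling layer
 * can take a shadow of a metadata buffer so a stable image can be pushed to
 * disk while the original continues to be modified.  The callback
 * jnl_shadow_done and context jnl are hypothetical names.
 *
 *	buf_t shadow_bp = buf_create_shadow(bp, FALSE, 0, jnl_shadow_done, jnl);
 *	if (shadow_bp != NULL) {
 *		VNOP_STRATEGY(shadow_bp);	/ * write out the stable copy * /
 *	}
 *
 * With force_copy == FALSE and no external storage the shadow shares the
 * original buffer's data; releasing the shadow via buf_brelse() drops the
 * b_shadow_ref taken below.
 */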
876 | |
877 | |
878 | static buf_t |
879 | buf_create_shadow_internal(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv) |
880 | { |
881 | buf_t io_bp; |
882 | |
883 | KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_START, bp, 0, 0, 0, 0); |
884 | |
885 | if (!(bp->b_flags & B_META) || (bp->b_lflags & BL_IOBUF)) { |
886 | KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, 0, 0, 0, 0); |
887 | return NULL; |
888 | } |
889 | #ifdef BUF_MAKE_PRIVATE |
890 | if (bp->b_shadow_ref && bp->b_data_ref == 0 && external_storage == 0) { |
891 | panic("buf_create_shadow: %p is in the private state (%d, %d)", bp, bp->b_shadow_ref, bp->b_data_ref); |
892 | } |
893 | #endif |
894 | io_bp = alloc_io_buf(bp->b_vp, priv); |
895 | |
896 | io_bp->b_flags = bp->b_flags & (B_META | B_ZALLOC | B_ASYNC | B_READ | B_FUA); |
897 | io_bp->b_blkno = bp->b_blkno; |
898 | io_bp->b_lblkno = bp->b_lblkno; |
899 | io_bp->b_lblksize = bp->b_lblksize; |
900 | |
901 | if (iodone) { |
902 | io_bp->b_transaction = arg; |
903 | io_bp->b_iodone = iodone; |
904 | io_bp->b_flags |= B_CALL; |
905 | } |
906 | if (force_copy == FALSE) { |
907 | io_bp->b_bcount = bp->b_bcount; |
908 | io_bp->b_bufsize = bp->b_bufsize; |
909 | |
910 | if (external_storage) { |
911 | io_bp->b_datap = external_storage; |
912 | #ifdef BUF_MAKE_PRIVATE |
913 | io_bp->b_data_store = NULL; |
914 | #endif |
915 | } else { |
916 | io_bp->b_datap = bp->b_datap; |
917 | #ifdef BUF_MAKE_PRIVATE |
918 | io_bp->b_data_store = bp; |
919 | #endif |
920 | } |
921 | *(buf_t *)(&io_bp->b_orig) = bp; |
922 | |
923 | lck_mtx_lock_spin(&buf_mtx); |
924 | |
925 | io_bp->b_lflags |= BL_SHADOW; |
926 | io_bp->b_shadow = bp->b_shadow; |
927 | bp->b_shadow = io_bp; |
928 | bp->b_shadow_ref++; |
929 | |
930 | #ifdef BUF_MAKE_PRIVATE |
931 | if (external_storage) { |
932 | io_bp->b_lflags |= BL_EXTERNAL; |
933 | } else { |
934 | bp->b_data_ref++; |
935 | } |
936 | #endif |
937 | lck_mtx_unlock(&buf_mtx); |
938 | } else { |
939 | if (external_storage) { |
940 | #ifdef BUF_MAKE_PRIVATE |
941 | io_bp->b_lflags |= BL_EXTERNAL; |
942 | #endif |
943 | io_bp->b_bcount = bp->b_bcount; |
944 | io_bp->b_bufsize = bp->b_bufsize; |
945 | io_bp->b_datap = external_storage; |
946 | } else { |
947 | allocbuf(io_bp, bp->b_bcount); |
948 | |
949 | io_bp->b_lflags |= BL_IOBUF_ALLOC; |
950 | } |
951 | bcopy((caddr_t)bp->b_datap, (caddr_t)io_bp->b_datap, bp->b_bcount); |
952 | |
953 | #ifdef BUF_MAKE_PRIVATE |
954 | io_bp->b_data_store = NULL; |
955 | #endif |
956 | } |
957 | KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, io_bp, 0); |
958 | |
959 | return io_bp; |
960 | } |
961 | |
962 | |
963 | #ifdef BUF_MAKE_PRIVATE |
964 | errno_t |
965 | buf_make_private(buf_t bp) |
966 | { |
967 | buf_t ds_bp; |
968 | buf_t t_bp; |
969 | struct buf my_buf; |
970 | |
971 | KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_START, bp, bp->b_shadow_ref, 0, 0, 0); |
972 | |
973 | if (bp->b_shadow_ref == 0 || bp->b_data_ref == 0 || ISSET(bp->b_lflags, BL_SHADOW)) { |
974 | KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0); |
975 | return EINVAL; |
976 | } |
977 | my_buf.b_flags = B_META; |
978 | my_buf.b_datap = (uintptr_t)NULL; |
979 | allocbuf(&my_buf, bp->b_bcount); |
980 | |
981 | bcopy((caddr_t)bp->b_datap, (caddr_t)my_buf.b_datap, bp->b_bcount); |
982 | |
983 | lck_mtx_lock_spin(&buf_mtx); |
984 | |
985 | for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) { |
986 | if (!ISSET(t_bp->b_lflags, BL_EXTERNAL)) { |
987 | break; |
988 | } |
989 | } |
990 | ds_bp = t_bp; |
991 | |
992 | if (ds_bp == NULL && bp->b_data_ref) { |
993 | panic("buf_make_private: b_data_ref != 0 && ds_bp == NULL"); |
994 | } |
995 | |
996 | if (ds_bp && (bp->b_data_ref == 0 || bp->b_shadow_ref == 0)) { |
997 | panic("buf_make_private: ref_count == 0 && ds_bp != NULL"); |
998 | } |
999 | |
1000 | if (ds_bp == NULL) { |
1001 | lck_mtx_unlock(&buf_mtx); |
1002 | |
1003 | buf_free_meta_store(&my_buf); |
1004 | |
1005 | KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0); |
1006 | return EINVAL; |
1007 | } |
1008 | for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) { |
1009 | if (!ISSET(t_bp->b_lflags, BL_EXTERNAL)) { |
1010 | t_bp->b_data_store = ds_bp; |
1011 | } |
1012 | } |
1013 | ds_bp->b_data_ref = bp->b_data_ref; |
1014 | |
1015 | bp->b_data_ref = 0; |
1016 | bp->b_datap = my_buf.b_datap; |
1017 | |
1018 | lck_mtx_unlock(&buf_mtx); |
1019 | |
1020 | KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, 0, 0); |
1021 | return 0; |
1022 | } |
1023 | #endif |
1024 | |
1025 | |
1026 | void |
1027 | buf_setfilter(buf_t bp, void (*filter)(buf_t, void *), void *transaction, |
1028 | void(**old_iodone)(buf_t, void *), void **old_transaction) |
1029 | { |
1030 | assert(ISSET(bp->b_lflags, BL_BUSY)); |
1031 | |
1032 | if (old_iodone) { |
1033 | *old_iodone = bp->b_iodone; |
1034 | } |
1035 | if (old_transaction) { |
1036 | *old_transaction = bp->b_transaction; |
1037 | } |
1038 | |
1039 | bp->b_transaction = transaction; |
1040 | bp->b_iodone = filter; |
1041 | if (filter) { |
1042 | bp->b_flags |= B_FILTER; |
1043 | } else { |
1044 | bp->b_flags &= ~B_FILTER; |
1045 | } |
1046 | } |
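/*
 * Illustrative sketch (assumption, not part of this file): a layer that wants
 * to observe I/O completion without disturbing the existing callback chain
 * saves the prior iodone/transaction pair and invokes it from its own filter.
 * The names my_filter and struct my_ctx are hypothetical.
 *
 *	static void
 *	my_filter(buf_t bp, void *arg)
 *	{
 *		struct my_ctx *ctx = arg;
 *
 *		... account for the completed I/O ...
 *		if (ctx->prev_iodone) {
 *			ctx->prev_iodone(bp, ctx->prev_transaction);
 *		}
 *	}
 *
 *	buf_setfilter(bp, my_filter, ctx, &ctx->prev_iodone, &ctx->prev_transaction);
 *
 * The caller must hold the buffer busy (BL_BUSY), per the assert above.
 */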
1047 | |
1048 | |
1049 | daddr64_t |
1050 | buf_blkno(buf_t bp) |
1051 | { |
1052 | return bp->b_blkno; |
1053 | } |
1054 | |
1055 | daddr64_t |
1056 | buf_lblkno(buf_t bp) |
1057 | { |
1058 | return bp->b_lblkno; |
1059 | } |
1060 | |
1061 | uint32_t |
1062 | buf_lblksize(buf_t bp) |
1063 | { |
1064 | return bp->b_lblksize; |
1065 | } |
1066 | |
1067 | void |
1068 | buf_setblkno(buf_t bp, daddr64_t blkno) |
1069 | { |
1070 | bp->b_blkno = blkno; |
1071 | } |
1072 | |
1073 | void |
1074 | buf_setlblkno(buf_t bp, daddr64_t lblkno) |
1075 | { |
1076 | bp->b_lblkno = lblkno; |
1077 | } |
1078 | |
1079 | void |
1080 | buf_setlblksize(buf_t bp, uint32_t lblksize) |
1081 | { |
1082 | bp->b_lblksize = lblksize; |
1083 | } |
1084 | |
1085 | dev_t |
1086 | buf_device(buf_t bp) |
1087 | { |
1088 | return bp->b_dev; |
1089 | } |
1090 | |
1091 | errno_t |
1092 | buf_setdevice(buf_t bp, vnode_t vp) |
1093 | { |
1094 | if ((vp->v_type != VBLK) && (vp->v_type != VCHR)) { |
1095 | return EINVAL; |
1096 | } |
1097 | bp->b_dev = vp->v_rdev; |
1098 | |
1099 | return 0; |
1100 | } |
1101 | |
1102 | |
1103 | void * |
1104 | buf_drvdata(buf_t bp) |
1105 | { |
1106 | return bp->b_drvdata; |
1107 | } |
1108 | |
1109 | void |
1110 | buf_setdrvdata(buf_t bp, void *drvdata) |
1111 | { |
1112 | bp->b_drvdata = drvdata; |
1113 | } |
1114 | |
1115 | void * |
1116 | buf_fsprivate(buf_t bp) |
1117 | { |
1118 | return bp->b_fsprivate; |
1119 | } |
1120 | |
1121 | void |
1122 | buf_setfsprivate(buf_t bp, void *fsprivate) |
1123 | { |
1124 | bp->b_fsprivate = fsprivate; |
1125 | } |
1126 | |
1127 | kauth_cred_t |
1128 | buf_rcred(buf_t bp) |
1129 | { |
1130 | return bp->b_rcred; |
1131 | } |
1132 | |
1133 | kauth_cred_t |
1134 | buf_wcred(buf_t bp) |
1135 | { |
1136 | return bp->b_wcred; |
1137 | } |
1138 | |
1139 | void * |
1140 | buf_upl(buf_t bp) |
1141 | { |
1142 | return bp->b_upl; |
1143 | } |
1144 | |
1145 | uint32_t |
1146 | buf_uploffset(buf_t bp) |
1147 | { |
1148 | return (uint32_t)(bp->b_uploffset); |
1149 | } |
1150 | |
1151 | proc_t |
1152 | buf_proc(buf_t bp) |
1153 | { |
1154 | return bp->b_proc; |
1155 | } |
1156 | |
1157 | |
1158 | static errno_t |
1159 | buf_map_range_internal(buf_t bp, caddr_t *io_addr, boolean_t legacymode, |
1160 | vm_prot_t prot) |
1161 | { |
1162 | buf_t real_bp; |
1163 | vm_offset_t vaddr; |
1164 | kern_return_t kret; |
1165 | |
1166 | if (!(bp->b_flags & B_CLUSTER)) { |
1167 | *io_addr = (caddr_t)bp->b_datap; |
1168 | return 0; |
1169 | } |
1170 | real_bp = (buf_t)(bp->b_real_bp); |
1171 | |
1172 | if (real_bp && real_bp->b_datap) { |
1173 | /* |
1174 | * b_real_bp is only valid if B_CLUSTER is SET |
1175 | * if it's non-zero, then someone did a cluster_bp call; |
1176 | * if the backing physical pages were already mapped |
1177 | * in before the call to cluster_bp (non-zero b_datap), |
1178 | * then we just use that mapping |
1179 | */ |
1180 | *io_addr = (caddr_t)real_bp->b_datap; |
1181 | return 0; |
1182 | } |
1183 | |
1184 | if (legacymode) { |
1185 | kret = ubc_upl_map(bp->b_upl, &vaddr); /* Map it in */ |
1186 | if (kret == KERN_SUCCESS) { |
1187 | vaddr += bp->b_uploffset; |
1188 | } |
1189 | } else { |
1190 | kret = ubc_upl_map_range(bp->b_upl, bp->b_uploffset, bp->b_bcount, prot, &vaddr); /* Map it in */ |
1191 | } |
1192 | |
1193 | if (kret != KERN_SUCCESS) { |
1194 | *io_addr = NULL; |
1195 | |
1196 | return ENOMEM; |
1197 | } |
1198 | |
1199 | *io_addr = (caddr_t)vaddr; |
1200 | |
1201 | return 0; |
1202 | } |
1203 | |
1204 | errno_t |
1205 | buf_map_range(buf_t bp, caddr_t *io_addr) |
1206 | { |
1207 | return buf_map_range_internal(bp, io_addr, false, VM_PROT_DEFAULT); |
1208 | } |
1209 | |
1210 | errno_t |
1211 | buf_map_range_with_prot(buf_t bp, caddr_t *io_addr, vm_prot_t prot) |
1212 | { |
1213 | /* Only VM_PROT_READ and/or VM_PROT_WRITE is allowed. */ |
1214 | prot &= (VM_PROT_READ | VM_PROT_WRITE); |
1215 | if (prot == VM_PROT_NONE) { |
1216 | *io_addr = NULL; |
1217 | return EINVAL; |
1218 | } |
1219 | |
1220 | return buf_map_range_internal(bp, io_addr, false, prot); |
1221 | } |
1222 | |
1223 | errno_t |
1224 | buf_map(buf_t bp, caddr_t *io_addr) |
1225 | { |
1226 | return buf_map_range_internal(bp, io_addr, true, VM_PROT_DEFAULT); |
1227 | } |
1228 | |
1229 | static errno_t |
1230 | buf_unmap_range_internal(buf_t bp, boolean_t legacymode) |
1231 | { |
1232 | buf_t real_bp; |
1233 | kern_return_t kret; |
1234 | |
1235 | if (!(bp->b_flags & B_CLUSTER)) { |
1236 | return 0; |
1237 | } |
1238 | /* |
1239 | * see buf_map for the explanation |
1240 | */ |
1241 | real_bp = (buf_t)(bp->b_real_bp); |
1242 | |
1243 | if (real_bp && real_bp->b_datap) { |
1244 | return 0; |
1245 | } |
1246 | |
1247 | if ((bp->b_lflags & BL_IOBUF) && |
1248 | ((bp->b_flags & (B_PAGEIO | B_READ)) != (B_PAGEIO | B_READ))) { |
1249 | /* |
1250 | * ignore pageins... the 'right' thing will |
1251 | * happen due to the way we handle speculative |
1252 | * clusters... |
1253 | * |
1254 | * when we commit these pages, we'll hit |
1255 | * it with UPL_COMMIT_INACTIVE which |
1256 | * will clear the reference bit that got |
1257 | * turned on when we touched the mapping |
1258 | */ |
1259 | bp->b_flags |= B_AGE; |
1260 | } |
1261 | |
1262 | if (legacymode) { |
1263 | kret = ubc_upl_unmap(bp->b_upl); |
1264 | } else { |
1265 | kret = ubc_upl_unmap_range(bp->b_upl, bp->b_uploffset, bp->b_bcount); |
1266 | } |
1267 | |
1268 | if (kret != KERN_SUCCESS) { |
1269 | return EINVAL; |
1270 | } |
1271 | return 0; |
1272 | } |
1273 | |
1274 | errno_t |
1275 | buf_unmap_range(buf_t bp) |
1276 | { |
1277 | return buf_unmap_range_internal(bp, false); |
1278 | } |
1279 | |
1280 | errno_t |
1281 | buf_unmap(buf_t bp) |
1282 | { |
1283 | return buf_unmap_range_internal(bp, true); |
1284 | } |
1285 | |
1286 | |
1287 | void |
1288 | buf_clear(buf_t bp) |
1289 | { |
1290 | caddr_t baddr; |
1291 | |
1292 | if (buf_map(bp, &baddr) == 0) { |
1293 | bzero(baddr, bp->b_bcount); |
1294 | buf_unmap(bp); |
1295 | } |
1296 | bp->b_resid = 0; |
1297 | } |
1298 | |
1299 | /* |
1300 | * Read or write a buffer that is not contiguous on disk. |
1301 | * buffer is marked done/error at the conclusion |
1302 | */ |
1303 | static int |
1304 | buf_strategy_fragmented(vnode_t devvp, buf_t bp, off_t f_offset, size_t contig_bytes) |
1305 | { |
1306 | vnode_t vp = buf_vnode(bp); |
1307 | buf_t io_bp; /* For reading or writing a single block */ |
1308 | int io_direction; |
1309 | int io_resid; |
1310 | size_t io_contig_bytes; |
1311 | daddr64_t io_blkno; |
1312 | int error = 0; |
1313 | int bmap_flags; |
1314 | |
1315 | /* |
1316 | * save our starting point... the bp was already mapped |
1317 | * in buf_strategy before we got called |
1318 | * no sense doing it again. |
1319 | */ |
1320 | io_blkno = bp->b_blkno; |
1321 | /* |
1322 | * Make sure we redo this mapping for the next I/O |
1323 | * i.e. this can never be a 'permanent' mapping |
1324 | */ |
1325 | bp->b_blkno = bp->b_lblkno; |
1326 | |
1327 | /* |
1328 | * Get an io buffer to do the deblocking |
1329 | */ |
1330 | io_bp = alloc_io_buf(devvp, 0); |
1331 | |
1332 | io_bp->b_lblkno = bp->b_lblkno; |
1333 | io_bp->b_lblksize = bp->b_lblksize; |
1334 | io_bp->b_datap = bp->b_datap; |
1335 | io_resid = bp->b_bcount; |
1336 | io_direction = bp->b_flags & B_READ; |
1337 | io_contig_bytes = contig_bytes; |
1338 | |
1339 | if (bp->b_flags & B_READ) { |
1340 | bmap_flags = VNODE_READ; |
1341 | } else { |
1342 | bmap_flags = VNODE_WRITE; |
1343 | } |
1344 | |
1345 | for (;;) { |
1346 | if (io_blkno == -1) { |
1347 | /* |
1348 | * this is unexpected, but we'll allow for it |
1349 | */ |
1350 | bzero((caddr_t)io_bp->b_datap, (int)io_contig_bytes); |
1351 | } else { |
1352 | io_bp->b_bcount = (uint32_t)io_contig_bytes; |
1353 | io_bp->b_bufsize = (uint32_t)io_contig_bytes; |
1354 | io_bp->b_resid = (uint32_t)io_contig_bytes; |
1355 | io_bp->b_blkno = io_blkno; |
1356 | |
1357 | buf_reset(io_bp, io_direction); |
1358 | |
1359 | /* |
1360 | * Call the device to do the I/O and wait for it. Make sure the appropriate party is charged for write |
1361 | */ |
1362 | |
1363 | if (!ISSET(bp->b_flags, B_READ)) { |
1364 | OSAddAtomic(1, &devvp->v_numoutput); |
1365 | } |
1366 | |
1367 | if ((error = VNOP_STRATEGY(io_bp))) { |
1368 | break; |
1369 | } |
1370 | if ((error = (int)buf_biowait(io_bp))) { |
1371 | break; |
1372 | } |
1373 | if (io_bp->b_resid) { |
1374 | io_resid -= (io_contig_bytes - io_bp->b_resid); |
1375 | break; |
1376 | } |
1377 | } |
1378 | if ((io_resid -= io_contig_bytes) == 0) { |
1379 | break; |
1380 | } |
1381 | f_offset += io_contig_bytes; |
1382 | io_bp->b_datap += io_contig_bytes; |
1383 | |
1384 | /* |
1385 | * Map the current position to a physical block number |
1386 | */ |
1387 | if ((error = VNOP_BLOCKMAP(vp, f_offset, io_resid, &io_blkno, &io_contig_bytes, NULL, bmap_flags, NULL))) { |
1388 | break; |
1389 | } |
1390 | } |
1391 | buf_free(io_bp); |
1392 | |
1393 | if (error) { |
1394 | buf_seterror(bp, error); |
1395 | } |
1396 | bp->b_resid = io_resid; |
1397 | /* |
1398 | * This I/O is now complete |
1399 | */ |
1400 | buf_biodone(bp); |
1401 | |
1402 | return error; |
1403 | } |
1404 | |
1405 | |
1406 | /* |
1407 | * struct vnop_strategy_args { |
1408 | * struct buf *a_bp; |
1409 | * } *ap; |
1410 | */ |
1411 | errno_t |
1412 | buf_strategy(vnode_t devvp, void *ap) |
1413 | { |
1414 | buf_t bp = ((struct vnop_strategy_args *)ap)->a_bp; |
1415 | vnode_t vp = bp->b_vp; |
1416 | int bmap_flags; |
1417 | errno_t error; |
1418 | #if CONFIG_DTRACE |
1419 | int dtrace_io_start_flag = 0; /* We only want to trip the io:::start |
1420 | * probe once, with the true physical |
1421 | * block in place (b_blkno) |
1422 | */ |
1423 | |
1424 | #endif |
1425 | |
1426 | if (vp == NULL || vp->v_type == VCHR || vp->v_type == VBLK) { |
1427 | panic("buf_strategy: b_vp == NULL || vtype == VCHR | VBLK"); |
1428 | } |
1429 | /* |
1430 | * associate the physical device with |
1431 | * this buf_t even if we don't |
1432 | * end up issuing the I/O... |
1433 | */ |
1434 | bp->b_dev = devvp->v_rdev; |
1435 | |
1436 | if (bp->b_flags & B_READ) { |
1437 | bmap_flags = VNODE_READ; |
1438 | } else { |
1439 | bmap_flags = VNODE_WRITE; |
1440 | } |
1441 | |
1442 | if (!(bp->b_flags & B_CLUSTER)) { |
1443 | if ((bp->b_upl)) { |
1444 | /* |
1445 | * we have a UPL associated with this bp |
1446 | * go through cluster_bp which knows how |
1447 | * to deal with filesystem block sizes |
1448 | * that aren't equal to the page size |
1449 | */ |
1450 | DTRACE_IO1(start, buf_t, bp); |
1451 | return cluster_bp(bp); |
1452 | } |
1453 | if (bp->b_blkno == bp->b_lblkno) { |
1454 | off_t f_offset; |
1455 | size_t contig_bytes; |
1456 | |
1457 | if (bp->b_lblksize && bp->b_lblkno >= 0) { |
1458 | f_offset = bp->b_lblkno * bp->b_lblksize; |
1459 | } else if ((error = VNOP_BLKTOOFF(vp, bp->b_lblkno, &f_offset))) { |
1460 | DTRACE_IO1(start, buf_t, bp); |
1461 | buf_seterror(bp, error); |
1462 | buf_biodone(bp); |
1463 | |
1464 | return error; |
1465 | } |
1466 | |
1467 | if ((error = VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))) { |
1468 | DTRACE_IO1(start, buf_t, bp); |
1469 | buf_seterror(bp, error); |
1470 | buf_biodone(bp); |
1471 | |
1472 | return error; |
1473 | } |
1474 | |
1475 | DTRACE_IO1(start, buf_t, bp); |
1476 | #if CONFIG_DTRACE |
1477 | dtrace_io_start_flag = 1; |
1478 | #endif /* CONFIG_DTRACE */ |
1479 | |
1480 | if ((bp->b_blkno == -1) || (contig_bytes == 0)) { |
1481 | /* Set block number to force biodone later */ |
1482 | bp->b_blkno = -1; |
1483 | buf_clear(bp); |
1484 | } else if (contig_bytes < (size_t)bp->b_bcount) { |
1485 | return buf_strategy_fragmented(devvp, bp, f_offset, contig_bytes); |
1486 | } |
1487 | } |
1488 | |
1489 | #if CONFIG_DTRACE |
1490 | if (dtrace_io_start_flag == 0) { |
1491 | DTRACE_IO1(start, buf_t, bp); |
1492 | dtrace_io_start_flag = 1; |
1493 | } |
1494 | #endif /* CONFIG_DTRACE */ |
1495 | |
1496 | if (bp->b_blkno == -1) { |
1497 | buf_biodone(bp); |
1498 | return 0; |
1499 | } |
1500 | } |
1501 | |
1502 | #if CONFIG_DTRACE |
1503 | if (dtrace_io_start_flag == 0) { |
1504 | DTRACE_IO1(start, buf_t, bp); |
1505 | } |
1506 | #endif /* CONFIG_DTRACE */ |
1507 | |
1508 | #if CONFIG_PROTECT |
1509 | /* Capture f_offset in the bufattr*/ |
1510 | cpx_t cpx = bufattr_cpx(buf_attr(bp)); |
1511 | if (cpx) { |
1512 | /* No need to go here for older EAs */ |
1513 | if (cpx_use_offset_for_iv(cpx) && !cpx_synthetic_offset_for_iv(cpx)) { |
1514 | off_t f_offset; |
1515 | |
1516 | /* |
1517 | * this assert should be changed if cluster_io ever |
1518 | * changes its logical block size. |
1519 | */ |
1520 | assert((bp->b_lblksize == CLUSTER_IO_BLOCK_SIZE) || !(bp->b_flags & B_CLUSTER)); |
1521 | |
1522 | if (bp->b_lblksize && bp->b_lblkno >= 0) { |
1523 | f_offset = bp->b_lblkno * bp->b_lblksize; |
1524 | } else if ((error = VNOP_BLKTOOFF(bp->b_vp, bp->b_lblkno, &f_offset))) { |
1525 | return error; |
1526 | } |
1527 | |
1528 | /* |
1529 | * Attach the file offset to this buffer. The |
1530 | * bufattr attributes will be passed down the stack |
1531 | * until they reach the storage driver (whether |
1532 | * IOFlashStorage, ASP, or IONVMe). The driver |
1533 | * will retain the offset in a local variable when it |
1534 | * issues its I/Os to the NAND controller. |
1535 | * |
1536 | * Note that LwVM may end up splitting this I/O |
1537 | * into sub-I/Os if it crosses a chunk boundary. In this |
1538 | * case, LwVM will update this field when it dispatches |
1539 | * each I/O to IOFlashStorage. But from our perspective |
1540 | * we have only issued a single I/O. |
1541 | * |
1542 | * In the case of APFS we do not bounce through another |
1543 | * intermediate layer (such as CoreStorage). APFS will |
1544 | * issue the I/Os directly to the block device / IOMedia |
1545 | * via buf_strategy on the specfs node. |
1546 | */ |
1547 | buf_setcpoff(bp, f_offset); |
1548 | CP_DEBUG((CPDBG_OFFSET_IO | DBG_FUNC_NONE), (uint32_t) f_offset, (uint32_t) bp->b_lblkno, (uint32_t) bp->b_blkno, (uint32_t) bp->b_bcount, 0); |
1549 | } |
1550 | } |
1551 | #endif |
1552 | |
1553 | /* |
1554 | * we can issue the I/O because... |
1555 | * either B_CLUSTER is set which |
1556 | * means that the I/O is properly set |
1557 | * up to be a multiple of the page size, or |
1558 | * we were able to successfully set up the |
1559 | * physical block mapping |
1560 | */ |
1561 | bp->b_vnop_vp = devvp; |
1562 | error = VOCALL(devvp->v_op, VOFFSET(vnop_strategy), ap); |
1563 | bp->b_vnop_vp = NULLVP; |
1564 | DTRACE_FSINFO(strategy, vnode_t, vp); |
1565 | return error; |
1566 | } |
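/*
 * Illustrative sketch (assumption, not part of this file): a filesystem's own
 * vnop_strategy entry point typically just forwards the request here with the
 * vnode of the underlying block device, letting buf_strategy() resolve the
 * logical-to-physical mapping and hand the buffer to the device.  MYFS_DEVVP
 * is a hypothetical accessor.
 *
 *	static int
 *	myfs_vnop_strategy(struct vnop_strategy_args *ap)
 *	{
 *		vnode_t devvp = MYFS_DEVVP(buf_vnode(ap->a_bp));
 *
 *		return buf_strategy(devvp, ap);
 *	}
 */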
1567 | |
1568 | |
1569 | |
1570 | buf_t |
1571 | buf_alloc(vnode_t vp) |
1572 | { |
1573 | return alloc_io_buf(vp, is_vm_privileged()); |
1574 | } |
1575 | |
1576 | void |
1577 | buf_free(buf_t bp) |
1578 | { |
1579 | free_io_buf(bp); |
1580 | } |
1581 | |
1582 | |
1583 | /* |
1584 | * iterate buffers for the specified vp. |
1585 | * if BUF_SCAN_DIRTY is set, do the dirty list |
1586 | * if BUF_SCAN_CLEAN is set, do the clean list |
1587 | * if neither flag is set, default to BUF_SCAN_DIRTY |
1588 | * if BUF_NOTIFY_BUSY is set, call the callout function using a NULL bp for busy pages |
1589 | */ |
1590 | |
1591 | struct buf_iterate_info_t { |
1592 | int flag; |
1593 | struct buflists *listhead; |
1594 | }; |
1595 | |
1596 | void |
1597 | buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg) |
1598 | { |
1599 | buf_t bp; |
1600 | int retval; |
1601 | struct buflists local_iterblkhd; |
1602 | int lock_flags = BAC_NOWAIT | BAC_REMOVE; |
1603 | int notify_busy = flags & BUF_NOTIFY_BUSY; |
1604 | struct buf_iterate_info_t list[2]; |
1605 | int num_lists, i; |
1606 | |
1607 | if (flags & BUF_SKIP_LOCKED) { |
1608 | lock_flags |= BAC_SKIP_LOCKED; |
1609 | } |
1610 | if (flags & BUF_SKIP_NONLOCKED) { |
1611 | lock_flags |= BAC_SKIP_NONLOCKED; |
1612 | } |
1613 | |
1614 | if (!(flags & (BUF_SCAN_DIRTY | BUF_SCAN_CLEAN))) { |
1615 | flags |= BUF_SCAN_DIRTY; |
1616 | } |
1617 | |
1618 | num_lists = 0; |
1619 | |
1620 | if (flags & BUF_SCAN_DIRTY) { |
1621 | list[num_lists].flag = VBI_DIRTY; |
1622 | list[num_lists].listhead = &vp->v_dirtyblkhd; |
1623 | num_lists++; |
1624 | } |
1625 | if (flags & BUF_SCAN_CLEAN) { |
1626 | list[num_lists].flag = VBI_CLEAN; |
1627 | list[num_lists].listhead = &vp->v_cleanblkhd; |
1628 | num_lists++; |
1629 | } |
1630 | |
1631 | for (i = 0; i < num_lists; i++) { |
1632 | lck_mtx_lock(&buf_mtx); |
1633 | |
1634 | if (buf_iterprepare(vp, &local_iterblkhd, list[i].flag)) { |
1635 | lck_mtx_unlock(&buf_mtx); |
1636 | continue; |
1637 | } |
1638 | while (!LIST_EMPTY(&local_iterblkhd)) { |
1639 | bp = LIST_FIRST(&local_iterblkhd); |
1640 | LIST_REMOVE(bp, b_vnbufs); |
1641 | LIST_INSERT_HEAD(list[i].listhead, bp, b_vnbufs); |
1642 | |
1643 | if (buf_acquire_locked(bp, lock_flags, 0, 0)) { |
1644 | if (notify_busy) { |
1645 | bp = NULL; |
1646 | } else { |
1647 | continue; |
1648 | } |
1649 | } |
1650 | |
1651 | lck_mtx_unlock(&buf_mtx); |
1652 | |
1653 | retval = callout(bp, arg); |
1654 | |
1655 | switch (retval) { |
1656 | case BUF_RETURNED: |
1657 | if (bp) { |
1658 | buf_brelse(bp); |
1659 | } |
1660 | break; |
1661 | case BUF_CLAIMED: |
1662 | break; |
1663 | case BUF_RETURNED_DONE: |
1664 | if (bp) { |
1665 | buf_brelse(bp); |
1666 | } |
1667 | lck_mtx_lock(&buf_mtx); |
1668 | goto out; |
1669 | case BUF_CLAIMED_DONE: |
1670 | lck_mtx_lock(&buf_mtx); |
1671 | goto out; |
1672 | } |
1673 | lck_mtx_lock(&buf_mtx); |
1674 | } /* while list has more nodes */ |
1675 | out: |
1676 | buf_itercomplete(vp, &local_iterblkhd, list[i].flag); |
1677 | lck_mtx_unlock(&buf_mtx); |
1678 | } /* for each list */ |
1679 | } /* buf_iterate */ |
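/*
 * Illustrative sketch (assumption, not part of this file): a typical caller
 * walks the dirty list and decides per buffer whether to start a write or
 * leave it alone; the return value tells buf_iterate() who now owns the
 * buffer.  flush_one is a hypothetical callback.
 *
 *	static int
 *	flush_one(buf_t bp, void *arg __unused)
 *	{
 *		if (bp == NULL) {
 *			return BUF_CLAIMED;	/ * busy buffer, only seen with BUF_NOTIFY_BUSY * /
 *		}
 *		if (buf_flags(bp) & B_DELWRI) {
 *			buf_bawrite(bp);	/ * async write; callback now owns the buffer * /
 *			return BUF_CLAIMED;
 *		}
 *		return BUF_RETURNED;		/ * buf_iterate() will buf_brelse() it * /
 *	}
 *
 *	buf_iterate(vp, flush_one, BUF_SCAN_DIRTY | BUF_SKIP_LOCKED, NULL);
 */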
1680 | |
1681 | |
1682 | /* |
1683 | * Flush out and invalidate all buffers associated with a vnode. |
1684 | */ |
1685 | int |
1686 | buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo) |
1687 | { |
1688 | buf_t bp; |
1689 | int aflags; |
1690 | int error = 0; |
1691 | int must_rescan = 1; |
1692 | struct buflists local_iterblkhd; |
1693 | |
1694 | |
1695 | if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd)) { |
1696 | return 0; |
1697 | } |
1698 | |
1699 | lck_mtx_lock(&buf_mtx); |
1700 | |
1701 | for (;;) { |
1702 | if (must_rescan == 0) { |
1703 | /* |
1704 | * the lists may not be empty, but all that's left at this |
1705 | * point are metadata or B_LOCKED buffers which are being |
1706 | * skipped... we know this because we made it through both |
1707 | * the clean and dirty lists without dropping buf_mtx... |
1708 | * each time we drop buf_mtx we bump "must_rescan" |
1709 | */ |
1710 | break; |
1711 | } |
1712 | if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd)) { |
1713 | break; |
1714 | } |
1715 | must_rescan = 0; |
1716 | /* |
1717 | * iterate the clean list |
1718 | */ |
1719 | if (buf_iterprepare(vp, &local_iterblkhd, VBI_CLEAN)) { |
1720 | goto try_dirty_list; |
1721 | } |
1722 | while (!LIST_EMPTY(&local_iterblkhd)) { |
1723 | bp = LIST_FIRST(&local_iterblkhd); |
1724 | |
1725 | LIST_REMOVE(bp, b_vnbufs); |
1726 | LIST_INSERT_HEAD(&vp->v_cleanblkhd, bp, b_vnbufs); |
1727 | |
1728 | /* |
1729 | * some filesystems distinguish meta data blocks with a negative logical block # |
1730 | */ |
1731 | if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META))) { |
1732 | continue; |
1733 | } |
1734 | |
1735 | aflags = BAC_REMOVE; |
1736 | |
1737 | if (!(flags & BUF_INVALIDATE_LOCKED)) { |
1738 | aflags |= BAC_SKIP_LOCKED; |
1739 | } |
1740 | |
1741 | if ((error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo))) { |
1742 | if (error == EDEADLK) { |
1743 | /* |
1744 | * this buffer was marked B_LOCKED... |
1745 | * we didn't drop buf_mtx, so we |
1746 | * don't need to rescan |
1747 | */ |
1748 | continue; |
1749 | } |
1750 | if (error == EAGAIN) { |
1751 | /* |
1752 | * found a busy buffer... we blocked and |
1753 | * dropped buf_mtx, so we're going to |
1754 | * need to rescan after this pass is completed |
1755 | */ |
1756 | must_rescan++; |
1757 | continue; |
1758 | } |
1759 | /* |
1760 | * got some kind of 'real' error out of the msleep |
1761 | * in buf_acquire_locked, terminate the scan and return the error |
1762 | */ |
1763 | buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN); |
1764 | |
1765 | lck_mtx_unlock(&buf_mtx); |
1766 | return error; |
1767 | } |
1768 | lck_mtx_unlock(&buf_mtx); |
1769 | |
1770 | if (bp->b_flags & B_LOCKED) { |
1771 | KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 0, 0); |
1772 | } |
1773 | |
1774 | CLR(bp->b_flags, B_LOCKED); |
1775 | SET(bp->b_flags, B_INVAL); |
1776 | buf_brelse(bp); |
1777 | |
1778 | lck_mtx_lock(&buf_mtx); |
1779 | |
1780 | /* |
1781 | * by dropping buf_mtx, we allow new |
1782 | * buffers to be added to the vnode list(s) |
1783 | * we'll have to rescan at least once more |
1784 | * if the queues aren't empty |
1785 | */ |
1786 | must_rescan++; |
1787 | } |
1788 | buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN); |
1789 | |
1790 | try_dirty_list: |
1791 | /* |
1792 | * Now iterate on dirty blks |
1793 | */ |
1794 | if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY)) { |
1795 | continue; |
1796 | } |
1797 | while (!LIST_EMPTY(&local_iterblkhd)) { |
1798 | bp = LIST_FIRST(&local_iterblkhd); |
1799 | |
1800 | LIST_REMOVE(bp, b_vnbufs); |
1801 | LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs); |
1802 | |
1803 | /* |
1804 | * some filesystems distinguish meta data blocks with a negative logical block # |
1805 | */ |
1806 | if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META))) { |
1807 | continue; |
1808 | } |
1809 | |
1810 | aflags = BAC_REMOVE; |
1811 | |
1812 | if (!(flags & BUF_INVALIDATE_LOCKED)) { |
1813 | aflags |= BAC_SKIP_LOCKED; |
1814 | } |
1815 | |
1816 | if ((error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo))) { |
1817 | if (error == EDEADLK) { |
1818 | /* |
1819 | * this buffer was marked B_LOCKED... |
1820 | * we didn't drop buf_mtx, so we |
1821 | * don't need to rescan |
1822 | */ |
1823 | continue; |
1824 | } |
1825 | if (error == EAGAIN) { |
1826 | /* |
1827 | * found a busy buffer... we blocked and |
1828 | * dropped buf_mtx, so we're going to |
1829 | * need to rescan after this pass is completed |
1830 | */ |
1831 | must_rescan++; |
1832 | continue; |
1833 | } |
1834 | /* |
1835 | * got some kind of 'real' error out of the msleep |
1836 | * in buf_acquire_locked, terminate the scan and return the error |
1837 | */ |
1838 | buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY); |
1839 | |
1840 | lck_mtx_unlock(&buf_mtx); |
1841 | return error; |
1842 | } |
1843 | lck_mtx_unlock(&buf_mtx); |
1844 | |
1845 | if (bp->b_flags & B_LOCKED) { |
1846 | KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 1, 0); |
1847 | } |
1848 | |
1849 | CLR(bp->b_flags, B_LOCKED); |
1850 | SET(bp->b_flags, B_INVAL); |
1851 | |
1852 | if (ISSET(bp->b_flags, B_DELWRI) && (flags & BUF_WRITE_DATA)) { |
1853 | (void) VNOP_BWRITE(bp); |
1854 | } else { |
1855 | buf_brelse(bp); |
1856 | } |
1857 | |
1858 | lck_mtx_lock(&buf_mtx); |
1859 | /* |
1860 | * by dropping buf_mtx, we allow new |
1861 | * buffers to be added to the vnode list(s) |
1862 | * we'll have to rescan at least once more |
1863 | * if the queues aren't empty |
1864 | */ |
1865 | must_rescan++; |
1866 | } |
1867 | buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY); |
1868 | } |
1869 | lck_mtx_unlock(&buf_mtx); |
1870 | |
1871 | return 0; |
1872 | } |
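/*
 * Illustrative sketch (assumption, not part of this file): the vnode reclaim
 * and unmount paths typically push any remaining dirty data and then drop
 * every cached buffer for the vnode with a call of the form:
 *
 *	error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
 *
 * Adding BUF_INVALIDATE_LOCKED would also tear down B_LOCKED buffers, which
 * are otherwise skipped via BAC_SKIP_LOCKED above.
 */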
1873 | |
1874 | void |
1875 | buf_flushdirtyblks(vnode_t vp, int wait, int flags, const char *msg) |
1876 | { |
1877 | (void) buf_flushdirtyblks_skipinfo(vp, wait, flags, msg); |
1878 | return; |
1879 | } |
1880 | |
1881 | int |
1882 | buf_flushdirtyblks_skipinfo(vnode_t vp, int wait, int flags, const char *msg) |
1883 | { |
1884 | buf_t bp; |
1885 | int writes_issued = 0; |
1886 | errno_t error; |
1887 | int busy = 0; |
1888 | struct buflists local_iterblkhd; |
1889 | int lock_flags = BAC_NOWAIT | BAC_REMOVE; |
1890 | int any_locked = 0; |
1891 | |
1892 | if (flags & BUF_SKIP_LOCKED) { |
1893 | lock_flags |= BAC_SKIP_LOCKED; |
1894 | } |
1895 | if (flags & BUF_SKIP_NONLOCKED) { |
1896 | lock_flags |= BAC_SKIP_NONLOCKED; |
1897 | } |
1898 | loop: |
1899 | lck_mtx_lock(&buf_mtx); |
1900 | |
1901 | if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY) == 0) { |
1902 | while (!LIST_EMPTY(&local_iterblkhd)) { |
1903 | bp = LIST_FIRST(&local_iterblkhd); |
1904 | LIST_REMOVE(bp, b_vnbufs); |
1905 | LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs); |
1906 | |
1907 | if ((error = buf_acquire_locked(bp, lock_flags, 0, 0)) == EBUSY) { |
1908 | busy++; |
1909 | } |
1910 | if (error) { |
1911 | /* |
1912 | * If we passed in BUF_SKIP_LOCKED or BUF_SKIP_NONLOCKED, |
1913 | * we may want to do something differently if a locked or unlocked |
1914 | * buffer was encountered (depending on the arg specified). |
1915 | * In this case, we know that one of those two was set, and the |
1916 | * buf acquisition failed above. |
1917 | * |
1918 | * If it failed with EDEADLK, then save state which can be emitted |
1919 | * later on to the caller. Most callers should not care. |
1920 | */ |
1921 | if (error == EDEADLK) { |
1922 | any_locked++; |
1923 | } |
1924 | continue; |
1925 | } |
1926 | lck_mtx_unlock(&buf_mtx); |
1927 | |
1928 | bp->b_flags &= ~B_LOCKED; |
1929 | |
1930 | /* |
1931 | * Wait for I/O associated with indirect blocks to complete, |
1932 | * since there is no way to quickly wait for them below. |
1933 | */ |
1934 | if ((bp->b_vp == vp) || (wait == 0)) { |
1935 | (void) buf_bawrite(bp); |
1936 | } else { |
1937 | (void) VNOP_BWRITE(bp); |
1938 | } |
1939 | writes_issued++; |
1940 | |
1941 | lck_mtx_lock(&buf_mtx); |
1942 | } |
1943 | buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY); |
1944 | } |
1945 | lck_mtx_unlock(&buf_mtx); |
1946 | |
1947 | if (wait) { |
1948 | (void)vnode_waitforwrites(vp, 0, 0, 0, msg); |
1949 | |
1950 | if (vp->v_dirtyblkhd.lh_first && busy) { |
1951 | /* |
1952 | * we had one or more BUSY buffers on |
1953 | * the dirtyblock list... most likely |
1954 | * these are due to delayed writes that |
1955 | * were moved to the bclean queue but |
1956 | * have not yet been 'written'. |
1957 | * if we issued some writes on the |
1958 | * previous pass, we try again immediately |
1959 | * if we didn't, we'll sleep for some time |
1960 | * to allow the state to change... |
1961 | */ |
1962 | if (writes_issued == 0) { |
1963 | (void)tsleep((caddr_t)&vp->v_numoutput, |
1964 | PRIBIO + 1, "vnode_flushdirtyblks", hz / 20); |
1965 | } |
1966 | writes_issued = 0; |
1967 | busy = 0; |
1968 | |
1969 | goto loop; |
1970 | } |
1971 | } |
1972 | |
1973 | return any_locked; |
1974 | } |
1975 | |
1976 | |
1977 | /* |
1978 | * called with buf_mtx held... |
1979 | * this lock protects the queue manipulation |
1980 | */ |
1981 | static int |
1982 | buf_iterprepare(vnode_t vp, struct buflists *iterheadp, int flags) |
1983 | { |
1984 | struct buflists * listheadp; |
1985 | |
1986 | if (flags & VBI_DIRTY) { |
1987 | listheadp = &vp->v_dirtyblkhd; |
1988 | } else { |
1989 | listheadp = &vp->v_cleanblkhd; |
1990 | } |
1991 | |
1992 | while (vp->v_iterblkflags & VBI_ITER) { |
1993 | vp->v_iterblkflags |= VBI_ITERWANT; |
1994 | msleep(chan: &vp->v_iterblkflags, mtx: &buf_mtx, pri: 0, wmesg: "buf_iterprepare" , NULL); |
1995 | } |
1996 | if (LIST_EMPTY(listheadp)) { |
1997 | LIST_INIT(iterheadp); |
1998 | return EINVAL; |
1999 | } |
2000 | vp->v_iterblkflags |= VBI_ITER; |
2001 | |
2002 | iterheadp->lh_first = listheadp->lh_first; |
2003 | listheadp->lh_first->b_vnbufs.le_prev = &iterheadp->lh_first; |
2004 | LIST_INIT(listheadp); |
2005 | |
2006 | return 0; |
2007 | } |
2008 | |
2009 | /* |
2010 | * called with buf_mtx held... |
2011 | * this lock protects the queue manipulation |
2012 | */ |
2013 | static void |
2014 | buf_itercomplete(vnode_t vp, struct buflists *iterheadp, int flags) |
2015 | { |
2016 | struct buflists * listheadp; |
2017 | buf_t bp; |
2018 | |
2019 | if (flags & VBI_DIRTY) { |
2020 | listheadp = &vp->v_dirtyblkhd; |
2021 | } else { |
2022 | listheadp = &vp->v_cleanblkhd; |
2023 | } |
2024 | |
2025 | while (!LIST_EMPTY(iterheadp)) { |
2026 | bp = LIST_FIRST(iterheadp); |
2027 | LIST_REMOVE(bp, b_vnbufs); |
2028 | LIST_INSERT_HEAD(listheadp, bp, b_vnbufs); |
2029 | } |
2030 | vp->v_iterblkflags &= ~VBI_ITER; |
2031 | |
2032 | if (vp->v_iterblkflags & VBI_ITERWANT) { |
2033 | vp->v_iterblkflags &= ~VBI_ITERWANT; |
2034 | wakeup(chan: &vp->v_iterblkflags); |
2035 | } |
2036 | } |
2037 | |
2038 | |
2039 | static void |
2040 | bremfree_locked(buf_t bp) |
2041 | { |
2042 | struct bqueues *dp = NULL; |
2043 | int whichq; |
2044 | |
2045 | whichq = bp->b_whichq; |
2046 | |
2047 | if (whichq == -1) { |
2048 | if (bp->b_shadow_ref == 0) { |
2049 | panic("bremfree_locked: %p not on freelist" , bp); |
2050 | } |
2051 | /* |
2052 | * there are clones pointing to 'bp'... |
2053 | * therefore, it was not put on a freelist |
2054 | * when buf_brelse was last called on 'bp' |
2055 | */ |
2056 | return; |
2057 | } |
2058 | /* |
2059 | * We only calculate the head of the freelist when removing |
2060 | * the last element of the list as that is the only time that |
2061 | * it is needed (e.g. to reset the tail pointer). |
2062 | * |
2063 | * NB: This makes an assumption about how tailq's are implemented. |
2064 | */ |
2065 | if (bp->b_freelist.tqe_next == NULL) { |
2066 | dp = &bufqueues[whichq]; |
2067 | |
2068 | if (dp->tqh_last != &bp->b_freelist.tqe_next) { |
2069 | panic("bremfree: lost tail" ); |
2070 | } |
2071 | } |
2072 | TAILQ_REMOVE(dp, bp, b_freelist); |
2073 | |
2074 | if (whichq == BQ_LAUNDRY) { |
2075 | blaundrycnt--; |
2076 | } |
2077 | |
2078 | bp->b_whichq = -1; |
2079 | bp->b_timestamp = 0; |
2080 | bp->b_shadow = 0; |
2081 | } |
2082 | |
2083 | /* |
2084 | * Associate a buffer with a vnode. |
2085 | * buf_mtx must be locked on entry |
2086 | */ |
2087 | static void |
2088 | bgetvp_locked(vnode_t vp, buf_t bp) |
2089 | { |
2090 | if (bp->b_vp != vp) { |
2091 | panic("bgetvp_locked: not free" ); |
2092 | } |
2093 | |
2094 | if (vp->v_type == VBLK || vp->v_type == VCHR) { |
2095 | bp->b_dev = vp->v_rdev; |
2096 | } else { |
2097 | bp->b_dev = NODEV; |
2098 | } |
2099 | /* |
2100 | * Insert onto list for new vnode. |
2101 | */ |
2102 | bufinsvn(bp, &vp->v_cleanblkhd); |
2103 | } |
2104 | |
2105 | /* |
2106 | * Disassociate a buffer from a vnode. |
2107 | * buf_mtx must be locked on entry |
2108 | */ |
2109 | static void |
2110 | brelvp_locked(buf_t bp) |
2111 | { |
2112 | /* |
2113 | * Delete from old vnode list, if on one. |
2114 | */ |
2115 | if (bp->b_vnbufs.le_next != NOLIST) { |
2116 | bufremvn(bp); |
2117 | } |
2118 | |
2119 | bp->b_vp = (vnode_t)NULL; |
2120 | } |
2121 | |
2122 | /* |
2123 | * Reassign a buffer from one vnode to another. |
2124 | * Used to assign file specific control information |
2125 | * (indirect blocks) to the vnode to which they belong. |
2126 | */ |
2127 | static void |
2128 | buf_reassign(buf_t bp, vnode_t newvp) |
2129 | { |
2130 | struct buflists *listheadp; |
2131 | |
2132 | if (newvp == NULL) { |
2133 | printf("buf_reassign: NULL" ); |
2134 | return; |
2135 | } |
2136 | lck_mtx_lock_spin(lck: &buf_mtx); |
2137 | |
2138 | /* |
2139 | * Delete from old vnode list, if on one. |
2140 | */ |
2141 | if (bp->b_vnbufs.le_next != NOLIST) { |
2142 | bufremvn(bp); |
2143 | } |
2144 | /* |
2145 | * If dirty, put on list of dirty buffers; |
2146 | * otherwise insert onto list of clean buffers. |
2147 | */ |
2148 | if (ISSET(bp->b_flags, B_DELWRI)) { |
2149 | listheadp = &newvp->v_dirtyblkhd; |
2150 | } else { |
2151 | listheadp = &newvp->v_cleanblkhd; |
2152 | } |
2153 | bufinsvn(bp, listheadp); |
2154 | |
2155 | lck_mtx_unlock(lck: &buf_mtx); |
2156 | } |
2157 | |
2158 | static __inline__ void |
2159 | bufhdrinit(buf_t bp) |
2160 | { |
2161 | bzero(s: (char *)bp, n: sizeof *bp); |
2162 | bp->b_dev = NODEV; |
2163 | bp->b_rcred = NOCRED; |
2164 | bp->b_wcred = NOCRED; |
2165 | bp->b_vnbufs.le_next = NOLIST; |
2166 | bp->b_flags = B_INVAL; |
2167 | |
2168 | return; |
2169 | } |
2170 | |
2171 | /* |
2172 | * Initialize buffers and hash links for buffers. |
2173 | */ |
2174 | __private_extern__ void |
2175 | bufinit(void) |
2176 | { |
2177 | buf_t bp; |
2178 | struct bqueues *dp; |
2179 | int i; |
2180 | |
2181 | nbuf_headers = 0; |
2182 | /* Initialize the buffer queues ('freelists') and the hash table */ |
2183 | for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) { |
2184 | TAILQ_INIT(dp); |
2185 | } |
2186 | bufhashtbl = hashinit(count: nbuf_hashelements, M_CACHE, hashmask: &bufhash); |
2187 | |
2188 | buf_busycount = 0; |
2189 | |
2190 | /* Initialize the buffer headers */ |
2191 | for (i = 0; i < max_nbuf_headers; i++) { |
2192 | nbuf_headers++; |
2193 | bp = &buf_headers[i]; |
2194 | bufhdrinit(bp); |
2195 | |
2196 | BLISTNONE(bp); |
2197 | dp = &bufqueues[BQ_EMPTY]; |
2198 | bp->b_whichq = BQ_EMPTY; |
2199 | bp->b_timestamp = buf_timestamp(); |
2200 | binsheadfree(bp, dp, BQ_EMPTY); |
2201 | binshash(bp, dp: &invalhash); |
2202 | } |
2203 | boot_nbuf_headers = nbuf_headers; |
2204 | |
2205 | TAILQ_INIT(&iobufqueue); |
2206 | TAILQ_INIT(&delaybufqueue); |
2207 | |
2208 | for (; i < nbuf_headers + niobuf_headers; i++) { |
2209 | bp = &buf_headers[i]; |
2210 | bufhdrinit(bp); |
2211 | bp->b_whichq = -1; |
2212 | binsheadfree(bp, &iobufqueue, -1); |
2213 | } |
2214 | |
2215 | /* |
2216 | * allocate and initialize cluster specific global locks... |
2217 | */ |
2218 | cluster_init(); |
2219 | |
2220 | printf("using %d buffer headers and %d cluster IO buffer headers\n" , |
2221 | nbuf_headers, niobuf_headers); |
2222 | |
2223 | /* start the bcleanbuf() thread */ |
2224 | bcleanbuf_thread_init(); |
2225 | |
2226 | /* Register a callout for relieving vm pressure */ |
2227 | if (vm_set_buffer_cleanup_callout(func: buffer_cache_gc) != KERN_SUCCESS) { |
2228 | panic("Couldn't register buffer cache callout for vm pressure!" ); |
2229 | } |
2230 | } |
2231 | |
2232 | /* |
2233 | * Zones for the meta data buffers |
2234 | */ |
2235 | |
2236 | #define MINMETA 512 |
2237 | #define MAXMETA 16384 |
2238 | |
2239 | KALLOC_HEAP_DEFINE(KHEAP_VFS_BIO, "vfs_bio" , KHEAP_ID_DATA_BUFFERS); |
2240 | |
2241 | static struct buf * |
2242 | bio_doread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, int async, int queuetype) |
2243 | { |
2244 | buf_t bp; |
2245 | |
2246 | bp = buf_getblk(vp, blkno, size, slpflag: 0, slptimeo: 0, operation: queuetype); |
2247 | |
2248 | /* |
2249 | * If buffer does not have data valid, start a read. |
2250 | * Note that if buffer is B_INVAL, buf_getblk() won't return it. |
 * Therefore, it's valid if its I/O has completed or been delayed.
2252 | */ |
2253 | if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) { |
2254 | struct proc *p; |
2255 | |
2256 | p = current_proc(); |
2257 | |
2258 | /* Start I/O for the buffer (keeping credentials). */ |
2259 | SET(bp->b_flags, B_READ | async); |
2260 | if (IS_VALID_CRED(cred) && !IS_VALID_CRED(bp->b_rcred)) { |
2261 | kauth_cred_ref(cred); |
2262 | bp->b_rcred = cred; |
2263 | } |
2264 | |
2265 | VNOP_STRATEGY(bp); |
2266 | |
2267 | trace(TR_BREADMISS, pack(vp, size), blkno); |
2268 | |
2269 | /* Pay for the read. */ |
2270 | if (p && p->p_stats) { |
2271 | OSIncrementAtomicLong(address: &p->p_stats->p_ru.ru_inblock); /* XXX */ |
2272 | } |
2273 | |
2274 | if (async) { |
2275 | /* |
2276 | * since we asked for an ASYNC I/O |
2277 | * the biodone will do the brelse |
2278 | * we don't want to pass back a bp |
2279 | * that we don't 'own' |
2280 | */ |
2281 | bp = NULL; |
2282 | } |
2283 | } else if (async) { |
2284 | buf_brelse(bp); |
2285 | bp = NULL; |
2286 | } |
2287 | |
2288 | trace(TR_BREADHIT, pack(vp, size), blkno); |
2289 | |
2290 | return bp; |
2291 | } |
2292 | |
2293 | /* |
2294 | * Perform the reads for buf_breadn() and buf_meta_breadn(). |
2295 | * Trivial modification to the breada algorithm presented in Bach (p.55). |
2296 | */ |
2297 | static errno_t |
2298 | do_breadn_for_type(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, |
2299 | int nrablks, kauth_cred_t cred, buf_t *bpp, int queuetype) |
2300 | { |
2301 | buf_t bp; |
2302 | int i; |
2303 | |
2304 | bp = *bpp = bio_doread(vp, blkno, size, cred, async: 0, queuetype); |
2305 | |
2306 | /* |
2307 | * For each of the read-ahead blocks, start a read, if necessary. |
2308 | */ |
2309 | for (i = 0; i < nrablks; i++) { |
2310 | /* If it's in the cache, just go on to next one. */ |
2311 | if (incore(vp, blkno: rablks[i])) { |
2312 | continue; |
2313 | } |
2314 | |
2315 | /* Get a buffer for the read-ahead block */ |
2316 | (void) bio_doread(vp, blkno: rablks[i], size: rasizes[i], cred, B_ASYNC, queuetype); |
2317 | } |
2318 | |
2319 | /* Otherwise, we had to start a read for it; wait until it's valid. */ |
2320 | return buf_biowait(bp); |
2321 | } |
2322 | |
2323 | |
2324 | /* |
2325 | * Read a disk block. |
 * This algorithm is described in Bach (p.54).
2327 | */ |
2328 | errno_t |
2329 | buf_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp) |
2330 | { |
2331 | buf_t bp; |
2332 | |
2333 | /* Get buffer for block. */ |
2334 | bp = *bpp = bio_doread(vp, blkno, size, cred, async: 0, BLK_READ); |
2335 | |
2336 | /* Wait for the read to complete, and return result. */ |
2337 | return buf_biowait(bp); |
2338 | } |
2339 | |
2340 | /* |
2341 | * Read a disk block. [bread() for meta-data] |
 * This algorithm is described in Bach (p.54).
2343 | */ |
2344 | errno_t |
2345 | buf_meta_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp) |
2346 | { |
2347 | buf_t bp; |
2348 | |
2349 | /* Get buffer for block. */ |
2350 | bp = *bpp = bio_doread(vp, blkno, size, cred, async: 0, BLK_META); |
2351 | |
2352 | /* Wait for the read to complete, and return result. */ |
2353 | return buf_biowait(bp); |
2354 | } |
2355 | |
2356 | /* |
2357 | * Read-ahead multiple disk blocks. The first is sync, the rest async. |
2358 | */ |
2359 | errno_t |
2360 | buf_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp) |
2361 | { |
2362 | return do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_READ); |
2363 | } |
2364 | |
2365 | /* |
2366 | * Read-ahead multiple disk blocks. The first is sync, the rest async. |
2367 | * [buf_breadn() for meta-data] |
2368 | */ |
2369 | errno_t |
2370 | buf_meta_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp) |
2371 | { |
2372 | return do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_META); |
2373 | } |
2374 | |
2375 | /* |
2376 | * Block write. Described in Bach (p.56) |
2377 | */ |
2378 | errno_t |
2379 | buf_bwrite(buf_t bp) |
2380 | { |
2381 | int sync, wasdelayed; |
2382 | errno_t rv; |
2383 | proc_t p = current_proc(); |
2384 | vnode_t vp = bp->b_vp; |
2385 | |
2386 | if (bp->b_datap == 0) { |
2387 | if (brecover_data(bp) == 0) { |
2388 | return 0; |
2389 | } |
2390 | } |
2391 | /* Remember buffer type, to switch on it later. */ |
2392 | sync = !ISSET(bp->b_flags, B_ASYNC); |
2393 | wasdelayed = ISSET(bp->b_flags, B_DELWRI); |
2394 | CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI)); |
2395 | |
2396 | if (wasdelayed) { |
2397 | OSAddAtomicLong(-1, &nbdwrite); |
2398 | } |
2399 | |
2400 | if (!sync) { |
2401 | /* |
2402 | * If not synchronous, pay for the I/O operation and make |
2403 | * sure the buf is on the correct vnode queue. We have |
2404 | * to do this now, because if we don't, the vnode may not |
2405 | * be properly notified that its I/O has completed. |
2406 | */ |
2407 | if (wasdelayed) { |
2408 | buf_reassign(bp, newvp: vp); |
2409 | } else if (p && p->p_stats) { |
2410 | OSIncrementAtomicLong(address: &p->p_stats->p_ru.ru_oublock); /* XXX */ |
2411 | } |
2412 | } |
2413 | trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno); |
2414 | |
2415 | /* Initiate disk write. Make sure the appropriate party is charged. */ |
2416 | |
2417 | OSAddAtomic(1, &vp->v_numoutput); |
2418 | |
2419 | VNOP_STRATEGY(bp); |
2420 | |
2421 | if (sync) { |
2422 | /* |
2423 | * If I/O was synchronous, wait for it to complete. |
2424 | */ |
2425 | rv = buf_biowait(bp); |
2426 | |
2427 | /* |
2428 | * Pay for the I/O operation, if it's not been paid for, and |
 * make sure it's on the correct vnode queue. (async operations
 * were paid for above.)
2431 | */ |
2432 | if (wasdelayed) { |
2433 | buf_reassign(bp, newvp: vp); |
2434 | } else if (p && p->p_stats) { |
2435 | OSIncrementAtomicLong(address: &p->p_stats->p_ru.ru_oublock); /* XXX */ |
2436 | } |
2437 | |
2438 | /* Release the buffer. */ |
2439 | buf_brelse(bp); |
2440 | |
2441 | return rv; |
2442 | } else { |
2443 | return 0; |
2444 | } |
2445 | } |
2446 | |
2447 | int |
2448 | vn_bwrite(struct vnop_bwrite_args *ap) |
2449 | { |
2450 | return buf_bwrite(bp: ap->a_bp); |
2451 | } |
2452 | |
2453 | /* |
2454 | * Delayed write. |
2455 | * |
2456 | * The buffer is marked dirty, but is not queued for I/O. |
2457 | * This routine should be used when the buffer is expected |
2458 | * to be modified again soon, typically a small write that |
2459 | * partially fills a buffer. |
2460 | * |
2461 | * NB: magnetic tapes cannot be delayed; they must be |
2462 | * written in the order that the writes are requested. |
2463 | * |
2464 | * Described in Leffler, et al. (pp. 208-213). |
2465 | * |
2466 | * Note: With the ability to allocate additional buffer |
 * headers, we can get into a situation where "too" many
 * buf_bdwrite()s let the kernel create dirty
 * buffers faster than the disks can service them. Doing a buf_bawrite() in
2470 | * cases where we have "too many" outstanding buf_bdwrite()s avoids that. |
2471 | */ |
2472 | int |
2473 | bdwrite_internal(buf_t bp, int return_error) |
2474 | { |
2475 | proc_t p = current_proc(); |
2476 | vnode_t vp = bp->b_vp; |
2477 | |
2478 | /* |
2479 | * If the block hasn't been seen before: |
2480 | * (1) Mark it as having been seen, |
 * (2) Charge for the write, and
 * (3) Make sure it's on its vnode's correct block list.
2483 | */ |
2484 | if (!ISSET(bp->b_flags, B_DELWRI)) { |
2485 | SET(bp->b_flags, B_DELWRI); |
2486 | if (p && p->p_stats) { |
2487 | OSIncrementAtomicLong(address: &p->p_stats->p_ru.ru_oublock); /* XXX */ |
2488 | } |
2489 | OSAddAtomicLong(1, &nbdwrite); |
2490 | buf_reassign(bp, newvp: vp); |
2491 | } |
2492 | |
2493 | /* |
2494 | * if we're not LOCKED, but the total number of delayed writes |
 * has climbed above 75% of the total buffers in the system,
 * return an error if the caller has indicated that it can
 * handle one in this case; otherwise schedule the I/O now.
2498 | * this is done to prevent us from allocating tons of extra |
2499 | * buffers when dealing with virtual disks (i.e. DiskImages), |
2500 | * because additional buffers are dynamically allocated to prevent |
2501 | * deadlocks from occurring |
2502 | * |
2503 | * however, can't do a buf_bawrite() if the LOCKED bit is set because the |
2504 | * buffer is part of a transaction and can't go to disk until |
2505 | * the LOCKED bit is cleared. |
2506 | */ |
2507 | if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf_headers / 4) * 3)) { |
2508 | if (return_error) { |
2509 | return EAGAIN; |
2510 | } |
2511 | /* |
2512 | * If the vnode has "too many" write operations in progress |
2513 | * wait for them to finish the IO |
2514 | */ |
2515 | (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, slpflag: 0, slptimeout: 0, msg: "buf_bdwrite" ); |
2516 | |
2517 | return buf_bawrite(bp); |
2518 | } |
2519 | |
2520 | /* Otherwise, the "write" is done, so mark and release the buffer. */ |
2521 | SET(bp->b_flags, B_DONE); |
2522 | buf_brelse(bp); |
2523 | return 0; |
2524 | } |
2525 | |
2526 | errno_t |
2527 | buf_bdwrite(buf_t bp) |
2528 | { |
2529 | return bdwrite_internal(bp, return_error: 0); |
2530 | } |
2531 | |
2532 | |
2533 | /* |
2534 | * Asynchronous block write; just an asynchronous buf_bwrite(). |
2535 | * |
 * Note: With the ability to allocate additional buffer
 * headers, we can get into a situation where "too" many
 * buf_bawrite()s let the kernel create dirty
 * buffers faster than the disks can service them.
2540 | * We limit the number of "in flight" writes a vnode can have to |
2541 | * avoid this. |
2542 | */ |
2543 | static int |
2544 | bawrite_internal(buf_t bp, int throttle) |
2545 | { |
2546 | vnode_t vp = bp->b_vp; |
2547 | |
2548 | if (vp) { |
2549 | if (throttle) { |
2550 | /* |
2551 | * If the vnode has "too many" write operations in progress |
2552 | * wait for them to finish the IO |
2553 | */ |
2554 | (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, slpflag: 0, slptimeout: 0, msg: (const char *)"buf_bawrite" ); |
2555 | } else if (vp->v_numoutput >= VNODE_ASYNC_THROTTLE) { |
2556 | /* |
2557 | * return to the caller and |
2558 | * let him decide what to do |
2559 | */ |
2560 | return EWOULDBLOCK; |
2561 | } |
2562 | } |
2563 | SET(bp->b_flags, B_ASYNC); |
2564 | |
2565 | return VNOP_BWRITE(bp); |
2566 | } |
2567 | |
2568 | errno_t |
2569 | buf_bawrite(buf_t bp) |
2570 | { |
2571 | return bawrite_internal(bp, throttle: 1); |
2572 | } |
2573 | |
2574 | |
2575 | |
2576 | static void |
2577 | buf_free_meta_store(buf_t bp) |
2578 | { |
2579 | if (bp->b_bufsize) { |
2580 | uintptr_t datap = bp->b_datap; |
2581 | int bufsize = bp->b_bufsize; |
2582 | |
2583 | bp->b_datap = (uintptr_t)NULL; |
2584 | bp->b_bufsize = 0; |
2585 | |
2586 | /* |
2587 | * Ensure the assignment of b_datap has global visibility |
2588 | * before we free the region. |
2589 | */ |
2590 | OSMemoryBarrier(); |
2591 | |
2592 | if (ISSET(bp->b_flags, B_ZALLOC)) { |
2593 | kheap_free(KHEAP_VFS_BIO, datap, bufsize); |
2594 | } else { |
2595 | kmem_free(map: kernel_map, addr: datap, size: bufsize); |
2596 | } |
2597 | } |
2598 | } |
2599 | |
2600 | |
2601 | static buf_t |
2602 | buf_brelse_shadow(buf_t bp) |
2603 | { |
2604 | buf_t bp_head; |
2605 | buf_t bp_temp; |
2606 | buf_t bp_return = NULL; |
2607 | #ifdef BUF_MAKE_PRIVATE |
2608 | buf_t bp_data; |
2609 | int data_ref = 0; |
2610 | #endif |
2611 | int need_wakeup = 0; |
2612 | |
2613 | lck_mtx_lock_spin(lck: &buf_mtx); |
2614 | |
2615 | __IGNORE_WCASTALIGN(bp_head = (buf_t)bp->b_orig); |
2616 | |
2617 | if (bp_head->b_whichq != -1) { |
2618 | panic("buf_brelse_shadow: bp_head on freelist %d" , bp_head->b_whichq); |
2619 | } |
2620 | |
2621 | #ifdef BUF_MAKE_PRIVATE |
2622 | if (bp_data = bp->b_data_store) { |
2623 | bp_data->b_data_ref--; |
2624 | /* |
2625 | * snapshot the ref count so that we can check it |
2626 | * outside of the lock... we only want the guy going |
2627 | * from 1 -> 0 to try and release the storage |
2628 | */ |
2629 | data_ref = bp_data->b_data_ref; |
2630 | } |
2631 | #endif |
2632 | KERNEL_DEBUG(0xbbbbc008 | DBG_FUNC_START, bp, bp_head, bp_head->b_shadow_ref, 0, 0); |
2633 | |
2634 | bp_head->b_shadow_ref--; |
2635 | |
2636 | for (bp_temp = bp_head; bp_temp && bp != bp_temp->b_shadow; bp_temp = bp_temp->b_shadow) { |
2637 | ; |
2638 | } |
2639 | |
2640 | if (bp_temp == NULL) { |
2641 | panic("buf_brelse_shadow: bp not on list %p" , bp_head); |
2642 | } |
2643 | |
2644 | bp_temp->b_shadow = bp_temp->b_shadow->b_shadow; |
2645 | |
2646 | #ifdef BUF_MAKE_PRIVATE |
2647 | /* |
2648 | * we're about to free the current 'owner' of the data buffer and |
2649 | * there is at least one other shadow buf_t still pointing at it |
2650 | * so transfer it to the first shadow buf left in the chain |
2651 | */ |
2652 | if (bp == bp_data && data_ref) { |
2653 | if ((bp_data = bp_head->b_shadow) == NULL) { |
2654 | panic("buf_brelse_shadow: data_ref mismatch bp(%p)" , bp); |
2655 | } |
2656 | |
2657 | for (bp_temp = bp_data; bp_temp; bp_temp = bp_temp->b_shadow) { |
2658 | bp_temp->b_data_store = bp_data; |
2659 | } |
2660 | bp_data->b_data_ref = data_ref; |
2661 | } |
2662 | #endif |
2663 | if (bp_head->b_shadow_ref == 0 && bp_head->b_shadow) { |
2664 | panic("buf_relse_shadow: b_shadow != NULL && b_shadow_ref == 0 bp(%p)" , bp); |
2665 | } |
2666 | if (bp_head->b_shadow_ref && bp_head->b_shadow == 0) { |
2667 | panic("buf_relse_shadow: b_shadow == NULL && b_shadow_ref != 0 bp(%p)" , bp); |
2668 | } |
2669 | |
2670 | if (bp_head->b_shadow_ref == 0) { |
2671 | if (!ISSET(bp_head->b_lflags, BL_BUSY)) { |
2672 | CLR(bp_head->b_flags, B_AGE); |
2673 | bp_head->b_timestamp = buf_timestamp(); |
2674 | |
2675 | if (ISSET(bp_head->b_flags, B_LOCKED)) { |
2676 | bp_head->b_whichq = BQ_LOCKED; |
2677 | binstailfree(bp_head, &bufqueues[BQ_LOCKED], BQ_LOCKED); |
2678 | } else { |
2679 | bp_head->b_whichq = BQ_META; |
2680 | binstailfree(bp_head, &bufqueues[BQ_META], BQ_META); |
2681 | } |
2682 | } else if (ISSET(bp_head->b_lflags, BL_WAITSHADOW)) { |
2683 | CLR(bp_head->b_lflags, BL_WAITSHADOW); |
2684 | |
2685 | bp_return = bp_head; |
2686 | } |
2687 | if (ISSET(bp_head->b_lflags, BL_WANTED_REF)) { |
2688 | CLR(bp_head->b_lflags, BL_WANTED_REF); |
2689 | need_wakeup = 1; |
2690 | } |
2691 | } |
2692 | lck_mtx_unlock(lck: &buf_mtx); |
2693 | |
2694 | if (need_wakeup) { |
2695 | wakeup(chan: bp_head); |
2696 | } |
2697 | |
2698 | #ifdef BUF_MAKE_PRIVATE |
2699 | if (bp == bp_data && data_ref == 0) { |
2700 | buf_free_meta_store(bp); |
2701 | } |
2702 | |
2703 | bp->b_data_store = NULL; |
2704 | #endif |
2705 | KERNEL_DEBUG(0xbbbbc008 | DBG_FUNC_END, bp, 0, 0, 0, 0); |
2706 | |
2707 | return bp_return; |
2708 | } |
2709 | |
2710 | |
2711 | /* |
2712 | * Release a buffer on to the free lists. |
2713 | * Described in Bach (p. 46). |
2714 | */ |
2715 | void |
2716 | buf_brelse(buf_t bp) |
2717 | { |
2718 | struct bqueues *bufq; |
2719 | int whichq; |
2720 | upl_t upl; |
2721 | int need_wakeup = 0; |
2722 | int need_bp_wakeup = 0; |
2723 | |
2724 | |
2725 | if (bp->b_whichq != -1 || !(bp->b_lflags & BL_BUSY)) { |
2726 | panic("buf_brelse: bad buffer = %p" , bp); |
2727 | } |
2728 | |
2729 | #ifdef JOE_DEBUG |
2730 | (void) OSBacktrace(&bp->b_stackbrelse[0], 6); |
2731 | |
2732 | bp->b_lastbrelse = current_thread(); |
2733 | bp->b_tag = 0; |
2734 | #endif |
2735 | if (bp->b_lflags & BL_IOBUF) { |
2736 | buf_t shadow_master_bp = NULL; |
2737 | |
2738 | if (ISSET(bp->b_lflags, BL_SHADOW)) { |
2739 | shadow_master_bp = buf_brelse_shadow(bp); |
2740 | } else if (ISSET(bp->b_lflags, BL_IOBUF_ALLOC)) { |
2741 | buf_free_meta_store(bp); |
2742 | } |
2743 | free_io_buf(bp); |
2744 | |
2745 | if (shadow_master_bp) { |
2746 | bp = shadow_master_bp; |
2747 | goto finish_shadow_master; |
2748 | } |
2749 | return; |
2750 | } |
2751 | |
2752 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START, |
2753 | bp->b_lblkno * PAGE_SIZE, bp, bp->b_datap, |
2754 | bp->b_flags, 0); |
2755 | |
2756 | trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno); |
2757 | |
2758 | /* |
2759 | * if we're invalidating a buffer that has the B_FILTER bit |
2760 | * set then call the b_iodone function so it gets cleaned |
2761 | * up properly. |
2762 | * |
2763 | * the HFS journal code depends on this |
2764 | */ |
2765 | if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) { |
2766 | if (ISSET(bp->b_flags, B_FILTER)) { /* if necessary, call out */ |
2767 | void (*iodone_func)(struct buf *, void *) = bp->b_iodone; |
2768 | void *arg = bp->b_transaction; |
2769 | |
2770 | CLR(bp->b_flags, B_FILTER); /* but note callout done */ |
2771 | bp->b_iodone = NULL; |
2772 | bp->b_transaction = NULL; |
2773 | |
2774 | if (iodone_func == NULL) { |
2775 | panic("brelse: bp @ %p has NULL b_iodone!" , bp); |
2776 | } |
2777 | (*iodone_func)(bp, arg); |
2778 | } |
2779 | } |
2780 | /* |
2781 | * I/O is done. Cleanup the UPL state |
2782 | */ |
2783 | upl = bp->b_upl; |
2784 | |
2785 | if (!ISSET(bp->b_flags, B_META) && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) { |
2786 | kern_return_t kret; |
2787 | int upl_flags; |
2788 | |
2789 | if (upl == NULL) { |
2790 | if (!ISSET(bp->b_flags, B_INVAL)) { |
2791 | kret = ubc_create_upl_kernel(bp->b_vp, |
2792 | ubc_blktooff(bp->b_vp, bp->b_lblkno), |
2793 | bp->b_bufsize, |
2794 | &upl, |
2795 | NULL, |
2796 | UPL_PRECIOUS, |
2797 | VM_KERN_MEMORY_FILE); |
2798 | |
2799 | if (kret != KERN_SUCCESS) { |
2800 | panic("brelse: Failed to create UPL" ); |
2801 | } |
2802 | #if UPL_DEBUG |
2803 | upl_ubc_alias_set(upl, (uintptr_t) bp, (uintptr_t) 5); |
2804 | #endif /* UPL_DEBUG */ |
2805 | } |
2806 | } else { |
2807 | if (bp->b_datap) { |
2808 | kret = ubc_upl_unmap(upl); |
2809 | |
2810 | if (kret != KERN_SUCCESS) { |
2811 | panic("ubc_upl_unmap failed" ); |
2812 | } |
2813 | bp->b_datap = (uintptr_t)NULL; |
2814 | } |
2815 | } |
2816 | if (upl) { |
2817 | if (bp->b_flags & (B_ERROR | B_INVAL)) { |
2818 | if (bp->b_flags & (B_READ | B_INVAL)) { |
2819 | upl_flags = UPL_ABORT_DUMP_PAGES; |
2820 | } else { |
2821 | upl_flags = 0; |
2822 | } |
2823 | |
2824 | ubc_upl_abort(upl, upl_flags); |
2825 | } else { |
2826 | if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY)) { |
2827 | upl_flags = UPL_COMMIT_SET_DIRTY; |
2828 | } else { |
2829 | upl_flags = UPL_COMMIT_CLEAR_DIRTY; |
2830 | } |
2831 | |
2832 | ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags | |
2833 | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY); |
2834 | } |
2835 | bp->b_upl = NULL; |
2836 | } |
2837 | } else { |
2838 | if ((upl)) { |
2839 | panic("brelse: UPL set for non VREG; vp=%p" , bp->b_vp); |
2840 | } |
2841 | } |
2842 | |
2843 | /* |
2844 | * If it's locked, don't report an error; try again later. |
2845 | */ |
2846 | if (ISSET(bp->b_flags, (B_LOCKED | B_ERROR)) == (B_LOCKED | B_ERROR)) { |
2847 | CLR(bp->b_flags, B_ERROR); |
2848 | } |
2849 | /* |
2850 | * If it's not cacheable, or an error, mark it invalid. |
2851 | */ |
2852 | if (ISSET(bp->b_flags, (B_NOCACHE | B_ERROR))) { |
2853 | SET(bp->b_flags, B_INVAL); |
2854 | } |
2855 | |
2856 | if ((bp->b_bufsize <= 0) || |
2857 | ISSET(bp->b_flags, B_INVAL) || |
2858 | (ISSET(bp->b_lflags, BL_WANTDEALLOC) && !ISSET(bp->b_flags, B_DELWRI))) { |
2859 | boolean_t delayed_buf_free_meta_store = FALSE; |
2860 | |
2861 | /* |
2862 | * If it's invalid or empty, dissociate it from its vnode, |
2863 | * release its storage if B_META, and |
2864 | * clean it up a bit and put it on the EMPTY queue |
2865 | */ |
2866 | if (ISSET(bp->b_flags, B_DELWRI)) { |
2867 | OSAddAtomicLong(-1, &nbdwrite); |
2868 | } |
2869 | |
2870 | if (ISSET(bp->b_flags, B_META)) { |
2871 | if (bp->b_shadow_ref) { |
2872 | delayed_buf_free_meta_store = TRUE; |
2873 | } else { |
2874 | buf_free_meta_store(bp); |
2875 | } |
2876 | } |
2877 | /* |
2878 | * nuke any credentials we were holding |
2879 | */ |
2880 | buf_release_credentials(bp); |
2881 | |
2882 | lck_mtx_lock_spin(lck: &buf_mtx); |
2883 | |
2884 | if (bp->b_shadow_ref) { |
2885 | SET(bp->b_lflags, BL_WAITSHADOW); |
2886 | |
2887 | lck_mtx_unlock(lck: &buf_mtx); |
2888 | |
2889 | return; |
2890 | } |
2891 | if (delayed_buf_free_meta_store == TRUE) { |
2892 | lck_mtx_unlock(lck: &buf_mtx); |
2893 | finish_shadow_master: |
2894 | buf_free_meta_store(bp); |
2895 | |
2896 | lck_mtx_lock_spin(lck: &buf_mtx); |
2897 | } |
2898 | CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA)); |
2899 | |
2900 | if (bp->b_vp) { |
2901 | brelvp_locked(bp); |
2902 | } |
2903 | |
2904 | bremhash(bp); |
2905 | BLISTNONE(bp); |
2906 | binshash(bp, dp: &invalhash); |
2907 | |
2908 | bp->b_whichq = BQ_EMPTY; |
2909 | binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY); |
2910 | } else { |
2911 | /* |
2912 | * It has valid data. Put it on the end of the appropriate |
2913 | * queue, so that it'll stick around for as long as possible. |
2914 | */ |
2915 | if (ISSET(bp->b_flags, B_LOCKED)) { |
2916 | whichq = BQ_LOCKED; /* locked in core */ |
2917 | } else if (ISSET(bp->b_flags, B_META)) { |
2918 | whichq = BQ_META; /* meta-data */ |
2919 | } else if (ISSET(bp->b_flags, B_AGE)) { |
2920 | whichq = BQ_AGE; /* stale but valid data */ |
2921 | } else { |
2922 | whichq = BQ_LRU; /* valid data */ |
2923 | } |
2924 | bufq = &bufqueues[whichq]; |
2925 | |
2926 | bp->b_timestamp = buf_timestamp(); |
2927 | |
2928 | lck_mtx_lock_spin(lck: &buf_mtx); |
2929 | |
2930 | /* |
2931 | * the buf_brelse_shadow routine doesn't take 'ownership' |
2932 | * of the parent buf_t... it updates state that is protected by |
2933 | * the buf_mtx, and checks for BL_BUSY to determine whether to |
2934 | * put the buf_t back on a free list. b_shadow_ref is protected |
2935 | * by the lock, and since we have not yet cleared B_BUSY, we need |
 * to check it while holding the lock to ensure that one of us
2937 | * puts this buf_t back on a free list when it is safe to do so |
2938 | */ |
2939 | if (bp->b_shadow_ref == 0) { |
2940 | CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE)); |
2941 | bp->b_whichq = whichq; |
2942 | binstailfree(bp, bufq, whichq); |
2943 | } else { |
2944 | /* |
2945 | * there are still cloned buf_t's pointing |
2946 | * at this guy... need to keep it off the |
2947 | * freelists until a buf_brelse is done on |
2948 | * the last clone |
2949 | */ |
2950 | CLR(bp->b_flags, (B_ASYNC | B_NOCACHE)); |
2951 | } |
2952 | } |
2953 | if (needbuffer) { |
2954 | /* |
2955 | * needbuffer is a global |
2956 | * we're currently using buf_mtx to protect it |
2957 | * delay doing the actual wakeup until after |
2958 | * we drop buf_mtx |
2959 | */ |
2960 | needbuffer = 0; |
2961 | need_wakeup = 1; |
2962 | } |
2963 | if (ISSET(bp->b_lflags, BL_WANTED)) { |
2964 | /* |
2965 | * delay the actual wakeup until after we |
2966 | * clear BL_BUSY and we've dropped buf_mtx |
2967 | */ |
2968 | need_bp_wakeup = 1; |
2969 | } |
2970 | /* |
2971 | * Unlock the buffer. |
2972 | */ |
2973 | CLR(bp->b_lflags, (BL_BUSY | BL_WANTED)); |
2974 | buf_busycount--; |
2975 | |
2976 | lck_mtx_unlock(lck: &buf_mtx); |
2977 | |
2978 | if (need_wakeup) { |
2979 | /* |
2980 | * Wake up any processes waiting for any buffer to become free. |
2981 | */ |
2982 | wakeup(chan: &needbuffer); |
2983 | } |
2984 | if (need_bp_wakeup) { |
2985 | /* |
 * Wake up any processes waiting for _this_ buffer to become free.
2987 | */ |
2988 | wakeup(chan: bp); |
2989 | } |
2990 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END, |
2991 | bp, bp->b_datap, bp->b_flags, 0, 0); |
2992 | } |
2993 | |
2994 | /* |
2995 | * Determine if a block is in the cache. |
2996 | * Just look on what would be its hash chain. If it's there, return |
2997 | * a pointer to it, unless it's marked invalid. If it's marked invalid, |
2998 | * we normally don't return the buffer, unless the caller explicitly |
2999 | * wants us to. |
3000 | */ |
3001 | static boolean_t |
3002 | incore(vnode_t vp, daddr64_t blkno) |
3003 | { |
3004 | boolean_t retval; |
3005 | struct bufhashhdr *dp; |
3006 | |
3007 | dp = BUFHASH(vp, blkno); |
3008 | |
3009 | lck_mtx_lock_spin(lck: &buf_mtx); |
3010 | |
3011 | if (incore_locked(vp, blkno, dp)) { |
3012 | retval = TRUE; |
3013 | } else { |
3014 | retval = FALSE; |
3015 | } |
3016 | lck_mtx_unlock(lck: &buf_mtx); |
3017 | |
3018 | return retval; |
3019 | } |
3020 | |
3021 | |
3022 | static buf_t |
3023 | incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp) |
3024 | { |
3025 | struct buf *bp; |
3026 | |
3027 | /* Search hash chain */ |
3028 | for (bp = dp->lh_first; bp != NULL; bp = bp->b_hash.le_next) { |
3029 | if (bp->b_lblkno == blkno && bp->b_vp == vp && |
3030 | !ISSET(bp->b_flags, B_INVAL)) { |
3031 | return bp; |
3032 | } |
3033 | } |
3034 | return NULL; |
3035 | } |
3036 | |
3037 | |
3038 | void |
3039 | buf_wait_for_shadow_io(vnode_t vp, daddr64_t blkno) |
3040 | { |
3041 | buf_t bp; |
3042 | struct bufhashhdr *dp; |
3043 | |
3044 | dp = BUFHASH(vp, blkno); |
3045 | |
3046 | lck_mtx_lock_spin(lck: &buf_mtx); |
3047 | |
3048 | for (;;) { |
3049 | if ((bp = incore_locked(vp, blkno, dp)) == NULL) { |
3050 | break; |
3051 | } |
3052 | |
3053 | if (bp->b_shadow_ref == 0) { |
3054 | break; |
3055 | } |
3056 | |
3057 | SET(bp->b_lflags, BL_WANTED_REF); |
3058 | |
3059 | (void) msleep(chan: bp, mtx: &buf_mtx, PSPIN | (PRIBIO + 1), wmesg: "buf_wait_for_shadow" , NULL); |
3060 | } |
3061 | lck_mtx_unlock(lck: &buf_mtx); |
3062 | } |
3063 | |
3064 | /* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */ |
3065 | /* |
3066 | * Get a block of requested size that is associated with |
3067 | * a given vnode and block offset. If it is found in the |
3068 | * block cache, mark it as having been found, make it busy |
3069 | * and return it. Otherwise, return an empty block of the |
 * correct size. It is up to the caller to ensure that the
 * cached blocks are of the correct size.
3072 | */ |
3073 | buf_t |
3074 | buf_getblk(vnode_t vp, daddr64_t blkno, int size, int slpflag, int slptimeo, int operation) |
3075 | { |
3076 | buf_t bp; |
3077 | int err; |
3078 | upl_t upl; |
3079 | upl_page_info_t *pl; |
3080 | kern_return_t kret; |
3081 | int ret_only_valid; |
3082 | struct timespec ts; |
3083 | int upl_flags; |
3084 | struct bufhashhdr *dp; |
3085 | |
3086 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START, |
3087 | (uintptr_t)(blkno * PAGE_SIZE), size, operation, 0, 0); |
3088 | |
3089 | ret_only_valid = operation & BLK_ONLYVALID; |
3090 | operation &= ~BLK_ONLYVALID; |
3091 | dp = BUFHASH(vp, blkno); |
3092 | start: |
3093 | lck_mtx_lock_spin(lck: &buf_mtx); |
3094 | |
3095 | if ((bp = incore_locked(vp, blkno, dp))) { |
3096 | /* |
3097 | * Found in the Buffer Cache |
3098 | */ |
3099 | if (ISSET(bp->b_lflags, BL_BUSY)) { |
3100 | /* |
3101 | * but is busy |
3102 | */ |
3103 | switch (operation) { |
3104 | case BLK_READ: |
3105 | case BLK_WRITE: |
3106 | case BLK_META: |
3107 | SET(bp->b_lflags, BL_WANTED); |
3108 | bufstats.bufs_busyincore++; |
3109 | |
3110 | /* |
3111 | * don't retake the mutex after being awakened... |
3112 | * the time out is in msecs |
3113 | */ |
3114 | ts.tv_sec = (slptimeo / 1000); |
3115 | ts.tv_nsec = (slptimeo % 1000) * 10 * NSEC_PER_USEC * 1000; |
3116 | |
3117 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 396)) | DBG_FUNC_NONE, |
3118 | (uintptr_t)blkno, size, operation, 0, 0); |
3119 | |
3120 | err = msleep(chan: bp, mtx: &buf_mtx, pri: slpflag | PDROP | (PRIBIO + 1), wmesg: "buf_getblk" , ts: &ts); |
3121 | |
3122 | /* |
3123 | * Callers who call with PCATCH or timeout are |
3124 | * willing to deal with the NULL pointer |
3125 | */ |
3126 | if (err && ((slpflag & PCATCH) || ((err == EWOULDBLOCK) && slptimeo))) { |
3127 | return NULL; |
3128 | } |
3129 | goto start; |
3130 | /*NOTREACHED*/ |
3131 | |
3132 | default: |
3133 | /* |
3134 | * unknown operation requested |
3135 | */ |
3136 | panic("getblk: paging or unknown operation for incore busy buffer - %x" , operation); |
3137 | /*NOTREACHED*/ |
3138 | break; |
3139 | } |
3140 | } else { |
3141 | int clear_bdone; |
3142 | |
3143 | /* |
3144 | * buffer in core and not busy |
3145 | */ |
3146 | SET(bp->b_lflags, BL_BUSY); |
3147 | SET(bp->b_flags, B_CACHE); |
3148 | buf_busycount++; |
3149 | |
3150 | bremfree_locked(bp); |
3151 | bufstats.bufs_incore++; |
3152 | |
3153 | lck_mtx_unlock(lck: &buf_mtx); |
3154 | #ifdef JOE_DEBUG |
3155 | bp->b_owner = current_thread(); |
3156 | bp->b_tag = 1; |
3157 | #endif |
3158 | if ((bp->b_upl)) { |
3159 | panic("buffer has UPL, but not marked BUSY: %p" , bp); |
3160 | } |
3161 | |
3162 | clear_bdone = FALSE; |
3163 | if (!ret_only_valid) { |
3164 | /* |
3165 | * If the number bytes that are valid is going |
3166 | * to increase (even if we end up not doing a |
3167 | * reallocation through allocbuf) we have to read |
3168 | * the new size first. |
3169 | * |
3170 | * This is required in cases where we doing a read |
3171 | * modify write of a already valid data on disk but |
3172 | * in cases where the data on disk beyond (blkno + b_bcount) |
3173 | * is invalid, we may end up doing extra I/O. |
3174 | */ |
3175 | if (operation == BLK_META && bp->b_bcount < (uint32_t)size) { |
3176 | /* |
 * Since we are going to read in the whole size,
 * we first have to ensure that any pending delayed write
 * is flushed to disk.
3180 | */ |
3181 | if (ISSET(bp->b_flags, B_DELWRI)) { |
3182 | CLR(bp->b_flags, B_CACHE); |
3183 | buf_bwrite(bp); |
3184 | goto start; |
3185 | } |
3186 | /* |
3187 | * clear B_DONE before returning from |
3188 | * this function so that the caller can |
3189 | * can issue a read for the new size. |
3190 | */ |
3191 | clear_bdone = TRUE; |
3192 | } |
3193 | |
3194 | if (bp->b_bufsize != (uint32_t)size) { |
3195 | allocbuf(bp, size); |
3196 | } |
3197 | } |
3198 | |
3199 | upl_flags = 0; |
3200 | switch (operation) { |
3201 | case BLK_WRITE: |
3202 | /* |
3203 | * "write" operation: let the UPL subsystem |
3204 | * know that we intend to modify the buffer |
3205 | * cache pages we're gathering. |
3206 | */ |
3207 | upl_flags |= UPL_WILL_MODIFY; |
3208 | OS_FALLTHROUGH; |
3209 | case BLK_READ: |
3210 | upl_flags |= UPL_PRECIOUS; |
3211 | if (UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) { |
3212 | kret = ubc_create_upl_kernel(vp, |
3213 | ubc_blktooff(vp, bp->b_lblkno), |
3214 | bp->b_bufsize, |
3215 | &upl, |
3216 | &pl, |
3217 | upl_flags, |
3218 | VM_KERN_MEMORY_FILE); |
3219 | if (kret != KERN_SUCCESS) { |
3220 | panic("Failed to create UPL" ); |
3221 | } |
3222 | |
3223 | bp->b_upl = upl; |
3224 | |
3225 | if (upl_valid_page(upl: pl, index: 0)) { |
3226 | if (upl_dirty_page(upl: pl, index: 0)) { |
3227 | SET(bp->b_flags, B_WASDIRTY); |
3228 | } else { |
3229 | CLR(bp->b_flags, B_WASDIRTY); |
3230 | } |
3231 | } else { |
3232 | CLR(bp->b_flags, (B_DONE | B_CACHE | B_WASDIRTY | B_DELWRI)); |
3233 | } |
3234 | |
3235 | kret = ubc_upl_map(upl, (vm_offset_t*)&(bp->b_datap)); |
3236 | |
3237 | if (kret != KERN_SUCCESS) { |
3238 | panic("getblk: ubc_upl_map() failed with (%d)" , kret); |
3239 | } |
3240 | } |
3241 | break; |
3242 | |
3243 | case BLK_META: |
3244 | /* |
3245 | * VM is not involved in IO for the meta data |
3246 | * buffer already has valid data |
3247 | */ |
3248 | break; |
3249 | |
3250 | default: |
3251 | panic("getblk: paging or unknown operation for incore buffer- %d" , operation); |
3252 | /*NOTREACHED*/ |
3253 | break; |
3254 | } |
3255 | |
3256 | if (clear_bdone) { |
3257 | CLR(bp->b_flags, B_DONE); |
3258 | } |
3259 | } |
3260 | } else { /* not incore() */ |
3261 | int queue = BQ_EMPTY; /* Start with no preference */ |
3262 | |
3263 | if (ret_only_valid) { |
3264 | lck_mtx_unlock(lck: &buf_mtx); |
3265 | return NULL; |
3266 | } |
3267 | if ((vnode_isreg(vp) == 0) || (UBCINFOEXISTS(vp) == 0) /*|| (vnode_issystem(vp) == 1)*/) { |
3268 | operation = BLK_META; |
3269 | } |
3270 | |
3271 | if ((bp = getnewbuf(slpflag, slptimeo, queue: &queue)) == NULL) { |
3272 | goto start; |
3273 | } |
3274 | |
3275 | /* |
3276 | * getnewbuf may block for a number of different reasons... |
3277 | * if it does, it's then possible for someone else to |
3278 | * create a buffer for the same block and insert it into |
3279 | * the hash... if we see it incore at this point we dump |
3280 | * the buffer we were working on and start over |
3281 | */ |
3282 | if (incore_locked(vp, blkno, dp)) { |
3283 | SET(bp->b_flags, B_INVAL); |
3284 | binshash(bp, dp: &invalhash); |
3285 | |
3286 | lck_mtx_unlock(lck: &buf_mtx); |
3287 | |
3288 | buf_brelse(bp); |
3289 | goto start; |
3290 | } |
3291 | /* |
3292 | * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN |
3293 | * CALLED! BE CAREFUL. |
3294 | */ |
3295 | |
3296 | /* |
3297 | * mark the buffer as B_META if indicated |
 * so that when the buffer is released it will go to the META queue
3299 | */ |
3300 | if (operation == BLK_META) { |
3301 | SET(bp->b_flags, B_META); |
3302 | } |
3303 | |
3304 | bp->b_blkno = bp->b_lblkno = blkno; |
3305 | bp->b_lblksize = 0; /* Should be set by caller */ |
3306 | bp->b_vp = vp; |
3307 | |
3308 | /* |
3309 | * Insert in the hash so that incore() can find it |
3310 | */ |
3311 | binshash(bp, BUFHASH(vp, blkno)); |
3312 | |
3313 | bgetvp_locked(vp, bp); |
3314 | |
3315 | lck_mtx_unlock(lck: &buf_mtx); |
3316 | |
3317 | allocbuf(bp, size); |
3318 | |
3319 | upl_flags = 0; |
3320 | switch (operation) { |
3321 | case BLK_META: |
3322 | /* |
3323 | * buffer data is invalid... |
3324 | * |
3325 | * I don't want to have to retake buf_mtx, |
3326 | * so the miss and vmhits counters are done |
3327 | * with Atomic updates... all other counters |
3328 | * in bufstats are protected with either |
3329 | * buf_mtx or iobuffer_mtxp |
3330 | */ |
3331 | OSAddAtomicLong(1, &bufstats.bufs_miss); |
3332 | break; |
3333 | |
3334 | case BLK_WRITE: |
3335 | /* |
3336 | * "write" operation: let the UPL subsystem know |
3337 | * that we intend to modify the buffer cache pages |
3338 | * we're gathering. |
3339 | */ |
3340 | upl_flags |= UPL_WILL_MODIFY; |
3341 | OS_FALLTHROUGH; |
3342 | case BLK_READ: |
3343 | { off_t f_offset; |
3344 | size_t contig_bytes; |
3345 | int bmap_flags; |
3346 | |
3347 | #if DEVELOPMENT || DEBUG |
3348 | /* |
 * Apple-implemented file systems use UBC exclusively; they should
 * not call in here.
3351 | */ |
3352 | const char* excldfs[] = {"hfs" , "afpfs" , "smbfs" , "acfs" , |
3353 | "exfat" , "msdos" , "webdav" , NULL}; |
3354 | |
3355 | for (int i = 0; excldfs[i] != NULL; i++) { |
3356 | if (vp->v_mount && |
3357 | !strcmp(vp->v_mount->mnt_vfsstat.f_fstypename, |
3358 | excldfs[i])) { |
3359 | panic("%s %s calls buf_getblk" , |
3360 | excldfs[i], |
3361 | operation == BLK_READ ? "BLK_READ" : "BLK_WRITE" ); |
3362 | } |
3363 | } |
3364 | #endif |
3365 | |
3366 | if ((bp->b_upl)) { |
3367 | panic("bp already has UPL: %p" , bp); |
3368 | } |
3369 | |
3370 | f_offset = ubc_blktooff(vp, blkno); |
3371 | |
3372 | upl_flags |= UPL_PRECIOUS; |
3373 | kret = ubc_create_upl_kernel(vp, |
3374 | f_offset, |
3375 | bp->b_bufsize, |
3376 | &upl, |
3377 | &pl, |
3378 | upl_flags, |
3379 | VM_KERN_MEMORY_FILE); |
3380 | |
3381 | if (kret != KERN_SUCCESS) { |
3382 | panic("Failed to create UPL" ); |
3383 | } |
3384 | #if UPL_DEBUG |
3385 | upl_ubc_alias_set(upl, (uintptr_t) bp, (uintptr_t) 4); |
3386 | #endif /* UPL_DEBUG */ |
3387 | bp->b_upl = upl; |
3388 | |
3389 | if (upl_valid_page(upl: pl, index: 0)) { |
3390 | if (operation == BLK_READ) { |
3391 | bmap_flags = VNODE_READ; |
3392 | } else { |
3393 | bmap_flags = VNODE_WRITE; |
3394 | } |
3395 | |
3396 | SET(bp->b_flags, B_CACHE | B_DONE); |
3397 | |
3398 | OSAddAtomicLong(1, &bufstats.bufs_vmhits); |
3399 | |
3400 | bp->b_validoff = 0; |
3401 | bp->b_dirtyoff = 0; |
3402 | |
3403 | if (upl_dirty_page(upl: pl, index: 0)) { |
3404 | /* page is dirty */ |
3405 | SET(bp->b_flags, B_WASDIRTY); |
3406 | |
3407 | bp->b_validend = bp->b_bcount; |
3408 | bp->b_dirtyend = bp->b_bcount; |
3409 | } else { |
3410 | /* page is clean */ |
3411 | bp->b_validend = bp->b_bcount; |
3412 | bp->b_dirtyend = 0; |
3413 | } |
3414 | /* |
3415 | * try to recreate the physical block number associated with |
3416 | * this buffer... |
3417 | */ |
3418 | if (VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL)) { |
3419 | panic("getblk: VNOP_BLOCKMAP failed" ); |
3420 | } |
3421 | /* |
3422 | * if the extent represented by this buffer |
3423 | * is not completely physically contiguous on |
 * disk, then we can't cache the physical mapping
3425 | * in the buffer header |
3426 | */ |
3427 | if ((uint32_t)contig_bytes < bp->b_bcount) { |
3428 | bp->b_blkno = bp->b_lblkno; |
3429 | } |
3430 | } else { |
3431 | OSAddAtomicLong(1, &bufstats.bufs_miss); |
3432 | } |
3433 | kret = ubc_upl_map(upl, (vm_offset_t *)&(bp->b_datap)); |
3434 | |
3435 | if (kret != KERN_SUCCESS) { |
3436 | panic("getblk: ubc_upl_map() failed with (%d)" , kret); |
3437 | } |
3438 | break;} // end BLK_READ |
3439 | default: |
3440 | panic("getblk: paging or unknown operation - %x" , operation); |
3441 | /*NOTREACHED*/ |
3442 | break; |
3443 | } // end switch |
3444 | } //end buf_t !incore |
3445 | |
3446 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END, |
3447 | bp, bp->b_datap, bp->b_flags, 3, 0); |
3448 | |
3449 | #ifdef JOE_DEBUG |
3450 | (void) OSBacktrace(&bp->b_stackgetblk[0], 6); |
3451 | #endif |
3452 | return bp; |
3453 | } |
3454 | |
3455 | /* |
3456 | * Get an empty, disassociated buffer of given size. |
3457 | */ |
3458 | buf_t |
3459 | buf_geteblk(int size) |
3460 | { |
3461 | buf_t bp = NULL; |
3462 | int queue = BQ_EMPTY; |
3463 | |
3464 | do { |
3465 | lck_mtx_lock_spin(lck: &buf_mtx); |
3466 | |
3467 | bp = getnewbuf(slpflag: 0, slptimeo: 0, queue: &queue); |
3468 | } while (bp == NULL); |
3469 | |
3470 | SET(bp->b_flags, (B_META | B_INVAL)); |
3471 | |
3472 | #if DIAGNOSTIC |
3473 | assert(queue == BQ_EMPTY); |
3474 | #endif /* DIAGNOSTIC */ |
3475 | /* XXX need to implement logic to deal with other queues */ |
3476 | |
3477 | binshash(bp, dp: &invalhash); |
3478 | bufstats.bufs_eblk++; |
3479 | |
3480 | lck_mtx_unlock(lck: &buf_mtx); |
3481 | |
3482 | allocbuf(bp, size); |
3483 | |
3484 | return bp; |
3485 | } |
3486 | |
3487 | uint32_t |
3488 | buf_redundancy_flags(buf_t bp) |
3489 | { |
3490 | return bp->b_redundancy_flags; |
3491 | } |
3492 | |
3493 | void |
3494 | buf_set_redundancy_flags(buf_t bp, uint32_t flags) |
3495 | { |
3496 | SET(bp->b_redundancy_flags, flags); |
3497 | } |
3498 | |
3499 | void |
3500 | buf_clear_redundancy_flags(buf_t bp, uint32_t flags) |
3501 | { |
3502 | CLR(bp->b_redundancy_flags, flags); |
3503 | } |
3504 | |
3505 | |
3506 | |
3507 | static void * |
3508 | recycle_buf_from_pool(int nsize) |
3509 | { |
3510 | buf_t bp; |
3511 | void *ptr = NULL; |
3512 | |
3513 | lck_mtx_lock_spin(lck: &buf_mtx); |
3514 | |
3515 | TAILQ_FOREACH(bp, &bufqueues[BQ_META], b_freelist) { |
3516 | if (ISSET(bp->b_flags, B_DELWRI) || bp->b_bufsize != (uint32_t)nsize) { |
3517 | continue; |
3518 | } |
3519 | ptr = (void *)bp->b_datap; |
3520 | bp->b_bufsize = 0; |
3521 | |
3522 | bcleanbuf(bp, TRUE); |
3523 | break; |
3524 | } |
3525 | lck_mtx_unlock(lck: &buf_mtx); |
3526 | |
3527 | return ptr; |
3528 | } |
3529 | |
3530 | |
3531 | |
3532 | int zalloc_nopagewait_failed = 0; |
3533 | int recycle_buf_failed = 0; |
3534 | |
3535 | static void * |
3536 | grab_memory_for_meta_buf(int nsize) |
3537 | { |
3538 | void *ptr; |
3539 | boolean_t was_vmpriv; |
3540 | |
3541 | |
3542 | /* |
 * make sure we're NOT privileged so that
3544 | * if a vm_page_grab is needed, it won't |
3545 | * block if we're out of free pages... if |
3546 | * it blocks, then we can't honor the |
3547 | * nopagewait request |
3548 | */ |
3549 | was_vmpriv = set_vm_privilege(FALSE); |
3550 | |
3551 | ptr = kheap_alloc(KHEAP_VFS_BIO, nsize, Z_NOPAGEWAIT); |
3552 | |
3553 | if (was_vmpriv == TRUE) { |
3554 | set_vm_privilege(TRUE); |
3555 | } |
3556 | |
3557 | if (ptr == NULL) { |
3558 | zalloc_nopagewait_failed++; |
3559 | |
3560 | ptr = recycle_buf_from_pool(nsize); |
3561 | |
3562 | if (ptr == NULL) { |
3563 | recycle_buf_failed++; |
3564 | |
3565 | if (was_vmpriv == FALSE) { |
3566 | set_vm_privilege(TRUE); |
3567 | } |
3568 | |
3569 | ptr = kheap_alloc(KHEAP_VFS_BIO, nsize, Z_WAITOK); |
3570 | |
3571 | if (was_vmpriv == FALSE) { |
3572 | set_vm_privilege(FALSE); |
3573 | } |
3574 | } |
3575 | } |
3576 | return ptr; |
3577 | } |
3578 | |
3579 | /* |
3580 | * With UBC, there is no need to expand / shrink the file data |
3581 | * buffer. The VM uses the same pages, hence no waste. |
3582 | * All the file data buffers can have one size. |
3583 | * In fact expand / shrink would be an expensive operation. |
3584 | * |
3585 | * Only exception to this is meta-data buffers. Most of the |
3586 | * meta data operations are smaller than PAGE_SIZE. Having the |
3587 | * meta-data buffers grow and shrink as needed, optimizes use |
3588 | * of the kernel wired memory. |
3589 | */ |
3590 | |
3591 | int |
3592 | allocbuf(buf_t bp, int size) |
3593 | { |
3594 | vm_size_t desired_size; |
3595 | |
3596 | desired_size = roundup(size, CLBYTES); |
3597 | |
3598 | if (desired_size < PAGE_SIZE) { |
3599 | desired_size = PAGE_SIZE; |
3600 | } |
3601 | if (desired_size > MAXBSIZE) { |
3602 | panic("allocbuf: buffer larger than MAXBSIZE requested" ); |
3603 | } |
3604 | |
3605 | if (ISSET(bp->b_flags, B_META)) { |
3606 | int nsize = roundup(size, MINMETA); |
3607 | |
3608 | if (bp->b_datap) { |
3609 | void *elem = (void *)bp->b_datap; |
3610 | |
3611 | if (ISSET(bp->b_flags, B_ZALLOC)) { |
3612 | if (bp->b_bufsize < (uint32_t)nsize) { |
3613 | /* reallocate to a bigger size */ |
3614 | |
3615 | if (nsize <= MAXMETA) { |
3616 | desired_size = nsize; |
3617 | |
3618 | /* b_datap not really a ptr */ |
3619 | *(void **)(&bp->b_datap) = grab_memory_for_meta_buf(nsize); |
3620 | } else { |
3621 | bp->b_datap = (uintptr_t)NULL; |
3622 | kmem_alloc(map: kernel_map, addrp: (vm_offset_t *)&bp->b_datap, size: desired_size, |
3623 | flags: KMA_KOBJECT | KMA_DATA | KMA_NOFAIL, |
3624 | VM_KERN_MEMORY_FILE); |
3625 | CLR(bp->b_flags, B_ZALLOC); |
3626 | } |
3627 | bcopy(src: elem, dst: (caddr_t)bp->b_datap, n: bp->b_bufsize); |
3628 | kheap_free(KHEAP_VFS_BIO, elem, bp->b_bufsize); |
3629 | } else { |
3630 | desired_size = bp->b_bufsize; |
3631 | } |
3632 | } else { |
3633 | if ((vm_size_t)bp->b_bufsize < desired_size) { |
3634 | /* reallocate to a bigger size */ |
3635 | bp->b_datap = (uintptr_t)NULL; |
3636 | kmem_alloc(map: kernel_map, addrp: (vm_offset_t *)&bp->b_datap, size: desired_size, |
3637 | flags: KMA_KOBJECT | KMA_DATA | KMA_NOFAIL, |
3638 | VM_KERN_MEMORY_FILE); |
3639 | bcopy(src: elem, dst: (caddr_t)bp->b_datap, n: bp->b_bufsize); |
3640 | kmem_free(map: kernel_map, addr: (vm_offset_t)elem, size: bp->b_bufsize); |
3641 | } else { |
3642 | desired_size = bp->b_bufsize; |
3643 | } |
3644 | } |
3645 | } else { |
3646 | /* new allocation */ |
3647 | if (nsize <= MAXMETA) { |
3648 | desired_size = nsize; |
3649 | |
3650 | /* b_datap not really a ptr */ |
3651 | *(void **)(&bp->b_datap) = grab_memory_for_meta_buf(nsize); |
3652 | SET(bp->b_flags, B_ZALLOC); |
3653 | } else { |
3654 | kmem_alloc(map: kernel_map, addrp: (vm_offset_t *)&bp->b_datap, size: desired_size, |
3655 | flags: KMA_KOBJECT | KMA_DATA | KMA_NOFAIL, |
3656 | VM_KERN_MEMORY_FILE); |
3657 | } |
3658 | } |
3659 | } |
3660 | bp->b_bufsize = (uint32_t)desired_size; |
3661 | bp->b_bcount = size; |
3662 | |
3663 | return 0; |
3664 | } |
3665 | |
3666 | /* |
3667 | * Get a new buffer from one of the free lists. |
3668 | * |
 * The requested queue is passed in. The queue from which the buffer was
 * taken is returned. Out-of-range queue requests get BQ_EMPTY. A request for
 * BQUEUE means no preference; use heuristics in that case.
 * The heuristics are as follows:
 * Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
 * If none is available, block until one is made available.
 * If buffers are available on both BQ_AGE and BQ_LRU, check the timestamps.
 * Pick the most stale buffer.
 * If the found buffer was marked for delayed write, start the async write
 * and restart the search.
3679 | * Initialize the fields and disassociate the buffer from the vnode. |
3680 | * Remove the buffer from the hash. Return the buffer and the queue |
3681 | * on which it was found. |
3682 | * |
3683 | * buf_mtx is held upon entry |
3684 | * returns with buf_mtx locked if new buf available |
3685 | * returns with buf_mtx UNlocked if new buf NOT available |
3686 | */ |
3687 | |
3688 | static buf_t |
3689 | getnewbuf(int slpflag, int slptimeo, int * queue) |
3690 | { |
3691 | buf_t bp; |
3692 | buf_t lru_bp; |
3693 | buf_t age_bp; |
3694 | buf_t meta_bp; |
3695 | int age_time, lru_time, bp_time, meta_time; |
3696 | int req = *queue; /* save it for restarts */ |
3697 | struct timespec ts; |
3698 | |
3699 | start: |
3700 | /* |
3701 | * invalid request gets empty queue |
3702 | */ |
3703 | if ((*queue >= BQUEUES) || (*queue < 0) |
3704 | || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED)) { |
3705 | *queue = BQ_EMPTY; |
3706 | } |
3707 | |
3708 | |
3709 | if (*queue == BQ_EMPTY && (bp = bufqueues[*queue].tqh_first)) { |
3710 | goto found; |
3711 | } |
3712 | |
3713 | /* |
3714 | * need to grow number of bufs, add another one rather than recycling |
3715 | */ |
3716 | if (nbuf_headers < max_nbuf_headers) { |
3717 | /* |
3718 | * Increment count now as lock |
3719 | * is dropped for allocation. |
 * That avoids overcommits.
3721 | */ |
3722 | nbuf_headers++; |
3723 | goto add_newbufs; |
3724 | } |
3725 | /* Try for the requested queue first */ |
3726 | bp = bufqueues[*queue].tqh_first; |
3727 | if (bp) { |
3728 | goto found; |
3729 | } |
3730 | |
3731 | /* Unable to use requested queue */ |
3732 | age_bp = bufqueues[BQ_AGE].tqh_first; |
3733 | lru_bp = bufqueues[BQ_LRU].tqh_first; |
3734 | meta_bp = bufqueues[BQ_META].tqh_first; |
3735 | |
3736 | if (!age_bp && !lru_bp && !meta_bp) { |
3737 | /* |
 * Unavailable on the AGE, LRU, and META queues.
3739 | * Try the empty list first |
3740 | */ |
3741 | bp = bufqueues[BQ_EMPTY].tqh_first; |
3742 | if (bp) { |
3743 | *queue = BQ_EMPTY; |
3744 | goto found; |
3745 | } |
3746 | /* |
 * We have seen that this is hard to trigger.
 * This is an overcommit of nbufs but needed
 * in some scenarios with disk images
3750 | */ |
3751 | |
3752 | add_newbufs: |
3753 | lck_mtx_unlock(lck: &buf_mtx); |
3754 | |
3755 | /* Create a new temporary buffer header */ |
3756 | bp = zalloc_flags(buf_hdr_zone, Z_WAITOK | Z_NOFAIL); |
3757 | bufhdrinit(bp); |
3758 | bp->b_whichq = BQ_EMPTY; |
3759 | bp->b_timestamp = buf_timestamp(); |
3760 | BLISTNONE(bp); |
3761 | SET(bp->b_flags, B_HDRALLOC); |
3762 | *queue = BQ_EMPTY; |
3763 | lck_mtx_lock_spin(lck: &buf_mtx); |
3764 | |
3765 | if (bp) { |
3766 | binshash(bp, dp: &invalhash); |
3767 | binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY); |
3768 | buf_hdr_count++; |
3769 | goto found; |
3770 | } |
3771 | /* subtract already accounted bufcount */ |
3772 | nbuf_headers--; |
3773 | |
3774 | bufstats.bufs_sleeps++; |
3775 | |
3776 | /* wait for a free buffer of any kind */ |
3777 | needbuffer = 1; |
3778 | /* hz value is 100 */ |
3779 | ts.tv_sec = (slptimeo / 1000); |
3780 | /* the hz value is 100; which leads to 10ms */ |
3781 | ts.tv_nsec = (slptimeo % 1000) * NSEC_PER_USEC * 1000 * 10; |
3782 | |
3783 | msleep(chan: &needbuffer, mtx: &buf_mtx, pri: slpflag | PDROP | (PRIBIO + 1), wmesg: "getnewbuf" , ts: &ts); |
3784 | return NULL; |
3785 | } |
3786 | |
3787 | /* Buffer available either on AGE or LRU or META */ |
3788 | bp = NULL; |
3789 | *queue = -1; |
3790 | |
3791 | /* Buffer available either on AGE or LRU */ |
3792 | if (!age_bp) { |
3793 | bp = lru_bp; |
3794 | *queue = BQ_LRU; |
3795 | } else if (!lru_bp) { |
3796 | bp = age_bp; |
3797 | *queue = BQ_AGE; |
3798 | } else { /* buffer available on both AGE and LRU */ |
3799 | int t = buf_timestamp(); |
3800 | |
3801 | age_time = t - age_bp->b_timestamp; |
3802 | lru_time = t - lru_bp->b_timestamp; |
3803 | if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */ |
3804 | bp = age_bp; |
3805 | *queue = BQ_AGE; |
3806 | /* |
 * we should probably re-timestamp everything in the
3808 | * queues at this point with the current time |
3809 | */ |
3810 | } else { |
3811 | if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) { |
3812 | bp = lru_bp; |
3813 | *queue = BQ_LRU; |
3814 | } else { |
3815 | bp = age_bp; |
3816 | *queue = BQ_AGE; |
3817 | } |
3818 | } |
3819 | } |
3820 | |
3821 | if (!bp) { /* Neither on AGE nor on LRU */ |
3822 | bp = meta_bp; |
3823 | *queue = BQ_META; |
3824 | } else if (meta_bp) { |
3825 | int t = buf_timestamp(); |
3826 | |
3827 | bp_time = t - bp->b_timestamp; |
3828 | meta_time = t - meta_bp->b_timestamp; |
3829 | |
		if ((bp_time >= 0) && (meta_time >= 0)) {
			/* time was not set backwards */
3832 | int bp_is_stale; |
3833 | bp_is_stale = (*queue == BQ_LRU) ? |
3834 | lru_is_stale : age_is_stale; |
3835 | |
3836 | if ((meta_time >= meta_is_stale) && |
3837 | (bp_time < bp_is_stale)) { |
3838 | bp = meta_bp; |
3839 | *queue = BQ_META; |
3840 | } |
3841 | } |
3842 | } |
3843 | found: |
3844 | if (ISSET(bp->b_flags, B_LOCKED) || ISSET(bp->b_lflags, BL_BUSY)) { |
3845 | panic("getnewbuf: bp @ %p is LOCKED or BUSY! (flags 0x%x)" , bp, bp->b_flags); |
3846 | } |
3847 | |
3848 | /* Clean it */ |
3849 | if (bcleanbuf(bp, FALSE)) { |
3850 | /* |
3851 | * moved to the laundry thread, buffer not ready |
3852 | */ |
3853 | *queue = req; |
3854 | goto start; |
3855 | } |
3856 | return bp; |
3857 | } |
3858 | |
3859 | |
3860 | /* |
3861 | * Clean a buffer. |
3862 | * Returns 0 if buffer is ready to use, |
3863 | * Returns 1 if issued a buf_bawrite() to indicate |
3864 | * that the buffer is not ready. |
3865 | * |
3866 | * buf_mtx is held upon entry |
3867 | * returns with buf_mtx locked |
3868 | */ |
3869 | int |
3870 | bcleanbuf(buf_t bp, boolean_t discard) |
3871 | { |
3872 | /* Remove from the queue */ |
3873 | bremfree_locked(bp); |
3874 | |
3875 | #ifdef JOE_DEBUG |
3876 | bp->b_owner = current_thread(); |
3877 | bp->b_tag = 2; |
3878 | #endif |
3879 | /* |
3880 | * If buffer was a delayed write, start the IO by queuing |
3881 | * it on the LAUNDRY queue, and return 1 |
3882 | */ |
3883 | if (ISSET(bp->b_flags, B_DELWRI)) { |
3884 | if (discard) { |
3885 | SET(bp->b_lflags, BL_WANTDEALLOC); |
3886 | } |
3887 | |
3888 | bmovelaundry(bp); |
3889 | |
3890 | lck_mtx_unlock(lck: &buf_mtx); |
3891 | |
3892 | wakeup(chan: &bufqueues[BQ_LAUNDRY]); |
3893 | /* |
3894 | * and give it a chance to run |
3895 | */ |
3896 | (void)thread_block(THREAD_CONTINUE_NULL); |
3897 | |
3898 | lck_mtx_lock_spin(lck: &buf_mtx); |
3899 | |
3900 | return 1; |
3901 | } |
3902 | #ifdef JOE_DEBUG |
3903 | bp->b_owner = current_thread(); |
3904 | bp->b_tag = 8; |
3905 | #endif |
3906 | /* |
3907 | * Buffer is no longer on any free list... we own it |
3908 | */ |
3909 | SET(bp->b_lflags, BL_BUSY); |
3910 | buf_busycount++; |
3911 | |
3912 | bremhash(bp); |
3913 | |
3914 | /* |
3915 | * disassociate us from our vnode, if we had one... |
3916 | */ |
3917 | if (bp->b_vp) { |
3918 | brelvp_locked(bp); |
3919 | } |
3920 | |
3921 | lck_mtx_unlock(lck: &buf_mtx); |
3922 | |
3923 | BLISTNONE(bp); |
3924 | |
3925 | if (ISSET(bp->b_flags, B_META)) { |
3926 | buf_free_meta_store(bp); |
3927 | } |
3928 | |
3929 | trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno); |
3930 | |
3931 | buf_release_credentials(bp); |
3932 | |
3933 | /* If discarding, just move to the empty queue */ |
3934 | if (discard) { |
3935 | lck_mtx_lock_spin(lck: &buf_mtx); |
3936 | CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA)); |
3937 | bp->b_whichq = BQ_EMPTY; |
3938 | binshash(bp, dp: &invalhash); |
3939 | binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY); |
3940 | CLR(bp->b_lflags, BL_BUSY); |
3941 | buf_busycount--; |
3942 | } else { |
3943 | /* Not discarding: clean up and prepare for reuse */ |
3944 | bp->b_bufsize = 0; |
3945 | bp->b_datap = (uintptr_t)NULL; |
3946 | bp->b_upl = (void *)NULL; |
3947 | bp->b_fsprivate = (void *)NULL; |
3948 | /* |
3949 | * preserve the state of whether this buffer |
3950 | * was allocated on the fly or not... |
3951 | * the only other flag that should be set at |
3952 | * this point is BL_BUSY... |
3953 | */ |
3954 | #ifdef JOE_DEBUG |
3955 | bp->b_owner = current_thread(); |
3956 | bp->b_tag = 3; |
3957 | #endif |
3958 | bp->b_lflags = BL_BUSY; |
3959 | bp->b_flags = (bp->b_flags & B_HDRALLOC); |
3960 | bp->b_redundancy_flags = 0; |
3961 | bp->b_dev = NODEV; |
3962 | bp->b_blkno = bp->b_lblkno = 0; |
3963 | bp->b_lblksize = 0; |
3964 | bp->b_iodone = NULL; |
3965 | bp->b_error = 0; |
3966 | bp->b_resid = 0; |
3967 | bp->b_bcount = 0; |
3968 | bp->b_dirtyoff = bp->b_dirtyend = 0; |
3969 | bp->b_validoff = bp->b_validend = 0; |
3970 | bzero(s: &bp->b_attr, n: sizeof(struct bufattr)); |
3971 | |
3972 | lck_mtx_lock_spin(lck: &buf_mtx); |
3973 | } |
3974 | return 0; |
3975 | } |
3976 | |
3977 | |
3978 | |
3979 | errno_t |
3980 | buf_invalblkno(vnode_t vp, daddr64_t lblkno, int flags) |
3981 | { |
3982 | buf_t bp; |
3983 | errno_t error; |
3984 | struct bufhashhdr *dp; |
3985 | |
3986 | dp = BUFHASH(vp, lblkno); |
3987 | |
3988 | relook: |
3989 | lck_mtx_lock_spin(lck: &buf_mtx); |
3990 | |
3991 | if ((bp = incore_locked(vp, blkno: lblkno, dp)) == (struct buf *)0) { |
3992 | lck_mtx_unlock(lck: &buf_mtx); |
3993 | return 0; |
3994 | } |
3995 | if (ISSET(bp->b_lflags, BL_BUSY)) { |
3996 | if (!ISSET(flags, BUF_WAIT)) { |
3997 | lck_mtx_unlock(lck: &buf_mtx); |
3998 | return EBUSY; |
3999 | } |
4000 | SET(bp->b_lflags, BL_WANTED); |
4001 | |
4002 | error = msleep(chan: (caddr_t)bp, mtx: &buf_mtx, PDROP | (PRIBIO + 1), wmesg: "buf_invalblkno" , NULL); |
4003 | |
4004 | if (error) { |
4005 | return error; |
4006 | } |
4007 | goto relook; |
4008 | } |
4009 | bremfree_locked(bp); |
4010 | SET(bp->b_lflags, BL_BUSY); |
4011 | SET(bp->b_flags, B_INVAL); |
4012 | buf_busycount++; |
4013 | #ifdef JOE_DEBUG |
4014 | bp->b_owner = current_thread(); |
4015 | bp->b_tag = 4; |
4016 | #endif |
4017 | lck_mtx_unlock(lck: &buf_mtx); |
4018 | buf_brelse(bp); |
4019 | |
4020 | return 0; |
4021 | } |
4022 | |
4023 | |
4024 | void |
4025 | buf_drop(buf_t bp) |
4026 | { |
4027 | int need_wakeup = 0; |
4028 | |
4029 | lck_mtx_lock_spin(lck: &buf_mtx); |
4030 | |
4031 | if (ISSET(bp->b_lflags, BL_WANTED)) { |
4032 | /* |
4033 | * delay the actual wakeup until after we |
4034 | * clear BL_BUSY and we've dropped buf_mtx |
4035 | */ |
4036 | need_wakeup = 1; |
4037 | } |
4038 | #ifdef JOE_DEBUG |
4039 | bp->b_owner = current_thread(); |
4040 | bp->b_tag = 9; |
4041 | #endif |
4042 | /* |
4043 | * Unlock the buffer. |
4044 | */ |
4045 | CLR(bp->b_lflags, (BL_BUSY | BL_WANTED)); |
4046 | buf_busycount--; |
4047 | |
4048 | lck_mtx_unlock(lck: &buf_mtx); |
4049 | |
4050 | if (need_wakeup) { |
4051 | /* |
4052 | * Wake up any proceeses waiting for _this_ buffer to become free. |
4053 | */ |
4054 | wakeup(chan: bp); |
4055 | } |
4056 | } |
4057 | |
4058 | |
4059 | errno_t |
4060 | buf_acquire(buf_t bp, int flags, int slpflag, int slptimeo) |
4061 | { |
4062 | errno_t error; |
4063 | |
4064 | lck_mtx_lock_spin(lck: &buf_mtx); |
4065 | |
4066 | error = buf_acquire_locked(bp, flags, slpflag, slptimeo); |
4067 | |
4068 | lck_mtx_unlock(lck: &buf_mtx); |
4069 | |
4070 | return error; |
4071 | } |
4072 | |
4073 | |
4074 | static errno_t |
4075 | buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo) |
4076 | { |
4077 | errno_t error; |
4078 | struct timespec ts; |
4079 | |
4080 | if (ISSET(bp->b_flags, B_LOCKED)) { |
4081 | if ((flags & BAC_SKIP_LOCKED)) { |
4082 | return EDEADLK; |
4083 | } |
4084 | } else { |
4085 | if ((flags & BAC_SKIP_NONLOCKED)) { |
4086 | return EDEADLK; |
4087 | } |
4088 | } |
4089 | if (ISSET(bp->b_lflags, BL_BUSY)) { |
4090 | /* |
4091 | * since the lck_mtx_lock may block, the buffer |
4092 | * may become BUSY, so we need to |
4093 | * recheck for a NOWAIT request |
4094 | */ |
4095 | if (flags & BAC_NOWAIT) { |
4096 | return EBUSY; |
4097 | } |
4098 | SET(bp->b_lflags, BL_WANTED); |
4099 | |
		/* the hz value is 100, so slptimeo is in 10ms ticks */
4101 | ts.tv_sec = (slptimeo / 100); |
4102 | ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000; |
4103 | error = msleep(chan: (caddr_t)bp, mtx: &buf_mtx, pri: slpflag | (PRIBIO + 1), wmesg: "buf_acquire" , ts: &ts); |
4104 | |
4105 | if (error) { |
4106 | return error; |
4107 | } |
4108 | return EAGAIN; |
4109 | } |
4110 | if (flags & BAC_REMOVE) { |
4111 | bremfree_locked(bp); |
4112 | } |
4113 | SET(bp->b_lflags, BL_BUSY); |
4114 | buf_busycount++; |
4115 | |
4116 | #ifdef JOE_DEBUG |
4117 | bp->b_owner = current_thread(); |
4118 | bp->b_tag = 5; |
4119 | #endif |
4120 | return 0; |
4121 | } |
4122 | |
4123 | |
4124 | /* |
4125 | * Wait for operations on the buffer to complete. |
4126 | * When they do, extract and return the I/O's error value. |
4127 | */ |
4128 | errno_t |
4129 | buf_biowait(buf_t bp) |
4130 | { |
4131 | while (!ISSET(bp->b_flags, B_DONE)) { |
4132 | lck_mtx_lock_spin(lck: &buf_mtx); |
4133 | |
4134 | if (!ISSET(bp->b_flags, B_DONE)) { |
4135 | DTRACE_IO1(wait__start, buf_t, bp); |
4136 | (void) msleep(chan: bp, mtx: &buf_mtx, PDROP | (PRIBIO + 1), wmesg: "buf_biowait" , NULL); |
4137 | DTRACE_IO1(wait__done, buf_t, bp); |
4138 | } else { |
4139 | lck_mtx_unlock(lck: &buf_mtx); |
4140 | } |
4141 | } |
4142 | /* check for interruption of I/O (e.g. via NFS), then errors. */ |
4143 | if (ISSET(bp->b_flags, B_EINTR)) { |
4144 | CLR(bp->b_flags, B_EINTR); |
4145 | return EINTR; |
4146 | } else if (ISSET(bp->b_flags, B_ERROR)) { |
4147 | return bp->b_error ? bp->b_error : EIO; |
4148 | } else { |
4149 | return 0; |
4150 | } |
4151 | } |
4152 | |
4153 | |
4154 | /* |
4155 | * Mark I/O complete on a buffer. |
4156 | * |
4157 | * If a callback has been requested, e.g. the pageout |
4158 | * daemon, do so. Otherwise, awaken waiting processes. |
4159 | * |
4160 | * [ Leffler, et al., says on p.247: |
4161 | * "This routine wakes up the blocked process, frees the buffer |
4162 | * for an asynchronous write, or, for a request by the pagedaemon |
4163 | * process, invokes a procedure specified in the buffer structure" ] |
4164 | * |
 * In real life, the pagedaemon (or other system processes) wants
 * to do async stuff too, and doesn't want the buffer buf_brelse()'d.
4167 | * (for swap pager, that puts swap buffers on the free lists (!!!), |
4168 | * for the vn device, that puts malloc'd buffers on the free lists!) |
4169 | */ |
4170 | |
4171 | void |
4172 | buf_biodone(buf_t bp) |
4173 | { |
4174 | mount_t mp; |
4175 | struct bufattr *bap; |
4176 | struct timeval real_elapsed; |
4177 | uint64_t real_elapsed_usec = 0; |
4178 | |
4179 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START, |
4180 | bp, bp->b_datap, bp->b_flags, 0, 0); |
4181 | |
4182 | /* Record our progress. */ |
4183 | vfs_update_last_completion_time(); |
4184 | |
4185 | if (ISSET(bp->b_flags, B_DONE)) { |
4186 | panic("biodone already" ); |
4187 | } |
4188 | |
4189 | bap = &bp->b_attr; |
4190 | |
4191 | if (bp->b_vp && bp->b_vp->v_mount) { |
4192 | mp = bp->b_vp->v_mount; |
4193 | } else { |
4194 | mp = NULL; |
4195 | } |
4196 | |
4197 | if (ISSET(bp->b_flags, B_ERROR)) { |
4198 | if (mp && (MNT_ROOTFS & mp->mnt_flag)) { |
4199 | dk_error_description_t desc; |
4200 | bzero(s: &desc, n: sizeof(desc)); |
4201 | desc.description = panic_disk_error_description; |
4202 | desc.description_size = panic_disk_error_description_size; |
4203 | VNOP_IOCTL(vp: mp->mnt_devvp, DKIOCGETERRORDESCRIPTION, data: (caddr_t)&desc, fflag: 0, ctx: vfs_context_kernel()); |
4204 | } |
4205 | } |
4206 | |
4207 | if (mp && (bp->b_flags & B_READ) == 0) { |
4208 | update_last_io_time(mp); |
4209 | INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_write_size); |
4210 | } else if (mp) { |
4211 | INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_read_size); |
4212 | } |
4213 | |
4214 | throttle_info_end_io(bp); |
4215 | |
4216 | if (kdebug_enable) { |
4217 | int code = DKIO_DONE; |
4218 | int io_tier = GET_BUFATTR_IO_TIER(bap); |
4219 | |
4220 | if (bp->b_flags & B_READ) { |
4221 | code |= DKIO_READ; |
4222 | } |
4223 | if (bp->b_flags & B_ASYNC) { |
4224 | code |= DKIO_ASYNC; |
4225 | } |
4226 | |
4227 | if (bp->b_flags & B_META) { |
4228 | code |= DKIO_META; |
4229 | } else if (bp->b_flags & B_PAGEIO) { |
4230 | code |= DKIO_PAGING; |
4231 | } |
4232 | |
4233 | if (io_tier != 0) { |
4234 | code |= DKIO_THROTTLE; |
4235 | } |
4236 | |
4237 | code |= ((io_tier << DKIO_TIER_SHIFT) & DKIO_TIER_MASK); |
4238 | |
4239 | if (bp->b_flags & B_PASSIVE) { |
4240 | code |= DKIO_PASSIVE; |
4241 | } |
4242 | |
4243 | if (bap->ba_flags & BA_NOCACHE) { |
4244 | code |= DKIO_NOCACHE; |
4245 | } |
4246 | |
4247 | if (bap->ba_flags & BA_IO_TIER_UPGRADE) { |
4248 | code |= DKIO_TIER_UPGRADE; |
4249 | } |
4250 | |
4251 | KDBG_RELEASE_NOPROCFILT(FSDBG_CODE(DBG_DKRW, code), |
4252 | buf_kernel_addrperm_addr(bp), |
4253 | (uintptr_t)VM_KERNEL_ADDRPERM(bp->b_vp), bp->b_resid, |
4254 | bp->b_error); |
4255 | } |
4256 | |
4257 | microuptime(tv: &real_elapsed); |
4258 | timevalsub(t1: &real_elapsed, t2: &bp->b_timestamp_tv); |
4259 | real_elapsed_usec = real_elapsed.tv_sec * USEC_PER_SEC + real_elapsed.tv_usec; |
4260 | disk_conditioner_delay(bp, 1, bp->b_bcount, real_elapsed_usec); |
4261 | |
4262 | /* |
4263 | * I/O was done, so don't believe |
4264 | * the DIRTY state from VM anymore... |
4265 | * and we need to reset the THROTTLED/PASSIVE |
4266 | * indicators |
4267 | */ |
4268 | CLR(bp->b_flags, (B_WASDIRTY | B_PASSIVE)); |
4269 | CLR(bap->ba_flags, (BA_META | BA_NOCACHE | BA_DELAYIDLESLEEP | BA_IO_TIER_UPGRADE)); |
4270 | |
4271 | SET_BUFATTR_IO_TIER(bap, 0); |
4272 | |
4273 | DTRACE_IO1(done, buf_t, bp); |
4274 | |
4275 | if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW)) { |
4276 | /* |
4277 | * wake up any writer's blocked |
4278 | * on throttle or waiting for I/O |
4279 | * to drain |
4280 | */ |
4281 | vnode_writedone(vp: bp->b_vp); |
4282 | } |
4283 | |
4284 | if (ISSET(bp->b_flags, (B_CALL | B_FILTER))) { /* if necessary, call out */ |
4285 | void (*iodone_func)(struct buf *, void *) = bp->b_iodone; |
4286 | void *arg = bp->b_transaction; |
4287 | int callout = ISSET(bp->b_flags, B_CALL); |
4288 | |
4289 | if (iodone_func == NULL) { |
4290 | panic("biodone: bp @ %p has NULL b_iodone!" , bp); |
4291 | } |
4292 | |
4293 | CLR(bp->b_flags, (B_CALL | B_FILTER)); /* filters and callouts are one-shot */ |
4294 | bp->b_iodone = NULL; |
4295 | bp->b_transaction = NULL; |
4296 | |
4297 | if (callout) { |
4298 | SET(bp->b_flags, B_DONE); /* note that it's done */ |
4299 | } |
4300 | (*iodone_func)(bp, arg); |
4301 | |
4302 | if (callout) { |
4303 | /* |
4304 | * assumes that the callback function takes |
4305 | * ownership of the bp and deals with releasing it if necessary |
4306 | */ |
4307 | goto biodone_done; |
4308 | } |
4309 | /* |
		 * in this case the callback function is acting
4311 | * strictly as a filter... it does not take |
4312 | * ownership of the bp and is expecting us |
4313 | * to finish cleaning up... this is currently used |
4314 | * by the HFS journaling code |
4315 | */ |
4316 | } |
4317 | if (ISSET(bp->b_flags, B_ASYNC)) { /* if async, release it */ |
4318 | SET(bp->b_flags, B_DONE); /* note that it's done */ |
4319 | |
4320 | buf_brelse(bp); |
4321 | } else { /* or just wakeup the buffer */ |
4322 | /* |
4323 | * by taking the mutex, we serialize |
4324 | * the buf owner calling buf_biowait so that we'll |
4325 | * only see him in one of 2 states... |
4326 | * state 1: B_DONE wasn't set and he's |
4327 | * blocked in msleep |
4328 | * state 2: he's blocked trying to take the |
4329 | * mutex before looking at B_DONE |
4330 | * BL_WANTED is cleared in case anyone else |
4331 | * is blocked waiting for the buffer... note |
4332 | * that we haven't cleared B_BUSY yet, so if |
4333 | * they do get to run, their going to re-set |
4334 | * BL_WANTED and go back to sleep |
4335 | */ |
4336 | lck_mtx_lock_spin(lck: &buf_mtx); |
4337 | |
4338 | CLR(bp->b_lflags, BL_WANTED); |
4339 | SET(bp->b_flags, B_DONE); /* note that it's done */ |
4340 | |
4341 | lck_mtx_unlock(lck: &buf_mtx); |
4342 | |
4343 | wakeup(chan: bp); |
4344 | } |
4345 | biodone_done: |
4346 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END, |
4347 | (uintptr_t)bp, (uintptr_t)bp->b_datap, bp->b_flags, 0, 0); |
4348 | } |
4349 | |
4350 | /* |
4351 | * Obfuscate buf pointers. |
4352 | */ |
4353 | vm_offset_t |
4354 | buf_kernel_addrperm_addr(void * addr) |
4355 | { |
4356 | if ((vm_offset_t)addr == 0) { |
4357 | return 0; |
4358 | } else { |
4359 | return (vm_offset_t)addr + buf_kernel_addrperm; |
4360 | } |
4361 | } |
4362 | |
4363 | /* |
4364 | * Return a count of buffers on the "locked" queue. |
4365 | */ |
4366 | int |
4367 | count_lock_queue(void) |
4368 | { |
4369 | buf_t bp; |
4370 | int n = 0; |
4371 | |
4372 | lck_mtx_lock_spin(lck: &buf_mtx); |
4373 | |
4374 | for (bp = bufqueues[BQ_LOCKED].tqh_first; bp; |
4375 | bp = bp->b_freelist.tqe_next) { |
4376 | n++; |
4377 | } |
4378 | lck_mtx_unlock(lck: &buf_mtx); |
4379 | |
4380 | return n; |
4381 | } |
4382 | |
4383 | /* |
4384 | * Return a count of 'busy' buffers. Used at the time of shutdown. |
4385 | * note: This is also called from the mach side in debug context in kdp.c |
4386 | */ |
4387 | uint32_t |
4388 | count_busy_buffers(void) |
4389 | { |
4390 | return buf_busycount + bufstats.bufs_iobufinuse; |
4391 | } |
4392 | |
4393 | #if DIAGNOSTIC |
4394 | /* |
4395 | * Print out statistics on the current allocation of the buffer pool. |
4396 | * Can be enabled to print out on every ``sync'' by setting "syncprt" |
4397 | * in vfs_syscalls.c using sysctl. |
4398 | */ |
4399 | void |
4400 | vfs_bufstats() |
4401 | { |
4402 | int i, j, count; |
4403 | struct buf *bp; |
4404 | struct bqueues *dp; |
4405 | int counts[MAXBSIZE / CLBYTES + 1]; |
4406 | static char *bname[BQUEUES] = |
4407 | { "LOCKED" , "LRU" , "AGE" , "EMPTY" , "META" , "LAUNDRY" }; |
4408 | |
4409 | for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) { |
4410 | count = 0; |
4411 | for (j = 0; j <= MAXBSIZE / CLBYTES; j++) { |
4412 | counts[j] = 0; |
4413 | } |
4414 | |
4415 | lck_mtx_lock(&buf_mtx); |
4416 | |
4417 | for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) { |
4418 | counts[bp->b_bufsize / CLBYTES]++; |
4419 | count++; |
4420 | } |
4421 | lck_mtx_unlock(&buf_mtx); |
4422 | |
4423 | printf("%s: total-%d" , bname[i], count); |
4424 | for (j = 0; j <= MAXBSIZE / CLBYTES; j++) { |
4425 | if (counts[j] != 0) { |
4426 | printf(", %d-%d" , j * CLBYTES, counts[j]); |
4427 | } |
4428 | } |
4429 | printf("\n" ); |
4430 | } |
4431 | } |
4432 | #endif /* DIAGNOSTIC */ |
4433 | |
4434 | #define NRESERVEDIOBUFS 128 |
4435 | |
4436 | #define MNT_VIRTUALDEV_MAX_IOBUFS 128 |
4437 | #define VIRTUALDEV_MAX_IOBUFS ((40*niobuf_headers)/100) |
4438 | |
4439 | buf_t |
4440 | alloc_io_buf(vnode_t vp, int priv) |
4441 | { |
4442 | buf_t bp; |
4443 | mount_t mp = NULL; |
4444 | int alloc_for_virtualdev = FALSE; |
4445 | |
4446 | lck_mtx_lock_spin(lck: &iobuffer_mtxp); |
4447 | |
4448 | /* |
4449 | * We subject iobuf requests for diskimages to additional restrictions. |
4450 | * |
4451 | * a) A single diskimage mount cannot use up more than |
4452 | * MNT_VIRTUALDEV_MAX_IOBUFS. However,vm privileged (pageout) requests |
4453 | * are not subject to this restriction. |
4454 | * b) iobuf headers used by all diskimage headers by all mount |
4455 | * points cannot exceed VIRTUALDEV_MAX_IOBUFS. |
4456 | */ |
4457 | if (vp && ((mp = vp->v_mount)) && mp != dead_mountp && |
4458 | mp->mnt_kern_flag & MNTK_VIRTUALDEV) { |
4459 | alloc_for_virtualdev = TRUE; |
4460 | while ((!priv && mp->mnt_iobufinuse > MNT_VIRTUALDEV_MAX_IOBUFS) || |
4461 | bufstats.bufs_iobufinuse_vdev > VIRTUALDEV_MAX_IOBUFS) { |
4462 | bufstats.bufs_iobufsleeps++; |
4463 | |
4464 | need_iobuffer = 1; |
4465 | (void)msleep(chan: &need_iobuffer, mtx: &iobuffer_mtxp, |
4466 | PSPIN | (PRIBIO + 1), wmesg: (const char *)"alloc_io_buf (1)" , |
4467 | NULL); |
4468 | } |
4469 | } |
4470 | |
4471 | while ((((uint32_t)(niobuf_headers - NRESERVEDIOBUFS) < bufstats.bufs_iobufinuse) && !priv) || |
4472 | (bp = iobufqueue.tqh_first) == NULL) { |
4473 | bufstats.bufs_iobufsleeps++; |
4474 | |
4475 | need_iobuffer = 1; |
4476 | (void)msleep(chan: &need_iobuffer, mtx: &iobuffer_mtxp, PSPIN | (PRIBIO + 1), |
4477 | wmesg: (const char *)"alloc_io_buf (2)" , NULL); |
4478 | } |
4479 | TAILQ_REMOVE(&iobufqueue, bp, b_freelist); |
4480 | |
4481 | bufstats.bufs_iobufinuse++; |
4482 | if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax) { |
4483 | bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse; |
4484 | } |
4485 | |
4486 | if (alloc_for_virtualdev) { |
4487 | mp->mnt_iobufinuse++; |
4488 | bufstats.bufs_iobufinuse_vdev++; |
4489 | } |
4490 | |
4491 | lck_mtx_unlock(lck: &iobuffer_mtxp); |
4492 | |
4493 | /* |
4494 | * initialize various fields |
4495 | * we don't need to hold the mutex since the buffer |
4496 | * is now private... the vp should have a reference |
4497 | * on it and is not protected by this mutex in any event |
4498 | */ |
4499 | bp->b_timestamp = 0; |
4500 | bp->b_proc = NULL; |
4501 | |
4502 | bp->b_datap = 0; |
4503 | bp->b_flags = 0; |
4504 | bp->b_lflags = BL_BUSY | BL_IOBUF; |
4505 | if (alloc_for_virtualdev) { |
4506 | bp->b_lflags |= BL_IOBUF_VDEV; |
4507 | } |
4508 | bp->b_redundancy_flags = 0; |
4509 | bp->b_blkno = bp->b_lblkno = 0; |
4510 | bp->b_lblksize = 0; |
4511 | #ifdef JOE_DEBUG |
4512 | bp->b_owner = current_thread(); |
4513 | bp->b_tag = 6; |
4514 | #endif |
4515 | bp->b_iodone = NULL; |
4516 | bp->b_error = 0; |
4517 | bp->b_resid = 0; |
4518 | bp->b_bcount = 0; |
4519 | bp->b_bufsize = 0; |
4520 | bp->b_upl = NULL; |
4521 | bp->b_fsprivate = (void *)NULL; |
4522 | bp->b_vp = vp; |
4523 | bzero(s: &bp->b_attr, n: sizeof(struct bufattr)); |
4524 | |
4525 | if (vp && (vp->v_type == VBLK || vp->v_type == VCHR)) { |
4526 | bp->b_dev = vp->v_rdev; |
4527 | } else { |
4528 | bp->b_dev = NODEV; |
4529 | } |
4530 | |
4531 | return bp; |
4532 | } |
4533 | |
4534 | |
4535 | void |
4536 | free_io_buf(buf_t bp) |
4537 | { |
4538 | int need_wakeup = 0; |
4539 | int free_for_virtualdev = FALSE; |
4540 | mount_t mp = NULL; |
4541 | |
4542 | /* Was this iobuf for a diskimage ? */ |
4543 | if (bp->b_lflags & BL_IOBUF_VDEV) { |
4544 | free_for_virtualdev = TRUE; |
4545 | if (bp->b_vp) { |
4546 | mp = bp->b_vp->v_mount; |
4547 | } |
4548 | } |
4549 | |
4550 | /* |
4551 | * put buffer back on the head of the iobufqueue |
4552 | */ |
4553 | bp->b_vp = NULL; |
4554 | bp->b_flags = B_INVAL; |
4555 | |
4556 | /* Zero out the bufattr and its flags before relinquishing this iobuf */ |
4557 | bzero(s: &bp->b_attr, n: sizeof(struct bufattr)); |
4558 | |
4559 | lck_mtx_lock_spin(lck: &iobuffer_mtxp); |
4560 | |
4561 | binsheadfree(bp, &iobufqueue, -1); |
4562 | |
4563 | if (need_iobuffer) { |
4564 | /* |
4565 | * Wake up any processes waiting because they need an io buffer |
4566 | * |
4567 | * do the wakeup after we drop the mutex... it's possible that the |
4568 | * wakeup will be superfluous if need_iobuffer gets set again and |
4569 | * another thread runs this path, but it's highly unlikely, doesn't |
4570 | * hurt, and it means we don't hold up I/O progress if the wakeup blocks |
4571 | * trying to grab a task related lock... |
4572 | */ |
4573 | need_iobuffer = 0; |
4574 | need_wakeup = 1; |
4575 | } |
4576 | if (bufstats.bufs_iobufinuse <= 0) { |
4577 | panic("free_io_buf: bp(%p) - bufstats.bufs_iobufinuse < 0" , bp); |
4578 | } |
4579 | |
4580 | bufstats.bufs_iobufinuse--; |
4581 | |
4582 | if (free_for_virtualdev) { |
4583 | bufstats.bufs_iobufinuse_vdev--; |
4584 | if (mp && mp != dead_mountp) { |
4585 | mp->mnt_iobufinuse--; |
4586 | } |
4587 | } |
4588 | |
4589 | lck_mtx_unlock(lck: &iobuffer_mtxp); |
4590 | |
4591 | if (need_wakeup) { |
4592 | wakeup(chan: &need_iobuffer); |
4593 | } |
4594 | } |
4595 | |
4596 | |
4597 | void |
4598 | buf_list_lock(void) |
4599 | { |
4600 | lck_mtx_lock_spin(lck: &buf_mtx); |
4601 | } |
4602 | |
4603 | void |
4604 | buf_list_unlock(void) |
4605 | { |
4606 | lck_mtx_unlock(lck: &buf_mtx); |
4607 | } |
4608 | |
4609 | /* |
4610 | * If getnewbuf() calls bcleanbuf() on the same thread |
4611 | * there is a potential for stack overrun and deadlocks. |
4612 | * So we always handoff the work to a worker thread for completion |
4613 | */ |
4614 | |
4615 | |
4616 | static void |
4617 | bcleanbuf_thread_init(void) |
4618 | { |
4619 | thread_t thread = THREAD_NULL; |
4620 | |
4621 | /* create worker thread */ |
4622 | kernel_thread_start(continuation: (thread_continue_t)bcleanbuf_thread, NULL, new_thread: &thread); |
4623 | thread_deallocate(thread); |
4624 | } |
4625 | |
4626 | typedef int (*bcleanbufcontinuation)(int); |
4627 | |
4628 | __attribute__((noreturn)) |
4629 | static void |
4630 | bcleanbuf_thread(void) |
4631 | { |
4632 | struct buf *bp; |
4633 | int error = 0; |
4634 | int loopcnt = 0; |
4635 | |
4636 | for (;;) { |
4637 | lck_mtx_lock_spin(lck: &buf_mtx); |
4638 | |
4639 | while ((bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY])) == NULL) { |
4640 | (void)msleep0(chan: &bufqueues[BQ_LAUNDRY], mtx: &buf_mtx, PRIBIO | PDROP, wmesg: "blaundry" , timo: 0, continuation: (bcleanbufcontinuation)bcleanbuf_thread); |
4641 | } |
4642 | |
4643 | /* |
4644 | * Remove from the queue |
4645 | */ |
4646 | bremfree_locked(bp); |
4647 | |
4648 | /* |
4649 | * Buffer is no longer on any free list |
4650 | */ |
4651 | SET(bp->b_lflags, BL_BUSY); |
4652 | buf_busycount++; |
4653 | |
4654 | #ifdef JOE_DEBUG |
4655 | bp->b_owner = current_thread(); |
4656 | bp->b_tag = 10; |
4657 | #endif |
4658 | |
4659 | lck_mtx_unlock(lck: &buf_mtx); |
4660 | /* |
4661 | * do the IO |
4662 | */ |
4663 | error = bawrite_internal(bp, throttle: 0); |
4664 | |
4665 | if (error) { |
4666 | bp->b_whichq = BQ_LAUNDRY; |
4667 | bp->b_timestamp = buf_timestamp(); |
4668 | |
4669 | lck_mtx_lock_spin(lck: &buf_mtx); |
4670 | |
4671 | binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY); |
4672 | blaundrycnt++; |
4673 | |
			/* we never leave a busy buffer on the laundry queue */
4675 | CLR(bp->b_lflags, BL_BUSY); |
4676 | buf_busycount--; |
4677 | #ifdef JOE_DEBUG |
4678 | bp->b_owner = current_thread(); |
4679 | bp->b_tag = 11; |
4680 | #endif |
4681 | |
4682 | lck_mtx_unlock(lck: &buf_mtx); |
4683 | |
4684 | if (loopcnt > MAXLAUNDRY) { |
4685 | /* |
4686 | * bawrite_internal() can return errors if we're throttled. If we've |
4687 | * done several I/Os and failed, give the system some time to unthrottle |
4688 | * the vnode |
4689 | */ |
4690 | (void)tsleep(chan: (void *)&bufqueues[BQ_LAUNDRY], PRIBIO, wmesg: "blaundry" , timo: 1); |
4691 | loopcnt = 0; |
4692 | } else { |
4693 | /* give other threads a chance to run */ |
4694 | (void)thread_block(THREAD_CONTINUE_NULL); |
4695 | loopcnt++; |
4696 | } |
4697 | } |
4698 | } |
4699 | } |
4700 | |
4701 | |
4702 | static int |
4703 | brecover_data(buf_t bp) |
4704 | { |
4705 | int upl_offset; |
4706 | upl_t upl; |
4707 | upl_page_info_t *pl; |
4708 | kern_return_t kret; |
4709 | vnode_t vp = bp->b_vp; |
4710 | int upl_flags; |
4711 | |
4712 | |
4713 | if (!UBCINFOEXISTS(vp) || bp->b_bufsize == 0) { |
4714 | goto dump_buffer; |
4715 | } |
4716 | |
4717 | upl_flags = UPL_PRECIOUS; |
4718 | if (!(buf_flags(bp) & B_READ)) { |
4719 | /* |
4720 | * "write" operation: let the UPL subsystem know |
4721 | * that we intend to modify the buffer cache pages we're |
4722 | * gathering. |
4723 | */ |
4724 | upl_flags |= UPL_WILL_MODIFY; |
4725 | } |
4726 | |
4727 | kret = ubc_create_upl_kernel(vp, |
4728 | ubc_blktooff(vp, bp->b_lblkno), |
4729 | bp->b_bufsize, |
4730 | &upl, |
4731 | &pl, |
4732 | upl_flags, |
4733 | VM_KERN_MEMORY_FILE); |
4734 | if (kret != KERN_SUCCESS) { |
4735 | panic("Failed to create UPL" ); |
4736 | } |
4737 | |
4738 | for (upl_offset = 0; (uint32_t)upl_offset < bp->b_bufsize; upl_offset += PAGE_SIZE) { |
4739 | if (!upl_valid_page(upl: pl, index: upl_offset / PAGE_SIZE) || !upl_dirty_page(upl: pl, index: upl_offset / PAGE_SIZE)) { |
4740 | ubc_upl_abort(upl, 0); |
4741 | goto dump_buffer; |
4742 | } |
4743 | } |
4744 | bp->b_upl = upl; |
4745 | |
4746 | kret = ubc_upl_map(upl, (vm_offset_t *)&(bp->b_datap)); |
4747 | |
4748 | if (kret != KERN_SUCCESS) { |
4749 | panic("getblk: ubc_upl_map() failed with (%d)" , kret); |
4750 | } |
4751 | return 1; |
4752 | |
4753 | dump_buffer: |
4754 | bp->b_bufsize = 0; |
4755 | SET(bp->b_flags, B_INVAL); |
4756 | buf_brelse(bp); |
4757 | |
4758 | return 0; |
4759 | } |
4760 | |
4761 | int |
4762 | fs_buffer_cache_gc_register(void (* callout)(int, void *), void *context) |
4763 | { |
4764 | lck_mtx_lock(lck: &buf_gc_callout); |
4765 | for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) { |
4766 | if (fs_callouts[i].callout == NULL) { |
4767 | fs_callouts[i].callout = callout; |
4768 | fs_callouts[i].context = context; |
4769 | lck_mtx_unlock(lck: &buf_gc_callout); |
4770 | return 0; |
4771 | } |
4772 | } |
4773 | |
4774 | lck_mtx_unlock(lck: &buf_gc_callout); |
4775 | return ENOMEM; |
4776 | } |
4777 | |
4778 | int |
4779 | fs_buffer_cache_gc_unregister(void (* callout)(int, void *), void *context) |
4780 | { |
4781 | lck_mtx_lock(lck: &buf_gc_callout); |
4782 | for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) { |
4783 | if (fs_callouts[i].callout == callout && |
4784 | fs_callouts[i].context == context) { |
4785 | fs_callouts[i].callout = NULL; |
4786 | fs_callouts[i].context = NULL; |
4787 | } |
4788 | } |
4789 | lck_mtx_unlock(lck: &buf_gc_callout); |
4790 | return 0; |
4791 | } |
4792 | |
4793 | static void |
4794 | fs_buffer_cache_gc_dispatch_callouts(int all) |
4795 | { |
4796 | lck_mtx_lock(lck: &buf_gc_callout); |
4797 | for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) { |
4798 | if (fs_callouts[i].callout != NULL) { |
4799 | fs_callouts[i].callout(all, fs_callouts[i].context); |
4800 | } |
4801 | } |
4802 | lck_mtx_unlock(lck: &buf_gc_callout); |
4803 | } |
4804 | |
4805 | static boolean_t |
4806 | buffer_cache_gc(int all) |
4807 | { |
4808 | buf_t bp; |
4809 | boolean_t did_large_zfree = FALSE; |
4810 | boolean_t need_wakeup = FALSE; |
4811 | int now = buf_timestamp(); |
4812 | uint32_t found = 0; |
4813 | struct bqueues privq; |
4814 | int thresh_hold = BUF_STALE_THRESHHOLD; |
4815 | |
4816 | if (all) { |
4817 | thresh_hold = 0; |
4818 | } |
4819 | /* |
4820 | * We only care about metadata (incore storage comes from zalloc()). |
4821 | * Unless "all" is set (used to evict meta data buffers in preparation |
4822 | * for deep sleep), we only evict up to BUF_MAX_GC_BATCH_SIZE buffers |
4823 | * that have not been accessed in the last BUF_STALE_THRESHOLD seconds. |
4824 | * BUF_MAX_GC_BATCH_SIZE controls both the hold time of the global lock |
4825 | * "buf_mtx" and the length of time we spend compute bound in the GC |
4826 | * thread which calls this function |
4827 | */ |
4828 | lck_mtx_lock(lck: &buf_mtx); |
4829 | |
4830 | do { |
4831 | found = 0; |
4832 | TAILQ_INIT(&privq); |
4833 | need_wakeup = FALSE; |
4834 | |
4835 | while (((bp = TAILQ_FIRST(&bufqueues[BQ_META]))) && |
4836 | (now > bp->b_timestamp) && |
4837 | (now - bp->b_timestamp > thresh_hold) && |
4838 | (found < BUF_MAX_GC_BATCH_SIZE)) { |
4839 | /* Remove from free list */ |
4840 | bremfree_locked(bp); |
4841 | found++; |
4842 | |
4843 | #ifdef JOE_DEBUG |
4844 | bp->b_owner = current_thread(); |
4845 | bp->b_tag = 12; |
4846 | #endif |
4847 | |
4848 | /* If dirty, move to laundry queue and remember to do wakeup */ |
4849 | if (ISSET(bp->b_flags, B_DELWRI)) { |
4850 | SET(bp->b_lflags, BL_WANTDEALLOC); |
4851 | |
4852 | bmovelaundry(bp); |
4853 | need_wakeup = TRUE; |
4854 | |
4855 | continue; |
4856 | } |
4857 | |
4858 | /* |
4859 | * Mark busy and put on private list. We could technically get |
4860 | * away without setting BL_BUSY here. |
4861 | */ |
4862 | SET(bp->b_lflags, BL_BUSY); |
4863 | buf_busycount++; |
4864 | |
4865 | /* |
4866 | * Remove from hash and dissociate from vp. |
4867 | */ |
4868 | bremhash(bp); |
4869 | if (bp->b_vp) { |
4870 | brelvp_locked(bp); |
4871 | } |
4872 | |
4873 | TAILQ_INSERT_TAIL(&privq, bp, b_freelist); |
4874 | } |
4875 | |
4876 | if (found == 0) { |
4877 | break; |
4878 | } |
4879 | |
4880 | /* Drop lock for batch processing */ |
4881 | lck_mtx_unlock(lck: &buf_mtx); |
4882 | |
4883 | /* Wakeup and yield for laundry if need be */ |
4884 | if (need_wakeup) { |
4885 | wakeup(chan: &bufqueues[BQ_LAUNDRY]); |
4886 | (void)thread_block(THREAD_CONTINUE_NULL); |
4887 | } |
4888 | |
4889 | /* Clean up every buffer on private list */ |
4890 | TAILQ_FOREACH(bp, &privq, b_freelist) { |
4891 | /* Take note if we've definitely freed at least a page to a zone */ |
4892 | if ((ISSET(bp->b_flags, B_ZALLOC)) && (buf_size(bp) >= PAGE_SIZE)) { |
4893 | did_large_zfree = TRUE; |
4894 | } |
4895 | |
4896 | trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno); |
4897 | |
4898 | /* Free Storage */ |
4899 | buf_free_meta_store(bp); |
4900 | |
4901 | /* Release credentials */ |
4902 | buf_release_credentials(bp); |
4903 | |
4904 | /* Prepare for moving to empty queue */ |
4905 | CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED |
4906 | | B_AGE | B_ASYNC | B_NOCACHE | B_FUA)); |
4907 | bp->b_whichq = BQ_EMPTY; |
4908 | BLISTNONE(bp); |
4909 | } |
4910 | lck_mtx_lock(lck: &buf_mtx); |
4911 | |
4912 | /* Back under lock, move them all to invalid hash and clear busy */ |
4913 | TAILQ_FOREACH(bp, &privq, b_freelist) { |
4914 | binshash(bp, dp: &invalhash); |
4915 | CLR(bp->b_lflags, BL_BUSY); |
4916 | buf_busycount--; |
4917 | |
4918 | #ifdef JOE_DEBUG |
4919 | if (bp->b_owner != current_thread()) { |
4920 | panic("Buffer stolen from buffer_cache_gc()" ); |
4921 | } |
4922 | bp->b_owner = current_thread(); |
4923 | bp->b_tag = 13; |
4924 | #endif |
4925 | } |
4926 | |
4927 | /* And do a big bulk move to the empty queue */ |
4928 | TAILQ_CONCAT(&bufqueues[BQ_EMPTY], &privq, b_freelist); |
4929 | } while (all && (found == BUF_MAX_GC_BATCH_SIZE)); |
4930 | |
4931 | lck_mtx_unlock(lck: &buf_mtx); |
4932 | |
4933 | fs_buffer_cache_gc_dispatch_callouts(all); |
4934 | |
4935 | return did_large_zfree; |
4936 | } |
4937 | |
4938 | |
4939 | /* |
4940 | * disabled for now |
4941 | */ |
4942 | |
4943 | #if FLUSH_QUEUES |
4944 | |
4945 | #define NFLUSH 32 |
4946 | |
4947 | static int |
4948 | bp_cmp(void *a, void *b) |
4949 | { |
4950 | buf_t *bp_a = *(buf_t **)a, |
4951 | *bp_b = *(buf_t **)b; |
4952 | daddr64_t res; |
4953 | |
4954 | // don't have to worry about negative block |
4955 | // numbers so this is ok to do. |
4956 | // |
4957 | res = (bp_a->b_blkno - bp_b->b_blkno); |
4958 | |
4959 | return (int)res; |
4960 | } |
4961 | |
4962 | |
4963 | int |
4964 | bflushq(int whichq, mount_t mp) |
4965 | { |
4966 | buf_t bp, next; |
4967 | int i, buf_count; |
4968 | int total_writes = 0; |
4969 | static buf_t flush_table[NFLUSH]; |
4970 | |
4971 | if (whichq < 0 || whichq >= BQUEUES) { |
4972 | return 0; |
4973 | } |
4974 | |
4975 | restart: |
4976 | lck_mtx_lock(&buf_mtx); |
4977 | |
4978 | bp = TAILQ_FIRST(&bufqueues[whichq]); |
4979 | |
4980 | for (buf_count = 0; bp; bp = next) { |
4981 | next = bp->b_freelist.tqe_next; |
4982 | |
4983 | if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) { |
4984 | continue; |
4985 | } |
4986 | |
4987 | if (ISSET(bp->b_flags, B_DELWRI) && !ISSET(bp->b_lflags, BL_BUSY)) { |
4988 | bremfree_locked(bp); |
4989 | #ifdef JOE_DEBUG |
4990 | bp->b_owner = current_thread(); |
4991 | bp->b_tag = 7; |
4992 | #endif |
4993 | SET(bp->b_lflags, BL_BUSY); |
4994 | buf_busycount++; |
4995 | |
4996 | flush_table[buf_count] = bp; |
4997 | buf_count++; |
4998 | total_writes++; |
4999 | |
5000 | if (buf_count >= NFLUSH) { |
5001 | lck_mtx_unlock(&buf_mtx); |
5002 | |
5003 | qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp); |
5004 | |
5005 | for (i = 0; i < buf_count; i++) { |
5006 | buf_bawrite(flush_table[i]); |
5007 | } |
5008 | goto restart; |
5009 | } |
5010 | } |
5011 | } |
5012 | lck_mtx_unlock(&buf_mtx); |
5013 | |
5014 | if (buf_count > 0) { |
5015 | qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp); |
5016 | |
5017 | for (i = 0; i < buf_count; i++) { |
5018 | buf_bawrite(flush_table[i]); |
5019 | } |
5020 | } |
5021 | |
5022 | return total_writes; |
5023 | } |
5024 | #endif |
5025 | |