/*
 *
 * Copyright (c) 2000-2024 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

/*
 * External virtual filesystem routines
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/mount_internal.h>
#include <sys/time.h>
#include <sys/lock.h>
#include <sys/vnode.h>
#include <sys/vnode_internal.h>
#include <sys/stat.h>
#include <sys/namei.h>
#include <sys/ucred.h>
#include <sys/buf_internal.h>
#include <sys/errno.h>
#include <kern/kalloc.h>
#include <sys/uio_internal.h>
#include <sys/uio.h>
#include <sys/domain.h>
#include <sys/mbuf.h>
#include <sys/syslog.h>
#include <sys/ubc_internal.h>
#include <sys/vm.h>
#include <sys/sysctl.h>
#include <sys/filedesc.h>
#include <sys/event.h>
#include <sys/kdebug.h>
#include <sys/kauth.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/kern_memorystatus.h>
#include <sys/lockf.h>
#include <sys/reboot.h>
#include <miscfs/fifofs/fifo.h>

#include <nfs/nfs.h>

#include <string.h>
#include <machine/machine_routines.h>

#include <kern/assert.h>
#include <mach/kern_return.h>
#include <kern/thread.h>
#include <kern/sched_prim.h>
#include <kern/smr.h>

#include <miscfs/specfs/specdev.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
#include <mach/memory_object_control.h>

#include <kern/kalloc.h>        /* kalloc()/kfree() */
#include <kern/clock.h>         /* delay_for_interval() */
#include <libkern/coreanalytics/coreanalytics.h>
#include <libkern/OSAtomic.h>   /* OSAddAtomic() */
#include <os/atomic_private.h>
#if defined(XNU_TARGET_OS_OSX)
#include <console/video_console.h>
#endif

#ifdef CONFIG_IOCOUNT_TRACE
#include <libkern/OSDebug.h>
#endif

#include <vm/vm_protos.h>       /* vnode_pager_vrele() */

#if CONFIG_MACF
#include <security/mac_framework.h>
#endif

#include <vfs/vfs_disk_conditioner.h>
#include <libkern/section_keywords.h>

static LCK_GRP_DECLARE(vnode_lck_grp, "vnode");
static LCK_ATTR_DECLARE(vnode_lck_attr, 0, 0);

#if CONFIG_TRIGGERS
static LCK_GRP_DECLARE(trigger_vnode_lck_grp, "trigger_vnode");
static LCK_ATTR_DECLARE(trigger_vnode_lck_attr, 0, 0);
#endif

extern lck_mtx_t mnt_list_mtx_lock;

static KALLOC_TYPE_DEFINE(specinfo_zone, struct specinfo, KT_DEFAULT);

ZONE_DEFINE(vnode_zone, "vnodes",
    sizeof(struct vnode), ZC_NOGC | ZC_ZFREE_CLEARMEM);

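/*
 * Translation tables between the S_IFMT file-type bits of a mode_t and
 * the vnode types (VNON, VREG, VDIR, ...), and back again.
 */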
enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

/* XXX These should be in a BSD accessible Mach header, but aren't. */
extern void memory_object_mark_used(
	memory_object_control_t control);

extern void memory_object_mark_unused(
	memory_object_control_t control,
	boolean_t rage);

extern void memory_object_mark_io_tracking(
	memory_object_control_t control);

extern int paniclog_append_noflush(const char *format, ...);

/* XXX next prototype should come from <libsa/stdlib.h> but conflicts with libkern */
__private_extern__ void qsort(
	void * array,
	size_t nmembers,
	size_t member_size,
	int (*)(const void *, const void *));

__private_extern__ void vntblinit(void);
__private_extern__ int unlink1(vfs_context_t, vnode_t, user_addr_t,
    enum uio_seg, int);

static void vnode_list_add(vnode_t);
static void vnode_async_list_add(vnode_t);
static void vnode_list_remove(vnode_t);
static void vnode_list_remove_locked(vnode_t);

static void vnode_abort_advlocks(vnode_t);
static errno_t vnode_drain(vnode_t);
static void vgone(vnode_t, int flags);
static void vclean(vnode_t vp, int flag);
static void vnode_reclaim_internal(vnode_t, int, int, int);

static void vnode_dropiocount(vnode_t);

static vnode_t checkalias(vnode_t vp, dev_t nvp_rdev);
static int vnode_reload(vnode_t);

static int unmount_callback(mount_t, __unused void *);

static void insmntque(vnode_t vp, mount_t mp);
static int mount_getvfscnt(void);
static int mount_fillfsids(fsid_t *, int);
static void vnode_iterate_setup(mount_t);
int vnode_umount_preflight(mount_t, vnode_t, int);
static int vnode_iterate_prepare(mount_t);
static int vnode_iterate_reloadq(mount_t);
static void vnode_iterate_clear(mount_t);
static mount_t vfs_getvfs_locked(fsid_t *);
static int vn_create_reg(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp,
    struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx);
static int vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uint32_t *defaulted_fieldsp, vfs_context_t ctx);

errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);

#ifdef CONFIG_IOCOUNT_TRACE
static void record_vp(vnode_t vp, int count);
static TUNABLE(int, bootarg_vnode_iocount_trace, "vnode_iocount_trace", 0);
static TUNABLE(int, bootarg_uthread_iocount_trace, "uthread_iocount_trace", 0);
#endif /* CONFIG_IOCOUNT_TRACE */

#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
static TUNABLE(bool, bootarg_no_vnode_jetsam, "-no_vnode_jetsam", false);
#endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */

static TUNABLE(bool, bootarg_no_vnode_drain, "-no_vnode_drain", false);

__options_decl(freeable_vnode_level_t, uint32_t, {
	DEALLOC_VNODE_NONE = 0,
	DEALLOC_VNODE_ONLY_OVERFLOW = 1,
	DEALLOC_VNODE_ALL = 2
});

#if XNU_TARGET_OS_OSX
static TUNABLE(freeable_vnode_level_t, bootarg_vn_dealloc_level, "vn_dealloc_level", DEALLOC_VNODE_NONE);
#else
static TUNABLE(freeable_vnode_level_t, bootarg_vn_dealloc_level, "vn_dealloc_level", DEALLOC_VNODE_ONLY_OVERFLOW);
#endif /* XNU_TARGET_OS_OSX */

static freeable_vnode_level_t vn_dealloc_level = DEALLOC_VNODE_NONE;

boolean_t root_is_CF_drive = FALSE;

#if CONFIG_TRIGGERS
static int vnode_resolver_create(mount_t, vnode_t, struct vnode_trigger_param *, boolean_t external);
static void vnode_resolver_detach(vnode_t);
#endif

TAILQ_HEAD(freelst, vnode) vnode_free_list;     /* vnode free list */
TAILQ_HEAD(deadlst, vnode) vnode_dead_list;     /* vnode dead list */
TAILQ_HEAD(async_work_lst, vnode) vnode_async_work_list;


TAILQ_HEAD(ragelst, vnode) vnode_rage_list;     /* vnode rapid age list */
struct timeval rage_tv;
int rage_limit = 0;
int ragevnodes = 0;

long reusablevnodes_max = LONG_MAX;
long reusablevnodes = 0;
int deadvnodes_low = 0;
int deadvnodes_high = 0;
int numvnodes_min = 0;
int numvnodes_max = 0;

uint64_t newvnode = 0;
unsigned long newvnode_nodead = 0;

static int vfs_unmountall_started = 0;
static int vfs_unmountall_finished = 0;
static uint64_t vfs_shutdown_last_completion_time;

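/* Floor for rage_limit and the rapid-age time window (in seconds). */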
#define RAGE_LIMIT_MIN  100
#define RAGE_TIME_LIMIT 5

VFS_SMR_DECLARE;
extern uint32_t nc_smr_enabled;

/*
 * ROSV definitions
 * NOTE: These are shadowed from PlatformSupport definitions, but XNU
 * builds standalone.
 */
#define PLATFORM_DATA_VOLUME_MOUNT_POINT "/System/Volumes/Data"

/*
 * These could be in PlatformSupport but aren't yet
 */
#define PLATFORM_PREBOOT_VOLUME_MOUNT_POINT "/System/Volumes/Preboot"
#define PLATFORM_RECOVERY_VOLUME_MOUNT_POINT "/System/Volumes/Recovery"

#if CONFIG_MOUNT_VM
#define PLATFORM_VM_VOLUME_MOUNT_POINT "/System/Volumes/VM"
#endif

struct mntlist mountlist;                       /* mounted filesystem list */
static int nummounts = 0;

static int print_busy_vnodes = 0;               /* print out busy vnodes */

#if DIAGNOSTIC
#define VLISTCHECK(fun, vp, list)       \
	if ((vp)->v_freelist.tqe_prev == (struct vnode **)0xdeadb) \
	        panic("%s: %s vnode not on %slist", (fun), (list), (list));
#else
#define VLISTCHECK(fun, vp, list)
#endif /* DIAGNOSTIC */

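/*
 * A vnode sits on at most one of the free/dead/rage/async-work lists,
 * linked through v_freelist; a tqe_prev of 0xdeadb is the "on no list"
 * sentinel checked by VONLIST().
 */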
#define VLISTNONE(vp)   \
	do {    \
	        (vp)->v_freelist.tqe_next = (struct vnode *)0;  \
	        (vp)->v_freelist.tqe_prev = (struct vnode **)0xdeadb;   \
	} while(0)

#define VONLIST(vp)     \
	((vp)->v_freelist.tqe_prev != (struct vnode **)0xdeadb)

/* remove a vnode from free vnode list */
#define VREMFREE(fun, vp)       \
	do {    \
	        VLISTCHECK((fun), (vp), "free");        \
	        TAILQ_REMOVE(&vnode_free_list, (vp), v_freelist);       \
	        VLISTNONE((vp));        \
	        freevnodes--;   \
	        reusablevnodes--;       \
	} while(0)


/* remove a vnode from dead vnode list */
#define VREMDEAD(fun, vp)       \
	do {    \
	        VLISTCHECK((fun), (vp), "dead");        \
	        TAILQ_REMOVE(&vnode_dead_list, (vp), v_freelist);       \
	        VLISTNONE((vp));        \
	        vp->v_listflag &= ~VLIST_DEAD;  \
	        deadvnodes--;   \
	        if (vp->v_listflag & VLIST_NO_REUSE) {  \
	                deadvnodes_noreuse--;   \
	        }       \
	} while(0)


/* remove a vnode from async work vnode list */
#define VREMASYNC_WORK(fun, vp) \
	do {    \
	        VLISTCHECK((fun), (vp), "async_work");  \
	        TAILQ_REMOVE(&vnode_async_work_list, (vp), v_freelist); \
	        VLISTNONE((vp));        \
	        vp->v_listflag &= ~VLIST_ASYNC_WORK;    \
	        async_work_vnodes--;    \
	        if (!(vp->v_listflag & VLIST_NO_REUSE)) {       \
	                reusablevnodes--;       \
	        }       \
	} while(0)


/* remove a vnode from rage vnode list */
#define VREMRAGE(fun, vp)       \
	do {    \
	        if ( !(vp->v_listflag & VLIST_RAGE))    \
	                panic("VREMRAGE: vp not on rage list"); \
	        VLISTCHECK((fun), (vp), "rage");        \
	        TAILQ_REMOVE(&vnode_rage_list, (vp), v_freelist);       \
	        VLISTNONE((vp));        \
	        vp->v_listflag &= ~VLIST_RAGE;  \
	        ragevnodes--;   \
	        reusablevnodes--;       \
	} while(0)

static void async_work_continue(void);
static void vn_laundry_continue(void);
static void wakeup_laundry_thread(void);
static void vnode_smr_free(void *, size_t);

CA_EVENT(freeable_vnodes,
    CA_INT, numvnodes_min,
    CA_INT, numvnodes_max,
    CA_INT, desiredvnodes,
    CA_INT, numvnodes,
    CA_INT, freevnodes,
    CA_INT, deadvnodes,
    CA_INT, freeablevnodes,
    CA_INT, busyvnodes,
    CA_BOOL, threshold_crossed);
static CA_EVENT_TYPE(freeable_vnodes) freeable_vnodes_telemetry;

static bool freeablevnodes_threshold_crossed = false;

/*
 * Initialize the vnode management data structures.
 */
__private_extern__ void
vntblinit(void)
{
	thread_t        thread = THREAD_NULL;
	int             desiredvnodes_one_percent = desiredvnodes / 100;

	TAILQ_INIT(&vnode_free_list);
	TAILQ_INIT(&vnode_rage_list);
	TAILQ_INIT(&vnode_dead_list);
	TAILQ_INIT(&vnode_async_work_list);
	TAILQ_INIT(&mountlist);

	microuptime(&rage_tv);
	rage_limit = desiredvnodes_one_percent;
	if (rage_limit < RAGE_LIMIT_MIN) {
		rage_limit = RAGE_LIMIT_MIN;
	}

	deadvnodes_low = desiredvnodes_one_percent;
	if (deadvnodes_low > 300) {
		deadvnodes_low = 300;
	}
	deadvnodes_high = deadvnodes_low * 2;

	numvnodes_min = numvnodes_max = desiredvnodes;
	if (bootarg_vn_dealloc_level == DEALLOC_VNODE_ONLY_OVERFLOW) {
		numvnodes_max = desiredvnodes * 2;
		vn_dealloc_level = bootarg_vn_dealloc_level;
	} else if (bootarg_vn_dealloc_level == DEALLOC_VNODE_ALL) {
		numvnodes_min = desiredvnodes_one_percent * 40;
		numvnodes_max = desiredvnodes * 2;
		reusablevnodes_max = (desiredvnodes_one_percent * 20) - deadvnodes_low;
		vn_dealloc_level = bootarg_vn_dealloc_level;
	}

	bzero(&freeable_vnodes_telemetry, sizeof(CA_EVENT_TYPE(freeable_vnodes)));
	freeable_vnodes_telemetry.numvnodes_min = numvnodes_min;
	freeable_vnodes_telemetry.numvnodes_max = numvnodes_max;
	freeable_vnodes_telemetry.desiredvnodes = desiredvnodes;

	if (nc_smr_enabled) {
		zone_enable_smr(vnode_zone, VFS_SMR(), &vnode_smr_free);
	}

	/*
	 * create worker threads
	 */
	kernel_thread_start((thread_continue_t)async_work_continue, NULL, &thread);
	thread_deallocate(thread);
	kernel_thread_start((thread_continue_t)vn_laundry_continue, NULL, &thread);
	thread_deallocate(thread);
}

/* the timeout is in 10 msecs */
int
vnode_waitforwrites(vnode_t vp, int output_target, int slpflag, int slptimeout, const char *msg)
{
	int error = 0;
	struct timespec ts;

	if (output_target < 0) {
		return EINVAL;
	}

	KERNEL_DEBUG(0x3010280 | DBG_FUNC_START, (int)vp, output_target, vp->v_numoutput, 0, 0);

	if (vp->v_numoutput > output_target) {
		slpflag |= PDROP;

		vnode_lock_spin(vp);

		while ((vp->v_numoutput > output_target) && error == 0) {
			if (output_target) {
				vp->v_flag |= VTHROTTLED;
			} else {
				vp->v_flag |= VBWAIT;
			}

			ts.tv_sec = (slptimeout / 100);
			ts.tv_nsec = (slptimeout % 100) * 10 * NSEC_PER_USEC * 1000;
			error = msleep((caddr_t)&vp->v_numoutput, &vp->v_lock, (slpflag | (PRIBIO + 1)), msg, &ts);

			vnode_lock_spin(vp);
		}
		vnode_unlock(vp);
	}
	KERNEL_DEBUG(0x3010280 | DBG_FUNC_END, (int)vp, output_target, vp->v_numoutput, error, 0);

	return error;
}


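/*
 * Account for the start of a write against this vnode; every call must be
 * balanced by a vnode_writedone().
 */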
void
vnode_startwrite(vnode_t vp)
{
	OSAddAtomic(1, &vp->v_numoutput);
}


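/*
 * Account for a completed write and, once v_numoutput allows, wake up any
 * threads throttled on, or waiting to flush, this vnode's writes.
 */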
void
vnode_writedone(vnode_t vp)
{
	if (vp) {
		int need_wakeup = 0;

		OSAddAtomic(-1, &vp->v_numoutput);

		vnode_lock_spin(vp);

		if (vp->v_numoutput < 0) {
			panic("vnode_writedone: numoutput < 0");
		}

		if ((vp->v_flag & VTHROTTLED)) {
			vp->v_flag &= ~VTHROTTLED;
			need_wakeup = 1;
		}
		if ((vp->v_flag & VBWAIT) && (vp->v_numoutput == 0)) {
			vp->v_flag &= ~VBWAIT;
			need_wakeup = 1;
		}
		vnode_unlock(vp);

		if (need_wakeup) {
			wakeup((caddr_t)&vp->v_numoutput);
		}
	}
}



int
vnode_hasdirtyblks(vnode_t vp)
{
	struct cl_writebehind *wbp;

	/*
	 * Not taking the buf_mtx as there is little
	 * point in doing it. Even if the lock is taken the
	 * state can change right after that. If there
	 * needs to be synchronization, it must be driven
	 * by the caller.
	 */
	if (vp->v_dirtyblkhd.lh_first) {
		return 1;
	}

	if (!UBCINFOEXISTS(vp)) {
		return 0;
	}

	wbp = vp->v_ubcinfo->cl_wbehind;

	if (wbp && (wbp->cl_number || wbp->cl_scmap)) {
		return 1;
	}

	return 0;
}

int
vnode_hascleanblks(vnode_t vp)
{
	/*
	 * Not taking the buf_mtx as there is little
	 * point in doing it. Even if the lock is taken the
	 * state can change right after that. If there
	 * needs to be synchronization, it must be driven
	 * by the caller.
	 */
	if (vp->v_cleanblkhd.lh_first) {
		return 1;
	}
	return 0;
}

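/* Mark the mount as being iterated; undone by vnode_iterate_clear(). */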
void
vnode_iterate_setup(mount_t mp)
{
	mp->mnt_lflag |= MNT_LITER;
}

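/*
 * Scan the mount's vnode list for vnodes that would prevent an unmount
 * (subject to the SKIPSYSTEM, SKIPSWAP and WRITECLOSE flags); returns
 * nonzero if a busy vnode is found.
 */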
int
vnode_umount_preflight(mount_t mp, vnode_t skipvp, int flags)
{
	vnode_t vp;
	int ret = 0;

	TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
		if (vp->v_type == VDIR) {
			continue;
		}
		if (vp == skipvp) {
			continue;
		}
		if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) || (vp->v_flag & VNOFLUSH))) {
			continue;
		}
		if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) {
			continue;
		}
		if ((flags & WRITECLOSE) && (vp->v_writecount == 0 || vp->v_type != VREG)) {
			continue;
		}

		/* Look for busy vnode */
		if ((vp->v_usecount != 0) && ((vp->v_usecount - vp->v_kusecount) != 0)) {
			ret = 1;
			if (print_busy_vnodes && ((flags & FORCECLOSE) == 0)) {
				vprint("vnode_umount_preflight - busy vnode", vp);
			} else {
				return ret;
			}
		} else if (vp->v_iocount > 0) {
			/* Busy if iocount is > 0 for more than 3 seconds */
			tsleep(&vp->v_iocount, PVFS, "vnode_drain_network", 3 * hz);
			if (vp->v_iocount > 0) {
				ret = 1;
				if (print_busy_vnodes && ((flags & FORCECLOSE) == 0)) {
					vprint("vnode_umount_preflight - busy vnode", vp);
				} else {
					return ret;
				}
			}
			continue;
		}
	}

	return ret;
}

/*
 * This routine prepares for iteration by moving all the vnodes to the
 * worker queue. Called with the mount lock held.
 */
int
vnode_iterate_prepare(mount_t mp)
{
	vnode_t vp;

	if (TAILQ_EMPTY(&mp->mnt_vnodelist)) {
		/* nothing to do */
		return 0;
	}

	vp = TAILQ_FIRST(&mp->mnt_vnodelist);
	vp->v_mntvnodes.tqe_prev = &(mp->mnt_workerqueue.tqh_first);
	mp->mnt_workerqueue.tqh_first = mp->mnt_vnodelist.tqh_first;
	mp->mnt_workerqueue.tqh_last = mp->mnt_vnodelist.tqh_last;

	TAILQ_INIT(&mp->mnt_vnodelist);
	if (mp->mnt_newvnodes.tqh_first != NULL) {
		panic("vnode_iterate_prepare: newvnode when entering vnode");
	}
	TAILQ_INIT(&mp->mnt_newvnodes);

	return 1;
}


/* called with mount lock held */
int
vnode_iterate_reloadq(mount_t mp)
{
	int moved = 0;

	/* add the remaining entries in workerq to the end of mount vnode list */
	if (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
		struct vnode * mvp;
		mvp = TAILQ_LAST(&mp->mnt_vnodelist, vnodelst);

		/* Join the workerqueue entries to the mount vnode list */
		if (mvp) {
			mvp->v_mntvnodes.tqe_next = mp->mnt_workerqueue.tqh_first;
		} else {
			mp->mnt_vnodelist.tqh_first = mp->mnt_workerqueue.tqh_first;
		}
		mp->mnt_workerqueue.tqh_first->v_mntvnodes.tqe_prev = mp->mnt_vnodelist.tqh_last;
		mp->mnt_vnodelist.tqh_last = mp->mnt_workerqueue.tqh_last;
		TAILQ_INIT(&mp->mnt_workerqueue);
	}

	/* add the newvnodes to the head of mount vnode list */
	if (!TAILQ_EMPTY(&mp->mnt_newvnodes)) {
		struct vnode * nlvp;
		nlvp = TAILQ_LAST(&mp->mnt_newvnodes, vnodelst);

		mp->mnt_newvnodes.tqh_first->v_mntvnodes.tqe_prev = &mp->mnt_vnodelist.tqh_first;
		nlvp->v_mntvnodes.tqe_next = mp->mnt_vnodelist.tqh_first;
		if (mp->mnt_vnodelist.tqh_first) {
			mp->mnt_vnodelist.tqh_first->v_mntvnodes.tqe_prev = &nlvp->v_mntvnodes.tqe_next;
		} else {
			mp->mnt_vnodelist.tqh_last = mp->mnt_newvnodes.tqh_last;
		}
		mp->mnt_vnodelist.tqh_first = mp->mnt_newvnodes.tqh_first;
		TAILQ_INIT(&mp->mnt_newvnodes);
		moved = 1;
	}

	return moved;
}


void
vnode_iterate_clear(mount_t mp)
{
	mp->mnt_lflag &= ~MNT_LITER;
}

#if defined(__x86_64__)

#include <i386/panic_hooks.h>

struct vnode_iterate_panic_hook {
	panic_hook_t hook;
	mount_t mp;
	struct vnode *vp;
};

static void
vnode_iterate_panic_hook(panic_hook_t *hook_)
{
	struct vnode_iterate_panic_hook *hook = (struct vnode_iterate_panic_hook *)hook_;
	panic_phys_range_t range;
	uint64_t phys;

	if (panic_phys_range_before(hook->mp, &phys, &range)) {
		paniclog_append_noflush("mp = %p, phys = %p, prev (%p: %p-%p)\n",
		    hook->mp, phys, range.type, range.phys_start,
		    range.phys_start + range.len);
	} else {
		paniclog_append_noflush("mp = %p, phys = %p, prev (!)\n", hook->mp, phys);
	}

	if (panic_phys_range_before(hook->vp, &phys, &range)) {
		paniclog_append_noflush("vp = %p, phys = %p, prev (%p: %p-%p)\n",
		    hook->vp, phys, range.type, range.phys_start,
		    range.phys_start + range.len);
	} else {
		paniclog_append_noflush("vp = %p, phys = %p, prev (!)\n", hook->vp, phys);
	}
	panic_dump_mem((void *)(((vm_offset_t)hook->mp - 4096) & ~4095), 12288);
}
#endif /* defined(__x86_64__) */

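/*
 * Iterate the mount's vnodes, invoking the callout on each one with an
 * iocount held. Returning VNODE_RETURNED (or VNODE_RETURNED_DONE) tells
 * vnode_iterate() to drop that iocount; VNODE_CLAIMED means the callout
 * has taken over the reference. A minimal callout sketch (hypothetical
 * helper, for illustration only):
 *
 *	static int
 *	count_regular_files(struct vnode *vp, void *arg)
 *	{
 *		if (vnode_vtype(vp) == VREG) {
 *			(*(int *)arg)++;
 *		}
 *		return VNODE_RETURNED;
 *	}
 */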
int
vnode_iterate(mount_t mp, int flags, int (*callout)(struct vnode *, void *),
    void *arg)
{
	struct vnode *vp;
	int vid, retval;
	int ret = 0;

	/*
	 * The mount iterate mutex is held for the duration of the iteration.
	 * This can be done by a state flag on the mount structure but we can
	 * run into priority inversion issues sometimes.
	 * Using a mutex allows us to benefit from the priority donation
	 * mechanisms in the kernel for locks. This mutex should never be
	 * acquired in spin mode and it should be acquired before attempting to
	 * acquire the mount lock.
	 */
	mount_iterate_lock(mp);

	mount_lock(mp);

	vnode_iterate_setup(mp);

	/* If it returns 0 then there is nothing to do */
	retval = vnode_iterate_prepare(mp);

	if (retval == 0) {
		vnode_iterate_clear(mp);
		mount_unlock(mp);
		mount_iterate_unlock(mp);
		return ret;
	}

#if defined(__x86_64__)
	struct vnode_iterate_panic_hook hook;
	hook.mp = mp;
	hook.vp = NULL;
	panic_hook(&hook.hook, vnode_iterate_panic_hook);
#endif
	/* iterate over all the vnodes */
	while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
		vp = TAILQ_FIRST(&mp->mnt_workerqueue);
#if defined(__x86_64__)
		hook.vp = vp;
#endif
		TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes);
		TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
		vid = vp->v_id;
		if ((vp->v_data == NULL) || (vp->v_type == VNON) || (vp->v_mount != mp)) {
			continue;
		}
		vnode_hold(vp);
		mount_unlock(mp);

		if (vget_internal(vp, vid, (flags | VNODE_NODEAD | VNODE_WITHID | VNODE_NOSUSPEND))) {
			mount_lock(mp);
			vnode_drop(vp);
			continue;
		}
		vnode_drop(vp);
		if (flags & VNODE_RELOAD) {
			/*
			 * we're reloading the filesystem
			 * cast out any inactive vnodes...
			 */
			if (vnode_reload(vp)) {
				/* vnode will be recycled on the refcount drop */
				vnode_put(vp);
				mount_lock(mp);
				continue;
			}
		}

		retval = callout(vp, arg);

		switch (retval) {
		case VNODE_RETURNED:
		case VNODE_RETURNED_DONE:
			vnode_put(vp);
			if (retval == VNODE_RETURNED_DONE) {
				mount_lock(mp);
				ret = 0;
				goto out;
			}
			break;

		case VNODE_CLAIMED_DONE:
			mount_lock(mp);
			ret = 0;
			goto out;
		case VNODE_CLAIMED:
		default:
			break;
		}
		mount_lock(mp);
	}

out:
#if defined(__x86_64__)
	panic_unhook(&hook.hook);
#endif
	(void)vnode_iterate_reloadq(mp);
	vnode_iterate_clear(mp);
	mount_unlock(mp);
	mount_iterate_unlock(mp);
	return ret;
}

void
mount_lock_renames(mount_t mp)
{
	lck_mtx_lock(&mp->mnt_renamelock);
}

void
mount_unlock_renames(mount_t mp)
{
	lck_mtx_unlock(&mp->mnt_renamelock);
}

void
mount_iterate_lock(mount_t mp)
{
	lck_mtx_lock(&mp->mnt_iter_lock);
}

void
mount_iterate_unlock(mount_t mp)
{
	lck_mtx_unlock(&mp->mnt_iter_lock);
}

void
mount_lock(mount_t mp)
{
	lck_mtx_lock(&mp->mnt_mlock);
}

void
mount_lock_spin(mount_t mp)
{
	lck_mtx_lock_spin(&mp->mnt_mlock);
}

void
mount_unlock(mount_t mp)
{
	lck_mtx_unlock(&mp->mnt_mlock);
}


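/*
 * mnt_count holds short-term references that delay mount teardown;
 * mount_refdrain() below sleeps until it drops to zero.
 */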
void
mount_ref(mount_t mp, int locked)
{
	if (!locked) {
		mount_lock_spin(mp);
	}

	mp->mnt_count++;

	if (!locked) {
		mount_unlock(mp);
	}
}


void
mount_drop(mount_t mp, int locked)
{
	if (!locked) {
		mount_lock_spin(mp);
	}

	mp->mnt_count--;

	if (mp->mnt_count == 0 && (mp->mnt_lflag & MNT_LDRAIN)) {
		wakeup(&mp->mnt_lflag);
	}

	if (!locked) {
		mount_unlock(mp);
	}
}


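/*
 * mnt_iterref counts in-flight vnode iterations over this mount; it is set
 * to -1 by mount_iterdrain() to refuse new iterations until
 * mount_iterreset() is called.
 */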
int
mount_iterref(mount_t mp, int locked)
{
	int retval = 0;

	if (!locked) {
		mount_list_lock();
	}
	if (mp->mnt_iterref < 0) {
		retval = 1;
	} else {
		mp->mnt_iterref++;
	}
	if (!locked) {
		mount_list_unlock();
	}
	return retval;
}

int
mount_isdrained(mount_t mp, int locked)
{
	int retval;

	if (!locked) {
		mount_list_lock();
	}
	if (mp->mnt_iterref < 0) {
		retval = 1;
	} else {
		retval = 0;
	}
	if (!locked) {
		mount_list_unlock();
	}
	return retval;
}

void
mount_iterdrop(mount_t mp)
{
	mount_list_lock();
	mp->mnt_iterref--;
	wakeup(&mp->mnt_iterref);
	mount_list_unlock();
}

void
mount_iterdrain(mount_t mp)
{
	mount_list_lock();
	while (mp->mnt_iterref) {
		msleep((caddr_t)&mp->mnt_iterref, &mnt_list_mtx_lock, PVFS, "mount_iterdrain", NULL);
	}
	/* mount iterations drained */
	mp->mnt_iterref = -1;
	mount_list_unlock();
}
void
mount_iterreset(mount_t mp)
{
	mount_list_lock();
	if (mp->mnt_iterref == -1) {
		mp->mnt_iterref = 0;
	}
	mount_list_unlock();
}

/* always called with mount lock held */
int
mount_refdrain(mount_t mp)
{
	if (mp->mnt_lflag & MNT_LDRAIN) {
		panic("already in drain");
	}
	mp->mnt_lflag |= MNT_LDRAIN;

	while (mp->mnt_count) {
		msleep((caddr_t)&mp->mnt_lflag, &mp->mnt_mlock, PVFS, "mount_drain", NULL);
	}

	if (mp->mnt_vnodelist.tqh_first != NULL) {
		panic("mount_refdrain: dangling vnode");
	}

	mp->mnt_lflag &= ~MNT_LDRAIN;

	return 0;
}

/* Tags the mount point as not supporting extended readdir for NFS exports */
void
mount_set_noreaddirext(mount_t mp)
{
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_DENY_READDIREXT;
	mount_unlock(mp);
}

/*
 * Mark a mount point as busy. Used to synchronize access and to delay
 * unmounting.
 */
int
vfs_busy(mount_t mp, int flags)
{
restart:
	if (mp->mnt_lflag & MNT_LDEAD) {
		return ENOENT;
	}

	mount_lock(mp);

	if (mp->mnt_lflag & MNT_LUNMOUNT) {
		if (flags & LK_NOWAIT || mp->mnt_lflag & MNT_LDEAD) {
			mount_unlock(mp);
			return ENOENT;
		}

		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		mp->mnt_lflag |= MNT_LWAIT;
		msleep((caddr_t)mp, &mp->mnt_mlock, (PVFS | PDROP), "vfsbusy", NULL);
		return ENOENT;
	}

	mount_unlock(mp);

	lck_rw_lock_shared(&mp->mnt_rwlock);

	/*
	 * Until we are granted the rwlock, it's possible for the mount point to
	 * change state, so re-evaluate before granting the vfs_busy.
	 */
	if (mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) {
		lck_rw_done(&mp->mnt_rwlock);
		goto restart;
	}
	return 0;
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
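
/*
 * A sketch of the usual vfs_busy()/vfs_unbusy() pairing: a successful
 * vfs_busy() holds mnt_rwlock shared, so every success must be balanced:
 *
 *	if (vfs_busy(mp, LK_NOWAIT) == 0) {
 *		// ... operate on the mount ...
 *		vfs_unbusy(mp);
 *	}
 */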



static void
vfs_rootmountfailed(mount_t mp)
{
	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	vfs_unbusy(mp);

	if (nc_smr_enabled) {
		vfs_smr_synchronize();
	}

	mount_lock_destroy(mp);

#if CONFIG_MACF
	mac_mount_label_destroy(mp);
#endif

	zfree(mount_zone, mp);
}

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
static mount_t
vfs_rootmountalloc_internal(struct vfstable *vfsp, const char *devname)
{
	mount_t mp;

	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
	/* Initialize the default IO constraints */
	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
	mp->mnt_devblocksize = DEV_BSIZE;
	mp->mnt_alignmentmask = PAGE_MASK;
	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
	mp->mnt_ioscale = 1;
	mp->mnt_ioflags = 0;
	mp->mnt_realrootvp = NULLVP;
	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
	mp->mnt_devbsdunit = 0;

	mount_lock_init(mp);
	(void)vfs_busy(mp, LK_NOWAIT);

	TAILQ_INIT(&mp->mnt_vnodelist);
	TAILQ_INIT(&mp->mnt_workerqueue);
	TAILQ_INIT(&mp->mnt_newvnodes);

	mp->mnt_vtable = vfsp;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_flag = MNT_RDONLY | MNT_ROOTFS;
	mp->mnt_vnodecovered = NULLVP;
	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;

	mount_list_lock();
	vfsp->vfc_refcount++;
	mount_list_unlock();

	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
	mp->mnt_vfsstat.f_mntonname[0] = '/';
	/* XXX const poisoning layering violation */
	(void) copystr((const void *)devname, mp->mnt_vfsstat.f_mntfromname, MAXPATHLEN - 1, NULL);

#if CONFIG_MACF
	mac_mount_label_init(mp);
	mac_mount_label_associate(vfs_context_kernel(), mp);
#endif
	return mp;
}

errno_t
vfs_rootmountalloc(const char *fstypename, const char *devname, mount_t *mpp)
{
	struct vfstable *vfsp;

	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (!strncmp(vfsp->vfc_name, fstypename,
		    sizeof(vfsp->vfc_name))) {
			break;
		}
	}
	if (vfsp == NULL) {
		return ENODEV;
	}

	*mpp = vfs_rootmountalloc_internal(vfsp, devname);

	if (*mpp) {
		return 0;
	}

	return ENOMEM;
}

#define DBG_MOUNTROOT   (FSDBG_CODE(DBG_MOUNT, 0))

/*
 * Find an appropriate filesystem to use for the root. If a filesystem
 * has not been preselected, walk through the list of known filesystems
 * trying those that have mountroot routines, and try them until one
 * works or we have tried them all.
 */
extern int (*mountroot)(void);

int
vfs_mountroot(void)
{
#if CONFIG_MACF
	struct vnode *vp;
#endif
	struct vfstable *vfsp;
	vfs_context_t ctx = vfs_context_kernel();
	struct vfs_attr vfsattr;
	int error;
	mount_t mp;
	vnode_t bdevvp_rootvp;

	/*
	 * Reset any prior "unmounting everything" state.  This handles the
	 * situation where mount root and then unmountall and re-mountroot
	 * a new image (see bsd/kern/imageboot.c).
	 */
	vfs_unmountall_started = vfs_unmountall_finished = 0;
	OSMemoryBarrier();

	KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_START);
	if (mountroot != NULL) {
		/*
		 * used for netboot which follows a different set of rules
		 */
		error = (*mountroot)();

		KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, error, 0);
		return error;
	}
	if ((error = bdevvp(rootdev, &rootvp))) {
		printf("vfs_mountroot: can't setup bdevvp\n");

		KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, error, 1);
		return error;
	}
	/*
	 * 4951998 - code we call in vfc_mountroot may replace rootvp
	 * so keep a local copy for some house keeping.
	 */
	bdevvp_rootvp = rootvp;

	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (vfsp->vfc_mountroot == NULL
		    && !ISSET(vfsp->vfc_vfsflags, VFC_VFSCANMOUNTROOT)) {
			continue;
		}

		mp = vfs_rootmountalloc_internal(vfsp, "root_device");
		mp->mnt_devvp = rootvp;

		if (vfsp->vfc_mountroot) {
			error = (*vfsp->vfc_mountroot)(mp, rootvp, ctx);
		} else {
			error = VFS_MOUNT(mp, rootvp, 0, ctx);
		}

		if (!error) {
			if (bdevvp_rootvp != rootvp) {
				/*
				 * rootvp changed...
				 * bump the iocount and fix up mnt_devvp for the
				 * new rootvp (it will already have a usecount taken)...
				 * drop the iocount and the usecount on the original
				 * since we are no longer going to use it...
				 */
				vnode_getwithref(rootvp);
				mp->mnt_devvp = rootvp;

				vnode_rele(bdevvp_rootvp);
				vnode_put(bdevvp_rootvp);
			}
			mp->mnt_devvp->v_specflags |= SI_MOUNTEDON;

			vfs_unbusy(mp);

			mount_list_add(mp);

			/*
			 * cache the IO attributes for the underlying physical media...
			 * an error return indicates the underlying driver doesn't
			 * support all the queries necessary... however, reasonable
			 * defaults will have been set, so no reason to bail or care
			 */
			vfs_init_io_attributes(rootvp, mp);

			if (mp->mnt_ioflags & MNT_IOFLAGS_FUSION_DRIVE) {
				root_is_CF_drive = TRUE;
			}

			/*
			 * Shadow the VFC_VFSNATIVEXATTR flag to MNTK_EXTENDED_ATTRS.
			 */
			if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
			}
			if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
				mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
			}

#if defined(XNU_TARGET_OS_OSX)
			uint32_t speed;

			if (MNTK_VIRTUALDEV & mp->mnt_kern_flag) {
				speed = 128;
			} else if (disk_conditioner_mount_is_ssd(mp)) {
				speed = 7 * 256;
			} else {
				speed = 256;
			}
			vc_progress_setdiskspeed(speed);
#endif /* XNU_TARGET_OS_OSX */
			/*
			 * Probe root file system for additional features.
			 */
			(void)VFS_START(mp, 0, ctx);

			VFSATTR_INIT(&vfsattr);
			VFSATTR_WANTED(&vfsattr, f_capabilities);
			if (vfs_getattr(mp, &vfsattr, ctx) == 0 &&
			    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
				if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
				    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
					mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
				}
#if NAMEDSTREAMS
				if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
				    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
					mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
				}
#endif
				if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
				    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
					mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
				}

				if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
				    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
					mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
				}
			}

			/*
			 * get rid of the iocount reference returned
			 * by bdevvp (or picked up by us on the substituted
			 * rootvp)... it (or we) will have also taken
			 * a usecount reference which we want to keep
			 */
			vnode_put(rootvp);

#if CONFIG_MACF
			if ((vfs_flags(mp) & MNT_MULTILABEL) == 0) {
				KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, 0, 2);
				return 0;
			}

			error = VFS_ROOT(mp, &vp, ctx);
			if (error) {
				printf("%s() VFS_ROOT() returned %d\n",
				    __func__, error);
				dounmount(mp, MNT_FORCE, 0, ctx);
				goto fail;
			}
			error = vnode_label(mp, NULL, vp, NULL, 0, ctx);
			/*
			 * get rid of reference provided by VFS_ROOT
			 */
			vnode_put(vp);

			if (error) {
				printf("%s() vnode_label() returned %d\n",
				    __func__, error);
				dounmount(mp, MNT_FORCE, 0, ctx);
				goto fail;
			}
#endif
			KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, 0, 3);
			return 0;
		}
		vfs_rootmountfailed(mp);
#if CONFIG_MACF
fail:
#endif
		if (error != EINVAL) {
			printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
		}
	}
	KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, error ? error : ENODEV, 4);
	return ENODEV;
}

static int
cache_purge_callback(mount_t mp, __unused void * arg)
{
	cache_purgevfs(mp);
	return VFS_RETURNED;
}

extern lck_rw_t rootvnode_rw_lock;
extern void set_rootvnode(vnode_t);


static int
mntonname_fixup_callback(mount_t mp, __unused void *arg)
{
	int error = 0;

	if ((strncmp(&mp->mnt_vfsstat.f_mntonname[0], "/", sizeof("/")) == 0) ||
	    (strncmp(&mp->mnt_vfsstat.f_mntonname[0], "/dev", sizeof("/dev")) == 0)) {
		return 0;
	}

	if ((error = vfs_busy(mp, LK_NOWAIT))) {
		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
		return -1;
	}

	size_t pathlen = MAXPATHLEN;
	if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
	}

	vfs_unbusy(mp);

	return error;
}

static int
clear_mntk_backs_root_callback(mount_t mp, __unused void *arg)
{
	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	mp->mnt_kern_flag &= ~MNTK_BACKS_ROOT;
	lck_rw_done(&mp->mnt_rwlock);
	return VFS_RETURNED;
}

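/*
 * Verify that *incoming_rootvnodep is the root directory of a mounted,
 * non-busy filesystem that may serve as the root. On success the iocount
 * is moved onto the filesystem's true root vnode; on failure the iocount
 * is dropped and *incoming_rootvnodep is set to NULLVP.
 */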
static int
verify_incoming_rootfs(vnode_t *incoming_rootvnodep, vfs_context_t ctx,
    vfs_switch_root_flags_t flags)
{
	mount_t mp;
	vnode_t tdp;
	vnode_t incoming_rootvnode_with_iocount = *incoming_rootvnodep;
	vnode_t incoming_rootvnode_with_usecount = NULLVP;
	int error = 0;

	if (vnode_vtype(incoming_rootvnode_with_iocount) != VDIR) {
		printf("Incoming rootfs path not a directory\n");
		error = ENOTDIR;
		goto done;
	}

	/*
	 * Before we call VFS_ROOT, we have to let go of the iocount already
	 * acquired, but before doing that get a usecount.
	 */
	vnode_ref_ext(incoming_rootvnode_with_iocount, 0, VNODE_REF_FORCE);
	incoming_rootvnode_with_usecount = incoming_rootvnode_with_iocount;
	vnode_lock_spin(incoming_rootvnode_with_usecount);
	if ((mp = incoming_rootvnode_with_usecount->v_mount)) {
		mp->mnt_crossref++;
		vnode_unlock(incoming_rootvnode_with_usecount);
	} else {
		vnode_unlock(incoming_rootvnode_with_usecount);
		printf("Incoming rootfs root vnode does not have associated mount\n");
		error = ENOTDIR;
		goto done;
	}

	if (vfs_busy(mp, LK_NOWAIT)) {
		printf("Incoming rootfs root vnode mount is busy\n");
		error = ENOENT;
		goto out;
	}

	vnode_put(incoming_rootvnode_with_iocount);
	incoming_rootvnode_with_iocount = NULLVP;

	error = VFS_ROOT(mp, &tdp, ctx);

	if (error) {
		printf("Could not get rootvnode of incoming rootfs\n");
	} else if (tdp != incoming_rootvnode_with_usecount) {
		vnode_put(tdp);
		tdp = NULLVP;
1495 | printf("Incoming rootfs root vnode mount is is not a mountpoint\n" ); |
		error = EINVAL;
		goto out_busy;
	} else {
		incoming_rootvnode_with_iocount = tdp;
		tdp = NULLVP;
	}

	if ((flags & VFSSR_VIRTUALDEV_PROHIBITED) != 0) {
		if (mp->mnt_kern_flag & MNTK_VIRTUALDEV) {
			error = ENODEV;
		}
		if (error) {
			printf("Incoming rootfs is backed by a virtual device; cannot switch to it\n");
			goto out_busy;
		}
	}

out_busy:
	vfs_unbusy(mp);

out:
	vnode_lock(incoming_rootvnode_with_usecount);
	mp->mnt_crossref--;
	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}
	vnode_unlock(incoming_rootvnode_with_usecount);

done:
	if (incoming_rootvnode_with_usecount) {
		vnode_rele(incoming_rootvnode_with_usecount);
		incoming_rootvnode_with_usecount = NULLVP;
	}

	if (error && incoming_rootvnode_with_iocount) {
		vnode_put(incoming_rootvnode_with_iocount);
		incoming_rootvnode_with_iocount = NULLVP;
	}

	*incoming_rootvnodep = incoming_rootvnode_with_iocount;
	return error;
}

/*
 * vfs_switch_root()
 *
 * Move the current root volume, and put a different volume at the root.
 *
 * incoming_vol_old_path: This is the path where the incoming root volume
 * is mounted when this function begins.
 * outgoing_vol_new_path: This is the path where the outgoing root volume
 * will be mounted when this function (successfully) ends.
 * Note: Do not use a leading slash.
 *
 * Volumes mounted at several fixed points (including /dev) will be preserved
 * at the same absolute path. That means they will move within the folder
 * hierarchy during the pivot operation. For example, /dev before the pivot
 * will be at /dev after the pivot.
 *
 * If any filesystem has MNTK_BACKS_ROOT set, it will be cleared. If the
 * incoming root volume is actually a disk image backed by some other
 * filesystem, it is the caller's responsibility to re-set MNTK_BACKS_ROOT
 * as appropriate.
 */
int
vfs_switch_root(const char *incoming_vol_old_path,
    const char *outgoing_vol_new_path,
    vfs_switch_root_flags_t flags)
{
	// grumble grumble
#define countof(x) (sizeof(x) / sizeof(x[0]))

	struct preserved_mount {
		vnode_t pm_rootvnode;
		mount_t pm_mount;
		vnode_t pm_new_covered_vp;
		vnode_t pm_old_covered_vp;
		const char *pm_path;
	};

	vfs_context_t ctx = vfs_context_kernel();
	vnode_t incoming_rootvnode = NULLVP;
	vnode_t outgoing_vol_new_covered_vp = NULLVP;
	vnode_t incoming_vol_old_covered_vp = NULLVP;
	mount_t outgoing = NULL;
	mount_t incoming = NULL;

	struct preserved_mount devfs = { NULLVP, NULL, NULLVP, NULLVP, "dev" };
	struct preserved_mount preboot = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/Preboot" };
	struct preserved_mount recovery = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/Recovery" };
	struct preserved_mount vm = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/VM" };
	struct preserved_mount update = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/Update" };
	struct preserved_mount iscPreboot = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/iSCPreboot" };
	struct preserved_mount hardware = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/Hardware" };
	struct preserved_mount xarts = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/xarts" };
	struct preserved_mount factorylogs = { NULLVP, NULL, NULLVP, NULLVP, "FactoryLogs" };
	struct preserved_mount idiags = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/Diags" };

	struct preserved_mount *preserved[10];
	preserved[0] = &devfs;
	preserved[1] = &preboot;
	preserved[2] = &recovery;
	preserved[3] = &vm;
	preserved[4] = &update;
	preserved[5] = &iscPreboot;
	preserved[6] = &hardware;
	preserved[7] = &xarts;
	preserved[8] = &factorylogs;
	preserved[9] = &idiags;

	int error;

	printf("%s : shuffling mount points : %s <-> / <-> %s\n", __FUNCTION__, incoming_vol_old_path, outgoing_vol_new_path);

	if (outgoing_vol_new_path[0] == '/') {
		// I should have written this to be more helpful and just advance the pointer forward past the slash
		printf("Do not use a leading slash in outgoing_vol_new_path\n");
		return EINVAL;
	}

	// Set incoming_rootvnode.
	// Find the vnode representing the mountpoint of the new root
	// filesystem. That will be the new root directory.
	error = vnode_lookup(incoming_vol_old_path, 0, &incoming_rootvnode, ctx);
	if (error) {
		printf("Incoming rootfs root vnode not found\n");
		error = ENOENT;
		goto done;
	}

	/*
	 * This function drops the iocount and sets the vnode to NULL on error.
	 */
1629 | error = verify_incoming_rootfs(incoming_rootvnodep: &incoming_rootvnode, ctx, flags); |
1630 | if (error) { |
1631 | goto done; |
1632 | } |
1633 | |
1634 | /* |
1635 | * Set outgoing_vol_new_covered_vp. |
1636 | * Find the vnode representing the future mountpoint of the old |
1637 | * root filesystem, inside the directory incoming_rootvnode. |
1638 | * Right now it's at "/incoming_vol_old_path/outgoing_vol_new_path". |
1639 | * soon it will become "/oldrootfs_path_after", which will be covered. |
1640 | */ |
1641 | error = vnode_lookupat(path: outgoing_vol_new_path, flags: 0, vpp: &outgoing_vol_new_covered_vp, ctx, start_dvp: incoming_rootvnode); |
1642 | if (error) { |
1643 | printf("Outgoing rootfs path not found, abandoning / switch, error = %d\n" , error); |
1644 | error = ENOENT; |
1645 | goto done; |
1646 | } |
1647 | if (vnode_vtype(vp: outgoing_vol_new_covered_vp) != VDIR) { |
1648 | printf("Outgoing rootfs path is not a directory, abandoning / switch\n" ); |
1649 | error = ENOTDIR; |
1650 | goto done; |
1651 | } |
1652 | |
1653 | /* |
1654 | * Find the preserved mounts - see if they are mounted. Get their root |
1655 | * vnode if they are. If they aren't, leave rootvnode NULL which will |
1656 | * be the signal to ignore this mount later on. |
1657 | * |
1658 | * Also get preserved mounts' new_covered_vp. |
1659 | * Find the node representing the folder "dev" inside the directory newrootvnode. |
1660 | * Right now it's at "/incoming_vol_old_path/dev". |
1661 | * Soon it will become /dev, which will be covered by the devfs mountpoint. |
1662 | */ |
1663 | for (size_t i = 0; i < countof(preserved); i++) { |
1664 | struct preserved_mount *pmi = preserved[i]; |
1665 | |
1666 | error = vnode_lookupat(path: pmi->pm_path, flags: 0, vpp: &pmi->pm_rootvnode, ctx, start_dvp: rootvnode); |
1667 | if (error) { |
1668 | printf("skipping preserved mountpoint because not found or error: %d: %s\n" , error, pmi->pm_path); |
1669 | // not fatal. try the next one in the list. |
1670 | continue; |
1671 | } |
1672 | bool is_mountpoint = false; |
1673 | vnode_lock_spin(pmi->pm_rootvnode); |
1674 | if ((pmi->pm_rootvnode->v_flag & VROOT) != 0) { |
1675 | is_mountpoint = true; |
1676 | } |
1677 | vnode_unlock(pmi->pm_rootvnode); |
1678 | if (!is_mountpoint) { |
1679 | printf("skipping preserved mountpoint because not a mountpoint: %s\n" , pmi->pm_path); |
1680 | vnode_put(vp: pmi->pm_rootvnode); |
1681 | pmi->pm_rootvnode = NULLVP; |
1682 | // not fatal. try the next one in the list. |
1683 | continue; |
1684 | } |
1685 | |
1686 | error = vnode_lookupat(path: pmi->pm_path, flags: 0, vpp: &pmi->pm_new_covered_vp, ctx, start_dvp: incoming_rootvnode); |
1687 | if (error) { |
1688 | printf("preserved new mount directory not found or error: %d: %s\n" , error, pmi->pm_path); |
1689 | error = ENOENT; |
1690 | goto done; |
1691 | } |
1692 | if (vnode_vtype(vp: pmi->pm_new_covered_vp) != VDIR) { |
1693 | printf("preserved new mount directory not directory: %s\n" , pmi->pm_path); |
1694 | error = ENOTDIR; |
1695 | goto done; |
1696 | } |
1697 | |
1698 | printf("will preserve mountpoint across pivot: /%s\n" , pmi->pm_path); |
1699 | } |
1700 | |
1701 | /* |
1702 | * -- |
1703 | * At this point, everything has been prepared and all error conditions |
1704 | * have been checked. We check everything we can before this point; |
1705 | * from now on we start making destructive changes, and we can't stop |
1706 | * until we reach the end. |
1707 | * ---- |
1708 | */ |
1709 | |
1710 | /* this usecount is transferred to the mnt_vnodecovered */ |
1711 | vnode_ref_ext(outgoing_vol_new_covered_vp, 0, VNODE_REF_FORCE); |
1712 | /* this usecount is transferred to set_rootvnode */ |
1713 | vnode_ref_ext(incoming_rootvnode, 0, VNODE_REF_FORCE); |
1714 | |
1715 | |
1716 | for (size_t i = 0; i < countof(preserved); i++) { |
1717 | struct preserved_mount *pmi = preserved[i]; |
1718 | if (pmi->pm_rootvnode == NULLVP) { |
1719 | continue; |
1720 | } |
1721 | |
1722 | /* this usecount is transferred to the mnt_vnodecovered */ |
1723 | vnode_ref_ext(pmi->pm_new_covered_vp, 0, VNODE_REF_FORCE); |
1724 | |
1725 | /* The new_covered_vp is a mountpoint from now on. */ |
1726 | vnode_lock_spin(pmi->pm_new_covered_vp); |
1727 | pmi->pm_new_covered_vp->v_flag |= VMOUNTEDHERE; |
1728 | vnode_unlock(pmi->pm_new_covered_vp); |
1729 | } |
1730 | |
1731 | /* The outgoing_vol_new_covered_vp is a mountpoint from now on. */ |
1732 | vnode_lock_spin(outgoing_vol_new_covered_vp); |
1733 | outgoing_vol_new_covered_vp->v_flag |= VMOUNTEDHERE; |
1734 | vnode_unlock(outgoing_vol_new_covered_vp); |
1735 | |
1736 | |
1737 | /* |
1738 | * Identify the mount_ts of the mounted filesystems that are being |
1739 | * manipulated: outgoing rootfs, incoming rootfs, and the preserved |
1740 | * mounts. |
1741 | */ |
1742 | outgoing = rootvnode->v_mount; |
1743 | incoming = incoming_rootvnode->v_mount; |
1744 | for (size_t i = 0; i < countof(preserved); i++) { |
1745 | struct preserved_mount *pmi = preserved[i]; |
1746 | if (pmi->pm_rootvnode == NULLVP) { |
1747 | continue; |
1748 | } |
1749 | |
1750 | pmi->pm_mount = pmi->pm_rootvnode->v_mount; |
1751 | } |
1752 | |
lck_rw_lock_exclusive(&rootvnode_rw_lock);

/* Setup incoming as the new rootfs */
lck_rw_lock_exclusive(&incoming->mnt_rwlock);
incoming_vol_old_covered_vp = incoming->mnt_vnodecovered;
incoming->mnt_vnodecovered = NULLVP;
strlcpy(incoming->mnt_vfsstat.f_mntonname, "/", MAXPATHLEN);
incoming->mnt_flag |= MNT_ROOTFS;
lck_rw_done(&incoming->mnt_rwlock);
1762 | |
1763 | /* |
1764 | * The preserved mountpoints will now be moved to |
1765 | * incoming_rootnode/pm_path, and then by the end of the function, |
1766 | * since incoming_rootnode is going to /, the preserved mounts |
1767 | * will be end up back at /pm_path |
1768 | */ |
1769 | for (size_t i = 0; i < countof(preserved); i++) { |
1770 | struct preserved_mount *pmi = preserved[i]; |
1771 | if (pmi->pm_rootvnode == NULLVP) { |
1772 | continue; |
1773 | } |
1774 | |
1775 | lck_rw_lock_exclusive(lck: &pmi->pm_mount->mnt_rwlock); |
1776 | pmi->pm_old_covered_vp = pmi->pm_mount->mnt_vnodecovered; |
1777 | pmi->pm_mount->mnt_vnodecovered = pmi->pm_new_covered_vp; |
1778 | vnode_lock_spin(pmi->pm_new_covered_vp); |
1779 | pmi->pm_new_covered_vp->v_mountedhere = pmi->pm_mount; |
1780 | SET(pmi->pm_new_covered_vp->v_flag, VMOUNTEDHERE); |
1781 | vnode_unlock(pmi->pm_new_covered_vp); |
1782 | lck_rw_done(lck: &pmi->pm_mount->mnt_rwlock); |
1783 | } |
1784 | |
1785 | /* |
1786 | * The old root volume now covers outgoing_vol_new_covered_vp |
1787 | * on the new root volume. Remove the ROOTFS marker. |
1788 | * Now it is to be found at outgoing_vol_new_path |
1789 | */ |
1790 | lck_rw_lock_exclusive(lck: &outgoing->mnt_rwlock); |
1791 | outgoing->mnt_vnodecovered = outgoing_vol_new_covered_vp; |
1792 | strlcpy(dst: outgoing->mnt_vfsstat.f_mntonname, src: "/" , MAXPATHLEN); |
1793 | strlcat(dst: outgoing->mnt_vfsstat.f_mntonname, src: outgoing_vol_new_path, MAXPATHLEN); |
1794 | outgoing->mnt_flag &= ~MNT_ROOTFS; |
1795 | vnode_lock_spin(outgoing_vol_new_covered_vp); |
1796 | outgoing_vol_new_covered_vp->v_mountedhere = outgoing; |
1797 | vnode_unlock(outgoing_vol_new_covered_vp); |
1798 | lck_rw_done(lck: &outgoing->mnt_rwlock); |
1799 | |
1800 | if (!(outgoing->mnt_kern_flag & MNTK_VIRTUALDEV) && |
1801 | (TAILQ_FIRST(&mountlist) == outgoing)) { |
vfs_setmntsystem(outgoing);
1803 | } |
1804 | |
1805 | /* |
1806 | * Finally, remove the mount_t linkage from the previously covered |
1807 | * vnodes on the old root volume. These were incoming_vol_old_path, |
* and each preserved mount's "/pm_path". The filesystems previously
1809 | * mounted there have already been moved away. |
1810 | */ |
1811 | vnode_lock_spin(incoming_vol_old_covered_vp); |
1812 | incoming_vol_old_covered_vp->v_flag &= ~VMOUNT; |
1813 | incoming_vol_old_covered_vp->v_mountedhere = NULL; |
1814 | vnode_unlock(incoming_vol_old_covered_vp); |
1815 | |
1816 | for (size_t i = 0; i < countof(preserved); i++) { |
1817 | struct preserved_mount *pmi = preserved[i]; |
1818 | if (pmi->pm_rootvnode == NULLVP) { |
1819 | continue; |
1820 | } |
1821 | |
1822 | vnode_lock_spin(pmi->pm_old_covered_vp); |
1823 | CLR(pmi->pm_old_covered_vp->v_flag, VMOUNTEDHERE); |
1824 | pmi->pm_old_covered_vp->v_mountedhere = NULL; |
1825 | vnode_unlock(pmi->pm_old_covered_vp); |
1826 | } |
1827 | |
1828 | /* |
1829 | * Clear the name cache since many cached names are now invalid. |
1830 | */ |
vfs_iterate(0 /* flags */, cache_purge_callback, NULL);
1832 | |
1833 | /* |
1834 | * Actually change the rootvnode! And finally drop the lock that |
1835 | * prevents concurrent vnode_lookups. |
1836 | */ |
1837 | set_rootvnode(incoming_rootvnode); |
lck_rw_unlock_exclusive(&rootvnode_rw_lock);
1839 | |
1840 | if (!(incoming->mnt_kern_flag & MNTK_VIRTUALDEV) && |
1841 | !(outgoing->mnt_kern_flag & MNTK_VIRTUALDEV)) { |
1842 | /* |
1843 | * Switch the order of mount structures in the mountlist, new root |
1844 | * mount moves to the head of the list followed by /dev and the other |
1845 | * preserved mounts then all the preexisting mounts (old rootfs + any |
1846 | * others) |
1847 | */ |
1848 | mount_list_lock(); |
1849 | for (size_t i = 0; i < countof(preserved); i++) { |
1850 | struct preserved_mount *pmi = preserved[i]; |
1851 | if (pmi->pm_rootvnode == NULLVP) { |
1852 | continue; |
1853 | } |
1854 | |
1855 | TAILQ_REMOVE(&mountlist, pmi->pm_mount, mnt_list); |
1856 | TAILQ_INSERT_HEAD(&mountlist, pmi->pm_mount, mnt_list); |
1857 | } |
1858 | TAILQ_REMOVE(&mountlist, incoming, mnt_list); |
1859 | TAILQ_INSERT_HEAD(&mountlist, incoming, mnt_list); |
1860 | mount_list_unlock(); |
1861 | } |
1862 | |
1863 | /* |
1864 | * Fixups across all volumes |
1865 | */ |
vfs_iterate(0 /* flags */, mntonname_fixup_callback, NULL);
vfs_iterate(0 /* flags */, clear_mntk_backs_root_callback, NULL);
1868 | |
1869 | error = 0; |
1870 | |
1871 | done: |
for (size_t i = 0; i < countof(preserved); i++) {
struct preserved_mount *pmi = preserved[i];

if (pmi->pm_rootvnode) {
vnode_put(pmi->pm_rootvnode);
}
if (pmi->pm_new_covered_vp) {
vnode_put(pmi->pm_new_covered_vp);
}
if (pmi->pm_old_covered_vp) {
vnode_rele(pmi->pm_old_covered_vp);
}
}

if (outgoing_vol_new_covered_vp) {
vnode_put(outgoing_vol_new_covered_vp);
}

if (incoming_vol_old_covered_vp) {
vnode_rele(incoming_vol_old_covered_vp);
}

if (incoming_rootvnode) {
vnode_put(incoming_rootvnode);
}

printf("%s : done shuffling mount points with error: %d\n", __FUNCTION__, error);
1899 | return error; |
1900 | } |
1901 | |
1902 | /* |
1903 | * Mount the Recovery volume of a container |
1904 | */ |
1905 | int |
1906 | vfs_mount_recovery(void) |
1907 | { |
1908 | #if CONFIG_MOUNT_PREBOOTRECOVERY |
1909 | int error = 0; |
1910 | |
error = vnode_get(rootvnode);
if (error) {
/* root must be mounted first */
printf("vnode_get(rootvnode) failed with error %d\n", error);
return error;
}

char recoverypath[] = PLATFORM_RECOVERY_VOLUME_MOUNT_POINT; /* !const because of internal casting */

/* Mount the recovery volume */
printf("attempting kernel mount for recovery volume...\n");
error = kernel_mount(rootvnode->v_mount->mnt_vfsstat.f_fstypename, NULLVP, NULLVP,
recoverypath, (rootvnode->v_mount), 0, 0, (KERNEL_MOUNT_RECOVERYVOL), vfs_context_kernel());

if (error) {
printf("Failed to mount recovery volume (%d)\n", error);
} else {
printf("mounted recovery volume\n");
}

vnode_put(rootvnode);
1932 | return error; |
1933 | #else |
1934 | return 0; |
1935 | #endif |
1936 | } |
1937 | |
1938 | /* |
1939 | * Lookup a mount point by filesystem identifier. |
1940 | */ |
1941 | |
1942 | struct mount * |
1943 | vfs_getvfs(fsid_t *fsid) |
1944 | { |
1945 | return mount_list_lookupby_fsid(fsid, 0, 0); |
1946 | } |
1947 | |
1948 | static struct mount * |
1949 | vfs_getvfs_locked(fsid_t *fsid) |
1950 | { |
1951 | return mount_list_lookupby_fsid(fsid, 1, 0); |
1952 | } |
1953 | |
1954 | struct mount * |
1955 | vfs_getvfs_with_vfsops(fsid_t *fsid, const struct vfsops * const ops) |
1956 | { |
1957 | mount_t mp = mount_list_lookupby_fsid(fsid, 0, 0); |
1958 | |
1959 | if (mp != NULL && mp->mnt_op != ops) { |
1960 | mp = NULL; |
1961 | } |
1962 | return mp; |
1963 | } |
1964 | |
1965 | struct mount * |
1966 | vfs_getvfs_by_mntonname(char *path) |
1967 | { |
1968 | mount_t retmp = (mount_t)0; |
1969 | mount_t mp; |
1970 | |
1971 | mount_list_lock(); |
1972 | TAILQ_FOREACH(mp, &mountlist, mnt_list) { |
if (!strncmp(mp->mnt_vfsstat.f_mntonname, path,
sizeof(mp->mnt_vfsstat.f_mntonname))) {
retmp = mp;
if (mount_iterref(retmp, 1)) {
1977 | retmp = NULL; |
1978 | } |
1979 | goto out; |
1980 | } |
1981 | } |
1982 | out: |
1983 | mount_list_unlock(); |
1984 | return retmp; |
1985 | } |
1986 | |
1987 | /* generation number for creation of new fsids */ |
1988 | u_short mntid_gen = 0; |
1989 | /* |
1990 | * Get a new unique fsid |
1991 | */ |
1992 | void |
1993 | vfs_getnewfsid(struct mount *mp) |
1994 | { |
1995 | fsid_t tfsid; |
1996 | int mtype; |
1997 | |
1998 | mount_list_lock(); |
1999 | |
2000 | /* generate a new fsid */ |
2001 | mtype = mp->mnt_vtable->vfc_typenum; |
2002 | if (++mntid_gen == 0) { |
2003 | mntid_gen++; |
2004 | } |
2005 | tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen); |
2006 | tfsid.val[1] = mtype; |
2007 | |
2008 | while (vfs_getvfs_locked(fsid: &tfsid)) { |
2009 | if (++mntid_gen == 0) { |
2010 | mntid_gen++; |
2011 | } |
2012 | tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen); |
2013 | } |
2014 | |
2015 | mp->mnt_vfsstat.f_fsid.val[0] = tfsid.val[0]; |
2016 | mp->mnt_vfsstat.f_fsid.val[1] = tfsid.val[1]; |
2017 | mount_list_unlock(); |
2018 | } |
2019 | |
2020 | /* |
2021 | * Routines having to do with the management of the vnode table. |
2022 | */ |
2023 | extern int(**dead_vnodeop_p)(void *); |
2024 | long numvnodes, freevnodes, deadvnodes, async_work_vnodes; |
2025 | long busyvnodes = 0; |
2026 | long deadvnodes_noreuse = 0; |
2027 | int32_t freeablevnodes = 0; |
2028 | uint64_t allocedvnodes = 0; |
2029 | uint64_t deallocedvnodes = 0; |
2030 | |
2031 | |
2032 | int async_work_timed_out = 0; |
2033 | int async_work_handled = 0; |
2034 | int dead_vnode_wanted = 0; |
2035 | int dead_vnode_waited = 0; |
2036 | |
2037 | /* |
2038 | * Move a vnode from one mount queue to another. |
2039 | */ |
2040 | static void |
2041 | insmntque(vnode_t vp, mount_t mp) |
2042 | { |
2043 | mount_t lmp; |
2044 | /* |
2045 | * Delete from old mount point vnode list, if on one. |
2046 | */ |
if ((lmp = vp->v_mount) != NULL && lmp != dead_mountp) {
if ((vp->v_lflag & VNAMED_MOUNT) == 0) {
panic("insmntque: vp not in mount vnode list");
}
vp->v_lflag &= ~VNAMED_MOUNT;

mount_lock_spin(lmp);

mount_drop(lmp, 1);
2056 | |
2057 | if (vp->v_mntvnodes.tqe_next == NULL) { |
2058 | if (TAILQ_LAST(&lmp->mnt_vnodelist, vnodelst) == vp) { |
2059 | TAILQ_REMOVE(&lmp->mnt_vnodelist, vp, v_mntvnodes); |
2060 | } else if (TAILQ_LAST(&lmp->mnt_newvnodes, vnodelst) == vp) { |
2061 | TAILQ_REMOVE(&lmp->mnt_newvnodes, vp, v_mntvnodes); |
2062 | } else if (TAILQ_LAST(&lmp->mnt_workerqueue, vnodelst) == vp) { |
2063 | TAILQ_REMOVE(&lmp->mnt_workerqueue, vp, v_mntvnodes); |
2064 | } |
2065 | } else { |
2066 | vp->v_mntvnodes.tqe_next->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_prev; |
2067 | *vp->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_next; |
2068 | } |
2069 | vp->v_mntvnodes.tqe_next = NULL; |
2070 | vp->v_mntvnodes.tqe_prev = NULL; |
mount_unlock(lmp);
2072 | vnode_drop(vp); |
2073 | return; |
2074 | } |
2075 | |
2076 | /* |
2077 | * Insert into list of vnodes for the new mount point, if available. |
2078 | */ |
2079 | if ((vp->v_mount = mp) != NULL) { |
mount_lock_spin(mp);
if ((vp->v_mntvnodes.tqe_next != 0) && (vp->v_mntvnodes.tqe_prev != 0)) {
panic("vp already in mount list");
}
if (mp->mnt_lflag & MNT_LITER) {
TAILQ_INSERT_HEAD(&mp->mnt_newvnodes, vp, v_mntvnodes);
} else {
TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
}
if (vp->v_lflag & VNAMED_MOUNT) {
panic("insmntque: vp already in mount vnode list");
}
vnode_hold(vp);
vp->v_lflag |= VNAMED_MOUNT;
mount_ref(mp, 1);
2095 | mount_unlock(mp); |
2096 | } |
2097 | } |
2098 | |
2099 | |
2100 | /* |
2101 | * Create a vnode for a block device. |
2102 | * Used for root filesystem, argdev, and swap areas. |
2103 | * Also used for memory file system special devices. |
2104 | */ |
2105 | int |
2106 | bdevvp(dev_t dev, vnode_t *vpp) |
2107 | { |
2108 | vnode_t nvp; |
2109 | int error; |
2110 | struct vnode_fsparam vfsp; |
2111 | struct vfs_context context; |
2112 | |
2113 | if (dev == NODEV) { |
2114 | *vpp = NULLVP; |
2115 | return ENODEV; |
2116 | } |
2117 | |
2118 | context.vc_thread = current_thread(); |
2119 | context.vc_ucred = FSCRED; |
2120 | |
2121 | vfsp.vnfs_mp = (struct mount *)0; |
2122 | vfsp.vnfs_vtype = VBLK; |
2123 | vfsp.vnfs_str = "bdevvp" ; |
2124 | vfsp.vnfs_dvp = NULL; |
2125 | vfsp.vnfs_fsnode = NULL; |
2126 | vfsp.vnfs_cnp = NULL; |
2127 | vfsp.vnfs_vops = spec_vnodeop_p; |
2128 | vfsp.vnfs_rdev = dev; |
2129 | vfsp.vnfs_filesize = 0; |
2130 | |
2131 | vfsp.vnfs_flags = VNFS_NOCACHE | VNFS_CANTCACHE; |
2132 | |
2133 | vfsp.vnfs_marksystem = 0; |
2134 | vfsp.vnfs_markroot = 0; |
2135 | |
if ((error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &nvp))) {
2137 | *vpp = NULLVP; |
2138 | return error; |
2139 | } |
2140 | vnode_lock_spin(nvp); |
2141 | nvp->v_flag |= VBDEVVP; |
2142 | nvp->v_tag = VT_NON; /* set this to VT_NON so during aliasing it can be replaced */ |
2143 | vnode_unlock(nvp); |
if ((error = vnode_ref(nvp))) {
panic("bdevvp failed: vnode_ref");
return error;
}
if ((error = VNOP_FSYNC(nvp, MNT_WAIT, &context))) {
panic("bdevvp failed: fsync");
return error;
}
if ((error = buf_invalidateblks(nvp, BUF_WRITE_DATA, 0, 0))) {
panic("bdevvp failed: invalidateblks");
2154 | return error; |
2155 | } |
2156 | |
2157 | #if CONFIG_MACF |
2158 | /* |
2159 | * XXXMAC: We can't put a MAC check here, the system will |
2160 | * panic without this vnode. |
2161 | */ |
2162 | #endif /* MAC */ |
2163 | |
2164 | if ((error = VNOP_OPEN(nvp, FREAD, &context))) { |
2165 | panic("bdevvp failed: open" ); |
2166 | return error; |
2167 | } |
2168 | *vpp = nvp; |
2169 | |
2170 | return 0; |
2171 | } |
2172 | |
2173 | /* |
2174 | * Check to see if the new vnode represents a special device |
2175 | * for which we already have a vnode (either because of |
2176 | * bdevvp() or because of a different vnode representing |
2177 | * the same block device). If such an alias exists, deallocate |
2178 | * the existing contents and return the aliased vnode. The |
2179 | * caller is responsible for filling it with its new contents. |
2180 | */ |
2181 | static vnode_t |
2182 | checkalias(struct vnode *nvp, dev_t nvp_rdev) |
2183 | { |
2184 | struct vnode *vp; |
2185 | struct vnode **vpp; |
2186 | struct specinfo *sin = NULL; |
2187 | int vid = 0; |
2188 | |
2189 | vpp = &speclisth[SPECHASH(nvp_rdev)]; |
2190 | loop: |
2191 | SPECHASH_LOCK(); |
2192 | |
2193 | for (vp = *vpp; vp; vp = vp->v_specnext) { |
2194 | if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) { |
2195 | vid = vp->v_id; |
2196 | vnode_hold(vp); |
2197 | break; |
2198 | } |
2199 | } |
2200 | SPECHASH_UNLOCK(); |
2201 | |
2202 | if (vp) { |
2203 | found_alias: |
2204 | if (vnode_getwithvid(vp, vid)) { |
2205 | vnode_drop(vp); |
2206 | goto loop; |
2207 | } |
2208 | vnode_drop(vp); |
2209 | /* |
2210 | * Termination state is checked in vnode_getwithvid |
2211 | */ |
2212 | vnode_lock(vp); |
2213 | |
2214 | /* |
2215 | * Alias, but not in use, so flush it out. |
2216 | */ |
2217 | if ((vp->v_iocount == 1) && (vp->v_usecount == 0)) { |
2218 | vnode_hold(vp); |
2219 | vnode_reclaim_internal(vp, 1, 1, 0); |
2220 | vnode_put_locked(vp); |
2221 | vnode_drop_and_unlock(vp); |
2222 | goto loop; |
2223 | } |
2224 | } |
2225 | if (vp == NULL || vp->v_tag != VT_NON) { |
2226 | if (sin == NULL) { |
2227 | sin = zalloc_flags(specinfo_zone, Z_WAITOK | Z_ZERO); |
2228 | } else { |
bzero(sin, sizeof(struct specinfo));
2230 | } |
2231 | |
2232 | nvp->v_specinfo = sin; |
2233 | nvp->v_rdev = nvp_rdev; |
2234 | nvp->v_specflags = 0; |
2235 | nvp->v_speclastr = -1; |
2236 | nvp->v_specinfo->si_opencount = 0; |
2237 | nvp->v_specinfo->si_initted = 0; |
2238 | nvp->v_specinfo->si_throttleable = 0; |
2239 | nvp->v_specinfo->si_devbsdunit = LOWPRI_MAX_NUM_DEV; |
2240 | |
2241 | SPECHASH_LOCK(); |
2242 | |
2243 | /* We dropped the lock, someone could have added */ |
2244 | if (vp == NULLVP) { |
2245 | for (vp = *vpp; vp; vp = vp->v_specnext) { |
2246 | if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) { |
2247 | vid = vp->v_id; |
2248 | vnode_hold(vp); |
2249 | SPECHASH_UNLOCK(); |
2250 | goto found_alias; |
2251 | } |
2252 | } |
2253 | } |
2254 | |
2255 | nvp->v_hashchain = vpp; |
2256 | nvp->v_specnext = *vpp; |
2257 | *vpp = nvp; |
2258 | |
2259 | if (vp != NULLVP) { |
2260 | nvp->v_specflags |= SI_ALIASED; |
2261 | vp->v_specflags |= SI_ALIASED; |
2262 | SPECHASH_UNLOCK(); |
2263 | vnode_put_locked(vp); |
2264 | vnode_unlock(vp); |
2265 | } else { |
2266 | SPECHASH_UNLOCK(); |
2267 | } |
2268 | |
2269 | return NULLVP; |
2270 | } |
2271 | |
2272 | if (sin) { |
2273 | zfree(specinfo_zone, sin); |
2274 | } |
2275 | |
2276 | if ((vp->v_flag & (VBDEVVP | VDEVFLUSH)) != 0) { |
2277 | return vp; |
2278 | } |
2279 | |
2280 | panic("checkalias with VT_NON vp that shouldn't: %p" , vp); |
2281 | |
2282 | return vp; |
2283 | } |
2284 | |
2285 | |
2286 | /* |
2287 | * Get a reference on a particular vnode and lock it if requested. |
2288 | * If the vnode was on the inactive list, remove it from the list. |
2289 | * If the vnode was on the free list, remove it from the list and |
2290 | * move it to inactive list as needed. |
2291 | * The vnode lock bit is set if the vnode is being eliminated in |
2292 | * vgone. The process is awakened when the transition is completed, |
2293 | * and an error returned to indicate that the vnode is no longer |
2294 | * usable (possibly having been changed to a new file system type). |
2295 | */ |
2296 | int |
2297 | vget_internal(vnode_t vp, int vid, int vflags) |
2298 | { |
2299 | int error = 0; |
2300 | |
2301 | vnode_lock_spin(vp); |
2302 | |
2303 | if ((vflags & VNODE_WRITEABLE) && (vp->v_writecount == 0)) { |
2304 | /* |
2305 | * vnode to be returned only if it has writers opened |
2306 | */ |
2307 | error = EINVAL; |
2308 | } else { |
2309 | error = vnode_getiocount(vp, vid, vflags); |
2310 | } |
2311 | |
2312 | vnode_unlock(vp); |
2313 | |
2314 | return error; |
2315 | } |
2316 | |
2317 | /* |
2318 | * Returns: 0 Success |
2319 | * ENOENT No such file or directory [terminating] |
2320 | */ |
2321 | int |
2322 | vnode_ref(vnode_t vp) |
2323 | { |
2324 | return vnode_ref_ext(vp, 0, 0); |
2325 | } |
2326 | |
2327 | /* |
2328 | * Returns: 0 Success |
2329 | * ENOENT No such file or directory [terminating] |
2330 | */ |
2331 | int |
2332 | vnode_ref_ext(vnode_t vp, int fmode, int flags) |
2333 | { |
2334 | int error = 0; |
2335 | |
2336 | vnode_lock_spin(vp); |
2337 | |
2338 | /* |
2339 | * once all the current call sites have been fixed to insure they have |
2340 | * taken an iocount, we can toughen this assert up and insist that the |
2341 | * iocount is non-zero... a non-zero usecount doesn't insure correctness |
2342 | */ |
2343 | if (vp->v_iocount <= 0 && vp->v_usecount <= 0) { |
2344 | panic("vnode_ref_ext: vp %p has no valid reference %d, %d" , vp, vp->v_iocount, vp->v_usecount); |
2345 | } |
2346 | |
2347 | /* |
2348 | * if you are the owner of drain/termination, can acquire usecount |
2349 | */ |
2350 | if ((flags & VNODE_REF_FORCE) == 0) { |
2351 | if ((vp->v_lflag & (VL_DRAIN | VL_TERMINATE | VL_DEAD))) { |
2352 | if (vp->v_owner != current_thread()) { |
2353 | error = ENOENT; |
2354 | goto out; |
2355 | } |
2356 | } |
2357 | } |
2358 | |
2359 | /* Enable atomic ops on v_usecount without the vnode lock */ |
2360 | os_atomic_inc(&vp->v_usecount, relaxed); |
2361 | |
2362 | if (fmode & FWRITE) { |
if (++vp->v_writecount <= 0) {
panic("vnode_ref_ext: v_writecount");
}
}
if (fmode & O_EVTONLY) {
if (++vp->v_kusecount <= 0) {
panic("vnode_ref_ext: v_kusecount");
2370 | } |
2371 | } |
2372 | if (vp->v_flag & VRAGE) { |
2373 | struct uthread *ut; |
2374 | |
2375 | ut = current_uthread(); |
2376 | |
2377 | if (!(current_proc()->p_lflag & P_LRAGE_VNODES) && |
2378 | !(ut->uu_flag & UT_RAGE_VNODES)) { |
2379 | /* |
2380 | * a 'normal' process accessed this vnode |
2381 | * so make sure its no longer marked |
2382 | * for rapid aging... also, make sure |
2383 | * it gets removed from the rage list... |
2384 | * when v_usecount drops back to 0, it |
2385 | * will be put back on the real free list |
2386 | */ |
2387 | vp->v_flag &= ~VRAGE; |
2388 | vp->v_references = 0; |
2389 | vnode_list_remove(vp); |
2390 | } |
2391 | } |
2392 | if (vp->v_usecount == 1 && vp->v_type == VREG && !(vp->v_flag & VSYSTEM)) { |
2393 | if (vp->v_ubcinfo) { |
2394 | vnode_lock_convert(vp); |
memory_object_mark_used(vp->v_ubcinfo->ui_control);
2396 | } |
2397 | } |
2398 | out: |
2399 | vnode_unlock(vp); |
2400 | |
2401 | return error; |
2402 | } |
2403 | |
2404 | |
2405 | boolean_t |
2406 | vnode_on_reliable_media(vnode_t vp) |
2407 | { |
2408 | mount_t mp = vp->v_mount; |
2409 | |
2410 | /* |
2411 | * A NULL mountpoint would imply it's not attached to a any filesystem. |
2412 | * This can only happen with a vnode created by bdevvp(). We'll consider |
2413 | * those as not unreliable as the primary use of this function is determine |
2414 | * which vnodes are to be handed off to the async cleaner thread for |
2415 | * reclaim. |
2416 | */ |
2417 | if (!mp || (!(mp->mnt_kern_flag & MNTK_VIRTUALDEV) && (mp->mnt_flag & MNT_LOCAL))) { |
2418 | return TRUE; |
2419 | } |
2420 | |
2421 | return FALSE; |
2422 | } |
2423 | |
2424 | static void |
2425 | vnode_async_list_add_locked(vnode_t vp) |
2426 | { |
2427 | if (VONLIST(vp) || (vp->v_lflag & (VL_TERMINATE | VL_DEAD))) { |
2428 | panic("vnode_async_list_add: %p is in wrong state" , vp); |
2429 | } |
2430 | |
2431 | TAILQ_INSERT_HEAD(&vnode_async_work_list, vp, v_freelist); |
2432 | vp->v_listflag |= VLIST_ASYNC_WORK; |
2433 | |
2434 | async_work_vnodes++; |
2435 | if (!(vp->v_listflag & VLIST_NO_REUSE)) { |
2436 | reusablevnodes++; |
2437 | } |
2438 | if (vp->v_flag & VCANDEALLOC) { |
2439 | os_atomic_dec(&busyvnodes, relaxed); |
2440 | } |
2441 | } |
2442 | |
2443 | static void |
2444 | vnode_async_list_add(vnode_t vp) |
2445 | { |
2446 | vnode_list_lock(); |
2447 | |
2448 | if (VONLIST(vp)) { |
2449 | if (!(vp->v_listflag & VLIST_ASYNC_WORK)) { |
2450 | vnode_list_remove_locked(vp); |
2451 | vnode_async_list_add_locked(vp); |
2452 | } |
2453 | } else { |
2454 | vnode_async_list_add_locked(vp); |
2455 | } |
2456 | |
2457 | vnode_list_unlock(); |
2458 | |
wakeup(&vnode_async_work_list);
2460 | } |
2461 | |
2462 | |
2463 | /* |
2464 | * put the vnode on appropriate free list. |
2465 | * called with vnode LOCKED |
2466 | */ |
2467 | static void |
2468 | vnode_list_add(vnode_t vp) |
2469 | { |
2470 | boolean_t need_dead_wakeup = FALSE; |
2471 | bool no_busy_decrement = false; |
2472 | |
2473 | #if DIAGNOSTIC |
2474 | lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED); |
2475 | #endif |
2476 | |
2477 | again: |
2478 | |
2479 | /* |
2480 | * if it is already on a list or non zero references return |
2481 | */ |
2482 | if (VONLIST(vp) || (vp->v_usecount != 0) || (vp->v_iocount != 0) || (vp->v_lflag & VL_TERMINATE)) { |
2483 | return; |
2484 | } |
2485 | |
2486 | /* |
2487 | * In vclean, we might have deferred ditching locked buffers |
2488 | * because something was still referencing them (indicated by |
2489 | * usecount). We can ditch them now. |
2490 | */ |
2491 | if (ISSET(vp->v_lflag, VL_DEAD) |
2492 | && (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd))) { |
2493 | ++vp->v_iocount; // Probably not necessary, but harmless |
2494 | #ifdef CONFIG_IOCOUNT_TRACE |
2495 | record_vp(vp, 1); |
2496 | #endif |
2497 | vnode_unlock(vp); |
buf_invalidateblks(vp, BUF_INVALIDATE_LOCKED, 0, 0);
2499 | vnode_lock(vp); |
2500 | vnode_dropiocount(vp); |
2501 | goto again; |
2502 | } |
2503 | |
2504 | vnode_list_lock(); |
2505 | |
2506 | if (!(vp->v_lflag & VL_DEAD) && (vp->v_listflag & VLIST_NO_REUSE)) { |
2507 | if (!(vp->v_listflag & VLIST_ASYNC_WORK)) { |
2508 | vnode_async_list_add_locked(vp); |
2509 | } |
2510 | no_busy_decrement = true; |
2511 | } else if ((vp->v_flag & VRAGE) && !(vp->v_lflag & VL_DEAD)) { |
2512 | /* |
2513 | * add the new guy to the appropriate end of the RAGE list |
2514 | */ |
2515 | if ((vp->v_flag & VAGE)) { |
2516 | TAILQ_INSERT_HEAD(&vnode_rage_list, vp, v_freelist); |
2517 | } else { |
2518 | TAILQ_INSERT_TAIL(&vnode_rage_list, vp, v_freelist); |
2519 | } |
2520 | |
2521 | vp->v_listflag |= VLIST_RAGE; |
2522 | ragevnodes++; |
2523 | reusablevnodes++; |
2524 | wakeup_laundry_thread(); |
2525 | |
2526 | /* |
2527 | * reset the timestamp for the last inserted vp on the RAGE |
2528 | * queue to let new_vnode know that its not ok to start stealing |
2529 | * from this list... as long as we're actively adding to this list |
2530 | * we'll push out the vnodes we want to donate to the real free list |
2531 | * once we stop pushing, we'll let some time elapse before we start |
2532 | * stealing them in the new_vnode routine |
2533 | */ |
2534 | microuptime(tv: &rage_tv); |
2535 | } else { |
2536 | /* |
2537 | * if VL_DEAD, insert it at head of the dead list |
2538 | * else insert at tail of LRU list or at head if VAGE is set |
2539 | */ |
2540 | if ((vp->v_lflag & VL_DEAD)) { |
2541 | if (vp->v_flag & VCANDEALLOC) { |
2542 | TAILQ_INSERT_TAIL(&vnode_dead_list, vp, v_freelist); |
2543 | if (vp->v_listflag & VLIST_NO_REUSE) { |
2544 | deadvnodes_noreuse++; |
2545 | } |
2546 | } else { |
2547 | TAILQ_INSERT_HEAD(&vnode_dead_list, vp, v_freelist); |
2548 | } |
2549 | vp->v_listflag |= VLIST_DEAD; |
2550 | deadvnodes++; |
2551 | |
2552 | if (dead_vnode_wanted) { |
2553 | dead_vnode_wanted--; |
2554 | need_dead_wakeup = TRUE; |
2555 | } |
2556 | } else if ((vp->v_flag & VAGE)) { |
2557 | TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); |
2558 | vp->v_flag &= ~VAGE; |
2559 | freevnodes++; |
2560 | reusablevnodes++; |
2561 | wakeup_laundry_thread(); |
2562 | } else { |
2563 | TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); |
2564 | freevnodes++; |
2565 | reusablevnodes++; |
2566 | wakeup_laundry_thread(); |
2567 | } |
2568 | } |
2569 | if ((vp->v_flag & VCANDEALLOC) && !no_busy_decrement) { |
2570 | os_atomic_dec(&busyvnodes, relaxed); |
2571 | } |
2572 | vnode_list_unlock(); |
2573 | |
2574 | if (need_dead_wakeup == TRUE) { |
wakeup_one((caddr_t)&dead_vnode_wanted);
2576 | } |
2577 | } |
2578 | |
2579 | |
2580 | /* |
2581 | * remove the vnode from appropriate free list. |
2582 | * called with vnode LOCKED and |
2583 | * the list lock held |
2584 | */ |
2585 | static void |
2586 | vnode_list_remove_locked(vnode_t vp) |
2587 | { |
2588 | if (VONLIST(vp)) { |
2589 | /* |
2590 | * the v_listflag field is |
2591 | * protected by the vnode_list_lock |
2592 | */ |
if (vp->v_listflag & VLIST_RAGE) {
VREMRAGE("vnode_list_remove", vp);
} else if (vp->v_listflag & VLIST_DEAD) {
VREMDEAD("vnode_list_remove", vp);
wakeup_laundry_thread();
} else if (vp->v_listflag & VLIST_ASYNC_WORK) {
VREMASYNC_WORK("vnode_list_remove", vp);
} else {
VREMFREE("vnode_list_remove", vp);
2602 | } |
2603 | if (vp->v_flag & VCANDEALLOC) { |
2604 | os_atomic_inc(&busyvnodes, relaxed); |
2605 | } |
2606 | } |
2607 | } |
2608 | |
2609 | |
2610 | /* |
2611 | * remove the vnode from appropriate free list. |
2612 | * called with vnode LOCKED |
2613 | */ |
2614 | static void |
2615 | vnode_list_remove(vnode_t vp) |
2616 | { |
2617 | #if DIAGNOSTIC |
2618 | lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED); |
2619 | #endif |
2620 | /* |
2621 | * we want to avoid taking the list lock |
2622 | * in the case where we're not on the free |
2623 | * list... this will be true for most |
2624 | * directories and any currently in use files |
2625 | * |
2626 | * we're guaranteed that we can't go from |
2627 | * the not-on-list state to the on-list |
2628 | * state since we hold the vnode lock... |
2629 | * all calls to vnode_list_add are done |
2630 | * under the vnode lock... so we can |
* check for that condition (the prevalent one)
2632 | * without taking the list lock |
2633 | */ |
2634 | if (VONLIST(vp)) { |
2635 | vnode_list_lock(); |
2636 | /* |
2637 | * however, we're not guaranteed that |
2638 | * we won't go from the on-list state |
2639 | * to the not-on-list state until we |
2640 | * hold the vnode_list_lock... this |
2641 | * is due to "new_vnode" removing vnodes |
* from the free list under the list_lock
2643 | * w/o the vnode lock... so we need to |
2644 | * check again whether we're currently |
2645 | * on the free list |
2646 | */ |
2647 | vnode_list_remove_locked(vp); |
2648 | |
2649 | vnode_list_unlock(); |
2650 | } |
2651 | } |
2652 | |
2653 | |
2654 | void |
2655 | vnode_rele(vnode_t vp) |
2656 | { |
2657 | vnode_rele_internal(vp, 0, 0, 0); |
2658 | } |
2659 | |
2660 | |
2661 | void |
2662 | vnode_rele_ext(vnode_t vp, int fmode, int dont_reenter) |
2663 | { |
2664 | vnode_rele_internal(vp, fmode, dont_reenter, 0); |
2665 | } |
2666 | |
2667 | |
2668 | void |
2669 | vnode_rele_internal(vnode_t vp, int fmode, int dont_reenter, int locked) |
2670 | { |
2671 | int32_t old_usecount; |
2672 | |
2673 | if (!locked) { |
2674 | vnode_hold(vp); |
2675 | vnode_lock_spin(vp); |
2676 | } |
2677 | #if DIAGNOSTIC |
2678 | else { |
2679 | lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED); |
2680 | } |
2681 | #endif |
2682 | /* Enable atomic ops on v_usecount without the vnode lock */ |
2683 | old_usecount = os_atomic_dec_orig(&vp->v_usecount, relaxed); |
2684 | if (old_usecount < 1) { |
2685 | /* |
2686 | * Because we allow atomic ops on usecount (in lookup only, under |
2687 | * specific conditions of already having a usecount) it is |
2688 | * possible that when the vnode is examined, its usecount is |
2689 | * different than what will be printed in this panic message. |
2690 | */ |
2691 | panic("vnode_rele_ext: vp %p usecount -ve : %d. v_tag = %d, v_type = %d, v_flag = %x." , |
2692 | vp, old_usecount - 1, vp->v_tag, vp->v_type, vp->v_flag); |
2693 | } |
2694 | |
2695 | if (fmode & FWRITE) { |
2696 | if (--vp->v_writecount < 0) { |
2697 | panic("vnode_rele_ext: vp %p writecount -ve : %d. v_tag = %d, v_type = %d, v_flag = %x." , vp, vp->v_writecount, vp->v_tag, vp->v_type, vp->v_flag); |
2698 | } |
2699 | } |
2700 | if (fmode & O_EVTONLY) { |
2701 | if (--vp->v_kusecount < 0) { |
2702 | panic("vnode_rele_ext: vp %p kusecount -ve : %d. v_tag = %d, v_type = %d, v_flag = %x." , vp, vp->v_kusecount, vp->v_tag, vp->v_type, vp->v_flag); |
2703 | } |
2704 | } |
2705 | if (vp->v_kusecount > vp->v_usecount) { |
2706 | panic("vnode_rele_ext: vp %p kusecount(%d) out of balance with usecount(%d). v_tag = %d, v_type = %d, v_flag = %x." , vp, vp->v_kusecount, vp->v_usecount, vp->v_tag, vp->v_type, vp->v_flag); |
2707 | } |
2708 | |
2709 | if ((vp->v_iocount > 0) || (vp->v_usecount > 0)) { |
2710 | /* |
2711 | * vnode is still busy... if we're the last |
2712 | * usecount, mark for a future call to VNOP_INACTIVE |
2713 | * when the iocount finally drops to 0 |
2714 | */ |
2715 | if (vp->v_usecount == 0) { |
2716 | vp->v_lflag |= VL_NEEDINACTIVE; |
2717 | vp->v_flag &= ~(VNOCACHE_DATA | VRAOFF | VOPENEVT); |
2718 | } |
2719 | goto done; |
2720 | } |
2721 | vp->v_flag &= ~(VNOCACHE_DATA | VRAOFF | VOPENEVT); |
2722 | |
2723 | if (ISSET(vp->v_lflag, VL_TERMINATE | VL_DEAD) || dont_reenter) { |
2724 | /* |
2725 | * vnode is being cleaned, or |
2726 | * we've requested that we don't reenter |
2727 | * the filesystem on this release...in |
2728 | * the latter case, we'll mark the vnode aged |
2729 | */ |
2730 | if (dont_reenter) { |
2731 | if (!(vp->v_lflag & (VL_TERMINATE | VL_DEAD | VL_MARKTERM))) { |
2732 | vp->v_lflag |= VL_NEEDINACTIVE; |
2733 | |
2734 | if (vnode_on_reliable_media(vp) == FALSE || vp->v_flag & VISDIRTY) { |
2735 | vnode_async_list_add(vp); |
2736 | goto done; |
2737 | } |
2738 | } |
2739 | vp->v_flag |= VAGE; |
2740 | } |
2741 | vnode_list_add(vp); |
2742 | |
2743 | goto done; |
2744 | } |
2745 | /* |
2746 | * at this point both the iocount and usecount |
2747 | * are zero |
2748 | * pick up an iocount so that we can call |
2749 | * VNOP_INACTIVE with the vnode lock unheld |
2750 | */ |
2751 | vp->v_iocount++; |
2752 | #ifdef CONFIG_IOCOUNT_TRACE |
2753 | record_vp(vp, 1); |
2754 | #endif |
2755 | vp->v_lflag &= ~VL_NEEDINACTIVE; |
2756 | |
2757 | if (UBCINFOEXISTS(vp)) { |
2758 | ubc_cs_free_and_vnode_unlock(vp); |
2759 | } else { |
2760 | vnode_unlock(vp); |
2761 | } |
2762 | |
2763 | VNOP_INACTIVE(vp, vfs_context_current()); |
2764 | |
2765 | vnode_lock_spin(vp); |
2766 | |
2767 | /* |
2768 | * because we dropped the vnode lock to call VNOP_INACTIVE |
2769 | * the state of the vnode may have changed... we may have |
2770 | * picked up an iocount, usecount or the MARKTERM may have |
2771 | * been set... we need to reevaluate the reference counts |
2772 | * to determine if we can call vnode_reclaim_internal at |
2773 | * this point... if the reference counts are up, we'll pick |
2774 | * up the MARKTERM state when they get subsequently dropped |
2775 | */ |
2776 | if ((vp->v_iocount == 1) && (vp->v_usecount == 0) && |
2777 | ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM)) { |
2778 | struct uthread *ut; |
2779 | |
2780 | ut = current_uthread(); |
2781 | |
2782 | if (ut->uu_defer_reclaims) { |
2783 | vp->v_defer_reclaimlist = ut->uu_vreclaims; |
2784 | ut->uu_vreclaims = vp; |
2785 | goto done; |
2786 | } |
2787 | vnode_lock_convert(vp); |
2788 | vnode_reclaim_internal(vp, 1, 1, 0); |
2789 | } |
2790 | vnode_dropiocount(vp); |
2791 | vnode_list_add(vp); |
2792 | done: |
2793 | if (vp->v_usecount == 0 && vp->v_type == VREG && !(vp->v_flag & VSYSTEM)) { |
2794 | if (vp->v_ubcinfo) { |
2795 | vnode_lock_convert(vp); |
memory_object_mark_unused(vp->v_ubcinfo->ui_control, (vp->v_flag & VRAGE) == VRAGE);
2797 | } |
2798 | } |
2799 | if (!locked) { |
2800 | vnode_drop_and_unlock(vp); |
2801 | } |
2802 | return; |
2803 | } |
2804 | |
2805 | /* |
2806 | * Remove any vnodes in the vnode table belonging to mount point mp. |
2807 | * |
2808 | * If MNT_NOFORCE is specified, there should not be any active ones, |
2809 | * return error if any are found (nb: this is a user error, not a |
2810 | * system error). If MNT_FORCE is specified, detach any active vnodes |
2811 | * that are found. |
2812 | */ |
2813 | |
2814 | int |
2815 | vflush(struct mount *mp, struct vnode *skipvp, int flags) |
2816 | { |
2817 | struct vnode *vp; |
2818 | int busy = 0; |
2819 | int reclaimed = 0; |
2820 | int retval; |
2821 | unsigned int vid; |
2822 | bool first_try = true; |
2823 | |
2824 | /* |
2825 | * See comments in vnode_iterate() for the rationale for this lock |
2826 | */ |
2827 | mount_iterate_lock(mp); |
2828 | |
2829 | mount_lock(mp); |
2830 | vnode_iterate_setup(mp); |
2831 | /* |
2832 | * On regular unmounts(not forced) do a |
2833 | * quick check for vnodes to be in use. This |
2834 | * preserves the caching of vnodes. automounter |
2835 | * tries unmounting every so often to see whether |
2836 | * it is still busy or not. |
2837 | */ |
2838 | if (((flags & FORCECLOSE) == 0) && ((mp->mnt_kern_flag & MNTK_UNMOUNT_PREFLIGHT) != 0)) { |
2839 | if (vnode_umount_preflight(mp, skipvp, flags)) { |
2840 | vnode_iterate_clear(mp); |
2841 | mount_unlock(mp); |
2842 | mount_iterate_unlock(mp); |
2843 | return EBUSY; |
2844 | } |
2845 | } |
2846 | loop: |
2847 | /* If it returns 0 then there is nothing to do */ |
2848 | retval = vnode_iterate_prepare(mp); |
2849 | |
2850 | if (retval == 0) { |
2851 | vnode_iterate_clear(mp); |
2852 | mount_unlock(mp); |
2853 | mount_iterate_unlock(mp); |
2854 | return retval; |
2855 | } |
2856 | |
2857 | /* iterate over all the vnodes */ |
2858 | while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) { |
2859 | vp = TAILQ_FIRST(&mp->mnt_workerqueue); |
2860 | TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes); |
2861 | TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes); |
2862 | |
2863 | if ((vp->v_mount != mp) || (vp == skipvp)) { |
2864 | continue; |
2865 | } |
2866 | vid = vp->v_id; |
2867 | mount_unlock(mp); |
2868 | |
2869 | vnode_lock_spin(vp); |
2870 | |
2871 | // If vnode is already terminating, wait for it... |
2872 | while (vp->v_id == vid && ISSET(vp->v_lflag, VL_TERMINATE)) { |
2873 | vp->v_lflag |= VL_TERMWANT; |
msleep(&vp->v_lflag, &vp->v_lock, PVFS, "vflush", NULL);
2875 | } |
2876 | |
2877 | if ((vp->v_id != vid) || ISSET(vp->v_lflag, VL_DEAD)) { |
2878 | vnode_unlock(vp); |
2879 | mount_lock(mp); |
2880 | continue; |
2881 | } |
2882 | |
2883 | /* |
2884 | * If requested, skip over vnodes marked VSYSTEM. |
2885 | * Skip over all vnodes marked VNOFLUSH. |
2886 | */ |
2887 | if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) || |
2888 | (vp->v_flag & VNOFLUSH))) { |
2889 | vnode_unlock(vp); |
2890 | mount_lock(mp); |
2891 | continue; |
2892 | } |
2893 | /* |
2894 | * If requested, skip over vnodes marked VSWAP. |
2895 | */ |
2896 | if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) { |
2897 | vnode_unlock(vp); |
2898 | mount_lock(mp); |
2899 | continue; |
2900 | } |
2901 | /* |
2902 | * If requested, skip over vnodes marked VROOT. |
2903 | */ |
2904 | if ((flags & SKIPROOT) && (vp->v_flag & VROOT)) { |
2905 | vnode_unlock(vp); |
2906 | mount_lock(mp); |
2907 | continue; |
2908 | } |
2909 | /* |
2910 | * If WRITECLOSE is set, only flush out regular file |
2911 | * vnodes open for writing. |
2912 | */ |
2913 | if ((flags & WRITECLOSE) && |
2914 | (vp->v_writecount == 0 || vp->v_type != VREG)) { |
2915 | vnode_unlock(vp); |
2916 | mount_lock(mp); |
2917 | continue; |
2918 | } |
2919 | /* |
2920 | * If the real usecount is 0, all we need to do is clear |
2921 | * out the vnode data structures and we are done. |
2922 | */ |
2923 | if (((vp->v_usecount == 0) || |
2924 | ((vp->v_usecount - vp->v_kusecount) == 0))) { |
2925 | vnode_lock_convert(vp); |
2926 | vnode_hold(vp); |
vp->v_iocount++; /* so that drain waits for other iocounts */
2928 | #ifdef CONFIG_IOCOUNT_TRACE |
2929 | record_vp(vp, 1); |
2930 | #endif |
2931 | vnode_reclaim_internal(vp, 1, 1, 0); |
2932 | vnode_dropiocount(vp); |
2933 | vnode_list_add(vp); |
2934 | vnode_drop_and_unlock(vp); |
2935 | |
2936 | reclaimed++; |
2937 | mount_lock(mp); |
2938 | continue; |
2939 | } |
2940 | /* |
2941 | * If FORCECLOSE is set, forcibly close the vnode. |
2942 | * For block or character devices, revert to an |
2943 | * anonymous device. For all other files, just kill them. |
2944 | */ |
2945 | if (flags & FORCECLOSE) { |
2946 | vnode_lock_convert(vp); |
2947 | |
2948 | if (vp->v_type != VBLK && vp->v_type != VCHR) { |
vp->v_iocount++; /* so that drain waits for other iocounts */
2950 | vnode_hold(vp); |
2951 | #ifdef CONFIG_IOCOUNT_TRACE |
2952 | record_vp(vp, 1); |
2953 | #endif |
2954 | vnode_abort_advlocks(vp); |
2955 | vnode_reclaim_internal(vp, 1, 1, 0); |
2956 | vnode_dropiocount(vp); |
2957 | vnode_list_add(vp); |
2958 | vnode_drop_and_unlock(vp); |
2959 | } else { |
2960 | vnode_hold(vp); |
2961 | vp->v_lflag |= VL_OPSCHANGE; |
vclean(vp, 0);
vp->v_lflag &= ~VL_DEAD;
vp->v_op = spec_vnodeop_p;
vp->v_flag |= VDEVFLUSH;
vnode_drop_and_unlock(vp);
wakeup(&vp->v_lflag); /* chkvnlock is waiting for VL_DEAD to get unset */
2968 | } |
2969 | mount_lock(mp); |
2970 | continue; |
2971 | } |
2972 | |
2973 | /* log vnodes blocking unforced unmounts */ |
2974 | if (print_busy_vnodes && first_try && ((flags & FORCECLOSE) == 0)) { |
vprint("vflush - busy vnode", vp);
2976 | } |
2977 | |
2978 | vnode_unlock(vp); |
2979 | mount_lock(mp); |
2980 | busy++; |
2981 | } |
2982 | |
2983 | /* At this point the worker queue is completed */ |
2984 | if (busy && ((flags & FORCECLOSE) == 0) && reclaimed) { |
2985 | busy = 0; |
2986 | reclaimed = 0; |
2987 | (void)vnode_iterate_reloadq(mp); |
2988 | first_try = false; |
2989 | /* returned with mount lock held */ |
2990 | goto loop; |
2991 | } |
2992 | |
2993 | /* if new vnodes were created in between retry the reclaim */ |
2994 | if (vnode_iterate_reloadq(mp) != 0) { |
2995 | if (!(busy && ((flags & FORCECLOSE) == 0))) { |
2996 | first_try = false; |
2997 | goto loop; |
2998 | } |
2999 | } |
3000 | vnode_iterate_clear(mp); |
3001 | mount_unlock(mp); |
3002 | mount_iterate_unlock(mp); |
3003 | |
3004 | if (busy && ((flags & FORCECLOSE) == 0)) { |
3005 | return EBUSY; |
3006 | } |
3007 | return 0; |
3008 | } |
3009 | |
3010 | long num_recycledvnodes = 0; |
3011 | /* |
3012 | * Disassociate the underlying file system from a vnode. |
3013 | * The vnode lock is held on entry. |
3014 | */ |
3015 | static void |
3016 | vclean(vnode_t vp, int flags) |
3017 | { |
3018 | vfs_context_t ctx = vfs_context_current(); |
3019 | int active; |
3020 | int need_inactive; |
3021 | int already_terminating; |
3022 | int clflags = 0; |
3023 | #if NAMEDSTREAMS |
3024 | int is_namedstream; |
3025 | #endif |
3026 | |
3027 | /* |
3028 | * Check to see if the vnode is in use. |
3029 | * If so we have to reference it before we clean it out |
3030 | * so that its count cannot fall to zero and generate a |
3031 | * race against ourselves to recycle it. |
3032 | */ |
3033 | active = vp->v_usecount; |
3034 | |
3035 | /* |
3036 | * just in case we missed sending a needed |
3037 | * VNOP_INACTIVE, we'll do it now |
3038 | */ |
3039 | need_inactive = (vp->v_lflag & VL_NEEDINACTIVE); |
3040 | |
3041 | vp->v_lflag &= ~VL_NEEDINACTIVE; |
3042 | |
3043 | /* |
3044 | * Prevent the vnode from being recycled or |
3045 | * brought into use while we clean it out. |
3046 | */ |
3047 | already_terminating = (vp->v_lflag & VL_TERMINATE); |
3048 | |
3049 | vp->v_lflag |= VL_TERMINATE; |
3050 | |
3051 | #if NAMEDSTREAMS |
3052 | is_namedstream = vnode_isnamedstream(vp); |
3053 | #endif |
3054 | |
3055 | vnode_unlock(vp); |
3056 | |
3057 | OSAddAtomicLong(1, &num_recycledvnodes); |
3058 | |
3059 | if (flags & DOCLOSE) { |
3060 | clflags |= IO_NDELAY; |
3061 | } |
3062 | if (flags & REVOKEALL) { |
3063 | clflags |= IO_REVOKE; |
3064 | } |
3065 | |
3066 | #if CONFIG_MACF |
3067 | if (vp->v_mount) { |
3068 | /* |
3069 | * It is possible for bdevvp vnodes to not have a mount |
3070 | * pointer. It's fine to let it get reclaimed without |
3071 | * notifying. |
3072 | */ |
3073 | mac_vnode_notify_reclaim(vp); |
3074 | } |
3075 | #endif |
3076 | |
3077 | if (active && (flags & DOCLOSE)) { |
3078 | VNOP_CLOSE(vp, clflags, ctx); |
3079 | } |
3080 | |
3081 | /* |
3082 | * Clean out any buffers associated with the vnode. |
3083 | */ |
3084 | if (flags & DOCLOSE) { |
3085 | if (vp->v_tag == VT_NFS) { |
3086 | nfs_vinvalbuf(vp, V_SAVE, ctx, 0); |
3087 | } else { |
3088 | VNOP_FSYNC(vp, MNT_WAIT, ctx); |
3089 | |
3090 | /* |
3091 | * If the vnode is still in use (by the journal for |
3092 | * example) we don't want to invalidate locked buffers |
3093 | * here. In that case, either the journal will tidy them |
3094 | * up, or we will deal with it when the usecount is |
3095 | * finally released in vnode_rele_internal. |
3096 | */ |
buf_invalidateblks(vp, BUF_WRITE_DATA | (active ? 0 : BUF_INVALIDATE_LOCKED), 0, 0);
3098 | } |
3099 | if (UBCINFOEXISTS(vp)) { |
3100 | /* |
3101 | * Clean the pages in VM. |
3102 | */ |
3103 | (void)ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL, UBC_PUSHALL | UBC_INVALIDATE | UBC_SYNC); |
3104 | } |
3105 | } |
3106 | if (active || need_inactive) { |
3107 | VNOP_INACTIVE(vp, ctx); |
3108 | } |
3109 | |
3110 | #if NAMEDSTREAMS |
3111 | if ((is_namedstream != 0) && (vp->v_parent != NULLVP)) { |
3112 | vnode_t pvp = vp->v_parent; |
3113 | |
3114 | /* Delete the shadow stream file before we reclaim its vnode */ |
3115 | if (vnode_isshadow(vp)) { |
vnode_relenamedstream(pvp, vp);
3117 | } |
3118 | |
3119 | /* |
3120 | * No more streams associated with the parent. We |
3121 | * have a ref on it, so its identity is stable. |
3122 | * If the parent is on an opaque volume, then we need to know |
3123 | * whether it has associated named streams. |
3124 | */ |
if (vfs_authopaque(pvp->v_mount)) {
3126 | vnode_lock_spin(pvp); |
3127 | pvp->v_lflag &= ~VL_HASSTREAMS; |
3128 | vnode_unlock(pvp); |
3129 | } |
3130 | } |
3131 | #endif |
3132 | |
3133 | vm_object_destroy_reason_t reason = VM_OBJECT_DESTROY_UNKNOWN_REASON; |
3134 | bool forced_unmount = vnode_mount(vp) != NULL && (vnode_mount(vp)->mnt_lflag & MNT_LFORCE) != 0; |
3135 | bool ungraft_heuristic = flags & REVOKEALL; |
3136 | if (forced_unmount) { |
3137 | reason = VM_OBJECT_DESTROY_FORCED_UNMOUNT; |
3138 | } else if (ungraft_heuristic) { |
3139 | reason = VM_OBJECT_DESTROY_UNGRAFT; |
3140 | } |
3141 | |
3142 | /* |
3143 | * Destroy ubc named reference |
3144 | * cluster_release is done on this path |
3145 | * along with dropping the reference on the ucred |
3146 | * (and in the case of forced unmount of an mmap-ed file, |
3147 | * the ubc reference on the vnode is dropped here too). |
3148 | */ |
3149 | ubc_destroy_named(vp, reason); |
3150 | |
3151 | #if CONFIG_TRIGGERS |
3152 | /* |
3153 | * cleanup trigger info from vnode (if any) |
3154 | */ |
3155 | if (vp->v_resolve) { |
3156 | vnode_resolver_detach(vp); |
3157 | } |
3158 | #endif |
3159 | |
3160 | #if CONFIG_IO_COMPRESSION_STATS |
3161 | if ((vp->io_compression_stats)) { |
3162 | vnode_iocs_record_and_free(vp); |
3163 | } |
3164 | #endif /* CONFIG_IO_COMPRESSION_STATS */ |
3165 | |
3166 | /* |
3167 | * Reclaim the vnode. |
3168 | */ |
if (VNOP_RECLAIM(vp, ctx)) {
panic("vclean: cannot reclaim");
}

// make sure the name & parent ptrs get cleaned out!
vnode_update_identity(vp, NULLVP, NULL, 0, 0, VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME | VNODE_UPDATE_PURGE | VNODE_UPDATE_PURGEFIRMLINK);
3175 | |
3176 | vnode_lock(vp); |
3177 | |
3178 | /* |
3179 | * Remove the vnode from any mount list it might be on. It is not |
3180 | * safe to do this any earlier because unmount needs to wait for |
3181 | * any vnodes to terminate and it cannot do that if it cannot find |
3182 | * them. |
3183 | */ |
insmntque(vp, (struct mount *)0);
3185 | |
3186 | vp->v_lflag |= VL_DEAD; |
3187 | vp->v_mount = dead_mountp; |
3188 | vp->v_op = dead_vnodeop_p; |
3189 | vp->v_tag = VT_NON; |
3190 | vp->v_data = NULL; |
3191 | |
3192 | vp->v_flag &= ~VISDIRTY; |
3193 | |
3194 | if (already_terminating == 0) { |
3195 | vp->v_lflag &= ~VL_TERMINATE; |
3196 | /* |
3197 | * Done with purge, notify sleepers of the grim news. |
3198 | */ |
3199 | if (vp->v_lflag & VL_TERMWANT) { |
3200 | vp->v_lflag &= ~VL_TERMWANT; |
wakeup(&vp->v_lflag);
3202 | } |
3203 | } |
3204 | } |
3205 | |
3206 | /* |
3207 | * Eliminate all activity associated with the requested vnode |
3208 | * and with all vnodes aliased to the requested vnode. |
3209 | */ |
3210 | int |
3211 | #if DIAGNOSTIC |
3212 | vn_revoke(vnode_t vp, int flags, __unused vfs_context_t a_context) |
3213 | #else |
3214 | vn_revoke(vnode_t vp, __unused int flags, __unused vfs_context_t a_context) |
3215 | #endif |
3216 | { |
3217 | struct vnode *vq; |
3218 | int vid; |
3219 | |
3220 | #if DIAGNOSTIC |
3221 | if ((flags & REVOKEALL) == 0) { |
3222 | panic("vnop_revoke" ); |
3223 | } |
3224 | #endif |
3225 | |
3226 | if (vnode_isaliased(vp)) { |
3227 | /* |
3228 | * If a vgone (or vclean) is already in progress, |
3229 | * return an immediate error |
3230 | */ |
3231 | if (vp->v_lflag & VL_TERMINATE) { |
3232 | return ENOENT; |
3233 | } |
3234 | |
3235 | /* |
3236 | * Ensure that vp will not be vgone'd while we |
3237 | * are eliminating its aliases. |
3238 | */ |
3239 | SPECHASH_LOCK(); |
3240 | while ((vp->v_specflags & SI_ALIASED)) { |
3241 | for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { |
3242 | if (vq->v_rdev != vp->v_rdev || |
3243 | vq->v_type != vp->v_type || vp == vq) { |
3244 | continue; |
3245 | } |
3246 | vid = vq->v_id; |
vnode_hold(vq);
SPECHASH_UNLOCK();
if (vnode_getwithvid(vq, vid)) {
vq = vnode_drop(vq);
3251 | SPECHASH_LOCK(); |
3252 | break; |
3253 | } |
3254 | vnode_lock(vq); |
3255 | if (!(vq->v_lflag & VL_TERMINATE)) { |
3256 | vnode_reclaim_internal(vq, 1, 1, 0); |
3257 | } |
3258 | vnode_put_locked(vq); |
3259 | vq = vnode_drop_and_unlock(vq); |
3260 | SPECHASH_LOCK(); |
3261 | break; |
3262 | } |
3263 | } |
3264 | SPECHASH_UNLOCK(); |
3265 | } |
3266 | vnode_lock(vp); |
3267 | if (vp->v_lflag & VL_TERMINATE) { |
3268 | vnode_unlock(vp); |
3269 | return ENOENT; |
3270 | } |
3271 | vnode_reclaim_internal(vp, 1, 0, REVOKEALL); |
3272 | vnode_unlock(vp); |
3273 | |
3274 | return 0; |
3275 | } |
3276 | |
3277 | /* |
3278 | * Recycle an unused vnode to the front of the free list. |
* If the vnode is busy, just mark it (VL_MARKTERM) for a deferred reclaim.
3280 | */ |
3281 | int |
3282 | vnode_recycle(struct vnode *vp) |
3283 | { |
3284 | vnode_lock_spin(vp); |
3285 | |
3286 | if (vp->v_iocount || vp->v_usecount) { |
3287 | vp->v_lflag |= VL_MARKTERM; |
3288 | vnode_unlock(vp); |
3289 | return 0; |
3290 | } |
3291 | vnode_lock_convert(vp); |
3292 | vnode_hold(vp); |
3293 | vnode_reclaim_internal(vp, 1, 0, 0); |
3294 | |
3295 | vnode_drop_and_unlock(vp); |
3296 | |
3297 | return 1; |
3298 | } |
3299 | |
3300 | static int |
3301 | vnode_reload(vnode_t vp) |
3302 | { |
3303 | vnode_lock_spin(vp); |
3304 | |
3305 | if ((vp->v_iocount > 1) || vp->v_usecount) { |
3306 | vnode_unlock(vp); |
3307 | return 0; |
3308 | } |
3309 | if (vp->v_iocount <= 0) { |
3310 | panic("vnode_reload with no iocount %d" , vp->v_iocount); |
3311 | } |
3312 | |
/* mark for release when iocount is dropped */
3314 | vp->v_lflag |= VL_MARKTERM; |
3315 | vnode_unlock(vp); |
3316 | |
3317 | return 1; |
3318 | } |
3319 | |
3320 | |
3321 | static void |
3322 | vgone(vnode_t vp, int flags) |
3323 | { |
3324 | struct vnode *vq; |
3325 | struct vnode *vx; |
3326 | |
3327 | /* |
3328 | * Clean out the filesystem specific data. |
3329 | * vclean also takes care of removing the |
3330 | * vnode from any mount list it might be on |
3331 | */ |
vclean(vp, flags | DOCLOSE);
3333 | |
3334 | /* |
3335 | * If special device, remove it from special device alias list |
3336 | * if it is on one. |
3337 | */ |
3338 | if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) { |
3339 | SPECHASH_LOCK(); |
3340 | if (*vp->v_hashchain == vp) { |
3341 | *vp->v_hashchain = vp->v_specnext; |
3342 | } else { |
3343 | for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { |
3344 | if (vq->v_specnext != vp) { |
3345 | continue; |
3346 | } |
3347 | vq->v_specnext = vp->v_specnext; |
3348 | break; |
3349 | } |
3350 | if (vq == NULL) { |
3351 | panic("missing bdev" ); |
3352 | } |
3353 | } |
3354 | if (vp->v_specflags & SI_ALIASED) { |
3355 | vx = NULL; |
3356 | for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { |
3357 | if (vq->v_rdev != vp->v_rdev || |
3358 | vq->v_type != vp->v_type) { |
3359 | continue; |
3360 | } |
3361 | if (vx) { |
3362 | break; |
3363 | } |
3364 | vx = vq; |
3365 | } |
3366 | if (vx == NULL) { |
3367 | panic("missing alias" ); |
3368 | } |
3369 | if (vq == NULL) { |
3370 | vx->v_specflags &= ~SI_ALIASED; |
3371 | } |
3372 | vp->v_specflags &= ~SI_ALIASED; |
3373 | } |
3374 | SPECHASH_UNLOCK(); |
3375 | { |
3376 | struct specinfo *tmp = vp->v_specinfo; |
3377 | vp->v_specinfo = NULL; |
3378 | zfree(specinfo_zone, tmp); |
3379 | } |
3380 | } |
3381 | } |
3382 | |
3383 | /* |
3384 | * internal helper function only! |
3385 | * vend an _iocounted_ vnode via output argument, or return an error if unable. |
3386 | */ |
3387 | static int |
3388 | get_vp_from_dev(dev_t dev, enum vtype type, vnode_t *outvp) |
3389 | { |
3390 | vnode_t vp; |
3391 | int vid; |
3392 | |
3393 | loop: |
3394 | SPECHASH_LOCK(); |
3395 | for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) { |
3396 | if (dev != vp->v_rdev || type != vp->v_type) { |
3397 | continue; |
3398 | } |
3399 | vid = vp->v_id; |
3400 | vnode_hold(vp); |
3401 | SPECHASH_UNLOCK(); |
3402 | |
3403 | /* acquire iocount */ |
3404 | if (vnode_getwithvid(vp, vid)) { |
3405 | vnode_drop(vp); |
3406 | goto loop; |
3407 | } |
3408 | vnode_drop(vp); |
3409 | |
3410 | /* Vend iocounted vnode */ |
3411 | *outvp = vp; |
3412 | return 0; |
3413 | } |
3414 | |
3415 | /* vnode not found, error out */ |
3416 | SPECHASH_UNLOCK(); |
3417 | return ENOENT; |
3418 | } |
3419 | |
3420 | |
3421 | |
3422 | /* |
3423 | * Lookup a vnode by device number. |
3424 | */ |
3425 | int |
3426 | check_mountedon(dev_t dev, enum vtype type, int *errorp) |
3427 | { |
3428 | vnode_t vp = NULLVP; |
3429 | int rc = 0; |
3430 | |
rc = get_vp_from_dev(dev, type, &vp);
3432 | if (rc) { |
3433 | /* if no vnode found, it cannot be mounted on */ |
3434 | return 0; |
3435 | } |
3436 | |
3437 | /* otherwise, examine it */ |
3438 | vnode_lock_spin(vp); |
3439 | /* note: exclude the iocount we JUST got (e.g. >1, not >0) */ |
3440 | if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) { |
3441 | vnode_unlock(vp); |
3442 | if ((*errorp = vfs_mountedon(vp)) != 0) { |
3443 | rc = 1; |
3444 | } |
3445 | } else { |
3446 | vnode_unlock(vp); |
3447 | } |
3448 | /* release iocount! */ |
3449 | vnode_put(vp); |
3450 | |
3451 | return rc; |
3452 | } |
3453 | |
3454 | extern dev_t chrtoblk(dev_t d); |
3455 | |
3456 | /* |
3457 | * Examine the supplied vnode's dev_t and find its counterpart |
 * (e.g. VCHR => VBLK) to compare against.
3459 | */ |
3460 | static int |
3461 | vnode_cmp_paired_dev(vnode_t vp, vnode_t bdev_vp, enum vtype in_type, |
3462 | enum vtype out_type) |
3463 | { |
3464 | if (!vp || !bdev_vp) { |
3465 | return EINVAL; |
3466 | } |
3467 | /* Verify iocounts */ |
	if (vnode_iocount(vp) <= 0 ||
	    vnode_iocount(bdev_vp) <= 0) {
3470 | return EINVAL; |
3471 | } |
3472 | |
3473 | /* check for basic matches */ |
3474 | if (vnode_vtype(vp) != in_type) { |
3475 | return EINVAL; |
3476 | } |
	if (vnode_vtype(bdev_vp) != out_type) {
3478 | return EINVAL; |
3479 | } |
3480 | |
3481 | dev_t dev = vnode_specrdev(vp); |
	dev_t blk_devt = vnode_specrdev(bdev_vp);
3483 | |
3484 | if (in_type == VCHR) { |
3485 | if (out_type != VBLK) { |
3486 | return EINVAL; |
3487 | } |
		dev_t bdev = chrtoblk(dev);
3489 | if (bdev == NODEV) { |
3490 | return EINVAL; |
3491 | } else if (bdev == blk_devt) { |
3492 | return 0; |
3493 | } |
3494 | //fall through |
3495 | } |
3496 | /* |
3497 | * else case: |
3498 | * |
3499 | * in_type == VBLK? => VCHR? |
3500 | * not implemented... |
 * exercise to the reader: this can be built by
 * taking the device's major, and iterating the `chrtoblktab`
 * array to look for a value that matches; a sketch follows
 * this function.
3504 | */ |
3505 | return EINVAL; |
3506 | } |
3507 | /* |
3508 | * Vnode compare: does the supplied vnode's CHR device, match the dev_t |
3509 | * of the accompanying `blk_vp` ? |
3510 | * NOTE: vnodes MUST be iocounted BEFORE calling this! |
3511 | */ |
3512 | |
3513 | int |
3514 | vnode_cmp_chrtoblk(vnode_t vp, vnode_t blk_vp) |
3515 | { |
	return vnode_cmp_paired_dev(vp, blk_vp, VCHR, VBLK);
3517 | } |
3518 | |
3519 | |
3520 | |
3521 | /* |
3522 | * Calculate the total number of references to a special device. |
3523 | */ |
3524 | int |
3525 | vcount(vnode_t vp) |
3526 | { |
3527 | vnode_t vq, vnext; |
3528 | int count; |
3529 | int vid; |
3530 | |
3531 | if (!vnode_isspec(vp)) { |
3532 | return vp->v_usecount - vp->v_kusecount; |
3533 | } |
3534 | |
3535 | loop: |
3536 | if (!vnode_isaliased(vp)) { |
3537 | return vp->v_specinfo->si_opencount; |
3538 | } |
3539 | count = 0; |
3540 | |
3541 | SPECHASH_LOCK(); |
3542 | /* |
3543 | * Grab first vnode and its vid. |
3544 | */ |
3545 | vq = *vp->v_hashchain; |
3546 | if (vq) { |
3547 | vid = vq->v_id; |
		vnode_hold(vq);
3549 | } else { |
3550 | vid = 0; |
3551 | } |
3552 | SPECHASH_UNLOCK(); |
3553 | |
3554 | while (vq) { |
3555 | /* |
3556 | * Attempt to get the vnode outside the SPECHASH lock. |
3557 | * Don't take iocount on 'vp' as iocount is already held by the caller. |
3558 | */ |
		if ((vq != vp) && vnode_getwithvid(vq, vid)) {
			vnode_drop(vq);
			goto loop;
		}
		vnode_drop(vq);
3564 | vnode_lock(vq); |
3565 | |
3566 | if (vq->v_rdev == vp->v_rdev && vq->v_type == vp->v_type) { |
3567 | if ((vq->v_usecount == 0) && (vq->v_iocount == 1) && vq != vp) { |
3568 | /* |
3569 | * Alias, but not in use, so flush it out. |
3570 | */ |
				vnode_hold(vq);
3572 | vnode_reclaim_internal(vq, 1, 1, 0); |
3573 | vnode_put_locked(vq); |
3574 | vnode_drop_and_unlock(vq); |
3575 | goto loop; |
3576 | } |
3577 | count += vq->v_specinfo->si_opencount; |
3578 | } |
3579 | vnode_unlock(vq); |
3580 | |
3581 | SPECHASH_LOCK(); |
3582 | /* |
3583 | * must do this with the reference still held on 'vq' |
3584 | * so that it can't be destroyed while we're poking |
3585 | * through v_specnext |
3586 | */ |
3587 | vnext = vq->v_specnext; |
3588 | if (vnext) { |
3589 | vid = vnext->v_id; |
			vnode_hold(vnext);
3591 | } else { |
3592 | vid = 0; |
3593 | } |
3594 | SPECHASH_UNLOCK(); |
3595 | |
3596 | if (vq != vp) { |
			vnode_put(vq);
3598 | } |
3599 | |
3600 | vq = vnext; |
3601 | } |
3602 | |
3603 | return count; |
3604 | } |
3605 | |
3606 | int prtactive = 0; /* 1 => print out reclaim of active vnodes */ |
3607 | |
3608 | /* |
3609 | * Print out a description of a vnode. |
3610 | */ |
static const char *typename[] =
{ "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };
3613 | |
3614 | void |
3615 | vprint(const char *label, struct vnode *vp) |
3616 | { |
3617 | char sbuf[64]; |
3618 | |
3619 | if (label != NULL) { |
3620 | printf("%s: " , label); |
3621 | } |
3622 | printf("name %s type %s, usecount %d, writecount %d\n" , |
3623 | vp->v_name, typename[vp->v_type], |
3624 | vp->v_usecount, vp->v_writecount); |
3625 | sbuf[0] = '\0'; |
	if (vp->v_flag & VROOT) {
		strlcat(sbuf, "|VROOT", sizeof(sbuf));
	}
	if (vp->v_flag & VTEXT) {
		strlcat(sbuf, "|VTEXT", sizeof(sbuf));
	}
	if (vp->v_flag & VSYSTEM) {
		strlcat(sbuf, "|VSYSTEM", sizeof(sbuf));
	}
	if (vp->v_flag & VNOFLUSH) {
		strlcat(sbuf, "|VNOFLUSH", sizeof(sbuf));
	}
	if (vp->v_flag & VBWAIT) {
		strlcat(sbuf, "|VBWAIT", sizeof(sbuf));
	}
	if (vnode_isaliased(vp)) {
		strlcat(sbuf, "|VALIASED", sizeof(sbuf));
	}
3644 | if (sbuf[0] != '\0') { |
3645 | printf("vnode flags (%s\n" , &sbuf[1]); |
3646 | } |
3647 | } |
3648 | |
3649 | static int |
3650 | vn_getpath_flags_to_buildpath_flags(int flags) |
3651 | { |
3652 | int bpflags = (flags & VN_GETPATH_FSENTER) ? 0 : BUILDPATH_NO_FS_ENTER; |
3653 | |
3654 | if (flags && (flags != VN_GETPATH_FSENTER)) { |
3655 | if (flags & VN_GETPATH_NO_FIRMLINK) { |
3656 | bpflags |= BUILDPATH_NO_FIRMLINK; |
3657 | } |
3658 | if (flags & VN_GETPATH_VOLUME_RELATIVE) { |
3659 | bpflags |= (BUILDPATH_VOLUME_RELATIVE | |
3660 | BUILDPATH_NO_FIRMLINK); |
3661 | } |
3662 | if (flags & VN_GETPATH_NO_PROCROOT) { |
3663 | bpflags |= BUILDPATH_NO_PROCROOT; |
3664 | } |
3665 | if (flags & VN_GETPATH_CHECK_MOVED) { |
3666 | bpflags |= BUILDPATH_CHECK_MOVED; |
3667 | } |
3668 | } |
3669 | |
3670 | return bpflags; |
3671 | } |
3672 | |
3673 | int |
3674 | vn_getpath_ext_with_mntlen(struct vnode *vp, struct vnode *dvp, char *pathbuf, |
3675 | size_t *len, size_t *mntlen, int flags) |
3676 | { |
3677 | int bpflags = vn_getpath_flags_to_buildpath_flags(flags); |
3678 | int local_len; |
3679 | int error; |
3680 | |
3681 | if (*len > INT_MAX) { |
3682 | return EINVAL; |
3683 | } |
3684 | |
3685 | local_len = *len; |
3686 | |
3687 | error = build_path_with_parent(vp, dvp, pathbuf, local_len, &local_len, |
3688 | mntlen, bpflags, vfs_context_current()); |
3689 | |
3690 | if (local_len >= 0 && local_len <= (int)*len) { |
3691 | *len = (size_t)local_len; |
3692 | } |
3693 | |
3694 | return error; |
3695 | } |
3696 | |
3697 | int |
3698 | vn_getpath_ext(struct vnode *vp, struct vnode *dvp, char *pathbuf, size_t *len, |
3699 | int flags) |
3700 | { |
3701 | return vn_getpath_ext_with_mntlen(vp, dvp, pathbuf, len, NULL, flags); |
3702 | } |
3703 | |
3704 | /* |
3705 | * Wrapper around vn_getpath_ext() that takes care of the int * <-> size_t * |
3706 | * conversion for the legacy KPIs. |
3707 | */ |
3708 | static int |
3709 | vn_getpath_ext_int(struct vnode *vp, struct vnode *dvp, char *pathbuf, |
3710 | int *len, int flags) |
3711 | { |
3712 | size_t slen = *len; |
3713 | int error; |
3714 | |
3715 | if (*len < 0) { |
3716 | return EINVAL; |
3717 | } |
3718 | |
	error = vn_getpath_ext(vp, dvp, pathbuf, &slen, flags);
3720 | |
3721 | if (slen <= INT_MAX) { |
3722 | *len = (int)slen; |
3723 | } |
3724 | |
3725 | return error; |
3726 | } |
3727 | |
3728 | int |
3729 | vn_getpath(struct vnode *vp, char *pathbuf, int *len) |
3730 | { |
	return vn_getpath_ext_int(vp, NULL, pathbuf, len, 0);
3732 | } |
3733 | |
3734 | int |
3735 | vn_getpath_fsenter(struct vnode *vp, char *pathbuf, int *len) |
3736 | { |
3737 | return vn_getpath_ext_int(vp, NULL, pathbuf, len, VN_GETPATH_FSENTER); |
3738 | } |
3739 | |
3740 | /* |
3741 | * vn_getpath_fsenter_with_parent will reenter the file system to fine the path of the |
3742 | * vnode. It requires that there are IO counts on both the vnode and the directory vnode. |
3743 | * |
3744 | * vn_getpath_fsenter is called by MAC hooks to authorize operations for every thing, but |
3745 | * unlink, rmdir and rename. For these operation the MAC hook calls vn_getpath. This presents |
3746 | * problems where if the path can not be found from the name cache, those operations can |
3747 | * erroneously fail with EPERM even though the call should succeed. When removing or moving |
3748 | * file system objects with operations such as unlink or rename, those operations need to |
3749 | * take IO counts on the target and containing directory. Calling vn_getpath_fsenter from a |
3750 | * MAC hook from these operations during forced unmount operations can lead to dead |
3751 | * lock. This happens when the operation starts, IO counts are taken on the containing |
3752 | * directories and targets. Before the MAC hook is called a forced unmount from another |
3753 | * thread takes place and blocks on the on going operation's directory vnode in vdrain. |
3754 | * After which, the MAC hook gets called and calls vn_getpath_fsenter. vn_getpath_fsenter |
3755 | * is called with the understanding that there is an IO count on the target. If in |
3756 | * build_path the directory vnode is no longer in the cache, then the parent object id via |
3757 | * vnode_getattr from the target is obtain and used to call VFS_VGET to get the parent |
3758 | * vnode. The file system's VFS_VGET then looks up by inode in its hash and tries to get |
3759 | * an IO count. But VFS_VGET "sees" the directory vnode is in vdrain and can block |
3760 | * depending on which version and how it calls the vnode_get family of interfaces. |
3761 | * |
3762 | * N.B. A reasonable interface to use is vnode_getwithvid. This interface was modified to |
3763 | * call vnode_getiocount with VNODE_DRAINO, so it will happily get an IO count and not |
3764 | * cause issues, but there is no guarantee that all or any file systems are doing that. |
3765 | * |
3766 | * vn_getpath_fsenter_with_parent can enter the file system safely since there is a known |
3767 | * IO count on the directory vnode by calling build_path_with_parent. |
3768 | */ |
3769 | |
3770 | int |
3771 | vn_getpath_fsenter_with_parent(struct vnode *dvp, struct vnode *vp, char *pathbuf, int *len) |
3772 | { |
3773 | return build_path_with_parent(vp, dvp, pathbuf, *len, len, NULL, 0, vfs_context_current()); |
3774 | } |
3775 | |
3776 | int |
3777 | vn_getpath_no_firmlink(struct vnode *vp, char *pathbuf, int *len) |
3778 | { |
3779 | return vn_getpath_ext_int(vp, NULLVP, pathbuf, len, |
3780 | VN_GETPATH_NO_FIRMLINK); |
3781 | } |
3782 | |
3783 | int |
3784 | vn_getcdhash(struct vnode *vp, off_t offset, unsigned char *cdhash) |
3785 | { |
3786 | return ubc_cs_getcdhash(vp, offset, cdhash); |
3787 | } |
3788 | |
3789 | |
3790 | static char *extension_table = NULL; |
3791 | static int nexts; |
3792 | static int max_ext_width; |
3793 | |
3794 | static int |
3795 | extension_cmp(const void *a, const void *b) |
3796 | { |
	return (int)(strlen((const char *)a) - strlen((const char *)b));
3798 | } |
3799 | |
3800 | |
3801 | // |
3802 | // This is the api LaunchServices uses to inform the kernel |
3803 | // the list of package extensions to ignore. |
3804 | // |
3805 | // Internally we keep the list sorted by the length of the |
3806 | // the extension (from longest to shortest). We sort the |
3807 | // list of extensions so that we can speed up our searches |
3808 | // when comparing file names -- we only compare extensions |
3809 | // that could possibly fit into the file name, not all of |
3810 | // them (i.e. a short 8 character name can't have an 8 |
3811 | // character extension). |
3812 | // |
3813 | extern lck_mtx_t pkg_extensions_lck; |
3814 | |
3815 | __private_extern__ int |
3816 | set_package_extensions_table(user_addr_t data, int nentries, int maxwidth) |
3817 | { |
3818 | char *new_exts, *old_exts; |
3819 | int old_nentries = 0, old_maxwidth = 0; |
3820 | int error; |
3821 | |
3822 | if (nentries <= 0 || nentries > 1024 || maxwidth <= 0 || maxwidth > 255) { |
3823 | return EINVAL; |
3824 | } |
3825 | |
3826 | |
3827 | // allocate one byte extra so we can guarantee null termination |
3828 | new_exts = kalloc_data((nentries * maxwidth) + 1, Z_WAITOK); |
3829 | if (new_exts == NULL) { |
3830 | return ENOMEM; |
3831 | } |
3832 | |
3833 | error = copyin(data, new_exts, nentries * maxwidth); |
3834 | if (error) { |
3835 | kfree_data(new_exts, (nentries * maxwidth) + 1); |
3836 | return error; |
3837 | } |
3838 | |
3839 | new_exts[(nentries * maxwidth)] = '\0'; // guarantee null termination of the block |
3840 | |
	qsort(new_exts, nentries, maxwidth, extension_cmp);
3842 | |
	lck_mtx_lock(&pkg_extensions_lck);
3844 | |
3845 | old_exts = extension_table; |
3846 | old_nentries = nexts; |
3847 | old_maxwidth = max_ext_width; |
3848 | extension_table = new_exts; |
3849 | nexts = nentries; |
3850 | max_ext_width = maxwidth; |
3851 | |
	lck_mtx_unlock(&pkg_extensions_lck);
3853 | |
3854 | kfree_data(old_exts, (old_nentries * old_maxwidth) + 1); |
3855 | |
3856 | return 0; |
3857 | } |
3858 | |
3859 | |
3860 | int |
3861 | is_package_name(const char *name, int len) |
3862 | { |
3863 | int i; |
3864 | size_t extlen; |
3865 | const char *ptr, *name_ext; |
3866 | |
	// if the name is 3 bytes or less it can't be of the
	// form A.B and if it begins with a "." then it is also
	// not a package.
3870 | if (len <= 3 || name[0] == '.') { |
3871 | return 0; |
3872 | } |
3873 | |
3874 | name_ext = NULL; |
3875 | for (ptr = name; *ptr != '\0'; ptr++) { |
3876 | if (*ptr == '.') { |
3877 | name_ext = ptr; |
3878 | } |
3879 | } |
3880 | |
3881 | // if there is no "." extension, it can't match |
3882 | if (name_ext == NULL) { |
3883 | return 0; |
3884 | } |
3885 | |
3886 | // advance over the "." |
3887 | name_ext++; |
3888 | |
	lck_mtx_lock(&pkg_extensions_lck);
3890 | |
3891 | // now iterate over all the extensions to see if any match |
3892 | ptr = &extension_table[0]; |
3893 | for (i = 0; i < nexts; i++, ptr += max_ext_width) { |
		extlen = strlen(ptr);
		if (strncasecmp(name_ext, ptr, extlen) == 0 && name_ext[extlen] == '\0') {
3896 | // aha, a match! |
			lck_mtx_unlock(&pkg_extensions_lck);
3898 | return 1; |
3899 | } |
3900 | } |
3901 | |
	lck_mtx_unlock(&pkg_extensions_lck);
3903 | |
3904 | // if we get here, no extension matched |
3905 | return 0; |
3906 | } |
3907 | |
3908 | int |
3909 | vn_path_package_check(__unused vnode_t vp, char *path, int pathlen, int *component) |
3910 | { |
3911 | char *ptr, *end; |
3912 | int comp = 0; |
3913 | |
3914 | if (pathlen < 0) { |
3915 | return EINVAL; |
3916 | } |
3917 | |
3918 | *component = -1; |
3919 | if (*path != '/') { |
3920 | return EINVAL; |
3921 | } |
3922 | |
3923 | end = path + 1; |
3924 | while (end < path + pathlen && *end != '\0') { |
3925 | while (end < path + pathlen && *end == '/' && *end != '\0') { |
3926 | end++; |
3927 | } |
3928 | |
3929 | ptr = end; |
3930 | |
3931 | while (end < path + pathlen && *end != '/' && *end != '\0') { |
3932 | end++; |
3933 | } |
3934 | |
3935 | if (end > path + pathlen) { |
3936 | // hmm, string wasn't null terminated |
3937 | return EINVAL; |
3938 | } |
3939 | |
3940 | *end = '\0'; |
		if (is_package_name(ptr, (int)(end - ptr))) {
3942 | *component = comp; |
3943 | break; |
3944 | } |
3945 | |
3946 | end++; |
3947 | comp++; |
3948 | } |
3949 | |
3950 | return 0; |
3951 | } |
3952 | |
3953 | /* |
3954 | * Determine if a name is inappropriate for a searchfs query. |
3955 | * This list consists of /System currently. |
3956 | */ |
3957 | |
3958 | int |
3959 | vn_searchfs_inappropriate_name(const char *name, int len) |
3960 | { |
3961 | const char *bad_names[] = { "System" }; |
3962 | int bad_len[] = { 6 }; |
3963 | int i; |
3964 | |
3965 | if (len < 0) { |
3966 | return EINVAL; |
3967 | } |
3968 | |
3969 | for (i = 0; i < (int) (sizeof(bad_names) / sizeof(bad_names[0])); i++) { |
		if (len == bad_len[i] && strncmp(name, bad_names[i], strlen(bad_names[i]) + 1) == 0) {
3971 | return 1; |
3972 | } |
3973 | } |
3974 | |
3975 | // if we get here, no name matched |
3976 | return 0; |
3977 | } |
3978 | |
3979 | /* |
3980 | * Top level filesystem related information gathering. |
3981 | */ |
3982 | extern unsigned int vfs_nummntops; |
3983 | |
3984 | /* |
 * The VFS_NUMMNTOPS shouldn't be at name[1] since it
 * is a VFS generic variable. Since we no longer support
3987 | * VT_UFS, we reserve its value to support this sysctl node. |
3988 | * |
3989 | * It should have been: |
3990 | * name[0]: VFS_GENERIC |
3991 | * name[1]: VFS_NUMMNTOPS |
3992 | */ |
3993 | SYSCTL_INT(_vfs, VFS_NUMMNTOPS, nummntops, |
3994 | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, |
3995 | &vfs_nummntops, 0, "" ); |
3996 | |
3997 | int |
3998 | vfs_sysctl(int *name __unused, u_int namelen __unused, |
3999 | user_addr_t oldp __unused, size_t *oldlenp __unused, |
4000 | user_addr_t newp __unused, size_t newlen __unused, proc_t p __unused); |
4001 | |
4002 | int |
4003 | vfs_sysctl(int *name __unused, u_int namelen __unused, |
4004 | user_addr_t oldp __unused, size_t *oldlenp __unused, |
4005 | user_addr_t newp __unused, size_t newlen __unused, proc_t p __unused) |
4006 | { |
4007 | return EINVAL; |
4008 | } |
4009 | |
4010 | |
4011 | // |
4012 | // The following code disallows specific sysctl's that came through |
4013 | // the direct sysctl interface (vfs_sysctl_node) instead of the newer |
4014 | // sysctl_vfs_ctlbyfsid() interface. We can not allow these selectors |
4015 | // through vfs_sysctl_node() because it passes the user's oldp pointer |
4016 | // directly to the file system which (for these selectors) casts it |
4017 | // back to a struct sysctl_req and then proceed to use SYSCTL_IN() |
4018 | // which jumps through an arbitrary function pointer. When called |
4019 | // through the sysctl_vfs_ctlbyfsid() interface this does not happen |
4020 | // and so it's safe. |
4021 | // |
4022 | // Unfortunately we have to pull in definitions from AFP and SMB and |
4023 | // perform explicit name checks on the file system to determine if |
4024 | // these selectors are being used. |
4025 | // |
4026 | |
4027 | #define AFPFS_VFS_CTL_GETID 0x00020001 |
4028 | #define AFPFS_VFS_CTL_NETCHANGE 0x00020002 |
4029 | #define AFPFS_VFS_CTL_VOLCHANGE 0x00020003 |
4030 | |
4031 | #define SMBFS_SYSCTL_REMOUNT 1 |
4032 | #define SMBFS_SYSCTL_REMOUNT_INFO 2 |
4033 | #define SMBFS_SYSCTL_GET_SERVER_SHARE 3 |
4034 | |
4035 | |
4036 | static int |
4037 | is_bad_sysctl_name(struct vfstable *vfsp, int selector_name) |
4038 | { |
4039 | switch (selector_name) { |
4040 | case VFS_CTL_QUERY: |
4041 | case VFS_CTL_TIMEO: |
4042 | case VFS_CTL_NOLOCKS: |
4043 | case VFS_CTL_NSTATUS: |
4044 | case VFS_CTL_SADDR: |
4045 | case VFS_CTL_DISC: |
4046 | case VFS_CTL_SERVERINFO: |
4047 | return 1; |
4048 | |
4049 | default: |
4050 | break; |
4051 | } |
4052 | |
4053 | // the more complicated check for some of SMB's special values |
	if (strcmp(vfsp->vfc_name, "smbfs") == 0) {
4055 | switch (selector_name) { |
4056 | case SMBFS_SYSCTL_REMOUNT: |
4057 | case SMBFS_SYSCTL_REMOUNT_INFO: |
4058 | case SMBFS_SYSCTL_GET_SERVER_SHARE: |
4059 | return 1; |
4060 | } |
	} else if (strcmp(vfsp->vfc_name, "afpfs") == 0) {
4062 | switch (selector_name) { |
4063 | case AFPFS_VFS_CTL_GETID: |
4064 | case AFPFS_VFS_CTL_NETCHANGE: |
4065 | case AFPFS_VFS_CTL_VOLCHANGE: |
4066 | return 1; |
4067 | } |
4068 | } |
4069 | |
4070 | // |
4071 | // If we get here we passed all the checks so the selector is ok |
4072 | // |
4073 | return 0; |
4074 | } |
4075 | |
4076 | |
4077 | int vfs_sysctl_node SYSCTL_HANDLER_ARGS |
4078 | { |
4079 | int *name, namelen; |
4080 | struct vfstable *vfsp; |
4081 | int error; |
4082 | int fstypenum; |
4083 | |
4084 | fstypenum = oidp->oid_number; |
4085 | name = arg1; |
4086 | namelen = arg2; |
4087 | |
4088 | /* all sysctl names at this level should have at least one name slot for the FS */ |
4089 | if (namelen < 1) { |
4090 | return EISDIR; /* overloaded */ |
4091 | } |
4092 | mount_list_lock(); |
4093 | for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { |
4094 | if (vfsp->vfc_typenum == fstypenum) { |
4095 | vfsp->vfc_refcount++; |
4096 | break; |
4097 | } |
4098 | } |
4099 | mount_list_unlock(); |
4100 | |
4101 | if (vfsp == NULL) { |
4102 | return ENOTSUP; |
4103 | } |
4104 | |
	if (is_bad_sysctl_name(vfsp, name[0])) {
		printf("vfs: bad selector 0x%.8x for old-style sysctl(). use the sysctl-by-fsid interface instead\n", name[0]);
4107 | error = EPERM; |
4108 | } else { |
4109 | error = (vfsp->vfc_vfsops->vfs_sysctl)(name, namelen, |
4110 | req->oldptr, &req->oldlen, req->newptr, req->newlen, |
4111 | vfs_context_current()); |
4112 | } |
4113 | |
4114 | mount_list_lock(); |
4115 | vfsp->vfc_refcount--; |
4116 | mount_list_unlock(); |
4117 | |
4118 | return error; |
4119 | } |
4120 | |
4121 | /* |
4122 | * Check to see if a filesystem is mounted on a block device. |
4123 | */ |
4124 | int |
4125 | vfs_mountedon(struct vnode *vp) |
4126 | { |
4127 | struct vnode *vq; |
4128 | int error = 0; |
4129 | |
4130 | restart: |
4131 | SPECHASH_LOCK(); |
4132 | if (vp->v_specflags & SI_MOUNTING && (vp->v_specinfo->si_mountingowner != current_thread())) { |
		msleep((caddr_t)&vp->v_specflags, SPECHASH_LOCK_ADDR(), PVFS | PDROP, "vnode_waitformounting", NULL);
4134 | goto restart; |
4135 | } |
4136 | if (vp->v_specflags & SI_MOUNTEDON) { |
4137 | error = EBUSY; |
4138 | goto out; |
4139 | } |
4140 | if (vp->v_specflags & SI_ALIASED) { |
4141 | for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { |
4142 | if (vq->v_rdev != vp->v_rdev || |
4143 | vq->v_type != vp->v_type) { |
4144 | continue; |
4145 | } |
4146 | if (vq->v_specflags & SI_MOUNTING) { |
				msleep((caddr_t)&vq->v_specflags, SPECHASH_LOCK_ADDR(), PVFS | PDROP, "vnode_waitformounting", NULL);
4148 | goto restart; |
4149 | } |
4150 | if (vq->v_specflags & SI_MOUNTEDON) { |
4151 | error = EBUSY; |
4152 | break; |
4153 | } |
4154 | } |
4155 | } |
4156 | out: |
4157 | SPECHASH_UNLOCK(); |
4158 | return error; |
4159 | } |
4160 | |
4161 | void |
4162 | vfs_setmountedon(vnode_t vp) |
4163 | { |
4164 | vnode_lock(vp); |
4165 | SPECHASH_LOCK(); |
4166 | vp->v_specflags |= SI_MOUNTEDON; |
4167 | vp->v_specflags &= ~SI_MOUNTING; |
4168 | vp->v_specinfo->si_mountingowner = NULL; |
4169 | SPECHASH_UNLOCK(); |
4170 | vnode_unlock(vp); |
	wakeup(&vp->v_specflags);
4172 | } |
4173 | |
4174 | void |
4175 | vfs_clearmounting(vnode_t vp) |
4176 | { |
4177 | vnode_lock(vp); |
4178 | SPECHASH_LOCK(); |
4179 | vp->v_specflags &= ~SI_MOUNTING; |
4180 | vp->v_specinfo->si_mountingowner = NULL; |
4181 | SPECHASH_UNLOCK(); |
4182 | vnode_unlock(vp); |
	wakeup(&vp->v_specflags);
4184 | } |
4185 | |
4186 | /* |
4187 | * Check to see if a filesystem is mounted on a block device. |
4188 | */ |
4189 | int |
4190 | vfs_setmounting(vnode_t vp) |
4191 | { |
4192 | struct vnode *vq; |
4193 | int error = 0; |
4194 | |
4195 | vnode_lock(vp); |
4196 | while (vp->v_specflags & SI_MOUNTING) { |
		msleep((caddr_t)&vp->v_specflags, &vp->v_lock, PVFS, "vnode_waitformounting", NULL);
4198 | } |
4199 | if (vp->v_specflags & SI_MOUNTEDON) { |
4200 | vnode_unlock(vp); |
4201 | return EBUSY; |
4202 | } |
4203 | SPECHASH_LOCK(); |
4204 | vp->v_specflags |= SI_MOUNTING; |
4205 | vp->v_specinfo->si_mountingowner = current_thread(); |
4206 | vnode_unlock(vp); |
4207 | restart: |
4208 | if (vp->v_specflags & SI_ALIASED) { |
4209 | for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { |
4210 | if (vq->v_rdev != vp->v_rdev || |
4211 | vq->v_type != vp->v_type || vq == vp) { |
4212 | continue; |
4213 | } |
4214 | if (vq->v_specflags & SI_MOUNTING) { |
				msleep((caddr_t)&vq->v_specflags, SPECHASH_LOCK_ADDR(), PVFS | PDROP, "vnode_waitformounting", NULL);
4216 | SPECHASH_LOCK(); |
4217 | goto restart; |
4218 | } |
4219 | if (vq->v_specflags & SI_MOUNTEDON) { |
4220 | error = EBUSY; |
4221 | break; |
4222 | } |
4223 | } |
4224 | } |
4225 | SPECHASH_UNLOCK(); |
4226 | if (error) { |
4227 | vnode_lock(vp); |
4228 | SPECHASH_LOCK(); |
4229 | vp->v_specflags &= ~SI_MOUNTING; |
4230 | SPECHASH_UNLOCK(); |
4231 | vnode_unlock(vp); |
		wakeup(&vp->v_specflags);
4233 | } |
4234 | return error; |
4235 | } |
4236 | |
4237 | struct unmount_info { |
4238 | int u_errs; // Total failed unmounts |
4239 | int u_busy; // EBUSY failed unmounts |
4240 | int u_count; // Total volumes iterated |
4241 | int u_only_non_system; |
4242 | }; |
4243 | |
4244 | static int |
4245 | unmount_callback(mount_t mp, void *arg) |
4246 | { |
4247 | int error; |
4248 | char *mntname; |
4249 | struct unmount_info *uip = arg; |
4250 | |
4251 | uip->u_count++; |
4252 | |
4253 | mntname = zalloc_flags(ZV_NAMEI, Z_WAITOK | Z_NOFAIL); |
	strlcpy(mntname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
4255 | |
4256 | if (uip->u_only_non_system |
4257 | && ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM))) { //MNTK_BACKS_ROOT |
4258 | printf("unmount(%d) %s skipped\n" , uip->u_only_non_system, mntname); |
4259 | mount_iterdrop(mp); // VFS_ITERATE_CB_DROPREF |
4260 | } else { |
4261 | printf("unmount(%d) %s\n" , uip->u_only_non_system, mntname); |
4262 | |
		mount_ref(mp, 0);
4264 | mount_iterdrop(mp); // VFS_ITERATE_CB_DROPREF |
4265 | error = dounmount(mp, MNT_FORCE, 1, vfs_context_current()); |
4266 | if (error) { |
4267 | uip->u_errs++; |
4268 | printf("Unmount of %s failed (%d)\n" , mntname ? mntname:"?" , error); |
4269 | if (error == EBUSY) { |
4270 | uip->u_busy++; |
4271 | } |
4272 | } |
4273 | } |
4274 | zfree(ZV_NAMEI, mntname); |
4275 | |
4276 | return VFS_RETURNED; |
4277 | } |
4278 | |
4279 | /* |
4280 | * Unmount all filesystems. The list is traversed in reverse order |
4281 | * of mounting to avoid dependencies. |
4282 | * Busy mounts are retried. |
4283 | */ |
4284 | __private_extern__ void |
4285 | vfs_unmountall(int only_non_system) |
4286 | { |
4287 | int mounts, sec = 1; |
4288 | struct unmount_info ui; |
4289 | |
4290 | /* |
4291 | * Ensure last-completion-time is valid before anyone can see that |
4292 | * VFS shutdown has started. |
4293 | */ |
4294 | vfs_shutdown_last_completion_time = mach_absolute_time(); |
4295 | OSMemoryBarrier(); |
4296 | vfs_unmountall_started = 1; |
4297 | printf("vfs_unmountall(%ssystem) start\n" , only_non_system ? "non" : "" ); |
4298 | |
4299 | retry: |
4300 | ui.u_errs = ui.u_busy = ui.u_count = 0; |
4301 | ui.u_only_non_system = only_non_system; |
4302 | // avoid vfs_iterate deadlock in dounmount(), use VFS_ITERATE_CB_DROPREF |
	vfs_iterate(VFS_ITERATE_CB_DROPREF | VFS_ITERATE_TAIL_FIRST, unmount_callback, &ui);
4304 | mounts = mount_getvfscnt(); |
4305 | if (mounts == 0) { |
4306 | goto out; |
4307 | } |
4308 | if (ui.u_busy > 0) { // Busy mounts - wait & retry |
4309 | tsleep(chan: &nummounts, PVFS, wmesg: "busy mount" , timo: sec * hz); |
4310 | sec *= 2; |
4311 | if (sec <= 32) { |
4312 | goto retry; |
4313 | } |
4314 | printf("Unmounting timed out\n" ); |
4315 | } else if (ui.u_count < mounts) { |
4316 | // If the vfs_iterate missed mounts in progress - wait a bit |
4317 | tsleep(chan: &nummounts, PVFS, wmesg: "missed mount" , timo: 2 * hz); |
4318 | } |
4319 | |
4320 | out: |
4321 | printf("vfs_unmountall(%ssystem) end\n" , only_non_system ? "non" : "" ); |
4322 | |
4323 | /* |
4324 | * reboot_kernel() calls us twice; once to deal with non-system |
4325 | * mounts, and again to sweep up anything left after terminating |
4326 | * DEXTs. We're only finished once we've completed the second pass. |
4327 | */ |
4328 | if (!only_non_system) { |
4329 | vfs_unmountall_finished = 1; |
4330 | } |
4331 | } |
4332 | |
4333 | /* |
4334 | * vfs_shutdown_in_progress -- |
4335 | * |
4336 | * Returns whether or not the VFS is shutting down the file systems. |
4337 | */ |
4338 | boolean_t |
4339 | vfs_shutdown_in_progress(void) |
4340 | { |
4341 | return vfs_unmountall_started && !vfs_unmountall_finished; |
4342 | } |
4343 | |
4344 | /* |
4345 | * vfs_shutdown_finished -- |
4346 | * |
4347 | * Returns whether or not the VFS shutdown has completed. |
4348 | */ |
4349 | boolean_t |
4350 | vfs_shutdown_finished(void) |
4351 | { |
4352 | return !!vfs_unmountall_finished; |
4353 | } |
4354 | |
4355 | /* |
4356 | * vfs_update_last_completion_time -- |
4357 | * |
4358 | * Updates the "last I/O completion time" timestamp used by the watchdog |
4359 | * to monitor VFS shutdown progress. Called by various I/O stack layers |
4360 | * as operations complete and progress moves forward. |
4361 | */ |
4362 | void |
4363 | vfs_update_last_completion_time(void) |
4364 | { |
4365 | if (vfs_unmountall_started) { |
4366 | vfs_shutdown_last_completion_time = mach_absolute_time(); |
4367 | } |
4368 | } |
4369 | |
4370 | /* |
4371 | * vfs_last_completion_time -- |
4372 | * |
4373 | * Returns the "last I/O completion time" timestamp. Return |
4374 | * value is a mach_absolute_time() value, and is not meaningful |
4375 | * unless vfs_is_shutting_down() also returns true. |
4376 | */ |
4377 | uint64_t |
4378 | vfs_last_completion_time(void) |
4379 | { |
4380 | return vfs_unmountall_started ? vfs_shutdown_last_completion_time : 0; |
4381 | } |
4382 | |
4383 | /* |
4384 | * This routine is called from vnode_pager_deallocate out of the VM |
4385 | * The path to vnode_pager_deallocate can only be initiated by ubc_destroy_named |
4386 | * on a vnode that has a UBCINFO |
4387 | */ |
__private_extern__ void
vnode_pager_vrele(vnode_t vp)
4390 | { |
4391 | struct ubc_info *uip; |
4392 | |
4393 | vnode_lock_spin(vp); |
4394 | |
4395 | vp->v_lflag &= ~VNAMED_UBC; |
4396 | if (vp->v_usecount != 0) { |
4397 | /* |
4398 | * At the eleventh hour, just before the ubcinfo is |
4399 | * destroyed, ensure the ubc-specific v_usecount |
4400 | * reference has gone. We use v_usecount != 0 as a hint; |
4401 | * ubc_unmap() does nothing if there's no mapping. |
4402 | * |
4403 | * This case is caused by coming here via forced unmount, |
4404 | * versus the usual vm_object_deallocate() path. |
4405 | * In the forced unmount case, ubc_destroy_named() |
4406 | * releases the pager before memory_object_last_unmap() |
4407 | * can be called. |
4408 | */ |
4409 | vnode_unlock(vp); |
4410 | ubc_unmap(vp); |
4411 | vnode_lock_spin(vp); |
4412 | } |
4413 | |
4414 | uip = vp->v_ubcinfo; |
4415 | vp->v_ubcinfo = UBC_INFO_NULL; |
4416 | |
4417 | vnode_unlock(vp); |
4418 | |
4419 | ubc_info_deallocate(uip); |
4420 | } |
4421 | |
4422 | |
4423 | #include <sys/disk.h> |
4424 | |
4425 | u_int32_t rootunit = (u_int32_t)-1; |
4426 | |
4427 | #if CONFIG_IOSCHED |
4428 | extern int lowpri_throttle_enabled; |
4429 | extern int iosched_enabled; |
4430 | #endif |
4431 | |
4432 | errno_t |
4433 | vfs_init_io_attributes(vnode_t devvp, mount_t mp) |
4434 | { |
4435 | int error; |
4436 | off_t readblockcnt = 0; |
4437 | off_t writeblockcnt = 0; |
4438 | off_t readmaxcnt = 0; |
4439 | off_t writemaxcnt = 0; |
4440 | off_t readsegcnt = 0; |
4441 | off_t writesegcnt = 0; |
4442 | off_t readsegsize = 0; |
4443 | off_t writesegsize = 0; |
4444 | off_t alignment = 0; |
4445 | u_int32_t minsaturationbytecount = 0; |
4446 | u_int32_t ioqueue_depth = 0; |
4447 | u_int32_t blksize; |
4448 | u_int64_t temp; |
4449 | u_int32_t features; |
4450 | u_int64_t location = 0; |
4451 | vfs_context_t ctx = vfs_context_current(); |
4452 | dk_corestorage_info_t cs_info; |
4453 | boolean_t cs_present = FALSE; |
4454 | int isssd = 0; |
4455 | int isvirtual = 0; |
4456 | |
4457 | |
	VNOP_IOCTL(devvp, DKIOCGETTHROTTLEMASK, (caddr_t)&mp->mnt_throttle_mask, 0, NULL);
4459 | /* |
4460 | * as a reasonable approximation, only use the lowest bit of the mask |
4461 | * to generate a disk unit number |
4462 | */ |
	mp->mnt_devbsdunit = num_trailing_0(mp->mnt_throttle_mask);
4464 | |
4465 | if (devvp == rootvp) { |
4466 | rootunit = mp->mnt_devbsdunit; |
4467 | } |
4468 | |
4469 | if (mp->mnt_devbsdunit == rootunit) { |
4470 | /* |
4471 | * this mount point exists on the same device as the root |
4472 | * partition, so it comes under the hard throttle control... |
4473 | * this is true even for the root mount point itself |
4474 | */ |
4475 | mp->mnt_kern_flag |= MNTK_ROOTDEV; |
4476 | } |
4477 | /* |
4478 | * force the spec device to re-cache |
4479 | * the underlying block size in case |
4480 | * the filesystem overrode the initial value |
4481 | */ |
4482 | set_fsblocksize(devvp); |
4483 | |
4484 | |
	if ((error = VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE,
	    (caddr_t)&blksize, 0, ctx))) {
4487 | return error; |
4488 | } |
4489 | |
4490 | mp->mnt_devblocksize = blksize; |
4491 | |
4492 | /* |
4493 | * set the maximum possible I/O size |
4494 | * this may get clipped to a smaller value |
4495 | * based on which constraints are being advertised |
4496 | * and if those advertised constraints result in a smaller |
4497 | * limit for a given I/O |
4498 | */ |
4499 | mp->mnt_maxreadcnt = MAX_UPL_SIZE_BYTES; |
4500 | mp->mnt_maxwritecnt = MAX_UPL_SIZE_BYTES; |
4501 | |
	if (VNOP_IOCTL(devvp, DKIOCISVIRTUAL, (caddr_t)&isvirtual, 0, ctx) == 0) {
4503 | if (isvirtual) { |
4504 | mp->mnt_kern_flag |= MNTK_VIRTUALDEV; |
4505 | mp->mnt_flag |= MNT_REMOVABLE; |
4506 | } |
4507 | } |
	if (VNOP_IOCTL(devvp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ctx) == 0) {
4509 | if (isssd) { |
4510 | mp->mnt_kern_flag |= MNTK_SSD; |
4511 | } |
4512 | } |
	if ((error = VNOP_IOCTL(devvp, DKIOCGETFEATURES,
	    (caddr_t)&features, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD,
	    (caddr_t)&readblockcnt, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE,
	    (caddr_t)&writeblockcnt, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD,
	    (caddr_t)&readmaxcnt, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE,
	    (caddr_t)&writemaxcnt, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD,
	    (caddr_t)&readsegcnt, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE,
	    (caddr_t)&writesegcnt, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTREAD,
	    (caddr_t)&readsegsize, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTWRITE,
	    (caddr_t)&writesegsize, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETMINSEGMENTALIGNMENTBYTECOUNT,
	    (caddr_t)&alignment, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETCOMMANDPOOLSIZE,
	    (caddr_t)&ioqueue_depth, 0, ctx))) {
		return error;
	}
4567 | |
4568 | if (readmaxcnt) { |
4569 | mp->mnt_maxreadcnt = (readmaxcnt > UINT32_MAX) ? UINT32_MAX :(uint32_t) readmaxcnt; |
4570 | } |
4571 | |
4572 | if (readblockcnt) { |
4573 | temp = readblockcnt * blksize; |
4574 | temp = (temp > UINT32_MAX) ? UINT32_MAX : temp; |
4575 | |
4576 | if (temp < mp->mnt_maxreadcnt) { |
4577 | mp->mnt_maxreadcnt = (u_int32_t)temp; |
4578 | } |
4579 | } |
4580 | |
4581 | if (writemaxcnt) { |
4582 | mp->mnt_maxwritecnt = (writemaxcnt > UINT32_MAX) ? UINT32_MAX : (uint32_t)writemaxcnt; |
4583 | } |
4584 | |
4585 | if (writeblockcnt) { |
4586 | temp = writeblockcnt * blksize; |
4587 | temp = (temp > UINT32_MAX) ? UINT32_MAX : temp; |
4588 | |
4589 | if (temp < mp->mnt_maxwritecnt) { |
4590 | mp->mnt_maxwritecnt = (u_int32_t)temp; |
4591 | } |
4592 | } |
4593 | |
4594 | if (readsegcnt) { |
4595 | temp = (readsegcnt > UINT16_MAX) ? UINT16_MAX : readsegcnt; |
4596 | } else { |
4597 | temp = mp->mnt_maxreadcnt / PAGE_SIZE; |
4598 | |
4599 | if (temp > UINT16_MAX) { |
4600 | temp = UINT16_MAX; |
4601 | } |
4602 | } |
4603 | mp->mnt_segreadcnt = (u_int16_t)temp; |
4604 | |
4605 | if (writesegcnt) { |
4606 | temp = (writesegcnt > UINT16_MAX) ? UINT16_MAX : writesegcnt; |
4607 | } else { |
4608 | temp = mp->mnt_maxwritecnt / PAGE_SIZE; |
4609 | |
4610 | if (temp > UINT16_MAX) { |
4611 | temp = UINT16_MAX; |
4612 | } |
4613 | } |
4614 | mp->mnt_segwritecnt = (u_int16_t)temp; |
4615 | |
4616 | if (readsegsize) { |
4617 | temp = (readsegsize > UINT32_MAX) ? UINT32_MAX : readsegsize; |
4618 | } else { |
4619 | temp = mp->mnt_maxreadcnt; |
4620 | } |
4621 | mp->mnt_maxsegreadsize = (u_int32_t)temp; |
4622 | |
4623 | if (writesegsize) { |
4624 | temp = (writesegsize > UINT32_MAX) ? UINT32_MAX : writesegsize; |
4625 | } else { |
4626 | temp = mp->mnt_maxwritecnt; |
4627 | } |
4628 | mp->mnt_maxsegwritesize = (u_int32_t)temp; |
4629 | |
4630 | if (alignment) { |
4631 | temp = (alignment > PAGE_SIZE) ? PAGE_MASK : alignment - 1; |
4632 | } else { |
4633 | temp = 0; |
4634 | } |
4635 | mp->mnt_alignmentmask = (uint32_t)temp; |
4636 | |
4637 | |
4638 | if (ioqueue_depth > MNT_DEFAULT_IOQUEUE_DEPTH) { |
4639 | temp = ioqueue_depth; |
4640 | } else { |
4641 | temp = MNT_DEFAULT_IOQUEUE_DEPTH; |
4642 | } |
4643 | |
4644 | mp->mnt_ioqueue_depth = (uint32_t)temp; |
4645 | mp->mnt_ioscale = MNT_IOSCALE(mp->mnt_ioqueue_depth); |
4646 | |
4647 | if (mp->mnt_ioscale > 1) { |
4648 | printf("ioqueue_depth = %d, ioscale = %d\n" , (int)mp->mnt_ioqueue_depth, (int)mp->mnt_ioscale); |
4649 | } |
4650 | |
4651 | if (features & DK_FEATURE_FORCE_UNIT_ACCESS) { |
4652 | mp->mnt_ioflags |= MNT_IOFLAGS_FUA_SUPPORTED; |
4653 | } |
4654 | |
	if (VNOP_IOCTL(devvp, DKIOCGETIOMINSATURATIONBYTECOUNT, (caddr_t)&minsaturationbytecount, 0, ctx) == 0) {
4656 | mp->mnt_minsaturationbytecount = minsaturationbytecount; |
4657 | } else { |
4658 | mp->mnt_minsaturationbytecount = 0; |
4659 | } |
4660 | |
	if (VNOP_IOCTL(devvp, DKIOCCORESTORAGE, (caddr_t)&cs_info, 0, ctx) == 0) {
4662 | cs_present = TRUE; |
4663 | } |
4664 | |
4665 | if (features & DK_FEATURE_UNMAP) { |
4666 | mp->mnt_ioflags |= MNT_IOFLAGS_UNMAP_SUPPORTED; |
4667 | |
4668 | if (cs_present == TRUE) { |
4669 | mp->mnt_ioflags |= MNT_IOFLAGS_CSUNMAP_SUPPORTED; |
4670 | } |
4671 | } |
4672 | if (cs_present == TRUE) { |
4673 | /* |
4674 | * for now we'll use the following test as a proxy for |
4675 | * the underlying drive being FUSION in nature |
4676 | */ |
4677 | if ((cs_info.flags & DK_CORESTORAGE_PIN_YOUR_METADATA)) { |
4678 | mp->mnt_ioflags |= MNT_IOFLAGS_FUSION_DRIVE; |
4679 | } |
4680 | } else { |
4681 | /* Check for APFS Fusion */ |
4682 | dk_apfs_flavour_t flavour; |
		if ((VNOP_IOCTL(devvp, DKIOCGETAPFSFLAVOUR, (caddr_t)&flavour, 0, ctx) == 0) &&
4684 | (flavour == DK_APFS_FUSION)) { |
4685 | mp->mnt_ioflags |= MNT_IOFLAGS_FUSION_DRIVE; |
4686 | } |
4687 | } |
4688 | |
	if (VNOP_IOCTL(devvp, DKIOCGETLOCATION, (caddr_t)&location, 0, ctx) == 0) {
4690 | if (location & DK_LOCATION_EXTERNAL) { |
4691 | mp->mnt_ioflags |= MNT_IOFLAGS_PERIPHERAL_DRIVE; |
4692 | mp->mnt_flag |= MNT_REMOVABLE; |
4693 | } |
4694 | } |
4695 | |
4696 | #if CONFIG_IOSCHED |
4697 | if (iosched_enabled && (features & DK_FEATURE_PRIORITY)) { |
4698 | mp->mnt_ioflags |= MNT_IOFLAGS_IOSCHED_SUPPORTED; |
		throttle_info_disable_throttle(mp->mnt_devbsdunit, (mp->mnt_ioflags & MNT_IOFLAGS_FUSION_DRIVE) != 0);
4700 | } |
4701 | #endif /* CONFIG_IOSCHED */ |
4702 | return error; |
4703 | } |
4704 | |
4705 | static struct klist fs_klist; |
4706 | static LCK_GRP_DECLARE(fs_klist_lck_grp, "fs_klist" ); |
4707 | static LCK_MTX_DECLARE(fs_klist_lock, &fs_klist_lck_grp); |
4708 | |
4709 | void |
4710 | vfs_event_init(void) |
4711 | { |
	klist_init(&fs_klist);
4713 | } |
4714 | |
4715 | void |
4716 | vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data) |
4717 | { |
4718 | if (event == VQ_DEAD || event == VQ_NOTRESP) { |
4719 | struct mount *mp = vfs_getvfs(fsid); |
4720 | if (mp) { |
4721 | mount_lock_spin(mp); |
4722 | if (data) { |
4723 | mp->mnt_kern_flag &= ~MNT_LNOTRESP; // Now responding |
4724 | } else { |
4725 | mp->mnt_kern_flag |= MNT_LNOTRESP; // Not responding |
4726 | } |
4727 | mount_unlock(mp); |
4728 | } |
4729 | } |
4730 | |
	lck_mtx_lock(&fs_klist_lock);
	KNOTE(&fs_klist, event);
	lck_mtx_unlock(&fs_klist_lock);
4734 | } |
4735 | |
4736 | /* |
4737 | * return the number of mounted filesystems. |
4738 | */ |
4739 | static int |
4740 | sysctl_vfs_getvfscnt(void) |
4741 | { |
4742 | return mount_getvfscnt(); |
4743 | } |
4744 | |
4745 | |
4746 | static int |
4747 | mount_getvfscnt(void) |
4748 | { |
4749 | int ret; |
4750 | |
4751 | mount_list_lock(); |
4752 | ret = nummounts; |
4753 | mount_list_unlock(); |
4754 | return ret; |
4755 | } |
4756 | |
4757 | |
4758 | |
4759 | static int |
4760 | mount_fillfsids(fsid_t *fsidlst, int count) |
4761 | { |
4762 | struct mount *mp; |
	int actual = 0;

4766 | mount_list_lock(); |
4767 | TAILQ_FOREACH(mp, &mountlist, mnt_list) { |
4768 | if (actual < count) { |
4769 | fsidlst[actual] = mp->mnt_vfsstat.f_fsid; |
4770 | actual++; |
4771 | } |
4772 | } |
4773 | mount_list_unlock(); |
4774 | return actual; |
4775 | } |
4776 | |
4777 | /* |
4778 | * fill in the array of fsid_t's up to a max of 'count', the actual |
4779 | * number filled in will be set in '*actual'. If there are more fsid_t's |
4780 | * than room in fsidlst then ENOMEM will be returned and '*actual' will |
4781 | * have the actual count. |
 * Callers depend on *actual being filled out even in the error case.
4783 | */ |
4784 | static int |
4785 | sysctl_vfs_getvfslist(fsid_t *fsidlst, unsigned long count, unsigned long *actual) |
4786 | { |
4787 | struct mount *mp; |
4788 | |
4789 | *actual = 0; |
4790 | mount_list_lock(); |
4791 | TAILQ_FOREACH(mp, &mountlist, mnt_list) { |
4792 | (*actual)++; |
4793 | if (*actual <= count) { |
4794 | fsidlst[(*actual) - 1] = mp->mnt_vfsstat.f_fsid; |
4795 | } |
4796 | } |
4797 | mount_list_unlock(); |
4798 | return *actual <= count ? 0 : ENOMEM; |
4799 | } |
4800 | |
4801 | static int |
4802 | sysctl_vfs_vfslist(__unused struct sysctl_oid *oidp, __unused void *arg1, |
4803 | __unused int arg2, struct sysctl_req *req) |
4804 | { |
4805 | unsigned long actual; |
4806 | int error; |
4807 | size_t space; |
4808 | fsid_t *fsidlst; |
4809 | |
4810 | /* This is a readonly node. */ |
4811 | if (req->newptr != USER_ADDR_NULL) { |
4812 | return EPERM; |
4813 | } |
4814 | |
4815 | /* they are querying us so just return the space required. */ |
4816 | if (req->oldptr == USER_ADDR_NULL) { |
4817 | req->oldidx = sysctl_vfs_getvfscnt() * sizeof(fsid_t); |
4818 | return 0; |
4819 | } |
4820 | again: |
4821 | /* |
4822 | * Retrieve an accurate count of the amount of space required to copy |
4823 | * out all the fsids in the system. |
4824 | */ |
4825 | space = req->oldlen; |
4826 | req->oldlen = sysctl_vfs_getvfscnt() * sizeof(fsid_t); |
4827 | |
4828 | /* they didn't give us enough space. */ |
4829 | if (space < req->oldlen) { |
4830 | return ENOMEM; |
4831 | } |
4832 | |
4833 | fsidlst = kalloc_data(req->oldlen, Z_WAITOK | Z_ZERO); |
4834 | if (fsidlst == NULL) { |
4835 | return ENOMEM; |
4836 | } |
4837 | |
	error = sysctl_vfs_getvfslist(fsidlst, req->oldlen / sizeof(fsid_t),
	    &actual);
4840 | /* |
4841 | * If we get back ENOMEM, then another mount has been added while we |
 * slept in the allocation above. If this is the case then try again.
4843 | */ |
4844 | if (error == ENOMEM) { |
4845 | kfree_data(fsidlst, req->oldlen); |
4846 | req->oldlen = space; |
4847 | goto again; |
4848 | } |
4849 | if (error == 0) { |
4850 | error = SYSCTL_OUT(req, fsidlst, actual * sizeof(fsid_t)); |
4851 | } |
4852 | kfree_data(fsidlst, req->oldlen); |
4853 | return error; |
4854 | } |
4855 | |
4856 | /* |
4857 | * Do a sysctl by fsid. |
4858 | */ |
4859 | static int |
4860 | sysctl_vfs_ctlbyfsid(__unused struct sysctl_oid *oidp, void *arg1, int arg2, |
4861 | struct sysctl_req *req) |
4862 | { |
4863 | union union_vfsidctl vc; |
4864 | struct mount *mp = NULL; |
4865 | struct vfsstatfs *sp; |
4866 | int *name, namelen; |
4867 | int flags = 0; |
4868 | int error = 0, gotref = 0; |
4869 | vfs_context_t ctx = vfs_context_current(); |
4870 | proc_t p = req->p; /* XXX req->p != current_proc()? */ |
4871 | boolean_t is_64_bit; |
4872 | union { |
4873 | struct statfs64 sfs64; |
4874 | struct user64_statfs osfs64; |
4875 | struct user32_statfs osfs32; |
4876 | } *sfsbuf; |
4877 | |
4878 | if (req->newptr == USER_ADDR_NULL) { |
4879 | error = EINVAL; |
4880 | goto out; |
4881 | } |
4882 | |
4883 | name = arg1; |
4884 | namelen = arg2; |
4885 | is_64_bit = proc_is64bit(p); |
4886 | |
	error = SYSCTL_IN(req, &vc, is_64_bit ? sizeof(vc.vc64) : sizeof(vc.vc32));
4888 | if (error) { |
4889 | goto out; |
4890 | } |
4891 | if (vc.vc32.vc_vers != VFS_CTL_VERS1) { /* works for 32 and 64 */ |
4892 | error = EINVAL; |
4893 | goto out; |
4894 | } |
4895 | mp = mount_list_lookupby_fsid(&vc.vc32.vc_fsid, 0, 1); /* works for 32 and 64 */ |
4896 | if (mp == NULL) { |
4897 | error = ENOENT; |
4898 | goto out; |
4899 | } |
4900 | gotref = 1; |
4901 | /* reset so that the fs specific code can fetch it. */ |
4902 | req->newidx = 0; |
4903 | /* |
4904 | * Note if this is a VFS_CTL then we pass the actual sysctl req |
4905 | * in for "oldp" so that the lower layer can DTRT and use the |
4906 | * SYSCTL_IN/OUT routines. |
4907 | */ |
4908 | if (mp->mnt_op->vfs_sysctl != NULL) { |
4909 | if (is_64_bit) { |
4910 | if (vfs_64bitready(mp)) { |
4911 | error = mp->mnt_op->vfs_sysctl(name, namelen, |
4912 | CAST_USER_ADDR_T(req), |
4913 | NULL, USER_ADDR_NULL, 0, |
4914 | ctx); |
4915 | } else { |
4916 | error = ENOTSUP; |
4917 | } |
4918 | } else { |
4919 | error = mp->mnt_op->vfs_sysctl(name, namelen, |
4920 | CAST_USER_ADDR_T(req), |
4921 | NULL, USER_ADDR_NULL, 0, |
4922 | ctx); |
4923 | } |
4924 | if (error != ENOTSUP) { |
4925 | goto out; |
4926 | } |
4927 | } |
4928 | switch (name[0]) { |
4929 | case VFS_CTL_UMOUNT: |
4930 | #if CONFIG_MACF |
4931 | error = mac_mount_check_umount(ctx, mp); |
4932 | if (error != 0) { |
4933 | goto out; |
4934 | } |
4935 | #endif |
4936 | req->newidx = 0; |
4937 | if (is_64_bit) { |
4938 | req->newptr = vc.vc64.vc_ptr; |
4939 | req->newlen = (size_t)vc.vc64.vc_len; |
4940 | } else { |
4941 | req->newptr = CAST_USER_ADDR_T(vc.vc32.vc_ptr); |
4942 | req->newlen = vc.vc32.vc_len; |
4943 | } |
4944 | error = SYSCTL_IN(req, &flags, sizeof(flags)); |
4945 | if (error) { |
4946 | break; |
4947 | } |
4948 | |
		mount_ref(mp, 0);
4950 | mount_iterdrop(mp); |
4951 | gotref = 0; |
4952 | /* safedounmount consumes a ref */ |
4953 | error = safedounmount(mp, flags, ctx); |
4954 | break; |
4955 | case VFS_CTL_OSTATFS: |
4956 | case VFS_CTL_STATFS64: |
4957 | #if CONFIG_MACF |
4958 | error = mac_mount_check_stat(ctx, mp); |
4959 | if (error != 0) { |
4960 | break; |
4961 | } |
4962 | #endif |
4963 | req->newidx = 0; |
4964 | if (is_64_bit) { |
4965 | req->newptr = vc.vc64.vc_ptr; |
4966 | req->newlen = (size_t)vc.vc64.vc_len; |
4967 | } else { |
4968 | req->newptr = CAST_USER_ADDR_T(vc.vc32.vc_ptr); |
4969 | req->newlen = vc.vc32.vc_len; |
4970 | } |
4971 | error = SYSCTL_IN(req, &flags, sizeof(flags)); |
4972 | if (error) { |
4973 | break; |
4974 | } |
4975 | sp = &mp->mnt_vfsstat; |
4976 | if (((flags & MNT_NOWAIT) == 0 || (flags & (MNT_WAIT | MNT_DWAIT))) && |
4977 | (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT))) { |
4978 | goto out; |
4979 | } |
4980 | |
4981 | sfsbuf = kalloc_type(typeof(*sfsbuf), Z_WAITOK); |
4982 | |
4983 | if (name[0] == VFS_CTL_STATFS64) { |
4984 | struct statfs64 *sfs = &sfsbuf->sfs64; |
4985 | |
4986 | vfs_get_statfs64(mp, sfs); |
4987 | error = SYSCTL_OUT(req, sfs, sizeof(*sfs)); |
4988 | } else if (is_64_bit) { |
4989 | struct user64_statfs *sfs = &sfsbuf->osfs64; |
4990 | |
			bzero(sfs, sizeof(*sfs));
4992 | sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; |
4993 | sfs->f_type = (short)mp->mnt_vtable->vfc_typenum; |
4994 | sfs->f_bsize = (user64_long_t)sp->f_bsize; |
4995 | sfs->f_iosize = (user64_long_t)sp->f_iosize; |
4996 | sfs->f_blocks = (user64_long_t)sp->f_blocks; |
4997 | sfs->f_bfree = (user64_long_t)sp->f_bfree; |
4998 | sfs->f_bavail = (user64_long_t)sp->f_bavail; |
4999 | sfs->f_files = (user64_long_t)sp->f_files; |
5000 | sfs->f_ffree = (user64_long_t)sp->f_ffree; |
5001 | sfs->f_fsid = sp->f_fsid; |
5002 | sfs->f_owner = sp->f_owner; |
			vfs_getfstypename(mp, sfs->f_fstypename, MFSNAMELEN);
			strlcpy(sfs->f_mntonname, sp->f_mntonname, MNAMELEN);
			strlcpy(sfs->f_mntfromname, sp->f_mntfromname, MNAMELEN);
5006 | |
5007 | error = SYSCTL_OUT(req, sfs, sizeof(*sfs)); |
5008 | } else { |
5009 | struct user32_statfs *sfs = &sfsbuf->osfs32; |
5010 | long temp; |
5011 | |
			bzero(sfs, sizeof(*sfs));
5013 | sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; |
5014 | sfs->f_type = (short)mp->mnt_vtable->vfc_typenum; |
5015 | |
5016 | /* |
			 * It's possible for there to be more than 2^31 blocks in the filesystem, so we
5018 | * have to fudge the numbers here in that case. We inflate the blocksize in order |
5019 | * to reflect the filesystem size as best we can. |
5020 | */ |
5021 | if (sp->f_blocks > INT_MAX) { |
5022 | int shift; |
5023 | |
5024 | /* |
5025 | * Work out how far we have to shift the block count down to make it fit. |
5026 | * Note that it's possible to have to shift so far that the resulting |
5027 | * blocksize would be unreportably large. At that point, we will clip |
5028 | * any values that don't fit. |
5029 | * |
5030 | * For safety's sake, we also ensure that f_iosize is never reported as |
5031 | * being smaller than f_bsize. |
5032 | */ |
5033 | for (shift = 0; shift < 32; shift++) { |
5034 | if ((sp->f_blocks >> shift) <= INT_MAX) { |
5035 | break; |
5036 | } |
5037 | if ((((long long)sp->f_bsize) << (shift + 1)) > INT_MAX) { |
5038 | break; |
5039 | } |
5040 | } |
5041 | #define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s))) |
5042 | sfs->f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sp->f_blocks, shift); |
5043 | sfs->f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sp->f_bfree, shift); |
5044 | sfs->f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sp->f_bavail, shift); |
5045 | #undef __SHIFT_OR_CLIP |
5046 | sfs->f_bsize = (user32_long_t)(sp->f_bsize << shift); |
				temp = lmax(sp->f_iosize, sp->f_bsize);
5048 | if (temp > INT32_MAX) { |
5049 | error = EINVAL; |
5050 | kfree_type(typeof(*sfsbuf), sfsbuf); |
5051 | goto out; |
5052 | } |
5053 | sfs->f_iosize = (user32_long_t)temp; |
5054 | } else { |
5055 | sfs->f_bsize = (user32_long_t)sp->f_bsize; |
5056 | sfs->f_iosize = (user32_long_t)sp->f_iosize; |
5057 | sfs->f_blocks = (user32_long_t)sp->f_blocks; |
5058 | sfs->f_bfree = (user32_long_t)sp->f_bfree; |
5059 | sfs->f_bavail = (user32_long_t)sp->f_bavail; |
5060 | } |
5061 | sfs->f_files = (user32_long_t)sp->f_files; |
5062 | sfs->f_ffree = (user32_long_t)sp->f_ffree; |
5063 | sfs->f_fsid = sp->f_fsid; |
5064 | sfs->f_owner = sp->f_owner; |
5065 | |
			vfs_getfstypename(mp, sfs->f_fstypename, MFSNAMELEN);
			strlcpy(sfs->f_mntonname, sp->f_mntonname, MNAMELEN);
			strlcpy(sfs->f_mntfromname, sp->f_mntfromname, MNAMELEN);
5069 | |
5070 | error = SYSCTL_OUT(req, sfs, sizeof(*sfs)); |
5071 | } |
5072 | kfree_type(typeof(*sfsbuf), sfsbuf); |
5073 | break; |
5074 | default: |
5075 | error = ENOTSUP; |
5076 | goto out; |
5077 | } |
5078 | out: |
5079 | if (gotref != 0) { |
5080 | mount_iterdrop(mp); |
5081 | } |
5082 | return error; |
5083 | } |
5084 | |
5085 | static int filt_fsattach(struct knote *kn, struct kevent_qos_s *kev); |
5086 | static void filt_fsdetach(struct knote *kn); |
5087 | static int filt_fsevent(struct knote *kn, long hint); |
5088 | static int filt_fstouch(struct knote *kn, struct kevent_qos_s *kev); |
5089 | static int filt_fsprocess(struct knote *kn, struct kevent_qos_s *kev); |
5090 | SECURITY_READ_ONLY_EARLY(struct filterops) fs_filtops = { |
5091 | .f_attach = filt_fsattach, |
5092 | .f_detach = filt_fsdetach, |
5093 | .f_event = filt_fsevent, |
5094 | .f_touch = filt_fstouch, |
5095 | .f_process = filt_fsprocess, |
5096 | }; |
5097 | |
5098 | static int |
5099 | filt_fsattach(struct knote *kn, __unused struct kevent_qos_s *kev) |
5100 | { |
5101 | kn->kn_flags |= EV_CLEAR; /* automatic */ |
5102 | kn->kn_sdata = 0; /* incoming data is ignored */ |
5103 | |
	lck_mtx_lock(&fs_klist_lock);
	KNOTE_ATTACH(&fs_klist, kn);
	lck_mtx_unlock(&fs_klist_lock);
5107 | |
5108 | /* |
5109 | * filter only sees future events, |
5110 | * so it can't be fired already. |
5111 | */ |
5112 | return 0; |
5113 | } |
5114 | |
5115 | static void |
5116 | filt_fsdetach(struct knote *kn) |
5117 | { |
	lck_mtx_lock(&fs_klist_lock);
	KNOTE_DETACH(&fs_klist, kn);
	lck_mtx_unlock(&fs_klist_lock);
5121 | } |
5122 | |
5123 | static int |
5124 | filt_fsevent(struct knote *kn, long hint) |
5125 | { |
5126 | /* |
5127 | * Backwards compatibility: |
5128 | * Other filters would do nothing if kn->kn_sfflags == 0 |
5129 | */ |
5130 | |
5131 | if ((kn->kn_sfflags == 0) || (kn->kn_sfflags & hint)) { |
5132 | kn->kn_fflags |= hint; |
5133 | } |
5134 | |
5135 | return kn->kn_fflags != 0; |
5136 | } |
5137 | |
5138 | static int |
5139 | filt_fstouch(struct knote *kn, struct kevent_qos_s *kev) |
5140 | { |
5141 | int res; |
5142 | |
	lck_mtx_lock(&fs_klist_lock);
5144 | |
5145 | kn->kn_sfflags = kev->fflags; |
5146 | |
5147 | /* |
5148 | * the above filter function sets bits even if nobody is looking for them. |
5149 | * Just preserve those bits even in the new mask is more selective |
5150 | * than before. |
5151 | * |
5152 | * For compatibility with previous implementations, we leave kn_fflags |
5153 | * as they were before. |
5154 | */ |
5155 | //if (kn->kn_sfflags) |
5156 | // kn->kn_fflags &= kn->kn_sfflags; |
5157 | res = (kn->kn_fflags != 0); |
5158 | |
	lck_mtx_unlock(&fs_klist_lock);
5160 | |
5161 | return res; |
5162 | } |
5163 | |
5164 | static int |
5165 | filt_fsprocess(struct knote *kn, struct kevent_qos_s *kev) |
5166 | { |
5167 | int res = 0; |
5168 | |
	lck_mtx_lock(&fs_klist_lock);
	if (kn->kn_fflags) {
		knote_fill_kevent(kn, kev, 0);
		res = 1;
	}
	lck_mtx_unlock(&fs_klist_lock);
5175 | return res; |
5176 | } |
5177 | |
5178 | static int |
5179 | sysctl_vfs_noremotehang(__unused struct sysctl_oid *oidp, |
5180 | __unused void *arg1, __unused int arg2, struct sysctl_req *req) |
5181 | { |
5182 | int out, error; |
5183 | pid_t pid; |
5184 | proc_t p; |
5185 | |
5186 | /* We need a pid. */ |
5187 | if (req->newptr == USER_ADDR_NULL) { |
5188 | return EINVAL; |
5189 | } |
5190 | |
5191 | error = SYSCTL_IN(req, &pid, sizeof(pid)); |
5192 | if (error) { |
5193 | return error; |
5194 | } |
5195 | |
	p = proc_find(pid < 0 ? -pid : pid);
5197 | if (p == NULL) { |
5198 | return ESRCH; |
5199 | } |
5200 | |
5201 | /* |
5202 | * Fetching the value is ok, but we only fetch if the old |
5203 | * pointer is given. |
5204 | */ |
5205 | if (req->oldptr != USER_ADDR_NULL) { |
5206 | out = !((p->p_flag & P_NOREMOTEHANG) == 0); |
5207 | proc_rele(p); |
5208 | error = SYSCTL_OUT(req, &out, sizeof(out)); |
5209 | return error; |
5210 | } |
5211 | |
5212 | /* cansignal offers us enough security. */ |
	if (p != req->p && proc_suser(req->p) != 0) {
5214 | proc_rele(p); |
5215 | return EPERM; |
5216 | } |
5217 | |
5218 | if (pid < 0) { |
5219 | OSBitAndAtomic(~((uint32_t)P_NOREMOTEHANG), &p->p_flag); |
5220 | } else { |
5221 | OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag); |
5222 | } |
5223 | proc_rele(p); |
5224 | |
5225 | return 0; |
5226 | } |
5227 | |
5228 | static int |
5229 | sysctl_vfs_generic_conf SYSCTL_HANDLER_ARGS |
5230 | { |
5231 | int *name, namelen; |
5232 | struct vfstable *vfsp; |
5233 | struct vfsconf vfsc = {}; |
5234 | |
5235 | (void)oidp; |
5236 | name = arg1; |
5237 | namelen = arg2; |
5238 | |
5239 | if (namelen < 1) { |
5240 | return EISDIR; |
5241 | } else if (namelen > 1) { |
5242 | return ENOTDIR; |
5243 | } |
5244 | |
5245 | mount_list_lock(); |
5246 | for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { |
5247 | if (vfsp->vfc_typenum == name[0]) { |
5248 | break; |
5249 | } |
5250 | } |
5251 | |
5252 | if (vfsp == NULL) { |
5253 | mount_list_unlock(); |
5254 | return ENOTSUP; |
5255 | } |
5256 | |
5257 | vfsc.vfc_reserved1 = 0; |
	bcopy(vfsp->vfc_name, vfsc.vfc_name, sizeof(vfsc.vfc_name));
5259 | vfsc.vfc_typenum = vfsp->vfc_typenum; |
5260 | vfsc.vfc_refcount = vfsp->vfc_refcount; |
5261 | vfsc.vfc_flags = vfsp->vfc_flags; |
5262 | vfsc.vfc_reserved2 = 0; |
5263 | vfsc.vfc_reserved3 = 0; |
5264 | |
5265 | mount_list_unlock(); |
5266 | return SYSCTL_OUT(req, &vfsc, sizeof(struct vfsconf)); |
5267 | } |
5268 | |
5269 | /* the vfs.generic. branch. */ |
5270 | SYSCTL_EXTENSIBLE_NODE(_vfs, VFS_GENERIC, generic, |
    CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs generic hinge");
/* retrieve a list of mounted filesystem fsid_t */
SYSCTL_PROC(_vfs_generic, OID_AUTO, vfsidlist,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    NULL, 0, sysctl_vfs_vfslist, "S,fsid", "List of mounted filesystem ids");
/* perform operations on filesystem via fsid_t */
SYSCTL_NODE(_vfs_generic, OID_AUTO, ctlbyfsid, CTLFLAG_RW | CTLFLAG_LOCKED,
    sysctl_vfs_ctlbyfsid, "ctlbyfsid");
SYSCTL_PROC(_vfs_generic, OID_AUTO, noremotehang, CTLFLAG_RW | CTLFLAG_ANYBODY,
    NULL, 0, sysctl_vfs_noremotehang, "I", "noremotehang");
SYSCTL_INT(_vfs_generic, VFS_MAXTYPENUM, maxtypenum,
    CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
    &maxvfstypenum, 0, "");
SYSCTL_INT(_vfs_generic, OID_AUTO, sync_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, &sync_timeout_seconds, 0, "");
SYSCTL_NODE(_vfs_generic, VFS_CONF, conf,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    sysctl_vfs_generic_conf, "");
5288 | #if DEVELOPMENT || DEBUG |
5289 | SYSCTL_INT(_vfs_generic, OID_AUTO, print_busy_vnodes, |
5290 | CTLTYPE_INT | CTLFLAG_RW, |
5291 | &print_busy_vnodes, 0, |
5292 | "VFS log busy vnodes blocking unmount" ); |
5293 | #endif |
5294 | |
5295 | /* Indicate that the root file system unmounted cleanly */ |
5296 | static int vfs_root_unmounted_cleanly = 0; |
SYSCTL_INT(_vfs_generic, OID_AUTO, root_unmounted_cleanly, CTLFLAG_RD, &vfs_root_unmounted_cleanly, 0, "Root filesystem was unmounted cleanly");
5298 | |
5299 | void |
5300 | vfs_set_root_unmounted_cleanly(void) |
5301 | { |
5302 | vfs_root_unmounted_cleanly = 1; |
5303 | } |
5304 | |
5305 | /* |
5306 | * Print vnode state. |
5307 | */ |
5308 | void |
5309 | vn_print_state(struct vnode *vp, const char *fmt, ...) |
5310 | { |
5311 | va_list ap; |
	char perm_str[] = "(VM_KERNEL_ADDRPERM pointer)";
5313 | char fs_name[MFSNAMELEN]; |
5314 | |
5315 | va_start(ap, fmt); |
5316 | vprintf(fmt, ap); |
5317 | va_end(ap); |
5318 | printf("vp 0x%0llx %s: " , (uint64_t)VM_KERNEL_ADDRPERM(vp), perm_str); |
5319 | printf("tag %d, type %d\n" , vp->v_tag, vp->v_type); |
5320 | /* Counts .. */ |
5321 | printf(" iocount %d, usecount %d, kusecount %d references %d\n" , |
5322 | vp->v_iocount, vp->v_usecount, vp->v_kusecount, vp->v_references); |
5323 | printf(" writecount %d, numoutput %d\n" , vp->v_writecount, |
5324 | vp->v_numoutput); |
5325 | /* Flags */ |
5326 | printf(" flag 0x%x, lflag 0x%x, listflag 0x%x\n" , vp->v_flag, |
5327 | vp->v_lflag, vp->v_listflag); |
5328 | |
5329 | if (vp->v_mount == NULL || vp->v_mount == dead_mountp) { |
		strlcpy(fs_name, "deadfs", MFSNAMELEN);
	} else {
		vfs_name(vp->v_mount, fs_name);
5333 | } |
5334 | |
5335 | printf(" v_data 0x%0llx %s\n" , |
5336 | (vp->v_data ? (uint64_t)VM_KERNEL_ADDRPERM(vp->v_data) : 0), |
5337 | perm_str); |
5338 | printf(" v_mount 0x%0llx %s vfs_name %s\n" , |
5339 | (vp->v_mount ? (uint64_t)VM_KERNEL_ADDRPERM(vp->v_mount) : 0), |
5340 | perm_str, fs_name); |
5341 | } |
5342 | |
5343 | long num_reusedvnodes = 0; |
5344 | |
5345 | |
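/*
 * Pull a candidate vnode off whichever list it is on and prepare it for
 * reuse. Called with the vnode_list_lock held; drops it. Returns NULLVP
 * if the candidate was lost to a race, was handed off to the async
 * worker (*deferred is set in that case), or if want_vp is zero.
 * Otherwise the reclaimed vnode is returned locked, with a holdcount
 * that the caller must drop.
 */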
5346 | static vnode_t |
5347 | process_vp(vnode_t vp, int want_vp, bool can_defer, int *deferred) |
5348 | { |
5349 | unsigned int vpid; |
5350 | |
5351 | *deferred = 0; |
5352 | |
5353 | vpid = vp->v_id; |
5354 | |
5355 | vnode_list_remove_locked(vp); |
5356 | |
5357 | vnode_hold(vp); |
5358 | vnode_list_unlock(); |
5359 | |
5360 | vnode_lock_spin(vp); |
5361 | |
5362 | /* |
	 * We may have had to wait for the vnode_lock after removing the vp from
	 * the freelist, and the vid is bumped only at the very end of reclaim.
	 * So it is possible that we are looking at a vnode that is being
	 * terminated. If so, skip it.
5366 | */ |
5367 | if ((vpid != vp->v_id) || (vp->v_usecount != 0) || (vp->v_iocount != 0) || |
5368 | VONLIST(vp) || (vp->v_lflag & VL_TERMINATE)) { |
5369 | /* |
5370 | * we lost the race between dropping the list lock |
5371 | * and picking up the vnode_lock... someone else |
5372 | * used this vnode and it is now in a new state |
5373 | */ |
5374 | vnode_drop_and_unlock(vp); |
5375 | |
5376 | return NULLVP; |
5377 | } |
5378 | if ((vp->v_lflag & (VL_NEEDINACTIVE | VL_MARKTERM)) == VL_NEEDINACTIVE) { |
5379 | /* |
5380 | * we did a vnode_rele_ext that asked for |
5381 | * us not to reenter the filesystem during |
5382 | * the release even though VL_NEEDINACTIVE was |
5383 | * set... we'll do it here by doing a |
5384 | * vnode_get/vnode_put |
5385 | * |
5386 | * pick up an iocount so that we can call |
5387 | * vnode_put and drive the VNOP_INACTIVE... |
5388 | * vnode_put will either leave us off |
5389 | * the freelist if a new ref comes in, |
5390 | * or put us back on the end of the freelist |
5391 | * or recycle us if we were marked for termination... |
5392 | * so we'll just go grab a new candidate |
5393 | */ |
5394 | vp->v_iocount++; |
5395 | #ifdef CONFIG_IOCOUNT_TRACE |
5396 | record_vp(vp, 1); |
5397 | #endif |
5398 | vnode_put_locked(vp); |
5399 | vnode_drop_and_unlock(vp); |
5400 | |
5401 | return NULLVP; |
5402 | } |
5403 | /* |
5404 | * Checks for anyone racing us for recycle |
5405 | */ |
5406 | if (vp->v_type != VBAD) { |
5407 | if ((want_vp || can_defer) && (vnode_on_reliable_media(vp) == FALSE || (vp->v_flag & VISDIRTY))) { |
5408 | vnode_async_list_add(vp); |
5409 | vnode_drop_and_unlock(vp); |
5410 | |
5411 | *deferred = 1; |
5412 | |
5413 | return NULLVP; |
5414 | } |
5415 | if (vp->v_lflag & VL_DEAD) { |
5416 | panic("new_vnode(%p): the vnode is VL_DEAD but not VBAD" , vp); |
5417 | } |
5418 | |
5419 | vnode_lock_convert(vp); |
5420 | (void)vnode_reclaim_internal(vp, 1, want_vp, 0); |
5421 | |
5422 | if (want_vp) { |
			if ((VONLIST(vp))) {
				panic("new_vnode(%p): vp on list", vp);
			}
			if (vp->v_usecount || vp->v_iocount || vp->v_kusecount ||
			    (vp->v_lflag & (VNAMED_UBC | VNAMED_MOUNT | VNAMED_FSHASH))) {
				panic("new_vnode(%p): free vnode still referenced", vp);
			}
			if ((vp->v_mntvnodes.tqe_prev != 0) && (vp->v_mntvnodes.tqe_next != 0)) {
				panic("new_vnode(%p): vnode seems to be on mount list", vp);
			}
			if (!LIST_EMPTY(&vp->v_nclinks) || !TAILQ_EMPTY(&vp->v_ncchildren)) {
				panic("new_vnode(%p): vnode still hooked into the name cache", vp);
			}
5435 | } |
5436 | } else { |
5437 | vnode_drop_and_unlock(vp); |
5438 | vp = NULLVP; |
5439 | } |
5440 | } |
5441 | return vp; |
5442 | } |
5443 | |
5444 | __attribute__((noreturn)) |
5445 | static void |
5446 | async_work_continue(void) |
5447 | { |
5448 | struct async_work_lst *q; |
5449 | int deferred; |
5450 | vnode_t vp; |
5451 | |
5452 | q = &vnode_async_work_list; |
5453 | |
5454 | for (;;) { |
5455 | vnode_list_lock(); |
5456 | |
5457 | if (TAILQ_EMPTY(q)) { |
			assert_wait(q, (THREAD_UNINT));

			vnode_list_unlock();

			thread_block((thread_continue_t)async_work_continue);

			continue;
		}
		async_work_handled++;

		vp = TAILQ_FIRST(q);

		vp = process_vp(vp, 0, false, &deferred);

		if (vp != NULLVP) {
			panic("found VBAD vp (%p) on async queue", vp);
5474 | } |
5475 | } |
5476 | } |
5477 | |
5478 | #if CONFIG_JETSAM |
5479 | bool do_async_jetsam = false; |
5480 | #endif |
5481 | |
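/*
 * The laundry thread works the queues in priority order: vnodes queued
 * for async reclaim first, then the rapid-age queue, then the regular
 * free list. It parks on the free-list event when there is nothing to
 * do or enough dead vnodes are already available.
 */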
5482 | __attribute__((noreturn)) |
5483 | static void |
5484 | vn_laundry_continue(void) |
5485 | { |
5486 | struct freelst *free_q; |
5487 | struct ragelst *rage_q; |
5488 | vnode_t vp; |
5489 | int deferred; |
5490 | bool rage_q_empty; |
5491 | bool free_q_empty; |
5492 | |
5493 | |
5494 | free_q = &vnode_free_list; |
5495 | rage_q = &vnode_rage_list; |
5496 | |
5497 | for (;;) { |
5498 | vnode_list_lock(); |
5499 | |
5500 | #if CONFIG_JETSAM |
5501 | if (do_async_jetsam) { |
5502 | do_async_jetsam = false; |
5503 | if (deadvnodes <= deadvnodes_low) { |
5504 | vnode_list_unlock(); |
5505 | |
5506 | log(LOG_EMERG, "Initiating vnode jetsam : %d desired, %ld numvnodes, " |
5507 | "%ld free, %ld dead, %ld async, %d rage\n" , |
5508 | desiredvnodes, numvnodes, freevnodes, deadvnodes, async_work_vnodes, ragevnodes); |
5509 | |
5510 | memorystatus_kill_on_vnode_limit(); |
5511 | |
5512 | continue; |
5513 | } |
5514 | } |
5515 | #endif |
5516 | |
5517 | if (!TAILQ_EMPTY(&vnode_async_work_list)) { |
5518 | vp = TAILQ_FIRST(&vnode_async_work_list); |
5519 | async_work_handled++; |
5520 | |
			vp = process_vp(vp, 0, false, &deferred);

			if (vp != NULLVP) {
				panic("found VBAD vp (%p) on async queue", vp);
5525 | } |
5526 | continue; |
5527 | } |
5528 | |
5529 | free_q_empty = TAILQ_EMPTY(free_q); |
5530 | rage_q_empty = TAILQ_EMPTY(rage_q); |
5531 | |
5532 | if (!rage_q_empty && !free_q_empty) { |
5533 | struct timeval current_tv; |
5534 | |
			microuptime(&current_tv);
5536 | if (ragevnodes < rage_limit && |
5537 | ((current_tv.tv_sec - rage_tv.tv_sec) < RAGE_TIME_LIMIT)) { |
5538 | rage_q_empty = true; |
5539 | } |
5540 | } |
5541 | |
5542 | if (numvnodes < numvnodes_min || (rage_q_empty && free_q_empty) || |
5543 | (reusablevnodes <= reusablevnodes_max && deadvnodes >= deadvnodes_high)) { |
			assert_wait(free_q, (THREAD_UNINT));

			vnode_list_unlock();

			thread_block((thread_continue_t)vn_laundry_continue);
5549 | |
5550 | continue; |
5551 | } |
5552 | |
5553 | if (!rage_q_empty) { |
5554 | vp = TAILQ_FIRST(rage_q); |
5555 | } else { |
5556 | vp = TAILQ_FIRST(free_q); |
5557 | } |
5558 | |
		vp = process_vp(vp, 0, false, &deferred);
5560 | |
5561 | if (vp != NULLVP) { |
5562 | /* If process_vp returns a vnode, it is locked and has a holdcount */ |
5563 | vnode_drop_and_unlock(vp); |
5564 | vp = NULLVP; |
5565 | } |
5566 | } |
5567 | } |
5568 | |
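/*
 * Wake the laundry thread when dead vnodes have been queued for
 * deallocation, or when the dead pool is running low while the table has
 * grown past its floor and reusable vnodes are plentiful (or the table
 * has reached its desired size).
 */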
5569 | static inline void |
5570 | wakeup_laundry_thread() |
5571 | { |
5572 | if (deadvnodes_noreuse || (numvnodes >= numvnodes_min && deadvnodes < deadvnodes_low && |
5573 | (reusablevnodes > reusablevnodes_max || numvnodes >= desiredvnodes))) { |
		wakeup(&vnode_free_list);
5575 | } |
5576 | } |
5577 | |
5578 | /* |
5579 | * This must be called under vnode_list_lock() to prevent race when accessing |
5580 | * various vnode stats. |
5581 | */ |
5582 | static void |
5583 | send_freeable_vnodes_telemetry(void) |
5584 | { |
5585 | bool send_event = false; |
5586 | |
5587 | /* |
5588 | * Log an event when the 'numvnodes' is above the freeable vnodes threshold |
5589 | * or when it falls back within the threshold. |
5590 | * When the 'numvnodes' is above the threshold, log an event when it has |
5591 | * been incrementally growing by 25%. |
5592 | */ |
5593 | if ((numvnodes > desiredvnodes) && (freevnodes + deadvnodes) == 0) { |
5594 | long last_numvnodes = freeable_vnodes_telemetry.numvnodes; |
5595 | |
5596 | if (numvnodes > (last_numvnodes + ((last_numvnodes * 25) / 100)) || |
5597 | numvnodes >= numvnodes_max) { |
5598 | send_event = true; |
5599 | } |
5600 | freeablevnodes_threshold_crossed = true; |
5601 | } else if (freeablevnodes_threshold_crossed && |
5602 | (freevnodes + deadvnodes) > busyvnodes) { |
5603 | freeablevnodes_threshold_crossed = false; |
5604 | send_event = true; |
5605 | } |
5606 | |
5607 | if (__improbable(send_event)) { |
5608 | ca_event_t event = CA_EVENT_ALLOCATE_FLAGS(freeable_vnodes, Z_NOWAIT); |
5609 | |
5610 | if (event) { |
5611 | /* |
5612 | * Update the stats except the 'numvnodes_max' and 'desiredvnodes' |
5613 | * as they are immutable after init. |
5614 | */ |
5615 | freeable_vnodes_telemetry.numvnodes_min = numvnodes_min; |
5616 | freeable_vnodes_telemetry.numvnodes = numvnodes; |
5617 | freeable_vnodes_telemetry.freevnodes = freevnodes; |
5618 | freeable_vnodes_telemetry.deadvnodes = deadvnodes; |
5619 | freeable_vnodes_telemetry.freeablevnodes = freeablevnodes; |
5620 | freeable_vnodes_telemetry.busyvnodes = busyvnodes; |
5621 | freeable_vnodes_telemetry.threshold_crossed = |
5622 | freeablevnodes_threshold_crossed; |
5623 | |
			memcpy(event->data, &freeable_vnodes_telemetry,
			    sizeof(CA_EVENT_TYPE(freeable_vnodes)));
5626 | |
5627 | if (!freeablevnodes_threshold_crossed) { |
5628 | freeable_vnodes_telemetry.numvnodes = 0; |
5629 | } |
5630 | CA_EVENT_SEND(event); |
5631 | } |
5632 | } |
5633 | } |
5634 | |
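/*
 * Allocate or reuse a vnode, returned with an iocount of one. Roughly in
 * order of preference: reuse one from the dead list, allocate a fresh one
 * while under the desiredvnodes limit, steal one from the rapid-age or
 * free lists, and as a last resort force an allocation (possibly after
 * asking jetsam to kill something) rather than fail.
 */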
5635 | static int |
5636 | new_vnode(vnode_t *vpp, bool can_free) |
5637 | { |
5638 | long force_alloc_min; |
5639 | vnode_t vp; |
5640 | #if CONFIG_JETSAM |
	uint32_t retries = 0, max_retries = 2; /* retry in case of tablefull */
#else
	uint32_t retries = 0, max_retries = 100; /* retry in case of tablefull */
5644 | #endif |
5645 | int force_alloc = 0, walk_count = 0; |
5646 | boolean_t need_reliable_vp = FALSE; |
5647 | int deferred; |
5648 | struct timeval initial_tv; |
5649 | struct timeval current_tv; |
5650 | proc_t curproc = current_proc(); |
5651 | bool force_alloc_freeable = false; |
5652 | |
5653 | if (vn_dealloc_level == DEALLOC_VNODE_NONE) { |
5654 | can_free = false; |
5655 | } |
5656 | |
5657 | initial_tv.tv_sec = 0; |
5658 | retry: |
5659 | vp = NULLVP; |
5660 | |
5661 | vnode_list_lock(); |
5662 | newvnode++; |
5663 | |
5664 | if (need_reliable_vp == TRUE) { |
5665 | async_work_timed_out++; |
5666 | } |
5667 | |
5668 | /* |
5669 | * The vnode list lock was dropped after force_alloc_freeable was set, |
5670 | * reevaluate. |
5671 | */ |
5672 | force_alloc_min = MAX(desiredvnodes, numvnodes_min); |
5673 | if (force_alloc_freeable && |
5674 | (numvnodes < force_alloc_min || numvnodes >= numvnodes_max)) { |
5675 | force_alloc_freeable = false; |
5676 | } |
5677 | |
5678 | #if CONFIG_JETSAM |
5679 | if ((numvnodes_max > desiredvnodes) && numvnodes > (numvnodes_max - 100) |
5680 | #if (DEVELOPMENT || DEBUG) |
5681 | && !bootarg_no_vnode_jetsam |
5682 | #endif |
5683 | ) { |
5684 | do_async_jetsam = true; |
5685 | wakeup(&vnode_free_list); |
5686 | } |
5687 | #endif /* CONFIG_JETSAM */ |
5688 | |
5689 | if (((numvnodes - deadvnodes + deadvnodes_noreuse) < desiredvnodes) || |
5690 | force_alloc || force_alloc_freeable) { |
5691 | struct timespec ts; |
5692 | uint32_t vflag = 0; |
5693 | |
5694 | /* |
5695 | * Can always reuse a dead one except if it is in the process of |
5696 | * being freed or the FS cannot handle freeable vnodes. |
5697 | */ |
5698 | if (!TAILQ_EMPTY(&vnode_dead_list)) { |
5699 | /* Select an appropriate deadvnode */ |
5700 | if (numvnodes <= numvnodes_min || !can_free) { |
				/* all vnodes up to numvnodes_min are not freeable */
5702 | vp = TAILQ_FIRST(&vnode_dead_list); |
5703 | if (numvnodes > numvnodes_min && |
5704 | (vp->v_flag & VCANDEALLOC)) { |
5705 | /* |
5706 | * Freeable vnodes are added to the |
5707 | * back of the queue, so if the first |
5708 | * from the front is freeable, then |
5709 | * there are none on the dead list. |
5710 | */ |
5711 | vp = NULLVP; |
5712 | } |
5713 | } else { |
5714 | /* |
5715 | * Filesystems which opt in to freeable vnodes |
5716 | * can get either one. |
5717 | */ |
5718 | TAILQ_FOREACH_REVERSE(vp, &vnode_dead_list, |
5719 | deadlst, v_freelist) { |
5720 | if (!(vp->v_listflag & VLIST_NO_REUSE)) { |
5721 | break; |
5722 | } |
5723 | } |
5724 | } |
5725 | |
5726 | if (vp) { |
5727 | force_alloc_freeable = false; |
5728 | goto steal_this_vp; |
5729 | } |
5730 | } |
5731 | |
5732 | /* |
5733 | * no dead vnodes available... if we're under |
5734 | * the limit, we'll create a new vnode |
5735 | */ |
5736 | numvnodes++; |
5737 | if (force_alloc) { |
5738 | numvnodes_min++; |
5739 | } else if (can_free && (numvnodes > numvnodes_min)) { |
5740 | allocedvnodes++; |
5741 | freeablevnodes++; |
5742 | vflag = VCANDEALLOC; |
5743 | |
5744 | send_freeable_vnodes_telemetry(); |
5745 | } |
5746 | vnode_list_unlock(); |
5747 | |
5748 | if (nc_smr_enabled) { |
5749 | vp = zalloc_smr(vnode_zone, Z_WAITOK_ZERO_NOFAIL); |
5750 | } else { |
5751 | vp = zalloc_flags(vnode_zone, Z_WAITOK_ZERO_NOFAIL); |
5752 | } |
5753 | |
5754 | VLISTNONE(vp); /* avoid double queue removal */ |
		lck_mtx_init(&vp->v_lock, &vnode_lck_grp, &vnode_lck_attr);

		TAILQ_INIT(&vp->v_ncchildren);

		klist_init(&vp->v_knotes);
		nanouptime(&ts);
5761 | vp->v_id = (uint32_t)ts.tv_nsec; |
5762 | vp->v_flag = VSTANDARD | vflag; |
5763 | if (force_alloc_freeable) { |
5764 | /* This vnode should be recycled and freed immediately */ |
5765 | vp->v_lflag = VL_MARKTERM; |
5766 | vp->v_listflag = VLIST_NO_REUSE; |
5767 | } |
5768 | |
5769 | if (vflag & VCANDEALLOC) { |
5770 | os_atomic_inc(&busyvnodes, relaxed); |
5771 | } |
5772 | |
5773 | #if CONFIG_MACF |
5774 | if (mac_vnode_label_init_needed(vp)) { |
5775 | mac_vnode_label_init(vp); |
5776 | } |
5777 | #endif /* MAC */ |
5778 | |
5779 | #if CONFIG_IOCOUNT_TRACE |
5780 | if (__improbable(bootarg_vnode_iocount_trace)) { |
5781 | vp->v_iocount_trace = (vnode_iocount_trace_t)zalloc_permanent( |
5782 | IOCOUNT_TRACE_MAX_TYPES * sizeof(struct vnode_iocount_trace), |
5783 | ZALIGN(struct vnode_iocount_trace)); |
5784 | } |
5785 | #endif /* CONFIG_IOCOUNT_TRACE */ |
5786 | |
5787 | #if CONFIG_FILE_LEASES |
5788 | LIST_INIT(&vp->v_leases); |
5789 | #endif |
5790 | |
5791 | vp->v_iocount = 1; |
5792 | |
5793 | goto done; |
5794 | } |
5795 | |
	microuptime(&current_tv);
5797 | |
5798 | #define MAX_WALK_COUNT 1000 |
5799 | |
5800 | if (!TAILQ_EMPTY(&vnode_rage_list) && |
5801 | (ragevnodes >= rage_limit || |
5802 | (current_tv.tv_sec - rage_tv.tv_sec) >= RAGE_TIME_LIMIT)) { |
5803 | TAILQ_FOREACH(vp, &vnode_rage_list, v_freelist) { |
5804 | if (!(vp->v_listflag & VLIST_RAGE)) { |
5805 | panic("new_vnode: vp (%p) on RAGE list not marked VLIST_RAGE" , vp); |
5806 | } |
5807 | |
5808 | // if we're a dependency-capable process, skip vnodes that can |
5809 | // cause recycling deadlocks. (i.e. this process is diskimages |
5810 | // helper and the vnode is in a disk image). Querying the |
5811 | // mnt_kern_flag for the mount's virtual device status |
5812 | // is safer than checking the mnt_dependent_process, which |
5813 | // may not be updated if there are multiple devnode layers |
5814 | // in between the disk image and the final consumer. |
5815 | |
5816 | if (((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL || |
5817 | (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) && |
5818 | !(vp->v_listflag & VLIST_NO_REUSE) && |
5819 | (can_free || !(vp->v_flag & VCANDEALLOC))) { |
5820 | /* |
5821 | * if need_reliable_vp == TRUE, then we've already sent one or more |
5822 | * non-reliable vnodes to the async thread for processing and timed |
5823 | * out waiting for a dead vnode to show up. Use the MAX_WALK_COUNT |
5824 | * mechanism to first scan for a reliable vnode before forcing |
5825 | * a new vnode to be created |
5826 | */ |
5827 | if (need_reliable_vp == FALSE || vnode_on_reliable_media(vp) == TRUE) { |
5828 | break; |
5829 | } |
5830 | } |
5831 | |
5832 | // don't iterate more than MAX_WALK_COUNT vnodes to |
5833 | // avoid keeping the vnode list lock held for too long. |
5834 | |
5835 | if (walk_count++ > MAX_WALK_COUNT) { |
5836 | vp = NULL; |
5837 | break; |
5838 | } |
5839 | } |
5840 | } |
5841 | |
5842 | if (vp == NULL && !TAILQ_EMPTY(&vnode_free_list)) { |
5843 | /* |
5844 | * Pick the first vp for possible reuse |
5845 | */ |
5846 | walk_count = 0; |
5847 | TAILQ_FOREACH(vp, &vnode_free_list, v_freelist) { |
5848 | // if we're a dependency-capable process, skip vnodes that can |
5849 | // cause recycling deadlocks. (i.e. this process is diskimages |
5850 | // helper and the vnode is in a disk image). Querying the |
5851 | // mnt_kern_flag for the mount's virtual device status |
5852 | // is safer than checking the mnt_dependent_process, which |
5853 | // may not be updated if there are multiple devnode layers |
5854 | // in between the disk image and the final consumer. |
5855 | |
5856 | if (((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL || |
5857 | (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) && |
5858 | !(vp->v_listflag & VLIST_NO_REUSE) && |
5859 | (can_free || !(vp->v_flag & VCANDEALLOC))) { |
5860 | /* |
5861 | * if need_reliable_vp == TRUE, then we've already sent one or more |
5862 | * non-reliable vnodes to the async thread for processing and timed |
5863 | * out waiting for a dead vnode to show up. Use the MAX_WALK_COUNT |
5864 | * mechanism to first scan for a reliable vnode before forcing |
5865 | * a new vnode to be created |
5866 | */ |
5867 | if (need_reliable_vp == FALSE || vnode_on_reliable_media(vp) == TRUE) { |
5868 | break; |
5869 | } |
5870 | } |
5871 | |
5872 | // don't iterate more than MAX_WALK_COUNT vnodes to |
5873 | // avoid keeping the vnode list lock held for too long. |
5874 | |
5875 | if (walk_count++ > MAX_WALK_COUNT) { |
5876 | vp = NULL; |
5877 | break; |
5878 | } |
5879 | } |
5880 | } |
5881 | |
5882 | // |
5883 | // if we don't have a vnode and the walk_count is >= MAX_WALK_COUNT |
5884 | // then we're trying to create a vnode on behalf of a |
5885 | // process like diskimages-helper that has file systems |
5886 | // mounted on top of itself (and thus we can't reclaim |
5887 | // vnodes in the file systems on top of us). if we can't |
5888 | // find a vnode to reclaim then we'll just have to force |
5889 | // the allocation. |
5890 | // |
5891 | if (vp == NULL && walk_count >= MAX_WALK_COUNT) { |
5892 | force_alloc = 1; |
5893 | vnode_list_unlock(); |
5894 | goto retry; |
5895 | } |
5896 | |
5897 | if (vp == NULL) { |
5898 | if (can_free && (vn_dealloc_level > DEALLOC_VNODE_NONE) && |
5899 | (numvnodes >= force_alloc_min) && (numvnodes < numvnodes_max)) { |
5900 | force_alloc_freeable = true; |
5901 | vnode_list_unlock(); |
5902 | goto retry; |
5903 | } |
5904 | vnode_list_unlock(); |
5905 | |
5906 | /* |
5907 | * we've reached the system imposed maximum number of vnodes |
5908 | * but there isn't a single one available |
5909 | * wait a bit and then retry... if we can't get a vnode |
		 * after our target number of retries, then log a complaint
5911 | */ |
5912 | if (++retries <= max_retries) { |
			delay_for_interval(1, 1000 * 1000);
5914 | goto retry; |
5915 | } |
5916 | |
5917 | tablefull("vnode" ); |
5918 | log(LOG_EMERG, "%d desired, %ld numvnodes, " |
5919 | "%ld free, %ld dead, %ld async, %d rage\n" , |
5920 | desiredvnodes, numvnodes, freevnodes, deadvnodes, async_work_vnodes, ragevnodes); |
5921 | |
5922 | #if CONFIG_JETSAM |
5923 | /* |
5924 | * Running out of vnodes tends to make a system unusable. Start killing |
5925 | * processes that jetsam knows are killable. |
5926 | */ |
5927 | if (memorystatus_kill_on_vnode_limit() == FALSE |
5928 | #if DEVELOPMENT || DEBUG |
5929 | || bootarg_no_vnode_jetsam |
5930 | #endif |
5931 | ) { |
5932 | /* |
5933 | * If jetsam can't find any more processes to kill and there |
5934 | * still aren't any free vnodes, panic. Hopefully we'll get a |
5935 | * panic log to tell us why we ran out. |
5936 | */ |
5937 | panic("vnode table is full" ); |
5938 | } |
5939 | |
5940 | /* |
5941 | * Now that we've killed someone, wait a bit and continue looking |
5942 | */ |
5943 | delay_for_interval(3, 1000 * 1000); |
5944 | retries = 0; |
5945 | goto retry; |
5946 | #endif |
5947 | |
5948 | *vpp = NULL; |
5949 | return ENFILE; |
5950 | } |
5951 | newvnode_nodead++; |
5952 | steal_this_vp: |
	if ((vp = process_vp(vp, 1, true, &deferred)) == NULLVP) {
5954 | if (deferred) { |
5955 | int elapsed_msecs; |
5956 | struct timeval elapsed_tv; |
5957 | |
5958 | if (initial_tv.tv_sec == 0) { |
				microuptime(&initial_tv);
5960 | } |
5961 | |
5962 | vnode_list_lock(); |
5963 | |
5964 | dead_vnode_waited++; |
5965 | dead_vnode_wanted++; |
5966 | |
5967 | /* |
5968 | * note that we're only going to explicitly wait 10ms |
5969 | * for a dead vnode to become available, since even if one |
5970 | * isn't available, a reliable vnode might now be available |
5971 | * at the head of the VRAGE or free lists... if so, we |
			 * can satisfy the new_vnode request with less latency than waiting
5973 | * for the full 100ms duration we're ultimately willing to tolerate |
5974 | */ |
			assert_wait_timeout((caddr_t)&dead_vnode_wanted, (THREAD_INTERRUPTIBLE), 10000, NSEC_PER_USEC);
5976 | |
5977 | vnode_list_unlock(); |
5978 | |
5979 | thread_block(THREAD_CONTINUE_NULL); |
5980 | |
			microuptime(&elapsed_tv);

			timevalsub(&elapsed_tv, &initial_tv);
5984 | elapsed_msecs = (int)(elapsed_tv.tv_sec * 1000 + elapsed_tv.tv_usec / 1000); |
5985 | |
5986 | if (elapsed_msecs >= 100) { |
5987 | /* |
5988 | * we've waited long enough... 100ms is |
5989 | * somewhat arbitrary for this case, but the |
5990 | * normal worst case latency used for UI |
5991 | * interaction is 100ms, so I've chosen to |
5992 | * go with that. |
5993 | * |
5994 | * setting need_reliable_vp to TRUE |
5995 | * forces us to find a reliable vnode |
5996 | * that we can process synchronously, or |
5997 | * to create a new one if the scan for |
5998 | * a reliable one hits the scan limit |
5999 | */ |
6000 | need_reliable_vp = TRUE; |
6001 | } |
6002 | } |
6003 | goto retry; |
6004 | } |
6005 | OSAddAtomicLong(1, &num_reusedvnodes); |
6006 | |
6007 | |
6008 | #if CONFIG_MACF |
6009 | /* |
	 * We should never see VL_LABELWAIT or VL_LABEL here,
	 * as those operations hold a reference.
6012 | */ |
6013 | assert((vp->v_lflag & VL_LABELWAIT) != VL_LABELWAIT); |
6014 | assert((vp->v_lflag & VL_LABEL) != VL_LABEL); |
6015 | if (vp->v_lflag & VL_LABELED || mac_vnode_label(vp) != NULL) { |
6016 | vnode_lock_convert(vp); |
6017 | mac_vnode_label_recycle(vp); |
6018 | } else if (mac_vnode_label_init_needed(vp)) { |
6019 | vnode_lock_convert(vp); |
6020 | mac_vnode_label_init(vp); |
6021 | } |
6022 | |
6023 | #endif /* MAC */ |
6024 | |
6025 | vp->v_iocount = 1; |
6026 | vp->v_lflag = 0; |
6027 | vp->v_writecount = 0; |
6028 | vp->v_references = 0; |
6029 | vp->v_iterblkflags = 0; |
6030 | if (can_free && (vp->v_flag & VCANDEALLOC)) { |
6031 | vp->v_flag = VSTANDARD | VCANDEALLOC; |
6032 | } else { |
6033 | vp->v_flag = VSTANDARD; |
6034 | } |
6035 | |
6036 | /* vbad vnodes can point to dead_mountp */ |
6037 | vp->v_mount = NULL; |
6038 | vp->v_defer_reclaimlist = (vnode_t)0; |
6039 | |
6040 | /* process_vp returns a locked vnode with a holdcount */ |
6041 | vnode_drop_and_unlock(vp); |
6042 | |
6043 | done: |
6044 | *vpp = vp; |
6045 | |
6046 | return 0; |
6047 | } |
6048 | |
6049 | void |
6050 | vnode_lock(vnode_t vp) |
6051 | { |
	lck_mtx_lock(&vp->v_lock);
6053 | } |
6054 | |
6055 | void |
6056 | vnode_lock_spin(vnode_t vp) |
6057 | { |
	lck_mtx_lock_spin(&vp->v_lock);
6059 | } |
6060 | |
6061 | void |
6062 | vnode_unlock(vnode_t vp) |
6063 | { |
	lck_mtx_unlock(&vp->v_lock);
6065 | } |
6066 | |
6067 | void |
6068 | vnode_hold(vnode_t vp) |
6069 | { |
6070 | int32_t old_holdcount = os_atomic_inc_orig(&vp->v_holdcount, relaxed); |
6071 | |
6072 | if (old_holdcount == INT32_MAX) { |
6073 | /* |
6074 | * Because we allow atomic ops on the holdcount it is |
6075 | * possible that when the vnode is examined, its holdcount |
6076 | * is different than what will be printed in this |
6077 | * panic message. |
6078 | */ |
6079 | panic("%s: vp %p holdcount overflow from : %d v_tag = %d, v_type = %d, v_flag = %x." , |
6080 | __FUNCTION__, vp, old_holdcount, vp->v_tag, vp->v_type, vp->v_flag); |
6081 | } |
6082 | } |
6083 | |
6084 | #define VNODE_HOLD_NO_SMR (1<<29) /* Disable vnode_hold_smr */ |
6085 | |
6086 | /* |
6087 | * To be used when smr is the only protection (cache_lookup and cache_lookup_path) |
6088 | */ |
6089 | bool |
6090 | vnode_hold_smr(vnode_t vp) |
6091 | { |
6092 | int32_t holdcount; |
6093 | |
6094 | /* |
6095 | * For "high traffic" vnodes like rootvnode, the atomic |
6096 | * cmpexcg loop below can turn into a infinite loop, no need |
6097 | * to do it for vnodes that won't be dealloc'ed |
6098 | */ |
6099 | if (!(os_atomic_load(&vp->v_flag, relaxed) & VCANDEALLOC)) { |
6100 | vnode_hold(vp); |
6101 | return true; |
6102 | } |
6103 | |
6104 | for (;;) { |
6105 | holdcount = os_atomic_load(&vp->v_holdcount, relaxed); |
6106 | |
6107 | if (holdcount & VNODE_HOLD_NO_SMR) { |
6108 | return false; |
6109 | } |
6110 | |
6111 | if ((os_atomic_cmpxchg(&vp->v_holdcount, holdcount, holdcount + 1, relaxed) != 0)) { |
6112 | return true; |
6113 | } |
6114 | } |
6115 | } |
6116 | |
6117 | /* |
6118 | * free callback from smr enabled zones |
6119 | */ |
6120 | static void |
6121 | vnode_smr_free(void *_vp, __unused size_t _size) |
6122 | { |
6123 | vnode_t vp = _vp; |
6124 | |
	bzero(vp, sizeof(*vp));
6126 | } |
6127 | |
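/*
 * Drop a holdcount. If this was the last hold on a dead, deallocatable
 * vnode, the vnode may be unlinked from the dead list and freed; NULLVP
 * is returned in that case and the caller must not touch vp again.
 * Otherwise vp is returned (unlocked).
 */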
6128 | static vnode_t |
6129 | vnode_drop_internal(vnode_t vp, bool locked) |
6130 | { |
6131 | int32_t old_holdcount = os_atomic_dec_orig(&vp->v_holdcount, relaxed); |
6132 | |
6133 | if (old_holdcount < 1) { |
6134 | if (locked) { |
6135 | vnode_unlock(vp); |
6136 | } |
6137 | |
6138 | /* |
6139 | * Because we allow atomic ops on the holdcount it is possible |
6140 | * that when the vnode is examined, its holdcount is different |
6141 | * than what will be printed in this panic message. |
6142 | */ |
6143 | panic("%s : vp %p holdcount -ve: %d. v_tag = %d, v_type = %d, v_flag = %x." , |
6144 | __FUNCTION__, vp, old_holdcount - 1, vp->v_tag, vp->v_type, vp->v_flag); |
6145 | } |
6146 | |
6147 | if (vn_dealloc_level == DEALLOC_VNODE_NONE || old_holdcount > 1 || |
6148 | !(vp->v_flag & VCANDEALLOC) || !(vp->v_lflag & VL_DEAD)) { |
6149 | if (locked) { |
6150 | vnode_unlock(vp); |
6151 | } |
6152 | return vp; |
6153 | } |
6154 | |
6155 | if (!locked) { |
6156 | vnode_lock(vp); |
6157 | } |
6158 | |
6159 | if ((os_atomic_load(&vp->v_holdcount, relaxed) != 0) || vp->v_iocount || |
6160 | vp->v_usecount || !(vp->v_flag & VCANDEALLOC) || !(vp->v_lflag & VL_DEAD)) { |
6161 | vnode_unlock(vp); |
6162 | return vp; |
6163 | } |
6164 | |
6165 | vnode_list_lock(); |
6166 | |
6167 | /* |
6168 | * the v_listflag field is protected by the vnode_list_lock |
6169 | */ |
6170 | if (VONLIST(vp) && (vp->v_listflag & VLIST_DEAD) && |
6171 | (numvnodes > desiredvnodes || (vp->v_listflag & VLIST_NO_REUSE) || |
6172 | vn_dealloc_level != DEALLOC_VNODE_ALL || deadvnodes >= deadvnodes_high) && |
6173 | (os_atomic_cmpxchg(&vp->v_holdcount, 0, VNODE_HOLD_NO_SMR, relaxed) != 0)) { |
6174 | VREMDEAD("vnode_list_remove" , vp); |
6175 | numvnodes--; |
6176 | freeablevnodes--; |
6177 | deallocedvnodes++; |
6178 | vp->v_listflag = 0; |
6179 | |
6180 | send_freeable_vnodes_telemetry(); |
6181 | vnode_list_unlock(); |
6182 | |
6183 | #if CONFIG_MACF |
6184 | struct label *tmpl = mac_vnode_label(vp); |
6185 | vp->v_label = NULL; |
6186 | #endif /* CONFIG_MACF */ |
6187 | |
6188 | vnode_unlock(vp); |
6189 | |
6190 | #if CONFIG_MACF |
6191 | if (tmpl) { |
			mac_vnode_label_free(tmpl);
6193 | } |
6194 | #endif /* CONFIG_MACF */ |
6195 | |
6196 | if (nc_smr_enabled) { |
6197 | zfree_smr(vnode_zone, vp); |
6198 | } else { |
6199 | zfree(vnode_zone, vp); |
6200 | } |
6201 | |
6202 | vp = NULLVP; |
6203 | } else { |
6204 | vnode_list_unlock(); |
6205 | vnode_unlock(vp); |
6206 | } |
6207 | |
6208 | return vp; |
6209 | } |
6210 | |
6211 | vnode_t |
6212 | vnode_drop_and_unlock(vnode_t vp) |
6213 | { |
6214 | return vnode_drop_internal(vp, true); |
6215 | } |
6216 | |
6217 | vnode_t |
6218 | vnode_drop(vnode_t vp) |
6219 | { |
6220 | return vnode_drop_internal(vp, false); |
6221 | } |
6222 | |
SYSCTL_NODE(_vfs, OID_AUTO, vnstats, CTLFLAG_RD | CTLFLAG_LOCKED, NULL, "vfs vnode stats");

SYSCTL_COMPAT_INT(_vfs_vnstats, OID_AUTO, vn_dealloc_level,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &vn_dealloc_level, 0, "");
SYSCTL_COMPAT_INT(_vfs_vnstats, OID_AUTO, desired_vnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &desiredvnodes, 0, "");
SYSCTL_LONG(_vfs_vnstats, OID_AUTO, num_vnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &numvnodes, "");
SYSCTL_COMPAT_INT(_vfs_vnstats, OID_AUTO, num_vnodes_min,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &numvnodes_min, 0, "");
SYSCTL_COMPAT_INT(_vfs_vnstats, OID_AUTO, num_vnodes_max,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &numvnodes_max, 0, "");
SYSCTL_COMPAT_INT(_vfs_vnstats, OID_AUTO, num_deallocable_vnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &freeablevnodes, 0, "");
SYSCTL_LONG(_vfs_vnstats, OID_AUTO, num_deallocable_busy_vnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &busyvnodes, "");
SYSCTL_LONG(_vfs_vnstats, OID_AUTO, num_dead_vnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &deadvnodes, "");
SYSCTL_LONG(_vfs_vnstats, OID_AUTO, num_dead_vnodes_to_dealloc,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &deadvnodes_noreuse, "");
SYSCTL_LONG(_vfs_vnstats, OID_AUTO, num_async_work_vnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &async_work_vnodes, "");
SYSCTL_COMPAT_INT(_vfs_vnstats, OID_AUTO, num_rapid_aging_vnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &ragevnodes, 0, "");
SYSCTL_LONG(_vfs_vnstats, OID_AUTO, num_free_vnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &freevnodes, "");
SYSCTL_LONG(_vfs_vnstats, OID_AUTO, num_recycledvnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &num_recycledvnodes, "");
SYSCTL_QUAD(_vfs_vnstats, OID_AUTO, num_allocedvnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &allocedvnodes, "");
SYSCTL_QUAD(_vfs_vnstats, OID_AUTO, num_deallocedvnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &deallocedvnodes, "");
SYSCTL_QUAD(_vfs_vnstats, OID_AUTO, num_newvnode_calls,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &newvnode, "");
SYSCTL_QUAD(_vfs_vnstats, OID_AUTO, num_newvnode_calls_nodead,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &newvnode_nodead, "");
6276 | |
6277 | int |
6278 | vnode_get(struct vnode *vp) |
6279 | { |
6280 | int retval; |
6281 | |
6282 | vnode_lock_spin(vp); |
6283 | retval = vnode_get_locked(vp); |
6284 | vnode_unlock(vp); |
6285 | |
6286 | return retval; |
6287 | } |
6288 | |
6289 | int |
6290 | vnode_get_locked(struct vnode *vp) |
6291 | { |
6292 | #if DIAGNOSTIC |
6293 | lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED); |
6294 | #endif |
6295 | if ((vp->v_iocount == 0) && (vp->v_lflag & (VL_TERMINATE | VL_DEAD))) { |
6296 | return ENOENT; |
6297 | } |
6298 | |
6299 | if (os_add_overflow(vp->v_iocount, 1, &vp->v_iocount)) { |
6300 | panic("v_iocount overflow" ); |
6301 | } |
6302 | |
6303 | #ifdef CONFIG_IOCOUNT_TRACE |
6304 | record_vp(vp, 1); |
6305 | #endif |
6306 | return 0; |
6307 | } |
6308 | |
6309 | /* |
6310 | * vnode_getwithvid() cuts in line in front of a vnode drain (that is, |
6311 | * while the vnode is draining, but at no point after that) to prevent |
6312 | * deadlocks when getting vnodes from filesystem hashes while holding |
6313 | * resources that may prevent other iocounts from being released. |
6314 | */ |
6315 | int |
6316 | vnode_getwithvid(vnode_t vp, uint32_t vid) |
6317 | { |
	return vget_internal(vp, vid, (VNODE_NODEAD | VNODE_WITHID | VNODE_DRAINO));
6319 | } |
6320 | |
6321 | /* |
6322 | * vnode_getwithvid_drainok() is like vnode_getwithvid(), but *does* block behind a vnode |
6323 | * drain; it exists for use in the VFS name cache, where we really do want to block behind |
6324 | * vnode drain to prevent holding off an unmount. |
6325 | */ |
6326 | int |
6327 | vnode_getwithvid_drainok(vnode_t vp, uint32_t vid) |
6328 | { |
	return vget_internal(vp, vid, (VNODE_NODEAD | VNODE_WITHID));
6330 | } |
6331 | |
6332 | int |
6333 | vnode_getwithref(vnode_t vp) |
6334 | { |
	return vget_internal(vp, 0, 0);
6336 | } |
6337 | |
6338 | __private_extern__ int |
6339 | vnode_getwithref_noblock(vnode_t vp) |
6340 | { |
	return vget_internal(vp, 0, VNODE_NOBLOCK);
6342 | } |
6343 | |
6344 | __private_extern__ int |
6345 | vnode_getalways(vnode_t vp) |
6346 | { |
	return vget_internal(vp, 0, VNODE_ALWAYS);
6348 | } |
6349 | |
6350 | __private_extern__ int |
vnode_getalways_from_pager(vnode_t vp)
{
	return vget_internal(vp, 0, VNODE_ALWAYS | VNODE_PAGER);
6354 | } |
6355 | |
6356 | static inline void |
6357 | vn_set_dead(vnode_t vp) |
6358 | { |
6359 | vp->v_mount = NULL; |
6360 | vp->v_op = dead_vnodeop_p; |
6361 | vp->v_tag = VT_NON; |
6362 | vp->v_data = NULL; |
6363 | vp->v_type = VBAD; |
6364 | vp->v_lflag |= VL_DEAD; |
6365 | } |
6366 | |
6367 | static int |
vnode_put_internal_locked(vnode_t vp, bool from_pager)
6369 | { |
6370 | vfs_context_t ctx = vfs_context_current(); /* hoist outside loop */ |
6371 | |
6372 | #if DIAGNOSTIC |
6373 | lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED); |
6374 | #endif |
6375 | retry: |
6376 | if (vp->v_iocount < 1) { |
6377 | panic("vnode_put(%p): iocount < 1" , vp); |
6378 | } |
6379 | |
6380 | if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) { |
6381 | vnode_dropiocount(vp); |
6382 | return 0; |
6383 | } |
6384 | |
6385 | if (((vp->v_lflag & (VL_DEAD | VL_NEEDINACTIVE)) == VL_NEEDINACTIVE)) { |
6386 | vp->v_lflag &= ~VL_NEEDINACTIVE; |
6387 | |
6388 | if (UBCINFOEXISTS(vp)) { |
6389 | ubc_cs_free_and_vnode_unlock(vp); |
6390 | } else { |
6391 | vnode_unlock(vp); |
6392 | } |
6393 | |
6394 | VNOP_INACTIVE(vp, ctx); |
6395 | |
6396 | vnode_lock_spin(vp); |
6397 | /* |
6398 | * because we had to drop the vnode lock before calling |
6399 | * VNOP_INACTIVE, the state of this vnode may have changed... |
	 * we may pick up both VL_MARKTERM and either
6401 | * an iocount or a usecount while in the VNOP_INACTIVE call |
6402 | * we don't want to call vnode_reclaim_internal on a vnode |
6403 | * that has active references on it... so loop back around |
6404 | * and reevaluate the state |
6405 | */ |
6406 | goto retry; |
6407 | } |
6408 | vp->v_lflag &= ~VL_NEEDINACTIVE; |
6409 | |
6410 | vnode_lock_convert(vp); |
6411 | if ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM) { |
6412 | if (from_pager) { |
6413 | /* |
6414 | * We can't initiate reclaim when called from the pager |
6415 | * because it will deadlock with itself so we hand it |
6416 | * off to the async cleaner thread. |
6417 | */ |
6418 | vnode_async_list_add(vp); |
6419 | } else { |
6420 | vnode_reclaim_internal(vp, 1, 1, 0); |
6421 | } |
6422 | } |
6423 | vnode_dropiocount(vp); |
6424 | vnode_list_add(vp); |
6425 | |
6426 | return 0; |
6427 | } |
6428 | |
6429 | int |
6430 | vnode_put_locked(vnode_t vp) |
6431 | { |
6432 | return vnode_put_internal_locked(vp, false); |
6433 | } |
6434 | |
6435 | int |
6436 | vnode_put(vnode_t vp) |
6437 | { |
6438 | int retval; |
6439 | |
6440 | vnode_lock_spin(vp); |
6441 | vnode_hold(vp); |
6442 | retval = vnode_put_internal_locked(vp, false); |
6443 | vnode_drop_and_unlock(vp); |
6444 | |
6445 | return retval; |
6446 | } |
6447 | |
6448 | int |
vnode_put_from_pager(vnode_t vp)
6450 | { |
6451 | int retval; |
6452 | |
6453 | vnode_lock_spin(vp); |
6454 | vnode_hold(vp); |
6455 | /* Cannot initiate reclaim while paging */ |
6456 | retval = vnode_put_internal_locked(vp, true); |
6457 | vnode_drop_and_unlock(vp); |
6458 | |
6459 | return retval; |
6460 | } |
6461 | |
6462 | int |
6463 | vnode_writecount(vnode_t vp) |
6464 | { |
6465 | return vp->v_writecount; |
6466 | } |
6467 | |
6468 | /* is vnode_t in use by others? */ |
6469 | int |
6470 | vnode_isinuse(vnode_t vp, int refcnt) |
6471 | { |
6472 | return vnode_isinuse_locked(vp, refcnt, 0); |
6473 | } |
6474 | |
6475 | int |
6476 | vnode_usecount(vnode_t vp) |
6477 | { |
6478 | return vp->v_usecount; |
6479 | } |
6480 | |
6481 | int |
6482 | vnode_iocount(vnode_t vp) |
6483 | { |
6484 | return vp->v_iocount; |
6485 | } |
6486 | |
6487 | int |
6488 | vnode_isinuse_locked(vnode_t vp, int refcnt, int locked) |
6489 | { |
6490 | int retval = 0; |
6491 | |
6492 | if (!locked) { |
6493 | vnode_lock_spin(vp); |
6494 | } |
6495 | if ((vp->v_type != VREG) && ((vp->v_usecount - vp->v_kusecount) > refcnt)) { |
6496 | retval = 1; |
6497 | goto out; |
6498 | } |
6499 | if (vp->v_type == VREG) { |
6500 | retval = ubc_isinuse_locked(vp, refcnt, 1); |
6501 | } |
6502 | |
6503 | out: |
6504 | if (!locked) { |
6505 | vnode_unlock(vp); |
6506 | } |
6507 | return retval; |
6508 | } |
6509 | |
6510 | kauth_cred_t |
6511 | vnode_cred(vnode_t vp) |
6512 | { |
6513 | if (vp->v_cred) { |
		return kauth_cred_require(vp->v_cred);
6515 | } |
6516 | |
6517 | return NULL; |
6518 | } |
6519 | |
6520 | |
6521 | /* resume vnode_t */ |
6522 | errno_t |
6523 | vnode_resume(vnode_t vp) |
6524 | { |
6525 | if ((vp->v_lflag & VL_SUSPENDED) && vp->v_owner == current_thread()) { |
6526 | vnode_lock_spin(vp); |
6527 | vp->v_lflag &= ~VL_SUSPENDED; |
6528 | vp->v_owner = NULL; |
6529 | vnode_unlock(vp); |
6530 | |
		wakeup(&vp->v_iocount);
6532 | } |
6533 | return 0; |
6534 | } |
6535 | |
6536 | /* suspend vnode_t |
6537 | * Please do not use on more than one vnode at a time as it may |
6538 | * cause deadlocks. |
 * xxx should we explicitly prevent this from happening?
6540 | */ |
6541 | |
6542 | errno_t |
6543 | vnode_suspend(vnode_t vp) |
6544 | { |
6545 | if (vp->v_lflag & VL_SUSPENDED) { |
6546 | return EBUSY; |
6547 | } |
6548 | |
6549 | vnode_lock_spin(vp); |
6550 | |
6551 | /* |
	 * xxx is this sufficient to check if a vnode_drain is in
	 * progress?
6554 | */ |
6555 | |
6556 | if (vp->v_owner == NULL) { |
6557 | vp->v_lflag |= VL_SUSPENDED; |
6558 | vp->v_owner = current_thread(); |
6559 | } |
6560 | vnode_unlock(vp); |
6561 | |
6562 | return 0; |
6563 | } |
6564 | |
6565 | /* |
6566 | * Release any blocked locking requests on the vnode. |
6567 | * Used for forced-unmounts. |
6568 | * |
6569 | * XXX What about network filesystems? |
6570 | */ |
6571 | static void |
6572 | vnode_abort_advlocks(vnode_t vp) |
6573 | { |
6574 | if (vp->v_flag & VLOCKLOCAL) { |
6575 | lf_abort_advlocks(vp); |
6576 | } |
6577 | } |
6578 | |
6579 | |
6580 | static errno_t |
6581 | vnode_drain(vnode_t vp) |
6582 | { |
6583 | if (vp->v_lflag & VL_DRAIN) { |
6584 | panic("vnode_drain: recursive drain" ); |
6585 | return ENOENT; |
6586 | } |
6587 | vp->v_lflag |= VL_DRAIN; |
6588 | vp->v_owner = current_thread(); |
6589 | |
6590 | while (vp->v_iocount > 1) { |
6591 | if (bootarg_no_vnode_drain) { |
6592 | struct timespec ts = {.tv_sec = 10, .tv_nsec = 0}; |
6593 | int error; |
6594 | |
6595 | if (vfs_unmountall_started) { |
6596 | ts.tv_sec = 1; |
6597 | } |
6598 | |
			error = msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_drain_with_timeout", &ts);
6600 | |
6601 | /* Try to deal with leaked iocounts under bootarg and shutting down */ |
6602 | if (vp->v_iocount > 1 && error == EWOULDBLOCK && |
6603 | ts.tv_sec == 1 && vp->v_numoutput == 0) { |
6604 | vp->v_iocount = 1; |
6605 | break; |
6606 | } |
6607 | } else { |
6608 | msleep(chan: &vp->v_iocount, mtx: &vp->v_lock, PVFS, wmesg: "vnode_drain" , NULL); |
6609 | } |
6610 | } |
6611 | |
6612 | vp->v_lflag &= ~VL_DRAIN; |
6613 | |
6614 | return 0; |
6615 | } |
6616 | |
6617 | |
6618 | /* |
6619 | * if the number of recent references via vnode_getwithvid or vnode_getwithref |
 * exceeds this threshold, then 'UN-AGE' the vnode by removing it from
6621 | * the LRU list if it's currently on it... once the iocount and usecount both drop |
6622 | * to 0, it will get put back on the end of the list, effectively making it younger |
6623 | * this allows us to keep actively referenced vnodes in the list without having |
6624 | * to constantly remove and add to the list each time a vnode w/o a usecount is |
6625 | * referenced which costs us taking and dropping a global lock twice. |
6626 | * However, if the vnode is marked DIRTY, we want to pull it out much earlier |
6627 | */ |
6628 | #define UNAGE_THRESHHOLD 25 |
6629 | #define UNAGE_DIRTYTHRESHHOLD 6 |
6630 | |
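/*
 * vnode_getiocount() implements the vget_internal flag policy:
 * VNODE_NODEAD fails on dead (deadfs) vnodes, VNODE_NOSUSPEND on
 * suspended ones, VNODE_ALWAYS skips the gating entirely, VNODE_DRAINO
 * cuts in front of a drain, VNODE_WITHID validates the caller's vid,
 * VNODE_NOBLOCK fails rather than sleeps, and VNODE_PAGER suppresses
 * the un-aging logic described above.
 */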
6631 | errno_t |
6632 | vnode_getiocount(vnode_t vp, unsigned int vid, int vflags) |
6633 | { |
6634 | int nodead = vflags & VNODE_NODEAD; |
6635 | int nosusp = vflags & VNODE_NOSUSPEND; |
6636 | int always = vflags & VNODE_ALWAYS; |
6637 | int beatdrain = vflags & VNODE_DRAINO; |
6638 | int withvid = vflags & VNODE_WITHID; |
	int forpager = vflags & VNODE_PAGER;
6640 | int noblock = vflags & VNODE_NOBLOCK; |
6641 | |
6642 | for (;;) { |
6643 | int sleepflg = 0; |
6644 | |
6645 | /* |
6646 | * if it is a dead vnode with deadfs |
6647 | */ |
6648 | if (nodead && (vp->v_lflag & VL_DEAD) && ((vp->v_type == VBAD) || (vp->v_data == 0))) { |
6649 | return ENOENT; |
6650 | } |
6651 | /* |
6652 | * will return VL_DEAD ones |
6653 | */ |
6654 | if ((vp->v_lflag & (VL_SUSPENDED | VL_DRAIN | VL_TERMINATE)) == 0) { |
6655 | break; |
6656 | } |
6657 | /* |
6658 | * if suspended vnodes are to be failed |
6659 | */ |
6660 | if (nosusp && (vp->v_lflag & VL_SUSPENDED)) { |
6661 | return ENOENT; |
6662 | } |
6663 | /* |
		 * if you are the owner of the drain/suspend/termination, you can acquire an iocount;
6665 | * check for VL_TERMINATE; it does not set owner |
6666 | */ |
6667 | if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED | VL_TERMINATE)) && |
6668 | (vp->v_owner == current_thread())) { |
6669 | break; |
6670 | } |
6671 | |
6672 | if (always != 0) { |
6673 | break; |
6674 | } |
6675 | |
6676 | if (noblock && (vp->v_lflag & (VL_DRAIN | VL_SUSPENDED | VL_TERMINATE))) { |
6677 | return ENOENT; |
6678 | } |
6679 | |
6680 | /* |
6681 | * If this vnode is getting drained, there are some cases where |
6682 | * we can't block or, in case of tty vnodes, want to be |
6683 | * interruptible. |
6684 | */ |
6685 | if (vp->v_lflag & VL_DRAIN) { |
6686 | /* |
6687 | * In some situations, we want to get an iocount |
6688 | * even if the vnode is draining to prevent deadlock, |
6689 | * e.g. if we're in the filesystem, potentially holding |
6690 | * resources that could prevent other iocounts from |
6691 | * being released. |
6692 | */ |
6693 | if (beatdrain) { |
6694 | break; |
6695 | } |
6696 | /* |
6697 | * Don't block if the vnode's mount point is unmounting as |
6698 | * we may be the thread the unmount is itself waiting on |
6699 | * Only callers who pass in vids (at this point, we've already |
6700 | * handled nosusp and nodead) are expecting error returns |
			 * from this function, so we can only return errors for
6702 | * those. ENODEV is intended to inform callers that the call |
6703 | * failed because an unmount is in progress. |
6704 | */ |
			if (withvid && (vp->v_mount) && vfs_isunmount(vp->v_mount)) {
6706 | return ENODEV; |
6707 | } |
6708 | |
6709 | if (vnode_istty(vp)) { |
6710 | sleepflg = PCATCH; |
6711 | } |
6712 | } |
6713 | |
6714 | vnode_lock_convert(vp); |
6715 | |
6716 | if (vp->v_lflag & VL_TERMINATE) { |
6717 | int error; |
6718 | |
6719 | vp->v_lflag |= VL_TERMWANT; |
6720 | |
			error = msleep(&vp->v_lflag, &vp->v_lock,
			    (PVFS | sleepflg), "vnode getiocount", NULL);
6723 | if (error) { |
6724 | return error; |
6725 | } |
6726 | } else { |
6727 | msleep(chan: &vp->v_iocount, mtx: &vp->v_lock, PVFS, wmesg: "vnode_getiocount" , NULL); |
6728 | } |
6729 | } |
6730 | if (withvid && vid != vp->v_id) { |
6731 | return ENOENT; |
6732 | } |
6733 | if (!forpager && (++vp->v_references >= UNAGE_THRESHHOLD || |
6734 | (vp->v_flag & VISDIRTY && vp->v_references >= UNAGE_DIRTYTHRESHHOLD))) { |
6735 | vp->v_references = 0; |
6736 | vnode_list_remove(vp); |
6737 | } |
6738 | vp->v_iocount++; |
6739 | #ifdef CONFIG_IOCOUNT_TRACE |
6740 | record_vp(vp, 1); |
6741 | #endif |
6742 | return 0; |
6743 | } |
6744 | |
6745 | static void |
6746 | vnode_dropiocount(vnode_t vp) |
6747 | { |
6748 | if (vp->v_iocount < 1) { |
6749 | panic("vnode_dropiocount(%p): v_iocount < 1" , vp); |
6750 | } |
6751 | |
6752 | vp->v_iocount--; |
6753 | #ifdef CONFIG_IOCOUNT_TRACE |
6754 | record_vp(vp, -1); |
6755 | #endif |
6756 | if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED)) && (vp->v_iocount <= 1)) { |
		wakeup(&vp->v_iocount);
6758 | } |
6759 | } |
6760 | |
6761 | |
6762 | void |
6763 | vnode_reclaim(struct vnode * vp) |
6764 | { |
6765 | vnode_reclaim_internal(vp, 0, 0, 0); |
6766 | } |
6767 | |
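/*
 * vnode_reclaim_internal(): 'locked' means the caller already holds the
 * vnode lock, 'reuse' means the caller will reuse the vnode and it should
 * not be put back on a list, and REVOKEALL in 'flags' forces tty readers
 * to give up their iocounts before the drain.
 */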
6768 | __private_extern__ |
6769 | void |
6770 | vnode_reclaim_internal(struct vnode * vp, int locked, int reuse, int flags) |
6771 | { |
6772 | int isfifo = 0; |
6773 | bool clear_tty_revoke = false; |
6774 | |
6775 | if (!locked) { |
6776 | vnode_lock(vp); |
6777 | } |
6778 | |
6779 | if (vp->v_lflag & VL_TERMINATE) { |
6780 | panic("vnode reclaim in progress" ); |
6781 | } |
6782 | vp->v_lflag |= VL_TERMINATE; |
6783 | |
6784 | vn_clearunionwait(vp, 1); |
6785 | |
6786 | /* |
6787 | * We have to force any terminals in reads to return and give up |
6788 | * their iocounts. It's important to do this after VL_TERMINATE |
6789 | * has been set to ensure new reads are blocked while the |
6790 | * revoke is in progress. |
6791 | */ |
6792 | if (vnode_istty(vp) && (flags & REVOKEALL) && (vp->v_iocount > 1)) { |
6793 | vnode_unlock(vp); |
		VNOP_IOCTL(vp, TIOCREVOKE, (caddr_t)NULL, 0, vfs_context_kernel());
6795 | clear_tty_revoke = true; |
6796 | vnode_lock(vp); |
6797 | } |
6798 | |
6799 | vnode_drain(vp); |
6800 | |
6801 | if (clear_tty_revoke) { |
6802 | vnode_unlock(vp); |
		VNOP_IOCTL(vp, TIOCREVOKECLEAR, (caddr_t)NULL, 0, vfs_context_kernel());
6804 | vnode_lock(vp); |
6805 | } |
6806 | |
6807 | #if CONFIG_FILE_LEASES |
6808 | /* |
6809 | * Revoke all leases in place for this vnode as it is about to be reclaimed. |
6810 | * In normal case, there shouldn't be any leases in place by the time we |
6811 | * get here as there shouldn't be any opens on the vnode (usecount == 0). |
6812 | * However, in the case of force unmount or unmount of a volume that |
6813 | * contains file that was opened with O_EVTONLY then the vnode can be |
6814 | * reclaimed while the file is still opened. |
6815 | */ |
6816 | vnode_revokelease(vp, true); |
6817 | #endif |
6818 | |
6819 | isfifo = (vp->v_type == VFIFO); |
6820 | |
6821 | if (vp->v_type != VBAD) { |
6822 | vgone(vp, flags); /* clean and reclaim the vnode */ |
6823 | } |
6824 | /* |
6825 | * give the vnode a new identity so that vnode_getwithvid will fail |
6826 | * on any stale cache accesses... |
6827 | * grab the list_lock so that if we're in "new_vnode" |
6828 | * behind the list_lock trying to steal this vnode, the v_id is stable... |
6829 | * once new_vnode drops the list_lock, it will block trying to take |
6830 | * the vnode lock until we release it... at that point it will evaluate |
 * whether the v_id has changed
6832 | * also need to make sure that the vnode isn't on a list where "new_vnode" |
6833 | * can find it after the v_id has been bumped until we are completely done |
6834 | * with the vnode (i.e. putting it back on a list has to be the very last |
6835 | * thing we do to this vnode... many of the callers of vnode_reclaim_internal |
6836 | * are holding an io_count on the vnode... they need to drop the io_count |
6837 | * BEFORE doing a vnode_list_add or make sure to hold the vnode lock until |
6838 | * they are completely done with the vnode |
6839 | */ |
6840 | vnode_list_lock(); |
6841 | |
6842 | vnode_list_remove_locked(vp); |
6843 | vp->v_id++; |
6844 | |
6845 | vnode_list_unlock(); |
6846 | |
6847 | if (isfifo) { |
6848 | struct fifoinfo * fip; |
6849 | |
6850 | fip = vp->v_fifoinfo; |
6851 | vp->v_fifoinfo = NULL; |
6852 | kfree_type(struct fifoinfo, fip); |
6853 | } |
6854 | vp->v_type = VBAD; |
6855 | |
6856 | if (vp->v_data) { |
6857 | panic("vnode_reclaim_internal: cleaned vnode isn't" ); |
6858 | } |
6859 | if (vp->v_numoutput) { |
6860 | panic("vnode_reclaim_internal: clean vnode has pending I/O's" ); |
6861 | } |
6862 | if (UBCINFOEXISTS(vp)) { |
6863 | panic("vnode_reclaim_internal: ubcinfo not cleaned" ); |
6864 | } |
6865 | if (vp->v_parent) { |
6866 | panic("vnode_reclaim_internal: vparent not removed" ); |
6867 | } |
6868 | if (vp->v_name) { |
6869 | panic("vnode_reclaim_internal: vname not removed" ); |
6870 | } |
6871 | |
6872 | #if CONFIG_FILE_LEASES |
6873 | if (__improbable(!LIST_EMPTY(&vp->v_leases))) { |
6874 | panic("vnode_reclaim_internal: vleases NOT empty" ); |
6875 | } |
6876 | #endif |
6877 | |
6878 | vp->v_socket = NULL; |
6879 | |
6880 | vp->v_lflag &= ~VL_TERMINATE; |
6881 | vp->v_owner = NULL; |
6882 | |
6883 | #if CONFIG_IOCOUNT_TRACE |
6884 | if (__improbable(bootarg_vnode_iocount_trace)) { |
6885 | bzero(vp->v_iocount_trace, |
6886 | IOCOUNT_TRACE_MAX_TYPES * sizeof(struct vnode_iocount_trace)); |
6887 | } |
6888 | #endif /* CONFIG_IOCOUNT_TRACE */ |
6889 | |
6890 | KNOTE(&vp->v_knotes, NOTE_REVOKE); |
6891 | |
6892 | /* Make sure that when we reuse the vnode, no knotes left over */ |
	klist_init(&vp->v_knotes);
6894 | |
6895 | if (vp->v_lflag & VL_TERMWANT) { |
6896 | vp->v_lflag &= ~VL_TERMWANT; |
		wakeup(&vp->v_lflag);
6898 | } |
6899 | if (!reuse) { |
6900 | /* |
6901 | * make sure we get on the |
6902 | * dead list if appropriate |
6903 | */ |
6904 | vnode_list_add(vp); |
6905 | } |
6906 | if (!locked) { |
6907 | vnode_unlock(vp); |
6908 | } |
6909 | } |
6910 | |
6911 | static int |
6912 | vnode_create_internal(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp, |
6913 | vnode_create_options_t vc_options) |
6914 | { |
6915 | int error; |
6916 | int insert = 1; |
6917 | vnode_t vp = NULLVP; |
6918 | vnode_t nvp; |
6919 | vnode_t dvp; |
6920 | struct uthread *ut; |
6921 | struct componentname *cnp; |
6922 | struct vnode_fsparam *param = (struct vnode_fsparam *)data; |
6923 | #if CONFIG_TRIGGERS |
6924 | struct vnode_trigger_param *tinfo = NULL; |
6925 | #endif |
6926 | bool existing_vnode; |
6927 | bool init_vnode = !(vc_options & VNODE_CREATE_EMPTY); |
6928 | bool is_bdevvp = false; |
6929 | |
6930 | if (*vpp) { |
6931 | vp = *vpp; |
6932 | *vpp = NULLVP; |
6933 | existing_vnode = true; |
6934 | } else { |
6935 | existing_vnode = false; |
6936 | } |
6937 | |
6938 | if (init_vnode) { |
6939 | /* Do quick sanity check on the parameters. */ |
6940 | if ((param == NULL) || (param->vnfs_vtype == VBAD)) { |
6941 | error = EINVAL; |
6942 | goto error_out; |
6943 | } |
6944 | |
6945 | #if CONFIG_TRIGGERS |
6946 | if ((flavor == VNCREATE_TRIGGER) && (size == VNCREATE_TRIGGER_SIZE)) { |
6947 | tinfo = (struct vnode_trigger_param *)data; |
6948 | |
6949 | /* Validate trigger vnode input */ |
6950 | if ((param->vnfs_vtype != VDIR) || |
6951 | (tinfo->vnt_resolve_func == NULL) || |
6952 | (tinfo->vnt_flags & ~VNT_VALID_MASK)) { |
6953 | error = EINVAL; |
6954 | goto error_out; |
6955 | } |
6956 | /* Fall through a normal create (params will be the same) */ |
6957 | flavor = VNCREATE_FLAVOR; |
6958 | size = VCREATESIZE; |
6959 | } |
6960 | #endif |
6961 | if ((flavor != VNCREATE_FLAVOR) || (size != VCREATESIZE)) { |
6962 | error = EINVAL; |
6963 | goto error_out; |
6964 | } |
6965 | } |
6966 | |
6967 | if (!existing_vnode) { |
if ((error = new_vnode(&vp, !(vc_options & VNODE_CREATE_NODEALLOC)))) {
6969 | return error; |
6970 | } |
6971 | if (!init_vnode) { |
/* Make it so that it can be released by a vnode_put() */
6973 | vnode_lock(vp); |
6974 | vn_set_dead(vp); |
6975 | vnode_unlock(vp); |
6976 | *vpp = vp; |
6977 | return 0; |
6978 | } |
6979 | } else { |
6980 | /* |
6981 | * A vnode obtained by vnode_create_empty has been passed to |
6982 | * vnode_initialize - Unset VL_DEAD set by vn_set_dead. After |
6983 | * this point, it is set back on any error. |
6984 | */ |
6985 | vnode_lock(vp); |
6986 | vp->v_lflag &= ~VL_DEAD; |
6987 | vnode_unlock(vp); |
6988 | } |
6989 | |
6990 | dvp = param->vnfs_dvp; |
6991 | cnp = param->vnfs_cnp; |
6992 | |
6993 | vp->v_op = param->vnfs_vops; |
6994 | vp->v_type = (uint16_t)param->vnfs_vtype; |
6995 | vp->v_data = param->vnfs_fsnode; |
6996 | |
6997 | if (param->vnfs_markroot) { |
6998 | vp->v_flag |= VROOT; |
6999 | } |
7000 | if (param->vnfs_marksystem) { |
7001 | vp->v_flag |= VSYSTEM; |
7002 | } |
7003 | if (vp->v_type == VREG) { |
7004 | error = ubc_info_init_withsize(vp, param->vnfs_filesize); |
7005 | if (error) { |
#if CONFIG_IOCOUNT_TRACE
7007 | record_vp(vp, 1); |
7008 | #endif |
7009 | vnode_hold(vp); |
7010 | vnode_lock(vp); |
7011 | vn_set_dead(vp); |
7012 | |
7013 | vnode_put_locked(vp); |
7014 | vnode_drop_and_unlock(vp); |
7015 | return error; |
7016 | } |
7017 | if (param->vnfs_mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED) { |
memory_object_mark_io_tracking(vp->v_ubcinfo->ui_control);
7019 | } |
7020 | } |
#if CONFIG_IOCOUNT_TRACE
7022 | record_vp(vp, 1); |
7023 | #endif |
7024 | |
7025 | #if CONFIG_FIRMLINKS |
7026 | vp->v_fmlink = NULLVP; |
7027 | #endif |
7028 | vp->v_flag &= ~VFMLINKTARGET; |
7029 | |
7030 | #if CONFIG_TRIGGERS |
7031 | /* |
7032 | * For trigger vnodes, attach trigger info to vnode |
7033 | */ |
7034 | if ((vp->v_type == VDIR) && (tinfo != NULL)) { |
7035 | /* |
7036 | * Note: has a side effect of incrementing trigger count on the |
7037 | * mount if successful, which we would need to undo on a |
7038 | * subsequent failure. |
7039 | */ |
#if CONFIG_IOCOUNT_TRACE
7041 | record_vp(vp, -1); |
7042 | #endif |
7043 | error = vnode_resolver_create(param->vnfs_mp, vp, tinfo, FALSE); |
7044 | if (error) { |
7045 | printf("vnode_create: vnode_resolver_create() err %d\n" , error); |
7046 | vnode_hold(vp); |
7047 | vnode_lock(vp); |
7048 | vn_set_dead(vp); |
#if CONFIG_IOCOUNT_TRACE
7050 | record_vp(vp, 1); |
7051 | #endif |
7052 | vnode_put_locked(vp); |
7053 | vnode_drop_and_unlock(vp); |
7054 | return error; |
7055 | } |
7056 | } |
7057 | #endif |
7058 | if (vp->v_type == VCHR || vp->v_type == VBLK) { |
7059 | vp->v_tag = VT_DEVFS; /* callers will reset if needed (bdevvp) */ |
7060 | |
if ((nvp = checkalias(vp, param->vnfs_rdev))) {
7062 | /* |
7063 | * if checkalias returns a vnode, it will be locked |
7064 | * |
7065 | * first get rid of the unneeded vnode we acquired |
7066 | */ |
7067 | vp->v_data = NULL; |
7068 | vp->v_op = spec_vnodeop_p; |
7069 | vp->v_type = VBAD; |
7070 | vp->v_lflag = VL_DEAD; |
7071 | vp->v_data = NULL; |
7072 | vp->v_tag = VT_NON; |
7073 | vnode_put(vp); |
7074 | |
7075 | /* |
7076 | * switch to aliased vnode and finish |
7077 | * preparing it |
7078 | */ |
7079 | vp = nvp; |
7080 | |
7081 | is_bdevvp = (vp->v_flag & VBDEVVP); |
7082 | |
7083 | if (is_bdevvp) { |
7084 | printf("%s: alias vnode (vid = %u) is in state of change (start) v_flags = 0x%x v_numoutput = %d\n" , |
7085 | __func__, vp->v_id, vp->v_flag, vp->v_numoutput); |
7086 | } |
7087 | |
7088 | vnode_hold(vp); |
7089 | vp->v_lflag |= VL_OPSCHANGE; |
vclean(vp, 0);
7091 | vp->v_op = param->vnfs_vops; |
7092 | vp->v_type = (uint16_t)param->vnfs_vtype; |
7093 | vp->v_data = param->vnfs_fsnode; |
7094 | vp->v_lflag = VL_OPSCHANGE; |
7095 | vp->v_mount = NULL; |
insmntque(vp, param->vnfs_mp);
7097 | insert = 0; |
7098 | |
7099 | if (is_bdevvp) { |
7100 | printf("%s: alias vnode (vid = %u), is in state of change (end) v_flags = 0x%x v_numoutput = %d\n" , |
7101 | __func__, vp->v_id, vp->v_flag, vp->v_numoutput); |
7102 | } |
7103 | |
7104 | vnode_drop_and_unlock(vp); |
wakeup(&vp->v_lflag); /* chkvnlock is waiting for VL_DEAD to get unset */
7106 | } |
7107 | |
7108 | if (VCHR == vp->v_type) { |
7109 | u_int maj = major(vp->v_rdev); |
7110 | |
7111 | if (maj < (u_int)nchrdev && cdevsw[maj].d_type == D_TTY) { |
7112 | vp->v_flag |= VISTTY; |
7113 | } |
7114 | } |
7115 | } |
7116 | |
7117 | if (vp->v_type == VFIFO) { |
7118 | struct fifoinfo *fip; |
7119 | |
7120 | fip = kalloc_type(struct fifoinfo, Z_WAITOK | Z_ZERO); |
7121 | vp->v_fifoinfo = fip; |
7122 | } |
/* The file system must pass the address of the location where
* it stores the vnode pointer. Once we add the vnode to the mount
* list and name cache it becomes discoverable, so the file system
* node must have its connection to the vnode set up by then.
*/
7128 | *vpp = vp; |
7129 | |
7130 | /* Add fs named reference. */ |
7131 | if (param->vnfs_flags & VNFS_ADDFSREF) { |
7132 | vp->v_lflag |= VNAMED_FSHASH; |
7133 | } |
7134 | if (param->vnfs_mp) { |
7135 | if (param->vnfs_mp->mnt_kern_flag & MNTK_LOCK_LOCAL) { |
7136 | vp->v_flag |= VLOCKLOCAL; |
7137 | } |
7138 | if (insert) { |
7139 | if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb)) { |
7140 | panic("insmntque: vp on the free list" ); |
7141 | } |
7142 | |
7143 | /* |
7144 | * enter in mount vnode list |
7145 | */ |
insmntque(vp, param->vnfs_mp);
7147 | } |
7148 | } |
if (dvp && vnode_ref(dvp) == 0) {
7150 | vp->v_parent = dvp; |
7151 | } |
7152 | if (cnp) { |
7153 | if (dvp && ((param->vnfs_flags & (VNFS_NOCACHE | VNFS_CANTCACHE)) == 0)) { |
7154 | /* |
7155 | * enter into name cache |
7156 | * we've got the info to enter it into the name cache now |
7157 | * cache_enter_create will pick up an extra reference on |
7158 | * the name entered into the string cache |
7159 | */ |
7160 | vp->v_name = cache_enter_create(dvp, vp, cnp); |
7161 | } else { |
vp->v_name = vfs_addname(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, 0);
7163 | } |
7164 | |
7165 | if ((cnp->cn_flags & UNIONCREATED) == UNIONCREATED) { |
7166 | vp->v_flag |= VISUNION; |
7167 | } |
7168 | } |
7169 | if ((param->vnfs_flags & VNFS_CANTCACHE) == 0) { |
7170 | /* |
7171 | * this vnode is being created as cacheable in the name cache |
7172 | * this allows us to re-enter it in the cache |
7173 | */ |
7174 | vp->v_flag |= VNCACHEABLE; |
7175 | } |
7176 | ut = current_uthread(); |
7177 | |
7178 | if ((current_proc()->p_lflag & P_LRAGE_VNODES) || |
7179 | (ut->uu_flag & (UT_RAGE_VNODES | UT_KERN_RAGE_VNODES))) { |
7180 | /* |
7181 | * process has indicated that it wants any |
7182 | * vnodes created on its behalf to be rapidly |
7183 | * aged to reduce the impact on the cached set |
7184 | * of vnodes |
7185 | * |
7186 | * if UT_KERN_RAGE_VNODES is set, then the |
7187 | * kernel internally wants vnodes to be rapidly |
7188 | * aged, even if the process hasn't requested |
7189 | * this |
7190 | */ |
7191 | vp->v_flag |= VRAGE; |
7192 | } |
7193 | |
7194 | #if CONFIG_SECLUDED_MEMORY |
7195 | switch (secluded_for_filecache) { |
7196 | case SECLUDED_FILECACHE_NONE: |
7197 | /* |
7198 | * secluded_for_filecache == 0: |
7199 | * + no file contents in secluded pool |
7200 | */ |
7201 | break; |
7202 | case SECLUDED_FILECACHE_APPS: |
7203 | /* |
7204 | * secluded_for_filecache == 1: |
7205 | * + no files from / |
7206 | * + files from /Applications/ are OK |
7207 | * + files from /Applications/Camera are not OK |
7208 | * + no files that are open for write |
7209 | */ |
7210 | if (vnode_vtype(vp) == VREG && |
7211 | vnode_mount(vp) != NULL && |
7212 | (!(vfs_flags(vnode_mount(vp)) & MNT_ROOTFS))) { |
7213 | /* not from root filesystem: eligible for secluded pages */ |
7214 | memory_object_mark_eligible_for_secluded( |
7215 | ubc_getobject(vp, UBC_FLAGS_NONE), |
7216 | TRUE); |
7217 | } |
7218 | break; |
7219 | case SECLUDED_FILECACHE_RDONLY: |
7220 | /* |
7221 | * secluded_for_filecache == 2: |
7222 | * + all read-only files OK, except: |
7223 | * + dyld_shared_cache_arm64* |
7224 | * + Camera |
7225 | * + mediaserverd |
7226 | */ |
7227 | if (vnode_vtype(vp) == VREG) { |
7228 | memory_object_mark_eligible_for_secluded( |
7229 | ubc_getobject(vp, UBC_FLAGS_NONE), |
7230 | TRUE); |
7231 | } |
7232 | break; |
7233 | default: |
7234 | break; |
7235 | } |
7236 | #endif /* CONFIG_SECLUDED_MEMORY */ |
7237 | |
7238 | if (is_bdevvp) { |
/*
* The v_flag and v_lflag fields for the vnode above are
* manipulated without the vnode lock. This is fine for
* everything because no other use of this vnode is occurring.
* However, the case of the bdevvp alias vnode reuse is different:
* the flags end up being modified while a thread may be in
* vnode_waitforwrites, which sets VTHROTTLED, and any one of the
* non-atomic modifications of v_flag in this function can race
* with the setting of that flag and cause the VTHROTTLED bit in
* v_flag to get "lost".
*
* This should ideally be fixed by making sure all modifications
* in this function to the vnode flags are done under the
* vnode lock, but at this time a much smaller workaround is
* being employed and the more correct (and potentially
* much bigger) change will follow later.
*
* The effect of "losing" the VTHROTTLED flag would be a lost
* wakeup, so we just issue that wakeup here since this happens
* only once per bdevvp vnode, of which there are only one or two
* for a given boot.
*/
wakeup(&vp->v_numoutput);

/*
* now make sure the flags that we were supposed to set aren't
* lost.
*/
7267 | vnode_lock_spin(vp); |
7268 | if (param->vnfs_flags & VNFS_ADDFSREF) { |
7269 | vp->v_lflag |= VNAMED_FSHASH; |
7270 | } |
7271 | if (param->vnfs_mp && (param->vnfs_mp->mnt_kern_flag & MNTK_LOCK_LOCAL)) { |
7272 | vp->v_flag |= VLOCKLOCAL; |
7273 | } |
7274 | if ((param->vnfs_flags & VNFS_CANTCACHE) == 0) { |
7275 | vp->v_flag |= VNCACHEABLE; |
7276 | } |
7277 | vnode_unlock(vp); |
7278 | } |
7279 | |
7280 | return 0; |
7281 | |
7282 | error_out: |
7283 | if (existing_vnode) { |
7284 | vnode_put(vp); |
7285 | } |
7286 | return error; |
7287 | } |
7288 | |
7289 | int |
7290 | vnode_create_ext(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp, vnode_create_options_t vc_options) |
7291 | { |
7292 | if (vc_options & ~(VNODE_CREATE_EMPTY | VNODE_CREATE_NODEALLOC)) { |
7293 | return EINVAL; |
7294 | } |
7295 | *vpp = NULLVP; |
7296 | return vnode_create_internal(flavor, size, data, vpp, vc_options); |
7297 | } |
7298 | |
/* USAGE:
* The following API creates a vnode, associates all the parameters specified in the
* vnode_fsparam structure, and returns a vnode handle with a reference. Device aliasing
* is handled here, so checkalias is obsoleted by this.
*/
7304 | int |
7305 | vnode_create(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp) |
7306 | { |
return vnode_create_ext(flavor, size, data, vpp, VNODE_CREATE_NODEALLOC);
7308 | } |
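
/*
* Example (illustrative sketch; the "myfs" names are hypothetical and
* not part of this file): a filesystem typically fills in a
* vnode_fsparam and calls vnode_create() when materializing a vnode
* for one of its nodes:
*
*	struct vnode_fsparam vfsp;
*	vnode_t vp = NULLVP;
*	int error;
*
*	bzero(&vfsp, sizeof(vfsp));
*	vfsp.vnfs_mp = mp;
*	vfsp.vnfs_vtype = VREG;
*	vfsp.vnfs_str = "myfs";
*	vfsp.vnfs_dvp = dvp;
*	vfsp.vnfs_fsnode = mynode;
*	vfsp.vnfs_vops = myfs_vnodeop_p;
*	vfsp.vnfs_filesize = myfs_node_size(mynode);
*	vfsp.vnfs_cnp = cnp;
*	vfsp.vnfs_flags = VNFS_ADDFSREF;
*
*	error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &vp);
*
* On success the caller holds an iocount on vp and releases it with
* vnode_put() when done with the vnode.
*/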
7309 | |
7310 | int |
7311 | vnode_create_empty(vnode_t *vpp) |
7312 | { |
return vnode_create_ext(VNCREATE_FLAVOR, VCREATESIZE, NULL,
vpp, VNODE_CREATE_EMPTY);
7315 | } |
7316 | |
7317 | int |
7318 | vnode_initialize(uint32_t __unused flavor, uint32_t size, void *data, vnode_t *vpp) |
7319 | { |
7320 | if (*vpp == NULLVP) { |
7321 | panic("NULL vnode passed to vnode_initialize" ); |
7322 | } |
7323 | #if DEVELOPMENT || DEBUG |
7324 | /* |
7325 | * We lock to check that vnode is fit for unlocked use in |
7326 | * vnode_create_internal. |
7327 | */ |
7328 | vnode_lock_spin(*vpp); |
VNASSERT(((*vpp)->v_iocount == 1), *vpp,
("vnode_initialize : iocount not 1, is %d", (*vpp)->v_iocount));
VNASSERT(((*vpp)->v_usecount == 0), *vpp,
("vnode_initialize : usecount not 0, is %d", (*vpp)->v_usecount));
VNASSERT(((*vpp)->v_lflag & VL_DEAD), *vpp,
("vnode_initialize : v_lflag does not have VL_DEAD, is 0x%x",
(*vpp)->v_lflag));
VNASSERT(((*vpp)->v_data == NULL), *vpp,
("vnode_initialize : v_data not NULL"));
7338 | vnode_unlock(*vpp); |
7339 | #endif |
7340 | return vnode_create_internal(flavor, size, data, vpp, VNODE_CREATE_DEFAULT); |
7341 | } |
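
/*
* Example (illustrative sketch): the two-step creation path; the empty
* vnode comes back marked VL_DEAD with an iocount held, and a later
* vnode_initialize() with the filled-in vnode_fsparam (vfsp below)
* brings it to life:
*
*	vnode_t vp = NULLVP;
*	int error;
*
*	error = vnode_create_empty(&vp);
*	if (error == 0) {
*		... record vp in the fs node, fill in vfsp ...
*		error = vnode_initialize(VNCREATE_FLAVOR, VCREATESIZE,
*		    &vfsp, &vp);
*	}
*/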
7342 | |
7343 | int |
7344 | vnode_addfsref(vnode_t vp) |
7345 | { |
7346 | vnode_lock_spin(vp); |
if (vp->v_lflag & VNAMED_FSHASH) {
panic("add_fsref: vp already has named reference");
}
if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb)) {
panic("addfsref: vp on the free list");
}
7353 | vp->v_lflag |= VNAMED_FSHASH; |
7354 | vnode_unlock(vp); |
7355 | return 0; |
7356 | } |
7357 | int |
7358 | vnode_removefsref(vnode_t vp) |
7359 | { |
7360 | vnode_lock_spin(vp); |
7361 | if ((vp->v_lflag & VNAMED_FSHASH) == 0) { |
7362 | panic("remove_fsref: no named reference" ); |
7363 | } |
7364 | vp->v_lflag &= ~VNAMED_FSHASH; |
7365 | vnode_unlock(vp); |
7366 | return 0; |
7367 | } |
7368 | |
7369 | |
7370 | int |
7371 | vfs_iterate(int flags, int (*callout)(mount_t, void *), void *arg) |
7372 | { |
7373 | mount_t mp; |
7374 | int ret = 0; |
7375 | fsid_t * fsid_list; |
7376 | int count, actualcount, i; |
7377 | void * allocmem; |
7378 | int indx_start, indx_stop, indx_incr; |
7379 | int cb_dropref = (flags & VFS_ITERATE_CB_DROPREF); |
7380 | int noskip_unmount = (flags & VFS_ITERATE_NOSKIP_UNMOUNT); |
7381 | |
7382 | count = mount_getvfscnt(); |
7383 | count += 10; |
7384 | |
7385 | fsid_list = kalloc_data(count * sizeof(fsid_t), Z_WAITOK); |
7386 | allocmem = (void *)fsid_list; |
7387 | |
actualcount = mount_fillfsids(fsid_list, count);
7389 | |
7390 | /* |
7391 | * Establish the iteration direction |
7392 | * VFS_ITERATE_TAIL_FIRST overrides default head first order (oldest first) |
7393 | */ |
7394 | if (flags & VFS_ITERATE_TAIL_FIRST) { |
7395 | indx_start = actualcount - 1; |
7396 | indx_stop = -1; |
7397 | indx_incr = -1; |
7398 | } else { /* Head first by default */ |
7399 | indx_start = 0; |
7400 | indx_stop = actualcount; |
7401 | indx_incr = 1; |
7402 | } |
7403 | |
7404 | for (i = indx_start; i != indx_stop; i += indx_incr) { |
7405 | /* obtain the mount point with iteration reference */ |
7406 | mp = mount_list_lookupby_fsid(&fsid_list[i], 0, 1); |
7407 | |
7408 | if (mp == (struct mount *)0) { |
7409 | continue; |
7410 | } |
7411 | mount_lock(mp); |
7412 | if ((mp->mnt_lflag & MNT_LDEAD) || |
7413 | (!noskip_unmount && (mp->mnt_lflag & MNT_LUNMOUNT))) { |
7414 | mount_unlock(mp); |
7415 | mount_iterdrop(mp); |
7416 | continue; |
7417 | } |
7418 | mount_unlock(mp); |
7419 | |
7420 | /* iterate over all the vnodes */ |
7421 | ret = callout(mp, arg); |
7422 | |
7423 | /* |
7424 | * Drop the iterref here if the callback didn't do it. |
7425 | * Note: If cb_dropref is set the mp may no longer exist. |
7426 | */ |
7427 | if (!cb_dropref) { |
7428 | mount_iterdrop(mp); |
7429 | } |
7430 | |
7431 | switch (ret) { |
7432 | case VFS_RETURNED: |
7433 | case VFS_RETURNED_DONE: |
7434 | if (ret == VFS_RETURNED_DONE) { |
7435 | ret = 0; |
7436 | goto out; |
7437 | } |
7438 | break; |
7439 | |
7440 | case VFS_CLAIMED_DONE: |
7441 | ret = 0; |
7442 | goto out; |
7443 | case VFS_CLAIMED: |
7444 | default: |
7445 | break; |
7446 | } |
7447 | ret = 0; |
7448 | } |
7449 | |
7450 | out: |
7451 | kfree_data(allocmem, count * sizeof(fsid_t)); |
7452 | return ret; |
7453 | } |
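
/*
* Example (illustrative sketch): a minimal vfs_iterate() callout that
* counts mounts; returning VFS_RETURNED continues the iteration and
* VFS_RETURNED_DONE stops it early:
*
*	static int
*	my_mount_callout(mount_t mp, void *arg)
*	{
*		int *countp = (int *)arg;
*
*		(*countp)++;
*		return VFS_RETURNED;
*	}
*
*	int nmounts = 0;
*	(void)vfs_iterate(0, my_mount_callout, &nmounts);
*/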
7454 | |
7455 | /* |
7456 | * Update the vfsstatfs structure in the mountpoint. |
7457 | * MAC: Parameter eventtype added, indicating whether the event that |
7458 | * triggered this update came from user space, via a system call |
7459 | * (VFS_USER_EVENT) or an internal kernel call (VFS_KERNEL_EVENT). |
7460 | */ |
7461 | int |
7462 | vfs_update_vfsstat(mount_t mp, vfs_context_t ctx, __unused int eventtype) |
7463 | { |
7464 | struct vfs_attr va; |
7465 | int error; |
7466 | |
7467 | /* |
7468 | * Request the attributes we want to propagate into |
7469 | * the per-mount vfsstat structure. |
7470 | */ |
7471 | VFSATTR_INIT(&va); |
7472 | VFSATTR_WANTED(&va, f_iosize); |
7473 | VFSATTR_WANTED(&va, f_blocks); |
7474 | VFSATTR_WANTED(&va, f_bfree); |
7475 | VFSATTR_WANTED(&va, f_bavail); |
7476 | VFSATTR_WANTED(&va, f_bused); |
7477 | VFSATTR_WANTED(&va, f_files); |
7478 | VFSATTR_WANTED(&va, f_ffree); |
7479 | VFSATTR_WANTED(&va, f_bsize); |
7480 | VFSATTR_WANTED(&va, f_fssubtype); |
7481 | |
if ((error = vfs_getattr(mp, &va, ctx)) != 0) {
KAUTH_DEBUG("STAT - filesystem returned error %d", error);
7484 | return error; |
7485 | } |
7486 | #if CONFIG_MACF |
7487 | if (eventtype == VFS_USER_EVENT) { |
error = mac_mount_check_getattr(ctx, mp, &va);
7489 | if (error != 0) { |
7490 | return error; |
7491 | } |
7492 | } |
7493 | #endif |
7494 | /* |
7495 | * Unpack into the per-mount structure. |
7496 | * |
7497 | * We only overwrite these fields, which are likely to change: |
7498 | * f_blocks |
7499 | * f_bfree |
7500 | * f_bavail |
7501 | * f_bused |
7502 | * f_files |
7503 | * f_ffree |
7504 | * |
7505 | * And these which are not, but which the FS has no other way |
7506 | * of providing to us: |
7507 | * f_bsize |
7508 | * f_iosize |
7509 | * f_fssubtype |
7510 | * |
7511 | */ |
7512 | if (VFSATTR_IS_SUPPORTED(&va, f_bsize)) { |
7513 | /* 4822056 - protect against malformed server mount */ |
7514 | mp->mnt_vfsstat.f_bsize = (va.f_bsize > 0 ? va.f_bsize : 512); |
7515 | } else { |
7516 | mp->mnt_vfsstat.f_bsize = mp->mnt_devblocksize; /* default from the device block size */ |
7517 | } |
7518 | if (VFSATTR_IS_SUPPORTED(&va, f_iosize)) { |
7519 | mp->mnt_vfsstat.f_iosize = va.f_iosize; |
7520 | } else { |
7521 | mp->mnt_vfsstat.f_iosize = 1024 * 1024; /* 1MB sensible I/O size */ |
7522 | } |
7523 | if (VFSATTR_IS_SUPPORTED(&va, f_blocks)) { |
7524 | mp->mnt_vfsstat.f_blocks = va.f_blocks; |
7525 | } |
7526 | if (VFSATTR_IS_SUPPORTED(&va, f_bfree)) { |
7527 | mp->mnt_vfsstat.f_bfree = va.f_bfree; |
7528 | } |
7529 | if (VFSATTR_IS_SUPPORTED(&va, f_bavail)) { |
7530 | mp->mnt_vfsstat.f_bavail = va.f_bavail; |
7531 | } |
7532 | if (VFSATTR_IS_SUPPORTED(&va, f_bused)) { |
7533 | mp->mnt_vfsstat.f_bused = va.f_bused; |
7534 | } |
7535 | if (VFSATTR_IS_SUPPORTED(&va, f_files)) { |
7536 | mp->mnt_vfsstat.f_files = va.f_files; |
7537 | } |
7538 | if (VFSATTR_IS_SUPPORTED(&va, f_ffree)) { |
7539 | mp->mnt_vfsstat.f_ffree = va.f_ffree; |
7540 | } |
7541 | |
7542 | /* this is unlikely to change, but has to be queried for */ |
7543 | if (VFSATTR_IS_SUPPORTED(&va, f_fssubtype)) { |
7544 | mp->mnt_vfsstat.f_fssubtype = va.f_fssubtype; |
7545 | } |
7546 | |
7547 | return 0; |
7548 | } |
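
/*
* Example (illustrative sketch; MYFS_BSIZE is a hypothetical constant):
* the VFSATTR_IS_SUPPORTED() checks above only succeed for fields the
* filesystem's vfs_getattr marked as supplied, typically via
* VFSATTR_RETURN():
*
*	if (VFSATTR_IS_ACTIVE(fsap, f_bsize)) {
*		VFSATTR_RETURN(fsap, f_bsize, MYFS_BSIZE);
*	}
*/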
7549 | |
7550 | int |
7551 | mount_list_add(mount_t mp) |
7552 | { |
7553 | int res; |
7554 | |
7555 | mount_list_lock(); |
7556 | if (get_system_inshutdown() != 0) { |
7557 | res = -1; |
7558 | } else { |
7559 | TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); |
7560 | nummounts++; |
7561 | res = 0; |
7562 | } |
7563 | mount_list_unlock(); |
7564 | |
7565 | return res; |
7566 | } |
7567 | |
7568 | void |
7569 | mount_list_remove(mount_t mp) |
7570 | { |
7571 | mount_list_lock(); |
7572 | TAILQ_REMOVE(&mountlist, mp, mnt_list); |
7573 | nummounts--; |
7574 | mp->mnt_list.tqe_next = NULL; |
7575 | mp->mnt_list.tqe_prev = NULL; |
7576 | mount_list_unlock(); |
7577 | } |
7578 | |
7579 | mount_t |
7580 | mount_lookupby_volfsid(int volfs_id, int withref) |
7581 | { |
7582 | mount_t cur_mount = (mount_t)0; |
7583 | mount_t mp; |
7584 | |
7585 | mount_list_lock(); |
7586 | TAILQ_FOREACH(mp, &mountlist, mnt_list) { |
7587 | if (!(mp->mnt_kern_flag & MNTK_UNMOUNT) && |
7588 | (mp->mnt_kern_flag & MNTK_PATH_FROM_ID) && |
7589 | (mp->mnt_vfsstat.f_fsid.val[0] == volfs_id)) { |
7590 | cur_mount = mp; |
7591 | if (withref) { |
if (mount_iterref(cur_mount, 1)) {
7593 | cur_mount = (mount_t)0; |
7594 | mount_list_unlock(); |
7595 | goto out; |
7596 | } |
7597 | } |
7598 | break; |
7599 | } |
7600 | } |
7601 | mount_list_unlock(); |
7602 | if (withref && (cur_mount != (mount_t)0)) { |
7603 | mp = cur_mount; |
7604 | if (vfs_busy(mp, LK_NOWAIT) != 0) { |
7605 | cur_mount = (mount_t)0; |
7606 | } |
7607 | mount_iterdrop(mp); |
7608 | } |
7609 | out: |
7610 | return cur_mount; |
7611 | } |
7612 | |
7613 | mount_t |
7614 | mount_list_lookupby_fsid(fsid_t *fsid, int locked, int withref) |
7615 | { |
7616 | mount_t retmp = (mount_t)0; |
7617 | mount_t mp; |
7618 | |
7619 | if (!locked) { |
7620 | mount_list_lock(); |
7621 | } |
7622 | TAILQ_FOREACH(mp, &mountlist, mnt_list) |
7623 | if (mp->mnt_vfsstat.f_fsid.val[0] == fsid->val[0] && |
7624 | mp->mnt_vfsstat.f_fsid.val[1] == fsid->val[1]) { |
7625 | retmp = mp; |
7626 | if (withref) { |
if (mount_iterref(retmp, 1)) {
7628 | retmp = (mount_t)0; |
7629 | } |
7630 | } |
7631 | goto out; |
7632 | } |
7633 | out: |
7634 | if (!locked) { |
7635 | mount_list_unlock(); |
7636 | } |
7637 | return retmp; |
7638 | } |
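
/*
* Example (illustrative sketch): callers passing withref != 0 receive
* the mount with an iteration reference and must drop it themselves,
* just as vfs_iterate() does above:
*
*	mount_t mp = mount_list_lookupby_fsid(&fsid, 0, 1);
*
*	if (mp != NULL) {
*		... inspect mp ...
*		mount_iterdrop(mp);
*	}
*/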
7639 | |
7640 | errno_t |
7641 | vnode_lookupat(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx, |
7642 | vnode_t start_dvp) |
7643 | { |
7644 | struct nameidata *ndp; |
7645 | int error = 0; |
7646 | u_int32_t ndflags = 0; |
7647 | |
7648 | if (ctx == NULL) { |
7649 | return EINVAL; |
7650 | } |
7651 | |
7652 | ndp = kalloc_type(struct nameidata, Z_WAITOK | Z_NOFAIL); |
7653 | |
7654 | if (flags & VNODE_LOOKUP_NOFOLLOW) { |
7655 | ndflags = NOFOLLOW; |
7656 | } else { |
7657 | ndflags = FOLLOW; |
7658 | } |
7659 | |
7660 | if (flags & VNODE_LOOKUP_NOCROSSMOUNT) { |
7661 | ndflags |= NOCROSSMOUNT; |
7662 | } |
7663 | |
7664 | if (flags & VNODE_LOOKUP_CROSSMOUNTNOWAIT) { |
7665 | ndflags |= CN_NBMOUNTLOOK; |
7666 | } |
7667 | |
7668 | /* XXX AUDITVNPATH1 needed ? */ |
7669 | NDINIT(ndp, LOOKUP, OP_LOOKUP, ndflags, UIO_SYSSPACE, |
7670 | CAST_USER_ADDR_T(path), ctx); |
7671 | |
7672 | if (start_dvp && (path[0] != '/')) { |
7673 | ndp->ni_dvp = start_dvp; |
7674 | ndp->ni_cnd.cn_flags |= USEDVP; |
7675 | } |
7676 | |
7677 | if ((error = namei(ndp))) { |
7678 | goto out_free; |
7679 | } |
7680 | |
7681 | ndp->ni_cnd.cn_flags &= ~USEDVP; |
7682 | |
7683 | *vpp = ndp->ni_vp; |
7684 | nameidone(ndp); |
7685 | |
7686 | out_free: |
7687 | kfree_type(struct nameidata, ndp); |
7688 | return error; |
7689 | } |
7690 | |
7691 | errno_t |
7692 | vnode_lookup(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx) |
7693 | { |
7694 | return vnode_lookupat(path, flags, vpp, ctx, NULLVP); |
7695 | } |
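
/*
* Example (illustrative sketch): resolving a path to a vnode; on
* success the caller holds an iocount that must be released with
* vnode_put():
*
*	vnode_t vp = NULLVP;
*	int error;
*
*	error = vnode_lookup("/tmp/example", VNODE_LOOKUP_NOFOLLOW,
*	    &vp, vfs_context_current());
*	if (error == 0) {
*		... use vp ...
*		vnode_put(vp);
*	}
*/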
7696 | |
7697 | errno_t |
7698 | vnode_open(const char *path, int fmode, int cmode, int flags, vnode_t *vpp, vfs_context_t ctx) |
7699 | { |
7700 | struct nameidata *ndp = NULL; |
7701 | int error; |
7702 | u_int32_t ndflags = 0; |
7703 | int lflags = flags; |
7704 | |
7705 | if (ctx == NULL) { /* XXX technically an error */ |
7706 | ctx = vfs_context_current(); |
7707 | } |
7708 | |
7709 | ndp = kalloc_type(struct nameidata, Z_WAITOK | Z_NOFAIL); |
7710 | |
7711 | if (fmode & O_NOFOLLOW) { |
7712 | lflags |= VNODE_LOOKUP_NOFOLLOW; |
7713 | } |
7714 | |
7715 | if (lflags & VNODE_LOOKUP_NOFOLLOW) { |
7716 | ndflags = NOFOLLOW; |
7717 | } else { |
7718 | ndflags = FOLLOW; |
7719 | } |
7720 | |
7721 | if (lflags & VNODE_LOOKUP_NOCROSSMOUNT) { |
7722 | ndflags |= NOCROSSMOUNT; |
7723 | } |
7724 | |
7725 | if (lflags & VNODE_LOOKUP_CROSSMOUNTNOWAIT) { |
7726 | ndflags |= CN_NBMOUNTLOOK; |
7727 | } |
7728 | |
7729 | /* XXX AUDITVNPATH1 needed ? */ |
7730 | NDINIT(ndp, LOOKUP, OP_OPEN, ndflags, UIO_SYSSPACE, |
7731 | CAST_USER_ADDR_T(path), ctx); |
7732 | |
7733 | if ((error = vn_open(ndp, fmode, cmode))) { |
7734 | *vpp = NULL; |
7735 | } else { |
7736 | *vpp = ndp->ni_vp; |
7737 | } |
7738 | |
7739 | kfree_type(struct nameidata, ndp); |
7740 | return error; |
7741 | } |
7742 | |
7743 | errno_t |
7744 | vnode_close(vnode_t vp, int flags, vfs_context_t ctx) |
7745 | { |
7746 | int error; |
7747 | |
7748 | if (ctx == NULL) { |
7749 | ctx = vfs_context_current(); |
7750 | } |
7751 | |
7752 | error = vn_close(vp, flags, ctx); |
7753 | vnode_put(vp); |
7754 | return error; |
7755 | } |
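
/*
* Example (illustrative sketch): a vnode_open()/vnode_close() pair.
* Note that vnode_close() drops the iocount via vnode_put() itself,
* so no separate vnode_put() is needed after a successful open/close:
*
*	vnode_t vp = NULLVP;
*	int error;
*
*	error = vnode_open("/tmp/example", FREAD, 0, 0, &vp, ctx);
*	if (error == 0) {
*		... read from vp ...
*		error = vnode_close(vp, FREAD, ctx);
*	}
*/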
7756 | |
7757 | errno_t |
7758 | vnode_mtime(vnode_t vp, struct timespec *mtime, vfs_context_t ctx) |
7759 | { |
7760 | struct vnode_attr va; |
7761 | int error; |
7762 | |
7763 | VATTR_INIT(&va); |
7764 | VATTR_WANTED(&va, va_modify_time); |
error = vnode_getattr(vp, &va, ctx);
7766 | if (!error) { |
7767 | *mtime = va.va_modify_time; |
7768 | } |
7769 | return error; |
7770 | } |
7771 | |
7772 | errno_t |
7773 | vnode_flags(vnode_t vp, uint32_t *flags, vfs_context_t ctx) |
7774 | { |
7775 | struct vnode_attr va; |
7776 | int error; |
7777 | |
7778 | VATTR_INIT(&va); |
7779 | VATTR_WANTED(&va, va_flags); |
error = vnode_getattr(vp, &va, ctx);
7781 | if (!error) { |
7782 | *flags = va.va_flags; |
7783 | } |
7784 | return error; |
7785 | } |
7786 | |
7787 | /* |
7788 | * Returns: 0 Success |
7789 | * vnode_getattr:??? |
7790 | */ |
7791 | errno_t |
7792 | vnode_size(vnode_t vp, off_t *sizep, vfs_context_t ctx) |
7793 | { |
7794 | struct vnode_attr va; |
7795 | int error; |
7796 | |
7797 | VATTR_INIT(&va); |
7798 | VATTR_WANTED(&va, va_data_size); |
error = vnode_getattr(vp, &va, ctx);
7800 | if (!error) { |
7801 | *sizep = va.va_data_size; |
7802 | } |
7803 | return error; |
7804 | } |
7805 | |
7806 | errno_t |
7807 | vnode_setsize(vnode_t vp, off_t size, int ioflag, vfs_context_t ctx) |
7808 | { |
7809 | struct vnode_attr va; |
7810 | |
7811 | VATTR_INIT(&va); |
7812 | VATTR_SET(&va, va_data_size, size); |
7813 | va.va_vaflags = ioflag & 0xffff; |
return vnode_setattr(vp, &va, ctx);
7815 | } |
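
/*
* Example (illustrative sketch): the wrappers above all follow the
* same VATTR_INIT/VATTR_WANTED/vnode_getattr pattern; fetching any
* other single attribute looks the same:
*
*	struct vnode_attr va;
*
*	VATTR_INIT(&va);
*	VATTR_WANTED(&va, va_uid);
*	error = vnode_getattr(vp, &va, ctx);
*	if (!error && VATTR_IS_SUPPORTED(&va, va_uid)) {
*		... use va.va_uid ...
*	}
*/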
7816 | |
7817 | int |
7818 | vnode_setdirty(vnode_t vp) |
7819 | { |
7820 | vnode_lock_spin(vp); |
7821 | vp->v_flag |= VISDIRTY; |
7822 | vnode_unlock(vp); |
7823 | return 0; |
7824 | } |
7825 | |
7826 | int |
7827 | vnode_cleardirty(vnode_t vp) |
7828 | { |
7829 | vnode_lock_spin(vp); |
7830 | vp->v_flag &= ~VISDIRTY; |
7831 | vnode_unlock(vp); |
7832 | return 0; |
7833 | } |
7834 | |
7835 | int |
7836 | vnode_isdirty(vnode_t vp) |
7837 | { |
7838 | int dirty; |
7839 | |
7840 | vnode_lock_spin(vp); |
7841 | dirty = (vp->v_flag & VISDIRTY) ? 1 : 0; |
7842 | vnode_unlock(vp); |
7843 | |
7844 | return dirty; |
7845 | } |
7846 | |
7847 | static int |
7848 | vn_create_reg(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx) |
7849 | { |
7850 | /* Only use compound VNOP for compound operation */ |
if (vnode_compound_open_available(dvp) && ((flags & VN_CREATE_DOOPEN) != 0)) {
*vpp = NULLVP;
return VNOP_COMPOUND_OPEN(dvp, vpp, ndp, O_CREAT, fmode, statusp, vap, ctx);
7854 | } else { |
7855 | return VNOP_CREATE(dvp, vpp, &ndp->ni_cnd, vap, ctx); |
7856 | } |
7857 | } |
7858 | |
7859 | /* |
7860 | * Create a filesystem object of arbitrary type with arbitrary attributes in |
* the specified directory with the specified name.
7862 | * |
7863 | * Parameters: dvp Pointer to the vnode of the directory |
7864 | * in which to create the object. |
7865 | * vpp Pointer to the area into which to |
7866 | * return the vnode of the created object. |
7867 | * cnp Component name pointer from the namei |
7868 | * data structure, containing the name to |
7869 | * use for the create object. |
7870 | * vap Pointer to the vnode_attr structure |
7871 | * describing the object to be created, |
7872 | * including the type of object. |
7873 | * flags VN_* flags controlling ACL inheritance |
7874 | * and whether or not authorization is to |
7875 | * be required for the operation. |
7876 | * |
7877 | * Returns: 0 Success |
7878 | * !0 errno value |
7879 | * |
* Implicit: *vpp Contains the vnode of the object that
* was created, if successful.
* *cnp May be modified by the underlying VFS.
* *vap May be modified by the underlying VFS;
* modified by either ACL inheritance or
* the underlying filesystem, and may
* be modified, even if the operation is
* unsuccessful.
7890 | * Notes: The kauth_filesec_t in 'vap', if any, is in host byte order. |
7891 | * |
7892 | * Modification of '*cnp' and '*vap' by the underlying VFS is |
7893 | * strongly discouraged. |
7894 | * |
7895 | * XXX: This function is a 'vn_*' function; it belongs in vfs_vnops.c |
7896 | * |
* XXX: We should enumerate the possible errno values here, and where
7898 | * in the code they originated. |
7899 | */ |
7900 | errno_t |
7901 | vn_create(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx) |
7902 | { |
7903 | errno_t error, old_error; |
7904 | vnode_t vp = (vnode_t)0; |
7905 | boolean_t batched; |
7906 | struct componentname *cnp; |
7907 | uint32_t defaulted; |
7908 | |
7909 | cnp = &ndp->ni_cnd; |
7910 | error = 0; |
batched = namei_compound_available(dvp, ndp) ? TRUE : FALSE;
7912 | |
7913 | KAUTH_DEBUG("%p CREATE - '%s'" , dvp, cnp->cn_nameptr); |
7914 | |
7915 | if (flags & VN_CREATE_NOINHERIT) { |
7916 | vap->va_vaflags |= VA_NOINHERIT; |
7917 | } |
7918 | if (flags & VN_CREATE_NOAUTH) { |
7919 | vap->va_vaflags |= VA_NOAUTH; |
7920 | } |
7921 | /* |
7922 | * Handle ACL inheritance, initialize vap. |
7923 | */ |
error = vn_attribute_prepare(dvp, vap, &defaulted, ctx);
7925 | if (error) { |
7926 | return error; |
7927 | } |
7928 | |
if (vap->va_type != VREG && (fmode != 0 || (flags & VN_CREATE_DOOPEN) || statusp)) {
panic("Open parameters, but not a regular file.");
}
if ((fmode != 0) && ((flags & VN_CREATE_DOOPEN) == 0)) {
panic("Mode for open, but not trying to open...");
}
7935 | |
7936 | |
7937 | /* |
7938 | * Create the requested node. |
7939 | */ |
7940 | switch (vap->va_type) { |
7941 | case VREG: |
7942 | error = vn_create_reg(dvp, vpp, ndp, vap, flags, fmode, statusp, ctx); |
7943 | break; |
7944 | case VDIR: |
7945 | error = vn_mkdir(dvp, vpp, ndp, vap, ctx); |
7946 | break; |
7947 | case VSOCK: |
7948 | case VFIFO: |
7949 | case VBLK: |
7950 | case VCHR: |
7951 | error = VNOP_MKNOD(dvp, vpp, cnp, vap, ctx); |
7952 | break; |
7953 | default: |
7954 | panic("vnode_create: unknown vtype %d" , vap->va_type); |
7955 | } |
7956 | if (error != 0) { |
7957 | KAUTH_DEBUG("%p CREATE - error %d returned by filesystem" , dvp, error); |
7958 | goto out; |
7959 | } |
7960 | |
7961 | vp = *vpp; |
7962 | old_error = error; |
7963 | |
7964 | /* |
7965 | * If some of the requested attributes weren't handled by the VNOP, |
7966 | * use our fallback code. |
7967 | */ |
7968 | if ((error == 0) && !VATTR_ALL_SUPPORTED(vap) && *vpp) { |
7969 | KAUTH_DEBUG(" CREATE - doing fallback with ACL %p" , vap->va_acl); |
7970 | error = vnode_setattr_fallback(vp: *vpp, vap, ctx); |
7971 | } |
7972 | |
7973 | #if CONFIG_MACF |
7974 | if ((error == 0) && !(flags & VN_CREATE_NOLABEL)) { |
error = vnode_label(vnode_mount(vp), dvp, vp, cnp, VNODE_LABEL_CREATE, ctx);
7976 | } |
7977 | #endif |
7978 | |
7979 | if ((error != 0) && (vp != (vnode_t)0)) { |
7980 | /* If we've done a compound open, close */ |
7981 | if (batched && (old_error == 0) && (vap->va_type == VREG)) { |
7982 | VNOP_CLOSE(vp, fmode, ctx); |
7983 | } |
7984 | |
7985 | /* Need to provide notifications if a create succeeded */ |
7986 | if (!batched) { |
7987 | *vpp = (vnode_t) 0; |
7988 | vnode_put(vp); |
7989 | vp = NULLVP; |
7990 | } |
7991 | } |
7992 | |
7993 | /* |
7994 | * For creation VNOPs, this is the equivalent of |
7995 | * lookup_handle_found_vnode. |
7996 | */ |
7997 | if (kdebug_enable && *vpp) { |
kdebug_lookup(*vpp, cnp);
7999 | } |
8000 | |
8001 | out: |
vn_attribute_cleanup(vap, defaulted);
8003 | |
8004 | return error; |
8005 | } |
8006 | |
8007 | static kauth_scope_t vnode_scope; |
8008 | static int vnode_authorize_callback(kauth_cred_t credential, void *idata, kauth_action_t action, |
8009 | uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3); |
8010 | static int vnode_authorize_callback_int(kauth_action_t action, vfs_context_t ctx, |
8011 | vnode_t vp, vnode_t dvp, int *errorp); |
8012 | |
8013 | typedef struct _vnode_authorize_context { |
8014 | vnode_t vp; |
8015 | struct vnode_attr *vap; |
8016 | vnode_t dvp; |
8017 | struct vnode_attr *dvap; |
8018 | vfs_context_t ctx; |
8019 | int flags; |
8020 | int flags_valid; |
8021 | #define _VAC_IS_OWNER (1<<0) |
8022 | #define _VAC_IN_GROUP (1<<1) |
8023 | #define _VAC_IS_DIR_OWNER (1<<2) |
8024 | #define _VAC_IN_DIR_GROUP (1<<3) |
8025 | #define _VAC_NO_VNODE_POINTERS (1<<4) |
8026 | } *vauth_ctx; |
8027 | |
8028 | void |
8029 | vnode_authorize_init(void) |
8030 | { |
vnode_scope = kauth_register_scope(KAUTH_SCOPE_VNODE, vnode_authorize_callback, NULL);
8032 | } |
8033 | |
8034 | #define VATTR_PREPARE_DEFAULTED_UID 0x1 |
8035 | #define VATTR_PREPARE_DEFAULTED_GID 0x2 |
8036 | #define VATTR_PREPARE_DEFAULTED_MODE 0x4 |
8037 | |
8038 | int |
8039 | vn_attribute_prepare(vnode_t dvp, struct vnode_attr *vap, uint32_t *defaulted_fieldsp, vfs_context_t ctx) |
8040 | { |
8041 | kauth_acl_t nacl = NULL, oacl = NULL; |
8042 | int error; |
8043 | |
8044 | /* |
8045 | * Handle ACL inheritance. |
8046 | */ |
8047 | if (!(vap->va_vaflags & VA_NOINHERIT) && vfs_extendedsecurity(dvp->v_mount)) { |
8048 | /* save the original filesec */ |
8049 | if (VATTR_IS_ACTIVE(vap, va_acl)) { |
8050 | oacl = vap->va_acl; |
8051 | } |
8052 | |
8053 | vap->va_acl = NULL; |
if ((error = kauth_acl_inherit(dvp,
oacl,
&nacl,
vap->va_type == VDIR,
ctx)) != 0) {
KAUTH_DEBUG("%p CREATE - error %d processing inheritance", dvp, error);
8060 | return error; |
8061 | } |
8062 | |
8063 | /* |
8064 | * If the generated ACL is NULL, then we can save ourselves some effort |
8065 | * by clearing the active bit. |
8066 | */ |
8067 | if (nacl == NULL) { |
8068 | VATTR_CLEAR_ACTIVE(vap, va_acl); |
8069 | } else { |
8070 | vap->va_base_acl = oacl; |
8071 | VATTR_SET(vap, va_acl, nacl); |
8072 | } |
8073 | } |
8074 | |
error = vnode_authattr_new_internal(dvp, vap, (vap->va_vaflags & VA_NOAUTH), defaulted_fieldsp, ctx);
if (error) {
vn_attribute_cleanup(vap, *defaulted_fieldsp);
8078 | } |
8079 | |
8080 | return error; |
8081 | } |
8082 | |
8083 | void |
8084 | vn_attribute_cleanup(struct vnode_attr *vap, uint32_t defaulted_fields) |
8085 | { |
8086 | /* |
8087 | * If the caller supplied a filesec in vap, it has been replaced |
8088 | * now by the post-inheritance copy. We need to put the original back |
8089 | * and free the inherited product. |
8090 | */ |
8091 | kauth_acl_t nacl, oacl; |
8092 | |
8093 | if (VATTR_IS_ACTIVE(vap, va_acl)) { |
8094 | nacl = vap->va_acl; |
8095 | oacl = vap->va_base_acl; |
8096 | |
8097 | if (oacl) { |
8098 | VATTR_SET(vap, va_acl, oacl); |
8099 | vap->va_base_acl = NULL; |
8100 | } else { |
8101 | VATTR_CLEAR_ACTIVE(vap, va_acl); |
8102 | } |
8103 | |
8104 | if (nacl != NULL) { |
8105 | /* |
8106 | * Only free the ACL buffer if 'VA_FILESEC_ACL' is not set as it |
8107 | * should be freed by the caller or it is a post-inheritance copy. |
8108 | */ |
8109 | if (!(vap->va_vaflags & VA_FILESEC_ACL) || |
8110 | (oacl != NULL && nacl != oacl)) { |
kauth_acl_free(nacl);
8112 | } |
8113 | } |
8114 | } |
8115 | |
8116 | if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_MODE) != 0) { |
8117 | VATTR_CLEAR_ACTIVE(vap, va_mode); |
8118 | } |
8119 | if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_GID) != 0) { |
8120 | VATTR_CLEAR_ACTIVE(vap, va_gid); |
8121 | } |
8122 | if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_UID) != 0) { |
8123 | VATTR_CLEAR_ACTIVE(vap, va_uid); |
8124 | } |
8125 | |
8126 | return; |
8127 | } |
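
/*
* Example (illustrative sketch): vn_attribute_prepare() and
* vn_attribute_cleanup() always bracket the creation VNOP, as in
* vn_create() above:
*
*	uint32_t defaulted;
*
*	error = vn_attribute_prepare(dvp, vap, &defaulted, ctx);
*	if (error == 0) {
*		error = VNOP_CREATE(dvp, vpp, cnp, vap, ctx);
*		vn_attribute_cleanup(vap, defaulted);
*	}
*/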
8128 | |
8129 | int |
8130 | vn_authorize_unlink(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, __unused void *reserved) |
8131 | { |
8132 | #if !CONFIG_MACF |
8133 | #pragma unused(cnp) |
8134 | #endif |
8135 | int error = 0; |
8136 | |
8137 | /* |
8138 | * Normally, unlinking of directories is not supported. |
8139 | * However, some file systems may have limited support. |
8140 | */ |
8141 | if ((vp->v_type == VDIR) && |
8142 | !(vp->v_mount->mnt_kern_flag & MNTK_DIR_HARDLINKS)) { |
8143 | return EPERM; /* POSIX */ |
8144 | } |
8145 | |
8146 | /* authorize the delete operation */ |
8147 | #if CONFIG_MACF |
8148 | if (!error) { |
8149 | error = mac_vnode_check_unlink(ctx, dvp, vp, cnp); |
8150 | } |
8151 | #endif /* MAC */ |
8152 | if (!error) { |
8153 | error = vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx); |
8154 | } |
8155 | |
8156 | return error; |
8157 | } |
8158 | |
8159 | int |
8160 | vn_authorize_open_existing(vnode_t vp, struct componentname *cnp, int fmode, vfs_context_t ctx, void *reserved) |
8161 | { |
8162 | /* Open of existing case */ |
8163 | kauth_action_t action; |
8164 | int error = 0; |
if (cnp->cn_ndp == NULL) {
panic("NULL ndp");
}
if (reserved != NULL) {
panic("reserved not NULL.");
}
8171 | |
8172 | #if CONFIG_MACF |
8173 | /* XXX may do duplicate work here, but ignore that for now (idempotent) */ |
if (vfs_flags(vnode_mount(vp)) & MNT_MULTILABEL) {
error = vnode_label(vnode_mount(vp), NULL, vp, NULL, 0, ctx);
8176 | if (error) { |
8177 | return error; |
8178 | } |
8179 | } |
8180 | #endif |
8181 | |
8182 | if (vnode_isdir(vp)) { |
8183 | if ((fmode & (FWRITE | O_TRUNC)) || /* disallow write operations on directories */ |
8184 | ((fmode & FSEARCH) && !(fmode & O_DIRECTORY))) { |
8185 | return EISDIR; |
8186 | } |
8187 | } else { |
8188 | if (fmode & O_DIRECTORY) { |
8189 | return ENOTDIR; |
8190 | } |
8191 | |
8192 | if (vp->v_type == VSOCK && vp->v_tag != VT_FDESC) { |
8193 | return EOPNOTSUPP; /* Operation not supported on socket */ |
8194 | } |
8195 | |
8196 | if (vp->v_type == VLNK && (fmode & O_NOFOLLOW) != 0) { |
8197 | return ELOOP; /* O_NOFOLLOW was specified and the target is a symbolic link */ |
8198 | } |
8199 | |
8200 | if (cnp->cn_ndp->ni_flag & NAMEI_TRAILINGSLASH) { |
8201 | return ENOTDIR; |
8202 | } |
8203 | |
8204 | if (!vnode_isreg(vp) && (fmode & FEXEC)) { |
8205 | return EACCES; |
8206 | } |
8207 | } |
8208 | |
8209 | #if CONFIG_MACF |
8210 | /* If a file being opened is a shadow file containing |
8211 | * namedstream data, ignore the macf checks because it |
8212 | * is a kernel internal file and access should always |
8213 | * be allowed. |
8214 | */ |
8215 | if (!(vnode_isshadow(vp) && vnode_isnamedstream(vp))) { |
error = mac_vnode_check_open(ctx, vp, fmode);
8217 | if (error) { |
8218 | return error; |
8219 | } |
8220 | } |
8221 | #endif |
8222 | |
8223 | /* compute action to be authorized */ |
8224 | action = 0; |
8225 | if (fmode & FREAD) { |
8226 | action |= KAUTH_VNODE_READ_DATA; |
8227 | } |
8228 | if (fmode & (FWRITE | O_TRUNC)) { |
8229 | /* |
8230 | * If we are writing, appending, and not truncating, |
8231 | * indicate that we are appending so that if the |
8232 | * UF_APPEND or SF_APPEND bits are set, we do not deny |
8233 | * the open. |
8234 | */ |
8235 | if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) { |
8236 | action |= KAUTH_VNODE_APPEND_DATA; |
8237 | } else { |
8238 | action |= KAUTH_VNODE_WRITE_DATA; |
8239 | } |
8240 | } |
8241 | if (fmode & (FSEARCH | FEXEC)) { |
8242 | if (vnode_isdir(vp)) { |
8243 | action |= KAUTH_VNODE_SEARCH; |
8244 | } else { |
8245 | action |= KAUTH_VNODE_EXECUTE; |
8246 | } |
8247 | } |
8248 | error = vnode_authorize(vp, NULL, action, ctx); |
8249 | #if NAMEDSTREAMS |
8250 | if (error == EACCES) { |
8251 | /* |
8252 | * Shadow files may exist on-disk with a different UID/GID |
8253 | * than that of the current context. Verify that this file |
8254 | * is really a shadow file. If it was created successfully |
8255 | * then it should be authorized. |
8256 | */ |
8257 | if (vnode_isshadow(vp) && vnode_isnamedstream(vp)) { |
8258 | error = vnode_verifynamedstream(vp); |
8259 | } |
8260 | } |
8261 | #endif |
8262 | |
8263 | return error; |
8264 | } |
8265 | |
8266 | int |
8267 | vn_authorize_create(vnode_t dvp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx, void *reserved) |
8268 | { |
8269 | #if !CONFIG_MACF |
8270 | #pragma unused(vap) |
8271 | #endif |
8272 | /* Creation case */ |
8273 | int error; |
8274 | |
if (cnp->cn_ndp == NULL) {
panic("NULL cn_ndp");
}
if (reserved != NULL) {
panic("reserved not NULL.");
}
8281 | |
8282 | /* Only validate path for creation if we didn't do a complete lookup */ |
8283 | if (cnp->cn_ndp->ni_flag & NAMEI_UNFINISHED) { |
error = lookup_validate_creation_path(cnp->cn_ndp);
8285 | if (error) { |
8286 | return error; |
8287 | } |
8288 | } |
8289 | |
8290 | #if CONFIG_MACF |
8291 | error = mac_vnode_check_create(ctx, dvp, cnp, vap); |
8292 | if (error) { |
8293 | return error; |
8294 | } |
8295 | #endif /* CONFIG_MACF */ |
8296 | |
return vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
8298 | } |
8299 | |
8300 | int |
8301 | vn_authorize_rename(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, |
8302 | struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, |
8303 | vfs_context_t ctx, void *reserved) |
8304 | { |
return vn_authorize_renamex(fdvp, fvp, fcnp, tdvp, tvp, tcnp, ctx, 0, reserved);
8306 | } |
8307 | |
8308 | int |
8309 | vn_authorize_renamex(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, |
8310 | struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, |
8311 | vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved) |
8312 | { |
8313 | return vn_authorize_renamex_with_paths(fdvp, fvp, fcnp, NULL, tdvp, tvp, tcnp, NULL, ctx, flags, reserved); |
8314 | } |
8315 | |
8316 | int |
8317 | vn_authorize_renamex_with_paths(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, const char *from_path, |
8318 | struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, const char *to_path, |
8319 | vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved) |
8320 | { |
8321 | int error = 0; |
8322 | int moving = 0; |
8323 | bool swap = flags & VFS_RENAME_SWAP; |
8324 | |
8325 | if (reserved != NULL) { |
8326 | panic("Passed something other than NULL as reserved field!" ); |
8327 | } |
8328 | |
8329 | /* |
8330 | * Avoid renaming "." and "..". |
8331 | * |
8332 | * XXX No need to check for this in the FS. We should always have the leaves |
8333 | * in VFS in this case. |
8334 | */ |
8335 | if (fvp->v_type == VDIR && |
8336 | ((fdvp == fvp) || |
8337 | (fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || |
8338 | ((fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT))) { |
8339 | error = EINVAL; |
8340 | goto out; |
8341 | } |
8342 | |
if (tvp == NULLVP && vnode_compound_rename_available(tdvp)) {
error = lookup_validate_creation_path(tcnp->cn_ndp);
8345 | if (error) { |
8346 | goto out; |
8347 | } |
8348 | } |
8349 | |
8350 | /***** <MACF> *****/ |
8351 | #if CONFIG_MACF |
error = mac_vnode_check_rename(ctx, fdvp, fvp, fcnp, tdvp, tvp, tcnp);
8353 | if (error) { |
8354 | goto out; |
8355 | } |
8356 | if (swap) { |
error = mac_vnode_check_rename(ctx, tdvp, tvp, tcnp, fdvp, fvp, fcnp);
8358 | if (error) { |
8359 | goto out; |
8360 | } |
8361 | } |
8362 | #endif |
8363 | /***** </MACF> *****/ |
8364 | |
8365 | /***** <MiscChecks> *****/ |
8366 | if (tvp != NULL) { |
8367 | if (!swap) { |
8368 | if (fvp->v_type == VDIR && tvp->v_type != VDIR) { |
8369 | error = ENOTDIR; |
8370 | goto out; |
8371 | } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { |
8372 | error = EISDIR; |
8373 | goto out; |
8374 | } |
8375 | } |
8376 | } else if (swap) { |
8377 | /* |
* Caller should have already checked this and returned
* ENOENT. If we send back ENOENT here, the caller will retry,
* which isn't what we want, so we send back EINVAL
* instead.
8382 | */ |
8383 | error = EINVAL; |
8384 | goto out; |
8385 | } |
8386 | |
8387 | if (fvp == tdvp) { |
8388 | error = EINVAL; |
8389 | goto out; |
8390 | } |
8391 | |
8392 | /* |
8393 | * The following edge case is caught here: |
8394 | * (to cannot be a descendent of from) |
8395 | * |
8396 | * o fdvp |
8397 | * / |
8398 | * / |
8399 | * o fvp |
8400 | * \ |
8401 | * \ |
8402 | * o tdvp |
8403 | * / |
8404 | * / |
8405 | * o tvp |
8406 | */ |
8407 | if (tdvp->v_parent == fvp) { |
8408 | error = EINVAL; |
8409 | goto out; |
8410 | } |
8411 | |
8412 | if (swap && fdvp->v_parent == tvp) { |
8413 | error = EINVAL; |
8414 | goto out; |
8415 | } |
8416 | /***** </MiscChecks> *****/ |
8417 | |
8418 | /***** <Kauth> *****/ |
8419 | |
8420 | /* |
8421 | * As part of the Kauth step, we call out to allow 3rd-party |
8422 | * fileop notification of "about to rename". This is needed |
8423 | * in the event that 3rd-parties need to know that the DELETE |
8424 | * authorization is actually part of a rename. It's important |
8425 | * that we guarantee that the DELETE call-out will always be |
8426 | * made if the WILL_RENAME call-out is made. Another fileop |
8427 | * call-out will be performed once the operation is completed. |
8428 | * We can ignore the result of kauth_authorize_fileop(). |
8429 | * |
8430 | * N.B. We are passing the vnode and *both* paths to each |
8431 | * call; kauth_authorize_fileop() extracts the "from" path |
8432 | * when posting a KAUTH_FILEOP_WILL_RENAME notification. |
8433 | * As such, we only post these notifications if all of the |
8434 | * information we need is provided. |
8435 | */ |
8436 | |
8437 | if (swap) { |
8438 | kauth_action_t f = 0, t = 0; |
8439 | |
8440 | /* |
8441 | * Directories changing parents need ...ADD_SUBDIR... to |
8442 | * permit changing ".." |
8443 | */ |
if (fdvp != tdvp) {
if (vnode_isdir(fvp)) {
f = KAUTH_VNODE_ADD_SUBDIRECTORY;
}
if (vnode_isdir(tvp)) {
t = KAUTH_VNODE_ADD_SUBDIRECTORY;
}
}
if (to_path != NULL) {
kauth_authorize_fileop(vfs_context_ucred(ctx),
KAUTH_FILEOP_WILL_RENAME,
(uintptr_t)fvp,
(uintptr_t)to_path);
}
error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | f, ctx);
if (error) {
goto out;
}
if (from_path != NULL) {
kauth_authorize_fileop(vfs_context_ucred(ctx),
KAUTH_FILEOP_WILL_RENAME,
(uintptr_t)tvp,
(uintptr_t)from_path);
}
error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE | t, ctx);
if (error) {
goto out;
}
f = vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE;
t = vnode_isdir(tvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE;
if (fdvp == tdvp) {
error = vnode_authorize(fdvp, NULL, f | t, ctx);
} else {
error = vnode_authorize(fdvp, NULL, t, ctx);
if (error) {
goto out;
}
error = vnode_authorize(tdvp, NULL, f, ctx);
8482 | } |
8483 | if (error) { |
8484 | goto out; |
8485 | } |
8486 | } else { |
8487 | error = 0; |
if ((tvp != NULL) && vnode_isdir(tvp)) {
8489 | if (tvp != fdvp) { |
8490 | moving = 1; |
8491 | } |
8492 | } else if (tdvp != fdvp) { |
8493 | moving = 1; |
8494 | } |
8495 | |
8496 | /* |
8497 | * must have delete rights to remove the old name even in |
8498 | * the simple case of fdvp == tdvp. |
8499 | * |
* If fvp is a directory, and we are changing its parent,
8501 | * then we also need rights to rewrite its ".." entry as well. |
8502 | */ |
if (to_path != NULL) {
kauth_authorize_fileop(vfs_context_ucred(ctx),
KAUTH_FILEOP_WILL_RENAME,
(uintptr_t)fvp,
(uintptr_t)to_path);
}
if (vnode_isdir(fvp)) {
if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) {
goto out;
}
} else {
if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
goto out;
}
}
if (moving) {
/* moving into tdvp or tvp, must have rights to add */
if ((error = vnode_authorize(((tvp != NULL) && vnode_isdir(tvp)) ? tvp : tdvp,
NULL,
vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE,
ctx)) != 0) {
goto out;
}
} else {
/* node staying in same directory, must be allowed to add new name */
if ((error = vnode_authorize(fdvp, NULL,
vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
goto out;
}
}
/* overwriting tvp */
if ((tvp != NULL) && !vnode_isdir(tvp) &&
((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0)) {
8536 | goto out; |
8537 | } |
8538 | } |
8539 | |
8540 | /***** </Kauth> *****/ |
8541 | |
8542 | /* XXX more checks? */ |
8543 | out: |
8544 | return error; |
8545 | } |
8546 | |
8547 | int |
8548 | vn_authorize_mkdir(vnode_t dvp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx, void *reserved) |
8549 | { |
8550 | #if !CONFIG_MACF |
8551 | #pragma unused(vap) |
8552 | #endif |
8553 | int error; |
8554 | |
8555 | if (reserved != NULL) { |
8556 | panic("reserved not NULL in vn_authorize_mkdir()" ); |
8557 | } |
8558 | |
8559 | /* XXX A hack for now, to make shadow files work */ |
8560 | if (cnp->cn_ndp == NULL) { |
8561 | return 0; |
8562 | } |
8563 | |
if (vnode_compound_mkdir_available(dvp)) {
error = lookup_validate_creation_path(cnp->cn_ndp);
8566 | if (error) { |
8567 | goto out; |
8568 | } |
8569 | } |
8570 | |
8571 | #if CONFIG_MACF |
8572 | error = mac_vnode_check_create(ctx, |
8573 | dvp, cnp, vap); |
8574 | if (error) { |
8575 | goto out; |
8576 | } |
8577 | #endif |
8578 | |
8579 | /* authorize addition of a directory to the parent */ |
if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) {
8581 | goto out; |
8582 | } |
8583 | |
8584 | out: |
8585 | return error; |
8586 | } |
8587 | |
8588 | int |
8589 | vn_authorize_rmdir(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, void *reserved) |
8590 | { |
8591 | #if CONFIG_MACF |
8592 | int error; |
8593 | #else |
8594 | #pragma unused(cnp) |
8595 | #endif |
8596 | if (reserved != NULL) { |
8597 | panic("Non-NULL reserved argument to vn_authorize_rmdir()" ); |
8598 | } |
8599 | |
8600 | if (vp->v_type != VDIR) { |
8601 | /* |
8602 | * rmdir only deals with directories |
8603 | */ |
8604 | return ENOTDIR; |
8605 | } |
8606 | |
8607 | if (dvp == vp) { |
8608 | /* |
8609 | * No rmdir "." please. |
8610 | */ |
8611 | return EINVAL; |
8612 | } |
8613 | |
8614 | #if CONFIG_MACF |
8615 | error = mac_vnode_check_unlink(ctx, dvp, |
8616 | vp, cnp); |
8617 | if (error) { |
8618 | return error; |
8619 | } |
8620 | #endif |
8621 | |
8622 | return vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx); |
8623 | } |
8624 | |
8625 | /* |
8626 | * Authorizer for directory cloning. This does not use vnodes but instead |
8627 | * uses prefilled vnode attributes from the filesystem. |
8628 | * |
8629 | * The same function is called to set up the attributes required, perform the |
8630 | * authorization and cleanup (if required) |
8631 | */ |
8632 | int |
8633 | vnode_attr_authorize_dir_clone(struct vnode_attr *vap, kauth_action_t action, |
8634 | struct vnode_attr *dvap, __unused vnode_t sdvp, mount_t mp, |
8635 | dir_clone_authorizer_op_t vattr_op, uint32_t flags, vfs_context_t ctx, |
8636 | __unused void *reserved) |
8637 | { |
8638 | int error; |
8639 | int is_suser = vfs_context_issuser(ctx); |
8640 | |
8641 | if (vattr_op == OP_VATTR_SETUP) { |
8642 | VATTR_INIT(vap); |
8643 | |
8644 | /* |
* When ACL inheritance is implemented, both vap->va_acl and
8646 | * dvap->va_acl will be required (even as superuser). |
8647 | */ |
8648 | VATTR_WANTED(vap, va_type); |
8649 | VATTR_WANTED(vap, va_mode); |
8650 | VATTR_WANTED(vap, va_flags); |
8651 | VATTR_WANTED(vap, va_uid); |
8652 | VATTR_WANTED(vap, va_gid); |
8653 | if (dvap) { |
8654 | VATTR_INIT(dvap); |
8655 | VATTR_WANTED(dvap, va_flags); |
8656 | } |
8657 | |
8658 | if (!is_suser) { |
8659 | /* |
8660 | * If not superuser, we have to evaluate ACLs and |
8661 | * need the target directory gid to set the initial |
8662 | * gid of the new object. |
8663 | */ |
8664 | VATTR_WANTED(vap, va_acl); |
8665 | if (dvap) { |
8666 | VATTR_WANTED(dvap, va_gid); |
8667 | } |
8668 | } else if (dvap && (flags & VNODE_CLONEFILE_NOOWNERCOPY)) { |
8669 | VATTR_WANTED(dvap, va_gid); |
8670 | } |
8671 | return 0; |
8672 | } else if (vattr_op == OP_VATTR_CLEANUP) { |
8673 | return 0; /* Nothing to do for now */ |
8674 | } |
8675 | |
8676 | /* dvap isn't used for authorization */ |
8677 | error = vnode_attr_authorize(vap, NULL, mp, action, ctx); |
8678 | |
8679 | if (error) { |
8680 | return error; |
8681 | } |
8682 | |
8683 | /* |
8684 | * vn_attribute_prepare should be able to accept attributes as well as |
8685 | * vnodes but for now we do this inline. |
8686 | */ |
8687 | if (!is_suser || (flags & VNODE_CLONEFILE_NOOWNERCOPY)) { |
8688 | /* |
8689 | * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit |
8690 | * owner is set, that owner takes ownership of all new files. |
8691 | */ |
8692 | if ((mp->mnt_flag & MNT_IGNORE_OWNERSHIP) && |
8693 | (mp->mnt_fsowner != KAUTH_UID_NONE)) { |
8694 | VATTR_SET(vap, va_uid, mp->mnt_fsowner); |
8695 | } else { |
8696 | /* default owner is current user */ |
8697 | VATTR_SET(vap, va_uid, |
8698 | kauth_cred_getuid(vfs_context_ucred(ctx))); |
8699 | } |
8700 | |
8701 | if ((mp->mnt_flag & MNT_IGNORE_OWNERSHIP) && |
8702 | (mp->mnt_fsgroup != KAUTH_GID_NONE)) { |
8703 | VATTR_SET(vap, va_gid, mp->mnt_fsgroup); |
8704 | } else { |
8705 | /* |
8706 | * default group comes from parent object, |
8707 | * fallback to current user |
8708 | */ |
8709 | if (VATTR_IS_SUPPORTED(dvap, va_gid)) { |
8710 | VATTR_SET(vap, va_gid, dvap->va_gid); |
8711 | } else { |
8712 | VATTR_SET(vap, va_gid, |
8713 | kauth_cred_getgid(vfs_context_ucred(ctx))); |
8714 | } |
8715 | } |
8716 | } |
8717 | |
8718 | /* Inherit SF_RESTRICTED bit from destination directory only */ |
8719 | if (VATTR_IS_ACTIVE(vap, va_flags)) { |
8720 | VATTR_SET(vap, va_flags, |
8721 | ((vap->va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)))); /* Turn off from source */ |
8722 | if (VATTR_IS_ACTIVE(dvap, va_flags)) { |
8723 | VATTR_SET(vap, va_flags, |
8724 | vap->va_flags | (dvap->va_flags & (UF_DATAVAULT | SF_RESTRICTED))); |
8725 | } |
8726 | } else if (VATTR_IS_ACTIVE(dvap, va_flags)) { |
8727 | VATTR_SET(vap, va_flags, (dvap->va_flags & (UF_DATAVAULT | SF_RESTRICTED))); |
8728 | } |
8729 | |
8730 | return 0; |
8731 | } |
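
/*
* Example (illustrative sketch): the clone authorizer is driven in
* phases by the caller: once with OP_VATTR_SETUP to mark the
* attributes the filesystem must supply, then again once the
* attributes are filled in (any op other than the setup/cleanup ops,
* conventionally OP_AUTHORIZE) to perform the authorization and apply
* the ownership defaults, and finally with OP_VATTR_CLEANUP:
*
*	(void)vnode_attr_authorize_dir_clone(&va, action, &dva, sdvp,
*	    mp, OP_VATTR_SETUP, flags, ctx, NULL);
*	... filesystem fills in va and dva ...
*	error = vnode_attr_authorize_dir_clone(&va, action, &dva, sdvp,
*	    mp, OP_AUTHORIZE, flags, ctx, NULL);
*/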
8732 | |
8733 | |
8734 | /* |
8735 | * Authorize an operation on a vnode. |
8736 | * |
8737 | * This is KPI, but here because it needs vnode_scope. |
8738 | * |
8739 | * Returns: 0 Success |
8740 | * kauth_authorize_action:EPERM ... |
8741 | * xlate => EACCES Permission denied |
8742 | * kauth_authorize_action:0 Success |
8743 | * kauth_authorize_action: Depends on callback return; this is |
8744 | * usually only vnode_authorize_callback(), |
 *				but may include other listeners, if any
8746 | * exist. |
8747 | * EROFS |
8748 | * EACCES |
8749 | * EPERM |
8750 | * ??? |
8751 | */ |
8752 | int |
8753 | vnode_authorize(vnode_t vp, vnode_t dvp, kauth_action_t action, vfs_context_t ctx) |
8754 | { |
8755 | int error, result; |
8756 | |
8757 | /* |
8758 | * We can't authorize against a dead vnode; allow all operations through so that |
8759 | * the correct error can be returned. |
8760 | */ |
8761 | if (vp->v_type == VBAD) { |
8762 | return 0; |
8763 | } |
8764 | |
8765 | error = 0; |
	result = kauth_authorize_action(vnode_scope, vfs_context_ucred(ctx), action,
	    (uintptr_t)ctx, (uintptr_t)vp, (uintptr_t)dvp, (uintptr_t)&error);
8768 | if (result == EPERM) { /* traditional behaviour */ |
8769 | result = EACCES; |
8770 | } |
8771 | /* did the lower layers give a better error return? */ |
8772 | if ((result != 0) && (error != 0)) { |
8773 | return error; |
8774 | } |
8775 | return result; |
8776 | } |
8777 | |
8778 | /* |
8779 | * Test for vnode immutability. |
8780 | * |
8781 | * The 'append' flag is set when the authorization request is constrained |
8782 | * to operations which only request the right to append to a file. |
8783 | * |
8784 | * The 'ignore' flag is set when an operation modifying the immutability flags |
8785 | * is being authorized. We check the system securelevel to determine which |
8786 | * immutability flags we can ignore. |
8787 | */ |
8788 | static int |
8789 | vnode_immutable(struct vnode_attr *vap, int append, int ignore) |
8790 | { |
8791 | int mask; |
8792 | |
8793 | /* start with all bits precluding the operation */ |
8794 | mask = IMMUTABLE | APPEND; |
8795 | |
8796 | /* if appending only, remove the append-only bits */ |
8797 | if (append) { |
8798 | mask &= ~APPEND; |
8799 | } |
8800 | |
8801 | /* ignore only set when authorizing flags changes */ |
8802 | if (ignore) { |
8803 | if (securelevel <= 0) { |
8804 | /* in insecure state, flags do not inhibit changes */ |
8805 | mask = 0; |
8806 | } else { |
8807 | /* in secure state, user flags don't inhibit */ |
8808 | mask &= ~(UF_IMMUTABLE | UF_APPEND); |
8809 | } |
8810 | } |
8811 | KAUTH_DEBUG("IMMUTABLE - file flags 0x%x mask 0x%x append = %d ignore = %d" , vap->va_flags, mask, append, ignore); |
8812 | if ((vap->va_flags & mask) != 0) { |
8813 | return EPERM; |
8814 | } |
8815 | return 0; |
8816 | } |
8817 | |
8818 | static int |
8819 | vauth_node_owner(struct vnode_attr *vap, kauth_cred_t cred) |
8820 | { |
8821 | int result; |
8822 | |
8823 | /* default assumption is not-owner */ |
8824 | result = 0; |
8825 | |
8826 | /* |
8827 | * If the filesystem has given us a UID, we treat this as authoritative. |
8828 | */ |
8829 | if (vap && VATTR_IS_SUPPORTED(vap, va_uid)) { |
		result = (vap->va_uid == kauth_cred_getuid(cred)) ? 1 : 0;
8831 | } |
8832 | /* we could test the owner UUID here if we had a policy for it */ |
8833 | |
8834 | return result; |
8835 | } |
8836 | |
8837 | /* |
8838 | * vauth_node_group |
8839 | * |
8840 | * Description: Ask if a cred is a member of the group owning the vnode object |
8841 | * |
8842 | * Parameters: vap vnode attribute |
8843 | * vap->va_gid group owner of vnode object |
8844 | * cred credential to check |
8845 | * ismember pointer to where to put the answer |
8846 | * idontknow Return this if we can't get an answer |
8847 | * |
8848 | * Returns: 0 Success |
8849 | * idontknow Can't get information |
 *	kauth_cred_ismember_gid:?	Error from kauth subsystem
8852 | */ |
8853 | static int |
8854 | vauth_node_group(struct vnode_attr *vap, kauth_cred_t cred, int *ismember, int idontknow) |
8855 | { |
8856 | int error; |
8857 | int result; |
8858 | |
8859 | error = 0; |
8860 | result = 0; |
8861 | |
8862 | /* |
8863 | * The caller is expected to have asked the filesystem for a group |
8864 | * at some point prior to calling this function. The answer may |
8865 | * have been that there is no group ownership supported for the |
	 * vnode object, in which case we return 0 and report the caller
	 * as not a member.
	 */
8868 | if (vap && VATTR_IS_SUPPORTED(vap, va_gid)) { |
		error = kauth_cred_ismember_gid(cred, vap->va_gid, &result);
8870 | /* |
8871 | * Credentials which are opted into external group membership |
8872 | * resolution which are not known to the external resolver |
8873 | * will result in an ENOENT error. We translate this into |
8874 | * the appropriate 'idontknow' response for our caller. |
8875 | * |
8876 | * XXX We do not make a distinction here between an ENOENT |
8877 | * XXX arising from a response from the external resolver, |
8878 | * XXX and an ENOENT which is internally generated. This is |
8879 | * XXX a deficiency of the published kauth_cred_ismember_gid() |
8880 | * XXX KPI which can not be overcome without new KPI. For |
	 * XXX all currently known cases, however, this will result
8882 | * XXX in correct behaviour. |
8883 | */ |
8884 | if (error == ENOENT) { |
8885 | error = idontknow; |
8886 | } |
8887 | } |
8888 | /* |
8889 | * XXX We could test the group UUID here if we had a policy for it, |
8890 | * XXX but this is problematic from the perspective of synchronizing |
8891 | * XXX group UUID and POSIX GID ownership of a file and keeping the |
8892 | * XXX values coherent over time. The problem is that the local |
8893 | * XXX system will vend transient group UUIDs for unknown POSIX GID |
8894 | * XXX values, and these are not persistent, whereas storage of values |
8895 | * XXX is persistent. One potential solution to this is a local |
8896 | * XXX (persistent) replica of remote directory entries and vended |
8897 | * XXX local ids in a local directory server (think in terms of a |
8898 | * XXX caching DNS server). |
8899 | */ |
8900 | |
8901 | if (!error) { |
8902 | *ismember = result; |
8903 | } |
8904 | return error; |
8905 | } |
8906 | |
8907 | static int |
8908 | vauth_file_owner(vauth_ctx vcp) |
8909 | { |
8910 | int result; |
8911 | |
8912 | if (vcp->flags_valid & _VAC_IS_OWNER) { |
8913 | result = (vcp->flags & _VAC_IS_OWNER) ? 1 : 0; |
8914 | } else { |
		result = vauth_node_owner(vcp->vap, vcp->ctx->vc_ucred);
8916 | |
8917 | /* cache our result */ |
8918 | vcp->flags_valid |= _VAC_IS_OWNER; |
8919 | if (result) { |
8920 | vcp->flags |= _VAC_IS_OWNER; |
8921 | } else { |
8922 | vcp->flags &= ~_VAC_IS_OWNER; |
8923 | } |
8924 | } |
8925 | return result; |
8926 | } |
8927 | |
8928 | |
8929 | /* |
8930 | * vauth_file_ingroup |
8931 | * |
 * Description:	Ask if a user is a member of the group owning the file
8933 | * |
8934 | * Parameters: vcp The vnode authorization context that |
8935 | * contains the user and directory info |
8936 | * vcp->flags_valid Valid flags |
8937 | * vcp->flags Flags values |
8938 | * vcp->vap File vnode attributes |
8939 | * vcp->ctx VFS Context (for user) |
8940 | * ismember pointer to where to put the answer |
8941 | * idontknow Return this if we can't get an answer |
8942 | * |
8943 | * Returns: 0 Success |
8944 | * vauth_node_group:? Error from vauth_node_group() |
8945 | * |
8946 | * Implicit returns: *ismember 0 The user is not a group member |
8947 | * 1 The user is a group member |
8948 | */ |
8949 | static int |
8950 | vauth_file_ingroup(vauth_ctx vcp, int *ismember, int idontknow) |
8951 | { |
8952 | int error; |
8953 | |
8954 | /* Check for a cached answer first, to avoid the check if possible */ |
8955 | if (vcp->flags_valid & _VAC_IN_GROUP) { |
8956 | *ismember = (vcp->flags & _VAC_IN_GROUP) ? 1 : 0; |
8957 | error = 0; |
8958 | } else { |
8959 | /* Otherwise, go look for it */ |
		error = vauth_node_group(vcp->vap, vcp->ctx->vc_ucred, ismember, idontknow);
8961 | |
8962 | if (!error) { |
8963 | /* cache our result */ |
8964 | vcp->flags_valid |= _VAC_IN_GROUP; |
8965 | if (*ismember) { |
8966 | vcp->flags |= _VAC_IN_GROUP; |
8967 | } else { |
8968 | vcp->flags &= ~_VAC_IN_GROUP; |
8969 | } |
8970 | } |
8971 | } |
8972 | return error; |
8973 | } |
8974 | |
8975 | static int |
8976 | vauth_dir_owner(vauth_ctx vcp) |
8977 | { |
8978 | int result; |
8979 | |
8980 | if (vcp->flags_valid & _VAC_IS_DIR_OWNER) { |
8981 | result = (vcp->flags & _VAC_IS_DIR_OWNER) ? 1 : 0; |
8982 | } else { |
		result = vauth_node_owner(vcp->dvap, vcp->ctx->vc_ucred);
8984 | |
8985 | /* cache our result */ |
8986 | vcp->flags_valid |= _VAC_IS_DIR_OWNER; |
8987 | if (result) { |
8988 | vcp->flags |= _VAC_IS_DIR_OWNER; |
8989 | } else { |
8990 | vcp->flags &= ~_VAC_IS_DIR_OWNER; |
8991 | } |
8992 | } |
8993 | return result; |
8994 | } |
8995 | |
8996 | /* |
8997 | * vauth_dir_ingroup |
8998 | * |
8999 | * Description: Ask if a user is a member of the group owning the directory |
9000 | * |
9001 | * Parameters: vcp The vnode authorization context that |
9002 | * contains the user and directory info |
9003 | * vcp->flags_valid Valid flags |
9004 | * vcp->flags Flags values |
9005 | * vcp->dvap Dir vnode attributes |
9006 | * vcp->ctx VFS Context (for user) |
9007 | * ismember pointer to where to put the answer |
9008 | * idontknow Return this if we can't get an answer |
9009 | * |
9010 | * Returns: 0 Success |
9011 | * vauth_node_group:? Error from vauth_node_group() |
9012 | * |
9013 | * Implicit returns: *ismember 0 The user is not a group member |
9014 | * 1 The user is a group member |
9015 | */ |
9016 | static int |
9017 | vauth_dir_ingroup(vauth_ctx vcp, int *ismember, int idontknow) |
9018 | { |
9019 | int error; |
9020 | |
9021 | /* Check for a cached answer first, to avoid the check if possible */ |
9022 | if (vcp->flags_valid & _VAC_IN_DIR_GROUP) { |
9023 | *ismember = (vcp->flags & _VAC_IN_DIR_GROUP) ? 1 : 0; |
9024 | error = 0; |
9025 | } else { |
9026 | /* Otherwise, go look for it */ |
		error = vauth_node_group(vcp->dvap, vcp->ctx->vc_ucred, ismember, idontknow);
9028 | |
9029 | if (!error) { |
9030 | /* cache our result */ |
9031 | vcp->flags_valid |= _VAC_IN_DIR_GROUP; |
9032 | if (*ismember) { |
9033 | vcp->flags |= _VAC_IN_DIR_GROUP; |
9034 | } else { |
9035 | vcp->flags &= ~_VAC_IN_DIR_GROUP; |
9036 | } |
9037 | } |
9038 | } |
9039 | return error; |
9040 | } |
9041 | |
9042 | /* |
9043 | * Test the posix permissions in (vap) to determine whether (credential) |
9044 | * may perform (action) |
9045 | */ |
9046 | static int |
9047 | vnode_authorize_posix(vauth_ctx vcp, int action, int on_dir) |
9048 | { |
9049 | struct vnode_attr *vap; |
9050 | int needed, error, owner_ok, group_ok, world_ok, ismember; |
9051 | #ifdef KAUTH_DEBUG_ENABLE |
	const char *where = "uninitialized";
9053 | # define _SETWHERE(c) where = c; |
9054 | #else |
9055 | # define _SETWHERE(c) |
9056 | #endif |
9057 | |
9058 | /* checking file or directory? */ |
9059 | if (on_dir) { |
9060 | vap = vcp->dvap; |
9061 | } else { |
9062 | vap = vcp->vap; |
9063 | } |
9064 | |
9065 | error = 0; |
9066 | |
9067 | /* |
9068 | * We want to do as little work here as possible. So first we check |
9069 | * which sets of permissions grant us the access we need, and avoid checking |
9070 | * whether specific permissions grant access when more generic ones would. |
9071 | */ |
9072 | |
9073 | /* owner permissions */ |
9074 | needed = 0; |
9075 | if (action & VREAD) { |
9076 | needed |= S_IRUSR; |
9077 | } |
9078 | if (action & VWRITE) { |
9079 | needed |= S_IWUSR; |
9080 | } |
9081 | if (action & VEXEC) { |
9082 | needed |= S_IXUSR; |
9083 | } |
9084 | owner_ok = (needed & vap->va_mode) == needed; |
9085 | |
9086 | /* |
	 * Processes with the appropriate entitlement can mark themselves as
	 * ignoring file/directory permissions if they own the object.
9089 | */ |
	if (!owner_ok && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) {
9091 | owner_ok = 1; |
9092 | } |
9093 | |
9094 | /* group permissions */ |
9095 | needed = 0; |
9096 | if (action & VREAD) { |
9097 | needed |= S_IRGRP; |
9098 | } |
9099 | if (action & VWRITE) { |
9100 | needed |= S_IWGRP; |
9101 | } |
9102 | if (action & VEXEC) { |
9103 | needed |= S_IXGRP; |
9104 | } |
9105 | group_ok = (needed & vap->va_mode) == needed; |
9106 | |
9107 | /* world permissions */ |
9108 | needed = 0; |
9109 | if (action & VREAD) { |
9110 | needed |= S_IROTH; |
9111 | } |
9112 | if (action & VWRITE) { |
9113 | needed |= S_IWOTH; |
9114 | } |
9115 | if (action & VEXEC) { |
9116 | needed |= S_IXOTH; |
9117 | } |
9118 | world_ok = (needed & vap->va_mode) == needed; |
9119 | |
9120 | /* If granted/denied by all three, we're done */ |
9121 | if (owner_ok && group_ok && world_ok) { |
9122 | _SETWHERE("all" ); |
9123 | goto out; |
9124 | } |
9125 | |
9126 | if (!owner_ok && !group_ok && !world_ok) { |
9127 | _SETWHERE("all" ); |
9128 | error = EACCES; |
9129 | goto out; |
9130 | } |
9131 | |
9132 | /* Check ownership (relatively cheap) */ |
9133 | if ((on_dir && vauth_dir_owner(vcp)) || |
9134 | (!on_dir && vauth_file_owner(vcp))) { |
9135 | _SETWHERE("user" ); |
9136 | if (!owner_ok) { |
9137 | error = EACCES; |
9138 | } |
9139 | goto out; |
9140 | } |
9141 | |
9142 | /* Not owner; if group and world both grant it we're done */ |
9143 | if (group_ok && world_ok) { |
9144 | _SETWHERE("group/world" ); |
9145 | goto out; |
9146 | } |
9147 | if (!group_ok && !world_ok) { |
9148 | _SETWHERE("group/world" ); |
9149 | error = EACCES; |
9150 | goto out; |
9151 | } |
9152 | |
9153 | /* Check group membership (most expensive) */ |
9154 | ismember = 0; /* Default to allow, if the target has no group owner */ |
9155 | |
9156 | /* |
9157 | * In the case we can't get an answer about the user from the call to |
9158 | * vauth_dir_ingroup() or vauth_file_ingroup(), we want to fail on |
9159 | * the side of caution, rather than simply granting access, or we will |
9160 | * fail to correctly implement exclusion groups, so we set the third |
9161 | * parameter on the basis of the state of 'group_ok'. |
9162 | */ |
	if (on_dir) {
		error = vauth_dir_ingroup(vcp, &ismember, (!group_ok ? EACCES : 0));
	} else {
		error = vauth_file_ingroup(vcp, &ismember, (!group_ok ? EACCES : 0));
	}
9168 | if (error) { |
9169 | if (!group_ok) { |
9170 | ismember = 1; |
9171 | } |
9172 | error = 0; |
9173 | } |
9174 | if (ismember) { |
9175 | _SETWHERE("group" ); |
9176 | if (!group_ok) { |
9177 | error = EACCES; |
9178 | } |
9179 | goto out; |
9180 | } |
9181 | |
9182 | /* Not owner, not in group, use world result */ |
9183 | _SETWHERE("world" ); |
9184 | if (!world_ok) { |
9185 | error = EACCES; |
9186 | } |
9187 | |
9188 | /* FALLTHROUGH */ |
9189 | |
9190 | out: |
9191 | KAUTH_DEBUG("%p %s - posix %s permissions : need %s%s%s %x have %s%s%s%s%s%s%s%s%s UID = %d file = %d,%d" , |
9192 | vcp->vp, (error == 0) ? "ALLOWED" : "DENIED" , where, |
9193 | (action & VREAD) ? "r" : "-" , |
9194 | (action & VWRITE) ? "w" : "-" , |
9195 | (action & VEXEC) ? "x" : "-" , |
9196 | needed, |
9197 | (vap->va_mode & S_IRUSR) ? "r" : "-" , |
9198 | (vap->va_mode & S_IWUSR) ? "w" : "-" , |
9199 | (vap->va_mode & S_IXUSR) ? "x" : "-" , |
9200 | (vap->va_mode & S_IRGRP) ? "r" : "-" , |
9201 | (vap->va_mode & S_IWGRP) ? "w" : "-" , |
9202 | (vap->va_mode & S_IXGRP) ? "x" : "-" , |
9203 | (vap->va_mode & S_IROTH) ? "r" : "-" , |
9204 | (vap->va_mode & S_IWOTH) ? "w" : "-" , |
9205 | (vap->va_mode & S_IXOTH) ? "x" : "-" , |
9206 | kauth_cred_getuid(vcp->ctx->vc_ucred), |
9207 | on_dir ? vcp->dvap->va_uid : vcp->vap->va_uid, |
9208 | on_dir ? vcp->dvap->va_gid : vcp->vap->va_gid); |
9209 | return error; |
9210 | } |
9211 | |
9212 | /* |
9213 | * Authorize the deletion of the node vp from the directory dvp. |
9214 | * |
9215 | * We assume that: |
9216 | * - Neither the node nor the directory are immutable. |
9217 | * - The user is not the superuser. |
9218 | * |
9219 | * The precedence of factors for authorizing or denying delete for a credential |
9220 | * |
9221 | * 1) Explicit ACE on the node. (allow or deny DELETE) |
9222 | * 2) Explicit ACE on the directory (allow or deny DELETE_CHILD). |
9223 | * |
9224 | * If there are conflicting ACEs on the node and the directory, the node |
9225 | * ACE wins. |
9226 | * |
9227 | * 3) Sticky bit on the directory. |
9228 | * Deletion is not permitted if the directory is sticky and the caller is |
9229 | * not owner of the node or directory. The sticky bit rules are like a deny |
9230 | * delete ACE except lower in priority than ACL's either allowing or denying |
9231 | * delete. |
9232 | * |
 * 4) POSIX permissions on the directory.
9234 | * |
9235 | * As an optimization, we cache whether or not delete child is permitted |
9236 | * on directories. This enables us to skip directory ACL and POSIX checks |
9237 | * as we already have the result from those checks. However, we always check the |
9238 | * node ACL and, if the directory has the sticky bit set, we always check its |
9239 | * ACL (even for a directory with an authorized delete child). Furthermore, |
9240 | * caching the delete child authorization is independent of the sticky bit |
9241 | * being set as it is only applicable in determining whether the node can be |
9242 | * deleted or not. |
9243 | */ |
9244 | static int |
9245 | vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) |
9246 | { |
9247 | struct vnode_attr *vap = vcp->vap; |
9248 | struct vnode_attr *dvap = vcp->dvap; |
9249 | kauth_cred_t cred = vcp->ctx->vc_ucred; |
9250 | struct kauth_acl_eval eval; |
9251 | int error, ismember; |
9252 | |
9253 | /* Check the ACL on the node first */ |
9254 | if (VATTR_IS_NOT(vap, va_acl, NULL)) { |
9255 | eval.ae_requested = KAUTH_VNODE_DELETE; |
9256 | eval.ae_acl = &vap->va_acl->acl_ace[0]; |
9257 | eval.ae_count = vap->va_acl->acl_entrycount; |
9258 | eval.ae_options = 0; |
9259 | if (vauth_file_owner(vcp)) { |
9260 | eval.ae_options |= KAUTH_AEVAL_IS_OWNER; |
9261 | } |
9262 | /* |
9263 | * We use ENOENT as a marker to indicate we could not get |
9264 | * information in order to delay evaluation until after we |
9265 | * have the ACL evaluation answer. Previously, we would |
9266 | * always deny the operation at this point. |
9267 | */ |
		if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) {
9269 | return error; |
9270 | } |
9271 | if (error == ENOENT) { |
9272 | eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN; |
9273 | } else if (ismember) { |
9274 | eval.ae_options |= KAUTH_AEVAL_IN_GROUP; |
9275 | } |
9276 | eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS; |
9277 | eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS; |
9278 | eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS; |
9279 | eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS; |
9280 | |
		if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
			KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error);
			return error;
9284 | } |
9285 | |
		switch (eval.ae_result) {
		case KAUTH_RESULT_DENY:
			if (vauth_file_owner(vcp) && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) {
				KAUTH_DEBUG("%p Override DENY due to entitlement", vcp->vp);
				return 0;
			}
			KAUTH_DEBUG("%p DENIED - denied by ACL", vcp->vp);
			return EACCES;
		case KAUTH_RESULT_ALLOW:
			KAUTH_DEBUG("%p ALLOWED - granted by ACL", vcp->vp);
			return 0;
		case KAUTH_RESULT_DEFER:
		default:
			/* Defer to directory */
			KAUTH_DEBUG("%p DEFERRED - by file ACL", vcp->vp);
			break;
		}
9303 | } |
9304 | |
9305 | /* |
9306 | * Without a sticky bit, a previously authorized delete child is |
9307 | * sufficient to authorize this delete. |
9308 | * |
9309 | * If the sticky bit is set, a directory ACL which allows delete child |
9310 | * overrides a (potential) sticky bit deny. The authorized delete child |
9311 | * cannot tell us if it was authorized because of an explicit delete |
 * child allow ACE or because of POSIX permissions, so we have to check
 * the directory ACL every time if the directory has the sticky bit set.
9314 | */ |
9315 | if (!(dvap->va_mode & S_ISTXT) && cached_delete_child) { |
9316 | KAUTH_DEBUG("%p ALLOWED - granted by directory ACL or POSIX permissions and no sticky bit on directory" , vcp->vp); |
9317 | return 0; |
9318 | } |
9319 | |
9320 | /* check the ACL on the directory */ |
9321 | if (VATTR_IS_NOT(dvap, va_acl, NULL)) { |
9322 | eval.ae_requested = KAUTH_VNODE_DELETE_CHILD; |
9323 | eval.ae_acl = &dvap->va_acl->acl_ace[0]; |
9324 | eval.ae_count = dvap->va_acl->acl_entrycount; |
9325 | eval.ae_options = 0; |
9326 | if (vauth_dir_owner(vcp)) { |
9327 | eval.ae_options |= KAUTH_AEVAL_IS_OWNER; |
9328 | } |
9329 | /* |
9330 | * We use ENOENT as a marker to indicate we could not get |
9331 | * information in order to delay evaluation until after we |
9332 | * have the ACL evaluation answer. Previously, we would |
9333 | * always deny the operation at this point. |
9334 | */ |
		if ((error = vauth_dir_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) {
9336 | return error; |
9337 | } |
9338 | if (error == ENOENT) { |
9339 | eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN; |
9340 | } else if (ismember) { |
9341 | eval.ae_options |= KAUTH_AEVAL_IN_GROUP; |
9342 | } |
9343 | eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS; |
9344 | eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS; |
9345 | eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS; |
9346 | eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS; |
9347 | |
9348 | /* |
9349 | * If there is no entry, we are going to defer to other |
9350 | * authorization mechanisms. |
9351 | */ |
		error = kauth_acl_evaluate(cred, &eval);

		if (error != 0) {
			KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error);
9356 | return error; |
9357 | } |
		switch (eval.ae_result) {
		case KAUTH_RESULT_DENY:
			if (vauth_dir_owner(vcp) && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) {
				KAUTH_DEBUG("%p Override DENY due to entitlement", vcp->vp);
				return 0;
			}
			KAUTH_DEBUG("%p DENIED - denied by directory ACL", vcp->vp);
			return EACCES;
		case KAUTH_RESULT_ALLOW:
			KAUTH_DEBUG("%p ALLOWED - granted by directory ACL", vcp->vp);
			if (!cached_delete_child && vcp->dvp) {
				vnode_cache_authorized_action(vcp->dvp,
				    vcp->ctx, KAUTH_VNODE_DELETE_CHILD);
			}
			return 0;
		case KAUTH_RESULT_DEFER:
		default:
			/* Deferred by directory ACL */
			KAUTH_DEBUG("%p DEFERRED - directory ACL", vcp->vp);
			break;
		}
9379 | } |
9380 | |
9381 | /* |
9382 | * From this point, we can't explicitly allow and if we reach the end |
9383 | * of the function without a denial, then the delete is authorized. |
9384 | */ |
9385 | if (!cached_delete_child) { |
		if (vnode_authorize_posix(vcp, VWRITE, 1 /* on_dir */) != 0) {
			KAUTH_DEBUG("%p DENIED - denied by posix permissions", vcp->vp);
9388 | return EACCES; |
9389 | } |
9390 | /* |
9391 | * Cache the authorized action on the vnode if allowed by the |
9392 | * directory ACL or POSIX permissions. It is correct to cache |
9393 | * this action even if sticky bit would deny deleting the node. |
9394 | */ |
9395 | if (vcp->dvp) { |
			vnode_cache_authorized_action(vcp->dvp, vcp->ctx,
			    KAUTH_VNODE_DELETE_CHILD);
9398 | } |
9399 | } |
9400 | |
9401 | /* enforce sticky bit behaviour */ |
9402 | if ((dvap->va_mode & S_ISTXT) && !vauth_file_owner(vcp) && !vauth_dir_owner(vcp)) { |
9403 | KAUTH_DEBUG("%p DENIED - sticky bit rules (user %d file %d dir %d)" , |
9404 | vcp->vp, cred->cr_posix.cr_uid, vap->va_uid, dvap->va_uid); |
9405 | return EACCES; |
9406 | } |
9407 | |
9408 | /* not denied, must be OK */ |
9409 | return 0; |
9410 | } |
9411 | |
9412 | |
9413 | /* |
9414 | * Authorize an operation based on the node's attributes. |
9415 | */ |
9416 | static int |
9417 | vnode_authorize_simple(vauth_ctx vcp, kauth_ace_rights_t acl_rights, kauth_ace_rights_t preauth_rights, boolean_t *found_deny) |
9418 | { |
9419 | struct vnode_attr *vap = vcp->vap; |
9420 | kauth_cred_t cred = vcp->ctx->vc_ucred; |
9421 | struct kauth_acl_eval eval; |
9422 | int error, ismember; |
9423 | mode_t posix_action; |
9424 | |
9425 | /* |
9426 | * If we are the file owner, we automatically have some rights. |
9427 | * |
9428 | * Do we need to expand this to support group ownership? |
9429 | */ |
9430 | if (vauth_file_owner(vcp)) { |
9431 | acl_rights &= ~(KAUTH_VNODE_WRITE_SECURITY); |
9432 | } |
9433 | |
9434 | /* |
9435 | * If we are checking both TAKE_OWNERSHIP and WRITE_SECURITY, we can |
9436 | * mask the latter. If TAKE_OWNERSHIP is requested the caller is about to |
9437 | * change ownership to themselves, and WRITE_SECURITY is implicitly |
9438 | * granted to the owner. We need to do this because at this point |
9439 | * WRITE_SECURITY may not be granted as the caller is not currently |
9440 | * the owner. |
9441 | */ |
9442 | if ((acl_rights & KAUTH_VNODE_TAKE_OWNERSHIP) && |
9443 | (acl_rights & KAUTH_VNODE_WRITE_SECURITY)) { |
9444 | acl_rights &= ~KAUTH_VNODE_WRITE_SECURITY; |
9445 | } |
9446 | |
9447 | if (acl_rights == 0) { |
9448 | KAUTH_DEBUG("%p ALLOWED - implicit or no rights required" , vcp->vp); |
9449 | return 0; |
9450 | } |
9451 | |
9452 | /* if we have an ACL, evaluate it */ |
9453 | if (VATTR_IS_NOT(vap, va_acl, NULL)) { |
9454 | eval.ae_requested = acl_rights; |
9455 | eval.ae_acl = &vap->va_acl->acl_ace[0]; |
9456 | eval.ae_count = vap->va_acl->acl_entrycount; |
9457 | eval.ae_options = 0; |
9458 | if (vauth_file_owner(vcp)) { |
9459 | eval.ae_options |= KAUTH_AEVAL_IS_OWNER; |
9460 | } |
9461 | /* |
9462 | * We use ENOENT as a marker to indicate we could not get |
9463 | * information in order to delay evaluation until after we |
9464 | * have the ACL evaluation answer. Previously, we would |
9465 | * always deny the operation at this point. |
9466 | */ |
		if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) {
9468 | return error; |
9469 | } |
9470 | if (error == ENOENT) { |
9471 | eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN; |
9472 | } else if (ismember) { |
9473 | eval.ae_options |= KAUTH_AEVAL_IN_GROUP; |
9474 | } |
9475 | eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS; |
9476 | eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS; |
9477 | eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS; |
9478 | eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS; |
9479 | |
		if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
			KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error);
			return error;
9483 | } |
9484 | |
		switch (eval.ae_result) {
		case KAUTH_RESULT_DENY:
			if (vauth_file_owner(vcp) && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) {
				KAUTH_DEBUG("%p Override DENY due to entitlement", vcp->vp);
				return 0;
			}
			KAUTH_DEBUG("%p DENIED - by ACL", vcp->vp);
			return EACCES; /* deny, deny, counter-allege */
		case KAUTH_RESULT_ALLOW:
			KAUTH_DEBUG("%p ALLOWED - all rights granted by ACL", vcp->vp);
			return 0;
		case KAUTH_RESULT_DEFER:
		default:
			/* Effectively the same as !delete_child_denied */
			KAUTH_DEBUG("%p DEFERRED - directory ACL", vcp->vp);
			break;
		}
9502 | |
9503 | *found_deny = eval.ae_found_deny; |
9504 | |
9505 | /* fall through and evaluate residual rights */ |
9506 | } else { |
9507 | /* no ACL, everything is residual */ |
9508 | eval.ae_residual = acl_rights; |
9509 | } |
9510 | |
9511 | /* |
9512 | * Grant residual rights that have been pre-authorized. |
9513 | */ |
9514 | eval.ae_residual &= ~preauth_rights; |
9515 | |
9516 | /* |
9517 | * We grant WRITE_ATTRIBUTES to the owner if it hasn't been denied. |
9518 | */ |
9519 | if (vauth_file_owner(vcp)) { |
9520 | eval.ae_residual &= ~KAUTH_VNODE_WRITE_ATTRIBUTES; |
9521 | } |
9522 | |
9523 | if (eval.ae_residual == 0) { |
9524 | KAUTH_DEBUG("%p ALLOWED - rights already authorized" , vcp->vp); |
9525 | return 0; |
9526 | } |
9527 | |
9528 | /* |
9529 | * Bail if we have residual rights that can't be granted by posix permissions, |
9530 | * or aren't presumed granted at this point. |
9531 | * |
9532 | * XXX these can be collapsed for performance |
9533 | */ |
	if (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER) {
		KAUTH_DEBUG("%p DENIED - CHANGE_OWNER not permitted", vcp->vp);
		return EACCES;
	}
	if (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY) {
		KAUTH_DEBUG("%p DENIED - WRITE_SECURITY not permitted", vcp->vp);
		return EACCES;
	}
9542 | |
9543 | #if DIAGNOSTIC |
9544 | if (eval.ae_residual & KAUTH_VNODE_DELETE) { |
9545 | panic("vnode_authorize: can't be checking delete permission here" ); |
9546 | } |
9547 | #endif |
9548 | |
9549 | /* |
9550 | * Compute the fallback posix permissions that will satisfy the remaining |
9551 | * rights. |
9552 | */ |
9553 | posix_action = 0; |
9554 | if (eval.ae_residual & (KAUTH_VNODE_READ_DATA | |
9555 | KAUTH_VNODE_LIST_DIRECTORY | |
9556 | KAUTH_VNODE_READ_EXTATTRIBUTES)) { |
9557 | posix_action |= VREAD; |
9558 | } |
9559 | if (eval.ae_residual & (KAUTH_VNODE_WRITE_DATA | |
9560 | KAUTH_VNODE_ADD_FILE | |
9561 | KAUTH_VNODE_ADD_SUBDIRECTORY | |
9562 | KAUTH_VNODE_DELETE_CHILD | |
9563 | KAUTH_VNODE_WRITE_ATTRIBUTES | |
9564 | KAUTH_VNODE_WRITE_EXTATTRIBUTES)) { |
9565 | posix_action |= VWRITE; |
9566 | } |
9567 | if (eval.ae_residual & (KAUTH_VNODE_EXECUTE | |
9568 | KAUTH_VNODE_SEARCH)) { |
9569 | posix_action |= VEXEC; |
9570 | } |
9571 | |
9572 | if (posix_action != 0) { |
		return vnode_authorize_posix(vcp, posix_action, 0 /* !on_dir */);
9574 | } else { |
9575 | KAUTH_DEBUG("%p ALLOWED - residual rights %s%s%s%s%s%s%s%s%s%s%s%s%s%s granted due to no posix mapping" , |
9576 | vcp->vp, |
9577 | (eval.ae_residual & KAUTH_VNODE_READ_DATA) |
9578 | ? vnode_isdir(vcp->vp) ? " LIST_DIRECTORY" : " READ_DATA" : "" , |
9579 | (eval.ae_residual & KAUTH_VNODE_WRITE_DATA) |
9580 | ? vnode_isdir(vcp->vp) ? " ADD_FILE" : " WRITE_DATA" : "" , |
9581 | (eval.ae_residual & KAUTH_VNODE_EXECUTE) |
9582 | ? vnode_isdir(vcp->vp) ? " SEARCH" : " EXECUTE" : "" , |
9583 | (eval.ae_residual & KAUTH_VNODE_DELETE) |
9584 | ? " DELETE" : "" , |
9585 | (eval.ae_residual & KAUTH_VNODE_APPEND_DATA) |
9586 | ? vnode_isdir(vcp->vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "" , |
9587 | (eval.ae_residual & KAUTH_VNODE_DELETE_CHILD) |
9588 | ? " DELETE_CHILD" : "" , |
9589 | (eval.ae_residual & KAUTH_VNODE_READ_ATTRIBUTES) |
9590 | ? " READ_ATTRIBUTES" : "" , |
9591 | (eval.ae_residual & KAUTH_VNODE_WRITE_ATTRIBUTES) |
9592 | ? " WRITE_ATTRIBUTES" : "" , |
9593 | (eval.ae_residual & KAUTH_VNODE_READ_EXTATTRIBUTES) |
9594 | ? " READ_EXTATTRIBUTES" : "" , |
9595 | (eval.ae_residual & KAUTH_VNODE_WRITE_EXTATTRIBUTES) |
9596 | ? " WRITE_EXTATTRIBUTES" : "" , |
9597 | (eval.ae_residual & KAUTH_VNODE_READ_SECURITY) |
9598 | ? " READ_SECURITY" : "" , |
9599 | (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY) |
9600 | ? " WRITE_SECURITY" : "" , |
9601 | (eval.ae_residual & KAUTH_VNODE_CHECKIMMUTABLE) |
9602 | ? " CHECKIMMUTABLE" : "" , |
9603 | (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER) |
9604 | ? " CHANGE_OWNER" : "" ); |
9605 | } |
9606 | |
9607 | /* |
9608 | * Lack of required Posix permissions implies no reason to deny access. |
9609 | */ |
9610 | return 0; |
9611 | } |
9612 | |
9613 | /* |
9614 | * Check for file immutability. |
9615 | */ |
9616 | static int |
9617 | vnode_authorize_checkimmutable(mount_t mp, vauth_ctx vcp, |
9618 | struct vnode_attr *vap, int rights, int ignore) |
9619 | { |
9620 | int error; |
9621 | int append; |
9622 | |
9623 | /* |
9624 | * Perform immutability checks for operations that change data. |
9625 | * |
9626 | * Sockets, fifos and devices require special handling. |
9627 | */ |
9628 | switch (vap->va_type) { |
9629 | case VSOCK: |
9630 | case VFIFO: |
9631 | case VBLK: |
9632 | case VCHR: |
9633 | /* |
9634 | * Writing to these nodes does not change the filesystem data, |
9635 | * so forget that it's being tried. |
9636 | */ |
9637 | rights &= ~KAUTH_VNODE_WRITE_DATA; |
9638 | break; |
9639 | default: |
9640 | break; |
9641 | } |
9642 | |
9643 | error = 0; |
9644 | if (rights & KAUTH_VNODE_WRITE_RIGHTS) { |
9645 | /* check per-filesystem options if possible */ |
9646 | if (mp != NULL) { |
9647 | /* check for no-EA filesystems */ |
9648 | if ((rights & KAUTH_VNODE_WRITE_EXTATTRIBUTES) && |
9649 | (vfs_flags(mp) & MNT_NOUSERXATTR)) { |
9650 | KAUTH_DEBUG("%p DENIED - filesystem disallowed extended attributes" , vap); |
9651 | error = EACCES; /* User attributes disabled */ |
9652 | goto out; |
9653 | } |
9654 | } |
9655 | |
9656 | /* |
9657 | * check for file immutability. first, check if the requested rights are |
9658 | * allowable for a UF_APPEND file. |
9659 | */ |
9660 | append = 0; |
9661 | if (vap->va_type == VDIR) { |
9662 | if ((rights & (KAUTH_VNODE_ADD_FILE | KAUTH_VNODE_ADD_SUBDIRECTORY | KAUTH_VNODE_WRITE_EXTATTRIBUTES | ~KAUTH_VNODE_WRITE_RIGHTS)) == rights) { |
9663 | append = 1; |
9664 | } |
9665 | } else { |
9666 | if ((rights & (KAUTH_VNODE_APPEND_DATA | KAUTH_VNODE_WRITE_EXTATTRIBUTES | ~KAUTH_VNODE_WRITE_RIGHTS)) == rights) { |
9667 | append = 1; |
9668 | } |
9669 | } |
9670 | if ((error = vnode_immutable(vap, append, ignore)) != 0) { |
9671 | if (error && !ignore) { |
9672 | /* |
9673 | * In case of a rename, we want to check ownership for dvp as well. |
9674 | */ |
9675 | int owner = 0; |
9676 | if (rights & KAUTH_VNODE_DELETE_CHILD && vcp->dvp != NULL) { |
9677 | owner = vauth_file_owner(vcp) && vauth_dir_owner(vcp); |
9678 | } else { |
9679 | owner = vauth_file_owner(vcp); |
9680 | } |
			if (owner && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) {
				error = vnode_immutable(vap, append, 1);
9683 | } |
9684 | } |
9685 | } |
9686 | if (error) { |
9687 | KAUTH_DEBUG("%p DENIED - file is immutable" , vap); |
9688 | goto out; |
9689 | } |
9690 | } |
9691 | out: |
9692 | return error; |
9693 | } |
9694 | |
9695 | /* |
9696 | * Handle authorization actions for filesystems that advertise that the |
9697 | * server will be enforcing. |
9698 | * |
9699 | * Returns: 0 Authorization should be handled locally |
9700 | * 1 Authorization was handled by the FS |
9701 | * |
9702 | * Note: Imputed returns will only occur if the authorization request |
9703 | * was handled by the FS. |
9704 | * |
9705 | * Imputed: *resultp, modified Return code from FS when the request is |
9706 | * handled by the FS. |
9707 | * VNOP_ACCESS:??? |
9708 | * VNOP_OPEN:??? |
9709 | */ |
9710 | static int |
9711 | vnode_authorize_opaque(vnode_t vp, int *resultp, kauth_action_t action, vfs_context_t ctx) |
9712 | { |
9713 | int error; |
9714 | |
9715 | /* |
9716 | * If the vp is a device node, socket or FIFO it actually represents a local |
9717 | * endpoint, so we need to handle it locally. |
9718 | */ |
9719 | switch (vp->v_type) { |
9720 | case VBLK: |
9721 | case VCHR: |
9722 | case VSOCK: |
9723 | case VFIFO: |
9724 | return 0; |
9725 | default: |
9726 | break; |
9727 | } |
9728 | |
9729 | /* |
9730 | * In the advisory request case, if the filesystem doesn't think it's reliable |
9731 | * we will attempt to formulate a result ourselves based on VNOP_GETATTR data. |
9732 | */ |
	if ((action & KAUTH_VNODE_ACCESS) && !vfs_authopaqueaccess(vp->v_mount)) {
9734 | return 0; |
9735 | } |
9736 | |
9737 | /* |
	 * Let the filesystem have a say in the matter.  It's OK for it to not implement
	 * VNOP_ACCESS, as most will authorise inline with the actual request.
9740 | */ |
9741 | if ((error = VNOP_ACCESS(vp, action, ctx)) != ENOTSUP) { |
9742 | *resultp = error; |
9743 | KAUTH_DEBUG("%p DENIED - opaque filesystem VNOP_ACCESS denied access" , vp); |
9744 | return 1; |
9745 | } |
9746 | |
9747 | /* |
9748 | * Typically opaque filesystems do authorisation in-line, but exec is a special case. In |
9749 | * order to be reasonably sure that exec will be permitted, we try a bit harder here. |
9750 | */ |
9751 | if ((action & KAUTH_VNODE_EXECUTE) && (vp->v_type == VREG)) { |
9752 | /* try a VNOP_OPEN for readonly access */ |
9753 | if ((error = VNOP_OPEN(vp, FREAD, ctx)) != 0) { |
9754 | *resultp = error; |
9755 | KAUTH_DEBUG("%p DENIED - EXECUTE denied because file could not be opened readonly" , vp); |
9756 | return 1; |
9757 | } |
9758 | VNOP_CLOSE(vp, FREAD, ctx); |
9759 | } |
9760 | |
9761 | /* |
9762 | * We don't have any reason to believe that the request has to be denied at this point, |
9763 | * so go ahead and allow it. |
9764 | */ |
9765 | *resultp = 0; |
9766 | KAUTH_DEBUG("%p ALLOWED - bypassing access check for non-local filesystem" , vp); |
9767 | return 1; |
9768 | } |
9769 | |
9770 | |
9771 | |
9772 | |
9773 | /* |
9774 | * Returns: KAUTH_RESULT_ALLOW |
9775 | * KAUTH_RESULT_DENY |
9776 | * |
9777 | * Imputed: *arg3, modified Error code in the deny case |
9778 | * EROFS Read-only file system |
9779 | * EACCES Permission denied |
9780 | * EPERM Operation not permitted [no execute] |
9781 | * vnode_getattr:ENOMEM Not enough space [only if has filesec] |
9782 | * vnode_getattr:??? |
9783 | * vnode_authorize_opaque:*arg2 ??? |
9784 | * vnode_authorize_checkimmutable:??? |
9785 | * vnode_authorize_delete:??? |
9786 | * vnode_authorize_simple:??? |
9787 | */ |
9788 | |
9789 | |
9790 | static int |
9791 | vnode_authorize_callback(__unused kauth_cred_t cred, __unused void *idata, |
9792 | kauth_action_t action, uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, |
9793 | uintptr_t arg3) |
9794 | { |
9795 | vfs_context_t ctx; |
9796 | vnode_t cvp = NULLVP; |
9797 | vnode_t vp, dvp; |
9798 | int result = KAUTH_RESULT_DENY; |
9799 | int parent_iocount = 0; |
	int parent_action = 0; /* In case we need to use namedstream's data fork for cached rights */
9801 | |
9802 | ctx = (vfs_context_t)arg0; |
9803 | vp = (vnode_t)arg1; |
9804 | dvp = (vnode_t)arg2; |
9805 | |
9806 | /* |
9807 | * if there are 2 vnodes passed in, we don't know at |
9808 | * this point which rights to look at based on the |
9809 | * combined action being passed in... defer until later... |
9810 | * otherwise check the kauth 'rights' cache hung |
9811 | * off of the vnode we're interested in... if we've already |
9812 | * been granted the right we're currently interested in, |
9813 | * we can just return success... otherwise we'll go through |
9814 | * the process of authorizing the requested right(s)... if that |
9815 | * succeeds, we'll add the right(s) to the cache. |
9816 | * VNOP_SETATTR and VNOP_SETXATTR will invalidate this cache |
9817 | */ |
9818 | if (dvp && vp) { |
9819 | goto defer; |
9820 | } |
9821 | if (dvp) { |
9822 | cvp = dvp; |
9823 | } else { |
9824 | /* |
9825 | * For named streams on local-authorization volumes, rights are cached on the parent; |
9826 | * authorization is determined by looking at the parent's properties anyway, so storing |
9827 | * on the parent means that we don't recompute for the named stream and that if |
9828 | * we need to flush rights (e.g. on VNOP_SETATTR()) we don't need to track down the |
9829 | * stream to flush its cache separately. If we miss in the cache, then we authorize |
9830 | * as if there were no cached rights (passing the named stream vnode and desired rights to |
9831 | * vnode_authorize_callback_int()). |
9832 | * |
9833 | * On an opaquely authorized volume, we don't know the relationship between the |
9834 | * data fork's properties and the rights granted on a stream. Thus, named stream vnodes |
9835 | * on such a volume are authorized directly (rather than using the parent) and have their |
9836 | * own caches. When a named stream vnode is created, we mark the parent as having a named |
9837 | * stream. On a VNOP_SETATTR() for the parent that may invalidate cached authorization, we |
9838 | * find the stream and flush its cache. |
9839 | */ |
		if (vnode_isnamedstream(vp) && (!vfs_authopaque(vp->v_mount))) {
9841 | cvp = vnode_getparent(vp); |
9842 | if (cvp != NULLVP) { |
9843 | parent_iocount = 1; |
9844 | } else { |
9845 | cvp = NULL; |
9846 | goto defer; /* If we can't use the parent, take the slow path */ |
9847 | } |
9848 | |
9849 | /* Have to translate some actions */ |
9850 | parent_action = action; |
9851 | if (parent_action & KAUTH_VNODE_READ_DATA) { |
9852 | parent_action &= ~KAUTH_VNODE_READ_DATA; |
9853 | parent_action |= KAUTH_VNODE_READ_EXTATTRIBUTES; |
9854 | } |
9855 | if (parent_action & KAUTH_VNODE_WRITE_DATA) { |
9856 | parent_action &= ~KAUTH_VNODE_WRITE_DATA; |
9857 | parent_action |= KAUTH_VNODE_WRITE_EXTATTRIBUTES; |
9858 | } |
9859 | } else { |
9860 | cvp = vp; |
9861 | } |
9862 | } |
9863 | |
	if (vnode_cache_is_authorized(cvp, ctx, parent_iocount ? parent_action : action) == TRUE) {
9865 | result = KAUTH_RESULT_ALLOW; |
9866 | goto out; |
9867 | } |
9868 | defer: |
	result = vnode_authorize_callback_int(action, ctx, vp, dvp, (int *)arg3);
9870 | |
9871 | if (result == KAUTH_RESULT_ALLOW && cvp != NULLVP) { |
9872 | KAUTH_DEBUG("%p - caching action = %x" , cvp, action); |
9873 | vnode_cache_authorized_action(vp: cvp, context: ctx, action); |
9874 | } |
9875 | |
9876 | out: |
9877 | if (parent_iocount) { |
		vnode_put(cvp);
9879 | } |
9880 | |
9881 | return result; |
9882 | } |
9883 | |
9884 | static int |
9885 | vnode_attr_authorize_internal(vauth_ctx vcp, mount_t mp, |
9886 | kauth_ace_rights_t rights, int is_suser, boolean_t *found_deny, |
9887 | int noimmutable, int parent_authorized_for_delete_child) |
9888 | { |
9889 | int result; |
9890 | |
9891 | /* |
9892 | * Check for immutability. |
9893 | * |
9894 | * In the deletion case, parent directory immutability vetoes specific |
9895 | * file rights. |
9896 | */ |
	if ((result = vnode_authorize_checkimmutable(mp, vcp, vcp->vap, rights,
	    noimmutable)) != 0) {
9899 | goto out; |
9900 | } |
9901 | |
9902 | if ((rights & KAUTH_VNODE_DELETE) && |
9903 | !parent_authorized_for_delete_child) { |
		result = vnode_authorize_checkimmutable(mp, vcp, vcp->dvap,
		    KAUTH_VNODE_DELETE_CHILD, 0);
9906 | if (result) { |
9907 | goto out; |
9908 | } |
9909 | } |
9910 | |
9911 | /* |
9912 | * Clear rights that have been authorized by reaching this point, bail if nothing left to |
9913 | * check. |
9914 | */ |
9915 | rights &= ~(KAUTH_VNODE_LINKTARGET | KAUTH_VNODE_CHECKIMMUTABLE); |
9916 | if (rights == 0) { |
9917 | goto out; |
9918 | } |
9919 | |
9920 | /* |
9921 | * If we're not the superuser, authorize based on file properties; |
9922 | * note that even if parent_authorized_for_delete_child is TRUE, we |
9923 | * need to check on the node itself. |
9924 | */ |
9925 | if (!is_suser) { |
9926 | /* process delete rights */ |
9927 | if ((rights & KAUTH_VNODE_DELETE) && |
		    ((result = vnode_authorize_delete(vcp, parent_authorized_for_delete_child)) != 0)) {
9929 | goto out; |
9930 | } |
9931 | |
9932 | /* process remaining rights */ |
9933 | if ((rights & ~KAUTH_VNODE_DELETE) && |
		    (result = vnode_authorize_simple(vcp, rights, rights & KAUTH_VNODE_DELETE, found_deny)) != 0) {
9935 | goto out; |
9936 | } |
9937 | } else { |
9938 | /* |
9939 | * Execute is only granted to root if one of the x bits is set. This check only |
9940 | * makes sense if the posix mode bits are actually supported. |
9941 | */ |
9942 | if ((rights & KAUTH_VNODE_EXECUTE) && |
9943 | (vcp->vap->va_type == VREG) && |
9944 | VATTR_IS_SUPPORTED(vcp->vap, va_mode) && |
9945 | !(vcp->vap->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) { |
9946 | result = EPERM; |
9947 | KAUTH_DEBUG("%p DENIED - root execute requires at least one x bit in 0x%x" , vcp, vcp->vap->va_mode); |
9948 | goto out; |
9949 | } |
9950 | |
9951 | /* Assume that there were DENYs so we don't wrongly cache KAUTH_VNODE_SEARCHBYANYONE */ |
9952 | *found_deny = TRUE; |
9953 | |
9954 | KAUTH_DEBUG("%p ALLOWED - caller is superuser" , vcp); |
9955 | } |
9956 | out: |
9957 | return result; |
9958 | } |
9959 | |
9960 | static int |
9961 | vnode_authorize_callback_int(kauth_action_t action, vfs_context_t ctx, |
9962 | vnode_t vp, vnode_t dvp, int *errorp) |
9963 | { |
9964 | struct _vnode_authorize_context auth_context; |
9965 | vauth_ctx vcp; |
9966 | kauth_cred_t cred; |
9967 | kauth_ace_rights_t rights; |
9968 | struct vnode_attr va, dva; |
9969 | int result; |
9970 | int noimmutable; |
9971 | boolean_t parent_authorized_for_delete_child = FALSE; |
9972 | boolean_t found_deny = FALSE; |
9973 | boolean_t parent_ref = FALSE; |
9974 | boolean_t is_suser = FALSE; |
9975 | |
9976 | vcp = &auth_context; |
9977 | vcp->ctx = ctx; |
9978 | vcp->vp = vp; |
9979 | vcp->dvp = dvp; |
9980 | /* |
9981 | * Note that we authorize against the context, not the passed cred |
9982 | * (the same thing anyway) |
9983 | */ |
9984 | cred = ctx->vc_ucred; |
9985 | |
9986 | VATTR_INIT(&va); |
9987 | vcp->vap = &va; |
9988 | VATTR_INIT(&dva); |
9989 | vcp->dvap = &dva; |
9990 | |
9991 | vcp->flags = vcp->flags_valid = 0; |
9992 | |
9993 | #if DIAGNOSTIC |
9994 | if ((ctx == NULL) || (vp == NULL) || (cred == NULL)) { |
9995 | panic("vnode_authorize: bad arguments (context %p vp %p cred %p)" , ctx, vp, cred); |
9996 | } |
9997 | #endif |
9998 | |
9999 | KAUTH_DEBUG("%p AUTH - %s %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s on %s '%s' (0x%x:%p/%p)" , |
10000 | vp, vfs_context_proc(ctx)->p_comm, |
10001 | (action & KAUTH_VNODE_ACCESS) ? "access" : "auth" , |
10002 | (action & KAUTH_VNODE_READ_DATA) ? vnode_isdir(vp) ? " LIST_DIRECTORY" : " READ_DATA" : "" , |
10003 | (action & KAUTH_VNODE_WRITE_DATA) ? vnode_isdir(vp) ? " ADD_FILE" : " WRITE_DATA" : "" , |
10004 | (action & KAUTH_VNODE_EXECUTE) ? vnode_isdir(vp) ? " SEARCH" : " EXECUTE" : "" , |
10005 | (action & KAUTH_VNODE_DELETE) ? " DELETE" : "" , |
10006 | (action & KAUTH_VNODE_APPEND_DATA) ? vnode_isdir(vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "" , |
10007 | (action & KAUTH_VNODE_DELETE_CHILD) ? " DELETE_CHILD" : "" , |
10008 | (action & KAUTH_VNODE_READ_ATTRIBUTES) ? " READ_ATTRIBUTES" : "" , |
10009 | (action & KAUTH_VNODE_WRITE_ATTRIBUTES) ? " WRITE_ATTRIBUTES" : "" , |
10010 | (action & KAUTH_VNODE_READ_EXTATTRIBUTES) ? " READ_EXTATTRIBUTES" : "" , |
10011 | (action & KAUTH_VNODE_WRITE_EXTATTRIBUTES) ? " WRITE_EXTATTRIBUTES" : "" , |
10012 | (action & KAUTH_VNODE_READ_SECURITY) ? " READ_SECURITY" : "" , |
10013 | (action & KAUTH_VNODE_WRITE_SECURITY) ? " WRITE_SECURITY" : "" , |
10014 | (action & KAUTH_VNODE_CHANGE_OWNER) ? " CHANGE_OWNER" : "" , |
10015 | (action & KAUTH_VNODE_NOIMMUTABLE) ? " (noimmutable)" : "" , |
10016 | vnode_isdir(vp) ? "directory" : "file" , |
10017 | vp->v_name ? vp->v_name : "<NULL>" , action, vp, dvp); |
10018 | |
10019 | /* |
10020 | * Extract the control bits from the action, everything else is |
10021 | * requested rights. |
10022 | */ |
10023 | noimmutable = (action & KAUTH_VNODE_NOIMMUTABLE) ? 1 : 0; |
10024 | rights = action & ~(KAUTH_VNODE_ACCESS | KAUTH_VNODE_NOIMMUTABLE); |
10025 | |
10026 | if (rights & KAUTH_VNODE_DELETE) { |
10027 | #if DIAGNOSTIC |
10028 | if (dvp == NULL) { |
10029 | panic("vnode_authorize: KAUTH_VNODE_DELETE test requires a directory" ); |
10030 | } |
10031 | #endif |
10032 | /* |
10033 | * check to see if we've already authorized the parent |
10034 | * directory for deletion of its children... if so, we |
10035 | * can skip a whole bunch of work... we will still have to |
10036 | * authorize that this specific child can be removed |
10037 | */ |
		if (vnode_cache_is_authorized(dvp, ctx, KAUTH_VNODE_DELETE_CHILD) == TRUE) {
10039 | parent_authorized_for_delete_child = TRUE; |
10040 | } |
10041 | } else { |
10042 | vcp->dvp = NULLVP; |
10043 | vcp->dvap = NULL; |
10044 | } |
10045 | |
10046 | /* |
10047 | * Check for read-only filesystems. |
10048 | */ |
10049 | if ((rights & KAUTH_VNODE_WRITE_RIGHTS) && |
10050 | (vp->v_mount->mnt_flag & MNT_RDONLY) && |
10051 | ((vp->v_type == VREG) || (vp->v_type == VDIR) || |
10052 | (vp->v_type == VLNK) || (vp->v_type == VCPLX) || |
10053 | (rights & KAUTH_VNODE_DELETE) || (rights & KAUTH_VNODE_DELETE_CHILD))) { |
10054 | result = EROFS; |
10055 | goto out; |
10056 | } |
10057 | |
10058 | /* |
10059 | * Check for noexec filesystems. |
10060 | */ |
10061 | if ((rights & KAUTH_VNODE_EXECUTE) && (vp->v_type == VREG) && (vp->v_mount->mnt_flag & MNT_NOEXEC)) { |
10062 | result = EACCES; |
10063 | goto out; |
10064 | } |
10065 | |
10066 | /* |
10067 | * Handle cases related to filesystems with non-local enforcement. |
10068 | * This call can return 0, in which case we will fall through to perform a |
10069 | * check based on VNOP_GETATTR data. Otherwise it returns 1 and sets |
10070 | * an appropriate result, at which point we can return immediately. |
10071 | */ |
10072 | if ((vp->v_mount->mnt_kern_flag & MNTK_AUTH_OPAQUE) && vnode_authorize_opaque(vp, resultp: &result, action, ctx)) { |
10073 | goto out; |
10074 | } |
10075 | |
10076 | /* |
10077 | * If the vnode is a namedstream (extended attribute) data vnode (eg. |
10078 | * a resource fork), *_DATA becomes *_EXTATTRIBUTES. |
10079 | */ |
10080 | if (vnode_isnamedstream(vp)) { |
10081 | if (rights & KAUTH_VNODE_READ_DATA) { |
10082 | rights &= ~KAUTH_VNODE_READ_DATA; |
10083 | rights |= KAUTH_VNODE_READ_EXTATTRIBUTES; |
10084 | } |
10085 | if (rights & KAUTH_VNODE_WRITE_DATA) { |
10086 | rights &= ~KAUTH_VNODE_WRITE_DATA; |
10087 | rights |= KAUTH_VNODE_WRITE_EXTATTRIBUTES; |
10088 | } |
10089 | |
10090 | /* |
10091 | * Point 'vp' to the namedstream's parent for ACL checking |
10092 | */ |
10093 | if ((vp->v_parent != NULL) && |
		    (vget_internal(vp->v_parent, 0, VNODE_NODEAD | VNODE_DRAINO) == 0)) {
10095 | parent_ref = TRUE; |
10096 | vcp->vp = vp = vp->v_parent; |
10097 | } |
10098 | } |
10099 | |
10100 | if (vfs_context_issuser(ctx)) { |
10101 | /* |
10102 | * if we're not asking for execute permissions or modifications, |
10103 | * then we're done, this action is authorized. |
10104 | */ |
10105 | if (!(rights & (KAUTH_VNODE_EXECUTE | KAUTH_VNODE_WRITE_RIGHTS))) { |
10106 | goto success; |
10107 | } |
10108 | |
10109 | is_suser = TRUE; |
10110 | } |
10111 | |
10112 | /* |
10113 | * Get vnode attributes and extended security information for the vnode |
10114 | * and directory if required. |
10115 | * |
10116 | * If we're root we only want mode bits and flags for checking |
10117 | * execute and immutability. |
10118 | */ |
10119 | VATTR_WANTED(&va, va_mode); |
10120 | VATTR_WANTED(&va, va_flags); |
10121 | if (!is_suser) { |
10122 | VATTR_WANTED(&va, va_uid); |
10123 | VATTR_WANTED(&va, va_gid); |
10124 | VATTR_WANTED(&va, va_acl); |
10125 | } |
	if ((result = vnode_getattr(vp, &va, ctx)) != 0) {
		KAUTH_DEBUG("%p ERROR - failed to get vnode attributes - %d", vp, result);
10128 | goto out; |
10129 | } |
10130 | VATTR_WANTED(&va, va_type); |
10131 | VATTR_RETURN(&va, va_type, vnode_vtype(vp)); |
10132 | |
10133 | if (vcp->dvp) { |
10134 | VATTR_WANTED(&dva, va_mode); |
10135 | VATTR_WANTED(&dva, va_flags); |
10136 | if (!is_suser) { |
10137 | VATTR_WANTED(&dva, va_uid); |
10138 | VATTR_WANTED(&dva, va_gid); |
10139 | VATTR_WANTED(&dva, va_acl); |
10140 | } |
		if ((result = vnode_getattr(vcp->dvp, &dva, ctx)) != 0) {
			KAUTH_DEBUG("%p ERROR - failed to get directory vnode attributes - %d", vp, result);
10143 | goto out; |
10144 | } |
10145 | VATTR_WANTED(&dva, va_type); |
10146 | VATTR_RETURN(&dva, va_type, vnode_vtype(vcp->dvp)); |
10147 | } |
10148 | |
	result = vnode_attr_authorize_internal(vcp, vp->v_mount, rights, is_suser,
	    &found_deny, noimmutable, parent_authorized_for_delete_child);
10151 | out: |
10152 | if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL)) { |
		kauth_acl_free(va.va_acl);
10154 | } |
10155 | if (VATTR_IS_SUPPORTED(&dva, va_acl) && (dva.va_acl != NULL)) { |
		kauth_acl_free(dva.va_acl);
10157 | } |
10158 | |
10159 | if (result) { |
10160 | if (parent_ref) { |
10161 | vnode_put(vp); |
10162 | } |
10163 | *errorp = result; |
10164 | KAUTH_DEBUG("%p DENIED - auth denied" , vp); |
10165 | return KAUTH_RESULT_DENY; |
10166 | } |
10167 | if ((rights & KAUTH_VNODE_SEARCH) && found_deny == FALSE && vp->v_type == VDIR) { |
10168 | /* |
10169 | * if we were successfully granted the right to search this directory |
10170 | * and there were NO ACL DENYs for search and the posix permissions also don't |
10171 | * deny execute, we can synthesize a global right that allows anyone to |
10172 | * traverse this directory during a pathname lookup without having to |
10173 | * match the credential associated with this cache of rights. |
10174 | * |
10175 | * Note that we can correctly cache KAUTH_VNODE_SEARCHBYANYONE |
10176 | * only if we actually check ACLs which we don't for root. As |
10177 | * a workaround, the lookup fast path checks for root. |
10178 | */ |
10179 | if (!VATTR_IS_SUPPORTED(&va, va_mode) || |
10180 | ((va.va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == |
10181 | (S_IXUSR | S_IXGRP | S_IXOTH))) { |
			vnode_cache_authorized_action(vp, ctx, KAUTH_VNODE_SEARCHBYANYONE);
10183 | } |
10184 | } |
10185 | success: |
10186 | if (parent_ref) { |
10187 | vnode_put(vp); |
10188 | } |
10189 | |
10190 | /* |
10191 | * Note that this implies that we will allow requests for no rights, as well as |
10192 | * for rights that we do not recognise. There should be none of these. |
10193 | */ |
10194 | KAUTH_DEBUG("%p ALLOWED - auth granted" , vp); |
10195 | return KAUTH_RESULT_ALLOW; |
10196 | } |
10197 | |
10198 | int |
10199 | vnode_attr_authorize_init(struct vnode_attr *vap, struct vnode_attr *dvap, |
10200 | kauth_action_t action, vfs_context_t ctx) |
10201 | { |
10202 | VATTR_INIT(vap); |
10203 | VATTR_WANTED(vap, va_type); |
10204 | VATTR_WANTED(vap, va_mode); |
10205 | VATTR_WANTED(vap, va_flags); |
10206 | if (dvap) { |
10207 | VATTR_INIT(dvap); |
10208 | if (action & KAUTH_VNODE_DELETE) { |
10209 | VATTR_WANTED(dvap, va_type); |
10210 | VATTR_WANTED(dvap, va_mode); |
10211 | VATTR_WANTED(dvap, va_flags); |
10212 | } |
10213 | } else if (action & KAUTH_VNODE_DELETE) { |
10214 | return EINVAL; |
10215 | } |
10216 | |
10217 | if (!vfs_context_issuser(ctx)) { |
10218 | VATTR_WANTED(vap, va_uid); |
10219 | VATTR_WANTED(vap, va_gid); |
10220 | VATTR_WANTED(vap, va_acl); |
10221 | if (dvap && (action & KAUTH_VNODE_DELETE)) { |
10222 | VATTR_WANTED(dvap, va_uid); |
10223 | VATTR_WANTED(dvap, va_gid); |
10224 | VATTR_WANTED(dvap, va_acl); |
10225 | } |
10226 | } |
10227 | |
10228 | return 0; |
10229 | } |
10230 | |
10231 | #define VNODE_SEC_ATTRS_NO_ACL (VNODE_ATTR_va_uid | VNODE_ATTR_va_gid | VNODE_ATTR_va_mode | VNODE_ATTR_va_flags | VNODE_ATTR_va_type) |
10232 | |
10233 | int |
10234 | vnode_attr_authorize(struct vnode_attr *vap, struct vnode_attr *dvap, mount_t mp, |
10235 | kauth_action_t action, vfs_context_t ctx) |
10236 | { |
10237 | struct _vnode_authorize_context auth_context; |
10238 | vauth_ctx vcp; |
10239 | kauth_ace_rights_t rights; |
10240 | int noimmutable; |
10241 | boolean_t found_deny; |
10242 | boolean_t is_suser = FALSE; |
10243 | int result = 0; |
10244 | uid_t ouid = vap->va_uid; |
10245 | gid_t ogid = vap->va_gid; |
10246 | |
10247 | vcp = &auth_context; |
10248 | vcp->ctx = ctx; |
10249 | vcp->vp = NULLVP; |
10250 | vcp->vap = vap; |
10251 | vcp->dvp = NULLVP; |
10252 | vcp->dvap = dvap; |
10253 | vcp->flags = vcp->flags_valid = 0; |
10254 | |
10255 | noimmutable = (action & KAUTH_VNODE_NOIMMUTABLE) ? 1 : 0; |
10256 | rights = action & ~(KAUTH_VNODE_ACCESS | KAUTH_VNODE_NOIMMUTABLE); |
10257 | |
10258 | /* |
10259 | * Check for read-only filesystems. |
10260 | */ |
10261 | if ((rights & KAUTH_VNODE_WRITE_RIGHTS) && |
10262 | mp && (mp->mnt_flag & MNT_RDONLY) && |
10263 | ((vap->va_type == VREG) || (vap->va_type == VDIR) || |
10264 | (vap->va_type == VLNK) || (rights & KAUTH_VNODE_DELETE) || |
10265 | (rights & KAUTH_VNODE_DELETE_CHILD))) { |
10266 | result = EROFS; |
10267 | goto out; |
10268 | } |
10269 | |
10270 | /* |
10271 | * Check for noexec filesystems. |
10272 | */ |
10273 | if ((rights & KAUTH_VNODE_EXECUTE) && |
10274 | (vap->va_type == VREG) && mp && (mp->mnt_flag & MNT_NOEXEC)) { |
10275 | result = EACCES; |
10276 | goto out; |
10277 | } |
10278 | |
10279 | if (vfs_context_issuser(ctx)) { |
10280 | /* |
10281 | * if we're not asking for execute permissions or modifications, |
10282 | * then we're done, this action is authorized. |
10283 | */ |
10284 | if (!(rights & (KAUTH_VNODE_EXECUTE | KAUTH_VNODE_WRITE_RIGHTS))) { |
10285 | goto out; |
10286 | } |
10287 | is_suser = TRUE; |
10288 | } |
10289 | |
10290 | if (mp) { |
10291 | if (vfs_extendedsecurity(mp) && VATTR_IS_ACTIVE(vap, va_acl) && !VATTR_IS_SUPPORTED(vap, va_acl)) { |
10292 | panic("(1) vnode attrs not complete for vnode_attr_authorize" ); |
10293 | } |
10294 | vnode_attr_handle_uid_and_gid(vap, mp, ctx); |
10295 | } |
10296 | |
10297 | if ((vap->va_active & VNODE_SEC_ATTRS_NO_ACL) != (vap->va_supported & VNODE_SEC_ATTRS_NO_ACL)) { |
10298 | panic("(2) vnode attrs not complete for vnode_attr_authorize (2) vap->va_active = 0x%llx , vap->va_supported = 0x%llx" , |
10299 | vap->va_active, vap->va_supported); |
10300 | } |
10301 | |
result = vnode_attr_authorize_internal(vcp, mp, rights, is_suser,
&found_deny, noimmutable, FALSE);
10304 | |
10305 | if (mp) { |
10306 | vap->va_uid = ouid; |
10307 | vap->va_gid = ogid; |
10308 | } |
10309 | |
10310 | if (result == EPERM) { |
10311 | result = EACCES; |
10312 | } |
10313 | out: |
10314 | return result; |
10315 | } |
10316 | |
10317 | |
10318 | int |
10319 | vnode_authattr_new(vnode_t dvp, struct vnode_attr *vap, int noauth, vfs_context_t ctx) |
10320 | { |
10321 | return vnode_authattr_new_internal(dvp, vap, noauth, NULL, ctx); |
10322 | } |
10323 | |
10324 | /* |
10325 | * Check that the attribute information in vattr can be legally applied to |
10326 | * a new file by the context. |
10327 | */ |
10328 | static int |
10329 | vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uint32_t *defaulted_fieldsp, vfs_context_t ctx) |
10330 | { |
10331 | int error; |
10332 | int has_priv_suser, ismember, defaulted_owner, defaulted_group, defaulted_mode; |
10333 | uint32_t inherit_flags; |
10334 | kauth_cred_t cred; |
10335 | guid_t changer; |
10336 | mount_t dmp; |
10337 | struct vnode_attr dva; |
10338 | |
10339 | error = 0; |
10340 | |
10341 | if (defaulted_fieldsp) { |
10342 | *defaulted_fieldsp = 0; |
10343 | } |
10344 | |
10345 | defaulted_owner = defaulted_group = defaulted_mode = 0; |
10346 | |
10347 | inherit_flags = 0; |
10348 | |
10349 | /* |
10350 | * Require that the filesystem support extended security to apply any. |
10351 | */ |
10352 | if (!vfs_extendedsecurity(dvp->v_mount) && |
10353 | (VATTR_IS_ACTIVE(vap, va_acl) || VATTR_IS_ACTIVE(vap, va_uuuid) || VATTR_IS_ACTIVE(vap, va_guuid))) { |
10354 | error = EINVAL; |
10355 | goto out; |
10356 | } |
10357 | |
10358 | /* |
10359 | * Default some fields. |
10360 | */ |
10361 | dmp = dvp->v_mount; |
10362 | |
10363 | /* |
10364 | * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit owner is set, that |
10365 | * owner takes ownership of all new files. |
10366 | */ |
10367 | if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsowner != KAUTH_UID_NONE)) { |
10368 | VATTR_SET(vap, va_uid, dmp->mnt_fsowner); |
10369 | defaulted_owner = 1; |
10370 | } else { |
10371 | if (!VATTR_IS_ACTIVE(vap, va_uid)) { |
10372 | /* default owner is current user */ |
10373 | VATTR_SET(vap, va_uid, kauth_cred_getuid(vfs_context_ucred(ctx))); |
10374 | defaulted_owner = 1; |
10375 | } |
10376 | } |
10377 | |
10378 | /* |
10379 | * We need the dvp's va_flags and *may* need the gid of the directory, |
* so we ask for both here.
10381 | */ |
10382 | VATTR_INIT(&dva); |
10383 | VATTR_WANTED(&dva, va_gid); |
10384 | VATTR_WANTED(&dva, va_flags); |
if ((error = vnode_getattr(dvp, &dva, ctx)) != 0) {
10386 | goto out; |
10387 | } |
10388 | |
10389 | /* |
* If the filesystem is mounted IGNORE_OWNERSHIP and an explicit group is set, that
10391 | * group takes ownership of all new files. |
10392 | */ |
10393 | if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsgroup != KAUTH_GID_NONE)) { |
10394 | VATTR_SET(vap, va_gid, dmp->mnt_fsgroup); |
10395 | defaulted_group = 1; |
10396 | } else { |
10397 | if (!VATTR_IS_ACTIVE(vap, va_gid)) { |
10398 | /* default group comes from parent object, fallback to current user */ |
10399 | if (VATTR_IS_SUPPORTED(&dva, va_gid)) { |
10400 | VATTR_SET(vap, va_gid, dva.va_gid); |
10401 | } else { |
10402 | VATTR_SET(vap, va_gid, kauth_cred_getgid(vfs_context_ucred(ctx))); |
10403 | } |
10404 | defaulted_group = 1; |
10405 | } |
10406 | } |
10407 | |
10408 | if (!VATTR_IS_ACTIVE(vap, va_flags)) { |
10409 | VATTR_SET(vap, va_flags, 0); |
10410 | } |
10411 | |
10412 | /* Determine if SF_RESTRICTED should be inherited from the parent |
10413 | * directory. */ |
10414 | if (VATTR_IS_SUPPORTED(&dva, va_flags)) { |
10415 | inherit_flags = dva.va_flags & (UF_DATAVAULT | SF_RESTRICTED); |
10416 | } |
10417 | |
10418 | /* default mode is everything, masked with current umask */ |
10419 | if (!VATTR_IS_ACTIVE(vap, va_mode)) { |
10420 | VATTR_SET(vap, va_mode, ACCESSPERMS & ~vfs_context_proc(ctx)->p_fd.fd_cmask); |
10421 | KAUTH_DEBUG("ATTR - defaulting new file mode to %o from umask %o" , |
10422 | vap->va_mode, vfs_context_proc(ctx)->p_fd.fd_cmask); |
10423 | defaulted_mode = 1; |
10424 | } |
10425 | /* set timestamps to now */ |
10426 | if (!VATTR_IS_ACTIVE(vap, va_create_time)) { |
10427 | nanotime(ts: &vap->va_create_time); |
10428 | VATTR_SET_ACTIVE(vap, va_create_time); |
10429 | } |
10430 | |
10431 | /* |
10432 | * Check for attempts to set nonsensical fields. |
10433 | */ |
10434 | if (vap->va_active & ~VNODE_ATTR_NEWOBJ) { |
10435 | error = EINVAL; |
10436 | KAUTH_DEBUG("ATTR - ERROR - attempt to set unsupported new-file attributes %llx" , |
10437 | vap->va_active & ~VNODE_ATTR_NEWOBJ); |
10438 | goto out; |
10439 | } |
10440 | |
10441 | /* |
10442 | * Quickly check for the applicability of any enforcement here. |
10443 | * Tests below maintain the integrity of the local security model. |
10444 | */ |
10445 | if (vfs_authopaque(mp: dvp->v_mount)) { |
10446 | goto out; |
10447 | } |
10448 | |
10449 | /* |
10450 | * We need to know if the caller is the superuser, or if the work is |
10451 | * otherwise already authorised. |
10452 | */ |
10453 | cred = vfs_context_ucred(ctx); |
10454 | if (noauth) { |
10455 | /* doing work for the kernel */ |
10456 | has_priv_suser = 1; |
10457 | } else { |
10458 | has_priv_suser = vfs_context_issuser(ctx); |
10459 | } |
10460 | |
10461 | |
10462 | if (VATTR_IS_ACTIVE(vap, va_flags)) { |
10463 | vap->va_flags &= ~SF_SYNTHETIC; |
10464 | if (has_priv_suser) { |
10465 | if ((vap->va_flags & (UF_SETTABLE | SF_SETTABLE)) != vap->va_flags) { |
10466 | error = EPERM; |
10467 | KAUTH_DEBUG(" DENIED - superuser attempt to set illegal flag(s)" ); |
10468 | goto out; |
10469 | } |
10470 | } else { |
10471 | if ((vap->va_flags & UF_SETTABLE) != vap->va_flags) { |
10472 | error = EPERM; |
10473 | KAUTH_DEBUG(" DENIED - user attempt to set illegal flag(s)" ); |
10474 | goto out; |
10475 | } |
10476 | } |
10477 | } |
10478 | |
10479 | /* if not superuser, validate legality of new-item attributes */ |
10480 | if (!has_priv_suser) { |
10481 | if (!defaulted_mode && VATTR_IS_ACTIVE(vap, va_mode)) { |
10482 | /* setgid? */ |
10483 | if (vap->va_mode & S_ISGID) { |
if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid);
10486 | goto out; |
10487 | } |
10488 | if (!ismember) { |
10489 | KAUTH_DEBUG(" DENIED - can't set SGID bit, not a member of %d" , vap->va_gid); |
10490 | error = EPERM; |
10491 | goto out; |
10492 | } |
10493 | } |
10494 | |
10495 | /* setuid? */ |
10496 | if ((vap->va_mode & S_ISUID) && (vap->va_uid != kauth_cred_getuid(cred: cred))) { |
10497 | KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit" ); |
10498 | error = EPERM; |
10499 | goto out; |
10500 | } |
10501 | } |
10502 | if (!defaulted_owner && (vap->va_uid != kauth_cred_getuid(cred: cred))) { |
10503 | KAUTH_DEBUG(" DENIED - cannot create new item owned by %d" , vap->va_uid); |
10504 | error = EPERM; |
10505 | goto out; |
10506 | } |
10507 | if (!defaulted_group) { |
if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
KAUTH_DEBUG(" ERROR - got %d checking for membership in %d", error, vap->va_gid);
10510 | goto out; |
10511 | } |
10512 | if (!ismember) { |
10513 | KAUTH_DEBUG(" DENIED - cannot create new item with group %d - not a member" , vap->va_gid); |
10514 | error = EPERM; |
10515 | goto out; |
10516 | } |
10517 | } |
10518 | |
10519 | /* initialising owner/group UUID */ |
10520 | if (VATTR_IS_ACTIVE(vap, va_uuuid)) { |
if ((error = kauth_cred_getguid(cred, &changer)) != 0) {
KAUTH_DEBUG(" ERROR - got %d trying to get caller UUID", error);
10523 | /* XXX ENOENT here - no GUID - should perhaps become EPERM */ |
10524 | goto out; |
10525 | } |
10526 | if (!kauth_guid_equal(guid1: &vap->va_uuuid, guid2: &changer)) { |
10527 | KAUTH_DEBUG(" ERROR - cannot create item with supplied owner UUID - not us" ); |
10528 | error = EPERM; |
10529 | goto out; |
10530 | } |
10531 | } |
10532 | if (VATTR_IS_ACTIVE(vap, va_guuid)) { |
if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) {
KAUTH_DEBUG(" ERROR - got %d trying to check group membership", error);
10535 | goto out; |
10536 | } |
10537 | if (!ismember) { |
10538 | KAUTH_DEBUG(" ERROR - cannot create item with supplied group UUID - not a member" ); |
10539 | error = EPERM; |
10540 | goto out; |
10541 | } |
10542 | } |
10543 | } |
10544 | out: |
10545 | if (inherit_flags) { |
10546 | /* Apply SF_RESTRICTED to the file if its parent directory was |
10547 | * restricted. This is done at the end so that root is not |
10548 | * required if this flag is only set due to inheritance. */ |
10549 | VATTR_SET(vap, va_flags, (vap->va_flags | inherit_flags)); |
10550 | } |
10551 | if (defaulted_fieldsp) { |
10552 | if (defaulted_mode) { |
10553 | *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_MODE; |
10554 | } |
10555 | if (defaulted_group) { |
10556 | *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_GID; |
10557 | } |
10558 | if (defaulted_owner) { |
10559 | *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_UID; |
10560 | } |
10561 | } |
10562 | return error; |
10563 | } |
10564 | |
10565 | /* |
10566 | * Check that the attribute information in vap can be legally written by the |
10567 | * context. |
10568 | * |
10569 | * Call this when you're not sure about the vnode_attr; either its contents |
10570 | * have come from an unknown source, or when they are variable. |
10571 | * |
10572 | * Returns errno, or zero and sets *actionp to the KAUTH_VNODE_* actions that |
10573 | * must be authorized to be permitted to write the vattr. |
10574 | */ |
10575 | int |
10576 | vnode_authattr(vnode_t vp, struct vnode_attr *vap, kauth_action_t *actionp, vfs_context_t ctx) |
10577 | { |
10578 | struct vnode_attr ova; |
10579 | kauth_action_t required_action; |
10580 | int error, has_priv_suser, ismember, chowner, chgroup, clear_suid, clear_sgid; |
10581 | guid_t changer; |
10582 | gid_t group; |
10583 | uid_t owner; |
10584 | mode_t newmode; |
10585 | kauth_cred_t cred; |
10586 | uint32_t fdelta; |
10587 | |
10588 | VATTR_INIT(&ova); |
10589 | required_action = 0; |
10590 | error = 0; |
10591 | |
10592 | /* |
10593 | * Quickly check for enforcement applicability. |
10594 | */ |
10595 | if (vfs_authopaque(mp: vp->v_mount)) { |
10596 | goto out; |
10597 | } |
10598 | |
10599 | /* |
10600 | * Check for attempts to set nonsensical fields. |
10601 | */ |
10602 | if (vap->va_active & VNODE_ATTR_RDONLY) { |
10603 | KAUTH_DEBUG("ATTR - ERROR: attempt to set readonly attribute(s)" ); |
10604 | error = EINVAL; |
10605 | goto out; |
10606 | } |
10607 | |
10608 | /* |
10609 | * We need to know if the caller is the superuser. |
10610 | */ |
10611 | cred = vfs_context_ucred(ctx); |
has_priv_suser = kauth_cred_issuser(cred);
10613 | |
10614 | /* |
10615 | * If any of the following are changing, we need information from the old file: |
10616 | * va_uid |
10617 | * va_gid |
10618 | * va_mode |
10619 | * va_uuuid |
10620 | * va_guuid |
10621 | */ |
10622 | if (VATTR_IS_ACTIVE(vap, va_uid) || |
10623 | VATTR_IS_ACTIVE(vap, va_gid) || |
10624 | VATTR_IS_ACTIVE(vap, va_mode) || |
10625 | VATTR_IS_ACTIVE(vap, va_uuuid) || |
10626 | VATTR_IS_ACTIVE(vap, va_guuid)) { |
10627 | VATTR_WANTED(&ova, va_mode); |
10628 | VATTR_WANTED(&ova, va_uid); |
10629 | VATTR_WANTED(&ova, va_gid); |
10630 | VATTR_WANTED(&ova, va_uuuid); |
10631 | VATTR_WANTED(&ova, va_guuid); |
10632 | KAUTH_DEBUG("ATTR - security information changing, fetching existing attributes" ); |
10633 | } |
10634 | |
10635 | /* |
10636 | * If timestamps are being changed, we need to know who the file is owned |
10637 | * by. |
10638 | */ |
10639 | if (VATTR_IS_ACTIVE(vap, va_create_time) || |
10640 | VATTR_IS_ACTIVE(vap, va_change_time) || |
10641 | VATTR_IS_ACTIVE(vap, va_modify_time) || |
10642 | VATTR_IS_ACTIVE(vap, va_access_time) || |
10643 | VATTR_IS_ACTIVE(vap, va_backup_time) || |
10644 | VATTR_IS_ACTIVE(vap, va_addedtime)) { |
10645 | VATTR_WANTED(&ova, va_uid); |
10646 | #if 0 /* enable this when we support UUIDs as official owners */ |
10647 | VATTR_WANTED(&ova, va_uuuid); |
10648 | #endif |
10649 | KAUTH_DEBUG("ATTR - timestamps changing, fetching uid and GUID" ); |
10650 | } |
10651 | |
10652 | /* |
10653 | * If flags are being changed, we need the old flags. |
10654 | */ |
10655 | if (VATTR_IS_ACTIVE(vap, va_flags)) { |
10656 | KAUTH_DEBUG("ATTR - flags changing, fetching old flags" ); |
10657 | VATTR_WANTED(&ova, va_flags); |
10658 | } |
10659 | |
10660 | /* |
10661 | * If ACLs are being changed, we need the old ACLs. |
10662 | */ |
10663 | if (VATTR_IS_ACTIVE(vap, va_acl)) { |
10664 | KAUTH_DEBUG("ATTR - acl changing, fetching old flags" ); |
10665 | VATTR_WANTED(&ova, va_acl); |
10666 | } |
10667 | |
10668 | /* |
10669 | * If the size is being set, make sure it's not a directory. |
10670 | */ |
10671 | if (VATTR_IS_ACTIVE(vap, va_data_size)) { |
10672 | /* size is only meaningful on regular files, don't permit otherwise */ |
10673 | if (!vnode_isreg(vp)) { |
10674 | KAUTH_DEBUG("ATTR - ERROR: size change requested on non-file" ); |
10675 | error = vnode_isdir(vp) ? EISDIR : EINVAL; |
10676 | goto out; |
10677 | } |
10678 | } |
10679 | |
10680 | /* |
10681 | * Get old data. |
10682 | */ |
10683 | KAUTH_DEBUG("ATTR - fetching old attributes %016llx" , ova.va_active); |
10684 | if ((error = vnode_getattr(vp, vap: &ova, ctx)) != 0) { |
10685 | KAUTH_DEBUG(" ERROR - got %d trying to get attributes" , error); |
10686 | goto out; |
10687 | } |
10688 | |
10689 | /* |
10690 | * Size changes require write access to the file data. |
10691 | */ |
10692 | if (VATTR_IS_ACTIVE(vap, va_data_size)) { |
10693 | /* if we can't get the size, or it's different, we need write access */ |
10694 | KAUTH_DEBUG("ATTR - size change, requiring WRITE_DATA" ); |
10695 | required_action |= KAUTH_VNODE_WRITE_DATA; |
10696 | } |
10697 | |
10698 | /* |
10699 | * Changing timestamps? |
10700 | * |
10701 | * Note that we are only called to authorize user-requested time changes; |
10702 | * side-effect time changes are not authorized. Authorisation is only |
10703 | * required for existing files. |
10704 | * |
10705 | * Non-owners are not permitted to change the time on an existing |
10706 | * file to anything other than the current time. |
10707 | */ |
10708 | if (VATTR_IS_ACTIVE(vap, va_create_time) || |
10709 | VATTR_IS_ACTIVE(vap, va_change_time) || |
10710 | VATTR_IS_ACTIVE(vap, va_modify_time) || |
10711 | VATTR_IS_ACTIVE(vap, va_access_time) || |
10712 | VATTR_IS_ACTIVE(vap, va_backup_time) || |
10713 | VATTR_IS_ACTIVE(vap, va_addedtime)) { |
10714 | /* |
10715 | * The owner and root may set any timestamps they like, |
10716 | * provided that the file is not immutable. The owner still needs |
10717 | * WRITE_ATTRIBUTES (implied by ownership but still deniable). |
10718 | */ |
if (has_priv_suser || vauth_node_owner(&ova, cred)) {
KAUTH_DEBUG("ATTR - root or owner changing timestamps");
10721 | required_action |= KAUTH_VNODE_CHECKIMMUTABLE | KAUTH_VNODE_WRITE_ATTRIBUTES; |
10722 | } else { |
10723 | /* just setting the current time? */ |
10724 | if (vap->va_vaflags & VA_UTIMES_NULL) { |
10725 | KAUTH_DEBUG("ATTR - non-root/owner changing timestamps, requiring WRITE_ATTRIBUTES" ); |
10726 | required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES; |
10727 | } else { |
10728 | KAUTH_DEBUG("ATTR - ERROR: illegal timestamp modification attempted" ); |
10729 | error = EACCES; |
10730 | goto out; |
10731 | } |
10732 | } |
10733 | } |
10734 | |
10735 | /* |
10736 | * Changing file mode? |
10737 | */ |
10738 | if (VATTR_IS_ACTIVE(vap, va_mode) && VATTR_IS_SUPPORTED(&ova, va_mode) && (ova.va_mode != vap->va_mode)) { |
10739 | KAUTH_DEBUG("ATTR - mode change from %06o to %06o" , ova.va_mode, vap->va_mode); |
10740 | |
10741 | /* |
10742 | * Mode changes always have the same basic auth requirements. |
10743 | */ |
10744 | if (has_priv_suser) { |
10745 | KAUTH_DEBUG("ATTR - superuser mode change, requiring immutability check" ); |
10746 | required_action |= KAUTH_VNODE_CHECKIMMUTABLE; |
10747 | } else { |
10748 | /* need WRITE_SECURITY */ |
10749 | KAUTH_DEBUG("ATTR - non-superuser mode change, requiring WRITE_SECURITY" ); |
10750 | required_action |= KAUTH_VNODE_WRITE_SECURITY; |
10751 | } |
10752 | |
10753 | /* |
10754 | * Can't set the setgid bit if you're not in the group and not root. Have to have |
10755 | * existing group information in the case we're not setting it right now. |
10756 | */ |
10757 | if (vap->va_mode & S_ISGID) { |
10758 | required_action |= KAUTH_VNODE_CHECKIMMUTABLE; /* always required */ |
10759 | if (!has_priv_suser) { |
10760 | if (VATTR_IS_ACTIVE(vap, va_gid)) { |
10761 | group = vap->va_gid; |
10762 | } else if (VATTR_IS_SUPPORTED(&ova, va_gid)) { |
10763 | group = ova.va_gid; |
10764 | } else { |
10765 | KAUTH_DEBUG("ATTR - ERROR: setgid but no gid available" ); |
10766 | error = EINVAL; |
10767 | goto out; |
10768 | } |
10769 | /* |
10770 | * This might be too restrictive; WRITE_SECURITY might be implied by |
10771 | * membership in this case, rather than being an additional requirement. |
10772 | */ |
if ((error = kauth_cred_ismember_gid(cred, group, &ismember)) != 0) {
KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid);
10775 | goto out; |
10776 | } |
10777 | if (!ismember) { |
10778 | KAUTH_DEBUG(" DENIED - can't set SGID bit, not a member of %d" , group); |
10779 | error = EPERM; |
10780 | goto out; |
10781 | } |
10782 | } |
10783 | } |
10784 | |
10785 | /* |
10786 | * Can't set the setuid bit unless you're root or the file's owner. |
10787 | */ |
10788 | if (vap->va_mode & S_ISUID) { |
10789 | required_action |= KAUTH_VNODE_CHECKIMMUTABLE; /* always required */ |
10790 | if (!has_priv_suser) { |
10791 | if (VATTR_IS_ACTIVE(vap, va_uid)) { |
10792 | owner = vap->va_uid; |
10793 | } else if (VATTR_IS_SUPPORTED(&ova, va_uid)) { |
10794 | owner = ova.va_uid; |
10795 | } else { |
10796 | KAUTH_DEBUG("ATTR - ERROR: setuid but no uid available" ); |
10797 | error = EINVAL; |
10798 | goto out; |
10799 | } |
if (owner != kauth_cred_getuid(cred)) {
10801 | /* |
10802 | * We could allow this if WRITE_SECURITY is permitted, perhaps. |
10803 | */ |
10804 | KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit" ); |
10805 | error = EPERM; |
10806 | goto out; |
10807 | } |
10808 | } |
10809 | } |
10810 | } |
10811 | |
10812 | /* |
10813 | * Validate/mask flags changes. This checks that only the flags in |
10814 | * the UF_SETTABLE mask are being set, and preserves the flags in |
10815 | * the SF_SETTABLE case. |
10816 | * |
10817 | * Since flags changes may be made in conjunction with other changes, |
10818 | * we will ask the auth code to ignore immutability in the case that |
10819 | * the SF_* flags are not set and we are only manipulating the file flags. |
10820 | * |
10821 | */ |
10822 | if (VATTR_IS_ACTIVE(vap, va_flags)) { |
10823 | /* compute changing flags bits */ |
10824 | vap->va_flags &= ~SF_SYNTHETIC; |
10825 | ova.va_flags &= ~SF_SYNTHETIC; |
10826 | if (VATTR_IS_SUPPORTED(&ova, va_flags)) { |
10827 | fdelta = vap->va_flags ^ ova.va_flags; |
10828 | } else { |
10829 | fdelta = vap->va_flags; |
10830 | } |
10831 | |
10832 | if (fdelta != 0) { |
10833 | KAUTH_DEBUG("ATTR - flags changing, requiring WRITE_SECURITY" ); |
10834 | required_action |= KAUTH_VNODE_WRITE_SECURITY; |
10835 | |
10836 | /* check that changing bits are legal */ |
10837 | if (has_priv_suser) { |
10838 | /* |
10839 | * The immutability check will prevent us from clearing the SF_* |
10840 | * flags unless the system securelevel permits it, so just check |
10841 | * for legal flags here. |
10842 | */ |
10843 | if (fdelta & ~(UF_SETTABLE | SF_SETTABLE)) { |
10844 | error = EPERM; |
10845 | KAUTH_DEBUG(" DENIED - superuser attempt to set illegal flag(s)" ); |
10846 | goto out; |
10847 | } |
10848 | } else { |
10849 | if (fdelta & ~UF_SETTABLE) { |
10850 | error = EPERM; |
10851 | KAUTH_DEBUG(" DENIED - user attempt to set illegal flag(s)" ); |
10852 | goto out; |
10853 | } |
10854 | } |
10855 | /* |
10856 | * If the caller has the ability to manipulate file flags, |
10857 | * security is not reduced by ignoring them for this operation. |
10858 | * |
10859 | * A more complete test here would consider the 'after' states of the flags |
10860 | * to determine whether it would permit the operation, but this becomes |
10861 | * very complex. |
10862 | * |
10863 | * Ignoring immutability is conditional on securelevel; this does not bypass |
10864 | * the SF_* flags if securelevel > 0. |
10865 | */ |
10866 | required_action |= KAUTH_VNODE_NOIMMUTABLE; |
10867 | } |
10868 | } |
10869 | |
10870 | /* |
10871 | * Validate ownership information. |
10872 | */ |
10873 | chowner = 0; |
10874 | chgroup = 0; |
10875 | clear_suid = 0; |
10876 | clear_sgid = 0; |
10877 | |
10878 | /* |
10879 | * uid changing |
10880 | * Note that if the filesystem didn't give us a UID, we expect that it doesn't |
10881 | * support them in general, and will ignore it if/when we try to set it. |
10882 | * We might want to clear the uid out of vap completely here. |
10883 | */ |
10884 | if (VATTR_IS_ACTIVE(vap, va_uid)) { |
10885 | if (VATTR_IS_SUPPORTED(&ova, va_uid) && (vap->va_uid != ova.va_uid)) { |
10886 | if (!has_priv_suser && (kauth_cred_getuid(cred: cred) != vap->va_uid)) { |
10887 | KAUTH_DEBUG(" DENIED - non-superuser cannot change ownershipt to a third party" ); |
10888 | error = EPERM; |
10889 | goto out; |
10890 | } |
10891 | chowner = 1; |
10892 | } |
10893 | clear_suid = 1; |
10894 | } |
10895 | |
10896 | /* |
10897 | * gid changing |
10898 | * Note that if the filesystem didn't give us a GID, we expect that it doesn't |
10899 | * support them in general, and will ignore it if/when we try to set it. |
10900 | * We might want to clear the gid out of vap completely here. |
10901 | */ |
10902 | if (VATTR_IS_ACTIVE(vap, va_gid)) { |
10903 | if (VATTR_IS_SUPPORTED(&ova, va_gid) && (vap->va_gid != ova.va_gid)) { |
10904 | if (!has_priv_suser) { |
if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
KAUTH_DEBUG(" ERROR - got %d checking for membership in %d", error, vap->va_gid);
10907 | goto out; |
10908 | } |
10909 | if (!ismember) { |
10910 | KAUTH_DEBUG(" DENIED - group change from %d to %d but not a member of target group" , |
10911 | ova.va_gid, vap->va_gid); |
10912 | error = EPERM; |
10913 | goto out; |
10914 | } |
10915 | } |
10916 | chgroup = 1; |
10917 | } |
10918 | clear_sgid = 1; |
10919 | } |
10920 | |
10921 | /* |
10922 | * Owner UUID being set or changed. |
10923 | */ |
10924 | if (VATTR_IS_ACTIVE(vap, va_uuuid)) { |
10925 | /* if the owner UUID is not actually changing ... */ |
10926 | if (VATTR_IS_SUPPORTED(&ova, va_uuuid)) { |
10927 | if (kauth_guid_equal(guid1: &vap->va_uuuid, guid2: &ova.va_uuuid)) { |
10928 | goto no_uuuid_change; |
10929 | } |
10930 | |
10931 | /* |
10932 | * If the current owner UUID is a null GUID, check |
10933 | * it against the UUID corresponding to the owner UID. |
10934 | */ |
10935 | if (kauth_guid_equal(guid1: &ova.va_uuuid, guid2: &kauth_null_guid) && |
10936 | VATTR_IS_SUPPORTED(&ova, va_uid)) { |
10937 | guid_t uid_guid; |
10938 | |
if (kauth_cred_uid2guid(ova.va_uid, &uid_guid) == 0 &&
kauth_guid_equal(&vap->va_uuuid, &uid_guid)) {
10941 | goto no_uuuid_change; |
10942 | } |
10943 | } |
10944 | } |
10945 | |
10946 | /* |
10947 | * The owner UUID cannot be set by a non-superuser to anything other than |
10948 | * their own or a null GUID (to "unset" the owner UUID). |
10949 | * Note that file systems must be prepared to handle the |
10950 | * null UUID case in a manner appropriate for that file |
10951 | * system. |
10952 | */ |
10953 | if (!has_priv_suser) { |
if ((error = kauth_cred_getguid(cred, &changer)) != 0) {
KAUTH_DEBUG(" ERROR - got %d trying to get caller UUID", error);
10956 | /* XXX ENOENT here - no UUID - should perhaps become EPERM */ |
10957 | goto out; |
10958 | } |
10959 | if (!kauth_guid_equal(guid1: &vap->va_uuuid, guid2: &changer) && |
10960 | !kauth_guid_equal(guid1: &vap->va_uuuid, guid2: &kauth_null_guid)) { |
10961 | KAUTH_DEBUG(" ERROR - cannot set supplied owner UUID - not us / null" ); |
10962 | error = EPERM; |
10963 | goto out; |
10964 | } |
10965 | } |
10966 | chowner = 1; |
10967 | clear_suid = 1; |
10968 | } |
10969 | no_uuuid_change: |
10970 | /* |
10971 | * Group UUID being set or changed. |
10972 | */ |
10973 | if (VATTR_IS_ACTIVE(vap, va_guuid)) { |
10974 | /* if the group UUID is not actually changing ... */ |
10975 | if (VATTR_IS_SUPPORTED(&ova, va_guuid)) { |
10976 | if (kauth_guid_equal(guid1: &vap->va_guuid, guid2: &ova.va_guuid)) { |
10977 | goto no_guuid_change; |
10978 | } |
10979 | |
10980 | /* |
10981 | * If the current group UUID is a null UUID, check |
10982 | * it against the UUID corresponding to the group GID. |
10983 | */ |
10984 | if (kauth_guid_equal(guid1: &ova.va_guuid, guid2: &kauth_null_guid) && |
10985 | VATTR_IS_SUPPORTED(&ova, va_gid)) { |
10986 | guid_t gid_guid; |
10987 | |
if (kauth_cred_gid2guid(ova.va_gid, &gid_guid) == 0 &&
kauth_guid_equal(&vap->va_guuid, &gid_guid)) {
10990 | goto no_guuid_change; |
10991 | } |
10992 | } |
10993 | } |
10994 | |
10995 | /* |
10996 | * The group UUID cannot be set by a non-superuser to anything other than |
10997 | * one of which they are a member or a null GUID (to "unset" |
10998 | * the group UUID). |
10999 | * Note that file systems must be prepared to handle the |
11000 | * null UUID case in a manner appropriate for that file |
11001 | * system. |
11002 | */ |
11003 | if (!has_priv_suser) { |
11004 | if (kauth_guid_equal(guid1: &vap->va_guuid, guid2: &kauth_null_guid)) { |
11005 | ismember = 1; |
11006 | } else if ((error = kauth_cred_ismember_guid(cred: cred, guidp: &vap->va_guuid, resultp: &ismember)) != 0) { |
11007 | KAUTH_DEBUG(" ERROR - got %d trying to check group membership" , error); |
11008 | goto out; |
11009 | } |
11010 | if (!ismember) { |
11011 | KAUTH_DEBUG(" ERROR - cannot set supplied group UUID - not a member / null" ); |
11012 | error = EPERM; |
11013 | goto out; |
11014 | } |
11015 | } |
11016 | chgroup = 1; |
11017 | } |
11018 | no_guuid_change: |
11019 | |
11020 | /* |
11021 | * Compute authorisation for group/ownership changes. |
11022 | */ |
11023 | if (chowner || chgroup || clear_suid || clear_sgid) { |
11024 | if (has_priv_suser) { |
11025 | KAUTH_DEBUG("ATTR - superuser changing file owner/group, requiring immutability check" ); |
11026 | required_action |= KAUTH_VNODE_CHECKIMMUTABLE; |
11027 | } else { |
11028 | if (chowner) { |
11029 | KAUTH_DEBUG("ATTR - ownership change, requiring TAKE_OWNERSHIP" ); |
11030 | required_action |= KAUTH_VNODE_TAKE_OWNERSHIP; |
11031 | } |
11032 | if (chgroup && !chowner) { |
11033 | KAUTH_DEBUG("ATTR - group change, requiring WRITE_SECURITY" ); |
11034 | required_action |= KAUTH_VNODE_WRITE_SECURITY; |
11035 | } |
11036 | } |
11037 | |
11038 | /* |
11039 | * clear set-uid and set-gid bits. POSIX only requires this for |
11040 | * non-privileged processes but we do it even for root. |
11041 | */ |
11042 | if (VATTR_IS_ACTIVE(vap, va_mode)) { |
11043 | newmode = vap->va_mode; |
11044 | } else if (VATTR_IS_SUPPORTED(&ova, va_mode)) { |
11045 | newmode = ova.va_mode; |
11046 | } else { |
11047 | KAUTH_DEBUG("CHOWN - trying to change owner but cannot get mode from filesystem to mask setugid bits" ); |
11048 | newmode = 0; |
11049 | } |
11050 | |
/* chown always clears the setuid/setgid bits. An exception is made for
* setattrlist, which can set <uid, gid, mode> on a file at the same time:
* in that case setattrlist is allowed to set the new mode and change
* (chown) the uid/gid.
*/
11056 | if (newmode & (S_ISUID | S_ISGID)) { |
11057 | if (!VATTR_IS_ACTIVE(vap, va_mode)) { |
11058 | KAUTH_DEBUG("CHOWN - masking setugid bits from mode %o to %o" , |
11059 | newmode, newmode & ~(S_ISUID | S_ISGID)); |
11060 | newmode &= ~(S_ISUID | S_ISGID); |
11061 | } |
11062 | VATTR_SET(vap, va_mode, newmode); |
11063 | } |
11064 | } |
11065 | |
11066 | /* |
11067 | * Authorise changes in the ACL. |
11068 | */ |
11069 | if (VATTR_IS_ACTIVE(vap, va_acl)) { |
11070 | /* no existing ACL */ |
11071 | if (!VATTR_IS_ACTIVE(&ova, va_acl) || (ova.va_acl == NULL)) { |
11072 | /* adding an ACL */ |
11073 | if (vap->va_acl != NULL) { |
11074 | required_action |= KAUTH_VNODE_WRITE_SECURITY; |
11075 | KAUTH_DEBUG("CHMOD - adding ACL" ); |
11076 | } |
11077 | |
11078 | /* removing an existing ACL */ |
11079 | } else if (vap->va_acl == NULL) { |
11080 | required_action |= KAUTH_VNODE_WRITE_SECURITY; |
11081 | KAUTH_DEBUG("CHMOD - removing ACL" ); |
11082 | |
11083 | /* updating an existing ACL */ |
11084 | } else { |
11085 | if (vap->va_acl->acl_entrycount != ova.va_acl->acl_entrycount) { |
11086 | /* entry count changed, must be different */ |
11087 | required_action |= KAUTH_VNODE_WRITE_SECURITY; |
11088 | KAUTH_DEBUG("CHMOD - adding/removing ACL entries" ); |
11089 | } else if (vap->va_acl->acl_entrycount > 0) { |
11090 | /* both ACLs have the same ACE count, said count is 1 or more, bitwise compare ACLs */ |
11091 | if (memcmp(s1: &vap->va_acl->acl_ace[0], s2: &ova.va_acl->acl_ace[0], |
11092 | n: sizeof(struct kauth_ace) * vap->va_acl->acl_entrycount)) { |
11093 | required_action |= KAUTH_VNODE_WRITE_SECURITY; |
11094 | KAUTH_DEBUG("CHMOD - changing ACL entries" ); |
11095 | } |
11096 | } |
11097 | } |
11098 | } |
11099 | |
11100 | /* |
11101 | * Other attributes that require authorisation. |
11102 | */ |
11103 | if (VATTR_IS_ACTIVE(vap, va_encoding)) { |
11104 | required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES; |
11105 | } |
11106 | |
11107 | out: |
11108 | if (VATTR_IS_SUPPORTED(&ova, va_acl) && (ova.va_acl != NULL)) { |
kauth_acl_free(ova.va_acl);
11110 | } |
11111 | if (error == 0) { |
11112 | *actionp = required_action; |
11113 | } |
11114 | return error; |
11115 | } |
11116 | |
11117 | static int |
11118 | setlocklocal_callback(struct vnode *vp, __unused void *cargs) |
11119 | { |
11120 | vnode_lock_spin(vp); |
11121 | vp->v_flag |= VLOCKLOCAL; |
11122 | vnode_unlock(vp); |
11123 | |
11124 | return VNODE_RETURNED; |
11125 | } |
11126 | |
11127 | void |
11128 | vfs_setlocklocal(mount_t mp) |
11129 | { |
11130 | mount_lock_spin(mp); |
11131 | mp->mnt_kern_flag |= MNTK_LOCK_LOCAL; |
11132 | mount_unlock(mp); |
11133 | |
11134 | /* |
11135 | * The number of active vnodes is expected to be |
11136 | * very small when vfs_setlocklocal is invoked. |
11137 | */ |
vnode_iterate(mp, 0, setlocklocal_callback, NULL);
11139 | } |
11140 | |
11141 | void |
11142 | vfs_setcompoundopen(mount_t mp) |
11143 | { |
11144 | mount_lock_spin(mp); |
11145 | mp->mnt_compound_ops |= COMPOUND_VNOP_OPEN; |
11146 | mount_unlock(mp); |
11147 | } |
11148 | |
11149 | void |
11150 | vnode_setswapmount(vnode_t vp) |
11151 | { |
11152 | mount_lock(mp: vp->v_mount); |
11153 | vp->v_mount->mnt_kern_flag |= MNTK_SWAP_MOUNT; |
11154 | mount_unlock(mp: vp->v_mount); |
11155 | } |
11156 | |
11157 | void |
11158 | vfs_setfskit(mount_t mp) |
11159 | { |
11160 | mount_lock_spin(mp); |
11161 | mp->mnt_kern_flag |= MNTK_FSKIT; |
11162 | mount_unlock(mp); |
11163 | } |
11164 | |
11165 | char * |
11166 | vfs_getfstypenameref_locked(mount_t mp, size_t *lenp) |
11167 | { |
11168 | char *name; |
11169 | |
11170 | if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) { |
11171 | name = mp->fstypename_override; |
11172 | } else { |
11173 | name = mp->mnt_vfsstat.f_fstypename; |
11174 | } |
11175 | if (lenp != NULL) { |
*lenp = strlen(name);
11177 | } |
11178 | return name; |
11179 | } |
11180 | |
11181 | void |
11182 | vfs_getfstypename(mount_t mp, char *buf, size_t buflen) |
11183 | { |
11184 | mount_lock_spin(mp); |
strlcpy(buf, vfs_getfstypenameref_locked(mp, NULL), buflen);
11186 | mount_unlock(mp); |
11187 | } |
11188 | |
11189 | void |
11190 | vfs_setfstypename_locked(mount_t mp, const char *name) |
11191 | { |
11192 | if (name == NULL || name[0] == '\0') { |
11193 | mp->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE; |
11194 | mp->fstypename_override[0] = '\0'; |
11195 | } else { |
11196 | strlcpy(dst: mp->fstypename_override, src: name, |
11197 | n: sizeof(mp->fstypename_override)); |
11198 | mp->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE; |
11199 | } |
11200 | } |
11201 | |
11202 | void |
11203 | vfs_setfstypename(mount_t mp, const char *name) |
11204 | { |
11205 | mount_lock_spin(mp); |
11206 | vfs_setfstypename_locked(mp, name); |
11207 | mount_unlock(mp); |
11208 | } |
11209 | |
11210 | int64_t |
11211 | vnode_getswappin_avail(vnode_t vp) |
11212 | { |
11213 | int64_t max_swappin_avail = 0; |
11214 | |
11215 | mount_lock(mp: vp->v_mount); |
11216 | if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_SWAPPIN_SUPPORTED) { |
11217 | max_swappin_avail = vp->v_mount->mnt_max_swappin_available; |
11218 | } |
11219 | mount_unlock(mp: vp->v_mount); |
11220 | |
11221 | return max_swappin_avail; |
11222 | } |
11223 | |
11224 | |
11225 | void |
11226 | vn_setunionwait(vnode_t vp) |
11227 | { |
11228 | vnode_lock_spin(vp); |
11229 | vp->v_flag |= VISUNION; |
11230 | vnode_unlock(vp); |
11231 | } |
11232 | |
11233 | |
11234 | void |
11235 | vn_checkunionwait(vnode_t vp) |
11236 | { |
11237 | vnode_lock_spin(vp); |
11238 | while ((vp->v_flag & VISUNION) == VISUNION) { |
11239 | msleep(chan: (caddr_t)&vp->v_flag, mtx: &vp->v_lock, pri: 0, wmesg: 0, ts: 0); |
11240 | } |
11241 | vnode_unlock(vp); |
11242 | } |
11243 | |
11244 | void |
11245 | vn_clearunionwait(vnode_t vp, int locked) |
11246 | { |
11247 | if (!locked) { |
11248 | vnode_lock_spin(vp); |
11249 | } |
11250 | if ((vp->v_flag & VISUNION) == VISUNION) { |
11251 | vp->v_flag &= ~VISUNION; |
11252 | wakeup(chan: (caddr_t)&vp->v_flag); |
11253 | } |
11254 | if (!locked) { |
11255 | vnode_unlock(vp); |
11256 | } |
11257 | } |
11258 | |
11259 | /* |
11260 | * Removes orphaned apple double files during a rmdir |
11261 | * Works by: |
11262 | * 1. vnode_suspend(). |
11263 | * 2. Call VNOP_READDIR() till the end of directory is reached. |
11264 | * 3. Check if the directory entries returned are regular files with name starting with "._". If not, return ENOTEMPTY. |
11265 | * 4. Continue (2) and (3) till end of directory is reached. |
11266 | * 5. If all the entries in the directory were files with "._" name, delete all the files. |
11267 | * 6. vnode_resume() |
11268 | * 7. If deletion of all files succeeded, call VNOP_RMDIR() again. |
11269 | */ |
11270 | |
11271 | errno_t |
11272 | rmdir_remove_orphaned_appleDouble(vnode_t vp, vfs_context_t ctx, int * restart_flag) |
11273 | { |
11274 | #define UIO_BUFF_SIZE 2048 |
11275 | uio_t auio = NULL; |
11276 | int eofflag, siz = UIO_BUFF_SIZE, alloc_size = 0, nentries = 0; |
11277 | int open_flag = 0, full_erase_flag = 0; |
11278 | UIO_STACKBUF(uio_buf, 1); |
11279 | char *rbuf = NULL; |
11280 | void *dir_pos; |
11281 | void *dir_end; |
11282 | struct dirent *dp; |
11283 | errno_t error; |
11284 | |
11285 | error = vnode_suspend(vp); |
11286 | |
11287 | /* |
11288 | * restart_flag is set so that the calling rmdir sleeps and resets |
11289 | */ |
11290 | if (error == EBUSY) { |
11291 | *restart_flag = 1; |
11292 | } |
11293 | if (error != 0) { |
11294 | return error; |
11295 | } |
11296 | |
11297 | /* |
11298 | * Prevent dataless fault materialization while we have |
11299 | * a suspended vnode. |
11300 | */ |
11301 | uthread_t ut = current_uthread(); |
11302 | bool saved_nodatalessfaults = |
11303 | (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? true : false; |
11304 | ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS; |
11305 | |
11306 | /* |
11307 | * set up UIO |
11308 | */ |
11309 | rbuf = kalloc_data(siz, Z_WAITOK); |
11310 | alloc_size = siz; |
11311 | if (rbuf) { |
auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ,
&uio_buf[0], sizeof(uio_buf));
11314 | } |
11315 | if (!rbuf || !auio) { |
11316 | error = ENOMEM; |
11317 | goto outsc; |
11318 | } |
11319 | |
uio_setoffset(auio, 0);
11321 | |
11322 | eofflag = 0; |
11323 | |
11324 | if ((error = VNOP_OPEN(vp, FREAD, ctx))) { |
11325 | goto outsc; |
11326 | } else { |
11327 | open_flag = 1; |
11328 | } |
11329 | |
11330 | /* |
11331 | * First pass checks if all files are appleDouble files. |
11332 | */ |
11333 | |
11334 | do { |
11335 | siz = UIO_BUFF_SIZE; |
uio_reset(auio, uio_offset(auio), UIO_SYSSPACE, UIO_READ);
uio_addiov(auio, CAST_USER_ADDR_T(rbuf), UIO_BUFF_SIZE);
11338 | |
11339 | if ((error = VNOP_READDIR(vp, auio, 0, &eofflag, &nentries, ctx))) { |
11340 | goto outsc; |
11341 | } |
11342 | |
if (uio_resid(auio) != 0) {
siz -= uio_resid(auio);
11345 | } |
11346 | |
11347 | /* |
11348 | * Iterate through directory |
11349 | */ |
11350 | dir_pos = (void*) rbuf; |
11351 | dir_end = (void*) (rbuf + siz); |
11352 | dp = (struct dirent*) (dir_pos); |
11353 | |
11354 | if (dir_pos == dir_end) { |
11355 | eofflag = 1; |
11356 | } |
11357 | |
11358 | while (dir_pos < dir_end) { |
11359 | /* |
11360 | * Check for . and .. as well as directories |
11361 | */ |
11362 | if (dp->d_ino != 0 && |
11363 | !((dp->d_namlen == 1 && dp->d_name[0] == '.') || |
11364 | (dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.'))) { |
11365 | /* |
11366 | * Check for irregular files and ._ files |
11367 | * If there is a ._._ file abort the op |
11368 | */ |
11369 | if (dp->d_namlen < 2 || |
11370 | strncmp(s1: dp->d_name, s2: "._" , n: 2) || |
11371 | (dp->d_namlen >= 4 && !strncmp(s1: &(dp->d_name[2]), s2: "._" , n: 2))) { |
11372 | error = ENOTEMPTY; |
11373 | goto outsc; |
11374 | } |
11375 | } |
11376 | dir_pos = (void*) ((uint8_t*)dir_pos + dp->d_reclen); |
11377 | dp = (struct dirent*)dir_pos; |
11378 | } |
11379 | |
11380 | /* |
11381 | * workaround for HFS/NFS setting eofflag before end of file |
11382 | */ |
11383 | if (vp->v_tag == VT_HFS && nentries > 2) { |
11384 | eofflag = 0; |
11385 | } |
11386 | |
11387 | if (vp->v_tag == VT_NFS) { |
11388 | if (eofflag && !full_erase_flag) { |
11389 | full_erase_flag = 1; |
11390 | eofflag = 0; |
uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
11392 | } else if (!eofflag && full_erase_flag) { |
11393 | full_erase_flag = 0; |
11394 | } |
11395 | } |
11396 | } while (!eofflag); |
11397 | /* |
11398 | * If we've made it here all the files in the dir are ._ files. |
11399 | * We can delete the files even though the node is suspended |
11400 | * because we are the owner of the file. |
11401 | */ |
11402 | |
uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
11404 | eofflag = 0; |
11405 | full_erase_flag = 0; |
11406 | |
11407 | do { |
11408 | siz = UIO_BUFF_SIZE; |
uio_reset(auio, uio_offset(auio), UIO_SYSSPACE, UIO_READ);
uio_addiov(auio, CAST_USER_ADDR_T(rbuf), UIO_BUFF_SIZE);
11411 | |
11412 | error = VNOP_READDIR(vp, auio, 0, &eofflag, &nentries, ctx); |
11413 | |
11414 | if (error != 0) { |
11415 | goto outsc; |
11416 | } |
11417 | |
if (uio_resid(auio) != 0) {
siz -= uio_resid(auio);
11420 | } |
11421 | |
11422 | /* |
11423 | * Iterate through directory |
11424 | */ |
11425 | dir_pos = (void*) rbuf; |
11426 | dir_end = (void*) (rbuf + siz); |
11427 | dp = (struct dirent*) dir_pos; |
11428 | |
11429 | if (dir_pos == dir_end) { |
11430 | eofflag = 1; |
11431 | } |
11432 | |
11433 | while (dir_pos < dir_end) { |
11434 | /* |
11435 | * Check for . and .. as well as directories |
11436 | */ |
11437 | if (dp->d_ino != 0 && |
11438 | !((dp->d_namlen == 1 && dp->d_name[0] == '.') || |
11439 | (dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.')) |
11440 | ) { |
11441 | error = unlink1(ctx, vp, |
11442 | CAST_USER_ADDR_T(dp->d_name), UIO_SYSSPACE, |
11443 | VNODE_REMOVE_SKIP_NAMESPACE_EVENT | |
11444 | VNODE_REMOVE_NO_AUDIT_PATH); |
11445 | |
11446 | if (error && error != ENOENT) { |
11447 | goto outsc; |
11448 | } |
11449 | } |
11450 | dir_pos = (void*) ((uint8_t*)dir_pos + dp->d_reclen); |
11451 | dp = (struct dirent*)dir_pos; |
11452 | } |
11453 | |
11454 | /* |
11455 | * workaround for HFS/NFS setting eofflag before end of file |
11456 | */ |
11457 | if (vp->v_tag == VT_HFS && nentries > 2) { |
11458 | eofflag = 0; |
11459 | } |
11460 | |
11461 | if (vp->v_tag == VT_NFS) { |
11462 | if (eofflag && !full_erase_flag) { |
11463 | full_erase_flag = 1; |
11464 | eofflag = 0; |
uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
11466 | } else if (!eofflag && full_erase_flag) { |
11467 | full_erase_flag = 0; |
11468 | } |
11469 | } |
11470 | } while (!eofflag); |
11471 | |
11472 | |
11473 | error = 0; |
11474 | |
11475 | outsc: |
11476 | if (open_flag) { |
11477 | VNOP_CLOSE(vp, FREAD, ctx); |
11478 | } |
11479 | |
11480 | if (auio) { |
uio_free(auio);
11482 | } |
11483 | kfree_data(rbuf, alloc_size); |
11484 | |
11485 | if (saved_nodatalessfaults == false) { |
11486 | ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS; |
11487 | } |
11488 | |
11489 | vnode_resume(vp); |
11490 | |
11491 | return error; |
11492 | } |
11493 | |
11494 | |
11495 | void |
11496 | lock_vnode_and_post(vnode_t vp, int kevent_num) |
11497 | { |
11498 | /* Only take the lock if there's something there! */ |
11499 | if (vp->v_knotes.slh_first != NULL) { |
11500 | vnode_lock(vp); |
11501 | KNOTE(&vp->v_knotes, kevent_num); |
11502 | vnode_unlock(vp); |
11503 | } |
11504 | } |
11505 | |
11506 | void panic_print_vnodes(void); |
11507 | |
11508 | /* define PANIC_PRINTS_VNODES only if investigation is required. */ |
11509 | #ifdef PANIC_PRINTS_VNODES |
11510 | |
11511 | static const char * |
11512 | __vtype(uint16_t vtype) |
11513 | { |
11514 | switch (vtype) { |
11515 | case VREG: |
11516 | return "R" ; |
11517 | case VDIR: |
11518 | return "D" ; |
11519 | case VBLK: |
11520 | return "B" ; |
11521 | case VCHR: |
11522 | return "C" ; |
11523 | case VLNK: |
11524 | return "L" ; |
11525 | case VSOCK: |
11526 | return "S" ; |
11527 | case VFIFO: |
11528 | return "F" ; |
11529 | case VBAD: |
11530 | return "x" ; |
11531 | case VSTR: |
11532 | return "T" ; |
11533 | case VCPLX: |
11534 | return "X" ; |
11535 | default: |
11536 | return "?" ; |
11537 | } |
11538 | } |
11539 | |
11540 | /* |
11541 | * build a path from the bottom up |
11542 | * NOTE: called from the panic path - no alloc'ing of memory and no locks! |
11543 | */ |
11544 | static char * |
11545 | __vpath(vnode_t vp, char *str, int len, int depth) |
11546 | { |
11547 | int vnm_len; |
11548 | const char *src; |
11549 | char *dst; |
11550 | |
11551 | if (len <= 0) { |
11552 | return str; |
11553 | } |
11554 | /* str + len is the start of the string we created */ |
11555 | if (!vp->v_name) { |
11556 | return str + len; |
11557 | } |
11558 | |
11559 | /* follow mount vnodes to get the full path */ |
11560 | if ((vp->v_flag & VROOT)) { |
11561 | if (vp->v_mount != NULL && vp->v_mount->mnt_vnodecovered) { |
11562 | return __vpath(vp->v_mount->mnt_vnodecovered, |
11563 | str, len, depth + 1); |
11564 | } |
11565 | return str + len; |
11566 | } |
11567 | |
11568 | src = vp->v_name; |
11569 | vnm_len = strlen(src); |
11570 | if (vnm_len > len) { |
11571 | /* truncate the name to fit in the string */ |
11572 | src += (vnm_len - len); |
11573 | vnm_len = len; |
11574 | } |
11575 | |
11576 | /* start from the back and copy just characters (no NULLs) */ |
11577 | |
11578 | /* this will chop off leaf path (file) names */ |
11579 | if (depth > 0) { |
11580 | dst = str + len - vnm_len; |
11581 | memcpy(dst, src, vnm_len); |
11582 | len -= vnm_len; |
11583 | } else { |
11584 | dst = str + len; |
11585 | } |
11586 | |
11587 | if (vp->v_parent && len > 1) { |
11588 | /* follow parents up the chain */ |
11589 | len--; |
11590 | *(dst - 1) = '/'; |
11591 | return __vpath(vp->v_parent, str, len, depth + 1); |
11592 | } |
11593 | |
11594 | return dst; |
11595 | } |
11596 | |
11597 | #define SANE_VNODE_PRINT_LIMIT 5000 |
11598 | void |
11599 | panic_print_vnodes(void) |
11600 | { |
11601 | mount_t mnt; |
11602 | vnode_t vp; |
11603 | int nvnodes = 0; |
11604 | const char *type; |
11605 | char *nm; |
11606 | char vname[257]; |
11607 | |
11608 | paniclog_append_noflush("\n***** VNODES *****\n" |
11609 | "TYPE UREF ICNT PATH\n" ); |
11610 | |
11611 | /* NULL-terminate the path name */ |
11612 | vname[sizeof(vname) - 1] = '\0'; |
11613 | |
11614 | /* |
11615 | * iterate all vnodelist items in all mounts (mntlist) -> mnt_vnodelist |
11616 | */ |
11617 | TAILQ_FOREACH(mnt, &mountlist, mnt_list) { |
11618 | if (!ml_validate_nofault((vm_offset_t)mnt, sizeof(mount_t))) { |
11619 | paniclog_append_noflush("Unable to iterate the mount list %p - encountered an invalid mount pointer %p \n" , |
11620 | &mountlist, mnt); |
11621 | break; |
11622 | } |
11623 | |
11624 | TAILQ_FOREACH(vp, &mnt->mnt_vnodelist, v_mntvnodes) { |
11625 | if (!ml_validate_nofault((vm_offset_t)vp, sizeof(vnode_t))) { |
11626 | paniclog_append_noflush("Unable to iterate the vnode list %p - encountered an invalid vnode pointer %p \n" , |
11627 | &mnt->mnt_vnodelist, vp); |
11628 | break; |
11629 | } |
11630 | |
11631 | if (++nvnodes > SANE_VNODE_PRINT_LIMIT) { |
11632 | return; |
11633 | } |
11634 | type = __vtype(vp->v_type); |
11635 | nm = __vpath(vp, vname, sizeof(vname) - 1, 0); |
11636 | paniclog_append_noflush("%s %0d %0d %s\n" , |
11637 | type, vp->v_usecount, vp->v_iocount, nm); |
11638 | } |
11639 | } |
11640 | } |
11641 | |
11642 | #else /* !PANIC_PRINTS_VNODES */ |
11643 | void |
11644 | panic_print_vnodes(void) |
11645 | { |
11646 | return; |
11647 | } |
11648 | #endif |
11649 | |
11650 | |
11651 | #ifdef CONFIG_IOCOUNT_TRACE |
11652 | static void |
11653 | record_iocount_trace_vnode(vnode_t vp, int type) |
11654 | { |
11655 | void *stacks[IOCOUNT_TRACE_MAX_FRAMES] = {0}; |
11656 | int idx = vp->v_iocount_trace[type].idx; |
11657 | |
11658 | if (idx >= IOCOUNT_TRACE_MAX_IDX) { |
11659 | return; |
11660 | } |
11661 | |
11662 | OSBacktrace((void **)&stacks[0], IOCOUNT_TRACE_MAX_FRAMES); |
11663 | |
11664 | /* |
11665 | * To save index space, only store the unique backtraces. If dup is found, |
11666 | * just bump the count and return. |
11667 | */ |
11668 | for (int i = 0; i < idx; i++) { |
11669 | if (memcmp(&stacks[0], &vp->v_iocount_trace[type].stacks[i][0], |
11670 | sizeof(stacks)) == 0) { |
11671 | vp->v_iocount_trace[type].counts[i]++; |
11672 | return; |
11673 | } |
11674 | } |
11675 | |
11676 | memcpy(&vp->v_iocount_trace[type].stacks[idx][0], &stacks[0], |
11677 | sizeof(stacks)); |
11678 | vp->v_iocount_trace[type].counts[idx] = 1; |
11679 | vp->v_iocount_trace[type].idx++; |
11680 | } |
11681 | |
11682 | static void |
11683 | record_iocount_trace_uthread(vnode_t vp, int count) |
11684 | { |
11685 | struct uthread *ut; |
11686 | |
11687 | ut = current_uthread(); |
11688 | ut->uu_iocount += count; |
11689 | |
11690 | if (count == 1) { |
11691 | if (ut->uu_vpindex < 32) { |
11692 | OSBacktrace((void **)&ut->uu_pcs[ut->uu_vpindex][0], 10); |
11693 | |
11694 | ut->uu_vps[ut->uu_vpindex] = vp; |
11695 | ut->uu_vpindex++; |
11696 | } |
11697 | } |
11698 | } |
11699 | |
11700 | static void |
11701 | record_vp(vnode_t vp, int count) |
11702 | { |
11703 | if (__probable(bootarg_vnode_iocount_trace == 0 && |
11704 | bootarg_uthread_iocount_trace == 0)) { |
11705 | return; |
11706 | } |
11707 | |
11708 | #if CONFIG_TRIGGERS |
11709 | if (vp->v_resolve) { |
11710 | return; |
11711 | } |
11712 | #endif |
11713 | if ((vp->v_flag & VSYSTEM)) { |
11714 | return; |
11715 | } |
11716 | |
11717 | if (bootarg_vnode_iocount_trace) { |
11718 | record_iocount_trace_vnode(vp, |
11719 | (count > 0) ? IOCOUNT_TRACE_VGET : IOCOUNT_TRACE_VPUT); |
11720 | } |
11721 | if (bootarg_uthread_iocount_trace) { |
11722 | record_iocount_trace_uthread(vp, count); |
11723 | } |
11724 | } |
11725 | #endif /* CONFIG_IOCOUNT_TRACE */ |
11726 | |
11727 | #if CONFIG_TRIGGERS |
11728 | #define __triggers_unused |
11729 | #else |
11730 | #define __triggers_unused __unused |
11731 | #endif |
11732 | |
11733 | resolver_result_t |
11734 | vfs_resolver_result(__triggers_unused uint32_t seq, __triggers_unused enum resolver_status stat, __triggers_unused int aux) |
11735 | { |
11736 | #if CONFIG_TRIGGERS |
11737 | /* |
11738 | * |<--- 32 --->|<--- 28 --->|<- 4 ->| |
11739 | * sequence auxiliary status |
11740 | */ |
11741 | return (((uint64_t)seq) << 32) | |
11742 | (((uint64_t)(aux & 0x0fffffff)) << 4) | |
11743 | (uint64_t)(stat & 0x0000000F); |
11744 | #else |
11745 | return (0x0ULL) | (((uint64_t)ENOTSUP) << 4) | (((uint64_t)RESOLVER_ERROR) & 0xF); |
11746 | #endif |
11747 | } |
11748 | |
11749 | #if CONFIG_TRIGGERS |
11750 | |
11751 | #define TRIG_DEBUG 0 |
11752 | |
11753 | #if TRIG_DEBUG |
11754 | #define TRIG_LOG(...) do { printf("%s: ", __FUNCTION__); printf(__VA_ARGS__); } while (0) |
11755 | #else |
11756 | #define TRIG_LOG(...) |
11757 | #endif |
11758 | |
11759 | /* |
11760 | * Resolver result functions |
11761 | */ |
11762 | |
11763 | |
11764 | enum resolver_status |
11765 | vfs_resolver_status(resolver_result_t result) |
11766 | { |
11767 | /* lower 4 bits is status */ |
11768 | return result & 0x0000000F; |
11769 | } |
11770 | |
11771 | uint32_t |
11772 | vfs_resolver_sequence(resolver_result_t result) |
11773 | { |
11774 | /* upper 32 bits is sequence */ |
11775 | return (uint32_t)(result >> 32); |
11776 | } |
11777 | |
11778 | int |
11779 | vfs_resolver_auxiliary(resolver_result_t result) |
11780 | { |
11781 | /* 28 bits of auxiliary */ |
11782 | return (int)(((uint32_t)(result & 0xFFFFFFF0)) >> 4); |
11783 | } |
11784 | |
11785 | /* |
11786 | * SPI |
11787 | * Call in for resolvers to update vnode trigger state |
11788 | */ |
11789 | int |
11790 | vnode_trigger_update(vnode_t vp, resolver_result_t result) |
11791 | { |
11792 | vnode_resolve_t rp; |
11793 | uint32_t seq; |
11794 | enum resolver_status stat; |
11795 | |
11796 | if (vp->v_resolve == NULL) { |
11797 | return EINVAL; |
11798 | } |
11799 | |
11800 | stat = vfs_resolver_status(result); |
11801 | seq = vfs_resolver_sequence(result); |
11802 | |
11803 | if ((stat != RESOLVER_RESOLVED) && (stat != RESOLVER_UNRESOLVED)) { |
11804 | return EINVAL; |
11805 | } |
11806 | |
11807 | rp = vp->v_resolve; |
11808 | lck_mtx_lock(lck: &rp->vr_lock); |
11809 | |
11810 | if (seq > rp->vr_lastseq) { |
11811 | if (stat == RESOLVER_RESOLVED) { |
11812 | rp->vr_flags |= VNT_RESOLVED; |
11813 | } else { |
11814 | rp->vr_flags &= ~VNT_RESOLVED; |
11815 | } |
11816 | |
11817 | rp->vr_lastseq = seq; |
11818 | } |
11819 | |
11820 | lck_mtx_unlock(lck: &rp->vr_lock); |
11821 | |
11822 | return 0; |
11823 | } |
11824 | |
11825 | static int |
11826 | vnode_resolver_attach(vnode_t vp, vnode_resolve_t rp, boolean_t ref) |
11827 | { |
11828 | int error; |
11829 | |
11830 | vnode_lock_spin(vp); |
11831 | if (vp->v_resolve != NULL) { |
11832 | vnode_unlock(vp); |
11833 | return EINVAL; |
11834 | } else { |
11835 | vp->v_resolve = rp; |
11836 | } |
11837 | vnode_unlock(vp); |
11838 | |
11839 | if (ref) { |
11840 | error = vnode_ref_ext(vp, O_EVTONLY, VNODE_REF_FORCE); |
11841 | if (error != 0) { |
11842 | panic("VNODE_REF_FORCE didn't help..." ); |
11843 | } |
11844 | } |
11845 | |
11846 | return 0; |
11847 | } |
11848 | |
11849 | /* |
11850 | * VFS internal interfaces for vnode triggers |
11851 | * |
11852 | * vnode must already have an io count on entry |
11853 | * v_resolve is stable when io count is non-zero |
11854 | */ |
11855 | static int |
11856 | vnode_resolver_create(mount_t mp, vnode_t vp, struct vnode_trigger_param *tinfo, boolean_t external) |
11857 | { |
11858 | vnode_resolve_t rp; |
11859 | int result; |
11860 | char byte; |
11861 | |
11862 | #if 1 |
11863 | /* minimum pointer test (debugging) */ |
11864 | if (tinfo->vnt_data) { |
11865 | byte = *((char *)tinfo->vnt_data); |
11866 | } |
11867 | #endif |
11868 | rp = kalloc_type(struct vnode_resolve, Z_WAITOK | Z_NOFAIL); |
11869 | |
11870 | lck_mtx_init(lck: &rp->vr_lock, grp: &trigger_vnode_lck_grp, attr: &trigger_vnode_lck_attr); |
11871 | |
11872 | rp->vr_resolve_func = tinfo->vnt_resolve_func; |
11873 | rp->vr_unresolve_func = tinfo->vnt_unresolve_func; |
11874 | rp->vr_rearm_func = tinfo->vnt_rearm_func; |
11875 | rp->vr_reclaim_func = tinfo->vnt_reclaim_func; |
11876 | rp->vr_data = tinfo->vnt_data; |
11877 | rp->vr_lastseq = 0; |
11878 | rp->vr_flags = tinfo->vnt_flags & VNT_VALID_MASK; |
11879 | if (external) { |
11880 | rp->vr_flags |= VNT_EXTERNAL; |
11881 | } |
11882 | |
result = vnode_resolver_attach(vp, rp, external);
11884 | if (result != 0) { |
11885 | goto out; |
11886 | } |
11887 | |
11888 | if (mp) { |
11889 | OSAddAtomic(1, &mp->mnt_numtriggers); |
11890 | } |
11891 | |
11892 | return result; |
11893 | |
11894 | out: |
11895 | kfree_type(struct vnode_resolve, rp); |
11896 | return result; |
11897 | } |
11898 | |
11899 | static void |
11900 | vnode_resolver_release(vnode_resolve_t rp) |
11901 | { |
11902 | /* |
11903 | * Give them a chance to free any private data |
11904 | */ |
11905 | if (rp->vr_data && rp->vr_reclaim_func) { |
11906 | rp->vr_reclaim_func(NULLVP, rp->vr_data); |
11907 | } |
11908 | |
11909 | lck_mtx_destroy(lck: &rp->vr_lock, grp: &trigger_vnode_lck_grp); |
11910 | kfree_type(struct vnode_resolve, rp); |
11911 | } |
11912 | |
11913 | /* Called after the vnode has been drained */ |
11914 | static void |
11915 | vnode_resolver_detach(vnode_t vp) |
11916 | { |
11917 | vnode_resolve_t rp; |
11918 | mount_t mp; |
11919 | |
11920 | mp = vnode_mount(vp); |
11921 | |
11922 | vnode_lock(vp); |
11923 | rp = vp->v_resolve; |
11924 | vp->v_resolve = NULL; |
11925 | vnode_unlock(vp); |
11926 | |
11927 | if ((rp->vr_flags & VNT_EXTERNAL) != 0) { |
vnode_rele_ext(vp, O_EVTONLY, 1);
11929 | } |
11930 | |
11931 | vnode_resolver_release(rp); |
11932 | |
11933 | /* Keep count of active trigger vnodes per mount */ |
11934 | OSAddAtomic(-1, &mp->mnt_numtriggers); |
11935 | } |
11936 | |
11937 | __private_extern__ |
11938 | void |
11939 | vnode_trigger_rearm(vnode_t vp, vfs_context_t ctx) |
11940 | { |
11941 | vnode_resolve_t rp; |
11942 | resolver_result_t result; |
11943 | enum resolver_status status; |
11944 | uint32_t seq; |
11945 | |
11946 | if ((vp->v_resolve == NULL) || |
11947 | (vp->v_resolve->vr_rearm_func == NULL) || |
11948 | (vp->v_resolve->vr_flags & VNT_AUTO_REARM) == 0) { |
11949 | return; |
11950 | } |
11951 | |
11952 | rp = vp->v_resolve; |
	lck_mtx_lock(&rp->vr_lock);
11954 | |
11955 | /* |
11956 | * Check if VFS initiated this unmount. If so, we'll catch it after the unresolve completes. |
11957 | */ |
11958 | if (rp->vr_flags & VNT_VFS_UNMOUNTED) { |
		lck_mtx_unlock(&rp->vr_lock);
11960 | return; |
11961 | } |
11962 | |
11963 | /* Check if this vnode is already armed */ |
11964 | if ((rp->vr_flags & VNT_RESOLVED) == 0) { |
		lck_mtx_unlock(&rp->vr_lock);
11966 | return; |
11967 | } |
11968 | |
	lck_mtx_unlock(&rp->vr_lock);
11970 | |
11971 | result = rp->vr_rearm_func(vp, 0, rp->vr_data, ctx); |
11972 | status = vfs_resolver_status(result); |
11973 | seq = vfs_resolver_sequence(result); |
11974 | |
	lck_mtx_lock(&rp->vr_lock);
11976 | if (seq > rp->vr_lastseq) { |
11977 | if (status == RESOLVER_UNRESOLVED) { |
11978 | rp->vr_flags &= ~VNT_RESOLVED; |
11979 | } |
11980 | rp->vr_lastseq = seq; |
11981 | } |
	lck_mtx_unlock(&rp->vr_lock);
11983 | } |
11984 | |
11985 | __private_extern__ |
11986 | int |
11987 | vnode_trigger_resolve(vnode_t vp, struct nameidata *ndp, vfs_context_t ctx) |
11988 | { |
11989 | vnode_resolve_t rp; |
11990 | enum path_operation op; |
11991 | resolver_result_t result; |
11992 | enum resolver_status status; |
11993 | uint32_t seq; |
11994 | |
11995 | /* |
11996 | * N.B. we cannot call vfs_context_can_resolve_triggers() |
11997 | * here because we really only want to suppress that in |
11998 | * the event the trigger will be resolved by something in |
11999 | * user-space. Any triggers that are resolved by the kernel |
12000 | * do not pose a threat of deadlock. |
12001 | */ |
12002 | |
12003 | /* Only trigger on topmost vnodes */ |
12004 | if ((vp->v_resolve == NULL) || |
12005 | (vp->v_resolve->vr_resolve_func == NULL) || |
12006 | (vp->v_mountedhere != NULL)) { |
12007 | return 0; |
12008 | } |
12009 | |
12010 | rp = vp->v_resolve; |
	lck_mtx_lock(&rp->vr_lock);
12012 | |
12013 | /* Check if this vnode is already resolved */ |
12014 | if (rp->vr_flags & VNT_RESOLVED) { |
		lck_mtx_unlock(&rp->vr_lock);
12016 | return 0; |
12017 | } |
12018 | |
	lck_mtx_unlock(&rp->vr_lock);
12020 | |
12021 | #if CONFIG_MACF |
12022 | if ((rp->vr_flags & VNT_KERN_RESOLVE) == 0) { |
12023 | /* |
		 * VNT_KERN_RESOLVE indicates this trigger takes no parameters
		 * from the accessing process other than the act of access
		 * itself. All other triggers must be checked.
12027 | */ |
		int rv = mac_vnode_check_trigger_resolve(ctx, vp, &ndp->ni_cnd);
12029 | if (rv != 0) { |
12030 | return rv; |
12031 | } |
12032 | } |
12033 | #endif |
12034 | |
12035 | /* |
12036 | * XXX |
12037 | * assumes that resolver will not access this trigger vnode (otherwise the kernel will deadlock) |
	 * is there any way to know this???
12039 | * there can also be other legitimate lookups in parallel |
12040 | * |
12041 | * XXX - should we call this on a separate thread with a timeout? |
12042 | * |
	 * XXX - should we use ISLASTCN to pick the op value??? Perhaps only leaves should
	 * get the richer set and non-leaves should get generic OP_LOOKUP? TBD
12045 | */ |
	op = (ndp->ni_op < OP_MAXOP) ? ndp->ni_op : OP_LOOKUP;
12047 | |
12048 | result = rp->vr_resolve_func(vp, &ndp->ni_cnd, op, 0, rp->vr_data, ctx); |
12049 | status = vfs_resolver_status(result); |
12050 | seq = vfs_resolver_sequence(result); |
12051 | |
	lck_mtx_lock(&rp->vr_lock);
12053 | if (seq > rp->vr_lastseq) { |
12054 | if (status == RESOLVER_RESOLVED) { |
12055 | rp->vr_flags |= VNT_RESOLVED; |
12056 | } |
12057 | rp->vr_lastseq = seq; |
12058 | } |
	lck_mtx_unlock(&rp->vr_lock);
12060 | |
12061 | /* On resolver errors, propagate the error back up */ |
12062 | return status == RESOLVER_ERROR ? vfs_resolver_auxiliary(result) : 0; |
12063 | } |
12064 | |
12065 | static int |
12066 | vnode_trigger_unresolve(vnode_t vp, int flags, vfs_context_t ctx) |
12067 | { |
12068 | vnode_resolve_t rp; |
12069 | resolver_result_t result; |
12070 | enum resolver_status status; |
12071 | uint32_t seq; |
12072 | |
12073 | if ((vp->v_resolve == NULL) || (vp->v_resolve->vr_unresolve_func == NULL)) { |
12074 | return 0; |
12075 | } |
12076 | |
12077 | rp = vp->v_resolve; |
	lck_mtx_lock(&rp->vr_lock);
12079 | |
12080 | /* Check if this vnode is already resolved */ |
12081 | if ((rp->vr_flags & VNT_RESOLVED) == 0) { |
12082 | printf("vnode_trigger_unresolve: not currently resolved\n" ); |
		lck_mtx_unlock(&rp->vr_lock);
12084 | return 0; |
12085 | } |
12086 | |
12087 | rp->vr_flags |= VNT_VFS_UNMOUNTED; |
12088 | |
	lck_mtx_unlock(&rp->vr_lock);
12090 | |
12091 | /* |
12092 | * XXX |
12093 | * assumes that resolver will not access this trigger vnode (otherwise the kernel will deadlock) |
12094 | * there can also be other legitimate lookups in parallel |
12095 | * |
12096 | * XXX - should we call this on a separate thread with a timeout? |
12097 | */ |
12098 | |
12099 | result = rp->vr_unresolve_func(vp, flags, rp->vr_data, ctx); |
12100 | status = vfs_resolver_status(result); |
12101 | seq = vfs_resolver_sequence(result); |
12102 | |
	lck_mtx_lock(&rp->vr_lock);
12104 | if (seq > rp->vr_lastseq) { |
12105 | if (status == RESOLVER_UNRESOLVED) { |
12106 | rp->vr_flags &= ~VNT_RESOLVED; |
12107 | } |
12108 | rp->vr_lastseq = seq; |
12109 | } |
12110 | rp->vr_flags &= ~VNT_VFS_UNMOUNTED; |
	lck_mtx_unlock(&rp->vr_lock);
12112 | |
12113 | /* On resolver errors, propagate the error back up */ |
12114 | return status == RESOLVER_ERROR ? vfs_resolver_auxiliary(result) : 0; |
12115 | } |
12116 | |
12117 | static int |
12118 | triggerisdescendant(mount_t mp, mount_t rmp) |
12119 | { |
12120 | int match = FALSE; |
12121 | |
12122 | /* |
12123 | * walk up vnode covered chain looking for a match |
12124 | */ |
12125 | name_cache_lock_shared(); |
12126 | |
12127 | while (1) { |
12128 | vnode_t vp; |
12129 | |
12130 | /* did we encounter "/" ? */ |
12131 | if (mp->mnt_flag & MNT_ROOTFS) { |
12132 | break; |
12133 | } |
12134 | |
12135 | vp = mp->mnt_vnodecovered; |
12136 | if (vp == NULLVP) { |
12137 | break; |
12138 | } |
12139 | |
12140 | mp = vp->v_mount; |
12141 | if (mp == rmp) { |
12142 | match = TRUE; |
12143 | break; |
12144 | } |
12145 | } |
12146 | |
12147 | name_cache_unlock(); |
12148 | |
12149 | return match; |
12150 | } |
12151 | |
12152 | struct trigger_unmount_info { |
12153 | vfs_context_t ctx; |
12154 | mount_t top_mp; |
12155 | vnode_t trigger_vp; |
12156 | mount_t trigger_mp; |
12157 | uint32_t trigger_vid; |
12158 | int flags; |
12159 | }; |
12160 | |
12161 | static int |
12162 | trigger_unmount_callback(mount_t mp, void * arg) |
12163 | { |
12164 | struct trigger_unmount_info * infop = (struct trigger_unmount_info *)arg; |
12165 | boolean_t mountedtrigger = FALSE; |
12166 | |
12167 | /* |
12168 | * When we encounter the top level mount we're done |
12169 | */ |
12170 | if (mp == infop->top_mp) { |
12171 | return VFS_RETURNED_DONE; |
12172 | } |
12173 | |
12174 | if ((mp->mnt_vnodecovered == NULL) || |
	    (vnode_getwithref(mp->mnt_vnodecovered) != 0)) {
12176 | return VFS_RETURNED; |
12177 | } |
12178 | |
12179 | if ((mp->mnt_vnodecovered->v_mountedhere == mp) && |
12180 | (mp->mnt_vnodecovered->v_resolve != NULL) && |
12181 | (mp->mnt_vnodecovered->v_resolve->vr_flags & VNT_RESOLVED)) { |
12182 | mountedtrigger = TRUE; |
12183 | } |
	vnode_put(mp->mnt_vnodecovered);
12185 | |
12186 | /* |
	 * When we encounter a mounted trigger, check if it's under the top level mount.
12188 | */ |
	if (!mountedtrigger || !triggerisdescendant(mp, infop->top_mp)) {
12190 | return VFS_RETURNED; |
12191 | } |
12192 | |
12193 | /* |
	 * Process any pending nested mount (now that it's not referenced)
12195 | */ |
12196 | if ((infop->trigger_vp != NULLVP) && |
	    (vnode_getwithvid(infop->trigger_vp, infop->trigger_vid) == 0)) {
12198 | vnode_t vp = infop->trigger_vp; |
12199 | int error; |
12200 | |
		vnode_drop(infop->trigger_vp);
12202 | infop->trigger_vp = NULLVP; |
12203 | |
12204 | if (mp == vp->v_mountedhere) { |
12205 | vnode_put(vp); |
12206 | printf("trigger_unmount_callback: unexpected match '%s'\n" , |
12207 | mp->mnt_vfsstat.f_mntonname); |
12208 | return VFS_RETURNED; |
12209 | } |
12210 | if (infop->trigger_mp != vp->v_mountedhere) { |
12211 | vnode_put(vp); |
12212 | printf("trigger_unmount_callback: trigger mnt changed! (%p != %p)\n" , |
12213 | infop->trigger_mp, vp->v_mountedhere); |
12214 | goto savenext; |
12215 | } |
12216 | |
		error = vnode_trigger_unresolve(vp, infop->flags, infop->ctx);
12218 | vnode_put(vp); |
12219 | if (error) { |
12220 | printf("unresolving: '%s', err %d\n" , |
12221 | vp->v_mountedhere ? vp->v_mountedhere->mnt_vfsstat.f_mntonname : |
12222 | "???" , error); |
12223 | return VFS_RETURNED_DONE; /* stop iteration on errors */ |
12224 | } |
12225 | } else if (infop->trigger_vp != NULLVP) { |
		vnode_drop(infop->trigger_vp);
12227 | } |
12228 | |
12229 | savenext: |
12230 | /* |
12231 | * We can't call resolver here since we hold a mount iter |
12232 | * ref on mp so save its covered vp for later processing |
12233 | */ |
12234 | infop->trigger_vp = mp->mnt_vnodecovered; |
12235 | if ((infop->trigger_vp != NULLVP) && |
	    (vnode_getwithref(infop->trigger_vp) == 0)) {
12237 | if (infop->trigger_vp->v_mountedhere == mp) { |
12238 | infop->trigger_vid = infop->trigger_vp->v_id; |
			vnode_hold(infop->trigger_vp);
12240 | infop->trigger_mp = mp; |
12241 | } |
		vnode_put(infop->trigger_vp);
12243 | } |
12244 | |
12245 | return VFS_RETURNED; |
12246 | } |
12247 | |
12248 | /* |
12249 | * Attempt to unmount any trigger mounts nested underneath a mount. |
12250 | * This is a best effort attempt and no retries are performed here. |
12251 | * |
 * Note: mp->mnt_rwlock is held exclusively on entry (so be careful).
12253 | */ |
12254 | __private_extern__ |
12255 | void |
12256 | vfs_nested_trigger_unmounts(mount_t mp, int flags, vfs_context_t ctx) |
12257 | { |
12258 | struct trigger_unmount_info info; |
12259 | |
12260 | /* Must have trigger vnodes */ |
12261 | if (mp->mnt_numtriggers == 0) { |
12262 | return; |
12263 | } |
12264 | /* Avoid recursive requests (by checking covered vnode) */ |
12265 | if ((mp->mnt_vnodecovered != NULL) && |
	    (vnode_getwithref(mp->mnt_vnodecovered) == 0)) {
12267 | boolean_t recursive = FALSE; |
12268 | |
12269 | if ((mp->mnt_vnodecovered->v_mountedhere == mp) && |
12270 | (mp->mnt_vnodecovered->v_resolve != NULL) && |
12271 | (mp->mnt_vnodecovered->v_resolve->vr_flags & VNT_VFS_UNMOUNTED)) { |
12272 | recursive = TRUE; |
12273 | } |
		vnode_put(mp->mnt_vnodecovered);
12275 | if (recursive) { |
12276 | return; |
12277 | } |
12278 | } |
12279 | |
12280 | /* |
12281 | * Attempt to unmount any nested trigger mounts (best effort) |
12282 | */ |
12283 | info.ctx = ctx; |
12284 | info.top_mp = mp; |
12285 | info.trigger_vp = NULLVP; |
12286 | info.trigger_vid = 0; |
12287 | info.trigger_mp = NULL; |
12288 | info.flags = flags; |
12289 | |
	(void) vfs_iterate(VFS_ITERATE_TAIL_FIRST, trigger_unmount_callback, &info);
12291 | |
12292 | /* |
	 * Process remaining nested mount (now that it's not referenced)
12294 | */ |
12295 | if ((info.trigger_vp != NULLVP) && |
	    (vnode_getwithvid(info.trigger_vp, info.trigger_vid) == 0)) {
12297 | vnode_t vp = info.trigger_vp; |
12298 | |
12299 | if (info.trigger_mp == vp->v_mountedhere) { |
12300 | (void) vnode_trigger_unresolve(vp, flags, ctx); |
12301 | } |
12302 | vnode_put(vp); |
12303 | vnode_drop(vp); |
12304 | } else if (info.trigger_vp != NULLVP) { |
		vnode_drop(info.trigger_vp);
12306 | } |
12307 | } |
12308 | |
12309 | int |
12310 | vfs_addtrigger(mount_t mp, const char *relpath, struct vnode_trigger_info *vtip, vfs_context_t ctx) |
12311 | { |
12312 | struct nameidata *ndp; |
12313 | int res; |
12314 | vnode_t rvp, vp; |
12315 | struct vnode_trigger_param vtp; |
12316 | |
12317 | /* |
	 * Must be called from a trigger callback, wherein the mount's rwlock is held.
12319 | */ |
	lck_rw_assert(&mp->mnt_rwlock, LCK_RW_ASSERT_HELD);
12321 | |
12322 | TRIG_LOG("Adding trigger at %s\n" , relpath); |
12323 | TRIG_LOG("Trying VFS_ROOT\n" ); |
12324 | |
12325 | ndp = kalloc_type(struct nameidata, Z_WAITOK | Z_NOFAIL); |
12326 | |
12327 | /* |
12328 | * We do a lookup starting at the root of the mountpoint, unwilling |
12329 | * to cross into other mountpoints. |
12330 | */ |
12331 | res = VFS_ROOT(mp, &rvp, ctx); |
12332 | if (res != 0) { |
12333 | goto out; |
12334 | } |
12335 | |
12336 | TRIG_LOG("Trying namei\n" ); |
12337 | |
12338 | NDINIT(ndp, LOOKUP, OP_LOOKUP, USEDVP | NOCROSSMOUNT | FOLLOW, UIO_SYSSPACE, |
12339 | CAST_USER_ADDR_T(relpath), ctx); |
12340 | ndp->ni_dvp = rvp; |
12341 | res = namei(ndp); |
12342 | if (res != 0) { |
		vnode_put(rvp);
12344 | goto out; |
12345 | } |
12346 | |
12347 | vp = ndp->ni_vp; |
12348 | nameidone(ndp); |
	vnode_put(rvp);
12350 | |
12351 | TRIG_LOG("Trying vnode_resolver_create()\n" ); |
12352 | |
12353 | /* |
12354 | * Set up blob. vnode_create() takes a larger structure |
12355 | * with creation info, and we needed something different |
12356 | * for this case. One needs to win, or we need to munge both; |
12357 | * vnode_create() wins. |
12358 | */ |
	bzero(&vtp, sizeof(vtp));
12360 | vtp.vnt_resolve_func = vtip->vti_resolve_func; |
12361 | vtp.vnt_unresolve_func = vtip->vti_unresolve_func; |
12362 | vtp.vnt_rearm_func = vtip->vti_rearm_func; |
12363 | vtp.vnt_reclaim_func = vtip->vti_reclaim_func; |
12365 | vtp.vnt_data = vtip->vti_data; |
12366 | vtp.vnt_flags = vtip->vti_flags; |
12367 | |
	res = vnode_resolver_create(mp, vp, &vtp, TRUE);
12369 | vnode_put(vp); |
12370 | out: |
12371 | kfree_type(struct nameidata, ndp); |
12372 | TRIG_LOG("Returning %d\n" , res); |
12373 | return res; |
12374 | } |
12375 | |
12376 | #endif /* CONFIG_TRIGGERS */ |
12377 | |
12378 | vm_offset_t |
12379 | kdebug_vnode(vnode_t vp) |
12380 | { |
12381 | return VM_KERNEL_ADDRPERM(vp); |
12382 | } |
12383 | |
12384 | static int flush_cache_on_write = 0; |
12385 | SYSCTL_INT(_kern, OID_AUTO, flush_cache_on_write, |
12386 | CTLFLAG_RW | CTLFLAG_LOCKED, &flush_cache_on_write, 0, |
12387 | "always flush the drive cache on writes to uncached files" ); |
12388 | |
12389 | int |
12390 | vnode_should_flush_after_write(vnode_t vp, int ioflag) |
12391 | { |
12392 | return flush_cache_on_write |
12393 | && (ISSET(ioflag, IO_NOCACHE) || vnode_isnocache(vp)); |
12394 | } |
12395 | |
12396 | /* |
12397 | * sysctl for use by disk I/O tracing tools to get the list of existing |
12398 | * vnodes' paths |
12399 | */ |
12400 | |
12401 | #define NPATH_WORDS (MAXPATHLEN / sizeof(unsigned long)) |
12402 | struct vnode_trace_paths_context { |
12403 | uint64_t count; |
12404 | /* |
	 * Must be one less than a multiple of 4 for tracing: the first
	 * kdebug event carries the vnode plus 3 path words, and each
	 * subsequent event carries 4 path words.
12406 | */ |
12407 | unsigned long path[NPATH_WORDS + (4 - (NPATH_WORDS % 4)) - 1]; |
12408 | }; |
12409 | |
12410 | static int |
12411 | vnode_trace_path_callback(struct vnode *vp, void *vctx) |
12412 | { |
12413 | struct vnode_trace_paths_context *ctx = vctx; |
12414 | size_t path_len = sizeof(ctx->path); |
12415 | |
12416 | int getpath_len = (int)path_len; |
	if (vn_getpath(vp, (char *)ctx->path, &getpath_len) == 0) {
12418 | /* vn_getpath() NUL-terminates, and len includes the NUL. */ |
12419 | assert(getpath_len >= 0); |
12420 | path_len = (size_t)getpath_len; |
12421 | |
12422 | assert(path_len <= sizeof(ctx->path)); |
		kdebug_vfs_lookup(ctx->path, (int)path_len, vp,
12424 | KDBG_VFS_LOOKUP_FLAG_LOOKUP | KDBG_VFS_LOOKUP_FLAG_NOPROCFILT); |
12425 | |
12426 | if (++(ctx->count) == 1000) { |
12427 | thread_yield_to_preemption(); |
12428 | ctx->count = 0; |
12429 | } |
12430 | } |
12431 | |
12432 | return VNODE_RETURNED; |
12433 | } |
12434 | |
12435 | static int |
12436 | vfs_trace_paths_callback(mount_t mp, void *arg) |
12437 | { |
12438 | if (mp->mnt_flag & MNT_LOCAL) { |
		vnode_iterate(mp, VNODE_ITERATE_ALL, vnode_trace_path_callback, arg);
12440 | } |
12441 | |
12442 | return VFS_RETURNED; |
12443 | } |
12444 | |
12445 | static int sysctl_vfs_trace_paths SYSCTL_HANDLER_ARGS { |
12446 | struct vnode_trace_paths_context ctx; |
12447 | |
12448 | (void)oidp; |
12449 | (void)arg1; |
12450 | (void)arg2; |
12451 | (void)req; |
12452 | |
	if (!kauth_cred_issuser(kauth_cred_get())) {
12454 | return EPERM; |
12455 | } |
12456 | |
12457 | if (!kdebug_enable || !kdebug_debugid_enabled(VFS_LOOKUP)) { |
12458 | return EINVAL; |
12459 | } |
12460 | |
	bzero(&ctx, sizeof(struct vnode_trace_paths_context));
12462 | |
	vfs_iterate(0, vfs_trace_paths_callback, &ctx);
12464 | |
12465 | return 0; |
12466 | } |
12467 | |
SYSCTL_PROC(_vfs_generic, OID_AUTO, trace_paths, CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED, NULL, 0, &sysctl_vfs_trace_paths, "-", "trace_paths");
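
/*
 * Illustrative sketch (not part of the build): the handler takes no input and
 * returns no data, so a privileged tracing tool can trigger the path dump
 * simply by reading the node while kdebug tracing of VFS_LOOKUP is enabled:
 *
 *	#include <sys/sysctl.h>
 *
 *	(void)sysctlbyname("vfs.generic.trace_paths", NULL, NULL, NULL, 0);
 */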
12469 | |
12470 | #if CONFIG_FILE_LEASES |
12471 | #include <IOKit/IOBSD.h> |
12472 | #include <sys/file_internal.h> |
12473 | |
12474 | #define FILE_LEASES_ENTITLEMENT "com.apple.private.vfs.file-leases" |
12475 | |
12476 | static uint32_t lease_break_timeout = 60; /* secs */ |
12477 | |
12478 | #if (DEVELOPMENT || DEBUG) |
12479 | static int lease_debug = 0; |
12480 | static int lease_entitlement_override = 0; |
12481 | |
SYSCTL_NODE(_vfs, OID_AUTO, lease, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs lease");
SYSCTL_UINT(_vfs_lease, OID_AUTO, break_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, &lease_break_timeout, 0, "");
SYSCTL_INT(_vfs_lease, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED, &lease_debug, 0, "");
SYSCTL_INT(_vfs_lease, OID_AUTO, entitlement_override, CTLFLAG_RW | CTLFLAG_LOCKED, &lease_entitlement_override, 0, "");
12486 | |
12487 | #define LEASEDBG(fmt, args...) \ |
12488 | do { \ |
12489 | if (__improbable(lease_debug)) { \ |
12490 | pid_t cur_pid = proc_getpid(current_proc()); \ |
12491 | printf("%s(%d): " fmt "\n", __func__, cur_pid, ##args); \ |
12492 | } \ |
12493 | } while(0) |
12494 | #else |
12495 | #define LEASEDBG(fmt, args...) /**/ |
12496 | #endif /* (DEVELOPMENT || DEBUG) */ |
12497 | |
12498 | static bool |
12499 | allow_setlease(vfs_context_t ctx) |
12500 | { |
12501 | bool entitled; |
12502 | |
	entitled = IOTaskHasEntitlement(vfs_context_task(ctx),
12504 | FILE_LEASES_ENTITLEMENT); |
12505 | |
12506 | #if (DEVELOPMENT || DEBUG) |
12507 | if (!entitled) { |
12508 | entitled = (lease_entitlement_override == 1); |
12509 | } |
12510 | #endif |
12511 | |
12512 | return entitled; |
12513 | } |
12514 | |
12515 | static file_lease_t |
12516 | file_lease_alloc(struct fileglob *fg, int fl_type, pid_t pid) |
12517 | { |
12518 | file_lease_t fl; |
12519 | |
12520 | fl = kalloc_type(struct file_lease, Z_WAITOK); |
12521 | /* |
	 * Duplicated file descriptors created by dup() or fork() share the
	 * same 'fileglob', so the lease can be released or modified through
	 * any of the duplicated fds. Opening the same file again (by the same
	 * or a different process) produces a different 'fileglob', so a lease
	 * always follows a 'fileglob'.
12527 | */ |
12528 | fl->fl_fg = fg; |
12529 | fl->fl_type = fl_type; |
12530 | fl->fl_pid = pid; |
12531 | fl->fl_downgrade_start = fl->fl_release_start = 0; |
12532 | |
12533 | return fl; |
12534 | } |
12535 | |
12536 | static void |
12537 | file_lease_free(file_lease_t fl) |
12538 | { |
12539 | kfree_type(struct file_lease, fl); |
12540 | } |
12541 | |
12542 | /* |
 * A read lease can be placed only on a file/directory that is opened
 * read-only, which means no other process has the file/directory opened
 * read-write/write-only or mmap'ed writable.
 * A write lease can be placed on a file only if there are no other opens
 * of the file.
12548 | * |
12549 | * Needs to be called with vnode's lock held. |
12550 | */ |
12551 | static int |
12552 | check_for_open_conflict(vnode_t vp, struct fileglob *fg, int fl_type, |
12553 | int expcounts) |
12554 | { |
12555 | int error = 0; |
12556 | |
12557 | if (fl_type == F_RDLCK) { |
12558 | if (vp->v_writecount > expcounts && |
12559 | !(vp->v_writecount == 1 && (fg->fg_flag & FWRITE))) { |
12560 | error = EAGAIN; |
12561 | } else if (ubc_is_mapped_writable(vp)) { |
12562 | error = EAGAIN; |
12563 | } |
12564 | } else if (fl_type == F_WRLCK && vp->v_usecount > expcounts) { |
12565 | error = EAGAIN; |
12566 | } |
12567 | |
12568 | return error; |
12569 | } |
12570 | |
12571 | /* Needs to be called with vnode's lock held. */ |
12572 | static void |
12573 | modify_file_lease(vnode_t vp, file_lease_t fl, int new_fl_type, |
12574 | struct fileglob *new_fg) |
12575 | { |
12576 | LEASEDBG("fl %p changing fl_type from %d to %d (flags 0x%x)" , |
12577 | fl, fl->fl_type, new_fl_type, fl->fl_flags); |
12578 | |
12579 | fl->fl_type = new_fl_type; |
12580 | |
12581 | /* |
12582 | * The lease being modified may be using a different file |
12583 | * descriptor, so usurp the fileglob pointer here. In this |
12584 | * case the old descriptor no longer holds the lease. |
12585 | */ |
12586 | if (new_fg != NULL) { |
12587 | fl->fl_fg = new_fg; |
12588 | } |
12589 | |
12590 | if (fl->fl_flags & FL_FLAG_RELEASE_PENDING || |
12591 | fl->fl_flags & FL_FLAG_DOWNGRADE_PENDING) { |
		wakeup(&vp->v_leases);
12593 | } |
12594 | } |
12595 | |
12596 | static int |
12597 | acquire_file_lease(vnode_t vp, struct fileglob *fg, int fl_type, int expcounts, |
12598 | vfs_context_t ctx) |
12599 | { |
12600 | file_lease_t fl, new_fl, our_fl; |
12601 | int error; |
12602 | |
12603 | /* Make sure "expected count" looks sane. */ |
12604 | if (expcounts < 0 || expcounts > OPEN_MAX) { |
12605 | return EINVAL; |
12606 | } |
12607 | |
	new_fl = file_lease_alloc(fg, fl_type, vfs_context_pid(ctx));
12609 | |
12610 | vnode_lock(vp); |
12611 | |
12612 | error = check_for_open_conflict(vp, fg, fl_type, expcounts); |
12613 | if (error) { |
12614 | LEASEDBG("open conflict on vp %p type %d writecnt %d usecnt %d " |
12615 | "fl_type %d expcounts %d" , |
12616 | vp, vp->v_type, vp->v_writecount, vp->v_usecount, fl_type, |
12617 | expcounts); |
12618 | goto out; |
12619 | } |
12620 | |
12621 | our_fl = NULL; |
12622 | LIST_FOREACH(fl, &vp->v_leases, fl_link) { |
12623 | /* Does the existing lease belong to us? */ |
12624 | if (fl->fl_fg == new_fl->fl_fg || |
12625 | fl->fl_pid == new_fl->fl_pid) { |
12626 | our_fl = fl; |
12627 | continue; |
12628 | } |
12629 | |
12630 | /* |
12631 | * We don't allow placing a new write lease when there is an existing |
12632 | * read lease that doesn't belong to us. We also don't allow putting |
12633 | * a new read lease if there is a pending release on the lease. |
12634 | * Putting a new read lease when there is a pending downgrade on the |
12635 | * lease is fine as it won't cause lease conflict. |
12636 | */ |
12637 | if (fl_type == F_WRLCK || fl->fl_flags & FL_FLAG_RELEASE_PENDING) { |
12638 | break; |
12639 | } |
12640 | } |
12641 | |
12642 | /* |
12643 | * Found an existing lease that we don't own and it conflicts with the |
12644 | * new lease. |
12645 | */ |
	if (fl) {
		LEASEDBG("lease conflict on vp %p fl %p fl_type %d cur_fl_type %d",
		    vp, fl, fl_type, fl->fl_type);

		error = EAGAIN;
		goto out;
	}
12651 | |
12652 | /* Found an existing lease that we own so just change the type. */ |
12653 | if (our_fl) { |
12654 | LEASEDBG("replace lease on vp %p fl %p old_fl_type %d new_fl_type %d" , |
12655 | vp, our_fl, our_fl->fl_type, fl_type); |
12656 | |
		modify_file_lease(vp, our_fl, new_fl->fl_type, new_fl->fl_fg);
12658 | goto out; |
12659 | } |
12660 | |
12661 | LEASEDBG("acquired lease on vp %p type %d fl %p fl_type %d fg %p" , |
12662 | vp, vp->v_type, new_fl, new_fl->fl_type, new_fl->fl_fg); |
12663 | |
12664 | LIST_INSERT_HEAD(&vp->v_leases, new_fl, fl_link); |
12665 | new_fl = NULL; |
12666 | |
12667 | out: |
12668 | vnode_unlock(vp); |
12669 | |
12670 | if (new_fl) { |
		file_lease_free(new_fl);
12672 | } |
12673 | |
12674 | return error; |
12675 | } |
12676 | |
12677 | static int |
12678 | release_file_lease(vnode_t vp, struct fileglob *fg) |
12679 | { |
12680 | file_lease_t fl, fl_tmp; |
12681 | int error = 0; |
12682 | |
12683 | LEASEDBG("request to release lease on vp %p type %d fg %p" , |
12684 | vp, vp->v_type, fg); |
12685 | |
12686 | vnode_lock(vp); |
12687 | |
12688 | LIST_FOREACH_SAFE(fl, &vp->v_leases, fl_link, fl_tmp) { |
12689 | if (fl->fl_fg == fg) { |
12690 | LEASEDBG("released lease on vp %p fl %p type %d" , |
12691 | vp, fl, fl->fl_type); |
12692 | |
12693 | LIST_REMOVE(fl, fl_link); |
12694 | modify_file_lease(vp, fl, F_UNLCK, NULL); |
12695 | break; |
12696 | } |
12697 | } |
12698 | |
12699 | vnode_unlock(vp); |
12700 | |
12701 | if (fl) { |
12702 | file_lease_free(fl); |
12703 | } else { |
12704 | error = ENOLCK; |
12705 | } |
12706 | |
12707 | return error; |
12708 | } |
12709 | |
12710 | /* |
12711 | * Acquire or release a file lease according to the given type (F_RDLCK, |
12712 | * F_WRLCK or F_UNLCK). |
12713 | * |
12714 | * Returns: 0 Success |
12715 | * EAGAIN Failed to acquire a file lease due to conflicting opens |
12716 | * ENOLCK Failed to release a file lease due to lease not found |
12717 | * EPERM Current task doesn't have the entitlement |
12718 | */ |
12719 | int |
12720 | vnode_setlease(vnode_t vp, struct fileglob *fg, int fl_type, int expcounts, |
12721 | vfs_context_t ctx) |
12722 | { |
12723 | int error; |
12724 | |
12725 | if (!allow_setlease(ctx)) { |
12726 | return EPERM; |
12727 | } |
12728 | |
12729 | error = (fl_type == F_UNLCK) ? release_file_lease(vp, fg) : |
12730 | acquire_file_lease(vp, fg, fl_type, expcounts, ctx); |
12731 | |
12732 | return error; |
12733 | } |
12734 | |
12735 | /* |
12736 | * Retrieve the currently in place lease for the file. |
12737 | * |
12738 | * Returns: |
12739 | * F_RDLCK Read lease |
12740 | * F_WRLCK Write lease |
12741 | * F_UNLCK No lease |
12742 | */ |
12743 | int |
12744 | vnode_getlease(vnode_t vp) |
12745 | { |
12746 | file_lease_t fl; |
12747 | int fl_type = F_UNLCK; |
12748 | |
12749 | vnode_lock(vp); |
12750 | |
12751 | /* |
12752 | * There should be only one type of lease in the list as read and write |
12753 | * leases can't co-exist for the same file. |
12754 | */ |
12755 | fl = LIST_FIRST(&vp->v_leases); |
12756 | if (fl) { |
12757 | fl_type = fl->fl_type; |
12758 | } |
12759 | |
12760 | vnode_unlock(vp); |
12761 | |
12762 | LEASEDBG("vp %p fl %p fl_type %d" , vp, fl, fl_type); |
12763 | |
12764 | return fl_type; |
12765 | } |
12766 | |
12767 | /* Must be called with vnode's lock held. */ |
12768 | static bool |
12769 | check_for_lease_conflict(vnode_t vp, int breaker_fl_type, vfs_context_t ctx) |
12770 | { |
12771 | file_lease_t fl; |
12772 | pid_t pid = vfs_context_pid(ctx); |
12773 | bool is_conflict = false; |
12774 | |
12775 | LIST_FOREACH(fl, &vp->v_leases, fl_link) { |
12776 | if ((fl->fl_type == F_WRLCK && fl->fl_pid != pid) || |
12777 | (breaker_fl_type == F_WRLCK && fl->fl_pid != pid)) { |
12778 | LEASEDBG("conflict detected on vp %p type %d fl_type %d " |
12779 | "breaker_fl_type %d" , |
12780 | vp, vp->v_type, fl->fl_type, breaker_fl_type); |
12781 | |
12782 | is_conflict = true; |
12783 | break; |
12784 | } |
12785 | } |
12786 | |
12787 | return is_conflict; |
12788 | } |
12789 | |
12790 | static uint64_t |
12791 | absolutetime_elapsed_in_secs(uint64_t start) |
12792 | { |
12793 | uint64_t elapsed, elapsed_sec; |
12794 | uint64_t now = mach_absolute_time(); |
12795 | |
12796 | elapsed = now - start; |
	absolutetime_to_nanoseconds(elapsed, &elapsed_sec);
12798 | elapsed_sec /= NSEC_PER_SEC; |
12799 | |
12800 | return elapsed_sec; |
12801 | } |
12802 | |
12803 | /* Must be called with vnode's lock held. */ |
12804 | static void |
12805 | handle_lease_break_timedout(vnode_t vp) |
12806 | { |
12807 | file_lease_t fl, fl_tmp; |
12808 | uint64_t elapsed_sec; |
12809 | |
12810 | LIST_FOREACH_SAFE(fl, &vp->v_leases, fl_link, fl_tmp) { |
12811 | if (fl->fl_flags & FL_FLAG_DOWNGRADE_PENDING) { |
			elapsed_sec = absolutetime_elapsed_in_secs(fl->fl_downgrade_start);
12813 | |
12814 | if (elapsed_sec >= lease_break_timeout) { |
12815 | LEASEDBG("force downgrade on vp %p for fl %p elapsed %llu " |
12816 | "timeout %u" , vp, fl, elapsed_sec, lease_break_timeout); |
12817 | |
12818 | fl->fl_flags &= ~FL_FLAG_DOWNGRADE_PENDING; |
12819 | fl->fl_downgrade_start = 0; |
12820 | modify_file_lease(vp, fl, F_RDLCK, NULL); |
12821 | continue; |
12822 | } |
12823 | } |
12824 | if (fl->fl_flags & FL_FLAG_RELEASE_PENDING) { |
			elapsed_sec = absolutetime_elapsed_in_secs(fl->fl_release_start);
12826 | |
12827 | if (elapsed_sec >= lease_break_timeout) { |
12828 | LEASEDBG("force release on vp %p for fl %p elapsed %llu " |
12829 | "timeout %u" , vp, fl, elapsed_sec, lease_break_timeout); |
12830 | |
12831 | LIST_REMOVE(fl, fl_link); |
12832 | file_lease_free(fl); |
12833 | continue; |
12834 | } |
12835 | } |
12836 | } |
12837 | |
12838 | /* Wakeup the lease breaker(s). */ |
	wakeup(&vp->v_leases);
12840 | } |
12841 | |
12842 | /* Must be called with vnode's lock held. */ |
12843 | static void |
12844 | wait_for_lease_break(vnode_t vp, int breaker_fl_type, vfs_context_t ctx) |
12845 | { |
12846 | file_lease_t fl; |
12847 | struct timespec ts; |
12848 | uint64_t elapsed_sec, start_time; |
12849 | int error; |
12850 | |
12851 | restart: |
12852 | fl = LIST_FIRST(&vp->v_leases); |
12853 | assert(fl); |
12854 | |
12855 | /* |
12856 | * In a rare case it is possible that the lease that we are blocked on has |
12857 | * been released and a new lease has been put in place after we are |
	 * signalled to wake up. In this particular case, we treat it as no
	 * conflict and proceed. This can only happen for directory leasing.
12860 | */ |
12861 | if ((fl->fl_flags & (FL_FLAG_DOWNGRADE_PENDING | FL_FLAG_RELEASE_PENDING)) == 0) { |
12862 | LEASEDBG("new lease in place on vp %p fl %p fl_type %d " |
12863 | "breaker_fl_type %d" , |
12864 | vp, fl, fl->fl_type, breaker_fl_type); |
12865 | |
12866 | return; |
12867 | } |
12868 | /* |
	 * Figure out which timer to use for the lease break timeout, as we
	 * could have both timers active. If both are active, pick the one
	 * with the earliest start time.
12872 | */ |
	if (fl->fl_release_start) {
		if (fl->fl_downgrade_start == 0 ||
		    fl->fl_release_start < fl->fl_downgrade_start) {
			start_time = fl->fl_release_start;
		} else {
			start_time = fl->fl_downgrade_start;
		}
	} else {
		start_time = fl->fl_downgrade_start;
	}
12883 | assert(start_time > 0); |
12884 | |
	elapsed_sec = absolutetime_elapsed_in_secs(start_time);
12886 | |
12887 | LEASEDBG("elapsed_sec %llu release_start %llu downgrade_start %llu" , |
12888 | elapsed_sec, fl->fl_release_start, fl->fl_downgrade_start); |
12889 | |
12890 | ts.tv_sec = (lease_break_timeout > elapsed_sec ? |
12891 | (lease_break_timeout - elapsed_sec) : 0); |
12892 | ts.tv_nsec = (ts.tv_sec == 0 ? 1 : 0); |
	error = msleep(&vp->v_leases, &vp->v_lock, PVFS, __func__, &ts);
12894 | |
	if (error != EWOULDBLOCK) {
		/*
		 * Woken up because the lease was released/downgraded by the
		 * lease holder. We don't expect any error from msleep() other
		 * than EWOULDBLOCK. Check if there are any further conflicts.
		 * If so, continue to wait for the next conflict to resolve.
		 */
12902 | if (check_for_lease_conflict(vp, breaker_fl_type, ctx)) { |
12903 | goto restart; |
12904 | } |
12905 | } else { |
12906 | /* |
		 * Woken up because the lease break timeout expired (EWOULDBLOCK).
12908 | * Break/downgrade all conflicting leases. |
12909 | */ |
12910 | handle_lease_break_timedout(vp); |
12911 | |
12912 | if (check_for_lease_conflict(vp, breaker_fl_type, ctx)) { |
12913 | goto restart; |
12914 | } |
12915 | } |
12916 | } |
12917 | |
12918 | /* Must be called with vnode's lock held. */ |
12919 | static void |
12920 | send_lease_break_event(vnode_t vp, uint32_t event) |
12921 | { |
12922 | if (vp->v_knotes.slh_first != NULL) { |
12923 | KNOTE(&vp->v_knotes, event); |
12924 | } |
12925 | } |
12926 | |
12927 | static bool |
12928 | is_dataless_file(vnode_t vp, vfs_context_t ctx) |
12929 | { |
12930 | struct vnode_attr va; |
12931 | bool is_dataless = false; |
12932 | int error; |
12933 | |
12934 | VATTR_INIT(&va); |
12935 | VATTR_WANTED(&va, va_flags); |
12936 | |
	error = vnode_getattr(vp, &va, ctx);
12938 | if (!error && (va.va_flags & SF_DATALESS)) { |
12939 | is_dataless = true; |
12940 | } |
12941 | |
12942 | return is_dataless; |
12943 | } |
12944 | |
12945 | /* |
 * Break the lease(s) in place for the file when there is a conflict.
 * This function returns 0 for almost all call sites. The only exception is
 * when it is called from open1() with the O_NONBLOCK flag and it would need
 * to block waiting for the lease conflict(s) to resolve; in that case
 * EWOULDBLOCK is returned.
12951 | */ |
12952 | int |
12953 | vnode_breaklease(vnode_t vp, uint32_t oflags, vfs_context_t ctx) |
12954 | { |
12955 | file_lease_t fl; |
12956 | uint64_t now; |
12957 | int fl_type; |
12958 | int error = 0; |
12959 | |
12960 | vnode_lock(vp); |
12961 | |
12962 | if (__probable(LIST_EMPTY(&vp->v_leases))) { |
12963 | goto out_unlock; |
12964 | } |
12965 | |
12966 | /* Determine the access mode requested by the lease breaker. */ |
12967 | fl_type = (oflags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC)) ? F_WRLCK : F_RDLCK; |
12968 | |
12969 | /* |
12970 | * If the lease-breaker is just reading, check that it can break |
12971 | * leases first. If the lease-breaker is writing, or if the |
12972 | * context was not specified, we always break. |
	 * We skip the lease break if the lease-breaker is a dataless
	 * manipulator and the file is dataless.
12975 | */ |
12976 | if ((fl_type == F_RDLCK && !vfs_context_can_break_leases(ctx)) || |
12977 | (vfs_context_is_dataless_manipulator(ctx) && (vp->v_type == VREG) && |
12978 | is_dataless_file(vp, ctx))) { |
12979 | goto out_unlock; |
12980 | } |
12981 | |
	if (!check_for_lease_conflict(vp, fl_type, ctx)) {
12983 | goto out_unlock; |
12984 | } |
12985 | |
12986 | now = mach_absolute_time(); |
12987 | |
12988 | LEASEDBG("break lease on vp %p type %d oflags 0x%x cur_time %llu" , |
12989 | vp, vp->v_type, oflags, now); |
12990 | |
12991 | /* |
	 * If we get to this point, then all lease(s) in place conflict and
	 * we need to send the lease break event to the lease holder(s).
12994 | * It is possible that a lease could have both downgrade and release events |
12995 | * pending triggered by multiple breakers trying to open the file in |
12996 | * different modes. Both events would have different lease break timers. |
12997 | * Consider the following case: |
12998 | * 1. Process A holds the write lease on file X. |
	 * 2. Process B opens the file X in read-only mode.
13000 | * This triggers downgrade lease event to Process A. |
13001 | * 3. While downgrade is pending, Process C opens the file X in read-write |
13002 | * mode. This triggers release lease event to Process A. |
13003 | */ |
13004 | LIST_FOREACH(fl, &vp->v_leases, fl_link) { |
13005 | if (fl_type == F_WRLCK) { |
13006 | /* File is opened for writing or truncate. */ |
13007 | if (fl->fl_flags & FL_FLAG_RELEASE_PENDING) { |
13008 | continue; |
13009 | } |
13010 | fl->fl_release_start = now; |
13011 | fl->fl_flags |= FL_FLAG_RELEASE_PENDING; |
13012 | send_lease_break_event(vp, NOTE_LEASE_RELEASE); |
13013 | } else { |
13014 | /* File is opened for reading. */ |
13015 | if (fl->fl_flags & FL_FLAG_DOWNGRADE_PENDING || |
13016 | fl->fl_flags & FL_FLAG_RELEASE_PENDING) { |
13017 | continue; |
13018 | } |
13019 | fl->fl_downgrade_start = now; |
13020 | fl->fl_flags |= FL_FLAG_DOWNGRADE_PENDING; |
13021 | send_lease_break_event(vp, NOTE_LEASE_DOWNGRADE); |
13022 | } |
13023 | } |
13024 | |
13025 | /* |
13026 | * If open is requested with O_NONBLOCK, then we can't block and wait for |
13027 | * the lease to be released/downgraded. Just bail out with EWOULDBLOCK. |
13028 | */ |
13029 | if (oflags & O_NONBLOCK) { |
13030 | error = EWOULDBLOCK; |
13031 | goto out; |
13032 | } |
13033 | |
	wait_for_lease_break(vp, fl_type, ctx);
13035 | |
13036 | out: |
13037 | LEASEDBG("break lease on vp %p oflags 0x%x, error %d" , vp, oflags, error); |
13038 | |
13039 | out_unlock: |
13040 | vnode_unlock(vp); |
13041 | |
13042 | return error; |
13043 | } |
13044 | |
13045 | /* |
13046 | * Get parent vnode by parent ID (only for file system that supports |
13047 | * MNTK_PATH_FROM_ID). |
13048 | * On success, the parent's vnode is returned with iocount held. |
13049 | */ |
13050 | static vnode_t |
13051 | vnode_getparent_byid(vnode_t vp) |
13052 | { |
13053 | struct vnode_attr va; |
13054 | vnode_t dvp = NULLVP; |
13055 | vfs_context_t ctx = vfs_context_current(); |
13056 | int error; |
13057 | |
13058 | if (!(vp->v_mount->mnt_kern_flag & MNTK_PATH_FROM_ID)) { |
13059 | goto out; |
13060 | } |
13061 | |
13062 | VATTR_INIT(&va); |
13063 | VATTR_WANTED(&va, va_parentid); |
13064 | |
13065 | /* Get the vnode's parent id from the file system. */ |
	error = vnode_getattr(vp, &va, ctx);
13067 | if (error || !VATTR_IS_SUPPORTED(&va, va_parentid)) { |
13068 | goto out; |
13069 | } |
13070 | |
13071 | /* |
13072 | * Ask the file system for the parent vnode. |
13073 | * We are ignoring the error here as we don't expect the parent vnode to be |
13074 | * populated on error. |
13075 | */ |
13076 | (void)VFS_VGET(vp->v_mount, (ino64_t)va.va_parentid, &dvp, ctx); |
13077 | |
13078 | out: |
13079 | return dvp; |
13080 | } |
13081 | |
13082 | /* |
13083 | * Break directory's lease. |
13084 | * If 'need_parent' is true, then parent is obtained via vnode_getparent() (or |
13085 | * vnode_getparent_byid()) on the provided 'vp'. |
13086 | */ |
13087 | void |
13088 | vnode_breakdirlease(vnode_t vp, bool need_parent, uint32_t oflags) |
13089 | { |
13090 | vnode_t dvp; |
13091 | |
13092 | if ((vnode_vtype(vp) != VREG && vnode_vtype(vp) != VDIR) || |
13093 | (vp == rootvnode)) { |
13094 | return; |
13095 | } |
13096 | |
13097 | /* |
13098 | * If parent is not provided, first try to get it from the name cache. |
	 * If that fails, we will attempt to ask the file system for the parent vnode.
13100 | * This is just a best effort as both attempts could still fail. |
13101 | */ |
13102 | if (need_parent) { |
13103 | dvp = vnode_getparent(vp); |
13104 | if (__improbable(dvp == NULLVP)) { |
13105 | dvp = vnode_getparent_byid(vp); |
13106 | } |
13107 | } else { |
13108 | dvp = vp; |
13109 | } |
13110 | |
13111 | if (__probable(dvp != NULLVP)) { |
13112 | /* Always break dir leases. */ |
		(void)vnode_breaklease(dvp, oflags, vfs_context_current());
13114 | } |
13115 | |
13116 | if (need_parent && (dvp != NULLVP)) { |
		vnode_put(dvp);
13118 | } |
13119 | } |
13120 | |
13121 | /* |
13122 | * Revoke all lease(s) in place for the file. |
13123 | * This is called when the vnode is reclaimed. |
13124 | */ |
13125 | void |
13126 | vnode_revokelease(vnode_t vp, bool locked) |
13127 | { |
13128 | file_lease_t fl, fl_tmp; |
13129 | bool need_wakeup = false; |
13130 | |
13131 | if ((vnode_vtype(vp) != VREG && vnode_vtype(vp) != VDIR)) { |
13132 | return; |
13133 | } |
13134 | |
13135 | if (!locked) { |
13136 | vnode_lock(vp); |
13137 | } |
13138 | |
13139 | LIST_FOREACH_SAFE(fl, &vp->v_leases, fl_link, fl_tmp) { |
13140 | LIST_REMOVE(fl, fl_link); |
13141 | file_lease_free(fl); |
13142 | need_wakeup = true; |
13143 | } |
13144 | |
13145 | /* Wakeup any lease breaker(s) that might be currently blocked. */ |
13146 | if (__improbable(need_wakeup)) { |
		wakeup(&vp->v_leases);
13148 | } |
13149 | |
13150 | if (!locked) { |
13151 | vnode_unlock(vp); |
13152 | } |
13153 | } |
13154 | |
13155 | #endif /* CONFIG_FILE_LEASES */ |
13156 | |