1/*
2 * Copyright (c) 1995-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67/*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74#include <sys/param.h>
75#include <sys/systm.h>
76#include <sys/namei.h>
77#include <sys/filedesc.h>
78#include <sys/kernel.h>
79#include <sys/file_internal.h>
80#include <sys/stat.h>
81#include <sys/vnode_internal.h>
82#include <sys/mount_internal.h>
83#include <sys/proc_internal.h>
84#include <sys/kauth.h>
85#include <sys/uio_internal.h>
86#include <kern/kalloc.h>
87#include <sys/mman.h>
88#include <sys/dirent.h>
89#include <sys/attr.h>
90#include <sys/sysctl.h>
91#include <sys/ubc.h>
92#include <sys/quota.h>
93#include <sys/kdebug.h>
94#include <sys/fsevents.h>
95#include <sys/imgsrc.h>
96#include <sys/sysproto.h>
97#include <sys/sysctl.h>
98#include <sys/xattr.h>
99#include <sys/fcntl.h>
100#include <sys/stdio.h>
101#include <sys/fsctl.h>
102#include <sys/ubc_internal.h>
103#include <sys/disk.h>
104#include <sys/content_protection.h>
105#include <sys/clonefile.h>
106#include <sys/snapshot.h>
107#include <sys/priv.h>
108#include <sys/fsgetpath.h>
109#include <machine/cons.h>
110#include <machine/limits.h>
111#include <miscfs/specfs/specdev.h>
112
113#include <vfs/vfs_disk_conditioner.h>
114#if CONFIG_EXCLAVES
115#include <vfs/vfs_exclave_fs.h>
116#endif
117
118#include <security/audit/audit.h>
119#include <bsm/audit_kevents.h>
120
121#include <mach/mach_types.h>
122#include <kern/kern_types.h>
123#include <kern/kalloc.h>
124#include <kern/task.h>
125
126#include <vm/vm_pageout.h>
127#include <vm/vm_protos.h>
128
129#include <libkern/OSAtomic.h>
130#include <os/atomic_private.h>
131#include <pexpert/pexpert.h>
132#include <IOKit/IOBSD.h>
133
134// deps for MIG call
135#include <kern/host.h>
136#include <kern/ipc_misc.h>
137#include <mach/host_priv.h>
138#include <mach/vfs_nspace.h>
139#include <os/log.h>
140
141#include <nfs/nfs_conf.h>
142
143#if ROUTEFS
144#include <miscfs/routefs/routefs.h>
145#endif /* ROUTEFS */
146
147#if CONFIG_MACF
148#include <security/mac.h>
149#include <security/mac_framework.h>
150#endif
151
152#if CONFIG_FSE
153#define GET_PATH(x) \
154 ((x) = get_pathbuff())
155#define RELEASE_PATH(x) \
156 release_pathbuff(x)
157#else
158#define GET_PATH(x) \
159 ((x) = zalloc(ZV_NAMEI))
160#define RELEASE_PATH(x) \
161 zfree(ZV_NAMEI, x)
162#endif /* CONFIG_FSE */
163
164#ifndef HFS_GET_BOOT_INFO
165#define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
166#endif
167
168#ifndef HFS_SET_BOOT_INFO
169#define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
170#endif
171
172#ifndef APFSIOC_REVERT_TO_SNAPSHOT
173#define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
174#endif
175
176extern void disk_conditioner_unmount(mount_t mp);
177
178/* struct for checkdirs iteration */
179struct cdirargs {
180 vnode_t olddp;
181 vnode_t newdp;
182};
183/* callback for checkdirs iteration */
184static int checkdirs_callback(proc_t p, void * arg);
185
186static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
187static int checkdirs(vnode_t olddp, vfs_context_t ctx);
188void enablequotas(struct mount *mp, vfs_context_t ctx);
189static int getfsstat_callback(mount_t mp, void * arg);
190static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
191static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
192static int sync_callback(mount_t, void *);
193static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
194 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
195 boolean_t partial_copy);
196static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
197static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
198 struct componentname *cnp, user_addr_t fsmountargs,
199 int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
200void vfs_notify_mount(vnode_t pdvp);
201
202int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
203
204struct fd_vn_data * fg_vn_data_alloc(void);
205
206/*
207 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
208 * Concurrent lookups (or lookups by ids) on hard links can cause the
209 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
210 * does) to return ENOENT as the path cannot be returned from the name cache
211 * alone. We have no option but to retry and hope to get one namei->reverse path
212 * generation done without an intervening lookup, lookup by id on the hard link
213 * item. This is only an issue for MAC hooks which cannot reenter the filesystem
214 * which currently are the MAC hooks for rename, unlink and rmdir.
215 */
216#define MAX_AUTHORIZE_ENOENT_RETRIES 1024
217
218/* Max retry limit for rename due to vnode recycling. */
219#define MAX_RENAME_ERECYCLE_RETRIES 1024
220
221static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
222 int unlink_flags);
223
224#ifdef CONFIG_IMGSRC_ACCESS
225static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
226static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
227static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
228static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
229static void mount_end_update(mount_t mp);
230static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
231#endif /* CONFIG_IMGSRC_ACCESS */
232
233//snapshot functions
234#if CONFIG_MNT_ROOTSNAP
235static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
236#else
237static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
238#endif
239
240__private_extern__
241int sync_internal(void);
242
243__private_extern__
244int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
245
246static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
247static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
248
249/* vars for sync mutex */
250static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
251static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
252
253extern lck_rw_t rootvnode_rw_lock;
254
255VFS_SMR_DECLARE;
256extern uint32_t nc_smr_enabled;
257
258/*
259 * incremented each time a mount or unmount operation occurs
260 * used to invalidate the cached value of the rootvp in the
261 * mount structure utilized by cache_lookup_path
262 */
263uint32_t mount_generation = 0;
264
265/* counts number of mount and unmount operations */
266unsigned int vfs_nummntops = 0;
267
268/* system-wide, per-boot unique mount ID */
269static _Atomic uint64_t mount_unique_id = 1;
270
271extern const struct fileops vnops;
272#if CONFIG_APPLEDOUBLE
273extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
274#endif /* CONFIG_APPLEDOUBLE */
275
276/* Maximum buffer length supported by fsgetpath(2) */
277#define FSGETPATH_MAXBUFLEN 8192
278
279/*
280 * Virtual File System System Calls
281 */
282
283/*
284 * Private in-kernel mounting spi (specific use-cases only)
285 */
286boolean_t
287vfs_iskernelmount(mount_t mp)
288{
289 return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
290}
291
/*
 * kernel_mount: perform a mount initiated from within the kernel.
 *
 * If 'vp' is NULLVP, 'path' is resolved with namei() to obtain the vnode
 * to be covered and its parent (both put again before return).  Otherwise
 * the caller supplies 'vp' and 'pvp' (with references held) and 'path' is
 * only recorded in the component name passed to mount_common().
 *
 * 'kern_flags' is masked to the sanitized subset and then tagged with
 * KERNEL_MOUNT_KMOUNT so mount_common() knows this is an in-kernel mount.
 *
 * Returns: 0 on success, otherwise an errno value.
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/* Restrict callers to the sanitized subset of kernel-mount flags. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(ndp: &nd);
		if (error) {
			/* Log lookup failures for snapshot/volume-by-role mounts. */
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/* Caller supplied the cover vnode; just record the path text. */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(s: pnbuf) + 1);
		did_namei = FALSE;
	}

	/* Mark this as an in-kernel mount for mount_common(). */
	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstypename: fstype, pvp, vp, cnp: &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    flags: syscall_flags, internal_flags: kern_flags, NULL, ctx);

	if (did_namei) {
		/* Drop the iocounts namei() took on vp and its parent. */
		vnode_put(vp);
		vnode_put(vp: pvp);
		nameidone(&nd);
	}

	return error;
}
341
/*
 * vfs_mount_at_path: kernel-internal convenience wrapper to mount a
 * filesystem of type 'fstype' at 'path'.  Currently restricted to the
 * "lifs" and "nfs" filesystems (ENOTSUP otherwise).
 *
 * 'flags' (VFS_MOUNT_FLAG_*) select the vfs context (caller's vs.
 * kernel's) and are translated into the matching KERNEL_MOUNT_* flags;
 * 'mnt_flags' are additional MNT_* flags OR'd with MNT_AUTOMOUNTED.
 *
 * Returns: 0 on success, otherwise an errno value (logged on failure).
 */
int
vfs_mount_at_path(const char *fstype, const char *path,
    vnode_t pvp, vnode_t vp, void *data, size_t datalen,
    int mnt_flags, int flags)
{
	int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
	int error, km_flags = 0;
	vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();

	/*
	 * This call is currently restricted to specific use cases.
	 */
	if ((strcmp(s1: fstype, s2: "lifs") != 0) && (strcmp(s1: fstype, s2: "nfs") != 0)) {
		return ENOTSUP;
	}

#if !defined(XNU_TARGET_OS_OSX)
	/* On embedded platforms, lifs mounts are never executable. */
	if (strcmp(fstype, "lifs") == 0) {
		syscall_flags |= MNT_NOEXEC;
	}
#endif

	/* Translate the public VFS_MOUNT_FLAG_* bits to KERNEL_MOUNT_* bits. */
	if (flags & VFS_MOUNT_FLAG_NOAUTH) {
		km_flags |= KERNEL_MOUNT_NOAUTH;
	}
	if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
		km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
	}

	error = kernel_mount(fstype, pvp, vp, path, data, datalen,
	    syscall_flags, kern_flags: km_flags, ctx);
	if (error) {
		printf("%s: mount on %s failed, error %d\n", __func__, path,
		    error);
	}

	return error;
}
380
381/*
382 * Mount a file system.
383 */
384/* ARGSUSED */
385int
386mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
387{
388 struct __mac_mount_args muap;
389
390 muap.type = uap->type;
391 muap.path = uap->path;
392 muap.flags = uap->flags;
393 muap.data = uap->data;
394 muap.mac_p = USER_ADDR_NULL;
395 return __mac_mount(p, &muap, retval);
396}
397
/*
 * fmount: mount a filesystem on the directory referenced by an open
 * file descriptor rather than by path.
 *
 * Indirect:	uap->fd		fd referencing the vnode to cover
 *		uap->type	filesystem type name (user address)
 *		uap->flags	generic mount flags (MNT_IMGSRC_BY_INDEX,
 *				MNT_ROOTFS and MNT_UNION are rejected)
 *		uap->data	filesystem-specific mount arguments
 *
 * Returns:	0		Success
 *		!0		errno value
 */
int
fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
{
	struct componentname cn;
	vfs_context_t ctx = vfs_context_current();
	size_t dummy = 0;
	int error;
	int flags = uap->flags;
	char fstypename[MFSNAMELEN];
	char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
	vnode_t pvp;
	vnode_t vp;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, flags);
	/* fstypename will get audited by mount_common */

	/* Sanity check the flags */
	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
		return ENOTSUP;
	}

	if (flags & MNT_UNION) {
		return EPERM;
	}

	error = copyinstr(uaddr: uap->type, kaddr: fstypename, MFSNAMELEN, done: &dummy);
	if (error) {
		return error;
	}

	/* Resolve the fd to a vnode and take an iocount on it. */
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(uap->fd);
		return error;
	}

	/*
	 * A parent vnode is required to mount here.  Without one, the
	 * vnode is either already covered / the root of a filesystem
	 * (EBUSY) or otherwise unusable as a cover vnode (EINVAL).
	 */
	pvp = vnode_getparent(vp);
	if (pvp == NULL) {
		if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
			error = EBUSY;
		} else {
			error = EINVAL;
		}
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	/* Build a component name carrying the covered vnode's path. */
	memset(s: &cn, c: 0, n: sizeof(struct componentname));
	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	cn.cn_pnlen = MAXPATHLEN;

	if ((error = vn_getpath(vp, pathbuf: cn.cn_pnbuf, len: &cn.cn_pnlen)) != 0) {
		zfree(ZV_NAMEI, cn.cn_pnbuf);
		vnode_put(vp: pvp);
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	error = mount_common(fstypename, pvp, vp, cnp: &cn, fsmountargs: uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);

	/* Release the path buffer, both vnode iocounts, and the fd reference. */
	zfree(ZV_NAMEI, cn.cn_pnbuf);
	vnode_put(vp: pvp);
	vnode_put(vp);
	file_drop(uap->fd);

	return error;
}
471
472#define MAX_GRAFT_METADATA_SIZE 16384 /* bytes */
473
474/*
475 * Get the size of a graft file (a manifest or payload file).
476 * The vp should be an iocounted vnode.
477 */
478static int
479get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
480{
481 struct stat64 sb = {};
482 int error;
483
484 *size = 0;
485
486 error = vn_stat(vp: graft_vp, sb: &sb, NULL, isstat64: 1, needsrealdev: 0, ctx: vctx);
487 if (error) {
488 return error;
489 }
490
491 if (sb.st_size == 0) {
492 error = ENODATA;
493 } else if ((size_t) sb.st_size > MAX_GRAFT_METADATA_SIZE) {
494 error = EFBIG;
495 } else {
496 *size = (size_t) sb.st_size;
497 }
498
499 return error;
500}
501
502/*
503 * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
504 * `size` must already be validated.
505 */
506static int
507read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
508{
509 return vn_rdwr(rw: UIO_READ, vp: graft_vp,
510 base: (caddr_t) buf, len: (int) size, /* offset */ 0,
511 segflg: UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
512 cred: vfs_context_ucred(ctx: vctx), /* resid */ NULL,
513 p: vfs_context_proc(ctx: vctx));
514}
515
516/*
517 * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
518 * and read it into `buf`.
519 */
static int
graft_secureboot_read_fd(int fd, vfs_context_t vctx, size_t *size, void *buf)
{
	vnode_t metadata_vp = NULLVP;
	int error;

	// Convert this graft fd to a vnode (takes an iocount on success).
	if ((error = vnode_getfromfd(ctx: vctx, fd, vpp: &metadata_vp)) != 0) {
		goto out;
	}

	// Get (and validate) size information.
	// `buf` is assumed to hold at least MAX_GRAFT_METADATA_SIZE bytes,
	// which bounds the validated size.
	if ((error = get_and_verify_graft_metadata_vp_size(graft_vp: metadata_vp, vctx, size)) != 0) {
		goto out;
	}

	// Read each file into the provided buffer - we must get the expected amount of bytes.
	if ((error = read_graft_metadata_vp(graft_vp: metadata_vp, vctx, size: *size, buf)) != 0) {
		goto out;
	}

out:
	// Drop the iocount taken by vnode_getfromfd().
	if (metadata_vp) {
		vnode_put(vp: metadata_vp);
		metadata_vp = NULLVP;
	}

	return error;
}
549
550/*
551 * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
552 * provided in `gfs`, saving the size of data read in `gfs`.
553 */
static int
graft_secureboot_read_metadata(secure_boot_cryptex_args_t *sbc_args, vfs_context_t vctx,
    fsioc_graft_fs_t *gfs)
{
	int error;

	// Read the authentic manifest.
	if ((error = graft_secureboot_read_fd(fd: sbc_args->sbc_authentic_manifest_fd, vctx,
	    size: &gfs->authentic_manifest_size, buf: gfs->authentic_manifest))) {
		return error;
	}

	// The user manifest is currently unused, but set its size.
	gfs->user_manifest_size = 0;

	// Read the payload.
	if ((error = graft_secureboot_read_fd(fd: sbc_args->sbc_payload_fd, vctx,
	    size: &gfs->payload_size, buf: gfs->payload))) {
		return error;
	}

	// Both reads succeeded; sizes in `gfs` reflect the bytes read.
	return 0;
}
577
578/*
579 * Call into the filesystem to verify and graft a cryptex.
580 */
static int
graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
    vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
{
	fsioc_graft_fs_t gfs = {};
	uint64_t graft_dir_ino = 0;
	struct stat64 sb = {};
	int error;

	// Pre-flight arguments.
	if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
		// Make sure that this graft version matches what we support.
		return ENOTSUP;
	} else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
		// For this type, cryptex VP must live on same volume as the target of graft.
		return EXDEV;
	} else if (mounton_vp && mounton_vp->v_type != VDIR) {
		// We cannot graft upon non-directories.
		return ENOTDIR;
	} else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
	    sbc_args->sbc_payload_fd < 0) {
		// We cannot graft without a manifest and payload.
		return EINVAL;
	}

	if (mounton_vp) {
		// Get the mounton's inode number.
		error = vn_stat(vp: mounton_vp, sb: &sb, NULL, isstat64: 1, needsrealdev: 0, ctx: vctx);
		if (error) {
			return error;
		}
		graft_dir_ino = (uint64_t) sb.st_ino;
	}

	// Create buffers (of our maximum-defined size) to store authentication info.
	gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
	gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);

	if (!gfs.authentic_manifest || !gfs.payload) {
		error = ENOMEM;
		goto out;
	}

	// Read our fd's into our buffers.
	// (Note that this will set the buffer size fields in `gfs`.)
	error = graft_secureboot_read_metadata(sbc_args, vctx, gfs: &gfs);
	if (error) {
		goto out;
	}

	// Translate the caller-visible SBC_* flags into FSCTL_GRAFT_* flags.
	gfs.graft_version = FSIOC_GRAFT_VERSION;
	gfs.graft_type = graft_type;
	gfs.graft_4cc = sbc_args->sbc_4cc;
	if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
	}
	if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
		gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
	}
	if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
		gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
	}
	if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
		gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
	}
	if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
		gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
	}
	if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
	}
	gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)

	// Call into the FS to perform the graft (and validation).
	error = VNOP_IOCTL(vp: cryptex_vp, FSIOC_GRAFT_FS, data: (caddr_t)&gfs, fflag: 0, ctx: vctx);

out:
	// Free the metadata buffers on all paths (success and failure).
	if (gfs.authentic_manifest) {
		kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
		gfs.authentic_manifest = NULL;
	}
	if (gfs.payload) {
		kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
		gfs.payload = NULL;
	}

	return error;
}
669
670#define GRAFTDMG_ENTITLEMENT "com.apple.private.vfs.graftdmg"
671
672/*
673 * Graft a cryptex disk image (via FD) onto the appropriate mount-point
674 * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
675 */
int
graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
{
	int ua_dmgfd = uap->dmg_fd;
	user_addr_t ua_mountdir = uap->mountdir;
	uint32_t ua_grafttype = uap->graft_type;
	user_addr_t ua_graftargs = uap->gda;

	graftdmg_args_un kern_gda = {};
	int error = 0;
	secure_boot_cryptex_args_t *sbc_args = NULL;

	vnode_t cryptex_vp = NULLVP;
	vnode_t mounton_vp = NULLVP;
	struct nameidata nd = {};
	vfs_context_t ctx = vfs_context_current();

	// Grafting is restricted to tasks holding the private entitlement.
	if (!IOTaskHasEntitlement(task: vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
		return EPERM;
	}

	// Copy the graft argument union in from user space.
	error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
	if (error) {
		return error;
	}

	// Copy mount dir in, if provided.
	if (ua_mountdir != USER_ADDR_NULL) {
		// Acquire vnode for mount-on path
		NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
		    UIO_USERSPACE, ua_mountdir, ctx);

		error = namei(ndp: &nd);
		if (error) {
			return error;
		}
		mounton_vp = nd.ni_vp;
	}

	// Convert fd to vnode.
	error = vnode_getfromfd(ctx, fd: ua_dmgfd, vpp: &cryptex_vp);
	if (error) {
		goto graftout;
	}

	// Validate the graft type, then hand off to the secure-boot graft path.
	if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_MAX) {
		error = EINVAL;
	} else {
		sbc_args = &kern_gda.sbc_args;
		error = graft_secureboot_cryptex(graft_type: ua_grafttype, sbc_args, vctx: ctx, cryptex_vp, mounton_vp);
	}

graftout:
	// Drop the iocounts taken above; nameidone() only if namei() ran.
	if (cryptex_vp) {
		vnode_put(vp: cryptex_vp);
		cryptex_vp = NULLVP;
	}
	if (mounton_vp) {
		vnode_put(vp: mounton_vp);
		mounton_vp = NULLVP;
	}
	if (ua_mountdir != USER_ADDR_NULL) {
		nameidone(&nd);
	}

	return error;
}
743
744/*
745 * Ungraft a cryptex disk image (via mount dir FD)
746 * { int ungraftdmg(const char *mountdir, uint64_t flags); }
747 */
int
ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
{
	int error = 0;
	user_addr_t ua_mountdir = uap->mountdir;
	fsioc_ungraft_fs_t ugfs;
	vnode_t mounton_vp = NULLVP;
	struct nameidata nd = {};
	vfs_context_t ctx = vfs_context_current();

	// Ungrafting requires the same private entitlement as grafting.
	if (!IOTaskHasEntitlement(task: vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
		return EPERM;
	}

	// No flags are defined yet and a mount dir is mandatory.
	if (uap->flags != 0 || ua_mountdir == USER_ADDR_NULL) {
		return EINVAL;
	}

	ugfs.ungraft_flags = 0;

	// Acquire vnode for mount-on path
	NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
	    UIO_USERSPACE, ua_mountdir, ctx);

	error = namei(ndp: &nd);
	if (error) {
		return error;
	}
	mounton_vp = nd.ni_vp;

	// Call into the FS to perform the ungraft
	error = VNOP_IOCTL(vp: mounton_vp, FSIOC_UNGRAFT_FS, data: (caddr_t)&ugfs, fflag: 0, ctx);

	// Release the iocount and namei state.
	vnode_put(vp: mounton_vp);
	nameidone(&nd);

	return error;
}
786
787
/*
 * Notify interested parties that a mount has occurred: post a VQ_MOUNT
 * vfs event, and a NOTE_WRITE knote on the parent directory of the
 * cover vnode so watchers of that directory see the change.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, data: (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
794
795/*
796 * __mac_mount:
797 * Mount a file system taking into account MAC label behavior.
798 * See mount(2) man page for more information
799 *
800 * Parameters: p Process requesting the mount
801 * uap User argument descriptor (see below)
802 * retval (ignored)
803 *
804 * Indirect: uap->type Filesystem type
805 * uap->path Path to mount
806 * uap->data Mount arguments
807 * uap->mac_p MAC info
808 * uap->flags Mount flags
809 *
810 *
811 * Returns: 0 Success
812 * !0 Not success
813 */
/* Set when something attempts to remount the root FS read/write; see the
 * CHECK_CS_VALIDATION_BITMAP note below. */
boolean_t root_fs_upgrade_try = FALSE;

int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;
	vnode_t vp = NULL;
	int need_nameidone = 0;
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;
	size_t labelsz = 0;
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uaddr: uap->type, kaddr: fstypename, MFSNAMELEN, done: &dummy);
	if (error) {
		return error;
	}

	/*
	 * Get the vnode to be covered (and its parent, via WANTPARENT)
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = namei(ndp: &nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		error = relocate_imageboot_source(pvp, vp, cnp: &nd.ni_cnd, fsname: fstypename,
		    ctx, is64bit: is_64bit, fsmountargs: uap->data, by_index: (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		/* Copy in the correctly-sized struct for the caller's ABI. */
		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = (user_size_t)mac64.m_buflen;
			mac.m_string = (user_addr_t)mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* Label must fit in bounds and hold at least one char + NUL. */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		labelsz = mac.m_buflen;
		labelstr = kalloc_data(labelsz, Z_WAITOK);
		error = copyinstr(uaddr: mac.m_string, kaddr: labelstr, len: mac.m_buflen, done: &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

#if !CONFIG_UNION_MOUNTS
	if (flags & MNT_UNION) {
		error = EPERM;
		goto out;
	}
#endif

	/* Mounting over the root of the root filesystem is treated as update. */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
#if CONFIG_UNION_MOUNTS
		if (!(flags & MNT_UNION)) {
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mounting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}
#else
		flags |= MNT_UPDATE;
#endif /* CONFIG_UNION_MOUNTS */

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif

		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, cnp: &nd.ni_cnd, fsmountargs: uap->data, flags, internal_flags: 0,
	    labelstr, ctx);

out:

#if CONFIG_MACF
	kfree_data(labelstr, labelsz);
#endif /* CONFIG_MACF */

	/* Drop iocounts / namei state only for the steps that completed. */
	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(vp: pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return error;
}
976
977/*
 * common mount implementation (final stage of mounting)
 *
 * Arguments:
 *	fstypename	file system type (i.e. its vfs name)
 *	pvp		parent of covered vnode
 *	vp		covered vnode
 *	cnp		component name (i.e. path) of covered vnode
 *	fsmountargs	file system specific data
 *	flags		generic mount flags
 *	internal_flags	KERNEL_MOUNT_* flags; KERNEL_MOUNT_KMOUNT marks
 *			mounts initiated from inside the kernel
 *	labelstr	optional MAC label
 *	ctx		caller's context
 */
static int
mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
    struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
    char *labelstr, vfs_context_t ctx)
{
#if !CONFIG_MACF
#pragma unused(labelstr)
#endif
	struct vnode *devvp = NULLVP;
	struct vnode *device_vnode = NULLVP;
#if CONFIG_MACF
	struct vnode *rvp;
#endif
	struct mount *mp = NULL;
	struct vfstable *vfsp = (struct vfstable *)0;
	struct proc *p = vfs_context_proc(ctx);
	int error, flag = 0;
	bool flag_set = false;
	user_addr_t devpath = USER_ADDR_NULL;
	int ronly = 0;
	int mntalloc = 0;
	boolean_t vfsp_ref = FALSE;
	boolean_t is_rwlock_locked = FALSE;
	boolean_t did_rele = FALSE;
	boolean_t have_usecount = FALSE;
	boolean_t did_set_lmount = FALSE;
	boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);

#if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
	/* Check for mutually-exclusive flag bits (popcount via Kernighan's trick) */
	uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
	int bitcount = 0;
	while (checkflags != 0) {
		checkflags &= (checkflags - 1);
		bitcount++;
	}

	if (bitcount > 1) {
		//not allowed to request multiple mount-by-role flags
		error = EINVAL;
		goto out1;
	}
#endif

	/*
	 * Process an update for an existing mount
	 */
	if (flags & MNT_UPDATE) {
		/* update mounts must target the root vnode of the mounted fs */
		if ((vp->v_flag & VROOT) == 0) {
			error = EINVAL;
			goto out1;
		}
		mp = vp->v_mount;

		/* if unmount or mount in progress, return error */
		mount_lock_spin(mp);
		if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
			mount_unlock(mp);
			error = EBUSY;
			goto out1;
		}
		mp->mnt_lflag |= MNT_LMOUNT;
		did_set_lmount = TRUE;
		mount_unlock(mp);
		lck_rw_lock_exclusive(lck: &mp->mnt_rwlock);
		is_rwlock_locked = TRUE;
		/*
		 * We only allow the filesystem to be reloaded if it
		 * is currently mounted read-only.
		 */
		if ((flags & MNT_RELOAD) &&
		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * If content protection is enabled, update mounts are not
		 * allowed to turn it off.
		 */
		if ((mp->mnt_flag & MNT_CPROTECT) &&
		    ((flags & MNT_CPROTECT) == 0)) {
			error = EINVAL;
			goto out1;
		}

		/*
		 * can't turn off MNT_REMOVABLE either but it may be an unexpected
		 * failure to return an error for this so we'll just silently
		 * add it if it is not passed in.
		 */
		if ((mp->mnt_flag & MNT_REMOVABLE) &&
		    ((flags & MNT_REMOVABLE) == 0)) {
			flags |= MNT_REMOVABLE;
		}

		/* Can't downgrade the backer of the root FS */
		if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
		    (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * Only root, or the user that did the original mount is
		 * permitted to update it.
		 */
		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(cred: vfs_context_ucred(ctx)) &&
		    (error = suser(cred: vfs_context_ucred(ctx), acflag: &p->p_acflag))) {
			goto out1;
		}
#if CONFIG_MACF
		error = mac_mount_check_remount(ctx, mp);
		if (error != 0) {
			goto out1;
		}
#endif
		/*
		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
		 */
		if ((!kernelmount) && suser(cred: vfs_context_ucred(ctx), NULL)) {
			flags |= MNT_NOSUID | MNT_NODEV;
			if (mp->mnt_flag & MNT_NOEXEC) {
				flags |= MNT_NOEXEC;
			}
		}
		/* remember the current flags so they can be restored on failure */
		flag = mp->mnt_flag;
		flag_set = true;



		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);

		vfsp = mp->mnt_vtable;
		goto update;
	} // MNT_UPDATE

	/*
	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
	 */
	if ((!kernelmount) && suser(cred: vfs_context_ucred(ctx), NULL)) {
		flags |= MNT_NOSUID | MNT_NODEV;
		if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
			flags |= MNT_NOEXEC;
		}
	}

	/* XXXAUDIT: Should we capture the type on the error path as well? */
	/* XXX cast-away const (audit_arg_text() does not modify its input) */
	AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
	/* Look up the filesystem type, taking a reference on its vfstable entry */
	mount_list_lock();
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (!strncmp(s1: vfsp->vfc_name, s2: fstypename, MFSNAMELEN)) {
			vfsp->vfc_refcount++;
			vfsp_ref = TRUE;
			break;
		}
	}
	mount_list_unlock();
	if (vfsp == NULL) {
		error = ENODEV;
		goto out1;
	}

	/*
	 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
	 * except in ROSV configs and for the initial BaseSystem root.
	 */
	if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
	    ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
	    ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
		error = EINVAL;         /* unsupported request */
		goto out1;
	}

	/* flush the covered vnode and mark it VMOUNT (fails if busy) */
	error = prepare_coveredvp(vp, ctx, cnp, fsname: fstypename, internal_flags);
	if (error != 0) {
		goto out1;
	}

	/*
	 * Allocate and initialize the filesystem (mount_t)
	 */
	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
	mntalloc = 1;

	/* Initialize the default IO constraints */
	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
	mp->mnt_devblocksize = DEV_BSIZE;
	mp->mnt_alignmentmask = PAGE_MASK;
	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
	mp->mnt_ioscale = 1;
	mp->mnt_ioflags = 0;
	mp->mnt_realrootvp = NULLVP;
	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;

	mp->mnt_lflag |= MNT_LMOUNT;
	did_set_lmount = TRUE;

	TAILQ_INIT(&mp->mnt_vnodelist);
	TAILQ_INIT(&mp->mnt_workerqueue);
	TAILQ_INIT(&mp->mnt_newvnodes);
	mount_lock_init(mp);
	lck_rw_lock_exclusive(lck: &mp->mnt_rwlock);
	is_rwlock_locked = TRUE;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_vtable = vfsp;
	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strlcpy(dst: mp->mnt_vfsstat.f_fstypename, src: vfsp->vfc_name, MFSTYPENAMELEN);
	do {
		size_t pathlen = MAXPATHLEN;

		/* fall back to the componentname path if vn_getpath_ext fails */
		if (vn_getpath_ext(vp, dvp: pvp, pathbuf: mp->mnt_vfsstat.f_mntonname, len: &pathlen, VN_GETPATH_FSENTER)) {
			strlcpy(dst: mp->mnt_vfsstat.f_mntonname, src: cnp->cn_pnbuf, MAXPATHLEN);
		}
	} while (0);
	mp->mnt_vnodecovered = vp;
	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(cred: vfs_context_ucred(ctx));
	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
	mp->mnt_devbsdunit = 0;
	mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);

	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);

	if (kernelmount) {
		mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
	}
	if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
		mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
	}

	if (KERNEL_MOUNT_DEVFS & internal_flags) {
		// kernel mounted devfs
		mp->mnt_kern_flag |= MNTK_SYSTEM;
	}

update:
	/* Both fresh mounts and MNT_UPDATE rejoin here with mnt_rwlock held. */

	/*
	 * Set the mount level flags.
	 */
	if (flags & MNT_RDONLY) {
		mp->mnt_flag |= MNT_RDONLY;
	} else if (mp->mnt_flag & MNT_RDONLY) {
		// disallow read/write upgrades of file systems that
		// had the TYPENAME_OVERRIDE feature set.
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			error = EPERM;
			goto out1;
		}
		mp->mnt_kern_flag |= MNTK_WANTRDWR;
	}
	mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if SECURE_KERNEL
#if !CONFIG_MNT_SUID
	/*
	 * On release builds of iOS based platforms, always enforce NOSUID on
	 * all mounts. We do this here because we can catch update mounts as well as
	 * non-update mounts in this case.
	 */
	mp->mnt_flag |= (MNT_NOSUID);
#endif
#endif

	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if CONFIG_MACF
	if (flags & MNT_MULTILABEL) {
		if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
			error = EINVAL;
			goto out1;
		}
		mp->mnt_flag |= MNT_MULTILABEL;
	}
#endif
	/*
	 * Process device path for local file systems if requested.
	 *
	 * Snapshot and mount-by-role mounts do not use this path; they are
	 * passing other opaque data in the device path field.
	 *
	 * Basesystemroot mounts pass a device path to be resolved here,
	 * but it's just a char * already inside the kernel, which
	 * kernel_mount() shoved into a user_addr_t to call us. So for such
	 * mounts we must skip copyin (both of the address and of the string
	 * (in NDINIT).
	 */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
	    !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
		boolean_t do_copyin_devpath = true;
#if CONFIG_BASESYSTEMROOT
		if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worth noting:
			// We have been passed fsmountargs, which is typed as a user_addr_t,
			// but is actually a char ** pointing to a (kernelspace) string.
			// We manually unpack it with a series of casts and dereferences
			// that reverses what was done just above us on the stack in
			// imageboot_pivot_image().
			// After retrieving the path to the dev node (which we will NDINIT
			// in a moment), we pass NULL fsmountargs on to the filesystem.
			_Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
			char **devnamepp = (char **)fsmountargs;
			char *devnamep = *devnamepp;
			devpath = CAST_USER_ADDR_T(devnamep);
			do_copyin_devpath = false;
			fsmountargs = USER_ADDR_NULL;

			//Now that we have a mp, denote that this mount is for the basesystem.
			mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
		}
#endif // CONFIG_BASESYSTEMROOT

		if (do_copyin_devpath) {
			if (vfs_context_is64bit(ctx)) {
				if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
					goto out1;
				}
				fsmountargs += sizeof(devpath);
			} else {
				user32_addr_t tmp;
				if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
					goto out1;
				}
				/* munge into LP64 addr */
				devpath = CAST_USER_ADDR_T(tmp);
				fsmountargs += sizeof(tmp);
			}
		}

		/* Lookup device and authorize access to it */
		if ((devpath)) {
			struct nameidata nd;

			enum uio_seg seg = UIO_USERSPACE;
#if CONFIG_BASESYSTEMROOT
			if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
				/* path is a kernel string in this case; see above */
				seg = UIO_SYSSPACE;
			}
#endif // CONFIG_BASESYSTEMROOT

			NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
			if ((error = namei(ndp: &nd))) {
				goto out1;
			}

			strlcpy(dst: mp->mnt_vfsstat.f_mntfromname, src: nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
			devvp = nd.ni_vp;

			nameidone(&nd);

			if (devvp->v_type != VBLK) {
				error = ENOTBLK;
				goto out2;
			}
			if (major(devvp->v_rdev) >= nblkdev) {
				error = ENXIO;
				goto out2;
			}
			/*
			 * If mount by non-root, then verify that user has necessary
			 * permissions on the device.
			 */
			if (suser(cred: vfs_context_ucred(ctx), NULL) != 0) {
				kauth_action_t accessmode = KAUTH_VNODE_READ_DATA;

				if ((mp->mnt_flag & MNT_RDONLY) == 0) {
					accessmode |= KAUTH_VNODE_WRITE_DATA;
				}
				if ((error = vnode_authorize(vp: devvp, NULL, action: accessmode, ctx)) != 0) {
					goto out2;
				}
			}
		}
		/* On first mount, preflight and open device */
		if (devpath && ((flags & MNT_UPDATE) == 0)) {
			if ((error = vnode_ref(vp: devvp))) {
				goto out2;
			}
			/*
			 * Disallow multiple mounts of the same device.
			 * Disallow mounting of a device that is currently in use
			 * (except for root, which might share swap device for miniroot).
			 * Flush out any old buffers remaining from a previous use.
			 */
			if ((error = vfs_setmounting(devvp))) {
				vnode_rele(vp: devvp);
				goto out2;
			}

			if (vcount(vp: devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
				error = EBUSY;
				goto out3;
			}
			if ((error = VNOP_FSYNC(vp: devvp, MNT_WAIT, ctx))) {
				error = ENOTBLK;
				goto out3;
			}
			if ((error = buf_invalidateblks(vp: devvp, BUF_WRITE_DATA, slpflag: 0, slptimeo: 0))) {
				goto out3;
			}

			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
#if CONFIG_MACF
			error = mac_vnode_check_open(ctx,
			    vp: devvp,
			    acc_mode: ronly ? FREAD : FREAD | FWRITE);
			if (error) {
				goto out3;
			}
#endif /* MAC */
			if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
				goto out3;
			}

			mp->mnt_devvp = devvp;
			device_vnode = devvp;
		} else if ((mp->mnt_flag & MNT_RDONLY) &&
		    (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
		    (device_vnode = mp->mnt_devvp)) {
			dev_t dev;
			int maj;
			/*
			 * If upgrade to read-write by non-root, then verify
			 * that user has necessary permissions on the device.
			 */
			vnode_getalways(device_vnode);

			if (suser(cred: vfs_context_ucred(ctx), NULL) &&
			    (error = vnode_authorize(vp: device_vnode, NULL,
			    KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
			    ctx)) != 0) {
				vnode_put(vp: device_vnode);
				goto out2;
			}

			/* Tell the device that we're upgrading */
			dev = (dev_t)device_vnode->v_rdev;
			maj = major(dev);

			if ((u_int)maj >= (u_int)nblkdev) {
				panic("Volume mounted on a device with invalid major number.");
			}

			error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
			vnode_put(vp: device_vnode);
			device_vnode = NULLVP;
			if (error != 0) {
				goto out2;
			}
		}
	} // localargs && !(snapshot | data | vm)

#if CONFIG_MACF
	if ((flags & MNT_UPDATE) == 0) {
		mac_mount_label_init(mp);
		mac_mount_label_associate(ctx, mp);
	}
	if (labelstr) {
		if ((flags & MNT_UPDATE) != 0) {
			error = mac_mount_check_label_update(ctx, mp);
			if (error != 0) {
				goto out3;
			}
		}
	}
#endif
	/*
	 * Mount the filesystem. We already asserted that internal_flags
	 * cannot have more than one mount-by-role bit set.
	 */
	if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
		    data: (caddr_t)fsmountargs, flags: 0, context: ctx);
	} else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
#if CONFIG_ROSV_STARTUP
		/* for mount-by-role, fsmountargs carries the origin mount_t */
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, data: (caddr_t)&frma, flags: 0, context: ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
		} else {
			/* Mark volume associated with system volume */
			mp->mnt_kern_flag |= MNTK_SYSTEM;

			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(path: mp->mnt_vfsstat.f_mntfromname,
				    flags: 0, vpp: &mp_devvp, ctx: vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(vp: mp_devvp);
					// now set `device_vnode` to the devvp that was acquired.
					// this is needed in order to ensure vfs_init_io_attributes is invoked.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if (internal_flags & KERNEL_MOUNT_VMVOL) {
#if CONFIG_MOUNT_VM
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, data: (caddr_t)&frma, flags: 0, context: ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
		} else {
			/* Mark volume associated with system volume and a swap mount */
			mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(path: mp->mnt_vfsstat.f_mntfromname,
				    flags: 0, vpp: &mp_devvp, ctx: vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(vp: mp_devvp);

					// now set `device_vnode` to the devvp that was acquired.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
#if CONFIG_MOUNT_PREBOOTRECOVERY
		struct mount *origin_mp = (struct mount*)fsmountargs;
		uint32_t mount_role = 0;
		if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
			mount_role = VFS_PREBOOT_ROLE;
		} else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
			mount_role = VFS_RECOVERY_ROLE;
		}

		if (mount_role != 0) {
			fs_role_mount_args_t frma = {origin_mp, mount_role};
			error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, data: (caddr_t)&frma, flags: 0, context: ctx);
			if (error) {
				printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
			} else {
				// NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
				/* Mark volume associated with system volume */
				//mp->mnt_kern_flag |= MNTK_SYSTEM;
				/* Attempt to acquire the mnt_devvp and set it up */
				struct vnode *mp_devvp = NULL;
				if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
					errno_t lerr = vnode_lookup(path: mp->mnt_vfsstat.f_mntfromname,
					    flags: 0, vpp: &mp_devvp, ctx: vfs_context_kernel());
					if (!lerr) {
						mp->mnt_devvp = mp_devvp;
						//vnode_lookup took an iocount, need to drop it.
						vnode_put(vp: mp_devvp);

						// now set `device_vnode` to the devvp that was acquired.
						// note that though the iocount above was dropped, the mount acquires
						// an implicit reference against the device.
						device_vnode = mp_devvp;
					}
				}
			}
		} else {
			printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
			error = EINVAL;
		}
#else
		error = EINVAL;
#endif
	} else {
		/* ordinary mount: hand off to the filesystem itself */
		error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
	}

	if (flags & MNT_UPDATE) {
		if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
			mp->mnt_flag &= ~MNT_RDONLY;
		}
		mp->mnt_flag &= ~
		    (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
		mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
		if (error) {
			mp->mnt_flag = flag;    /* restore flag value */
		}
		vfs_event_signal(NULL, VQ_UPDATE, data: (intptr_t)NULL);
		lck_rw_done(lck: &mp->mnt_rwlock);
		is_rwlock_locked = FALSE;
		if (!error) {
			enablequotas(mp, ctx);
		}
		goto exit;
	}

	/*
	 * Put the new filesystem on the mount list after root.
	 */
	if (error == 0) {
		struct vfs_attr vfsattr;
		if (device_vnode) {
			/*
			 * cache the IO attributes for the underlying physical media...
			 * an error return indicates the underlying driver doesn't
			 * support all the queries necessary... however, reasonable
			 * defaults will have been set, so no reason to bail or care
			 *
			 * Need to do this before calling the MAC hook as it needs
			 * information from this call.
			 */
			vfs_init_io_attributes(devvp: device_vnode, mp);
		}

#if CONFIG_MACF
		error = mac_mount_check_mount_late(ctx, mp);
		if (error != 0) {
			goto out4;
		}

		if (vfs_flags(mp) & MNT_MULTILABEL) {
			error = VFS_ROOT(mp, &rvp, ctx);
			if (error) {
				printf("%s() VFS_ROOT returned %d\n", __func__, error);
				goto out4;
			}
			error = vnode_label(mp, NULL, vp: rvp, NULL, flags: 0, ctx);
			/*
			 * drop reference provided by VFS_ROOT
			 */
			vnode_put(vp: rvp);

			if (error) {
				goto out4;
			}
		}
#endif  /* MAC */

		/* publish the mount on the covered vnode */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vp->v_mountedhere = mp;
		SET(vp->v_flag, VMOUNTEDHERE);
		vnode_unlock(vp);

		/*
		 * taking the name_cache_lock exclusively will
		 * ensure that everyone is out of the fast path who
		 * might be trying to use a now stale copy of
		 * vp->v_mountedhere->mnt_realrootvp
		 * bumping mount_generation causes the cached values
		 * to be invalidated
		 */
		name_cache_lock();
		mount_generation++;
		name_cache_unlock();

		error = vnode_ref(vp);
		if (error != 0) {
			goto out4;
		}

		have_usecount = TRUE;

		error = checkdirs(olddp: vp, ctx);
		if (error != 0) {
			/* Unmount the filesystem as cdir/rdirs cannot be updated */
			goto out4;
		}
		/*
		 * there is no cleanup code here so I have made it void
		 * we need to revisit this
		 */
		(void)VFS_START(mp, 0, ctx);

		if (mount_list_add(mp) != 0) {
			/*
			 * The system is shutting down trying to umount
			 * everything, so fail with a plausible errno.
			 */
			error = EBUSY;
			goto out4;
		}
		lck_rw_done(lck: &mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		/* Check if this mounted file system supports EAs or named streams. */
		/* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
		VFSATTR_INIT(&vfsattr);
		VFSATTR_WANTED(&vfsattr, f_capabilities);
		if (strncmp(s1: mp->mnt_vfsstat.f_fstypename, s2: "webdav", n: sizeof("webdav")) != 0 &&
		    vfs_getattr(mp, vfa: &vfsattr, ctx) == 0 &&
		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
			}
#if NAMEDSTREAMS
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
				mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
			}
#endif
			/* Check if this file system supports path from id lookups. */
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			} else if (mp->mnt_flag & MNT_DOVOLFS) {
				/* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			}

			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
				mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
			}
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
			mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
			mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
		}
		/* increment the operations count */
		OSAddAtomic(1, &vfs_nummntops);
		enablequotas(mp, ctx);

		if (device_vnode) {
			vfs_setmountedon(device_vnode);
		}

		/* Now that mount is setup, notify the listeners */
		vfs_notify_mount(pdvp: pvp);
		IOBSDMountChange(mp, op: kIOMountChangeMount);
	} else {
		/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
		if (mp->mnt_vnodelist.tqh_first != NULL) {
			panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
			    mp->mnt_vtable->vfc_name, error);
		}

		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
		mount_list_lock();
		mp->mnt_vtable->vfc_refcount--;
		mount_list_unlock();

		if (device_vnode) {
			vnode_rele(vp: device_vnode);
			VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
			vfs_clearmounting(device_vnode);
		}
		lck_rw_done(lck: &mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		/*
		 * if we get here, we have a mount structure that needs to be freed,
		 * but since the coveredvp hasn't yet been updated to point at it,
		 * no need to worry about other threads holding a crossref on this mp
		 * so it's ok to just free it
		 */
		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		did_set_lmount = false;
	}
exit:
	/*
	 * drop I/O count on the device vp if there was one
	 */
	if (devpath && devvp) {
		vnode_put(vp: devvp);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	return error;

/* Error condition exits */
out4:
	(void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);

	/*
	 * If the mount has been placed on the covered vp,
	 * it may have been discovered by now, so we have
	 * to treat this just like an unmount
	 */
	mount_lock_spin(mp);
	mp->mnt_lflag |= MNT_LDEAD;
	mount_unlock(mp);

	if (device_vnode != NULLVP) {
		vnode_rele(vp: device_vnode);
		VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vfs_clearmounting(device_vnode);
		did_rele = TRUE;
	}

	vnode_lock_spin(vp);

	mp->mnt_crossref++;
	CLR(vp->v_flag, VMOUNTEDHERE);
	vp->v_mountedhere = (mount_t) 0;

	vnode_unlock(vp);

	if (have_usecount) {
		vnode_rele(vp);
	}
out3:
	if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
		vnode_rele(vp: devvp);
		vfs_clearmounting(devvp);
	}
out2:
	if (devpath && devvp) {
		vnode_put(vp: devvp);
	}
out1:
	/* Release mnt_rwlock only when it was taken */
	if (is_rwlock_locked == TRUE) {
		if (flag_set) {
			mp->mnt_flag = flag;    /* restore mnt_flag value */
		}
		lck_rw_done(lck: &mp->mnt_rwlock);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	if (mntalloc) {
		if (mp->mnt_crossref) {
			mount_dropcrossref(mp, vp, 0);
		} else {
			if (nc_smr_enabled) {
				vfs_smr_synchronize();
			}

			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		}
	}
	if (vfsp_ref) {
		mount_list_lock();
		vfsp->vfc_refcount--;
		mount_list_unlock();
	}

	return error;
}
1878
1879/*
1880 * Flush in-core data, check for competing mount attempts,
1881 * and set VMOUNT
1882 */
1883int
1884prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
1885{
1886#if !CONFIG_MACF
1887#pragma unused(cnp,fsname)
1888#endif
1889 struct vnode_attr va;
1890 int error;
1891 boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
1892 boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
1893 boolean_t is_busy;
1894
1895 if (!skip_auth) {
1896 /*
1897 * If the user is not root, ensure that they own the directory
1898 * onto which we are attempting to mount.
1899 */
1900 VATTR_INIT(&va);
1901 VATTR_WANTED(&va, va_uid);
1902 if ((error = vnode_getattr(vp, vap: &va, ctx)) ||
1903 (va.va_uid != kauth_cred_getuid(cred: vfs_context_ucred(ctx)) &&
1904 (!vfs_context_issuser(ctx)))) {
1905 error = EPERM;
1906 goto out;
1907 }
1908 }
1909
1910 if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
1911 goto out;
1912 }
1913
1914 if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, slpflag: 0, slptimeo: 0))) {
1915 goto out;
1916 }
1917
1918 if (vp->v_type != VDIR) {
1919 error = ENOTDIR;
1920 goto out;
1921 }
1922
1923 vnode_lock_spin(vp);
1924 is_busy = is_fmount ?
1925 (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL)) :
1926 (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL));
1927 if (is_busy) {
1928 vnode_unlock(vp);
1929 error = EBUSY;
1930 goto out;
1931 }
1932 SET(vp->v_flag, VMOUNT);
1933 vnode_unlock(vp);
1934
1935#if CONFIG_MACF
1936 error = mac_mount_check_mount(ctx, vp,
1937 cnp, vfc_name: fsname);
1938 if (error != 0) {
1939 vnode_lock_spin(vp);
1940 CLR(vp->v_flag, VMOUNT);
1941 vnode_unlock(vp);
1942 }
1943#endif
1944
1945out:
1946 return error;
1947}
1948
1949#if CONFIG_IMGSRC_ACCESS
1950
1951#define DEBUG_IMGSRC 0
1952
1953#if DEBUG_IMGSRC
1954#define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1955#else
1956#define IMGSRC_DEBUG(args...) do { } while(0)
1957#endif
1958
/*
 * Resolve `devpath`, verify it names the same block device that backs
 * `mp`, authorize the caller's access to it, and update
 * mnt_vfsstat.f_mntfromname to the resolved path.
 *
 * On success returns 0 with *devvpp holding an iocount on the device
 * vnode (caller must vnode_put it). On failure the iocount from namei
 * is dropped here.
 */
static int
authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
{
	struct nameidata nd;
	vnode_t vp, realdevvp;
	kauth_action_t accessmode;
	int error;
	enum uio_seg uio = UIO_USERSPACE;

	/* kernel callers pass a kernel-space path */
	if (ctx == vfs_context_kernel()) {
		uio = UIO_SYSSPACE;
	}

	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
	if ((error = namei(ndp: &nd))) {
		IMGSRC_DEBUG("namei() failed with %d\n", error);
		return error;
	}

	vp = nd.ni_vp;

	if (!vnode_isblk(vp)) {
		IMGSRC_DEBUG("Not block device.\n");
		error = ENOTBLK;
		goto out;
	}

	realdevvp = mp->mnt_devvp;
	if (realdevvp == NULLVP) {
		IMGSRC_DEBUG("No device backs the mount.\n");
		error = ENXIO;
		goto out;
	}

	error = vnode_getwithref(vp: realdevvp);
	if (error != 0) {
		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
		goto out;
	}

	/* the supplied path must resolve to the mount's backing dev_t */
	if (vnode_specrdev(vp) != vnode_specrdev(vp: realdevvp)) {
		IMGSRC_DEBUG("Wrong dev_t.\n");
		error = ENXIO;
		goto out1;
	}

	strlcpy(dst: mp->mnt_vfsstat.f_mntfromname, src: nd.ni_cnd.cn_pnbuf, MAXPATHLEN);

	/*
	 * If mount by non-root, then verify that user has necessary
	 * permissions on the device.
	 */
	if (!vfs_context_issuser(ctx)) {
		accessmode = KAUTH_VNODE_READ_DATA;
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			accessmode |= KAUTH_VNODE_WRITE_DATA;
		}
		if ((error = vnode_authorize(vp, NULL, action: accessmode, ctx)) != 0) {
			IMGSRC_DEBUG("Access denied.\n");
			goto out1;
		}
	}

	*devvpp = vp;

out1:
	vnode_put(vp: realdevvp);

out:
	nameidone(&nd);

	/* on any failure, drop the iocount namei took on vp */
	if (error) {
		vnode_put(vp);
	}

	return error;
}
2036
2037/*
2038 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
2039 * and call checkdirs()
2040 */
2041static int
2042place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
2043{
2044 int error;
2045
2046 mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */
2047
2048 IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
2049 mp->mnt_vtable->vfc_name, vnode_getname(vp));
2050
2051 vnode_lock_spin(vp);
2052 CLR(vp->v_flag, VMOUNT);
2053 vp->v_mountedhere = mp;
2054 SET(vp->v_flag, VMOUNTEDHERE);
2055 vnode_unlock(vp);
2056
2057 /*
2058 * taking the name_cache_lock exclusively will
2059 * insure that everyone is out of the fast path who
2060 * might be trying to use a now stale copy of
2061 * vp->v_mountedhere->mnt_realrootvp
2062 * bumping mount_generation causes the cached values
2063 * to be invalidated
2064 */
2065 name_cache_lock();
2066 mount_generation++;
2067 name_cache_unlock();
2068
2069 error = vnode_ref(vp);
2070 if (error != 0) {
2071 goto out;
2072 }
2073
2074 error = checkdirs(olddp: vp, ctx);
2075 if (error != 0) {
2076 /* Unmount the filesystem as cdir/rdirs cannot be updated */
2077 vnode_rele(vp);
2078 goto out;
2079 }
2080
2081out:
2082 if (error != 0) {
2083 mp->mnt_vnodecovered = NULLVP;
2084 }
2085 return error;
2086}
2087
/*
 * Reverse place_mount_and_checkdirs(): drop the usecount on the covered
 * vnode, clear its mount linkage and flags, and detach it from mp.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	CLR(vp->v_flag, (VMOUNT | VMOUNTEDHERE));
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
2099
2100static int
2101mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2102{
2103 int error;
2104
2105 /* unmount in progress return error */
2106 mount_lock_spin(mp);
2107 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2108 mount_unlock(mp);
2109 return EBUSY;
2110 }
2111 mount_unlock(mp);
2112 lck_rw_lock_exclusive(lck: &mp->mnt_rwlock);
2113
2114 /*
2115 * We only allow the filesystem to be reloaded if it
2116 * is currently mounted read-only.
2117 */
2118 if ((flags & MNT_RELOAD) &&
2119 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2120 error = ENOTSUP;
2121 goto out;
2122 }
2123
2124 /*
2125 * Only root, or the user that did the original mount is
2126 * permitted to update it.
2127 */
2128 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(cred: vfs_context_ucred(ctx)) &&
2129 (!vfs_context_issuser(ctx))) {
2130 error = EPERM;
2131 goto out;
2132 }
2133#if CONFIG_MACF
2134 error = mac_mount_check_remount(ctx, mp);
2135 if (error != 0) {
2136 goto out;
2137 }
2138#endif
2139
2140out:
2141 if (error) {
2142 lck_rw_done(lck: &mp->mnt_rwlock);
2143 }
2144
2145 return error;
2146}
2147
/* Release the exclusive mount rwlock taken by mount_begin_update(). */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(lck: &mp->mnt_rwlock);
}
2153
2154static int
2155get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2156{
2157 vnode_t vp;
2158
2159 if (height >= MAX_IMAGEBOOT_NESTING) {
2160 return EINVAL;
2161 }
2162
2163 vp = imgsrc_rootvnodes[height];
2164 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2165 *rvpp = vp;
2166 return 0;
2167 } else {
2168 return ENOENT;
2169 }
2170}
2171
/*
 * Relocate the mount backing an imageboot source volume onto a new
 * covered vnode (vp, looked up under pvp), updating f_mntonname and
 * invalidating name-cache fast paths. A mount may only be moved once
 * (enforced with MNTK_HAS_MOVED under the mount rwlock). Caller must
 * be superuser. Consumes nothing on entry; rvp is acquired and released
 * internally.
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;
	vnode_t rvp;
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		/* Caller supplied a mnt_imgsrc_args struct (height + flags + devpath) */
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = (user_addr_t)mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* No mi_flags are currently defined; reject anything nonzero */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	/* Takes an iocount on rvp; released on every exit path below */
	error = get_imgsrc_rootvnode(height, rvpp: &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	/* Scratch buffer to preserve f_mntonname for rollback (out3) */
	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);

	/* Can only move once */
	mp = vnode_mount(vp: rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, flags: 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once. Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved [2]\n");
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, internal_flags: 0);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(s1: vfsp->vfc_name, s2: fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, devvpp: &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* Only needed the authorization; drop the iocount immediately */
			vnode_put(vp: devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode, call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	placed = TRUE;

	/* Save the old mount-on name so out3 can roll it back */
	strlcpy(dst: old_mntonname, src: mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(dst: mp->mnt_vfsstat.f_mntonname, src: cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(vp: rvp);
	zfree(ZV_NAMEI, old_mntonname);

	vfs_notify_mount(pdvp: pvp);

	return 0;
out3:
	/* Roll back the mntonname change and the HAS_MOVED marker */
	strlcpy(dst: mp->mnt_vfsstat.f_mntonname, src: old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(vp: rvp);
	zfree(ZV_NAMEI, old_mntonname);
	return error;
}
2392
2393#endif /* CONFIG_IMGSRC_ACCESS */
2394
/*
 * Turn on disk quotas for an HFS mount if the per-type quota trigger
 * files exist under the mount point. Errors are deliberately ignored
 * so quota setup cannot interfere with completing the mount.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(s1: mp->mnt_vfsstat.f_fstypename, s2: "hfs", n: sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		/* Probe for the "<mnton>/<opsname>.<ext>" trigger file */
		snprintf(qfpath, count: sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(ndp: &qnd) != 0) {
			continue; /* option file to trigger quotas is not present */
		}
		vnode_put(vp: qnd.ni_vp);
		nameidone(&qnd);
		/* Trigger present: enable quotas from the actual quota file */
		snprintf(qfpath, count: sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2428
2429
/*
 * Per-process callback for checkdirs(): if the process's current or root
 * directory is the newly-covered vnode (cdrp->olddp), swap it for the new
 * mount's root (cdrp->newdp), moving the long-term references accordingly.
 * Always returns PROC_RETURNED so iteration continues.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work. If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work. Always take two refs
	 * because we might need that many. We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(vp: newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(vp: newdp) != 0) {
		/* Second ref failed: drop the first and give up on this proc */
		vnode_rele(vp: newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work. Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;     /* this pre-taken ref was consumed */
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;     /* this pre-taken ref was consumed */
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(vp: old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(vp: old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(vp: new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(vp: new_rvp);
	}

	return PROC_RETURNED;
}
2509
2510
2511
2512/*
2513 * Scan all active processes to see if any of them have a current
2514 * or root directory onto which the new filesystem has just been
2515 * mounted. If so, replace them with the new mount point.
2516 */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* Sole usecount means no process can be using olddp as cwd/root */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* Get the root of the filesystem just mounted over olddp (iocount held) */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, callout: checkdirs_callback, arg: (void *)&cdr, NULL, NULL);

	/* If the system root itself was covered, swap it under the rw lock */
	if (rootvnode == olddp) {
		vnode_ref(vp: newdp);
		lck_rw_lock_exclusive(lck: &rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(lck: &rootvnode_rw_lock);
		vnode_rele(vp: tvp);
	}

	vnode_put(vp: newdp);
	return 0;
}
2554
2555#define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT \
2556 "com.apple.private.vfs.role-account-unmount"
2557
2558/*
2559 * Unmount a file system.
2560 *
2561 * Note: unmount takes a path to the vnode mounted on as argument,
2562 * not special file (as before).
2563 */
2564/* ARGSUSED */
int
unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx;

	/*
	 * If the process has the entitlement, use the kernel's context when
	 * performing lookup on the mount path as the process might lack proper
	 * permission to access the directory.
	 */
	ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
	    vfs_context_kernel() : vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(ndp: &nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return EINVAL;
	}
#if CONFIG_MACF
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif
	/* Take a mount ref before dropping the vnode iocount */
	mount_ref(mp, 0);
	vnode_put(vp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, uap->flags, ctx);
}
2611
2612int
2613vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2614{
2615 mount_t mp;
2616
2617 mp = mount_list_lookupby_fsid(fsid, 0, 1);
2618 if (mp == (mount_t)0) {
2619 return ENOENT;
2620 }
2621 mount_ref(mp, 0);
2622 mount_iterdrop(mp);
2623 /* safedounmount consumes the mount ref */
2624 return safedounmount(mp, flags, ctx);
2625}
2626
2627/*
2628 * The mount struct comes with a mount ref which will be consumed.
2629 * Do the actual file system unmount, prevent some common foot shooting.
2630 */
2631int
2632safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
2633{
2634 int error;
2635 proc_t p = vfs_context_proc(ctx);
2636
2637 /*
2638 * If the file system is not responding and MNT_NOBLOCK
2639 * is set and not a forced unmount then return EBUSY.
2640 */
2641 if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
2642 (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
2643 error = EBUSY;
2644 goto out;
2645 }
2646
2647 /*
2648 * Skip authorization in two cases:
2649 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
2650 * This entitlement allows non-root processes unmount volumes mounted by
2651 * other processes.
2652 * - If the mount is tagged as permissive and this is not a forced-unmount
2653 * attempt.
2654 */
2655 if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
2656 (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
2657 /*
2658 * Only root, or the user that did the original mount is
2659 * permitted to unmount this filesystem.
2660 */
2661 if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(cred: kauth_cred_get())) &&
2662 (error = suser(cred: kauth_cred_get(), acflag: &p->p_acflag))) {
2663 goto out;
2664 }
2665 }
2666 /*
2667 * Don't allow unmounting the root file system, or other volumes
2668 * associated with it (for example, the associated VM or DATA mounts) .
2669 */
2670 if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
2671 if (!(mp->mnt_flag & MNT_ROOTFS)) {
2672 printf("attempt to unmount a system mount (%s), will return EBUSY\n",
2673 mp->mnt_vfsstat.f_mntonname);
2674 }
2675 error = EBUSY; /* the root (or associated volumes) is always busy */
2676 goto out;
2677 }
2678
2679 /*
2680 * If the mount is providing the root filesystem's disk image
2681 * (i.e. imageboot), don't allow unmounting
2682 */
2683 if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
2684 error = EBUSY;
2685 goto out;
2686 }
2687
2688 return dounmount(mp, flags, 1, ctx);
2689
2690out:
2691 mount_drop(mp, 0);
2692 return error;
2693}
2694
2695/*
2696 * Do the actual file system unmount.
2697 */
2698int
2699dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
2700{
2701 vnode_t coveredvp = (vnode_t)0;
2702 int error;
2703 int needwakeup = 0;
2704 int forcedunmount = 0;
2705 int lflags = 0;
2706 struct vnode *devvp = NULLVP;
2707#if CONFIG_TRIGGERS
2708 proc_t p = vfs_context_proc(ctx);
2709 int did_vflush = 0;
2710 int pflags_save = 0;
2711#endif /* CONFIG_TRIGGERS */
2712
2713#if CONFIG_FSE
2714 if (!(flags & MNT_FORCE)) {
2715 fsevent_unmount(mp, ctx); /* has to come first! */
2716 }
2717#endif
2718
2719 mount_lock(mp);
2720
2721 /*
2722 * If already an unmount in progress just return EBUSY.
2723 * Even a forced unmount cannot override.
2724 */
2725 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2726 if (withref != 0) {
2727 mount_drop(mp, 1);
2728 }
2729 mount_unlock(mp);
2730 return EBUSY;
2731 }
2732
2733 if (flags & MNT_FORCE) {
2734 forcedunmount = 1;
2735 mp->mnt_lflag |= MNT_LFORCE;
2736 }
2737
2738#if CONFIG_TRIGGERS
2739 if (flags & MNT_NOBLOCK && p != kernproc) {
2740 pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
2741 }
2742#endif
2743
2744 mp->mnt_kern_flag |= MNTK_UNMOUNT;
2745 mp->mnt_lflag |= MNT_LUNMOUNT;
2746 mp->mnt_flag &= ~MNT_ASYNC;
2747 /*
2748 * anyone currently in the fast path that
2749 * trips over the cached rootvp will be
2750 * dumped out and forced into the slow path
2751 * to regenerate a new cached value
2752 */
2753 mp->mnt_realrootvp = NULLVP;
2754 mount_unlock(mp);
2755
2756 if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
2757 /*
2758 * Force unmount any mounts in this filesystem.
2759 * If any unmounts fail - just leave them dangling.
2760 * Avoids recursion.
2761 */
2762 (void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
2763 }
2764
2765 /*
2766 * taking the name_cache_lock exclusively will
2767 * insure that everyone is out of the fast path who
2768 * might be trying to use a now stale copy of
2769 * vp->v_mountedhere->mnt_realrootvp
2770 * bumping mount_generation causes the cached values
2771 * to be invalidated
2772 */
2773 name_cache_lock();
2774 mount_generation++;
2775 name_cache_unlock();
2776
2777
2778 lck_rw_lock_exclusive(lck: &mp->mnt_rwlock);
2779 if (withref != 0) {
2780 mount_drop(mp, 0);
2781 }
2782 error = 0;
2783 if (forcedunmount == 0) {
2784 ubc_umount(mp); /* release cached vnodes */
2785 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2786 error = VFS_SYNC(mp, MNT_WAIT, ctx);
2787 if (error) {
2788 mount_lock(mp);
2789 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2790 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2791 mp->mnt_lflag &= ~MNT_LFORCE;
2792 goto out;
2793 }
2794 }
2795 }
2796
2797 IOBSDMountChange(mp, op: kIOMountChangeUnmount);
2798
2799#if CONFIG_TRIGGERS
2800 vfs_nested_trigger_unmounts(mp, flags, ctx);
2801 did_vflush = 1;
2802#endif
2803 if (forcedunmount) {
2804 lflags |= FORCECLOSE;
2805 }
2806 error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
2807 if ((forcedunmount == 0) && error) {
2808 mount_lock(mp);
2809 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2810 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2811 mp->mnt_lflag &= ~MNT_LFORCE;
2812 goto out;
2813 }
2814
2815 /* make sure there are no one in the mount iterations or lookup */
2816 mount_iterdrain(mp);
2817
2818 error = VFS_UNMOUNT(mp, flags, ctx);
2819 if (error) {
2820 mount_iterreset(mp);
2821 mount_lock(mp);
2822 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2823 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2824 mp->mnt_lflag &= ~MNT_LFORCE;
2825 goto out;
2826 }
2827
2828 /* increment the operations count */
2829 if (!error) {
2830 OSAddAtomic(1, &vfs_nummntops);
2831 }
2832
2833 if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
2834 /* hold an io reference and drop the usecount before close */
2835 devvp = mp->mnt_devvp;
2836 vnode_getalways(devvp);
2837 vnode_rele(vp: devvp);
2838 VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
2839 ctx);
2840 vnode_clearmountedon(vp: devvp);
2841 vnode_put(vp: devvp);
2842 }
2843 lck_rw_done(lck: &mp->mnt_rwlock);
2844 mount_list_remove(mp);
2845 lck_rw_lock_exclusive(lck: &mp->mnt_rwlock);
2846
2847 /* mark the mount point hook in the vp but not drop the ref yet */
2848 if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
2849 /*
2850 * The covered vnode needs special handling. Trying to get an
2851 * iocount must not block here as this may lead to deadlocks
2852 * if the Filesystem to which the covered vnode belongs is
2853 * undergoing forced unmounts. Since we hold a usecount, the
2854 * vnode cannot be reused (it can, however, still be terminated)
2855 */
2856 vnode_getalways(coveredvp);
2857 vnode_lock_spin(coveredvp);
2858
2859 mp->mnt_crossref++;
2860 coveredvp->v_mountedhere = (struct mount *)0;
2861 CLR(coveredvp->v_flag, VMOUNT | VMOUNTEDHERE);
2862 vnode_unlock(coveredvp);
2863 vnode_put(vp: coveredvp);
2864 }
2865
2866 mount_list_lock();
2867 mp->mnt_vtable->vfc_refcount--;
2868 mount_list_unlock();
2869
2870 cache_purgevfs(mp); /* remove cache entries for this file sys */
2871 vfs_event_signal(NULL, VQ_UNMOUNT, data: (intptr_t)NULL);
2872 mount_lock(mp);
2873 mp->mnt_lflag |= MNT_LDEAD;
2874
2875 if (mp->mnt_lflag & MNT_LWAIT) {
2876 /*
2877 * do the wakeup here
2878 * in case we block in mount_refdrain
2879 * which will drop the mount lock
2880 * and allow anyone blocked in vfs_busy
2881 * to wakeup and see the LDEAD state
2882 */
2883 mp->mnt_lflag &= ~MNT_LWAIT;
2884 wakeup(chan: (caddr_t)mp);
2885 }
2886 mount_refdrain(mp);
2887
2888 /* free disk_conditioner_info structure for this mount */
2889 disk_conditioner_unmount(mp);
2890
2891out:
2892 if (mp->mnt_lflag & MNT_LWAIT) {
2893 mp->mnt_lflag &= ~MNT_LWAIT;
2894 needwakeup = 1;
2895 }
2896
2897#if CONFIG_TRIGGERS
2898 if (flags & MNT_NOBLOCK && p != kernproc) {
2899 // Restore P_NOREMOTEHANG bit to its previous value
2900 if ((pflags_save & P_NOREMOTEHANG) == 0) {
2901 OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
2902 }
2903 }
2904
2905 /*
2906 * Callback and context are set together under the mount lock, and
2907 * never cleared, so we're safe to examine them here, drop the lock,
2908 * and call out.
2909 */
2910 if (mp->mnt_triggercallback != NULL) {
2911 mount_unlock(mp);
2912 if (error == 0) {
2913 mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
2914 } else if (did_vflush) {
2915 mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
2916 }
2917 } else {
2918 mount_unlock(mp);
2919 }
2920#else
2921 mount_unlock(mp);
2922#endif /* CONFIG_TRIGGERS */
2923
2924 lck_rw_done(lck: &mp->mnt_rwlock);
2925
2926 if (needwakeup) {
2927 wakeup(chan: (caddr_t)mp);
2928 }
2929
2930 if (!error) {
2931 if ((coveredvp != NULLVP)) {
2932 vnode_t pvp = NULLVP;
2933
2934 /*
2935 * The covered vnode needs special handling. Trying to
2936 * get an iocount must not block here as this may lead
2937 * to deadlocks if the Filesystem to which the covered
2938 * vnode belongs is undergoing forced unmounts. Since we
2939 * hold a usecount, the vnode cannot be reused
2940 * (it can, however, still be terminated).
2941 */
2942 vnode_getalways(coveredvp);
2943
2944 mount_dropcrossref(mp, coveredvp, 0);
2945 /*
2946 * We'll _try_ to detect if this really needs to be
2947 * done. The coveredvp can only be in termination (or
2948 * terminated) if the coveredvp's mount point is in a
2949 * forced unmount (or has been) since we still hold the
2950 * ref.
2951 */
2952 if (!vnode_isrecycled(vp: coveredvp)) {
2953 pvp = vnode_getparent(vp: coveredvp);
2954#if CONFIG_TRIGGERS
2955 if (coveredvp->v_resolve) {
2956 vnode_trigger_rearm(coveredvp, ctx);
2957 }
2958#endif
2959 }
2960
2961 vnode_rele(vp: coveredvp);
2962 vnode_put(vp: coveredvp);
2963 coveredvp = NULLVP;
2964
2965 if (pvp) {
2966 lock_vnode_and_post(pvp, NOTE_WRITE);
2967 vnode_put(vp: pvp);
2968 }
2969 } else if (mp->mnt_flag & MNT_ROOTFS) {
2970 if (nc_smr_enabled) {
2971 vfs_smr_synchronize();
2972 }
2973
2974 mount_lock_destroy(mp);
2975#if CONFIG_MACF
2976 mac_mount_label_destroy(mp);
2977#endif
2978 zfree(mount_zone, mp);
2979 } else {
2980 panic("dounmount: no coveredvp");
2981 }
2982 }
2983 return error;
2984}
2985
2986/*
2987 * Unmount any mounts in this filesystem.
2988 */
2989void
2990dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
2991{
2992 mount_t smp;
2993 fsid_t *fsids, fsid;
2994 int fsids_sz;
2995 int count = 0, i, m = 0;
2996 vnode_t vp;
2997
2998 mount_list_lock();
2999
3000 // Get an array to hold the submounts fsids.
3001 TAILQ_FOREACH(smp, &mountlist, mnt_list)
3002 count++;
3003 fsids_sz = count * sizeof(fsid_t);
3004 fsids = kalloc_data(fsids_sz, Z_NOWAIT);
3005 if (fsids == NULL) {
3006 mount_list_unlock();
3007 goto out;
3008 }
3009 fsids[0] = mp->mnt_vfsstat.f_fsid; // Prime the pump
3010
3011 /*
3012 * Fill the array with submount fsids.
3013 * Since mounts are always added to the tail of the mount list, the
3014 * list is always in mount order.
3015 * For each mount check if the mounted-on vnode belongs to a
3016 * mount that's already added to our array of mounts to be unmounted.
3017 */
3018 for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
3019 vp = smp->mnt_vnodecovered;
3020 if (vp == NULL) {
3021 continue;
3022 }
3023 fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid; // Underlying fsid
3024 for (i = 0; i <= m; i++) {
3025 if (fsids[i].val[0] == fsid.val[0] &&
3026 fsids[i].val[1] == fsid.val[1]) {
3027 fsids[++m] = smp->mnt_vfsstat.f_fsid;
3028 break;
3029 }
3030 }
3031 }
3032 mount_list_unlock();
3033
3034 // Unmount the submounts in reverse order. Ignore errors.
3035 for (i = m; i > 0; i--) {
3036 smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
3037 if (smp) {
3038 mount_ref(smp, 0);
3039 mount_iterdrop(smp);
3040 (void) dounmount(mp: smp, flags, withref: 1, ctx);
3041 }
3042 }
3043out:
3044 kfree_data(fsids, fsids_sz);
3045}
3046
/*
 * Drop one crossref on mp (taken while detaching it from covered vnode
 * dp). When the count reaches zero and dp no longer points at mp, the
 * mount structure is destroyed and freed. If need_put is set, dp's
 * iocount is also released.
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	vnode_hold(vp: dp);
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		/* Last crossref and fully detached: tear the mount down */
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		/* Wait out lockless name-cache readers before freeing */
		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3080
3081
3082/*
3083 * Sync each mounted filesystem.
3084 */
3085#if DIAGNOSTIC
3086int syncprt = 0;
3087#endif
3088
3089int print_vmpage_stat = 0;
3090
3091/*
3092 * sync_callback: simple wrapper that calls VFS_SYNC() on volumes
3093 * mounted read-write with the passed waitfor value.
3094 *
3095 * Parameters: mp mount-point descriptor per mounted file-system instance.
3096 * arg user argument (please see below)
3097 *
3098 * User argument is a pointer to 32 bit unsigned integer which describes the
3099 * type of waitfor value to set for calling VFS_SYNC(). If user argument is
3100 * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3101 * waitfor value.
3102 *
3103 * Returns: VFS_RETURNED
3104 */
static int
sync_callback(mount_t mp, void *arg)
{
	/* Read-only mounts have nothing to flush */
	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
		int asyncflag = mp->mnt_flag & MNT_ASYNC;
		unsigned waitfor = MNT_NOWAIT;

		if (arg) {
			waitfor = *(uint32_t*)arg;
		}

		/* Sanity check for flags - these are the only valid combinations for the flag bits*/
		if (waitfor != MNT_WAIT &&
		    waitfor != (MNT_WAIT | MNT_VOLUME) &&
		    waitfor != MNT_NOWAIT &&
		    waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
		    waitfor != MNT_DWAIT &&
		    waitfor != (MNT_DWAIT | MNT_VOLUME)) {
			panic("Passed inappropriate waitfor %u to "
			    "sync_callback()", waitfor);
		}

		/* Temporarily clear MNT_ASYNC so the sync is not deferred */
		mp->mnt_flag &= ~MNT_ASYNC;
		(void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
		if (asyncflag) {
			mp->mnt_flag |= MNT_ASYNC;
		}
	}

	return VFS_RETURNED;
}
3136
3137/* ARGSUSED */
/*
 * sync(2) system call: request a non-blocking VFS_SYNC of every mounted
 * read-write filesystem (NULL arg selects the MNT_NOWAIT default in
 * sync_callback). Always returns 0.
 */
int
sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
{
	vfs_iterate(LK_NOWAIT, callout: sync_callback, NULL);

	/* Debug aid: optionally report dirty-page counts after the sweep */
	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
	return 0;
}
3154
/* Selects which class of media sync_internal_callback() will sync. */
typedef enum {
	SYNC_ALL = 0,                    /* no filtering: sync every mount */
	SYNC_ONLY_RELIABLE_MEDIA = 1,    /* only local, non-virtual-device mounts */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2   /* only virtual-device or non-local mounts */
} sync_type_t;
3160
3161static int
3162sync_internal_callback(mount_t mp, void *arg)
3163{
3164 if (arg) {
3165 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3166 (mp->mnt_flag & MNT_LOCAL);
3167 sync_type_t sync_type = *((sync_type_t *)arg);
3168
3169 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3170 return VFS_RETURNED;
3171 } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3172 return VFS_RETURNED;
3173 }
3174 }
3175
3176 (void)sync_callback(mp, NULL);
3177
3178 return VFS_RETURNED;
3179}
3180
3181int sync_thread_state = 0;
3182int sync_timeout_seconds = 5;
3183
3184#define SYNC_THREAD_RUN 0x0001
3185#define SYNC_THREAD_RUNNING 0x0002
3186
3187#if CONFIG_PHYS_WRITE_ACCT
3188thread_t pm_sync_thread;
3189#endif /* CONFIG_PHYS_WRITE_ACCT */
3190
/*
 * Kernel thread body spawned by sync_internal(): repeatedly performs a
 * two-pass sync (reliable media first, then unreliable) as long as the
 * SYNC_THREAD_RUN flag is re-armed, then signals waiters and exits.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(lck: &sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		/* Consume the run request; new requests re-set the flag */
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(lck: &sync_mtx_lck);

		/* Reliable (local, non-virtual) media first, then the rest */
		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, callout: sync_internal_callback, arg: &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, callout: sync_internal_callback, arg: &sync_type);

		lck_mtx_lock(lck: &sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(chan: &sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(lck: &sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3234
3235struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
3236
3237/*
3238 * An in-kernel sync for power management to call.
3239 * This function always returns within sync_timeout seconds.
3240 */
/*
 * In-kernel sync for power management: arm the sync thread (spawning it
 * if it is not already running) and wait for it to finish, but never
 * longer than sync_timeout_seconds. Always returns 0; a timeout is only
 * logged (rate-limited to once per 120 seconds).
 */
__private_extern__ int
sync_internal(void)
{
	thread_t thd = NULL;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(lck: &sync_mtx_lck);
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		/* Mark RUNNING before dropping the lock to avoid double-spawn */
		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(continuation: sync_thread, NULL, new_thread: &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(lck: &sync_mtx_lck);
			printf("sync_thread failed\n");
			return 0;
		}
		thread_created = TRUE;
	}

	/* PDROP releases sync_mtx_lck; woken by sync_thread before it exits */
	error = msleep(chan: (caddr_t)&sync_thread_state, mtx: &sync_mtx_lck,
	    pri: (PVFS | PDROP | PCATCH), wmesg: "sync_thread", ts: &ts);
	if (error) {
		struct timeval now;

		microtime(tv: &now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	if (thread_created) {
		/* Drop the creation reference; the thread runs independently */
		thread_deallocate(thread: thd);
	}

	return 0;
} /* end of sync_internal call */
3283
3284/*
3285 * Change filesystem quotas.
3286 */
3287#if QUOTA
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	/* Look up uap->path only to identify the mount it resides on. */
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(ndp: &nd);
	if (error) {
		return error;
	}
	mp = nd.ni_vp->v_mount;
	/* Keep the mount alive after the vnode iocount is released. */
	mount_ref(mp, 0);
	vnode_put(vp: nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, cmd: uap->cmd, id: uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(view: ZV_NAMEI);	/* freed in the second switch below */
		error = copyinstr(uaddr: uap->arg, kaddr: datap, MAXPATHLEN, done: &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			/* 64-bit layout must be munged into the kernel dqblk. */
			struct user_dqblk my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				munge_dqblk(dqblkp: &my_dqblk, user_dqblkp: &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	/* Only call into the filesystem if the copyin (if any) succeeded. */
	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* Release the pathname buffer allocated above. */
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk my_dqblk64;

				memset(s: &my_dqblk64, c: 0, n: sizeof(my_dqblk64));
				munge_dqblk(dqblkp: &my_dqblk, user_dqblkp: &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
3394#else
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	/* Quota support compiled out: the syscall is always unsupported. */
	return EOPNOTSUPP;
}
3400#endif /* QUOTA */
3401
3402static int
3403statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
3404{
3405 int error;
3406 vfs_context_t ctx = vfs_context_current();
3407
3408#if CONFIG_MACF
3409 error = mac_mount_check_stat(ctx, mp);
3410 if (error != 0) {
3411 return error;
3412 }
3413#endif
3414
3415 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3416 if (error != 0) {
3417 return error;
3418 }
3419
3420 return munge_statfs(mp, sfsp: &mp->mnt_vfsstat, bufp, NULL, is_64_bit: IS_64BIT_PROCESS(p), TRUE);
3421}
3422
3423/*
3424 * Get filesystem statistics.
3425 *
3426 * Returns: 0 Success
3427 * namei:???
3428 * vfs_update_vfsstat:???
3429 * munge_statfs:EFAULT
3430 */
3431/* ARGSUSED */
3432int
3433statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3434{
3435 int error;
3436 struct mount *mp;
3437 struct nameidata nd;
3438 vfs_context_t ctx = vfs_context_current();
3439 vnode_t vp;
3440
3441 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3442 UIO_USERSPACE, uap->path, ctx);
3443 error = namei(ndp: &nd);
3444 if (error != 0) {
3445 return error;
3446 }
3447 vp = nd.ni_vp;
3448 mp = vp->v_mount;
3449 nameidone(&nd);
3450
3451 error = statfs_internal(p, mp, bufp: uap->buf);
3452 vnode_put(vp);
3453
3454 return error;
3455}
3456
3457/*
3458 * Get filesystem statistics.
3459 */
3460/* ARGSUSED */
3461int
3462fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
3463{
3464 int error;
3465 vnode_t vp = NULL;
3466 struct mount *mp;
3467
3468 AUDIT_ARG(fd, uap->fd);
3469
3470 if ((error = file_vnode(uap->fd, &vp)) ||
3471 (error = vnode_getwithref(vp))) {
3472 goto out;
3473 }
3474
3475 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3476
3477 mp = vp->v_mount;
3478 if (!mp) {
3479 error = EBADF;
3480 goto out_vnode;
3481 }
3482
3483 error = statfs_internal(p, mp, bufp: uap->buf);
3484
3485out_vnode:
3486 vnode_put(vp);
3487
3488out:
3489 if (vp != NULL) {
3490 file_drop(uap->fd);
3491 }
3492
3493 return error;
3494}
3495
3496void
3497vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3498{
3499 struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3500
3501 bzero(s: sfs, n: sizeof(*sfs));
3502
3503 sfs->f_bsize = vsfs->f_bsize;
3504 sfs->f_iosize = (int32_t)vsfs->f_iosize;
3505 sfs->f_blocks = vsfs->f_blocks;
3506 sfs->f_bfree = vsfs->f_bfree;
3507 sfs->f_bavail = vsfs->f_bavail;
3508 sfs->f_files = vsfs->f_files;
3509 sfs->f_ffree = vsfs->f_ffree;
3510 sfs->f_fsid = vsfs->f_fsid;
3511 sfs->f_owner = vsfs->f_owner;
3512 sfs->f_type = mp->mnt_vtable->vfc_typenum;
3513 sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3514 sfs->f_fssubtype = vsfs->f_fssubtype;
3515 sfs->f_flags_ext = 0;
3516 if (mp->mnt_kern_flag & MNTK_SYSTEMDATA) {
3517 sfs->f_flags_ext |= MNT_EXT_ROOT_DATA_VOL;
3518 }
3519 if (mp->mnt_kern_flag & MNTK_FSKIT) {
3520 sfs->f_flags_ext |= MNT_EXT_FSKIT;
3521 }
3522 vfs_getfstypename(mp, buf: sfs->f_fstypename, MFSTYPENAMELEN);
3523 strlcpy(dst: &sfs->f_mntonname[0], src: &vsfs->f_mntonname[0], MAXPATHLEN);
3524 strlcpy(dst: &sfs->f_mntfromname[0], src: &vsfs->f_mntfromname[0], MAXPATHLEN);
3525}
3526
3527/*
3528 * Get file system statistics in 64-bit mode
3529 */
3530int
3531statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3532{
3533 struct mount *mp;
3534 int error;
3535 struct nameidata *ndp;
3536 struct statfs64 *sfsp;
3537 vfs_context_t ctxp = vfs_context_current();
3538 vnode_t vp;
3539 struct {
3540 struct nameidata nd;
3541 struct statfs64 sfs;
3542 } *__nameidata_statfs64;
3543
3544 __nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
3545 Z_WAITOK);
3546 ndp = &__nameidata_statfs64->nd;
3547
3548 NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3549 UIO_USERSPACE, uap->path, ctxp);
3550 error = namei(ndp);
3551 if (error != 0) {
3552 goto out;
3553 }
3554 vp = ndp->ni_vp;
3555 mp = vp->v_mount;
3556 nameidone(ndp);
3557
3558#if CONFIG_MACF
3559 error = mac_mount_check_stat(ctx: ctxp, mp);
3560 if (error != 0) {
3561 vnode_put(vp);
3562 goto out;
3563 }
3564#endif
3565
3566 error = vfs_update_vfsstat(mp, ctx: ctxp, VFS_USER_EVENT);
3567 if (error != 0) {
3568 vnode_put(vp);
3569 goto out;
3570 }
3571
3572 sfsp = &__nameidata_statfs64->sfs;
3573 vfs_get_statfs64(mp, sfs: sfsp);
3574 if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3575 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3576 /* This process does not want to see a seperate data volume mountpoint */
3577 strlcpy(dst: &sfsp->f_mntonname[0], src: "/", n: sizeof("/"));
3578 }
3579 error = copyout(sfsp, uap->buf, sizeof(*sfsp));
3580 vnode_put(vp);
3581
3582out:
3583 kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);
3584
3585 return error;
3586}
3587
3588/*
3589 * Get file system statistics in 64-bit mode
3590 */
3591int
3592fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3593{
3594 struct vnode *vp;
3595 struct mount *mp;
3596 struct statfs64 sfs;
3597 int error;
3598
3599 AUDIT_ARG(fd, uap->fd);
3600
3601 if ((error = file_vnode(uap->fd, &vp))) {
3602 return error;
3603 }
3604
3605 error = vnode_getwithref(vp);
3606 if (error) {
3607 file_drop(uap->fd);
3608 return error;
3609 }
3610
3611 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3612
3613 mp = vp->v_mount;
3614 if (!mp) {
3615 error = EBADF;
3616 goto out;
3617 }
3618
3619#if CONFIG_MACF
3620 error = mac_mount_check_stat(ctx: vfs_context_current(), mp);
3621 if (error != 0) {
3622 goto out;
3623 }
3624#endif
3625
3626 if ((error = vfs_update_vfsstat(mp, ctx: vfs_context_current(), VFS_USER_EVENT)) != 0) {
3627 goto out;
3628 }
3629
3630 vfs_get_statfs64(mp, sfs: &sfs);
3631 if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3632 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3633 /* This process does not want to see a seperate data volume mountpoint */
3634 strlcpy(dst: &sfs.f_mntonname[0], src: "/", n: sizeof("/"));
3635 }
3636 error = copyout(&sfs, uap->buf, sizeof(sfs));
3637
3638out:
3639 file_drop(uap->fd);
3640 vnode_put(vp);
3641
3642 return error;
3643}
3644
/* Shared state threaded through the getfsstat vfs_iterate callbacks. */
struct getfsstat_struct {
	user_addr_t sfsp;       /* user buffer cursor; advanced after each copyout */
	user_addr_t *mp;        /* optional array of user MAC-label pointers, or NULL */
	int count;              /* mounts visited so far (even if not copied out) */
	int maxcount;           /* records that fit in the user buffer */
	int flags;              /* caller's MNT_NOWAIT / MNT_WAIT / MNT_DWAIT flags */
	int error;              /* first error the callback hit, 0 if none */
};
3653
3654
/*
 * vfs_iterate() callback for getfsstat()/__mac_getfsstat(): copy one
 * statfs record per mount to user space until the buffer is full, then
 * keep counting mounts so the caller can report the total.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	/* Copy out only while there is still room in the user buffer. */
	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			/* Dead/unrefreshable mount: skip it but keep iterating. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sfsp: sp, bufp: fstp->sfsp, sizep: &my_size, is_64_bit: IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* Advance the user cursor by the size actually written. */
		fstp->sfsp += my_size;

		/* Optionally return the mount's MAC label alongside. */
		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, mac_p: *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	/* Counted even when not copied, so callers can size their buffers. */
	fstp->count++;
	return VFS_RETURNED;
}
3708
3709/*
3710 * Get statistics on all filesystems.
3711 */
3712int
3713getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3714{
3715 struct __mac_getfsstat_args muap;
3716
3717 muap.buf = uap->buf;
3718 muap.bufsize = uap->bufsize;
3719 muap.mac = USER_ADDR_NULL;
3720 muap.macsize = 0;
3721 muap.flags = uap->flags;
3722
3723 return __mac_getfsstat(p, &muap, retval);
3724}
3725
3726/*
3727 * __mac_getfsstat: Get MAC-related file system statistics
3728 *
3729 * Parameters: p (ignored)
3730 * uap User argument descriptor (see below)
3731 * retval Count of file system statistics (N stats)
3732 *
3733 * Indirect: uap->bufsize Buffer size
3734 * uap->macsize MAC info size
3735 * uap->buf Buffer where information will be returned
3736 * uap->mac MAC info
3737 * uap->flags File system flags
3738 *
3739 *
3740 * Returns: 0 Success
3741 * !0 Not success
3742 *
3743 */
3744int
3745__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
3746{
3747 user_addr_t sfsp;
3748 user_addr_t *mp;
3749 size_t count, maxcount, bufsize, macsize;
3750 struct getfsstat_struct fst;
3751
3752 if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
3753 return EINVAL;
3754 }
3755
3756 bufsize = (size_t) uap->bufsize;
3757 macsize = (size_t) uap->macsize;
3758
3759 if (IS_64BIT_PROCESS(p)) {
3760 maxcount = bufsize / sizeof(struct user64_statfs);
3761 } else {
3762 maxcount = bufsize / sizeof(struct user32_statfs);
3763 }
3764 sfsp = uap->buf;
3765 count = 0;
3766
3767 mp = NULL;
3768
3769#if CONFIG_MACF
3770 if (uap->mac != USER_ADDR_NULL) {
3771 u_int32_t *mp0;
3772 int error;
3773 unsigned int i;
3774
3775 count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
3776 if (count != maxcount) {
3777 return EINVAL;
3778 }
3779
3780 /* Copy in the array */
3781 mp0 = kalloc_data(macsize, Z_WAITOK);
3782 if (mp0 == NULL) {
3783 return ENOMEM;
3784 }
3785
3786 error = copyin(uap->mac, mp0, macsize);
3787 if (error) {
3788 kfree_data(mp0, macsize);
3789 return error;
3790 }
3791
3792 /* Normalize to an array of user_addr_t */
3793 mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
3794 if (mp == NULL) {
3795 kfree_data(mp0, macsize);
3796 return ENOMEM;
3797 }
3798
3799 for (i = 0; i < count; i++) {
3800 if (IS_64BIT_PROCESS(p)) {
3801 mp[i] = ((user_addr_t *)mp0)[i];
3802 } else {
3803 mp[i] = (user_addr_t)mp0[i];
3804 }
3805 }
3806 kfree_data(mp0, macsize);
3807 }
3808#endif
3809
3810
3811 fst.sfsp = sfsp;
3812 fst.mp = mp;
3813 fst.flags = uap->flags;
3814 fst.count = 0;
3815 fst.error = 0;
3816 fst.maxcount = (int)maxcount;
3817
3818
3819 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, callout: getfsstat_callback, arg: &fst);
3820
3821 if (mp) {
3822 kfree_data(mp, count * sizeof(user_addr_t));
3823 }
3824
3825 if (fst.error) {
3826 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3827 return fst.error;
3828 }
3829
3830 if (fst.sfsp && fst.count > fst.maxcount) {
3831 *retval = fst.maxcount;
3832 } else {
3833 *retval = fst.count;
3834 }
3835 return 0;
3836}
3837
/*
 * vfs_iterate() callback for getfsstat64(): copy one struct statfs64
 * per mount to user space until the buffer is full, then keep counting
 * mounts so the caller can report the total.
 */
static int
getfsstat64_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	struct statfs64 sfs;
	int error;

	/* Copy out only while there is still room in the user buffer. */
	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx: vfs_context_current(), mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the fsstat
		 * cache. MNT_WAIT overrides MNT_NOWAIT.
		 *
		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
		 * getfsstat, since the constants are out of the same
		 * namespace.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx: vfs_context_current(), VFS_USER_EVENT)))) {
			/* Dead/unrefreshable mount: skip it but keep iterating. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		vfs_get_statfs64(mp, sfs: &sfs);
		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* Advance the user cursor past the record just written. */
		fstp->sfsp += sizeof(sfs);
	}
	/* Counted even when not copied, so callers can size their buffers. */
	fstp->count++;
	return VFS_RETURNED;
}
3882
3883/*
3884 * Get statistics on all file systems in 64 bit mode.
3885 */
3886int
3887getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3888{
3889 user_addr_t sfsp;
3890 int count, maxcount;
3891 struct getfsstat_struct fst;
3892
3893 maxcount = uap->bufsize / sizeof(struct statfs64);
3894
3895 sfsp = uap->buf;
3896 count = 0;
3897
3898 fst.sfsp = sfsp;
3899 fst.flags = uap->flags;
3900 fst.count = 0;
3901 fst.error = 0;
3902 fst.maxcount = maxcount;
3903
3904 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, callout: getfsstat64_callback, arg: &fst);
3905
3906 if (fst.error) {
3907 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3908 return fst.error;
3909 }
3910
3911 if (fst.sfsp && fst.count > fst.maxcount) {
3912 *retval = fst.maxcount;
3913 } else {
3914 *retval = fst.count;
3915 }
3916
3917 return 0;
3918}
3919
3920/*
3921 * gets the associated vnode with the file descriptor passed.
3922 * as input
3923 *
3924 * INPUT
3925 * ctx - vfs context of caller
3926 * fd - file descriptor for which vnode is required.
3927 * vpp - Pointer to pointer to vnode to be returned.
3928 *
3929 * The vnode is returned with an iocount so any vnode obtained
3930 * by this call needs a vnode_put
3931 *
3932 */
3933int
3934vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3935{
3936 int error;
3937 vnode_t vp;
3938 struct fileproc *fp;
3939 proc_t p = vfs_context_proc(ctx);
3940
3941 *vpp = NULLVP;
3942
3943 error = fp_getfvp(p, fd, resultfp: &fp, resultvp: &vp);
3944 if (error) {
3945 return error;
3946 }
3947
3948 error = vnode_getwithref(vp);
3949 if (error) {
3950 (void)fp_drop(p, fd, fp, locked: 0);
3951 return error;
3952 }
3953
3954 (void)fp_drop(p, fd, fp, locked: 0);
3955 *vpp = vp;
3956 return error;
3957}
3958
3959/*
3960 * Wrapper function around namei to start lookup from a directory
3961 * specified by a file descriptor ni_dirfd.
3962 *
3963 * In addition to all the errors returned by namei, this call can
3964 * return ENOTDIR if the file descriptor does not refer to a directory.
3965 * and EBADF if the file descriptor is not valid.
3966 */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * dirfd only matters for a fresh, relative lookup: not AT_FDCWD,
	 * not a continued lookup, and no caller-supplied starting dvp.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first path byte to detect an absolute path. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			/* Resolve dirfd to a vnode (returned with an iocount). */
			error = vnode_getfromfd(ctx: ndp->ni_cnd.cn_context, fd: dirfd,
			    vpp: &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(vp: dvp_at) != VDIR) {
				vnode_put(vp: dvp_at);
				return ENOTDIR;
			}

			/* Start the lookup at dvp_at instead of the CWD. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(vp: dvp_at);
			return error;
		}
	}

	/* Absolute path, AT_FDCWD, or continued lookup: plain namei. */
	return namei(ndp);
}
4010
4011/*
4012 * Change current working directory to a given file descriptor.
4013 */
4014/* ARGSUSED */
int
fchdir(proc_t p, vfs_context_t ctx, int fd, bool per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;

	AUDIT_ARG(fd, fd);
	if (per_thread && fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding. The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(vp: tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* The new working directory must be a directory. */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, dvp: vp);
	if (error) {
		goto out;
	}
#endif
	/* Caller must have search permission on the directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If the directory is a mount point, descend to the root of the
	 * mounted filesystem (repeatedly, for stacked mounts).
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Trade the iocount for a long-lived usecount held by the CWD. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			/* Mark the process as having per-thread CWDs. */
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			/* No thread to attach the CWD to: undo the usecount. */
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* Lock order: dirs lock before fd lock; see chroot(). */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount on the previous working directory. */
	if (tvp) {
		vnode_rele(vp: tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(fd);

	return error;
}
4126
int
sys_fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	/* Process-wide fchdir: per_thread == false. */
	return fchdir(p, ctx: vfs_context_current(), fd: uap->fd, false);
}
4132
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	/* Per-thread fchdir; fd == -1 reverts to the process-wide CWD. */
	return fchdir(p, ctx: vfs_context_current(), fd: uap->fd, true);
}
4138
4139
4140/*
4141 * Change current working directory (".").
4142 *
4143 * Returns: 0 Success
4144 * change_dir:ENOTDIR
4145 * change_dir:???
4146 * vnode_ref:ENOENT No such file or directory
4147 */
4148/* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;

	/* change_dir validates VDIR, MACF chdir, and search permission. */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Trade the iocount for a long-lived usecount held by the CWD. */
	if ((error = vnode_ref(vp: ndp->ni_vp))) {
		vnode_put(vp: ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(vp: ndp->ni_vp);

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			/* Mark the process as having per-thread CWDs. */
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			/* No thread to attach the CWD to: undo the usecount. */
			vnode_rele(vp: ndp->ni_vp);
			return ENOENT;
		}
	} else {
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount on the previous working directory. */
	if (tvp) {
		vnode_rele(vp: tvp);
	}

	return 0;
}
4194
4195
4196/*
4197 * Change current working directory (".").
4198 *
4199 * Returns: 0 Success
4200 * chdir_internal:ENOTDIR
4201 * chdir_internal:ENOENT No such file or directory
4202 * chdir_internal:???
4203 */
4204/* ARGSUSED */
static int
common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
{
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/* Set up the lookup of the user path; chdir_internal runs it. */
	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	return chdir_internal(p, ctx, ndp: &nd, per_thread);
}
4216
4217
4218/*
4219 * chdir
4220 *
4221 * Change current working directory (".") for the entire process
4222 *
4223 * Parameters: p Process requesting the call
4224 * uap User argument descriptor (see below)
4225 * retval (ignored)
4226 *
4227 * Indirect parameters: uap->path Directory path
4228 *
4229 * Returns: 0 Success
4230 * common_chdir: ENOTDIR
4231 * common_chdir: ENOENT No such file or directory
4232 * common_chdir: ???
4233 *
4234 */
int
sys_chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
{
	/* Process-wide chdir: per_thread == 0. */
	return common_chdir(p, uap: (void *)uap, per_thread: 0);
}
4240
4241/*
4242 * __pthread_chdir
4243 *
4244 * Change current working directory (".") for a single thread
4245 *
4246 * Parameters: p Process requesting the call
4247 * uap User argument descriptor (see below)
4248 * retval (ignored)
4249 *
4250 * Indirect parameters: uap->path Directory path
4251 *
4252 * Returns: 0 Success
4253 * common_chdir: ENOTDIR
4254 * common_chdir: ENOENT No such file or directory
4255 * common_chdir: ???
4256 *
4257 */
int
__pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
{
	/* Per-thread chdir: per_thread == 1. */
	return common_chdir(p, uap: (void *)uap, per_thread: 1);
}
4263
4264
4265/*
4266 * Change notion of root (``/'') directory.
4267 */
4268/* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* Changing the root is restricted to the superuser. */
	if ((error = suser(cred: kauth_cred_get(), acflag: &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* change_dir validates VDIR, MACF chdir, and search permission. */
	error = change_dir(ndp: &nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, dvp: nd.ni_vp,
	    cnp: &nd.ni_cnd);
	if (error) {
		vnode_put(vp: nd.ni_vp);
		return error;
	}
#endif

	/* Trade the iocount from change_dir for a long-lived usecount. */
	if ((error = vnode_ref(vp: nd.ni_vp))) {
		vnode_put(vp: nd.ni_vp);
		return error;
	}
	vnode_put(vp: nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Drop the usecount on the previous root, if there was one. */
	if (tvp != NULL) {
		vnode_rele(vp: tvp);
	}

	return 0;
}
4326
4327#define PATHSTATICBUFLEN 256
4328#define PIVOT_ROOT_ENTITLEMENT \
4329 "com.apple.private.vfs.pivot-root"
4330
4331#if defined(XNU_TARGET_OS_OSX)
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	/* Small stack buffers; fall back to heap for long paths below. */
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(cred: kauth_cred_get(), acflag: &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy in the new-root path; retry with a MAXPATHLEN heap buffer. */
	error = copyinstr(uaddr: uap->new_rootfs_path_before, kaddr: &new_rootfs_path_before[0], PATHSTATICBUFLEN, done: &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uaddr: uap->new_rootfs_path_before, kaddr: new_rootfs_path_before_buf, MAXPATHLEN, done: &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Same for the path where the old root will be re-mounted. */
	error = copyinstr(uaddr: uap->old_rootfs_path_after, kaddr: &old_rootfs_path_after[0], PATHSTATICBUFLEN, done: &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uaddr: uap->old_rootfs_path_after, kaddr: old_rootfs_path_after_buf, MAXPATHLEN, done: &bytes_copied);
	}
	if (error) {
		goto out;
	}

	/* Pick whichever buffer (stack or heap) holds each path. */
	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(path: incoming, flags: 0, vpp: &incoming_rootvp, ctx: vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(vp: incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, fflag: 0, ctx: vfs_context_kernel());
	if (error) {
		goto out;
	}

	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	if (incoming_rootvp != NULLVP) {
		vnode_put(vp: incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
4423#else
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	/* pivot_root is only implemented on macOS targets. */
	return nosys(p, NULL, retval);
}
4429#endif /* XNU_TARGET_OS_OSX */
4430
4431/*
4432 * Common routine for chroot and chdir.
4433 *
4434 * Returns: 0 Success
4435 * ENOTDIR Not a directory
4436 * namei:??? [anything namei can return]
4437 * vnode_authorize:??? [anything vnode_authorize can return]
4438 */
4439static int
4440change_dir(struct nameidata *ndp, vfs_context_t ctx)
4441{
4442 vnode_t vp;
4443 int error;
4444
4445 if ((error = namei(ndp))) {
4446 return error;
4447 }
4448 nameidone(ndp);
4449 vp = ndp->ni_vp;
4450
4451 if (vp->v_type != VDIR) {
4452 vnode_put(vp);
4453 return ENOTDIR;
4454 }
4455
4456#if CONFIG_MACF
4457 error = mac_vnode_check_chdir(ctx, dvp: vp);
4458 if (error) {
4459 vnode_put(vp);
4460 return error;
4461 }
4462#endif
4463
4464 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4465 if (error) {
4466 vnode_put(vp);
4467 return error;
4468 }
4469
4470 return error;
4471}
4472
4473/*
4474 * Free the vnode data (for directories) associated with the file glob.
4475 */
4476struct fd_vn_data *
4477fg_vn_data_alloc(void)
4478{
4479 struct fd_vn_data *fvdata;
4480
4481 /* Allocate per fd vnode data */
4482 fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
4483 lck_mtx_init(lck: &fvdata->fv_lock, grp: &fd_vn_lck_grp, attr: &fd_vn_lck_attr);
4484 return fvdata;
4485}
4486
4487/*
4488 * Free the vnode data (for directories) associated with the file glob.
4489 */
4490void
4491fg_vn_data_free(void *fgvndata)
4492{
4493 struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4494
4495 kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4496 lck_mtx_destroy(lck: &fvdata->fv_lock, grp: &fd_vn_lck_grp);
4497 kfree_type(struct fd_vn_data, fvdata);
4498}
4499
4500/*
4501 * Check permissions, allocate an open file structure,
4502 * and call the device open routine if any.
4503 *
4504 * Returns: 0 Success
4505 * EINVAL
4506 * EINTR
4507 * falloc:ENFILE
4508 * falloc:EMFILE
4509 * falloc:ENOMEM
4510 * vn_open_auth:???
4511 * dupfdopen:???
4512 * VNOP_ADVLOCK:???
4513 * vnode_setsize:???
4514 *
4515 * XXX Need to implement uid, gid
4516 */
4517int
4518open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
4519 struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
4520{
4521 proc_t p = vfs_context_proc(ctx);
4522 kauth_cred_t p_cred = current_cached_proc_cred(PROC_NULL);
4523 uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
4524 struct fileproc *fp;
4525 vnode_t vp;
4526 int flags, oflags, amode;
4527 int type, indx, error;
4528 struct vfs_context context;
4529 vnode_t authvp = NULLVP;
4530
4531 oflags = uflags;
4532
4533 amode = oflags & O_ACCMODE;
4534 /*
4535 * Because O_RDONLY is 0, it is not possible to distinguish between
4536 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
4537 * with FREAD/FWRITE.
4538 */
4539 if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
4540 return EINVAL;
4541 }
4542
4543 flags = FFLAGS(uflags);
4544 CLR(flags, FENCRYPTED);
4545 CLR(flags, FUNENCRYPTED);
4546
4547 AUDIT_ARG(fflags, oflags);
4548 AUDIT_ARG(mode, vap->va_mode);
4549
4550 if ((error = falloc_withinit(p, p_cred, ctx, resultfp: &fp, resultfd: &indx, fp_init, initarg)) != 0) {
4551 return error;
4552 }
4553 if (flags & O_CLOEXEC) {
4554 fp->fp_flags |= FP_CLOEXEC;
4555 }
4556 if (flags & O_CLOFORK) {
4557 fp->fp_flags |= FP_CLOFORK;
4558 }
4559
4560 /* setup state to recognize when fdesc_open was called */
4561 uu->uu_dupfd = -1;
4562
4563 /*
4564 * Disable read/write access if file is opened with O_EVTONLY and
4565 * the process has requested to deny read/write access.
4566 */
4567 if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
4568 flags &= ~(FREAD | FWRITE);
4569 }
4570
4571 if (authfd != AUTH_OPEN_NOAUTHFD) {
4572 error = vnode_getfromfd(ctx, fd: authfd, vpp: &authvp);
4573 if (error) {
4574 fp_free(p, fd: indx, fp);
4575 return error;
4576 }
4577 }
4578
4579 if ((error = vn_open_auth(ndp, fmode: &flags, vap, authvp))) {
4580 if (authvp != NULLVP) {
4581 vnode_put(vp: authvp);
4582 }
4583 if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
4584 if ((error = dupfdopen(p, indx, dfd: uu->uu_dupfd, mode: flags, error)) == 0) {
4585 *retval = indx;
4586 return 0;
4587 }
4588 }
4589 if (error == ERESTART) {
4590 error = EINTR;
4591 }
4592 fp_free(p, fd: indx, fp);
4593 return error;
4594 }
4595
4596 if (authvp != NULLVP) {
4597 vnode_put(vp: authvp);
4598 }
4599
4600 uu->uu_dupfd = 0;
4601 vp = ndp->ni_vp;
4602
4603 fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
4604 fp->fp_glob->fg_ops = &vnops;
4605 fp_set_data(fp, fg_data: vp);
4606
4607#if CONFIG_FILE_LEASES
4608 /*
4609 * If we are creating a file or open with truncate, we need to break the
4610 * lease if there is a read lease placed on the parent dir.
4611 */
4612 if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
4613 vnode_breakdirlease(vp, true, oflags);
4614 }
4615 /* Now check if there is a lease placed on the file itself. */
4616 error = vnode_breaklease(vp, oflags, ctx);
4617 if (error) {
4618 goto bad;
4619 }
4620#endif /* CONFIG_FILE_LEASES */
4621
4622 if (flags & (O_EXLOCK | O_SHLOCK)) {
4623 struct flock lf = {
4624 .l_whence = SEEK_SET,
4625 };
4626
4627 if (flags & O_EXLOCK) {
4628 lf.l_type = F_WRLCK;
4629 } else {
4630 lf.l_type = F_RDLCK;
4631 }
4632 type = F_FLOCK;
4633 if ((flags & FNONBLOCK) == 0) {
4634 type |= F_WAIT;
4635 }
4636#if CONFIG_MACF
4637 error = mac_file_check_lock(cred: vfs_context_ucred(ctx), fg: fp->fp_glob,
4638 F_SETLK, fl: &lf);
4639 if (error) {
4640 goto bad;
4641 }
4642#endif
4643 if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
4644 goto bad;
4645 }
4646 fp->fp_glob->fg_flag |= FWASLOCKED;
4647 }
4648
4649 /* try to truncate by setting the size attribute */
4650 if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, ioflag: 0, ctx)) != 0)) {
4651 goto bad;
4652 }
4653
4654 /*
4655 * For directories we hold some additional information in the fd.
4656 */
4657 if (vnode_vtype(vp) == VDIR) {
4658 fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
4659 } else {
4660 fp->fp_glob->fg_vn_data = NULL;
4661 }
4662
4663#if CONFIG_SECLUDED_MEMORY
4664 if (secluded_for_filecache && vnode_vtype(vp) == VREG) {
4665 memory_object_control_t moc;
4666 const char *v_name;
4667
4668 moc = ubc_getobject(vp, UBC_FLAGS_NONE);
4669
4670 if (moc == MEMORY_OBJECT_CONTROL_NULL) {
4671 /* nothing to do... */
4672 } else if (fp->fp_glob->fg_flag & FWRITE) {
4673 /* writable -> no longer eligible for secluded pages */
4674 memory_object_mark_eligible_for_secluded(moc,
4675 FALSE);
4676 } else if (secluded_for_filecache == SECLUDED_FILECACHE_APPS) {
4677 char pathname[32] = { 0, };
4678 size_t copied;
4679 /* XXX FBDP: better way to detect /Applications/ ? */
4680 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4681 (void)copyinstr(ndp->ni_dirp,
4682 pathname,
4683 sizeof(pathname),
4684 &copied);
4685 } else {
4686 copystr(CAST_DOWN(void *, ndp->ni_dirp),
4687 pathname,
4688 sizeof(pathname),
4689 &copied);
4690 }
4691 pathname[sizeof(pathname) - 1] = '\0';
4692 if (strncmp(pathname,
4693 "/Applications/",
4694 strlen("/Applications/")) == 0 &&
4695 strncmp(pathname,
4696 "/Applications/Camera.app/",
4697 strlen("/Applications/Camera.app/")) != 0) {
4698 /*
4699 * not writable
4700 * AND from "/Applications/"
4701 * AND not from "/Applications/Camera.app/"
4702 * ==> eligible for secluded
4703 */
4704 memory_object_mark_eligible_for_secluded(moc,
4705 TRUE);
4706 }
4707 } else if (secluded_for_filecache == SECLUDED_FILECACHE_RDONLY &&
4708 (v_name = vnode_getname(vp))) {
4709 size_t len = strlen(v_name);
4710
4711 if (!strncmp(v_name, "dyld", len) ||
4712 !strncmp(v_name, "launchd", len) ||
4713 !strncmp(v_name, "Camera", len) ||
4714 !strncmp(v_name, "SpringBoard", len) ||
4715 !strncmp(v_name, "backboardd", len)) {
4716 /*
4717 * This file matters when launching Camera:
4718 * do not store its contents in the secluded
4719 * pool that will be drained on Camera launch.
4720 */
4721 memory_object_mark_eligible_for_secluded(moc,
4722 FALSE);
4723 } else if (!strncmp(v_name, "audiomxd", len) ||
4724 !strncmp(v_name, "mediaplaybackd", len)) {
4725 memory_object_mark_eligible_for_secluded(moc,
4726 FALSE);
4727 memory_object_mark_for_realtime(moc,
4728 true);
4729 } else if (!strncmp(v_name, "bluetoothd", len)) {
4730 /*
4731 * bluetoothd might be needed for realtime audio
4732 * playback.
4733 */
4734 memory_object_mark_eligible_for_secluded(moc,
4735 FALSE);
4736 memory_object_mark_for_realtime(moc,
4737 true);
4738 } else {
4739 char pathname[64] = { 0, };
4740 size_t copied;
4741 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4742 (void)copyinstr(ndp->ni_dirp,
4743 pathname,
4744 sizeof(pathname),
4745 &copied);
4746 } else {
4747 copystr(CAST_DOWN(void *, ndp->ni_dirp),
4748 pathname,
4749 sizeof(pathname),
4750 &copied);
4751 }
4752 pathname[sizeof(pathname) - 1] = '\0';
4753 if (strncmp(pathname,
4754 "/Library/Audio/Plug-Ins/",
4755 strlen("/Library/Audio/Plug-Ins/")) == 0 ||
4756 strncmp(pathname,
4757 "/System/Library/Audio/Plug-Ins/",
4758 strlen("/System/Library/Audio/Plug-Ins/")) == 0) {
4759 /*
4760 * This may be an audio plugin required
4761 * for realtime playback.
4762 * ==> NOT eligible for secluded.
4763 */
4764 memory_object_mark_eligible_for_secluded(moc,
4765 FALSE);
4766 memory_object_mark_for_realtime(moc,
4767 true);
4768 }
4769 }
4770 vnode_putname(v_name);
4771 }
4772 }
4773#endif /* CONFIG_SECLUDED_MEMORY */
4774
4775 vnode_put(vp);
4776
4777 /*
4778 * The first terminal open (without a O_NOCTTY) by a session leader
4779 * results in it being set as the controlling terminal.
4780 */
4781 if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
4782 !(flags & O_NOCTTY)) {
4783 int tmp = 0;
4784
4785 (void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
4786 (caddr_t)&tmp, ctx);
4787 }
4788
4789 proc_fdlock(p);
4790 procfdtbl_releasefd(p, fd: indx, NULL);
4791
4792 fp_drop(p, fd: indx, fp, locked: 1);
4793 proc_fdunlock(p);
4794
4795 *retval = indx;
4796
4797 return 0;
4798bad:
4799 context = *vfs_context_current();
4800 context.vc_ucred = fp->fp_glob->fg_cred;
4801
4802 if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
4803 (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
4804 struct flock lf = {
4805 .l_whence = SEEK_SET,
4806 .l_type = F_UNLCK,
4807 };
4808
4809 (void)VNOP_ADVLOCK(
4810 vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
4811 }
4812
4813 vn_close(vp, flags: fp->fp_glob->fg_flag, ctx: &context);
4814 vnode_put(vp);
4815 fp_free(p, fd: indx, fp);
4816
4817 return error;
4818}
4819
4820/*
4821 * While most of the *at syscall handlers can call nameiat() which
4822 * is a wrapper around namei, the use of namei and initialisation
4823 * of nameidata are far removed and in different functions - namei
4824 * gets called in vn_open_auth for open1. So we'll just do here what
4825 * nameiat() does.
4826 */
4827static int
4828open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
4829 struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
4830 int dirfd, int authfd)
4831{
4832 if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
4833 int error;
4834 char c;
4835
4836 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4837 error = copyin(ndp->ni_dirp, &c, sizeof(char));
4838 if (error) {
4839 return error;
4840 }
4841 } else {
4842 c = *((char *)(ndp->ni_dirp));
4843 }
4844
4845 if (c != '/') {
4846 vnode_t dvp_at;
4847
4848 error = vnode_getfromfd(ctx: ndp->ni_cnd.cn_context, fd: dirfd,
4849 vpp: &dvp_at);
4850 if (error) {
4851 return error;
4852 }
4853
4854 if (vnode_vtype(vp: dvp_at) != VDIR) {
4855 vnode_put(vp: dvp_at);
4856 return ENOTDIR;
4857 }
4858
4859 ndp->ni_dvp = dvp_at;
4860 ndp->ni_cnd.cn_flags |= USEDVP;
4861 error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
4862 retval, authfd);
4863 vnode_put(vp: dvp_at);
4864 return error;
4865 }
4866 }
4867
4868 return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
4869}
4870
4871/*
4872 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4873 *
4874 * Parameters: p Process requesting the open
4875 * uap User argument descriptor (see below)
4876 * retval Pointer to an area to receive the
 * return value from the system call
4878 *
4879 * Indirect: uap->path Path to open (same as 'open')
 * uap->flags Flags to open (same as 'open')
4881 * uap->uid UID to set, if creating
4882 * uap->gid GID to set, if creating
4883 * uap->mode File mode, if creating (same as 'open')
4884 * uap->xsecurity ACL to set, if creating
4885 *
4886 * Returns: 0 Success
4887 * !0 errno value
4888 *
4889 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4890 *
 * XXX: We should enumerate the possible errno values here, and where
4892 * in the code they originated.
4893 */
4894int
4895open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
4896{
4897 int ciferror;
4898 kauth_filesec_t xsecdst;
4899 struct vnode_attr va;
4900 struct nameidata nd;
4901 int cmode;
4902
4903 AUDIT_ARG(owner, uap->uid, uap->gid);
4904
4905 xsecdst = NULL;
4906 if ((uap->xsecurity != USER_ADDR_NULL) &&
4907 ((ciferror = kauth_copyinfilesec(xsecurity: uap->xsecurity, xsecdestpp: &xsecdst)) != 0)) {
4908 return ciferror;
4909 }
4910
4911 VATTR_INIT(&va);
4912 cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
4913 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4914 if (uap->uid != KAUTH_UID_NONE) {
4915 VATTR_SET(&va, va_uid, uap->uid);
4916 }
4917 if (uap->gid != KAUTH_GID_NONE) {
4918 VATTR_SET(&va, va_gid, uap->gid);
4919 }
4920 if (xsecdst != NULL) {
4921 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4922 va.va_vaflags |= VA_FILESEC_ACL;
4923 }
4924
4925 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4926 uap->path, vfs_context_current());
4927
4928 ciferror = open1(ctx: vfs_context_current(), ndp: &nd, uflags: uap->flags, vap: &va,
4929 NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
4930 if (xsecdst != NULL) {
4931 kauth_filesec_free(fsp: xsecdst);
4932 }
4933
4934 return ciferror;
4935}
4936
4937/*
4938 * Go through the data-protected atomically controlled open (2)
4939 *
4940 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4941 */
4942static int
4943openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4944 int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
4945{
4946 /*
4947 * Follow the same path as normal open(2)
4948 * Look up the item if it exists, and acquire the vnode.
4949 */
4950 struct vnode_attr va;
4951 struct nameidata nd;
4952 int cmode;
4953 int error;
4954 struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
4955
4956 VATTR_INIT(&va);
4957 /* Mask off all but regular access permissions */
4958 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4959 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4960
4961 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
4962 path, ctx);
4963
4964 /*
4965 * Initialize the extra fields in vnode_attr to pass down our
4966 * extra fields.
4967 * 1. target cprotect class.
4968 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4969 */
4970 if (flags & O_CREAT) {
4971 /* lower level kernel code validates that the class is valid before applying it. */
4972 if (class != PROTECTION_CLASS_DEFAULT) {
4973 /*
4974 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4975 * file behave the same as open (2)
4976 */
4977 VATTR_SET(&va, va_dataprotect_class, class);
4978 }
4979 }
4980
4981 if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
4982 if (flags & (O_RDWR | O_WRONLY)) {
4983 /*
4984 * Not allowed to write raw encrypted bytes or when opening authenticated.
4985 */
4986 return EINVAL;
4987 }
4988 if (dpflags & O_DP_GETRAWENCRYPTED) {
4989 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4990 }
4991 if (dpflags & O_DP_GETRAWUNENCRYPTED) {
4992 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4993 }
4994 if (dpflags & O_DP_AUTHENTICATE) {
4995 VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
4996 }
4997 }
4998
4999 error = open1at(ctx: vfs_context_current(), ndp: &nd, uflags: flags, vap: &va,
5000 NULL, NULL, retval, dirfd: fd, authfd);
5001
5002 return error;
5003}
5004
5005int
5006openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
5007{
5008 if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
5009 return EINVAL;
5010 }
5011
5012 return openat_dprotected_internal(ctx: vfs_context_current(), path: uap->path, flags: uap->flags, mode: uap->mode,
5013 class: uap->class, dpflags: uap->dpflags, fd: uap->fd, authfd: uap->authfd, segflg: UIO_USERSPACE, retval);
5014}
5015
5016int
5017open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
5018{
5019 if (uap->dpflags & O_DP_AUTHENTICATE) {
5020 return EINVAL;
5021 }
5022
5023 return openat_dprotected_internal(ctx: vfs_context_current(), path: uap->path, flags: uap->flags, mode: uap->mode,
5024 class: uap->class, dpflags: uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, segflg: UIO_USERSPACE, retval);
5025}
5026
5027static int
5028openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5029 int fd, enum uio_seg segflg, int *retval)
5030{
5031 struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5032 struct {
5033 struct vnode_attr va;
5034 struct nameidata nd;
5035 } *__open_data;
5036 struct vnode_attr *vap;
5037 struct nameidata *ndp;
5038 int cmode;
5039 int error;
5040
5041 __open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
5042 vap = &__open_data->va;
5043 ndp = &__open_data->nd;
5044
5045 VATTR_INIT(vap);
5046 /* Mask off all but regular access permissions */
5047 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5048 VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);
5049
5050 NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
5051 segflg, path, ctx);
5052
5053 error = open1at(ctx, ndp, uflags: flags, vap, NULL, NULL, retval, dirfd: fd, AUTH_OPEN_NOAUTHFD);
5054
5055 kfree_type(typeof(*__open_data), __open_data);
5056
5057 return error;
5058}
5059
int
open(proc_t p, struct open_args *uap, int32_t *retval)
{
	/* open(2) is a cancellation point; check, then defer to the nocancel variant. */
	__pthread_testcancel(presyscall: 1);
	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
}
5066
int
open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
    int32_t *retval)
{
	/* open(2) without a cancellation check; path resolves relative to the CWD. */
	return openat_internal(ctx: vfs_context_current(), path: uap->path, flags: uap->flags,
	    mode: uap->mode, AT_FDCWD, segflg: UIO_USERSPACE, retval);
}
5074
int
openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
    int32_t *retval)
{
	/* openat(2) without a cancellation check; relative paths anchor at uap->fd. */
	return openat_internal(ctx: vfs_context_current(), path: uap->path, flags: uap->flags,
	    mode: uap->mode, fd: uap->fd, segflg: UIO_USERSPACE, retval);
}
5082
int
openat(proc_t p, struct openat_args *uap, int32_t *retval)
{
	/* openat(2) is a cancellation point; check, then defer to the nocancel variant. */
	__pthread_testcancel(presyscall: 1);
	return openat_nocancel(p, uap: (struct openat_nocancel_args *)uap, retval);
}
5089
5090#define OPEN_BY_ID_ENTITLEMENT "com.apple.private.vfs.open-by-id"
5091
5092static boolean_t
5093vfs_context_can_open_by_id(vfs_context_t ctx)
5094{
5095 if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5096 return TRUE;
5097 }
5098
5099 return IOTaskHasEntitlement(task: vfs_context_task(ctx),
5100 OPEN_BY_ID_ENTITLEMENT);
5101}
5102
5103/*
5104 * openbyid_np: open a file given a file system id and a file system object id
5105 * the hfs file system object id is an fsobj_id_t {uint32, uint32}
 * for file systems that don't support object ids, it is a node id (uint64_t).
5107 *
5108 * Parameters: p Process requesting the open
5109 * uap User argument descriptor (see below)
5110 * retval Pointer to an area to receive the
 * return value from the system call
5112 *
5113 * Indirect: uap->path Path to open (same as 'open')
5114 *
5115 * uap->fsid id of target file system
5116 * uap->objid id of target file system object
5117 * uap->flags Flags to open (same as 'open')
5118 *
5119 * Returns: 0 Success
5120 * !0 errno value
5121 *
5122 *
 * XXX: We should enumerate the possible errno values here, and where
5124 * in the code they originated.
5125 */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Restricted to platform binaries or holders of the open-by-id entitlement. */
	if (!vfs_context_can_open_by_id(ctx)) {
		return EPERM;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/*
	 * Resolve the path from (fsid, objid), growing the buffer by
	 * MAXPATHLEN and retrying while the filesystem reports ENOSPC.
	 * On any error the buffer has already been freed before looping/exit.
	 */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	/* NUL-terminate, then open the resolved (kernel-space) path. */
	buf[pathlen] = 0;

	error = openat_internal(
		ctx, path: (user_addr_t)buf, flags: uap->oflags, mode: 0, AT_FDCWD, segflg: UIO_SYSSPACE, retval);

	kfree_data(buf, buflen + 1);

	return error;
}
5182
5183
5184/*
5185 * Create a special file.
5186 */
5187static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
5188 int fd);
5189
/*
 * Common implementation for mknod(2)/mknodat(2).  FIFO requests are
 * forwarded to mkfifo1(); character/block device nodes are created here
 * (superuser only).  'vap' carries the creation attributes (mode, rdev);
 * 'fd' is the directory fd for *at semantics (AT_FDCWD for plain mknod).
 */
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Creating device nodes requires superuser privileges. */
	if ((error = suser(cred: vfs_context_ucred(ctx), acflag: &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(ndp: &nd, dirfd: fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The target must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only character and block special files are handled here. */
	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp: nd.ni_dvp, cnp: &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_authorize(vp: dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Adding an entry to the directory: break any read lease on it first. */
	vnode_breakdirlease(vp: dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, name: nd.ni_cnd.cn_nameptr, name_len: nd.ni_cnd.cn_namelen, name_hashval: nd.ni_cnd.cn_hash, flags: update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(vp: dvp);

	return error;
}
5292
5293int
5294mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5295{
5296 struct vnode_attr va;
5297
5298 VATTR_INIT(&va);
5299 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5300 VATTR_SET(&va, va_rdev, uap->dev);
5301
5302 return mknodat_internal(p, upath: uap->path, vap: &va, mode: (mode_t)uap->mode, AT_FDCWD);
5303}
5304
5305int
5306mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5307{
5308 struct vnode_attr va;
5309
5310 VATTR_INIT(&va);
5311 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5312 VATTR_SET(&va, va_rdev, uap->dev);
5313
5314 return mknodat_internal(p, upath: uap->path, vap: &va, mode: (mode_t)uap->mode, fd: uap->fd);
5315}
5316
5317/*
5318 * Create a named pipe.
5319 *
5320 * Returns: 0 Success
5321 * EEXIST
5322 * namei:???
5323 * vnode_authorize:???
5324 * vn_create:???
5325 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	/* Look up the target name, holding the parent directory locked. */
	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(ndp: &nd, dirfd: fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(vp: dvp);

	return error;
}
5368
5369
5370/*
5371 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5372 *
5373 * Parameters: p Process requesting the open
5374 * uap User argument descriptor (see below)
5375 * retval (Ignored)
5376 *
5377 * Indirect: uap->path Path to fifo (same as 'mkfifo')
5378 * uap->uid UID to set
5379 * uap->gid GID to set
5380 * uap->mode File mode to set (same as 'mkfifo')
5381 * uap->xsecurity ACL to set, if creating
5382 *
5383 * Returns: 0 Success
5384 * !0 errno value
5385 *
5386 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
5387 *
5388 * XXX: We should enummerate the possible errno values here, and where
5389 * in the code they originated.
5390 */
5391int
5392mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
5393{
5394 int ciferror;
5395 kauth_filesec_t xsecdst;
5396 struct vnode_attr va;
5397
5398 AUDIT_ARG(owner, uap->uid, uap->gid);
5399
5400 xsecdst = KAUTH_FILESEC_NONE;
5401 if (uap->xsecurity != USER_ADDR_NULL) {
5402 if ((ciferror = kauth_copyinfilesec(xsecurity: uap->xsecurity, xsecdestpp: &xsecdst)) != 0) {
5403 return ciferror;
5404 }
5405 }
5406
5407 VATTR_INIT(&va);
5408 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5409 if (uap->uid != KAUTH_UID_NONE) {
5410 VATTR_SET(&va, va_uid, uap->uid);
5411 }
5412 if (uap->gid != KAUTH_GID_NONE) {
5413 VATTR_SET(&va, va_gid, uap->gid);
5414 }
5415 if (xsecdst != KAUTH_FILESEC_NONE) {
5416 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5417 va.va_vaflags |= VA_FILESEC_ACL;
5418 }
5419
5420 ciferror = mkfifo1(ctx: vfs_context_current(), upath: uap->path, vap: &va, AT_FDCWD);
5421
5422 if (xsecdst != KAUTH_FILESEC_NONE) {
5423 kauth_filesec_free(fsp: xsecdst);
5424 }
5425 return ciferror;
5426}
5427
5428/* ARGSUSED */
5429int
5430mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5431{
5432 struct vnode_attr va;
5433
5434 VATTR_INIT(&va);
5435 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5436
5437 return mkfifo1(ctx: vfs_context_current(), upath: uap->path, vap: &va, AT_FDCWD);
5438}
5439
5440int
5441mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5442{
5443 struct vnode_attr va;
5444
5445 VATTR_INIT(&va);
5446 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5447
5448 return mkfifo1(ctx: vfs_context_current(), upath: uap->path, vap: &va, fd: uap->fd);
5449}
5450
5451extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5452extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5453extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5454
/*
 * Build the path of 'dvp' (optionally with 'leafname' appended) into 'path',
 * a buffer of '_len' bytes.  Returns the resulting length including the NUL
 * and sets *truncated_path when the result does not reflect the full path.
 * Never fails outright: on lookup errors it falls back to an ancestor's
 * path, the mount point, or "/".
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	/* Resolve the vnode to a path, with or without firmlink translation. */
	if (firmlink) {
		ret = vn_getpath(vp: dvp, pathbuf: path, len: &len);
	} else {
		ret = vn_getpath_no_firmlink(vp: dvp, pathbuf: path, len: &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* Overwrite the trailing NUL with '/', then append the leaf name. */
			path[len - 1] = '/';
			len += strlcpy(dst: &path[len], src: leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(s: path, c: '/');
				if (ptr) {
					*ptr = '\0'; // chop off the string at the last directory component
				}
				len = (int)strlen(s: path) + 1;
			}
		}
	} else if (ret == 0) {
		/* Path resolved but is too long to safely append anything. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/* Walk up the hierarchy until some ancestor's path fits the buffer. */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(dst: path, src: mydvp->v_mount->mnt_vfsstat.f_mntonname, n: _len);
				break;
			} else {
				// no parent and no mount point? only thing is to punt and say "/" changed
				strlcpy(dst: path, src: "/", n: _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(vp: mydvp, pathbuf: path, len: &len);
			} else {
				ret = vn_getpath_no_firmlink(vp: mydvp, pathbuf: path, len: &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5522
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Firmlink-translated variant of safe_getpath_new(). */
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, firmlink: 1);
}
5528
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Variant of safe_getpath_new() that does not translate firmlinks. */
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, firmlink: 0);
}
5534
5535/*
5536 * Make a hard file link.
5537 *
5538 * Returns: 0 Success
5539 * EPERM
5540 * EEXIST
5541 * EXDEV
5542 * namei:???
5543 * vnode_authorize:???
5544 * VNOP_LINK:???
5545 */
5546/* ARGSUSED */
5547static int
5548linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
5549 user_addr_t link, int flag, enum uio_seg segflg)
5550{
5551 vnode_t vp, pvp, dvp, lvp;
5552 struct nameidata nd;
5553 int follow;
5554 int error;
5555#if CONFIG_FSE
5556 fse_info finfo;
5557#endif
5558 int need_event, has_listeners, need_kpath2;
5559 char *target_path = NULL;
5560 char *no_firmlink_path = NULL;
5561 int truncated = 0;
5562 int truncated_no_firmlink_path = 0;
5563
5564 vp = dvp = lvp = NULLVP;
5565
5566 /* look up the object we are linking to */
5567 follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
5568 NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
5569 segflg, path, ctx);
5570
5571 error = nameiat(ndp: &nd, dirfd: fd1);
5572 if (error) {
5573 return error;
5574 }
5575 vp = nd.ni_vp;
5576
5577 nameidone(&nd);
5578
5579 /*
5580 * Normally, linking to directories is not supported.
5581 * However, some file systems may have limited support.
5582 */
5583 if (vp->v_type == VDIR) {
5584 if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
5585 error = EPERM; /* POSIX */
5586 goto out;
5587 }
5588
5589 /* Linking to a directory requires ownership. */
5590 if (!kauth_cred_issuser(cred: vfs_context_ucred(ctx))) {
5591 struct vnode_attr dva;
5592
5593 VATTR_INIT(&dva);
5594 VATTR_WANTED(&dva, va_uid);
5595 if (vnode_getattr(vp, vap: &dva, ctx) != 0 ||
5596 !VATTR_IS_SUPPORTED(&dva, va_uid) ||
5597 (dva.va_uid != kauth_cred_getuid(cred: vfs_context_ucred(ctx)))) {
5598 error = EACCES;
5599 goto out;
5600 }
5601 }
5602 }
5603
5604 /* lookup the target node */
5605#if CONFIG_TRIGGERS
5606 nd.ni_op = OP_LINK;
5607#endif
5608 nd.ni_cnd.cn_nameiop = CREATE;
5609 nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
5610 nd.ni_dirp = link;
5611 error = nameiat(ndp: &nd, dirfd: fd2);
5612 if (error != 0) {
5613 goto out;
5614 }
5615 dvp = nd.ni_dvp;
5616 lvp = nd.ni_vp;
5617
5618#if CONFIG_MACF
5619 if ((error = mac_vnode_check_link(ctx, dvp, vp, cnp: &nd.ni_cnd)) != 0) {
5620 goto out2;
5621 }
5622#endif
5623
5624 /* or to anything that kauth doesn't want us to (eg. immutable items) */
5625 if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
5626 goto out2;
5627 }
5628
5629 /* target node must not exist */
5630 if (lvp != NULLVP) {
5631 error = EEXIST;
5632 goto out2;
5633 }
5634 /* cannot link across mountpoints */
5635 if (vnode_mount(vp) != vnode_mount(vp: dvp)) {
5636 error = EXDEV;
5637 goto out2;
5638 }
5639
5640 /* authorize creation of the target note */
5641 if ((error = vnode_authorize(vp: dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
5642 goto out2;
5643 }
5644
5645#if CONFIG_FILE_LEASES
5646 vnode_breakdirlease(vp: dvp, false, O_WRONLY);
5647#endif
5648
5649 /* and finally make the link */
5650 error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
5651 if (error) {
5652 goto out2;
5653 }
5654
5655#if CONFIG_MACF
5656 (void)mac_vnode_notify_link(ctx, vp, dvp, cnp: &nd.ni_cnd);
5657#endif
5658
5659#if CONFIG_FSE
5660 need_event = need_fsevent(FSE_CREATE_FILE, vp: dvp);
5661#else
5662 need_event = 0;
5663#endif
5664 has_listeners = kauth_authorize_fileop_has_listeners();
5665
5666 need_kpath2 = 0;
5667#if CONFIG_AUDIT
5668 if (AUDIT_RECORD_EXISTS()) {
5669 need_kpath2 = 1;
5670 }
5671#endif
5672
5673 if (need_event || has_listeners || need_kpath2) {
5674 char *link_to_path = NULL;
5675 int len, link_name_len;
5676 int len_no_firmlink_path = 0;
5677
5678 /* build the path to the new link file */
5679 GET_PATH(target_path);
5680
5681 len = safe_getpath(dvp, leafname: nd.ni_cnd.cn_nameptr, path: target_path, MAXPATHLEN, truncated_path: &truncated);
5682 if (no_firmlink_path == NULL) {
5683 GET_PATH(no_firmlink_path);
5684 }
5685 len_no_firmlink_path = safe_getpath_no_firmlink(dvp, leafname: nd.ni_cnd.cn_nameptr, path: no_firmlink_path, MAXPATHLEN, truncated_path: &truncated_no_firmlink_path);
5686
5687 AUDIT_ARG(kpath, target_path, ARG_KPATH2);
5688
5689 if (has_listeners) {
5690 /* build the path to file we are linking to */
5691 GET_PATH(link_to_path);
5692
5693 link_name_len = MAXPATHLEN;
5694 if (vn_getpath(vp, pathbuf: link_to_path, len: &link_name_len) == 0) {
5695 /*
5696 * Call out to allow 3rd party notification of rename.
5697 * Ignore result of kauth_authorize_fileop call.
5698 */
5699 kauth_authorize_fileop(credential: vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
5700 arg0: (uintptr_t)link_to_path,
5701 arg1: (uintptr_t)target_path);
5702 }
5703 if (link_to_path != NULL) {
5704 RELEASE_PATH(link_to_path);
5705 }
5706 }
5707#if CONFIG_FSE
5708 if (need_event) {
5709 /* construct fsevent */
5710 if (get_fse_info(vp, fse: &finfo, ctx) == 0) {
5711 if (truncated_no_firmlink_path) {
5712 finfo.mode |= FSE_TRUNCATED_PATH;
5713 }
5714
5715 // build the path to the destination of the link
5716 add_fsevent(FSE_CREATE_FILE, ctx,
5717 FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
5718 FSE_ARG_FINFO, &finfo,
5719 FSE_ARG_DONE);
5720 }
5721
5722 pvp = vp->v_parent;
5723 // need an iocount on parent vnode in this case
5724 if (pvp && pvp != dvp) {
5725 pvp = vnode_getparent_if_different(vp, dvp);
5726 }
5727 if (pvp) {
5728 add_fsevent(FSE_STAT_CHANGED, ctx,
5729 FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
5730 }
5731 if (pvp && pvp != dvp) {
5732 vnode_put(vp: pvp);
5733 }
5734 }
5735#endif
5736 }
5737out2:
5738 /*
5739 * nameidone has to happen before we vnode_put(dvp)
5740 * since it may need to release the fs_nodelock on the dvp
5741 */
5742 nameidone(&nd);
5743 if (target_path != NULL) {
5744 RELEASE_PATH(target_path);
5745 }
5746 if (no_firmlink_path != NULL) {
5747 RELEASE_PATH(no_firmlink_path);
5748 no_firmlink_path = NULL;
5749 }
5750out:
5751 if (lvp) {
5752 vnode_put(vp: lvp);
5753 }
5754 if (dvp) {
5755 vnode_put(vp: dvp);
5756 }
5757 vnode_put(vp);
5758 return error;
5759}
5760
5761int
5762link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5763{
5764 return linkat_internal(ctx: vfs_context_current(), AT_FDCWD, path: uap->path,
5765 AT_FDCWD, link: uap->link, AT_SYMLINK_FOLLOW, segflg: UIO_USERSPACE);
5766}
5767
5768int
5769linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5770{
5771 if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5772 return EINVAL;
5773 }
5774
5775 return linkat_internal(ctx: vfs_context_current(), fd1: uap->fd1, path: uap->path,
5776 fd2: uap->fd2, link: uap->link, flag: uap->flag, segflg: UIO_USERSPACE);
5777}
5778
5779/*
5780 * Make a symbolic link.
5781 *
5782 * We could add support for ACLs here too...
5783 */
5784/* ARGSUSED */
5785static int
5786symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
5787 user_addr_t link, enum uio_seg segflg)
5788{
5789 struct vnode_attr va;
5790 char *path;
5791 int error;
5792 struct nameidata nd;
5793 vnode_t vp, dvp;
5794 size_t dummy = 0;
5795 proc_t p;
5796
5797 error = 0;
5798 if (UIO_SEG_IS_USER_SPACE(segflg)) {
5799 path = zalloc(view: ZV_NAMEI);
5800 error = copyinstr(uaddr: path_data, kaddr: path, MAXPATHLEN, done: &dummy);
5801 } else {
5802 path = (char *)path_data;
5803 }
5804 if (error) {
5805 goto out;
5806 }
5807 AUDIT_ARG(text, path); /* This is the link string */
5808
5809 NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
5810 segflg, link, ctx);
5811
5812 error = nameiat(ndp: &nd, dirfd: fd);
5813 if (error) {
5814 goto out;
5815 }
5816 dvp = nd.ni_dvp;
5817 vp = nd.ni_vp;
5818
5819 p = vfs_context_proc(ctx);
5820 VATTR_INIT(&va);
5821 VATTR_SET(&va, va_type, VLNK);
5822 VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);
5823
5824#if CONFIG_MACF
5825 error = mac_vnode_check_create(ctx,
5826 dvp, cnp: &nd.ni_cnd, vap: &va);
5827#endif
5828 if (error != 0) {
5829 goto skipit;
5830 }
5831
5832 if (vp != NULL) {
5833 error = EEXIST;
5834 goto skipit;
5835 }
5836
5837 /* authorize */
5838 if (error == 0) {
5839 error = vnode_authorize(vp: dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
5840 }
5841 /* get default ownership, etc. */
5842 if (error == 0) {
5843 error = vnode_authattr_new(dvp, vap: &va, noauth: 0, ctx);
5844 }
5845
5846#if CONFIG_FILE_LEASES
5847 vnode_breakdirlease(vp: dvp, false, O_WRONLY);
5848#endif
5849
5850 if (error == 0) {
5851 error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
5852 }
5853
5854 /* do fallback attribute handling */
5855 if (error == 0 && vp) {
5856 error = vnode_setattr_fallback(vp, vap: &va, ctx);
5857 }
5858
5859#if CONFIG_MACF
5860 if (error == 0 && vp) {
5861 error = vnode_label(mp: vnode_mount(vp), dvp, vp, cnp: &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
5862 }
5863#endif
5864
5865 if (error == 0) {
5866 int update_flags = 0;
5867
5868 /*check if a new vnode was created, else try to get one*/
5869 if (vp == NULL) {
5870 nd.ni_cnd.cn_nameiop = LOOKUP;
5871#if CONFIG_TRIGGERS
5872 nd.ni_op = OP_LOOKUP;
5873#endif
5874 /*
5875 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
5876 * reallocated again in namei().
5877 */
5878 nd.ni_cnd.cn_flags &= HASBUF;
5879 error = nameiat(ndp: &nd, dirfd: fd);
5880 if (error) {
5881 goto skipit;
5882 }
5883 vp = nd.ni_vp;
5884 }
5885
5886#if 0 /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
5887 /* call out to allow 3rd party notification of rename.
5888 * Ignore result of kauth_authorize_fileop call.
5889 */
5890 if (kauth_authorize_fileop_has_listeners() &&
5891 namei(&nd) == 0) {
5892 char *new_link_path = NULL;
5893 int len;
5894
5895 /* build the path to the new link file */
5896 new_link_path = get_pathbuff();
5897 len = MAXPATHLEN;
5898 vn_getpath(dvp, new_link_path, &len);
5899 if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
5900 new_link_path[len - 1] = '/';
5901 strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
5902 }
5903
5904 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
5905 (uintptr_t)path, (uintptr_t)new_link_path);
5906 if (new_link_path != NULL) {
5907 release_pathbuff(new_link_path);
5908 }
5909 }
5910#endif
5911 // Make sure the name & parent pointers are hooked up
5912 if (vp->v_name == NULL) {
5913 update_flags |= VNODE_UPDATE_NAME;
5914 }
5915 if (vp->v_parent == NULLVP) {
5916 update_flags |= VNODE_UPDATE_PARENT;
5917 }
5918
5919 if (update_flags) {
5920 vnode_update_identity(vp, dvp, name: nd.ni_cnd.cn_nameptr, name_len: nd.ni_cnd.cn_namelen, name_hashval: nd.ni_cnd.cn_hash, flags: update_flags);
5921 }
5922
5923#if CONFIG_FSE
5924 add_fsevent(FSE_CREATE_FILE, ctx,
5925 FSE_ARG_VNODE, vp,
5926 FSE_ARG_DONE);
5927#endif
5928 }
5929
5930skipit:
5931 /*
5932 * nameidone has to happen before we vnode_put(dvp)
5933 * since it may need to release the fs_nodelock on the dvp
5934 */
5935 nameidone(&nd);
5936
5937 if (vp) {
5938 vnode_put(vp);
5939 }
5940 vnode_put(vp: dvp);
5941out:
5942 if (path && (path != (char *)path_data)) {
5943 zfree(ZV_NAMEI, path);
5944 }
5945
5946 return error;
5947}
5948
5949int
5950symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
5951{
5952 return symlinkat_internal(ctx: vfs_context_current(), path_data: uap->path, AT_FDCWD,
5953 link: uap->link, segflg: UIO_USERSPACE);
5954}
5955
5956int
5957symlinkat(__unused proc_t p, struct symlinkat_args *uap,
5958 __unused int32_t *retval)
5959{
5960 return symlinkat_internal(ctx: vfs_context_current(), path_data: uap->path1, fd: uap->fd,
5961 link: uap->path2, segflg: UIO_USERSPACE);
5962}
5963
5964/*
5965 * Delete a whiteout from the filesystem.
5966 * No longer supported.
5967 */
5968int
5969undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
5970{
5971 return ENOTSUP;
5972}
5973
5974/*
5975 * Delete a name from the filesystem.
5976 */
5977/* ARGSUSED */
5978static int
5979unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
5980 user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
5981{
5982 struct {
5983 struct nameidata nd;
5984#if CONFIG_FSE
5985 struct vnode_attr va;
5986 fse_info finfo;
5987#endif
5988 } *__unlink_data;
5989 struct nameidata *ndp;
5990 vnode_t vp, dvp;
5991 int error;
5992 struct componentname *cnp;
5993 char *path = NULL;
5994 char *no_firmlink_path = NULL;
5995 int len_path = 0;
5996 int len_no_firmlink_path = 0;
5997 int flags;
5998 int need_event;
5999 int has_listeners;
6000 int truncated_path;
6001 int truncated_no_firmlink_path;
6002 int batched;
6003 struct vnode_attr *vap;
6004 int do_retry;
6005 int retry_count = 0;
6006 int cn_flags;
6007 int nofollow_any = 0;
6008
6009 cn_flags = LOCKPARENT;
6010 if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
6011 cn_flags |= AUDITVNPATH1;
6012 }
6013 if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
6014 nofollow_any = NAMEI_NOFOLLOW_ANY;
6015 unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
6016 }
6017 /* If a starting dvp is passed, it trumps any fd passed. */
6018 if (start_dvp) {
6019 cn_flags |= USEDVP;
6020 }
6021
6022#if NAMEDRSRCFORK
6023 /* unlink or delete is allowed on rsrc forks and named streams */
6024 cn_flags |= CN_ALLOWRSRCFORK;
6025#endif
6026
6027 __unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
6028 ndp = &__unlink_data->nd;
6029#if CONFIG_FSE
6030 fse_info *finfop = &__unlink_data->finfo;
6031#endif
6032
6033retry:
6034 do_retry = 0;
6035 flags = 0;
6036 need_event = 0;
6037 has_listeners = 0;
6038 truncated_path = 0;
6039 truncated_no_firmlink_path = 0;
6040 vap = NULL;
6041
6042 NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);
6043
6044 ndp->ni_dvp = start_dvp;
6045 ndp->ni_flag |= NAMEI_COMPOUNDREMOVE | nofollow_any;
6046 cnp = &ndp->ni_cnd;
6047
6048continue_lookup:
6049 error = nameiat(ndp, dirfd: fd);
6050 if (error) {
6051 goto early_out;
6052 }
6053
6054 dvp = ndp->ni_dvp;
6055 vp = ndp->ni_vp;
6056
6057 /* With Carbon delete semantics, busy files cannot be deleted */
6058 if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
6059 flags |= VNODE_REMOVE_NODELETEBUSY;
6060 }
6061
6062 /* Skip any potential upcalls if told to. */
6063 if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
6064 flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
6065 }
6066
6067 if (vp) {
6068 batched = vnode_compound_remove_available(vp);
6069 /*
6070 * The root of a mounted filesystem cannot be deleted.
6071 */
6072 if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
6073 error = EBUSY;
6074 goto out;
6075 }
6076
6077#if DEVELOPMENT || DEBUG
6078 /*
6079 * XXX VSWAP: Check for entitlements or special flag here
6080 * so we can restrict access appropriately.
6081 */
6082#else /* DEVELOPMENT || DEBUG */
6083
6084 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
6085 error = EPERM;
6086 goto out;
6087 }
6088#endif /* DEVELOPMENT || DEBUG */
6089
6090 if (!batched) {
6091 error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
6092 if (error) {
6093 if (error == ENOENT) {
6094 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
6095 do_retry = 1;
6096 retry_count++;
6097 }
6098 }
6099 goto out;
6100 }
6101 }
6102 } else {
6103 batched = 1;
6104
6105 if (!vnode_compound_remove_available(vp: dvp)) {
6106 panic("No vp, but no compound remove?");
6107 }
6108 }
6109
6110#if CONFIG_FSE
6111 need_event = need_fsevent(FSE_DELETE, vp: dvp);
6112 if (need_event) {
6113 if (!batched) {
6114 if ((vp->v_flag & VISHARDLINK) == 0) {
6115 /* XXX need to get these data in batched VNOP */
6116 get_fse_info(vp, fse: finfop, ctx);
6117 }
6118 } else {
6119 error =
6120 vfs_get_notify_attributes(vap: &__unlink_data->va);
6121 if (error) {
6122 goto out;
6123 }
6124
6125 vap = &__unlink_data->va;
6126 }
6127 }
6128#endif
6129 has_listeners = kauth_authorize_fileop_has_listeners();
6130 if (need_event || has_listeners) {
6131 if (path == NULL) {
6132 GET_PATH(path);
6133 }
6134 len_path = safe_getpath(dvp, leafname: ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, truncated_path: &truncated_path);
6135 if (no_firmlink_path == NULL) {
6136 GET_PATH(no_firmlink_path);
6137 }
6138 len_no_firmlink_path = safe_getpath_no_firmlink(dvp, leafname: ndp->ni_cnd.cn_nameptr, path: no_firmlink_path, MAXPATHLEN, truncated_path: &truncated_no_firmlink_path);
6139 }
6140
6141#if NAMEDRSRCFORK
6142 if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
6143 error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
6144 } else
6145#endif
6146 {
6147#if CONFIG_FILE_LEASES
6148 vnode_breakdirlease(vp: dvp, false, O_WRONLY);
6149#endif
6150
6151 error = vn_remove(dvp, vpp: &ndp->ni_vp, ndp, flags, vap, ctx);
6152 vp = ndp->ni_vp;
6153 if (error == EKEEPLOOKING) {
6154 if (!batched) {
6155 panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
6156 }
6157
6158 if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
6159 panic("EKEEPLOOKING, but continue flag not set?");
6160 }
6161
6162 if (vnode_isdir(vp)) {
6163 error = EISDIR;
6164 goto out;
6165 }
6166 goto continue_lookup;
6167 } else if (error == ENOENT && batched) {
6168 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
6169 /*
6170 * For compound VNOPs, the authorization callback may
6171 * return ENOENT in case of racing hardlink lookups
6172 * hitting the name cache, redrive the lookup.
6173 */
6174 do_retry = 1;
6175 retry_count += 1;
6176 goto out;
6177 }
6178 }
6179 }
6180
6181 /*
6182 * Call out to allow 3rd party notification of delete.
6183 * Ignore result of kauth_authorize_fileop call.
6184 */
6185 if (!error) {
6186 if (has_listeners) {
6187 kauth_authorize_fileop(credential: vfs_context_ucred(ctx),
6188 KAUTH_FILEOP_DELETE,
6189 arg0: (uintptr_t)vp,
6190 arg1: (uintptr_t)path);
6191 }
6192
6193 if (vp->v_flag & VISHARDLINK) {
6194 //
6195 // if a hardlink gets deleted we want to blow away the
6196 // v_parent link because the path that got us to this
6197 // instance of the link is no longer valid. this will
6198 // force the next call to get the path to ask the file
6199 // system instead of just following the v_parent link.
6200 //
6201 vnode_update_identity(vp, NULL, NULL, name_len: 0, name_hashval: 0, VNODE_UPDATE_PARENT);
6202 }
6203
6204#if CONFIG_FSE
6205 if (need_event) {
6206 if (vp->v_flag & VISHARDLINK) {
6207 get_fse_info(vp, fse: finfop, ctx);
6208 } else if (vap) {
6209 vnode_get_fse_info_from_vap(vp, fse: finfop, vap);
6210 }
6211 if (truncated_path) {
6212 finfop->mode |= FSE_TRUNCATED_PATH;
6213 }
6214 add_fsevent(FSE_DELETE, ctx,
6215 FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
6216 FSE_ARG_FINFO, finfop,
6217 FSE_ARG_DONE);
6218 }
6219#endif
6220
6221#if CONFIG_MACF
6222 mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
6223#endif
6224 }
6225
6226out:
6227 if (path != NULL) {
6228 RELEASE_PATH(path);
6229 path = NULL;
6230 }
6231
6232 if (no_firmlink_path != NULL) {
6233 RELEASE_PATH(no_firmlink_path);
6234 no_firmlink_path = NULL;
6235 }
6236#if NAMEDRSRCFORK
6237 /* recycle the deleted rsrc fork vnode to force a reclaim, which
6238 * will cause its shadow file to go away if necessary.
6239 */
6240 if (vp && (vnode_isnamedstream(vp)) &&
6241 (vp->v_parent != NULLVP) &&
6242 vnode_isshadow(vp)) {
6243 vnode_recycle(vp);
6244 }
6245#endif
6246 /*
6247 * nameidone has to happen before we vnode_put(dvp)
6248 * since it may need to release the fs_nodelock on the dvp
6249 */
6250 nameidone(ndp);
6251 vnode_put(vp: dvp);
6252 if (vp) {
6253 vnode_put(vp);
6254 }
6255
6256 if (do_retry) {
6257 goto retry;
6258 }
6259
6260early_out:
6261 kfree_type(typeof(*__unlink_data), __unlink_data);
6262 return error;
6263}
6264
6265int
6266unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
6267 enum uio_seg segflg, int unlink_flags)
6268{
6269 return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
6270 unlink_flags);
6271}
6272
6273/*
6274 * Delete a name from the filesystem using Carbon semantics.
6275 */
6276int
6277delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
6278{
6279 return unlinkat_internal(ctx: vfs_context_current(), AT_FDCWD, NULLVP,
6280 path_arg: uap->path, segflg: UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
6281}
6282
6283/*
6284 * Delete a name from the filesystem using POSIX semantics.
6285 */
6286int
6287unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
6288{
6289 return unlinkat_internal(ctx: vfs_context_current(), AT_FDCWD, NULLVP,
6290 path_arg: uap->path, segflg: UIO_USERSPACE, unlink_flags: 0);
6291}
6292
6293int
6294unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6295{
6296 int unlink_flags = 0;
6297
6298 if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS | AT_SYMLINK_NOFOLLOW_ANY)) {
6299 return EINVAL;
6300 }
6301
6302 if (uap->flag & AT_SYMLINK_NOFOLLOW_ANY) {
6303 unlink_flags |= VNODE_REMOVE_NOFOLLOW_ANY;
6304 }
6305
6306 if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6307 if (uap->flag & AT_REMOVEDIR_DATALESS) {
6308 unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6309 }
6310 return rmdirat_internal(vfs_context_current(), uap->fd,
6311 uap->path, UIO_USERSPACE, unlink_flags);
6312 } else {
6313 return unlinkat_internal(ctx: vfs_context_current(), fd: uap->fd,
6314 NULLVP, path_arg: uap->path, segflg: UIO_USERSPACE, unlink_flags);
6315 }
6316}
6317
6318/*
6319 * Reposition read/write file offset.
6320 */
6321int
6322lseek(proc_t p, struct lseek_args *uap, off_t *retval)
6323{
6324 struct fileproc *fp;
6325 vnode_t vp;
6326 struct vfs_context *ctx;
6327 off_t offset = uap->offset, file_size;
6328 int error;
6329
6330 if ((error = fp_getfvp(p, fd: uap->fd, resultfp: &fp, resultvp: &vp))) {
6331 if (error == ENOTSUP) {
6332 return ESPIPE;
6333 }
6334 return error;
6335 }
6336 if (vnode_isfifo(vp)) {
6337 file_drop(uap->fd);
6338 return ESPIPE;
6339 }
6340
6341
6342 ctx = vfs_context_current();
6343#if CONFIG_MACF
6344 if (uap->whence == L_INCR && uap->offset == 0) {
6345 error = mac_file_check_get_offset(cred: vfs_context_ucred(ctx),
6346 fg: fp->fp_glob);
6347 } else {
6348 error = mac_file_check_change_offset(cred: vfs_context_ucred(ctx),
6349 fg: fp->fp_glob);
6350 }
6351 if (error) {
6352 file_drop(uap->fd);
6353 return error;
6354 }
6355#endif
6356 if ((error = vnode_getwithref(vp))) {
6357 file_drop(uap->fd);
6358 return error;
6359 }
6360
6361 switch (uap->whence) {
6362 case L_INCR:
6363 offset += fp->fp_glob->fg_offset;
6364 break;
6365 case L_XTND:
6366 if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
6367 break;
6368 }
6369 offset += file_size;
6370 break;
6371 case L_SET:
6372 break;
6373 case SEEK_HOLE:
6374 error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, data: (caddr_t)&offset, fflag: 0, ctx);
6375 break;
6376 case SEEK_DATA:
6377 error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, data: (caddr_t)&offset, fflag: 0, ctx);
6378 break;
6379 default:
6380 error = EINVAL;
6381 }
6382 if (error == 0) {
6383 if (uap->offset > 0 && offset < 0) {
6384 /* Incremented/relative move past max size */
6385 error = EOVERFLOW;
6386 } else {
6387 /*
6388 * Allow negative offsets on character devices, per
6389 * POSIX 1003.1-2001. Most likely for writing disk
6390 * labels.
6391 */
6392 if (offset < 0 && vp->v_type != VCHR) {
6393 /* Decremented/relative move before start */
6394 error = EINVAL;
6395 } else {
6396 /* Success */
6397 fp->fp_glob->fg_offset = offset;
6398 *retval = fp->fp_glob->fg_offset;
6399 }
6400 }
6401 }
6402
6403 /*
6404 * An lseek can affect whether data is "available to read." Use
6405 * hint of NOTE_NONE so no EVFILT_VNODE events fire
6406 */
6407 post_event_if_success(vp, error, NOTE_NONE);
6408 (void)vnode_put(vp);
6409 file_drop(uap->fd);
6410 return error;
6411}
6412
6413
6414/*
6415 * Check access permissions.
6416 *
6417 * Returns: 0 Success
6418 * vnode_authorize:???
6419 */
6420static int
6421access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6422{
6423 kauth_action_t action;
6424 int error;
6425
6426 /*
6427 * If just the regular access bits, convert them to something
6428 * that vnode_authorize will understand.
6429 */
6430 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6431 action = 0;
6432 if (uflags & R_OK) {
6433 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
6434 }
6435 if (uflags & W_OK) {
6436 if (vnode_isdir(vp)) {
6437 action |= KAUTH_VNODE_ADD_FILE |
6438 KAUTH_VNODE_ADD_SUBDIRECTORY;
6439 /* might want delete rights here too */
6440 } else {
6441 action |= KAUTH_VNODE_WRITE_DATA;
6442 }
6443 }
6444 if (uflags & X_OK) {
6445 if (vnode_isdir(vp)) {
6446 action |= KAUTH_VNODE_SEARCH;
6447 } else {
6448 action |= KAUTH_VNODE_EXECUTE;
6449 }
6450 }
6451 } else {
6452 /* take advantage of definition of uflags */
6453 action = uflags >> 8;
6454 }
6455
6456#if CONFIG_MACF
6457 error = mac_vnode_check_access(ctx, vp, acc_mode: uflags);
6458 if (error) {
6459 return error;
6460 }
6461#endif /* MAC */
6462
6463 /* action == 0 means only check for existence */
6464 if (action != 0) {
6465 error = vnode_authorize(vp, dvp, action: action | KAUTH_VNODE_ACCESS, ctx);
6466 } else {
6467 error = 0;
6468 }
6469
6470 return error;
6471}
6472
6473
6474
6475/*
6476 * access_extended: Check access permissions in bulk.
6477 *
6478 * Description: uap->entries Pointer to an array of accessx
6479 * descriptor structs, plus one or
6480 * more NULL terminated strings (see
6481 * "Notes" section below).
6482 * uap->size Size of the area pointed to by
6483 * uap->entries.
6484 * uap->results Pointer to the results array.
6485 *
6486 * Returns: 0 Success
6487 * ENOMEM Insufficient memory
6488 * EINVAL Invalid arguments
6489 * namei:EFAULT Bad address
6490 * namei:ENAMETOOLONG Filename too long
6491 * namei:ENOENT No such file or directory
6492 * namei:ELOOP Too many levels of symbolic links
6493 * namei:EBADF Bad file descriptor
6494 * namei:ENOTDIR Not a directory
6495 * namei:???
6496 * access1:
6497 *
6498 * Implicit returns:
6499 * uap->results Array contents modified
6500 *
6501 * Notes: The uap->entries are structured as an arbitrary length array
6502 * of accessx descriptors, followed by one or more NULL terminated
6503 * strings
6504 *
6505 * struct accessx_descriptor[0]
6506 * ...
6507 * struct accessx_descriptor[n]
6508 * char name_data[0];
6509 *
6510 * We determine the entry count by walking the buffer containing
6511 * the uap->entries argument descriptor. For each descriptor we
6512 * see, the valid values for the offset ad_name_offset will be
6513 * in the byte range:
6514 *
6515 * [ uap->entries + sizeof(struct accessx_descriptor) ]
6516 * to
6517 * [ uap->entries + uap->size - 2 ]
6518 *
6519 * since we must have at least one string, and the string must
6520 * be at least one character plus the NULL terminator in length.
6521 *
6522 * XXX: Need to support the check-as uid argument
6523 */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual = 0;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory. Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o There must be sufficient room in the request for at least one
	 * descriptor and a one byte NUL terminated string.
	 * o The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	/* small requests are served out of the on-stack buffer */
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kalloc_data(uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid namei() running
	 * off the end. If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity. This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(cred: kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS. If the number being
	 * requested does not exceed this limit,
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(vp: dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode. We will need it if we are
			 * deleting, since we must have rights to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(ndp: &nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			/* per-entry failure: record it and continue with the next entry */
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, uflags: input[i].ad_flags, ctx: &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input) {
		kfree_data(input, uap->size);
	}
	if (result) {
		kfree_data(result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(vp: dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6765
6766
6767/*
6768 * Returns: 0 Success
6769 * namei:EFAULT Bad address
6770 * namei:ENAMETOOLONG Filename too long
6771 * namei:ENOENT No such file or directory
6772 * namei:ELOOP Too many levels of symbolic links
6773 * namei:EBADF Bad file descriptor
6774 * namei:ENOTDIR Not a directory
6775 * namei:???
6776 * access1:
6777 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity. So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		context.vc_ucred = kauth_cred_copy_real(cred: kauth_cred_get());
	} else {
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	/* either NOFOLLOW flag suppresses following of a trailing symlink */
	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* also refuse symlinks in intermediate path components */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(ndp: &nd, dirfd: fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(vp: nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(vp: nd.ni_vp);
	}
#endif

	/* perform the actual permission evaluation */
	error = access1(vp: nd.ni_vp, dvp: nd.ni_dvp, uflags: amode, ctx: &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(vp: nd.ni_vp);
	}
#endif

	vnode_put(vp: nd.ni_vp);
	if (amode & _DELETE_OK) {
		/* release the parent obtained via WANTPARENT */
		vnode_put(vp: nd.ni_dvp);
	}
	nameidone(&nd);

out:
	if (!(flag & AT_EACCESS)) {
		/* drop the real-identity credential taken above */
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6859
6860int
6861access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6862{
6863 return faccessat_internal(ctx: vfs_context_current(), AT_FDCWD,
6864 path: uap->path, amode: uap->flags, flag: 0, segflg: UIO_USERSPACE);
6865}
6866
6867int
6868faccessat(__unused proc_t p, struct faccessat_args *uap,
6869 __unused int32_t *retval)
6870{
6871 if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
6872 return EINVAL;
6873 }
6874
6875 return faccessat_internal(ctx: vfs_context_current(), fd: uap->fd,
6876 path: uap->path, amode: uap->amode, flag: uap->flag, segflg: UIO_USERSPACE);
6877}
6878
6879/*
6880 * Returns: 0 Success
6881 * EFAULT
6882 * copyout:EFAULT
6883 * namei:???
6884 * vn_stat:???
6885 */
6886static int
6887fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6888 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6889 enum uio_seg segflg, int fd, int flag)
6890{
6891 struct nameidata *ndp = NULL;
6892 int follow;
6893 union {
6894 struct stat sb;
6895 struct stat64 sb64;
6896 } source = {};
6897 union {
6898 struct user64_stat user64_sb;
6899 struct user32_stat user32_sb;
6900 struct user64_stat64 user64_sb64;
6901 struct user32_stat64 user32_sb64;
6902 } dest = {};
6903 caddr_t sbp;
6904 int error, my_size;
6905 kauth_filesec_t fsec = KAUTH_FILESEC_NONE;
6906 size_t xsecurity_bufsize;
6907 void * statptr;
6908 struct fileproc *fp = NULL;
6909 int needsrealdev = 0;
6910
6911 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
6912 ndp = kalloc_type(struct nameidata, Z_WAITOK);
6913 NDINIT(ndp, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6914 segflg, path, ctx);
6915 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
6916 ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
6917 }
6918
6919#if NAMEDRSRCFORK
6920 int is_namedstream = 0;
6921 /* stat calls are allowed for resource forks. */
6922 ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6923#endif
6924
6925 if (flag & AT_FDONLY) {
6926 vnode_t fvp;
6927
6928 error = fp_getfvp(p: vfs_context_proc(ctx), fd, resultfp: &fp, resultvp: &fvp);
6929 if (error) {
6930 goto out;
6931 }
6932 if ((error = vnode_getwithref(vp: fvp))) {
6933 file_drop(fd);
6934 goto out;
6935 }
6936 ndp->ni_vp = fvp;
6937 } else {
6938 error = nameiat(ndp, dirfd: fd);
6939 if (error) {
6940 goto out;
6941 }
6942 }
6943
6944 statptr = (void *)&source;
6945
6946#if NAMEDRSRCFORK
6947 /* Grab reference on the shadow stream file vnode to
6948 * force an inactive on release which will mark it
6949 * for recycle.
6950 */
6951 if (vnode_isnamedstream(vp: ndp->ni_vp) &&
6952 (ndp->ni_vp->v_parent != NULLVP) &&
6953 vnode_isshadow(ndp->ni_vp)) {
6954 is_namedstream = 1;
6955 vnode_ref(vp: ndp->ni_vp);
6956 }
6957#endif
6958
6959 needsrealdev = flag & AT_REALDEV ? 1 : 0;
6960 if (fp && (xsecurity == USER_ADDR_NULL)) {
6961 /*
6962 * If the caller has the file open, and is not
6963 * requesting extended security information, we are
6964 * going to let them get the basic stat information.
6965 */
6966 error = vn_stat_noauth(vp: ndp->ni_vp, sb: statptr, NULL, isstat64, needsrealdev, ctx,
6967 file_cred: fp->fp_glob->fg_cred);
6968 } else {
6969 error = vn_stat(vp: ndp->ni_vp, sb: statptr, xsec: (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6970 isstat64, needsrealdev, ctx);
6971 }
6972
6973#if NAMEDRSRCFORK
6974 if (is_namedstream) {
6975 vnode_rele(vp: ndp->ni_vp);
6976 }
6977#endif
6978 vnode_put(vp: ndp->ni_vp);
6979 nameidone(ndp);
6980
6981 if (fp) {
6982 file_drop(fd);
6983 fp = NULL;
6984 }
6985
6986 if (error) {
6987 goto out;
6988 }
6989 /* Zap spare fields */
6990 if (isstat64 != 0) {
6991 source.sb64.st_lspare = 0;
6992 source.sb64.st_qspare[0] = 0LL;
6993 source.sb64.st_qspare[1] = 0LL;
6994 if (vfs_context_is64bit(ctx)) {
6995 munge_user64_stat64(sbp: &source.sb64, usbp: &dest.user64_sb64);
6996 my_size = sizeof(dest.user64_sb64);
6997 sbp = (caddr_t)&dest.user64_sb64;
6998 } else {
6999 munge_user32_stat64(sbp: &source.sb64, usbp: &dest.user32_sb64);
7000 my_size = sizeof(dest.user32_sb64);
7001 sbp = (caddr_t)&dest.user32_sb64;
7002 }
7003 /*
7004 * Check if we raced (post lookup) against the last unlink of a file.
7005 */
7006 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
7007 source.sb64.st_nlink = 1;
7008 }
7009 } else {
7010 source.sb.st_lspare = 0;
7011 source.sb.st_qspare[0] = 0LL;
7012 source.sb.st_qspare[1] = 0LL;
7013 if (vfs_context_is64bit(ctx)) {
7014 munge_user64_stat(sbp: &source.sb, usbp: &dest.user64_sb);
7015 my_size = sizeof(dest.user64_sb);
7016 sbp = (caddr_t)&dest.user64_sb;
7017 } else {
7018 munge_user32_stat(sbp: &source.sb, usbp: &dest.user32_sb);
7019 my_size = sizeof(dest.user32_sb);
7020 sbp = (caddr_t)&dest.user32_sb;
7021 }
7022
7023 /*
7024 * Check if we raced (post lookup) against the last unlink of a file.
7025 */
7026 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
7027 source.sb.st_nlink = 1;
7028 }
7029 }
7030 if ((error = copyout(sbp, ub, my_size)) != 0) {
7031 goto out;
7032 }
7033
7034 /* caller wants extended security information? */
7035 if (xsecurity != USER_ADDR_NULL) {
7036 /* did we get any? */
7037 if (fsec == KAUTH_FILESEC_NONE) {
7038 if (susize(xsecurity_size, 0) != 0) {
7039 error = EFAULT;
7040 goto out;
7041 }
7042 } else {
7043 /* find the user buffer size */
7044 xsecurity_bufsize = fusize(xsecurity_size);
7045
7046 /* copy out the actual data size */
7047 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
7048 error = EFAULT;
7049 goto out;
7050 }
7051
7052 /* if the caller supplied enough room, copy out to it */
7053 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
7054 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
7055 }
7056 }
7057 }
7058out:
7059 if (ndp) {
7060 kfree_type(struct nameidata, ndp);
7061 }
7062 if (fsec != KAUTH_FILESEC_NONE) {
7063 kauth_filesec_free(fsp: fsec);
7064 }
7065 return error;
7066}
7067
7068/*
7069 * stat_extended: Get file status; with extended security (ACL).
7070 *
7071 * Parameters: p (ignored)
7072 * uap User argument descriptor (see below)
7073 * retval (ignored)
7074 *
7075 * Indirect: uap->path Path of file to get status from
7076 * uap->ub User buffer (holds file status info)
7077 * uap->xsecurity ACL to get (extended security)
7078 * uap->xsecurity_size Size of ACL
7079 *
7080 * Returns: 0 Success
7081 * !0 errno value
7082 *
7083 */
7084int
7085stat_extended(__unused proc_t p, struct stat_extended_args *uap,
7086 __unused int32_t *retval)
7087{
7088 return fstatat_internal(ctx: vfs_context_current(), path: uap->path, ub: uap->ub,
7089 xsecurity: uap->xsecurity, xsecurity_size: uap->xsecurity_size, isstat64: 0, segflg: UIO_USERSPACE, AT_FDCWD,
7090 flag: 0);
7091}
7092
7093/*
7094 * Returns: 0 Success
7095 * fstatat_internal:??? [see fstatat_internal() in this file]
7096 */
7097int
7098stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7099{
7100 return fstatat_internal(ctx: vfs_context_current(), path: uap->path, ub: uap->ub,
7101 xsecurity: 0, xsecurity_size: 0, isstat64: 0, segflg: UIO_USERSPACE, AT_FDCWD, flag: 0);
7102}
7103
7104int
7105stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
7106{
7107 return fstatat_internal(ctx: vfs_context_current(), path: uap->path, ub: uap->ub,
7108 xsecurity: 0, xsecurity_size: 0, isstat64: 1, segflg: UIO_USERSPACE, AT_FDCWD, flag: 0);
7109}
7110
7111/*
7112 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7113 *
7114 * Parameters: p (ignored)
7115 * uap User argument descriptor (see below)
7116 * retval (ignored)
7117 *
7118 * Indirect: uap->path Path of file to get status from
7119 * uap->ub User buffer (holds file status info)
7120 * uap->xsecurity ACL to get (extended security)
7121 * uap->xsecurity_size Size of ACL
7122 *
7123 * Returns: 0 Success
7124 * !0 errno value
7125 *
7126 */
7127int
7128stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
7129{
7130 return fstatat_internal(ctx: vfs_context_current(), path: uap->path, ub: uap->ub,
7131 xsecurity: uap->xsecurity, xsecurity_size: uap->xsecurity_size, isstat64: 1, segflg: UIO_USERSPACE, AT_FDCWD,
7132 flag: 0);
7133}
7134
7135/*
7136 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7137 *
7138 * Parameters: p (ignored)
7139 * uap User argument descriptor (see below)
7140 * retval (ignored)
7141 *
7142 * Indirect: uap->path Path of file to get status from
7143 * uap->ub User buffer (holds file status info)
7144 * uap->xsecurity ACL to get (extended security)
7145 * uap->xsecurity_size Size of ACL
7146 *
7147 * Returns: 0 Success
7148 * !0 errno value
7149 *
7150 */
7151int
7152lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
7153{
7154 return fstatat_internal(ctx: vfs_context_current(), path: uap->path, ub: uap->ub,
7155 xsecurity: uap->xsecurity, xsecurity_size: uap->xsecurity_size, isstat64: 0, segflg: UIO_USERSPACE, AT_FDCWD,
7156 AT_SYMLINK_NOFOLLOW);
7157}
7158
7159/*
7160 * Get file status; this version does not follow links.
7161 */
7162int
7163lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7164{
7165 return fstatat_internal(ctx: vfs_context_current(), path: uap->path, ub: uap->ub,
7166 xsecurity: 0, xsecurity_size: 0, isstat64: 0, segflg: UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7167}
7168
7169int
7170lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
7171{
7172 return fstatat_internal(ctx: vfs_context_current(), path: uap->path, ub: uap->ub,
7173 xsecurity: 0, xsecurity_size: 0, isstat64: 1, segflg: UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7174}
7175
7176/*
7177 * lstat64_extended: Get file status; can handle large inode numbers; does not
7178 * follow links; with extended security (ACL).
7179 *
7180 * Parameters: p (ignored)
7181 * uap User argument descriptor (see below)
7182 * retval (ignored)
7183 *
7184 * Indirect: uap->path Path of file to get status from
7185 * uap->ub User buffer (holds file status info)
7186 * uap->xsecurity ACL to get (extended security)
7187 * uap->xsecurity_size Size of ACL
7188 *
7189 * Returns: 0 Success
7190 * !0 errno value
7191 *
7192 */
7193int
7194lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
7195{
7196 return fstatat_internal(ctx: vfs_context_current(), path: uap->path, ub: uap->ub,
7197 xsecurity: uap->xsecurity, xsecurity_size: uap->xsecurity_size, isstat64: 1, segflg: UIO_USERSPACE, AT_FDCWD,
7198 AT_SYMLINK_NOFOLLOW);
7199}
7200
7201int
7202fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7203{
7204 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7205 return EINVAL;
7206 }
7207
7208 return fstatat_internal(ctx: vfs_context_current(), path: uap->path, ub: uap->ub,
7209 xsecurity: 0, xsecurity_size: 0, isstat64: 0, segflg: UIO_USERSPACE, fd: uap->fd, flag: uap->flag);
7210}
7211
7212int
7213fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7214 __unused int32_t *retval)
7215{
7216 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7217 return EINVAL;
7218 }
7219
7220 return fstatat_internal(ctx: vfs_context_current(), path: uap->path, ub: uap->ub,
7221 xsecurity: 0, xsecurity_size: 0, isstat64: 1, segflg: UIO_USERSPACE, fd: uap->fd, flag: uap->flag);
7222}
7223
7224/*
7225 * Get configurable pathname variables.
7226 *
7227 * Returns: 0 Success
7228 * namei:???
7229 * vn_pathconf:???
7230 *
7231 * Notes: Global implementation constants are intended to be
7232 * implemented in this function directly; all other constants
7233 * are per-FS implementation, and therefore must be handled in
7234 * each respective FS, instead.
7235 *
7236 * XXX We implement some things globally right now that should actually be
7237 * XXX per-FS; we will need to deal with this at some point.
7238 */
7239/* ARGSUSED */
7240int
7241pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
7242{
7243 int error;
7244 struct nameidata nd;
7245 vfs_context_t ctx = vfs_context_current();
7246
7247 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
7248 UIO_USERSPACE, uap->path, ctx);
7249 error = namei(ndp: &nd);
7250 if (error) {
7251 return error;
7252 }
7253
7254 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
7255
7256 vnode_put(vp: nd.ni_vp);
7257 nameidone(&nd);
7258 return error;
7259}
7260
7261/*
7262 * Return target name of a symbolic link.
7263 */
7264/* ARGSUSED */
7265static int
7266readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
7267 enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
7268 int *retval)
7269{
7270 vnode_t vp;
7271 uio_t auio;
7272 int error;
7273 struct nameidata nd;
7274 UIO_STACKBUF(uio_buf, 1);
7275 bool put_vnode;
7276
7277 if (bufsize > INT32_MAX) {
7278 return EINVAL;
7279 }
7280
7281 if (lnk_vp) {
7282 vp = lnk_vp;
7283 put_vnode = false;
7284 } else {
7285 NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
7286 seg, path, ctx);
7287
7288 error = nameiat(ndp: &nd, dirfd: fd);
7289 if (error) {
7290 return error;
7291 }
7292 vp = nd.ni_vp;
7293 put_vnode = true;
7294 nameidone(&nd);
7295 }
7296
7297 auio = uio_createwithbuffer(a_iovcount: 1, a_offset: 0, a_spacetype: bufseg, a_iodirection: UIO_READ,
7298 a_buf_p: &uio_buf[0], a_buffer_size: sizeof(uio_buf));
7299 uio_addiov(a_uio: auio, a_baseaddr: buf, a_length: bufsize);
7300 if (vp->v_type != VLNK) {
7301 error = EINVAL;
7302 } else {
7303#if CONFIG_MACF
7304 error = mac_vnode_check_readlink(ctx, vp);
7305#endif
7306 if (error == 0) {
7307 error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
7308 ctx);
7309 }
7310 if (error == 0) {
7311 error = VNOP_READLINK(vp, auio, ctx);
7312 }
7313 }
7314
7315 if (put_vnode) {
7316 vnode_put(vp);
7317 }
7318
7319 *retval = (int)(bufsize - uio_resid(a_uio: auio));
7320 return error;
7321}
7322
7323int
7324freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
7325{
7326 enum uio_seg procseg;
7327 vnode_t vp;
7328 int error;
7329
7330 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7331
7332 AUDIT_ARG(fd, uap->fd);
7333
7334 if ((error = file_vnode(uap->fd, &vp))) {
7335 return error;
7336 }
7337 if ((error = vnode_getwithref(vp))) {
7338 file_drop(uap->fd);
7339 return error;
7340 }
7341
7342 error = readlinkat_internal(ctx: vfs_context_current(), fd: -1,
7343 lnk_vp: vp, path: 0, seg: procseg, CAST_USER_ADDR_T(uap->buf),
7344 bufsize: uap->bufsize, bufseg: procseg, retval);
7345
7346 vnode_put(vp);
7347 file_drop(uap->fd);
7348 return error;
7349}
7350
7351int
7352readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7353{
7354 enum uio_seg procseg;
7355
7356 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7357 return readlinkat_internal(ctx: vfs_context_current(), AT_FDCWD, NULL,
7358 CAST_USER_ADDR_T(uap->path), seg: procseg, CAST_USER_ADDR_T(uap->buf),
7359 bufsize: uap->count, bufseg: procseg, retval);
7360}
7361
7362int
7363readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7364{
7365 enum uio_seg procseg;
7366
7367 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7368 return readlinkat_internal(ctx: vfs_context_current(), fd: uap->fd, NULL,
7369 CAST_USER_ADDR_T(uap->path), seg: procseg, buf: uap->buf, bufsize: uap->bufsize, bufseg: procseg,
7370 retval);
7371}
7372
7373/*
7374 * Change file flags, the deep inner layer.
7375 */
7376static int
7377chflags0(vnode_t vp, struct vnode_attr *va,
7378 int (*setattr)(vnode_t, void *, vfs_context_t),
7379 void *arg, vfs_context_t ctx)
7380{
7381 kauth_action_t action = 0;
7382 int error;
7383
7384#if CONFIG_MACF
7385 error = mac_vnode_check_setflags(ctx, vp, flags: va->va_flags);
7386 if (error) {
7387 goto out;
7388 }
7389#endif
7390
7391 /* request authorisation, disregard immutability */
7392 if ((error = vnode_authattr(vp, vap: va, actionp: &action, ctx)) != 0) {
7393 goto out;
7394 }
7395 /*
7396 * Request that the auth layer disregard those file flags it's allowed to when
7397 * authorizing this operation; we need to do this in order to be able to
7398 * clear immutable flags.
7399 */
7400 if (action && ((error = vnode_authorize(vp, NULL, action: action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
7401 goto out;
7402 }
7403 error = (*setattr)(vp, arg, ctx);
7404
7405#if CONFIG_MACF
7406 if (error == 0) {
7407 mac_vnode_notify_setflags(ctx, vp, flags: va->va_flags);
7408 }
7409#endif
7410
7411out:
7412 return error;
7413}
7414
7415/*
7416 * Change file flags.
7417 *
7418 * NOTE: this will vnode_put() `vp'
7419 */
7420static int
7421chflags1(vnode_t vp, int flags, vfs_context_t ctx)
7422{
7423 struct vnode_attr va;
7424 int error;
7425
7426 VATTR_INIT(&va);
7427 VATTR_SET(&va, va_flags, flags);
7428
7429 error = chflags0(vp, va: &va, setattr: (void *)vnode_setattr, arg: &va, ctx);
7430 vnode_put(vp);
7431
7432 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
7433 error = ENOTSUP;
7434 }
7435
7436 return error;
7437}
7438
7439/*
7440 * Change flags of a file given a path name.
7441 */
7442/* ARGSUSED */
7443int
7444chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
7445{
7446 vnode_t vp;
7447 vfs_context_t ctx = vfs_context_current();
7448 int error;
7449 struct nameidata nd;
7450 uint32_t wantparent = 0;
7451
7452#if CONFIG_FILE_LEASES
7453 wantparent = WANTPARENT;
7454#endif
7455
7456 AUDIT_ARG(fflags, uap->flags);
7457 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
7458 UIO_USERSPACE, uap->path, ctx);
7459 error = namei(ndp: &nd);
7460 if (error) {
7461 return error;
7462 }
7463 vp = nd.ni_vp;
7464
7465#if CONFIG_FILE_LEASES
7466 vnode_breakdirlease(vp: nd.ni_dvp, false, O_WRONLY);
7467 vnode_put(vp: nd.ni_dvp);
7468#endif
7469
7470 nameidone(&nd);
7471
7472 /* we don't vnode_put() here because chflags1 does internally */
7473 error = chflags1(vp, flags: uap->flags, ctx);
7474
7475 return error;
7476}
7477
7478/*
7479 * Change flags of a file given a file descriptor.
7480 */
7481/* ARGSUSED */
7482int
7483fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
7484{
7485 vnode_t vp;
7486 int error;
7487
7488 AUDIT_ARG(fd, uap->fd);
7489 AUDIT_ARG(fflags, uap->flags);
7490 if ((error = file_vnode(uap->fd, &vp))) {
7491 return error;
7492 }
7493
7494 if ((error = vnode_getwithref(vp))) {
7495 file_drop(uap->fd);
7496 return error;
7497 }
7498
7499 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7500
7501#if CONFIG_FILE_LEASES
7502 vnode_breakdirlease(vp, true, O_WRONLY);
7503#endif
7504
7505 /* we don't vnode_put() here because chflags1 does internally */
7506 error = chflags1(vp, flags: uap->flags, ctx: vfs_context_current());
7507
7508 file_drop(uap->fd);
7509 return error;
7510}
7511
7512/*
7513 * Change security information on a filesystem object.
7514 *
7515 * Returns: 0 Success
7516 * EPERM Operation not permitted
7517 * vnode_authattr:??? [anything vnode_authattr can return]
7518 * vnode_authorize:??? [anything vnode_authorize can return]
7519 * vnode_setattr:??? [anything vnode_setattr can return]
7520 *
7521 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
7522 * translated to EPERM before being returned.
7523 */
7524static int
7525chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
7526{
7527 kauth_action_t action;
7528 int error;
7529
7530 AUDIT_ARG(mode, vap->va_mode);
7531 /* XXX audit new args */
7532
7533#if NAMEDSTREAMS
7534 /* chmod calls are not allowed for resource forks. */
7535 if (vp->v_flag & VISNAMEDSTREAM) {
7536 return EPERM;
7537 }
7538#endif
7539
7540#if CONFIG_MACF
7541 if (VATTR_IS_ACTIVE(vap, va_mode) &&
7542 (error = mac_vnode_check_setmode(ctx, vp, mode: (mode_t)vap->va_mode)) != 0) {
7543 return error;
7544 }
7545
7546 if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
7547 if ((error = mac_vnode_check_setowner(ctx, vp,
7548 VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
7549 VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
7550 return error;
7551 }
7552 }
7553
7554 if (VATTR_IS_ACTIVE(vap, va_acl) &&
7555 (error = mac_vnode_check_setacl(ctx, vp, acl: vap->va_acl))) {
7556 return error;
7557 }
7558#endif
7559
7560 /* make sure that the caller is allowed to set this security information */
7561 if (((error = vnode_authattr(vp, vap, actionp: &action, ctx)) != 0) ||
7562 ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
7563 if (error == EACCES) {
7564 error = EPERM;
7565 }
7566 return error;
7567 }
7568
7569 if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
7570 return error;
7571 }
7572
7573#if CONFIG_MACF
7574 if (VATTR_IS_ACTIVE(vap, va_mode)) {
7575 mac_vnode_notify_setmode(ctx, vp, mode: (mode_t)vap->va_mode);
7576 }
7577
7578 if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
7579 mac_vnode_notify_setowner(ctx, vp,
7580 VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
7581 VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
7582 }
7583
7584 if (VATTR_IS_ACTIVE(vap, va_acl)) {
7585 mac_vnode_notify_setacl(ctx, vp, acl: vap->va_acl);
7586 }
7587#endif
7588
7589 return error;
7590}
7591
7592
7593/*
7594 * Change mode of a file given a path name.
7595 *
7596 * Returns: 0 Success
7597 * namei:??? [anything namei can return]
7598 * chmod_vnode:??? [anything chmod_vnode can return]
7599 */
7600static int
7601chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
7602 int fd, int flag, enum uio_seg segflg)
7603{
7604 struct nameidata nd;
7605 int follow, error;
7606 uint32_t wantparent = 0;
7607
7608#if CONFIG_FILE_LEASES
7609 wantparent = WANTPARENT;
7610#endif
7611
7612 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
7613 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
7614 segflg, path, ctx);
7615 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
7616 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
7617 }
7618 if ((error = nameiat(ndp: &nd, dirfd: fd))) {
7619 return error;
7620 }
7621
7622#if CONFIG_FILE_LEASES
7623 vnode_breakdirlease(vp: nd.ni_dvp, false, O_WRONLY);
7624 vnode_put(vp: nd.ni_dvp);
7625#endif
7626
7627 error = chmod_vnode(ctx, vp: nd.ni_vp, vap);
7628 vnode_put(vp: nd.ni_vp);
7629 nameidone(&nd);
7630 return error;
7631}
7632
/*
 * Build the vnode_attr for a chmod_extended()/fchmod_extended() request.
 *
 * A `mode` of -1, `uid` of KAUTH_UID_NONE or `gid` of KAUTH_GID_NONE
 * means "leave that attribute unchanged".  `xsecurity` may be NULL (no
 * ACL change), the sentinel value 1 (_FILESEC_REMOVE_ACL: delete the
 * ACL), or a user pointer to a filesec to copy in.  In the copy-in case
 * *pxsecdst is set to the kernel copy, which the caller must release
 * with kauth_filesec_free() after the attributes have been applied.
 */
static int
chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
    gid_t gid, user_addr_t xsecurity)
{
	int error;

	VATTR_INIT(pva);

	if (mode != -1) {
		VATTR_SET(pva, va_mode, mode & ALLPERMS);
	} else {
		/* mode untouched: cleared but deliberately not marked active */
		pva->va_mode = 0;
	}

	if (uid != KAUTH_UID_NONE) {
		VATTR_SET(pva, va_uid, uid);
	}

	if (gid != KAUTH_GID_NONE) {
		VATTR_SET(pva, va_gid, gid);
	}

	*pxsecdst = NULL;
	switch (xsecurity) {
	case USER_ADDR_NULL:
		break;

	case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
		VATTR_SET(pva, va_acl, NULL);
		break;

	default:
		if ((error = kauth_copyinfilesec(xsecurity, xsecdestpp: pxsecdst)) != 0) {
			return error;
		}

		/* va_acl points into *pxsecdst; it must outlive the setattr */
		VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
		pva->va_vaflags |= VA_FILESEC_ACL;
		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
		break;
	}

	return 0;
}
7677
7678/*
7679 * chmod_extended: Change the mode of a file given a path name; with extended
7680 * argument list (including extended security (ACL)).
7681 *
7682 * Parameters: p Process requesting the open
7683 * uap User argument descriptor (see below)
7684 * retval (ignored)
7685 *
7686 * Indirect: uap->path Path to object (same as 'chmod')
7687 * uap->uid UID to set
7688 * uap->gid GID to set
7689 * uap->mode File mode to set (same as 'chmod')
7690 * uap->xsecurity ACL to set (or delete)
7691 *
7692 * Returns: 0 Success
7693 * !0 errno value
7694 *
7695 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
7696 *
7697 * XXX: We should enummerate the possible errno values here, and where
7698 * in the code they originated.
7699 */
7700int
7701chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
7702{
7703 int error;
7704 struct vnode_attr va;
7705 kauth_filesec_t xsecdst = NULL;
7706
7707 AUDIT_ARG(owner, uap->uid, uap->gid);
7708
7709 error = chmod_extended_init(pva: &va, pxsecdst: &xsecdst, mode: uap->mode, uid: uap->uid,
7710 gid: uap->gid, xsecurity: uap->xsecurity);
7711
7712 if (error) {
7713 return error;
7714 }
7715
7716 error = chmodat(ctx: vfs_context_current(), path: uap->path, vap: &va, AT_FDCWD, flag: 0,
7717 segflg: UIO_USERSPACE);
7718
7719 if (xsecdst != NULL) {
7720 kauth_filesec_free(fsp: xsecdst);
7721 }
7722 return error;
7723}
7724
7725/*
7726 * Returns: 0 Success
7727 * chmodat:??? [anything chmodat can return]
7728 */
7729static int
7730fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7731 int flag, enum uio_seg segflg)
7732{
7733 struct vnode_attr va;
7734
7735 VATTR_INIT(&va);
7736 VATTR_SET(&va, va_mode, mode & ALLPERMS);
7737
7738 return chmodat(ctx, path, vap: &va, fd, flag, segflg);
7739}
7740
7741int
7742chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7743{
7744 return fchmodat_internal(ctx: vfs_context_current(), path: uap->path, mode: uap->mode,
7745 AT_FDCWD, flag: 0, segflg: UIO_USERSPACE);
7746}
7747
7748int
7749fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7750{
7751 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7752 return EINVAL;
7753 }
7754
7755 return fchmodat_internal(ctx: vfs_context_current(), path: uap->path, mode: uap->mode,
7756 fd: uap->fd, flag: uap->flag, segflg: UIO_USERSPACE);
7757}
7758
7759/*
7760 * Change mode of a file given a file descriptor.
7761 */
7762static int
7763fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
7764{
7765 vnode_t vp;
7766 int error;
7767
7768 AUDIT_ARG(fd, fd);
7769
7770 if ((error = file_vnode(fd, &vp)) != 0) {
7771 return error;
7772 }
7773 if ((error = vnode_getwithref(vp)) != 0) {
7774 file_drop(fd);
7775 return error;
7776 }
7777 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7778
7779#if CONFIG_FILE_LEASES
7780 vnode_breakdirlease(vp, true, O_WRONLY);
7781#endif
7782
7783 error = chmod_vnode(ctx: vfs_context_current(), vp, vap);
7784 (void)vnode_put(vp);
7785 file_drop(fd);
7786
7787 return error;
7788}
7789
7790/*
7791 * fchmod_extended: Change mode of a file given a file descriptor; with
7792 * extended argument list (including extended security (ACL)).
7793 *
7794 * Parameters: p Process requesting to change file mode
7795 * uap User argument descriptor (see below)
7796 * retval (ignored)
7797 *
7798 * Indirect: uap->mode File mode to set (same as 'chmod')
7799 * uap->uid UID to set
7800 * uap->gid GID to set
7801 * uap->xsecurity ACL to set (or delete)
7802 * uap->fd File descriptor of file to change mode
7803 *
7804 * Returns: 0 Success
7805 * !0 errno value
7806 *
7807 */
7808int
7809fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
7810{
7811 int error;
7812 struct vnode_attr va;
7813 kauth_filesec_t xsecdst = NULL;
7814
7815 AUDIT_ARG(owner, uap->uid, uap->gid);
7816
7817 error = chmod_extended_init(pva: &va, pxsecdst: &xsecdst, mode: uap->mode, uid: uap->uid,
7818 gid: uap->gid, xsecurity: uap->xsecurity);
7819
7820 if (error) {
7821 return error;
7822 }
7823
7824 error = fchmod1(p, fd: uap->fd, vap: &va);
7825
7826 if (xsecdst != NULL) {
7827 kauth_filesec_free(fsp: xsecdst);
7828 }
7829 return error;
7830}
7831
7832int
7833fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
7834{
7835 struct vnode_attr va;
7836
7837 VATTR_INIT(&va);
7838 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7839
7840 return fchmod1(p, fd: uap->fd, vap: &va);
7841}
7842
/*
 * Common ownership-change implementation for chown()/lchown()/fchown()/
 * fchownat().
 *
 * Builds a vnode_attr from `uid`/`gid` (VNOVAL means "leave unchanged"),
 * authorizes the change and applies it to `vp`.  Permission failures
 * from the authorization layer are reported as EPERM rather than EACCES.
 */
static int
vn_chown_internal(__unused vfs_context_t ctx, vnode_t vp, uid_t uid, gid_t gid)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, vap: &va, actionp: &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/*
		 * EACCES is only allowed from namei(); permissions failure should
		 * return EPERM, so we need to translate the error code.
		 */
		if (error == EACCES) {
			error = EPERM;
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* break any lease on the vnode before applying the change */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vnode_setattr(vp, vap: &va, ctx);

#if CONFIG_MACF
	/* notify MAC policies only on success */
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	return error;
}
7904
7905/*
7906 * Set ownership given a path name.
7907 */
7908/* ARGSUSED */
7909static int
7910fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
7911 gid_t gid, int flag, enum uio_seg segflg)
7912{
7913 vnode_t vp;
7914 int error;
7915 struct nameidata nd;
7916 int follow;
7917
7918 AUDIT_ARG(owner, uid, gid);
7919
7920 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
7921 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg, path, ctx);
7922 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
7923 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
7924 }
7925
7926 error = nameiat(ndp: &nd, dirfd: fd);
7927 if (error) {
7928 return error;
7929 }
7930
7931 vp = nd.ni_vp;
7932 error = vn_chown_internal(ctx, vp, uid, gid);
7933
7934 nameidone(&nd);
7935 vnode_put(vp);
7936 return error;
7937}
7938
7939int
7940chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
7941{
7942 return fchownat_internal(ctx: vfs_context_current(), AT_FDCWD, path: uap->path,
7943 uid: uap->uid, gid: uap->gid, flag: 0, segflg: UIO_USERSPACE);
7944}
7945
7946int
7947lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
7948{
7949 return fchownat_internal(ctx: vfs_context_current(), AT_FDCWD, path: uap->path,
7950 uid: uap->owner, gid: uap->group, AT_SYMLINK_NOFOLLOW, segflg: UIO_USERSPACE);
7951}
7952
7953int
7954fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7955{
7956 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7957 return EINVAL;
7958 }
7959
7960 return fchownat_internal(ctx: vfs_context_current(), fd: uap->fd, path: uap->path,
7961 uid: uap->uid, gid: uap->gid, flag: uap->flag, segflg: UIO_USERSPACE);
7962}
7963
7964/*
7965 * Set ownership given a file descriptor.
7966 */
7967/* ARGSUSED */
7968int
7969fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
7970{
7971 vfs_context_t ctx = vfs_context_current();
7972 vnode_t vp;
7973 int error;
7974
7975 AUDIT_ARG(owner, uap->uid, uap->gid);
7976 AUDIT_ARG(fd, uap->fd);
7977
7978 if ((error = file_vnode(uap->fd, &vp))) {
7979 return error;
7980 }
7981
7982 if ((error = vnode_getwithref(vp))) {
7983 file_drop(uap->fd);
7984 return error;
7985 }
7986 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7987
7988 error = vn_chown_internal(ctx, vp, uid: uap->uid, gid: uap->gid);
7989
7990 (void)vnode_put(vp);
7991 file_drop(uap->fd);
7992 return error;
7993}
7994
/*
 * Fetch the (access, modify) timestamp pair for the utimes() family.
 *
 * If usrtvp is USER_ADDR_NULL, both entries of tsp are filled with the
 * current time. Otherwise two struct timevals, laid out for the calling
 * process' ABI (32- or 64-bit), are copied in from user space and
 * converted to timespecs.
 *
 * Returns 0 on success, or an error from copyin().
 */
static int
getutimes(user_addr_t usrtvp, struct timespec *tsp)
{
	int error;

	if (usrtvp == USER_ADDR_NULL) {
		struct timeval old_tv;
		/* XXX Y2038 bug because of microtime argument */
		microtime(tv: &old_tv);
		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
		tsp[1] = tsp[0];
	} else {
		/* The user timeval layout depends on the process' word size. */
		if (IS_64BIT_PROCESS(current_proc())) {
			struct user64_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
		} else {
			struct user32_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
		}
	}
	return 0;
}
8027
/*
 * Common worker for utimes()/futimes(): authorize and apply an
 * access/modify timestamp pair to a vnode.
 *
 * `nullflag` is non-zero when the caller supplied no explicit times (the
 * current time is being set); VA_UTIMES_NULL then relaxes the permission
 * check and EACCES is not remapped to EPERM.
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, atime: ts[0], mtime: ts[1]);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_authattr(vp, vap: &va, actionp: &action, ctx)) != 0) {
		/* Setting explicit times needs ownership: report EPERM, not EACCES. */
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, vap: &va, ctx);

#if CONFIG_MACF
	/* Notify MAC modules only after the times actually changed. */
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, atime: ts[0], mtime: ts[1]);
	}
#endif

out:
	return error;
}
8084
8085/*
8086 * Set the access and modification times of a file.
8087 */
8088/* ARGSUSED */
8089int
8090utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
8091{
8092 struct timespec ts[2];
8093 user_addr_t usrtvp;
8094 int error;
8095 struct nameidata nd;
8096 vfs_context_t ctx = vfs_context_current();
8097 uint32_t wantparent = 0;
8098
8099#if CONFIG_FILE_LEASES
8100 wantparent = WANTPARENT;
8101#endif
8102
8103 /*
8104 * AUDIT: Needed to change the order of operations to do the
8105 * name lookup first because auditing wants the path.
8106 */
8107 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
8108 UIO_USERSPACE, uap->path, ctx);
8109 error = namei(ndp: &nd);
8110 if (error) {
8111 return error;
8112 }
8113
8114 /*
8115 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
8116 * the current time instead.
8117 */
8118 usrtvp = uap->tptr;
8119 if ((error = getutimes(usrtvp, tsp: ts)) != 0) {
8120 goto out;
8121 }
8122
8123#if CONFIG_FILE_LEASES
8124 vnode_breakdirlease(vp: nd.ni_dvp, false, O_WRONLY);
8125#endif
8126
8127 error = setutimes(ctx, vp: nd.ni_vp, ts, nullflag: usrtvp == USER_ADDR_NULL);
8128
8129out:
8130#if CONFIG_FILE_LEASES
8131 vnode_put(vp: nd.ni_dvp);
8132#endif
8133 nameidone(&nd);
8134 vnode_put(vp: nd.ni_vp);
8135 return error;
8136}
8137
8138/*
8139 * Set the access and modification times of a file.
8140 */
8141/* ARGSUSED */
8142int
8143futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
8144{
8145 struct timespec ts[2];
8146 vnode_t vp;
8147 user_addr_t usrtvp;
8148 int error;
8149
8150 AUDIT_ARG(fd, uap->fd);
8151 usrtvp = uap->tptr;
8152 if ((error = getutimes(usrtvp, tsp: ts)) != 0) {
8153 return error;
8154 }
8155 if ((error = file_vnode(uap->fd, &vp)) != 0) {
8156 return error;
8157 }
8158 if ((error = vnode_getwithref(vp))) {
8159 file_drop(uap->fd);
8160 return error;
8161 }
8162
8163#if CONFIG_FILE_LEASES
8164 vnode_breakdirlease(vp, true, O_WRONLY);
8165#endif
8166
8167 error = setutimes(ctx: vfs_context_current(), vp, ts, nullflag: usrtvp == 0);
8168
8169 vnode_put(vp);
8170 file_drop(uap->fd);
8171 return error;
8172}
8173
8174static int
8175truncate_validate_common(proc_t p, off_t length)
8176{
8177 rlim_t fsize_limit;
8178
8179 if (length < 0) {
8180 return EINVAL;
8181 }
8182
8183 fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8184 if ((rlim_t)length > fsize_limit) {
8185 psignal(p, SIGXFSZ);
8186 return EFBIG;
8187 }
8188
8189 return 0;
8190}
8191
/*
 * Common worker for truncate()/ftruncate(): set a vnode's data size.
 *
 * `cred` is the file credential used for the MAC checks (NOCRED for the
 * path-based case). `need_auth` is FALSE when called from ftruncate(),
 * whose write permission was effectively authorized at open time.
 */
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, file_cred: cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open. We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, vap: &va, actionp: &action, ctx)) != 0) {
			return error;
		}

		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, vap: &va, ctx);

#if CONFIG_MACF
	/* Notify MAC modules only on a successful size change. */
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, file_cred: cred, vp);
	}
#endif

	return error;
}
8242
8243/*
8244 * Truncate a file given its path name.
8245 */
8246/* ARGSUSED */
8247int
8248truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
8249{
8250 vfs_context_t ctx = vfs_context_current();
8251 vnode_t vp;
8252 int error;
8253 struct nameidata nd;
8254
8255 if ((error = truncate_validate_common(p, length: uap->length))) {
8256 return error;
8257 }
8258
8259 NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
8260 UIO_USERSPACE, uap->path, ctx);
8261
8262 if ((error = namei(ndp: &nd))) {
8263 return error;
8264 }
8265
8266 vp = nd.ni_vp;
8267 nameidone(&nd);
8268
8269 error = truncate_internal(vp, length: uap->length, NOCRED, ctx, true);
8270 vnode_put(vp);
8271
8272 return error;
8273}
8274
8275/*
8276 * Truncate a file given a file descriptor.
8277 */
8278/* ARGSUSED */
8279int
8280ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
8281{
8282 vnode_t vp;
8283 struct fileproc *fp;
8284 int error;
8285
8286 AUDIT_ARG(fd, uap->fd);
8287
8288 if ((error = truncate_validate_common(p, length: uap->length))) {
8289 return error;
8290 }
8291
8292 if ((error = fp_lookup(p, fd: uap->fd, resultfp: &fp, locked: 0))) {
8293 return error;
8294 }
8295
8296 switch (FILEGLOB_DTYPE(fp->fp_glob)) {
8297 case DTYPE_PSXSHM:
8298 error = pshm_truncate(p, fp, fd: uap->fd, length: uap->length, retval);
8299 goto out;
8300 case DTYPE_VNODE:
8301 break;
8302 default:
8303 error = EINVAL;
8304 goto out;
8305 }
8306
8307 vp = (vnode_t)fp_get_data(fp);
8308
8309 if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
8310 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
8311 error = EINVAL;
8312 goto out;
8313 }
8314
8315 if ((error = vnode_getwithref(vp)) != 0) {
8316 goto out;
8317 }
8318
8319 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8320
8321 error = truncate_internal(vp, length: uap->length, cred: fp->fp_glob->fg_cred,
8322 ctx: vfs_context_current(), false);
8323 vnode_put(vp);
8324
8325out:
8326 file_drop(uap->fd);
8327 return error;
8328}
8329
8330
8331/*
8332 * Sync an open file with synchronized I/O _file_ integrity completion
8333 */
8334/* ARGSUSED */
8335int
8336fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
8337{
8338 __pthread_testcancel(presyscall: 1);
8339 return fsync_common(p, uap, MNT_WAIT);
8340}
8341
8342
8343/*
8344 * Sync an open file with synchronized I/O _file_ integrity completion
8345 *
8346 * Notes: This is a legacy support function that does not test for
8347 * thread cancellation points.
8348 */
8349/* ARGSUSED */
8350int
8351fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
8352{
8353 return fsync_common(p, uap: (struct fsync_args *)uap, MNT_WAIT);
8354}
8355
8356
8357/*
8358 * Sync an open file with synchronized I/O _data_ integrity completion
8359 */
8360/* ARGSUSED */
8361int
8362fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
8363{
8364 __pthread_testcancel(presyscall: 1);
8365 return fsync_common(p, uap: (struct fsync_args *)uap, MNT_DWAIT);
8366}
8367
8368
8369/*
8370 * fsync_common
8371 *
8372 * Common fsync code to support both synchronized I/O file integrity completion
8373 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
8374 *
8375 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
8376 * will only guarantee that the file data contents are retrievable. If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
8378 * includes additional metadata unnecessary for retrieving the file data
8379 * contents, such as atime, mtime, ctime, etc., also be committed to stable
8380 * storage.
8381 *
8382 * Parameters: p The process
8383 * uap->fd The descriptor to synchronize
8384 * flags The data integrity flags
8385 *
8386 * Returns: int Success
8387 * fp_getfvp:EBADF Bad file descriptor
8388 * fp_getfvp:ENOTSUP fd does not refer to a vnode
8389 * VNOP_FSYNC:??? unspecified
8390 *
8391 * Notes: We use struct fsync_args because it is a short name, and all
8392 * caller argument structures are otherwise identical.
8393 */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Resolve the fd to both its fileproc and vnode. */
	if ((error = fp_getfvp(p, fd: uap->fd, resultfp: &fp, resultvp: &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* `flags` is MNT_WAIT (file integrity) or MNT_DWAIT (data integrity). */
	error = VNOP_FSYNC(vp, waitfor: flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
		(void) vnode_flushnamedstream(vp: vp->v_parent, svp: vp, context: ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8431
8432/*
8433 * Duplicate files. Source must be a file, target must be a file or
8434 * must not exist.
8435 *
8436 * XXX Copyfile authorisation checking is woefully inadequate, and will not
8437 * perform inheritance correctly.
8438 */
8439/* ARGSUSED */
8440int
8441copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
8442{
8443 vnode_t tvp, fvp, tdvp, sdvp;
8444 struct nameidata fromnd, tond;
8445 int error;
8446 vfs_context_t ctx = vfs_context_current();
8447
8448 /* Check that the flags are valid. */
8449 if (uap->flags & ~CPF_MASK) {
8450 return EINVAL;
8451 }
8452
8453 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
8454 UIO_USERSPACE, uap->from, ctx);
8455 if ((error = namei(ndp: &fromnd))) {
8456 return error;
8457 }
8458 fvp = fromnd.ni_vp;
8459
8460 NDINIT(&tond, CREATE, OP_LINK,
8461 LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8462 UIO_USERSPACE, uap->to, ctx);
8463 if ((error = namei(ndp: &tond))) {
8464 goto out1;
8465 }
8466 tdvp = tond.ni_dvp;
8467 tvp = tond.ni_vp;
8468
8469 if (tvp != NULL) {
8470 if (!(uap->flags & CPF_OVERWRITE)) {
8471 error = EEXIST;
8472 goto out;
8473 }
8474 }
8475
8476 if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
8477 error = EISDIR;
8478 goto out;
8479 }
8480
8481 if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
8482 error = EOPNOTSUPP;
8483 goto out;
8484 }
8485
8486#if CONFIG_MACF
8487 if ((error = mac_vnode_check_copyfile(ctx, dvp: tdvp, tvp, fvp, cnp: &tond.ni_cnd, mode: (mode_t)uap->mode, flags: uap->flags)) != 0) {
8488 goto out;
8489 }
8490#endif /* CONFIG_MACF */
8491
8492 if ((error = vnode_authorize(vp: fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
8493 goto out;
8494 }
8495 if (tvp) {
8496 if ((error = vnode_authorize(vp: tvp, dvp: tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
8497 goto out;
8498 }
8499 }
8500 if ((error = vnode_authorize(vp: tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
8501 goto out;
8502 }
8503
8504 if (fvp == tdvp) {
8505 error = EINVAL;
8506 }
8507 /*
8508 * If source is the same as the destination (that is the
8509 * same inode number) then there is nothing to do.
8510 * (fixed to have POSIX semantics - CSM 3/2/98)
8511 */
8512 if (fvp == tvp) {
8513 error = -1;
8514 }
8515
8516#if CONFIG_FILE_LEASES
8517 vnode_breakdirlease(vp: tdvp, false, O_WRONLY);
8518#endif
8519
8520 if (!error) {
8521 error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
8522 }
8523out:
8524 sdvp = tond.ni_startdir;
8525 /*
8526 * nameidone has to happen before we vnode_put(tdvp)
8527 * since it may need to release the fs_nodelock on the tdvp
8528 */
8529 nameidone(&tond);
8530
8531 if (tvp) {
8532 vnode_put(vp: tvp);
8533 }
8534 vnode_put(vp: tdvp);
8535 vnode_put(vp: sdvp);
8536out1:
8537 vnode_put(vp: fvp);
8538
8539 nameidone(&fromnd);
8540
8541 if (error == -1) {
8542 return 0;
8543 }
8544 return error;
8545}
8546
8547#define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
8548
8549/*
8550 * Helper function for doing clones. The caller is expected to provide an
8551 * iocounted source vnode and release it.
8552 */
8553static int
8554clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
8555 user_addr_t dst, uint32_t flags, vfs_context_t ctx)
8556{
8557 vnode_t tvp, tdvp;
8558 struct nameidata tond;
8559 int error;
8560 int follow;
8561 boolean_t free_src_acl;
8562 boolean_t attr_cleanup;
8563 enum vtype v_type;
8564 kauth_action_t action;
8565 struct componentname *cnp;
8566 uint32_t defaulted = 0;
8567 struct vnode_attr va;
8568 struct vnode_attr nva;
8569 uint32_t vnop_flags;
8570
8571 v_type = vnode_vtype(vp: fvp);
8572 switch (v_type) {
8573 case VLNK:
8574 /* FALLTHRU */
8575 case VREG:
8576 action = KAUTH_VNODE_ADD_FILE;
8577 break;
8578 case VDIR:
8579 if (vnode_isvroot(vp: fvp) || vnode_ismount(vp: fvp) ||
8580 fvp->v_mountedhere) {
8581 return EINVAL;
8582 }
8583 action = KAUTH_VNODE_ADD_SUBDIRECTORY;
8584 break;
8585 default:
8586 return EINVAL;
8587 }
8588
8589 AUDIT_ARG(fd2, dst_dirfd);
8590 AUDIT_ARG(value32, flags);
8591
8592 follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8593 NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
8594 UIO_USERSPACE, dst, ctx);
8595 if ((error = nameiat(ndp: &tond, dirfd: dst_dirfd))) {
8596 return error;
8597 }
8598 cnp = &tond.ni_cnd;
8599 tdvp = tond.ni_dvp;
8600 tvp = tond.ni_vp;
8601
8602 free_src_acl = FALSE;
8603 attr_cleanup = FALSE;
8604
8605 if (tvp != NULL) {
8606 error = EEXIST;
8607 goto out;
8608 }
8609
8610 if (vnode_mount(vp: tdvp) != vnode_mount(vp: fvp)) {
8611 error = EXDEV;
8612 goto out;
8613 }
8614
8615#if CONFIG_MACF
8616 if ((error = mac_vnode_check_clone(ctx, dvp: tdvp, vp: fvp, cnp))) {
8617 goto out;
8618 }
8619#endif
8620 if ((error = vnode_authorize(vp: tdvp, NULL, action, ctx))) {
8621 goto out;
8622 }
8623
8624 action = KAUTH_VNODE_GENERIC_READ_BITS;
8625 if (data_read_authorised) {
8626 action &= ~KAUTH_VNODE_READ_DATA;
8627 }
8628 if ((error = vnode_authorize(vp: fvp, NULL, action, ctx))) {
8629 goto out;
8630 }
8631
8632 /*
8633 * certain attributes may need to be changed from the source, we ask for
8634 * those here with the exception of source file's ACLs unless the CLONE_ACL
8635 * flag is specified. By default, the clone file will inherit the target
8636 * directory's ACLs unless the the CLONE_ACL flag is specified then it
8637 * will inherit the source file's ACLs instead.
8638 */
8639 VATTR_INIT(&va);
8640 VATTR_WANTED(&va, va_uid);
8641 VATTR_WANTED(&va, va_gid);
8642 VATTR_WANTED(&va, va_mode);
8643 VATTR_WANTED(&va, va_flags);
8644 if (flags & CLONE_ACL) {
8645 VATTR_WANTED(&va, va_acl);
8646 }
8647
8648 if ((error = vnode_getattr(vp: fvp, vap: &va, ctx)) != 0) {
8649 goto out;
8650 }
8651
8652 VATTR_INIT(&nva);
8653 VATTR_SET(&nva, va_type, v_type);
8654 if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
8655 VATTR_SET(&nva, va_acl, va.va_acl);
8656 free_src_acl = TRUE;
8657 }
8658
8659 /* Handle ACL inheritance, initialize vap. */
8660 if (v_type == VLNK) {
8661 error = vnode_authattr_new(dvp: tdvp, vap: &nva, noauth: 0, ctx);
8662 } else {
8663 error = vn_attribute_prepare(dvp: tdvp, vap: &nva, defaulted_fieldsp: &defaulted, ctx);
8664 if (error) {
8665 goto out;
8666 }
8667 attr_cleanup = TRUE;
8668 }
8669
8670 vnop_flags = VNODE_CLONEFILE_DEFAULT;
8671 /*
8672 * We've got initial values for all security parameters,
8673 * If we are superuser, then we can change owners to be the
8674 * same as the source. Both superuser and the owner have default
8675 * WRITE_SECURITY privileges so all other fields can be taken
8676 * from source as well.
8677 */
8678 if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
8679 if (VATTR_IS_SUPPORTED(&va, va_uid)) {
8680 VATTR_SET(&nva, va_uid, va.va_uid);
8681 }
8682 if (VATTR_IS_SUPPORTED(&va, va_gid)) {
8683 VATTR_SET(&nva, va_gid, va.va_gid);
8684 }
8685 } else {
8686 vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
8687 }
8688
8689 if (VATTR_IS_SUPPORTED(&va, va_mode)) {
8690 VATTR_SET(&nva, va_mode, va.va_mode);
8691 }
8692 if (VATTR_IS_SUPPORTED(&va, va_flags)) {
8693 VATTR_SET(&nva, va_flags,
8694 ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
8695 (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
8696 }
8697
8698#if CONFIG_FILE_LEASES
8699 vnode_breakdirlease(vp: tdvp, false, O_WRONLY);
8700#endif
8701
8702 error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);
8703
8704 if (!error && tvp) {
8705 int update_flags = 0;
8706#if CONFIG_FSE
8707 int fsevent;
8708#endif /* CONFIG_FSE */
8709
8710 /*
8711 * If some of the requested attributes weren't handled by the
8712 * VNOP, use our fallback code.
8713 */
8714 if (!VATTR_ALL_SUPPORTED(&nva)) {
8715 (void)vnode_setattr_fallback(vp: tvp, vap: &nva, ctx);
8716 }
8717
8718#if CONFIG_MACF
8719 (void)vnode_label(mp: vnode_mount(vp: tvp), dvp: tdvp, vp: tvp, cnp,
8720 VNODE_LABEL_CREATE, ctx);
8721#endif
8722
8723 // Make sure the name & parent pointers are hooked up
8724 if (tvp->v_name == NULL) {
8725 update_flags |= VNODE_UPDATE_NAME;
8726 }
8727 if (tvp->v_parent == NULLVP) {
8728 update_flags |= VNODE_UPDATE_PARENT;
8729 }
8730
8731 if (update_flags) {
8732 (void)vnode_update_identity(vp: tvp, dvp: tdvp, name: cnp->cn_nameptr,
8733 name_len: cnp->cn_namelen, name_hashval: cnp->cn_hash, flags: update_flags);
8734 }
8735
8736#if CONFIG_FSE
8737 switch (vnode_vtype(vp: tvp)) {
8738 case VLNK:
8739 /* FALLTHRU */
8740 case VREG:
8741 fsevent = FSE_CREATE_FILE;
8742 break;
8743 case VDIR:
8744 fsevent = FSE_CREATE_DIR;
8745 break;
8746 default:
8747 goto out;
8748 }
8749
8750 if (need_fsevent(type: fsevent, vp: tvp)) {
8751 /*
8752 * The following is a sequence of three explicit events.
8753 * A pair of FSE_CLONE events representing the source and destination
8754 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
8755 * fseventsd may coalesce the destination clone and create events
8756 * into a single event resulting in the following sequence for a client
8757 * FSE_CLONE (src)
8758 * FSE_CLONE | FSE_CREATE (dst)
8759 */
8760 add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
8761 FSE_ARG_DONE);
8762 add_fsevent(type: fsevent, ctx, FSE_ARG_VNODE, tvp,
8763 FSE_ARG_DONE);
8764 }
8765#endif /* CONFIG_FSE */
8766 }
8767
8768out:
8769 if (attr_cleanup) {
8770 vn_attribute_cleanup(vap: &nva, defaulted_fields: defaulted);
8771 }
8772 if (free_src_acl && va.va_acl) {
8773 kauth_acl_free(fsp: va.va_acl);
8774 }
8775 nameidone(&tond);
8776 if (tvp) {
8777 vnode_put(vp: tvp);
8778 }
8779 vnode_put(vp: tdvp);
8780 return error;
8781}
8782
8783/*
8784 * clone files or directories, target must not exist.
8785 */
8786/* ARGSUSED */
8787int
8788clonefileat(__unused proc_t p, struct clonefileat_args *uap,
8789 __unused int32_t *retval)
8790{
8791 vnode_t fvp;
8792 struct nameidata fromnd;
8793 int follow;
8794 int error;
8795 vfs_context_t ctx = vfs_context_current();
8796
8797 /* Check that the flags are valid. */
8798 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
8799 return EINVAL;
8800 }
8801
8802 AUDIT_ARG(fd, uap->src_dirfd);
8803
8804 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8805 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
8806 UIO_USERSPACE, uap->src, ctx);
8807 if ((error = nameiat(ndp: &fromnd, dirfd: uap->src_dirfd))) {
8808 return error;
8809 }
8810
8811 fvp = fromnd.ni_vp;
8812 nameidone(&fromnd);
8813
8814 error = clonefile_internal(fvp, FALSE, dst_dirfd: uap->dst_dirfd, dst: uap->dst,
8815 flags: uap->flags, ctx);
8816
8817 vnode_put(vp: fvp);
8818 return error;
8819}
8820
/*
 * Clone the file referenced by an open (readable) file descriptor to a
 * new path; the target must not exist.
 */
int
fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct fileproc *fp;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_fd);
	error = fp_getfvp(p, fd: uap->src_fd, resultfp: &fp, resultvp: &fvp);
	if (error) {
		return error;
	}

	/* The source descriptor must have been opened for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	if ((error = vnode_getwithref(vp: fvp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);

	/* TRUE: FREAD on the fd already authorizes reading the data. */
	error = clonefile_internal(fvp, TRUE, dst_dirfd: uap->dst_dirfd, dst: uap->dst,
	    flags: uap->flags, ctx);

	vnode_put(vp: fvp);
out:
	file_drop(uap->src_fd);
	return error;
}
8861
/*
 * Per-mount callback (presumably invoked via mount iteration after a
 * directory rename — TODO confirm against caller): for each mount whose
 * f_mntonname lies strictly underneath the renamed mount `arg`,
 * regenerate its f_mntonname from the covered vnode's current path.
 *
 * Returns 0 to continue iterating (including the not-a-submount cases),
 * or -1 if the mount could not be busied.
 */
static int
rename_submounts_callback(mount_t mp, void *arg)
{
	int error = 0;
	mount_t pmp = (mount_t)arg;
	int prefix_len = (int)strlen(s: pmp->mnt_vfsstat.f_mntonname);

	/* Skip mounts whose mount-on path does not start with pmp's path. */
	if (strncmp(s1: mp->mnt_vfsstat.f_mntonname, s2: pmp->mnt_vfsstat.f_mntonname, n: prefix_len) != 0) {
		return 0;
	}

	/* Require a '/' after the prefix so "/a/bc" is not under "/a/b". */
	if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
		return 0;
	}

	if ((error = vfs_busy(mp, LK_NOWAIT))) {
		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
		return -1;
	}

	/* Rewrite f_mntonname in place from the covered vnode's new path. */
	size_t pathlen = MAXPATHLEN;
	if ((error = vn_getpath_ext(vp: mp->mnt_vnodecovered, NULL, pathbuf: mp->mnt_vfsstat.f_mntonname, len: &pathlen, VN_GETPATH_FSENTER))) {
		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
	}

	vfs_unbusy(mp);

	return error;
}
8891
8892/*
8893 * Rename files. Source and destination must either both be directories,
8894 * or both not be directories. If target is a directory, it must be empty.
8895 */
8896/* ARGSUSED */
8897static int
8898renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
8899 int tofd, user_addr_t to, int segflg, u_int uflags)
8900{
8901 vnode_t tvp, tdvp;
8902 vnode_t fvp, fdvp;
8903 vnode_t mnt_fvp;
8904 struct nameidata *fromnd, *tond;
8905 int error = 0;
8906 int do_retry;
8907 int retry_count;
8908 int mntrename;
8909 int need_event;
8910 int need_kpath2;
8911 int has_listeners;
8912 const char *oname = NULL;
8913 char *from_name = NULL, *to_name = NULL;
8914 char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
8915 int from_len = 0, to_len = 0;
8916 int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
8917 int holding_mntlock;
8918 int vn_authorize_skipped;
8919 mount_t locked_mp = NULL;
8920 vnode_t oparent = NULLVP;
8921#if CONFIG_FSE
8922 fse_info from_finfo = {}, to_finfo;
8923#endif
8924 int from_truncated = 0, to_truncated = 0;
8925 int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
8926 int batched = 0;
8927 struct vnode_attr *fvap, *tvap;
8928 int continuing = 0;
8929 vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
8930 int32_t nofollow_any = 0;
8931 /* carving out a chunk for structs that are too big to be on stack. */
8932 struct {
8933 struct nameidata from_node, to_node;
8934 struct vnode_attr fv_attr, tv_attr;
8935 } * __rename_data;
8936
8937 __rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
8938 fromnd = &__rename_data->from_node;
8939 tond = &__rename_data->to_node;
8940
8941 holding_mntlock = 0;
8942 do_retry = 0;
8943 retry_count = 0;
8944retry:
8945 fvp = tvp = NULL;
8946 fdvp = tdvp = NULL;
8947 fvap = tvap = NULL;
8948 mnt_fvp = NULLVP;
8949 mntrename = FALSE;
8950 vn_authorize_skipped = FALSE;
8951
8952 if (uflags & RENAME_NOFOLLOW_ANY) {
8953 nofollow_any = NAMEI_NOFOLLOW_ANY;
8954 }
8955 NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
8956 segflg, from, ctx);
8957 fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
8958
8959 NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8960 segflg, to, ctx);
8961 tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
8962
8963continue_lookup:
8964 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8965 if ((error = nameiat(ndp: fromnd, dirfd: fromfd))) {
8966 goto out1;
8967 }
8968 fdvp = fromnd->ni_dvp;
8969 fvp = fromnd->ni_vp;
8970
8971 if (fvp && fvp->v_type == VDIR) {
8972 tond->ni_cnd.cn_flags |= WILLBEDIR;
8973 }
8974 }
8975
8976 if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8977 if ((error = nameiat(ndp: tond, dirfd: tofd))) {
8978 /*
8979 * Translate error code for rename("dir1", "dir2/.").
8980 */
8981 if (error == EISDIR && fvp->v_type == VDIR) {
8982 error = EINVAL;
8983 }
8984 goto out1;
8985 }
8986 tdvp = tond->ni_dvp;
8987 tvp = tond->ni_vp;
8988 }
8989
8990#if DEVELOPMENT || DEBUG
8991 /*
8992 * XXX VSWAP: Check for entitlements or special flag here
8993 * so we can restrict access appropriately.
8994 */
8995#else /* DEVELOPMENT || DEBUG */
8996
8997 if (fromnd->ni_vp && vnode_isswap(vp: fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
8998 error = EPERM;
8999 goto out1;
9000 }
9001
9002 if (tond->ni_vp && vnode_isswap(vp: tond->ni_vp) && (ctx != vfs_context_kernel())) {
9003 error = EPERM;
9004 goto out1;
9005 }
9006#endif /* DEVELOPMENT || DEBUG */
9007
9008 if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
9009 error = ENOENT;
9010 goto out1;
9011 }
9012
9013 if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
9014 int32_t pval = 0;
9015 int err = 0;
9016
9017 /*
9018 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
9019 * has the same name as target iff the following conditions are met:
9020 * 1. the target file system is case insensitive
9021 * 2. source and target directories are the same
9022 * 3. source and target files are the same
9023 * 4. name only differs in case (determined by underlying filesystem)
9024 */
9025 if (fvp != tvp || fdvp != tdvp) {
9026 error = EEXIST;
9027 goto out1;
9028 }
9029
9030 /*
9031 * Assume that the target file system is case sensitive if
9032 * _PC_CASE_SENSITIVE selector isn't supported.
9033 */
9034 err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
9035 if (err != 0 || pval != 0) {
9036 error = EEXIST;
9037 goto out1;
9038 }
9039 }
9040
9041 batched = vnode_compound_rename_available(vp: fdvp);
9042
9043#if CONFIG_FSE
9044 need_event = need_fsevent(FSE_RENAME, vp: fdvp);
9045 if (need_event) {
9046 if (fvp) {
9047 get_fse_info(vp: fvp, fse: &from_finfo, ctx);
9048 } else {
9049 error = vfs_get_notify_attributes(vap: &__rename_data->fv_attr);
9050 if (error) {
9051 goto out1;
9052 }
9053
9054 fvap = &__rename_data->fv_attr;
9055 }
9056
9057 if (tvp) {
9058 get_fse_info(vp: tvp, fse: &to_finfo, ctx);
9059 } else if (batched) {
9060 error = vfs_get_notify_attributes(vap: &__rename_data->tv_attr);
9061 if (error) {
9062 goto out1;
9063 }
9064
9065 tvap = &__rename_data->tv_attr;
9066 }
9067 }
9068#else
9069 need_event = 0;
9070#endif /* CONFIG_FSE */
9071
9072 has_listeners = kauth_authorize_fileop_has_listeners();
9073
9074 need_kpath2 = 0;
9075#if CONFIG_AUDIT
9076 if (AUDIT_RECORD_EXISTS()) {
9077 need_kpath2 = 1;
9078 }
9079#endif
9080
9081 if (need_event || has_listeners) {
9082 if (from_name == NULL) {
9083 GET_PATH(from_name);
9084 }
9085
9086 from_len = safe_getpath(dvp: fdvp, leafname: fromnd->ni_cnd.cn_nameptr, path: from_name, MAXPATHLEN, truncated_path: &from_truncated);
9087
9088 if (from_name_no_firmlink == NULL) {
9089 GET_PATH(from_name_no_firmlink);
9090 }
9091
9092 from_len_no_firmlink = safe_getpath_no_firmlink(dvp: fdvp, leafname: fromnd->ni_cnd.cn_nameptr, path: from_name_no_firmlink, MAXPATHLEN, truncated_path: &from_truncated_no_firmlink);
9093 }
9094
9095 if (need_event || need_kpath2 || has_listeners) {
9096 if (to_name == NULL) {
9097 GET_PATH(to_name);
9098 }
9099
9100 to_len = safe_getpath(dvp: tdvp, leafname: tond->ni_cnd.cn_nameptr, path: to_name, MAXPATHLEN, truncated_path: &to_truncated);
9101
9102 if (to_name_no_firmlink == NULL) {
9103 GET_PATH(to_name_no_firmlink);
9104 }
9105
9106 to_len_no_firmlink = safe_getpath_no_firmlink(dvp: tdvp, leafname: tond->ni_cnd.cn_nameptr, path: to_name_no_firmlink, MAXPATHLEN, truncated_path: &to_truncated_no_firmlink);
9107 if (to_name && need_kpath2) {
9108 AUDIT_ARG(kpath, to_name, ARG_KPATH2);
9109 }
9110 }
9111 if (!fvp) {
9112 /*
9113 * Claim: this check will never reject a valid rename.
9114 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
9115 * Suppose fdvp and tdvp are not on the same mount.
9116 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
9117 * then you can't move it to within another dir on the same mountpoint.
9118 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
9119 *
9120 * If this check passes, then we are safe to pass these vnodes to the same FS.
9121 */
9122 if (fdvp->v_mount != tdvp->v_mount) {
9123 error = EXDEV;
9124 goto out1;
9125 }
9126 goto skipped_lookup;
9127 }
9128
9129 /*
9130 * If the source and destination are the same (i.e. they're
9131 * links to the same vnode) and the target file system is
9132 * case sensitive, then there is nothing to do.
9133 *
9134 * XXX Come back to this.
9135 */
9136 if (fvp == tvp) {
9137 int pathconf_val;
9138
9139 /*
9140 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
9141 * then assume that this file system is case sensitive.
9142 */
9143 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
9144 pathconf_val != 0) {
9145 vn_authorize_skipped = TRUE;
9146 goto out1;
9147 }
9148 }
9149
9150 /*
9151 * Allow the renaming of mount points.
9152 * - target must not exist
9153 * - target must reside in the same directory as source
9154 * - union mounts cannot be renamed
9155 * - the root fs, and tightly-linked system volumes, cannot be renamed
9156 *
9157 * XXX Handle this in VFS after a continued lookup (if we missed
9158 * in the cache to start off)
9159 *
9160 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
9161 * we'll skip past here. The file system is responsible for
9162 * checking that @tvp is not a descendent of @fvp and vice versa
9163 * so it should always return EINVAL if either @tvp or @fvp is the
9164 * root of a volume.
9165 */
9166 if ((fvp->v_flag & VROOT) &&
9167 (fvp->v_type == VDIR) &&
9168 (tvp == NULL) &&
9169 (fvp->v_mountedhere == NULL) &&
9170 (fdvp == tdvp) &&
9171 ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
9172 ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
9173 (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
9174 vnode_t coveredvp;
9175
9176 /* switch fvp to the covered vnode */
9177 coveredvp = fvp->v_mount->mnt_vnodecovered;
9178 if ((vnode_getwithref(vp: coveredvp))) {
9179 error = ENOENT;
9180 goto out1;
9181 }
9182 /*
9183 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
9184 * later.
9185 */
9186 mnt_fvp = fvp;
9187
9188 fvp = coveredvp;
9189 mntrename = TRUE;
9190 }
9191 /*
9192 * Check for cross-device rename.
9193 */
9194 if ((fvp->v_mount != tdvp->v_mount) ||
9195 (tvp && (fvp->v_mount != tvp->v_mount))) {
9196 error = EXDEV;
9197 goto out1;
9198 }
9199
9200 /*
9201 * If source is the same as the destination (that is the
9202 * same inode number) then there is nothing to do...
9203 * EXCEPT if the underlying file system supports case
9204 * insensitivity and is case preserving. In this case
9205 * the file system needs to handle the special case of
9206 * getting the same vnode as target (fvp) and source (tvp).
9207 *
9208 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
9209 * and _PC_CASE_PRESERVING can have this exception, and they need to
9210 * handle the special case of getting the same vnode as target and
9211 * source. NOTE: Then the target is unlocked going into vnop_rename,
9212 * so not to cause locking problems. There is a single reference on tvp.
9213 *
9214 * NOTE - that fvp == tvp also occurs if they are hard linked and
9215 * that correct behaviour then is just to return success without doing
9216 * anything.
9217 *
9218 * XXX filesystem should take care of this itself, perhaps...
9219 */
9220 if (fvp == tvp && fdvp == tdvp) {
9221 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
9222 !bcmp(s1: fromnd->ni_cnd.cn_nameptr, s2: tond->ni_cnd.cn_nameptr,
9223 n: fromnd->ni_cnd.cn_namelen)) {
9224 vn_authorize_skipped = TRUE;
9225 goto out1;
9226 }
9227 }
9228
9229 if (holding_mntlock && fvp->v_mount != locked_mp) {
9230 /*
9231 * we're holding a reference and lock
9232 * on locked_mp, but it no longer matches
9233 * what we want to do... so drop our hold
9234 */
9235 mount_unlock_renames(locked_mp);
9236 mount_drop(locked_mp, 0);
9237 holding_mntlock = 0;
9238 }
9239 if (tdvp != fdvp && fvp->v_type == VDIR) {
9240 /*
9241 * serialize renames that re-shape
9242 * the tree... if holding_mntlock is
9243 * set, then we're ready to go...
9244 * otherwise we
9245 * first need to drop the iocounts
9246 * we picked up, second take the
9247 * lock to serialize the access,
9248 * then finally start the lookup
9249 * process over with the lock held
9250 */
9251 if (!holding_mntlock) {
9252 /*
9253 * need to grab a reference on
9254 * the mount point before we
9255 * drop all the iocounts... once
9256 * the iocounts are gone, the mount
9257 * could follow
9258 */
9259 locked_mp = fvp->v_mount;
9260 mount_ref(locked_mp, 0);
9261
9262 /*
9263 * nameidone has to happen before we vnode_put(tvp)
9264 * since it may need to release the fs_nodelock on the tvp
9265 */
9266 nameidone(tond);
9267
9268 if (tvp) {
9269 vnode_put(vp: tvp);
9270 }
9271 vnode_put(vp: tdvp);
9272
9273 /*
9274 * nameidone has to happen before we vnode_put(fdvp)
9275 * since it may need to release the fs_nodelock on the fvp
9276 */
9277 nameidone(fromnd);
9278
9279 vnode_put(vp: fvp);
9280 vnode_put(vp: fdvp);
9281
9282 if (mnt_fvp != NULLVP) {
9283 vnode_put(vp: mnt_fvp);
9284 }
9285
9286 mount_lock_renames(locked_mp);
9287 holding_mntlock = 1;
9288
9289 goto retry;
9290 }
9291 } else {
9292 /*
9293 * when we dropped the iocounts to take
9294 * the lock, we allowed the identity of
9295 * the various vnodes to change... if they did,
9296 * we may no longer be dealing with a rename
9297 * that reshapes the tree... once we're holding
9298 * the iocounts, the vnodes can't change type
9299 * so we're free to drop the lock at this point
9300 * and continue on
9301 */
9302 if (holding_mntlock) {
9303 mount_unlock_renames(locked_mp);
9304 mount_drop(locked_mp, 0);
9305 holding_mntlock = 0;
9306 }
9307 }
9308
9309 if (!batched) {
9310 error = vn_authorize_renamex_with_paths(fdvp, fvp: mntrename ? mnt_fvp : fvp,
9311 fcnp: &fromnd->ni_cnd, from_path: from_name, tdvp, tvp, tcnp: &tond->ni_cnd, to_path: to_name, ctx,
9312 flags, NULL);
9313 if (error) {
9314 if (error == ENOENT) {
9315 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9316 /*
9317 * We encountered a race where after doing the namei,
9318 * tvp stops being valid. If so, simply re-drive the rename
9319 * call from the top.
9320 */
9321 do_retry = 1;
9322 retry_count += 1;
9323 }
9324 }
9325 goto out1;
9326 }
9327 }
9328
9329 /* Release the 'mnt_fvp' now that it is no longer needed. */
9330 if (mnt_fvp != NULLVP) {
9331 vnode_put(vp: mnt_fvp);
9332 mnt_fvp = NULLVP;
9333 }
9334
9335 // save these off so we can later verify that fvp is the same
9336 oname = fvp->v_name;
9337 oparent = fvp->v_parent;
9338
9339skipped_lookup:
9340#if CONFIG_FILE_LEASES
9341 /* Lease break needed for source's parent dir? */
9342 vnode_breakdirlease(vp: fdvp, false, O_WRONLY);
9343
9344 /* Lease break needed for target's parent dir? */
9345 vnode_breakdirlease(vp: tdvp, false, O_WRONLY);
9346#endif
9347
9348 error = vn_rename(fdvp, fvpp: &fvp, fcnp: &fromnd->ni_cnd, fvap,
9349 tdvp, tvpp: &tvp, tcnp: &tond->ni_cnd, tvap,
9350 flags, ctx);
9351
9352 if (holding_mntlock) {
9353 /*
9354 * we can drop our serialization
9355 * lock now
9356 */
9357 mount_unlock_renames(locked_mp);
9358 mount_drop(locked_mp, 0);
9359 holding_mntlock = 0;
9360 }
9361 if (error) {
9362 if (error == EDATALESS) {
9363 /*
9364 * If we've been here before, something has gone
9365 * horribly wrong and we should just get out lest
9366 * we spiral around the drain forever.
9367 */
9368 if (flags & VFS_RENAME_DATALESS) {
9369 error = EIO;
9370 goto out1;
9371 }
9372
9373 /*
9374 * The object we're renaming is dataless (or has a
9375 * dataless descendent) and requires materialization
9376 * before the rename occurs. But we're holding the
9377 * mount point's rename lock, so it's not safe to
9378 * make the upcall.
9379 *
9380 * In this case, we release the lock (above), perform
9381 * the materialization, and start the whole thing over.
9382 */
9383 error = vfs_materialize_reparent(vp: fvp, tdvp);
9384 if (error == 0) {
9385 /*
9386 * The next time around we need to tell the
9387 * file system that the materializtaion has
9388 * been performed.
9389 */
9390 flags |= VFS_RENAME_DATALESS;
9391 do_retry = 1;
9392 }
9393 goto out1;
9394 }
9395 if (error == EKEEPLOOKING) {
9396 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9397 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9398 panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
9399 }
9400 }
9401
9402 fromnd->ni_vp = fvp;
9403 tond->ni_vp = tvp;
9404
9405 goto continue_lookup;
9406 }
9407
9408 /*
9409 * We may encounter a race in the VNOP where the destination didn't
9410 * exist when we did the namei, but it does by the time we go and
9411 * try to create the entry. In this case, we should re-drive this rename
9412 * call from the top again. Currently, only HFS bubbles out ERECYCLE,
9413 * but other filesystems susceptible to this race could return it, too.
9414 */
9415 if (error == ERECYCLE) {
9416 if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
9417 do_retry = 1;
9418 retry_count += 1;
9419 } else {
9420 printf("rename retry limit due to ERECYCLE reached\n");
9421 error = ENOENT;
9422 }
9423 }
9424
9425 /*
9426 * For compound VNOPs, the authorization callback may return
9427 * ENOENT in case of racing hardlink lookups hitting the name
9428 * cache, redrive the lookup.
9429 */
9430 if (batched && error == ENOENT) {
9431 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9432 do_retry = 1;
9433 retry_count += 1;
9434 }
9435 }
9436
9437 goto out1;
9438 }
9439
9440 /* call out to allow 3rd party notification of rename.
9441 * Ignore result of kauth_authorize_fileop call.
9442 */
9443 kauth_authorize_fileop(credential: vfs_context_ucred(ctx),
9444 KAUTH_FILEOP_RENAME,
9445 arg0: (uintptr_t)from_name, arg1: (uintptr_t)to_name);
9446 if (flags & VFS_RENAME_SWAP) {
9447 kauth_authorize_fileop(credential: vfs_context_ucred(ctx),
9448 KAUTH_FILEOP_RENAME,
9449 arg0: (uintptr_t)to_name, arg1: (uintptr_t)from_name);
9450 }
9451
9452#if CONFIG_FSE
9453 if (from_name != NULL && to_name != NULL) {
9454 if (from_truncated || to_truncated) {
9455 // set it here since only the from_finfo gets reported up to user space
9456 from_finfo.mode |= FSE_TRUNCATED_PATH;
9457 }
9458
9459 if (tvap && tvp) {
9460 vnode_get_fse_info_from_vap(vp: tvp, fse: &to_finfo, vap: tvap);
9461 }
9462 if (fvap) {
9463 vnode_get_fse_info_from_vap(vp: fvp, fse: &from_finfo, vap: fvap);
9464 }
9465
9466 if (tvp) {
9467 add_fsevent(FSE_RENAME, ctx,
9468 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9469 FSE_ARG_FINFO, &from_finfo,
9470 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9471 FSE_ARG_FINFO, &to_finfo,
9472 FSE_ARG_DONE);
9473 if (flags & VFS_RENAME_SWAP) {
9474 /*
9475 * Strictly speaking, swap is the equivalent of
9476 * *three* renames. FSEvents clients should only take
9477 * the events as a hint, so we only bother reporting
9478 * two.
9479 */
9480 add_fsevent(FSE_RENAME, ctx,
9481 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9482 FSE_ARG_FINFO, &to_finfo,
9483 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9484 FSE_ARG_FINFO, &from_finfo,
9485 FSE_ARG_DONE);
9486 }
9487 } else {
9488 add_fsevent(FSE_RENAME, ctx,
9489 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9490 FSE_ARG_FINFO, &from_finfo,
9491 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9492 FSE_ARG_DONE);
9493 }
9494 }
9495#endif /* CONFIG_FSE */
9496
9497 /*
9498 * update filesystem's mount point data
9499 */
9500 if (mntrename) {
9501 char *cp, *pathend, *mpname;
9502 char * tobuf;
9503 struct mount *mp;
9504 int maxlen;
9505 size_t len = 0;
9506
9507 mp = fvp->v_mountedhere;
9508
9509 if (vfs_busy(mp, LK_NOWAIT)) {
9510 error = EBUSY;
9511 goto out1;
9512 }
9513 tobuf = zalloc(view: ZV_NAMEI);
9514
9515 if (UIO_SEG_IS_USER_SPACE(segflg)) {
9516 error = copyinstr(uaddr: to, kaddr: tobuf, MAXPATHLEN, done: &len);
9517 } else {
9518 error = copystr(kfaddr: (void *)to, kdaddr: tobuf, MAXPATHLEN, done: &len);
9519 }
9520 if (!error) {
9521 /* find current mount point prefix */
9522 pathend = &mp->mnt_vfsstat.f_mntonname[0];
9523 for (cp = pathend; *cp != '\0'; ++cp) {
9524 if (*cp == '/') {
9525 pathend = cp + 1;
9526 }
9527 }
9528 /* find last component of target name */
9529 for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
9530 if (*cp == '/') {
9531 mpname = cp + 1;
9532 }
9533 }
9534
9535 /* Update f_mntonname of sub mounts */
9536 vfs_iterate(flags: 0, callout: rename_submounts_callback, arg: (void *)mp);
9537
9538 /* append name to prefix */
9539 maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
9540 bzero(s: pathend, n: maxlen);
9541
9542 strlcpy(dst: pathend, src: mpname, n: maxlen);
9543 }
9544 zfree(ZV_NAMEI, tobuf);
9545
9546 vfs_unbusy(mp);
9547
9548 vfs_event_signal(NULL, VQ_UPDATE, data: (intptr_t)NULL);
9549 }
9550 /*
9551 * fix up name & parent pointers. note that we first
9552 * check that fvp has the same name/parent pointers it
9553 * had before the rename call... this is a 'weak' check
9554 * at best...
9555 *
9556 * XXX oparent and oname may not be set in the compound vnop case
9557 */
9558 if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
9559 int update_flags;
9560
9561 update_flags = VNODE_UPDATE_NAME;
9562
9563 if (fdvp != tdvp) {
9564 update_flags |= VNODE_UPDATE_PARENT;
9565 }
9566
9567 vnode_update_identity(vp: fvp, dvp: tdvp, name: tond->ni_cnd.cn_nameptr, name_len: tond->ni_cnd.cn_namelen, name_hashval: tond->ni_cnd.cn_hash, flags: update_flags);
9568 }
9569out1:
9570 /*
9571 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
9572 * skipped earlier as no actual rename was performed.
9573 */
9574 if (vn_authorize_skipped && error == 0) {
9575 error = vn_authorize_renamex_with_paths(fdvp, fvp,
9576 fcnp: &fromnd->ni_cnd, from_path: from_name, tdvp, tvp, tcnp: &tond->ni_cnd, to_path: to_name, ctx,
9577 flags, NULL);
9578 if (error && error == ENOENT) {
9579 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9580 do_retry = 1;
9581 retry_count += 1;
9582 }
9583 }
9584 }
9585 if (to_name != NULL) {
9586 RELEASE_PATH(to_name);
9587 to_name = NULL;
9588 }
9589 if (to_name_no_firmlink != NULL) {
9590 RELEASE_PATH(to_name_no_firmlink);
9591 to_name_no_firmlink = NULL;
9592 }
9593 if (from_name != NULL) {
9594 RELEASE_PATH(from_name);
9595 from_name = NULL;
9596 }
9597 if (from_name_no_firmlink != NULL) {
9598 RELEASE_PATH(from_name_no_firmlink);
9599 from_name_no_firmlink = NULL;
9600 }
9601 if (holding_mntlock) {
9602 mount_unlock_renames(locked_mp);
9603 mount_drop(locked_mp, 0);
9604 holding_mntlock = 0;
9605 }
9606 if (tdvp) {
9607 /*
9608 * nameidone has to happen before we vnode_put(tdvp)
9609 * since it may need to release the fs_nodelock on the tdvp
9610 */
9611 nameidone(tond);
9612
9613 if (tvp) {
9614 vnode_put(vp: tvp);
9615 }
9616 vnode_put(vp: tdvp);
9617 }
9618 if (fdvp) {
9619 /*
9620 * nameidone has to happen before we vnode_put(fdvp)
9621 * since it may need to release the fs_nodelock on the fdvp
9622 */
9623 nameidone(fromnd);
9624
9625 if (fvp) {
9626 vnode_put(vp: fvp);
9627 }
9628 vnode_put(vp: fdvp);
9629 }
9630 if (mnt_fvp != NULLVP) {
9631 vnode_put(vp: mnt_fvp);
9632 }
9633 /*
9634 * If things changed after we did the namei, then we will re-drive
9635 * this rename call from the top.
9636 */
9637 if (do_retry) {
9638 do_retry = 0;
9639 goto retry;
9640 }
9641
9642 kfree_type(typeof(*__rename_data), __rename_data);
9643 return error;
9644}
9645
9646int
9647rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9648{
9649 return renameat_internal(ctx: vfs_context_current(), AT_FDCWD, from: uap->from,
9650 AT_FDCWD, to: uap->to, segflg: UIO_USERSPACE, uflags: 0);
9651}
9652
9653int
9654renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9655{
9656 if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9657 return EINVAL;
9658 }
9659
9660 if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9661 return EINVAL;
9662 }
9663
9664 return renameat_internal(ctx: vfs_context_current(), fromfd: uap->fromfd, from: uap->from,
9665 tofd: uap->tofd, to: uap->to, segflg: UIO_USERSPACE, uflags: uap->flags);
9666}
9667
9668int
9669renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9670{
9671 return renameat_internal(ctx: vfs_context_current(), fromfd: uap->fromfd, from: uap->from,
9672 tofd: uap->tofd, to: uap->to, segflg: UIO_USERSPACE, uflags: 0);
9673}
9674
9675/*
9676 * Make a directory file.
9677 *
9678 * Returns: 0 Success
9679 * EEXIST
9680 * namei:???
9681 * vnode_authorize:???
9682 * vn_create:???
9683 */
9684/* ARGSUSED */
9685static int
9686mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
9687 enum uio_seg segflg)
9688{
9689 vnode_t vp, dvp;
9690 int error;
9691 int update_flags = 0;
9692 int batched;
9693 struct nameidata nd;
9694
9695 AUDIT_ARG(mode, vap->va_mode);
9696 NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
9697 path, ctx);
9698 nd.ni_cnd.cn_flags |= WILLBEDIR;
9699 nd.ni_flag = NAMEI_COMPOUNDMKDIR;
9700
9701continue_lookup:
9702 error = nameiat(ndp: &nd, dirfd: fd);
9703 if (error) {
9704 return error;
9705 }
9706 dvp = nd.ni_dvp;
9707 vp = nd.ni_vp;
9708
9709 if (vp != NULL) {
9710 error = EEXIST;
9711 goto out;
9712 }
9713
9714 batched = vnode_compound_mkdir_available(vp: dvp);
9715
9716 VATTR_SET(vap, va_type, VDIR);
9717
9718 /*
9719 * XXX
9720 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
9721 * only get EXISTS or EISDIR for existing path components, and not that it could see
9722 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
9723 * it will fail in a spurious manner. Need to figure out if this is valid behavior.
9724 */
9725 if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
9726 if (error == EACCES || error == EPERM) {
9727 int error2;
9728
9729 nameidone(&nd);
9730 vnode_put(vp: dvp);
9731 dvp = NULLVP;
9732
9733 /*
9734 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
9735 * rather than EACCESS if the target exists.
9736 */
9737 NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
9738 path, ctx);
9739 error2 = nameiat(ndp: &nd, dirfd: fd);
9740 if (error2) {
9741 goto out;
9742 } else {
9743 vp = nd.ni_vp;
9744 error = EEXIST;
9745 goto out;
9746 }
9747 }
9748
9749 goto out;
9750 }
9751
9752#if CONFIG_FILE_LEASES
9753 vnode_breakdirlease(vp: dvp, false, O_WRONLY);
9754#endif
9755
9756 /*
9757 * make the directory
9758 */
9759 if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
9760 if (error == EKEEPLOOKING) {
9761 nd.ni_vp = vp;
9762 goto continue_lookup;
9763 }
9764
9765 goto out;
9766 }
9767
9768 // Make sure the name & parent pointers are hooked up
9769 if (vp->v_name == NULL) {
9770 update_flags |= VNODE_UPDATE_NAME;
9771 }
9772 if (vp->v_parent == NULLVP) {
9773 update_flags |= VNODE_UPDATE_PARENT;
9774 }
9775
9776 if (update_flags) {
9777 vnode_update_identity(vp, dvp, name: nd.ni_cnd.cn_nameptr, name_len: nd.ni_cnd.cn_namelen, name_hashval: nd.ni_cnd.cn_hash, flags: update_flags);
9778 }
9779
9780#if CONFIG_FSE
9781 add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
9782#endif
9783
9784out:
9785 /*
9786 * nameidone has to happen before we vnode_put(dvp)
9787 * since it may need to release the fs_nodelock on the dvp
9788 */
9789 nameidone(&nd);
9790
9791 if (vp) {
9792 vnode_put(vp);
9793 }
9794 if (dvp) {
9795 vnode_put(vp: dvp);
9796 }
9797
9798 return error;
9799}
9800
9801/*
9802 * mkdir_extended: Create a directory; with extended security (ACL).
9803 *
9804 * Parameters: p Process requesting to create the directory
9805 * uap User argument descriptor (see below)
9806 * retval (ignored)
9807 *
9808 * Indirect: uap->path Path of directory to create
9809 * uap->mode Access permissions to set
9810 * uap->xsecurity ACL to set
9811 *
9812 * Returns: 0 Success
9813 * !0 Not success
9814 *
9815 */
9816int
9817mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
9818{
9819 int ciferror;
9820 kauth_filesec_t xsecdst;
9821 struct vnode_attr va;
9822
9823 AUDIT_ARG(owner, uap->uid, uap->gid);
9824
9825 xsecdst = NULL;
9826 if ((uap->xsecurity != USER_ADDR_NULL) &&
9827 ((ciferror = kauth_copyinfilesec(xsecurity: uap->xsecurity, xsecdestpp: &xsecdst)) != 0)) {
9828 return ciferror;
9829 }
9830
9831 VATTR_INIT(&va);
9832 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9833 if (xsecdst != NULL) {
9834 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
9835 va.va_vaflags |= VA_FILESEC_ACL;
9836 }
9837
9838 ciferror = mkdir1at(ctx: vfs_context_current(), path: uap->path, vap: &va, AT_FDCWD,
9839 segflg: UIO_USERSPACE);
9840 if (xsecdst != NULL) {
9841 kauth_filesec_free(fsp: xsecdst);
9842 }
9843 return ciferror;
9844}
9845
9846int
9847mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
9848{
9849 struct vnode_attr va;
9850
9851 VATTR_INIT(&va);
9852 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9853
9854 return mkdir1at(ctx: vfs_context_current(), path: uap->path, vap: &va, AT_FDCWD,
9855 segflg: UIO_USERSPACE);
9856}
9857
9858int
9859mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
9860{
9861 struct vnode_attr va;
9862
9863 VATTR_INIT(&va);
9864 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9865
9866 return mkdir1at(ctx: vfs_context_current(), path: uap->path, vap: &va, fd: uap->fd,
9867 segflg: UIO_USERSPACE);
9868}
9869
/*
 * Common implementation backing rmdir(2) and directory removal via
 * unlinkat(2).
 *
 * ctx:          caller's VFS context (credentials, process).
 * fd:           directory fd 'dirpath' is relative to (AT_FDCWD for cwd).
 * dirpath:      pathname of the directory to remove, interpreted per segflg.
 * segflg:       address space the path string lives in (user or system).
 * unlink_flags: VNODE_REMOVE_* modifiers (NOFOLLOW_ANY, DATALESS_DIR).
 *
 * Returns 0 on success, or an errno.
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	/* Heap-allocated because nameidata (plus optional notify attrs) is large. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
#endif /* CONFIG_FSE */
	} *__rmdir_data;
	vnode_t vp, dvp;
	int error;
	struct nameidata *ndp;
	char *path = NULL;
	char *no_firmlink_path = NULL;
	int len_path = 0;
	int len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
	struct vnode_attr *vap = NULL;
	int restart_count = 0;  /* bounds the ENOENT-race redrives */
	int batched;

	int restart_flag;
	int nofollow_any = 0;

	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
	ndp = &__rmdir_data->nd;

	/* Translate the caller-visible flag into its namei equivalent. */
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		ndp->ni_flag = NAMEI_COMPOUNDRMDIR | nofollow_any;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(ndp, dirfd: fd);
		if (error) {
			goto err_out;
		}

		dvp = ndp->ni_dvp;
		vp = ndp->ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			/* Swap-backing store may only be removed by the kernel itself. */
			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete. Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, cnp: &ndp->ni_cnd, ctx, NULL);
				if (error) {
					/* Authorization can race a lookup; redrive a bounded number of times. */
					if (error == ENOENT) {
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No leaf vnode: authorization is deferred into the compound VNOP. */
			batched = 1;

			if (!vnode_compound_rmdir_available(vp: dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info finfo = {0};

		need_event = need_fsevent(FSE_DELETE, vp: dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, fse: &finfo, ctx);
			} else {
				/* Compound case: have the FS collect notify attrs during the VNOP. */
				error = vfs_get_notify_attributes(vap: &__rmdir_data->va);
				if (error) {
					goto out;
				}

				vap = &__rmdir_data->va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			/* Capture the victim's path (with and without firmlinks) for notification. */
			if (path == NULL) {
				GET_PATH(path);
			}

			len_path = safe_getpath(dvp, leafname: ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, truncated_path: &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, leafname: ndp->ni_cnd.cn_nameptr, path: no_firmlink_path, MAXPATHLEN, truncated_path: &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

#if CONFIG_FILE_LEASES
		/* Removing an entry modifies the parent: break any directory lease first. */
		vnode_breakdirlease(vp: dvp, false, O_WRONLY);
#endif

		error = vn_rmdir(dvp, vpp: &vp, ndp, vap, ctx);
		ndp->ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* Compound VNOP wants the lookup continued with state left in ndp. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR(). So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * Only do this if the directory is actually
			 * marked as DATALESS.
			 */
			struct vnode_attr *lvap =
			    kalloc_type(struct vnode_attr, Z_WAITOK);

			VATTR_INIT(lvap);
			VATTR_WANTED(lvap, va_flags);
			if (vnode_getattr(vp, vap: lvap, ctx) == 0 &&
			    VATTR_IS_SUPPORTED(lvap, va_flags) &&
			    (lvap->va_flags & SF_DATALESS) != 0) {
				/*
				 * If this fails, we want to keep the original
				 * error.
				 */
				if (vn_remove(dvp, vpp: &vp, ndp,
				    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
					error = 0;
				}
			}
			kfree_type(struct vnode_attr, lvap);
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, vpp: &vp, ndp, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(credential: vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    arg0: (uintptr_t)vp,
				    arg1: (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, name_len: 0, name_hashval: 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, fse: &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif

#if CONFIG_MACF
			mac_vnode_notify_unlink(ctx, dvp, vp, cnp: &ndp->ni_cnd);
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(ndp);
		vnode_put(vp: dvp);

		if (vp) {
			vnode_put(vp);
		}

		if (restart_flag == 0) {
			/* NOTE(review): presumably wakes a racing rmdir in the tsleep below — confirm. */
			wakeup_one(chan: (caddr_t)vp);
			goto err_out;
		}
		/* Restarting: sleep briefly so the competing rmdir can make progress. */
		tsleep(chan: vp, PVFS, wmesg: "rm AD", timo: 1);
	} while (restart_flag != 0);

err_out:
	kfree_type(typeof(*__rmdir_data), __rmdir_data);

	return error;
}
10171
10172/*
10173 * Remove a directory file.
10174 */
10175/* ARGSUSED */
10176int
10177rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
10178{
10179 return rmdirat_internal(ctx: vfs_context_current(), AT_FDCWD,
10180 CAST_USER_ADDR_T(uap->path), segflg: UIO_USERSPACE, unlink_flags: 0);
10181}
10182
/*
 * Get direntry length padded to 8 byte alignment.
 * NOTE(review): assumes struct direntry's trailing d_name field is
 * MAXPATHLEN bytes, so the unused tail ((MAXPATHLEN-1) - namlen) is
 * subtracted before rounding up to a multiple of 8 — confirm against
 * the struct definition.
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Get dirent length padded to 4 byte alignment.
 * Same idea for struct dirent with a (__DARWIN_MAXNAMLEN + 1)-byte d_name;
 * (namelen + 1) accounts for the NUL terminator.
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the end of this dirent: address of its last byte per d_reclen. */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
10194
10195errno_t
10196vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10197 int *numdirent, vfs_context_t ctxp)
10198{
10199 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
10200 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10201 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10202 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10203 } else {
10204 size_t bufsize;
10205 void * bufptr;
10206 uio_t auio;
10207 struct direntry *entry64;
10208 struct dirent *dep;
10209 size_t bytesread;
10210 int error;
10211
10212 /*
10213 * We're here because the underlying file system does not
10214 * support direnties or we mounted denying support so we must
10215 * fall back to dirents and convert them to direntries.
10216 *
10217 * Our kernel buffer needs to be smaller since re-packing will
10218 * expand each dirent. The worse case (when the name length
10219 * is 3 or less) corresponds to a struct direntry size of 32
10220 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10221 * (4-byte aligned). So having a buffer that is 3/8 the size
10222 * will prevent us from reading more than we can pack.
10223 *
10224 * Since this buffer is wired memory, we will limit the
10225 * buffer size to a maximum of 32K. We would really like to
10226 * use 32K in the MIN(), but we use magic number 87371 to
10227 * prevent uio_resid() * 3 / 8 from overflowing.
10228 */
10229 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10230 bufptr = kalloc_data(bufsize, Z_WAITOK);
10231 if (bufptr == NULL) {
10232 return ENOMEM;
10233 }
10234
10235 auio = uio_create(a_iovcount: 1, a_offset: 0, a_spacetype: UIO_SYSSPACE, a_iodirection: UIO_READ);
10236 uio_addiov(a_uio: auio, a_baseaddr: (uintptr_t)bufptr, a_length: bufsize);
10237 auio->uio_offset = uio->uio_offset;
10238
10239 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10240
10241 dep = (struct dirent *)bufptr;
10242 bytesread = bufsize - uio_resid(a_uio: auio);
10243
10244 entry64 = kalloc_type(struct direntry, Z_WAITOK);
10245 /*
10246 * Convert all the entries and copy them out to user's buffer.
10247 */
10248 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10249 /* First check that the dirent struct up to d_name is within the buffer */
10250 if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10251 /* Check that the length of the entire dirent is within the buffer */
10252 DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10253 /* Check that the actual length including the name doesn't exceed d_reclen */
10254 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10255 printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10256 vp->v_mount->mnt_vfsstat.f_mntonname,
10257 vp->v_name ? vp->v_name : "<unknown>");
10258 error = EIO;
10259 break;
10260 }
10261
10262 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
10263
10264 bzero(s: entry64, n: enbufsize);
10265 /* Convert a dirent to a dirent64. */
10266 entry64->d_ino = dep->d_ino;
10267 entry64->d_seekoff = 0;
10268 entry64->d_reclen = (uint16_t)enbufsize;
10269 entry64->d_namlen = dep->d_namlen;
10270 entry64->d_type = dep->d_type;
10271 bcopy(src: dep->d_name, dst: entry64->d_name, n: dep->d_namlen + 1);
10272
10273 /* Move to next entry. */
10274 dep = (struct dirent *)((char *)dep + dep->d_reclen);
10275
10276 /* Copy entry64 to user's buffer. */
10277 error = uiomove(cp: (caddr_t)entry64, n: entry64->d_reclen, uio);
10278 }
10279
10280 /* Update the real offset using the offset we got from VNOP_READDIR. */
10281 if (error == 0) {
10282 uio->uio_offset = auio->uio_offset;
10283 }
10284 uio_free(a_uio: auio);
10285 kfree_data(bufptr, bufsize);
10286 kfree_type(struct direntry, entry64);
10287 return error;
10288 }
10289}
10290
10291#define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
10292
/*
 * Read a block of directory entries in a file system independent format.
 *
 * Common implementation for getdirentries() and getdirentries64(): reads
 * from the directory open on 'fd' into the user buffer 'bufp'/'bufsize',
 * returning the byte count in *bytesread and the starting file offset in
 * *offset.  'flags' selects the extended (struct direntry) layout via
 * VNODE_READDIR_EXTENDED.  Returns 0 or an errno.
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current(); /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(ctx: &context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	UIO_STACKBUF(uio_buf, 1);

get_from_fd:
	error = fp_getfvp(p: vfs_context_proc(ctx: &context), fd, resultfp: &fp, resultvp: &vp);
	if (error) {
		return error;
	}

	/*
	 * Serialize access to the fd's offset; if the fd's backing vnode
	 * changed before we got the lock, drop everything and retry.
	 */
	vn_offset_lock(fg: fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fg: fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* The directory must be open for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Silently clamp oversized requests rather than failing them. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(cred: vfs_context_ucred(ctx: &context), fg: fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx: &context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Read starting at the fd's current offset. */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(a_iovcount: 1, a_offset: loff, a_spacetype: spacetype, a_iodirection: UIO_READ, a_buf_p: &uio_buf[0], a_buffer_size: sizeof(uio_buf));
	uio_addiov(a_uio: auio, a_baseaddr: bufp, a_length: bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		/* Extended layout: vnode_readdir64() converts if the fs can't. */
		error = vnode_readdir64(vp, uio: auio, flags, eofflag, numdirent: &numdirent, ctxp: &context);
		fp->fp_glob->fg_offset = uio_offset(a_uio: auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(a_uio: auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * Nothing was read and this is a union mount: descend to the
	 * mounted-on directory, swap it into the fd, and read from there.
	 */
	if ((user_ssize_t)bufsize == uio_resid(a_uio: auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(dvp: vp, new_dvp: &uvp, ctx: &context) == 0) {
			if (vnode_ref(vp: uvp) == 0) {
				fp_set_data(fp, fg_data: uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(vp: uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(a_uio: auio);
out:
	vn_offset_unlock(fg: fp->fp_glob);
	file_drop(fd);
	return error;
}
10409
10410
10411int
10412getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
10413{
10414 off_t offset;
10415 ssize_t bytesread;
10416 int error, eofflag;
10417
10418 AUDIT_ARG(fd, uap->fd);
10419 error = getdirentries_common(fd: uap->fd, bufp: uap->buf, bufsize: uap->count,
10420 bytesread: &bytesread, offset: &offset, eofflag: &eofflag, flags: 0);
10421
10422 if (error == 0) {
10423 if (proc_is64bit(p)) {
10424 user64_long_t base = (user64_long_t)offset;
10425 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
10426 } else {
10427 user32_long_t base = (user32_long_t)offset;
10428 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
10429 }
10430 *retval = (int)bytesread;
10431 }
10432 return error;
10433}
10434
10435int
10436getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
10437{
10438 off_t offset;
10439 ssize_t bytesread;
10440 int error, eofflag;
10441 user_size_t bufsize;
10442
10443 AUDIT_ARG(fd, uap->fd);
10444
10445 /*
10446 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
10447 * then the kernel carves out the last 4 bytes to return extended
10448 * information to userspace (namely whether we reached EOF with this call).
10449 */
10450 if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10451 bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
10452 } else {
10453 bufsize = uap->bufsize;
10454 }
10455
10456 error = getdirentries_common(fd: uap->fd, bufp: uap->buf, bufsize,
10457 bytesread: &bytesread, offset: &offset, eofflag: &eofflag, VNODE_READDIR_EXTENDED);
10458
10459 if (error == 0) {
10460 *retval = bytesread;
10461 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
10462
10463 if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10464 getdirentries64_flags_t flags = 0;
10465 if (eofflag) {
10466 flags |= GETDIRENTRIES64_EOF;
10467 }
10468 error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
10469 sizeof(flags));
10470 }
10471 }
10472 return error;
10473}
10474
10475
10476/*
10477 * Set the mode mask for creation of filesystem nodes.
10478 * XXX implement xsecurity
10479 */
10480#define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
10481static int
10482umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
10483{
10484 AUDIT_ARG(mask, newmask);
10485 proc_fdlock(p);
10486 *retval = p->p_fd.fd_cmask;
10487 p->p_fd.fd_cmask = newmask & ALLPERMS;
10488 proc_fdunlock(p);
10489 return 0;
10490}
10491
/*
 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
 *
 * Parameters:	p			Process requesting to set the umask
 *		uap			User argument descriptor (see below)
 *		retval			umask of the process (parameter p)
 *
 * Indirect:	uap->newmask		umask to set
 *		uap->xsecurity		ACL to set
 *
 * Returns:	0			Success
 *		!0			Not success
 *
 */
int
umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
{
	/*
	 * NOTE(review): KAUTH_FILESEC_NONE is passed rather than the
	 * caller-supplied uap->xsecurity; umask1() marks fsec __unused.
	 */
	return umask1(p, newmask: uap->newmask, KAUTH_FILESEC_NONE, retval);
}
10511
/*
 * umask(2): set the file creation mask, leaving any existing extended
 * security state alone; the previous mask is returned via *retval.
 */
int
umask(proc_t p, struct umask_args *uap, int32_t *retval)
{
	return umask1(p, newmask: uap->newmask, UMASK_NOXSECURITY, retval);
}
10517
#define REVOKE_MOUNTED_DEVICE_ENTITLEMENT \
	"com.apple.private.vfs.revoke-mounted-device"

/*
 * Void all references to file by ripping underlying filesystem
 * away from vnode.
 */
/* ARGSUSED */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(ndp: &nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* revoke(2) is only supported on character and block devices. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* Refuse to revoke a block device that has a file system mounted on it. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* Only the device's owner or the superuser may revoke it. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, vap: &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(cred: vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(cred: vfs_context_ucred(ctx), acflag: &p->p_acflag))) {
		goto out;
	}
	/* Only bother if someone holds use counts or the vnode is aliased. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
10578
10579
10580/*
10581 * HFS/HFS PlUS SPECIFIC SYSTEM CALLS
10582 * The following system calls are designed to support features
10583 * which are specific to the HFS & HFS Plus volume formats
10584 */
10585
10586
/*
 * Obtain attribute information on objects in a directory while enumerating
 * the directory.
 *
 * Reads entries plus the requested attributes (uap->alist) from the
 * directory open on uap->fd into uap->buffer.  *uap->count is in/out:
 * requested vs. actually-returned entry count.  *retval is the eofflag.
 */
/* ARGSUSED */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag = 0;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	UIO_STACKBUF(uio_buf, 1);
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* Remember the requested count in case we restart on a union layer. */
	savecount = count;

get_from_fd:
	if ((error = fp_getfvp(p, fd, resultfp: &fp, resultvp: &vp))) {
		return error;
	}

	/*
	 * Serialize access to the fd's offset; if the fd's backing vnode
	 * changed before we got the lock, drop everything and retry.
	 */
	vn_offset_lock(fg: fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fg: fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* The directory must be open for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(cred: vfs_context_ucred(ctx),
	    fg: fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(a_iovcount: 1, a_offset: loff, a_spacetype: spacetype, a_iodirection: UIO_READ, a_buf_p: &uio_buf[0], a_buffer_size: sizeof(uio_buf));
	uio_addiov(a_uio: auio, a_baseaddr: uap->buffer, a_length: uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(a_uio: auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else { // Empty buffer
			vnode_t uvp;
			if (lookup_traverse_union(dvp: vp, new_dvp: &uvp, ctx) == 0) {
				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
					/* Swap the lower directory into the fd and restart. */
					fp_set_data(fp, fg_data: uvp);
					fp->fp_glob->fg_offset = 0; // reset index for new dir
					count = savecount;
					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
					vnode_put(vp);
					vp = uvp;
					goto unionread;
				} else {
					/* could not get a ref, can't replace in fd */
					vnode_put(vp: uvp);
				}
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(a_uio: auio); /* should be multiple of dirent, not variable */

	/* Copy out the out-parameters: entry count, directory state, base offset. */
	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag;  /* similar to getdirentries */
	error = 0;
out:
	vn_offset_unlock(fg: fp->fp_glob);
	file_drop(fd);
	return error; /* error returned earlier; retval of 0 or 1 now */
} /* end of getdirentriesattr system call */
10755
/*
 * Exchange data between two files
 *
 * Swaps the contents of uap->path1 and uap->path2 via VNOP_EXCHANGE.
 * Both must be regular files on the same volume.  On success the cached
 * vnode names/parents are swapped and fsevents/fileop listeners notified.
 */

/* ARGSUSED */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(ndp: &fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(ndp: &snd);
	if (error) {
		vnode_put(vp: fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(vp: fvp) == 0) || (vnode_isreg(vp: svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    v1: fvp, v2: svp);
	if (error) {
		goto out;
	}
#endif
	/* Caller needs read and write access to both files. */
	if (((error = vnode_authorize(vp: fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(vp: svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/* Only compute the paths when someone is listening for the result. */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, vp: fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(dvp: fvp, NULL, path: fpath, MAXPATHLEN, truncated_path: &from_truncated);
		slen = safe_getpath(dvp: svp, NULL, path: spath, MAXPATHLEN, truncated_path: &to_truncated);

#if CONFIG_FSE
		get_fse_info(vp: fvp, fse: &f_finfo, ctx);
		get_fse_info(vp: svp, fse: &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(credential: vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    arg0: (uintptr_t)fpath, arg1: (uintptr_t)spath);
		}

		/* Swap the cached names and parents so they track the new contents. */
		name_cache_lock();

		tmpname = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(vp: svp);
	vnode_put(vp: fvp);
out2:
	return error;
}
10911
10912/*
10913 * Return (in MB) the amount of freespace on the given vnode's volume.
10914 */
10915uint32_t freespace_mb(vnode_t vp);
10916
10917uint32_t
10918freespace_mb(vnode_t vp)
10919{
10920 vfs_update_vfsstat(mp: vp->v_mount, ctx: vfs_context_current(), VFS_USER_EVENT);
10921 return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
10922 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
10923}
10924
10925#if CONFIG_SEARCHFS
10926
10927/* ARGSUSED */
10928
10929int
10930searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
10931{
10932 vnode_t vp, tvp;
10933 int i, error = 0;
10934 int fserror = 0;
10935 struct nameidata nd;
10936 struct user64_fssearchblock searchblock;
10937 struct searchstate *state;
10938 struct attrlist *returnattrs;
10939 struct timeval timelimit;
10940 void *searchparams1, *searchparams2;
10941 uio_t auio = NULL;
10942 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10943 uint32_t nummatches;
10944 size_t mallocsize;
10945 uint32_t nameiflags;
10946 vfs_context_t ctx = vfs_context_current();
10947 UIO_STACKBUF(uio_buf, 1);
10948
10949 /* Start by copying in fsearchblock parameter list */
10950 if (IS_64BIT_PROCESS(p)) {
10951 error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
10952 timelimit.tv_sec = searchblock.timelimit.tv_sec;
10953 timelimit.tv_usec = searchblock.timelimit.tv_usec;
10954 } else {
10955 struct user32_fssearchblock tmp_searchblock;
10956
10957 error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
10958 // munge into 64-bit version
10959 searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
10960 searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
10961 searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
10962 searchblock.maxmatches = tmp_searchblock.maxmatches;
10963 /*
10964 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
10965 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
10966 */
10967 timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
10968 timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
10969 searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
10970 searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
10971 searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
10972 searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
10973 searchblock.searchattrs = tmp_searchblock.searchattrs;
10974 }
10975 if (error) {
10976 return error;
10977 }
10978
10979 /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
10980 */
10981 if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
10982 searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
10983 return EINVAL;
10984 }
10985
10986 /* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
10987 /* It all has to do into local memory and it's not that big so we might as well put it all together. */
10988 /* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
10989 /* block. */
10990 /* */
10991 /* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
10992 /* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
10993 /* assumes the size is still 556 bytes it will continue to work */
10994
10995 mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
10996 sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));
10997
10998 searchparams1 = kalloc_data(mallocsize, Z_WAITOK);
10999
11000 /* Now set up the various pointers to the correct place in our newly allocated memory */
11001
11002 searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
11003 returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
11004 state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));
11005
11006 /* Now copy in the stuff given our local variables. */
11007
11008 if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
11009 goto freeandexit;
11010 }
11011
11012 if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
11013 goto freeandexit;
11014 }
11015
11016 if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
11017 goto freeandexit;
11018 }
11019
11020 if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
11021 goto freeandexit;
11022 }
11023
11024 /*
11025 * When searching a union mount, need to set the
11026 * start flag at the first call on each layer to
11027 * reset state for the new volume.
11028 */
11029 if (uap->options & SRCHFS_START) {
11030 state->ss_union_layer = 0;
11031 } else {
11032 uap->options |= state->ss_union_flags;
11033 }
11034 state->ss_union_flags = 0;
11035
11036 /*
11037 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
11038 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
11039 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
11040 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
11041 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
11042 */
11043
11044 if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
11045 attrreference_t* string_ref;
11046 u_int32_t* start_length;
11047 user64_size_t param_length;
11048
11049 /* validate searchparams1 */
11050 param_length = searchblock.sizeofsearchparams1;
11051 /* skip the word that specifies length of the buffer */
11052 start_length = (u_int32_t*) searchparams1;
11053 start_length = start_length + 1;
11054 string_ref = (attrreference_t*) start_length;
11055
11056 /* ensure no negative offsets or too big offsets */
11057 if (string_ref->attr_dataoffset < 0) {
11058 error = EINVAL;
11059 goto freeandexit;
11060 }
11061 if (string_ref->attr_length > MAXPATHLEN) {
11062 error = EINVAL;
11063 goto freeandexit;
11064 }
11065
11066 /* Check for pointer overflow in the string ref */
11067 if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
11068 error = EINVAL;
11069 goto freeandexit;
11070 }
11071
11072 if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
11073 error = EINVAL;
11074 goto freeandexit;
11075 }
11076 if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
11077 error = EINVAL;
11078 goto freeandexit;
11079 }
11080 }
11081
11082 /* set up the uio structure which will contain the users return buffer */
11083 auio = uio_createwithbuffer(a_iovcount: 1, a_offset: 0, a_spacetype: spacetype, a_iodirection: UIO_READ, a_buf_p: &uio_buf[0], a_buffer_size: sizeof(uio_buf));
11084 uio_addiov(a_uio: auio, a_baseaddr: searchblock.returnbuffer, a_length: searchblock.returnbuffersize);
11085
11086 nameiflags = 0;
11087 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
11088 nameiflags |= FOLLOW;
11089 }
11090 NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
11091 UIO_USERSPACE, uap->path, ctx);
11092
11093 error = namei(ndp: &nd);
11094 if (error) {
11095 goto freeandexit;
11096 }
11097 vp = nd.ni_vp;
11098 nameidone(&nd);
11099
11100 /*
11101 * Switch to the root vnode for the volume
11102 */
11103 error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
11104 vnode_put(vp);
11105 if (error) {
11106 goto freeandexit;
11107 }
11108 vp = tvp;
11109
11110#if CONFIG_UNION_MOUNTS
11111 /*
11112 * If it's a union mount, the path lookup takes
11113 * us to the top layer. But we may need to descend
11114 * to a lower layer. For non-union mounts the layer
11115 * is always zero.
11116 */
11117 for (i = 0; i < (int) state->ss_union_layer; i++) {
11118 if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
11119 break;
11120 }
11121 tvp = vp;
11122 vp = vp->v_mount->mnt_vnodecovered;
11123 if (vp == NULL) {
11124 vnode_put(vp: tvp);
11125 error = ENOENT;
11126 goto freeandexit;
11127 }
11128 error = vnode_getwithref(vp);
11129 vnode_put(vp: tvp);
11130 if (error) {
11131 goto freeandexit;
11132 }
11133 }
11134#endif /* CONFIG_UNION_MOUNTS */
11135
11136#if CONFIG_MACF
11137 error = mac_vnode_check_searchfs(ctx, vp, returnattrs, searchattrs: &searchblock.searchattrs);
11138 if (error) {
11139 vnode_put(vp);
11140 goto freeandexit;
11141 }
11142#endif
11143
11144
11145 /*
11146 * If searchblock.maxmatches == 0, then skip the search. This has happened
11147 * before and sometimes the underlying code doesnt deal with it well.
11148 */
11149 if (searchblock.maxmatches == 0) {
11150 nummatches = 0;
11151 goto saveandexit;
11152 }
11153
11154 /*
11155 * Allright, we have everything we need, so lets make that call.
11156 *
11157 * We keep special track of the return value from the file system:
11158 * EAGAIN is an acceptable error condition that shouldn't keep us
11159 * from copying out any results...
11160 */
11161
11162 fserror = VNOP_SEARCHFS(vp,
11163 searchparams1,
11164 searchparams2,
11165 &searchblock.searchattrs,
11166 (uint32_t)searchblock.maxmatches,
11167 &timelimit,
11168 returnattrs,
11169 &nummatches,
11170 (uint32_t)uap->scriptcode,
11171 (uint32_t)uap->options,
11172 auio,
11173 (struct searchstate *) &state->ss_fsstate,
11174 ctx);
11175
11176#if CONFIG_UNION_MOUNTS
11177 /*
11178 * If it's a union mount we need to be called again
11179 * to search the mounted-on filesystem.
11180 */
11181 if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
11182 state->ss_union_flags = SRCHFS_START;
11183 state->ss_union_layer++; // search next layer down
11184 fserror = EAGAIN;
11185 }
11186#endif /* CONFIG_UNION_MOUNTS */
11187
11188saveandexit:
11189
11190 vnode_put(vp);
11191
11192 /* Now copy out the stuff that needs copying out. That means the number of matches, the
11193 * search state. Everything was already put into he return buffer by the vop call. */
11194
11195 if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
11196 goto freeandexit;
11197 }
11198
11199 if ((error = suulong(addr: uap->nummatches, ulongword: (uint64_t)nummatches)) != 0) {
11200 goto freeandexit;
11201 }
11202
11203 error = fserror;
11204
11205freeandexit:
11206
11207 kfree_data(searchparams1, mallocsize);
11208
11209 return error;
11210} /* end of searchfs system call */
11211
11212#else /* CONFIG_SEARCHFS */
11213
/* searchfs(2) stub when CONFIG_SEARCHFS is disabled: always unsupported. */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
11219
11220#endif /* CONFIG_SEARCHFS */
11221
11222
11223#if CONFIG_DATALESS_FILES
11224
11225/*
11226 * === Namespace Resolver Up-call Mechanism ===
11227 *
11228 * When I/O is performed to a dataless file or directory (read, write,
11229 * lookup-in, etc.), the file system performs an upcall to the namespace
11230 * resolver (filecoordinationd) to materialize the object.
11231 *
11232 * We need multiple up-calls to be in flight at once, and we need these
11233 * up-calls to be interruptible, thus the following implementation:
11234 *
11235 * => The nspace_resolver_request represents the in-kernel request state.
11236 * It contains a request ID, storage space for the errno code returned
11237 * by filecoordinationd, and flags.
11238 *
11239 * => The request ID is simply a global monotonically incrementing 32-bit
11240 * number. Outstanding requests are stored in a hash table, and the
11241 * hash function is extremely simple.
11242 *
11243 * => When an upcall is to be made to filecoordinationd, a request structure
11244 * is allocated on the stack (it is small, and needs to live only during
11245 * the duration of the call to resolve_nspace_item_ext()). It is
11246 * initialized and inserted into the table. Some backpressure from
 *    filecoordinationd is applied by limiting the number of entries that
11248 * can be inserted into the table (and thus limiting the number of
11249 * outstanding requests issued to filecoordinationd); waiting for an
11250 * available slot is interruptible.
11251 *
11252 * => Once the request has been inserted into the table, the up-call is made
11253 * to filecoordinationd via a MiG-generated stub. The up-call returns
11254 * immediately and filecoordinationd processes the request asynchronously.
11255 *
 * => The caller now waits for the request to complete.  This is achieved by
11257 * sleeping on the address of the request structure and waiting for
11258 * filecoordinationd to mark the request structure as complete. This
11259 * is an interruptible sleep call; if interrupted, the request structure
11260 * is removed from the table and EINTR is returned to the caller. If
11261 * this occurs, an advisory up-call is made to filecoordinationd with
11262 * the request ID to indicate that the request can be aborted or
11263 * de-prioritized at the discretion of filecoordinationd.
11264 *
11265 * => When filecoordinationd has completed the request, it signals completion
11266 * by writing to the vfs.nspace.complete sysctl node. Only a process
11267 * decorated as a namespace resolver can write to this sysctl node. The
11268 * value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11269 * The request ID is looked up in the table, and if the request is found,
11270 * the error code is stored in the request structure and a wakeup()
11271 * issued on the address of the request structure. If the request is not
11272 * found, we simply drop the completion notification, assuming that the
11273 * caller was interrupted.
11274 *
11275 * => When the waiting thread wakes up, it extracts the error code from the
11276 * request structure, removes the request from the table, and returns the
11277 * error code to the calling function. Fini!
11278 */
11279
/*
 * In-kernel state for one outstanding materialization request.  The
 * structure lives on the requesting thread's stack for the duration of
 * the up-call (see the big comment above), linked into the global
 * request hash table while outstanding.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* hash-bucket linkage */
	vnode_t r_vp;           /* object being materialized */
	vnode_t r_tdvp;         /* rename destination dir, or NULL */
	uint32_t r_req_id;      /* ID used to match completions */
	int r_resolver_error;   /* errno reported back by the resolver */
	int r_flags;            /* RRF_* flags, protected by NSPACE_REQ_LOCK */
};

#define RRF_COMPLETE 0x0001   /* request has completed; waiter may proceed */
#define RRF_COMPLETING 0x0002 /* completion handler still using 'req' */

/*
 * Completion tuple written by the resolver via vfs.nspace.complete.
 * orig_gencount / orig_syncroot are optional namespace-shape criteria
 * (0 means "not specified").
 */
struct nspace_resolver_completion_data {
	uint32_t req_id;          /* which request completed */
	int32_t resolver_error;   /* errno result for the request */
	uint64_t orig_gencount;   /* expected recursive gencount, or 0 */
	uint64_t orig_syncroot;   /* expected sync-root ID, or 0 */
};
11298
/*
 * Allocate the next request ID by atomically bumping a global 32-bit
 * counter.  IDs may eventually wrap; uniqueness among the (bounded set
 * of) outstanding requests is what matters here.
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	return OSAddAtomic(1, &next_req_id);
}
11306
#define NSPACE_RESOLVER_REQ_HASHSIZE 32 /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256 /* XXX tune */

/* Hash table of outstanding resolver requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
/* Current number of table entries; bounded by MAX_OUTSTANDING. */
static u_int nspace_resolver_request_count;
/* True when some thread sleeps waiting for a free request slot. */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
/* Mutex guarding the table, the count, and per-request r_flags. */
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

/* Acquire/release the request-table mutex. */
#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Map a request ID to its hash bucket (hashmask is power-of-2 - 1). */
#define NSPACE_RESOLVER_HASH(req_id) \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	nspace_resolver_request_hashmask])
11327
11328static struct nspace_resolver_request *
11329nspace_resolver_req_lookup(uint32_t req_id, bool skip_completing)
11330{
11331 struct nspace_resolver_requesthead *bucket;
11332 struct nspace_resolver_request *req;
11333
11334 bucket = NSPACE_RESOLVER_HASH(req_id);
11335 LIST_FOREACH(req, bucket, r_hashlink) {
11336 if (req->r_req_id == req_id) {
11337 /*
11338 * If this request already has a completion
11339 * pending, don't return it again.
11340 */
11341 if ((req->r_flags & RRF_COMPLETING) != 0 &&
11342 skip_completing) {
11343 req = NULL;
11344 }
11345 return req;
11346 }
11347 }
11348
11349 return NULL;
11350}
11351
/*
 * Insert 'req' into the request table, taking one of the
 * NSPACE_RESOLVER_MAX_OUTSTANDING slots.  If no slot is free, sleep
 * interruptibly until one is.  Returns 0 on success or the msleep()
 * error (e.g. EINTR) if the wait was interrupted.
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	NSPACE_REQ_LOCK();

	/* Back-pressure: bound the number of outstanding up-calls. */
	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		nspace_resolver_request_wait_slot = true;
		error = msleep(chan: &nspace_resolver_request_count,
		    mtx: &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, wmesg: "nspacerq", NULL);
		if (error) {
			NSPACE_REQ_UNLOCK();
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id, false) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	NSPACE_REQ_UNLOCK();

	return 0;
}
11383
/*
 * Block until any in-progress completion handler is done with 'req'.
 * Caller holds NSPACE_REQ_LOCK (msleep drops and retakes it).
 */
static void
nspace_resolver_req_wait_pending_completion(struct nspace_resolver_request *req)
{
	/*
	 * If a completion is in-progress, we have to wait for the
	 * completion handler to finish because it's still using 'req',
	 * which is allocated on our stack a couple of frames up.
	 */
	while ((req->r_flags & RRF_COMPLETING) != 0) {
		(void) msleep(chan: req, mtx: &nspace_resolver_request_hash_mutex,
		    PVFS, wmesg: "nspacecmplt", NULL);
	}
}
11397
/*
 * Remove 'req' from the request table, wake any thread waiting for a
 * free slot, wait out any in-flight completion handler, and drop
 * NSPACE_REQ_LOCK.  On return 'req' is no longer referenced by the
 * resolver machinery.
 */
static void
nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;

	/* We're called with NSPACE_REQ_LOCK held. */

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert((req->r_flags & RRF_COMPLETING) == 0);
	assert(nspace_resolver_req_lookup(req->r_req_id, false) != NULL);
#endif /* DIAGNOSTIC */
	LIST_REMOVE(req, r_hashlink);
	nspace_resolver_request_count--;

	/* A slot just freed up; notify waiters in req_add(). */
	if (nspace_resolver_request_wait_slot) {
		nspace_resolver_request_wait_slot = false;
		wakeup(chan: &nspace_resolver_request_count);
	}

	/* Make sure no completion handler still references 'req'. */
	nspace_resolver_req_wait_pending_completion(req);

	NSPACE_REQ_UNLOCK();
}
11422
/*
 * Convenience wrapper: take NSPACE_REQ_LOCK, then remove 'req' from
 * the table (which also drops the lock).
 */
static void
nspace_resolver_req_remove(struct nspace_resolver_request *req)
{
	NSPACE_REQ_LOCK();
	nspace_resolver_req_remove_and_unlock(req);
}
11429
/*
 * Send an advisory cancellation for 'req_id' to filecoordinationd.
 * Best-effort only: the resolver may use it to abort or de-prioritize
 * the request, and all failures here are ignored.
 */
static void
nspace_resolver_req_cancel(uint32_t req_id)
{
	kern_return_t kr;
	mach_port_t mp;

	// Failures here aren't fatal -- the cancellation message
	// sent to the resolver is merely advisory.

	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
		return;
	}

	kr = send_nspace_resolve_cancel(nspace_handler_port: mp, req_id);
	if (kr != KERN_SUCCESS) {
		os_log_error(OS_LOG_DEFAULT,
		    "NSPACE send_nspace_resolve_cancel failure: %d", kr);
	}

	/* Drop the send right obtained above. */
	ipc_port_release_send(port: mp);
}
11452
/*
 * Wait (interruptibly) for the resolver to complete 'req'.  On return
 * the request has been removed from the table.  If the sleep is
 * interrupted, the request is failed locally (EINTR or ETIMEDOUT) and
 * an advisory cancel message is sent to the resolver.  Returns the
 * request's resolver errno.
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(chan: req, mtx: &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, wmesg: "nspace", NULL);
		if (error && error != ERESTART) {
			/* Interrupted; fail the request ourselves. */
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove_and_unlock(req);

	/*
	 * It's safe to continue referencing 'req' here because it's
	 * allocated on our caller's stack.
	 */

	if (send_cancel_message) {
		nspace_resolver_req_cancel(req_id: req->r_req_id);
	}

	return req->r_resolver_error;
}
11485
/*
 * Record the resolver's result in 'req', clear the in-progress flag,
 * set RRF_COMPLETE, and wake the thread sleeping in req_wait().
 * Caller holds NSPACE_REQ_LOCK.
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags = (req->r_flags & ~RRF_COMPLETING) | RRF_COMPLETE;
	wakeup(chan: req);
}
11495
/*
 * Mark 'req' as having a completion in progress so that nothing else
 * touches it while the handler drops NSPACE_REQ_LOCK to do I/O.
 * Caller holds NSPACE_REQ_LOCK.
 */
static void
nspace_resolver_req_mark_completion_pending(struct nspace_resolver_request *req)
{
	req->r_flags |= RRF_COMPLETING;
}
11501
/*
 * Process a completion notification from the resolver (delivered via
 * the vfs.nspace.complete sysctl).  Looks up the request by ID,
 * optionally validates the namespace-shape criteria supplied by the
 * resolver (recursive gencount and/or APFS sync-root ID), then marks
 * the request complete and wakes the waiter.
 */
static void
nspace_resolver_req_completed(const struct nspace_resolver_completion_data *c)
{
	struct nspace_resolver_request *req;
	int error;
	struct vnode_attr va;
	vnode_t vp;

	NSPACE_REQ_LOCK();

	req = nspace_resolver_req_lookup(req_id: c->req_id, true);
	if (req == NULL) {
		/*
		 * If we don't find the request corresponding to our req_id,
		 * just drop the completion on the floor; it's likely that
		 * the requester interrupted with a signal, or it may already
		 * be completing.
		 */
		NSPACE_REQ_UNLOCK();
		return;
	}

	/*
	 * Get out now if the resolver reported an error.
	 */
	if ((error = c->resolver_error) != 0) {
		goto out;
	}

	/*
	 * If the resolver did not specify any namespace shape criteria
	 * for letting the operation proceed, then get out now.
	 */
	if (c->orig_gencount == 0 && c->orig_syncroot == 0) {
		goto out;
	}

	/*
	 * We're going to have to acquire the mount rename lock and do
	 * some I/O in order to verify the criteria. Mark the request
	 * as pending so no one else messes with it after we drop the
	 * NSPACE_REQ_LOCK.
	 */
	nspace_resolver_req_mark_completion_pending(req);
	NSPACE_REQ_UNLOCK();

	/*
	 * Lock out renames from changing the shape of the tree while
	 * we validate the criteria.
	 */
	mount_t locked_mp = req->r_vp->v_mount;
	mount_ref(locked_mp, 0);
	mount_lock_renames(locked_mp);

	if (c->orig_gencount != 0) {
		vp = req->r_vp;
		/* NOTE(review): 'error' is always 0 here (checked above). */
		if (error) {
			goto out_dropmount;
		}

		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_recursive_gencount);
		error = vnode_getattr(vp, vap: &va, ctx: vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		/* Tree shape changed since the resolver sampled it. */
		if (VATTR_NOT_RETURNED(&va, va_recursive_gencount) ||
		    va.va_recursive_gencount != c->orig_gencount) {
			printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n",
			    c->orig_gencount, va.va_recursive_gencount);
			error = EBUSY;
			goto out_dropmount;
		}
	}

	/*
	 * Ignore orig_syncroot if a destination directory wasn't specified
	 * in the request.
	 */
	if (c->orig_syncroot != 0 && (vp = req->r_tdvp) != NULL) {
		uint64_t syncroot_id;

		/* NOTE(review): 'error' is always 0 here as well. */
		if (error) {
			goto out_dropmount;
		}

#ifndef APFSIOC_GET_SYNC_ROOT
#define APFSIOC_GET_SYNC_ROOT _IOR('J', 115, uint64_t)
#endif

		error = VNOP_IOCTL(vp, APFSIOC_GET_SYNC_ROOT,
		    data: (caddr_t)&syncroot_id, fflag: 0, ctx: vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		if (syncroot_id != c->orig_syncroot) {
			printf("nspace.complete: syncroot changed! (orig %llu cur %llu)\n",
			    c->orig_syncroot, syncroot_id);
			error = EBUSY;
			goto out_dropmount;
		}
	}

out_dropmount:
	mount_unlock_renames(locked_mp);
	mount_drop(locked_mp, 0);
	/* Retake the lock before completing; falls through to 'out'. */
	NSPACE_REQ_LOCK();

out:
	nspace_resolver_req_mark_complete(req, resolver_error: error);
	NSPACE_REQ_UNLOCK();
}
11614
/* The process currently registered as the namespace resolver, or NULL. */
static struct proc *nspace_resolver_proc;
11616
11617static int
11618nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
11619{
11620 *is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
11621 p == nspace_resolver_proc) ? 1 : 0;
11622 return 0;
11623}
11624
static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);

/*
 * Register (is_resolver != 0) or unregister process 'p' as the
 * namespace resolver.  Only a root process holding the
 * dataless-resolver entitlement may do this, and only one resolver may
 * be registered at a time.  Returns 0, EPERM, or EBUSY.
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0. This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx) ||
	    !vfs_context_is_dataless_resolver(ctx)) {
		return EPERM;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			// Some other process already registered.
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
11666
11667static int
11668nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
11669{
11670 if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
11671 (p->p_vfs_iopolicy &
11672 P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
11673 *is_prevented = 1;
11674 } else {
11675 *is_prevented = 0;
11676 }
11677 return 0;
11678}
11679
/*
 * Set (is_prevented != 0) or clear the per-process "don't materialize
 * dataless files" policy.  The resolver process must always remain
 * prevented (it cannot call itself), so clearing the policy on the
 * resolver fails with EBUSY.  Returns 0 or EBUSY.
 */
static int
nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
{
	if (p->p_lflag & P_LNSPACE_RESOLVER) {
		return is_prevented ? 0 : EBUSY;
	}

	if (is_prevented) {
		OSBitAndAtomic16(mask: ~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), address: &p->p_vfs_iopolicy);
	} else {
		OSBitOrAtomic16(mask: (uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, address: &p->p_vfs_iopolicy);
	}
	return 0;
}
11694
11695static int
11696nspace_materialization_get_thread_state(int *is_prevented)
11697{
11698 uthread_t ut = current_uthread();
11699
11700 *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
11701 return 0;
11702}
11703
11704static int
11705nspace_materialization_set_thread_state(int is_prevented)
11706{
11707 uthread_t ut = current_uthread();
11708
11709 if (is_prevented) {
11710 ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
11711 } else {
11712 ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
11713 }
11714 return 0;
11715}
11716
/* the vfs.nspace branch */
SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");

/*
 * vfs.nspace.resolver handler.  Reading returns whether the calling
 * process is the registered resolver; writing 1/0 registers or
 * unregisters it (subject to the checks in
 * nspace_resolver_set_proc_state()).
 */
static int
sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
    __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	int new_value, old_value, changed = 0;
	int error;

	error = nspace_resolver_get_proc_state(p, is_resolver: &old_value);
	if (error) {
		return error;
	}

	error = sysctl_io_number(req, bigValue: old_value, valueSize: sizeof(int), pValue: &new_value,
	    changed: &changed);
	if (error == 0 && changed) {
		error = nspace_resolver_set_proc_state(p, is_resolver: new_value);
	}
	return error;
}

/* decorate this process as the dataless file resolver */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_resolver, "I", "");
11745
/*
 * vfs.nspace.prevent_materialization handler.  Reading returns whether
 * the calling process is prevented from materializing dataless files;
 * writing 1/0 sets or clears the per-process policy.
 */
static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
    __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	int new_value, old_value, changed = 0;
	int error;

	error = nspace_materialization_get_proc_state(p, is_prevented: &old_value);
	if (error) {
		return error;
	}

	error = sysctl_io_number(req, bigValue: old_value, valueSize: sizeof(int), pValue: &new_value,
	    changed: &changed);
	if (error == 0 && changed) {
		error = nspace_materialization_set_proc_state(p, is_prevented: new_value);
	}
	return error;
}

/* decorate this process as not wanting to materialize dataless files */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_prevent_materialization, "I", "");
11771
/*
 * vfs.nspace.thread_prevent_materialization handler.  Same as the
 * per-process variant above, but for the calling thread's decoration.
 */
static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
    __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
	int new_value, old_value, changed = 0;
	int error;

	error = nspace_materialization_get_thread_state(is_prevented: &old_value);
	if (error) {
		return error;
	}

	error = sysctl_io_number(req, bigValue: old_value, valueSize: sizeof(int), pValue: &new_value,
	    changed: &changed);
	if (error == 0 && changed) {
		error = nspace_materialization_set_thread_state(is_prevented: new_value);
	}
	return error;
}

/* decorate this thread as not wanting to materialize dataless files */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
11796
/*
 * vfs.nspace.complete handler.  Only the registered resolver may write
 * here.  The payload is a { req_id, errno } pair of uint32_t's,
 * optionally followed by a uint64_t gencount and a uint64_t sync-root
 * ID (both optional namespace-shape criteria; absent reads yield 0).
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	uint64_t syncroot = 0;
	int error, is_resolver, changed = 0, other_changed;

	error = nspace_resolver_get_proc_state(p, is_resolver: &is_resolver);
	if (error) {
		return error;
	}

	/* Only the decorated resolver process may post completions. */
	if (!is_resolver) {
		return EPERM;
	}

	error = sysctl_io_opaque(req, pValue: req_status, valueSize: sizeof(req_status),
	    changed: &changed);
	if (error) {
		return error;
	}

	/*
	 * Get the gencount if it was passed. Ignore errors, because
	 * it's optional.
	 */
	error = sysctl_io_opaque(req, pValue: &gencount, valueSize: sizeof(gencount),
	    changed: &other_changed);
	if (error) {
		gencount = 0;
		error = 0;
	}

	/*
	 * ...and now the syncroot ID.
	 */
	error = sysctl_io_opaque(req, pValue: &syncroot, valueSize: sizeof(syncroot),
	    changed: &other_changed);
	if (error) {
		syncroot = 0;
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		const struct nspace_resolver_completion_data cd = {
			.req_id = req_status[0],
			.resolver_error = req_status[1],
			.orig_gencount = gencount,
			.orig_syncroot = syncroot,
		};
		nspace_resolver_req_completed(c: &cd);
	}
	return error;
}

/* Resolver reports completed reqs here. */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_complete, "-", "");
11864
11865#endif /* CONFIG_DATALESS_FILES */
11866
/*
 * Parameters that are referenced only when CONFIG_DATALESS_FILES is
 * enabled are marked __unused in the stub configuration.
 */
#if CONFIG_DATALESS_FILES
#define __no_dataless_unused /* nothing */
#else
#define __no_dataless_unused __unused
#endif
11872
11873int
11874vfs_context_dataless_materialization_is_prevented(
11875 vfs_context_t const ctx __no_dataless_unused)
11876{
11877#if CONFIG_DATALESS_FILES
11878 proc_t const p = vfs_context_proc(ctx);
11879 thread_t const t = vfs_context_thread(ctx);
11880 uthread_t const ut = t ? get_bsdthread_info(t) : NULL;
11881
11882 /*
11883 * Kernel context ==> return EDEADLK, as we would with any random
11884 * process decorated as no-materialize.
11885 */
11886 if (ctx == vfs_context_kernel()) {
11887 return EDEADLK;
11888 }
11889
11890 /*
11891 * If the process has the dataless-manipulation entitlement,
11892 * materialization is prevented, and depending on the kind
11893 * of file system operation, things get to proceed as if the
11894 * object is not dataless.
11895 */
11896 if (vfs_context_is_dataless_manipulator(ctx)) {
11897 return EJUSTRETURN;
11898 }
11899
11900 /*
11901 * Per-thread decorations override any process-wide decorations.
11902 * (Foundation uses this, and this overrides even the dataless-
11903 * manipulation entitlement so as to make API contracts consistent.)
11904 */
11905 if (ut != NULL) {
11906 if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
11907 return EDEADLK;
11908 }
11909 if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
11910 return 0;
11911 }
11912 }
11913
11914 /*
11915 * If the process's iopolicy specifies that dataless files
11916 * can be materialized, then we let it go ahead.
11917 */
11918 if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
11919 return 0;
11920 }
11921#endif /* CONFIG_DATALESS_FILES */
11922
11923 /*
11924 * The default behavior is to not materialize dataless files;
11925 * return to the caller that deadlock was detected.
11926 */
11927 return EDEADLK;
11928}
11929
/*
 * One-time initialization of the resolver request hash table.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, hashmask: &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
11939
/*
 * Called when the resolver process exits (or resigns via the sysctl).
 * If 'p' is the registered resolver, fail every outstanding request
 * with ETIMEDOUT (waking their waiters) and clear the registration.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Sweep all buckets; requests stay linked until their
		 * waiters remove them. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				/* Let in-flight completions finish first. */
				nspace_resolver_req_wait_pending_completion(req);
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
11966
/* Entitlements gating the resolver and manipulation roles. */
#define DATALESS_RESOLVER_ENTITLEMENT \
	"com.apple.private.vfs.dataless-resolver"
#define DATALESS_MANIPULATION_ENTITLEMENT \
	"com.apple.private.vfs.dataless-manipulation"

#if CONFIG_DATALESS_FILES
/*
 * Return TRUE if the vfs context is associated with the dataless
 * resolver (i.e. its task holds the dataless-resolver entitlement).
 */
static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
{
	return IOTaskHasEntitlement(task: vfs_context_task(ctx),
	    DATALESS_RESOLVER_ENTITLEMENT);
}
#endif /* CONFIG_DATALESS_FILES */
11984
11985/*
11986 * Return TRUE if the vfs context is associated with a process entitled
11987 * for dataless manipulation.
11988 *
11989 * XXX Arguably belongs in vfs_subr.c, but is here because of the
11990 * complication around CONFIG_DATALESS_FILES.
11991 */
11992boolean_t
11993vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
11994{
11995#if CONFIG_DATALESS_FILES
11996 task_t task = vfs_context_task(ctx);
11997 return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
11998 IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
11999#else
12000 return false;
12001#endif /* CONFIG_DATALESS_FILES */
12002}
12003
#if CONFIG_DATALESS_FILES
/*
 * Debug-log that the current process was denied materialization of
 * 'vp' while attempting operation 'op'.  DEVELOPMENT builds also log
 * the vnode's path.
 */
static void
log_materialization_prevented(vnode_t vp, uint64_t op)
{
	char p_name[MAXCOMLEN + 1];
	char *vntype;
	proc_selfname(buf: &p_name[0], size: sizeof(p_name));

	/* Map the vnode type to a short tag for the log line. */
	if (vp->v_type == VREG) {
		vntype = "File";
	} else if (vp->v_type == VDIR) {
		vntype = "Dir";
	} else if (vp->v_type == VLNK) {
		vntype = "SymLink";
	} else {
		vntype = "Other";
	}

#if DEVELOPMENT
	char *path = NULL;
	int len;

	path = get_pathbuff();
	len = MAXPATHLEN;
	if (path) {
		/* Best effort; ignore lookup failures. */
		vn_getpath(vp, path, &len);
	}

	os_log_debug(OS_LOG_DEFAULT,
	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) path: %s",
	    p_name, proc_selfpid(),
	    op, vntype, path ? path : "<unknown-path>");
	if (path) {
		release_pathbuff(path);
	}
#else
	os_log_debug(OS_LOG_DEFAULT,
	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
	    p_name, proc_selfpid(),
	    op, vntype);
#endif
}
#endif /* CONFIG_DATALESS_FILES */
12047
12048static int
12049vfs_materialize_item(
12050 vnode_t vp __no_dataless_unused,
12051 uint32_t op __no_dataless_unused,
12052 int64_t offset __no_dataless_unused,
12053 int64_t size __no_dataless_unused,
12054 char *lookup_name __no_dataless_unused,
12055 size_t const namelen __no_dataless_unused,
12056 vnode_t tdvp __no_dataless_unused)
12057{
12058#if CONFIG_DATALESS_FILES
12059 kern_return_t kern_ret;
12060 mach_port_t mach_port;
12061 char *path = NULL;
12062 vfs_context_t context;
12063 int path_len;
12064 int error;
12065 audit_token_t atoken;
12066 enum vtype vp_vtype;
12067
12068 /* Swap files are special; ignore them */
12069 if (vnode_isswap(vp)) {
12070 return 0;
12071 }
12072
12073 /*
12074 * NAMESPACE_HANDLER_SNAPSHOT_EVENT and NAMESPACE_HANDLER_TRACK_EVENT
12075 * are no longer used nor supported.
12076 */
12077 if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
12078 os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
12079 return ENOTSUP;
12080 }
12081 if (op & NAMESPACE_HANDLER_TRACK_EVENT) {
12082 os_log_debug(OS_LOG_DEFAULT, "NSPACE TRACK not handled");
12083 return ENOTSUP;
12084 }
12085
12086 /* Normalize 'op'. */
12087 op &= ~NAMESPACE_HANDLER_EVENT_TYPE_MASK;
12088
12089 /*
12090 * To-directory is only meaningful for rename operations;
12091 * ignore it if someone handed one to us unexpectedly.
12092 */
12093 if (op != NAMESPACE_HANDLER_RENAME_OP) {
12094 tdvp = NULL;
12095 }
12096
12097 context = vfs_context_current();
12098
12099 /* Remember this for later. */
12100 vp_vtype = vnode_vtype(vp);
12101
12102 error = vfs_context_dataless_materialization_is_prevented(ctx: context);
12103 if (error) {
12104 log_materialization_prevented(vp, op);
12105 goto out_check_errors;
12106 }
12107
12108 kern_ret = host_get_filecoordinationd_port(host_priv_self(),
12109 &mach_port);
12110 if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
12111 os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
12112 /*
12113 * Treat this like being unable to access the backing store
12114 * server.
12115 */
12116 return ETIMEDOUT;
12117 }
12118
12119 int path_alloc_len = MAXPATHLEN;
12120 do {
12121 path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
12122 if (path == NULL) {
12123 return ENOMEM;
12124 }
12125
12126 path_len = path_alloc_len;
12127 error = vn_getpath(vp, pathbuf: path, len: &path_len);
12128 if (error == 0) {
12129 break;
12130 } else if (error == ENOSPC) {
12131 kfree_data(path, path_alloc_len);
12132 path = NULL;
12133 } else {
12134 goto out_release_port;
12135 }
12136 } while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) && path_alloc_len <= FSGETPATH_MAXBUFLEN);
12137
12138 error = vfs_context_copy_audit_token(ctx: context, token: &atoken);
12139 if (error) {
12140 goto out_release_port;
12141 }
12142
12143 struct nspace_resolver_request req = {
12144 .r_req_id = next_nspace_req_id(),
12145 .r_vp = vp,
12146 .r_tdvp = tdvp,
12147 };
12148
12149 error = nspace_resolver_req_add(req: &req);
12150 if (error) {
12151 goto out_release_port;
12152 }
12153
12154 os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
12155
12156 if (op == NAMESPACE_HANDLER_RENAME_OP && tdvp != NULL) {
12157 char *dest_path = NULL;
12158 int dest_path_len;
12159
12160 dest_path = zalloc(view: ZV_NAMEI);
12161 dest_path_len = MAXPATHLEN;
12162
12163 error = vn_getpath(vp: tdvp, pathbuf: dest_path, len: &dest_path_len);
12164 if (error) {
12165 zfree(ZV_NAMEI, dest_path);
12166 goto out_release_port;
12167 }
12168
12169 /*
12170 * Force setting NAMESPACE_HANDLER_NSPACE_EVENT for
12171 * compatibility with existing agents in user-space
12172 * who get passed this value.
12173 */
12174 kern_ret = send_vfs_resolve_reparent_with_audit_token(nspace_handler_port: mach_port,
12175 req_id: req.r_req_id,
12176 op: op | NAMESPACE_HANDLER_NSPACE_EVENT,
12177 path, dest_path, req_atoken: atoken);
12178
12179 zfree(ZV_NAMEI, dest_path);
12180 } else if (vp_vtype == VDIR) {
12181 char *tmpname = NULL;
12182
12183 /*
12184 * If the caller provided a lookup_name *and* a name length,
12185 * then we assume the lookup_name is not NUL-terminated.
12186 * Allocate a temporary buffer in this case to provide
12187 * a NUL-terminated path name to the IPC call.
12188 */
12189 if (lookup_name != NULL && namelen != 0) {
12190 if (namelen >= PATH_MAX) {
12191 error = EINVAL;
12192 goto out_req_remove;
12193 }
12194 tmpname = zalloc(view: ZV_NAMEI);
12195 strlcpy(dst: tmpname, src: lookup_name, n: namelen + 1);
12196 lookup_name = tmpname;
12197 } else if (lookup_name != NULL) {
12198 /*
12199 * If the caller provided a lookup_name with a
12200 * zero name length, then we assume it's NUL-
12201 * terminated. Verify it has a valid length.
12202 */
12203 if (strlen(s: lookup_name) >= PATH_MAX) {
12204 error = EINVAL;
12205 goto out_req_remove;
12206 }
12207 }
12208
12209 /* (See above.) */
12210 kern_ret = send_vfs_resolve_dir_with_audit_token(nspace_handler_port: mach_port,
12211 req_id: req.r_req_id,
12212 op: op | NAMESPACE_HANDLER_NSPACE_EVENT,
12213 file_name: lookup_name == NULL ? "" : lookup_name, path, req_atoken: atoken);
12214
12215 if (tmpname != NULL) {
12216 zfree(ZV_NAMEI, tmpname);
12217
12218 /*
12219 * Poison lookup_name rather than reference
12220 * freed memory.
12221 */
12222 lookup_name = NULL;
12223 }
12224 } else {
12225 /* (See above.) */
12226 kern_ret = send_vfs_resolve_file_with_audit_token(nspace_handler_port: mach_port,
12227 req_id: req.r_req_id,
12228 op: op | NAMESPACE_HANDLER_NSPACE_EVENT,
12229 offset, size, path, req_atoken: atoken);
12230 }
12231 if (kern_ret != KERN_SUCCESS) {
12232 /*
12233 * Also treat this like being unable to access the backing
12234 * store server.
12235 */
12236 os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
12237 kern_ret);
12238 error = ETIMEDOUT;
12239 goto out_req_remove;
12240 }
12241
12242 /*
12243 * Give back the memory we allocated earlier while we wait; we
12244 * no longer need it.
12245 */
12246 kfree_data(path, path_alloc_len);
12247 path = NULL;
12248
12249 /*
12250 * Request has been submitted to the resolver. Now (interruptibly)
12251 * wait for completion. Upon requrn, the request will have been
12252 * removed from the lookup table.
12253 */
12254 error = nspace_resolver_req_wait(req: &req);
12255
12256out_release_port:
12257 if (path != NULL) {
12258 kfree_data(path, path_alloc_len);
12259 path = NULL;
12260 }
12261 ipc_port_release_send(port: mach_port);
12262
12263out_check_errors:
12264 /*
12265 * The file resolver owns the logic about what error to return
12266 * to the caller. We only need to handle a couple of special
12267 * cases here:
12268 */
12269 if (error == EJUSTRETURN) {
12270 /*
12271 * The requesting process is allowed to interact with
12272 * dataless objects. Make a couple of sanity-checks
12273 * here to ensure the action makes sense.
12274 */
12275 switch (op) {
12276 case NAMESPACE_HANDLER_WRITE_OP:
12277 case NAMESPACE_HANDLER_TRUNCATE_OP:
12278 case NAMESPACE_HANDLER_RENAME_OP:
12279 /*
12280 * This handles the case of the resolver itself
12281 * writing data to the file (or throwing it
12282 * away).
12283 */
12284 error = 0;
12285 break;
12286 case NAMESPACE_HANDLER_READ_OP:
12287 case NAMESPACE_HANDLER_LOOKUP_OP:
12288 /*
12289 * This handles the case of the resolver needing
12290 * to look up inside of a dataless directory while
12291 * it's in the process of materializing it (for
12292 * example, creating files or directories).
12293 */
12294 error = (vp_vtype == VDIR) ? 0 : EBADF;
12295 break;
12296 default:
12297 error = EBADF;
12298 break;
12299 }
12300 }
12301
12302 return error;
12303
12304out_req_remove:
12305 nspace_resolver_req_remove(req: &req);
12306 goto out_release_port;
12307#else
12308 return ENOTSUP;
12309#endif /* CONFIG_DATALESS_FILES */
12310}
12311
12312/*
12313 * vfs_materialize_file: Materialize a regular file.
12314 *
12315 * Inputs:
12316 * vp The dataless file to be materialized.
12317 *
12318 * op What kind of operation is being performed:
12319 * -> NAMESPACE_HANDLER_READ_OP
12320 * -> NAMESPACE_HANDLER_WRITE_OP
12321 * -> NAMESPACE_HANDLER_LINK_CREATE
12322 * -> NAMESPACE_HANDLER_DELETE_OP
12323 * -> NAMESPACE_HANDLER_TRUNCATE_OP
12324 * -> NAMESPACE_HANDLER_RENAME_OP
12325 *
12326 * offset offset of I/O for READ or WRITE. Ignored for
12327 * other ops.
12328 *
12329 * size size of I/O for READ or WRITE Ignored for
12330 * other ops.
12331 *
12332 * If offset or size are -1 for a READ or WRITE, then the resolver should
12333 * consider the range to be unknown.
12334 *
12335 * Upon successful return, the caller may proceed with the operation.
12336 * N.B. the file may still be "dataless" in this case.
12337 */
12338int
12339vfs_materialize_file(
12340 struct vnode *vp,
12341 uint64_t op,
12342 int64_t offset,
12343 int64_t size)
12344{
12345 if (vp->v_type != VREG) {
12346 return EFTYPE;
12347 }
12348 return vfs_materialize_item(vp, op: (uint32_t)op, offset, size, NULL, namelen: 0,
12349 NULL);
12350}
12351
12352/*
12353 * vfs_materialize_dir:
12354 *
12355 * Inputs:
12356 * vp The dataless directory to be materialized.
12357 *
12358 * op What kind of operation is being performed:
12359 * -> NAMESPACE_HANDLER_READ_OP
12360 * -> NAMESPACE_HANDLER_WRITE_OP
12361 * -> NAMESPACE_HANDLER_DELETE_OP
12362 * -> NAMESPACE_HANDLER_RENAME_OP
12363 * -> NAMESPACE_HANDLER_LOOKUP_OP
12364 *
12365 * lookup_name Name being looked up for a LOOKUP op. Ignored for
12366 * other ops. May or may not be NUL-terminated; see below.
12367 *
12368 * namelen If non-zero, then lookup_name is assumed to not be NUL-
12369 * terminated and namelen is the number of valid bytes in
12370 * lookup_name. If zero, then lookup_name is assumed to be
12371 * NUL-terminated.
12372 *
12373 * Upon successful return, the caller may proceed with the operation.
12374 * N.B. the directory may still be "dataless" in this case.
12375 */
12376int
12377vfs_materialize_dir(
12378 struct vnode *vp,
12379 uint64_t op,
12380 char *lookup_name,
12381 size_t namelen)
12382{
12383 if (vp->v_type != VDIR) {
12384 return EFTYPE;
12385 }
12386 if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
12387 return EINVAL;
12388 }
12389 return vfs_materialize_item(vp, op: (uint32_t)op, offset: 0, size: 0, lookup_name,
12390 namelen, NULL);
12391}
12392
12393/*
12394 * vfs_materialize_reparent:
12395 *
12396 * Inputs:
12397 * vp The dataless file or directory to be materialized.
12398 *
12399 * tdvp The new parent directory for the dataless file.
12400 *
12401 * Upon successful return, the caller may proceed with the operation.
12402 * N.B. the item may still be "dataless" in this case.
12403 */
12404int
12405vfs_materialize_reparent(vnode_t vp, vnode_t tdvp)
12406{
12407 if (vp->v_type != VDIR && vp->v_type != VREG) {
12408 return EFTYPE;
12409 }
12410 return vfs_materialize_item(vp, NAMESPACE_HANDLER_RENAME_OP,
12411 offset: 0, size: 0, NULL, namelen: 0, tdvp);
12412}
12413
12414#if 0
/*
 * build_volfs_path: construct a "/.vol/<fsid>/<fileid>" path for a vnode.
 * (Currently compiled out via #if 0.)
 *
 * On entry *len is the capacity of 'path'; on return it is set to the
 * generated string length (including the NUL).  Returns 0 on success,
 * -1 if the vnode's attributes could not be fetched (in which case a
 * placeholder path is written instead).
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	/* We only need the fsid and fileid to name the volfs path. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
12435#endif
12436
12437static unsigned long
12438fsctl_bogus_command_compat(unsigned long cmd)
12439{
12440 switch (cmd) {
12441 case IOCBASECMD(FSIOC_SYNC_VOLUME):
12442 return FSIOC_SYNC_VOLUME;
12443 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
12444 return FSIOC_ROUTEFS_SETROUTEID;
12445 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
12446 return FSIOC_SET_PACKAGE_EXTS;
12447 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
12448 return FSIOC_SET_FSTYPENAME_OVERRIDE;
12449 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
12450 return DISK_CONDITIONER_IOC_GET;
12451 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
12452 return DISK_CONDITIONER_IOC_SET;
12453 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
12454 return FSIOC_FIOSEEKHOLE;
12455 case IOCBASECMD(FSIOC_FIOSEEKDATA):
12456 return FSIOC_FIOSEEKDATA;
12457 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
12458 return SPOTLIGHT_IOC_GET_MOUNT_TIME;
12459 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
12460 return SPOTLIGHT_IOC_GET_LAST_MTIME;
12461 }
12462
12463 return cmd;
12464}
12465
/*
 * cas_bsdflags_setattr: setattr callback handed to chflags0() by
 * handle_flags().  Performs the compare-and-swap of BSD flags in the
 * filesystem via the FSIOC_CAS_BSDFLAGS ioctl; 'arg' is the
 * struct fsioc_cas_bsdflags passed through from the caller.
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, data: arg, FWRITE, ctx);
}
12471
/*
 * handle_sync_volume: implement FSIOC_SYNC_VOLUME.
 *
 * Syncs the mount containing 'vp'.  The caller's iocount on vp is
 * dropped here (after taking a vnode_hold and a mount iteration
 * reference) so the sync cannot deadlock against vnode iteration; on
 * return *arg_vp is set to NULL to tell the caller the vnode has been
 * released.  'data' points at the user's FSCTL_SYNC_* option word.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	/* Keep the vnode memory alive across the sync while giving up the iocount. */
	vnode_hold(vp);
	vnode_put(vp);

	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filessytem supports multiple filesytems in a
	 * partition (For eg APFS volumes in a container, it knows
	 * that the waitfor argument to VFS_SYNC are flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, vfa: &vfa, ctx: vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, arg: &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	/*
	 * NOTE(review): this tests the MNT_* wait-mode word ('arg'), not the
	 * user's option word in 'data', against FSCTL_SYNC_FULLSYNC.  Matches
	 * long-standing behavior; confirm before changing.
	 */
	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, data: (caddr_t)NULL, fflag: 0, ctx);
			vnode_put(vp);
		}
	}
	vnode_drop(vp);
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
12536
12537#if ROUTEFS
12538static int __attribute__((noinline))
12539handle_routes(user_addr_t udata)
12540{
12541 char routepath[MAXPATHLEN];
12542 size_t len = 0;
12543 int error;
12544
12545 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
12546 return error;
12547 }
12548 bzero(routepath, MAXPATHLEN);
12549 error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
12550 if (error) {
12551 return error;
12552 }
12553 error = routefs_kernel_mount(routepath);
12554 return error;
12555}
12556#endif
12557
12558static int __attribute__((noinline))
12559handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
12560{
12561 struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
12562 struct vnode_attr va;
12563 int error;
12564
12565 VATTR_INIT(&va);
12566 VATTR_SET(&va, va_flags, cas->new_flags);
12567
12568 error = chflags0(vp, va: &va, setattr: cas_bsdflags_setattr, arg: cas, ctx);
12569
12570#if CONFIG_FSE
12571 if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
12572 add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
12573 }
12574#endif
12575
12576 return error;
12577}
12578
12579static int __attribute__((noinline))
12580handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
12581{
12582 struct mount *mp = NULL;
12583 errno_t rootauth = 0;
12584
12585 mp = vp->v_mount;
12586
12587 /*
12588 * query the underlying FS and see if it reports something
12589 * sane for this vnode. If volume is authenticated via
12590 * chunklist, leave that for the caller to determine.
12591 */
12592 rootauth = VNOP_IOCTL(vp, command: cmd, data, fflag: (int)options, ctx);
12593
12594 return rootauth;
12595}
12596
12597#define SET_PACKAGE_EXTENSION_ENTITLEMENT \
12598 "com.apple.private.kernel.set-package-extensions"
12599
12600/*
12601 * Make a filesystem-specific control call:
12602 */
12603/* ARGSUSED */
/*
 * fsctl_internal: common implementation for fsctl(2) and ffsctl(2).
 *
 * Marshals the ioctl argument (copyin for IOC_IN, zeroed buffer for
 * IOC_OUT, the raw value for IOC_VOID or zero-size IOC_IN), dispatches
 * the generic FSIOC_* selectors handled in the kernel, and forwards
 * anything else to the filesystem via VNOP_IOCTL.  On success, IOC_OUT
 * data is copied back out to 'udata'.
 *
 * *arg_vp may be set to NULL on return (see handle_sync_volume) to
 * tell the caller that the vnode's iocount has already been dropped.
 */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* fsctl is not for device nodes. */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	/* Map legacy selectors (direction bits stripped) to full commands. */
	cmd = fsctl_bogus_command_compat(cmd);

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/* Small arguments live in the stack buffer; larger ones on the heap. */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	if (cmd & IOC_IN) {
		if (size) {
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree_data(memp, size);
				}
				return error;
			}
		} else {
			/* Zero-size IOC_IN: the "argument" is the raw udata value. */
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(s: data, n: size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	case FSIOC_SYNC_VOLUME:
		/* May drop the vnode and NULL out *arg_vp. */
		error = handle_sync_volume(vp, arg_vp, data, ctx);
		break;

	case FSIOC_ROUTEFS_SETROUTEID:
#if ROUTEFS
		error = handle_routes(udata);
#endif
		break;

	case FSIOC_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t num_entries;
		uint32_t max_width;

		/* Entitled tasks only. */
		if (!IOTaskHasEntitlement(task: vfs_context_task(ctx),
		    SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
			error = EPERM;
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct.  otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		if (is64bit) {
			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
			}
			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(data: ext_strings, nentries: num_entries, maxwidth: max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		mount_t mp;

		/* Superuser only. */
		if ((error = suser(cred: kauth_cred_get(), acflag: &(current_proc()->p_acflag)))) {
			break;
		}
		if ((mp = vp->v_mount) != NULL) {
			mount_lock(mp);
			if (data[0] != 0) {
				/* Non-empty name: install (or replace) the override. */
				for (int i = 0; i < MFSTYPENAMELEN; i++) {
					if (!data[i]) {
						goto continue_copy;
					}
				}
				/*
				 * Getting here means we have a user data
				 * string which has no NULL termination in
				 * its first MFSTYPENAMELEN bytes.  This is
				 * bogus, let's avoid strlcpy-ing the read
				 * data and return an error.
				 */
				error = EINVAL;
				goto unlock;
continue_copy:
				vfs_setfstypename_locked(mp, name: data);
				if (vfs_isrdonly(mp) &&
				    strcmp(s1: data, s2: "mtmfs") == 0) {
					mp->mnt_kern_flag |=
					    MNTK_EXTENDED_SECURITY;
					mp->mnt_kern_flag &=
					    ~MNTK_AUTH_OPAQUE;
				}
			} else if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
				/* Empty name: clear any existing override. */
				const char *name =
				    vfs_getfstypenameref_locked(mp, NULL);
				if (strcmp(s1: name, s2: "mtmfs") == 0) {
					mp->mnt_kern_flag &=
					    ~MNTK_EXTENDED_SECURITY;
				}
				vfs_setfstypename_locked(mp, NULL);
			}
unlock:
			mount_unlock(mp);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS:
		error = handle_flags(vp, data, ctx);
		break;

	case FSIOC_FD_ONLY_OPEN_ONCE: {
		/*
		 * Succeeds (error == 0) only when this appears to be the sole
		 * use of the vnode; named streams get a closer look via
		 * vnode_isinuse_locked().
		 */
		error = 0;
		if (vnode_usecount(vp) > 1) {
			vnode_lock_spin(vp);
			if (vp->v_lflag & VL_HASSTREAMS) {
				if (vnode_isinuse_locked(vp, 1, 1)) {
					error = EBUSY;
				}
			} else if (vnode_usecount(vp) > 1) {
				error = EBUSY;
			}
			vnode_unlock(vp);
		}
	}
	break;

	case FSIOC_EVAL_ROOTAUTH:
		error = handle_auth(vp, cmd, data, options, ctx);
		break;

	case FSIOC_TEST_FSE_ACCESS_GRANTED:
		error = test_fse_access_granted(vp, type: (unsigned long)udata, ctx);
		break;

#if CONFIG_EXCLAVES
	case FSIOC_EXCLAVE_FS_REGISTER:
		if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = vfs_exclave_fs_register(((fsioc_exclave_fs_register_t *)data)->fs_tag, vp);
		} else {
			error = EPERM;
		}
		break;

	case FSIOC_EXCLAVE_FS_UNREGISTER:
		if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = vfs_exclave_fs_unregister(vp);
		} else {
			error = EPERM;
		}
		break;

	case FSIOC_EXCLAVE_FS_GET_BASE_DIRS: {
		exclave_fs_get_base_dirs_t *get_base_dirs = ((exclave_fs_get_base_dirs_t *)data);
		exclave_fs_base_dir_t *dirs = NULL;
		if (!IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = EPERM;
			break;
		}
		/* base_dirs == 0 means the caller only wants the count. */
		if (get_base_dirs->base_dirs) {
			if ((get_base_dirs->count == 0) || (get_base_dirs->count > EXCLAVE_FS_GET_BASE_DIRS_MAX_COUNT)) {
				error = EINVAL;
				break;
			}
			dirs = kalloc_type(exclave_fs_base_dir_t, get_base_dirs->count, Z_WAITOK | Z_ZERO);
			if (!dirs) {
				error = ENOSPC;
				break;
			}
		}
		error = vfs_exclave_fs_get_base_dirs(dirs, &get_base_dirs->count);
		if (!error && dirs) {
			error = copyout(dirs, (user_addr_t)get_base_dirs->base_dirs,
			    get_base_dirs->count * sizeof(exclave_fs_base_dir_t));
		}
		if (dirs) {
			kfree_type(exclave_fs_base_dir_t, get_base_dirs->count, dirs);
		}
	}
	break;
#endif

	default: {
		/*
		 * Other, known commands shouldn't be passed down here.
		 * (When adding a selector to this list, it may be prudent
		 * to consider adding it to the list in sys_fcntl_nocancel() as well.)
		 */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
		case FSIOC_KERNEL_ROOTAUTH:
		case FSIOC_GRAFT_FS:
		case FSIOC_UNGRAFT_FS:
		case FSIOC_AUTH_FS:
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, command: cmd, data, fflag: (int)options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user.  Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kfree_data(memp, size);
	}

	return error;
}
12907
12908/* ARGSUSED */
12909int
12910fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
12911{
12912 int error;
12913 struct nameidata nd;
12914 uint32_t nameiflags;
12915 vnode_t vp = NULL;
12916 vfs_context_t ctx = vfs_context_current();
12917
12918 AUDIT_ARG(cmd, (int)uap->cmd);
12919 AUDIT_ARG(value32, uap->options);
12920 /* Get the vnode for the file we are getting info on: */
12921 nameiflags = 0;
12922 //
12923 // if we come through fsctl() then the file is by definition not open.
12924 // therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
12925 // lest the caller mistakenly thinks the only open is their own (but in
12926 // reality it's someone elses).
12927 //
12928 if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
12929 return EINVAL;
12930 }
12931 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
12932 nameiflags |= FOLLOW;
12933 }
12934 if (uap->cmd == FSIOC_FIRMLINK_CTL) {
12935 nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
12936 }
12937 NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
12938 UIO_USERSPACE, uap->path, ctx);
12939 if ((error = namei(ndp: &nd))) {
12940 goto done;
12941 }
12942 vp = nd.ni_vp;
12943 nameidone(&nd);
12944
12945#if CONFIG_MACF
12946 error = mac_mount_check_fsctl(ctx, mp: vnode_mount(vp), cmd: uap->cmd);
12947 if (error) {
12948 goto done;
12949 }
12950#endif
12951
12952 error = fsctl_internal(p, arg_vp: &vp, cmd: uap->cmd, udata: (user_addr_t)uap->data, options: uap->options, ctx);
12953
12954done:
12955 if (vp) {
12956 vnode_put(vp);
12957 }
12958 return error;
12959}
12960/* ARGSUSED */
12961int
12962ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
12963{
12964 int error;
12965 vnode_t vp = NULL;
12966 vfs_context_t ctx = vfs_context_current();
12967 int fd = -1;
12968
12969 AUDIT_ARG(fd, uap->fd);
12970 AUDIT_ARG(cmd, (int)uap->cmd);
12971 AUDIT_ARG(value32, uap->options);
12972
12973 /* Get the vnode for the file we are getting info on: */
12974 if ((error = file_vnode(uap->fd, &vp))) {
12975 return error;
12976 }
12977 fd = uap->fd;
12978 if ((error = vnode_getwithref(vp))) {
12979 file_drop(fd);
12980 return error;
12981 }
12982
12983#if CONFIG_MACF
12984 if ((error = mac_mount_check_fsctl(ctx, mp: vnode_mount(vp), cmd: uap->cmd))) {
12985 file_drop(fd);
12986 vnode_put(vp);
12987 return error;
12988 }
12989#endif
12990
12991 error = fsctl_internal(p, arg_vp: &vp, cmd: uap->cmd, udata: (user_addr_t)uap->data, options: uap->options, ctx);
12992
12993 file_drop(fd);
12994
12995 /*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
12996 if (vp) {
12997 vnode_put(vp);
12998 }
12999
13000 return error;
13001}
13002/* end of fsctl system call */
13003
13004#define FILESEC_ACCESS_ENTITLEMENT \
13005 "com.apple.private.vfs.filesec-access"
13006
13007static int
13008xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
13009{
13010 if (strcmp(s1: attrname, KAUTH_FILESEC_XATTR) == 0) {
13011 /*
13012 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
13013 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
13014 */
13015 if ((!setting && vfs_context_issuser(ctx)) ||
13016 IOTaskHasEntitlement(task: vfs_context_task(ctx),
13017 FILESEC_ACCESS_ENTITLEMENT)) {
13018 return 0;
13019 }
13020 }
13021
13022 return EPERM;
13023}
13024
13025/*
13026 * Retrieve the data of an extended attribute.
13027 */
/*
 * getxattr(2): retrieve the data of an extended attribute by path.
 *
 * With a buffer (uap->value) the attribute data is copied out and
 * *retval is the byte count transferred; without one, *retval is the
 * attribute's total size.  XATTR_NOSECURITY / XATTR_NODEFAULT are
 * kernel-internal options and are rejected from user space.
 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(ndp: &nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uaddr: uap->attrname, kaddr: attrname, len: sizeof(attrname), done: &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected attributes require root or the filesec entitlement. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binaray compatibilty in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implemtation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned... in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app.  we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying fileystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	/*
	 * NOTE(review): unlike fgetxattr(), this builds a uio even when
	 * uap->size == 0 (provided uap->value is non-NULL) — presumably
	 * harmless since a zero-length iovec transfers nothing; confirm
	 * before "fixing" for consistency.
	 */
	if (uap->value) {
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(a_iovcount: 1, a_offset: uap->position, a_spacetype: spacetype, a_iodirection: UIO_READ,
		    a_buf_p: &uio_buf[0], a_buffer_size: sizeof(uio_buf));
		uio_addiov(a_uio: auio, a_baseaddr: uap->value, a_length: uap->size);
	}
no_uio:
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	/* With a uio: bytes copied out; without: the attribute's full size. */
	if (auio) {
		*retval = uap->size - uio_resid(a_uio: auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
13110
13111/*
13112 * Retrieve the data of an extended attribute.
13113 */
13114int
13115fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
13116{
13117 vnode_t vp;
13118 char attrname[XATTR_MAXNAMELEN + 1];
13119 vfs_context_t ctx = vfs_context_current();
13120 uio_t auio = NULL;
13121 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13122 size_t attrsize = 0;
13123 size_t namelen;
13124 int error;
13125 UIO_STACKBUF(uio_buf, 1);
13126
13127 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13128 return EINVAL;
13129 }
13130
13131 if ((error = file_vnode(uap->fd, &vp))) {
13132 return error;
13133 }
13134 if ((error = vnode_getwithref(vp))) {
13135 file_drop(uap->fd);
13136 return error;
13137 }
13138 error = copyinstr(uaddr: uap->attrname, kaddr: attrname, len: sizeof(attrname), done: &namelen);
13139 if (error != 0) {
13140 goto out;
13141 }
13142 if (xattr_protected(attrname) &&
13143 (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
13144 goto out;
13145 }
13146 if (uap->value && uap->size > 0) {
13147 if (uap->size > (size_t)XATTR_MAXSIZE) {
13148 uap->size = XATTR_MAXSIZE;
13149 }
13150
13151 auio = uio_createwithbuffer(a_iovcount: 1, a_offset: uap->position, a_spacetype: spacetype, a_iodirection: UIO_READ,
13152 a_buf_p: &uio_buf[0], a_buffer_size: sizeof(uio_buf));
13153 uio_addiov(a_uio: auio, a_baseaddr: uap->value, a_length: uap->size);
13154 }
13155
13156 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
13157out:
13158 (void)vnode_put(vp);
13159 file_drop(uap->fd);
13160
13161 if (auio) {
13162 *retval = uap->size - uio_resid(a_uio: auio);
13163 } else {
13164 *retval = (user_ssize_t)attrsize;
13165 }
13166 return error;
13167}
13168
/*
 * Heap-allocated working state for setxattr(): the nameidata, the
 * attribute-name buffer, and the uio backing store are large, so they
 * are kept off the kernel stack (see the kalloc_type in setxattr()).
 */
struct setxattr_ctx {
	struct nameidata nd;                    /* pathname lookup state */
	char attrname[XATTR_MAXNAMELEN + 1];    /* NUL-terminated attribute name */
	UIO_STACKBUF(uio_buf, 1);               /* backing store for the value uio */
};
13175
13176/*
13177 * Set the data of an extended attribute.
13178 */
/*
 * setxattr(2): set the data of an extended attribute by path.
 *
 * Working state lives in a heap-allocated struct setxattr_ctx to keep
 * the kernel stack small.  XATTR_NOSECURITY / XATTR_NODEFAULT are
 * kernel-internal options and are rejected from user space.
 */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	struct setxattr_ctx *sactx;

	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
	if (sactx == NULL) {
		return ENOMEM;
	}

	error = copyinstr(uaddr: uap->attrname, kaddr: sactx->attrname, len: sizeof(sactx->attrname), done: &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			error = ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		goto out;
	}
	/* Protected attributes may only be set by entitled tasks. */
	if (xattr_protected(sactx->attrname) &&
	    (error = xattr_entitlement_check(attrname: sactx->attrname, ctx, true)) != 0) {
		goto out;
	}
	/* A non-zero size with no value buffer makes no sense. */
	if (uap->size != 0 && uap->value == 0) {
		error = EINVAL;
		goto out;
	}
	if (uap->size > INT_MAX) {
		error = E2BIG;
		goto out;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Need the parent to break any directory lease before writing. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(ndp: &sactx->nd))) {
		goto out;
	}
	vp = sactx->nd.ni_vp;
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(vp: sactx->nd.ni_dvp, false, O_WRONLY);
	vnode_put(vp: sactx->nd.ni_dvp);
#endif
	nameidone(&sactx->nd);

	auio = uio_createwithbuffer(a_iovcount: 1, a_offset: uap->position, a_spacetype: spacetype, a_iodirection: UIO_WRITE,
	    a_buf_p: &sactx->uio_buf[0], a_buffer_size: sizeof(sactx->uio_buf));
	uio_addiov(a_uio: auio, a_baseaddr: uap->value, a_length: uap->size);

	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
out:
	kfree_type(struct setxattr_ctx, sactx);
	*retval = 0;
	return error;
}
13255
13256/*
13257 * Set the data of an extended attribute.
13258 */
/*
 * fsetxattr(2): set the data of an extended attribute on an open file
 * descriptor.  Mirrors setxattr() but resolves the vnode from uap->fd;
 * XATTR_NOFOLLOW is meaningless here and is rejected along with the
 * kernel-internal options.
 */
int
fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uaddr: uap->attrname, kaddr: attrname, len: sizeof(attrname), done: &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			return ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		return error;
	}
	/* Protected attributes may only be set by entitled tasks. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
		return error;
	}
	/* A non-zero size with no value buffer makes no sense. */
	if (uap->size != 0 && uap->value == 0) {
		return EINVAL;
	}
	if (uap->size > INT_MAX) {
		return E2BIG;
	}
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease on the containing directory before writing. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	auio = uio_createwithbuffer(a_iovcount: 1, a_offset: uap->position, a_spacetype: spacetype, a_iodirection: UIO_WRITE,
	    a_buf_p: &uio_buf[0], a_buffer_size: sizeof(uio_buf));
	uio_addiov(a_uio: auio, a_baseaddr: uap->value, a_length: uap->size);

	error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	file_drop(uap->fd);
	*retval = 0;
	return error;
}
13323
/*
 * Remove an extended attribute.
 * XXX Code duplication here.
 *
 * Path-based flavor of removexattr(2): looks up uap->path (following the
 * final symlink unless XATTR_NOFOLLOW is set) and deletes the extended
 * attribute named by uap->attrname. Returns 0 or an errno value.
 */
int
removexattr(proc_t p, struct removexattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	vfs_context_t ctx = vfs_context_current();
	size_t namelen;
	u_int32_t nameiflags;
	int error;

	/* XATTR_NOSECURITY / XATTR_NODEFAULT are kernel-internal options. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uaddr: uap->attrname, kaddr: attrname, len: sizeof(attrname), done: &namelen);
	if (error != 0) {
		return error;
	}
	/* Protected (system-owned) attributes may never be removed this way. */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Also obtain the parent so any directory lease on it can be broken. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(ndp: &nd))) {
		return error;
	}
	vp = nd.ni_vp;
#if CONFIG_FILE_LEASES
	/* Break the parent's lease, then drop the parent iocount from namei. */
	vnode_breakdirlease(vp: nd.ni_dvp, false, O_WRONLY);
	vnode_put(vp: nd.ni_dvp);
#endif
	nameidone(&nd);

	error = vn_removexattr(vp, attrname, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		/* Notify fseventsd listeners of the attribute removal. */
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
13378
13379/*
13380 * Remove an extended attribute.
13381 * XXX Code duplication here.
13382 */
13383int
13384fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
13385{
13386 vnode_t vp;
13387 char attrname[XATTR_MAXNAMELEN + 1];
13388 size_t namelen;
13389 int error;
13390#if CONFIG_FSE
13391 vfs_context_t ctx = vfs_context_current();
13392#endif
13393
13394 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13395 return EINVAL;
13396 }
13397
13398 error = copyinstr(uaddr: uap->attrname, kaddr: attrname, len: sizeof(attrname), done: &namelen);
13399 if (error != 0) {
13400 return error;
13401 }
13402 if (xattr_protected(attrname)) {
13403 return EPERM;
13404 }
13405 if ((error = file_vnode(uap->fd, &vp))) {
13406 return error;
13407 }
13408 if ((error = vnode_getwithref(vp))) {
13409 file_drop(uap->fd);
13410 return error;
13411 }
13412
13413#if CONFIG_FILE_LEASES
13414 vnode_breakdirlease(vp, true, O_WRONLY);
13415#endif
13416
13417 error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
13418#if CONFIG_FSE
13419 if (error == 0) {
13420 add_fsevent(FSE_XATTR_REMOVED, ctx,
13421 FSE_ARG_VNODE, vp,
13422 FSE_ARG_DONE);
13423 }
13424#endif
13425 vnode_put(vp);
13426 file_drop(uap->fd);
13427 *retval = 0;
13428 return error;
13429}
13430
/*
 * Retrieve the list of extended attribute names.
 * XXX Code duplication here.
 *
 * Path-based flavor of listxattr(2). If the caller supplies a buffer, the
 * NUL-separated attribute names are copied out and *retval is the number of
 * bytes written; with no buffer, *retval is the size needed for the full
 * list. Returns 0 or an errno value.
 */
int
listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	u_int32_t nameiflags;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* XATTR_NOSECURITY / XATTR_NODEFAULT are kernel-internal options. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(ndp: &nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);
	/* A buffer is optional; with none we only report the required size. */
	if (uap->namebuf != 0 && uap->bufsize > 0) {
		auio = uio_createwithbuffer(a_iovcount: 1, a_offset: 0, a_spacetype: spacetype, a_iodirection: UIO_READ,
		    a_buf_p: &uio_buf[0], a_buffer_size: sizeof(uio_buf));
		uio_addiov(a_uio: auio, a_baseaddr: uap->namebuf, a_length: uap->bufsize);
	}

	error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);

	vnode_put(vp);
	if (auio) {
		/* Bytes actually copied = requested size minus what's left. */
		*retval = (user_ssize_t)uap->bufsize - uio_resid(a_uio: auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
13475
13476/*
13477 * Retrieve the list of extended attribute names.
13478 * XXX Code duplication here.
13479 */
13480int
13481flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
13482{
13483 vnode_t vp;
13484 uio_t auio = NULL;
13485 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13486 size_t attrsize = 0;
13487 int error;
13488 UIO_STACKBUF(uio_buf, 1);
13489
13490 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13491 return EINVAL;
13492 }
13493
13494 if ((error = file_vnode(uap->fd, &vp))) {
13495 return error;
13496 }
13497 if ((error = vnode_getwithref(vp))) {
13498 file_drop(uap->fd);
13499 return error;
13500 }
13501 if (uap->namebuf != 0 && uap->bufsize > 0) {
13502 auio = uio_createwithbuffer(a_iovcount: 1, a_offset: 0, a_spacetype: spacetype,
13503 a_iodirection: UIO_READ, a_buf_p: &uio_buf[0], a_buffer_size: sizeof(uio_buf));
13504 uio_addiov(a_uio: auio, a_baseaddr: uap->namebuf, a_length: uap->bufsize);
13505 }
13506
13507 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
13508
13509 vnode_put(vp);
13510 file_drop(uap->fd);
13511 if (auio) {
13512 *retval = (user_ssize_t)uap->bufsize - uio_resid(a_uio: auio);
13513 } else {
13514 *retval = (user_ssize_t)attrsize;
13515 }
13516 return error;
13517}
13518
13519int
13520fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
13521 vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
13522{
13523 int error;
13524 struct mount *mp = NULL;
13525 vnode_t vp;
13526 int length;
13527 int bpflags;
13528 /* maximum number of times to retry build_path */
13529 unsigned int retries = 0x10;
13530
13531 if (bufsize > FSGETPATH_MAXBUFLEN) {
13532 return EINVAL;
13533 }
13534
13535 if (buf == NULL) {
13536 return ENOMEM;
13537 }
13538
13539retry:
13540 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
13541 error = ENOTSUP; /* unexpected failure */
13542 return ENOTSUP;
13543 }
13544
13545#if CONFIG_UNION_MOUNTS
13546unionget:
13547#endif /* CONFIG_UNION_MOUNTS */
13548 if (objid == 2) {
13549 struct vfs_attr vfsattr;
13550 int use_vfs_root = TRUE;
13551
13552 VFSATTR_INIT(&vfsattr);
13553 VFSATTR_WANTED(&vfsattr, f_capabilities);
13554 if (!(options & FSOPT_ISREALFSID) &&
13555 vfs_getattr(mp, vfa: &vfsattr, ctx: vfs_context_kernel()) == 0 &&
13556 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
13557 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
13558 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
13559 use_vfs_root = FALSE;
13560 }
13561 }
13562
13563 if (use_vfs_root) {
13564 error = VFS_ROOT(mp, &vp, ctx);
13565 } else {
13566 error = VFS_VGET(mp, objid, &vp, ctx);
13567 }
13568 } else {
13569 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
13570 }
13571
13572#if CONFIG_UNION_MOUNTS
13573 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
13574 /*
13575 * If the fileid isn't found and we're in a union
13576 * mount volume, then see if the fileid is in the
13577 * mounted-on volume.
13578 */
13579 struct mount *tmp = mp;
13580 mp = vnode_mount(vp: tmp->mnt_vnodecovered);
13581 vfs_unbusy(mp: tmp);
13582 if (vfs_busy(mp, LK_NOWAIT) == 0) {
13583 goto unionget;
13584 }
13585 } else {
13586 vfs_unbusy(mp);
13587 }
13588#else
13589 vfs_unbusy(mp);
13590#endif /* CONFIG_UNION_MOUNTS */
13591
13592 if (error) {
13593 return error;
13594 }
13595
13596#if CONFIG_MACF
13597 error = mac_vnode_check_fsgetpath(ctx, vp);
13598 if (error) {
13599 vnode_put(vp);
13600 return error;
13601 }
13602#endif
13603
13604 /* Obtain the absolute path to this vnode. */
13605 bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
13606 if (options & FSOPT_NOFIRMLINKPATH) {
13607 bpflags |= BUILDPATH_NO_FIRMLINK;
13608 }
13609 bpflags |= BUILDPATH_CHECK_MOVED;
13610 error = build_path(first_vp: vp, buff: buf, buflen: (int)bufsize, outlen: &length, flags: bpflags, ctx);
13611 vnode_put(vp);
13612
13613 if (error) {
13614 /* there was a race building the path, try a few more times */
13615 if (error == EAGAIN) {
13616 --retries;
13617 if (retries > 0) {
13618 goto retry;
13619 }
13620
13621 error = ENOENT;
13622 }
13623 goto out;
13624 }
13625
13626 AUDIT_ARG(text, buf);
13627
13628 if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
13629 unsigned long path_words[NUMPARMS];
13630 size_t path_len = sizeof(path_words);
13631
13632 if ((size_t)length < path_len) {
13633 memcpy(dst: (char *)path_words, src: buf, n: length);
13634 memset(s: (char *)path_words + length, c: 0, n: path_len - length);
13635
13636 path_len = length;
13637 } else {
13638 memcpy(dst: (char *)path_words, src: buf + (length - path_len), n: path_len);
13639 }
13640
13641 kdebug_vfs_lookup(path_words, path_len: (int)path_len, vnp: vp,
13642 KDBG_VFS_LOOKUP_FLAG_LOOKUP);
13643 }
13644
13645 *pathlen = length; /* may be superseded by error */
13646
13647out:
13648 return error;
13649}
13650
13651/*
13652 * Obtain the full pathname of a file system object by id.
13653 */
13654static int
13655fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
13656 uint32_t options, user_ssize_t *retval)
13657{
13658 vfs_context_t ctx = vfs_context_current();
13659 fsid_t fsid;
13660 char *realpath;
13661 int length;
13662 int error;
13663
13664 if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
13665 return EINVAL;
13666 }
13667
13668 if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
13669 return error;
13670 }
13671 AUDIT_ARG(value32, fsid.val[0]);
13672 AUDIT_ARG(value64, objid);
13673 /* Restrict output buffer size for now. */
13674
13675 if (bufsize > FSGETPATH_MAXBUFLEN || bufsize <= 0) {
13676 return EINVAL;
13677 }
13678 realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
13679 if (realpath == NULL) {
13680 return ENOMEM;
13681 }
13682
13683 error = fsgetpath_internal(ctx, volfs_id: fsid.val[0], objid, bufsize, buf: realpath,
13684 options, pathlen: &length);
13685
13686 if (error) {
13687 goto out;
13688 }
13689
13690 error = copyout((caddr_t)realpath, buf, length);
13691
13692 *retval = (user_ssize_t)length; /* may be superseded by error */
13693out:
13694 kfree_data(realpath, bufsize);
13695 return error;
13696}
13697
/* fsgetpath(2): thin wrapper over fsgetpath_extended() with no options. */
int
fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(buf: uap->buf, bufsize: uap->bufsize, user_fsid: uap->fsid, objid: uap->objid,
	           options: 0, retval);
}
13704
/* fsgetpath_ext(2): wrapper passing the caller's options through. */
int
fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(buf: uap->buf, bufsize: uap->bufsize, user_fsid: uap->fsid, objid: uap->objid,
	           options: uap->options, retval);
}
13711
/*
 * Common routine to handle various flavors of statfs data heading out
 * to user space.
 *
 * Fills a user64_statfs or user32_statfs from the in-kernel vfsstatfs and
 * copies it to 'bufp'. When 'partial_copy' is set the trailing reserved
 * fields are omitted from the copyout. If 'sizep' is non-NULL it receives
 * the full (untruncated) structure size.
 *
 * Returns: 0 Success
 * EFAULT
 */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int error;
	int my_size, copy_size;

	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		bzero(s: &sfs, n: my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		vfs_getfstypename(mp, buf: sfs.f_fstypename, MFSNAMELEN);
		strlcpy(dst: &sfs.f_mntonname[0], src: &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(dst: &sfs.f_mntfromname[0], src: &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* Drop the trailing reserved fields from the copyout. */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(s: &sfs, n: my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case. We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not be setting these to -1..
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large. At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
#define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = (int)lmax(a: sfsp->f_iosize, b: sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		vfs_getfstypename(mp, buf: sfs.f_fstypename, MFSNAMELEN);
		strlcpy(dst: &sfs.f_mntonname[0], src: &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(dst: &sfs.f_mntfromname[0], src: &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* Drop the trailing reserved fields from the copyout. */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	if (sizep != NULL) {
		*sizep = my_size;
	}
	return error;
}
13832
/*
 * copy stat structure into user_stat structure.
 *
 * Field-for-field copy of a kernel struct stat into the 64-bit user ABI
 * layout; the destination is zeroed first so padding never leaks kernel
 * memory to user space.
 */
void
munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
{
	bzero(s: usbp, n: sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13872
/*
 * Copy a kernel struct stat into the 32-bit user ABI layout. Timestamp
 * seconds/nanoseconds are explicitly narrowed to the 32-bit user types;
 * the destination is zeroed first so padding never leaks kernel memory.
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	bzero(s: usbp, n: sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13909
/*
 * copy stat64 structure into user_stat64 structure.
 *
 * Like munge_user64_stat() but for the stat64 ABI, which additionally
 * carries the file's birth (creation) time.
 */
void
munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
	bzero(s: usbp, n: sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13953
/*
 * Copy a kernel struct stat64 into the 32-bit user ABI layout, including
 * the birth time. Timestamp fields are explicitly narrowed to the 32-bit
 * user types; the destination is zeroed first so padding never leaks.
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	bzero(s: usbp, n: sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13994
13995/*
13996 * Purge buffer cache for simulating cold starts
13997 */
13998static int
13999vnode_purge_callback(struct vnode *vp, __unused void *cargs)
14000{
14001 ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
14002
14003 return VNODE_RETURNED;
14004}
14005
14006static int
14007vfs_purge_callback(mount_t mp, __unused void * arg)
14008{
14009 vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, callout: vnode_purge_callback, NULL);
14010
14011 return VFS_RETURNED;
14012}
14013
/* Boot-arg tunable + vfs.purge_vm_pagers sysctl: controls whether
 * vfs_purge() also purges file-backed VM pagers (defaults to on). */
static TUNABLE_WRITEABLE(boolean_t, vfs_purge_vm_pagers, "vfs_purge_vm_pagers", TRUE);
SYSCTL_INT(_vfs, OID_AUTO, purge_vm_pagers, CTLFLAG_RW | CTLFLAG_LOCKED, &vfs_purge_vm_pagers, 0, "VFS purge also purges file-backed VM pagers");
14016
/*
 * vfs_purge(2): drop cached file data on every mounted filesystem to
 * simulate a cold start. Superuser only.
 */
int
vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
{
	/* Purging every mount's cache is disruptive; require superuser. */
	if (!kauth_cred_issuser(cred: kauth_cred_get())) {
		return EPERM;
	}

	vfs_iterate(flags: 0 /* flags */, callout: vfs_purge_callback, NULL);

	/* also flush any VM pagers backed by files */
	if (vfs_purge_vm_pagers) {
		vm_purge_filebacked_pagers();
	}

	return 0;
}
14033
/*
 * gets the vnode associated with the (unnamed) snapshot directory
 * for a Filesystem. The snapshot directory vnode is returned with
 * an iocount on it.
 *
 * rvp must be the filesystem's root vnode; the caller owns the returned
 * iocount on *sdvpp and must vnode_put() it.
 */
int
vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
{
	return VFS_VGET_SNAPDIR(vnode_mount(vp: rvp), sdvpp, ctx);
}
14044
/*
 * Get the snapshot vnode.
 *
 * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
 * needs nameidone() on ndp.
 *
 * If the snapshot vnode exists it is returned in ndp->ni_vp.
 *
 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
 * not needed.
 */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	/* dirfd must reference the filesystem's root directory. */
	error = vnode_getfromfd(ctx, fd: dirfd, vpp: rvpp);
	if (error) {
		return error;
	}

	if (!vnode_isvroot(vp: *rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp: vnode_mount(vp: *rvpp), vfa: &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	error = vnode_get_snapdir(rvp: *rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(uaddr: name, kaddr: name_buf, MAXPATHLEN, done: &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; any embedded slash makes the name invalid. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, mp: vnode_mount(vp: *rvpp),
		    name: name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, mp: vnode_mount(vp: *rvpp),
		    name: name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	error = namei(ndp);
out1:
	zfree(ZV_NAMEI, name_buf);
out:
	/* On any error, release both iocounts so callers need no cleanup. */
	if (error) {
		if (*sdvpp) {
			vnode_put(vp: *sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(vp: *rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
14158
/*
 * create a filesystem snapshot (for supporting filesystems)
 *
 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
 * We get to the (unnamed) snapshot directory vnode and create the vnode
 * for the snapshot in it.
 *
 * Restrictions:
 *
 * a) Passed in name for snapshot cannot have slashes.
 * b) name can't be "." or ".."
 *
 * Since this requires superuser privileges, vnode_authorize calls are not
 * made.
 */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	/* nameidata is too large for the kernel stack; heap-allocate it. */
	struct nameidata *ndp;

	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	error = vnode_get_snapshot(dirfd, rvpp: &rvp, sdvpp: &snapdvp, name, ndp, CREATE,
	    pathop: OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* A snapshot by that name already exists. */
		vnode_put(vp: ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kalloc_type(struct vnode_attr, Z_WAITOK);

		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		/* Superuser-only path: skip authorization and inheritance. */
		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}

		kfree_type(struct vnode_attr, vap);
	}

	nameidone(ndp);
	vnode_put(vp: snapdvp);
	vnode_put(vp: rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14220
/*
 * Delete a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * delete the snapshot.
 */
static int __attribute__((noinline))
snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	/* nameidata is too large for the kernel stack; heap-allocate it. */
	struct nameidata *ndp;

	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	error = vnode_get_snapshot(dirfd, rvpp: &rvp, sdvpp: &snapdvp, name, ndp, DELETE,
	    pathop: OP_UNLINK, ctx);
	if (error) {
		goto out;
	}

	/* Snapshot deletions don't generate namespace events. */
	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);

	vnode_put(vp: ndp->ni_vp);
	nameidone(ndp);
	vnode_put(vp: snapdvp);
	vnode_put(vp: rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14255
/*
 * Revert a filesystem to a snapshot
 *
 * Marks the filesystem to revert to the given snapshot on next mount.
 */
static int __attribute__((noinline))
snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_revert_args revert_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	error = vnode_getfromfd(ctx, fd: dirfd, vpp: &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(vp: rvp);

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(uaddr: name, kaddr: name_buf, MAXPATHLEN, done: &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(vp: rvp);
		return error;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_revert(ctx, mp, name: name_buf);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(vp: rvp);
		return error;
	}
#endif

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(vp: rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Hand the snapshot name to the FS as a pre-built componentname. */
	memset(s: &cnp, c: 0, n: sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	revert_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, data: (caddr_t)&revert_data, flags: 0, context: ctx);
	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	if (error) {
		/* If there was any error, try again using VNOP_IOCTL */

		vnode_t snapdvp;
		struct nameidata namend;

		error = vnode_get_snapshot(dirfd, rvpp: &rvp, sdvpp: &snapdvp, name, ndp: &namend, LOOKUP,
		    pathop: OP_LOOKUP, ctx);
		if (error) {
			return error;
		}


		/* Fallback: ask the snapshot vnode itself to revert (APFS). */
		error = VNOP_IOCTL(vp: namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, data: (caddr_t) NULL,
		    fflag: 0, ctx);

		vnode_put(vp: namend.ni_vp);
		nameidone(&namend);
		vnode_put(vp: snapdvp);
		vnode_put(vp: rvp);
	}

	return error;
}
14344
14345/*
14346 * rename a Filesystem snapshot
14347 *
14348 * get the vnode for the unnamed snapshot directory and the snapshot and
14349 * rename the snapshot. This is a very specialised (and simple) case of
14350 * rename(2) (which has to deal with a lot more complications). It differs
14351 * slightly from rename(2) in that EEXIST is returned if the new name exists.
14352 */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/*
	 * Look up the source snapshot with DELETE intent so fromnd's
	 * componentname can be handed straight to VNOP_RENAME below.
	 * On success we hold iocounts on rvp, snapdvp and fromnd->ni_vp.
	 */
	error = vnode_get_snapshot(dirfd, rvpp: &rvp, sdvpp: &snapdvp, name: old, ndp: fromnd, DELETE,
	    pathop: OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp = fromnd->ni_vp;

	/* Copy the destination name in from user space. */
	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(uaddr: new, kaddr: newname_buf, MAXPATHLEN, done: &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is suppossed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for '/'; the loop stops early (i < name_len) iff one is found. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* Renaming to a new name is effectively creating that snapshot name. */
	error = mac_mount_check_snapshot_create(ctx, mp: vnode_mount(vp: rvp),
	    name: newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/*
	 * Look up the destination relative to the snapshot directory
	 * (USEDVP with ni_dvp = snapdvp); NOCACHE because the target is
	 * expected not to exist.
	 */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(ndp: tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(vp: tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	/* Source and destination both live in the snapshot directory. */
	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(vp: fvp);
	vnode_put(vp: snapdvp);
	vnode_put(vp: rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
14447
14448/*
14449 * Mount a Filesystem snapshot
14450 *
14451 * get the vnode for the unnamed snapshot directory and the snapshot and
14452 * mount the snapshot.
14453 */
static int __attribute__((noinline))
snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
    __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
{
	mount_t mp;
	vnode_t rvp, snapdvp, snapvp, vp, pvp;
	struct fs_snapshot_mount_args smnt_data;
	int error;
	struct nameidata *snapndp, *dirndp;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata snapnd;
		struct nameidata dirnd;
	} * __snapshot_mount_data;

	__snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
	snapndp = &__snapshot_mount_data->snapnd;
	dirndp = &__snapshot_mount_data->dirnd;

	/*
	 * Find the snapshot to mount; on success we hold iocounts on rvp
	 * (the fd's vnode), snapdvp (the snapshot directory) and
	 * snapndp->ni_vp (the snapshot itself).
	 */
	error = vnode_get_snapshot(dirfd, rvpp: &rvp, sdvpp: &snapdvp, name, ndp: snapndp, LOOKUP,
	    pathop: OP_LOOKUP, ctx);
	if (error) {
		goto out;
	}

	snapvp = snapndp->ni_vp;
	/* Bail out if the underlying mount was torn down underneath us. */
	if (!vnode_mount(vp: rvp) || (vnode_mount(vp: rvp) == dead_mountp)) {
		error = EIO;
		goto out1;
	}

	/* Get the vnode to be covered */
	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, directory, ctx);
	error = namei(ndp: dirndp);
	if (error) {
		goto out1;
	}

	vp = dirndp->ni_vp;
	pvp = dirndp->ni_dvp;
	mp = vnode_mount(vp: rvp);

	/* Refuse to cover the root of the root filesystem. */
	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		error = EINVAL;
		goto out2;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, cnp: &dirndp->ni_cnd, name: snapndp->ni_cnd.cn_nameptr,
	    vfc_name: mp->mnt_vfsstat.f_fstypename);
	if (error) {
		goto out2;
	}
#endif

	/*
	 * Perform the mount; only MNT_DONTBROWSE and MNT_IGNORE_OWNERSHIP
	 * are honored from the caller-supplied flags.
	 */
	smnt_data.sm_mp = mp;
	smnt_data.sm_cnp = &snapndp->ni_cnd;
	error = mount_common(fstypename: mp->mnt_vfsstat.f_fstypename, pvp, vp,
	    cnp: &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags: flags & (MNT_DONTBROWSE | MNT_IGNORE_OWNERSHIP),
	    KERNEL_MOUNT_SNAPSHOT, NULL, ctx);

out2:
	vnode_put(vp);
	vnode_put(vp: pvp);
	nameidone(dirndp);
out1:
	vnode_put(vp: snapvp);
	vnode_put(vp: snapdvp);
	vnode_put(vp: rvp);
	nameidone(snapndp);
out:
	kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
	return error;
}
14529
14530/*
14531 * Root from a snapshot of the filesystem
14532 *
14533 * Marks the filesystem to root from the given snapshot on next boot.
14534 */
14535static int __attribute__((noinline))
14536snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
14537 vfs_context_t ctx)
14538{
14539 int error;
14540 vnode_t rvp;
14541 mount_t mp;
14542 struct fs_snapshot_root_args root_data;
14543 struct componentname cnp;
14544 caddr_t name_buf;
14545 size_t name_len;
14546
14547 error = vnode_getfromfd(ctx, fd: dirfd, vpp: &rvp);
14548 if (error) {
14549 return error;
14550 }
14551 mp = vnode_mount(vp: rvp);
14552
14553 name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14554 error = copyinstr(uaddr: name, kaddr: name_buf, MAXPATHLEN, done: &name_len);
14555 if (error) {
14556 zfree(ZV_NAMEI, name_buf);
14557 vnode_put(vp: rvp);
14558 return error;
14559 }
14560
14561 // XXX MAC checks ?
14562
14563 /*
14564 * Grab mount_iterref so that we can release the vnode,
14565 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
14566 */
14567 error = mount_iterref(mp, 0);
14568 vnode_put(vp: rvp);
14569 if (error) {
14570 zfree(ZV_NAMEI, name_buf);
14571 return error;
14572 }
14573
14574 memset(s: &cnp, c: 0, n: sizeof(cnp));
14575 cnp.cn_pnbuf = (char *)name_buf;
14576 cnp.cn_nameiop = LOOKUP;
14577 cnp.cn_flags = ISLASTCN | HASBUF;
14578 cnp.cn_pnlen = MAXPATHLEN;
14579 cnp.cn_nameptr = cnp.cn_pnbuf;
14580 cnp.cn_namelen = (int)name_len;
14581 root_data.sr_cnp = &cnp;
14582
14583 error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, data: (caddr_t)&root_data, flags: 0, context: ctx);
14584
14585 mount_iterdrop(mp);
14586 zfree(ZV_NAMEI, name_buf);
14587
14588 return error;
14589}
14590
14591static boolean_t
14592vfs_context_can_snapshot(vfs_context_t ctx)
14593{
14594 static const char * const snapshot_entitlements[] = {
14595 "com.apple.private.vfs.snapshot",
14596 "com.apple.developer.vfs.snapshot",
14597 "com.apple.private.apfs.arv.limited.snapshot",
14598 };
14599 static const size_t nentitlements =
14600 sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
14601 size_t i;
14602
14603 task_t task = vfs_context_task(ctx);
14604 for (i = 0; i < nentitlements; i++) {
14605 if (IOTaskHasEntitlement(task, entitlement: snapshot_entitlements[i])) {
14606 return TRUE;
14607 }
14608 }
14609 return FALSE;
14610}
14611
14612/*
14613 * FS snapshot operations dispatcher
14614 */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* Every snapshot operation requires one of the snapshot entitlements. */
	if (!vfs_context_can_snapshot(ctx)) {
		return EPERM;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, fd: uap->dirfd, vpp: &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(vp: dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			/*
			 * No device vnode on the mount; fall back to looking
			 * up the f_mntfromname path instead.
			 */
			error = vnode_lookup(path: mp->mnt_vfsstat.f_mntfromname, flags: 0, vpp: &devvp, ctx);
			/* for mounts which arent block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(vp: devvp);
		}

		if (error) {
			vnode_put(vp: dvp);
			return error;
		}

		/*
		 * Permitted if the caller is root, OR can write the backing
		 * device, OR holds the user snapshot entitlement.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(vp: devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOTaskHasEntitlement(task: vfs_context_task(ctx), entitlement: "com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		vnode_put(vp: dvp);
		vnode_put(vp: devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-operation handler. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(dirfd: uap->dirfd, name: uap->name1, flags: uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(dirfd: uap->dirfd, name: uap->name1, flags: uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		error = snapshot_rename(dirfd: uap->dirfd, old: uap->name1, new: uap->name2,
		    flags: uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		error = snapshot_mount(dirfd: uap->dirfd, name: uap->name1, directory: uap->name2,
		    mnt_data: uap->data, flags: uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(dirfd: uap->dirfd, name: uap->name1, flags: uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(dirfd: uap->dirfd, name: uap->name1, flags: uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
14703