1/*
2 * Copyright (c) 1995-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67/*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74#include <sys/param.h>
75#include <sys/systm.h>
76#include <sys/namei.h>
77#include <sys/filedesc.h>
78#include <sys/kernel.h>
79#include <sys/file_internal.h>
80#include <sys/stat.h>
81#include <sys/vnode_internal.h>
82#include <sys/mount_internal.h>
83#include <sys/proc_internal.h>
84#include <sys/kauth.h>
85#include <sys/uio_internal.h>
86#include <sys/malloc.h>
87#include <sys/mman.h>
88#include <sys/dirent.h>
89#include <sys/attr.h>
90#include <sys/sysctl.h>
91#include <sys/ubc.h>
92#include <sys/quota.h>
93#include <sys/kdebug.h>
94#include <sys/fsevents.h>
95#include <sys/imgsrc.h>
96#include <sys/sysproto.h>
97#include <sys/xattr.h>
98#include <sys/fcntl.h>
99#include <sys/fsctl.h>
100#include <sys/ubc_internal.h>
101#include <sys/disk.h>
102#include <sys/content_protection.h>
103#include <sys/clonefile.h>
104#include <sys/snapshot.h>
105#include <sys/priv.h>
106#include <machine/cons.h>
107#include <machine/limits.h>
108#include <miscfs/specfs/specdev.h>
109
110#include <vfs/vfs_disk_conditioner.h>
111
112#include <security/audit/audit.h>
113#include <bsm/audit_kevents.h>
114
115#include <mach/mach_types.h>
116#include <kern/kern_types.h>
117#include <kern/kalloc.h>
118#include <kern/task.h>
119
120#include <vm/vm_pageout.h>
121#include <vm/vm_protos.h>
122
123#include <libkern/OSAtomic.h>
124#include <pexpert/pexpert.h>
125#include <IOKit/IOBSD.h>
126
127#if ROUTEFS
128#include <miscfs/routefs/routefs.h>
129#endif /* ROUTEFS */
130
131#if CONFIG_MACF
132#include <security/mac.h>
133#include <security/mac_framework.h>
134#endif
135
136#if CONFIG_FSE
137#define GET_PATH(x) \
138 (x) = get_pathbuff();
139#define RELEASE_PATH(x) \
140 release_pathbuff(x);
141#else
142#define GET_PATH(x) \
143 MALLOC_ZONE((x), char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
144#define RELEASE_PATH(x) \
145 FREE_ZONE((x), MAXPATHLEN, M_NAMEI);
146#endif /* CONFIG_FSE */
147
148#ifndef HFS_GET_BOOT_INFO
149#define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
150#endif
151
152#ifndef HFS_SET_BOOT_INFO
153#define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
154#endif
155
156#ifndef APFSIOC_REVERT_TO_SNAPSHOT
157#define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
158#endif
159
160extern void disk_conditioner_unmount(mount_t mp);
161
162/* struct for checkdirs iteration */
163struct cdirargs {
164 vnode_t olddp;
165 vnode_t newdp;
166};
167/* callback for checkdirs iteration */
168static int checkdirs_callback(proc_t p, void * arg);
169
170static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
171static int checkdirs(vnode_t olddp, vfs_context_t ctx);
172void enablequotas(struct mount *mp, vfs_context_t ctx);
173static int getfsstat_callback(mount_t mp, void * arg);
174static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
175static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
176static int sync_callback(mount_t, void *);
177static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
178 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
179 boolean_t partial_copy);
180static int statfs64_common(struct mount *mp, struct vfsstatfs *sfsp,
181 user_addr_t bufp);
182static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
183static int mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
184 struct componentname *cnp, user_addr_t fsmountargs,
185 int flags, uint32_t internal_flags, char *labelstr, boolean_t kernelmount,
186 vfs_context_t ctx);
187void vfs_notify_mount(vnode_t pdvp);
188
189int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth);
190
191struct fd_vn_data * fg_vn_data_alloc(void);
192
/*
 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}.
 * Concurrent lookups (or lookups by id) on hard links can cause vn_getpath
 * (which does not re-enter the filesystem, as vn_getpath_fsenter does) to
 * return ENOENT, because the path cannot be produced from the name cache
 * alone.  We have no option but to retry and hope to complete one
 * namei->reverse-path generation without an intervening lookup or
 * lookup-by-id on the hard-linked item.  This is only an issue for MAC
 * hooks that cannot re-enter the filesystem, which currently are the hooks
 * for rename, unlink and rmdir.
 */
203#define MAX_AUTHORIZE_ENOENT_RETRIES 1024
204
205static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg);
206
207static int fsgetpath_internal(vfs_context_t, int, uint64_t, vm_size_t, caddr_t, int *);
208
209#ifdef CONFIG_IMGSRC_ACCESS
210static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
211static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
212static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
213static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
214static void mount_end_update(mount_t mp);
215static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
216#endif /* CONFIG_IMGSRC_ACCESS */
217
218//snapshot functions
219#if CONFIG_MNT_ROOTSNAP
220static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
221#else
222static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
223#endif
224
225int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t);
226
227__private_extern__
228int sync_internal(void);
229
230__private_extern__
231int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
232
233extern lck_grp_t *fd_vn_lck_grp;
234extern lck_grp_attr_t *fd_vn_lck_grp_attr;
235extern lck_attr_t *fd_vn_lck_attr;
236
/*
 * Incremented each time a mount or unmount operation occurs; used to
 * invalidate the cached value of the rootvp in the mount structure
 * utilized by cache_lookup_path.
 */
242uint32_t mount_generation = 0;
243
244/* counts number of mount and unmount operations */
245unsigned int vfs_nummntops=0;
246
247extern const struct fileops vnops;
248#if CONFIG_APPLEDOUBLE
249extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
250#endif /* CONFIG_APPLEDOUBLE */
251
252/*
253 * Virtual File System System Calls
254 */
255
256#if NFSCLIENT || DEVFS || ROUTEFS
/*
 * Private in-kernel mounting SPI (not exported); used by NFS, devfs
 * and routefs.
 */
260 __private_extern__
261boolean_t
262vfs_iskernelmount(mount_t mp)
263{
264 return ((mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE);
265}
266
267 __private_extern__
268int
269kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path,
270 void *data, __unused size_t datalen, int syscall_flags, __unused uint32_t kern_flags, vfs_context_t ctx)
271{
272 struct nameidata nd;
273 boolean_t did_namei;
274 int error;
275
276 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
277 UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
278
279 /*
280 * Get the vnode to be covered if it's not supplied
281 */
282 if (vp == NULLVP) {
283 error = namei(&nd);
284 if (error)
285 return (error);
286 vp = nd.ni_vp;
287 pvp = nd.ni_dvp;
288 did_namei = TRUE;
289 } else {
290 char *pnbuf = CAST_DOWN(char *, path);
291
292 nd.ni_cnd.cn_pnbuf = pnbuf;
293 nd.ni_cnd.cn_pnlen = strlen(pnbuf) + 1;
294 did_namei = FALSE;
295 }
296
297 error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
298 syscall_flags, kern_flags, NULL, TRUE, ctx);
299
300 if (did_namei) {
301 vnode_put(vp);
302 vnode_put(pvp);
303 nameidone(&nd);
304 }
305
306 return (error);
307}
#endif /* NFSCLIENT || DEVFS || ROUTEFS */
309
310/*
311 * Mount a file system.
312 */
313/* ARGSUSED */
314int
315mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
316{
317 struct __mac_mount_args muap;
318
319 muap.type = uap->type;
320 muap.path = uap->path;
321 muap.flags = uap->flags;
322 muap.data = uap->data;
323 muap.mac_p = USER_ADDR_NULL;
324 return (__mac_mount(p, &muap, retval));
325}
326
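/*
 * fmount: mount a file system on the directory referenced by a file
 * descriptor rather than by path.  The covered vnode and its parent are
 * derived from the descriptor; the heavy lifting is done by mount_common().
 */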
327int
328fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
329{
330 struct componentname cn;
331 vfs_context_t ctx = vfs_context_current();
332 size_t dummy = 0;
333 int error;
334 int flags = uap->flags;
335 char fstypename[MFSNAMELEN];
336 char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
337 vnode_t pvp;
338 vnode_t vp;
339
340 AUDIT_ARG(fd, uap->fd);
341 AUDIT_ARG(fflags, flags);
342 /* fstypename will get audited by mount_common */
343
344 /* Sanity check the flags */
345 if (flags & (MNT_IMGSRC_BY_INDEX|MNT_ROOTFS)) {
346 return (ENOTSUP);
347 }
348
349 if (flags & MNT_UNION) {
350 return (EPERM);
351 }
352
353 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
354 if (error) {
355 return (error);
356 }
357
358 if ((error = file_vnode(uap->fd, &vp)) != 0) {
359 return (error);
360 }
361
362 if ((error = vnode_getwithref(vp)) != 0) {
363 file_drop(uap->fd);
364 return (error);
365 }
366
367 pvp = vnode_getparent(vp);
368 if (pvp == NULL) {
369 vnode_put(vp);
370 file_drop(uap->fd);
371 return (EINVAL);
372 }
373
374 memset(&cn, 0, sizeof(struct componentname));
375 MALLOC(cn.cn_pnbuf, char *, MAXPATHLEN, M_TEMP, M_WAITOK);
376 cn.cn_pnlen = MAXPATHLEN;
377
378 if((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
379 FREE(cn.cn_pnbuf, M_TEMP);
380 vnode_put(pvp);
381 vnode_put(vp);
382 file_drop(uap->fd);
383 return (error);
384 }
385
386 error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, 0, labelstr, FALSE, ctx);
387
388 FREE(cn.cn_pnbuf, M_TEMP);
389 vnode_put(pvp);
390 vnode_put(vp);
391 file_drop(uap->fd);
392
393 return (error);
394}
395
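/*
 * Notify interested parties that a new file system has been mounted:
 * post a VQ_MOUNT vfs event and a NOTE_WRITE knote on the parent of
 * the covered vnode.
 */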
396void
397vfs_notify_mount(vnode_t pdvp)
398{
399 vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
400 lock_vnode_and_post(pdvp, NOTE_WRITE);
401}
402
403/*
404 * __mac_mount:
405 * Mount a file system taking into account MAC label behavior.
406 * See mount(2) man page for more information
407 *
408 * Parameters: p Process requesting the mount
409 * uap User argument descriptor (see below)
410 * retval (ignored)
411 *
412 * Indirect: uap->type Filesystem type
413 * uap->path Path to mount
414 * uap->data Mount arguments
415 * uap->mac_p MAC info
416 * uap->flags Mount flags
417 *
418 *
419 * Returns: 0 Success
420 * !0 Not success
421 */
422boolean_t root_fs_upgrade_try = FALSE;
423
424int
425__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
426{
427 vnode_t pvp = NULL;
428 vnode_t vp = NULL;
429 int need_nameidone = 0;
430 vfs_context_t ctx = vfs_context_current();
431 char fstypename[MFSNAMELEN];
432 struct nameidata nd;
433 size_t dummy=0;
434 char *labelstr = NULL;
435 int flags = uap->flags;
436 int error;
437#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
438 boolean_t is_64bit = IS_64BIT_PROCESS(p);
439#else
440#pragma unused(p)
441#endif
442 /*
443 * Get the fs type name from user space
444 */
445 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
446 if (error)
447 return (error);
448
449 /*
450 * Get the vnode to be covered
451 */
452 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
453 UIO_USERSPACE, uap->path, ctx);
454 error = namei(&nd);
455 if (error) {
456 goto out;
457 }
458 need_nameidone = 1;
459 vp = nd.ni_vp;
460 pvp = nd.ni_dvp;
461
462#ifdef CONFIG_IMGSRC_ACCESS
463 /* Mounting image source cannot be batched with other operations */
464 if (flags == MNT_IMGSRC_BY_INDEX) {
465 error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
466 ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
467 goto out;
468 }
469#endif /* CONFIG_IMGSRC_ACCESS */
470
471#if CONFIG_MACF
472 /*
473 * Get the label string (if any) from user space
474 */
475 if (uap->mac_p != USER_ADDR_NULL) {
476 struct user_mac mac;
477 size_t ulen = 0;
478
479 if (is_64bit) {
480 struct user64_mac mac64;
481 error = copyin(uap->mac_p, &mac64, sizeof(mac64));
482 mac.m_buflen = mac64.m_buflen;
483 mac.m_string = mac64.m_string;
484 } else {
485 struct user32_mac mac32;
486 error = copyin(uap->mac_p, &mac32, sizeof(mac32));
487 mac.m_buflen = mac32.m_buflen;
488 mac.m_string = mac32.m_string;
489 }
490 if (error)
491 goto out;
492 if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
493 (mac.m_buflen < 2)) {
494 error = EINVAL;
495 goto out;
496 }
497 MALLOC(labelstr, char *, mac.m_buflen, M_MACTEMP, M_WAITOK);
498 error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
499 if (error) {
500 goto out;
501 }
502 AUDIT_ARG(mac_string, labelstr);
503 }
504#endif /* CONFIG_MACF */
505
506 AUDIT_ARG(fflags, flags);
507
508#if SECURE_KERNEL
509 if (flags & MNT_UNION) {
510 /* No union mounts on release kernels */
511 error = EPERM;
512 goto out;
513 }
514#endif
515
516 if ((vp->v_flag & VROOT) &&
517 (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
518 if (!(flags & MNT_UNION)) {
519 flags |= MNT_UPDATE;
520 }
521 else {
			/*
			 * For a union mount on '/', treat it as a fresh
			 * mount instead of an update.
			 * Otherwise, union mounting on '/' used to panic the
			 * system, since mnt_vnodecovered was found to be
			 * NULL for '/', which unionlookup requires after it
			 * gets ENOENT on a union mount.
			 */
530 flags = (flags & ~(MNT_UPDATE));
531 }
532
533#if SECURE_KERNEL
534 if ((flags & MNT_RDONLY) == 0) {
535 /* Release kernels are not allowed to mount "/" as rw */
536 error = EPERM;
537 goto out;
538 }
539#endif
540 /*
541 * See 7392553 for more details on why this check exists.
542 * Suffice to say: If this check is ON and something tries
543 * to mount the rootFS RW, we'll turn off the codesign
544 * bitmap optimization.
545 */
546#if CHECK_CS_VALIDATION_BITMAP
547 if ((flags & MNT_RDONLY) == 0 ) {
548 root_fs_upgrade_try = TRUE;
549 }
550#endif
551 }
552
553 error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
554 labelstr, FALSE, ctx);
555
556out:
557
558#if CONFIG_MACF
559 if (labelstr)
560 FREE(labelstr, M_MACTEMP);
561#endif /* CONFIG_MACF */
562
563 if (vp) {
564 vnode_put(vp);
565 }
566 if (pvp) {
567 vnode_put(pvp);
568 }
569 if (need_nameidone) {
570 nameidone(&nd);
571 }
572
573 return (error);
574}
575
/*
 * common mount implementation (final stage of mounting)
 *
 * Arguments:
 *  fstypename		file system type (i.e., its VFS name)
 *  pvp			parent of covered vnode
 *  vp			covered vnode
 *  cnp			component name (i.e., path) of covered vnode
 *  flags		generic mount flags
 *  internal_flags	kernel-internal mount flags (KERNEL_MOUNT_*)
 *  fsmountargs		file system specific data
 *  labelstr		optional MAC label
 *  kernelmount		TRUE for mounts initiated from inside the kernel
 *  ctx			caller's context
 */
590static int
591mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
592 struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
593 char *labelstr, boolean_t kernelmount, vfs_context_t ctx)
594{
595#if !CONFIG_MACF
596#pragma unused(labelstr)
597#endif
598 struct vnode *devvp = NULLVP;
599 struct vnode *device_vnode = NULLVP;
600#if CONFIG_MACF
601 struct vnode *rvp;
602#endif
603 struct mount *mp;
604 struct vfstable *vfsp = (struct vfstable *)0;
605 struct proc *p = vfs_context_proc(ctx);
606 int error, flag = 0;
607 user_addr_t devpath = USER_ADDR_NULL;
608 int ronly = 0;
609 int mntalloc = 0;
610 boolean_t vfsp_ref = FALSE;
611 boolean_t is_rwlock_locked = FALSE;
612 boolean_t did_rele = FALSE;
613 boolean_t have_usecount = FALSE;
614
615 /*
616 * Process an update for an existing mount
617 */
618 if (flags & MNT_UPDATE) {
619 if ((vp->v_flag & VROOT) == 0) {
620 error = EINVAL;
621 goto out1;
622 }
623 mp = vp->v_mount;
624
625 /* unmount in progress return error */
626 mount_lock_spin(mp);
627 if (mp->mnt_lflag & MNT_LUNMOUNT) {
628 mount_unlock(mp);
629 error = EBUSY;
630 goto out1;
631 }
632 mount_unlock(mp);
633 lck_rw_lock_exclusive(&mp->mnt_rwlock);
634 is_rwlock_locked = TRUE;
635 /*
636 * We only allow the filesystem to be reloaded if it
637 * is currently mounted read-only.
638 */
639 if ((flags & MNT_RELOAD) &&
640 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
641 error = ENOTSUP;
642 goto out1;
643 }
644
645 /*
646 * If content protection is enabled, update mounts are not
647 * allowed to turn it off.
648 */
649 if ((mp->mnt_flag & MNT_CPROTECT) &&
650 ((flags & MNT_CPROTECT) == 0)) {
651 error = EINVAL;
652 goto out1;
653 }
654
655#ifdef CONFIG_IMGSRC_ACCESS
656 /* Can't downgrade the backer of the root FS */
657 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
658 (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
659 error = ENOTSUP;
660 goto out1;
661 }
662#endif /* CONFIG_IMGSRC_ACCESS */
663
664 /*
665 * Only root, or the user that did the original mount is
666 * permitted to update it.
667 */
668 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
669 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
670 goto out1;
671 }
672#if CONFIG_MACF
673 error = mac_mount_check_remount(ctx, mp);
674 if (error != 0) {
675 goto out1;
676 }
677#endif
678 /*
679 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
680 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
681 */
682 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
683 flags |= MNT_NOSUID | MNT_NODEV;
684 if (mp->mnt_flag & MNT_NOEXEC)
685 flags |= MNT_NOEXEC;
686 }
687 flag = mp->mnt_flag;
688
689
690
691 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
692
693 vfsp = mp->mnt_vtable;
694 goto update;
695 }
696
697 /*
698 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
699 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
700 */
701 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
702 flags |= MNT_NOSUID | MNT_NODEV;
703 if (vp->v_mount->mnt_flag & MNT_NOEXEC)
704 flags |= MNT_NOEXEC;
705 }
706
707 /* XXXAUDIT: Should we capture the type on the error path as well? */
708 AUDIT_ARG(text, fstypename);
709 mount_list_lock();
710 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
711 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
712 vfsp->vfc_refcount++;
713 vfsp_ref = TRUE;
714 break;
715 }
716 mount_list_unlock();
717 if (vfsp == NULL) {
718 error = ENODEV;
719 goto out1;
720 }
721
722 /*
723 * VFC_VFSLOCALARGS is not currently supported for kernel mounts
724 */
725 if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS)) {
726 error = EINVAL; /* unsupported request */
727 goto out1;
728 }
729
730 error = prepare_coveredvp(vp, ctx, cnp, fstypename, ((internal_flags & KERNEL_MOUNT_NOAUTH) != 0));
731 if (error != 0) {
732 goto out1;
733 }
734
735 /*
736 * Allocate and initialize the filesystem (mount_t)
737 */
738 MALLOC_ZONE(mp, struct mount *, (u_int32_t)sizeof(struct mount),
739 M_MOUNT, M_WAITOK);
740 bzero((char *)mp, (u_int32_t)sizeof(struct mount));
741 mntalloc = 1;
742
743 /* Initialize the default IO constraints */
744 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
745 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
746 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
747 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
748 mp->mnt_devblocksize = DEV_BSIZE;
749 mp->mnt_alignmentmask = PAGE_MASK;
750 mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
751 mp->mnt_ioscale = 1;
752 mp->mnt_ioflags = 0;
753 mp->mnt_realrootvp = NULLVP;
754 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
755
756 TAILQ_INIT(&mp->mnt_vnodelist);
757 TAILQ_INIT(&mp->mnt_workerqueue);
758 TAILQ_INIT(&mp->mnt_newvnodes);
759 mount_lock_init(mp);
760 lck_rw_lock_exclusive(&mp->mnt_rwlock);
761 is_rwlock_locked = TRUE;
762 mp->mnt_op = vfsp->vfc_vfsops;
763 mp->mnt_vtable = vfsp;
764 //mp->mnt_stat.f_type = vfsp->vfc_typenum;
765 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
766 strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
767 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
768 mp->mnt_vnodecovered = vp;
769 mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
770 mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
771 mp->mnt_devbsdunit = 0;
772
773 /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
774 vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
775
776#if NFSCLIENT || DEVFS || ROUTEFS
777 if (kernelmount)
778 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
779 if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0)
780 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
#endif /* NFSCLIENT || DEVFS || ROUTEFS */
782
783update:
784
785 /*
786 * Set the mount level flags.
787 */
788 if (flags & MNT_RDONLY)
789 mp->mnt_flag |= MNT_RDONLY;
790 else if (mp->mnt_flag & MNT_RDONLY) {
791 // disallow read/write upgrades of file systems that
792 // had the TYPENAME_OVERRIDE feature set.
793 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
794 error = EPERM;
795 goto out1;
796 }
797 mp->mnt_kern_flag |= MNTK_WANTRDWR;
798 }
799 mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
800 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
801 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
802 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
803 MNT_QUARANTINE | MNT_CPROTECT);
804
805#if SECURE_KERNEL
806#if !CONFIG_MNT_SUID
807 /*
808 * On release builds of iOS based platforms, always enforce NOSUID on
809 * all mounts. We do this here because we can catch update mounts as well as
810 * non-update mounts in this case.
811 */
812 mp->mnt_flag |= (MNT_NOSUID);
813#endif
814#endif
815
816 mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
817 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
818 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
819 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
820 MNT_QUARANTINE | MNT_CPROTECT);
821
822#if CONFIG_MACF
823 if (flags & MNT_MULTILABEL) {
824 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
825 error = EINVAL;
826 goto out1;
827 }
828 mp->mnt_flag |= MNT_MULTILABEL;
829 }
830#endif
831 /*
832 * Process device path for local file systems if requested
833 */
834 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
835 !(internal_flags & KERNEL_MOUNT_SNAPSHOT)) {
836 if (vfs_context_is64bit(ctx)) {
837 if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) )
838 goto out1;
839 fsmountargs += sizeof(devpath);
840 } else {
841 user32_addr_t tmp;
842 if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) )
843 goto out1;
844 /* munge into LP64 addr */
845 devpath = CAST_USER_ADDR_T(tmp);
846 fsmountargs += sizeof(tmp);
847 }
848
849 /* Lookup device and authorize access to it */
850 if ((devpath)) {
851 struct nameidata nd;
852
853 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, UIO_USERSPACE, devpath, ctx);
854 if ( (error = namei(&nd)) )
855 goto out1;
856
857 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
858 devvp = nd.ni_vp;
859
860 nameidone(&nd);
861
862 if (devvp->v_type != VBLK) {
863 error = ENOTBLK;
864 goto out2;
865 }
866 if (major(devvp->v_rdev) >= nblkdev) {
867 error = ENXIO;
868 goto out2;
869 }
870 /*
871 * If mount by non-root, then verify that user has necessary
872 * permissions on the device.
873 */
874 if (suser(vfs_context_ucred(ctx), NULL) != 0) {
875 mode_t accessmode = KAUTH_VNODE_READ_DATA;
876
877 if ((mp->mnt_flag & MNT_RDONLY) == 0)
878 accessmode |= KAUTH_VNODE_WRITE_DATA;
879 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0)
880 goto out2;
881 }
882 }
883 /* On first mount, preflight and open device */
884 if (devpath && ((flags & MNT_UPDATE) == 0)) {
885 if ( (error = vnode_ref(devvp)) )
886 goto out2;
887 /*
888 * Disallow multiple mounts of the same device.
889 * Disallow mounting of a device that is currently in use
890 * (except for root, which might share swap device for miniroot).
891 * Flush out any old buffers remaining from a previous use.
892 */
893 if ( (error = vfs_mountedon(devvp)) )
894 goto out3;
895
896 if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
897 error = EBUSY;
898 goto out3;
899 }
900 if ( (error = VNOP_FSYNC(devvp, MNT_WAIT, ctx)) ) {
901 error = ENOTBLK;
902 goto out3;
903 }
904 if ( (error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0)) )
905 goto out3;
906
907 ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
908#if CONFIG_MACF
909 error = mac_vnode_check_open(ctx,
910 devvp,
911 ronly ? FREAD : FREAD|FWRITE);
912 if (error)
913 goto out3;
914#endif /* MAC */
915 if ( (error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, ctx)) )
916 goto out3;
917
918 mp->mnt_devvp = devvp;
919 device_vnode = devvp;
920
921 } else if ((mp->mnt_flag & MNT_RDONLY) &&
922 (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
923 (device_vnode = mp->mnt_devvp)) {
924 dev_t dev;
925 int maj;
926 /*
927 * If upgrade to read-write by non-root, then verify
928 * that user has necessary permissions on the device.
929 */
930 vnode_getalways(device_vnode);
931
932 if (suser(vfs_context_ucred(ctx), NULL) &&
933 (error = vnode_authorize(device_vnode, NULL,
934 KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
935 ctx)) != 0) {
936 vnode_put(device_vnode);
937 goto out2;
938 }
939
940 /* Tell the device that we're upgrading */
941 dev = (dev_t)device_vnode->v_rdev;
942 maj = major(dev);
943
944 if ((u_int)maj >= (u_int)nblkdev)
945 panic("Volume mounted on a device with invalid major number.");
946
947 error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
948 vnode_put(device_vnode);
949 device_vnode = NULLVP;
950 if (error != 0) {
951 goto out2;
952 }
953 }
954 }
955#if CONFIG_MACF
956 if ((flags & MNT_UPDATE) == 0) {
957 mac_mount_label_init(mp);
958 mac_mount_label_associate(ctx, mp);
959 }
960 if (labelstr) {
961 if ((flags & MNT_UPDATE) != 0) {
962 error = mac_mount_check_label_update(ctx, mp);
963 if (error != 0)
964 goto out3;
965 }
966 }
967#endif
968 /*
969 * Mount the filesystem.
970 */
971 if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
972 error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
973 (caddr_t)fsmountargs, 0, ctx);
974 } else {
975 error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
976 }
977
978 if (flags & MNT_UPDATE) {
979 if (mp->mnt_kern_flag & MNTK_WANTRDWR)
980 mp->mnt_flag &= ~MNT_RDONLY;
981 mp->mnt_flag &=~
982 (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
983 mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
984 if (error)
985 mp->mnt_flag = flag; /* restore flag value */
986 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
987 lck_rw_done(&mp->mnt_rwlock);
988 is_rwlock_locked = FALSE;
989 if (!error)
990 enablequotas(mp, ctx);
991 goto exit;
992 }
993
994 /*
995 * Put the new filesystem on the mount list after root.
996 */
997 if (error == 0) {
998 struct vfs_attr vfsattr;
999#if CONFIG_MACF
1000 if (vfs_flags(mp) & MNT_MULTILABEL) {
1001 error = VFS_ROOT(mp, &rvp, ctx);
1002 if (error) {
1003 printf("%s() VFS_ROOT returned %d\n", __func__, error);
1004 goto out3;
1005 }
1006 error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1007 /*
1008 * drop reference provided by VFS_ROOT
1009 */
1010 vnode_put(rvp);
1011
1012 if (error)
1013 goto out3;
1014 }
1015#endif /* MAC */
1016
1017 vnode_lock_spin(vp);
1018 CLR(vp->v_flag, VMOUNT);
1019 vp->v_mountedhere = mp;
1020 vnode_unlock(vp);
1021
		/*
		 * Taking the name_cache_lock exclusively will ensure that
		 * everyone who might be trying to use a now-stale copy of
		 * vp->v_mountedhere->mnt_realrootvp is out of the fast path.
		 * Bumping mount_generation causes the cached values to be
		 * invalidated.
		 */
1030 name_cache_lock();
1031 mount_generation++;
1032 name_cache_unlock();
1033
1034 error = vnode_ref(vp);
1035 if (error != 0) {
1036 goto out4;
1037 }
1038
1039 have_usecount = TRUE;
1040
1041 error = checkdirs(vp, ctx);
1042 if (error != 0) {
1043 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1044 goto out4;
1045 }
		/*
		 * There is no cleanup code here, so the return value is
		 * cast to void; we need to revisit this.
		 */
1050 (void)VFS_START(mp, 0, ctx);
1051
1052 if (mount_list_add(mp) != 0) {
1053 /*
1054 * The system is shutting down trying to umount
1055 * everything, so fail with a plausible errno.
1056 */
1057 error = EBUSY;
1058 goto out4;
1059 }
1060 lck_rw_done(&mp->mnt_rwlock);
1061 is_rwlock_locked = FALSE;
1062
1063 /* Check if this mounted file system supports EAs or named streams. */
1064 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1065 VFSATTR_INIT(&vfsattr);
1066 VFSATTR_WANTED(&vfsattr, f_capabilities);
1067 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1068 vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1069 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1070 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1071 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1072 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1073 }
1074#if NAMEDSTREAMS
1075 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1076 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1077 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1078 }
1079#endif
1080 /* Check if this file system supports path from id lookups. */
1081 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1082 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1083 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1084 } else if (mp->mnt_flag & MNT_DOVOLFS) {
1085 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1086 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1087 }
1088
1089 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1090 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1091 mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1092 }
1093 }
1094 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1095 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1096 }
1097 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1098 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1099 }
1100 /* increment the operations count */
1101 OSAddAtomic(1, &vfs_nummntops);
1102 enablequotas(mp, ctx);
1103
1104 if (device_vnode) {
1105 device_vnode->v_specflags |= SI_MOUNTEDON;
1106
1107 /*
1108 * cache the IO attributes for the underlying physical media...
1109 * an error return indicates the underlying driver doesn't
1110 * support all the queries necessary... however, reasonable
1111 * defaults will have been set, so no reason to bail or care
1112 */
1113 vfs_init_io_attributes(device_vnode, mp);
1114 }
1115
1116 /* Now that mount is setup, notify the listeners */
1117 vfs_notify_mount(pvp);
1118 IOBSDMountChange(mp, kIOMountChangeMount);
1119
1120 } else {
1121 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1122 if (mp->mnt_vnodelist.tqh_first != NULL) {
1123 panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1124 mp->mnt_vtable->vfc_name, error);
1125 }
1126
1127 vnode_lock_spin(vp);
1128 CLR(vp->v_flag, VMOUNT);
1129 vnode_unlock(vp);
1130 mount_list_lock();
1131 mp->mnt_vtable->vfc_refcount--;
1132 mount_list_unlock();
1133
1134 if (device_vnode ) {
1135 vnode_rele(device_vnode);
1136 VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD|FWRITE, ctx);
1137 }
1138 lck_rw_done(&mp->mnt_rwlock);
1139 is_rwlock_locked = FALSE;
1140
1141 /*
1142 * if we get here, we have a mount structure that needs to be freed,
1143 * but since the coveredvp hasn't yet been updated to point at it,
1144 * no need to worry about other threads holding a crossref on this mp
1145 * so it's ok to just free it
1146 */
1147 mount_lock_destroy(mp);
1148#if CONFIG_MACF
1149 mac_mount_label_destroy(mp);
1150#endif
1151 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
1152 }
1153exit:
1154 /*
1155 * drop I/O count on the device vp if there was one
1156 */
1157 if (devpath && devvp)
1158 vnode_put(devvp);
1159
1160 return(error);
1161
1162/* Error condition exits */
1163out4:
1164 (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1165
1166 /*
1167 * If the mount has been placed on the covered vp,
1168 * it may have been discovered by now, so we have
1169 * to treat this just like an unmount
1170 */
1171 mount_lock_spin(mp);
1172 mp->mnt_lflag |= MNT_LDEAD;
1173 mount_unlock(mp);
1174
1175 if (device_vnode != NULLVP) {
1176 vnode_rele(device_vnode);
1177 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD|FWRITE,
1178 ctx);
1179 did_rele = TRUE;
1180 }
1181
1182 vnode_lock_spin(vp);
1183
1184 mp->mnt_crossref++;
1185 vp->v_mountedhere = (mount_t) 0;
1186
1187 vnode_unlock(vp);
1188
1189 if (have_usecount) {
1190 vnode_rele(vp);
1191 }
1192out3:
1193 if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele))
1194 vnode_rele(devvp);
1195out2:
1196 if (devpath && devvp)
1197 vnode_put(devvp);
1198out1:
1199 /* Release mnt_rwlock only when it was taken */
1200 if (is_rwlock_locked == TRUE) {
1201 lck_rw_done(&mp->mnt_rwlock);
1202 }
1203
1204 if (mntalloc) {
1205 if (mp->mnt_crossref)
1206 mount_dropcrossref(mp, vp, 0);
1207 else {
1208 mount_lock_destroy(mp);
1209#if CONFIG_MACF
1210 mac_mount_label_destroy(mp);
1211#endif
1212 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
1213 }
1214 }
1215 if (vfsp_ref) {
1216 mount_list_lock();
1217 vfsp->vfc_refcount--;
1218 mount_list_unlock();
1219 }
1220
1221 return(error);
1222}
1223
1224/*
1225 * Flush in-core data, check for competing mount attempts,
1226 * and set VMOUNT
1227 */
1228int
1229prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth)
1230{
1231#if !CONFIG_MACF
1232#pragma unused(cnp,fsname)
1233#endif
1234 struct vnode_attr va;
1235 int error;
1236
1237 if (!skip_auth) {
1238 /*
1239 * If the user is not root, ensure that they own the directory
1240 * onto which we are attempting to mount.
1241 */
1242 VATTR_INIT(&va);
1243 VATTR_WANTED(&va, va_uid);
1244 if ((error = vnode_getattr(vp, &va, ctx)) ||
1245 (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1246 (!vfs_context_issuser(ctx)))) {
1247 error = EPERM;
1248 goto out;
1249 }
1250 }
1251
1252 if ( (error = VNOP_FSYNC(vp, MNT_WAIT, ctx)) )
1253 goto out;
1254
1255 if ( (error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0)) )
1256 goto out;
1257
1258 if (vp->v_type != VDIR) {
1259 error = ENOTDIR;
1260 goto out;
1261 }
1262
1263 if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
1264 error = EBUSY;
1265 goto out;
1266 }
1267
1268#if CONFIG_MACF
1269 error = mac_mount_check_mount(ctx, vp,
1270 cnp, fsname);
1271 if (error != 0)
1272 goto out;
1273#endif
1274
1275 vnode_lock_spin(vp);
1276 SET(vp->v_flag, VMOUNT);
1277 vnode_unlock(vp);
1278
1279out:
1280 return error;
1281}
1282
1283#if CONFIG_IMGSRC_ACCESS
1284
1285#if DEBUG
1286#define IMGSRC_DEBUG(args...) printf(args)
1287#else
1288#define IMGSRC_DEBUG(args...) do { } while(0)
1289#endif
1290
1291static int
1292authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1293{
1294 struct nameidata nd;
1295 vnode_t vp, realdevvp;
1296 mode_t accessmode;
1297 int error;
1298
1299 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, UIO_USERSPACE, devpath, ctx);
1300 if ( (error = namei(&nd)) ) {
1301 IMGSRC_DEBUG("namei() failed with %d\n", error);
1302 return error;
1303 }
1304
1305 vp = nd.ni_vp;
1306
1307 if (!vnode_isblk(vp)) {
1308 IMGSRC_DEBUG("Not block device.\n");
1309 error = ENOTBLK;
1310 goto out;
1311 }
1312
1313 realdevvp = mp->mnt_devvp;
1314 if (realdevvp == NULLVP) {
1315 IMGSRC_DEBUG("No device backs the mount.\n");
1316 error = ENXIO;
1317 goto out;
1318 }
1319
1320 error = vnode_getwithref(realdevvp);
1321 if (error != 0) {
		IMGSRC_DEBUG("Couldn't get iocount on device.\n");
1323 goto out;
1324 }
1325
1326 if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
1327 IMGSRC_DEBUG("Wrong dev_t.\n");
1328 error = ENXIO;
1329 goto out1;
1330 }
1331
1332 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1333
1334 /*
1335 * If mount by non-root, then verify that user has necessary
1336 * permissions on the device.
1337 */
1338 if (!vfs_context_issuser(ctx)) {
1339 accessmode = KAUTH_VNODE_READ_DATA;
1340 if ((mp->mnt_flag & MNT_RDONLY) == 0)
1341 accessmode |= KAUTH_VNODE_WRITE_DATA;
1342 if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
1343 IMGSRC_DEBUG("Access denied.\n");
1344 goto out1;
1345 }
1346 }
1347
1348 *devvpp = vp;
1349
1350out1:
1351 vnode_put(realdevvp);
1352out:
1353 nameidone(&nd);
1354 if (error) {
1355 vnode_put(vp);
1356 }
1357
1358 return error;
1359}
1360
1361/*
1362 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
1363 * and call checkdirs()
1364 */
1365static int
1366place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
1367{
1368 int error;
1369
1370 mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */
1371
1372 vnode_lock_spin(vp);
1373 CLR(vp->v_flag, VMOUNT);
1374 vp->v_mountedhere = mp;
1375 vnode_unlock(vp);
1376
	/*
	 * Taking the name_cache_lock exclusively will ensure that
	 * everyone who might be trying to use a now-stale copy of
	 * vp->v_mountedhere->mnt_realrootvp is out of the fast path.
	 * Bumping mount_generation causes the cached values to be
	 * invalidated.
	 */
1385 name_cache_lock();
1386 mount_generation++;
1387 name_cache_unlock();
1388
1389 error = vnode_ref(vp);
1390 if (error != 0) {
1391 goto out;
1392 }
1393
1394 error = checkdirs(vp, ctx);
1395 if (error != 0) {
1396 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1397 vnode_rele(vp);
1398 goto out;
1399 }
1400
1401out:
1402 if (error != 0) {
1403 mp->mnt_vnodecovered = NULLVP;
1404 }
1405 return error;
1406}
1407
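/*
 * Undo place_mount_and_checkdirs(): drop the usecount taken on the covered
 * vnode and detach the mount from it.
 */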
1408static void
1409undo_place_on_covered_vp(mount_t mp, vnode_t vp)
1410{
1411 vnode_rele(vp);
1412 vnode_lock_spin(vp);
1413 vp->v_mountedhere = (mount_t)NULL;
1414 vnode_unlock(vp);
1415
1416 mp->mnt_vnodecovered = NULLVP;
1417}
1418
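/*
 * Take the mount rwlock exclusively and authorize an update (remount) of mp.
 * Fails if an unmount is in progress, if MNT_RELOAD is requested on a
 * read-write mount, or if the caller is neither the original mounter nor
 * superuser.  The rwlock is released on error.
 */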
1419static int
1420mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
1421{
1422 int error;
1423
1424 /* unmount in progress return error */
1425 mount_lock_spin(mp);
1426 if (mp->mnt_lflag & MNT_LUNMOUNT) {
1427 mount_unlock(mp);
1428 return EBUSY;
1429 }
1430 mount_unlock(mp);
1431 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1432
1433 /*
1434 * We only allow the filesystem to be reloaded if it
1435 * is currently mounted read-only.
1436 */
1437 if ((flags & MNT_RELOAD) &&
1438 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1439 error = ENOTSUP;
1440 goto out;
1441 }
1442
1443 /*
1444 * Only root, or the user that did the original mount is
1445 * permitted to update it.
1446 */
1447 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1448 (!vfs_context_issuser(ctx))) {
1449 error = EPERM;
1450 goto out;
1451 }
1452#if CONFIG_MACF
1453 error = mac_mount_check_remount(ctx, mp);
1454 if (error != 0) {
1455 goto out;
1456 }
1457#endif
1458
1459out:
1460 if (error) {
1461 lck_rw_done(&mp->mnt_rwlock);
1462 }
1463
1464 return error;
1465}
1466
1467static void
1468mount_end_update(mount_t mp)
1469{
1470 lck_rw_done(&mp->mnt_rwlock);
1471}
1472
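/*
 * Return, with an iocount, the imageboot source root vnode at the given
 * nesting height; ENOENT if there is none.
 */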
1473static int
1474get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
1475{
1476 vnode_t vp;
1477
1478 if (height >= MAX_IMAGEBOOT_NESTING) {
1479 return EINVAL;
1480 }
1481
1482 vp = imgsrc_rootvnodes[height];
1483 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
1484 *rvpp = vp;
1485 return 0;
1486 } else {
1487 return ENOENT;
1488 }
1489}
1490
1491static int
1492relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp,
1493 const char *fsname, vfs_context_t ctx,
1494 boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
1495{
1496 int error;
1497 mount_t mp;
1498 boolean_t placed = FALSE;
1499 vnode_t devvp = NULLVP;
1500 struct vfstable *vfsp;
1501 user_addr_t devpath;
1502 char *old_mntonname;
1503 vnode_t rvp;
1504 uint32_t height;
1505 uint32_t flags;
1506
1507 /* If we didn't imageboot, nothing to move */
1508 if (imgsrc_rootvnodes[0] == NULLVP) {
1509 return EINVAL;
1510 }
1511
1512 /* Only root can do this */
1513 if (!vfs_context_issuser(ctx)) {
1514 return EPERM;
1515 }
1516
1517 IMGSRC_DEBUG("looking for root vnode.\n");
1518
1519 /*
1520 * Get root vnode of filesystem we're moving.
1521 */
1522 if (by_index) {
1523 if (is64bit) {
1524 struct user64_mnt_imgsrc_args mia64;
1525 error = copyin(fsmountargs, &mia64, sizeof(mia64));
1526 if (error != 0) {
1527 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1528 return error;
1529 }
1530
1531 height = mia64.mi_height;
1532 flags = mia64.mi_flags;
1533 devpath = mia64.mi_devpath;
1534 } else {
1535 struct user32_mnt_imgsrc_args mia32;
1536 error = copyin(fsmountargs, &mia32, sizeof(mia32));
1537 if (error != 0) {
1538 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1539 return error;
1540 }
1541
1542 height = mia32.mi_height;
1543 flags = mia32.mi_flags;
1544 devpath = mia32.mi_devpath;
1545 }
1546 } else {
1547 /*
1548 * For binary compatibility--assumes one level of nesting.
1549 */
1550 if (is64bit) {
1551 if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) )
1552 return error;
1553 } else {
1554 user32_addr_t tmp;
1555 if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) )
1556 return error;
1557
1558 /* munge into LP64 addr */
1559 devpath = CAST_USER_ADDR_T(tmp);
1560 }
1561
1562 height = 0;
1563 flags = 0;
1564 }
1565
1566 if (flags != 0) {
1567 IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
1568 return EINVAL;
1569 }
1570
1571 error = get_imgsrc_rootvnode(height, &rvp);
1572 if (error != 0) {
1573 IMGSRC_DEBUG("getting root vnode failed with %d\n", error);
1574 return error;
1575 }
1576
1577 IMGSRC_DEBUG("got root vnode.\n");
1578
1579 MALLOC(old_mntonname, char*, MAXPATHLEN, M_TEMP, M_WAITOK);
1580
1581 /* Can only move once */
1582 mp = vnode_mount(rvp);
1583 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1584 IMGSRC_DEBUG("Already moved.\n");
1585 error = EBUSY;
1586 goto out0;
1587 }
1588
	IMGSRC_DEBUG("Starting update.\n");
1590
1591 /* Get exclusive rwlock on mount, authorize update on mp */
1592 error = mount_begin_update(mp , ctx, 0);
1593 if (error != 0) {
		IMGSRC_DEBUG("Starting update failed with %d\n", error);
1595 goto out0;
1596 }
1597
1598 /*
1599 * It can only be moved once. Flag is set under the rwlock,
1600 * so we're now safe to proceed.
1601 */
1602 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1603 IMGSRC_DEBUG("Already moved [2]\n");
1604 goto out1;
1605 }
1606
1607
1608 IMGSRC_DEBUG("Preparing coveredvp.\n");
1609
1610 /* Mark covered vnode as mount in progress, authorize placing mount on top */
1611 error = prepare_coveredvp(vp, ctx, cnp, fsname, FALSE);
1612 if (error != 0) {
1613 IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
1614 goto out1;
1615 }
1616
1617 IMGSRC_DEBUG("Covered vp OK.\n");
1618
1619 /* Sanity check the name caller has provided */
1620 vfsp = mp->mnt_vtable;
1621 if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
1622 IMGSRC_DEBUG("Wrong fs name.\n");
1623 error = EINVAL;
1624 goto out2;
1625 }
1626
1627 /* Check the device vnode and update mount-from name, for local filesystems */
1628 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
1629 IMGSRC_DEBUG("Local, doing device validation.\n");
1630
1631 if (devpath != USER_ADDR_NULL) {
1632 error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
1633 if (error) {
1634 IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
1635 goto out2;
1636 }
1637
1638 vnode_put(devvp);
1639 }
1640 }
1641
1642 /*
1643 * Place mp on top of vnode, ref the vnode, call checkdirs(),
1644 * and increment the name cache's mount generation
1645 */
1646
1647 IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
1648 error = place_mount_and_checkdirs(mp, vp, ctx);
1649 if (error != 0) {
1650 goto out2;
1651 }
1652
1653 placed = TRUE;
1654
1655 strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
1656 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1657
1658 /* Forbid future moves */
1659 mount_lock(mp);
1660 mp->mnt_kern_flag |= MNTK_HAS_MOVED;
1661 mount_unlock(mp);
1662
1663 /* Finally, add to mount list, completely ready to go */
1664 if (mount_list_add(mp) != 0) {
1665 /*
1666 * The system is shutting down trying to umount
1667 * everything, so fail with a plausible errno.
1668 */
1669 error = EBUSY;
1670 goto out3;
1671 }
1672
1673 mount_end_update(mp);
1674 vnode_put(rvp);
1675 FREE(old_mntonname, M_TEMP);
1676
1677 vfs_notify_mount(pvp);
1678
1679 return 0;
1680out3:
1681 strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
1682
1683 mount_lock(mp);
1684 mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
1685 mount_unlock(mp);
1686
1687out2:
1688 /*
1689 * Placing the mp on the vnode clears VMOUNT,
1690 * so cleanup is different after that point
1691 */
1692 if (placed) {
1693 /* Rele the vp, clear VMOUNT and v_mountedhere */
1694 undo_place_on_covered_vp(mp, vp);
1695 } else {
1696 vnode_lock_spin(vp);
1697 CLR(vp->v_flag, VMOUNT);
1698 vnode_unlock(vp);
1699 }
1700out1:
1701 mount_end_update(mp);
1702
1703out0:
1704 vnode_put(rvp);
1705 FREE(old_mntonname, M_TEMP);
1706 return error;
1707}
1708
1709#endif /* CONFIG_IMGSRC_ACCESS */
1710
1711void
1712enablequotas(struct mount *mp, vfs_context_t ctx)
1713{
1714 struct nameidata qnd;
1715 int type;
1716 char qfpath[MAXPATHLEN];
1717 const char *qfname = QUOTAFILENAME;
1718 const char *qfopsname = QUOTAOPSNAME;
1719 const char *qfextension[] = INITQFNAMES;
1720
	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
1722 if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0 ) {
1723 return;
1724 }
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors, as this should not interfere with the final mount.
	 */
1729 for (type=0; type < MAXQUOTAS; type++) {
1730 snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
1731 NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
1732 CAST_USER_ADDR_T(qfpath), ctx);
1733 if (namei(&qnd) != 0)
1734 continue; /* option file to trigger quotas is not present */
1735 vnode_put(qnd.ni_vp);
1736 nameidone(&qnd);
1737 snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);
1738
1739 (void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
1740 }
1741 return;
1742}
1743
1744
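/*
 * Per-process callback for checkdirs(): if the process's current or root
 * directory is the newly covered vnode, swap it for the root of the new
 * mount, transferring the directory references accordingly.
 */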
1745static int
1746checkdirs_callback(proc_t p, void * arg)
1747{
1748 struct cdirargs * cdrp = (struct cdirargs * )arg;
1749 vnode_t olddp = cdrp->olddp;
1750 vnode_t newdp = cdrp->newdp;
1751 struct filedesc *fdp;
1752 vnode_t tvp;
1753 vnode_t fdp_cvp;
1754 vnode_t fdp_rvp;
1755 int cdir_changed = 0;
1756 int rdir_changed = 0;
1757
1758 /*
1759 * XXX Also needs to iterate each thread in the process to see if it
1760 * XXX is using a per-thread current working directory, and, if so,
1761 * XXX update that as well.
1762 */
1763
1764 proc_fdlock(p);
1765 fdp = p->p_fd;
1766 if (fdp == (struct filedesc *)0) {
1767 proc_fdunlock(p);
1768 return(PROC_RETURNED);
1769 }
1770 fdp_cvp = fdp->fd_cdir;
1771 fdp_rvp = fdp->fd_rdir;
1772 proc_fdunlock(p);
1773
1774 if (fdp_cvp == olddp) {
1775 vnode_ref(newdp);
1776 tvp = fdp->fd_cdir;
1777 fdp_cvp = newdp;
1778 cdir_changed = 1;
1779 vnode_rele(tvp);
1780 }
1781 if (fdp_rvp == olddp) {
1782 vnode_ref(newdp);
1783 tvp = fdp->fd_rdir;
1784 fdp_rvp = newdp;
1785 rdir_changed = 1;
1786 vnode_rele(tvp);
1787 }
1788 if (cdir_changed || rdir_changed) {
1789 proc_fdlock(p);
1790 fdp->fd_cdir = fdp_cvp;
1791 fdp->fd_rdir = fdp_rvp;
1792 proc_fdunlock(p);
1793 }
1794 return(PROC_RETURNED);
1795}
1796
1797
1798
1799/*
1800 * Scan all active processes to see if any of them have a current
1801 * or root directory onto which the new filesystem has just been
1802 * mounted. If so, replace them with the new mount point.
1803 */
1804static int
1805checkdirs(vnode_t olddp, vfs_context_t ctx)
1806{
1807 vnode_t newdp;
1808 vnode_t tvp;
1809 int err;
1810 struct cdirargs cdr;
1811
1812 if (olddp->v_usecount == 1)
1813 return(0);
1814 err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);
1815
1816 if (err != 0) {
1817#if DIAGNOSTIC
1818 panic("mount: lost mount: error %d", err);
1819#endif
1820 return(err);
1821 }
1822
1823 cdr.olddp = olddp;
1824 cdr.newdp = newdp;
1825 /* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
1826 proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);
1827
1828 if (rootvnode == olddp) {
1829 vnode_ref(newdp);
1830 tvp = rootvnode;
1831 rootvnode = newdp;
1832 vnode_rele(tvp);
1833 }
1834
1835 vnode_put(newdp);
1836 return(0);
1837}
1838
1839/*
1840 * Unmount a file system.
1841 *
1842 * Note: unmount takes a path to the vnode mounted on as argument,
1843 * not special file (as before).
1844 */
1845/* ARGSUSED */
1846int
1847unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
1848{
1849 vnode_t vp;
1850 struct mount *mp;
1851 int error;
1852 struct nameidata nd;
1853 vfs_context_t ctx = vfs_context_current();
1854
1855 NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
1856 UIO_USERSPACE, uap->path, ctx);
1857 error = namei(&nd);
1858 if (error)
1859 return (error);
1860 vp = nd.ni_vp;
1861 mp = vp->v_mount;
1862 nameidone(&nd);
1863
1864#if CONFIG_MACF
1865 error = mac_mount_check_umount(ctx, mp);
1866 if (error != 0) {
1867 vnode_put(vp);
1868 return (error);
1869 }
1870#endif
1871 /*
1872 * Must be the root of the filesystem
1873 */
1874 if ((vp->v_flag & VROOT) == 0) {
1875 vnode_put(vp);
1876 return (EINVAL);
1877 }
1878 mount_ref(mp, 0);
1879 vnode_put(vp);
1880 /* safedounmount consumes the mount ref */
1881 return (safedounmount(mp, uap->flags, ctx));
1882}
1883
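/*
 * Unmount the file system identified by fsid.  Takes a mount ref, which
 * safedounmount() consumes.
 */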
1884int
1885vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
1886{
1887 mount_t mp;
1888
1889 mp = mount_list_lookupby_fsid(fsid, 0, 1);
1890 if (mp == (mount_t)0) {
1891 return(ENOENT);
1892 }
1893 mount_ref(mp, 0);
1894 mount_iterdrop(mp);
1895 /* safedounmount consumes the mount ref */
1896 return(safedounmount(mp, flags, ctx));
1897}
1898
1899
/*
 * The mount struct comes with a mount ref, which will be consumed.
 * Do the actual file system unmount and prevent some common foot-shooting.
 */
1904int
1905safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
1906{
1907 int error;
1908 proc_t p = vfs_context_proc(ctx);
1909
1910 /*
1911 * If the file system is not responding and MNT_NOBLOCK
1912 * is set and not a forced unmount then return EBUSY.
1913 */
1914 if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
1915 (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
1916 error = EBUSY;
1917 goto out;
1918 }
1919
1920 /*
1921 * Skip authorization if the mount is tagged as permissive and
1922 * this is not a forced-unmount attempt.
1923 */
1924 if (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0))) {
1925 /*
1926 * Only root, or the user that did the original mount is
1927 * permitted to unmount this filesystem.
1928 */
1929 if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
1930 (error = suser(kauth_cred_get(), &p->p_acflag)))
1931 goto out;
1932 }
1933 /*
1934 * Don't allow unmounting the root file system.
1935 */
1936 if (mp->mnt_flag & MNT_ROOTFS) {
1937 error = EBUSY; /* the root is always busy */
1938 goto out;
1939 }
1940
1941#ifdef CONFIG_IMGSRC_ACCESS
1942 if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
1943 error = EBUSY;
1944 goto out;
1945 }
1946#endif /* CONFIG_IMGSRC_ACCESS */
1947
1948 return (dounmount(mp, flags, 1, ctx));
1949
1950out:
1951 mount_drop(mp, 0);
1952 return(error);
1953}
1954
1955/*
1956 * Do the actual file system unmount.
1957 */
1958int
1959dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
1960{
1961 vnode_t coveredvp = (vnode_t)0;
1962 int error;
1963 int needwakeup = 0;
1964 int forcedunmount = 0;
1965 int lflags = 0;
1966 struct vnode *devvp = NULLVP;
1967#if CONFIG_TRIGGERS
1968 proc_t p = vfs_context_proc(ctx);
1969 int did_vflush = 0;
1970 int pflags_save = 0;
1971#endif /* CONFIG_TRIGGERS */
1972
1973#if CONFIG_FSE
1974 if (!(flags & MNT_FORCE)) {
1975 fsevent_unmount(mp, ctx); /* has to come first! */
1976 }
1977#endif
1978
1979 mount_lock(mp);
1980
1981 /*
1982 * If already an unmount in progress just return EBUSY.
1983 * Even a forced unmount cannot override.
1984 */
1985 if (mp->mnt_lflag & MNT_LUNMOUNT) {
1986 if (withref != 0)
1987 mount_drop(mp, 1);
1988 mount_unlock(mp);
1989 return (EBUSY);
1990 }
1991
1992 if (flags & MNT_FORCE) {
1993 forcedunmount = 1;
1994 mp->mnt_lflag |= MNT_LFORCE;
1995 }
1996
1997#if CONFIG_TRIGGERS
1998 if (flags & MNT_NOBLOCK && p != kernproc)
1999 pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
2000#endif
2001
2002 mp->mnt_kern_flag |= MNTK_UNMOUNT;
2003 mp->mnt_lflag |= MNT_LUNMOUNT;
2004 mp->mnt_flag &=~ MNT_ASYNC;
2005 /*
2006 * anyone currently in the fast path that
2007 * trips over the cached rootvp will be
2008 * dumped out and forced into the slow path
2009 * to regenerate a new cached value
2010 */
2011 mp->mnt_realrootvp = NULLVP;
2012 mount_unlock(mp);
2013
2014 if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
2015 /*
2016 * Force unmount any mounts in this filesystem.
2017 * If any unmounts fail - just leave them dangling.
2018 * Avoids recursion.
2019 */
2020 (void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
2021 }
2022
2023 /*
2024	 * Taking the name_cache_lock exclusively will
2025	 * ensure that everyone is out of the fast path who
2026	 * might be trying to use a now stale copy of
2027	 * vp->v_mountedhere->mnt_realrootvp.
2028	 * Bumping mount_generation causes the cached values
2029	 * to be invalidated.
2030 */
2031 name_cache_lock();
2032 mount_generation++;
2033 name_cache_unlock();
2034
2035
2036 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2037 if (withref != 0)
2038 mount_drop(mp, 0);
2039 error = 0;
2040 if (forcedunmount == 0) {
2041 ubc_umount(mp); /* release cached vnodes */
2042 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2043 error = VFS_SYNC(mp, MNT_WAIT, ctx);
2044 if (error) {
2045 mount_lock(mp);
2046 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2047 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2048 mp->mnt_lflag &= ~MNT_LFORCE;
2049 goto out;
2050 }
2051 }
2052 }
2053
2054 /* free disk_conditioner_info structure for this mount */
2055 disk_conditioner_unmount(mp);
2056
2057 IOBSDMountChange(mp, kIOMountChangeUnmount);
2058
2059#if CONFIG_TRIGGERS
2060 vfs_nested_trigger_unmounts(mp, flags, ctx);
2061 did_vflush = 1;
2062#endif
2063 if (forcedunmount)
2064 lflags |= FORCECLOSE;
2065 error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
2066 if ((forcedunmount == 0) && error) {
2067 mount_lock(mp);
2068 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2069 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2070 mp->mnt_lflag &= ~MNT_LFORCE;
2071 goto out;
2072 }
2073
2074	/* make sure no one is in the mount iterations or lookup */
2075 mount_iterdrain(mp);
2076
2077 error = VFS_UNMOUNT(mp, flags, ctx);
2078 if (error) {
2079 mount_iterreset(mp);
2080 mount_lock(mp);
2081 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2082 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2083 mp->mnt_lflag &= ~MNT_LFORCE;
2084 goto out;
2085 }
2086
2087 /* increment the operations count */
2088 if (!error)
2089 OSAddAtomic(1, &vfs_nummntops);
2090
2091 if ( mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
2092 /* hold an io reference and drop the usecount before close */
2093 devvp = mp->mnt_devvp;
2094 vnode_getalways(devvp);
2095 vnode_rele(devvp);
2096 VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD|FWRITE,
2097 ctx);
2098 vnode_clearmountedon(devvp);
2099 vnode_put(devvp);
2100 }
2101 lck_rw_done(&mp->mnt_rwlock);
2102 mount_list_remove(mp);
2103 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2104
2105	/* mark the mount point hook in the vp but do not drop the ref yet */
2106 if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
2107 /*
2108 * The covered vnode needs special handling. Trying to get an
2109 * iocount must not block here as this may lead to deadlocks
2110 * if the Filesystem to which the covered vnode belongs is
2111 * undergoing forced unmounts. Since we hold a usecount, the
2112 * vnode cannot be reused (it can, however, still be terminated)
2113 */
2114 vnode_getalways(coveredvp);
2115 vnode_lock_spin(coveredvp);
2116
2117 mp->mnt_crossref++;
2118 coveredvp->v_mountedhere = (struct mount *)0;
2119 CLR(coveredvp->v_flag, VMOUNT);
2120
2121 vnode_unlock(coveredvp);
2122 vnode_put(coveredvp);
2123 }
2124
2125 mount_list_lock();
2126 mp->mnt_vtable->vfc_refcount--;
2127 mount_list_unlock();
2128
2129 cache_purgevfs(mp); /* remove cache entries for this file sys */
2130 vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
2131 mount_lock(mp);
2132 mp->mnt_lflag |= MNT_LDEAD;
2133
2134 if (mp->mnt_lflag & MNT_LWAIT) {
2135 /*
2136 * do the wakeup here
2137 * in case we block in mount_refdrain
2138 * which will drop the mount lock
2139 * and allow anyone blocked in vfs_busy
2140 * to wakeup and see the LDEAD state
2141 */
2142 mp->mnt_lflag &= ~MNT_LWAIT;
2143 wakeup((caddr_t)mp);
2144 }
2145 mount_refdrain(mp);
2146out:
2147 if (mp->mnt_lflag & MNT_LWAIT) {
2148 mp->mnt_lflag &= ~MNT_LWAIT;
2149 needwakeup = 1;
2150 }
2151
2152#if CONFIG_TRIGGERS
2153 if (flags & MNT_NOBLOCK && p != kernproc) {
2154 // Restore P_NOREMOTEHANG bit to its previous value
2155 if ((pflags_save & P_NOREMOTEHANG) == 0)
2156 OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
2157 }
2158
2159 /*
2160 * Callback and context are set together under the mount lock, and
2161 * never cleared, so we're safe to examine them here, drop the lock,
2162 * and call out.
2163 */
2164 if (mp->mnt_triggercallback != NULL) {
2165 mount_unlock(mp);
2166 if (error == 0) {
2167 mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
2168 } else if (did_vflush) {
2169 mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
2170 }
2171 } else {
2172 mount_unlock(mp);
2173 }
2174#else
2175 mount_unlock(mp);
2176#endif /* CONFIG_TRIGGERS */
2177
2178 lck_rw_done(&mp->mnt_rwlock);
2179
2180 if (needwakeup)
2181 wakeup((caddr_t)mp);
2182
2183 if (!error) {
2184 if ((coveredvp != NULLVP)) {
2185 vnode_t pvp = NULLVP;
2186
2187 /*
2188 * The covered vnode needs special handling. Trying to
2189 * get an iocount must not block here as this may lead
2190 * to deadlocks if the Filesystem to which the covered
2191 * vnode belongs is undergoing forced unmounts. Since we
2192 * hold a usecount, the vnode cannot be reused
2193 * (it can, however, still be terminated).
2194 */
2195 vnode_getalways(coveredvp);
2196
2197 mount_dropcrossref(mp, coveredvp, 0);
2198 /*
2199 * We'll _try_ to detect if this really needs to be
2200 * done. The coveredvp can only be in termination (or
2201 * terminated) if the coveredvp's mount point is in a
2202 * forced unmount (or has been) since we still hold the
2203 * ref.
2204 */
2205 if (!vnode_isrecycled(coveredvp)) {
2206 pvp = vnode_getparent(coveredvp);
2207#if CONFIG_TRIGGERS
2208 if (coveredvp->v_resolve) {
2209 vnode_trigger_rearm(coveredvp, ctx);
2210 }
2211#endif
2212 }
2213
2214 vnode_rele(coveredvp);
2215 vnode_put(coveredvp);
2216 coveredvp = NULLVP;
2217
2218 if (pvp) {
2219 lock_vnode_and_post(pvp, NOTE_WRITE);
2220 vnode_put(pvp);
2221 }
2222 } else if (mp->mnt_flag & MNT_ROOTFS) {
2223 mount_lock_destroy(mp);
2224#if CONFIG_MACF
2225 mac_mount_label_destroy(mp);
2226#endif
2227 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
2228 } else
2229 panic("dounmount: no coveredvp");
2230 }
2231 return (error);
2232}
2233
2234/*
2235 * Unmount any submounts of this filesystem. Errors are ignored.
2236 */
2237void
2238dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
2239{
2240 mount_t smp;
2241 fsid_t *fsids, fsid;
2242 int fsids_sz;
2243 int count = 0, i, m = 0;
2244 vnode_t vp;
2245
2246 mount_list_lock();
2247
2248 // Get an array to hold the submounts fsids.
2249 TAILQ_FOREACH(smp, &mountlist, mnt_list)
2250 count++;
2251 fsids_sz = count * sizeof(fsid_t);
2252 MALLOC(fsids, fsid_t *, fsids_sz, M_TEMP, M_NOWAIT);
2253 if (fsids == NULL) {
2254 mount_list_unlock();
2255 goto out;
2256 }
2257 fsids[0] = mp->mnt_vfsstat.f_fsid; // Prime the pump
2258
2259 /*
2260 * Fill the array with submount fsids.
2261 * Since mounts are always added to the tail of the mount list, the
2262 * list is always in mount order.
2263 * For each mount check if the mounted-on vnode belongs to a
2264 * mount that's already added to our array of mounts to be unmounted.
2265 */
2266 for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
2267 vp = smp->mnt_vnodecovered;
2268 if (vp == NULL)
2269 continue;
2270 fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid; // Underlying fsid
2271 for (i = 0; i <= m; i++) {
2272 if (fsids[i].val[0] == fsid.val[0] &&
2273 fsids[i].val[1] == fsid.val[1]) {
2274 fsids[++m] = smp->mnt_vfsstat.f_fsid;
2275 break;
2276 }
2277 }
2278 }
2279 mount_list_unlock();
2280
2281 // Unmount the submounts in reverse order. Ignore errors.
2282 for (i = m; i > 0; i--) {
2283 smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
2284 if (smp) {
2285 mount_ref(smp, 0);
2286 mount_iterdrop(smp);
2287 (void) dounmount(smp, flags, 1, ctx);
2288 }
2289 }
2290out:
2291 if (fsids)
2292 FREE(fsids, M_TEMP);
2293}
2294
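/*
 * Drop one cross reference on the mount, taken in dounmount() while the
 * covered vnode was being detached.  When the last cross ref goes away and
 * the vnode no longer points at this mount, the mount structure itself is
 * destroyed and freed.
 */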
2295void
2296mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
2297{
2298 vnode_lock(dp);
2299 mp->mnt_crossref--;
2300
2301 if (mp->mnt_crossref < 0)
2302 panic("mount cross refs -ve");
2303
2304 if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
2305
2306 if (need_put)
2307 vnode_put_locked(dp);
2308 vnode_unlock(dp);
2309
2310 mount_lock_destroy(mp);
2311#if CONFIG_MACF
2312 mac_mount_label_destroy(mp);
2313#endif
2314 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
2315 return;
2316 }
2317 if (need_put)
2318 vnode_put_locked(dp);
2319 vnode_unlock(dp);
2320}
2321
2322
2323/*
2324 * Sync each mounted filesystem.
2325 */
2326#if DIAGNOSTIC
2327int syncprt = 0;
2328#endif
2329
2330int print_vmpage_stat=0;
2331
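/*
 * vfs_iterate() callback: push dirty data for one writable mount.
 * MNT_ASYNC is cleared for the duration of the VFS_SYNC call so the sync
 * is not done lazily, then restored.  A non-NULL arg requests a waiting
 * (MNT_WAIT) sync.
 */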
2332static int
2333sync_callback(mount_t mp, __unused void *arg)
2334{
2335 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2336 int asyncflag = mp->mnt_flag & MNT_ASYNC;
2337
2338 mp->mnt_flag &= ~MNT_ASYNC;
2339 VFS_SYNC(mp, arg ? MNT_WAIT : MNT_NOWAIT, vfs_context_kernel());
2340 if (asyncflag)
2341 mp->mnt_flag |= MNT_ASYNC;
2342 }
2343
2344 return (VFS_RETURNED);
2345}
2346
2347/* ARGSUSED */
2348int
2349sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
2350{
2351 vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2352
2353 if (print_vmpage_stat) {
2354 vm_countdirtypages();
2355 }
2356
2357#if DIAGNOSTIC
2358 if (syncprt)
2359 vfs_bufstats();
2360#endif /* DIAGNOSTIC */
2361 return 0;
2362}
2363
2364typedef enum {
2365 SYNC_ALL = 0,
2366 SYNC_ONLY_RELIABLE_MEDIA = 1,
2367 SYNC_ONLY_UNRELIABLE_MEDIA = 2
2368} sync_type_t;
2369
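/*
 * vfs_iterate() callback used by sync_thread().  When a sync_type_t is
 * passed in arg, mounts are filtered so that reliable media (local,
 * non-virtual devices) and unreliable media can be synced in separate
 * passes; otherwise every mount is synced via sync_callback().
 */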
2370static int
2371sync_internal_callback(mount_t mp, void *arg)
2372{
2373 if (arg) {
2374 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
2375 (mp->mnt_flag & MNT_LOCAL);
2376 sync_type_t sync_type = *((sync_type_t *)arg);
2377
2378 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable)
2379 return (VFS_RETURNED);
2380		else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable)
2381 return (VFS_RETURNED);
2382 }
2383
2384 (void)sync_callback(mp, NULL);
2385
2386 return (VFS_RETURNED);
2387}
2388
2389int sync_thread_state = 0;
2390int sync_timeout_seconds = 5;
2391
2392#define SYNC_THREAD_RUN 0x0001
2393#define SYNC_THREAD_RUNNING 0x0002
2394
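/*
 * Body of the kernel thread started by sync_internal().  It keeps syncing
 * as long as SYNC_THREAD_RUN is set (re-requests can arrive while a pass
 * is in progress), syncing reliable media first and unreliable media
 * second, and wakes any waiter in sync_internal() before it exits.
 */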
2395static void
2396sync_thread(__unused void *arg, __unused wait_result_t wr)
2397{
2398 sync_type_t sync_type;
2399
2400 lck_mtx_lock(sync_mtx_lck);
2401 while (sync_thread_state & SYNC_THREAD_RUN) {
2402 sync_thread_state &= ~SYNC_THREAD_RUN;
2403 lck_mtx_unlock(sync_mtx_lck);
2404
2405 sync_type = SYNC_ONLY_RELIABLE_MEDIA;
2406 vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
2407 sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
2408 vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
2409
2410 lck_mtx_lock(sync_mtx_lck);
2411 }
2412 /*
2413 * This wakeup _has_ to be issued before the lock is released otherwise
2414 * we may end up waking up a thread in sync_internal which is
2415 * expecting a wakeup from a thread it just created and not from this
2416 * thread which is about to exit.
2417 */
2418 wakeup(&sync_thread_state);
2419 sync_thread_state &= ~SYNC_THREAD_RUNNING;
2420 lck_mtx_unlock(sync_mtx_lck);
2421
2422 if (print_vmpage_stat) {
2423 vm_countdirtypages();
2424 }
2425
2426#if DIAGNOSTIC
2427 if (syncprt)
2428 vfs_bufstats();
2429#endif /* DIAGNOSTIC */
2430}
2431
2432struct timeval sync_timeout_last_print = {0, 0};
2433
2434/*
2435 * An in-kernel sync for power management to call.
2436 * This function always returns within sync_timeout_seconds seconds.
2437 */
2438__private_extern__ int
2439sync_internal(void)
2440{
2441 thread_t thd;
2442 int error;
2443 int thread_created = FALSE;
2444 struct timespec ts = {sync_timeout_seconds, 0};
2445
2446 lck_mtx_lock(sync_mtx_lck);
2447 sync_thread_state |= SYNC_THREAD_RUN;
2448 if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
2449 int kr;
2450
2451 sync_thread_state |= SYNC_THREAD_RUNNING;
2452 kr = kernel_thread_start(sync_thread, NULL, &thd);
2453 if (kr != KERN_SUCCESS) {
2454 sync_thread_state &= ~SYNC_THREAD_RUNNING;
2455 lck_mtx_unlock(sync_mtx_lck);
2456 printf("sync_thread failed\n");
2457 return (0);
2458 }
2459 thread_created = TRUE;
2460 }
2461
2462 error = msleep((caddr_t)&sync_thread_state, sync_mtx_lck,
2463 (PVFS | PDROP | PCATCH), "sync_thread", &ts);
2464 if (error) {
2465 struct timeval now;
2466
2467 microtime(&now);
2468 if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
2469 printf("sync timed out: %d sec\n", sync_timeout_seconds);
2470 sync_timeout_last_print.tv_sec = now.tv_sec;
2471 }
2472 }
2473
2474 if (thread_created)
2475 thread_deallocate(thd);
2476
2477 return (0);
2478} /* end of sync_internal call */
2479
2480/*
2481 * Change filesystem quotas.
2482 */
2483#if QUOTA
2484int
2485quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
2486{
2487 struct mount *mp;
2488 int error, quota_cmd, quota_status = 0;
2489 caddr_t datap;
2490 size_t fnamelen;
2491 struct nameidata nd;
2492 vfs_context_t ctx = vfs_context_current();
2493 struct dqblk my_dqblk = {};
2494
2495 AUDIT_ARG(uid, uap->uid);
2496 AUDIT_ARG(cmd, uap->cmd);
2497 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
2498 uap->path, ctx);
2499 error = namei(&nd);
2500 if (error)
2501 return (error);
2502 mp = nd.ni_vp->v_mount;
2503 vnode_put(nd.ni_vp);
2504 nameidone(&nd);
2505
2506 /* copyin any data we will need for downstream code */
2507 quota_cmd = uap->cmd >> SUBCMDSHIFT;
2508
2509 switch (quota_cmd) {
2510 case Q_QUOTAON:
2511 /* uap->arg specifies a file from which to take the quotas */
2512 fnamelen = MAXPATHLEN;
2513 datap = kalloc(MAXPATHLEN);
2514 error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
2515 break;
2516 case Q_GETQUOTA:
2517 /* uap->arg is a pointer to a dqblk structure. */
2518 datap = (caddr_t) &my_dqblk;
2519 break;
2520 case Q_SETQUOTA:
2521 case Q_SETUSE:
2522 /* uap->arg is a pointer to a dqblk structure. */
2523 datap = (caddr_t) &my_dqblk;
2524 if (proc_is64bit(p)) {
2525 struct user_dqblk my_dqblk64;
2526 error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof (my_dqblk64));
2527 if (error == 0) {
2528 munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
2529 }
2530 }
2531 else {
2532 error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof (my_dqblk));
2533 }
2534 break;
2535 case Q_QUOTASTAT:
2536 /* uap->arg is a pointer to an integer */
2537 datap = (caddr_t) &quota_status;
2538 break;
2539 default:
2540 datap = NULL;
2541 break;
2542 } /* switch */
2543
2544 if (error == 0) {
2545 error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
2546 }
2547
2548 switch (quota_cmd) {
2549 case Q_QUOTAON:
2550 if (datap != NULL)
2551 kfree(datap, MAXPATHLEN);
2552 break;
2553 case Q_GETQUOTA:
2554 /* uap->arg is a pointer to a dqblk structure we need to copy out to */
2555 if (error == 0) {
2556 if (proc_is64bit(p)) {
2557 struct user_dqblk my_dqblk64;
2558
2559 memset(&my_dqblk64, 0, sizeof(my_dqblk64));
2560 munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
2561 error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof (my_dqblk64));
2562 }
2563 else {
2564 error = copyout(datap, uap->arg, sizeof (struct dqblk));
2565 }
2566 }
2567 break;
2568 case Q_QUOTASTAT:
2569 /* uap->arg is a pointer to an integer */
2570 if (error == 0) {
2571 error = copyout(datap, uap->arg, sizeof(quota_status));
2572 }
2573 break;
2574 default:
2575 break;
2576 } /* switch */
2577
2578 return (error);
2579}
2580#else
2581int
2582quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
2583{
2584 return (EOPNOTSUPP);
2585}
2586#endif /* QUOTA */
2587
2588/*
2589 * Get filesystem statistics.
2590 *
2591 * Returns: 0 Success
2592 * namei:???
2593 * vfs_update_vfsstat:???
2594 * munge_statfs:EFAULT
2595 */
2596/* ARGSUSED */
2597int
2598statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
2599{
2600 struct mount *mp;
2601 struct vfsstatfs *sp;
2602 int error;
2603 struct nameidata nd;
2604 vfs_context_t ctx = vfs_context_current();
2605 vnode_t vp;
2606
2607 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2608 UIO_USERSPACE, uap->path, ctx);
2609 error = namei(&nd);
2610 if (error != 0)
2611 return (error);
2612 vp = nd.ni_vp;
2613 mp = vp->v_mount;
2614 sp = &mp->mnt_vfsstat;
2615 nameidone(&nd);
2616
2617#if CONFIG_MACF
2618 error = mac_mount_check_stat(ctx, mp);
2619	if (error != 0) {
		vnode_put(vp);
2620		return (error);
	}
2621#endif
2622
2623 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
2624 if (error != 0) {
2625 vnode_put(vp);
2626 return (error);
2627 }
2628
2629 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2630 vnode_put(vp);
2631 return (error);
2632}
2633
2634/*
2635 * Get filesystem statistics.
2636 */
2637/* ARGSUSED */
2638int
2639fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
2640{
2641 vnode_t vp;
2642 struct mount *mp;
2643 struct vfsstatfs *sp;
2644 int error;
2645
2646 AUDIT_ARG(fd, uap->fd);
2647
2648 if ( (error = file_vnode(uap->fd, &vp)) )
2649 return (error);
2650
2651 error = vnode_getwithref(vp);
2652 if (error) {
2653 file_drop(uap->fd);
2654 return (error);
2655 }
2656
2657 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
2658
2659 mp = vp->v_mount;
2660 if (!mp) {
2661 error = EBADF;
2662 goto out;
2663 }
2664
2665#if CONFIG_MACF
2666 error = mac_mount_check_stat(vfs_context_current(), mp);
2667 if (error != 0)
2668 goto out;
2669#endif
2670
2671 sp = &mp->mnt_vfsstat;
2672 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
2673 goto out;
2674 }
2675
2676 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2677
2678out:
2679 file_drop(uap->fd);
2680 vnode_put(vp);
2681
2682 return (error);
2683}
2684
2685/*
2686 * Common routine to handle copying of statfs64 data to user space
2687 */
2688static int
2689statfs64_common(struct mount *mp, struct vfsstatfs *sfsp, user_addr_t bufp)
2690{
2691 int error;
2692 struct statfs64 sfs;
2693
2694 bzero(&sfs, sizeof(sfs));
2695
2696 sfs.f_bsize = sfsp->f_bsize;
2697 sfs.f_iosize = (int32_t)sfsp->f_iosize;
2698 sfs.f_blocks = sfsp->f_blocks;
2699 sfs.f_bfree = sfsp->f_bfree;
2700 sfs.f_bavail = sfsp->f_bavail;
2701 sfs.f_files = sfsp->f_files;
2702 sfs.f_ffree = sfsp->f_ffree;
2703 sfs.f_fsid = sfsp->f_fsid;
2704 sfs.f_owner = sfsp->f_owner;
2705 sfs.f_type = mp->mnt_vtable->vfc_typenum;
2706 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
2707 sfs.f_fssubtype = sfsp->f_fssubtype;
2708 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
2709 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
2710 } else {
2711 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSTYPENAMELEN);
2712 }
2713 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MAXPATHLEN);
2714 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MAXPATHLEN);
2715
2716 error = copyout((caddr_t)&sfs, bufp, sizeof(sfs));
2717
2718 return(error);
2719}
2720
2721/*
2722 * Get file system statistics in 64-bit mode
2723 */
2724int
2725statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
2726{
2727 struct mount *mp;
2728 struct vfsstatfs *sp;
2729 int error;
2730 struct nameidata nd;
2731 vfs_context_t ctxp = vfs_context_current();
2732 vnode_t vp;
2733
2734 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2735 UIO_USERSPACE, uap->path, ctxp);
2736 error = namei(&nd);
2737 if (error != 0)
2738 return (error);
2739 vp = nd.ni_vp;
2740 mp = vp->v_mount;
2741 sp = &mp->mnt_vfsstat;
2742 nameidone(&nd);
2743
2744#if CONFIG_MACF
2745 error = mac_mount_check_stat(ctxp, mp);
2746	if (error != 0) {
		vnode_put(vp);
2747		return (error);
	}
2748#endif
2749
2750 error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
2751 if (error != 0) {
2752 vnode_put(vp);
2753 return (error);
2754 }
2755
2756 error = statfs64_common(mp, sp, uap->buf);
2757 vnode_put(vp);
2758
2759 return (error);
2760}
2761
2762/*
2763 * Get file system statistics in 64-bit mode
2764 */
2765int
2766fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
2767{
2768 struct vnode *vp;
2769 struct mount *mp;
2770 struct vfsstatfs *sp;
2771 int error;
2772
2773 AUDIT_ARG(fd, uap->fd);
2774
2775 if ( (error = file_vnode(uap->fd, &vp)) )
2776 return (error);
2777
2778 error = vnode_getwithref(vp);
2779 if (error) {
2780 file_drop(uap->fd);
2781 return (error);
2782 }
2783
2784 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
2785
2786 mp = vp->v_mount;
2787 if (!mp) {
2788 error = EBADF;
2789 goto out;
2790 }
2791
2792#if CONFIG_MACF
2793 error = mac_mount_check_stat(vfs_context_current(), mp);
2794 if (error != 0)
2795 goto out;
2796#endif
2797
2798 sp = &mp->mnt_vfsstat;
2799 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
2800 goto out;
2801 }
2802
2803 error = statfs64_common(mp, sp, uap->buf);
2804
2805out:
2806 file_drop(uap->fd);
2807 vnode_put(vp);
2808
2809 return (error);
2810}
2811
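/*
 * Per-call state threaded through the getfsstat iteration callbacks:
 * sfsp walks the user statfs buffer, mp walks the optional array of MAC
 * label pointers, count/maxcount track how many entries have been produced
 * and how many fit, and error records a failure that aborts the iteration.
 */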
2812struct getfsstat_struct {
2813 user_addr_t sfsp;
2814 user_addr_t *mp;
2815 int count;
2816 int maxcount;
2817 int flags;
2818 int error;
2819};
2820
2821
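/*
 * vfs_iterate() callback for getfsstat()/__mac_getfsstat(): for each mount
 * that still fits in the user buffer, optionally refresh the cached
 * vfsstat, copy out a (32- or 64-bit) struct statfs via munge_statfs(),
 * and copy out the MAC label if one was requested.  Mounts are counted
 * even when nothing is copied out, so a caller that passed no buffer
 * learns how many mounts there are.
 */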
2822static int
2823getfsstat_callback(mount_t mp, void * arg)
2824{
2825
2826 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
2827 struct vfsstatfs *sp;
2828 int error, my_size;
2829 vfs_context_t ctx = vfs_context_current();
2830
2831 if (fstp->sfsp && fstp->count < fstp->maxcount) {
2832#if CONFIG_MACF
2833 error = mac_mount_check_stat(ctx, mp);
2834 if (error != 0) {
2835 fstp->error = error;
2836 return(VFS_RETURNED_DONE);
2837 }
2838#endif
2839 sp = &mp->mnt_vfsstat;
2840 /*
2841 * If MNT_NOWAIT is specified, do not refresh the
2842 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
2843 */
2844 if (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
2845 (error = vfs_update_vfsstat(mp, ctx,
2846 VFS_USER_EVENT))) {
2847 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
2848 return(VFS_RETURNED);
2849 }
2850
2851 /*
2852 * Need to handle LP64 version of struct statfs
2853 */
2854 error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
2855 if (error) {
2856 fstp->error = error;
2857 return(VFS_RETURNED_DONE);
2858 }
2859 fstp->sfsp += my_size;
2860
2861 if (fstp->mp) {
2862#if CONFIG_MACF
2863 error = mac_mount_label_get(mp, *fstp->mp);
2864 if (error) {
2865 fstp->error = error;
2866 return(VFS_RETURNED_DONE);
2867 }
2868#endif
2869 fstp->mp++;
2870 }
2871 }
2872 fstp->count++;
2873 return(VFS_RETURNED);
2874}
2875
2876/*
2877 * Get statistics on all filesystems.
2878 */
2879int
2880getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
2881{
2882 struct __mac_getfsstat_args muap;
2883
2884 muap.buf = uap->buf;
2885 muap.bufsize = uap->bufsize;
2886 muap.mac = USER_ADDR_NULL;
2887 muap.macsize = 0;
2888 muap.flags = uap->flags;
2889
2890 return (__mac_getfsstat(p, &muap, retval));
2891}
2892
2893/*
2894 * __mac_getfsstat: Get MAC-related file system statistics
2895 *
2896 * Parameters: p (ignored)
2897 * uap User argument descriptor (see below)
2898 * retval Count of file system statistics (N stats)
2899 *
2900 * Indirect: uap->bufsize Buffer size
2901 * uap->macsize MAC info size
2902 * uap->buf Buffer where information will be returned
2903 * uap->mac MAC info
2904 * uap->flags File system flags
2905 *
2906 *
2907 * Returns: 0 Success
2908 * !0 Not success
2909 *
2910 */
2911int
2912__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
2913{
2914 user_addr_t sfsp;
2915 user_addr_t *mp;
2916 size_t count, maxcount, bufsize, macsize;
2917 struct getfsstat_struct fst;
2918
2919 bufsize = (size_t) uap->bufsize;
2920 macsize = (size_t) uap->macsize;
2921
2922 if (IS_64BIT_PROCESS(p)) {
2923 maxcount = bufsize / sizeof(struct user64_statfs);
2924 }
2925 else {
2926 maxcount = bufsize / sizeof(struct user32_statfs);
2927 }
2928 sfsp = uap->buf;
2929 count = 0;
2930
2931 mp = NULL;
2932
2933#if CONFIG_MACF
2934 if (uap->mac != USER_ADDR_NULL) {
2935 u_int32_t *mp0;
2936 int error;
2937 unsigned int i;
2938
2939 count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
2940 if (count != maxcount)
2941 return (EINVAL);
2942
2943 /* Copy in the array */
2944 MALLOC(mp0, u_int32_t *, macsize, M_MACTEMP, M_WAITOK);
2945 if (mp0 == NULL) {
2946 return (ENOMEM);
2947 }
2948
2949 error = copyin(uap->mac, mp0, macsize);
2950 if (error) {
2951 FREE(mp0, M_MACTEMP);
2952 return (error);
2953 }
2954
2955 /* Normalize to an array of user_addr_t */
2956 MALLOC(mp, user_addr_t *, count * sizeof(user_addr_t), M_MACTEMP, M_WAITOK);
2957 if (mp == NULL) {
2958 FREE(mp0, M_MACTEMP);
2959 return (ENOMEM);
2960 }
2961
2962 for (i = 0; i < count; i++) {
2963 if (IS_64BIT_PROCESS(p))
2964 mp[i] = ((user_addr_t *)mp0)[i];
2965 else
2966 mp[i] = (user_addr_t)mp0[i];
2967 }
2968 FREE(mp0, M_MACTEMP);
2969 }
2970#endif
2971
2972
2973 fst.sfsp = sfsp;
2974 fst.mp = mp;
2975 fst.flags = uap->flags;
2976 fst.count = 0;
2977 fst.error = 0;
2978 fst.maxcount = maxcount;
2979
2980
2981 vfs_iterate(0, getfsstat_callback, &fst);
2982
2983 if (mp)
2984 FREE(mp, M_MACTEMP);
2985
2986 if (fst.error ) {
2987 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
2988 return(fst.error);
2989 }
2990
2991 if (fst.sfsp && fst.count > fst.maxcount)
2992 *retval = fst.maxcount;
2993 else
2994 *retval = fst.count;
2995 return (0);
2996}
2997
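/*
 * 64-bit variant of getfsstat_callback(): same iteration logic, but the
 * entries copied out are fixed-size struct statfs64 produced by
 * statfs64_common(), and no MAC labels are returned.
 */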
2998static int
2999getfsstat64_callback(mount_t mp, void * arg)
3000{
3001 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3002 struct vfsstatfs *sp;
3003 int error;
3004
3005 if (fstp->sfsp && fstp->count < fstp->maxcount) {
3006#if CONFIG_MACF
3007 error = mac_mount_check_stat(vfs_context_current(), mp);
3008 if (error != 0) {
3009 fstp->error = error;
3010 return(VFS_RETURNED_DONE);
3011 }
3012#endif
3013 sp = &mp->mnt_vfsstat;
3014 /*
3015 * If MNT_NOWAIT is specified, do not refresh the fsstat
3016 * cache. MNT_WAIT overrides MNT_NOWAIT.
3017 *
3018 * We treat MNT_DWAIT as MNT_WAIT for all instances of
3019 * getfsstat, since the constants are out of the same
3020 * namespace.
3021 */
3022 if (((fstp->flags & MNT_NOWAIT) == 0 ||
3023 (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3024 (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT))) {
3025 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3026 return(VFS_RETURNED);
3027 }
3028
3029 error = statfs64_common(mp, sp, fstp->sfsp);
3030 if (error) {
3031 fstp->error = error;
3032 return(VFS_RETURNED_DONE);
3033 }
3034 fstp->sfsp += sizeof(struct statfs64);
3035 }
3036 fstp->count++;
3037 return(VFS_RETURNED);
3038}
3039
3040/*
3041 * Get statistics on all file systems in 64 bit mode.
3042 */
3043int
3044getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3045{
3046 user_addr_t sfsp;
3047 int count, maxcount;
3048 struct getfsstat_struct fst;
3049
3050 maxcount = uap->bufsize / sizeof(struct statfs64);
3051
3052 sfsp = uap->buf;
3053 count = 0;
3054
3055 fst.sfsp = sfsp;
3056 fst.flags = uap->flags;
3057 fst.count = 0;
3058 fst.error = 0;
3059 fst.maxcount = maxcount;
3060
3061 vfs_iterate(0, getfsstat64_callback, &fst);
3062
3063 if (fst.error ) {
3064 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3065 return(fst.error);
3066 }
3067
3068 if (fst.sfsp && fst.count > fst.maxcount)
3069 *retval = fst.maxcount;
3070 else
3071 *retval = fst.count;
3072
3073 return (0);
3074}
3075
3076/*
3077 * Gets the vnode associated with the file descriptor passed
3078 * as input.
3079 *
3080 * INPUT
3081 * ctx - vfs context of caller
3082 * fd - file descriptor for which vnode is required.
3083 * vpp - Pointer to pointer to vnode to be returned.
3084 *
3085 * The vnode is returned with an iocount so any vnode obtained
3086 * by this call needs a vnode_put
3087 *
3088 */
3089int
3090vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3091{
3092 int error;
3093 vnode_t vp;
3094 struct fileproc *fp;
3095 proc_t p = vfs_context_proc(ctx);
3096
3097 *vpp = NULLVP;
3098
3099 error = fp_getfvp(p, fd, &fp, &vp);
3100 if (error)
3101 return (error);
3102
3103 error = vnode_getwithref(vp);
3104 if (error) {
3105 (void)fp_drop(p, fd, fp, 0);
3106 return (error);
3107 }
3108
3109 (void)fp_drop(p, fd, fp, 0);
3110 *vpp = vp;
3111 return (error);
3112}
3113
3114/*
3115 * Wrapper function around namei to start lookup from a directory
3116 * specified by a file descriptor ni_dirfd.
3117 *
3118 * In addition to all the errors returned by namei, this call can
3119 * return ENOTDIR if the file descriptor does not refer to a directory,
3120 * and EBADF if the file descriptor is not valid.
3121 */
3122int
3123nameiat(struct nameidata *ndp, int dirfd)
3124{
3125 if ((dirfd != AT_FDCWD) &&
3126 !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
3127 !(ndp->ni_cnd.cn_flags & USEDVP)) {
3128 int error = 0;
3129 char c;
3130
3131 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
3132 error = copyin(ndp->ni_dirp, &c, sizeof(char));
3133 if (error)
3134 return (error);
3135 } else {
3136 c = *((char *)(ndp->ni_dirp));
3137 }
3138
3139 if (c != '/') {
3140 vnode_t dvp_at;
3141
3142 error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
3143 &dvp_at);
3144 if (error)
3145 return (error);
3146
3147 if (vnode_vtype(dvp_at) != VDIR) {
3148 vnode_put(dvp_at);
3149 return (ENOTDIR);
3150 }
3151
3152 ndp->ni_dvp = dvp_at;
3153 ndp->ni_cnd.cn_flags |= USEDVP;
3154 error = namei(ndp);
3155 ndp->ni_cnd.cn_flags &= ~USEDVP;
3156 vnode_put(dvp_at);
3157 return (error);
3158 }
3159 }
3160
3161 return (namei(ndp));
3162}
3163
3164/*
3165 * Change current working directory to a given file descriptor.
3166 */
3167/* ARGSUSED */
3168static int
3169common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
3170{
3171 struct filedesc *fdp = p->p_fd;
3172 vnode_t vp;
3173 vnode_t tdp;
3174 vnode_t tvp;
3175 struct mount *mp;
3176 int error;
3177 vfs_context_t ctx = vfs_context_current();
3178
3179 AUDIT_ARG(fd, uap->fd);
3180 if (per_thread && uap->fd == -1) {
3181 /*
3182 * Switching back from per-thread to per process CWD; verify we
3183 * in fact have one before proceeding. The only success case
3184 * for this code path is to return 0 preemptively after zapping
3185 * the thread structure contents.
3186 */
3187 thread_t th = vfs_context_thread(ctx);
3188 if (th) {
3189 uthread_t uth = get_bsdthread_info(th);
3190 tvp = uth->uu_cdir;
3191 uth->uu_cdir = NULLVP;
3192 if (tvp != NULLVP) {
3193 vnode_rele(tvp);
3194 return (0);
3195 }
3196 }
3197 return (EBADF);
3198 }
3199
3200 if ( (error = file_vnode(uap->fd, &vp)) )
3201 return(error);
3202 if ( (error = vnode_getwithref(vp)) ) {
3203 file_drop(uap->fd);
3204 return(error);
3205 }
3206
3207 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
3208
3209 if (vp->v_type != VDIR) {
3210 error = ENOTDIR;
3211 goto out;
3212 }
3213
3214#if CONFIG_MACF
3215 error = mac_vnode_check_chdir(ctx, vp);
3216 if (error)
3217 goto out;
3218#endif
3219 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3220 if (error)
3221 goto out;
3222
3223 while (!error && (mp = vp->v_mountedhere) != NULL) {
3224 if (vfs_busy(mp, LK_NOWAIT)) {
3225 error = EACCES;
3226 goto out;
3227 }
3228 error = VFS_ROOT(mp, &tdp, ctx);
3229 vfs_unbusy(mp);
3230 if (error)
3231 break;
3232 vnode_put(vp);
3233 vp = tdp;
3234 }
3235 if (error)
3236 goto out;
3237 if ( (error = vnode_ref(vp)) )
3238 goto out;
3239 vnode_put(vp);
3240
3241 if (per_thread) {
3242 thread_t th = vfs_context_thread(ctx);
3243 if (th) {
3244 uthread_t uth = get_bsdthread_info(th);
3245 tvp = uth->uu_cdir;
3246 uth->uu_cdir = vp;
3247 OSBitOrAtomic(P_THCWD, &p->p_flag);
3248 } else {
3249 vnode_rele(vp);
3250 return (ENOENT);
3251 }
3252 } else {
3253 proc_fdlock(p);
3254 tvp = fdp->fd_cdir;
3255 fdp->fd_cdir = vp;
3256 proc_fdunlock(p);
3257 }
3258
3259 if (tvp)
3260 vnode_rele(tvp);
3261 file_drop(uap->fd);
3262
3263 return (0);
3264out:
3265 vnode_put(vp);
3266 file_drop(uap->fd);
3267
3268 return(error);
3269}
3270
3271int
3272fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
3273{
3274 return common_fchdir(p, uap, 0);
3275}
3276
3277int
3278__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
3279{
3280 return common_fchdir(p, (void *)uap, 1);
3281}
3282
3283/*
3284 * Change current working directory (".").
3285 *
3286 * Returns: 0 Success
3287 * change_dir:ENOTDIR
3288 * change_dir:???
3289 * vnode_ref:ENOENT No such file or directory
3290 */
3291/* ARGSUSED */
3292static int
3293common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
3294{
3295 struct filedesc *fdp = p->p_fd;
3296 int error;
3297 struct nameidata nd;
3298 vnode_t tvp;
3299 vfs_context_t ctx = vfs_context_current();
3300
3301 NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
3302 UIO_USERSPACE, uap->path, ctx);
3303 error = change_dir(&nd, ctx);
3304 if (error)
3305 return (error);
3306 if ( (error = vnode_ref(nd.ni_vp)) ) {
3307 vnode_put(nd.ni_vp);
3308 return (error);
3309 }
3310 /*
3311 * drop the iocount we picked up in change_dir
3312 */
3313 vnode_put(nd.ni_vp);
3314
3315 if (per_thread) {
3316 thread_t th = vfs_context_thread(ctx);
3317 if (th) {
3318 uthread_t uth = get_bsdthread_info(th);
3319 tvp = uth->uu_cdir;
3320 uth->uu_cdir = nd.ni_vp;
3321 OSBitOrAtomic(P_THCWD, &p->p_flag);
3322 } else {
3323 vnode_rele(nd.ni_vp);
3324 return (ENOENT);
3325 }
3326 } else {
3327 proc_fdlock(p);
3328 tvp = fdp->fd_cdir;
3329 fdp->fd_cdir = nd.ni_vp;
3330 proc_fdunlock(p);
3331 }
3332
3333 if (tvp)
3334 vnode_rele(tvp);
3335
3336 return (0);
3337}
3338
3339
3340/*
3341 * chdir
3342 *
3343 * Change current working directory (".") for the entire process
3344 *
3345 * Parameters: p Process requesting the call
3346 * uap User argument descriptor (see below)
3347 * retval (ignored)
3348 *
3349 * Indirect parameters: uap->path Directory path
3350 *
3351 * Returns: 0 Success
3352 * common_chdir: ENOTDIR
3353 * common_chdir: ENOENT No such file or directory
3354 * common_chdir: ???
3355 *
3356 */
3357int
3358chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
3359{
3360 return common_chdir(p, (void *)uap, 0);
3361}
3362
3363/*
3364 * __pthread_chdir
3365 *
3366 * Change current working directory (".") for a single thread
3367 *
3368 * Parameters: p Process requesting the call
3369 * uap User argument descriptor (see below)
3370 * retval (ignored)
3371 *
3372 * Indirect parameters: uap->path Directory path
3373 *
3374 * Returns: 0 Success
3375 * common_chdir: ENOTDIR
3376 * common_chdir: ENOENT No such file or directory
3377 * common_chdir: ???
3378 *
3379 */
3380int
3381__pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
3382{
3383 return common_chdir(p, (void *)uap, 1);
3384}
3385
3386
3387/*
3388 * Change notion of root (``/'') directory.
3389 */
3390/* ARGSUSED */
3391int
3392chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
3393{
3394 struct filedesc *fdp = p->p_fd;
3395 int error;
3396 struct nameidata nd;
3397 vnode_t tvp;
3398 vfs_context_t ctx = vfs_context_current();
3399
3400 if ((error = suser(kauth_cred_get(), &p->p_acflag)))
3401 return (error);
3402
3403 NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
3404 UIO_USERSPACE, uap->path, ctx);
3405 error = change_dir(&nd, ctx);
3406 if (error)
3407 return (error);
3408
3409#if CONFIG_MACF
3410 error = mac_vnode_check_chroot(ctx, nd.ni_vp,
3411 &nd.ni_cnd);
3412 if (error) {
3413 vnode_put(nd.ni_vp);
3414 return (error);
3415 }
3416#endif
3417
3418 if ( (error = vnode_ref(nd.ni_vp)) ) {
3419 vnode_put(nd.ni_vp);
3420 return (error);
3421 }
3422 vnode_put(nd.ni_vp);
3423
3424 proc_fdlock(p);
3425 tvp = fdp->fd_rdir;
3426 fdp->fd_rdir = nd.ni_vp;
3427 fdp->fd_flags |= FD_CHROOT;
3428 proc_fdunlock(p);
3429
3430 if (tvp != NULL)
3431 vnode_rele(tvp);
3432
3433 return (0);
3434}
3435
3436/*
3437 * Common routine for chroot and chdir.
3438 *
3439 * Returns: 0 Success
3440 * ENOTDIR Not a directory
3441 * namei:??? [anything namei can return]
3442 * vnode_authorize:??? [anything vnode_authorize can return]
3443 */
3444static int
3445change_dir(struct nameidata *ndp, vfs_context_t ctx)
3446{
3447 vnode_t vp;
3448 int error;
3449
3450 if ((error = namei(ndp)))
3451 return (error);
3452 nameidone(ndp);
3453 vp = ndp->ni_vp;
3454
3455 if (vp->v_type != VDIR) {
3456 vnode_put(vp);
3457 return (ENOTDIR);
3458 }
3459
3460#if CONFIG_MACF
3461 error = mac_vnode_check_chdir(ctx, vp);
3462 if (error) {
3463 vnode_put(vp);
3464 return (error);
3465 }
3466#endif
3467
3468 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3469 if (error) {
3470 vnode_put(vp);
3471 return (error);
3472 }
3473
3474 return (error);
3475}
3476
3477/*
3478 * Allocate the vnode data (for directories) associated with the file glob.
3479 */
3480struct fd_vn_data *
3481fg_vn_data_alloc(void)
3482{
3483 struct fd_vn_data *fvdata;
3484
3485 /* Allocate per fd vnode data */
3486 MALLOC(fvdata, struct fd_vn_data *, (sizeof(struct fd_vn_data)),
3487 M_FD_VN_DATA, M_WAITOK | M_ZERO);
3488 lck_mtx_init(&fvdata->fv_lock, fd_vn_lck_grp, fd_vn_lck_attr);
3489 return fvdata;
3490}
3491
3492/*
3493 * Free the vnode data (for directories) associated with the file glob.
3494 */
3495void
3496fg_vn_data_free(void *fgvndata)
3497{
3498 struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
3499
3500 if (fvdata->fv_buf)
3501 FREE(fvdata->fv_buf, M_FD_DIRBUF);
3502 lck_mtx_destroy(&fvdata->fv_lock, fd_vn_lck_grp);
3503 FREE(fvdata, M_FD_VN_DATA);
3504}
3505
3506/*
3507 * Check permissions, allocate an open file structure,
3508 * and call the device open routine if any.
3509 *
3510 * Returns: 0 Success
3511 * EINVAL
3512 * EINTR
3513 * falloc:ENFILE
3514 * falloc:EMFILE
3515 * falloc:ENOMEM
3516 * vn_open_auth:???
3517 * dupfdopen:???
3518 * VNOP_ADVLOCK:???
3519 * vnode_setsize:???
3520 *
3521 * XXX Need to implement uid, gid
3522 */
3523int
3524open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
3525 struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra,
3526 int32_t *retval)
3527{
3528 proc_t p = vfs_context_proc(ctx);
3529 uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
3530 struct fileproc *fp;
3531 vnode_t vp;
3532 int flags, oflags;
3533 int type, indx, error;
3534 struct flock lf;
3535 struct vfs_context context;
3536
3537 oflags = uflags;
3538
3539 if ((oflags & O_ACCMODE) == O_ACCMODE)
3540 return(EINVAL);
3541
3542 flags = FFLAGS(uflags);
3543 CLR(flags, FENCRYPTED);
3544 CLR(flags, FUNENCRYPTED);
3545
3546 AUDIT_ARG(fflags, oflags);
3547 AUDIT_ARG(mode, vap->va_mode);
3548
3549 if ((error = falloc_withalloc(p,
3550 &fp, &indx, ctx, fp_zalloc, cra)) != 0) {
3551 return (error);
3552 }
3553 uu->uu_dupfd = -indx - 1;
3554
3555 if ((error = vn_open_auth(ndp, &flags, vap))) {
3556 if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)){ /* XXX from fdopen */
3557 if ((error = dupfdopen(p->p_fd, indx, uu->uu_dupfd, flags, error)) == 0) {
3558 fp_drop(p, indx, NULL, 0);
3559 *retval = indx;
3560 return (0);
3561 }
3562 }
3563 if (error == ERESTART)
3564 error = EINTR;
3565 fp_free(p, indx, fp);
3566 return (error);
3567 }
3568 uu->uu_dupfd = 0;
3569 vp = ndp->ni_vp;
3570
3571 fp->f_fglob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
3572 fp->f_fglob->fg_ops = &vnops;
3573 fp->f_fglob->fg_data = (caddr_t)vp;
3574
3575 if (flags & (O_EXLOCK | O_SHLOCK)) {
3576 lf.l_whence = SEEK_SET;
3577 lf.l_start = 0;
3578 lf.l_len = 0;
3579 if (flags & O_EXLOCK)
3580 lf.l_type = F_WRLCK;
3581 else
3582 lf.l_type = F_RDLCK;
3583 type = F_FLOCK;
3584 if ((flags & FNONBLOCK) == 0)
3585 type |= F_WAIT;
3586#if CONFIG_MACF
3587 error = mac_file_check_lock(vfs_context_ucred(ctx), fp->f_fglob,
3588 F_SETLK, &lf);
3589 if (error)
3590 goto bad;
3591#endif
3592 if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, type, ctx, NULL)))
3593 goto bad;
3594 fp->f_fglob->fg_flag |= FHASLOCK;
3595 }
3596
3597#if DEVELOPMENT || DEBUG
3598 /*
3599 * XXX VSWAP: Check for entitlements or special flag here
3600 * so we can restrict access appropriately.
3601 */
3602#else /* DEVELOPMENT || DEBUG */
3603
3604 if (vnode_isswap(vp) && (flags & (FWRITE | O_TRUNC)) && (ctx != vfs_context_kernel())) {
3605 /* block attempt to write/truncate swapfile */
3606 error = EPERM;
3607 goto bad;
3608 }
3609#endif /* DEVELOPMENT || DEBUG */
3610
3611 /* try to truncate by setting the size attribute */
3612 if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0))
3613 goto bad;
3614
3615 /*
3616 * For directories we hold some additional information in the fd.
3617 */
3618 if (vnode_vtype(vp) == VDIR) {
3619 fp->f_fglob->fg_vn_data = fg_vn_data_alloc();
3620 } else {
3621 fp->f_fglob->fg_vn_data = NULL;
3622 }
3623
3624 vnode_put(vp);
3625
3626 /*
3627	 * The first terminal open (without an O_NOCTTY) by a session leader
3628 * results in it being set as the controlling terminal.
3629 */
3630 if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
3631 !(flags & O_NOCTTY)) {
3632 int tmp = 0;
3633
3634 (void)(*fp->f_fglob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
3635 (caddr_t)&tmp, ctx);
3636 }
3637
3638 proc_fdlock(p);
3639 if (flags & O_CLOEXEC)
3640 *fdflags(p, indx) |= UF_EXCLOSE;
3641 if (flags & O_CLOFORK)
3642 *fdflags(p, indx) |= UF_FORKCLOSE;
3643 procfdtbl_releasefd(p, indx, NULL);
3644
3645#if CONFIG_SECLUDED_MEMORY
3646 if (secluded_for_filecache &&
3647 FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE &&
3648 vnode_vtype(vp) == VREG) {
3649 memory_object_control_t moc;
3650
3651 moc = ubc_getobject(vp, UBC_FLAGS_NONE);
3652
3653 if (moc == MEMORY_OBJECT_CONTROL_NULL) {
3654 /* nothing to do... */
3655 } else if (fp->f_fglob->fg_flag & FWRITE) {
3656 /* writable -> no longer eligible for secluded pages */
3657 memory_object_mark_eligible_for_secluded(moc,
3658 FALSE);
3659 } else if (secluded_for_filecache == 1) {
3660 char pathname[32] = { 0, };
3661 size_t copied;
3662 /* XXX FBDP: better way to detect /Applications/ ? */
3663 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
3664 copyinstr(ndp->ni_dirp,
3665 pathname,
3666 sizeof (pathname),
3667 &copied);
3668 } else {
3669 copystr(CAST_DOWN(void *, ndp->ni_dirp),
3670 pathname,
3671 sizeof (pathname),
3672 &copied);
3673 }
3674 pathname[sizeof (pathname) - 1] = '\0';
3675 if (strncmp(pathname,
3676 "/Applications/",
3677 strlen("/Applications/")) == 0 &&
3678 strncmp(pathname,
3679 "/Applications/Camera.app/",
3680 strlen("/Applications/Camera.app/")) != 0) {
3681 /*
3682 * not writable
3683 * AND from "/Applications/"
3684 * AND not from "/Applications/Camera.app/"
3685 * ==> eligible for secluded
3686 */
3687 memory_object_mark_eligible_for_secluded(moc,
3688 TRUE);
3689 }
3690 } else if (secluded_for_filecache == 2) {
3691#if __arm64__
3692#define DYLD_SHARED_CACHE_NAME "dyld_shared_cache_arm64"
3693#elif __arm__
3694#define DYLD_SHARED_CACHE_NAME "dyld_shared_cache_armv7"
3695#else
3696/* not implemented... */
3697#endif
3698 if (!strncmp(vp->v_name,
3699 DYLD_SHARED_CACHE_NAME,
3700 strlen(DYLD_SHARED_CACHE_NAME)) ||
3701 !strncmp(vp->v_name,
3702 "dyld",
3703 strlen(vp->v_name)) ||
3704 !strncmp(vp->v_name,
3705 "launchd",
3706 strlen(vp->v_name)) ||
3707 !strncmp(vp->v_name,
3708 "Camera",
3709 strlen(vp->v_name)) ||
3710 !strncmp(vp->v_name,
3711 "mediaserverd",
3712 strlen(vp->v_name)) ||
3713 !strncmp(vp->v_name,
3714 "SpringBoard",
3715 strlen(vp->v_name)) ||
3716 !strncmp(vp->v_name,
3717 "backboardd",
3718 strlen(vp->v_name))) {
3719 /*
3720 * This file matters when launching Camera:
3721 * do not store its contents in the secluded
3722 * pool that will be drained on Camera launch.
3723 */
3724 memory_object_mark_eligible_for_secluded(moc,
3725 FALSE);
3726 }
3727 }
3728 }
3729#endif /* CONFIG_SECLUDED_MEMORY */
3730
3731 fp_drop(p, indx, fp, 1);
3732 proc_fdunlock(p);
3733
3734 *retval = indx;
3735
3736 return (0);
3737bad:
3738 context = *vfs_context_current();
3739 context.vc_ucred = fp->f_fglob->fg_cred;
3740
3741 if ((fp->f_fglob->fg_flag & FHASLOCK) &&
3742 (FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE)) {
3743 lf.l_whence = SEEK_SET;
3744 lf.l_start = 0;
3745 lf.l_len = 0;
3746 lf.l_type = F_UNLCK;
3747
3748 (void)VNOP_ADVLOCK(
3749 vp, (caddr_t)fp->f_fglob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
3750 }
3751
3752 vn_close(vp, fp->f_fglob->fg_flag, &context);
3753 vnode_put(vp);
3754 fp_free(p, indx, fp);
3755
3756 return (error);
3757}
3758
3759/*
3760 * While most of the *at syscall handlers can call nameiat() which
3761 * is a wrapper around namei, the use of namei and initialisation
3762 * of nameidata are far removed and in different functions - namei
3763 * gets called in vn_open_auth for open1. So we'll just do here what
3764 * nameiat() does.
3765 */
3766static int
3767open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
3768 struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval,
3769 int dirfd)
3770{
3771 if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
3772 int error;
3773 char c;
3774
3775 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
3776 error = copyin(ndp->ni_dirp, &c, sizeof(char));
3777 if (error)
3778 return (error);
3779 } else {
3780 c = *((char *)(ndp->ni_dirp));
3781 }
3782
3783 if (c != '/') {
3784 vnode_t dvp_at;
3785
3786 error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
3787 &dvp_at);
3788 if (error)
3789 return (error);
3790
3791 if (vnode_vtype(dvp_at) != VDIR) {
3792 vnode_put(dvp_at);
3793 return (ENOTDIR);
3794 }
3795
3796 ndp->ni_dvp = dvp_at;
3797 ndp->ni_cnd.cn_flags |= USEDVP;
3798 error = open1(ctx, ndp, uflags, vap, fp_zalloc, cra,
3799 retval);
3800 vnode_put(dvp_at);
3801 return (error);
3802 }
3803 }
3804
3805 return (open1(ctx, ndp, uflags, vap, fp_zalloc, cra, retval));
3806}
3807
3808/*
3809 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
3810 *
3811 * Parameters: p Process requesting the open
3812 * uap User argument descriptor (see below)
3813 * retval Pointer to an area to receive the
3814 *					return value from the system call
3815 *
3816 * Indirect: uap->path Path to open (same as 'open')
3817 *		uap->flags			Flags to open (same as 'open')
3818 * uap->uid UID to set, if creating
3819 * uap->gid GID to set, if creating
3820 * uap->mode File mode, if creating (same as 'open')
3821 * uap->xsecurity ACL to set, if creating
3822 *
3823 * Returns: 0 Success
3824 * !0 errno value
3825 *
3826 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
3827 *
3828 * XXX:	We should enumerate the possible errno values here, and where
3829 * in the code they originated.
3830 */
3831int
3832open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
3833{
3834 struct filedesc *fdp = p->p_fd;
3835 int ciferror;
3836 kauth_filesec_t xsecdst;
3837 struct vnode_attr va;
3838 struct nameidata nd;
3839 int cmode;
3840
3841 AUDIT_ARG(owner, uap->uid, uap->gid);
3842
3843 xsecdst = NULL;
3844 if ((uap->xsecurity != USER_ADDR_NULL) &&
3845 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0))
3846 return ciferror;
3847
3848 VATTR_INIT(&va);
3849 cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3850 VATTR_SET(&va, va_mode, cmode);
3851 if (uap->uid != KAUTH_UID_NONE)
3852 VATTR_SET(&va, va_uid, uap->uid);
3853 if (uap->gid != KAUTH_GID_NONE)
3854 VATTR_SET(&va, va_gid, uap->gid);
3855 if (xsecdst != NULL)
3856 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
3857
3858 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3859 uap->path, vfs_context_current());
3860
3861 ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
3862 fileproc_alloc_init, NULL, retval);
3863 if (xsecdst != NULL)
3864 kauth_filesec_free(xsecdst);
3865
3866 return ciferror;
3867}
3868
3869/*
3870 * Go through the data-protected atomically controlled open (2)
3871 *
3872 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
3873 */
3874int open_dprotected_np (__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval) {
3875 int flags = uap->flags;
3876 int class = uap->class;
3877 int dpflags = uap->dpflags;
3878
3879 /*
3880	 * Follow the same path as normal open(2):
3881	 * look up the item if it exists, and acquire the vnode.
3882 */
3883 struct filedesc *fdp = p->p_fd;
3884 struct vnode_attr va;
3885 struct nameidata nd;
3886 int cmode;
3887 int error;
3888
3889 VATTR_INIT(&va);
3890 /* Mask off all but regular access permissions */
3891 cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3892 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
3893
3894 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3895 uap->path, vfs_context_current());
3896
3897 /*
3898 * Initialize the extra fields in vnode_attr to pass down our
3899 * extra fields.
3900 * 1. target cprotect class.
3901 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
3902 */
3903 if (flags & O_CREAT) {
3904 /* lower level kernel code validates that the class is valid before applying it. */
3905 if (class != PROTECTION_CLASS_DEFAULT) {
3906 /*
3907 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
3908 * file behave the same as open (2)
3909 */
3910 VATTR_SET(&va, va_dataprotect_class, class);
3911 }
3912 }
3913
3914 if (dpflags & (O_DP_GETRAWENCRYPTED|O_DP_GETRAWUNENCRYPTED)) {
3915 if ( flags & (O_RDWR | O_WRONLY)) {
3916 /* Not allowed to write raw encrypted bytes */
3917 return EINVAL;
3918 }
3919 if (uap->dpflags & O_DP_GETRAWENCRYPTED) {
3920 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
3921 }
3922 if (uap->dpflags & O_DP_GETRAWUNENCRYPTED) {
3923 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
3924 }
3925 }
3926
3927 error = open1(vfs_context_current(), &nd, uap->flags, &va,
3928 fileproc_alloc_init, NULL, retval);
3929
3930 return error;
3931}
3932
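/*
 * Common backend for open(2), openat(2) and openbyid_np(): build the
 * vnode_attr (creation mode masked by the process umask) and the
 * nameidata, then hand off to open1at(), which resolves relative paths
 * against the directory referenced by fd when fd is not AT_FDCWD.
 */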
3933static int
3934openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
3935 int fd, enum uio_seg segflg, int *retval)
3936{
3937 struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
3938 struct vnode_attr va;
3939 struct nameidata nd;
3940 int cmode;
3941
3942 VATTR_INIT(&va);
3943 /* Mask off all but regular access permissions */
3944 cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3945 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
3946
3947 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
3948 segflg, path, ctx);
3949
3950 return (open1at(ctx, &nd, flags, &va, fileproc_alloc_init, NULL,
3951 retval, fd));
3952}
3953
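/*
 * open(2) and openat(2) are cancellation points: the exported entry points
 * call __pthread_testcancel() and then forward to the _nocancel variants,
 * which do the real work through openat_internal().
 */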
3954int
3955open(proc_t p, struct open_args *uap, int32_t *retval)
3956{
3957 __pthread_testcancel(1);
3958 return(open_nocancel(p, (struct open_nocancel_args *)uap, retval));
3959}
3960
3961int
3962open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
3963 int32_t *retval)
3964{
3965 return (openat_internal(vfs_context_current(), uap->path, uap->flags,
3966 uap->mode, AT_FDCWD, UIO_USERSPACE, retval));
3967}
3968
3969int
3970openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
3971 int32_t *retval)
3972{
3973 return (openat_internal(vfs_context_current(), uap->path, uap->flags,
3974 uap->mode, uap->fd, UIO_USERSPACE, retval));
3975}
3976
3977int
3978openat(proc_t p, struct openat_args *uap, int32_t *retval)
3979{
3980 __pthread_testcancel(1);
3981 return(openat_nocancel(p, (struct openat_nocancel_args *)uap, retval));
3982}
3983
3984/*
3985 * openbyid_np: open a file given a file system id and a file system object id.
3986 *	The HFS file system object id is an fsobj_id_t {uint32, uint32};
3987 *	for file systems that don't support object ids it is a node id (uint64_t).
3988 *
3989 * Parameters: p Process requesting the open
3990 * uap User argument descriptor (see below)
3991 * retval Pointer to an area to receive the
3992 *					return value from the system call
3993 *
3994 * Indirect: uap->path Path to open (same as 'open')
3995 *
3996 * uap->fsid id of target file system
3997 * uap->objid id of target file system object
3998 * uap->flags Flags to open (same as 'open')
3999 *
4000 * Returns: 0 Success
4001 * !0 errno value
4002 *
4003 *
4004 * XXX:	We should enumerate the possible errno values here, and where
4005 * in the code they originated.
4006 */
4007int
4008openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
4009{
4010 fsid_t fsid;
4011 uint64_t objid;
4012 int error;
4013 char *buf = NULL;
4014 int buflen = MAXPATHLEN;
4015 int pathlen = 0;
4016 vfs_context_t ctx = vfs_context_current();
4017
4018 if ((error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_OPEN_BY_ID, 0))) {
4019 return (error);
4020 }
4021
4022 if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
4023 return (error);
4024 }
4025
4026	/* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
4027 if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
4028 return (error);
4029 }
4030
4031 AUDIT_ARG(value32, fsid.val[0]);
4032 AUDIT_ARG(value64, objid);
4033
4034	/* resolve path from fsid, objid */
4035 do {
4036 MALLOC(buf, char *, buflen + 1, M_TEMP, M_WAITOK);
4037 if (buf == NULL) {
4038 return (ENOMEM);
4039 }
4040
4041 error = fsgetpath_internal(
4042 ctx, fsid.val[0], objid,
4043 buflen, buf, &pathlen);
4044
4045 if (error) {
4046 FREE(buf, M_TEMP);
4047 buf = NULL;
4048 }
4049 } while (error == ENOSPC && (buflen += MAXPATHLEN));
4050
4051 if (error) {
4052 return error;
4053 }
4054
4055 buf[pathlen] = 0;
4056
4057 error = openat_internal(
4058 ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);
4059
4060 FREE(buf, M_TEMP);
4061
4062 return error;
4063}
4064
4065
4066/*
4067 * Create a special file.
4068 */
4069static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap);
4070
4071int
4072mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
4073{
4074 struct vnode_attr va;
4075 vfs_context_t ctx = vfs_context_current();
4076 int error;
4077 struct nameidata nd;
4078 vnode_t vp, dvp;
4079
4080 VATTR_INIT(&va);
4081 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4082 VATTR_SET(&va, va_rdev, uap->dev);
4083
4084 /* If it's a mknod() of a FIFO, call mkfifo1() instead */
4085 if ((uap->mode & S_IFMT) == S_IFIFO)
4086 return(mkfifo1(ctx, uap->path, &va));
4087
4088 AUDIT_ARG(mode, uap->mode);
4089 AUDIT_ARG(value32, uap->dev);
4090
4091 if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag)))
4092 return (error);
4093 NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
4094 UIO_USERSPACE, uap->path, ctx);
4095 error = namei(&nd);
4096 if (error)
4097 return (error);
4098 dvp = nd.ni_dvp;
4099 vp = nd.ni_vp;
4100
4101 if (vp != NULL) {
4102 error = EEXIST;
4103 goto out;
4104 }
4105
4106 switch (uap->mode & S_IFMT) {
4107 case S_IFCHR:
4108 VATTR_SET(&va, va_type, VCHR);
4109 break;
4110 case S_IFBLK:
4111 VATTR_SET(&va, va_type, VBLK);
4112 break;
4113 default:
4114 error = EINVAL;
4115 goto out;
4116 }
4117
4118#if CONFIG_MACF
4119 error = mac_vnode_check_create(ctx,
4120 nd.ni_dvp, &nd.ni_cnd, &va);
4121 if (error)
4122 goto out;
4123#endif
4124
4125 if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
4126 goto out;
4127
4128 if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0)
4129 goto out;
4130
4131 if (vp) {
4132 int update_flags = 0;
4133
4134 // Make sure the name & parent pointers are hooked up
4135 if (vp->v_name == NULL)
4136 update_flags |= VNODE_UPDATE_NAME;
4137 if (vp->v_parent == NULLVP)
4138 update_flags |= VNODE_UPDATE_PARENT;
4139
4140 if (update_flags)
4141 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
4142
4143#if CONFIG_FSE
4144 add_fsevent(FSE_CREATE_FILE, ctx,
4145 FSE_ARG_VNODE, vp,
4146 FSE_ARG_DONE);
4147#endif
4148 }
4149
4150out:
4151 /*
4152 * nameidone has to happen before we vnode_put(dvp)
4153 * since it may need to release the fs_nodelock on the dvp
4154 */
4155 nameidone(&nd);
4156
4157 if (vp)
4158 vnode_put(vp);
4159 vnode_put(dvp);
4160
4161 return (error);
4162}
4163
4164/*
4165 * Create a named pipe.
4166 *
4167 * Returns: 0 Success
4168 * EEXIST
4169 * namei:???
4170 * vnode_authorize:???
4171 * vn_create:???
4172 */
4173static int
4174mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap)
4175{
4176 vnode_t vp, dvp;
4177 int error;
4178 struct nameidata nd;
4179
4180 NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
4181 UIO_USERSPACE, upath, ctx);
4182 error = namei(&nd);
4183 if (error)
4184 return (error);
4185 dvp = nd.ni_dvp;
4186 vp = nd.ni_vp;
4187
4188 /* check that this is a new file and authorize addition */
4189 if (vp != NULL) {
4190 error = EEXIST;
4191 goto out;
4192 }
4193 VATTR_SET(vap, va_type, VFIFO);
4194
4195 if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0)
4196 goto out;
4197
4198 error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
4199out:
4200 /*
4201 * nameidone has to happen before we vnode_put(dvp)
4202 * since it may need to release the fs_nodelock on the dvp
4203 */
4204 nameidone(&nd);
4205
4206 if (vp)
4207 vnode_put(vp);
4208 vnode_put(dvp);
4209
4210 return error;
4211}
4212
4213
4214/*
4215 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
4216 *
4217 * Parameters: p Process requesting the open
4218 * uap User argument descriptor (see below)
4219 * retval (Ignored)
4220 *
4221 * Indirect: uap->path Path to fifo (same as 'mkfifo')
4222 * uap->uid UID to set
4223 * uap->gid GID to set
4224 * uap->mode File mode to set (same as 'mkfifo')
4225 * uap->xsecurity ACL to set, if creating
4226 *
4227 * Returns: 0 Success
4228 * !0 errno value
4229 *
4230 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4231 *
4232 * XXX: We should enumerate the possible errno values here, and where
4233 * in the code they originated.
4234 */
4235int
4236mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
4237{
4238 int ciferror;
4239 kauth_filesec_t xsecdst;
4240 struct vnode_attr va;
4241
4242 AUDIT_ARG(owner, uap->uid, uap->gid);
4243
4244 xsecdst = KAUTH_FILESEC_NONE;
4245 if (uap->xsecurity != USER_ADDR_NULL) {
4246 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
4247 return ciferror;
4248 }
4249
4250 VATTR_INIT(&va);
4251 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4252 if (uap->uid != KAUTH_UID_NONE)
4253 VATTR_SET(&va, va_uid, uap->uid);
4254 if (uap->gid != KAUTH_GID_NONE)
4255 VATTR_SET(&va, va_gid, uap->gid);
4256 if (xsecdst != KAUTH_FILESEC_NONE)
4257 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4258
4259 ciferror = mkfifo1(vfs_context_current(), uap->path, &va);
4260
4261 if (xsecdst != KAUTH_FILESEC_NONE)
4262 kauth_filesec_free(xsecdst);
4263 return ciferror;
4264}
4265
4266/* ARGSUSED */
4267int
4268mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
4269{
4270 struct vnode_attr va;
4271
4272 VATTR_INIT(&va);
4273 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4274
4275 return(mkfifo1(vfs_context_current(), uap->path, &va));
4276}
4277
4278
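/*
 * Minimal local strrchr(): return a pointer to the last occurrence of 'ch'
 * in the NUL-terminated string 'p', or NULL if it does not occur.
 */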
4279static char *
4280my_strrchr(char *p, int ch)
4281{
4282 char *save;
4283
4284 for (save = NULL;; ++p) {
4285 if (*p == ch)
4286 save = p;
4287 if (!*p)
4288 return(save);
4289 }
4290 /* NOTREACHED */
4291}
4292
4293extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4294
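/*
 * Best-effort path construction for fsevents and fileop listeners: build the
 * path to 'dvp' (optionally appending 'leafname') into 'path'. If the exact
 * path cannot be obtained or does not fit, *truncated_path is set and the
 * closest usable prefix (an ancestor's path, or the mount point as a last
 * resort) is returned instead. The returned length counts the terminating NUL.
 */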
4295int
4296safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
4297{
4298 int ret, len = _len;
4299
4300 *truncated_path = 0;
4301 ret = vn_getpath(dvp, path, &len);
4302 if (ret == 0 && len < (MAXPATHLEN - 1)) {
4303 if (leafname) {
4304 path[len-1] = '/';
4305 len += strlcpy(&path[len], leafname, MAXPATHLEN-len) + 1;
4306 if (len > MAXPATHLEN) {
4307 char *ptr;
4308
4309 // the string got truncated!
4310 *truncated_path = 1;
4311 ptr = my_strrchr(path, '/');
4312 if (ptr) {
4313 *ptr = '\0'; // chop off the string at the last directory component
4314 }
4315 len = strlen(path) + 1;
4316 }
4317 }
4318 } else if (ret == 0) {
4319 *truncated_path = 1;
4320 } else if (ret != 0) {
4321 struct vnode *mydvp=dvp;
4322
4323 if (ret != ENOSPC) {
4324 printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
4325 dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
4326 }
4327 *truncated_path = 1;
4328
4329 do {
4330 if (mydvp->v_parent != NULL) {
4331 mydvp = mydvp->v_parent;
4332 } else if (mydvp->v_mount) {
4333 strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
4334 break;
4335 } else {
4336 // no parent and no mount point? only thing is to punt and say "/" changed
4337 strlcpy(path, "/", _len);
4338 len = 2;
4339 mydvp = NULL;
4340 }
4341
4342 if (mydvp == NULL) {
4343 break;
4344 }
4345
4346 len = _len;
4347 ret = vn_getpath(mydvp, path, &len);
4348 } while (ret == ENOSPC);
4349 }
4350
4351 return len;
4352}
4353
4354
4355/*
4356 * Make a hard file link.
4357 *
4358 * Returns: 0 Success
4359 * EPERM
4360 * EEXIST
4361 * EXDEV
4362 * namei:???
4363 * vnode_authorize:???
4364 * VNOP_LINK:???
4365 */
4366/* ARGSUSED */
4367static int
4368linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
4369 user_addr_t link, int flag, enum uio_seg segflg)
4370{
4371 vnode_t vp, dvp, lvp;
4372 struct nameidata nd;
4373 int follow;
4374 int error;
4375#if CONFIG_FSE
4376 fse_info finfo;
4377#endif
4378 int need_event, has_listeners;
4379 char *target_path = NULL;
4380 int truncated=0;
4381
4382 vp = dvp = lvp = NULLVP;
4383
4384 /* look up the object we are linking to */
4385 follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
4386 NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
4387 segflg, path, ctx);
4388
4389 error = nameiat(&nd, fd1);
4390 if (error)
4391 return (error);
4392 vp = nd.ni_vp;
4393
4394 nameidone(&nd);
4395
4396 /*
4397 * Normally, linking to directories is not supported.
4398 * However, some file systems may have limited support.
4399 */
4400 if (vp->v_type == VDIR) {
4401 if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
4402 error = EPERM; /* POSIX */
4403 goto out;
4404 }
4405
4406 /* Linking to a directory requires ownership. */
4407 if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
4408 struct vnode_attr dva;
4409
4410 VATTR_INIT(&dva);
4411 VATTR_WANTED(&dva, va_uid);
4412 if (vnode_getattr(vp, &dva, ctx) != 0 ||
4413 !VATTR_IS_SUPPORTED(&dva, va_uid) ||
4414 (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
4415 error = EACCES;
4416 goto out;
4417 }
4418 }
4419 }
4420
4421 /* lookup the target node */
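	/*
	 * The nameidata from the first lookup is re-used here: it is switched
	 * to a CREATE operation on the new link name and re-driven through
	 * nameiat() so that the parent directory is returned in ni_dvp
	 * (LOCKPARENT).
	 */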
4422#if CONFIG_TRIGGERS
4423 nd.ni_op = OP_LINK;
4424#endif
4425 nd.ni_cnd.cn_nameiop = CREATE;
4426 nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
4427 nd.ni_dirp = link;
4428 error = nameiat(&nd, fd2);
4429 if (error != 0)
4430 goto out;
4431 dvp = nd.ni_dvp;
4432 lvp = nd.ni_vp;
4433
4434#if CONFIG_MACF
4435 if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0)
4436 goto out2;
4437#endif
4438
4439	/* nor may we link to anything that kauth doesn't want us to (e.g. immutable items) */
4440 if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0)
4441 goto out2;
4442
4443 /* target node must not exist */
4444 if (lvp != NULLVP) {
4445 error = EEXIST;
4446 goto out2;
4447 }
4448 /* cannot link across mountpoints */
4449 if (vnode_mount(vp) != vnode_mount(dvp)) {
4450 error = EXDEV;
4451 goto out2;
4452 }
4453
4454	/* authorize creation of the target node */
4455 if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
4456 goto out2;
4457
4458 /* and finally make the link */
4459 error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
4460 if (error)
4461 goto out2;
4462
4463#if CONFIG_MACF
4464 (void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
4465#endif
4466
4467#if CONFIG_FSE
4468 need_event = need_fsevent(FSE_CREATE_FILE, dvp);
4469#else
4470 need_event = 0;
4471#endif
4472 has_listeners = kauth_authorize_fileop_has_listeners();
4473
4474 if (need_event || has_listeners) {
4475 char *link_to_path = NULL;
4476 int len, link_name_len;
4477
4478 /* build the path to the new link file */
4479 GET_PATH(target_path);
4480 if (target_path == NULL) {
4481 error = ENOMEM;
4482 goto out2;
4483 }
4484
4485 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
4486
4487 if (has_listeners) {
4488			/* build the path to the file we are linking to */
4489 GET_PATH(link_to_path);
4490 if (link_to_path == NULL) {
4491 error = ENOMEM;
4492 goto out2;
4493 }
4494
4495 link_name_len = MAXPATHLEN;
4496 if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
4497 /*
4498				 * Call out to allow 3rd party notification of the new link.
4499 * Ignore result of kauth_authorize_fileop call.
4500 */
4501 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
4502 (uintptr_t)link_to_path,
4503 (uintptr_t)target_path);
4504 }
4505 if (link_to_path != NULL) {
4506 RELEASE_PATH(link_to_path);
4507 }
4508 }
4509#if CONFIG_FSE
4510 if (need_event) {
4511 /* construct fsevent */
4512 if (get_fse_info(vp, &finfo, ctx) == 0) {
4513 if (truncated) {
4514 finfo.mode |= FSE_TRUNCATED_PATH;
4515 }
4516
4517			// post the creation event for the new link
4518 add_fsevent(FSE_CREATE_FILE, ctx,
4519 FSE_ARG_STRING, len, target_path,
4520 FSE_ARG_FINFO, &finfo,
4521 FSE_ARG_DONE);
4522 }
4523 if (vp->v_parent) {
4524 add_fsevent(FSE_STAT_CHANGED, ctx,
4525 FSE_ARG_VNODE, vp->v_parent,
4526 FSE_ARG_DONE);
4527 }
4528 }
4529#endif
4530 }
4531out2:
4532 /*
4533 * nameidone has to happen before we vnode_put(dvp)
4534 * since it may need to release the fs_nodelock on the dvp
4535 */
4536 nameidone(&nd);
4537 if (target_path != NULL) {
4538 RELEASE_PATH(target_path);
4539 }
4540out:
4541 if (lvp)
4542 vnode_put(lvp);
4543 if (dvp)
4544 vnode_put(dvp);
4545 vnode_put(vp);
4546 return (error);
4547}
4548
4549int
4550link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
4551{
4552 return (linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
4553 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE));
4554}
4555
4556int
4557linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
4558{
4559 if (uap->flag & ~AT_SYMLINK_FOLLOW)
4560 return (EINVAL);
4561
4562 return (linkat_internal(vfs_context_current(), uap->fd1, uap->path,
4563 uap->fd2, uap->link, uap->flag, UIO_USERSPACE));
4564}
4565
4566/*
4567 * Make a symbolic link.
4568 *
4569 * We could add support for ACLs here too...
4570 */
4571/* ARGSUSED */
4572static int
4573symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
4574 user_addr_t link, enum uio_seg segflg)
4575{
4576 struct vnode_attr va;
4577 char *path;
4578 int error;
4579 struct nameidata nd;
4580 vnode_t vp, dvp;
4581 size_t dummy=0;
4582 proc_t p;
4583
4584 error = 0;
4585 if (UIO_SEG_IS_USER_SPACE(segflg)) {
4586 MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
4587 error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
4588 } else {
4589 path = (char *)path_data;
4590 }
4591 if (error)
4592 goto out;
4593 AUDIT_ARG(text, path); /* This is the link string */
4594
4595 NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
4596 segflg, link, ctx);
4597
4598 error = nameiat(&nd, fd);
4599 if (error)
4600 goto out;
4601 dvp = nd.ni_dvp;
4602 vp = nd.ni_vp;
4603
4604 p = vfs_context_proc(ctx);
4605 VATTR_INIT(&va);
4606 VATTR_SET(&va, va_type, VLNK);
4607 VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd->fd_cmask);
4608
4609#if CONFIG_MACF
4610 error = mac_vnode_check_create(ctx,
4611 dvp, &nd.ni_cnd, &va);
4612#endif
4613 if (error != 0) {
4614 goto skipit;
4615 }
4616
4617 if (vp != NULL) {
4618 error = EEXIST;
4619 goto skipit;
4620 }
4621
4622 /* authorize */
4623 if (error == 0)
4624 error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
4625 /* get default ownership, etc. */
4626 if (error == 0)
4627 error = vnode_authattr_new(dvp, &va, 0, ctx);
4628 if (error == 0)
4629 error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
4630
4631#if CONFIG_MACF
4632 if (error == 0 && vp)
4633 error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
4634#endif
4635
4636 /* do fallback attribute handling */
4637 if (error == 0 && vp)
4638 error = vnode_setattr_fallback(vp, &va, ctx);
4639
4640 if (error == 0) {
4641 int update_flags = 0;
4642
4643		/* check whether the filesystem returned the new vnode; if not, look it up */
4644 if (vp == NULL) {
4645 nd.ni_cnd.cn_nameiop = LOOKUP;
4646#if CONFIG_TRIGGERS
4647 nd.ni_op = OP_LOOKUP;
4648#endif
4649 nd.ni_cnd.cn_flags = 0;
4650 error = nameiat(&nd, fd);
4651 vp = nd.ni_vp;
4652
4653 if (vp == NULL)
4654 goto skipit;
4655 }
4656
4657#if 0 /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
4658		/* call out to allow 3rd party notification of the symlink creation.
4659 * Ignore result of kauth_authorize_fileop call.
4660 */
4661 if (kauth_authorize_fileop_has_listeners() &&
4662 namei(&nd) == 0) {
4663 char *new_link_path = NULL;
4664 int len;
4665
4666 /* build the path to the new link file */
4667 new_link_path = get_pathbuff();
4668 len = MAXPATHLEN;
4669 vn_getpath(dvp, new_link_path, &len);
4670 if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
4671 new_link_path[len - 1] = '/';
4672 strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN-len);
4673 }
4674
4675 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
4676 (uintptr_t)path, (uintptr_t)new_link_path);
4677 if (new_link_path != NULL)
4678 release_pathbuff(new_link_path);
4679 }
4680#endif
4681 // Make sure the name & parent pointers are hooked up
4682 if (vp->v_name == NULL)
4683 update_flags |= VNODE_UPDATE_NAME;
4684 if (vp->v_parent == NULLVP)
4685 update_flags |= VNODE_UPDATE_PARENT;
4686
4687 if (update_flags)
4688 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
4689
4690#if CONFIG_FSE
4691 add_fsevent(FSE_CREATE_FILE, ctx,
4692 FSE_ARG_VNODE, vp,
4693 FSE_ARG_DONE);
4694#endif
4695 }
4696
4697skipit:
4698 /*
4699 * nameidone has to happen before we vnode_put(dvp)
4700 * since it may need to release the fs_nodelock on the dvp
4701 */
4702 nameidone(&nd);
4703
4704 if (vp)
4705 vnode_put(vp);
4706 vnode_put(dvp);
4707out:
4708 if (path && (path != (char *)path_data))
4709 FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
4710
4711 return (error);
4712}
4713
4714int
4715symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
4716{
4717 return (symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
4718 uap->link, UIO_USERSPACE));
4719}
4720
4721int
4722symlinkat(__unused proc_t p, struct symlinkat_args *uap,
4723 __unused int32_t *retval)
4724{
4725 return (symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
4726 uap->path2, UIO_USERSPACE));
4727}
4728
4729/*
4730 * Delete a whiteout from the filesystem.
4731 * No longer supported.
4732 */
4733int
4734undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
4735{
4736 return (ENOTSUP);
4737}
4738
4739/*
4740 * Delete a name from the filesystem.
4741 */
4742/* ARGSUSED */
4743static int
4744unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
4745 user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
4746{
4747 struct nameidata nd;
4748 vnode_t vp, dvp;
4749 int error;
4750 struct componentname *cnp;
4751 char *path = NULL;
4752 int len=0;
4753#if CONFIG_FSE
4754 fse_info finfo;
4755 struct vnode_attr va;
4756#endif
4757 int flags;
4758 int need_event;
4759 int has_listeners;
4760 int truncated_path;
4761 int batched;
4762 struct vnode_attr *vap;
4763 int do_retry;
4764 int retry_count = 0;
4765 int cn_flags;
4766
4767 cn_flags = LOCKPARENT;
4768 if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH))
4769 cn_flags |= AUDITVNPATH1;
4770 /* If a starting dvp is passed, it trumps any fd passed. */
4771 if (start_dvp)
4772 cn_flags |= USEDVP;
4773
4774#if NAMEDRSRCFORK
4775 /* unlink or delete is allowed on rsrc forks and named streams */
4776 cn_flags |= CN_ALLOWRSRCFORK;
4777#endif
4778
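/*
 * 'retry' re-drives the entire lookup when the unlink authorization (or a
 * compound remove) loses a race and reports ENOENT; the number of retries
 * is bounded by MAX_AUTHORIZE_ENOENT_RETRIES.
 */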
4779retry:
4780 do_retry = 0;
4781 flags = 0;
4782 need_event = 0;
4783 has_listeners = 0;
4784 truncated_path = 0;
4785 vap = NULL;
4786
4787 NDINIT(&nd, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);
4788
4789 nd.ni_dvp = start_dvp;
4790 nd.ni_flag |= NAMEI_COMPOUNDREMOVE;
4791 cnp = &nd.ni_cnd;
4792
4793continue_lookup:
4794 error = nameiat(&nd, fd);
4795 if (error)
4796 return (error);
4797
4798 dvp = nd.ni_dvp;
4799 vp = nd.ni_vp;
4800
4801
4802 /* With Carbon delete semantics, busy files cannot be deleted */
4803 if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
4804 flags |= VNODE_REMOVE_NODELETEBUSY;
4805 }
4806
4807 /* Skip any potential upcalls if told to. */
4808 if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
4809 flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
4810 }
4811
4812 if (vp) {
4813 batched = vnode_compound_remove_available(vp);
4814 /*
4815 * The root of a mounted filesystem cannot be deleted.
4816 */
4817 if (vp->v_flag & VROOT) {
4818 error = EBUSY;
4819 }
4820
4821#if DEVELOPMENT || DEBUG
4822 /*
4823 * XXX VSWAP: Check for entitlements or special flag here
4824 * so we can restrict access appropriately.
4825 */
4826#else /* DEVELOPMENT || DEBUG */
4827
4828 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
4829 error = EPERM;
4830 goto out;
4831 }
4832#endif /* DEVELOPMENT || DEBUG */
4833
4834 if (!batched) {
4835 error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
4836 if (error) {
4837 if (error == ENOENT) {
4838 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
4839 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
4840 do_retry = 1;
4841 retry_count++;
4842 }
4843 }
4844 goto out;
4845 }
4846 }
4847 } else {
4848 batched = 1;
4849
4850 if (!vnode_compound_remove_available(dvp)) {
4851 panic("No vp, but no compound remove?");
4852 }
4853 }
4854
4855#if CONFIG_FSE
4856 need_event = need_fsevent(FSE_DELETE, dvp);
4857 if (need_event) {
4858 if (!batched) {
4859 if ((vp->v_flag & VISHARDLINK) == 0) {
4860 /* XXX need to get these data in batched VNOP */
4861 get_fse_info(vp, &finfo, ctx);
4862 }
4863 } else {
4864 error = vfs_get_notify_attributes(&va);
4865 if (error) {
4866 goto out;
4867 }
4868
4869 vap = &va;
4870 }
4871 }
4872#endif
4873 has_listeners = kauth_authorize_fileop_has_listeners();
4874 if (need_event || has_listeners) {
4875 if (path == NULL) {
4876 GET_PATH(path);
4877 if (path == NULL) {
4878 error = ENOMEM;
4879 goto out;
4880 }
4881 }
4882 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
4883 }
4884
4885#if NAMEDRSRCFORK
4886 if (nd.ni_cnd.cn_flags & CN_WANTSRSRCFORK)
4887 error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
4888 else
4889#endif
4890 {
4891 error = vn_remove(dvp, &nd.ni_vp, &nd, flags, vap, ctx);
4892 vp = nd.ni_vp;
4893 if (error == EKEEPLOOKING) {
4894 if (!batched) {
4895 panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
4896 }
4897
4898 if ((nd.ni_flag & NAMEI_CONTLOOKUP) == 0) {
4899 panic("EKEEPLOOKING, but continue flag not set?");
4900 }
4901
4902 if (vnode_isdir(vp)) {
4903 error = EISDIR;
4904 goto out;
4905 }
4906 goto continue_lookup;
4907 } else if (error == ENOENT && batched) {
4908 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
4909 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
4910 /*
4911 * For compound VNOPs, the authorization callback may
4912 * return ENOENT in case of racing hardlink lookups
4913				 * hitting the name cache; redrive the lookup in that case.
4914 */
4915 do_retry = 1;
4916 retry_count += 1;
4917 goto out;
4918 }
4919 }
4920 }
4921
4922 /*
4923 * Call out to allow 3rd party notification of delete.
4924 * Ignore result of kauth_authorize_fileop call.
4925 */
4926 if (!error) {
4927 if (has_listeners) {
4928 kauth_authorize_fileop(vfs_context_ucred(ctx),
4929 KAUTH_FILEOP_DELETE,
4930 (uintptr_t)vp,
4931 (uintptr_t)path);
4932 }
4933
4934 if (vp->v_flag & VISHARDLINK) {
4935 //
4936 // if a hardlink gets deleted we want to blow away the
4937 // v_parent link because the path that got us to this
4938 // instance of the link is no longer valid. this will
4939 // force the next call to get the path to ask the file
4940 // system instead of just following the v_parent link.
4941 //
4942 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
4943 }
4944
4945#if CONFIG_FSE
4946 if (need_event) {
4947 if (vp->v_flag & VISHARDLINK) {
4948 get_fse_info(vp, &finfo, ctx);
4949 } else if (vap) {
4950 vnode_get_fse_info_from_vap(vp, &finfo, vap);
4951 }
4952 if (truncated_path) {
4953 finfo.mode |= FSE_TRUNCATED_PATH;
4954 }
4955 add_fsevent(FSE_DELETE, ctx,
4956 FSE_ARG_STRING, len, path,
4957 FSE_ARG_FINFO, &finfo,
4958 FSE_ARG_DONE);
4959 }
4960#endif
4961 }
4962
4963out:
4964 if (path != NULL)
4965 RELEASE_PATH(path);
4966
4967#if NAMEDRSRCFORK
4968 /* recycle the deleted rsrc fork vnode to force a reclaim, which
4969 * will cause its shadow file to go away if necessary.
4970 */
4971 if (vp && (vnode_isnamedstream(vp)) &&
4972 (vp->v_parent != NULLVP) &&
4973 vnode_isshadow(vp)) {
4974 vnode_recycle(vp);
4975 }
4976#endif
4977 /*
4978 * nameidone has to happen before we vnode_put(dvp)
4979 * since it may need to release the fs_nodelock on the dvp
4980 */
4981 nameidone(&nd);
4982 vnode_put(dvp);
4983 if (vp) {
4984 vnode_put(vp);
4985 }
4986
4987 if (do_retry) {
4988 goto retry;
4989 }
4990
4991 return (error);
4992}
4993
4994int
4995unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
4996 enum uio_seg segflg, int unlink_flags)
4997{
4998 return (unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
4999 unlink_flags));
5000}
5001
5002/*
5003 * Delete a name from the filesystem using Carbon semantics.
5004 */
5005int
5006delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
5007{
5008 return (unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5009 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY));
5010}
5011
5012/*
5013 * Delete a name from the filesystem using POSIX semantics.
5014 */
5015int
5016unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
5017{
5018 return (unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5019 uap->path, UIO_USERSPACE, 0));
5020}
5021
5022int
5023unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
5024{
5025 if (uap->flag & ~AT_REMOVEDIR)
5026 return (EINVAL);
5027
5028 if (uap->flag & AT_REMOVEDIR)
5029 return (rmdirat_internal(vfs_context_current(), uap->fd,
5030 uap->path, UIO_USERSPACE));
5031 else
5032 return (unlinkat_internal(vfs_context_current(), uap->fd,
5033 NULLVP, uap->path, UIO_USERSPACE, 0));
5034}
5035
5036/*
5037 * Reposition read/write file offset.
5038 */
5039int
5040lseek(proc_t p, struct lseek_args *uap, off_t *retval)
5041{
5042 struct fileproc *fp;
5043 vnode_t vp;
5044 struct vfs_context *ctx;
5045 off_t offset = uap->offset, file_size;
5046 int error;
5047
5048	if ( (error = fp_getfvp(p, uap->fd, &fp, &vp)) ) {
5049 if (error == ENOTSUP)
5050 return (ESPIPE);
5051 return (error);
5052 }
5053 if (vnode_isfifo(vp)) {
5054 file_drop(uap->fd);
5055 return(ESPIPE);
5056 }
5057
5058
5059 ctx = vfs_context_current();
5060#if CONFIG_MACF
5061 if (uap->whence == L_INCR && uap->offset == 0)
5062 error = mac_file_check_get_offset(vfs_context_ucred(ctx),
5063 fp->f_fglob);
5064 else
5065 error = mac_file_check_change_offset(vfs_context_ucred(ctx),
5066 fp->f_fglob);
5067 if (error) {
5068 file_drop(uap->fd);
5069 return (error);
5070 }
5071#endif
5072 if ( (error = vnode_getwithref(vp)) ) {
5073 file_drop(uap->fd);
5074 return(error);
5075 }
5076
5077 switch (uap->whence) {
5078 case L_INCR:
5079 offset += fp->f_fglob->fg_offset;
5080 break;
5081 case L_XTND:
5082 if ((error = vnode_size(vp, &file_size, ctx)) != 0)
5083 break;
5084 offset += file_size;
5085 break;
5086 case L_SET:
5087 break;
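	/*
	 * SEEK_HOLE / SEEK_DATA are resolved by the filesystem: the starting
	 * offset is passed down via VNOP_IOCTL and the resulting offset comes
	 * back in the same variable. A userspace caller would typically use
	 * this as, e.g., lseek(fd, cur, SEEK_DATA) to find the next region
	 * that contains data.
	 */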
5088 case SEEK_HOLE:
5089 error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
5090 break;
5091 case SEEK_DATA:
5092 error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
5093 break;
5094 default:
5095 error = EINVAL;
5096 }
5097 if (error == 0) {
5098 if (uap->offset > 0 && offset < 0) {
5099 /* Incremented/relative move past max size */
5100 error = EOVERFLOW;
5101 } else {
5102 /*
5103 * Allow negative offsets on character devices, per
5104 * POSIX 1003.1-2001. Most likely for writing disk
5105 * labels.
5106 */
5107 if (offset < 0 && vp->v_type != VCHR) {
5108 /* Decremented/relative move before start */
5109 error = EINVAL;
5110 } else {
5111 /* Success */
5112 fp->f_fglob->fg_offset = offset;
5113 *retval = fp->f_fglob->fg_offset;
5114 }
5115 }
5116 }
5117
5118 /*
5119 * An lseek can affect whether data is "available to read." Use
5120 * hint of NOTE_NONE so no EVFILT_VNODE events fire
5121 */
5122 post_event_if_success(vp, error, NOTE_NONE);
5123 (void)vnode_put(vp);
5124 file_drop(uap->fd);
5125 return (error);
5126}
5127
5128
5129/*
5130 * Check access permissions.
5131 *
5132 * Returns: 0 Success
5133 * vnode_authorize:???
5134 */
5135static int
5136access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
5137{
5138 kauth_action_t action;
5139 int error;
5140
5141 /*
5142 * If just the regular access bits, convert them to something
5143 * that vnode_authorize will understand.
5144 */
5145 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
5146 action = 0;
5147 if (uflags & R_OK)
5148 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
5149 if (uflags & W_OK) {
5150 if (vnode_isdir(vp)) {
5151 action |= KAUTH_VNODE_ADD_FILE |
5152 KAUTH_VNODE_ADD_SUBDIRECTORY;
5153 /* might want delete rights here too */
5154 } else {
5155 action |= KAUTH_VNODE_WRITE_DATA;
5156 }
5157 }
5158 if (uflags & X_OK) {
5159 if (vnode_isdir(vp)) {
5160 action |= KAUTH_VNODE_SEARCH;
5161 } else {
5162 action |= KAUTH_VNODE_EXECUTE;
5163 }
5164 }
5165 } else {
5166 /* take advantage of definition of uflags */
5167 action = uflags >> 8;
5168 }
5169
5170#if CONFIG_MACF
5171 error = mac_vnode_check_access(ctx, vp, uflags);
5172 if (error)
5173 return (error);
5174#endif /* MAC */
5175
5176 /* action == 0 means only check for existence */
5177 if (action != 0) {
5178 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
5179 } else {
5180 error = 0;
5181 }
5182
5183 return(error);
5184}
5185
5186
5187
5188/*
5189 * access_extended: Check access permissions in bulk.
5190 *
5191 * Description: uap->entries Pointer to an array of accessx
5192 * descriptor structs, plus one or
5193 * more NULL terminated strings (see
5194 * "Notes" section below).
5195 * uap->size Size of the area pointed to by
5196 * uap->entries.
5197 * uap->results Pointer to the results array.
5198 *
5199 * Returns: 0 Success
5200 * ENOMEM Insufficient memory
5201 * EINVAL Invalid arguments
5202 * namei:EFAULT Bad address
5203 * namei:ENAMETOOLONG Filename too long
5204 * namei:ENOENT No such file or directory
5205 * namei:ELOOP Too many levels of symbolic links
5206 * namei:EBADF Bad file descriptor
5207 * namei:ENOTDIR Not a directory
5208 * namei:???
5209 * access1:
5210 *
5211 * Implicit returns:
5212 * uap->results Array contents modified
5213 *
5214 * Notes: The uap->entries are structured as an arbitrary length array
5215 * of accessx descriptors, followed by one or more NULL terminated
5216 * strings
5217 *
5218 * struct accessx_descriptor[0]
5219 * ...
5220 * struct accessx_descriptor[n]
5221 * char name_data[0];
5222 *
5223 * We determine the entry count by walking the buffer containing
5224 * the uap->entries argument descriptor. For each descriptor we
5225 * see, the valid values for the offset ad_name_offset will be
5226 * in the byte range:
5227 *
5228 * [ uap->entries + sizeof(struct accessx_descriptor) ]
5229 * to
5230 * [ uap->entries + uap->size - 2 ]
5231 *
5232 * since we must have at least one string, and the string must
5233 * be at least one character plus the NULL terminator in length.
5234 *
5235 * XXX: Need to support the check-as uid argument
5236 */
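/*
 * Illustrative layout (hypothetical values): a request asking for both a
 * read check and a delete check against a single path could be passed as
 *
 *	input[0].ad_name_offset = 2 * sizeof(struct accessx_descriptor);
 *	input[0].ad_flags       = R_OK;
 *	input[1].ad_name_offset = 0;		// re-use the previous name
 *	input[1].ad_flags       = _DELETE_OK;
 *	name_data               = "some/file\0";
 *
 * with uap->size covering both descriptors plus the string, and uap->results
 * receiving one errno_t per descriptor.
 */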
5237int
5238access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
5239{
5240 struct accessx_descriptor *input = NULL;
5241 errno_t *result = NULL;
5242 errno_t error = 0;
5243 int wantdelete = 0;
5244 unsigned int desc_max, desc_actual, i, j;
5245 struct vfs_context context;
5246 struct nameidata nd;
5247 int niopts;
5248 vnode_t vp = NULL;
5249 vnode_t dvp = NULL;
5250#define ACCESSX_MAX_DESCR_ON_STACK 10
5251 struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];
5252
5253 context.vc_ucred = NULL;
5254
5255 /*
5256 * Validate parameters; if valid, copy the descriptor array and string
5257 * arguments into local memory. Before proceeding, the following
5258 * conditions must have been met:
5259 *
5260 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
5261 * o There must be sufficient room in the request for at least one
5262	 * descriptor and a one byte NUL terminated string.
5263 * o The allocation of local storage must not fail.
5264 */
5265 if (uap->size > ACCESSX_MAX_TABLESIZE)
5266 return(ENOMEM);
5267 if (uap->size < (sizeof(struct accessx_descriptor) + 2))
5268 return(EINVAL);
5269 if (uap->size <= sizeof (stack_input)) {
5270 input = stack_input;
5271 } else {
5272 MALLOC(input, struct accessx_descriptor *, uap->size, M_TEMP, M_WAITOK);
5273 if (input == NULL) {
5274 error = ENOMEM;
5275 goto out;
5276 }
5277 }
5278 error = copyin(uap->entries, input, uap->size);
5279 if (error)
5280 goto out;
5281
5282 AUDIT_ARG(opaque, input, uap->size);
5283
5284 /*
5285	 * Force NUL termination of the copyin buffer to avoid namei() running
5286 * off the end. If the caller passes us bogus data, they may get a
5287 * bogus result.
5288 */
5289 ((char *)input)[uap->size - 1] = 0;
5290
5291 /*
5292 * Access is defined as checking against the process' real identity,
5293 * even if operations are checking the effective identity. This
5294 * requires that we use a local vfs context.
5295 */
5296 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
5297 context.vc_thread = current_thread();
5298
5299 /*
5300 * Find out how many entries we have, so we can allocate the result
5301 * array by walking the list and adjusting the count downward by the
5302 * earliest string offset we see.
5303 */
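	/*
	 * The "- 2" reserves space for the smallest possible name string:
	 * one character plus its NUL terminator.
	 */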
5304 desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
5305 desc_actual = desc_max;
5306 for (i = 0; i < desc_actual; i++) {
5307 /*
5308 * Take the offset to the name string for this entry and
5309 * convert to an input array index, which would be one off
5310 * the end of the array if this entry was the lowest-addressed
5311 * name string.
5312 */
5313 j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);
5314
5315 /*
5316 * An offset greater than the max allowable offset is an error.
5317 * It is also an error for any valid entry to point
5318 * to a location prior to the end of the current entry, if
5319 * it's not a reference to the string of the previous entry.
5320 */
5321 if (j > desc_max || (j != 0 && j <= i)) {
5322 error = EINVAL;
5323 goto out;
5324 }
5325
5326 /* Also do not let ad_name_offset point to something beyond the size of the input */
5327 if (input[i].ad_name_offset >= uap->size) {
5328 error = EINVAL;
5329 goto out;
5330 }
5331
5332 /*
5333 * An offset of 0 means use the previous descriptor's offset;
5334 * this is used to chain multiple requests for the same file
5335 * to avoid multiple lookups.
5336 */
5337 if (j == 0) {
5338 /* This is not valid for the first entry */
5339 if (i == 0) {
5340 error = EINVAL;
5341 goto out;
5342 }
5343 continue;
5344 }
5345
5346 /*
5347 * If the offset of the string for this descriptor is before
5348 * what we believe is the current actual last descriptor,
5349 * then we need to adjust our estimate downward; this permits
5350 * the string table following the last descriptor to be out
5351 * of order relative to the descriptor list.
5352 */
5353 if (j < desc_actual)
5354 desc_actual = j;
5355 }
5356
5357 /*
5358 * We limit the actual number of descriptors we are willing to process
5359 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS. If the number being
5360	 * requested exceeds this limit, the request fails with ENOMEM.
5361 */
5362 if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
5363 error = ENOMEM;
5364 goto out;
5365 }
5366 MALLOC(result, errno_t *, desc_actual * sizeof(errno_t), M_TEMP, M_WAITOK | M_ZERO);
5367 if (result == NULL) {
5368 error = ENOMEM;
5369 goto out;
5370 }
5371
5372 /*
5373 * Do the work by iterating over the descriptor entries we know to
5374 * at least appear to contain valid data.
5375 */
5376 error = 0;
5377 for (i = 0; i < desc_actual; i++) {
5378 /*
5379 * If the ad_name_offset is 0, then we use the previous
5380 * results to make the check; otherwise, we are looking up
5381 * a new file name.
5382 */
5383 if (input[i].ad_name_offset != 0) {
5384 /* discard old vnodes */
5385 if (vp) {
5386 vnode_put(vp);
5387 vp = NULL;
5388 }
5389 if (dvp) {
5390 vnode_put(dvp);
5391 dvp = NULL;
5392 }
5393
5394 /*
5395 * Scan forward in the descriptor list to see if we
5396 * need the parent vnode. We will need it if we are
5397 * deleting, since we must have rights to remove
5398 * entries in the parent directory, as well as the
5399 * rights to delete the object itself.
5400 */
5401 wantdelete = input[i].ad_flags & _DELETE_OK;
5402 for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++)
5403 if (input[j].ad_flags & _DELETE_OK)
5404 wantdelete = 1;
5405
5406 niopts = FOLLOW | AUDITVNPATH1;
5407
5408 /* need parent for vnode_authorize for deletion test */
5409 if (wantdelete)
5410 niopts |= WANTPARENT;
5411
5412 /* do the lookup */
5413 NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
5414 CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
5415 &context);
5416 error = namei(&nd);
5417 if (!error) {
5418 vp = nd.ni_vp;
5419 if (wantdelete)
5420 dvp = nd.ni_dvp;
5421 }
5422 nameidone(&nd);
5423 }
5424
5425 /*
5426 * Handle lookup errors.
5427 */
5428 switch(error) {
5429 case ENOENT:
5430 case EACCES:
5431 case EPERM:
5432 case ENOTDIR:
5433 result[i] = error;
5434 break;
5435 case 0:
5436 /* run this access check */
5437 result[i] = access1(vp, dvp, input[i].ad_flags, &context);
5438 break;
5439 default:
5440 /* fatal lookup error */
5441
5442 goto out;
5443 }
5444 }
5445
5446 AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);
5447
5448 /* copy out results */
5449 error = copyout(result, uap->results, desc_actual * sizeof(errno_t));
5450
5451out:
5452 if (input && input != stack_input)
5453 FREE(input, M_TEMP);
5454 if (result)
5455 FREE(result, M_TEMP);
5456 if (vp)
5457 vnode_put(vp);
5458 if (dvp)
5459 vnode_put(dvp);
5460 if (IS_VALID_CRED(context.vc_ucred))
5461 kauth_cred_unref(&context.vc_ucred);
5462 return(error);
5463}
5464
5465
5466/*
5467 * Returns: 0 Success
5468 * namei:EFAULT Bad address
5469 * namei:ENAMETOOLONG Filename too long
5470 * namei:ENOENT No such file or directory
5471 * namei:ELOOP Too many levels of symbolic links
5472 * namei:EBADF Bad file descriptor
5473 * namei:ENOTDIR Not a directory
5474 * namei:???
5475 * access1:
5476 */
5477static int
5478faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
5479 int flag, enum uio_seg segflg)
5480{
5481 int error;
5482 struct nameidata nd;
5483 int niopts;
5484 struct vfs_context context;
5485#if NAMEDRSRCFORK
5486 int is_namedstream = 0;
5487#endif
5488
5489 /*
5490	 * Unless the AT_EACCESS option is used, access is defined as checking
5491 * against the process' real identity, even if operations are checking
5492 * the effective identity. So we need to tweak the credential
5493 * in the context for that case.
5494 */
5495 if (!(flag & AT_EACCESS))
5496 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
5497 else
5498 context.vc_ucred = ctx->vc_ucred;
5499 context.vc_thread = ctx->vc_thread;
5500
5501
5502 niopts = FOLLOW | AUDITVNPATH1;
5503 /* need parent for vnode_authorize for deletion test */
5504 if (amode & _DELETE_OK)
5505 niopts |= WANTPARENT;
5506 NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
5507 path, &context);
5508
5509#if NAMEDRSRCFORK
5510 /* access(F_OK) calls are allowed for resource forks. */
5511 if (amode == F_OK)
5512 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
5513#endif
5514 error = nameiat(&nd, fd);
5515 if (error)
5516 goto out;
5517
5518#if NAMEDRSRCFORK
5519 /* Grab reference on the shadow stream file vnode to
5520 * force an inactive on release which will mark it
5521 * for recycle.
5522 */
5523 if (vnode_isnamedstream(nd.ni_vp) &&
5524 (nd.ni_vp->v_parent != NULLVP) &&
5525 vnode_isshadow(nd.ni_vp)) {
5526 is_namedstream = 1;
5527 vnode_ref(nd.ni_vp);
5528 }
5529#endif
5530
5531 error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);
5532
5533#if NAMEDRSRCFORK
5534 if (is_namedstream) {
5535 vnode_rele(nd.ni_vp);
5536 }
5537#endif
5538
5539 vnode_put(nd.ni_vp);
5540 if (amode & _DELETE_OK)
5541 vnode_put(nd.ni_dvp);
5542 nameidone(&nd);
5543
5544out:
5545 if (!(flag & AT_EACCESS))
5546 kauth_cred_unref(&context.vc_ucred);
5547 return (error);
5548}
5549
5550int
5551access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
5552{
5553 return (faccessat_internal(vfs_context_current(), AT_FDCWD,
5554 uap->path, uap->flags, 0, UIO_USERSPACE));
5555}
5556
5557int
5558faccessat(__unused proc_t p, struct faccessat_args *uap,
5559 __unused int32_t *retval)
5560{
5561 if (uap->flag & ~AT_EACCESS)
5562 return (EINVAL);
5563
5564 return (faccessat_internal(vfs_context_current(), uap->fd,
5565 uap->path, uap->amode, uap->flag, UIO_USERSPACE));
5566}
5567
5568/*
5569 * Returns: 0 Success
5570 * EFAULT
5571 * copyout:EFAULT
5572 * namei:???
5573 * vn_stat:???
5574 */
5575static int
5576fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
5577 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
5578 enum uio_seg segflg, int fd, int flag)
5579{
5580 struct nameidata nd;
5581 int follow;
5582 union {
5583 struct stat sb;
5584 struct stat64 sb64;
5585 } source = {};
5586 union {
5587 struct user64_stat user64_sb;
5588 struct user32_stat user32_sb;
5589 struct user64_stat64 user64_sb64;
5590 struct user32_stat64 user32_sb64;
5591 } dest = {};
5592 caddr_t sbp;
5593 int error, my_size;
5594 kauth_filesec_t fsec;
5595 size_t xsecurity_bufsize;
5596 void * statptr;
5597
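	/*
	 * Flow: nameiat() resolves the path, vn_stat() fills 'source' (stat or
	 * stat64 depending on 'isstat64'), the result is munged into the user
	 * layout that matches the process bitness, and that buffer is copied
	 * out to 'ub'; extended security information, if requested, is
	 * returned separately through 'xsecurity'.
	 */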
5598 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
5599 NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
5600 segflg, path, ctx);
5601
5602#if NAMEDRSRCFORK
5603 int is_namedstream = 0;
5604 /* stat calls are allowed for resource forks. */
5605 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
5606#endif
5607 error = nameiat(&nd, fd);
5608 if (error)
5609 return (error);
5610 fsec = KAUTH_FILESEC_NONE;
5611
5612 statptr = (void *)&source;
5613
5614#if NAMEDRSRCFORK
5615 /* Grab reference on the shadow stream file vnode to
5616 * force an inactive on release which will mark it
5617 * for recycle.
5618 */
5619 if (vnode_isnamedstream(nd.ni_vp) &&
5620 (nd.ni_vp->v_parent != NULLVP) &&
5621 vnode_isshadow(nd.ni_vp)) {
5622 is_namedstream = 1;
5623 vnode_ref(nd.ni_vp);
5624 }
5625#endif
5626
5627 error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL), isstat64, ctx);
5628
5629#if NAMEDRSRCFORK
5630 if (is_namedstream) {
5631 vnode_rele(nd.ni_vp);
5632 }
5633#endif
5634 vnode_put(nd.ni_vp);
5635 nameidone(&nd);
5636
5637 if (error)
5638 return (error);
5639 /* Zap spare fields */
5640 if (isstat64 != 0) {
5641 source.sb64.st_lspare = 0;
5642 source.sb64.st_qspare[0] = 0LL;
5643 source.sb64.st_qspare[1] = 0LL;
5644 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
5645 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
5646 my_size = sizeof(dest.user64_sb64);
5647 sbp = (caddr_t)&dest.user64_sb64;
5648 } else {
5649 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
5650 my_size = sizeof(dest.user32_sb64);
5651 sbp = (caddr_t)&dest.user32_sb64;
5652 }
5653 /*
5654 * Check if we raced (post lookup) against the last unlink of a file.
5655 */
5656 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
5657 source.sb64.st_nlink = 1;
5658 }
5659 } else {
5660 source.sb.st_lspare = 0;
5661 source.sb.st_qspare[0] = 0LL;
5662 source.sb.st_qspare[1] = 0LL;
5663 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
5664 munge_user64_stat(&source.sb, &dest.user64_sb);
5665 my_size = sizeof(dest.user64_sb);
5666 sbp = (caddr_t)&dest.user64_sb;
5667 } else {
5668 munge_user32_stat(&source.sb, &dest.user32_sb);
5669 my_size = sizeof(dest.user32_sb);
5670 sbp = (caddr_t)&dest.user32_sb;
5671 }
5672
5673 /*
5674 * Check if we raced (post lookup) against the last unlink of a file.
5675 */
5676 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
5677 source.sb.st_nlink = 1;
5678 }
5679 }
5680 if ((error = copyout(sbp, ub, my_size)) != 0)
5681 goto out;
5682
5683 /* caller wants extended security information? */
5684 if (xsecurity != USER_ADDR_NULL) {
5685
5686 /* did we get any? */
5687 if (fsec == KAUTH_FILESEC_NONE) {
5688 if (susize(xsecurity_size, 0) != 0) {
5689 error = EFAULT;
5690 goto out;
5691 }
5692 } else {
5693 /* find the user buffer size */
5694 xsecurity_bufsize = fusize(xsecurity_size);
5695
5696 /* copy out the actual data size */
5697 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
5698 error = EFAULT;
5699 goto out;
5700 }
5701
5702 /* if the caller supplied enough room, copy out to it */
5703 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec))
5704 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
5705 }
5706 }
5707out:
5708 if (fsec != KAUTH_FILESEC_NONE)
5709 kauth_filesec_free(fsec);
5710 return (error);
5711}
5712
5713/*
5714 * stat_extended: Get file status; with extended security (ACL).
5715 *
5716 * Parameters: p (ignored)
5717 * uap User argument descriptor (see below)
5718 * retval (ignored)
5719 *
5720 * Indirect: uap->path Path of file to get status from
5721 * uap->ub User buffer (holds file status info)
5722 * uap->xsecurity ACL to get (extended security)
5723 * uap->xsecurity_size Size of ACL
5724 *
5725 * Returns: 0 Success
5726 * !0 errno value
5727 *
5728 */
5729int
5730stat_extended(__unused proc_t p, struct stat_extended_args *uap,
5731 __unused int32_t *retval)
5732{
5733 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5734 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
5735 0));
5736}
5737
5738/*
5739 * Returns: 0 Success
5740 * fstatat_internal:??? [see fstatat_internal() in this file]
5741 */
5742int
5743stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
5744{
5745 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5746 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0));
5747}
5748
5749int
5750stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
5751{
5752 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5753 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0));
5754}
5755
5756/*
5757 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
5758 *
5759 * Parameters: p (ignored)
5760 * uap User argument descriptor (see below)
5761 * retval (ignored)
5762 *
5763 * Indirect: uap->path Path of file to get status from
5764 * uap->ub User buffer (holds file status info)
5765 * uap->xsecurity ACL to get (extended security)
5766 * uap->xsecurity_size Size of ACL
5767 *
5768 * Returns: 0 Success
5769 * !0 errno value
5770 *
5771 */
5772int
5773stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
5774{
5775 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5776 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
5777 0));
5778}
5779
5780/*
5781 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
5782 *
5783 * Parameters: p (ignored)
5784 * uap User argument descriptor (see below)
5785 * retval (ignored)
5786 *
5787 * Indirect: uap->path Path of file to get status from
5788 * uap->ub User buffer (holds file status info)
5789 * uap->xsecurity ACL to get (extended security)
5790 * uap->xsecurity_size Size of ACL
5791 *
5792 * Returns: 0 Success
5793 * !0 errno value
5794 *
5795 */
5796int
5797lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
5798{
5799 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5800 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
5801 AT_SYMLINK_NOFOLLOW));
5802}
5803
5804/*
5805 * Get file status; this version does not follow links.
5806 */
5807int
5808lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
5809{
5810 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5811 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW));
5812}
5813
5814int
5815lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
5816{
5817 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5818 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW));
5819}
5820
5821/*
5822 * lstat64_extended: Get file status; can handle large inode numbers; does not
5823 * follow links; with extended security (ACL).
5824 *
5825 * Parameters: p (ignored)
5826 * uap User argument descriptor (see below)
5827 * retval (ignored)
5828 *
5829 * Indirect: uap->path Path of file to get status from
5830 * uap->ub User buffer (holds file status info)
5831 * uap->xsecurity ACL to get (extended security)
5832 * uap->xsecurity_size Size of ACL
5833 *
5834 * Returns: 0 Success
5835 * !0 errno value
5836 *
5837 */
5838int
5839lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
5840{
5841 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5842 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
5843 AT_SYMLINK_NOFOLLOW));
5844}
5845
5846int
5847fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
5848{
5849 if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
5850 return (EINVAL);
5851
5852 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5853 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag));
5854}
5855
5856int
5857fstatat64(__unused proc_t p, struct fstatat64_args *uap,
5858 __unused int32_t *retval)
5859{
5860 if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
5861 return (EINVAL);
5862
5863 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5864 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag));
5865}
5866
5867/*
5868 * Get configurable pathname variables.
5869 *
5870 * Returns: 0 Success
5871 * namei:???
5872 * vn_pathconf:???
5873 *
5874 * Notes: Global implementation constants are intended to be
5875 * implemented in this function directly; all other constants
5876 * are per-FS implementation, and therefore must be handled in
5877 * each respective FS, instead.
5878 *
5879 * XXX We implement some things globally right now that should actually be
5880 * XXX per-FS; we will need to deal with this at some point.
5881 */
5882/* ARGSUSED */
5883int
5884pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
5885{
5886 int error;
5887 struct nameidata nd;
5888 vfs_context_t ctx = vfs_context_current();
5889
5890 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
5891 UIO_USERSPACE, uap->path, ctx);
5892 error = namei(&nd);
5893 if (error)
5894 return (error);
5895
5896 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
5897
5898 vnode_put(nd.ni_vp);
5899 nameidone(&nd);
5900 return (error);
5901}
5902
5903/*
5904 * Return target name of a symbolic link.
5905 */
5906/* ARGSUSED */
5907static int
5908readlinkat_internal(vfs_context_t ctx, int fd, user_addr_t path,
5909 enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
5910 int *retval)
5911{
5912 vnode_t vp;
5913 uio_t auio;
5914 int error;
5915 struct nameidata nd;
5916 char uio_buf[ UIO_SIZEOF(1) ];
5917
5918 NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
5919 seg, path, ctx);
5920
5921 error = nameiat(&nd, fd);
5922 if (error)
5923 return (error);
5924 vp = nd.ni_vp;
5925
5926 nameidone(&nd);
5927
5928 auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
5929 &uio_buf[0], sizeof(uio_buf));
5930 uio_addiov(auio, buf, bufsize);
5931 if (vp->v_type != VLNK) {
5932 error = EINVAL;
5933 } else {
5934#if CONFIG_MACF
5935 error = mac_vnode_check_readlink(ctx, vp);
5936#endif
5937 if (error == 0)
5938 error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
5939 ctx);
5940 if (error == 0)
5941 error = VNOP_READLINK(vp, auio, ctx);
5942 }
5943 vnode_put(vp);
5944
5945 *retval = bufsize - (int)uio_resid(auio);
5946 return (error);
5947}
5948
5949int
5950readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
5951{
5952 enum uio_seg procseg;
5953
5954 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
5955 return (readlinkat_internal(vfs_context_current(), AT_FDCWD,
5956 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
5957 uap->count, procseg, retval));
5958}
5959
5960int
5961readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
5962{
5963 enum uio_seg procseg;
5964
5965 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
5966 return (readlinkat_internal(vfs_context_current(), uap->fd, uap->path,
5967 procseg, uap->buf, uap->bufsize, procseg, retval));
5968}
5969
5970/*
5971 * Change file flags.
5972 *
5973 * NOTE: this will vnode_put() `vp'
5974 */
5975static int
5976chflags1(vnode_t vp, int flags, vfs_context_t ctx)
5977{
5978 struct vnode_attr va;
5979 kauth_action_t action;
5980 int error;
5981
5982 VATTR_INIT(&va);
5983 VATTR_SET(&va, va_flags, flags);
5984
5985#if CONFIG_MACF
5986 error = mac_vnode_check_setflags(ctx, vp, flags);
5987 if (error)
5988 goto out;
5989#endif
5990
5991 /* request authorisation, disregard immutability */
5992 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
5993 goto out;
5994 /*
5995 * Request that the auth layer disregard those file flags it's allowed to when
5996 * authorizing this operation; we need to do this in order to be able to
5997 * clear immutable flags.
5998 */
5999 if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0))
6000 goto out;
6001 error = vnode_setattr(vp, &va, ctx);
6002
6003#if CONFIG_MACF
6004 if (error == 0)
6005 mac_vnode_notify_setflags(ctx, vp, flags);
6006#endif
6007
6008 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
6009 error = ENOTSUP;
6010 }
6011out:
6012 vnode_put(vp);
6013 return(error);
6014}
6015
6016/*
6017 * Change flags of a file given a path name.
6018 */
6019/* ARGSUSED */
6020int
6021chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
6022{
6023 vnode_t vp;
6024 vfs_context_t ctx = vfs_context_current();
6025 int error;
6026 struct nameidata nd;
6027
6028 AUDIT_ARG(fflags, uap->flags);
6029 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
6030 UIO_USERSPACE, uap->path, ctx);
6031 error = namei(&nd);
6032 if (error)
6033 return (error);
6034 vp = nd.ni_vp;
6035 nameidone(&nd);
6036
6037	/* we don't vnode_put() here because chflags1 does it internally */
6038 error = chflags1(vp, uap->flags, ctx);
6039
6040 return(error);
6041}
6042
6043/*
6044 * Change flags of a file given a file descriptor.
6045 */
6046/* ARGSUSED */
6047int
6048fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
6049{
6050 vnode_t vp;
6051 int error;
6052
6053 AUDIT_ARG(fd, uap->fd);
6054 AUDIT_ARG(fflags, uap->flags);
6055 if ( (error = file_vnode(uap->fd, &vp)) )
6056 return (error);
6057
6058 if ((error = vnode_getwithref(vp))) {
6059 file_drop(uap->fd);
6060 return(error);
6061 }
6062
6063 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6064
6065	/* we don't vnode_put() here because chflags1 does it internally */
6066 error = chflags1(vp, uap->flags, vfs_context_current());
6067
6068 file_drop(uap->fd);
6069 return (error);
6070}
6071
6072/*
6073 * Change security information on a filesystem object.
6074 *
6075 * Returns: 0 Success
6076 * EPERM Operation not permitted
6077 * vnode_authattr:??? [anything vnode_authattr can return]
6078 * vnode_authorize:??? [anything vnode_authorize can return]
6079 * vnode_setattr:??? [anything vnode_setattr can return]
6080 *
6081 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
6082 * translated to EPERM before being returned.
6083 */
6084static int
6085chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
6086{
6087 kauth_action_t action;
6088 int error;
6089
6090 AUDIT_ARG(mode, vap->va_mode);
6091 /* XXX audit new args */
6092
6093#if NAMEDSTREAMS
6094 /* chmod calls are not allowed for resource forks. */
6095 if (vp->v_flag & VISNAMEDSTREAM) {
6096 return (EPERM);
6097 }
6098#endif
6099
6100#if CONFIG_MACF
6101 if (VATTR_IS_ACTIVE(vap, va_mode) &&
6102 (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0)
6103 return (error);
6104
6105 if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
6106 if ((error = mac_vnode_check_setowner(ctx, vp,
6107 VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
6108 VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1)))
6109 return (error);
6110 }
6111
6112 if (VATTR_IS_ACTIVE(vap, va_acl) &&
6113 (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl)))
6114 return (error);
6115#endif
6116
6117 /* make sure that the caller is allowed to set this security information */
6118 if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
6119 ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6120 if (error == EACCES)
6121 error = EPERM;
6122 return(error);
6123 }
6124
6125 if ((error = vnode_setattr(vp, vap, ctx)) != 0)
6126 return (error);
6127
6128#if CONFIG_MACF
6129 if (VATTR_IS_ACTIVE(vap, va_mode))
6130 mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
6131
6132 if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid))
6133 mac_vnode_notify_setowner(ctx, vp,
6134 VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
6135 VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
6136
6137 if (VATTR_IS_ACTIVE(vap, va_acl))
6138 mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
6139#endif
6140
6141 return (error);
6142}
6143
6144
6145/*
6146 * Change mode of a file given a path name.
6147 *
6148 * Returns: 0 Success
6149 * namei:??? [anything namei can return]
6150 * chmod_vnode:??? [anything chmod_vnode can return]
6151 */
6152static int
6153chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
6154 int fd, int flag, enum uio_seg segflg)
6155{
6156 struct nameidata nd;
6157 int follow, error;
6158
6159 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6160 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1,
6161 segflg, path, ctx);
6162 if ((error = nameiat(&nd, fd)))
6163 return (error);
6164 error = chmod_vnode(ctx, nd.ni_vp, vap);
6165 vnode_put(nd.ni_vp);
6166 nameidone(&nd);
6167 return(error);
6168}
6169
6170/*
6171 * chmod_extended: Change the mode of a file given a path name; with extended
6172 * argument list (including extended security (ACL)).
6173 *
6174 * Parameters: p Process requesting the open
6175 * uap User argument descriptor (see below)
6176 * retval (ignored)
6177 *
6178 * Indirect: uap->path Path to object (same as 'chmod')
6179 * uap->uid UID to set
6180 * uap->gid GID to set
6181 * uap->mode File mode to set (same as 'chmod')
6182 * uap->xsecurity ACL to set (or delete)
6183 *
6184 * Returns: 0 Success
6185 * !0 errno value
6186 *
6187 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
6188 *
6189 * XXX: We should enumerate the possible errno values here, and where
6190 * in the code they originated.
6191 */
6192int
6193chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
6194{
6195 int error;
6196 struct vnode_attr va;
6197 kauth_filesec_t xsecdst;
6198
6199 AUDIT_ARG(owner, uap->uid, uap->gid);
6200
6201 VATTR_INIT(&va);
6202 if (uap->mode != -1)
6203 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6204 if (uap->uid != KAUTH_UID_NONE)
6205 VATTR_SET(&va, va_uid, uap->uid);
6206 if (uap->gid != KAUTH_GID_NONE)
6207 VATTR_SET(&va, va_gid, uap->gid);
6208
6209 xsecdst = NULL;
6210 switch(uap->xsecurity) {
6211 /* explicit remove request */
6212 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
6213 VATTR_SET(&va, va_acl, NULL);
6214 break;
6215 /* not being set */
6216 case USER_ADDR_NULL:
6217 break;
6218 default:
6219 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
6220 return(error);
6221 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
6222 KAUTH_DEBUG("CHMOD - setting ACL with %d entries", va.va_acl->acl_entrycount);
6223 }
6224
6225 error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
6226 UIO_USERSPACE);
6227
6228 if (xsecdst != NULL)
6229 kauth_filesec_free(xsecdst);
6230 return(error);
6231}
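/*
 * Illustrative userspace sketch (an assumption about the libc wrappers,
 * not something defined in this file): chmod_extended() is normally
 * reached via the chmodx_np(3)/filesec(3) interfaces, which marshal the
 * mode, owner and ACL into the indirect arguments documented above,
 * roughly:
 *
 *	filesec_t fsec = filesec_init();
 *	mode_t m = 0640;				/* hypothetical value */
 *	filesec_set_property(fsec, FILESEC_MODE, &m);
 *	chmodx_np("/tmp/example", fsec);		/* hypothetical path */
 *	filesec_free(fsec);
 */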
6232
6233/*
6234 * Returns: 0 Success
6235 * chmodat:??? [anything chmodat can return]
6236 */
6237static int
6238fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
6239 int flag, enum uio_seg segflg)
6240{
6241 struct vnode_attr va;
6242
6243 VATTR_INIT(&va);
6244 VATTR_SET(&va, va_mode, mode & ALLPERMS);
6245
6246 return (chmodat(ctx, path, &va, fd, flag, segflg));
6247}
6248
6249int
6250chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
6251{
6252 return (fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6253 AT_FDCWD, 0, UIO_USERSPACE));
6254}
6255
6256int
6257fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
6258{
6259 if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
6260 return (EINVAL);
6261
6262 return (fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6263 uap->fd, uap->flag, UIO_USERSPACE));
6264}
6265
6266/*
6267 * Change mode of a file given a file descriptor.
6268 */
6269static int
6270fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
6271{
6272 vnode_t vp;
6273 int error;
6274
6275 AUDIT_ARG(fd, fd);
6276
6277 if ((error = file_vnode(fd, &vp)) != 0)
6278 return (error);
6279 if ((error = vnode_getwithref(vp)) != 0) {
6280 file_drop(fd);
6281 return(error);
6282 }
6283 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6284
6285 error = chmod_vnode(vfs_context_current(), vp, vap);
6286 (void)vnode_put(vp);
6287 file_drop(fd);
6288
6289 return (error);
6290}
6291
6292/*
6293 * fchmod_extended: Change mode of a file given a file descriptor; with
6294 * extended argument list (including extended security (ACL)).
6295 *
6296 * Parameters: p Process requesting to change file mode
6297 * uap User argument descriptor (see below)
6298 * retval (ignored)
6299 *
6300 * Indirect: uap->mode File mode to set (same as 'chmod')
6301 * uap->uid UID to set
6302 * uap->gid GID to set
6303 * uap->xsecurity ACL to set (or delete)
6304 * uap->fd File descriptor of file to change mode
6305 *
6306 * Returns: 0 Success
6307 * !0 errno value
6308 *
6309 */
6310int
6311fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
6312{
6313 int error;
6314 struct vnode_attr va;
6315 kauth_filesec_t xsecdst;
6316
6317 AUDIT_ARG(owner, uap->uid, uap->gid);
6318
6319 VATTR_INIT(&va);
6320 if (uap->mode != -1)
6321 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6322 if (uap->uid != KAUTH_UID_NONE)
6323 VATTR_SET(&va, va_uid, uap->uid);
6324 if (uap->gid != KAUTH_GID_NONE)
6325 VATTR_SET(&va, va_gid, uap->gid);
6326
6327 xsecdst = NULL;
6328 switch(uap->xsecurity) {
6329 case USER_ADDR_NULL:
6330 VATTR_SET(&va, va_acl, NULL);
6331 break;
6332 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
6333 VATTR_SET(&va, va_acl, NULL);
6334 break;
6335 /* not being set */
6336 case CAST_USER_ADDR_T(-1):
6337 break;
6338 default:
6339 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
6340 return(error);
6341 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
6342 }
6343
6344 error = fchmod1(p, uap->fd, &va);
6345
6346
6347 switch(uap->xsecurity) {
6348 case USER_ADDR_NULL:
6349 case CAST_USER_ADDR_T(-1):
6350 break;
6351 default:
6352 if (xsecdst != NULL)
6353 kauth_filesec_free(xsecdst);
6354 }
6355 return(error);
6356}
6357
6358int
6359fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
6360{
6361 struct vnode_attr va;
6362
6363 VATTR_INIT(&va);
6364 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6365
6366 return(fchmod1(p, uap->fd, &va));
6367}
6368
6369
6370/*
6371 * Set ownership given a path name.
6372 */
6373/* ARGSUSED */
6374static int
6375fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
6376 gid_t gid, int flag, enum uio_seg segflg)
6377{
6378 vnode_t vp;
6379 struct vnode_attr va;
6380 int error;
6381 struct nameidata nd;
6382 int follow;
6383 kauth_action_t action;
6384
6385 AUDIT_ARG(owner, uid, gid);
6386
6387 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6388 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg,
6389 path, ctx);
6390 error = nameiat(&nd, fd);
6391 if (error)
6392 return (error);
6393 vp = nd.ni_vp;
6394
6395 nameidone(&nd);
6396
6397 VATTR_INIT(&va);
6398 if (uid != (uid_t)VNOVAL)
6399 VATTR_SET(&va, va_uid, uid);
6400 if (gid != (gid_t)VNOVAL)
6401 VATTR_SET(&va, va_gid, gid);
6402
6403#if CONFIG_MACF
6404 error = mac_vnode_check_setowner(ctx, vp, uid, gid);
6405 if (error)
6406 goto out;
6407#endif
6408
6409 /* preflight and authorize attribute changes */
6410 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
6411 goto out;
6412 if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0))
6413 goto out;
6414 error = vnode_setattr(vp, &va, ctx);
6415
6416#if CONFIG_MACF
6417 if (error == 0)
6418 mac_vnode_notify_setowner(ctx, vp, uid, gid);
6419#endif
6420
6421out:
6422 /*
6423 * EACCES is only allowed from namei(); permissions failure should
6424 * return EPERM, so we need to translate the error code.
6425 */
6426 if (error == EACCES)
6427 error = EPERM;
6428
6429 vnode_put(vp);
6430 return (error);
6431}
6432
6433int
6434chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
6435{
6436 return (fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6437 uap->uid, uap->gid, 0, UIO_USERSPACE));
6438}
6439
6440int
6441lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
6442{
6443 return (fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6444 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE));
6445}
6446
6447int
6448fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
6449{
6450 if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
6451 return (EINVAL);
6452
6453 return (fchownat_internal(vfs_context_current(), uap->fd, uap->path,
6454 uap->uid, uap->gid, uap->flag, UIO_USERSPACE));
6455}
6456
6457/*
6458 * Set ownership given a file descriptor.
6459 */
6460/* ARGSUSED */
6461int
6462fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
6463{
6464 struct vnode_attr va;
6465 vfs_context_t ctx = vfs_context_current();
6466 vnode_t vp;
6467 int error;
6468 kauth_action_t action;
6469
6470 AUDIT_ARG(owner, uap->uid, uap->gid);
6471 AUDIT_ARG(fd, uap->fd);
6472
6473 if ( (error = file_vnode(uap->fd, &vp)) )
6474 return (error);
6475
6476 if ( (error = vnode_getwithref(vp)) ) {
6477 file_drop(uap->fd);
6478 return(error);
6479 }
6480 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6481
6482 VATTR_INIT(&va);
6483 if (uap->uid != VNOVAL)
6484 VATTR_SET(&va, va_uid, uap->uid);
6485 if (uap->gid != VNOVAL)
6486 VATTR_SET(&va, va_gid, uap->gid);
6487
6488#if NAMEDSTREAMS
6489 /* chown calls are not allowed for resource forks. */
6490 if (vp->v_flag & VISNAMEDSTREAM) {
6491 error = EPERM;
6492 goto out;
6493 }
6494#endif
6495
6496#if CONFIG_MACF
6497 error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
6498 if (error)
6499 goto out;
6500#endif
6501
6502 /* preflight and authorize attribute changes */
6503 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
6504 goto out;
6505 if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6506 if (error == EACCES)
6507 error = EPERM;
6508 goto out;
6509 }
6510 error = vnode_setattr(vp, &va, ctx);
6511
6512#if CONFIG_MACF
6513 if (error == 0)
6514 mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
6515#endif
6516
6517out:
6518 (void)vnode_put(vp);
6519 file_drop(uap->fd);
6520 return (error);
6521}
6522
6523static int
6524getutimes(user_addr_t usrtvp, struct timespec *tsp)
6525{
6526 int error;
6527
6528 if (usrtvp == USER_ADDR_NULL) {
6529 struct timeval old_tv;
6530 /* XXX Y2038 bug because of microtime argument */
6531 microtime(&old_tv);
6532 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
6533 tsp[1] = tsp[0];
6534 } else {
6535 if (IS_64BIT_PROCESS(current_proc())) {
6536 struct user64_timeval tv[2];
6537 error = copyin(usrtvp, (void *)tv, sizeof(tv));
6538 if (error)
6539 return (error);
6540 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
6541 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
6542 } else {
6543 struct user32_timeval tv[2];
6544 error = copyin(usrtvp, (void *)tv, sizeof(tv));
6545 if (error)
6546 return (error);
6547 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
6548 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
6549 }
6550 }
6551 return 0;
6552}
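/*
 * For clarity, the standard TIMEVAL_TO_TIMESPEC() conversion used above
 * amounts to widening microseconds to nanoseconds:
 *
 *	tsp[i].tv_sec  = tv[i].tv_sec;
 *	tsp[i].tv_nsec = tv[i].tv_usec * 1000;
 *
 * When usrtvp is USER_ADDR_NULL both entries are set to the current time
 * and the callers pass nullflag to setutimes() below, which marks the
 * request with VA_UTIMES_NULL so the utimes(path, NULL) permission rules
 * can be applied.
 */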
6553
6554static int
6555setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
6556 int nullflag)
6557{
6558 int error;
6559 struct vnode_attr va;
6560 kauth_action_t action;
6561
6562 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6563
6564 VATTR_INIT(&va);
6565 VATTR_SET(&va, va_access_time, ts[0]);
6566 VATTR_SET(&va, va_modify_time, ts[1]);
6567 if (nullflag)
6568 va.va_vaflags |= VA_UTIMES_NULL;
6569
6570#if NAMEDSTREAMS
6571 /* utimes calls are not allowed for resource forks. */
6572 if (vp->v_flag & VISNAMEDSTREAM) {
6573 error = EPERM;
6574 goto out;
6575 }
6576#endif
6577
6578#if CONFIG_MACF
6579 error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
6580 if (error)
6581 goto out;
6582#endif
6583 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
6584 if (!nullflag && error == EACCES)
6585 error = EPERM;
6586 goto out;
6587 }
6588
6589 /* since we may not need to auth anything, check here */
6590 if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6591 if (!nullflag && error == EACCES)
6592 error = EPERM;
6593 goto out;
6594 }
6595 error = vnode_setattr(vp, &va, ctx);
6596
6597#if CONFIG_MACF
6598 if (error == 0)
6599 mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
6600#endif
6601
6602out:
6603 return error;
6604}
6605
6606/*
6607 * Set the access and modification times of a file.
6608 */
6609/* ARGSUSED */
6610int
6611utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
6612{
6613 struct timespec ts[2];
6614 user_addr_t usrtvp;
6615 int error;
6616 struct nameidata nd;
6617 vfs_context_t ctx = vfs_context_current();
6618
6619 /*
6620 * AUDIT: Needed to change the order of operations to do the
6621 * name lookup first because auditing wants the path.
6622 */
6623 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
6624 UIO_USERSPACE, uap->path, ctx);
6625 error = namei(&nd);
6626 if (error)
6627 return (error);
6628 nameidone(&nd);
6629
6630 /*
6631 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
6632 * the current time instead.
6633 */
6634 usrtvp = uap->tptr;
6635 if ((error = getutimes(usrtvp, ts)) != 0)
6636 goto out;
6637
6638 error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);
6639
6640out:
6641 vnode_put(nd.ni_vp);
6642 return (error);
6643}
6644
6645/*
6646 * Set the access and modification times of a file.
6647 */
6648/* ARGSUSED */
6649int
6650futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
6651{
6652 struct timespec ts[2];
6653 vnode_t vp;
6654 user_addr_t usrtvp;
6655 int error;
6656
6657 AUDIT_ARG(fd, uap->fd);
6658 usrtvp = uap->tptr;
6659 if ((error = getutimes(usrtvp, ts)) != 0)
6660 return (error);
6661 if ((error = file_vnode(uap->fd, &vp)) != 0)
6662 return (error);
6663 if((error = vnode_getwithref(vp))) {
6664 file_drop(uap->fd);
6665 return(error);
6666 }
6667
6668 error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
6669 vnode_put(vp);
6670 file_drop(uap->fd);
6671 return(error);
6672}
6673
6674/*
6675 * Truncate a file given its path name.
6676 */
6677/* ARGSUSED */
6678int
6679truncate(__unused proc_t p, struct truncate_args *uap, __unused int32_t *retval)
6680{
6681 vnode_t vp;
6682 struct vnode_attr va;
6683 vfs_context_t ctx = vfs_context_current();
6684 int error;
6685 struct nameidata nd;
6686 kauth_action_t action;
6687
6688 if (uap->length < 0)
6689 return(EINVAL);
6690 NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
6691 UIO_USERSPACE, uap->path, ctx);
6692 if ((error = namei(&nd)))
6693 return (error);
6694 vp = nd.ni_vp;
6695
6696 nameidone(&nd);
6697
6698 VATTR_INIT(&va);
6699 VATTR_SET(&va, va_data_size, uap->length);
6700
6701#if CONFIG_MACF
6702 error = mac_vnode_check_truncate(ctx, NOCRED, vp);
6703 if (error)
6704 goto out;
6705#endif
6706
6707 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
6708 goto out;
6709 if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0))
6710 goto out;
6711 error = vnode_setattr(vp, &va, ctx);
6712
6713#if CONFIG_MACF
6714 if (error == 0)
6715 mac_vnode_notify_truncate(ctx, NOCRED, vp);
6716#endif
6717
6718out:
6719 vnode_put(vp);
6720 return (error);
6721}
6722
6723/*
6724 * Truncate a file given a file descriptor.
6725 */
6726/* ARGSUSED */
6727int
6728ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
6729{
6730 vfs_context_t ctx = vfs_context_current();
6731 struct vnode_attr va;
6732 vnode_t vp;
6733 struct fileproc *fp;
6734 int error ;
6735 int fd = uap->fd;
6736
6737 AUDIT_ARG(fd, uap->fd);
6738 if (uap->length < 0)
6739 return(EINVAL);
6740
6741 if ( (error = fp_lookup(p,fd,&fp,0)) ) {
6742 return(error);
6743 }
6744
6745 switch (FILEGLOB_DTYPE(fp->f_fglob)) {
6746 case DTYPE_PSXSHM:
6747 error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
6748 goto out;
6749 case DTYPE_VNODE:
6750 break;
6751 default:
6752 error = EINVAL;
6753 goto out;
6754 }
6755
6756 vp = (vnode_t)fp->f_fglob->fg_data;
6757
6758 if ((fp->f_fglob->fg_flag & FWRITE) == 0) {
6759 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
6760 error = EINVAL;
6761 goto out;
6762 }
6763
6764 if ((error = vnode_getwithref(vp)) != 0) {
6765 goto out;
6766 }
6767
6768 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6769
6770#if CONFIG_MACF
6771 error = mac_vnode_check_truncate(ctx,
6772 fp->f_fglob->fg_cred, vp);
6773 if (error) {
6774 (void)vnode_put(vp);
6775 goto out;
6776 }
6777#endif
6778 VATTR_INIT(&va);
6779 VATTR_SET(&va, va_data_size, uap->length);
6780 error = vnode_setattr(vp, &va, ctx);
6781
6782#if CONFIG_MACF
6783 if (error == 0)
6784 mac_vnode_notify_truncate(ctx, fp->f_fglob->fg_cred, vp);
6785#endif
6786
6787 (void)vnode_put(vp);
6788out:
6789 file_drop(fd);
6790 return (error);
6791}
6792
6793
6794/*
6795 * Sync an open file with synchronized I/O _file_ integrity completion
6796 */
6797/* ARGSUSED */
6798int
6799fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
6800{
6801 __pthread_testcancel(1);
6802 return(fsync_common(p, uap, MNT_WAIT));
6803}
6804
6805
6806/*
6807 * Sync an open file with synchronized I/O _file_ integrity completion
6808 *
6809 * Notes: This is a legacy support function that does not test for
6810 * thread cancellation points.
6811 */
6812/* ARGSUSED */
6813int
6814fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
6815{
6816 return(fsync_common(p, (struct fsync_args *)uap, MNT_WAIT));
6817}
6818
6819
6820/*
6821 * Sync an open file with synchronized I/O _data_ integrity completion
6822 */
6823/* ARGSUSED */
6824int
6825fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
6826{
6827 __pthread_testcancel(1);
6828 return(fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT));
6829}
6830
6831
6832/*
6833 * fsync_common
6834 *
6835 * Common fsync code to support both synchronized I/O file integrity completion
6836 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
6837 *
6838 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
6839 * will only guarantee that the file data contents are retrievable. If
6840 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which
6841 * additionally requires that metadata not needed for retrieving the file
6842 * data contents, such as atime, mtime, ctime, etc., also be committed to
6843 * stable storage.
6844 *
6845 * Parameters: p The process
6846 * uap->fd The descriptor to synchronize
6847 * flags The data integrity flags
6848 *
6849 * Returns: int Success
6850 * fp_getfvp:EBADF Bad file descriptor
6851 * fp_getfvp:ENOTSUP fd does not refer to a vnode
6852 * VNOP_FSYNC:??? unspecified
6853 *
6854 * Notes: We use struct fsync_args because it is a short name, and all
6855 * caller argument structures are otherwise identical.
6856 */
6857static int
6858fsync_common(proc_t p, struct fsync_args *uap, int flags)
6859{
6860 vnode_t vp;
6861 struct fileproc *fp;
6862 vfs_context_t ctx = vfs_context_current();
6863 int error;
6864
6865 AUDIT_ARG(fd, uap->fd);
6866
6867 if ( (error = fp_getfvp(p, uap->fd, &fp, &vp)) )
6868 return (error);
6869 if ( (error = vnode_getwithref(vp)) ) {
6870 file_drop(uap->fd);
6871 return(error);
6872 }
6873
6874 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6875
6876 error = VNOP_FSYNC(vp, flags, ctx);
6877
6878#if NAMEDRSRCFORK
6879 /* Sync resource fork shadow file if necessary. */
6880 if ((error == 0) &&
6881 (vp->v_flag & VISNAMEDSTREAM) &&
6882 (vp->v_parent != NULLVP) &&
6883 vnode_isshadow(vp) &&
6884 (fp->f_flags & FP_WRITTEN)) {
6885 (void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
6886 }
6887#endif
6888
6889 (void)vnode_put(vp);
6890 file_drop(uap->fd);
6891 return (error);
6892}
6893
6894/*
6895 * Duplicate files. Source must be a file, target must be a file or
6896 * must not exist.
6897 *
6898 * XXX Copyfile authorisation checking is woefully inadequate, and will not
6899 * perform inheritance correctly.
6900 */
6901/* ARGSUSED */
6902int
6903copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
6904{
6905 vnode_t tvp, fvp, tdvp, sdvp;
6906 struct nameidata fromnd, tond;
6907 int error;
6908 vfs_context_t ctx = vfs_context_current();
6909#if CONFIG_MACF
6910 struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
6911 struct vnode_attr va;
6912#endif
6913
6914 /* Check that the flags are valid. */
6915
6916 if (uap->flags & ~CPF_MASK) {
6917 return(EINVAL);
6918 }
6919
6920 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
6921 UIO_USERSPACE, uap->from, ctx);
6922 if ((error = namei(&fromnd)))
6923 return (error);
6924 fvp = fromnd.ni_vp;
6925
6926 NDINIT(&tond, CREATE, OP_LINK,
6927 LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
6928 UIO_USERSPACE, uap->to, ctx);
6929 if ((error = namei(&tond))) {
6930 goto out1;
6931 }
6932 tdvp = tond.ni_dvp;
6933 tvp = tond.ni_vp;
6934
6935 if (tvp != NULL) {
6936 if (!(uap->flags & CPF_OVERWRITE)) {
6937 error = EEXIST;
6938 goto out;
6939 }
6940 }
6941
6942 if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
6943 error = EISDIR;
6944 goto out;
6945 }
6946
6947 /* This calls existing MAC hooks for open */
6948 if ((error = vn_authorize_open_existing(fvp, &fromnd.ni_cnd, FREAD, ctx,
6949 NULL))) {
6950 goto out;
6951 }
6952
6953 if (tvp) {
6954 /*
6955 * See unlinkat_internal for an explanation of the potential
6956 * ENOENT from the MAC hook, but the gist is that the MAC hook
6957 * can fail because vn_getpath isn't able to return the full
6958 * path. We choose to ignore this failure.
6959 */
6960 error = vn_authorize_unlink(tdvp, tvp, &tond.ni_cnd, ctx, NULL);
6961 if (error && error != ENOENT)
6962 goto out;
6963 error = 0;
6964 }
6965
6966#if CONFIG_MACF
6967 VATTR_INIT(&va);
6968 VATTR_SET(&va, va_type, fvp->v_type);
6969 /* Mask off all but regular access permissions */
6970 VATTR_SET(&va, va_mode,
6971 ((((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT) & ACCESSPERMS));
6972 error = mac_vnode_check_create(ctx, tdvp, &tond.ni_cnd, &va);
6973 if (error)
6974 goto out;
6975#endif /* CONFIG_MACF */
6976
6977 if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
6978 goto out;
6979
6980 if (fvp == tdvp)
6981 error = EINVAL;
6982 /*
6983 * If source is the same as the destination (that is the
6984 * same inode number) then there is nothing to do.
6985 * (fixed to have POSIX semantics - CSM 3/2/98)
6986 */
6987 if (fvp == tvp)
6988 error = -1;
6989 if (!error)
6990 error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
6991out:
6992 sdvp = tond.ni_startdir;
6993 /*
6994 * nameidone has to happen before we vnode_put(tdvp)
6995 * since it may need to release the fs_nodelock on the tdvp
6996 */
6997 nameidone(&tond);
6998
6999 if (tvp)
7000 vnode_put(tvp);
7001 vnode_put(tdvp);
7002 vnode_put(sdvp);
7003out1:
7004 vnode_put(fvp);
7005
7006 nameidone(&fromnd);
7007
7008 if (error == -1)
7009 return (0);
7010 return (error);
7011}
7012
7013#define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
7014
7015/*
7016 * Helper function for doing clones. The caller is expected to provide an
7017 * iocounted source vnode and release it.
7018 */
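/*
 * As used below: clonefileat() passes data_read_authorised == FALSE so the
 * source's KAUTH_VNODE_READ_DATA right is checked here, while
 * fclonefileat() passes TRUE because it has already verified that the
 * source file descriptor was opened with FREAD.
 */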
7019static int
7020clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
7021 user_addr_t dst, uint32_t flags, vfs_context_t ctx)
7022{
7023 vnode_t tvp, tdvp;
7024 struct nameidata tond;
7025 int error;
7026 int follow;
7027 boolean_t free_src_acl;
7028 boolean_t attr_cleanup;
7029 enum vtype v_type;
7030 kauth_action_t action;
7031 struct componentname *cnp;
7032 uint32_t defaulted;
7033 struct vnode_attr va;
7034 struct vnode_attr nva;
7035 uint32_t vnop_flags;
7036
7037 v_type = vnode_vtype(fvp);
7038 switch (v_type) {
7039 case VLNK:
7040 /* FALLTHRU */
7041 case VREG:
7042 action = KAUTH_VNODE_ADD_FILE;
7043 break;
7044 case VDIR:
7045 if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
7046 fvp->v_mountedhere) {
7047 return (EINVAL);
7048 }
7049 action = KAUTH_VNODE_ADD_SUBDIRECTORY;
7050 break;
7051 default:
7052 return (EINVAL);
7053 }
7054
7055 AUDIT_ARG(fd2, dst_dirfd);
7056 AUDIT_ARG(value32, flags);
7057
7058 follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
7059 NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
7060 UIO_USERSPACE, dst, ctx);
7061 if ((error = nameiat(&tond, dst_dirfd)))
7062 return (error);
7063 cnp = &tond.ni_cnd;
7064 tdvp = tond.ni_dvp;
7065 tvp = tond.ni_vp;
7066
7067 free_src_acl = FALSE;
7068 attr_cleanup = FALSE;
7069
7070 if (tvp != NULL) {
7071 error = EEXIST;
7072 goto out;
7073 }
7074
7075 if (vnode_mount(tdvp) != vnode_mount(fvp)) {
7076 error = EXDEV;
7077 goto out;
7078 }
7079
7080#if CONFIG_MACF
7081 if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp)))
7082 goto out;
7083#endif
7084 if ((error = vnode_authorize(tdvp, NULL, action, ctx)))
7085 goto out;
7086
7087 action = KAUTH_VNODE_GENERIC_READ_BITS;
7088 if (data_read_authorised)
7089 action &= ~KAUTH_VNODE_READ_DATA;
7090 if ((error = vnode_authorize(fvp, NULL, action, ctx)))
7091 goto out;
7092
7093 /*
7094 * Certain attributes may need to be changed from the source; we ask for
7095 * those here.
7096 */
7097 VATTR_INIT(&va);
7098 VATTR_WANTED(&va, va_uid);
7099 VATTR_WANTED(&va, va_gid);
7100 VATTR_WANTED(&va, va_mode);
7101 VATTR_WANTED(&va, va_flags);
7102 VATTR_WANTED(&va, va_acl);
7103
7104 if ((error = vnode_getattr(fvp, &va, ctx)) != 0)
7105 goto out;
7106
7107 VATTR_INIT(&nva);
7108 VATTR_SET(&nva, va_type, v_type);
7109 if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
7110 VATTR_SET(&nva, va_acl, va.va_acl);
7111 free_src_acl = TRUE;
7112 }
7113
7114 /* Handle ACL inheritance, initialize vap. */
7115 if (v_type == VLNK) {
7116 error = vnode_authattr_new(tdvp, &nva, 0, ctx);
7117 } else {
7118 error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
7119 if (error)
7120 goto out;
7121 attr_cleanup = TRUE;
7122 }
7123
7124 vnop_flags = VNODE_CLONEFILE_DEFAULT;
7125 /*
7126 * We've got initial values for all security parameters.
7127 * If we are superuser, then we can change owners to be the
7128 * same as the source. Both superuser and the owner have default
7129 * WRITE_SECURITY privileges so all other fields can be taken
7130 * from source as well.
7131 */
7132 if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
7133 if (VATTR_IS_SUPPORTED(&va, va_uid))
7134 VATTR_SET(&nva, va_uid, va.va_uid);
7135 if (VATTR_IS_SUPPORTED(&va, va_gid))
7136 VATTR_SET(&nva, va_gid, va.va_gid);
7137 } else {
7138 vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
7139 }
7140
7141 if (VATTR_IS_SUPPORTED(&va, va_mode))
7142 VATTR_SET(&nva, va_mode, va.va_mode);
7143 if (VATTR_IS_SUPPORTED(&va, va_flags)) {
7144 VATTR_SET(&nva, va_flags,
7145 ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
7146 (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
7147 }
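	/*
	 * Illustrative example with hypothetical values: if the source
	 * carries (UF_HIDDEN | UF_DATAVAULT) and the destination-side
	 * preparation above left SF_RESTRICTED in nva, the clone ends up
	 * with (UF_HIDDEN | SF_RESTRICTED); every source flag is copied
	 * except UF_DATAVAULT and SF_RESTRICTED, which are always taken
	 * from the destination side.
	 */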
7148
7149 error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);
7150
7151 if (!error && tvp) {
7152 int update_flags = 0;
7153#if CONFIG_FSE
7154 int fsevent;
7155#endif /* CONFIG_FSE */
7156
7157#if CONFIG_MACF
7158 (void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
7159 VNODE_LABEL_CREATE, ctx);
7160#endif
7161 /*
7162 * If some of the requested attributes weren't handled by the
7163 * VNOP, use our fallback code.
7164 */
7165 if (!VATTR_ALL_SUPPORTED(&va))
7166 (void)vnode_setattr_fallback(tvp, &nva, ctx);
7167
7168 // Make sure the name & parent pointers are hooked up
7169 if (tvp->v_name == NULL)
7170 update_flags |= VNODE_UPDATE_NAME;
7171 if (tvp->v_parent == NULLVP)
7172 update_flags |= VNODE_UPDATE_PARENT;
7173
7174 if (update_flags) {
7175 (void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
7176 cnp->cn_namelen, cnp->cn_hash, update_flags);
7177 }
7178
7179#if CONFIG_FSE
7180 switch (vnode_vtype(tvp)) {
7181 case VLNK:
7182 /* FALLTHRU */
7183 case VREG:
7184 fsevent = FSE_CREATE_FILE;
7185 break;
7186 case VDIR:
7187 fsevent = FSE_CREATE_DIR;
7188 break;
7189 default:
7190 goto out;
7191 }
7192
7193 if (need_fsevent(fsevent, tvp)) {
7194 /*
7195 * The following is a sequence of three explicit events.
7196 * A pair of FSE_CLONE events representing the source and destination
7197 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
7198 * fseventsd may coalesce the destination clone and create events
7199 * into a single event resulting in the following sequence for a client
7200 * FSE_CLONE (src)
7201 * FSE_CLONE | FSE_CREATE (dst)
7202 */
7203 add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
7204 FSE_ARG_DONE);
7205 add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
7206 FSE_ARG_DONE);
7207 }
7208#endif /* CONFIG_FSE */
7209 }
7210
7211out:
7212 if (attr_cleanup)
7213 vn_attribute_cleanup(&nva, defaulted);
7214 if (free_src_acl && va.va_acl)
7215 kauth_acl_free(va.va_acl);
7216 nameidone(&tond);
7217 if (tvp)
7218 vnode_put(tvp);
7219 vnode_put(tdvp);
7220 return (error);
7221}
7222
7223/*
7224 * clone files or directories, target must not exist.
7225 */
7226/* ARGSUSED */
7227int
7228clonefileat(__unused proc_t p, struct clonefileat_args *uap,
7229 __unused int32_t *retval)
7230{
7231 vnode_t fvp;
7232 struct nameidata fromnd;
7233 int follow;
7234 int error;
7235 vfs_context_t ctx = vfs_context_current();
7236
7237 /* Check that the flags are valid. */
7238 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY))
7239 return (EINVAL);
7240
7241 AUDIT_ARG(fd, uap->src_dirfd);
7242
7243 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
7244 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
7245 UIO_USERSPACE, uap->src, ctx);
7246 if ((error = nameiat(&fromnd, uap->src_dirfd)))
7247 return (error);
7248
7249 fvp = fromnd.ni_vp;
7250 nameidone(&fromnd);
7251
7252 error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
7253 uap->flags, ctx);
7254
7255 vnode_put(fvp);
7256 return (error);
7257}
7258
7259int
7260fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
7261 __unused int32_t *retval)
7262{
7263 vnode_t fvp;
7264 struct fileproc *fp;
7265 int error;
7266 vfs_context_t ctx = vfs_context_current();
7267
7268 /* Check that the flags are valid. */
7269 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY))
7270 return (EINVAL);
7271
7272 AUDIT_ARG(fd, uap->src_fd);
7273 error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
7274 if (error)
7275 return (error);
7276
7277 if ((fp->f_fglob->fg_flag & FREAD) == 0) {
7278 AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
7279 error = EBADF;
7280 goto out;
7281 }
7282
7283 if ((error = vnode_getwithref(fvp)))
7284 goto out;
7285
7286 AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
7287
7288 error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
7289 uap->flags, ctx);
7290
7291 vnode_put(fvp);
7292out:
7293 file_drop(uap->src_fd);
7294 return (error);
7295}
7296
7297/*
7298 * Rename files. Source and destination must either both be directories,
7299 * or both not be directories. If target is a directory, it must be empty.
7300 */
7301/* ARGSUSED */
7302static int
7303renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
7304 int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags)
7305{
7306 if (flags & ~VFS_RENAME_FLAGS_MASK)
7307 return EINVAL;
7308
7309 if (ISSET(flags, VFS_RENAME_SWAP) && ISSET(flags, VFS_RENAME_EXCL))
7310 return EINVAL;
7311
7312 vnode_t tvp, tdvp;
7313 vnode_t fvp, fdvp;
7314 struct nameidata *fromnd, *tond;
7315 int error;
7316 int do_retry;
7317 int retry_count;
7318 int mntrename;
7319 int need_event;
7320 const char *oname = NULL;
7321 char *from_name = NULL, *to_name = NULL;
7322 int from_len=0, to_len=0;
7323 int holding_mntlock;
7324 mount_t locked_mp = NULL;
7325 vnode_t oparent = NULLVP;
7326#if CONFIG_FSE
7327 fse_info from_finfo, to_finfo;
7328#endif
7329 int from_truncated=0, to_truncated;
7330 int batched = 0;
7331 struct vnode_attr *fvap, *tvap;
7332 int continuing = 0;
7333 /* carving out a chunk for structs that are too big to be on stack. */
7334 struct {
7335 struct nameidata from_node, to_node;
7336 struct vnode_attr fv_attr, tv_attr;
7337 } * __rename_data;
7338 MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
7339 fromnd = &__rename_data->from_node;
7340 tond = &__rename_data->to_node;
7341
7342 holding_mntlock = 0;
7343 do_retry = 0;
7344 retry_count = 0;
7345retry:
7346 fvp = tvp = NULL;
7347 fdvp = tdvp = NULL;
7348 fvap = tvap = NULL;
7349 mntrename = FALSE;
7350
7351 NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
7352 segflg, from, ctx);
7353 fromnd->ni_flag = NAMEI_COMPOUNDRENAME;
7354
7355 NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
7356 segflg, to, ctx);
7357 tond->ni_flag = NAMEI_COMPOUNDRENAME;
7358
7359continue_lookup:
7360 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
7361 if ( (error = nameiat(fromnd, fromfd)) )
7362 goto out1;
7363 fdvp = fromnd->ni_dvp;
7364 fvp = fromnd->ni_vp;
7365
7366 if (fvp && fvp->v_type == VDIR)
7367 tond->ni_cnd.cn_flags |= WILLBEDIR;
7368 }
7369
7370 if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
7371 if ( (error = nameiat(tond, tofd)) ) {
7372 /*
7373 * Translate error code for rename("dir1", "dir2/.").
7374 */
7375 if (error == EISDIR && fvp->v_type == VDIR)
7376 error = EINVAL;
7377 goto out1;
7378 }
7379 tdvp = tond->ni_dvp;
7380 tvp = tond->ni_vp;
7381 }
7382
7383#if DEVELOPMENT || DEBUG
7384 /*
7385 * XXX VSWAP: Check for entitlements or special flag here
7386 * so we can restrict access appropriately.
7387 */
7388#else /* DEVELOPMENT || DEBUG */
7389
7390 if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
7391 error = EPERM;
7392 goto out1;
7393 }
7394
7395 if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
7396 error = EPERM;
7397 goto out1;
7398 }
7399#endif /* DEVELOPMENT || DEBUG */
7400
7401 if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
7402 error = ENOENT;
7403 goto out1;
7404 }
7405
7406 if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
7407 error = EEXIST;
7408 goto out1;
7409 }
7410
7411 batched = vnode_compound_rename_available(fdvp);
7412
7413#if CONFIG_FSE
7414 need_event = need_fsevent(FSE_RENAME, fdvp);
7415 if (need_event) {
7416 if (fvp) {
7417 get_fse_info(fvp, &from_finfo, ctx);
7418 } else {
7419 error = vfs_get_notify_attributes(&__rename_data->fv_attr);
7420 if (error) {
7421 goto out1;
7422 }
7423
7424 fvap = &__rename_data->fv_attr;
7425 }
7426
7427 if (tvp) {
7428 get_fse_info(tvp, &to_finfo, ctx);
7429 } else if (batched) {
7430 error = vfs_get_notify_attributes(&__rename_data->tv_attr);
7431 if (error) {
7432 goto out1;
7433 }
7434
7435 tvap = &__rename_data->tv_attr;
7436 }
7437 }
7438#else
7439 need_event = 0;
7440#endif /* CONFIG_FSE */
7441
7442 if (need_event || kauth_authorize_fileop_has_listeners()) {
7443 if (from_name == NULL) {
7444 GET_PATH(from_name);
7445 if (from_name == NULL) {
7446 error = ENOMEM;
7447 goto out1;
7448 }
7449 }
7450
7451 from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
7452
7453 if (to_name == NULL) {
7454 GET_PATH(to_name);
7455 if (to_name == NULL) {
7456 error = ENOMEM;
7457 goto out1;
7458 }
7459 }
7460
7461 to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
7462 }
7463 if (!fvp) {
7464 /*
7465 * Claim: this check will never reject a valid rename.
7466 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
7467 * Suppose fdvp and tdvp are not on the same mount.
7468 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
7469 * then you can't move it to within another dir on the same mountpoint.
7470 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
7471 *
7472 * If this check passes, then we are safe to pass these vnodes to the same FS.
7473 */
7474 if (fdvp->v_mount != tdvp->v_mount) {
7475 error = EXDEV;
7476 goto out1;
7477 }
7478 goto skipped_lookup;
7479 }
7480
7481 if (!batched) {
7482 error = vn_authorize_renamex_with_paths(fdvp, fvp, &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx, flags, NULL);
7483 if (error) {
7484 if (error == ENOENT) {
7485 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
7486 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
7487 /*
7488 * We encountered a race where after doing the namei, tvp stops
7489 * being valid. If so, simply re-drive the rename call from the
7490 * top.
7491 */
7492 do_retry = 1;
7493 retry_count += 1;
7494 }
7495 }
7496 goto out1;
7497 }
7498 }
7499
7500 /*
7501 * If the source and destination are the same (i.e. they're
7502 * links to the same vnode) and the target file system is
7503 * case sensitive, then there is nothing to do.
7504 *
7505 * XXX Come back to this.
7506 */
7507 if (fvp == tvp) {
7508 int pathconf_val;
7509
7510 /*
7511 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
7512 * then assume that this file system is case sensitive.
7513 */
7514 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
7515 pathconf_val != 0) {
7516 goto out1;
7517 }
7518 }
7519
7520 /*
7521 * Allow the renaming of mount points.
7522 * - target must not exist
7523 * - target must reside in the same directory as source
7524 * - union mounts cannot be renamed
7525 * - "/" cannot be renamed
7526 *
7527 * XXX Handle this in VFS after a continued lookup (if we missed
7528 * in the cache to start off)
7529 *
7530 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
7531 * we'll skip past here. The file system is responsible for
7532 * checking that @tvp is not a descendent of @fvp and vice versa
7533 * so it should always return EINVAL if either @tvp or @fvp is the
7534 * root of a volume.
7535 */
7536 if ((fvp->v_flag & VROOT) &&
7537 (fvp->v_type == VDIR) &&
7538 (tvp == NULL) &&
7539 (fvp->v_mountedhere == NULL) &&
7540 (fdvp == tdvp) &&
7541 ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
7542 (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
7543 vnode_t coveredvp;
7544
7545 /* switch fvp to the covered vnode */
7546 coveredvp = fvp->v_mount->mnt_vnodecovered;
7547 if ( (vnode_getwithref(coveredvp)) ) {
7548 error = ENOENT;
7549 goto out1;
7550 }
7551 vnode_put(fvp);
7552
7553 fvp = coveredvp;
7554 mntrename = TRUE;
7555 }
7556 /*
7557 * Check for cross-device rename.
7558 */
7559 if ((fvp->v_mount != tdvp->v_mount) ||
7560 (tvp && (fvp->v_mount != tvp->v_mount))) {
7561 error = EXDEV;
7562 goto out1;
7563 }
7564
7565 /*
7566 * If source is the same as the destination (that is the
7567 * same inode number) then there is nothing to do...
7568 * EXCEPT if the underlying file system supports case
7569 * insensitivity and is case preserving. In this case
7570 * the file system needs to handle the special case of
7571 * getting the same vnode as target (fvp) and source (tvp).
7572 *
7573 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
7574 * and _PC_CASE_PRESERVING can have this exception, and they need to
7575 * handle the special case of getting the same vnode as target and
7576 * source. NOTE: Then the target is unlocked going into vnop_rename,
7577 * so as not to cause locking problems. There is a single reference on tvp.
7578 *
7579 * NOTE - that fvp == tvp also occurs if they are hard linked and
7580 * that correct behaviour then is just to return success without doing
7581 * anything.
7582 *
7583 * XXX filesystem should take care of this itself, perhaps...
7584 */
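/*
 * Concrete illustration: on a case-insensitive, case-preserving volume,
 * rename("foo", "FOO") resolves both names to the same vnode
 * (fvp == tvp, fdvp == tdvp) but with different component names, so the
 * check below falls through and the filesystem gets to perform the
 * case-only rename; on a case-sensitive volume the _PC_CASE_SENSITIVE
 * check earlier has already returned success without doing anything.
 */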
7585 if (fvp == tvp && fdvp == tdvp) {
7586 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
7587 !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
7588 fromnd->ni_cnd.cn_namelen)) {
7589 goto out1;
7590 }
7591 }
7592
7593 if (holding_mntlock && fvp->v_mount != locked_mp) {
7594 /*
7595 * we're holding a reference and lock
7596 * on locked_mp, but it no longer matches
7597 * what we want to do... so drop our hold
7598 */
7599 mount_unlock_renames(locked_mp);
7600 mount_drop(locked_mp, 0);
7601 holding_mntlock = 0;
7602 }
7603 if (tdvp != fdvp && fvp->v_type == VDIR) {
7604 /*
7605 * serialize renames that re-shape
7606 * the tree... if holding_mntlock is
7607 * set, then we're ready to go...
7608 * otherwise we
7609 * first need to drop the iocounts
7610 * we picked up, second take the
7611 * lock to serialize the access,
7612 * then finally start the lookup
7613 * process over with the lock held
7614 */
7615 if (!holding_mntlock) {
7616 /*
7617 * need to grab a reference on
7618 * the mount point before we
7619 * drop all the iocounts... once
7620 * the iocounts are gone, the mount
7621 * could follow
7622 */
7623 locked_mp = fvp->v_mount;
7624 mount_ref(locked_mp, 0);
7625
7626 /*
7627 * nameidone has to happen before we vnode_put(tvp)
7628 * since it may need to release the fs_nodelock on the tvp
7629 */
7630 nameidone(tond);
7631
7632 if (tvp)
7633 vnode_put(tvp);
7634 vnode_put(tdvp);
7635
7636 /*
7637 * nameidone has to happen before we vnode_put(fdvp)
7638 * since it may need to release the fs_nodelock on the fvp
7639 */
7640 nameidone(fromnd);
7641
7642 vnode_put(fvp);
7643 vnode_put(fdvp);
7644
7645 mount_lock_renames(locked_mp);
7646 holding_mntlock = 1;
7647
7648 goto retry;
7649 }
7650 } else {
7651 /*
7652 * when we dropped the iocounts to take
7653 * the lock, we allowed the identity of
7654 * the various vnodes to change... if they did,
7655 * we may no longer be dealing with a rename
7656 * that reshapes the tree... once we're holding
7657 * the iocounts, the vnodes can't change type
7658 * so we're free to drop the lock at this point
7659 * and continue on
7660 */
7661 if (holding_mntlock) {
7662 mount_unlock_renames(locked_mp);
7663 mount_drop(locked_mp, 0);
7664 holding_mntlock = 0;
7665 }
7666 }
7667
7668 // save these off so we can later verify that fvp is the same
7669 oname = fvp->v_name;
7670 oparent = fvp->v_parent;
7671
7672skipped_lookup:
7673 error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
7674 tdvp, &tvp, &tond->ni_cnd, tvap,
7675 flags, ctx);
7676
7677 if (holding_mntlock) {
7678 /*
7679 * we can drop our serialization
7680 * lock now
7681 */
7682 mount_unlock_renames(locked_mp);
7683 mount_drop(locked_mp, 0);
7684 holding_mntlock = 0;
7685 }
7686 if (error) {
7687 if (error == EKEEPLOOKING) {
7688 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
7689 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
7690 panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
7691 }
7692 }
7693
7694 fromnd->ni_vp = fvp;
7695 tond->ni_vp = tvp;
7696
7697 goto continue_lookup;
7698 }
7699
7700 /*
7701 * We may encounter a race in the VNOP where the destination didn't
7702 * exist when we did the namei, but it does by the time we go and
7703 * try to create the entry. In this case, we should re-drive this rename
7704 * call from the top again. Currently, only HFS bubbles out ERECYCLE,
7705 * but other filesystems susceptible to this race could return it, too.
7706 */
7707 if (error == ERECYCLE) {
7708 do_retry = 1;
7709 }
7710
7711 /*
7712 * For compound VNOPs, the authorization callback may return
7713 * ENOENT in case of racing hardlink lookups hitting the name
7714 * cache; redrive the lookup.
7715 */
7716 if (batched && error == ENOENT) {
7717 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
7718 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
7719 do_retry = 1;
7720 retry_count += 1;
7721 }
7722 }
7723
7724 goto out1;
7725 }
7726
7727 /* Call out to allow 3rd party notification of rename.
7728 * Ignore result of kauth_authorize_fileop call.
7729 */
7730 kauth_authorize_fileop(vfs_context_ucred(ctx),
7731 KAUTH_FILEOP_RENAME,
7732 (uintptr_t)from_name, (uintptr_t)to_name);
7733 if (flags & VFS_RENAME_SWAP) {
7734 kauth_authorize_fileop(vfs_context_ucred(ctx),
7735 KAUTH_FILEOP_RENAME,
7736 (uintptr_t)to_name, (uintptr_t)from_name);
7737 }
7738
7739#if CONFIG_FSE
7740 if (from_name != NULL && to_name != NULL) {
7741 if (from_truncated || to_truncated) {
7742 // set it here since only the from_finfo gets reported up to user space
7743 from_finfo.mode |= FSE_TRUNCATED_PATH;
7744 }
7745
7746 if (tvap && tvp) {
7747 vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
7748 }
7749 if (fvap) {
7750 vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
7751 }
7752
7753 if (tvp) {
7754 add_fsevent(FSE_RENAME, ctx,
7755 FSE_ARG_STRING, from_len, from_name,
7756 FSE_ARG_FINFO, &from_finfo,
7757 FSE_ARG_STRING, to_len, to_name,
7758 FSE_ARG_FINFO, &to_finfo,
7759 FSE_ARG_DONE);
7760 if (flags & VFS_RENAME_SWAP) {
7761 /*
7762 * Strictly speaking, swap is the equivalent of
7763 * *three* renames. FSEvents clients should only take
7764 * the events as a hint, so we only bother reporting
7765 * two.
7766 */
7767 add_fsevent(FSE_RENAME, ctx,
7768 FSE_ARG_STRING, to_len, to_name,
7769 FSE_ARG_FINFO, &to_finfo,
7770 FSE_ARG_STRING, from_len, from_name,
7771 FSE_ARG_FINFO, &from_finfo,
7772 FSE_ARG_DONE);
7773 }
7774 } else {
7775 add_fsevent(FSE_RENAME, ctx,
7776 FSE_ARG_STRING, from_len, from_name,
7777 FSE_ARG_FINFO, &from_finfo,
7778 FSE_ARG_STRING, to_len, to_name,
7779 FSE_ARG_DONE);
7780 }
7781 }
7782#endif /* CONFIG_FSE */
7783
7784 /*
7785 * update filesystem's mount point data
7786 */
7787 if (mntrename) {
7788 char *cp, *pathend, *mpname;
7789 char * tobuf;
7790 struct mount *mp;
7791 int maxlen;
7792 size_t len = 0;
7793
7794 mp = fvp->v_mountedhere;
7795
7796 if (vfs_busy(mp, LK_NOWAIT)) {
7797 error = EBUSY;
7798 goto out1;
7799 }
7800 MALLOC_ZONE(tobuf, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
7801
7802 if (UIO_SEG_IS_USER_SPACE(segflg))
7803 error = copyinstr(to, tobuf, MAXPATHLEN, &len);
7804 else
7805 error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
7806 if (!error) {
7807 /* find current mount point prefix */
7808 pathend = &mp->mnt_vfsstat.f_mntonname[0];
7809 for (cp = pathend; *cp != '\0'; ++cp) {
7810 if (*cp == '/')
7811 pathend = cp + 1;
7812 }
7813 /* find last component of target name */
7814 for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
7815 if (*cp == '/')
7816 mpname = cp + 1;
7817 }
7818 /* append name to prefix */
7819 maxlen = MAXPATHLEN - (pathend - mp->mnt_vfsstat.f_mntonname);
7820 bzero(pathend, maxlen);
7821 strlcpy(pathend, mpname, maxlen);
7822 }
7823 FREE_ZONE(tobuf, MAXPATHLEN, M_NAMEI);
7824
7825 vfs_unbusy(mp);
7826 }
7827 /*
7828 * fix up name & parent pointers. note that we first
7829 * check that fvp has the same name/parent pointers it
7830 * had before the rename call... this is a 'weak' check
7831 * at best...
7832 *
7833 * XXX oparent and oname may not be set in the compound vnop case
7834 */
7835 if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
7836 int update_flags;
7837
7838 update_flags = VNODE_UPDATE_NAME;
7839
7840 if (fdvp != tdvp)
7841 update_flags |= VNODE_UPDATE_PARENT;
7842
7843 vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
7844 }
7845out1:
7846 if (to_name != NULL) {
7847 RELEASE_PATH(to_name);
7848 to_name = NULL;
7849 }
7850 if (from_name != NULL) {
7851 RELEASE_PATH(from_name);
7852 from_name = NULL;
7853 }
7854 if (holding_mntlock) {
7855 mount_unlock_renames(locked_mp);
7856 mount_drop(locked_mp, 0);
7857 holding_mntlock = 0;
7858 }
7859 if (tdvp) {
7860 /*
7861 * nameidone has to happen before we vnode_put(tdvp)
7862 * since it may need to release the fs_nodelock on the tdvp
7863 */
7864 nameidone(tond);
7865
7866 if (tvp)
7867 vnode_put(tvp);
7868 vnode_put(tdvp);
7869 }
7870 if (fdvp) {
7871 /*
7872 * nameidone has to happen before we vnode_put(fdvp)
7873 * since it may need to release the fs_nodelock on the fdvp
7874 */
7875 nameidone(fromnd);
7876
7877 if (fvp)
7878 vnode_put(fvp);
7879 vnode_put(fdvp);
7880 }
7881
7882 /*
7883 * If things changed after we did the namei, then we will re-drive
7884 * this rename call from the top.
7885 */
7886 if (do_retry) {
7887 do_retry = 0;
7888 goto retry;
7889 }
7890
7891 FREE(__rename_data, M_TEMP);
7892 return (error);
7893}
7894
7895int
7896rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
7897{
7898 return (renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
7899 AT_FDCWD, uap->to, UIO_USERSPACE, 0));
7900}
7901
7902int renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
7903{
7904 return renameat_internal(
7905 vfs_context_current(),
7906 uap->fromfd, uap->from,
7907 uap->tofd, uap->to,
7908 UIO_USERSPACE, uap->flags);
7909}
7910
7911int
7912renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
7913{
7914 return (renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
7915 uap->tofd, uap->to, UIO_USERSPACE, 0));
7916}
7917
7918/*
7919 * Make a directory file.
7920 *
7921 * Returns: 0 Success
7922 * EEXIST
7923 * namei:???
7924 * vnode_authorize:???
7925 * vn_create:???
7926 */
7927/* ARGSUSED */
7928static int
7929mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
7930 enum uio_seg segflg)
7931{
7932 vnode_t vp, dvp;
7933 int error;
7934 int update_flags = 0;
7935 int batched;
7936 struct nameidata nd;
7937
7938 AUDIT_ARG(mode, vap->va_mode);
7939 NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
7940 path, ctx);
7941 nd.ni_cnd.cn_flags |= WILLBEDIR;
7942 nd.ni_flag = NAMEI_COMPOUNDMKDIR;
7943
7944continue_lookup:
7945 error = nameiat(&nd, fd);
7946 if (error)
7947 return (error);
7948 dvp = nd.ni_dvp;
7949 vp = nd.ni_vp;
7950
7951 if (vp != NULL) {
7952 error = EEXIST;
7953 goto out;
7954 }
7955
7956 batched = vnode_compound_mkdir_available(dvp);
7957
7958 VATTR_SET(vap, va_type, VDIR);
7959
7960 /*
7961 * XXX
7962 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
7963 * only get EEXIST or EISDIR for existing path components, and not that it could see
7964 * EACCES/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
7965 * it will fail in a spurious manner. Need to figure out if this is valid behavior.
7966 */
7967 if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
7968 if (error == EACCES || error == EPERM) {
7969 int error2;
7970
7971 nameidone(&nd);
7972 vnode_put(dvp);
7973 dvp = NULLVP;
7974
7975 /*
7976 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
7977 * rather than EACCES if the target exists.
7978 */
7979 NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
7980 path, ctx);
7981 error2 = nameiat(&nd, fd);
7982 if (error2) {
7983 goto out;
7984 } else {
7985 vp = nd.ni_vp;
7986 error = EEXIST;
7987 goto out;
7988 }
7989 }
7990
7991 goto out;
7992 }
7993
7994 /*
7995 * make the directory
7996 */
7997 if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
7998 if (error == EKEEPLOOKING) {
7999 nd.ni_vp = vp;
8000 goto continue_lookup;
8001 }
8002
8003 goto out;
8004 }
8005
8006 // Make sure the name & parent pointers are hooked up
8007 if (vp->v_name == NULL)
8008 update_flags |= VNODE_UPDATE_NAME;
8009 if (vp->v_parent == NULLVP)
8010 update_flags |= VNODE_UPDATE_PARENT;
8011
8012 if (update_flags)
8013 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
8014
8015#if CONFIG_FSE
8016 add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
8017#endif
8018
8019out:
8020 /*
8021 * nameidone has to happen before we vnode_put(dvp)
8022 * since it may need to release the fs_nodelock on the dvp
8023 */
8024 nameidone(&nd);
8025
8026 if (vp)
8027 vnode_put(vp);
8028 if (dvp)
8029 vnode_put(dvp);
8030
8031 return (error);
8032}
8033
8034/*
8035 * mkdir_extended: Create a directory; with extended security (ACL).
8036 *
8037 * Parameters: p Process requesting to create the directory
8038 * uap User argument descriptor (see below)
8039 * retval (ignored)
8040 *
8041 * Indirect: uap->path Path of directory to create
8042 * uap->mode Access permissions to set
8043 * uap->xsecurity ACL to set
8044 *
8045 * Returns: 0 Success
8046 * !0 Not success
8047 *
8048 */
8049int
8050mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
8051{
8052 int ciferror;
8053 kauth_filesec_t xsecdst;
8054 struct vnode_attr va;
8055
8056 AUDIT_ARG(owner, uap->uid, uap->gid);
8057
8058 xsecdst = NULL;
8059 if ((uap->xsecurity != USER_ADDR_NULL) &&
8060 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0))
8061 return ciferror;
8062
8063 VATTR_INIT(&va);
8064 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8065 if (xsecdst != NULL)
8066 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
8067
8068 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
8069 UIO_USERSPACE);
8070 if (xsecdst != NULL)
8071 kauth_filesec_free(xsecdst);
8072 return ciferror;
8073}
8074
8075int
8076mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
8077{
8078 struct vnode_attr va;
8079
8080 VATTR_INIT(&va);
8081 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8082
8083 return (mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
8084 UIO_USERSPACE));
8085}
8086
8087int
8088mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
8089{
8090 struct vnode_attr va;
8091
8092 VATTR_INIT(&va);
8093 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8094
8095 return(mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
8096 UIO_USERSPACE));
8097}
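/*
 * Worked example of the va_mode computation shared by mkdir_extended(),
 * mkdir() and mkdirat() above: with the common umask of 022
 * (p->p_fd->fd_cmask == 022), mkdir("dir", 0777) yields
 * (0777 & ACCESSPERMS) & ~022 == 0755, hence the usual rwxr-xr-x result.
 */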
8098
8099static int
8100rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
8101 enum uio_seg segflg)
8102{
8103 vnode_t vp, dvp;
8104 int error;
8105 struct nameidata nd;
8106 char *path = NULL;
8107 int len=0;
8108 int has_listeners = 0;
8109 int need_event = 0;
8110 int truncated = 0;
8111#if CONFIG_FSE
8112 struct vnode_attr va;
8113#endif /* CONFIG_FSE */
8114 struct vnode_attr *vap = NULL;
8115 int restart_count = 0;
8116 int batched;
8117
8118 int restart_flag;
8119
8120 /*
8121 * This loop exists to restart rmdir in the unlikely case that two
8122 * processes are simultaneously trying to remove the same directory
8123 * containing orphaned appleDouble files.
8124 */
8125 do {
8126 NDINIT(&nd, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
8127 segflg, dirpath, ctx);
8128 nd.ni_flag = NAMEI_COMPOUNDRMDIR;
8129continue_lookup:
8130 restart_flag = 0;
8131 vap = NULL;
8132
8133 error = nameiat(&nd, fd);
8134 if (error)
8135 return (error);
8136
8137 dvp = nd.ni_dvp;
8138 vp = nd.ni_vp;
8139
8140 if (vp) {
8141 batched = vnode_compound_rmdir_available(vp);
8142
8143 if (vp->v_flag & VROOT) {
8144 /*
8145 * The root of a mounted filesystem cannot be deleted.
8146 */
8147 error = EBUSY;
8148 goto out;
8149 }
8150
8151#if DEVELOPMENT || DEBUG
8152 /*
8153 * XXX VSWAP: Check for entitlements or special flag here
8154 * so we can restrict access appropriately.
8155 */
8156#else /* DEVELOPMENT || DEBUG */
8157
8158 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
8159 error = EPERM;
8160 goto out;
8161 }
8162#endif /* DEVELOPMENT || DEBUG */
8163
8164 /*
8165 * Removed a check here; we used to abort if vp's vid
8166 * was not the same as what we'd seen the last time around.
8167 * I do not think that check was valid, because if we retry
8168 * and all dirents are gone, the directory could legitimately
8169 * be recycled but still be present in a situation where we would
8170 * have had permission to delete. Therefore, we won't make
8171 * an effort to preserve that check now that we may not have a
8172 * vp here.
8173 */
8174
8175 if (!batched) {
8176 error = vn_authorize_rmdir(dvp, vp, &nd.ni_cnd, ctx, NULL);
8177 if (error) {
8178 if (error == ENOENT) {
8179 assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES);
8180 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8181 restart_flag = 1;
8182 restart_count += 1;
8183 }
8184 }
8185 goto out;
8186 }
8187 }
8188 } else {
8189 batched = 1;
8190
8191 if (!vnode_compound_rmdir_available(dvp)) {
8192 panic("No error, but no compound rmdir?");
8193 }
8194 }
8195
8196#if CONFIG_FSE
8197 fse_info finfo;
8198
8199 need_event = need_fsevent(FSE_DELETE, dvp);
8200 if (need_event) {
8201 if (!batched) {
8202 get_fse_info(vp, &finfo, ctx);
8203 } else {
8204 error = vfs_get_notify_attributes(&va);
8205 if (error) {
8206 goto out;
8207 }
8208
8209 vap = &va;
8210 }
8211 }
8212#endif
8213 has_listeners = kauth_authorize_fileop_has_listeners();
8214 if (need_event || has_listeners) {
8215 if (path == NULL) {
8216 GET_PATH(path);
8217 if (path == NULL) {
8218 error = ENOMEM;
8219 goto out;
8220 }
8221 }
8222
8223 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated);
8224#if CONFIG_FSE
8225 if (truncated) {
8226 finfo.mode |= FSE_TRUNCATED_PATH;
8227 }
8228#endif
8229 }
8230
8231 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
8232 nd.ni_vp = vp;
8233 if (vp == NULLVP) {
8234 /* Couldn't find a vnode */
8235 goto out;
8236 }
8237
8238 if (error == EKEEPLOOKING) {
8239 goto continue_lookup;
8240 } else if (batched && error == ENOENT) {
8241 assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES);
8242 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8243 /*
8244 * For compound VNOPs, the authorization callback
8245 * may return ENOENT in case of racing hard link lookups;
8246 * redrive the lookup.
8247 */
8248 restart_flag = 1;
8249 restart_count += 1;
8250 goto out;
8251 }
8252 }
8253#if CONFIG_APPLEDOUBLE
8254 /*
8255 * Special case to remove orphaned AppleDouble
8256 * files. I don't like putting this in the kernel,
8257 * but carbon does not like putting this in carbon either,
8258 * so here we are.
8259 */
8260 if (error == ENOTEMPTY) {
8261 error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
8262 if (error == EBUSY) {
8263 goto out;
8264 }
8265
8266
8267 /*
8268 * Assuming everything went well, we will try the RMDIR again
8269 */
8270 if (!error)
8271 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
8272 }
8273#endif /* CONFIG_APPLEDOUBLE */
8274 /*
8275 * Call out to allow 3rd party notification of delete.
8276 * Ignore result of kauth_authorize_fileop call.
8277 */
8278 if (!error) {
8279 if (has_listeners) {
8280 kauth_authorize_fileop(vfs_context_ucred(ctx),
8281 KAUTH_FILEOP_DELETE,
8282 (uintptr_t)vp,
8283 (uintptr_t)path);
8284 }
8285
8286 if (vp->v_flag & VISHARDLINK) {
8287 // see the comment in unlink1() about why we update
8288 // the parent of a hard link when it is removed
8289 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
8290 }
8291
8292#if CONFIG_FSE
8293 if (need_event) {
8294 if (vap) {
8295 vnode_get_fse_info_from_vap(vp, &finfo, vap);
8296 }
8297 add_fsevent(FSE_DELETE, ctx,
8298 FSE_ARG_STRING, len, path,
8299 FSE_ARG_FINFO, &finfo,
8300 FSE_ARG_DONE);
8301 }
8302#endif
8303 }
8304
8305out:
8306 if (path != NULL) {
8307 RELEASE_PATH(path);
8308 path = NULL;
8309 }
8310 /*
8311 * nameidone has to happen before we vnode_put(dvp)
8312 * since it may need to release the fs_nodelock on the dvp
8313 */
8314 nameidone(&nd);
8315 vnode_put(dvp);
8316
8317 if (vp)
8318 vnode_put(vp);
8319
8320 if (restart_flag == 0) {
8321 wakeup_one((caddr_t)vp);
8322 return (error);
8323 }
8324 tsleep(vp, PVFS, "rm AD", 1);
8325
8326 } while (restart_flag != 0);
8327
8328 return (error);
8329
8330}
8331
8332/*
8333 * Remove a directory file.
8334 */
8335/* ARGSUSED */
8336int
8337rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
8338{
8339 return (rmdirat_internal(vfs_context_current(), AT_FDCWD,
8340 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE));
8341}
8342
8343/* Get direntry length padded to 8 byte alignment */
8344#define DIRENT64_LEN(namlen) \
8345 ((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)
8346
8347/* Get dirent length padded to 4 byte alignment */
8348#define DIRENT_LEN(namelen) \
8349 ((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)
8350
8351/* Get the end of this dirent */
8352#define DIRENT_END(dep) \
8353 (((char *)(dep)) + (dep)->d_reclen - 1)
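/*
 * Worked example of the padding arithmetic above (and the basis for the
 * 3/8 buffer-sizing ratio used in vnode_readdir64() below): for a
 * three-character name, DIRENT_LEN(3) packs into 12 bytes while
 * DIRENT64_LEN(3) expands to 32 bytes, so repacked output can grow to as
 * much as 8/3 of the dirents read from the file system.
 */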
8354
8355errno_t
8356vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
8357 int *numdirent, vfs_context_t ctxp)
8358{
8359 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
8360 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
8361 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
8362 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
8363 } else {
8364 size_t bufsize;
8365 void * bufptr;
8366 uio_t auio;
8367 struct direntry *entry64;
8368 struct dirent *dep;
8369 int bytesread;
8370 int error;
8371
8372 /*
		 * We're here because the underlying file system does not
		 * support direntries, or the mount denies that support, so we
		 * must fall back to dirents and convert them to direntries.
8376 *
8377 * Our kernel buffer needs to be smaller since re-packing will
		 * expand each dirent. The worst case (when the name length
8379 * is 3 or less) corresponds to a struct direntry size of 32
8380 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
8381 * (4-byte aligned). So having a buffer that is 3/8 the size
8382 * will prevent us from reading more than we can pack.
8383 *
8384 * Since this buffer is wired memory, we will limit the
8385 * buffer size to a maximum of 32K. We would really like to
8386 * use 32K in the MIN(), but we use magic number 87371 to
8387 * prevent uio_resid() * 3 / 8 from overflowing.
8388 */
8389 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
8390 MALLOC(bufptr, void *, bufsize, M_TEMP, M_WAITOK);
8391 if (bufptr == NULL) {
8392 return ENOMEM;
8393 }
8394
8395 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
8396 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
8397 auio->uio_offset = uio->uio_offset;
8398
8399 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
8400
8401 dep = (struct dirent *)bufptr;
8402 bytesread = bufsize - uio_resid(auio);
8403
8404 MALLOC(entry64, struct direntry *, sizeof(struct direntry),
8405 M_TEMP, M_WAITOK);
8406 /*
8407 * Convert all the entries and copy them out to user's buffer.
8408 */
8409 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
8410 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
8411
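			/*
			 * Sanity-check the record before trusting it: it must end
			 * inside the data we actually read, and its d_reclen must be
			 * large enough to hold the name it claims to contain.
			 */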
8412 if (DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
8413 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
				printf("%s: %s: Bad dirent received from directory %s\n", __func__,
8415 vp->v_mount->mnt_vfsstat.f_mntonname,
8416 vp->v_name ? vp->v_name : "<unknown>");
8417 error = EIO;
8418 break;
8419 }
8420
8421 bzero(entry64, enbufsize);
8422 /* Convert a dirent to a dirent64. */
8423 entry64->d_ino = dep->d_ino;
8424 entry64->d_seekoff = 0;
8425 entry64->d_reclen = enbufsize;
8426 entry64->d_namlen = dep->d_namlen;
8427 entry64->d_type = dep->d_type;
8428 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
8429
8430 /* Move to next entry. */
8431 dep = (struct dirent *)((char *)dep + dep->d_reclen);
8432
8433 /* Copy entry64 to user's buffer. */
8434 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
8435 }
8436
8437 /* Update the real offset using the offset we got from VNOP_READDIR. */
8438 if (error == 0) {
8439 uio->uio_offset = auio->uio_offset;
8440 }
8441 uio_free(auio);
8442 FREE(bufptr, M_TEMP);
8443 FREE(entry64, M_TEMP);
8444 return (error);
8445 }
8446}
8447
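/*
 * Cap on the user buffer size accepted by getdirentries_common(); larger
 * requests are silently clamped to this rather than rejected.
 */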
8448#define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
8449
8450/*
8451 * Read a block of directory entries in a file system independent format.
8452 */
8453static int
8454getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
8455 off_t *offset, int flags)
8456{
8457 vnode_t vp;
8458 struct vfs_context context = *vfs_context_current(); /* local copy */
8459 struct fileproc *fp;
8460 uio_t auio;
8461 int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
8462 off_t loff;
8463 int error, eofflag, numdirent;
8464 char uio_buf[ UIO_SIZEOF(1) ];
8465
8466 error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
8467 if (error) {
8468 return (error);
8469 }
8470 if ((fp->f_fglob->fg_flag & FREAD) == 0) {
8471 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
8472 error = EBADF;
8473 goto out;
8474 }
8475
8476 if (bufsize > GETDIRENTRIES_MAXBUFSIZE)
8477 bufsize = GETDIRENTRIES_MAXBUFSIZE;
8478
8479#if CONFIG_MACF
8480 error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->f_fglob);
8481 if (error)
8482 goto out;
8483#endif
8484 if ( (error = vnode_getwithref(vp)) ) {
8485 goto out;
8486 }
8487 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8488
8489unionread:
8490 if (vp->v_type != VDIR) {
8491 (void)vnode_put(vp);
8492 error = EINVAL;
8493 goto out;
8494 }
8495
8496#if CONFIG_MACF
8497 error = mac_vnode_check_readdir(&context, vp);
8498 if (error != 0) {
8499 (void)vnode_put(vp);
8500 goto out;
8501 }
8502#endif /* MAC */
8503
8504 loff = fp->f_fglob->fg_offset;
8505 auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
8506 uio_addiov(auio, bufp, bufsize);
8507
8508 if (flags & VNODE_READDIR_EXTENDED) {
8509 error = vnode_readdir64(vp, auio, flags, &eofflag, &numdirent, &context);
8510 fp->f_fglob->fg_offset = uio_offset(auio);
8511 } else {
8512 error = VNOP_READDIR(vp, auio, 0, &eofflag, &numdirent, &context);
8513 fp->f_fglob->fg_offset = uio_offset(auio);
8514 }
8515 if (error) {
8516 (void)vnode_put(vp);
8517 goto out;
8518 }
8519
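	/*
	 * If the VNOP returned no entries at all (the residual is unchanged),
	 * we may have hit the end of the upper layer of a union mount; if the
	 * union dircheck hook or a covered vnode exists, switch the file over
	 * to the lower directory, reset the offset, and retry the read there.
	 */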
8520 if ((user_ssize_t)bufsize == uio_resid(auio)){
8521 if (union_dircheckp) {
8522 error = union_dircheckp(&vp, fp, &context);
8523 if (error == -1)
8524 goto unionread;
8525 if (error) {
8526 (void)vnode_put(vp);
8527 goto out;
8528 }
8529 }
8530
8531 if ((vp->v_mount->mnt_flag & MNT_UNION)) {
8532 struct vnode *tvp = vp;
8533 if (lookup_traverse_union(tvp, &vp, &context) == 0) {
8534 vnode_ref(vp);
8535 fp->f_fglob->fg_data = (caddr_t) vp;
8536 fp->f_fglob->fg_offset = 0;
8537 vnode_rele(tvp);
8538 vnode_put(tvp);
8539 goto unionread;
8540 }
8541 vp = tvp;
8542 }
8543 }
8544
8545 vnode_put(vp);
8546 if (offset) {
8547 *offset = loff;
8548 }
8549
8550 *bytesread = bufsize - uio_resid(auio);
8551out:
8552 file_drop(fd);
8553 return (error);
8554}
8555
8556
8557int
8558getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
8559{
8560 off_t offset;
8561 ssize_t bytesread;
8562 int error;
8563
8564 AUDIT_ARG(fd, uap->fd);
8565 error = getdirentries_common(uap->fd, uap->buf, uap->count, &bytesread, &offset, 0);
8566
8567 if (error == 0) {
8568 if (proc_is64bit(p)) {
8569 user64_long_t base = (user64_long_t)offset;
8570 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
8571 } else {
8572 user32_long_t base = (user32_long_t)offset;
8573 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
8574 }
8575 *retval = bytesread;
8576 }
8577 return (error);
8578}
8579
8580int
8581getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
8582{
8583 off_t offset;
8584 ssize_t bytesread;
8585 int error;
8586
8587 AUDIT_ARG(fd, uap->fd);
8588 error = getdirentries_common(uap->fd, uap->buf, uap->bufsize, &bytesread, &offset, VNODE_READDIR_EXTENDED);
8589
8590 if (error == 0) {
8591 *retval = bytesread;
8592 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
8593 }
8594 return (error);
8595}
8596
8597
8598/*
8599 * Set the mode mask for creation of filesystem nodes.
8600 * XXX implement xsecurity
8601 */
8602#define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
8603static int
8604umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
8605{
8606 struct filedesc *fdp;
8607
8608 AUDIT_ARG(mask, newmask);
8609 proc_fdlock(p);
8610 fdp = p->p_fd;
8611 *retval = fdp->fd_cmask;
8612 fdp->fd_cmask = newmask & ALLPERMS;
8613 proc_fdunlock(p);
8614 return (0);
8615}
8616
8617/*
8618 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
8619 *
8620 * Parameters: p Process requesting to set the umask
8621 * uap User argument descriptor (see below)
8622 * retval umask of the process (parameter p)
8623 *
8624 * Indirect: uap->newmask umask to set
8625 * uap->xsecurity ACL to set
8626 *
8627 * Returns: 0 Success
8628 * !0 Not success
8629 *
8630 */
8631int
8632umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
8633{
8634 int ciferror;
8635 kauth_filesec_t xsecdst;
8636
8637 xsecdst = KAUTH_FILESEC_NONE;
8638 if (uap->xsecurity != USER_ADDR_NULL) {
8639 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
8640 return ciferror;
	}
8644
8645 ciferror = umask1(p, uap->newmask, xsecdst, retval);
8646
8647 if (xsecdst != KAUTH_FILESEC_NONE)
8648 kauth_filesec_free(xsecdst);
8649 return ciferror;
8650}
8651
8652int
8653umask(proc_t p, struct umask_args *uap, int32_t *retval)
8654{
8655 return(umask1(p, uap->newmask, UMASK_NOXSECURITY, retval));
8656}
8657
8658/*
8659 * Void all references to file by ripping underlying filesystem
8660 * away from vnode.
8661 */
8662/* ARGSUSED */
8663int
8664revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
8665{
8666 vnode_t vp;
8667 struct vnode_attr va;
8668 vfs_context_t ctx = vfs_context_current();
8669 int error;
8670 struct nameidata nd;
8671
8672 NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
8673 uap->path, ctx);
8674 error = namei(&nd);
8675 if (error)
8676 return (error);
8677 vp = nd.ni_vp;
8678
8679 nameidone(&nd);
8680
8681 if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
8682 error = ENOTSUP;
8683 goto out;
8684 }
8685
8686 if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
8687 error = EBUSY;
8688 goto out;
8689 }
8690
8691#if CONFIG_MACF
8692 error = mac_vnode_check_revoke(ctx, vp);
8693 if (error)
8694 goto out;
8695#endif
8696
8697 VATTR_INIT(&va);
8698 VATTR_WANTED(&va, va_uid);
8699 if ((error = vnode_getattr(vp, &va, ctx)))
8700 goto out;
8701 if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
8702 (error = suser(vfs_context_ucred(ctx), &p->p_acflag)))
8703 goto out;
8704 if (vp->v_usecount > 0 || (vnode_isaliased(vp)))
8705 VNOP_REVOKE(vp, REVOKEALL, ctx);
8706out:
8707 vnode_put(vp);
8708 return (error);
8709}
8710
8711
8712/*
 * HFS/HFS Plus SPECIFIC SYSTEM CALLS
8714 * The following system calls are designed to support features
8715 * which are specific to the HFS & HFS Plus volume formats
8716 */
8717
8718
8719/*
8720 * Obtain attribute information on objects in a directory while enumerating
8721 * the directory.
8722 */
8723/* ARGSUSED */
8724int
8725getdirentriesattr (proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
8726{
8727 vnode_t vp;
8728 struct fileproc *fp;
8729 uio_t auio = NULL;
8730 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
8731 uint32_t count = 0, savecount = 0;
8732 uint32_t newstate = 0;
8733 int error, eofflag;
8734 uint32_t loff = 0;
8735 struct attrlist attributelist;
8736 vfs_context_t ctx = vfs_context_current();
8737 int fd = uap->fd;
8738 char uio_buf[ UIO_SIZEOF(1) ];
8739 kauth_action_t action;
8740
8741 AUDIT_ARG(fd, fd);
8742
8743 /* Get the attributes into kernel space */
8744 if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
8745 return(error);
8746 }
8747 if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
8748 return(error);
8749 }
8750 savecount = count;
8751 if ( (error = fp_getfvp(p, fd, &fp, &vp)) ) {
8752 return (error);
8753 }
8754 if ((fp->f_fglob->fg_flag & FREAD) == 0) {
8755 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
8756 error = EBADF;
8757 goto out;
8758 }
8759
8760
8761#if CONFIG_MACF
8762 error = mac_file_check_change_offset(vfs_context_ucred(ctx),
8763 fp->f_fglob);
8764 if (error)
8765 goto out;
8766#endif
8767
8768
8769 if ( (error = vnode_getwithref(vp)) )
8770 goto out;
8771
8772 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8773
8774unionread:
8775 if (vp->v_type != VDIR) {
8776 (void)vnode_put(vp);
8777 error = EINVAL;
8778 goto out;
8779 }
8780
8781#if CONFIG_MACF
8782 error = mac_vnode_check_readdir(ctx, vp);
8783 if (error != 0) {
8784 (void)vnode_put(vp);
8785 goto out;
8786 }
8787#endif /* MAC */
8788
	/* set up the uio structure which will contain the user's return buffer */
8790 loff = fp->f_fglob->fg_offset;
8791 auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
8792 uio_addiov(auio, uap->buffer, uap->buffersize);
8793
8794 /*
	 * If the only item requested is file names, we can let that pass with
8796 * just LIST_DIRECTORY. If they want any other attributes, that means
8797 * they need SEARCH as well.
8798 */
8799 action = KAUTH_VNODE_LIST_DIRECTORY;
8800 if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
8801 attributelist.fileattr || attributelist.dirattr)
8802 action |= KAUTH_VNODE_SEARCH;
8803
8804 if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
8805
8806 /* Believe it or not, uap->options only has 32-bits of valid
8807 * info, so truncate before extending again */
8808
8809 error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
8810 (u_long)(uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
8811 }
8812
8813 if (error) {
8814 (void) vnode_put(vp);
8815 goto out;
8816 }
8817
8818 /*
8819 * If we've got the last entry of a directory in a union mount
8820 * then reset the eofflag and pretend there's still more to come.
8821 * The next call will again set eofflag and the buffer will be empty,
8822 * so traverse to the underlying directory and do the directory
8823 * read there.
8824 */
8825 if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
8826 if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
8827 eofflag = 0;
8828 } else { // Empty buffer
8829 struct vnode *tvp = vp;
8830 if (lookup_traverse_union(tvp, &vp, ctx) == 0) {
8831 vnode_ref_ext(vp, fp->f_fglob->fg_flag & O_EVTONLY, 0);
8832 fp->f_fglob->fg_data = (caddr_t) vp;
8833 fp->f_fglob->fg_offset = 0; // reset index for new dir
8834 count = savecount;
8835 vnode_rele_internal(tvp, fp->f_fglob->fg_flag & O_EVTONLY, 0, 0);
8836 vnode_put(tvp);
8837 goto unionread;
8838 }
8839 vp = tvp;
8840 }
8841 }
8842
8843 (void)vnode_put(vp);
8844
8845 if (error)
8846 goto out;
8847 fp->f_fglob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */
8848
8849 if ((error = copyout((caddr_t) &count, uap->count, sizeof(count))))
8850 goto out;
8851 if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate))))
8852 goto out;
8853 if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff))))
8854 goto out;
8855
8856 *retval = eofflag; /* similar to getdirentries */
8857 error = 0;
8858out:
8859 file_drop(fd);
	return (error); /* return the error from earlier, and a retval of 0 or 1 now */
8861
8862} /* end of getdirentriesattr system call */
8863
/*
 * Exchange data between two files
 */
8867
8868/* ARGSUSED */
8869int
8870exchangedata (__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
8871{
8872
8873 struct nameidata fnd, snd;
8874 vfs_context_t ctx = vfs_context_current();
8875 vnode_t fvp;
8876 vnode_t svp;
8877 int error;
8878 u_int32_t nameiflags;
8879 char *fpath = NULL;
8880 char *spath = NULL;
8881 int flen=0, slen=0;
8882 int from_truncated=0, to_truncated=0;
8883#if CONFIG_FSE
8884 fse_info f_finfo, s_finfo;
8885#endif
8886
8887 nameiflags = 0;
8888 if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
8889
8890 NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
8891 UIO_USERSPACE, uap->path1, ctx);
8892
8893 error = namei(&fnd);
8894 if (error)
8895 goto out2;
8896
8897 nameidone(&fnd);
8898 fvp = fnd.ni_vp;
8899
8900 NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
8901 UIO_USERSPACE, uap->path2, ctx);
8902
8903 error = namei(&snd);
8904 if (error) {
8905 vnode_put(fvp);
8906 goto out2;
8907 }
8908 nameidone(&snd);
8909 svp = snd.ni_vp;
8910
8911 /*
8912 * if the files are the same, return an inval error
8913 */
8914 if (svp == fvp) {
8915 error = EINVAL;
8916 goto out;
8917 }
8918
8919 /*
8920 * if the files are on different volumes, return an error
8921 */
8922 if (svp->v_mount != fvp->v_mount) {
8923 error = EXDEV;
8924 goto out;
8925 }
8926
8927 /* If they're not files, return an error */
8928 if ( (vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
8929 error = EINVAL;
8930 goto out;
8931 }
8932
8933#if CONFIG_MACF
8934 error = mac_vnode_check_exchangedata(ctx,
8935 fvp, svp);
8936 if (error)
8937 goto out;
8938#endif
8939 if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
8940 ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0))
8941 goto out;
8942
8943 if (
8944#if CONFIG_FSE
8945 need_fsevent(FSE_EXCHANGE, fvp) ||
8946#endif
8947 kauth_authorize_fileop_has_listeners()) {
8948 GET_PATH(fpath);
8949 GET_PATH(spath);
8950 if (fpath == NULL || spath == NULL) {
8951 error = ENOMEM;
8952 goto out;
8953 }
8954
8955 flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
8956 slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);
8957
8958#if CONFIG_FSE
8959 get_fse_info(fvp, &f_finfo, ctx);
8960 get_fse_info(svp, &s_finfo, ctx);
8961 if (from_truncated || to_truncated) {
8962 // set it here since only the f_finfo gets reported up to user space
8963 f_finfo.mode |= FSE_TRUNCATED_PATH;
8964 }
8965#endif
8966 }
8967 /* Ok, make the call */
8968 error = VNOP_EXCHANGE(fvp, svp, 0, ctx);
8969
8970 if (error == 0) {
8971 const char *tmpname;
8972
8973 if (fpath != NULL && spath != NULL) {
8974 /* call out to allow 3rd party notification of exchangedata.
8975 * Ignore result of kauth_authorize_fileop call.
8976 */
8977 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
8978 (uintptr_t)fpath, (uintptr_t)spath);
8979 }
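		/*
		 * After a successful exchange, swap the two vnodes' cached names
		 * (and, if they differ, their cached parents) under the name
		 * cache lock so the name cache stays consistent with the swap
		 * the file system just performed.
		 */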
8980 name_cache_lock();
8981
8982 tmpname = fvp->v_name;
8983 fvp->v_name = svp->v_name;
8984 svp->v_name = tmpname;
8985
8986 if (fvp->v_parent != svp->v_parent) {
8987 vnode_t tmp;
8988
8989 tmp = fvp->v_parent;
8990 fvp->v_parent = svp->v_parent;
8991 svp->v_parent = tmp;
8992 }
8993 name_cache_unlock();
8994
8995#if CONFIG_FSE
8996 if (fpath != NULL && spath != NULL) {
8997 add_fsevent(FSE_EXCHANGE, ctx,
8998 FSE_ARG_STRING, flen, fpath,
8999 FSE_ARG_FINFO, &f_finfo,
9000 FSE_ARG_STRING, slen, spath,
9001 FSE_ARG_FINFO, &s_finfo,
9002 FSE_ARG_DONE);
9003 }
9004#endif
9005 }
9006
9007out:
9008 if (fpath != NULL)
9009 RELEASE_PATH(fpath);
9010 if (spath != NULL)
9011 RELEASE_PATH(spath);
9012 vnode_put(svp);
9013 vnode_put(fvp);
9014out2:
9015 return (error);
9016}
9017
9018/*
9019 * Return (in MB) the amount of freespace on the given vnode's volume.
9020 */
9021uint32_t freespace_mb(vnode_t vp);
9022
9023uint32_t
9024freespace_mb(vnode_t vp)
9025{
9026 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
9027 return (((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
9028 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
9029}
9030
9031#if CONFIG_SEARCHFS
9032
9033/* ARGSUSED */
9034
9035int
9036searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
9037{
9038 vnode_t vp, tvp;
9039 int i, error=0;
9040 int fserror = 0;
9041 struct nameidata nd;
9042 struct user64_fssearchblock searchblock;
9043 struct searchstate *state;
9044 struct attrlist *returnattrs;
9045 struct timeval timelimit;
9046 void *searchparams1,*searchparams2;
9047 uio_t auio = NULL;
9048 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9049 uint32_t nummatches;
9050 int mallocsize;
9051 uint32_t nameiflags;
9052 vfs_context_t ctx = vfs_context_current();
9053 char uio_buf[ UIO_SIZEOF(1) ];
9054
9055 /* Start by copying in fsearchblock parameter list */
9056 if (IS_64BIT_PROCESS(p)) {
9057 error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
9058 timelimit.tv_sec = searchblock.timelimit.tv_sec;
9059 timelimit.tv_usec = searchblock.timelimit.tv_usec;
9060 }
9061 else {
9062 struct user32_fssearchblock tmp_searchblock;
9063
9064 error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
9065 // munge into 64-bit version
9066 searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
9067 searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
9068 searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
9069 searchblock.maxmatches = tmp_searchblock.maxmatches;
9070 /*
9071 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
9072 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
9073 */
9074 timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
9075 timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
9076 searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
9077 searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
9078 searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
9079 searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
9080 searchblock.searchattrs = tmp_searchblock.searchattrs;
9081 }
9082 if (error)
9083 return(error);
9084
9085 /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
9086 */
9087 if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
9088 searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS)
9089 return(EINVAL);
9090
9091 /* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
	/* It all has to go into local memory and it's not that big, so we might as well put it all together. */
9093 /* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
9094 /* block. */
9095 /* */
9096 /* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
9097 /* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
9098 /* assumes the size is still 556 bytes it will continue to work */
9099
9100 mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
9101 sizeof(struct attrlist) + sizeof(struct searchstate) + (2*sizeof(uint32_t));
9102
9103 MALLOC(searchparams1, void *, mallocsize, M_TEMP, M_WAITOK);
9104
9105 /* Now set up the various pointers to the correct place in our newly allocated memory */
9106
9107 searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
9108 returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
9109 state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof (struct attrlist));
9110
9111 /* Now copy in the stuff given our local variables. */
9112
9113 if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1)))
9114 goto freeandexit;
9115
9116 if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2)))
9117 goto freeandexit;
9118
9119 if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist))))
9120 goto freeandexit;
9121
9122 if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate))))
9123 goto freeandexit;
9124
9125 /*
9126 * When searching a union mount, need to set the
9127 * start flag at the first call on each layer to
9128 * reset state for the new volume.
9129 */
9130 if (uap->options & SRCHFS_START)
9131 state->ss_union_layer = 0;
9132 else
9133 uap->options |= state->ss_union_flags;
9134 state->ss_union_flags = 0;
9135
9136 /*
9137 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
9138 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
9139 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
9140 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
9141 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
9142 */
9143
9144 if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
9145 attrreference_t* string_ref;
9146 u_int32_t* start_length;
9147 user64_size_t param_length;
9148
9149 /* validate searchparams1 */
9150 param_length = searchblock.sizeofsearchparams1;
9151 /* skip the word that specifies length of the buffer */
		start_length = (u_int32_t *) searchparams1;
		start_length = start_length + 1;
		string_ref = (attrreference_t *) start_length;
9155
		/* ensure no negative offsets or overly large offsets */
9157 if (string_ref->attr_dataoffset < 0 ) {
9158 error = EINVAL;
9159 goto freeandexit;
9160 }
9161 if (string_ref->attr_length > MAXPATHLEN) {
9162 error = EINVAL;
9163 goto freeandexit;
9164 }
9165
9166 /* Check for pointer overflow in the string ref */
9167 if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
9168 error = EINVAL;
9169 goto freeandexit;
9170 }
9171
9172 if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
9173 error = EINVAL;
9174 goto freeandexit;
9175 }
9176 if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
9177 error = EINVAL;
9178 goto freeandexit;
9179 }
9180 }
9181
	/* set up the uio structure which will contain the user's return buffer */
9183 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
9184 uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
9185
9186 nameiflags = 0;
9187 if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
9188 NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
9189 UIO_USERSPACE, uap->path, ctx);
9190
9191 error = namei(&nd);
9192 if (error)
9193 goto freeandexit;
9194 vp = nd.ni_vp;
9195 nameidone(&nd);
9196
9197 /*
9198 * Switch to the root vnode for the volume
9199 */
9200 error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
9201 vnode_put(vp);
9202 if (error)
9203 goto freeandexit;
9204 vp = tvp;
9205
9206 /*
9207 * If it's a union mount, the path lookup takes
9208 * us to the top layer. But we may need to descend
9209 * to a lower layer. For non-union mounts the layer
9210 * is always zero.
9211 */
9212 for (i = 0; i < (int) state->ss_union_layer; i++) {
9213 if ((vp->v_mount->mnt_flag & MNT_UNION) == 0)
9214 break;
9215 tvp = vp;
9216 vp = vp->v_mount->mnt_vnodecovered;
9217 if (vp == NULL) {
9218 vnode_put(tvp);
9219 error = ENOENT;
9220 goto freeandexit;
9221 }
9222 error = vnode_getwithref(vp);
9223 vnode_put(tvp);
9224 if (error)
9225 goto freeandexit;
9226 }
9227
9228#if CONFIG_MACF
9229 error = mac_vnode_check_searchfs(ctx, vp, &searchblock.searchattrs);
9230 if (error) {
9231 vnode_put(vp);
9232 goto freeandexit;
9233 }
9234#endif
9235
9236
9237 /*
9238 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesn't deal with it well.
9240 */
9241 if (searchblock.maxmatches == 0) {
9242 nummatches = 0;
9243 goto saveandexit;
9244 }
9245
9246 /*
	 * All right, we have everything we need, so let's make that call.
9248 *
9249 * We keep special track of the return value from the file system:
9250 * EAGAIN is an acceptable error condition that shouldn't keep us
9251 * from copying out any results...
9252 */
9253
9254 fserror = VNOP_SEARCHFS(vp,
9255 searchparams1,
9256 searchparams2,
9257 &searchblock.searchattrs,
9258 (u_long)searchblock.maxmatches,
9259 &timelimit,
9260 returnattrs,
9261 &nummatches,
9262 (u_long)uap->scriptcode,
9263 (u_long)uap->options,
9264 auio,
9265 (struct searchstate *) &state->ss_fsstate,
9266 ctx);
9267
9268 /*
9269 * If it's a union mount we need to be called again
9270 * to search the mounted-on filesystem.
9271 */
9272 if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
9273 state->ss_union_flags = SRCHFS_START;
9274 state->ss_union_layer++; // search next layer down
9275 fserror = EAGAIN;
9276 }
9277
9278saveandexit:
9279
9280 vnode_put(vp);
9281
	/* Now copy out the stuff that needs copying out. That means the number of matches and the
	   search state. Everything was already put into the return buffer by the vop call. */
9284
9285 if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0)
9286 goto freeandexit;
9287
9288 if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0)
9289 goto freeandexit;
9290
9291 error = fserror;
9292
9293freeandexit:
9294
9295 FREE(searchparams1,M_TEMP);
9296
9297 return(error);
9298
9299
9300} /* end of searchfs system call */
9301
9302#else /* CONFIG_SEARCHFS */
9303
9304int
9305searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
9306{
9307 return (ENOTSUP);
9308}
9309
9310#endif /* CONFIG_SEARCHFS */
9311
9312
9313lck_grp_attr_t * nspace_group_attr;
9314lck_attr_t * nspace_lock_attr;
9315lck_grp_t * nspace_mutex_group;
9316
9317lck_mtx_t nspace_handler_lock;
9318lck_mtx_t nspace_handler_exclusion_lock;
9319
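/*
 * snapshot_timestamp marks the current snapshot epoch; change events whose
 * ctime falls at or before it are routed to the snapshot handler (see
 * nspace_snapshot_event()).  nspace_allow_virtual_devs lets snapshot events
 * be delivered even for vnodes on virtual (disk image) devices, which are
 * normally skipped to avoid deadlocking against diskimages-helper.
 */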
9320time_t snapshot_timestamp=0;
9321int nspace_allow_virtual_devs=0;
9322
9323void nspace_handler_init(void);
9324
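/*
 * One in-flight namespace event.  vp/vid identify the vnode the event is
 * for, op encodes the operation and event type, token is the identifier
 * handed to the user-space handler, flags tracks the NSPACE_ITEM_* state,
 * and refcount counts the threads currently waiting on this slot.
 */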
9325typedef struct nspace_item_info {
9326 struct vnode *vp;
9327 void *arg;
9328 uint64_t op;
9329 uint32_t vid;
9330 uint32_t flags;
9331 uint32_t token;
9332 uint32_t refcount;
9333} nspace_item_info;
9334
9335#define MAX_NSPACE_ITEMS 128
9336nspace_item_info nspace_items[MAX_NSPACE_ITEMS];
9337uint32_t nspace_item_idx=0; // also used as the sleep/wakeup rendezvous address
9338uint32_t nspace_token_id=0;
9339uint32_t nspace_handler_timeout = 15; // seconds
9340
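/*
 * nspace_items[].flags state bits: a slot starts out NEW, is marked
 * PROCESSING once a handler picks it up, and ends up DONE or CANCELLED
 * before being cleared for re-use; RESET_TIMER extends a waiter's timeout.
 */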
9341#define NSPACE_ITEM_NEW 0x0001
9342#define NSPACE_ITEM_PROCESSING 0x0002
9343#define NSPACE_ITEM_DEAD 0x0004
9344#define NSPACE_ITEM_CANCELLED 0x0008
9345#define NSPACE_ITEM_DONE 0x0010
9346#define NSPACE_ITEM_RESET_TIMER 0x0020
9347
9348#define NSPACE_ITEM_NSPACE_EVENT 0x0040
9349#define NSPACE_ITEM_SNAPSHOT_EVENT 0x0080
9350
9351#define NSPACE_ITEM_ALL_EVENT_TYPES (NSPACE_ITEM_NSPACE_EVENT | NSPACE_ITEM_SNAPSHOT_EVENT)
9352
9353//#pragma optimization_level 0
9354
9355typedef enum {
9356 NSPACE_HANDLER_NSPACE = 0,
9357 NSPACE_HANDLER_SNAPSHOT = 1,
9358
9359 NSPACE_HANDLER_COUNT,
9360} nspace_type_t;
9361
9362typedef struct {
9363 uint64_t handler_tid;
9364 struct proc *handler_proc;
9365 int handler_busy;
9366} nspace_handler_t;
9367
9368nspace_handler_t nspace_handlers[NSPACE_HANDLER_COUNT];
9369
9370/* namespace fsctl functions */
9371static int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type);
9372static int nspace_item_flags_for_type(nspace_type_t nspace_type);
9373static int nspace_open_flags_for_type(nspace_type_t nspace_type);
9374static nspace_type_t nspace_type_for_op(uint64_t op);
9375static int nspace_is_special_process(struct proc *proc);
9376static int vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx);
9377static int wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type);
9378static int validate_namespace_args (int is64bit, int size);
9379static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data);
9380
9381
9382static inline int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type)
9383{
9384 switch(nspace_type) {
9385 case NSPACE_HANDLER_NSPACE:
9386 return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_NSPACE_EVENT;
9387 case NSPACE_HANDLER_SNAPSHOT:
9388 return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_SNAPSHOT_EVENT;
9389 default:
9390 printf("nspace_flags_matches_handler: invalid type %u\n", (int)nspace_type);
9391 return 0;
9392 }
9393}
9394
9395static inline int nspace_item_flags_for_type(nspace_type_t nspace_type)
9396{
9397 switch(nspace_type) {
9398 case NSPACE_HANDLER_NSPACE:
9399 return NSPACE_ITEM_NSPACE_EVENT;
9400 case NSPACE_HANDLER_SNAPSHOT:
9401 return NSPACE_ITEM_SNAPSHOT_EVENT;
9402 default:
9403 printf("nspace_item_flags_for_type: invalid type %u\n", (int)nspace_type);
9404 return 0;
9405 }
9406}
9407
9408static inline int nspace_open_flags_for_type(nspace_type_t nspace_type)
9409{
9410 switch(nspace_type) {
9411 case NSPACE_HANDLER_NSPACE:
9412 return FREAD | FWRITE | O_EVTONLY;
9413 case NSPACE_HANDLER_SNAPSHOT:
9414 return FREAD | O_EVTONLY;
9415 default:
9416 printf("nspace_open_flags_for_type: invalid type %u\n", (int)nspace_type);
9417 return 0;
9418 }
9419}
9420
9421static inline nspace_type_t nspace_type_for_op(uint64_t op)
9422{
9423 switch(op & NAMESPACE_HANDLER_EVENT_TYPE_MASK) {
9424 case NAMESPACE_HANDLER_NSPACE_EVENT:
9425 return NSPACE_HANDLER_NSPACE;
9426 case NAMESPACE_HANDLER_SNAPSHOT_EVENT:
9427 return NSPACE_HANDLER_SNAPSHOT;
9428 default:
9429 printf("nspace_type_for_op: invalid op mask %llx\n", op & NAMESPACE_HANDLER_EVENT_TYPE_MASK);
9430 return NSPACE_HANDLER_NSPACE;
9431 }
9432}
9433
9434static inline int nspace_is_special_process(struct proc *proc)
9435{
9436 int i;
9437 for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
9438 if (proc == nspace_handlers[i].handler_proc)
9439 return 1;
9440 }
9441 return 0;
9442}
9443
9444void
9445nspace_handler_init(void)
9446{
9447 nspace_lock_attr = lck_attr_alloc_init();
9448 nspace_group_attr = lck_grp_attr_alloc_init();
9449 nspace_mutex_group = lck_grp_alloc_init("nspace-mutex", nspace_group_attr);
9450 lck_mtx_init(&nspace_handler_lock, nspace_mutex_group, nspace_lock_attr);
9451 lck_mtx_init(&nspace_handler_exclusion_lock, nspace_mutex_group, nspace_lock_attr);
9452 memset(&nspace_items[0], 0, sizeof(nspace_items));
9453}
9454
9455void
9456nspace_proc_exit(struct proc *p)
9457{
9458 int i, event_mask = 0;
9459
9460 for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
9461 if (p == nspace_handlers[i].handler_proc) {
9462 event_mask |= nspace_item_flags_for_type(i);
9463 nspace_handlers[i].handler_tid = 0;
9464 nspace_handlers[i].handler_proc = NULL;
9465 }
9466 }
9467
9468 if (event_mask == 0) {
9469 return;
9470 }
9471
9472 lck_mtx_lock(&nspace_handler_lock);
9473 if (event_mask & NSPACE_ITEM_SNAPSHOT_EVENT) {
		// if this process was the snapshot handler, zero snapshot_timestamp
9475 snapshot_timestamp = 0;
9476 }
9477
9478 //
9479 // unblock anyone that's waiting for the handler that died
9480 //
9481 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9482 if (nspace_items[i].flags & (NSPACE_ITEM_NEW | NSPACE_ITEM_PROCESSING)) {
9483
9484 if ( nspace_items[i].flags & event_mask ) {
9485
9486 if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
9487 vnode_lock_spin(nspace_items[i].vp);
9488 nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
9489 vnode_unlock(nspace_items[i].vp);
9490 }
9491 nspace_items[i].vp = NULL;
9492 nspace_items[i].vid = 0;
9493 nspace_items[i].flags = NSPACE_ITEM_DONE;
9494 nspace_items[i].token = 0;
9495
9496 wakeup((caddr_t)&(nspace_items[i].vp));
9497 }
9498 }
9499 }
9500
9501 wakeup((caddr_t)&nspace_item_idx);
9502 lck_mtx_unlock(&nspace_handler_lock);
9503}
9504
9505
9506int
9507resolve_nspace_item(struct vnode *vp, uint64_t op)
9508{
9509 return resolve_nspace_item_ext(vp, op, NULL);
9510}
9511
9512int
9513resolve_nspace_item_ext(struct vnode *vp, uint64_t op, void *arg)
9514{
9515 int i, error, keep_waiting;
9516 struct timespec ts;
9517 nspace_type_t nspace_type = nspace_type_for_op(op);
9518
9519 // only allow namespace events on regular files, directories and symlinks.
9520 if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
9521 return 0;
9522 }
9523
9524 //
9525 // if this is a snapshot event and the vnode is on a
9526 // disk image just pretend nothing happened since any
9527 // change to the disk image will cause the disk image
9528 // itself to get backed up and this avoids multi-way
9529 // deadlocks between the snapshot handler and the ever
9530 // popular diskimages-helper process. the variable
9531 // nspace_allow_virtual_devs allows this behavior to
9532 // be overridden (for use by the Mobile TimeMachine
9533 // testing infrastructure which uses disk images)
9534 //
9535 if ( (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT)
9536 && (vp->v_mount != NULL)
9537 && (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
9538 && !nspace_allow_virtual_devs) {
9539
9540 return 0;
9541 }
9542
9543 // if (thread_tid(current_thread()) == namespace_handler_tid) {
9544 if (nspace_handlers[nspace_type].handler_proc == NULL) {
9545 return 0;
9546 }
9547
9548 if (nspace_is_special_process(current_proc())) {
9549 return EDEADLK;
9550 }
9551
9552 lck_mtx_lock(&nspace_handler_lock);
9553
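	/*
	 * Find the slot to wait on: reuse an existing entry for this vp/op
	 * pair (bumping its refcount), otherwise claim a free slot.  If the
	 * table is completely full, sleep until a waiter releases a slot and
	 * then retry the scan.
	 */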
9554retry:
9555 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9556 if (vp == nspace_items[i].vp && op == nspace_items[i].op) {
9557 break;
9558 }
9559 }
9560
9561 if (i >= MAX_NSPACE_ITEMS) {
9562 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9563 if (nspace_items[i].flags == 0) {
9564 break;
9565 }
9566 }
9567 } else {
9568 nspace_items[i].refcount++;
9569 }
9570
9571 if (i >= MAX_NSPACE_ITEMS) {
9572 ts.tv_sec = nspace_handler_timeout;
9573 ts.tv_nsec = 0;
9574
9575 error = msleep((caddr_t)&nspace_token_id, &nspace_handler_lock, PVFS|PCATCH, "nspace-no-space", &ts);
9576 if (error == 0) {
9577 // an entry got free'd up, go see if we can get a slot
9578 goto retry;
9579 } else {
9580 lck_mtx_unlock(&nspace_handler_lock);
9581 return error;
9582 }
9583 }
9584
9585 //
9586 // if it didn't already exist, add it. if it did exist
9587 // we'll get woken up when someone does a wakeup() on
9588 // the slot in the nspace_items table.
9589 //
9590 if (vp != nspace_items[i].vp) {
9591 nspace_items[i].vp = vp;
9592 nspace_items[i].arg = (arg == NSPACE_REARM_NO_ARG) ? NULL : arg; // arg is {NULL, true, uio *} - only pass uio thru to the user
9593 nspace_items[i].op = op;
9594 nspace_items[i].vid = vnode_vid(vp);
9595 nspace_items[i].flags = NSPACE_ITEM_NEW;
9596 nspace_items[i].flags |= nspace_item_flags_for_type(nspace_type);
9597 if (nspace_items[i].flags & NSPACE_ITEM_SNAPSHOT_EVENT) {
9598 if (arg) {
9599 vnode_lock_spin(vp);
9600 vp->v_flag |= VNEEDSSNAPSHOT;
9601 vnode_unlock(vp);
9602 }
9603 }
9604
9605 nspace_items[i].token = 0;
9606 nspace_items[i].refcount = 1;
9607
9608 wakeup((caddr_t)&nspace_item_idx);
9609 }
9610
9611 //
9612 // Now go to sleep until the handler does a wakeup on this
9613 // slot in the nspace_items table (or we timeout).
9614 //
9615 keep_waiting = 1;
9616 while(keep_waiting) {
9617 ts.tv_sec = nspace_handler_timeout;
9618 ts.tv_nsec = 0;
9619 error = msleep((caddr_t)&(nspace_items[i].vp), &nspace_handler_lock, PVFS|PCATCH, "namespace-done", &ts);
9620
9621 if (nspace_items[i].flags & NSPACE_ITEM_DONE) {
9622 error = 0;
9623 } else if (nspace_items[i].flags & NSPACE_ITEM_CANCELLED) {
9624 error = nspace_items[i].token;
9625 } else if (error == EWOULDBLOCK || error == ETIMEDOUT) {
9626 if (nspace_items[i].flags & NSPACE_ITEM_RESET_TIMER) {
9627 nspace_items[i].flags &= ~NSPACE_ITEM_RESET_TIMER;
9628 continue;
9629 } else {
9630 error = ETIMEDOUT;
9631 }
9632 } else if (error == 0) {
9633 // hmmm, why did we get woken up?
			printf("woken up for token %d but it's not done, cancelled, or timed out and error == 0.\n",
			       nspace_items[i].token);
9636 }
9637
9638 if (--nspace_items[i].refcount == 0) {
9639 nspace_items[i].vp = NULL; // clear this so that no one will match on it again
9640 nspace_items[i].arg = NULL;
9641 nspace_items[i].token = 0; // clear this so that the handler will not find it anymore
9642 nspace_items[i].flags = 0; // this clears it for re-use
9643 }
9644 wakeup(&nspace_token_id);
9645 keep_waiting = 0;
9646 }
9647
9648 lck_mtx_unlock(&nspace_handler_lock);
9649
9650 return error;
9651}
9652
9653int nspace_snapshot_event(vnode_t vp, time_t ctime, uint64_t op_type, void *arg)
9654{
9655 int snapshot_error = 0;
9656
9657 if (vp == NULL) {
9658 return 0;
9659 }
9660
9661 /* Swap files are special; skip them */
9662 if (vnode_isswap(vp)) {
9663 return 0;
9664 }
9665
9666 if (ctime != 0 && snapshot_timestamp != 0 && (ctime <= snapshot_timestamp || vnode_needssnapshots(vp))) {
9667 // the change time is within this epoch
9668 int error;
9669
9670 error = resolve_nspace_item_ext(vp, op_type | NAMESPACE_HANDLER_SNAPSHOT_EVENT, arg);
9671 if (error == EDEADLK) {
9672 snapshot_error = 0;
9673 } else if (error) {
9674 if (error == EAGAIN) {
9675 printf("nspace_snapshot_event: timed out waiting for namespace handler...\n");
9676 } else if (error == EINTR) {
9677 // printf("nspace_snapshot_event: got a signal while waiting for namespace handler...\n");
9678 snapshot_error = EINTR;
9679 }
9680 }
9681 }
9682
9683 return snapshot_error;
9684}
9685
9686int
9687get_nspace_item_status(struct vnode *vp, int32_t *status)
9688{
9689 int i;
9690
9691 lck_mtx_lock(&nspace_handler_lock);
9692 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9693 if (nspace_items[i].vp == vp) {
9694 break;
9695 }
9696 }
9697
9698 if (i >= MAX_NSPACE_ITEMS) {
9699 lck_mtx_unlock(&nspace_handler_lock);
9700 return ENOENT;
9701 }
9702
9703 *status = nspace_items[i].flags;
9704 lck_mtx_unlock(&nspace_handler_lock);
9705 return 0;
9706}
9707
9708
9709#if 0
9710static int
9711build_volfs_path(struct vnode *vp, char *path, int *len)
9712{
9713 struct vnode_attr va;
9714 int ret;
9715
9716 VATTR_INIT(&va);
9717 VATTR_WANTED(&va, va_fsid);
9718 VATTR_WANTED(&va, va_fileid);
9719
9720 if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
9721 *len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
9722 ret = -1;
9723 } else {
9724 *len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
9725 ret = 0;
9726 }
9727
9728 return ret;
9729}
9730#endif
9731
9732//
9733// Note: this function does NOT check permissions on all of the
9734// parent directories leading to this vnode. It should only be
9735// called on behalf of a root process. Otherwise a process may
9736// get access to a file because the file itself is readable even
9737// though its parent directories would prevent access.
9738//
9739static int
9740vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx)
9741{
9742 int error, action;
9743
9744 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
9745 return error;
9746 }
9747
9748#if CONFIG_MACF
9749 error = mac_vnode_check_open(ctx, vp, fmode);
9750 if (error)
9751 return error;
9752#endif
9753
9754 /* compute action to be authorized */
9755 action = 0;
9756 if (fmode & FREAD) {
9757 action |= KAUTH_VNODE_READ_DATA;
9758 }
9759 if (fmode & (FWRITE | O_TRUNC)) {
9760 /*
9761 * If we are writing, appending, and not truncating,
9762 * indicate that we are appending so that if the
9763 * UF_APPEND or SF_APPEND bits are set, we do not deny
9764 * the open.
9765 */
9766 if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) {
9767 action |= KAUTH_VNODE_APPEND_DATA;
9768 } else {
9769 action |= KAUTH_VNODE_WRITE_DATA;
9770 }
9771 }
9772
9773 if ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)
9774 return error;
9775
9776
9777 //
9778 // if the vnode is tagged VOPENEVT and the current process
9779 // has the P_CHECKOPENEVT flag set, then we or in the O_EVTONLY
9780 // flag to the open mode so that this open won't count against
9781 // the vnode when carbon delete() does a vnode_isinuse() to see
9782 // if a file is currently in use. this allows spotlight
9783 // importers to not interfere with carbon apps that depend on
9784 // the no-delete-if-busy semantics of carbon delete().
9785 //
9786 if ((vp->v_flag & VOPENEVT) && (current_proc()->p_flag & P_CHECKOPENEVT)) {
9787 fmode |= O_EVTONLY;
9788 }
9789
9790 if ( (error = VNOP_OPEN(vp, fmode, ctx)) ) {
9791 return error;
9792 }
9793 if ( (error = vnode_ref_ext(vp, fmode, 0)) ) {
9794 VNOP_CLOSE(vp, fmode, ctx);
9795 return error;
9796 }
9797
9798 /* Call out to allow 3rd party notification of open.
9799 * Ignore result of kauth_authorize_fileop call.
9800 */
9801#if CONFIG_MACF
9802 mac_vnode_notify_open(ctx, vp, fmode);
9803#endif
9804 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_OPEN,
9805 (uintptr_t)vp, 0);
9806
9807
9808 return 0;
9809}
9810
9811static int
9812wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type)
9813{
9814 int i;
9815 int error = 0;
9816 int unblock = 0;
9817 task_t curtask;
9818
9819 lck_mtx_lock(&nspace_handler_exclusion_lock);
9820 if (nspace_handlers[nspace_type].handler_busy) {
9821 lck_mtx_unlock(&nspace_handler_exclusion_lock);
9822 return EBUSY;
9823 }
9824
9825 nspace_handlers[nspace_type].handler_busy = 1;
9826 lck_mtx_unlock(&nspace_handler_exclusion_lock);
9827
9828 /*
9829 * Any process that gets here will be one of the namespace handlers.
9830 * As such, they should be prevented from acquiring DMG vnodes during vnode reclamation
9831 * as we can cause deadlocks to occur, because the namespace handler may prevent
9832 * VNOP_INACTIVE from proceeding. Mark the current task as a P_DEPENDENCY_CAPABLE
9833 * process.
9834 */
9835 curtask = current_task();
9836 bsd_set_dependency_capable (curtask);
9837
9838 lck_mtx_lock(&nspace_handler_lock);
9839 if (nspace_handlers[nspace_type].handler_proc == NULL) {
9840 nspace_handlers[nspace_type].handler_tid = thread_tid(current_thread());
9841 nspace_handlers[nspace_type].handler_proc = current_proc();
9842 }
9843
9844 if (nspace_type == NSPACE_HANDLER_SNAPSHOT &&
9845 (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
9846 error = EINVAL;
9847 }
9848
9849 while (error == 0) {
9850
9851 /* Try to find matching namespace item */
9852 for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
9853 if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
9854 if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
9855 break;
9856 }
9857 }
9858 }
9859
9860 if (i >= MAX_NSPACE_ITEMS) {
9861 /* Nothing is there yet. Wait for wake up and retry */
9862 error = msleep((caddr_t)&nspace_item_idx, &nspace_handler_lock, PVFS|PCATCH, "namespace-items", 0);
9863 if ((nspace_type == NSPACE_HANDLER_SNAPSHOT) && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
9864 /* Prevent infinite loop if snapshot handler exited */
9865 error = EINVAL;
9866 break;
9867 }
9868 continue;
9869 }
9870
9871 nspace_items[i].flags &= ~NSPACE_ITEM_NEW;
9872 nspace_items[i].flags |= NSPACE_ITEM_PROCESSING;
9873 nspace_items[i].token = ++nspace_token_id;
9874
9875 assert(nspace_items[i].vp);
9876 struct fileproc *fp;
9877 int32_t indx;
9878 int32_t fmode;
9879 struct proc *p = current_proc();
9880 vfs_context_t ctx = vfs_context_current();
9881 struct vnode_attr va;
		bool vn_get_successful = false;
9883 bool vn_open_successful = false;
9884 bool fp_alloc_successful = false;
9885
9886 /*
9887 * Use vnode pointer to acquire a file descriptor for
9888 * hand-off to userland
9889 */
9890 fmode = nspace_open_flags_for_type(nspace_type);
9891 error = vnode_getwithvid(nspace_items[i].vp, nspace_items[i].vid);
9892 if (error) goto cleanup;
		vn_get_successful = true;
9894
9895 error = vn_open_with_vp(nspace_items[i].vp, fmode, ctx);
9896 if (error) goto cleanup;
9897 vn_open_successful = true;
9898
9899 error = falloc(p, &fp, &indx, ctx);
9900 if (error) goto cleanup;
9901 fp_alloc_successful = true;
9902
9903 fp->f_fglob->fg_flag = fmode;
9904 fp->f_fglob->fg_ops = &vnops;
9905 fp->f_fglob->fg_data = (caddr_t)nspace_items[i].vp;
9906
9907 proc_fdlock(p);
9908 procfdtbl_releasefd(p, indx, NULL);
9909 fp_drop(p, indx, fp, 1);
9910 proc_fdunlock(p);
9911
9912 /*
9913 * All variants of the namespace handler struct support these three fields:
9914 * token, flags, and the FD pointer
9915 */
9916 error = copyout(&nspace_items[i].token, nhd->token, sizeof(uint32_t));
9917 if (error) goto cleanup;
9918 error = copyout(&nspace_items[i].op, nhd->flags, sizeof(uint64_t));
9919 if (error) goto cleanup;
9920 error = copyout(&indx, nhd->fdptr, sizeof(uint32_t));
9921 if (error) goto cleanup;
9922
		/*
		 * Handle optional fields:
		 * the extended version supports an info ptr (offset, length), and
		 * the namedata version supports a unique per-link object ID
		 */
9930 if (nhd->infoptr) {
9931 uio_t uio = (uio_t)nspace_items[i].arg;
9932 uint64_t u_offset, u_length;
9933
9934 if (uio) {
9935 u_offset = uio_offset(uio);
9936 u_length = uio_resid(uio);
9937 } else {
9938 u_offset = 0;
9939 u_length = 0;
9940 }
9941 error = copyout(&u_offset, nhd->infoptr, sizeof(uint64_t));
9942 if (error) goto cleanup;
9943 error = copyout(&u_length, nhd->infoptr + sizeof(uint64_t), sizeof(uint64_t));
9944 if (error) goto cleanup;
9945 }
9946
9947 if (nhd->objid) {
9948 VATTR_INIT(&va);
9949 VATTR_WANTED(&va, va_linkid);
9950 error = vnode_getattr(nspace_items[i].vp, &va, ctx);
9951 if (error) goto cleanup;
9952
9953 uint64_t linkid = 0;
9954 if (VATTR_IS_SUPPORTED (&va, va_linkid)) {
9955 linkid = (uint64_t)va.va_linkid;
9956 }
9957 error = copyout(&linkid, nhd->objid, sizeof(uint64_t));
9958 }
9959cleanup:
9960 if (error) {
9961 if (fp_alloc_successful) fp_free(p, indx, fp);
9962 if (vn_open_successful) vn_close(nspace_items[i].vp, fmode, ctx);
9963 unblock = 1;
9964 }
9965
		if (vn_get_successful) vnode_put(nspace_items[i].vp);
9967
9968 break;
9969 }
9970
9971 if (unblock) {
9972 if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
9973 vnode_lock_spin(nspace_items[i].vp);
9974 nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
9975 vnode_unlock(nspace_items[i].vp);
9976 }
9977 nspace_items[i].vp = NULL;
9978 nspace_items[i].vid = 0;
9979 nspace_items[i].flags = NSPACE_ITEM_DONE;
9980 nspace_items[i].token = 0;
9981
9982 wakeup((caddr_t)&(nspace_items[i].vp));
9983 }
9984
9985 if (nspace_type == NSPACE_HANDLER_SNAPSHOT) {
9986 // just go through every snapshot event and unblock it immediately.
9987 if (error && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
9988 for(i = 0; i < MAX_NSPACE_ITEMS; i++) {
9989 if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
9990 if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
9991 nspace_items[i].vp = NULL;
9992 nspace_items[i].vid = 0;
9993 nspace_items[i].flags = NSPACE_ITEM_DONE;
9994 nspace_items[i].token = 0;
9995
9996 wakeup((caddr_t)&(nspace_items[i].vp));
9997 }
9998 }
9999 }
10000 }
10001 }
10002
10003 lck_mtx_unlock(&nspace_handler_lock);
10004
10005 lck_mtx_lock(&nspace_handler_exclusion_lock);
10006 nspace_handlers[nspace_type].handler_busy = 0;
10007 lck_mtx_unlock(&nspace_handler_exclusion_lock);
10008
10009 return error;
10010}
10011
10012static inline int validate_namespace_args (int is64bit, int size) {
10013
10014 if (is64bit) {
10015 /* Must be one of these */
10016 if (size == sizeof(user64_namespace_handler_info)) {
10017 goto sizeok;
10018 }
10019 if (size == sizeof(user64_namespace_handler_info_ext)) {
10020 goto sizeok;
10021 }
10022 if (size == sizeof(user64_namespace_handler_data)) {
10023 goto sizeok;
10024 }
10025 return EINVAL;
10026 }
10027 else {
10028 /* 32 bit -- must be one of these */
10029 if (size == sizeof(user32_namespace_handler_info)) {
10030 goto sizeok;
10031 }
10032 if (size == sizeof(user32_namespace_handler_info_ext)) {
10033 goto sizeok;
10034 }
10035 if (size == sizeof(user32_namespace_handler_data)) {
10036 goto sizeok;
10037 }
10038 return EINVAL;
10039 }
10040
10041sizeok:
10042
10043 return 0;
10044
10045}
10046
10047static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data)
10048{
10049 int error = 0;
10050 namespace_handler_data nhd;
10051
10052 bzero (&nhd, sizeof(namespace_handler_data));
10053
10054 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10055 return error;
10056 }
10057
10058 error = validate_namespace_args (is64bit, size);
10059 if (error) {
10060 return error;
10061 }
10062
10063 /* Copy in the userland pointers into our kernel-only struct */
10064
10065 if (is64bit) {
10066 /* 64 bit userland structures */
10067 nhd.token = (user_addr_t)((user64_namespace_handler_info *)data)->token;
10068 nhd.flags = (user_addr_t)((user64_namespace_handler_info *)data)->flags;
10069 nhd.fdptr = (user_addr_t)((user64_namespace_handler_info *)data)->fdptr;
10070
10071 /* If the size is greater than the standard info struct, add in extra fields */
10072 if (size > (sizeof(user64_namespace_handler_info))) {
10073 if (size >= (sizeof(user64_namespace_handler_info_ext))) {
10074 nhd.infoptr = (user_addr_t)((user64_namespace_handler_info_ext *)data)->infoptr;
10075 }
10076 if (size == (sizeof(user64_namespace_handler_data))) {
10077 nhd.objid = (user_addr_t)((user64_namespace_handler_data*)data)->objid;
10078 }
10079 /* Otherwise the fields were pre-zeroed when we did the bzero above. */
10080 }
10081 }
10082 else {
10083 /* 32 bit userland structures */
10084 nhd.token = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->token);
10085 nhd.flags = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->flags);
10086 nhd.fdptr = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->fdptr);
10087
10088 if (size > (sizeof(user32_namespace_handler_info))) {
10089 if (size >= (sizeof(user32_namespace_handler_info_ext))) {
10090 nhd.infoptr = CAST_USER_ADDR_T(((user32_namespace_handler_info_ext *)data)->infoptr);
10091 }
10092 if (size == (sizeof(user32_namespace_handler_data))) {
10093 nhd.objid = (user_addr_t)((user32_namespace_handler_data*)data)->objid;
10094 }
10095 /* Otherwise the fields were pre-zeroed when we did the bzero above. */
10096 }
10097 }
10098
10099 return wait_for_namespace_event(&nhd, nspace_type);
10100}
10101
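/*
 * Compatibility shim: some callers pass fsctl selectors with the IOCPARM
 * size bits stripped (the IOCBASECMD() form).  Map those back to the full
 * FSIOC_* / ioctl values so the switch in fsctl_internal() only has to
 * handle canonical commands.
 */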
10102static unsigned long
10103fsctl_bogus_command_compat(unsigned long cmd)
10104{
10105
10106 switch (cmd) {
10107 case IOCBASECMD(FSIOC_SYNC_VOLUME):
10108 return (FSIOC_SYNC_VOLUME);
10109 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
10110 return (FSIOC_ROUTEFS_SETROUTEID);
10111 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
10112 return (FSIOC_SET_PACKAGE_EXTS);
10113 case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_GET):
10114 return (FSIOC_NAMESPACE_HANDLER_GET);
10115 case IOCBASECMD(FSIOC_OLD_SNAPSHOT_HANDLER_GET):
10116 return (FSIOC_OLD_SNAPSHOT_HANDLER_GET);
10117 case IOCBASECMD(FSIOC_SNAPSHOT_HANDLER_GET_EXT):
10118 return (FSIOC_SNAPSHOT_HANDLER_GET_EXT);
10119 case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_UPDATE):
10120 return (FSIOC_NAMESPACE_HANDLER_UPDATE);
10121 case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_UNBLOCK):
10122 return (FSIOC_NAMESPACE_HANDLER_UNBLOCK);
10123 case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_CANCEL):
10124 return (FSIOC_NAMESPACE_HANDLER_CANCEL);
10125 case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME):
10126 return (FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME);
10127 case IOCBASECMD(FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS):
10128 return (FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS);
10129 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
10130 return (FSIOC_SET_FSTYPENAME_OVERRIDE);
10131 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
10132 return (DISK_CONDITIONER_IOC_GET);
10133 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
10134 return (DISK_CONDITIONER_IOC_SET);
10135 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
10136 return (FSIOC_FIOSEEKHOLE);
10137 case IOCBASECMD(FSIOC_FIOSEEKDATA):
10138 return (FSIOC_FIOSEEKDATA);
10139 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
10140 return (SPOTLIGHT_IOC_GET_MOUNT_TIME);
10141 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
10142 return (SPOTLIGHT_IOC_GET_LAST_MTIME);
10143 }
10144
10145 return (cmd);
10146}
10147
10148/*
10149 * Make a filesystem-specific control call:
10150 */
10151/* ARGSUSED */
10152static int
10153fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
10154{
10155 int error=0;
10156 boolean_t is64bit;
10157 u_int size;
10158#define STK_PARAMS 128
10159 char stkbuf[STK_PARAMS] = {0};
10160 caddr_t data, memp;
10161 vnode_t vp = *arg_vp;
10162
10163 cmd = fsctl_bogus_command_compat(cmd);
10164
10165 size = IOCPARM_LEN(cmd);
10166 if (size > IOCPARM_MAX) return (EINVAL);
10167
10168 is64bit = proc_is64bit(p);
10169
10170 memp = NULL;
10171
10172 if (size > sizeof (stkbuf)) {
10173 if ((memp = (caddr_t)kalloc(size)) == 0) return ENOMEM;
10174 data = memp;
10175	} else {
10176		data = &stkbuf[0];
10177	}
10178
10179 if (cmd & IOC_IN) {
10180 if (size) {
10181 error = copyin(udata, data, size);
10182 if (error) {
10183 if (memp) {
10184 kfree (memp, size);
10185 }
10186 return error;
10187 }
10188 } else {
10189 if (is64bit) {
10190 *(user_addr_t *)data = udata;
10191 }
10192 else {
10193 *(uint32_t *)data = (uint32_t)udata;
10194 }
10195		}
10196 } else if ((cmd & IOC_OUT) && size) {
10197 /*
10198 * Zero the buffer so the user always
10199 * gets back something deterministic.
10200 */
10201 bzero(data, size);
10202 } else if (cmd & IOC_VOID) {
10203 if (is64bit) {
10204 *(user_addr_t *)data = udata;
10205 }
10206 else {
10207 *(uint32_t *)data = (uint32_t)udata;
10208 }
10209 }
10210
10211 /* Check to see if it's a generic command */
10212 switch (cmd) {
10213
10214 case FSIOC_SYNC_VOLUME: {
10215 mount_t mp = vp->v_mount;
10216 int arg = *(uint32_t*)data;
10217
10218 /* record vid of vp so we can drop it below. */
10219 uint32_t vvid = vp->v_id;
10220
10221 /*
10222 * Then grab mount_iterref so that we can release the vnode.
10223 * Without this, a thread may call vnode_iterate_prepare then
10224 * get into a deadlock because we've never released the root vp
10225 */
10226 error = mount_iterref (mp, 0);
10227 if (error) {
10228 break;
10229 }
10230 vnode_put(vp);
10231
10232 /* issue the sync for this volume */
10233 (void)sync_callback(mp, (arg & FSCTL_SYNC_WAIT) ? &arg : NULL);
10234
10235 /*
10236 * Then release the mount_iterref once we're done syncing; it's not
10237 * needed for the VNOP_IOCTL below
10238 */
10239 mount_iterdrop(mp);
10240
10241 if (arg & FSCTL_SYNC_FULLSYNC) {
10242 /* re-obtain vnode iocount on the root vp, if possible */
10243 error = vnode_getwithvid (vp, vvid);
10244 if (error == 0) {
10245 error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
10246 vnode_put (vp);
10247 }
10248 }
10249 /* mark the argument VP as having been released */
10250 *arg_vp = NULL;
10251 }
10252 break;
10253
10254 case FSIOC_ROUTEFS_SETROUTEID: {
10255#if ROUTEFS
10256 char routepath[MAXPATHLEN];
10257 size_t len = 0;
10258
10259 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10260 break;
10261 }
10262 bzero(routepath, MAXPATHLEN);
10263 error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
10264 if (error) {
10265 break;
10266 }
10267 error = routefs_kernel_mount(routepath);
10268 if (error) {
10269 break;
10270 }
10271#endif
10272 }
10273 break;
10274
10275 case FSIOC_SET_PACKAGE_EXTS: {
10276 user_addr_t ext_strings;
10277 uint32_t num_entries;
10278 uint32_t max_width;
10279
10280 if ((error = priv_check_cred(kauth_cred_get(), PRIV_PACKAGE_EXTENSIONS, 0)))
10281 break;
10282
10283 if ( (is64bit && size != sizeof(user64_package_ext_info))
10284 || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
10285
10286 // either you're 64-bit and passed a 64-bit struct or
10287 // you're 32-bit and passed a 32-bit struct. otherwise
10288 // it's not ok.
10289 error = EINVAL;
10290 break;
10291 }
10292
10293 if (is64bit) {
10294 ext_strings = ((user64_package_ext_info *)data)->strings;
10295 num_entries = ((user64_package_ext_info *)data)->num_entries;
10296 max_width = ((user64_package_ext_info *)data)->max_width;
10297 } else {
10298 ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
10299 num_entries = ((user32_package_ext_info *)data)->num_entries;
10300 max_width = ((user32_package_ext_info *)data)->max_width;
10301 }
10302 error = set_package_extensions_table(ext_strings, num_entries, max_width);
10303 }
10304 break;
10305
10306 /* namespace handlers */
10307 case FSIOC_NAMESPACE_HANDLER_GET: {
10308 error = process_namespace_fsctl(NSPACE_HANDLER_NSPACE, is64bit, size, data);
10309 }
10310 break;
10311
10312 /* Snapshot handlers */
10313 case FSIOC_OLD_SNAPSHOT_HANDLER_GET: {
10314 error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
10315 }
10316 break;
10317
10318 case FSIOC_SNAPSHOT_HANDLER_GET_EXT: {
10319 error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
10320 }
10321 break;
10322
10323 case FSIOC_NAMESPACE_HANDLER_UPDATE: {
10324 uint32_t token, val;
10325 int i;
10326
10327 if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
10328 break;
10329 }
10330
10331 if (!nspace_is_special_process(p)) {
10332 error = EINVAL;
10333 break;
10334 }
10335
10336 token = ((uint32_t *)data)[0];
10337 val = ((uint32_t *)data)[1];
10338
10339 lck_mtx_lock(&nspace_handler_lock);
10340
10341 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
10342 if (nspace_items[i].token == token) {
10343 break; /* exit for loop, not case stmt */
10344 }
10345 }
10346
10347 if (i >= MAX_NSPACE_ITEMS) {
10348 error = ENOENT;
10349 } else {
10350 //
10351 // if this bit is set, when resolve_nspace_item() times out
10352 // it will loop and go back to sleep.
10353 //
10354 nspace_items[i].flags |= NSPACE_ITEM_RESET_TIMER;
10355 }
10356
10357 lck_mtx_unlock(&nspace_handler_lock);
10358
10359 if (error) {
10360 printf("nspace-handler-update: did not find token %u\n", token);
10361 }
10362 }
10363 break;
10364
10365 case FSIOC_NAMESPACE_HANDLER_UNBLOCK: {
10366 uint32_t token, val;
10367 int i;
10368
10369 if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
10370 break;
10371 }
10372
10373 if (!nspace_is_special_process(p)) {
10374 error = EINVAL;
10375 break;
10376 }
10377
10378 token = ((uint32_t *)data)[0];
10379 val = ((uint32_t *)data)[1];
10380
10381 lck_mtx_lock(&nspace_handler_lock);
10382
10383 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
10384 if (nspace_items[i].token == token) {
10385 break; /* exit for loop, not case statement */
10386 }
10387 }
10388
10389 if (i >= MAX_NSPACE_ITEMS) {
10390 printf("nspace-handler-unblock: did not find token %u\n", token);
10391 error = ENOENT;
10392 } else {
10393 if (val == 0 && nspace_items[i].vp) {
10394 vnode_lock_spin(nspace_items[i].vp);
10395 nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
10396 vnode_unlock(nspace_items[i].vp);
10397 }
10398
10399 nspace_items[i].vp = NULL;
10400 nspace_items[i].arg = NULL;
10401 nspace_items[i].op = 0;
10402 nspace_items[i].vid = 0;
10403 nspace_items[i].flags = NSPACE_ITEM_DONE;
10404 nspace_items[i].token = 0;
10405
10406 wakeup((caddr_t)&(nspace_items[i].vp));
10407 }
10408
10409 lck_mtx_unlock(&nspace_handler_lock);
10410 }
10411 break;
10412
10413 case FSIOC_NAMESPACE_HANDLER_CANCEL: {
10414 uint32_t token, val;
10415 int i;
10416
10417 if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
10418 break;
10419 }
10420
10421 if (!nspace_is_special_process(p)) {
10422 error = EINVAL;
10423 break;
10424 }
10425
10426 token = ((uint32_t *)data)[0];
10427 val = ((uint32_t *)data)[1];
10428
10429 lck_mtx_lock(&nspace_handler_lock);
10430
10431 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
10432 if (nspace_items[i].token == token) {
10433 break; /* exit for loop, not case stmt */
10434 }
10435 }
10436
10437 if (i >= MAX_NSPACE_ITEMS) {
10438 printf("nspace-handler-cancel: did not find token %u\n", token);
10439 error = ENOENT;
10440 } else {
10441 if (nspace_items[i].vp) {
10442 vnode_lock_spin(nspace_items[i].vp);
10443 nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
10444 vnode_unlock(nspace_items[i].vp);
10445 }
10446
10447 nspace_items[i].vp = NULL;
10448 nspace_items[i].arg = NULL;
10449 nspace_items[i].vid = 0;
10450 nspace_items[i].token = val;
10451 nspace_items[i].flags &= ~NSPACE_ITEM_PROCESSING;
10452 nspace_items[i].flags |= NSPACE_ITEM_CANCELLED;
10453
10454 wakeup((caddr_t)&(nspace_items[i].vp));
10455 }
10456
10457 lck_mtx_unlock(&nspace_handler_lock);
10458 }
10459 break;
10460
10461 case FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME: {
10462 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10463 break;
10464 }
10465
10466 // we explicitly do not do the namespace_handler_proc check here
10467
10468 lck_mtx_lock(&nspace_handler_lock);
10469 snapshot_timestamp = ((uint32_t *)data)[0];
10470 wakeup(&nspace_item_idx);
10471 lck_mtx_unlock(&nspace_handler_lock);
10472 printf("nspace-handler-set-snapshot-time: %d\n", (int)snapshot_timestamp);
10473
10474 }
10475 break;
10476
10477 case FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS:
10478 {
10479 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10480 break;
10481 }
10482
10483 lck_mtx_lock(&nspace_handler_lock);
10484 nspace_allow_virtual_devs = ((uint32_t *)data)[0];
10485 lck_mtx_unlock(&nspace_handler_lock);
10486 printf("nspace-snapshot-handler will%s allow events on disk-images\n",
10487 nspace_allow_virtual_devs ? "" : " NOT");
10488 error = 0;
10489
10490 }
10491 break;
10492
10493 case FSIOC_SET_FSTYPENAME_OVERRIDE:
10494 {
10495 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10496 break;
10497 }
10498 if (vp->v_mount) {
10499 mount_lock(vp->v_mount);
10500 if (data[0] != 0) {
10501 strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
10502 vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
10503 if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
10504 vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
10505 vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
10506 }
10507 } else {
10508 if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
10509 vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
10510 }
10511 vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
10512 vp->v_mount->fstypename_override[0] = '\0';
10513 }
10514 mount_unlock(vp->v_mount);
10515 }
10516 }
10517 break;
10518
10519 case DISK_CONDITIONER_IOC_GET: {
10520 error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
10521 }
10522 break;
10523
10524 case DISK_CONDITIONER_IOC_SET: {
10525 error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
10526 }
10527 break;
10528
10529 default: {
10530 /* other, known commands shouldn't be passed down here */
10531 switch (cmd) {
10532 case F_PUNCHHOLE:
10533 case F_TRIM_ACTIVE_FILE:
10534 case F_RDADVISE:
10535 case F_TRANSCODEKEY:
10536 case F_GETPROTECTIONLEVEL:
10537 case F_GETDEFAULTPROTLEVEL:
10538 case F_MAKECOMPRESSED:
10539 case F_SET_GREEDY_MODE:
10540 case F_SETSTATICCONTENT:
10541 case F_SETIOTYPE:
10542 case F_SETBACKINGSTORE:
10543 case F_GETPATH_MTMINFO:
10544 case APFSIOC_REVERT_TO_SNAPSHOT:
10545 case FSIOC_FIOSEEKHOLE:
10546 case FSIOC_FIOSEEKDATA:
10547 case HFS_GET_BOOT_INFO:
10548 case HFS_SET_BOOT_INFO:
10549 case FIOPINSWAP:
10550 case F_CHKCLEAN:
10551 case F_FULLFSYNC:
10552 case F_BARRIERFSYNC:
10553 case F_FREEZE_FS:
10554 case F_THAW_FS:
10555 error = EINVAL;
10556 goto outdrop;
10557 }
10558 /* Invoke the filesystem-specific code */
10559 error = VNOP_IOCTL(vp, cmd, data, options, ctx);
10560 }
10561
10562 } /* end switch stmt */
10563
10564 /*
10565 * if no errors, copy any data to user. Size was
10566 * already set and checked above.
10567 */
10568 if (error == 0 && (cmd & IOC_OUT) && size)
10569 error = copyout(data, udata, size);
10570
10571outdrop:
10572 if (memp) {
10573 kfree(memp, size);
10574 }
10575
10576 return error;
10577}
10578
10579/* ARGSUSED */
10580int
10581fsctl (proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
10582{
10583 int error;
10584 struct nameidata nd;
10585 u_long nameiflags;
10586 vnode_t vp = NULL;
10587 vfs_context_t ctx = vfs_context_current();
10588
10589 AUDIT_ARG(cmd, uap->cmd);
10590 AUDIT_ARG(value32, uap->options);
10591 /* Get the vnode for the file we are getting info on: */
10592 nameiflags = 0;
10593 if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
10594 NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
10595 UIO_USERSPACE, uap->path, ctx);
10596 if ((error = namei(&nd))) goto done;
10597 vp = nd.ni_vp;
10598 nameidone(&nd);
10599
10600#if CONFIG_MACF
10601 error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
10602 if (error) {
10603 goto done;
10604 }
10605#endif
10606
10607 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
10608
10609done:
10610 if (vp)
10611 vnode_put(vp);
10612 return error;
10613}
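/*
 * Hypothetical userspace sketch (not part of this file): issuing
 * FSIOC_SYNC_VOLUME against a mounted volume via the fsctl() wrapper.
 * Assumes the userspace fsctl() declaration and the FSIOC_/FSCTL_ constants
 * in <sys/fsctl.h>; error handling is minimal.
 *
 *   #include <sys/fsctl.h>
 *   #include <stdint.h>
 *   #include <stdio.h>
 *
 *   static int sync_volume(const char *path, int full)
 *   {
 *       // FSCTL_SYNC_WAIT makes the sync synchronous; FSCTL_SYNC_FULLSYNC
 *       // additionally asks the kernel to issue F_FULLFSYNC on the root vp.
 *       uint32_t flags = FSCTL_SYNC_WAIT | (full ? FSCTL_SYNC_FULLSYNC : 0);
 *       if (fsctl(path, FSIOC_SYNC_VOLUME, &flags, 0) == -1) {
 *           perror("fsctl(FSIOC_SYNC_VOLUME)");
 *           return -1;
 *       }
 *       return 0;
 *   }
 */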
10614/* ARGSUSED */
10615int
10616ffsctl (proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
10617{
10618 int error;
10619 vnode_t vp = NULL;
10620 vfs_context_t ctx = vfs_context_current();
10621 int fd = -1;
10622
10623 AUDIT_ARG(fd, uap->fd);
10624 AUDIT_ARG(cmd, uap->cmd);
10625 AUDIT_ARG(value32, uap->options);
10626
10627 /* Get the vnode for the file we are getting info on: */
10628 if ((error = file_vnode(uap->fd, &vp)))
10629 return error;
10630 fd = uap->fd;
10631 if ((error = vnode_getwithref(vp))) {
10632 file_drop(fd);
10633 return error;
10634 }
10635
10636#if CONFIG_MACF
10637 if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
10638 file_drop(fd);
10639 vnode_put(vp);
10640 return error;
10641 }
10642#endif
10643
10644 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
10645
10646 file_drop(fd);
10647
10648	/* Validate vp; fsctl_internal() can drop the iocount and reset vp to NULL. */
10649 if (vp) {
10650 vnode_put(vp);
10651 }
10652
10653 return error;
10654}
10655/* end of fsctl system call */
10656
10657/*
10658 * Retrieve the data of an extended attribute.
10659 */
10660int
10661getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
10662{
10663 vnode_t vp;
10664 struct nameidata nd;
10665 char attrname[XATTR_MAXNAMELEN+1];
10666 vfs_context_t ctx = vfs_context_current();
10667 uio_t auio = NULL;
10668 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10669 size_t attrsize = 0;
10670 size_t namelen;
10671 u_int32_t nameiflags;
10672 int error;
10673 char uio_buf[ UIO_SIZEOF(1) ];
10674
10675 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10676 return (EINVAL);
10677
10678 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10679 NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
10680 if ((error = namei(&nd))) {
10681 return (error);
10682 }
10683 vp = nd.ni_vp;
10684 nameidone(&nd);
10685
10686 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
10687 if (error != 0) {
10688 goto out;
10689 }
10690 if (xattr_protected(attrname)) {
10691 if (!vfs_context_issuser(ctx) || strcmp(attrname, "com.apple.system.Security") != 0) {
10692 error = EPERM;
10693 goto out;
10694 }
10695 }
10696	/*
10697	 * The specific check for 0xffffffff is a hack to preserve
10698	 * binary compatibility in K64 with applications that discovered
10699	 * that passing in a buf pointer and a size of -1 resulted in
10700	 * just the size of the indicated extended attribute being returned.
10701	 * This isn't part of the documented behavior, but because of the
10702	 * original implementation's check for "uap->size > 0", this behavior
10703	 * was allowed. In K32 that check turned into a signed comparison
10704	 * even though uap->size is unsigned... in K64, we blow by that
10705	 * check because uap->size is unsigned and doesn't get sign smeared
10706	 * in the munger for a 32 bit user app. We also need to add a
10707	 * check to limit the maximum size of the buffer being passed in...
10708	 * Unfortunately, the underlying filesystems seem to just malloc
10709	 * the requested size even if the actual extended attribute is tiny.
10710	 * Because that malloc is for kernel wired memory, we have to put a
10711	 * sane limit on it.
10712	 *
10713	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
10714	 * U64 running on K64 will yield -1 (64 bits wide)
10715	 * U32/U64 running on K32 will yield -1 (32 bits wide)
10716	 */
10717 if (uap->size == 0xffffffff || uap->size == (size_t)-1)
10718 goto no_uio;
10719
10720 if (uap->value) {
10721 if (uap->size > (size_t)XATTR_MAXSIZE)
10722 uap->size = XATTR_MAXSIZE;
10723
10724 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
10725 &uio_buf[0], sizeof(uio_buf));
10726 uio_addiov(auio, uap->value, uap->size);
10727 }
10728no_uio:
10729 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
10730out:
10731 vnode_put(vp);
10732
10733 if (auio) {
10734 *retval = uap->size - uio_resid(auio);
10735 } else {
10736 *retval = (user_ssize_t)attrsize;
10737 }
10738
10739 return (error);
10740}
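/*
 * Hypothetical userspace sketch (not part of this file): the common
 * "probe for size, then fetch" pattern that the code above supports by
 * returning only the attribute size when no buffer is supplied. Assumes
 * the wrappers declared in <sys/xattr.h>.
 *
 *   #include <sys/xattr.h>
 *   #include <stdlib.h>
 *
 *   static void *copy_xattr(const char *path, const char *name, ssize_t *lenp)
 *   {
 *       // First call: NULL buffer and zero size returns the attribute size.
 *       ssize_t len = getxattr(path, name, NULL, 0, 0, XATTR_NOFOLLOW);
 *       if (len < 0)
 *           return NULL;
 *       void *buf = malloc((size_t)len);
 *       if (buf == NULL)
 *           return NULL;
 *       // Second call: fetch the data (it may have changed size in between).
 *       len = getxattr(path, name, buf, (size_t)len, 0, XATTR_NOFOLLOW);
 *       if (len < 0) {
 *           free(buf);
 *           return NULL;
 *       }
 *       *lenp = len;
 *       return buf;
 *   }
 */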
10741
10742/*
10743 * Retrieve the data of an extended attribute.
10744 */
10745int
10746fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
10747{
10748 vnode_t vp;
10749 char attrname[XATTR_MAXNAMELEN+1];
10750 uio_t auio = NULL;
10751 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10752 size_t attrsize = 0;
10753 size_t namelen;
10754 int error;
10755 char uio_buf[ UIO_SIZEOF(1) ];
10756
10757 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10758 return (EINVAL);
10759
10760 if ( (error = file_vnode(uap->fd, &vp)) ) {
10761 return (error);
10762 }
10763 if ( (error = vnode_getwithref(vp)) ) {
10764 file_drop(uap->fd);
10765 return(error);
10766 }
10767 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
10768 if (error != 0) {
10769 goto out;
10770 }
10771 if (xattr_protected(attrname)) {
10772 error = EPERM;
10773 goto out;
10774 }
10775 if (uap->value && uap->size > 0) {
10776 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
10777 &uio_buf[0], sizeof(uio_buf));
10778 uio_addiov(auio, uap->value, uap->size);
10779 }
10780
10781 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
10782out:
10783 (void)vnode_put(vp);
10784 file_drop(uap->fd);
10785
10786 if (auio) {
10787 *retval = uap->size - uio_resid(auio);
10788 } else {
10789 *retval = (user_ssize_t)attrsize;
10790 }
10791 return (error);
10792}
10793
10794/*
10795 * Set the data of an extended attribute.
10796 */
10797int
10798setxattr(proc_t p, struct setxattr_args *uap, int *retval)
10799{
10800 vnode_t vp;
10801 struct nameidata nd;
10802 char attrname[XATTR_MAXNAMELEN+1];
10803 vfs_context_t ctx = vfs_context_current();
10804 uio_t auio = NULL;
10805 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10806 size_t namelen;
10807 u_int32_t nameiflags;
10808 int error;
10809 char uio_buf[ UIO_SIZEOF(1) ];
10810
10811 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10812 return (EINVAL);
10813
10814 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
10815 if (error != 0) {
10816 if (error == EPERM) {
10817 /* if the string won't fit in attrname, copyinstr emits EPERM */
10818 return (ENAMETOOLONG);
10819 }
10820 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
10821 return error;
10822 }
10823 if (xattr_protected(attrname))
10824 return(EPERM);
10825 if (uap->size != 0 && uap->value == 0) {
10826 return (EINVAL);
10827 }
10828
10829 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10830 NDINIT(&nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
10831 if ((error = namei(&nd))) {
10832 return (error);
10833 }
10834 vp = nd.ni_vp;
10835 nameidone(&nd);
10836
10837 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
10838 &uio_buf[0], sizeof(uio_buf));
10839 uio_addiov(auio, uap->value, uap->size);
10840
10841 error = vn_setxattr(vp, attrname, auio, uap->options, ctx);
10842#if CONFIG_FSE
10843 if (error == 0) {
10844 add_fsevent(FSE_XATTR_MODIFIED, ctx,
10845 FSE_ARG_VNODE, vp,
10846 FSE_ARG_DONE);
10847 }
10848#endif
10849 vnode_put(vp);
10850 *retval = 0;
10851 return (error);
10852}
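/*
 * Hypothetical userspace sketch (not part of this file): setting an
 * extended attribute, using XATTR_CREATE so an existing attribute of the
 * same name is not silently replaced. The attribute name "com.example.tag"
 * is an arbitrary example. Assumes <sys/xattr.h>.
 *
 *   #include <sys/xattr.h>
 *   #include <string.h>
 *   #include <stdio.h>
 *
 *   static int tag_file(const char *path, const char *value)
 *   {
 *       // position must be 0 except for the resource fork attribute.
 *       if (setxattr(path, "com.example.tag", value, strlen(value),
 *                    0, XATTR_CREATE) == -1) {
 *           perror("setxattr");
 *           return -1;
 *       }
 *       return 0;
 *   }
 */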
10853
10854/*
10855 * Set the data of an extended attribute.
10856 */
10857int
10858fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
10859{
10860 vnode_t vp;
10861 char attrname[XATTR_MAXNAMELEN+1];
10862 uio_t auio = NULL;
10863 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10864 size_t namelen;
10865 int error;
10866 char uio_buf[ UIO_SIZEOF(1) ];
10867#if CONFIG_FSE
10868 vfs_context_t ctx = vfs_context_current();
10869#endif
10870
10871 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10872 return (EINVAL);
10873
10874 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
10875 if (error != 0) {
10876 if (error == EPERM) {
10877 /* if the string won't fit in attrname, copyinstr emits EPERM */
10878 return (ENAMETOOLONG);
10879 }
10880 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
10881 return error;
10882 }
10883 if (xattr_protected(attrname))
10884 return(EPERM);
10885 if (uap->size != 0 && uap->value == 0) {
10886 return (EINVAL);
10887 }
10888 if ( (error = file_vnode(uap->fd, &vp)) ) {
10889 return (error);
10890 }
10891 if ( (error = vnode_getwithref(vp)) ) {
10892 file_drop(uap->fd);
10893 return(error);
10894 }
10895 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
10896 &uio_buf[0], sizeof(uio_buf));
10897 uio_addiov(auio, uap->value, uap->size);
10898
10899 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
10900#if CONFIG_FSE
10901 if (error == 0) {
10902 add_fsevent(FSE_XATTR_MODIFIED, ctx,
10903 FSE_ARG_VNODE, vp,
10904 FSE_ARG_DONE);
10905 }
10906#endif
10907 vnode_put(vp);
10908 file_drop(uap->fd);
10909 *retval = 0;
10910 return (error);
10911}
10912
10913/*
10914 * Remove an extended attribute.
10915 * XXX Code duplication here.
10916 */
10917int
10918removexattr(proc_t p, struct removexattr_args *uap, int *retval)
10919{
10920 vnode_t vp;
10921 struct nameidata nd;
10922 char attrname[XATTR_MAXNAMELEN+1];
10923 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10924 vfs_context_t ctx = vfs_context_current();
10925 size_t namelen;
10926 u_int32_t nameiflags;
10927 int error;
10928
10929 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10930 return (EINVAL);
10931
10932 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
10933 if (error != 0) {
10934 return (error);
10935 }
10936 if (xattr_protected(attrname))
10937 return(EPERM);
10938 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10939 NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
10940 if ((error = namei(&nd))) {
10941 return (error);
10942 }
10943 vp = nd.ni_vp;
10944 nameidone(&nd);
10945
10946 error = vn_removexattr(vp, attrname, uap->options, ctx);
10947#if CONFIG_FSE
10948 if (error == 0) {
10949 add_fsevent(FSE_XATTR_REMOVED, ctx,
10950 FSE_ARG_VNODE, vp,
10951 FSE_ARG_DONE);
10952 }
10953#endif
10954 vnode_put(vp);
10955 *retval = 0;
10956 return (error);
10957}
10958
10959/*
10960 * Remove an extended attribute.
10961 * XXX Code duplication here.
10962 */
10963int
10964fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
10965{
10966 vnode_t vp;
10967 char attrname[XATTR_MAXNAMELEN+1];
10968 size_t namelen;
10969 int error;
10970#if CONFIG_FSE
10971 vfs_context_t ctx = vfs_context_current();
10972#endif
10973
10974 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10975 return (EINVAL);
10976
10977 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
10978 if (error != 0) {
10979 return (error);
10980 }
10981 if (xattr_protected(attrname))
10982 return(EPERM);
10983 if ( (error = file_vnode(uap->fd, &vp)) ) {
10984 return (error);
10985 }
10986 if ( (error = vnode_getwithref(vp)) ) {
10987 file_drop(uap->fd);
10988 return(error);
10989 }
10990
10991 error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
10992#if CONFIG_FSE
10993 if (error == 0) {
10994 add_fsevent(FSE_XATTR_REMOVED, ctx,
10995 FSE_ARG_VNODE, vp,
10996 FSE_ARG_DONE);
10997 }
10998#endif
10999 vnode_put(vp);
11000 file_drop(uap->fd);
11001 *retval = 0;
11002 return (error);
11003}
11004
11005/*
11006 * Retrieve the list of extended attribute names.
11007 * XXX Code duplication here.
11008 */
11009int
11010listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
11011{
11012 vnode_t vp;
11013 struct nameidata nd;
11014 vfs_context_t ctx = vfs_context_current();
11015 uio_t auio = NULL;
11016 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11017 size_t attrsize = 0;
11018 u_int32_t nameiflags;
11019 int error;
11020 char uio_buf[ UIO_SIZEOF(1) ];
11021
11022 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
11023 return (EINVAL);
11024
11025 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
11026 NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
11027 if ((error = namei(&nd))) {
11028 return (error);
11029 }
11030 vp = nd.ni_vp;
11031 nameidone(&nd);
11032 if (uap->namebuf != 0 && uap->bufsize > 0) {
11033 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
11034 &uio_buf[0], sizeof(uio_buf));
11035 uio_addiov(auio, uap->namebuf, uap->bufsize);
11036 }
11037
11038 error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
11039
11040 vnode_put(vp);
11041 if (auio) {
11042 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
11043 } else {
11044 *retval = (user_ssize_t)attrsize;
11045 }
11046 return (error);
11047}
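/*
 * Hypothetical userspace sketch (not part of this file): walking the name
 * list returned by listxattr(), which is packed as consecutive
 * NUL-terminated strings. Assumes <sys/xattr.h>.
 *
 *   #include <sys/xattr.h>
 *   #include <stdlib.h>
 *   #include <stdio.h>
 *   #include <string.h>
 *
 *   static void print_xattr_names(const char *path)
 *   {
 *       ssize_t len = listxattr(path, NULL, 0, XATTR_NOFOLLOW);
 *       if (len <= 0)
 *           return;
 *       char *names = malloc((size_t)len);
 *       if (names == NULL)
 *           return;
 *       len = listxattr(path, names, (size_t)len, XATTR_NOFOLLOW);
 *       if (len > 0) {
 *           // Names are packed back to back, each NUL-terminated.
 *           for (char *p = names; p < names + len; p += strlen(p) + 1)
 *               printf("%s\n", p);
 *       }
 *       free(names);
 *   }
 */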
11048
11049/*
11050 * Retrieve the list of extended attribute names.
11051 * XXX Code duplication here.
11052 */
11053int
11054flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
11055{
11056 vnode_t vp;
11057 uio_t auio = NULL;
11058 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11059 size_t attrsize = 0;
11060 int error;
11061 char uio_buf[ UIO_SIZEOF(1) ];
11062
11063 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
11064 return (EINVAL);
11065
11066 if ( (error = file_vnode(uap->fd, &vp)) ) {
11067 return (error);
11068 }
11069 if ( (error = vnode_getwithref(vp)) ) {
11070 file_drop(uap->fd);
11071 return(error);
11072 }
11073 if (uap->namebuf != 0 && uap->bufsize > 0) {
11074 auio = uio_createwithbuffer(1, 0, spacetype,
11075 UIO_READ, &uio_buf[0], sizeof(uio_buf));
11076 uio_addiov(auio, uap->namebuf, uap->bufsize);
11077 }
11078
11079 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
11080
11081 vnode_put(vp);
11082 file_drop(uap->fd);
11083 if (auio) {
11084 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
11085 } else {
11086 *retval = (user_ssize_t)attrsize;
11087 }
11088 return (error);
11089}
11090
11091static int fsgetpath_internal(
11092 vfs_context_t ctx, int volfs_id, uint64_t objid,
11093 vm_size_t bufsize, caddr_t buf, int *pathlen)
11094{
11095 int error;
11096 struct mount *mp = NULL;
11097 vnode_t vp;
11098 int length;
11099 int bpflags;
11100 /* maximum number of times to retry build_path */
11101 unsigned int retries = 0x10;
11102
11103 if (bufsize > PAGE_SIZE) {
11104 return (EINVAL);
11105 }
11106
11107 if (buf == NULL) {
11108 return (ENOMEM);
11109 }
11110
11111retry:
11112 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
11113		/* unexpected failure */
11114		return (ENOTSUP);
11115 }
11116
11117unionget:
11118 if (objid == 2) {
11119 error = VFS_ROOT(mp, &vp, ctx);
11120 } else {
11121 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
11122 }
11123
11124 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
11125 /*
11126 * If the fileid isn't found and we're in a union
11127 * mount volume, then see if the fileid is in the
11128 * mounted-on volume.
11129 */
11130 struct mount *tmp = mp;
11131 mp = vnode_mount(tmp->mnt_vnodecovered);
11132 vfs_unbusy(tmp);
11133 if (vfs_busy(mp, LK_NOWAIT) == 0)
11134 goto unionget;
11135 } else {
11136 vfs_unbusy(mp);
11137 }
11138
11139 if (error) {
11140 return error;
11141 }
11142
11143#if CONFIG_MACF
11144 error = mac_vnode_check_fsgetpath(ctx, vp);
11145 if (error) {
11146 vnode_put(vp);
11147 return error;
11148 }
11149#endif
11150
11151 /* Obtain the absolute path to this vnode. */
11152 bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
11153 bpflags |= BUILDPATH_CHECK_MOVED;
11154 error = build_path(vp, buf, bufsize, &length, bpflags, ctx);
11155 vnode_put(vp);
11156
11157 if (error) {
11158 /* there was a race building the path, try a few more times */
11159 if (error == EAGAIN) {
11160 --retries;
11161 if (retries > 0)
11162 goto retry;
11163
11164 error = ENOENT;
11165 }
11166 goto out;
11167 }
11168
11169 AUDIT_ARG(text, buf);
11170
11171 if (kdebug_enable) {
11172 long dbg_parms[NUMPARMS];
11173 int dbg_namelen;
11174
11175 dbg_namelen = (int)sizeof(dbg_parms);
11176
11177 if (length < dbg_namelen) {
11178 memcpy((char *)dbg_parms, buf, length);
11179 memset((char *)dbg_parms + length, 0, dbg_namelen - length);
11180
11181 dbg_namelen = length;
11182 } else {
11183 memcpy((char *)dbg_parms, buf + (length - dbg_namelen), dbg_namelen);
11184 }
11185
11186 kdebug_vfs_lookup(dbg_parms, dbg_namelen, (void *)vp,
11187 KDBG_VFS_LOOKUP_FLAG_LOOKUP);
11188 }
11189
11190	*pathlen = length; /* may be superseded by error */
11191
11192out:
11193 return (error);
11194}
11195
11196/*
11197 * Obtain the full pathname of a file system object by id.
11198 */
11199int
11200fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
11201{
11202 vfs_context_t ctx = vfs_context_current();
11203 fsid_t fsid;
11204 char *realpath;
11205 int length;
11206 int error;
11207
11208 if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
11209 return (error);
11210 }
11211 AUDIT_ARG(value32, fsid.val[0]);
11212 AUDIT_ARG(value64, uap->objid);
11213 /* Restrict output buffer size for now. */
11214
11215 if (uap->bufsize > PAGE_SIZE) {
11216 return (EINVAL);
11217 }
11218 MALLOC(realpath, char *, uap->bufsize, M_TEMP, M_WAITOK | M_ZERO);
11219 if (realpath == NULL) {
11220 return (ENOMEM);
11221 }
11222
11223 error = fsgetpath_internal(
11224 ctx, fsid.val[0], uap->objid,
11225 uap->bufsize, realpath, &length);
11226
11227 if (error) {
11228 goto out;
11229 }
11230
11231 error = copyout((caddr_t)realpath, uap->buf, length);
11232
11233 *retval = (user_ssize_t)length; /* may be superseded by error */
11234out:
11235 if (realpath) {
11236 FREE(realpath, M_TEMP);
11237 }
11238 return (error);
11239}
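/*
 * Hypothetical userspace sketch (not part of this file): recovering a path
 * from an <fsid, file id> pair, e.g. one recorded earlier via stat(2) and
 * statfs(2). Assumes the fsgetpath() wrapper declared in <sys/fsgetpath.h>
 * (available on macOS 10.13 and later); note the kernel caps the buffer at
 * PAGE_SIZE, so MAXPATHLEN is within range.
 *
 *   #include <sys/param.h>
 *   #include <sys/mount.h>
 *   #include <sys/stat.h>
 *   #include <sys/fsgetpath.h>
 *   #include <stdio.h>
 *
 *   static int remember_and_resolve(const char *path)
 *   {
 *       struct stat st;
 *       struct statfs sfs;
 *       char buf[MAXPATHLEN];
 *
 *       if (stat(path, &st) == -1 || statfs(path, &sfs) == -1)
 *           return -1;
 *       // Later, with only the fsid and file id in hand:
 *       if (fsgetpath(buf, sizeof(buf), &sfs.f_fsid, st.st_ino) == -1) {
 *           perror("fsgetpath");
 *           return -1;
 *       }
 *       printf("%s\n", buf);
 *       return 0;
 *   }
 */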
11240
11241/*
11242 * Common routine to handle various flavors of statfs data heading out
11243 * to user space.
11244 *
11245 * Returns: 0 Success
11246 * EFAULT
11247 */
11248static int
11249munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
11250 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
11251 boolean_t partial_copy)
11252{
11253 int error;
11254 int my_size, copy_size;
11255
11256 if (is_64_bit) {
11257 struct user64_statfs sfs;
11258 my_size = copy_size = sizeof(sfs);
11259 bzero(&sfs, my_size);
11260 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
11261 sfs.f_type = mp->mnt_vtable->vfc_typenum;
11262 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
11263 sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
11264 sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
11265 sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
11266 sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
11267 sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
11268 sfs.f_files = (user64_long_t)sfsp->f_files;
11269 sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
11270 sfs.f_fsid = sfsp->f_fsid;
11271 sfs.f_owner = sfsp->f_owner;
11272 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
11273 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
11274 } else {
11275 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
11276 }
11277 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
11278 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
11279
11280 if (partial_copy) {
11281 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
11282 }
11283 error = copyout((caddr_t)&sfs, bufp, copy_size);
11284 }
11285 else {
11286 struct user32_statfs sfs;
11287
11288 my_size = copy_size = sizeof(sfs);
11289 bzero(&sfs, my_size);
11290
11291 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
11292 sfs.f_type = mp->mnt_vtable->vfc_typenum;
11293 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
11294
11295 /*
11296	 * It's possible for there to be more than 2^31 blocks in the filesystem, so we
11297 * have to fudge the numbers here in that case. We inflate the blocksize in order
11298 * to reflect the filesystem size as best we can.
11299 */
11300 if ((sfsp->f_blocks > INT_MAX)
11301		    /* Hack for 4061702. I think the real fix is for Carbon to
11302		     * look for some volume capability and not depend on hidden
11303		     * semantics agreed between a FS and Carbon.
11304		     * f_blocks, f_bfree, and f_bavail set to -1 are the trigger
11305		     * for Carbon to set the bNoVolumeSizes volume attribute.
11306		     * Without this, webdavfs files cannot be copied onto
11307		     * disk as they look huge. This change should not affect
11308		     * XSAN, as it should not be setting these to -1.
11309 */
11310 && (sfsp->f_blocks != 0xffffffffffffffffULL)
11311 && (sfsp->f_bfree != 0xffffffffffffffffULL)
11312 && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
11313 int shift;
11314
11315 /*
11316 * Work out how far we have to shift the block count down to make it fit.
11317 * Note that it's possible to have to shift so far that the resulting
11318 * blocksize would be unreportably large. At that point, we will clip
11319 * any values that don't fit.
11320 *
11321 * For safety's sake, we also ensure that f_iosize is never reported as
11322 * being smaller than f_bsize.
11323 */
11324 for (shift = 0; shift < 32; shift++) {
11325 if ((sfsp->f_blocks >> shift) <= INT_MAX)
11326 break;
11327 if ((sfsp->f_bsize << (shift + 1)) > INT_MAX)
11328 break;
11329 }
11330#define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
11331 sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
11332 sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
11333 sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
11334#undef __SHIFT_OR_CLIP
11335 sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
11336 sfs.f_iosize = lmax(sfsp->f_iosize, sfsp->f_bsize);
11337 } else {
11338 /* filesystem is small enough to be reported honestly */
11339 sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
11340 sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
11341 sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
11342 sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
11343 sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
11344 }
11345 sfs.f_files = (user32_long_t)sfsp->f_files;
11346 sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
11347 sfs.f_fsid = sfsp->f_fsid;
11348 sfs.f_owner = sfsp->f_owner;
11349 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
11350 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
11351 } else {
11352 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
11353 }
11354 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
11355 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
11356
11357 if (partial_copy) {
11358 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
11359 }
11360 error = copyout((caddr_t)&sfs, bufp, copy_size);
11361 }
11362
11363 if (sizep != NULL) {
11364 *sizep = my_size;
11365 }
11366 return(error);
11367}
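/*
 * Worked example of the 32-bit block-count scaling above (illustrative
 * numbers only): a volume with f_blocks = 0x200000000 (2^33) blocks of
 * f_bsize = 4096 bytes cannot report its block count in a 32-bit field.
 * The loop stops at shift = 3, the first value where
 * (f_blocks >> shift) <= INT_MAX, so the 32-bit statfs reports
 * f_blocks = 0x40000000 and f_bsize = 4096 << 3 = 32768. The product,
 * 2^30 * 2^15 = 2^45 bytes, still equals the true volume size of
 * 2^33 * 2^12 bytes; only the granularity of the block counts is coarsened.
 */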
11368
11369/*
11370 * copy stat structure into user_stat structure.
11371 */
11372void munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
11373{
11374 bzero(usbp, sizeof(*usbp));
11375
11376 usbp->st_dev = sbp->st_dev;
11377 usbp->st_ino = sbp->st_ino;
11378 usbp->st_mode = sbp->st_mode;
11379 usbp->st_nlink = sbp->st_nlink;
11380 usbp->st_uid = sbp->st_uid;
11381 usbp->st_gid = sbp->st_gid;
11382 usbp->st_rdev = sbp->st_rdev;
11383#ifndef _POSIX_C_SOURCE
11384 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
11385 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
11386 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
11387 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
11388 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
11389 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
11390#else
11391 usbp->st_atime = sbp->st_atime;
11392 usbp->st_atimensec = sbp->st_atimensec;
11393 usbp->st_mtime = sbp->st_mtime;
11394 usbp->st_mtimensec = sbp->st_mtimensec;
11395 usbp->st_ctime = sbp->st_ctime;
11396 usbp->st_ctimensec = sbp->st_ctimensec;
11397#endif
11398 usbp->st_size = sbp->st_size;
11399 usbp->st_blocks = sbp->st_blocks;
11400 usbp->st_blksize = sbp->st_blksize;
11401 usbp->st_flags = sbp->st_flags;
11402 usbp->st_gen = sbp->st_gen;
11403 usbp->st_lspare = sbp->st_lspare;
11404 usbp->st_qspare[0] = sbp->st_qspare[0];
11405 usbp->st_qspare[1] = sbp->st_qspare[1];
11406}
11407
11408void munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
11409{
11410 bzero(usbp, sizeof(*usbp));
11411
11412 usbp->st_dev = sbp->st_dev;
11413 usbp->st_ino = sbp->st_ino;
11414 usbp->st_mode = sbp->st_mode;
11415 usbp->st_nlink = sbp->st_nlink;
11416 usbp->st_uid = sbp->st_uid;
11417 usbp->st_gid = sbp->st_gid;
11418 usbp->st_rdev = sbp->st_rdev;
11419#ifndef _POSIX_C_SOURCE
11420 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
11421 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
11422 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
11423 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
11424 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
11425 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
11426#else
11427 usbp->st_atime = sbp->st_atime;
11428 usbp->st_atimensec = sbp->st_atimensec;
11429 usbp->st_mtime = sbp->st_mtime;
11430 usbp->st_mtimensec = sbp->st_mtimensec;
11431 usbp->st_ctime = sbp->st_ctime;
11432 usbp->st_ctimensec = sbp->st_ctimensec;
11433#endif
11434 usbp->st_size = sbp->st_size;
11435 usbp->st_blocks = sbp->st_blocks;
11436 usbp->st_blksize = sbp->st_blksize;
11437 usbp->st_flags = sbp->st_flags;
11438 usbp->st_gen = sbp->st_gen;
11439 usbp->st_lspare = sbp->st_lspare;
11440 usbp->st_qspare[0] = sbp->st_qspare[0];
11441 usbp->st_qspare[1] = sbp->st_qspare[1];
11442}
11443
11444/*
11445 * copy stat64 structure into user_stat64 structure.
11446 */
11447void munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
11448{
11449 bzero(usbp, sizeof(*usbp));
11450
11451 usbp->st_dev = sbp->st_dev;
11452 usbp->st_ino = sbp->st_ino;
11453 usbp->st_mode = sbp->st_mode;
11454 usbp->st_nlink = sbp->st_nlink;
11455 usbp->st_uid = sbp->st_uid;
11456 usbp->st_gid = sbp->st_gid;
11457 usbp->st_rdev = sbp->st_rdev;
11458#ifndef _POSIX_C_SOURCE
11459 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
11460 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
11461 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
11462 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
11463 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
11464 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
11465 usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
11466 usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
11467#else
11468 usbp->st_atime = sbp->st_atime;
11469 usbp->st_atimensec = sbp->st_atimensec;
11470 usbp->st_mtime = sbp->st_mtime;
11471 usbp->st_mtimensec = sbp->st_mtimensec;
11472 usbp->st_ctime = sbp->st_ctime;
11473 usbp->st_ctimensec = sbp->st_ctimensec;
11474 usbp->st_birthtime = sbp->st_birthtime;
11475 usbp->st_birthtimensec = sbp->st_birthtimensec;
11476#endif
11477 usbp->st_size = sbp->st_size;
11478 usbp->st_blocks = sbp->st_blocks;
11479 usbp->st_blksize = sbp->st_blksize;
11480 usbp->st_flags = sbp->st_flags;
11481 usbp->st_gen = sbp->st_gen;
11482 usbp->st_lspare = sbp->st_lspare;
11483 usbp->st_qspare[0] = sbp->st_qspare[0];
11484 usbp->st_qspare[1] = sbp->st_qspare[1];
11485}
11486
11487void munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
11488{
11489 bzero(usbp, sizeof(*usbp));
11490
11491 usbp->st_dev = sbp->st_dev;
11492 usbp->st_ino = sbp->st_ino;
11493 usbp->st_mode = sbp->st_mode;
11494 usbp->st_nlink = sbp->st_nlink;
11495 usbp->st_uid = sbp->st_uid;
11496 usbp->st_gid = sbp->st_gid;
11497 usbp->st_rdev = sbp->st_rdev;
11498#ifndef _POSIX_C_SOURCE
11499 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
11500 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
11501 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
11502 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
11503 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
11504 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
11505 usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
11506 usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
11507#else
11508 usbp->st_atime = sbp->st_atime;
11509 usbp->st_atimensec = sbp->st_atimensec;
11510 usbp->st_mtime = sbp->st_mtime;
11511 usbp->st_mtimensec = sbp->st_mtimensec;
11512 usbp->st_ctime = sbp->st_ctime;
11513 usbp->st_ctimensec = sbp->st_ctimensec;
11514 usbp->st_birthtime = sbp->st_birthtime;
11515 usbp->st_birthtimensec = sbp->st_birthtimensec;
11516#endif
11517 usbp->st_size = sbp->st_size;
11518 usbp->st_blocks = sbp->st_blocks;
11519 usbp->st_blksize = sbp->st_blksize;
11520 usbp->st_flags = sbp->st_flags;
11521 usbp->st_gen = sbp->st_gen;
11522 usbp->st_lspare = sbp->st_lspare;
11523 usbp->st_qspare[0] = sbp->st_qspare[0];
11524 usbp->st_qspare[1] = sbp->st_qspare[1];
11525}
11526
11527/*
11528 * Purge buffer cache for simulating cold starts
11529 */
11530static int vnode_purge_callback(struct vnode *vp, __unused void *cargs)
11531{
11532 ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
11533
11534 return VNODE_RETURNED;
11535}
11536
11537static int vfs_purge_callback(mount_t mp, __unused void * arg)
11538{
11539 vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
11540
11541 return VFS_RETURNED;
11542}
11543
11544int
11545vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
11546{
11547 if (!kauth_cred_issuser(kauth_cred_get()))
11548 return EPERM;
11549
11550 vfs_iterate(0/* flags */, vfs_purge_callback, NULL);
11551
11552 return 0;
11553}
11554
11555/*
11556	 * Gets the vnode associated with the (unnamed) snapshot directory
11557	 * for a filesystem. The snapshot directory vnode is returned with
11558 * an iocount on it.
11559 */
11560int
11561vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
11562{
11563 return (VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx));
11564}
11565
11566/*
11567 * Get the snapshot vnode.
11568 *
11569	 * If successful, the call returns with an iocount on *rvpp and *sdvpp,
11570	 * and the caller must call nameidone() on ndp.
11571 *
11572 * If the snapshot vnode exists it is returned in ndp->ni_vp.
11573 *
11574 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
11575 * not needed.
11576 */
11577static int
11578vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
11579 user_addr_t name, struct nameidata *ndp, int32_t op,
11580#if !CONFIG_TRIGGERS
11581 __unused
11582#endif
11583 enum path_operation pathop,
11584 vfs_context_t ctx)
11585{
11586 int error, i;
11587 caddr_t name_buf;
11588 size_t name_len;
11589 struct vfs_attr vfa;
11590
11591 *sdvpp = NULLVP;
11592 *rvpp = NULLVP;
11593
11594 error = vnode_getfromfd(ctx, dirfd, rvpp);
11595 if (error)
11596 return (error);
11597
11598 if (!vnode_isvroot(*rvpp)) {
11599 error = EINVAL;
11600 goto out;
11601 }
11602
11603 /* Make sure the filesystem supports snapshots */
11604 VFSATTR_INIT(&vfa);
11605 VFSATTR_WANTED(&vfa, f_capabilities);
11606 if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
11607 !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
11608 !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
11609 VOL_CAP_INT_SNAPSHOT)) ||
11610 !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
11611 VOL_CAP_INT_SNAPSHOT))) {
11612 error = ENOTSUP;
11613 goto out;
11614 }
11615
11616 error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
11617 if (error)
11618 goto out;
11619
11620 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
11621 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
11622 if (error)
11623 goto out1;
11624
11625 /*
11626	 * Some sanity checks: the name can't be empty, ".", or "..", and can't contain slashes.
11627	 * (The length returned by copyinstr includes the terminating NUL.)
11628 */
11629 if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
11630 (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
11631 error = EINVAL;
11632 goto out1;
11633 }
11634 for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++);
11635 if (i < (int)name_len) {
11636 error = EINVAL;
11637 goto out1;
11638 }
11639
11640#if CONFIG_MACF
11641 if (op == CREATE) {
11642 error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
11643 name_buf);
11644 } else if (op == DELETE) {
11645 error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
11646 name_buf);
11647 }
11648 if (error)
11649 goto out1;
11650#endif
11651
11652 /* Check if the snapshot already exists ... */
11653 NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
11654 UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
11655 ndp->ni_dvp = *sdvpp;
11656
11657 error = namei(ndp);
11658out1:
11659 FREE(name_buf, M_TEMP);
11660out:
11661 if (error) {
11662 if (*sdvpp) {
11663 vnode_put(*sdvpp);
11664 *sdvpp = NULLVP;
11665 }
11666 if (*rvpp) {
11667 vnode_put(*rvpp);
11668 *rvpp = NULLVP;
11669 }
11670 }
11671 return (error);
11672}
11673
11674/*
11675	 * Create a filesystem snapshot (for supporting filesystems).
11676	 *
11677	 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL).
11678 * We get to the (unnamed) snapshot directory vnode and create the vnode
11679 * for the snapshot in it.
11680 *
11681 * Restrictions:
11682 *
11683 * a) Passed in name for snapshot cannot have slashes.
11684 * b) name can't be "." or ".."
11685 *
11686 * Since this requires superuser privileges, vnode_authorize calls are not
11687 * made.
11688 */
11689static int
11690snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
11691 vfs_context_t ctx)
11692{
11693 vnode_t rvp, snapdvp;
11694 int error;
11695 struct nameidata namend;
11696
11697 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, CREATE,
11698 OP_LINK, ctx);
11699 if (error)
11700 return (error);
11701
11702 if (namend.ni_vp) {
11703 vnode_put(namend.ni_vp);
11704 error = EEXIST;
11705 } else {
11706 struct vnode_attr va;
11707 vnode_t vp = NULLVP;
11708
11709 VATTR_INIT(&va);
11710 VATTR_SET(&va, va_type, VREG);
11711 VATTR_SET(&va, va_mode, 0);
11712
11713 error = vn_create(snapdvp, &vp, &namend, &va,
11714 VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
11715 if (!error && vp)
11716 vnode_put(vp);
11717 }
11718
11719 nameidone(&namend);
11720 vnode_put(snapdvp);
11721 vnode_put(rvp);
11722 return (error);
11723}
11724
11725/*
11726	 * Delete a filesystem snapshot.
11727	 *
11728	 * Get the vnode for the unnamed snapshot directory and the snapshot, and
11729	 * delete the snapshot.
11730 */
11731static int
11732snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
11733 vfs_context_t ctx)
11734{
11735 vnode_t rvp, snapdvp;
11736 int error;
11737 struct nameidata namend;
11738
11739 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, DELETE,
11740 OP_UNLINK, ctx);
11741 if (error)
11742 goto out;
11743
11744 error = VNOP_REMOVE(snapdvp, namend.ni_vp, &namend.ni_cnd,
11745 VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);
11746
11747 vnode_put(namend.ni_vp);
11748 nameidone(&namend);
11749 vnode_put(snapdvp);
11750 vnode_put(rvp);
11751out:
11752 return (error);
11753}
11754
11755/*
11756 * Revert a filesystem to a snapshot
11757 *
11758 * Marks the filesystem to revert to the given snapshot on next mount.
11759 */
11760static int
11761snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
11762 vfs_context_t ctx)
11763{
11764 int error;
11765 vnode_t rvp;
11766 mount_t mp;
11767 struct fs_snapshot_revert_args revert_data;
11768 struct componentname cnp;
11769 caddr_t name_buf;
11770 size_t name_len;
11771
11772 error = vnode_getfromfd(ctx, dirfd, &rvp);
11773 if (error) {
11774 return (error);
11775 }
11776 mp = vnode_mount(rvp);
11777
11778 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
11779 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
11780 if (error) {
11781 FREE(name_buf, M_TEMP);
11782 vnode_put(rvp);
11783 return (error);
11784 }
11785
11786#if CONFIG_MACF
11787 error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
11788 if (error) {
11789 FREE(name_buf, M_TEMP);
11790 vnode_put(rvp);
11791 return (error);
11792 }
11793#endif
11794
11795 /*
11796 * Grab mount_iterref so that we can release the vnode,
11797 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
11798 */
11799 error = mount_iterref (mp, 0);
11800 vnode_put(rvp);
11801 if (error) {
11802 FREE(name_buf, M_TEMP);
11803 return (error);
11804 }
11805
11806 memset(&cnp, 0, sizeof(cnp));
11807 cnp.cn_pnbuf = (char *)name_buf;
11808 cnp.cn_nameiop = LOOKUP;
11809 cnp.cn_flags = ISLASTCN | HASBUF;
11810 cnp.cn_pnlen = MAXPATHLEN;
11811 cnp.cn_nameptr = cnp.cn_pnbuf;
11812 cnp.cn_namelen = (int)name_len;
11813 revert_data.sr_cnp = &cnp;
11814
11815 error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
11816 mount_iterdrop(mp);
11817 FREE(name_buf, M_TEMP);
11818
11819 if (error) {
11820 /* If there was any error, try again using VNOP_IOCTL */
11821
11822 vnode_t snapdvp;
11823 struct nameidata namend;
11824
11825 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
11826 OP_LOOKUP, ctx);
11827 if (error) {
11828 return (error);
11829 }
11830
11831
11832 error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
11833 0, ctx);
11834
11835 vnode_put(namend.ni_vp);
11836 nameidone(&namend);
11837 vnode_put(snapdvp);
11838 vnode_put(rvp);
11839 }
11840
11841 return (error);
11842}
11843
11844/*
11845	 * Rename a filesystem snapshot.
11846	 *
11847	 * Get the vnode for the unnamed snapshot directory and the snapshot, and
11848	 * rename the snapshot. This is a very specialised (and simple) case of
11849 * rename(2) (which has to deal with a lot more complications). It differs
11850 * slightly from rename(2) in that EEXIST is returned if the new name exists.
11851 */
11852static int
11853snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
11854 __unused uint32_t flags, vfs_context_t ctx)
11855{
11856 vnode_t rvp, snapdvp;
11857 int error, i;
11858 caddr_t newname_buf;
11859 size_t name_len;
11860 vnode_t fvp;
11861 struct nameidata *fromnd, *tond;
11862 /* carving out a chunk for structs that are too big to be on stack. */
11863 struct {
11864 struct nameidata from_node;
11865 struct nameidata to_node;
11866 } * __rename_data;
11867
11868 MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
11869 fromnd = &__rename_data->from_node;
11870 tond = &__rename_data->to_node;
11871
11872 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
11873 OP_UNLINK, ctx);
11874 if (error)
11875 goto out;
11876 fvp = fromnd->ni_vp;
11877
11878 MALLOC(newname_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
11879 error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
11880 if (error)
11881 goto out1;
11882
11883 /*
11884	 * Some sanity checks: the new name can't be empty, ".", or "..", and
11885	 * can't contain slashes.
11886	 * (The length returned by copyinstr includes the terminating NUL.)
11887	 *
11888	 * The FS rename VNOP is supposed to handle this, but we pick it
11889	 * off here as well.
11890 */
11891 if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
11892 (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
11893 error = EINVAL;
11894 goto out1;
11895 }
11896 for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++);
11897 if (i < (int)name_len) {
11898 error = EINVAL;
11899 goto out1;
11900 }
11901
11902#if CONFIG_MACF
11903 error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
11904 newname_buf);
11905 if (error)
11906 goto out1;
11907#endif
11908
11909 NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
11910 UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
11911 tond->ni_dvp = snapdvp;
11912
11913 error = namei(tond);
11914 if (error) {
11915 goto out2;
11916 } else if (tond->ni_vp) {
11917 /*
11918 * snapshot rename behaves differently than rename(2) - if the
11919 * new name exists, EEXIST is returned.
11920 */
11921 vnode_put(tond->ni_vp);
11922 error = EEXIST;
11923 goto out2;
11924 }
11925
11926 error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
11927 &tond->ni_cnd, ctx);
11928
11929out2:
11930 nameidone(tond);
11931out1:
11932 FREE(newname_buf, M_TEMP);
11933 vnode_put(fvp);
11934 vnode_put(snapdvp);
11935 vnode_put(rvp);
11936 nameidone(fromnd);
11937out:
11938 FREE(__rename_data, M_TEMP);
11939 return (error);
11940}
11941
11942/*
11943	 * Mount a filesystem snapshot.
11944	 *
11945	 * Get the vnode for the unnamed snapshot directory and the snapshot, and
11946	 * mount the snapshot.
11947 */
11948static int
11949snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
11950 __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
11951{
11952 vnode_t rvp, snapdvp, snapvp, vp, pvp;
11953 int error;
11954 struct nameidata *snapndp, *dirndp;
11955 /* carving out a chunk for structs that are too big to be on stack. */
11956 struct {
11957 struct nameidata snapnd;
11958 struct nameidata dirnd;
11959 } * __snapshot_mount_data;
11960
11961 MALLOC(__snapshot_mount_data, void *, sizeof(*__snapshot_mount_data),
11962 M_TEMP, M_WAITOK);
11963 snapndp = &__snapshot_mount_data->snapnd;
11964 dirndp = &__snapshot_mount_data->dirnd;
11965
11966 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
11967 OP_LOOKUP, ctx);
11968 if (error)
11969 goto out;
11970
11971 snapvp = snapndp->ni_vp;
11972 if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
11973 error = EIO;
11974 goto out1;
11975 }
11976
11977 /* Get the vnode to be covered */
11978 NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
11979 UIO_USERSPACE, directory, ctx);
11980 error = namei(dirndp);
11981 if (error)
11982 goto out1;
11983
11984 vp = dirndp->ni_vp;
11985 pvp = dirndp->ni_dvp;
11986
11987 if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
11988 error = EINVAL;
11989 } else {
11990 mount_t mp = vnode_mount(rvp);
11991 struct fs_snapshot_mount_args smnt_data;
11992
11993 smnt_data.sm_mp = mp;
11994 smnt_data.sm_cnp = &snapndp->ni_cnd;
11995 error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
11996 &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
11997 KERNEL_MOUNT_SNAPSHOT, NULL, FALSE, ctx);
11998 }
11999
12000 vnode_put(vp);
12001 vnode_put(pvp);
12002 nameidone(dirndp);
12003out1:
12004 vnode_put(snapvp);
12005 vnode_put(snapdvp);
12006 vnode_put(rvp);
12007 nameidone(snapndp);
12008out:
12009 FREE(__snapshot_mount_data, M_TEMP);
12010 return (error);
12011}
12012
12013/*
12014 * Root from a snapshot of the filesystem
12015 *
12016 * Marks the filesystem to root from the given snapshot on next boot.
12017 */
12018static int
12019snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
12020 vfs_context_t ctx)
12021{
12022 int error;
12023 vnode_t rvp;
12024 mount_t mp;
12025 struct fs_snapshot_root_args root_data;
12026 struct componentname cnp;
12027 caddr_t name_buf;
12028 size_t name_len;
12029
12030 error = vnode_getfromfd(ctx, dirfd, &rvp);
12031 if (error) {
12032 return (error);
12033 }
12034 mp = vnode_mount(rvp);
12035
12036 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
12037 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
12038 if (error) {
12039 FREE(name_buf, M_TEMP);
12040 vnode_put(rvp);
12041 return (error);
12042 }
12043
12044 // XXX MAC checks ?
12045
12046 /*
12047 * Grab mount_iterref so that we can release the vnode,
12048 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
12049 */
12050 error = mount_iterref (mp, 0);
12051 vnode_put(rvp);
12052 if (error) {
12053 FREE(name_buf, M_TEMP);
12054 return (error);
12055 }
12056
12057 memset(&cnp, 0, sizeof(cnp));
12058 cnp.cn_pnbuf = (char *)name_buf;
12059 cnp.cn_nameiop = LOOKUP;
12060 cnp.cn_flags = ISLASTCN | HASBUF;
12061 cnp.cn_pnlen = MAXPATHLEN;
12062 cnp.cn_nameptr = cnp.cn_pnbuf;
12063 cnp.cn_namelen = (int)name_len;
12064 root_data.sr_cnp = &cnp;
12065
12066 error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
12067
12068 mount_iterdrop(mp);
12069 FREE(name_buf, M_TEMP);
12070
12071 return (error);
12072}
12073
12074/*
12075 * FS snapshot operations dispatcher
12076 */
12077int
12078fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
12079 __unused int32_t *retval)
12080{
12081 int error;
12082 vfs_context_t ctx = vfs_context_current();
12083
12084 AUDIT_ARG(fd, uap->dirfd);
12085 AUDIT_ARG(value32, uap->op);
12086
12087 error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_SNAPSHOT, 0);
12088 if (error)
12089 return (error);
12090
12091 switch (uap->op) {
12092 case SNAPSHOT_OP_CREATE:
12093 error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
12094 break;
12095 case SNAPSHOT_OP_DELETE:
12096 error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
12097 break;
12098 case SNAPSHOT_OP_RENAME:
12099 error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
12100 uap->flags, ctx);
12101 break;
12102 case SNAPSHOT_OP_MOUNT:
12103 error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
12104 uap->data, uap->flags, ctx);
12105 break;
12106 case SNAPSHOT_OP_REVERT:
12107 error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
12108 break;
12109#if CONFIG_MNT_ROOTSNAP
12110 case SNAPSHOT_OP_ROOT:
12111 error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
12112 break;
12113#endif /* CONFIG_MNT_ROOTSNAP */
12114 default:
12115 error = ENOSYS;
12116 }
12117
12118 return (error);
12119}
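/*
 * Hypothetical userspace sketch (not part of this file): creating and then
 * deleting a snapshot through the libsystem wrappers that funnel into the
 * fs_snapshot() syscall above. Assumes the fs_snapshot_create() and
 * fs_snapshot_delete() declarations in <sys/snapshot.h>; the caller must
 * hold snapshot privileges, matching the PRIV_VFS_SNAPSHOT check above.
 *
 *   #include <sys/snapshot.h>
 *   #include <fcntl.h>
 *   #include <unistd.h>
 *   #include <stdio.h>
 *
 *   static int snapshot_roundtrip(const char *volume, const char *name)
 *   {
 *       int dirfd = open(volume, O_RDONLY);   // must be the volume root
 *       if (dirfd == -1)
 *           return -1;
 *       if (fs_snapshot_create(dirfd, name, 0) == -1) {
 *           perror("fs_snapshot_create");
 *           close(dirfd);
 *           return -1;
 *       }
 *       int rc = fs_snapshot_delete(dirfd, name, 0);
 *       close(dirfd);
 *       return rc;
 *   }
 */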
12120