1/*
2 * Copyright (c) 1995-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67/*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74#include <sys/param.h>
75#include <sys/systm.h>
76#include <sys/namei.h>
77#include <sys/filedesc.h>
78#include <sys/kernel.h>
79#include <sys/file_internal.h>
80#include <sys/stat.h>
81#include <sys/vnode_internal.h>
82#include <sys/mount_internal.h>
83#include <sys/proc_internal.h>
84#include <sys/kauth.h>
85#include <sys/uio_internal.h>
86#include <kern/kalloc.h>
87#include <sys/mman.h>
88#include <sys/dirent.h>
89#include <sys/attr.h>
90#include <sys/sysctl.h>
91#include <sys/ubc.h>
92#include <sys/quota.h>
93#include <sys/kdebug.h>
94#include <sys/fsevents.h>
95#include <sys/imgsrc.h>
96#include <sys/sysproto.h>
97#include <sys/sysctl.h>
98#include <sys/xattr.h>
99#include <sys/fcntl.h>
100#include <sys/stdio.h>
101#include <sys/fsctl.h>
102#include <sys/ubc_internal.h>
103#include <sys/disk.h>
104#include <sys/content_protection.h>
105#include <sys/clonefile.h>
106#include <sys/snapshot.h>
107#include <sys/priv.h>
108#include <sys/fsgetpath.h>
109#include <machine/cons.h>
110#include <machine/limits.h>
111#include <miscfs/specfs/specdev.h>
112
113#include <vfs/vfs_disk_conditioner.h>
114#if CONFIG_EXCLAVES
115#include <vfs/vfs_exclave_fs.h>
116#endif
117
118#include <security/audit/audit.h>
119#include <bsm/audit_kevents.h>
120
121#include <mach/mach_types.h>
122#include <kern/kern_types.h>
123#include <kern/kalloc.h>
124#include <kern/task.h>
125
126#include <vm/vm_pageout.h>
127#include <vm/vm_protos.h>
128
129#include <libkern/OSAtomic.h>
130#include <os/atomic_private.h>
131#include <pexpert/pexpert.h>
132#include <IOKit/IOBSD.h>
133
134// deps for MIG call
135#include <kern/host.h>
136#include <kern/ipc_misc.h>
137#include <mach/host_priv.h>
138#include <mach/vfs_nspace.h>
139#include <os/log.h>
140
141#include <nfs/nfs_conf.h>
142
143#if ROUTEFS
144#include <miscfs/routefs/routefs.h>
145#endif /* ROUTEFS */
146
147#if CONFIG_MACF
148#include <security/mac.h>
149#include <security/mac_framework.h>
150#endif
151
152#if CONFIG_FSE
153#define GET_PATH(x) \
154 ((x) = get_pathbuff())
155#define RELEASE_PATH(x) \
156 release_pathbuff(x)
157#else
158#define GET_PATH(x) \
159 ((x) = zalloc(ZV_NAMEI))
160#define RELEASE_PATH(x) \
161 zfree(ZV_NAMEI, x)
162#endif /* CONFIG_FSE */
163
164#ifndef HFS_GET_BOOT_INFO
165#define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
166#endif
167
168#ifndef HFS_SET_BOOT_INFO
169#define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
170#endif
171
172#ifndef APFSIOC_REVERT_TO_SNAPSHOT
173#define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
174#endif
175
176extern void disk_conditioner_unmount(mount_t mp);
177
178/* struct for checkdirs iteration */
179struct cdirargs {
180 vnode_t olddp;
181 vnode_t newdp;
182};
183/* callback for checkdirs iteration */
184static int checkdirs_callback(proc_t p, void * arg);
185
186static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
187static int checkdirs(vnode_t olddp, vfs_context_t ctx);
188void enablequotas(struct mount *mp, vfs_context_t ctx);
189static int getfsstat_callback(mount_t mp, void * arg);
190static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
191static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
192static int sync_callback(mount_t, void *);
193static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
194 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
195 boolean_t partial_copy);
196static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
197static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
198 struct componentname *cnp, user_addr_t fsmountargs,
199 int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
200void vfs_notify_mount(vnode_t pdvp);
201
202int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
203
204struct fd_vn_data * fg_vn_data_alloc(void);
205
206/*
207 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
208 * Concurrent lookups (or lookups by ids) on hard links can cause the
209 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
210 * does) to return ENOENT as the path cannot be returned from the name cache
211 * alone. We have no option but to retry and hope to get one namei->reverse path
212 * generation done without an intervening lookup, lookup by id on the hard link
213 * item. This is only an issue for MAC hooks which cannot reenter the filesystem
214 * which currently are the MAC hooks for rename, unlink and rmdir.
215 */
216#define MAX_AUTHORIZE_ENOENT_RETRIES 1024
217
218/* Max retry limit for rename due to vnode recycling. */
219#define MAX_RENAME_ERECYCLE_RETRIES 1024
220
221static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
222 int unlink_flags);
223
224#ifdef CONFIG_IMGSRC_ACCESS
225static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
226static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
227static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
228static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
229static void mount_end_update(mount_t mp);
230static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
231#endif /* CONFIG_IMGSRC_ACCESS */
232
233//snapshot functions
234#if CONFIG_MNT_ROOTSNAP
235static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
236#else
237static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
238#endif
239
240__private_extern__
241int sync_internal(void);
242
243__private_extern__
244int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
245
246static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
247static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
248
249/* vars for sync mutex */
250static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
251static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
252
253extern lck_rw_t rootvnode_rw_lock;
254
255VFS_SMR_DECLARE;
256extern uint32_t nc_smr_enabled;
257
258/*
259 * incremented each time a mount or unmount operation occurs
260 * used to invalidate the cached value of the rootvp in the
261 * mount structure utilized by cache_lookup_path
262 */
263uint32_t mount_generation = 0;
264
265/* counts number of mount and unmount operations */
266unsigned int vfs_nummntops = 0;
267
268/* system-wide, per-boot unique mount ID */
269static _Atomic uint64_t mount_unique_id = 1;
270
271extern const struct fileops vnops;
272#if CONFIG_APPLEDOUBLE
273extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
274#endif /* CONFIG_APPLEDOUBLE */
275
276/* Maximum buffer length supported by fsgetpath(2) */
277#define FSGETPATH_MAXBUFLEN 8192
278
279/*
280 * Virtual File System System Calls
281 */
282
283/*
284 * Private in-kernel mounting spi (specific use-cases only)
285 */
286boolean_t
287vfs_iskernelmount(mount_t mp)
288{
289 return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
290}
291
/*
 * kernel_mount: perform a mount initiated from within the kernel.
 *
 * If 'vp' is NULLVP, 'path' is resolved with namei() to obtain the vnode
 * to be covered and its parent (both put again before return).  Otherwise
 * the caller supplies 'vp' and 'pvp' (with references held) and 'path' is
 * only recorded in the component name passed to mount_common().
 *
 * 'kern_flags' is masked to the sanitized subset and then tagged with
 * KERNEL_MOUNT_KMOUNT so mount_common() knows this is an in-kernel mount.
 *
 * Returns: 0 on success, otherwise an errno value.
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/* Restrict callers to the sanitized subset of kernel-mount flags. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(ndp: &nd);
		if (error) {
			/* Log lookup failures for snapshot/volume-by-role mounts. */
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/* Caller supplied the cover vnode; just record the path text. */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(s: pnbuf) + 1);
		did_namei = FALSE;
	}

	/* Mark this as an in-kernel mount for mount_common(). */
	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstypename: fstype, pvp, vp, cnp: &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    flags: syscall_flags, internal_flags: kern_flags, NULL, ctx);

	if (did_namei) {
		/* Drop the iocounts namei() took on vp and its parent. */
		vnode_put(vp);
		vnode_put(vp: pvp);
		nameidone(&nd);
	}

	return error;
}
341
/*
 * vfs_mount_at_path: kernel-internal convenience wrapper to mount a
 * filesystem of type 'fstype' at 'path'.  Currently restricted to the
 * "lifs" and "nfs" filesystems (ENOTSUP otherwise).
 *
 * 'flags' (VFS_MOUNT_FLAG_*) select the vfs context (caller's vs.
 * kernel's) and are translated into the matching KERNEL_MOUNT_* flags;
 * 'mnt_flags' are additional MNT_* flags OR'd with MNT_AUTOMOUNTED.
 *
 * Returns: 0 on success, otherwise an errno value (logged on failure).
 */
int
vfs_mount_at_path(const char *fstype, const char *path,
    vnode_t pvp, vnode_t vp, void *data, size_t datalen,
    int mnt_flags, int flags)
{
	int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
	int error, km_flags = 0;
	vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();

	/*
	 * This call is currently restricted to specific use cases.
	 */
	if ((strcmp(s1: fstype, s2: "lifs") != 0) && (strcmp(s1: fstype, s2: "nfs") != 0)) {
		return ENOTSUP;
	}

#if !defined(XNU_TARGET_OS_OSX)
	/* On embedded platforms, lifs mounts are never executable. */
	if (strcmp(fstype, "lifs") == 0) {
		syscall_flags |= MNT_NOEXEC;
	}
#endif

	/* Translate the public VFS_MOUNT_FLAG_* bits to KERNEL_MOUNT_* bits. */
	if (flags & VFS_MOUNT_FLAG_NOAUTH) {
		km_flags |= KERNEL_MOUNT_NOAUTH;
	}
	if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
		km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
	}

	error = kernel_mount(fstype, pvp, vp, path, data, datalen,
	    syscall_flags, kern_flags: km_flags, ctx);
	if (error) {
		printf("%s: mount on %s failed, error %d\n", __func__, path,
		    error);
	}

	return error;
}
380
381/*
382 * Mount a file system.
383 */
384/* ARGSUSED */
385int
386mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
387{
388 struct __mac_mount_args muap;
389
390 muap.type = uap->type;
391 muap.path = uap->path;
392 muap.flags = uap->flags;
393 muap.data = uap->data;
394 muap.mac_p = USER_ADDR_NULL;
395 return __mac_mount(p, &muap, retval);
396}
397
/*
 * fmount: mount a filesystem on the directory referenced by an open
 * file descriptor rather than by path.
 *
 * Indirect:	uap->fd		fd referencing the vnode to cover
 *		uap->type	filesystem type name (user address)
 *		uap->flags	generic mount flags (MNT_IMGSRC_BY_INDEX,
 *				MNT_ROOTFS and MNT_UNION are rejected)
 *		uap->data	filesystem-specific mount arguments
 *
 * Returns:	0		Success
 *		!0		errno value
 */
int
fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
{
	struct componentname cn;
	vfs_context_t ctx = vfs_context_current();
	size_t dummy = 0;
	int error;
	int flags = uap->flags;
	char fstypename[MFSNAMELEN];
	char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
	vnode_t pvp;
	vnode_t vp;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, flags);
	/* fstypename will get audited by mount_common */

	/* Sanity check the flags */
	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
		return ENOTSUP;
	}

	if (flags & MNT_UNION) {
		return EPERM;
	}

	error = copyinstr(uaddr: uap->type, kaddr: fstypename, MFSNAMELEN, done: &dummy);
	if (error) {
		return error;
	}

	/* Resolve the fd to a vnode and take an iocount on it. */
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(uap->fd);
		return error;
	}

	/*
	 * A parent vnode is required to mount here.  Without one, the
	 * vnode is either already covered / the root of a filesystem
	 * (EBUSY) or otherwise unusable as a cover vnode (EINVAL).
	 */
	pvp = vnode_getparent(vp);
	if (pvp == NULL) {
		if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
			error = EBUSY;
		} else {
			error = EINVAL;
		}
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	/* Build a component name carrying the covered vnode's path. */
	memset(s: &cn, c: 0, n: sizeof(struct componentname));
	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	cn.cn_pnlen = MAXPATHLEN;

	if ((error = vn_getpath(vp, pathbuf: cn.cn_pnbuf, len: &cn.cn_pnlen)) != 0) {
		zfree(ZV_NAMEI, cn.cn_pnbuf);
		vnode_put(vp: pvp);
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	error = mount_common(fstypename, pvp, vp, cnp: &cn, fsmountargs: uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);

	/* Release the path buffer, both vnode iocounts, and the fd reference. */
	zfree(ZV_NAMEI, cn.cn_pnbuf);
	vnode_put(vp: pvp);
	vnode_put(vp);
	file_drop(uap->fd);

	return error;
}
471
472#define MAX_GRAFT_METADATA_SIZE 16384 /* bytes */
473
474/*
475 * Get the size of a graft file (a manifest or payload file).
476 * The vp should be an iocounted vnode.
477 */
478static int
479get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
480{
481 struct stat64 sb = {};
482 int error;
483
484 *size = 0;
485
486 error = vn_stat(vp: graft_vp, sb: &sb, NULL, isstat64: 1, needsrealdev: 0, ctx: vctx);
487 if (error) {
488 return error;
489 }
490
491 if (sb.st_size == 0) {
492 error = ENODATA;
493 } else if ((size_t) sb.st_size > MAX_GRAFT_METADATA_SIZE) {
494 error = EFBIG;
495 } else {
496 *size = (size_t) sb.st_size;
497 }
498
499 return error;
500}
501
502/*
503 * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
504 * `size` must already be validated.
505 */
506static int
507read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
508{
509 return vn_rdwr(rw: UIO_READ, vp: graft_vp,
510 base: (caddr_t) buf, len: (int) size, /* offset */ 0,
511 segflg: UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
512 cred: vfs_context_ucred(ctx: vctx), /* resid */ NULL,
513 p: vfs_context_proc(ctx: vctx));
514}
515
516/*
517 * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
518 * and read it into `buf`.
519 */
static int
graft_secureboot_read_fd(int fd, vfs_context_t vctx, size_t *size, void *buf)
{
	vnode_t metadata_vp = NULLVP;
	int error;

	// Convert this graft fd to a vnode (takes an iocount on success).
	if ((error = vnode_getfromfd(ctx: vctx, fd, vpp: &metadata_vp)) != 0) {
		goto out;
	}

	// Get (and validate) size information.
	// `buf` is assumed to hold at least MAX_GRAFT_METADATA_SIZE bytes,
	// which bounds the validated size.
	if ((error = get_and_verify_graft_metadata_vp_size(graft_vp: metadata_vp, vctx, size)) != 0) {
		goto out;
	}

	// Read each file into the provided buffer - we must get the expected amount of bytes.
	if ((error = read_graft_metadata_vp(graft_vp: metadata_vp, vctx, size: *size, buf)) != 0) {
		goto out;
	}

out:
	// Drop the iocount taken by vnode_getfromfd().
	if (metadata_vp) {
		vnode_put(vp: metadata_vp);
		metadata_vp = NULLVP;
	}

	return error;
}
549
550/*
551 * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
552 * provided in `gfs`, saving the size of data read in `gfs`.
553 */
static int
graft_secureboot_read_metadata(secure_boot_cryptex_args_t *sbc_args, vfs_context_t vctx,
    fsioc_graft_fs_t *gfs)
{
	int error;

	// Read the authentic manifest.
	if ((error = graft_secureboot_read_fd(fd: sbc_args->sbc_authentic_manifest_fd, vctx,
	    size: &gfs->authentic_manifest_size, buf: gfs->authentic_manifest))) {
		return error;
	}

	// The user manifest is currently unused, but set its size.
	gfs->user_manifest_size = 0;

	// Read the payload.
	if ((error = graft_secureboot_read_fd(fd: sbc_args->sbc_payload_fd, vctx,
	    size: &gfs->payload_size, buf: gfs->payload))) {
		return error;
	}

	// Both reads succeeded; sizes in `gfs` reflect the bytes read.
	return 0;
}
577
578/*
579 * Call into the filesystem to verify and graft a cryptex.
580 */
static int
graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
    vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
{
	fsioc_graft_fs_t gfs = {};
	uint64_t graft_dir_ino = 0;
	struct stat64 sb = {};
	int error;

	// Pre-flight arguments.
	if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
		// Make sure that this graft version matches what we support.
		return ENOTSUP;
	} else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
		// For this type, cryptex VP must live on same volume as the target of graft.
		return EXDEV;
	} else if (mounton_vp && mounton_vp->v_type != VDIR) {
		// We cannot graft upon non-directories.
		return ENOTDIR;
	} else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
	    sbc_args->sbc_payload_fd < 0) {
		// We cannot graft without a manifest and payload.
		return EINVAL;
	}

	if (mounton_vp) {
		// Get the mounton's inode number.
		error = vn_stat(vp: mounton_vp, sb: &sb, NULL, isstat64: 1, needsrealdev: 0, ctx: vctx);
		if (error) {
			return error;
		}
		graft_dir_ino = (uint64_t) sb.st_ino;
	}

	// Create buffers (of our maximum-defined size) to store authentication info.
	gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
	gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);

	if (!gfs.authentic_manifest || !gfs.payload) {
		error = ENOMEM;
		goto out;
	}

	// Read our fd's into our buffers.
	// (Note that this will set the buffer size fields in `gfs`.)
	error = graft_secureboot_read_metadata(sbc_args, vctx, gfs: &gfs);
	if (error) {
		goto out;
	}

	// Translate the caller-visible SBC_* flags into FSCTL_GRAFT_* flags.
	gfs.graft_version = FSIOC_GRAFT_VERSION;
	gfs.graft_type = graft_type;
	gfs.graft_4cc = sbc_args->sbc_4cc;
	if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
	}
	if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
		gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
	}
	if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
		gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
	}
	if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
		gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
	}
	if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
		gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
	}
	if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
	}
	gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)

	// Call into the FS to perform the graft (and validation).
	error = VNOP_IOCTL(vp: cryptex_vp, FSIOC_GRAFT_FS, data: (caddr_t)&gfs, fflag: 0, ctx: vctx);

out:
	// Free the metadata buffers on all paths (success and failure).
	if (gfs.authentic_manifest) {
		kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
		gfs.authentic_manifest = NULL;
	}
	if (gfs.payload) {
		kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
		gfs.payload = NULL;
	}

	return error;
}
669
670#define GRAFTDMG_ENTITLEMENT "com.apple.private.vfs.graftdmg"
671
672/*
673 * Graft a cryptex disk image (via FD) onto the appropriate mount-point
674 * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
675 */
int
graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
{
	int ua_dmgfd = uap->dmg_fd;
	user_addr_t ua_mountdir = uap->mountdir;
	uint32_t ua_grafttype = uap->graft_type;
	user_addr_t ua_graftargs = uap->gda;

	graftdmg_args_un kern_gda = {};
	int error = 0;
	secure_boot_cryptex_args_t *sbc_args = NULL;

	vnode_t cryptex_vp = NULLVP;
	vnode_t mounton_vp = NULLVP;
	struct nameidata nd = {};
	vfs_context_t ctx = vfs_context_current();

	// Grafting is restricted to tasks holding the private entitlement.
	if (!IOTaskHasEntitlement(task: vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
		return EPERM;
	}

	// Copy the graft argument union in from user space.
	error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
	if (error) {
		return error;
	}

	// Copy mount dir in, if provided.
	if (ua_mountdir != USER_ADDR_NULL) {
		// Acquire vnode for mount-on path
		NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
		    UIO_USERSPACE, ua_mountdir, ctx);

		error = namei(ndp: &nd);
		if (error) {
			return error;
		}
		mounton_vp = nd.ni_vp;
	}

	// Convert fd to vnode.
	error = vnode_getfromfd(ctx, fd: ua_dmgfd, vpp: &cryptex_vp);
	if (error) {
		goto graftout;
	}

	// Validate the graft type, then hand off to the secure-boot graft path.
	if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_MAX) {
		error = EINVAL;
	} else {
		sbc_args = &kern_gda.sbc_args;
		error = graft_secureboot_cryptex(graft_type: ua_grafttype, sbc_args, vctx: ctx, cryptex_vp, mounton_vp);
	}

graftout:
	// Drop the iocounts taken above; nameidone() only if namei() ran.
	if (cryptex_vp) {
		vnode_put(vp: cryptex_vp);
		cryptex_vp = NULLVP;
	}
	if (mounton_vp) {
		vnode_put(vp: mounton_vp);
		mounton_vp = NULLVP;
	}
	if (ua_mountdir != USER_ADDR_NULL) {
		nameidone(&nd);
	}

	return error;
}
743
744/*
745 * Ungraft a cryptex disk image (via mount dir FD)
746 * { int ungraftdmg(const char *mountdir, uint64_t flags); }
747 */
int
ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
{
	int error = 0;
	user_addr_t ua_mountdir = uap->mountdir;
	fsioc_ungraft_fs_t ugfs;
	vnode_t mounton_vp = NULLVP;
	struct nameidata nd = {};
	vfs_context_t ctx = vfs_context_current();

	// Ungrafting requires the same private entitlement as grafting.
	if (!IOTaskHasEntitlement(task: vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
		return EPERM;
	}

	// No flags are defined yet and a mount dir is mandatory.
	if (uap->flags != 0 || ua_mountdir == USER_ADDR_NULL) {
		return EINVAL;
	}

	ugfs.ungraft_flags = 0;

	// Acquire vnode for mount-on path
	NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
	    UIO_USERSPACE, ua_mountdir, ctx);

	error = namei(ndp: &nd);
	if (error) {
		return error;
	}
	mounton_vp = nd.ni_vp;

	// Call into the FS to perform the ungraft
	error = VNOP_IOCTL(vp: mounton_vp, FSIOC_UNGRAFT_FS, data: (caddr_t)&ugfs, fflag: 0, ctx);

	// Release the iocount and namei state.
	vnode_put(vp: mounton_vp);
	nameidone(&nd);

	return error;
}
786
787
/*
 * Notify interested parties that a mount has occurred: post a VQ_MOUNT
 * vfs event, and a NOTE_WRITE knote on the parent directory of the
 * cover vnode so watchers of that directory see the change.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, data: (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
794
795/*
796 * __mac_mount:
797 * Mount a file system taking into account MAC label behavior.
798 * See mount(2) man page for more information
799 *
800 * Parameters: p Process requesting the mount
801 * uap User argument descriptor (see below)
802 * retval (ignored)
803 *
804 * Indirect: uap->type Filesystem type
805 * uap->path Path to mount
806 * uap->data Mount arguments
807 * uap->mac_p MAC info
808 * uap->flags Mount flags
809 *
810 *
811 * Returns: 0 Success
812 * !0 Not success
813 */
/* Set when something attempts to remount the root FS read/write; see the
 * CHECK_CS_VALIDATION_BITMAP note below. */
boolean_t root_fs_upgrade_try = FALSE;

int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;
	vnode_t vp = NULL;
	int need_nameidone = 0;
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;
	size_t labelsz = 0;
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uaddr: uap->type, kaddr: fstypename, MFSNAMELEN, done: &dummy);
	if (error) {
		return error;
	}

	/*
	 * Get the vnode to be covered (and its parent, via WANTPARENT)
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = namei(ndp: &nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		error = relocate_imageboot_source(pvp, vp, cnp: &nd.ni_cnd, fsname: fstypename,
		    ctx, is64bit: is_64bit, fsmountargs: uap->data, by_index: (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		/* Copy in the correctly-sized struct for the caller's ABI. */
		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = (user_size_t)mac64.m_buflen;
			mac.m_string = (user_addr_t)mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* Label must fit in bounds and hold at least one char + NUL. */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		labelsz = mac.m_buflen;
		labelstr = kalloc_data(labelsz, Z_WAITOK);
		error = copyinstr(uaddr: mac.m_string, kaddr: labelstr, len: mac.m_buflen, done: &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

#if !CONFIG_UNION_MOUNTS
	if (flags & MNT_UNION) {
		error = EPERM;
		goto out;
	}
#endif

	/* Mounting over the root of the root filesystem is treated as update. */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
#if CONFIG_UNION_MOUNTS
		if (!(flags & MNT_UNION)) {
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mounting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}
#else
		flags |= MNT_UPDATE;
#endif /* CONFIG_UNION_MOUNTS */

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif

		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, cnp: &nd.ni_cnd, fsmountargs: uap->data, flags, internal_flags: 0,
	    labelstr, ctx);

out:

#if CONFIG_MACF
	kfree_data(labelstr, labelsz);
#endif /* CONFIG_MACF */

	/* Drop iocounts / namei state only for the steps that completed. */
	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(vp: pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return error;
}
976
977/*
 * common mount implementation (final stage of mounting)
 *
 * Arguments:
 *	fstypename	file system type (i.e. its vfs name)
 *	pvp		parent of covered vnode
 *	vp		covered vnode
 *	cnp		component name (i.e. path) of covered vnode
 *	fsmountargs	file system specific data
 *	flags		generic mount flags
 *	internal_flags	KERNEL_MOUNT_* flags; KERNEL_MOUNT_KMOUNT marks
 *			mounts initiated from inside the kernel
 *	labelstr	optional MAC label
 *	ctx		caller's context
 */
static int
mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
    struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
    char *labelstr, vfs_context_t ctx)
{
#if !CONFIG_MACF
#pragma unused(labelstr)
#endif
	struct vnode *devvp = NULLVP;
	struct vnode *device_vnode = NULLVP;
#if CONFIG_MACF
	struct vnode *rvp;
#endif
	struct mount *mp = NULL;
	struct vfstable *vfsp = (struct vfstable *)0;
	struct proc *p = vfs_context_proc(ctx);
	int error, flag = 0;
	bool flag_set = false;
	user_addr_t devpath = USER_ADDR_NULL;
	int ronly = 0;
	int mntalloc = 0;
	boolean_t vfsp_ref = FALSE;
	boolean_t is_rwlock_locked = FALSE;
	boolean_t did_rele = FALSE;
	boolean_t have_usecount = FALSE;
	boolean_t did_set_lmount = FALSE;
	boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);

#if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
	/* Check for mutually-exclusive flag bits (popcount via Kernighan's trick) */
	uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
	int bitcount = 0;
	while (checkflags != 0) {
		checkflags &= (checkflags - 1);
		bitcount++;
	}

	if (bitcount > 1) {
		//not allowed to request multiple mount-by-role flags
		error = EINVAL;
		goto out1;
	}
#endif

	/*
	 * Process an update for an existing mount
	 */
	if (flags & MNT_UPDATE) {
		/* update mounts must target the root vnode of the mounted fs */
		if ((vp->v_flag & VROOT) == 0) {
			error = EINVAL;
			goto out1;
		}
		mp = vp->v_mount;

		/* if unmount or mount in progress, return error */
		mount_lock_spin(mp);
		if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
			mount_unlock(mp);
			error = EBUSY;
			goto out1;
		}
		mp->mnt_lflag |= MNT_LMOUNT;
		did_set_lmount = TRUE;
		mount_unlock(mp);
		lck_rw_lock_exclusive(lck: &mp->mnt_rwlock);
		is_rwlock_locked = TRUE;
		/*
		 * We only allow the filesystem to be reloaded if it
		 * is currently mounted read-only.
		 */
		if ((flags & MNT_RELOAD) &&
		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * If content protection is enabled, update mounts are not
		 * allowed to turn it off.
		 */
		if ((mp->mnt_flag & MNT_CPROTECT) &&
		    ((flags & MNT_CPROTECT) == 0)) {
			error = EINVAL;
			goto out1;
		}

		/*
		 * can't turn off MNT_REMOVABLE either but it may be an unexpected
		 * failure to return an error for this so we'll just silently
		 * add it if it is not passed in.
		 */
		if ((mp->mnt_flag & MNT_REMOVABLE) &&
		    ((flags & MNT_REMOVABLE) == 0)) {
			flags |= MNT_REMOVABLE;
		}

		/* Can't downgrade the backer of the root FS */
		if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
		    (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * Only root, or the user that did the original mount is
		 * permitted to update it.
		 */
		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(cred: vfs_context_ucred(ctx)) &&
		    (error = suser(cred: vfs_context_ucred(ctx), acflag: &p->p_acflag))) {
			goto out1;
		}
#if CONFIG_MACF
		error = mac_mount_check_remount(ctx, mp);
		if (error != 0) {
			goto out1;
		}
#endif
		/*
		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
		 */
		if ((!kernelmount) && suser(cred: vfs_context_ucred(ctx), NULL)) {
			flags |= MNT_NOSUID | MNT_NODEV;
			if (mp->mnt_flag & MNT_NOEXEC) {
				flags |= MNT_NOEXEC;
			}
		}
		/* remember the current flags so they can be restored on failure */
		flag = mp->mnt_flag;
		flag_set = true;



		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);

		vfsp = mp->mnt_vtable;
		goto update;
	} // MNT_UPDATE

	/*
	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
	 */
	if ((!kernelmount) && suser(cred: vfs_context_ucred(ctx), NULL)) {
		flags |= MNT_NOSUID | MNT_NODEV;
		if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
			flags |= MNT_NOEXEC;
		}
	}

	/* XXXAUDIT: Should we capture the type on the error path as well? */
	/* XXX cast-away const (audit_arg_text() does not modify its input) */
	AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
	/* Look up the filesystem type, taking a reference on its vfstable entry */
	mount_list_lock();
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (!strncmp(s1: vfsp->vfc_name, s2: fstypename, MFSNAMELEN)) {
			vfsp->vfc_refcount++;
			vfsp_ref = TRUE;
			break;
		}
	}
	mount_list_unlock();
	if (vfsp == NULL) {
		error = ENODEV;
		goto out1;
	}

	/*
	 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
	 * except in ROSV configs and for the initial BaseSystem root.
	 */
	if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
	    ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
	    ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
		error = EINVAL;         /* unsupported request */
		goto out1;
	}

	/* flush the covered vnode and mark it VMOUNT (fails if busy) */
	error = prepare_coveredvp(vp, ctx, cnp, fsname: fstypename, internal_flags);
	if (error != 0) {
		goto out1;
	}

	/*
	 * Allocate and initialize the filesystem (mount_t)
	 */
	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
	mntalloc = 1;

	/* Initialize the default IO constraints */
	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
	mp->mnt_devblocksize = DEV_BSIZE;
	mp->mnt_alignmentmask = PAGE_MASK;
	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
	mp->mnt_ioscale = 1;
	mp->mnt_ioflags = 0;
	mp->mnt_realrootvp = NULLVP;
	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;

	mp->mnt_lflag |= MNT_LMOUNT;
	did_set_lmount = TRUE;

	TAILQ_INIT(&mp->mnt_vnodelist);
	TAILQ_INIT(&mp->mnt_workerqueue);
	TAILQ_INIT(&mp->mnt_newvnodes);
	mount_lock_init(mp);
	lck_rw_lock_exclusive(lck: &mp->mnt_rwlock);
	is_rwlock_locked = TRUE;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_vtable = vfsp;
	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strlcpy(dst: mp->mnt_vfsstat.f_fstypename, src: vfsp->vfc_name, MFSTYPENAMELEN);
	do {
		size_t pathlen = MAXPATHLEN;

		/* fall back to the componentname path if vn_getpath_ext fails */
		if (vn_getpath_ext(vp, dvp: pvp, pathbuf: mp->mnt_vfsstat.f_mntonname, len: &pathlen, VN_GETPATH_FSENTER)) {
			strlcpy(dst: mp->mnt_vfsstat.f_mntonname, src: cnp->cn_pnbuf, MAXPATHLEN);
		}
	} while (0);
	mp->mnt_vnodecovered = vp;
	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(cred: vfs_context_ucred(ctx));
	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
	mp->mnt_devbsdunit = 0;
	mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);

	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);

	if (kernelmount) {
		mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
	}
	if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
		mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
	}

	if (KERNEL_MOUNT_DEVFS & internal_flags) {
		// kernel mounted devfs
		mp->mnt_kern_flag |= MNTK_SYSTEM;
	}

update:
	/* Both fresh mounts and MNT_UPDATE rejoin here with mnt_rwlock held. */

	/*
	 * Set the mount level flags.
	 */
	if (flags & MNT_RDONLY) {
		mp->mnt_flag |= MNT_RDONLY;
	} else if (mp->mnt_flag & MNT_RDONLY) {
		// disallow read/write upgrades of file systems that
		// had the TYPENAME_OVERRIDE feature set.
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			error = EPERM;
			goto out1;
		}
		mp->mnt_kern_flag |= MNTK_WANTRDWR;
	}
	mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if SECURE_KERNEL
#if !CONFIG_MNT_SUID
	/*
	 * On release builds of iOS based platforms, always enforce NOSUID on
	 * all mounts. We do this here because we can catch update mounts as well as
	 * non-update mounts in this case.
	 */
	mp->mnt_flag |= (MNT_NOSUID);
#endif
#endif

	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if CONFIG_MACF
	if (flags & MNT_MULTILABEL) {
		if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
			error = EINVAL;
			goto out1;
		}
		mp->mnt_flag |= MNT_MULTILABEL;
	}
#endif
	/*
	 * Process device path for local file systems if requested.
	 *
	 * Snapshot and mount-by-role mounts do not use this path; they are
	 * passing other opaque data in the device path field.
	 *
	 * Basesystemroot mounts pass a device path to be resolved here,
	 * but it's just a char * already inside the kernel, which
	 * kernel_mount() shoved into a user_addr_t to call us. So for such
	 * mounts we must skip copyin (both of the address and of the string
	 * (in NDINIT).
	 */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
	    !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
		boolean_t do_copyin_devpath = true;
#if CONFIG_BASESYSTEMROOT
		if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worth noting:
			// We have been passed fsmountargs, which is typed as a user_addr_t,
			// but is actually a char ** pointing to a (kernelspace) string.
			// We manually unpack it with a series of casts and dereferences
			// that reverses what was done just above us on the stack in
			// imageboot_pivot_image().
			// After retrieving the path to the dev node (which we will NDINIT
			// in a moment), we pass NULL fsmountargs on to the filesystem.
			_Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
			char **devnamepp = (char **)fsmountargs;
			char *devnamep = *devnamepp;
			devpath = CAST_USER_ADDR_T(devnamep);
			do_copyin_devpath = false;
			fsmountargs = USER_ADDR_NULL;

			//Now that we have a mp, denote that this mount is for the basesystem.
			mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
		}
#endif // CONFIG_BASESYSTEMROOT

		if (do_copyin_devpath) {
			if (vfs_context_is64bit(ctx)) {
				if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
					goto out1;
				}
				fsmountargs += sizeof(devpath);
			} else {
				user32_addr_t tmp;
				if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
					goto out1;
				}
				/* munge into LP64 addr */
				devpath = CAST_USER_ADDR_T(tmp);
				fsmountargs += sizeof(tmp);
			}
		}

		/* Lookup device and authorize access to it */
		if ((devpath)) {
			struct nameidata nd;

			enum uio_seg seg = UIO_USERSPACE;
#if CONFIG_BASESYSTEMROOT
			if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
				/* path is a kernel string in this case; see above */
				seg = UIO_SYSSPACE;
			}
#endif // CONFIG_BASESYSTEMROOT

			NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
			if ((error = namei(ndp: &nd))) {
				goto out1;
			}

			strlcpy(dst: mp->mnt_vfsstat.f_mntfromname, src: nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
			devvp = nd.ni_vp;

			nameidone(&nd);

			if (devvp->v_type != VBLK) {
				error = ENOTBLK;
				goto out2;
			}
			if (major(devvp->v_rdev) >= nblkdev) {
				error = ENXIO;
				goto out2;
			}
			/*
			 * If mount by non-root, then verify that user has necessary
			 * permissions on the device.
			 */
			if (suser(cred: vfs_context_ucred(ctx), NULL) != 0) {
				kauth_action_t accessmode = KAUTH_VNODE_READ_DATA;

				if ((mp->mnt_flag & MNT_RDONLY) == 0) {
					accessmode |= KAUTH_VNODE_WRITE_DATA;
				}
				if ((error = vnode_authorize(vp: devvp, NULL, action: accessmode, ctx)) != 0) {
					goto out2;
				}
			}
		}
		/* On first mount, preflight and open device */
		if (devpath && ((flags & MNT_UPDATE) == 0)) {
			if ((error = vnode_ref(vp: devvp))) {
				goto out2;
			}
			/*
			 * Disallow multiple mounts of the same device.
			 * Disallow mounting of a device that is currently in use
			 * (except for root, which might share swap device for miniroot).
			 * Flush out any old buffers remaining from a previous use.
			 */
			if ((error = vfs_setmounting(devvp))) {
				vnode_rele(vp: devvp);
				goto out2;
			}

			if (vcount(vp: devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
				error = EBUSY;
				goto out3;
			}
			if ((error = VNOP_FSYNC(vp: devvp, MNT_WAIT, ctx))) {
				error = ENOTBLK;
				goto out3;
			}
			if ((error = buf_invalidateblks(vp: devvp, BUF_WRITE_DATA, slpflag: 0, slptimeo: 0))) {
				goto out3;
			}

			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
#if CONFIG_MACF
			error = mac_vnode_check_open(ctx,
			    vp: devvp,
			    acc_mode: ronly ? FREAD : FREAD | FWRITE);
			if (error) {
				goto out3;
			}
#endif /* MAC */
			if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
				goto out3;
			}

			mp->mnt_devvp = devvp;
			device_vnode = devvp;
		} else if ((mp->mnt_flag & MNT_RDONLY) &&
		    (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
		    (device_vnode = mp->mnt_devvp)) {
			dev_t dev;
			int maj;
			/*
			 * If upgrade to read-write by non-root, then verify
			 * that user has necessary permissions on the device.
			 */
			vnode_getalways(device_vnode);

			if (suser(cred: vfs_context_ucred(ctx), NULL) &&
			    (error = vnode_authorize(vp: device_vnode, NULL,
			    KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
			    ctx)) != 0) {
				vnode_put(vp: device_vnode);
				goto out2;
			}

			/* Tell the device that we're upgrading */
			dev = (dev_t)device_vnode->v_rdev;
			maj = major(dev);

			if ((u_int)maj >= (u_int)nblkdev) {
				panic("Volume mounted on a device with invalid major number.");
			}

			error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
			vnode_put(vp: device_vnode);
			device_vnode = NULLVP;
			if (error != 0) {
				goto out2;
			}
		}
	} // localargs && !(snapshot | data | vm)

#if CONFIG_MACF
	if ((flags & MNT_UPDATE) == 0) {
		mac_mount_label_init(mp);
		mac_mount_label_associate(ctx, mp);
	}
	if (labelstr) {
		if ((flags & MNT_UPDATE) != 0) {
			error = mac_mount_check_label_update(ctx, mp);
			if (error != 0) {
				goto out3;
			}
		}
	}
#endif
	/*
	 * Mount the filesystem. We already asserted that internal_flags
	 * cannot have more than one mount-by-role bit set.
	 */
	if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
		    data: (caddr_t)fsmountargs, flags: 0, context: ctx);
	} else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
#if CONFIG_ROSV_STARTUP
		/* for mount-by-role, fsmountargs carries the origin mount_t */
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, data: (caddr_t)&frma, flags: 0, context: ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
		} else {
			/* Mark volume associated with system volume */
			mp->mnt_kern_flag |= MNTK_SYSTEM;

			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(path: mp->mnt_vfsstat.f_mntfromname,
				    flags: 0, vpp: &mp_devvp, ctx: vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(vp: mp_devvp);
					// now set `device_vnode` to the devvp that was acquired.
					// this is needed in order to ensure vfs_init_io_attributes is invoked.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if (internal_flags & KERNEL_MOUNT_VMVOL) {
#if CONFIG_MOUNT_VM
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, data: (caddr_t)&frma, flags: 0, context: ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
		} else {
			/* Mark volume associated with system volume and a swap mount */
			mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(path: mp->mnt_vfsstat.f_mntfromname,
				    flags: 0, vpp: &mp_devvp, ctx: vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(vp: mp_devvp);

					// now set `device_vnode` to the devvp that was acquired.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
#if CONFIG_MOUNT_PREBOOTRECOVERY
		struct mount *origin_mp = (struct mount*)fsmountargs;
		uint32_t mount_role = 0;
		if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
			mount_role = VFS_PREBOOT_ROLE;
		} else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
			mount_role = VFS_RECOVERY_ROLE;
		}

		if (mount_role != 0) {
			fs_role_mount_args_t frma = {origin_mp, mount_role};
			error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, data: (caddr_t)&frma, flags: 0, context: ctx);
			if (error) {
				printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
			} else {
				// NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
				/* Mark volume associated with system volume */
				//mp->mnt_kern_flag |= MNTK_SYSTEM;
				/* Attempt to acquire the mnt_devvp and set it up */
				struct vnode *mp_devvp = NULL;
				if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
					errno_t lerr = vnode_lookup(path: mp->mnt_vfsstat.f_mntfromname,
					    flags: 0, vpp: &mp_devvp, ctx: vfs_context_kernel());
					if (!lerr) {
						mp->mnt_devvp = mp_devvp;
						//vnode_lookup took an iocount, need to drop it.
						vnode_put(vp: mp_devvp);

						// now set `device_vnode` to the devvp that was acquired.
						// note that though the iocount above was dropped, the mount acquires
						// an implicit reference against the device.
						device_vnode = mp_devvp;
					}
				}
			}
		} else {
			printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
			error = EINVAL;
		}
#else
		error = EINVAL;
#endif
	} else {
		/* ordinary mount: hand off to the filesystem itself */
		error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
	}

	if (flags & MNT_UPDATE) {
		if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
			mp->mnt_flag &= ~MNT_RDONLY;
		}
		mp->mnt_flag &= ~
		    (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
		mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
		if (error) {
			mp->mnt_flag = flag;    /* restore flag value */
		}
		vfs_event_signal(NULL, VQ_UPDATE, data: (intptr_t)NULL);
		lck_rw_done(lck: &mp->mnt_rwlock);
		is_rwlock_locked = FALSE;
		if (!error) {
			enablequotas(mp, ctx);
		}
		goto exit;
	}

	/*
	 * Put the new filesystem on the mount list after root.
	 */
	if (error == 0) {
		struct vfs_attr vfsattr;
		if (device_vnode) {
			/*
			 * cache the IO attributes for the underlying physical media...
			 * an error return indicates the underlying driver doesn't
			 * support all the queries necessary... however, reasonable
			 * defaults will have been set, so no reason to bail or care
			 *
			 * Need to do this before calling the MAC hook as it needs
			 * information from this call.
			 */
			vfs_init_io_attributes(devvp: device_vnode, mp);
		}

#if CONFIG_MACF
		error = mac_mount_check_mount_late(ctx, mp);
		if (error != 0) {
			goto out4;
		}

		if (vfs_flags(mp) & MNT_MULTILABEL) {
			error = VFS_ROOT(mp, &rvp, ctx);
			if (error) {
				printf("%s() VFS_ROOT returned %d\n", __func__, error);
				goto out4;
			}
			error = vnode_label(mp, NULL, vp: rvp, NULL, flags: 0, ctx);
			/*
			 * drop reference provided by VFS_ROOT
			 */
			vnode_put(vp: rvp);

			if (error) {
				goto out4;
			}
		}
#endif  /* MAC */

		/* publish the mount on the covered vnode */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vp->v_mountedhere = mp;
		SET(vp->v_flag, VMOUNTEDHERE);
		vnode_unlock(vp);

		/*
		 * taking the name_cache_lock exclusively will
		 * ensure that everyone is out of the fast path who
		 * might be trying to use a now stale copy of
		 * vp->v_mountedhere->mnt_realrootvp
		 * bumping mount_generation causes the cached values
		 * to be invalidated
		 */
		name_cache_lock();
		mount_generation++;
		name_cache_unlock();

		error = vnode_ref(vp);
		if (error != 0) {
			goto out4;
		}

		have_usecount = TRUE;

		error = checkdirs(olddp: vp, ctx);
		if (error != 0) {
			/* Unmount the filesystem as cdir/rdirs cannot be updated */
			goto out4;
		}
		/*
		 * there is no cleanup code here so I have made it void
		 * we need to revisit this
		 */
		(void)VFS_START(mp, 0, ctx);

		if (mount_list_add(mp) != 0) {
			/*
			 * The system is shutting down trying to umount
			 * everything, so fail with a plausible errno.
			 */
			error = EBUSY;
			goto out4;
		}
		lck_rw_done(lck: &mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		/* Check if this mounted file system supports EAs or named streams. */
		/* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
		VFSATTR_INIT(&vfsattr);
		VFSATTR_WANTED(&vfsattr, f_capabilities);
		if (strncmp(s1: mp->mnt_vfsstat.f_fstypename, s2: "webdav", n: sizeof("webdav")) != 0 &&
		    vfs_getattr(mp, vfa: &vfsattr, ctx) == 0 &&
		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
			}
#if NAMEDSTREAMS
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
				mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
			}
#endif
			/* Check if this file system supports path from id lookups. */
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			} else if (mp->mnt_flag & MNT_DOVOLFS) {
				/* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			}

			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
				mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
			}
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
			mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
			mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
		}
		/* increment the operations count */
		OSAddAtomic(1, &vfs_nummntops);
		enablequotas(mp, ctx);

		if (device_vnode) {
			vfs_setmountedon(device_vnode);
		}

		/* Now that mount is setup, notify the listeners */
		vfs_notify_mount(pdvp: pvp);
		IOBSDMountChange(mp, op: kIOMountChangeMount);
	} else {
		/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
		if (mp->mnt_vnodelist.tqh_first != NULL) {
			panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
			    mp->mnt_vtable->vfc_name, error);
		}

		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
		mount_list_lock();
		mp->mnt_vtable->vfc_refcount--;
		mount_list_unlock();

		if (device_vnode) {
			vnode_rele(vp: device_vnode);
			VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
			vfs_clearmounting(device_vnode);
		}
		lck_rw_done(lck: &mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		/*
		 * if we get here, we have a mount structure that needs to be freed,
		 * but since the coveredvp hasn't yet been updated to point at it,
		 * no need to worry about other threads holding a crossref on this mp
		 * so it's ok to just free it
		 */
		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		did_set_lmount = false;
	}
exit:
	/*
	 * drop I/O count on the device vp if there was one
	 */
	if (devpath && devvp) {
		vnode_put(vp: devvp);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	return error;

/* Error condition exits */
out4:
	(void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);

	/*
	 * If the mount has been placed on the covered vp,
	 * it may have been discovered by now, so we have
	 * to treat this just like an unmount
	 */
	mount_lock_spin(mp);
	mp->mnt_lflag |= MNT_LDEAD;
	mount_unlock(mp);

	if (device_vnode != NULLVP) {
		vnode_rele(vp: device_vnode);
		VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vfs_clearmounting(device_vnode);
		did_rele = TRUE;
	}

	vnode_lock_spin(vp);

	mp->mnt_crossref++;
	CLR(vp->v_flag, VMOUNTEDHERE);
	vp->v_mountedhere = (mount_t) 0;

	vnode_unlock(vp);

	if (have_usecount) {
		vnode_rele(vp);
	}
out3:
	if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
		vnode_rele(vp: devvp);
		vfs_clearmounting(devvp);
	}
out2:
	if (devpath && devvp) {
		vnode_put(vp: devvp);
	}
out1:
	/* Release mnt_rwlock only when it was taken */
	if (is_rwlock_locked == TRUE) {
		if (flag_set) {
			mp->mnt_flag = flag;    /* restore mnt_flag value */
		}
		lck_rw_done(lck: &mp->mnt_rwlock);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	if (mntalloc) {
		if (mp->mnt_crossref) {
			mount_dropcrossref(mp, vp, 0);
		} else {
			if (nc_smr_enabled) {
				vfs_smr_synchronize();
			}

			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		}
	}
	if (vfsp_ref) {
		mount_list_lock();
		vfsp->vfc_refcount--;
		mount_list_unlock();
	}

	return error;
}
1878
1879/*
1880 * Flush in-core data, check for competing mount attempts,
1881 * and set VMOUNT
1882 */
1883int
1884prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
1885{
1886#if !CONFIG_MACF
1887#pragma unused(cnp,fsname)
1888#endif
1889 struct vnode_attr va;
1890 int error;
1891 boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
1892 boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
1893 boolean_t is_busy;
1894
1895 if (!skip_auth) {
1896 /*
1897 * If the user is not root, ensure that they own the directory
1898 * onto which we are attempting to mount.
1899 */
1900 VATTR_INIT(&va);
1901 VATTR_WANTED(&va, va_uid);
1902 if ((error = vnode_getattr(vp, vap: &va, ctx)) ||
1903 (va.va_uid != kauth_cred_getuid(cred: vfs_context_ucred(ctx)) &&
1904 (!vfs_context_issuser(ctx)))) {
1905 error = EPERM;
1906 goto out;
1907 }
1908 }
1909
1910 if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
1911 goto out;
1912 }
1913
1914 if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, slpflag: 0, slptimeo: 0))) {
1915 goto out;
1916 }
1917
1918 if (vp->v_type != VDIR) {
1919 error = ENOTDIR;
1920 goto out;
1921 }
1922
1923 vnode_lock_spin(vp);
1924 is_busy = is_fmount ?
1925 (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL)) :
1926 (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL));
1927 if (is_busy) {
1928 vnode_unlock(vp);
1929 error = EBUSY;
1930 goto out;
1931 }
1932 SET(vp->v_flag, VMOUNT);
1933 vnode_unlock(vp);
1934
1935#if CONFIG_MACF
1936 error = mac_mount_check_mount(ctx, vp,
1937 cnp, vfc_name: fsname);
1938 if (error != 0) {
1939 vnode_lock_spin(vp);
1940 CLR(vp->v_flag, VMOUNT);
1941 vnode_unlock(vp);
1942 }
1943#endif
1944
1945out:
1946 return error;
1947}
1948
1949#if CONFIG_IMGSRC_ACCESS
1950
1951#define DEBUG_IMGSRC 0
1952
1953#if DEBUG_IMGSRC
1954#define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1955#else
1956#define IMGSRC_DEBUG(args...) do { } while(0)
1957#endif
1958
/*
 * Resolve `devpath`, verify it names the same block device that backs
 * `mp`, authorize the caller's access to it, and update
 * mnt_vfsstat.f_mntfromname to the resolved path.
 *
 * On success returns 0 with *devvpp holding an iocount on the device
 * vnode (caller must vnode_put it). On failure the iocount from namei
 * is dropped here.
 */
static int
authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
{
	struct nameidata nd;
	vnode_t vp, realdevvp;
	kauth_action_t accessmode;
	int error;
	enum uio_seg uio = UIO_USERSPACE;

	/* kernel callers pass a kernel-space path */
	if (ctx == vfs_context_kernel()) {
		uio = UIO_SYSSPACE;
	}

	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
	if ((error = namei(ndp: &nd))) {
		IMGSRC_DEBUG("namei() failed with %d\n", error);
		return error;
	}

	vp = nd.ni_vp;

	if (!vnode_isblk(vp)) {
		IMGSRC_DEBUG("Not block device.\n");
		error = ENOTBLK;
		goto out;
	}

	realdevvp = mp->mnt_devvp;
	if (realdevvp == NULLVP) {
		IMGSRC_DEBUG("No device backs the mount.\n");
		error = ENXIO;
		goto out;
	}

	error = vnode_getwithref(vp: realdevvp);
	if (error != 0) {
		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
		goto out;
	}

	/* the supplied path must resolve to the mount's backing dev_t */
	if (vnode_specrdev(vp) != vnode_specrdev(vp: realdevvp)) {
		IMGSRC_DEBUG("Wrong dev_t.\n");
		error = ENXIO;
		goto out1;
	}

	strlcpy(dst: mp->mnt_vfsstat.f_mntfromname, src: nd.ni_cnd.cn_pnbuf, MAXPATHLEN);

	/*
	 * If mount by non-root, then verify that user has necessary
	 * permissions on the device.
	 */
	if (!vfs_context_issuser(ctx)) {
		accessmode = KAUTH_VNODE_READ_DATA;
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			accessmode |= KAUTH_VNODE_WRITE_DATA;
		}
		if ((error = vnode_authorize(vp, NULL, action: accessmode, ctx)) != 0) {
			IMGSRC_DEBUG("Access denied.\n");
			goto out1;
		}
	}

	*devvpp = vp;

out1:
	vnode_put(vp: realdevvp);

out:
	nameidone(&nd);

	/* on any failure, drop the iocount namei took on vp */
	if (error) {
		vnode_put(vp);
	}

	return error;
}
2036
2037/*
2038 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
2039 * and call checkdirs()
2040 */
2041static int
2042place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
2043{
2044 int error;
2045
2046 mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */
2047
2048 IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
2049 mp->mnt_vtable->vfc_name, vnode_getname(vp));
2050
2051 vnode_lock_spin(vp);
2052 CLR(vp->v_flag, VMOUNT);
2053 vp->v_mountedhere = mp;
2054 SET(vp->v_flag, VMOUNTEDHERE);
2055 vnode_unlock(vp);
2056
2057 /*
2058 * taking the name_cache_lock exclusively will
2059 * insure that everyone is out of the fast path who
2060 * might be trying to use a now stale copy of
2061 * vp->v_mountedhere->mnt_realrootvp
2062 * bumping mount_generation causes the cached values
2063 * to be invalidated
2064 */
2065 name_cache_lock();
2066 mount_generation++;
2067 name_cache_unlock();
2068
2069 error = vnode_ref(vp);
2070 if (error != 0) {
2071 goto out;
2072 }
2073
2074 error = checkdirs(olddp: vp, ctx);
2075 if (error != 0) {
2076 /* Unmount the filesystem as cdir/rdirs cannot be updated */
2077 vnode_rele(vp);
2078 goto out;
2079 }
2080
2081out:
2082 if (error != 0) {
2083 mp->mnt_vnodecovered = NULLVP;
2084 }
2085 return error;
2086}
2087
/*
 * Reverse place_mount_and_checkdirs(): drop the usecount on the covered
 * vnode, clear its mount linkage and flags, and detach it from mp.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	CLR(vp->v_flag, (VMOUNT | VMOUNTEDHERE));
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
2099
2100static int
2101mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2102{
2103 int error;
2104
2105 /* unmount in progress return error */
2106 mount_lock_spin(mp);
2107 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2108 mount_unlock(mp);
2109 return EBUSY;
2110 }
2111 mount_unlock(mp);
2112 lck_rw_lock_exclusive(lck: &mp->mnt_rwlock);
2113
2114 /*
2115 * We only allow the filesystem to be reloaded if it
2116 * is currently mounted read-only.
2117 */
2118 if ((flags & MNT_RELOAD) &&
2119 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2120 error = ENOTSUP;
2121 goto out;
2122 }
2123
2124 /*
2125 * Only root, or the user that did the original mount is
2126 * permitted to update it.
2127 */
2128 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(cred: vfs_context_ucred(ctx)) &&
2129 (!vfs_context_issuser(ctx))) {
2130 error = EPERM;
2131 goto out;
2132 }
2133#if CONFIG_MACF
2134 error = mac_mount_check_remount(ctx, mp);
2135 if (error != 0) {
2136 goto out;
2137 }
2138#endif
2139
2140out:
2141 if (error) {
2142 lck_rw_done(lck: &mp->mnt_rwlock);
2143 }
2144
2145 return error;
2146}
2147
/* Release the exclusive mount rwlock taken by mount_begin_update(). */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(lck: &mp->mnt_rwlock);
}
2153
2154static int
2155get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2156{
2157 vnode_t vp;
2158
2159 if (height >= MAX_IMAGEBOOT_NESTING) {
2160 return EINVAL;
2161 }
2162
2163 vp = imgsrc_rootvnodes[height];
2164 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2165 *rvpp = vp;
2166 return 0;
2167 } else {
2168 return ENOENT;
2169 }
2170}
2171
/*
 * Relocate the mount backing an imageboot source volume onto a new
 * covered vnode (vp, looked up under pvp), updating f_mntonname and
 * invalidating name-cache fast paths. A mount may only be moved once
 * (enforced with MNTK_HAS_MOVED under the mount rwlock). Caller must
 * be superuser. Consumes nothing on entry; rvp is acquired and released
 * internally.
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;
	vnode_t rvp;
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		/* Caller supplied a mnt_imgsrc_args struct (height + flags + devpath) */
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = (user_addr_t)mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* No mi_flags are currently defined; reject anything nonzero */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	/* Takes an iocount on rvp; released on every exit path below */
	error = get_imgsrc_rootvnode(height, rvpp: &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	/* Scratch buffer to preserve f_mntonname for rollback (out3) */
	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);

	/* Can only move once */
	mp = vnode_mount(vp: rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, flags: 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once. Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved [2]\n");
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, internal_flags: 0);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(s1: vfsp->vfc_name, s2: fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, devvpp: &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* Only needed the authorization; drop the iocount immediately */
			vnode_put(vp: devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode, call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	placed = TRUE;

	/* Save the old mount-on name so out3 can roll it back */
	strlcpy(dst: old_mntonname, src: mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(dst: mp->mnt_vfsstat.f_mntonname, src: cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(vp: rvp);
	zfree(ZV_NAMEI, old_mntonname);

	vfs_notify_mount(pdvp: pvp);

	return 0;
out3:
	/* Roll back the mntonname change and the HAS_MOVED marker */
	strlcpy(dst: mp->mnt_vfsstat.f_mntonname, src: old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(vp: rvp);
	zfree(ZV_NAMEI, old_mntonname);
	return error;
}
2392
2393#endif /* CONFIG_IMGSRC_ACCESS */
2394
/*
 * Turn on disk quotas for an HFS mount if the per-type quota trigger
 * files exist under the mount point. Errors are deliberately ignored
 * so quota setup cannot interfere with completing the mount.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(s1: mp->mnt_vfsstat.f_fstypename, s2: "hfs", n: sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		/* Probe for the "<mnton>/<opsname>.<ext>" trigger file */
		snprintf(qfpath, count: sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(ndp: &qnd) != 0) {
			continue; /* option file to trigger quotas is not present */
		}
		vnode_put(vp: qnd.ni_vp);
		nameidone(&qnd);
		/* Trigger present: enable quotas from the actual quota file */
		snprintf(qfpath, count: sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2428
2429
/*
 * Per-process callback for checkdirs(): if the process's current or root
 * directory is the newly-covered vnode (cdrp->olddp), swap it for the new
 * mount's root (cdrp->newdp), moving the long-term references accordingly.
 * Always returns PROC_RETURNED so iteration continues.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work. If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work. Always take two refs
	 * because we might need that many. We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(vp: newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(vp: newdp) != 0) {
		/* Second ref failed: drop the first and give up on this proc */
		vnode_rele(vp: newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work. Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;     /* this pre-taken ref was consumed */
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;     /* this pre-taken ref was consumed */
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(vp: old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(vp: old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(vp: new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(vp: new_rvp);
	}

	return PROC_RETURNED;
}
2509
2510
2511
2512/*
2513 * Scan all active processes to see if any of them have a current
2514 * or root directory onto which the new filesystem has just been
2515 * mounted. If so, replace them with the new mount point.
2516 */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* Sole usecount means no process can be using olddp as cwd/root */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* Get the root of the filesystem just mounted over olddp (iocount held) */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, callout: checkdirs_callback, arg: (void *)&cdr, NULL, NULL);

	/* If the system root itself was covered, swap it under the rw lock */
	if (rootvnode == olddp) {
		vnode_ref(vp: newdp);
		lck_rw_lock_exclusive(lck: &rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(lck: &rootvnode_rw_lock);
		vnode_rele(vp: tvp);
	}

	vnode_put(vp: newdp);
	return 0;
}
2554
2555#define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT \
2556 "com.apple.private.vfs.role-account-unmount"
2557
2558/*
2559 * Unmount a file system.
2560 *
2561 * Note: unmount takes a path to the vnode mounted on as argument,
2562 * not special file (as before).
2563 */
2564/* ARGSUSED */
int
unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx;

	/*
	 * If the process has the entitlement, use the kernel's context when
	 * performing lookup on the mount path as the process might lack proper
	 * permission to access the directory.
	 */
	ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
	    vfs_context_kernel() : vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(ndp: &nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return EINVAL;
	}
#if CONFIG_MACF
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif
	/* Take a mount ref before dropping the vnode iocount */
	mount_ref(mp, 0);
	vnode_put(vp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, uap->flags, ctx);
}
2611
2612int
2613vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2614{
2615 mount_t mp;
2616
2617 mp = mount_list_lookupby_fsid(fsid, 0, 1);
2618 if (mp == (mount_t)0) {
2619 return ENOENT;
2620 }
2621 mount_ref(mp, 0);
2622 mount_iterdrop(mp);
2623 /* safedounmount consumes the mount ref */
2624 return safedounmount(mp, flags, ctx);
2625}
2626
2627/*
2628 * The mount struct comes with a mount ref which will be consumed.
2629 * Do the actual file system unmount, prevent some common foot shooting.
2630 */
2631int
2632safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
2633{
2634 int error;
2635 proc_t p = vfs_context_proc(ctx);
2636
2637 /*
2638 * If the file system is not responding and MNT_NOBLOCK
2639 * is set and not a forced unmount then return EBUSY.
2640 */
2641 if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
2642 (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
2643 error = EBUSY;
2644 goto out;
2645 }
2646
2647 /*
2648 * Skip authorization in two cases:
2649 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
2650 * This entitlement allows non-root processes unmount volumes mounted by
2651 * other processes.
2652 * - If the mount is tagged as permissive and this is not a forced-unmount
2653 * attempt.
2654 */
2655 if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
2656 (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
2657 /*
2658 * Only root, or the user that did the original mount is
2659 * permitted to unmount this filesystem.
2660 */
2661 if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(cred: kauth_cred_get())) &&
2662 (error = suser(cred: kauth_cred_get(), acflag: &p->p_acflag))) {
2663 goto out;
2664 }
2665 }
2666 /*
2667 * Don't allow unmounting the root file system, or other volumes
2668 * associated with it (for example, the associated VM or DATA mounts) .
2669 */
2670 if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
2671 if (!(mp->mnt_flag & MNT_ROOTFS)) {
2672 printf("attempt to unmount a system mount (%s), will return EBUSY\n",
2673 mp->mnt_vfsstat.f_mntonname);
2674 }
2675 error = EBUSY; /* the root (or associated volumes) is always busy */
2676 goto out;
2677 }
2678
2679 /*
2680 * If the mount is providing the root filesystem's disk image
2681 * (i.e. imageboot), don't allow unmounting
2682 */
2683 if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
2684 error = EBUSY;
2685 goto out;
2686 }
2687
2688 return dounmount(mp, flags, 1, ctx);
2689
2690out:
2691 mount_drop(mp, 0);
2692 return error;
2693}
2694
2695/*
2696 * Do the actual file system unmount.
2697 */
2698int
2699dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
2700{
2701 vnode_t coveredvp = (vnode_t)0;
2702 int error;
2703 int needwakeup = 0;
2704 int forcedunmount = 0;
2705 int lflags = 0;
2706 struct vnode *devvp = NULLVP;
2707#if CONFIG_TRIGGERS
2708 proc_t p = vfs_context_proc(ctx);
2709 int did_vflush = 0;
2710 int pflags_save = 0;
2711#endif /* CONFIG_TRIGGERS */
2712
2713#if CONFIG_FSE
2714 if (!(flags & MNT_FORCE)) {
2715 fsevent_unmount(mp, ctx); /* has to come first! */
2716 }
2717#endif
2718
2719 mount_lock(mp);
2720
2721 /*
2722 * If already an unmount in progress just return EBUSY.
2723 * Even a forced unmount cannot override.
2724 */
2725 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2726 if (withref != 0) {
2727 mount_drop(mp, 1);
2728 }
2729 mount_unlock(mp);
2730 return EBUSY;
2731 }
2732
2733 if (flags & MNT_FORCE) {
2734 forcedunmount = 1;
2735 mp->mnt_lflag |= MNT_LFORCE;
2736 }
2737
2738#if CONFIG_TRIGGERS
2739 if (flags & MNT_NOBLOCK && p != kernproc) {
2740 pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
2741 }
2742#endif
2743
2744 mp->mnt_kern_flag |= MNTK_UNMOUNT;
2745 mp->mnt_lflag |= MNT_LUNMOUNT;
2746 mp->mnt_flag &= ~MNT_ASYNC;
2747 /*
2748 * anyone currently in the fast path that
2749 * trips over the cached rootvp will be
2750 * dumped out and forced into the slow path
2751 * to regenerate a new cached value
2752 */
2753 mp->mnt_realrootvp = NULLVP;
2754 mount_unlock(mp);
2755
2756 if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
2757 /*
2758 * Force unmount any mounts in this filesystem.
2759 * If any unmounts fail - just leave them dangling.
2760 * Avoids recursion.
2761 */
2762 (void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
2763 }
2764
2765 /*
2766 * taking the name_cache_lock exclusively will
2767 * insure that everyone is out of the fast path who
2768 * might be trying to use a now stale copy of
2769 * vp->v_mountedhere->mnt_realrootvp
2770 * bumping mount_generation causes the cached values
2771 * to be invalidated
2772 */
2773 name_cache_lock();
2774 mount_generation++;
2775 name_cache_unlock();
2776
2777
2778 lck_rw_lock_exclusive(lck: &mp->mnt_rwlock);
2779 if (withref != 0) {
2780 mount_drop(mp, 0);
2781 }
2782 error = 0;
2783 if (forcedunmount == 0) {
2784 ubc_umount(mp); /* release cached vnodes */
2785 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2786 error = VFS_SYNC(mp, MNT_WAIT, ctx);
2787 if (error) {
2788 mount_lock(mp);
2789 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2790 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2791 mp->mnt_lflag &= ~MNT_LFORCE;
2792 goto out;
2793 }
2794 }
2795 }
2796
2797 IOBSDMountChange(mp, op: kIOMountChangeUnmount);
2798
2799#if CONFIG_TRIGGERS
2800 vfs_nested_trigger_unmounts(mp, flags, ctx);
2801 did_vflush = 1;
2802#endif
2803 if (forcedunmount) {
2804 lflags |= FORCECLOSE;
2805 }
2806 error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
2807 if ((forcedunmount == 0) && error) {
2808 mount_lock(mp);
2809 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2810 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2811 mp->mnt_lflag &= ~MNT_LFORCE;
2812 goto out;
2813 }
2814
2815 /* make sure there are no one in the mount iterations or lookup */
2816 mount_iterdrain(mp);
2817
2818 error = VFS_UNMOUNT(mp, flags, ctx);
2819 if (error) {
2820 mount_iterreset(mp);
2821 mount_lock(mp);
2822 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2823 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2824 mp->mnt_lflag &= ~MNT_LFORCE;
2825 goto out;
2826 }
2827
2828 /* increment the operations count */
2829 if (!error) {
2830 OSAddAtomic(1, &vfs_nummntops);
2831 }
2832
2833 if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
2834 /* hold an io reference and drop the usecount before close */
2835 devvp = mp->mnt_devvp;
2836 vnode_getalways(devvp);
2837 vnode_rele(vp: devvp);
2838 VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
2839 ctx);
2840 vnode_clearmountedon(vp: devvp);
2841 vnode_put(vp: devvp);
2842 }
2843 lck_rw_done(lck: &mp->mnt_rwlock);
2844 mount_list_remove(mp);
2845 lck_rw_lock_exclusive(lck: &mp->mnt_rwlock);
2846
2847 /* mark the mount point hook in the vp but not drop the ref yet */
2848 if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
2849 /*
2850 * The covered vnode needs special handling. Trying to get an
2851 * iocount must not block here as this may lead to deadlocks
2852 * if the Filesystem to which the covered vnode belongs is
2853 * undergoing forced unmounts. Since we hold a usecount, the
2854 * vnode cannot be reused (it can, however, still be terminated)
2855 */
2856 vnode_getalways(coveredvp);
2857 vnode_lock_spin(coveredvp);
2858
2859 mp->mnt_crossref++;
2860 coveredvp->v_mountedhere = (struct mount *)0;
2861 CLR(coveredvp->v_flag, VMOUNT | VMOUNTEDHERE);
2862 vnode_unlock(coveredvp);
2863 vnode_put(vp: coveredvp);
2864 }
2865
2866 mount_list_lock();
2867 mp->mnt_vtable->vfc_refcount--;
2868 mount_list_unlock();
2869
2870 cache_purgevfs(mp); /* remove cache entries for this file sys */
2871 vfs_event_signal(NULL, VQ_UNMOUNT, data: (intptr_t)NULL);
2872 mount_lock(mp);
2873 mp->mnt_lflag |= MNT_LDEAD;
2874
2875 if (mp->mnt_lflag & MNT_LWAIT) {
2876 /*
2877 * do the wakeup here
2878 * in case we block in mount_refdrain
2879 * which will drop the mount lock
2880 * and allow anyone blocked in vfs_busy
2881 * to wakeup and see the LDEAD state
2882 */
2883 mp->mnt_lflag &= ~MNT_LWAIT;
2884 wakeup(chan: (caddr_t)mp);
2885 }
2886 mount_refdrain(mp);
2887
2888 /* free disk_conditioner_info structure for this mount */
2889 disk_conditioner_unmount(mp);
2890
2891out:
2892 if (mp->mnt_lflag & MNT_LWAIT) {
2893 mp->mnt_lflag &= ~MNT_LWAIT;
2894 needwakeup = 1;
2895 }
2896
2897#if CONFIG_TRIGGERS
2898 if (flags & MNT_NOBLOCK && p != kernproc) {
2899 // Restore P_NOREMOTEHANG bit to its previous value
2900 if ((pflags_save & P_NOREMOTEHANG) == 0) {
2901 OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
2902 }
2903 }
2904
2905 /*
2906 * Callback and context are set together under the mount lock, and
2907 * never cleared, so we're safe to examine them here, drop the lock,
2908 * and call out.
2909 */
2910 if (mp->mnt_triggercallback != NULL) {
2911 mount_unlock(mp);
2912 if (error == 0) {
2913 mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
2914 } else if (did_vflush) {
2915 mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
2916 }
2917 } else {
2918 mount_unlock(mp);
2919 }
2920#else
2921 mount_unlock(mp);
2922#endif /* CONFIG_TRIGGERS */
2923
2924 lck_rw_done(lck: &mp->mnt_rwlock);
2925
2926 if (needwakeup) {
2927 wakeup(chan: (caddr_t)mp);
2928 }
2929
2930 if (!error) {
2931 if ((coveredvp != NULLVP)) {
2932 vnode_t pvp = NULLVP;
2933
2934 /*
2935 * The covered vnode needs special handling. Trying to
2936 * get an iocount must not block here as this may lead
2937 * to deadlocks if the Filesystem to which the covered
2938 * vnode belongs is undergoing forced unmounts. Since we
2939 * hold a usecount, the vnode cannot be reused
2940 * (it can, however, still be terminated).
2941 */
2942 vnode_getalways(coveredvp);
2943
2944 mount_dropcrossref(mp, coveredvp, 0);
2945 /*
2946 * We'll _try_ to detect if this really needs to be
2947 * done. The coveredvp can only be in termination (or
2948 * terminated) if the coveredvp's mount point is in a
2949 * forced unmount (or has been) since we still hold the
2950 * ref.
2951 */
2952 if (!vnode_isrecycled(vp: coveredvp)) {
2953 pvp = vnode_getparent(vp: coveredvp);
2954#if CONFIG_TRIGGERS
2955 if (coveredvp->v_resolve) {
2956 vnode_trigger_rearm(coveredvp, ctx);
2957 }
2958#endif
2959 }
2960
2961 vnode_rele(vp: coveredvp);
2962 vnode_put(vp: coveredvp);
2963 coveredvp = NULLVP;
2964
2965 if (pvp) {
2966 lock_vnode_and_post(pvp, NOTE_WRITE);
2967 vnode_put(vp: pvp);
2968 }
2969 } else if (mp->mnt_flag & MNT_ROOTFS) {
2970 if (nc_smr_enabled) {
2971 vfs_smr_synchronize();
2972 }
2973
2974 mount_lock_destroy(mp);
2975#if CONFIG_MACF
2976 mac_mount_label_destroy(mp);
2977#endif
2978 zfree(mount_zone, mp);
2979 } else {
2980 panic("dounmount: no coveredvp");
2981 }
2982 }
2983 return error;
2984}
2985
2986/*
2987 * Unmount any mounts in this filesystem.
2988 */
2989void
2990dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
2991{
2992 mount_t smp;
2993 fsid_t *fsids, fsid;
2994 int fsids_sz;
2995 int count = 0, i, m = 0;
2996 vnode_t vp;
2997
2998 mount_list_lock();
2999
3000 // Get an array to hold the submounts fsids.
3001 TAILQ_FOREACH(smp, &mountlist, mnt_list)
3002 count++;
3003 fsids_sz = count * sizeof(fsid_t);
3004 fsids = kalloc_data(fsids_sz, Z_NOWAIT);
3005 if (fsids == NULL) {
3006 mount_list_unlock();
3007 goto out;
3008 }
3009 fsids[0] = mp->mnt_vfsstat.f_fsid; // Prime the pump
3010
3011 /*
3012 * Fill the array with submount fsids.
3013 * Since mounts are always added to the tail of the mount list, the
3014 * list is always in mount order.
3015 * For each mount check if the mounted-on vnode belongs to a
3016 * mount that's already added to our array of mounts to be unmounted.
3017 */
3018 for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
3019 vp = smp->mnt_vnodecovered;
3020 if (vp == NULL) {
3021 continue;
3022 }
3023 fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid; // Underlying fsid
3024 for (i = 0; i <= m; i++) {
3025 if (fsids[i].val[0] == fsid.val[0] &&
3026 fsids[i].val[1] == fsid.val[1]) {
3027 fsids[++m] = smp->mnt_vfsstat.f_fsid;
3028 break;
3029 }
3030 }
3031 }
3032 mount_list_unlock();
3033
3034 // Unmount the submounts in reverse order. Ignore errors.
3035 for (i = m; i > 0; i--) {
3036 smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
3037 if (smp) {
3038 mount_ref(smp, 0);
3039 mount_iterdrop(smp);
3040 (void) dounmount(mp: smp, flags, withref: 1, ctx);
3041 }
3042 }
3043out:
3044 kfree_data(fsids, fsids_sz);
3045}
3046
/*
 * Drop one crossref on mp (taken while detaching it from covered vnode
 * dp). When the count reaches zero and dp no longer points at mp, the
 * mount structure is destroyed and freed. If need_put is set, dp's
 * iocount is also released.
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	vnode_hold(vp: dp);
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		/* Last crossref and fully detached: tear the mount down */
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		/* Wait out lockless name-cache readers before freeing */
		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3080
3081
3082/*
3083 * Sync each mounted filesystem.
3084 */
3085#if DIAGNOSTIC
3086int syncprt = 0;
3087#endif
3088
3089int print_vmpage_stat = 0;
3090
3091/*
3092 * sync_callback: simple wrapper that calls VFS_SYNC() on volumes
3093 * mounted read-write with the passed waitfor value.
3094 *
3095 * Parameters: mp mount-point descriptor per mounted file-system instance.
3096 * arg user argument (please see below)
3097 *
3098 * User argument is a pointer to 32 bit unsigned integer which describes the
3099 * type of waitfor value to set for calling VFS_SYNC(). If user argument is
3100 * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3101 * waitfor value.
3102 *
3103 * Returns: VFS_RETURNED
3104 */
static int
sync_callback(mount_t mp, void *arg)
{
	/* Read-only mounts have nothing to flush */
	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
		int asyncflag = mp->mnt_flag & MNT_ASYNC;
		unsigned waitfor = MNT_NOWAIT;

		if (arg) {
			waitfor = *(uint32_t*)arg;
		}

		/* Sanity check for flags - these are the only valid combinations for the flag bits*/
		if (waitfor != MNT_WAIT &&
		    waitfor != (MNT_WAIT | MNT_VOLUME) &&
		    waitfor != MNT_NOWAIT &&
		    waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
		    waitfor != MNT_DWAIT &&
		    waitfor != (MNT_DWAIT | MNT_VOLUME)) {
			panic("Passed inappropriate waitfor %u to "
			    "sync_callback()", waitfor);
		}

		/* Temporarily clear MNT_ASYNC so the sync is not deferred */
		mp->mnt_flag &= ~MNT_ASYNC;
		(void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
		if (asyncflag) {
			mp->mnt_flag |= MNT_ASYNC;
		}
	}

	return VFS_RETURNED;
}
3136
3137/* ARGSUSED */
/*
 * sync(2) system call: request a non-blocking VFS_SYNC of every mounted
 * read-write filesystem (NULL arg selects the MNT_NOWAIT default in
 * sync_callback). Always returns 0.
 */
int
sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
{
	vfs_iterate(LK_NOWAIT, callout: sync_callback, NULL);

	/* Debug aid: optionally report dirty-page counts after the sweep */
	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
	return 0;
}
3154
/* Selects which class of media sync_internal_callback() will sync. */
typedef enum {
	SYNC_ALL = 0,                    /* no filtering: sync every mount */
	SYNC_ONLY_RELIABLE_MEDIA = 1,    /* only local, non-virtual-device mounts */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2   /* only virtual-device or non-local mounts */
} sync_type_t;
3160
3161static int
3162sync_internal_callback(mount_t mp, void *arg)
3163{
3164 if (arg) {
3165 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3166 (mp->mnt_flag & MNT_LOCAL);
3167 sync_type_t sync_type = *((sync_type_t *)arg);
3168
3169 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3170 return VFS_RETURNED;
3171 } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3172 return VFS_RETURNED;
3173 }
3174 }
3175
3176 (void)sync_callback(mp, NULL);
3177
3178 return VFS_RETURNED;
3179}
3180
3181int sync_thread_state = 0;
3182int sync_timeout_seconds = 5;
3183
3184#define SYNC_THREAD_RUN 0x0001
3185#define SYNC_THREAD_RUNNING 0x0002
3186
3187#if CONFIG_PHYS_WRITE_ACCT
3188thread_t pm_sync_thread;
3189#endif /* CONFIG_PHYS_WRITE_ACCT */
3190
/*
 * Kernel thread body spawned by sync_internal(): repeatedly performs a
 * two-pass sync (reliable media first, then unreliable) as long as the
 * SYNC_THREAD_RUN flag is re-armed, then signals waiters and exits.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(lck: &sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		/* Consume the run request; new requests re-set the flag */
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(lck: &sync_mtx_lck);

		/* Reliable (local, non-virtual) media first, then the rest */
		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, callout: sync_internal_callback, arg: &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, callout: sync_internal_callback, arg: &sync_type);

		lck_mtx_lock(lck: &sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(chan: &sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(lck: &sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3234
3235struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
3236
3237/*
3238 * An in-kernel sync for power management to call.
3239 * This function always returns within sync_timeout seconds.
3240 */
/*
 * In-kernel sync for power management: arm the sync thread (spawning it
 * if it is not already running) and wait for it to finish, but never
 * longer than sync_timeout_seconds. Always returns 0; a timeout is only
 * logged (rate-limited to once per 120 seconds).
 */
__private_extern__ int
sync_internal(void)
{
	thread_t thd = NULL;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(lck: &sync_mtx_lck);
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		/* Mark RUNNING before dropping the lock to avoid double-spawn */
		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(continuation: sync_thread, NULL, new_thread: &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(lck: &sync_mtx_lck);
			printf("sync_thread failed\n");
			return 0;
		}
		thread_created = TRUE;
	}

	/* PDROP releases sync_mtx_lck; woken by sync_thread before it exits */
	error = msleep(chan: (caddr_t)&sync_thread_state, mtx: &sync_mtx_lck,
	    pri: (PVFS | PDROP | PCATCH), wmesg: "sync_thread", ts: &ts);
	if (error) {
		struct timeval now;

		microtime(tv: &now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	if (thread_created) {
		/* Drop the creation reference; the thread runs independently */
		thread_deallocate(thread: thd);
	}

	return 0;
} /* end of sync_internal call */
3283
3284/*
3285 * Change filesystem quotas.
3286 */
3287#if QUOTA
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	/* Look up uap->path only to identify the mount it resides on. */
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(ndp: &nd);
	if (error) {
		return error;
	}
	mp = nd.ni_vp->v_mount;
	/* Keep the mount alive after the vnode iocount is released. */
	mount_ref(mp, 0);
	vnode_put(vp: nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, cmd: uap->cmd, id: uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(view: ZV_NAMEI);	/* freed in the second switch below */
		error = copyinstr(uaddr: uap->arg, kaddr: datap, MAXPATHLEN, done: &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			/* 64-bit layout must be munged into the kernel dqblk. */
			struct user_dqblk my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				munge_dqblk(dqblkp: &my_dqblk, user_dqblkp: &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	/* Only call into the filesystem if the copyin (if any) succeeded. */
	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* Release the pathname buffer allocated above. */
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk my_dqblk64;

				memset(s: &my_dqblk64, c: 0, n: sizeof(my_dqblk64));
				munge_dqblk(dqblkp: &my_dqblk, user_dqblkp: &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
3394#else
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	/* Quota support compiled out: the syscall is always unsupported. */
	return EOPNOTSUPP;
}
3400#endif /* QUOTA */
3401
3402static int
3403statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
3404{
3405 int error;
3406 vfs_context_t ctx = vfs_context_current();
3407
3408#if CONFIG_MACF
3409 error = mac_mount_check_stat(ctx, mp);
3410 if (error != 0) {
3411 return error;
3412 }
3413#endif
3414
3415 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3416 if (error != 0) {
3417 return error;
3418 }
3419
3420 return munge_statfs(mp, sfsp: &mp->mnt_vfsstat, bufp, NULL, is_64_bit: IS_64BIT_PROCESS(p), TRUE);
3421}
3422
3423/*
3424 * Get filesystem statistics.
3425 *
3426 * Returns: 0 Success
3427 * namei:???
3428 * vfs_update_vfsstat:???
3429 * munge_statfs:EFAULT
3430 */
3431/* ARGSUSED */
3432int
3433statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3434{
3435 int error;
3436 struct mount *mp;
3437 struct nameidata nd;
3438 vfs_context_t ctx = vfs_context_current();
3439 vnode_t vp;
3440
3441 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3442 UIO_USERSPACE, uap->path, ctx);
3443 error = namei(ndp: &nd);
3444 if (error != 0) {
3445 return error;
3446 }
3447 vp = nd.ni_vp;
3448 mp = vp->v_mount;
3449 nameidone(&nd);
3450
3451 error = statfs_internal(p, mp, bufp: uap->buf);
3452 vnode_put(vp);
3453
3454 return error;
3455}
3456
3457/*
3458 * Get filesystem statistics.
3459 */
3460/* ARGSUSED */
3461int
3462fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
3463{
3464 int error;
3465 vnode_t vp = NULL;
3466 struct mount *mp;
3467
3468 AUDIT_ARG(fd, uap->fd);
3469
3470 if ((error = file_vnode(uap->fd, &vp)) ||
3471 (error = vnode_getwithref(vp))) {
3472 goto out;
3473 }
3474
3475 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3476
3477 mp = vp->v_mount;
3478 if (!mp) {
3479 error = EBADF;
3480 goto out_vnode;
3481 }
3482
3483 error = statfs_internal(p, mp, bufp: uap->buf);
3484
3485out_vnode:
3486 vnode_put(vp);
3487
3488out:
3489 if (vp != NULL) {
3490 file_drop(uap->fd);
3491 }
3492
3493 return error;
3494}
3495
3496void
3497vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3498{
3499 struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3500
3501 bzero(s: sfs, n: sizeof(*sfs));
3502
3503 sfs->f_bsize = vsfs->f_bsize;
3504 sfs->f_iosize = (int32_t)vsfs->f_iosize;
3505 sfs->f_blocks = vsfs->f_blocks;
3506 sfs->f_bfree = vsfs->f_bfree;
3507 sfs->f_bavail = vsfs->f_bavail;
3508 sfs->f_files = vsfs->f_files;
3509 sfs->f_ffree = vsfs->f_ffree;
3510 sfs->f_fsid = vsfs->f_fsid;
3511 sfs->f_owner = vsfs->f_owner;
3512 sfs->f_type = mp->mnt_vtable->vfc_typenum;
3513 sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3514 sfs->f_fssubtype = vsfs->f_fssubtype;
3515 sfs->f_flags_ext = 0;
3516 if (mp->mnt_kern_flag & MNTK_SYSTEMDATA) {
3517 sfs->f_flags_ext |= MNT_EXT_ROOT_DATA_VOL;
3518 }
3519 if (mp->mnt_kern_flag & MNTK_FSKIT) {
3520 sfs->f_flags_ext |= MNT_EXT_FSKIT;
3521 }
3522 vfs_getfstypename(mp, buf: sfs->f_fstypename, MFSTYPENAMELEN);
3523 strlcpy(dst: &sfs->f_mntonname[0], src: &vsfs->f_mntonname[0], MAXPATHLEN);
3524 strlcpy(dst: &sfs->f_mntfromname[0], src: &vsfs->f_mntfromname[0], MAXPATHLEN);
3525}
3526
3527/*
3528 * Get file system statistics in 64-bit mode
3529 */
3530int
3531statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3532{
3533 struct mount *mp;
3534 int error;
3535 struct nameidata *ndp;
3536 struct statfs64 *sfsp;
3537 vfs_context_t ctxp = vfs_context_current();
3538 vnode_t vp;
3539 struct {
3540 struct nameidata nd;
3541 struct statfs64 sfs;
3542 } *__nameidata_statfs64;
3543
3544 __nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
3545 Z_WAITOK);
3546 ndp = &__nameidata_statfs64->nd;
3547
3548 NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3549 UIO_USERSPACE, uap->path, ctxp);
3550 error = namei(ndp);
3551 if (error != 0) {
3552 goto out;
3553 }
3554 vp = ndp->ni_vp;
3555 mp = vp->v_mount;
3556 nameidone(ndp);
3557
3558#if CONFIG_MACF
3559 error = mac_mount_check_stat(ctx: ctxp, mp);
3560 if (error != 0) {
3561 vnode_put(vp);
3562 goto out;
3563 }
3564#endif
3565
3566 error = vfs_update_vfsstat(mp, ctx: ctxp, VFS_USER_EVENT);
3567 if (error != 0) {
3568 vnode_put(vp);
3569 goto out;
3570 }
3571
3572 sfsp = &__nameidata_statfs64->sfs;
3573 vfs_get_statfs64(mp, sfs: sfsp);
3574 if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3575 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3576 /* This process does not want to see a seperate data volume mountpoint */
3577 strlcpy(dst: &sfsp->f_mntonname[0], src: "/", n: sizeof("/"));
3578 }
3579 error = copyout(sfsp, uap->buf, sizeof(*sfsp));
3580 vnode_put(vp);
3581
3582out:
3583 kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);
3584
3585 return error;
3586}
3587
3588/*
3589 * Get file system statistics in 64-bit mode
3590 */
3591int
3592fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3593{
3594 struct vnode *vp;
3595 struct mount *mp;
3596 struct statfs64 sfs;
3597 int error;
3598
3599 AUDIT_ARG(fd, uap->fd);
3600
3601 if ((error = file_vnode(uap->fd, &vp))) {
3602 return error;
3603 }
3604
3605 error = vnode_getwithref(vp);
3606 if (error) {
3607 file_drop(uap->fd);
3608 return error;
3609 }
3610
3611 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3612
3613 mp = vp->v_mount;
3614 if (!mp) {
3615 error = EBADF;
3616 goto out;
3617 }
3618
3619#if CONFIG_MACF
3620 error = mac_mount_check_stat(ctx: vfs_context_current(), mp);
3621 if (error != 0) {
3622 goto out;
3623 }
3624#endif
3625
3626 if ((error = vfs_update_vfsstat(mp, ctx: vfs_context_current(), VFS_USER_EVENT)) != 0) {
3627 goto out;
3628 }
3629
3630 vfs_get_statfs64(mp, sfs: &sfs);
3631 if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3632 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3633 /* This process does not want to see a seperate data volume mountpoint */
3634 strlcpy(dst: &sfs.f_mntonname[0], src: "/", n: sizeof("/"));
3635 }
3636 error = copyout(&sfs, uap->buf, sizeof(sfs));
3637
3638out:
3639 file_drop(uap->fd);
3640 vnode_put(vp);
3641
3642 return error;
3643}
3644
/* Shared state threaded through the getfsstat vfs_iterate callbacks. */
struct getfsstat_struct {
	user_addr_t sfsp;       /* user buffer cursor; advanced after each copyout */
	user_addr_t *mp;        /* optional array of user MAC-label pointers, or NULL */
	int count;              /* mounts visited so far (even if not copied out) */
	int maxcount;           /* records that fit in the user buffer */
	int flags;              /* caller's MNT_NOWAIT / MNT_WAIT / MNT_DWAIT flags */
	int error;              /* first error the callback hit, 0 if none */
};
3653
3654
/*
 * vfs_iterate() callback for getfsstat()/__mac_getfsstat(): copy one
 * statfs record per mount to user space until the buffer is full, then
 * keep counting mounts so the caller can report the total.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	/* Copy out only while there is still room in the user buffer. */
	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			/* Dead/unrefreshable mount: skip it but keep iterating. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sfsp: sp, bufp: fstp->sfsp, sizep: &my_size, is_64_bit: IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* Advance the user cursor by the size actually written. */
		fstp->sfsp += my_size;

		/* Optionally return the mount's MAC label alongside. */
		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, mac_p: *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	/* Counted even when not copied, so callers can size their buffers. */
	fstp->count++;
	return VFS_RETURNED;
}
3708
3709/*
3710 * Get statistics on all filesystems.
3711 */
3712int
3713getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3714{
3715 struct __mac_getfsstat_args muap;
3716
3717 muap.buf = uap->buf;
3718 muap.bufsize = uap->bufsize;
3719 muap.mac = USER_ADDR_NULL;
3720 muap.macsize = 0;
3721 muap.flags = uap->flags;
3722
3723 return __mac_getfsstat(p, &muap, retval);
3724}
3725
3726/*
3727 * __mac_getfsstat: Get MAC-related file system statistics
3728 *
3729 * Parameters: p (ignored)
3730 * uap User argument descriptor (see below)
3731 * retval Count of file system statistics (N stats)
3732 *
3733 * Indirect: uap->bufsize Buffer size
3734 * uap->macsize MAC info size
3735 * uap->buf Buffer where information will be returned
3736 * uap->mac MAC info
3737 * uap->flags File system flags
3738 *
3739 *
3740 * Returns: 0 Success
3741 * !0 Not success
3742 *
3743 */
3744int
3745__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
3746{
3747 user_addr_t sfsp;
3748 user_addr_t *mp;
3749 size_t count, maxcount, bufsize, macsize;
3750 struct getfsstat_struct fst;
3751
3752 if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
3753 return EINVAL;
3754 }
3755
3756 bufsize = (size_t) uap->bufsize;
3757 macsize = (size_t) uap->macsize;
3758
3759 if (IS_64BIT_PROCESS(p)) {
3760 maxcount = bufsize / sizeof(struct user64_statfs);
3761 } else {
3762 maxcount = bufsize / sizeof(struct user32_statfs);
3763 }
3764 sfsp = uap->buf;
3765 count = 0;
3766
3767 mp = NULL;
3768
3769#if CONFIG_MACF
3770 if (uap->mac != USER_ADDR_NULL) {
3771 u_int32_t *mp0;
3772 int error;
3773 unsigned int i;
3774
3775 count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
3776 if (count != maxcount) {
3777 return EINVAL;
3778 }
3779
3780 /* Copy in the array */
3781 mp0 = kalloc_data(macsize, Z_WAITOK);
3782 if (mp0 == NULL) {
3783 return ENOMEM;
3784 }
3785
3786 error = copyin(uap->mac, mp0, macsize);
3787 if (error) {
3788 kfree_data(mp0, macsize);
3789 return error;
3790 }
3791
3792 /* Normalize to an array of user_addr_t */
3793 mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
3794 if (mp == NULL) {
3795 kfree_data(mp0, macsize);
3796 return ENOMEM;
3797 }
3798
3799 for (i = 0; i < count; i++) {
3800 if (IS_64BIT_PROCESS(p)) {
3801 mp[i] = ((user_addr_t *)mp0)[i];
3802 } else {
3803 mp[i] = (user_addr_t)mp0[i];
3804 }
3805 }
3806 kfree_data(mp0, macsize);
3807 }
3808#endif
3809
3810
3811 fst.sfsp = sfsp;
3812 fst.mp = mp;
3813 fst.flags = uap->flags;
3814 fst.count = 0;
3815 fst.error = 0;
3816 fst.maxcount = (int)maxcount;
3817
3818
3819 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, callout: getfsstat_callback, arg: &fst);
3820
3821 if (mp) {
3822 kfree_data(mp, count * sizeof(user_addr_t));
3823 }
3824
3825 if (fst.error) {
3826 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3827 return fst.error;
3828 }
3829
3830 if (fst.sfsp && fst.count > fst.maxcount) {
3831 *retval = fst.maxcount;
3832 } else {
3833 *retval = fst.count;
3834 }
3835 return 0;
3836}
3837
/*
 * vfs_iterate() callback for getfsstat64(): copy one struct statfs64
 * per mount to user space until the buffer is full, then keep counting
 * mounts so the caller can report the total.
 */
static int
getfsstat64_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	struct statfs64 sfs;
	int error;

	/* Copy out only while there is still room in the user buffer. */
	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx: vfs_context_current(), mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the fsstat
		 * cache. MNT_WAIT overrides MNT_NOWAIT.
		 *
		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
		 * getfsstat, since the constants are out of the same
		 * namespace.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx: vfs_context_current(), VFS_USER_EVENT)))) {
			/* Dead/unrefreshable mount: skip it but keep iterating. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		vfs_get_statfs64(mp, sfs: &sfs);
		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* Advance the user cursor past the record just written. */
		fstp->sfsp += sizeof(sfs);
	}
	/* Counted even when not copied, so callers can size their buffers. */
	fstp->count++;
	return VFS_RETURNED;
}
3882
3883/*
3884 * Get statistics on all file systems in 64 bit mode.
3885 */
3886int
3887getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3888{
3889 user_addr_t sfsp;
3890 int count, maxcount;
3891 struct getfsstat_struct fst;
3892
3893 maxcount = uap->bufsize / sizeof(struct statfs64);
3894
3895 sfsp = uap->buf;
3896 count = 0;
3897
3898 fst.sfsp = sfsp;
3899 fst.flags = uap->flags;
3900 fst.count = 0;
3901 fst.error = 0;
3902 fst.maxcount = maxcount;
3903
3904 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, callout: getfsstat64_callback, arg: &fst);
3905
3906 if (fst.error) {
3907 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3908 return fst.error;
3909 }
3910
3911 if (fst.sfsp && fst.count > fst.maxcount) {
3912 *retval = fst.maxcount;
3913 } else {
3914 *retval = fst.count;
3915 }
3916
3917 return 0;
3918}
3919
3920/*
3921 * gets the associated vnode with the file descriptor passed.
3922 * as input
3923 *
3924 * INPUT
3925 * ctx - vfs context of caller
3926 * fd - file descriptor for which vnode is required.
3927 * vpp - Pointer to pointer to vnode to be returned.
3928 *
3929 * The vnode is returned with an iocount so any vnode obtained
3930 * by this call needs a vnode_put
3931 *
3932 */
3933int
3934vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3935{
3936 int error;
3937 vnode_t vp;
3938 struct fileproc *fp;
3939 proc_t p = vfs_context_proc(ctx);
3940
3941 *vpp = NULLVP;
3942
3943 error = fp_getfvp(p, fd, resultfp: &fp, resultvp: &vp);
3944 if (error) {
3945 return error;
3946 }
3947
3948 error = vnode_getwithref(vp);
3949 if (error) {
3950 (void)fp_drop(p, fd, fp, locked: 0);
3951 return error;
3952 }
3953
3954 (void)fp_drop(p, fd, fp, locked: 0);
3955 *vpp = vp;
3956 return error;
3957}
3958
3959/*
3960 * Wrapper function around namei to start lookup from a directory
3961 * specified by a file descriptor ni_dirfd.
3962 *
3963 * In addition to all the errors returned by namei, this call can
3964 * return ENOTDIR if the file descriptor does not refer to a directory.
3965 * and EBADF if the file descriptor is not valid.
3966 */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * dirfd only matters for a fresh, relative lookup: not AT_FDCWD,
	 * not a continued lookup, and no caller-supplied starting dvp.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first path byte to detect an absolute path. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			/* Resolve dirfd to a vnode (returned with an iocount). */
			error = vnode_getfromfd(ctx: ndp->ni_cnd.cn_context, fd: dirfd,
			    vpp: &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(vp: dvp_at) != VDIR) {
				vnode_put(vp: dvp_at);
				return ENOTDIR;
			}

			/* Start the lookup at dvp_at instead of the CWD. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(vp: dvp_at);
			return error;
		}
	}

	/* Absolute path, AT_FDCWD, or continued lookup: plain namei. */
	return namei(ndp);
}
4010
4011/*
4012 * Change current working directory to a given file descriptor.
4013 */
4014/* ARGSUSED */
int
fchdir(proc_t p, vfs_context_t ctx, int fd, bool per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;

	AUDIT_ARG(fd, fd);
	if (per_thread && fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding. The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(vp: tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* The new working directory must be a directory. */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, dvp: vp);
	if (error) {
		goto out;
	}
#endif
	/* Caller must have search permission on the directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If the directory is a mount point, descend to the root of the
	 * mounted filesystem (repeatedly, for stacked mounts).
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Trade the iocount for a long-lived usecount held by the CWD. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			/* Mark the process as having per-thread CWDs. */
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			/* No thread to attach the CWD to: undo the usecount. */
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* Lock order: dirs lock before fd lock; see chroot(). */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount on the previous working directory. */
	if (tvp) {
		vnode_rele(vp: tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(fd);

	return error;
}
4126
int
sys_fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	/* Process-wide fchdir: per_thread == false. */
	return fchdir(p, ctx: vfs_context_current(), fd: uap->fd, false);
}
4132
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	/* Per-thread fchdir; fd == -1 reverts to the process-wide CWD. */
	return fchdir(p, ctx: vfs_context_current(), fd: uap->fd, true);
}
4138
4139
4140/*
4141 * Change current working directory (".").
4142 *
4143 * Returns: 0 Success
4144 * change_dir:ENOTDIR
4145 * change_dir:???
4146 * vnode_ref:ENOENT No such file or directory
4147 */
4148/* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;

	/* change_dir validates VDIR, MACF chdir, and search permission. */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Trade the iocount for a long-lived usecount held by the CWD. */
	if ((error = vnode_ref(vp: ndp->ni_vp))) {
		vnode_put(vp: ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(vp: ndp->ni_vp);

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			/* Mark the process as having per-thread CWDs. */
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			/* No thread to attach the CWD to: undo the usecount. */
			vnode_rele(vp: ndp->ni_vp);
			return ENOENT;
		}
	} else {
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount on the previous working directory. */
	if (tvp) {
		vnode_rele(vp: tvp);
	}

	return 0;
}
4194
4195
4196/*
4197 * Change current working directory (".").
4198 *
4199 * Returns: 0 Success
4200 * chdir_internal:ENOTDIR
4201 * chdir_internal:ENOENT No such file or directory
4202 * chdir_internal:???
4203 */
4204/* ARGSUSED */
static int
common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
{
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/* Set up the lookup of the user path; chdir_internal runs it. */
	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	return chdir_internal(p, ctx, ndp: &nd, per_thread);
}
4216
4217
4218/*
4219 * chdir
4220 *
4221 * Change current working directory (".") for the entire process
4222 *
4223 * Parameters: p Process requesting the call
4224 * uap User argument descriptor (see below)
4225 * retval (ignored)
4226 *
4227 * Indirect parameters: uap->path Directory path
4228 *
4229 * Returns: 0 Success
4230 * common_chdir: ENOTDIR
4231 * common_chdir: ENOENT No such file or directory
4232 * common_chdir: ???
4233 *
4234 */
int
sys_chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
{
	/* Process-wide chdir: per_thread == 0. */
	return common_chdir(p, uap: (void *)uap, per_thread: 0);
}
4240
4241/*
4242 * __pthread_chdir
4243 *
4244 * Change current working directory (".") for a single thread
4245 *
4246 * Parameters: p Process requesting the call
4247 * uap User argument descriptor (see below)
4248 * retval (ignored)
4249 *
4250 * Indirect parameters: uap->path Directory path
4251 *
4252 * Returns: 0 Success
4253 * common_chdir: ENOTDIR
4254 * common_chdir: ENOENT No such file or directory
4255 * common_chdir: ???
4256 *
4257 */
int
__pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
{
	/* Per-thread chdir: per_thread == 1. */
	return common_chdir(p, uap: (void *)uap, per_thread: 1);
}
4263
4264
4265/*
4266 * Change notion of root (``/'') directory.
4267 */
4268/* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* Changing the root is restricted to the superuser. */
	if ((error = suser(cred: kauth_cred_get(), acflag: &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* change_dir validates VDIR, MACF chdir, and search permission. */
	error = change_dir(ndp: &nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, dvp: nd.ni_vp,
	    cnp: &nd.ni_cnd);
	if (error) {
		vnode_put(vp: nd.ni_vp);
		return error;
	}
#endif

	/* Trade the iocount from change_dir for a long-lived usecount. */
	if ((error = vnode_ref(vp: nd.ni_vp))) {
		vnode_put(vp: nd.ni_vp);
		return error;
	}
	vnode_put(vp: nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Drop the usecount on the previous root, if there was one. */
	if (tvp != NULL) {
		vnode_rele(vp: tvp);
	}

	return 0;
}
4326
4327#define PATHSTATICBUFLEN 256
4328#define PIVOT_ROOT_ENTITLEMENT \
4329 "com.apple.private.vfs.pivot-root"
4330
4331#if defined(XNU_TARGET_OS_OSX)
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	/* Small stack buffers; fall back to heap for long paths below. */
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(cred: kauth_cred_get(), acflag: &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy in the new-root path; retry with a MAXPATHLEN heap buffer. */
	error = copyinstr(uaddr: uap->new_rootfs_path_before, kaddr: &new_rootfs_path_before[0], PATHSTATICBUFLEN, done: &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uaddr: uap->new_rootfs_path_before, kaddr: new_rootfs_path_before_buf, MAXPATHLEN, done: &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Same for the path where the old root will be re-mounted. */
	error = copyinstr(uaddr: uap->old_rootfs_path_after, kaddr: &old_rootfs_path_after[0], PATHSTATICBUFLEN, done: &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uaddr: uap->old_rootfs_path_after, kaddr: old_rootfs_path_after_buf, MAXPATHLEN, done: &bytes_copied);
	}
	if (error) {
		goto out;
	}

	/* Pick whichever buffer (stack or heap) holds each path. */
	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(path: incoming, flags: 0, vpp: &incoming_rootvp, ctx: vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(vp: incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, fflag: 0, ctx: vfs_context_kernel());
	if (error) {
		goto out;
	}

	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	if (incoming_rootvp != NULLVP) {
		vnode_put(vp: incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
4423#else
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	/* pivot_root is only implemented on macOS targets. */
	return nosys(p, NULL, retval);
}
4429#endif /* XNU_TARGET_OS_OSX */
4430
4431/*
4432 * Common routine for chroot and chdir.
4433 *
4434 * Returns: 0 Success
4435 * ENOTDIR Not a directory
4436 * namei:??? [anything namei can return]
4437 * vnode_authorize:??? [anything vnode_authorize can return]
4438 */
4439static int
4440change_dir(struct nameidata *ndp, vfs_context_t ctx)
4441{
4442 vnode_t vp;
4443 int error;
4444
4445 if ((error = namei(ndp))) {
4446 return error;
4447 }
4448 nameidone(ndp);
4449 vp = ndp->ni_vp;
4450
4451 if (vp->v_type != VDIR) {
4452 vnode_put(vp);
4453 return ENOTDIR;
4454 }
4455
4456#if CONFIG_MACF
4457 error = mac_vnode_check_chdir(ctx, dvp: vp);
4458 if (error) {
4459 vnode_put(vp);
4460 return error;
4461 }
4462#endif
4463
4464 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4465 if (error) {
4466 vnode_put(vp);
4467 return error;
4468 }
4469
4470 return error;
4471}
4472
4473/*
4474 * Free the vnode data (for directories) associated with the file glob.
4475 */
4476struct fd_vn_data *
4477fg_vn_data_alloc(void)
4478{
4479 struct fd_vn_data *fvdata;
4480
4481 /* Allocate per fd vnode data */
4482 fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
4483 lck_mtx_init(lck: &fvdata->fv_lock, grp: &fd_vn_lck_grp, attr: &fd_vn_lck_attr);
4484 return fvdata;
4485}
4486
4487/*
4488 * Free the vnode data (for directories) associated with the file glob.
4489 */
4490void
4491fg_vn_data_free(void *fgvndata)
4492{
4493 struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4494
4495 kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4496 lck_mtx_destroy(lck: &fvdata->fv_lock, grp: &fd_vn_lck_grp);
4497 kfree_type(struct fd_vn_data, fvdata);
4498}
4499
4500/*
4501 * Check permissions, allocate an open file structure,
4502 * and call the device open routine if any.
4503 *
4504 * Returns: 0 Success
4505 * EINVAL
4506 * EINTR
4507 * falloc:ENFILE
4508 * falloc:EMFILE
4509 * falloc:ENOMEM
4510 * vn_open_auth:???
4511 * dupfdopen:???
4512 * VNOP_ADVLOCK:???
4513 * vnode_setsize:???
4514 *
4515 * XXX Need to implement uid, gid
4516 */
4517int
4518open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
4519 struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
4520{
4521 proc_t p = vfs_context_proc(ctx);
4522 kauth_cred_t p_cred = current_cached_proc_cred(PROC_NULL);
4523 uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
4524 struct fileproc *fp;
4525 vnode_t vp;
4526 int flags, oflags, amode;
4527 int type, indx, error;
4528 struct vfs_context context;
4529 vnode_t authvp = NULLVP;
4530
4531 oflags = uflags;
4532
4533 amode = oflags & O_ACCMODE;
4534 /*
4535 * Because O_RDONLY is 0, it is not possible to distinguish between
4536 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
4537 * with FREAD/FWRITE.
4538 */
4539 if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
4540 return EINVAL;
4541 }
4542
4543 flags = FFLAGS(uflags);
4544 CLR(flags, FENCRYPTED);
4545 CLR(flags, FUNENCRYPTED);
4546
4547 AUDIT_ARG(fflags, oflags);
4548 AUDIT_ARG(mode, vap->va_mode);
4549
4550 if ((error = falloc_withinit(p, p_cred, ctx, resultfp: &fp, resultfd: &indx, fp_init, initarg)) != 0) {
4551 return error;
4552 }
4553 if (flags & O_CLOEXEC) {
4554 fp->fp_flags |= FP_CLOEXEC;
4555 }
4556 if (flags & O_CLOFORK) {
4557 fp->fp_flags |= FP_CLOFORK;
4558 }
4559
4560 /* setup state to recognize when fdesc_open was called */
4561 uu->uu_dupfd = -1;
4562
4563 /*
4564 * Disable read/write access if file is opened with O_EVTONLY and
4565 * the process has requested to deny read/write access.
4566 */
4567 if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
4568 flags &= ~(FREAD | FWRITE);
4569 }
4570
4571 if (authfd != AUTH_OPEN_NOAUTHFD) {
4572 error = vnode_getfromfd(ctx, fd: authfd, vpp: &authvp);
4573 if (error) {
4574 fp_free(p, fd: indx, fp);
4575 return error;
4576 }
4577 }
4578
4579 if ((error = vn_open_auth(ndp, fmode: &flags, vap, authvp))) {
4580 if (authvp != NULLVP) {
4581 vnode_put(vp: authvp);
4582 }
4583 if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
4584 if ((error = dupfdopen(p, indx, dfd: uu->uu_dupfd, mode: flags, error)) == 0) {
4585 *retval = indx;
4586 return 0;
4587 }
4588 }
4589 if (error == ERESTART) {
4590 error = EINTR;
4591 }
4592 fp_free(p, fd: indx, fp);
4593 return error;
4594 }
4595
4596 if (authvp != NULLVP) {
4597 vnode_put(vp: authvp);
4598 }
4599
4600 uu->uu_dupfd = 0;
4601 vp = ndp->ni_vp;
4602
4603 fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
4604 fp->fp_glob->fg_ops = &vnops;
4605 fp_set_data(fp, fg_data: vp);
4606
4607#if CONFIG_FILE_LEASES
4608 /*
4609 * If we are creating a file or open with truncate, we need to break the
4610 * lease if there is a read lease placed on the parent dir.
4611 */
4612 if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
4613 vnode_breakdirlease(vp, true, oflags);
4614 }
4615 /* Now check if there is a lease placed on the file itself. */
4616 error = vnode_breaklease(vp, oflags, ctx);
4617 if (error) {
4618 goto bad;
4619 }
4620#endif /* CONFIG_FILE_LEASES */
4621
4622 if (flags & (O_EXLOCK | O_SHLOCK)) {
4623 struct flock lf = {
4624 .l_whence = SEEK_SET,
4625 };
4626
4627 if (flags & O_EXLOCK) {
4628 lf.l_type = F_WRLCK;
4629 } else {
4630 lf.l_type = F_RDLCK;
4631 }
4632 type = F_FLOCK;
4633 if ((flags & FNONBLOCK) == 0) {
4634 type |= F_WAIT;
4635 }
4636#if CONFIG_MACF
4637 error = mac_file_check_lock(cred: vfs_context_ucred(ctx), fg: fp->fp_glob,
4638 F_SETLK, fl: &lf);
4639 if (error) {
4640 goto bad;
4641 }
4642#endif
4643 if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
4644 goto bad;
4645 }
4646 fp->fp_glob->fg_flag |= FWASLOCKED;
4647 }
4648
4649 /* try to truncate by setting the size attribute */
4650 if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, ioflag: 0, ctx)) != 0)) {
4651 goto bad;
4652 }
4653
4654 /*
4655 * For directories we hold some additional information in the fd.
4656 */
4657 if (vnode_vtype(vp) == VDIR) {
4658 fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
4659 } else {
4660 fp->fp_glob->fg_vn_data = NULL;
4661 }
4662
4663#if CONFIG_SECLUDED_MEMORY
4664 if (secluded_for_filecache && vnode_vtype(vp) == VREG) {
4665 memory_object_control_t moc;
4666 const char *v_name;
4667
4668 moc = ubc_getobject(vp, UBC_FLAGS_NONE);
4669
4670 if (moc == MEMORY_OBJECT_CONTROL_NULL) {
4671 /* nothing to do... */
4672 } else if (fp->fp_glob->fg_flag & FWRITE) {
4673 /* writable -> no longer eligible for secluded pages */
4674 memory_object_mark_eligible_for_secluded(moc,
4675 FALSE);
4676 } else if (secluded_for_filecache == SECLUDED_FILECACHE_APPS) {
4677 char pathname[32] = { 0, };
4678 size_t copied;
4679 /* XXX FBDP: better way to detect /Applications/ ? */
4680 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4681 (void)copyinstr(ndp->ni_dirp,
4682 pathname,
4683 sizeof(pathname),
4684 &copied);
4685 } else {
4686 copystr(CAST_DOWN(void *, ndp->ni_dirp),
4687 pathname,
4688 sizeof(pathname),
4689 &copied);
4690 }
4691 pathname[sizeof(pathname) - 1] = '\0';
4692 if (strncmp(pathname,
4693 "/Applications/",
4694 strlen("/Applications/")) == 0 &&
4695 strncmp(pathname,
4696 "/Applications/Camera.app/",
4697 strlen("/Applications/Camera.app/")) != 0) {
4698 /*
4699 * not writable
4700 * AND from "/Applications/"
4701 * AND not from "/Applications/Camera.app/"
4702 * ==> eligible for secluded
4703 */
4704 memory_object_mark_eligible_for_secluded(moc,
4705 TRUE);
4706 }
4707 } else if (secluded_for_filecache == SECLUDED_FILECACHE_RDONLY &&
4708 (v_name = vnode_getname(vp))) {
4709 size_t len = strlen(v_name);
4710
4711 if (!strncmp(v_name, "dyld", len) ||
4712 !strncmp(v_name, "launchd", len) ||
4713 !strncmp(v_name, "Camera", len) ||
4714 !strncmp(v_name, "SpringBoard", len) ||
4715 !strncmp(v_name, "backboardd", len)) {
4716 /*
4717 * This file matters when launching Camera:
4718 * do not store its contents in the secluded
4719 * pool that will be drained on Camera launch.
4720 */
4721 memory_object_mark_eligible_for_secluded(moc,
4722 FALSE);
4723 } else if (!strncmp(v_name, "audiomxd", len) ||
4724 !strncmp(v_name, "mediaplaybackd", len)) {
4725 memory_object_mark_eligible_for_secluded(moc,
4726 FALSE);
4727 memory_object_mark_for_realtime(moc,
4728 true);
4729 } else if (!strncmp(v_name, "bluetoothd", len)) {
4730 /*
4731 * bluetoothd might be needed for realtime audio
4732 * playback.
4733 */
4734 memory_object_mark_eligible_for_secluded(moc,
4735 FALSE);
4736 memory_object_mark_for_realtime(moc,
4737 true);
4738 } else {
4739 char pathname[64] = { 0, };
4740 size_t copied;
4741 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4742 (void)copyinstr(ndp->ni_dirp,
4743 pathname,
4744 sizeof(pathname),
4745 &copied);
4746 } else {
4747 copystr(CAST_DOWN(void *, ndp->ni_dirp),
4748 pathname,
4749 sizeof(pathname),
4750 &copied);
4751 }
4752 pathname[sizeof(pathname) - 1] = '\0';
4753 if (strncmp(pathname,
4754 "/Library/Audio/Plug-Ins/",
4755 strlen("/Library/Audio/Plug-Ins/")) == 0 ||
4756 strncmp(pathname,
4757 "/System/Library/Audio/Plug-Ins/",
4758 strlen("/System/Library/Audio/Plug-Ins/")) == 0) {
4759 /*
4760 * This may be an audio plugin required
4761 * for realtime playback.
4762 * ==> NOT eligible for secluded.
4763 */
4764 memory_object_mark_eligible_for_secluded(moc,
4765 FALSE);
4766 memory_object_mark_for_realtime(moc,
4767 true);
4768 }
4769 }
4770 vnode_putname(v_name);
4771 }
4772 }
4773#endif /* CONFIG_SECLUDED_MEMORY */
4774
4775 vnode_put(vp);
4776
4777 /*
4778 * The first terminal open (without a O_NOCTTY) by a session leader
4779 * results in it being set as the controlling terminal.
4780 */
4781 if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
4782 !(flags & O_NOCTTY)) {
4783 int tmp = 0;
4784
4785 (void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
4786 (caddr_t)&tmp, ctx);
4787 }
4788
4789 proc_fdlock(p);
4790 procfdtbl_releasefd(p, fd: indx, NULL);
4791
4792 fp_drop(p, fd: indx, fp, locked: 1);
4793 proc_fdunlock(p);
4794
4795 *retval = indx;
4796
4797 return 0;
4798bad:
4799 context = *vfs_context_current();
4800 context.vc_ucred = fp->fp_glob->fg_cred;
4801
4802 if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
4803 (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
4804 struct flock lf = {
4805 .l_whence = SEEK_SET,
4806 .l_type = F_UNLCK,
4807 };
4808
4809 (void)VNOP_ADVLOCK(
4810 vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
4811 }
4812
4813 vn_close(vp, flags: fp->fp_glob->fg_flag, ctx: &context);
4814 vnode_put(vp);
4815 fp_free(p, fd: indx, fp);
4816
4817 return error;
4818}
4819
4820/*
4821 * While most of the *at syscall handlers can call nameiat() which
4822 * is a wrapper around namei, the use of namei and initialisation
4823 * of nameidata are far removed and in different functions - namei
4824 * gets called in vn_open_auth for open1. So we'll just do here what
4825 * nameiat() does.
4826 */
4827static int
4828open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
4829 struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
4830 int dirfd, int authfd)
4831{
4832 if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
4833 int error;
4834 char c;
4835
4836 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4837 error = copyin(ndp->ni_dirp, &c, sizeof(char));
4838 if (error) {
4839 return error;
4840 }
4841 } else {
4842 c = *((char *)(ndp->ni_dirp));
4843 }
4844
4845 if (c != '/') {
4846 vnode_t dvp_at;
4847
4848 error = vnode_getfromfd(ctx: ndp->ni_cnd.cn_context, fd: dirfd,
4849 vpp: &dvp_at);
4850 if (error) {
4851 return error;
4852 }
4853
4854 if (vnode_vtype(vp: dvp_at) != VDIR) {
4855 vnode_put(vp: dvp_at);
4856 return ENOTDIR;
4857 }
4858
4859 ndp->ni_dvp = dvp_at;
4860 ndp->ni_cnd.cn_flags |= USEDVP;
4861 error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
4862 retval, authfd);
4863 vnode_put(vp: dvp_at);
4864 return error;
4865 }
4866 }
4867
4868 return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
4869}
4870
4871/*
4872 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4873 *
4874 * Parameters: p Process requesting the open
4875 * uap User argument descriptor (see below)
4876 * retval Pointer to an area to receive the
 * return value from the system call
4878 *
4879 * Indirect: uap->path Path to open (same as 'open')
 * uap->flags Flags to open (same as 'open')
4881 * uap->uid UID to set, if creating
4882 * uap->gid GID to set, if creating
4883 * uap->mode File mode, if creating (same as 'open')
4884 * uap->xsecurity ACL to set, if creating
4885 *
4886 * Returns: 0 Success
4887 * !0 errno value
4888 *
4889 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4890 *
 * XXX: We should enumerate the possible errno values here, and where
4892 * in the code they originated.
4893 */
4894int
4895open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
4896{
4897 int ciferror;
4898 kauth_filesec_t xsecdst;
4899 struct vnode_attr va;
4900 struct nameidata nd;
4901 int cmode;
4902
4903 AUDIT_ARG(owner, uap->uid, uap->gid);
4904
4905 xsecdst = NULL;
4906 if ((uap->xsecurity != USER_ADDR_NULL) &&
4907 ((ciferror = kauth_copyinfilesec(xsecurity: uap->xsecurity, xsecdestpp: &xsecdst)) != 0)) {
4908 return ciferror;
4909 }
4910
4911 VATTR_INIT(&va);
4912 cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
4913 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4914 if (uap->uid != KAUTH_UID_NONE) {
4915 VATTR_SET(&va, va_uid, uap->uid);
4916 }
4917 if (uap->gid != KAUTH_GID_NONE) {
4918 VATTR_SET(&va, va_gid, uap->gid);
4919 }
4920 if (xsecdst != NULL) {
4921 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4922 va.va_vaflags |= VA_FILESEC_ACL;
4923 }
4924
4925 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4926 uap->path, vfs_context_current());
4927
4928 ciferror = open1(ctx: vfs_context_current(), ndp: &nd, uflags: uap->flags, vap: &va,
4929 NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
4930 if (xsecdst != NULL) {
4931 kauth_filesec_free(fsp: xsecdst);
4932 }
4933
4934 return ciferror;
4935}
4936
4937/*
4938 * Go through the data-protected atomically controlled open (2)
4939 *
4940 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4941 */
4942static int
4943openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4944 int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
4945{
4946 /*
4947 * Follow the same path as normal open(2)
4948 * Look up the item if it exists, and acquire the vnode.
4949 */
4950 struct vnode_attr va;
4951 struct nameidata nd;
4952 int cmode;
4953 int error;
4954 struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
4955
4956 VATTR_INIT(&va);
4957 /* Mask off all but regular access permissions */
4958 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4959 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4960
4961 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
4962 path, ctx);
4963
4964 /*
4965 * Initialize the extra fields in vnode_attr to pass down our
4966 * extra fields.
4967 * 1. target cprotect class.
4968 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4969 */
4970 if (flags & O_CREAT) {
4971 /* lower level kernel code validates that the class is valid before applying it. */
4972 if (class != PROTECTION_CLASS_DEFAULT) {
4973 /*
4974 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4975 * file behave the same as open (2)
4976 */
4977 VATTR_SET(&va, va_dataprotect_class, class);
4978 }
4979 }
4980
4981 if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
4982 if (flags & (O_RDWR | O_WRONLY)) {
4983 /*
4984 * Not allowed to write raw encrypted bytes or when opening authenticated.
4985 */
4986 return EINVAL;
4987 }
4988 if (dpflags & O_DP_GETRAWENCRYPTED) {
4989 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4990 }
4991 if (dpflags & O_DP_GETRAWUNENCRYPTED) {
4992 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4993 }
4994 if (dpflags & O_DP_AUTHENTICATE) {
4995 VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
4996 }
4997 }
4998
4999 error = open1at(ctx: vfs_context_current(), ndp: &nd, uflags: flags, vap: &va,
5000 NULL, NULL, retval, dirfd: fd, authfd);
5001
5002 return error;
5003}
5004
5005int
5006openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
5007{
5008 if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
5009 return EINVAL;
5010 }
5011
5012 return openat_dprotected_internal(ctx: vfs_context_current(), path: uap->path, flags: uap->flags, mode: uap->mode,
5013 class: uap->class, dpflags: uap->dpflags, fd: uap->fd, authfd: uap->authfd, segflg: UIO_USERSPACE, retval);
5014}
5015
5016int
5017open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
5018{
5019 if (uap->dpflags & O_DP_AUTHENTICATE) {
5020 return EINVAL;
5021 }
5022
5023 return openat_dprotected_internal(ctx: vfs_context_current(), path: uap->path, flags: uap->flags, mode: uap->mode,
5024 class: uap->class, dpflags: uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, segflg: UIO_USERSPACE, retval);
5025}
5026
5027static int
5028openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5029 int fd, enum uio_seg segflg, int *retval)
5030{
5031 struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5032 struct {
5033 struct vnode_attr va;
5034 struct nameidata nd;
5035 } *__open_data;
5036 struct vnode_attr *vap;
5037 struct nameidata *ndp;
5038 int cmode;
5039 int error;
5040
5041 __open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
5042 vap = &__open_data->va;
5043 ndp = &__open_data->nd;
5044
5045 VATTR_INIT(vap);
5046 /* Mask off all but regular access permissions */
5047 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5048 VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);
5049
5050 NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
5051 segflg, path, ctx);
5052
5053 error = open1at(ctx, ndp, uflags: flags, vap, NULL, NULL, retval, dirfd: fd, AUTH_OPEN_NOAUTHFD);
5054
5055 kfree_type(typeof(*__open_data), __open_data);
5056
5057 return error;
5058}
5059
int
open(proc_t p, struct open_args *uap, int32_t *retval)
{
	/* open(2) is a cancellation point; check, then defer to the nocancel variant. */
	__pthread_testcancel(presyscall: 1);
	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
}
5066
int
open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
    int32_t *retval)
{
	/* open(2) without a cancellation check; path resolves relative to the CWD. */
	return openat_internal(ctx: vfs_context_current(), path: uap->path, flags: uap->flags,
	    mode: uap->mode, AT_FDCWD, segflg: UIO_USERSPACE, retval);
}
5074
int
openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
    int32_t *retval)
{
	/* openat(2) without a cancellation check; relative paths anchor at uap->fd. */
	return openat_internal(ctx: vfs_context_current(), path: uap->path, flags: uap->flags,
	    mode: uap->mode, fd: uap->fd, segflg: UIO_USERSPACE, retval);
}
5082
int
openat(proc_t p, struct openat_args *uap, int32_t *retval)
{
	/* openat(2) is a cancellation point; check, then defer to the nocancel variant. */
	__pthread_testcancel(presyscall: 1);
	return openat_nocancel(p, uap: (struct openat_nocancel_args *)uap, retval);
}
5089
5090#define OPEN_BY_ID_ENTITLEMENT "com.apple.private.vfs.open-by-id"
5091
5092static boolean_t
5093vfs_context_can_open_by_id(vfs_context_t ctx)
5094{
5095 if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5096 return TRUE;
5097 }
5098
5099 return IOTaskHasEntitlement(task: vfs_context_task(ctx),
5100 OPEN_BY_ID_ENTITLEMENT);
5101}
5102
5103/*
5104 * openbyid_np: open a file given a file system id and a file system object id
5105 * the hfs file system object id is an fsobj_id_t {uint32, uint32}
 * for file systems that don't support object ids, it is a node id (uint64_t).
5107 *
5108 * Parameters: p Process requesting the open
5109 * uap User argument descriptor (see below)
5110 * retval Pointer to an area to receive the
 * return value from the system call
5112 *
5113 * Indirect: uap->path Path to open (same as 'open')
5114 *
5115 * uap->fsid id of target file system
5116 * uap->objid id of target file system object
5117 * uap->flags Flags to open (same as 'open')
5118 *
5119 * Returns: 0 Success
5120 * !0 errno value
5121 *
5122 *
 * XXX: We should enumerate the possible errno values here, and where
5124 * in the code they originated.
5125 */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Restricted to platform binaries or holders of the open-by-id entitlement. */
	if (!vfs_context_can_open_by_id(ctx)) {
		return EPERM;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/*
	 * Resolve the path from (fsid, objid), growing the buffer by
	 * MAXPATHLEN and retrying while the filesystem reports ENOSPC.
	 * On any error the buffer has already been freed before looping/exit.
	 */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	/* NUL-terminate, then open the resolved (kernel-space) path. */
	buf[pathlen] = 0;

	error = openat_internal(
		ctx, path: (user_addr_t)buf, flags: uap->oflags, mode: 0, AT_FDCWD, segflg: UIO_SYSSPACE, retval);

	kfree_data(buf, buflen + 1);

	return error;
}
5182
5183
5184/*
5185 * Create a special file.
5186 */
5187static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
5188 int fd);
5189
/*
 * Common implementation for mknod(2)/mknodat(2).  FIFO requests are
 * forwarded to mkfifo1(); character/block device nodes are created here
 * (superuser only).  'vap' carries the creation attributes (mode, rdev);
 * 'fd' is the directory fd for *at semantics (AT_FDCWD for plain mknod).
 */
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Creating device nodes requires superuser privileges. */
	if ((error = suser(cred: vfs_context_ucred(ctx), acflag: &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(ndp: &nd, dirfd: fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The target must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only character and block special files are handled here. */
	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp: nd.ni_dvp, cnp: &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_authorize(vp: dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Adding an entry to the directory: break any read lease on it first. */
	vnode_breakdirlease(vp: dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, name: nd.ni_cnd.cn_nameptr, name_len: nd.ni_cnd.cn_namelen, name_hashval: nd.ni_cnd.cn_hash, flags: update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(vp: dvp);

	return error;
}
5292
5293int
5294mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5295{
5296 struct vnode_attr va;
5297
5298 VATTR_INIT(&va);
5299 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5300 VATTR_SET(&va, va_rdev, uap->dev);
5301
5302 return mknodat_internal(p, upath: uap->path, vap: &va, mode: (mode_t)uap->mode, AT_FDCWD);
5303}
5304
5305int
5306mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5307{
5308 struct vnode_attr va;
5309
5310 VATTR_INIT(&va);
5311 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5312 VATTR_SET(&va, va_rdev, uap->dev);
5313
5314 return mknodat_internal(p, upath: uap->path, vap: &va, mode: (mode_t)uap->mode, fd: uap->fd);
5315}
5316
5317/*
5318 * Create a named pipe.
5319 *
5320 * Returns: 0 Success
5321 * EEXIST
5322 * namei:???
5323 * vnode_authorize:???
5324 * vn_create:???
5325 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	/* Look up the target name, holding the parent directory locked. */
	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(ndp: &nd, dirfd: fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(vp: dvp);

	return error;
}
5368
5369
5370/*
5371 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5372 *
5373 * Parameters: p Process requesting the open
5374 * uap User argument descriptor (see below)
5375 * retval (Ignored)
5376 *
5377 * Indirect: uap->path Path to fifo (same as 'mkfifo')
5378 * uap->uid UID to set
5379 * uap->gid GID to set
5380 * uap->mode File mode to set (same as 'mkfifo')
5381 * uap->xsecurity ACL to set, if creating
5382 *
5383 * Returns: 0 Success
5384 * !0 errno value
5385 *
5386 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
5387 *
5388 * XXX: We should enummerate the possible errno values here, and where
5389 * in the code they originated.
5390 */
5391int
5392mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
5393{
5394 int ciferror;
5395 kauth_filesec_t xsecdst;
5396 struct vnode_attr va;
5397
5398 AUDIT_ARG(owner, uap->uid, uap->gid);
5399
5400 xsecdst = KAUTH_FILESEC_NONE;
5401 if (uap->xsecurity != USER_ADDR_NULL) {
5402 if ((ciferror = kauth_copyinfilesec(xsecurity: uap->xsecurity, xsecdestpp: &xsecdst)) != 0) {
5403 return ciferror;
5404 }
5405 }
5406
5407 VATTR_INIT(&va);
5408 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5409 if (uap->uid != KAUTH_UID_NONE) {
5410 VATTR_SET(&va, va_uid, uap->uid);
5411 }
5412 if (uap->gid != KAUTH_GID_NONE) {
5413 VATTR_SET(&va, va_gid, uap->gid);
5414 }
5415 if (xsecdst != KAUTH_FILESEC_NONE) {
5416 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5417 va.va_vaflags |= VA_FILESEC_ACL;
5418 }
5419
5420 ciferror = mkfifo1(ctx: vfs_context_current(), upath: uap->path, vap: &va, AT_FDCWD);
5421
5422 if (xsecdst != KAUTH_FILESEC_NONE) {
5423 kauth_filesec_free(fsp: xsecdst);
5424 }
5425 return ciferror;
5426}
5427
5428/* ARGSUSED */
5429int
5430mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5431{
5432 struct vnode_attr va;
5433
5434 VATTR_INIT(&va);
5435 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5436
5437 return mkfifo1(ctx: vfs_context_current(), upath: uap->path, vap: &va, AT_FDCWD);
5438}
5439
5440int
5441mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5442{
5443 struct vnode_attr va;
5444
5445 VATTR_INIT(&va);
5446 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5447
5448 return mkfifo1(ctx: vfs_context_current(), upath: uap->path, vap: &va, fd: uap->fd);
5449}
5450
5451extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5452extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5453extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5454
/*
 * Build the path of 'dvp' (optionally with 'leafname' appended) into 'path',
 * a buffer of '_len' bytes.  Returns the resulting length including the NUL
 * and sets *truncated_path when the result does not reflect the full path.
 * Never fails outright: on lookup errors it falls back to an ancestor's
 * path, the mount point, or "/".
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	/* Resolve the vnode to a path, with or without firmlink translation. */
	if (firmlink) {
		ret = vn_getpath(vp: dvp, pathbuf: path, len: &len);
	} else {
		ret = vn_getpath_no_firmlink(vp: dvp, pathbuf: path, len: &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* Overwrite the trailing NUL with '/', then append the leaf name. */
			path[len - 1] = '/';
			len += strlcpy(dst: &path[len], src: leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(s: path, c: '/');
				if (ptr) {
					*ptr = '\0'; // chop off the string at the last directory component
				}
				len = (int)strlen(s: path) + 1;
			}
		}
	} else if (ret == 0) {
		/* Path resolved but is too long to safely append anything. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/* Walk up the hierarchy until some ancestor's path fits the buffer. */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(dst: path, src: mydvp->v_mount->mnt_vfsstat.f_mntonname, n: _len);
				break;
			} else {
				// no parent and no mount point? only thing is to punt and say "/" changed
				strlcpy(dst: path, src: "/", n: _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(vp: mydvp, pathbuf: path, len: &len);
			} else {
				ret = vn_getpath_no_firmlink(vp: mydvp, pathbuf: path, len: &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5522
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Firmlink-translated variant of safe_getpath_new(). */
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, firmlink: 1);
}
5528
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Variant of safe_getpath_new() that does not translate firmlinks. */
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, firmlink: 0);
}
5534
5535/*
5536 * Make a hard file link.
5537 *
5538 * Returns: 0 Success
5539 * EPERM
5540 * EEXIST
5541 * EXDEV
5542 * namei:???
5543 * vnode_authorize:???
5544 * VNOP_LINK:???
5545 */
5546/* ARGSUSED */
5547static int
5548linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
5549 user_addr_t link, int flag, enum uio_seg segflg)
5550{
5551 vnode_t vp, pvp, dvp, lvp;
5552 struct nameidata nd;
5553 int follow;
5554 int error;
5555#if CONFIG_FSE
5556 fse_info finfo;
5557#endif
5558 int need_event, has_listeners, need_kpath2;
5559 char *target_path = NULL;
5560 char *no_firmlink_path = NULL;
5561 int truncated = 0;
5562 int truncated_no_firmlink_path = 0;
5563
5564 vp = dvp = lvp = NULLVP;
5565
5566 /* look up the object we are linking to */
5567 follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
5568 NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
5569 segflg, path, ctx);
5570
5571 error = nameiat(ndp: &nd, dirfd: fd1);
5572 if (error) {
5573 return error;
5574 }
5575 vp = nd.ni_vp;
5576
5577 nameidone(&nd);
5578
5579 /*
5580 * Normally, linking to directories is not supported.
5581 * However, some file systems may have limited support.
5582 */
5583 if (vp->v_type == VDIR) {
5584 if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
5585 error = EPERM; /* POSIX */
5586 goto out;
5587 }
5588
5589 /* Linking to a directory requires ownership. */
5590 if (!kauth_cred_issuser(cred: vfs_context_ucred(ctx))) {
5591 struct vnode_attr dva;
5592
5593 VATTR_INIT(&dva);
5594 VATTR_WANTED(&dva, va_uid);
5595 if (vnode_getattr(vp, vap: &dva, ctx) != 0 ||
5596 !VATTR_IS_SUPPORTED(&dva, va_uid) ||
5597 (dva.va_uid != kauth_cred_getuid(cred: vfs_context_ucred(ctx)))) {
5598 error = EACCES;
5599 goto out;
5600 }
5601 }
5602 }
5603
5604 /* lookup the target node */
5605#if CONFIG_TRIGGERS
5606 nd.ni_op = OP_LINK;
5607#endif
5608 nd.ni_cnd.cn_nameiop = CREATE;
5609 nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
5610 nd.ni_dirp = link;
5611 error = nameiat(ndp: &nd, dirfd: fd2);
5612 if (error != 0) {
5613 goto out;
5614 }
5615 dvp = nd.ni_dvp;
5616 lvp = nd.ni_vp;
5617
5618#if CONFIG_MACF
5619 if ((error = mac_vnode_check_link(ctx, dvp, vp, cnp: &nd.ni_cnd)) != 0) {
5620 goto out2;
5621 }
5622#endif
5623
5624 /* or to anything that kauth doesn't want us to (eg. immutable items) */
5625 if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
5626 goto out2;
5627 }
5628
5629 /* target node must not exist */
5630 if (lvp != NULLVP) {
5631 error = EEXIST;
5632 goto out2;
5633 }
5634 /* cannot link across mountpoints */
5635 if (vnode_mount(vp) != vnode_mount(vp: dvp)) {
5636 error = EXDEV;
5637 goto out2;
5638 }
5639
5640 /* authorize creation of the target note */
5641 if ((error = vnode_authorize(vp: dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
5642 goto out2;
5643 }
5644
5645#if CONFIG_FILE_LEASES
5646 vnode_breakdirlease(vp: dvp, false, O_WRONLY);
5647#endif
5648
5649 /* and finally make the link */
5650 error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
5651 if (error) {
5652 goto out2;
5653 }
5654
5655#if CONFIG_MACF
5656 (void)mac_vnode_notify_link(ctx, vp, dvp, cnp: &nd.ni_cnd);
5657#endif
5658
5659#if CONFIG_FSE
5660 need_event = need_fsevent(FSE_CREATE_FILE, vp: dvp);
5661#else
5662 need_event = 0;
5663#endif
5664 has_listeners = kauth_authorize_fileop_has_listeners();
5665
5666 need_kpath2 = 0;
5667#if CONFIG_AUDIT
5668 if (AUDIT_RECORD_EXISTS()) {
5669 need_kpath2 = 1;
5670 }
5671#endif
5672
5673 if (need_event || has_listeners || need_kpath2) {
5674 char *link_to_path = NULL;
5675 int len, link_name_len;
5676 int len_no_firmlink_path = 0;
5677
5678 /* build the path to the new link file */
5679 GET_PATH(target_path);
5680
5681 len = safe_getpath(dvp, leafname: nd.ni_cnd.cn_nameptr, path: target_path, MAXPATHLEN, truncated_path: &truncated);
5682 if (no_firmlink_path == NULL) {
5683 GET_PATH(no_firmlink_path);
5684 }
5685 len_no_firmlink_path = safe_getpath_no_firmlink(dvp, leafname: nd.ni_cnd.cn_nameptr, path: no_firmlink_path, MAXPATHLEN, truncated_path: &truncated_no_firmlink_path);
5686
5687 AUDIT_ARG(kpath, target_path, ARG_KPATH2);
5688
5689 if (has_listeners) {
5690 /* build the path to file we are linking to */
5691 GET_PATH(link_to_path);
5692
5693 link_name_len = MAXPATHLEN;
5694 if (vn_getpath(vp, pathbuf: link_to_path, len: &link_name_len) == 0) {
5695 /*
5696 * Call out to allow 3rd party notification of rename.
5697 * Ignore result of kauth_authorize_fileop call.
5698 */
5699 kauth_authorize_fileop(credential: vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
5700 arg0: (uintptr_t)link_to_path,
5701 arg1: (uintptr_t)target_path);
5702 }
5703 if (link_to_path != NULL) {
5704 RELEASE_PATH(link_to_path);
5705 }
5706 }
5707#if CONFIG_FSE
5708 if (need_event) {
5709 /* construct fsevent */
5710 if (get_fse_info(vp, fse: &finfo, ctx) == 0) {
5711 if (truncated_no_firmlink_path) {
5712 finfo.mode |= FSE_TRUNCATED_PATH;
5713 }
5714
5715 // build the path to the destination of the link
5716 add_fsevent(FSE_CREATE_FILE, ctx,
5717 FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
5718 FSE_ARG_FINFO, &finfo,
5719 FSE_ARG_DONE);
5720 }
5721
5722 pvp = vp->v_parent;
5723 // need an iocount on parent vnode in this case
5724 if (pvp && pvp != dvp) {
5725 pvp = vnode_getparent_if_different(vp, dvp);
5726 }
5727 if (pvp) {
5728 add_fsevent(FSE_STAT_CHANGED, ctx,
5729 FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
5730 }
5731 if (pvp && pvp != dvp) {
5732 vnode_put(vp: pvp);
5733 }
5734 }
5735#endif
5736 }
5737out2:
5738 /*
5739 * nameidone has to happen before we vnode_put(dvp)
5740 * since it may need to release the fs_nodelock on the dvp
5741 */
5742 nameidone(&nd);
5743 if (target_path != NULL) {
5744 RELEASE_PATH(target_path);
5745 }
5746 if (no_firmlink_path != NULL) {
5747 RELEASE_PATH(no_firmlink_path);
5748 no_firmlink_path = NULL;
5749 }
5750out:
5751 if (lvp) {
5752 vnode_put(vp: lvp);
5753 }
5754 if (dvp) {
5755 vnode_put(vp: dvp);
5756 }
5757 vnode_put(vp);
5758 return error;
5759}
5760
5761int
5762link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5763{
5764 return linkat_internal(ctx: vfs_context_current(), AT_FDCWD, path: uap->path,
5765 AT_FDCWD, link: uap->link, AT_SYMLINK_FOLLOW, segflg: UIO_USERSPACE);
5766}
5767
5768int
5769linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5770{
5771 if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5772 return EINVAL;
5773 }
5774
5775 return linkat_internal(ctx: vfs_context_current(), fd1: uap->fd1, path: uap->path,
5776 fd2: uap->fd2, link: uap->link, flag: uap->flag, segflg: UIO_USERSPACE);
5777}
5778
5779/*
5780 * Make a symbolic link.
5781 *
5782 * We could add support for ACLs here too...
5783 */
5784/* ARGSUSED */
5785static int
5786symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
5787 user_addr_t link, enum uio_seg segflg)
5788{
5789 struct vnode_attr va;
5790 char *path;
5791 int error;
5792 struct nameidata nd;
5793 vnode_t vp, dvp;
5794 size_t dummy = 0;
5795 proc_t p;
5796
5797 error = 0;
5798 if (UIO_SEG_IS_USER_SPACE(segflg)) {
5799 path = zalloc(view: ZV_NAMEI);
5800 error = copyinstr(uaddr: path_data, kaddr: path, MAXPATHLEN, done: &dummy);
5801 } else {
5802 path = (char *)path_data;
5803 }
5804 if (error) {
5805 goto out;
5806 }
5807 AUDIT_ARG(text, path); /* This is the link string */
5808
5809 NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
5810 segflg, link, ctx);
5811
5812 error = nameiat(ndp: &nd, dirfd: fd);
5813 if (error) {
5814 goto out;
5815 }
5816 dvp = nd.ni_dvp;
5817 vp = nd.ni_vp;
5818
5819 p = vfs_context_proc(ctx);
5820 VATTR_INIT(&va);
5821 VATTR_SET(&va, va_type, VLNK);
5822 VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);
5823
5824#if CONFIG_MACF
5825 error = mac_vnode_check_create(ctx,
5826 dvp, cnp: &nd.ni_cnd, vap: &va);
5827#endif
5828 if (error != 0) {
5829 goto skipit;
5830 }
5831
5832 if (vp != NULL) {
5833 error = EEXIST;
5834 goto skipit;
5835 }
5836
5837 /* authorize */
5838 if (error == 0) {
5839 error = vnode_authorize(vp: dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
5840 }
5841 /* get default ownership, etc. */
5842 if (error == 0) {
5843 error = vnode_authattr_new(dvp, vap: &va, noauth: 0, ctx);
5844 }
5845
5846#if CONFIG_FILE_LEASES
5847 vnode_breakdirlease(vp: dvp, false, O_WRONLY);
5848#endif
5849
5850 if (error == 0) {
5851 error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
5852 }
5853
5854 /* do fallback attribute handling */
5855 if (error == 0 && vp) {
5856 error = vnode_setattr_fallback(vp, vap: &va, ctx);
5857 }
5858
5859#if CONFIG_MACF
5860 if (error == 0 && vp) {
5861 error = vnode_label(mp: vnode_mount(vp), dvp, vp, cnp: &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
5862 }
5863#endif
5864
5865 if (error == 0) {
5866 int update_flags = 0;
5867
5868 /*check if a new vnode was created, else try to get one*/
5869 if (vp == NULL) {
5870 nd.ni_cnd.cn_nameiop = LOOKUP;
5871#if CONFIG_TRIGGERS
5872 nd.ni_op = OP_LOOKUP;
5873#endif
5874 /*
5875 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
5876 * reallocated again in namei().
5877 */
5878 nd.ni_cnd.cn_flags &= HASBUF;
5879 error = nameiat(ndp: &nd, dirfd: fd);
5880 if (error) {
5881 goto skipit;
5882 }
5883 vp = nd.ni_vp;
5884 }
5885
5886#if 0 /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
5887 /* call out to allow 3rd party notification of rename.
5888 * Ignore result of kauth_authorize_fileop call.
5889 */
5890 if (kauth_authorize_fileop_has_listeners() &&
5891 namei(&nd) == 0) {
5892 char *new_link_path = NULL;
5893 int len;
5894
5895 /* build the path to the new link file */
5896 new_link_path = get_pathbuff();
5897 len = MAXPATHLEN;
5898 vn_getpath(dvp, new_link_path, &len);
5899 if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
5900 new_link_path[len - 1] = '/';
5901 strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
5902 }
5903
5904 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
5905 (uintptr_t)path, (uintptr_t)new_link_path);
5906 if (new_link_path != NULL) {
5907 release_pathbuff(new_link_path);
5908 }
5909 }
5910#endif
5911 // Make sure the name & parent pointers are hooked up
5912 if (vp->v_name == NULL) {
5913 update_flags |= VNODE_UPDATE_NAME;
5914 }
5915 if (vp->v_parent == NULLVP) {
5916 update_flags |= VNODE_UPDATE_PARENT;
5917 }
5918
5919 if (update_flags) {
5920 vnode_update_identity(vp, dvp, name: nd.ni_cnd.cn_nameptr, name_len: nd.ni_cnd.cn_namelen, name_hashval: nd.ni_cnd.cn_hash, flags: update_flags);
5921 }
5922
5923#if CONFIG_FSE
5924 add_fsevent(FSE_CREATE_FILE, ctx,
5925 FSE_ARG_VNODE, vp,
5926 FSE_ARG_DONE);
5927#endif
5928 }
5929
5930skipit:
5931 /*
5932 * nameidone has to happen before we vnode_put(dvp)
5933 * since it may need to release the fs_nodelock on the dvp
5934 */
5935 nameidone(&nd);
5936
5937 if (vp) {
5938 vnode_put(vp);
5939 }
5940 vnode_put(vp: dvp);
5941out:
5942 if (path && (path != (char *)path_data)) {
5943 zfree(ZV_NAMEI, path);
5944 }
5945
5946 return error;
5947}
5948
5949int
5950symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
5951{
5952 return symlinkat_internal(ctx: vfs_context_current(), path_data: uap->path, AT_FDCWD,
5953 link: uap->link, segflg: UIO_USERSPACE);
5954}
5955
5956int
5957symlinkat(__unused proc_t p, struct symlinkat_args *uap,
5958 __unused int32_t *retval)
5959{
5960 return symlinkat_internal(ctx: vfs_context_current(), path_data: uap->path1, fd: uap->fd,
5961 link: uap->path2, segflg: UIO_USERSPACE);
5962}
5963
5964/*
5965 * Delete a whiteout from the filesystem.
5966 * No longer supported.
5967 */
5968int
5969undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
5970{
5971 return ENOTSUP;
5972}
5973
5974/*
5975 * Delete a name from the filesystem.
5976 */
5977/* ARGSUSED */
5978static int
5979unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
5980 user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
5981{
5982 struct {
5983 struct nameidata nd;
5984#if CONFIG_FSE
5985 struct vnode_attr va;
5986 fse_info finfo;
5987#endif
5988 } *__unlink_data;
5989 struct nameidata *ndp;
5990 vnode_t vp, dvp;
5991 int error;
5992 struct componentname *cnp;
5993 char *path = NULL;
5994 char *no_firmlink_path = NULL;
5995 int len_path = 0;
5996 int len_no_firmlink_path = 0;
5997 int flags;
5998 int need_event;
5999 int has_listeners;
6000 int truncated_path;
6001 int truncated_no_firmlink_path;
6002 int batched;
6003 struct vnode_attr *vap;
6004 int do_retry;
6005 int retry_count = 0;
6006 int cn_flags;
6007 int nofollow_any = 0;
6008
6009 cn_flags = LOCKPARENT;
6010 if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
6011 cn_flags |= AUDITVNPATH1;
6012 }
6013 if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
6014 nofollow_any = NAMEI_NOFOLLOW_ANY;
6015 unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
6016 }
6017 /* If a starting dvp is passed, it trumps any fd passed. */
6018 if (start_dvp) {
6019 cn_flags |= USEDVP;
6020 }
6021
6022#if NAMEDRSRCFORK
6023 /* unlink or delete is allowed on rsrc forks and named streams */
6024 cn_flags |= CN_ALLOWRSRCFORK;
6025#endif
6026
6027 __unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
6028 ndp = &__unlink_data->nd;
6029#if CONFIG_FSE
6030 fse_info *finfop = &__unlink_data->finfo;
6031#endif
6032
6033retry:
6034 do_retry = 0;
6035 flags = 0;
6036 need_event = 0;
6037 has_listeners = 0;
6038 truncated_path = 0;
6039 truncated_no_firmlink_path = 0;
6040 vap = NULL;
6041
6042 NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);
6043
6044 ndp->ni_dvp = start_dvp;
6045 ndp->ni_flag |= NAMEI_COMPOUNDREMOVE | nofollow_any;
6046 cnp = &ndp->ni_cnd;
6047
6048continue_lookup:
6049 error = nameiat(ndp, dirfd: fd);
6050 if (error) {
6051 goto early_out;
6052 }
6053
6054 dvp = ndp->ni_dvp;
6055 vp = ndp->ni_vp;
6056
6057 /* With Carbon delete semantics, busy files cannot be deleted */
6058 if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
6059 flags |= VNODE_REMOVE_NODELETEBUSY;
6060 }
6061
6062 /* Skip any potential upcalls if told to. */
6063 if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
6064 flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
6065 }
6066
6067 if (vp) {
6068 batched = vnode_compound_remove_available(vp);
6069 /*
6070 * The root of a mounted filesystem cannot be deleted.
6071 */
6072 if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
6073 error = EBUSY;
6074 goto out;
6075 }
6076
6077#if DEVELOPMENT || DEBUG
6078 /*
6079 * XXX VSWAP: Check for entitlements or special flag here
6080 * so we can restrict access appropriately.
6081 */
6082#else /* DEVELOPMENT || DEBUG */
6083
6084 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
6085 error = EPERM;
6086 goto out;
6087 }
6088#endif /* DEVELOPMENT || DEBUG */
6089
6090 if (!batched) {
6091 error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
6092 if (error) {
6093 if (error == ENOENT) {
6094 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
6095 do_retry = 1;
6096 retry_count++;
6097 }
6098 }
6099 goto out;
6100 }
6101 }
6102 } else {
6103 batched = 1;
6104
6105 if (!vnode_compound_remove_available(vp: dvp)) {
6106 panic("No vp, but no compound remove?");
6107 }
6108 }
6109
6110#if CONFIG_FSE
6111 need_event = need_fsevent(FSE_DELETE, vp: dvp);
6112 if (need_event) {
6113 if (!batched) {
6114 if ((vp->v_flag & VISHARDLINK) == 0) {
6115 /* XXX need to get these data in batched VNOP */
6116 get_fse_info(vp, fse: finfop, ctx);
6117 }
6118 } else {
6119 error =
6120 vfs_get_notify_attributes(vap: &__unlink_data->va);
6121 if (error) {
6122 goto out;
6123 }
6124
6125 vap = &__unlink_data->va;
6126 }
6127 }
6128#endif
6129 has_listeners = kauth_authorize_fileop_has_listeners();
6130 if (need_event || has_listeners) {
6131 if (path == NULL) {
6132 GET_PATH(path);
6133 }
6134 len_path = safe_getpath(dvp, leafname: ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, truncated_path: &truncated_path);
6135 if (no_firmlink_path == NULL) {
6136 GET_PATH(no_firmlink_path);
6137 }
6138 len_no_firmlink_path = safe_getpath_no_firmlink(dvp, leafname: ndp->ni_cnd.cn_nameptr, path: no_firmlink_path, MAXPATHLEN, truncated_path: &truncated_no_firmlink_path);
6139 }
6140
6141#if NAMEDRSRCFORK
6142 if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
6143 error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
6144 } else
6145#endif
6146 {
6147#if CONFIG_FILE_LEASES
6148 vnode_breakdirlease(vp: dvp, false, O_WRONLY);
6149#endif
6150
6151 error = vn_remove(dvp, vpp: &ndp->ni_vp, ndp, flags, vap, ctx);
6152 vp = ndp->ni_vp;
6153 if (error == EKEEPLOOKING) {
6154 if (!batched) {
6155 panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
6156 }
6157
6158 if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
6159 panic("EKEEPLOOKING, but continue flag not set?");
6160 }
6161
6162 if (vnode_isdir(vp)) {
6163 error = EISDIR;
6164 goto out;
6165 }
6166 goto continue_lookup;
6167 } else if (error == ENOENT && batched) {
6168 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
6169 /*
6170 * For compound VNOPs, the authorization callback may
6171 * return ENOENT in case of racing hardlink lookups
6172 * hitting the name cache, redrive the lookup.
6173 */
6174 do_retry = 1;
6175 retry_count += 1;
6176 goto out;
6177 }
6178 }
6179 }
6180
6181 /*
6182 * Call out to allow 3rd party notification of delete.
6183 * Ignore result of kauth_authorize_fileop call.
6184 */
6185 if (!error) {
6186 if (has_listeners) {
6187 kauth_authorize_fileop(credential: vfs_context_ucred(ctx),
6188 KAUTH_FILEOP_DELETE,
6189 arg0: (uintptr_t)vp,
6190 arg1: (uintptr_t)path);
6191 }
6192
6193 if (vp->v_flag & VISHARDLINK) {
6194 //
6195 // if a hardlink gets deleted we want to blow away the
6196 // v_parent link because the path that got us to this
6197 // instance of the link is no longer valid. this will
6198 // force the next call to get the path to ask the file
6199 // system instead of just following the v_parent link.
6200 //
6201 vnode_update_identity(vp, NULL, NULL, name_len: 0, name_hashval: 0, VNODE_UPDATE_PARENT);
6202 }
6203
6204#if CONFIG_FSE
6205 if (need_event) {
6206 if (vp->v_flag & VISHARDLINK) {
6207 get_fse_info(vp, fse: finfop, ctx);
6208 } else if (vap) {
6209 vnode_get_fse_info_from_vap(vp, fse: finfop, vap);
6210 }
6211 if (truncated_path) {
6212 finfop->mode |= FSE_TRUNCATED_PATH;
6213 }
6214 add_fsevent(FSE_DELETE, ctx,
6215 FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
6216 FSE_ARG_FINFO, finfop,
6217 FSE_ARG_DONE);
6218 }
6219#endif
6220
6221#if CONFIG_MACF
6222 mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
6223#endif
6224 }
6225
6226out:
6227 if (path != NULL) {
6228 RELEASE_PATH(path);
6229 path = NULL;
6230 }
6231
6232 if (no_firmlink_path != NULL) {
6233 RELEASE_PATH(no_firmlink_path);
6234 no_firmlink_path = NULL;
6235 }
6236#if NAMEDRSRCFORK
6237 /* recycle the deleted rsrc fork vnode to force a reclaim, which
6238 * will cause its shadow file to go away if necessary.
6239 */
6240 if (vp && (vnode_isnamedstream(vp)) &&
6241 (vp->v_parent != NULLVP) &&
6242 vnode_isshadow(vp)) {
6243 vnode_recycle(vp);
6244 }
6245#endif
6246 /*
6247 * nameidone has to happen before we vnode_put(dvp)
6248 * since it may need to release the fs_nodelock on the dvp
6249 */
6250 nameidone(ndp);
6251 vnode_put(vp: dvp);
6252 if (vp) {
6253 vnode_put(vp);
6254 }
6255
6256 if (do_retry) {
6257 goto retry;
6258 }
6259
6260early_out:
6261 kfree_type(typeof(*__unlink_data), __unlink_data);
6262 return error;
6263}
6264
6265int
6266unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
6267 enum uio_seg segflg, int unlink_flags)
6268{
6269 return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
6270 unlink_flags);
6271}
6272
6273/*
6274 * Delete a name from the filesystem using Carbon semantics.
6275 */
6276int
6277delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
6278{
6279 return unlinkat_internal(ctx: vfs_context_current(), AT_FDCWD, NULLVP,
6280 path_arg: uap->path, segflg: UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
6281}
6282
6283/*
6284 * Delete a name from the filesystem using POSIX semantics.
6285 */
6286int
6287unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
6288{
6289 return unlinkat_internal(ctx: vfs_context_current(), AT_FDCWD, NULLVP,
6290 path_arg: uap->path, segflg: UIO_USERSPACE, unlink_flags: 0);
6291}
6292
6293int
6294unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6295{
6296 int unlink_flags = 0;
6297
6298 if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS | AT_SYMLINK_NOFOLLOW_ANY)) {
6299 return EINVAL;
6300 }
6301
6302 if (uap->flag & AT_SYMLINK_NOFOLLOW_ANY) {
6303 unlink_flags |= VNODE_REMOVE_NOFOLLOW_ANY;
6304 }
6305
6306 if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6307 if (uap->flag & AT_REMOVEDIR_DATALESS) {
6308 unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6309 }
6310 return rmdirat_internal(vfs_context_current(), uap->fd,
6311 uap->path, UIO_USERSPACE, unlink_flags);
6312 } else {
6313 return unlinkat_internal(ctx: vfs_context_current(), fd: uap->fd,
6314 NULLVP, path_arg: uap->path, segflg: UIO_USERSPACE, unlink_flags);
6315 }
6316}
6317
6318/*
6319 * Reposition read/write file offset.
6320 */
6321int
6322lseek(proc_t p, struct lseek_args *uap, off_t *retval)
6323{
6324 struct fileproc *fp;
6325 vnode_t vp;
6326 struct vfs_context *ctx;
6327 off_t offset = uap->offset, file_size;
6328 int error;
6329
6330 if ((error = fp_getfvp(p, fd: uap->fd, resultfp: &fp, resultvp: &vp))) {
6331 if (error == ENOTSUP) {
6332 return ESPIPE;
6333 }
6334 return error;
6335 }
6336 if (vnode_isfifo(vp)) {
6337 file_drop(uap->fd);
6338 return ESPIPE;
6339 }
6340
6341
6342 ctx = vfs_context_current();
6343#if CONFIG_MACF
6344 if (uap->whence == L_INCR && uap->offset == 0) {
6345 error = mac_file_check_get_offset(cred: vfs_context_ucred(ctx),
6346 fg: fp->fp_glob);
6347 } else {
6348 error = mac_file_check_change_offset(cred: vfs_context_ucred(ctx),
6349 fg: fp->fp_glob);
6350 }
6351 if (error) {
6352 file_drop(uap->fd);
6353 return error;
6354 }
6355#endif
6356 if ((error = vnode_getwithref(vp))) {
6357 file_drop(uap->fd);
6358 return error;
6359 }
6360
6361 switch (uap->whence) {
6362 case L_INCR:
6363 offset += fp->fp_glob->fg_offset;
6364 break;
6365 case L_XTND:
6366 if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
6367 break;
6368 }
6369 offset += file_size;
6370 break;
6371 case L_SET:
6372 break;
6373 case SEEK_HOLE:
6374 error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, data: (caddr_t)&offset, fflag: 0, ctx);
6375 break;
6376 case SEEK_DATA:
6377 error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, data: (caddr_t)&offset, fflag: 0, ctx);
6378 break;
6379 default:
6380 error = EINVAL;
6381 }
6382 if (error == 0) {
6383 if (uap->offset > 0 && offset < 0) {
6384 /* Incremented/relative move past max size */
6385 error = EOVERFLOW;
6386 } else {
6387 /*
6388 * Allow negative offsets on character devices, per
6389 * POSIX 1003.1-2001. Most likely for writing disk
6390 * labels.
6391 */
6392 if (offset < 0 && vp->v_type != VCHR) {
6393 /* Decremented/relative move before start */
6394 error = EINVAL;
6395 } else {
6396 /* Success */
6397 fp->fp_glob->fg_offset = offset;
6398 *retval = fp->fp_glob->fg_offset;
6399 }
6400 }
6401 }
6402
6403 /*
6404 * An lseek can affect whether data is "available to read." Use
6405 * hint of NOTE_NONE so no EVFILT_VNODE events fire
6406 */
6407 post_event_if_success(vp, error, NOTE_NONE);
6408 (void)vnode_put(vp);
6409 file_drop(uap->fd);
6410 return error;
6411}
6412
6413
6414/*
6415 * Check access permissions.
6416 *
6417 * Returns: 0 Success
6418 * vnode_authorize:???
6419 */
6420static int
6421access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6422{
6423 kauth_action_t action;
6424 int error;
6425
6426 /*
6427 * If just the regular access bits, convert them to something
6428 * that vnode_authorize will understand.
6429 */
6430 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6431 action = 0;
6432 if (uflags & R_OK) {
6433 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
6434 }
6435 if (uflags & W_OK) {
6436 if (vnode_isdir(vp)) {
6437 action |= KAUTH_VNODE_ADD_FILE |
6438 KAUTH_VNODE_ADD_SUBDIRECTORY;
6439 /* might want delete rights here too */
6440 } else {
6441 action |= KAUTH_VNODE_WRITE_DATA;
6442 }
6443 }
6444 if (uflags & X_OK) {
6445 if (vnode_isdir(vp)) {
6446 action |= KAUTH_VNODE_SEARCH;
6447 } else {
6448 action |= KAUTH_VNODE_EXECUTE;
6449 }
6450 }
6451 } else {
6452 /* take advantage of definition of uflags */
6453 action = uflags >> 8;
6454 }
6455
6456#if CONFIG_MACF
6457 error = mac_vnode_check_access(ctx, vp, acc_mode: uflags);
6458 if (error) {
6459 return error;
6460 }
6461#endif /* MAC */
6462
6463 /* action == 0 means only check for existence */
6464 if (action != 0) {
6465 error = vnode_authorize(vp, dvp, action: action | KAUTH_VNODE_ACCESS, ctx);
6466 } else {
6467 error = 0;
6468 }
6469
6470 return error;
6471}
6472
6473
6474
6475/*
6476 * access_extended: Check access permissions in bulk.
6477 *
6478 * Description: uap->entries Pointer to an array of accessx
6479 * descriptor structs, plus one or
6480 * more NULL terminated strings (see
6481 * "Notes" section below).
6482 * uap->size Size of the area pointed to by
6483 * uap->entries.
6484 * uap->results Pointer to the results array.
6485 *
6486 * Returns: 0 Success
6487 * ENOMEM Insufficient memory
6488 * EINVAL Invalid arguments
6489 * namei:EFAULT Bad address
6490 * namei:ENAMETOOLONG Filename too long
6491 * namei:ENOENT No such file or directory
6492 * namei:ELOOP Too many levels of symbolic links
6493 * namei:EBADF Bad file descriptor
6494 * namei:ENOTDIR Not a directory
6495 * namei:???
6496 * access1:
6497 *
6498 * Implicit returns:
6499 * uap->results Array contents modified
6500 *
6501 * Notes: The uap->entries are structured as an arbitrary length array
6502 * of accessx descriptors, followed by one or more NULL terminated
6503 * strings
6504 *
6505 * struct accessx_descriptor[0]
6506 * ...
6507 * struct accessx_descriptor[n]
6508 * char name_data[0];
6509 *
6510 * We determine the entry count by walking the buffer containing
6511 * the uap->entries argument descriptor. For each descriptor we
6512 * see, the valid values for the offset ad_name_offset will be
6513 * in the byte range:
6514 *
6515 * [ uap->entries + sizeof(struct accessx_descriptor) ]
6516 * to
6517 * [ uap->entries + uap->size - 2 ]
6518 *
6519 * since we must have at least one string, and the string must
6520 * be at least one character plus the NULL terminator in length.
6521 *
6522 * XXX: Need to support the check-as uid argument
6523 */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual = 0;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory. Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o There must be sufficient room in the request for at least one
	 * descriptor and a one byte NUL terminated string.
	 * o The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	/* small requests are served out of the on-stack buffer */
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kalloc_data(uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid namei() running
	 * off the end. If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity. This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(cred: kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS. If the number being
	 * requested does not exceed this limit,
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(vp: dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode. We will need it if we are
			 * deleting, since we must have rights to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(ndp: &nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			/* per-entry failure: record it and continue with the next entry */
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, uflags: input[i].ad_flags, ctx: &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input) {
		kfree_data(input, uap->size);
	}
	if (result) {
		kfree_data(result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(vp: dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6765
6766
6767/*
6768 * Returns: 0 Success
6769 * namei:EFAULT Bad address
6770 * namei:ENAMETOOLONG Filename too long
6771 * namei:ENOENT No such file or directory
6772 * namei:ELOOP Too many levels of symbolic links
6773 * namei:EBADF Bad file descriptor
6774 * namei:ENOTDIR Not a directory
6775 * namei:???
6776 * access1:
6777 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity. So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		context.vc_ucred = kauth_cred_copy_real(cred: kauth_cred_get());
	} else {
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	/* either NOFOLLOW flag suppresses following of a trailing symlink */
	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* also refuse symlinks in intermediate path components */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(ndp: &nd, dirfd: fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(vp: nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(vp: nd.ni_vp);
	}
#endif

	/* perform the actual permission evaluation */
	error = access1(vp: nd.ni_vp, dvp: nd.ni_dvp, uflags: amode, ctx: &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(vp: nd.ni_vp);
	}
#endif

	vnode_put(vp: nd.ni_vp);
	if (amode & _DELETE_OK) {
		/* release the parent obtained via WANTPARENT */
		vnode_put(vp: nd.ni_dvp);
	}
	nameidone(&nd);

out:
	if (!(flag & AT_EACCESS)) {
		/* drop the real-identity credential taken above */
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6859
6860int
6861access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6862{
6863 return faccessat_internal(ctx: vfs_context_current(), AT_FDCWD,
6864 path: uap->path, amode: uap->flags, flag: 0, segflg: UIO_USERSPACE);
6865}
6866
6867int
6868faccessat(__unused proc_t p, struct faccessat_args *uap,
6869 __unused int32_t *retval)
6870{
6871 if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
6872 return EINVAL;
6873 }
6874
6875 return faccessat_internal(ctx: vfs_context_current(), fd: uap->fd,
6876 path: uap->path, amode: uap->amode, flag: uap->flag, segflg: UIO_USERSPACE);
6877}
6878
6879/*
6880 * Returns: 0 Success
6881 * EFAULT
6882 * copyout:EFAULT
6883 * namei:???
6884 * vn_stat:???
6885 */
6886static int
6887fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6888 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6889 enum uio_seg segflg, int fd, int flag)
6890{
6891 struct nameidata *ndp = NULL;
6892 int follow;
6893 union {
6894 struct stat sb;
6895 struct stat64 sb64;
6896 } source = {};
6897 union {
6898 struct user64_stat user64_sb;
6899 struct user32_stat user32_sb;
6900 struct user64_stat64 user64_sb64;
6901 struct user32_stat64 user32_sb64;
6902 } dest = {};
6903 caddr_t sbp;
6904 int error, my_size;
6905 kauth_filesec_t fsec = KAUTH_FILESEC_NONE;
6906 size_t xsecurity_bufsize;
6907 void * statptr;
6908 struct fileproc *fp = NULL;
6909 int needsrealdev = 0;
6910
6911 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
6912 ndp = kalloc_type(struct nameidata, Z_WAITOK);
6913 NDINIT(ndp, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6914 segflg, path, ctx);
6915 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
6916 ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
6917 }
6918
6919#if NAMEDRSRCFORK
6920 int is_namedstream = 0;
6921 /* stat calls are allowed for resource forks. */
6922 ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6923#endif
6924
6925 if (flag & AT_FDONLY) {
6926 vnode_t fvp;
6927
6928 error = fp_getfvp(p: vfs_context_proc(ctx), fd, resultfp: &fp, resultvp: &fvp);
6929 if (error) {
6930 goto out;
6931 }
6932 if ((error = vnode_getwithref(vp: fvp))) {
6933 file_drop(fd);
6934 goto out;
6935 }
6936 ndp->ni_vp = fvp;
6937 } else {
6938 error = nameiat(ndp, dirfd: fd);
6939 if (error) {
6940 goto out;
6941 }
6942 }
6943
6944 statptr = (void *)&source;
6945
6946#if NAMEDRSRCFORK
6947 /* Grab reference on the shadow stream file vnode to
6948 * force an inactive on release which will mark it
6949 * for recycle.
6950 */
6951 if (vnode_isnamedstream(vp: ndp->ni_vp) &&
6952 (ndp->ni_vp->v_parent != NULLVP) &&
6953 vnode_isshadow(ndp->ni_vp)) {
6954 is_namedstream = 1;
6955 vnode_ref(vp: ndp->ni_vp);
6956 }
6957#endif
6958
6959 needsrealdev = flag & AT_REALDEV ? 1 : 0;
6960 if (fp && (xsecurity == USER_ADDR_NULL)) {
6961 /*
6962 * If the caller has the file open, and is not
6963 * requesting extended security information, we are
6964 * going to let them get the basic stat information.
6965 */
6966 error = vn_stat_noauth(vp: ndp->ni_vp, sb: statptr, NULL, isstat64, needsrealdev, ctx,
6967 file_cred: fp->fp_glob->fg_cred);
6968 } else {
6969 error = vn_stat(vp: ndp->ni_vp, sb: statptr, xsec: (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6970 isstat64, needsrealdev, ctx);
6971 }
6972
6973#if NAMEDRSRCFORK
6974 if (is_namedstream) {
6975 vnode_rele(vp: ndp->ni_vp);
6976 }
6977#endif
6978 vnode_put(vp: ndp->ni_vp);
6979 nameidone(ndp);
6980
6981 if (fp) {
6982 file_drop(fd);
6983 fp = NULL;
6984 }
6985
6986 if (error) {
6987 goto out;
6988 }
6989 /* Zap spare fields */
6990 if (isstat64 != 0) {
6991 source.sb64.st_lspare = 0;
6992 source.sb64.st_qspare[0] = 0LL;
6993 source.sb64.st_qspare[1] = 0LL;
6994 if (vfs_context_is64bit(ctx)) {
6995 munge_user64_stat64(sbp: &source.sb64, usbp: &dest.user64_sb64);
6996 my_size = sizeof(dest.user64_sb64);
6997 sbp = (caddr_t)&dest.user64_sb64;
6998 } else {
6999 munge_user32_stat64(sbp: &source.sb64, usbp: &dest.user32_sb64);
7000 my_size = sizeof(dest.user32_sb64);
7001 sbp = (caddr_t)&dest.user32_sb64;
7002 }
7003 /*
7004 * Check if we raced (post lookup) against the last unlink of a file.
7005 */
7006 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
7007 source.sb64.st_nlink = 1;
7008 }
7009 } else {
7010 source.sb.st_lspare = 0;
7011 source.sb.st_qspare[0] = 0LL;
7012 source.sb.st_qspare[1] = 0LL;
7013 if (vfs_context_is64bit(ctx)) {
7014 munge_user64_stat(sbp: &source.sb, usbp: &dest.user64_sb);
7015 my_size = sizeof(dest.user64_sb);
7016 sbp = (caddr_t)&dest.user64_sb;
7017 } else {
7018 munge_user32_stat(sbp: &source.sb, usbp: &dest.user32_sb);
7019 my_size = sizeof(dest.user32_sb);
7020 sbp = (caddr_t)&dest.user32_sb;
7021 }
7022
7023 /*
7024 * Check if we raced (post lookup) against the last unlink of a file.
7025 */
7026 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
7027 source.sb.st_nlink = 1;
7028 }
7029 }
7030 if ((error = copyout(sbp, ub, my_size)) != 0) {
7031 goto out;
7032 }
7033
7034 /* caller wants extended security information? */
7035 if (xsecurity != USER_ADDR_NULL) {
7036 /* did we get any? */
7037 if (fsec == KAUTH_FILESEC_NONE) {
7038 if (susize(xsecurity_size, 0) != 0) {
7039 error = EFAULT;
7040 goto out;
7041 }
7042 } else {
7043 /* find the user buffer size */
7044 xsecurity_bufsize = fusize(xsecurity_size);
7045
7046 /* copy out the actual data size */
7047 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
7048 error = EFAULT;
7049 goto out;
7050 }
7051
7052 /* if the caller supplied enough room, copy out to it */
7053 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
7054 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
7055 }
7056 }
7057 }
7058out:
7059 if (ndp) {
7060 kfree_type(struct nameidata, ndp);
7061 }
7062 if (fsec != KAUTH_FILESEC_NONE) {
7063 kauth_filesec_free(fsp: fsec);
7064 }
7065 return error;
7066}
7067
7068/*
7069 * stat_extended: Get file status; with extended security (ACL).
7070 *
7071 * Parameters: p (ignored)
7072 * uap User argument descriptor (see below)
7073 * retval (ignored)
7074 *
7075 * Indirect: uap->path Path of file to get status from
7076 * uap->ub User buffer (holds file status info)
7077 * uap->xsecurity ACL to get (extended security)
7078 * uap->xsecurity_size Size of ACL
7079 *
7080 * Returns: 0 Success
7081 * !0 errno value
7082 *
7083 */
7084int
7085stat_extended(__unused proc_t p, struct stat_extended_args *uap,
7086 __unused int32_t *retval)
7087{
7088 return fstatat_internal(ctx: vfs_context_current(), path: uap->path, ub: uap->ub,
7089 xsecurity: uap->xsecurity, xsecurity_size: uap->xsecurity_size, isstat64: 0, segflg: UIO_USERSPACE, AT_FDCWD,
7090 flag: 0);
7091}
7092
7093/*
7094 * Returns: 0 Success
7095 * fstatat_internal:??? [see fstatat_internal() in this file]
7096 */
7097int
7098stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7099{
7100 return fstatat_internal(ctx: vfs_context_current(), path: uap->path, ub: uap->ub,
7101 xsecurity: 0, xsecurity_size: 0, isstat64: 0, segflg: UIO_USERSPACE, AT_FDCWD, flag: 0);
7102}
7103
7104int
7105stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
7106{
7107 return fstatat_internal(ctx: vfs_context_current(), path: uap->path, ub: uap->ub,
7108 xsecurity: 0, xsecurity_size: 0, isstat64: 1, segflg: UIO_USERSPACE, AT_FDCWD, flag: 0);
7109}
7110
7111/*
7112 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7113 *
7114 * Parameters: p (ignored)
7115 * uap User argument descriptor (see below)
7116 * retval (ignored)
7117 *
7118 * Indirect: uap->path Path of file to get status from
7119 * uap->ub User buffer (holds file status info)
7120 * uap->xsecurity ACL to get (extended security)
7121 * uap->xsecurity_size Size of ACL
7122 *
7123 * Returns: 0 Success
7124 * !0 errno value
7125 *
7126 */
7127int
7128stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
7129{
7130 return fstatat_internal(ctx: vfs_context_current(), path: uap->path, ub: uap->ub,
7131 xsecurity: uap->xsecurity, xsecurity_size: uap->xsecurity_size, isstat64: 1, segflg: UIO_USERSPACE, AT_FDCWD,
7132 flag: 0);
7133}
7134
7135/*
7136 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7137 *
7138 * Parameters: p (ignored)
7139 * uap User argument descriptor (see below)
7140 * retval (ignored)
7141 *
7142 * Indirect: uap->path Path of file to get status from
7143 * uap->ub User buffer (holds file status info)
7144 * uap->xsecurity ACL to get (extended security)
7145 * uap->xsecurity_size Size of ACL
7146 *
7147 * Returns: 0 Success
7148 * !0 errno value
7149 *
7150 */
7151int
7152lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
7153{
7154 return fstatat_internal(ctx: vfs_context_current(), path: uap->path, ub: uap->ub,
7155 xsecurity: uap->xsecurity, xsecurity_size: uap->xsecurity_size, isstat64: 0, segflg: UIO_USERSPACE, AT_FDCWD,
7156 AT_SYMLINK_NOFOLLOW);
7157}
7158
7159/*
7160 * Get file status; this version does not follow links.
7161 */
7162int
7163lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7164{
7165 return fstatat_internal(ctx: vfs_context_current(), path: uap->path, ub: uap->ub,
7166 xsecurity: 0, xsecurity_size: 0, isstat64: 0, segflg: UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7167}
7168
7169int
7170lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
7171{
7172 return fstatat_internal(ctx: vfs_context_current(), path: uap->path, ub: uap->ub,
7173 xsecurity: 0, xsecurity_size: 0, isstat64: 1, segflg: UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7174}
7175
7176/*
7177 * lstat64_extended: Get file status; can handle large inode numbers; does not
7178 * follow links; with extended security (ACL).
7179 *
7180 * Parameters: p (ignored)
7181 * uap User argument descriptor (see below)
7182 * retval (ignored)
7183 *
7184 * Indirect: uap->path Path of file to get status from
7185 * uap->ub User buffer (holds file status info)
7186 * uap->xsecurity ACL to get (extended security)
7187 * uap->xsecurity_size Size of ACL
7188 *
7189 * Returns: 0 Success
7190 * !0 errno value
7191 *
7192 */
7193int
7194lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
7195{
7196 return fstatat_internal(ctx: vfs_context_current(), path: uap->path, ub: uap->ub,
7197 xsecurity: uap->xsecurity, xsecurity_size: uap->xsecurity_size, isstat64: 1, segflg: UIO_USERSPACE, AT_FDCWD,
7198 AT_SYMLINK_NOFOLLOW);
7199}
7200
7201int
7202fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7203{
7204 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7205 return EINVAL;
7206 }
7207
7208 return fstatat_internal(ctx: vfs_context_current(), path: uap->path, ub: uap->ub,
7209 xsecurity: 0, xsecurity_size: 0, isstat64: 0, segflg: UIO_USERSPACE, fd: uap->fd, flag: uap->flag);
7210}
7211
7212int
7213fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7214 __unused int32_t *retval)
7215{
7216 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7217 return EINVAL;
7218 }
7219
7220 return fstatat_internal(ctx: vfs_context_current(), path: uap->path, ub: uap->ub,
7221 xsecurity: 0, xsecurity_size: 0, isstat64: 1, segflg: UIO_USERSPACE, fd: uap->fd, flag: uap->flag);
7222}
7223
7224/*
7225 * Get configurable pathname variables.
7226 *
7227 * Returns: 0 Success
7228 * namei:???
7229 * vn_pathconf:???
7230 *
7231 * Notes: Global implementation constants are intended to be
7232 * implemented in this function directly; all other constants
7233 * are per-FS implementation, and therefore must be handled in
7234 * each respective FS, instead.
7235 *
7236 * XXX We implement some things globally right now that should actually be
7237 * XXX per-FS; we will need to deal with this at some point.
7238 */
7239/* ARGSUSED */
7240int
7241pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
7242{
7243 int error;
7244 struct nameidata nd;
7245 vfs_context_t ctx = vfs_context_current();
7246
7247 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
7248 UIO_USERSPACE, uap->path, ctx);
7249 error = namei(ndp: &nd);
7250 if (error) {
7251 return error;
7252 }
7253
7254 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
7255
7256 vnode_put(vp: nd.ni_vp);
7257 nameidone(&nd);
7258 return error;
7259}
7260
7261/*
7262 * Return target name of a symbolic link.
7263 */
7264/* ARGSUSED */
7265static int
7266readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
7267 enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
7268 int *retval)
7269{
7270 vnode_t vp;
7271 uio_t auio;
7272 int error;
7273 struct nameidata nd;
7274 UIO_STACKBUF(uio_buf, 1);
7275 bool put_vnode;
7276
7277 if (bufsize > INT32_MAX) {
7278 return EINVAL;
7279 }
7280
7281 if (lnk_vp) {
7282 vp = lnk_vp;
7283 put_vnode = false;
7284 } else {
7285 NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
7286 seg, path, ctx);
7287
7288 error = nameiat(ndp: &nd, dirfd: fd);
7289 if (error) {
7290 return error;
7291 }
7292 vp = nd.ni_vp;
7293 put_vnode = true;
7294 nameidone(&nd);
7295 }
7296
7297 auio = uio_createwithbuffer(a_iovcount: 1, a_offset: 0, a_spacetype: bufseg, a_iodirection: UIO_READ,
7298 a_buf_p: &uio_buf[0], a_buffer_size: sizeof(uio_buf));
7299 uio_addiov(a_uio: auio, a_baseaddr: buf, a_length: bufsize);
7300 if (vp->v_type != VLNK) {
7301 error = EINVAL;
7302 } else {
7303#if CONFIG_MACF
7304 error = mac_vnode_check_readlink(ctx, vp);
7305#endif
7306 if (error == 0) {
7307 error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
7308 ctx);
7309 }
7310 if (error == 0) {
7311 error = VNOP_READLINK(vp, auio, ctx);
7312 }
7313 }
7314
7315 if (put_vnode) {
7316 vnode_put(vp);
7317 }
7318
7319 *retval = (int)(bufsize - uio_resid(a_uio: auio));
7320 return error;
7321}
7322
7323int
7324freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
7325{
7326 enum uio_seg procseg;
7327 vnode_t vp;
7328 int error;
7329
7330 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7331
7332 AUDIT_ARG(fd, uap->fd);
7333
7334 if ((error = file_vnode(uap->fd, &vp))) {
7335 return error;
7336 }
7337 if ((error = vnode_getwithref(vp))) {
7338 file_drop(uap->fd);
7339 return error;
7340 }
7341
7342 error = readlinkat_internal(ctx: vfs_context_current(), fd: -1,
7343 lnk_vp: vp, path: 0, seg: procseg, CAST_USER_ADDR_T(uap->buf),
7344 bufsize: uap->bufsize, bufseg: procseg, retval);
7345
7346 vnode_put(vp);
7347 file_drop(uap->fd);
7348 return error;
7349}
7350
7351int
7352readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7353{
7354 enum uio_seg procseg;
7355
7356 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7357 return readlinkat_internal(ctx: vfs_context_current(), AT_FDCWD, NULL,
7358 CAST_USER_ADDR_T(uap->path), seg: procseg, CAST_USER_ADDR_T(uap->buf),
7359 bufsize: uap->count, bufseg: procseg, retval);
7360}
7361
7362int
7363readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7364{
7365 enum uio_seg procseg;
7366
7367 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7368 return readlinkat_internal(ctx: vfs_context_current(), fd: uap->fd, NULL,
7369 CAST_USER_ADDR_T(uap->path), seg: procseg, buf: uap->buf, bufsize: uap->bufsize, bufseg: procseg,
7370 retval);
7371}
7372
7373/*
7374 * Change file flags, the deep inner layer.
7375 */
7376static int
7377chflags0(vnode_t vp, struct vnode_attr *va,
7378 int (*setattr)(vnode_t, void *, vfs_context_t),
7379 void *arg, vfs_context_t ctx)
7380{
7381 kauth_action_t action = 0;
7382 int error;
7383
7384#if CONFIG_MACF
7385 error = mac_vnode_check_setflags(ctx, vp, flags: va->va_flags);
7386 if (error) {
7387 goto out;
7388 }
7389#endif
7390
7391 /* request authorisation, disregard immutability */
7392 if ((error = vnode_authattr(vp, vap: va, actionp: &action, ctx)) != 0) {
7393 goto out;
7394 }
7395 /*
7396 * Request that the auth layer disregard those file flags it's allowed to when
7397 * authorizing this operation; we need to do this in order to be able to
7398 * clear immutable flags.
7399 */
7400 if (action && ((error = vnode_authorize(vp, NULL, action: action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
7401 goto out;
7402 }
7403 error = (*setattr)(vp, arg, ctx);
7404
7405#if CONFIG_MACF
7406 if (error == 0) {
7407 mac_vnode_notify_setflags(ctx, vp, flags: va->va_flags);
7408 }
7409#endif
7410
7411out:
7412 return error;
7413}
7414
7415/*
7416 * Change file flags.
7417 *
7418 * NOTE: this will vnode_put() `vp'
7419 */
7420static int
7421chflags1(vnode_t vp, int flags, vfs_context_t ctx)
7422{
7423 struct vnode_attr va;
7424 int error;
7425
7426 VATTR_INIT(&va);
7427 VATTR_SET(&va, va_flags, flags);
7428
7429 error = chflags0(vp, va: &va, setattr: (void *)vnode_setattr, arg: &va, ctx);
7430 vnode_put(vp);
7431
7432 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
7433 error = ENOTSUP;
7434 }
7435
7436 return error;
7437}
7438
7439/*
7440 * Change flags of a file given a path name.
7441 */
7442/* ARGSUSED */
7443int
7444chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
7445{
7446 vnode_t vp;
7447 vfs_context_t ctx = vfs_context_current();
7448 int error;
7449 struct nameidata nd;
7450 uint32_t wantparent = 0;
7451
7452#if CONFIG_FILE_LEASES
7453 wantparent = WANTPARENT;
7454#endif
7455
7456 AUDIT_ARG(fflags, uap->flags);
7457 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
7458 UIO_USERSPACE, uap->path, ctx);
7459 error = namei(ndp: &nd);
7460 if (error) {
7461 return error;
7462 }
7463 vp = nd.ni_vp;
7464
7465#if CONFIG_FILE_LEASES
7466 vnode_breakdirlease(vp: nd.ni_dvp, false, O_WRONLY);
7467 vnode_put(vp: nd.ni_dvp);
7468#endif
7469
7470 nameidone(&nd);
7471
7472 /* we don't vnode_put() here because chflags1 does internally */
7473 error = chflags1(vp, flags: uap->flags, ctx);
7474
7475 return error;
7476}
7477
7478/*
7479 * Change flags of a file given a file descriptor.
7480 */
7481/* ARGSUSED */
7482int
7483fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
7484{
7485 vnode_t vp;
7486 int error;
7487
7488 AUDIT_ARG(fd, uap->fd);
7489 AUDIT_ARG(fflags, uap->flags);
7490 if ((error = file_vnode(uap->fd, &vp))) {
7491 return error;
7492 }
7493
7494 if ((error = vnode_getwithref(vp))) {
7495 file_drop(uap->fd);
7496 return error;
7497 }
7498
7499 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7500
7501#if CONFIG_FILE_LEASES
7502 vnode_breakdirlease(vp, true, O_WRONLY);
7503#endif
7504
7505 /* we don't vnode_put() here because chflags1 does internally */
7506 error = chflags1(vp, flags: uap->flags, ctx: vfs_context_current());
7507
7508 file_drop(uap->fd);
7509 return error;
7510}
7511
7512/*
7513 * Change security information on a filesystem object.
7514 *
7515 * Returns: 0 Success
7516 * EPERM Operation not permitted
7517 * vnode_authattr:??? [anything vnode_authattr can return]
7518 * vnode_authorize:??? [anything vnode_authorize can return]
7519 * vnode_setattr:??? [anything vnode_setattr can return]
7520 *
7521 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
7522 * translated to EPERM before being returned.
7523 */
7524static int
7525chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
7526{
7527 kauth_action_t action;
7528 int error;
7529
7530 AUDIT_ARG(mode, vap->va_mode);
7531 /* XXX audit new args */
7532
7533#if NAMEDSTREAMS
7534 /* chmod calls are not allowed for resource forks. */
7535 if (vp->v_flag & VISNAMEDSTREAM) {
7536 return EPERM;
7537 }
7538#endif
7539
7540#if CONFIG_MACF
7541 if (VATTR_IS_ACTIVE(vap, va_mode) &&
7542 (error = mac_vnode_check_setmode(ctx, vp, mode: (mode_t)vap->va_mode)) != 0) {
7543 return error;
7544 }
7545
7546 if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
7547 if ((error = mac_vnode_check_setowner(ctx, vp,
7548 VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
7549 VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
7550 return error;
7551 }
7552 }
7553
7554 if (VATTR_IS_ACTIVE(vap, va_acl) &&
7555 (error = mac_vnode_check_setacl(ctx, vp, acl: vap->va_acl))) {
7556 return error;
7557 }
7558#endif
7559
7560 /* make sure that the caller is allowed to set this security information */
7561 if (((error = vnode_authattr(vp, vap, actionp: &action, ctx)) != 0) ||
7562 ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
7563 if (error == EACCES) {
7564 error = EPERM;
7565 }
7566 return error;
7567 }
7568
7569 if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
7570 return error;
7571 }
7572
7573#if CONFIG_MACF
7574 if (VATTR_IS_ACTIVE(vap, va_mode)) {
7575 mac_vnode_notify_setmode(ctx, vp, mode: (mode_t)vap->va_mode);
7576 }
7577
7578 if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
7579 mac_vnode_notify_setowner(ctx, vp,
7580 VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
7581 VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
7582 }
7583
7584 if (VATTR_IS_ACTIVE(vap, va_acl)) {
7585 mac_vnode_notify_setacl(ctx, vp, acl: vap->va_acl);
7586 }
7587#endif
7588
7589 return error;
7590}
7591
7592
7593/*
7594 * Change mode of a file given a path name.
7595 *
7596 * Returns: 0 Success
7597 * namei:??? [anything namei can return]
7598 * chmod_vnode:??? [anything chmod_vnode can return]
7599 */
7600static int
7601chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
7602 int fd, int flag, enum uio_seg segflg)
7603{
7604 struct nameidata nd;
7605 int follow, error;
7606 uint32_t wantparent = 0;
7607
7608#if CONFIG_FILE_LEASES
7609 wantparent = WANTPARENT;
7610#endif
7611
7612 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
7613 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
7614 segflg, path, ctx);
7615 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
7616 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
7617 }
7618 if ((error = nameiat(ndp: &nd, dirfd: fd))) {
7619 return error;
7620 }
7621
7622#if CONFIG_FILE_LEASES
7623 vnode_breakdirlease(vp: nd.ni_dvp, false, O_WRONLY);
7624 vnode_put(vp: nd.ni_dvp);
7625#endif
7626
7627 error = chmod_vnode(ctx, vp: nd.ni_vp, vap);
7628 vnode_put(vp: nd.ni_vp);
7629 nameidone(&nd);
7630 return error;
7631}
7632
/*
 * Build the vnode_attr for a chmod_extended()/fchmod_extended() request.
 *
 * A `mode` of -1, `uid` of KAUTH_UID_NONE or `gid` of KAUTH_GID_NONE
 * means "leave that attribute unchanged".  `xsecurity` may be NULL (no
 * ACL change), the sentinel value 1 (_FILESEC_REMOVE_ACL: delete the
 * ACL), or a user pointer to a filesec to copy in.  In the copy-in case
 * *pxsecdst is set to the kernel copy, which the caller must release
 * with kauth_filesec_free() after the attributes have been applied.
 */
static int
chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
    gid_t gid, user_addr_t xsecurity)
{
	int error;

	VATTR_INIT(pva);

	if (mode != -1) {
		VATTR_SET(pva, va_mode, mode & ALLPERMS);
	} else {
		/* mode untouched: cleared but deliberately not marked active */
		pva->va_mode = 0;
	}

	if (uid != KAUTH_UID_NONE) {
		VATTR_SET(pva, va_uid, uid);
	}

	if (gid != KAUTH_GID_NONE) {
		VATTR_SET(pva, va_gid, gid);
	}

	*pxsecdst = NULL;
	switch (xsecurity) {
	case USER_ADDR_NULL:
		break;

	case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
		VATTR_SET(pva, va_acl, NULL);
		break;

	default:
		if ((error = kauth_copyinfilesec(xsecurity, xsecdestpp: pxsecdst)) != 0) {
			return error;
		}

		/* va_acl points into *pxsecdst; it must outlive the setattr */
		VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
		pva->va_vaflags |= VA_FILESEC_ACL;
		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
		break;
	}

	return 0;
}
7677
7678/*
7679 * chmod_extended: Change the mode of a file given a path name; with extended
7680 * argument list (including extended security (ACL)).
7681 *
7682 * Parameters: p Process requesting the open
7683 * uap User argument descriptor (see below)
7684 * retval (ignored)
7685 *
7686 * Indirect: uap->path Path to object (same as 'chmod')
7687 * uap->uid UID to set
7688 * uap->gid GID to set
7689 * uap->mode File mode to set (same as 'chmod')
7690 * uap->xsecurity ACL to set (or delete)
7691 *
7692 * Returns: 0 Success
7693 * !0 errno value
7694 *
7695 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
7696 *
7697 * XXX: We should enummerate the possible errno values here, and where
7698 * in the code they originated.
7699 */
7700int
7701chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
7702{
7703 int error;
7704 struct vnode_attr va;
7705 kauth_filesec_t xsecdst = NULL;
7706
7707 AUDIT_ARG(owner, uap->uid, uap->gid);
7708
7709 error = chmod_extended_init(pva: &va, pxsecdst: &xsecdst, mode: uap->mode, uid: uap->uid,
7710 gid: uap->gid, xsecurity: uap->xsecurity);
7711
7712 if (error) {
7713 return error;
7714 }
7715
7716 error = chmodat(ctx: vfs_context_current(), path: uap->path, vap: &va, AT_FDCWD, flag: 0,
7717 segflg: UIO_USERSPACE);
7718
7719 if (xsecdst != NULL) {
7720 kauth_filesec_free(fsp: xsecdst);
7721 }
7722 return error;
7723}
7724
7725/*
7726 * Returns: 0 Success
7727 * chmodat:??? [anything chmodat can return]
7728 */
7729static int
7730fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7731 int flag, enum uio_seg segflg)
7732{
7733 struct vnode_attr va;
7734
7735 VATTR_INIT(&va);
7736 VATTR_SET(&va, va_mode, mode & ALLPERMS);
7737
7738 return chmodat(ctx, path, vap: &va, fd, flag, segflg);
7739}
7740
7741int
7742chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7743{
7744 return fchmodat_internal(ctx: vfs_context_current(), path: uap->path, mode: uap->mode,
7745 AT_FDCWD, flag: 0, segflg: UIO_USERSPACE);
7746}
7747
7748int
7749fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7750{
7751 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7752 return EINVAL;
7753 }
7754
7755 return fchmodat_internal(ctx: vfs_context_current(), path: uap->path, mode: uap->mode,
7756 fd: uap->fd, flag: uap->flag, segflg: UIO_USERSPACE);
7757}
7758
7759/*
7760 * Change mode of a file given a file descriptor.
7761 */
7762static int
7763fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
7764{
7765 vnode_t vp;
7766 int error;
7767
7768 AUDIT_ARG(fd, fd);
7769
7770 if ((error = file_vnode(fd, &vp)) != 0) {
7771 return error;
7772 }
7773 if ((error = vnode_getwithref(vp)) != 0) {
7774 file_drop(fd);
7775 return error;
7776 }
7777 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7778
7779#if CONFIG_FILE_LEASES
7780 vnode_breakdirlease(vp, true, O_WRONLY);
7781#endif
7782
7783 error = chmod_vnode(ctx: vfs_context_current(), vp, vap);
7784 (void)vnode_put(vp);
7785 file_drop(fd);
7786
7787 return error;
7788}
7789
7790/*
7791 * fchmod_extended: Change mode of a file given a file descriptor; with
7792 * extended argument list (including extended security (ACL)).
7793 *
7794 * Parameters: p Process requesting to change file mode
7795 * uap User argument descriptor (see below)
7796 * retval (ignored)
7797 *
7798 * Indirect: uap->mode File mode to set (same as 'chmod')
7799 * uap->uid UID to set
7800 * uap->gid GID to set
7801 * uap->xsecurity ACL to set (or delete)
7802 * uap->fd File descriptor of file to change mode
7803 *
7804 * Returns: 0 Success
7805 * !0 errno value
7806 *
7807 */
7808int
7809fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
7810{
7811 int error;
7812 struct vnode_attr va;
7813 kauth_filesec_t xsecdst = NULL;
7814
7815 AUDIT_ARG(owner, uap->uid, uap->gid);
7816
7817 error = chmod_extended_init(pva: &va, pxsecdst: &xsecdst, mode: uap->mode, uid: uap->uid,
7818 gid: uap->gid, xsecurity: uap->xsecurity);
7819
7820 if (error) {
7821 return error;
7822 }
7823
7824 error = fchmod1(p, fd: uap->fd, vap: &va);
7825
7826 if (xsecdst != NULL) {
7827 kauth_filesec_free(fsp: xsecdst);
7828 }
7829 return error;
7830}
7831
7832int
7833fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
7834{
7835 struct vnode_attr va;
7836
7837 VATTR_INIT(&va);
7838 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7839
7840 return fchmod1(p, fd: uap->fd, vap: &va);
7841}
7842
/*
 * Common ownership-change implementation for chown()/lchown()/fchown()/
 * fchownat().
 *
 * Builds a vnode_attr from `uid`/`gid` (VNOVAL means "leave unchanged"),
 * authorizes the change and applies it to `vp`.  Permission failures
 * from the authorization layer are reported as EPERM rather than EACCES.
 */
static int
vn_chown_internal(__unused vfs_context_t ctx, vnode_t vp, uid_t uid, gid_t gid)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, vap: &va, actionp: &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/*
		 * EACCES is only allowed from namei(); permissions failure should
		 * return EPERM, so we need to translate the error code.
		 */
		if (error == EACCES) {
			error = EPERM;
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* break any lease on the vnode before applying the change */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vnode_setattr(vp, vap: &va, ctx);

#if CONFIG_MACF
	/* notify MAC policies only on success */
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	return error;
}
7904
7905/*
7906 * Set ownership given a path name.
7907 */
7908/* ARGSUSED */
7909static int
7910fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
7911 gid_t gid, int flag, enum uio_seg segflg)
7912{
7913 vnode_t vp;
7914 int error;
7915 struct nameidata nd;
7916 int follow;
7917
7918 AUDIT_ARG(owner, uid, gid);
7919
7920 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
7921 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg, path, ctx);
7922 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
7923 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
7924 }
7925
7926 error = nameiat(ndp: &nd, dirfd: fd);
7927 if (error) {
7928 return error;
7929 }
7930
7931 vp = nd.ni_vp;
7932 error = vn_chown_internal(ctx, vp, uid, gid);
7933
7934 nameidone(&nd);
7935 vnode_put(vp);
7936 return error;
7937}
7938
7939int
7940chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
7941{
7942 return fchownat_internal(ctx: vfs_context_current(), AT_FDCWD, path: uap->path,
7943 uid: uap->uid, gid: uap->gid, flag: 0, segflg: UIO_USERSPACE);
7944}
7945
7946int
7947lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
7948{
7949 return fchownat_internal(ctx: vfs_context_current(), AT_FDCWD, path: uap->path,
7950 uid: uap->owner, gid: uap->group, AT_SYMLINK_NOFOLLOW, segflg: UIO_USERSPACE);
7951}
7952
7953int
7954fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7955{
7956 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7957 return EINVAL;
7958 }
7959
7960 return fchownat_internal(ctx: vfs_context_current(), fd: uap->fd, path: uap->path,
7961 uid: uap->uid, gid: uap->gid, flag: uap->flag, segflg: UIO_USERSPACE);
7962}
7963
7964/*
7965 * Set ownership given a file descriptor.
7966 */
7967/* ARGSUSED */
7968int
7969fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
7970{
7971 vfs_context_t ctx = vfs_context_current();
7972 vnode_t vp;
7973 int error;
7974
7975 AUDIT_ARG(owner, uap->uid, uap->gid);
7976 AUDIT_ARG(fd, uap->fd);
7977
7978 if ((error = file_vnode(uap->fd, &vp))) {
7979 return error;
7980 }
7981
7982 if ((error = vnode_getwithref(vp))) {
7983 file_drop(uap->fd);
7984 return error;
7985 }
7986 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7987
7988 error = vn_chown_internal(ctx, vp, uid: uap->uid, gid: uap->gid);
7989
7990 (void)vnode_put(vp);
7991 file_drop(uap->fd);
7992 return error;
7993}
7994
/*
 * Fetch the (access, modify) timestamp pair for the utimes() family.
 *
 * If usrtvp is USER_ADDR_NULL, both entries of tsp are filled with the
 * current time. Otherwise two struct timevals, laid out for the calling
 * process' ABI (32- or 64-bit), are copied in from user space and
 * converted to timespecs.
 *
 * Returns 0 on success, or an error from copyin().
 */
static int
getutimes(user_addr_t usrtvp, struct timespec *tsp)
{
	int error;

	if (usrtvp == USER_ADDR_NULL) {
		struct timeval old_tv;
		/* XXX Y2038 bug because of microtime argument */
		microtime(tv: &old_tv);
		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
		tsp[1] = tsp[0];
	} else {
		/* The user timeval layout depends on the process' word size. */
		if (IS_64BIT_PROCESS(current_proc())) {
			struct user64_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
		} else {
			struct user32_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
		}
	}
	return 0;
}
8027
/*
 * Common worker for utimes()/futimes(): authorize and apply an
 * access/modify timestamp pair to a vnode.
 *
 * `nullflag` is non-zero when the caller supplied no explicit times (the
 * current time is being set); VA_UTIMES_NULL then relaxes the permission
 * check and EACCES is not remapped to EPERM.
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, atime: ts[0], mtime: ts[1]);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_authattr(vp, vap: &va, actionp: &action, ctx)) != 0) {
		/* Setting explicit times needs ownership: report EPERM, not EACCES. */
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, vap: &va, ctx);

#if CONFIG_MACF
	/* Notify MAC modules only after the times actually changed. */
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, atime: ts[0], mtime: ts[1]);
	}
#endif

out:
	return error;
}
8084
8085/*
8086 * Set the access and modification times of a file.
8087 */
8088/* ARGSUSED */
8089int
8090utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
8091{
8092 struct timespec ts[2];
8093 user_addr_t usrtvp;
8094 int error;
8095 struct nameidata nd;
8096 vfs_context_t ctx = vfs_context_current();
8097 uint32_t wantparent = 0;
8098
8099#if CONFIG_FILE_LEASES
8100 wantparent = WANTPARENT;
8101#endif
8102
8103 /*
8104 * AUDIT: Needed to change the order of operations to do the
8105 * name lookup first because auditing wants the path.
8106 */
8107 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
8108 UIO_USERSPACE, uap->path, ctx);
8109 error = namei(ndp: &nd);
8110 if (error) {
8111 return error;
8112 }
8113
8114 /*
8115 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
8116 * the current time instead.
8117 */
8118 usrtvp = uap->tptr;
8119 if ((error = getutimes(usrtvp, tsp: ts)) != 0) {
8120 goto out;
8121 }
8122
8123#if CONFIG_FILE_LEASES
8124 vnode_breakdirlease(vp: nd.ni_dvp, false, O_WRONLY);
8125#endif
8126
8127 error = setutimes(ctx, vp: nd.ni_vp, ts, nullflag: usrtvp == USER_ADDR_NULL);
8128
8129out:
8130#if CONFIG_FILE_LEASES
8131 vnode_put(vp: nd.ni_dvp);
8132#endif
8133 nameidone(&nd);
8134 vnode_put(vp: nd.ni_vp);
8135 return error;
8136}
8137
8138/*
8139 * Set the access and modification times of a file.
8140 */
8141/* ARGSUSED */
8142int
8143futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
8144{
8145 struct timespec ts[2];
8146 vnode_t vp;
8147 user_addr_t usrtvp;
8148 int error;
8149
8150 AUDIT_ARG(fd, uap->fd);
8151 usrtvp = uap->tptr;
8152 if ((error = getutimes(usrtvp, tsp: ts)) != 0) {
8153 return error;
8154 }
8155 if ((error = file_vnode(uap->fd, &vp)) != 0) {
8156 return error;
8157 }
8158 if ((error = vnode_getwithref(vp))) {
8159 file_drop(uap->fd);
8160 return error;
8161 }
8162
8163#if CONFIG_FILE_LEASES
8164 vnode_breakdirlease(vp, true, O_WRONLY);
8165#endif
8166
8167 error = setutimes(ctx: vfs_context_current(), vp, ts, nullflag: usrtvp == 0);
8168
8169 vnode_put(vp);
8170 file_drop(uap->fd);
8171 return error;
8172}
8173
8174static int
8175truncate_validate_common(proc_t p, off_t length)
8176{
8177 rlim_t fsize_limit;
8178
8179 if (length < 0) {
8180 return EINVAL;
8181 }
8182
8183 fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8184 if ((rlim_t)length > fsize_limit) {
8185 psignal(p, SIGXFSZ);
8186 return EFBIG;
8187 }
8188
8189 return 0;
8190}
8191
/*
 * Common worker for truncate()/ftruncate(): set a vnode's data size.
 *
 * `cred` is the file credential used for the MAC checks (NOCRED for the
 * path-based case). `need_auth` is FALSE when called from ftruncate(),
 * whose write permission was effectively authorized at open time.
 */
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, file_cred: cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open. We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, vap: &va, actionp: &action, ctx)) != 0) {
			return error;
		}

		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, vap: &va, ctx);

#if CONFIG_MACF
	/* Notify MAC modules only on a successful size change. */
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, file_cred: cred, vp);
	}
#endif

	return error;
}
8242
8243/*
8244 * Truncate a file given its path name.
8245 */
8246/* ARGSUSED */
8247int
8248truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
8249{
8250 vfs_context_t ctx = vfs_context_current();
8251 vnode_t vp;
8252 int error;
8253 struct nameidata nd;
8254
8255 if ((error = truncate_validate_common(p, length: uap->length))) {
8256 return error;
8257 }
8258
8259 NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
8260 UIO_USERSPACE, uap->path, ctx);
8261
8262 if ((error = namei(ndp: &nd))) {
8263 return error;
8264 }
8265
8266 vp = nd.ni_vp;
8267 nameidone(&nd);
8268
8269 error = truncate_internal(vp, length: uap->length, NOCRED, ctx, true);
8270 vnode_put(vp);
8271
8272 return error;
8273}
8274
8275/*
8276 * Truncate a file given a file descriptor.
8277 */
8278/* ARGSUSED */
8279int
8280ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
8281{
8282 vnode_t vp;
8283 struct fileproc *fp;
8284 int error;
8285
8286 AUDIT_ARG(fd, uap->fd);
8287
8288 if ((error = truncate_validate_common(p, length: uap->length))) {
8289 return error;
8290 }
8291
8292 if ((error = fp_lookup(p, fd: uap->fd, resultfp: &fp, locked: 0))) {
8293 return error;
8294 }
8295
8296 switch (FILEGLOB_DTYPE(fp->fp_glob)) {
8297 case DTYPE_PSXSHM:
8298 error = pshm_truncate(p, fp, fd: uap->fd, length: uap->length, retval);
8299 goto out;
8300 case DTYPE_VNODE:
8301 break;
8302 default:
8303 error = EINVAL;
8304 goto out;
8305 }
8306
8307 vp = (vnode_t)fp_get_data(fp);
8308
8309 if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
8310 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
8311 error = EINVAL;
8312 goto out;
8313 }
8314
8315 if ((error = vnode_getwithref(vp)) != 0) {
8316 goto out;
8317 }
8318
8319 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8320
8321 error = truncate_internal(vp, length: uap->length, cred: fp->fp_glob->fg_cred,
8322 ctx: vfs_context_current(), false);
8323 vnode_put(vp);
8324
8325out:
8326 file_drop(uap->fd);
8327 return error;
8328}
8329
8330
8331/*
8332 * Sync an open file with synchronized I/O _file_ integrity completion
8333 */
8334/* ARGSUSED */
8335int
8336fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
8337{
8338 __pthread_testcancel(presyscall: 1);
8339 return fsync_common(p, uap, MNT_WAIT);
8340}
8341
8342
8343/*
8344 * Sync an open file with synchronized I/O _file_ integrity completion
8345 *
8346 * Notes: This is a legacy support function that does not test for
8347 * thread cancellation points.
8348 */
8349/* ARGSUSED */
8350int
8351fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
8352{
8353 return fsync_common(p, uap: (struct fsync_args *)uap, MNT_WAIT);
8354}
8355
8356
8357/*
8358 * Sync an open file with synchronized I/O _data_ integrity completion
8359 */
8360/* ARGSUSED */
8361int
8362fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
8363{
8364 __pthread_testcancel(presyscall: 1);
8365 return fsync_common(p, uap: (struct fsync_args *)uap, MNT_DWAIT);
8366}
8367
8368
8369/*
8370 * fsync_common
8371 *
8372 * Common fsync code to support both synchronized I/O file integrity completion
8373 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
8374 *
8375 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
8376 * will only guarantee that the file data contents are retrievable. If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
8378 * includes additional metadata unnecessary for retrieving the file data
8379 * contents, such as atime, mtime, ctime, etc., also be committed to stable
8380 * storage.
8381 *
8382 * Parameters: p The process
8383 * uap->fd The descriptor to synchronize
8384 * flags The data integrity flags
8385 *
8386 * Returns: int Success
8387 * fp_getfvp:EBADF Bad file descriptor
8388 * fp_getfvp:ENOTSUP fd does not refer to a vnode
8389 * VNOP_FSYNC:??? unspecified
8390 *
8391 * Notes: We use struct fsync_args because it is a short name, and all
8392 * caller argument structures are otherwise identical.
8393 */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Resolve the fd to both its fileproc and vnode. */
	if ((error = fp_getfvp(p, fd: uap->fd, resultfp: &fp, resultvp: &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* `flags` is MNT_WAIT (file integrity) or MNT_DWAIT (data integrity). */
	error = VNOP_FSYNC(vp, waitfor: flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
		(void) vnode_flushnamedstream(vp: vp->v_parent, svp: vp, context: ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8431
8432/*
8433 * Duplicate files. Source must be a file, target must be a file or
8434 * must not exist.
8435 *
8436 * XXX Copyfile authorisation checking is woefully inadequate, and will not
8437 * perform inheritance correctly.
8438 */
8439/* ARGSUSED */
8440int
8441copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
8442{
8443 vnode_t tvp, fvp, tdvp, sdvp;
8444 struct nameidata fromnd, tond;
8445 int error;
8446 vfs_context_t ctx = vfs_context_current();
8447
8448 /* Check that the flags are valid. */
8449 if (uap->flags & ~CPF_MASK) {
8450 return EINVAL;
8451 }
8452
8453 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
8454 UIO_USERSPACE, uap->from, ctx);
8455 if ((error = namei(ndp: &fromnd))) {
8456 return error;
8457 }
8458 fvp = fromnd.ni_vp;
8459
8460 NDINIT(&tond, CREATE, OP_LINK,
8461 LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8462 UIO_USERSPACE, uap->to, ctx);
8463 if ((error = namei(ndp: &tond))) {
8464 goto out1;
8465 }
8466 tdvp = tond.ni_dvp;
8467 tvp = tond.ni_vp;
8468
8469 if (tvp != NULL) {
8470 if (!(uap->flags & CPF_OVERWRITE)) {
8471 error = EEXIST;
8472 goto out;
8473 }
8474 }
8475
8476 if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
8477 error = EISDIR;
8478 goto out;
8479 }
8480
8481 if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
8482 error = EOPNOTSUPP;
8483 goto out;
8484 }
8485
8486#if CONFIG_MACF
8487 if ((error = mac_vnode_check_copyfile(ctx, dvp: tdvp, tvp, fvp, cnp: &tond.ni_cnd, mode: (mode_t)uap->mode, flags: uap->flags)) != 0) {
8488 goto out;
8489 }
8490#endif /* CONFIG_MACF */
8491
8492 if ((error = vnode_authorize(vp: fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
8493 goto out;
8494 }
8495 if (tvp) {
8496 if ((error = vnode_authorize(vp: tvp, dvp: tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
8497 goto out;
8498 }
8499 }
8500 if ((error = vnode_authorize(vp: tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
8501 goto out;
8502 }
8503
8504 if (fvp == tdvp) {
8505 error = EINVAL;
8506 }
8507 /*
8508 * If source is the same as the destination (that is the
8509 * same inode number) then there is nothing to do.
8510 * (fixed to have POSIX semantics - CSM 3/2/98)
8511 */
8512 if (fvp == tvp) {
8513 error = -1;
8514 }
8515
8516#if CONFIG_FILE_LEASES
8517 vnode_breakdirlease(vp: tdvp, false, O_WRONLY);
8518#endif
8519
8520 if (!error) {
8521 error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
8522 }
8523out:
8524 sdvp = tond.ni_startdir;
8525 /*
8526 * nameidone has to happen before we vnode_put(tdvp)
8527 * since it may need to release the fs_nodelock on the tdvp
8528 */
8529 nameidone(&tond);
8530
8531 if (tvp) {
8532 vnode_put(vp: tvp);
8533 }
8534 vnode_put(vp: tdvp);
8535 vnode_put(vp: sdvp);
8536out1:
8537 vnode_put(vp: fvp);
8538
8539 nameidone(&fromnd);
8540
8541 if (error == -1) {
8542 return 0;
8543 }
8544 return error;
8545}
8546
8547#define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
8548
8549/*
8550 * Helper function for doing clones. The caller is expected to provide an
8551 * iocounted source vnode and release it.
8552 */
8553static int
8554clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
8555 user_addr_t dst, uint32_t flags, vfs_context_t ctx)
8556{
8557 vnode_t tvp, tdvp;
8558 struct nameidata tond;
8559 int error;
8560 int follow;
8561 boolean_t free_src_acl;
8562 boolean_t attr_cleanup;
8563 enum vtype v_type;
8564 kauth_action_t action;
8565 struct componentname *cnp;
8566 uint32_t defaulted = 0;
8567 struct vnode_attr va;
8568 struct vnode_attr nva;
8569 uint32_t vnop_flags;
8570
8571 v_type = vnode_vtype(vp: fvp);
8572 switch (v_type) {
8573 case VLNK:
8574 /* FALLTHRU */
8575 case VREG:
8576 action = KAUTH_VNODE_ADD_FILE;
8577 break;
8578 case VDIR:
8579 if (vnode_isvroot(vp: fvp) || vnode_ismount(vp: fvp) ||
8580 fvp->v_mountedhere) {
8581 return EINVAL;
8582 }
8583 action = KAUTH_VNODE_ADD_SUBDIRECTORY;
8584 break;
8585 default:
8586 return EINVAL;
8587 }
8588
8589 AUDIT_ARG(fd2, dst_dirfd);
8590 AUDIT_ARG(value32, flags);
8591
8592 follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8593 NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
8594 UIO_USERSPACE, dst, ctx);
8595 if ((error = nameiat(ndp: &tond, dirfd: dst_dirfd))) {
8596 return error;
8597 }
8598 cnp = &tond.ni_cnd;
8599 tdvp = tond.ni_dvp;
8600 tvp = tond.ni_vp;
8601
8602 free_src_acl = FALSE;
8603 attr_cleanup = FALSE;
8604
8605 if (tvp != NULL) {
8606 error = EEXIST;
8607 goto out;
8608 }
8609
8610 if (vnode_mount(vp: tdvp) != vnode_mount(vp: fvp)) {
8611 error = EXDEV;
8612 goto out;
8613 }
8614
8615#if CONFIG_MACF
8616 if ((error = mac_vnode_check_clone(ctx, dvp: tdvp, vp: fvp, cnp))) {
8617 goto out;
8618 }
8619#endif
8620 if ((error = vnode_authorize(vp: tdvp, NULL, action, ctx))) {
8621 goto out;
8622 }
8623
8624 action = KAUTH_VNODE_GENERIC_READ_BITS;
8625 if (data_read_authorised) {
8626 action &= ~KAUTH_VNODE_READ_DATA;
8627 }
8628 if ((error = vnode_authorize(vp: fvp, NULL, action, ctx))) {
8629 goto out;
8630 }
8631
8632 /*
8633 * certain attributes may need to be changed from the source, we ask for
8634 * those here with the exception of source file's ACLs unless the CLONE_ACL
8635 * flag is specified. By default, the clone file will inherit the target
8636 * directory's ACLs unless the the CLONE_ACL flag is specified then it
8637 * will inherit the source file's ACLs instead.
8638 */
8639 VATTR_INIT(&va);
8640 VATTR_WANTED(&va, va_uid);
8641 VATTR_WANTED(&va, va_gid);
8642 VATTR_WANTED(&va, va_mode);
8643 VATTR_WANTED(&va, va_flags);
8644 if (flags & CLONE_ACL) {
8645 VATTR_WANTED(&va, va_acl);
8646 }
8647
8648 if ((error = vnode_getattr(vp: fvp, vap: &va, ctx)) != 0) {
8649 goto out;
8650 }
8651
8652 VATTR_INIT(&nva);
8653 VATTR_SET(&nva, va_type, v_type);
8654 if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
8655 VATTR_SET(&nva, va_acl, va.va_acl);
8656 free_src_acl = TRUE;
8657 }
8658
8659 /* Handle ACL inheritance, initialize vap. */
8660 if (v_type == VLNK) {
8661 error = vnode_authattr_new(dvp: tdvp, vap: &nva, noauth: 0, ctx);
8662 } else {
8663 error = vn_attribute_prepare(dvp: tdvp, vap: &nva, defaulted_fieldsp: &defaulted, ctx);
8664 if (error) {
8665 goto out;
8666 }
8667 attr_cleanup = TRUE;
8668 }
8669
8670 vnop_flags = VNODE_CLONEFILE_DEFAULT;
8671 /*
8672 * We've got initial values for all security parameters,
8673 * If we are superuser, then we can change owners to be the
8674 * same as the source. Both superuser and the owner have default
8675 * WRITE_SECURITY privileges so all other fields can be taken
8676 * from source as well.
8677 */
8678 if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
8679 if (VATTR_IS_SUPPORTED(&va, va_uid)) {
8680 VATTR_SET(&nva, va_uid, va.va_uid);
8681 }
8682 if (VATTR_IS_SUPPORTED(&va, va_gid)) {
8683 VATTR_SET(&nva, va_gid, va.va_gid);
8684 }
8685 } else {
8686 vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
8687 }
8688
8689 if (VATTR_IS_SUPPORTED(&va, va_mode)) {
8690 VATTR_SET(&nva, va_mode, va.va_mode);
8691 }
8692 if (VATTR_IS_SUPPORTED(&va, va_flags)) {
8693 VATTR_SET(&nva, va_flags,
8694 ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
8695 (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
8696 }
8697
8698#if CONFIG_FILE_LEASES
8699 vnode_breakdirlease(vp: tdvp, false, O_WRONLY);
8700#endif
8701
8702 error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);
8703
8704 if (!error && tvp) {
8705 int update_flags = 0;
8706#if CONFIG_FSE
8707 int fsevent;
8708#endif /* CONFIG_FSE */
8709
8710 /*
8711 * If some of the requested attributes weren't handled by the
8712 * VNOP, use our fallback code.
8713 */
8714 if (!VATTR_ALL_SUPPORTED(&nva)) {
8715 (void)vnode_setattr_fallback(vp: tvp, vap: &nva, ctx);
8716 }
8717
8718#if CONFIG_MACF
8719 (void)vnode_label(mp: vnode_mount(vp: tvp), dvp: tdvp, vp: tvp, cnp,
8720 VNODE_LABEL_CREATE, ctx);
8721#endif
8722
8723 // Make sure the name & parent pointers are hooked up
8724 if (tvp->v_name == NULL) {
8725 update_flags |= VNODE_UPDATE_NAME;
8726 }
8727 if (tvp->v_parent == NULLVP) {
8728 update_flags |= VNODE_UPDATE_PARENT;
8729 }
8730
8731 if (update_flags) {
8732 (void)vnode_update_identity(vp: tvp, dvp: tdvp, name: cnp->cn_nameptr,
8733 name_len: cnp->cn_namelen, name_hashval: cnp->cn_hash, flags: update_flags);
8734 }
8735
8736#if CONFIG_FSE
8737 switch (vnode_vtype(vp: tvp)) {
8738 case VLNK:
8739 /* FALLTHRU */
8740 case VREG:
8741 fsevent = FSE_CREATE_FILE;
8742 break;
8743 case VDIR:
8744 fsevent = FSE_CREATE_DIR;
8745 break;
8746 default:
8747 goto out;
8748 }
8749
8750 if (need_fsevent(type: fsevent, vp: tvp)) {
8751 /*
8752 * The following is a sequence of three explicit events.
8753 * A pair of FSE_CLONE events representing the source and destination
8754 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
8755 * fseventsd may coalesce the destination clone and create events
8756 * into a single event resulting in the following sequence for a client
8757 * FSE_CLONE (src)
8758 * FSE_CLONE | FSE_CREATE (dst)
8759 */
8760 add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
8761 FSE_ARG_DONE);
8762 add_fsevent(type: fsevent, ctx, FSE_ARG_VNODE, tvp,
8763 FSE_ARG_DONE);
8764 }
8765#endif /* CONFIG_FSE */
8766 }
8767
8768out:
8769 if (attr_cleanup) {
8770 vn_attribute_cleanup(vap: &nva, defaulted_fields: defaulted);
8771 }
8772 if (free_src_acl && va.va_acl) {
8773 kauth_acl_free(fsp: va.va_acl);
8774 }
8775 nameidone(&tond);
8776 if (tvp) {
8777 vnode_put(vp: tvp);
8778 }
8779 vnode_put(vp: tdvp);
8780 return error;
8781}
8782
8783/*
8784 * clone files or directories, target must not exist.
8785 */
8786/* ARGSUSED */
8787int
8788clonefileat(__unused proc_t p, struct clonefileat_args *uap,
8789 __unused int32_t *retval)
8790{
8791 vnode_t fvp;
8792 struct nameidata fromnd;
8793 int follow;
8794 int error;
8795 vfs_context_t ctx = vfs_context_current();
8796
8797 /* Check that the flags are valid. */
8798 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
8799 return EINVAL;
8800 }
8801
8802 AUDIT_ARG(fd, uap->src_dirfd);
8803
8804 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8805 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
8806 UIO_USERSPACE, uap->src, ctx);
8807 if ((error = nameiat(ndp: &fromnd, dirfd: uap->src_dirfd))) {
8808 return error;
8809 }
8810
8811 fvp = fromnd.ni_vp;
8812 nameidone(&fromnd);
8813
8814 error = clonefile_internal(fvp, FALSE, dst_dirfd: uap->dst_dirfd, dst: uap->dst,
8815 flags: uap->flags, ctx);
8816
8817 vnode_put(vp: fvp);
8818 return error;
8819}
8820
/*
 * Clone the file referenced by an open (readable) file descriptor to a
 * new path; the target must not exist.
 */
int
fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct fileproc *fp;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_fd);
	error = fp_getfvp(p, fd: uap->src_fd, resultfp: &fp, resultvp: &fvp);
	if (error) {
		return error;
	}

	/* The source descriptor must have been opened for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	if ((error = vnode_getwithref(vp: fvp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);

	/* TRUE: FREAD on the fd already authorizes reading the data. */
	error = clonefile_internal(fvp, TRUE, dst_dirfd: uap->dst_dirfd, dst: uap->dst,
	    flags: uap->flags, ctx);

	vnode_put(vp: fvp);
out:
	file_drop(uap->src_fd);
	return error;
}
8861
/*
 * Per-mount callback (presumably invoked via mount iteration after a
 * directory rename — TODO confirm against caller): for each mount whose
 * f_mntonname lies strictly underneath the renamed mount `arg`,
 * regenerate its f_mntonname from the covered vnode's current path.
 *
 * Returns 0 to continue iterating (including the not-a-submount cases),
 * or -1 if the mount could not be busied.
 */
static int
rename_submounts_callback(mount_t mp, void *arg)
{
	int error = 0;
	mount_t pmp = (mount_t)arg;
	int prefix_len = (int)strlen(s: pmp->mnt_vfsstat.f_mntonname);

	/* Skip mounts whose mount-on path does not start with pmp's path. */
	if (strncmp(s1: mp->mnt_vfsstat.f_mntonname, s2: pmp->mnt_vfsstat.f_mntonname, n: prefix_len) != 0) {
		return 0;
	}

	/* Require a '/' after the prefix so "/a/bc" is not under "/a/b". */
	if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
		return 0;
	}

	if ((error = vfs_busy(mp, LK_NOWAIT))) {
		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
		return -1;
	}

	/* Rewrite f_mntonname in place from the covered vnode's new path. */
	size_t pathlen = MAXPATHLEN;
	if ((error = vn_getpath_ext(vp: mp->mnt_vnodecovered, NULL, pathbuf: mp->mnt_vfsstat.f_mntonname, len: &pathlen, VN_GETPATH_FSENTER))) {
		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
	}

	vfs_unbusy(mp);

	return error;
}
8891
8892/*
8893 * Rename files. Source and destination must either both be directories,
8894 * or both not be directories. If target is a directory, it must be empty.
8895 */
8896/* ARGSUSED */
8897static int
8898renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
8899 int tofd, user_addr_t to, int segflg, u_int uflags)
8900{
8901 vnode_t tvp, tdvp;
8902 vnode_t fvp, fdvp;
8903 vnode_t mnt_fvp;
8904 struct nameidata *fromnd, *tond;
8905 int error = 0;
8906 int do_retry;
8907 int retry_count;
8908 int mntrename;
8909 int need_event;
8910 int need_kpath2;
8911 int has_listeners;
8912 const char *oname = NULL;
8913 char *from_name = NULL, *to_name = NULL;
8914 char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
8915 int from_len = 0, to_len = 0;
8916 int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
8917 int holding_mntlock;
8918 int vn_authorize_skipped;
8919 mount_t locked_mp = NULL;
8920 vnode_t oparent = NULLVP;
8921#if CONFIG_FSE
8922 fse_info from_finfo = {}, to_finfo;
8923#endif
8924 int from_truncated = 0, to_truncated = 0;
8925 int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
8926 int batched = 0;
8927 struct vnode_attr *fvap, *tvap;
8928 int continuing = 0;
8929 vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
8930 int32_t nofollow_any = 0;
8931 /* carving out a chunk for structs that are too big to be on stack. */
8932 struct {
8933 struct nameidata from_node, to_node;
8934 struct vnode_attr fv_attr, tv_attr;
8935 } * __rename_data;
8936
8937 __rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
8938 fromnd = &__rename_data->from_node;
8939 tond = &__rename_data->to_node;
8940
8941 holding_mntlock = 0;
8942 do_retry = 0;
8943 retry_count = 0;
8944retry:
8945 fvp = tvp = NULL;
8946 fdvp = tdvp = NULL;
8947 fvap = tvap = NULL;
8948 mnt_fvp = NULLVP;
8949 mntrename = FALSE;
8950 vn_authorize_skipped = FALSE;
8951
8952 if (uflags & RENAME_NOFOLLOW_ANY) {
8953 nofollow_any = NAMEI_NOFOLLOW_ANY;
8954 }
8955 NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
8956 segflg, from, ctx);
8957 fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
8958
8959 NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8960 segflg, to, ctx);
8961 tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
8962
8963continue_lookup:
8964 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8965 if ((error = nameiat(ndp: fromnd, dirfd: fromfd))) {
8966 goto out1;
8967 }
8968 fdvp = fromnd->ni_dvp;
8969 fvp = fromnd->ni_vp;
8970
8971 if (fvp && fvp->v_type == VDIR) {
8972 tond->ni_cnd.cn_flags |= WILLBEDIR;
8973 }
8974 }
8975
8976 if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8977 if ((error = nameiat(ndp: tond, dirfd: tofd))) {
8978 /*
8979 * Translate error code for rename("dir1", "dir2/.").
8980 */
8981 if (error == EISDIR && fvp->v_type == VDIR) {
8982 error = EINVAL;
8983 }
8984 goto out1;
8985 }
8986 tdvp = tond->ni_dvp;
8987 tvp = tond->ni_vp;
8988 }
8989
8990#if DEVELOPMENT || DEBUG
8991 /*
8992 * XXX VSWAP: Check for entitlements or special flag here
8993 * so we can restrict access appropriately.
8994 */
8995#else /* DEVELOPMENT || DEBUG */
8996
8997 if (fromnd->ni_vp && vnode_isswap(vp: fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
8998 error = EPERM;
8999 goto out1;
9000 }
9001
9002 if (tond->ni_vp && vnode_isswap(vp: tond->ni_vp) && (ctx != vfs_context_kernel())) {
9003 error = EPERM;
9004 goto out1;
9005 }
9006#endif /* DEVELOPMENT || DEBUG */
9007
9008 if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
9009 error = ENOENT;
9010 goto out1;
9011 }
9012
9013 if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
9014 int32_t pval = 0;
9015 int err = 0;
9016
9017 /*
9018 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
9019 * has the same name as target iff the following conditions are met:
9020 * 1. the target file system is case insensitive
9021 * 2. source and target directories are the same
9022 * 3. source and target files are the same
9023 * 4. name only differs in case (determined by underlying filesystem)
9024 */
9025 if (fvp != tvp || fdvp != tdvp) {
9026 error = EEXIST;
9027 goto out1;
9028 }
9029
9030 /*
9031 * Assume that the target file system is case sensitive if
9032 * _PC_CASE_SENSITIVE selector isn't supported.
9033 */
9034 err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
9035 if (err != 0 || pval != 0) {
9036 error = EEXIST;
9037 goto out1;
9038 }
9039 }
9040
9041 batched = vnode_compound_rename_available(vp: fdvp);
9042
9043#if CONFIG_FSE
9044 need_event = need_fsevent(FSE_RENAME, vp: fdvp);
9045 if (need_event) {
9046 if (fvp) {
9047 get_fse_info(vp: fvp, fse: &from_finfo, ctx);
9048 } else {
9049 error = vfs_get_notify_attributes(vap: &__rename_data->fv_attr);
9050 if (error) {
9051 goto out1;
9052 }
9053
9054 fvap = &__rename_data->fv_attr;
9055 }
9056
9057 if (tvp) {
9058 get_fse_info(vp: tvp, fse: &to_finfo, ctx);
9059 } else if (batched) {
9060 error = vfs_get_notify_attributes(vap: &__rename_data->tv_attr);
9061 if (error) {
9062 goto out1;
9063 }
9064
9065 tvap = &__rename_data->tv_attr;
9066 }
9067 }
9068#else
9069 need_event = 0;
9070#endif /* CONFIG_FSE */
9071
9072 has_listeners = kauth_authorize_fileop_has_listeners();
9073
9074 need_kpath2 = 0;
9075#if CONFIG_AUDIT
9076 if (AUDIT_RECORD_EXISTS()) {
9077 need_kpath2 = 1;
9078 }
9079#endif
9080
9081 if (need_event || has_listeners) {
9082 if (from_name == NULL) {
9083 GET_PATH(from_name);
9084 }
9085
9086 from_len = safe_getpath(dvp: fdvp, leafname: fromnd->ni_cnd.cn_nameptr, path: from_name, MAXPATHLEN, truncated_path: &from_truncated);
9087
9088 if (from_name_no_firmlink == NULL) {
9089 GET_PATH(from_name_no_firmlink);
9090 }
9091
9092 from_len_no_firmlink = safe_getpath_no_firmlink(dvp: fdvp, leafname: fromnd->ni_cnd.cn_nameptr, path: from_name_no_firmlink, MAXPATHLEN, truncated_path: &from_truncated_no_firmlink);
9093 }
9094
9095 if (need_event || need_kpath2 || has_listeners) {
9096 if (to_name == NULL) {
9097 GET_PATH(to_name);
9098 }
9099
9100 to_len = safe_getpath(dvp: tdvp, leafname: tond->ni_cnd.cn_nameptr, path: to_name, MAXPATHLEN, truncated_path: &to_truncated);
9101
9102 if (to_name_no_firmlink == NULL) {
9103 GET_PATH(to_name_no_firmlink);
9104 }
9105
9106 to_len_no_firmlink = safe_getpath_no_firmlink(dvp: tdvp, leafname: tond->ni_cnd.cn_nameptr, path: to_name_no_firmlink, MAXPATHLEN, truncated_path: &to_truncated_no_firmlink);
9107 if (to_name && need_kpath2) {
9108 AUDIT_ARG(kpath, to_name, ARG_KPATH2);
9109 }
9110 }
9111 if (!fvp) {
9112 /*
9113 * Claim: this check will never reject a valid rename.
9114 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
9115 * Suppose fdvp and tdvp are not on the same mount.
9116 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
9117 * then you can't move it to within another dir on the same mountpoint.
9118 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
9119 *
9120 * If this check passes, then we are safe to pass these vnodes to the same FS.
9121 */
9122 if (fdvp->v_mount != tdvp->v_mount) {
9123 error = EXDEV;
9124 goto out1;
9125 }
9126 goto skipped_lookup;
9127 }
9128
9129 /*
9130 * If the source and destination are the same (i.e. they're
9131 * links to the same vnode) and the target file system is
9132 * case sensitive, then there is nothing to do.
9133 *
9134 * XXX Come back to this.
9135 */
9136 if (fvp == tvp) {
9137 int pathconf_val;
9138
9139 /*
9140 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
9141 * then assume that this file system is case sensitive.
9142 */
9143 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
9144 pathconf_val != 0) {
9145 vn_authorize_skipped = TRUE;
9146 goto out1;
9147 }
9148 }
9149
9150 /*
9151 * Allow the renaming of mount points.
9152 * - target must not exist
9153 * - target must reside in the same directory as source
9154 * - union mounts cannot be renamed
9155 * - the root fs, and tightly-linked system volumes, cannot be renamed
9156 *
9157 * XXX Handle this in VFS after a continued lookup (if we missed
9158 * in the cache to start off)
9159 *
9160 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
9161 * we'll skip past here. The file system is responsible for
9162 * checking that @tvp is not a descendent of @fvp and vice versa
9163 * so it should always return EINVAL if either @tvp or @fvp is the
9164 * root of a volume.
9165 */
9166 if ((fvp->v_flag & VROOT) &&
9167 (fvp->v_type == VDIR) &&
9168 (tvp == NULL) &&
9169 (fvp->v_mountedhere == NULL) &&
9170 (fdvp == tdvp) &&
9171 ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
9172 ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
9173 (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
9174 vnode_t coveredvp;
9175
9176 /* switch fvp to the covered vnode */
9177 coveredvp = fvp->v_mount->mnt_vnodecovered;
9178 if ((vnode_getwithref(vp: coveredvp))) {
9179 error = ENOENT;
9180 goto out1;
9181 }
9182 /*
9183 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
9184 * later.
9185 */
9186 mnt_fvp = fvp;
9187
9188 fvp = coveredvp;
9189 mntrename = TRUE;
9190 }
9191 /*
9192 * Check for cross-device rename.
9193 */
9194 if ((fvp->v_mount != tdvp->v_mount) ||
9195 (tvp && (fvp->v_mount != tvp->v_mount))) {
9196 error = EXDEV;
9197 goto out1;
9198 }
9199
9200 /*
9201 * If source is the same as the destination (that is the
9202 * same inode number) then there is nothing to do...
9203 * EXCEPT if the underlying file system supports case
9204 * insensitivity and is case preserving. In this case
9205 * the file system needs to handle the special case of
9206 * getting the same vnode as target (fvp) and source (tvp).
9207 *
9208 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
9209 * and _PC_CASE_PRESERVING can have this exception, and they need to
9210 * handle the special case of getting the same vnode as target and
9211 * source. NOTE: Then the target is unlocked going into vnop_rename,
9212 * so not to cause locking problems. There is a single reference on tvp.
9213 *
9214 * NOTE - that fvp == tvp also occurs if they are hard linked and
9215 * that correct behaviour then is just to return success without doing
9216 * anything.
9217 *
9218 * XXX filesystem should take care of this itself, perhaps...
9219 */
9220 if (fvp == tvp && fdvp == tdvp) {
9221 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
9222 !bcmp(s1: fromnd->ni_cnd.cn_nameptr, s2: tond->ni_cnd.cn_nameptr,
9223 n: fromnd->ni_cnd.cn_namelen)) {
9224 vn_authorize_skipped = TRUE;
9225 goto out1;
9226 }
9227 }
9228
9229 if (holding_mntlock && fvp->v_mount != locked_mp) {
9230 /*
9231 * we're holding a reference and lock
9232 * on locked_mp, but it no longer matches
9233 * what we want to do... so drop our hold
9234 */
9235 mount_unlock_renames(locked_mp);
9236 mount_drop(locked_mp, 0);
9237 holding_mntlock = 0;
9238 }
9239 if (tdvp != fdvp && fvp->v_type == VDIR) {
9240 /*
9241 * serialize renames that re-shape
9242 * the tree... if holding_mntlock is
9243 * set, then we're ready to go...
9244 * otherwise we
9245 * first need to drop the iocounts
9246 * we picked up, second take the
9247 * lock to serialize the access,
9248 * then finally start the lookup
9249 * process over with the lock held
9250 */
9251 if (!holding_mntlock) {
9252 /*
9253 * need to grab a reference on
9254 * the mount point before we
9255 * drop all the iocounts... once
9256 * the iocounts are gone, the mount
9257 * could follow
9258 */
9259 locked_mp = fvp->v_mount;
9260 mount_ref(locked_mp, 0);
9261
9262 /*
9263 * nameidone has to happen before we vnode_put(tvp)
9264 * since it may need to release the fs_nodelock on the tvp
9265 */
9266 nameidone(tond);
9267
9268 if (tvp) {
9269 vnode_put(vp: tvp);
9270 }
9271 vnode_put(vp: tdvp);
9272
9273 /*
9274 * nameidone has to happen before we vnode_put(fdvp)
9275 * since it may need to release the fs_nodelock on the fvp
9276 */
9277 nameidone(fromnd);
9278
9279 vnode_put(vp: fvp);
9280 vnode_put(vp: fdvp);
9281
9282 if (mnt_fvp != NULLVP) {
9283 vnode_put(vp: mnt_fvp);
9284 }
9285
9286 mount_lock_renames(locked_mp);
9287 holding_mntlock = 1;
9288
9289 goto retry;
9290 }
9291 } else {
9292 /*
9293 * when we dropped the iocounts to take
9294 * the lock, we allowed the identity of
9295 * the various vnodes to change... if they did,
9296 * we may no longer be dealing with a rename
9297 * that reshapes the tree... once we're holding
9298 * the iocounts, the vnodes can't change type
9299 * so we're free to drop the lock at this point
9300 * and continue on
9301 */
9302 if (holding_mntlock) {
9303 mount_unlock_renames(locked_mp);
9304 mount_drop(locked_mp, 0);
9305 holding_mntlock = 0;
9306 }
9307 }
9308
9309 if (!batched) {
9310 error = vn_authorize_renamex_with_paths(fdvp, fvp: mntrename ? mnt_fvp : fvp,
9311 fcnp: &fromnd->ni_cnd, from_path: from_name, tdvp, tvp, tcnp: &tond->ni_cnd, to_path: to_name, ctx,
9312 flags, NULL);
9313 if (error) {
9314 if (error == ENOENT) {
9315 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9316 /*
9317 * We encountered a race where after doing the namei,
9318 * tvp stops being valid. If so, simply re-drive the rename
9319 * call from the top.
9320 */
9321 do_retry = 1;
9322 retry_count += 1;
9323 }
9324 }
9325 goto out1;
9326 }
9327 }
9328
9329 /* Release the 'mnt_fvp' now that it is no longer needed. */
9330 if (mnt_fvp != NULLVP) {
9331 vnode_put(vp: mnt_fvp);
9332 mnt_fvp = NULLVP;
9333 }
9334
9335 // save these off so we can later verify that fvp is the same
9336 oname = fvp->v_name;
9337 oparent = fvp->v_parent;
9338
9339skipped_lookup:
9340#if CONFIG_FILE_LEASES
9341 /* Lease break needed for source's parent dir? */
9342 vnode_breakdirlease(vp: fdvp, false, O_WRONLY);
9343
9344 /* Lease break needed for target's parent dir? */
9345 vnode_breakdirlease(vp: tdvp, false, O_WRONLY);
9346#endif
9347
9348 error = vn_rename(fdvp, fvpp: &fvp, fcnp: &fromnd->ni_cnd, fvap,
9349 tdvp, tvpp: &tvp, tcnp: &tond->ni_cnd, tvap,
9350 flags, ctx);
9351
9352 if (holding_mntlock) {
9353 /*
9354 * we can drop our serialization
9355 * lock now
9356 */
9357 mount_unlock_renames(locked_mp);
9358 mount_drop(locked_mp, 0);
9359 holding_mntlock = 0;
9360 }
9361 if (error) {
9362 if (error == EDATALESS) {
9363 /*
9364 * If we've been here before, something has gone
9365 * horribly wrong and we should just get out lest
9366 * we spiral around the drain forever.
9367 */
9368 if (flags & VFS_RENAME_DATALESS) {
9369 error = EIO;
9370 goto out1;
9371 }
9372
9373 /*
9374 * The object we're renaming is dataless (or has a
9375 * dataless descendent) and requires materialization
9376 * before the rename occurs. But we're holding the
9377 * mount point's rename lock, so it's not safe to
9378 * make the upcall.
9379 *
9380 * In this case, we release the lock (above), perform
9381 * the materialization, and start the whole thing over.
9382 */
9383 error = vfs_materialize_reparent(vp: fvp, tdvp);
9384 if (error == 0) {
9385 /*
9386 * The next time around we need to tell the
9387 * file system that the materializtaion has
9388 * been performed.
9389 */
9390 flags |= VFS_RENAME_DATALESS;
9391 do_retry = 1;
9392 }
9393 goto out1;
9394 }
9395 if (error == EKEEPLOOKING) {
9396 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9397 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9398 panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
9399 }
9400 }
9401
9402 fromnd->ni_vp = fvp;
9403 tond->ni_vp = tvp;
9404
9405 goto continue_lookup;
9406 }
9407
9408 /*
9409 * We may encounter a race in the VNOP where the destination didn't
9410 * exist when we did the namei, but it does by the time we go and
9411 * try to create the entry. In this case, we should re-drive this rename
9412 * call from the top again. Currently, only HFS bubbles out ERECYCLE,
9413 * but other filesystems susceptible to this race could return it, too.
9414 */
9415 if (error == ERECYCLE) {
9416 if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
9417 do_retry = 1;
9418 retry_count += 1;
9419 } else {
9420 printf("rename retry limit due to ERECYCLE reached\n");
9421 error = ENOENT;
9422 }
9423 }
9424
9425 /*
9426 * For compound VNOPs, the authorization callback may return
9427 * ENOENT in case of racing hardlink lookups hitting the name
9428 * cache, redrive the lookup.
9429 */
9430 if (batched && error == ENOENT) {
9431 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9432 do_retry = 1;
9433 retry_count += 1;
9434 }
9435 }
9436
9437 goto out1;
9438 }
9439
9440 /* call out to allow 3rd party notification of rename.
9441 * Ignore result of kauth_authorize_fileop call.
9442 */
9443 kauth_authorize_fileop(credential: vfs_context_ucred(ctx),
9444 KAUTH_FILEOP_RENAME,
9445 arg0: (uintptr_t)from_name, arg1: (uintptr_t)to_name);
9446 if (flags & VFS_RENAME_SWAP) {
9447 kauth_authorize_fileop(credential: vfs_context_ucred(ctx),
9448 KAUTH_FILEOP_RENAME,
9449 arg0: (uintptr_t)to_name, arg1: (uintptr_t)from_name);
9450 }
9451
9452#if CONFIG_FSE
9453 if (from_name != NULL && to_name != NULL) {
9454 if (from_truncated || to_truncated) {
9455 // set it here since only the from_finfo gets reported up to user space
9456 from_finfo.mode |= FSE_TRUNCATED_PATH;
9457 }
9458
9459 if (tvap && tvp) {
9460 vnode_get_fse_info_from_vap(vp: tvp, fse: &to_finfo, vap: tvap);
9461 }
9462 if (fvap) {
9463 vnode_get_fse_info_from_vap(vp: fvp, fse: &from_finfo, vap: fvap);
9464 }
9465
9466 if (tvp) {
9467 add_fsevent(FSE_RENAME, ctx,
9468 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9469 FSE_ARG_FINFO, &from_finfo,
9470 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9471 FSE_ARG_FINFO, &to_finfo,
9472 FSE_ARG_DONE);
9473 if (flags & VFS_RENAME_SWAP) {
9474 /*
9475 * Strictly speaking, swap is the equivalent of
9476 * *three* renames. FSEvents clients should only take
9477 * the events as a hint, so we only bother reporting
9478 * two.
9479 */
9480 add_fsevent(FSE_RENAME, ctx,
9481 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9482 FSE_ARG_FINFO, &to_finfo,
9483 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9484 FSE_ARG_FINFO, &from_finfo,
9485 FSE_ARG_DONE);
9486 }
9487 } else {
9488 add_fsevent(FSE_RENAME, ctx,
9489 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9490 FSE_ARG_FINFO, &from_finfo,
9491 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9492 FSE_ARG_DONE);
9493 }
9494 }
9495#endif /* CONFIG_FSE */
9496
9497 /*
9498 * update filesystem's mount point data
9499 */
9500 if (mntrename) {
9501 char *cp, *pathend, *mpname;
9502 char * tobuf;
9503 struct mount *mp;
9504 int maxlen;
9505 size_t len = 0;
9506
9507 mp = fvp->v_mountedhere;
9508
9509 if (vfs_busy(mp, LK_NOWAIT)) {
9510 error = EBUSY;
9511 goto out1;
9512 }
9513 tobuf = zalloc(view: ZV_NAMEI);
9514
9515 if (UIO_SEG_IS_USER_SPACE(segflg)) {
9516 error = copyinstr(uaddr: to, kaddr: tobuf, MAXPATHLEN, done: &len);
9517 } else {
9518 error = copystr(kfaddr: (void *)to, kdaddr: tobuf, MAXPATHLEN, done: &len);
9519 }
9520 if (!error) {
9521 /* find current mount point prefix */
9522 pathend = &mp->mnt_vfsstat.f_mntonname[0];
9523 for (cp = pathend; *cp != '\0'; ++cp) {
9524 if (*cp == '/') {
9525 pathend = cp + 1;
9526 }
9527 }
9528 /* find last component of target name */
9529 for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
9530 if (*cp == '/') {
9531 mpname = cp + 1;
9532 }
9533 }
9534
9535 /* Update f_mntonname of sub mounts */
9536 vfs_iterate(flags: 0, callout: rename_submounts_callback, arg: (void *)mp);
9537
9538 /* append name to prefix */
9539 maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
9540 bzero(s: pathend, n: maxlen);
9541
9542 strlcpy(dst: pathend, src: mpname, n: maxlen);
9543 }
9544 zfree(ZV_NAMEI, tobuf);
9545
9546 vfs_unbusy(mp);
9547
9548 vfs_event_signal(NULL, VQ_UPDATE, data: (intptr_t)NULL);
9549 }
9550 /*
9551 * fix up name & parent pointers. note that we first
9552 * check that fvp has the same name/parent pointers it
9553 * had before the rename call... this is a 'weak' check
9554 * at best...
9555 *
9556 * XXX oparent and oname may not be set in the compound vnop case
9557 */
9558 if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
9559 int update_flags;
9560
9561 update_flags = VNODE_UPDATE_NAME;
9562
9563 if (fdvp != tdvp) {
9564 update_flags |= VNODE_UPDATE_PARENT;
9565 }
9566
9567 vnode_update_identity(vp: fvp, dvp: tdvp, name: tond->ni_cnd.cn_nameptr, name_len: tond->ni_cnd.cn_namelen, name_hashval: tond->ni_cnd.cn_hash, flags: update_flags);
9568 }
9569out1:
9570 /*
9571 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
9572 * skipped earlier as no actual rename was performed.
9573 */
9574 if (vn_authorize_skipped && error == 0) {
9575 error = vn_authorize_renamex_with_paths(fdvp, fvp,
9576 fcnp: &fromnd->ni_cnd, from_path: from_name, tdvp, tvp, tcnp: &tond->ni_cnd, to_path: to_name, ctx,
9577 flags, NULL);
9578 if (error && error == ENOENT) {
9579 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9580 do_retry = 1;
9581 retry_count += 1;
9582 }
9583 }
9584 }
9585 if (to_name != NULL) {
9586 RELEASE_PATH(to_name);
9587 to_name = NULL;
9588 }
9589 if (to_name_no_firmlink != NULL) {
9590 RELEASE_PATH(to_name_no_firmlink);
9591 to_name_no_firmlink = NULL;
9592 }
9593 if (from_name != NULL) {
9594 RELEASE_PATH(from_name);
9595 from_name = NULL;
9596 }
9597 if (from_name_no_firmlink != NULL) {
9598 RELEASE_PATH(from_name_no_firmlink);
9599 from_name_no_firmlink = NULL;
9600 }
9601 if (holding_mntlock) {
9602 mount_unlock_renames(locked_mp);
9603 mount_drop(locked_mp, 0);
9604 holding_mntlock = 0;
9605 }
9606 if (tdvp) {
9607 /*
9608 * nameidone has to happen before we vnode_put(tdvp)
9609 * since it may need to release the fs_nodelock on the tdvp
9610 */
9611 nameidone(tond);
9612
9613 if (tvp) {
9614 vnode_put(vp: tvp);
9615 }
9616 vnode_put(vp: tdvp);
9617 }
9618 if (fdvp) {
9619 /*
9620 * nameidone has to happen before we vnode_put(fdvp)
9621 * since it may need to release the fs_nodelock on the fdvp
9622 */
9623 nameidone(fromnd);
9624
9625 if (fvp) {
9626 vnode_put(vp: fvp);
9627 }
9628 vnode_put(vp: fdvp);
9629 }
9630 if (mnt_fvp != NULLVP) {
9631 vnode_put(vp: mnt_fvp);
9632 }
9633 /*
9634 * If things changed after we did the namei, then we will re-drive
9635 * this rename call from the top.
9636 */
9637 if (do_retry) {
9638 do_retry = 0;
9639 goto retry;
9640 }
9641
9642 kfree_type(typeof(*__rename_data), __rename_data);
9643 return error;
9644}
9645
9646int
9647rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9648{
9649 return renameat_internal(ctx: vfs_context_current(), AT_FDCWD, from: uap->from,
9650 AT_FDCWD, to: uap->to, segflg: UIO_USERSPACE, uflags: 0);
9651}
9652
9653int
9654renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9655{
9656 if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9657 return EINVAL;
9658 }
9659
9660 if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9661 return EINVAL;
9662 }
9663
9664 return renameat_internal(ctx: vfs_context_current(), fromfd: uap->fromfd, from: uap->from,
9665 tofd: uap->tofd, to: uap->to, segflg: UIO_USERSPACE, uflags: uap->flags);
9666}
9667
9668int
9669renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9670{
9671 return renameat_internal(ctx: vfs_context_current(), fromfd: uap->fromfd, from: uap->from,
9672 tofd: uap->tofd, to: uap->to, segflg: UIO_USERSPACE, uflags: 0);
9673}
9674
9675/*
9676 * Make a directory file.
9677 *
9678 * Returns: 0 Success
9679 * EEXIST
9680 * namei:???
9681 * vnode_authorize:???
9682 * vn_create:???
9683 */
9684/* ARGSUSED */
9685static int
9686mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
9687 enum uio_seg segflg)
9688{
9689 vnode_t vp, dvp;
9690 int error;
9691 int update_flags = 0;
9692 int batched;
9693 struct nameidata nd;
9694
9695 AUDIT_ARG(mode, vap->va_mode);
9696 NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
9697 path, ctx);
9698 nd.ni_cnd.cn_flags |= WILLBEDIR;
9699 nd.ni_flag = NAMEI_COMPOUNDMKDIR;
9700
9701continue_lookup:
9702 error = nameiat(ndp: &nd, dirfd: fd);
9703 if (error) {
9704 return error;
9705 }
9706 dvp = nd.ni_dvp;
9707 vp = nd.ni_vp;
9708
9709 if (vp != NULL) {
9710 error = EEXIST;
9711 goto out;
9712 }
9713
9714 batched = vnode_compound_mkdir_available(vp: dvp);
9715
9716 VATTR_SET(vap, va_type, VDIR);
9717
9718 /*
9719 * XXX
9720 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
9721 * only get EXISTS or EISDIR for existing path components, and not that it could see
9722 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
9723 * it will fail in a spurious manner. Need to figure out if this is valid behavior.
9724 */
9725 if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
9726 if (error == EACCES || error == EPERM) {
9727 int error2;
9728
9729 nameidone(&nd);
9730 vnode_put(vp: dvp);
9731 dvp = NULLVP;
9732
9733 /*
9734 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
9735 * rather than EACCESS if the target exists.
9736 */
9737 NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
9738 path, ctx);
9739 error2 = nameiat(ndp: &nd, dirfd: fd);
9740 if (error2) {
9741 goto out;
9742 } else {
9743 vp = nd.ni_vp;
9744 error = EEXIST;
9745 goto out;
9746 }
9747 }
9748
9749 goto out;
9750 }
9751
9752#if CONFIG_FILE_LEASES
9753 vnode_breakdirlease(vp: dvp, false, O_WRONLY);
9754#endif
9755
9756 /*
9757 * make the directory
9758 */
9759 if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
9760 if (error == EKEEPLOOKING) {
9761 nd.ni_vp = vp;
9762 goto continue_lookup;
9763 }
9764
9765 goto out;
9766 }
9767
9768 // Make sure the name & parent pointers are hooked up
9769 if (vp->v_name == NULL) {
9770 update_flags |= VNODE_UPDATE_NAME;
9771 }
9772 if (vp->v_parent == NULLVP) {
9773 update_flags |= VNODE_UPDATE_PARENT;
9774 }
9775
9776 if (update_flags) {
9777 vnode_update_identity(vp, dvp, name: nd.ni_cnd.cn_nameptr, name_len: nd.ni_cnd.cn_namelen, name_hashval: nd.ni_cnd.cn_hash, flags: update_flags);
9778 }
9779
9780#if CONFIG_FSE
9781 add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
9782#endif
9783
9784out:
9785 /*
9786 * nameidone has to happen before we vnode_put(dvp)
9787 * since it may need to release the fs_nodelock on the dvp
9788 */
9789 nameidone(&nd);
9790
9791 if (vp) {
9792 vnode_put(vp);
9793 }
9794 if (dvp) {
9795 vnode_put(vp: dvp);
9796 }
9797
9798 return error;
9799}
9800
9801/*
9802 * mkdir_extended: Create a directory; with extended security (ACL).
9803 *
9804 * Parameters: p Process requesting to create the directory
9805 * uap User argument descriptor (see below)
9806 * retval (ignored)
9807 *
9808 * Indirect: uap->path Path of directory to create
9809 * uap->mode Access permissions to set
9810 * uap->xsecurity ACL to set
9811 *
9812 * Returns: 0 Success
9813 * !0 Not success
9814 *
9815 */
9816int
9817mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
9818{
9819 int ciferror;
9820 kauth_filesec_t xsecdst;
9821 struct vnode_attr va;
9822
9823 AUDIT_ARG(owner, uap->uid, uap->gid);
9824
9825 xsecdst = NULL;
9826 if ((uap->xsecurity != USER_ADDR_NULL) &&
9827 ((ciferror = kauth_copyinfilesec(xsecurity: uap->xsecurity, xsecdestpp: &xsecdst)) != 0)) {
9828 return ciferror;
9829 }
9830
9831 VATTR_INIT(&va);
9832 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9833 if (xsecdst != NULL) {
9834 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
9835 va.va_vaflags |= VA_FILESEC_ACL;
9836 }
9837
9838 ciferror = mkdir1at(ctx: vfs_context_current(), path: uap->path, vap: &va, AT_FDCWD,
9839 segflg: UIO_USERSPACE);
9840 if (xsecdst != NULL) {
9841 kauth_filesec_free(fsp: xsecdst);
9842 }
9843 return ciferror;
9844}
9845
9846int
9847mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
9848{
9849 struct vnode_attr va;
9850
9851 VATTR_INIT(&va);
9852 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9853
9854 return mkdir1at(ctx: vfs_context_current(), path: uap->path, vap: &va, AT_FDCWD,
9855 segflg: UIO_USERSPACE);
9856}
9857
9858int
9859mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
9860{
9861 struct vnode_attr va;
9862
9863 VATTR_INIT(&va);
9864 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9865
9866 return mkdir1at(ctx: vfs_context_current(), path: uap->path, vap: &va, fd: uap->fd,
9867 segflg: UIO_USERSPACE);
9868}
9869
/*
 * Common implementation backing rmdir(2) and directory removal via
 * unlinkat(2).
 *
 * ctx:          caller's VFS context (credentials, process).
 * fd:           directory fd 'dirpath' is relative to (AT_FDCWD for cwd).
 * dirpath:      pathname of the directory to remove, interpreted per segflg.
 * segflg:       address space the path string lives in (user or system).
 * unlink_flags: VNODE_REMOVE_* modifiers (NOFOLLOW_ANY, DATALESS_DIR).
 *
 * Returns 0 on success, or an errno.
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	/* Heap-allocated because nameidata (plus optional notify attrs) is large. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
#endif /* CONFIG_FSE */
	} *__rmdir_data;
	vnode_t vp, dvp;
	int error;
	struct nameidata *ndp;
	char *path = NULL;
	char *no_firmlink_path = NULL;
	int len_path = 0;
	int len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
	struct vnode_attr *vap = NULL;
	int restart_count = 0;  /* bounds the ENOENT-race redrives */
	int batched;

	int restart_flag;
	int nofollow_any = 0;

	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
	ndp = &__rmdir_data->nd;

	/* Translate the caller-visible flag into its namei equivalent. */
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		ndp->ni_flag = NAMEI_COMPOUNDRMDIR | nofollow_any;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(ndp, dirfd: fd);
		if (error) {
			goto err_out;
		}

		dvp = ndp->ni_dvp;
		vp = ndp->ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			/* Swap-backing store may only be removed by the kernel itself. */
			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete. Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, cnp: &ndp->ni_cnd, ctx, NULL);
				if (error) {
					/* Authorization can race a lookup; redrive a bounded number of times. */
					if (error == ENOENT) {
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No leaf vnode: authorization is deferred into the compound VNOP. */
			batched = 1;

			if (!vnode_compound_rmdir_available(vp: dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info finfo = {0};

		need_event = need_fsevent(FSE_DELETE, vp: dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, fse: &finfo, ctx);
			} else {
				/* Compound case: have the FS collect notify attrs during the VNOP. */
				error = vfs_get_notify_attributes(vap: &__rmdir_data->va);
				if (error) {
					goto out;
				}

				vap = &__rmdir_data->va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			/* Capture the victim's path (with and without firmlinks) for notification. */
			if (path == NULL) {
				GET_PATH(path);
			}

			len_path = safe_getpath(dvp, leafname: ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, truncated_path: &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, leafname: ndp->ni_cnd.cn_nameptr, path: no_firmlink_path, MAXPATHLEN, truncated_path: &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

#if CONFIG_FILE_LEASES
		/* Removing an entry modifies the parent: break any directory lease first. */
		vnode_breakdirlease(vp: dvp, false, O_WRONLY);
#endif

		error = vn_rmdir(dvp, vpp: &vp, ndp, vap, ctx);
		ndp->ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* Compound VNOP wants the lookup continued with state left in ndp. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR(). So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * Only do this if the directory is actually
			 * marked as DATALESS.
			 */
			struct vnode_attr *lvap =
			    kalloc_type(struct vnode_attr, Z_WAITOK);

			VATTR_INIT(lvap);
			VATTR_WANTED(lvap, va_flags);
			if (vnode_getattr(vp, vap: lvap, ctx) == 0 &&
			    VATTR_IS_SUPPORTED(lvap, va_flags) &&
			    (lvap->va_flags & SF_DATALESS) != 0) {
				/*
				 * If this fails, we want to keep the original
				 * error.
				 */
				if (vn_remove(dvp, vpp: &vp, ndp,
				    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
					error = 0;
				}
			}
			kfree_type(struct vnode_attr, lvap);
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, vpp: &vp, ndp, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(credential: vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    arg0: (uintptr_t)vp,
				    arg1: (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, name_len: 0, name_hashval: 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, fse: &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif

#if CONFIG_MACF
			mac_vnode_notify_unlink(ctx, dvp, vp, cnp: &ndp->ni_cnd);
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(ndp);
		vnode_put(vp: dvp);

		if (vp) {
			vnode_put(vp);
		}

		if (restart_flag == 0) {
			/* NOTE(review): presumably wakes a racing rmdir in the tsleep below — confirm. */
			wakeup_one(chan: (caddr_t)vp);
			goto err_out;
		}
		/* Restarting: sleep briefly so the competing rmdir can make progress. */
		tsleep(chan: vp, PVFS, wmesg: "rm AD", timo: 1);
	} while (restart_flag != 0);

err_out:
	kfree_type(typeof(*__rmdir_data), __rmdir_data);

	return error;
}
10171
10172/*
10173 * Remove a directory file.
10174 */
10175/* ARGSUSED */
10176int
10177rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
10178{
10179 return rmdirat_internal(ctx: vfs_context_current(), AT_FDCWD,
10180 CAST_USER_ADDR_T(uap->path), segflg: UIO_USERSPACE, unlink_flags: 0);
10181}
10182
/*
 * Get direntry length padded to 8 byte alignment.
 * NOTE(review): assumes struct direntry's trailing d_name field is
 * MAXPATHLEN bytes, so the unused tail ((MAXPATHLEN-1) - namlen) is
 * subtracted before rounding up to a multiple of 8 — confirm against
 * the struct definition.
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Get dirent length padded to 4 byte alignment.
 * Same idea for struct dirent with a (__DARWIN_MAXNAMLEN + 1)-byte d_name;
 * (namelen + 1) accounts for the NUL terminator.
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the end of this dirent: address of its last byte per d_reclen. */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
10194
10195errno_t
10196vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10197 int *numdirent, vfs_context_t ctxp)
10198{
10199 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
10200 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10201 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10202 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10203 } else {
10204 size_t bufsize;
10205 void * bufptr;
10206 uio_t auio;
10207 struct direntry *entry64;
10208 struct dirent *dep;
10209 size_t bytesread;
10210 int error;
10211
10212 /*
10213 * We're here because the underlying file system does not
10214 * support direnties or we mounted denying support so we must
10215 * fall back to dirents and convert them to direntries.
10216 *
10217 * Our kernel buffer needs to be smaller since re-packing will
10218 * expand each dirent. The worse case (when the name length
10219 * is 3 or less) corresponds to a struct direntry size of 32
10220 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10221 * (4-byte aligned). So having a buffer that is 3/8 the size
10222 * will prevent us from reading more than we can pack.
10223 *
10224 * Since this buffer is wired memory, we will limit the
10225 * buffer size to a maximum of 32K. We would really like to
10226 * use 32K in the MIN(), but we use magic number 87371 to
10227 * prevent uio_resid() * 3 / 8 from overflowing.
10228 */
10229 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10230 bufptr = kalloc_data(bufsize, Z_WAITOK);
10231 if (bufptr == NULL) {
10232 return ENOMEM;
10233 }
10234
10235 auio = uio_create(a_iovcount: 1, a_offset: 0, a_spacetype: UIO_SYSSPACE, a_iodirection: UIO_READ);
10236 uio_addiov(a_uio: auio, a_baseaddr: (uintptr_t)bufptr, a_length: bufsize);
10237 auio->uio_offset = uio->uio_offset;
10238
10239 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10240
10241 dep = (struct dirent *)bufptr;
10242 bytesread = bufsize - uio_resid(a_uio: auio);
10243
10244 entry64 = kalloc_type(struct direntry, Z_WAITOK);
10245 /*
10246 * Convert all the entries and copy them out to user's buffer.
10247 */
10248 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10249 /* First check that the dirent struct up to d_name is within the buffer */
10250 if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10251 /* Check that the length of the entire dirent is within the buffer */
10252 DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10253 /* Check that the actual length including the name doesn't exceed d_reclen */
10254 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10255 printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10256 vp->v_mount->mnt_vfsstat.f_mntonname,
10257 vp->v_name ? vp->v_name : "<unknown>");
10258 error = EIO;
10259 break;
10260 }
10261
10262 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
10263
10264 bzero(s: entry64, n: enbufsize);
10265 /* Convert a dirent to a dirent64. */
10266 entry64->d_ino = dep->d_ino;
10267 entry64->d_seekoff = 0;
10268 entry64->d_reclen = (uint16_t)enbufsize;
10269 entry64->d_namlen = dep->d_namlen;
10270 entry64->d_type = dep->d_type;
10271 bcopy(src: dep->d_name, dst: entry64->d_name, n: dep->d_namlen + 1);
10272
10273 /* Move to next entry. */
10274 dep = (struct dirent *)((char *)dep + dep->d_reclen);
10275
10276 /* Copy entry64 to user's buffer. */
10277 error = uiomove(cp: (caddr_t)entry64, n: entry64->d_reclen, uio);
10278 }
10279
10280 /* Update the real offset using the offset we got from VNOP_READDIR. */
10281 if (error == 0) {
10282 uio->uio_offset = auio->uio_offset;
10283 }
10284 uio_free(a_uio: auio);
10285 kfree_data(bufptr, bufsize);
10286 kfree_type(struct direntry, entry64);
10287 return error;
10288 }
10289}
10290
10291#define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
10292
/*
 * Read a block of directory entries in a file system independent format.
 *
 * Common implementation for getdirentries() and getdirentries64(): reads
 * from the directory open on 'fd' into the user buffer 'bufp'/'bufsize',
 * returning the byte count in *bytesread and the starting file offset in
 * *offset.  'flags' selects the extended (struct direntry) layout via
 * VNODE_READDIR_EXTENDED.  Returns 0 or an errno.
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current(); /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(ctx: &context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	UIO_STACKBUF(uio_buf, 1);

get_from_fd:
	error = fp_getfvp(p: vfs_context_proc(ctx: &context), fd, resultfp: &fp, resultvp: &vp);
	if (error) {
		return error;
	}

	/*
	 * Serialize access to the fd's offset; if the fd's backing vnode
	 * changed before we got the lock, drop everything and retry.
	 */
	vn_offset_lock(fg: fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fg: fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* The directory must be open for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Silently clamp oversized requests rather than failing them. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(cred: vfs_context_ucred(ctx: &context), fg: fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx: &context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Read starting at the fd's current offset. */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(a_iovcount: 1, a_offset: loff, a_spacetype: spacetype, a_iodirection: UIO_READ, a_buf_p: &uio_buf[0], a_buffer_size: sizeof(uio_buf));
	uio_addiov(a_uio: auio, a_baseaddr: bufp, a_length: bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		/* Extended layout: vnode_readdir64() converts if the fs can't. */
		error = vnode_readdir64(vp, uio: auio, flags, eofflag, numdirent: &numdirent, ctxp: &context);
		fp->fp_glob->fg_offset = uio_offset(a_uio: auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(a_uio: auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * Nothing was read and this is a union mount: descend to the
	 * mounted-on directory, swap it into the fd, and read from there.
	 */
	if ((user_ssize_t)bufsize == uio_resid(a_uio: auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(dvp: vp, new_dvp: &uvp, ctx: &context) == 0) {
			if (vnode_ref(vp: uvp) == 0) {
				fp_set_data(fp, fg_data: uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(vp: uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(a_uio: auio);
out:
	vn_offset_unlock(fg: fp->fp_glob);
	file_drop(fd);
	return error;
}
10409
10410
10411int
10412getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
10413{
10414 off_t offset;
10415 ssize_t bytesread;
10416 int error, eofflag;
10417
10418 AUDIT_ARG(fd, uap->fd);
10419 error = getdirentries_common(fd: uap->fd, bufp: uap->buf, bufsize: uap->count,
10420 bytesread: &bytesread, offset: &offset, eofflag: &eofflag, flags: 0);
10421
10422 if (error == 0) {
10423 if (proc_is64bit(p)) {
10424 user64_long_t base = (user64_long_t)offset;
10425 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
10426 } else {
10427 user32_long_t base = (user32_long_t)offset;
10428 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
10429 }
10430 *retval = (int)bytesread;
10431 }
10432 return error;
10433}
10434
10435int
10436getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
10437{
10438 off_t offset;
10439 ssize_t bytesread;
10440 int error, eofflag;
10441 user_size_t bufsize;
10442
10443 AUDIT_ARG(fd, uap->fd);
10444
10445 /*
10446 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
10447 * then the kernel carves out the last 4 bytes to return extended
10448 * information to userspace (namely whether we reached EOF with this call).
10449 */
10450 if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10451 bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
10452 } else {
10453 bufsize = uap->bufsize;
10454 }
10455
10456 error = getdirentries_common(fd: uap->fd, bufp: uap->buf, bufsize,
10457 bytesread: &bytesread, offset: &offset, eofflag: &eofflag, VNODE_READDIR_EXTENDED);
10458
10459 if (error == 0) {
10460 *retval = bytesread;
10461 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
10462
10463 if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10464 getdirentries64_flags_t flags = 0;
10465 if (eofflag) {
10466 flags |= GETDIRENTRIES64_EOF;
10467 }
10468 error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
10469 sizeof(flags));
10470 }
10471 }
10472 return error;
10473}
10474
10475
10476/*
10477 * Set the mode mask for creation of filesystem nodes.
10478 * XXX implement xsecurity
10479 */
10480#define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
10481static int
10482umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
10483{
10484 AUDIT_ARG(mask, newmask);
10485 proc_fdlock(p);
10486 *retval = p->p_fd.fd_cmask;
10487 p->p_fd.fd_cmask = newmask & ALLPERMS;
10488 proc_fdunlock(p);
10489 return 0;
10490}
10491
/*
 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
 *
 * Parameters:	p			Process requesting to set the umask
 *		uap			User argument descriptor (see below)
 *		retval			umask of the process (parameter p)
 *
 * Indirect:	uap->newmask		umask to set
 *		uap->xsecurity		ACL to set
 *
 * Returns:	0			Success
 *		!0			Not success
 *
 */
int
umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
{
	/*
	 * NOTE(review): KAUTH_FILESEC_NONE is passed rather than the
	 * caller-supplied uap->xsecurity; umask1() marks fsec __unused.
	 */
	return umask1(p, newmask: uap->newmask, KAUTH_FILESEC_NONE, retval);
}
10511
/*
 * umask(2): set the file creation mask, leaving any existing extended
 * security state alone; the previous mask is returned via *retval.
 */
int
umask(proc_t p, struct umask_args *uap, int32_t *retval)
{
	return umask1(p, newmask: uap->newmask, UMASK_NOXSECURITY, retval);
}
10517
#define REVOKE_MOUNTED_DEVICE_ENTITLEMENT \
	"com.apple.private.vfs.revoke-mounted-device"

/*
 * Void all references to file by ripping underlying filesystem
 * away from vnode.
 */
/* ARGSUSED */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(ndp: &nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* revoke(2) is only supported on character and block devices. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* Refuse to revoke a block device that has a file system mounted on it. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* Only the device's owner or the superuser may revoke it. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, vap: &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(cred: vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(cred: vfs_context_ucred(ctx), acflag: &p->p_acflag))) {
		goto out;
	}
	/* Only bother if someone holds use counts or the vnode is aliased. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
10578
10579
10580/*
10581 * HFS/HFS PlUS SPECIFIC SYSTEM CALLS
10582 * The following system calls are designed to support features
10583 * which are specific to the HFS & HFS Plus volume formats
10584 */
10585
10586
/*
 * Obtain attribute information on objects in a directory while enumerating
 * the directory.
 *
 * Reads entries plus the requested attributes (uap->alist) from the
 * directory open on uap->fd into uap->buffer.  *uap->count is in/out:
 * requested vs. actually-returned entry count.  *retval is the eofflag.
 */
/* ARGSUSED */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag = 0;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	UIO_STACKBUF(uio_buf, 1);
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* Remember the requested count in case we restart on a union layer. */
	savecount = count;

get_from_fd:
	if ((error = fp_getfvp(p, fd, resultfp: &fp, resultvp: &vp))) {
		return error;
	}

	/*
	 * Serialize access to the fd's offset; if the fd's backing vnode
	 * changed before we got the lock, drop everything and retry.
	 */
	vn_offset_lock(fg: fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fg: fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* The directory must be open for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(cred: vfs_context_ucred(ctx),
	    fg: fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(a_iovcount: 1, a_offset: loff, a_spacetype: spacetype, a_iodirection: UIO_READ, a_buf_p: &uio_buf[0], a_buffer_size: sizeof(uio_buf));
	uio_addiov(a_uio: auio, a_baseaddr: uap->buffer, a_length: uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(a_uio: auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else { // Empty buffer
			vnode_t uvp;
			if (lookup_traverse_union(dvp: vp, new_dvp: &uvp, ctx) == 0) {
				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
					/* Swap the lower directory into the fd and restart. */
					fp_set_data(fp, fg_data: uvp);
					fp->fp_glob->fg_offset = 0; // reset index for new dir
					count = savecount;
					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
					vnode_put(vp);
					vp = uvp;
					goto unionread;
				} else {
					/* could not get a ref, can't replace in fd */
					vnode_put(vp: uvp);
				}
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(a_uio: auio); /* should be multiple of dirent, not variable */

	/* Copy out the out-parameters: entry count, directory state, base offset. */
	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag;  /* similar to getdirentries */
	error = 0;
out:
	vn_offset_unlock(fg: fp->fp_glob);
	file_drop(fd);
	return error; /* error returned earlier; retval of 0 or 1 now */
} /* end of getdirentriesattr system call */
10755
/*
 * Exchange data between two files
 *
 * Swaps the contents of uap->path1 and uap->path2 via VNOP_EXCHANGE.
 * Both must be regular files on the same volume.  On success the cached
 * vnode names/parents are swapped and fsevents/fileop listeners notified.
 */

/* ARGSUSED */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(ndp: &fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(ndp: &snd);
	if (error) {
		vnode_put(vp: fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(vp: fvp) == 0) || (vnode_isreg(vp: svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    v1: fvp, v2: svp);
	if (error) {
		goto out;
	}
#endif
	/* Caller needs read and write access to both files. */
	if (((error = vnode_authorize(vp: fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(vp: svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/* Only compute the paths when someone is listening for the result. */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, vp: fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(dvp: fvp, NULL, path: fpath, MAXPATHLEN, truncated_path: &from_truncated);
		slen = safe_getpath(dvp: svp, NULL, path: spath, MAXPATHLEN, truncated_path: &to_truncated);

#if CONFIG_FSE
		get_fse_info(vp: fvp, fse: &f_finfo, ctx);
		get_fse_info(vp: svp, fse: &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(credential: vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    arg0: (uintptr_t)fpath, arg1: (uintptr_t)spath);
		}

		/* Swap the cached names and parents so they track the new contents. */
		name_cache_lock();

		tmpname = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(vp: svp);
	vnode_put(vp: fvp);
out2:
	return error;
}
10911
10912/*
10913 * Return (in MB) the amount of freespace on the given vnode's volume.
10914 */
10915uint32_t freespace_mb(vnode_t vp);
10916
10917uint32_t
10918freespace_mb(vnode_t vp)
10919{
10920 vfs_update_vfsstat(mp: vp->v_mount, ctx: vfs_context_current(), VFS_USER_EVENT);
10921 return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
10922 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
10923}
10924
10925#if CONFIG_SEARCHFS
10926
10927/* ARGSUSED */
10928
10929int
10930searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
10931{
10932 vnode_t vp, tvp;
10933 int i, error = 0;
10934 int fserror = 0;
10935 struct nameidata nd;
10936 struct user64_fssearchblock searchblock;
10937 struct searchstate *state;
10938 struct attrlist *returnattrs;
10939 struct timeval timelimit;
10940 void *searchparams1, *searchparams2;
10941 uio_t auio = NULL;
10942 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10943 uint32_t nummatches;
10944 size_t mallocsize;
10945 uint32_t nameiflags;
10946 vfs_context_t ctx = vfs_context_current();
10947 UIO_STACKBUF(uio_buf, 1);
10948
10949 /* Start by copying in fsearchblock parameter list */
10950 if (IS_64BIT_PROCESS(p)) {
10951 error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
10952 timelimit.tv_sec = searchblock.timelimit.tv_sec;
10953 timelimit.tv_usec = searchblock.timelimit.tv_usec;
10954 } else {
10955 struct user32_fssearchblock tmp_searchblock;
10956
10957 error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
10958 // munge into 64-bit version
10959 searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
10960 searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
10961 searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
10962 searchblock.maxmatches = tmp_searchblock.maxmatches;
10963 /*
10964 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
10965 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
10966 */
10967 timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
10968 timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
10969 searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
10970 searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
10971 searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
10972 searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
10973 searchblock.searchattrs = tmp_searchblock.searchattrs;
10974 }
10975 if (error) {
10976 return error;
10977 }
10978
10979 /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
10980 */
10981 if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
10982 searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
10983 return EINVAL;
10984 }
10985
10986 /* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
10987 /* It all has to do into local memory and it's not that big so we might as well put it all together. */
10988 /* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
10989 /* block. */
10990 /* */
10991 /* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
10992 /* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
10993 /* assumes the size is still 556 bytes it will continue to work */
10994
10995 mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
10996 sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));
10997
10998 searchparams1 = kalloc_data(mallocsize, Z_WAITOK);
10999
11000 /* Now set up the various pointers to the correct place in our newly allocated memory */
11001
11002 searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
11003 returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
11004 state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));
11005
11006 /* Now copy in the stuff given our local variables. */
11007
11008 if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
11009 goto freeandexit;
11010 }
11011
11012 if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
11013 goto freeandexit;
11014 }
11015
11016 if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
11017 goto freeandexit;
11018 }
11019
11020 if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
11021 goto freeandexit;
11022 }
11023
11024 /*
11025 * When searching a union mount, need to set the
11026 * start flag at the first call on each layer to
11027 * reset state for the new volume.
11028 */
11029 if (uap->options & SRCHFS_START) {
11030 state->ss_union_layer = 0;
11031 } else {
11032 uap->options |= state->ss_union_flags;
11033 }
11034 state->ss_union_flags = 0;
11035
11036 /*
11037 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
11038 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
11039 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
11040 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
11041 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
11042 */
11043
11044 if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
11045 attrreference_t* string_ref;
11046 u_int32_t* start_length;
11047 user64_size_t param_length;
11048
11049 /* validate searchparams1 */
11050 param_length = searchblock.sizeofsearchparams1;
11051 /* skip the word that specifies length of the buffer */
11052 start_length = (u_int32_t*) searchparams1;
11053 start_length = start_length + 1;
11054 string_ref = (attrreference_t*) start_length;
11055
11056 /* ensure no negative offsets or too big offsets */
11057 if (string_ref->attr_dataoffset < 0) {
11058 error = EINVAL;
11059 goto freeandexit;
11060 }
11061 if (string_ref->attr_length > MAXPATHLEN) {
11062 error = EINVAL;
11063 goto freeandexit;
11064 }
11065
11066 /* Check for pointer overflow in the string ref */
11067 if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
11068 error = EINVAL;
11069 goto freeandexit;
11070 }
11071
11072 if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
11073 error = EINVAL;
11074 goto freeandexit;
11075 }
11076 if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
11077 error = EINVAL;
11078 goto freeandexit;
11079 }
11080 }
11081
11082 /* set up the uio structure which will contain the users return buffer */
11083 auio = uio_createwithbuffer(a_iovcount: 1, a_offset: 0, a_spacetype: spacetype, a_iodirection: UIO_READ, a_buf_p: &uio_buf[0], a_buffer_size: sizeof(uio_buf));
11084 uio_addiov(a_uio: auio, a_baseaddr: searchblock.returnbuffer, a_length: searchblock.returnbuffersize);
11085
11086 nameiflags = 0;
11087 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
11088 nameiflags |= FOLLOW;
11089 }
11090 NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
11091 UIO_USERSPACE, uap->path, ctx);
11092
11093 error = namei(ndp: &nd);
11094 if (error) {
11095 goto freeandexit;
11096 }
11097 vp = nd.ni_vp;
11098 nameidone(&nd);
11099
11100 /*
11101 * Switch to the root vnode for the volume
11102 */
11103 error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
11104 vnode_put(vp);
11105 if (error) {
11106 goto freeandexit;
11107 }
11108 vp = tvp;
11109
11110#if CONFIG_UNION_MOUNTS
11111 /*
11112 * If it's a union mount, the path lookup takes
11113 * us to the top layer. But we may need to descend
11114 * to a lower layer. For non-union mounts the layer
11115 * is always zero.
11116 */
11117 for (i = 0; i < (int) state->ss_union_layer; i++) {
11118 if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
11119 break;
11120 }
11121 tvp = vp;
11122 vp = vp->v_mount->mnt_vnodecovered;
11123 if (vp == NULL) {
11124 vnode_put(vp: tvp);
11125 error = ENOENT;
11126 goto freeandexit;
11127 }
11128 error = vnode_getwithref(vp);
11129 vnode_put(vp: tvp);
11130 if (error) {
11131 goto freeandexit;
11132 }
11133 }
11134#endif /* CONFIG_UNION_MOUNTS */
11135
11136#if CONFIG_MACF
11137 error = mac_vnode_check_searchfs(ctx, vp, returnattrs, searchattrs: &searchblock.searchattrs);
11138 if (error) {
11139 vnode_put(vp);
11140 goto freeandexit;
11141 }
11142#endif
11143
11144
11145 /*
11146 * If searchblock.maxmatches == 0, then skip the search. This has happened
11147 * before and sometimes the underlying code doesnt deal with it well.
11148 */
11149 if (searchblock.maxmatches == 0) {
11150 nummatches = 0;
11151 goto saveandexit;
11152 }
11153
11154 /*
11155 * Allright, we have everything we need, so lets make that call.
11156 *
11157 * We keep special track of the return value from the file system:
11158 * EAGAIN is an acceptable error condition that shouldn't keep us
11159 * from copying out any results...
11160 */
11161
11162 fserror = VNOP_SEARCHFS(vp,
11163 searchparams1,
11164 searchparams2,
11165 &searchblock.searchattrs,
11166 (uint32_t)searchblock.maxmatches,
11167 &timelimit,
11168 returnattrs,
11169 &nummatches,
11170 (uint32_t)uap->scriptcode,
11171 (uint32_t)uap->options,
11172 auio,
11173 (struct searchstate *) &state->ss_fsstate,
11174 ctx);
11175
11176#if CONFIG_UNION_MOUNTS
11177 /*
11178 * If it's a union mount we need to be called again
11179 * to search the mounted-on filesystem.
11180 */
11181 if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
11182 state->ss_union_flags = SRCHFS_START;
11183 state->ss_union_layer++; // search next layer down
11184 fserror = EAGAIN;
11185 }
11186#endif /* CONFIG_UNION_MOUNTS */
11187
11188saveandexit:
11189
11190 vnode_put(vp);
11191
11192 /* Now copy out the stuff that needs copying out. That means the number of matches, the
11193 * search state. Everything was already put into he return buffer by the vop call. */
11194
11195 if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
11196 goto freeandexit;
11197 }
11198
11199 if ((error = suulong(addr: uap->nummatches, ulongword: (uint64_t)nummatches)) != 0) {
11200 goto freeandexit;
11201 }
11202
11203 error = fserror;
11204
11205freeandexit:
11206
11207 kfree_data(searchparams1, mallocsize);
11208
11209 return error;
11210} /* end of searchfs system call */
11211
11212#else /* CONFIG_SEARCHFS */
11213
/* searchfs(2) stub when CONFIG_SEARCHFS is disabled: always unsupported. */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
11219
11220#endif /* CONFIG_SEARCHFS */
11221
11222
11223#if CONFIG_DATALESS_FILES
11224
11225/*
11226 * === Namespace Resolver Up-call Mechanism ===
11227 *
11228 * When I/O is performed to a dataless file or directory (read, write,
11229 * lookup-in, etc.), the file system performs an upcall to the namespace
11230 * resolver (filecoordinationd) to materialize the object.
11231 *
11232 * We need multiple up-calls to be in flight at once, and we need these
11233 * up-calls to be interruptible, thus the following implementation:
11234 *
11235 * => The nspace_resolver_request represents the in-kernel request state.
11236 * It contains a request ID, storage space for the errno code returned
11237 * by filecoordinationd, and flags.
11238 *
11239 * => The request ID is simply a global monotonically incrementing 32-bit
11240 * number. Outstanding requests are stored in a hash table, and the
11241 * hash function is extremely simple.
11242 *
11243 * => When an upcall is to be made to filecoordinationd, a request structure
11244 * is allocated on the stack (it is small, and needs to live only during
11245 * the duration of the call to resolve_nspace_item_ext()). It is
11246 * initialized and inserted into the table. Some backpressure from
 *    filecoordinationd is applied by limiting the number of entries that
11248 * can be inserted into the table (and thus limiting the number of
11249 * outstanding requests issued to filecoordinationd); waiting for an
11250 * available slot is interruptible.
11251 *
11252 * => Once the request has been inserted into the table, the up-call is made
11253 * to filecoordinationd via a MiG-generated stub. The up-call returns
11254 * immediately and filecoordinationd processes the request asynchronously.
11255 *
 * => The caller now waits for the request to complete.  This is achieved by
11257 * sleeping on the address of the request structure and waiting for
11258 * filecoordinationd to mark the request structure as complete. This
11259 * is an interruptible sleep call; if interrupted, the request structure
11260 * is removed from the table and EINTR is returned to the caller. If
11261 * this occurs, an advisory up-call is made to filecoordinationd with
11262 * the request ID to indicate that the request can be aborted or
11263 * de-prioritized at the discretion of filecoordinationd.
11264 *
11265 * => When filecoordinationd has completed the request, it signals completion
11266 * by writing to the vfs.nspace.complete sysctl node. Only a process
11267 * decorated as a namespace resolver can write to this sysctl node. The
11268 * value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11269 * The request ID is looked up in the table, and if the request is found,
11270 * the error code is stored in the request structure and a wakeup()
11271 * issued on the address of the request structure. If the request is not
11272 * found, we simply drop the completion notification, assuming that the
11273 * caller was interrupted.
11274 *
11275 * => When the waiting thread wakes up, it extracts the error code from the
11276 * request structure, removes the request from the table, and returns the
11277 * error code to the calling function. Fini!
11278 */
11279
/*
 * In-kernel state for one outstanding materialization request.  The
 * structure lives on the requesting thread's stack for the duration of
 * the up-call (see the big comment above), linked into the global
 * request hash table while outstanding.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* hash-bucket linkage */
	vnode_t r_vp;           /* object being materialized */
	vnode_t r_tdvp;         /* rename destination dir, or NULL */
	uint32_t r_req_id;      /* ID used to match completions */
	int r_resolver_error;   /* errno reported back by the resolver */
	int r_flags;            /* RRF_* flags, protected by NSPACE_REQ_LOCK */
};

#define RRF_COMPLETE 0x0001   /* request has completed; waiter may proceed */
#define RRF_COMPLETING 0x0002 /* completion handler still using 'req' */

/*
 * Completion tuple written by the resolver via vfs.nspace.complete.
 * orig_gencount / orig_syncroot are optional namespace-shape criteria
 * (0 means "not specified").
 */
struct nspace_resolver_completion_data {
	uint32_t req_id;          /* which request completed */
	int32_t resolver_error;   /* errno result for the request */
	uint64_t orig_gencount;   /* expected recursive gencount, or 0 */
	uint64_t orig_syncroot;   /* expected sync-root ID, or 0 */
};
11298
/*
 * Allocate the next request ID by atomically bumping a global 32-bit
 * counter.  IDs may eventually wrap; uniqueness among the (bounded set
 * of) outstanding requests is what matters here.
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	return OSAddAtomic(1, &next_req_id);
}
11306
#define NSPACE_RESOLVER_REQ_HASHSIZE 32 /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256 /* XXX tune */

/* Hash table of outstanding resolver requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
/* Current number of table entries; bounded by MAX_OUTSTANDING. */
static u_int nspace_resolver_request_count;
/* True when some thread sleeps waiting for a free request slot. */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
/* Mutex guarding the table, the count, and per-request r_flags. */
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

/* Acquire/release the request-table mutex. */
#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Map a request ID to its hash bucket (hashmask is power-of-2 - 1). */
#define NSPACE_RESOLVER_HASH(req_id) \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	nspace_resolver_request_hashmask])
11327
11328static struct nspace_resolver_request *
11329nspace_resolver_req_lookup(uint32_t req_id, bool skip_completing)
11330{
11331 struct nspace_resolver_requesthead *bucket;
11332 struct nspace_resolver_request *req;
11333
11334 bucket = NSPACE_RESOLVER_HASH(req_id);
11335 LIST_FOREACH(req, bucket, r_hashlink) {
11336 if (req->r_req_id == req_id) {
11337 /*
11338 * If this request already has a completion
11339 * pending, don't return it again.
11340 */
11341 if ((req->r_flags & RRF_COMPLETING) != 0 &&
11342 skip_completing) {
11343 req = NULL;
11344 }
11345 return req;
11346 }
11347 }
11348
11349 return NULL;
11350}
11351
/*
 * Insert 'req' into the request table, taking one of the
 * NSPACE_RESOLVER_MAX_OUTSTANDING slots.  If no slot is free, sleep
 * interruptibly until one is.  Returns 0 on success or the msleep()
 * error (e.g. EINTR) if the wait was interrupted.
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	NSPACE_REQ_LOCK();

	/* Back-pressure: bound the number of outstanding up-calls. */
	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		nspace_resolver_request_wait_slot = true;
		error = msleep(chan: &nspace_resolver_request_count,
		    mtx: &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, wmesg: "nspacerq", NULL);
		if (error) {
			NSPACE_REQ_UNLOCK();
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id, false) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	NSPACE_REQ_UNLOCK();

	return 0;
}
11383
/*
 * Block until any in-progress completion handler is done with 'req'.
 * Caller holds NSPACE_REQ_LOCK (msleep drops and retakes it).
 */
static void
nspace_resolver_req_wait_pending_completion(struct nspace_resolver_request *req)
{
	/*
	 * If a completion is in-progress, we have to wait for the
	 * completion handler to finish because it's still using 'req',
	 * which is allocated on our stack a couple of frames up.
	 */
	while ((req->r_flags & RRF_COMPLETING) != 0) {
		(void) msleep(chan: req, mtx: &nspace_resolver_request_hash_mutex,
		    PVFS, wmesg: "nspacecmplt", NULL);
	}
}
11397
/*
 * Remove 'req' from the request table, wake any thread waiting for a
 * free slot, wait out any in-flight completion handler, and drop
 * NSPACE_REQ_LOCK.  On return 'req' is no longer referenced by the
 * resolver machinery.
 */
static void
nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;

	/* We're called with NSPACE_REQ_LOCK held. */

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert((req->r_flags & RRF_COMPLETING) == 0);
	assert(nspace_resolver_req_lookup(req->r_req_id, false) != NULL);
#endif /* DIAGNOSTIC */
	LIST_REMOVE(req, r_hashlink);
	nspace_resolver_request_count--;

	/* A slot just freed up; notify waiters in req_add(). */
	if (nspace_resolver_request_wait_slot) {
		nspace_resolver_request_wait_slot = false;
		wakeup(chan: &nspace_resolver_request_count);
	}

	/* Make sure no completion handler still references 'req'. */
	nspace_resolver_req_wait_pending_completion(req);

	NSPACE_REQ_UNLOCK();
}
11422
/*
 * Convenience wrapper: take NSPACE_REQ_LOCK, then remove 'req' from
 * the table (which also drops the lock).
 */
static void
nspace_resolver_req_remove(struct nspace_resolver_request *req)
{
	NSPACE_REQ_LOCK();
	nspace_resolver_req_remove_and_unlock(req);
}
11429
/*
 * Send an advisory cancellation for 'req_id' to filecoordinationd.
 * Best-effort only: the resolver may use it to abort or de-prioritize
 * the request, and all failures here are ignored.
 */
static void
nspace_resolver_req_cancel(uint32_t req_id)
{
	kern_return_t kr;
	mach_port_t mp;

	// Failures here aren't fatal -- the cancellation message
	// sent to the resolver is merely advisory.

	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
		return;
	}

	kr = send_nspace_resolve_cancel(nspace_handler_port: mp, req_id);
	if (kr != KERN_SUCCESS) {
		os_log_error(OS_LOG_DEFAULT,
		    "NSPACE send_nspace_resolve_cancel failure: %d", kr);
	}

	/* Drop the send right obtained above. */
	ipc_port_release_send(port: mp);
}
11452
/*
 * Wait (interruptibly) for the resolver to complete 'req'.  On return
 * the request has been removed from the table.  If the sleep is
 * interrupted, the request is failed locally (EINTR or ETIMEDOUT) and
 * an advisory cancel message is sent to the resolver.  Returns the
 * request's resolver errno.
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(chan: req, mtx: &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, wmesg: "nspace", NULL);
		if (error && error != ERESTART) {
			/* Interrupted; fail the request ourselves. */
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove_and_unlock(req);

	/*
	 * It's safe to continue referencing 'req' here because it's
	 * allocated on our caller's stack.
	 */

	if (send_cancel_message) {
		nspace_resolver_req_cancel(req_id: req->r_req_id);
	}

	return req->r_resolver_error;
}
11485
/*
 * Record the resolver's result in 'req', clear the in-progress flag,
 * set RRF_COMPLETE, and wake the thread sleeping in req_wait().
 * Caller holds NSPACE_REQ_LOCK.
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags = (req->r_flags & ~RRF_COMPLETING) | RRF_COMPLETE;
	wakeup(chan: req);
}
11495
/*
 * Mark 'req' as having a completion in progress so that nothing else
 * touches it while the handler drops NSPACE_REQ_LOCK to do I/O.
 * Caller holds NSPACE_REQ_LOCK.
 */
static void
nspace_resolver_req_mark_completion_pending(struct nspace_resolver_request *req)
{
	req->r_flags |= RRF_COMPLETING;
}
11501
/*
 * Process a completion notification from the resolver (delivered via
 * the vfs.nspace.complete sysctl).  Looks up the request by ID,
 * optionally validates the namespace-shape criteria supplied by the
 * resolver (recursive gencount and/or APFS sync-root ID), then marks
 * the request complete and wakes the waiter.
 */
static void
nspace_resolver_req_completed(const struct nspace_resolver_completion_data *c)
{
	struct nspace_resolver_request *req;
	int error;
	struct vnode_attr va;
	vnode_t vp;

	NSPACE_REQ_LOCK();

	req = nspace_resolver_req_lookup(req_id: c->req_id, true);
	if (req == NULL) {
		/*
		 * If we don't find the request corresponding to our req_id,
		 * just drop the completion on the floor; it's likely that
		 * the requester interrupted with a signal, or it may already
		 * be completing.
		 */
		NSPACE_REQ_UNLOCK();
		return;
	}

	/*
	 * Get out now if the resolver reported an error.
	 */
	if ((error = c->resolver_error) != 0) {
		goto out;
	}

	/*
	 * If the resolver did not specify any namespace shape criteria
	 * for letting the operation proceed, then get out now.
	 */
	if (c->orig_gencount == 0 && c->orig_syncroot == 0) {
		goto out;
	}

	/*
	 * We're going to have to acquire the mount rename lock and do
	 * some I/O in order to verify the criteria. Mark the request
	 * as pending so no one else messes with it after we drop the
	 * NSPACE_REQ_LOCK.
	 */
	nspace_resolver_req_mark_completion_pending(req);
	NSPACE_REQ_UNLOCK();

	/*
	 * Lock out renames from changing the shape of the tree while
	 * we validate the criteria.
	 */
	mount_t locked_mp = req->r_vp->v_mount;
	mount_ref(locked_mp, 0);
	mount_lock_renames(locked_mp);

	if (c->orig_gencount != 0) {
		vp = req->r_vp;
		/* NOTE(review): 'error' is always 0 here (checked above). */
		if (error) {
			goto out_dropmount;
		}

		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_recursive_gencount);
		error = vnode_getattr(vp, vap: &va, ctx: vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		/* Tree shape changed since the resolver sampled it. */
		if (VATTR_NOT_RETURNED(&va, va_recursive_gencount) ||
		    va.va_recursive_gencount != c->orig_gencount) {
			printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n",
			    c->orig_gencount, va.va_recursive_gencount);
			error = EBUSY;
			goto out_dropmount;
		}
	}

	/*
	 * Ignore orig_syncroot if a destination directory wasn't specified
	 * in the request.
	 */
	if (c->orig_syncroot != 0 && (vp = req->r_tdvp) != NULL) {
		uint64_t syncroot_id;

		/* NOTE(review): 'error' is always 0 here as well. */
		if (error) {
			goto out_dropmount;
		}

#ifndef APFSIOC_GET_SYNC_ROOT
#define APFSIOC_GET_SYNC_ROOT _IOR('J', 115, uint64_t)
#endif

		error = VNOP_IOCTL(vp, APFSIOC_GET_SYNC_ROOT,
		    data: (caddr_t)&syncroot_id, fflag: 0, ctx: vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		if (syncroot_id != c->orig_syncroot) {
			printf("nspace.complete: syncroot changed! (orig %llu cur %llu)\n",
			    c->orig_syncroot, syncroot_id);
			error = EBUSY;
			goto out_dropmount;
		}
	}

out_dropmount:
	mount_unlock_renames(locked_mp);
	mount_drop(locked_mp, 0);
	/* Retake the lock before completing; falls through to 'out'. */
	NSPACE_REQ_LOCK();

out:
	nspace_resolver_req_mark_complete(req, resolver_error: error);
	NSPACE_REQ_UNLOCK();
}
11614
/* The process currently registered as the namespace resolver, or NULL. */
static struct proc *nspace_resolver_proc;
11616
11617static int
11618nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
11619{
11620 *is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
11621 p == nspace_resolver_proc) ? 1 : 0;
11622 return 0;
11623}
11624
static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);

/*
 * Register (is_resolver != 0) or unregister process 'p' as the
 * namespace resolver.  Only a root process holding the
 * dataless-resolver entitlement may do this, and only one resolver may
 * be registered at a time.  Returns 0, EPERM, or EBUSY.
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0. This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx) ||
	    !vfs_context_is_dataless_resolver(ctx)) {
		return EPERM;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			// Some other process already registered.
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
11666
11667static int
11668nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
11669{
11670 if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
11671 (p->p_vfs_iopolicy &
11672 P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
11673 *is_prevented = 1;
11674 } else {
11675 *is_prevented = 0;
11676 }
11677 return 0;
11678}
11679
/*
 * Set (is_prevented != 0) or clear the per-process "don't materialize
 * dataless files" policy.  The resolver process must always remain
 * prevented (it cannot call itself), so clearing the policy on the
 * resolver fails with EBUSY.  Returns 0 or EBUSY.
 */
static int
nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
{
	if (p->p_lflag & P_LNSPACE_RESOLVER) {
		return is_prevented ? 0 : EBUSY;
	}

	if (is_prevented) {
		OSBitAndAtomic16(mask: ~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), address: &p->p_vfs_iopolicy);
	} else {
		OSBitOrAtomic16(mask: (uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, address: &p->p_vfs_iopolicy);
	}
	return 0;
}
11694
11695static int
11696nspace_materialization_get_thread_state(int *is_prevented)
11697{
11698 uthread_t ut = current_uthread();
11699
11700 *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
11701 return 0;
11702}
11703
11704static int
11705nspace_materialization_set_thread_state(int is_prevented)
11706{
11707 uthread_t ut = current_uthread();
11708
11709 if (is_prevented) {
11710 ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
11711 } else {
11712 ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
11713 }
11714 return 0;
11715}
11716
/* the vfs.nspace branch */
SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");

/*
 * vfs.nspace.resolver handler.  Reading returns whether the calling
 * process is the registered resolver; writing 1/0 registers or
 * unregisters it (subject to the checks in
 * nspace_resolver_set_proc_state()).
 */
static int
sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
    __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	int new_value, old_value, changed = 0;
	int error;

	error = nspace_resolver_get_proc_state(p, is_resolver: &old_value);
	if (error) {
		return error;
	}

	error = sysctl_io_number(req, bigValue: old_value, valueSize: sizeof(int), pValue: &new_value,
	    changed: &changed);
	if (error == 0 && changed) {
		error = nspace_resolver_set_proc_state(p, is_resolver: new_value);
	}
	return error;
}

/* decorate this process as the dataless file resolver */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_resolver, "I", "");
11745
/*
 * vfs.nspace.prevent_materialization handler.  Reading returns whether
 * the calling process is prevented from materializing dataless files;
 * writing 1/0 sets or clears the per-process policy.
 */
static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
    __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	int new_value, old_value, changed = 0;
	int error;

	error = nspace_materialization_get_proc_state(p, is_prevented: &old_value);
	if (error) {
		return error;
	}

	error = sysctl_io_number(req, bigValue: old_value, valueSize: sizeof(int), pValue: &new_value,
	    changed: &changed);
	if (error == 0 && changed) {
		error = nspace_materialization_set_proc_state(p, is_prevented: new_value);
	}
	return error;
}

/* decorate this process as not wanting to materialize dataless files */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_prevent_materialization, "I", "");
11771
/*
 * vfs.nspace.thread_prevent_materialization handler.  Same as the
 * per-process variant above, but for the calling thread's decoration.
 */
static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
    __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
	int new_value, old_value, changed = 0;
	int error;

	error = nspace_materialization_get_thread_state(is_prevented: &old_value);
	if (error) {
		return error;
	}

	error = sysctl_io_number(req, bigValue: old_value, valueSize: sizeof(int), pValue: &new_value,
	    changed: &changed);
	if (error == 0 && changed) {
		error = nspace_materialization_set_thread_state(is_prevented: new_value);
	}
	return error;
}

/* decorate this thread as not wanting to materialize dataless files */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
11796
/*
 * vfs.nspace.complete handler.  Only the registered resolver may write
 * here.  The payload is a { req_id, errno } pair of uint32_t's,
 * optionally followed by a uint64_t gencount and a uint64_t sync-root
 * ID (both optional namespace-shape criteria; absent reads yield 0).
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	uint64_t syncroot = 0;
	int error, is_resolver, changed = 0, other_changed;

	error = nspace_resolver_get_proc_state(p, is_resolver: &is_resolver);
	if (error) {
		return error;
	}

	/* Only the decorated resolver process may post completions. */
	if (!is_resolver) {
		return EPERM;
	}

	error = sysctl_io_opaque(req, pValue: req_status, valueSize: sizeof(req_status),
	    changed: &changed);
	if (error) {
		return error;
	}

	/*
	 * Get the gencount if it was passed. Ignore errors, because
	 * it's optional.
	 */
	error = sysctl_io_opaque(req, pValue: &gencount, valueSize: sizeof(gencount),
	    changed: &other_changed);
	if (error) {
		gencount = 0;
		error = 0;
	}

	/*
	 * ...and now the syncroot ID.
	 */
	error = sysctl_io_opaque(req, pValue: &syncroot, valueSize: sizeof(syncroot),
	    changed: &other_changed);
	if (error) {
		syncroot = 0;
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		const struct nspace_resolver_completion_data cd = {
			.req_id = req_status[0],
			.resolver_error = req_status[1],
			.orig_gencount = gencount,
			.orig_syncroot = syncroot,
		};
		nspace_resolver_req_completed(c: &cd);
	}
	return error;
}

/* Resolver reports completed reqs here. */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_complete, "-", "");
11864
11865#endif /* CONFIG_DATALESS_FILES */
11866
/*
 * Parameters that are referenced only when CONFIG_DATALESS_FILES is
 * enabled are marked __unused in the stub configuration.
 */
#if CONFIG_DATALESS_FILES
#define __no_dataless_unused /* nothing */
#else
#define __no_dataless_unused __unused
#endif
11872
11873int
11874vfs_context_dataless_materialization_is_prevented(
11875 vfs_context_t const ctx __no_dataless_unused)
11876{
11877#if CONFIG_DATALESS_FILES
11878 proc_t const p = vfs_context_proc(ctx);
11879 thread_t const t = vfs_context_thread(ctx);
11880 uthread_t const ut = t ? get_bsdthread_info(t) : NULL;
11881
11882 /*
11883 * Kernel context ==> return EDEADLK, as we would with any random
11884 * process decorated as no-materialize.
11885 */
11886 if (ctx == vfs_context_kernel()) {
11887 return EDEADLK;
11888 }
11889
11890 /*
11891 * If the process has the dataless-manipulation entitlement,
11892 * materialization is prevented, and depending on the kind
11893 * of file system operation, things get to proceed as if the
11894 * object is not dataless.
11895 */
11896 if (vfs_context_is_dataless_manipulator(ctx)) {
11897 return EJUSTRETURN;
11898 }
11899
11900 /*
11901 * Per-thread decorations override any process-wide decorations.
11902 * (Foundation uses this, and this overrides even the dataless-
11903 * manipulation entitlement so as to make API contracts consistent.)
11904 */
11905 if (ut != NULL) {
11906 if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
11907 return EDEADLK;
11908 }
11909 if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
11910 return 0;
11911 }
11912 }
11913
11914 /*
11915 * If the process's iopolicy specifies that dataless files
11916 * can be materialized, then we let it go ahead.
11917 */
11918 if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
11919 return 0;
11920 }
11921#endif /* CONFIG_DATALESS_FILES */
11922
11923 /*
11924 * The default behavior is to not materialize dataless files;
11925 * return to the caller that deadlock was detected.
11926 */
11927 return EDEADLK;
11928}
11929
/*
 * One-time initialization of the resolver request hash table.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, hashmask: &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
11939
/*
 * Called when the resolver process exits (or resigns via the sysctl).
 * If 'p' is the registered resolver, fail every outstanding request
 * with ETIMEDOUT (waking their waiters) and clear the registration.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Sweep all buckets; requests stay linked until their
		 * waiters remove them. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				/* Let in-flight completions finish first. */
				nspace_resolver_req_wait_pending_completion(req);
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
11966
/* Entitlements gating the resolver and manipulation roles. */
#define DATALESS_RESOLVER_ENTITLEMENT \
	"com.apple.private.vfs.dataless-resolver"
#define DATALESS_MANIPULATION_ENTITLEMENT \
	"com.apple.private.vfs.dataless-manipulation"

#if CONFIG_DATALESS_FILES
/*
 * Return TRUE if the vfs context is associated with the dataless
 * resolver (i.e. its task holds the dataless-resolver entitlement).
 */
static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
{
	return IOTaskHasEntitlement(task: vfs_context_task(ctx),
	    DATALESS_RESOLVER_ENTITLEMENT);
}
#endif /* CONFIG_DATALESS_FILES */
11984
11985/*
11986 * Return TRUE if the vfs context is associated with a process entitled
11987 * for dataless manipulation.
11988 *
11989 * XXX Arguably belongs in vfs_subr.c, but is here because of the
11990 * complication around CONFIG_DATALESS_FILES.
11991 */
11992boolean_t
11993vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
11994{
11995#if CONFIG_DATALESS_FILES
11996 task_t task = vfs_context_task(ctx);
11997 return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
11998 IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
11999#else
12000 return false;
12001#endif /* CONFIG_DATALESS_FILES */
12002}
12003
#if CONFIG_DATALESS_FILES
/*
 * Debug-log that the current process was denied materialization of
 * 'vp' while attempting operation 'op'.  DEVELOPMENT builds also log
 * the vnode's path.
 */
static void
log_materialization_prevented(vnode_t vp, uint64_t op)
{
	char p_name[MAXCOMLEN + 1];
	char *vntype;
	proc_selfname(buf: &p_name[0], size: sizeof(p_name));

	/* Map the vnode type to a short tag for the log line. */
	if (vp->v_type == VREG) {
		vntype = "File";
	} else if (vp->v_type == VDIR) {
		vntype = "Dir";
	} else if (vp->v_type == VLNK) {
		vntype = "SymLink";
	} else {
		vntype = "Other";
	}

#if DEVELOPMENT
	char *path = NULL;
	int len;

	path = get_pathbuff();
	len = MAXPATHLEN;
	if (path) {
		/* Best effort; ignore lookup failures. */
		vn_getpath(vp, path, &len);
	}

	os_log_debug(OS_LOG_DEFAULT,
	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) path: %s",
	    p_name, proc_selfpid(),
	    op, vntype, path ? path : "<unknown-path>");
	if (path) {
		release_pathbuff(path);
	}
#else
	os_log_debug(OS_LOG_DEFAULT,
	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
	    p_name, proc_selfpid(),
	    op, vntype);
#endif
}
#endif /* CONFIG_DATALESS_FILES */
12047
12048static int
12049vfs_materialize_item(
12050 vnode_t vp __no_dataless_unused,
12051 uint32_t op __no_dataless_unused,
12052 int64_t offset __no_dataless_unused,
12053 int64_t size __no_dataless_unused,
12054 char *lookup_name __no_dataless_unused,
12055 size_t const namelen __no_dataless_unused,
12056 vnode_t tdvp __no_dataless_unused)
12057{
12058#if CONFIG_DATALESS_FILES
12059 kern_return_t kern_ret;
12060 mach_port_t mach_port;
12061 char *path = NULL;
12062 vfs_context_t context;
12063 int path_len;
12064 int error;
12065 audit_token_t atoken;
12066 enum vtype vp_vtype;
12067
12068 /* Swap files are special; ignore them */
12069 if (vnode_isswap(vp)) {
12070 return 0;
12071 }
12072
12073 /*
12074 * NAMESPACE_HANDLER_SNAPSHOT_EVENT and NAMESPACE_HANDLER_TRACK_EVENT
12075 * are no longer used nor supported.
12076 */
12077 if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
12078 os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
12079 return ENOTSUP;
12080 }
12081 if (op & NAMESPACE_HANDLER_TRACK_EVENT) {
12082 os_log_debug(OS_LOG_DEFAULT, "NSPACE TRACK not handled");
12083 return ENOTSUP;
12084 }
12085
12086 /* Normalize 'op'. */
12087 op &= ~NAMESPACE_HANDLER_EVENT_TYPE_MASK;
12088
12089 /*
12090 * To-directory is only meaningful for rename operations;
12091 * ignore it if someone handed one to us unexpectedly.
12092 */
12093 if (op != NAMESPACE_HANDLER_RENAME_OP) {
12094 tdvp = NULL;
12095 }
12096
12097 context = vfs_context_current();
12098
12099 /* Remember this for later. */
12100 vp_vtype = vnode_vtype(vp);
12101
12102 error = vfs_context_dataless_materialization_is_prevented(ctx: context);
12103 if (error) {
12104 log_materialization_prevented(vp, op);
12105 goto out_check_errors;
12106 }
12107
12108 kern_ret = host_get_filecoordinationd_port(host_priv_self(),
12109 &mach_port);
12110 if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
12111 os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
12112 /*
12113 * Treat this like being unable to access the backing store
12114 * server.
12115 */
12116 return ETIMEDOUT;
12117 }
12118
12119 int path_alloc_len = MAXPATHLEN;
12120 do {
12121 path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
12122 if (path == NULL) {
12123 return ENOMEM;
12124 }
12125
12126 path_len = path_alloc_len;
12127 error = vn_getpath(vp, pathbuf: path, len: &path_len);
12128 if (error == 0) {
12129 break;
12130 } else if (error == ENOSPC) {
12131 kfree_data(path, path_alloc_len);
12132 path = NULL;
12133 } else {
12134 goto out_release_port;
12135 }
12136 } while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) && path_alloc_len <= FSGETPATH_MAXBUFLEN);
12137
12138 error = vfs_context_copy_audit_token(ctx: context, token: &atoken);
12139 if (error) {
12140 goto out_release_port;
12141 }
12142
12143 struct nspace_resolver_request req = {
12144 .r_req_id = next_nspace_req_id(),
12145 .r_vp = vp,
12146 .r_tdvp = tdvp,
12147 };
12148
12149 error = nspace_resolver_req_add(req: &req);
12150 if (error) {
12151 goto out_release_port;
12152 }
12153
12154 os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
12155
12156 if (op == NAMESPACE_HANDLER_RENAME_OP && tdvp != NULL) {
12157 char *dest_path = NULL;
12158 int dest_path_len;
12159
12160 dest_path = zalloc(view: ZV_NAMEI);
12161 dest_path_len = MAXPATHLEN;
12162
12163 error = vn_getpath(vp: tdvp, pathbuf: dest_path, len: &dest_path_len);
12164 if (error) {
12165 zfree(ZV_NAMEI, dest_path);
12166 goto out_release_port;
12167 }
12168
12169 /*
12170 * Force setting NAMESPACE_HANDLER_NSPACE_EVENT for
12171 * compatibility with existing agents in user-space
12172 * who get passed this value.
12173 */
12174 kern_ret = send_vfs_resolve_reparent_with_audit_token(nspace_handler_port: mach_port,
12175 req_id: req.r_req_id,
12176 op: op | NAMESPACE_HANDLER_NSPACE_EVENT,
12177 path, dest_path, req_atoken: atoken);
12178
12179 zfree(ZV_NAMEI, dest_path);
12180 } else if (vp_vtype == VDIR) {
12181 char *tmpname = NULL;
12182
12183 /*
12184 * If the caller provided a lookup_name *and* a name length,
12185 * then we assume the lookup_name is not NUL-terminated.
12186 * Allocate a temporary buffer in this case to provide
12187 * a NUL-terminated path name to the IPC call.
12188 */
12189 if (lookup_name != NULL && namelen != 0) {
12190 if (namelen >= PATH_MAX) {
12191 error = EINVAL;
12192 goto out_req_remove;
12193 }
12194 tmpname = zalloc(view: ZV_NAMEI);
12195 strlcpy(dst: tmpname, src: lookup_name, n: namelen + 1);
12196 lookup_name = tmpname;
12197 } else if (lookup_name != NULL) {
12198 /*
12199 * If the caller provided a lookup_name with a
12200 * zero name length, then we assume it's NUL-
12201 * terminated. Verify it has a valid length.
12202 */
12203 if (strlen(s: lookup_name) >= PATH_MAX) {
12204 error = EINVAL;
12205 goto out_req_remove;
12206 }
12207 }
12208
12209 /* (See above.) */
12210 kern_ret = send_vfs_resolve_dir_with_audit_token(nspace_handler_port: mach_port,
12211 req_id: req.r_req_id,
12212 op: op | NAMESPACE_HANDLER_NSPACE_EVENT,
12213 file_name: lookup_name == NULL ? "" : lookup_name, path, req_atoken: atoken);
12214
12215 if (tmpname != NULL) {
12216 zfree(ZV_NAMEI, tmpname);
12217
12218 /*
12219 * Poison lookup_name rather than reference
12220 * freed memory.
12221 */
12222 lookup_name = NULL;
12223 }
12224 } else {
12225 /* (See above.) */
12226 kern_ret = send_vfs_resolve_file_with_audit_token(nspace_handler_port: mach_port,
12227 req_id: req.r_req_id,
12228 op: op | NAMESPACE_HANDLER_NSPACE_EVENT,
12229 offset, size, path, req_atoken: atoken);
12230 }
12231 if (kern_ret != KERN_SUCCESS) {
12232 /*
12233 * Also treat this like being unable to access the backing
12234 * store server.
12235 */
12236 os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
12237 kern_ret);
12238 error = ETIMEDOUT;
12239 goto out_req_remove;
12240 }
12241
12242 /*
12243 * Give back the memory we allocated earlier while we wait; we
12244 * no longer need it.
12245 */
12246 kfree_data(path, path_alloc_len);
12247 path = NULL;
12248
12249 /*
12250 * Request has been submitted to the resolver. Now (interruptibly)
12251 * wait for completion. Upon requrn, the request will have been
12252 * removed from the lookup table.
12253 */
12254 error = nspace_resolver_req_wait(req: &req);
12255
12256out_release_port:
12257 if (path != NULL) {
12258 kfree_data(path, path_alloc_len);
12259 path = NULL;
12260 }
12261 ipc_port_release_send(port: mach_port);
12262
12263out_check_errors:
12264 /*
12265 * The file resolver owns the logic about what error to return
12266 * to the caller. We only need to handle a couple of special
12267 * cases here:
12268 */
12269 if (error == EJUSTRETURN) {
12270 /*
12271 * The requesting process is allowed to interact with
12272 * dataless objects. Make a couple of sanity-checks
12273 * here to ensure the action makes sense.
12274 */
12275 switch (op) {
12276 case NAMESPACE_HANDLER_WRITE_OP:
12277 case NAMESPACE_HANDLER_TRUNCATE_OP:
12278 case NAMESPACE_HANDLER_RENAME_OP:
12279 /*
12280 * This handles the case of the resolver itself
12281 * writing data to the file (or throwing it
12282 * away).
12283 */
12284 error = 0;
12285 break;
12286 case NAMESPACE_HANDLER_READ_OP:
12287 case NAMESPACE_HANDLER_LOOKUP_OP:
12288 /*
12289 * This handles the case of the resolver needing
12290 * to look up inside of a dataless directory while
12291 * it's in the process of materializing it (for
12292 * example, creating files or directories).
12293 */
12294 error = (vp_vtype == VDIR) ? 0 : EBADF;
12295 break;
12296 default:
12297 error = EBADF;
12298 break;
12299 }
12300 }
12301
12302 return error;
12303
12304out_req_remove:
12305 nspace_resolver_req_remove(req: &req);
12306 goto out_release_port;
12307#else
12308 return ENOTSUP;
12309#endif /* CONFIG_DATALESS_FILES */
12310}
12311
12312/*
12313 * vfs_materialize_file: Materialize a regular file.
12314 *
12315 * Inputs:
12316 * vp The dataless file to be materialized.
12317 *
12318 * op What kind of operation is being performed:
12319 * -> NAMESPACE_HANDLER_READ_OP
12320 * -> NAMESPACE_HANDLER_WRITE_OP
12321 * -> NAMESPACE_HANDLER_LINK_CREATE
12322 * -> NAMESPACE_HANDLER_DELETE_OP
12323 * -> NAMESPACE_HANDLER_TRUNCATE_OP
12324 * -> NAMESPACE_HANDLER_RENAME_OP
12325 *
12326 * offset offset of I/O for READ or WRITE. Ignored for
12327 * other ops.
12328 *
12329 * size size of I/O for READ or WRITE Ignored for
12330 * other ops.
12331 *
12332 * If offset or size are -1 for a READ or WRITE, then the resolver should
12333 * consider the range to be unknown.
12334 *
12335 * Upon successful return, the caller may proceed with the operation.
12336 * N.B. the file may still be "dataless" in this case.
12337 */
12338int
12339vfs_materialize_file(
12340 struct vnode *vp,
12341 uint64_t op,
12342 int64_t offset,
12343 int64_t size)
12344{
12345 if (vp->v_type != VREG) {
12346 return EFTYPE;
12347 }
12348 return vfs_materialize_item(vp, op: (uint32_t)op, offset, size, NULL, namelen: 0,
12349 NULL);
12350}
12351
12352/*
12353 * vfs_materialize_dir:
12354 *
12355 * Inputs:
12356 * vp The dataless directory to be materialized.
12357 *
12358 * op What kind of operation is being performed:
12359 * -> NAMESPACE_HANDLER_READ_OP
12360 * -> NAMESPACE_HANDLER_WRITE_OP
12361 * -> NAMESPACE_HANDLER_DELETE_OP
12362 * -> NAMESPACE_HANDLER_RENAME_OP
12363 * -> NAMESPACE_HANDLER_LOOKUP_OP
12364 *
12365 * lookup_name Name being looked up for a LOOKUP op. Ignored for
12366 * other ops. May or may not be NUL-terminated; see below.
12367 *
12368 * namelen If non-zero, then lookup_name is assumed to not be NUL-
12369 * terminated and namelen is the number of valid bytes in
12370 * lookup_name. If zero, then lookup_name is assumed to be
12371 * NUL-terminated.
12372 *
12373 * Upon successful return, the caller may proceed with the operation.
12374 * N.B. the directory may still be "dataless" in this case.
12375 */
12376int
12377vfs_materialize_dir(
12378 struct vnode *vp,
12379 uint64_t op,
12380 char *lookup_name,
12381 size_t namelen)
12382{
12383 if (vp->v_type != VDIR) {
12384 return EFTYPE;
12385 }
12386 if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
12387 return EINVAL;
12388 }
12389 return vfs_materialize_item(vp, op: (uint32_t)op, offset: 0, size: 0, lookup_name,
12390 namelen, NULL);
12391}
12392
12393/*
12394 * vfs_materialize_reparent:
12395 *
12396 * Inputs:
12397 * vp The dataless file or directory to be materialized.
12398 *
12399 * tdvp The new parent directory for the dataless file.
12400 *
12401 * Upon successful return, the caller may proceed with the operation.
12402 * N.B. the item may still be "dataless" in this case.
12403 */
12404int
12405vfs_materialize_reparent(vnode_t vp, vnode_t tdvp)
12406{
12407 if (vp->v_type != VDIR && vp->v_type != VREG) {
12408 return EFTYPE;
12409 }
12410 return vfs_materialize_item(vp, NAMESPACE_HANDLER_RENAME_OP,
12411 offset: 0, size: 0, NULL, namelen: 0, tdvp);
12412}
12413
12414#if 0
/*
 * build_volfs_path: construct a "/.vol/<fsid>/<fileid>" path for a vnode.
 * (Currently compiled out via #if 0.)
 *
 * On entry *len is the capacity of 'path'; on return it is set to the
 * generated string length (including the NUL).  Returns 0 on success,
 * -1 if the vnode's attributes could not be fetched (in which case a
 * placeholder path is written instead).
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	/* We only need the fsid and fileid to name the volfs path. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
12435#endif
12436
12437static unsigned long
12438fsctl_bogus_command_compat(unsigned long cmd)
12439{
12440 switch (cmd) {
12441 case IOCBASECMD(FSIOC_SYNC_VOLUME):
12442 return FSIOC_SYNC_VOLUME;
12443 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
12444 return FSIOC_ROUTEFS_SETROUTEID;
12445 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
12446 return FSIOC_SET_PACKAGE_EXTS;
12447 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
12448 return FSIOC_SET_FSTYPENAME_OVERRIDE;
12449 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
12450 return DISK_CONDITIONER_IOC_GET;
12451 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
12452 return DISK_CONDITIONER_IOC_SET;
12453 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
12454 return FSIOC_FIOSEEKHOLE;
12455 case IOCBASECMD(FSIOC_FIOSEEKDATA):
12456 return FSIOC_FIOSEEKDATA;
12457 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
12458 return SPOTLIGHT_IOC_GET_MOUNT_TIME;
12459 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
12460 return SPOTLIGHT_IOC_GET_LAST_MTIME;
12461 }
12462
12463 return cmd;
12464}
12465
/*
 * cas_bsdflags_setattr: setattr callback handed to chflags0() by
 * handle_flags().  Performs the compare-and-swap of BSD flags in the
 * filesystem via the FSIOC_CAS_BSDFLAGS ioctl; 'arg' is the
 * struct fsioc_cas_bsdflags passed through from the caller.
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, data: arg, FWRITE, ctx);
}
12471
/*
 * handle_sync_volume: implement FSIOC_SYNC_VOLUME.
 *
 * Syncs the mount containing 'vp'.  The caller's iocount on vp is
 * dropped here (after taking a vnode_hold and a mount iteration
 * reference) so the sync cannot deadlock against vnode iteration; on
 * return *arg_vp is set to NULL to tell the caller the vnode has been
 * released.  'data' points at the user's FSCTL_SYNC_* option word.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	/* Keep the vnode memory alive across the sync while giving up the iocount. */
	vnode_hold(vp);
	vnode_put(vp);

	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filessytem supports multiple filesytems in a
	 * partition (For eg APFS volumes in a container, it knows
	 * that the waitfor argument to VFS_SYNC are flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, vfa: &vfa, ctx: vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, arg: &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	/*
	 * NOTE(review): this tests the MNT_* wait-mode word ('arg'), not the
	 * user's option word in 'data', against FSCTL_SYNC_FULLSYNC.  Matches
	 * long-standing behavior; confirm before changing.
	 */
	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, data: (caddr_t)NULL, fflag: 0, ctx);
			vnode_put(vp);
		}
	}
	vnode_drop(vp);
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
12536
12537#if ROUTEFS
12538static int __attribute__((noinline))
12539handle_routes(user_addr_t udata)
12540{
12541 char routepath[MAXPATHLEN];
12542 size_t len = 0;
12543 int error;
12544
12545 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
12546 return error;
12547 }
12548 bzero(routepath, MAXPATHLEN);
12549 error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
12550 if (error) {
12551 return error;
12552 }
12553 error = routefs_kernel_mount(routepath);
12554 return error;
12555}
12556#endif
12557
12558static int __attribute__((noinline))
12559handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
12560{
12561 struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
12562 struct vnode_attr va;
12563 int error;
12564
12565 VATTR_INIT(&va);
12566 VATTR_SET(&va, va_flags, cas->new_flags);
12567
12568 error = chflags0(vp, va: &va, setattr: cas_bsdflags_setattr, arg: cas, ctx);
12569
12570#if CONFIG_FSE
12571 if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
12572 add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
12573 }
12574#endif
12575
12576 return error;
12577}
12578
12579static int __attribute__((noinline))
12580handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
12581{
12582 struct mount *mp = NULL;
12583 errno_t rootauth = 0;
12584
12585 mp = vp->v_mount;
12586
12587 /*
12588 * query the underlying FS and see if it reports something
12589 * sane for this vnode. If volume is authenticated via
12590 * chunklist, leave that for the caller to determine.
12591 */
12592 rootauth = VNOP_IOCTL(vp, command: cmd, data, fflag: (int)options, ctx);
12593
12594 return rootauth;
12595}
12596
12597#define SET_PACKAGE_EXTENSION_ENTITLEMENT \
12598 "com.apple.private.kernel.set-package-extensions"
12599
12600/*
12601 * Make a filesystem-specific control call:
12602 */
12603/* ARGSUSED */
/*
 * fsctl_internal: common implementation for fsctl(2) and ffsctl(2).
 *
 * Marshals the ioctl argument (copyin for IOC_IN, zeroed buffer for
 * IOC_OUT, the raw value for IOC_VOID or zero-size IOC_IN), dispatches
 * the generic FSIOC_* selectors handled in the kernel, and forwards
 * anything else to the filesystem via VNOP_IOCTL.  On success, IOC_OUT
 * data is copied back out to 'udata'.
 *
 * *arg_vp may be set to NULL on return (see handle_sync_volume) to
 * tell the caller that the vnode's iocount has already been dropped.
 */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* fsctl is not for device nodes. */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	/* Map legacy selectors (direction bits stripped) to full commands. */
	cmd = fsctl_bogus_command_compat(cmd);

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/* Small arguments live in the stack buffer; larger ones on the heap. */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	if (cmd & IOC_IN) {
		if (size) {
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree_data(memp, size);
				}
				return error;
			}
		} else {
			/* Zero-size IOC_IN: the "argument" is the raw udata value. */
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(s: data, n: size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	case FSIOC_SYNC_VOLUME:
		/* May drop the vnode and NULL out *arg_vp. */
		error = handle_sync_volume(vp, arg_vp, data, ctx);
		break;

	case FSIOC_ROUTEFS_SETROUTEID:
#if ROUTEFS
		error = handle_routes(udata);
#endif
		break;

	case FSIOC_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t num_entries;
		uint32_t max_width;

		/* Entitled tasks only. */
		if (!IOTaskHasEntitlement(task: vfs_context_task(ctx),
		    SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
			error = EPERM;
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct.  otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		if (is64bit) {
			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
			}
			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(data: ext_strings, nentries: num_entries, maxwidth: max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		mount_t mp;

		/* Superuser only. */
		if ((error = suser(cred: kauth_cred_get(), acflag: &(current_proc()->p_acflag)))) {
			break;
		}
		if ((mp = vp->v_mount) != NULL) {
			mount_lock(mp);
			if (data[0] != 0) {
				/* Non-empty name: install (or replace) the override. */
				for (int i = 0; i < MFSTYPENAMELEN; i++) {
					if (!data[i]) {
						goto continue_copy;
					}
				}
				/*
				 * Getting here means we have a user data
				 * string which has no NULL termination in
				 * its first MFSTYPENAMELEN bytes.  This is
				 * bogus, let's avoid strlcpy-ing the read
				 * data and return an error.
				 */
				error = EINVAL;
				goto unlock;
continue_copy:
				vfs_setfstypename_locked(mp, name: data);
				if (vfs_isrdonly(mp) &&
				    strcmp(s1: data, s2: "mtmfs") == 0) {
					mp->mnt_kern_flag |=
					    MNTK_EXTENDED_SECURITY;
					mp->mnt_kern_flag &=
					    ~MNTK_AUTH_OPAQUE;
				}
			} else if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
				/* Empty name: clear any existing override. */
				const char *name =
				    vfs_getfstypenameref_locked(mp, NULL);
				if (strcmp(s1: name, s2: "mtmfs") == 0) {
					mp->mnt_kern_flag &=
					    ~MNTK_EXTENDED_SECURITY;
				}
				vfs_setfstypename_locked(mp, NULL);
			}
unlock:
			mount_unlock(mp);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS:
		error = handle_flags(vp, data, ctx);
		break;

	case FSIOC_FD_ONLY_OPEN_ONCE: {
		/*
		 * Succeeds (error == 0) only when this appears to be the sole
		 * use of the vnode; named streams get a closer look via
		 * vnode_isinuse_locked().
		 */
		error = 0;
		if (vnode_usecount(vp) > 1) {
			vnode_lock_spin(vp);
			if (vp->v_lflag & VL_HASSTREAMS) {
				if (vnode_isinuse_locked(vp, 1, 1)) {
					error = EBUSY;
				}
			} else if (vnode_usecount(vp) > 1) {
				error = EBUSY;
			}
			vnode_unlock(vp);
		}
	}
	break;

	case FSIOC_EVAL_ROOTAUTH:
		error = handle_auth(vp, cmd, data, options, ctx);
		break;

	case FSIOC_TEST_FSE_ACCESS_GRANTED:
		error = test_fse_access_granted(vp, type: (unsigned long)udata, ctx);
		break;

#if CONFIG_EXCLAVES
	case FSIOC_EXCLAVE_FS_REGISTER:
		if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = vfs_exclave_fs_register(((fsioc_exclave_fs_register_t *)data)->fs_tag, vp);
		} else {
			error = EPERM;
		}
		break;

	case FSIOC_EXCLAVE_FS_UNREGISTER:
		if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = vfs_exclave_fs_unregister(vp);
		} else {
			error = EPERM;
		}
		break;

	case FSIOC_EXCLAVE_FS_GET_BASE_DIRS: {
		exclave_fs_get_base_dirs_t *get_base_dirs = ((exclave_fs_get_base_dirs_t *)data);
		exclave_fs_base_dir_t *dirs = NULL;
		if (!IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = EPERM;
			break;
		}
		/* base_dirs == 0 means the caller only wants the count. */
		if (get_base_dirs->base_dirs) {
			if ((get_base_dirs->count == 0) || (get_base_dirs->count > EXCLAVE_FS_GET_BASE_DIRS_MAX_COUNT)) {
				error = EINVAL;
				break;
			}
			dirs = kalloc_type(exclave_fs_base_dir_t, get_base_dirs->count, Z_WAITOK | Z_ZERO);
			if (!dirs) {
				error = ENOSPC;
				break;
			}
		}
		error = vfs_exclave_fs_get_base_dirs(dirs, &get_base_dirs->count);
		if (!error && dirs) {
			error = copyout(dirs, (user_addr_t)get_base_dirs->base_dirs,
			    get_base_dirs->count * sizeof(exclave_fs_base_dir_t));
		}
		if (dirs) {
			kfree_type(exclave_fs_base_dir_t, get_base_dirs->count, dirs);
		}
	}
	break;
#endif

	default: {
		/*
		 * Other, known commands shouldn't be passed down here.
		 * (When adding a selector to this list, it may be prudent
		 * to consider adding it to the list in sys_fcntl_nocancel() as well.)
		 */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
		case FSIOC_KERNEL_ROOTAUTH:
		case FSIOC_GRAFT_FS:
		case FSIOC_UNGRAFT_FS:
		case FSIOC_AUTH_FS:
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, command: cmd, data, fflag: (int)options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user.  Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kfree_data(memp, size);
	}

	return error;
}
12907
12908/* ARGSUSED */
12909int
12910fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
12911{
12912 int error;
12913 struct nameidata nd;
12914 uint32_t nameiflags;
12915 vnode_t vp = NULL;
12916 vfs_context_t ctx = vfs_context_current();
12917
12918 AUDIT_ARG(cmd, (int)uap->cmd);
12919 AUDIT_ARG(value32, uap->options);
12920 /* Get the vnode for the file we are getting info on: */
12921 nameiflags = 0;
12922 //
12923 // if we come through fsctl() then the file is by definition not open.
12924 // therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
12925 // lest the caller mistakenly thinks the only open is their own (but in
12926 // reality it's someone elses).
12927 //
12928 if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
12929 return EINVAL;
12930 }
12931 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
12932 nameiflags |= FOLLOW;
12933 }
12934 if (uap->cmd == FSIOC_FIRMLINK_CTL) {
12935 nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
12936 }
12937 NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
12938 UIO_USERSPACE, uap->path, ctx);
12939 if ((error = namei(ndp: &nd))) {
12940 goto done;
12941 }
12942 vp = nd.ni_vp;
12943 nameidone(&nd);
12944
12945#if CONFIG_MACF
12946 error = mac_mount_check_fsctl(ctx, mp: vnode_mount(vp), cmd: uap->cmd);
12947 if (error) {
12948 goto done;
12949 }
12950#endif
12951
12952 error = fsctl_internal(p, arg_vp: &vp, cmd: uap->cmd, udata: (user_addr_t)uap->data, options: uap->options, ctx);
12953
12954done:
12955 if (vp) {
12956 vnode_put(vp);
12957 }
12958 return error;
12959}
12960/* ARGSUSED */
12961int
12962ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
12963{
12964 int error;
12965 vnode_t vp = NULL;
12966 vfs_context_t ctx = vfs_context_current();
12967 int fd = -1;
12968
12969 AUDIT_ARG(fd, uap->fd);
12970 AUDIT_ARG(cmd, (int)uap->cmd);
12971 AUDIT_ARG(value32, uap->options);
12972
12973 /* Get the vnode for the file we are getting info on: */
12974 if ((error = file_vnode(uap->fd, &vp))) {
12975 return error;
12976 }
12977 fd = uap->fd;
12978 if ((error = vnode_getwithref(vp))) {
12979 file_drop(fd);
12980 return error;
12981 }
12982
12983#if CONFIG_MACF
12984 if ((error = mac_mount_check_fsctl(ctx, mp: vnode_mount(vp), cmd: uap->cmd))) {
12985 file_drop(fd);
12986 vnode_put(vp);
12987 return error;
12988 }
12989#endif
12990
12991 error = fsctl_internal(p, arg_vp: &vp, cmd: uap->cmd, udata: (user_addr_t)uap->data, options: uap->options, ctx);
12992
12993 file_drop(fd);
12994
12995 /*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
12996 if (vp) {
12997 vnode_put(vp);
12998 }
12999
13000 return error;
13001}
13002/* end of fsctl system call */
13003
13004#define FILESEC_ACCESS_ENTITLEMENT \
13005 "com.apple.private.vfs.filesec-access"
13006
13007static int
13008xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
13009{
13010 if (strcmp(s1: attrname, KAUTH_FILESEC_XATTR) == 0) {
13011 /*
13012 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
13013 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
13014 */
13015 if ((!setting && vfs_context_issuser(ctx)) ||
13016 IOTaskHasEntitlement(task: vfs_context_task(ctx),
13017 FILESEC_ACCESS_ENTITLEMENT)) {
13018 return 0;
13019 }
13020 }
13021
13022 return EPERM;
13023}
13024
13025/*
13026 * Retrieve the data of an extended attribute.
13027 */
/*
 * getxattr(2): retrieve the data of an extended attribute by path.
 *
 * With a buffer (uap->value) the attribute data is copied out and
 * *retval is the byte count transferred; without one, *retval is the
 * attribute's total size.  XATTR_NOSECURITY / XATTR_NODEFAULT are
 * kernel-internal options and are rejected from user space.
 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(ndp: &nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uaddr: uap->attrname, kaddr: attrname, len: sizeof(attrname), done: &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected attributes require root or the filesec entitlement. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binaray compatibilty in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implemtation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned... in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app.  we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying fileystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	/*
	 * NOTE(review): unlike fgetxattr(), this builds a uio even when
	 * uap->size == 0 (provided uap->value is non-NULL) — presumably
	 * harmless since a zero-length iovec transfers nothing; confirm
	 * before "fixing" for consistency.
	 */
	if (uap->value) {
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(a_iovcount: 1, a_offset: uap->position, a_spacetype: spacetype, a_iodirection: UIO_READ,
		    a_buf_p: &uio_buf[0], a_buffer_size: sizeof(uio_buf));
		uio_addiov(a_uio: auio, a_baseaddr: uap->value, a_length: uap->size);
	}
no_uio:
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	/* With a uio: bytes copied out; without: the attribute's full size. */
	if (auio) {
		*retval = uap->size - uio_resid(a_uio: auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
13110
13111/*
13112 * Retrieve the data of an extended attribute.
13113 */
13114int
13115fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
13116{
13117 vnode_t vp;
13118 char attrname[XATTR_MAXNAMELEN + 1];
13119 vfs_context_t ctx = vfs_context_current();
13120 uio_t auio = NULL;
13121 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13122 size_t attrsize = 0;
13123 size_t namelen;
13124 int error;
13125 UIO_STACKBUF(uio_buf, 1);
13126
13127 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13128 return EINVAL;
13129 }
13130
13131 if ((error = file_vnode(uap->fd, &vp))) {
13132 return error;
13133 }
13134 if ((error = vnode_getwithref(vp))) {
13135 file_drop(uap->fd);
13136 return error;
13137 }
13138 error = copyinstr(uaddr: uap->attrname, kaddr: attrname, len: sizeof(attrname), done: &namelen);
13139 if (error != 0) {
13140 goto out;
13141 }
13142 if (xattr_protected(attrname) &&
13143 (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
13144 goto out;
13145 }
13146 if (uap->value && uap->size > 0) {
13147 if (uap->size > (size_t)XATTR_MAXSIZE) {
13148 uap->size = XATTR_MAXSIZE;
13149 }
13150
13151 auio = uio_createwithbuffer(a_iovcount: 1, a_offset: uap->position, a_spacetype: spacetype, a_iodirection: UIO_READ,
13152 a_buf_p: &uio_buf[0], a_buffer_size: sizeof(uio_buf));
13153 uio_addiov(a_uio: auio, a_baseaddr: uap->value, a_length: uap->size);
13154 }
13155
13156 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
13157out:
13158 (void)vnode_put(vp);
13159 file_drop(uap->fd);
13160
13161 if (auio) {
13162 *retval = uap->size - uio_resid(a_uio: auio);
13163 } else {
13164 *retval = (user_ssize_t)attrsize;
13165 }
13166 return error;
13167}
13168
/*
 * Heap-allocated working state for setxattr(): the nameidata, the
 * attribute-name buffer, and the uio backing store are large, so they
 * are kept off the kernel stack (see the kalloc_type in setxattr()).
 */
struct setxattr_ctx {
	struct nameidata nd;                    /* pathname lookup state */
	char attrname[XATTR_MAXNAMELEN + 1];    /* NUL-terminated attribute name */
	UIO_STACKBUF(uio_buf, 1);               /* backing store for the value uio */
};
13175
13176/*
13177 * Set the data of an extended attribute.
13178 */
/*
 * setxattr(2): set the data of an extended attribute by path.
 *
 * Working state lives in a heap-allocated struct setxattr_ctx to keep
 * the kernel stack small.  XATTR_NOSECURITY / XATTR_NODEFAULT are
 * kernel-internal options and are rejected from user space.
 */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	struct setxattr_ctx *sactx;

	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
	if (sactx == NULL) {
		return ENOMEM;
	}

	error = copyinstr(uaddr: uap->attrname, kaddr: sactx->attrname, len: sizeof(sactx->attrname), done: &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			error = ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		goto out;
	}
	/* Protected attributes may only be set by entitled tasks. */
	if (xattr_protected(sactx->attrname) &&
	    (error = xattr_entitlement_check(attrname: sactx->attrname, ctx, true)) != 0) {
		goto out;
	}
	/* A non-zero size with no value buffer makes no sense. */
	if (uap->size != 0 && uap->value == 0) {
		error = EINVAL;
		goto out;
	}
	if (uap->size > INT_MAX) {
		error = E2BIG;
		goto out;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Need the parent to break any directory lease before writing. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(ndp: &sactx->nd))) {
		goto out;
	}
	vp = sactx->nd.ni_vp;
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(vp: sactx->nd.ni_dvp, false, O_WRONLY);
	vnode_put(vp: sactx->nd.ni_dvp);
#endif
	nameidone(&sactx->nd);

	auio = uio_createwithbuffer(a_iovcount: 1, a_offset: uap->position, a_spacetype: spacetype, a_iodirection: UIO_WRITE,
	    a_buf_p: &sactx->uio_buf[0], a_buffer_size: sizeof(sactx->uio_buf));
	uio_addiov(a_uio: auio, a_baseaddr: uap->value, a_length: uap->size);

	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
out:
	kfree_type(struct setxattr_ctx, sactx);
	*retval = 0;
	return error;
}
13255
13256/*
13257 * Set the data of an extended attribute.
13258 */
/*
 * fsetxattr(2): set the data of an extended attribute on an open file
 * descriptor.  Mirrors setxattr() but resolves the vnode from uap->fd;
 * XATTR_NOFOLLOW is meaningless here and is rejected along with the
 * kernel-internal options.
 */
int
fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uaddr: uap->attrname, kaddr: attrname, len: sizeof(attrname), done: &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			return ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		return error;
	}
	/* Protected attributes may only be set by entitled tasks. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
		return error;
	}
	/* A non-zero size with no value buffer makes no sense. */
	if (uap->size != 0 && uap->value == 0) {
		return EINVAL;
	}
	if (uap->size > INT_MAX) {
		return E2BIG;
	}
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease on the containing directory before writing. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	auio = uio_createwithbuffer(a_iovcount: 1, a_offset: uap->position, a_spacetype: spacetype, a_iodirection: UIO_WRITE,
	    a_buf_p: &uio_buf[0], a_buffer_size: sizeof(uio_buf));
	uio_addiov(a_uio: auio, a_baseaddr: uap->value, a_length: uap->size);

	error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	file_drop(uap->fd);
	*retval = 0;
	return error;
}
13323
/*
 * Remove an extended attribute.
 * XXX Code duplication here.
 *
 * Path-based flavor of removexattr(2): looks up uap->path (following the
 * final symlink unless XATTR_NOFOLLOW is set) and deletes the extended
 * attribute named by uap->attrname. Returns 0 or an errno value.
 */
int
removexattr(proc_t p, struct removexattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	vfs_context_t ctx = vfs_context_current();
	size_t namelen;
	u_int32_t nameiflags;
	int error;

	/* XATTR_NOSECURITY / XATTR_NODEFAULT are kernel-internal options. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uaddr: uap->attrname, kaddr: attrname, len: sizeof(attrname), done: &namelen);
	if (error != 0) {
		return error;
	}
	/* Protected (system-owned) attributes may never be removed this way. */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Also obtain the parent so any directory lease on it can be broken. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(ndp: &nd))) {
		return error;
	}
	vp = nd.ni_vp;
#if CONFIG_FILE_LEASES
	/* Break the parent's lease, then drop the parent iocount from namei. */
	vnode_breakdirlease(vp: nd.ni_dvp, false, O_WRONLY);
	vnode_put(vp: nd.ni_dvp);
#endif
	nameidone(&nd);

	error = vn_removexattr(vp, attrname, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		/* Notify fseventsd listeners of the attribute removal. */
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
13378
13379/*
13380 * Remove an extended attribute.
13381 * XXX Code duplication here.
13382 */
13383int
13384fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
13385{
13386 vnode_t vp;
13387 char attrname[XATTR_MAXNAMELEN + 1];
13388 size_t namelen;
13389 int error;
13390#if CONFIG_FSE
13391 vfs_context_t ctx = vfs_context_current();
13392#endif
13393
13394 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13395 return EINVAL;
13396 }
13397
13398 error = copyinstr(uaddr: uap->attrname, kaddr: attrname, len: sizeof(attrname), done: &namelen);
13399 if (error != 0) {
13400 return error;
13401 }
13402 if (xattr_protected(attrname)) {
13403 return EPERM;
13404 }
13405 if ((error = file_vnode(uap->fd, &vp))) {
13406 return error;
13407 }
13408 if ((error = vnode_getwithref(vp))) {
13409 file_drop(uap->fd);
13410 return error;
13411 }
13412
13413#if CONFIG_FILE_LEASES
13414 vnode_breakdirlease(vp, true, O_WRONLY);
13415#endif
13416
13417 error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
13418#if CONFIG_FSE
13419 if (error == 0) {
13420 add_fsevent(FSE_XATTR_REMOVED, ctx,
13421 FSE_ARG_VNODE, vp,
13422 FSE_ARG_DONE);
13423 }
13424#endif
13425 vnode_put(vp);
13426 file_drop(uap->fd);
13427 *retval = 0;
13428 return error;
13429}
13430
/*
 * Retrieve the list of extended attribute names.
 * XXX Code duplication here.
 *
 * Path-based flavor of listxattr(2). If the caller supplies a buffer, the
 * NUL-separated attribute names are copied out and *retval is the number of
 * bytes written; with no buffer, *retval is the size needed for the full
 * list. Returns 0 or an errno value.
 */
int
listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	u_int32_t nameiflags;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* XATTR_NOSECURITY / XATTR_NODEFAULT are kernel-internal options. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(ndp: &nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);
	/* A buffer is optional; with none we only report the required size. */
	if (uap->namebuf != 0 && uap->bufsize > 0) {
		auio = uio_createwithbuffer(a_iovcount: 1, a_offset: 0, a_spacetype: spacetype, a_iodirection: UIO_READ,
		    a_buf_p: &uio_buf[0], a_buffer_size: sizeof(uio_buf));
		uio_addiov(a_uio: auio, a_baseaddr: uap->namebuf, a_length: uap->bufsize);
	}

	error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);

	vnode_put(vp);
	if (auio) {
		/* Bytes actually copied = requested size minus what's left. */
		*retval = (user_ssize_t)uap->bufsize - uio_resid(a_uio: auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
13475
13476/*
13477 * Retrieve the list of extended attribute names.
13478 * XXX Code duplication here.
13479 */
13480int
13481flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
13482{
13483 vnode_t vp;
13484 uio_t auio = NULL;
13485 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13486 size_t attrsize = 0;
13487 int error;
13488 UIO_STACKBUF(uio_buf, 1);
13489
13490 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13491 return EINVAL;
13492 }
13493
13494 if ((error = file_vnode(uap->fd, &vp))) {
13495 return error;
13496 }
13497 if ((error = vnode_getwithref(vp))) {
13498 file_drop(uap->fd);
13499 return error;
13500 }
13501 if (uap->namebuf != 0 && uap->bufsize > 0) {
13502 auio = uio_createwithbuffer(a_iovcount: 1, a_offset: 0, a_spacetype: spacetype,
13503 a_iodirection: UIO_READ, a_buf_p: &uio_buf[0], a_buffer_size: sizeof(uio_buf));
13504 uio_addiov(a_uio: auio, a_baseaddr: uap->namebuf, a_length: uap->bufsize);
13505 }
13506
13507 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
13508
13509 vnode_put(vp);
13510 file_drop(uap->fd);
13511 if (auio) {
13512 *retval = (user_ssize_t)uap->bufsize - uio_resid(a_uio: auio);
13513 } else {
13514 *retval = (user_ssize_t)attrsize;
13515 }
13516 return error;
13517}
13518
13519int
13520fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
13521 vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
13522{
13523 int error;
13524 struct mount *mp = NULL;
13525 vnode_t vp;
13526 int length;
13527 int bpflags;
13528 /* maximum number of times to retry build_path */
13529 unsigned int retries = 0x10;
13530
13531 if (bufsize > FSGETPATH_MAXBUFLEN) {
13532 return EINVAL;
13533 }
13534
13535 if (buf == NULL) {
13536 return ENOMEM;
13537 }
13538
13539retry:
13540 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
13541 error = ENOTSUP; /* unexpected failure */
13542 return ENOTSUP;
13543 }
13544
13545#if CONFIG_UNION_MOUNTS
13546unionget:
13547#endif /* CONFIG_UNION_MOUNTS */
13548 if (objid == 2) {
13549 struct vfs_attr vfsattr;
13550 int use_vfs_root = TRUE;
13551
13552 VFSATTR_INIT(&vfsattr);
13553 VFSATTR_WANTED(&vfsattr, f_capabilities);
13554 if (!(options & FSOPT_ISREALFSID) &&
13555 vfs_getattr(mp, vfa: &vfsattr, ctx: vfs_context_kernel()) == 0 &&
13556 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
13557 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
13558 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
13559 use_vfs_root = FALSE;
13560 }
13561 }
13562
13563 if (use_vfs_root) {
13564 error = VFS_ROOT(mp, &vp, ctx);
13565 } else {
13566 error = VFS_VGET(mp, objid, &vp, ctx);
13567 }
13568 } else {
13569 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
13570 }
13571
13572#if CONFIG_UNION_MOUNTS
13573 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
13574 /*
13575 * If the fileid isn't found and we're in a union
13576 * mount volume, then see if the fileid is in the
13577 * mounted-on volume.
13578 */
13579 struct mount *tmp = mp;
13580 mp = vnode_mount(vp: tmp->mnt_vnodecovered);
13581 vfs_unbusy(mp: tmp);
13582 if (vfs_busy(mp, LK_NOWAIT) == 0) {
13583 goto unionget;
13584 }
13585 } else {
13586 vfs_unbusy(mp);
13587 }
13588#else
13589 vfs_unbusy(mp);
13590#endif /* CONFIG_UNION_MOUNTS */
13591
13592 if (error) {
13593 return error;
13594 }
13595
13596#if CONFIG_MACF
13597 error = mac_vnode_check_fsgetpath(ctx, vp);
13598 if (error) {
13599 vnode_put(vp);
13600 return error;
13601 }
13602#endif
13603
13604 /* Obtain the absolute path to this vnode. */
13605 bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
13606 if (options & FSOPT_NOFIRMLINKPATH) {
13607 bpflags |= BUILDPATH_NO_FIRMLINK;
13608 }
13609 bpflags |= BUILDPATH_CHECK_MOVED;
13610 error = build_path(first_vp: vp, buff: buf, buflen: (int)bufsize, outlen: &length, flags: bpflags, ctx);
13611 vnode_put(vp);
13612
13613 if (error) {
13614 /* there was a race building the path, try a few more times */
13615 if (error == EAGAIN) {
13616 --retries;
13617 if (retries > 0) {
13618 goto retry;
13619 }
13620
13621 error = ENOENT;
13622 }
13623 goto out;
13624 }
13625
13626 AUDIT_ARG(text, buf);
13627
13628 if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
13629 unsigned long path_words[NUMPARMS];
13630 size_t path_len = sizeof(path_words);
13631
13632 if ((size_t)length < path_len) {
13633 memcpy(dst: (char *)path_words, src: buf, n: length);
13634 memset(s: (char *)path_words + length, c: 0, n: path_len - length);
13635
13636 path_len = length;
13637 } else {
13638 memcpy(dst: (char *)path_words, src: buf + (length - path_len), n: path_len);
13639 }
13640
13641 kdebug_vfs_lookup(path_words, path_len: (int)path_len, vnp: vp,
13642 KDBG_VFS_LOOKUP_FLAG_LOOKUP);
13643 }
13644
13645 *pathlen = length; /* may be superseded by error */
13646
13647out:
13648 return error;
13649}
13650
13651/*
13652 * Obtain the full pathname of a file system object by id.
13653 */
13654static int
13655fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
13656 uint32_t options, user_ssize_t *retval)
13657{
13658 vfs_context_t ctx = vfs_context_current();
13659 fsid_t fsid;
13660 char *realpath;
13661 int length;
13662 int error;
13663
13664 if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
13665 return EINVAL;
13666 }
13667
13668 if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
13669 return error;
13670 }
13671 AUDIT_ARG(value32, fsid.val[0]);
13672 AUDIT_ARG(value64, objid);
13673 /* Restrict output buffer size for now. */
13674
13675 if (bufsize > FSGETPATH_MAXBUFLEN || bufsize <= 0) {
13676 return EINVAL;
13677 }
13678 realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
13679 if (realpath == NULL) {
13680 return ENOMEM;
13681 }
13682
13683 error = fsgetpath_internal(ctx, volfs_id: fsid.val[0], objid, bufsize, buf: realpath,
13684 options, pathlen: &length);
13685
13686 if (error) {
13687 goto out;
13688 }
13689
13690 error = copyout((caddr_t)realpath, buf, length);
13691
13692 *retval = (user_ssize_t)length; /* may be superseded by error */
13693out:
13694 kfree_data(realpath, bufsize);
13695 return error;
13696}
13697
/* fsgetpath(2): thin wrapper over fsgetpath_extended() with no options. */
int
fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(buf: uap->buf, bufsize: uap->bufsize, user_fsid: uap->fsid, objid: uap->objid,
	           options: 0, retval);
}
13704
/* fsgetpath_ext(2): wrapper passing the caller's options through. */
int
fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(buf: uap->buf, bufsize: uap->bufsize, user_fsid: uap->fsid, objid: uap->objid,
	           options: uap->options, retval);
}
13711
/*
 * Common routine to handle various flavors of statfs data heading out
 * to user space.
 *
 * Fills a user64_statfs or user32_statfs from the in-kernel vfsstatfs and
 * copies it to 'bufp'. When 'partial_copy' is set the trailing reserved
 * fields are omitted from the copyout. If 'sizep' is non-NULL it receives
 * the full (untruncated) structure size.
 *
 * Returns: 0 Success
 * EFAULT
 */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int error;
	int my_size, copy_size;

	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		bzero(s: &sfs, n: my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		vfs_getfstypename(mp, buf: sfs.f_fstypename, MFSNAMELEN);
		strlcpy(dst: &sfs.f_mntonname[0], src: &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(dst: &sfs.f_mntfromname[0], src: &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* Drop the trailing reserved fields from the copyout. */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(s: &sfs, n: my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case. We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not be setting these to -1..
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large. At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
#define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = (int)lmax(a: sfsp->f_iosize, b: sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		vfs_getfstypename(mp, buf: sfs.f_fstypename, MFSNAMELEN);
		strlcpy(dst: &sfs.f_mntonname[0], src: &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(dst: &sfs.f_mntfromname[0], src: &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* Drop the trailing reserved fields from the copyout. */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	if (sizep != NULL) {
		*sizep = my_size;
	}
	return error;
}
13832
/*
 * copy stat structure into user_stat structure.
 *
 * Field-for-field copy of a kernel struct stat into the 64-bit user ABI
 * layout; the destination is zeroed first so padding never leaks kernel
 * memory to user space.
 */
void
munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
{
	bzero(s: usbp, n: sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13872
/*
 * Copy a kernel struct stat into the 32-bit user ABI layout. Timestamp
 * seconds/nanoseconds are explicitly narrowed to the 32-bit user types;
 * the destination is zeroed first so padding never leaks kernel memory.
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	bzero(s: usbp, n: sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13909
/*
 * copy stat64 structure into user_stat64 structure.
 *
 * Like munge_user64_stat() but for the stat64 ABI, which additionally
 * carries the file's birth (creation) time.
 */
void
munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
	bzero(s: usbp, n: sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13953
/*
 * Copy a kernel struct stat64 into the 32-bit user ABI layout, including
 * the birth time. Timestamp fields are explicitly narrowed to the 32-bit
 * user types; the destination is zeroed first so padding never leaks.
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	bzero(s: usbp, n: sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13994
13995/*
13996 * Purge buffer cache for simulating cold starts
13997 */
13998static int
13999vnode_purge_callback(struct vnode *vp, __unused void *cargs)
14000{
14001 ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
14002
14003 return VNODE_RETURNED;
14004}
14005
14006static int
14007vfs_purge_callback(mount_t mp, __unused void * arg)
14008{
14009 vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, callout: vnode_purge_callback, NULL);
14010
14011 return VFS_RETURNED;
14012}
14013
/* Boot-arg tunable + vfs.purge_vm_pagers sysctl: controls whether
 * vfs_purge() also purges file-backed VM pagers (defaults to on). */
static TUNABLE_WRITEABLE(boolean_t, vfs_purge_vm_pagers, "vfs_purge_vm_pagers", TRUE);
SYSCTL_INT(_vfs, OID_AUTO, purge_vm_pagers, CTLFLAG_RW | CTLFLAG_LOCKED, &vfs_purge_vm_pagers, 0, "VFS purge also purges file-backed VM pagers");
14016
/*
 * vfs_purge(2): drop cached file data on every mounted filesystem to
 * simulate a cold start. Superuser only.
 */
int
vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
{
	/* Purging every mount's cache is disruptive; require superuser. */
	if (!kauth_cred_issuser(cred: kauth_cred_get())) {
		return EPERM;
	}

	vfs_iterate(flags: 0 /* flags */, callout: vfs_purge_callback, NULL);

	/* also flush any VM pagers backed by files */
	if (vfs_purge_vm_pagers) {
		vm_purge_filebacked_pagers();
	}

	return 0;
}
14033
/*
 * gets the vnode associated with the (unnamed) snapshot directory
 * for a Filesystem. The snapshot directory vnode is returned with
 * an iocount on it.
 *
 * rvp must be the filesystem's root vnode; the caller owns the returned
 * iocount on *sdvpp and must vnode_put() it.
 */
int
vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
{
	return VFS_VGET_SNAPDIR(vnode_mount(vp: rvp), sdvpp, ctx);
}
14044
/*
 * Get the snapshot vnode.
 *
 * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
 * needs nameidone() on ndp.
 *
 * If the snapshot vnode exists it is returned in ndp->ni_vp.
 *
 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
 * not needed.
 */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	/* dirfd must reference the filesystem's root directory. */
	error = vnode_getfromfd(ctx, fd: dirfd, vpp: rvpp);
	if (error) {
		return error;
	}

	if (!vnode_isvroot(vp: *rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp: vnode_mount(vp: *rvpp), vfa: &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	error = vnode_get_snapdir(rvp: *rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(uaddr: name, kaddr: name_buf, MAXPATHLEN, done: &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; any embedded slash makes the name invalid. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, mp: vnode_mount(vp: *rvpp),
		    name: name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, mp: vnode_mount(vp: *rvpp),
		    name: name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	error = namei(ndp);
out1:
	zfree(ZV_NAMEI, name_buf);
out:
	/* On any error, release both iocounts so callers need no cleanup. */
	if (error) {
		if (*sdvpp) {
			vnode_put(vp: *sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(vp: *rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
14158
/*
 * create a filesystem snapshot (for supporting filesystems)
 *
 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
 * We get to the (unnamed) snapshot directory vnode and create the vnode
 * for the snapshot in it.
 *
 * Restrictions:
 *
 * a) Passed in name for snapshot cannot have slashes.
 * b) name can't be "." or ".."
 *
 * Since this requires superuser privileges, vnode_authorize calls are not
 * made.
 */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	/* nameidata is too large for the kernel stack; heap-allocate it. */
	struct nameidata *ndp;

	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	error = vnode_get_snapshot(dirfd, rvpp: &rvp, sdvpp: &snapdvp, name, ndp, CREATE,
	    pathop: OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* A snapshot by that name already exists. */
		vnode_put(vp: ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kalloc_type(struct vnode_attr, Z_WAITOK);

		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		/* Superuser-only path: skip authorization and inheritance. */
		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}

		kfree_type(struct vnode_attr, vap);
	}

	nameidone(ndp);
	vnode_put(vp: snapdvp);
	vnode_put(vp: rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14220
/*
 * Delete a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * delete the snapshot.
 */
static int __attribute__((noinline))
snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	/* nameidata is too large for the kernel stack; heap-allocate it. */
	struct nameidata *ndp;

	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	error = vnode_get_snapshot(dirfd, rvpp: &rvp, sdvpp: &snapdvp, name, ndp, DELETE,
	    pathop: OP_UNLINK, ctx);
	if (error) {
		goto out;
	}

	/* Snapshot deletions don't generate namespace events. */
	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);

	vnode_put(vp: ndp->ni_vp);
	nameidone(ndp);
	vnode_put(vp: snapdvp);
	vnode_put(vp: rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14255
/*
 * Revert a filesystem to a snapshot
 *
 * Marks the filesystem to revert to the given snapshot on next mount.
 */
static int __attribute__((noinline))
snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_revert_args revert_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	error = vnode_getfromfd(ctx, fd: dirfd, vpp: &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(vp: rvp);

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(uaddr: name, kaddr: name_buf, MAXPATHLEN, done: &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(vp: rvp);
		return error;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_revert(ctx, mp, name: name_buf);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(vp: rvp);
		return error;
	}
#endif

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(vp: rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Hand the snapshot name to the FS as a pre-built componentname. */
	memset(s: &cnp, c: 0, n: sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	revert_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, data: (caddr_t)&revert_data, flags: 0, context: ctx);
	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	if (error) {
		/* If there was any error, try again using VNOP_IOCTL */

		vnode_t snapdvp;
		struct nameidata namend;

		error = vnode_get_snapshot(dirfd, rvpp: &rvp, sdvpp: &snapdvp, name, ndp: &namend, LOOKUP,
		    pathop: OP_LOOKUP, ctx);
		if (error) {
			return error;
		}


		/* Fallback: ask the snapshot vnode itself to revert (APFS). */
		error = VNOP_IOCTL(vp: namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, data: (caddr_t) NULL,
		    fflag: 0, ctx);

		vnode_put(vp: namend.ni_vp);
		nameidone(&namend);
		vnode_put(vp: snapdvp);
		vnode_put(vp: rvp);
	}

	return error;
}
14344
14345/*
14346 * rename a Filesystem snapshot
14347 *
14348 * get the vnode for the unnamed snapshot directory and the snapshot and
14349 * rename the snapshot. This is a very specialised (and simple) case of
14350 * rename(2) (which has to deal with a lot more complications). It differs
14351 * slightly from rename(2) in that EEXIST is returned if the new name exists.
14352 */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/*
	 * Look up the source snapshot with DELETE intent so fromnd's
	 * componentname can be handed straight to VNOP_RENAME below.
	 * On success we hold iocounts on rvp, snapdvp and fromnd->ni_vp.
	 */
	error = vnode_get_snapshot(dirfd, rvpp: &rvp, sdvpp: &snapdvp, name: old, ndp: fromnd, DELETE,
	    pathop: OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp = fromnd->ni_vp;

	/* Copy the destination name in from user space. */
	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(uaddr: new, kaddr: newname_buf, MAXPATHLEN, done: &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is suppossed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for '/'; the loop stops early (i < name_len) iff one is found. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* Renaming to a new name is effectively creating that snapshot name. */
	error = mac_mount_check_snapshot_create(ctx, mp: vnode_mount(vp: rvp),
	    name: newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/*
	 * Look up the destination relative to the snapshot directory
	 * (USEDVP with ni_dvp = snapdvp); NOCACHE because the target is
	 * expected not to exist.
	 */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(ndp: tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(vp: tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	/* Source and destination both live in the snapshot directory. */
	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(vp: fvp);
	vnode_put(vp: snapdvp);
	vnode_put(vp: rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
14447
14448/*
14449 * Mount a Filesystem snapshot
14450 *
14451 * get the vnode for the unnamed snapshot directory and the snapshot and
14452 * mount the snapshot.
14453 */
static int __attribute__((noinline))
snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
    __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
{
	mount_t mp;
	vnode_t rvp, snapdvp, snapvp, vp, pvp;
	struct fs_snapshot_mount_args smnt_data;
	int error;
	struct nameidata *snapndp, *dirndp;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata snapnd;
		struct nameidata dirnd;
	} * __snapshot_mount_data;

	__snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
	snapndp = &__snapshot_mount_data->snapnd;
	dirndp = &__snapshot_mount_data->dirnd;

	/*
	 * Find the snapshot to mount; on success we hold iocounts on rvp
	 * (the fd's vnode), snapdvp (the snapshot directory) and
	 * snapndp->ni_vp (the snapshot itself).
	 */
	error = vnode_get_snapshot(dirfd, rvpp: &rvp, sdvpp: &snapdvp, name, ndp: snapndp, LOOKUP,
	    pathop: OP_LOOKUP, ctx);
	if (error) {
		goto out;
	}

	snapvp = snapndp->ni_vp;
	/* Bail out if the underlying mount was torn down underneath us. */
	if (!vnode_mount(vp: rvp) || (vnode_mount(vp: rvp) == dead_mountp)) {
		error = EIO;
		goto out1;
	}

	/* Get the vnode to be covered */
	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, directory, ctx);
	error = namei(ndp: dirndp);
	if (error) {
		goto out1;
	}

	vp = dirndp->ni_vp;
	pvp = dirndp->ni_dvp;
	mp = vnode_mount(vp: rvp);

	/* Refuse to cover the root of the root filesystem. */
	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		error = EINVAL;
		goto out2;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, cnp: &dirndp->ni_cnd, name: snapndp->ni_cnd.cn_nameptr,
	    vfc_name: mp->mnt_vfsstat.f_fstypename);
	if (error) {
		goto out2;
	}
#endif

	/*
	 * Perform the mount; only MNT_DONTBROWSE and MNT_IGNORE_OWNERSHIP
	 * are honored from the caller-supplied flags.
	 */
	smnt_data.sm_mp = mp;
	smnt_data.sm_cnp = &snapndp->ni_cnd;
	error = mount_common(fstypename: mp->mnt_vfsstat.f_fstypename, pvp, vp,
	    cnp: &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags: flags & (MNT_DONTBROWSE | MNT_IGNORE_OWNERSHIP),
	    KERNEL_MOUNT_SNAPSHOT, NULL, ctx);

out2:
	vnode_put(vp);
	vnode_put(vp: pvp);
	nameidone(dirndp);
out1:
	vnode_put(vp: snapvp);
	vnode_put(vp: snapdvp);
	vnode_put(vp: rvp);
	nameidone(snapndp);
out:
	kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
	return error;
}
14529
14530/*
14531 * Root from a snapshot of the filesystem
14532 *
14533 * Marks the filesystem to root from the given snapshot on next boot.
14534 */
14535static int __attribute__((noinline))
14536snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
14537 vfs_context_t ctx)
14538{
14539 int error;
14540 vnode_t rvp;
14541 mount_t mp;
14542 struct fs_snapshot_root_args root_data;
14543 struct componentname cnp;
14544 caddr_t name_buf;
14545 size_t name_len;
14546
14547 error = vnode_getfromfd(ctx, fd: dirfd, vpp: &rvp);
14548 if (error) {
14549 return error;
14550 }
14551 mp = vnode_mount(vp: rvp);
14552
14553 name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14554 error = copyinstr(uaddr: name, kaddr: name_buf, MAXPATHLEN, done: &name_len);
14555 if (error) {
14556 zfree(ZV_NAMEI, name_buf);
14557 vnode_put(vp: rvp);
14558 return error;
14559 }
14560
14561 // XXX MAC checks ?
14562
14563 /*
14564 * Grab mount_iterref so that we can release the vnode,
14565 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
14566 */
14567 error = mount_iterref(mp, 0);
14568 vnode_put(vp: rvp);
14569 if (error) {
14570 zfree(ZV_NAMEI, name_buf);
14571 return error;
14572 }
14573
14574 memset(s: &cnp, c: 0, n: sizeof(cnp));
14575 cnp.cn_pnbuf = (char *)name_buf;
14576 cnp.cn_nameiop = LOOKUP;
14577 cnp.cn_flags = ISLASTCN | HASBUF;
14578 cnp.cn_pnlen = MAXPATHLEN;
14579 cnp.cn_nameptr = cnp.cn_pnbuf;
14580 cnp.cn_namelen = (int)name_len;
14581 root_data.sr_cnp = &cnp;
14582
14583 error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, data: (caddr_t)&root_data, flags: 0, context: ctx);
14584
14585 mount_iterdrop(mp);
14586 zfree(ZV_NAMEI, name_buf);
14587
14588 return error;
14589}
14590
14591static boolean_t
14592vfs_context_can_snapshot(vfs_context_t ctx)
14593{
14594 static const char * const snapshot_entitlements[] = {
14595 "com.apple.private.vfs.snapshot",
14596 "com.apple.developer.vfs.snapshot",
14597 "com.apple.private.apfs.arv.limited.snapshot",
14598 };
14599 static const size_t nentitlements =
14600 sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
14601 size_t i;
14602
14603 task_t task = vfs_context_task(ctx);
14604 for (i = 0; i < nentitlements; i++) {
14605 if (IOTaskHasEntitlement(task, entitlement: snapshot_entitlements[i])) {
14606 return TRUE;
14607 }
14608 }
14609 return FALSE;
14610}
14611
14612/*
14613 * FS snapshot operations dispatcher
14614 */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* Every snapshot operation requires one of the snapshot entitlements. */
	if (!vfs_context_can_snapshot(ctx)) {
		return EPERM;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, fd: uap->dirfd, vpp: &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(vp: dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			/*
			 * No device vnode on the mount; fall back to looking
			 * up the f_mntfromname path instead.
			 */
			error = vnode_lookup(path: mp->mnt_vfsstat.f_mntfromname, flags: 0, vpp: &devvp, ctx);
			/* for mounts which arent block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(vp: devvp);
		}

		if (error) {
			vnode_put(vp: dvp);
			return error;
		}

		/*
		 * Permitted if the caller is root, OR can write the backing
		 * device, OR holds the user snapshot entitlement.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(vp: devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOTaskHasEntitlement(task: vfs_context_task(ctx), entitlement: "com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		vnode_put(vp: dvp);
		vnode_put(vp: devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-operation handler. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(dirfd: uap->dirfd, name: uap->name1, flags: uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(dirfd: uap->dirfd, name: uap->name1, flags: uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		error = snapshot_rename(dirfd: uap->dirfd, old: uap->name1, new: uap->name2,
		    flags: uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		error = snapshot_mount(dirfd: uap->dirfd, name: uap->name1, directory: uap->name2,
		    mnt_data: uap->data, flags: uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(dirfd: uap->dirfd, name: uap->name1, flags: uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(dirfd: uap->dirfd, name: uap->name1, flags: uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
14703