/*
 *
 * Copyright (c) 2000-2024 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

/*
 * External virtual filesystem routines
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/mount_internal.h>
#include <sys/time.h>
#include <sys/lock.h>
#include <sys/vnode.h>
#include <sys/vnode_internal.h>
#include <sys/stat.h>
#include <sys/namei.h>
#include <sys/ucred.h>
#include <sys/buf_internal.h>
#include <sys/errno.h>
#include <kern/kalloc.h>
#include <sys/uio_internal.h>
#include <sys/uio.h>
#include <sys/domain.h>
#include <sys/mbuf.h>
#include <sys/syslog.h>
#include <sys/ubc_internal.h>
#include <sys/vm.h>
#include <sys/sysctl.h>
#include <sys/filedesc.h>
#include <sys/event.h>
#include <sys/kdebug.h>
#include <sys/kauth.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/kern_memorystatus.h>
#include <sys/lockf.h>
#include <sys/reboot.h>
#include <miscfs/fifofs/fifo.h>

#include <nfs/nfs.h>

#include <string.h>
#include <machine/machine_routines.h>

#include <kern/assert.h>
#include <mach/kern_return.h>
#include <kern/thread.h>
#include <kern/sched_prim.h>
#include <kern/smr.h>

#include <miscfs/specfs/specdev.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
#include <mach/memory_object_control.h>

#include <kern/kalloc.h>        /* kalloc()/kfree() */
#include <kern/clock.h>         /* delay_for_interval() */
#include <libkern/coreanalytics/coreanalytics.h>
#include <libkern/OSAtomic.h>   /* OSAddAtomic() */
#include <os/atomic_private.h>
#if defined(XNU_TARGET_OS_OSX)
#include <console/video_console.h>
#endif

#ifdef CONFIG_IOCOUNT_TRACE
#include <libkern/OSDebug.h>
#endif

#include <vm/vm_protos.h>       /* vnode_pager_vrele() */

#if CONFIG_MACF
#include <security/mac_framework.h>
#endif

#include <vfs/vfs_disk_conditioner.h>
#include <libkern/section_keywords.h>

static LCK_GRP_DECLARE(vnode_lck_grp, "vnode");
static LCK_ATTR_DECLARE(vnode_lck_attr, 0, 0);

#if CONFIG_TRIGGERS
static LCK_GRP_DECLARE(trigger_vnode_lck_grp, "trigger_vnode");
static LCK_ATTR_DECLARE(trigger_vnode_lck_attr, 0, 0);
#endif

extern lck_mtx_t mnt_list_mtx_lock;

static KALLOC_TYPE_DEFINE(specinfo_zone, struct specinfo, KT_DEFAULT);

ZONE_DEFINE(vnode_zone, "vnodes",
    sizeof(struct vnode), ZC_NOGC | ZC_ZFREE_CLEARMEM);

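/*
 * Translation tables between the S_IFMT file-type bits of a mode_t and
 * the vnode types (VNON, VREG, VDIR, ...), and back again.
 */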
enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

/* XXX These should be in a BSD accessible Mach header, but aren't. */
extern void memory_object_mark_used(
	memory_object_control_t control);

extern void memory_object_mark_unused(
	memory_object_control_t control,
	boolean_t rage);

extern void memory_object_mark_io_tracking(
	memory_object_control_t control);

extern int paniclog_append_noflush(const char *format, ...);

/* XXX next prototype should come from <libsa/stdlib.h> but conflicts with libkern */
__private_extern__ void qsort(
	void * array,
	size_t nmembers,
	size_t member_size,
	int (*)(const void *, const void *));

__private_extern__ void vntblinit(void);
__private_extern__ int unlink1(vfs_context_t, vnode_t, user_addr_t,
    enum uio_seg, int);

static void vnode_list_add(vnode_t);
static void vnode_async_list_add(vnode_t);
static void vnode_list_remove(vnode_t);
static void vnode_list_remove_locked(vnode_t);

static void vnode_abort_advlocks(vnode_t);
static errno_t vnode_drain(vnode_t);
static void vgone(vnode_t, int flags);
static void vclean(vnode_t vp, int flag);
static void vnode_reclaim_internal(vnode_t, int, int, int);

static void vnode_dropiocount(vnode_t);

static vnode_t checkalias(vnode_t vp, dev_t nvp_rdev);
static int vnode_reload(vnode_t);

static int unmount_callback(mount_t, __unused void *);

static void insmntque(vnode_t vp, mount_t mp);
static int mount_getvfscnt(void);
static int mount_fillfsids(fsid_t *, int);
static void vnode_iterate_setup(mount_t);
int vnode_umount_preflight(mount_t, vnode_t, int);
static int vnode_iterate_prepare(mount_t);
static int vnode_iterate_reloadq(mount_t);
static void vnode_iterate_clear(mount_t);
static mount_t vfs_getvfs_locked(fsid_t *);
static int vn_create_reg(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp,
    struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx);
static int vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uint32_t *defaulted_fieldsp, vfs_context_t ctx);

errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);

#ifdef CONFIG_IOCOUNT_TRACE
static void record_vp(vnode_t vp, int count);
static TUNABLE(int, bootarg_vnode_iocount_trace, "vnode_iocount_trace", 0);
static TUNABLE(int, bootarg_uthread_iocount_trace, "uthread_iocount_trace", 0);
#endif /* CONFIG_IOCOUNT_TRACE */

#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
static TUNABLE(bool, bootarg_no_vnode_jetsam, "-no_vnode_jetsam", false);
#endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */

static TUNABLE(bool, bootarg_no_vnode_drain, "-no_vnode_drain", false);

__options_decl(freeable_vnode_level_t, uint32_t, {
	DEALLOC_VNODE_NONE = 0,
	DEALLOC_VNODE_ONLY_OVERFLOW = 1,
	DEALLOC_VNODE_ALL = 2
});

#if XNU_TARGET_OS_OSX
static TUNABLE(freeable_vnode_level_t, bootarg_vn_dealloc_level, "vn_dealloc_level", DEALLOC_VNODE_NONE);
#else
static TUNABLE(freeable_vnode_level_t, bootarg_vn_dealloc_level, "vn_dealloc_level", DEALLOC_VNODE_ONLY_OVERFLOW);
#endif /* XNU_TARGET_OS_OSX */

static freeable_vnode_level_t vn_dealloc_level = DEALLOC_VNODE_NONE;

boolean_t root_is_CF_drive = FALSE;

#if CONFIG_TRIGGERS
static int vnode_resolver_create(mount_t, vnode_t, struct vnode_trigger_param *, boolean_t external);
static void vnode_resolver_detach(vnode_t);
#endif

TAILQ_HEAD(freelst, vnode) vnode_free_list;     /* vnode free list */
TAILQ_HEAD(deadlst, vnode) vnode_dead_list;     /* vnode dead list */
TAILQ_HEAD(async_work_lst, vnode) vnode_async_work_list;


TAILQ_HEAD(ragelst, vnode) vnode_rage_list;     /* vnode rapid age list */
struct timeval rage_tv;
int rage_limit = 0;
int ragevnodes = 0;

long reusablevnodes_max = LONG_MAX;
long reusablevnodes = 0;
int deadvnodes_low = 0;
int deadvnodes_high = 0;
int numvnodes_min = 0;
int numvnodes_max = 0;

uint64_t newvnode = 0;
unsigned long newvnode_nodead = 0;

static int vfs_unmountall_started = 0;
static int vfs_unmountall_finished = 0;
static uint64_t vfs_shutdown_last_completion_time;

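/* Floor for rage_limit and the rapid-age time window (in seconds). */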
#define RAGE_LIMIT_MIN  100
#define RAGE_TIME_LIMIT 5

VFS_SMR_DECLARE;
extern uint32_t nc_smr_enabled;

/*
 * ROSV definitions
 * NOTE: These are shadowed from PlatformSupport definitions, but XNU
 * builds standalone.
 */
#define PLATFORM_DATA_VOLUME_MOUNT_POINT "/System/Volumes/Data"

/*
 * These could be in PlatformSupport but aren't yet
 */
#define PLATFORM_PREBOOT_VOLUME_MOUNT_POINT "/System/Volumes/Preboot"
#define PLATFORM_RECOVERY_VOLUME_MOUNT_POINT "/System/Volumes/Recovery"

#if CONFIG_MOUNT_VM
#define PLATFORM_VM_VOLUME_MOUNT_POINT "/System/Volumes/VM"
#endif

struct mntlist mountlist;                       /* mounted filesystem list */
static int nummounts = 0;

static int print_busy_vnodes = 0;               /* print out busy vnodes */

#if DIAGNOSTIC
#define VLISTCHECK(fun, vp, list)       \
	if ((vp)->v_freelist.tqe_prev == (struct vnode **)0xdeadb) \
	        panic("%s: %s vnode not on %slist", (fun), (list), (list));
#else
#define VLISTCHECK(fun, vp, list)
#endif /* DIAGNOSTIC */

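/*
 * A vnode sits on at most one of the free/dead/rage/async-work lists,
 * linked through v_freelist; a tqe_prev of 0xdeadb is the "on no list"
 * sentinel checked by VONLIST().
 */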
#define VLISTNONE(vp)   \
	do {    \
	        (vp)->v_freelist.tqe_next = (struct vnode *)0;  \
	        (vp)->v_freelist.tqe_prev = (struct vnode **)0xdeadb;   \
	} while(0)

#define VONLIST(vp)     \
	((vp)->v_freelist.tqe_prev != (struct vnode **)0xdeadb)

/* remove a vnode from free vnode list */
#define VREMFREE(fun, vp)       \
	do {    \
	        VLISTCHECK((fun), (vp), "free");        \
	        TAILQ_REMOVE(&vnode_free_list, (vp), v_freelist);       \
	        VLISTNONE((vp));        \
	        freevnodes--;   \
	        reusablevnodes--;       \
	} while(0)


/* remove a vnode from dead vnode list */
#define VREMDEAD(fun, vp)       \
	do {    \
	        VLISTCHECK((fun), (vp), "dead");        \
	        TAILQ_REMOVE(&vnode_dead_list, (vp), v_freelist);       \
	        VLISTNONE((vp));        \
	        vp->v_listflag &= ~VLIST_DEAD;  \
	        deadvnodes--;   \
	        if (vp->v_listflag & VLIST_NO_REUSE) {  \
	                deadvnodes_noreuse--;   \
	        }       \
	} while(0)


/* remove a vnode from async work vnode list */
#define VREMASYNC_WORK(fun, vp) \
	do {    \
	        VLISTCHECK((fun), (vp), "async_work");  \
	        TAILQ_REMOVE(&vnode_async_work_list, (vp), v_freelist); \
	        VLISTNONE((vp));        \
	        vp->v_listflag &= ~VLIST_ASYNC_WORK;    \
	        async_work_vnodes--;    \
	        if (!(vp->v_listflag & VLIST_NO_REUSE)) {       \
	                reusablevnodes--;       \
	        }       \
	} while(0)


/* remove a vnode from rage vnode list */
#define VREMRAGE(fun, vp)       \
	do {    \
	        if ( !(vp->v_listflag & VLIST_RAGE))    \
	                panic("VREMRAGE: vp not on rage list"); \
	        VLISTCHECK((fun), (vp), "rage");        \
	        TAILQ_REMOVE(&vnode_rage_list, (vp), v_freelist);       \
	        VLISTNONE((vp));        \
	        vp->v_listflag &= ~VLIST_RAGE;  \
	        ragevnodes--;   \
	        reusablevnodes--;       \
	} while(0)

static void async_work_continue(void);
static void vn_laundry_continue(void);
static void wakeup_laundry_thread(void);
static void vnode_smr_free(void *, size_t);

CA_EVENT(freeable_vnodes,
    CA_INT, numvnodes_min,
    CA_INT, numvnodes_max,
    CA_INT, desiredvnodes,
    CA_INT, numvnodes,
    CA_INT, freevnodes,
    CA_INT, deadvnodes,
    CA_INT, freeablevnodes,
    CA_INT, busyvnodes,
    CA_BOOL, threshold_crossed);
static CA_EVENT_TYPE(freeable_vnodes) freeable_vnodes_telemetry;

static bool freeablevnodes_threshold_crossed = false;

/*
 * Initialize the vnode management data structures.
 */
__private_extern__ void
vntblinit(void)
{
	thread_t        thread = THREAD_NULL;
	int             desiredvnodes_one_percent = desiredvnodes / 100;

	TAILQ_INIT(&vnode_free_list);
	TAILQ_INIT(&vnode_rage_list);
	TAILQ_INIT(&vnode_dead_list);
	TAILQ_INIT(&vnode_async_work_list);
	TAILQ_INIT(&mountlist);

	microuptime(&rage_tv);
	rage_limit = desiredvnodes_one_percent;
	if (rage_limit < RAGE_LIMIT_MIN) {
		rage_limit = RAGE_LIMIT_MIN;
	}

	deadvnodes_low = desiredvnodes_one_percent;
	if (deadvnodes_low > 300) {
		deadvnodes_low = 300;
	}
	deadvnodes_high = deadvnodes_low * 2;

	numvnodes_min = numvnodes_max = desiredvnodes;
	if (bootarg_vn_dealloc_level == DEALLOC_VNODE_ONLY_OVERFLOW) {
		numvnodes_max = desiredvnodes * 2;
		vn_dealloc_level = bootarg_vn_dealloc_level;
	} else if (bootarg_vn_dealloc_level == DEALLOC_VNODE_ALL) {
		numvnodes_min = desiredvnodes_one_percent * 40;
		numvnodes_max = desiredvnodes * 2;
		reusablevnodes_max = (desiredvnodes_one_percent * 20) - deadvnodes_low;
		vn_dealloc_level = bootarg_vn_dealloc_level;
	}

	bzero(&freeable_vnodes_telemetry, sizeof(CA_EVENT_TYPE(freeable_vnodes)));
	freeable_vnodes_telemetry.numvnodes_min = numvnodes_min;
	freeable_vnodes_telemetry.numvnodes_max = numvnodes_max;
	freeable_vnodes_telemetry.desiredvnodes = desiredvnodes;

	if (nc_smr_enabled) {
		zone_enable_smr(vnode_zone, VFS_SMR(), &vnode_smr_free);
	}

	/*
	 * create worker threads
	 */
	kernel_thread_start((thread_continue_t)async_work_continue, NULL, &thread);
	thread_deallocate(thread);
	kernel_thread_start((thread_continue_t)vn_laundry_continue, NULL, &thread);
	thread_deallocate(thread);
}

/* the timeout is in 10 msecs */
int
vnode_waitforwrites(vnode_t vp, int output_target, int slpflag, int slptimeout, const char *msg)
{
	int error = 0;
	struct timespec ts;

	if (output_target < 0) {
		return EINVAL;
	}

	KERNEL_DEBUG(0x3010280 | DBG_FUNC_START, (int)vp, output_target, vp->v_numoutput, 0, 0);

	if (vp->v_numoutput > output_target) {
		slpflag |= PDROP;

		vnode_lock_spin(vp);

		while ((vp->v_numoutput > output_target) && error == 0) {
			if (output_target) {
				vp->v_flag |= VTHROTTLED;
			} else {
				vp->v_flag |= VBWAIT;
			}

			ts.tv_sec = (slptimeout / 100);
			ts.tv_nsec = (slptimeout % 100) * 10 * NSEC_PER_USEC * 1000;
			error = msleep((caddr_t)&vp->v_numoutput, &vp->v_lock, (slpflag | (PRIBIO + 1)), msg, &ts);

			vnode_lock_spin(vp);
		}
		vnode_unlock(vp);
	}
	KERNEL_DEBUG(0x3010280 | DBG_FUNC_END, (int)vp, output_target, vp->v_numoutput, error, 0);

	return error;
}


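/*
 * Account for the start of a write against this vnode; every call must be
 * balanced by a vnode_writedone().
 */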
void
vnode_startwrite(vnode_t vp)
{
	OSAddAtomic(1, &vp->v_numoutput);
}


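/*
 * Account for a completed write and, once v_numoutput allows, wake up any
 * threads throttled on, or waiting to flush, this vnode's writes.
 */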
void
vnode_writedone(vnode_t vp)
{
	if (vp) {
		int need_wakeup = 0;

		OSAddAtomic(-1, &vp->v_numoutput);

		vnode_lock_spin(vp);

		if (vp->v_numoutput < 0) {
			panic("vnode_writedone: numoutput < 0");
		}

		if ((vp->v_flag & VTHROTTLED)) {
			vp->v_flag &= ~VTHROTTLED;
			need_wakeup = 1;
		}
		if ((vp->v_flag & VBWAIT) && (vp->v_numoutput == 0)) {
			vp->v_flag &= ~VBWAIT;
			need_wakeup = 1;
		}
		vnode_unlock(vp);

		if (need_wakeup) {
			wakeup((caddr_t)&vp->v_numoutput);
		}
	}
}



int
vnode_hasdirtyblks(vnode_t vp)
{
	struct cl_writebehind *wbp;

	/*
	 * Not taking the buf_mtx as there is little
	 * point in doing it. Even if the lock is taken the
	 * state can change right after that. If there
	 * needs to be synchronization, it must be driven
	 * by the caller.
	 */
	if (vp->v_dirtyblkhd.lh_first) {
		return 1;
	}

	if (!UBCINFOEXISTS(vp)) {
		return 0;
	}

	wbp = vp->v_ubcinfo->cl_wbehind;

	if (wbp && (wbp->cl_number || wbp->cl_scmap)) {
		return 1;
	}

	return 0;
}

int
vnode_hascleanblks(vnode_t vp)
{
	/*
	 * Not taking the buf_mtx as there is little
	 * point in doing it. Even if the lock is taken the
	 * state can change right after that. If there
	 * needs to be synchronization, it must be driven
	 * by the caller.
	 */
	if (vp->v_cleanblkhd.lh_first) {
		return 1;
	}
	return 0;
}

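/* Mark the mount as being iterated; undone by vnode_iterate_clear(). */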
void
vnode_iterate_setup(mount_t mp)
{
	mp->mnt_lflag |= MNT_LITER;
}

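/*
 * Scan the mount's vnode list for vnodes that would prevent an unmount
 * (subject to the SKIPSYSTEM, SKIPSWAP and WRITECLOSE flags); returns
 * nonzero if a busy vnode is found.
 */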
int
vnode_umount_preflight(mount_t mp, vnode_t skipvp, int flags)
{
	vnode_t vp;
	int ret = 0;

	TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
		if (vp->v_type == VDIR) {
			continue;
		}
		if (vp == skipvp) {
			continue;
		}
		if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) || (vp->v_flag & VNOFLUSH))) {
			continue;
		}
		if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) {
			continue;
		}
		if ((flags & WRITECLOSE) && (vp->v_writecount == 0 || vp->v_type != VREG)) {
			continue;
		}

		/* Look for busy vnode */
		if ((vp->v_usecount != 0) && ((vp->v_usecount - vp->v_kusecount) != 0)) {
			ret = 1;
			if (print_busy_vnodes && ((flags & FORCECLOSE) == 0)) {
				vprint("vnode_umount_preflight - busy vnode", vp);
			} else {
				return ret;
			}
		} else if (vp->v_iocount > 0) {
			/* Busy if iocount is > 0 for more than 3 seconds */
			tsleep(&vp->v_iocount, PVFS, "vnode_drain_network", 3 * hz);
			if (vp->v_iocount > 0) {
				ret = 1;
				if (print_busy_vnodes && ((flags & FORCECLOSE) == 0)) {
					vprint("vnode_umount_preflight - busy vnode", vp);
				} else {
					return ret;
				}
			}
			continue;
		}
	}

	return ret;
}

/*
 * This routine prepares for iteration by moving all the vnodes to the
 * worker queue. Called with the mount lock held.
 */
int
vnode_iterate_prepare(mount_t mp)
{
	vnode_t vp;

	if (TAILQ_EMPTY(&mp->mnt_vnodelist)) {
		/* nothing to do */
		return 0;
	}

	vp = TAILQ_FIRST(&mp->mnt_vnodelist);
	vp->v_mntvnodes.tqe_prev = &(mp->mnt_workerqueue.tqh_first);
	mp->mnt_workerqueue.tqh_first = mp->mnt_vnodelist.tqh_first;
	mp->mnt_workerqueue.tqh_last = mp->mnt_vnodelist.tqh_last;

	TAILQ_INIT(&mp->mnt_vnodelist);
	if (mp->mnt_newvnodes.tqh_first != NULL) {
		panic("vnode_iterate_prepare: newvnode when entering vnode");
	}
	TAILQ_INIT(&mp->mnt_newvnodes);

	return 1;
}


/* called with mount lock held */
int
vnode_iterate_reloadq(mount_t mp)
{
	int moved = 0;

	/* add the remaining entries in workerq to the end of mount vnode list */
	if (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
		struct vnode * mvp;
		mvp = TAILQ_LAST(&mp->mnt_vnodelist, vnodelst);

		/* Join the workerqueue entries to the mount vnode list */
		if (mvp) {
			mvp->v_mntvnodes.tqe_next = mp->mnt_workerqueue.tqh_first;
		} else {
			mp->mnt_vnodelist.tqh_first = mp->mnt_workerqueue.tqh_first;
		}
		mp->mnt_workerqueue.tqh_first->v_mntvnodes.tqe_prev = mp->mnt_vnodelist.tqh_last;
		mp->mnt_vnodelist.tqh_last = mp->mnt_workerqueue.tqh_last;
		TAILQ_INIT(&mp->mnt_workerqueue);
	}

	/* add the newvnodes to the head of mount vnode list */
	if (!TAILQ_EMPTY(&mp->mnt_newvnodes)) {
		struct vnode * nlvp;
		nlvp = TAILQ_LAST(&mp->mnt_newvnodes, vnodelst);

		mp->mnt_newvnodes.tqh_first->v_mntvnodes.tqe_prev = &mp->mnt_vnodelist.tqh_first;
		nlvp->v_mntvnodes.tqe_next = mp->mnt_vnodelist.tqh_first;
		if (mp->mnt_vnodelist.tqh_first) {
			mp->mnt_vnodelist.tqh_first->v_mntvnodes.tqe_prev = &nlvp->v_mntvnodes.tqe_next;
		} else {
			mp->mnt_vnodelist.tqh_last = mp->mnt_newvnodes.tqh_last;
		}
		mp->mnt_vnodelist.tqh_first = mp->mnt_newvnodes.tqh_first;
		TAILQ_INIT(&mp->mnt_newvnodes);
		moved = 1;
	}

	return moved;
}


void
vnode_iterate_clear(mount_t mp)
{
	mp->mnt_lflag &= ~MNT_LITER;
}

#if defined(__x86_64__)

#include <i386/panic_hooks.h>

struct vnode_iterate_panic_hook {
	panic_hook_t hook;
	mount_t mp;
	struct vnode *vp;
};

static void
vnode_iterate_panic_hook(panic_hook_t *hook_)
{
	struct vnode_iterate_panic_hook *hook = (struct vnode_iterate_panic_hook *)hook_;
	panic_phys_range_t range;
	uint64_t phys;

	if (panic_phys_range_before(hook->mp, &phys, &range)) {
		paniclog_append_noflush("mp = %p, phys = %p, prev (%p: %p-%p)\n",
		    hook->mp, phys, range.type, range.phys_start,
		    range.phys_start + range.len);
	} else {
		paniclog_append_noflush("mp = %p, phys = %p, prev (!)\n", hook->mp, phys);
	}

	if (panic_phys_range_before(hook->vp, &phys, &range)) {
		paniclog_append_noflush("vp = %p, phys = %p, prev (%p: %p-%p)\n",
		    hook->vp, phys, range.type, range.phys_start,
		    range.phys_start + range.len);
	} else {
		paniclog_append_noflush("vp = %p, phys = %p, prev (!)\n", hook->vp, phys);
	}
	panic_dump_mem((void *)(((vm_offset_t)hook->mp - 4096) & ~4095), 12288);
}
#endif /* defined(__x86_64__) */

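/*
 * Iterate the mount's vnodes, invoking the callout on each one with an
 * iocount held. Returning VNODE_RETURNED (or VNODE_RETURNED_DONE) tells
 * vnode_iterate() to drop that iocount; VNODE_CLAIMED means the callout
 * has taken over the reference. A minimal callout sketch (hypothetical
 * helper, for illustration only):
 *
 *	static int
 *	count_regular_files(struct vnode *vp, void *arg)
 *	{
 *		if (vnode_vtype(vp) == VREG) {
 *			(*(int *)arg)++;
 *		}
 *		return VNODE_RETURNED;
 *	}
 */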
int
vnode_iterate(mount_t mp, int flags, int (*callout)(struct vnode *, void *),
    void *arg)
{
	struct vnode *vp;
	int vid, retval;
	int ret = 0;

	/*
	 * The mount iterate mutex is held for the duration of the iteration.
	 * This can be done by a state flag on the mount structure but we can
	 * run into priority inversion issues sometimes.
	 * Using a mutex allows us to benefit from the priority donation
	 * mechanisms in the kernel for locks. This mutex should never be
	 * acquired in spin mode and it should be acquired before attempting to
	 * acquire the mount lock.
	 */
	mount_iterate_lock(mp);

	mount_lock(mp);

	vnode_iterate_setup(mp);

	/* If it returns 0 then there is nothing to do */
	retval = vnode_iterate_prepare(mp);

	if (retval == 0) {
		vnode_iterate_clear(mp);
		mount_unlock(mp);
		mount_iterate_unlock(mp);
		return ret;
	}

#if defined(__x86_64__)
	struct vnode_iterate_panic_hook hook;
	hook.mp = mp;
	hook.vp = NULL;
	panic_hook(&hook.hook, vnode_iterate_panic_hook);
#endif
	/* iterate over all the vnodes */
	while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
		vp = TAILQ_FIRST(&mp->mnt_workerqueue);
#if defined(__x86_64__)
		hook.vp = vp;
#endif
		TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes);
		TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
		vid = vp->v_id;
		if ((vp->v_data == NULL) || (vp->v_type == VNON) || (vp->v_mount != mp)) {
			continue;
		}
		vnode_hold(vp);
		mount_unlock(mp);

		if (vget_internal(vp, vid, (flags | VNODE_NODEAD | VNODE_WITHID | VNODE_NOSUSPEND))) {
			mount_lock(mp);
			vnode_drop(vp);
			continue;
		}
		vnode_drop(vp);
		if (flags & VNODE_RELOAD) {
			/*
			 * we're reloading the filesystem
			 * cast out any inactive vnodes...
			 */
			if (vnode_reload(vp)) {
				/* vnode will be recycled on the refcount drop */
				vnode_put(vp);
				mount_lock(mp);
				continue;
			}
		}

		retval = callout(vp, arg);

		switch (retval) {
		case VNODE_RETURNED:
		case VNODE_RETURNED_DONE:
			vnode_put(vp);
			if (retval == VNODE_RETURNED_DONE) {
				mount_lock(mp);
				ret = 0;
				goto out;
			}
			break;

		case VNODE_CLAIMED_DONE:
			mount_lock(mp);
			ret = 0;
			goto out;
		case VNODE_CLAIMED:
		default:
			break;
		}
		mount_lock(mp);
	}

out:
#if defined(__x86_64__)
	panic_unhook(&hook.hook);
#endif
	(void)vnode_iterate_reloadq(mp);
	vnode_iterate_clear(mp);
	mount_unlock(mp);
	mount_iterate_unlock(mp);
	return ret;
}

void
mount_lock_renames(mount_t mp)
{
	lck_mtx_lock(&mp->mnt_renamelock);
}

void
mount_unlock_renames(mount_t mp)
{
	lck_mtx_unlock(&mp->mnt_renamelock);
}

void
mount_iterate_lock(mount_t mp)
{
	lck_mtx_lock(&mp->mnt_iter_lock);
}

void
mount_iterate_unlock(mount_t mp)
{
	lck_mtx_unlock(&mp->mnt_iter_lock);
}

void
mount_lock(mount_t mp)
{
	lck_mtx_lock(&mp->mnt_mlock);
}

void
mount_lock_spin(mount_t mp)
{
	lck_mtx_lock_spin(&mp->mnt_mlock);
}

void
mount_unlock(mount_t mp)
{
	lck_mtx_unlock(&mp->mnt_mlock);
}


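/*
 * mnt_count holds short-term references that delay mount teardown;
 * mount_refdrain() below sleeps until it drops to zero.
 */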
void
mount_ref(mount_t mp, int locked)
{
	if (!locked) {
		mount_lock_spin(mp);
	}

	mp->mnt_count++;

	if (!locked) {
		mount_unlock(mp);
	}
}


void
mount_drop(mount_t mp, int locked)
{
	if (!locked) {
		mount_lock_spin(mp);
	}

	mp->mnt_count--;

	if (mp->mnt_count == 0 && (mp->mnt_lflag & MNT_LDRAIN)) {
		wakeup(&mp->mnt_lflag);
	}

	if (!locked) {
		mount_unlock(mp);
	}
}


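/*
 * mnt_iterref counts in-flight vnode iterations over this mount; it is set
 * to -1 by mount_iterdrain() to refuse new iterations until
 * mount_iterreset() is called.
 */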
int
mount_iterref(mount_t mp, int locked)
{
	int retval = 0;

	if (!locked) {
		mount_list_lock();
	}
	if (mp->mnt_iterref < 0) {
		retval = 1;
	} else {
		mp->mnt_iterref++;
	}
	if (!locked) {
		mount_list_unlock();
	}
	return retval;
}

int
mount_isdrained(mount_t mp, int locked)
{
	int retval;

	if (!locked) {
		mount_list_lock();
	}
	if (mp->mnt_iterref < 0) {
		retval = 1;
	} else {
		retval = 0;
	}
	if (!locked) {
		mount_list_unlock();
	}
	return retval;
}

void
mount_iterdrop(mount_t mp)
{
	mount_list_lock();
	mp->mnt_iterref--;
	wakeup(&mp->mnt_iterref);
	mount_list_unlock();
}

void
mount_iterdrain(mount_t mp)
{
	mount_list_lock();
	while (mp->mnt_iterref) {
		msleep((caddr_t)&mp->mnt_iterref, &mnt_list_mtx_lock, PVFS, "mount_iterdrain", NULL);
	}
	/* mount iterations drained */
	mp->mnt_iterref = -1;
	mount_list_unlock();
}
void
mount_iterreset(mount_t mp)
{
	mount_list_lock();
	if (mp->mnt_iterref == -1) {
		mp->mnt_iterref = 0;
	}
	mount_list_unlock();
}

/* always called with mount lock held */
int
mount_refdrain(mount_t mp)
{
	if (mp->mnt_lflag & MNT_LDRAIN) {
		panic("already in drain");
	}
	mp->mnt_lflag |= MNT_LDRAIN;

	while (mp->mnt_count) {
		msleep((caddr_t)&mp->mnt_lflag, &mp->mnt_mlock, PVFS, "mount_drain", NULL);
	}

	if (mp->mnt_vnodelist.tqh_first != NULL) {
		panic("mount_refdrain: dangling vnode");
	}

	mp->mnt_lflag &= ~MNT_LDRAIN;

	return 0;
}

/* Tags the mount point as not supporting extended readdir for NFS exports */
void
mount_set_noreaddirext(mount_t mp)
{
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_DENY_READDIREXT;
	mount_unlock(mp);
}

/*
 * Mark a mount point as busy. Used to synchronize access and to delay
 * unmounting.
 */
int
vfs_busy(mount_t mp, int flags)
{
restart:
	if (mp->mnt_lflag & MNT_LDEAD) {
		return ENOENT;
	}

	mount_lock(mp);

	if (mp->mnt_lflag & MNT_LUNMOUNT) {
		if (flags & LK_NOWAIT || mp->mnt_lflag & MNT_LDEAD) {
			mount_unlock(mp);
			return ENOENT;
		}

		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		mp->mnt_lflag |= MNT_LWAIT;
		msleep((caddr_t)mp, &mp->mnt_mlock, (PVFS | PDROP), "vfsbusy", NULL);
		return ENOENT;
	}

	mount_unlock(mp);

	lck_rw_lock_shared(&mp->mnt_rwlock);

	/*
	 * Until we are granted the rwlock, it's possible for the mount point to
	 * change state, so re-evaluate before granting the vfs_busy.
	 */
	if (mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) {
		lck_rw_done(&mp->mnt_rwlock);
		goto restart;
	}
	return 0;
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
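
/*
 * A sketch of the usual vfs_busy()/vfs_unbusy() pairing: a successful
 * vfs_busy() holds mnt_rwlock shared, so every success must be balanced:
 *
 *	if (vfs_busy(mp, LK_NOWAIT) == 0) {
 *		// ... operate on the mount ...
 *		vfs_unbusy(mp);
 *	}
 */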



static void
vfs_rootmountfailed(mount_t mp)
{
	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	vfs_unbusy(mp);

	if (nc_smr_enabled) {
		vfs_smr_synchronize();
	}

	mount_lock_destroy(mp);

#if CONFIG_MACF
	mac_mount_label_destroy(mp);
#endif

	zfree(mount_zone, mp);
}

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
static mount_t
vfs_rootmountalloc_internal(struct vfstable *vfsp, const char *devname)
{
	mount_t mp;

	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
	/* Initialize the default IO constraints */
	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
	mp->mnt_devblocksize = DEV_BSIZE;
	mp->mnt_alignmentmask = PAGE_MASK;
	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
	mp->mnt_ioscale = 1;
	mp->mnt_ioflags = 0;
	mp->mnt_realrootvp = NULLVP;
	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
	mp->mnt_devbsdunit = 0;

	mount_lock_init(mp);
	(void)vfs_busy(mp, LK_NOWAIT);

	TAILQ_INIT(&mp->mnt_vnodelist);
	TAILQ_INIT(&mp->mnt_workerqueue);
	TAILQ_INIT(&mp->mnt_newvnodes);

	mp->mnt_vtable = vfsp;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_flag = MNT_RDONLY | MNT_ROOTFS;
	mp->mnt_vnodecovered = NULLVP;
	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;

	mount_list_lock();
	vfsp->vfc_refcount++;
	mount_list_unlock();

	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
	mp->mnt_vfsstat.f_mntonname[0] = '/';
	/* XXX const poisoning layering violation */
	(void) copystr((const void *)devname, mp->mnt_vfsstat.f_mntfromname, MAXPATHLEN - 1, NULL);

#if CONFIG_MACF
	mac_mount_label_init(mp);
	mac_mount_label_associate(vfs_context_kernel(), mp);
#endif
	return mp;
}

errno_t
vfs_rootmountalloc(const char *fstypename, const char *devname, mount_t *mpp)
{
	struct vfstable *vfsp;

	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (!strncmp(vfsp->vfc_name, fstypename,
		    sizeof(vfsp->vfc_name))) {
			break;
		}
	}
	if (vfsp == NULL) {
		return ENODEV;
	}

	*mpp = vfs_rootmountalloc_internal(vfsp, devname);

	if (*mpp) {
		return 0;
	}

	return ENOMEM;
}

#define DBG_MOUNTROOT   (FSDBG_CODE(DBG_MOUNT, 0))

/*
 * Find an appropriate filesystem to use for the root. If a filesystem
 * has not been preselected, walk through the list of known filesystems
 * trying those that have mountroot routines, and try them until one
 * works or we have tried them all.
 */
extern int (*mountroot)(void);

int
vfs_mountroot(void)
{
#if CONFIG_MACF
	struct vnode *vp;
#endif
	struct vfstable *vfsp;
	vfs_context_t ctx = vfs_context_kernel();
	struct vfs_attr vfsattr;
	int error;
	mount_t mp;
	vnode_t bdevvp_rootvp;

	/*
	 * Reset any prior "unmounting everything" state.  This handles the
	 * situation where mount root and then unmountall and re-mountroot
	 * a new image (see bsd/kern/imageboot.c).
	 */
	vfs_unmountall_started = vfs_unmountall_finished = 0;
	OSMemoryBarrier();

	KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_START);
	if (mountroot != NULL) {
		/*
		 * used for netboot which follows a different set of rules
		 */
		error = (*mountroot)();

		KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, error, 0);
		return error;
	}
	if ((error = bdevvp(rootdev, &rootvp))) {
		printf("vfs_mountroot: can't setup bdevvp\n");

		KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, error, 1);
		return error;
	}
	/*
	 * 4951998 - code we call in vfc_mountroot may replace rootvp
	 * so keep a local copy for some house keeping.
	 */
	bdevvp_rootvp = rootvp;

	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (vfsp->vfc_mountroot == NULL
		    && !ISSET(vfsp->vfc_vfsflags, VFC_VFSCANMOUNTROOT)) {
			continue;
		}

		mp = vfs_rootmountalloc_internal(vfsp, "root_device");
		mp->mnt_devvp = rootvp;

		if (vfsp->vfc_mountroot) {
			error = (*vfsp->vfc_mountroot)(mp, rootvp, ctx);
		} else {
			error = VFS_MOUNT(mp, rootvp, 0, ctx);
		}

		if (!error) {
			if (bdevvp_rootvp != rootvp) {
				/*
				 * rootvp changed...
				 * bump the iocount and fix up mnt_devvp for the
				 * new rootvp (it will already have a usecount taken)...
				 * drop the iocount and the usecount on the original
				 * since we are no longer going to use it...
				 */
				vnode_getwithref(rootvp);
				mp->mnt_devvp = rootvp;

				vnode_rele(bdevvp_rootvp);
				vnode_put(bdevvp_rootvp);
			}
			mp->mnt_devvp->v_specflags |= SI_MOUNTEDON;

			vfs_unbusy(mp);

			mount_list_add(mp);

			/*
			 * cache the IO attributes for the underlying physical media...
			 * an error return indicates the underlying driver doesn't
			 * support all the queries necessary... however, reasonable
			 * defaults will have been set, so no reason to bail or care
			 */
			vfs_init_io_attributes(rootvp, mp);

			if (mp->mnt_ioflags & MNT_IOFLAGS_FUSION_DRIVE) {
				root_is_CF_drive = TRUE;
			}

			/*
			 * Shadow the VFC_VFSNATIVEXATTR flag to MNTK_EXTENDED_ATTRS.
			 */
			if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
			}
			if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
				mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
			}

#if defined(XNU_TARGET_OS_OSX)
			uint32_t speed;

			if (MNTK_VIRTUALDEV & mp->mnt_kern_flag) {
				speed = 128;
			} else if (disk_conditioner_mount_is_ssd(mp)) {
				speed = 7 * 256;
			} else {
				speed = 256;
			}
			vc_progress_setdiskspeed(speed);
#endif /* XNU_TARGET_OS_OSX */
			/*
			 * Probe root file system for additional features.
			 */
			(void)VFS_START(mp, 0, ctx);

			VFSATTR_INIT(&vfsattr);
			VFSATTR_WANTED(&vfsattr, f_capabilities);
			if (vfs_getattr(mp, &vfsattr, ctx) == 0 &&
			    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
				if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
				    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
					mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
				}
#if NAMEDSTREAMS
				if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
				    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
					mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
				}
#endif
				if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
				    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
					mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
				}

				if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
				    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
					mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
				}
			}

			/*
			 * get rid of the iocount reference returned
			 * by bdevvp (or picked up by us on the substituted
			 * rootvp)... it (or we) will have also taken
			 * a usecount reference which we want to keep
			 */
			vnode_put(rootvp);

#if CONFIG_MACF
			if ((vfs_flags(mp) & MNT_MULTILABEL) == 0) {
				KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, 0, 2);
				return 0;
			}

			error = VFS_ROOT(mp, &vp, ctx);
			if (error) {
				printf("%s() VFS_ROOT() returned %d\n",
				    __func__, error);
				dounmount(mp, MNT_FORCE, 0, ctx);
				goto fail;
			}
			error = vnode_label(mp, NULL, vp, NULL, 0, ctx);
			/*
			 * get rid of reference provided by VFS_ROOT
			 */
			vnode_put(vp);

			if (error) {
				printf("%s() vnode_label() returned %d\n",
				    __func__, error);
				dounmount(mp, MNT_FORCE, 0, ctx);
				goto fail;
			}
#endif
			KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, 0, 3);
			return 0;
		}
		vfs_rootmountfailed(mp);
#if CONFIG_MACF
fail:
#endif
		if (error != EINVAL) {
			printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
		}
	}
	KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, error ? error : ENODEV, 4);
	return ENODEV;
}

static int
cache_purge_callback(mount_t mp, __unused void * arg)
{
	cache_purgevfs(mp);
	return VFS_RETURNED;
}

extern lck_rw_t rootvnode_rw_lock;
extern void set_rootvnode(vnode_t);


static int
mntonname_fixup_callback(mount_t mp, __unused void *arg)
{
	int error = 0;

	if ((strncmp(&mp->mnt_vfsstat.f_mntonname[0], "/", sizeof("/")) == 0) ||
	    (strncmp(&mp->mnt_vfsstat.f_mntonname[0], "/dev", sizeof("/dev")) == 0)) {
		return 0;
	}

	if ((error = vfs_busy(mp, LK_NOWAIT))) {
		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
		return -1;
	}

	size_t pathlen = MAXPATHLEN;
	if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
	}

	vfs_unbusy(mp);

	return error;
}

static int
clear_mntk_backs_root_callback(mount_t mp, __unused void *arg)
{
	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	mp->mnt_kern_flag &= ~MNTK_BACKS_ROOT;
	lck_rw_done(&mp->mnt_rwlock);
	return VFS_RETURNED;
}

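/*
 * Verify that *incoming_rootvnodep is the root directory of a mounted,
 * non-busy filesystem that may serve as the root. On success the iocount
 * is moved onto the filesystem's true root vnode; on failure the iocount
 * is dropped and *incoming_rootvnodep is set to NULLVP.
 */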
static int
verify_incoming_rootfs(vnode_t *incoming_rootvnodep, vfs_context_t ctx,
    vfs_switch_root_flags_t flags)
{
	mount_t mp;
	vnode_t tdp;
	vnode_t incoming_rootvnode_with_iocount = *incoming_rootvnodep;
	vnode_t incoming_rootvnode_with_usecount = NULLVP;
	int error = 0;

	if (vnode_vtype(incoming_rootvnode_with_iocount) != VDIR) {
		printf("Incoming rootfs path not a directory\n");
		error = ENOTDIR;
		goto done;
	}

	/*
	 * Before we call VFS_ROOT, we have to let go of the iocount already
	 * acquired, but before doing that get a usecount.
	 */
	vnode_ref_ext(incoming_rootvnode_with_iocount, 0, VNODE_REF_FORCE);
	incoming_rootvnode_with_usecount = incoming_rootvnode_with_iocount;
	vnode_lock_spin(incoming_rootvnode_with_usecount);
	if ((mp = incoming_rootvnode_with_usecount->v_mount)) {
		mp->mnt_crossref++;
		vnode_unlock(incoming_rootvnode_with_usecount);
	} else {
		vnode_unlock(incoming_rootvnode_with_usecount);
		printf("Incoming rootfs root vnode does not have associated mount\n");
		error = ENOTDIR;
		goto done;
	}

	if (vfs_busy(mp, LK_NOWAIT)) {
		printf("Incoming rootfs root vnode mount is busy\n");
		error = ENOENT;
		goto out;
	}

	vnode_put(incoming_rootvnode_with_iocount);
	incoming_rootvnode_with_iocount = NULLVP;

	error = VFS_ROOT(mp, &tdp, ctx);

	if (error) {
		printf("Could not get rootvnode of incoming rootfs\n");
	} else if (tdp != incoming_rootvnode_with_usecount) {
		vnode_put(tdp);
		tdp = NULLVP;
1495 | printf("Incoming rootfs root vnode mount is is not a mountpoint\n" ); |
		error = EINVAL;
		goto out_busy;
	} else {
		incoming_rootvnode_with_iocount = tdp;
		tdp = NULLVP;
	}

	if ((flags & VFSSR_VIRTUALDEV_PROHIBITED) != 0) {
		if (mp->mnt_kern_flag & MNTK_VIRTUALDEV) {
			error = ENODEV;
		}
		if (error) {
			printf("Incoming rootfs is backed by a virtual device; cannot switch to it\n");
			goto out_busy;
		}
	}

out_busy:
	vfs_unbusy(mp);

out:
	vnode_lock(incoming_rootvnode_with_usecount);
	mp->mnt_crossref--;
	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}
	vnode_unlock(incoming_rootvnode_with_usecount);

done:
	if (incoming_rootvnode_with_usecount) {
		vnode_rele(incoming_rootvnode_with_usecount);
		incoming_rootvnode_with_usecount = NULLVP;
	}

	if (error && incoming_rootvnode_with_iocount) {
		vnode_put(incoming_rootvnode_with_iocount);
		incoming_rootvnode_with_iocount = NULLVP;
	}

	*incoming_rootvnodep = incoming_rootvnode_with_iocount;
	return error;
}

/*
 * vfs_switch_root()
 *
 * Move the current root volume, and put a different volume at the root.
 *
 * incoming_vol_old_path: This is the path where the incoming root volume
 * is mounted when this function begins.
 * outgoing_vol_new_path: This is the path where the outgoing root volume
 * will be mounted when this function (successfully) ends.
 * Note: Do not use a leading slash.
 *
 * Volumes mounted at several fixed points (including /dev) will be preserved
 * at the same absolute path. That means they will move within the folder
 * hierarchy during the pivot operation. For example, /dev before the pivot
 * will be at /dev after the pivot.
 *
 * If any filesystem has MNTK_BACKS_ROOT set, it will be cleared. If the
 * incoming root volume is actually a disk image backed by some other
 * filesystem, it is the caller's responsibility to re-set MNTK_BACKS_ROOT
 * as appropriate.
 */
int
vfs_switch_root(const char *incoming_vol_old_path,
    const char *outgoing_vol_new_path,
    vfs_switch_root_flags_t flags)
{
	// grumble grumble
#define countof(x) (sizeof(x) / sizeof(x[0]))

	struct preserved_mount {
		vnode_t pm_rootvnode;
		mount_t pm_mount;
		vnode_t pm_new_covered_vp;
		vnode_t pm_old_covered_vp;
		const char *pm_path;
	};

	vfs_context_t ctx = vfs_context_kernel();
	vnode_t incoming_rootvnode = NULLVP;
	vnode_t outgoing_vol_new_covered_vp = NULLVP;
	vnode_t incoming_vol_old_covered_vp = NULLVP;
	mount_t outgoing = NULL;
	mount_t incoming = NULL;

	struct preserved_mount devfs = { NULLVP, NULL, NULLVP, NULLVP, "dev" };
	struct preserved_mount preboot = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/Preboot" };
	struct preserved_mount recovery = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/Recovery" };
	struct preserved_mount vm = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/VM" };
	struct preserved_mount update = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/Update" };
	struct preserved_mount iscPreboot = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/iSCPreboot" };
	struct preserved_mount hardware = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/Hardware" };
	struct preserved_mount xarts = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/xarts" };
	struct preserved_mount factorylogs = { NULLVP, NULL, NULLVP, NULLVP, "FactoryLogs" };
	struct preserved_mount idiags = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/Diags" };

	struct preserved_mount *preserved[10];
	preserved[0] = &devfs;
	preserved[1] = &preboot;
	preserved[2] = &recovery;
	preserved[3] = &vm;
	preserved[4] = &update;
	preserved[5] = &iscPreboot;
	preserved[6] = &hardware;
	preserved[7] = &xarts;
	preserved[8] = &factorylogs;
	preserved[9] = &idiags;

	int error;

	printf("%s : shuffling mount points : %s <-> / <-> %s\n", __FUNCTION__, incoming_vol_old_path, outgoing_vol_new_path);

	if (outgoing_vol_new_path[0] == '/') {
		// I should have written this to be more helpful and just advance the pointer forward past the slash
		printf("Do not use a leading slash in outgoing_vol_new_path\n");
		return EINVAL;
	}

	// Set incoming_rootvnode.
	// Find the vnode representing the mountpoint of the new root
	// filesystem. That will be the new root directory.
	error = vnode_lookup(incoming_vol_old_path, 0, &incoming_rootvnode, ctx);
	if (error) {
		printf("Incoming rootfs root vnode not found\n");
		error = ENOENT;
		goto done;
	}

	/*
	 * This function drops the iocount and sets the vnode to NULL on error.
	 */
1629 | error = verify_incoming_rootfs(incoming_rootvnodep: &incoming_rootvnode, ctx, flags); |
1630 | if (error) { |
1631 | goto done; |
1632 | } |
1633 | |
1634 | /* |
1635 | * Set outgoing_vol_new_covered_vp. |
1636 | * Find the vnode representing the future mountpoint of the old |
1637 | * root filesystem, inside the directory incoming_rootvnode. |
1638 | * Right now it's at "/incoming_vol_old_path/outgoing_vol_new_path". |
1639 | * soon it will become "/oldrootfs_path_after", which will be covered. |
1640 | */ |
1641 | error = vnode_lookupat(path: outgoing_vol_new_path, flags: 0, vpp: &outgoing_vol_new_covered_vp, ctx, start_dvp: incoming_rootvnode); |
1642 | if (error) { |
1643 | printf("Outgoing rootfs path not found, abandoning / switch, error = %d\n" , error); |
1644 | error = ENOENT; |
1645 | goto done; |
1646 | } |
1647 | if (vnode_vtype(vp: outgoing_vol_new_covered_vp) != VDIR) { |
1648 | printf("Outgoing rootfs path is not a directory, abandoning / switch\n" ); |
1649 | error = ENOTDIR; |
1650 | goto done; |
1651 | } |
1652 | |
1653 | /* |
1654 | * Find the preserved mounts - see if they are mounted. Get their root |
1655 | * vnode if they are. If they aren't, leave rootvnode NULL which will |
1656 | * be the signal to ignore this mount later on. |
1657 | * |
1658 | * Also get preserved mounts' new_covered_vp. |
1659 | * Find the node representing the folder "dev" inside the directory newrootvnode. |
1660 | * Right now it's at "/incoming_vol_old_path/dev". |
1661 | * Soon it will become /dev, which will be covered by the devfs mountpoint. |
1662 | */ |
1663 | for (size_t i = 0; i < countof(preserved); i++) { |
1664 | struct preserved_mount *pmi = preserved[i]; |
1665 | |
1666 | error = vnode_lookupat(path: pmi->pm_path, flags: 0, vpp: &pmi->pm_rootvnode, ctx, start_dvp: rootvnode); |
1667 | if (error) { |
1668 | printf("skipping preserved mountpoint because not found or error: %d: %s\n" , error, pmi->pm_path); |
1669 | // not fatal. try the next one in the list. |
1670 | continue; |
1671 | } |
1672 | bool is_mountpoint = false; |
1673 | vnode_lock_spin(pmi->pm_rootvnode); |
1674 | if ((pmi->pm_rootvnode->v_flag & VROOT) != 0) { |
1675 | is_mountpoint = true; |
1676 | } |
1677 | vnode_unlock(pmi->pm_rootvnode); |
1678 | if (!is_mountpoint) { |
1679 | printf("skipping preserved mountpoint because not a mountpoint: %s\n" , pmi->pm_path); |
1680 | vnode_put(vp: pmi->pm_rootvnode); |
1681 | pmi->pm_rootvnode = NULLVP; |
1682 | // not fatal. try the next one in the list. |
1683 | continue; |
1684 | } |
1685 | |
1686 | error = vnode_lookupat(path: pmi->pm_path, flags: 0, vpp: &pmi->pm_new_covered_vp, ctx, start_dvp: incoming_rootvnode); |
1687 | if (error) { |
1688 | printf("preserved new mount directory not found or error: %d: %s\n" , error, pmi->pm_path); |
1689 | error = ENOENT; |
1690 | goto done; |
1691 | } |
1692 | if (vnode_vtype(vp: pmi->pm_new_covered_vp) != VDIR) { |
1693 | printf("preserved new mount directory not directory: %s\n" , pmi->pm_path); |
1694 | error = ENOTDIR; |
1695 | goto done; |
1696 | } |
1697 | |
1698 | printf("will preserve mountpoint across pivot: /%s\n" , pmi->pm_path); |
1699 | } |
1700 | |
1701 | /* |
1702 | * -- |
1703 | * At this point, everything has been prepared and all error conditions |
1704 | * have been checked. We check everything we can before this point; |
1705 | * from now on we start making destructive changes, and we can't stop |
1706 | * until we reach the end. |
1707 | * ---- |
1708 | */ |
1709 | |
1710 | /* this usecount is transferred to the mnt_vnodecovered */ |
1711 | vnode_ref_ext(outgoing_vol_new_covered_vp, 0, VNODE_REF_FORCE); |
1712 | /* this usecount is transferred to set_rootvnode */ |
1713 | vnode_ref_ext(incoming_rootvnode, 0, VNODE_REF_FORCE); |
1714 | |
1715 | |
1716 | for (size_t i = 0; i < countof(preserved); i++) { |
1717 | struct preserved_mount *pmi = preserved[i]; |
1718 | if (pmi->pm_rootvnode == NULLVP) { |
1719 | continue; |
1720 | } |
1721 | |
1722 | /* this usecount is transferred to the mnt_vnodecovered */ |
1723 | vnode_ref_ext(pmi->pm_new_covered_vp, 0, VNODE_REF_FORCE); |
1724 | |
1725 | /* The new_covered_vp is a mountpoint from now on. */ |
1726 | vnode_lock_spin(pmi->pm_new_covered_vp); |
1727 | pmi->pm_new_covered_vp->v_flag |= VMOUNTEDHERE; |
1728 | vnode_unlock(pmi->pm_new_covered_vp); |
1729 | } |
1730 | |
1731 | /* The outgoing_vol_new_covered_vp is a mountpoint from now on. */ |
1732 | vnode_lock_spin(outgoing_vol_new_covered_vp); |
1733 | outgoing_vol_new_covered_vp->v_flag |= VMOUNTEDHERE; |
1734 | vnode_unlock(outgoing_vol_new_covered_vp); |
1735 | |
1736 | |
1737 | /* |
1738 | * Identify the mount_ts of the mounted filesystems that are being |
1739 | * manipulated: outgoing rootfs, incoming rootfs, and the preserved |
1740 | * mounts. |
1741 | */ |
1742 | outgoing = rootvnode->v_mount; |
1743 | incoming = incoming_rootvnode->v_mount; |
1744 | for (size_t i = 0; i < countof(preserved); i++) { |
1745 | struct preserved_mount *pmi = preserved[i]; |
1746 | if (pmi->pm_rootvnode == NULLVP) { |
1747 | continue; |
1748 | } |
1749 | |
1750 | pmi->pm_mount = pmi->pm_rootvnode->v_mount; |
1751 | } |
1752 | |
lck_rw_lock_exclusive(&rootvnode_rw_lock);

/* Setup incoming as the new rootfs */
lck_rw_lock_exclusive(&incoming->mnt_rwlock);
incoming_vol_old_covered_vp = incoming->mnt_vnodecovered;
incoming->mnt_vnodecovered = NULLVP;
strlcpy(incoming->mnt_vfsstat.f_mntonname, "/", MAXPATHLEN);
incoming->mnt_flag |= MNT_ROOTFS;
lck_rw_done(&incoming->mnt_rwlock);
1762 | |
1763 | /* |
1764 | * The preserved mountpoints will now be moved to |
1765 | * incoming_rootnode/pm_path, and then by the end of the function, |
1766 | * since incoming_rootnode is going to /, the preserved mounts |
1767 | * will be end up back at /pm_path |
1768 | */ |
1769 | for (size_t i = 0; i < countof(preserved); i++) { |
1770 | struct preserved_mount *pmi = preserved[i]; |
1771 | if (pmi->pm_rootvnode == NULLVP) { |
1772 | continue; |
1773 | } |
1774 | |
1775 | lck_rw_lock_exclusive(lck: &pmi->pm_mount->mnt_rwlock); |
1776 | pmi->pm_old_covered_vp = pmi->pm_mount->mnt_vnodecovered; |
1777 | pmi->pm_mount->mnt_vnodecovered = pmi->pm_new_covered_vp; |
1778 | vnode_lock_spin(pmi->pm_new_covered_vp); |
1779 | pmi->pm_new_covered_vp->v_mountedhere = pmi->pm_mount; |
1780 | SET(pmi->pm_new_covered_vp->v_flag, VMOUNTEDHERE); |
1781 | vnode_unlock(pmi->pm_new_covered_vp); |
1782 | lck_rw_done(lck: &pmi->pm_mount->mnt_rwlock); |
1783 | } |
1784 | |
1785 | /* |
1786 | * The old root volume now covers outgoing_vol_new_covered_vp |
1787 | * on the new root volume. Remove the ROOTFS marker. |
1788 | * Now it is to be found at outgoing_vol_new_path |
1789 | */ |
1790 | lck_rw_lock_exclusive(lck: &outgoing->mnt_rwlock); |
1791 | outgoing->mnt_vnodecovered = outgoing_vol_new_covered_vp; |
1792 | strlcpy(dst: outgoing->mnt_vfsstat.f_mntonname, src: "/" , MAXPATHLEN); |
1793 | strlcat(dst: outgoing->mnt_vfsstat.f_mntonname, src: outgoing_vol_new_path, MAXPATHLEN); |
1794 | outgoing->mnt_flag &= ~MNT_ROOTFS; |
1795 | vnode_lock_spin(outgoing_vol_new_covered_vp); |
1796 | outgoing_vol_new_covered_vp->v_mountedhere = outgoing; |
1797 | vnode_unlock(outgoing_vol_new_covered_vp); |
1798 | lck_rw_done(lck: &outgoing->mnt_rwlock); |
1799 | |
1800 | if (!(outgoing->mnt_kern_flag & MNTK_VIRTUALDEV) && |
1801 | (TAILQ_FIRST(&mountlist) == outgoing)) { |
vfs_setmntsystem(outgoing);
1803 | } |
1804 | |
1805 | /* |
1806 | * Finally, remove the mount_t linkage from the previously covered |
1807 | * vnodes on the old root volume. These were incoming_vol_old_path, |
* and each preserved mount's "/pm_path". The filesystems previously
1809 | * mounted there have already been moved away. |
1810 | */ |
1811 | vnode_lock_spin(incoming_vol_old_covered_vp); |
1812 | incoming_vol_old_covered_vp->v_flag &= ~VMOUNT; |
1813 | incoming_vol_old_covered_vp->v_mountedhere = NULL; |
1814 | vnode_unlock(incoming_vol_old_covered_vp); |
1815 | |
1816 | for (size_t i = 0; i < countof(preserved); i++) { |
1817 | struct preserved_mount *pmi = preserved[i]; |
1818 | if (pmi->pm_rootvnode == NULLVP) { |
1819 | continue; |
1820 | } |
1821 | |
1822 | vnode_lock_spin(pmi->pm_old_covered_vp); |
1823 | CLR(pmi->pm_old_covered_vp->v_flag, VMOUNTEDHERE); |
1824 | pmi->pm_old_covered_vp->v_mountedhere = NULL; |
1825 | vnode_unlock(pmi->pm_old_covered_vp); |
1826 | } |
1827 | |
1828 | /* |
1829 | * Clear the name cache since many cached names are now invalid. |
1830 | */ |
vfs_iterate(0 /* flags */, cache_purge_callback, NULL);
1832 | |
1833 | /* |
1834 | * Actually change the rootvnode! And finally drop the lock that |
1835 | * prevents concurrent vnode_lookups. |
1836 | */ |
1837 | set_rootvnode(incoming_rootvnode); |
lck_rw_unlock_exclusive(&rootvnode_rw_lock);
1839 | |
1840 | if (!(incoming->mnt_kern_flag & MNTK_VIRTUALDEV) && |
1841 | !(outgoing->mnt_kern_flag & MNTK_VIRTUALDEV)) { |
1842 | /* |
1843 | * Switch the order of mount structures in the mountlist, new root |
1844 | * mount moves to the head of the list followed by /dev and the other |
1845 | * preserved mounts then all the preexisting mounts (old rootfs + any |
1846 | * others) |
1847 | */ |
1848 | mount_list_lock(); |
1849 | for (size_t i = 0; i < countof(preserved); i++) { |
1850 | struct preserved_mount *pmi = preserved[i]; |
1851 | if (pmi->pm_rootvnode == NULLVP) { |
1852 | continue; |
1853 | } |
1854 | |
1855 | TAILQ_REMOVE(&mountlist, pmi->pm_mount, mnt_list); |
1856 | TAILQ_INSERT_HEAD(&mountlist, pmi->pm_mount, mnt_list); |
1857 | } |
1858 | TAILQ_REMOVE(&mountlist, incoming, mnt_list); |
1859 | TAILQ_INSERT_HEAD(&mountlist, incoming, mnt_list); |
1860 | mount_list_unlock(); |
1861 | } |
1862 | |
1863 | /* |
1864 | * Fixups across all volumes |
1865 | */ |
vfs_iterate(0 /* flags */, mntonname_fixup_callback, NULL);
vfs_iterate(0 /* flags */, clear_mntk_backs_root_callback, NULL);
1868 | |
1869 | error = 0; |
1870 | |
1871 | done: |
for (size_t i = 0; i < countof(preserved); i++) {
struct preserved_mount *pmi = preserved[i];

if (pmi->pm_rootvnode) {
vnode_put(pmi->pm_rootvnode);
}
if (pmi->pm_new_covered_vp) {
vnode_put(pmi->pm_new_covered_vp);
}
if (pmi->pm_old_covered_vp) {
vnode_rele(pmi->pm_old_covered_vp);
}
}

if (outgoing_vol_new_covered_vp) {
vnode_put(outgoing_vol_new_covered_vp);
}

if (incoming_vol_old_covered_vp) {
vnode_rele(incoming_vol_old_covered_vp);
}

if (incoming_rootvnode) {
vnode_put(incoming_rootvnode);
}

printf("%s : done shuffling mount points with error: %d\n", __FUNCTION__, error);
1899 | return error; |
1900 | } |
1901 | |
1902 | /* |
1903 | * Mount the Recovery volume of a container |
1904 | */ |
1905 | int |
1906 | vfs_mount_recovery(void) |
1907 | { |
1908 | #if CONFIG_MOUNT_PREBOOTRECOVERY |
1909 | int error = 0; |
1910 | |
error = vnode_get(rootvnode);
if (error) {
/* root must be mounted first */
printf("vnode_get(rootvnode) failed with error %d\n", error);
return error;
}

char recoverypath[] = PLATFORM_RECOVERY_VOLUME_MOUNT_POINT; /* !const because of internal casting */

/* Mount the recovery volume */
printf("attempting kernel mount for recovery volume...\n");
error = kernel_mount(rootvnode->v_mount->mnt_vfsstat.f_fstypename, NULLVP, NULLVP,
recoverypath, (rootvnode->v_mount), 0, 0, (KERNEL_MOUNT_RECOVERYVOL), vfs_context_kernel());

if (error) {
printf("Failed to mount recovery volume (%d)\n", error);
} else {
printf("mounted recovery volume\n");
}

vnode_put(rootvnode);
1932 | return error; |
1933 | #else |
1934 | return 0; |
1935 | #endif |
1936 | } |
1937 | |
1938 | /* |
1939 | * Lookup a mount point by filesystem identifier. |
1940 | */ |
1941 | |
1942 | struct mount * |
1943 | vfs_getvfs(fsid_t *fsid) |
1944 | { |
1945 | return mount_list_lookupby_fsid(fsid, 0, 0); |
1946 | } |
1947 | |
1948 | static struct mount * |
1949 | vfs_getvfs_locked(fsid_t *fsid) |
1950 | { |
1951 | return mount_list_lookupby_fsid(fsid, 1, 0); |
1952 | } |
1953 | |
1954 | struct mount * |
1955 | vfs_getvfs_with_vfsops(fsid_t *fsid, const struct vfsops * const ops) |
1956 | { |
1957 | mount_t mp = mount_list_lookupby_fsid(fsid, 0, 0); |
1958 | |
1959 | if (mp != NULL && mp->mnt_op != ops) { |
1960 | mp = NULL; |
1961 | } |
1962 | return mp; |
1963 | } |
1964 | |
1965 | struct mount * |
1966 | vfs_getvfs_by_mntonname(char *path) |
1967 | { |
1968 | mount_t retmp = (mount_t)0; |
1969 | mount_t mp; |
1970 | |
1971 | mount_list_lock(); |
1972 | TAILQ_FOREACH(mp, &mountlist, mnt_list) { |
if (!strncmp(mp->mnt_vfsstat.f_mntonname, path,
sizeof(mp->mnt_vfsstat.f_mntonname))) {
retmp = mp;
if (mount_iterref(retmp, 1)) {
1977 | retmp = NULL; |
1978 | } |
1979 | goto out; |
1980 | } |
1981 | } |
1982 | out: |
1983 | mount_list_unlock(); |
1984 | return retmp; |
1985 | } |
1986 | |
1987 | /* generation number for creation of new fsids */ |
1988 | u_short mntid_gen = 0; |
1989 | /* |
1990 | * Get a new unique fsid |
1991 | */ |
1992 | void |
1993 | vfs_getnewfsid(struct mount *mp) |
1994 | { |
1995 | fsid_t tfsid; |
1996 | int mtype; |
1997 | |
1998 | mount_list_lock(); |
1999 | |
2000 | /* generate a new fsid */ |
2001 | mtype = mp->mnt_vtable->vfc_typenum; |
2002 | if (++mntid_gen == 0) { |
2003 | mntid_gen++; |
2004 | } |
2005 | tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen); |
2006 | tfsid.val[1] = mtype; |
2007 | |
2008 | while (vfs_getvfs_locked(fsid: &tfsid)) { |
2009 | if (++mntid_gen == 0) { |
2010 | mntid_gen++; |
2011 | } |
2012 | tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen); |
2013 | } |
2014 | |
2015 | mp->mnt_vfsstat.f_fsid.val[0] = tfsid.val[0]; |
2016 | mp->mnt_vfsstat.f_fsid.val[1] = tfsid.val[1]; |
2017 | mount_list_unlock(); |
2018 | } |
2019 | |
2020 | /* |
2021 | * Routines having to do with the management of the vnode table. |
2022 | */ |
2023 | extern int(**dead_vnodeop_p)(void *); |
2024 | long numvnodes, freevnodes, deadvnodes, async_work_vnodes; |
2025 | long busyvnodes = 0; |
2026 | long deadvnodes_noreuse = 0; |
2027 | int32_t freeablevnodes = 0; |
2028 | uint64_t allocedvnodes = 0; |
2029 | uint64_t deallocedvnodes = 0; |
2030 | |
2031 | |
2032 | int async_work_timed_out = 0; |
2033 | int async_work_handled = 0; |
2034 | int dead_vnode_wanted = 0; |
2035 | int dead_vnode_waited = 0; |
2036 | |
2037 | /* |
2038 | * Move a vnode from one mount queue to another. |
2039 | */ |
2040 | static void |
2041 | insmntque(vnode_t vp, mount_t mp) |
2042 | { |
2043 | mount_t lmp; |
2044 | /* |
2045 | * Delete from old mount point vnode list, if on one. |
2046 | */ |
if ((lmp = vp->v_mount) != NULL && lmp != dead_mountp) {
if ((vp->v_lflag & VNAMED_MOUNT) == 0) {
panic("insmntque: vp not in mount vnode list");
}
vp->v_lflag &= ~VNAMED_MOUNT;

mount_lock_spin(lmp);

mount_drop(lmp, 1);
2056 | |
2057 | if (vp->v_mntvnodes.tqe_next == NULL) { |
2058 | if (TAILQ_LAST(&lmp->mnt_vnodelist, vnodelst) == vp) { |
2059 | TAILQ_REMOVE(&lmp->mnt_vnodelist, vp, v_mntvnodes); |
2060 | } else if (TAILQ_LAST(&lmp->mnt_newvnodes, vnodelst) == vp) { |
2061 | TAILQ_REMOVE(&lmp->mnt_newvnodes, vp, v_mntvnodes); |
2062 | } else if (TAILQ_LAST(&lmp->mnt_workerqueue, vnodelst) == vp) { |
2063 | TAILQ_REMOVE(&lmp->mnt_workerqueue, vp, v_mntvnodes); |
2064 | } |
2065 | } else { |
2066 | vp->v_mntvnodes.tqe_next->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_prev; |
2067 | *vp->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_next; |
2068 | } |
2069 | vp->v_mntvnodes.tqe_next = NULL; |
2070 | vp->v_mntvnodes.tqe_prev = NULL; |
mount_unlock(lmp);
2072 | vnode_drop(vp); |
2073 | return; |
2074 | } |
2075 | |
2076 | /* |
2077 | * Insert into list of vnodes for the new mount point, if available. |
2078 | */ |
2079 | if ((vp->v_mount = mp) != NULL) { |
mount_lock_spin(mp);
if ((vp->v_mntvnodes.tqe_next != 0) && (vp->v_mntvnodes.tqe_prev != 0)) {
panic("vp already in mount list");
}
if (mp->mnt_lflag & MNT_LITER) {
TAILQ_INSERT_HEAD(&mp->mnt_newvnodes, vp, v_mntvnodes);
} else {
TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
}
if (vp->v_lflag & VNAMED_MOUNT) {
panic("insmntque: vp already in mount vnode list");
}
vnode_hold(vp);
vp->v_lflag |= VNAMED_MOUNT;
mount_ref(mp, 1);
2095 | mount_unlock(mp); |
2096 | } |
2097 | } |
2098 | |
2099 | |
2100 | /* |
2101 | * Create a vnode for a block device. |
2102 | * Used for root filesystem, argdev, and swap areas. |
2103 | * Also used for memory file system special devices. |
2104 | */ |
2105 | int |
2106 | bdevvp(dev_t dev, vnode_t *vpp) |
2107 | { |
2108 | vnode_t nvp; |
2109 | int error; |
2110 | struct vnode_fsparam vfsp; |
2111 | struct vfs_context context; |
2112 | |
2113 | if (dev == NODEV) { |
2114 | *vpp = NULLVP; |
2115 | return ENODEV; |
2116 | } |
2117 | |
2118 | context.vc_thread = current_thread(); |
2119 | context.vc_ucred = FSCRED; |
2120 | |
2121 | vfsp.vnfs_mp = (struct mount *)0; |
2122 | vfsp.vnfs_vtype = VBLK; |
2123 | vfsp.vnfs_str = "bdevvp" ; |
2124 | vfsp.vnfs_dvp = NULL; |
2125 | vfsp.vnfs_fsnode = NULL; |
2126 | vfsp.vnfs_cnp = NULL; |
2127 | vfsp.vnfs_vops = spec_vnodeop_p; |
2128 | vfsp.vnfs_rdev = dev; |
2129 | vfsp.vnfs_filesize = 0; |
2130 | |
2131 | vfsp.vnfs_flags = VNFS_NOCACHE | VNFS_CANTCACHE; |
2132 | |
2133 | vfsp.vnfs_marksystem = 0; |
2134 | vfsp.vnfs_markroot = 0; |
2135 | |
if ((error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &nvp))) {
2137 | *vpp = NULLVP; |
2138 | return error; |
2139 | } |
2140 | vnode_lock_spin(nvp); |
2141 | nvp->v_flag |= VBDEVVP; |
2142 | nvp->v_tag = VT_NON; /* set this to VT_NON so during aliasing it can be replaced */ |
2143 | vnode_unlock(nvp); |
if ((error = vnode_ref(nvp))) {
panic("bdevvp failed: vnode_ref");
return error;
}
if ((error = VNOP_FSYNC(nvp, MNT_WAIT, &context))) {
panic("bdevvp failed: fsync");
return error;
}
if ((error = buf_invalidateblks(nvp, BUF_WRITE_DATA, 0, 0))) {
panic("bdevvp failed: invalidateblks");
2154 | return error; |
2155 | } |
2156 | |
2157 | #if CONFIG_MACF |
2158 | /* |
2159 | * XXXMAC: We can't put a MAC check here, the system will |
2160 | * panic without this vnode. |
2161 | */ |
2162 | #endif /* MAC */ |
2163 | |
2164 | if ((error = VNOP_OPEN(nvp, FREAD, &context))) { |
2165 | panic("bdevvp failed: open" ); |
2166 | return error; |
2167 | } |
2168 | *vpp = nvp; |
2169 | |
2170 | return 0; |
2171 | } |
2172 | |
2173 | /* |
2174 | * Check to see if the new vnode represents a special device |
2175 | * for which we already have a vnode (either because of |
2176 | * bdevvp() or because of a different vnode representing |
2177 | * the same block device). If such an alias exists, deallocate |
2178 | * the existing contents and return the aliased vnode. The |
2179 | * caller is responsible for filling it with its new contents. |
2180 | */ |
2181 | static vnode_t |
2182 | checkalias(struct vnode *nvp, dev_t nvp_rdev) |
2183 | { |
2184 | struct vnode *vp; |
2185 | struct vnode **vpp; |
2186 | struct specinfo *sin = NULL; |
2187 | int vid = 0; |
2188 | |
2189 | vpp = &speclisth[SPECHASH(nvp_rdev)]; |
2190 | loop: |
2191 | SPECHASH_LOCK(); |
2192 | |
2193 | for (vp = *vpp; vp; vp = vp->v_specnext) { |
2194 | if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) { |
2195 | vid = vp->v_id; |
2196 | vnode_hold(vp); |
2197 | break; |
2198 | } |
2199 | } |
2200 | SPECHASH_UNLOCK(); |
2201 | |
2202 | if (vp) { |
2203 | found_alias: |
2204 | if (vnode_getwithvid(vp, vid)) { |
2205 | vnode_drop(vp); |
2206 | goto loop; |
2207 | } |
2208 | vnode_drop(vp); |
2209 | /* |
2210 | * Termination state is checked in vnode_getwithvid |
2211 | */ |
2212 | vnode_lock(vp); |
2213 | |
2214 | /* |
2215 | * Alias, but not in use, so flush it out. |
2216 | */ |
2217 | if ((vp->v_iocount == 1) && (vp->v_usecount == 0)) { |
2218 | vnode_hold(vp); |
2219 | vnode_reclaim_internal(vp, 1, 1, 0); |
2220 | vnode_put_locked(vp); |
2221 | vnode_drop_and_unlock(vp); |
2222 | goto loop; |
2223 | } |
2224 | } |
2225 | if (vp == NULL || vp->v_tag != VT_NON) { |
2226 | if (sin == NULL) { |
2227 | sin = zalloc_flags(specinfo_zone, Z_WAITOK | Z_ZERO); |
2228 | } else { |
bzero(sin, sizeof(struct specinfo));
2230 | } |
2231 | |
2232 | nvp->v_specinfo = sin; |
2233 | nvp->v_rdev = nvp_rdev; |
2234 | nvp->v_specflags = 0; |
2235 | nvp->v_speclastr = -1; |
2236 | nvp->v_specinfo->si_opencount = 0; |
2237 | nvp->v_specinfo->si_initted = 0; |
2238 | nvp->v_specinfo->si_throttleable = 0; |
2239 | nvp->v_specinfo->si_devbsdunit = LOWPRI_MAX_NUM_DEV; |
2240 | |
2241 | SPECHASH_LOCK(); |
2242 | |
2243 | /* We dropped the lock, someone could have added */ |
2244 | if (vp == NULLVP) { |
2245 | for (vp = *vpp; vp; vp = vp->v_specnext) { |
2246 | if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) { |
2247 | vid = vp->v_id; |
2248 | vnode_hold(vp); |
2249 | SPECHASH_UNLOCK(); |
2250 | goto found_alias; |
2251 | } |
2252 | } |
2253 | } |
2254 | |
2255 | nvp->v_hashchain = vpp; |
2256 | nvp->v_specnext = *vpp; |
2257 | *vpp = nvp; |
2258 | |
2259 | if (vp != NULLVP) { |
2260 | nvp->v_specflags |= SI_ALIASED; |
2261 | vp->v_specflags |= SI_ALIASED; |
2262 | SPECHASH_UNLOCK(); |
2263 | vnode_put_locked(vp); |
2264 | vnode_unlock(vp); |
2265 | } else { |
2266 | SPECHASH_UNLOCK(); |
2267 | } |
2268 | |
2269 | return NULLVP; |
2270 | } |
2271 | |
2272 | if (sin) { |
2273 | zfree(specinfo_zone, sin); |
2274 | } |
2275 | |
2276 | if ((vp->v_flag & (VBDEVVP | VDEVFLUSH)) != 0) { |
2277 | return vp; |
2278 | } |
2279 | |
2280 | panic("checkalias with VT_NON vp that shouldn't: %p" , vp); |
2281 | |
2282 | return vp; |
2283 | } |
2284 | |
2285 | |
2286 | /* |
2287 | * Get a reference on a particular vnode and lock it if requested. |
2288 | * If the vnode was on the inactive list, remove it from the list. |
2289 | * If the vnode was on the free list, remove it from the list and |
2290 | * move it to inactive list as needed. |
2291 | * The vnode lock bit is set if the vnode is being eliminated in |
2292 | * vgone. The process is awakened when the transition is completed, |
2293 | * and an error returned to indicate that the vnode is no longer |
2294 | * usable (possibly having been changed to a new file system type). |
2295 | */ |
2296 | int |
2297 | vget_internal(vnode_t vp, int vid, int vflags) |
2298 | { |
2299 | int error = 0; |
2300 | |
2301 | vnode_lock_spin(vp); |
2302 | |
2303 | if ((vflags & VNODE_WRITEABLE) && (vp->v_writecount == 0)) { |
2304 | /* |
2305 | * vnode to be returned only if it has writers opened |
2306 | */ |
2307 | error = EINVAL; |
2308 | } else { |
2309 | error = vnode_getiocount(vp, vid, vflags); |
2310 | } |
2311 | |
2312 | vnode_unlock(vp); |
2313 | |
2314 | return error; |
2315 | } |
2316 | |
2317 | /* |
2318 | * Returns: 0 Success |
2319 | * ENOENT No such file or directory [terminating] |
2320 | */ |
2321 | int |
2322 | vnode_ref(vnode_t vp) |
2323 | { |
2324 | return vnode_ref_ext(vp, 0, 0); |
2325 | } |
2326 | |
2327 | /* |
2328 | * Returns: 0 Success |
2329 | * ENOENT No such file or directory [terminating] |
2330 | */ |
2331 | int |
2332 | vnode_ref_ext(vnode_t vp, int fmode, int flags) |
2333 | { |
2334 | int error = 0; |
2335 | |
2336 | vnode_lock_spin(vp); |
2337 | |
2338 | /* |
2339 | * once all the current call sites have been fixed to insure they have |
2340 | * taken an iocount, we can toughen this assert up and insist that the |
2341 | * iocount is non-zero... a non-zero usecount doesn't insure correctness |
2342 | */ |
2343 | if (vp->v_iocount <= 0 && vp->v_usecount <= 0) { |
2344 | panic("vnode_ref_ext: vp %p has no valid reference %d, %d" , vp, vp->v_iocount, vp->v_usecount); |
2345 | } |
2346 | |
2347 | /* |
2348 | * if you are the owner of drain/termination, can acquire usecount |
2349 | */ |
2350 | if ((flags & VNODE_REF_FORCE) == 0) { |
2351 | if ((vp->v_lflag & (VL_DRAIN | VL_TERMINATE | VL_DEAD))) { |
2352 | if (vp->v_owner != current_thread()) { |
2353 | error = ENOENT; |
2354 | goto out; |
2355 | } |
2356 | } |
2357 | } |
2358 | |
2359 | /* Enable atomic ops on v_usecount without the vnode lock */ |
2360 | os_atomic_inc(&vp->v_usecount, relaxed); |
2361 | |
2362 | if (fmode & FWRITE) { |
if (++vp->v_writecount <= 0) {
panic("vnode_ref_ext: v_writecount");
}
}
if (fmode & O_EVTONLY) {
if (++vp->v_kusecount <= 0) {
panic("vnode_ref_ext: v_kusecount");
2370 | } |
2371 | } |
2372 | if (vp->v_flag & VRAGE) { |
2373 | struct uthread *ut; |
2374 | |
2375 | ut = current_uthread(); |
2376 | |
2377 | if (!(current_proc()->p_lflag & P_LRAGE_VNODES) && |
2378 | !(ut->uu_flag & UT_RAGE_VNODES)) { |
2379 | /* |
2380 | * a 'normal' process accessed this vnode |
2381 | * so make sure its no longer marked |
2382 | * for rapid aging... also, make sure |
2383 | * it gets removed from the rage list... |
2384 | * when v_usecount drops back to 0, it |
2385 | * will be put back on the real free list |
2386 | */ |
2387 | vp->v_flag &= ~VRAGE; |
2388 | vp->v_references = 0; |
2389 | vnode_list_remove(vp); |
2390 | } |
2391 | } |
2392 | if (vp->v_usecount == 1 && vp->v_type == VREG && !(vp->v_flag & VSYSTEM)) { |
2393 | if (vp->v_ubcinfo) { |
2394 | vnode_lock_convert(vp); |
memory_object_mark_used(vp->v_ubcinfo->ui_control);
2396 | } |
2397 | } |
2398 | out: |
2399 | vnode_unlock(vp); |
2400 | |
2401 | return error; |
2402 | } |
2403 | |
2404 | |
2405 | boolean_t |
2406 | vnode_on_reliable_media(vnode_t vp) |
2407 | { |
2408 | mount_t mp = vp->v_mount; |
2409 | |
2410 | /* |
2411 | * A NULL mountpoint would imply it's not attached to a any filesystem. |
2412 | * This can only happen with a vnode created by bdevvp(). We'll consider |
2413 | * those as not unreliable as the primary use of this function is determine |
2414 | * which vnodes are to be handed off to the async cleaner thread for |
2415 | * reclaim. |
2416 | */ |
2417 | if (!mp || (!(mp->mnt_kern_flag & MNTK_VIRTUALDEV) && (mp->mnt_flag & MNT_LOCAL))) { |
2418 | return TRUE; |
2419 | } |
2420 | |
2421 | return FALSE; |
2422 | } |
2423 | |
2424 | static void |
2425 | vnode_async_list_add_locked(vnode_t vp) |
2426 | { |
2427 | if (VONLIST(vp) || (vp->v_lflag & (VL_TERMINATE | VL_DEAD))) { |
2428 | panic("vnode_async_list_add: %p is in wrong state" , vp); |
2429 | } |
2430 | |
2431 | TAILQ_INSERT_HEAD(&vnode_async_work_list, vp, v_freelist); |
2432 | vp->v_listflag |= VLIST_ASYNC_WORK; |
2433 | |
2434 | async_work_vnodes++; |
2435 | if (!(vp->v_listflag & VLIST_NO_REUSE)) { |
2436 | reusablevnodes++; |
2437 | } |
2438 | if (vp->v_flag & VCANDEALLOC) { |
2439 | os_atomic_dec(&busyvnodes, relaxed); |
2440 | } |
2441 | } |
2442 | |
2443 | static void |
2444 | vnode_async_list_add(vnode_t vp) |
2445 | { |
2446 | vnode_list_lock(); |
2447 | |
2448 | if (VONLIST(vp)) { |
2449 | if (!(vp->v_listflag & VLIST_ASYNC_WORK)) { |
2450 | vnode_list_remove_locked(vp); |
2451 | vnode_async_list_add_locked(vp); |
2452 | } |
2453 | } else { |
2454 | vnode_async_list_add_locked(vp); |
2455 | } |
2456 | |
2457 | vnode_list_unlock(); |
2458 | |
wakeup(&vnode_async_work_list);
2460 | } |
2461 | |
2462 | |
2463 | /* |
2464 | * put the vnode on appropriate free list. |
2465 | * called with vnode LOCKED |
2466 | */ |
2467 | static void |
2468 | vnode_list_add(vnode_t vp) |
2469 | { |
2470 | boolean_t need_dead_wakeup = FALSE; |
2471 | bool no_busy_decrement = false; |
2472 | |
2473 | #if DIAGNOSTIC |
2474 | lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED); |
2475 | #endif |
2476 | |
2477 | again: |
2478 | |
2479 | /* |
2480 | * if it is already on a list or non zero references return |
2481 | */ |
2482 | if (VONLIST(vp) || (vp->v_usecount != 0) || (vp->v_iocount != 0) || (vp->v_lflag & VL_TERMINATE)) { |
2483 | return; |
2484 | } |
2485 | |
2486 | /* |
2487 | * In vclean, we might have deferred ditching locked buffers |
2488 | * because something was still referencing them (indicated by |
2489 | * usecount). We can ditch them now. |
2490 | */ |
2491 | if (ISSET(vp->v_lflag, VL_DEAD) |
2492 | && (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd))) { |
2493 | ++vp->v_iocount; // Probably not necessary, but harmless |
2494 | #ifdef CONFIG_IOCOUNT_TRACE |
2495 | record_vp(vp, 1); |
2496 | #endif |
2497 | vnode_unlock(vp); |
buf_invalidateblks(vp, BUF_INVALIDATE_LOCKED, 0, 0);
2499 | vnode_lock(vp); |
2500 | vnode_dropiocount(vp); |
2501 | goto again; |
2502 | } |
2503 | |
2504 | vnode_list_lock(); |
2505 | |
2506 | if (!(vp->v_lflag & VL_DEAD) && (vp->v_listflag & VLIST_NO_REUSE)) { |
2507 | if (!(vp->v_listflag & VLIST_ASYNC_WORK)) { |
2508 | vnode_async_list_add_locked(vp); |
2509 | } |
2510 | no_busy_decrement = true; |
2511 | } else if ((vp->v_flag & VRAGE) && !(vp->v_lflag & VL_DEAD)) { |
2512 | /* |
2513 | * add the new guy to the appropriate end of the RAGE list |
2514 | */ |
2515 | if ((vp->v_flag & VAGE)) { |
2516 | TAILQ_INSERT_HEAD(&vnode_rage_list, vp, v_freelist); |
2517 | } else { |
2518 | TAILQ_INSERT_TAIL(&vnode_rage_list, vp, v_freelist); |
2519 | } |
2520 | |
2521 | vp->v_listflag |= VLIST_RAGE; |
2522 | ragevnodes++; |
2523 | reusablevnodes++; |
2524 | wakeup_laundry_thread(); |
2525 | |
2526 | /* |
2527 | * reset the timestamp for the last inserted vp on the RAGE |
2528 | * queue to let new_vnode know that its not ok to start stealing |
2529 | * from this list... as long as we're actively adding to this list |
2530 | * we'll push out the vnodes we want to donate to the real free list |
2531 | * once we stop pushing, we'll let some time elapse before we start |
2532 | * stealing them in the new_vnode routine |
2533 | */ |
2534 | microuptime(tv: &rage_tv); |
2535 | } else { |
2536 | /* |
2537 | * if VL_DEAD, insert it at head of the dead list |
2538 | * else insert at tail of LRU list or at head if VAGE is set |
2539 | */ |
2540 | if ((vp->v_lflag & VL_DEAD)) { |
2541 | if (vp->v_flag & VCANDEALLOC) { |
2542 | TAILQ_INSERT_TAIL(&vnode_dead_list, vp, v_freelist); |
2543 | if (vp->v_listflag & VLIST_NO_REUSE) { |
2544 | deadvnodes_noreuse++; |
2545 | } |
2546 | } else { |
2547 | TAILQ_INSERT_HEAD(&vnode_dead_list, vp, v_freelist); |
2548 | } |
2549 | vp->v_listflag |= VLIST_DEAD; |
2550 | deadvnodes++; |
2551 | |
2552 | if (dead_vnode_wanted) { |
2553 | dead_vnode_wanted--; |
2554 | need_dead_wakeup = TRUE; |
2555 | } |
2556 | } else if ((vp->v_flag & VAGE)) { |
2557 | TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); |
2558 | vp->v_flag &= ~VAGE; |
2559 | freevnodes++; |
2560 | reusablevnodes++; |
2561 | wakeup_laundry_thread(); |
2562 | } else { |
2563 | TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); |
2564 | freevnodes++; |
2565 | reusablevnodes++; |
2566 | wakeup_laundry_thread(); |
2567 | } |
2568 | } |
2569 | if ((vp->v_flag & VCANDEALLOC) && !no_busy_decrement) { |
2570 | os_atomic_dec(&busyvnodes, relaxed); |
2571 | } |
2572 | vnode_list_unlock(); |
2573 | |
2574 | if (need_dead_wakeup == TRUE) { |
wakeup_one((caddr_t)&dead_vnode_wanted);
2576 | } |
2577 | } |
2578 | |
2579 | |
2580 | /* |
2581 | * remove the vnode from appropriate free list. |
2582 | * called with vnode LOCKED and |
2583 | * the list lock held |
2584 | */ |
2585 | static void |
2586 | vnode_list_remove_locked(vnode_t vp) |
2587 | { |
2588 | if (VONLIST(vp)) { |
2589 | /* |
2590 | * the v_listflag field is |
2591 | * protected by the vnode_list_lock |
2592 | */ |
if (vp->v_listflag & VLIST_RAGE) {
VREMRAGE("vnode_list_remove", vp);
} else if (vp->v_listflag & VLIST_DEAD) {
VREMDEAD("vnode_list_remove", vp);
wakeup_laundry_thread();
} else if (vp->v_listflag & VLIST_ASYNC_WORK) {
VREMASYNC_WORK("vnode_list_remove", vp);
} else {
VREMFREE("vnode_list_remove", vp);
2602 | } |
2603 | if (vp->v_flag & VCANDEALLOC) { |
2604 | os_atomic_inc(&busyvnodes, relaxed); |
2605 | } |
2606 | } |
2607 | } |
2608 | |
2609 | |
2610 | /* |
2611 | * remove the vnode from appropriate free list. |
2612 | * called with vnode LOCKED |
2613 | */ |
2614 | static void |
2615 | vnode_list_remove(vnode_t vp) |
2616 | { |
2617 | #if DIAGNOSTIC |
2618 | lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED); |
2619 | #endif |
2620 | /* |
2621 | * we want to avoid taking the list lock |
2622 | * in the case where we're not on the free |
2623 | * list... this will be true for most |
2624 | * directories and any currently in use files |
2625 | * |
2626 | * we're guaranteed that we can't go from |
2627 | * the not-on-list state to the on-list |
2628 | * state since we hold the vnode lock... |
2629 | * all calls to vnode_list_add are done |
2630 | * under the vnode lock... so we can |
* check for that condition (the prevalent one)
2632 | * without taking the list lock |
2633 | */ |
2634 | if (VONLIST(vp)) { |
2635 | vnode_list_lock(); |
2636 | /* |
2637 | * however, we're not guaranteed that |
2638 | * we won't go from the on-list state |
2639 | * to the not-on-list state until we |
2640 | * hold the vnode_list_lock... this |
2641 | * is due to "new_vnode" removing vnodes |
* from the free list under the list_lock
2643 | * w/o the vnode lock... so we need to |
2644 | * check again whether we're currently |
2645 | * on the free list |
2646 | */ |
2647 | vnode_list_remove_locked(vp); |
2648 | |
2649 | vnode_list_unlock(); |
2650 | } |
2651 | } |
2652 | |
2653 | |
2654 | void |
2655 | vnode_rele(vnode_t vp) |
2656 | { |
2657 | vnode_rele_internal(vp, 0, 0, 0); |
2658 | } |
2659 | |
2660 | |
2661 | void |
2662 | vnode_rele_ext(vnode_t vp, int fmode, int dont_reenter) |
2663 | { |
2664 | vnode_rele_internal(vp, fmode, dont_reenter, 0); |
2665 | } |
2666 | |
2667 | |
2668 | void |
2669 | vnode_rele_internal(vnode_t vp, int fmode, int dont_reenter, int locked) |
2670 | { |
2671 | int32_t old_usecount; |
2672 | |
2673 | if (!locked) { |
2674 | vnode_hold(vp); |
2675 | vnode_lock_spin(vp); |
2676 | } |
2677 | #if DIAGNOSTIC |
2678 | else { |
2679 | lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED); |
2680 | } |
2681 | #endif |
2682 | /* Enable atomic ops on v_usecount without the vnode lock */ |
2683 | old_usecount = os_atomic_dec_orig(&vp->v_usecount, relaxed); |
2684 | if (old_usecount < 1) { |
2685 | /* |
2686 | * Because we allow atomic ops on usecount (in lookup only, under |
2687 | * specific conditions of already having a usecount) it is |
2688 | * possible that when the vnode is examined, its usecount is |
2689 | * different than what will be printed in this panic message. |
2690 | */ |
2691 | panic("vnode_rele_ext: vp %p usecount -ve : %d. v_tag = %d, v_type = %d, v_flag = %x." , |
2692 | vp, old_usecount - 1, vp->v_tag, vp->v_type, vp->v_flag); |
2693 | } |
2694 | |
2695 | if (fmode & FWRITE) { |
2696 | if (--vp->v_writecount < 0) { |
2697 | panic("vnode_rele_ext: vp %p writecount -ve : %d. v_tag = %d, v_type = %d, v_flag = %x." , vp, vp->v_writecount, vp->v_tag, vp->v_type, vp->v_flag); |
2698 | } |
2699 | } |
2700 | if (fmode & O_EVTONLY) { |
2701 | if (--vp->v_kusecount < 0) { |
2702 | panic("vnode_rele_ext: vp %p kusecount -ve : %d. v_tag = %d, v_type = %d, v_flag = %x." , vp, vp->v_kusecount, vp->v_tag, vp->v_type, vp->v_flag); |
2703 | } |
2704 | } |
2705 | if (vp->v_kusecount > vp->v_usecount) { |
2706 | panic("vnode_rele_ext: vp %p kusecount(%d) out of balance with usecount(%d). v_tag = %d, v_type = %d, v_flag = %x." , vp, vp->v_kusecount, vp->v_usecount, vp->v_tag, vp->v_type, vp->v_flag); |
2707 | } |
2708 | |
2709 | if ((vp->v_iocount > 0) || (vp->v_usecount > 0)) { |
2710 | /* |
2711 | * vnode is still busy... if we're the last |
2712 | * usecount, mark for a future call to VNOP_INACTIVE |
2713 | * when the iocount finally drops to 0 |
2714 | */ |
2715 | if (vp->v_usecount == 0) { |
2716 | vp->v_lflag |= VL_NEEDINACTIVE; |
2717 | vp->v_flag &= ~(VNOCACHE_DATA | VRAOFF | VOPENEVT); |
2718 | } |
2719 | goto done; |
2720 | } |
2721 | vp->v_flag &= ~(VNOCACHE_DATA | VRAOFF | VOPENEVT); |
2722 | |
2723 | if (ISSET(vp->v_lflag, VL_TERMINATE | VL_DEAD) || dont_reenter) { |
2724 | /* |
2725 | * vnode is being cleaned, or |
2726 | * we've requested that we don't reenter |
2727 | * the filesystem on this release...in |
2728 | * the latter case, we'll mark the vnode aged |
2729 | */ |
2730 | if (dont_reenter) { |
2731 | if (!(vp->v_lflag & (VL_TERMINATE | VL_DEAD | VL_MARKTERM))) { |
2732 | vp->v_lflag |= VL_NEEDINACTIVE; |
2733 | |
2734 | if (vnode_on_reliable_media(vp) == FALSE || vp->v_flag & VISDIRTY) { |
2735 | vnode_async_list_add(vp); |
2736 | goto done; |
2737 | } |
2738 | } |
2739 | vp->v_flag |= VAGE; |
2740 | } |
2741 | vnode_list_add(vp); |
2742 | |
2743 | goto done; |
2744 | } |
2745 | /* |
2746 | * at this point both the iocount and usecount |
2747 | * are zero |
2748 | * pick up an iocount so that we can call |
2749 | * VNOP_INACTIVE with the vnode lock unheld |
2750 | */ |
2751 | vp->v_iocount++; |
2752 | #ifdef CONFIG_IOCOUNT_TRACE |
2753 | record_vp(vp, 1); |
2754 | #endif |
2755 | vp->v_lflag &= ~VL_NEEDINACTIVE; |
2756 | |
2757 | if (UBCINFOEXISTS(vp)) { |
2758 | ubc_cs_free_and_vnode_unlock(vp); |
2759 | } else { |
2760 | vnode_unlock(vp); |
2761 | } |
2762 | |
2763 | VNOP_INACTIVE(vp, vfs_context_current()); |
2764 | |
2765 | vnode_lock_spin(vp); |
2766 | |
2767 | /* |
2768 | * because we dropped the vnode lock to call VNOP_INACTIVE |
2769 | * the state of the vnode may have changed... we may have |
2770 | * picked up an iocount, usecount or the MARKTERM may have |
2771 | * been set... we need to reevaluate the reference counts |
2772 | * to determine if we can call vnode_reclaim_internal at |
2773 | * this point... if the reference counts are up, we'll pick |
2774 | * up the MARKTERM state when they get subsequently dropped |
2775 | */ |
2776 | if ((vp->v_iocount == 1) && (vp->v_usecount == 0) && |
2777 | ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM)) { |
2778 | struct uthread *ut; |
2779 | |
2780 | ut = current_uthread(); |
2781 | |
2782 | if (ut->uu_defer_reclaims) { |
2783 | vp->v_defer_reclaimlist = ut->uu_vreclaims; |
2784 | ut->uu_vreclaims = vp; |
2785 | goto done; |
2786 | } |
2787 | vnode_lock_convert(vp); |
2788 | vnode_reclaim_internal(vp, 1, 1, 0); |
2789 | } |
2790 | vnode_dropiocount(vp); |
2791 | vnode_list_add(vp); |
2792 | done: |
2793 | if (vp->v_usecount == 0 && vp->v_type == VREG && !(vp->v_flag & VSYSTEM)) { |
2794 | if (vp->v_ubcinfo) { |
2795 | vnode_lock_convert(vp); |
memory_object_mark_unused(vp->v_ubcinfo->ui_control, (vp->v_flag & VRAGE) == VRAGE);
2797 | } |
2798 | } |
2799 | if (!locked) { |
2800 | vnode_drop_and_unlock(vp); |
2801 | } |
2802 | return; |
2803 | } |
2804 | |
2805 | /* |
2806 | * Remove any vnodes in the vnode table belonging to mount point mp. |
2807 | * |
2808 | * If MNT_NOFORCE is specified, there should not be any active ones, |
2809 | * return error if any are found (nb: this is a user error, not a |
2810 | * system error). If MNT_FORCE is specified, detach any active vnodes |
2811 | * that are found. |
2812 | */ |
2813 | |
2814 | int |
2815 | vflush(struct mount *mp, struct vnode *skipvp, int flags) |
2816 | { |
2817 | struct vnode *vp; |
2818 | int busy = 0; |
2819 | int reclaimed = 0; |
2820 | int retval; |
2821 | unsigned int vid; |
2822 | bool first_try = true; |
2823 | |
2824 | /* |
2825 | * See comments in vnode_iterate() for the rationale for this lock |
2826 | */ |
2827 | mount_iterate_lock(mp); |
2828 | |
2829 | mount_lock(mp); |
2830 | vnode_iterate_setup(mp); |
2831 | /* |
2832 | * On regular unmounts(not forced) do a |
2833 | * quick check for vnodes to be in use. This |
2834 | * preserves the caching of vnodes. automounter |
2835 | * tries unmounting every so often to see whether |
2836 | * it is still busy or not. |
2837 | */ |
2838 | if (((flags & FORCECLOSE) == 0) && ((mp->mnt_kern_flag & MNTK_UNMOUNT_PREFLIGHT) != 0)) { |
2839 | if (vnode_umount_preflight(mp, skipvp, flags)) { |
2840 | vnode_iterate_clear(mp); |
2841 | mount_unlock(mp); |
2842 | mount_iterate_unlock(mp); |
2843 | return EBUSY; |
2844 | } |
2845 | } |
2846 | loop: |
2847 | /* If it returns 0 then there is nothing to do */ |
2848 | retval = vnode_iterate_prepare(mp); |
2849 | |
2850 | if (retval == 0) { |
2851 | vnode_iterate_clear(mp); |
2852 | mount_unlock(mp); |
2853 | mount_iterate_unlock(mp); |
2854 | return retval; |
2855 | } |
2856 | |
2857 | /* iterate over all the vnodes */ |
2858 | while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) { |
2859 | vp = TAILQ_FIRST(&mp->mnt_workerqueue); |
2860 | TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes); |
2861 | TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes); |
2862 | |
2863 | if ((vp->v_mount != mp) || (vp == skipvp)) { |
2864 | continue; |
2865 | } |
2866 | vid = vp->v_id; |
2867 | mount_unlock(mp); |
2868 | |
2869 | vnode_lock_spin(vp); |
2870 | |
2871 | // If vnode is already terminating, wait for it... |
2872 | while (vp->v_id == vid && ISSET(vp->v_lflag, VL_TERMINATE)) { |
2873 | vp->v_lflag |= VL_TERMWANT; |
msleep(&vp->v_lflag, &vp->v_lock, PVFS, "vflush", NULL);
2875 | } |
2876 | |
2877 | if ((vp->v_id != vid) || ISSET(vp->v_lflag, VL_DEAD)) { |
2878 | vnode_unlock(vp); |
2879 | mount_lock(mp); |
2880 | continue; |
2881 | } |
2882 | |
2883 | /* |
2884 | * If requested, skip over vnodes marked VSYSTEM. |
2885 | * Skip over all vnodes marked VNOFLUSH. |
2886 | */ |
2887 | if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) || |
2888 | (vp->v_flag & VNOFLUSH))) { |
2889 | vnode_unlock(vp); |
2890 | mount_lock(mp); |
2891 | continue; |
2892 | } |
2893 | /* |
2894 | * If requested, skip over vnodes marked VSWAP. |
2895 | */ |
2896 | if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) { |
2897 | vnode_unlock(vp); |
2898 | mount_lock(mp); |
2899 | continue; |
2900 | } |
2901 | /* |
2902 | * If requested, skip over vnodes marked VROOT. |
2903 | */ |
2904 | if ((flags & SKIPROOT) && (vp->v_flag & VROOT)) { |
2905 | vnode_unlock(vp); |
2906 | mount_lock(mp); |
2907 | continue; |
2908 | } |
2909 | /* |
2910 | * If WRITECLOSE is set, only flush out regular file |
2911 | * vnodes open for writing. |
2912 | */ |
2913 | if ((flags & WRITECLOSE) && |
2914 | (vp->v_writecount == 0 || vp->v_type != VREG)) { |
2915 | vnode_unlock(vp); |
2916 | mount_lock(mp); |
2917 | continue; |
2918 | } |
2919 | /* |
2920 | * If the real usecount is 0, all we need to do is clear |
2921 | * out the vnode data structures and we are done. |
2922 | */ |
2923 | if (((vp->v_usecount == 0) || |
2924 | ((vp->v_usecount - vp->v_kusecount) == 0))) { |
2925 | vnode_lock_convert(vp); |
2926 | vnode_hold(vp); |
vp->v_iocount++; /* so that drain waits for other iocounts */
2928 | #ifdef CONFIG_IOCOUNT_TRACE |
2929 | record_vp(vp, 1); |
2930 | #endif |
2931 | vnode_reclaim_internal(vp, 1, 1, 0); |
2932 | vnode_dropiocount(vp); |
2933 | vnode_list_add(vp); |
2934 | vnode_drop_and_unlock(vp); |
2935 | |
2936 | reclaimed++; |
2937 | mount_lock(mp); |
2938 | continue; |
2939 | } |
2940 | /* |
2941 | * If FORCECLOSE is set, forcibly close the vnode. |
2942 | * For block or character devices, revert to an |
2943 | * anonymous device. For all other files, just kill them. |
2944 | */ |
2945 | if (flags & FORCECLOSE) { |
2946 | vnode_lock_convert(vp); |
2947 | |
2948 | if (vp->v_type != VBLK && vp->v_type != VCHR) { |
vp->v_iocount++; /* so that drain waits for other iocounts */
2950 | vnode_hold(vp); |
2951 | #ifdef CONFIG_IOCOUNT_TRACE |
2952 | record_vp(vp, 1); |
2953 | #endif |
2954 | vnode_abort_advlocks(vp); |
2955 | vnode_reclaim_internal(vp, 1, 1, 0); |
2956 | vnode_dropiocount(vp); |
2957 | vnode_list_add(vp); |
2958 | vnode_drop_and_unlock(vp); |
2959 | } else { |
2960 | vnode_hold(vp); |
2961 | vp->v_lflag |= VL_OPSCHANGE; |
vclean(vp, 0);
vp->v_lflag &= ~VL_DEAD;
vp->v_op = spec_vnodeop_p;
vp->v_flag |= VDEVFLUSH;
vnode_drop_and_unlock(vp);
wakeup(&vp->v_lflag); /* chkvnlock is waiting for VL_DEAD to get unset */
2968 | } |
2969 | mount_lock(mp); |
2970 | continue; |
2971 | } |
2972 | |
2973 | /* log vnodes blocking unforced unmounts */ |
2974 | if (print_busy_vnodes && first_try && ((flags & FORCECLOSE) == 0)) { |
vprint("vflush - busy vnode", vp);
2976 | } |
2977 | |
2978 | vnode_unlock(vp); |
2979 | mount_lock(mp); |
2980 | busy++; |
2981 | } |
2982 | |
2983 | /* At this point the worker queue is completed */ |
2984 | if (busy && ((flags & FORCECLOSE) == 0) && reclaimed) { |
2985 | busy = 0; |
2986 | reclaimed = 0; |
2987 | (void)vnode_iterate_reloadq(mp); |
2988 | first_try = false; |
2989 | /* returned with mount lock held */ |
2990 | goto loop; |
2991 | } |
2992 | |
2993 | /* if new vnodes were created in between retry the reclaim */ |
2994 | if (vnode_iterate_reloadq(mp) != 0) { |
2995 | if (!(busy && ((flags & FORCECLOSE) == 0))) { |
2996 | first_try = false; |
2997 | goto loop; |
2998 | } |
2999 | } |
3000 | vnode_iterate_clear(mp); |
3001 | mount_unlock(mp); |
3002 | mount_iterate_unlock(mp); |
3003 | |
3004 | if (busy && ((flags & FORCECLOSE) == 0)) { |
3005 | return EBUSY; |
3006 | } |
3007 | return 0; |
3008 | } |
3009 | |
3010 | long num_recycledvnodes = 0; |
3011 | /* |
3012 | * Disassociate the underlying file system from a vnode. |
3013 | * The vnode lock is held on entry. |
3014 | */ |
3015 | static void |
3016 | vclean(vnode_t vp, int flags) |
3017 | { |
3018 | vfs_context_t ctx = vfs_context_current(); |
3019 | int active; |
3020 | int need_inactive; |
3021 | int already_terminating; |
3022 | int clflags = 0; |
3023 | #if NAMEDSTREAMS |
3024 | int is_namedstream; |
3025 | #endif |
3026 | |
3027 | /* |
3028 | * Check to see if the vnode is in use. |
3029 | * If so we have to reference it before we clean it out |
3030 | * so that its count cannot fall to zero and generate a |
3031 | * race against ourselves to recycle it. |
3032 | */ |
3033 | active = vp->v_usecount; |
3034 | |
3035 | /* |
3036 | * just in case we missed sending a needed |
3037 | * VNOP_INACTIVE, we'll do it now |
3038 | */ |
3039 | need_inactive = (vp->v_lflag & VL_NEEDINACTIVE); |
3040 | |
3041 | vp->v_lflag &= ~VL_NEEDINACTIVE; |
3042 | |
3043 | /* |
3044 | * Prevent the vnode from being recycled or |
3045 | * brought into use while we clean it out. |
3046 | */ |
3047 | already_terminating = (vp->v_lflag & VL_TERMINATE); |
3048 | |
3049 | vp->v_lflag |= VL_TERMINATE; |
3050 | |
3051 | #if NAMEDSTREAMS |
3052 | is_namedstream = vnode_isnamedstream(vp); |
3053 | #endif |
3054 | |
3055 | vnode_unlock(vp); |
3056 | |
3057 | OSAddAtomicLong(1, &num_recycledvnodes); |
3058 | |
3059 | if (flags & DOCLOSE) { |
3060 | clflags |= IO_NDELAY; |
3061 | } |
3062 | if (flags & REVOKEALL) { |
3063 | clflags |= IO_REVOKE; |
3064 | } |
3065 | |
3066 | #if CONFIG_MACF |
3067 | if (vp->v_mount) { |
3068 | /* |
3069 | * It is possible for bdevvp vnodes to not have a mount |
3070 | * pointer. It's fine to let it get reclaimed without |
3071 | * notifying. |
3072 | */ |
3073 | mac_vnode_notify_reclaim(vp); |
3074 | } |
3075 | #endif |
3076 | |
3077 | if (active && (flags & DOCLOSE)) { |
3078 | VNOP_CLOSE(vp, clflags, ctx); |
3079 | } |
3080 | |
3081 | /* |
3082 | * Clean out any buffers associated with the vnode. |
3083 | */ |
3084 | if (flags & DOCLOSE) { |
3085 | if (vp->v_tag == VT_NFS) { |
3086 | nfs_vinvalbuf(vp, V_SAVE, ctx, 0); |
3087 | } else { |
3088 | VNOP_FSYNC(vp, MNT_WAIT, ctx); |
3089 | |
3090 | /* |
3091 | * If the vnode is still in use (by the journal for |
3092 | * example) we don't want to invalidate locked buffers |
3093 | * here. In that case, either the journal will tidy them |
3094 | * up, or we will deal with it when the usecount is |
3095 | * finally released in vnode_rele_internal. |
3096 | */ |
buf_invalidateblks(vp, BUF_WRITE_DATA | (active ? 0 : BUF_INVALIDATE_LOCKED), 0, 0);
3098 | } |
3099 | if (UBCINFOEXISTS(vp)) { |
3100 | /* |
3101 | * Clean the pages in VM. |
3102 | */ |
3103 | (void)ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL, UBC_PUSHALL | UBC_INVALIDATE | UBC_SYNC); |
3104 | } |
3105 | } |
3106 | if (active || need_inactive) { |
3107 | VNOP_INACTIVE(vp, ctx); |
3108 | } |
3109 | |
3110 | #if NAMEDSTREAMS |
3111 | if ((is_namedstream != 0) && (vp->v_parent != NULLVP)) { |
3112 | vnode_t pvp = vp->v_parent; |
3113 | |
3114 | /* Delete the shadow stream file before we reclaim its vnode */ |
3115 | if (vnode_isshadow(vp)) { |
vnode_relenamedstream(pvp, vp);
3117 | } |
3118 | |
3119 | /* |
3120 | * No more streams associated with the parent. We |
3121 | * have a ref on it, so its identity is stable. |
3122 | * If the parent is on an opaque volume, then we need to know |
3123 | * whether it has associated named streams. |
3124 | */ |
if (vfs_authopaque(pvp->v_mount)) {
3126 | vnode_lock_spin(pvp); |
3127 | pvp->v_lflag &= ~VL_HASSTREAMS; |
3128 | vnode_unlock(pvp); |
3129 | } |
3130 | } |
3131 | #endif |
3132 | |
3133 | vm_object_destroy_reason_t reason = VM_OBJECT_DESTROY_UNKNOWN_REASON; |
3134 | bool forced_unmount = vnode_mount(vp) != NULL && (vnode_mount(vp)->mnt_lflag & MNT_LFORCE) != 0; |
3135 | bool ungraft_heuristic = flags & REVOKEALL; |
3136 | if (forced_unmount) { |
3137 | reason = VM_OBJECT_DESTROY_FORCED_UNMOUNT; |
3138 | } else if (ungraft_heuristic) { |
3139 | reason = VM_OBJECT_DESTROY_UNGRAFT; |
3140 | } |
3141 | |
3142 | /* |
3143 | * Destroy ubc named reference |
3144 | * cluster_release is done on this path |
3145 | * along with dropping the reference on the ucred |
3146 | * (and in the case of forced unmount of an mmap-ed file, |
3147 | * the ubc reference on the vnode is dropped here too). |
3148 | */ |
3149 | ubc_destroy_named(vp, reason); |
3150 | |
3151 | #if CONFIG_TRIGGERS |
3152 | /* |
3153 | * cleanup trigger info from vnode (if any) |
3154 | */ |
3155 | if (vp->v_resolve) { |
3156 | vnode_resolver_detach(vp); |
3157 | } |
3158 | #endif |
3159 | |
3160 | #if CONFIG_IO_COMPRESSION_STATS |
3161 | if ((vp->io_compression_stats)) { |
3162 | vnode_iocs_record_and_free(vp); |
3163 | } |
3164 | #endif /* CONFIG_IO_COMPRESSION_STATS */ |
3165 | |
3166 | /* |
3167 | * Reclaim the vnode. |
3168 | */ |
if (VNOP_RECLAIM(vp, ctx)) {
panic("vclean: cannot reclaim");
}

// make sure the name & parent ptrs get cleaned out!
vnode_update_identity(vp, NULLVP, NULL, 0, 0, VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME | VNODE_UPDATE_PURGE | VNODE_UPDATE_PURGEFIRMLINK);
3175 | |
3176 | vnode_lock(vp); |
3177 | |
3178 | /* |
3179 | * Remove the vnode from any mount list it might be on. It is not |
3180 | * safe to do this any earlier because unmount needs to wait for |
3181 | * any vnodes to terminate and it cannot do that if it cannot find |
3182 | * them. |
3183 | */ |
insmntque(vp, (struct mount *)0);
3185 | |
3186 | vp->v_lflag |= VL_DEAD; |
3187 | vp->v_mount = dead_mountp; |
3188 | vp->v_op = dead_vnodeop_p; |
3189 | vp->v_tag = VT_NON; |
3190 | vp->v_data = NULL; |
3191 | |
3192 | vp->v_flag &= ~VISDIRTY; |
3193 | |
3194 | if (already_terminating == 0) { |
3195 | vp->v_lflag &= ~VL_TERMINATE; |
3196 | /* |
3197 | * Done with purge, notify sleepers of the grim news. |
3198 | */ |
3199 | if (vp->v_lflag & VL_TERMWANT) { |
3200 | vp->v_lflag &= ~VL_TERMWANT; |
wakeup(&vp->v_lflag);
3202 | } |
3203 | } |
3204 | } |
3205 | |
3206 | /* |
3207 | * Eliminate all activity associated with the requested vnode |
3208 | * and with all vnodes aliased to the requested vnode. |
3209 | */ |
3210 | int |
3211 | #if DIAGNOSTIC |
3212 | vn_revoke(vnode_t vp, int flags, __unused vfs_context_t a_context) |
3213 | #else |
3214 | vn_revoke(vnode_t vp, __unused int flags, __unused vfs_context_t a_context) |
3215 | #endif |
3216 | { |
3217 | struct vnode *vq; |
3218 | int vid; |
3219 | |
3220 | #if DIAGNOSTIC |
3221 | if ((flags & REVOKEALL) == 0) { |
3222 | panic("vnop_revoke" ); |
3223 | } |
3224 | #endif |
3225 | |
3226 | if (vnode_isaliased(vp)) { |
3227 | /* |
3228 | * If a vgone (or vclean) is already in progress, |
3229 | * return an immediate error |
3230 | */ |
3231 | if (vp->v_lflag & VL_TERMINATE) { |
3232 | return ENOENT; |
3233 | } |
3234 | |
3235 | /* |
3236 | * Ensure that vp will not be vgone'd while we |
3237 | * are eliminating its aliases. |
3238 | */ |
3239 | SPECHASH_LOCK(); |
3240 | while ((vp->v_specflags & SI_ALIASED)) { |
3241 | for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { |
3242 | if (vq->v_rdev != vp->v_rdev || |
3243 | vq->v_type != vp->v_type || vp == vq) { |
3244 | continue; |
3245 | } |
3246 | vid = vq->v_id; |
vnode_hold(vq);
SPECHASH_UNLOCK();
if (vnode_getwithvid(vq, vid)) {
vq = vnode_drop(vq);
3251 | SPECHASH_LOCK(); |
3252 | break; |
3253 | } |
3254 | vnode_lock(vq); |
3255 | if (!(vq->v_lflag & VL_TERMINATE)) { |
3256 | vnode_reclaim_internal(vq, 1, 1, 0); |
3257 | } |
3258 | vnode_put_locked(vq); |
3259 | vq = vnode_drop_and_unlock(vq); |
3260 | SPECHASH_LOCK(); |
3261 | break; |
3262 | } |
3263 | } |
3264 | SPECHASH_UNLOCK(); |
3265 | } |
3266 | vnode_lock(vp); |
3267 | if (vp->v_lflag & VL_TERMINATE) { |
3268 | vnode_unlock(vp); |
3269 | return ENOENT; |
3270 | } |
3271 | vnode_reclaim_internal(vp, 1, 0, REVOKEALL); |
3272 | vnode_unlock(vp); |
3273 | |
3274 | return 0; |
3275 | } |
3276 | |
3277 | /* |
3278 | * Recycle an unused vnode to the front of the free list. |
* If the vnode is busy, just mark it (VL_MARKTERM) for a deferred reclaim.
3280 | */ |
3281 | int |
3282 | vnode_recycle(struct vnode *vp) |
3283 | { |
3284 | vnode_lock_spin(vp); |
3285 | |
3286 | if (vp->v_iocount || vp->v_usecount) { |
3287 | vp->v_lflag |= VL_MARKTERM; |
3288 | vnode_unlock(vp); |
3289 | return 0; |
3290 | } |
3291 | vnode_lock_convert(vp); |
3292 | vnode_hold(vp); |
3293 | vnode_reclaim_internal(vp, 1, 0, 0); |
3294 | |
3295 | vnode_drop_and_unlock(vp); |
3296 | |
3297 | return 1; |
3298 | } |
3299 | |
3300 | static int |
3301 | vnode_reload(vnode_t vp) |
3302 | { |
3303 | vnode_lock_spin(vp); |
3304 | |
3305 | if ((vp->v_iocount > 1) || vp->v_usecount) { |
3306 | vnode_unlock(vp); |
3307 | return 0; |
3308 | } |
3309 | if (vp->v_iocount <= 0) { |
3310 | panic("vnode_reload with no iocount %d" , vp->v_iocount); |
3311 | } |
3312 | |
/* mark for release when iocount is dropped */
3314 | vp->v_lflag |= VL_MARKTERM; |
3315 | vnode_unlock(vp); |
3316 | |
3317 | return 1; |
3318 | } |
3319 | |
3320 | |
3321 | static void |
3322 | vgone(vnode_t vp, int flags) |
3323 | { |
3324 | struct vnode *vq; |
3325 | struct vnode *vx; |
3326 | |
3327 | /* |
3328 | * Clean out the filesystem specific data. |
3329 | * vclean also takes care of removing the |
3330 | * vnode from any mount list it might be on |
3331 | */ |
vclean(vp, flags | DOCLOSE);
3333 | |
3334 | /* |
3335 | * If special device, remove it from special device alias list |
3336 | * if it is on one. |
3337 | */ |
3338 | if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) { |
3339 | SPECHASH_LOCK(); |
3340 | if (*vp->v_hashchain == vp) { |
3341 | *vp->v_hashchain = vp->v_specnext; |
3342 | } else { |
3343 | for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { |
3344 | if (vq->v_specnext != vp) { |
3345 | continue; |
3346 | } |
3347 | vq->v_specnext = vp->v_specnext; |
3348 | break; |
3349 | } |
3350 | if (vq == NULL) { |
3351 | panic("missing bdev" ); |
3352 | } |
3353 | } |
3354 | if (vp->v_specflags & SI_ALIASED) { |
3355 | vx = NULL; |
3356 | for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { |
3357 | if (vq->v_rdev != vp->v_rdev || |
3358 | vq->v_type != vp->v_type) { |
3359 | continue; |
3360 | } |
3361 | if (vx) { |
3362 | break; |
3363 | } |
3364 | vx = vq; |
3365 | } |
3366 | if (vx == NULL) { |
3367 | panic("missing alias" ); |
3368 | } |
3369 | if (vq == NULL) { |
3370 | vx->v_specflags &= ~SI_ALIASED; |
3371 | } |
3372 | vp->v_specflags &= ~SI_ALIASED; |
3373 | } |
3374 | SPECHASH_UNLOCK(); |
3375 | { |
3376 | struct specinfo *tmp = vp->v_specinfo; |
3377 | vp->v_specinfo = NULL; |
3378 | zfree(specinfo_zone, tmp); |
3379 | } |
3380 | } |
3381 | } |
3382 | |
3383 | /* |
3384 | * internal helper function only! |
3385 | * vend an _iocounted_ vnode via output argument, or return an error if unable. |
3386 | */ |
3387 | static int |
3388 | get_vp_from_dev(dev_t dev, enum vtype type, vnode_t *outvp) |
3389 | { |
3390 | vnode_t vp; |
3391 | int vid; |
3392 | |
3393 | loop: |
3394 | SPECHASH_LOCK(); |
3395 | for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) { |
3396 | if (dev != vp->v_rdev || type != vp->v_type) { |
3397 | continue; |
3398 | } |
3399 | vid = vp->v_id; |
3400 | vnode_hold(vp); |
3401 | SPECHASH_UNLOCK(); |
3402 | |
3403 | /* acquire iocount */ |
3404 | if (vnode_getwithvid(vp, vid)) { |
3405 | vnode_drop(vp); |
3406 | goto loop; |
3407 | } |
3408 | vnode_drop(vp); |
3409 | |
3410 | /* Vend iocounted vnode */ |
3411 | *outvp = vp; |
3412 | return 0; |
3413 | } |
3414 | |
3415 | /* vnode not found, error out */ |
3416 | SPECHASH_UNLOCK(); |
3417 | return ENOENT; |
3418 | } |
3419 | |
3420 | |
3421 | |
3422 | /* |
3423 | * Lookup a vnode by device number. |
3424 | */ |
3425 | int |
3426 | check_mountedon(dev_t dev, enum vtype type, int *errorp) |
3427 | { |
3428 | vnode_t vp = NULLVP; |
3429 | int rc = 0; |
3430 | |
rc = get_vp_from_dev(dev, type, &vp);
3432 | if (rc) { |
3433 | /* if no vnode found, it cannot be mounted on */ |
3434 | return 0; |
3435 | } |
3436 | |
3437 | /* otherwise, examine it */ |
3438 | vnode_lock_spin(vp); |
3439 | /* note: exclude the iocount we JUST got (e.g. >1, not >0) */ |
3440 | if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) { |
3441 | vnode_unlock(vp); |
3442 | if ((*errorp = vfs_mountedon(vp)) != 0) { |
3443 | rc = 1; |
3444 | } |
3445 | } else { |
3446 | vnode_unlock(vp); |
3447 | } |
3448 | /* release iocount! */ |
3449 | vnode_put(vp); |
3450 | |
3451 | return rc; |
3452 | } |
3453 | |
3454 | extern dev_t chrtoblk(dev_t d); |
3455 | |
3456 | /* |
3457 | * Examine the supplied vnode's dev_t and find its counterpart |
 * (e.g. VCHR => VBLK) to compare against.
3459 | */ |
3460 | static int |
3461 | vnode_cmp_paired_dev(vnode_t vp, vnode_t bdev_vp, enum vtype in_type, |
3462 | enum vtype out_type) |
3463 | { |
3464 | if (!vp || !bdev_vp) { |
3465 | return EINVAL; |
3466 | } |
3467 | /* Verify iocounts */ |
	if (vnode_iocount(vp) <= 0 ||
	    vnode_iocount(bdev_vp) <= 0) {
3470 | return EINVAL; |
3471 | } |
3472 | |
3473 | /* check for basic matches */ |
3474 | if (vnode_vtype(vp) != in_type) { |
3475 | return EINVAL; |
3476 | } |
	if (vnode_vtype(bdev_vp) != out_type) {
3478 | return EINVAL; |
3479 | } |
3480 | |
3481 | dev_t dev = vnode_specrdev(vp); |
	dev_t blk_devt = vnode_specrdev(bdev_vp);
3483 | |
3484 | if (in_type == VCHR) { |
3485 | if (out_type != VBLK) { |
3486 | return EINVAL; |
3487 | } |
		dev_t bdev = chrtoblk(dev);
3489 | if (bdev == NODEV) { |
3490 | return EINVAL; |
3491 | } else if (bdev == blk_devt) { |
3492 | return 0; |
3493 | } |
3494 | //fall through |
3495 | } |
3496 | /* |
3497 | * else case: |
3498 | * |
3499 | * in_type == VBLK? => VCHR? |
3500 | * not implemented... |
 * exercise to the reader: this can be built by
 * taking the device's major, and iterating the `chrtoblktab`
 * array to look for a value that matches; a sketch follows
 * this function.
3504 | */ |
3505 | return EINVAL; |
3506 | } |
3507 | /* |
3508 | * Vnode compare: does the supplied vnode's CHR device, match the dev_t |
3509 | * of the accompanying `blk_vp` ? |
3510 | * NOTE: vnodes MUST be iocounted BEFORE calling this! |
3511 | */ |
3512 | |
3513 | int |
3514 | vnode_cmp_chrtoblk(vnode_t vp, vnode_t blk_vp) |
3515 | { |
	return vnode_cmp_paired_dev(vp, blk_vp, VCHR, VBLK);
3517 | } |
3518 | |
3519 | |
3520 | |
3521 | /* |
3522 | * Calculate the total number of references to a special device. |
3523 | */ |
3524 | int |
3525 | vcount(vnode_t vp) |
3526 | { |
3527 | vnode_t vq, vnext; |
3528 | int count; |
3529 | int vid; |
3530 | |
3531 | if (!vnode_isspec(vp)) { |
3532 | return vp->v_usecount - vp->v_kusecount; |
3533 | } |
3534 | |
3535 | loop: |
3536 | if (!vnode_isaliased(vp)) { |
3537 | return vp->v_specinfo->si_opencount; |
3538 | } |
3539 | count = 0; |
3540 | |
3541 | SPECHASH_LOCK(); |
3542 | /* |
3543 | * Grab first vnode and its vid. |
3544 | */ |
3545 | vq = *vp->v_hashchain; |
3546 | if (vq) { |
3547 | vid = vq->v_id; |
		vnode_hold(vq);
3549 | } else { |
3550 | vid = 0; |
3551 | } |
3552 | SPECHASH_UNLOCK(); |
3553 | |
3554 | while (vq) { |
3555 | /* |
3556 | * Attempt to get the vnode outside the SPECHASH lock. |
3557 | * Don't take iocount on 'vp' as iocount is already held by the caller. |
3558 | */ |
		if ((vq != vp) && vnode_getwithvid(vq, vid)) {
			vnode_drop(vq);
			goto loop;
		}
		vnode_drop(vq);
3564 | vnode_lock(vq); |
3565 | |
3566 | if (vq->v_rdev == vp->v_rdev && vq->v_type == vp->v_type) { |
3567 | if ((vq->v_usecount == 0) && (vq->v_iocount == 1) && vq != vp) { |
3568 | /* |
3569 | * Alias, but not in use, so flush it out. |
3570 | */ |
				vnode_hold(vq);
3572 | vnode_reclaim_internal(vq, 1, 1, 0); |
3573 | vnode_put_locked(vq); |
3574 | vnode_drop_and_unlock(vq); |
3575 | goto loop; |
3576 | } |
3577 | count += vq->v_specinfo->si_opencount; |
3578 | } |
3579 | vnode_unlock(vq); |
3580 | |
3581 | SPECHASH_LOCK(); |
3582 | /* |
3583 | * must do this with the reference still held on 'vq' |
3584 | * so that it can't be destroyed while we're poking |
3585 | * through v_specnext |
3586 | */ |
3587 | vnext = vq->v_specnext; |
3588 | if (vnext) { |
3589 | vid = vnext->v_id; |
			vnode_hold(vnext);
3591 | } else { |
3592 | vid = 0; |
3593 | } |
3594 | SPECHASH_UNLOCK(); |
3595 | |
3596 | if (vq != vp) { |
			vnode_put(vq);
3598 | } |
3599 | |
3600 | vq = vnext; |
3601 | } |
3602 | |
3603 | return count; |
3604 | } |
3605 | |
3606 | int prtactive = 0; /* 1 => print out reclaim of active vnodes */ |
3607 | |
3608 | /* |
3609 | * Print out a description of a vnode. |
3610 | */ |
static const char *typename[] =
{ "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };
3613 | |
3614 | void |
3615 | vprint(const char *label, struct vnode *vp) |
3616 | { |
3617 | char sbuf[64]; |
3618 | |
3619 | if (label != NULL) { |
3620 | printf("%s: " , label); |
3621 | } |
3622 | printf("name %s type %s, usecount %d, writecount %d\n" , |
3623 | vp->v_name, typename[vp->v_type], |
3624 | vp->v_usecount, vp->v_writecount); |
3625 | sbuf[0] = '\0'; |
	if (vp->v_flag & VROOT) {
		strlcat(sbuf, "|VROOT", sizeof(sbuf));
	}
	if (vp->v_flag & VTEXT) {
		strlcat(sbuf, "|VTEXT", sizeof(sbuf));
	}
	if (vp->v_flag & VSYSTEM) {
		strlcat(sbuf, "|VSYSTEM", sizeof(sbuf));
	}
	if (vp->v_flag & VNOFLUSH) {
		strlcat(sbuf, "|VNOFLUSH", sizeof(sbuf));
	}
	if (vp->v_flag & VBWAIT) {
		strlcat(sbuf, "|VBWAIT", sizeof(sbuf));
	}
	if (vnode_isaliased(vp)) {
		strlcat(sbuf, "|VALIASED", sizeof(sbuf));
	}
3644 | if (sbuf[0] != '\0') { |
3645 | printf("vnode flags (%s\n" , &sbuf[1]); |
3646 | } |
3647 | } |
3648 | |
3649 | static int |
3650 | vn_getpath_flags_to_buildpath_flags(int flags) |
3651 | { |
3652 | int bpflags = (flags & VN_GETPATH_FSENTER) ? 0 : BUILDPATH_NO_FS_ENTER; |
3653 | |
3654 | if (flags && (flags != VN_GETPATH_FSENTER)) { |
3655 | if (flags & VN_GETPATH_NO_FIRMLINK) { |
3656 | bpflags |= BUILDPATH_NO_FIRMLINK; |
3657 | } |
3658 | if (flags & VN_GETPATH_VOLUME_RELATIVE) { |
3659 | bpflags |= (BUILDPATH_VOLUME_RELATIVE | |
3660 | BUILDPATH_NO_FIRMLINK); |
3661 | } |
3662 | if (flags & VN_GETPATH_NO_PROCROOT) { |
3663 | bpflags |= BUILDPATH_NO_PROCROOT; |
3664 | } |
3665 | if (flags & VN_GETPATH_CHECK_MOVED) { |
3666 | bpflags |= BUILDPATH_CHECK_MOVED; |
3667 | } |
3668 | } |
3669 | |
3670 | return bpflags; |
3671 | } |
3672 | |
3673 | int |
3674 | vn_getpath_ext_with_mntlen(struct vnode *vp, struct vnode *dvp, char *pathbuf, |
3675 | size_t *len, size_t *mntlen, int flags) |
3676 | { |
3677 | int bpflags = vn_getpath_flags_to_buildpath_flags(flags); |
3678 | int local_len; |
3679 | int error; |
3680 | |
3681 | if (*len > INT_MAX) { |
3682 | return EINVAL; |
3683 | } |
3684 | |
3685 | local_len = *len; |
3686 | |
3687 | error = build_path_with_parent(vp, dvp, pathbuf, local_len, &local_len, |
3688 | mntlen, bpflags, vfs_context_current()); |
3689 | |
3690 | if (local_len >= 0 && local_len <= (int)*len) { |
3691 | *len = (size_t)local_len; |
3692 | } |
3693 | |
3694 | return error; |
3695 | } |
3696 | |
3697 | int |
3698 | vn_getpath_ext(struct vnode *vp, struct vnode *dvp, char *pathbuf, size_t *len, |
3699 | int flags) |
3700 | { |
3701 | return vn_getpath_ext_with_mntlen(vp, dvp, pathbuf, len, NULL, flags); |
3702 | } |
3703 | |
3704 | /* |
3705 | * Wrapper around vn_getpath_ext() that takes care of the int * <-> size_t * |
3706 | * conversion for the legacy KPIs. |
3707 | */ |
3708 | static int |
3709 | vn_getpath_ext_int(struct vnode *vp, struct vnode *dvp, char *pathbuf, |
3710 | int *len, int flags) |
3711 | { |
3712 | size_t slen = *len; |
3713 | int error; |
3714 | |
3715 | if (*len < 0) { |
3716 | return EINVAL; |
3717 | } |
3718 | |
	error = vn_getpath_ext(vp, dvp, pathbuf, &slen, flags);
3720 | |
3721 | if (slen <= INT_MAX) { |
3722 | *len = (int)slen; |
3723 | } |
3724 | |
3725 | return error; |
3726 | } |
3727 | |
3728 | int |
3729 | vn_getpath(struct vnode *vp, char *pathbuf, int *len) |
3730 | { |
	return vn_getpath_ext_int(vp, NULL, pathbuf, len, 0);
3732 | } |
3733 | |
3734 | int |
3735 | vn_getpath_fsenter(struct vnode *vp, char *pathbuf, int *len) |
3736 | { |
3737 | return vn_getpath_ext_int(vp, NULL, pathbuf, len, VN_GETPATH_FSENTER); |
3738 | } |
3739 | |
3740 | /* |
3741 | * vn_getpath_fsenter_with_parent will reenter the file system to fine the path of the |
3742 | * vnode. It requires that there are IO counts on both the vnode and the directory vnode. |
3743 | * |
3744 | * vn_getpath_fsenter is called by MAC hooks to authorize operations for every thing, but |
3745 | * unlink, rmdir and rename. For these operation the MAC hook calls vn_getpath. This presents |
3746 | * problems where if the path can not be found from the name cache, those operations can |
3747 | * erroneously fail with EPERM even though the call should succeed. When removing or moving |
3748 | * file system objects with operations such as unlink or rename, those operations need to |
3749 | * take IO counts on the target and containing directory. Calling vn_getpath_fsenter from a |
3750 | * MAC hook from these operations during forced unmount operations can lead to dead |
3751 | * lock. This happens when the operation starts, IO counts are taken on the containing |
3752 | * directories and targets. Before the MAC hook is called a forced unmount from another |
3753 | * thread takes place and blocks on the on going operation's directory vnode in vdrain. |
3754 | * After which, the MAC hook gets called and calls vn_getpath_fsenter. vn_getpath_fsenter |
3755 | * is called with the understanding that there is an IO count on the target. If in |
3756 | * build_path the directory vnode is no longer in the cache, then the parent object id via |
3757 | * vnode_getattr from the target is obtain and used to call VFS_VGET to get the parent |
3758 | * vnode. The file system's VFS_VGET then looks up by inode in its hash and tries to get |
3759 | * an IO count. But VFS_VGET "sees" the directory vnode is in vdrain and can block |
3760 | * depending on which version and how it calls the vnode_get family of interfaces. |
3761 | * |
3762 | * N.B. A reasonable interface to use is vnode_getwithvid. This interface was modified to |
3763 | * call vnode_getiocount with VNODE_DRAINO, so it will happily get an IO count and not |
3764 | * cause issues, but there is no guarantee that all or any file systems are doing that. |
3765 | * |
3766 | * vn_getpath_fsenter_with_parent can enter the file system safely since there is a known |
3767 | * IO count on the directory vnode by calling build_path_with_parent. |
3768 | */ |
3769 | |
3770 | int |
3771 | vn_getpath_fsenter_with_parent(struct vnode *dvp, struct vnode *vp, char *pathbuf, int *len) |
3772 | { |
3773 | return build_path_with_parent(vp, dvp, pathbuf, *len, len, NULL, 0, vfs_context_current()); |
3774 | } |
3775 | |
3776 | int |
3777 | vn_getpath_no_firmlink(struct vnode *vp, char *pathbuf, int *len) |
3778 | { |
3779 | return vn_getpath_ext_int(vp, NULLVP, pathbuf, len, |
3780 | VN_GETPATH_NO_FIRMLINK); |
3781 | } |
3782 | |
3783 | int |
3784 | vn_getcdhash(struct vnode *vp, off_t offset, unsigned char *cdhash) |
3785 | { |
3786 | return ubc_cs_getcdhash(vp, offset, cdhash); |
3787 | } |
3788 | |
3789 | |
3790 | static char *extension_table = NULL; |
3791 | static int nexts; |
3792 | static int max_ext_width; |
3793 | |
3794 | static int |
3795 | extension_cmp(const void *a, const void *b) |
3796 | { |
	return (int)(strlen((const char *)a) - strlen((const char *)b));
3798 | } |
3799 | |
3800 | |
3801 | // |
3802 | // This is the api LaunchServices uses to inform the kernel |
3803 | // the list of package extensions to ignore. |
3804 | // |
3805 | // Internally we keep the list sorted by the length of the |
3806 | // the extension (from longest to shortest). We sort the |
3807 | // list of extensions so that we can speed up our searches |
3808 | // when comparing file names -- we only compare extensions |
3809 | // that could possibly fit into the file name, not all of |
3810 | // them (i.e. a short 8 character name can't have an 8 |
3811 | // character extension). |
3812 | // |
3813 | extern lck_mtx_t pkg_extensions_lck; |
3814 | |
3815 | __private_extern__ int |
3816 | set_package_extensions_table(user_addr_t data, int nentries, int maxwidth) |
3817 | { |
3818 | char *new_exts, *old_exts; |
3819 | int old_nentries = 0, old_maxwidth = 0; |
3820 | int error; |
3821 | |
3822 | if (nentries <= 0 || nentries > 1024 || maxwidth <= 0 || maxwidth > 255) { |
3823 | return EINVAL; |
3824 | } |
3825 | |
3826 | |
3827 | // allocate one byte extra so we can guarantee null termination |
3828 | new_exts = kalloc_data((nentries * maxwidth) + 1, Z_WAITOK); |
3829 | if (new_exts == NULL) { |
3830 | return ENOMEM; |
3831 | } |
3832 | |
3833 | error = copyin(data, new_exts, nentries * maxwidth); |
3834 | if (error) { |
3835 | kfree_data(new_exts, (nentries * maxwidth) + 1); |
3836 | return error; |
3837 | } |
3838 | |
3839 | new_exts[(nentries * maxwidth)] = '\0'; // guarantee null termination of the block |
3840 | |
	qsort(new_exts, nentries, maxwidth, extension_cmp);
3842 | |
	lck_mtx_lock(&pkg_extensions_lck);
3844 | |
3845 | old_exts = extension_table; |
3846 | old_nentries = nexts; |
3847 | old_maxwidth = max_ext_width; |
3848 | extension_table = new_exts; |
3849 | nexts = nentries; |
3850 | max_ext_width = maxwidth; |
3851 | |
	lck_mtx_unlock(&pkg_extensions_lck);
3853 | |
3854 | kfree_data(old_exts, (old_nentries * old_maxwidth) + 1); |
3855 | |
3856 | return 0; |
3857 | } |
3858 | |
3859 | |
3860 | int |
3861 | is_package_name(const char *name, int len) |
3862 | { |
3863 | int i; |
3864 | size_t extlen; |
3865 | const char *ptr, *name_ext; |
3866 | |
	// if the name is 3 bytes or less it can't be of the
	// form A.B and if it begins with a "." then it is also
	// not a package.
3870 | if (len <= 3 || name[0] == '.') { |
3871 | return 0; |
3872 | } |
3873 | |
3874 | name_ext = NULL; |
3875 | for (ptr = name; *ptr != '\0'; ptr++) { |
3876 | if (*ptr == '.') { |
3877 | name_ext = ptr; |
3878 | } |
3879 | } |
3880 | |
3881 | // if there is no "." extension, it can't match |
3882 | if (name_ext == NULL) { |
3883 | return 0; |
3884 | } |
3885 | |
3886 | // advance over the "." |
3887 | name_ext++; |
3888 | |
	lck_mtx_lock(&pkg_extensions_lck);
3890 | |
3891 | // now iterate over all the extensions to see if any match |
3892 | ptr = &extension_table[0]; |
3893 | for (i = 0; i < nexts; i++, ptr += max_ext_width) { |
		extlen = strlen(ptr);
		if (strncasecmp(name_ext, ptr, extlen) == 0 && name_ext[extlen] == '\0') {
3896 | // aha, a match! |
			lck_mtx_unlock(&pkg_extensions_lck);
3898 | return 1; |
3899 | } |
3900 | } |
3901 | |
	lck_mtx_unlock(&pkg_extensions_lck);
3903 | |
3904 | // if we get here, no extension matched |
3905 | return 0; |
3906 | } |
3907 | |
3908 | int |
3909 | vn_path_package_check(__unused vnode_t vp, char *path, int pathlen, int *component) |
3910 | { |
3911 | char *ptr, *end; |
3912 | int comp = 0; |
3913 | |
3914 | if (pathlen < 0) { |
3915 | return EINVAL; |
3916 | } |
3917 | |
3918 | *component = -1; |
3919 | if (*path != '/') { |
3920 | return EINVAL; |
3921 | } |
3922 | |
3923 | end = path + 1; |
3924 | while (end < path + pathlen && *end != '\0') { |
3925 | while (end < path + pathlen && *end == '/' && *end != '\0') { |
3926 | end++; |
3927 | } |
3928 | |
3929 | ptr = end; |
3930 | |
3931 | while (end < path + pathlen && *end != '/' && *end != '\0') { |
3932 | end++; |
3933 | } |
3934 | |
3935 | if (end > path + pathlen) { |
3936 | // hmm, string wasn't null terminated |
3937 | return EINVAL; |
3938 | } |
3939 | |
3940 | *end = '\0'; |
		if (is_package_name(ptr, (int)(end - ptr))) {
3942 | *component = comp; |
3943 | break; |
3944 | } |
3945 | |
3946 | end++; |
3947 | comp++; |
3948 | } |
3949 | |
3950 | return 0; |
3951 | } |
3952 | |
3953 | /* |
3954 | * Determine if a name is inappropriate for a searchfs query. |
3955 | * This list consists of /System currently. |
3956 | */ |
3957 | |
3958 | int |
3959 | vn_searchfs_inappropriate_name(const char *name, int len) |
3960 | { |
3961 | const char *bad_names[] = { "System" }; |
3962 | int bad_len[] = { 6 }; |
3963 | int i; |
3964 | |
3965 | if (len < 0) { |
3966 | return EINVAL; |
3967 | } |
3968 | |
3969 | for (i = 0; i < (int) (sizeof(bad_names) / sizeof(bad_names[0])); i++) { |
		if (len == bad_len[i] && strncmp(name, bad_names[i], strlen(bad_names[i]) + 1) == 0) {
3971 | return 1; |
3972 | } |
3973 | } |
3974 | |
3975 | // if we get here, no name matched |
3976 | return 0; |
3977 | } |
3978 | |
3979 | /* |
3980 | * Top level filesystem related information gathering. |
3981 | */ |
3982 | extern unsigned int vfs_nummntops; |
3983 | |
3984 | /* |
 * The VFS_NUMMNTOPS shouldn't be at name[1] since it
 * is a VFS generic variable. Since we no longer support
3987 | * VT_UFS, we reserve its value to support this sysctl node. |
3988 | * |
3989 | * It should have been: |
3990 | * name[0]: VFS_GENERIC |
3991 | * name[1]: VFS_NUMMNTOPS |
3992 | */ |
3993 | SYSCTL_INT(_vfs, VFS_NUMMNTOPS, nummntops, |
3994 | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, |
3995 | &vfs_nummntops, 0, "" ); |
3996 | |
3997 | int |
3998 | vfs_sysctl(int *name __unused, u_int namelen __unused, |
3999 | user_addr_t oldp __unused, size_t *oldlenp __unused, |
4000 | user_addr_t newp __unused, size_t newlen __unused, proc_t p __unused); |
4001 | |
4002 | int |
4003 | vfs_sysctl(int *name __unused, u_int namelen __unused, |
4004 | user_addr_t oldp __unused, size_t *oldlenp __unused, |
4005 | user_addr_t newp __unused, size_t newlen __unused, proc_t p __unused) |
4006 | { |
4007 | return EINVAL; |
4008 | } |
4009 | |
4010 | |
4011 | // |
4012 | // The following code disallows specific sysctl's that came through |
4013 | // the direct sysctl interface (vfs_sysctl_node) instead of the newer |
4014 | // sysctl_vfs_ctlbyfsid() interface. We can not allow these selectors |
4015 | // through vfs_sysctl_node() because it passes the user's oldp pointer |
4016 | // directly to the file system which (for these selectors) casts it |
4017 | // back to a struct sysctl_req and then proceed to use SYSCTL_IN() |
4018 | // which jumps through an arbitrary function pointer. When called |
4019 | // through the sysctl_vfs_ctlbyfsid() interface this does not happen |
4020 | // and so it's safe. |
4021 | // |
4022 | // Unfortunately we have to pull in definitions from AFP and SMB and |
4023 | // perform explicit name checks on the file system to determine if |
4024 | // these selectors are being used. |
4025 | // |
4026 | |
4027 | #define AFPFS_VFS_CTL_GETID 0x00020001 |
4028 | #define AFPFS_VFS_CTL_NETCHANGE 0x00020002 |
4029 | #define AFPFS_VFS_CTL_VOLCHANGE 0x00020003 |
4030 | |
4031 | #define SMBFS_SYSCTL_REMOUNT 1 |
4032 | #define SMBFS_SYSCTL_REMOUNT_INFO 2 |
4033 | #define SMBFS_SYSCTL_GET_SERVER_SHARE 3 |
4034 | |
4035 | |
4036 | static int |
4037 | is_bad_sysctl_name(struct vfstable *vfsp, int selector_name) |
4038 | { |
4039 | switch (selector_name) { |
4040 | case VFS_CTL_QUERY: |
4041 | case VFS_CTL_TIMEO: |
4042 | case VFS_CTL_NOLOCKS: |
4043 | case VFS_CTL_NSTATUS: |
4044 | case VFS_CTL_SADDR: |
4045 | case VFS_CTL_DISC: |
4046 | case VFS_CTL_SERVERINFO: |
4047 | return 1; |
4048 | |
4049 | default: |
4050 | break; |
4051 | } |
4052 | |
4053 | // the more complicated check for some of SMB's special values |
	if (strcmp(vfsp->vfc_name, "smbfs") == 0) {
4055 | switch (selector_name) { |
4056 | case SMBFS_SYSCTL_REMOUNT: |
4057 | case SMBFS_SYSCTL_REMOUNT_INFO: |
4058 | case SMBFS_SYSCTL_GET_SERVER_SHARE: |
4059 | return 1; |
4060 | } |
	} else if (strcmp(vfsp->vfc_name, "afpfs") == 0) {
4062 | switch (selector_name) { |
4063 | case AFPFS_VFS_CTL_GETID: |
4064 | case AFPFS_VFS_CTL_NETCHANGE: |
4065 | case AFPFS_VFS_CTL_VOLCHANGE: |
4066 | return 1; |
4067 | } |
4068 | } |
4069 | |
4070 | // |
4071 | // If we get here we passed all the checks so the selector is ok |
4072 | // |
4073 | return 0; |
4074 | } |
4075 | |
4076 | |
4077 | int vfs_sysctl_node SYSCTL_HANDLER_ARGS |
4078 | { |
4079 | int *name, namelen; |
4080 | struct vfstable *vfsp; |
4081 | int error; |
4082 | int fstypenum; |
4083 | |
4084 | fstypenum = oidp->oid_number; |
4085 | name = arg1; |
4086 | namelen = arg2; |
4087 | |
4088 | /* all sysctl names at this level should have at least one name slot for the FS */ |
4089 | if (namelen < 1) { |
4090 | return EISDIR; /* overloaded */ |
4091 | } |
4092 | mount_list_lock(); |
4093 | for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { |
4094 | if (vfsp->vfc_typenum == fstypenum) { |
4095 | vfsp->vfc_refcount++; |
4096 | break; |
4097 | } |
4098 | } |
4099 | mount_list_unlock(); |
4100 | |
4101 | if (vfsp == NULL) { |
4102 | return ENOTSUP; |
4103 | } |
4104 | |
	if (is_bad_sysctl_name(vfsp, name[0])) {
		printf("vfs: bad selector 0x%.8x for old-style sysctl(). use the sysctl-by-fsid interface instead\n", name[0]);
4107 | error = EPERM; |
4108 | } else { |
4109 | error = (vfsp->vfc_vfsops->vfs_sysctl)(name, namelen, |
4110 | req->oldptr, &req->oldlen, req->newptr, req->newlen, |
4111 | vfs_context_current()); |
4112 | } |
4113 | |
4114 | mount_list_lock(); |
4115 | vfsp->vfc_refcount--; |
4116 | mount_list_unlock(); |
4117 | |
4118 | return error; |
4119 | } |
4120 | |
4121 | /* |
4122 | * Check to see if a filesystem is mounted on a block device. |
4123 | */ |
4124 | int |
4125 | vfs_mountedon(struct vnode *vp) |
4126 | { |
4127 | struct vnode *vq; |
4128 | int error = 0; |
4129 | |
4130 | restart: |
4131 | SPECHASH_LOCK(); |
4132 | if (vp->v_specflags & SI_MOUNTING && (vp->v_specinfo->si_mountingowner != current_thread())) { |
		msleep((caddr_t)&vp->v_specflags, SPECHASH_LOCK_ADDR(), PVFS | PDROP, "vnode_waitformounting", NULL);
4134 | goto restart; |
4135 | } |
4136 | if (vp->v_specflags & SI_MOUNTEDON) { |
4137 | error = EBUSY; |
4138 | goto out; |
4139 | } |
4140 | if (vp->v_specflags & SI_ALIASED) { |
4141 | for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { |
4142 | if (vq->v_rdev != vp->v_rdev || |
4143 | vq->v_type != vp->v_type) { |
4144 | continue; |
4145 | } |
4146 | if (vq->v_specflags & SI_MOUNTING) { |
				msleep((caddr_t)&vq->v_specflags, SPECHASH_LOCK_ADDR(), PVFS | PDROP, "vnode_waitformounting", NULL);
4148 | goto restart; |
4149 | } |
4150 | if (vq->v_specflags & SI_MOUNTEDON) { |
4151 | error = EBUSY; |
4152 | break; |
4153 | } |
4154 | } |
4155 | } |
4156 | out: |
4157 | SPECHASH_UNLOCK(); |
4158 | return error; |
4159 | } |
4160 | |
4161 | void |
4162 | vfs_setmountedon(vnode_t vp) |
4163 | { |
4164 | vnode_lock(vp); |
4165 | SPECHASH_LOCK(); |
4166 | vp->v_specflags |= SI_MOUNTEDON; |
4167 | vp->v_specflags &= ~SI_MOUNTING; |
4168 | vp->v_specinfo->si_mountingowner = NULL; |
4169 | SPECHASH_UNLOCK(); |
4170 | vnode_unlock(vp); |
	wakeup(&vp->v_specflags);
4172 | } |
4173 | |
4174 | void |
4175 | vfs_clearmounting(vnode_t vp) |
4176 | { |
4177 | vnode_lock(vp); |
4178 | SPECHASH_LOCK(); |
4179 | vp->v_specflags &= ~SI_MOUNTING; |
4180 | vp->v_specinfo->si_mountingowner = NULL; |
4181 | SPECHASH_UNLOCK(); |
4182 | vnode_unlock(vp); |
	wakeup(&vp->v_specflags);
4184 | } |
4185 | |
4186 | /* |
4187 | * Check to see if a filesystem is mounted on a block device. |
4188 | */ |
4189 | int |
4190 | vfs_setmounting(vnode_t vp) |
4191 | { |
4192 | struct vnode *vq; |
4193 | int error = 0; |
4194 | |
4195 | vnode_lock(vp); |
4196 | while (vp->v_specflags & SI_MOUNTING) { |
		msleep((caddr_t)&vp->v_specflags, &vp->v_lock, PVFS, "vnode_waitformounting", NULL);
4198 | } |
4199 | if (vp->v_specflags & SI_MOUNTEDON) { |
4200 | vnode_unlock(vp); |
4201 | return EBUSY; |
4202 | } |
4203 | SPECHASH_LOCK(); |
4204 | vp->v_specflags |= SI_MOUNTING; |
4205 | vp->v_specinfo->si_mountingowner = current_thread(); |
4206 | vnode_unlock(vp); |
4207 | restart: |
4208 | if (vp->v_specflags & SI_ALIASED) { |
4209 | for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { |
4210 | if (vq->v_rdev != vp->v_rdev || |
4211 | vq->v_type != vp->v_type || vq == vp) { |
4212 | continue; |
4213 | } |
4214 | if (vq->v_specflags & SI_MOUNTING) { |
				msleep((caddr_t)&vq->v_specflags, SPECHASH_LOCK_ADDR(), PVFS | PDROP, "vnode_waitformounting", NULL);
4216 | SPECHASH_LOCK(); |
4217 | goto restart; |
4218 | } |
4219 | if (vq->v_specflags & SI_MOUNTEDON) { |
4220 | error = EBUSY; |
4221 | break; |
4222 | } |
4223 | } |
4224 | } |
4225 | SPECHASH_UNLOCK(); |
4226 | if (error) { |
4227 | vnode_lock(vp); |
4228 | SPECHASH_LOCK(); |
4229 | vp->v_specflags &= ~SI_MOUNTING; |
4230 | SPECHASH_UNLOCK(); |
4231 | vnode_unlock(vp); |
		wakeup(&vp->v_specflags);
4233 | } |
4234 | return error; |
4235 | } |
4236 | |
4237 | struct unmount_info { |
4238 | int u_errs; // Total failed unmounts |
4239 | int u_busy; // EBUSY failed unmounts |
4240 | int u_count; // Total volumes iterated |
4241 | int u_only_non_system; |
4242 | }; |
4243 | |
4244 | static int |
4245 | unmount_callback(mount_t mp, void *arg) |
4246 | { |
4247 | int error; |
4248 | char *mntname; |
4249 | struct unmount_info *uip = arg; |
4250 | |
4251 | uip->u_count++; |
4252 | |
4253 | mntname = zalloc_flags(ZV_NAMEI, Z_WAITOK | Z_NOFAIL); |
	strlcpy(mntname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
4255 | |
4256 | if (uip->u_only_non_system |
4257 | && ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM))) { //MNTK_BACKS_ROOT |
4258 | printf("unmount(%d) %s skipped\n" , uip->u_only_non_system, mntname); |
4259 | mount_iterdrop(mp); // VFS_ITERATE_CB_DROPREF |
4260 | } else { |
4261 | printf("unmount(%d) %s\n" , uip->u_only_non_system, mntname); |
4262 | |
		mount_ref(mp, 0);
4264 | mount_iterdrop(mp); // VFS_ITERATE_CB_DROPREF |
4265 | error = dounmount(mp, MNT_FORCE, 1, vfs_context_current()); |
4266 | if (error) { |
4267 | uip->u_errs++; |
4268 | printf("Unmount of %s failed (%d)\n" , mntname ? mntname:"?" , error); |
4269 | if (error == EBUSY) { |
4270 | uip->u_busy++; |
4271 | } |
4272 | } |
4273 | } |
4274 | zfree(ZV_NAMEI, mntname); |
4275 | |
4276 | return VFS_RETURNED; |
4277 | } |
4278 | |
4279 | /* |
4280 | * Unmount all filesystems. The list is traversed in reverse order |
4281 | * of mounting to avoid dependencies. |
4282 | * Busy mounts are retried. |
4283 | */ |
4284 | __private_extern__ void |
4285 | vfs_unmountall(int only_non_system) |
4286 | { |
4287 | int mounts, sec = 1; |
4288 | struct unmount_info ui; |
4289 | |
4290 | /* |
4291 | * Ensure last-completion-time is valid before anyone can see that |
4292 | * VFS shutdown has started. |
4293 | */ |
4294 | vfs_shutdown_last_completion_time = mach_absolute_time(); |
4295 | OSMemoryBarrier(); |
4296 | vfs_unmountall_started = 1; |
4297 | printf("vfs_unmountall(%ssystem) start\n" , only_non_system ? "non" : "" ); |
4298 | |
4299 | retry: |
4300 | ui.u_errs = ui.u_busy = ui.u_count = 0; |
4301 | ui.u_only_non_system = only_non_system; |
4302 | // avoid vfs_iterate deadlock in dounmount(), use VFS_ITERATE_CB_DROPREF |
	vfs_iterate(VFS_ITERATE_CB_DROPREF | VFS_ITERATE_TAIL_FIRST, unmount_callback, &ui);
4304 | mounts = mount_getvfscnt(); |
4305 | if (mounts == 0) { |
4306 | goto out; |
4307 | } |
4308 | if (ui.u_busy > 0) { // Busy mounts - wait & retry |
4309 | tsleep(chan: &nummounts, PVFS, wmesg: "busy mount" , timo: sec * hz); |
4310 | sec *= 2; |
4311 | if (sec <= 32) { |
4312 | goto retry; |
4313 | } |
4314 | printf("Unmounting timed out\n" ); |
4315 | } else if (ui.u_count < mounts) { |
4316 | // If the vfs_iterate missed mounts in progress - wait a bit |
4317 | tsleep(chan: &nummounts, PVFS, wmesg: "missed mount" , timo: 2 * hz); |
4318 | } |
4319 | |
4320 | out: |
4321 | printf("vfs_unmountall(%ssystem) end\n" , only_non_system ? "non" : "" ); |
4322 | |
4323 | /* |
4324 | * reboot_kernel() calls us twice; once to deal with non-system |
4325 | * mounts, and again to sweep up anything left after terminating |
4326 | * DEXTs. We're only finished once we've completed the second pass. |
4327 | */ |
4328 | if (!only_non_system) { |
4329 | vfs_unmountall_finished = 1; |
4330 | } |
4331 | } |
4332 | |
4333 | /* |
4334 | * vfs_shutdown_in_progress -- |
4335 | * |
4336 | * Returns whether or not the VFS is shutting down the file systems. |
4337 | */ |
4338 | boolean_t |
4339 | vfs_shutdown_in_progress(void) |
4340 | { |
4341 | return vfs_unmountall_started && !vfs_unmountall_finished; |
4342 | } |
4343 | |
4344 | /* |
4345 | * vfs_shutdown_finished -- |
4346 | * |
4347 | * Returns whether or not the VFS shutdown has completed. |
4348 | */ |
4349 | boolean_t |
4350 | vfs_shutdown_finished(void) |
4351 | { |
4352 | return !!vfs_unmountall_finished; |
4353 | } |
4354 | |
4355 | /* |
4356 | * vfs_update_last_completion_time -- |
4357 | * |
4358 | * Updates the "last I/O completion time" timestamp used by the watchdog |
4359 | * to monitor VFS shutdown progress. Called by various I/O stack layers |
4360 | * as operations complete and progress moves forward. |
4361 | */ |
4362 | void |
4363 | vfs_update_last_completion_time(void) |
4364 | { |
4365 | if (vfs_unmountall_started) { |
4366 | vfs_shutdown_last_completion_time = mach_absolute_time(); |
4367 | } |
4368 | } |
4369 | |
4370 | /* |
4371 | * vfs_last_completion_time -- |
4372 | * |
4373 | * Returns the "last I/O completion time" timestamp. Return |
4374 | * value is a mach_absolute_time() value, and is not meaningful |
4375 | * unless vfs_is_shutting_down() also returns true. |
4376 | */ |
4377 | uint64_t |
4378 | vfs_last_completion_time(void) |
4379 | { |
4380 | return vfs_unmountall_started ? vfs_shutdown_last_completion_time : 0; |
4381 | } |
4382 | |
4383 | /* |
4384 | * This routine is called from vnode_pager_deallocate out of the VM |
4385 | * The path to vnode_pager_deallocate can only be initiated by ubc_destroy_named |
4386 | * on a vnode that has a UBCINFO |
4387 | */ |
__private_extern__ void
vnode_pager_vrele(vnode_t vp)
4390 | { |
4391 | struct ubc_info *uip; |
4392 | |
4393 | vnode_lock_spin(vp); |
4394 | |
4395 | vp->v_lflag &= ~VNAMED_UBC; |
4396 | if (vp->v_usecount != 0) { |
4397 | /* |
4398 | * At the eleventh hour, just before the ubcinfo is |
4399 | * destroyed, ensure the ubc-specific v_usecount |
4400 | * reference has gone. We use v_usecount != 0 as a hint; |
4401 | * ubc_unmap() does nothing if there's no mapping. |
4402 | * |
4403 | * This case is caused by coming here via forced unmount, |
4404 | * versus the usual vm_object_deallocate() path. |
4405 | * In the forced unmount case, ubc_destroy_named() |
4406 | * releases the pager before memory_object_last_unmap() |
4407 | * can be called. |
4408 | */ |
4409 | vnode_unlock(vp); |
4410 | ubc_unmap(vp); |
4411 | vnode_lock_spin(vp); |
4412 | } |
4413 | |
4414 | uip = vp->v_ubcinfo; |
4415 | vp->v_ubcinfo = UBC_INFO_NULL; |
4416 | |
4417 | vnode_unlock(vp); |
4418 | |
4419 | ubc_info_deallocate(uip); |
4420 | } |
4421 | |
4422 | |
4423 | #include <sys/disk.h> |
4424 | |
4425 | u_int32_t rootunit = (u_int32_t)-1; |
4426 | |
4427 | #if CONFIG_IOSCHED |
4428 | extern int lowpri_throttle_enabled; |
4429 | extern int iosched_enabled; |
4430 | #endif |
4431 | |
4432 | errno_t |
4433 | vfs_init_io_attributes(vnode_t devvp, mount_t mp) |
4434 | { |
4435 | int error; |
4436 | off_t readblockcnt = 0; |
4437 | off_t writeblockcnt = 0; |
4438 | off_t readmaxcnt = 0; |
4439 | off_t writemaxcnt = 0; |
4440 | off_t readsegcnt = 0; |
4441 | off_t writesegcnt = 0; |
4442 | off_t readsegsize = 0; |
4443 | off_t writesegsize = 0; |
4444 | off_t alignment = 0; |
4445 | u_int32_t minsaturationbytecount = 0; |
4446 | u_int32_t ioqueue_depth = 0; |
4447 | u_int32_t blksize; |
4448 | u_int64_t temp; |
4449 | u_int32_t features; |
4450 | u_int64_t location = 0; |
4451 | vfs_context_t ctx = vfs_context_current(); |
4452 | dk_corestorage_info_t cs_info; |
4453 | boolean_t cs_present = FALSE; |
4454 | int isssd = 0; |
4455 | int isvirtual = 0; |
4456 | |
4457 | |
	VNOP_IOCTL(devvp, DKIOCGETTHROTTLEMASK, (caddr_t)&mp->mnt_throttle_mask, 0, NULL);
4459 | /* |
4460 | * as a reasonable approximation, only use the lowest bit of the mask |
4461 | * to generate a disk unit number |
4462 | */ |
	mp->mnt_devbsdunit = num_trailing_0(mp->mnt_throttle_mask);
4464 | |
4465 | if (devvp == rootvp) { |
4466 | rootunit = mp->mnt_devbsdunit; |
4467 | } |
4468 | |
4469 | if (mp->mnt_devbsdunit == rootunit) { |
4470 | /* |
4471 | * this mount point exists on the same device as the root |
4472 | * partition, so it comes under the hard throttle control... |
4473 | * this is true even for the root mount point itself |
4474 | */ |
4475 | mp->mnt_kern_flag |= MNTK_ROOTDEV; |
4476 | } |
4477 | /* |
4478 | * force the spec device to re-cache |
4479 | * the underlying block size in case |
4480 | * the filesystem overrode the initial value |
4481 | */ |
4482 | set_fsblocksize(devvp); |
4483 | |
4484 | |
	if ((error = VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE,
	    (caddr_t)&blksize, 0, ctx))) {
4487 | return error; |
4488 | } |
4489 | |
4490 | mp->mnt_devblocksize = blksize; |
4491 | |
4492 | /* |
4493 | * set the maximum possible I/O size |
4494 | * this may get clipped to a smaller value |
4495 | * based on which constraints are being advertised |
4496 | * and if those advertised constraints result in a smaller |
4497 | * limit for a given I/O |
4498 | */ |
4499 | mp->mnt_maxreadcnt = MAX_UPL_SIZE_BYTES; |
4500 | mp->mnt_maxwritecnt = MAX_UPL_SIZE_BYTES; |
4501 | |
	if (VNOP_IOCTL(devvp, DKIOCISVIRTUAL, (caddr_t)&isvirtual, 0, ctx) == 0) {
4503 | if (isvirtual) { |
4504 | mp->mnt_kern_flag |= MNTK_VIRTUALDEV; |
4505 | mp->mnt_flag |= MNT_REMOVABLE; |
4506 | } |
4507 | } |
	if (VNOP_IOCTL(devvp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ctx) == 0) {
4509 | if (isssd) { |
4510 | mp->mnt_kern_flag |= MNTK_SSD; |
4511 | } |
4512 | } |
	if ((error = VNOP_IOCTL(devvp, DKIOCGETFEATURES,
	    (caddr_t)&features, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD,
	    (caddr_t)&readblockcnt, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE,
	    (caddr_t)&writeblockcnt, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD,
	    (caddr_t)&readmaxcnt, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE,
	    (caddr_t)&writemaxcnt, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD,
	    (caddr_t)&readsegcnt, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE,
	    (caddr_t)&writesegcnt, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTREAD,
	    (caddr_t)&readsegsize, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTWRITE,
	    (caddr_t)&writesegsize, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETMINSEGMENTALIGNMENTBYTECOUNT,
	    (caddr_t)&alignment, 0, ctx))) {
		return error;
	}

	if ((error = VNOP_IOCTL(devvp, DKIOCGETCOMMANDPOOLSIZE,
	    (caddr_t)&ioqueue_depth, 0, ctx))) {
		return error;
	}
4567 | |
4568 | if (readmaxcnt) { |
4569 | mp->mnt_maxreadcnt = (readmaxcnt > UINT32_MAX) ? UINT32_MAX :(uint32_t) readmaxcnt; |
4570 | } |
4571 | |
4572 | if (readblockcnt) { |
4573 | temp = readblockcnt * blksize; |
4574 | temp = (temp > UINT32_MAX) ? UINT32_MAX : temp; |
4575 | |
4576 | if (temp < mp->mnt_maxreadcnt) { |
4577 | mp->mnt_maxreadcnt = (u_int32_t)temp; |
4578 | } |
4579 | } |
4580 | |
4581 | if (writemaxcnt) { |
4582 | mp->mnt_maxwritecnt = (writemaxcnt > UINT32_MAX) ? UINT32_MAX : (uint32_t)writemaxcnt; |
4583 | } |
4584 | |
4585 | if (writeblockcnt) { |
4586 | temp = writeblockcnt * blksize; |
4587 | temp = (temp > UINT32_MAX) ? UINT32_MAX : temp; |
4588 | |
4589 | if (temp < mp->mnt_maxwritecnt) { |
4590 | mp->mnt_maxwritecnt = (u_int32_t)temp; |
4591 | } |
4592 | } |
4593 | |
4594 | if (readsegcnt) { |
4595 | temp = (readsegcnt > UINT16_MAX) ? UINT16_MAX : readsegcnt; |
4596 | } else { |
4597 | temp = mp->mnt_maxreadcnt / PAGE_SIZE; |
4598 | |
4599 | if (temp > UINT16_MAX) { |
4600 | temp = UINT16_MAX; |
4601 | } |
4602 | } |
4603 | mp->mnt_segreadcnt = (u_int16_t)temp; |
4604 | |
4605 | if (writesegcnt) { |
4606 | temp = (writesegcnt > UINT16_MAX) ? UINT16_MAX : writesegcnt; |
4607 | } else { |
4608 | temp = mp->mnt_maxwritecnt / PAGE_SIZE; |
4609 | |
4610 | if (temp > UINT16_MAX) { |
4611 | temp = UINT16_MAX; |
4612 | } |
4613 | } |
4614 | mp->mnt_segwritecnt = (u_int16_t)temp; |
4615 | |
4616 | if (readsegsize) { |
4617 | temp = (readsegsize > UINT32_MAX) ? UINT32_MAX : readsegsize; |
4618 | } else { |
4619 | temp = mp->mnt_maxreadcnt; |
4620 | } |
4621 | mp->mnt_maxsegreadsize = (u_int32_t)temp; |
4622 | |
4623 | if (writesegsize) { |
4624 | temp = (writesegsize > UINT32_MAX) ? UINT32_MAX : writesegsize; |
4625 | } else { |
4626 | temp = mp->mnt_maxwritecnt; |
4627 | } |
4628 | mp->mnt_maxsegwritesize = (u_int32_t)temp; |
4629 | |
4630 | if (alignment) { |
4631 | temp = (alignment > PAGE_SIZE) ? PAGE_MASK : alignment - 1; |
4632 | } else { |
4633 | temp = 0; |
4634 | } |
4635 | mp->mnt_alignmentmask = (uint32_t)temp; |
4636 | |
4637 | |
4638 | if (ioqueue_depth > MNT_DEFAULT_IOQUEUE_DEPTH) { |
4639 | temp = ioqueue_depth; |
4640 | } else { |
4641 | temp = MNT_DEFAULT_IOQUEUE_DEPTH; |
4642 | } |
4643 | |
4644 | mp->mnt_ioqueue_depth = (uint32_t)temp; |
4645 | mp->mnt_ioscale = MNT_IOSCALE(mp->mnt_ioqueue_depth); |
4646 | |
4647 | if (mp->mnt_ioscale > 1) { |
4648 | printf("ioqueue_depth = %d, ioscale = %d\n" , (int)mp->mnt_ioqueue_depth, (int)mp->mnt_ioscale); |
4649 | } |
4650 | |
4651 | if (features & DK_FEATURE_FORCE_UNIT_ACCESS) { |
4652 | mp->mnt_ioflags |= MNT_IOFLAGS_FUA_SUPPORTED; |
4653 | } |
4654 | |
	if (VNOP_IOCTL(devvp, DKIOCGETIOMINSATURATIONBYTECOUNT, (caddr_t)&minsaturationbytecount, 0, ctx) == 0) {
4656 | mp->mnt_minsaturationbytecount = minsaturationbytecount; |
4657 | } else { |
4658 | mp->mnt_minsaturationbytecount = 0; |
4659 | } |
4660 | |
	if (VNOP_IOCTL(devvp, DKIOCCORESTORAGE, (caddr_t)&cs_info, 0, ctx) == 0) {
4662 | cs_present = TRUE; |
4663 | } |
4664 | |
4665 | if (features & DK_FEATURE_UNMAP) { |
4666 | mp->mnt_ioflags |= MNT_IOFLAGS_UNMAP_SUPPORTED; |
4667 | |
4668 | if (cs_present == TRUE) { |
4669 | mp->mnt_ioflags |= MNT_IOFLAGS_CSUNMAP_SUPPORTED; |
4670 | } |
4671 | } |
4672 | if (cs_present == TRUE) { |
4673 | /* |
4674 | * for now we'll use the following test as a proxy for |
4675 | * the underlying drive being FUSION in nature |
4676 | */ |
4677 | if ((cs_info.flags & DK_CORESTORAGE_PIN_YOUR_METADATA)) { |
4678 | mp->mnt_ioflags |= MNT_IOFLAGS_FUSION_DRIVE; |
4679 | } |
4680 | } else { |
4681 | /* Check for APFS Fusion */ |
4682 | dk_apfs_flavour_t flavour; |
		if ((VNOP_IOCTL(devvp, DKIOCGETAPFSFLAVOUR, (caddr_t)&flavour, 0, ctx) == 0) &&
4684 | (flavour == DK_APFS_FUSION)) { |
4685 | mp->mnt_ioflags |= MNT_IOFLAGS_FUSION_DRIVE; |
4686 | } |
4687 | } |
4688 | |
	if (VNOP_IOCTL(devvp, DKIOCGETLOCATION, (caddr_t)&location, 0, ctx) == 0) {
4690 | if (location & DK_LOCATION_EXTERNAL) { |
4691 | mp->mnt_ioflags |= MNT_IOFLAGS_PERIPHERAL_DRIVE; |
4692 | mp->mnt_flag |= MNT_REMOVABLE; |
4693 | } |
4694 | } |
4695 | |
4696 | #if CONFIG_IOSCHED |
4697 | if (iosched_enabled && (features & DK_FEATURE_PRIORITY)) { |
4698 | mp->mnt_ioflags |= MNT_IOFLAGS_IOSCHED_SUPPORTED; |
		throttle_info_disable_throttle(mp->mnt_devbsdunit, (mp->mnt_ioflags & MNT_IOFLAGS_FUSION_DRIVE) != 0);
4700 | } |
4701 | #endif /* CONFIG_IOSCHED */ |
4702 | return error; |
4703 | } |
4704 | |
4705 | static struct klist fs_klist; |
4706 | static LCK_GRP_DECLARE(fs_klist_lck_grp, "fs_klist" ); |
4707 | static LCK_MTX_DECLARE(fs_klist_lock, &fs_klist_lck_grp); |
4708 | |
4709 | void |
4710 | vfs_event_init(void) |
4711 | { |
	klist_init(&fs_klist);
4713 | } |
4714 | |
4715 | void |
4716 | vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data) |
4717 | { |
4718 | if (event == VQ_DEAD || event == VQ_NOTRESP) { |
4719 | struct mount *mp = vfs_getvfs(fsid); |
4720 | if (mp) { |
4721 | mount_lock_spin(mp); |
4722 | if (data) { |
4723 | mp->mnt_kern_flag &= ~MNT_LNOTRESP; // Now responding |
4724 | } else { |
4725 | mp->mnt_kern_flag |= MNT_LNOTRESP; // Not responding |
4726 | } |
4727 | mount_unlock(mp); |
4728 | } |
4729 | } |
4730 | |
	lck_mtx_lock(&fs_klist_lock);
	KNOTE(&fs_klist, event);
	lck_mtx_unlock(&fs_klist_lock);
4734 | } |
4735 | |
4736 | /* |
4737 | * return the number of mounted filesystems. |
4738 | */ |
4739 | static int |
4740 | sysctl_vfs_getvfscnt(void) |
4741 | { |
4742 | return mount_getvfscnt(); |
4743 | } |
4744 | |
4745 | |
4746 | static int |
4747 | mount_getvfscnt(void) |
4748 | { |
4749 | int ret; |
4750 | |
4751 | mount_list_lock(); |
4752 | ret = nummounts; |
4753 | mount_list_unlock(); |
4754 | return ret; |
4755 | } |
4756 | |
4757 | |
4758 | |
4759 | static int |
4760 | mount_fillfsids(fsid_t *fsidlst, int count) |
4761 | { |
4762 | struct mount *mp; |
	int actual = 0;

4766 | mount_list_lock(); |
4767 | TAILQ_FOREACH(mp, &mountlist, mnt_list) { |
4768 | if (actual < count) { |
4769 | fsidlst[actual] = mp->mnt_vfsstat.f_fsid; |
4770 | actual++; |
4771 | } |
4772 | } |
4773 | mount_list_unlock(); |
4774 | return actual; |
4775 | } |
4776 | |
4777 | /* |
4778 | * fill in the array of fsid_t's up to a max of 'count', the actual |
4779 | * number filled in will be set in '*actual'. If there are more fsid_t's |
4780 | * than room in fsidlst then ENOMEM will be returned and '*actual' will |
4781 | * have the actual count. |
 * Callers depend on *actual being filled out even in the error case.
4783 | */ |
4784 | static int |
4785 | sysctl_vfs_getvfslist(fsid_t *fsidlst, unsigned long count, unsigned long *actual) |
4786 | { |
4787 | struct mount *mp; |
4788 | |
4789 | *actual = 0; |
4790 | mount_list_lock(); |
4791 | TAILQ_FOREACH(mp, &mountlist, mnt_list) { |
4792 | (*actual)++; |
4793 | if (*actual <= count) { |
4794 | fsidlst[(*actual) - 1] = mp->mnt_vfsstat.f_fsid; |
4795 | } |
4796 | } |
4797 | mount_list_unlock(); |
4798 | return *actual <= count ? 0 : ENOMEM; |
4799 | } |
4800 | |
4801 | static int |
4802 | sysctl_vfs_vfslist(__unused struct sysctl_oid *oidp, __unused void *arg1, |
4803 | __unused int arg2, struct sysctl_req *req) |
4804 | { |
4805 | unsigned long actual; |
4806 | int error; |
4807 | size_t space; |
4808 | fsid_t *fsidlst; |
4809 | |
4810 | /* This is a readonly node. */ |
4811 | if (req->newptr != USER_ADDR_NULL) { |
4812 | return EPERM; |
4813 | } |
4814 | |
4815 | /* they are querying us so just return the space required. */ |
4816 | if (req->oldptr == USER_ADDR_NULL) { |
4817 | req->oldidx = sysctl_vfs_getvfscnt() * sizeof(fsid_t); |
4818 | return 0; |
4819 | } |
4820 | again: |
4821 | /* |
4822 | * Retrieve an accurate count of the amount of space required to copy |
4823 | * out all the fsids in the system. |
4824 | */ |
4825 | space = req->oldlen; |
4826 | req->oldlen = sysctl_vfs_getvfscnt() * sizeof(fsid_t); |
4827 | |
4828 | /* they didn't give us enough space. */ |
4829 | if (space < req->oldlen) { |
4830 | return ENOMEM; |
4831 | } |
4832 | |
4833 | fsidlst = kalloc_data(req->oldlen, Z_WAITOK | Z_ZERO); |
4834 | if (fsidlst == NULL) { |
4835 | return ENOMEM; |
4836 | } |
4837 | |
	error = sysctl_vfs_getvfslist(fsidlst, req->oldlen / sizeof(fsid_t),
	    &actual);
4840 | /* |
4841 | * If we get back ENOMEM, then another mount has been added while we |
 * slept in the allocation above. If this is the case then try again.
4843 | */ |
4844 | if (error == ENOMEM) { |
4845 | kfree_data(fsidlst, req->oldlen); |
4846 | req->oldlen = space; |
4847 | goto again; |
4848 | } |
4849 | if (error == 0) { |
4850 | error = SYSCTL_OUT(req, fsidlst, actual * sizeof(fsid_t)); |
4851 | } |
4852 | kfree_data(fsidlst, req->oldlen); |
4853 | return error; |
4854 | } |
4855 | |
4856 | /* |
4857 | * Do a sysctl by fsid. |
4858 | */ |
4859 | static int |
4860 | sysctl_vfs_ctlbyfsid(__unused struct sysctl_oid *oidp, void *arg1, int arg2, |
4861 | struct sysctl_req *req) |
4862 | { |
4863 | union union_vfsidctl vc; |
4864 | struct mount *mp = NULL; |
4865 | struct vfsstatfs *sp; |
4866 | int *name, namelen; |
4867 | int flags = 0; |
4868 | int error = 0, gotref = 0; |
4869 | vfs_context_t ctx = vfs_context_current(); |
4870 | proc_t p = req->p; /* XXX req->p != current_proc()? */ |
4871 | boolean_t is_64_bit; |
4872 | union { |
4873 | struct statfs64 sfs64; |
4874 | struct user64_statfs osfs64; |
4875 | struct user32_statfs osfs32; |
4876 | } *sfsbuf; |
4877 | |
4878 | if (req->newptr == USER_ADDR_NULL) { |
4879 | error = EINVAL; |
4880 | goto out; |
4881 | } |
4882 | |
4883 | name = arg1; |
4884 | namelen = arg2; |
4885 | is_64_bit = proc_is64bit(p); |
4886 | |
	error = SYSCTL_IN(req, &vc, is_64_bit ? sizeof(vc.vc64) : sizeof(vc.vc32));
4888 | if (error) { |
4889 | goto out; |
4890 | } |
4891 | if (vc.vc32.vc_vers != VFS_CTL_VERS1) { /* works for 32 and 64 */ |
4892 | error = EINVAL; |
4893 | goto out; |
4894 | } |
4895 | mp = mount_list_lookupby_fsid(&vc.vc32.vc_fsid, 0, 1); /* works for 32 and 64 */ |
4896 | if (mp == NULL) { |
4897 | error = ENOENT; |
4898 | goto out; |
4899 | } |
4900 | gotref = 1; |
4901 | /* reset so that the fs specific code can fetch it. */ |
4902 | req->newidx = 0; |
4903 | /* |
4904 | * Note if this is a VFS_CTL then we pass the actual sysctl req |
4905 | * in for "oldp" so that the lower layer can DTRT and use the |
4906 | * SYSCTL_IN/OUT routines. |
4907 | */ |
4908 | if (mp->mnt_op->vfs_sysctl != NULL) { |
4909 | if (is_64_bit) { |
4910 | if (vfs_64bitready(mp)) { |
4911 | error = mp->mnt_op->vfs_sysctl(name, namelen, |
4912 | CAST_USER_ADDR_T(req), |
4913 | NULL, USER_ADDR_NULL, 0, |
4914 | ctx); |
4915 | } else { |
4916 | error = ENOTSUP; |
4917 | } |
4918 | } else { |
4919 | error = mp->mnt_op->vfs_sysctl(name, namelen, |
4920 | CAST_USER_ADDR_T(req), |
4921 | NULL, USER_ADDR_NULL, 0, |
4922 | ctx); |
4923 | } |
4924 | if (error != ENOTSUP) { |
4925 | goto out; |
4926 | } |
4927 | } |
4928 | switch (name[0]) { |
4929 | case VFS_CTL_UMOUNT: |
4930 | #if CONFIG_MACF |
4931 | error = mac_mount_check_umount(ctx, mp); |
4932 | if (error != 0) { |
4933 | goto out; |
4934 | } |
4935 | #endif |
4936 | req->newidx = 0; |
4937 | if (is_64_bit) { |
4938 | req->newptr = vc.vc64.vc_ptr; |
4939 | req->newlen = (size_t)vc.vc64.vc_len; |
4940 | } else { |
4941 | req->newptr = CAST_USER_ADDR_T(vc.vc32.vc_ptr); |
4942 | req->newlen = vc.vc32.vc_len; |
4943 | } |
4944 | error = SYSCTL_IN(req, &flags, sizeof(flags)); |
4945 | if (error) { |
4946 | break; |
4947 | } |
4948 | |
		mount_ref(mp, 0);
4950 | mount_iterdrop(mp); |
4951 | gotref = 0; |
4952 | /* safedounmount consumes a ref */ |
4953 | error = safedounmount(mp, flags, ctx); |
4954 | break; |
4955 | case VFS_CTL_OSTATFS: |
4956 | case VFS_CTL_STATFS64: |
4957 | #if CONFIG_MACF |
4958 | error = mac_mount_check_stat(ctx, mp); |
4959 | if (error != 0) { |
4960 | break; |
4961 | } |
4962 | #endif |
4963 | req->newidx = 0; |
4964 | if (is_64_bit) { |
4965 | req->newptr = vc.vc64.vc_ptr; |
4966 | req->newlen = (size_t)vc.vc64.vc_len; |
4967 | } else { |
4968 | req->newptr = CAST_USER_ADDR_T(vc.vc32.vc_ptr); |
4969 | req->newlen = vc.vc32.vc_len; |
4970 | } |
4971 | error = SYSCTL_IN(req, &flags, sizeof(flags)); |
4972 | if (error) { |
4973 | break; |
4974 | } |
4975 | sp = &mp->mnt_vfsstat; |
4976 | if (((flags & MNT_NOWAIT) == 0 || (flags & (MNT_WAIT | MNT_DWAIT))) && |
4977 | (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT))) { |
4978 | goto out; |
4979 | } |
4980 | |
4981 | sfsbuf = kalloc_type(typeof(*sfsbuf), Z_WAITOK); |
4982 | |
4983 | if (name[0] == VFS_CTL_STATFS64) { |
4984 | struct statfs64 *sfs = &sfsbuf->sfs64; |
4985 | |
4986 | vfs_get_statfs64(mp, sfs); |
4987 | error = SYSCTL_OUT(req, sfs, sizeof(*sfs)); |
4988 | } else if (is_64_bit) { |
4989 | struct user64_statfs *sfs = &sfsbuf->osfs64; |
4990 | |
			bzero(sfs, sizeof(*sfs));
4992 | sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; |
4993 | sfs->f_type = (short)mp->mnt_vtable->vfc_typenum; |
4994 | sfs->f_bsize = (user64_long_t)sp->f_bsize; |
4995 | sfs->f_iosize = (user64_long_t)sp->f_iosize; |
4996 | sfs->f_blocks = (user64_long_t)sp->f_blocks; |
4997 | sfs->f_bfree = (user64_long_t)sp->f_bfree; |
4998 | sfs->f_bavail = (user64_long_t)sp->f_bavail; |
4999 | sfs->f_files = (user64_long_t)sp->f_files; |
5000 | sfs->f_ffree = (user64_long_t)sp->f_ffree; |
5001 | sfs->f_fsid = sp->f_fsid; |
5002 | sfs->f_owner = sp->f_owner; |
			vfs_getfstypename(mp, sfs->f_fstypename, MFSNAMELEN);
			strlcpy(sfs->f_mntonname, sp->f_mntonname, MNAMELEN);
			strlcpy(sfs->f_mntfromname, sp->f_mntfromname, MNAMELEN);
5006 | |
5007 | error = SYSCTL_OUT(req, sfs, sizeof(*sfs)); |
5008 | } else { |
5009 | struct user32_statfs *sfs = &sfsbuf->osfs32; |
5010 | long temp; |
5011 | |
			bzero(sfs, sizeof(*sfs));
5013 | sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; |
5014 | sfs->f_type = (short)mp->mnt_vtable->vfc_typenum; |
5015 | |
5016 | /* |
			 * It's possible for there to be more than 2^31 blocks in the filesystem, so we
5018 | * have to fudge the numbers here in that case. We inflate the blocksize in order |
5019 | * to reflect the filesystem size as best we can. |
5020 | */ |
5021 | if (sp->f_blocks > INT_MAX) { |
5022 | int shift; |
5023 | |
5024 | /* |
5025 | * Work out how far we have to shift the block count down to make it fit. |
5026 | * Note that it's possible to have to shift so far that the resulting |
5027 | * blocksize would be unreportably large. At that point, we will clip |
5028 | * any values that don't fit. |
5029 | * |
5030 | * For safety's sake, we also ensure that f_iosize is never reported as |
5031 | * being smaller than f_bsize. |
5032 | */ |
5033 | for (shift = 0; shift < 32; shift++) { |
5034 | if ((sp->f_blocks >> shift) <= INT_MAX) { |
5035 | break; |
5036 | } |
5037 | if ((((long long)sp->f_bsize) << (shift + 1)) > INT_MAX) { |
5038 | break; |
5039 | } |
5040 | } |
5041 | #define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s))) |
5042 | sfs->f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sp->f_blocks, shift); |
5043 | sfs->f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sp->f_bfree, shift); |
5044 | sfs->f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sp->f_bavail, shift); |
5045 | #undef __SHIFT_OR_CLIP |
5046 | sfs->f_bsize = (user32_long_t)(sp->f_bsize << shift); |
				temp = lmax(sp->f_iosize, sp->f_bsize);
5048 | if (temp > INT32_MAX) { |
5049 | error = EINVAL; |
5050 | kfree_type(typeof(*sfsbuf), sfsbuf); |
5051 | goto out; |
5052 | } |
5053 | sfs->f_iosize = (user32_long_t)temp; |
5054 | } else { |
5055 | sfs->f_bsize = (user32_long_t)sp->f_bsize; |
5056 | sfs->f_iosize = (user32_long_t)sp->f_iosize; |
5057 | sfs->f_blocks = (user32_long_t)sp->f_blocks; |
5058 | sfs->f_bfree = (user32_long_t)sp->f_bfree; |
5059 | sfs->f_bavail = (user32_long_t)sp->f_bavail; |
5060 | } |
5061 | sfs->f_files = (user32_long_t)sp->f_files; |
5062 | sfs->f_ffree = (user32_long_t)sp->f_ffree; |
5063 | sfs->f_fsid = sp->f_fsid; |
5064 | sfs->f_owner = sp->f_owner; |
5065 | |
			vfs_getfstypename(mp, sfs->f_fstypename, MFSNAMELEN);
			strlcpy(sfs->f_mntonname, sp->f_mntonname, MNAMELEN);
			strlcpy(sfs->f_mntfromname, sp->f_mntfromname, MNAMELEN);
5069 | |
5070 | error = SYSCTL_OUT(req, sfs, sizeof(*sfs)); |
5071 | } |
5072 | kfree_type(typeof(*sfsbuf), sfsbuf); |
5073 | break; |
5074 | default: |
5075 | error = ENOTSUP; |
5076 | goto out; |
5077 | } |
5078 | out: |
5079 | if (gotref != 0) { |
5080 | mount_iterdrop(mp); |
5081 | } |
5082 | return error; |
5083 | } |
5084 | |
5085 | static int filt_fsattach(struct knote *kn, struct kevent_qos_s *kev); |
5086 | static void filt_fsdetach(struct knote *kn); |
5087 | static int filt_fsevent(struct knote *kn, long hint); |
5088 | static int filt_fstouch(struct knote *kn, struct kevent_qos_s *kev); |
5089 | static int filt_fsprocess(struct knote *kn, struct kevent_qos_s *kev); |
5090 | SECURITY_READ_ONLY_EARLY(struct filterops) fs_filtops = { |
5091 | .f_attach = filt_fsattach, |
5092 | .f_detach = filt_fsdetach, |
5093 | .f_event = filt_fsevent, |
5094 | .f_touch = filt_fstouch, |
5095 | .f_process = filt_fsprocess, |
5096 | }; |
5097 | |
5098 | static int |
5099 | filt_fsattach(struct knote *kn, __unused struct kevent_qos_s *kev) |
5100 | { |
5101 | kn->kn_flags |= EV_CLEAR; /* automatic */ |
5102 | kn->kn_sdata = 0; /* incoming data is ignored */ |
5103 | |
	lck_mtx_lock(&fs_klist_lock);
	KNOTE_ATTACH(&fs_klist, kn);
	lck_mtx_unlock(&fs_klist_lock);
5107 | |
5108 | /* |
5109 | * filter only sees future events, |
5110 | * so it can't be fired already. |
5111 | */ |
5112 | return 0; |
5113 | } |
5114 | |
5115 | static void |
5116 | filt_fsdetach(struct knote *kn) |
5117 | { |
	lck_mtx_lock(&fs_klist_lock);
	KNOTE_DETACH(&fs_klist, kn);
	lck_mtx_unlock(&fs_klist_lock);
5121 | } |
5122 | |
5123 | static int |
5124 | filt_fsevent(struct knote *kn, long hint) |
5125 | { |
5126 | /* |
5127 | * Backwards compatibility: |
5128 | * Other filters would do nothing if kn->kn_sfflags == 0 |
5129 | */ |
5130 | |
5131 | if ((kn->kn_sfflags == 0) || (kn->kn_sfflags & hint)) { |
5132 | kn->kn_fflags |= hint; |
5133 | } |
5134 | |
5135 | return kn->kn_fflags != 0; |
5136 | } |
5137 | |
5138 | static int |
5139 | filt_fstouch(struct knote *kn, struct kevent_qos_s *kev) |
5140 | { |
5141 | int res; |
5142 | |
	lck_mtx_lock(&fs_klist_lock);
5144 | |
5145 | kn->kn_sfflags = kev->fflags; |
5146 | |
5147 | /* |
5148 | * the above filter function sets bits even if nobody is looking for them. |
5149 | * Just preserve those bits even in the new mask is more selective |
5150 | * than before. |
5151 | * |
5152 | * For compatibility with previous implementations, we leave kn_fflags |
5153 | * as they were before. |
5154 | */ |
5155 | //if (kn->kn_sfflags) |
5156 | // kn->kn_fflags &= kn->kn_sfflags; |
5157 | res = (kn->kn_fflags != 0); |
5158 | |
	lck_mtx_unlock(&fs_klist_lock);
5160 | |
5161 | return res; |
5162 | } |
5163 | |
5164 | static int |
5165 | filt_fsprocess(struct knote *kn, struct kevent_qos_s *kev) |
5166 | { |
5167 | int res = 0; |
5168 | |
	lck_mtx_lock(&fs_klist_lock);
	if (kn->kn_fflags) {
		knote_fill_kevent(kn, kev, 0);
		res = 1;
	}
	lck_mtx_unlock(&fs_klist_lock);
5175 | return res; |
5176 | } |
5177 | |
5178 | static int |
5179 | sysctl_vfs_noremotehang(__unused struct sysctl_oid *oidp, |
5180 | __unused void *arg1, __unused int arg2, struct sysctl_req *req) |
5181 | { |
5182 | int out, error; |
5183 | pid_t pid; |
5184 | proc_t p; |
5185 | |
5186 | /* We need a pid. */ |
5187 | if (req->newptr == USER_ADDR_NULL) { |
5188 | return EINVAL; |
5189 | } |
5190 | |
5191 | error = SYSCTL_IN(req, &pid, sizeof(pid)); |
5192 | if (error) { |
5193 | return error; |
5194 | } |
5195 | |
	p = proc_find(pid < 0 ? -pid : pid);
5197 | if (p == NULL) { |
5198 | return ESRCH; |
5199 | } |
5200 | |
5201 | /* |
5202 | * Fetching the value is ok, but we only fetch if the old |
5203 | * pointer is given. |
5204 | */ |
5205 | if (req->oldptr != USER_ADDR_NULL) { |
5206 | out = !((p->p_flag & P_NOREMOTEHANG) == 0); |
5207 | proc_rele(p); |
5208 | error = SYSCTL_OUT(req, &out, sizeof(out)); |
5209 | return error; |
5210 | } |
5211 | |
5212 | /* cansignal offers us enough security. */ |
	if (p != req->p && proc_suser(req->p) != 0) {
5214 | proc_rele(p); |
5215 | return EPERM; |
5216 | } |
5217 | |
5218 | if (pid < 0) { |
5219 | OSBitAndAtomic(~((uint32_t)P_NOREMOTEHANG), &p->p_flag); |
5220 | } else { |
5221 | OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag); |
5222 | } |
5223 | proc_rele(p); |
5224 | |
5225 | return 0; |
5226 | } |
5227 | |
5228 | static int |
5229 | sysctl_vfs_generic_conf SYSCTL_HANDLER_ARGS |
5230 | { |
5231 | int *name, namelen; |
5232 | struct vfstable *vfsp; |
5233 | struct vfsconf vfsc = {}; |
5234 | |
5235 | (void)oidp; |
5236 | name = arg1; |
5237 | namelen = arg2; |
5238 | |
5239 | if (namelen < 1) { |
5240 | return EISDIR; |
5241 | } else if (namelen > 1) { |
5242 | return ENOTDIR; |
5243 | } |
5244 | |
5245 | mount_list_lock(); |
5246 | for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { |
5247 | if (vfsp->vfc_typenum == name[0]) { |
5248 | break; |
5249 | } |
5250 | } |
5251 | |
5252 | if (vfsp == NULL) { |
5253 | mount_list_unlock(); |
5254 | return ENOTSUP; |
5255 | } |
5256 | |
5257 | vfsc.vfc_reserved1 = 0; |
	bcopy(vfsp->vfc_name, vfsc.vfc_name, sizeof(vfsc.vfc_name));
5259 | vfsc.vfc_typenum = vfsp->vfc_typenum; |
5260 | vfsc.vfc_refcount = vfsp->vfc_refcount; |
5261 | vfsc.vfc_flags = vfsp->vfc_flags; |
5262 | vfsc.vfc_reserved2 = 0; |
5263 | vfsc.vfc_reserved3 = 0; |
5264 | |
5265 | mount_list_unlock(); |
5266 | return SYSCTL_OUT(req, &vfsc, sizeof(struct vfsconf)); |
5267 | } |
5268 | |
5269 | /* the vfs.generic. branch. */ |
5270 | SYSCTL_EXTENSIBLE_NODE(_vfs, VFS_GENERIC, generic, |
    CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs generic hinge");
/* retrieve a list of mounted filesystem fsid_t */
SYSCTL_PROC(_vfs_generic, OID_AUTO, vfsidlist,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    NULL, 0, sysctl_vfs_vfslist, "S,fsid", "List of mounted filesystem ids");
/* perform operations on filesystem via fsid_t */
SYSCTL_NODE(_vfs_generic, OID_AUTO, ctlbyfsid, CTLFLAG_RW | CTLFLAG_LOCKED,
    sysctl_vfs_ctlbyfsid, "ctlbyfsid");
SYSCTL_PROC(_vfs_generic, OID_AUTO, noremotehang, CTLFLAG_RW | CTLFLAG_ANYBODY,
    NULL, 0, sysctl_vfs_noremotehang, "I", "noremotehang");
SYSCTL_INT(_vfs_generic, VFS_MAXTYPENUM, maxtypenum,
    CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
    &maxvfstypenum, 0, "");
SYSCTL_INT(_vfs_generic, OID_AUTO, sync_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, &sync_timeout_seconds, 0, "");
SYSCTL_NODE(_vfs_generic, VFS_CONF, conf,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    sysctl_vfs_generic_conf, "");
5288 | #if DEVELOPMENT || DEBUG |
5289 | SYSCTL_INT(_vfs_generic, OID_AUTO, print_busy_vnodes, |
5290 | CTLTYPE_INT | CTLFLAG_RW, |
5291 | &print_busy_vnodes, 0, |
5292 | "VFS log busy vnodes blocking unmount" ); |
5293 | #endif |
5294 | |
5295 | /* Indicate that the root file system unmounted cleanly */ |
5296 | static int vfs_root_unmounted_cleanly = 0; |
SYSCTL_INT(_vfs_generic, OID_AUTO, root_unmounted_cleanly, CTLFLAG_RD, &vfs_root_unmounted_cleanly, 0, "Root filesystem was unmounted cleanly");
5298 | |
5299 | void |
5300 | vfs_set_root_unmounted_cleanly(void) |
5301 | { |
5302 | vfs_root_unmounted_cleanly = 1; |
5303 | } |
5304 | |
5305 | /* |
5306 | * Print vnode state. |
5307 | */ |
5308 | void |
5309 | vn_print_state(struct vnode *vp, const char *fmt, ...) |
5310 | { |
5311 | va_list ap; |
	char perm_str[] = "(VM_KERNEL_ADDRPERM pointer)";
5313 | char fs_name[MFSNAMELEN]; |
5314 | |
5315 | va_start(ap, fmt); |
5316 | vprintf(fmt, ap); |
5317 | va_end(ap); |
5318 | printf("vp 0x%0llx %s: " , (uint64_t)VM_KERNEL_ADDRPERM(vp), perm_str); |
5319 | printf("tag %d, type %d\n" , vp->v_tag, vp->v_type); |
5320 | /* Counts .. */ |
5321 | printf(" iocount %d, usecount %d, kusecount %d references %d\n" , |
5322 | vp->v_iocount, vp->v_usecount, vp->v_kusecount, vp->v_references); |
5323 | printf(" writecount %d, numoutput %d\n" , vp->v_writecount, |
5324 | vp->v_numoutput); |
5325 | /* Flags */ |
5326 | printf(" flag 0x%x, lflag 0x%x, listflag 0x%x\n" , vp->v_flag, |
5327 | vp->v_lflag, vp->v_listflag); |
5328 | |
5329 | if (vp->v_mount == NULL || vp->v_mount == dead_mountp) { |
		strlcpy(fs_name, "deadfs", MFSNAMELEN);
	} else {
		vfs_name(vp->v_mount, fs_name);
5333 | } |
5334 | |
5335 | printf(" v_data 0x%0llx %s\n" , |
5336 | (vp->v_data ? (uint64_t)VM_KERNEL_ADDRPERM(vp->v_data) : 0), |
5337 | perm_str); |
5338 | printf(" v_mount 0x%0llx %s vfs_name %s\n" , |
5339 | (vp->v_mount ? (uint64_t)VM_KERNEL_ADDRPERM(vp->v_mount) : 0), |
5340 | perm_str, fs_name); |
5341 | } |
5342 | |
5343 | long num_reusedvnodes = 0; |
5344 | |
5345 | |
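/*
 * Pull a candidate vnode off whichever list it is on and prepare it for
 * reuse. Called with the vnode_list_lock held; drops it. Returns NULLVP
 * if the candidate was lost to a race, was handed off to the async
 * worker (*deferred is set in that case), or if want_vp is zero.
 * Otherwise the reclaimed vnode is returned locked, with a holdcount
 * that the caller must drop.
 */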
5346 | static vnode_t |
5347 | process_vp(vnode_t vp, int want_vp, bool can_defer, int *deferred) |
5348 | { |
5349 | unsigned int vpid; |
5350 | |
5351 | *deferred = 0; |
5352 | |
5353 | vpid = vp->v_id; |
5354 | |
5355 | vnode_list_remove_locked(vp); |
5356 | |
5357 | vnode_hold(vp); |
5358 | vnode_list_unlock(); |
5359 | |
5360 | vnode_lock_spin(vp); |
5361 | |
5362 | /* |
	 * We may have had to wait for the vnode_lock after removing the vp from
	 * the freelist, and the vid is bumped only at the very end of reclaim.
	 * So it is possible that we are looking at a vnode that is being
	 * terminated. If so, skip it.
5366 | */ |
5367 | if ((vpid != vp->v_id) || (vp->v_usecount != 0) || (vp->v_iocount != 0) || |
5368 | VONLIST(vp) || (vp->v_lflag & VL_TERMINATE)) { |
5369 | /* |
5370 | * we lost the race between dropping the list lock |
5371 | * and picking up the vnode_lock... someone else |
5372 | * used this vnode and it is now in a new state |
5373 | */ |
5374 | vnode_drop_and_unlock(vp); |
5375 | |
5376 | return NULLVP; |
5377 | } |
5378 | if ((vp->v_lflag & (VL_NEEDINACTIVE | VL_MARKTERM)) == VL_NEEDINACTIVE) { |
5379 | /* |
5380 | * we did a vnode_rele_ext that asked for |
5381 | * us not to reenter the filesystem during |
5382 | * the release even though VL_NEEDINACTIVE was |
5383 | * set... we'll do it here by doing a |
5384 | * vnode_get/vnode_put |
5385 | * |
5386 | * pick up an iocount so that we can call |
5387 | * vnode_put and drive the VNOP_INACTIVE... |
5388 | * vnode_put will either leave us off |
5389 | * the freelist if a new ref comes in, |
5390 | * or put us back on the end of the freelist |
5391 | * or recycle us if we were marked for termination... |
5392 | * so we'll just go grab a new candidate |
5393 | */ |
5394 | vp->v_iocount++; |
5395 | #ifdef CONFIG_IOCOUNT_TRACE |
5396 | record_vp(vp, 1); |
5397 | #endif |
5398 | vnode_put_locked(vp); |
5399 | vnode_drop_and_unlock(vp); |
5400 | |
5401 | return NULLVP; |
5402 | } |
5403 | /* |
5404 | * Checks for anyone racing us for recycle |
5405 | */ |
5406 | if (vp->v_type != VBAD) { |
5407 | if ((want_vp || can_defer) && (vnode_on_reliable_media(vp) == FALSE || (vp->v_flag & VISDIRTY))) { |
5408 | vnode_async_list_add(vp); |
5409 | vnode_drop_and_unlock(vp); |
5410 | |
5411 | *deferred = 1; |
5412 | |
5413 | return NULLVP; |
5414 | } |
5415 | if (vp->v_lflag & VL_DEAD) { |
5416 | panic("new_vnode(%p): the vnode is VL_DEAD but not VBAD" , vp); |
5417 | } |
5418 | |
5419 | vnode_lock_convert(vp); |
5420 | (void)vnode_reclaim_internal(vp, 1, want_vp, 0); |
5421 | |
5422 | if (want_vp) { |
			if ((VONLIST(vp))) {
				panic("new_vnode(%p): vp on list", vp);
			}
			if (vp->v_usecount || vp->v_iocount || vp->v_kusecount ||
			    (vp->v_lflag & (VNAMED_UBC | VNAMED_MOUNT | VNAMED_FSHASH))) {
				panic("new_vnode(%p): free vnode still referenced", vp);
			}
			if ((vp->v_mntvnodes.tqe_prev != 0) && (vp->v_mntvnodes.tqe_next != 0)) {
				panic("new_vnode(%p): vnode seems to be on mount list", vp);
			}
			if (!LIST_EMPTY(&vp->v_nclinks) || !TAILQ_EMPTY(&vp->v_ncchildren)) {
				panic("new_vnode(%p): vnode still hooked into the name cache", vp);
			}
5435 | } |
5436 | } else { |
5437 | vnode_drop_and_unlock(vp); |
5438 | vp = NULLVP; |
5439 | } |
5440 | } |
5441 | return vp; |
5442 | } |
5443 | |
5444 | __attribute__((noreturn)) |
5445 | static void |
5446 | async_work_continue(void) |
5447 | { |
5448 | struct async_work_lst *q; |
5449 | int deferred; |
5450 | vnode_t vp; |
5451 | |
5452 | q = &vnode_async_work_list; |
5453 | |
5454 | for (;;) { |
5455 | vnode_list_lock(); |
5456 | |
5457 | if (TAILQ_EMPTY(q)) { |
			assert_wait(q, (THREAD_UNINT));

			vnode_list_unlock();

			thread_block((thread_continue_t)async_work_continue);

			continue;
		}
		async_work_handled++;

		vp = TAILQ_FIRST(q);

		vp = process_vp(vp, 0, false, &deferred);

		if (vp != NULLVP) {
			panic("found VBAD vp (%p) on async queue", vp);
5474 | } |
5475 | } |
5476 | } |
5477 | |
5478 | #if CONFIG_JETSAM |
5479 | bool do_async_jetsam = false; |
5480 | #endif |
5481 | |
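/*
 * The laundry thread works the queues in priority order: vnodes queued
 * for async reclaim first, then the rapid-age queue, then the regular
 * free list. It parks on the free-list event when there is nothing to
 * do or enough dead vnodes are already available.
 */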
5482 | __attribute__((noreturn)) |
5483 | static void |
5484 | vn_laundry_continue(void) |
5485 | { |
5486 | struct freelst *free_q; |
5487 | struct ragelst *rage_q; |
5488 | vnode_t vp; |
5489 | int deferred; |
5490 | bool rage_q_empty; |
5491 | bool free_q_empty; |
5492 | |
5493 | |
5494 | free_q = &vnode_free_list; |
5495 | rage_q = &vnode_rage_list; |
5496 | |
5497 | for (;;) { |
5498 | vnode_list_lock(); |
5499 | |
5500 | #if CONFIG_JETSAM |
5501 | if (do_async_jetsam) { |
5502 | do_async_jetsam = false; |
5503 | if (deadvnodes <= deadvnodes_low) { |
5504 | vnode_list_unlock(); |
5505 | |
5506 | log(LOG_EMERG, "Initiating vnode jetsam : %d desired, %ld numvnodes, " |
5507 | "%ld free, %ld dead, %ld async, %d rage\n" , |
5508 | desiredvnodes, numvnodes, freevnodes, deadvnodes, async_work_vnodes, ragevnodes); |
5509 | |
5510 | memorystatus_kill_on_vnode_limit(); |
5511 | |
5512 | continue; |
5513 | } |
5514 | } |
5515 | #endif |
5516 | |
5517 | if (!TAILQ_EMPTY(&vnode_async_work_list)) { |
5518 | vp = TAILQ_FIRST(&vnode_async_work_list); |
5519 | async_work_handled++; |
5520 | |
			vp = process_vp(vp, 0, false, &deferred);

			if (vp != NULLVP) {
				panic("found VBAD vp (%p) on async queue", vp);
5525 | } |
5526 | continue; |
5527 | } |
5528 | |
5529 | free_q_empty = TAILQ_EMPTY(free_q); |
5530 | rage_q_empty = TAILQ_EMPTY(rage_q); |
5531 | |
5532 | if (!rage_q_empty && !free_q_empty) { |
5533 | struct timeval current_tv; |
5534 | |
			microuptime(&current_tv);
5536 | if (ragevnodes < rage_limit && |
5537 | ((current_tv.tv_sec - rage_tv.tv_sec) < RAGE_TIME_LIMIT)) { |
5538 | rage_q_empty = true; |
5539 | } |
5540 | } |
5541 | |
5542 | if (numvnodes < numvnodes_min || (rage_q_empty && free_q_empty) || |
5543 | (reusablevnodes <= reusablevnodes_max && deadvnodes >= deadvnodes_high)) { |
			assert_wait(free_q, (THREAD_UNINT));

			vnode_list_unlock();

			thread_block((thread_continue_t)vn_laundry_continue);
5549 | |
5550 | continue; |
5551 | } |
5552 | |
5553 | if (!rage_q_empty) { |
5554 | vp = TAILQ_FIRST(rage_q); |
5555 | } else { |
5556 | vp = TAILQ_FIRST(free_q); |
5557 | } |
5558 | |
		vp = process_vp(vp, 0, false, &deferred);
5560 | |
5561 | if (vp != NULLVP) { |
5562 | /* If process_vp returns a vnode, it is locked and has a holdcount */ |
5563 | vnode_drop_and_unlock(vp); |
5564 | vp = NULLVP; |
5565 | } |
5566 | } |
5567 | } |
5568 | |
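/*
 * Wake the laundry thread when dead vnodes have been queued for
 * deallocation, or when the dead pool is running low while the table has
 * grown past its floor and reusable vnodes are plentiful (or the table
 * has reached its desired size).
 */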
5569 | static inline void |
5570 | wakeup_laundry_thread() |
5571 | { |
5572 | if (deadvnodes_noreuse || (numvnodes >= numvnodes_min && deadvnodes < deadvnodes_low && |
5573 | (reusablevnodes > reusablevnodes_max || numvnodes >= desiredvnodes))) { |
		wakeup(&vnode_free_list);
5575 | } |
5576 | } |
5577 | |
5578 | /* |
5579 | * This must be called under vnode_list_lock() to prevent race when accessing |
5580 | * various vnode stats. |
5581 | */ |
5582 | static void |
5583 | send_freeable_vnodes_telemetry(void) |
5584 | { |
5585 | bool send_event = false; |
5586 | |
5587 | /* |
5588 | * Log an event when the 'numvnodes' is above the freeable vnodes threshold |
5589 | * or when it falls back within the threshold. |
5590 | * When the 'numvnodes' is above the threshold, log an event when it has |
5591 | * been incrementally growing by 25%. |
5592 | */ |
5593 | if ((numvnodes > desiredvnodes) && (freevnodes + deadvnodes) == 0) { |
5594 | long last_numvnodes = freeable_vnodes_telemetry.numvnodes; |
5595 | |
5596 | if (numvnodes > (last_numvnodes + ((last_numvnodes * 25) / 100)) || |
5597 | numvnodes >= numvnodes_max) { |
5598 | send_event = true; |
5599 | } |
5600 | freeablevnodes_threshold_crossed = true; |
5601 | } else if (freeablevnodes_threshold_crossed && |
5602 | (freevnodes + deadvnodes) > busyvnodes) { |
5603 | freeablevnodes_threshold_crossed = false; |
5604 | send_event = true; |
5605 | } |
5606 | |
5607 | if (__improbable(send_event)) { |
5608 | ca_event_t event = CA_EVENT_ALLOCATE_FLAGS(freeable_vnodes, Z_NOWAIT); |
5609 | |
5610 | if (event) { |
5611 | /* |
5612 | * Update the stats except the 'numvnodes_max' and 'desiredvnodes' |
5613 | * as they are immutable after init. |
5614 | */ |
5615 | freeable_vnodes_telemetry.numvnodes_min = numvnodes_min; |
5616 | freeable_vnodes_telemetry.numvnodes = numvnodes; |
5617 | freeable_vnodes_telemetry.freevnodes = freevnodes; |
5618 | freeable_vnodes_telemetry.deadvnodes = deadvnodes; |
5619 | freeable_vnodes_telemetry.freeablevnodes = freeablevnodes; |
5620 | freeable_vnodes_telemetry.busyvnodes = busyvnodes; |
5621 | freeable_vnodes_telemetry.threshold_crossed = |
5622 | freeablevnodes_threshold_crossed; |
5623 | |
			memcpy(event->data, &freeable_vnodes_telemetry,
			    sizeof(CA_EVENT_TYPE(freeable_vnodes)));
5626 | |
5627 | if (!freeablevnodes_threshold_crossed) { |
5628 | freeable_vnodes_telemetry.numvnodes = 0; |
5629 | } |
5630 | CA_EVENT_SEND(event); |
5631 | } |
5632 | } |
5633 | } |
5634 | |
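/*
 * Allocate or reuse a vnode, returned with an iocount of one. Roughly in
 * order of preference: reuse one from the dead list, allocate a fresh one
 * while under the desiredvnodes limit, steal one from the rapid-age or
 * free lists, and as a last resort force an allocation (possibly after
 * asking jetsam to kill something) rather than fail.
 */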
5635 | static int |
5636 | new_vnode(vnode_t *vpp, bool can_free) |
5637 | { |
5638 | long force_alloc_min; |
5639 | vnode_t vp; |
5640 | #if CONFIG_JETSAM |
	uint32_t retries = 0, max_retries = 2; /* retry in case of tablefull */
#else
	uint32_t retries = 0, max_retries = 100; /* retry in case of tablefull */
5644 | #endif |
5645 | int force_alloc = 0, walk_count = 0; |
5646 | boolean_t need_reliable_vp = FALSE; |
5647 | int deferred; |
5648 | struct timeval initial_tv; |
5649 | struct timeval current_tv; |
5650 | proc_t curproc = current_proc(); |
5651 | bool force_alloc_freeable = false; |
5652 | |
5653 | if (vn_dealloc_level == DEALLOC_VNODE_NONE) { |
5654 | can_free = false; |
5655 | } |
5656 | |
5657 | initial_tv.tv_sec = 0; |
5658 | retry: |
5659 | vp = NULLVP; |
5660 | |
5661 | vnode_list_lock(); |
5662 | newvnode++; |
5663 | |
5664 | if (need_reliable_vp == TRUE) { |
5665 | async_work_timed_out++; |
5666 | } |
5667 | |
5668 | /* |
5669 | * The vnode list lock was dropped after force_alloc_freeable was set, |
5670 | * reevaluate. |
5671 | */ |
5672 | force_alloc_min = MAX(desiredvnodes, numvnodes_min); |
5673 | if (force_alloc_freeable && |
5674 | (numvnodes < force_alloc_min || numvnodes >= numvnodes_max)) { |
5675 | force_alloc_freeable = false; |
5676 | } |
5677 | |
5678 | #if CONFIG_JETSAM |
5679 | if ((numvnodes_max > desiredvnodes) && numvnodes > (numvnodes_max - 100) |
5680 | #if (DEVELOPMENT || DEBUG) |
5681 | && !bootarg_no_vnode_jetsam |
5682 | #endif |
5683 | ) { |
5684 | do_async_jetsam = true; |
5685 | wakeup(&vnode_free_list); |
5686 | } |
5687 | #endif /* CONFIG_JETSAM */ |
5688 | |
5689 | if (((numvnodes - deadvnodes + deadvnodes_noreuse) < desiredvnodes) || |
5690 | force_alloc || force_alloc_freeable) { |
5691 | struct timespec ts; |
5692 | uint32_t vflag = 0; |
5693 | |
5694 | /* |
5695 | * Can always reuse a dead one except if it is in the process of |
5696 | * being freed or the FS cannot handle freeable vnodes. |
5697 | */ |
5698 | if (!TAILQ_EMPTY(&vnode_dead_list)) { |
5699 | /* Select an appropriate deadvnode */ |
5700 | if (numvnodes <= numvnodes_min || !can_free) { |
				/* all vnodes up to numvnodes_min are not freeable */
5702 | vp = TAILQ_FIRST(&vnode_dead_list); |
5703 | if (numvnodes > numvnodes_min && |
5704 | (vp->v_flag & VCANDEALLOC)) { |
5705 | /* |
5706 | * Freeable vnodes are added to the |
5707 | * back of the queue, so if the first |
5708 | * from the front is freeable, then |
5709 | * there are none on the dead list. |
5710 | */ |
5711 | vp = NULLVP; |
5712 | } |
5713 | } else { |
5714 | /* |
5715 | * Filesystems which opt in to freeable vnodes |
5716 | * can get either one. |
5717 | */ |
5718 | TAILQ_FOREACH_REVERSE(vp, &vnode_dead_list, |
5719 | deadlst, v_freelist) { |
5720 | if (!(vp->v_listflag & VLIST_NO_REUSE)) { |
5721 | break; |
5722 | } |
5723 | } |
5724 | } |
5725 | |
5726 | if (vp) { |
5727 | force_alloc_freeable = false; |
5728 | goto steal_this_vp; |
5729 | } |
5730 | } |
5731 | |
5732 | /* |
5733 | * no dead vnodes available... if we're under |
5734 | * the limit, we'll create a new vnode |
5735 | */ |
5736 | numvnodes++; |
5737 | if (force_alloc) { |
5738 | numvnodes_min++; |
5739 | } else if (can_free && (numvnodes > numvnodes_min)) { |
5740 | allocedvnodes++; |
5741 | freeablevnodes++; |
5742 | vflag = VCANDEALLOC; |
5743 | |
5744 | send_freeable_vnodes_telemetry(); |
5745 | } |
5746 | vnode_list_unlock(); |
5747 | |
5748 | if (nc_smr_enabled) { |
5749 | vp = zalloc_smr(vnode_zone, Z_WAITOK_ZERO_NOFAIL); |
5750 | } else { |
5751 | vp = zalloc_flags(vnode_zone, Z_WAITOK_ZERO_NOFAIL); |
5752 | } |
5753 | |
5754 | VLISTNONE(vp); /* avoid double queue removal */ |
		lck_mtx_init(&vp->v_lock, &vnode_lck_grp, &vnode_lck_attr);

		TAILQ_INIT(&vp->v_ncchildren);

		klist_init(&vp->v_knotes);
		nanouptime(&ts);
5761 | vp->v_id = (uint32_t)ts.tv_nsec; |
5762 | vp->v_flag = VSTANDARD | vflag; |
5763 | if (force_alloc_freeable) { |
5764 | /* This vnode should be recycled and freed immediately */ |
5765 | vp->v_lflag = VL_MARKTERM; |
5766 | vp->v_listflag = VLIST_NO_REUSE; |
5767 | } |
5768 | |
5769 | if (vflag & VCANDEALLOC) { |
5770 | os_atomic_inc(&busyvnodes, relaxed); |
5771 | } |
5772 | |
5773 | #if CONFIG_MACF |
5774 | if (mac_vnode_label_init_needed(vp)) { |
5775 | mac_vnode_label_init(vp); |
5776 | } |
5777 | #endif /* MAC */ |
5778 | |
5779 | #if CONFIG_IOCOUNT_TRACE |
5780 | if (__improbable(bootarg_vnode_iocount_trace)) { |
5781 | vp->v_iocount_trace = (vnode_iocount_trace_t)zalloc_permanent( |
5782 | IOCOUNT_TRACE_MAX_TYPES * sizeof(struct vnode_iocount_trace), |
5783 | ZALIGN(struct vnode_iocount_trace)); |
5784 | } |
5785 | #endif /* CONFIG_IOCOUNT_TRACE */ |
5786 | |
5787 | #if CONFIG_FILE_LEASES |
5788 | LIST_INIT(&vp->v_leases); |
5789 | #endif |
5790 | |
5791 | vp->v_iocount = 1; |
5792 | |
5793 | goto done; |
5794 | } |
5795 | |
	microuptime(&current_tv);
5797 | |
5798 | #define MAX_WALK_COUNT 1000 |
5799 | |
5800 | if (!TAILQ_EMPTY(&vnode_rage_list) && |
5801 | (ragevnodes >= rage_limit || |
5802 | (current_tv.tv_sec - rage_tv.tv_sec) >= RAGE_TIME_LIMIT)) { |
5803 | TAILQ_FOREACH(vp, &vnode_rage_list, v_freelist) { |
5804 | if (!(vp->v_listflag & VLIST_RAGE)) { |
5805 | panic("new_vnode: vp (%p) on RAGE list not marked VLIST_RAGE" , vp); |
5806 | } |
5807 | |
5808 | // if we're a dependency-capable process, skip vnodes that can |
5809 | // cause recycling deadlocks. (i.e. this process is diskimages |
5810 | // helper and the vnode is in a disk image). Querying the |
5811 | // mnt_kern_flag for the mount's virtual device status |
5812 | // is safer than checking the mnt_dependent_process, which |
5813 | // may not be updated if there are multiple devnode layers |
5814 | // in between the disk image and the final consumer. |
5815 | |
5816 | if (((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL || |
5817 | (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) && |
5818 | !(vp->v_listflag & VLIST_NO_REUSE) && |
5819 | (can_free || !(vp->v_flag & VCANDEALLOC))) { |
5820 | /* |
5821 | * if need_reliable_vp == TRUE, then we've already sent one or more |
5822 | * non-reliable vnodes to the async thread for processing and timed |
5823 | * out waiting for a dead vnode to show up. Use the MAX_WALK_COUNT |
5824 | * mechanism to first scan for a reliable vnode before forcing |
5825 | * a new vnode to be created |
5826 | */ |
5827 | if (need_reliable_vp == FALSE || vnode_on_reliable_media(vp) == TRUE) { |
5828 | break; |
5829 | } |
5830 | } |
5831 | |
5832 | // don't iterate more than MAX_WALK_COUNT vnodes to |
5833 | // avoid keeping the vnode list lock held for too long. |
5834 | |
5835 | if (walk_count++ > MAX_WALK_COUNT) { |
5836 | vp = NULL; |
5837 | break; |
5838 | } |
5839 | } |
5840 | } |
5841 | |
5842 | if (vp == NULL && !TAILQ_EMPTY(&vnode_free_list)) { |
5843 | /* |
5844 | * Pick the first vp for possible reuse |
5845 | */ |
5846 | walk_count = 0; |
5847 | TAILQ_FOREACH(vp, &vnode_free_list, v_freelist) { |
5848 | // if we're a dependency-capable process, skip vnodes that can |
5849 | // cause recycling deadlocks. (i.e. this process is diskimages |
5850 | // helper and the vnode is in a disk image). Querying the |
5851 | // mnt_kern_flag for the mount's virtual device status |
5852 | // is safer than checking the mnt_dependent_process, which |
5853 | // may not be updated if there are multiple devnode layers |
5854 | // in between the disk image and the final consumer. |
5855 | |
5856 | if (((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL || |
5857 | (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) && |
5858 | !(vp->v_listflag & VLIST_NO_REUSE) && |
5859 | (can_free || !(vp->v_flag & VCANDEALLOC))) { |
5860 | /* |
5861 | * if need_reliable_vp == TRUE, then we've already sent one or more |
5862 | * non-reliable vnodes to the async thread for processing and timed |
5863 | * out waiting for a dead vnode to show up. Use the MAX_WALK_COUNT |
5864 | * mechanism to first scan for a reliable vnode before forcing |
5865 | * a new vnode to be created |
5866 | */ |
5867 | if (need_reliable_vp == FALSE || vnode_on_reliable_media(vp) == TRUE) { |
5868 | break; |
5869 | } |
5870 | } |
5871 | |
5872 | // don't iterate more than MAX_WALK_COUNT vnodes to |
5873 | // avoid keeping the vnode list lock held for too long. |
5874 | |
5875 | if (walk_count++ > MAX_WALK_COUNT) { |
5876 | vp = NULL; |
5877 | break; |
5878 | } |
5879 | } |
5880 | } |
5881 | |
5882 | // |
5883 | // if we don't have a vnode and the walk_count is >= MAX_WALK_COUNT |
5884 | // then we're trying to create a vnode on behalf of a |
5885 | // process like diskimages-helper that has file systems |
5886 | // mounted on top of itself (and thus we can't reclaim |
5887 | // vnodes in the file systems on top of us). if we can't |
5888 | // find a vnode to reclaim then we'll just have to force |
5889 | // the allocation. |
5890 | // |
5891 | if (vp == NULL && walk_count >= MAX_WALK_COUNT) { |
5892 | force_alloc = 1; |
5893 | vnode_list_unlock(); |
5894 | goto retry; |
5895 | } |
5896 | |
5897 | if (vp == NULL) { |
5898 | if (can_free && (vn_dealloc_level > DEALLOC_VNODE_NONE) && |
5899 | (numvnodes >= force_alloc_min) && (numvnodes < numvnodes_max)) { |
5900 | force_alloc_freeable = true; |
5901 | vnode_list_unlock(); |
5902 | goto retry; |
5903 | } |
5904 | vnode_list_unlock(); |
5905 | |
5906 | /* |
5907 | * we've reached the system imposed maximum number of vnodes |
5908 | * but there isn't a single one available |
5909 | * wait a bit and then retry... if we can't get a vnode |
		 * after our target number of retries, then log a complaint
5911 | */ |
5912 | if (++retries <= max_retries) { |
			delay_for_interval(1, 1000 * 1000);
5914 | goto retry; |
5915 | } |
5916 | |
5917 | tablefull("vnode" ); |
5918 | log(LOG_EMERG, "%d desired, %ld numvnodes, " |
5919 | "%ld free, %ld dead, %ld async, %d rage\n" , |
5920 | desiredvnodes, numvnodes, freevnodes, deadvnodes, async_work_vnodes, ragevnodes); |
5921 | |
5922 | #if CONFIG_JETSAM |
5923 | /* |
5924 | * Running out of vnodes tends to make a system unusable. Start killing |
5925 | * processes that jetsam knows are killable. |
5926 | */ |
5927 | if (memorystatus_kill_on_vnode_limit() == FALSE |
5928 | #if DEVELOPMENT || DEBUG |
5929 | || bootarg_no_vnode_jetsam |
5930 | #endif |
5931 | ) { |
5932 | /* |
5933 | * If jetsam can't find any more processes to kill and there |
5934 | * still aren't any free vnodes, panic. Hopefully we'll get a |
5935 | * panic log to tell us why we ran out. |
5936 | */ |
5937 | panic("vnode table is full" ); |
5938 | } |
5939 | |
5940 | /* |
5941 | * Now that we've killed someone, wait a bit and continue looking |
5942 | */ |
5943 | delay_for_interval(3, 1000 * 1000); |
5944 | retries = 0; |
5945 | goto retry; |
5946 | #endif |
5947 | |
5948 | *vpp = NULL; |
5949 | return ENFILE; |
5950 | } |
5951 | newvnode_nodead++; |
5952 | steal_this_vp: |
	if ((vp = process_vp(vp, 1, true, &deferred)) == NULLVP) {
5954 | if (deferred) { |
5955 | int elapsed_msecs; |
5956 | struct timeval elapsed_tv; |
5957 | |
5958 | if (initial_tv.tv_sec == 0) { |
				microuptime(&initial_tv);
5960 | } |
5961 | |
5962 | vnode_list_lock(); |
5963 | |
5964 | dead_vnode_waited++; |
5965 | dead_vnode_wanted++; |
5966 | |
5967 | /* |
5968 | * note that we're only going to explicitly wait 10ms |
5969 | * for a dead vnode to become available, since even if one |
5970 | * isn't available, a reliable vnode might now be available |
5971 | * at the head of the VRAGE or free lists... if so, we |
			 * can satisfy the new_vnode request with less latency than waiting
5973 | * for the full 100ms duration we're ultimately willing to tolerate |
5974 | */ |
			assert_wait_timeout((caddr_t)&dead_vnode_wanted, (THREAD_INTERRUPTIBLE), 10000, NSEC_PER_USEC);
5976 | |
5977 | vnode_list_unlock(); |
5978 | |
5979 | thread_block(THREAD_CONTINUE_NULL); |
5980 | |
			microuptime(&elapsed_tv);

			timevalsub(&elapsed_tv, &initial_tv);
5984 | elapsed_msecs = (int)(elapsed_tv.tv_sec * 1000 + elapsed_tv.tv_usec / 1000); |
5985 | |
5986 | if (elapsed_msecs >= 100) { |
5987 | /* |
5988 | * we've waited long enough... 100ms is |
5989 | * somewhat arbitrary for this case, but the |
5990 | * normal worst case latency used for UI |
5991 | * interaction is 100ms, so I've chosen to |
5992 | * go with that. |
5993 | * |
5994 | * setting need_reliable_vp to TRUE |
5995 | * forces us to find a reliable vnode |
5996 | * that we can process synchronously, or |
5997 | * to create a new one if the scan for |
5998 | * a reliable one hits the scan limit |
5999 | */ |
6000 | need_reliable_vp = TRUE; |
6001 | } |
6002 | } |
6003 | goto retry; |
6004 | } |
6005 | OSAddAtomicLong(1, &num_reusedvnodes); |
6006 | |
6007 | |
6008 | #if CONFIG_MACF |
6009 | /* |
	 * We should never see VL_LABELWAIT or VL_LABEL here,
	 * as those operations hold a reference.
6012 | */ |
6013 | assert((vp->v_lflag & VL_LABELWAIT) != VL_LABELWAIT); |
6014 | assert((vp->v_lflag & VL_LABEL) != VL_LABEL); |
6015 | if (vp->v_lflag & VL_LABELED || mac_vnode_label(vp) != NULL) { |
6016 | vnode_lock_convert(vp); |
6017 | mac_vnode_label_recycle(vp); |
6018 | } else if (mac_vnode_label_init_needed(vp)) { |
6019 | vnode_lock_convert(vp); |
6020 | mac_vnode_label_init(vp); |
6021 | } |
6022 | |
6023 | #endif /* MAC */ |
6024 | |
6025 | vp->v_iocount = 1; |
6026 | vp->v_lflag = 0; |
6027 | vp->v_writecount = 0; |
6028 | vp->v_references = 0; |
6029 | vp->v_iterblkflags = 0; |
6030 | if (can_free && (vp->v_flag & VCANDEALLOC)) { |
6031 | vp->v_flag = VSTANDARD | VCANDEALLOC; |
6032 | } else { |
6033 | vp->v_flag = VSTANDARD; |
6034 | } |
6035 | |
6036 | /* vbad vnodes can point to dead_mountp */ |
6037 | vp->v_mount = NULL; |
6038 | vp->v_defer_reclaimlist = (vnode_t)0; |
6039 | |
6040 | /* process_vp returns a locked vnode with a holdcount */ |
6041 | vnode_drop_and_unlock(vp); |
6042 | |
6043 | done: |
6044 | *vpp = vp; |
6045 | |
6046 | return 0; |
6047 | } |
6048 | |
6049 | void |
6050 | vnode_lock(vnode_t vp) |
6051 | { |
	lck_mtx_lock(&vp->v_lock);
6053 | } |
6054 | |
6055 | void |
6056 | vnode_lock_spin(vnode_t vp) |
6057 | { |
	lck_mtx_lock_spin(&vp->v_lock);
6059 | } |
6060 | |
6061 | void |
6062 | vnode_unlock(vnode_t vp) |
6063 | { |
	lck_mtx_unlock(&vp->v_lock);
6065 | } |
6066 | |
6067 | void |
6068 | vnode_hold(vnode_t vp) |
6069 | { |
6070 | int32_t old_holdcount = os_atomic_inc_orig(&vp->v_holdcount, relaxed); |
6071 | |
6072 | if (old_holdcount == INT32_MAX) { |
6073 | /* |
6074 | * Because we allow atomic ops on the holdcount it is |
6075 | * possible that when the vnode is examined, its holdcount |
6076 | * is different than what will be printed in this |
6077 | * panic message. |
6078 | */ |
6079 | panic("%s: vp %p holdcount overflow from : %d v_tag = %d, v_type = %d, v_flag = %x." , |
6080 | __FUNCTION__, vp, old_holdcount, vp->v_tag, vp->v_type, vp->v_flag); |
6081 | } |
6082 | } |
6083 | |
6084 | #define VNODE_HOLD_NO_SMR (1<<29) /* Disable vnode_hold_smr */ |
6085 | |
6086 | /* |
6087 | * To be used when smr is the only protection (cache_lookup and cache_lookup_path) |
6088 | */ |
6089 | bool |
6090 | vnode_hold_smr(vnode_t vp) |
6091 | { |
6092 | int32_t holdcount; |
6093 | |
6094 | /* |
6095 | * For "high traffic" vnodes like rootvnode, the atomic |
6096 | * cmpexcg loop below can turn into a infinite loop, no need |
6097 | * to do it for vnodes that won't be dealloc'ed |
6098 | */ |
6099 | if (!(os_atomic_load(&vp->v_flag, relaxed) & VCANDEALLOC)) { |
6100 | vnode_hold(vp); |
6101 | return true; |
6102 | } |
6103 | |
6104 | for (;;) { |
6105 | holdcount = os_atomic_load(&vp->v_holdcount, relaxed); |
6106 | |
6107 | if (holdcount & VNODE_HOLD_NO_SMR) { |
6108 | return false; |
6109 | } |
6110 | |
6111 | if ((os_atomic_cmpxchg(&vp->v_holdcount, holdcount, holdcount + 1, relaxed) != 0)) { |
6112 | return true; |
6113 | } |
6114 | } |
6115 | } |
6116 | |
6117 | /* |
6118 | * free callback from smr enabled zones |
6119 | */ |
6120 | static void |
6121 | vnode_smr_free(void *_vp, __unused size_t _size) |
6122 | { |
6123 | vnode_t vp = _vp; |
6124 | |
	bzero(vp, sizeof(*vp));
6126 | } |
6127 | |
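/*
 * Drop a holdcount. If this was the last hold on a dead, deallocatable
 * vnode, the vnode may be unlinked from the dead list and freed; NULLVP
 * is returned in that case and the caller must not touch vp again.
 * Otherwise vp is returned (unlocked).
 */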
6128 | static vnode_t |
6129 | vnode_drop_internal(vnode_t vp, bool locked) |
6130 | { |
6131 | int32_t old_holdcount = os_atomic_dec_orig(&vp->v_holdcount, relaxed); |
6132 | |
6133 | if (old_holdcount < 1) { |
6134 | if (locked) { |
6135 | vnode_unlock(vp); |
6136 | } |
6137 | |
6138 | /* |
6139 | * Because we allow atomic ops on the holdcount it is possible |
6140 | * that when the vnode is examined, its holdcount is different |
6141 | * than what will be printed in this panic message. |
6142 | */ |
6143 | panic("%s : vp %p holdcount -ve: %d. v_tag = %d, v_type = %d, v_flag = %x." , |
6144 | __FUNCTION__, vp, old_holdcount - 1, vp->v_tag, vp->v_type, vp->v_flag); |
6145 | } |
6146 | |
6147 | if (vn_dealloc_level == DEALLOC_VNODE_NONE || old_holdcount > 1 || |
6148 | !(vp->v_flag & VCANDEALLOC) || !(vp->v_lflag & VL_DEAD)) { |
6149 | if (locked) { |
6150 | vnode_unlock(vp); |
6151 | } |
6152 | return vp; |
6153 | } |
6154 | |
6155 | if (!locked) { |
6156 | vnode_lock(vp); |
6157 | } |
6158 | |
6159 | if ((os_atomic_load(&vp->v_holdcount, relaxed) != 0) || vp->v_iocount || |
6160 | vp->v_usecount || !(vp->v_flag & VCANDEALLOC) || !(vp->v_lflag & VL_DEAD)) { |
6161 | vnode_unlock(vp); |
6162 | return vp; |
6163 | } |
6164 | |
6165 | vnode_list_lock(); |
6166 | |
6167 | /* |
6168 | * the v_listflag field is protected by the vnode_list_lock |
6169 | */ |
6170 | if (VONLIST(vp) && (vp->v_listflag & VLIST_DEAD) && |
6171 | (numvnodes > desiredvnodes || (vp->v_listflag & VLIST_NO_REUSE) || |
6172 | vn_dealloc_level != DEALLOC_VNODE_ALL || deadvnodes >= deadvnodes_high) && |
6173 | (os_atomic_cmpxchg(&vp->v_holdcount, 0, VNODE_HOLD_NO_SMR, relaxed) != 0)) { |
6174 | VREMDEAD("vnode_list_remove" , vp); |
6175 | numvnodes--; |
6176 | freeablevnodes--; |
6177 | deallocedvnodes++; |
6178 | vp->v_listflag = 0; |
6179 | |
6180 | send_freeable_vnodes_telemetry(); |
6181 | vnode_list_unlock(); |
6182 | |
6183 | #if CONFIG_MACF |
6184 | struct label *tmpl = mac_vnode_label(vp); |
6185 | vp->v_label = NULL; |
6186 | #endif /* CONFIG_MACF */ |
6187 | |
6188 | vnode_unlock(vp); |
6189 | |
6190 | #if CONFIG_MACF |
6191 | if (tmpl) { |
			mac_vnode_label_free(tmpl);
6193 | } |
6194 | #endif /* CONFIG_MACF */ |
6195 | |
6196 | if (nc_smr_enabled) { |
6197 | zfree_smr(vnode_zone, vp); |
6198 | } else { |
6199 | zfree(vnode_zone, vp); |
6200 | } |
6201 | |
6202 | vp = NULLVP; |
6203 | } else { |
6204 | vnode_list_unlock(); |
6205 | vnode_unlock(vp); |
6206 | } |
6207 | |
6208 | return vp; |
6209 | } |
6210 | |
6211 | vnode_t |
6212 | vnode_drop_and_unlock(vnode_t vp) |
6213 | { |
6214 | return vnode_drop_internal(vp, true); |
6215 | } |
6216 | |
6217 | vnode_t |
6218 | vnode_drop(vnode_t vp) |
6219 | { |
6220 | return vnode_drop_internal(vp, false); |
6221 | } |
6222 | |
SYSCTL_NODE(_vfs, OID_AUTO, vnstats, CTLFLAG_RD | CTLFLAG_LOCKED, NULL, "vfs vnode stats");

SYSCTL_COMPAT_INT(_vfs_vnstats, OID_AUTO, vn_dealloc_level,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &vn_dealloc_level, 0, "");
SYSCTL_COMPAT_INT(_vfs_vnstats, OID_AUTO, desired_vnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &desiredvnodes, 0, "");
SYSCTL_LONG(_vfs_vnstats, OID_AUTO, num_vnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &numvnodes, "");
SYSCTL_COMPAT_INT(_vfs_vnstats, OID_AUTO, num_vnodes_min,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &numvnodes_min, 0, "");
SYSCTL_COMPAT_INT(_vfs_vnstats, OID_AUTO, num_vnodes_max,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &numvnodes_max, 0, "");
SYSCTL_COMPAT_INT(_vfs_vnstats, OID_AUTO, num_deallocable_vnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &freeablevnodes, 0, "");
SYSCTL_LONG(_vfs_vnstats, OID_AUTO, num_deallocable_busy_vnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &busyvnodes, "");
SYSCTL_LONG(_vfs_vnstats, OID_AUTO, num_dead_vnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &deadvnodes, "");
SYSCTL_LONG(_vfs_vnstats, OID_AUTO, num_dead_vnodes_to_dealloc,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &deadvnodes_noreuse, "");
SYSCTL_LONG(_vfs_vnstats, OID_AUTO, num_async_work_vnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &async_work_vnodes, "");
SYSCTL_COMPAT_INT(_vfs_vnstats, OID_AUTO, num_rapid_aging_vnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &ragevnodes, 0, "");
SYSCTL_LONG(_vfs_vnstats, OID_AUTO, num_free_vnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &freevnodes, "");
SYSCTL_LONG(_vfs_vnstats, OID_AUTO, num_recycledvnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &num_recycledvnodes, "");
SYSCTL_QUAD(_vfs_vnstats, OID_AUTO, num_allocedvnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &allocedvnodes, "");
SYSCTL_QUAD(_vfs_vnstats, OID_AUTO, num_deallocedvnodes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &deallocedvnodes, "");
SYSCTL_QUAD(_vfs_vnstats, OID_AUTO, num_newvnode_calls,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &newvnode, "");
SYSCTL_QUAD(_vfs_vnstats, OID_AUTO, num_newvnode_calls_nodead,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &newvnode_nodead, "");
6276 | |
6277 | int |
6278 | vnode_get(struct vnode *vp) |
6279 | { |
6280 | int retval; |
6281 | |
6282 | vnode_lock_spin(vp); |
6283 | retval = vnode_get_locked(vp); |
6284 | vnode_unlock(vp); |
6285 | |
6286 | return retval; |
6287 | } |
6288 | |
6289 | int |
6290 | vnode_get_locked(struct vnode *vp) |
6291 | { |
6292 | #if DIAGNOSTIC |
6293 | lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED); |
6294 | #endif |
6295 | if ((vp->v_iocount == 0) && (vp->v_lflag & (VL_TERMINATE | VL_DEAD))) { |
6296 | return ENOENT; |
6297 | } |
6298 | |
6299 | if (os_add_overflow(vp->v_iocount, 1, &vp->v_iocount)) { |
6300 | panic("v_iocount overflow" ); |
6301 | } |
6302 | |
6303 | #ifdef CONFIG_IOCOUNT_TRACE |
6304 | record_vp(vp, 1); |
6305 | #endif |
6306 | return 0; |
6307 | } |
6308 | |
6309 | /* |
6310 | * vnode_getwithvid() cuts in line in front of a vnode drain (that is, |
6311 | * while the vnode is draining, but at no point after that) to prevent |
6312 | * deadlocks when getting vnodes from filesystem hashes while holding |
6313 | * resources that may prevent other iocounts from being released. |
6314 | */ |
6315 | int |
6316 | vnode_getwithvid(vnode_t vp, uint32_t vid) |
6317 | { |
	return vget_internal(vp, vid, (VNODE_NODEAD | VNODE_WITHID | VNODE_DRAINO));
6319 | } |
6320 | |
6321 | /* |
6322 | * vnode_getwithvid_drainok() is like vnode_getwithvid(), but *does* block behind a vnode |
6323 | * drain; it exists for use in the VFS name cache, where we really do want to block behind |
6324 | * vnode drain to prevent holding off an unmount. |
6325 | */ |
6326 | int |
6327 | vnode_getwithvid_drainok(vnode_t vp, uint32_t vid) |
6328 | { |
	return vget_internal(vp, vid, (VNODE_NODEAD | VNODE_WITHID));
6330 | } |
6331 | |
6332 | int |
6333 | vnode_getwithref(vnode_t vp) |
6334 | { |
	return vget_internal(vp, 0, 0);
6336 | } |
6337 | |
6338 | __private_extern__ int |
6339 | vnode_getwithref_noblock(vnode_t vp) |
6340 | { |
	return vget_internal(vp, 0, VNODE_NOBLOCK);
6342 | } |
6343 | |
6344 | __private_extern__ int |
6345 | vnode_getalways(vnode_t vp) |
6346 | { |
	return vget_internal(vp, 0, VNODE_ALWAYS);
6348 | } |
6349 | |
6350 | __private_extern__ int |
vnode_getalways_from_pager(vnode_t vp)
{
	return vget_internal(vp, 0, VNODE_ALWAYS | VNODE_PAGER);
6354 | } |
6355 | |
6356 | static inline void |
6357 | vn_set_dead(vnode_t vp) |
6358 | { |
6359 | vp->v_mount = NULL; |
6360 | vp->v_op = dead_vnodeop_p; |
6361 | vp->v_tag = VT_NON; |
6362 | vp->v_data = NULL; |
6363 | vp->v_type = VBAD; |
6364 | vp->v_lflag |= VL_DEAD; |
6365 | } |
6366 | |
6367 | static int |
vnode_put_internal_locked(vnode_t vp, bool from_pager)
6369 | { |
6370 | vfs_context_t ctx = vfs_context_current(); /* hoist outside loop */ |
6371 | |
6372 | #if DIAGNOSTIC |
6373 | lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED); |
6374 | #endif |
6375 | retry: |
6376 | if (vp->v_iocount < 1) { |
6377 | panic("vnode_put(%p): iocount < 1" , vp); |
6378 | } |
6379 | |
6380 | if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) { |
6381 | vnode_dropiocount(vp); |
6382 | return 0; |
6383 | } |
6384 | |
6385 | if (((vp->v_lflag & (VL_DEAD | VL_NEEDINACTIVE)) == VL_NEEDINACTIVE)) { |
6386 | vp->v_lflag &= ~VL_NEEDINACTIVE; |
6387 | |
6388 | if (UBCINFOEXISTS(vp)) { |
6389 | ubc_cs_free_and_vnode_unlock(vp); |
6390 | } else { |
6391 | vnode_unlock(vp); |
6392 | } |
6393 | |
6394 | VNOP_INACTIVE(vp, ctx); |
6395 | |
6396 | vnode_lock_spin(vp); |
6397 | /* |
6398 | * because we had to drop the vnode lock before calling |
6399 | * VNOP_INACTIVE, the state of this vnode may have changed... |
	 * we may pick up both VL_MARKTERM and either
6401 | * an iocount or a usecount while in the VNOP_INACTIVE call |
6402 | * we don't want to call vnode_reclaim_internal on a vnode |
6403 | * that has active references on it... so loop back around |
6404 | * and reevaluate the state |
6405 | */ |
6406 | goto retry; |
6407 | } |
6408 | vp->v_lflag &= ~VL_NEEDINACTIVE; |
6409 | |
6410 | vnode_lock_convert(vp); |
6411 | if ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM) { |
6412 | if (from_pager) { |
6413 | /* |
6414 | * We can't initiate reclaim when called from the pager |
6415 | * because it will deadlock with itself so we hand it |
6416 | * off to the async cleaner thread. |
6417 | */ |
6418 | vnode_async_list_add(vp); |
6419 | } else { |
6420 | vnode_reclaim_internal(vp, 1, 1, 0); |
6421 | } |
6422 | } |
6423 | vnode_dropiocount(vp); |
6424 | vnode_list_add(vp); |
6425 | |
6426 | return 0; |
6427 | } |
6428 | |
6429 | int |
6430 | vnode_put_locked(vnode_t vp) |
6431 | { |
6432 | return vnode_put_internal_locked(vp, false); |
6433 | } |
6434 | |
6435 | int |
6436 | vnode_put(vnode_t vp) |
6437 | { |
6438 | int retval; |
6439 | |
6440 | vnode_lock_spin(vp); |
6441 | vnode_hold(vp); |
6442 | retval = vnode_put_internal_locked(vp, false); |
6443 | vnode_drop_and_unlock(vp); |
6444 | |
6445 | return retval; |
6446 | } |
6447 | |
6448 | int |
vnode_put_from_pager(vnode_t vp)
6450 | { |
6451 | int retval; |
6452 | |
6453 | vnode_lock_spin(vp); |
6454 | vnode_hold(vp); |
6455 | /* Cannot initiate reclaim while paging */ |
6456 | retval = vnode_put_internal_locked(vp, true); |
6457 | vnode_drop_and_unlock(vp); |
6458 | |
6459 | return retval; |
6460 | } |
6461 | |
6462 | int |
6463 | vnode_writecount(vnode_t vp) |
6464 | { |
6465 | return vp->v_writecount; |
6466 | } |
6467 | |
6468 | /* is vnode_t in use by others? */ |
6469 | int |
6470 | vnode_isinuse(vnode_t vp, int refcnt) |
6471 | { |
6472 | return vnode_isinuse_locked(vp, refcnt, 0); |
6473 | } |
6474 | |
6475 | int |
6476 | vnode_usecount(vnode_t vp) |
6477 | { |
6478 | return vp->v_usecount; |
6479 | } |
6480 | |
6481 | int |
6482 | vnode_iocount(vnode_t vp) |
6483 | { |
6484 | return vp->v_iocount; |
6485 | } |
6486 | |
6487 | int |
6488 | vnode_isinuse_locked(vnode_t vp, int refcnt, int locked) |
6489 | { |
6490 | int retval = 0; |
6491 | |
6492 | if (!locked) { |
6493 | vnode_lock_spin(vp); |
6494 | } |
6495 | if ((vp->v_type != VREG) && ((vp->v_usecount - vp->v_kusecount) > refcnt)) { |
6496 | retval = 1; |
6497 | goto out; |
6498 | } |
6499 | if (vp->v_type == VREG) { |
6500 | retval = ubc_isinuse_locked(vp, refcnt, 1); |
6501 | } |
6502 | |
6503 | out: |
6504 | if (!locked) { |
6505 | vnode_unlock(vp); |
6506 | } |
6507 | return retval; |
6508 | } |
6509 | |
6510 | kauth_cred_t |
6511 | vnode_cred(vnode_t vp) |
6512 | { |
6513 | if (vp->v_cred) { |
		return kauth_cred_require(vp->v_cred);
6515 | } |
6516 | |
6517 | return NULL; |
6518 | } |
6519 | |
6520 | |
6521 | /* resume vnode_t */ |
6522 | errno_t |
6523 | vnode_resume(vnode_t vp) |
6524 | { |
6525 | if ((vp->v_lflag & VL_SUSPENDED) && vp->v_owner == current_thread()) { |
6526 | vnode_lock_spin(vp); |
6527 | vp->v_lflag &= ~VL_SUSPENDED; |
6528 | vp->v_owner = NULL; |
6529 | vnode_unlock(vp); |
6530 | |
		wakeup(&vp->v_iocount);
6532 | } |
6533 | return 0; |
6534 | } |
6535 | |
6536 | /* suspend vnode_t |
6537 | * Please do not use on more than one vnode at a time as it may |
6538 | * cause deadlocks. |
 * xxx should we explicitly prevent this from happening?
6540 | */ |
6541 | |
6542 | errno_t |
6543 | vnode_suspend(vnode_t vp) |
6544 | { |
6545 | if (vp->v_lflag & VL_SUSPENDED) { |
6546 | return EBUSY; |
6547 | } |
6548 | |
6549 | vnode_lock_spin(vp); |
6550 | |
6551 | /* |
	 * xxx is this sufficient to check if a vnode_drain is in
	 * progress?
6554 | */ |
6555 | |
6556 | if (vp->v_owner == NULL) { |
6557 | vp->v_lflag |= VL_SUSPENDED; |
6558 | vp->v_owner = current_thread(); |
6559 | } |
6560 | vnode_unlock(vp); |
6561 | |
6562 | return 0; |
6563 | } |
6564 | |
6565 | /* |
6566 | * Release any blocked locking requests on the vnode. |
6567 | * Used for forced-unmounts. |
6568 | * |
6569 | * XXX What about network filesystems? |
6570 | */ |
6571 | static void |
6572 | vnode_abort_advlocks(vnode_t vp) |
6573 | { |
6574 | if (vp->v_flag & VLOCKLOCAL) { |
6575 | lf_abort_advlocks(vp); |
6576 | } |
6577 | } |
6578 | |
6579 | |
6580 | static errno_t |
6581 | vnode_drain(vnode_t vp) |
6582 | { |
6583 | if (vp->v_lflag & VL_DRAIN) { |
6584 | panic("vnode_drain: recursive drain" ); |
6585 | return ENOENT; |
6586 | } |
6587 | vp->v_lflag |= VL_DRAIN; |
6588 | vp->v_owner = current_thread(); |
6589 | |
6590 | while (vp->v_iocount > 1) { |
6591 | if (bootarg_no_vnode_drain) { |
6592 | struct timespec ts = {.tv_sec = 10, .tv_nsec = 0}; |
6593 | int error; |
6594 | |
6595 | if (vfs_unmountall_started) { |
6596 | ts.tv_sec = 1; |
6597 | } |
6598 | |
			error = msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_drain_with_timeout", &ts);
6600 | |
6601 | /* Try to deal with leaked iocounts under bootarg and shutting down */ |
6602 | if (vp->v_iocount > 1 && error == EWOULDBLOCK && |
6603 | ts.tv_sec == 1 && vp->v_numoutput == 0) { |
6604 | vp->v_iocount = 1; |
6605 | break; |
6606 | } |
6607 | } else { |
6608 | msleep(chan: &vp->v_iocount, mtx: &vp->v_lock, PVFS, wmesg: "vnode_drain" , NULL); |
6609 | } |
6610 | } |
6611 | |
6612 | vp->v_lflag &= ~VL_DRAIN; |
6613 | |
6614 | return 0; |
6615 | } |
6616 | |
6617 | |
6618 | /* |
6619 | * if the number of recent references via vnode_getwithvid or vnode_getwithref |
 * exceeds this threshold, then 'UN-AGE' the vnode by removing it from
6621 | * the LRU list if it's currently on it... once the iocount and usecount both drop |
6622 | * to 0, it will get put back on the end of the list, effectively making it younger |
6623 | * this allows us to keep actively referenced vnodes in the list without having |
6624 | * to constantly remove and add to the list each time a vnode w/o a usecount is |
6625 | * referenced which costs us taking and dropping a global lock twice. |
6626 | * However, if the vnode is marked DIRTY, we want to pull it out much earlier |
6627 | */ |
6628 | #define UNAGE_THRESHHOLD 25 |
6629 | #define UNAGE_DIRTYTHRESHHOLD 6 |
6630 | |
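/*
 * vnode_getiocount() implements the vget_internal flag policy:
 * VNODE_NODEAD fails on dead (deadfs) vnodes, VNODE_NOSUSPEND on
 * suspended ones, VNODE_ALWAYS skips the gating entirely, VNODE_DRAINO
 * cuts in front of a drain, VNODE_WITHID validates the caller's vid,
 * VNODE_NOBLOCK fails rather than sleeps, and VNODE_PAGER suppresses
 * the un-aging logic described above.
 */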
6631 | errno_t |
6632 | vnode_getiocount(vnode_t vp, unsigned int vid, int vflags) |
6633 | { |
6634 | int nodead = vflags & VNODE_NODEAD; |
6635 | int nosusp = vflags & VNODE_NOSUSPEND; |
6636 | int always = vflags & VNODE_ALWAYS; |
6637 | int beatdrain = vflags & VNODE_DRAINO; |
6638 | int withvid = vflags & VNODE_WITHID; |
	int forpager = vflags & VNODE_PAGER;
6640 | int noblock = vflags & VNODE_NOBLOCK; |
6641 | |
6642 | for (;;) { |
6643 | int sleepflg = 0; |
6644 | |
6645 | /* |
6646 | * if it is a dead vnode with deadfs |
6647 | */ |
6648 | if (nodead && (vp->v_lflag & VL_DEAD) && ((vp->v_type == VBAD) || (vp->v_data == 0))) { |
6649 | return ENOENT; |
6650 | } |
6651 | /* |
6652 | * will return VL_DEAD ones |
6653 | */ |
6654 | if ((vp->v_lflag & (VL_SUSPENDED | VL_DRAIN | VL_TERMINATE)) == 0) { |
6655 | break; |
6656 | } |
6657 | /* |
6658 | * if suspended vnodes are to be failed |
6659 | */ |
6660 | if (nosusp && (vp->v_lflag & VL_SUSPENDED)) { |
6661 | return ENOENT; |
6662 | } |
6663 | /* |
		 * if you are the owner of the drain/suspend/termination, you can acquire an iocount;
6665 | * check for VL_TERMINATE; it does not set owner |
6666 | */ |
6667 | if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED | VL_TERMINATE)) && |
6668 | (vp->v_owner == current_thread())) { |
6669 | break; |
6670 | } |
6671 | |
6672 | if (always != 0) { |
6673 | break; |
6674 | } |
6675 | |
6676 | if (noblock && (vp->v_lflag & (VL_DRAIN | VL_SUSPENDED | VL_TERMINATE))) { |
6677 | return ENOENT; |
6678 | } |
6679 | |
6680 | /* |
6681 | * If this vnode is getting drained, there are some cases where |
6682 | * we can't block or, in case of tty vnodes, want to be |
6683 | * interruptible. |
6684 | */ |
6685 | if (vp->v_lflag & VL_DRAIN) { |
6686 | /* |
6687 | * In some situations, we want to get an iocount |
6688 | * even if the vnode is draining to prevent deadlock, |
6689 | * e.g. if we're in the filesystem, potentially holding |
6690 | * resources that could prevent other iocounts from |
6691 | * being released. |
6692 | */ |
6693 | if (beatdrain) { |
6694 | break; |
6695 | } |
6696 | /* |
6697 | * Don't block if the vnode's mount point is unmounting as |
6698 | * we may be the thread the unmount is itself waiting on |
6699 | * Only callers who pass in vids (at this point, we've already |
6700 | * handled nosusp and nodead) are expecting error returns |
			 * from this function, so we can only return errors for
6702 | * those. ENODEV is intended to inform callers that the call |
6703 | * failed because an unmount is in progress. |
6704 | */ |
			if (withvid && (vp->v_mount) && vfs_isunmount(vp->v_mount)) {
6706 | return ENODEV; |
6707 | } |
6708 | |
6709 | if (vnode_istty(vp)) { |
6710 | sleepflg = PCATCH; |
6711 | } |
6712 | } |
6713 | |
6714 | vnode_lock_convert(vp); |
6715 | |
6716 | if (vp->v_lflag & VL_TERMINATE) { |
6717 | int error; |
6718 | |
6719 | vp->v_lflag |= VL_TERMWANT; |
6720 | |
			error = msleep(&vp->v_lflag, &vp->v_lock,
			    (PVFS | sleepflg), "vnode getiocount", NULL);
6723 | if (error) { |
6724 | return error; |
6725 | } |
6726 | } else { |
6727 | msleep(chan: &vp->v_iocount, mtx: &vp->v_lock, PVFS, wmesg: "vnode_getiocount" , NULL); |
6728 | } |
6729 | } |
6730 | if (withvid && vid != vp->v_id) { |
6731 | return ENOENT; |
6732 | } |
6733 | if (!forpager && (++vp->v_references >= UNAGE_THRESHHOLD || |
6734 | (vp->v_flag & VISDIRTY && vp->v_references >= UNAGE_DIRTYTHRESHHOLD))) { |
6735 | vp->v_references = 0; |
6736 | vnode_list_remove(vp); |
6737 | } |
6738 | vp->v_iocount++; |
6739 | #ifdef CONFIG_IOCOUNT_TRACE |
6740 | record_vp(vp, 1); |
6741 | #endif |
6742 | return 0; |
6743 | } |
6744 | |
6745 | static void |
6746 | vnode_dropiocount(vnode_t vp) |
6747 | { |
6748 | if (vp->v_iocount < 1) { |
6749 | panic("vnode_dropiocount(%p): v_iocount < 1" , vp); |
6750 | } |
6751 | |
6752 | vp->v_iocount--; |
6753 | #ifdef CONFIG_IOCOUNT_TRACE |
6754 | record_vp(vp, -1); |
6755 | #endif |
6756 | if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED)) && (vp->v_iocount <= 1)) { |
		wakeup(&vp->v_iocount);
6758 | } |
6759 | } |
6760 | |
6761 | |
6762 | void |
6763 | vnode_reclaim(struct vnode * vp) |
6764 | { |
6765 | vnode_reclaim_internal(vp, 0, 0, 0); |
6766 | } |
6767 | |
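/*
 * vnode_reclaim_internal(): 'locked' means the caller already holds the
 * vnode lock, 'reuse' means the caller will reuse the vnode and it should
 * not be put back on a list, and REVOKEALL in 'flags' forces tty readers
 * to give up their iocounts before the drain.
 */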
6768 | __private_extern__ |
6769 | void |
6770 | vnode_reclaim_internal(struct vnode * vp, int locked, int reuse, int flags) |
6771 | { |
6772 | int isfifo = 0; |
6773 | bool clear_tty_revoke = false; |
6774 | |
6775 | if (!locked) { |
6776 | vnode_lock(vp); |
6777 | } |
6778 | |
6779 | if (vp->v_lflag & VL_TERMINATE) { |
6780 | panic("vnode reclaim in progress" ); |
6781 | } |
6782 | vp->v_lflag |= VL_TERMINATE; |
6783 | |
6784 | vn_clearunionwait(vp, 1); |
6785 | |
6786 | /* |
6787 | * We have to force any terminals in reads to return and give up |
6788 | * their iocounts. It's important to do this after VL_TERMINATE |
6789 | * has been set to ensure new reads are blocked while the |
6790 | * revoke is in progress. |
6791 | */ |
6792 | if (vnode_istty(vp) && (flags & REVOKEALL) && (vp->v_iocount > 1)) { |
6793 | vnode_unlock(vp); |
		VNOP_IOCTL(vp, TIOCREVOKE, (caddr_t)NULL, 0, vfs_context_kernel());
6795 | clear_tty_revoke = true; |
6796 | vnode_lock(vp); |
6797 | } |
6798 | |
6799 | vnode_drain(vp); |
6800 | |
6801 | if (clear_tty_revoke) { |
6802 | vnode_unlock(vp); |
		VNOP_IOCTL(vp, TIOCREVOKECLEAR, (caddr_t)NULL, 0, vfs_context_kernel());
6804 | vnode_lock(vp); |
6805 | } |
6806 | |
6807 | #if CONFIG_FILE_LEASES |
6808 | /* |
6809 | * Revoke all leases in place for this vnode as it is about to be reclaimed. |
6810 | * In normal case, there shouldn't be any leases in place by the time we |
6811 | * get here as there shouldn't be any opens on the vnode (usecount == 0). |
6812 | * However, in the case of force unmount or unmount of a volume that |
6813 | * contains file that was opened with O_EVTONLY then the vnode can be |
6814 | * reclaimed while the file is still opened. |
6815 | */ |
6816 | vnode_revokelease(vp, true); |
6817 | #endif |
6818 | |
6819 | isfifo = (vp->v_type == VFIFO); |
6820 | |
6821 | if (vp->v_type != VBAD) { |
6822 | vgone(vp, flags); /* clean and reclaim the vnode */ |
6823 | } |
6824 | /* |
6825 | * give the vnode a new identity so that vnode_getwithvid will fail |
6826 | * on any stale cache accesses... |
6827 | * grab the list_lock so that if we're in "new_vnode" |
6828 | * behind the list_lock trying to steal this vnode, the v_id is stable... |
6829 | * once new_vnode drops the list_lock, it will block trying to take |
6830 | * the vnode lock until we release it... at that point it will evaluate |
 * whether the v_id has changed
6832 | * also need to make sure that the vnode isn't on a list where "new_vnode" |
6833 | * can find it after the v_id has been bumped until we are completely done |
6834 | * with the vnode (i.e. putting it back on a list has to be the very last |
6835 | * thing we do to this vnode... many of the callers of vnode_reclaim_internal |
6836 | * are holding an io_count on the vnode... they need to drop the io_count |
6837 | * BEFORE doing a vnode_list_add or make sure to hold the vnode lock until |
6838 | * they are completely done with the vnode |
6839 | */ |
6840 | vnode_list_lock(); |
6841 | |
6842 | vnode_list_remove_locked(vp); |
6843 | vp->v_id++; |
6844 | |
6845 | vnode_list_unlock(); |
6846 | |
6847 | if (isfifo) { |
6848 | struct fifoinfo * fip; |
6849 | |
6850 | fip = vp->v_fifoinfo; |
6851 | vp->v_fifoinfo = NULL; |
6852 | kfree_type(struct fifoinfo, fip); |
6853 | } |
6854 | vp->v_type = VBAD; |
6855 | |
6856 | if (vp->v_data) { |
6857 | panic("vnode_reclaim_internal: cleaned vnode isn't" ); |
6858 | } |
6859 | if (vp->v_numoutput) { |
6860 | panic("vnode_reclaim_internal: clean vnode has pending I/O's" ); |
6861 | } |
6862 | if (UBCINFOEXISTS(vp)) { |
6863 | panic("vnode_reclaim_internal: ubcinfo not cleaned" ); |
6864 | } |
6865 | if (vp->v_parent) { |
6866 | panic("vnode_reclaim_internal: vparent not removed" ); |
6867 | } |
6868 | if (vp->v_name) { |
6869 | panic("vnode_reclaim_internal: vname not removed" ); |
6870 | } |
6871 | |
6872 | #if CONFIG_FILE_LEASES |
6873 | if (__improbable(!LIST_EMPTY(&vp->v_leases))) { |
6874 | panic("vnode_reclaim_internal: vleases NOT empty" ); |
6875 | } |
6876 | #endif |
6877 | |
6878 | vp->v_socket = NULL; |
6879 | |
6880 | vp->v_lflag &= ~VL_TERMINATE; |
6881 | vp->v_owner = NULL; |
6882 | |
6883 | #if CONFIG_IOCOUNT_TRACE |
6884 | if (__improbable(bootarg_vnode_iocount_trace)) { |
6885 | bzero(vp->v_iocount_trace, |
6886 | IOCOUNT_TRACE_MAX_TYPES * sizeof(struct vnode_iocount_trace)); |
6887 | } |
6888 | #endif /* CONFIG_IOCOUNT_TRACE */ |
6889 | |
6890 | KNOTE(&vp->v_knotes, NOTE_REVOKE); |
6891 | |
6892 | /* Make sure that when we reuse the vnode, no knotes left over */ |
	klist_init(&vp->v_knotes);
6894 | |
6895 | if (vp->v_lflag & VL_TERMWANT) { |
6896 | vp->v_lflag &= ~VL_TERMWANT; |
		wakeup(&vp->v_lflag);
6898 | } |
6899 | if (!reuse) { |
6900 | /* |
6901 | * make sure we get on the |
6902 | * dead list if appropriate |
6903 | */ |
6904 | vnode_list_add(vp); |
6905 | } |
6906 | if (!locked) { |
6907 | vnode_unlock(vp); |
6908 | } |
6909 | } |
6910 | |
6911 | static int |
6912 | vnode_create_internal(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp, |
6913 | vnode_create_options_t vc_options) |
6914 | { |
6915 | int error; |
6916 | int insert = 1; |
6917 | vnode_t vp = NULLVP; |
6918 | vnode_t nvp; |
6919 | vnode_t dvp; |
6920 | struct uthread *ut; |
6921 | struct componentname *cnp; |
6922 | struct vnode_fsparam *param = (struct vnode_fsparam *)data; |
6923 | #if CONFIG_TRIGGERS |
6924 | struct vnode_trigger_param *tinfo = NULL; |
6925 | #endif |
6926 | bool existing_vnode; |
6927 | bool init_vnode = !(vc_options & VNODE_CREATE_EMPTY); |
6928 | bool is_bdevvp = false; |
6929 | |
6930 | if (*vpp) { |
6931 | vp = *vpp; |
6932 | *vpp = NULLVP; |
6933 | existing_vnode = true; |
6934 | } else { |
6935 | existing_vnode = false; |
6936 | } |
6937 | |
6938 | if (init_vnode) { |
6939 | /* Do quick sanity check on the parameters. */ |
6940 | if ((param == NULL) || (param->vnfs_vtype == VBAD)) { |
6941 | error = EINVAL; |
6942 | goto error_out; |
6943 | } |
6944 | |
6945 | #if CONFIG_TRIGGERS |
6946 | if ((flavor == VNCREATE_TRIGGER) && (size == VNCREATE_TRIGGER_SIZE)) { |
6947 | tinfo = (struct vnode_trigger_param *)data; |
6948 | |
6949 | /* Validate trigger vnode input */ |
6950 | if ((param->vnfs_vtype != VDIR) || |
6951 | (tinfo->vnt_resolve_func == NULL) || |
6952 | (tinfo->vnt_flags & ~VNT_VALID_MASK)) { |
6953 | error = EINVAL; |
6954 | goto error_out; |
6955 | } |
6956 | /* Fall through a normal create (params will be the same) */ |
6957 | flavor = VNCREATE_FLAVOR; |
6958 | size = VCREATESIZE; |
6959 | } |
6960 | #endif |
6961 | if ((flavor != VNCREATE_FLAVOR) || (size != VCREATESIZE)) { |
6962 | error = EINVAL; |
6963 | goto error_out; |
6964 | } |
6965 | } |
6966 | |
6967 | if (!existing_vnode) { |
if ((error = new_vnode(&vp, !(vc_options & VNODE_CREATE_NODEALLOC)))) {
6969 | return error; |
6970 | } |
6971 | if (!init_vnode) { |
/* Make it so that it can be released by a vnode_put() */
6973 | vnode_lock(vp); |
6974 | vn_set_dead(vp); |
6975 | vnode_unlock(vp); |
6976 | *vpp = vp; |
6977 | return 0; |
6978 | } |
6979 | } else { |
6980 | /* |
6981 | * A vnode obtained by vnode_create_empty has been passed to |
6982 | * vnode_initialize - Unset VL_DEAD set by vn_set_dead. After |
6983 | * this point, it is set back on any error. |
6984 | */ |
6985 | vnode_lock(vp); |
6986 | vp->v_lflag &= ~VL_DEAD; |
6987 | vnode_unlock(vp); |
6988 | } |
6989 | |
6990 | dvp = param->vnfs_dvp; |
6991 | cnp = param->vnfs_cnp; |
6992 | |
6993 | vp->v_op = param->vnfs_vops; |
6994 | vp->v_type = (uint16_t)param->vnfs_vtype; |
6995 | vp->v_data = param->vnfs_fsnode; |
6996 | |
6997 | if (param->vnfs_markroot) { |
6998 | vp->v_flag |= VROOT; |
6999 | } |
7000 | if (param->vnfs_marksystem) { |
7001 | vp->v_flag |= VSYSTEM; |
7002 | } |
7003 | if (vp->v_type == VREG) { |
7004 | error = ubc_info_init_withsize(vp, param->vnfs_filesize); |
7005 | if (error) { |
#if CONFIG_IOCOUNT_TRACE
7007 | record_vp(vp, 1); |
7008 | #endif |
7009 | vnode_hold(vp); |
7010 | vnode_lock(vp); |
7011 | vn_set_dead(vp); |
7012 | |
7013 | vnode_put_locked(vp); |
7014 | vnode_drop_and_unlock(vp); |
7015 | return error; |
7016 | } |
7017 | if (param->vnfs_mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED) { |
memory_object_mark_io_tracking(vp->v_ubcinfo->ui_control);
7019 | } |
7020 | } |
#if CONFIG_IOCOUNT_TRACE
7022 | record_vp(vp, 1); |
7023 | #endif |
7024 | |
7025 | #if CONFIG_FIRMLINKS |
7026 | vp->v_fmlink = NULLVP; |
7027 | #endif |
7028 | vp->v_flag &= ~VFMLINKTARGET; |
7029 | |
7030 | #if CONFIG_TRIGGERS |
7031 | /* |
7032 | * For trigger vnodes, attach trigger info to vnode |
7033 | */ |
7034 | if ((vp->v_type == VDIR) && (tinfo != NULL)) { |
7035 | /* |
7036 | * Note: has a side effect of incrementing trigger count on the |
7037 | * mount if successful, which we would need to undo on a |
7038 | * subsequent failure. |
7039 | */ |
#if CONFIG_IOCOUNT_TRACE
7041 | record_vp(vp, -1); |
7042 | #endif |
7043 | error = vnode_resolver_create(param->vnfs_mp, vp, tinfo, FALSE); |
7044 | if (error) { |
7045 | printf("vnode_create: vnode_resolver_create() err %d\n" , error); |
7046 | vnode_hold(vp); |
7047 | vnode_lock(vp); |
7048 | vn_set_dead(vp); |
#if CONFIG_IOCOUNT_TRACE
7050 | record_vp(vp, 1); |
7051 | #endif |
7052 | vnode_put_locked(vp); |
7053 | vnode_drop_and_unlock(vp); |
7054 | return error; |
7055 | } |
7056 | } |
7057 | #endif |
7058 | if (vp->v_type == VCHR || vp->v_type == VBLK) { |
7059 | vp->v_tag = VT_DEVFS; /* callers will reset if needed (bdevvp) */ |
7060 | |
if ((nvp = checkalias(vp, param->vnfs_rdev))) {
7062 | /* |
7063 | * if checkalias returns a vnode, it will be locked |
7064 | * |
7065 | * first get rid of the unneeded vnode we acquired |
7066 | */ |
7067 | vp->v_data = NULL; |
7068 | vp->v_op = spec_vnodeop_p; |
7069 | vp->v_type = VBAD; |
7070 | vp->v_lflag = VL_DEAD; |
7071 | vp->v_data = NULL; |
7072 | vp->v_tag = VT_NON; |
7073 | vnode_put(vp); |
7074 | |
7075 | /* |
7076 | * switch to aliased vnode and finish |
7077 | * preparing it |
7078 | */ |
7079 | vp = nvp; |
7080 | |
7081 | is_bdevvp = (vp->v_flag & VBDEVVP); |
7082 | |
7083 | if (is_bdevvp) { |
7084 | printf("%s: alias vnode (vid = %u) is in state of change (start) v_flags = 0x%x v_numoutput = %d\n" , |
7085 | __func__, vp->v_id, vp->v_flag, vp->v_numoutput); |
7086 | } |
7087 | |
7088 | vnode_hold(vp); |
7089 | vp->v_lflag |= VL_OPSCHANGE; |
vclean(vp, 0);
7091 | vp->v_op = param->vnfs_vops; |
7092 | vp->v_type = (uint16_t)param->vnfs_vtype; |
7093 | vp->v_data = param->vnfs_fsnode; |
7094 | vp->v_lflag = VL_OPSCHANGE; |
7095 | vp->v_mount = NULL; |
insmntque(vp, param->vnfs_mp);
7097 | insert = 0; |
7098 | |
7099 | if (is_bdevvp) { |
7100 | printf("%s: alias vnode (vid = %u), is in state of change (end) v_flags = 0x%x v_numoutput = %d\n" , |
7101 | __func__, vp->v_id, vp->v_flag, vp->v_numoutput); |
7102 | } |
7103 | |
7104 | vnode_drop_and_unlock(vp); |
wakeup(&vp->v_lflag); /* chkvnlock is waiting for VL_DEAD to get unset */
7106 | } |
7107 | |
7108 | if (VCHR == vp->v_type) { |
7109 | u_int maj = major(vp->v_rdev); |
7110 | |
7111 | if (maj < (u_int)nchrdev && cdevsw[maj].d_type == D_TTY) { |
7112 | vp->v_flag |= VISTTY; |
7113 | } |
7114 | } |
7115 | } |
7116 | |
7117 | if (vp->v_type == VFIFO) { |
7118 | struct fifoinfo *fip; |
7119 | |
7120 | fip = kalloc_type(struct fifoinfo, Z_WAITOK | Z_ZERO); |
7121 | vp->v_fifoinfo = fip; |
7122 | } |
/* The file system must pass the address of the location where
* it stores the vnode pointer. Once we add the vnode to the mount
* list and name cache it becomes discoverable, so the file system
* node must have its connection to the vnode set up by then.
*/
7128 | *vpp = vp; |
7129 | |
7130 | /* Add fs named reference. */ |
7131 | if (param->vnfs_flags & VNFS_ADDFSREF) { |
7132 | vp->v_lflag |= VNAMED_FSHASH; |
7133 | } |
7134 | if (param->vnfs_mp) { |
7135 | if (param->vnfs_mp->mnt_kern_flag & MNTK_LOCK_LOCAL) { |
7136 | vp->v_flag |= VLOCKLOCAL; |
7137 | } |
7138 | if (insert) { |
7139 | if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb)) { |
7140 | panic("insmntque: vp on the free list" ); |
7141 | } |
7142 | |
7143 | /* |
7144 | * enter in mount vnode list |
7145 | */ |
insmntque(vp, param->vnfs_mp);
7147 | } |
7148 | } |
if (dvp && vnode_ref(dvp) == 0) {
7150 | vp->v_parent = dvp; |
7151 | } |
7152 | if (cnp) { |
7153 | if (dvp && ((param->vnfs_flags & (VNFS_NOCACHE | VNFS_CANTCACHE)) == 0)) { |
7154 | /* |
7155 | * enter into name cache |
7156 | * we've got the info to enter it into the name cache now |
7157 | * cache_enter_create will pick up an extra reference on |
7158 | * the name entered into the string cache |
7159 | */ |
7160 | vp->v_name = cache_enter_create(dvp, vp, cnp); |
7161 | } else { |
vp->v_name = vfs_addname(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, 0);
7163 | } |
7164 | |
7165 | if ((cnp->cn_flags & UNIONCREATED) == UNIONCREATED) { |
7166 | vp->v_flag |= VISUNION; |
7167 | } |
7168 | } |
7169 | if ((param->vnfs_flags & VNFS_CANTCACHE) == 0) { |
7170 | /* |
7171 | * this vnode is being created as cacheable in the name cache |
7172 | * this allows us to re-enter it in the cache |
7173 | */ |
7174 | vp->v_flag |= VNCACHEABLE; |
7175 | } |
7176 | ut = current_uthread(); |
7177 | |
7178 | if ((current_proc()->p_lflag & P_LRAGE_VNODES) || |
7179 | (ut->uu_flag & (UT_RAGE_VNODES | UT_KERN_RAGE_VNODES))) { |
7180 | /* |
7181 | * process has indicated that it wants any |
7182 | * vnodes created on its behalf to be rapidly |
7183 | * aged to reduce the impact on the cached set |
7184 | * of vnodes |
7185 | * |
7186 | * if UT_KERN_RAGE_VNODES is set, then the |
7187 | * kernel internally wants vnodes to be rapidly |
7188 | * aged, even if the process hasn't requested |
7189 | * this |
7190 | */ |
7191 | vp->v_flag |= VRAGE; |
7192 | } |
7193 | |
7194 | #if CONFIG_SECLUDED_MEMORY |
7195 | switch (secluded_for_filecache) { |
7196 | case SECLUDED_FILECACHE_NONE: |
7197 | /* |
7198 | * secluded_for_filecache == 0: |
7199 | * + no file contents in secluded pool |
7200 | */ |
7201 | break; |
7202 | case SECLUDED_FILECACHE_APPS: |
7203 | /* |
7204 | * secluded_for_filecache == 1: |
7205 | * + no files from / |
7206 | * + files from /Applications/ are OK |
7207 | * + files from /Applications/Camera are not OK |
7208 | * + no files that are open for write |
7209 | */ |
7210 | if (vnode_vtype(vp) == VREG && |
7211 | vnode_mount(vp) != NULL && |
7212 | (!(vfs_flags(vnode_mount(vp)) & MNT_ROOTFS))) { |
7213 | /* not from root filesystem: eligible for secluded pages */ |
7214 | memory_object_mark_eligible_for_secluded( |
7215 | ubc_getobject(vp, UBC_FLAGS_NONE), |
7216 | TRUE); |
7217 | } |
7218 | break; |
7219 | case SECLUDED_FILECACHE_RDONLY: |
7220 | /* |
7221 | * secluded_for_filecache == 2: |
7222 | * + all read-only files OK, except: |
7223 | * + dyld_shared_cache_arm64* |
7224 | * + Camera |
7225 | * + mediaserverd |
7226 | */ |
7227 | if (vnode_vtype(vp) == VREG) { |
7228 | memory_object_mark_eligible_for_secluded( |
7229 | ubc_getobject(vp, UBC_FLAGS_NONE), |
7230 | TRUE); |
7231 | } |
7232 | break; |
7233 | default: |
7234 | break; |
7235 | } |
7236 | #endif /* CONFIG_SECLUDED_MEMORY */ |
7237 | |
7238 | if (is_bdevvp) { |
/*
* The v_flag and v_lflag fields for the vnode above are
* manipulated without the vnode lock. This is fine for
* everything because no other use of this vnode is occurring.
* However, the case of the bdevvp alias vnode reuse is different:
* the flags end up being modified while a thread may be in
* vnode_waitforwrites, which sets VTHROTTLED, and any one of the
* non-atomic modifications of v_flag in this function can race
* with the setting of that flag and cause the VTHROTTLED bit in
* v_flag to get "lost".
*
* This should ideally be fixed by making sure all modifications
* in this function to the vnode flags are done under the
* vnode lock, but at this time a much smaller workaround is
* being employed and the more correct (and potentially
* much bigger) change will follow later.
*
* The effect of "losing" the VTHROTTLED flag would be a lost
* wakeup, so we just issue that wakeup here since this happens
* only once per bdevvp vnode, of which there are only one or two
* for a given boot.
*/
wakeup(&vp->v_numoutput);

/*
* now make sure the flags that we were supposed to set aren't
* lost.
*/
7267 | vnode_lock_spin(vp); |
7268 | if (param->vnfs_flags & VNFS_ADDFSREF) { |
7269 | vp->v_lflag |= VNAMED_FSHASH; |
7270 | } |
7271 | if (param->vnfs_mp && (param->vnfs_mp->mnt_kern_flag & MNTK_LOCK_LOCAL)) { |
7272 | vp->v_flag |= VLOCKLOCAL; |
7273 | } |
7274 | if ((param->vnfs_flags & VNFS_CANTCACHE) == 0) { |
7275 | vp->v_flag |= VNCACHEABLE; |
7276 | } |
7277 | vnode_unlock(vp); |
7278 | } |
7279 | |
7280 | return 0; |
7281 | |
7282 | error_out: |
7283 | if (existing_vnode) { |
7284 | vnode_put(vp); |
7285 | } |
7286 | return error; |
7287 | } |
7288 | |
7289 | int |
7290 | vnode_create_ext(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp, vnode_create_options_t vc_options) |
7291 | { |
7292 | if (vc_options & ~(VNODE_CREATE_EMPTY | VNODE_CREATE_NODEALLOC)) { |
7293 | return EINVAL; |
7294 | } |
7295 | *vpp = NULLVP; |
7296 | return vnode_create_internal(flavor, size, data, vpp, vc_options); |
7297 | } |
7298 | |
/* USAGE:
* The following API creates a vnode, associates all the parameters specified in the
* vnode_fsparam structure, and returns a vnode handle with a reference. Device aliasing
* is handled here, so checkalias is obsoleted by this.
*/
7304 | int |
7305 | vnode_create(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp) |
7306 | { |
return vnode_create_ext(flavor, size, data, vpp, VNODE_CREATE_NODEALLOC);
7308 | } |
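
/*
* Example (illustrative sketch; the "myfs" names are hypothetical and
* not part of this file): a filesystem typically fills in a
* vnode_fsparam and calls vnode_create() when materializing a vnode
* for one of its nodes:
*
*	struct vnode_fsparam vfsp;
*	vnode_t vp = NULLVP;
*	int error;
*
*	bzero(&vfsp, sizeof(vfsp));
*	vfsp.vnfs_mp = mp;
*	vfsp.vnfs_vtype = VREG;
*	vfsp.vnfs_str = "myfs";
*	vfsp.vnfs_dvp = dvp;
*	vfsp.vnfs_fsnode = mynode;
*	vfsp.vnfs_vops = myfs_vnodeop_p;
*	vfsp.vnfs_filesize = myfs_node_size(mynode);
*	vfsp.vnfs_cnp = cnp;
*	vfsp.vnfs_flags = VNFS_ADDFSREF;
*
*	error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &vp);
*
* On success the caller holds an iocount on vp and releases it with
* vnode_put() when done with the vnode.
*/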
7309 | |
7310 | int |
7311 | vnode_create_empty(vnode_t *vpp) |
7312 | { |
return vnode_create_ext(VNCREATE_FLAVOR, VCREATESIZE, NULL,
vpp, VNODE_CREATE_EMPTY);
7315 | } |
7316 | |
7317 | int |
7318 | vnode_initialize(uint32_t __unused flavor, uint32_t size, void *data, vnode_t *vpp) |
7319 | { |
7320 | if (*vpp == NULLVP) { |
7321 | panic("NULL vnode passed to vnode_initialize" ); |
7322 | } |
7323 | #if DEVELOPMENT || DEBUG |
7324 | /* |
7325 | * We lock to check that vnode is fit for unlocked use in |
7326 | * vnode_create_internal. |
7327 | */ |
7328 | vnode_lock_spin(*vpp); |
VNASSERT(((*vpp)->v_iocount == 1), *vpp,
("vnode_initialize : iocount not 1, is %d", (*vpp)->v_iocount));
VNASSERT(((*vpp)->v_usecount == 0), *vpp,
("vnode_initialize : usecount not 0, is %d", (*vpp)->v_usecount));
VNASSERT(((*vpp)->v_lflag & VL_DEAD), *vpp,
("vnode_initialize : v_lflag does not have VL_DEAD, is 0x%x",
(*vpp)->v_lflag));
VNASSERT(((*vpp)->v_data == NULL), *vpp,
("vnode_initialize : v_data not NULL"));
7338 | vnode_unlock(*vpp); |
7339 | #endif |
7340 | return vnode_create_internal(flavor, size, data, vpp, VNODE_CREATE_DEFAULT); |
7341 | } |
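
/*
* Example (illustrative sketch): the two-step creation path; the empty
* vnode comes back marked VL_DEAD with an iocount held, and a later
* vnode_initialize() with the filled-in vnode_fsparam (vfsp below)
* brings it to life:
*
*	vnode_t vp = NULLVP;
*	int error;
*
*	error = vnode_create_empty(&vp);
*	if (error == 0) {
*		... record vp in the fs node, fill in vfsp ...
*		error = vnode_initialize(VNCREATE_FLAVOR, VCREATESIZE,
*		    &vfsp, &vp);
*	}
*/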
7342 | |
7343 | int |
7344 | vnode_addfsref(vnode_t vp) |
7345 | { |
7346 | vnode_lock_spin(vp); |
if (vp->v_lflag & VNAMED_FSHASH) {
panic("add_fsref: vp already has named reference");
}
if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb)) {
panic("addfsref: vp on the free list");
}
7353 | vp->v_lflag |= VNAMED_FSHASH; |
7354 | vnode_unlock(vp); |
7355 | return 0; |
7356 | } |
7357 | int |
7358 | vnode_removefsref(vnode_t vp) |
7359 | { |
7360 | vnode_lock_spin(vp); |
7361 | if ((vp->v_lflag & VNAMED_FSHASH) == 0) { |
7362 | panic("remove_fsref: no named reference" ); |
7363 | } |
7364 | vp->v_lflag &= ~VNAMED_FSHASH; |
7365 | vnode_unlock(vp); |
7366 | return 0; |
7367 | } |
7368 | |
7369 | |
7370 | int |
7371 | vfs_iterate(int flags, int (*callout)(mount_t, void *), void *arg) |
7372 | { |
7373 | mount_t mp; |
7374 | int ret = 0; |
7375 | fsid_t * fsid_list; |
7376 | int count, actualcount, i; |
7377 | void * allocmem; |
7378 | int indx_start, indx_stop, indx_incr; |
7379 | int cb_dropref = (flags & VFS_ITERATE_CB_DROPREF); |
7380 | int noskip_unmount = (flags & VFS_ITERATE_NOSKIP_UNMOUNT); |
7381 | |
7382 | count = mount_getvfscnt(); |
7383 | count += 10; |
7384 | |
7385 | fsid_list = kalloc_data(count * sizeof(fsid_t), Z_WAITOK); |
7386 | allocmem = (void *)fsid_list; |
7387 | |
actualcount = mount_fillfsids(fsid_list, count);
7389 | |
7390 | /* |
7391 | * Establish the iteration direction |
7392 | * VFS_ITERATE_TAIL_FIRST overrides default head first order (oldest first) |
7393 | */ |
7394 | if (flags & VFS_ITERATE_TAIL_FIRST) { |
7395 | indx_start = actualcount - 1; |
7396 | indx_stop = -1; |
7397 | indx_incr = -1; |
7398 | } else { /* Head first by default */ |
7399 | indx_start = 0; |
7400 | indx_stop = actualcount; |
7401 | indx_incr = 1; |
7402 | } |
7403 | |
7404 | for (i = indx_start; i != indx_stop; i += indx_incr) { |
7405 | /* obtain the mount point with iteration reference */ |
7406 | mp = mount_list_lookupby_fsid(&fsid_list[i], 0, 1); |
7407 | |
7408 | if (mp == (struct mount *)0) { |
7409 | continue; |
7410 | } |
7411 | mount_lock(mp); |
7412 | if ((mp->mnt_lflag & MNT_LDEAD) || |
7413 | (!noskip_unmount && (mp->mnt_lflag & MNT_LUNMOUNT))) { |
7414 | mount_unlock(mp); |
7415 | mount_iterdrop(mp); |
7416 | continue; |
7417 | } |
7418 | mount_unlock(mp); |
7419 | |
7420 | /* iterate over all the vnodes */ |
7421 | ret = callout(mp, arg); |
7422 | |
7423 | /* |
7424 | * Drop the iterref here if the callback didn't do it. |
7425 | * Note: If cb_dropref is set the mp may no longer exist. |
7426 | */ |
7427 | if (!cb_dropref) { |
7428 | mount_iterdrop(mp); |
7429 | } |
7430 | |
7431 | switch (ret) { |
7432 | case VFS_RETURNED: |
7433 | case VFS_RETURNED_DONE: |
7434 | if (ret == VFS_RETURNED_DONE) { |
7435 | ret = 0; |
7436 | goto out; |
7437 | } |
7438 | break; |
7439 | |
7440 | case VFS_CLAIMED_DONE: |
7441 | ret = 0; |
7442 | goto out; |
7443 | case VFS_CLAIMED: |
7444 | default: |
7445 | break; |
7446 | } |
7447 | ret = 0; |
7448 | } |
7449 | |
7450 | out: |
7451 | kfree_data(allocmem, count * sizeof(fsid_t)); |
7452 | return ret; |
7453 | } |
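
/*
* Example (illustrative sketch): a minimal vfs_iterate() callout that
* counts mounts; returning VFS_RETURNED continues the iteration and
* VFS_RETURNED_DONE stops it early:
*
*	static int
*	my_mount_callout(mount_t mp, void *arg)
*	{
*		int *countp = (int *)arg;
*
*		(*countp)++;
*		return VFS_RETURNED;
*	}
*
*	int nmounts = 0;
*	(void)vfs_iterate(0, my_mount_callout, &nmounts);
*/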
7454 | |
7455 | /* |
7456 | * Update the vfsstatfs structure in the mountpoint. |
7457 | * MAC: Parameter eventtype added, indicating whether the event that |
7458 | * triggered this update came from user space, via a system call |
7459 | * (VFS_USER_EVENT) or an internal kernel call (VFS_KERNEL_EVENT). |
7460 | */ |
7461 | int |
7462 | vfs_update_vfsstat(mount_t mp, vfs_context_t ctx, __unused int eventtype) |
7463 | { |
7464 | struct vfs_attr va; |
7465 | int error; |
7466 | |
7467 | /* |
7468 | * Request the attributes we want to propagate into |
7469 | * the per-mount vfsstat structure. |
7470 | */ |
7471 | VFSATTR_INIT(&va); |
7472 | VFSATTR_WANTED(&va, f_iosize); |
7473 | VFSATTR_WANTED(&va, f_blocks); |
7474 | VFSATTR_WANTED(&va, f_bfree); |
7475 | VFSATTR_WANTED(&va, f_bavail); |
7476 | VFSATTR_WANTED(&va, f_bused); |
7477 | VFSATTR_WANTED(&va, f_files); |
7478 | VFSATTR_WANTED(&va, f_ffree); |
7479 | VFSATTR_WANTED(&va, f_bsize); |
7480 | VFSATTR_WANTED(&va, f_fssubtype); |
7481 | |
if ((error = vfs_getattr(mp, &va, ctx)) != 0) {
KAUTH_DEBUG("STAT - filesystem returned error %d", error);
7484 | return error; |
7485 | } |
7486 | #if CONFIG_MACF |
7487 | if (eventtype == VFS_USER_EVENT) { |
error = mac_mount_check_getattr(ctx, mp, &va);
7489 | if (error != 0) { |
7490 | return error; |
7491 | } |
7492 | } |
7493 | #endif |
7494 | /* |
7495 | * Unpack into the per-mount structure. |
7496 | * |
7497 | * We only overwrite these fields, which are likely to change: |
7498 | * f_blocks |
7499 | * f_bfree |
7500 | * f_bavail |
7501 | * f_bused |
7502 | * f_files |
7503 | * f_ffree |
7504 | * |
7505 | * And these which are not, but which the FS has no other way |
7506 | * of providing to us: |
7507 | * f_bsize |
7508 | * f_iosize |
7509 | * f_fssubtype |
7510 | * |
7511 | */ |
7512 | if (VFSATTR_IS_SUPPORTED(&va, f_bsize)) { |
7513 | /* 4822056 - protect against malformed server mount */ |
7514 | mp->mnt_vfsstat.f_bsize = (va.f_bsize > 0 ? va.f_bsize : 512); |
7515 | } else { |
7516 | mp->mnt_vfsstat.f_bsize = mp->mnt_devblocksize; /* default from the device block size */ |
7517 | } |
7518 | if (VFSATTR_IS_SUPPORTED(&va, f_iosize)) { |
7519 | mp->mnt_vfsstat.f_iosize = va.f_iosize; |
7520 | } else { |
7521 | mp->mnt_vfsstat.f_iosize = 1024 * 1024; /* 1MB sensible I/O size */ |
7522 | } |
7523 | if (VFSATTR_IS_SUPPORTED(&va, f_blocks)) { |
7524 | mp->mnt_vfsstat.f_blocks = va.f_blocks; |
7525 | } |
7526 | if (VFSATTR_IS_SUPPORTED(&va, f_bfree)) { |
7527 | mp->mnt_vfsstat.f_bfree = va.f_bfree; |
7528 | } |
7529 | if (VFSATTR_IS_SUPPORTED(&va, f_bavail)) { |
7530 | mp->mnt_vfsstat.f_bavail = va.f_bavail; |
7531 | } |
7532 | if (VFSATTR_IS_SUPPORTED(&va, f_bused)) { |
7533 | mp->mnt_vfsstat.f_bused = va.f_bused; |
7534 | } |
7535 | if (VFSATTR_IS_SUPPORTED(&va, f_files)) { |
7536 | mp->mnt_vfsstat.f_files = va.f_files; |
7537 | } |
7538 | if (VFSATTR_IS_SUPPORTED(&va, f_ffree)) { |
7539 | mp->mnt_vfsstat.f_ffree = va.f_ffree; |
7540 | } |
7541 | |
7542 | /* this is unlikely to change, but has to be queried for */ |
7543 | if (VFSATTR_IS_SUPPORTED(&va, f_fssubtype)) { |
7544 | mp->mnt_vfsstat.f_fssubtype = va.f_fssubtype; |
7545 | } |
7546 | |
7547 | return 0; |
7548 | } |
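
/*
* Example (illustrative sketch; MYFS_BSIZE is a hypothetical constant):
* the VFSATTR_IS_SUPPORTED() checks above only succeed for fields the
* filesystem's vfs_getattr marked as supplied, typically via
* VFSATTR_RETURN():
*
*	if (VFSATTR_IS_ACTIVE(fsap, f_bsize)) {
*		VFSATTR_RETURN(fsap, f_bsize, MYFS_BSIZE);
*	}
*/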
7549 | |
7550 | int |
7551 | mount_list_add(mount_t mp) |
7552 | { |
7553 | int res; |
7554 | |
7555 | mount_list_lock(); |
7556 | if (get_system_inshutdown() != 0) { |
7557 | res = -1; |
7558 | } else { |
7559 | TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); |
7560 | nummounts++; |
7561 | res = 0; |
7562 | } |
7563 | mount_list_unlock(); |
7564 | |
7565 | return res; |
7566 | } |
7567 | |
7568 | void |
7569 | mount_list_remove(mount_t mp) |
7570 | { |
7571 | mount_list_lock(); |
7572 | TAILQ_REMOVE(&mountlist, mp, mnt_list); |
7573 | nummounts--; |
7574 | mp->mnt_list.tqe_next = NULL; |
7575 | mp->mnt_list.tqe_prev = NULL; |
7576 | mount_list_unlock(); |
7577 | } |
7578 | |
7579 | mount_t |
7580 | mount_lookupby_volfsid(int volfs_id, int withref) |
7581 | { |
7582 | mount_t cur_mount = (mount_t)0; |
7583 | mount_t mp; |
7584 | |
7585 | mount_list_lock(); |
7586 | TAILQ_FOREACH(mp, &mountlist, mnt_list) { |
7587 | if (!(mp->mnt_kern_flag & MNTK_UNMOUNT) && |
7588 | (mp->mnt_kern_flag & MNTK_PATH_FROM_ID) && |
7589 | (mp->mnt_vfsstat.f_fsid.val[0] == volfs_id)) { |
7590 | cur_mount = mp; |
7591 | if (withref) { |
if (mount_iterref(cur_mount, 1)) {
7593 | cur_mount = (mount_t)0; |
7594 | mount_list_unlock(); |
7595 | goto out; |
7596 | } |
7597 | } |
7598 | break; |
7599 | } |
7600 | } |
7601 | mount_list_unlock(); |
7602 | if (withref && (cur_mount != (mount_t)0)) { |
7603 | mp = cur_mount; |
7604 | if (vfs_busy(mp, LK_NOWAIT) != 0) { |
7605 | cur_mount = (mount_t)0; |
7606 | } |
7607 | mount_iterdrop(mp); |
7608 | } |
7609 | out: |
7610 | return cur_mount; |
7611 | } |
7612 | |
7613 | mount_t |
7614 | mount_list_lookupby_fsid(fsid_t *fsid, int locked, int withref) |
7615 | { |
7616 | mount_t retmp = (mount_t)0; |
7617 | mount_t mp; |
7618 | |
7619 | if (!locked) { |
7620 | mount_list_lock(); |
7621 | } |
7622 | TAILQ_FOREACH(mp, &mountlist, mnt_list) |
7623 | if (mp->mnt_vfsstat.f_fsid.val[0] == fsid->val[0] && |
7624 | mp->mnt_vfsstat.f_fsid.val[1] == fsid->val[1]) { |
7625 | retmp = mp; |
7626 | if (withref) { |
if (mount_iterref(retmp, 1)) {
7628 | retmp = (mount_t)0; |
7629 | } |
7630 | } |
7631 | goto out; |
7632 | } |
7633 | out: |
7634 | if (!locked) { |
7635 | mount_list_unlock(); |
7636 | } |
7637 | return retmp; |
7638 | } |
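
/*
* Example (illustrative sketch): callers passing withref != 0 receive
* the mount with an iteration reference and must drop it themselves,
* just as vfs_iterate() does above:
*
*	mount_t mp = mount_list_lookupby_fsid(&fsid, 0, 1);
*
*	if (mp != NULL) {
*		... inspect mp ...
*		mount_iterdrop(mp);
*	}
*/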
7639 | |
7640 | errno_t |
7641 | vnode_lookupat(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx, |
7642 | vnode_t start_dvp) |
7643 | { |
7644 | struct nameidata *ndp; |
7645 | int error = 0; |
7646 | u_int32_t ndflags = 0; |
7647 | |
7648 | if (ctx == NULL) { |
7649 | return EINVAL; |
7650 | } |
7651 | |
7652 | ndp = kalloc_type(struct nameidata, Z_WAITOK | Z_NOFAIL); |
7653 | |
7654 | if (flags & VNODE_LOOKUP_NOFOLLOW) { |
7655 | ndflags = NOFOLLOW; |
7656 | } else { |
7657 | ndflags = FOLLOW; |
7658 | } |
7659 | |
7660 | if (flags & VNODE_LOOKUP_NOCROSSMOUNT) { |
7661 | ndflags |= NOCROSSMOUNT; |
7662 | } |
7663 | |
7664 | if (flags & VNODE_LOOKUP_CROSSMOUNTNOWAIT) { |
7665 | ndflags |= CN_NBMOUNTLOOK; |
7666 | } |
7667 | |
7668 | /* XXX AUDITVNPATH1 needed ? */ |
7669 | NDINIT(ndp, LOOKUP, OP_LOOKUP, ndflags, UIO_SYSSPACE, |
7670 | CAST_USER_ADDR_T(path), ctx); |
7671 | |
7672 | if (start_dvp && (path[0] != '/')) { |
7673 | ndp->ni_dvp = start_dvp; |
7674 | ndp->ni_cnd.cn_flags |= USEDVP; |
7675 | } |
7676 | |
7677 | if ((error = namei(ndp))) { |
7678 | goto out_free; |
7679 | } |
7680 | |
7681 | ndp->ni_cnd.cn_flags &= ~USEDVP; |
7682 | |
7683 | *vpp = ndp->ni_vp; |
7684 | nameidone(ndp); |
7685 | |
7686 | out_free: |
7687 | kfree_type(struct nameidata, ndp); |
7688 | return error; |
7689 | } |
7690 | |
7691 | errno_t |
7692 | vnode_lookup(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx) |
7693 | { |
7694 | return vnode_lookupat(path, flags, vpp, ctx, NULLVP); |
7695 | } |
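
/*
* Example (illustrative sketch): resolving a path to a vnode; on
* success the caller holds an iocount that must be released with
* vnode_put():
*
*	vnode_t vp = NULLVP;
*	int error;
*
*	error = vnode_lookup("/tmp/example", VNODE_LOOKUP_NOFOLLOW,
*	    &vp, vfs_context_current());
*	if (error == 0) {
*		... use vp ...
*		vnode_put(vp);
*	}
*/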
7696 | |
7697 | errno_t |
7698 | vnode_open(const char *path, int fmode, int cmode, int flags, vnode_t *vpp, vfs_context_t ctx) |
7699 | { |
7700 | struct nameidata *ndp = NULL; |
7701 | int error; |
7702 | u_int32_t ndflags = 0; |
7703 | int lflags = flags; |
7704 | |
7705 | if (ctx == NULL) { /* XXX technically an error */ |
7706 | ctx = vfs_context_current(); |
7707 | } |
7708 | |
7709 | ndp = kalloc_type(struct nameidata, Z_WAITOK | Z_NOFAIL); |
7710 | |
7711 | if (fmode & O_NOFOLLOW) { |
7712 | lflags |= VNODE_LOOKUP_NOFOLLOW; |
7713 | } |
7714 | |
7715 | if (lflags & VNODE_LOOKUP_NOFOLLOW) { |
7716 | ndflags = NOFOLLOW; |
7717 | } else { |
7718 | ndflags = FOLLOW; |
7719 | } |
7720 | |
7721 | if (lflags & VNODE_LOOKUP_NOCROSSMOUNT) { |
7722 | ndflags |= NOCROSSMOUNT; |
7723 | } |
7724 | |
7725 | if (lflags & VNODE_LOOKUP_CROSSMOUNTNOWAIT) { |
7726 | ndflags |= CN_NBMOUNTLOOK; |
7727 | } |
7728 | |
7729 | /* XXX AUDITVNPATH1 needed ? */ |
7730 | NDINIT(ndp, LOOKUP, OP_OPEN, ndflags, UIO_SYSSPACE, |
7731 | CAST_USER_ADDR_T(path), ctx); |
7732 | |
7733 | if ((error = vn_open(ndp, fmode, cmode))) { |
7734 | *vpp = NULL; |
7735 | } else { |
7736 | *vpp = ndp->ni_vp; |
7737 | } |
7738 | |
7739 | kfree_type(struct nameidata, ndp); |
7740 | return error; |
7741 | } |
7742 | |
7743 | errno_t |
7744 | vnode_close(vnode_t vp, int flags, vfs_context_t ctx) |
7745 | { |
7746 | int error; |
7747 | |
7748 | if (ctx == NULL) { |
7749 | ctx = vfs_context_current(); |
7750 | } |
7751 | |
7752 | error = vn_close(vp, flags, ctx); |
7753 | vnode_put(vp); |
7754 | return error; |
7755 | } |
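
/*
* Example (illustrative sketch): a vnode_open()/vnode_close() pair.
* Note that vnode_close() drops the iocount via vnode_put() itself,
* so no separate vnode_put() is needed after a successful open/close:
*
*	vnode_t vp = NULLVP;
*	int error;
*
*	error = vnode_open("/tmp/example", FREAD, 0, 0, &vp, ctx);
*	if (error == 0) {
*		... read from vp ...
*		error = vnode_close(vp, FREAD, ctx);
*	}
*/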
7756 | |
7757 | errno_t |
7758 | vnode_mtime(vnode_t vp, struct timespec *mtime, vfs_context_t ctx) |
7759 | { |
7760 | struct vnode_attr va; |
7761 | int error; |
7762 | |
7763 | VATTR_INIT(&va); |
7764 | VATTR_WANTED(&va, va_modify_time); |
error = vnode_getattr(vp, &va, ctx);
7766 | if (!error) { |
7767 | *mtime = va.va_modify_time; |
7768 | } |
7769 | return error; |
7770 | } |
7771 | |
7772 | errno_t |
7773 | vnode_flags(vnode_t vp, uint32_t *flags, vfs_context_t ctx) |
7774 | { |
7775 | struct vnode_attr va; |
7776 | int error; |
7777 | |
7778 | VATTR_INIT(&va); |
7779 | VATTR_WANTED(&va, va_flags); |
error = vnode_getattr(vp, &va, ctx);
7781 | if (!error) { |
7782 | *flags = va.va_flags; |
7783 | } |
7784 | return error; |
7785 | } |
7786 | |
7787 | /* |
7788 | * Returns: 0 Success |
7789 | * vnode_getattr:??? |
7790 | */ |
7791 | errno_t |
7792 | vnode_size(vnode_t vp, off_t *sizep, vfs_context_t ctx) |
7793 | { |
7794 | struct vnode_attr va; |
7795 | int error; |
7796 | |
7797 | VATTR_INIT(&va); |
7798 | VATTR_WANTED(&va, va_data_size); |
error = vnode_getattr(vp, &va, ctx);
7800 | if (!error) { |
7801 | *sizep = va.va_data_size; |
7802 | } |
7803 | return error; |
7804 | } |
7805 | |
7806 | errno_t |
7807 | vnode_setsize(vnode_t vp, off_t size, int ioflag, vfs_context_t ctx) |
7808 | { |
7809 | struct vnode_attr va; |
7810 | |
7811 | VATTR_INIT(&va); |
7812 | VATTR_SET(&va, va_data_size, size); |
7813 | va.va_vaflags = ioflag & 0xffff; |
return vnode_setattr(vp, &va, ctx);
7815 | } |
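
/*
* Example (illustrative sketch): the wrappers above all follow the
* same VATTR_INIT/VATTR_WANTED/vnode_getattr pattern; fetching any
* other single attribute looks the same:
*
*	struct vnode_attr va;
*
*	VATTR_INIT(&va);
*	VATTR_WANTED(&va, va_uid);
*	error = vnode_getattr(vp, &va, ctx);
*	if (!error && VATTR_IS_SUPPORTED(&va, va_uid)) {
*		... use va.va_uid ...
*	}
*/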
7816 | |
7817 | int |
7818 | vnode_setdirty(vnode_t vp) |
7819 | { |
7820 | vnode_lock_spin(vp); |
7821 | vp->v_flag |= VISDIRTY; |
7822 | vnode_unlock(vp); |
7823 | return 0; |
7824 | } |
7825 | |
7826 | int |
7827 | vnode_cleardirty(vnode_t vp) |
7828 | { |
7829 | vnode_lock_spin(vp); |
7830 | vp->v_flag &= ~VISDIRTY; |
7831 | vnode_unlock(vp); |
7832 | return 0; |
7833 | } |
7834 | |
7835 | int |
7836 | vnode_isdirty(vnode_t vp) |
7837 | { |
7838 | int dirty; |
7839 | |
7840 | vnode_lock_spin(vp); |
7841 | dirty = (vp->v_flag & VISDIRTY) ? 1 : 0; |
7842 | vnode_unlock(vp); |
7843 | |
7844 | return dirty; |
7845 | } |
7846 | |
7847 | static int |
7848 | vn_create_reg(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx) |
7849 | { |
7850 | /* Only use compound VNOP for compound operation */ |
if (vnode_compound_open_available(dvp) && ((flags & VN_CREATE_DOOPEN) != 0)) {
*vpp = NULLVP;
return VNOP_COMPOUND_OPEN(dvp, vpp, ndp, O_CREAT, fmode, statusp, vap, ctx);
7854 | } else { |
7855 | return VNOP_CREATE(dvp, vpp, &ndp->ni_cnd, vap, ctx); |
7856 | } |
7857 | } |
7858 | |
7859 | /* |
7860 | * Create a filesystem object of arbitrary type with arbitrary attributes in |
* the specified directory with the specified name.
7862 | * |
7863 | * Parameters: dvp Pointer to the vnode of the directory |
7864 | * in which to create the object. |
7865 | * vpp Pointer to the area into which to |
7866 | * return the vnode of the created object. |
7867 | * cnp Component name pointer from the namei |
7868 | * data structure, containing the name to |
7869 | * use for the create object. |
7870 | * vap Pointer to the vnode_attr structure |
7871 | * describing the object to be created, |
7872 | * including the type of object. |
7873 | * flags VN_* flags controlling ACL inheritance |
7874 | * and whether or not authorization is to |
7875 | * be required for the operation. |
7876 | * |
7877 | * Returns: 0 Success |
7878 | * !0 errno value |
7879 | * |
* Implicit: *vpp Contains the vnode of the object that
* was created, if successful.
* *cnp May be modified by the underlying VFS.
* *vap May be modified by the underlying VFS;
* modified by either ACL inheritance or
* the underlying filesystem, and may
* be modified, even if the operation is
* unsuccessful.
7890 | * Notes: The kauth_filesec_t in 'vap', if any, is in host byte order. |
7891 | * |
7892 | * Modification of '*cnp' and '*vap' by the underlying VFS is |
7893 | * strongly discouraged. |
7894 | * |
7895 | * XXX: This function is a 'vn_*' function; it belongs in vfs_vnops.c |
7896 | * |
* XXX: We should enumerate the possible errno values here, and where
7898 | * in the code they originated. |
7899 | */ |
7900 | errno_t |
7901 | vn_create(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx) |
7902 | { |
7903 | errno_t error, old_error; |
7904 | vnode_t vp = (vnode_t)0; |
7905 | boolean_t batched; |
7906 | struct componentname *cnp; |
7907 | uint32_t defaulted; |
7908 | |
7909 | cnp = &ndp->ni_cnd; |
7910 | error = 0; |
batched = namei_compound_available(dvp, ndp) ? TRUE : FALSE;
7912 | |
7913 | KAUTH_DEBUG("%p CREATE - '%s'" , dvp, cnp->cn_nameptr); |
7914 | |
7915 | if (flags & VN_CREATE_NOINHERIT) { |
7916 | vap->va_vaflags |= VA_NOINHERIT; |
7917 | } |
7918 | if (flags & VN_CREATE_NOAUTH) { |
7919 | vap->va_vaflags |= VA_NOAUTH; |
7920 | } |
7921 | /* |
7922 | * Handle ACL inheritance, initialize vap. |
7923 | */ |
error = vn_attribute_prepare(dvp, vap, &defaulted, ctx);
7925 | if (error) { |
7926 | return error; |
7927 | } |
7928 | |
if (vap->va_type != VREG && (fmode != 0 || (flags & VN_CREATE_DOOPEN) || statusp)) {
panic("Open parameters, but not a regular file.");
}
if ((fmode != 0) && ((flags & VN_CREATE_DOOPEN) == 0)) {
panic("Mode for open, but not trying to open...");
}
7935 | |
7936 | |
7937 | /* |
7938 | * Create the requested node. |
7939 | */ |
7940 | switch (vap->va_type) { |
7941 | case VREG: |
7942 | error = vn_create_reg(dvp, vpp, ndp, vap, flags, fmode, statusp, ctx); |
7943 | break; |
7944 | case VDIR: |
7945 | error = vn_mkdir(dvp, vpp, ndp, vap, ctx); |
7946 | break; |
7947 | case VSOCK: |
7948 | case VFIFO: |
7949 | case VBLK: |
7950 | case VCHR: |
7951 | error = VNOP_MKNOD(dvp, vpp, cnp, vap, ctx); |
7952 | break; |
7953 | default: |
7954 | panic("vnode_create: unknown vtype %d" , vap->va_type); |
7955 | } |
7956 | if (error != 0) { |
7957 | KAUTH_DEBUG("%p CREATE - error %d returned by filesystem" , dvp, error); |
7958 | goto out; |
7959 | } |
7960 | |
7961 | vp = *vpp; |
7962 | old_error = error; |
7963 | |
7964 | /* |
7965 | * If some of the requested attributes weren't handled by the VNOP, |
7966 | * use our fallback code. |
7967 | */ |
7968 | if ((error == 0) && !VATTR_ALL_SUPPORTED(vap) && *vpp) { |
7969 | KAUTH_DEBUG(" CREATE - doing fallback with ACL %p" , vap->va_acl); |
7970 | error = vnode_setattr_fallback(vp: *vpp, vap, ctx); |
7971 | } |
7972 | |
7973 | #if CONFIG_MACF |
7974 | if ((error == 0) && !(flags & VN_CREATE_NOLABEL)) { |
error = vnode_label(vnode_mount(vp), dvp, vp, cnp, VNODE_LABEL_CREATE, ctx);
7976 | } |
7977 | #endif |
7978 | |
7979 | if ((error != 0) && (vp != (vnode_t)0)) { |
7980 | /* If we've done a compound open, close */ |
7981 | if (batched && (old_error == 0) && (vap->va_type == VREG)) { |
7982 | VNOP_CLOSE(vp, fmode, ctx); |
7983 | } |
7984 | |
7985 | /* Need to provide notifications if a create succeeded */ |
7986 | if (!batched) { |
7987 | *vpp = (vnode_t) 0; |
7988 | vnode_put(vp); |
7989 | vp = NULLVP; |
7990 | } |
7991 | } |
7992 | |
7993 | /* |
7994 | * For creation VNOPs, this is the equivalent of |
7995 | * lookup_handle_found_vnode. |
7996 | */ |
7997 | if (kdebug_enable && *vpp) { |
kdebug_lookup(*vpp, cnp);
7999 | } |
8000 | |
8001 | out: |
vn_attribute_cleanup(vap, defaulted);
8003 | |
8004 | return error; |
8005 | } |
8006 | |
8007 | static kauth_scope_t vnode_scope; |
8008 | static int vnode_authorize_callback(kauth_cred_t credential, void *idata, kauth_action_t action, |
8009 | uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3); |
8010 | static int vnode_authorize_callback_int(kauth_action_t action, vfs_context_t ctx, |
8011 | vnode_t vp, vnode_t dvp, int *errorp); |
8012 | |
8013 | typedef struct _vnode_authorize_context { |
8014 | vnode_t vp; |
8015 | struct vnode_attr *vap; |
8016 | vnode_t dvp; |
8017 | struct vnode_attr *dvap; |
8018 | vfs_context_t ctx; |
8019 | int flags; |
8020 | int flags_valid; |
8021 | #define _VAC_IS_OWNER (1<<0) |
8022 | #define _VAC_IN_GROUP (1<<1) |
8023 | #define _VAC_IS_DIR_OWNER (1<<2) |
8024 | #define _VAC_IN_DIR_GROUP (1<<3) |
8025 | #define _VAC_NO_VNODE_POINTERS (1<<4) |
8026 | } *vauth_ctx; |
8027 | |
8028 | void |
8029 | vnode_authorize_init(void) |
8030 | { |
vnode_scope = kauth_register_scope(KAUTH_SCOPE_VNODE, vnode_authorize_callback, NULL);
8032 | } |
8033 | |
8034 | #define VATTR_PREPARE_DEFAULTED_UID 0x1 |
8035 | #define VATTR_PREPARE_DEFAULTED_GID 0x2 |
8036 | #define VATTR_PREPARE_DEFAULTED_MODE 0x4 |
8037 | |
8038 | int |
8039 | vn_attribute_prepare(vnode_t dvp, struct vnode_attr *vap, uint32_t *defaulted_fieldsp, vfs_context_t ctx) |
8040 | { |
8041 | kauth_acl_t nacl = NULL, oacl = NULL; |
8042 | int error; |
8043 | |
8044 | /* |
8045 | * Handle ACL inheritance. |
8046 | */ |
8047 | if (!(vap->va_vaflags & VA_NOINHERIT) && vfs_extendedsecurity(dvp->v_mount)) { |
8048 | /* save the original filesec */ |
8049 | if (VATTR_IS_ACTIVE(vap, va_acl)) { |
8050 | oacl = vap->va_acl; |
8051 | } |
8052 | |
8053 | vap->va_acl = NULL; |
if ((error = kauth_acl_inherit(dvp,
oacl,
&nacl,
vap->va_type == VDIR,
ctx)) != 0) {
KAUTH_DEBUG("%p CREATE - error %d processing inheritance", dvp, error);
8060 | return error; |
8061 | } |
8062 | |
8063 | /* |
8064 | * If the generated ACL is NULL, then we can save ourselves some effort |
8065 | * by clearing the active bit. |
8066 | */ |
8067 | if (nacl == NULL) { |
8068 | VATTR_CLEAR_ACTIVE(vap, va_acl); |
8069 | } else { |
8070 | vap->va_base_acl = oacl; |
8071 | VATTR_SET(vap, va_acl, nacl); |
8072 | } |
8073 | } |
8074 | |
error = vnode_authattr_new_internal(dvp, vap, (vap->va_vaflags & VA_NOAUTH), defaulted_fieldsp, ctx);
if (error) {
vn_attribute_cleanup(vap, *defaulted_fieldsp);
8078 | } |
8079 | |
8080 | return error; |
8081 | } |
8082 | |
8083 | void |
8084 | vn_attribute_cleanup(struct vnode_attr *vap, uint32_t defaulted_fields) |
8085 | { |
8086 | /* |
8087 | * If the caller supplied a filesec in vap, it has been replaced |
8088 | * now by the post-inheritance copy. We need to put the original back |
8089 | * and free the inherited product. |
8090 | */ |
8091 | kauth_acl_t nacl, oacl; |
8092 | |
8093 | if (VATTR_IS_ACTIVE(vap, va_acl)) { |
8094 | nacl = vap->va_acl; |
8095 | oacl = vap->va_base_acl; |
8096 | |
8097 | if (oacl) { |
8098 | VATTR_SET(vap, va_acl, oacl); |
8099 | vap->va_base_acl = NULL; |
8100 | } else { |
8101 | VATTR_CLEAR_ACTIVE(vap, va_acl); |
8102 | } |
8103 | |
8104 | if (nacl != NULL) { |
8105 | /* |
8106 | * Only free the ACL buffer if 'VA_FILESEC_ACL' is not set as it |
8107 | * should be freed by the caller or it is a post-inheritance copy. |
8108 | */ |
8109 | if (!(vap->va_vaflags & VA_FILESEC_ACL) || |
8110 | (oacl != NULL && nacl != oacl)) { |
kauth_acl_free(nacl);
8112 | } |
8113 | } |
8114 | } |
8115 | |
8116 | if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_MODE) != 0) { |
8117 | VATTR_CLEAR_ACTIVE(vap, va_mode); |
8118 | } |
8119 | if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_GID) != 0) { |
8120 | VATTR_CLEAR_ACTIVE(vap, va_gid); |
8121 | } |
8122 | if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_UID) != 0) { |
8123 | VATTR_CLEAR_ACTIVE(vap, va_uid); |
8124 | } |
8125 | |
8126 | return; |
8127 | } |
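
/*
* Example (illustrative sketch): vn_attribute_prepare() and
* vn_attribute_cleanup() always bracket the creation VNOP, as in
* vn_create() above:
*
*	uint32_t defaulted;
*
*	error = vn_attribute_prepare(dvp, vap, &defaulted, ctx);
*	if (error == 0) {
*		error = VNOP_CREATE(dvp, vpp, cnp, vap, ctx);
*		vn_attribute_cleanup(vap, defaulted);
*	}
*/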
8128 | |
8129 | int |
8130 | vn_authorize_unlink(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, __unused void *reserved) |
8131 | { |
8132 | #if !CONFIG_MACF |
8133 | #pragma unused(cnp) |
8134 | #endif |
8135 | int error = 0; |
8136 | |
8137 | /* |
8138 | * Normally, unlinking of directories is not supported. |
8139 | * However, some file systems may have limited support. |
8140 | */ |
8141 | if ((vp->v_type == VDIR) && |
8142 | !(vp->v_mount->mnt_kern_flag & MNTK_DIR_HARDLINKS)) { |
8143 | return EPERM; /* POSIX */ |
8144 | } |
8145 | |
8146 | /* authorize the delete operation */ |
8147 | #if CONFIG_MACF |
8148 | if (!error) { |
8149 | error = mac_vnode_check_unlink(ctx, dvp, vp, cnp); |
8150 | } |
8151 | #endif /* MAC */ |
8152 | if (!error) { |
8153 | error = vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx); |
8154 | } |
8155 | |
8156 | return error; |
8157 | } |
8158 | |
8159 | int |
8160 | vn_authorize_open_existing(vnode_t vp, struct componentname *cnp, int fmode, vfs_context_t ctx, void *reserved) |
8161 | { |
8162 | /* Open of existing case */ |
8163 | kauth_action_t action; |
8164 | int error = 0; |
if (cnp->cn_ndp == NULL) {
panic("NULL ndp");
}
if (reserved != NULL) {
panic("reserved not NULL.");
}
8171 | |
8172 | #if CONFIG_MACF |
8173 | /* XXX may do duplicate work here, but ignore that for now (idempotent) */ |
if (vfs_flags(vnode_mount(vp)) & MNT_MULTILABEL) {
error = vnode_label(vnode_mount(vp), NULL, vp, NULL, 0, ctx);
8176 | if (error) { |
8177 | return error; |
8178 | } |
8179 | } |
8180 | #endif |
8181 | |
8182 | if (vnode_isdir(vp)) { |
8183 | if ((fmode & (FWRITE | O_TRUNC)) || /* disallow write operations on directories */ |
8184 | ((fmode & FSEARCH) && !(fmode & O_DIRECTORY))) { |
8185 | return EISDIR; |
8186 | } |
8187 | } else { |
8188 | if (fmode & O_DIRECTORY) { |
8189 | return ENOTDIR; |
8190 | } |
8191 | |
8192 | if (vp->v_type == VSOCK && vp->v_tag != VT_FDESC) { |
8193 | return EOPNOTSUPP; /* Operation not supported on socket */ |
8194 | } |
8195 | |
8196 | if (vp->v_type == VLNK && (fmode & O_NOFOLLOW) != 0) { |
8197 | return ELOOP; /* O_NOFOLLOW was specified and the target is a symbolic link */ |
8198 | } |
8199 | |
8200 | if (cnp->cn_ndp->ni_flag & NAMEI_TRAILINGSLASH) { |
8201 | return ENOTDIR; |
8202 | } |
8203 | |
8204 | if (!vnode_isreg(vp) && (fmode & FEXEC)) { |
8205 | return EACCES; |
8206 | } |
8207 | } |
8208 | |
8209 | #if CONFIG_MACF |
8210 | /* If a file being opened is a shadow file containing |
8211 | * namedstream data, ignore the macf checks because it |
8212 | * is a kernel internal file and access should always |
8213 | * be allowed. |
8214 | */ |
8215 | if (!(vnode_isshadow(vp) && vnode_isnamedstream(vp))) { |
error = mac_vnode_check_open(ctx, vp, fmode);
8217 | if (error) { |
8218 | return error; |
8219 | } |
8220 | } |
8221 | #endif |
8222 | |
8223 | /* compute action to be authorized */ |
8224 | action = 0; |
8225 | if (fmode & FREAD) { |
8226 | action |= KAUTH_VNODE_READ_DATA; |
8227 | } |
8228 | if (fmode & (FWRITE | O_TRUNC)) { |
8229 | /* |
8230 | * If we are writing, appending, and not truncating, |
8231 | * indicate that we are appending so that if the |
8232 | * UF_APPEND or SF_APPEND bits are set, we do not deny |
8233 | * the open. |
8234 | */ |
8235 | if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) { |
8236 | action |= KAUTH_VNODE_APPEND_DATA; |
8237 | } else { |
8238 | action |= KAUTH_VNODE_WRITE_DATA; |
8239 | } |
8240 | } |
8241 | if (fmode & (FSEARCH | FEXEC)) { |
8242 | if (vnode_isdir(vp)) { |
8243 | action |= KAUTH_VNODE_SEARCH; |
8244 | } else { |
8245 | action |= KAUTH_VNODE_EXECUTE; |
8246 | } |
8247 | } |
8248 | error = vnode_authorize(vp, NULL, action, ctx); |
8249 | #if NAMEDSTREAMS |
8250 | if (error == EACCES) { |
8251 | /* |
8252 | * Shadow files may exist on-disk with a different UID/GID |
8253 | * than that of the current context. Verify that this file |
8254 | * is really a shadow file. If it was created successfully |
8255 | * then it should be authorized. |
8256 | */ |
8257 | if (vnode_isshadow(vp) && vnode_isnamedstream(vp)) { |
8258 | error = vnode_verifynamedstream(vp); |
8259 | } |
8260 | } |
8261 | #endif |
8262 | |
8263 | return error; |
8264 | } |
8265 | |
8266 | int |
8267 | vn_authorize_create(vnode_t dvp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx, void *reserved) |
8268 | { |
8269 | #if !CONFIG_MACF |
8270 | #pragma unused(vap) |
8271 | #endif |
8272 | /* Creation case */ |
8273 | int error; |
8274 | |
if (cnp->cn_ndp == NULL) {
panic("NULL cn_ndp");
}
if (reserved != NULL) {
panic("reserved not NULL.");
}
8281 | |
8282 | /* Only validate path for creation if we didn't do a complete lookup */ |
8283 | if (cnp->cn_ndp->ni_flag & NAMEI_UNFINISHED) { |
error = lookup_validate_creation_path(cnp->cn_ndp);
8285 | if (error) { |
8286 | return error; |
8287 | } |
8288 | } |
8289 | |
8290 | #if CONFIG_MACF |
8291 | error = mac_vnode_check_create(ctx, dvp, cnp, vap); |
8292 | if (error) { |
8293 | return error; |
8294 | } |
8295 | #endif /* CONFIG_MACF */ |
8296 | |
return vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
8298 | } |
8299 | |
8300 | int |
8301 | vn_authorize_rename(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, |
8302 | struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, |
8303 | vfs_context_t ctx, void *reserved) |
8304 | { |
return vn_authorize_renamex(fdvp, fvp, fcnp, tdvp, tvp, tcnp, ctx, 0, reserved);
8306 | } |
8307 | |
8308 | int |
8309 | vn_authorize_renamex(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, |
8310 | struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, |
8311 | vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved) |
8312 | { |
8313 | return vn_authorize_renamex_with_paths(fdvp, fvp, fcnp, NULL, tdvp, tvp, tcnp, NULL, ctx, flags, reserved); |
8314 | } |
8315 | |
8316 | int |
8317 | vn_authorize_renamex_with_paths(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, const char *from_path, |
8318 | struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, const char *to_path, |
8319 | vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved) |
8320 | { |
8321 | int error = 0; |
8322 | int moving = 0; |
8323 | bool swap = flags & VFS_RENAME_SWAP; |
8324 | |
8325 | if (reserved != NULL) { |
8326 | panic("Passed something other than NULL as reserved field!" ); |
8327 | } |
8328 | |
8329 | /* |
8330 | * Avoid renaming "." and "..". |
8331 | * |
8332 | * XXX No need to check for this in the FS. We should always have the leaves |
8333 | * in VFS in this case. |
8334 | */ |
8335 | if (fvp->v_type == VDIR && |
8336 | ((fdvp == fvp) || |
8337 | (fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || |
8338 | ((fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT))) { |
8339 | error = EINVAL; |
8340 | goto out; |
8341 | } |
8342 | |
if (tvp == NULLVP && vnode_compound_rename_available(tdvp)) {
error = lookup_validate_creation_path(tcnp->cn_ndp);
8345 | if (error) { |
8346 | goto out; |
8347 | } |
8348 | } |
8349 | |
8350 | /***** <MACF> *****/ |
8351 | #if CONFIG_MACF |
error = mac_vnode_check_rename(ctx, fdvp, fvp, fcnp, tdvp, tvp, tcnp);
8353 | if (error) { |
8354 | goto out; |
8355 | } |
8356 | if (swap) { |
error = mac_vnode_check_rename(ctx, tdvp, tvp, tcnp, fdvp, fvp, fcnp);
8358 | if (error) { |
8359 | goto out; |
8360 | } |
8361 | } |
8362 | #endif |
8363 | /***** </MACF> *****/ |
8364 | |
8365 | /***** <MiscChecks> *****/ |
8366 | if (tvp != NULL) { |
8367 | if (!swap) { |
8368 | if (fvp->v_type == VDIR && tvp->v_type != VDIR) { |
8369 | error = ENOTDIR; |
8370 | goto out; |
8371 | } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { |
8372 | error = EISDIR; |
8373 | goto out; |
8374 | } |
8375 | } |
8376 | } else if (swap) { |
8377 | /* |
* Caller should have already checked this and returned
* ENOENT. If we send back ENOENT here, the caller will retry,
* which isn't what we want, so we send back EINVAL
* instead.
8382 | */ |
8383 | error = EINVAL; |
8384 | goto out; |
8385 | } |
8386 | |
8387 | if (fvp == tdvp) { |
8388 | error = EINVAL; |
8389 | goto out; |
8390 | } |
8391 | |
8392 | /* |
8393 | * The following edge case is caught here: |
8394 | * (to cannot be a descendent of from) |
8395 | * |
8396 | * o fdvp |
8397 | * / |
8398 | * / |
8399 | * o fvp |
8400 | * \ |
8401 | * \ |
8402 | * o tdvp |
8403 | * / |
8404 | * / |
8405 | * o tvp |
8406 | */ |
8407 | if (tdvp->v_parent == fvp) { |
8408 | error = EINVAL; |
8409 | goto out; |
8410 | } |
8411 | |
8412 | if (swap && fdvp->v_parent == tvp) { |
8413 | error = EINVAL; |
8414 | goto out; |
8415 | } |
8416 | /***** </MiscChecks> *****/ |
8417 | |
8418 | /***** <Kauth> *****/ |
8419 | |
8420 | /* |
8421 | * As part of the Kauth step, we call out to allow 3rd-party |
8422 | * fileop notification of "about to rename". This is needed |
8423 | * in the event that 3rd-parties need to know that the DELETE |
8424 | * authorization is actually part of a rename. It's important |
8425 | * that we guarantee that the DELETE call-out will always be |
8426 | * made if the WILL_RENAME call-out is made. Another fileop |
8427 | * call-out will be performed once the operation is completed. |
8428 | * We can ignore the result of kauth_authorize_fileop(). |
8429 | * |
8430 | * N.B. We are passing the vnode and *both* paths to each |
8431 | * call; kauth_authorize_fileop() extracts the "from" path |
8432 | * when posting a KAUTH_FILEOP_WILL_RENAME notification. |
8433 | * As such, we only post these notifications if all of the |
8434 | * information we need is provided. |
8435 | */ |
8436 | |
8437 | if (swap) { |
8438 | kauth_action_t f = 0, t = 0; |
8439 | |
8440 | /* |
8441 | * Directories changing parents need ...ADD_SUBDIR... to |
8442 | * permit changing ".." |
8443 | */ |
if (fdvp != tdvp) {
if (vnode_isdir(fvp)) {
f = KAUTH_VNODE_ADD_SUBDIRECTORY;
}
if (vnode_isdir(tvp)) {
t = KAUTH_VNODE_ADD_SUBDIRECTORY;
}
}
if (to_path != NULL) {
kauth_authorize_fileop(vfs_context_ucred(ctx),
KAUTH_FILEOP_WILL_RENAME,
(uintptr_t)fvp,
(uintptr_t)to_path);
}
error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | f, ctx);
if (error) {
goto out;
}
if (from_path != NULL) {
kauth_authorize_fileop(vfs_context_ucred(ctx),
KAUTH_FILEOP_WILL_RENAME,
(uintptr_t)tvp,
(uintptr_t)from_path);
}
error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE | t, ctx);
if (error) {
goto out;
}
f = vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE;
t = vnode_isdir(tvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE;
if (fdvp == tdvp) {
error = vnode_authorize(fdvp, NULL, f | t, ctx);
} else {
error = vnode_authorize(fdvp, NULL, t, ctx);
if (error) {
goto out;
}
error = vnode_authorize(tdvp, NULL, f, ctx);
8482 | } |
8483 | if (error) { |
8484 | goto out; |
8485 | } |
8486 | } else { |
8487 | error = 0; |
if ((tvp != NULL) && vnode_isdir(tvp)) {
8489 | if (tvp != fdvp) { |
8490 | moving = 1; |
8491 | } |
8492 | } else if (tdvp != fdvp) { |
8493 | moving = 1; |
8494 | } |
8495 | |
8496 | /* |
8497 | * must have delete rights to remove the old name even in |
8498 | * the simple case of fdvp == tdvp. |
8499 | * |
* If fvp is a directory, and we are changing its parent,
8501 | * then we also need rights to rewrite its ".." entry as well. |
8502 | */ |
if (to_path != NULL) {
kauth_authorize_fileop(vfs_context_ucred(ctx),
KAUTH_FILEOP_WILL_RENAME,
(uintptr_t)fvp,
(uintptr_t)to_path);
}
if (vnode_isdir(fvp)) {
if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) {
goto out;
}
} else {
if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
goto out;
}
}
if (moving) {
/* moving into tdvp or tvp, must have rights to add */
if ((error = vnode_authorize(((tvp != NULL) && vnode_isdir(tvp)) ? tvp : tdvp,
NULL,
vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE,
ctx)) != 0) {
goto out;
}
} else {
/* node staying in same directory, must be allowed to add new name */
if ((error = vnode_authorize(fdvp, NULL,
vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
goto out;
}
}
/* overwriting tvp */
if ((tvp != NULL) && !vnode_isdir(tvp) &&
((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0)) {
8536 | goto out; |
8537 | } |
8538 | } |
8539 | |
8540 | /***** </Kauth> *****/ |
8541 | |
8542 | /* XXX more checks? */ |
8543 | out: |
8544 | return error; |
8545 | } |
8546 | |
8547 | int |
8548 | vn_authorize_mkdir(vnode_t dvp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx, void *reserved) |
8549 | { |
8550 | #if !CONFIG_MACF |
8551 | #pragma unused(vap) |
8552 | #endif |
8553 | int error; |
8554 | |
8555 | if (reserved != NULL) { |
8556 | panic("reserved not NULL in vn_authorize_mkdir()" ); |
8557 | } |
8558 | |
8559 | /* XXX A hack for now, to make shadow files work */ |
8560 | if (cnp->cn_ndp == NULL) { |
8561 | return 0; |
8562 | } |
8563 | |
if (vnode_compound_mkdir_available(dvp)) {
error = lookup_validate_creation_path(cnp->cn_ndp);
8566 | if (error) { |
8567 | goto out; |
8568 | } |
8569 | } |
8570 | |
8571 | #if CONFIG_MACF |
8572 | error = mac_vnode_check_create(ctx, |
8573 | dvp, cnp, vap); |
8574 | if (error) { |
8575 | goto out; |
8576 | } |
8577 | #endif |
8578 | |
8579 | /* authorize addition of a directory to the parent */ |
if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) {
8581 | goto out; |
8582 | } |
8583 | |
8584 | out: |
8585 | return error; |
8586 | } |
8587 | |
8588 | int |
8589 | vn_authorize_rmdir(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, void *reserved) |
8590 | { |
8591 | #if CONFIG_MACF |
8592 | int error; |
8593 | #else |
8594 | #pragma unused(cnp) |
8595 | #endif |
8596 | if (reserved != NULL) { |
8597 | panic("Non-NULL reserved argument to vn_authorize_rmdir()" ); |
8598 | } |
8599 | |
8600 | if (vp->v_type != VDIR) { |
8601 | /* |
8602 | * rmdir only deals with directories |
8603 | */ |
8604 | return ENOTDIR; |
8605 | } |
8606 | |
8607 | if (dvp == vp) { |
8608 | /* |
8609 | * No rmdir "." please. |
8610 | */ |
8611 | return EINVAL; |
8612 | } |
8613 | |
8614 | #if CONFIG_MACF |
8615 | error = mac_vnode_check_unlink(ctx, dvp, |
8616 | vp, cnp); |
8617 | if (error) { |
8618 | return error; |
8619 | } |
8620 | #endif |
8621 | |
8622 | return vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx); |
8623 | } |
8624 | |
8625 | /* |
8626 | * Authorizer for directory cloning. This does not use vnodes but instead |
8627 | * uses prefilled vnode attributes from the filesystem. |
8628 | * |
8629 | * The same function is called to set up the attributes required, perform the |
8630 | * authorization and cleanup (if required) |
8631 | */ |
8632 | int |
8633 | vnode_attr_authorize_dir_clone(struct vnode_attr *vap, kauth_action_t action, |
8634 | struct vnode_attr *dvap, __unused vnode_t sdvp, mount_t mp, |
8635 | dir_clone_authorizer_op_t vattr_op, uint32_t flags, vfs_context_t ctx, |
8636 | __unused void *reserved) |
8637 | { |
8638 | int error; |
8639 | int is_suser = vfs_context_issuser(ctx); |
8640 | |
8641 | if (vattr_op == OP_VATTR_SETUP) { |
8642 | VATTR_INIT(vap); |
8643 | |
8644 | /* |
* When ACL inheritance is implemented, both vap->va_acl and
8646 | * dvap->va_acl will be required (even as superuser). |
8647 | */ |
8648 | VATTR_WANTED(vap, va_type); |
8649 | VATTR_WANTED(vap, va_mode); |
8650 | VATTR_WANTED(vap, va_flags); |
8651 | VATTR_WANTED(vap, va_uid); |
8652 | VATTR_WANTED(vap, va_gid); |
8653 | if (dvap) { |
8654 | VATTR_INIT(dvap); |
8655 | VATTR_WANTED(dvap, va_flags); |
8656 | } |
8657 | |
8658 | if (!is_suser) { |
8659 | /* |
8660 | * If not superuser, we have to evaluate ACLs and |
8661 | * need the target directory gid to set the initial |
8662 | * gid of the new object. |
8663 | */ |
8664 | VATTR_WANTED(vap, va_acl); |
8665 | if (dvap) { |
8666 | VATTR_WANTED(dvap, va_gid); |
8667 | } |
8668 | } else if (dvap && (flags & VNODE_CLONEFILE_NOOWNERCOPY)) { |
8669 | VATTR_WANTED(dvap, va_gid); |
8670 | } |
8671 | return 0; |
8672 | } else if (vattr_op == OP_VATTR_CLEANUP) { |
8673 | return 0; /* Nothing to do for now */ |
8674 | } |
8675 | |
8676 | /* dvap isn't used for authorization */ |
8677 | error = vnode_attr_authorize(vap, NULL, mp, action, ctx); |
8678 | |
8679 | if (error) { |
8680 | return error; |
8681 | } |
8682 | |
8683 | /* |
8684 | * vn_attribute_prepare should be able to accept attributes as well as |
8685 | * vnodes but for now we do this inline. |
8686 | */ |
8687 | if (!is_suser || (flags & VNODE_CLONEFILE_NOOWNERCOPY)) { |
8688 | /* |
8689 | * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit |
8690 | * owner is set, that owner takes ownership of all new files. |
8691 | */ |
8692 | if ((mp->mnt_flag & MNT_IGNORE_OWNERSHIP) && |
8693 | (mp->mnt_fsowner != KAUTH_UID_NONE)) { |
8694 | VATTR_SET(vap, va_uid, mp->mnt_fsowner); |
8695 | } else { |
8696 | /* default owner is current user */ |
8697 | VATTR_SET(vap, va_uid, |
8698 | kauth_cred_getuid(vfs_context_ucred(ctx))); |
8699 | } |
8700 | |
8701 | if ((mp->mnt_flag & MNT_IGNORE_OWNERSHIP) && |
8702 | (mp->mnt_fsgroup != KAUTH_GID_NONE)) { |
8703 | VATTR_SET(vap, va_gid, mp->mnt_fsgroup); |
8704 | } else { |
8705 | /* |
8706 | * default group comes from parent object, |
8707 | * fallback to current user |
8708 | */ |
8709 | if (VATTR_IS_SUPPORTED(dvap, va_gid)) { |
8710 | VATTR_SET(vap, va_gid, dvap->va_gid); |
8711 | } else { |
8712 | VATTR_SET(vap, va_gid, |
8713 | kauth_cred_getgid(vfs_context_ucred(ctx))); |
8714 | } |
8715 | } |
8716 | } |
8717 | |
8718 | /* Inherit SF_RESTRICTED bit from destination directory only */ |
8719 | if (VATTR_IS_ACTIVE(vap, va_flags)) { |
8720 | VATTR_SET(vap, va_flags, |
8721 | ((vap->va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)))); /* Turn off from source */ |
8722 | if (VATTR_IS_ACTIVE(dvap, va_flags)) { |
8723 | VATTR_SET(vap, va_flags, |
8724 | vap->va_flags | (dvap->va_flags & (UF_DATAVAULT | SF_RESTRICTED))); |
8725 | } |
8726 | } else if (VATTR_IS_ACTIVE(dvap, va_flags)) { |
8727 | VATTR_SET(vap, va_flags, (dvap->va_flags & (UF_DATAVAULT | SF_RESTRICTED))); |
8728 | } |
8729 | |
8730 | return 0; |
8731 | } |
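
/*
* Example (illustrative sketch): the clone authorizer is driven in
* phases by the caller: once with OP_VATTR_SETUP to mark the
* attributes the filesystem must supply, then again once the
* attributes are filled in (any op other than the setup/cleanup ops,
* conventionally OP_AUTHORIZE) to perform the authorization and apply
* the ownership defaults, and finally with OP_VATTR_CLEANUP:
*
*	(void)vnode_attr_authorize_dir_clone(&va, action, &dva, sdvp,
*	    mp, OP_VATTR_SETUP, flags, ctx, NULL);
*	... filesystem fills in va and dva ...
*	error = vnode_attr_authorize_dir_clone(&va, action, &dva, sdvp,
*	    mp, OP_AUTHORIZE, flags, ctx, NULL);
*/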
8732 | |
8733 | |
8734 | /* |
8735 | * Authorize an operation on a vnode. |
8736 | * |
8737 | * This is KPI, but here because it needs vnode_scope. |
8738 | * |
8739 | * Returns: 0 Success |
8740 | * kauth_authorize_action:EPERM ... |
8741 | * xlate => EACCES Permission denied |
8742 | * kauth_authorize_action:0 Success |
8743 | * kauth_authorize_action: Depends on callback return; this is |
8744 | * usually only vnode_authorize_callback(), |
 *				but may include other listeners, if any
8746 | * exist. |
8747 | * EROFS |
8748 | * EACCES |
8749 | * EPERM |
8750 | * ??? |
8751 | */ |
8752 | int |
8753 | vnode_authorize(vnode_t vp, vnode_t dvp, kauth_action_t action, vfs_context_t ctx) |
8754 | { |
8755 | int error, result; |
8756 | |
8757 | /* |
8758 | * We can't authorize against a dead vnode; allow all operations through so that |
8759 | * the correct error can be returned. |
8760 | */ |
8761 | if (vp->v_type == VBAD) { |
8762 | return 0; |
8763 | } |
8764 | |
8765 | error = 0; |
	result = kauth_authorize_action(vnode_scope, vfs_context_ucred(ctx), action,
	    (uintptr_t)ctx, (uintptr_t)vp, (uintptr_t)dvp, (uintptr_t)&error);
8768 | if (result == EPERM) { /* traditional behaviour */ |
8769 | result = EACCES; |
8770 | } |
8771 | /* did the lower layers give a better error return? */ |
8772 | if ((result != 0) && (error != 0)) { |
8773 | return error; |
8774 | } |
8775 | return result; |
8776 | } |
8777 | |
8778 | /* |
8779 | * Test for vnode immutability. |
8780 | * |
8781 | * The 'append' flag is set when the authorization request is constrained |
8782 | * to operations which only request the right to append to a file. |
8783 | * |
8784 | * The 'ignore' flag is set when an operation modifying the immutability flags |
8785 | * is being authorized. We check the system securelevel to determine which |
8786 | * immutability flags we can ignore. |
8787 | */ |
8788 | static int |
8789 | vnode_immutable(struct vnode_attr *vap, int append, int ignore) |
8790 | { |
8791 | int mask; |
8792 | |
8793 | /* start with all bits precluding the operation */ |
8794 | mask = IMMUTABLE | APPEND; |
8795 | |
8796 | /* if appending only, remove the append-only bits */ |
8797 | if (append) { |
8798 | mask &= ~APPEND; |
8799 | } |
8800 | |
8801 | /* ignore only set when authorizing flags changes */ |
8802 | if (ignore) { |
8803 | if (securelevel <= 0) { |
8804 | /* in insecure state, flags do not inhibit changes */ |
8805 | mask = 0; |
8806 | } else { |
8807 | /* in secure state, user flags don't inhibit */ |
8808 | mask &= ~(UF_IMMUTABLE | UF_APPEND); |
8809 | } |
8810 | } |
8811 | KAUTH_DEBUG("IMMUTABLE - file flags 0x%x mask 0x%x append = %d ignore = %d" , vap->va_flags, mask, append, ignore); |
8812 | if ((vap->va_flags & mask) != 0) { |
8813 | return EPERM; |
8814 | } |
8815 | return 0; |
8816 | } |
8817 | |
8818 | static int |
8819 | vauth_node_owner(struct vnode_attr *vap, kauth_cred_t cred) |
8820 | { |
8821 | int result; |
8822 | |
8823 | /* default assumption is not-owner */ |
8824 | result = 0; |
8825 | |
8826 | /* |
8827 | * If the filesystem has given us a UID, we treat this as authoritative. |
8828 | */ |
8829 | if (vap && VATTR_IS_SUPPORTED(vap, va_uid)) { |
		result = (vap->va_uid == kauth_cred_getuid(cred)) ? 1 : 0;
8831 | } |
8832 | /* we could test the owner UUID here if we had a policy for it */ |
8833 | |
8834 | return result; |
8835 | } |
8836 | |
8837 | /* |
8838 | * vauth_node_group |
8839 | * |
8840 | * Description: Ask if a cred is a member of the group owning the vnode object |
8841 | * |
8842 | * Parameters: vap vnode attribute |
8843 | * vap->va_gid group owner of vnode object |
8844 | * cred credential to check |
8845 | * ismember pointer to where to put the answer |
8846 | * idontknow Return this if we can't get an answer |
8847 | * |
8848 | * Returns: 0 Success |
8849 | * idontknow Can't get information |
 *	kauth_cred_ismember_gid:?	Error from kauth subsystem
8852 | */ |
8853 | static int |
8854 | vauth_node_group(struct vnode_attr *vap, kauth_cred_t cred, int *ismember, int idontknow) |
8855 | { |
8856 | int error; |
8857 | int result; |
8858 | |
8859 | error = 0; |
8860 | result = 0; |
8861 | |
8862 | /* |
8863 | * The caller is expected to have asked the filesystem for a group |
8864 | * at some point prior to calling this function. The answer may |
8865 | * have been that there is no group ownership supported for the |
	 * vnode object, in which case we return 0 and report the caller
	 * as not a member.
	 */
8868 | if (vap && VATTR_IS_SUPPORTED(vap, va_gid)) { |
		error = kauth_cred_ismember_gid(cred, vap->va_gid, &result);
8870 | /* |
8871 | * Credentials which are opted into external group membership |
8872 | * resolution which are not known to the external resolver |
8873 | * will result in an ENOENT error. We translate this into |
8874 | * the appropriate 'idontknow' response for our caller. |
8875 | * |
8876 | * XXX We do not make a distinction here between an ENOENT |
8877 | * XXX arising from a response from the external resolver, |
8878 | * XXX and an ENOENT which is internally generated. This is |
8879 | * XXX a deficiency of the published kauth_cred_ismember_gid() |
8880 | * XXX KPI which can not be overcome without new KPI. For |
	 * XXX all currently known cases, however, this will result
8882 | * XXX in correct behaviour. |
8883 | */ |
8884 | if (error == ENOENT) { |
8885 | error = idontknow; |
8886 | } |
8887 | } |
8888 | /* |
8889 | * XXX We could test the group UUID here if we had a policy for it, |
8890 | * XXX but this is problematic from the perspective of synchronizing |
8891 | * XXX group UUID and POSIX GID ownership of a file and keeping the |
8892 | * XXX values coherent over time. The problem is that the local |
8893 | * XXX system will vend transient group UUIDs for unknown POSIX GID |
8894 | * XXX values, and these are not persistent, whereas storage of values |
8895 | * XXX is persistent. One potential solution to this is a local |
8896 | * XXX (persistent) replica of remote directory entries and vended |
8897 | * XXX local ids in a local directory server (think in terms of a |
8898 | * XXX caching DNS server). |
8899 | */ |
8900 | |
8901 | if (!error) { |
8902 | *ismember = result; |
8903 | } |
8904 | return error; |
8905 | } |
8906 | |
8907 | static int |
8908 | vauth_file_owner(vauth_ctx vcp) |
8909 | { |
8910 | int result; |
8911 | |
8912 | if (vcp->flags_valid & _VAC_IS_OWNER) { |
8913 | result = (vcp->flags & _VAC_IS_OWNER) ? 1 : 0; |
8914 | } else { |
		result = vauth_node_owner(vcp->vap, vcp->ctx->vc_ucred);
8916 | |
8917 | /* cache our result */ |
8918 | vcp->flags_valid |= _VAC_IS_OWNER; |
8919 | if (result) { |
8920 | vcp->flags |= _VAC_IS_OWNER; |
8921 | } else { |
8922 | vcp->flags &= ~_VAC_IS_OWNER; |
8923 | } |
8924 | } |
8925 | return result; |
8926 | } |
8927 | |
8928 | |
8929 | /* |
8930 | * vauth_file_ingroup |
8931 | * |
 * Description:	Ask if a user is a member of the group owning the file
8933 | * |
8934 | * Parameters: vcp The vnode authorization context that |
8935 | * contains the user and directory info |
8936 | * vcp->flags_valid Valid flags |
8937 | * vcp->flags Flags values |
8938 | * vcp->vap File vnode attributes |
8939 | * vcp->ctx VFS Context (for user) |
8940 | * ismember pointer to where to put the answer |
8941 | * idontknow Return this if we can't get an answer |
8942 | * |
8943 | * Returns: 0 Success |
8944 | * vauth_node_group:? Error from vauth_node_group() |
8945 | * |
8946 | * Implicit returns: *ismember 0 The user is not a group member |
8947 | * 1 The user is a group member |
8948 | */ |
8949 | static int |
8950 | vauth_file_ingroup(vauth_ctx vcp, int *ismember, int idontknow) |
8951 | { |
8952 | int error; |
8953 | |
8954 | /* Check for a cached answer first, to avoid the check if possible */ |
8955 | if (vcp->flags_valid & _VAC_IN_GROUP) { |
8956 | *ismember = (vcp->flags & _VAC_IN_GROUP) ? 1 : 0; |
8957 | error = 0; |
8958 | } else { |
8959 | /* Otherwise, go look for it */ |
		error = vauth_node_group(vcp->vap, vcp->ctx->vc_ucred, ismember, idontknow);
8961 | |
8962 | if (!error) { |
8963 | /* cache our result */ |
8964 | vcp->flags_valid |= _VAC_IN_GROUP; |
8965 | if (*ismember) { |
8966 | vcp->flags |= _VAC_IN_GROUP; |
8967 | } else { |
8968 | vcp->flags &= ~_VAC_IN_GROUP; |
8969 | } |
8970 | } |
8971 | } |
8972 | return error; |
8973 | } |
8974 | |
8975 | static int |
8976 | vauth_dir_owner(vauth_ctx vcp) |
8977 | { |
8978 | int result; |
8979 | |
8980 | if (vcp->flags_valid & _VAC_IS_DIR_OWNER) { |
8981 | result = (vcp->flags & _VAC_IS_DIR_OWNER) ? 1 : 0; |
8982 | } else { |
		result = vauth_node_owner(vcp->dvap, vcp->ctx->vc_ucred);
8984 | |
8985 | /* cache our result */ |
8986 | vcp->flags_valid |= _VAC_IS_DIR_OWNER; |
8987 | if (result) { |
8988 | vcp->flags |= _VAC_IS_DIR_OWNER; |
8989 | } else { |
8990 | vcp->flags &= ~_VAC_IS_DIR_OWNER; |
8991 | } |
8992 | } |
8993 | return result; |
8994 | } |
8995 | |
8996 | /* |
8997 | * vauth_dir_ingroup |
8998 | * |
8999 | * Description: Ask if a user is a member of the group owning the directory |
9000 | * |
9001 | * Parameters: vcp The vnode authorization context that |
9002 | * contains the user and directory info |
9003 | * vcp->flags_valid Valid flags |
9004 | * vcp->flags Flags values |
9005 | * vcp->dvap Dir vnode attributes |
9006 | * vcp->ctx VFS Context (for user) |
9007 | * ismember pointer to where to put the answer |
9008 | * idontknow Return this if we can't get an answer |
9009 | * |
9010 | * Returns: 0 Success |
9011 | * vauth_node_group:? Error from vauth_node_group() |
9012 | * |
9013 | * Implicit returns: *ismember 0 The user is not a group member |
9014 | * 1 The user is a group member |
9015 | */ |
9016 | static int |
9017 | vauth_dir_ingroup(vauth_ctx vcp, int *ismember, int idontknow) |
9018 | { |
9019 | int error; |
9020 | |
9021 | /* Check for a cached answer first, to avoid the check if possible */ |
9022 | if (vcp->flags_valid & _VAC_IN_DIR_GROUP) { |
9023 | *ismember = (vcp->flags & _VAC_IN_DIR_GROUP) ? 1 : 0; |
9024 | error = 0; |
9025 | } else { |
9026 | /* Otherwise, go look for it */ |
		error = vauth_node_group(vcp->dvap, vcp->ctx->vc_ucred, ismember, idontknow);
9028 | |
9029 | if (!error) { |
9030 | /* cache our result */ |
9031 | vcp->flags_valid |= _VAC_IN_DIR_GROUP; |
9032 | if (*ismember) { |
9033 | vcp->flags |= _VAC_IN_DIR_GROUP; |
9034 | } else { |
9035 | vcp->flags &= ~_VAC_IN_DIR_GROUP; |
9036 | } |
9037 | } |
9038 | } |
9039 | return error; |
9040 | } |
9041 | |
9042 | /* |
9043 | * Test the posix permissions in (vap) to determine whether (credential) |
9044 | * may perform (action) |
9045 | */ |
9046 | static int |
9047 | vnode_authorize_posix(vauth_ctx vcp, int action, int on_dir) |
9048 | { |
9049 | struct vnode_attr *vap; |
9050 | int needed, error, owner_ok, group_ok, world_ok, ismember; |
9051 | #ifdef KAUTH_DEBUG_ENABLE |
	const char *where = "uninitialized";
9053 | # define _SETWHERE(c) where = c; |
9054 | #else |
9055 | # define _SETWHERE(c) |
9056 | #endif |
9057 | |
9058 | /* checking file or directory? */ |
9059 | if (on_dir) { |
9060 | vap = vcp->dvap; |
9061 | } else { |
9062 | vap = vcp->vap; |
9063 | } |
9064 | |
9065 | error = 0; |
9066 | |
9067 | /* |
9068 | * We want to do as little work here as possible. So first we check |
9069 | * which sets of permissions grant us the access we need, and avoid checking |
9070 | * whether specific permissions grant access when more generic ones would. |
9071 | */ |
9072 | |
9073 | /* owner permissions */ |
9074 | needed = 0; |
9075 | if (action & VREAD) { |
9076 | needed |= S_IRUSR; |
9077 | } |
9078 | if (action & VWRITE) { |
9079 | needed |= S_IWUSR; |
9080 | } |
9081 | if (action & VEXEC) { |
9082 | needed |= S_IXUSR; |
9083 | } |
9084 | owner_ok = (needed & vap->va_mode) == needed; |
9085 | |
9086 | /* |
	 * Processes with the appropriate entitlement can mark themselves as
	 * ignoring file/directory permissions if they own the object.
9089 | */ |
	if (!owner_ok && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) {
9091 | owner_ok = 1; |
9092 | } |
9093 | |
9094 | /* group permissions */ |
9095 | needed = 0; |
9096 | if (action & VREAD) { |
9097 | needed |= S_IRGRP; |
9098 | } |
9099 | if (action & VWRITE) { |
9100 | needed |= S_IWGRP; |
9101 | } |
9102 | if (action & VEXEC) { |
9103 | needed |= S_IXGRP; |
9104 | } |
9105 | group_ok = (needed & vap->va_mode) == needed; |
9106 | |
9107 | /* world permissions */ |
9108 | needed = 0; |
9109 | if (action & VREAD) { |
9110 | needed |= S_IROTH; |
9111 | } |
9112 | if (action & VWRITE) { |
9113 | needed |= S_IWOTH; |
9114 | } |
9115 | if (action & VEXEC) { |
9116 | needed |= S_IXOTH; |
9117 | } |
9118 | world_ok = (needed & vap->va_mode) == needed; |
9119 | |
9120 | /* If granted/denied by all three, we're done */ |
9121 | if (owner_ok && group_ok && world_ok) { |
9122 | _SETWHERE("all" ); |
9123 | goto out; |
9124 | } |
9125 | |
9126 | if (!owner_ok && !group_ok && !world_ok) { |
9127 | _SETWHERE("all" ); |
9128 | error = EACCES; |
9129 | goto out; |
9130 | } |
9131 | |
9132 | /* Check ownership (relatively cheap) */ |
9133 | if ((on_dir && vauth_dir_owner(vcp)) || |
9134 | (!on_dir && vauth_file_owner(vcp))) { |
9135 | _SETWHERE("user" ); |
9136 | if (!owner_ok) { |
9137 | error = EACCES; |
9138 | } |
9139 | goto out; |
9140 | } |
9141 | |
9142 | /* Not owner; if group and world both grant it we're done */ |
9143 | if (group_ok && world_ok) { |
9144 | _SETWHERE("group/world" ); |
9145 | goto out; |
9146 | } |
9147 | if (!group_ok && !world_ok) { |
9148 | _SETWHERE("group/world" ); |
9149 | error = EACCES; |
9150 | goto out; |
9151 | } |
9152 | |
9153 | /* Check group membership (most expensive) */ |
9154 | ismember = 0; /* Default to allow, if the target has no group owner */ |
9155 | |
9156 | /* |
9157 | * In the case we can't get an answer about the user from the call to |
9158 | * vauth_dir_ingroup() or vauth_file_ingroup(), we want to fail on |
9159 | * the side of caution, rather than simply granting access, or we will |
9160 | * fail to correctly implement exclusion groups, so we set the third |
9161 | * parameter on the basis of the state of 'group_ok'. |
9162 | */ |
	if (on_dir) {
		error = vauth_dir_ingroup(vcp, &ismember, (!group_ok ? EACCES : 0));
	} else {
		error = vauth_file_ingroup(vcp, &ismember, (!group_ok ? EACCES : 0));
	}
9168 | if (error) { |
9169 | if (!group_ok) { |
9170 | ismember = 1; |
9171 | } |
9172 | error = 0; |
9173 | } |
9174 | if (ismember) { |
9175 | _SETWHERE("group" ); |
9176 | if (!group_ok) { |
9177 | error = EACCES; |
9178 | } |
9179 | goto out; |
9180 | } |
9181 | |
9182 | /* Not owner, not in group, use world result */ |
9183 | _SETWHERE("world" ); |
9184 | if (!world_ok) { |
9185 | error = EACCES; |
9186 | } |
9187 | |
9188 | /* FALLTHROUGH */ |
9189 | |
9190 | out: |
9191 | KAUTH_DEBUG("%p %s - posix %s permissions : need %s%s%s %x have %s%s%s%s%s%s%s%s%s UID = %d file = %d,%d" , |
9192 | vcp->vp, (error == 0) ? "ALLOWED" : "DENIED" , where, |
9193 | (action & VREAD) ? "r" : "-" , |
9194 | (action & VWRITE) ? "w" : "-" , |
9195 | (action & VEXEC) ? "x" : "-" , |
9196 | needed, |
9197 | (vap->va_mode & S_IRUSR) ? "r" : "-" , |
9198 | (vap->va_mode & S_IWUSR) ? "w" : "-" , |
9199 | (vap->va_mode & S_IXUSR) ? "x" : "-" , |
9200 | (vap->va_mode & S_IRGRP) ? "r" : "-" , |
9201 | (vap->va_mode & S_IWGRP) ? "w" : "-" , |
9202 | (vap->va_mode & S_IXGRP) ? "x" : "-" , |
9203 | (vap->va_mode & S_IROTH) ? "r" : "-" , |
9204 | (vap->va_mode & S_IWOTH) ? "w" : "-" , |
9205 | (vap->va_mode & S_IXOTH) ? "x" : "-" , |
9206 | kauth_cred_getuid(vcp->ctx->vc_ucred), |
9207 | on_dir ? vcp->dvap->va_uid : vcp->vap->va_uid, |
9208 | on_dir ? vcp->dvap->va_gid : vcp->vap->va_gid); |
9209 | return error; |
9210 | } |
9211 | |
9212 | /* |
9213 | * Authorize the deletion of the node vp from the directory dvp. |
9214 | * |
9215 | * We assume that: |
9216 | * - Neither the node nor the directory are immutable. |
9217 | * - The user is not the superuser. |
9218 | * |
9219 | * The precedence of factors for authorizing or denying delete for a credential |
9220 | * |
9221 | * 1) Explicit ACE on the node. (allow or deny DELETE) |
9222 | * 2) Explicit ACE on the directory (allow or deny DELETE_CHILD). |
9223 | * |
9224 | * If there are conflicting ACEs on the node and the directory, the node |
9225 | * ACE wins. |
9226 | * |
9227 | * 3) Sticky bit on the directory. |
9228 | * Deletion is not permitted if the directory is sticky and the caller is |
9229 | * not owner of the node or directory. The sticky bit rules are like a deny |
9230 | * delete ACE except lower in priority than ACL's either allowing or denying |
9231 | * delete. |
9232 | * |
 * 4) POSIX permissions on the directory.
9234 | * |
9235 | * As an optimization, we cache whether or not delete child is permitted |
9236 | * on directories. This enables us to skip directory ACL and POSIX checks |
9237 | * as we already have the result from those checks. However, we always check the |
9238 | * node ACL and, if the directory has the sticky bit set, we always check its |
9239 | * ACL (even for a directory with an authorized delete child). Furthermore, |
9240 | * caching the delete child authorization is independent of the sticky bit |
9241 | * being set as it is only applicable in determining whether the node can be |
9242 | * deleted or not. |
9243 | */ |
9244 | static int |
9245 | vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) |
9246 | { |
9247 | struct vnode_attr *vap = vcp->vap; |
9248 | struct vnode_attr *dvap = vcp->dvap; |
9249 | kauth_cred_t cred = vcp->ctx->vc_ucred; |
9250 | struct kauth_acl_eval eval; |
9251 | int error, ismember; |
9252 | |
9253 | /* Check the ACL on the node first */ |
9254 | if (VATTR_IS_NOT(vap, va_acl, NULL)) { |
9255 | eval.ae_requested = KAUTH_VNODE_DELETE; |
9256 | eval.ae_acl = &vap->va_acl->acl_ace[0]; |
9257 | eval.ae_count = vap->va_acl->acl_entrycount; |
9258 | eval.ae_options = 0; |
9259 | if (vauth_file_owner(vcp)) { |
9260 | eval.ae_options |= KAUTH_AEVAL_IS_OWNER; |
9261 | } |
9262 | /* |
9263 | * We use ENOENT as a marker to indicate we could not get |
9264 | * information in order to delay evaluation until after we |
9265 | * have the ACL evaluation answer. Previously, we would |
9266 | * always deny the operation at this point. |
9267 | */ |
		if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) {
9269 | return error; |
9270 | } |
9271 | if (error == ENOENT) { |
9272 | eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN; |
9273 | } else if (ismember) { |
9274 | eval.ae_options |= KAUTH_AEVAL_IN_GROUP; |
9275 | } |
9276 | eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS; |
9277 | eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS; |
9278 | eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS; |
9279 | eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS; |
9280 | |
		if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
			KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error);
			return error;
9284 | } |
9285 | |
		switch (eval.ae_result) {
		case KAUTH_RESULT_DENY:
			if (vauth_file_owner(vcp) && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) {
				KAUTH_DEBUG("%p Override DENY due to entitlement", vcp->vp);
				return 0;
			}
			KAUTH_DEBUG("%p DENIED - denied by ACL", vcp->vp);
			return EACCES;
		case KAUTH_RESULT_ALLOW:
			KAUTH_DEBUG("%p ALLOWED - granted by ACL", vcp->vp);
			return 0;
		case KAUTH_RESULT_DEFER:
		default:
			/* Defer to directory */
			KAUTH_DEBUG("%p DEFERRED - by file ACL", vcp->vp);
			break;
		}
9303 | } |
9304 | |
9305 | /* |
9306 | * Without a sticky bit, a previously authorized delete child is |
9307 | * sufficient to authorize this delete. |
9308 | * |
9309 | * If the sticky bit is set, a directory ACL which allows delete child |
9310 | * overrides a (potential) sticky bit deny. The authorized delete child |
9311 | * cannot tell us if it was authorized because of an explicit delete |
 * child allow ACE or because of POSIX permissions, so we have to check
 * the directory ACL every time if the directory has the sticky bit set.
9314 | */ |
9315 | if (!(dvap->va_mode & S_ISTXT) && cached_delete_child) { |
9316 | KAUTH_DEBUG("%p ALLOWED - granted by directory ACL or POSIX permissions and no sticky bit on directory" , vcp->vp); |
9317 | return 0; |
9318 | } |
9319 | |
9320 | /* check the ACL on the directory */ |
9321 | if (VATTR_IS_NOT(dvap, va_acl, NULL)) { |
9322 | eval.ae_requested = KAUTH_VNODE_DELETE_CHILD; |
9323 | eval.ae_acl = &dvap->va_acl->acl_ace[0]; |
9324 | eval.ae_count = dvap->va_acl->acl_entrycount; |
9325 | eval.ae_options = 0; |
9326 | if (vauth_dir_owner(vcp)) { |
9327 | eval.ae_options |= KAUTH_AEVAL_IS_OWNER; |
9328 | } |
9329 | /* |
9330 | * We use ENOENT as a marker to indicate we could not get |
9331 | * information in order to delay evaluation until after we |
9332 | * have the ACL evaluation answer. Previously, we would |
9333 | * always deny the operation at this point. |
9334 | */ |
		if ((error = vauth_dir_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) {
9336 | return error; |
9337 | } |
9338 | if (error == ENOENT) { |
9339 | eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN; |
9340 | } else if (ismember) { |
9341 | eval.ae_options |= KAUTH_AEVAL_IN_GROUP; |
9342 | } |
9343 | eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS; |
9344 | eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS; |
9345 | eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS; |
9346 | eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS; |
9347 | |
9348 | /* |
9349 | * If there is no entry, we are going to defer to other |
9350 | * authorization mechanisms. |
9351 | */ |
		error = kauth_acl_evaluate(cred, &eval);

		if (error != 0) {
			KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error);
9356 | return error; |
9357 | } |
		switch (eval.ae_result) {
		case KAUTH_RESULT_DENY:
			if (vauth_dir_owner(vcp) && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) {
				KAUTH_DEBUG("%p Override DENY due to entitlement", vcp->vp);
				return 0;
			}
			KAUTH_DEBUG("%p DENIED - denied by directory ACL", vcp->vp);
			return EACCES;
		case KAUTH_RESULT_ALLOW:
			KAUTH_DEBUG("%p ALLOWED - granted by directory ACL", vcp->vp);
			if (!cached_delete_child && vcp->dvp) {
				vnode_cache_authorized_action(vcp->dvp,
				    vcp->ctx, KAUTH_VNODE_DELETE_CHILD);
			}
			return 0;
		case KAUTH_RESULT_DEFER:
		default:
			/* Deferred by directory ACL */
			KAUTH_DEBUG("%p DEFERRED - directory ACL", vcp->vp);
			break;
		}
9379 | } |
9380 | |
9381 | /* |
9382 | * From this point, we can't explicitly allow and if we reach the end |
9383 | * of the function without a denial, then the delete is authorized. |
9384 | */ |
9385 | if (!cached_delete_child) { |
		if (vnode_authorize_posix(vcp, VWRITE, 1 /* on_dir */) != 0) {
			KAUTH_DEBUG("%p DENIED - denied by posix permissions", vcp->vp);
9388 | return EACCES; |
9389 | } |
9390 | /* |
9391 | * Cache the authorized action on the vnode if allowed by the |
9392 | * directory ACL or POSIX permissions. It is correct to cache |
9393 | * this action even if sticky bit would deny deleting the node. |
9394 | */ |
9395 | if (vcp->dvp) { |
			vnode_cache_authorized_action(vcp->dvp, vcp->ctx,
			    KAUTH_VNODE_DELETE_CHILD);
9398 | } |
9399 | } |
9400 | |
9401 | /* enforce sticky bit behaviour */ |
9402 | if ((dvap->va_mode & S_ISTXT) && !vauth_file_owner(vcp) && !vauth_dir_owner(vcp)) { |
9403 | KAUTH_DEBUG("%p DENIED - sticky bit rules (user %d file %d dir %d)" , |
9404 | vcp->vp, cred->cr_posix.cr_uid, vap->va_uid, dvap->va_uid); |
9405 | return EACCES; |
9406 | } |
9407 | |
9408 | /* not denied, must be OK */ |
9409 | return 0; |
9410 | } |
9411 | |
9412 | |
9413 | /* |
9414 | * Authorize an operation based on the node's attributes. |
9415 | */ |
9416 | static int |
9417 | vnode_authorize_simple(vauth_ctx vcp, kauth_ace_rights_t acl_rights, kauth_ace_rights_t preauth_rights, boolean_t *found_deny) |
9418 | { |
9419 | struct vnode_attr *vap = vcp->vap; |
9420 | kauth_cred_t cred = vcp->ctx->vc_ucred; |
9421 | struct kauth_acl_eval eval; |
9422 | int error, ismember; |
9423 | mode_t posix_action; |
9424 | |
9425 | /* |
9426 | * If we are the file owner, we automatically have some rights. |
9427 | * |
9428 | * Do we need to expand this to support group ownership? |
9429 | */ |
9430 | if (vauth_file_owner(vcp)) { |
9431 | acl_rights &= ~(KAUTH_VNODE_WRITE_SECURITY); |
9432 | } |
9433 | |
9434 | /* |
9435 | * If we are checking both TAKE_OWNERSHIP and WRITE_SECURITY, we can |
9436 | * mask the latter. If TAKE_OWNERSHIP is requested the caller is about to |
9437 | * change ownership to themselves, and WRITE_SECURITY is implicitly |
9438 | * granted to the owner. We need to do this because at this point |
9439 | * WRITE_SECURITY may not be granted as the caller is not currently |
9440 | * the owner. |
9441 | */ |
9442 | if ((acl_rights & KAUTH_VNODE_TAKE_OWNERSHIP) && |
9443 | (acl_rights & KAUTH_VNODE_WRITE_SECURITY)) { |
9444 | acl_rights &= ~KAUTH_VNODE_WRITE_SECURITY; |
9445 | } |
9446 | |
9447 | if (acl_rights == 0) { |
9448 | KAUTH_DEBUG("%p ALLOWED - implicit or no rights required" , vcp->vp); |
9449 | return 0; |
9450 | } |
9451 | |
9452 | /* if we have an ACL, evaluate it */ |
9453 | if (VATTR_IS_NOT(vap, va_acl, NULL)) { |
9454 | eval.ae_requested = acl_rights; |
9455 | eval.ae_acl = &vap->va_acl->acl_ace[0]; |
9456 | eval.ae_count = vap->va_acl->acl_entrycount; |
9457 | eval.ae_options = 0; |
9458 | if (vauth_file_owner(vcp)) { |
9459 | eval.ae_options |= KAUTH_AEVAL_IS_OWNER; |
9460 | } |
9461 | /* |
9462 | * We use ENOENT as a marker to indicate we could not get |
9463 | * information in order to delay evaluation until after we |
9464 | * have the ACL evaluation answer. Previously, we would |
9465 | * always deny the operation at this point. |
9466 | */ |
		if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) {
9468 | return error; |
9469 | } |
9470 | if (error == ENOENT) { |
9471 | eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN; |
9472 | } else if (ismember) { |
9473 | eval.ae_options |= KAUTH_AEVAL_IN_GROUP; |
9474 | } |
9475 | eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS; |
9476 | eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS; |
9477 | eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS; |
9478 | eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS; |
9479 | |
		if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
			KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error);
			return error;
9483 | } |
9484 | |
		switch (eval.ae_result) {
		case KAUTH_RESULT_DENY:
			if (vauth_file_owner(vcp) && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) {
				KAUTH_DEBUG("%p Override DENY due to entitlement", vcp->vp);
				return 0;
			}
			KAUTH_DEBUG("%p DENIED - by ACL", vcp->vp);
			return EACCES; /* deny, deny, counter-allege */
		case KAUTH_RESULT_ALLOW:
			KAUTH_DEBUG("%p ALLOWED - all rights granted by ACL", vcp->vp);
			return 0;
		case KAUTH_RESULT_DEFER:
		default:
			/* Effectively the same as !delete_child_denied */
			KAUTH_DEBUG("%p DEFERRED - directory ACL", vcp->vp);
			break;
		}
9502 | |
9503 | *found_deny = eval.ae_found_deny; |
9504 | |
9505 | /* fall through and evaluate residual rights */ |
9506 | } else { |
9507 | /* no ACL, everything is residual */ |
9508 | eval.ae_residual = acl_rights; |
9509 | } |
9510 | |
9511 | /* |
9512 | * Grant residual rights that have been pre-authorized. |
9513 | */ |
9514 | eval.ae_residual &= ~preauth_rights; |
9515 | |
9516 | /* |
9517 | * We grant WRITE_ATTRIBUTES to the owner if it hasn't been denied. |
9518 | */ |
9519 | if (vauth_file_owner(vcp)) { |
9520 | eval.ae_residual &= ~KAUTH_VNODE_WRITE_ATTRIBUTES; |
9521 | } |
9522 | |
9523 | if (eval.ae_residual == 0) { |
9524 | KAUTH_DEBUG("%p ALLOWED - rights already authorized" , vcp->vp); |
9525 | return 0; |
9526 | } |
9527 | |
9528 | /* |
9529 | * Bail if we have residual rights that can't be granted by posix permissions, |
9530 | * or aren't presumed granted at this point. |
9531 | * |
9532 | * XXX these can be collapsed for performance |
9533 | */ |
	if (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER) {
		KAUTH_DEBUG("%p DENIED - CHANGE_OWNER not permitted", vcp->vp);
		return EACCES;
	}
	if (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY) {
		KAUTH_DEBUG("%p DENIED - WRITE_SECURITY not permitted", vcp->vp);
		return EACCES;
	}
9542 | |
9543 | #if DIAGNOSTIC |
9544 | if (eval.ae_residual & KAUTH_VNODE_DELETE) { |
9545 | panic("vnode_authorize: can't be checking delete permission here" ); |
9546 | } |
9547 | #endif |
9548 | |
9549 | /* |
9550 | * Compute the fallback posix permissions that will satisfy the remaining |
9551 | * rights. |
9552 | */ |
9553 | posix_action = 0; |
9554 | if (eval.ae_residual & (KAUTH_VNODE_READ_DATA | |
9555 | KAUTH_VNODE_LIST_DIRECTORY | |
9556 | KAUTH_VNODE_READ_EXTATTRIBUTES)) { |
9557 | posix_action |= VREAD; |
9558 | } |
9559 | if (eval.ae_residual & (KAUTH_VNODE_WRITE_DATA | |
9560 | KAUTH_VNODE_ADD_FILE | |
9561 | KAUTH_VNODE_ADD_SUBDIRECTORY | |
9562 | KAUTH_VNODE_DELETE_CHILD | |
9563 | KAUTH_VNODE_WRITE_ATTRIBUTES | |
9564 | KAUTH_VNODE_WRITE_EXTATTRIBUTES)) { |
9565 | posix_action |= VWRITE; |
9566 | } |
9567 | if (eval.ae_residual & (KAUTH_VNODE_EXECUTE | |
9568 | KAUTH_VNODE_SEARCH)) { |
9569 | posix_action |= VEXEC; |
9570 | } |
9571 | |
9572 | if (posix_action != 0) { |
		return vnode_authorize_posix(vcp, posix_action, 0 /* !on_dir */);
9574 | } else { |
9575 | KAUTH_DEBUG("%p ALLOWED - residual rights %s%s%s%s%s%s%s%s%s%s%s%s%s%s granted due to no posix mapping" , |
9576 | vcp->vp, |
9577 | (eval.ae_residual & KAUTH_VNODE_READ_DATA) |
9578 | ? vnode_isdir(vcp->vp) ? " LIST_DIRECTORY" : " READ_DATA" : "" , |
9579 | (eval.ae_residual & KAUTH_VNODE_WRITE_DATA) |
9580 | ? vnode_isdir(vcp->vp) ? " ADD_FILE" : " WRITE_DATA" : "" , |
9581 | (eval.ae_residual & KAUTH_VNODE_EXECUTE) |
9582 | ? vnode_isdir(vcp->vp) ? " SEARCH" : " EXECUTE" : "" , |
9583 | (eval.ae_residual & KAUTH_VNODE_DELETE) |
9584 | ? " DELETE" : "" , |
9585 | (eval.ae_residual & KAUTH_VNODE_APPEND_DATA) |
9586 | ? vnode_isdir(vcp->vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "" , |
9587 | (eval.ae_residual & KAUTH_VNODE_DELETE_CHILD) |
9588 | ? " DELETE_CHILD" : "" , |
9589 | (eval.ae_residual & KAUTH_VNODE_READ_ATTRIBUTES) |
9590 | ? " READ_ATTRIBUTES" : "" , |
9591 | (eval.ae_residual & KAUTH_VNODE_WRITE_ATTRIBUTES) |
9592 | ? " WRITE_ATTRIBUTES" : "" , |
9593 | (eval.ae_residual & KAUTH_VNODE_READ_EXTATTRIBUTES) |
9594 | ? " READ_EXTATTRIBUTES" : "" , |
9595 | (eval.ae_residual & KAUTH_VNODE_WRITE_EXTATTRIBUTES) |
9596 | ? " WRITE_EXTATTRIBUTES" : "" , |
9597 | (eval.ae_residual & KAUTH_VNODE_READ_SECURITY) |
9598 | ? " READ_SECURITY" : "" , |
9599 | (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY) |
9600 | ? " WRITE_SECURITY" : "" , |
9601 | (eval.ae_residual & KAUTH_VNODE_CHECKIMMUTABLE) |
9602 | ? " CHECKIMMUTABLE" : "" , |
9603 | (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER) |
9604 | ? " CHANGE_OWNER" : "" ); |
9605 | } |
9606 | |
9607 | /* |
9608 | * Lack of required Posix permissions implies no reason to deny access. |
9609 | */ |
9610 | return 0; |
9611 | } |
9612 | |
9613 | /* |
9614 | * Check for file immutability. |
9615 | */ |
9616 | static int |
9617 | vnode_authorize_checkimmutable(mount_t mp, vauth_ctx vcp, |
9618 | struct vnode_attr *vap, int rights, int ignore) |
9619 | { |
9620 | int error; |
9621 | int append; |
9622 | |
9623 | /* |
9624 | * Perform immutability checks for operations that change data. |
9625 | * |
9626 | * Sockets, fifos and devices require special handling. |
9627 | */ |
9628 | switch (vap->va_type) { |
9629 | case VSOCK: |
9630 | case VFIFO: |
9631 | case VBLK: |
9632 | case VCHR: |
9633 | /* |
9634 | * Writing to these nodes does not change the filesystem data, |
9635 | * so forget that it's being tried. |
9636 | */ |
9637 | rights &= ~KAUTH_VNODE_WRITE_DATA; |
9638 | break; |
9639 | default: |
9640 | break; |
9641 | } |
9642 | |
9643 | error = 0; |
9644 | if (rights & KAUTH_VNODE_WRITE_RIGHTS) { |
9645 | /* check per-filesystem options if possible */ |
9646 | if (mp != NULL) { |
9647 | /* check for no-EA filesystems */ |
9648 | if ((rights & KAUTH_VNODE_WRITE_EXTATTRIBUTES) && |
9649 | (vfs_flags(mp) & MNT_NOUSERXATTR)) { |
9650 | KAUTH_DEBUG("%p DENIED - filesystem disallowed extended attributes" , vap); |
9651 | error = EACCES; /* User attributes disabled */ |
9652 | goto out; |
9653 | } |
9654 | } |
9655 | |
9656 | /* |
9657 | * check for file immutability. first, check if the requested rights are |
9658 | * allowable for a UF_APPEND file. |
9659 | */ |
9660 | append = 0; |
9661 | if (vap->va_type == VDIR) { |
9662 | if ((rights & (KAUTH_VNODE_ADD_FILE | KAUTH_VNODE_ADD_SUBDIRECTORY | KAUTH_VNODE_WRITE_EXTATTRIBUTES | ~KAUTH_VNODE_WRITE_RIGHTS)) == rights) { |
9663 | append = 1; |
9664 | } |
9665 | } else { |
9666 | if ((rights & (KAUTH_VNODE_APPEND_DATA | KAUTH_VNODE_WRITE_EXTATTRIBUTES | ~KAUTH_VNODE_WRITE_RIGHTS)) == rights) { |
9667 | append = 1; |
9668 | } |
9669 | } |
9670 | if ((error = vnode_immutable(vap, append, ignore)) != 0) { |
9671 | if (error && !ignore) { |
9672 | /* |
9673 | * In case of a rename, we want to check ownership for dvp as well. |
9674 | */ |
9675 | int owner = 0; |
9676 | if (rights & KAUTH_VNODE_DELETE_CHILD && vcp->dvp != NULL) { |
9677 | owner = vauth_file_owner(vcp) && vauth_dir_owner(vcp); |
9678 | } else { |
9679 | owner = vauth_file_owner(vcp); |
9680 | } |
			if (owner && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) {
				error = vnode_immutable(vap, append, 1);
9683 | } |
9684 | } |
9685 | } |
9686 | if (error) { |
9687 | KAUTH_DEBUG("%p DENIED - file is immutable" , vap); |
9688 | goto out; |
9689 | } |
9690 | } |
9691 | out: |
9692 | return error; |
9693 | } |
9694 | |
9695 | /* |
9696 | * Handle authorization actions for filesystems that advertise that the |
9697 | * server will be enforcing. |
9698 | * |
9699 | * Returns: 0 Authorization should be handled locally |
9700 | * 1 Authorization was handled by the FS |
9701 | * |
9702 | * Note: Imputed returns will only occur if the authorization request |
9703 | * was handled by the FS. |
9704 | * |
9705 | * Imputed: *resultp, modified Return code from FS when the request is |
9706 | * handled by the FS. |
9707 | * VNOP_ACCESS:??? |
9708 | * VNOP_OPEN:??? |
9709 | */ |
9710 | static int |
9711 | vnode_authorize_opaque(vnode_t vp, int *resultp, kauth_action_t action, vfs_context_t ctx) |
9712 | { |
9713 | int error; |
9714 | |
9715 | /* |
9716 | * If the vp is a device node, socket or FIFO it actually represents a local |
9717 | * endpoint, so we need to handle it locally. |
9718 | */ |
9719 | switch (vp->v_type) { |
9720 | case VBLK: |
9721 | case VCHR: |
9722 | case VSOCK: |
9723 | case VFIFO: |
9724 | return 0; |
9725 | default: |
9726 | break; |
9727 | } |
9728 | |
9729 | /* |
9730 | * In the advisory request case, if the filesystem doesn't think it's reliable |
9731 | * we will attempt to formulate a result ourselves based on VNOP_GETATTR data. |
9732 | */ |
	if ((action & KAUTH_VNODE_ACCESS) && !vfs_authopaqueaccess(vp->v_mount)) {
9734 | return 0; |
9735 | } |
9736 | |
9737 | /* |
	 * Let the filesystem have a say in the matter.  It's OK for it to not implement
	 * VNOP_ACCESS, as most will authorise inline with the actual request.
9740 | */ |
9741 | if ((error = VNOP_ACCESS(vp, action, ctx)) != ENOTSUP) { |
9742 | *resultp = error; |
9743 | KAUTH_DEBUG("%p DENIED - opaque filesystem VNOP_ACCESS denied access" , vp); |
9744 | return 1; |
9745 | } |
9746 | |
9747 | /* |
9748 | * Typically opaque filesystems do authorisation in-line, but exec is a special case. In |
9749 | * order to be reasonably sure that exec will be permitted, we try a bit harder here. |
9750 | */ |
9751 | if ((action & KAUTH_VNODE_EXECUTE) && (vp->v_type == VREG)) { |
9752 | /* try a VNOP_OPEN for readonly access */ |
9753 | if ((error = VNOP_OPEN(vp, FREAD, ctx)) != 0) { |
9754 | *resultp = error; |
9755 | KAUTH_DEBUG("%p DENIED - EXECUTE denied because file could not be opened readonly" , vp); |
9756 | return 1; |
9757 | } |
9758 | VNOP_CLOSE(vp, FREAD, ctx); |
9759 | } |
9760 | |
9761 | /* |
9762 | * We don't have any reason to believe that the request has to be denied at this point, |
9763 | * so go ahead and allow it. |
9764 | */ |
9765 | *resultp = 0; |
9766 | KAUTH_DEBUG("%p ALLOWED - bypassing access check for non-local filesystem" , vp); |
9767 | return 1; |
9768 | } |
9769 | |
9770 | |
9771 | |
9772 | |
9773 | /* |
9774 | * Returns: KAUTH_RESULT_ALLOW |
9775 | * KAUTH_RESULT_DENY |
9776 | * |
9777 | * Imputed: *arg3, modified Error code in the deny case |
9778 | * EROFS Read-only file system |
9779 | * EACCES Permission denied |
9780 | * EPERM Operation not permitted [no execute] |
9781 | * vnode_getattr:ENOMEM Not enough space [only if has filesec] |
9782 | * vnode_getattr:??? |
9783 | * vnode_authorize_opaque:*arg2 ??? |
9784 | * vnode_authorize_checkimmutable:??? |
9785 | * vnode_authorize_delete:??? |
9786 | * vnode_authorize_simple:??? |
9787 | */ |
9788 | |
9789 | |
9790 | static int |
9791 | vnode_authorize_callback(__unused kauth_cred_t cred, __unused void *idata, |
9792 | kauth_action_t action, uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, |
9793 | uintptr_t arg3) |
9794 | { |
9795 | vfs_context_t ctx; |
9796 | vnode_t cvp = NULLVP; |
9797 | vnode_t vp, dvp; |
9798 | int result = KAUTH_RESULT_DENY; |
9799 | int parent_iocount = 0; |
	int parent_action = 0; /* In case we need to use namedstream's data fork for cached rights */
9801 | |
9802 | ctx = (vfs_context_t)arg0; |
9803 | vp = (vnode_t)arg1; |
9804 | dvp = (vnode_t)arg2; |
9805 | |
9806 | /* |
9807 | * if there are 2 vnodes passed in, we don't know at |
9808 | * this point which rights to look at based on the |
9809 | * combined action being passed in... defer until later... |
9810 | * otherwise check the kauth 'rights' cache hung |
9811 | * off of the vnode we're interested in... if we've already |
9812 | * been granted the right we're currently interested in, |
9813 | * we can just return success... otherwise we'll go through |
9814 | * the process of authorizing the requested right(s)... if that |
9815 | * succeeds, we'll add the right(s) to the cache. |
9816 | * VNOP_SETATTR and VNOP_SETXATTR will invalidate this cache |
9817 | */ |
9818 | if (dvp && vp) { |
9819 | goto defer; |
9820 | } |
9821 | if (dvp) { |
9822 | cvp = dvp; |
9823 | } else { |
9824 | /* |
9825 | * For named streams on local-authorization volumes, rights are cached on the parent; |
9826 | * authorization is determined by looking at the parent's properties anyway, so storing |
9827 | * on the parent means that we don't recompute for the named stream and that if |
9828 | * we need to flush rights (e.g. on VNOP_SETATTR()) we don't need to track down the |
9829 | * stream to flush its cache separately. If we miss in the cache, then we authorize |
9830 | * as if there were no cached rights (passing the named stream vnode and desired rights to |
9831 | * vnode_authorize_callback_int()). |
9832 | * |
9833 | * On an opaquely authorized volume, we don't know the relationship between the |
9834 | * data fork's properties and the rights granted on a stream. Thus, named stream vnodes |
9835 | * on such a volume are authorized directly (rather than using the parent) and have their |
9836 | * own caches. When a named stream vnode is created, we mark the parent as having a named |
9837 | * stream. On a VNOP_SETATTR() for the parent that may invalidate cached authorization, we |
9838 | * find the stream and flush its cache. |
9839 | */ |
		if (vnode_isnamedstream(vp) && (!vfs_authopaque(vp->v_mount))) {
9841 | cvp = vnode_getparent(vp); |
9842 | if (cvp != NULLVP) { |
9843 | parent_iocount = 1; |
9844 | } else { |
9845 | cvp = NULL; |
9846 | goto defer; /* If we can't use the parent, take the slow path */ |
9847 | } |
9848 | |
9849 | /* Have to translate some actions */ |
9850 | parent_action = action; |
9851 | if (parent_action & KAUTH_VNODE_READ_DATA) { |
9852 | parent_action &= ~KAUTH_VNODE_READ_DATA; |
9853 | parent_action |= KAUTH_VNODE_READ_EXTATTRIBUTES; |
9854 | } |
9855 | if (parent_action & KAUTH_VNODE_WRITE_DATA) { |
9856 | parent_action &= ~KAUTH_VNODE_WRITE_DATA; |
9857 | parent_action |= KAUTH_VNODE_WRITE_EXTATTRIBUTES; |
9858 | } |
9859 | } else { |
9860 | cvp = vp; |
9861 | } |
9862 | } |
9863 | |
	if (vnode_cache_is_authorized(cvp, ctx, parent_iocount ? parent_action : action) == TRUE) {
9865 | result = KAUTH_RESULT_ALLOW; |
9866 | goto out; |
9867 | } |
9868 | defer: |
	result = vnode_authorize_callback_int(action, ctx, vp, dvp, (int *)arg3);
9870 | |
9871 | if (result == KAUTH_RESULT_ALLOW && cvp != NULLVP) { |
9872 | KAUTH_DEBUG("%p - caching action = %x" , cvp, action); |
9873 | vnode_cache_authorized_action(vp: cvp, context: ctx, action); |
9874 | } |
9875 | |
9876 | out: |
9877 | if (parent_iocount) { |
		vnode_put(cvp);
9879 | } |
9880 | |
9881 | return result; |
9882 | } |
9883 | |
9884 | static int |
9885 | vnode_attr_authorize_internal(vauth_ctx vcp, mount_t mp, |
9886 | kauth_ace_rights_t rights, int is_suser, boolean_t *found_deny, |
9887 | int noimmutable, int parent_authorized_for_delete_child) |
9888 | { |
9889 | int result; |
9890 | |
9891 | /* |
9892 | * Check for immutability. |
9893 | * |
9894 | * In the deletion case, parent directory immutability vetoes specific |
9895 | * file rights. |
9896 | */ |
	if ((result = vnode_authorize_checkimmutable(mp, vcp, vcp->vap, rights,
	    noimmutable)) != 0) {
9899 | goto out; |
9900 | } |
9901 | |
9902 | if ((rights & KAUTH_VNODE_DELETE) && |
9903 | !parent_authorized_for_delete_child) { |
		result = vnode_authorize_checkimmutable(mp, vcp, vcp->dvap,
		    KAUTH_VNODE_DELETE_CHILD, 0);
9906 | if (result) { |
9907 | goto out; |
9908 | } |
9909 | } |
9910 | |
9911 | /* |
9912 | * Clear rights that have been authorized by reaching this point, bail if nothing left to |
9913 | * check. |
9914 | */ |
9915 | rights &= ~(KAUTH_VNODE_LINKTARGET | KAUTH_VNODE_CHECKIMMUTABLE); |
9916 | if (rights == 0) { |
9917 | goto out; |
9918 | } |
9919 | |
9920 | /* |
9921 | * If we're not the superuser, authorize based on file properties; |
9922 | * note that even if parent_authorized_for_delete_child is TRUE, we |
9923 | * need to check on the node itself. |
9924 | */ |
9925 | if (!is_suser) { |
9926 | /* process delete rights */ |
9927 | if ((rights & KAUTH_VNODE_DELETE) && |
		    ((result = vnode_authorize_delete(vcp, parent_authorized_for_delete_child)) != 0)) {
9929 | goto out; |
9930 | } |
9931 | |
9932 | /* process remaining rights */ |
9933 | if ((rights & ~KAUTH_VNODE_DELETE) && |
		    (result = vnode_authorize_simple(vcp, rights, rights & KAUTH_VNODE_DELETE, found_deny)) != 0) {
9935 | goto out; |
9936 | } |
9937 | } else { |
9938 | /* |
9939 | * Execute is only granted to root if one of the x bits is set. This check only |
9940 | * makes sense if the posix mode bits are actually supported. |
9941 | */ |
9942 | if ((rights & KAUTH_VNODE_EXECUTE) && |
9943 | (vcp->vap->va_type == VREG) && |
9944 | VATTR_IS_SUPPORTED(vcp->vap, va_mode) && |
9945 | !(vcp->vap->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) { |
9946 | result = EPERM; |
9947 | KAUTH_DEBUG("%p DENIED - root execute requires at least one x bit in 0x%x" , vcp, vcp->vap->va_mode); |
9948 | goto out; |
9949 | } |
9950 | |
9951 | /* Assume that there were DENYs so we don't wrongly cache KAUTH_VNODE_SEARCHBYANYONE */ |
9952 | *found_deny = TRUE; |
9953 | |
9954 | KAUTH_DEBUG("%p ALLOWED - caller is superuser" , vcp); |
9955 | } |
9956 | out: |
9957 | return result; |
9958 | } |
9959 | |
9960 | static int |
9961 | vnode_authorize_callback_int(kauth_action_t action, vfs_context_t ctx, |
9962 | vnode_t vp, vnode_t dvp, int *errorp) |
9963 | { |
9964 | struct _vnode_authorize_context auth_context; |
9965 | vauth_ctx vcp; |
9966 | kauth_cred_t cred; |
9967 | kauth_ace_rights_t rights; |
9968 | struct vnode_attr va, dva; |
9969 | int result; |
9970 | int noimmutable; |
9971 | boolean_t parent_authorized_for_delete_child = FALSE; |
9972 | boolean_t found_deny = FALSE; |
9973 | boolean_t parent_ref = FALSE; |
9974 | boolean_t is_suser = FALSE; |
9975 | |
9976 | vcp = &auth_context; |
9977 | vcp->ctx = ctx; |
9978 | vcp->vp = vp; |
9979 | vcp->dvp = dvp; |
9980 | /* |
9981 | * Note that we authorize against the context, not the passed cred |
9982 | * (the same thing anyway) |
9983 | */ |
9984 | cred = ctx->vc_ucred; |
9985 | |
9986 | VATTR_INIT(&va); |
9987 | vcp->vap = &va; |
9988 | VATTR_INIT(&dva); |
9989 | vcp->dvap = &dva; |
9990 | |
9991 | vcp->flags = vcp->flags_valid = 0; |
9992 | |
9993 | #if DIAGNOSTIC |
9994 | if ((ctx == NULL) || (vp == NULL) || (cred == NULL)) { |
9995 | panic("vnode_authorize: bad arguments (context %p vp %p cred %p)" , ctx, vp, cred); |
9996 | } |
9997 | #endif |
9998 | |
9999 | KAUTH_DEBUG("%p AUTH - %s %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s on %s '%s' (0x%x:%p/%p)" , |
10000 | vp, vfs_context_proc(ctx)->p_comm, |
10001 | (action & KAUTH_VNODE_ACCESS) ? "access" : "auth" , |
10002 | (action & KAUTH_VNODE_READ_DATA) ? vnode_isdir(vp) ? " LIST_DIRECTORY" : " READ_DATA" : "" , |
10003 | (action & KAUTH_VNODE_WRITE_DATA) ? vnode_isdir(vp) ? " ADD_FILE" : " WRITE_DATA" : "" , |
10004 | (action & KAUTH_VNODE_EXECUTE) ? vnode_isdir(vp) ? " SEARCH" : " EXECUTE" : "" , |
10005 | (action & KAUTH_VNODE_DELETE) ? " DELETE" : "" , |
10006 | (action & KAUTH_VNODE_APPEND_DATA) ? vnode_isdir(vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "" , |
10007 | (action & KAUTH_VNODE_DELETE_CHILD) ? " DELETE_CHILD" : "" , |
10008 | (action & KAUTH_VNODE_READ_ATTRIBUTES) ? " READ_ATTRIBUTES" : "" , |
10009 | (action & KAUTH_VNODE_WRITE_ATTRIBUTES) ? " WRITE_ATTRIBUTES" : "" , |
10010 | (action & KAUTH_VNODE_READ_EXTATTRIBUTES) ? " READ_EXTATTRIBUTES" : "" , |
10011 | (action & KAUTH_VNODE_WRITE_EXTATTRIBUTES) ? " WRITE_EXTATTRIBUTES" : "" , |
10012 | (action & KAUTH_VNODE_READ_SECURITY) ? " READ_SECURITY" : "" , |
10013 | (action & KAUTH_VNODE_WRITE_SECURITY) ? " WRITE_SECURITY" : "" , |
10014 | (action & KAUTH_VNODE_CHANGE_OWNER) ? " CHANGE_OWNER" : "" , |
10015 | (action & KAUTH_VNODE_NOIMMUTABLE) ? " (noimmutable)" : "" , |
10016 | vnode_isdir(vp) ? "directory" : "file" , |
10017 | vp->v_name ? vp->v_name : "<NULL>" , action, vp, dvp); |
10018 | |
10019 | /* |
10020 | * Extract the control bits from the action, everything else is |
10021 | * requested rights. |
10022 | */ |
10023 | noimmutable = (action & KAUTH_VNODE_NOIMMUTABLE) ? 1 : 0; |
10024 | rights = action & ~(KAUTH_VNODE_ACCESS | KAUTH_VNODE_NOIMMUTABLE); |
10025 | |
10026 | if (rights & KAUTH_VNODE_DELETE) { |
10027 | #if DIAGNOSTIC |
10028 | if (dvp == NULL) { |
10029 | panic("vnode_authorize: KAUTH_VNODE_DELETE test requires a directory" ); |
10030 | } |
10031 | #endif |
10032 | /* |
10033 | * check to see if we've already authorized the parent |
10034 | * directory for deletion of its children... if so, we |
10035 | * can skip a whole bunch of work... we will still have to |
10036 | * authorize that this specific child can be removed |
10037 | */ |
		if (vnode_cache_is_authorized(dvp, ctx, KAUTH_VNODE_DELETE_CHILD) == TRUE) {
10039 | parent_authorized_for_delete_child = TRUE; |
10040 | } |
10041 | } else { |
10042 | vcp->dvp = NULLVP; |
10043 | vcp->dvap = NULL; |
10044 | } |
10045 | |
10046 | /* |
10047 | * Check for read-only filesystems. |
10048 | */ |
10049 | if ((rights & KAUTH_VNODE_WRITE_RIGHTS) && |
10050 | (vp->v_mount->mnt_flag & MNT_RDONLY) && |
10051 | ((vp->v_type == VREG) || (vp->v_type == VDIR) || |
10052 | (vp->v_type == VLNK) || (vp->v_type == VCPLX) || |
10053 | (rights & KAUTH_VNODE_DELETE) || (rights & KAUTH_VNODE_DELETE_CHILD))) { |
10054 | result = EROFS; |
10055 | goto out; |
10056 | } |
10057 | |
10058 | /* |
10059 | * Check for noexec filesystems. |
10060 | */ |
10061 | if ((rights & KAUTH_VNODE_EXECUTE) && (vp->v_type == VREG) && (vp->v_mount->mnt_flag & MNT_NOEXEC)) { |
10062 | result = EACCES; |
10063 | goto out; |
10064 | } |
10065 | |
10066 | /* |
10067 | * Handle cases related to filesystems with non-local enforcement. |
10068 | * This call can return 0, in which case we will fall through to perform a |
10069 | * check based on VNOP_GETATTR data. Otherwise it returns 1 and sets |
10070 | * an appropriate result, at which point we can return immediately. |
10071 | */ |
10072 | if ((vp->v_mount->mnt_kern_flag & MNTK_AUTH_OPAQUE) && vnode_authorize_opaque(vp, resultp: &result, action, ctx)) { |
10073 | goto out; |
10074 | } |
10075 | |
10076 | /* |
10077 | * If the vnode is a namedstream (extended attribute) data vnode (eg. |
10078 | * a resource fork), *_DATA becomes *_EXTATTRIBUTES. |
10079 | */ |
10080 | if (vnode_isnamedstream(vp)) { |
10081 | if (rights & KAUTH_VNODE_READ_DATA) { |
10082 | rights &= ~KAUTH_VNODE_READ_DATA; |
10083 | rights |= KAUTH_VNODE_READ_EXTATTRIBUTES; |
10084 | } |
10085 | if (rights & KAUTH_VNODE_WRITE_DATA) { |
10086 | rights &= ~KAUTH_VNODE_WRITE_DATA; |
10087 | rights |= KAUTH_VNODE_WRITE_EXTATTRIBUTES; |
10088 | } |
10089 | |
10090 | /* |
10091 | * Point 'vp' to the namedstream's parent for ACL checking |
10092 | */ |
10093 | if ((vp->v_parent != NULL) && |
		    (vget_internal(vp->v_parent, 0, VNODE_NODEAD | VNODE_DRAINO) == 0)) {
10095 | parent_ref = TRUE; |
10096 | vcp->vp = vp = vp->v_parent; |
10097 | } |
10098 | } |
10099 | |
10100 | if (vfs_context_issuser(ctx)) { |
10101 | /* |
10102 | * if we're not asking for execute permissions or modifications, |
10103 | * then we're done, this action is authorized. |
10104 | */ |
10105 | if (!(rights & (KAUTH_VNODE_EXECUTE | KAUTH_VNODE_WRITE_RIGHTS))) { |
10106 | goto success; |
10107 | } |
10108 | |
10109 | is_suser = TRUE; |
10110 | } |
10111 | |
10112 | /* |
10113 | * Get vnode attributes and extended security information for the vnode |
10114 | * and directory if required. |
10115 | * |
10116 | * If we're root we only want mode bits and flags for checking |
10117 | * execute and immutability. |
10118 | */ |
10119 | VATTR_WANTED(&va, va_mode); |
10120 | VATTR_WANTED(&va, va_flags); |
10121 | if (!is_suser) { |
10122 | VATTR_WANTED(&va, va_uid); |
10123 | VATTR_WANTED(&va, va_gid); |
10124 | VATTR_WANTED(&va, va_acl); |
10125 | } |
	if ((result = vnode_getattr(vp, &va, ctx)) != 0) {
		KAUTH_DEBUG("%p ERROR - failed to get vnode attributes - %d", vp, result);
10128 | goto out; |
10129 | } |
10130 | VATTR_WANTED(&va, va_type); |
10131 | VATTR_RETURN(&va, va_type, vnode_vtype(vp)); |
10132 | |
10133 | if (vcp->dvp) { |
10134 | VATTR_WANTED(&dva, va_mode); |
10135 | VATTR_WANTED(&dva, va_flags); |
10136 | if (!is_suser) { |
10137 | VATTR_WANTED(&dva, va_uid); |
10138 | VATTR_WANTED(&dva, va_gid); |
10139 | VATTR_WANTED(&dva, va_acl); |
10140 | } |
		if ((result = vnode_getattr(vcp->dvp, &dva, ctx)) != 0) {
			KAUTH_DEBUG("%p ERROR - failed to get directory vnode attributes - %d", vp, result);
10143 | goto out; |
10144 | } |
10145 | VATTR_WANTED(&dva, va_type); |
10146 | VATTR_RETURN(&dva, va_type, vnode_vtype(vcp->dvp)); |
10147 | } |
10148 | |
	result = vnode_attr_authorize_internal(vcp, vp->v_mount, rights, is_suser,
	    &found_deny, noimmutable, parent_authorized_for_delete_child);
10151 | out: |
10152 | if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL)) { |
		kauth_acl_free(va.va_acl);
10154 | } |
10155 | if (VATTR_IS_SUPPORTED(&dva, va_acl) && (dva.va_acl != NULL)) { |
		kauth_acl_free(dva.va_acl);
10157 | } |
10158 | |
10159 | if (result) { |
10160 | if (parent_ref) { |
10161 | vnode_put(vp); |
10162 | } |
10163 | *errorp = result; |
10164 | KAUTH_DEBUG("%p DENIED - auth denied" , vp); |
10165 | return KAUTH_RESULT_DENY; |
10166 | } |
10167 | if ((rights & KAUTH_VNODE_SEARCH) && found_deny == FALSE && vp->v_type == VDIR) { |
10168 | /* |
10169 | * if we were successfully granted the right to search this directory |
10170 | * and there were NO ACL DENYs for search and the posix permissions also don't |
10171 | * deny execute, we can synthesize a global right that allows anyone to |
10172 | * traverse this directory during a pathname lookup without having to |
10173 | * match the credential associated with this cache of rights. |
10174 | * |
10175 | * Note that we can correctly cache KAUTH_VNODE_SEARCHBYANYONE |
10176 | * only if we actually check ACLs which we don't for root. As |
10177 | * a workaround, the lookup fast path checks for root. |
10178 | */ |
10179 | if (!VATTR_IS_SUPPORTED(&va, va_mode) || |
10180 | ((va.va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == |
10181 | (S_IXUSR | S_IXGRP | S_IXOTH))) { |
			vnode_cache_authorized_action(vp, ctx, KAUTH_VNODE_SEARCHBYANYONE);
10183 | } |
10184 | } |
10185 | success: |
10186 | if (parent_ref) { |
10187 | vnode_put(vp); |
10188 | } |
10189 | |
10190 | /* |
10191 | * Note that this implies that we will allow requests for no rights, as well as |
10192 | * for rights that we do not recognise. There should be none of these. |
10193 | */ |
10194 | KAUTH_DEBUG("%p ALLOWED - auth granted" , vp); |
10195 | return KAUTH_RESULT_ALLOW; |
10196 | } |
10197 | |
10198 | int |
10199 | vnode_attr_authorize_init(struct vnode_attr *vap, struct vnode_attr *dvap, |
10200 | kauth_action_t action, vfs_context_t ctx) |
10201 | { |
10202 | VATTR_INIT(vap); |
10203 | VATTR_WANTED(vap, va_type); |
10204 | VATTR_WANTED(vap, va_mode); |
10205 | VATTR_WANTED(vap, va_flags); |
10206 | if (dvap) { |
10207 | VATTR_INIT(dvap); |
10208 | if (action & KAUTH_VNODE_DELETE) { |
10209 | VATTR_WANTED(dvap, va_type); |
10210 | VATTR_WANTED(dvap, va_mode); |
10211 | VATTR_WANTED(dvap, va_flags); |
10212 | } |
10213 | } else if (action & KAUTH_VNODE_DELETE) { |
10214 | return EINVAL; |
10215 | } |
10216 | |
10217 | if (!vfs_context_issuser(ctx)) { |
10218 | VATTR_WANTED(vap, va_uid); |
10219 | VATTR_WANTED(vap, va_gid); |
10220 | VATTR_WANTED(vap, va_acl); |
10221 | if (dvap && (action & KAUTH_VNODE_DELETE)) { |
10222 | VATTR_WANTED(dvap, va_uid); |
10223 | VATTR_WANTED(dvap, va_gid); |
10224 | VATTR_WANTED(dvap, va_acl); |
10225 | } |
10226 | } |
10227 | |
10228 | return 0; |
10229 | } |
10230 | |
10231 | #define VNODE_SEC_ATTRS_NO_ACL (VNODE_ATTR_va_uid | VNODE_ATTR_va_gid | VNODE_ATTR_va_mode | VNODE_ATTR_va_flags | VNODE_ATTR_va_type) |
10232 | |
10233 | int |
10234 | vnode_attr_authorize(struct vnode_attr *vap, struct vnode_attr *dvap, mount_t mp, |
10235 | kauth_action_t action, vfs_context_t ctx) |
10236 | { |
10237 | struct _vnode_authorize_context auth_context; |
10238 | vauth_ctx vcp; |
10239 | kauth_ace_rights_t rights; |
10240 | int noimmutable; |
10241 | boolean_t found_deny; |
10242 | boolean_t is_suser = FALSE; |
10243 | int result = 0; |
10244 | uid_t ouid = vap->va_uid; |
10245 | gid_t ogid = vap->va_gid; |
10246 | |
10247 | vcp = &auth_context; |
10248 | vcp->ctx = ctx; |
10249 | vcp->vp = NULLVP; |
10250 | vcp->vap = vap; |
10251 | vcp->dvp = NULLVP; |
10252 | vcp->dvap = dvap; |
10253 | vcp->flags = vcp->flags_valid = 0; |
10254 | |
10255 | noimmutable = (action & KAUTH_VNODE_NOIMMUTABLE) ? 1 : 0; |
10256 | rights = action & ~(KAUTH_VNODE_ACCESS | KAUTH_VNODE_NOIMMUTABLE); |
10257 | |
10258 | /* |
10259 | * Check for read-only filesystems. |
10260 | */ |
10261 | if ((rights & KAUTH_VNODE_WRITE_RIGHTS) && |
10262 | mp && (mp->mnt_flag & MNT_RDONLY) && |
10263 | ((vap->va_type == VREG) || (vap->va_type == VDIR) || |
10264 | (vap->va_type == VLNK) || (rights & KAUTH_VNODE_DELETE) || |
10265 | (rights & KAUTH_VNODE_DELETE_CHILD))) { |
10266 | result = EROFS; |
10267 | goto out; |
10268 | } |
10269 | |
10270 | /* |
10271 | * Check for noexec filesystems. |
10272 | */ |
10273 | if ((rights & KAUTH_VNODE_EXECUTE) && |
10274 | (vap->va_type == VREG) && mp && (mp->mnt_flag & MNT_NOEXEC)) { |
10275 | result = EACCES; |
10276 | goto out; |
10277 | } |
10278 | |
10279 | if (vfs_context_issuser(ctx)) { |
10280 | /* |
10281 | * if we're not asking for execute permissions or modifications, |
10282 | * then we're done, this action is authorized. |
10283 | */ |
10284 | if (!(rights & (KAUTH_VNODE_EXECUTE | KAUTH_VNODE_WRITE_RIGHTS))) { |
10285 | goto out; |
10286 | } |
10287 | is_suser = TRUE; |
10288 | } |
10289 | |
10290 | if (mp) { |
10291 | if (vfs_extendedsecurity(mp) && VATTR_IS_ACTIVE(vap, va_acl) && !VATTR_IS_SUPPORTED(vap, va_acl)) { |
10292 | panic("(1) vnode attrs not complete for vnode_attr_authorize" ); |
10293 | } |
10294 | vnode_attr_handle_uid_and_gid(vap, mp, ctx); |
10295 | } |
10296 | |
10297 | if ((vap->va_active & VNODE_SEC_ATTRS_NO_ACL) != (vap->va_supported & VNODE_SEC_ATTRS_NO_ACL)) { |
10298 | panic("(2) vnode attrs not complete for vnode_attr_authorize (2) vap->va_active = 0x%llx , vap->va_supported = 0x%llx" , |
10299 | vap->va_active, vap->va_supported); |
10300 | } |
10301 | |
result = vnode_attr_authorize_internal(vcp, mp, rights, is_suser,
&found_deny, noimmutable, FALSE);
10304 | |
10305 | if (mp) { |
10306 | vap->va_uid = ouid; |
10307 | vap->va_gid = ogid; |
10308 | } |
10309 | |
10310 | if (result == EPERM) { |
10311 | result = EACCES; |
10312 | } |
10313 | out: |
10314 | return result; |
10315 | } |
10316 | |
10317 | |
10318 | int |
10319 | vnode_authattr_new(vnode_t dvp, struct vnode_attr *vap, int noauth, vfs_context_t ctx) |
10320 | { |
10321 | return vnode_authattr_new_internal(dvp, vap, noauth, NULL, ctx); |
10322 | } |
10323 | |
10324 | /* |
10325 | * Check that the attribute information in vattr can be legally applied to |
10326 | * a new file by the context. |
10327 | */ |
10328 | static int |
10329 | vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uint32_t *defaulted_fieldsp, vfs_context_t ctx) |
10330 | { |
10331 | int error; |
10332 | int has_priv_suser, ismember, defaulted_owner, defaulted_group, defaulted_mode; |
10333 | uint32_t inherit_flags; |
10334 | kauth_cred_t cred; |
10335 | guid_t changer; |
10336 | mount_t dmp; |
10337 | struct vnode_attr dva; |
10338 | |
10339 | error = 0; |
10340 | |
10341 | if (defaulted_fieldsp) { |
10342 | *defaulted_fieldsp = 0; |
10343 | } |
10344 | |
10345 | defaulted_owner = defaulted_group = defaulted_mode = 0; |
10346 | |
10347 | inherit_flags = 0; |
10348 | |
10349 | /* |
10350 | * Require that the filesystem support extended security to apply any. |
10351 | */ |
10352 | if (!vfs_extendedsecurity(dvp->v_mount) && |
10353 | (VATTR_IS_ACTIVE(vap, va_acl) || VATTR_IS_ACTIVE(vap, va_uuuid) || VATTR_IS_ACTIVE(vap, va_guuid))) { |
10354 | error = EINVAL; |
10355 | goto out; |
10356 | } |
10357 | |
10358 | /* |
10359 | * Default some fields. |
10360 | */ |
10361 | dmp = dvp->v_mount; |
10362 | |
10363 | /* |
10364 | * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit owner is set, that |
10365 | * owner takes ownership of all new files. |
10366 | */ |
10367 | if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsowner != KAUTH_UID_NONE)) { |
10368 | VATTR_SET(vap, va_uid, dmp->mnt_fsowner); |
10369 | defaulted_owner = 1; |
10370 | } else { |
10371 | if (!VATTR_IS_ACTIVE(vap, va_uid)) { |
10372 | /* default owner is current user */ |
10373 | VATTR_SET(vap, va_uid, kauth_cred_getuid(vfs_context_ucred(ctx))); |
10374 | defaulted_owner = 1; |
10375 | } |
10376 | } |
10377 | |
10378 | /* |
10379 | * We need the dvp's va_flags and *may* need the gid of the directory, |
* so we ask for both here.
10381 | */ |
10382 | VATTR_INIT(&dva); |
10383 | VATTR_WANTED(&dva, va_gid); |
10384 | VATTR_WANTED(&dva, va_flags); |
if ((error = vnode_getattr(dvp, &dva, ctx)) != 0) {
10386 | goto out; |
10387 | } |
10388 | |
10389 | /* |
* If the filesystem is mounted IGNORE_OWNERSHIP and an explicit group is set, that
10391 | * group takes ownership of all new files. |
10392 | */ |
10393 | if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsgroup != KAUTH_GID_NONE)) { |
10394 | VATTR_SET(vap, va_gid, dmp->mnt_fsgroup); |
10395 | defaulted_group = 1; |
10396 | } else { |
10397 | if (!VATTR_IS_ACTIVE(vap, va_gid)) { |
10398 | /* default group comes from parent object, fallback to current user */ |
10399 | if (VATTR_IS_SUPPORTED(&dva, va_gid)) { |
10400 | VATTR_SET(vap, va_gid, dva.va_gid); |
10401 | } else { |
10402 | VATTR_SET(vap, va_gid, kauth_cred_getgid(vfs_context_ucred(ctx))); |
10403 | } |
10404 | defaulted_group = 1; |
10405 | } |
10406 | } |
10407 | |
10408 | if (!VATTR_IS_ACTIVE(vap, va_flags)) { |
10409 | VATTR_SET(vap, va_flags, 0); |
10410 | } |
10411 | |
10412 | /* Determine if SF_RESTRICTED should be inherited from the parent |
10413 | * directory. */ |
10414 | if (VATTR_IS_SUPPORTED(&dva, va_flags)) { |
10415 | inherit_flags = dva.va_flags & (UF_DATAVAULT | SF_RESTRICTED); |
10416 | } |
10417 | |
10418 | /* default mode is everything, masked with current umask */ |
10419 | if (!VATTR_IS_ACTIVE(vap, va_mode)) { |
10420 | VATTR_SET(vap, va_mode, ACCESSPERMS & ~vfs_context_proc(ctx)->p_fd.fd_cmask); |
10421 | KAUTH_DEBUG("ATTR - defaulting new file mode to %o from umask %o" , |
10422 | vap->va_mode, vfs_context_proc(ctx)->p_fd.fd_cmask); |
10423 | defaulted_mode = 1; |
10424 | } |
10425 | /* set timestamps to now */ |
10426 | if (!VATTR_IS_ACTIVE(vap, va_create_time)) { |
10427 | nanotime(ts: &vap->va_create_time); |
10428 | VATTR_SET_ACTIVE(vap, va_create_time); |
10429 | } |
10430 | |
10431 | /* |
10432 | * Check for attempts to set nonsensical fields. |
10433 | */ |
10434 | if (vap->va_active & ~VNODE_ATTR_NEWOBJ) { |
10435 | error = EINVAL; |
10436 | KAUTH_DEBUG("ATTR - ERROR - attempt to set unsupported new-file attributes %llx" , |
10437 | vap->va_active & ~VNODE_ATTR_NEWOBJ); |
10438 | goto out; |
10439 | } |
10440 | |
10441 | /* |
10442 | * Quickly check for the applicability of any enforcement here. |
10443 | * Tests below maintain the integrity of the local security model. |
10444 | */ |
10445 | if (vfs_authopaque(mp: dvp->v_mount)) { |
10446 | goto out; |
10447 | } |
10448 | |
10449 | /* |
10450 | * We need to know if the caller is the superuser, or if the work is |
10451 | * otherwise already authorised. |
10452 | */ |
10453 | cred = vfs_context_ucred(ctx); |
10454 | if (noauth) { |
10455 | /* doing work for the kernel */ |
10456 | has_priv_suser = 1; |
10457 | } else { |
10458 | has_priv_suser = vfs_context_issuser(ctx); |
10459 | } |
10460 | |
10461 | |
10462 | if (VATTR_IS_ACTIVE(vap, va_flags)) { |
10463 | vap->va_flags &= ~SF_SYNTHETIC; |
10464 | if (has_priv_suser) { |
10465 | if ((vap->va_flags & (UF_SETTABLE | SF_SETTABLE)) != vap->va_flags) { |
10466 | error = EPERM; |
10467 | KAUTH_DEBUG(" DENIED - superuser attempt to set illegal flag(s)" ); |
10468 | goto out; |
10469 | } |
10470 | } else { |
10471 | if ((vap->va_flags & UF_SETTABLE) != vap->va_flags) { |
10472 | error = EPERM; |
10473 | KAUTH_DEBUG(" DENIED - user attempt to set illegal flag(s)" ); |
10474 | goto out; |
10475 | } |
10476 | } |
10477 | } |
10478 | |
10479 | /* if not superuser, validate legality of new-item attributes */ |
10480 | if (!has_priv_suser) { |
10481 | if (!defaulted_mode && VATTR_IS_ACTIVE(vap, va_mode)) { |
10482 | /* setgid? */ |
10483 | if (vap->va_mode & S_ISGID) { |
if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid);
10486 | goto out; |
10487 | } |
10488 | if (!ismember) { |
10489 | KAUTH_DEBUG(" DENIED - can't set SGID bit, not a member of %d" , vap->va_gid); |
10490 | error = EPERM; |
10491 | goto out; |
10492 | } |
10493 | } |
10494 | |
10495 | /* setuid? */ |
10496 | if ((vap->va_mode & S_ISUID) && (vap->va_uid != kauth_cred_getuid(cred: cred))) { |
10497 | KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit" ); |
10498 | error = EPERM; |
10499 | goto out; |
10500 | } |
10501 | } |
10502 | if (!defaulted_owner && (vap->va_uid != kauth_cred_getuid(cred: cred))) { |
10503 | KAUTH_DEBUG(" DENIED - cannot create new item owned by %d" , vap->va_uid); |
10504 | error = EPERM; |
10505 | goto out; |
10506 | } |
10507 | if (!defaulted_group) { |
if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
KAUTH_DEBUG(" ERROR - got %d checking for membership in %d", error, vap->va_gid);
10510 | goto out; |
10511 | } |
10512 | if (!ismember) { |
10513 | KAUTH_DEBUG(" DENIED - cannot create new item with group %d - not a member" , vap->va_gid); |
10514 | error = EPERM; |
10515 | goto out; |
10516 | } |
10517 | } |
10518 | |
10519 | /* initialising owner/group UUID */ |
10520 | if (VATTR_IS_ACTIVE(vap, va_uuuid)) { |
if ((error = kauth_cred_getguid(cred, &changer)) != 0) {
KAUTH_DEBUG(" ERROR - got %d trying to get caller UUID", error);
10523 | /* XXX ENOENT here - no GUID - should perhaps become EPERM */ |
10524 | goto out; |
10525 | } |
10526 | if (!kauth_guid_equal(guid1: &vap->va_uuuid, guid2: &changer)) { |
10527 | KAUTH_DEBUG(" ERROR - cannot create item with supplied owner UUID - not us" ); |
10528 | error = EPERM; |
10529 | goto out; |
10530 | } |
10531 | } |
10532 | if (VATTR_IS_ACTIVE(vap, va_guuid)) { |
if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) {
KAUTH_DEBUG(" ERROR - got %d trying to check group membership", error);
10535 | goto out; |
10536 | } |
10537 | if (!ismember) { |
10538 | KAUTH_DEBUG(" ERROR - cannot create item with supplied group UUID - not a member" ); |
10539 | error = EPERM; |
10540 | goto out; |
10541 | } |
10542 | } |
10543 | } |
10544 | out: |
10545 | if (inherit_flags) { |
10546 | /* Apply SF_RESTRICTED to the file if its parent directory was |
10547 | * restricted. This is done at the end so that root is not |
10548 | * required if this flag is only set due to inheritance. */ |
10549 | VATTR_SET(vap, va_flags, (vap->va_flags | inherit_flags)); |
10550 | } |
10551 | if (defaulted_fieldsp) { |
10552 | if (defaulted_mode) { |
10553 | *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_MODE; |
10554 | } |
10555 | if (defaulted_group) { |
10556 | *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_GID; |
10557 | } |
10558 | if (defaulted_owner) { |
10559 | *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_UID; |
10560 | } |
10561 | } |
10562 | return error; |
10563 | } |
10564 | |
10565 | /* |
10566 | * Check that the attribute information in vap can be legally written by the |
10567 | * context. |
10568 | * |
10569 | * Call this when you're not sure about the vnode_attr; either its contents |
10570 | * have come from an unknown source, or when they are variable. |
10571 | * |
10572 | * Returns errno, or zero and sets *actionp to the KAUTH_VNODE_* actions that |
10573 | * must be authorized to be permitted to write the vattr. |
10574 | */ |
10575 | int |
10576 | vnode_authattr(vnode_t vp, struct vnode_attr *vap, kauth_action_t *actionp, vfs_context_t ctx) |
10577 | { |
10578 | struct vnode_attr ova; |
10579 | kauth_action_t required_action; |
10580 | int error, has_priv_suser, ismember, chowner, chgroup, clear_suid, clear_sgid; |
10581 | guid_t changer; |
10582 | gid_t group; |
10583 | uid_t owner; |
10584 | mode_t newmode; |
10585 | kauth_cred_t cred; |
10586 | uint32_t fdelta; |
10587 | |
10588 | VATTR_INIT(&ova); |
10589 | required_action = 0; |
10590 | error = 0; |
10591 | |
10592 | /* |
10593 | * Quickly check for enforcement applicability. |
10594 | */ |
10595 | if (vfs_authopaque(mp: vp->v_mount)) { |
10596 | goto out; |
10597 | } |
10598 | |
10599 | /* |
10600 | * Check for attempts to set nonsensical fields. |
10601 | */ |
10602 | if (vap->va_active & VNODE_ATTR_RDONLY) { |
10603 | KAUTH_DEBUG("ATTR - ERROR: attempt to set readonly attribute(s)" ); |
10604 | error = EINVAL; |
10605 | goto out; |
10606 | } |
10607 | |
10608 | /* |
10609 | * We need to know if the caller is the superuser. |
10610 | */ |
10611 | cred = vfs_context_ucred(ctx); |
has_priv_suser = kauth_cred_issuser(cred);
10613 | |
10614 | /* |
10615 | * If any of the following are changing, we need information from the old file: |
10616 | * va_uid |
10617 | * va_gid |
10618 | * va_mode |
10619 | * va_uuuid |
10620 | * va_guuid |
10621 | */ |
10622 | if (VATTR_IS_ACTIVE(vap, va_uid) || |
10623 | VATTR_IS_ACTIVE(vap, va_gid) || |
10624 | VATTR_IS_ACTIVE(vap, va_mode) || |
10625 | VATTR_IS_ACTIVE(vap, va_uuuid) || |
10626 | VATTR_IS_ACTIVE(vap, va_guuid)) { |
10627 | VATTR_WANTED(&ova, va_mode); |
10628 | VATTR_WANTED(&ova, va_uid); |
10629 | VATTR_WANTED(&ova, va_gid); |
10630 | VATTR_WANTED(&ova, va_uuuid); |
10631 | VATTR_WANTED(&ova, va_guuid); |
10632 | KAUTH_DEBUG("ATTR - security information changing, fetching existing attributes" ); |
10633 | } |
10634 | |
10635 | /* |
10636 | * If timestamps are being changed, we need to know who the file is owned |
10637 | * by. |
10638 | */ |
10639 | if (VATTR_IS_ACTIVE(vap, va_create_time) || |
10640 | VATTR_IS_ACTIVE(vap, va_change_time) || |
10641 | VATTR_IS_ACTIVE(vap, va_modify_time) || |
10642 | VATTR_IS_ACTIVE(vap, va_access_time) || |
10643 | VATTR_IS_ACTIVE(vap, va_backup_time) || |
10644 | VATTR_IS_ACTIVE(vap, va_addedtime)) { |
10645 | VATTR_WANTED(&ova, va_uid); |
10646 | #if 0 /* enable this when we support UUIDs as official owners */ |
10647 | VATTR_WANTED(&ova, va_uuuid); |
10648 | #endif |
10649 | KAUTH_DEBUG("ATTR - timestamps changing, fetching uid and GUID" ); |
10650 | } |
10651 | |
10652 | /* |
10653 | * If flags are being changed, we need the old flags. |
10654 | */ |
10655 | if (VATTR_IS_ACTIVE(vap, va_flags)) { |
10656 | KAUTH_DEBUG("ATTR - flags changing, fetching old flags" ); |
10657 | VATTR_WANTED(&ova, va_flags); |
10658 | } |
10659 | |
10660 | /* |
10661 | * If ACLs are being changed, we need the old ACLs. |
10662 | */ |
10663 | if (VATTR_IS_ACTIVE(vap, va_acl)) { |
10664 | KAUTH_DEBUG("ATTR - acl changing, fetching old flags" ); |
10665 | VATTR_WANTED(&ova, va_acl); |
10666 | } |
10667 | |
10668 | /* |
10669 | * If the size is being set, make sure it's not a directory. |
10670 | */ |
10671 | if (VATTR_IS_ACTIVE(vap, va_data_size)) { |
10672 | /* size is only meaningful on regular files, don't permit otherwise */ |
10673 | if (!vnode_isreg(vp)) { |
10674 | KAUTH_DEBUG("ATTR - ERROR: size change requested on non-file" ); |
10675 | error = vnode_isdir(vp) ? EISDIR : EINVAL; |
10676 | goto out; |
10677 | } |
10678 | } |
10679 | |
10680 | /* |
10681 | * Get old data. |
10682 | */ |
10683 | KAUTH_DEBUG("ATTR - fetching old attributes %016llx" , ova.va_active); |
10684 | if ((error = vnode_getattr(vp, vap: &ova, ctx)) != 0) { |
10685 | KAUTH_DEBUG(" ERROR - got %d trying to get attributes" , error); |
10686 | goto out; |
10687 | } |
10688 | |
10689 | /* |
10690 | * Size changes require write access to the file data. |
10691 | */ |
10692 | if (VATTR_IS_ACTIVE(vap, va_data_size)) { |
10693 | /* if we can't get the size, or it's different, we need write access */ |
10694 | KAUTH_DEBUG("ATTR - size change, requiring WRITE_DATA" ); |
10695 | required_action |= KAUTH_VNODE_WRITE_DATA; |
10696 | } |
10697 | |
10698 | /* |
10699 | * Changing timestamps? |
10700 | * |
10701 | * Note that we are only called to authorize user-requested time changes; |
10702 | * side-effect time changes are not authorized. Authorisation is only |
10703 | * required for existing files. |
10704 | * |
10705 | * Non-owners are not permitted to change the time on an existing |
10706 | * file to anything other than the current time. |
10707 | */ |
10708 | if (VATTR_IS_ACTIVE(vap, va_create_time) || |
10709 | VATTR_IS_ACTIVE(vap, va_change_time) || |
10710 | VATTR_IS_ACTIVE(vap, va_modify_time) || |
10711 | VATTR_IS_ACTIVE(vap, va_access_time) || |
10712 | VATTR_IS_ACTIVE(vap, va_backup_time) || |
10713 | VATTR_IS_ACTIVE(vap, va_addedtime)) { |
10714 | /* |
10715 | * The owner and root may set any timestamps they like, |
10716 | * provided that the file is not immutable. The owner still needs |
10717 | * WRITE_ATTRIBUTES (implied by ownership but still deniable). |
10718 | */ |
if (has_priv_suser || vauth_node_owner(&ova, cred)) {
KAUTH_DEBUG("ATTR - root or owner changing timestamps");
10721 | required_action |= KAUTH_VNODE_CHECKIMMUTABLE | KAUTH_VNODE_WRITE_ATTRIBUTES; |
10722 | } else { |
10723 | /* just setting the current time? */ |
10724 | if (vap->va_vaflags & VA_UTIMES_NULL) { |
10725 | KAUTH_DEBUG("ATTR - non-root/owner changing timestamps, requiring WRITE_ATTRIBUTES" ); |
10726 | required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES; |
10727 | } else { |
10728 | KAUTH_DEBUG("ATTR - ERROR: illegal timestamp modification attempted" ); |
10729 | error = EACCES; |
10730 | goto out; |
10731 | } |
10732 | } |
10733 | } |
10734 | |
10735 | /* |
10736 | * Changing file mode? |
10737 | */ |
10738 | if (VATTR_IS_ACTIVE(vap, va_mode) && VATTR_IS_SUPPORTED(&ova, va_mode) && (ova.va_mode != vap->va_mode)) { |
10739 | KAUTH_DEBUG("ATTR - mode change from %06o to %06o" , ova.va_mode, vap->va_mode); |
10740 | |
10741 | /* |
10742 | * Mode changes always have the same basic auth requirements. |
10743 | */ |
10744 | if (has_priv_suser) { |
10745 | KAUTH_DEBUG("ATTR - superuser mode change, requiring immutability check" ); |
10746 | required_action |= KAUTH_VNODE_CHECKIMMUTABLE; |
10747 | } else { |
10748 | /* need WRITE_SECURITY */ |
10749 | KAUTH_DEBUG("ATTR - non-superuser mode change, requiring WRITE_SECURITY" ); |
10750 | required_action |= KAUTH_VNODE_WRITE_SECURITY; |
10751 | } |
10752 | |
10753 | /* |
10754 | * Can't set the setgid bit if you're not in the group and not root. Have to have |
10755 | * existing group information in the case we're not setting it right now. |
10756 | */ |
10757 | if (vap->va_mode & S_ISGID) { |
10758 | required_action |= KAUTH_VNODE_CHECKIMMUTABLE; /* always required */ |
10759 | if (!has_priv_suser) { |
10760 | if (VATTR_IS_ACTIVE(vap, va_gid)) { |
10761 | group = vap->va_gid; |
10762 | } else if (VATTR_IS_SUPPORTED(&ova, va_gid)) { |
10763 | group = ova.va_gid; |
10764 | } else { |
10765 | KAUTH_DEBUG("ATTR - ERROR: setgid but no gid available" ); |
10766 | error = EINVAL; |
10767 | goto out; |
10768 | } |
10769 | /* |
10770 | * This might be too restrictive; WRITE_SECURITY might be implied by |
10771 | * membership in this case, rather than being an additional requirement. |
10772 | */ |
if ((error = kauth_cred_ismember_gid(cred, group, &ismember)) != 0) {
KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid);
10775 | goto out; |
10776 | } |
10777 | if (!ismember) { |
10778 | KAUTH_DEBUG(" DENIED - can't set SGID bit, not a member of %d" , group); |
10779 | error = EPERM; |
10780 | goto out; |
10781 | } |
10782 | } |
10783 | } |
10784 | |
10785 | /* |
10786 | * Can't set the setuid bit unless you're root or the file's owner. |
10787 | */ |
10788 | if (vap->va_mode & S_ISUID) { |
10789 | required_action |= KAUTH_VNODE_CHECKIMMUTABLE; /* always required */ |
10790 | if (!has_priv_suser) { |
10791 | if (VATTR_IS_ACTIVE(vap, va_uid)) { |
10792 | owner = vap->va_uid; |
10793 | } else if (VATTR_IS_SUPPORTED(&ova, va_uid)) { |
10794 | owner = ova.va_uid; |
10795 | } else { |
10796 | KAUTH_DEBUG("ATTR - ERROR: setuid but no uid available" ); |
10797 | error = EINVAL; |
10798 | goto out; |
10799 | } |
if (owner != kauth_cred_getuid(cred)) {
10801 | /* |
10802 | * We could allow this if WRITE_SECURITY is permitted, perhaps. |
10803 | */ |
10804 | KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit" ); |
10805 | error = EPERM; |
10806 | goto out; |
10807 | } |
10808 | } |
10809 | } |
10810 | } |
10811 | |
10812 | /* |
10813 | * Validate/mask flags changes. This checks that only the flags in |
10814 | * the UF_SETTABLE mask are being set, and preserves the flags in |
10815 | * the SF_SETTABLE case. |
10816 | * |
10817 | * Since flags changes may be made in conjunction with other changes, |
10818 | * we will ask the auth code to ignore immutability in the case that |
10819 | * the SF_* flags are not set and we are only manipulating the file flags. |
10820 | * |
10821 | */ |
10822 | if (VATTR_IS_ACTIVE(vap, va_flags)) { |
10823 | /* compute changing flags bits */ |
10824 | vap->va_flags &= ~SF_SYNTHETIC; |
10825 | ova.va_flags &= ~SF_SYNTHETIC; |
10826 | if (VATTR_IS_SUPPORTED(&ova, va_flags)) { |
10827 | fdelta = vap->va_flags ^ ova.va_flags; |
10828 | } else { |
10829 | fdelta = vap->va_flags; |
10830 | } |
10831 | |
10832 | if (fdelta != 0) { |
10833 | KAUTH_DEBUG("ATTR - flags changing, requiring WRITE_SECURITY" ); |
10834 | required_action |= KAUTH_VNODE_WRITE_SECURITY; |
10835 | |
10836 | /* check that changing bits are legal */ |
10837 | if (has_priv_suser) { |
10838 | /* |
10839 | * The immutability check will prevent us from clearing the SF_* |
10840 | * flags unless the system securelevel permits it, so just check |
10841 | * for legal flags here. |
10842 | */ |
10843 | if (fdelta & ~(UF_SETTABLE | SF_SETTABLE)) { |
10844 | error = EPERM; |
10845 | KAUTH_DEBUG(" DENIED - superuser attempt to set illegal flag(s)" ); |
10846 | goto out; |
10847 | } |
10848 | } else { |
10849 | if (fdelta & ~UF_SETTABLE) { |
10850 | error = EPERM; |
10851 | KAUTH_DEBUG(" DENIED - user attempt to set illegal flag(s)" ); |
10852 | goto out; |
10853 | } |
10854 | } |
10855 | /* |
10856 | * If the caller has the ability to manipulate file flags, |
10857 | * security is not reduced by ignoring them for this operation. |
10858 | * |
10859 | * A more complete test here would consider the 'after' states of the flags |
10860 | * to determine whether it would permit the operation, but this becomes |
10861 | * very complex. |
10862 | * |
10863 | * Ignoring immutability is conditional on securelevel; this does not bypass |
10864 | * the SF_* flags if securelevel > 0. |
10865 | */ |
10866 | required_action |= KAUTH_VNODE_NOIMMUTABLE; |
10867 | } |
10868 | } |
10869 | |
10870 | /* |
10871 | * Validate ownership information. |
10872 | */ |
10873 | chowner = 0; |
10874 | chgroup = 0; |
10875 | clear_suid = 0; |
10876 | clear_sgid = 0; |
10877 | |
10878 | /* |
10879 | * uid changing |
10880 | * Note that if the filesystem didn't give us a UID, we expect that it doesn't |
10881 | * support them in general, and will ignore it if/when we try to set it. |
10882 | * We might want to clear the uid out of vap completely here. |
10883 | */ |
10884 | if (VATTR_IS_ACTIVE(vap, va_uid)) { |
10885 | if (VATTR_IS_SUPPORTED(&ova, va_uid) && (vap->va_uid != ova.va_uid)) { |
10886 | if (!has_priv_suser && (kauth_cred_getuid(cred: cred) != vap->va_uid)) { |
10887 | KAUTH_DEBUG(" DENIED - non-superuser cannot change ownershipt to a third party" ); |
10888 | error = EPERM; |
10889 | goto out; |
10890 | } |
10891 | chowner = 1; |
10892 | } |
10893 | clear_suid = 1; |
10894 | } |
10895 | |
10896 | /* |
10897 | * gid changing |
10898 | * Note that if the filesystem didn't give us a GID, we expect that it doesn't |
10899 | * support them in general, and will ignore it if/when we try to set it. |
10900 | * We might want to clear the gid out of vap completely here. |
10901 | */ |
10902 | if (VATTR_IS_ACTIVE(vap, va_gid)) { |
10903 | if (VATTR_IS_SUPPORTED(&ova, va_gid) && (vap->va_gid != ova.va_gid)) { |
10904 | if (!has_priv_suser) { |
if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
KAUTH_DEBUG(" ERROR - got %d checking for membership in %d", error, vap->va_gid);
10907 | goto out; |
10908 | } |
10909 | if (!ismember) { |
10910 | KAUTH_DEBUG(" DENIED - group change from %d to %d but not a member of target group" , |
10911 | ova.va_gid, vap->va_gid); |
10912 | error = EPERM; |
10913 | goto out; |
10914 | } |
10915 | } |
10916 | chgroup = 1; |
10917 | } |
10918 | clear_sgid = 1; |
10919 | } |
10920 | |
10921 | /* |
10922 | * Owner UUID being set or changed. |
10923 | */ |
10924 | if (VATTR_IS_ACTIVE(vap, va_uuuid)) { |
10925 | /* if the owner UUID is not actually changing ... */ |
10926 | if (VATTR_IS_SUPPORTED(&ova, va_uuuid)) { |
10927 | if (kauth_guid_equal(guid1: &vap->va_uuuid, guid2: &ova.va_uuuid)) { |
10928 | goto no_uuuid_change; |
10929 | } |
10930 | |
10931 | /* |
10932 | * If the current owner UUID is a null GUID, check |
10933 | * it against the UUID corresponding to the owner UID. |
10934 | */ |
10935 | if (kauth_guid_equal(guid1: &ova.va_uuuid, guid2: &kauth_null_guid) && |
10936 | VATTR_IS_SUPPORTED(&ova, va_uid)) { |
10937 | guid_t uid_guid; |
10938 | |
if (kauth_cred_uid2guid(ova.va_uid, &uid_guid) == 0 &&
kauth_guid_equal(&vap->va_uuuid, &uid_guid)) {
10941 | goto no_uuuid_change; |
10942 | } |
10943 | } |
10944 | } |
10945 | |
10946 | /* |
10947 | * The owner UUID cannot be set by a non-superuser to anything other than |
10948 | * their own or a null GUID (to "unset" the owner UUID). |
10949 | * Note that file systems must be prepared to handle the |
10950 | * null UUID case in a manner appropriate for that file |
10951 | * system. |
10952 | */ |
10953 | if (!has_priv_suser) { |
if ((error = kauth_cred_getguid(cred, &changer)) != 0) {
KAUTH_DEBUG(" ERROR - got %d trying to get caller UUID", error);
10956 | /* XXX ENOENT here - no UUID - should perhaps become EPERM */ |
10957 | goto out; |
10958 | } |
10959 | if (!kauth_guid_equal(guid1: &vap->va_uuuid, guid2: &changer) && |
10960 | !kauth_guid_equal(guid1: &vap->va_uuuid, guid2: &kauth_null_guid)) { |
10961 | KAUTH_DEBUG(" ERROR - cannot set supplied owner UUID - not us / null" ); |
10962 | error = EPERM; |
10963 | goto out; |
10964 | } |
10965 | } |
10966 | chowner = 1; |
10967 | clear_suid = 1; |
10968 | } |
10969 | no_uuuid_change: |
10970 | /* |
10971 | * Group UUID being set or changed. |
10972 | */ |
10973 | if (VATTR_IS_ACTIVE(vap, va_guuid)) { |
10974 | /* if the group UUID is not actually changing ... */ |
10975 | if (VATTR_IS_SUPPORTED(&ova, va_guuid)) { |
10976 | if (kauth_guid_equal(guid1: &vap->va_guuid, guid2: &ova.va_guuid)) { |
10977 | goto no_guuid_change; |
10978 | } |
10979 | |
10980 | /* |
10981 | * If the current group UUID is a null UUID, check |
10982 | * it against the UUID corresponding to the group GID. |
10983 | */ |
10984 | if (kauth_guid_equal(guid1: &ova.va_guuid, guid2: &kauth_null_guid) && |
10985 | VATTR_IS_SUPPORTED(&ova, va_gid)) { |
10986 | guid_t gid_guid; |
10987 | |
if (kauth_cred_gid2guid(ova.va_gid, &gid_guid) == 0 &&
kauth_guid_equal(&vap->va_guuid, &gid_guid)) {
10990 | goto no_guuid_change; |
10991 | } |
10992 | } |
10993 | } |
10994 | |
10995 | /* |
10996 | * The group UUID cannot be set by a non-superuser to anything other than |
10997 | * one of which they are a member or a null GUID (to "unset" |
10998 | * the group UUID). |
10999 | * Note that file systems must be prepared to handle the |
11000 | * null UUID case in a manner appropriate for that file |
11001 | * system. |
11002 | */ |
11003 | if (!has_priv_suser) { |
11004 | if (kauth_guid_equal(guid1: &vap->va_guuid, guid2: &kauth_null_guid)) { |
11005 | ismember = 1; |
11006 | } else if ((error = kauth_cred_ismember_guid(cred: cred, guidp: &vap->va_guuid, resultp: &ismember)) != 0) { |
11007 | KAUTH_DEBUG(" ERROR - got %d trying to check group membership" , error); |
11008 | goto out; |
11009 | } |
11010 | if (!ismember) { |
11011 | KAUTH_DEBUG(" ERROR - cannot set supplied group UUID - not a member / null" ); |
11012 | error = EPERM; |
11013 | goto out; |
11014 | } |
11015 | } |
11016 | chgroup = 1; |
11017 | } |
11018 | no_guuid_change: |
11019 | |
11020 | /* |
11021 | * Compute authorisation for group/ownership changes. |
11022 | */ |
11023 | if (chowner || chgroup || clear_suid || clear_sgid) { |
11024 | if (has_priv_suser) { |
11025 | KAUTH_DEBUG("ATTR - superuser changing file owner/group, requiring immutability check" ); |
11026 | required_action |= KAUTH_VNODE_CHECKIMMUTABLE; |
11027 | } else { |
11028 | if (chowner) { |
11029 | KAUTH_DEBUG("ATTR - ownership change, requiring TAKE_OWNERSHIP" ); |
11030 | required_action |= KAUTH_VNODE_TAKE_OWNERSHIP; |
11031 | } |
11032 | if (chgroup && !chowner) { |
11033 | KAUTH_DEBUG("ATTR - group change, requiring WRITE_SECURITY" ); |
11034 | required_action |= KAUTH_VNODE_WRITE_SECURITY; |
11035 | } |
11036 | } |
11037 | |
11038 | /* |
11039 | * clear set-uid and set-gid bits. POSIX only requires this for |
11040 | * non-privileged processes but we do it even for root. |
11041 | */ |
11042 | if (VATTR_IS_ACTIVE(vap, va_mode)) { |
11043 | newmode = vap->va_mode; |
11044 | } else if (VATTR_IS_SUPPORTED(&ova, va_mode)) { |
11045 | newmode = ova.va_mode; |
11046 | } else { |
11047 | KAUTH_DEBUG("CHOWN - trying to change owner but cannot get mode from filesystem to mask setugid bits" ); |
11048 | newmode = 0; |
11049 | } |
11050 | |
/* chown always clears the setuid/setgid bits. An exception is made for
* setattrlist, which can set <uid, gid, mode> on a file at the same time:
* in that case setattrlist is allowed to set the new mode and change
* (chown) the uid/gid.
*/
11056 | if (newmode & (S_ISUID | S_ISGID)) { |
11057 | if (!VATTR_IS_ACTIVE(vap, va_mode)) { |
11058 | KAUTH_DEBUG("CHOWN - masking setugid bits from mode %o to %o" , |
11059 | newmode, newmode & ~(S_ISUID | S_ISGID)); |
11060 | newmode &= ~(S_ISUID | S_ISGID); |
11061 | } |
11062 | VATTR_SET(vap, va_mode, newmode); |
11063 | } |
11064 | } |
11065 | |
11066 | /* |
11067 | * Authorise changes in the ACL. |
11068 | */ |
11069 | if (VATTR_IS_ACTIVE(vap, va_acl)) { |
11070 | /* no existing ACL */ |
11071 | if (!VATTR_IS_ACTIVE(&ova, va_acl) || (ova.va_acl == NULL)) { |
11072 | /* adding an ACL */ |
11073 | if (vap->va_acl != NULL) { |
11074 | required_action |= KAUTH_VNODE_WRITE_SECURITY; |
11075 | KAUTH_DEBUG("CHMOD - adding ACL" ); |
11076 | } |
11077 | |
11078 | /* removing an existing ACL */ |
11079 | } else if (vap->va_acl == NULL) { |
11080 | required_action |= KAUTH_VNODE_WRITE_SECURITY; |
11081 | KAUTH_DEBUG("CHMOD - removing ACL" ); |
11082 | |
11083 | /* updating an existing ACL */ |
11084 | } else { |
11085 | if (vap->va_acl->acl_entrycount != ova.va_acl->acl_entrycount) { |
11086 | /* entry count changed, must be different */ |
11087 | required_action |= KAUTH_VNODE_WRITE_SECURITY; |
11088 | KAUTH_DEBUG("CHMOD - adding/removing ACL entries" ); |
11089 | } else if (vap->va_acl->acl_entrycount > 0) { |
11090 | /* both ACLs have the same ACE count, said count is 1 or more, bitwise compare ACLs */ |
11091 | if (memcmp(s1: &vap->va_acl->acl_ace[0], s2: &ova.va_acl->acl_ace[0], |
11092 | n: sizeof(struct kauth_ace) * vap->va_acl->acl_entrycount)) { |
11093 | required_action |= KAUTH_VNODE_WRITE_SECURITY; |
11094 | KAUTH_DEBUG("CHMOD - changing ACL entries" ); |
11095 | } |
11096 | } |
11097 | } |
11098 | } |
11099 | |
11100 | /* |
11101 | * Other attributes that require authorisation. |
11102 | */ |
11103 | if (VATTR_IS_ACTIVE(vap, va_encoding)) { |
11104 | required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES; |
11105 | } |
11106 | |
11107 | out: |
11108 | if (VATTR_IS_SUPPORTED(&ova, va_acl) && (ova.va_acl != NULL)) { |
kauth_acl_free(ova.va_acl);
11110 | } |
11111 | if (error == 0) { |
11112 | *actionp = required_action; |
11113 | } |
11114 | return error; |
11115 | } |
11116 | |
11117 | static int |
11118 | setlocklocal_callback(struct vnode *vp, __unused void *cargs) |
11119 | { |
11120 | vnode_lock_spin(vp); |
11121 | vp->v_flag |= VLOCKLOCAL; |
11122 | vnode_unlock(vp); |
11123 | |
11124 | return VNODE_RETURNED; |
11125 | } |
11126 | |
11127 | void |
11128 | vfs_setlocklocal(mount_t mp) |
11129 | { |
11130 | mount_lock_spin(mp); |
11131 | mp->mnt_kern_flag |= MNTK_LOCK_LOCAL; |
11132 | mount_unlock(mp); |
11133 | |
11134 | /* |
11135 | * The number of active vnodes is expected to be |
11136 | * very small when vfs_setlocklocal is invoked. |
11137 | */ |
vnode_iterate(mp, 0, setlocklocal_callback, NULL);
11139 | } |
11140 | |
11141 | void |
11142 | vfs_setcompoundopen(mount_t mp) |
11143 | { |
11144 | mount_lock_spin(mp); |
11145 | mp->mnt_compound_ops |= COMPOUND_VNOP_OPEN; |
11146 | mount_unlock(mp); |
11147 | } |
11148 | |
11149 | void |
11150 | vnode_setswapmount(vnode_t vp) |
11151 | { |
11152 | mount_lock(mp: vp->v_mount); |
11153 | vp->v_mount->mnt_kern_flag |= MNTK_SWAP_MOUNT; |
11154 | mount_unlock(mp: vp->v_mount); |
11155 | } |
11156 | |
11157 | void |
11158 | vfs_setfskit(mount_t mp) |
11159 | { |
11160 | mount_lock_spin(mp); |
11161 | mp->mnt_kern_flag |= MNTK_FSKIT; |
11162 | mount_unlock(mp); |
11163 | } |
11164 | |
11165 | char * |
11166 | vfs_getfstypenameref_locked(mount_t mp, size_t *lenp) |
11167 | { |
11168 | char *name; |
11169 | |
11170 | if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) { |
11171 | name = mp->fstypename_override; |
11172 | } else { |
11173 | name = mp->mnt_vfsstat.f_fstypename; |
11174 | } |
11175 | if (lenp != NULL) { |
*lenp = strlen(name);
11177 | } |
11178 | return name; |
11179 | } |
11180 | |
11181 | void |
11182 | vfs_getfstypename(mount_t mp, char *buf, size_t buflen) |
11183 | { |
11184 | mount_lock_spin(mp); |
strlcpy(buf, vfs_getfstypenameref_locked(mp, NULL), buflen);
11186 | mount_unlock(mp); |
11187 | } |
11188 | |
11189 | void |
11190 | vfs_setfstypename_locked(mount_t mp, const char *name) |
11191 | { |
11192 | if (name == NULL || name[0] == '\0') { |
11193 | mp->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE; |
11194 | mp->fstypename_override[0] = '\0'; |
11195 | } else { |
11196 | strlcpy(dst: mp->fstypename_override, src: name, |
11197 | n: sizeof(mp->fstypename_override)); |
11198 | mp->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE; |
11199 | } |
11200 | } |
11201 | |
11202 | void |
11203 | vfs_setfstypename(mount_t mp, const char *name) |
11204 | { |
11205 | mount_lock_spin(mp); |
11206 | vfs_setfstypename_locked(mp, name); |
11207 | mount_unlock(mp); |
11208 | } |
11209 | |
11210 | int64_t |
11211 | vnode_getswappin_avail(vnode_t vp) |
11212 | { |
11213 | int64_t max_swappin_avail = 0; |
11214 | |
11215 | mount_lock(mp: vp->v_mount); |
11216 | if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_SWAPPIN_SUPPORTED) { |
11217 | max_swappin_avail = vp->v_mount->mnt_max_swappin_available; |
11218 | } |
11219 | mount_unlock(mp: vp->v_mount); |
11220 | |
11221 | return max_swappin_avail; |
11222 | } |
11223 | |
11224 | |
11225 | void |
11226 | vn_setunionwait(vnode_t vp) |
11227 | { |
11228 | vnode_lock_spin(vp); |
11229 | vp->v_flag |= VISUNION; |
11230 | vnode_unlock(vp); |
11231 | } |
11232 | |
11233 | |
11234 | void |
11235 | vn_checkunionwait(vnode_t vp) |
11236 | { |
11237 | vnode_lock_spin(vp); |
11238 | while ((vp->v_flag & VISUNION) == VISUNION) { |
11239 | msleep(chan: (caddr_t)&vp->v_flag, mtx: &vp->v_lock, pri: 0, wmesg: 0, ts: 0); |
11240 | } |
11241 | vnode_unlock(vp); |
11242 | } |
11243 | |
11244 | void |
11245 | vn_clearunionwait(vnode_t vp, int locked) |
11246 | { |
11247 | if (!locked) { |
11248 | vnode_lock_spin(vp); |
11249 | } |
11250 | if ((vp->v_flag & VISUNION) == VISUNION) { |
11251 | vp->v_flag &= ~VISUNION; |
11252 | wakeup(chan: (caddr_t)&vp->v_flag); |
11253 | } |
11254 | if (!locked) { |
11255 | vnode_unlock(vp); |
11256 | } |
11257 | } |
11258 | |
11259 | /* |
11260 | * Removes orphaned apple double files during a rmdir |
11261 | * Works by: |
11262 | * 1. vnode_suspend(). |
11263 | * 2. Call VNOP_READDIR() till the end of directory is reached. |
11264 | * 3. Check if the directory entries returned are regular files with name starting with "._". If not, return ENOTEMPTY. |
11265 | * 4. Continue (2) and (3) till end of directory is reached. |
11266 | * 5. If all the entries in the directory were files with "._" name, delete all the files. |
11267 | * 6. vnode_resume() |
11268 | * 7. If deletion of all files succeeded, call VNOP_RMDIR() again. |
11269 | */ |
11270 | |
11271 | errno_t |
11272 | rmdir_remove_orphaned_appleDouble(vnode_t vp, vfs_context_t ctx, int * restart_flag) |
11273 | { |
11274 | #define UIO_BUFF_SIZE 2048 |
11275 | uio_t auio = NULL; |
11276 | int eofflag, siz = UIO_BUFF_SIZE, alloc_size = 0, nentries = 0; |
11277 | int open_flag = 0, full_erase_flag = 0; |
11278 | UIO_STACKBUF(uio_buf, 1); |
11279 | char *rbuf = NULL; |
11280 | void *dir_pos; |
11281 | void *dir_end; |
11282 | struct dirent *dp; |
11283 | errno_t error; |
11284 | |
11285 | error = vnode_suspend(vp); |
11286 | |
11287 | /* |
11288 | * restart_flag is set so that the calling rmdir sleeps and resets |
11289 | */ |
11290 | if (error == EBUSY) { |
11291 | *restart_flag = 1; |
11292 | } |
11293 | if (error != 0) { |
11294 | return error; |
11295 | } |
11296 | |
11297 | /* |
11298 | * Prevent dataless fault materialization while we have |
11299 | * a suspended vnode. |
11300 | */ |
11301 | uthread_t ut = current_uthread(); |
11302 | bool saved_nodatalessfaults = |
11303 | (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? true : false; |
11304 | ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS; |
11305 | |
11306 | /* |
11307 | * set up UIO |
11308 | */ |
11309 | rbuf = kalloc_data(siz, Z_WAITOK); |
11310 | alloc_size = siz; |
11311 | if (rbuf) { |
auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ,
&uio_buf[0], sizeof(uio_buf));
11314 | } |
11315 | if (!rbuf || !auio) { |
11316 | error = ENOMEM; |
11317 | goto outsc; |
11318 | } |
11319 | |
uio_setoffset(auio, 0);
11321 | |
11322 | eofflag = 0; |
11323 | |
11324 | if ((error = VNOP_OPEN(vp, FREAD, ctx))) { |
11325 | goto outsc; |
11326 | } else { |
11327 | open_flag = 1; |
11328 | } |
11329 | |
11330 | /* |
11331 | * First pass checks if all files are appleDouble files. |
11332 | */ |
11333 | |
11334 | do { |
11335 | siz = UIO_BUFF_SIZE; |
uio_reset(auio, uio_offset(auio), UIO_SYSSPACE, UIO_READ);
uio_addiov(auio, CAST_USER_ADDR_T(rbuf), UIO_BUFF_SIZE);
11338 | |
11339 | if ((error = VNOP_READDIR(vp, auio, 0, &eofflag, &nentries, ctx))) { |
11340 | goto outsc; |
11341 | } |
11342 | |
if (uio_resid(auio) != 0) {
siz -= uio_resid(auio);
11345 | } |
11346 | |
11347 | /* |
11348 | * Iterate through directory |
11349 | */ |
11350 | dir_pos = (void*) rbuf; |
11351 | dir_end = (void*) (rbuf + siz); |
11352 | dp = (struct dirent*) (dir_pos); |
11353 | |
11354 | if (dir_pos == dir_end) { |
11355 | eofflag = 1; |
11356 | } |
11357 | |
11358 | while (dir_pos < dir_end) { |
11359 | /* |
11360 | * Check for . and .. as well as directories |
11361 | */ |
11362 | if (dp->d_ino != 0 && |
11363 | !((dp->d_namlen == 1 && dp->d_name[0] == '.') || |
11364 | (dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.'))) { |
11365 | /* |
11366 | * Check for irregular files and ._ files |
11367 | * If there is a ._._ file abort the op |
11368 | */ |
11369 | if (dp->d_namlen < 2 || |
11370 | strncmp(s1: dp->d_name, s2: "._" , n: 2) || |
11371 | (dp->d_namlen >= 4 && !strncmp(s1: &(dp->d_name[2]), s2: "._" , n: 2))) { |
11372 | error = ENOTEMPTY; |
11373 | goto outsc; |
11374 | } |
11375 | } |
11376 | dir_pos = (void*) ((uint8_t*)dir_pos + dp->d_reclen); |
11377 | dp = (struct dirent*)dir_pos; |
11378 | } |
11379 | |
11380 | /* |
11381 | * workaround for HFS/NFS setting eofflag before end of file |
11382 | */ |
11383 | if (vp->v_tag == VT_HFS && nentries > 2) { |
11384 | eofflag = 0; |
11385 | } |
11386 | |
11387 | if (vp->v_tag == VT_NFS) { |
11388 | if (eofflag && !full_erase_flag) { |
11389 | full_erase_flag = 1; |
11390 | eofflag = 0; |
uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
11392 | } else if (!eofflag && full_erase_flag) { |
11393 | full_erase_flag = 0; |
11394 | } |
11395 | } |
11396 | } while (!eofflag); |
11397 | /* |
11398 | * If we've made it here all the files in the dir are ._ files. |
11399 | * We can delete the files even though the node is suspended |
11400 | * because we are the owner of the file. |
11401 | */ |
11402 | |
uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
11404 | eofflag = 0; |
11405 | full_erase_flag = 0; |
11406 | |
11407 | do { |
11408 | siz = UIO_BUFF_SIZE; |
uio_reset(auio, uio_offset(auio), UIO_SYSSPACE, UIO_READ);
uio_addiov(auio, CAST_USER_ADDR_T(rbuf), UIO_BUFF_SIZE);
11411 | |
11412 | error = VNOP_READDIR(vp, auio, 0, &eofflag, &nentries, ctx); |
11413 | |
11414 | if (error != 0) { |
11415 | goto outsc; |
11416 | } |
11417 | |
if (uio_resid(auio) != 0) {
siz -= uio_resid(auio);
11420 | } |
11421 | |
11422 | /* |
11423 | * Iterate through directory |
11424 | */ |
11425 | dir_pos = (void*) rbuf; |
11426 | dir_end = (void*) (rbuf + siz); |
11427 | dp = (struct dirent*) dir_pos; |
11428 | |
11429 | if (dir_pos == dir_end) { |
11430 | eofflag = 1; |
11431 | } |
11432 | |
11433 | while (dir_pos < dir_end) { |
11434 | /* |
11435 | * Check for . and .. as well as directories |
11436 | */ |
11437 | if (dp->d_ino != 0 && |
11438 | !((dp->d_namlen == 1 && dp->d_name[0] == '.') || |
11439 | (dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.')) |
11440 | ) { |
11441 | error = unlink1(ctx, vp, |
11442 | CAST_USER_ADDR_T(dp->d_name), UIO_SYSSPACE, |
11443 | VNODE_REMOVE_SKIP_NAMESPACE_EVENT | |
11444 | VNODE_REMOVE_NO_AUDIT_PATH); |
11445 | |
11446 | if (error && error != ENOENT) { |
11447 | goto outsc; |
11448 | } |
11449 | } |
11450 | dir_pos = (void*) ((uint8_t*)dir_pos + dp->d_reclen); |
11451 | dp = (struct dirent*)dir_pos; |
11452 | } |
11453 | |
11454 | /* |
11455 | * workaround for HFS/NFS setting eofflag before end of file |
11456 | */ |
11457 | if (vp->v_tag == VT_HFS && nentries > 2) { |
11458 | eofflag = 0; |
11459 | } |
11460 | |
11461 | if (vp->v_tag == VT_NFS) { |
11462 | if (eofflag && !full_erase_flag) { |
11463 | full_erase_flag = 1; |
11464 | eofflag = 0; |
uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
11466 | } else if (!eofflag && full_erase_flag) { |
11467 | full_erase_flag = 0; |
11468 | } |
11469 | } |
11470 | } while (!eofflag); |
11471 | |
11472 | |
11473 | error = 0; |
11474 | |
11475 | outsc: |
11476 | if (open_flag) { |
11477 | VNOP_CLOSE(vp, FREAD, ctx); |
11478 | } |
11479 | |
11480 | if (auio) { |
uio_free(auio);
11482 | } |
11483 | kfree_data(rbuf, alloc_size); |
11484 | |
11485 | if (saved_nodatalessfaults == false) { |
11486 | ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS; |
11487 | } |
11488 | |
11489 | vnode_resume(vp); |
11490 | |
11491 | return error; |
11492 | } |
11493 | |
11494 | |
11495 | void |
11496 | lock_vnode_and_post(vnode_t vp, int kevent_num) |
11497 | { |
11498 | /* Only take the lock if there's something there! */ |
11499 | if (vp->v_knotes.slh_first != NULL) { |
11500 | vnode_lock(vp); |
11501 | KNOTE(&vp->v_knotes, kevent_num); |
11502 | vnode_unlock(vp); |
11503 | } |
11504 | } |
11505 | |
11506 | void panic_print_vnodes(void); |
11507 | |
11508 | /* define PANIC_PRINTS_VNODES only if investigation is required. */ |
11509 | #ifdef PANIC_PRINTS_VNODES |
11510 | |
11511 | static const char * |
11512 | __vtype(uint16_t vtype) |
11513 | { |
11514 | switch (vtype) { |
11515 | case VREG: |
11516 | return "R" ; |
11517 | case VDIR: |
11518 | return "D" ; |
11519 | case VBLK: |
11520 | return "B" ; |
11521 | case VCHR: |
11522 | return "C" ; |
11523 | case VLNK: |
11524 | return "L" ; |
11525 | case VSOCK: |
11526 | return "S" ; |
11527 | case VFIFO: |
11528 | return "F" ; |
11529 | case VBAD: |
11530 | return "x" ; |
11531 | case VSTR: |
11532 | return "T" ; |
11533 | case VCPLX: |
11534 | return "X" ; |
11535 | default: |
11536 | return "?" ; |
11537 | } |
11538 | } |
11539 | |
11540 | /* |
11541 | * build a path from the bottom up |
11542 | * NOTE: called from the panic path - no alloc'ing of memory and no locks! |
11543 | */ |
11544 | static char * |
11545 | __vpath(vnode_t vp, char *str, int len, int depth) |
11546 | { |
11547 | int vnm_len; |
11548 | const char *src; |
11549 | char *dst; |
11550 | |
11551 | if (len <= 0) { |
11552 | return str; |
11553 | } |
11554 | /* str + len is the start of the string we created */ |
11555 | if (!vp->v_name) { |
11556 | return str + len; |
11557 | } |
11558 | |
11559 | /* follow mount vnodes to get the full path */ |
11560 | if ((vp->v_flag & VROOT)) { |
11561 | if (vp->v_mount != NULL && vp->v_mount->mnt_vnodecovered) { |
11562 | return __vpath(vp->v_mount->mnt_vnodecovered, |
11563 | str, len, depth + 1); |
11564 | } |
11565 | return str + len; |
11566 | } |
11567 | |
11568 | src = vp->v_name; |
11569 | vnm_len = strlen(src); |
11570 | if (vnm_len > len) { |
11571 | /* truncate the name to fit in the string */ |
11572 | src += (vnm_len - len); |
11573 | vnm_len = len; |
11574 | } |
11575 | |
11576 | /* start from the back and copy just characters (no NULLs) */ |
11577 | |
11578 | /* this will chop off leaf path (file) names */ |
11579 | if (depth > 0) { |
11580 | dst = str + len - vnm_len; |
11581 | memcpy(dst, src, vnm_len); |
11582 | len -= vnm_len; |
11583 | } else { |
11584 | dst = str + len; |
11585 | } |
11586 | |
11587 | if (vp->v_parent && len > 1) { |
11588 | /* follow parents up the chain */ |
11589 | len--; |
11590 | *(dst - 1) = '/'; |
11591 | return __vpath(vp->v_parent, str, len, depth + 1); |
11592 | } |
11593 | |
11594 | return dst; |
11595 | } |
11596 | |
11597 | #define SANE_VNODE_PRINT_LIMIT 5000 |
11598 | void |
11599 | panic_print_vnodes(void) |
11600 | { |
11601 | mount_t mnt; |
11602 | vnode_t vp; |
11603 | int nvnodes = 0; |
11604 | const char *type; |
11605 | char *nm; |
11606 | char vname[257]; |
11607 | |
11608 | paniclog_append_noflush("\n***** VNODES *****\n" |
11609 | "TYPE UREF ICNT PATH\n" ); |
11610 | |
11611 | /* NULL-terminate the path name */ |
11612 | vname[sizeof(vname) - 1] = '\0'; |
11613 | |
11614 | /* |
11615 | * iterate all vnodelist items in all mounts (mntlist) -> mnt_vnodelist |
11616 | */ |
11617 | TAILQ_FOREACH(mnt, &mountlist, mnt_list) { |
11618 | if (!ml_validate_nofault((vm_offset_t)mnt, sizeof(mount_t))) { |
11619 | paniclog_append_noflush("Unable to iterate the mount list %p - encountered an invalid mount pointer %p \n" , |
11620 | &mountlist, mnt); |
11621 | break; |
11622 | } |
11623 | |
11624 | TAILQ_FOREACH(vp, &mnt->mnt_vnodelist, v_mntvnodes) { |
11625 | if (!ml_validate_nofault((vm_offset_t)vp, sizeof(vnode_t))) { |
11626 | paniclog_append_noflush("Unable to iterate the vnode list %p - encountered an invalid vnode pointer %p \n" , |
11627 | &mnt->mnt_vnodelist, vp); |
11628 | break; |
11629 | } |
11630 | |
11631 | if (++nvnodes > SANE_VNODE_PRINT_LIMIT) { |
11632 | return; |
11633 | } |
11634 | type = __vtype(vp->v_type); |
11635 | nm = __vpath(vp, vname, sizeof(vname) - 1, 0); |
11636 | paniclog_append_noflush("%s %0d %0d %s\n" , |
11637 | type, vp->v_usecount, vp->v_iocount, nm); |
11638 | } |
11639 | } |
11640 | } |
11641 | |
11642 | #else /* !PANIC_PRINTS_VNODES */ |
11643 | void |
11644 | panic_print_vnodes(void) |
11645 | { |
11646 | return; |
11647 | } |
11648 | #endif |
11649 | |
11650 | |
11651 | #ifdef CONFIG_IOCOUNT_TRACE |
11652 | static void |
11653 | record_iocount_trace_vnode(vnode_t vp, int type) |
11654 | { |
11655 | void *stacks[IOCOUNT_TRACE_MAX_FRAMES] = {0}; |
11656 | int idx = vp->v_iocount_trace[type].idx; |
11657 | |
11658 | if (idx >= IOCOUNT_TRACE_MAX_IDX) { |
11659 | return; |
11660 | } |
11661 | |
11662 | OSBacktrace((void **)&stacks[0], IOCOUNT_TRACE_MAX_FRAMES); |
11663 | |
11664 | /* |
11665 | * To save index space, only store the unique backtraces. If dup is found, |
11666 | * just bump the count and return. |
11667 | */ |
11668 | for (int i = 0; i < idx; i++) { |
11669 | if (memcmp(&stacks[0], &vp->v_iocount_trace[type].stacks[i][0], |
11670 | sizeof(stacks)) == 0) { |
11671 | vp->v_iocount_trace[type].counts[i]++; |
11672 | return; |
11673 | } |
11674 | } |
11675 | |
11676 | memcpy(&vp->v_iocount_trace[type].stacks[idx][0], &stacks[0], |
11677 | sizeof(stacks)); |
11678 | vp->v_iocount_trace[type].counts[idx] = 1; |
11679 | vp->v_iocount_trace[type].idx++; |
11680 | } |
11681 | |
11682 | static void |
11683 | record_iocount_trace_uthread(vnode_t vp, int count) |
11684 | { |
11685 | struct uthread *ut; |
11686 | |
11687 | ut = current_uthread(); |
11688 | ut->uu_iocount += count; |
11689 | |
11690 | if (count == 1) { |
11691 | if (ut->uu_vpindex < 32) { |
11692 | OSBacktrace((void **)&ut->uu_pcs[ut->uu_vpindex][0], 10); |
11693 | |
11694 | ut->uu_vps[ut->uu_vpindex] = vp; |
11695 | ut->uu_vpindex++; |
11696 | } |
11697 | } |
11698 | } |
11699 | |
11700 | static void |
11701 | record_vp(vnode_t vp, int count) |
11702 | { |
11703 | if (__probable(bootarg_vnode_iocount_trace == 0 && |
11704 | bootarg_uthread_iocount_trace == 0)) { |
11705 | return; |
11706 | } |
11707 | |
11708 | #if CONFIG_TRIGGERS |
11709 | if (vp->v_resolve) { |
11710 | return; |
11711 | } |
11712 | #endif |
11713 | if ((vp->v_flag & VSYSTEM)) { |
11714 | return; |
11715 | } |
11716 | |
11717 | if (bootarg_vnode_iocount_trace) { |
11718 | record_iocount_trace_vnode(vp, |
11719 | (count > 0) ? IOCOUNT_TRACE_VGET : IOCOUNT_TRACE_VPUT); |
11720 | } |
11721 | if (bootarg_uthread_iocount_trace) { |
11722 | record_iocount_trace_uthread(vp, count); |
11723 | } |
11724 | } |
11725 | #endif /* CONFIG_IOCOUNT_TRACE */ |
11726 | |
11727 | #if CONFIG_TRIGGERS |
11728 | #define __triggers_unused |
11729 | #else |
11730 | #define __triggers_unused __unused |
11731 | #endif |
11732 | |
11733 | resolver_result_t |
11734 | vfs_resolver_result(__triggers_unused uint32_t seq, __triggers_unused enum resolver_status stat, __triggers_unused int aux) |
11735 | { |
11736 | #if CONFIG_TRIGGERS |
11737 | /* |
11738 | * |<--- 32 --->|<--- 28 --->|<- 4 ->| |
11739 | * sequence auxiliary status |
11740 | */ |
11741 | return (((uint64_t)seq) << 32) | |
11742 | (((uint64_t)(aux & 0x0fffffff)) << 4) | |
11743 | (uint64_t)(stat & 0x0000000F); |
11744 | #else |
11745 | return (0x0ULL) | (((uint64_t)ENOTSUP) << 4) | (((uint64_t)RESOLVER_ERROR) & 0xF); |
11746 | #endif |
11747 | } |
11748 | |
11749 | #if CONFIG_TRIGGERS |
11750 | |
11751 | #define TRIG_DEBUG 0 |
11752 | |
11753 | #if TRIG_DEBUG |
11754 | #define TRIG_LOG(...) do { printf("%s: ", __FUNCTION__); printf(__VA_ARGS__); } while (0) |
11755 | #else |
11756 | #define TRIG_LOG(...) |
11757 | #endif |
11758 | |
11759 | /* |
11760 | * Resolver result functions |
11761 | */ |
11762 | |
11763 | |
11764 | enum resolver_status |
11765 | vfs_resolver_status(resolver_result_t result) |
11766 | { |
11767 | /* lower 4 bits is status */ |
11768 | return result & 0x0000000F; |
11769 | } |
11770 | |
11771 | uint32_t |
11772 | vfs_resolver_sequence(resolver_result_t result) |
11773 | { |
11774 | /* upper 32 bits is sequence */ |
11775 | return (uint32_t)(result >> 32); |
11776 | } |
11777 | |
11778 | int |
11779 | vfs_resolver_auxiliary(resolver_result_t result) |
11780 | { |
11781 | /* 28 bits of auxiliary */ |
11782 | return (int)(((uint32_t)(result & 0xFFFFFFF0)) >> 4); |
11783 | } |
11784 | |
11785 | /* |
11786 | * SPI |
11787 | * Call in for resolvers to update vnode trigger state |
11788 | */ |
11789 | int |
11790 | vnode_trigger_update(vnode_t vp, resolver_result_t result) |
11791 | { |
11792 | vnode_resolve_t rp; |
11793 | uint32_t seq; |
11794 | enum resolver_status stat; |
11795 | |
11796 | if (vp->v_resolve == NULL) { |
11797 | return EINVAL; |
11798 | } |
11799 | |
11800 | stat = vfs_resolver_status(result); |
11801 | seq = vfs_resolver_sequence(result); |
11802 | |
11803 | if ((stat != RESOLVER_RESOLVED) && (stat != RESOLVER_UNRESOLVED)) { |
11804 | return EINVAL; |
11805 | } |
11806 | |
11807 | rp = vp->v_resolve; |
11808 | lck_mtx_lock(lck: &rp->vr_lock); |
11809 | |
11810 | if (seq > rp->vr_lastseq) { |
11811 | if (stat == RESOLVER_RESOLVED) { |
11812 | rp->vr_flags |= VNT_RESOLVED; |
11813 | } else { |
11814 | rp->vr_flags &= ~VNT_RESOLVED; |
11815 | } |
11816 | |
11817 | rp->vr_lastseq = seq; |
11818 | } |
11819 | |
11820 | lck_mtx_unlock(lck: &rp->vr_lock); |
11821 | |
11822 | return 0; |
11823 | } |
11824 | |
11825 | static int |
11826 | vnode_resolver_attach(vnode_t vp, vnode_resolve_t rp, boolean_t ref) |
11827 | { |
11828 | int error; |
11829 | |
11830 | vnode_lock_spin(vp); |
11831 | if (vp->v_resolve != NULL) { |
11832 | vnode_unlock(vp); |
11833 | return EINVAL; |
11834 | } else { |
11835 | vp->v_resolve = rp; |
11836 | } |
11837 | vnode_unlock(vp); |
11838 | |
11839 | if (ref) { |
11840 | error = vnode_ref_ext(vp, O_EVTONLY, VNODE_REF_FORCE); |
11841 | if (error != 0) { |
11842 | panic("VNODE_REF_FORCE didn't help..." ); |
11843 | } |
11844 | } |
11845 | |
11846 | return 0; |
11847 | } |
11848 | |
11849 | /* |
11850 | * VFS internal interfaces for vnode triggers |
11851 | * |
11852 | * vnode must already have an io count on entry |
11853 | * v_resolve is stable when io count is non-zero |
11854 | */ |
11855 | static int |
11856 | vnode_resolver_create(mount_t mp, vnode_t vp, struct vnode_trigger_param *tinfo, boolean_t external) |
11857 | { |
11858 | vnode_resolve_t rp; |
11859 | int result; |
11860 | char byte; |
11861 | |
11862 | #if 1 |
11863 | /* minimum pointer test (debugging) */ |
11864 | if (tinfo->vnt_data) { |
11865 | byte = *((char *)tinfo->vnt_data); |
11866 | } |
11867 | #endif |
11868 | rp = kalloc_type(struct vnode_resolve, Z_WAITOK | Z_NOFAIL); |
11869 | |
11870 | lck_mtx_init(lck: &rp->vr_lock, grp: &trigger_vnode_lck_grp, attr: &trigger_vnode_lck_attr); |
11871 | |
11872 | rp->vr_resolve_func = tinfo->vnt_resolve_func; |
11873 | rp->vr_unresolve_func = tinfo->vnt_unresolve_func; |
11874 | rp->vr_rearm_func = tinfo->vnt_rearm_func; |
11875 | rp->vr_reclaim_func = tinfo->vnt_reclaim_func; |
11876 | rp->vr_data = tinfo->vnt_data; |
11877 | rp->vr_lastseq = 0; |
11878 | rp->vr_flags = tinfo->vnt_flags & VNT_VALID_MASK; |
11879 | if (external) { |
11880 | rp->vr_flags |= VNT_EXTERNAL; |
11881 | } |
11882 | |
result = vnode_resolver_attach(vp, rp, external);
11884 | if (result != 0) { |
11885 | goto out; |
11886 | } |
11887 | |
11888 | if (mp) { |
11889 | OSAddAtomic(1, &mp->mnt_numtriggers); |
11890 | } |
11891 | |
11892 | return result; |
11893 | |
11894 | out: |
11895 | kfree_type(struct vnode_resolve, rp); |
11896 | return result; |
11897 | } |
11898 | |
11899 | static void |
11900 | vnode_resolver_release(vnode_resolve_t rp) |
11901 | { |
11902 | /* |
11903 | * Give them a chance to free any private data |
11904 | */ |
11905 | if (rp->vr_data && rp->vr_reclaim_func) { |
11906 | rp->vr_reclaim_func(NULLVP, rp->vr_data); |
11907 | } |
11908 | |
11909 | lck_mtx_destroy(lck: &rp->vr_lock, grp: &trigger_vnode_lck_grp); |
11910 | kfree_type(struct vnode_resolve, rp); |
11911 | } |
11912 | |
11913 | /* Called after the vnode has been drained */ |
11914 | static void |
11915 | vnode_resolver_detach(vnode_t vp) |
11916 | { |
11917 | vnode_resolve_t rp; |
11918 | mount_t mp; |
11919 | |
11920 | mp = vnode_mount(vp); |
11921 | |
11922 | vnode_lock(vp); |
11923 | rp = vp->v_resolve; |
11924 | vp->v_resolve = NULL; |
11925 | vnode_unlock(vp); |
11926 | |
11927 | if ((rp->vr_flags & VNT_EXTERNAL) != 0) { |
vnode_rele_ext(vp, O_EVTONLY, 1);
11929 | } |
11930 | |
11931 | vnode_resolver_release(rp); |
11932 | |
11933 | /* Keep count of active trigger vnodes per mount */ |
11934 | OSAddAtomic(-1, &mp->mnt_numtriggers); |
11935 | } |
11936 | |
11937 | __private_extern__ |
11938 | void |
11939 | vnode_trigger_rearm(vnode_t vp, vfs_context_t ctx) |
11940 | { |
11941 | vnode_resolve_t rp; |
11942 | resolver_result_t result; |
11943 | enum resolver_status status; |
11944 | uint32_t seq; |
11945 | |
11946 | if ((vp->v_resolve == NULL) || |
11947 | (vp->v_resolve->vr_rearm_func == NULL) || |
11948 | (vp->v_resolve->vr_flags & VNT_AUTO_REARM) == 0) { |
11949 | return; |
11950 | } |
11951 | |
11952 | rp = vp->v_resolve; |
	lck_mtx_lock(&rp->vr_lock);
11954 | |
11955 | /* |
11956 | * Check if VFS initiated this unmount. If so, we'll catch it after the unresolve completes. |
11957 | */ |
11958 | if (rp->vr_flags & VNT_VFS_UNMOUNTED) { |
		lck_mtx_unlock(&rp->vr_lock);
11960 | return; |
11961 | } |
11962 | |
11963 | /* Check if this vnode is already armed */ |
11964 | if ((rp->vr_flags & VNT_RESOLVED) == 0) { |
		lck_mtx_unlock(&rp->vr_lock);
11966 | return; |
11967 | } |
11968 | |
	lck_mtx_unlock(&rp->vr_lock);
11970 | |
11971 | result = rp->vr_rearm_func(vp, 0, rp->vr_data, ctx); |
11972 | status = vfs_resolver_status(result); |
11973 | seq = vfs_resolver_sequence(result); |
11974 | |
	lck_mtx_lock(&rp->vr_lock);
11976 | if (seq > rp->vr_lastseq) { |
11977 | if (status == RESOLVER_UNRESOLVED) { |
11978 | rp->vr_flags &= ~VNT_RESOLVED; |
11979 | } |
11980 | rp->vr_lastseq = seq; |
11981 | } |
	lck_mtx_unlock(&rp->vr_lock);
11983 | } |
11984 | |
11985 | __private_extern__ |
11986 | int |
11987 | vnode_trigger_resolve(vnode_t vp, struct nameidata *ndp, vfs_context_t ctx) |
11988 | { |
11989 | vnode_resolve_t rp; |
11990 | enum path_operation op; |
11991 | resolver_result_t result; |
11992 | enum resolver_status status; |
11993 | uint32_t seq; |
11994 | |
11995 | /* |
11996 | * N.B. we cannot call vfs_context_can_resolve_triggers() |
11997 | * here because we really only want to suppress that in |
11998 | * the event the trigger will be resolved by something in |
11999 | * user-space. Any triggers that are resolved by the kernel |
12000 | * do not pose a threat of deadlock. |
12001 | */ |
12002 | |
12003 | /* Only trigger on topmost vnodes */ |
12004 | if ((vp->v_resolve == NULL) || |
12005 | (vp->v_resolve->vr_resolve_func == NULL) || |
12006 | (vp->v_mountedhere != NULL)) { |
12007 | return 0; |
12008 | } |
12009 | |
12010 | rp = vp->v_resolve; |
	lck_mtx_lock(&rp->vr_lock);
12012 | |
12013 | /* Check if this vnode is already resolved */ |
12014 | if (rp->vr_flags & VNT_RESOLVED) { |
		lck_mtx_unlock(&rp->vr_lock);
12016 | return 0; |
12017 | } |
12018 | |
	lck_mtx_unlock(&rp->vr_lock);
12020 | |
12021 | #if CONFIG_MACF |
12022 | if ((rp->vr_flags & VNT_KERN_RESOLVE) == 0) { |
12023 | /* |
		 * VNT_KERN_RESOLVE indicates this trigger takes no parameters
		 * from the accessing process other than the act of access
		 * itself. All other triggers must be checked.
12027 | */ |
		int rv = mac_vnode_check_trigger_resolve(ctx, vp, &ndp->ni_cnd);
12029 | if (rv != 0) { |
12030 | return rv; |
12031 | } |
12032 | } |
12033 | #endif |
12034 | |
12035 | /* |
12036 | * XXX |
12037 | * assumes that resolver will not access this trigger vnode (otherwise the kernel will deadlock) |
	 * is there any way to know this???
12039 | * there can also be other legitimate lookups in parallel |
12040 | * |
12041 | * XXX - should we call this on a separate thread with a timeout? |
12042 | * |
	 * XXX - should we use ISLASTCN to pick the op value??? Perhaps only leaves should
	 * get the richer set and non-leaves should get generic OP_LOOKUP? TBD
12045 | */ |
	op = (ndp->ni_op < OP_MAXOP) ? ndp->ni_op : OP_LOOKUP;
12047 | |
12048 | result = rp->vr_resolve_func(vp, &ndp->ni_cnd, op, 0, rp->vr_data, ctx); |
12049 | status = vfs_resolver_status(result); |
12050 | seq = vfs_resolver_sequence(result); |
12051 | |
	lck_mtx_lock(&rp->vr_lock);
12053 | if (seq > rp->vr_lastseq) { |
12054 | if (status == RESOLVER_RESOLVED) { |
12055 | rp->vr_flags |= VNT_RESOLVED; |
12056 | } |
12057 | rp->vr_lastseq = seq; |
12058 | } |
	lck_mtx_unlock(&rp->vr_lock);
12060 | |
12061 | /* On resolver errors, propagate the error back up */ |
12062 | return status == RESOLVER_ERROR ? vfs_resolver_auxiliary(result) : 0; |
12063 | } |
12064 | |
12065 | static int |
12066 | vnode_trigger_unresolve(vnode_t vp, int flags, vfs_context_t ctx) |
12067 | { |
12068 | vnode_resolve_t rp; |
12069 | resolver_result_t result; |
12070 | enum resolver_status status; |
12071 | uint32_t seq; |
12072 | |
12073 | if ((vp->v_resolve == NULL) || (vp->v_resolve->vr_unresolve_func == NULL)) { |
12074 | return 0; |
12075 | } |
12076 | |
12077 | rp = vp->v_resolve; |
	lck_mtx_lock(&rp->vr_lock);
12079 | |
12080 | /* Check if this vnode is already resolved */ |
12081 | if ((rp->vr_flags & VNT_RESOLVED) == 0) { |
12082 | printf("vnode_trigger_unresolve: not currently resolved\n" ); |
		lck_mtx_unlock(&rp->vr_lock);
12084 | return 0; |
12085 | } |
12086 | |
12087 | rp->vr_flags |= VNT_VFS_UNMOUNTED; |
12088 | |
	lck_mtx_unlock(&rp->vr_lock);
12090 | |
12091 | /* |
12092 | * XXX |
12093 | * assumes that resolver will not access this trigger vnode (otherwise the kernel will deadlock) |
12094 | * there can also be other legitimate lookups in parallel |
12095 | * |
12096 | * XXX - should we call this on a separate thread with a timeout? |
12097 | */ |
12098 | |
12099 | result = rp->vr_unresolve_func(vp, flags, rp->vr_data, ctx); |
12100 | status = vfs_resolver_status(result); |
12101 | seq = vfs_resolver_sequence(result); |
12102 | |
	lck_mtx_lock(&rp->vr_lock);
12104 | if (seq > rp->vr_lastseq) { |
12105 | if (status == RESOLVER_UNRESOLVED) { |
12106 | rp->vr_flags &= ~VNT_RESOLVED; |
12107 | } |
12108 | rp->vr_lastseq = seq; |
12109 | } |
12110 | rp->vr_flags &= ~VNT_VFS_UNMOUNTED; |
	lck_mtx_unlock(&rp->vr_lock);
12112 | |
12113 | /* On resolver errors, propagate the error back up */ |
12114 | return status == RESOLVER_ERROR ? vfs_resolver_auxiliary(result) : 0; |
12115 | } |
12116 | |
12117 | static int |
12118 | triggerisdescendant(mount_t mp, mount_t rmp) |
12119 | { |
12120 | int match = FALSE; |
12121 | |
12122 | /* |
12123 | * walk up vnode covered chain looking for a match |
12124 | */ |
12125 | name_cache_lock_shared(); |
12126 | |
12127 | while (1) { |
12128 | vnode_t vp; |
12129 | |
12130 | /* did we encounter "/" ? */ |
12131 | if (mp->mnt_flag & MNT_ROOTFS) { |
12132 | break; |
12133 | } |
12134 | |
12135 | vp = mp->mnt_vnodecovered; |
12136 | if (vp == NULLVP) { |
12137 | break; |
12138 | } |
12139 | |
12140 | mp = vp->v_mount; |
12141 | if (mp == rmp) { |
12142 | match = TRUE; |
12143 | break; |
12144 | } |
12145 | } |
12146 | |
12147 | name_cache_unlock(); |
12148 | |
12149 | return match; |
12150 | } |
12151 | |
12152 | struct trigger_unmount_info { |
12153 | vfs_context_t ctx; |
12154 | mount_t top_mp; |
12155 | vnode_t trigger_vp; |
12156 | mount_t trigger_mp; |
12157 | uint32_t trigger_vid; |
12158 | int flags; |
12159 | }; |
12160 | |
12161 | static int |
12162 | trigger_unmount_callback(mount_t mp, void * arg) |
12163 | { |
12164 | struct trigger_unmount_info * infop = (struct trigger_unmount_info *)arg; |
12165 | boolean_t mountedtrigger = FALSE; |
12166 | |
12167 | /* |
12168 | * When we encounter the top level mount we're done |
12169 | */ |
12170 | if (mp == infop->top_mp) { |
12171 | return VFS_RETURNED_DONE; |
12172 | } |
12173 | |
12174 | if ((mp->mnt_vnodecovered == NULL) || |
	    (vnode_getwithref(mp->mnt_vnodecovered) != 0)) {
12176 | return VFS_RETURNED; |
12177 | } |
12178 | |
12179 | if ((mp->mnt_vnodecovered->v_mountedhere == mp) && |
12180 | (mp->mnt_vnodecovered->v_resolve != NULL) && |
12181 | (mp->mnt_vnodecovered->v_resolve->vr_flags & VNT_RESOLVED)) { |
12182 | mountedtrigger = TRUE; |
12183 | } |
	vnode_put(mp->mnt_vnodecovered);
12185 | |
12186 | /* |
	 * When we encounter a mounted trigger, check if it's under the top level mount.
12188 | */ |
	if (!mountedtrigger || !triggerisdescendant(mp, infop->top_mp)) {
12190 | return VFS_RETURNED; |
12191 | } |
12192 | |
12193 | /* |
	 * Process any pending nested mount (now that it's not referenced)
12195 | */ |
12196 | if ((infop->trigger_vp != NULLVP) && |
	    (vnode_getwithvid(infop->trigger_vp, infop->trigger_vid) == 0)) {
12198 | vnode_t vp = infop->trigger_vp; |
12199 | int error; |
12200 | |
		vnode_drop(infop->trigger_vp);
12202 | infop->trigger_vp = NULLVP; |
12203 | |
12204 | if (mp == vp->v_mountedhere) { |
12205 | vnode_put(vp); |
12206 | printf("trigger_unmount_callback: unexpected match '%s'\n" , |
12207 | mp->mnt_vfsstat.f_mntonname); |
12208 | return VFS_RETURNED; |
12209 | } |
12210 | if (infop->trigger_mp != vp->v_mountedhere) { |
12211 | vnode_put(vp); |
12212 | printf("trigger_unmount_callback: trigger mnt changed! (%p != %p)\n" , |
12213 | infop->trigger_mp, vp->v_mountedhere); |
12214 | goto savenext; |
12215 | } |
12216 | |
		error = vnode_trigger_unresolve(vp, infop->flags, infop->ctx);
12218 | vnode_put(vp); |
12219 | if (error) { |
12220 | printf("unresolving: '%s', err %d\n" , |
12221 | vp->v_mountedhere ? vp->v_mountedhere->mnt_vfsstat.f_mntonname : |
12222 | "???" , error); |
12223 | return VFS_RETURNED_DONE; /* stop iteration on errors */ |
12224 | } |
12225 | } else if (infop->trigger_vp != NULLVP) { |
		vnode_drop(infop->trigger_vp);
12227 | } |
12228 | |
12229 | savenext: |
12230 | /* |
12231 | * We can't call resolver here since we hold a mount iter |
12232 | * ref on mp so save its covered vp for later processing |
12233 | */ |
12234 | infop->trigger_vp = mp->mnt_vnodecovered; |
12235 | if ((infop->trigger_vp != NULLVP) && |
	    (vnode_getwithref(infop->trigger_vp) == 0)) {
12237 | if (infop->trigger_vp->v_mountedhere == mp) { |
12238 | infop->trigger_vid = infop->trigger_vp->v_id; |
			vnode_hold(infop->trigger_vp);
12240 | infop->trigger_mp = mp; |
12241 | } |
		vnode_put(infop->trigger_vp);
12243 | } |
12244 | |
12245 | return VFS_RETURNED; |
12246 | } |
12247 | |
12248 | /* |
12249 | * Attempt to unmount any trigger mounts nested underneath a mount. |
12250 | * This is a best effort attempt and no retries are performed here. |
12251 | * |
 * Note: mp->mnt_rwlock is held exclusively on entry (so be careful).
12253 | */ |
12254 | __private_extern__ |
12255 | void |
12256 | vfs_nested_trigger_unmounts(mount_t mp, int flags, vfs_context_t ctx) |
12257 | { |
12258 | struct trigger_unmount_info info; |
12259 | |
12260 | /* Must have trigger vnodes */ |
12261 | if (mp->mnt_numtriggers == 0) { |
12262 | return; |
12263 | } |
12264 | /* Avoid recursive requests (by checking covered vnode) */ |
12265 | if ((mp->mnt_vnodecovered != NULL) && |
	    (vnode_getwithref(mp->mnt_vnodecovered) == 0)) {
12267 | boolean_t recursive = FALSE; |
12268 | |
12269 | if ((mp->mnt_vnodecovered->v_mountedhere == mp) && |
12270 | (mp->mnt_vnodecovered->v_resolve != NULL) && |
12271 | (mp->mnt_vnodecovered->v_resolve->vr_flags & VNT_VFS_UNMOUNTED)) { |
12272 | recursive = TRUE; |
12273 | } |
		vnode_put(mp->mnt_vnodecovered);
12275 | if (recursive) { |
12276 | return; |
12277 | } |
12278 | } |
12279 | |
12280 | /* |
12281 | * Attempt to unmount any nested trigger mounts (best effort) |
12282 | */ |
12283 | info.ctx = ctx; |
12284 | info.top_mp = mp; |
12285 | info.trigger_vp = NULLVP; |
12286 | info.trigger_vid = 0; |
12287 | info.trigger_mp = NULL; |
12288 | info.flags = flags; |
12289 | |
	(void) vfs_iterate(VFS_ITERATE_TAIL_FIRST, trigger_unmount_callback, &info);
12291 | |
12292 | /* |
	 * Process remaining nested mount (now that it's not referenced)
12294 | */ |
12295 | if ((info.trigger_vp != NULLVP) && |
	    (vnode_getwithvid(info.trigger_vp, info.trigger_vid) == 0)) {
12297 | vnode_t vp = info.trigger_vp; |
12298 | |
12299 | if (info.trigger_mp == vp->v_mountedhere) { |
12300 | (void) vnode_trigger_unresolve(vp, flags, ctx); |
12301 | } |
12302 | vnode_put(vp); |
12303 | vnode_drop(vp); |
12304 | } else if (info.trigger_vp != NULLVP) { |
		vnode_drop(info.trigger_vp);
12306 | } |
12307 | } |
12308 | |
12309 | int |
12310 | vfs_addtrigger(mount_t mp, const char *relpath, struct vnode_trigger_info *vtip, vfs_context_t ctx) |
12311 | { |
12312 | struct nameidata *ndp; |
12313 | int res; |
12314 | vnode_t rvp, vp; |
12315 | struct vnode_trigger_param vtp; |
12316 | |
12317 | /* |
	 * Must be called from a trigger callback, wherein the mount's rwlock is held.
12319 | */ |
	lck_rw_assert(&mp->mnt_rwlock, LCK_RW_ASSERT_HELD);
12321 | |
12322 | TRIG_LOG("Adding trigger at %s\n" , relpath); |
12323 | TRIG_LOG("Trying VFS_ROOT\n" ); |
12324 | |
12325 | ndp = kalloc_type(struct nameidata, Z_WAITOK | Z_NOFAIL); |
12326 | |
12327 | /* |
12328 | * We do a lookup starting at the root of the mountpoint, unwilling |
12329 | * to cross into other mountpoints. |
12330 | */ |
12331 | res = VFS_ROOT(mp, &rvp, ctx); |
12332 | if (res != 0) { |
12333 | goto out; |
12334 | } |
12335 | |
12336 | TRIG_LOG("Trying namei\n" ); |
12337 | |
12338 | NDINIT(ndp, LOOKUP, OP_LOOKUP, USEDVP | NOCROSSMOUNT | FOLLOW, UIO_SYSSPACE, |
12339 | CAST_USER_ADDR_T(relpath), ctx); |
12340 | ndp->ni_dvp = rvp; |
12341 | res = namei(ndp); |
12342 | if (res != 0) { |
		vnode_put(rvp);
12344 | goto out; |
12345 | } |
12346 | |
12347 | vp = ndp->ni_vp; |
12348 | nameidone(ndp); |
	vnode_put(rvp);
12350 | |
12351 | TRIG_LOG("Trying vnode_resolver_create()\n" ); |
12352 | |
12353 | /* |
12354 | * Set up blob. vnode_create() takes a larger structure |
12355 | * with creation info, and we needed something different |
12356 | * for this case. One needs to win, or we need to munge both; |
12357 | * vnode_create() wins. |
12358 | */ |
	bzero(&vtp, sizeof(vtp));
12360 | vtp.vnt_resolve_func = vtip->vti_resolve_func; |
12361 | vtp.vnt_unresolve_func = vtip->vti_unresolve_func; |
12362 | vtp.vnt_rearm_func = vtip->vti_rearm_func; |
12363 | vtp.vnt_reclaim_func = vtip->vti_reclaim_func; |
12365 | vtp.vnt_data = vtip->vti_data; |
12366 | vtp.vnt_flags = vtip->vti_flags; |
12367 | |
	res = vnode_resolver_create(mp, vp, &vtp, TRUE);
12369 | vnode_put(vp); |
12370 | out: |
12371 | kfree_type(struct nameidata, ndp); |
12372 | TRIG_LOG("Returning %d\n" , res); |
12373 | return res; |
12374 | } |
12375 | |
12376 | #endif /* CONFIG_TRIGGERS */ |
12377 | |
12378 | vm_offset_t |
12379 | kdebug_vnode(vnode_t vp) |
12380 | { |
12381 | return VM_KERNEL_ADDRPERM(vp); |
12382 | } |
12383 | |
12384 | static int flush_cache_on_write = 0; |
12385 | SYSCTL_INT(_kern, OID_AUTO, flush_cache_on_write, |
12386 | CTLFLAG_RW | CTLFLAG_LOCKED, &flush_cache_on_write, 0, |
12387 | "always flush the drive cache on writes to uncached files" ); |
12388 | |
12389 | int |
12390 | vnode_should_flush_after_write(vnode_t vp, int ioflag) |
12391 | { |
12392 | return flush_cache_on_write |
12393 | && (ISSET(ioflag, IO_NOCACHE) || vnode_isnocache(vp)); |
12394 | } |
12395 | |
12396 | /* |
12397 | * sysctl for use by disk I/O tracing tools to get the list of existing |
12398 | * vnodes' paths |
12399 | */ |
12400 | |
12401 | #define NPATH_WORDS (MAXPATHLEN / sizeof(unsigned long)) |
12402 | struct vnode_trace_paths_context { |
12403 | uint64_t count; |
12404 | /* |
	 * Must be one less than a multiple of 4 for tracing: the first
	 * kdebug event carries the vnode plus 3 path words, and each
	 * subsequent event carries 4 path words.
12406 | */ |
12407 | unsigned long path[NPATH_WORDS + (4 - (NPATH_WORDS % 4)) - 1]; |
12408 | }; |
12409 | |
12410 | static int |
12411 | vnode_trace_path_callback(struct vnode *vp, void *vctx) |
12412 | { |
12413 | struct vnode_trace_paths_context *ctx = vctx; |
12414 | size_t path_len = sizeof(ctx->path); |
12415 | |
12416 | int getpath_len = (int)path_len; |
	if (vn_getpath(vp, (char *)ctx->path, &getpath_len) == 0) {
12418 | /* vn_getpath() NUL-terminates, and len includes the NUL. */ |
12419 | assert(getpath_len >= 0); |
12420 | path_len = (size_t)getpath_len; |
12421 | |
12422 | assert(path_len <= sizeof(ctx->path)); |
		kdebug_vfs_lookup(ctx->path, (int)path_len, vp,
12424 | KDBG_VFS_LOOKUP_FLAG_LOOKUP | KDBG_VFS_LOOKUP_FLAG_NOPROCFILT); |
12425 | |
12426 | if (++(ctx->count) == 1000) { |
12427 | thread_yield_to_preemption(); |
12428 | ctx->count = 0; |
12429 | } |
12430 | } |
12431 | |
12432 | return VNODE_RETURNED; |
12433 | } |
12434 | |
12435 | static int |
12436 | vfs_trace_paths_callback(mount_t mp, void *arg) |
12437 | { |
12438 | if (mp->mnt_flag & MNT_LOCAL) { |
		vnode_iterate(mp, VNODE_ITERATE_ALL, vnode_trace_path_callback, arg);
12440 | } |
12441 | |
12442 | return VFS_RETURNED; |
12443 | } |
12444 | |
12445 | static int sysctl_vfs_trace_paths SYSCTL_HANDLER_ARGS { |
12446 | struct vnode_trace_paths_context ctx; |
12447 | |
12448 | (void)oidp; |
12449 | (void)arg1; |
12450 | (void)arg2; |
12451 | (void)req; |
12452 | |
	if (!kauth_cred_issuser(kauth_cred_get())) {
12454 | return EPERM; |
12455 | } |
12456 | |
12457 | if (!kdebug_enable || !kdebug_debugid_enabled(VFS_LOOKUP)) { |
12458 | return EINVAL; |
12459 | } |
12460 | |
	bzero(&ctx, sizeof(struct vnode_trace_paths_context));
12462 | |
	vfs_iterate(0, vfs_trace_paths_callback, &ctx);
12464 | |
12465 | return 0; |
12466 | } |
12467 | |
SYSCTL_PROC(_vfs_generic, OID_AUTO, trace_paths, CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED, NULL, 0, &sysctl_vfs_trace_paths, "-", "trace_paths");
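
/*
 * Illustrative sketch (not part of the build): the handler takes no input and
 * returns no data, so a privileged tracing tool can trigger the path dump
 * simply by reading the node while kdebug tracing of VFS_LOOKUP is enabled:
 *
 *	#include <sys/sysctl.h>
 *
 *	(void)sysctlbyname("vfs.generic.trace_paths", NULL, NULL, NULL, 0);
 */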
12469 | |
12470 | #if CONFIG_FILE_LEASES |
12471 | #include <IOKit/IOBSD.h> |
12472 | #include <sys/file_internal.h> |
12473 | |
12474 | #define FILE_LEASES_ENTITLEMENT "com.apple.private.vfs.file-leases" |
12475 | |
12476 | static uint32_t lease_break_timeout = 60; /* secs */ |
12477 | |
12478 | #if (DEVELOPMENT || DEBUG) |
12479 | static int lease_debug = 0; |
12480 | static int lease_entitlement_override = 0; |
12481 | |
SYSCTL_NODE(_vfs, OID_AUTO, lease, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs lease");
SYSCTL_UINT(_vfs_lease, OID_AUTO, break_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, &lease_break_timeout, 0, "");
SYSCTL_INT(_vfs_lease, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED, &lease_debug, 0, "");
SYSCTL_INT(_vfs_lease, OID_AUTO, entitlement_override, CTLFLAG_RW | CTLFLAG_LOCKED, &lease_entitlement_override, 0, "");
12486 | |
12487 | #define LEASEDBG(fmt, args...) \ |
12488 | do { \ |
12489 | if (__improbable(lease_debug)) { \ |
12490 | pid_t cur_pid = proc_getpid(current_proc()); \ |
12491 | printf("%s(%d): " fmt "\n", __func__, cur_pid, ##args); \ |
12492 | } \ |
12493 | } while(0) |
12494 | #else |
12495 | #define LEASEDBG(fmt, args...) /**/ |
12496 | #endif /* (DEVELOPMENT || DEBUG) */ |
12497 | |
12498 | static bool |
12499 | allow_setlease(vfs_context_t ctx) |
12500 | { |
12501 | bool entitled; |
12502 | |
	entitled = IOTaskHasEntitlement(vfs_context_task(ctx),
12504 | FILE_LEASES_ENTITLEMENT); |
12505 | |
12506 | #if (DEVELOPMENT || DEBUG) |
12507 | if (!entitled) { |
12508 | entitled = (lease_entitlement_override == 1); |
12509 | } |
12510 | #endif |
12511 | |
12512 | return entitled; |
12513 | } |
12514 | |
12515 | static file_lease_t |
12516 | file_lease_alloc(struct fileglob *fg, int fl_type, pid_t pid) |
12517 | { |
12518 | file_lease_t fl; |
12519 | |
12520 | fl = kalloc_type(struct file_lease, Z_WAITOK); |
12521 | /* |
	 * Duplicated file descriptors created by dup() or fork() share the
	 * same 'fileglob', so the lease can be released or modified through
	 * any of the duplicated fds. Opening the same file again (by the same
	 * or a different process) produces a different 'fileglob', so a lease
	 * always follows a 'fileglob'.
12527 | */ |
12528 | fl->fl_fg = fg; |
12529 | fl->fl_type = fl_type; |
12530 | fl->fl_pid = pid; |
12531 | fl->fl_downgrade_start = fl->fl_release_start = 0; |
12532 | |
12533 | return fl; |
12534 | } |
12535 | |
12536 | static void |
12537 | file_lease_free(file_lease_t fl) |
12538 | { |
12539 | kfree_type(struct file_lease, fl); |
12540 | } |
12541 | |
12542 | /* |
 * A read lease can be placed only on a file/directory that is opened
 * read-only, which means no other process has the file/directory opened
 * read-write/write-only or mmap'ed writable.
 * A write lease can be placed on a file only if there are no other opens
 * of the file.
12548 | * |
12549 | * Needs to be called with vnode's lock held. |
12550 | */ |
12551 | static int |
12552 | check_for_open_conflict(vnode_t vp, struct fileglob *fg, int fl_type, |
12553 | int expcounts) |
12554 | { |
12555 | int error = 0; |
12556 | |
12557 | if (fl_type == F_RDLCK) { |
12558 | if (vp->v_writecount > expcounts && |
12559 | !(vp->v_writecount == 1 && (fg->fg_flag & FWRITE))) { |
12560 | error = EAGAIN; |
12561 | } else if (ubc_is_mapped_writable(vp)) { |
12562 | error = EAGAIN; |
12563 | } |
12564 | } else if (fl_type == F_WRLCK && vp->v_usecount > expcounts) { |
12565 | error = EAGAIN; |
12566 | } |
12567 | |
12568 | return error; |
12569 | } |
12570 | |
12571 | /* Needs to be called with vnode's lock held. */ |
12572 | static void |
12573 | modify_file_lease(vnode_t vp, file_lease_t fl, int new_fl_type, |
12574 | struct fileglob *new_fg) |
12575 | { |
12576 | LEASEDBG("fl %p changing fl_type from %d to %d (flags 0x%x)" , |
12577 | fl, fl->fl_type, new_fl_type, fl->fl_flags); |
12578 | |
12579 | fl->fl_type = new_fl_type; |
12580 | |
12581 | /* |
12582 | * The lease being modified may be using a different file |
12583 | * descriptor, so usurp the fileglob pointer here. In this |
12584 | * case the old descriptor no longer holds the lease. |
12585 | */ |
12586 | if (new_fg != NULL) { |
12587 | fl->fl_fg = new_fg; |
12588 | } |
12589 | |
12590 | if (fl->fl_flags & FL_FLAG_RELEASE_PENDING || |
12591 | fl->fl_flags & FL_FLAG_DOWNGRADE_PENDING) { |
		wakeup(&vp->v_leases);
12593 | } |
12594 | } |
12595 | |
12596 | static int |
12597 | acquire_file_lease(vnode_t vp, struct fileglob *fg, int fl_type, int expcounts, |
12598 | vfs_context_t ctx) |
12599 | { |
12600 | file_lease_t fl, new_fl, our_fl; |
12601 | int error; |
12602 | |
12603 | /* Make sure "expected count" looks sane. */ |
12604 | if (expcounts < 0 || expcounts > OPEN_MAX) { |
12605 | return EINVAL; |
12606 | } |
12607 | |
	new_fl = file_lease_alloc(fg, fl_type, vfs_context_pid(ctx));
12609 | |
12610 | vnode_lock(vp); |
12611 | |
12612 | error = check_for_open_conflict(vp, fg, fl_type, expcounts); |
12613 | if (error) { |
12614 | LEASEDBG("open conflict on vp %p type %d writecnt %d usecnt %d " |
12615 | "fl_type %d expcounts %d" , |
12616 | vp, vp->v_type, vp->v_writecount, vp->v_usecount, fl_type, |
12617 | expcounts); |
12618 | goto out; |
12619 | } |
12620 | |
12621 | our_fl = NULL; |
12622 | LIST_FOREACH(fl, &vp->v_leases, fl_link) { |
12623 | /* Does the existing lease belong to us? */ |
12624 | if (fl->fl_fg == new_fl->fl_fg || |
12625 | fl->fl_pid == new_fl->fl_pid) { |
12626 | our_fl = fl; |
12627 | continue; |
12628 | } |
12629 | |
12630 | /* |
12631 | * We don't allow placing a new write lease when there is an existing |
12632 | * read lease that doesn't belong to us. We also don't allow putting |
12633 | * a new read lease if there is a pending release on the lease. |
12634 | * Putting a new read lease when there is a pending downgrade on the |
12635 | * lease is fine as it won't cause lease conflict. |
12636 | */ |
12637 | if (fl_type == F_WRLCK || fl->fl_flags & FL_FLAG_RELEASE_PENDING) { |
12638 | break; |
12639 | } |
12640 | } |
12641 | |
12642 | /* |
12643 | * Found an existing lease that we don't own and it conflicts with the |
12644 | * new lease. |
12645 | */ |
	if (fl) {
		LEASEDBG("lease conflict on vp %p fl %p fl_type %d cur_fl_type %d",
		    vp, fl, fl_type, fl->fl_type);

		error = EAGAIN;
		goto out;
	}
12651 | |
12652 | /* Found an existing lease that we own so just change the type. */ |
12653 | if (our_fl) { |
12654 | LEASEDBG("replace lease on vp %p fl %p old_fl_type %d new_fl_type %d" , |
12655 | vp, our_fl, our_fl->fl_type, fl_type); |
12656 | |
		modify_file_lease(vp, our_fl, new_fl->fl_type, new_fl->fl_fg);
12658 | goto out; |
12659 | } |
12660 | |
12661 | LEASEDBG("acquired lease on vp %p type %d fl %p fl_type %d fg %p" , |
12662 | vp, vp->v_type, new_fl, new_fl->fl_type, new_fl->fl_fg); |
12663 | |
12664 | LIST_INSERT_HEAD(&vp->v_leases, new_fl, fl_link); |
12665 | new_fl = NULL; |
12666 | |
12667 | out: |
12668 | vnode_unlock(vp); |
12669 | |
12670 | if (new_fl) { |
		file_lease_free(new_fl);
12672 | } |
12673 | |
12674 | return error; |
12675 | } |
12676 | |
12677 | static int |
12678 | release_file_lease(vnode_t vp, struct fileglob *fg) |
12679 | { |
12680 | file_lease_t fl, fl_tmp; |
12681 | int error = 0; |
12682 | |
12683 | LEASEDBG("request to release lease on vp %p type %d fg %p" , |
12684 | vp, vp->v_type, fg); |
12685 | |
12686 | vnode_lock(vp); |
12687 | |
12688 | LIST_FOREACH_SAFE(fl, &vp->v_leases, fl_link, fl_tmp) { |
12689 | if (fl->fl_fg == fg) { |
12690 | LEASEDBG("released lease on vp %p fl %p type %d" , |
12691 | vp, fl, fl->fl_type); |
12692 | |
12693 | LIST_REMOVE(fl, fl_link); |
12694 | modify_file_lease(vp, fl, F_UNLCK, NULL); |
12695 | break; |
12696 | } |
12697 | } |
12698 | |
12699 | vnode_unlock(vp); |
12700 | |
12701 | if (fl) { |
12702 | file_lease_free(fl); |
12703 | } else { |
12704 | error = ENOLCK; |
12705 | } |
12706 | |
12707 | return error; |
12708 | } |
12709 | |
12710 | /* |
12711 | * Acquire or release a file lease according to the given type (F_RDLCK, |
12712 | * F_WRLCK or F_UNLCK). |
12713 | * |
12714 | * Returns: 0 Success |
12715 | * EAGAIN Failed to acquire a file lease due to conflicting opens |
12716 | * ENOLCK Failed to release a file lease due to lease not found |
12717 | * EPERM Current task doesn't have the entitlement |
12718 | */ |
12719 | int |
12720 | vnode_setlease(vnode_t vp, struct fileglob *fg, int fl_type, int expcounts, |
12721 | vfs_context_t ctx) |
12722 | { |
12723 | int error; |
12724 | |
12725 | if (!allow_setlease(ctx)) { |
12726 | return EPERM; |
12727 | } |
12728 | |
12729 | error = (fl_type == F_UNLCK) ? release_file_lease(vp, fg) : |
12730 | acquire_file_lease(vp, fg, fl_type, expcounts, ctx); |
12731 | |
12732 | return error; |
12733 | } |
12734 | |
12735 | /* |
12736 | * Retrieve the currently in place lease for the file. |
12737 | * |
12738 | * Returns: |
12739 | * F_RDLCK Read lease |
12740 | * F_WRLCK Write lease |
12741 | * F_UNLCK No lease |
12742 | */ |
12743 | int |
12744 | vnode_getlease(vnode_t vp) |
12745 | { |
12746 | file_lease_t fl; |
12747 | int fl_type = F_UNLCK; |
12748 | |
12749 | vnode_lock(vp); |
12750 | |
12751 | /* |
12752 | * There should be only one type of lease in the list as read and write |
12753 | * leases can't co-exist for the same file. |
12754 | */ |
12755 | fl = LIST_FIRST(&vp->v_leases); |
12756 | if (fl) { |
12757 | fl_type = fl->fl_type; |
12758 | } |
12759 | |
12760 | vnode_unlock(vp); |
12761 | |
12762 | LEASEDBG("vp %p fl %p fl_type %d" , vp, fl, fl_type); |
12763 | |
12764 | return fl_type; |
12765 | } |
12766 | |
12767 | /* Must be called with vnode's lock held. */ |
12768 | static bool |
12769 | check_for_lease_conflict(vnode_t vp, int breaker_fl_type, vfs_context_t ctx) |
12770 | { |
12771 | file_lease_t fl; |
12772 | pid_t pid = vfs_context_pid(ctx); |
12773 | bool is_conflict = false; |
12774 | |
12775 | LIST_FOREACH(fl, &vp->v_leases, fl_link) { |
12776 | if ((fl->fl_type == F_WRLCK && fl->fl_pid != pid) || |
12777 | (breaker_fl_type == F_WRLCK && fl->fl_pid != pid)) { |
12778 | LEASEDBG("conflict detected on vp %p type %d fl_type %d " |
12779 | "breaker_fl_type %d" , |
12780 | vp, vp->v_type, fl->fl_type, breaker_fl_type); |
12781 | |
12782 | is_conflict = true; |
12783 | break; |
12784 | } |
12785 | } |
12786 | |
12787 | return is_conflict; |
12788 | } |
12789 | |
12790 | static uint64_t |
12791 | absolutetime_elapsed_in_secs(uint64_t start) |
12792 | { |
12793 | uint64_t elapsed, elapsed_sec; |
12794 | uint64_t now = mach_absolute_time(); |
12795 | |
12796 | elapsed = now - start; |
	absolutetime_to_nanoseconds(elapsed, &elapsed_sec);
12798 | elapsed_sec /= NSEC_PER_SEC; |
12799 | |
12800 | return elapsed_sec; |
12801 | } |
12802 | |
12803 | /* Must be called with vnode's lock held. */ |
12804 | static void |
12805 | handle_lease_break_timedout(vnode_t vp) |
12806 | { |
12807 | file_lease_t fl, fl_tmp; |
12808 | uint64_t elapsed_sec; |
12809 | |
12810 | LIST_FOREACH_SAFE(fl, &vp->v_leases, fl_link, fl_tmp) { |
12811 | if (fl->fl_flags & FL_FLAG_DOWNGRADE_PENDING) { |
			elapsed_sec = absolutetime_elapsed_in_secs(fl->fl_downgrade_start);
12813 | |
12814 | if (elapsed_sec >= lease_break_timeout) { |
12815 | LEASEDBG("force downgrade on vp %p for fl %p elapsed %llu " |
12816 | "timeout %u" , vp, fl, elapsed_sec, lease_break_timeout); |
12817 | |
12818 | fl->fl_flags &= ~FL_FLAG_DOWNGRADE_PENDING; |
12819 | fl->fl_downgrade_start = 0; |
12820 | modify_file_lease(vp, fl, F_RDLCK, NULL); |
12821 | continue; |
12822 | } |
12823 | } |
12824 | if (fl->fl_flags & FL_FLAG_RELEASE_PENDING) { |
			elapsed_sec = absolutetime_elapsed_in_secs(fl->fl_release_start);
12826 | |
12827 | if (elapsed_sec >= lease_break_timeout) { |
12828 | LEASEDBG("force release on vp %p for fl %p elapsed %llu " |
12829 | "timeout %u" , vp, fl, elapsed_sec, lease_break_timeout); |
12830 | |
12831 | LIST_REMOVE(fl, fl_link); |
12832 | file_lease_free(fl); |
12833 | continue; |
12834 | } |
12835 | } |
12836 | } |
12837 | |
12838 | /* Wakeup the lease breaker(s). */ |
	wakeup(&vp->v_leases);
12840 | } |
12841 | |
12842 | /* Must be called with vnode's lock held. */ |
12843 | static void |
12844 | wait_for_lease_break(vnode_t vp, int breaker_fl_type, vfs_context_t ctx) |
12845 | { |
12846 | file_lease_t fl; |
12847 | struct timespec ts; |
12848 | uint64_t elapsed_sec, start_time; |
12849 | int error; |
12850 | |
12851 | restart: |
12852 | fl = LIST_FIRST(&vp->v_leases); |
12853 | assert(fl); |
12854 | |
12855 | /* |
12856 | * In a rare case it is possible that the lease that we are blocked on has |
12857 | * been released and a new lease has been put in place after we are |
	 * signalled to wake up. In this particular case, we treat it as no
	 * conflict and proceed. This can only happen for directory leasing.
12860 | */ |
12861 | if ((fl->fl_flags & (FL_FLAG_DOWNGRADE_PENDING | FL_FLAG_RELEASE_PENDING)) == 0) { |
12862 | LEASEDBG("new lease in place on vp %p fl %p fl_type %d " |
12863 | "breaker_fl_type %d" , |
12864 | vp, fl, fl->fl_type, breaker_fl_type); |
12865 | |
12866 | return; |
12867 | } |
12868 | /* |
	 * Figure out which timer to use for the lease break timeout, as we
	 * could have both timers active. If both are active, pick the one
	 * with the earliest start time.
12872 | */ |
	if (fl->fl_release_start) {
		if (fl->fl_downgrade_start == 0 ||
		    fl->fl_release_start < fl->fl_downgrade_start) {
			start_time = fl->fl_release_start;
		} else {
			start_time = fl->fl_downgrade_start;
		}
	} else {
		start_time = fl->fl_downgrade_start;
	}
12883 | assert(start_time > 0); |
12884 | |
	elapsed_sec = absolutetime_elapsed_in_secs(start_time);
12886 | |
12887 | LEASEDBG("elapsed_sec %llu release_start %llu downgrade_start %llu" , |
12888 | elapsed_sec, fl->fl_release_start, fl->fl_downgrade_start); |
12889 | |
12890 | ts.tv_sec = (lease_break_timeout > elapsed_sec ? |
12891 | (lease_break_timeout - elapsed_sec) : 0); |
12892 | ts.tv_nsec = (ts.tv_sec == 0 ? 1 : 0); |
	error = msleep(&vp->v_leases, &vp->v_lock, PVFS, __func__, &ts);
12894 | |
	if (error != EWOULDBLOCK) {
		/*
		 * Woken up because the lease was released/downgraded by the
		 * lease holder. We don't expect any error from msleep() other
		 * than EWOULDBLOCK. Check if there are any further conflicts.
		 * If so, continue to wait for the next conflict to resolve.
		 */
12902 | if (check_for_lease_conflict(vp, breaker_fl_type, ctx)) { |
12903 | goto restart; |
12904 | } |
12905 | } else { |
12906 | /* |
		 * Woken up because the lease break timeout expired (EWOULDBLOCK).
12908 | * Break/downgrade all conflicting leases. |
12909 | */ |
12910 | handle_lease_break_timedout(vp); |
12911 | |
12912 | if (check_for_lease_conflict(vp, breaker_fl_type, ctx)) { |
12913 | goto restart; |
12914 | } |
12915 | } |
12916 | } |
12917 | |
12918 | /* Must be called with vnode's lock held. */ |
12919 | static void |
12920 | send_lease_break_event(vnode_t vp, uint32_t event) |
12921 | { |
12922 | if (vp->v_knotes.slh_first != NULL) { |
12923 | KNOTE(&vp->v_knotes, event); |
12924 | } |
12925 | } |
12926 | |
12927 | static bool |
12928 | is_dataless_file(vnode_t vp, vfs_context_t ctx) |
12929 | { |
12930 | struct vnode_attr va; |
12931 | bool is_dataless = false; |
12932 | int error; |
12933 | |
12934 | VATTR_INIT(&va); |
12935 | VATTR_WANTED(&va, va_flags); |
12936 | |
	error = vnode_getattr(vp, &va, ctx);
12938 | if (!error && (va.va_flags & SF_DATALESS)) { |
12939 | is_dataless = true; |
12940 | } |
12941 | |
12942 | return is_dataless; |
12943 | } |
12944 | |
12945 | /* |
 * Break the lease(s) in place for the file when there is a conflict.
 * This function returns 0 for almost all call sites. The only exception is
 * when it is called from open1() with the O_NONBLOCK flag and it would need
 * to block waiting for the lease conflict(s) to resolve; in that case
 * EWOULDBLOCK is returned.
12951 | */ |
12952 | int |
12953 | vnode_breaklease(vnode_t vp, uint32_t oflags, vfs_context_t ctx) |
12954 | { |
12955 | file_lease_t fl; |
12956 | uint64_t now; |
12957 | int fl_type; |
12958 | int error = 0; |
12959 | |
12960 | vnode_lock(vp); |
12961 | |
12962 | if (__probable(LIST_EMPTY(&vp->v_leases))) { |
12963 | goto out_unlock; |
12964 | } |
12965 | |
12966 | /* Determine the access mode requested by the lease breaker. */ |
12967 | fl_type = (oflags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC)) ? F_WRLCK : F_RDLCK; |
12968 | |
12969 | /* |
12970 | * If the lease-breaker is just reading, check that it can break |
12971 | * leases first. If the lease-breaker is writing, or if the |
12972 | * context was not specified, we always break. |
	 * We skip the lease break if the lease-breaker is a dataless
	 * manipulator and the file is dataless.
12975 | */ |
12976 | if ((fl_type == F_RDLCK && !vfs_context_can_break_leases(ctx)) || |
12977 | (vfs_context_is_dataless_manipulator(ctx) && (vp->v_type == VREG) && |
12978 | is_dataless_file(vp, ctx))) { |
12979 | goto out_unlock; |
12980 | } |
12981 | |
	if (!check_for_lease_conflict(vp, fl_type, ctx)) {
12983 | goto out_unlock; |
12984 | } |
12985 | |
12986 | now = mach_absolute_time(); |
12987 | |
12988 | LEASEDBG("break lease on vp %p type %d oflags 0x%x cur_time %llu" , |
12989 | vp, vp->v_type, oflags, now); |
12990 | |
12991 | /* |
	 * If we get to this point, then all lease(s) in place conflict and
	 * we need to send the lease break event to the lease holder(s).
12994 | * It is possible that a lease could have both downgrade and release events |
12995 | * pending triggered by multiple breakers trying to open the file in |
12996 | * different modes. Both events would have different lease break timers. |
12997 | * Consider the following case: |
12998 | * 1. Process A holds the write lease on file X. |
	 * 2. Process B opens the file X in read-only mode.
13000 | * This triggers downgrade lease event to Process A. |
13001 | * 3. While downgrade is pending, Process C opens the file X in read-write |
13002 | * mode. This triggers release lease event to Process A. |
13003 | */ |
13004 | LIST_FOREACH(fl, &vp->v_leases, fl_link) { |
13005 | if (fl_type == F_WRLCK) { |
13006 | /* File is opened for writing or truncate. */ |
13007 | if (fl->fl_flags & FL_FLAG_RELEASE_PENDING) { |
13008 | continue; |
13009 | } |
13010 | fl->fl_release_start = now; |
13011 | fl->fl_flags |= FL_FLAG_RELEASE_PENDING; |
13012 | send_lease_break_event(vp, NOTE_LEASE_RELEASE); |
13013 | } else { |
13014 | /* File is opened for reading. */ |
13015 | if (fl->fl_flags & FL_FLAG_DOWNGRADE_PENDING || |
13016 | fl->fl_flags & FL_FLAG_RELEASE_PENDING) { |
13017 | continue; |
13018 | } |
13019 | fl->fl_downgrade_start = now; |
13020 | fl->fl_flags |= FL_FLAG_DOWNGRADE_PENDING; |
13021 | send_lease_break_event(vp, NOTE_LEASE_DOWNGRADE); |
13022 | } |
13023 | } |
13024 | |
13025 | /* |
13026 | * If open is requested with O_NONBLOCK, then we can't block and wait for |
13027 | * the lease to be released/downgraded. Just bail out with EWOULDBLOCK. |
13028 | */ |
13029 | if (oflags & O_NONBLOCK) { |
13030 | error = EWOULDBLOCK; |
13031 | goto out; |
13032 | } |
13033 | |
	wait_for_lease_break(vp, fl_type, ctx);
13035 | |
13036 | out: |
13037 | LEASEDBG("break lease on vp %p oflags 0x%x, error %d" , vp, oflags, error); |
13038 | |
13039 | out_unlock: |
13040 | vnode_unlock(vp); |
13041 | |
13042 | return error; |
13043 | } |
13044 | |
13045 | /* |
13046 | * Get parent vnode by parent ID (only for file system that supports |
13047 | * MNTK_PATH_FROM_ID). |
13048 | * On success, the parent's vnode is returned with iocount held. |
13049 | */ |
13050 | static vnode_t |
13051 | vnode_getparent_byid(vnode_t vp) |
13052 | { |
13053 | struct vnode_attr va; |
13054 | vnode_t dvp = NULLVP; |
13055 | vfs_context_t ctx = vfs_context_current(); |
13056 | int error; |
13057 | |
13058 | if (!(vp->v_mount->mnt_kern_flag & MNTK_PATH_FROM_ID)) { |
13059 | goto out; |
13060 | } |
13061 | |
13062 | VATTR_INIT(&va); |
13063 | VATTR_WANTED(&va, va_parentid); |
13064 | |
13065 | /* Get the vnode's parent id from the file system. */ |
	error = vnode_getattr(vp, &va, ctx);
13067 | if (error || !VATTR_IS_SUPPORTED(&va, va_parentid)) { |
13068 | goto out; |
13069 | } |
13070 | |
13071 | /* |
13072 | * Ask the file system for the parent vnode. |
13073 | * We are ignoring the error here as we don't expect the parent vnode to be |
13074 | * populated on error. |
13075 | */ |
13076 | (void)VFS_VGET(vp->v_mount, (ino64_t)va.va_parentid, &dvp, ctx); |
13077 | |
13078 | out: |
13079 | return dvp; |
13080 | } |
13081 | |
13082 | /* |
13083 | * Break directory's lease. |
13084 | * If 'need_parent' is true, then parent is obtained via vnode_getparent() (or |
13085 | * vnode_getparent_byid()) on the provided 'vp'. |
13086 | */ |
13087 | void |
13088 | vnode_breakdirlease(vnode_t vp, bool need_parent, uint32_t oflags) |
13089 | { |
13090 | vnode_t dvp; |
13091 | |
13092 | if ((vnode_vtype(vp) != VREG && vnode_vtype(vp) != VDIR) || |
13093 | (vp == rootvnode)) { |
13094 | return; |
13095 | } |
13096 | |
13097 | /* |
13098 | * If parent is not provided, first try to get it from the name cache. |
	 * If that fails, we will attempt to ask the file system for the parent vnode.
13100 | * This is just a best effort as both attempts could still fail. |
13101 | */ |
13102 | if (need_parent) { |
13103 | dvp = vnode_getparent(vp); |
13104 | if (__improbable(dvp == NULLVP)) { |
13105 | dvp = vnode_getparent_byid(vp); |
13106 | } |
13107 | } else { |
13108 | dvp = vp; |
13109 | } |
13110 | |
13111 | if (__probable(dvp != NULLVP)) { |
13112 | /* Always break dir leases. */ |
		(void)vnode_breaklease(dvp, oflags, vfs_context_current());
13114 | } |
13115 | |
13116 | if (need_parent && (dvp != NULLVP)) { |
		vnode_put(dvp);
13118 | } |
13119 | } |
13120 | |
13121 | /* |
13122 | * Revoke all lease(s) in place for the file. |
13123 | * This is called when the vnode is reclaimed. |
13124 | */ |
13125 | void |
13126 | vnode_revokelease(vnode_t vp, bool locked) |
13127 | { |
13128 | file_lease_t fl, fl_tmp; |
13129 | bool need_wakeup = false; |
13130 | |
13131 | if ((vnode_vtype(vp) != VREG && vnode_vtype(vp) != VDIR)) { |
13132 | return; |
13133 | } |
13134 | |
13135 | if (!locked) { |
13136 | vnode_lock(vp); |
13137 | } |
13138 | |
13139 | LIST_FOREACH_SAFE(fl, &vp->v_leases, fl_link, fl_tmp) { |
13140 | LIST_REMOVE(fl, fl_link); |
13141 | file_lease_free(fl); |
13142 | need_wakeup = true; |
13143 | } |
13144 | |
13145 | /* Wakeup any lease breaker(s) that might be currently blocked. */ |
13146 | if (__improbable(need_wakeup)) { |
		wakeup(&vp->v_leases);
13148 | } |
13149 | |
13150 | if (!locked) { |
13151 | vnode_unlock(vp); |
13152 | } |
13153 | } |
13154 | |
13155 | #endif /* CONFIG_FILE_LEASES */ |
13156 | |