/*
 * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/mount_internal.h>
#include <sys/vnode_internal.h>
#include <miscfs/specfs/specdev.h>
#include <sys/namei.h>
#include <sys/errno.h>
#include <kern/kalloc.h>
#include <sys/kauth.h>
#include <sys/user.h>
#include <sys/paths.h>
#include <os/overflow.h>

#if CONFIG_MACF
#include <security/mac_framework.h>
#endif

/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  The cache is indexed by a hash value
 * obtained from (vp, name), where vp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry (i.e. for a name that is known NOT to
 * exist), the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 */
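
/*
 * Illustrative sketch (assuming the customary nc_dvp/nc_vp member names
 * of struct namecache, declared elsewhere): a positive entry maps
 * (directory, name) to a vnode, while a negative entry records the same
 * key with a NULL vnode:
 *
 *	ncp->nc_dvp = dvp;	parent directory holding "name"
 *	ncp->nc_vp  = vp;	NULL => name is known NOT to exist
 */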

/*
 * Structures associated with name caching.
 */

ZONE_DEFINE_TYPE(namecache_zone, "namecache", struct namecache, ZC_NONE);

struct smrq_list_head *nchashtbl;       /* Hash Table */
u_long nchashmask;
u_long nchash;                          /* size of hash table - 1 */
long numcache;                          /* number of cache entries allocated */
int desiredNodes;
int desiredNegNodes;
int ncs_negtotal;
TUNABLE_WRITEABLE(int, nc_disabled, "-novfscache", 0);
__options_decl(nc_smr_level_t, uint32_t, {
	NC_SMR_DISABLED = 0,
	NC_SMR_LOOKUP = 1
});
TUNABLE(nc_smr_level_t, nc_smr_enabled, "ncsmr", NC_SMR_LOOKUP);
TAILQ_HEAD(, namecache) nchead;         /* chain of all name cache entries */
TAILQ_HEAD(, namecache) neghead;        /* chain of only negative cache entries */


#if COLLECT_STATS

struct nchstats nchstats;               /* cache effectiveness statistics */

#define NCHSTAT(v) { \
	nchstats.v++; \
}
#define NAME_CACHE_LOCK_SHARED() name_cache_lock()
#define NAME_CACHE_LOCK_SHARED_TO_EXCLUSIVE() TRUE

#else

#define NCHSTAT(v)
#define NAME_CACHE_LOCK_SHARED() name_cache_lock_shared()
#define NAME_CACHE_LOCK_SHARED_TO_EXCLUSIVE() name_cache_lock_shared_to_exclusive()

#endif

#define NAME_CACHE_LOCK() name_cache_lock()
#define NAME_CACHE_UNLOCK() name_cache_unlock()

/* vars for name cache list lock */
static LCK_GRP_DECLARE(namecache_lck_grp, "Name Cache");
static LCK_RW_DECLARE(namecache_rw_lock, &namecache_lck_grp);

typedef struct string_t {
	LIST_ENTRY(string_t) hash_chain;
	char *str;
	uint32_t strbuflen;
	uint32_t refcount;
} string_t;

ZONE_DEFINE_TYPE(stringcache_zone, "vfsstringcache", string_t, ZC_NONE);

static LCK_GRP_DECLARE(strcache_lck_grp, "String Cache");
static LCK_ATTR_DECLARE(strcache_lck_attr, 0, 0);
LCK_RW_DECLARE_ATTR(strtable_rw_lock, &strcache_lck_grp, &strcache_lck_attr);

static LCK_GRP_DECLARE(rootvnode_lck_grp, "rootvnode");
LCK_RW_DECLARE(rootvnode_rw_lock, &rootvnode_lck_grp);

#define NUM_STRCACHE_LOCKS 1024

lck_mtx_t strcache_mtx_locks[NUM_STRCACHE_LOCKS];

SYSCTL_NODE(_vfs, OID_AUTO, ncstats, CTLFLAG_RD | CTLFLAG_LOCKED, NULL, "vfs name cache stats");

SYSCTL_COMPAT_INT(_vfs_ncstats, OID_AUTO, nc_smr_enabled,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &nc_smr_enabled, 0, "");

#if COLLECT_NC_SMR_STATS
struct ncstats {
	uint64_t cl_smr_hits;
	uint64_t cl_smr_miss;
	uint64_t cl_smr_negative_hits;
	uint64_t cl_smr_fallback;
	uint64_t cl_lock_hits;
	uint64_t clp_next;
	uint64_t clp_next_fail;
	uint64_t clp_smr_next;
	uint64_t clp_smr_next_fail;
	uint64_t clp_smr_fallback;
	uint64_t nc_lock_shared;
	uint64_t nc_lock;
} ncstats = {0};

SYSCTL_LONG(_vfs_ncstats, OID_AUTO, cl_smr_hits,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &ncstats.cl_smr_hits, "");
SYSCTL_LONG(_vfs_ncstats, OID_AUTO, cl_smr_misses,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &ncstats.cl_smr_miss, "");
SYSCTL_LONG(_vfs_ncstats, OID_AUTO, cl_smr_negative_hits,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &ncstats.cl_smr_negative_hits, "");
SYSCTL_LONG(_vfs_ncstats, OID_AUTO, cl_smr_fallback,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &ncstats.cl_smr_fallback, "");
SYSCTL_LONG(_vfs_ncstats, OID_AUTO, cl_lock_hits,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &ncstats.cl_lock_hits, "");
SYSCTL_LONG(_vfs_ncstats, OID_AUTO, clp_next,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &ncstats.clp_next, "");
SYSCTL_LONG(_vfs_ncstats, OID_AUTO, clp_next_fail,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &ncstats.clp_next_fail, "");
SYSCTL_LONG(_vfs_ncstats, OID_AUTO, clp_smr_next,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &ncstats.clp_smr_next, "");
SYSCTL_LONG(_vfs_ncstats, OID_AUTO, clp_smr_next_fail,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &ncstats.clp_smr_next_fail, "");
SYSCTL_LONG(_vfs_ncstats, OID_AUTO, nc_lock_shared,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &ncstats.nc_lock_shared, "");
SYSCTL_LONG(_vfs_ncstats, OID_AUTO, nc_lock,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &ncstats.nc_lock, "");

#define NC_SMR_STATS(v) os_atomic_inc(&ncstats.v, relaxed)
#else
#define NC_SMR_STATS(v)
#endif /* COLLECT_NC_SMR_STATS */

static vnode_t cache_lookup_locked(vnode_t dvp, struct componentname *cnp, uint32_t *vidp);
static vnode_t cache_lookup_smr(vnode_t dvp, struct componentname *cnp, uint32_t *vidp);
static const char *add_name_internal(const char *, uint32_t, u_int, boolean_t, u_int);
static void init_string_table(void);
static void cache_delete(struct namecache *, int);
static void cache_enter_locked(vnode_t dvp, vnode_t vp, struct componentname *cnp, const char *strname);
static void cache_purge_locked(vnode_t vp, kauth_cred_t *credp);
static void namecache_smr_free(void *, size_t);
static void string_smr_free(void *, size_t);


#ifdef DUMP_STRING_TABLE
/*
 * Internal dump function used for debugging
 */
void dump_string_table(void);
#endif /* DUMP_STRING_TABLE */

static void init_crc32(void);
static unsigned int crc32tab[256];


#define NCHHASH(dvp, hash_val) \
	(&nchashtbl[(dvp->v_id ^ (hash_val)) & nchashmask])
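
/*
 * Illustrative sketch: a lookup of "foo" under directory dvp first hashes
 * the component name (see the crc32 loop in cache_lookup_path below, which
 * forces a non-zero value) and then selects its bucket via NCHHASH:
 *
 *	struct smrq_list_head *bucket = NCHHASH(dvp, hash_val);
 *
 * Mixing dvp->v_id into the index keeps identical names that live in
 * different directories from piling onto the same hash chain.
 */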

/*
 * This function tries to check if a directory vp is a subdirectory of dvp
 * only from valid v_parent pointers.  It is called with the name cache lock
 * held and does not drop the lock at any time inside the function.
 *
 * It returns a boolean that indicates whether or not it was able to
 * successfully infer the parent/descendant relationship via the v_parent
 * pointers, or if it could not infer such relationship and the decision
 * must be delegated to the owning filesystem.
 *
 * If it does not defer the decision, i.e. it was successfully able to determine
 * the parent/descendant relationship, *is_subdir tells the caller if vp is a
 * subdirectory of dvp.
 *
 * If the decision is deferred, *next_vp is where it stopped i.e. *next_vp
 * is the vnode whose parent is to be determined from the filesystem.
 * *is_subdir, in this case, is not indicative of anything and should be
 * ignored.
 *
 * The return value and output args should be used as follows :
 *
 * defer = cache_check_vnode_issubdir(vp, dvp, is_subdir, next_vp);
 * if (!defer) {
 *	if (*is_subdir)
 *		vp is subdirectory;
 *	else
 *		vp is not a subdirectory;
 * } else {
 *	if (*next_vp)
 *		check this vnode's parent from the filesystem
 *	else
 *		error (likely because of forced unmount).
 * }
 *
 */
static boolean_t
cache_check_vnode_issubdir(vnode_t vp, vnode_t dvp, boolean_t *is_subdir,
    vnode_t *next_vp)
{
	vnode_t tvp = vp;
	int defer = FALSE;

	*is_subdir = FALSE;
	*next_vp = NULLVP;
	while (1) {
		mount_t tmp;

		if (tvp == dvp) {
			*is_subdir = TRUE;
			break;
		} else if (tvp == rootvnode) {
			/* *is_subdir = FALSE */
			break;
		}

		tmp = tvp->v_mount;
		while ((tvp->v_flag & VROOT) && tmp && tmp->mnt_vnodecovered &&
		    tvp != dvp && tvp != rootvnode) {
			tvp = tmp->mnt_vnodecovered;
			tmp = tvp->v_mount;
		}

		/*
		 * If dvp is not at the top of a mount "stack" then
		 * vp is not a subdirectory of dvp either.
		 */
		if (tvp == dvp || tvp == rootvnode) {
			/* *is_subdir = FALSE */
			break;
		}

		if (!tmp) {
			defer = TRUE;
			*next_vp = NULLVP;
			break;
		}

		if ((tvp->v_flag & VISHARDLINK) || !(tvp->v_parent)) {
			defer = TRUE;
			*next_vp = tvp;
			break;
		}

		tvp = tvp->v_parent;
	}

	return defer;
}

/* maximum times to retry from potentially transient errors in vnode_issubdir */
#define MAX_ERROR_RETRY 3

/*
 * This function checks if a given directory (vp) is a subdirectory of dvp.
 * It walks backwards from vp and if it hits dvp in its parent chain,
 * it is a subdirectory.  If it encounters the root directory, it is not
 * a subdirectory.
 *
 * This function returns an error if it is unsuccessful and 0 on success.
 *
 * On entry (and exit) vp has an iocount and if this function has to take
 * any iocounts on other vnodes in the parent chain traversal, it releases them.
 */
int
vnode_issubdir(vnode_t vp, vnode_t dvp, int *is_subdir, vfs_context_t ctx)
{
	vnode_t start_vp, tvp;
	vnode_t vp_with_iocount;
	int error = 0;
	char dotdotbuf[] = "..";
	int error_retry_count = 0; /* retry count for potentially transient
	                            * errors */

	*is_subdir = FALSE;
	tvp = start_vp = vp;
	/*
	 * Anytime we acquire an iocount in this function, we save the vnode
	 * in this variable and release it before exiting.
	 */
	vp_with_iocount = NULLVP;

	while (1) {
		boolean_t defer;
		vnode_t pvp;
		uint32_t vid = 0;
		struct componentname cn;
		boolean_t is_subdir_locked = FALSE;

		if (tvp == dvp) {
			*is_subdir = TRUE;
			break;
		} else if (tvp == rootvnode) {
			/* *is_subdir = FALSE */
			break;
		}

		NAME_CACHE_LOCK_SHARED();

		defer = cache_check_vnode_issubdir(tvp, dvp, &is_subdir_locked,
		    &tvp);

		if (defer && tvp) {
			vid = vnode_vid(tvp);
			vnode_hold(tvp);
		}

		NAME_CACHE_UNLOCK();

		if (!defer) {
			*is_subdir = is_subdir_locked;
			break;
		}

		if (!tvp) {
			if (error_retry_count++ < MAX_ERROR_RETRY) {
				tvp = vp;
				continue;
			}
			error = ENOENT;
			break;
		}

		if (tvp != start_vp) {
			if (vp_with_iocount) {
				vnode_put(vp_with_iocount);
				vp_with_iocount = NULLVP;
			}

			error = vnode_getwithvid(tvp, vid);
			vnode_drop(tvp);
			if (error) {
				if (error_retry_count++ < MAX_ERROR_RETRY) {
					tvp = vp;
					error = 0;
					continue;
				}
				break;
			}
			vp_with_iocount = tvp;
		} else {
			tvp = vnode_drop(tvp);
		}

		bzero(&cn, sizeof(cn));
		cn.cn_nameiop = LOOKUP;
		cn.cn_flags = ISLASTCN | ISDOTDOT;
		cn.cn_context = ctx;
		cn.cn_pnbuf = &dotdotbuf[0];
		cn.cn_pnlen = sizeof(dotdotbuf);
		cn.cn_nameptr = cn.cn_pnbuf;
		cn.cn_namelen = 2;

		pvp = NULLVP;
		if ((error = VNOP_LOOKUP(tvp, &pvp, &cn, ctx))) {
			break;
		}

		if (!(tvp->v_flag & VISHARDLINK) && tvp->v_parent != pvp) {
			(void)vnode_update_identity(tvp, pvp, NULL, 0, 0,
			    VNODE_UPDATE_PARENT);
		}

		if (vp_with_iocount) {
			vnode_put(vp_with_iocount);
		}

		vp_with_iocount = tvp = pvp;
	}

	if (vp_with_iocount) {
		vnode_put(vp_with_iocount);
	}

	return error;
}
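
/*
 * Hypothetical usage sketch (vnode names invented for illustration):
 * a rename must not move a directory into its own subtree, so a caller
 * holding iocounts on both vnodes might do:
 *
 *	int is_subdir = 0;
 *
 *	error = vnode_issubdir(tdvp, fvp, &is_subdir, ctx);
 *	if (error == 0 && is_subdir)
 *		error = EINVAL;		(the move would create a cycle)
 */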

/*
 * This function builds the path in "buff" from the supplied vnode.
 * The length of the buffer *INCLUDING* the trailing zero byte is
 * returned in outlen.  NOTE: the length includes the trailing zero
 * byte and thus the length is one greater than what strlen would
 * return.  This is important and lots of code elsewhere in the kernel
 * assumes this behavior.
 *
 * This function can call vnop in file system if the parent vnode
 * does not exist or when called for hardlinks via volfs path.
 * If BUILDPATH_NO_FS_ENTER is set in flags, it only uses values present
 * in the name cache and does not enter the file system.
 *
 * If BUILDPATH_CHECK_MOVED is set in flags, we return EAGAIN when
 * we encounter ENOENT during path reconstruction.  ENOENT means that
 * one of the parents moved while we were building the path.  The
 * caller can special handle this case by calling build_path again.
 *
 * If BUILDPATH_VOLUME_RELATIVE is set in flags, we return a path
 * that is relative to the nearest mount point, i.e. do not
 * cross over mount points during building the path.
 *
 * The passed-in vp must have a valid io_count reference.
 *
 * If the parent vnode is non-NULL it also must have an io count.  This
 * allows build_path_with_parent to be safely called for operations
 * unlink, rmdir and rename that already have io counts on the target
 * and the directory.  In this way build_path_with_parent does not have
 * to try and obtain an additional io count on the parent.  Taking an
 * io count on the parent can lead to deadlock if a forced unmount
 * occurs at the right moment.  For a fuller explanation of how this
 * can occur see the comment for vn_getpath_with_parent.
 *
 */
int
build_path_with_parent(vnode_t first_vp, vnode_t parent_vp, char *buff, int buflen,
    int *outlen, size_t *mntpt_outlen, int flags, vfs_context_t ctx)
{
	vnode_t vp, tvp;
	vnode_t vp_with_iocount;
	vnode_t proc_root_dir_vp;
	char *end;
	char *mntpt_end;
	const char *str;
	unsigned int len;
	int ret = 0;
	int fixhardlink;

	if (first_vp == NULLVP) {
		return EINVAL;
	}

	if (buflen <= 1) {
		return ENOSPC;
	}

	/*
	 * Grab the process fd so we can evaluate fd_rdir.
	 */
	if (!(flags & BUILDPATH_NO_PROCROOT)) {
		proc_root_dir_vp = vfs_context_proc(ctx)->p_fd.fd_rdir;
	} else {
		proc_root_dir_vp = NULL;
	}

	vp_with_iocount = NULLVP;
again:
	vp = first_vp;

	end = &buff[buflen - 1];
	*end = '\0';
	mntpt_end = NULL;

	/*
	 * Catch a special corner case here: chroot to /full/path/to/dir, chdir to
	 * it, then open it.  Without this check, the path to it will be
	 * /full/path/to/dir instead of "/".
	 */
	if (proc_root_dir_vp == first_vp) {
		*--end = '/';
		goto out;
	}

	/*
	 * holding the NAME_CACHE_LOCK in shared mode is
	 * sufficient to stabilize both the vp->v_parent chain
	 * and the 'vp->v_mount->mnt_vnodecovered' chain
	 *
	 * if we need to drop this lock, we must first grab the v_id
	 * from the vnode we're currently working with... if that
	 * vnode doesn't already have an io_count reference (the vp
	 * passed in comes with one), we must grab a reference
	 * after we drop the NAME_CACHE_LOCK via vnode_getwithvid...
	 * deadlocks may result if you call vnode_get while holding
	 * the NAME_CACHE_LOCK... we lazily release the reference
	 * we pick up the next time we encounter a need to drop
	 * the NAME_CACHE_LOCK or before we return from this routine
	 */
	NAME_CACHE_LOCK_SHARED();

#if CONFIG_FIRMLINKS
	if (!(flags & BUILDPATH_NO_FIRMLINK) &&
	    (vp->v_flag & VFMLINKTARGET) && vp->v_fmlink && (vp->v_fmlink->v_type == VDIR)) {
		vp = vp->v_fmlink;
	}
#endif

	/*
	 * Check if this is the root of a file system.
	 */
	while (vp && vp->v_flag & VROOT) {
		if (vp->v_mount == NULL) {
			ret = EINVAL;
			goto out_unlock;
		}
		if ((vp->v_mount->mnt_flag & MNT_ROOTFS) || (vp == proc_root_dir_vp)) {
			/*
			 * It's the root of the root file system, so it's
			 * just "/".
			 */
			*--end = '/';

			goto out_unlock;
		} else {
			/*
			 * This is the root of the volume and the caller does not
			 * want to cross mount points.  Therefore just return
			 * '/' as the relative path.
			 */
#if CONFIG_FIRMLINKS
			if (!(flags & BUILDPATH_NO_FIRMLINK) &&
			    (vp->v_flag & VFMLINKTARGET) && vp->v_fmlink && (vp->v_fmlink->v_type == VDIR)) {
				vp = vp->v_fmlink;
			} else
#endif
			if (flags & BUILDPATH_VOLUME_RELATIVE) {
				*--end = '/';
				goto out_unlock;
			} else {
				vp = vp->v_mount->mnt_vnodecovered;
				if (!mntpt_end && vp) {
					mntpt_end = end;
				}
			}
		}
	}

	while ((vp != NULLVP) && (vp->v_parent != vp)) {
		int vid;

		/*
		 * For hardlinks the v_name may be stale, so if it's OK
		 * to enter a file system, ask the file system for the
		 * name and parent (below).
		 */
		fixhardlink = (vp->v_flag & VISHARDLINK) &&
		    (vp->v_mount->mnt_kern_flag & MNTK_PATH_FROM_ID) &&
		    !(flags & BUILDPATH_NO_FS_ENTER);

		if (!fixhardlink) {
			str = vp->v_name;

			if (str == NULL || *str == '\0') {
				if (vp->v_parent != NULL) {
					ret = EINVAL;
				} else {
					ret = ENOENT;
				}
				goto out_unlock;
			}
			len = (unsigned int)strlen(str);
			/*
			 * Check that there's enough space (including space for the '/')
			 */
			if ((unsigned int)(end - buff) < (len + 1)) {
				ret = ENOSPC;
				goto out_unlock;
			}
			/*
			 * Copy the name backwards.
			 */
			str += len;

			for (; len > 0; len--) {
				*--end = *--str;
			}
			/*
			 * Add a path separator.
			 */
			*--end = '/';
		}

		/*
		 * Walk up the parent chain.
		 */
		if (((vp->v_parent != NULLVP) && !fixhardlink) ||
		    (flags & BUILDPATH_NO_FS_ENTER)) {
			/*
			 * In this if () block we are not allowed to enter the filesystem
			 * to conclusively get the most accurate parent identifier.
			 * As a result, if 'vp' does not identify '/' and it
			 * does not have a valid v_parent, then error out
			 * and disallow further path construction
			 */
			if ((vp->v_parent == NULLVP) && (rootvnode != vp)) {
				/*
				 * Only '/' is allowed to have a NULL parent
				 * pointer.  Upper level callers should ideally
				 * re-drive name lookup on receiving ENOENT.
				 */
				ret = ENOENT;

				/* The code below will exit early if 'tvp = vp' == NULL */
			}
			vp = vp->v_parent;

			/*
			 * if the vnode we have in hand isn't a directory and it
			 * has a v_parent, then we started with the resource fork
			 * so skip up to avoid getting a duplicate copy of the
			 * file name in the path.
			 */
			if (vp && !vnode_isdir(vp) && vp->v_parent) {
				vp = vp->v_parent;
			}
		} else {
			/*
			 * No parent, go get it if supported.
			 */
			struct vnode_attr va;
			vnode_t dvp;

			/*
			 * Make sure the file system supports obtaining a path from id.
			 */
			if (!(vp->v_mount->mnt_kern_flag & MNTK_PATH_FROM_ID)) {
				ret = ENOENT;
				goto out_unlock;
			}
			vid = vp->v_id;

			vnode_hold(vp);
			NAME_CACHE_UNLOCK();

			if (vp != first_vp && vp != parent_vp && vp != vp_with_iocount) {
				if (vp_with_iocount) {
					vnode_put(vp_with_iocount);
					vp_with_iocount = NULLVP;
				}
				if (vnode_getwithvid(vp, vid)) {
					vnode_drop(vp);
					goto again;
				}
				vp_with_iocount = vp;
			}

			vnode_drop(vp);

			VATTR_INIT(&va);
			VATTR_WANTED(&va, va_parentid);

			if (fixhardlink) {
				VATTR_WANTED(&va, va_name);
				va.va_name = zalloc(ZV_NAMEI);
			} else {
				va.va_name = NULL;
			}
			/*
			 * Ask the file system for its parent id and for its name (optional).
			 */
			ret = vnode_getattr(vp, &va, ctx);

			if (fixhardlink) {
				if ((ret == 0) && (VATTR_IS_SUPPORTED(&va, va_name))) {
					str = va.va_name;
					vnode_update_identity(vp, NULL, str,
					    (unsigned int)strlen(str), 0, VNODE_UPDATE_NAME);
				} else if (vp->v_name) {
					str = vp->v_name;
					ret = 0;
				} else {
					ret = ENOENT;
					goto bad_news;
				}
				len = (unsigned int)strlen(str);

				/*
				 * Check that there's enough space.
				 */
				if ((unsigned int)(end - buff) < (len + 1)) {
					ret = ENOSPC;
				} else {
					/* Copy the name backwards. */
					str += len;

					for (; len > 0; len--) {
						*--end = *--str;
					}
					/*
					 * Add a path separator.
					 */
					*--end = '/';
				}
bad_news:
				zfree(ZV_NAMEI, va.va_name);
			}
			if (ret || !VATTR_IS_SUPPORTED(&va, va_parentid)) {
				ret = ENOENT;
				goto out;
			}
			/*
			 * Ask the file system for the parent vnode.
			 */
			if ((ret = VFS_VGET(vp->v_mount, (ino64_t)va.va_parentid, &dvp, ctx))) {
				goto out;
			}

			if (!fixhardlink && (vp->v_parent != dvp)) {
				vnode_update_identity(vp, dvp, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

			if (vp_with_iocount) {
				vnode_put(vp_with_iocount);
			}
			vp = dvp;
			vp_with_iocount = vp;

			NAME_CACHE_LOCK_SHARED();

			/*
			 * if the vnode we have in hand isn't a directory and it
			 * has a v_parent, then we started with the resource fork
			 * so skip up to avoid getting a duplicate copy of the
			 * file name in the path.
			 */
			if (vp && !vnode_isdir(vp) && vp->v_parent) {
				vp = vp->v_parent;
			}
		}

		if (vp && (flags & BUILDPATH_CHECKACCESS)) {
			vid = vp->v_id;

			vnode_hold(vp);
			NAME_CACHE_UNLOCK();

			if (vp != first_vp && vp != parent_vp && vp != vp_with_iocount) {
				if (vp_with_iocount) {
					vnode_put(vp_with_iocount);
					vp_with_iocount = NULLVP;
				}
				if (vnode_getwithvid(vp, vid)) {
					vnode_drop(vp);
					goto again;
				}
				vp_with_iocount = vp;
			}
			vnode_drop(vp);

			if ((ret = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx))) {
				goto out;       /* no peeking */
			}
			NAME_CACHE_LOCK_SHARED();
		}

		/*
		 * When a mount point is crossed switch the vp.
		 * Continue until we find the root or we find
		 * a vnode that's not the root of a mounted
		 * file system.
		 */
		tvp = vp;

		while (tvp) {
			if (tvp == proc_root_dir_vp) {
				goto out_unlock;        /* encountered the root */
			}

#if CONFIG_FIRMLINKS
			if (!(flags & BUILDPATH_NO_FIRMLINK) &&
			    (tvp->v_flag & VFMLINKTARGET) && tvp->v_fmlink && (tvp->v_fmlink->v_type == VDIR)) {
				tvp = tvp->v_fmlink;
				break;
			}
#endif

			if (!(tvp->v_flag & VROOT) || !tvp->v_mount) {
				break;                  /* not the root of a mounted FS */
			}
			if (flags & BUILDPATH_VOLUME_RELATIVE) {
				/* Do not cross over mount points */
				tvp = NULL;
			} else {
				tvp = tvp->v_mount->mnt_vnodecovered;
				if (!mntpt_end && tvp) {
					mntpt_end = end;
				}
			}
		}
		if (tvp == NULLVP) {
			goto out_unlock;
		}
		vp = tvp;
	}
out_unlock:
	NAME_CACHE_UNLOCK();
out:
	if (vp_with_iocount) {
		vnode_put(vp_with_iocount);
	}
	/*
	 * Slide the name down to the beginning of the buffer.
	 */
	memmove(buff, end, &buff[buflen] - end);

	/*
	 * length includes the trailing zero byte
	 */
	*outlen = (int)(&buff[buflen] - end);
	if (mntpt_outlen && mntpt_end) {
		*mntpt_outlen = (size_t)*outlen - (size_t)(&buff[buflen] - mntpt_end);
	}

	/*
	 * One of the parents was moved during path reconstruction.
	 * The caller is interested in knowing whether any of the
	 * parents moved via BUILDPATH_CHECK_MOVED, so return EAGAIN.
	 */
	if ((ret == ENOENT) && (flags & BUILDPATH_CHECK_MOVED)) {
		ret = EAGAIN;
	}

	return ret;
}

int
build_path(vnode_t first_vp, char *buff, int buflen, int *outlen, int flags, vfs_context_t ctx)
{
	return build_path_with_parent(first_vp, NULL, buff, buflen, outlen, NULL, flags, ctx);
}
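
/*
 * Hypothetical usage sketch: per the convention documented above
 * build_path_with_parent, *outlen counts the trailing NUL byte:
 *
 *	char path[MAXPATHLEN];
 *	int len = 0;
 *
 *	if (build_path(vp, path, MAXPATHLEN, &len, 0, ctx) == 0)
 *		len == (int)strlen(path) + 1 holds here
 */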

/*
 * Combined version of vnode_getparent() and vnode_getname() to acquire both the
 * vnode name and parent without releasing the name cache lock in the interim.
 */
void
vnode_getparent_and_name(vnode_t vp, vnode_t *out_pvp, const char **out_name)
{
	vnode_t pvp = NULLVP;
	int locked = 0;
	int pvid;

	NAME_CACHE_LOCK_SHARED();
	locked = 1;

	if (out_name) {
		const char *name = NULL;
		if (vp->v_name) {
			name = vfs_addname(vp->v_name, (unsigned int)strlen(vp->v_name), 0, 0);
		}
		*out_name = name;
	}

	if (!out_pvp) {
		goto out;
	}

	pvp = vp->v_parent;

	/*
	 * v_parent is stable behind the name_cache lock
	 * however, the only thing we can really guarantee
	 * is that we've grabbed a valid iocount on the
	 * parent of 'vp' at the time we took the name_cache lock...
	 * once we drop the lock, vp could get re-parented
	 */
	if (pvp != NULLVP) {
		pvid = pvp->v_id;

		vnode_hold(pvp);
		NAME_CACHE_UNLOCK();
		locked = 0;

		if (vnode_getwithvid(pvp, pvid) != 0) {
			vnode_drop(pvp);
			pvp = NULL;
		} else {
			vnode_drop(pvp);
		}
	}
	*out_pvp = pvp;

out:
	if (locked) {
		NAME_CACHE_UNLOCK();
	}
}

/*
 * return NULLVP if vp's parent doesn't
 * exist, or we can't get a valid iocount
 * else return the parent of vp
 */
vnode_t
vnode_getparent(vnode_t vp)
{
	vnode_t pvp = NULLVP;
	vnode_getparent_and_name(vp, &pvp, NULL);

	return pvp;
}

/*
 * Similar to vnode_getparent() but only returns the parent vnode (with
 * iocount held) if the actual parent vnode is different than the given 'pvp'.
 */
__private_extern__ vnode_t
vnode_getparent_if_different(vnode_t vp, vnode_t pvp)
{
	vnode_t real_pvp = NULLVP;
	int pvid;

	if (vp->v_parent == pvp) {
		goto out;
	}

	NAME_CACHE_LOCK_SHARED();

	real_pvp = vp->v_parent;
	if (real_pvp == NULLVP) {
		NAME_CACHE_UNLOCK();
		goto out;
	}

	/*
	 * Do the check again after the namecache lock is acquired as the
	 * parent vnode could have changed.
	 */
	if (real_pvp != pvp) {
		pvid = real_pvp->v_id;

		vnode_hold(real_pvp);
		NAME_CACHE_UNLOCK();

		if (vnode_getwithvid(real_pvp, pvid) != 0) {
			vnode_drop(real_pvp);
			real_pvp = NULLVP;
		} else {
			vnode_drop(real_pvp);
		}
	} else {
		real_pvp = NULLVP;
		NAME_CACHE_UNLOCK();
	}

out:
	return real_pvp;
}

const char *
vnode_getname(vnode_t vp)
{
	const char *name = NULL;
	vnode_getparent_and_name(vp, NULL, &name);

	return name;
}

void
vnode_putname(const char *name)
{
	vfs_removename(name);
}
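
/*
 * Illustrative pairing: a name returned by vnode_getname() holds a
 * string-cache reference (via vfs_addname) and must be released with
 * vnode_putname():
 *
 *	const char *name = vnode_getname(vp);
 *
 *	if (name != NULL) {
 *		... use name ...
 *		vnode_putname(name);
 *	}
 */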

static const char unknown_vnodename[] = "(unknown vnode name)";

const char *
vnode_getname_printable(vnode_t vp)
{
	const char *name = vnode_getname(vp);
	if (name != NULL) {
		return name;
	}

	switch (vp->v_type) {
	case VCHR:
	case VBLK:
	{
		/*
		 * Create an artificial dev name from
		 * major and minor device number
		 */
		char dev_name[64];
		(void) snprintf(dev_name, sizeof(dev_name),
		    "%c(%u, %u)", VCHR == vp->v_type ? 'c' : 'b',
		    major(vp->v_rdev), minor(vp->v_rdev));
		/*
		 * Add the newly created dev name to the name
		 * cache to allow easier cleanup.  Also,
		 * vfs_addname allocates memory for the new name
		 * and returns it.
		 */
		NAME_CACHE_LOCK_SHARED();
		name = vfs_addname(dev_name, (unsigned int)strlen(dev_name), 0, 0);
		NAME_CACHE_UNLOCK();
		return name;
	}
	default:
		return unknown_vnodename;
	}
}

void
vnode_putname_printable(const char *name)
{
	if (name == unknown_vnodename) {
		return;
	}
	vnode_putname(name);
}


/*
 * if VNODE_UPDATE_PARENT, and we can take
 * a reference on dvp, then update vp with
 * its new parent... if vp already has a parent,
 * then drop the reference vp held on it
 *
 * if VNODE_UPDATE_NAME,
 * then drop string ref on v_name if it exists, and if name is non-NULL
 * then pick up a string reference on name and record it in v_name...
 * optionally pass in the length and hashval of name if known
 *
 * if VNODE_UPDATE_CACHE, flush the name cache entries associated with vp
 */
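/*
 * Hypothetical example: after a rename, a filesystem can refresh both the
 * name and the parent in one call; passing 0 for the length and hash lets
 * this routine compute them:
 *
 *	vnode_update_identity(vp, new_dvp, new_name, 0, 0,
 *	    VNODE_UPDATE_NAME | VNODE_UPDATE_PARENT);
 */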
void
vnode_update_identity(vnode_t vp, vnode_t dvp, const char *name, int name_len, uint32_t name_hashval, int flags)
{
	struct namecache *ncp;
	vnode_t old_parentvp = NULLVP;
	int isstream = (vp->v_flag & VISNAMEDSTREAM);
	int kusecountbumped = 0;
	kauth_cred_t tcred = NULL;
	const char *vname = NULL;
	const char *tname = NULL;

	if (name_len < 0) {
		return;
	}

	if (flags & VNODE_UPDATE_PARENT) {
		if (dvp && vnode_ref(dvp) != 0) {
			dvp = NULLVP;
		}
		/* Don't count a stream's parent ref during unmounts */
		if (isstream && dvp && (dvp != vp) && (dvp != vp->v_parent) && (dvp->v_type == VREG)) {
			vnode_lock_spin(dvp);
			++dvp->v_kusecount;
			kusecountbumped = 1;
			vnode_unlock(dvp);
		}
	} else {
		dvp = NULLVP;
	}
	if ((flags & VNODE_UPDATE_NAME)) {
		if (name != vp->v_name) {
			if (name && *name) {
				if (name_len == 0) {
					name_len = (int)strlen(name);
				}
				tname = vfs_addname(name, name_len, name_hashval, 0);
			}
		} else {
			flags &= ~VNODE_UPDATE_NAME;
		}
	}
	if ((flags & (VNODE_UPDATE_PURGE | VNODE_UPDATE_PARENT | VNODE_UPDATE_CACHE | VNODE_UPDATE_NAME | VNODE_UPDATE_PURGEFIRMLINK))) {
		NAME_CACHE_LOCK();

#if CONFIG_FIRMLINKS
		if (flags & VNODE_UPDATE_PURGEFIRMLINK) {
			vnode_t old_fvp = vp->v_fmlink;
			if (old_fvp) {
				vnode_lock_spin(vp);
				vp->v_flag &= ~VFMLINKTARGET;
				vp->v_fmlink = NULLVP;
				vnode_unlock(vp);
				NAME_CACHE_UNLOCK();

				/*
				 * vnode_rele can result in a cascading series of
				 * usecount releases.  The combination of calling
				 * vnode_recycle and dont_reenter (3rd arg to
				 * vnode_rele_internal) ensures we don't have
				 * that issue.
				 */
				vnode_recycle(old_fvp);
				vnode_rele_internal(old_fvp, O_EVTONLY, 1, 0);

				NAME_CACHE_LOCK();
			}
		}
#endif

		if ((flags & VNODE_UPDATE_PURGE)) {
			if (vp->v_parent) {
				vp->v_parent->v_nc_generation++;
			}

			while ((ncp = LIST_FIRST(&vp->v_nclinks))) {
				cache_delete(ncp, 1);
			}

			while ((ncp = TAILQ_FIRST(&vp->v_ncchildren))) {
				cache_delete(ncp, 1);
			}

			/*
			 * Use a temp variable to avoid kauth_cred_drop() while NAME_CACHE_LOCK is held
			 */
			tcred = vnode_cred(vp);
			vp->v_cred = NOCRED;
			vp->v_authorized_actions = 0;
			vp->v_cred_timestamp = 0;
		}
		if ((flags & VNODE_UPDATE_NAME)) {
			vname = vp->v_name;
			vp->v_name = tname;
		}
		if (flags & VNODE_UPDATE_PARENT) {
			if (dvp != vp && dvp != vp->v_parent) {
				old_parentvp = vp->v_parent;
				vp->v_parent = dvp;
				dvp = NULLVP;

				if (old_parentvp) {
					flags |= VNODE_UPDATE_CACHE;
				}
			}
		}
		if (flags & VNODE_UPDATE_CACHE) {
			while ((ncp = LIST_FIRST(&vp->v_nclinks))) {
				cache_delete(ncp, 1);
			}
		}
		NAME_CACHE_UNLOCK();

		if (vname != NULL) {
			vfs_removename(vname);
		}

		if (IS_VALID_CRED(tcred)) {
			kauth_cred_unref(&tcred);
		}
	}
	if (dvp != NULLVP) {
		/* Back out the ref we took if we lost a race for vp->v_parent. */
		if (kusecountbumped) {
			vnode_lock_spin(dvp);
			if (dvp->v_kusecount > 0) {
				--dvp->v_kusecount;
			}
			vnode_unlock(dvp);
		}
		vnode_rele(dvp);
	}
	if (old_parentvp) {
		struct uthread *ut;
		vnode_t vreclaims = NULLVP;

		if (isstream) {
			vnode_lock_spin(old_parentvp);
			if ((old_parentvp->v_type != VDIR) && (old_parentvp->v_kusecount > 0)) {
				--old_parentvp->v_kusecount;
			}
			vnode_unlock(old_parentvp);
		}
		ut = current_uthread();

		/*
		 * indicate to vnode_rele that it shouldn't do a
		 * vnode_reclaim at this time... instead it will
		 * chain the vnode to the uu_vreclaims list...
		 * we'll be responsible for calling vnode_reclaim
		 * on each of the vnodes in this list...
		 */
		ut->uu_defer_reclaims = 1;
		ut->uu_vreclaims = NULLVP;

		while ((vp = old_parentvp) != NULLVP) {
			vnode_hold(vp);
			vnode_lock_spin(vp);
			vnode_rele_internal(vp, 0, 0, 1);

			/*
			 * check to see if the vnode is now in the state
			 * that would have triggered a vnode_reclaim in vnode_rele
			 * if it is, we save its parent pointer and then NULL
			 * out the v_parent field... we'll drop the reference
			 * that was held on the next iteration of this loop...
			 * this short circuits a potential deep recursion if we
			 * have a long chain of parents in this state...
			 * we'll sit in this loop until we run into
			 * a parent in this chain that is not in this state
			 *
			 * make our check and the vnode_rele atomic
			 * with respect to the current vnode we're working on
			 * by holding the vnode lock
			 * if vnode_rele deferred the vnode_reclaim and has put
			 * this vnode on the list to be reaped by us, then
			 * it has left this vnode with an iocount == 1
			 */
			if (ut->uu_vreclaims == vp) {
				/*
				 * This vnode is on the head of the uu_vreclaims chain
				 * which means vnode_rele wanted to do a vnode_reclaim
				 * on this vnode.  Pull the parent pointer now so that when we do the
				 * vnode_reclaim for each of the vnodes in the uu_vreclaims
				 * list, we won't recurse back through here
				 *
				 * need to do a convert here in case vnode_rele_internal
				 * returns with the lock held in the spin mode... it
				 * can drop and retake the lock under certain circumstances
				 */
				vnode_lock_convert(vp);

				NAME_CACHE_LOCK();
				old_parentvp = vp->v_parent;
				vp->v_parent = NULLVP;
				NAME_CACHE_UNLOCK();
			} else {
				/*
				 * we're done... we ran into a vnode that isn't
				 * being terminated
				 */
				old_parentvp = NULLVP;
			}
			vnode_drop_and_unlock(vp);
		}
		vreclaims = ut->uu_vreclaims;
		ut->uu_vreclaims = NULLVP;
		ut->uu_defer_reclaims = 0;

		while ((vp = vreclaims) != NULLVP) {
			vreclaims = vp->v_defer_reclaimlist;

			/*
			 * vnode_put will drive the vnode_reclaim if
			 * we are still the only reference on this vnode
			 */
			vnode_put(vp);
		}
	}
}

#if CONFIG_FIRMLINKS
errno_t
vnode_setasfirmlink(vnode_t vp, vnode_t target_vp)
{
	int error = 0;
	vnode_t old_target_vp = NULLVP;
	vnode_t old_target_vp_v_fmlink = NULLVP;
	kauth_cred_t target_vp_cred = NULL;
	kauth_cred_t old_target_vp_cred = NULL;

	if (!vp) {
		return EINVAL;
	}

	if (target_vp) {
		if (vp->v_fmlink == target_vp) { /* Will be checked again under the name cache lock */
			return 0;
		}

		/*
		 * Firmlink source and target will take both a usecount
		 * and kusecount on each other.
		 */
		if ((error = vnode_ref_ext(target_vp, O_EVTONLY, VNODE_REF_FORCE))) {
			return error;
		}

		if ((error = vnode_ref_ext(vp, O_EVTONLY, VNODE_REF_FORCE))) {
			vnode_rele_ext(target_vp, O_EVTONLY, 1);
			return error;
		}
	}

	NAME_CACHE_LOCK();

	old_target_vp = vp->v_fmlink;
	if (target_vp && (target_vp == old_target_vp)) {
		NAME_CACHE_UNLOCK();
		return 0;
	}
	vp->v_fmlink = target_vp;

	vnode_lock_spin(vp);
	vp->v_flag &= ~VFMLINKTARGET;
	vnode_unlock(vp);

	if (target_vp) {
		target_vp->v_fmlink = vp;
		vnode_lock_spin(target_vp);
		target_vp->v_flag |= VFMLINKTARGET;
		vnode_unlock(target_vp);
		cache_purge_locked(vp, &target_vp_cred);
	}

	if (old_target_vp) {
		old_target_vp_v_fmlink = old_target_vp->v_fmlink;
		old_target_vp->v_fmlink = NULLVP;
		vnode_lock_spin(old_target_vp);
		old_target_vp->v_flag &= ~VFMLINKTARGET;
		vnode_unlock(old_target_vp);
		cache_purge_locked(vp, &old_target_vp_cred);
	}

	NAME_CACHE_UNLOCK();

	if (IS_VALID_CRED(target_vp_cred)) {
		kauth_cred_unref(&target_vp_cred);
	}

	if (old_target_vp) {
		if (IS_VALID_CRED(old_target_vp_cred)) {
			kauth_cred_unref(&old_target_vp_cred);
		}

		vnode_rele_ext(old_target_vp, O_EVTONLY, 1);
		if (old_target_vp_v_fmlink) {
			vnode_rele_ext(old_target_vp_v_fmlink, O_EVTONLY, 1);
		}
	}

	return 0;
}

errno_t
vnode_getfirmlink(vnode_t vp, vnode_t *target_vp)
{
	int error;

	if (!vp->v_fmlink) {
		return ENODEV;
	}

	NAME_CACHE_LOCK_SHARED();
	if (vp->v_fmlink && !(vp->v_flag & VFMLINKTARGET) &&
	    (vnode_get(vp->v_fmlink) == 0)) {
		vnode_t tvp = vp->v_fmlink;

		vnode_lock_spin(tvp);
		if (tvp->v_lflag & (VL_TERMINATE | VL_DEAD)) {
			vnode_unlock(tvp);
			NAME_CACHE_UNLOCK();
			vnode_put(tvp);
			return ENOENT;
		}
		if (!(tvp->v_flag & VFMLINKTARGET)) {
			panic("firmlink target for vnode %p does not have flag set", vp);
		}
		vnode_unlock(tvp);
		*target_vp = tvp;
		error = 0;
	} else {
		*target_vp = NULLVP;
		error = ENODEV;
	}
	NAME_CACHE_UNLOCK();
	return error;
}

#else /* CONFIG_FIRMLINKS */

errno_t
vnode_setasfirmlink(__unused vnode_t vp, __unused vnode_t src_vp)
{
	return ENOTSUP;
}

errno_t
vnode_getfirmlink(__unused vnode_t vp, __unused vnode_t *target_vp)
{
	return ENOTSUP;
}

#endif

/*
 * Mark a vnode as having multiple hard links.  HFS makes use of this
 * because it keeps track of each link separately, and wants to know
 * which link was actually used.
 *
 * This will cause the name cache to force a VNOP_LOOKUP on the vnode
 * so that HFS can post-process the lookup.  Also, volfs will call
 * VNOP_GETATTR2 to determine the parent, instead of using v_parent.
 */
void
vnode_setmultipath(vnode_t vp)
{
	vnode_lock_spin(vp);

	/*
	 * In theory, we're changing the vnode's identity as far as the
	 * name cache is concerned, so we ought to grab the name cache lock
	 * here.  However, there is already a race, and grabbing the name
	 * cache lock only makes the race window slightly smaller.
	 *
	 * The race happens because the vnode already exists in the name
	 * cache, and could be found by one thread before another thread
	 * can set the hard link flag.
	 */

	vp->v_flag |= VISHARDLINK;

	vnode_unlock(vp);
}



/*
 * backwards compatibility
 */
void
vnode_uncache_credentials(vnode_t vp)
{
	vnode_uncache_authorized_action(vp, KAUTH_INVALIDATE_CACHED_RIGHTS);
}


/*
 * use the exclusive form of NAME_CACHE_LOCK to protect the update of the
 * following fields in the vnode: v_cred_timestamp, v_cred, v_authorized_actions
 * we use this lock so that we can look at the v_cred and v_authorized_actions
 * atomically while behind the NAME_CACHE_LOCK in shared mode in 'cache_lookup_path',
 * which is the super-hot path... if we are updating the authorized actions for this
 * vnode, we are already in the super-slow and far less frequented path so it's not
 * that bad that we take the lock exclusive for this case... of course we strive
 * to hold it for the minimum amount of time possible
 */

void
vnode_uncache_authorized_action(vnode_t vp, kauth_action_t action)
{
	kauth_cred_t tcred = NOCRED;

	NAME_CACHE_LOCK();

	vp->v_authorized_actions &= ~action;

	if (action == KAUTH_INVALIDATE_CACHED_RIGHTS &&
	    IS_VALID_CRED(vp->v_cred)) {
		/*
		 * Use a temp variable to avoid kauth_cred_unref() while NAME_CACHE_LOCK is held
		 */
		tcred = vnode_cred(vp);
		vp->v_cred = NOCRED;
	}
	NAME_CACHE_UNLOCK();

	if (IS_VALID_CRED(tcred)) {
		kauth_cred_unref(&tcred);
	}
}


/* disable vnode_cache_is_authorized() by setting vnode_cache_defeat */
static TUNABLE(int, bootarg_vnode_cache_defeat, "-vnode_cache_defeat", 0);

boolean_t
vnode_cache_is_authorized(vnode_t vp, vfs_context_t ctx, kauth_action_t action)
{
	kauth_cred_t ucred;
	boolean_t retval = FALSE;

	/* Boot argument to defeat rights caching */
	if (bootarg_vnode_cache_defeat) {
		return FALSE;
	}

	if ((vp->v_mount->mnt_kern_flag & (MNTK_AUTH_OPAQUE | MNTK_AUTH_CACHE_TTL))) {
		/*
		 * a TTL is enabled on the rights cache... handle it here
		 * a TTL of 0 indicates that no rights should be cached
		 */
		if (vp->v_mount->mnt_authcache_ttl) {
			if (!(vp->v_mount->mnt_kern_flag & MNTK_AUTH_CACHE_TTL)) {
				/*
				 * For filesystems marked only MNTK_AUTH_OPAQUE (generally network ones),
				 * we will only allow a SEARCH right on a directory to be cached...
				 * that cached right always has a default TTL associated with it
				 */
				if (action != KAUTH_VNODE_SEARCH || vp->v_type != VDIR) {
					vp = NULLVP;
				}
			}
			if (vp != NULLVP && vnode_cache_is_stale(vp) == TRUE) {
				vnode_uncache_authorized_action(vp, vp->v_authorized_actions);
				vp = NULLVP;
			}
		} else {
			vp = NULLVP;
		}
	}
	if (vp != NULLVP) {
		ucred = vfs_context_ucred(ctx);

		NAME_CACHE_LOCK_SHARED();

		if (vnode_cred(vp) == ucred && (vp->v_authorized_actions & action) == action) {
			retval = TRUE;
		}

		NAME_CACHE_UNLOCK();
	}
	return retval;
}


void
vnode_cache_authorized_action(vnode_t vp, vfs_context_t ctx, kauth_action_t action)
{
	kauth_cred_t tcred = NOCRED;
	kauth_cred_t ucred;
	struct timeval tv;
	boolean_t ttl_active = FALSE;

	ucred = vfs_context_ucred(ctx);

	if (!IS_VALID_CRED(ucred) || action == 0) {
		return;
	}

	if ((vp->v_mount->mnt_kern_flag & (MNTK_AUTH_OPAQUE | MNTK_AUTH_CACHE_TTL))) {
		/*
		 * a TTL is enabled on the rights cache... handle it here
		 * a TTL of 0 indicates that no rights should be cached
		 */
		if (vp->v_mount->mnt_authcache_ttl == 0) {
			return;
		}

		if (!(vp->v_mount->mnt_kern_flag & MNTK_AUTH_CACHE_TTL)) {
			/*
			 * only cache SEARCH action for filesystems marked
			 * MNTK_AUTH_OPAQUE on VDIRs...
			 * the lookup_path code will time these out
			 */
			if ((action & ~KAUTH_VNODE_SEARCH) || vp->v_type != VDIR) {
				return;
			}
		}
		ttl_active = TRUE;

		microuptime(&tv);
	}
	NAME_CACHE_LOCK();

	tcred = vnode_cred(vp);
	if (tcred == ucred) {
		tcred = NOCRED;
	} else {
		/*
		 * Use a temp variable to avoid kauth_cred_drop() while NAME_CACHE_LOCK is held
		 */
		kauth_cred_ref(ucred);
		vp->v_cred = ucred;
		vp->v_authorized_actions = 0;
	}
	if (ttl_active == TRUE && vp->v_authorized_actions == 0) {
		/*
		 * only reset the timestamp on the
		 * first authorization cached after the previous
		 * timer has expired or we're switching creds...
		 * 'vnode_cache_is_authorized' will clear the
		 * authorized actions if the TTL is active and
		 * it has expired
		 */
		vp->v_cred_timestamp = (int)tv.tv_sec;
	}
	vp->v_authorized_actions |= action;

	NAME_CACHE_UNLOCK();

	if (IS_VALID_CRED(tcred)) {
		kauth_cred_unref(&tcred);
	}
}


boolean_t
vnode_cache_is_stale(vnode_t vp)
{
	struct timeval tv;
	boolean_t retval;

	microuptime(&tv);

	if ((tv.tv_sec - vp->v_cred_timestamp) > vp->v_mount->mnt_authcache_ttl) {
		retval = TRUE;
	} else {
		retval = FALSE;
	}

	return retval;
}
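
/*
 * Worked example (illustrative): with mnt_authcache_ttl = 5 and cached
 * rights stamped at uptime t (v_cred_timestamp == t), the cache becomes
 * stale once microuptime() reports tv_sec - t > 5; the next
 * vnode_cache_is_authorized() call then flushes the stale rights via
 * vnode_uncache_authorized_action().
 */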

VFS_SMR_DECLARE;

/*
 * Components of nameidata (or objects it can point to) which may
 * need restoring in case fast path lookup fails.
 */
struct nameidata_state {
	u_long ni_loopcnt;
	char *ni_next;
	u_int ni_pathlen;
	int32_t ni_flag;
	char *cn_nameptr;
	int cn_namelen;
	int cn_flags;
	uint32_t cn_hash;
};

static void
save_ndp_state(struct nameidata *ndp, struct componentname *cnp, struct nameidata_state *saved_statep)
{
	saved_statep->ni_loopcnt = ndp->ni_loopcnt;
	saved_statep->ni_next = ndp->ni_next;
	saved_statep->ni_pathlen = ndp->ni_pathlen;
	saved_statep->ni_flag = ndp->ni_flag;
	saved_statep->cn_nameptr = cnp->cn_nameptr;
	saved_statep->cn_namelen = cnp->cn_namelen;
	saved_statep->cn_flags = cnp->cn_flags;
	saved_statep->cn_hash = cnp->cn_hash;
}

static void
restore_ndp_state(struct nameidata *ndp, struct componentname *cnp, struct nameidata_state *saved_statep)
{
	ndp->ni_loopcnt = saved_statep->ni_loopcnt;
	ndp->ni_next = saved_statep->ni_next;
	ndp->ni_pathlen = saved_statep->ni_pathlen;
	ndp->ni_flag = saved_statep->ni_flag;
	cnp->cn_nameptr = saved_statep->cn_nameptr;
	cnp->cn_namelen = saved_statep->cn_namelen;
	cnp->cn_flags = saved_statep->cn_flags;
	cnp->cn_hash = saved_statep->cn_hash;
}

static inline bool
vid_is_same(vnode_t vp, uint32_t vid)
{
	return !(os_atomic_load(&vp->v_lflag, relaxed) & (VL_DRAIN | VL_TERMINATE | VL_DEAD)) && (vnode_vid(vp) == vid);
}
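
/*
 * Illustrative sketch: lock-free (SMR) walkers capture a vnode's v_id up
 * front and re-validate with vid_is_same() before trusting anything they
 * read, e.g.:
 *
 *	uint32_t vid = vnode_vid(vp);
 *	... inspect vp fields without the name cache lock ...
 *	if (!vid_is_same(vp, vid))
 *		fall back to the locked lookup path
 */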
1732 | |
1733 | static inline bool |
1734 | can_check_v_mountedhere(vnode_t vp) |
1735 | { |
1736 | return (os_atomic_load(&vp->v_usecount, relaxed) > 0) && |
1737 | (os_atomic_load(&vp->v_flag, relaxed) & VMOUNTEDHERE) && |
!(os_atomic_load(&vp->v_lflag, relaxed) & (VL_TERMINATE | VL_DEAD)) &&
(vp->v_type == VDIR);
1740 | } |
1741 | |
1742 | /* |
1743 | * Returns: 0 Success |
1744 | * ERECYCLE vnode was recycled from underneath us. Force lookup to be re-driven from namei. |
1745 | * This errno value should not be seen by anyone outside of the kernel. |
1746 | */ |
1747 | int |
1748 | cache_lookup_path(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, |
1749 | vfs_context_t ctx, int *dp_authorized, vnode_t last_dp) |
1750 | { |
1751 | struct nameidata_state saved_state; |
1752 | char *cp; /* pointer into pathname argument */ |
1753 | uint32_t vid; |
1754 | uint32_t vvid = 0; /* protected by vp != NULLVP */ |
1755 | vnode_t vp = NULLVP; |
1756 | vnode_t tdp = NULLVP; |
1757 | vnode_t start_dp = dp; |
1758 | kauth_cred_t ucred; |
1759 | boolean_t ttl_enabled = FALSE; |
1760 | struct timeval tv; |
1761 | mount_t mp; |
1762 | mount_t dmp; |
1763 | unsigned int hash; |
1764 | int error = 0; |
1765 | boolean_t dotdotchecked = FALSE; |
1766 | bool locked = false; |
1767 | bool needs_lock = false; |
1768 | bool dp_iocount_taken = false; |
1769 | |
1770 | #if CONFIG_TRIGGERS |
1771 | vnode_t trigger_vp; |
1772 | #endif /* CONFIG_TRIGGERS */ |
1773 | |
1774 | ucred = vfs_context_ucred(ctx); |
1775 | retry: |
1776 | if (nc_smr_enabled && !needs_lock) { |
save_ndp_state(ndp, cnp, &saved_state);
1778 | vfs_smr_enter(); |
1779 | } else { |
1780 | NAME_CACHE_LOCK_SHARED(); |
1781 | locked = true; |
1782 | } |
1783 | ndp->ni_flag &= ~(NAMEI_TRAILINGSLASH); |
1784 | |
1785 | dmp = dp->v_mount; |
1786 | vid = dp->v_id; |
1787 | if (dmp && (dmp->mnt_kern_flag & (MNTK_AUTH_OPAQUE | MNTK_AUTH_CACHE_TTL))) { |
1788 | ttl_enabled = TRUE; |
microuptime(&tv);
1790 | } |
1791 | for (;;) { |
1792 | /* |
1793 | * Search a directory. |
1794 | * |
1795 | * The cn_hash value is for use by cache_lookup |
1796 | * The last component of the filename is left accessible via |
1797 | * cnp->cn_nameptr for callers that need the name. |
1798 | */ |
1799 | hash = 0; |
1800 | cp = cnp->cn_nameptr; |
1801 | |
1802 | while (*cp && (*cp != '/')) { |
1803 | hash = crc32tab[((hash >> 24) ^ (unsigned char)*cp++)] ^ hash << 8; |
1804 | } |
1805 | /* |
1806 | * the crc generator can legitimately generate |
1807 | * a 0... however, 0 for us means that we |
1808 | * haven't computed a hash, so use 1 instead |
1809 | */ |
1810 | if (hash == 0) { |
1811 | hash = 1; |
1812 | } |
1813 | cnp->cn_hash = hash; |
1814 | cnp->cn_namelen = (int)(cp - cnp->cn_nameptr); |
1815 | |
1816 | ndp->ni_pathlen -= cnp->cn_namelen; |
1817 | ndp->ni_next = cp; |
1818 | |
1819 | /* |
1820 | * Replace multiple slashes by a single slash and trailing slashes |
1821 | * by a null. This must be done before VNOP_LOOKUP() because some |
1822 | * fs's don't know about trailing slashes. Remember if there were |
1823 | * trailing slashes to handle symlinks, existing non-directories |
1824 | * and non-existing files that won't be directories specially later. |
1825 | */ |
1826 | while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { |
1827 | cp++; |
1828 | ndp->ni_pathlen--; |
1829 | |
1830 | if (*cp == '\0') { |
1831 | ndp->ni_flag |= NAMEI_TRAILINGSLASH; |
1832 | *ndp->ni_next = '\0'; |
1833 | } |
1834 | } |
1835 | ndp->ni_next = cp; |
1836 | |
1837 | cnp->cn_flags &= ~(MAKEENTRY | ISLASTCN | ISDOTDOT); |
1838 | |
1839 | if (*cp == '\0') { |
1840 | cnp->cn_flags |= ISLASTCN; |
1841 | } |
1842 | |
1843 | if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') { |
1844 | cnp->cn_flags |= ISDOTDOT; |
1845 | } |
1846 | |
1847 | #if NAMEDRSRCFORK |
1848 | /* |
1849 | * Process a request for a file's resource fork. |
1850 | * |
1851 | * Consume the _PATH_RSRCFORKSPEC suffix and tag the path. |
1852 | */ |
1853 | if ((ndp->ni_pathlen == sizeof(_PATH_RSRCFORKSPEC)) && |
1854 | (cp[1] == '.' && cp[2] == '.') && |
bcmp(cp, _PATH_RSRCFORKSPEC, sizeof(_PATH_RSRCFORKSPEC)) == 0) {
1856 | /* Skip volfs file systems that don't support native streams. */ |
1857 | if ((dmp != NULL) && |
1858 | (dmp->mnt_flag & MNT_DOVOLFS) && |
1859 | (dmp->mnt_kern_flag & MNTK_NAMED_STREAMS) == 0) { |
1860 | goto skiprsrcfork; |
1861 | } |
1862 | cnp->cn_flags |= CN_WANTSRSRCFORK; |
1863 | cnp->cn_flags |= ISLASTCN; |
1864 | ndp->ni_next[0] = '\0'; |
1865 | ndp->ni_pathlen = 1; |
1866 | } |
1867 | skiprsrcfork: |
1868 | #endif |
1869 | |
1870 | *dp_authorized = 0; |
1871 | |
1872 | #if CONFIG_FIRMLINKS |
1873 | if ((cnp->cn_flags & ISDOTDOT) && (dp->v_flag & VFMLINKTARGET) && dp->v_fmlink) { |
1874 | /* |
1875 | * If this is a firmlink target then dp has to be switched to the |
1876 | * firmlink "source" before exiting this loop. |
1877 | * |
1878 | * For a firmlink "target", the policy is to pick the parent of the |
1879 | * firmlink "source" as the parent. This means that you can never |
1880 | * get to the "real" parent of firmlink target via a dotdot lookup. |
1881 | */ |
1882 | vnode_t v_fmlink = dp->v_fmlink; |
1883 | uint32_t old_vid = vid; |
1884 | mp = dmp; |
1885 | if (v_fmlink) { |
1886 | vid = v_fmlink->v_id; |
1887 | dmp = v_fmlink->v_mount; |
1888 | if ((dp->v_fmlink == v_fmlink) && dmp) { |
1889 | dp = v_fmlink; |
1890 | } else { |
1891 | vid = old_vid; |
1892 | dmp = mp; |
1893 | } |
1894 | } |
1895 | } |
1896 | #endif |
1897 | |
1898 | |
1899 | if (ttl_enabled && |
1900 | (dmp->mnt_authcache_ttl == 0 || |
1901 | ((tv.tv_sec - dp->v_cred_timestamp) > dmp->mnt_authcache_ttl))) { |
1902 | break; |
1903 | } |
1904 | |
1905 | /* |
1906 | * NAME_CACHE_LOCK holds these fields stable |
1907 | * |
1908 | * We can't cache KAUTH_VNODE_SEARCHBYANYONE for root correctly |
1909 | * so we make an ugly check for root here. root is always |
1910 | * allowed and breaking out of here only to find out that is |
1911 | * authorized by virtue of being root is very very expensive. |
1912 | * However, the check for not root is valid only for filesystems |
1913 | * which use local authorization. |
1914 | * |
1915 | * XXX: Remove the check for root when we can reliably set |
1916 | * KAUTH_VNODE_SEARCHBYANYONE as root. |
1917 | */ |
1918 | int v_authorized_actions = os_atomic_load(&dp->v_authorized_actions, relaxed); |
1919 | if ((vnode_cred(dp) != ucred || !(v_authorized_actions & KAUTH_VNODE_SEARCH)) && |
1920 | !(v_authorized_actions & KAUTH_VNODE_SEARCHBYANYONE) && |
1921 | (ttl_enabled || !vfs_context_issuser(ctx))) { |
1922 | break; |
1923 | } |
1924 | |
1925 | /* |
1926 | * indicate that we're allowed to traverse this directory... |
1927 | * even if we fail the cache lookup or decide to bail for |
1928 | * some other reason, this information is valid and is used |
1929 | * to avoid doing a vnode_authorize before the call to VNOP_LOOKUP |
1930 | */ |
1931 | *dp_authorized = 1; |
1932 | |
1933 | if ((cnp->cn_flags & (ISLASTCN | ISDOTDOT))) { |
1934 | if (cnp->cn_nameiop != LOOKUP) { |
1935 | break; |
1936 | } |
1937 | if (cnp->cn_flags & LOCKPARENT) { |
1938 | break; |
1939 | } |
1940 | if (cnp->cn_flags & NOCACHE) { |
1941 | break; |
1942 | } |
1943 | |
1944 | if (cnp->cn_flags & ISDOTDOT) { |
1945 | /* |
1946 | * Force directory hardlinks to go to |
1947 | * file system for ".." requests. |
1948 | */ |
1949 | if ((dp->v_flag & VISHARDLINK)) { |
1950 | break; |
1951 | } |
1952 | /* |
1953 | * Quit here only if we can't use |
1954 | * the parent directory pointer or |
1955 | * don't have one. Otherwise, we'll |
1956 | * use it below. |
1957 | */ |
1958 | if ((dp->v_flag & VROOT) || |
1959 | dp == ndp->ni_rootdir || |
1960 | dp->v_parent == NULLVP) { |
1961 | break; |
1962 | } |
1963 | } |
1964 | } |
1965 | |
1966 | if ((cnp->cn_flags & CN_SKIPNAMECACHE)) { |
1967 | /* |
1968 | * Force lookup to go to the filesystem with |
1969 | * all cnp fields set up. |
1970 | */ |
1971 | break; |
1972 | } |
1973 | |
1974 | /* |
1975 | * "." and ".." aren't supposed to be cached, so check |
1976 | * for them before checking the cache. |
1977 | */ |
1978 | if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { |
1979 | vp = dp; |
1980 | vvid = vid; |
1981 | } else if ((cnp->cn_flags & ISDOTDOT)) { |
1982 | /* |
1983 | * If this is a chrooted process, we need to check if |
1984 | * the process is trying to break out of its chrooted |
1985 | * jail. We do that by trying to determine if dp is |
1986 | * a subdirectory of ndp->ni_rootdir. If we aren't |
1987 | * able to determine that by the v_parent pointers, we |
1988 | * will leave the fast path. |
1989 | * |
1990 | * Since this function may see dotdot components |
1991 | * many times and it has the name cache lock held for |
1992 | * the entire duration, we optimise this by doing this |
1993 | * check only once per cache_lookup_path call. |
1994 | * If dotdotchecked is set, it means we've done this |
1995 | * check once already and don't need to do it again. |
1996 | */ |
1997 | if (!locked && (ndp->ni_rootdir != rootvnode)) { |
1998 | vfs_smr_leave(); |
1999 | needs_lock = true; |
2000 | goto prep_lock_retry; |
2001 | } else if (locked && !dotdotchecked && (ndp->ni_rootdir != rootvnode)) { |
2002 | vnode_t tvp = dp; |
2003 | boolean_t defer = FALSE; |
2004 | boolean_t is_subdir = FALSE; |
2005 | |
defer = cache_check_vnode_issubdir(tvp,
    ndp->ni_rootdir, &is_subdir, &tvp);
2008 | |
2009 | if (defer) { |
2010 | /* defer to Filesystem */ |
2011 | break; |
2012 | } else if (!is_subdir) { |
2013 | /* |
2014 | * This process is trying to break out |
2015 | * of its chrooted jail, so all its |
2016 | * dotdot accesses will be translated to |
2017 | * its root directory. |
2018 | */ |
2019 | vp = ndp->ni_rootdir; |
2020 | } else { |
2021 | /* |
2022 | * All good, let this dotdot access |
2023 | * proceed normally |
2024 | */ |
2025 | vp = dp->v_parent; |
2026 | } |
2027 | dotdotchecked = TRUE; |
2028 | } else { |
2029 | vp = dp->v_parent; |
2030 | } |
2031 | if (!vp) { |
2032 | break; |
2033 | } |
2034 | vvid = vp->v_id; |
2035 | } else { |
2036 | if (!locked) { |
vp = cache_lookup_smr(dp, cnp, &vvid);
if (!vid_is_same(dp, vid)) {
2039 | vp = NULLVP; |
2040 | needs_lock = true; |
2041 | vfs_smr_leave(); |
2042 | goto prep_lock_retry; |
2043 | } |
2044 | } else { |
vp = cache_lookup_locked(dp, cnp, &vvid);
2046 | } |
2047 | |
2048 | |
2049 | if (!vp) { |
2050 | break; |
2051 | } |
2052 | |
2053 | if ((vp->v_flag & VISHARDLINK)) { |
2054 | /* |
2055 | * The file system wants a VNOP_LOOKUP on this vnode |
2056 | */ |
2057 | vp = NULL; |
2058 | break; |
2059 | } |
2060 | |
2061 | #if CONFIG_FIRMLINKS |
2062 | vnode_t v_fmlink = vp->v_fmlink; |
2063 | if (v_fmlink && !(vp->v_flag & VFMLINKTARGET)) { |
2064 | if (cnp->cn_flags & CN_FIRMLINK_NOFOLLOW || |
2065 | ((vp->v_type != VDIR) && (vp->v_type != VLNK))) { |
2066 | /* Leave it to the filesystem */ |
2067 | vp = NULLVP; |
2068 | break; |
2069 | } |
2070 | |
2071 | /* |
2072 | * Always switch to the target unless it is a VLNK |
2073 | * and it is the last component and we have NOFOLLOW |
2074 | * semantics |
2075 | */ |
2076 | if (vp->v_type == VDIR) { |
2077 | vp = v_fmlink; |
2078 | vvid = vnode_vid(vp); |
2079 | } else if ((cnp->cn_flags & FOLLOW) || |
2080 | (ndp->ni_flag & NAMEI_TRAILINGSLASH) || *ndp->ni_next == '/') { |
2081 | if (ndp->ni_loopcnt >= MAXSYMLINKS - 1) { |
2082 | vp = NULLVP; |
2083 | break; |
2084 | } |
2085 | ndp->ni_loopcnt++; |
2086 | vp = v_fmlink; |
2087 | vvid = vnode_vid(vp); |
2088 | } |
2089 | } |
2090 | #endif |
2091 | } |
2092 | if ((cnp->cn_flags & ISLASTCN)) { |
2093 | break; |
2094 | } |
2095 | |
2096 | if (vp->v_type != VDIR) { |
2097 | if (vp->v_type != VLNK) { |
2098 | vp = NULL; |
2099 | } |
2100 | break; |
2101 | } |
2102 | |
2103 | /* |
2104 | * v_mountedhere is PAC protected which means vp has to be a VDIR |
2105 | * to access that pointer as v_mountedhere. However, if we don't |
2106 | * have the name cache lock or an iocount (which we won't in the |
2107 | * !locked case) we can't guarantee that. So we try to detect it |
2108 | * via other fields to avoid having to dereference v_mountedhere |
* when we don't need to. Note that in theory, if an entire reclaim
* happens between the time we check can_check_v_mountedhere()
* and the subsequent access, this can still fail, but the fields
* we check make that exceedingly unlikely: the chances of it
* happening are practically (though not strictly) zero.
2115 | */ |
2116 | if ((locked || can_check_v_mountedhere(vp)) && |
2117 | (mp = vp->v_mountedhere) && ((cnp->cn_flags & NOCROSSMOUNT) == 0)) { |
2118 | vnode_t tmp_vp; |
2119 | int tmp_vid; |
2120 | |
if (!(locked || vid_is_same(vp, vvid))) {
2122 | vp = NULL; |
2123 | break; |
2124 | } |
2125 | tmp_vp = mp->mnt_realrootvp; |
2126 | tmp_vid = mp->mnt_realrootvp_vid; |
2127 | if (tmp_vp == NULLVP || mp->mnt_generation != mount_generation || |
2128 | tmp_vid != tmp_vp->v_id) { |
2129 | break; |
2130 | } |
2131 | |
2132 | if ((mp = tmp_vp->v_mount) == NULL) { |
2133 | break; |
2134 | } |
2135 | |
2136 | vp = tmp_vp; |
2137 | vvid = tmp_vid; |
2138 | dmp = mp; |
2139 | if (dmp->mnt_kern_flag & (MNTK_AUTH_OPAQUE | MNTK_AUTH_CACHE_TTL)) { |
2140 | ttl_enabled = TRUE; |
microuptime(&tv);
2142 | } else { |
2143 | ttl_enabled = FALSE; |
2144 | } |
2145 | } |
2146 | |
2147 | #if CONFIG_TRIGGERS |
2148 | /* |
2149 | * After traversing all mountpoints stacked here, if we have a |
2150 | * trigger in hand, resolve it. Note that we don't need to |
2151 | * leave the fast path if the mount has already happened. |
2152 | */ |
2153 | if (vp->v_resolve) { |
2154 | break; |
2155 | } |
2156 | #endif /* CONFIG_TRIGGERS */ |
2157 | |
if (!(locked || vid_is_same(vp, vvid))) {
2159 | vp = NULL; |
2160 | break; |
2161 | } |
2162 | |
2163 | dp = vp; |
2164 | vid = vvid; |
2165 | vp = NULLVP; |
2166 | vvid = 0; |
2167 | |
2168 | cnp->cn_nameptr = ndp->ni_next + 1; |
2169 | ndp->ni_pathlen--; |
2170 | while (*cnp->cn_nameptr == '/') { |
2171 | cnp->cn_nameptr++; |
2172 | ndp->ni_pathlen--; |
2173 | } |
2174 | } |
2175 | if (!locked) { |
2176 | if (vp && !vnode_hold_smr(vp)) { |
2177 | vp = NULLVP; |
2178 | vvid = 0; |
2179 | } |
2180 | if (!vnode_hold_smr(dp)) { |
2181 | vfs_smr_leave(); |
2182 | if (vp) { |
2183 | vnode_drop(vp); |
2184 | vp = NULLVP; |
2185 | vvid = 0; |
2186 | } |
2187 | goto prep_lock_retry; |
2188 | } |
2189 | vfs_smr_leave(); |
2190 | } else { |
2191 | if (vp != NULLVP) { |
2192 | vvid = vp->v_id; |
2193 | vnode_hold(vp); |
2194 | } |
2195 | vid = dp->v_id; |
2196 | |
vnode_hold(dp);
2198 | NAME_CACHE_UNLOCK(); |
2199 | } |
2200 | |
2201 | tdp = NULLVP; |
2202 | if (!(cnp->cn_flags & DONOTAUTH) && |
2203 | (vp != NULLVP) && (vp->v_type != VLNK) && |
2204 | ((cnp->cn_flags & (ISLASTCN | LOCKPARENT | WANTPARENT | SAVESTART)) == ISLASTCN)) { |
2205 | /* |
2206 | * if we've got a child and it's the last component, and |
2207 | * the lookup doesn't need to return the parent then we |
2208 | * can skip grabbing an iocount on the parent, since all |
2209 | * we're going to do with it is a vnode_put just before |
2210 | * we return from 'lookup'. If it's a symbolic link, |
2211 | * we need the parent in case the link happens to be |
2212 | * a relative pathname. |
2213 | * |
2214 | * However, we can't make this optimisation if we have to call |
2215 | * a MAC hook. |
2216 | */ |
2217 | tdp = dp; |
2218 | dp = NULLVP; |
2219 | } else { |
2220 | need_dp: |
2221 | /* |
2222 | * return the last directory we looked at |
2223 | * with an io reference held. If it was the one passed |
2224 | * in as a result of the last iteration of VNOP_LOOKUP, |
2225 | * it should already hold an io ref. No need to increase ref. |
2226 | */ |
2227 | if (last_dp != dp) { |
2228 | if (dp == ndp->ni_usedvp) { |
2229 | /* |
2230 | * if this vnode matches the one passed in via USEDVP |
* then this context already holds an io_count... just
2232 | * use vnode_get to get an extra ref for lookup to play |
2233 | * with... can't use the getwithvid variant here because |
2234 | * it will block behind a vnode_drain which would result |
2235 | * in a deadlock (since we already own an io_count that the |
2236 | * vnode_drain is waiting on)... vnode_get grabs the io_count |
2237 | * immediately w/o waiting... it always succeeds |
2238 | */ |
2239 | vnode_get(dp); |
2240 | } else if ((error = vnode_getwithvid_drainok(dp, vid))) { |
2241 | /* |
2242 | * failure indicates the vnode |
2243 | * changed identity or is being |
2244 | * TERMINATED... in either case |
2245 | * punt this lookup. |
2246 | * |
2247 | * don't necessarily return ENOENT, though, because |
2248 | * we really want to go back to disk and make sure it's |
2249 | * there or not if someone else is changing this |
2250 | * vnode. That being said, the one case where we do want |
2251 | * to return ENOENT is when the vnode's mount point is |
2252 | * in the process of unmounting and we might cause a deadlock |
2253 | * in our attempt to take an iocount. An ENODEV error return |
* from vnode_get* is an indication of this, but we change it to
* ENOENT for upper layers.
2256 | */ |
2257 | if (error == ENODEV) { |
2258 | error = ENOENT; |
2259 | } else { |
2260 | error = ERECYCLE; |
2261 | } |
vnode_drop(dp);
2263 | if (vp) { |
2264 | vnode_drop(vp); |
2265 | } |
2266 | goto errorout; |
2267 | } |
2268 | dp_iocount_taken = true; |
2269 | } |
vnode_drop(dp);
2271 | } |
2272 | |
2273 | #if CONFIG_MACF |
2274 | /* |
2275 | * Name cache provides authorization caching (see below) |
2276 | * that will short circuit MAC checks in lookup(). |
* We must perform the MAC check here. On denial,
* dp_authorized will remain 0 and a second check will
* be performed in lookup().
2280 | */ |
2281 | if (!(cnp->cn_flags & DONOTAUTH)) { |
error = mac_vnode_check_lookup(ctx, dp, cnp);
2283 | if (error) { |
2284 | *dp_authorized = 0; |
2285 | if (dp_iocount_taken) { |
vnode_put(dp);
2287 | } |
2288 | if (vp) { |
2289 | vnode_drop(vp); |
2290 | vp = NULLVP; |
2291 | } |
2292 | goto errorout; |
2293 | } |
2294 | } |
2295 | #endif /* MAC */ |
2296 | |
2297 | if (vp != NULLVP) { |
2298 | if ((vnode_getwithvid_drainok(vp, vvid))) { |
2299 | vnode_drop(vp); |
2300 | vp = NULLVP; |
2301 | |
2302 | /* |
2303 | * can't get reference on the vp we'd like |
2304 | * to return... if we didn't grab a reference |
2305 | * on the directory (due to fast path bypass), |
2306 | * then we need to do it now... we can't return |
2307 | * with both ni_dvp and ni_vp NULL, and no |
2308 | * error condition |
2309 | */ |
2310 | if (dp == NULLVP) { |
2311 | dp = tdp; |
2312 | tdp = NULLVP; |
2313 | goto need_dp; |
2314 | } |
2315 | } else { |
2316 | vnode_drop(vp); |
2317 | } |
2318 | if (dp_iocount_taken && vp && (vp->v_type != VLNK) && |
2319 | ((cnp->cn_flags & (ISLASTCN | LOCKPARENT | WANTPARENT | SAVESTART)) == ISLASTCN)) { |
vnode_put(dp);
2321 | dp = NULLVP; |
2322 | } |
2323 | } |
2324 | |
2325 | if (tdp) { |
vnode_drop(tdp);
2327 | tdp = NULLVP; |
2328 | } |
2329 | |
2330 | ndp->ni_dvp = dp; |
2331 | ndp->ni_vp = vp; |
2332 | |
2333 | #if CONFIG_TRIGGERS |
2334 | trigger_vp = vp ? vp : dp; |
if ((error == 0) && (trigger_vp != NULLVP) && vnode_isdir(trigger_vp)) {
2336 | error = vnode_trigger_resolve(trigger_vp, ndp, ctx); |
2337 | if (error) { |
2338 | if (vp) { |
2339 | vnode_put(vp); |
2340 | } |
2341 | if (dp) { |
vnode_put(dp);
2343 | } |
2344 | goto errorout; |
2345 | } |
2346 | } |
2347 | #endif /* CONFIG_TRIGGERS */ |
2348 | |
2349 | errorout: |
2350 | /* |
2351 | * If we came into cache_lookup_path after an iteration of the lookup loop that |
* resulted in a call to VNOP_LOOKUP, then VNOP_LOOKUP returned a vnode with an io ref
2353 | * on it. It is now the job of cache_lookup_path to drop the ref on this vnode |
2354 | * when it is no longer needed. If we get to this point, and last_dp is not NULL |
2355 | * and it is ALSO not the dvp we want to return to caller of this function, it MUST be |
2356 | * the case that we got to a subsequent path component and this previous vnode is |
2357 | * no longer needed. We can then drop the io ref on it. |
2358 | */ |
2359 | if ((last_dp != NULLVP) && (last_dp != ndp->ni_dvp)) { |
vnode_put(last_dp);
2361 | } |
2362 | |
// error was initialized to 0 and is unchanged if no error cases occurred.
2364 | return error; |
2365 | |
2366 | prep_lock_retry: |
restore_ndp_state(ndp, cnp, &saved_state);
2368 | dp = start_dp; |
2369 | goto retry; |
2370 | } |
2371 | |
2372 | |
2373 | static vnode_t |
2374 | cache_lookup_locked(vnode_t dvp, struct componentname *cnp, uint32_t *vidp) |
2375 | { |
2376 | struct namecache *ncp; |
2377 | long namelen = cnp->cn_namelen; |
2378 | unsigned int hashval = cnp->cn_hash; |
2379 | |
2380 | if (nc_disabled) { |
2381 | return NULL; |
2382 | } |
2383 | |
2384 | smrq_serialized_foreach(ncp, NCHHASH(dvp, cnp->cn_hash), nc_hash) { |
2385 | if ((ncp->nc_dvp == dvp) && (ncp->nc_hashval == hashval)) { |
if (strncmp(ncp->nc_name, cnp->cn_nameptr, namelen) == 0 && ncp->nc_name[namelen] == 0) {
2387 | break; |
2388 | } |
2389 | } |
2390 | } |
2391 | if (ncp == 0) { |
2392 | /* |
2393 | * We failed to find an entry |
2394 | */ |
2395 | NCHSTAT(ncs_miss); |
2396 | NC_SMR_STATS(clp_next_fail); |
2397 | return NULL; |
2398 | } |
2399 | NCHSTAT(ncs_goodhits); |
2400 | |
2401 | if (!ncp->nc_vp) { |
2402 | return NULL; |
2403 | } |
2404 | |
2405 | *vidp = ncp->nc_vid; |
2406 | NC_SMR_STATS(clp_next); |
2407 | |
2408 | return ncp->nc_vp; |
2409 | } |
2410 | |
2411 | static vnode_t |
2412 | cache_lookup_smr(vnode_t dvp, struct componentname *cnp, uint32_t *vidp) |
2413 | { |
2414 | struct namecache *ncp; |
2415 | long namelen = cnp->cn_namelen; |
2416 | unsigned int hashval = cnp->cn_hash; |
2417 | vnode_t vp = NULLVP; |
2418 | uint32_t vid = 0; |
2419 | uint32_t counter = 1; |
2420 | |
2421 | if (nc_disabled) { |
2422 | return NULL; |
2423 | } |
2424 | |
2425 | smrq_entered_foreach(ncp, NCHHASH(dvp, cnp->cn_hash), nc_hash) { |
2426 | counter = os_atomic_load(&ncp->nc_counter, acquire); |
2427 | if (!(counter & NC_VALID)) { |
2428 | ncp = NULL; |
2429 | goto out; |
2430 | } |
2431 | if ((ncp->nc_dvp == dvp) && (ncp->nc_hashval == hashval)) { |
2432 | const char *nc_name = |
2433 | os_atomic_load(&ncp->nc_name, relaxed); |
2434 | if (nc_name && |
strncmp(nc_name, cnp->cn_nameptr, namelen) == 0 &&
2436 | nc_name[namelen] == 0) { |
2437 | break; |
2438 | } else if (!nc_name) { |
2439 | ncp = NULL; |
2440 | goto out; |
2441 | } |
2442 | } |
2443 | } |
2444 | |
2445 | /* We failed to find an entry */ |
2446 | if (ncp == 0) { |
2447 | goto out; |
2448 | } |
2449 | |
2450 | vp = ncp->nc_vp; |
2451 | vid = ncp->nc_vid; |
2452 | |
2453 | /* |
2454 | * The validity of vp and vid depends on the value of the counter being |
2455 | * the same when we read it first in the loop and now. Anything else |
2456 | * and we can't use this vp & vid. |
2457 | * Hopefully this ncp wasn't reused 2 billion times between the time |
* we read it first and when we read the counter value again.
2459 | */ |
2460 | if (os_atomic_load(&ncp->nc_counter, acquire) != counter) { |
2461 | vp = NULLVP; |
2462 | goto out; |
2463 | } |
2464 | |
2465 | *vidp = vid; |
2466 | NC_SMR_STATS(clp_smr_next); |
2467 | |
2468 | return vp; |
2469 | |
2470 | out: |
2471 | NC_SMR_STATS(clp_smr_next_fail); |
2472 | return NULL; |
2473 | } |
2474 | |
2475 | |
2476 | unsigned int hash_string(const char *cp, int len); |
2477 | // |
2478 | // Have to take a len argument because we may only need to |
2479 | // hash part of a componentname. |
2480 | // |
2481 | unsigned int |
2482 | hash_string(const char *cp, int len) |
2483 | { |
2484 | unsigned hash = 0; |
2485 | |
2486 | if (len) { |
2487 | while (len--) { |
2488 | hash = crc32tab[((hash >> 24) ^ (unsigned char)*cp++)] ^ hash << 8; |
2489 | } |
2490 | } else { |
2491 | while (*cp != '\0') { |
2492 | hash = crc32tab[((hash >> 24) ^ (unsigned char)*cp++)] ^ hash << 8; |
2493 | } |
2494 | } |
2495 | /* |
2496 | * the crc generator can legitimately generate |
2497 | * a 0... however, 0 for us means that we |
2498 | * haven't computed a hash, so use 1 instead |
2499 | */ |
2500 | if (hash == 0) { |
2501 | hash = 1; |
2502 | } |
2503 | return hash; |
2504 | } |
2505 | |
2506 | |
2507 | /* |
2508 | * Lookup an entry in the cache |
2509 | * |
2510 | * We don't do this if the segment name is long, simply so the cache |
2511 | * can avoid holding long names (which would either waste space, or |
2512 | * add greatly to the complexity). |
2513 | * |
2514 | * Lookup is called with dvp pointing to the directory to search, |
2515 | * cnp pointing to the name of the entry being sought. If the lookup |
2516 | * succeeds, the vnode is returned in *vpp, and a status of -1 is |
2517 | * returned. If the lookup determines that the name does not exist |
* (negative caching), a status of ENOENT is returned. If the lookup
2519 | * fails, a status of zero is returned. |
2520 | */ |
2521 | |
2522 | static int |
2523 | cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp) |
2524 | { |
2525 | struct namecache *ncp; |
2526 | long namelen = cnp->cn_namelen; |
2527 | unsigned int hashval = cnp->cn_hash; |
2528 | boolean_t have_exclusive = FALSE; |
2529 | uint32_t vid; |
2530 | vnode_t vp; |
2531 | |
2532 | NAME_CACHE_LOCK_SHARED(); |
2533 | |
2534 | relook: |
2535 | smrq_serialized_foreach(ncp, NCHHASH(dvp, cnp->cn_hash), nc_hash) { |
2536 | if ((ncp->nc_dvp == dvp) && (ncp->nc_hashval == hashval)) { |
if (strncmp(ncp->nc_name, cnp->cn_nameptr, namelen) == 0 && ncp->nc_name[namelen] == 0) {
2538 | break; |
2539 | } |
2540 | } |
2541 | } |
2542 | /* We failed to find an entry */ |
2543 | if (ncp == 0) { |
2544 | NCHSTAT(ncs_miss); |
2545 | NAME_CACHE_UNLOCK(); |
2546 | return 0; |
2547 | } |
2548 | |
2549 | /* We don't want to have an entry, so dump it */ |
2550 | if ((cnp->cn_flags & MAKEENTRY) == 0) { |
2551 | if (have_exclusive == TRUE) { |
2552 | NCHSTAT(ncs_badhits); |
2553 | cache_delete(ncp, 1); |
2554 | NAME_CACHE_UNLOCK(); |
2555 | return 0; |
2556 | } |
2557 | if (!NAME_CACHE_LOCK_SHARED_TO_EXCLUSIVE()) { |
2558 | NAME_CACHE_LOCK(); |
2559 | } |
2560 | have_exclusive = TRUE; |
2561 | goto relook; |
2562 | } |
2563 | vp = ncp->nc_vp; |
2564 | |
2565 | /* We found a "positive" match, return the vnode */ |
2566 | if (vp) { |
2567 | NCHSTAT(ncs_goodhits); |
2568 | |
2569 | vid = ncp->nc_vid; |
2570 | vnode_hold(vp); |
2571 | NAME_CACHE_UNLOCK(); |
2572 | |
2573 | if (vnode_getwithvid(vp, vid)) { |
2574 | vnode_drop(vp); |
2575 | #if COLLECT_STATS |
2576 | NAME_CACHE_LOCK(); |
2577 | NCHSTAT(ncs_badvid); |
2578 | NAME_CACHE_UNLOCK(); |
2579 | #endif |
2580 | return 0; |
2581 | } |
2582 | vnode_drop(vp); |
2583 | *vpp = vp; |
2584 | NC_SMR_STATS(cl_lock_hits); |
2585 | return -1; |
2586 | } |
2587 | |
2588 | /* We found a negative match, and want to create it, so purge */ |
2589 | if (cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) { |
2590 | if (have_exclusive == TRUE) { |
2591 | NCHSTAT(ncs_badhits); |
2592 | cache_delete(ncp, 1); |
2593 | NAME_CACHE_UNLOCK(); |
2594 | return 0; |
2595 | } |
2596 | if (!NAME_CACHE_LOCK_SHARED_TO_EXCLUSIVE()) { |
2597 | NAME_CACHE_LOCK(); |
2598 | } |
2599 | have_exclusive = TRUE; |
2600 | goto relook; |
2601 | } |
2602 | |
2603 | /* |
2604 | * We found a "negative" match, ENOENT notifies client of this match. |
2605 | */ |
2606 | NCHSTAT(ncs_neghits); |
2607 | |
2608 | NAME_CACHE_UNLOCK(); |
2609 | return ENOENT; |
2610 | } |
2611 | |
2612 | |
2613 | |
2614 | /* |
2615 | * Lookup an entry in the cache |
2616 | * |
2617 | * Lookup is called with dvp pointing to the directory to search, |
2618 | * cnp pointing to the name of the entry being sought. If the lookup |
2619 | * succeeds, the vnode is returned in *vpp, and a status of -1 is |
2620 | * returned. If the lookup determines that the name does not exist |
* (negative caching), a status of ENOENT is returned. If the lookup
2622 | * fails, a status of zero is returned. |
2623 | */ |
2624 | int |
2625 | cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp) |
2626 | { |
2627 | struct namecache *ncp; |
2628 | long namelen = cnp->cn_namelen; |
2629 | vnode_t vp; |
2630 | uint32_t vid = 0; |
2631 | uint32_t counter = 1; |
2632 | unsigned int hashval; |
2633 | |
2634 | *vpp = NULLVP; |
2635 | |
2636 | if (cnp->cn_hash == 0) { |
cnp->cn_hash = hash_string(cnp->cn_nameptr, cnp->cn_namelen);
2638 | } |
2639 | hashval = cnp->cn_hash; |
2640 | |
2641 | if (nc_disabled) { |
2642 | return 0; |
2643 | } |
2644 | |
2645 | if (!nc_smr_enabled) { |
2646 | goto out_fallback; |
2647 | } |
2648 | |
2649 | /* We don't want to have an entry, so dump it */ |
2650 | if ((cnp->cn_flags & MAKEENTRY) == 0) { |
2651 | goto out_fallback; |
2652 | } |
2653 | |
2654 | vfs_smr_enter(); |
2655 | |
2656 | smrq_entered_foreach(ncp, NCHHASH(dvp, cnp->cn_hash), nc_hash) { |
2657 | counter = os_atomic_load(&ncp->nc_counter, acquire); |
2658 | if (!(counter & NC_VALID)) { |
2659 | vfs_smr_leave(); |
2660 | goto out_fallback; |
2661 | } |
2662 | if ((ncp->nc_dvp == dvp) && (ncp->nc_hashval == hashval)) { |
2663 | const char *nc_name = |
2664 | os_atomic_load(&ncp->nc_name, relaxed); |
2665 | if (nc_name && |
strncmp(nc_name, cnp->cn_nameptr, namelen) == 0 &&
2667 | nc_name[namelen] == 0) { |
2668 | break; |
2669 | } else if (!nc_name) { |
2670 | vfs_smr_leave(); |
2671 | goto out_fallback; |
2672 | } |
2673 | } |
2674 | } |
2675 | |
2676 | /* We failed to find an entry */ |
2677 | if (ncp == 0) { |
2678 | NCHSTAT(ncs_miss); |
2679 | vfs_smr_leave(); |
2680 | NC_SMR_STATS(cl_smr_miss); |
2681 | return 0; |
2682 | } |
2683 | |
2684 | vp = ncp->nc_vp; |
2685 | vid = ncp->nc_vid; |
2686 | |
2687 | /* |
2688 | * The validity of vp and vid depends on the value of the counter being |
2689 | * the same when we read it first in the loop and now. Anything else |
2690 | * and we can't use this vp & vid. |
2691 | * Hopefully this ncp wasn't reused 2 billion times between the time |
* we read it first and when we read the counter value again.
2693 | */ |
2694 | if (os_atomic_load(&ncp->nc_counter, acquire) != counter) { |
2695 | vfs_smr_leave(); |
2696 | goto out_fallback; |
2697 | } |
2698 | |
2699 | if (vp) { |
2700 | bool holdcount_acquired = vnode_hold_smr(vp); |
2701 | |
2702 | vfs_smr_leave(); |
2703 | |
2704 | if (!holdcount_acquired) { |
2705 | goto out_fallback; |
2706 | } |
2707 | |
2708 | if (vnode_getwithvid(vp, vid) != 0) { |
2709 | vnode_drop(vp); |
2710 | goto out_fallback; |
2711 | } |
2712 | vnode_drop(vp); |
2713 | NCHSTAT(ncs_goodhits); |
2714 | |
2715 | *vpp = vp; |
2716 | NC_SMR_STATS(cl_smr_hits); |
2717 | return -1; |
2718 | } |
2719 | |
2720 | vfs_smr_leave(); |
2721 | |
2722 | /* We found a negative match, and want to create it, so purge */ |
2723 | if (cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) { |
2724 | goto out_fallback; |
2725 | } |
2726 | |
2727 | /* |
2728 | * We found a "negative" match, ENOENT notifies client of this match. |
2729 | */ |
2730 | NCHSTAT(ncs_neghits); |
2731 | NC_SMR_STATS(cl_smr_negative_hits); |
2732 | return ENOENT; |
2733 | |
2734 | out_fallback: |
2735 | NC_SMR_STATS(cl_smr_fallback); |
2736 | return cache_lookup_fallback(dvp, vpp, cnp); |
2737 | } |
2738 | |
2739 | const char * |
2740 | cache_enter_create(vnode_t dvp, vnode_t vp, struct componentname *cnp) |
2741 | { |
2742 | const char *strname; |
2743 | |
2744 | if (cnp->cn_hash == 0) { |
cnp->cn_hash = hash_string(cnp->cn_nameptr, cnp->cn_namelen);
2746 | } |
2747 | |
2748 | /* |
2749 | * grab 2 references on the string entered |
2750 | * one for the cache_enter_locked to consume |
2751 | * and the second to be consumed by v_name (vnode_create call point) |
2752 | */ |
2753 | strname = add_name_internal(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, TRUE, 0); |
2754 | |
2755 | NAME_CACHE_LOCK(); |
2756 | |
2757 | cache_enter_locked(dvp, vp, cnp, strname); |
2758 | |
2759 | NAME_CACHE_UNLOCK(); |
2760 | |
2761 | return strname; |
2762 | } |
2763 | |
2764 | |
2765 | /* |
2766 | * Add an entry to the cache... |
2767 | * but first check to see if the directory |
2768 | * that this entry is to be associated with has |
2769 | * had any cache_purges applied since we took |
2770 | * our identity snapshot... this check needs to |
2771 | * be done behind the name cache lock |
2772 | */ |
2773 | void |
2774 | cache_enter_with_gen(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, int gen) |
2775 | { |
2776 | if (cnp->cn_hash == 0) { |
cnp->cn_hash = hash_string(cnp->cn_nameptr, cnp->cn_namelen);
2778 | } |
2779 | |
2780 | NAME_CACHE_LOCK(); |
2781 | |
2782 | if (dvp->v_nc_generation == gen) { |
2783 | (void)cache_enter_locked(dvp, vp, cnp, NULL); |
2784 | } |
2785 | |
2786 | NAME_CACHE_UNLOCK(); |
2787 | } |
2788 | |
2789 | |
2790 | /* |
2791 | * Add an entry to the cache. |
2792 | */ |
2793 | void |
2794 | cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) |
2795 | { |
2796 | const char *strname; |
2797 | |
2798 | if (cnp->cn_hash == 0) { |
cnp->cn_hash = hash_string(cnp->cn_nameptr, cnp->cn_namelen);
2800 | } |
2801 | |
2802 | /* |
2803 | * grab 1 reference on the string entered |
2804 | * for the cache_enter_locked to consume |
2805 | */ |
2806 | strname = add_name_internal(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, FALSE, 0); |
2807 | |
2808 | NAME_CACHE_LOCK(); |
2809 | |
2810 | cache_enter_locked(dvp, vp, cnp, strname); |
2811 | |
2812 | NAME_CACHE_UNLOCK(); |
2813 | } |
2814 | |
2815 | |
2816 | static void |
2817 | cache_enter_locked(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, const char *strname) |
2818 | { |
2819 | struct namecache *ncp, *negp; |
2820 | struct smrq_list_head *ncpp; |
2821 | |
2822 | if (nc_disabled) { |
2823 | return; |
2824 | } |
2825 | |
2826 | /* |
2827 | * if the entry is for -ve caching vp is null |
2828 | */ |
2829 | if ((vp != NULLVP) && (LIST_FIRST(&vp->v_nclinks))) { |
2830 | /* |
2831 | * someone beat us to the punch.. |
2832 | * this vnode is already in the cache |
2833 | */ |
2834 | if (strname != NULL) { |
vfs_removename(strname);
2836 | } |
2837 | return; |
2838 | } |
2839 | /* |
2840 | * We allocate a new entry if we are less than the maximum |
2841 | * allowed and the one at the front of the list is in use. |
2842 | * Otherwise we use the one at the front of the list. |
2843 | */ |
2844 | if (numcache < desiredNodes && |
2845 | ((ncp = nchead.tqh_first) == NULL || |
2846 | (ncp->nc_counter & NC_VALID))) { |
2847 | /* |
2848 | * Allocate one more entry |
2849 | */ |
2850 | if (nc_smr_enabled) { |
2851 | ncp = zalloc_smr(namecache_zone, Z_WAITOK_ZERO_NOFAIL); |
2852 | } else { |
ncp = zalloc(namecache_zone);
2854 | } |
2855 | ncp->nc_counter = 0; |
2856 | numcache++; |
2857 | } else { |
2858 | /* |
2859 | * reuse an old entry |
2860 | */ |
2861 | ncp = TAILQ_FIRST(&nchead); |
2862 | TAILQ_REMOVE(&nchead, ncp, nc_entry); |
2863 | |
2864 | if (ncp->nc_counter & NC_VALID) { |
2865 | /* |
2866 | * still in use... we need to |
2867 | * delete it before re-using it |
2868 | */ |
2869 | NCHSTAT(ncs_stolen); |
2870 | cache_delete(ncp, 0); |
2871 | } |
2872 | } |
2873 | NCHSTAT(ncs_enters); |
2874 | |
2875 | /* |
* Fill in cache info. If vp is NULL, this is a "negative" cache entry.
2877 | */ |
2878 | if (vp) { |
2879 | ncp->nc_vid = vnode_vid(vp); |
2880 | vnode_hold(vp); |
2881 | } |
2882 | ncp->nc_vp = vp; |
2883 | ncp->nc_dvp = dvp; |
2884 | ncp->nc_hashval = cnp->cn_hash; |
2885 | |
2886 | if (strname == NULL) { |
2887 | ncp->nc_name = add_name_internal(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, FALSE, 0); |
2888 | } else { |
2889 | ncp->nc_name = strname; |
2890 | } |
2891 | |
2892 | // |
2893 | // If the bytes of the name associated with the vnode differ, |
2894 | // use the name associated with the vnode since the file system |
2895 | // may have set that explicitly in the case of a lookup on a |
2896 | // case-insensitive file system where the case of the looked up |
2897 | // name differs from what is on disk. For more details, see: |
2898 | // <rdar://problem/8044697> FSEvents doesn't always decompose diacritical unicode chars in the paths of the changed directories |
2899 | // |
2900 | const char *vn_name = vp ? vp->v_name : NULL; |
unsigned int len = vn_name ? (unsigned int)strlen(vn_name) : 0;
if (vn_name && ncp && ncp->nc_name && strncmp(ncp->nc_name, vn_name, len) != 0) {
unsigned int hash = hash_string(vn_name, len);

vfs_removename(ncp->nc_name);
2906 | ncp->nc_name = add_name_internal(vn_name, len, hash, FALSE, 0); |
2907 | ncp->nc_hashval = hash; |
2908 | } |
2909 | |
2910 | /* |
2911 | * make us the newest entry in the cache |
2912 | * i.e. we'll be the last to be stolen |
2913 | */ |
2914 | TAILQ_INSERT_TAIL(&nchead, ncp, nc_entry); |
2915 | |
2916 | ncpp = NCHHASH(dvp, cnp->cn_hash); |
2917 | #if DIAGNOSTIC |
2918 | { |
2919 | struct namecache *p; |
2920 | |
2921 | smrq_serialized_foreach(p, ncpp, nc_hash) { |
2922 | if (p == ncp) { |
2923 | panic("cache_enter: duplicate" ); |
2924 | } |
2925 | } |
2926 | } |
2927 | #endif |
2928 | /* |
2929 | * make us available to be found via lookup |
2930 | */ |
2931 | smrq_serialized_insert_head(ncpp, &ncp->nc_hash); |
2932 | |
2933 | if (vp) { |
2934 | /* |
2935 | * add to the list of name cache entries |
2936 | * that point at vp |
2937 | */ |
2938 | LIST_INSERT_HEAD(&vp->v_nclinks, ncp, nc_un.nc_link); |
2939 | } else { |
2940 | /* |
2941 | * this is a negative cache entry (vp == NULL) |
2942 | * stick it on the negative cache list. |
2943 | */ |
2944 | TAILQ_INSERT_TAIL(&neghead, ncp, nc_un.nc_negentry); |
2945 | |
2946 | ncs_negtotal++; |
2947 | |
2948 | if (ncs_negtotal > desiredNegNodes) { |
2949 | /* |
2950 | * if we've reached our desired limit |
2951 | * of negative cache entries, delete |
2952 | * the oldest |
2953 | */ |
2954 | negp = TAILQ_FIRST(&neghead); |
2955 | cache_delete(negp, 1); |
2956 | } |
2957 | } |
2958 | |
2959 | /* |
2960 | * add us to the list of name cache entries that |
2961 | * are children of dvp |
2962 | */ |
2963 | if (vp) { |
2964 | TAILQ_INSERT_TAIL(&dvp->v_ncchildren, ncp, nc_child); |
2965 | } else { |
2966 | TAILQ_INSERT_HEAD(&dvp->v_ncchildren, ncp, nc_child); |
2967 | } |
2968 | |
2969 | /* |
2970 | * nc_counter represents a sequence counter and 1 bit valid flag. |
2971 | * When the counter value is odd, it represents a valid and in use |
2972 | * namecache structure. We increment the value on every state transition |
* (invalid to valid here, and valid to invalid in cache_delete()).
2974 | * Lockless readers have to read the value before reading other fields |
2975 | * and ensure that the field is valid and remains the same after the fields |
2976 | * have been read. |
2977 | */ |
2978 | uint32_t old_count = os_atomic_inc_orig(&ncp->nc_counter, release); |
2979 | if (old_count & NC_VALID) { |
/* This should be an invalid to valid transition */
panic("Incorrect state for old nc_counter(%d), should be even", old_count);
2982 | } |
2983 | } |
2984 | |
2985 | |
2986 | /* |
2987 | * Initialize CRC-32 remainder table. |
2988 | */ |
2989 | static void |
2990 | init_crc32(void) |
2991 | { |
2992 | /* |
2993 | * the CRC-32 generator polynomial is: |
* x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10
2995 | * + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1 |
2996 | */ |
2997 | unsigned int crc32_polynomial = 0x04c11db7; |
2998 | unsigned int i, j; |
2999 | |
3000 | /* |
3001 | * pre-calculate the CRC-32 remainder for each possible octet encoding |
3002 | */ |
3003 | for (i = 0; i < 256; i++) { |
3004 | unsigned int crc_rem = i << 24; |
3005 | |
3006 | for (j = 0; j < 8; j++) { |
3007 | if (crc_rem & 0x80000000) { |
3008 | crc_rem = (crc_rem << 1) ^ crc32_polynomial; |
3009 | } else { |
3010 | crc_rem = (crc_rem << 1); |
3011 | } |
3012 | } |
3013 | crc32tab[i] = crc_rem; |
3014 | } |
3015 | } |
3016 | |
3017 | |
3018 | /* |
3019 | * Name cache initialization, from vfs_init() when we are booting |
3020 | */ |
3021 | void |
3022 | nchinit(void) |
3023 | { |
3024 | desiredNegNodes = (desiredvnodes / 10); |
3025 | desiredNodes = desiredvnodes + desiredNegNodes; |
3026 | |
3027 | if (nc_smr_enabled) { |
zone_enable_smr(namecache_zone, VFS_SMR(), &namecache_smr_free);
zone_enable_smr(stringcache_zone, VFS_SMR(), &string_smr_free);
3030 | } |
3031 | TAILQ_INIT(&nchead); |
3032 | TAILQ_INIT(&neghead); |
3033 | |
3034 | init_crc32(); |
3035 | |
nchashtbl = hashinit(MAX(CONFIG_NC_HASH, (2 * desiredNodes)), M_CACHE, &nchash);
3037 | nchashmask = nchash; |
3038 | nchash++; |
3039 | |
3040 | init_string_table(); |
3041 | |
3042 | for (int i = 0; i < NUM_STRCACHE_LOCKS; i++) { |
lck_mtx_init(&strcache_mtx_locks[i], &strcache_lck_grp, &strcache_lck_attr);
3044 | } |
3045 | } |
3046 | |
3047 | void |
3048 | name_cache_lock_shared(void) |
3049 | { |
lck_rw_lock_shared(&namecache_rw_lock);
3051 | NC_SMR_STATS(nc_lock_shared); |
3052 | } |
3053 | |
3054 | void |
3055 | name_cache_lock(void) |
3056 | { |
lck_rw_lock_exclusive(&namecache_rw_lock);
3058 | NC_SMR_STATS(nc_lock); |
3059 | } |
3060 | |
3061 | boolean_t |
3062 | name_cache_lock_shared_to_exclusive(void) |
3063 | { |
return lck_rw_lock_shared_to_exclusive(&namecache_rw_lock);
3065 | } |
3066 | |
3067 | void |
3068 | name_cache_unlock(void) |
3069 | { |
lck_rw_done(&namecache_rw_lock);
3071 | } |
3072 | |
3073 | |
3074 | int |
3075 | resize_namecache(int newsize) |
3076 | { |
3077 | struct smrq_list_head *new_table; |
3078 | struct smrq_list_head *old_table; |
3079 | struct smrq_list_head *old_head; |
3080 | struct namecache *entry; |
3081 | uint32_t i, hashval; |
3082 | int dNodes, dNegNodes, nelements; |
3083 | u_long new_size, old_size; |
3084 | |
3085 | if (newsize < 0) { |
3086 | return EINVAL; |
3087 | } |
3088 | |
3089 | dNegNodes = (newsize / 10); |
3090 | dNodes = newsize + dNegNodes; |
3091 | // we don't support shrinking yet |
3092 | if (dNodes <= desiredNodes) { |
3093 | return 0; |
3094 | } |
3095 | |
3096 | if (os_mul_overflow(dNodes, 2, &nelements)) { |
3097 | return EINVAL; |
3098 | } |
3099 | |
new_table = hashinit(nelements, M_CACHE, &nchashmask);
3101 | new_size = nchashmask + 1; |
3102 | |
3103 | if (new_table == NULL) { |
3104 | return ENOMEM; |
3105 | } |
3106 | |
3107 | NAME_CACHE_LOCK(); |
3108 | |
3109 | /* No need to switch if the hash table size hasn't changed. */ |
3110 | if (new_size == nchash) { |
3111 | NAME_CACHE_UNLOCK(); |
hashdestroy(new_table, M_CACHE, new_size - 1);
3113 | return 0; |
3114 | } |
3115 | |
3116 | // do the switch! |
3117 | old_table = nchashtbl; |
3118 | nchashtbl = new_table; |
3119 | old_size = nchash; |
3120 | nchash = new_size; |
3121 | |
3122 | // walk the old table and insert all the entries into |
3123 | // the new table |
3124 | // |
3125 | for (i = 0; i < old_size; i++) { |
3126 | old_head = &old_table[i]; |
3127 | smrq_serialized_foreach_safe(entry, old_head, nc_hash) { |
3128 | // |
3129 | // XXXdbg - Beware: this assumes that hash_string() does |
3130 | // the same thing as what happens in |
3131 | // lookup() over in vfs_lookup.c |
hashval = hash_string(entry->nc_name, 0);
3133 | entry->nc_hashval = hashval; |
3134 | |
3135 | smrq_serialized_insert_head(NCHHASH(entry->nc_dvp, hashval), &entry->nc_hash); |
3136 | } |
3137 | } |
3138 | desiredNodes = dNodes; |
3139 | desiredNegNodes = dNegNodes; |
3140 | |
3141 | NAME_CACHE_UNLOCK(); |
hashdestroy(old_table, M_CACHE, old_size - 1);
3143 | |
3144 | return 0; |
3145 | } |
3146 | |
3147 | static void |
3148 | namecache_smr_free(void *_ncp, __unused size_t _size) |
3149 | { |
3150 | struct namecache *ncp = _ncp; |
3151 | |
bzero(ncp, sizeof(*ncp));
3153 | } |
3154 | |
3155 | static void |
3156 | cache_delete(struct namecache *ncp, int free_entry) |
3157 | { |
3158 | NCHSTAT(ncs_deletes); |
3159 | |
3160 | /* |
* See the comment at the end of cache_enter_locked explaining the usage of
3162 | * nc_counter. |
3163 | */ |
3164 | uint32_t old_count = os_atomic_inc_orig(&ncp->nc_counter, release); |
3165 | if (!(old_count & NC_VALID)) { |
3166 | /* This should be a valid to invalid transition */ |
3167 | panic("Incorrect state for old nc_counter(%d), should be odd" , old_count); |
3168 | } |
3169 | |
3170 | if (ncp->nc_vp) { |
3171 | LIST_REMOVE(ncp, nc_un.nc_link); |
3172 | } else { |
3173 | TAILQ_REMOVE(&neghead, ncp, nc_un.nc_negentry); |
3174 | ncs_negtotal--; |
3175 | } |
3176 | TAILQ_REMOVE(&(ncp->nc_dvp->v_ncchildren), ncp, nc_child); |
3177 | |
3178 | smrq_serialized_remove((NCHHASH(ncp->nc_dvp, ncp->nc_hashval)), &ncp->nc_hash); |
3179 | |
3180 | const char *nc_name = ncp->nc_name; |
3181 | ncp->nc_name = NULL; |
vfs_removename(nc_name);
3183 | if (ncp->nc_vp) { |
3184 | vnode_t vp = ncp->nc_vp; |
3185 | |
3186 | ncp->nc_vp = NULLVP; |
3187 | vnode_drop(vp); |
3188 | } |
3189 | |
3190 | if (free_entry) { |
3191 | TAILQ_REMOVE(&nchead, ncp, nc_entry); |
3192 | if (nc_smr_enabled) { |
3193 | zfree_smr(namecache_zone, ncp); |
3194 | } else { |
3195 | zfree(namecache_zone, ncp); |
3196 | } |
3197 | numcache--; |
3198 | } |
3199 | } |
3200 | |
3201 | |
3202 | /* |
3203 | * purge the entry associated with the |
3204 | * specified vnode from the name cache |
3205 | */ |
3206 | static void |
3207 | cache_purge_locked(vnode_t vp, kauth_cred_t *credp) |
3208 | { |
3209 | struct namecache *ncp; |
3210 | |
3211 | *credp = NULL; |
3212 | if ((LIST_FIRST(&vp->v_nclinks) == NULL) && |
3213 | (TAILQ_FIRST(&vp->v_ncchildren) == NULL) && |
3214 | (vnode_cred(vp) == NOCRED) && |
3215 | (vp->v_parent == NULLVP)) { |
3216 | return; |
3217 | } |
3218 | |
3219 | if (vp->v_parent) { |
3220 | vp->v_parent->v_nc_generation++; |
3221 | } |
3222 | |
3223 | while ((ncp = LIST_FIRST(&vp->v_nclinks))) { |
cache_delete(ncp, 1);
3225 | } |
3226 | |
3227 | while ((ncp = TAILQ_FIRST(&vp->v_ncchildren))) { |
cache_delete(ncp, 1);
3229 | } |
3230 | |
3231 | /* |
3232 | * Use a temp variable to avoid kauth_cred_unref() while NAME_CACHE_LOCK is held |
3233 | */ |
3234 | *credp = vnode_cred(vp); |
3235 | vp->v_cred = NOCRED; |
3236 | vp->v_authorized_actions = 0; |
3237 | } |
3238 | |
3239 | void |
3240 | cache_purge(vnode_t vp) |
3241 | { |
3242 | kauth_cred_t tcred = NULL; |
3243 | |
3244 | if ((LIST_FIRST(&vp->v_nclinks) == NULL) && |
3245 | (TAILQ_FIRST(&vp->v_ncchildren) == NULL) && |
3246 | (vnode_cred(vp) == NOCRED) && |
3247 | (vp->v_parent == NULLVP)) { |
3248 | return; |
3249 | } |
3250 | |
3251 | NAME_CACHE_LOCK(); |
3252 | |
cache_purge_locked(vp, &tcred);
3254 | |
3255 | NAME_CACHE_UNLOCK(); |
3256 | |
3257 | if (IS_VALID_CRED(tcred)) { |
3258 | kauth_cred_unref(&tcred); |
3259 | } |
3260 | } |
3261 | |
3262 | /* |
3263 | * Purge all negative cache entries that are children of the |
3264 | * given vnode. A case-insensitive file system (or any file |
3265 | * system that has multiple equivalent names for the same |
3266 | * directory entry) can use this when creating or renaming |
3267 | * to remove negative entries that may no longer apply. |
3268 | */ |
3269 | void |
3270 | cache_purge_negatives(vnode_t vp) |
3271 | { |
3272 | struct namecache *ncp, *next_ncp; |
3273 | |
3274 | NAME_CACHE_LOCK(); |
3275 | |
3276 | TAILQ_FOREACH_SAFE(ncp, &vp->v_ncchildren, nc_child, next_ncp) { |
3277 | if (ncp->nc_vp) { |
3278 | break; |
3279 | } |
3280 | |
cache_delete(ncp, 1);
3282 | } |
3283 | |
3284 | NAME_CACHE_UNLOCK(); |
3285 | } |
3286 | |
3287 | /* |
3288 | * Flush all entries referencing a particular filesystem. |
3289 | * |
3290 | * Since we need to check it anyway, we will flush all the invalid |
3291 | * entries at the same time. |
3292 | */ |
3293 | void |
3294 | cache_purgevfs(struct mount *mp) |
3295 | { |
3296 | struct smrq_list_head *ncpp; |
3297 | struct namecache *ncp; |
3298 | |
3299 | NAME_CACHE_LOCK(); |
3300 | /* Scan hash tables for applicable entries */ |
3301 | for (ncpp = &nchashtbl[nchash - 1]; ncpp >= nchashtbl; ncpp--) { |
3302 | restart: |
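/*
 * cache_delete() unlinks the entry from this hash chain,
 * invalidating the iterator, so restart the scan of this
 * bucket after every deletion.
 */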
3303 | smrq_serialized_foreach(ncp, ncpp, nc_hash) { |
3304 | if (ncp->nc_dvp->v_mount == mp) { |
cache_delete(ncp, 0);
3306 | goto restart; |
3307 | } |
3308 | } |
3309 | } |
3310 | NAME_CACHE_UNLOCK(); |
3311 | } |
3312 | |
3313 | |
3314 | |
3315 | // |
3316 | // String ref routines |
3317 | // |
3318 | static LIST_HEAD(stringhead, string_t) * string_ref_table; |
3319 | static u_long string_table_mask; |
3320 | static uint32_t filled_buckets = 0; |
3321 | |
3322 | |
3323 | |
3324 | |
3325 | static void |
3326 | resize_string_ref_table(void) |
3327 | { |
3328 | struct stringhead *new_table; |
3329 | struct stringhead *old_table; |
3330 | struct stringhead *old_head, *head; |
3331 | string_t *entry, *next; |
3332 | uint32_t i, hashval; |
3333 | u_long new_mask, old_mask; |
3334 | |
3335 | /* |
3336 | * need to hold the table lock exclusively |
3337 | * in order to grow the table... need to recheck |
3338 | * the need to resize again after we've taken |
3339 | * the lock exclusively in case some other thread |
3340 | * beat us to the punch |
3341 | */ |
lck_rw_lock_exclusive(&strtable_rw_lock);
3343 | |
3344 | if (4 * filled_buckets < ((string_table_mask + 1) * 3)) { |
lck_rw_done(&strtable_rw_lock);
3346 | return; |
3347 | } |
3348 | assert(string_table_mask < INT32_MAX); |
new_table = hashinit((int)(string_table_mask + 1) * 2, M_CACHE, &new_mask);
3350 | |
3351 | if (new_table == NULL) { |
3352 | printf("failed to resize the hash table.\n" ); |
3353 | lck_rw_done(lck: &strtable_rw_lock); |
3354 | return; |
3355 | } |
3356 | |
3357 | // do the switch! |
3358 | old_table = string_ref_table; |
3359 | string_ref_table = new_table; |
3360 | old_mask = string_table_mask; |
3361 | string_table_mask = new_mask; |
3362 | filled_buckets = 0; |
3363 | |
3364 | // walk the old table and insert all the entries into |
3365 | // the new table |
3366 | // |
3367 | for (i = 0; i <= old_mask; i++) { |
3368 | old_head = &old_table[i]; |
3369 | for (entry = old_head->lh_first; entry != NULL; entry = next) { |
hashval = hash_string((const char *)entry->str, 0);
3371 | head = &string_ref_table[hashval & string_table_mask]; |
3372 | if (head->lh_first == NULL) { |
3373 | filled_buckets++; |
3374 | } |
3375 | next = entry->hash_chain.le_next; |
3376 | LIST_INSERT_HEAD(head, entry, hash_chain); |
3377 | } |
3378 | } |
3379 | lck_rw_done(lck: &strtable_rw_lock); |
3380 | |
hashdestroy(old_table, M_CACHE, old_mask);
3382 | } |
3383 | |
3384 | |
3385 | static void |
3386 | init_string_table(void) |
3387 | { |
string_ref_table = hashinit(CONFIG_VFS_NAMES, M_CACHE, &string_table_mask);
3389 | } |
3390 | |
3391 | |
3392 | const char * |
3393 | vfs_addname(const char *name, uint32_t len, u_int hashval, u_int flags) |
3394 | { |
3395 | return add_name_internal(name, len, hashval, FALSE, flags); |
3396 | } |
3397 | |
3398 | |
3399 | static const char * |
add_name_internal(const char *name, uint32_t len, u_int hashval, boolean_t need_extra_ref, __unused u_int flags)
3401 | { |
3402 | struct stringhead *head; |
3403 | string_t *entry; |
3404 | uint32_t chain_len = 0; |
3405 | uint32_t hash_index; |
3406 | uint32_t lock_index; |
3407 | char *ptr; |
3408 | |
3409 | if (len > MAXPATHLEN) { |
3410 | len = MAXPATHLEN; |
3411 | } |
3412 | |
3413 | /* |
3414 | * if the length already accounts for the null-byte, then |
3415 | * subtract one so later on we don't index past the end |
3416 | * of the string. |
3417 | */ |
3418 | if (len > 0 && name[len - 1] == '\0') { |
3419 | len--; |
3420 | } |
3421 | if (hashval == 0) { |
hashval = hash_string(name, len);
3423 | } |
3424 | |
3425 | /* |
3426 | * take this lock 'shared' to keep the hash stable |
3427 | * if someone else decides to grow the pool they |
3428 | * will take this lock exclusively |
3429 | */ |
lck_rw_lock_shared(&strtable_rw_lock);
3431 | |
3432 | /* |
3433 | * If the table gets more than 3/4 full, resize it |
3434 | */ |
3435 | if (4 * filled_buckets >= ((string_table_mask + 1) * 3)) { |
lck_rw_done(&strtable_rw_lock);
3437 | |
3438 | resize_string_ref_table(); |
3439 | |
lck_rw_lock_shared(&strtable_rw_lock);
3441 | } |
3442 | hash_index = hashval & string_table_mask; |
3443 | lock_index = hash_index % NUM_STRCACHE_LOCKS; |
3444 | |
3445 | head = &string_ref_table[hash_index]; |
3446 | |
lck_mtx_lock_spin(&strcache_mtx_locks[lock_index]);
3448 | |
3449 | for (entry = head->lh_first; entry != NULL; chain_len++, entry = entry->hash_chain.le_next) { |
if (strncmp(entry->str, name, len) == 0 && entry->str[len] == 0) {
3451 | entry->refcount++; |
3452 | break; |
3453 | } |
3454 | } |
3455 | if (entry == NULL) { |
3456 | const uint32_t buflen = len + 1; |
3457 | |
lck_mtx_convert_spin(&strcache_mtx_locks[lock_index]);
3459 | /* |
3460 | * it wasn't already there so add it. |
3461 | */ |
3462 | if (nc_smr_enabled) { |
3463 | entry = zalloc_smr(stringcache_zone, Z_WAITOK_ZERO_NOFAIL); |
3464 | } else { |
entry = zalloc(stringcache_zone);
3466 | } |
3467 | |
3468 | if (head->lh_first == NULL) { |
3469 | OSAddAtomic(1, &filled_buckets); |
3470 | } |
3471 | ptr = kalloc_data(buflen, Z_WAITOK); |
3472 | strncpy(ptr, name, len); |
3473 | ptr[len] = '\0'; |
3474 | entry->str = ptr; |
3475 | entry->strbuflen = buflen; |
3476 | entry->refcount = 1; |
3477 | LIST_INSERT_HEAD(head, entry, hash_chain); |
3478 | } |
3479 | if (need_extra_ref == TRUE) { |
3480 | entry->refcount++; |
3481 | } |
3482 | |
lck_mtx_unlock(&strcache_mtx_locks[lock_index]);
lck_rw_done(&strtable_rw_lock);
3485 | |
3486 | return (const char *)entry->str; |
3487 | } |
3488 | |
3489 | static void |
3490 | string_smr_free(void *_entry, __unused size_t size) |
3491 | { |
3492 | string_t *entry = _entry; |
3493 | |
3494 | kfree_data(entry->str, entry->strbuflen); |
bzero(entry, sizeof(*entry));
3496 | } |
3497 | |
3498 | int |
3499 | vfs_removename(const char *nameref) |
3500 | { |
3501 | struct stringhead *head; |
3502 | string_t *entry; |
3503 | uint32_t hashval; |
3504 | uint32_t hash_index; |
3505 | uint32_t lock_index; |
3506 | int retval = ENOENT; |
3507 | |
hashval = hash_string(nameref, 0);
3509 | |
3510 | /* |
3511 | * take this lock 'shared' to keep the hash stable |
3512 | * if someone else decides to grow the pool they |
3513 | * will take this lock exclusively |
3514 | */ |
lck_rw_lock_shared(&strtable_rw_lock);
3516 | /* |
3517 | * must compute the head behind the table lock |
3518 | * since the size and location of the table |
3519 | * can change on the fly |
3520 | */ |
3521 | hash_index = hashval & string_table_mask; |
3522 | lock_index = hash_index % NUM_STRCACHE_LOCKS; |
3523 | |
3524 | head = &string_ref_table[hash_index]; |
3525 | |
lck_mtx_lock_spin(&strcache_mtx_locks[lock_index]);
3527 | |
3528 | for (entry = head->lh_first; entry != NULL; entry = entry->hash_chain.le_next) { |
3529 | if (entry->str == nameref) { |
3530 | entry->refcount--; |
3531 | |
3532 | if (entry->refcount == 0) { |
3533 | LIST_REMOVE(entry, hash_chain); |
3534 | |
3535 | if (head->lh_first == NULL) { |
3536 | OSAddAtomic(-1, &filled_buckets); |
3537 | } |
3538 | } else { |
3539 | entry = NULL; |
3540 | } |
3541 | retval = 0; |
3542 | break; |
3543 | } |
3544 | } |
lck_mtx_unlock(&strcache_mtx_locks[lock_index]);
lck_rw_done(&strtable_rw_lock);
3547 | |
3548 | if (entry) { |
3549 | assert(entry->refcount == 0); |
3550 | if (nc_smr_enabled) { |
3551 | zfree_smr(stringcache_zone, entry); |
3552 | } else { |
3553 | kfree_data(entry->str, entry->strbuflen); |
3554 | entry->str = NULL; |
3555 | entry->strbuflen = 0; |
3556 | zfree(stringcache_zone, entry); |
3557 | } |
3558 | } |
3559 | |
3560 | return retval; |
3561 | } |
3562 | |
3563 | |
3564 | #ifdef DUMP_STRING_TABLE |
3565 | void |
3566 | dump_string_table(void) |
3567 | { |
3568 | struct stringhead *head; |
3569 | string_t *entry; |
3570 | u_long i; |
3571 | |
3572 | lck_rw_lock_shared(&strtable_rw_lock); |
3573 | |
3574 | for (i = 0; i <= string_table_mask; i++) { |
3575 | head = &string_ref_table[i]; |
3576 | for (entry = head->lh_first; entry != NULL; entry = entry->hash_chain.le_next) { |
3577 | printf("%6d - %s\n" , entry->refcount, entry->str); |
3578 | } |
3579 | } |
3580 | lck_rw_done(&strtable_rw_lock); |
3581 | } |
3582 | #endif /* DUMP_STRING_TABLE */ |
3583 | |