1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1989, 1993, 1995
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Poul-Henning Kamp of the FreeBSD Project.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
35 #include "opt_ddb.h"
36 #include "opt_ktrace.h"
37
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/capsicum.h>
41 #include <sys/counter.h>
42 #include <sys/filedesc.h>
43 #include <sys/fnv_hash.h>
44 #include <sys/kernel.h>
45 #include <sys/ktr.h>
46 #include <sys/lock.h>
47 #include <sys/malloc.h>
48 #include <sys/fcntl.h>
49 #include <sys/jail.h>
50 #include <sys/mount.h>
51 #include <sys/namei.h>
52 #include <sys/proc.h>
53 #include <sys/seqc.h>
54 #include <sys/sdt.h>
55 #include <sys/smr.h>
56 #include <sys/smp.h>
57 #include <sys/syscallsubr.h>
58 #include <sys/sysctl.h>
59 #include <sys/sysproto.h>
60 #include <sys/vnode.h>
61 #include <ck_queue.h>
62 #ifdef KTRACE
63 #include <sys/ktrace.h>
64 #endif
65 #ifdef INVARIANTS
66 #include <machine/_inttypes.h>
67 #endif
68
69 #include <security/audit/audit.h>
70 #include <security/mac/mac_framework.h>
71
72 #ifdef DDB
73 #include <ddb/ddb.h>
74 #endif
75
76 #include <vm/uma.h>
77
78 /*
79 * High level overview of name caching in the VFS layer.
80 *
81 * Originally caching was implemented as part of UFS, later extracted to allow
82 * use by other filesystems. A decision was made to make it optional and
83 * completely detached from the rest of the kernel, which comes with limitations
84 * outlined near the end of this comment block.
85 *
86 * This fundamental choice needs to be revisited. In the meantime, the current
87 * state is described below. Significance of all notable routines is explained
 * in comments placed above their implementation. Scattered throughout the
89 * file are TODO comments indicating shortcomings which can be fixed without
90 * reworking everything (most of the fixes will likely be reusable). Various
91 * details are omitted from this explanation to not clutter the overview, they
92 * have to be checked by reading the code and associated commentary.
93 *
94 * Keep in mind that it's individual path components which are cached, not full
95 * paths. That is, for a fully cached path "foo/bar/baz" there are 3 entries,
96 * one for each name.
97 *
98 * I. Data organization
99 *
100 * Entries are described by "struct namecache" objects and stored in a hash
101 * table. See cache_get_hash for more information.
102 *
103 * "struct vnode" contains pointers to source entries (names which can be found
 * when traversing through said vnode), destination entries (names of that
 * vnode; see "Limitations" for a breakdown on the subject) and a pointer to
106 * the parent vnode.
107 *
108 * The (directory vnode; name) tuple reliably determines the target entry if
109 * it exists.
110 *
111 * Since there are no small locks at this time (all are 32 bytes in size on
112 * LP64), the code works around the problem by introducing lock arrays to
113 * protect hash buckets and vnode lists.
114 *
115 * II. Filesystem integration
116 *
117 * Filesystems participating in name caching do the following:
118 * - set vop_lookup routine to vfs_cache_lookup
119 * - set vop_cachedlookup to whatever can perform the lookup if the above fails
120 * - if they support lockless lookup (see below), vop_fplookup_vexec and
121 * vop_fplookup_symlink are set along with the MNTK_FPLOOKUP flag on the
122 * mount point
123 * - call cache_purge or cache_vop_* routines to eliminate stale entries as
124 * applicable
125 * - call cache_enter to add entries depending on the MAKEENTRY flag
126 *
127 * With the above in mind, there are 2 entry points when doing lookups:
128 * - ... -> namei -> cache_fplookup -- this is the default
129 * - ... -> VOP_LOOKUP -> vfs_cache_lookup -- normally only called by namei
130 * should the above fail
131 *
132 * Example code flow how an entry is added:
133 * ... -> namei -> cache_fplookup -> cache_fplookup_noentry -> VOP_LOOKUP ->
134 * vfs_cache_lookup -> VOP_CACHEDLOOKUP -> ufs_lookup_ino -> cache_enter
135 *
136 * III. Performance considerations
137 *
138 * For lockless case forward lookup avoids any writes to shared areas apart
139 * from the terminal path component. In other words non-modifying lookups of
140 * different files don't suffer any scalability problems in the namecache.
141 * Looking up the same file is limited by VFS and goes beyond the scope of this
142 * file.
143 *
144 * At least on amd64 the single-threaded bottleneck for long paths is hashing
145 * (see cache_get_hash). There are cases where the code issues acquire fence
146 * multiple times, they can be combined on architectures which suffer from it.
147 *
148 * For locked case each encountered vnode has to be referenced and locked in
149 * order to be handed out to the caller (normally that's namei). This
150 * introduces significant hit single-threaded and serialization multi-threaded.
151 *
152 * Reverse lookup (e.g., "getcwd") fully scales provided it is fully cached --
153 * avoids any writes to shared areas to any components.
154 *
155 * Unrelated insertions are partially serialized on updating the global entry
156 * counter and possibly serialized on colliding bucket or vnode locks.
157 *
158 * IV. Observability
159 *
 * Note that not everything has an explicit dtrace probe, nor should it; thus
 * some of the one-liners below depend on implementation details.
162 *
163 * Examples:
164 *
165 * # Check what lookups failed to be handled in a lockless manner. Column 1 is
166 * # line number, column 2 is status code (see cache_fpl_status)
167 * dtrace -n 'vfs:fplookup:lookup:done { @[arg1, arg2] = count(); }'
168 *
169 * # Lengths of names added by binary name
170 * dtrace -n 'fbt::cache_enter_time:entry { @[execname] = quantize(args[2]->cn_namelen); }'
171 *
172 * # Same as above but only those which exceed 64 characters
173 * dtrace -n 'fbt::cache_enter_time:entry /args[2]->cn_namelen > 64/ { @[execname] = quantize(args[2]->cn_namelen); }'
174 *
175 * # Who is performing lookups with spurious slashes (e.g., "foo//bar") and what
176 * # path is it
177 * dtrace -n 'fbt::cache_fplookup_skip_slashes:entry { @[execname, stringof(args[0]->cnp->cn_pnbuf)] = count(); }'
178 *
179 * V. Limitations and implementation defects
180 *
181 * - since it is possible there is no entry for an open file, tools like
182 * "procstat" may fail to resolve fd -> vnode -> path to anything
183 * - even if a filesystem adds an entry, it may get purged (e.g., due to memory
184 * shortage) in which case the above problem applies
185 * - hardlinks are not tracked, thus if a vnode is reachable in more than one
186 * way, resolving a name may return a different path than the one used to
187 * open it (even if said path is still valid)
188 * - by default entries are not added for newly created files
189 * - adding an entry may need to evict negative entry first, which happens in 2
190 * distinct places (evicting on lookup, adding in a later VOP) making it
191 * impossible to simply reuse it
192 * - there is a simple scheme to evict negative entries as the cache is approaching
193 * its capacity, but it is very unclear if doing so is a good idea to begin with
194 * - vnodes are subject to being recycled even if target inode is left in memory,
195 * which loses the name cache entries when it perhaps should not. in case of tmpfs
196 * names get duplicated -- kept by filesystem itself and namecache separately
197 * - struct namecache has a fixed size and comes in 2 variants, often wasting
198 * space. now hard to replace with malloc due to dependence on SMR, which
199 * requires UMA zones to opt in
200 * - lack of better integration with the kernel also turns nullfs into a layered
201 * filesystem instead of something which can take advantage of caching
202 *
203 * Appendix A: where is the time lost, expanding on paragraph III
204 *
205 * While some care went into optimizing lookups, there is still plenty of
206 * performance left on the table, most notably from single-threaded standpoint.
207 * Below is a woefully incomplete list of changes which can help. Ideas are
208 * mostly sketched out, no claim is made all kinks or prerequisites are laid
209 * out.
210 *
211 * Note there is performance lost all over VFS.
212 *
213 * === SMR-only lookup
214 *
215 * For commonly used ops like stat(2), when the terminal vnode *is* cached,
216 * lockless lookup could refrain from refing/locking the found vnode and
217 * instead return while within the SMR section. Then a call to, say,
218 * vop_stat_smr could do the work (or fail with EAGAIN), finally the result
219 * would be validated with seqc not changing. This would be faster
220 * single-threaded as it dodges atomics and would provide full scalability for
221 * multicore uses. This would *not* work for open(2) or other calls which need
222 * the vnode to hang around for the long haul, but would work for aforementioned
223 * stat(2) but also access(2), readlink(2), realpathat(2) and probably more.
224 *
225 * === hotpatching for sdt probes
226 *
227 * They result in *tons* of branches all over with rather regrettable codegen
228 * at times. Removing sdt probes altogether gives over 2% boost in lookup rate.
229 * Reworking the code to patch itself at runtime with asm goto would solve it.
230 * asm goto is fully supported by gcc and clang.
231 *
232 * === copyinstr
233 *
234 * On all architectures it operates one byte at a time, while it could be
235 * word-sized instead thanks to the Mycroft trick.
236 *
237 * API itself is rather pessimal for path lookup, accepting arbitrary sizes and
238 * *optionally* filling in the length parameter.
239 *
 * Instead a new routine (copyinpath?) could be introduced, demanding a buffer
 * size which is a multiple of the word size (and never zero), with the length
242 * always returned. On top of it the routine could be allowed to transform the
243 * buffer in arbitrary ways, most notably writing past the found length (not to
244 * be confused with writing past buffer size) -- this would allow word-sized
245 * movs while checking for '\0' later.
246 *
247 * === detour through namei
248 *
249 * Currently one suffers being called from namei, which then has to check if
250 * things worked out locklessly. Instead the lockless lookup could be the
251 * actual entry point which calls what is currently namei as a fallback.
252 *
253 * === avoidable branches in cache_can_fplookup
254 *
255 * The cache_fast_lookup_enabled flag check could be hotpatchable (in fact if
256 * this is off, none of fplookup code should execute).
257 *
258 * Both audit and capsicum branches can be combined into one, but it requires
259 * paying off a lot of tech debt first.
260 *
261 * ni_startdir could be indicated with a flag in cn_flags, eliminating the
262 * branch.
263 *
264 * === mount stacks
265 *
266 * Crossing a mount requires checking if perhaps something is mounted on top.
267 * Instead, an additional entry could be added to struct mount with a pointer
268 * to the final mount on the stack. This would be recalculated on each
269 * mount/unmount.
270 *
271 * === root vnodes
272 *
273 * It could become part of the API contract to *always* have a rootvnode set in
274 * mnt_rootvnode. Such vnodes are annotated with VV_ROOT and vnlru would have
275 * to be modified to always skip them.
276 *
277 * === inactive on v_usecount reaching 0
278 *
279 * VOP_NEED_INACTIVE should not exist. Filesystems would indicate need for such
280 * processing with a bit in usecount.
281 *
282 * === v_holdcnt
283 *
284 * Hold count should probably get eliminated, but one can argue it is a useful
285 * feature. Even if so, handling of v_usecount could be decoupled from it --
 * vnlru et al would consider the vnode not-freeable if it has either a hold
 * or a usecount on it.
288 *
289 * This would eliminate 2 atomics.
290 */
291
292 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
293 "Name cache");
294
295 SDT_PROVIDER_DECLARE(vfs);
296 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
297 "struct vnode *");
298 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
299 "struct vnode *");
300 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
301 "char *");
302 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
303 "const char *");
304 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
305 "struct namecache *", "int", "int");
306 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
307 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
308 "char *", "struct vnode *");
309 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
310 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
311 "struct vnode *", "char *");
312 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
313 "struct vnode *");
314 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
315 "struct vnode *", "char *");
316 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
317 "char *");
318 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
319 "struct componentname *");
320 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
321 "struct componentname *");
322 SDT_PROBE_DEFINE3(vfs, namecache, purge, done, "struct vnode *", "size_t", "size_t");
323 SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int");
324 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
325 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
326 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
327 "struct vnode *");
328 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
329 "char *");
330 SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
331 "char *");
332 SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t");
333
334 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
335 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
336 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
337
338 static char __read_frequently cache_fast_lookup_enabled = true;
339
340 /*
341 * This structure describes the elements in the cache of recent
342 * names looked up by namei.
343 */
344 struct negstate {
345 u_char neg_flag;
346 u_char neg_hit;
347 };
348 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
349 "the state must fit in a union with a pointer without growing it");
350
351 struct namecache {
352 LIST_ENTRY(namecache) nc_src; /* source vnode list */
353 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */
354 CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
355 struct vnode *nc_dvp; /* vnode of parent of name */
356 union {
357 struct vnode *nu_vp; /* vnode the name refers to */
358 struct negstate nu_neg;/* negative entry state */
359 } n_un;
360 u_char nc_flag; /* flag bits */
361 u_char nc_nlen; /* length of name */
362 char nc_name[]; /* segment name + nul */
363 };
364
365 /*
366 * struct namecache_ts repeats struct namecache layout up to the
367 * nc_nlen member.
368 * struct namecache_ts is used in place of struct namecache when time(s) need
369 * to be stored. The nc_dotdottime field is used when a cache entry is mapping
370 * both a non-dotdot directory name plus dotdot for the directory's
371 * parent.
372 *
373 * See below for alignment requirement.
374 */
375 struct namecache_ts {
376 struct timespec nc_time; /* timespec provided by fs */
377 struct timespec nc_dotdottime; /* dotdot timespec provided by fs */
378 int nc_ticks; /* ticks value when entry was added */
379 int nc_pad;
380 struct namecache nc_nc;
381 };
382
383 TAILQ_HEAD(cache_freebatch, namecache);
384
385 /*
386 * At least mips n32 performs 64-bit accesses to timespec as found
387 * in namecache_ts and requires them to be aligned. Since others
388 * may be in the same spot suffer a little bit and enforce the
389 * alignment for everyone. Note this is a nop for 64-bit platforms.
390 */
391 #define CACHE_ZONE_ALIGNMENT UMA_ALIGNOF(time_t)
392
393 /*
394 * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the
395 * 4.4 BSD codebase. Later on struct namecache was tweaked to become
396 * smaller and the value was bumped to retain the total size, but it
397 * was never re-evaluated for suitability. A simple test counting
398 * lengths during package building shows that the value of 45 covers
399 * about 86% of all added entries, reaching 99% at 65.
400 *
401 * Regardless of the above, use of dedicated zones instead of malloc may be
402 * inducing additional waste. This may be hard to address as said zones are
403 * tied to VFS SMR. Even if retaining them, the current split should be
404 * re-evaluated.
405 */
406 #ifdef __LP64__
407 #define CACHE_PATH_CUTOFF 45
408 #define CACHE_LARGE_PAD 6
409 #else
410 #define CACHE_PATH_CUTOFF 41
411 #define CACHE_LARGE_PAD 2
412 #endif
413
414 #define CACHE_ZONE_SMALL_SIZE (offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1)
415 #define CACHE_ZONE_SMALL_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE)
416 #define CACHE_ZONE_LARGE_SIZE (offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD)
417 #define CACHE_ZONE_LARGE_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE)
418
419 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
420 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
421 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
422 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
423
424 #define nc_vp n_un.nu_vp
425 #define nc_neg n_un.nu_neg
426
427 /*
428 * Flags in namecache.nc_flag
429 */
430 #define NCF_WHITE 0x01
431 #define NCF_ISDOTDOT 0x02
432 #define NCF_TS 0x04
433 #define NCF_DTS 0x08
434 #define NCF_DVDROP 0x10
435 #define NCF_NEGATIVE 0x20
436 #define NCF_INVALID 0x40
437 #define NCF_WIP 0x80
438
439 /*
440 * Flags in negstate.neg_flag
441 */
442 #define NEG_HOT 0x01
443
444 static bool cache_neg_evict_cond(u_long lnumcache);
445
446 /*
447 * Mark an entry as invalid.
448 *
449 * This is called before it starts getting deconstructed.
450 */
451 static void
cache_ncp_invalidate(struct namecache * ncp)452 cache_ncp_invalidate(struct namecache *ncp)
453 {
454
455 KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
456 ("%s: entry %p already invalid", __func__, ncp));
457 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
458 atomic_thread_fence_rel();
459 }
460
461 /*
462 * Does this entry match the given directory and name?
463 */
464 static bool
cache_ncp_match(struct namecache * ncp,struct vnode * dvp,struct componentname * cnp)465 cache_ncp_match(struct namecache *ncp, struct vnode *dvp,
466 struct componentname *cnp)
467 {
468 return (ncp->nc_dvp == dvp &&
469 ncp->nc_nlen == cnp->cn_namelen &&
470 bcmp(ncp->nc_name, cnp->cn_nameptr, cnp->cn_namelen) == 0);
471 }
472
/*
 * Decide whether data read from an entry without holding locks can be
 * trusted.
 *
 * Lock-eliding code is expected to invoke this once it is done reading from
 * the entry.  An acquire fence is issued, then nc_flag is loaded; the result
 * is true only if neither NCF_INVALID nor NCF_WIP is set.
 */
#define	cache_ncp_canuse(ncp)	({						\
	struct namecache *_canuse_ncp = (ncp);					\
	u_char _canuse_flag;							\
										\
	atomic_thread_fence_acq();						\
	_canuse_flag = atomic_load_char(&_canuse_ncp->nc_flag);			\
	__predict_true((_canuse_flag & (NCF_INVALID | NCF_WIP)) == 0);		\
})
487
/*
 * Same check as cache_ncp_canuse, except entries with NCF_WHITE set are
 * rejected as well.
 */
#define	cache_fpl_neg_ncp_canuse(ncp)	({					\
	struct namecache *_neg_canuse_ncp = (ncp);				\
	u_char _neg_canuse_flag;						\
										\
	atomic_thread_fence_acq();						\
	_neg_canuse_flag = atomic_load_char(&_neg_canuse_ncp->nc_flag);		\
	__predict_true((_neg_canuse_flag &					\
	    (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0);				\
})
499
500 VFS_SMR_DECLARE;
501
502 static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
503 "Name cache parameters");
504
505 static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */
506 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RD, &ncsize, 0,
507 "Total namecache capacity");
508
509 u_int ncsizefactor = 2;
510 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
511 "Size factor for namecache");
512
513 static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */
514 SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
515 "Ratio of negative namecache entries");
516
517 /*
518 * Negative entry % of namecache capacity above which automatic eviction is allowed.
519 *
520 * Check cache_neg_evict_cond for details.
521 */
522 static u_int ncnegminpct = 3;
523
524 static u_int __read_mostly neg_min; /* the above recomputed against ncsize */
525 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
526 "Negative entry count above which automatic eviction is allowed");
527
528 /*
529 * Structures associated with name caching.
530 */
531 #define NCHHASH(hash) \
532 (&nchashtbl[(hash) & nchash])
533 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
534 static u_long __read_mostly nchash; /* size of hash table */
535 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
536 "Size of namecache hash table");
537 static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */
538 static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */
539
540 struct nchstats nchstats; /* cache effectiveness statistics */
541
542 static u_int __exclusive_cache_line neg_cycle;
543
544 #define ncneghash 3
545 #define numneglists (ncneghash + 1)
546
547 struct neglist {
548 struct mtx nl_evict_lock;
549 struct mtx nl_lock __aligned(CACHE_LINE_SIZE);
550 TAILQ_HEAD(, namecache) nl_list;
551 TAILQ_HEAD(, namecache) nl_hotlist;
552 u_long nl_hotnum;
553 } __aligned(CACHE_LINE_SIZE);
554
555 static struct neglist neglists[numneglists];
556
557 static inline struct neglist *
NCP2NEGLIST(struct namecache * ncp)558 NCP2NEGLIST(struct namecache *ncp)
559 {
560
561 return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
562 }
563
564 static inline struct negstate *
NCP2NEGSTATE(struct namecache * ncp)565 NCP2NEGSTATE(struct namecache *ncp)
566 {
567
568 MPASS(atomic_load_char(&ncp->nc_flag) & NCF_NEGATIVE);
569 return (&ncp->nc_neg);
570 }
571
572 #define numbucketlocks (ncbuckethash + 1)
573 static u_int __read_mostly ncbuckethash;
574 static struct mtx_padalign __read_mostly *bucketlocks;
575 #define HASH2BUCKETLOCK(hash) \
576 ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))
577
578 #define numvnodelocks (ncvnodehash + 1)
579 static u_int __read_mostly ncvnodehash;
580 static struct mtx __read_mostly *vnodelocks;
581 static inline struct mtx *
VP2VNODELOCK(struct vnode * vp)582 VP2VNODELOCK(struct vnode *vp)
583 {
584
585 return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
586 }
587
588 /*
589 * Search the hash table for a namecache entry. Either the corresponding bucket
590 * must be locked, or the caller must be in an SMR read section.
591 */
592 static struct namecache *
cache_ncp_find(struct vnode * dvp,struct componentname * cnp,uint32_t hash)593 cache_ncp_find(struct vnode *dvp, struct componentname *cnp, uint32_t hash)
594 {
595 struct namecache *ncp;
596
597 KASSERT(mtx_owned(HASH2BUCKETLOCK(hash)) || VFS_SMR_ENTERED(),
598 ("%s: hash %u not locked", __func__, hash));
599 CK_SLIST_FOREACH(ncp, NCHHASH(hash), nc_hash) {
600 if (cache_ncp_match(ncp, dvp, cnp))
601 break;
602 }
603 return (ncp);
604 }
605
606 static void
cache_out_ts(struct namecache * ncp,struct timespec * tsp,int * ticksp)607 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
608 {
609 struct namecache_ts *ncp_ts;
610
611 KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
612 (tsp == NULL && ticksp == NULL),
613 ("No NCF_TS"));
614
615 if (tsp == NULL)
616 return;
617
618 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
619 *tsp = ncp_ts->nc_time;
620 *ticksp = ncp_ts->nc_ticks;
621 }
622
623 #ifdef DEBUG_CACHE
624 static int __read_mostly doingcache = 1; /* 1 => enable the cache */
625 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
626 "VFS namecache enabled");
627 #endif
628
629 /* Export size information to userland */
630 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
631 sizeof(struct namecache), "sizeof(struct namecache)");
632
633 /*
634 * The new name cache statistics
635 */
636 static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
637 "Name cache statistics");
638
639 #define STATNODE_ULONG(name, varname, descr) \
640 SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
641 #define STATNODE_COUNTER(name, varname, descr) \
642 static COUNTER_U64_DEFINE_EARLY(varname); \
643 SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
644 descr);
645 STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
646 STATNODE_ULONG(count, numcache, "Number of cache entries");
647 STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
648 STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
649 STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
650 STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
651 STATNODE_COUNTER(poszaps, numposzaps,
652 "Number of cache hits (positive) we do not want to cache");
653 STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
654 STATNODE_COUNTER(negzaps, numnegzaps,
655 "Number of cache hits (negative) we do not want to cache");
656 STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
657 /* These count for vn_getcwd(), too. */
658 STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
659 STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
660 "Number of fullpath search errors (VOP_VPTOCNP failures)");
661 STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
662 STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");
663 STATNODE_COUNTER(symlinktoobig, symlinktoobig, "Number of times symlink did not fit the cache");
664
665 /*
666 * Debug or developer statistics.
667 */
668 static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
669 "Name cache debugging");
670 #define DEBUGNODE_ULONG(name, varname, descr) \
671 SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
672 static u_long zap_bucket_relock_success;
673 DEBUGNODE_ULONG(zap_bucket_relock_success, zap_bucket_relock_success,
674 "Number of successful removals after relocking");
675 static u_long zap_bucket_fail;
676 DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
677 static u_long zap_bucket_fail2;
678 DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
679 static u_long cache_lock_vnodes_cel_3_failures;
680 DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
681 "Number of times 3-way vnode locking failed");
682
683 static void cache_zap_locked(struct namecache *ncp);
684 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
685 char **retbuf, size_t *buflen, size_t addend);
686 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
687 char **retbuf, size_t *buflen);
688 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
689 char **retbuf, size_t *len, size_t addend);
690
691 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
692
693 static inline void
cache_assert_vlp_locked(struct mtx * vlp)694 cache_assert_vlp_locked(struct mtx *vlp)
695 {
696
697 if (vlp != NULL)
698 mtx_assert(vlp, MA_OWNED);
699 }
700
/*
 * Assert that the caller owns the namecache lock covering the given vnode.
 */
static inline void
cache_assert_vnode_locked(struct vnode *vp)
{

	cache_assert_vlp_locked(VP2VNODELOCK(vp));
}
709
710 /*
711 * Directory vnodes with entries are held for two reasons:
712 * 1. make them less of a target for reclamation in vnlru
713 * 2. suffer smaller performance penalty in locked lookup as requeieing is avoided
714 *
715 * It will be feasible to stop doing it altogether if all filesystems start
716 * supporting lockless lookup.
717 */
718 static void
cache_hold_vnode(struct vnode * vp)719 cache_hold_vnode(struct vnode *vp)
720 {
721
722 cache_assert_vnode_locked(vp);
723 VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
724 vhold(vp);
725 counter_u64_add(numcachehv, 1);
726 }
727
728 static void
cache_drop_vnode(struct vnode * vp)729 cache_drop_vnode(struct vnode *vp)
730 {
731
732 /*
733 * Called after all locks are dropped, meaning we can't assert
734 * on the state of v_cache_src.
735 */
736 vdrop(vp);
737 counter_u64_add(numcachehv, -1);
738 }
739
740 /*
741 * UMA zones.
742 */
743 static uma_zone_t __read_mostly cache_zone_small;
744 static uma_zone_t __read_mostly cache_zone_small_ts;
745 static uma_zone_t __read_mostly cache_zone_large;
746 static uma_zone_t __read_mostly cache_zone_large_ts;
747
748 char *
cache_symlink_alloc(size_t size,int flags)749 cache_symlink_alloc(size_t size, int flags)
750 {
751
752 if (size < CACHE_ZONE_SMALL_SIZE) {
753 return (uma_zalloc_smr(cache_zone_small, flags));
754 }
755 if (size < CACHE_ZONE_LARGE_SIZE) {
756 return (uma_zalloc_smr(cache_zone_large, flags));
757 }
758 counter_u64_add(symlinktoobig, 1);
759 SDT_PROBE1(vfs, namecache, symlink, alloc__fail, size);
760 return (NULL);
761 }
762
763 void
cache_symlink_free(char * string,size_t size)764 cache_symlink_free(char *string, size_t size)
765 {
766
767 MPASS(string != NULL);
768 KASSERT(size < CACHE_ZONE_LARGE_SIZE,
769 ("%s: size %zu too big", __func__, size));
770
771 if (size < CACHE_ZONE_SMALL_SIZE) {
772 uma_zfree_smr(cache_zone_small, string);
773 return;
774 }
775 if (size < CACHE_ZONE_LARGE_SIZE) {
776 uma_zfree_smr(cache_zone_large, string);
777 return;
778 }
779 __assert_unreachable();
780 }
781
782 static struct namecache *
cache_alloc_uma(int len,bool ts)783 cache_alloc_uma(int len, bool ts)
784 {
785 struct namecache_ts *ncp_ts;
786 struct namecache *ncp;
787
788 if (__predict_false(ts)) {
789 if (len <= CACHE_PATH_CUTOFF)
790 ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
791 else
792 ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
793 ncp = &ncp_ts->nc_nc;
794 } else {
795 if (len <= CACHE_PATH_CUTOFF)
796 ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
797 else
798 ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
799 }
800 return (ncp);
801 }
802
803 static void
cache_free_uma(struct namecache * ncp)804 cache_free_uma(struct namecache *ncp)
805 {
806 struct namecache_ts *ncp_ts;
807
808 if (__predict_false(ncp->nc_flag & NCF_TS)) {
809 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
810 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
811 uma_zfree_smr(cache_zone_small_ts, ncp_ts);
812 else
813 uma_zfree_smr(cache_zone_large_ts, ncp_ts);
814 } else {
815 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
816 uma_zfree_smr(cache_zone_small, ncp);
817 else
818 uma_zfree_smr(cache_zone_large, ncp);
819 }
820 }
821
822 static struct namecache *
cache_alloc(int len,bool ts)823 cache_alloc(int len, bool ts)
824 {
825 u_long lnumcache;
826
827 /*
828 * Avoid blowout in namecache entries.
829 *
830 * Bugs:
831 * 1. filesystems may end up trying to add an already existing entry
832 * (for example this can happen after a cache miss during concurrent
833 * lookup), in which case we will call cache_neg_evict despite not
834 * adding anything.
835 * 2. the routine may fail to free anything and no provisions are made
836 * to make it try harder (see the inside for failure modes)
837 * 3. it only ever looks at negative entries.
838 */
839 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
840 if (cache_neg_evict_cond(lnumcache)) {
841 lnumcache = atomic_load_long(&numcache);
842 }
843 if (__predict_false(lnumcache >= ncsize)) {
844 atomic_subtract_long(&numcache, 1);
845 counter_u64_add(numdrops, 1);
846 return (NULL);
847 }
848 return (cache_alloc_uma(len, ts));
849 }
850
/*
 * Free a single namecache entry and decrement numcache.
 *
 * If the entry carries NCF_DVDROP (set by cache_zap_locked() when the
 * directory's v_cache_src list became empty), the hold on nc_dvp is released
 * first.
 */
static void
cache_free(struct namecache *ncp)
{

	MPASS(ncp != NULL);
	/* Must read nc_flag/nc_dvp before the entry is handed back to UMA. */
	if ((ncp->nc_flag & NCF_DVDROP) != 0) {
		cache_drop_vnode(ncp->nc_dvp);
	}
	cache_free_uma(ncp);
	atomic_subtract_long(&numcache, 1);
}
862
863 static void
cache_free_batch(struct cache_freebatch * batch)864 cache_free_batch(struct cache_freebatch *batch)
865 {
866 struct namecache *ncp, *nnp;
867 int i;
868
869 i = 0;
870 if (TAILQ_EMPTY(batch))
871 goto out;
872 TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) {
873 if ((ncp->nc_flag & NCF_DVDROP) != 0) {
874 cache_drop_vnode(ncp->nc_dvp);
875 }
876 cache_free_uma(ncp);
877 i++;
878 }
879 atomic_subtract_long(&numcache, i);
880 out:
881 SDT_PROBE1(vfs, namecache, purge, batch, i);
882 }
883
884 /*
885 * Hashing.
886 *
887 * The code was made to use FNV in 2001 and this choice needs to be revisited.
888 *
889 * Short summary of the difficulty:
890 * The longest name which can be inserted is NAME_MAX characters in length (or
891 * 255 at the time of writing this comment), while majority of names used in
 * practice are significantly shorter (mostly below 10). More importantly,
 * the majority of lookups performed are for names even shorter than that.
894 *
895 * This poses a problem where hashes which do better than FNV past word size
896 * (or so) tend to come with additional overhead when finalizing the result,
897 * making them noticeably slower for the most commonly used range.
898 *
899 * Consider a path like: /usr/obj/usr/src/sys/amd64/GENERIC/vnode_if.c
900 *
901 * When looking it up the most time consuming part by a large margin (at least
902 * on amd64) is hashing. Replacing FNV with something which pessimizes short
903 * input would make the slowest part stand out even more.
904 */
905
906 /*
907 * TODO: With the value stored we can do better than computing the hash based
908 * on the address.
909 */
910 static void
cache_prehash(struct vnode * vp)911 cache_prehash(struct vnode *vp)
912 {
913
914 vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
915 }
916
917 static uint32_t
cache_get_hash(char * name,u_char len,struct vnode * dvp)918 cache_get_hash(char *name, u_char len, struct vnode *dvp)
919 {
920
921 return (fnv_32_buf(name, len, dvp->v_nchash));
922 }
923
/*
 * Start an incremental hash computation for a name in directory dvp.
 *
 * Returns the directory's v_nchash, the same initial value cache_get_hash()
 * feeds to fnv_32_buf().
 */
static uint32_t
cache_get_hash_iter_start(struct vnode *dvp)
{

	return (dvp->v_nchash);
}
930
/*
 * Fold one more name character c into the running hash and return the
 * updated value.
 */
static uint32_t
cache_get_hash_iter(char c, uint32_t hash)
{

	return (fnv_32_buf(&c, 1, hash));
}
937
/*
 * Finish an incremental hash computation.
 *
 * No finalization step is applied: the running value is the result.
 */
static uint32_t
cache_get_hash_iter_finish(uint32_t hash)
{
	const uint32_t result = hash;

	return (result);
}
944
945 static inline struct nchashhead *
NCP2BUCKET(struct namecache * ncp)946 NCP2BUCKET(struct namecache *ncp)
947 {
948 uint32_t hash;
949
950 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
951 return (NCHHASH(hash));
952 }
953
954 static inline struct mtx *
NCP2BUCKETLOCK(struct namecache * ncp)955 NCP2BUCKETLOCK(struct namecache *ncp)
956 {
957 uint32_t hash;
958
959 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
960 return (HASH2BUCKETLOCK(hash));
961 }
962
#ifdef INVARIANTS
/*
 * Assert that the bucket lock covering ncp is owned by the current thread.
 */
static void
cache_assert_bucket_locked(struct namecache *ncp)
{

	mtx_assert(NCP2BUCKETLOCK(ncp), MA_OWNED);
}

/*
 * Assert that the bucket lock covering ncp is not owned by the current
 * thread.
 */
static void
cache_assert_bucket_unlocked(struct namecache *ncp)
{

	mtx_assert(NCP2BUCKETLOCK(ncp), MA_NOTOWNED);
}
#else
/* Without INVARIANTS the assertions compile to nothing. */
#define cache_assert_bucket_locked(x) do { } while (0)
#define cache_assert_bucket_unlocked(x) do { } while (0)
#endif
985
986 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
987 static void
_cache_sort_vnodes(void ** p1,void ** p2)988 _cache_sort_vnodes(void **p1, void **p2)
989 {
990 void *tmp;
991
992 MPASS(*p1 != NULL || *p2 != NULL);
993
994 if (*p1 > *p2) {
995 tmp = *p2;
996 *p2 = *p1;
997 *p1 = tmp;
998 }
999 }
1000
1001 static void
cache_lock_all_buckets(void)1002 cache_lock_all_buckets(void)
1003 {
1004 u_int i;
1005
1006 for (i = 0; i < numbucketlocks; i++)
1007 mtx_lock(&bucketlocks[i]);
1008 }
1009
1010 static void
cache_unlock_all_buckets(void)1011 cache_unlock_all_buckets(void)
1012 {
1013 u_int i;
1014
1015 for (i = 0; i < numbucketlocks; i++)
1016 mtx_unlock(&bucketlocks[i]);
1017 }
1018
1019 static void
cache_lock_all_vnodes(void)1020 cache_lock_all_vnodes(void)
1021 {
1022 u_int i;
1023
1024 for (i = 0; i < numvnodelocks; i++)
1025 mtx_lock(&vnodelocks[i]);
1026 }
1027
1028 static void
cache_unlock_all_vnodes(void)1029 cache_unlock_all_vnodes(void)
1030 {
1031 u_int i;
1032
1033 for (i = 0; i < numvnodelocks; i++)
1034 mtx_unlock(&vnodelocks[i]);
1035 }
1036
1037 static int
cache_trylock_vnodes(struct mtx * vlp1,struct mtx * vlp2)1038 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
1039 {
1040
1041 cache_sort_vnodes(&vlp1, &vlp2);
1042
1043 if (vlp1 != NULL) {
1044 if (!mtx_trylock(vlp1))
1045 return (EAGAIN);
1046 }
1047 if (!mtx_trylock(vlp2)) {
1048 if (vlp1 != NULL)
1049 mtx_unlock(vlp1);
1050 return (EAGAIN);
1051 }
1052
1053 return (0);
1054 }
1055
/*
 * Lock up to two vnode locks, vlp1 first.
 *
 * The caller must pass them already sorted (vlp1 <= vlp2, asserted); either
 * may be NULL but not both.
 */
static void
cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);
	MPASS(vlp1 <= vlp2);

	if (vlp1 != NULL)
		mtx_lock(vlp1);
	if (vlp2 != NULL)
		mtx_lock(vlp2);
}
1068
/*
 * Unlock up to two vnode locks; either may be NULL but not both (asserted).
 */
static void
cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);

	if (vlp1 != NULL)
		mtx_unlock(vlp1);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
}
1080
1081 static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)1082 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
1083 {
1084 struct nchstats snap;
1085
1086 if (req->oldptr == NULL)
1087 return (SYSCTL_OUT(req, 0, sizeof(snap)));
1088
1089 snap = nchstats;
1090 snap.ncs_goodhits = counter_u64_fetch(numposhits);
1091 snap.ncs_neghits = counter_u64_fetch(numneghits);
1092 snap.ncs_badhits = counter_u64_fetch(numposzaps) +
1093 counter_u64_fetch(numnegzaps);
1094 snap.ncs_miss = counter_u64_fetch(nummisszap) +
1095 counter_u64_fetch(nummiss);
1096
1097 return (SYSCTL_OUT(req, &snap, sizeof(snap)));
1098 }
1099 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
1100 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
1101 "VFS cache effectiveness statistics");
1102
1103 static int
sysctl_hitpct(SYSCTL_HANDLER_ARGS)1104 sysctl_hitpct(SYSCTL_HANDLER_ARGS)
1105 {
1106 long poshits, neghits, miss, total;
1107 long pct;
1108
1109 poshits = counter_u64_fetch(numposhits);
1110 neghits = counter_u64_fetch(numneghits);
1111 miss = counter_u64_fetch(nummiss);
1112 total = poshits + neghits + miss;
1113
1114 pct = 0;
1115 if (total != 0)
1116 pct = ((poshits + neghits) * 100) / total;
1117 return (sysctl_handle_int(oidp, 0, pct, req));
1118 }
1119 SYSCTL_PROC(_vfs_cache_stats, OID_AUTO, hitpct,
1120 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_hitpct,
1121 "I", "Percentage of hits");
1122
/*
 * Recompute neg_min as ncnegminpct percent of ncsize.
 */
static void
cache_recalc_neg_min(void)
{

	neg_min = (ncsize * ncnegminpct) / 100;
}
1129
1130 static int
sysctl_negminpct(SYSCTL_HANDLER_ARGS)1131 sysctl_negminpct(SYSCTL_HANDLER_ARGS)
1132 {
1133 u_int val;
1134 int error;
1135
1136 val = ncnegminpct;
1137 error = sysctl_handle_int(oidp, &val, 0, req);
1138 if (error != 0 || req->newptr == NULL)
1139 return (error);
1140
1141 if (val == ncnegminpct)
1142 return (0);
1143 if (val < 0 || val > 99)
1144 return (EINVAL);
1145 ncnegminpct = val;
1146 cache_recalc_neg_min();
1147 return (0);
1148 }
1149
1150 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
1151 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
1152 "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed");
1153
1154 #ifdef DEBUG_CACHE
/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 *
 * debug.hashstat is the parent node of the sysctls defined below, which are
 * only compiled in with DEBUG_CACHE.
 */
static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "hash table stats");
1161
/*
 * debug.hashstat.rawnchash handler: export the length of every hash chain as
 * an array of ints, one element per bucket.
 *
 * The scratch buffer is allocated with M_WAITOK before the bucket locks are
 * taken. If the table size is found to have changed once the locks are held,
 * the buffer is discarded and the whole operation restarts.
 */
static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int i, error, n_nchash, *cntbuf;

retry:
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	/* Size probe only: report how much space the caller needs. */
	if (req->oldptr == NULL)
		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
	cache_lock_all_buckets();
	/* Recheck the size now that all bucket locks are held. */
	if (n_nchash != nchash + 1) {
		cache_unlock_all_buckets();
		free(cntbuf, M_TEMP);
		goto retry;
	}
	/* Scan hash tables counting entries */
	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
			cntbuf[i]++;
	cache_unlock_all_buckets();
	/* Copy out with no locks held, one counter at a time. */
	for (error = 0, i = 0; i < n_nchash; i++)
		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
			break;
	free(cntbuf, M_TEMP);
	return (error);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
    "nchash chain lengths");
1194
1195 static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)1196 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
1197 {
1198 int error;
1199 struct nchashhead *ncpp;
1200 struct namecache *ncp;
1201 int n_nchash;
1202 int count, maxlength, used, pct;
1203
1204 if (!req->oldptr)
1205 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
1206
1207 cache_lock_all_buckets();
1208 n_nchash = nchash + 1; /* nchash is max index, not count */
1209 used = 0;
1210 maxlength = 0;
1211
1212 /* Scan hash tables for applicable entries */
1213 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
1214 count = 0;
1215 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
1216 count++;
1217 }
1218 if (count)
1219 used++;
1220 if (maxlength < count)
1221 maxlength = count;
1222 }
1223 n_nchash = nchash + 1;
1224 cache_unlock_all_buckets();
1225 pct = (used * 100) / (n_nchash / 100);
1226 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
1227 if (error)
1228 return (error);
1229 error = SYSCTL_OUT(req, &used, sizeof(used));
1230 if (error)
1231 return (error);
1232 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
1233 if (error)
1234 return (error);
1235 error = SYSCTL_OUT(req, &pct, sizeof(pct));
1236 if (error)
1237 return (error);
1238 return (0);
1239 }
1240 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
1241 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
1242 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
1243 #endif
1244
1245 /*
1246 * Negative entries management
1247 *
1248 * Various workloads create plenty of negative entries and barely use them
1249 * afterwards. Moreover malicious users can keep performing bogus lookups
1250 * adding even more entries. For example "make tinderbox" as of writing this
1251 * comment ends up with 2.6M namecache entries in total, 1.2M of which are
1252 * negative.
1253 *
1254 * As such, a rather aggressive eviction method is needed. The currently
1255 * employed method is a placeholder.
1256 *
1257 * Entries are split over numneglists separate lists, each of which is further
1258 * split into hot and cold entries. Entries get promoted after getting a hit.
1259 * Eviction happens on addition of new entry.
1260 */
/* vfs.cache.neg: parent node for the negative entry statistics below. */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache negative entry statistics");

SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
    "Number of negative cache entries");

/* Bumped by cache_neg_init(). */
static COUNTER_U64_DEFINE_EARLY(neg_created);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
    "Number of created negative entries");

/* Bumped by cache_neg_evict() on success. */
static COUNTER_U64_DEFINE_EARLY(neg_evicted);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
    "Number of evicted negative entries");

/*
 * Reasons for an eviction attempt to fail, see cache_neg_evict() and
 * cache_neg_evict_select_list().
 */
static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
    &neg_evict_skipped_empty,
    "Number of times evicting failed due to lack of entries");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
    &neg_evict_skipped_missed,
    "Number of times evicting failed due to target entry disappearing");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
    &neg_evict_skipped_contended,
    "Number of times evicting failed due to contention");

SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
    "Number of cache hits (negative)");
1292
1293 static int
sysctl_neg_hot(SYSCTL_HANDLER_ARGS)1294 sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
1295 {
1296 int i, out;
1297
1298 out = 0;
1299 for (i = 0; i < numneglists; i++)
1300 out += neglists[i].nl_hotnum;
1301
1302 return (SYSCTL_OUT(req, &out, sizeof(out)));
1303 }
1304 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
1305 CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
1306 "Number of hot negative entries");
1307
/*
 * Turn a freshly allocated entry into a negative one: set NCF_NEGATIVE,
 * reset its negative state (flags and hit count) and bump neg_created.
 */
static void
cache_neg_init(struct namecache *ncp)
{
	struct negstate *ns;

	ncp->nc_flag |= NCF_NEGATIVE;
	ns = NCP2NEGSTATE(ncp);
	ns->neg_flag = 0;
	ns->neg_hit = 0;
	counter_u64_add(neg_created, 1);
}
1319
1320 #define CACHE_NEG_PROMOTION_THRESH 2
1321
1322 static bool
cache_neg_hit_prep(struct namecache * ncp)1323 cache_neg_hit_prep(struct namecache *ncp)
1324 {
1325 struct negstate *ns;
1326 u_char n;
1327
1328 ns = NCP2NEGSTATE(ncp);
1329 n = atomic_load_char(&ns->neg_hit);
1330 for (;;) {
1331 if (n >= CACHE_NEG_PROMOTION_THRESH)
1332 return (false);
1333 if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
1334 break;
1335 }
1336 return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
1337 }
1338
/*
 * Counterpart of cache_neg_hit_prep for callers which bail out.
 *
 * Nothing to do here but it is provided for completeness as some
 * cache_neg_hit_prep callers may end up returning without even
 * trying to promote.
 */
#define cache_neg_hit_abort(ncp) do { } while (0)
1345
/*
 * Account for a completed negative hit: fire the hit__negative lookup probe
 * and bump numneghits.
 */
static void
cache_neg_hit_finish(struct namecache *ncp)
{

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
	counter_u64_add(numneghits, 1);
}
1353
1354 /*
1355 * Move a negative entry to the hot list.
1356 */
1357 static void
cache_neg_promote_locked(struct namecache * ncp)1358 cache_neg_promote_locked(struct namecache *ncp)
1359 {
1360 struct neglist *nl;
1361 struct negstate *ns;
1362
1363 ns = NCP2NEGSTATE(ncp);
1364 nl = NCP2NEGLIST(ncp);
1365 mtx_assert(&nl->nl_lock, MA_OWNED);
1366 if ((ns->neg_flag & NEG_HOT) == 0) {
1367 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1368 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
1369 nl->nl_hotnum++;
1370 ns->neg_flag |= NEG_HOT;
1371 }
1372 }
1373
/*
 * Move a hot negative entry to the cold list.
 *
 * The entry's neglist lock must be held and the entry must be hot (both
 * asserted). The hit counter is reset, so the entry has to collect hits
 * again before cache_neg_hit_prep() reports it as due for promotion.
 */
static void
cache_neg_demote_locked(struct namecache *ncp)
{
	struct neglist *nl;
	struct negstate *ns;

	ns = NCP2NEGSTATE(ncp);
	nl = NCP2NEGLIST(ncp);
	mtx_assert(&nl->nl_lock, MA_OWNED);
	MPASS(ns->neg_flag & NEG_HOT);
	TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
	TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
	nl->nl_hotnum--;
	ns->neg_flag &= ~NEG_HOT;
	atomic_store_char(&ns->neg_hit, 0);
}
1393
1394 /*
1395 * Move a negative entry to the hot list if it matches the lookup.
1396 *
1397 * We have to take locks, but they may be contended and in the worst
1398 * case we may need to go off CPU. We don't want to spin within the
1399 * smr section and we can't block with it. Exiting the section means
1400 * the found entry could have been evicted. We are going to look it
1401 * up again.
1402 */
1403 static bool
cache_neg_promote_cond(struct vnode * dvp,struct componentname * cnp,struct namecache * oncp,uint32_t hash)1404 cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
1405 struct namecache *oncp, uint32_t hash)
1406 {
1407 struct namecache *ncp;
1408 struct neglist *nl;
1409 u_char nc_flag;
1410
1411 nl = NCP2NEGLIST(oncp);
1412
1413 mtx_lock(&nl->nl_lock);
1414 /*
1415 * For hash iteration.
1416 */
1417 vfs_smr_enter();
1418
1419 /*
1420 * Avoid all surprises by only succeeding if we got the same entry and
1421 * bailing completely otherwise.
1422 * XXX There are no provisions to keep the vnode around, meaning we may
1423 * end up promoting a negative entry for a *new* vnode and returning
1424 * ENOENT on its account. This is the error we want to return anyway
1425 * and promotion is harmless.
1426 *
1427 * In particular at this point there can be a new ncp which matches the
1428 * search but hashes to a different neglist.
1429 */
1430 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1431 if (ncp == oncp)
1432 break;
1433 }
1434
1435 /*
1436 * No match to begin with.
1437 */
1438 if (__predict_false(ncp == NULL)) {
1439 goto out_abort;
1440 }
1441
1442 /*
1443 * The newly found entry may be something different...
1444 */
1445 if (!cache_ncp_match(ncp, dvp, cnp)) {
1446 goto out_abort;
1447 }
1448
1449 /*
1450 * ... and not even negative.
1451 */
1452 nc_flag = atomic_load_char(&ncp->nc_flag);
1453 if ((nc_flag & NCF_NEGATIVE) == 0) {
1454 goto out_abort;
1455 }
1456
1457 if (!cache_ncp_canuse(ncp)) {
1458 goto out_abort;
1459 }
1460
1461 cache_neg_promote_locked(ncp);
1462 cache_neg_hit_finish(ncp);
1463 vfs_smr_exit();
1464 mtx_unlock(&nl->nl_lock);
1465 return (true);
1466 out_abort:
1467 vfs_smr_exit();
1468 mtx_unlock(&nl->nl_lock);
1469 return (false);
1470 }
1471
/*
 * Promote a negative entry to the hot list, taking and dropping the
 * entry's neglist lock around cache_neg_promote_locked().
 */
static void
cache_neg_promote(struct namecache *ncp)
{
	struct neglist *nl;

	nl = NCP2NEGLIST(ncp);
	mtx_lock(&nl->nl_lock);
	cache_neg_promote_locked(ncp);
	mtx_unlock(&nl->nl_lock);
}
1482
/*
 * Add a negative entry to the tail of its neglist's cold list and bump
 * numneg.
 *
 * The entry must be negative and its bucket lock held (both asserted); the
 * neglist lock is taken internally.
 */
static void
cache_neg_insert(struct namecache *ncp)
{
	struct neglist *nl;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_bucket_locked(ncp);
	nl = NCP2NEGLIST(ncp);
	mtx_lock(&nl->nl_lock);
	TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
	mtx_unlock(&nl->nl_lock);
	atomic_add_long(&numneg, 1);
}
1496
1497 static void
cache_neg_remove(struct namecache * ncp)1498 cache_neg_remove(struct namecache *ncp)
1499 {
1500 struct neglist *nl;
1501 struct negstate *ns;
1502
1503 cache_assert_bucket_locked(ncp);
1504 nl = NCP2NEGLIST(ncp);
1505 ns = NCP2NEGSTATE(ncp);
1506 mtx_lock(&nl->nl_lock);
1507 if ((ns->neg_flag & NEG_HOT) != 0) {
1508 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1509 nl->nl_hotnum--;
1510 } else {
1511 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1512 }
1513 mtx_unlock(&nl->nl_lock);
1514 atomic_subtract_long(&numneg, 1);
1515 }
1516
1517 static struct neglist *
cache_neg_evict_select_list(void)1518 cache_neg_evict_select_list(void)
1519 {
1520 struct neglist *nl;
1521 u_int c;
1522
1523 c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
1524 nl = &neglists[c % numneglists];
1525 if (!mtx_trylock(&nl->nl_evict_lock)) {
1526 counter_u64_add(neg_evict_skipped_contended, 1);
1527 return (NULL);
1528 }
1529 return (nl);
1530 }
1531
1532 static struct namecache *
cache_neg_evict_select_entry(struct neglist * nl)1533 cache_neg_evict_select_entry(struct neglist *nl)
1534 {
1535 struct namecache *ncp, *lncp;
1536 struct negstate *ns, *lns;
1537 int i;
1538
1539 mtx_assert(&nl->nl_evict_lock, MA_OWNED);
1540 mtx_assert(&nl->nl_lock, MA_OWNED);
1541 ncp = TAILQ_FIRST(&nl->nl_list);
1542 if (ncp == NULL)
1543 return (NULL);
1544 lncp = ncp;
1545 lns = NCP2NEGSTATE(lncp);
1546 for (i = 1; i < 4; i++) {
1547 ncp = TAILQ_NEXT(ncp, nc_dst);
1548 if (ncp == NULL)
1549 break;
1550 ns = NCP2NEGSTATE(ncp);
1551 if (ns->neg_hit < lns->neg_hit) {
1552 lncp = ncp;
1553 lns = ns;
1554 }
1555 }
1556 return (lncp);
1557 }
1558
/*
 * Try to evict one negative entry.
 *
 * Outline:
 * 1. select a neglist (fails if its evict lock is contended),
 * 2. demote the first hot entry of that list, if any,
 * 3. select a victim from the cold list,
 * 4. drop the neglist locks, take the victim's directory vnode lock and
 *    bucket lock and look the victim up again in its hash chain,
 * 5. if it is still there, zap it and free it once all locks are dropped.
 *
 * Returns true if an entry was evicted. The failure reasons are counted in
 * neg_evict_skipped_{contended,empty,missed}.
 */
static bool
cache_neg_evict(void)
{
	struct namecache *ncp, *ncp2;
	struct neglist *nl;
	struct vnode *dvp;
	struct mtx *dvlp;
	struct mtx *blp;
	uint32_t hash;
	u_char nlen;
	bool evicted;

	nl = cache_neg_evict_select_list();
	if (nl == NULL) {
		return (false);
	}

	mtx_lock(&nl->nl_lock);
	ncp = TAILQ_FIRST(&nl->nl_hotlist);
	if (ncp != NULL) {
		cache_neg_demote_locked(ncp);
	}
	ncp = cache_neg_evict_select_entry(nl);
	if (ncp == NULL) {
		counter_u64_add(neg_evict_skipped_empty, 1);
		mtx_unlock(&nl->nl_lock);
		mtx_unlock(&nl->nl_evict_lock);
		return (false);
	}
	/*
	 * Snapshot everything needed to find the entry again while the
	 * neglist lock still keeps it on the list.
	 */
	nlen = ncp->nc_nlen;
	dvp = ncp->nc_dvp;
	hash = cache_get_hash(ncp->nc_name, nlen, dvp);
	dvlp = VP2VNODELOCK(dvp);
	blp = HASH2BUCKETLOCK(hash);
	mtx_unlock(&nl->nl_lock);
	mtx_unlock(&nl->nl_evict_lock);
	mtx_lock(dvlp);
	mtx_lock(blp);
	/*
	 * Note that since all locks were dropped above, the entry may be
	 * gone or reallocated to be something else.
	 */
	CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
		if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
		    ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
			break;
	}
	if (ncp2 == NULL) {
		counter_u64_add(neg_evict_skipped_missed, 1);
		/* Nothing to free below. */
		ncp = NULL;
		evicted = false;
	} else {
		MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
		MPASS(blp == NCP2BUCKETLOCK(ncp));
		SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_zap_locked(ncp);
		counter_u64_add(neg_evicted, 1);
		evicted = true;
	}
	mtx_unlock(blp);
	mtx_unlock(dvlp);
	/* cache_free() is only called once all locks are dropped. */
	if (ncp != NULL)
		cache_free(ncp);
	return (evicted);
}
1625
1626 /*
1627 * Maybe evict a negative entry to create more room.
1628 *
1629 * The ncnegfactor parameter limits what fraction of the total count
1630 * can comprise of negative entries. However, if the cache is just
1631 * warming up this leads to excessive evictions. As such, ncnegminpct
1632 * (recomputed to neg_min) dictates whether the above should be
1633 * applied.
1634 *
1635 * Try evicting if the cache is close to full capacity regardless of
1636 * other considerations.
1637 */
1638 static bool
cache_neg_evict_cond(u_long lnumcache)1639 cache_neg_evict_cond(u_long lnumcache)
1640 {
1641 u_long lnumneg;
1642
1643 if (ncsize - 1000 < lnumcache)
1644 goto out_evict;
1645 lnumneg = atomic_load_long(&numneg);
1646 if (lnumneg < neg_min)
1647 return (false);
1648 if (lnumneg * ncnegfactor < lnumcache)
1649 return (false);
1650 out_evict:
1651 return (cache_neg_evict());
1652 }
1653
/*
 * cache_zap_locked():
 *
 *   Removes a namecache entry from cache, whether it contains an actual
 *   pointer to a vnode or if it is just a negative cache entry.
 *
 *   The caller must hold the directory vnode lock, the bucket lock and, for
 *   positive entries, the target vnode lock (all asserted). The entry is
 *   only unlinked here; the caller frees it afterwards, see cache_free().
 */
static void
cache_zap_locked(struct namecache *ncp)
{
	struct nchashhead *ncpp;
	struct vnode *dvp, *vp;

	dvp = ncp->nc_dvp;
	vp = ncp->nc_vp;

	if (!(ncp->nc_flag & NCF_NEGATIVE))
		cache_assert_vnode_locked(vp);
	cache_assert_vnode_locked(dvp);
	cache_assert_bucket_locked(ncp);

	cache_ncp_invalidate(ncp);

	/* Unlink from the hash chain. */
	ncpp = NCP2BUCKET(ncp);
	CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
		/* Positive entry: unlink from the target vnode. */
		SDT_PROBE3(vfs, namecache, zap, done, dvp, ncp->nc_name, vp);
		TAILQ_REMOVE(&vp->v_cache_dst, ncp, nc_dst);
		if (ncp == vp->v_cache_dd) {
			atomic_store_ptr(&vp->v_cache_dd, NULL);
		}
	} else {
		/* Negative entry: unlink from its neglist. */
		SDT_PROBE2(vfs, namecache, zap_negative, done, dvp, ncp->nc_name);
		cache_neg_remove(ncp);
	}
	if (ncp->nc_flag & NCF_ISDOTDOT) {
		if (ncp == dvp->v_cache_dd) {
			atomic_store_ptr(&dvp->v_cache_dd, NULL);
		}
	} else {
		LIST_REMOVE(ncp, nc_src);
		/*
		 * This was the last entry for the directory: flag the entry
		 * so that freeing it also drops the hold on dvp.
		 */
		if (LIST_EMPTY(&dvp->v_cache_src)) {
			ncp->nc_flag |= NCF_DVDROP;
		}
	}
}
1699
/*
 * Zap a negative entry of directory vp with the vnode lock of vp already
 * held by the caller (asserted).
 *
 * Only the bucket lock is needed in addition; it is taken and dropped here.
 * The vnode lock is left held.
 */
static void
cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
{
	struct mtx *blp;

	MPASS(ncp->nc_dvp == vp);
	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_vnode_locked(vp);

	blp = NCP2BUCKETLOCK(ncp);
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
}
1714
/*
 * Zap an entry with the vnode lock of vp (either the directory or the
 * target of the entry) already held by the caller.
 *
 * *vlpp optionally names a second vnode lock the caller is holding, typically
 * left over from a previous failed attempt. It is either reused or released
 * here and *vlpp is reset to NULL.
 *
 * Returns true if the entry was zapped; the lock of vp is still held.
 *
 * Returns false if the other vnode lock could not be acquired without
 * violating the lock order. In that case the lock of vp was dropped and both
 * vnode locks were reacquired in order; the lock which is not the one of vp
 * is returned in *vlpp. The entry may have changed in the meantime, so the
 * caller has to revalidate and retry.
 */
static bool
cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
    struct mtx **vlpp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct mtx *blp;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	if (ncp->nc_flag & NCF_NEGATIVE) {
		/* Negative entries only need the one vnode lock. */
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		return (true);
	}

	pvlp = VP2VNODELOCK(vp);
	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);

	if (*vlpp == vlp1 || *vlpp == vlp2) {
		/* The extra lock passed in is one we need: reuse it. */
		to_unlock = *vlpp;
		*vlpp = NULL;
	} else {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_sort_vnodes(&vlp1, &vlp2);
		if (vlp1 == pvlp) {
			/* We hold the lower lock, blocking on the other is fine. */
			mtx_lock(vlp2);
			to_unlock = vlp2;
		} else {
			/* We hold the higher lock, only trylock is allowed. */
			if (!mtx_trylock(vlp1))
				goto out_relock;
			to_unlock = vlp1;
		}
	}
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
	if (to_unlock != NULL)
		mtx_unlock(to_unlock);
	return (true);

out_relock:
	/*
	 * Here vlp2 is the lock of vp held by the caller. Drop it and take
	 * both locks in the sorted order.
	 */
	mtx_unlock(vlp2);
	mtx_lock(vlp1);
	mtx_lock(vlp2);
	MPASS(*vlpp == NULL);
	*vlpp = vlp1;
	return (false);
}
1772
/*
 * If trylocking failed we can get here. We know enough to take all needed locks
 * in the right order and re-lookup the entry.
 *
 * Called with the bucket lock not held (asserted). dvlp and vlp are the vnode
 * locks computed by the caller for the entry as it looked back then; vlp is
 * NULL for negative entries.
 *
 * Returns 0 if the entry was found again with a matching target vnode lock
 * and zapped, EAGAIN otherwise. All locks are dropped on return.
 */
static int
cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
    struct mtx *blp)
{
	struct namecache *rncp;
	struct mtx *rvlp;

	cache_assert_bucket_unlocked(ncp);

	cache_sort_vnodes(&dvlp, &vlp);
	cache_lock_vnodes(dvlp, vlp);
	mtx_lock(blp);
	/* ncp is only dereferenced after it is found in the chain again. */
	CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
		if (rncp == ncp && cache_ncp_match(rncp, dvp, cnp))
			break;
	}
	if (rncp == NULL)
		goto out_mismatch;

	if (!(ncp->nc_flag & NCF_NEGATIVE))
		rvlp = VP2VNODELOCK(rncp->nc_vp);
	else
		rvlp = NULL;
	/* The target changed in a way which requires a different lock. */
	if (rvlp != vlp)
		goto out_mismatch;

	cache_zap_locked(rncp);
	mtx_unlock(blp);
	cache_unlock_vnodes(dvlp, vlp);
	atomic_add_long(&zap_bucket_relock_success, 1);
	return (0);

out_mismatch:
	mtx_unlock(blp);
	cache_unlock_vnodes(dvlp, vlp);
	return (EAGAIN);
}
1815
/*
 * Zap an entry with its bucket lock (blp) already held (asserted).
 *
 * The vnode locks are first tried without blocking. If that fails the bucket
 * lock is dropped and cache_zap_unlocked_bucket() takes over.
 *
 * Returns 0 if the entry was zapped, otherwise the error from
 * cache_zap_unlocked_bucket(). The bucket lock is dropped on return either
 * way.
 */
static int __noinline
cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
    uint32_t hash, struct mtx *blp)
{
	struct mtx *dvlp, *vlp;
	struct vnode *dvp;

	cache_assert_bucket_locked(ncp);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp);
		mtx_unlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	/* Read nc_dvp while the bucket lock is still held. */
	dvp = ncp->nc_dvp;
	mtx_unlock(blp);
	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
}
1840
/*
 * Remove the entry for the name described by cnp in directory dvp, if any.
 *
 * ".." is handled separately through dvp->v_cache_dd, all other names are
 * looked up in the hash.
 *
 * Returns 1 if an entry was found and removed (or, for "..", if v_cache_dd
 * was cleared) and 0 if there was nothing to remove.
 */
static __noinline int
cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
{
	struct namecache *ncp;
	struct mtx *blp;
	struct mtx *dvlp, *dvlp2;
	uint32_t hash;
	int error;

	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		dvlp = VP2VNODELOCK(dvp);
		dvlp2 = NULL;
		mtx_lock(dvlp);
retry_dotdot:
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			/*
			 * A false return means the locks were dropped and
			 * retaken, so v_cache_dd has to be read again.
			 */
			if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
				goto retry_dotdot;
			MPASS(dvp->v_cache_dd == NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			cache_free(ncp);
		} else {
			/*
			 * v_cache_dd does not point at a ".." entry: only
			 * clear the pointer, the entry itself stays.
			 */
			atomic_store_ptr(&dvp->v_cache_dd, NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
		}
		SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
		return (1);
	}

	/*
	 * XXX note that access here is completely unlocked with no provisions
	 * to keep the hash allocated. If one is sufficiently unlucky a
	 * parallel cache resize can reallocate the hash, unmap backing pages
	 * and cause the empty check below to fault.
	 *
	 * Fixing this has epsilon priority, but can be done with no overhead
	 * for this codepath with sufficient effort.
	 */
	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
retry:
	/* Unlocked check for an empty chain, skipping the bucket lock. */
	if (CK_SLIST_EMPTY(NCHHASH(hash)))
		goto out_no_entry;

	mtx_lock(blp);
	ncp = cache_ncp_find(dvp, cnp, hash);
	if (ncp == NULL) {
		mtx_unlock(blp);
		goto out_no_entry;
	}

	/* Drops blp regardless of the outcome. */
	error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
	if (__predict_false(error != 0)) {
		atomic_add_long(&zap_bucket_fail, 1);
		goto retry;
	}
	counter_u64_add(numposzaps, 1);
	SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
	cache_free(ncp);
	return (1);
out_no_entry:
	counter_u64_add(nummisszap, 1);
	SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
	return (0);
}
1918
1919 static int __noinline
cache_lookup_dot(struct vnode * dvp,struct vnode ** vpp,struct componentname * cnp,struct timespec * tsp,int * ticksp)1920 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1921 struct timespec *tsp, int *ticksp)
1922 {
1923 int ltype;
1924
1925 *vpp = dvp;
1926 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1927 if (tsp != NULL)
1928 timespecclear(tsp);
1929 if (ticksp != NULL)
1930 *ticksp = ticks;
1931 vrefact(*vpp);
1932 /*
1933 * When we lookup "." we still can be asked to lock it
1934 * differently...
1935 */
1936 ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1937 if (ltype != VOP_ISLOCKED(*vpp)) {
1938 if (ltype == LK_EXCLUSIVE) {
1939 vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1940 if (VN_IS_DOOMED((*vpp))) {
1941 /* forced unmount */
1942 vrele(*vpp);
1943 *vpp = NULL;
1944 return (ENOENT);
1945 }
1946 } else
1947 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1948 }
1949 return (-1);
1950 }
1951
1952 static int __noinline
cache_lookup_dotdot(struct vnode * dvp,struct vnode ** vpp,struct componentname * cnp,struct timespec * tsp,int * ticksp)1953 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1954 struct timespec *tsp, int *ticksp)
1955 {
1956 struct namecache_ts *ncp_ts;
1957 struct namecache *ncp;
1958 struct mtx *dvlp;
1959 enum vgetstate vs;
1960 int error, ltype;
1961 bool whiteout;
1962
1963 MPASS((cnp->cn_flags & ISDOTDOT) != 0);
1964
1965 if ((cnp->cn_flags & MAKEENTRY) == 0) {
1966 cache_remove_cnp(dvp, cnp);
1967 return (0);
1968 }
1969
1970 retry:
1971 dvlp = VP2VNODELOCK(dvp);
1972 mtx_lock(dvlp);
1973 ncp = dvp->v_cache_dd;
1974 if (ncp == NULL) {
1975 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, "..");
1976 mtx_unlock(dvlp);
1977 return (0);
1978 }
1979 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1980 if (ncp->nc_flag & NCF_NEGATIVE)
1981 *vpp = NULL;
1982 else
1983 *vpp = ncp->nc_vp;
1984 } else
1985 *vpp = ncp->nc_dvp;
1986 if (*vpp == NULL)
1987 goto negative_success;
1988 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
1989 cache_out_ts(ncp, tsp, ticksp);
1990 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1991 NCF_DTS && tsp != NULL) {
1992 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1993 *tsp = ncp_ts->nc_dotdottime;
1994 }
1995
1996 MPASS(dvp != *vpp);
1997 ltype = VOP_ISLOCKED(dvp);
1998 VOP_UNLOCK(dvp);
1999 vs = vget_prep(*vpp);
2000 mtx_unlock(dvlp);
2001 error = vget_finish(*vpp, cnp->cn_lkflags, vs);
2002 vn_lock(dvp, ltype | LK_RETRY);
2003 if (VN_IS_DOOMED(dvp)) {
2004 if (error == 0)
2005 vput(*vpp);
2006 *vpp = NULL;
2007 return (ENOENT);
2008 }
2009 if (error) {
2010 *vpp = NULL;
2011 goto retry;
2012 }
2013 return (-1);
2014 negative_success:
2015 if (__predict_false(cnp->cn_nameiop == CREATE)) {
2016 if (cnp->cn_flags & ISLASTCN) {
2017 counter_u64_add(numnegzaps, 1);
2018 cache_zap_negative_locked_vnode_kl(ncp, dvp);
2019 mtx_unlock(dvlp);
2020 cache_free(ncp);
2021 return (0);
2022 }
2023 }
2024
2025 whiteout = (ncp->nc_flag & NCF_WHITE);
2026 cache_out_ts(ncp, tsp, ticksp);
2027 if (cache_neg_hit_prep(ncp))
2028 cache_neg_promote(ncp);
2029 else
2030 cache_neg_hit_finish(ncp);
2031 mtx_unlock(dvlp);
2032 if (whiteout)
2033 cnp->cn_flags |= ISWHITEOUT;
2034 return (ENOENT);
2035 }
2036
2037 /**
2038 * Lookup a name in the name cache
2039 *
2040 * # Arguments
2041 *
2042 * - dvp: Parent directory in which to search.
2043 * - vpp: Return argument. Will contain desired vnode on cache hit.
2044 * - cnp: Parameters of the name search. The most interesting bits of
2045 * the cn_flags field have the following meanings:
2046 * - MAKEENTRY: If clear, free an entry from the cache rather than look
2047 * it up.
2048 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".."
2049 * - tsp: Return storage for cache timestamp. On a successful (positive
2050 * or negative) lookup, tsp will be filled with any timespec that
2051 * was stored when this cache entry was created. However, it will
2052 * be clear for "." entries.
 * - ticksp:	Return storage for alternate cache timestamp.  On a successful
 *   		(positive or negative) lookup, it will contain the ticks value
 *   		that was current when the cache entry was created, unless cnp
 *   		was ".".
 *
 * Either both tsp and ticksp have to be provided or neither of them.
2059 *
2060 * # Returns
2061 *
2062 * - -1: A positive cache hit. vpp will contain the desired vnode.
2063 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due
2064 * to a forced unmount. vpp will not be modified. If the entry
2065 * is a whiteout, then the ISWHITEOUT flag will be set in
2066 * cnp->cn_flags.
2067 * - 0: A cache miss. vpp will not be modified.
2068 *
2069 * # Locking
2070 *
2071 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up
2072 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the
2073 * lock is not recursively acquired.
2074 */
2075 static int __noinline
cache_lookup_fallback(struct vnode * dvp,struct vnode ** vpp,struct componentname * cnp,struct timespec * tsp,int * ticksp)2076 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
2077 struct timespec *tsp, int *ticksp)
2078 {
2079 struct namecache *ncp;
2080 struct mtx *blp;
2081 uint32_t hash;
2082 enum vgetstate vs;
2083 int error;
2084 bool whiteout;
2085
2086 MPASS((cnp->cn_flags & ISDOTDOT) == 0);
2087 MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0);
2088
2089 retry:
2090 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
2091 blp = HASH2BUCKETLOCK(hash);
2092 mtx_lock(blp);
2093
2094 ncp = cache_ncp_find(dvp, cnp, hash);
2095 if (__predict_false(ncp == NULL)) {
2096 mtx_unlock(blp);
2097 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr);
2098 counter_u64_add(nummiss, 1);
2099 return (0);
2100 }
2101
2102 if (ncp->nc_flag & NCF_NEGATIVE)
2103 goto negative_success;
2104
2105 counter_u64_add(numposhits, 1);
2106 *vpp = ncp->nc_vp;
2107 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
2108 cache_out_ts(ncp, tsp, ticksp);
2109 MPASS(dvp != *vpp);
2110 vs = vget_prep(*vpp);
2111 mtx_unlock(blp);
2112 error = vget_finish(*vpp, cnp->cn_lkflags, vs);
2113 if (error) {
2114 *vpp = NULL;
2115 goto retry;
2116 }
2117 return (-1);
2118 negative_success:
2119 /*
2120 * We don't get here with regular lookup apart from corner cases.
2121 */
2122 if (__predict_true(cnp->cn_nameiop == CREATE)) {
2123 if (cnp->cn_flags & ISLASTCN) {
2124 counter_u64_add(numnegzaps, 1);
2125 error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
2126 if (__predict_false(error != 0)) {
2127 atomic_add_long(&zap_bucket_fail2, 1);
2128 goto retry;
2129 }
2130 cache_free(ncp);
2131 return (0);
2132 }
2133 }
2134
2135 whiteout = (ncp->nc_flag & NCF_WHITE);
2136 cache_out_ts(ncp, tsp, ticksp);
2137 if (cache_neg_hit_prep(ncp))
2138 cache_neg_promote(ncp);
2139 else
2140 cache_neg_hit_finish(ncp);
2141 mtx_unlock(blp);
2142 if (whiteout)
2143 cnp->cn_flags |= ISWHITEOUT;
2144 return (ENOENT);
2145 }
2146
/*
 * Name cache lookup entry point.
 *
 * "." and ".." are dispatched to their dedicated handlers.  Any other name
 * is first searched for without taking the bucket lock, inside a
 * vfs_smr_enter()/vfs_smr_exit() section.  Whenever the entry found cannot
 * be used that way, the lookup is redone by cache_lookup_fallback().
 *
 * Returns -1 on a positive hit, ENOENT on a negative hit and 0 on a miss.
 */
int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache *ncp;
	uint32_t hash;
	enum vgetstate vs;
	int error;
	bool whiteout, neg_promote;
	u_short nc_flag;

	/* The timestamp outputs come as a pair. */
	MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));

#ifdef DEBUG_CACHE
	if (__predict_false(!doingcache)) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}
#endif

	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
		if (cnp->cn_namelen == 1)
			return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
			return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
	}

	MPASS((cnp->cn_flags & ISDOTDOT) == 0);

	/*
	 * With neither MAKEENTRY nor NC_KEEPPOSENTRY set the caller wants the
	 * entry gone; remove it and report a miss.
	 */
	if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) {
		cache_remove_cnp(dvp, cnp);
		return (0);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	vfs_smr_enter();

	ncp = cache_ncp_find(dvp, cnp, hash);
	if (__predict_false(ncp == NULL)) {
		vfs_smr_exit();
		SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	nc_flag = atomic_load_char(&ncp->nc_flag);
	if (nc_flag & NCF_NEGATIVE)
		goto negative_success;

	counter_u64_add(numposhits, 1);
	*vpp = ncp->nc_vp;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	MPASS(dvp != *vpp);
	/*
	 * The entry fields were read without a lock; the check is made only
	 * after reading them.
	 * NOTE(review): presumably cache_ncp_canuse() detects an entry which
	 * is being constructed or torn down -- confirm against its definition.
	 */
	if (!cache_ncp_canuse(ncp)) {
		vfs_smr_exit();
		*vpp = NULL;
		goto out_fallback;
	}
	vs = vget_prep_smr(*vpp);
	vfs_smr_exit();
	if (__predict_false(vs == VGET_NONE)) {
		*vpp = NULL;
		goto out_fallback;
	}
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	if (error) {
		*vpp = NULL;
		goto out_fallback;
	}
	return (-1);
negative_success:
	/*
	 * A CREATE of the last component is not served from here; the locked
	 * fallback zaps the negative entry in that case.
	 */
	if (cnp->cn_nameiop == CREATE) {
		if (cnp->cn_flags & ISLASTCN) {
			vfs_smr_exit();
			goto out_fallback;
		}
	}

	cache_out_ts(ncp, tsp, ticksp);
	whiteout = (atomic_load_char(&ncp->nc_flag) & NCF_WHITE);
	neg_promote = cache_neg_hit_prep(ncp);
	if (!cache_ncp_canuse(ncp)) {
		/* Undo the hit preparation before retrying with locks. */
		cache_neg_hit_abort(ncp);
		vfs_smr_exit();
		goto out_fallback;
	}
	if (neg_promote) {
		/* Promotion is performed outside of the SMR section. */
		vfs_smr_exit();
		if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
			goto out_fallback;
	} else {
		cache_neg_hit_finish(ncp);
		vfs_smr_exit();
	}
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
out_fallback:
	return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
}
2248
/*
 * Set of locks held while entering a name into the cache: up to three vnode
 * locks and up to two bucket locks.  A NULL slot means no lock is recorded
 * in it.
 */
struct celockstate {
	struct mtx *vlp[3];	/* vnode locks */
	struct mtx *blp[2];	/* bucket locks */
};
/* The locking routines below address the slots by fixed index. */
CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
2255
2256 static inline void
cache_celockstate_init(struct celockstate * cel)2257 cache_celockstate_init(struct celockstate *cel)
2258 {
2259
2260 bzero(cel, sizeof(*cel));
2261 }
2262
2263 static void
cache_lock_vnodes_cel(struct celockstate * cel,struct vnode * vp,struct vnode * dvp)2264 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
2265 struct vnode *dvp)
2266 {
2267 struct mtx *vlp1, *vlp2;
2268
2269 MPASS(cel->vlp[0] == NULL);
2270 MPASS(cel->vlp[1] == NULL);
2271 MPASS(cel->vlp[2] == NULL);
2272
2273 MPASS(vp != NULL || dvp != NULL);
2274
2275 vlp1 = VP2VNODELOCK(vp);
2276 vlp2 = VP2VNODELOCK(dvp);
2277 cache_sort_vnodes(&vlp1, &vlp2);
2278
2279 if (vlp1 != NULL) {
2280 mtx_lock(vlp1);
2281 cel->vlp[0] = vlp1;
2282 }
2283 mtx_lock(vlp2);
2284 cel->vlp[1] = vlp2;
2285 }
2286
2287 static void
cache_unlock_vnodes_cel(struct celockstate * cel)2288 cache_unlock_vnodes_cel(struct celockstate *cel)
2289 {
2290
2291 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
2292
2293 if (cel->vlp[0] != NULL)
2294 mtx_unlock(cel->vlp[0]);
2295 if (cel->vlp[1] != NULL)
2296 mtx_unlock(cel->vlp[1]);
2297 if (cel->vlp[2] != NULL)
2298 mtx_unlock(cel->vlp[2]);
2299 }
2300
2301 static bool
cache_lock_vnodes_cel_3(struct celockstate * cel,struct vnode * vp)2302 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
2303 {
2304 struct mtx *vlp;
2305 bool ret;
2306
2307 cache_assert_vlp_locked(cel->vlp[0]);
2308 cache_assert_vlp_locked(cel->vlp[1]);
2309 MPASS(cel->vlp[2] == NULL);
2310
2311 MPASS(vp != NULL);
2312 vlp = VP2VNODELOCK(vp);
2313
2314 ret = true;
2315 if (vlp >= cel->vlp[1]) {
2316 mtx_lock(vlp);
2317 } else {
2318 if (mtx_trylock(vlp))
2319 goto out;
2320 cache_unlock_vnodes_cel(cel);
2321 atomic_add_long(&cache_lock_vnodes_cel_3_failures, 1);
2322 if (vlp < cel->vlp[0]) {
2323 mtx_lock(vlp);
2324 mtx_lock(cel->vlp[0]);
2325 mtx_lock(cel->vlp[1]);
2326 } else {
2327 if (cel->vlp[0] != NULL)
2328 mtx_lock(cel->vlp[0]);
2329 mtx_lock(vlp);
2330 mtx_lock(cel->vlp[1]);
2331 }
2332 ret = false;
2333 }
2334 out:
2335 cel->vlp[2] = vlp;
2336 return (ret);
2337 }
2338
2339 static void
cache_lock_buckets_cel(struct celockstate * cel,struct mtx * blp1,struct mtx * blp2)2340 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
2341 struct mtx *blp2)
2342 {
2343
2344 MPASS(cel->blp[0] == NULL);
2345 MPASS(cel->blp[1] == NULL);
2346
2347 cache_sort_vnodes(&blp1, &blp2);
2348
2349 if (blp1 != NULL) {
2350 mtx_lock(blp1);
2351 cel->blp[0] = blp1;
2352 }
2353 mtx_lock(blp2);
2354 cel->blp[1] = blp2;
2355 }
2356
2357 static void
cache_unlock_buckets_cel(struct celockstate * cel)2358 cache_unlock_buckets_cel(struct celockstate *cel)
2359 {
2360
2361 if (cel->blp[0] != NULL)
2362 mtx_unlock(cel->blp[0]);
2363 mtx_unlock(cel->blp[1]);
2364 }
2365
/*
 * Lock part of the cache affected by the insertion.
 *
 * This means vnodelocks for dvp, vp and the relevant bucketlock.
 * However, insertion can result in removal of an old entry. In this
 * case we have an additional vnode and bucketlock pair to lock.
 *
 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
 * preserving the locking order (smaller address first).
 *
 * The locks taken are recorded in cel; hash selects the bucket lock of the
 * entry being inserted.
 */
static void
cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
	struct namecache *ncp;
	struct mtx *blps[2];
	u_char nc_flag;

	blps[0] = HASH2BUCKETLOCK(hash);
	for (;;) {
		blps[1] = NULL;
		cache_lock_vnodes_cel(cel, dvp, vp);
		/* Only a directory target is inspected for an old ".." entry. */
		if (vp == NULL || vp->v_type != VDIR)
			break;
		ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
		if (ncp == NULL)
			break;
		nc_flag = atomic_load_char(&ncp->nc_flag);
		if ((nc_flag & NCF_ISDOTDOT) == 0)
			break;
		MPASS(ncp->nc_dvp == vp);
		/* The old ".." entry needs its bucket locked as well. */
		blps[1] = NCP2BUCKETLOCK(ncp);
		/* A negative entry has no target vnode to lock. */
		if ((nc_flag & NCF_NEGATIVE) != 0)
			break;
		/* True means the locks already held were never dropped. */
		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
			break;
		/*
		 * All vnodes got re-locked. Re-validate the state and if
		 * nothing changed we are done. Otherwise restart.
		 */
		if (ncp == vp->v_cache_dd &&
		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
		    blps[1] == NCP2BUCKETLOCK(ncp) &&
		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
			break;
		/* Start over with a clean vnode lock state. */
		cache_unlock_vnodes_cel(cel);
		cel->vlp[0] = NULL;
		cel->vlp[1] = NULL;
		cel->vlp[2] = NULL;
	}
	cache_lock_buckets_cel(cel, blps[0], blps[1]);
}
2418
/*
 * Variant of cache_enter_lock() which inspects dvp->v_cache_dd rather than
 * vp->v_cache_dd when looking for an existing ".." entry whose target vnode
 * and bucket have to be locked as well.  There is no vnode type check here.
 *
 * The locks taken are recorded in cel.
 */
static void
cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
	struct namecache *ncp;
	struct mtx *blps[2];
	u_char nc_flag;

	blps[0] = HASH2BUCKETLOCK(hash);
	for (;;) {
		blps[1] = NULL;
		cache_lock_vnodes_cel(cel, dvp, vp);
		ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
		if (ncp == NULL)
			break;
		nc_flag = atomic_load_char(&ncp->nc_flag);
		if ((nc_flag & NCF_ISDOTDOT) == 0)
			break;
		MPASS(ncp->nc_dvp == dvp);
		/* The existing ".." entry needs its bucket locked as well. */
		blps[1] = NCP2BUCKETLOCK(ncp);
		/* A negative entry has no target vnode to lock. */
		if ((nc_flag & NCF_NEGATIVE) != 0)
			break;
		/* True means the locks already held were never dropped. */
		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
			break;
		/*
		 * The vnode locks were dropped and taken again.  If the state
		 * read earlier still holds we are done, otherwise restart.
		 */
		if (ncp == dvp->v_cache_dd &&
		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
		    blps[1] == NCP2BUCKETLOCK(ncp) &&
		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
			break;
		/* Start over with a clean vnode lock state. */
		cache_unlock_vnodes_cel(cel);
		cel->vlp[0] = NULL;
		cel->vlp[1] = NULL;
		cel->vlp[2] = NULL;
	}
	cache_lock_buckets_cel(cel, blps[0], blps[1]);
}
2455
/*
 * Release everything recorded in cel: the bucket locks first, then the vnode
 * locks.
 */
static void
cache_enter_unlock(struct celockstate *cel)
{

	cache_unlock_buckets_cel(cel);
	cache_unlock_vnodes_cel(cel);
}
2463
2464 static void __noinline
cache_enter_dotdot_prep(struct vnode * dvp,struct vnode * vp,struct componentname * cnp)2465 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
2466 struct componentname *cnp)
2467 {
2468 struct celockstate cel;
2469 struct namecache *ncp;
2470 uint32_t hash;
2471 int len;
2472
2473 if (atomic_load_ptr(&dvp->v_cache_dd) == NULL)
2474 return;
2475 len = cnp->cn_namelen;
2476 cache_celockstate_init(&cel);
2477 hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2478 cache_enter_lock_dd(&cel, dvp, vp, hash);
2479 ncp = dvp->v_cache_dd;
2480 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
2481 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
2482 cache_zap_locked(ncp);
2483 } else {
2484 ncp = NULL;
2485 }
2486 atomic_store_ptr(&dvp->v_cache_dd, NULL);
2487 cache_enter_unlock(&cel);
2488 if (ncp != NULL)
2489 cache_free(ncp);
2490 }
2491
/*
 * Add an entry to the cache.
 *
 * - dvp:	Directory the name lives in.
 * - vp:	Vnode the name resolves to, or NULL to add a negative entry.
 * - cnp:	The name being entered.  "." is never entered; ".." results in
 *		an entry flagged NCF_ISDOTDOT.
 * - tsp:	Optional timestamp to store in the entry.
 * - dtsp:	Optional ".." timestamp; only looked at when tsp is provided.
 *
 * Nothing is added if cache_alloc() fails, if an entry for the name already
 * exists, or if a ".." entry is requested while dvp->v_cache_dd is set.
 */
void
cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
    struct timespec *tsp, struct timespec *dtsp)
{
	struct celockstate cel;
	struct namecache *ncp, *n2, *ndd;
	struct namecache_ts *ncp_ts;
	uint32_t hash;
	int flag;
	int len;

	KASSERT(cnp->cn_namelen <= NAME_MAX,
	    ("%s: passed len %ld exceeds NAME_MAX (%d)", __func__, cnp->cn_namelen,
	    NAME_MAX));
	VNPASS(!VN_IS_DOOMED(dvp), dvp);
	VNPASS(dvp->v_type != VNON, dvp);
	if (vp != NULL) {
		VNPASS(!VN_IS_DOOMED(vp), vp);
		VNPASS(vp->v_type != VNON, vp);
	}
	/* dvp and vp may be the same vnode only for the "." name. */
	if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') {
		KASSERT(dvp == vp,
		    ("%s: different vnodes for dot entry (%p; %p)\n", __func__,
		    dvp, vp));
	} else {
		KASSERT(dvp != vp,
		    ("%s: same vnode for non-dot entry [%s] (%p)\n", __func__,
		    cnp->cn_nameptr, dvp));
	}

#ifdef DEBUG_CACHE
	if (__predict_false(!doingcache))
		return;
#endif

	flag = 0;
	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
		/* "." is not cached. */
		if (cnp->cn_namelen == 1)
			return;
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			cache_enter_dotdot_prep(dvp, vp, cnp);
			flag = NCF_ISDOTDOT;
		}
	}

	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
	if (ncp == NULL)
		return;

	cache_celockstate_init(&cel);
	ndd = NULL;
	ncp_ts = NULL;

	/*
	 * Calculate the hash key and setup as much of the new
	 * namecache entry as possible before acquiring the lock.
	 */
	ncp->nc_flag = flag | NCF_WIP;
	ncp->nc_vp = vp;
	if (vp == NULL)
		cache_neg_init(ncp);
	ncp->nc_dvp = dvp;
	if (tsp != NULL) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		ncp_ts->nc_time = *tsp;
		ncp_ts->nc_ticks = ticks;
		ncp_ts->nc_nc.nc_flag |= NCF_TS;
		if (dtsp != NULL) {
			ncp_ts->nc_dotdottime = *dtsp;
			ncp_ts->nc_nc.nc_flag |= NCF_DTS;
		}
	}
	len = ncp->nc_nlen = cnp->cn_namelen;
	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
	memcpy(ncp->nc_name, cnp->cn_nameptr, len);
	ncp->nc_name[len] = '\0';
	cache_enter_lock(&cel, dvp, vp, hash);

	/*
	 * See if this vnode or negative entry is already in the cache
	 * with this name.  This can happen with concurrent lookups of
	 * the same path name.
	 */
	n2 = cache_ncp_find(dvp, cnp, hash);
	if (n2 != NULL) {
		MPASS(cache_ncp_canuse(n2));
		if ((n2->nc_flag & NCF_NEGATIVE) != 0)
			KASSERT(vp == NULL,
			    ("%s: found entry pointing to a different vnode "
			    "(%p != %p); name [%s]",
			    __func__, NULL, vp, cnp->cn_nameptr));
		else
			KASSERT(n2->nc_vp == vp,
			    ("%s: found entry pointing to a different vnode "
			    "(%p != %p); name [%s]",
			    __func__, n2->nc_vp, vp, cnp->cn_nameptr));
		/*
		 * Entries are supposed to be immutable unless in the
		 * process of getting destroyed. Accommodating for
		 * changing timestamps is possible but not worth it.
		 * This should be harmless in terms of correctness, in
		 * the worst case resulting in an earlier expiration.
		 * Alternatively, the found entry can be replaced
		 * altogether.
		 */
		MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) ==
		    (ncp->nc_flag & (NCF_TS | NCF_DTS)));
#if 0
		if (tsp != NULL) {
			KASSERT((n2->nc_flag & NCF_TS) != 0,
			    ("no NCF_TS"));
			n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
			n2_ts->nc_time = ncp_ts->nc_time;
			n2_ts->nc_ticks = ncp_ts->nc_ticks;
			if (dtsp != NULL) {
				n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
				n2_ts->nc_nc.nc_flag |= NCF_DTS;
			}
		}
#endif
		SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name,
		    vp);
		goto out_unlock_free;
	}

	if (flag == NCF_ISDOTDOT) {
		/*
		 * See if we are trying to add .. entry, but some other lookup
		 * has populated v_cache_dd pointer already.
		 */
		if (dvp->v_cache_dd != NULL)
			goto out_unlock_free;
		KASSERT(vp == NULL || vp->v_type == VDIR,
		    ("wrong vnode type %p", vp));
		/* Release fence ahead of publishing the entry pointer. */
		atomic_thread_fence_rel();
		atomic_store_ptr(&dvp->v_cache_dd, ncp);
	} else if (vp != NULL) {
		/*
		 * For this case, the cache entry maps both the
		 * directory name in it and the name ".." for the
		 * directory's parent.
		 */
		if ((ndd = vp->v_cache_dd) != NULL) {
			/* Only a dedicated ".." entry gets replaced. */
			if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
				cache_zap_locked(ndd);
			else
				ndd = NULL;
		}
		atomic_thread_fence_rel();
		atomic_store_ptr(&vp->v_cache_dd, ncp);
	}

	if (flag != NCF_ISDOTDOT) {
		/* The first entry on the source list holds the directory. */
		if (LIST_EMPTY(&dvp->v_cache_src)) {
			cache_hold_vnode(dvp);
		}
		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
	}

	/*
	 * If the entry is "negative", we place it into the
	 * "negative" cache queue, otherwise, we place it into the
	 * destination vnode's cache entries queue.
	 */
	if (vp != NULL) {
		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
		SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
		    vp);
	} else {
		if (cnp->cn_flags & ISWHITEOUT)
			atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_WHITE);
		cache_neg_insert(ncp);
		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
		    ncp->nc_name);
	}

	/*
	 * Insert the new namecache entry into the appropriate chain
	 * within the cache entries table.
	 */
	CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);

	atomic_thread_fence_rel();
	/*
	 * Mark the entry as fully constructed.
	 * It is immutable past this point until its removal.
	 */
	atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);

	cache_enter_unlock(&cel);
	/* The replaced ".." entry, if any, is freed with no locks held. */
	if (ndd != NULL)
		cache_free(ndd);
	return;
out_unlock_free:
	/* The new entry was never inserted; dispose of it. */
	cache_enter_unlock(&cel);
	cache_free(ncp);
	return;
}
2693
2694 /*
2695 * A variant of the above accepting flags.
2696 *
2697 * - VFS_CACHE_DROPOLD -- if a conflicting entry is found, drop it.
2698 *
2699 * TODO: this routine is a hack. It blindly removes the old entry, even if it
2700 * happens to match and it is doing it in an inefficient manner. It was added
2701 * to accommodate NFS which runs into a case where the target for a given name
2702 * may change from under it. Note this does nothing to solve the following
2703 * race: 2 callers of cache_enter_time_flags pass a different target vnode for
2704 * the same [dvp, cnp]. It may be argued that code doing this is broken.
2705 */
2706 void
cache_enter_time_flags(struct vnode * dvp,struct vnode * vp,struct componentname * cnp,struct timespec * tsp,struct timespec * dtsp,int flags)2707 cache_enter_time_flags(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
2708 struct timespec *tsp, struct timespec *dtsp, int flags)
2709 {
2710
2711 MPASS((flags & ~(VFS_CACHE_DROPOLD)) == 0);
2712
2713 if (flags & VFS_CACHE_DROPOLD)
2714 cache_remove_cnp(dvp, cnp);
2715 cache_enter_time(dvp, vp, cnp, tsp, dtsp);
2716 }
2717
/*
 * Return the smallest power of two which is strictly greater than val.
 */
static u_long
cache_roundup_2(u_long val)
{
	u_long pow2;

	pow2 = 1;
	while (pow2 <= val)
		pow2 <<= 1;

	return (pow2);
}
2728
2729 static struct nchashhead *
nchinittbl(u_long elements,u_long * hashmask)2730 nchinittbl(u_long elements, u_long *hashmask)
2731 {
2732 struct nchashhead *hashtbl;
2733 u_long hashsize, i;
2734
2735 hashsize = cache_roundup_2(elements) / 2;
2736
2737 hashtbl = malloc(hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2738 for (i = 0; i < hashsize; i++)
2739 CK_SLIST_INIT(&hashtbl[i]);
2740 *hashmask = hashsize - 1;
2741 return (hashtbl);
2742 }
2743
/*
 * Free a hash table allocated by nchinittbl().  Only the table itself is
 * released, not the entries linked into it.
 */
static void
ncfreetbl(struct nchashhead *hashtbl)
{

	free(hashtbl, M_VFSCACHE);
}
2750
2751 /*
2752 * Name cache initialization, from vfs_init() when we are booting
2753 */
2754 static void
nchinit(void * dummy __unused)2755 nchinit(void *dummy __unused)
2756 {
2757 u_int i;
2758
2759 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
2760 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2761 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
2762 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2763 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
2764 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2765 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
2766 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2767
2768 VFS_SMR_ZONE_SET(cache_zone_small);
2769 VFS_SMR_ZONE_SET(cache_zone_small_ts);
2770 VFS_SMR_ZONE_SET(cache_zone_large);
2771 VFS_SMR_ZONE_SET(cache_zone_large_ts);
2772
2773 ncsize = desiredvnodes * ncsizefactor;
2774 cache_recalc_neg_min();
2775 nchashtbl = nchinittbl(ncsize, &nchash);
2776 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2777 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2778 ncbuckethash = 7;
2779 if (ncbuckethash > nchash)
2780 ncbuckethash = nchash;
2781 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2782 M_WAITOK | M_ZERO);
2783 for (i = 0; i < numbucketlocks; i++)
2784 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
2785 ncvnodehash = ncbuckethash;
2786 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2787 M_WAITOK | M_ZERO);
2788 for (i = 0; i < numvnodelocks; i++)
2789 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2790
2791 for (i = 0; i < numneglists; i++) {
2792 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF);
2793 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2794 TAILQ_INIT(&neglists[i].nl_list);
2795 TAILQ_INIT(&neglists[i].nl_hotlist);
2796 }
2797 }
2798 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2799
/*
 * Initialize the name cache related fields of a vnode: the empty source and
 * destination entry lists and the ".." pointer, then let cache_prehash() do
 * its per-vnode setup.
 */
void
cache_vnode_init(struct vnode *vp)
{

	LIST_INIT(&vp->v_cache_src);
	TAILQ_INIT(&vp->v_cache_dst);
	vp->v_cache_dd = NULL;
	cache_prehash(vp);
}
2809
/*
 * Induce transient cache misses for lockless operation in cache_lookup() by
 * using a temporary hash table.
 *
 * This will force a fs lookup.
 *
 * Synchronisation is done in 2 steps, calling vfs_smr_synchronize each time
 * to observe all CPUs not performing the lookup.
 *
 * - temptbl:	The temporary table to switch to.
 * - temphash:	Its mask; has to be smaller than the current nchash.
 */
static void
cache_changesize_set_temp(struct nchashhead *temptbl, u_long temphash)
{

	MPASS(temphash < nchash);
	/*
	 * Change the size. The new size is smaller and can safely be used
	 * against the existing table. All lookups which now hash wrong will
	 * result in a cache miss, which all callers are supposed to know how
	 * to handle.
	 */
	atomic_store_long(&nchash, temphash);
	atomic_thread_fence_rel();
	vfs_smr_synchronize();
	/*
	 * At this point everyone sees the updated hash value, but they still
	 * see the old table.
	 */
	atomic_store_ptr(&nchashtbl, temptbl);
	atomic_thread_fence_rel();
	vfs_smr_synchronize();
	/*
	 * At this point everyone sees the updated table pointer and size pair.
	 */
}
2844
/*
 * Set the new hash table.
 *
 * Similarly to cache_changesize_set_temp(), this has to synchronize against
 * lockless operation in cache_lookup().
 *
 * - new_tbl:	The table to switch to.
 * - new_hash:	Its mask; has to be bigger than the current nchash.
 */
static void
cache_changesize_set_new(struct nchashhead *new_tbl, u_long new_hash)
{

	MPASS(nchash < new_hash);
	/*
	 * Change the pointer first. This wont result in out of bounds access
	 * since the temporary table is guaranteed to be smaller.
	 */
	atomic_store_ptr(&nchashtbl, new_tbl);
	atomic_thread_fence_rel();
	vfs_smr_synchronize();
	/*
	 * At this point everyone sees the updated pointer value, but they
	 * still see the old size.
	 */
	atomic_store_long(&nchash, new_hash);
	atomic_thread_fence_rel();
	vfs_smr_synchronize();
	/*
	 * At this point everyone sees the updated table pointer and size pair.
	 */
}
2874
2875 void
cache_changesize(u_long newmaxvnodes)2876 cache_changesize(u_long newmaxvnodes)
2877 {
2878 struct nchashhead *new_nchashtbl, *old_nchashtbl, *temptbl;
2879 u_long new_nchash, old_nchash, temphash;
2880 struct namecache *ncp;
2881 uint32_t hash;
2882 u_long newncsize;
2883 u_long i;
2884
2885 newncsize = newmaxvnodes * ncsizefactor;
2886 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2887 if (newmaxvnodes < numbucketlocks)
2888 newmaxvnodes = numbucketlocks;
2889
2890 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2891 /* If same hash table size, nothing to do */
2892 if (nchash == new_nchash) {
2893 ncfreetbl(new_nchashtbl);
2894 return;
2895 }
2896
2897 temptbl = nchinittbl(1, &temphash);
2898
2899 /*
2900 * Move everything from the old hash table to the new table.
2901 * None of the namecache entries in the table can be removed
2902 * because to do so, they have to be removed from the hash table.
2903 */
2904 cache_lock_all_vnodes();
2905 cache_lock_all_buckets();
2906 old_nchashtbl = nchashtbl;
2907 old_nchash = nchash;
2908 cache_changesize_set_temp(temptbl, temphash);
2909 for (i = 0; i <= old_nchash; i++) {
2910 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2911 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2912 ncp->nc_dvp);
2913 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2914 CK_SLIST_INSERT_HEAD(&new_nchashtbl[hash & new_nchash], ncp, nc_hash);
2915 }
2916 }
2917 ncsize = newncsize;
2918 cache_recalc_neg_min();
2919 cache_changesize_set_new(new_nchashtbl, new_nchash);
2920 cache_unlock_all_buckets();
2921 cache_unlock_all_vnodes();
2922 ncfreetbl(old_nchashtbl);
2923 ncfreetbl(temptbl);
2924 }
2925
/*
 * Remove all entries from and to a particular vnode.
 *
 * Every entry on vp's v_cache_src and v_cache_dst lists, plus the entry
 * referenced by v_cache_dd (if any), is zapped while holding vp's vnode lock
 * and collected on a local batch.  The batch is handed to cache_free_batch()
 * only after the locks have been dropped.
 */
static void
cache_purge_impl(struct vnode *vp)
{
	struct cache_freebatch batch;
	struct namecache *ncp;
	struct mtx *vlp, *vlp2;

	TAILQ_INIT(&batch);
	vlp = VP2VNODELOCK(vp);
	/*
	 * Secondary lock slot handed to cache_zap_locked_vnode_kl2(); whatever
	 * it leaves here is unlocked at the end of the function.
	 */
	vlp2 = NULL;
	mtx_lock(vlp);
retry:
	/*
	 * A false return from cache_zap_locked_vnode_kl2() means the entry was
	 * not zapped; restart from the top and re-read the lists.
	 * NOTE(review): presumably the helper had to drop and retake vlp in
	 * that case -- confirm against its definition.
	 */
	while (!LIST_EMPTY(&vp->v_cache_src)) {
		ncp = LIST_FIRST(&vp->v_cache_src);
		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
			goto retry;
		TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
	}
	while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
		ncp = TAILQ_FIRST(&vp->v_cache_dst);
		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
			goto retry;
		TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
	}
	/* Whatever is still hanging off v_cache_dd must be a ".." entry. */
	ncp = vp->v_cache_dd;
	if (ncp != NULL) {
		KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
		   ("lost dotdot link"));
		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
			goto retry;
		TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
	}
	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
	mtx_unlock(vlp);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
	/* Free the collected entries without any vnode lock held. */
	cache_free_batch(&batch);
}
2967
2968 /*
2969 * Opportunistic check to see if there is anything to do.
2970 */
2971 static bool
cache_has_entries(struct vnode * vp)2972 cache_has_entries(struct vnode *vp)
2973 {
2974
2975 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2976 atomic_load_ptr(&vp->v_cache_dd) == NULL)
2977 return (false);
2978 return (true);
2979 }
2980
2981 void
cache_purge(struct vnode * vp)2982 cache_purge(struct vnode *vp)
2983 {
2984
2985 SDT_PROBE1(vfs, namecache, purge, done, vp);
2986 if (!cache_has_entries(vp))
2987 return;
2988 cache_purge_impl(vp);
2989 }
2990
/*
 * Only to be used by vgone.
 *
 * The vnode must already be doomed.  If the first unlocked check finds no
 * entries, wait for the vnode lock to be released by any current holder and
 * check once more before concluding there is nothing to purge.
 */
void
cache_purge_vgone(struct vnode *vp)
{
	struct mtx *vlp;

	VNPASS(VN_IS_DOOMED(vp), vp);
	if (!cache_has_entries(vp)) {
		/*
		 * Serialize against a potential thread doing cache_purge.
		 */
		vlp = VP2VNODELOCK(vp);
		mtx_wait_unlocked(vlp);
		if (!cache_has_entries(vp))
			return;
	}
	cache_purge_impl(vp);
}
3016
3017 /*
3018 * Remove all negative entries for a particular directory vnode.
3019 */
3020 void
cache_purge_negative(struct vnode * vp)3021 cache_purge_negative(struct vnode *vp)
3022 {
3023 struct cache_freebatch batch;
3024 struct namecache *ncp, *nnp;
3025 struct mtx *vlp;
3026
3027 SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
3028 if (LIST_EMPTY(&vp->v_cache_src))
3029 return;
3030 TAILQ_INIT(&batch);
3031 vlp = VP2VNODELOCK(vp);
3032 mtx_lock(vlp);
3033 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
3034 if (!(ncp->nc_flag & NCF_NEGATIVE))
3035 continue;
3036 cache_zap_negative_locked_vnode_kl(ncp, vp);
3037 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
3038 }
3039 mtx_unlock(vlp);
3040 cache_free_batch(&batch);
3041 }
3042
3043 /*
3044 * Entry points for modifying VOP operations.
3045 */
3046 void
cache_vop_rename(struct vnode * fdvp,struct vnode * fvp,struct vnode * tdvp,struct vnode * tvp,struct componentname * fcnp,struct componentname * tcnp)3047 cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
3048 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
3049 {
3050
3051 ASSERT_VOP_IN_SEQC(fdvp);
3052 ASSERT_VOP_IN_SEQC(fvp);
3053 ASSERT_VOP_IN_SEQC(tdvp);
3054 if (tvp != NULL)
3055 ASSERT_VOP_IN_SEQC(tvp);
3056
3057 cache_purge(fvp);
3058 if (tvp != NULL) {
3059 cache_purge(tvp);
3060 KASSERT(!cache_remove_cnp(tdvp, tcnp),
3061 ("%s: lingering negative entry", __func__));
3062 } else {
3063 cache_remove_cnp(tdvp, tcnp);
3064 }
3065
3066 /*
3067 * TODO
3068 *
3069 * Historically renaming was always purging all revelang entries,
3070 * but that's quite wasteful. In particular turns out that in many cases
3071 * the target file is immediately accessed after rename, inducing a cache
3072 * miss.
3073 *
3074 * Recode this to reduce relocking and reuse the existing entry (if any)
3075 * instead of just removing it above and allocating a new one here.
3076 */
3077 cache_enter(tdvp, fvp, tcnp);
3078 }
3079
/*
 * Name cache maintenance for removing directory vp from dvp: purge every
 * entry from and to vp.  Both vnodes are asserted via ASSERT_VOP_IN_SEQC().
 */
void
cache_vop_rmdir(struct vnode *dvp, struct vnode *vp)
{

	ASSERT_VOP_IN_SEQC(dvp);
	ASSERT_VOP_IN_SEQC(vp);
	cache_purge(vp);
}
3088
3089 #ifdef INVARIANTS
3090 /*
3091 * Validate that if an entry exists it matches.
3092 */
3093 void
cache_validate(struct vnode * dvp,struct vnode * vp,struct componentname * cnp)3094 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
3095 {
3096 struct namecache *ncp;
3097 struct mtx *blp;
3098 uint32_t hash;
3099
3100 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
3101 if (CK_SLIST_EMPTY(NCHHASH(hash)))
3102 return;
3103 blp = HASH2BUCKETLOCK(hash);
3104 mtx_lock(blp);
3105 ncp = cache_ncp_find(dvp, cnp, hash);
3106 if (ncp != NULL && ncp->nc_vp != vp) {
3107 panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p\n",
3108 __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp);
3109 }
3110 mtx_unlock(blp);
3111 }
3112
/*
 * Assert that vp has no name cache state attached: both entry lists are
 * empty and v_cache_dd is NULL.
 */
void
cache_assert_no_entries(struct vnode *vp)
{

	VNPASS(TAILQ_EMPTY(&vp->v_cache_dst), vp);
	VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
	VNPASS(vp->v_cache_dd == NULL, vp);
}
3121 #endif
3122
3123 /*
3124 * Flush all entries referencing a particular filesystem.
3125 */
3126 void
cache_purgevfs(struct mount * mp)3127 cache_purgevfs(struct mount *mp)
3128 {
3129 struct vnode *vp, *mvp;
3130 size_t visited __sdt_used, purged __sdt_used;
3131
3132 visited = purged = 0;
3133 /*
3134 * Somewhat wasteful iteration over all vnodes. Would be better to
3135 * support filtering and avoid the interlock to begin with.
3136 */
3137 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
3138 visited++;
3139 if (!cache_has_entries(vp)) {
3140 VI_UNLOCK(vp);
3141 continue;
3142 }
3143 vholdl(vp);
3144 VI_UNLOCK(vp);
3145 cache_purge(vp);
3146 purged++;
3147 vdrop(vp);
3148 }
3149
3150 SDT_PROBE3(vfs, namecache, purgevfs, done, mp, visited, purged);
3151 }
3152
3153 /*
3154 * Perform canonical checks and cache lookup and pass on to filesystem
3155 * through the vop_cachedlookup only if needed.
3156 */
3157
3158 int
vfs_cache_lookup(struct vop_lookup_args * ap)3159 vfs_cache_lookup(struct vop_lookup_args *ap)
3160 {
3161 struct vnode *dvp;
3162 int error;
3163 struct vnode **vpp = ap->a_vpp;
3164 struct componentname *cnp = ap->a_cnp;
3165 int flags = cnp->cn_flags;
3166
3167 *vpp = NULL;
3168 dvp = ap->a_dvp;
3169
3170 if (dvp->v_type != VDIR)
3171 return (ENOTDIR);
3172
3173 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
3174 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
3175 return (EROFS);
3176
3177 error = vn_dir_check_exec(dvp, cnp);
3178 if (error != 0)
3179 return (error);
3180
3181 error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
3182 if (error == 0)
3183 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
3184 if (error == -1)
3185 return (0);
3186 return (error);
3187 }
3188
3189 /* Implementation of the getcwd syscall. */
3190 int
sys___getcwd(struct thread * td,struct __getcwd_args * uap)3191 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
3192 {
3193 char *buf, *retbuf;
3194 size_t buflen;
3195 int error;
3196
3197 buflen = uap->buflen;
3198 if (__predict_false(buflen < 2))
3199 return (EINVAL);
3200 if (buflen > MAXPATHLEN)
3201 buflen = MAXPATHLEN;
3202
3203 buf = uma_zalloc(namei_zone, M_WAITOK);
3204 error = vn_getcwd(buf, &retbuf, &buflen);
3205 if (error == 0)
3206 error = copyout(retbuf, uap->buf, buflen);
3207 uma_zfree(namei_zone, buf);
3208 return (error);
3209 }
3210
/*
 * Build the path of the current thread's working directory in buf.
 *
 * buf is a caller-provided buffer of *buflen bytes.  On success *retbuf is
 * set by the path resolution routines to where the path starts and *buflen
 * is updated by them.  The lockless variant is tried first; a negative
 * result from it selects the locked fallback.
 */
int
vn_getcwd(char *buf, char **retbuf, size_t *buflen)
{
	struct pwd *pwd;
	int error;

	vfs_smr_enter();
	pwd = pwd_get_smr();
	error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
	    buflen, 0);
	/* vn_fullpath_any_smr() leaves the SMR section on every path. */
	VFS_SMR_ASSERT_NOT_ENTERED();
	if (error < 0) {
		/* Lockless attempt gave up; redo it with a held pwd. */
		pwd = pwd_hold(curthread);
		error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
		    retbuf, buflen);
		pwd_drop(pwd);
	}

#ifdef KTRACE
	if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
		ktrnamei(*retbuf);
#endif
	return (error);
}
3235
/*
 * Canonicalize a path by walking it forward and back.
 *
 * The path is first resolved with namei() (following symlinks and keeping
 * the parent), then translated back into a full path which is stored in buf
 * (size bytes; copied with copyout() when pathseg is UIO_USERSPACE, with
 * memcpy() otherwise).  flags must be 0.  Returns ENAMETOOLONG when the
 * result including its terminating NUL does not fit in size.
 *
 * BUGS:
 * - Nothing guarantees the integrity of the entire chain. Consider the case
 *   where the path "foo/bar/baz/qux" is passed, but "bar" is moved out of
 *   "foo" into "quux" during the backwards walk. The result will be
 *   "quux/bar/baz/qux", which could not have been obtained by an incremental
 *   walk in userspace. Moreover, the path we return is inaccessible if the
 *   calling thread lacks permission to traverse "quux".
 */
static int
kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
    size_t size, int flags, enum uio_seg pathseg)
{
	struct nameidata nd;
	char *retbuf, *freebuf;
	int error;

	if (flags != 0)
		return (EINVAL);
	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | WANTPARENT | AUDITVNODE1,
	    pathseg, path, fd, &cap_fstat_rights);
	if ((error = namei(&nd)) != 0)
		return (error);

	if (nd.ni_vp->v_type == VREG && nd.ni_dvp->v_type != VDIR &&
	    (nd.ni_vp->v_vflag & VV_ROOT) != 0) {
		struct vnode *covered_vp;

		/*
		 * This happens if vp is a file mount. The call to
		 * vn_fullpath_hardlink can panic if path resolution can't be
		 * handled without the directory.
		 *
		 * To resolve this, we find the vnode which was mounted on -
		 * this should have a unique global path since we disallow
		 * mounting on linked files.
		 */
		error = vn_lock(nd.ni_vp, LK_SHARED);
		if (error != 0)
			goto out;
		covered_vp = nd.ni_vp->v_mount->mnt_vnodecovered;
		/* Reference the covered vnode before dropping the lock. */
		vref(covered_vp);
		VOP_UNLOCK(nd.ni_vp);
		error = vn_fullpath(covered_vp, &retbuf, &freebuf);
		vrele(covered_vp);
	} else {
		error = vn_fullpath_hardlink(nd.ni_vp, nd.ni_dvp,
		    nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, &retbuf,
		    &freebuf, &size);
	}
	if (error == 0) {
		size_t len;

		/* freebuf is only handed over (and freed here) on success. */
		len = strlen(retbuf) + 1;
		if (size < len)
			error = ENAMETOOLONG;
		else if (pathseg == UIO_USERSPACE)
			error = copyout(retbuf, buf, len);
		else
			memcpy(buf, retbuf, len);
		free(freebuf, M_TEMP);
	}
out:
	/* Drop what namei() returned: both vnodes and the path buffer. */
	vrele(nd.ni_vp);
	vrele(nd.ni_dvp);
	NDFREE_PNBUF(&nd);
	return (error);
}
3306
/*
 * Syscall entry point: forwards the user-supplied arguments to
 * kern___realpathat() with the path and buffer in user space.
 */
int
sys___realpathat(struct thread *td, struct __realpathat_args *uap)
{

	return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
	    uap->flags, UIO_USERSPACE));
}
3314
3315 /*
3316 * Retrieve the full filesystem path that correspond to a vnode from the name
3317 * cache (if available)
3318 */
3319 int
vn_fullpath(struct vnode * vp,char ** retbuf,char ** freebuf)3320 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
3321 {
3322 struct pwd *pwd;
3323 char *buf;
3324 size_t buflen;
3325 int error;
3326
3327 if (__predict_false(vp == NULL))
3328 return (EINVAL);
3329
3330 buflen = MAXPATHLEN;
3331 buf = malloc(buflen, M_TEMP, M_WAITOK);
3332 vfs_smr_enter();
3333 pwd = pwd_get_smr();
3334 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0);
3335 VFS_SMR_ASSERT_NOT_ENTERED();
3336 if (error < 0) {
3337 pwd = pwd_hold(curthread);
3338 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
3339 pwd_drop(pwd);
3340 }
3341 if (error == 0)
3342 *freebuf = buf;
3343 else
3344 free(buf, M_TEMP);
3345 return (error);
3346 }
3347
3348 /*
3349 * This function is similar to vn_fullpath, but it attempts to lookup the
3350 * pathname relative to the global root mount point. This is required for the
3351 * auditing sub-system, as audited pathnames must be absolute, relative to the
3352 * global root mount point.
3353 */
3354 int
vn_fullpath_global(struct vnode * vp,char ** retbuf,char ** freebuf)3355 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
3356 {
3357 char *buf;
3358 size_t buflen;
3359 int error;
3360
3361 if (__predict_false(vp == NULL))
3362 return (EINVAL);
3363 buflen = MAXPATHLEN;
3364 buf = malloc(buflen, M_TEMP, M_WAITOK);
3365 vfs_smr_enter();
3366 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0);
3367 VFS_SMR_ASSERT_NOT_ENTERED();
3368 if (error < 0) {
3369 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
3370 }
3371 if (error == 0)
3372 *freebuf = buf;
3373 else
3374 free(buf, M_TEMP);
3375 return (error);
3376 }
3377
3378 static struct namecache *
vn_dd_from_dst(struct vnode * vp)3379 vn_dd_from_dst(struct vnode *vp)
3380 {
3381 struct namecache *ncp;
3382
3383 cache_assert_vnode_locked(vp);
3384 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
3385 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3386 return (ncp);
3387 }
3388 return (NULL);
3389 }
3390
/*
 * Translate a vnode to its name and parent directory.
 *
 * The name of *vp is copied (without a terminating NUL) into buf so that it
 * ends at offset *buflen; *buflen is lowered to the offset of its first
 * byte.  The name comes from the name cache if an entry is available and from
 * VOP_VPTOCNP() otherwise.
 *
 * *vp must be referenced on entry.  On success *vp is replaced with the
 * referenced parent directory and the reference on the original vnode is
 * released.  On failure the reference is released as well and an errno is
 * returned (ENOMEM when the cached name does not fit, ENOENT when the parent
 * turned out to be doomed, otherwise the VOP_VPTOCNP() error).
 */
int
vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen)
{
	struct vnode *dvp;
	struct namecache *ncp;
	struct mtx *vlp;
	int error;

	vlp = VP2VNODELOCK(*vp);
	mtx_lock(vlp);
	/*
	 * Prefer the entry cached in v_cache_dd when it is not a ".." entry;
	 * otherwise scan the v_cache_dst list.
	 */
	ncp = (*vp)->v_cache_dd;
	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
		KASSERT(ncp == vn_dd_from_dst(*vp),
		    ("%s: mismatch for dd entry (%p != %p)", __func__,
		    ncp, vn_dd_from_dst(*vp)));
	} else {
		ncp = vn_dd_from_dst(*vp);
	}
	if (ncp != NULL) {
		if (*buflen < ncp->nc_nlen) {
			mtx_unlock(vlp);
			vrele(*vp);
			counter_u64_add(numfullpathfail4, 1);
			error = ENOMEM;
			SDT_PROBE3(vfs, namecache, fullpath, return, error,
			    vp, NULL);
			return (error);
		}
		*buflen -= ncp->nc_nlen;
		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
		SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
		    ncp->nc_name, vp);
		/* Reference the parent before letting go of the child. */
		dvp = *vp;
		*vp = ncp->nc_dvp;
		vref(*vp);
		mtx_unlock(vlp);
		vrele(dvp);
		return (0);
	}
	SDT_PROBE1(vfs, namecache, fullpath, miss, vp);

	/* Cache miss: ask the filesystem with the vnode locked. */
	mtx_unlock(vlp);
	vn_lock(*vp, LK_SHARED | LK_RETRY);
	error = VOP_VPTOCNP(*vp, &dvp, buf, buflen);
	vput(*vp);
	if (error) {
		counter_u64_add(numfullpathfail2, 1);
		SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
		return (error);
	}

	*vp = dvp;
	if (VN_IS_DOOMED(dvp)) {
		/* forced unmount */
		vrele(dvp);
		error = ENOENT;
		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
		return (error);
	}
	/*
	 * *vp has its use count incremented still.
	 */

	return (0);
}
3456
/*
 * Resolve a directory to a pathname.
 *
 * The name of the directory can always be found in the namecache or fetched
 * from the filesystem. There is also guaranteed to be only one parent, meaning
 * we can just follow vnodes up until we find the root.
 *
 * The vnode must be referenced.  The reference is consumed on every return
 * path, successful or not.
 *
 * The path is assembled backwards from the end of buf, stopping at rdir or
 * rootvnode.  *len is the number of bytes available at the start of buf.
 * With addend == 0 this function places the terminating NUL itself; with a
 * non-zero addend nothing is terminated here and addend is added back to the
 * resulting length.
 * NOTE(review): presumably addend counts bytes the caller already stored
 * past buf + *len -- confirm with the callers.
 *
 * On success *retbuf points to the first byte of the path and *len holds the
 * number of bytes used plus addend.
 */
static int
vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
    size_t *len, size_t addend)
{
#ifdef KDTRACE_HOOKS
	struct vnode *startvp = vp;
#endif
	struct vnode *vp1;
	size_t buflen;
	int error;
	bool slash_prefixed;

	VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
	VNPASS(vp->v_usecount > 0, vp);

	buflen = *len;

	slash_prefixed = true;
	if (addend == 0) {
		MPASS(*len >= 2);
		buflen--;
		buf[buflen] = '\0';
		slash_prefixed = false;
	}

	error = 0;

	SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
	counter_u64_add(numfullpathcalls, 1);
	while (vp != rdir && vp != rootvnode) {
		/*
		 * The vp vnode must be already fully constructed,
		 * since it is either found in namecache or obtained
		 * from VOP_VPTOCNP().  We may test for VV_ROOT safely
		 * without obtaining the vnode lock.
		 */
		if ((vp->v_vflag & VV_ROOT) != 0) {
			vn_lock(vp, LK_RETRY | LK_SHARED);

			/*
			 * With the vnode locked, check for races with
			 * unmount, forced or not.  Note that we
			 * already verified that vp is not equal to
			 * the root vnode, which means that
			 * mnt_vnodecovered can be NULL only for the
			 * case of unmount.
			 */
			if (VN_IS_DOOMED(vp) ||
			    (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
			    vp1->v_mountedhere != vp->v_mount) {
				vput(vp);
				error = ENOENT;
				SDT_PROBE3(vfs, namecache, fullpath, return,
				    error, vp, NULL);
				break;
			}

			/* Step down to the vnode the filesystem is mounted on. */
			vref(vp1);
			vput(vp);
			vp = vp1;
			continue;
		}
		VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
		/* On error vn_vptocnp() has already released vp. */
		error = vn_vptocnp(&vp, buf, &buflen);
		if (error)
			break;
		if (buflen == 0) {
			vrele(vp);
			error = ENOMEM;
			SDT_PROBE3(vfs, namecache, fullpath, return, error,
			    startvp, NULL);
			break;
		}
		buf[--buflen] = '/';
		slash_prefixed = true;
	}
	if (error)
		return (error);
	if (!slash_prefixed) {
		/* No component was emitted: the result is just "/". */
		if (buflen == 0) {
			vrele(vp);
			counter_u64_add(numfullpathfail4, 1);
			SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
			    startvp, NULL);
			return (ENOMEM);
		}
		buf[--buflen] = '/';
	}
	counter_u64_add(numfullpathfound, 1);
	vrele(vp);

	*retbuf = buf + buflen;
	SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
	*len -= buflen;
	*len += addend;
	return (0);
}
3563
3564 /*
3565 * Resolve an arbitrary vnode to a pathname.
3566 *
3567 * Note 2 caveats:
3568 * - hardlinks are not tracked, thus if the vnode is not a directory this can
3569 * resolve to a different path than the one used to find it
3570 * - namecache is not mandatory, meaning names are not guaranteed to be added
3571 * (in which case resolving fails)
3572 */
/*
 * Store a source line number in *reason.  Used through the
 * cache_rev_failed() wrapper below, which supplies __LINE__ of the call
 * site, so that the spot where the lockless reverse lookup bailed out can be
 * reported.
 */
static void __inline
cache_rev_failed_impl(int *reason, int line)
{

	*reason = line;
}
#define cache_rev_failed(var)	cache_rev_failed_impl((var), __LINE__)
3580
/*
 * Lockless variant of the vnode to pathname translation.
 *
 * Must be called within an SMR section (asserted below); the section is
 * exited on every return path.
 *
 * The path is assembled backwards from the end of buf by following
 * v_cache_dd entries up to rdir or rootvnode, validating each step against
 * the vnodes' sequence counters.
 *
 * With addend == 0 the terminating NUL is placed here; with a non-zero addend
 * no terminator is written and addend is added to the returned length.
 *
 * Return values:
 *  0		success; *retbuf points to the start of the path and *buflen
 *		is the number of bytes used plus addend
 *  -1		lockless resolution is disabled or could not be completed;
 *		callers in this file fall back to the locked variant
 *  ENOMEM	a name did not fit in the remaining buffer
 *
 * On the abort path *buflen is restored to its value on entry.
 */
static int
vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen, size_t addend)
{
#ifdef KDTRACE_HOOKS
	struct vnode *startvp = vp;
#endif
	struct vnode *tvp;
	struct mount *mp;
	struct namecache *ncp;
	size_t orig_buflen;
	int reason;
	int error;
#ifdef KDTRACE_HOOKS
	int i;
#endif
	seqc_t vp_seqc, tvp_seqc;
	u_char nc_flag;

	VFS_SMR_ASSERT_ENTERED();

	if (!atomic_load_char(&cache_fast_lookup_enabled)) {
		vfs_smr_exit();
		return (-1);
	}

	orig_buflen = *buflen;

	if (addend == 0) {
		MPASS(*buflen >= 2);
		*buflen -= 1;
		buf[*buflen] = '\0';
	}

	/* Already at the root: the path is just "/" (or only the addend). */
	if (vp == rdir || vp == rootvnode) {
		if (addend == 0) {
			*buflen -= 1;
			buf[*buflen] = '/';
		}
		goto out_ok;
	}

#ifdef KDTRACE_HOOKS
	/* Iteration count, reported by the miss probe only. */
	i = 0;
#endif
	error = -1;
	ncp = NULL; /* for sdt probe down below */
	vp_seqc = vn_seqc_read_any(vp);
	if (seqc_in_modify(vp_seqc)) {
		cache_rev_failed(&reason);
		goto out_abort;
	}

	for (;;) {
#ifdef KDTRACE_HOOKS
		i++;
#endif
		if ((vp->v_vflag & VV_ROOT) != 0) {
			/*
			 * Root of a mounted filesystem: continue from the
			 * vnode it is mounted on.
			 */
			mp = atomic_load_ptr(&vp->v_mount);
			if (mp == NULL) {
				cache_rev_failed(&reason);
				goto out_abort;
			}
			tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
			tvp_seqc = vn_seqc_read_any(tvp);
			if (seqc_in_modify(tvp_seqc)) {
				cache_rev_failed(&reason);
				goto out_abort;
			}
			if (!vn_seqc_consistent(vp, vp_seqc)) {
				cache_rev_failed(&reason);
				goto out_abort;
			}
			vp = tvp;
			vp_seqc = tvp_seqc;
			continue;
		}
		ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
		if (ncp == NULL) {
			cache_rev_failed(&reason);
			goto out_abort;
		}
		nc_flag = atomic_load_char(&ncp->nc_flag);
		if ((nc_flag & NCF_ISDOTDOT) != 0) {
			cache_rev_failed(&reason);
			goto out_abort;
		}
		/* Room is needed for the name and the '/' in front of it. */
		if (ncp->nc_nlen >= *buflen) {
			cache_rev_failed(&reason);
			error = ENOMEM;
			goto out_abort;
		}
		*buflen -= ncp->nc_nlen;
		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
		*buflen -= 1;
		buf[*buflen] = '/';
		tvp = ncp->nc_dvp;
		tvp_seqc = vn_seqc_read_any(tvp);
		if (seqc_in_modify(tvp_seqc)) {
			cache_rev_failed(&reason);
			goto out_abort;
		}
		if (!vn_seqc_consistent(vp, vp_seqc)) {
			cache_rev_failed(&reason);
			goto out_abort;
		}
		/*
		 * Acquire fence provided by vn_seqc_read_any above.
		 */
		if (__predict_false(atomic_load_ptr(&vp->v_cache_dd) != ncp)) {
			cache_rev_failed(&reason);
			goto out_abort;
		}
		if (!cache_ncp_canuse(ncp)) {
			cache_rev_failed(&reason);
			goto out_abort;
		}
		vp = tvp;
		vp_seqc = tvp_seqc;
		if (vp == rdir || vp == rootvnode)
			break;
	}
out_ok:
	vfs_smr_exit();
	*retbuf = buf + *buflen;
	*buflen = orig_buflen - *buflen + addend;
	SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
	return (0);

out_abort:
	*buflen = orig_buflen;
	SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
	vfs_smr_exit();
	return (error);
}
3716
3717 static int
vn_fullpath_any(struct vnode * vp,struct vnode * rdir,char * buf,char ** retbuf,size_t * buflen)3718 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3719 size_t *buflen)
3720 {
3721 size_t orig_buflen, addend;
3722 int error;
3723
3724 if (*buflen < 2)
3725 return (EINVAL);
3726
3727 orig_buflen = *buflen;
3728
3729 vref(vp);
3730 addend = 0;
3731 if (vp->v_type != VDIR) {
3732 *buflen -= 1;
3733 buf[*buflen] = '\0';
3734 error = vn_vptocnp(&vp, buf, buflen);
3735 if (error)
3736 return (error);
3737 if (*buflen == 0) {
3738 vrele(vp);
3739 return (ENOMEM);
3740 }
3741 *buflen -= 1;
3742 buf[*buflen] = '/';
3743 addend = orig_buflen - *buflen;
3744 }
3745
3746 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend));
3747 }
3748
3749 /*
3750 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
3751 *
3752 * Since the namecache does not track hardlinks, the caller is expected to
3753 * first look up the target vnode with WANTPARENT flag passed to namei to get
3754 * dvp and vp.
3755 *
3756 * Then we have 2 cases:
3757 * - if the found vnode is a directory, the path can be constructed just by
3758 * following names up the chain
3759 * - otherwise we populate the buffer with the saved name and start resolving
3760 * from the parent
3761 */
3762 int
vn_fullpath_hardlink(struct vnode * vp,struct vnode * dvp,const char * hrdl_name,size_t hrdl_name_length,char ** retbuf,char ** freebuf,size_t * buflen)3763 vn_fullpath_hardlink(struct vnode *vp, struct vnode *dvp,
3764 const char *hrdl_name, size_t hrdl_name_length,
3765 char **retbuf, char **freebuf, size_t *buflen)
3766 {
3767 char *buf, *tmpbuf;
3768 struct pwd *pwd;
3769 size_t addend;
3770 int error;
3771 __enum_uint8(vtype) type;
3772
3773 if (*buflen < 2)
3774 return (EINVAL);
3775 if (*buflen > MAXPATHLEN)
3776 *buflen = MAXPATHLEN;
3777
3778 buf = malloc(*buflen, M_TEMP, M_WAITOK);
3779
3780 addend = 0;
3781
3782 /*
3783 * Check for VBAD to work around the vp_crossmp bug in lookup().
3784 *
3785 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
3786 * set to mount point's root vnode while ni_dvp will be vp_crossmp.
3787 * If the type is VDIR (like in this very case) we can skip looking
3788 * at ni_dvp in the first place. However, since vnodes get passed here
3789 * unlocked the target may transition to doomed state (type == VBAD)
3790 * before we get to evaluate the condition. If this happens, we will
3791 * populate part of the buffer and descend to vn_fullpath_dir with
3792 * vp == vp_crossmp. Prevent the problem by checking for VBAD.
3793 */
3794 type = atomic_load_8(&vp->v_type);
3795 if (type == VBAD) {
3796 error = ENOENT;
3797 goto out_bad;
3798 }
3799 if (type != VDIR) {
3800 addend = hrdl_name_length + 2;
3801 if (*buflen < addend) {
3802 error = ENOMEM;
3803 goto out_bad;
3804 }
3805 *buflen -= addend;
3806 tmpbuf = buf + *buflen;
3807 tmpbuf[0] = '/';
3808 memcpy(&tmpbuf[1], hrdl_name, hrdl_name_length);
3809 tmpbuf[addend - 1] = '\0';
3810 vp = dvp;
3811 }
3812
3813 vfs_smr_enter();
3814 pwd = pwd_get_smr();
3815 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3816 addend);
3817 VFS_SMR_ASSERT_NOT_ENTERED();
3818 if (error < 0) {
3819 pwd = pwd_hold(curthread);
3820 vref(vp);
3821 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3822 addend);
3823 pwd_drop(pwd);
3824 }
3825 if (error != 0)
3826 goto out_bad;
3827
3828 *freebuf = buf;
3829
3830 return (0);
3831 out_bad:
3832 free(buf, M_TEMP);
3833 return (error);
3834 }
3835
3836 struct vnode *
vn_dir_dd_ino(struct vnode * vp)3837 vn_dir_dd_ino(struct vnode *vp)
3838 {
3839 struct namecache *ncp;
3840 struct vnode *ddvp;
3841 struct mtx *vlp;
3842 enum vgetstate vs;
3843
3844 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
3845 vlp = VP2VNODELOCK(vp);
3846 mtx_lock(vlp);
3847 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
3848 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
3849 continue;
3850 ddvp = ncp->nc_dvp;
3851 vs = vget_prep(ddvp);
3852 mtx_unlock(vlp);
3853 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
3854 return (NULL);
3855 return (ddvp);
3856 }
3857 mtx_unlock(vlp);
3858 return (NULL);
3859 }
3860
3861 int
vn_commname(struct vnode * vp,char * buf,u_int buflen)3862 vn_commname(struct vnode *vp, char *buf, u_int buflen)
3863 {
3864 struct namecache *ncp;
3865 struct mtx *vlp;
3866 int l;
3867
3868 vlp = VP2VNODELOCK(vp);
3869 mtx_lock(vlp);
3870 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3871 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3872 break;
3873 if (ncp == NULL) {
3874 mtx_unlock(vlp);
3875 return (ENOENT);
3876 }
3877 l = min(ncp->nc_nlen, buflen - 1);
3878 memcpy(buf, ncp->nc_name, l);
3879 mtx_unlock(vlp);
3880 buf[l] = '\0';
3881 return (0);
3882 }
3883
/*
 * This function updates path string to vnode's full global path
 * and checks the size of the new path string against the pathlen argument.
 *
 * Requires a locked, referenced vnode.
 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
 * NOTE(review): the code visible here returns ENOENT (not ENODEV) when the
 * re-lookup finds a different vnode, and unlocks in that case -- confirm
 * which errno the sentence above refers to.
 *
 * If vp is a directory, the call to vn_fullpath_global() always succeeds
 * because it falls back to the ".." lookup if the namecache lookup fails.
 *
 * path is used both as the input for the re-lookup and as the output buffer
 * of pathlen bytes; it is only overwritten on success.  The caller's
 * reference on vp is released on every path; on success the reference and
 * lock obtained by the re-lookup remain.  td is not used.
 */
int
vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
    u_int pathlen)
{
	struct nameidata nd;
	struct vnode *vp1;
	char *rpath, *fbuf;
	int error;

	ASSERT_VOP_ELOCKED(vp, __func__);

	/* Construct global filesystem path from vp. */
	VOP_UNLOCK(vp);
	error = vn_fullpath_global(vp, &rpath, &fbuf);

	if (error != 0) {
		/* Nothing was allocated for us, so there is no fbuf to free. */
		vrele(vp);
		return (error);
	}

	/* The result including its NUL has to fit in pathlen bytes. */
	if (strlen(rpath) >= pathlen) {
		vrele(vp);
		error = ENAMETOOLONG;
		goto out;
	}

	/*
	 * Re-lookup the vnode by path to detect a possible rename.
	 * As a side effect, the vnode is relocked.
	 * If vnode was renamed, return ENOENT.
	 */
	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path);
	error = namei(&nd);
	if (error != 0) {
		vrele(vp);
		goto out;
	}
	NDFREE_PNBUF(&nd);
	vp1 = nd.ni_vp;
	vrele(vp);
	if (vp1 == vp)
		strcpy(path, rpath);
	else {
		vput(vp1);
		error = ENOENT;
	}

out:
	free(fbuf, M_TEMP);
	return (error);
}
3945
/*
 * This is similar to vn_path_to_global_path but allows for regular
 * files which may not be present in the cache.
 *
 * Requires a locked, referenced vnode.
 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
 * NOTE(review): the code visible here returns ENOENT (not ENODEV) when the
 * re-lookup finds a different vnode, and unlocks in that case -- confirm
 * which errno the sentence above refers to.
 *
 * dvp is the parent directory of vp and leaf_name/leaf_length the name of vp
 * within it; they are handed to vn_fullpath_hardlink().  path is used both
 * as the input for the re-lookup and as the output buffer of pathlen bytes;
 * it is only overwritten on success.  The caller's reference on vp is
 * released on every path; on success the reference and lock obtained by the
 * re-lookup remain.  td is not used.
 */
int
vn_path_to_global_path_hardlink(struct thread *td, struct vnode *vp,
    struct vnode *dvp, char *path, u_int pathlen, const char *leaf_name,
    size_t leaf_length)
{
	struct nameidata nd;
	struct vnode *vp1;
	char *rpath, *fbuf;
	size_t len;
	int error;

	ASSERT_VOP_ELOCKED(vp, __func__);

	/*
	 * Construct global filesystem path from dvp, vp and leaf
	 * name.
	 */
	VOP_UNLOCK(vp);
	len = pathlen;
	error = vn_fullpath_hardlink(vp, dvp, leaf_name, leaf_length,
	    &rpath, &fbuf, &len);

	if (error != 0) {
		/* vn_fullpath_hardlink() frees its buffer on failure. */
		vrele(vp);
		return (error);
	}

	/* The result including its NUL has to fit in pathlen bytes. */
	if (strlen(rpath) >= pathlen) {
		vrele(vp);
		error = ENAMETOOLONG;
		goto out;
	}

	/*
	 * Re-lookup the vnode by path to detect a possible rename.
	 * As a side effect, the vnode is relocked.
	 * If vnode was renamed, return ENOENT.
	 */
	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path);
	error = namei(&nd);
	if (error != 0) {
		vrele(vp);
		goto out;
	}
	NDFREE_PNBUF(&nd);
	vp1 = nd.ni_vp;
	vrele(vp);
	if (vp1 == vp)
		strcpy(path, rpath);
	else {
		vput(vp1);
		error = ENOENT;
	}

out:
	free(fbuf, M_TEMP);
	return (error);
}
4011
4012 #ifdef DDB
4013 static void
db_print_vpath(struct vnode * vp)4014 db_print_vpath(struct vnode *vp)
4015 {
4016
4017 while (vp != NULL) {
4018 db_printf("%p: ", vp);
4019 if (vp == rootvnode) {
4020 db_printf("/");
4021 vp = NULL;
4022 } else {
4023 if (vp->v_vflag & VV_ROOT) {
4024 db_printf("<mount point>");
4025 vp = vp->v_mount->mnt_vnodecovered;
4026 } else {
4027 struct namecache *ncp;
4028 char *ncn;
4029 int i;
4030
4031 ncp = TAILQ_FIRST(&vp->v_cache_dst);
4032 if (ncp != NULL) {
4033 ncn = ncp->nc_name;
4034 for (i = 0; i < ncp->nc_nlen; i++)
4035 db_printf("%c", *ncn++);
4036 vp = ncp->nc_dvp;
4037 } else {
4038 vp = NULL;
4039 }
4040 }
4041 }
4042 db_printf("\n");
4043 }
4044
4045 return;
4046 }
4047
DB_SHOW_COMMAND(vpath,db_show_vpath)4048 DB_SHOW_COMMAND(vpath, db_show_vpath)
4049 {
4050 struct vnode *vp;
4051
4052 if (!have_addr) {
4053 db_printf("usage: show vpath <struct vnode *>\n");
4054 return;
4055 }
4056
4057 vp = (struct vnode *)addr;
4058 db_print_vpath(vp);
4059 }
4060
4061 #endif
4062
/*
 * Administrative switch for the fast path lookup.  It is read by
 * cache_fast_lookup_enabled_recalc() and exported through the
 * fast_lookup sysctl handler.
 */
static int cache_fast_lookup = 1;

/*
 * Value returned by the cache_fpl_aborted*() helpers to signal that the
 * fast path did not complete the lookup.  cache_fpl_handled_error_impl()
 * asserts that no real error code equals it.
 */
#define CACHE_FPL_FAILED	-2020
4066
4067 static int
cache_vop_bad_vexec(struct vop_fplookup_vexec_args * v)4068 cache_vop_bad_vexec(struct vop_fplookup_vexec_args *v)
4069 {
4070 vn_printf(v->a_vp, "no proper vop_fplookup_vexec\n");
4071 panic("no proper vop_fplookup_vexec");
4072 }
4073
4074 static int
cache_vop_bad_symlink(struct vop_fplookup_symlink_args * v)4075 cache_vop_bad_symlink(struct vop_fplookup_symlink_args *v)
4076 {
4077 vn_printf(v->a_vp, "no proper vop_fplookup_symlink\n");
4078 panic("no proper vop_fplookup_symlink");
4079 }
4080
4081 void
cache_vop_vector_register(struct vop_vector * v)4082 cache_vop_vector_register(struct vop_vector *v)
4083 {
4084 size_t ops;
4085
4086 ops = 0;
4087 if (v->vop_fplookup_vexec != NULL) {
4088 ops++;
4089 }
4090 if (v->vop_fplookup_symlink != NULL) {
4091 ops++;
4092 }
4093
4094 if (ops == 2) {
4095 return;
4096 }
4097
4098 if (ops == 0) {
4099 v->vop_fplookup_vexec = cache_vop_bad_vexec;
4100 v->vop_fplookup_symlink = cache_vop_bad_symlink;
4101 return;
4102 }
4103
4104 printf("%s: invalid vop vector %p -- either all or none fplookup vops "
4105 "need to be provided", __func__, v);
4106 if (v->vop_fplookup_vexec == NULL) {
4107 printf("%s: missing vop_fplookup_vexec\n", __func__);
4108 }
4109 if (v->vop_fplookup_symlink == NULL) {
4110 printf("%s: missing vop_fplookup_symlink\n", __func__);
4111 }
4112 panic("bad vop vector %p", v);
4113 }
4114
4115 #ifdef INVARIANTS
4116 void
cache_validate_vop_vector(struct mount * mp,struct vop_vector * vops)4117 cache_validate_vop_vector(struct mount *mp, struct vop_vector *vops)
4118 {
4119 if (mp == NULL)
4120 return;
4121
4122 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
4123 return;
4124
4125 if (vops->vop_fplookup_vexec == NULL ||
4126 vops->vop_fplookup_vexec == cache_vop_bad_vexec)
4127 panic("bad vop_fplookup_vexec on vector %p for filesystem %s",
4128 vops, mp->mnt_vfc->vfc_name);
4129
4130 if (vops->vop_fplookup_symlink == NULL ||
4131 vops->vop_fplookup_symlink == cache_vop_bad_symlink)
4132 panic("bad vop_fplookup_symlink on vector %p for filesystem %s",
4133 vops, mp->mnt_vfc->vfc_name);
4134 }
4135 #endif
4136
4137 void
cache_fast_lookup_enabled_recalc(void)4138 cache_fast_lookup_enabled_recalc(void)
4139 {
4140 int lookup_flag;
4141 int mac_on;
4142
4143 #ifdef MAC
4144 mac_on = mac_vnode_check_lookup_enabled();
4145 mac_on |= mac_vnode_check_readlink_enabled();
4146 #else
4147 mac_on = 0;
4148 #endif
4149
4150 lookup_flag = atomic_load_int(&cache_fast_lookup);
4151 if (lookup_flag && !mac_on) {
4152 atomic_store_char(&cache_fast_lookup_enabled, true);
4153 } else {
4154 atomic_store_char(&cache_fast_lookup_enabled, false);
4155 }
4156 }
4157
4158 static int
syscal_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS)4159 syscal_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS)
4160 {
4161 int error, old;
4162
4163 old = atomic_load_int(&cache_fast_lookup);
4164 error = sysctl_handle_int(oidp, arg1, arg2, req);
4165 if (error == 0 && req->newptr && old != atomic_load_int(&cache_fast_lookup))
4166 cache_fast_lookup_enabled_recalc();
4167 return (error);
4168 }
4169 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, fast_lookup, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE,
4170 &cache_fast_lookup, 0, syscal_vfs_cache_fast_lookup, "IU", "");
4171
/*
 * Components of nameidata (or objects it can point to) which may
 * need restoring in case fast path lookup fails.
 *
 * Filled in by cache_fpl_checkpoint_outer().
 */
struct nameidata_outer {
	size_t ni_pathlen;	/* copy of ndp->ni_pathlen; put back by cache_fpl_restore_abort() */
	int cn_flags;		/* copy of ndp->ni_cnd.cn_flags; put back by cache_fpl_restore_partial() */
};
4180
/*
 * Snapshot taken by cache_fpl_checkpoint().  It only carries data on
 * INVARIANTS kernels, where it is used for consistency checks.
 */
struct nameidata_saved {
#ifdef INVARIANTS
	char *cn_nameptr;	/* cn_nameptr at checkpoint; verified by cache_fplookup_partial_setup() */
	size_t ni_pathlen;	/* debug.ni_pathlen at checkpoint; put back by cache_fpl_restore_partial() */
#endif
};
4187
#ifdef INVARIANTS
/*
 * Debug-only shadow state.  cache_fplookup_partial_setup() panics if
 * ni_pathlen here disagrees with the value it computes for the nameidata.
 */
struct cache_fpl_debug {
	size_t ni_pathlen;
};
#endif
4193
/*
 * State of a single fast path lookup attempt.
 */
struct cache_fpl {
	struct nameidata *ndp;		/* nameidata being resolved */
	struct componentname *cnp;	/* component name state used by the lookup */
	char *nulchar;			/* NOTE(review): appears to point at the NUL terminating the path buffer -- confirm */
	struct vnode *dvp;		/* directory vnode of the current step */
	struct vnode *tvp;		/* target vnode of the current step, if any */
	seqc_t dvp_seqc;		/* seqc of dvp, rechecked with vn_seqc_consistent() */
	seqc_t tvp_seqc;		/* seqc of tvp, rechecked with vn_seqc_consistent() */
	uint32_t hash;			/* NOTE(review): presumably the hash of the current component -- confirm */
	struct nameidata_saved snd;	/* see cache_fpl_checkpoint() */
	struct nameidata_outer snd_outer; /* see cache_fpl_checkpoint_outer() */
	int line;			/* source line which last set the status */
	enum cache_fpl_status status:8;	/* outcome; CACHE_FPL_STATUS_UNSET while in progress */
	bool in_smr;			/* maintained by the cache_fpl_smr_enter/exit macros */
	bool fsearch;			/* reported by fgetvp_lookup_smr() in cache_fplookup_dirfd() */
	struct pwd **pwd;		/* pwd to hold when handing over to the slow path */
#ifdef INVARIANTS
	struct cache_fpl_debug debug;
#endif
};
4214
/*
 * Forward declarations of helpers which are used before their definitions.
 */
static bool cache_fplookup_mp_supported(struct mount *mp);
static bool cache_fplookup_is_mp(struct cache_fpl *fpl);
static int cache_fplookup_cross_mount(struct cache_fpl *fpl);
static int cache_fplookup_partial_setup(struct cache_fpl *fpl);
static int cache_fplookup_skip_slashes(struct cache_fpl *fpl);
static int cache_fplookup_trailingslash(struct cache_fpl *fpl);
static void cache_fpl_pathlen_dec(struct cache_fpl *fpl);
static void cache_fpl_pathlen_inc(struct cache_fpl *fpl);
static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n);
static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n);
4225
4226 static void
cache_fpl_cleanup_cnp(struct componentname * cnp)4227 cache_fpl_cleanup_cnp(struct componentname *cnp)
4228 {
4229
4230 uma_zfree(namei_zone, cnp->cn_pnbuf);
4231 cnp->cn_pnbuf = NULL;
4232 cnp->cn_nameptr = NULL;
4233 }
4234
4235 static struct vnode *
cache_fpl_handle_root(struct cache_fpl * fpl)4236 cache_fpl_handle_root(struct cache_fpl *fpl)
4237 {
4238 struct nameidata *ndp;
4239 struct componentname *cnp;
4240
4241 ndp = fpl->ndp;
4242 cnp = fpl->cnp;
4243
4244 MPASS(*(cnp->cn_nameptr) == '/');
4245 cnp->cn_nameptr++;
4246 cache_fpl_pathlen_dec(fpl);
4247
4248 if (__predict_false(*(cnp->cn_nameptr) == '/')) {
4249 do {
4250 cnp->cn_nameptr++;
4251 cache_fpl_pathlen_dec(fpl);
4252 } while (*(cnp->cn_nameptr) == '/');
4253 }
4254
4255 return (ndp->ni_rootdir);
4256 }
4257
4258 static void
cache_fpl_checkpoint_outer(struct cache_fpl * fpl)4259 cache_fpl_checkpoint_outer(struct cache_fpl *fpl)
4260 {
4261
4262 fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen;
4263 fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags;
4264 }
4265
/*
 * Record the current parsing position.  Only INVARIANTS kernels store
 * anything; elsewhere this is a no-op.
 */
static void
cache_fpl_checkpoint(struct cache_fpl *fpl)
{
#ifdef INVARIANTS
	struct nameidata_saved *snd;

	snd = &fpl->snd;
	snd->ni_pathlen = fpl->debug.ni_pathlen;
	snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
#endif
}
4275
4276 static void
cache_fpl_restore_partial(struct cache_fpl * fpl)4277 cache_fpl_restore_partial(struct cache_fpl *fpl)
4278 {
4279
4280 fpl->ndp->ni_cnd.cn_flags = fpl->snd_outer.cn_flags;
4281 #ifdef INVARIANTS
4282 fpl->debug.ni_pathlen = fpl->snd.ni_pathlen;
4283 #endif
4284 }
4285
4286 static void
cache_fpl_restore_abort(struct cache_fpl * fpl)4287 cache_fpl_restore_abort(struct cache_fpl *fpl)
4288 {
4289
4290 cache_fpl_restore_partial(fpl);
4291 /*
4292 * It is 0 on entry by API contract.
4293 */
4294 fpl->ndp->ni_resflags = 0;
4295 fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf;
4296 fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen;
4297 }
4298
#ifdef INVARIANTS
/*
 * Debug assertions about the SMR state: both the in_smr flag tracked in
 * the lookup state and the VFS SMR assertion macro are checked.  The
 * argument is evaluated exactly once.
 */
#define cache_fpl_smr_assert_entered(fpl) ({			\
	struct cache_fpl *_fpl = (fpl);				\
	MPASS(_fpl->in_smr == true);				\
	VFS_SMR_ASSERT_ENTERED();				\
})
#define cache_fpl_smr_assert_not_entered(fpl) ({		\
	struct cache_fpl *_fpl = (fpl);				\
	MPASS(_fpl->in_smr == false);				\
	VFS_SMR_ASSERT_NOT_ENTERED();				\
})
/*
 * Assert that the lookup has been given a status; CACHE_FPL_STATUS_UNSET
 * is treated as unreachable.  The switch has no default case and lists the
 * remaining statuses explicitly.
 */
static void
cache_fpl_assert_status(struct cache_fpl *fpl)
{

	switch (fpl->status) {
	case CACHE_FPL_STATUS_UNSET:
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_DESTROYED:
	case CACHE_FPL_STATUS_ABORTED:
	case CACHE_FPL_STATUS_PARTIAL:
	case CACHE_FPL_STATUS_HANDLED:
		break;
	}
}
#else
/* Without INVARIANTS all of the above compile to nothing. */
#define cache_fpl_smr_assert_entered(fpl) do { } while (0)
#define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
#define cache_fpl_assert_status(fpl) do { } while (0)
#endif
4330
/*
 * Wrappers around vfs_smr_enter()/vfs_smr_exit() which keep the in_smr
 * flag of the lookup state in sync.  The argument is evaluated exactly
 * once.
 *
 * The "initial" variant does not assert the previous value of in_smr; the
 * other two assert that the state actually changes.
 */
#define cache_fpl_smr_enter_initial(fpl) ({			\
	struct cache_fpl *_fpl = (fpl);				\
	vfs_smr_enter();					\
	_fpl->in_smr = true;					\
})

#define cache_fpl_smr_enter(fpl) ({				\
	struct cache_fpl *_fpl = (fpl);				\
	MPASS(_fpl->in_smr == false);				\
	vfs_smr_enter();					\
	_fpl->in_smr = true;					\
})

#define cache_fpl_smr_exit(fpl) ({				\
	struct cache_fpl *_fpl = (fpl);				\
	MPASS(_fpl->in_smr == true);				\
	vfs_smr_exit();						\
	_fpl->in_smr = false;					\
})
4350
4351 static int
cache_fpl_aborted_early_impl(struct cache_fpl * fpl,int line)4352 cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line)
4353 {
4354
4355 if (fpl->status != CACHE_FPL_STATUS_UNSET) {
4356 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
4357 ("%s: converting to abort from %d at %d, set at %d\n",
4358 __func__, fpl->status, line, fpl->line));
4359 }
4360 cache_fpl_smr_assert_not_entered(fpl);
4361 fpl->status = CACHE_FPL_STATUS_ABORTED;
4362 fpl->line = line;
4363 return (CACHE_FPL_FAILED);
4364 }
4365
4366 #define cache_fpl_aborted_early(x) cache_fpl_aborted_early_impl((x), __LINE__)
4367
4368 static int __noinline
cache_fpl_aborted_impl(struct cache_fpl * fpl,int line)4369 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
4370 {
4371 struct nameidata *ndp;
4372 struct componentname *cnp;
4373
4374 ndp = fpl->ndp;
4375 cnp = fpl->cnp;
4376
4377 if (fpl->status != CACHE_FPL_STATUS_UNSET) {
4378 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
4379 ("%s: converting to abort from %d at %d, set at %d\n",
4380 __func__, fpl->status, line, fpl->line));
4381 }
4382 fpl->status = CACHE_FPL_STATUS_ABORTED;
4383 fpl->line = line;
4384 if (fpl->in_smr)
4385 cache_fpl_smr_exit(fpl);
4386 cache_fpl_restore_abort(fpl);
4387 /*
4388 * Resolving symlinks overwrites data passed by the caller.
4389 * Let namei know.
4390 */
4391 if (ndp->ni_loopcnt > 0) {
4392 fpl->status = CACHE_FPL_STATUS_DESTROYED;
4393 cache_fpl_cleanup_cnp(cnp);
4394 }
4395 return (CACHE_FPL_FAILED);
4396 }
4397
4398 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__)
4399
4400 static int __noinline
cache_fpl_partial_impl(struct cache_fpl * fpl,int line)4401 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
4402 {
4403
4404 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4405 ("%s: setting to partial at %d, but already set to %d at %d\n",
4406 __func__, line, fpl->status, fpl->line));
4407 cache_fpl_smr_assert_entered(fpl);
4408 fpl->status = CACHE_FPL_STATUS_PARTIAL;
4409 fpl->line = line;
4410 return (cache_fplookup_partial_setup(fpl));
4411 }
4412
4413 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__)
4414
4415 static int
cache_fpl_handled_impl(struct cache_fpl * fpl,int line)4416 cache_fpl_handled_impl(struct cache_fpl *fpl, int line)
4417 {
4418
4419 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4420 ("%s: setting to handled at %d, but already set to %d at %d\n",
4421 __func__, line, fpl->status, fpl->line));
4422 cache_fpl_smr_assert_not_entered(fpl);
4423 fpl->status = CACHE_FPL_STATUS_HANDLED;
4424 fpl->line = line;
4425 return (0);
4426 }
4427
4428 #define cache_fpl_handled(x) cache_fpl_handled_impl((x), __LINE__)
4429
4430 static int
cache_fpl_handled_error_impl(struct cache_fpl * fpl,int error,int line)4431 cache_fpl_handled_error_impl(struct cache_fpl *fpl, int error, int line)
4432 {
4433
4434 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4435 ("%s: setting to handled at %d, but already set to %d at %d\n",
4436 __func__, line, fpl->status, fpl->line));
4437 MPASS(error != 0);
4438 MPASS(error != CACHE_FPL_FAILED);
4439 cache_fpl_smr_assert_not_entered(fpl);
4440 fpl->status = CACHE_FPL_STATUS_HANDLED;
4441 fpl->line = line;
4442 fpl->dvp = NULL;
4443 fpl->tvp = NULL;
4444 return (error);
4445 }
4446
4447 #define cache_fpl_handled_error(x, e) cache_fpl_handled_error_impl((x), (e), __LINE__)
4448
4449 static bool
cache_fpl_terminated(struct cache_fpl * fpl)4450 cache_fpl_terminated(struct cache_fpl *fpl)
4451 {
4452
4453 return (fpl->status != CACHE_FPL_STATUS_UNSET);
4454 }
4455
/*
 * Flags a caller may pass in cn_flags for the fast path to be attempted;
 * cache_can_fplookup() rejects the lookup if any other flag is set.
 */
#define CACHE_FPL_SUPPORTED_CN_FLAGS \
	(NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \
	 FAILIFEXISTS | FOLLOW | EMPTYPATH | LOCKSHARED | ISRESTARTED | WILLBEDIR | \
	 ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK | OPENREAD | \
	 OPENWRITE | WANTIOCTLCAPS | OPENNAMED)

/*
 * Flags which the lookup code sets on its own while working.
 */
#define CACHE_FPL_INTERNAL_CN_FLAGS \
	(ISDOTDOT | MAKEENTRY | ISLASTCN)

/* The two sets must stay disjoint. */
_Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
    "supported and internal flags overlap");
4467
4468 static bool
cache_fpl_islastcn(struct nameidata * ndp)4469 cache_fpl_islastcn(struct nameidata *ndp)
4470 {
4471
4472 return (*ndp->ni_next == 0);
4473 }
4474
4475 static bool
cache_fpl_istrailingslash(struct cache_fpl * fpl)4476 cache_fpl_istrailingslash(struct cache_fpl *fpl)
4477 {
4478
4479 MPASS(fpl->nulchar > fpl->cnp->cn_pnbuf);
4480 return (*(fpl->nulchar - 1) == '/');
4481 }
4482
4483 static bool
cache_fpl_isdotdot(struct componentname * cnp)4484 cache_fpl_isdotdot(struct componentname *cnp)
4485 {
4486
4487 if (cnp->cn_namelen == 2 &&
4488 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
4489 return (true);
4490 return (false);
4491 }
4492
/*
 * Decide whether the fast path may be attempted for this lookup.
 *
 * Returns true when none of the checks below rules it out.  Otherwise the
 * lookup is marked as aborted early (each check records its own source
 * line in the lookup state) and false is returned.
 */
static bool
cache_can_fplookup(struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	struct thread *td;

	ndp = fpl->ndp;
	cnp = fpl->cnp;
	td = curthread;

	/* The fast path is switched off, see cache_fast_lookup_enabled_recalc(). */
	if (!atomic_load_char(&cache_fast_lookup_enabled)) {
		cache_fpl_aborted_early(fpl);
		return (false);
	}
	/* The caller passed a flag outside of the supported set. */
	if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
		cache_fpl_aborted_early(fpl);
		return (false);
	}
	/* The thread is in capability mode or has capability tracing on. */
	if (IN_CAPABILITY_MODE(td) || CAP_TRACING(td)) {
		cache_fpl_aborted_early(fpl);
		return (false);
	}
	/* The thread is being audited. */
	if (AUDITING_TD(td)) {
		cache_fpl_aborted_early(fpl);
		return (false);
	}
	/* An explicit start directory was supplied. */
	if (ndp->ni_startdir != NULL) {
		cache_fpl_aborted_early(fpl);
		return (false);
	}
	/* OPENNAMED is in the supported mask, but is still rejected here. */
	if ((cnp->cn_flags & OPENNAMED) != 0) {
		cache_fpl_aborted_early(fpl);
		return (false);
	}
	return (true);
}
4530
4531 static int __noinline
cache_fplookup_dirfd(struct cache_fpl * fpl,struct vnode ** vpp)4532 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp)
4533 {
4534 struct nameidata *ndp;
4535 struct componentname *cnp;
4536 int error;
4537 bool fsearch;
4538
4539 ndp = fpl->ndp;
4540 cnp = fpl->cnp;
4541
4542 error = fgetvp_lookup_smr(ndp, vpp, &fsearch);
4543 if (__predict_false(error != 0)) {
4544 return (cache_fpl_aborted(fpl));
4545 }
4546 fpl->fsearch = fsearch;
4547 if ((*vpp)->v_type != VDIR) {
4548 if (!((cnp->cn_flags & EMPTYPATH) != 0 && cnp->cn_pnbuf[0] == '\0')) {
4549 cache_fpl_smr_exit(fpl);
4550 return (cache_fpl_handled_error(fpl, ENOTDIR));
4551 }
4552 }
4553 return (0);
4554 }
4555
4556 static int __noinline
cache_fplookup_negative_promote(struct cache_fpl * fpl,struct namecache * oncp,uint32_t hash)4557 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
4558 uint32_t hash)
4559 {
4560 struct componentname *cnp;
4561 struct vnode *dvp;
4562
4563 cnp = fpl->cnp;
4564 dvp = fpl->dvp;
4565
4566 cache_fpl_smr_exit(fpl);
4567 if (cache_neg_promote_cond(dvp, cnp, oncp, hash))
4568 return (cache_fpl_handled_error(fpl, ENOENT));
4569 else
4570 return (cache_fpl_aborted(fpl));
4571 }
4572
/*
 * The target vnode is not supported, prepare for the slow path to take over.
 *
 * Called with the SMR section entered.  On success the pwd is held, dvp is
 * referenced and stored in ni_startdir, the component flags and
 * ni_pathlen describe the remaining path, and 0 is returned.  On failure
 * everything acquired here is released and the lookup is aborted.
 */
static int __noinline
cache_fplookup_partial_setup(struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	enum vgetstate dvs;
	struct vnode *dvp;
	struct pwd *pwd;
	seqc_t dvp_seqc;

	ndp = fpl->ndp;
	cnp = fpl->cnp;
	pwd = *(fpl->pwd);
	dvp = fpl->dvp;
	dvp_seqc = fpl->dvp_seqc;

	if (!pwd_hold_smr(pwd)) {
		return (cache_fpl_aborted(fpl));
	}

	/*
	 * Note that seqc is checked before the vnode is locked, so by
	 * the time regular lookup gets to it it may have moved.
	 *
	 * Ultimately this does not affect correctness, any lookup errors
	 * are userspace racing with itself. It is guaranteed that any
	 * path which ultimately gets found could also have been found
	 * by regular lookup going all the way in absence of concurrent
	 * modifications.
	 */
	dvs = vget_prep_smr(dvp);
	cache_fpl_smr_exit(fpl);
	if (__predict_false(dvs == VGET_NONE)) {
		pwd_drop(pwd);
		return (cache_fpl_aborted(fpl));
	}

	/* Turn the prepared state into a real reference, then revalidate. */
	vget_finish_ref(dvp, dvs);
	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
		vrele(dvp);
		pwd_drop(pwd);
		return (cache_fpl_aborted(fpl));
	}

	cache_fpl_restore_partial(fpl);
#ifdef INVARIANTS
	if (cnp->cn_nameptr != fpl->snd.cn_nameptr) {
		panic("%s: cn_nameptr mismatch (%p != %p) full [%s]\n", __func__,
		    cnp->cn_nameptr, fpl->snd.cn_nameptr, cnp->cn_pnbuf);
	}
#endif

	/* Hand dvp over as the starting directory and set the internal flags. */
	ndp->ni_startdir = dvp;
	cnp->cn_flags |= MAKEENTRY;
	if (cache_fpl_islastcn(ndp))
		cnp->cn_flags |= ISLASTCN;
	if (cache_fpl_isdotdot(cnp))
		cnp->cn_flags |= ISDOTDOT;

	/*
	 * Skip potential extra slashes parsing did not take care of.
	 * cache_fplookup_skip_slashes explains the mechanism.
	 */
	if (__predict_false(*(cnp->cn_nameptr) == '/')) {
		do {
			cnp->cn_nameptr++;
			cache_fpl_pathlen_dec(fpl);
		} while (*(cnp->cn_nameptr) == '/');
	}

	/* Remaining length, counting the terminating NUL. */
	ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
#ifdef INVARIANTS
	if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
		panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
		    __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
		    cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
	}
#endif
	return (0);
}
4656
4657 static int
cache_fplookup_final_child(struct cache_fpl * fpl,enum vgetstate tvs)4658 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
4659 {
4660 struct componentname *cnp;
4661 struct vnode *tvp;
4662 seqc_t tvp_seqc;
4663 int error, lkflags;
4664
4665 cnp = fpl->cnp;
4666 tvp = fpl->tvp;
4667 tvp_seqc = fpl->tvp_seqc;
4668
4669 if ((cnp->cn_flags & LOCKLEAF) != 0) {
4670 lkflags = LK_SHARED;
4671 if ((cnp->cn_flags & LOCKSHARED) == 0)
4672 lkflags = LK_EXCLUSIVE;
4673 error = vget_finish(tvp, lkflags, tvs);
4674 if (__predict_false(error != 0)) {
4675 return (cache_fpl_aborted(fpl));
4676 }
4677 } else {
4678 vget_finish_ref(tvp, tvs);
4679 }
4680
4681 if (!vn_seqc_consistent(tvp, tvp_seqc)) {
4682 if ((cnp->cn_flags & LOCKLEAF) != 0)
4683 vput(tvp);
4684 else
4685 vrele(tvp);
4686 return (cache_fpl_aborted(fpl));
4687 }
4688
4689 return (cache_fpl_handled(fpl));
4690 }
4691
/*
 * They want to possibly modify the state of the namecache.
 *
 * Handles the last path component of a CREATE, DELETE or RENAME lookup.
 * Entered inside the SMR section.  The parent directory is referenced and
 * locked exclusively, and the filesystem's VOP_LOOKUP is called for the
 * component.
 *
 * On success, dvp stays locked only if LOCKPARENT is set and a found tvp
 * stays locked only if LOCKLEAF is set.  Cases this routine does not want
 * to deal with are handed to the slow path via cache_fpl_partial() or
 * cache_fpl_aborted().
 */
static int __noinline
cache_fplookup_final_modifying(struct cache_fpl *fpl)
{
	struct nameidata *ndp __diagused;
	struct componentname *cnp;
	enum vgetstate dvs;
	struct vnode *dvp, *tvp;
	struct mount *mp;
	seqc_t dvp_seqc;
	int error;
	bool docache;

	ndp = fpl->ndp;
	cnp = fpl->cnp;
	dvp = fpl->dvp;
	dvp_seqc = fpl->dvp_seqc;

	MPASS(*(cnp->cn_nameptr) != '/');
	MPASS(cache_fpl_islastcn(ndp));
	if ((cnp->cn_flags & LOCKPARENT) == 0)
		MPASS((cnp->cn_flags & WANTPARENT) != 0);
	MPASS((cnp->cn_flags & TRAILINGSLASH) == 0);
	MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE ||
	    cnp->cn_nameiop == RENAME);
	MPASS((cnp->cn_flags & MAKEENTRY) == 0);
	MPASS((cnp->cn_flags & ISDOTDOT) == 0);

	/* Cache the result unless NOCACHE is set or the name is going away. */
	docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
	if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)
		docache = false;

	/*
	 * Regular lookup nullifies the slash, which we don't do here.
	 * Don't take chances with filesystem routines seeing it for
	 * the last entry.
	 */
	if (cache_fpl_istrailingslash(fpl)) {
		return (cache_fpl_partial(fpl));
	}

	mp = atomic_load_ptr(&dvp->v_mount);
	if (__predict_false(mp == NULL)) {
		return (cache_fpl_aborted(fpl));
	}

	if (__predict_false(mp->mnt_flag & MNT_RDONLY)) {
		cache_fpl_smr_exit(fpl);
		/*
		 * Original code keeps not checking for CREATE which
		 * might be a bug. For now let the old lookup decide.
		 */
		if (cnp->cn_nameiop == CREATE) {
			return (cache_fpl_aborted(fpl));
		}
		return (cache_fpl_handled_error(fpl, EROFS));
	}

	/* The caller already knows of a target and demanded there be none. */
	if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) {
		cache_fpl_smr_exit(fpl);
		return (cache_fpl_handled_error(fpl, EEXIST));
	}

	/*
	 * Secure access to dvp; check cache_fplookup_partial_setup for
	 * reasoning.
	 *
	 * XXX At least UFS requires its lookup routine to be called for
	 * the last path component, which leads to some level of complication
	 * and inefficiency:
	 * - the target routine always locks the target vnode, but our caller
	 *   may not need it locked
	 * - some of the VOP machinery asserts that the parent is locked, which
	 *   once more may be not required
	 *
	 * TODO: add a flag for filesystems which don't need this.
	 */
	dvs = vget_prep_smr(dvp);
	cache_fpl_smr_exit(fpl);
	if (__predict_false(dvs == VGET_NONE)) {
		return (cache_fpl_aborted(fpl));
	}

	vget_finish_ref(dvp, dvs);
	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
		vrele(dvp);
		return (cache_fpl_aborted(fpl));
	}

	error = vn_lock(dvp, LK_EXCLUSIVE);
	if (__predict_false(error != 0)) {
		vrele(dvp);
		return (cache_fpl_aborted(fpl));
	}

	/* Set up the internal flags and let the filesystem do the lookup. */
	tvp = NULL;
	cnp->cn_flags |= ISLASTCN;
	if (docache)
		cnp->cn_flags |= MAKEENTRY;
	if (cache_fpl_isdotdot(cnp))
		cnp->cn_flags |= ISDOTDOT;
	cnp->cn_lkflags = LK_EXCLUSIVE;
	error = VOP_LOOKUP(dvp, &tvp, cnp);
	switch (error) {
	case EJUSTRETURN:
	case 0:
		break;
	case ENOTDIR:
	case ENOENT:
		/* Definitive answers, report them as is. */
		vput(dvp);
		return (cache_fpl_handled_error(fpl, error));
	default:
		/* Anything else is left for the slow path to sort out. */
		vput(dvp);
		return (cache_fpl_aborted(fpl));
	}

	fpl->tvp = tvp;

	/* No target: only the parent is handed back, see LOCKPARENT. */
	if (tvp == NULL) {
		MPASS(error == EJUSTRETURN);
		if ((cnp->cn_flags & LOCKPARENT) == 0) {
			VOP_UNLOCK(dvp);
		}
		return (cache_fpl_handled(fpl));
	}

	/*
	 * There are very hairy corner cases concerning various flag combinations
	 * and locking state. In particular here we only hold one lock instead of
	 * two.
	 *
	 * Skip the complexity as it is of no significance for normal workloads.
	 */
	if (__predict_false(tvp == dvp)) {
		vput(dvp);
		vrele(tvp);
		return (cache_fpl_aborted(fpl));
	}

	/*
	 * If they want the symlink itself we are fine, but if they want to
	 * follow it regular lookup has to be engaged.
	 */
	if (tvp->v_type == VLNK) {
		if ((cnp->cn_flags & FOLLOW) != 0) {
			vput(dvp);
			vput(tvp);
			return (cache_fpl_aborted(fpl));
		}
	}

	/*
	 * Since we expect this to be the terminal vnode it should almost never
	 * be a mount point.
	 */
	if (__predict_false(cache_fplookup_is_mp(fpl))) {
		vput(dvp);
		vput(tvp);
		return (cache_fpl_aborted(fpl));
	}

	if ((cnp->cn_flags & FAILIFEXISTS) != 0) {
		vput(dvp);
		vput(tvp);
		return (cache_fpl_handled_error(fpl, EEXIST));
	}

	/* Drop the locks the caller did not ask for; references are kept. */
	if ((cnp->cn_flags & LOCKLEAF) == 0) {
		VOP_UNLOCK(tvp);
	}

	if ((cnp->cn_flags & LOCKPARENT) == 0) {
		VOP_UNLOCK(dvp);
	}

	return (cache_fpl_handled(fpl));
}
4871
4872 static int __noinline
cache_fplookup_modifying(struct cache_fpl * fpl)4873 cache_fplookup_modifying(struct cache_fpl *fpl)
4874 {
4875 struct nameidata *ndp;
4876
4877 ndp = fpl->ndp;
4878
4879 if (!cache_fpl_islastcn(ndp)) {
4880 return (cache_fpl_partial(fpl));
4881 }
4882 return (cache_fplookup_final_modifying(fpl));
4883 }
4884
/*
 * Finish a lookup whose caller wants the parent directory back as well
 * (LOCKPARENT or WANTPARENT).
 *
 * Entered inside the SMR section.  Both dvp and tvp are secured.  dvp is
 * locked exclusively when LOCKPARENT is set and only referenced otherwise.
 * tvp is finished by cache_fplookup_final_child().  On any failure
 * whatever was obtained so far is released and the lookup is aborted.
 */
static int __noinline
cache_fplookup_final_withparent(struct cache_fpl *fpl)
{
	struct componentname *cnp;
	enum vgetstate dvs, tvs;
	struct vnode *dvp, *tvp;
	seqc_t dvp_seqc;
	int error;

	cnp = fpl->cnp;
	dvp = fpl->dvp;
	dvp_seqc = fpl->dvp_seqc;
	tvp = fpl->tvp;

	MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);

	/*
	 * This is less efficient than it can be for simplicity.
	 */
	dvs = vget_prep_smr(dvp);
	if (__predict_false(dvs == VGET_NONE)) {
		return (cache_fpl_aborted(fpl));
	}
	tvs = vget_prep_smr(tvp);
	if (__predict_false(tvs == VGET_NONE)) {
		cache_fpl_smr_exit(fpl);
		vget_abort(dvp, dvs);
		return (cache_fpl_aborted(fpl));
	}

	/* Both vnodes are pinned by the prepared states from here on. */
	cache_fpl_smr_exit(fpl);

	if ((cnp->cn_flags & LOCKPARENT) != 0) {
		error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
		if (__predict_false(error != 0)) {
			vget_abort(tvp, tvs);
			return (cache_fpl_aborted(fpl));
		}
	} else {
		vget_finish_ref(dvp, dvs);
	}

	/* Revalidate dvp now that it is secured. */
	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
		vget_abort(tvp, tvs);
		if ((cnp->cn_flags & LOCKPARENT) != 0)
			vput(dvp);
		else
			vrele(dvp);
		return (cache_fpl_aborted(fpl));
	}

	/* The child routine disposes of tvp on failure; dvp is ours to drop. */
	error = cache_fplookup_final_child(fpl, tvs);
	if (__predict_false(error != 0)) {
		MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED ||
		    fpl->status == CACHE_FPL_STATUS_DESTROYED);
		if ((cnp->cn_flags & LOCKPARENT) != 0)
			vput(dvp);
		else
			vrele(dvp);
		return (error);
	}

	MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
	return (0);
}
4950
4951 static int
cache_fplookup_final(struct cache_fpl * fpl)4952 cache_fplookup_final(struct cache_fpl *fpl)
4953 {
4954 struct componentname *cnp;
4955 enum vgetstate tvs;
4956 struct vnode *dvp, *tvp;
4957 seqc_t dvp_seqc;
4958
4959 cnp = fpl->cnp;
4960 dvp = fpl->dvp;
4961 dvp_seqc = fpl->dvp_seqc;
4962 tvp = fpl->tvp;
4963
4964 MPASS(*(cnp->cn_nameptr) != '/');
4965
4966 if (cnp->cn_nameiop != LOOKUP) {
4967 return (cache_fplookup_final_modifying(fpl));
4968 }
4969
4970 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
4971 return (cache_fplookup_final_withparent(fpl));
4972
4973 tvs = vget_prep_smr(tvp);
4974 if (__predict_false(tvs == VGET_NONE)) {
4975 return (cache_fpl_partial(fpl));
4976 }
4977
4978 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4979 cache_fpl_smr_exit(fpl);
4980 vget_abort(tvp, tvs);
4981 return (cache_fpl_aborted(fpl));
4982 }
4983
4984 cache_fpl_smr_exit(fpl);
4985 return (cache_fplookup_final_child(fpl, tvs));
4986 }
4987
4988 /*
4989 * Comment from locked lookup:
4990 * Check for degenerate name (e.g. / or "") which is a way of talking about a
4991 * directory, e.g. like "/." or ".".
4992 */
4993 static int __noinline
cache_fplookup_degenerate(struct cache_fpl * fpl)4994 cache_fplookup_degenerate(struct cache_fpl *fpl)
4995 {
4996 struct componentname *cnp;
4997 struct vnode *dvp;
4998 enum vgetstate dvs;
4999 int error, lkflags;
5000 #ifdef INVARIANTS
5001 char *cp;
5002 #endif
5003
5004 fpl->tvp = fpl->dvp;
5005 fpl->tvp_seqc = fpl->dvp_seqc;
5006
5007 cnp = fpl->cnp;
5008 dvp = fpl->dvp;
5009
5010 #ifdef INVARIANTS
5011 for (cp = cnp->cn_pnbuf; *cp != '\0'; cp++) {
5012 KASSERT(*cp == '/',
5013 ("%s: encountered non-slash; string [%s]\n", __func__,
5014 cnp->cn_pnbuf));
5015 }
5016 #endif
5017
5018 if (__predict_false(cnp->cn_nameiop != LOOKUP)) {
5019 cache_fpl_smr_exit(fpl);
5020 return (cache_fpl_handled_error(fpl, EISDIR));
5021 }
5022
5023 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) {
5024 return (cache_fplookup_final_withparent(fpl));
5025 }
5026
5027 dvs = vget_prep_smr(dvp);
5028 cache_fpl_smr_exit(fpl);
5029 if (__predict_false(dvs == VGET_NONE)) {
5030 return (cache_fpl_aborted(fpl));
5031 }
5032
5033 if ((cnp->cn_flags & LOCKLEAF) != 0) {
5034 lkflags = LK_SHARED;
5035 if ((cnp->cn_flags & LOCKSHARED) == 0)
5036 lkflags = LK_EXCLUSIVE;
5037 error = vget_finish(dvp, lkflags, dvs);
5038 if (__predict_false(error != 0)) {
5039 return (cache_fpl_aborted(fpl));
5040 }
5041 } else {
5042 vget_finish_ref(dvp, dvs);
5043 }
5044 return (cache_fpl_handled(fpl));
5045 }
5046
5047 static int __noinline
cache_fplookup_emptypath(struct cache_fpl * fpl)5048 cache_fplookup_emptypath(struct cache_fpl *fpl)
5049 {
5050 struct nameidata *ndp;
5051 struct componentname *cnp;
5052 enum vgetstate tvs;
5053 struct vnode *tvp;
5054 int error, lkflags;
5055
5056 fpl->tvp = fpl->dvp;
5057 fpl->tvp_seqc = fpl->dvp_seqc;
5058
5059 ndp = fpl->ndp;
5060 cnp = fpl->cnp;
5061 tvp = fpl->tvp;
5062
5063 MPASS(*cnp->cn_pnbuf == '\0');
5064
5065 if (__predict_false((cnp->cn_flags & EMPTYPATH) == 0)) {
5066 cache_fpl_smr_exit(fpl);
5067 return (cache_fpl_handled_error(fpl, ENOENT));
5068 }
5069
5070 MPASS((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) == 0);
5071
5072 tvs = vget_prep_smr(tvp);
5073 cache_fpl_smr_exit(fpl);
5074 if (__predict_false(tvs == VGET_NONE)) {
5075 return (cache_fpl_aborted(fpl));
5076 }
5077
5078 if ((cnp->cn_flags & LOCKLEAF) != 0) {
5079 lkflags = LK_SHARED;
5080 if ((cnp->cn_flags & LOCKSHARED) == 0)
5081 lkflags = LK_EXCLUSIVE;
5082 error = vget_finish(tvp, lkflags, tvs);
5083 if (__predict_false(error != 0)) {
5084 return (cache_fpl_aborted(fpl));
5085 }
5086 } else {
5087 vget_finish_ref(tvp, tvs);
5088 }
5089
5090 ndp->ni_resflags |= NIRES_EMPTYPATH;
5091 return (cache_fpl_handled(fpl));
5092 }
5093
/*
 * Deal with a component for which there is no entry to use.
 * NOTE(review): presumably reached on a namecache miss -- confirm against
 * the callers.
 *
 * Entered inside the SMR section.  Special shapes of the name are
 * dispatched first: overlong names, leading slashes, the empty path, and
 * an empty component.  Modifying operations go to
 * cache_fplookup_modifying().  A plain LOOKUP of the last component is
 * resolved by calling VOP_LOOKUP with the directory locked shared.
 * Anything else is handed to the slow path.
 *
 * On success the directory is released unless LOCKPARENT or WANTPARENT is
 * set and stays locked only with LOCKPARENT.  A found target stays locked
 * only with LOCKLEAF.
 */
static int __noinline
cache_fplookup_noentry(struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	enum vgetstate dvs;
	struct vnode *dvp, *tvp;
	seqc_t dvp_seqc;
	int error;

	ndp = fpl->ndp;
	cnp = fpl->cnp;
	dvp = fpl->dvp;
	dvp_seqc = fpl->dvp_seqc;

	MPASS((cnp->cn_flags & MAKEENTRY) == 0);
	MPASS((cnp->cn_flags & ISDOTDOT) == 0);
	if (cnp->cn_nameiop == LOOKUP)
		MPASS((cnp->cn_flags & NOCACHE) == 0);
	MPASS(!cache_fpl_isdotdot(cnp));

	/*
	 * Hack: delayed name len checking.
	 */
	if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
		cache_fpl_smr_exit(fpl);
		return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
	}

	/* The component starts with a slash. */
	if (cnp->cn_nameptr[0] == '/') {
		return (cache_fplookup_skip_slashes(fpl));
	}

	/* The whole path is empty. */
	if (cnp->cn_pnbuf[0] == '\0') {
		return (cache_fplookup_emptypath(fpl));
	}

	/* Nothing is left of the path: degenerate name or trailing slash. */
	if (cnp->cn_nameptr[0] == '\0') {
		if (fpl->tvp == NULL) {
			return (cache_fplookup_degenerate(fpl));
		}
		return (cache_fplookup_trailingslash(fpl));
	}

	if (cnp->cn_nameiop != LOOKUP) {
		fpl->tvp = NULL;
		return (cache_fplookup_modifying(fpl));
	}

	/*
	 * Only try to fill in the component if it is the last one,
	 * otherwise not only there may be several to handle but the
	 * walk may be complicated.
	 */
	if (!cache_fpl_islastcn(ndp)) {
		return (cache_fpl_partial(fpl));
	}

	/*
	 * Regular lookup nullifies the slash, which we don't do here.
	 * Don't take chances with filesystem routines seeing it for
	 * the last entry.
	 */
	if (cache_fpl_istrailingslash(fpl)) {
		return (cache_fpl_partial(fpl));
	}

	/*
	 * Secure access to dvp; check cache_fplookup_partial_setup for
	 * reasoning.
	 */
	dvs = vget_prep_smr(dvp);
	cache_fpl_smr_exit(fpl);
	if (__predict_false(dvs == VGET_NONE)) {
		return (cache_fpl_aborted(fpl));
	}

	vget_finish_ref(dvp, dvs);
	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
		vrele(dvp);
		return (cache_fpl_aborted(fpl));
	}

	error = vn_lock(dvp, LK_SHARED);
	if (__predict_false(error != 0)) {
		vrele(dvp);
		return (cache_fpl_aborted(fpl));
	}

	tvp = NULL;
	/*
	 * TODO: provide variants which don't require locking either vnode.
	 */
	cnp->cn_flags |= ISLASTCN | MAKEENTRY;
	/* The target is locked shared only if the caller asked for that. */
	cnp->cn_lkflags = LK_SHARED;
	if ((cnp->cn_flags & LOCKSHARED) == 0) {
		cnp->cn_lkflags = LK_EXCLUSIVE;
	}
	error = VOP_LOOKUP(dvp, &tvp, cnp);
	switch (error) {
	case EJUSTRETURN:
	case 0:
		break;
	case ENOTDIR:
	case ENOENT:
		/* Definitive answers, report them as is. */
		vput(dvp);
		return (cache_fpl_handled_error(fpl, error));
	default:
		/* Anything else is left for the slow path to sort out. */
		vput(dvp);
		return (cache_fpl_aborted(fpl));
	}

	fpl->tvp = tvp;

	/* No target: dispose of the parent according to the flags. */
	if (tvp == NULL) {
		MPASS(error == EJUSTRETURN);
		if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
			vput(dvp);
		} else if ((cnp->cn_flags & LOCKPARENT) == 0) {
			VOP_UNLOCK(dvp);
		}
		return (cache_fpl_handled(fpl));
	}

	/* Following a symlink is left to the regular lookup. */
	if (tvp->v_type == VLNK) {
		if ((cnp->cn_flags & FOLLOW) != 0) {
			vput(dvp);
			vput(tvp);
			return (cache_fpl_aborted(fpl));
		}
	}

	/* Abort if cache_fplookup_is_mp() reports a mount point. */
	if (__predict_false(cache_fplookup_is_mp(fpl))) {
		vput(dvp);
		vput(tvp);
		return (cache_fpl_aborted(fpl));
	}

	/* Drop whatever the caller did not ask for. */
	if ((cnp->cn_flags & LOCKLEAF) == 0) {
		VOP_UNLOCK(tvp);
	}

	if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
		vput(dvp);
	} else if ((cnp->cn_flags & LOCKPARENT) == 0) {
		VOP_UNLOCK(dvp);
	}
	return (cache_fpl_handled(fpl));
}
5243
5244 static int __noinline
cache_fplookup_dot(struct cache_fpl * fpl)5245 cache_fplookup_dot(struct cache_fpl *fpl)
5246 {
5247 int error;
5248
5249 MPASS(!seqc_in_modify(fpl->dvp_seqc));
5250
5251 if (__predict_false(fpl->dvp->v_type != VDIR)) {
5252 cache_fpl_smr_exit(fpl);
5253 return (cache_fpl_handled_error(fpl, ENOTDIR));
5254 }
5255
5256 /*
5257 * Just re-assign the value. seqc will be checked later for the first
5258 * non-dot path component in line and/or before deciding to return the
5259 * vnode.
5260 */
5261 fpl->tvp = fpl->dvp;
5262 fpl->tvp_seqc = fpl->dvp_seqc;
5263
5264 SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp);
5265
5266 error = 0;
5267 if (cache_fplookup_is_mp(fpl)) {
5268 error = cache_fplookup_cross_mount(fpl);
5269 }
5270 return (error);
5271 }
5272
/*
 * Handle the ".." path component.
 *
 * If dvp is ni_rootdir, ni_topdir, rootvnode or pr_root of the credential's
 * prison (or of any of its ancestors), ".." resolves to dvp itself.  A
 * VV_ROOT vnode is punted with cache_fpl_partial().  Otherwise the parent is
 * taken from the name cache entry found in dvp->v_cache_dd.
 *
 * Returns 0 with fpl->tvp and fpl->tvp_seqc set on success.
 */
static int __noinline
cache_fplookup_dotdot(struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	struct namecache *ncp;
	struct vnode *dvp;
	struct prison *pr;
	u_char nc_flag;

	ndp = fpl->ndp;
	cnp = fpl->cnp;
	dvp = fpl->dvp;

	MPASS(cache_fpl_isdotdot(cnp));

	/*
	 * XXX this is racy the same way regular lookup is
	 */
	/* pr is left non-NULL iff dvp is the root of one of the prisons. */
	for (pr = cnp->cn_cred->cr_prison; pr != NULL;
	    pr = pr->pr_parent)
		if (dvp == pr->pr_root)
			break;

	if (dvp == ndp->ni_rootdir ||
	    dvp == ndp->ni_topdir ||
	    dvp == rootvnode ||
	    pr != NULL) {
		fpl->tvp = dvp;
		fpl->tvp_seqc = vn_seqc_read_any(dvp);
		if (seqc_in_modify(fpl->tvp_seqc)) {
			return (cache_fpl_aborted(fpl));
		}
		return (0);
	}

	if ((dvp->v_vflag & VV_ROOT) != 0) {
		/*
		 * TODO
		 * The opposite of climb mount is needed here.
		 */
		return (cache_fpl_partial(fpl));
	}

	if (__predict_false(dvp->v_type != VDIR)) {
		cache_fpl_smr_exit(fpl);
		return (cache_fpl_handled_error(fpl, ENOTDIR));
	}

	ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
	if (ncp == NULL) {
		return (cache_fpl_aborted(fpl));
	}

	/*
	 * The entry is either a ".." one, in which case the parent is its
	 * target (nc_vp), or not, in which case the parent is the directory
	 * the entry hangs off (nc_dvp).  A negative ".." entry has no vnode
	 * to offer.
	 */
	nc_flag = atomic_load_char(&ncp->nc_flag);
	if ((nc_flag & NCF_ISDOTDOT) != 0) {
		if ((nc_flag & NCF_NEGATIVE) != 0)
			return (cache_fpl_aborted(fpl));
		fpl->tvp = ncp->nc_vp;
	} else {
		fpl->tvp = ncp->nc_dvp;
	}

	fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
	if (seqc_in_modify(fpl->tvp_seqc)) {
		return (cache_fpl_partial(fpl));
	}

	/*
	 * Acquire fence provided by vn_seqc_read_any above.
	 */
	/* Make sure the entry is still the one installed in dvp. */
	if (__predict_false(atomic_load_ptr(&dvp->v_cache_dd) != ncp)) {
		return (cache_fpl_aborted(fpl));
	}

	if (!cache_ncp_canuse(ncp)) {
		return (cache_fpl_aborted(fpl));
	}

	return (0);
}
5354
/*
 * Handle a hit on a negative name cache entry.
 *
 * Called by cache_fplookup_next() with the entry found and the hash of the
 * name.  Operations other than LOOKUP are handed to
 * cache_fplookup_modifying().  For plain lookups the hit is registered with
 * the cache_neg_hit_* routines and the lookup is concluded with ENOENT,
 * unless the entry cannot be used (cache_fpl_partial()) or needs promoting
 * (cache_fplookup_negative_promote()).
 */
static int __noinline
cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash)
{
	u_char nc_flag __diagused;
	bool neg_promote;

#ifdef INVARIANTS
	nc_flag = atomic_load_char(&ncp->nc_flag);
	MPASS((nc_flag & NCF_NEGATIVE) != 0);
#endif
	/*
	 * If they want to create an entry we need to replace this one.
	 */
	if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
		fpl->tvp = NULL;
		return (cache_fplookup_modifying(fpl));
	}
	/*
	 * The hit is prepared before the entry is validated; if validation
	 * fails the preparation is undone with cache_neg_hit_abort().
	 */
	neg_promote = cache_neg_hit_prep(ncp);
	if (!cache_fpl_neg_ncp_canuse(ncp)) {
		cache_neg_hit_abort(ncp);
		return (cache_fpl_partial(fpl));
	}
	if (neg_promote) {
		return (cache_fplookup_negative_promote(fpl, ncp, hash));
	}
	cache_neg_hit_finish(ncp);
	cache_fpl_smr_exit(fpl);
	return (cache_fpl_handled_error(fpl, ENOENT));
}
5384
5385 /*
5386 * Resolve a symlink. Called by filesystem-specific routines.
5387 *
5388 * Code flow is:
5389 * ... -> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve
5390 */
5391 int
cache_symlink_resolve(struct cache_fpl * fpl,const char * string,size_t len)5392 cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len)
5393 {
5394 struct nameidata *ndp;
5395 struct componentname *cnp;
5396 size_t adjust;
5397
5398 ndp = fpl->ndp;
5399 cnp = fpl->cnp;
5400
5401 if (__predict_false(len == 0)) {
5402 return (ENOENT);
5403 }
5404
5405 if (__predict_false(len > MAXPATHLEN - 2)) {
5406 if (cache_fpl_istrailingslash(fpl)) {
5407 return (EAGAIN);
5408 }
5409 }
5410
5411 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1;
5412 #ifdef INVARIANTS
5413 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
5414 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
5415 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
5416 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
5417 }
5418 #endif
5419
5420 if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) {
5421 return (ENAMETOOLONG);
5422 }
5423
5424 if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) {
5425 return (ELOOP);
5426 }
5427
5428 adjust = len;
5429 if (ndp->ni_pathlen > 1) {
5430 bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen);
5431 } else {
5432 if (cache_fpl_istrailingslash(fpl)) {
5433 adjust = len + 1;
5434 cnp->cn_pnbuf[len] = '/';
5435 cnp->cn_pnbuf[len + 1] = '\0';
5436 } else {
5437 cnp->cn_pnbuf[len] = '\0';
5438 }
5439 }
5440 bcopy(string, cnp->cn_pnbuf, len);
5441
5442 ndp->ni_pathlen += adjust;
5443 cache_fpl_pathlen_add(fpl, adjust);
5444 cnp->cn_nameptr = cnp->cn_pnbuf;
5445 fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
5446 fpl->tvp = NULL;
5447 return (0);
5448 }
5449
5450 static int __noinline
cache_fplookup_symlink(struct cache_fpl * fpl)5451 cache_fplookup_symlink(struct cache_fpl *fpl)
5452 {
5453 struct mount *mp;
5454 struct nameidata *ndp;
5455 struct componentname *cnp;
5456 struct vnode *dvp, *tvp;
5457 struct pwd *pwd;
5458 int error;
5459
5460 ndp = fpl->ndp;
5461 cnp = fpl->cnp;
5462 dvp = fpl->dvp;
5463 tvp = fpl->tvp;
5464 pwd = *(fpl->pwd);
5465
5466 if (cache_fpl_islastcn(ndp)) {
5467 if ((cnp->cn_flags & FOLLOW) == 0) {
5468 return (cache_fplookup_final(fpl));
5469 }
5470 }
5471
5472 mp = atomic_load_ptr(&dvp->v_mount);
5473 if (__predict_false(mp == NULL)) {
5474 return (cache_fpl_aborted(fpl));
5475 }
5476
5477 /*
5478 * Note this check races against setting the flag just like regular
5479 * lookup.
5480 */
5481 if (__predict_false((mp->mnt_flag & MNT_NOSYMFOLLOW) != 0)) {
5482 cache_fpl_smr_exit(fpl);
5483 return (cache_fpl_handled_error(fpl, EACCES));
5484 }
5485
5486 error = VOP_FPLOOKUP_SYMLINK(tvp, fpl);
5487 if (__predict_false(error != 0)) {
5488 switch (error) {
5489 case EAGAIN:
5490 return (cache_fpl_partial(fpl));
5491 case ENOENT:
5492 case ENAMETOOLONG:
5493 case ELOOP:
5494 cache_fpl_smr_exit(fpl);
5495 return (cache_fpl_handled_error(fpl, error));
5496 default:
5497 return (cache_fpl_aborted(fpl));
5498 }
5499 }
5500
5501 if (*(cnp->cn_nameptr) == '/') {
5502 fpl->dvp = cache_fpl_handle_root(fpl);
5503 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
5504 if (seqc_in_modify(fpl->dvp_seqc)) {
5505 return (cache_fpl_aborted(fpl));
5506 }
5507 /*
5508 * The main loop assumes that ->dvp points to a vnode belonging
5509 * to a filesystem which can do lockless lookup, but the absolute
5510 * symlink can be wandering off to one which does not.
5511 */
5512 mp = atomic_load_ptr(&fpl->dvp->v_mount);
5513 if (__predict_false(mp == NULL)) {
5514 return (cache_fpl_aborted(fpl));
5515 }
5516 if (!cache_fplookup_mp_supported(mp)) {
5517 cache_fpl_checkpoint(fpl);
5518 return (cache_fpl_partial(fpl));
5519 }
5520 if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir)) {
5521 return (cache_fpl_aborted(fpl));
5522 }
5523 }
5524 return (0);
5525 }
5526
5527 static int
cache_fplookup_next(struct cache_fpl * fpl)5528 cache_fplookup_next(struct cache_fpl *fpl)
5529 {
5530 struct componentname *cnp;
5531 struct namecache *ncp;
5532 struct vnode *dvp, *tvp;
5533 u_char nc_flag;
5534 uint32_t hash;
5535 int error;
5536
5537 cnp = fpl->cnp;
5538 dvp = fpl->dvp;
5539 hash = fpl->hash;
5540
5541 if (__predict_false(cnp->cn_nameptr[0] == '.')) {
5542 if (cnp->cn_namelen == 1) {
5543 return (cache_fplookup_dot(fpl));
5544 }
5545 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
5546 return (cache_fplookup_dotdot(fpl));
5547 }
5548 }
5549
5550 MPASS(!cache_fpl_isdotdot(cnp));
5551
5552 ncp = cache_ncp_find(dvp, cnp, hash);
5553 if (__predict_false(ncp == NULL)) {
5554 return (cache_fplookup_noentry(fpl));
5555 }
5556
5557 tvp = atomic_load_ptr(&ncp->nc_vp);
5558 nc_flag = atomic_load_char(&ncp->nc_flag);
5559 if ((nc_flag & NCF_NEGATIVE) != 0) {
5560 return (cache_fplookup_neg(fpl, ncp, hash));
5561 }
5562
5563 if (!cache_ncp_canuse(ncp)) {
5564 return (cache_fpl_partial(fpl));
5565 }
5566
5567 fpl->tvp = tvp;
5568 fpl->tvp_seqc = vn_seqc_read_any(tvp);
5569 if (seqc_in_modify(fpl->tvp_seqc)) {
5570 return (cache_fpl_partial(fpl));
5571 }
5572
5573 counter_u64_add(numposhits, 1);
5574 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
5575
5576 error = 0;
5577 if (cache_fplookup_is_mp(fpl)) {
5578 error = cache_fplookup_cross_mount(fpl);
5579 }
5580 return (error);
5581 }
5582
5583 static bool
cache_fplookup_mp_supported(struct mount * mp)5584 cache_fplookup_mp_supported(struct mount *mp)
5585 {
5586
5587 MPASS(mp != NULL);
5588 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
5589 return (false);
5590 return (true);
5591 }
5592
/*
 * Walk up the mount stack (if any).
 *
 * Correctness is provided in the following ways:
 * - all vnodes are protected from freeing with SMR
 * - struct mount objects are type stable making them always safe to access
 * - stability of the particular mount is provided by busying it
 * - relationship between the vnode which is mounted on and the mount is
 *   verified with the vnode sequence counter after busying
 * - association between root vnode of the mount and the mount is protected
 *   by busy
 *
 * From that point on we can read the sequence counter of the root vnode
 * and get the next mount on the stack (if any) using the same protection.
 *
 * By the end of successful walk we are guaranteed the reached state was
 * indeed present at least at some point which matches the regular lookup.
 *
 * Returns 0 with fpl->tvp and fpl->tvp_seqc updated to the last root vnode
 * reached (they are left untouched if nothing is mounted on tvp).  Any
 * obstacle results in cache_fpl_partial().
 */
static int __noinline
cache_fplookup_climb_mount(struct cache_fpl *fpl)
{
	struct mount *mp, *prev_mp;
	struct mount_pcpu *mpcpu, *prev_mpcpu;
	struct vnode *vp;
	seqc_t vp_seqc;

	vp = fpl->tvp;
	vp_seqc = fpl->tvp_seqc;

	VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp);
	mp = atomic_load_ptr(&vp->v_mountedhere);
	if (__predict_false(mp == NULL)) {
		return (0);
	}

	prev_mp = NULL;
	for (;;) {
		/*
		 * Enter the next mount before leaving the previous one, so
		 * that there is no window with neither of them held.
		 */
		if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
			if (prev_mp != NULL)
				vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
			return (cache_fpl_partial(fpl));
		}
		if (prev_mp != NULL)
			vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
		/*
		 * Validate the mounted-on vnode now that the mount is held.
		 */
		if (!vn_seqc_consistent(vp, vp_seqc)) {
			vfs_op_thread_exit_crit(mp, mpcpu);
			return (cache_fpl_partial(fpl));
		}
		if (!cache_fplookup_mp_supported(mp)) {
			vfs_op_thread_exit_crit(mp, mpcpu);
			return (cache_fpl_partial(fpl));
		}
		/*
		 * Step onto the root vnode of the mount.
		 */
		vp = atomic_load_ptr(&mp->mnt_rootvnode);
		if (vp == NULL) {
			vfs_op_thread_exit_crit(mp, mpcpu);
			return (cache_fpl_partial(fpl));
		}
		vp_seqc = vn_seqc_read_any(vp);
		if (seqc_in_modify(vp_seqc)) {
			vfs_op_thread_exit_crit(mp, mpcpu);
			return (cache_fpl_partial(fpl));
		}
		/*
		 * Keep the mount entered while checking whether something is
		 * mounted on its root in turn.
		 */
		prev_mp = mp;
		prev_mpcpu = mpcpu;
		mp = atomic_load_ptr(&vp->v_mountedhere);
		if (mp == NULL)
			break;
	}

	vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
	fpl->tvp = vp;
	fpl->tvp_seqc = vp_seqc;
	return (0);
}
5667
/*
 * Cross the mount point at fpl->tvp, if there is one.
 *
 * This performs the same checks as a single iteration of
 * cache_fplookup_climb_mount.  If the root vnode of the mount turns out to
 * be mounted on as well, the work is handed over to that routine, which
 * starts again from the unmodified fpl->tvp.
 *
 * Returns 0 with fpl->tvp and fpl->tvp_seqc updated to the root vnode of the
 * mount (they are left untouched if nothing is mounted on tvp).  Any
 * obstacle results in cache_fpl_partial().
 */
static int __noinline
cache_fplookup_cross_mount(struct cache_fpl *fpl)
{
	struct mount *mp;
	struct mount_pcpu *mpcpu;
	struct vnode *vp;
	seqc_t vp_seqc;

	vp = fpl->tvp;
	vp_seqc = fpl->tvp_seqc;

	VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp);
	mp = atomic_load_ptr(&vp->v_mountedhere);
	if (__predict_false(mp == NULL)) {
		return (0);
	}

	if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
		return (cache_fpl_partial(fpl));
	}
	/*
	 * Validate the mounted-on vnode now that the mount is held.
	 */
	if (!vn_seqc_consistent(vp, vp_seqc)) {
		vfs_op_thread_exit_crit(mp, mpcpu);
		return (cache_fpl_partial(fpl));
	}
	if (!cache_fplookup_mp_supported(mp)) {
		vfs_op_thread_exit_crit(mp, mpcpu);
		return (cache_fpl_partial(fpl));
	}
	vp = atomic_load_ptr(&mp->mnt_rootvnode);
	if (__predict_false(vp == NULL)) {
		vfs_op_thread_exit_crit(mp, mpcpu);
		return (cache_fpl_partial(fpl));
	}
	/*
	 * The counter of the root vnode is sampled while the mount is still
	 * entered; its value is inspected only after leaving.
	 */
	vp_seqc = vn_seqc_read_any(vp);
	vfs_op_thread_exit_crit(mp, mpcpu);
	if (seqc_in_modify(vp_seqc)) {
		return (cache_fpl_partial(fpl));
	}
	mp = atomic_load_ptr(&vp->v_mountedhere);
	if (__predict_false(mp != NULL)) {
		/*
		 * There are possibly more mount points on top.
		 * Normally this does not happen so for simplicity just start
		 * over.
		 */
		return (cache_fplookup_climb_mount(fpl));
	}

	fpl->tvp = vp;
	fpl->tvp_seqc = vp_seqc;
	return (0);
}
5720
5721 /*
5722 * Check if a vnode is mounted on.
5723 */
5724 static bool
cache_fplookup_is_mp(struct cache_fpl * fpl)5725 cache_fplookup_is_mp(struct cache_fpl *fpl)
5726 {
5727 struct vnode *vp;
5728
5729 vp = fpl->tvp;
5730 return ((vn_irflag_read(vp) & VIRF_MOUNTPOINT) != 0);
5731 }
5732
5733 /*
5734 * Parse the path.
5735 *
5736 * The code was originally copy-pasted from regular lookup and despite
5737 * clean ups leaves performance on the table. Any modifications here
 * must take into account that in case of fallback the resulting
5739 * nameidata state has to be compatible with the original.
5740 */
5741
5742 /*
5743 * Debug ni_pathlen tracking.
5744 */
5745 #ifdef INVARIANTS
5746 static void
cache_fpl_pathlen_add(struct cache_fpl * fpl,size_t n)5747 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
5748 {
5749
5750 fpl->debug.ni_pathlen += n;
5751 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
5752 ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen));
5753 }
5754
5755 static void
cache_fpl_pathlen_sub(struct cache_fpl * fpl,size_t n)5756 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
5757 {
5758
5759 fpl->debug.ni_pathlen -= n;
5760 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
5761 ("%s: pathlen underflow to %zd\n", __func__, fpl->debug.ni_pathlen));
5762 }
5763
5764 static void
cache_fpl_pathlen_inc(struct cache_fpl * fpl)5765 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
5766 {
5767
5768 cache_fpl_pathlen_add(fpl, 1);
5769 }
5770
5771 static void
cache_fpl_pathlen_dec(struct cache_fpl * fpl)5772 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
5773 {
5774
5775 cache_fpl_pathlen_sub(fpl, 1);
5776 }
5777 #else
5778 static void
cache_fpl_pathlen_add(struct cache_fpl * fpl,size_t n)5779 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
5780 {
5781 }
5782
5783 static void
cache_fpl_pathlen_sub(struct cache_fpl * fpl,size_t n)5784 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
5785 {
5786 }
5787
5788 static void
cache_fpl_pathlen_inc(struct cache_fpl * fpl)5789 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
5790 {
5791 }
5792
5793 static void
cache_fpl_pathlen_dec(struct cache_fpl * fpl)5794 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
5795 {
5796 }
5797 #endif
5798
5799 static void
cache_fplookup_parse(struct cache_fpl * fpl)5800 cache_fplookup_parse(struct cache_fpl *fpl)
5801 {
5802 struct nameidata *ndp;
5803 struct componentname *cnp;
5804 struct vnode *dvp;
5805 char *cp;
5806 uint32_t hash;
5807
5808 ndp = fpl->ndp;
5809 cnp = fpl->cnp;
5810 dvp = fpl->dvp;
5811
5812 /*
5813 * Find the end of this path component, it is either / or nul.
5814 *
5815 * Store / as a temporary sentinel so that we only have one character
5816 * to test for. Pathnames tend to be short so this should not be
5817 * resulting in cache misses.
5818 *
5819 * TODO: fix this to be word-sized.
5820 */
5821 MPASS(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] >= cnp->cn_pnbuf);
5822 KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] == fpl->nulchar,
5823 ("%s: mismatch between pathlen (%zu) and nulchar (%p != %p), string [%s]\n",
5824 __func__, fpl->debug.ni_pathlen, &cnp->cn_nameptr[fpl->debug.ni_pathlen - 1],
5825 fpl->nulchar, cnp->cn_pnbuf));
5826 KASSERT(*fpl->nulchar == '\0',
5827 ("%s: expected nul at %p; string [%s]\n", __func__, fpl->nulchar,
5828 cnp->cn_pnbuf));
5829 hash = cache_get_hash_iter_start(dvp);
5830 *fpl->nulchar = '/';
5831 for (cp = cnp->cn_nameptr; *cp != '/'; cp++) {
5832 KASSERT(*cp != '\0',
5833 ("%s: encountered unexpected nul; string [%s]\n", __func__,
5834 cnp->cn_nameptr));
5835 hash = cache_get_hash_iter(*cp, hash);
5836 continue;
5837 }
5838 *fpl->nulchar = '\0';
5839 fpl->hash = cache_get_hash_iter_finish(hash);
5840
5841 cnp->cn_namelen = cp - cnp->cn_nameptr;
5842 cache_fpl_pathlen_sub(fpl, cnp->cn_namelen);
5843
5844 #ifdef INVARIANTS
5845 /*
5846 * cache_get_hash only accepts lengths up to NAME_MAX. This is fine since
5847 * we are going to fail this lookup with ENAMETOOLONG (see below).
5848 */
5849 if (cnp->cn_namelen <= NAME_MAX) {
5850 if (fpl->hash != cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp)) {
5851 panic("%s: mismatched hash for [%s] len %ld", __func__,
5852 cnp->cn_nameptr, cnp->cn_namelen);
5853 }
5854 }
5855 #endif
5856
5857 /*
5858 * Hack: we have to check if the found path component's length exceeds
5859 * NAME_MAX. However, the condition is very rarely true and check can
5860 * be elided in the common case -- if an entry was found in the cache,
5861 * then it could not have been too long to begin with.
5862 */
5863 ndp->ni_next = cp;
5864 }
5865
5866 static void
cache_fplookup_parse_advance(struct cache_fpl * fpl)5867 cache_fplookup_parse_advance(struct cache_fpl *fpl)
5868 {
5869 struct nameidata *ndp;
5870 struct componentname *cnp;
5871
5872 ndp = fpl->ndp;
5873 cnp = fpl->cnp;
5874
5875 cnp->cn_nameptr = ndp->ni_next;
5876 KASSERT(*(cnp->cn_nameptr) == '/',
5877 ("%s: should have seen slash at %p ; buf %p [%s]\n", __func__,
5878 cnp->cn_nameptr, cnp->cn_pnbuf, cnp->cn_pnbuf));
5879 cnp->cn_nameptr++;
5880 cache_fpl_pathlen_dec(fpl);
5881 }
5882
5883 /*
5884 * Skip spurious slashes in a pathname (e.g., "foo///bar") and retry.
5885 *
5886 * Lockless lookup tries to elide checking for spurious slashes and should they
5887 * be present is guaranteed to fail to find an entry. In this case the caller
5888 * must check if the name starts with a slash and call this routine. It is
5889 * going to fast forward across the spurious slashes and set the state up for
5890 * retry.
5891 */
5892 static int __noinline
cache_fplookup_skip_slashes(struct cache_fpl * fpl)5893 cache_fplookup_skip_slashes(struct cache_fpl *fpl)
5894 {
5895 struct nameidata *ndp;
5896 struct componentname *cnp;
5897
5898 ndp = fpl->ndp;
5899 cnp = fpl->cnp;
5900
5901 MPASS(*(cnp->cn_nameptr) == '/');
5902 do {
5903 cnp->cn_nameptr++;
5904 cache_fpl_pathlen_dec(fpl);
5905 } while (*(cnp->cn_nameptr) == '/');
5906
5907 /*
5908 * Go back to one slash so that cache_fplookup_parse_advance has
5909 * something to skip.
5910 */
5911 cnp->cn_nameptr--;
5912 cache_fpl_pathlen_inc(fpl);
5913
5914 /*
5915 * cache_fplookup_parse_advance starts from ndp->ni_next
5916 */
5917 ndp->ni_next = cnp->cn_nameptr;
5918
5919 /*
5920 * See cache_fplookup_dot.
5921 */
5922 fpl->tvp = fpl->dvp;
5923 fpl->tvp_seqc = fpl->dvp_seqc;
5924
5925 return (0);
5926 }
5927
/*
 * Handle trailing slashes (e.g., "foo/").
 *
 * If a trailing slash is found the terminal vnode must be a directory.
 * Regular lookup shortens the path by nullifying the first trailing slash and
 * sets the TRAILINGSLASH flag to denote this took place. There are several
 * checks on it performed later.
 *
 * Similarly to spurious slashes, lockless lookup handles this in a speculative
 * manner relying on an invariant that a non-directory vnode will get a miss.
 * In this case cn_nameptr[0] == '\0' and cn_namelen == 0.
 *
 * Thus for a path like "foo/bar/" the code unwinds the state back to "bar/"
 * and denotes this is the last path component, which avoids looping back.
 *
 * Only plain lookups are supported for now to restrict corner cases to handle.
 *
 * Returns 0 with cn_nameptr/cn_namelen describing the last real component and
 * fpl->dvp set to its parent; ENOTDIR is reported if the vnode reached is not
 * a directory.
 */
static int __noinline
cache_fplookup_trailingslash(struct cache_fpl *fpl)
{
#ifdef INVARIANTS
	size_t ni_pathlen;
#endif
	struct nameidata *ndp;
	struct componentname *cnp;
	struct namecache *ncp;
	struct vnode *tvp;
	char *cn_nameptr_orig, *cn_nameptr_slash;
	seqc_t tvp_seqc;
	u_char nc_flag;

	ndp = fpl->ndp;
	cnp = fpl->cnp;
	tvp = fpl->tvp;
	tvp_seqc = fpl->tvp_seqc;

	MPASS(fpl->dvp == fpl->tvp);
	KASSERT(cache_fpl_istrailingslash(fpl),
	    ("%s: expected trailing slash at %p; string [%s]\n", __func__, fpl->nulchar - 1,
	    cnp->cn_pnbuf));
	KASSERT(cnp->cn_nameptr[0] == '\0',
	    ("%s: expected nul char at %p; string [%s]\n", __func__, &cnp->cn_nameptr[0],
	    cnp->cn_pnbuf));
	KASSERT(cnp->cn_namelen == 0,
	    ("%s: namelen 0 but got %ld; string [%s]\n", __func__, cnp->cn_namelen,
	    cnp->cn_pnbuf));
	MPASS(cnp->cn_nameptr > cnp->cn_pnbuf);

	if (cnp->cn_nameiop != LOOKUP) {
		return (cache_fpl_aborted(fpl));
	}

	/*
	 * The error is only reported if the vnode is known to be stable.
	 */
	if (__predict_false(tvp->v_type != VDIR)) {
		if (!vn_seqc_consistent(tvp, tvp_seqc)) {
			return (cache_fpl_aborted(fpl));
		}
		cache_fpl_smr_exit(fpl);
		return (cache_fpl_handled_error(fpl, ENOTDIR));
	}

	/*
	 * Denote the last component.
	 */
	ndp->ni_next = &cnp->cn_nameptr[0];
	MPASS(cache_fpl_islastcn(ndp));

	/*
	 * Unwind trailing slashes.
	 */
	cn_nameptr_orig = cnp->cn_nameptr;
	while (cnp->cn_nameptr >= cnp->cn_pnbuf) {
		cnp->cn_nameptr--;
		if (cnp->cn_nameptr[0] != '/') {
			break;
		}
	}

	/*
	 * Unwind to the beginning of the path component.
	 *
	 * Note the path may or may not have started with a slash.
	 */
	/* cn_nameptr_slash remembers the last character of the component. */
	cn_nameptr_slash = cnp->cn_nameptr;
	while (cnp->cn_nameptr > cnp->cn_pnbuf) {
		cnp->cn_nameptr--;
		if (cnp->cn_nameptr[0] == '/') {
			break;
		}
	}
	if (cnp->cn_nameptr[0] == '/') {
		cnp->cn_nameptr++;
	}

	/*
	 * Account for everything that was unwound, that is the component
	 * itself along with the slashes following it.
	 */
	cnp->cn_namelen = cn_nameptr_slash - cnp->cn_nameptr + 1;
	cache_fpl_pathlen_add(fpl, cn_nameptr_orig - cnp->cn_nameptr);
	cache_fpl_checkpoint(fpl);

#ifdef INVARIANTS
	ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
	if (ni_pathlen != fpl->debug.ni_pathlen) {
		panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
		    __func__, ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
		    cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
	}
#endif

	/*
	 * If this was a "./" lookup the parent directory is already correct.
	 */
	if (cnp->cn_nameptr[0] == '.' && cnp->cn_namelen == 1) {
		return (0);
	}

	/*
	 * Otherwise we need to look it up.
	 */
	/*
	 * The parent is taken from the entry in v_cache_dd of the directory,
	 * provided it is not a ".." entry.
	 */
	tvp = fpl->tvp;
	ncp = atomic_load_consume_ptr(&tvp->v_cache_dd);
	if (__predict_false(ncp == NULL)) {
		return (cache_fpl_aborted(fpl));
	}
	nc_flag = atomic_load_char(&ncp->nc_flag);
	if ((nc_flag & NCF_ISDOTDOT) != 0) {
		return (cache_fpl_aborted(fpl));
	}
	fpl->dvp = ncp->nc_dvp;
	fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
	if (seqc_in_modify(fpl->dvp_seqc)) {
		return (cache_fpl_aborted(fpl));
	}
	return (0);
}
6060
6061 /*
6062 * See the API contract for VOP_FPLOOKUP_VEXEC.
6063 */
6064 static int __noinline
cache_fplookup_failed_vexec(struct cache_fpl * fpl,int error)6065 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
6066 {
6067 struct componentname *cnp;
6068 struct vnode *dvp;
6069 seqc_t dvp_seqc;
6070
6071 cnp = fpl->cnp;
6072 dvp = fpl->dvp;
6073 dvp_seqc = fpl->dvp_seqc;
6074
6075 /*
6076 * Hack: delayed empty path checking.
6077 */
6078 if (cnp->cn_pnbuf[0] == '\0') {
6079 return (cache_fplookup_emptypath(fpl));
6080 }
6081
6082 /*
6083 * TODO: Due to ignoring trailing slashes lookup will perform a
6084 * permission check on the last dir when it should not be doing it. It
6085 * may fail, but said failure should be ignored. It is possible to fix
6086 * it up fully without resorting to regular lookup, but for now just
6087 * abort.
6088 */
6089 if (cache_fpl_istrailingslash(fpl)) {
6090 return (cache_fpl_aborted(fpl));
6091 }
6092
6093 /*
6094 * Hack: delayed degenerate path checking.
6095 */
6096 if (cnp->cn_nameptr[0] == '\0' && fpl->tvp == NULL) {
6097 return (cache_fplookup_degenerate(fpl));
6098 }
6099
6100 /*
6101 * Hack: delayed name len checking.
6102 */
6103 if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
6104 cache_fpl_smr_exit(fpl);
6105 return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
6106 }
6107
6108 /*
6109 * Hack: they may be looking up foo/bar, where foo is not a directory.
6110 * In such a case we need to return ENOTDIR, but we may happen to get
6111 * here with a different error.
6112 */
6113 if (dvp->v_type != VDIR) {
6114 error = ENOTDIR;
6115 }
6116
6117 /*
6118 * Hack: handle O_SEARCH.
6119 *
6120 * Open Group Base Specifications Issue 7, 2018 edition states:
6121 * <quote>
6122 * If the access mode of the open file description associated with the
6123 * file descriptor is not O_SEARCH, the function shall check whether
6124 * directory searches are permitted using the current permissions of
6125 * the directory underlying the file descriptor. If the access mode is
6126 * O_SEARCH, the function shall not perform the check.
6127 * </quote>
6128 *
6129 * Regular lookup tests for the NOEXECCHECK flag for every path
6130 * component to decide whether to do the permission check. However,
6131 * since most lookups never have the flag (and when they do it is only
6132 * present for the first path component), lockless lookup only acts on
6133 * it if there is a permission problem. Here the flag is represented
6134 * with a boolean so that we don't have to clear it on the way out.
6135 *
6136 * For simplicity this always aborts.
6137 * TODO: check if this is the first lookup and ignore the permission
6138 * problem. Note the flag has to survive fallback (if it happens to be
6139 * performed).
6140 */
6141 if (fpl->fsearch) {
6142 return (cache_fpl_aborted(fpl));
6143 }
6144
6145 switch (error) {
6146 case EAGAIN:
6147 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
6148 error = cache_fpl_aborted(fpl);
6149 } else {
6150 cache_fpl_partial(fpl);
6151 }
6152 break;
6153 default:
6154 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
6155 error = cache_fpl_aborted(fpl);
6156 } else {
6157 cache_fpl_smr_exit(fpl);
6158 cache_fpl_handled_error(fpl, error);
6159 }
6160 break;
6161 }
6162 return (error);
6163 }
6164
/*
 * Main loop of the lockless lookup, starting at dvp.
 *
 * Each iteration parses one path component, performs the permission check on
 * the current directory with VOP_FPLOOKUP_VEXEC, resolves the component with
 * cache_fplookup_next and then either handles the symlink found, finishes
 * the lookup (last component) or descends into the vnode found.
 *
 * The loop is left as soon as the lookup is terminated according to
 * cache_fpl_terminated(), or after the permission check failed or the last
 * component got processed.  Returns the error produced by the step which
 * ended the loop.
 */
static int
cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	struct mount *mp;
	int error;

	ndp = fpl->ndp;
	cnp = fpl->cnp;

	cache_fpl_checkpoint(fpl);

	/*
	 * The vnode at hand is almost always stable, skip checking for it.
	 * Worst case this postpones the check towards the end of the iteration
	 * of the main loop.
	 */
	fpl->dvp = dvp;
	fpl->dvp_seqc = vn_seqc_read_notmodify(fpl->dvp);

	/*
	 * The starting directory has to belong to a filesystem which opted
	 * into lockless lookup.
	 */
	mp = atomic_load_ptr(&dvp->v_mount);
	if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) {
		return (cache_fpl_aborted(fpl));
	}

	MPASS(fpl->tvp == NULL);

	for (;;) {
		cache_fplookup_parse(fpl);

		error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
		if (__predict_false(error != 0)) {
			error = cache_fplookup_failed_vexec(fpl, error);
			break;
		}

		/*
		 * Whether to continue is decided by the state of the lookup
		 * and not by the value returned.
		 */
		error = cache_fplookup_next(fpl);
		if (__predict_false(cache_fpl_terminated(fpl))) {
			break;
		}

		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

		if (fpl->tvp->v_type == VLNK) {
			error = cache_fplookup_symlink(fpl);
			if (cache_fpl_terminated(fpl)) {
				break;
			}
		} else {
			if (cache_fpl_islastcn(ndp)) {
				error = cache_fplookup_final(fpl);
				break;
			}

			/*
			 * Validate the directory being left before stepping
			 * into the vnode found.
			 */
			if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
				error = cache_fpl_aborted(fpl);
				break;
			}

			fpl->dvp = fpl->tvp;
			fpl->dvp_seqc = fpl->tvp_seqc;
			cache_fplookup_parse_advance(fpl);
		}

		cache_fpl_checkpoint(fpl);
	}

	return (error);
}
6235
6236 /*
6237 * Fast path lookup protected with SMR and sequence counters.
6238 *
6239 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
6240 *
6241 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
6242 * outlined below.
6243 *
6244 * Traditional vnode lookup conceptually looks like this:
6245 *
6246 * vn_lock(current);
6247 * for (;;) {
6248 * next = find();
6249 * vn_lock(next);
6250 * vn_unlock(current);
6251 * current = next;
6252 * if (last)
6253 * break;
6254 * }
6255 * return (current);
6256 *
6257 * Each jump to the next vnode is safe memory-wise and atomic with respect to
6258 * any modifications thanks to holding respective locks.
6259 *
6260 * The same guarantee can be provided with a combination of safe memory
6261 * reclamation and sequence counters instead. If all operations which affect
6262 * the relationship between the current vnode and the one we are looking for
6263 * also modify the counter, we can verify whether all the conditions held as
6264 * we made the jump. This includes things like permissions, mount points etc.
6265 * Counter modification is provided by enclosing relevant places in
6266 * vn_seqc_write_begin()/end() calls.
6267 *
6268 * Thus this translates to:
6269 *
6270 * vfs_smr_enter();
6271 * dvp_seqc = seqc_read_any(dvp);
6272 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
6273 * abort();
6274 * for (;;) {
6275 * tvp = find();
6276 * tvp_seqc = seqc_read_any(tvp);
6277 * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
6278 * abort();
 * 	if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
6280 * abort();
6281 * dvp = tvp; // we know nothing of importance has changed
6282 * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
6283 * if (last)
6284 * break;
6285 * }
6286 * vget(); // secure the vnode
 * if (!seqc_consistent(tvp, tvp_seqc)) // final check
6288 * abort();
6289 * // at this point we know nothing has changed for any parent<->child pair
6290 * // as they were crossed during the lookup, meaning we matched the guarantee
6291 * // of the locked variant
6292 * return (tvp);
6293 *
6294 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
6295 * - they are called while within vfs_smr protection which they must never exit
6296 * - EAGAIN can be returned to denote checking could not be performed, it is
6297 * always valid to return it
6298 * - if the sequence counter has not changed the result must be valid
6299 * - if the sequence counter has changed both false positives and false negatives
6300 * are permitted (since the result will be rejected later)
6301 * - for simple cases of unix permission checks vaccess_vexec_smr can be used
6302 *
6303 * Caveats to watch out for:
6304 * - vnodes are passed unlocked and unreferenced with nothing stopping
6305 * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
6306 * to use atomic_load_ptr to fetch it.
6307 * - the aforementioned object can also get freed, meaning absent other means it
6308 * should be protected with vfs_smr
6309 * - either safely checking permissions as they are modified or guaranteeing
6310 * their stability is left to the routine
6311 */
6312 int
cache_fplookup(struct nameidata * ndp,enum cache_fpl_status * status,struct pwd ** pwdp)6313 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
6314 struct pwd **pwdp)
6315 {
6316 struct cache_fpl fpl;
6317 struct pwd *pwd;
6318 struct vnode *dvp;
6319 struct componentname *cnp;
6320 int error;
6321
6322 fpl.status = CACHE_FPL_STATUS_UNSET;
6323 fpl.in_smr = false;
6324 fpl.ndp = ndp;
6325 fpl.cnp = cnp = &ndp->ni_cnd;
6326 MPASS(ndp->ni_lcf == 0);
6327 KASSERT ((cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
6328 ("%s: internal flags found in cn_flags %" PRIx64, __func__,
6329 cnp->cn_flags));
6330 MPASS(cnp->cn_nameptr == cnp->cn_pnbuf);
6331 MPASS(ndp->ni_resflags == 0);
6332
6333 if (__predict_false(!cache_can_fplookup(&fpl))) {
6334 *status = fpl.status;
6335 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
6336 return (EOPNOTSUPP);
6337 }
6338
6339 cache_fpl_checkpoint_outer(&fpl);
6340
6341 cache_fpl_smr_enter_initial(&fpl);
6342 #ifdef INVARIANTS
6343 fpl.debug.ni_pathlen = ndp->ni_pathlen;
6344 #endif
6345 fpl.nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
6346 fpl.fsearch = false;
6347 fpl.tvp = NULL; /* for degenerate path handling */
6348 fpl.pwd = pwdp;
6349 pwd = pwd_get_smr();
6350 *(fpl.pwd) = pwd;
6351 namei_setup_rootdir(ndp, cnp, pwd);
6352 ndp->ni_topdir = pwd->pwd_jdir;
6353
6354 if (cnp->cn_pnbuf[0] == '/') {
6355 dvp = cache_fpl_handle_root(&fpl);
6356 ndp->ni_resflags = NIRES_ABS;
6357 } else {
6358 if (ndp->ni_dirfd == AT_FDCWD) {
6359 dvp = pwd->pwd_cdir;
6360 } else {
6361 error = cache_fplookup_dirfd(&fpl, &dvp);
6362 if (__predict_false(error != 0)) {
6363 goto out;
6364 }
6365 }
6366 }
6367
6368 SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
6369 error = cache_fplookup_impl(dvp, &fpl);
6370 out:
6371 cache_fpl_smr_assert_not_entered(&fpl);
6372 cache_fpl_assert_status(&fpl);
6373 *status = fpl.status;
6374 if (SDT_PROBES_ENABLED()) {
6375 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
6376 if (fpl.status == CACHE_FPL_STATUS_HANDLED)
6377 SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
6378 ndp);
6379 }
6380
6381 if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) {
6382 MPASS(error != CACHE_FPL_FAILED);
6383 if (error != 0) {
6384 cache_fpl_cleanup_cnp(fpl.cnp);
6385 MPASS(fpl.dvp == NULL);
6386 MPASS(fpl.tvp == NULL);
6387 }
6388 ndp->ni_dvp = fpl.dvp;
6389 ndp->ni_vp = fpl.tvp;
6390 }
6391 return (error);
6392 }
6393