1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1989, 1993, 1995
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Poul-Henning Kamp of the FreeBSD Project.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
35 */
36
37 #include <sys/cdefs.h>
38 #include "opt_ddb.h"
39 #include "opt_ktrace.h"
40
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/capsicum.h>
44 #include <sys/counter.h>
45 #include <sys/filedesc.h>
46 #include <sys/fnv_hash.h>
47 #include <sys/kernel.h>
48 #include <sys/ktr.h>
49 #include <sys/lock.h>
50 #include <sys/malloc.h>
51 #include <sys/fcntl.h>
52 #include <sys/jail.h>
53 #include <sys/mount.h>
54 #include <sys/namei.h>
55 #include <sys/proc.h>
56 #include <sys/seqc.h>
57 #include <sys/sdt.h>
58 #include <sys/smr.h>
59 #include <sys/smp.h>
60 #include <sys/syscallsubr.h>
61 #include <sys/sysctl.h>
62 #include <sys/sysproto.h>
63 #include <sys/vnode.h>
64 #include <ck_queue.h>
65 #ifdef KTRACE
66 #include <sys/ktrace.h>
67 #endif
68 #ifdef INVARIANTS
69 #include <machine/_inttypes.h>
70 #endif
71
72 #include <security/audit/audit.h>
73 #include <security/mac/mac_framework.h>
74
75 #ifdef DDB
76 #include <ddb/ddb.h>
77 #endif
78
79 #include <vm/uma.h>
80
81 /*
82 * High level overview of name caching in the VFS layer.
83 *
84 * Originally caching was implemented as part of UFS, later extracted to allow
85 * use by other filesystems. A decision was made to make it optional and
86 * completely detached from the rest of the kernel, which comes with limitations
87 * outlined near the end of this comment block.
88 *
89 * This fundamental choice needs to be revisited. In the meantime, the current
90 * state is described below. The significance of all notable routines is explained
91 * in comments placed above their implementations. Scattered throughout the
92 * file are TODO comments indicating shortcomings which can be fixed without
93 * reworking everything (most of the fixes will likely be reusable). Various
94 * details are omitted from this explanation to avoid cluttering the overview; they
95 * have to be checked by reading the code and associated commentary.
96 *
97 * Keep in mind that it's individual path components which are cached, not full
98 * paths. That is, for a fully cached path "foo/bar/baz" there are 3 entries,
99 * one for each name.
100 *
101 * I. Data organization
102 *
103 * Entries are described by "struct namecache" objects and stored in a hash
104 * table. See cache_get_hash for more information.
105 *
106 * "struct vnode" contains pointers to source entries (names which can be found
107 * when traversing through said vnode), destination entries (names of that
108 * vnode (see "Limitations" for a breakdown on the subject)) and a pointer to
109 * the parent vnode.
110 *
111 * The (directory vnode; name) tuple reliably determines the target entry if
112 * it exists.
113 *
114 * Since there are no small locks at this time (all are 32 bytes in size on
115 * LP64), the code works around the problem by introducing lock arrays to
116 * protect hash buckets and vnode lists.
117 *
118 * II. Filesystem integration
119 *
120 * Filesystems participating in name caching do the following:
121 * - set vop_lookup routine to vfs_cache_lookup
122 * - set vop_cachedlookup to whatever can perform the lookup if the above fails
123 * - if they support lockless lookup (see below), vop_fplookup_vexec and
124 * vop_fplookup_symlink are set along with the MNTK_FPLOOKUP flag on the
125 * mount point
126 * - call cache_purge or cache_vop_* routines to eliminate stale entries as
127 * applicable
128 * - call cache_enter to add entries depending on the MAKEENTRY flag
129 *
130 * With the above in mind, there are 2 entry points when doing lookups:
131 * - ... -> namei -> cache_fplookup -- this is the default
132 * - ... -> VOP_LOOKUP -> vfs_cache_lookup -- normally only called by namei
133 * should the above fail
134 *
135 * Example code flow how an entry is added:
136 * ... -> namei -> cache_fplookup -> cache_fplookup_noentry -> VOP_LOOKUP ->
137 * vfs_cache_lookup -> VOP_CACHEDLOOKUP -> ufs_lookup_ino -> cache_enter
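*
* A minimal sketch of the wiring for a hypothetical "foofs" (illustration only;
* foofs_lookup, foofs_fplookup_vexec and foofs_vnodeops are made-up names, the
* rest are the real hooks):
*
*	struct vop_vector foofs_vnodeops = {
*		.vop_lookup = vfs_cache_lookup,
*		.vop_cachedlookup = foofs_lookup,
*		.vop_fplookup_vexec = foofs_fplookup_vexec,
*		...
*	};
*
* and, if lockless lookup is supported, in the mount routine:
*
*	MNT_ILOCK(mp);
*	mp->mnt_kern_flag |= MNTK_FPLOOKUP;
*	MNT_IUNLOCK(mp);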
138 *
139 * III. Performance considerations
140 *
141 * For the lockless case, forward lookup avoids any writes to shared areas apart
142 * from the terminal path component. In other words non-modifying lookups of
143 * different files don't suffer any scalability problems in the namecache.
144 * Looking up the same file is limited by VFS and goes beyond the scope of this
145 * file.
146 *
147 * At least on amd64 the single-threaded bottleneck for long paths is hashing
148 * (see cache_get_hash). There are cases where the code issues the acquire fence
149 * multiple times; these can be combined on architectures which suffer from it.
150 *
151 * For the locked case, each encountered vnode has to be referenced and locked in
152 * order to be handed out to the caller (normally that's namei). This introduces
153 * a significant single-threaded hit and serialization when multi-threaded.
154 *
155 * Reverse lookup (e.g., "getcwd") fully scales provided it is fully cached --
156 * it avoids any writes to shared areas for any of the components.
157 *
158 * Unrelated insertions are partially serialized on updating the global entry
159 * counter and possibly serialized on colliding bucket or vnode locks.
160 *
161 * IV. Observability
162 *
163 * Note that not everything has an explicit dtrace probe, nor should it; thus
164 * some of the one-liners below depend on implementation details.
165 *
166 * Examples:
167 *
168 * # Check what lookups failed to be handled in a lockless manner. Column 1 is
169 * # line number, column 2 is status code (see cache_fpl_status)
170 * dtrace -n 'vfs:fplookup:lookup:done { @[arg1, arg2] = count(); }'
171 *
172 * # Lengths of names added by binary name
173 * dtrace -n 'fbt::cache_enter_time:entry { @[execname] = quantize(args[2]->cn_namelen); }'
174 *
175 * # Same as above but only those which exceed 64 characters
176 * dtrace -n 'fbt::cache_enter_time:entry /args[2]->cn_namelen > 64/ { @[execname] = quantize(args[2]->cn_namelen); }'
177 *
178 * # Who is performing lookups with spurious slashes (e.g., "foo//bar") and what
179 * # path is it
180 * dtrace -n 'fbt::cache_fplookup_skip_slashes:entry { @[execname, stringof(args[0]->cnp->cn_pnbuf)] = count(); }'
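*
* # Who is taking negative hits and on which names. Note probe name components
* # defined with "__" below (e.g., hit__negative) appear with "-" in dtrace.
* dtrace -n 'vfs:namecache:lookup:hit-negative { @[execname, stringof(arg1)] = count(); }'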
181 *
182 * V. Limitations and implementation defects
183 *
184 * - since it is possible there is no entry for an open file, tools like
185 * "procstat" may fail to resolve fd -> vnode -> path to anything
186 * - even if a filesystem adds an entry, it may get purged (e.g., due to memory
187 * shortage) in which case the above problem applies
188 * - hardlinks are not tracked, thus if a vnode is reachable in more than one
189 * way, resolving a name may return a different path than the one used to
190 * open it (even if said path is still valid)
191 * - by default entries are not added for newly created files
192 * - adding an entry may need to evict a negative entry first, which happens in 2
193 * distinct places (evicting on lookup, adding in a later VOP), making it
194 * impossible to simply reuse it
195 * - there is a simple scheme to evict negative entries as the cache is approaching
196 * its capacity, but it is very unclear if doing so is a good idea to begin with
197 * - vnodes are subject to being recycled even if the target inode is left in memory,
198 * which loses the name cache entries when it perhaps should not. In the case of
199 * tmpfs, names get duplicated -- kept by the filesystem itself and the namecache separately
200 * - struct namecache has a fixed size and comes in 2 variants, often wasting space.
201 * It is now hard to replace with malloc due to the dependence on SMR.
202 * - lack of better integration with the kernel also turns nullfs into a layered
203 * filesystem instead of something which can take advantage of caching
204 */
205
206 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
207 "Name cache");
208
209 SDT_PROVIDER_DECLARE(vfs);
210 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
211 "struct vnode *");
212 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
213 "struct vnode *");
214 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
215 "char *");
216 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
217 "const char *");
218 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
219 "struct namecache *", "int", "int");
220 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
221 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
222 "char *", "struct vnode *");
223 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
224 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
225 "struct vnode *", "char *");
226 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
227 "struct vnode *");
228 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
229 "struct vnode *", "char *");
230 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
231 "char *");
232 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
233 "struct componentname *");
234 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
235 "struct componentname *");
236 SDT_PROBE_DEFINE3(vfs, namecache, purge, done, "struct vnode *", "size_t", "size_t");
237 SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int");
238 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
239 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
240 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
241 "struct vnode *");
242 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
243 "char *");
244 SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
245 "char *");
246 SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t");
247
248 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
249 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
250 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
251
252 static char __read_frequently cache_fast_lookup_enabled = true;
253
254 /*
255 * This structure describes the elements in the cache of recent
256 * names looked up by namei.
257 */
258 struct negstate {
259 u_char neg_flag;
260 u_char neg_hit;
261 };
262 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
263 "the state must fit in a union with a pointer without growing it");
264
265 struct namecache {
266 LIST_ENTRY(namecache) nc_src; /* source vnode list */
267 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */
268 CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
269 struct vnode *nc_dvp; /* vnode of parent of name */
270 union {
271 struct vnode *nu_vp; /* vnode the name refers to */
272 struct negstate nu_neg;/* negative entry state */
273 } n_un;
274 u_char nc_flag; /* flag bits */
275 u_char nc_nlen; /* length of name */
276 char nc_name[]; /* segment name + nul */
277 };
278
279 /*
280 * struct namecache_ts repeats struct namecache layout up to the
281 * nc_nlen member.
282 * struct namecache_ts is used in place of struct namecache when time(s) need
283 * to be stored. The nc_dotdottime field is used when a cache entry is mapping
284 * both a non-dotdot directory name plus dotdot for the directory's
285 * parent.
286 *
287 * See below for alignment requirement.
288 */
289 struct namecache_ts {
290 struct timespec nc_time; /* timespec provided by fs */
291 struct timespec nc_dotdottime; /* dotdot timespec provided by fs */
292 int nc_ticks; /* ticks value when entry was added */
293 int nc_pad;
294 struct namecache nc_nc;
295 };
296
297 TAILQ_HEAD(cache_freebatch, namecache);
298
299 /*
300 * At least mips n32 performs 64-bit accesses to timespec as found
301 * in namecache_ts and requires them to be aligned. Since others
302 * may be in the same spot, suffer a little bit and enforce the
303 * alignment for everyone. Note this is a nop for 64-bit platforms.
304 */
305 #define CACHE_ZONE_ALIGNMENT UMA_ALIGNOF(time_t)
306
307 /*
308 * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the
309 * 4.4 BSD codebase. Later on struct namecache was tweaked to become
310 * smaller and the value was bumped to retain the total size, but it
311 * was never re-evaluated for suitability. A simple test counting
312 * lengths during package building shows that the value of 45 covers
313 * about 86% of all added entries, reaching 99% at 65.
314 *
315 * Regardless of the above, use of dedicated zones instead of malloc may be
316 * inducing additional waste. This may be hard to address as said zones are
317 * tied to VFS SMR. Even if retaining them, the current split should be
318 * re-evaluated.
319 */
320 #ifdef __LP64__
321 #define CACHE_PATH_CUTOFF 45
322 #define CACHE_LARGE_PAD 6
323 #else
324 #define CACHE_PATH_CUTOFF 41
325 #define CACHE_LARGE_PAD 2
326 #endif
327
328 #define CACHE_ZONE_SMALL_SIZE (offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1)
329 #define CACHE_ZONE_SMALL_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE)
330 #define CACHE_ZONE_LARGE_SIZE (offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD)
331 #define CACHE_ZONE_LARGE_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE)
332
333 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
334 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
335 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
336 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
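
/*
 * A back-of-the-envelope check of the sizes above on LP64, assuming 8-byte
 * pointers and no padding surprises in struct namecache (illustration only;
 * re-derive with offsetof/sizeof if the layout changes): nc_src (16) +
 * nc_dst (16) + nc_hash (8) + nc_dvp (8) + n_un (8) + nc_flag (1) +
 * nc_nlen (1) puts nc_name at offset 58, so the small zone is
 * 58 + 45 + 1 = 104 bytes and the large one 58 + 255 + 1 + 6 = 320. The
 * namecache_ts header adds 2 * 16 + 4 + 4 = 40, giving 144 and 360. All four
 * are multiples of 8 (CACHE_ZONE_ALIGNMENT + 1 with an 8-byte time_t), which
 * is what the asserts verify.
 */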
337
338 #define nc_vp n_un.nu_vp
339 #define nc_neg n_un.nu_neg
340
341 /*
342 * Flags in namecache.nc_flag
343 */
344 #define NCF_WHITE 0x01
345 #define NCF_ISDOTDOT 0x02
346 #define NCF_TS 0x04
347 #define NCF_DTS 0x08
348 #define NCF_DVDROP 0x10
349 #define NCF_NEGATIVE 0x20
350 #define NCF_INVALID 0x40
351 #define NCF_WIP 0x80
352
353 /*
354 * Flags in negstate.neg_flag
355 */
356 #define NEG_HOT 0x01
357
358 static bool cache_neg_evict_cond(u_long lnumcache);
359
360 /*
361 * Mark an entry as invalid.
362 *
363 * This is called before it starts getting deconstructed.
364 */
365 static void
366 cache_ncp_invalidate(struct namecache *ncp)
367 {
368
369 KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
370 ("%s: entry %p already invalid", __func__, ncp));
371 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
372 atomic_thread_fence_rel();
373 }
374
375 /*
376 * Check whether the entry can be safely used.
377 *
378 * All places which elide locks are supposed to call this after they are
379 * done with reading from an entry.
380 */
381 #define cache_ncp_canuse(ncp) ({ \
382 struct namecache *_ncp = (ncp); \
383 u_char _nc_flag; \
384 \
385 atomic_thread_fence_acq(); \
386 _nc_flag = atomic_load_char(&_ncp->nc_flag); \
387 __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0); \
388 })
389
390 /*
391 * Like the above but also checks NCF_WHITE.
392 */
393 #define cache_fpl_neg_ncp_canuse(ncp) ({ \
394 struct namecache *_ncp = (ncp); \
395 u_char _nc_flag; \
396 \
397 atomic_thread_fence_acq(); \
398 _nc_flag = atomic_load_char(&_ncp->nc_flag); \
399 __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0); \
400 })
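
/*
 * Sketch of the expected lockless consumer pattern (illustration only, the
 * hash lookup and fallback are elided): load the fields of interest while
 * inside the SMR section, then validate the entry before trusting the copies.
 *
 *	vfs_smr_enter();
 *	ncp = (walk NCHHASH(hash) for a match);
 *	vp = atomic_load_ptr(&ncp->nc_vp);
 *	if (!cache_ncp_canuse(ncp)) {
 *		vfs_smr_exit();
 *		(fall back to the locked lookup)
 *	}
 */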
401
402 VFS_SMR_DECLARE;
403
404 static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
405 "Name cache parameters");
406
407 static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */
408 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RD, &ncsize, 0,
409 "Total namecache capacity");
410
411 u_int ncsizefactor = 2;
412 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
413 "Size factor for namecache");
414
415 static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */
416 SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
417 "Ratio of negative namecache entries");
418
419 /*
420 * Negative entry % of namecache capacity above which automatic eviction is allowed.
421 *
422 * Check cache_neg_evict_cond for details.
423 */
424 static u_int ncnegminpct = 3;
425
426 static u_int __read_mostly neg_min; /* the above recomputed against ncsize */
427 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
428 "Negative entry count above which automatic eviction is allowed");
429
430 /*
431 * Structures associated with name caching.
432 */
433 #define NCHHASH(hash) \
434 (&nchashtbl[(hash) & nchash])
435 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
436 static u_long __read_mostly nchash; /* size of hash table */
437 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
438 "Size of namecache hash table");
439 static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */
440 static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */
441
442 struct nchstats nchstats; /* cache effectiveness statistics */
443
444 static u_int __exclusive_cache_line neg_cycle;
445
446 #define ncneghash 3
447 #define numneglists (ncneghash + 1)
448
449 struct neglist {
450 struct mtx nl_evict_lock;
451 struct mtx nl_lock __aligned(CACHE_LINE_SIZE);
452 TAILQ_HEAD(, namecache) nl_list;
453 TAILQ_HEAD(, namecache) nl_hotlist;
454 u_long nl_hotnum;
455 } __aligned(CACHE_LINE_SIZE);
456
457 static struct neglist neglists[numneglists];
458
459 static inline struct neglist *
460 NCP2NEGLIST(struct namecache *ncp)
461 {
462
463 return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
464 }
465
466 static inline struct negstate *
467 NCP2NEGSTATE(struct namecache *ncp)
468 {
469
470 MPASS(atomic_load_char(&ncp->nc_flag) & NCF_NEGATIVE);
471 return (&ncp->nc_neg);
472 }
473
474 #define numbucketlocks (ncbuckethash + 1)
475 static u_int __read_mostly ncbuckethash;
476 static struct mtx_padalign __read_mostly *bucketlocks;
477 #define HASH2BUCKETLOCK(hash) \
478 ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))
479
480 #define numvnodelocks (ncvnodehash + 1)
481 static u_int __read_mostly ncvnodehash;
482 static struct mtx __read_mostly *vnodelocks;
483 static inline struct mtx *
484 VP2VNODELOCK(struct vnode *vp)
485 {
486
487 return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
488 }
489
490 static void
491 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
492 {
493 struct namecache_ts *ncp_ts;
494
495 KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
496 (tsp == NULL && ticksp == NULL),
497 ("No NCF_TS"));
498
499 if (tsp == NULL)
500 return;
501
502 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
503 *tsp = ncp_ts->nc_time;
504 *ticksp = ncp_ts->nc_ticks;
505 }
506
507 #ifdef DEBUG_CACHE
508 static int __read_mostly doingcache = 1; /* 1 => enable the cache */
509 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
510 "VFS namecache enabled");
511 #endif
512
513 /* Export size information to userland */
514 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
515 sizeof(struct namecache), "sizeof(struct namecache)");
516
517 /*
518 * The new name cache statistics
519 */
520 static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
521 "Name cache statistics");
522
523 #define STATNODE_ULONG(name, varname, descr) \
524 SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
525 #define STATNODE_COUNTER(name, varname, descr) \
526 static COUNTER_U64_DEFINE_EARLY(varname); \
527 SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
528 descr);
529 STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
530 STATNODE_ULONG(count, numcache, "Number of cache entries");
531 STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
532 STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
533 STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
534 STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
535 STATNODE_COUNTER(poszaps, numposzaps,
536 "Number of cache hits (positive) we do not want to cache");
537 STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
538 STATNODE_COUNTER(negzaps, numnegzaps,
539 "Number of cache hits (negative) we do not want to cache");
540 STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
541 /* These count for vn_getcwd(), too. */
542 STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
543 STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
544 "Number of fullpath search errors (VOP_VPTOCNP failures)");
545 STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
546 STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");
547 STATNODE_COUNTER(symlinktoobig, symlinktoobig, "Number of times symlink did not fit the cache");
548
549 /*
550 * Debug or developer statistics.
551 */
552 static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
553 "Name cache debugging");
554 #define DEBUGNODE_ULONG(name, varname, descr) \
555 SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
556 static u_long zap_bucket_relock_success;
557 DEBUGNODE_ULONG(zap_bucket_relock_success, zap_bucket_relock_success,
558 "Number of successful removals after relocking");
559 static u_long zap_bucket_fail;
560 DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
561 static u_long zap_bucket_fail2;
562 DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
563 static u_long cache_lock_vnodes_cel_3_failures;
564 DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
565 "Number of times 3-way vnode locking failed");
566
567 static void cache_zap_locked(struct namecache *ncp);
568 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
569 char **retbuf, size_t *buflen, size_t addend);
570 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
571 char **retbuf, size_t *buflen);
572 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
573 char **retbuf, size_t *len, size_t addend);
574
575 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
576
577 static inline void
578 cache_assert_vlp_locked(struct mtx *vlp)
579 {
580
581 if (vlp != NULL)
582 mtx_assert(vlp, MA_OWNED);
583 }
584
585 static inline void
586 cache_assert_vnode_locked(struct vnode *vp)
587 {
588 struct mtx *vlp;
589
590 vlp = VP2VNODELOCK(vp);
591 cache_assert_vlp_locked(vlp);
592 }
593
594 /*
595 * Directory vnodes with entries are held for two reasons:
596 * 1. make them less of a target for reclamation in vnlru
597 * 2. suffer a smaller performance penalty in locked lookup as requeueing is avoided
598 *
599 * It will be feasible to stop doing it altogether if all filesystems start
600 * supporting lockless lookup.
601 */
602 static void
603 cache_hold_vnode(struct vnode *vp)
604 {
605
606 cache_assert_vnode_locked(vp);
607 VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
608 vhold(vp);
609 counter_u64_add(numcachehv, 1);
610 }
611
612 static void
613 cache_drop_vnode(struct vnode *vp)
614 {
615
616 /*
617 * Called after all locks are dropped, meaning we can't assert
618 * on the state of v_cache_src.
619 */
620 vdrop(vp);
621 counter_u64_add(numcachehv, -1);
622 }
623
624 /*
625 * UMA zones.
626 */
627 static uma_zone_t __read_mostly cache_zone_small;
628 static uma_zone_t __read_mostly cache_zone_small_ts;
629 static uma_zone_t __read_mostly cache_zone_large;
630 static uma_zone_t __read_mostly cache_zone_large_ts;
631
632 char *
633 cache_symlink_alloc(size_t size, int flags)
634 {
635
636 if (size < CACHE_ZONE_SMALL_SIZE) {
637 return (uma_zalloc_smr(cache_zone_small, flags));
638 }
639 if (size < CACHE_ZONE_LARGE_SIZE) {
640 return (uma_zalloc_smr(cache_zone_large, flags));
641 }
642 counter_u64_add(symlinktoobig, 1);
643 SDT_PROBE1(vfs, namecache, symlink, alloc__fail, size);
644 return (NULL);
645 }
646
647 void
648 cache_symlink_free(char *string, size_t size)
649 {
650
651 MPASS(string != NULL);
652 KASSERT(size < CACHE_ZONE_LARGE_SIZE,
653 ("%s: size %zu too big", __func__, size));
654
655 if (size < CACHE_ZONE_SMALL_SIZE) {
656 uma_zfree_smr(cache_zone_small, string);
657 return;
658 }
659 if (size < CACHE_ZONE_LARGE_SIZE) {
660 uma_zfree_smr(cache_zone_large, string);
661 return;
662 }
663 __assert_unreachable();
664 }
665
666 static struct namecache *
667 cache_alloc_uma(int len, bool ts)
668 {
669 struct namecache_ts *ncp_ts;
670 struct namecache *ncp;
671
672 if (__predict_false(ts)) {
673 if (len <= CACHE_PATH_CUTOFF)
674 ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
675 else
676 ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
677 ncp = &ncp_ts->nc_nc;
678 } else {
679 if (len <= CACHE_PATH_CUTOFF)
680 ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
681 else
682 ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
683 }
684 return (ncp);
685 }
686
687 static void
688 cache_free_uma(struct namecache *ncp)
689 {
690 struct namecache_ts *ncp_ts;
691
692 if (__predict_false(ncp->nc_flag & NCF_TS)) {
693 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
694 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
695 uma_zfree_smr(cache_zone_small_ts, ncp_ts);
696 else
697 uma_zfree_smr(cache_zone_large_ts, ncp_ts);
698 } else {
699 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
700 uma_zfree_smr(cache_zone_small, ncp);
701 else
702 uma_zfree_smr(cache_zone_large, ncp);
703 }
704 }
705
706 static struct namecache *
707 cache_alloc(int len, bool ts)
708 {
709 u_long lnumcache;
710
711 /*
712 * Avoid blowout in namecache entries.
713 *
714 * Bugs:
715 * 1. filesystems may end up trying to add an already existing entry
716 * (for example this can happen after a cache miss during concurrent
717 * lookup), in which case we will call cache_neg_evict despite not
718 * adding anything.
719 * 2. the routine may fail to free anything and no provisions are made
720 * to make it try harder (see the inside for failure modes)
721 * 3. it only ever looks at negative entries.
722 */
723 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
724 if (cache_neg_evict_cond(lnumcache)) {
725 lnumcache = atomic_load_long(&numcache);
726 }
727 if (__predict_false(lnumcache >= ncsize)) {
728 atomic_subtract_long(&numcache, 1);
729 counter_u64_add(numdrops, 1);
730 return (NULL);
731 }
732 return (cache_alloc_uma(len, ts));
733 }
734
735 static void
736 cache_free(struct namecache *ncp)
737 {
738
739 MPASS(ncp != NULL);
740 if ((ncp->nc_flag & NCF_DVDROP) != 0) {
741 cache_drop_vnode(ncp->nc_dvp);
742 }
743 cache_free_uma(ncp);
744 atomic_subtract_long(&numcache, 1);
745 }
746
747 static void
748 cache_free_batch(struct cache_freebatch *batch)
749 {
750 struct namecache *ncp, *nnp;
751 int i;
752
753 i = 0;
754 if (TAILQ_EMPTY(batch))
755 goto out;
756 TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) {
757 if ((ncp->nc_flag & NCF_DVDROP) != 0) {
758 cache_drop_vnode(ncp->nc_dvp);
759 }
760 cache_free_uma(ncp);
761 i++;
762 }
763 atomic_subtract_long(&numcache, i);
764 out:
765 SDT_PROBE1(vfs, namecache, purge, batch, i);
766 }
767
768 /*
769 * Hashing.
770 *
771 * The code was made to use FNV in 2001 and this choice needs to be revisited.
772 *
773 * Short summary of the difficulty:
774 * The longest name which can be inserted is NAME_MAX characters in length (or
775 * 255 at the time of writing this comment), while the majority of names used in
776 * practice are significantly shorter (mostly below 10). More importantly, the
777 * majority of lookups performed find names even shorter than that.
778 *
779 * This poses a problem where hashes which do better than FNV past word size
780 * (or so) tend to come with additional overhead when finalizing the result,
781 * making them noticeably slower for the most commonly used range.
782 *
783 * Consider a path like: /usr/obj/usr/src/sys/amd64/GENERIC/vnode_if.c
784 *
785 * When looking it up the most time consuming part by a large margin (at least
786 * on amd64) is hashing. Replacing FNV with something which pessimizes short
787 * input would make the slowest part stand out even more.
788 */
789
790 /*
791 * TODO: With the value stored we can do better than computing the hash based
792 * on the address.
793 */
794 static void
795 cache_prehash(struct vnode *vp)
796 {
797
798 vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
799 }
800
801 static uint32_t
802 cache_get_hash(char *name, u_char len, struct vnode *dvp)
803 {
804
805 return (fnv_32_buf(name, len, dvp->v_nchash));
806 }
807
808 static uint32_t
809 cache_get_hash_iter_start(struct vnode *dvp)
810 {
811
812 return (dvp->v_nchash);
813 }
814
815 static uint32_t
816 cache_get_hash_iter(char c, uint32_t hash)
817 {
818
819 return (fnv_32_buf(&c, 1, hash));
820 }
821
822 static uint32_t
823 cache_get_hash_iter_finish(uint32_t hash)
824 {
825
826 return (hash);
827 }
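
/*
 * The iterator variants above compose with cache_get_hash: the FNV hash
 * consumes bytes sequentially with the running value as its only state, so
 * feeding a name one character at a time produces the same result as hashing
 * the whole buffer. Intended usage (sketch, illustration only):
 *
 *	hash = cache_get_hash_iter_start(dvp);
 *	for (i = 0; i < len; i++)
 *		hash = cache_get_hash_iter(name[i], hash);
 *	hash = cache_get_hash_iter_finish(hash);
 *	MPASS(hash == cache_get_hash(name, len, dvp));
 */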
828
829 static inline struct nchashhead *
830 NCP2BUCKET(struct namecache *ncp)
831 {
832 uint32_t hash;
833
834 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
835 return (NCHHASH(hash));
836 }
837
838 static inline struct mtx *
839 NCP2BUCKETLOCK(struct namecache *ncp)
840 {
841 uint32_t hash;
842
843 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
844 return (HASH2BUCKETLOCK(hash));
845 }
846
847 #ifdef INVARIANTS
848 static void
849 cache_assert_bucket_locked(struct namecache *ncp)
850 {
851 struct mtx *blp;
852
853 blp = NCP2BUCKETLOCK(ncp);
854 mtx_assert(blp, MA_OWNED);
855 }
856
857 static void
858 cache_assert_bucket_unlocked(struct namecache *ncp)
859 {
860 struct mtx *blp;
861
862 blp = NCP2BUCKETLOCK(ncp);
863 mtx_assert(blp, MA_NOTOWNED);
864 }
865 #else
866 #define cache_assert_bucket_locked(x) do { } while (0)
867 #define cache_assert_bucket_unlocked(x) do { } while (0)
868 #endif
869
870 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
871 static void
872 _cache_sort_vnodes(void **p1, void **p2)
873 {
874 void *tmp;
875
876 MPASS(*p1 != NULL || *p2 != NULL);
877
878 if (*p1 > *p2) {
879 tmp = *p2;
880 *p2 = *p1;
881 *p1 = tmp;
882 }
883 }
884
885 static void
886 cache_lock_all_buckets(void)
887 {
888 u_int i;
889
890 for (i = 0; i < numbucketlocks; i++)
891 mtx_lock(&bucketlocks[i]);
892 }
893
894 static void
895 cache_unlock_all_buckets(void)
896 {
897 u_int i;
898
899 for (i = 0; i < numbucketlocks; i++)
900 mtx_unlock(&bucketlocks[i]);
901 }
902
903 static void
904 cache_lock_all_vnodes(void)
905 {
906 u_int i;
907
908 for (i = 0; i < numvnodelocks; i++)
909 mtx_lock(&vnodelocks[i]);
910 }
911
912 static void
913 cache_unlock_all_vnodes(void)
914 {
915 u_int i;
916
917 for (i = 0; i < numvnodelocks; i++)
918 mtx_unlock(&vnodelocks[i]);
919 }
920
921 static int
922 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
923 {
924
925 cache_sort_vnodes(&vlp1, &vlp2);
926
927 if (vlp1 != NULL) {
928 if (!mtx_trylock(vlp1))
929 return (EAGAIN);
930 }
931 if (!mtx_trylock(vlp2)) {
932 if (vlp1 != NULL)
933 mtx_unlock(vlp1);
934 return (EAGAIN);
935 }
936
937 return (0);
938 }
939
940 static void
941 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
942 {
943
944 MPASS(vlp1 != NULL || vlp2 != NULL);
945 MPASS(vlp1 <= vlp2);
946
947 if (vlp1 != NULL)
948 mtx_lock(vlp1);
949 if (vlp2 != NULL)
950 mtx_lock(vlp2);
951 }
952
953 static void
954 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
955 {
956
957 MPASS(vlp1 != NULL || vlp2 != NULL);
958
959 if (vlp1 != NULL)
960 mtx_unlock(vlp1);
961 if (vlp2 != NULL)
962 mtx_unlock(vlp2);
963 }
964
965 static int
966 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
967 {
968 struct nchstats snap;
969
970 if (req->oldptr == NULL)
971 return (SYSCTL_OUT(req, 0, sizeof(snap)));
972
973 snap = nchstats;
974 snap.ncs_goodhits = counter_u64_fetch(numposhits);
975 snap.ncs_neghits = counter_u64_fetch(numneghits);
976 snap.ncs_badhits = counter_u64_fetch(numposzaps) +
977 counter_u64_fetch(numnegzaps);
978 snap.ncs_miss = counter_u64_fetch(nummisszap) +
979 counter_u64_fetch(nummiss);
980
981 return (SYSCTL_OUT(req, &snap, sizeof(snap)));
982 }
983 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
984 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
985 "VFS cache effectiveness statistics");
986
987 static int
988 sysctl_hitpct(SYSCTL_HANDLER_ARGS)
989 {
990 long poshits, neghits, miss, total;
991 long pct;
992
993 poshits = counter_u64_fetch(numposhits);
994 neghits = counter_u64_fetch(numneghits);
995 miss = counter_u64_fetch(nummiss);
996 total = poshits + neghits + miss;
997
998 pct = 0;
999 if (total != 0)
1000 pct = ((poshits + neghits) * 100) / total;
1001 return (sysctl_handle_int(oidp, 0, pct, req));
1002 }
1003 SYSCTL_PROC(_vfs_cache_stats, OID_AUTO, hitpct,
1004 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_hitpct,
1005 "I", "Percentage of hits");
1006
1007 static void
1008 cache_recalc_neg_min(void)
1009 {
1010
1011 neg_min = (ncsize * ncnegminpct) / 100;
1012 }
1013
1014 static int
1015 sysctl_negminpct(SYSCTL_HANDLER_ARGS)
1016 {
1017 u_int val;
1018 int error;
1019
1020 val = ncnegminpct;
1021 error = sysctl_handle_int(oidp, &val, 0, req);
1022 if (error != 0 || req->newptr == NULL)
1023 return (error);
1024
1025 if (val == ncnegminpct)
1026 return (0);
1027 if (val < 0 || val > 99)
1028 return (EINVAL);
1029 ncnegminpct = val;
1030 cache_recalc_neg_min();
1031 return (0);
1032 }
1033
1034 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
1035 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
1036 "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed");
1037
1038 #ifdef DEBUG_CACHE
1039 /*
1040 * Grab an atomic snapshot of the name cache hash chain lengths
1041 */
1042 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
1043 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
1044 "hash table stats");
1045
1046 static int
1047 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
1048 {
1049 struct nchashhead *ncpp;
1050 struct namecache *ncp;
1051 int i, error, n_nchash, *cntbuf;
1052
1053 retry:
1054 n_nchash = nchash + 1; /* nchash is max index, not count */
1055 if (req->oldptr == NULL)
1056 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
1057 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
1058 cache_lock_all_buckets();
1059 if (n_nchash != nchash + 1) {
1060 cache_unlock_all_buckets();
1061 free(cntbuf, M_TEMP);
1062 goto retry;
1063 }
1064 /* Scan hash tables counting entries */
1065 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
1066 CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
1067 cntbuf[i]++;
1068 cache_unlock_all_buckets();
1069 for (error = 0, i = 0; i < n_nchash; i++)
1070 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
1071 break;
1072 free(cntbuf, M_TEMP);
1073 return (error);
1074 }
1075 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
1076 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
1077 "nchash chain lengths");
1078
1079 static int
1080 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
1081 {
1082 int error;
1083 struct nchashhead *ncpp;
1084 struct namecache *ncp;
1085 int n_nchash;
1086 int count, maxlength, used, pct;
1087
1088 if (!req->oldptr)
1089 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
1090
1091 cache_lock_all_buckets();
1092 n_nchash = nchash + 1; /* nchash is max index, not count */
1093 used = 0;
1094 maxlength = 0;
1095
1096 /* Scan hash tables for applicable entries */
1097 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
1098 count = 0;
1099 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
1100 count++;
1101 }
1102 if (count)
1103 used++;
1104 if (maxlength < count)
1105 maxlength = count;
1106 }
1107 n_nchash = nchash + 1;
1108 cache_unlock_all_buckets();
1109 pct = (used * 100) / (n_nchash / 100);
1110 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
1111 if (error)
1112 return (error);
1113 error = SYSCTL_OUT(req, &used, sizeof(used));
1114 if (error)
1115 return (error);
1116 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
1117 if (error)
1118 return (error);
1119 error = SYSCTL_OUT(req, &pct, sizeof(pct));
1120 if (error)
1121 return (error);
1122 return (0);
1123 }
1124 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
1125 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
1126 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
1127 #endif
1128
1129 /*
1130 * Negative entries management
1131 *
1132 * Various workloads create plenty of negative entries and barely use them
1133 * afterwards. Moreover, malicious users can keep performing bogus lookups,
1134 * adding even more entries. For example "make tinderbox" as of writing this
1135 * comment ends up with 2.6M namecache entries in total, 1.2M of which are
1136 * negative.
1137 *
1138 * As such, a rather aggressive eviction method is needed. The currently
1139 * employed method is a placeholder.
1140 *
1141 * Entries are split over numneglists separate lists, each of which is further
1142 * split into hot and cold entries. Entries get promoted after getting a hit.
1143 * Eviction happens on addition of new entry.
1144 */
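
/*
 * The scheme can be observed via the counters below (exported under
 * vfs.cache.neg) and the SDT probes defined earlier in this file, e.g. to see
 * which names are getting evicted (illustration only):
 *
 *	dtrace -n 'vfs:namecache:evict_negative:done { @[stringof(arg1)] = count(); }'
 */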
1145 static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1146 "Name cache negative entry statistics");
1147
1148 SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
1149 "Number of negative cache entries");
1150
1151 static COUNTER_U64_DEFINE_EARLY(neg_created);
1152 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
1153 "Number of created negative entries");
1154
1155 static COUNTER_U64_DEFINE_EARLY(neg_evicted);
1156 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
1157 "Number of evicted negative entries");
1158
1159 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
1160 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
1161 &neg_evict_skipped_empty,
1162 "Number of times evicting failed due to lack of entries");
1163
1164 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
1165 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
1166 &neg_evict_skipped_missed,
1167 "Number of times evicting failed due to target entry disappearing");
1168
1169 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
1170 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
1171 &neg_evict_skipped_contended,
1172 "Number of times evicting failed due to contention");
1173
1174 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
1175 "Number of cache hits (negative)");
1176
1177 static int
1178 sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
1179 {
1180 int i, out;
1181
1182 out = 0;
1183 for (i = 0; i < numneglists; i++)
1184 out += neglists[i].nl_hotnum;
1185
1186 return (SYSCTL_OUT(req, &out, sizeof(out)));
1187 }
1188 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
1189 CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
1190 "Number of hot negative entries");
1191
1192 static void
1193 cache_neg_init(struct namecache *ncp)
1194 {
1195 struct negstate *ns;
1196
1197 ncp->nc_flag |= NCF_NEGATIVE;
1198 ns = NCP2NEGSTATE(ncp);
1199 ns->neg_flag = 0;
1200 ns->neg_hit = 0;
1201 counter_u64_add(neg_created, 1);
1202 }
1203
1204 #define CACHE_NEG_PROMOTION_THRESH 2
1205
1206 static bool
1207 cache_neg_hit_prep(struct namecache *ncp)
1208 {
1209 struct negstate *ns;
1210 u_char n;
1211
1212 ns = NCP2NEGSTATE(ncp);
1213 n = atomic_load_char(&ns->neg_hit);
1214 for (;;) {
1215 if (n >= CACHE_NEG_PROMOTION_THRESH)
1216 return (false);
1217 if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
1218 break;
1219 }
1220 return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
1221 }
1222
1223 /*
1224 * Nothing to do here but it is provided for completeness as some
1225 * cache_neg_hit_prep callers may end up returning without even
1226 * trying to promote.
1227 */
1228 #define cache_neg_hit_abort(ncp) do { } while (0)
1229
1230 static void
1231 cache_neg_hit_finish(struct namecache *ncp)
1232 {
1233
1234 SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
1235 counter_u64_add(numneghits, 1);
1236 }
1237
1238 /*
1239 * Move a negative entry to the hot list.
1240 */
1241 static void
1242 cache_neg_promote_locked(struct namecache *ncp)
1243 {
1244 struct neglist *nl;
1245 struct negstate *ns;
1246
1247 ns = NCP2NEGSTATE(ncp);
1248 nl = NCP2NEGLIST(ncp);
1249 mtx_assert(&nl->nl_lock, MA_OWNED);
1250 if ((ns->neg_flag & NEG_HOT) == 0) {
1251 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1252 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
1253 nl->nl_hotnum++;
1254 ns->neg_flag |= NEG_HOT;
1255 }
1256 }
1257
1258 /*
1259 * Move a hot negative entry to the cold list.
1260 */
1261 static void
1262 cache_neg_demote_locked(struct namecache *ncp)
1263 {
1264 struct neglist *nl;
1265 struct negstate *ns;
1266
1267 ns = NCP2NEGSTATE(ncp);
1268 nl = NCP2NEGLIST(ncp);
1269 mtx_assert(&nl->nl_lock, MA_OWNED);
1270 MPASS(ns->neg_flag & NEG_HOT);
1271 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1272 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1273 nl->nl_hotnum--;
1274 ns->neg_flag &= ~NEG_HOT;
1275 atomic_store_char(&ns->neg_hit, 0);
1276 }
1277
1278 /*
1279 * Move a negative entry to the hot list if it matches the lookup.
1280 *
1281 * We have to take locks, but they may be contended and in the worst
1282 * case we may need to go off CPU. We don't want to spin within the
1283 * smr section and we can't block with it. Exiting the section means
1284 * the found entry could have been evicted. We are going to look it
1285 * up again.
1286 */
1287 static bool
1288 cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
1289 struct namecache *oncp, uint32_t hash)
1290 {
1291 struct namecache *ncp;
1292 struct neglist *nl;
1293 u_char nc_flag;
1294
1295 nl = NCP2NEGLIST(oncp);
1296
1297 mtx_lock(&nl->nl_lock);
1298 /*
1299 * For hash iteration.
1300 */
1301 vfs_smr_enter();
1302
1303 /*
1304 * Avoid all surprises by only succeeding if we got the same entry and
1305 * bailing completely otherwise.
1306 * XXX There are no provisions to keep the vnode around, meaning we may
1307 * end up promoting a negative entry for a *new* vnode and returning
1308 * ENOENT on its account. This is the error we want to return anyway
1309 * and promotion is harmless.
1310 *
1311 * In particular at this point there can be a new ncp which matches the
1312 * search but hashes to a different neglist.
1313 */
1314 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1315 if (ncp == oncp)
1316 break;
1317 }
1318
1319 /*
1320 * No match to begin with.
1321 */
1322 if (__predict_false(ncp == NULL)) {
1323 goto out_abort;
1324 }
1325
1326 /*
1327 * The newly found entry may be something different...
1328 */
1329 if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1330 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
1331 goto out_abort;
1332 }
1333
1334 /*
1335 * ... and not even negative.
1336 */
1337 nc_flag = atomic_load_char(&ncp->nc_flag);
1338 if ((nc_flag & NCF_NEGATIVE) == 0) {
1339 goto out_abort;
1340 }
1341
1342 if (!cache_ncp_canuse(ncp)) {
1343 goto out_abort;
1344 }
1345
1346 cache_neg_promote_locked(ncp);
1347 cache_neg_hit_finish(ncp);
1348 vfs_smr_exit();
1349 mtx_unlock(&nl->nl_lock);
1350 return (true);
1351 out_abort:
1352 vfs_smr_exit();
1353 mtx_unlock(&nl->nl_lock);
1354 return (false);
1355 }
1356
1357 static void
1358 cache_neg_promote(struct namecache *ncp)
1359 {
1360 struct neglist *nl;
1361
1362 nl = NCP2NEGLIST(ncp);
1363 mtx_lock(&nl->nl_lock);
1364 cache_neg_promote_locked(ncp);
1365 mtx_unlock(&nl->nl_lock);
1366 }
1367
1368 static void
1369 cache_neg_insert(struct namecache *ncp)
1370 {
1371 struct neglist *nl;
1372
1373 MPASS(ncp->nc_flag & NCF_NEGATIVE);
1374 cache_assert_bucket_locked(ncp);
1375 nl = NCP2NEGLIST(ncp);
1376 mtx_lock(&nl->nl_lock);
1377 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1378 mtx_unlock(&nl->nl_lock);
1379 atomic_add_long(&numneg, 1);
1380 }
1381
1382 static void
1383 cache_neg_remove(struct namecache *ncp)
1384 {
1385 struct neglist *nl;
1386 struct negstate *ns;
1387
1388 cache_assert_bucket_locked(ncp);
1389 nl = NCP2NEGLIST(ncp);
1390 ns = NCP2NEGSTATE(ncp);
1391 mtx_lock(&nl->nl_lock);
1392 if ((ns->neg_flag & NEG_HOT) != 0) {
1393 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1394 nl->nl_hotnum--;
1395 } else {
1396 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1397 }
1398 mtx_unlock(&nl->nl_lock);
1399 atomic_subtract_long(&numneg, 1);
1400 }
1401
1402 static struct neglist *
1403 cache_neg_evict_select_list(void)
1404 {
1405 struct neglist *nl;
1406 u_int c;
1407
1408 c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
1409 nl = &neglists[c % numneglists];
1410 if (!mtx_trylock(&nl->nl_evict_lock)) {
1411 counter_u64_add(neg_evict_skipped_contended, 1);
1412 return (NULL);
1413 }
1414 return (nl);
1415 }
1416
1417 static struct namecache *
1418 cache_neg_evict_select_entry(struct neglist *nl)
1419 {
1420 struct namecache *ncp, *lncp;
1421 struct negstate *ns, *lns;
1422 int i;
1423
1424 mtx_assert(&nl->nl_evict_lock, MA_OWNED);
1425 mtx_assert(&nl->nl_lock, MA_OWNED);
1426 ncp = TAILQ_FIRST(&nl->nl_list);
1427 if (ncp == NULL)
1428 return (NULL);
1429 lncp = ncp;
1430 lns = NCP2NEGSTATE(lncp);
1431 for (i = 1; i < 4; i++) {
1432 ncp = TAILQ_NEXT(ncp, nc_dst);
1433 if (ncp == NULL)
1434 break;
1435 ns = NCP2NEGSTATE(ncp);
1436 if (ns->neg_hit < lns->neg_hit) {
1437 lncp = ncp;
1438 lns = ns;
1439 }
1440 }
1441 return (lncp);
1442 }
1443
1444 static bool
1445 cache_neg_evict(void)
1446 {
1447 struct namecache *ncp, *ncp2;
1448 struct neglist *nl;
1449 struct vnode *dvp;
1450 struct mtx *dvlp;
1451 struct mtx *blp;
1452 uint32_t hash;
1453 u_char nlen;
1454 bool evicted;
1455
1456 nl = cache_neg_evict_select_list();
1457 if (nl == NULL) {
1458 return (false);
1459 }
1460
1461 mtx_lock(&nl->nl_lock);
1462 ncp = TAILQ_FIRST(&nl->nl_hotlist);
1463 if (ncp != NULL) {
1464 cache_neg_demote_locked(ncp);
1465 }
1466 ncp = cache_neg_evict_select_entry(nl);
1467 if (ncp == NULL) {
1468 counter_u64_add(neg_evict_skipped_empty, 1);
1469 mtx_unlock(&nl->nl_lock);
1470 mtx_unlock(&nl->nl_evict_lock);
1471 return (false);
1472 }
1473 nlen = ncp->nc_nlen;
1474 dvp = ncp->nc_dvp;
1475 hash = cache_get_hash(ncp->nc_name, nlen, dvp);
1476 dvlp = VP2VNODELOCK(dvp);
1477 blp = HASH2BUCKETLOCK(hash);
1478 mtx_unlock(&nl->nl_lock);
1479 mtx_unlock(&nl->nl_evict_lock);
1480 mtx_lock(dvlp);
1481 mtx_lock(blp);
1482 /*
1483 * Note that since all locks were dropped above, the entry may be
1484 * gone or reallocated to be something else.
1485 */
1486 CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
1487 if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
1488 ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
1489 break;
1490 }
1491 if (ncp2 == NULL) {
1492 counter_u64_add(neg_evict_skipped_missed, 1);
1493 ncp = NULL;
1494 evicted = false;
1495 } else {
1496 MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
1497 MPASS(blp == NCP2BUCKETLOCK(ncp));
1498 SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
1499 ncp->nc_name);
1500 cache_zap_locked(ncp);
1501 counter_u64_add(neg_evicted, 1);
1502 evicted = true;
1503 }
1504 mtx_unlock(blp);
1505 mtx_unlock(dvlp);
1506 if (ncp != NULL)
1507 cache_free(ncp);
1508 return (evicted);
1509 }
1510
1511 /*
1512 * Maybe evict a negative entry to create more room.
1513 *
1514 * The ncnegfactor parameter limits what fraction of the total count
1515 * negative entries may comprise. However, if the cache is just
1516 * warming up this leads to excessive evictions. As such, ncnegminpct
1517 * (recomputed to neg_min) dictates whether the above should be
1518 * applied.
1519 *
1520 * Try evicting if the cache is close to full capacity regardless of
1521 * other considerations.
1522 */
1523 static bool
1524 cache_neg_evict_cond(u_long lnumcache)
1525 {
1526 u_long lnumneg;
1527
1528 if (ncsize - 1000 < lnumcache)
1529 goto out_evict;
1530 lnumneg = atomic_load_long(&numneg);
1531 if (lnumneg < neg_min)
1532 return (false);
1533 if (lnumneg * ncnegfactor < lnumcache)
1534 return (false);
1535 out_evict:
1536 return (cache_neg_evict());
1537 }
1538
1539 /*
1540 * cache_zap_locked():
1541 *
1542 * Removes a namecache entry from cache, whether it contains an actual
1543 * pointer to a vnode or is just a negative cache entry.
1544 */
1545 static void
1546 cache_zap_locked(struct namecache *ncp)
1547 {
1548 struct nchashhead *ncpp;
1549 struct vnode *dvp, *vp;
1550
1551 dvp = ncp->nc_dvp;
1552 vp = ncp->nc_vp;
1553
1554 if (!(ncp->nc_flag & NCF_NEGATIVE))
1555 cache_assert_vnode_locked(vp);
1556 cache_assert_vnode_locked(dvp);
1557 cache_assert_bucket_locked(ncp);
1558
1559 cache_ncp_invalidate(ncp);
1560
1561 ncpp = NCP2BUCKET(ncp);
1562 CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
1563 if (!(ncp->nc_flag & NCF_NEGATIVE)) {
1564 SDT_PROBE3(vfs, namecache, zap, done, dvp, ncp->nc_name, vp);
1565 TAILQ_REMOVE(&vp->v_cache_dst, ncp, nc_dst);
1566 if (ncp == vp->v_cache_dd) {
1567 atomic_store_ptr(&vp->v_cache_dd, NULL);
1568 }
1569 } else {
1570 SDT_PROBE2(vfs, namecache, zap_negative, done, dvp, ncp->nc_name);
1571 cache_neg_remove(ncp);
1572 }
1573 if (ncp->nc_flag & NCF_ISDOTDOT) {
1574 if (ncp == dvp->v_cache_dd) {
1575 atomic_store_ptr(&dvp->v_cache_dd, NULL);
1576 }
1577 } else {
1578 LIST_REMOVE(ncp, nc_src);
1579 if (LIST_EMPTY(&dvp->v_cache_src)) {
1580 ncp->nc_flag |= NCF_DVDROP;
1581 }
1582 }
1583 }
1584
1585 static void
1586 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1587 {
1588 struct mtx *blp;
1589
1590 MPASS(ncp->nc_dvp == vp);
1591 MPASS(ncp->nc_flag & NCF_NEGATIVE);
1592 cache_assert_vnode_locked(vp);
1593
1594 blp = NCP2BUCKETLOCK(ncp);
1595 mtx_lock(blp);
1596 cache_zap_locked(ncp);
1597 mtx_unlock(blp);
1598 }
1599
1600 static bool
1601 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
1602 struct mtx **vlpp)
1603 {
1604 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1605 struct mtx *blp;
1606
1607 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1608 cache_assert_vnode_locked(vp);
1609
1610 if (ncp->nc_flag & NCF_NEGATIVE) {
1611 if (*vlpp != NULL) {
1612 mtx_unlock(*vlpp);
1613 *vlpp = NULL;
1614 }
1615 cache_zap_negative_locked_vnode_kl(ncp, vp);
1616 return (true);
1617 }
1618
1619 pvlp = VP2VNODELOCK(vp);
1620 blp = NCP2BUCKETLOCK(ncp);
1621 vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1622 vlp2 = VP2VNODELOCK(ncp->nc_vp);
1623
1624 if (*vlpp == vlp1 || *vlpp == vlp2) {
1625 to_unlock = *vlpp;
1626 *vlpp = NULL;
1627 } else {
1628 if (*vlpp != NULL) {
1629 mtx_unlock(*vlpp);
1630 *vlpp = NULL;
1631 }
1632 cache_sort_vnodes(&vlp1, &vlp2);
1633 if (vlp1 == pvlp) {
1634 mtx_lock(vlp2);
1635 to_unlock = vlp2;
1636 } else {
1637 if (!mtx_trylock(vlp1))
1638 goto out_relock;
1639 to_unlock = vlp1;
1640 }
1641 }
1642 mtx_lock(blp);
1643 cache_zap_locked(ncp);
1644 mtx_unlock(blp);
1645 if (to_unlock != NULL)
1646 mtx_unlock(to_unlock);
1647 return (true);
1648
1649 out_relock:
1650 mtx_unlock(vlp2);
1651 mtx_lock(vlp1);
1652 mtx_lock(vlp2);
1653 MPASS(*vlpp == NULL);
1654 *vlpp = vlp1;
1655 return (false);
1656 }
1657
1658 /*
1659 * If trylocking failed we can get here. We know enough to take all needed locks
1660 * in the right order and re-lookup the entry.
1661 */
1662 static int
1663 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1664 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1665 struct mtx *blp)
1666 {
1667 struct namecache *rncp;
1668 struct mtx *rvlp;
1669
1670 cache_assert_bucket_unlocked(ncp);
1671
1672 cache_sort_vnodes(&dvlp, &vlp);
1673 cache_lock_vnodes(dvlp, vlp);
1674 mtx_lock(blp);
1675 CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1676 if (rncp == ncp && rncp->nc_dvp == dvp &&
1677 rncp->nc_nlen == cnp->cn_namelen &&
1678 !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1679 break;
1680 }
1681
1682 if (rncp == NULL)
1683 goto out_mismatch;
1684
1685 if (!(ncp->nc_flag & NCF_NEGATIVE))
1686 rvlp = VP2VNODELOCK(rncp->nc_vp);
1687 else
1688 rvlp = NULL;
1689 if (rvlp != vlp)
1690 goto out_mismatch;
1691
1692 cache_zap_locked(rncp);
1693 mtx_unlock(blp);
1694 cache_unlock_vnodes(dvlp, vlp);
1695 atomic_add_long(&zap_bucket_relock_success, 1);
1696 return (0);
1697
1698 out_mismatch:
1699 mtx_unlock(blp);
1700 cache_unlock_vnodes(dvlp, vlp);
1701 return (EAGAIN);
1702 }
1703
1704 static int __noinline
1705 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
1706 uint32_t hash, struct mtx *blp)
1707 {
1708 struct mtx *dvlp, *vlp;
1709 struct vnode *dvp;
1710
1711 cache_assert_bucket_locked(ncp);
1712
1713 dvlp = VP2VNODELOCK(ncp->nc_dvp);
1714 vlp = NULL;
1715 if (!(ncp->nc_flag & NCF_NEGATIVE))
1716 vlp = VP2VNODELOCK(ncp->nc_vp);
1717 if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1718 cache_zap_locked(ncp);
1719 mtx_unlock(blp);
1720 cache_unlock_vnodes(dvlp, vlp);
1721 return (0);
1722 }
1723
1724 dvp = ncp->nc_dvp;
1725 mtx_unlock(blp);
1726 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1727 }
1728
1729 static __noinline int
1730 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
1731 {
1732 struct namecache *ncp;
1733 struct mtx *blp;
1734 struct mtx *dvlp, *dvlp2;
1735 uint32_t hash;
1736 int error;
1737
1738 if (cnp->cn_namelen == 2 &&
1739 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1740 dvlp = VP2VNODELOCK(dvp);
1741 dvlp2 = NULL;
1742 mtx_lock(dvlp);
1743 retry_dotdot:
1744 ncp = dvp->v_cache_dd;
1745 if (ncp == NULL) {
1746 mtx_unlock(dvlp);
1747 if (dvlp2 != NULL)
1748 mtx_unlock(dvlp2);
1749 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1750 return (0);
1751 }
1752 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1753 if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
1754 goto retry_dotdot;
1755 MPASS(dvp->v_cache_dd == NULL);
1756 mtx_unlock(dvlp);
1757 if (dvlp2 != NULL)
1758 mtx_unlock(dvlp2);
1759 cache_free(ncp);
1760 } else {
1761 atomic_store_ptr(&dvp->v_cache_dd, NULL);
1762 mtx_unlock(dvlp);
1763 if (dvlp2 != NULL)
1764 mtx_unlock(dvlp2);
1765 }
1766 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1767 return (1);
1768 }
1769
1770 /*
1771 * XXX note that access here is completely unlocked with no provisions
1772 * to keep the hash allocated. If one is sufficiently unlucky a
1773 * parallel cache resize can reallocate the hash, unmap backing pages
1774 * and cause the empty check below to fault.
1775 *
1776 * Fixing this has epsilon priority, but can be done with no overhead
1777 * for this codepath with sufficient effort.
1778 */
1779 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1780 blp = HASH2BUCKETLOCK(hash);
1781 retry:
1782 if (CK_SLIST_EMPTY(NCHHASH(hash)))
1783 goto out_no_entry;
1784
1785 mtx_lock(blp);
1786
1787 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1788 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1789 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1790 break;
1791 }
1792
1793 if (ncp == NULL) {
1794 mtx_unlock(blp);
1795 goto out_no_entry;
1796 }
1797
1798 error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1799 if (__predict_false(error != 0)) {
1800 atomic_add_long(&zap_bucket_fail, 1);
1801 goto retry;
1802 }
1803 counter_u64_add(numposzaps, 1);
1804 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1805 cache_free(ncp);
1806 return (1);
1807 out_no_entry:
1808 counter_u64_add(nummisszap, 1);
1809 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1810 return (0);
1811 }
1812
1813 static int __noinline
1814 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1815 struct timespec *tsp, int *ticksp)
1816 {
1817 int ltype;
1818
1819 *vpp = dvp;
1820 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1821 if (tsp != NULL)
1822 timespecclear(tsp);
1823 if (ticksp != NULL)
1824 *ticksp = ticks;
1825 vrefact(*vpp);
1826 /*
1827 * When we lookup "." we can still be asked to lock it
1828 * differently...
1829 */
1830 ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1831 if (ltype != VOP_ISLOCKED(*vpp)) {
1832 if (ltype == LK_EXCLUSIVE) {
1833 vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1834 if (VN_IS_DOOMED((*vpp))) {
1835 /* forced unmount */
1836 vrele(*vpp);
1837 *vpp = NULL;
1838 return (ENOENT);
1839 }
1840 } else
1841 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1842 }
1843 return (-1);
1844 }
1845
1846 static int __noinline
1847 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1848 struct timespec *tsp, int *ticksp)
1849 {
1850 struct namecache_ts *ncp_ts;
1851 struct namecache *ncp;
1852 struct mtx *dvlp;
1853 enum vgetstate vs;
1854 int error, ltype;
1855 bool whiteout;
1856
1857 MPASS((cnp->cn_flags & ISDOTDOT) != 0);
1858
1859 if ((cnp->cn_flags & MAKEENTRY) == 0) {
1860 cache_remove_cnp(dvp, cnp);
1861 return (0);
1862 }
1863
1864 retry:
1865 dvlp = VP2VNODELOCK(dvp);
1866 mtx_lock(dvlp);
1867 ncp = dvp->v_cache_dd;
1868 if (ncp == NULL) {
1869 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, "..");
1870 mtx_unlock(dvlp);
1871 return (0);
1872 }
1873 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1874 if (ncp->nc_flag & NCF_NEGATIVE)
1875 *vpp = NULL;
1876 else
1877 *vpp = ncp->nc_vp;
1878 } else
1879 *vpp = ncp->nc_dvp;
1880 if (*vpp == NULL)
1881 goto negative_success;
1882 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
1883 cache_out_ts(ncp, tsp, ticksp);
1884 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1885 NCF_DTS && tsp != NULL) {
1886 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1887 *tsp = ncp_ts->nc_dotdottime;
1888 }
1889
1890 MPASS(dvp != *vpp);
1891 ltype = VOP_ISLOCKED(dvp);
1892 VOP_UNLOCK(dvp);
1893 vs = vget_prep(*vpp);
1894 mtx_unlock(dvlp);
1895 error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1896 vn_lock(dvp, ltype | LK_RETRY);
1897 if (VN_IS_DOOMED(dvp)) {
1898 if (error == 0)
1899 vput(*vpp);
1900 *vpp = NULL;
1901 return (ENOENT);
1902 }
1903 if (error) {
1904 *vpp = NULL;
1905 goto retry;
1906 }
1907 return (-1);
1908 negative_success:
1909 if (__predict_false(cnp->cn_nameiop == CREATE)) {
1910 if (cnp->cn_flags & ISLASTCN) {
1911 counter_u64_add(numnegzaps, 1);
1912 cache_zap_negative_locked_vnode_kl(ncp, dvp);
1913 mtx_unlock(dvlp);
1914 cache_free(ncp);
1915 return (0);
1916 }
1917 }
1918
1919 whiteout = (ncp->nc_flag & NCF_WHITE);
1920 cache_out_ts(ncp, tsp, ticksp);
1921 if (cache_neg_hit_prep(ncp))
1922 cache_neg_promote(ncp);
1923 else
1924 cache_neg_hit_finish(ncp);
1925 mtx_unlock(dvlp);
1926 if (whiteout)
1927 cnp->cn_flags |= ISWHITEOUT;
1928 return (ENOENT);
1929 }
1930
1931 /**
1932 * Lookup a name in the name cache
1933 *
1934 * # Arguments
1935 *
1936 * - dvp: Parent directory in which to search.
1937 * - vpp: Return argument. Will contain desired vnode on cache hit.
1938 * - cnp: Parameters of the name search. The most interesting bits of
1939 * the cn_flags field have the following meanings:
1940 * - MAKEENTRY: If clear, free an entry from the cache rather than look
1941 * it up.
1942 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".."
1943 * - tsp: Return storage for cache timestamp. On a successful (positive
1944 * or negative) lookup, tsp will be filled with any timespec that
1945 * was stored when this cache entry was created. However, it will
1946 * be zeroed for "." entries.
1947 * - ticksp: Return storage for alternate cache timestamp. On a successful
1948 * (positive or negative) lookup, it will contain the ticks value
1949 * that was current when the cache entry was created, unless cnp
1950 * was ".".
1951 *
1952 * Either both tsp and ticksp have to be provided or neither of them.
1953 *
1954 * # Returns
1955 *
1956 * - -1: A positive cache hit. vpp will contain the desired vnode.
1957 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due
1958 * to a forced unmount. vpp will not be modified. If the entry
1959 * is a whiteout, then the ISWHITEOUT flag will be set in
1960 * cnp->cn_flags.
1961 * - 0: A cache miss. vpp will not be modified.
1962 *
1963 * # Locking
1964 *
1965 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up
1966 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the
1967 * lock is not recursively acquired.
1968 */
1969 static int __noinline
1970 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1971 struct timespec *tsp, int *ticksp)
1972 {
1973 struct namecache *ncp;
1974 struct mtx *blp;
1975 uint32_t hash;
1976 enum vgetstate vs;
1977 int error;
1978 bool whiteout;
1979
1980 MPASS((cnp->cn_flags & ISDOTDOT) == 0);
1981 MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0);
1982
1983 retry:
1984 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1985 blp = HASH2BUCKETLOCK(hash);
1986 mtx_lock(blp);
1987
1988 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1989 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1990 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1991 break;
1992 }
1993
1994 if (__predict_false(ncp == NULL)) {
1995 mtx_unlock(blp);
1996 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr);
1997 counter_u64_add(nummiss, 1);
1998 return (0);
1999 }
2000
2001 if (ncp->nc_flag & NCF_NEGATIVE)
2002 goto negative_success;
2003
2004 counter_u64_add(numposhits, 1);
2005 *vpp = ncp->nc_vp;
2006 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
2007 cache_out_ts(ncp, tsp, ticksp);
2008 MPASS(dvp != *vpp);
2009 vs = vget_prep(*vpp);
2010 mtx_unlock(blp);
2011 error = vget_finish(*vpp, cnp->cn_lkflags, vs);
2012 if (error) {
2013 *vpp = NULL;
2014 goto retry;
2015 }
2016 return (-1);
2017 negative_success:
2018 /*
2019 * We don't get here with regular lookup apart from corner cases.
2020 */
2021 if (__predict_true(cnp->cn_nameiop == CREATE)) {
2022 if (cnp->cn_flags & ISLASTCN) {
2023 counter_u64_add(numnegzaps, 1);
2024 error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
2025 if (__predict_false(error != 0)) {
2026 atomic_add_long(&zap_bucket_fail2, 1);
2027 goto retry;
2028 }
2029 cache_free(ncp);
2030 return (0);
2031 }
2032 }
2033
2034 whiteout = (ncp->nc_flag & NCF_WHITE);
2035 cache_out_ts(ncp, tsp, ticksp);
2036 if (cache_neg_hit_prep(ncp))
2037 cache_neg_promote(ncp);
2038 else
2039 cache_neg_hit_finish(ncp);
2040 mtx_unlock(blp);
2041 if (whiteout)
2042 cnp->cn_flags |= ISWHITEOUT;
2043 return (ENOENT);
2044 }
2045
2046 int
2047 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
2048 struct timespec *tsp, int *ticksp)
2049 {
2050 struct namecache *ncp;
2051 uint32_t hash;
2052 enum vgetstate vs;
2053 int error;
2054 bool whiteout, neg_promote;
2055 u_short nc_flag;
2056
2057 MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));
2058
2059 #ifdef DEBUG_CACHE
2060 if (__predict_false(!doingcache)) {
2061 cnp->cn_flags &= ~MAKEENTRY;
2062 return (0);
2063 }
2064 #endif
2065
2066 if (__predict_false(cnp->cn_nameptr[0] == '.')) {
2067 if (cnp->cn_namelen == 1)
2068 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
2069 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
2070 return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
2071 }
2072
2073 MPASS((cnp->cn_flags & ISDOTDOT) == 0);
2074
2075 if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) {
2076 cache_remove_cnp(dvp, cnp);
2077 return (0);
2078 }
2079
2080 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
2081 vfs_smr_enter();
2082
2083 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
2084 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
2085 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
2086 break;
2087 }
2088
2089 if (__predict_false(ncp == NULL)) {
2090 vfs_smr_exit();
2091 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr);
2092 counter_u64_add(nummiss, 1);
2093 return (0);
2094 }
2095
2096 nc_flag = atomic_load_char(&ncp->nc_flag);
2097 if (nc_flag & NCF_NEGATIVE)
2098 goto negative_success;
2099
2100 counter_u64_add(numposhits, 1);
2101 *vpp = ncp->nc_vp;
2102 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
2103 cache_out_ts(ncp, tsp, ticksp);
2104 MPASS(dvp != *vpp);
2105 if (!cache_ncp_canuse(ncp)) {
2106 vfs_smr_exit();
2107 *vpp = NULL;
2108 goto out_fallback;
2109 }
2110 vs = vget_prep_smr(*vpp);
2111 vfs_smr_exit();
2112 if (__predict_false(vs == VGET_NONE)) {
2113 *vpp = NULL;
2114 goto out_fallback;
2115 }
2116 error = vget_finish(*vpp, cnp->cn_lkflags, vs);
2117 if (error) {
2118 *vpp = NULL;
2119 goto out_fallback;
2120 }
2121 return (-1);
2122 negative_success:
2123 if (cnp->cn_nameiop == CREATE) {
2124 if (cnp->cn_flags & ISLASTCN) {
2125 vfs_smr_exit();
2126 goto out_fallback;
2127 }
2128 }
2129
2130 cache_out_ts(ncp, tsp, ticksp);
2131 whiteout = (atomic_load_char(&ncp->nc_flag) & NCF_WHITE);
2132 neg_promote = cache_neg_hit_prep(ncp);
2133 if (!cache_ncp_canuse(ncp)) {
2134 cache_neg_hit_abort(ncp);
2135 vfs_smr_exit();
2136 goto out_fallback;
2137 }
2138 if (neg_promote) {
2139 vfs_smr_exit();
2140 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
2141 goto out_fallback;
2142 } else {
2143 cache_neg_hit_finish(ncp);
2144 vfs_smr_exit();
2145 }
2146 if (whiteout)
2147 cnp->cn_flags |= ISWHITEOUT;
2148 return (ENOENT);
2149 out_fallback:
2150 return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
2151 }
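
/*
 * Illustrative sketch (not compiled): how a caller is expected to consume the
 * cache_lookup() return values documented above. The canonical in-tree
 * consumer is vfs_cache_lookup() later in this file; the helper name below is
 * hypothetical.
 */
#if 0
static int
example_lookup_with_cache(struct vnode *dvp, struct vnode **vpp,
    struct componentname *cnp)
{
	int error;

	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
	if (error == 0)		/* miss: ask the filesystem */
		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
	if (error == -1)	/* positive hit: *vpp is referenced and locked */
		return (0);
	return (error);		/* ENOENT: negative hit or doomed dvp */
}
#endif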
2152
2153 struct celockstate {
2154 struct mtx *vlp[3];
2155 struct mtx *blp[2];
2156 };
2157 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
2158 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
2159
2160 static inline void
2161 cache_celockstate_init(struct celockstate *cel)
2162 {
2163
2164 bzero(cel, sizeof(*cel));
2165 }
2166
2167 static void
2168 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
2169 struct vnode *dvp)
2170 {
2171 struct mtx *vlp1, *vlp2;
2172
2173 MPASS(cel->vlp[0] == NULL);
2174 MPASS(cel->vlp[1] == NULL);
2175 MPASS(cel->vlp[2] == NULL);
2176
2177 MPASS(vp != NULL || dvp != NULL);
2178
2179 vlp1 = VP2VNODELOCK(vp);
2180 vlp2 = VP2VNODELOCK(dvp);
2181 cache_sort_vnodes(&vlp1, &vlp2);
2182
2183 if (vlp1 != NULL) {
2184 mtx_lock(vlp1);
2185 cel->vlp[0] = vlp1;
2186 }
2187 mtx_lock(vlp2);
2188 cel->vlp[1] = vlp2;
2189 }
2190
2191 static void
2192 cache_unlock_vnodes_cel(struct celockstate *cel)
2193 {
2194
2195 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
2196
2197 if (cel->vlp[0] != NULL)
2198 mtx_unlock(cel->vlp[0]);
2199 if (cel->vlp[1] != NULL)
2200 mtx_unlock(cel->vlp[1]);
2201 if (cel->vlp[2] != NULL)
2202 mtx_unlock(cel->vlp[2]);
2203 }
2204
2205 static bool
2206 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
2207 {
2208 struct mtx *vlp;
2209 bool ret;
2210
2211 cache_assert_vlp_locked(cel->vlp[0]);
2212 cache_assert_vlp_locked(cel->vlp[1]);
2213 MPASS(cel->vlp[2] == NULL);
2214
2215 MPASS(vp != NULL);
2216 vlp = VP2VNODELOCK(vp);
2217
2218 ret = true;
2219 if (vlp >= cel->vlp[1]) {
2220 mtx_lock(vlp);
2221 } else {
2222 if (mtx_trylock(vlp))
2223 goto out;
2224 cache_unlock_vnodes_cel(cel);
2225 atomic_add_long(&cache_lock_vnodes_cel_3_failures, 1);
2226 if (vlp < cel->vlp[0]) {
2227 mtx_lock(vlp);
2228 mtx_lock(cel->vlp[0]);
2229 mtx_lock(cel->vlp[1]);
2230 } else {
2231 if (cel->vlp[0] != NULL)
2232 mtx_lock(cel->vlp[0]);
2233 mtx_lock(vlp);
2234 mtx_lock(cel->vlp[1]);
2235 }
2236 ret = false;
2237 }
2238 out:
2239 cel->vlp[2] = vlp;
2240 return (ret);
2241 }
2242
2243 static void
2244 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
2245 struct mtx *blp2)
2246 {
2247
2248 MPASS(cel->blp[0] == NULL);
2249 MPASS(cel->blp[1] == NULL);
2250
2251 cache_sort_vnodes(&blp1, &blp2);
2252
2253 if (blp1 != NULL) {
2254 mtx_lock(blp1);
2255 cel->blp[0] = blp1;
2256 }
2257 mtx_lock(blp2);
2258 cel->blp[1] = blp2;
2259 }
2260
2261 static void
2262 cache_unlock_buckets_cel(struct celockstate *cel)
2263 {
2264
2265 if (cel->blp[0] != NULL)
2266 mtx_unlock(cel->blp[0]);
2267 mtx_unlock(cel->blp[1]);
2268 }
2269
2270 /*
2271 * Lock part of the cache affected by the insertion.
2272 *
2273 * This means vnodelocks for dvp, vp and the relevant bucketlock.
2274 * However, insertion can result in removal of an old entry. In this
2275 * case we have an additional vnode and bucketlock pair to lock.
2276 *
2277 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
2278 * preserving the locking order (smaller address first).
2279 */
2280 static void
2281 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2282 uint32_t hash)
2283 {
2284 struct namecache *ncp;
2285 struct mtx *blps[2];
2286 u_char nc_flag;
2287
2288 blps[0] = HASH2BUCKETLOCK(hash);
2289 for (;;) {
2290 blps[1] = NULL;
2291 cache_lock_vnodes_cel(cel, dvp, vp);
2292 if (vp == NULL || vp->v_type != VDIR)
2293 break;
2294 ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
2295 if (ncp == NULL)
2296 break;
2297 nc_flag = atomic_load_char(&ncp->nc_flag);
2298 if ((nc_flag & NCF_ISDOTDOT) == 0)
2299 break;
2300 MPASS(ncp->nc_dvp == vp);
2301 blps[1] = NCP2BUCKETLOCK(ncp);
2302 if ((nc_flag & NCF_NEGATIVE) != 0)
2303 break;
2304 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2305 break;
2306 /*
2307 * All vnodes got re-locked. Re-validate the state and if
2308 * nothing changed we are done. Otherwise restart.
2309 */
2310 if (ncp == vp->v_cache_dd &&
2311 (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2312 blps[1] == NCP2BUCKETLOCK(ncp) &&
2313 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2314 break;
2315 cache_unlock_vnodes_cel(cel);
2316 cel->vlp[0] = NULL;
2317 cel->vlp[1] = NULL;
2318 cel->vlp[2] = NULL;
2319 }
2320 cache_lock_buckets_cel(cel, blps[0], blps[1]);
2321 }
2322
2323 static void
2324 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2325 uint32_t hash)
2326 {
2327 struct namecache *ncp;
2328 struct mtx *blps[2];
2329 u_char nc_flag;
2330
2331 blps[0] = HASH2BUCKETLOCK(hash);
2332 for (;;) {
2333 blps[1] = NULL;
2334 cache_lock_vnodes_cel(cel, dvp, vp);
2335 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
2336 if (ncp == NULL)
2337 break;
2338 nc_flag = atomic_load_char(&ncp->nc_flag);
2339 if ((nc_flag & NCF_ISDOTDOT) == 0)
2340 break;
2341 MPASS(ncp->nc_dvp == dvp);
2342 blps[1] = NCP2BUCKETLOCK(ncp);
2343 if ((nc_flag & NCF_NEGATIVE) != 0)
2344 break;
2345 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2346 break;
2347 if (ncp == dvp->v_cache_dd &&
2348 (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2349 blps[1] == NCP2BUCKETLOCK(ncp) &&
2350 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2351 break;
2352 cache_unlock_vnodes_cel(cel);
2353 cel->vlp[0] = NULL;
2354 cel->vlp[1] = NULL;
2355 cel->vlp[2] = NULL;
2356 }
2357 cache_lock_buckets_cel(cel, blps[0], blps[1]);
2358 }
2359
2360 static void
2361 cache_enter_unlock(struct celockstate *cel)
2362 {
2363
2364 cache_unlock_buckets_cel(cel);
2365 cache_unlock_vnodes_cel(cel);
2366 }
2367
2368 static void __noinline
2369 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
2370 struct componentname *cnp)
2371 {
2372 struct celockstate cel;
2373 struct namecache *ncp;
2374 uint32_t hash;
2375 int len;
2376
2377 if (atomic_load_ptr(&dvp->v_cache_dd) == NULL)
2378 return;
2379 len = cnp->cn_namelen;
2380 cache_celockstate_init(&cel);
2381 hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2382 cache_enter_lock_dd(&cel, dvp, vp, hash);
2383 ncp = dvp->v_cache_dd;
2384 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
2385 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
2386 cache_zap_locked(ncp);
2387 } else {
2388 ncp = NULL;
2389 }
2390 atomic_store_ptr(&dvp->v_cache_dd, NULL);
2391 cache_enter_unlock(&cel);
2392 if (ncp != NULL)
2393 cache_free(ncp);
2394 }
2395
2396 /*
2397 * Add an entry to the cache.
2398 */
2399 void
2400 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
2401 struct timespec *tsp, struct timespec *dtsp)
2402 {
2403 struct celockstate cel;
2404 struct namecache *ncp, *n2, *ndd;
2405 struct namecache_ts *ncp_ts;
2406 struct nchashhead *ncpp;
2407 uint32_t hash;
2408 int flag;
2409 int len;
2410
2411 KASSERT(cnp->cn_namelen <= NAME_MAX,
2412 ("%s: passed len %ld exceeds NAME_MAX (%d)", __func__, cnp->cn_namelen,
2413 NAME_MAX));
2414 VNPASS(!VN_IS_DOOMED(dvp), dvp);
2415 VNPASS(dvp->v_type != VNON, dvp);
2416 if (vp != NULL) {
2417 VNPASS(!VN_IS_DOOMED(vp), vp);
2418 VNPASS(vp->v_type != VNON, vp);
2419 }
2420 if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') {
2421 KASSERT(dvp == vp,
2422 ("%s: different vnodes for dot entry (%p; %p)\n", __func__,
2423 dvp, vp));
2424 } else {
2425 KASSERT(dvp != vp,
2426 ("%s: same vnode for non-dot entry [%s] (%p)\n", __func__,
2427 cnp->cn_nameptr, dvp));
2428 }
2429
2430 #ifdef DEBUG_CACHE
2431 if (__predict_false(!doingcache))
2432 return;
2433 #endif
2434
2435 flag = 0;
2436 if (__predict_false(cnp->cn_nameptr[0] == '.')) {
2437 if (cnp->cn_namelen == 1)
2438 return;
2439 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
2440 cache_enter_dotdot_prep(dvp, vp, cnp);
2441 flag = NCF_ISDOTDOT;
2442 }
2443 }
2444
2445 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
2446 if (ncp == NULL)
2447 return;
2448
2449 cache_celockstate_init(&cel);
2450 ndd = NULL;
2451 ncp_ts = NULL;
2452
2453 /*
2454 * Calculate the hash key and setup as much of the new
2455 * namecache entry as possible before acquiring the lock.
2456 */
2457 ncp->nc_flag = flag | NCF_WIP;
2458 ncp->nc_vp = vp;
2459 if (vp == NULL)
2460 cache_neg_init(ncp);
2461 ncp->nc_dvp = dvp;
2462 if (tsp != NULL) {
2463 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
2464 ncp_ts->nc_time = *tsp;
2465 ncp_ts->nc_ticks = ticks;
2466 ncp_ts->nc_nc.nc_flag |= NCF_TS;
2467 if (dtsp != NULL) {
2468 ncp_ts->nc_dotdottime = *dtsp;
2469 ncp_ts->nc_nc.nc_flag |= NCF_DTS;
2470 }
2471 }
2472 len = ncp->nc_nlen = cnp->cn_namelen;
2473 hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2474 memcpy(ncp->nc_name, cnp->cn_nameptr, len);
2475 ncp->nc_name[len] = '\0';
2476 cache_enter_lock(&cel, dvp, vp, hash);
2477
2478 /*
2479 * See if this vnode or negative entry is already in the cache
2480 * with this name. This can happen with concurrent lookups of
2481 * the same path name.
2482 */
2483 ncpp = NCHHASH(hash);
2484 CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
2485 if (n2->nc_dvp == dvp &&
2486 n2->nc_nlen == cnp->cn_namelen &&
2487 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
2488 MPASS(cache_ncp_canuse(n2));
2489 if ((n2->nc_flag & NCF_NEGATIVE) != 0)
2490 KASSERT(vp == NULL,
2491 ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]",
2492 __func__, NULL, vp, cnp->cn_nameptr));
2493 else
2494 KASSERT(n2->nc_vp == vp,
2495 ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]",
2496 __func__, n2->nc_vp, vp, cnp->cn_nameptr));
2497 /*
2498 * Entries are supposed to be immutable unless in the
2499 * process of getting destroyed. Accommodating
2500 * changing timestamps is possible but not worth it.
2501 * This should be harmless in terms of correctness, in
2502 * the worst case resulting in an earlier expiration.
2503 * Alternatively, the found entry can be replaced
2504 * altogether.
2505 */
2506 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS)));
2507 #if 0
2508 if (tsp != NULL) {
2509 KASSERT((n2->nc_flag & NCF_TS) != 0,
2510 ("no NCF_TS"));
2511 n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
2512 n2_ts->nc_time = ncp_ts->nc_time;
2513 n2_ts->nc_ticks = ncp_ts->nc_ticks;
2514 if (dtsp != NULL) {
2515 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
2516 n2_ts->nc_nc.nc_flag |= NCF_DTS;
2517 }
2518 }
2519 #endif
2520 SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name,
2521 vp);
2522 goto out_unlock_free;
2523 }
2524 }
2525
2526 if (flag == NCF_ISDOTDOT) {
2527 /*
2528 * See if we are trying to add a ".." entry, but some other lookup
2529 * has already populated the v_cache_dd pointer.
2530 */
2531 if (dvp->v_cache_dd != NULL)
2532 goto out_unlock_free;
2533 KASSERT(vp == NULL || vp->v_type == VDIR,
2534 ("wrong vnode type %p", vp));
2535 atomic_thread_fence_rel();
2536 atomic_store_ptr(&dvp->v_cache_dd, ncp);
2537 }
2538
2539 if (vp != NULL) {
2540 if (flag != NCF_ISDOTDOT) {
2541 /*
2542 * For this case, the cache entry maps both the
2543 * directory name in it and the name ".." for the
2544 * directory's parent.
2545 */
2546 if ((ndd = vp->v_cache_dd) != NULL) {
2547 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
2548 cache_zap_locked(ndd);
2549 else
2550 ndd = NULL;
2551 }
2552 atomic_thread_fence_rel();
2553 atomic_store_ptr(&vp->v_cache_dd, ncp);
2554 } else if (vp->v_type != VDIR) {
2555 if (vp->v_cache_dd != NULL) {
2556 atomic_store_ptr(&vp->v_cache_dd, NULL);
2557 }
2558 }
2559 }
2560
2561 if (flag != NCF_ISDOTDOT) {
2562 if (LIST_EMPTY(&dvp->v_cache_src)) {
2563 cache_hold_vnode(dvp);
2564 }
2565 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
2566 }
2567
2568 /*
2569 * If the entry is "negative", we place it into the
2570 * "negative" cache queue, otherwise, we place it into the
2571 * destination vnode's cache entries queue.
2572 */
2573 if (vp != NULL) {
2574 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
2575 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
2576 vp);
2577 } else {
2578 if (cnp->cn_flags & ISWHITEOUT)
2579 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_WHITE);
2580 cache_neg_insert(ncp);
2581 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
2582 ncp->nc_name);
2583 }
2584
2585 /*
2586 * Insert the new namecache entry into the appropriate chain
2587 * within the cache entries table.
2588 */
2589 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);
2590
2591 atomic_thread_fence_rel();
2592 /*
2593 * Mark the entry as fully constructed.
2594 * It is immutable past this point until its removal.
2595 */
2596 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
2597
2598 cache_enter_unlock(&cel);
2599 if (ndd != NULL)
2600 cache_free(ndd);
2601 return;
2602 out_unlock_free:
2603 cache_enter_unlock(&cel);
2604 cache_free(ncp);
2605 return;
2606 }
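
/*
 * Illustrative sketch (not compiled): filesystems typically add entries after
 * resolving a name in their VOP_CACHEDLOOKUP implementation, honouring the
 * MAKEENTRY request from the caller. The helper name below is hypothetical;
 * cache_enter() expands to cache_enter_time() with NULL timestamps.
 */
#if 0
static void
example_fs_lookup_post(struct vnode *dvp, struct vnode *vp,
    struct componentname *cnp)
{

	/* Pass vp == NULL to record a negative entry instead. */
	if ((cnp->cn_flags & MAKEENTRY) != 0)
		cache_enter(dvp, vp, cnp);
}
#endif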
2607
2608 /*
2609 * A variant of the above accepting flags.
2610 *
2611 * - VFS_CACHE_DROPOLD -- if a conflicting entry is found, drop it.
2612 *
2613 * TODO: this routine is a hack. It blindly removes the old entry, even if it
2614 * happens to match, and it does so in an inefficient manner. It was added
2615 * to accommodate NFS, which runs into a case where the target for a given name
2616 * may change from under it. Note this does nothing to solve the following
2617 * race: 2 callers of cache_enter_time_flags pass a different target vnode for
2618 * the same [dvp, cnp]. It may be argued that code doing this is broken.
2619 */
2620 void
2621 cache_enter_time_flags(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
2622 struct timespec *tsp, struct timespec *dtsp, int flags)
2623 {
2624
2625 MPASS((flags & ~(VFS_CACHE_DROPOLD)) == 0);
2626
2627 if (flags & VFS_CACHE_DROPOLD)
2628 cache_remove_cnp(dvp, cnp);
2629 cache_enter_time(dvp, vp, cnp, tsp, dtsp);
2630 }
2631
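/*
 * Return the smallest power of two strictly greater than the argument, e.g.
 * cache_roundup_2(1) == 2, cache_roundup_2(8) == 16 and
 * cache_roundup_2(1000) == 1024.
 */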
2632 static u_long
2633 cache_roundup_2(u_long val)
2634 {
2635 u_long res;
2636
2637 for (res = 1; res <= val; res <<= 1)
2638 continue;
2639
2640 return (res);
2641 }
2642
2643 static struct nchashhead *
2644 nchinittbl(u_long elements, u_long *hashmask)
2645 {
2646 struct nchashhead *hashtbl;
2647 u_long hashsize, i;
2648
2649 hashsize = cache_roundup_2(elements) / 2;
2650
2651 hashtbl = malloc(hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2652 for (i = 0; i < hashsize; i++)
2653 CK_SLIST_INIT(&hashtbl[i]);
2654 *hashmask = hashsize - 1;
2655 return (hashtbl);
2656 }
2657
2658 static void
2659 ncfreetbl(struct nchashhead *hashtbl)
2660 {
2661
2662 free(hashtbl, M_VFSCACHE);
2663 }
2664
2665 /*
2666 * Name cache initialization, from vfs_init() when we are booting
2667 */
2668 static void
2669 nchinit(void *dummy __unused)
2670 {
2671 u_int i;
2672
2673 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
2674 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2675 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
2676 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2677 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
2678 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2679 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
2680 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2681
2682 VFS_SMR_ZONE_SET(cache_zone_small);
2683 VFS_SMR_ZONE_SET(cache_zone_small_ts);
2684 VFS_SMR_ZONE_SET(cache_zone_large);
2685 VFS_SMR_ZONE_SET(cache_zone_large_ts);
2686
2687 ncsize = desiredvnodes * ncsizefactor;
2688 cache_recalc_neg_min();
2689 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
2690 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
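	/*
	 * Example: with 8 CPUs the assignment above computes
	 * cache_roundup_2(8 * 8) - 1 == 127; the checks below clamp the
	 * value into a sane range.
	 */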
2691 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2692 ncbuckethash = 7;
2693 if (ncbuckethash > nchash)
2694 ncbuckethash = nchash;
2695 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2696 M_WAITOK | M_ZERO);
2697 for (i = 0; i < numbucketlocks; i++)
2698 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
2699 ncvnodehash = ncbuckethash;
2700 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2701 M_WAITOK | M_ZERO);
2702 for (i = 0; i < numvnodelocks; i++)
2703 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2704
2705 for (i = 0; i < numneglists; i++) {
2706 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF);
2707 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2708 TAILQ_INIT(&neglists[i].nl_list);
2709 TAILQ_INIT(&neglists[i].nl_hotlist);
2710 }
2711 }
2712 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2713
2714 void
2715 cache_vnode_init(struct vnode *vp)
2716 {
2717
2718 LIST_INIT(&vp->v_cache_src);
2719 TAILQ_INIT(&vp->v_cache_dst);
2720 vp->v_cache_dd = NULL;
2721 cache_prehash(vp);
2722 }
2723
2724 /*
2725 * Induce transient cache misses for lockless operation in cache_lookup() by
2726 * using a temporary hash table.
2727 *
2728 * This will force a fs lookup.
2729 *
2730 * Synchronisation is done in 2 steps, calling vfs_smr_synchronize each time
2731 * to observe all CPUs not performing the lookup.
2732 */
2733 static void
2734 cache_changesize_set_temp(struct nchashhead *temptbl, u_long temphash)
2735 {
2736
2737 MPASS(temphash < nchash);
2738 /*
2739 * Change the size. The new size is smaller and can safely be used
2740 * against the existing table. All lookups which now hash wrong will
2741 * result in a cache miss, which all callers are supposed to know how
2742 * to handle.
2743 */
2744 atomic_store_long(&nchash, temphash);
2745 atomic_thread_fence_rel();
2746 vfs_smr_synchronize();
2747 /*
2748 * At this point everyone sees the updated hash value, but they still
2749 * see the old table.
2750 */
2751 atomic_store_ptr(&nchashtbl, temptbl);
2752 atomic_thread_fence_rel();
2753 vfs_smr_synchronize();
2754 /*
2755 * At this point everyone sees the updated table pointer and size pair.
2756 */
2757 }
2758
2759 /*
2760 * Set the new hash table.
2761 *
2762 * Similarly to cache_changesize_set_temp(), this has to synchronize against
2763 * lockless operation in cache_lookup().
2764 */
2765 static void
2766 cache_changesize_set_new(struct nchashhead *new_tbl, u_long new_hash)
2767 {
2768
2769 MPASS(nchash < new_hash);
2770 /*
2771 * Change the pointer first. This won't result in out-of-bounds access
2772 * since the temporary table is guaranteed to be smaller.
2773 */
2774 atomic_store_ptr(&nchashtbl, new_tbl);
2775 atomic_thread_fence_rel();
2776 vfs_smr_synchronize();
2777 /*
2778 * At this point everyone sees the updated pointer value, but they
2779 * still see the old size.
2780 */
2781 atomic_store_long(&nchash, new_hash);
2782 atomic_thread_fence_rel();
2783 vfs_smr_synchronize();
2784 /*
2785 * At this point everyone sees the updated table pointer and size pair.
2786 */
2787 }
2788
2789 void
2790 cache_changesize(u_long newmaxvnodes)
2791 {
2792 struct nchashhead *new_nchashtbl, *old_nchashtbl, *temptbl;
2793 u_long new_nchash, old_nchash, temphash;
2794 struct namecache *ncp;
2795 uint32_t hash;
2796 u_long newncsize;
2797 u_long i;
2798
2799 newncsize = newmaxvnodes * ncsizefactor;
2800 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2801 if (newmaxvnodes < numbucketlocks)
2802 newmaxvnodes = numbucketlocks;
2803
2804 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2805 /* If same hash table size, nothing to do */
2806 if (nchash == new_nchash) {
2807 ncfreetbl(new_nchashtbl);
2808 return;
2809 }
2810
2811 temptbl = nchinittbl(1, &temphash);
2812
2813 /*
2814 * Move everything from the old hash table to the new table.
2815 * None of the namecache entries in the table can be removed
2816 * because to do so, they have to be removed from the hash table.
2817 */
2818 cache_lock_all_vnodes();
2819 cache_lock_all_buckets();
2820 old_nchashtbl = nchashtbl;
2821 old_nchash = nchash;
2822 cache_changesize_set_temp(temptbl, temphash);
2823 for (i = 0; i <= old_nchash; i++) {
2824 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2825 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2826 ncp->nc_dvp);
2827 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2828 CK_SLIST_INSERT_HEAD(&new_nchashtbl[hash & new_nchash], ncp, nc_hash);
2829 }
2830 }
2831 ncsize = newncsize;
2832 cache_recalc_neg_min();
2833 cache_changesize_set_new(new_nchashtbl, new_nchash);
2834 cache_unlock_all_buckets();
2835 cache_unlock_all_vnodes();
2836 ncfreetbl(old_nchashtbl);
2837 ncfreetbl(temptbl);
2838 }
2839
2840 /*
2841 * Remove all entries from and to a particular vnode.
2842 */
2843 static void
2844 cache_purge_impl(struct vnode *vp)
2845 {
2846 struct cache_freebatch batch;
2847 struct namecache *ncp;
2848 struct mtx *vlp, *vlp2;
2849
2850 TAILQ_INIT(&batch);
2851 vlp = VP2VNODELOCK(vp);
2852 vlp2 = NULL;
2853 mtx_lock(vlp);
2854 retry:
2855 while (!LIST_EMPTY(&vp->v_cache_src)) {
2856 ncp = LIST_FIRST(&vp->v_cache_src);
2857 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2858 goto retry;
2859 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2860 }
2861 while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2862 ncp = TAILQ_FIRST(&vp->v_cache_dst);
2863 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2864 goto retry;
2865 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2866 }
2867 ncp = vp->v_cache_dd;
2868 if (ncp != NULL) {
2869 KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2870 ("lost dotdot link"));
2871 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2872 goto retry;
2873 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2874 }
2875 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2876 mtx_unlock(vlp);
2877 if (vlp2 != NULL)
2878 mtx_unlock(vlp2);
2879 cache_free_batch(&batch);
2880 }
2881
2882 /*
2883 * Opportunistic check to see if there is anything to do.
2884 */
2885 static bool
2886 cache_has_entries(struct vnode *vp)
2887 {
2888
2889 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2890 atomic_load_ptr(&vp->v_cache_dd) == NULL)
2891 return (false);
2892 return (true);
2893 }
2894
2895 void
2896 cache_purge(struct vnode *vp)
2897 {
2898
2899 SDT_PROBE1(vfs, namecache, purge, done, vp);
2900 if (!cache_has_entries(vp))
2901 return;
2902 cache_purge_impl(vp);
2903 }
2904
2905 /*
2906 * Only to be used by vgone.
2907 */
2908 void
2909 cache_purge_vgone(struct vnode *vp)
2910 {
2911 struct mtx *vlp;
2912
2913 VNPASS(VN_IS_DOOMED(vp), vp);
2914 if (cache_has_entries(vp)) {
2915 cache_purge_impl(vp);
2916 return;
2917 }
2918
2919 /*
2920 * Serialize against a potential thread doing cache_purge.
2921 */
2922 vlp = VP2VNODELOCK(vp);
2923 mtx_wait_unlocked(vlp);
2924 if (cache_has_entries(vp)) {
2925 cache_purge_impl(vp);
2926 return;
2927 }
2928 return;
2929 }
2930
2931 /*
2932 * Remove all negative entries for a particular directory vnode.
2933 */
2934 void
2935 cache_purge_negative(struct vnode *vp)
2936 {
2937 struct cache_freebatch batch;
2938 struct namecache *ncp, *nnp;
2939 struct mtx *vlp;
2940
2941 SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2942 if (LIST_EMPTY(&vp->v_cache_src))
2943 return;
2944 TAILQ_INIT(&batch);
2945 vlp = VP2VNODELOCK(vp);
2946 mtx_lock(vlp);
2947 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2948 if (!(ncp->nc_flag & NCF_NEGATIVE))
2949 continue;
2950 cache_zap_negative_locked_vnode_kl(ncp, vp);
2951 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2952 }
2953 mtx_unlock(vlp);
2954 cache_free_batch(&batch);
2955 }
2956
2957 /*
2958 * Entry points for modifying VOP operations.
2959 */
2960 void
2961 cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
2962 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
2963 {
2964
2965 ASSERT_VOP_IN_SEQC(fdvp);
2966 ASSERT_VOP_IN_SEQC(fvp);
2967 ASSERT_VOP_IN_SEQC(tdvp);
2968 if (tvp != NULL)
2969 ASSERT_VOP_IN_SEQC(tvp);
2970
2971 cache_purge(fvp);
2972 if (tvp != NULL) {
2973 cache_purge(tvp);
2974 KASSERT(!cache_remove_cnp(tdvp, tcnp),
2975 ("%s: lingering negative entry", __func__));
2976 } else {
2977 cache_remove_cnp(tdvp, tcnp);
2978 }
2979
2980 /*
2981 * TODO
2982 *
2983 * Historically renaming always purged all relevant entries,
2984 * but that's quite wasteful. In particular it turns out that in many cases
2985 * the target file is immediately accessed after rename, inducing a cache
2986 * miss.
2987 *
2988 * Recode this to reduce relocking and reuse the existing entry (if any)
2989 * instead of just removing it above and allocating a new one here.
2990 */
2991 cache_enter(tdvp, fvp, tcnp);
2992 }
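
/*
 * Illustrative sketch (not compiled): the expected calling pattern from a
 * filesystem rename implementation. The vnodes involved must be within an
 * seqc write section (as asserted above) when the namecache is updated; the
 * surrounding rename work is elided and the function name is hypothetical.
 */
#if 0
static void
example_fs_rename_cache_update(struct vnode *fdvp, struct vnode *fvp,
    struct vnode *tdvp, struct vnode *tvp, struct componentname *fcnp,
    struct componentname *tcnp)
{

	vn_seqc_write_begin(fdvp);
	vn_seqc_write_begin(fvp);
	vn_seqc_write_begin(tdvp);
	if (tvp != NULL)
		vn_seqc_write_begin(tvp);
	/* ... perform the filesystem-specific rename here ... */
	cache_vop_rename(fdvp, fvp, tdvp, tvp, fcnp, tcnp);
	if (tvp != NULL)
		vn_seqc_write_end(tvp);
	vn_seqc_write_end(tdvp);
	vn_seqc_write_end(fvp);
	vn_seqc_write_end(fdvp);
}
#endif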
2993
2994 void
2995 cache_vop_rmdir(struct vnode *dvp, struct vnode *vp)
2996 {
2997
2998 ASSERT_VOP_IN_SEQC(dvp);
2999 ASSERT_VOP_IN_SEQC(vp);
3000 cache_purge(vp);
3001 }
3002
3003 #ifdef INVARIANTS
3004 /*
3005 * Validate that if an entry exists it matches.
3006 */
3007 void
3008 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
3009 {
3010 struct namecache *ncp;
3011 struct mtx *blp;
3012 uint32_t hash;
3013
3014 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
3015 if (CK_SLIST_EMPTY(NCHHASH(hash)))
3016 return;
3017 blp = HASH2BUCKETLOCK(hash);
3018 mtx_lock(blp);
3019 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
3020 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
3021 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) {
3022 if (ncp->nc_vp != vp)
3023 panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p\n",
3024 __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp);
3025 }
3026 }
3027 mtx_unlock(blp);
3028 }
3029
3030 void
3031 cache_assert_no_entries(struct vnode *vp)
3032 {
3033
3034 VNPASS(TAILQ_EMPTY(&vp->v_cache_dst), vp);
3035 VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
3036 VNPASS(vp->v_cache_dd == NULL, vp);
3037 }
3038 #endif
3039
3040 /*
3041 * Flush all entries referencing a particular filesystem.
3042 */
3043 void
3044 cache_purgevfs(struct mount *mp)
3045 {
3046 struct vnode *vp, *mvp;
3047 size_t visited __sdt_used, purged __sdt_used;
3048
3049 visited = purged = 0;
3050 /*
3051 * Somewhat wasteful iteration over all vnodes. Would be better to
3052 * support filtering and avoid the interlock to begin with.
3053 */
3054 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
3055 visited++;
3056 if (!cache_has_entries(vp)) {
3057 VI_UNLOCK(vp);
3058 continue;
3059 }
3060 vholdl(vp);
3061 VI_UNLOCK(vp);
3062 cache_purge(vp);
3063 purged++;
3064 vdrop(vp);
3065 }
3066
3067 SDT_PROBE3(vfs, namecache, purgevfs, done, mp, visited, purged);
3068 }
3069
3070 /*
3071 * Perform canonical checks and a cache lookup, then pass on to the filesystem
3072 * through VOP_CACHEDLOOKUP only if needed.
3073 */
3074
3075 int
3076 vfs_cache_lookup(struct vop_lookup_args *ap)
3077 {
3078 struct vnode *dvp;
3079 int error;
3080 struct vnode **vpp = ap->a_vpp;
3081 struct componentname *cnp = ap->a_cnp;
3082 int flags = cnp->cn_flags;
3083
3084 *vpp = NULL;
3085 dvp = ap->a_dvp;
3086
3087 if (dvp->v_type != VDIR)
3088 return (ENOTDIR);
3089
3090 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
3091 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
3092 return (EROFS);
3093
3094 error = vn_dir_check_exec(dvp, cnp);
3095 if (error != 0)
3096 return (error);
3097
3098 error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
3099 if (error == 0)
3100 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
3101 if (error == -1)
3102 return (0);
3103 return (error);
3104 }
3105
3106 /* Implementation of the getcwd syscall. */
3107 int
3108 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
3109 {
3110 char *buf, *retbuf;
3111 size_t buflen;
3112 int error;
3113
3114 buflen = uap->buflen;
3115 if (__predict_false(buflen < 2))
3116 return (EINVAL);
3117 if (buflen > MAXPATHLEN)
3118 buflen = MAXPATHLEN;
3119
3120 buf = uma_zalloc(namei_zone, M_WAITOK);
3121 error = vn_getcwd(buf, &retbuf, &buflen);
3122 if (error == 0)
3123 error = copyout(retbuf, uap->buf, buflen);
3124 uma_zfree(namei_zone, buf);
3125 return (error);
3126 }
3127
3128 int
3129 vn_getcwd(char *buf, char **retbuf, size_t *buflen)
3130 {
3131 struct pwd *pwd;
3132 int error;
3133
3134 vfs_smr_enter();
3135 pwd = pwd_get_smr();
3136 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
3137 buflen, 0);
3138 VFS_SMR_ASSERT_NOT_ENTERED();
3139 if (error < 0) {
3140 pwd = pwd_hold(curthread);
3141 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
3142 retbuf, buflen);
3143 pwd_drop(pwd);
3144 }
3145
3146 #ifdef KTRACE
3147 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
3148 ktrnamei(*retbuf);
3149 #endif
3150 return (error);
3151 }
3152
3153 /*
3154 * Canonicalize a path by walking it forward and back.
3155 *
3156 * BUGS:
3157 * - Nothing guarantees the integrity of the entire chain. Consider the case
3158 * where the path "foo/bar/baz/qux" is passed, but "bar" is moved out of
3159 * "foo" into "quux" during the backwards walk. The result will be
3160 * "quux/bar/baz/qux", which could not have been obtained by an incremental
3161 * walk in userspace. Moreover, the path we return is inaccessible if the
3162 * calling thread lacks permission to traverse "quux".
3163 */
3164 static int
3165 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
3166 size_t size, int flags, enum uio_seg pathseg)
3167 {
3168 struct nameidata nd;
3169 char *retbuf, *freebuf;
3170 int error;
3171
3172 if (flags != 0)
3173 return (EINVAL);
3174 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | WANTPARENT | AUDITVNODE1,
3175 pathseg, path, fd, &cap_fstat_rights);
3176 if ((error = namei(&nd)) != 0)
3177 return (error);
3178
3179 if (nd.ni_vp->v_type == VREG && nd.ni_dvp->v_type != VDIR &&
3180 (nd.ni_vp->v_vflag & VV_ROOT) != 0) {
3181 /*
3182 * This happens if vp is a file mount. The call to
3183 * vn_fullpath_hardlink can panic if path resolution can't be
3184 * handled without the directory.
3185 *
3186 * To resolve this, we find the vnode which was mounted on -
3187 * this should have a unique global path since we disallow
3188 * mounting on linked files.
3189 */
3190 struct vnode *covered_vp;
3191 error = vn_lock(nd.ni_vp, LK_SHARED);
3192 if (error != 0)
3193 goto out;
3194 covered_vp = nd.ni_vp->v_mount->mnt_vnodecovered;
3195 vref(covered_vp);
3196 VOP_UNLOCK(nd.ni_vp);
3197 error = vn_fullpath(covered_vp, &retbuf, &freebuf);
3198 vrele(covered_vp);
3199 } else {
3200 error = vn_fullpath_hardlink(nd.ni_vp, nd.ni_dvp, nd.ni_cnd.cn_nameptr,
3201 nd.ni_cnd.cn_namelen, &retbuf, &freebuf, &size);
3202 }
3203 if (error == 0) {
3204 error = copyout(retbuf, buf, size);
3205 free(freebuf, M_TEMP);
3206 }
3207 out:
3208 vrele(nd.ni_vp);
3209 vrele(nd.ni_dvp);
3210 NDFREE_PNBUF(&nd);
3211 return (error);
3212 }
3213
3214 int
3215 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
3216 {
3217
3218 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
3219 uap->flags, UIO_USERSPACE));
3220 }
3221
3222 /*
3223 * Retrieve the full filesystem path that corresponds to a vnode from the name
3224 * cache (if available).
3225 */
3226 int
3227 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
3228 {
3229 struct pwd *pwd;
3230 char *buf;
3231 size_t buflen;
3232 int error;
3233
3234 if (__predict_false(vp == NULL))
3235 return (EINVAL);
3236
3237 buflen = MAXPATHLEN;
3238 buf = malloc(buflen, M_TEMP, M_WAITOK);
3239 vfs_smr_enter();
3240 pwd = pwd_get_smr();
3241 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0);
3242 VFS_SMR_ASSERT_NOT_ENTERED();
3243 if (error < 0) {
3244 pwd = pwd_hold(curthread);
3245 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
3246 pwd_drop(pwd);
3247 }
3248 if (error == 0)
3249 *freebuf = buf;
3250 else
3251 free(buf, M_TEMP);
3252 return (error);
3253 }
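
/*
 * Illustrative sketch (not compiled): typical use of vn_fullpath(). On
 * success *retbuf points into the buffer returned via *freebuf, which the
 * caller frees with M_TEMP. The function name is hypothetical.
 */
#if 0
static void
example_print_vnode_path(struct vnode *vp)
{
	char *fullpath, *freepath;

	if (vn_fullpath(vp, &fullpath, &freepath) == 0) {
		printf("%s\n", fullpath);
		free(freepath, M_TEMP);
	}
}
#endif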
3254
3255 /*
3256 * This function is similar to vn_fullpath, but it attempts to lookup the
3257 * pathname relative to the global root mount point. This is required for the
3258 * auditing sub-system, as audited pathnames must be absolute, relative to the
3259 * global root mount point.
3260 */
3261 int
3262 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
3263 {
3264 char *buf;
3265 size_t buflen;
3266 int error;
3267
3268 if (__predict_false(vp == NULL))
3269 return (EINVAL);
3270 buflen = MAXPATHLEN;
3271 buf = malloc(buflen, M_TEMP, M_WAITOK);
3272 vfs_smr_enter();
3273 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0);
3274 VFS_SMR_ASSERT_NOT_ENTERED();
3275 if (error < 0) {
3276 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
3277 }
3278 if (error == 0)
3279 *freebuf = buf;
3280 else
3281 free(buf, M_TEMP);
3282 return (error);
3283 }
3284
3285 static struct namecache *
3286 vn_dd_from_dst(struct vnode *vp)
3287 {
3288 struct namecache *ncp;
3289
3290 cache_assert_vnode_locked(vp);
3291 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
3292 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3293 return (ncp);
3294 }
3295 return (NULL);
3296 }
3297
3298 int
3299 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen)
3300 {
3301 struct vnode *dvp;
3302 struct namecache *ncp;
3303 struct mtx *vlp;
3304 int error;
3305
3306 vlp = VP2VNODELOCK(*vp);
3307 mtx_lock(vlp);
3308 ncp = (*vp)->v_cache_dd;
3309 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
3310 KASSERT(ncp == vn_dd_from_dst(*vp),
3311 ("%s: mismatch for dd entry (%p != %p)", __func__,
3312 ncp, vn_dd_from_dst(*vp)));
3313 } else {
3314 ncp = vn_dd_from_dst(*vp);
3315 }
3316 if (ncp != NULL) {
3317 if (*buflen < ncp->nc_nlen) {
3318 mtx_unlock(vlp);
3319 vrele(*vp);
3320 counter_u64_add(numfullpathfail4, 1);
3321 error = ENOMEM;
3322 SDT_PROBE3(vfs, namecache, fullpath, return, error,
3323 vp, NULL);
3324 return (error);
3325 }
3326 *buflen -= ncp->nc_nlen;
3327 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
3328 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
3329 ncp->nc_name, vp);
3330 dvp = *vp;
3331 *vp = ncp->nc_dvp;
3332 vref(*vp);
3333 mtx_unlock(vlp);
3334 vrele(dvp);
3335 return (0);
3336 }
3337 SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
3338
3339 mtx_unlock(vlp);
3340 vn_lock(*vp, LK_SHARED | LK_RETRY);
3341 error = VOP_VPTOCNP(*vp, &dvp, buf, buflen);
3342 vput(*vp);
3343 if (error) {
3344 counter_u64_add(numfullpathfail2, 1);
3345 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
3346 return (error);
3347 }
3348
3349 *vp = dvp;
3350 if (VN_IS_DOOMED(dvp)) {
3351 /* forced unmount */
3352 vrele(dvp);
3353 error = ENOENT;
3354 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
3355 return (error);
3356 }
3357 /*
3358 * *vp has its use count incremented still.
3359 */
3360
3361 return (0);
3362 }
3363
3364 /*
3365 * Resolve a directory to a pathname.
3366 *
3367 * The name of the directory can always be found in the namecache or fetched
3368 * from the filesystem. There is also guaranteed to be only one parent, meaning
3369 * we can just follow vnodes up until we find the root.
3370 *
3371 * The vnode must be referenced.
3372 */
3373 static int
3374 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3375 size_t *len, size_t addend)
3376 {
3377 #ifdef KDTRACE_HOOKS
3378 struct vnode *startvp = vp;
3379 #endif
3380 struct vnode *vp1;
3381 size_t buflen;
3382 int error;
3383 bool slash_prefixed;
3384
3385 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
3386 VNPASS(vp->v_usecount > 0, vp);
3387
3388 buflen = *len;
3389
3390 slash_prefixed = true;
3391 if (addend == 0) {
3392 MPASS(*len >= 2);
3393 buflen--;
3394 buf[buflen] = '\0';
3395 slash_prefixed = false;
3396 }
3397
3398 error = 0;
3399
3400 SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
3401 counter_u64_add(numfullpathcalls, 1);
3402 while (vp != rdir && vp != rootvnode) {
3403 /*
3404 * The vp vnode must be already fully constructed,
3405 * since it is either found in namecache or obtained
3406 * from VOP_VPTOCNP(). We may test for VV_ROOT safely
3407 * without obtaining the vnode lock.
3408 */
3409 if ((vp->v_vflag & VV_ROOT) != 0) {
3410 vn_lock(vp, LK_RETRY | LK_SHARED);
3411
3412 /*
3413 * With the vnode locked, check for races with
3414 * unmount, forced or not. Note that we
3415 * already verified that vp is not equal to
3416 * the root vnode, which means that
3417 * mnt_vnodecovered can be NULL only for the
3418 * case of unmount.
3419 */
3420 if (VN_IS_DOOMED(vp) ||
3421 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
3422 vp1->v_mountedhere != vp->v_mount) {
3423 vput(vp);
3424 error = ENOENT;
3425 SDT_PROBE3(vfs, namecache, fullpath, return,
3426 error, vp, NULL);
3427 break;
3428 }
3429
3430 vref(vp1);
3431 vput(vp);
3432 vp = vp1;
3433 continue;
3434 }
3435 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
3436 error = vn_vptocnp(&vp, buf, &buflen);
3437 if (error)
3438 break;
3439 if (buflen == 0) {
3440 vrele(vp);
3441 error = ENOMEM;
3442 SDT_PROBE3(vfs, namecache, fullpath, return, error,
3443 startvp, NULL);
3444 break;
3445 }
3446 buf[--buflen] = '/';
3447 slash_prefixed = true;
3448 }
3449 if (error)
3450 return (error);
3451 if (!slash_prefixed) {
3452 if (buflen == 0) {
3453 vrele(vp);
3454 counter_u64_add(numfullpathfail4, 1);
3455 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
3456 startvp, NULL);
3457 return (ENOMEM);
3458 }
3459 buf[--buflen] = '/';
3460 }
3461 counter_u64_add(numfullpathfound, 1);
3462 vrele(vp);
3463
3464 *retbuf = buf + buflen;
3465 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
3466 *len -= buflen;
3467 *len += addend;
3468 return (0);
3469 }
3470
3471 /*
3472 * Resolve an arbitrary vnode to a pathname.
3473 *
3474 * Note 2 caveats:
3475 * - hardlinks are not tracked, thus if the vnode is not a directory this can
3476 * resolve to a different path than the one used to find it
3477 * - namecache is not mandatory, meaning names are not guaranteed to be added
3478 * (in which case resolving fails)
3479 */
3480 static void __inline
3481 cache_rev_failed_impl(int *reason, int line)
3482 {
3483
3484 *reason = line;
3485 }
3486 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__)
3487
3488 static int
3489 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
3490 char **retbuf, size_t *buflen, size_t addend)
3491 {
3492 #ifdef KDTRACE_HOOKS
3493 struct vnode *startvp = vp;
3494 #endif
3495 struct vnode *tvp;
3496 struct mount *mp;
3497 struct namecache *ncp;
3498 size_t orig_buflen;
3499 int reason;
3500 int error;
3501 #ifdef KDTRACE_HOOKS
3502 int i;
3503 #endif
3504 seqc_t vp_seqc, tvp_seqc;
3505 u_char nc_flag;
3506
3507 VFS_SMR_ASSERT_ENTERED();
3508
3509 if (!atomic_load_char(&cache_fast_lookup_enabled)) {
3510 vfs_smr_exit();
3511 return (-1);
3512 }
3513
3514 orig_buflen = *buflen;
3515
3516 if (addend == 0) {
3517 MPASS(*buflen >= 2);
3518 *buflen -= 1;
3519 buf[*buflen] = '\0';
3520 }
3521
3522 if (vp == rdir || vp == rootvnode) {
3523 if (addend == 0) {
3524 *buflen -= 1;
3525 buf[*buflen] = '/';
3526 }
3527 goto out_ok;
3528 }
3529
3530 #ifdef KDTRACE_HOOKS
3531 i = 0;
3532 #endif
3533 error = -1;
3534 ncp = NULL; /* for sdt probe down below */
3535 vp_seqc = vn_seqc_read_any(vp);
3536 if (seqc_in_modify(vp_seqc)) {
3537 cache_rev_failed(&reason);
3538 goto out_abort;
3539 }
3540
3541 for (;;) {
3542 #ifdef KDTRACE_HOOKS
3543 i++;
3544 #endif
3545 if ((vp->v_vflag & VV_ROOT) != 0) {
3546 mp = atomic_load_ptr(&vp->v_mount);
3547 if (mp == NULL) {
3548 cache_rev_failed(&reason);
3549 goto out_abort;
3550 }
3551 tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
3552 tvp_seqc = vn_seqc_read_any(tvp);
3553 if (seqc_in_modify(tvp_seqc)) {
3554 cache_rev_failed(&reason);
3555 goto out_abort;
3556 }
3557 if (!vn_seqc_consistent(vp, vp_seqc)) {
3558 cache_rev_failed(&reason);
3559 goto out_abort;
3560 }
3561 vp = tvp;
3562 vp_seqc = tvp_seqc;
3563 continue;
3564 }
3565 ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
3566 if (ncp == NULL) {
3567 cache_rev_failed(&reason);
3568 goto out_abort;
3569 }
3570 nc_flag = atomic_load_char(&ncp->nc_flag);
3571 if ((nc_flag & NCF_ISDOTDOT) != 0) {
3572 cache_rev_failed(&reason);
3573 goto out_abort;
3574 }
3575 if (ncp->nc_nlen >= *buflen) {
3576 cache_rev_failed(&reason);
3577 error = ENOMEM;
3578 goto out_abort;
3579 }
3580 *buflen -= ncp->nc_nlen;
3581 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
3582 *buflen -= 1;
3583 buf[*buflen] = '/';
3584 tvp = ncp->nc_dvp;
3585 tvp_seqc = vn_seqc_read_any(tvp);
3586 if (seqc_in_modify(tvp_seqc)) {
3587 cache_rev_failed(&reason);
3588 goto out_abort;
3589 }
3590 if (!vn_seqc_consistent(vp, vp_seqc)) {
3591 cache_rev_failed(&reason);
3592 goto out_abort;
3593 }
3594 /*
3595 * Acquire fence provided by vn_seqc_read_any above.
3596 */
3597 if (__predict_false(atomic_load_ptr(&vp->v_cache_dd) != ncp)) {
3598 cache_rev_failed(&reason);
3599 goto out_abort;
3600 }
3601 if (!cache_ncp_canuse(ncp)) {
3602 cache_rev_failed(&reason);
3603 goto out_abort;
3604 }
3605 vp = tvp;
3606 vp_seqc = tvp_seqc;
3607 if (vp == rdir || vp == rootvnode)
3608 break;
3609 }
3610 out_ok:
3611 vfs_smr_exit();
3612 *retbuf = buf + *buflen;
3613 *buflen = orig_buflen - *buflen + addend;
3614 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
3615 return (0);
3616
3617 out_abort:
3618 *buflen = orig_buflen;
3619 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
3620 vfs_smr_exit();
3621 return (error);
3622 }
3623
3624 static int
3625 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3626 size_t *buflen)
3627 {
3628 size_t orig_buflen, addend;
3629 int error;
3630
3631 if (*buflen < 2)
3632 return (EINVAL);
3633
3634 orig_buflen = *buflen;
3635
3636 vref(vp);
3637 addend = 0;
3638 if (vp->v_type != VDIR) {
3639 *buflen -= 1;
3640 buf[*buflen] = '\0';
3641 error = vn_vptocnp(&vp, buf, buflen);
3642 if (error)
3643 return (error);
3644 if (*buflen == 0) {
3645 vrele(vp);
3646 return (ENOMEM);
3647 }
3648 *buflen -= 1;
3649 buf[*buflen] = '/';
3650 addend = orig_buflen - *buflen;
3651 }
3652
3653 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend));
3654 }
3655
3656 /*
3657 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
3658 *
3659 * Since the namecache does not track hardlinks, the caller is expected to
3660 * first look up the target vnode with WANTPARENT flag passed to namei to get
3661 * dvp and vp.
3662 *
3663 * Then we have 2 cases:
3664 * - if the found vnode is a directory, the path can be constructed just by
3665 * following names up the chain
3666 * - otherwise we populate the buffer with the saved name and start resolving
3667 * from the parent
3668 */
3669 int
3670 vn_fullpath_hardlink(struct vnode *vp, struct vnode *dvp,
3671 const char *hrdl_name, size_t hrdl_name_length,
3672 char **retbuf, char **freebuf, size_t *buflen)
3673 {
3674 char *buf, *tmpbuf;
3675 struct pwd *pwd;
3676 size_t addend;
3677 int error;
3678 __enum_uint8(vtype) type;
3679
3680 if (*buflen < 2)
3681 return (EINVAL);
3682 if (*buflen > MAXPATHLEN)
3683 *buflen = MAXPATHLEN;
3684
3685 buf = malloc(*buflen, M_TEMP, M_WAITOK);
3686
3687 addend = 0;
3688
3689 /*
3690 * Check for VBAD to work around the vp_crossmp bug in lookup().
3691 *
3692 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
3693 * set to mount point's root vnode while ni_dvp will be vp_crossmp.
3694 * If the type is VDIR (like in this very case) we can skip looking
3695 * at ni_dvp in the first place. However, since vnodes get passed here
3696 * unlocked, the target may transition to the doomed state (type == VBAD)
3697 * before we get to evaluate the condition. If this happens, we will
3698 * populate part of the buffer and descend to vn_fullpath_dir with
3699 * vp == vp_crossmp. Prevent the problem by checking for VBAD.
3700 */
3701 type = atomic_load_8(&vp->v_type);
3702 if (type == VBAD) {
3703 error = ENOENT;
3704 goto out_bad;
3705 }
3706 if (type != VDIR) {
3707 addend = hrdl_name_length + 2;
3708 if (*buflen < addend) {
3709 error = ENOMEM;
3710 goto out_bad;
3711 }
3712 *buflen -= addend;
3713 tmpbuf = buf + *buflen;
3714 tmpbuf[0] = '/';
3715 memcpy(&tmpbuf[1], hrdl_name, hrdl_name_length);
3716 tmpbuf[addend - 1] = '\0';
3717 vp = dvp;
3718 }
3719
3720 vfs_smr_enter();
3721 pwd = pwd_get_smr();
3722 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3723 addend);
3724 VFS_SMR_ASSERT_NOT_ENTERED();
3725 if (error < 0) {
3726 pwd = pwd_hold(curthread);
3727 vref(vp);
3728 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3729 addend);
3730 pwd_drop(pwd);
3731 }
3732 if (error != 0)
3733 goto out_bad;
3734
3735 *freebuf = buf;
3736
3737 return (0);
3738 out_bad:
3739 free(buf, M_TEMP);
3740 return (error);
3741 }
3742
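/*
 * Retrieve the parent directory of the given vnode using reverse namecache
 * entries, skipping ".." entries.  On success the parent is returned
 * referenced and shared-locked; NULL is returned if no usable entry is
 * found or the lock cannot be acquired without sleeping.
 */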
3743 struct vnode *
3744 vn_dir_dd_ino(struct vnode *vp)
3745 {
3746 struct namecache *ncp;
3747 struct vnode *ddvp;
3748 struct mtx *vlp;
3749 enum vgetstate vs;
3750
3751 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
3752 vlp = VP2VNODELOCK(vp);
3753 mtx_lock(vlp);
3754 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
3755 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
3756 continue;
3757 ddvp = ncp->nc_dvp;
3758 vs = vget_prep(ddvp);
3759 mtx_unlock(vlp);
3760 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
3761 return (NULL);
3762 return (ddvp);
3763 }
3764 mtx_unlock(vlp);
3765 return (NULL);
3766 }
3767
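/*
 * Copy the name of the first non-".." namecache entry pointing at the vnode
 * into the provided buffer and NUL-terminate it.  Returns ENOENT if no such
 * entry exists.
 */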
3768 int
3769 vn_commname(struct vnode *vp, char *buf, u_int buflen)
3770 {
3771 struct namecache *ncp;
3772 struct mtx *vlp;
3773 int l;
3774
3775 vlp = VP2VNODELOCK(vp);
3776 mtx_lock(vlp);
3777 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3778 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3779 break;
3780 if (ncp == NULL) {
3781 mtx_unlock(vlp);
3782 return (ENOENT);
3783 }
3784 l = min(ncp->nc_nlen, buflen - 1);
3785 memcpy(buf, ncp->nc_name, l);
3786 mtx_unlock(vlp);
3787 buf[l] = '\0';
3788 return (0);
3789 }
3790
3791 /*
3792 * This function updates the path string to the vnode's full global path
3793 * and checks the size of the new path string against the pathlen argument.
3794 *
3795 * Requires a locked, referenced vnode.
3796 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3797 *
3798 * If vp is a directory, the call to vn_fullpath_global() always succeeds
3799 * because it falls back to the ".." lookup if the namecache lookup fails.
3800 */
3801 int
3802 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
3803 u_int pathlen)
3804 {
3805 struct nameidata nd;
3806 struct vnode *vp1;
3807 char *rpath, *fbuf;
3808 int error;
3809
3810 ASSERT_VOP_ELOCKED(vp, __func__);
3811
3812 /* Construct global filesystem path from vp. */
3813 VOP_UNLOCK(vp);
3814 error = vn_fullpath_global(vp, &rpath, &fbuf);
3815
3816 if (error != 0) {
3817 vrele(vp);
3818 return (error);
3819 }
3820
3821 if (strlen(rpath) >= pathlen) {
3822 vrele(vp);
3823 error = ENAMETOOLONG;
3824 goto out;
3825 }
3826
3827 /*
3828 * Re-lookup the vnode by path to detect a possible rename.
3829 * As a side effect, the vnode is relocked.
3830 * If vnode was renamed, return ENOENT.
3831 */
3832 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path);
3833 error = namei(&nd);
3834 if (error != 0) {
3835 vrele(vp);
3836 goto out;
3837 }
3838 NDFREE_PNBUF(&nd);
3839 vp1 = nd.ni_vp;
3840 vrele(vp);
3841 if (vp1 == vp)
3842 strcpy(path, rpath);
3843 else {
3844 vput(vp1);
3845 error = ENOENT;
3846 }
3847
3848 out:
3849 free(fbuf, M_TEMP);
3850 return (error);
3851 }
3852
3853 /*
3854 * This is similar to vn_path_to_global_path but allows for regular
3855 * files which may not be present in the cache.
3856 *
3857 * Requires a locked, referenced vnode.
3858 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3859 */
3860 int
3861 vn_path_to_global_path_hardlink(struct thread *td, struct vnode *vp,
3862 struct vnode *dvp, char *path, u_int pathlen, const char *leaf_name,
3863 size_t leaf_length)
3864 {
3865 struct nameidata nd;
3866 struct vnode *vp1;
3867 char *rpath, *fbuf;
3868 size_t len;
3869 int error;
3870
3871 ASSERT_VOP_ELOCKED(vp, __func__);
3872
3873 /*
3874 * Construct global filesystem path from dvp, vp and leaf
3875 * name.
3876 */
3877 VOP_UNLOCK(vp);
3878 len = pathlen;
3879 error = vn_fullpath_hardlink(vp, dvp, leaf_name, leaf_length,
3880 &rpath, &fbuf, &len);
3881
3882 if (error != 0) {
3883 vrele(vp);
3884 return (error);
3885 }
3886
3887 if (strlen(rpath) >= pathlen) {
3888 vrele(vp);
3889 error = ENAMETOOLONG;
3890 goto out;
3891 }
3892
3893 /*
3894 * Re-lookup the vnode by path to detect a possible rename.
3895 * As a side effect, the vnode is relocked.
3896 * If vnode was renamed, return ENOENT.
3897 */
3898 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path);
3899 error = namei(&nd);
3900 if (error != 0) {
3901 vrele(vp);
3902 goto out;
3903 }
3904 NDFREE_PNBUF(&nd);
3905 vp1 = nd.ni_vp;
3906 vrele(vp);
3907 if (vp1 == vp)
3908 strcpy(path, rpath);
3909 else {
3910 vput(vp1);
3911 error = ENOENT;
3912 }
3913
3914 out:
3915 free(fbuf, M_TEMP);
3916 return (error);
3917 }
3918
3919 #ifdef DDB
3920 static void
3921 db_print_vpath(struct vnode *vp)
3922 {
3923
3924 while (vp != NULL) {
3925 db_printf("%p: ", vp);
3926 if (vp == rootvnode) {
3927 db_printf("/");
3928 vp = NULL;
3929 } else {
3930 if (vp->v_vflag & VV_ROOT) {
3931 db_printf("<mount point>");
3932 vp = vp->v_mount->mnt_vnodecovered;
3933 } else {
3934 struct namecache *ncp;
3935 char *ncn;
3936 int i;
3937
3938 ncp = TAILQ_FIRST(&vp->v_cache_dst);
3939 if (ncp != NULL) {
3940 ncn = ncp->nc_name;
3941 for (i = 0; i < ncp->nc_nlen; i++)
3942 db_printf("%c", *ncn++);
3943 vp = ncp->nc_dvp;
3944 } else {
3945 vp = NULL;
3946 }
3947 }
3948 }
3949 db_printf("\n");
3950 }
3951
3952 return;
3953 }
3954
3955 DB_SHOW_COMMAND(vpath, db_show_vpath)
3956 {
3957 struct vnode *vp;
3958
3959 if (!have_addr) {
3960 db_printf("usage: show vpath <struct vnode *>\n");
3961 return;
3962 }
3963
3964 vp = (struct vnode *)addr;
3965 db_print_vpath(vp);
3966 }
3967
3968 #endif
3969
3970 static int cache_fast_lookup = 1;
3971
3972 #define CACHE_FPL_FAILED -2020
3973
3974 static int
3975 cache_vop_bad_vexec(struct vop_fplookup_vexec_args *v)
3976 {
3977 vn_printf(v->a_vp, "no proper vop_fplookup_vexec\n");
3978 panic("no proper vop_fplookup_vexec");
3979 }
3980
3981 static int
3982 cache_vop_bad_symlink(struct vop_fplookup_symlink_args *v)
3983 {
3984 vn_printf(v->a_vp, "no proper vop_fplookup_symlink\n");
3985 panic("no proper vop_fplookup_symlink");
3986 }
3987
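/*
 * Validate a vop vector at registration time: the lockless-lookup hooks
 * (vop_fplookup_vexec and vop_fplookup_symlink) must be either both provided
 * or both absent.  If both are absent, install stubs which panic if reached.
 * A vector providing only one of them is a bug and panics right away.
 */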
3988 void
3989 cache_vop_vector_register(struct vop_vector *v)
3990 {
3991 size_t ops;
3992
3993 ops = 0;
3994 if (v->vop_fplookup_vexec != NULL) {
3995 ops++;
3996 }
3997 if (v->vop_fplookup_symlink != NULL) {
3998 ops++;
3999 }
4000
4001 if (ops == 2) {
4002 return;
4003 }
4004
4005 if (ops == 0) {
4006 v->vop_fplookup_vexec = cache_vop_bad_vexec;
4007 v->vop_fplookup_symlink = cache_vop_bad_symlink;
4008 return;
4009 }
4010
4011 printf("%s: invalid vop vector %p -- either all or none fplookup vops "
4012 "need to be provided", __func__, v);
4013 if (v->vop_fplookup_vexec == NULL) {
4014 printf("%s: missing vop_fplookup_vexec\n", __func__);
4015 }
4016 if (v->vop_fplookup_symlink == NULL) {
4017 printf("%s: missing vop_fplookup_symlink\n", __func__);
4018 }
4019 panic("bad vop vector %p", v);
4020 }
4021
4022 #ifdef INVARIANTS
4023 void
4024 cache_validate_vop_vector(struct mount *mp, struct vop_vector *vops)
4025 {
4026 if (mp == NULL)
4027 return;
4028
4029 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
4030 return;
4031
4032 if (vops->vop_fplookup_vexec == NULL ||
4033 vops->vop_fplookup_vexec == cache_vop_bad_vexec)
4034 panic("bad vop_fplookup_vexec on vector %p for filesystem %s",
4035 vops, mp->mnt_vfc->vfc_name);
4036
4037 if (vops->vop_fplookup_symlink == NULL ||
4038 vops->vop_fplookup_symlink == cache_vop_bad_symlink)
4039 panic("bad vop_fplookup_symlink on vector %p for filesystem %s",
4040 vops, mp->mnt_vfc->vfc_name);
4041 }
4042 #endif
4043
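/*
 * Recompute cache_fast_lookup_enabled.  Lockless lookup stays enabled only
 * if the cache_fast_lookup sysctl knob is set and no MAC lookup/readlink
 * hooks are active.
 */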
4044 void
4045 cache_fast_lookup_enabled_recalc(void)
4046 {
4047 int lookup_flag;
4048 int mac_on;
4049
4050 #ifdef MAC
4051 mac_on = mac_vnode_check_lookup_enabled();
4052 mac_on |= mac_vnode_check_readlink_enabled();
4053 #else
4054 mac_on = 0;
4055 #endif
4056
4057 lookup_flag = atomic_load_int(&cache_fast_lookup);
4058 if (lookup_flag && !mac_on) {
4059 atomic_store_char(&cache_fast_lookup_enabled, true);
4060 } else {
4061 atomic_store_char(&cache_fast_lookup_enabled, false);
4062 }
4063 }
4064
4065 static int
4066 syscal_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS)
4067 {
4068 int error, old;
4069
4070 old = atomic_load_int(&cache_fast_lookup);
4071 error = sysctl_handle_int(oidp, arg1, arg2, req);
4072 if (error == 0 && req->newptr && old != atomic_load_int(&cache_fast_lookup))
4073 cache_fast_lookup_enabled_recalc();
4074 return (error);
4075 }
4076 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, fast_lookup, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE,
4077 &cache_fast_lookup, 0, syscal_vfs_cache_fast_lookup, "IU", "");
4078
4079 /*
4080 * Components of nameidata (or objects it can point to) which may
4081 * need restoring in case fast path lookup fails.
4082 */
4083 struct nameidata_outer {
4084 size_t ni_pathlen;
4085 int cn_flags;
4086 };
4087
4088 struct nameidata_saved {
4089 #ifdef INVARIANTS
4090 char *cn_nameptr;
4091 size_t ni_pathlen;
4092 #endif
4093 };
4094
4095 #ifdef INVARIANTS
4096 struct cache_fpl_debug {
4097 size_t ni_pathlen;
4098 };
4099 #endif
4100
4101 struct cache_fpl {
4102 struct nameidata *ndp;
4103 struct componentname *cnp;
4104 char *nulchar;
4105 struct vnode *dvp;
4106 struct vnode *tvp;
4107 seqc_t dvp_seqc;
4108 seqc_t tvp_seqc;
4109 uint32_t hash;
4110 struct nameidata_saved snd;
4111 struct nameidata_outer snd_outer;
4112 int line;
4113 enum cache_fpl_status status:8;
4114 bool in_smr;
4115 bool fsearch;
4116 struct pwd **pwd;
4117 #ifdef INVARIANTS
4118 struct cache_fpl_debug debug;
4119 #endif
4120 };
4121
4122 static bool cache_fplookup_mp_supported(struct mount *mp);
4123 static bool cache_fplookup_is_mp(struct cache_fpl *fpl);
4124 static int cache_fplookup_cross_mount(struct cache_fpl *fpl);
4125 static int cache_fplookup_partial_setup(struct cache_fpl *fpl);
4126 static int cache_fplookup_skip_slashes(struct cache_fpl *fpl);
4127 static int cache_fplookup_trailingslash(struct cache_fpl *fpl);
4128 static void cache_fpl_pathlen_dec(struct cache_fpl *fpl);
4129 static void cache_fpl_pathlen_inc(struct cache_fpl *fpl);
4130 static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n);
4131 static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n);
4132
4133 static void
4134 cache_fpl_cleanup_cnp(struct componentname *cnp)
4135 {
4136
4137 uma_zfree(namei_zone, cnp->cn_pnbuf);
4138 cnp->cn_pnbuf = NULL;
4139 cnp->cn_nameptr = NULL;
4140 }
4141
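/*
 * The remaining path is absolute (for instance after resolving an absolute
 * symlink): skip the leading slash and any consecutive duplicates and return
 * the root directory vnode to continue the walk from.
 */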
4142 static struct vnode *
4143 cache_fpl_handle_root(struct cache_fpl *fpl)
4144 {
4145 struct nameidata *ndp;
4146 struct componentname *cnp;
4147
4148 ndp = fpl->ndp;
4149 cnp = fpl->cnp;
4150
4151 MPASS(*(cnp->cn_nameptr) == '/');
4152 cnp->cn_nameptr++;
4153 cache_fpl_pathlen_dec(fpl);
4154
4155 if (__predict_false(*(cnp->cn_nameptr) == '/')) {
4156 do {
4157 cnp->cn_nameptr++;
4158 cache_fpl_pathlen_dec(fpl);
4159 } while (*(cnp->cn_nameptr) == '/');
4160 }
4161
4162 return (ndp->ni_rootdir);
4163 }
4164
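/*
 * Checkpoint/restore helpers: save enough nameidata state that it can be
 * handed back intact to the regular (locked) lookup if the lockless attempt
 * has to bail out.  The "outer" variant covers fields visible to the caller
 * (ni_pathlen and cn_flags), while the plain checkpoint tracks parsing
 * progress for INVARIANTS-only consistency checks.
 */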
4165 static void
4166 cache_fpl_checkpoint_outer(struct cache_fpl *fpl)
4167 {
4168
4169 fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen;
4170 fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags;
4171 }
4172
4173 static void
4174 cache_fpl_checkpoint(struct cache_fpl *fpl)
4175 {
4176
4177 #ifdef INVARIANTS
4178 fpl->snd.cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
4179 fpl->snd.ni_pathlen = fpl->debug.ni_pathlen;
4180 #endif
4181 }
4182
4183 static void
4184 cache_fpl_restore_partial(struct cache_fpl *fpl)
4185 {
4186
4187 fpl->ndp->ni_cnd.cn_flags = fpl->snd_outer.cn_flags;
4188 #ifdef INVARIANTS
4189 fpl->debug.ni_pathlen = fpl->snd.ni_pathlen;
4190 #endif
4191 }
4192
4193 static void
4194 cache_fpl_restore_abort(struct cache_fpl *fpl)
4195 {
4196
4197 cache_fpl_restore_partial(fpl);
4198 /*
4199 * ni_resflags is 0 on entry by API contract.
4200 */
4201 fpl->ndp->ni_resflags = 0;
4202 fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf;
4203 fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen;
4204 }
4205
4206 #ifdef INVARIANTS
4207 #define cache_fpl_smr_assert_entered(fpl) ({ \
4208 struct cache_fpl *_fpl = (fpl); \
4209 MPASS(_fpl->in_smr == true); \
4210 VFS_SMR_ASSERT_ENTERED(); \
4211 })
4212 #define cache_fpl_smr_assert_not_entered(fpl) ({ \
4213 struct cache_fpl *_fpl = (fpl); \
4214 MPASS(_fpl->in_smr == false); \
4215 VFS_SMR_ASSERT_NOT_ENTERED(); \
4216 })
4217 static void
4218 cache_fpl_assert_status(struct cache_fpl *fpl)
4219 {
4220
4221 switch (fpl->status) {
4222 case CACHE_FPL_STATUS_UNSET:
4223 __assert_unreachable();
4224 break;
4225 case CACHE_FPL_STATUS_DESTROYED:
4226 case CACHE_FPL_STATUS_ABORTED:
4227 case CACHE_FPL_STATUS_PARTIAL:
4228 case CACHE_FPL_STATUS_HANDLED:
4229 break;
4230 }
4231 }
4232 #else
4233 #define cache_fpl_smr_assert_entered(fpl) do { } while (0)
4234 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
4235 #define cache_fpl_assert_status(fpl) do { } while (0)
4236 #endif
4237
4238 #define cache_fpl_smr_enter_initial(fpl) ({ \
4239 struct cache_fpl *_fpl = (fpl); \
4240 vfs_smr_enter(); \
4241 _fpl->in_smr = true; \
4242 })
4243
4244 #define cache_fpl_smr_enter(fpl) ({ \
4245 struct cache_fpl *_fpl = (fpl); \
4246 MPASS(_fpl->in_smr == false); \
4247 vfs_smr_enter(); \
4248 _fpl->in_smr = true; \
4249 })
4250
4251 #define cache_fpl_smr_exit(fpl) ({ \
4252 struct cache_fpl *_fpl = (fpl); \
4253 MPASS(_fpl->in_smr == true); \
4254 vfs_smr_exit(); \
4255 _fpl->in_smr = false; \
4256 })
4257
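/*
 * Helpers which terminate the lockless lookup with a definitive status,
 * recording the line number for debugging: "aborted" punts everything to
 * the regular lookup, "partial" hands over a prepared parent directory so
 * regular lookup can continue from the current component, and "handled"
 * means the lookup was fully serviced here.
 */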
4258 static int
4259 cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line)
4260 {
4261
4262 if (fpl->status != CACHE_FPL_STATUS_UNSET) {
4263 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
4264 ("%s: converting to abort from %d at %d, set at %d\n",
4265 __func__, fpl->status, line, fpl->line));
4266 }
4267 cache_fpl_smr_assert_not_entered(fpl);
4268 fpl->status = CACHE_FPL_STATUS_ABORTED;
4269 fpl->line = line;
4270 return (CACHE_FPL_FAILED);
4271 }
4272
4273 #define cache_fpl_aborted_early(x) cache_fpl_aborted_early_impl((x), __LINE__)
4274
4275 static int __noinline
4276 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
4277 {
4278 struct nameidata *ndp;
4279 struct componentname *cnp;
4280
4281 ndp = fpl->ndp;
4282 cnp = fpl->cnp;
4283
4284 if (fpl->status != CACHE_FPL_STATUS_UNSET) {
4285 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
4286 ("%s: converting to abort from %d at %d, set at %d\n",
4287 __func__, fpl->status, line, fpl->line));
4288 }
4289 fpl->status = CACHE_FPL_STATUS_ABORTED;
4290 fpl->line = line;
4291 if (fpl->in_smr)
4292 cache_fpl_smr_exit(fpl);
4293 cache_fpl_restore_abort(fpl);
4294 /*
4295 * Resolving symlinks overwrites data passed by the caller.
4296 * Let namei know.
4297 */
4298 if (ndp->ni_loopcnt > 0) {
4299 fpl->status = CACHE_FPL_STATUS_DESTROYED;
4300 cache_fpl_cleanup_cnp(cnp);
4301 }
4302 return (CACHE_FPL_FAILED);
4303 }
4304
4305 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__)
4306
4307 static int __noinline
4308 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
4309 {
4310
4311 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4312 ("%s: setting to partial at %d, but already set to %d at %d\n",
4313 __func__, line, fpl->status, fpl->line));
4314 cache_fpl_smr_assert_entered(fpl);
4315 fpl->status = CACHE_FPL_STATUS_PARTIAL;
4316 fpl->line = line;
4317 return (cache_fplookup_partial_setup(fpl));
4318 }
4319
4320 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__)
4321
4322 static int
4323 cache_fpl_handled_impl(struct cache_fpl *fpl, int line)
4324 {
4325
4326 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4327 ("%s: setting to handled at %d, but already set to %d at %d\n",
4328 __func__, line, fpl->status, fpl->line));
4329 cache_fpl_smr_assert_not_entered(fpl);
4330 fpl->status = CACHE_FPL_STATUS_HANDLED;
4331 fpl->line = line;
4332 return (0);
4333 }
4334
4335 #define cache_fpl_handled(x) cache_fpl_handled_impl((x), __LINE__)
4336
4337 static int
4338 cache_fpl_handled_error_impl(struct cache_fpl *fpl, int error, int line)
4339 {
4340
4341 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4342 ("%s: setting to handled at %d, but already set to %d at %d\n",
4343 __func__, line, fpl->status, fpl->line));
4344 MPASS(error != 0);
4345 MPASS(error != CACHE_FPL_FAILED);
4346 cache_fpl_smr_assert_not_entered(fpl);
4347 fpl->status = CACHE_FPL_STATUS_HANDLED;
4348 fpl->line = line;
4349 fpl->dvp = NULL;
4350 fpl->tvp = NULL;
4351 return (error);
4352 }
4353
4354 #define cache_fpl_handled_error(x, e) cache_fpl_handled_error_impl((x), (e), __LINE__)
4355
4356 static bool
4357 cache_fpl_terminated(struct cache_fpl *fpl)
4358 {
4359
4360 return (fpl->status != CACHE_FPL_STATUS_UNSET);
4361 }
4362
4363 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
4364 (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \
4365 FAILIFEXISTS | FOLLOW | EMPTYPATH | LOCKSHARED | ISRESTARTED | WILLBEDIR | \
4366 ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK | OPENREAD | \
4367 OPENWRITE | WANTIOCTLCAPS)
4368
4369 #define CACHE_FPL_INTERNAL_CN_FLAGS \
4370 (ISDOTDOT | MAKEENTRY | ISLASTCN)
4371
4372 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
4373 "supported and internal flags overlap");
4374
4375 static bool
4376 cache_fpl_islastcn(struct nameidata *ndp)
4377 {
4378
4379 return (*ndp->ni_next == 0);
4380 }
4381
4382 static bool
4383 cache_fpl_istrailingslash(struct cache_fpl *fpl)
4384 {
4385
4386 MPASS(fpl->nulchar > fpl->cnp->cn_pnbuf);
4387 return (*(fpl->nulchar - 1) == '/');
4388 }
4389
4390 static bool
4391 cache_fpl_isdotdot(struct componentname *cnp)
4392 {
4393
4394 if (cnp->cn_namelen == 2 &&
4395 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
4396 return (true);
4397 return (false);
4398 }
4399
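/*
 * Decide whether the lookup is eligible for the lockless fast path at all.
 * Unsupported componentname flags, capability mode or tracing, auditing and
 * an explicit starting directory all force an early abort to the regular
 * lookup.
 */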
4400 static bool
4401 cache_can_fplookup(struct cache_fpl *fpl)
4402 {
4403 struct nameidata *ndp;
4404 struct componentname *cnp;
4405 struct thread *td;
4406
4407 ndp = fpl->ndp;
4408 cnp = fpl->cnp;
4409 td = curthread;
4410
4411 if (!atomic_load_char(&cache_fast_lookup_enabled)) {
4412 cache_fpl_aborted_early(fpl);
4413 return (false);
4414 }
4415 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
4416 cache_fpl_aborted_early(fpl);
4417 return (false);
4418 }
4419 if (IN_CAPABILITY_MODE(td) || CAP_TRACING(td)) {
4420 cache_fpl_aborted_early(fpl);
4421 return (false);
4422 }
4423 if (AUDITING_TD(td)) {
4424 cache_fpl_aborted_early(fpl);
4425 return (false);
4426 }
4427 if (ndp->ni_startdir != NULL) {
4428 cache_fpl_aborted_early(fpl);
4429 return (false);
4430 }
4431 return (true);
4432 }
4433
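/*
 * Resolve the starting directory for "*at"-style lookups from the passed
 * file descriptor without taking locks.  Unless the lookup has EMPTYPATH
 * with an empty pathname, the descriptor must reference a directory.
 */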
4434 static int __noinline
4435 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp)
4436 {
4437 struct nameidata *ndp;
4438 struct componentname *cnp;
4439 int error;
4440 bool fsearch;
4441
4442 ndp = fpl->ndp;
4443 cnp = fpl->cnp;
4444
4445 error = fgetvp_lookup_smr(ndp, vpp, &fsearch);
4446 if (__predict_false(error != 0)) {
4447 return (cache_fpl_aborted(fpl));
4448 }
4449 fpl->fsearch = fsearch;
4450 if ((*vpp)->v_type != VDIR) {
4451 if (!((cnp->cn_flags & EMPTYPATH) != 0 && cnp->cn_pnbuf[0] == '\0')) {
4452 cache_fpl_smr_exit(fpl);
4453 return (cache_fpl_handled_error(fpl, ENOTDIR));
4454 }
4455 }
4456 return (0);
4457 }
4458
4459 static int __noinline
4460 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
4461 uint32_t hash)
4462 {
4463 struct componentname *cnp;
4464 struct vnode *dvp;
4465
4466 cnp = fpl->cnp;
4467 dvp = fpl->dvp;
4468
4469 cache_fpl_smr_exit(fpl);
4470 if (cache_neg_promote_cond(dvp, cnp, oncp, hash))
4471 return (cache_fpl_handled_error(fpl, ENOENT));
4472 else
4473 return (cache_fpl_aborted(fpl));
4474 }
4475
4476 /*
4477 * The target vnode is not supported; prepare for the slow path to take over.
4478 */
4479 static int __noinline
4480 cache_fplookup_partial_setup(struct cache_fpl *fpl)
4481 {
4482 struct nameidata *ndp;
4483 struct componentname *cnp;
4484 enum vgetstate dvs;
4485 struct vnode *dvp;
4486 struct pwd *pwd;
4487 seqc_t dvp_seqc;
4488
4489 ndp = fpl->ndp;
4490 cnp = fpl->cnp;
4491 pwd = *(fpl->pwd);
4492 dvp = fpl->dvp;
4493 dvp_seqc = fpl->dvp_seqc;
4494
4495 if (!pwd_hold_smr(pwd)) {
4496 return (cache_fpl_aborted(fpl));
4497 }
4498
4499 /*
4500 * Note that seqc is checked before the vnode is locked, so by
4501 * the time regular lookup gets to it, it may have moved.
4502 *
4503 * Ultimately this does not affect correctness, any lookup errors
4504 * are userspace racing with itself. It is guaranteed that any
4505 * path which ultimately gets found could also have been found
4506 * by regular lookup going all the way in absence of concurrent
4507 * modifications.
4508 */
4509 dvs = vget_prep_smr(dvp);
4510 cache_fpl_smr_exit(fpl);
4511 if (__predict_false(dvs == VGET_NONE)) {
4512 pwd_drop(pwd);
4513 return (cache_fpl_aborted(fpl));
4514 }
4515
4516 vget_finish_ref(dvp, dvs);
4517 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4518 vrele(dvp);
4519 pwd_drop(pwd);
4520 return (cache_fpl_aborted(fpl));
4521 }
4522
4523 cache_fpl_restore_partial(fpl);
4524 #ifdef INVARIANTS
4525 if (cnp->cn_nameptr != fpl->snd.cn_nameptr) {
4526 panic("%s: cn_nameptr mismatch (%p != %p) full [%s]\n", __func__,
4527 cnp->cn_nameptr, fpl->snd.cn_nameptr, cnp->cn_pnbuf);
4528 }
4529 #endif
4530
4531 ndp->ni_startdir = dvp;
4532 cnp->cn_flags |= MAKEENTRY;
4533 if (cache_fpl_islastcn(ndp))
4534 cnp->cn_flags |= ISLASTCN;
4535 if (cache_fpl_isdotdot(cnp))
4536 cnp->cn_flags |= ISDOTDOT;
4537
4538 /*
4539 * Skip potential extra slashes parsing did not take care of.
4540 * cache_fplookup_skip_slashes explains the mechanism.
4541 */
4542 if (__predict_false(*(cnp->cn_nameptr) == '/')) {
4543 do {
4544 cnp->cn_nameptr++;
4545 cache_fpl_pathlen_dec(fpl);
4546 } while (*(cnp->cn_nameptr) == '/');
4547 }
4548
4549 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
4550 #ifdef INVARIANTS
4551 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
4552 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
4553 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
4554 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
4555 }
4556 #endif
4557 return (0);
4558 }
4559
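/*
 * Finish grabbing the terminal vnode: acquire a reference (and the lock if
 * LOCKLEAF was requested), then re-check the sequence counter to make sure
 * the vnode was not modified while unlocked.
 */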
4560 static int
4561 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
4562 {
4563 struct componentname *cnp;
4564 struct vnode *tvp;
4565 seqc_t tvp_seqc;
4566 int error, lkflags;
4567
4568 cnp = fpl->cnp;
4569 tvp = fpl->tvp;
4570 tvp_seqc = fpl->tvp_seqc;
4571
4572 if ((cnp->cn_flags & LOCKLEAF) != 0) {
4573 lkflags = LK_SHARED;
4574 if ((cnp->cn_flags & LOCKSHARED) == 0)
4575 lkflags = LK_EXCLUSIVE;
4576 error = vget_finish(tvp, lkflags, tvs);
4577 if (__predict_false(error != 0)) {
4578 return (cache_fpl_aborted(fpl));
4579 }
4580 } else {
4581 vget_finish_ref(tvp, tvs);
4582 }
4583
4584 if (!vn_seqc_consistent(tvp, tvp_seqc)) {
4585 if ((cnp->cn_flags & LOCKLEAF) != 0)
4586 vput(tvp);
4587 else
4588 vrele(tvp);
4589 return (cache_fpl_aborted(fpl));
4590 }
4591
4592 return (cache_fpl_handled(fpl));
4593 }
4594
4595 /*
4596 * The caller may want to modify the state of the namecache.
4597 */
4598 static int __noinline
4599 cache_fplookup_final_modifying(struct cache_fpl *fpl)
4600 {
4601 struct nameidata *ndp __diagused;
4602 struct componentname *cnp;
4603 enum vgetstate dvs;
4604 struct vnode *dvp, *tvp;
4605 struct mount *mp;
4606 seqc_t dvp_seqc;
4607 int error;
4608 bool docache;
4609
4610 ndp = fpl->ndp;
4611 cnp = fpl->cnp;
4612 dvp = fpl->dvp;
4613 dvp_seqc = fpl->dvp_seqc;
4614
4615 MPASS(*(cnp->cn_nameptr) != '/');
4616 MPASS(cache_fpl_islastcn(ndp));
4617 if ((cnp->cn_flags & LOCKPARENT) == 0)
4618 MPASS((cnp->cn_flags & WANTPARENT) != 0);
4619 MPASS((cnp->cn_flags & TRAILINGSLASH) == 0);
4620 MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE ||
4621 cnp->cn_nameiop == RENAME);
4622 MPASS((cnp->cn_flags & MAKEENTRY) == 0);
4623 MPASS((cnp->cn_flags & ISDOTDOT) == 0);
4624
4625 docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
4626 if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)
4627 docache = false;
4628
4629 /*
4630 * Regular lookup nullifies the slash, which we don't do here.
4631 * Don't take chances with filesystem routines seeing it for
4632 * the last entry.
4633 */
4634 if (cache_fpl_istrailingslash(fpl)) {
4635 return (cache_fpl_partial(fpl));
4636 }
4637
4638 mp = atomic_load_ptr(&dvp->v_mount);
4639 if (__predict_false(mp == NULL)) {
4640 return (cache_fpl_aborted(fpl));
4641 }
4642
4643 if (__predict_false(mp->mnt_flag & MNT_RDONLY)) {
4644 cache_fpl_smr_exit(fpl);
4645 /*
4646 * The original code does not check for CREATE, which
4647 * might be a bug. For now let the old lookup decide.
4648 */
4649 if (cnp->cn_nameiop == CREATE) {
4650 return (cache_fpl_aborted(fpl));
4651 }
4652 return (cache_fpl_handled_error(fpl, EROFS));
4653 }
4654
4655 if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) {
4656 cache_fpl_smr_exit(fpl);
4657 return (cache_fpl_handled_error(fpl, EEXIST));
4658 }
4659
4660 /*
4661 * Secure access to dvp; check cache_fplookup_partial_setup for
4662 * reasoning.
4663 *
4664 * XXX At least UFS requires its lookup routine to be called for
4665 * the last path component, which leads to some level of complication
4666 * and inefficiency:
4667 * - the target routine always locks the target vnode, but our caller
4668 * may not need it locked
4669 * - some of the VOP machinery asserts that the parent is locked, which
4670 * once more may be not required
4671 *
4672 * TODO: add a flag for filesystems which don't need this.
4673 */
4674 dvs = vget_prep_smr(dvp);
4675 cache_fpl_smr_exit(fpl);
4676 if (__predict_false(dvs == VGET_NONE)) {
4677 return (cache_fpl_aborted(fpl));
4678 }
4679
4680 vget_finish_ref(dvp, dvs);
4681 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4682 vrele(dvp);
4683 return (cache_fpl_aborted(fpl));
4684 }
4685
4686 error = vn_lock(dvp, LK_EXCLUSIVE);
4687 if (__predict_false(error != 0)) {
4688 vrele(dvp);
4689 return (cache_fpl_aborted(fpl));
4690 }
4691
4692 tvp = NULL;
4693 cnp->cn_flags |= ISLASTCN;
4694 if (docache)
4695 cnp->cn_flags |= MAKEENTRY;
4696 if (cache_fpl_isdotdot(cnp))
4697 cnp->cn_flags |= ISDOTDOT;
4698 cnp->cn_lkflags = LK_EXCLUSIVE;
4699 error = VOP_LOOKUP(dvp, &tvp, cnp);
4700 switch (error) {
4701 case EJUSTRETURN:
4702 case 0:
4703 break;
4704 case ENOTDIR:
4705 case ENOENT:
4706 vput(dvp);
4707 return (cache_fpl_handled_error(fpl, error));
4708 default:
4709 vput(dvp);
4710 return (cache_fpl_aborted(fpl));
4711 }
4712
4713 fpl->tvp = tvp;
4714
4715 if (tvp == NULL) {
4716 MPASS(error == EJUSTRETURN);
4717 if ((cnp->cn_flags & LOCKPARENT) == 0) {
4718 VOP_UNLOCK(dvp);
4719 }
4720 return (cache_fpl_handled(fpl));
4721 }
4722
4723 /*
4724 * There are very hairy corner cases concerning various flag combinations
4725 * and locking state. In particular here we only hold one lock instead of
4726 * two.
4727 *
4728 * Skip the complexity as it is of no significance for normal workloads.
4729 */
4730 if (__predict_false(tvp == dvp)) {
4731 vput(dvp);
4732 vrele(tvp);
4733 return (cache_fpl_aborted(fpl));
4734 }
4735
4736 /*
4737 * If they want the symlink itself we are fine, but if they want to
4738 * follow it regular lookup has to be engaged.
4739 */
4740 if (tvp->v_type == VLNK) {
4741 if ((cnp->cn_flags & FOLLOW) != 0) {
4742 vput(dvp);
4743 vput(tvp);
4744 return (cache_fpl_aborted(fpl));
4745 }
4746 }
4747
4748 /*
4749 * Since we expect this to be the terminal vnode it should almost never
4750 * be a mount point.
4751 */
4752 if (__predict_false(cache_fplookup_is_mp(fpl))) {
4753 vput(dvp);
4754 vput(tvp);
4755 return (cache_fpl_aborted(fpl));
4756 }
4757
4758 if ((cnp->cn_flags & FAILIFEXISTS) != 0) {
4759 vput(dvp);
4760 vput(tvp);
4761 return (cache_fpl_handled_error(fpl, EEXIST));
4762 }
4763
4764 if ((cnp->cn_flags & LOCKLEAF) == 0) {
4765 VOP_UNLOCK(tvp);
4766 }
4767
4768 if ((cnp->cn_flags & LOCKPARENT) == 0) {
4769 VOP_UNLOCK(dvp);
4770 }
4771
4772 return (cache_fpl_handled(fpl));
4773 }
4774
4775 static int __noinline
4776 cache_fplookup_modifying(struct cache_fpl *fpl)
4777 {
4778 struct nameidata *ndp;
4779
4780 ndp = fpl->ndp;
4781
4782 if (!cache_fpl_islastcn(ndp)) {
4783 return (cache_fpl_partial(fpl));
4784 }
4785 return (cache_fplookup_final_modifying(fpl));
4786 }
4787
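/*
 * Finish a lookup which asked for the parent vnode as well (LOCKPARENT or
 * WANTPARENT): both vnodes are referenced, the parent is locked if requested,
 * and both sequence counters are re-validated before returning.
 */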
4788 static int __noinline
4789 cache_fplookup_final_withparent(struct cache_fpl *fpl)
4790 {
4791 struct componentname *cnp;
4792 enum vgetstate dvs, tvs;
4793 struct vnode *dvp, *tvp;
4794 seqc_t dvp_seqc;
4795 int error;
4796
4797 cnp = fpl->cnp;
4798 dvp = fpl->dvp;
4799 dvp_seqc = fpl->dvp_seqc;
4800 tvp = fpl->tvp;
4801
4802 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
4803
4804 /*
4805 * This is less efficient than it could be, for the sake of simplicity.
4806 */
4807 dvs = vget_prep_smr(dvp);
4808 if (__predict_false(dvs == VGET_NONE)) {
4809 return (cache_fpl_aborted(fpl));
4810 }
4811 tvs = vget_prep_smr(tvp);
4812 if (__predict_false(tvs == VGET_NONE)) {
4813 cache_fpl_smr_exit(fpl);
4814 vget_abort(dvp, dvs);
4815 return (cache_fpl_aborted(fpl));
4816 }
4817
4818 cache_fpl_smr_exit(fpl);
4819
4820 if ((cnp->cn_flags & LOCKPARENT) != 0) {
4821 error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
4822 if (__predict_false(error != 0)) {
4823 vget_abort(tvp, tvs);
4824 return (cache_fpl_aborted(fpl));
4825 }
4826 } else {
4827 vget_finish_ref(dvp, dvs);
4828 }
4829
4830 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4831 vget_abort(tvp, tvs);
4832 if ((cnp->cn_flags & LOCKPARENT) != 0)
4833 vput(dvp);
4834 else
4835 vrele(dvp);
4836 return (cache_fpl_aborted(fpl));
4837 }
4838
4839 error = cache_fplookup_final_child(fpl, tvs);
4840 if (__predict_false(error != 0)) {
4841 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED ||
4842 fpl->status == CACHE_FPL_STATUS_DESTROYED);
4843 if ((cnp->cn_flags & LOCKPARENT) != 0)
4844 vput(dvp);
4845 else
4846 vrele(dvp);
4847 return (error);
4848 }
4849
4850 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
4851 return (0);
4852 }
4853
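/*
 * The last path component was found in the cache.  Dispatch to the
 * appropriate finishing routine depending on whether the operation modifies
 * the namespace and whether the parent vnode was requested.
 */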
4854 static int
4855 cache_fplookup_final(struct cache_fpl *fpl)
4856 {
4857 struct componentname *cnp;
4858 enum vgetstate tvs;
4859 struct vnode *dvp, *tvp;
4860 seqc_t dvp_seqc;
4861
4862 cnp = fpl->cnp;
4863 dvp = fpl->dvp;
4864 dvp_seqc = fpl->dvp_seqc;
4865 tvp = fpl->tvp;
4866
4867 MPASS(*(cnp->cn_nameptr) != '/');
4868
4869 if (cnp->cn_nameiop != LOOKUP) {
4870 return (cache_fplookup_final_modifying(fpl));
4871 }
4872
4873 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
4874 return (cache_fplookup_final_withparent(fpl));
4875
4876 tvs = vget_prep_smr(tvp);
4877 if (__predict_false(tvs == VGET_NONE)) {
4878 return (cache_fpl_partial(fpl));
4879 }
4880
4881 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4882 cache_fpl_smr_exit(fpl);
4883 vget_abort(tvp, tvs);
4884 return (cache_fpl_aborted(fpl));
4885 }
4886
4887 cache_fpl_smr_exit(fpl);
4888 return (cache_fplookup_final_child(fpl, tvs));
4889 }
4890
4891 /*
4892 * Comment from locked lookup:
4893 * Check for degenerate name (e.g. / or "") which is a way of talking about a
4894 * directory, e.g. like "/." or ".".
4895 */
4896 static int __noinline
4897 cache_fplookup_degenerate(struct cache_fpl *fpl)
4898 {
4899 struct componentname *cnp;
4900 struct vnode *dvp;
4901 enum vgetstate dvs;
4902 int error, lkflags;
4903 #ifdef INVARIANTS
4904 char *cp;
4905 #endif
4906
4907 fpl->tvp = fpl->dvp;
4908 fpl->tvp_seqc = fpl->dvp_seqc;
4909
4910 cnp = fpl->cnp;
4911 dvp = fpl->dvp;
4912
4913 #ifdef INVARIANTS
4914 for (cp = cnp->cn_pnbuf; *cp != '\0'; cp++) {
4915 KASSERT(*cp == '/',
4916 ("%s: encountered non-slash; string [%s]\n", __func__,
4917 cnp->cn_pnbuf));
4918 }
4919 #endif
4920
4921 if (__predict_false(cnp->cn_nameiop != LOOKUP)) {
4922 cache_fpl_smr_exit(fpl);
4923 return (cache_fpl_handled_error(fpl, EISDIR));
4924 }
4925
4926 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) {
4927 return (cache_fplookup_final_withparent(fpl));
4928 }
4929
4930 dvs = vget_prep_smr(dvp);
4931 cache_fpl_smr_exit(fpl);
4932 if (__predict_false(dvs == VGET_NONE)) {
4933 return (cache_fpl_aborted(fpl));
4934 }
4935
4936 if ((cnp->cn_flags & LOCKLEAF) != 0) {
4937 lkflags = LK_SHARED;
4938 if ((cnp->cn_flags & LOCKSHARED) == 0)
4939 lkflags = LK_EXCLUSIVE;
4940 error = vget_finish(dvp, lkflags, dvs);
4941 if (__predict_false(error != 0)) {
4942 return (cache_fpl_aborted(fpl));
4943 }
4944 } else {
4945 vget_finish_ref(dvp, dvs);
4946 }
4947 return (cache_fpl_handled(fpl));
4948 }
4949
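/*
 * Handle an empty pathname.  This is only legal if the caller passed
 * EMPTYPATH, in which case the starting vnode itself is returned
 * (referenced and possibly locked) and NIRES_EMPTYPATH is set.
 */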
4950 static int __noinline
4951 cache_fplookup_emptypath(struct cache_fpl *fpl)
4952 {
4953 struct nameidata *ndp;
4954 struct componentname *cnp;
4955 enum vgetstate tvs;
4956 struct vnode *tvp;
4957 int error, lkflags;
4958
4959 fpl->tvp = fpl->dvp;
4960 fpl->tvp_seqc = fpl->dvp_seqc;
4961
4962 ndp = fpl->ndp;
4963 cnp = fpl->cnp;
4964 tvp = fpl->tvp;
4965
4966 MPASS(*cnp->cn_pnbuf == '\0');
4967
4968 if (__predict_false((cnp->cn_flags & EMPTYPATH) == 0)) {
4969 cache_fpl_smr_exit(fpl);
4970 return (cache_fpl_handled_error(fpl, ENOENT));
4971 }
4972
4973 MPASS((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) == 0);
4974
4975 tvs = vget_prep_smr(tvp);
4976 cache_fpl_smr_exit(fpl);
4977 if (__predict_false(tvs == VGET_NONE)) {
4978 return (cache_fpl_aborted(fpl));
4979 }
4980
4981 if ((cnp->cn_flags & LOCKLEAF) != 0) {
4982 lkflags = LK_SHARED;
4983 if ((cnp->cn_flags & LOCKSHARED) == 0)
4984 lkflags = LK_EXCLUSIVE;
4985 error = vget_finish(tvp, lkflags, tvs);
4986 if (__predict_false(error != 0)) {
4987 return (cache_fpl_aborted(fpl));
4988 }
4989 } else {
4990 vget_finish_ref(tvp, tvs);
4991 }
4992
4993 ndp->ni_resflags |= NIRES_EMPTYPATH;
4994 return (cache_fpl_handled(fpl));
4995 }
4996
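/*
 * No namecache entry was found for the component.  Special cases (extra
 * slashes, empty or degenerate paths, trailing slashes and modifying
 * operations) are routed to their dedicated handlers.  Otherwise, if this is
 * the last component, lock the parent and fall back to VOP_LOOKUP to resolve
 * it in place; intermediate components punt to the regular lookup.
 */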
4997 static int __noinline
4998 cache_fplookup_noentry(struct cache_fpl *fpl)
4999 {
5000 struct nameidata *ndp;
5001 struct componentname *cnp;
5002 enum vgetstate dvs;
5003 struct vnode *dvp, *tvp;
5004 seqc_t dvp_seqc;
5005 int error;
5006
5007 ndp = fpl->ndp;
5008 cnp = fpl->cnp;
5009 dvp = fpl->dvp;
5010 dvp_seqc = fpl->dvp_seqc;
5011
5012 MPASS((cnp->cn_flags & MAKEENTRY) == 0);
5013 MPASS((cnp->cn_flags & ISDOTDOT) == 0);
5014 if (cnp->cn_nameiop == LOOKUP)
5015 MPASS((cnp->cn_flags & NOCACHE) == 0);
5016 MPASS(!cache_fpl_isdotdot(cnp));
5017
5018 /*
5019 * Hack: delayed name len checking.
5020 */
5021 if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
5022 cache_fpl_smr_exit(fpl);
5023 return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
5024 }
5025
5026 if (cnp->cn_nameptr[0] == '/') {
5027 return (cache_fplookup_skip_slashes(fpl));
5028 }
5029
5030 if (cnp->cn_pnbuf[0] == '\0') {
5031 return (cache_fplookup_emptypath(fpl));
5032 }
5033
5034 if (cnp->cn_nameptr[0] == '\0') {
5035 if (fpl->tvp == NULL) {
5036 return (cache_fplookup_degenerate(fpl));
5037 }
5038 return (cache_fplookup_trailingslash(fpl));
5039 }
5040
5041 if (cnp->cn_nameiop != LOOKUP) {
5042 fpl->tvp = NULL;
5043 return (cache_fplookup_modifying(fpl));
5044 }
5045
5046 /*
5047 * Only try to fill in the component if it is the last one;
5048 * otherwise not only may there be several to handle, but the
5049 * walk may be complicated.
5050 */
5051 if (!cache_fpl_islastcn(ndp)) {
5052 return (cache_fpl_partial(fpl));
5053 }
5054
5055 /*
5056 * Regular lookup nullifies the slash, which we don't do here.
5057 * Don't take chances with filesystem routines seeing it for
5058 * the last entry.
5059 */
5060 if (cache_fpl_istrailingslash(fpl)) {
5061 return (cache_fpl_partial(fpl));
5062 }
5063
5064 /*
5065 * Secure access to dvp; check cache_fplookup_partial_setup for
5066 * reasoning.
5067 */
5068 dvs = vget_prep_smr(dvp);
5069 cache_fpl_smr_exit(fpl);
5070 if (__predict_false(dvs == VGET_NONE)) {
5071 return (cache_fpl_aborted(fpl));
5072 }
5073
5074 vget_finish_ref(dvp, dvs);
5075 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
5076 vrele(dvp);
5077 return (cache_fpl_aborted(fpl));
5078 }
5079
5080 error = vn_lock(dvp, LK_SHARED);
5081 if (__predict_false(error != 0)) {
5082 vrele(dvp);
5083 return (cache_fpl_aborted(fpl));
5084 }
5085
5086 tvp = NULL;
5087 /*
5088 * TODO: provide variants which don't require locking either vnode.
5089 */
5090 cnp->cn_flags |= ISLASTCN | MAKEENTRY;
5091 cnp->cn_lkflags = LK_SHARED;
5092 if ((cnp->cn_flags & LOCKSHARED) == 0) {
5093 cnp->cn_lkflags = LK_EXCLUSIVE;
5094 }
5095 error = VOP_LOOKUP(dvp, &tvp, cnp);
5096 switch (error) {
5097 case EJUSTRETURN:
5098 case 0:
5099 break;
5100 case ENOTDIR:
5101 case ENOENT:
5102 vput(dvp);
5103 return (cache_fpl_handled_error(fpl, error));
5104 default:
5105 vput(dvp);
5106 return (cache_fpl_aborted(fpl));
5107 }
5108
5109 fpl->tvp = tvp;
5110
5111 if (tvp == NULL) {
5112 MPASS(error == EJUSTRETURN);
5113 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
5114 vput(dvp);
5115 } else if ((cnp->cn_flags & LOCKPARENT) == 0) {
5116 VOP_UNLOCK(dvp);
5117 }
5118 return (cache_fpl_handled(fpl));
5119 }
5120
5121 if (tvp->v_type == VLNK) {
5122 if ((cnp->cn_flags & FOLLOW) != 0) {
5123 vput(dvp);
5124 vput(tvp);
5125 return (cache_fpl_aborted(fpl));
5126 }
5127 }
5128
5129 if (__predict_false(cache_fplookup_is_mp(fpl))) {
5130 vput(dvp);
5131 vput(tvp);
5132 return (cache_fpl_aborted(fpl));
5133 }
5134
5135 if ((cnp->cn_flags & LOCKLEAF) == 0) {
5136 VOP_UNLOCK(tvp);
5137 }
5138
5139 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
5140 vput(dvp);
5141 } else if ((cnp->cn_flags & LOCKPARENT) == 0) {
5142 VOP_UNLOCK(dvp);
5143 }
5144 return (cache_fpl_handled(fpl));
5145 }
5146
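/*
 * Handle the "." component: stay at the current directory, deferring
 * sequence counter validation to later stages, and cross into a mounted
 * filesystem if one is present.
 */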
5147 static int __noinline
5148 cache_fplookup_dot(struct cache_fpl *fpl)
5149 {
5150 int error;
5151
5152 MPASS(!seqc_in_modify(fpl->dvp_seqc));
5153
5154 if (__predict_false(fpl->dvp->v_type != VDIR)) {
5155 cache_fpl_smr_exit(fpl);
5156 return (cache_fpl_handled_error(fpl, ENOTDIR));
5157 }
5158
5159 /*
5160 * Just re-assign the value. seqc will be checked later for the first
5161 * non-dot path component in line and/or before deciding to return the
5162 * vnode.
5163 */
5164 fpl->tvp = fpl->dvp;
5165 fpl->tvp_seqc = fpl->dvp_seqc;
5166
5167 SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp);
5168
5169 error = 0;
5170 if (cache_fplookup_is_mp(fpl)) {
5171 error = cache_fplookup_cross_mount(fpl);
5172 }
5173 return (error);
5174 }
5175
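/*
 * Handle the ".." component: stop at the root of the current jail/chroot,
 * bail to the slow path when a mount point would have to be walked back
 * over, and otherwise use the cached parent entry hanging off v_cache_dd.
 */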
5176 static int __noinline
5177 cache_fplookup_dotdot(struct cache_fpl *fpl)
5178 {
5179 struct nameidata *ndp;
5180 struct componentname *cnp;
5181 struct namecache *ncp;
5182 struct vnode *dvp;
5183 struct prison *pr;
5184 u_char nc_flag;
5185
5186 ndp = fpl->ndp;
5187 cnp = fpl->cnp;
5188 dvp = fpl->dvp;
5189
5190 MPASS(cache_fpl_isdotdot(cnp));
5191
5192 /*
5193 * XXX this is racy the same way regular lookup is
5194 */
5195 for (pr = cnp->cn_cred->cr_prison; pr != NULL;
5196 pr = pr->pr_parent)
5197 if (dvp == pr->pr_root)
5198 break;
5199
5200 if (dvp == ndp->ni_rootdir ||
5201 dvp == ndp->ni_topdir ||
5202 dvp == rootvnode ||
5203 pr != NULL) {
5204 fpl->tvp = dvp;
5205 fpl->tvp_seqc = vn_seqc_read_any(dvp);
5206 if (seqc_in_modify(fpl->tvp_seqc)) {
5207 return (cache_fpl_aborted(fpl));
5208 }
5209 return (0);
5210 }
5211
5212 if ((dvp->v_vflag & VV_ROOT) != 0) {
5213 /*
5214 * TODO
5215 * The opposite of climb mount is needed here.
5216 */
5217 return (cache_fpl_partial(fpl));
5218 }
5219
5220 if (__predict_false(dvp->v_type != VDIR)) {
5221 cache_fpl_smr_exit(fpl);
5222 return (cache_fpl_handled_error(fpl, ENOTDIR));
5223 }
5224
5225 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
5226 if (ncp == NULL) {
5227 return (cache_fpl_aborted(fpl));
5228 }
5229
5230 nc_flag = atomic_load_char(&ncp->nc_flag);
5231 if ((nc_flag & NCF_ISDOTDOT) != 0) {
5232 if ((nc_flag & NCF_NEGATIVE) != 0)
5233 return (cache_fpl_aborted(fpl));
5234 fpl->tvp = ncp->nc_vp;
5235 } else {
5236 fpl->tvp = ncp->nc_dvp;
5237 }
5238
5239 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
5240 if (seqc_in_modify(fpl->tvp_seqc)) {
5241 return (cache_fpl_partial(fpl));
5242 }
5243
5244 /*
5245 * Acquire fence provided by vn_seqc_read_any above.
5246 */
5247 if (__predict_false(atomic_load_ptr(&dvp->v_cache_dd) != ncp)) {
5248 return (cache_fpl_aborted(fpl));
5249 }
5250
5251 if (!cache_ncp_canuse(ncp)) {
5252 return (cache_fpl_aborted(fpl));
5253 }
5254
5255 return (0);
5256 }
5257
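/*
 * A negative entry was found.  For plain lookups this translates to ENOENT
 * (possibly promoting the entry), while operations which create entries
 * have to replace it and are handed to the modifying path.
 */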
5258 static int __noinline
5259 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash)
5260 {
5261 u_char nc_flag __diagused;
5262 bool neg_promote;
5263
5264 #ifdef INVARIANTS
5265 nc_flag = atomic_load_char(&ncp->nc_flag);
5266 MPASS((nc_flag & NCF_NEGATIVE) != 0);
5267 #endif
5268 /*
5269 * If they want to create an entry we need to replace this one.
5270 */
5271 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
5272 fpl->tvp = NULL;
5273 return (cache_fplookup_modifying(fpl));
5274 }
5275 neg_promote = cache_neg_hit_prep(ncp);
5276 if (!cache_fpl_neg_ncp_canuse(ncp)) {
5277 cache_neg_hit_abort(ncp);
5278 return (cache_fpl_partial(fpl));
5279 }
5280 if (neg_promote) {
5281 return (cache_fplookup_negative_promote(fpl, ncp, hash));
5282 }
5283 cache_neg_hit_finish(ncp);
5284 cache_fpl_smr_exit(fpl);
5285 return (cache_fpl_handled_error(fpl, ENOENT));
5286 }
5287
5288 /*
5289 * Resolve a symlink. Called by filesystem-specific routines.
5290 *
5291 * Code flow is:
5292 * ... -> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve
5293 */
5294 int
5295 cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len)
5296 {
5297 struct nameidata *ndp;
5298 struct componentname *cnp;
5299 size_t adjust;
5300
5301 ndp = fpl->ndp;
5302 cnp = fpl->cnp;
5303
5304 if (__predict_false(len == 0)) {
5305 return (ENOENT);
5306 }
5307
5308 if (__predict_false(len > MAXPATHLEN - 2)) {
5309 if (cache_fpl_istrailingslash(fpl)) {
5310 return (EAGAIN);
5311 }
5312 }
5313
5314 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1;
5315 #ifdef INVARIANTS
5316 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
5317 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
5318 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
5319 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
5320 }
5321 #endif
5322
5323 if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) {
5324 return (ENAMETOOLONG);
5325 }
5326
5327 if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) {
5328 return (ELOOP);
5329 }
5330
5331 adjust = len;
5332 if (ndp->ni_pathlen > 1) {
5333 bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen);
5334 } else {
5335 if (cache_fpl_istrailingslash(fpl)) {
5336 adjust = len + 1;
5337 cnp->cn_pnbuf[len] = '/';
5338 cnp->cn_pnbuf[len + 1] = '\0';
5339 } else {
5340 cnp->cn_pnbuf[len] = '\0';
5341 }
5342 }
5343 bcopy(string, cnp->cn_pnbuf, len);
5344
5345 ndp->ni_pathlen += adjust;
5346 cache_fpl_pathlen_add(fpl, adjust);
5347 cnp->cn_nameptr = cnp->cn_pnbuf;
5348 fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
5349 fpl->tvp = NULL;
5350 return (0);
5351 }
5352
5353 static int __noinline
5354 cache_fplookup_symlink(struct cache_fpl *fpl)
5355 {
5356 struct mount *mp;
5357 struct nameidata *ndp;
5358 struct componentname *cnp;
5359 struct vnode *dvp, *tvp;
5360 struct pwd *pwd;
5361 int error;
5362
5363 ndp = fpl->ndp;
5364 cnp = fpl->cnp;
5365 dvp = fpl->dvp;
5366 tvp = fpl->tvp;
5367 pwd = *(fpl->pwd);
5368
5369 if (cache_fpl_islastcn(ndp)) {
5370 if ((cnp->cn_flags & FOLLOW) == 0) {
5371 return (cache_fplookup_final(fpl));
5372 }
5373 }
5374
5375 mp = atomic_load_ptr(&dvp->v_mount);
5376 if (__predict_false(mp == NULL)) {
5377 return (cache_fpl_aborted(fpl));
5378 }
5379
5380 /*
5381 * Note this check races against setting the flag just like regular
5382 * lookup.
5383 */
5384 if (__predict_false((mp->mnt_flag & MNT_NOSYMFOLLOW) != 0)) {
5385 cache_fpl_smr_exit(fpl);
5386 return (cache_fpl_handled_error(fpl, EACCES));
5387 }
5388
5389 error = VOP_FPLOOKUP_SYMLINK(tvp, fpl);
5390 if (__predict_false(error != 0)) {
5391 switch (error) {
5392 case EAGAIN:
5393 return (cache_fpl_partial(fpl));
5394 case ENOENT:
5395 case ENAMETOOLONG:
5396 case ELOOP:
5397 cache_fpl_smr_exit(fpl);
5398 return (cache_fpl_handled_error(fpl, error));
5399 default:
5400 return (cache_fpl_aborted(fpl));
5401 }
5402 }
5403
5404 if (*(cnp->cn_nameptr) == '/') {
5405 fpl->dvp = cache_fpl_handle_root(fpl);
5406 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
5407 if (seqc_in_modify(fpl->dvp_seqc)) {
5408 return (cache_fpl_aborted(fpl));
5409 }
5410 /*
5411 * The main loop assumes that ->dvp points to a vnode belonging
5412 * to a filesystem which can do lockless lookup, but the absolute
5413 * symlink may wander off to one which does not.
5414 */
5415 mp = atomic_load_ptr(&fpl->dvp->v_mount);
5416 if (__predict_false(mp == NULL)) {
5417 return (cache_fpl_aborted(fpl));
5418 }
5419 if (!cache_fplookup_mp_supported(mp)) {
5420 cache_fpl_checkpoint(fpl);
5421 return (cache_fpl_partial(fpl));
5422 }
5423 if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir)) {
5424 return (cache_fpl_aborted(fpl));
5425 }
5426 }
5427 return (0);
5428 }
5429
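/*
 * Look up the current path component in the hash table.  "." and ".." get
 * dedicated handlers; a miss, a negative hit or an unstable entry falls back
 * to slower paths, and a positive hit loads the target vnode along with its
 * sequence counter for later validation.
 */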
5430 static int
5431 cache_fplookup_next(struct cache_fpl *fpl)
5432 {
5433 struct componentname *cnp;
5434 struct namecache *ncp;
5435 struct vnode *dvp, *tvp;
5436 u_char nc_flag;
5437 uint32_t hash;
5438 int error;
5439
5440 cnp = fpl->cnp;
5441 dvp = fpl->dvp;
5442 hash = fpl->hash;
5443
5444 if (__predict_false(cnp->cn_nameptr[0] == '.')) {
5445 if (cnp->cn_namelen == 1) {
5446 return (cache_fplookup_dot(fpl));
5447 }
5448 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
5449 return (cache_fplookup_dotdot(fpl));
5450 }
5451 }
5452
5453 MPASS(!cache_fpl_isdotdot(cnp));
5454
5455 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
5456 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
5457 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
5458 break;
5459 }
5460
5461 if (__predict_false(ncp == NULL)) {
5462 return (cache_fplookup_noentry(fpl));
5463 }
5464
5465 tvp = atomic_load_ptr(&ncp->nc_vp);
5466 nc_flag = atomic_load_char(&ncp->nc_flag);
5467 if ((nc_flag & NCF_NEGATIVE) != 0) {
5468 return (cache_fplookup_neg(fpl, ncp, hash));
5469 }
5470
5471 if (!cache_ncp_canuse(ncp)) {
5472 return (cache_fpl_partial(fpl));
5473 }
5474
5475 fpl->tvp = tvp;
5476 fpl->tvp_seqc = vn_seqc_read_any(tvp);
5477 if (seqc_in_modify(fpl->tvp_seqc)) {
5478 return (cache_fpl_partial(fpl));
5479 }
5480
5481 counter_u64_add(numposhits, 1);
5482 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
5483
5484 error = 0;
5485 if (cache_fplookup_is_mp(fpl)) {
5486 error = cache_fplookup_cross_mount(fpl);
5487 }
5488 return (error);
5489 }
5490
5491 static bool
5492 cache_fplookup_mp_supported(struct mount *mp)
5493 {
5494
5495 MPASS(mp != NULL);
5496 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
5497 return (false);
5498 return (true);
5499 }
5500
5501 /*
5502 * Walk up the mount stack (if any).
5503 *
5504 * Correctness is provided in the following ways:
5505 * - all vnodes are protected from freeing with SMR
5506 * - struct mount objects are type stable making them always safe to access
5507 * - stability of the particular mount is provided by busying it
5508 * - relationship between the vnode which is mounted on and the mount is
5509 * verified with the vnode sequence counter after busying
5510 * - association between root vnode of the mount and the mount is protected
5511 * by busy
5512 *
5513 * From that point on we can read the sequence counter of the root vnode
5514 * and get the next mount on the stack (if any) using the same protection.
5515 *
5516 * By the end of successful walk we are guaranteed the reached state was
5517 * indeed present at least at some point which matches the regular lookup.
5518 */
5519 static int __noinline
5520 cache_fplookup_climb_mount(struct cache_fpl *fpl)
5521 {
5522 struct mount *mp, *prev_mp;
5523 struct mount_pcpu *mpcpu, *prev_mpcpu;
5524 struct vnode *vp;
5525 seqc_t vp_seqc;
5526
5527 vp = fpl->tvp;
5528 vp_seqc = fpl->tvp_seqc;
5529
5530 VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp);
5531 mp = atomic_load_ptr(&vp->v_mountedhere);
5532 if (__predict_false(mp == NULL)) {
5533 return (0);
5534 }
5535
5536 prev_mp = NULL;
5537 for (;;) {
5538 if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
5539 if (prev_mp != NULL)
5540 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5541 return (cache_fpl_partial(fpl));
5542 }
5543 if (prev_mp != NULL)
5544 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5545 if (!vn_seqc_consistent(vp, vp_seqc)) {
5546 vfs_op_thread_exit_crit(mp, mpcpu);
5547 return (cache_fpl_partial(fpl));
5548 }
5549 if (!cache_fplookup_mp_supported(mp)) {
5550 vfs_op_thread_exit_crit(mp, mpcpu);
5551 return (cache_fpl_partial(fpl));
5552 }
5553 vp = atomic_load_ptr(&mp->mnt_rootvnode);
5554 if (vp == NULL) {
5555 vfs_op_thread_exit_crit(mp, mpcpu);
5556 return (cache_fpl_partial(fpl));
5557 }
5558 vp_seqc = vn_seqc_read_any(vp);
5559 if (seqc_in_modify(vp_seqc)) {
5560 vfs_op_thread_exit_crit(mp, mpcpu);
5561 return (cache_fpl_partial(fpl));
5562 }
5563 prev_mp = mp;
5564 prev_mpcpu = mpcpu;
5565 mp = atomic_load_ptr(&vp->v_mountedhere);
5566 if (mp == NULL)
5567 break;
5568 }
5569
5570 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5571 fpl->tvp = vp;
5572 fpl->tvp_seqc = vp_seqc;
5573 return (0);
5574 }
5575
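/*
 * Cross a single mount point: find the mount hanging off the vnode, busy it
 * for stability and switch to its root vnode.  Stacked mounts are rare and
 * handled by restarting in cache_fplookup_climb_mount.
 */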
5576 static int __noinline
5577 cache_fplookup_cross_mount(struct cache_fpl *fpl)
5578 {
5579 struct mount *mp;
5580 struct mount_pcpu *mpcpu;
5581 struct vnode *vp;
5582 seqc_t vp_seqc;
5583
5584 vp = fpl->tvp;
5585 vp_seqc = fpl->tvp_seqc;
5586
5587 VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp);
5588 mp = atomic_load_ptr(&vp->v_mountedhere);
5589 if (__predict_false(mp == NULL)) {
5590 return (0);
5591 }
5592
5593 if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
5594 return (cache_fpl_partial(fpl));
5595 }
5596 if (!vn_seqc_consistent(vp, vp_seqc)) {
5597 vfs_op_thread_exit_crit(mp, mpcpu);
5598 return (cache_fpl_partial(fpl));
5599 }
5600 if (!cache_fplookup_mp_supported(mp)) {
5601 vfs_op_thread_exit_crit(mp, mpcpu);
5602 return (cache_fpl_partial(fpl));
5603 }
5604 vp = atomic_load_ptr(&mp->mnt_rootvnode);
5605 if (__predict_false(vp == NULL)) {
5606 vfs_op_thread_exit_crit(mp, mpcpu);
5607 return (cache_fpl_partial(fpl));
5608 }
5609 vp_seqc = vn_seqc_read_any(vp);
5610 vfs_op_thread_exit_crit(mp, mpcpu);
5611 if (seqc_in_modify(vp_seqc)) {
5612 return (cache_fpl_partial(fpl));
5613 }
5614 mp = atomic_load_ptr(&vp->v_mountedhere);
5615 if (__predict_false(mp != NULL)) {
5616 /*
5617 * There are possibly more mount points on top.
5618 		 * Normally this does not happen, so for simplicity just start
5619 * over.
5620 */
5621 return (cache_fplookup_climb_mount(fpl));
5622 }
5623
5624 fpl->tvp = vp;
5625 fpl->tvp_seqc = vp_seqc;
5626 return (0);
5627 }
5628
5629 /*
5630 * Check if a vnode is mounted on.
5631 */
5632 static bool
5633 cache_fplookup_is_mp(struct cache_fpl *fpl)
5634 {
5635 struct vnode *vp;
5636
5637 vp = fpl->tvp;
5638 return ((vn_irflag_read(vp) & VIRF_MOUNTPOINT) != 0);
5639 }
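/*
 * Illustration only: the expected way to use this check is to pair it with
 * the mount-crossing helpers above, roughly:
 *
 *	if (__predict_false(cache_fplookup_is_mp(fpl)))
 *		return (cache_fplookup_cross_mount(fpl));
 *
 * so that the cheap flag test gates the more expensive crossing attempt.
 */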
5640
5641 /*
5642 * Parse the path.
5643 *
5644  * The code was originally copy-pasted from regular lookup and, despite
5645  * clean-ups, still leaves performance on the table. Any modifications here
5646  * must take into account that in case of fallback the resulting
5647  * nameidata state has to be compatible with the original.
5648 */
5649
5650 /*
5651 * Debug ni_pathlen tracking.
5652 */
5653 #ifdef INVARIANTS
5654 static void
5655 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
5656 {
5657
5658 fpl->debug.ni_pathlen += n;
5659 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
5660 ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen));
5661 }
5662
5663 static void
5664 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
5665 {
5666
5667 fpl->debug.ni_pathlen -= n;
5668 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
5669 ("%s: pathlen underflow to %zd\n", __func__, fpl->debug.ni_pathlen));
5670 }
5671
5672 static void
5673 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
5674 {
5675
5676 cache_fpl_pathlen_add(fpl, 1);
5677 }
5678
5679 static void
5680 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
5681 {
5682
5683 cache_fpl_pathlen_sub(fpl, 1);
5684 }
5685 #else
5686 static void
5687 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
5688 {
5689 }
5690
5691 static void
5692 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
5693 {
5694 }
5695
5696 static void
5697 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
5698 {
5699 }
5700
5701 static void
5702 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
5703 {
5704 }
5705 #endif
5706
5707 static void
5708 cache_fplookup_parse(struct cache_fpl *fpl)
5709 {
5710 struct nameidata *ndp;
5711 struct componentname *cnp;
5712 struct vnode *dvp;
5713 char *cp;
5714 uint32_t hash;
5715
5716 ndp = fpl->ndp;
5717 cnp = fpl->cnp;
5718 dvp = fpl->dvp;
5719
5720 /*
5721 	 * Find the end of this path component; it is either / or nul.
5722 	 *
5723 	 * Store / as a temporary sentinel so that we only have one character
5724 	 * to test for. Pathnames tend to be short, so this should not result
5725 	 * in cache misses.
5726 *
5727 * TODO: fix this to be word-sized.
5728 */
5729 MPASS(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] >= cnp->cn_pnbuf);
5730 KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] == fpl->nulchar,
5731 ("%s: mismatch between pathlen (%zu) and nulchar (%p != %p), string [%s]\n",
5732 __func__, fpl->debug.ni_pathlen, &cnp->cn_nameptr[fpl->debug.ni_pathlen - 1],
5733 fpl->nulchar, cnp->cn_pnbuf));
5734 KASSERT(*fpl->nulchar == '\0',
5735 ("%s: expected nul at %p; string [%s]\n", __func__, fpl->nulchar,
5736 cnp->cn_pnbuf));
5737 hash = cache_get_hash_iter_start(dvp);
5738 *fpl->nulchar = '/';
5739 for (cp = cnp->cn_nameptr; *cp != '/'; cp++) {
5740 KASSERT(*cp != '\0',
5741 ("%s: encountered unexpected nul; string [%s]\n", __func__,
5742 cnp->cn_nameptr));
5743 hash = cache_get_hash_iter(*cp, hash);
5744 continue;
5745 }
5746 *fpl->nulchar = '\0';
5747 fpl->hash = cache_get_hash_iter_finish(hash);
5748
5749 cnp->cn_namelen = cp - cnp->cn_nameptr;
5750 cache_fpl_pathlen_sub(fpl, cnp->cn_namelen);
5751
5752 #ifdef INVARIANTS
5753 /*
5754 * cache_get_hash only accepts lengths up to NAME_MAX. This is fine since
5755 * we are going to fail this lookup with ENAMETOOLONG (see below).
5756 */
5757 if (cnp->cn_namelen <= NAME_MAX) {
5758 if (fpl->hash != cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp)) {
5759 panic("%s: mismatched hash for [%s] len %ld", __func__,
5760 cnp->cn_nameptr, cnp->cn_namelen);
5761 }
5762 }
5763 #endif
5764
5765 /*
5766 * Hack: we have to check if the found path component's length exceeds
5767 	 * NAME_MAX. However, the condition is very rarely true and the check can
5768 * be elided in the common case -- if an entry was found in the cache,
5769 * then it could not have been too long to begin with.
5770 */
5771 ndp->ni_next = cp;
5772 }
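/*
 * Illustration only: the iterative hashing in cache_fplookup_parse is meant
 * to be equivalent to hashing the complete component in one go, i.e. roughly:
 *
 *	hash = cache_get_hash_iter_start(dvp);
 *	for (i = 0; i < cnp->cn_namelen; i++)
 *		hash = cache_get_hash_iter(cnp->cn_nameptr[i], hash);
 *	hash = cache_get_hash_iter_finish(hash);
 *	MPASS(hash == cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp));
 *
 * which is what the INVARIANTS block above cross-checks for names not
 * exceeding NAME_MAX.
 */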
5773
5774 static void
5775 cache_fplookup_parse_advance(struct cache_fpl *fpl)
5776 {
5777 struct nameidata *ndp;
5778 struct componentname *cnp;
5779
5780 ndp = fpl->ndp;
5781 cnp = fpl->cnp;
5782
5783 cnp->cn_nameptr = ndp->ni_next;
5784 KASSERT(*(cnp->cn_nameptr) == '/',
5785 ("%s: should have seen slash at %p ; buf %p [%s]\n", __func__,
5786 cnp->cn_nameptr, cnp->cn_pnbuf, cnp->cn_pnbuf));
5787 cnp->cn_nameptr++;
5788 cache_fpl_pathlen_dec(fpl);
5789 }
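/*
 * Illustration only: for a path like "foo/bar", cache_fplookup_parse leaves
 * cn_nameptr pointing at 'f' with cn_namelen == 3 and ni_next pointing at the
 * separating '/'. cache_fplookup_parse_advance then moves cn_nameptr just past
 * that slash (to 'b') and decrements the debug pathlen, setting the stage for
 * parsing "bar".
 */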
5790
5791 /*
5792 * Skip spurious slashes in a pathname (e.g., "foo///bar") and retry.
5793 *
5794  * Lockless lookup tries to elide checking for spurious slashes; should they
5795  * be present, it is guaranteed to fail to find an entry. In this case the
5796  * caller must check if the name starts with a slash and call this routine,
5797  * which fast-forwards across the spurious slashes and sets the state up for
5798  * retry.
5799 */
5800 static int __noinline
5801 cache_fplookup_skip_slashes(struct cache_fpl *fpl)
5802 {
5803 struct nameidata *ndp;
5804 struct componentname *cnp;
5805
5806 ndp = fpl->ndp;
5807 cnp = fpl->cnp;
5808
5809 MPASS(*(cnp->cn_nameptr) == '/');
5810 do {
5811 cnp->cn_nameptr++;
5812 cache_fpl_pathlen_dec(fpl);
5813 } while (*(cnp->cn_nameptr) == '/');
5814
5815 /*
5816 * Go back to one slash so that cache_fplookup_parse_advance has
5817 * something to skip.
5818 */
5819 cnp->cn_nameptr--;
5820 cache_fpl_pathlen_inc(fpl);
5821
5822 /*
5823 * cache_fplookup_parse_advance starts from ndp->ni_next
5824 */
5825 ndp->ni_next = cnp->cn_nameptr;
5826
5827 /*
5828 * See cache_fplookup_dot.
5829 */
5830 fpl->tvp = fpl->dvp;
5831 fpl->tvp_seqc = fpl->dvp_seqc;
5832
5833 return (0);
5834 }
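/*
 * Illustration only: for "foo///bar" this routine is entered with cn_nameptr
 * at the second slash. The loop above walks forward to 'b', backs up to the
 * last spurious slash and records that position in ni_next, so that
 * cache_fplookup_parse_advance skips it and the next iteration parses "bar"
 * against the unchanged directory (tvp = dvp, as with a "." component).
 */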
5835
5836 /*
5837 * Handle trailing slashes (e.g., "foo/").
5838 *
5839 * If a trailing slash is found the terminal vnode must be a directory.
5840  * Regular lookup shortens the path by nullifying the first trailing slash and
5841 * sets the TRAILINGSLASH flag to denote this took place. There are several
5842 * checks on it performed later.
5843 *
5844 * Similarly to spurious slashes, lockless lookup handles this in a speculative
5845 * manner relying on an invariant that a non-directory vnode will get a miss.
5846 * In this case cn_nameptr[0] == '\0' and cn_namelen == 0.
5847 *
5848 * Thus for a path like "foo/bar/" the code unwinds the state back to "bar/"
5849 * and denotes this is the last path component, which avoids looping back.
5850 *
5851  * Only plain lookups are supported for now, to limit the corner cases that need handling.
5852 */
5853 static int __noinline
5854 cache_fplookup_trailingslash(struct cache_fpl *fpl)
5855 {
5856 #ifdef INVARIANTS
5857 size_t ni_pathlen;
5858 #endif
5859 struct nameidata *ndp;
5860 struct componentname *cnp;
5861 struct namecache *ncp;
5862 struct vnode *tvp;
5863 char *cn_nameptr_orig, *cn_nameptr_slash;
5864 seqc_t tvp_seqc;
5865 u_char nc_flag;
5866
5867 ndp = fpl->ndp;
5868 cnp = fpl->cnp;
5869 tvp = fpl->tvp;
5870 tvp_seqc = fpl->tvp_seqc;
5871
5872 MPASS(fpl->dvp == fpl->tvp);
5873 KASSERT(cache_fpl_istrailingslash(fpl),
5874 ("%s: expected trailing slash at %p; string [%s]\n", __func__, fpl->nulchar - 1,
5875 cnp->cn_pnbuf));
5876 KASSERT(cnp->cn_nameptr[0] == '\0',
5877 ("%s: expected nul char at %p; string [%s]\n", __func__, &cnp->cn_nameptr[0],
5878 cnp->cn_pnbuf));
5879 KASSERT(cnp->cn_namelen == 0,
5880 ("%s: namelen 0 but got %ld; string [%s]\n", __func__, cnp->cn_namelen,
5881 cnp->cn_pnbuf));
5882 MPASS(cnp->cn_nameptr > cnp->cn_pnbuf);
5883
5884 if (cnp->cn_nameiop != LOOKUP) {
5885 return (cache_fpl_aborted(fpl));
5886 }
5887
5888 if (__predict_false(tvp->v_type != VDIR)) {
5889 if (!vn_seqc_consistent(tvp, tvp_seqc)) {
5890 return (cache_fpl_aborted(fpl));
5891 }
5892 cache_fpl_smr_exit(fpl);
5893 return (cache_fpl_handled_error(fpl, ENOTDIR));
5894 }
5895
5896 /*
5897 * Denote the last component.
5898 */
5899 ndp->ni_next = &cnp->cn_nameptr[0];
5900 MPASS(cache_fpl_islastcn(ndp));
5901
5902 /*
5903 * Unwind trailing slashes.
5904 */
5905 cn_nameptr_orig = cnp->cn_nameptr;
5906 while (cnp->cn_nameptr >= cnp->cn_pnbuf) {
5907 cnp->cn_nameptr--;
5908 if (cnp->cn_nameptr[0] != '/') {
5909 break;
5910 }
5911 }
5912
5913 /*
5914 * Unwind to the beginning of the path component.
5915 *
5916 * Note the path may or may not have started with a slash.
5917 */
5918 cn_nameptr_slash = cnp->cn_nameptr;
5919 while (cnp->cn_nameptr > cnp->cn_pnbuf) {
5920 cnp->cn_nameptr--;
5921 if (cnp->cn_nameptr[0] == '/') {
5922 break;
5923 }
5924 }
5925 if (cnp->cn_nameptr[0] == '/') {
5926 cnp->cn_nameptr++;
5927 }
5928
5929 cnp->cn_namelen = cn_nameptr_slash - cnp->cn_nameptr + 1;
5930 cache_fpl_pathlen_add(fpl, cn_nameptr_orig - cnp->cn_nameptr);
5931 cache_fpl_checkpoint(fpl);
5932
5933 #ifdef INVARIANTS
5934 ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
5935 if (ni_pathlen != fpl->debug.ni_pathlen) {
5936 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
5937 __func__, ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
5938 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
5939 }
5940 #endif
5941
5942 /*
5943 * If this was a "./" lookup the parent directory is already correct.
5944 */
5945 if (cnp->cn_nameptr[0] == '.' && cnp->cn_namelen == 1) {
5946 return (0);
5947 }
5948
5949 /*
5950 * Otherwise we need to look it up.
5951 */
5952 tvp = fpl->tvp;
5953 ncp = atomic_load_consume_ptr(&tvp->v_cache_dd);
5954 if (__predict_false(ncp == NULL)) {
5955 return (cache_fpl_aborted(fpl));
5956 }
5957 nc_flag = atomic_load_char(&ncp->nc_flag);
5958 if ((nc_flag & NCF_ISDOTDOT) != 0) {
5959 return (cache_fpl_aborted(fpl));
5960 }
5961 fpl->dvp = ncp->nc_dvp;
5962 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
5963 if (seqc_in_modify(fpl->dvp_seqc)) {
5964 return (cache_fpl_aborted(fpl));
5965 }
5966 return (0);
5967 }
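/*
 * Illustration only: for "foo/bar/" this routine is entered with cn_nameptr at
 * the terminating nul, cn_namelen == 0 and dvp == tvp == the vnode for "bar".
 * The unwinding above recreates the component as "bar" (namelen 3), marks it
 * as the last one and, since it is not ".", recovers the parent directory
 * ("foo") from the ->v_cache_dd entry, leaving the state as if regular lookup
 * had dropped the trailing slash.
 */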
5968
5969 /*
5970 * See the API contract for VOP_FPLOOKUP_VEXEC.
5971 */
5972 static int __noinline
5973 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
5974 {
5975 struct componentname *cnp;
5976 struct vnode *dvp;
5977 seqc_t dvp_seqc;
5978
5979 cnp = fpl->cnp;
5980 dvp = fpl->dvp;
5981 dvp_seqc = fpl->dvp_seqc;
5982
5983 /*
5984 * Hack: delayed empty path checking.
5985 */
5986 if (cnp->cn_pnbuf[0] == '\0') {
5987 return (cache_fplookup_emptypath(fpl));
5988 }
5989
5990 /*
5991 * TODO: Due to ignoring trailing slashes lookup will perform a
5992 * permission check on the last dir when it should not be doing it. It
5993 * may fail, but said failure should be ignored. It is possible to fix
5994 * it up fully without resorting to regular lookup, but for now just
5995 * abort.
5996 */
5997 if (cache_fpl_istrailingslash(fpl)) {
5998 return (cache_fpl_aborted(fpl));
5999 }
6000
6001 /*
6002 * Hack: delayed degenerate path checking.
6003 */
6004 if (cnp->cn_nameptr[0] == '\0' && fpl->tvp == NULL) {
6005 return (cache_fplookup_degenerate(fpl));
6006 }
6007
6008 /*
6009 * Hack: delayed name len checking.
6010 */
6011 if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
6012 cache_fpl_smr_exit(fpl);
6013 return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
6014 }
6015
6016 /*
6017 * Hack: they may be looking up foo/bar, where foo is not a directory.
6018 * In such a case we need to return ENOTDIR, but we may happen to get
6019 * here with a different error.
6020 */
6021 if (dvp->v_type != VDIR) {
6022 error = ENOTDIR;
6023 }
6024
6025 /*
6026 * Hack: handle O_SEARCH.
6027 *
6028 * Open Group Base Specifications Issue 7, 2018 edition states:
6029 * <quote>
6030 * If the access mode of the open file description associated with the
6031 * file descriptor is not O_SEARCH, the function shall check whether
6032 * directory searches are permitted using the current permissions of
6033 * the directory underlying the file descriptor. If the access mode is
6034 * O_SEARCH, the function shall not perform the check.
6035 * </quote>
6036 *
6037 * Regular lookup tests for the NOEXECCHECK flag for every path
6038 * component to decide whether to do the permission check. However,
6039 * since most lookups never have the flag (and when they do it is only
6040 * present for the first path component), lockless lookup only acts on
6041 * it if there is a permission problem. Here the flag is represented
6042 * with a boolean so that we don't have to clear it on the way out.
6043 *
6044 * For simplicity this always aborts.
6045 * TODO: check if this is the first lookup and ignore the permission
6046 * problem. Note the flag has to survive fallback (if it happens to be
6047 * performed).
6048 */
6049 if (fpl->fsearch) {
6050 return (cache_fpl_aborted(fpl));
6051 }
6052
6053 switch (error) {
6054 case EAGAIN:
6055 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
6056 error = cache_fpl_aborted(fpl);
6057 } else {
6058 cache_fpl_partial(fpl);
6059 }
6060 break;
6061 default:
6062 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
6063 error = cache_fpl_aborted(fpl);
6064 } else {
6065 cache_fpl_smr_exit(fpl);
6066 cache_fpl_handled_error(fpl, error);
6067 }
6068 break;
6069 }
6070 return (error);
6071 }
6072
6073 static int
6074 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
6075 {
6076 struct nameidata *ndp;
6077 struct componentname *cnp;
6078 struct mount *mp;
6079 int error;
6080
6081 ndp = fpl->ndp;
6082 cnp = fpl->cnp;
6083
6084 cache_fpl_checkpoint(fpl);
6085
6086 /*
6087 	 * The vnode at hand is almost always stable, so skip checking for it.
6088 	 * At worst this postpones the check towards the end of the current
6089 	 * iteration of the main loop.
6090 */
6091 fpl->dvp = dvp;
6092 fpl->dvp_seqc = vn_seqc_read_notmodify(fpl->dvp);
6093
6094 mp = atomic_load_ptr(&dvp->v_mount);
6095 if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) {
6096 return (cache_fpl_aborted(fpl));
6097 }
6098
6099 MPASS(fpl->tvp == NULL);
6100
6101 for (;;) {
6102 cache_fplookup_parse(fpl);
6103
6104 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
6105 if (__predict_false(error != 0)) {
6106 error = cache_fplookup_failed_vexec(fpl, error);
6107 break;
6108 }
6109
6110 error = cache_fplookup_next(fpl);
6111 if (__predict_false(cache_fpl_terminated(fpl))) {
6112 break;
6113 }
6114
6115 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
6116
6117 if (fpl->tvp->v_type == VLNK) {
6118 error = cache_fplookup_symlink(fpl);
6119 if (cache_fpl_terminated(fpl)) {
6120 break;
6121 }
6122 } else {
6123 if (cache_fpl_islastcn(ndp)) {
6124 error = cache_fplookup_final(fpl);
6125 break;
6126 }
6127
6128 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
6129 error = cache_fpl_aborted(fpl);
6130 break;
6131 }
6132
6133 fpl->dvp = fpl->tvp;
6134 fpl->dvp_seqc = fpl->tvp_seqc;
6135 cache_fplookup_parse_advance(fpl);
6136 }
6137
6138 cache_fpl_checkpoint(fpl);
6139 }
6140
6141 return (error);
6142 }
6143
6144 /*
6145 * Fast path lookup protected with SMR and sequence counters.
6146 *
6147 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
6148 *
6149 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
6150 * outlined below.
6151 *
6152 * Traditional vnode lookup conceptually looks like this:
6153 *
6154 * vn_lock(current);
6155 * for (;;) {
6156 * next = find();
6157 * vn_lock(next);
6158 * vn_unlock(current);
6159 * current = next;
6160 * if (last)
6161 * break;
6162 * }
6163 * return (current);
6164 *
6165 * Each jump to the next vnode is safe memory-wise and atomic with respect to
6166 * any modifications thanks to holding respective locks.
6167 *
6168 * The same guarantee can be provided with a combination of safe memory
6169 * reclamation and sequence counters instead. If all operations which affect
6170 * the relationship between the current vnode and the one we are looking for
6171 * also modify the counter, we can verify whether all the conditions held as
6172 * we made the jump. This includes things like permissions, mount points etc.
6173 * Counter modification is provided by enclosing relevant places in
6174 * vn_seqc_write_begin()/end() calls.
6175 *
6176 * Thus this translates to:
6177 *
6178 * vfs_smr_enter();
6179 * dvp_seqc = seqc_read_any(dvp);
6180 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
6181 * abort();
6182 * for (;;) {
6183 * tvp = find();
6184 * tvp_seqc = seqc_read_any(tvp);
6185 * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
6186 * abort();
6187  *	if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
6188 * abort();
6189 * dvp = tvp; // we know nothing of importance has changed
6190 * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
6191 * if (last)
6192 * break;
6193 * }
6194 * vget(); // secure the vnode
6195  *	if (!seqc_consistent(tvp, tvp_seqc)) // final check
6196 * abort();
6197 * // at this point we know nothing has changed for any parent<->child pair
6198 * // as they were crossed during the lookup, meaning we matched the guarantee
6199 * // of the locked variant
6200 * return (tvp);
6201 *
6202 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
6203 * - they are called while within vfs_smr protection which they must never exit
6204  * - EAGAIN can be returned to denote checking could not be performed; it is
6205 * always valid to return it
6206 * - if the sequence counter has not changed the result must be valid
6207 * - if the sequence counter has changed both false positives and false negatives
6208 * are permitted (since the result will be rejected later)
6209 * - for simple cases of unix permission checks vaccess_vexec_smr can be used
6210 *
6211 * Caveats to watch out for:
6212 * - vnodes are passed unlocked and unreferenced with nothing stopping
6213 * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
6214 * to use atomic_load_ptr to fetch it.
6215 * - the aforementioned object can also get freed, meaning absent other means it
6216 * should be protected with vfs_smr
6217 * - either safely checking permissions as they are modified or guaranteeing
6218 * their stability is left to the routine
6219 */
6220 int
6221 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
6222 struct pwd **pwdp)
6223 {
6224 struct cache_fpl fpl;
6225 struct pwd *pwd;
6226 struct vnode *dvp;
6227 struct componentname *cnp;
6228 int error;
6229
6230 fpl.status = CACHE_FPL_STATUS_UNSET;
6231 fpl.in_smr = false;
6232 fpl.ndp = ndp;
6233 fpl.cnp = cnp = &ndp->ni_cnd;
6234 MPASS(ndp->ni_lcf == 0);
6235 KASSERT ((cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
6236 ("%s: internal flags found in cn_flags %" PRIx64, __func__,
6237 cnp->cn_flags));
6238 MPASS(cnp->cn_nameptr == cnp->cn_pnbuf);
6239 MPASS(ndp->ni_resflags == 0);
6240
6241 if (__predict_false(!cache_can_fplookup(&fpl))) {
6242 *status = fpl.status;
6243 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
6244 return (EOPNOTSUPP);
6245 }
6246
6247 cache_fpl_checkpoint_outer(&fpl);
6248
6249 cache_fpl_smr_enter_initial(&fpl);
6250 #ifdef INVARIANTS
6251 fpl.debug.ni_pathlen = ndp->ni_pathlen;
6252 #endif
6253 fpl.nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
6254 fpl.fsearch = false;
6255 fpl.tvp = NULL; /* for degenerate path handling */
6256 fpl.pwd = pwdp;
6257 pwd = pwd_get_smr();
6258 *(fpl.pwd) = pwd;
6259 namei_setup_rootdir(ndp, cnp, pwd);
6260 ndp->ni_topdir = pwd->pwd_jdir;
6261
6262 if (cnp->cn_pnbuf[0] == '/') {
6263 dvp = cache_fpl_handle_root(&fpl);
6264 ndp->ni_resflags = NIRES_ABS;
6265 } else {
6266 if (ndp->ni_dirfd == AT_FDCWD) {
6267 dvp = pwd->pwd_cdir;
6268 } else {
6269 error = cache_fplookup_dirfd(&fpl, &dvp);
6270 if (__predict_false(error != 0)) {
6271 goto out;
6272 }
6273 }
6274 }
6275
6276 SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
6277 error = cache_fplookup_impl(dvp, &fpl);
6278 out:
6279 cache_fpl_smr_assert_not_entered(&fpl);
6280 cache_fpl_assert_status(&fpl);
6281 *status = fpl.status;
6282 if (SDT_PROBES_ENABLED()) {
6283 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
6284 if (fpl.status == CACHE_FPL_STATUS_HANDLED)
6285 SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
6286 ndp);
6287 }
6288
6289 if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) {
6290 MPASS(error != CACHE_FPL_FAILED);
6291 if (error != 0) {
6292 cache_fpl_cleanup_cnp(fpl.cnp);
6293 MPASS(fpl.dvp == NULL);
6294 MPASS(fpl.tvp == NULL);
6295 }
6296 ndp->ni_dvp = fpl.dvp;
6297 ndp->ni_vp = fpl.tvp;
6298 }
6299 return (error);
6300 }
6301
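/*
 * Illustration only: a minimal sketch of what a filesystem's
 * VOP_FPLOOKUP_VEXEC handler may look like, per the API contract documented
 * above cache_fplookup. "xxfs", "struct xxinode", its fields and VTOI_SMR are
 * hypothetical stand-ins, not existing interfaces:
 *
 *	static int
 *	xxfs_fplookup_vexec(struct vop_fplookup_vexec_args *ap)
 *	{
 *		struct xxinode *ip;
 *
 *		ip = VTOI_SMR(ap->a_vp);
 *		if (__predict_false(ip == NULL))
 *			return (EAGAIN);
 *		return (vaccess_vexec_smr(ip->xi_mode, ip->xi_uid, ip->xi_gid,
 *		    ap->a_cred));
 *	}
 *
 * The NULL check covers ->v_data disappearing due to a concurrent
 * VOP_RECLAIM; returning EAGAIN makes the lookup fall back as described in
 * the contract above.
 */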