xref: /lighttpd1.4/src/stat_cache.c (revision fcf0dc3e)
1 #include "first.h"
2 
3 #include "stat_cache.h"
4 #include "log.h"
5 #include "fdevent.h"
6 #include "http_etag.h"
7 #include "algo_splaytree.h"
8 
9 #include <sys/types.h>
10 #include <sys/stat.h>
11 
12 #include <stdlib.h>
13 #include <string.h>
14 #include <errno.h>
15 #include <unistd.h>
16 #include <fcntl.h>
17 
18 #if defined(HAVE_SYS_XATTR_H)
19 # include <sys/xattr.h>
20 #elif defined(HAVE_ATTR_ATTRIBUTES_H)
21 # include <attr/attributes.h>
22 #endif
23 
24 #ifdef HAVE_SYS_EXTATTR_H
25 # include <sys/extattr.h>
26 #endif
27 
/* Fallback for platforms without lstat(): use stat() in its place, and if the
 * system headers do not provide S_ISLNK, define it so that nothing is ever
 * reported as a symlink. */
28 #ifndef HAVE_LSTAT
29 #define lstat stat
30 #ifndef S_ISLNK
31 #define S_ISLNK(mode) (0)
32 #endif
33 #endif
34 
35 /*
36  * stat-cache
37  *
38  * - a splay-tree is used as we can use the caching effect of it
39  */
40 
/* stat-cache engine selector values.
 * FAM, inotify and kqueue deliberately share the same value (2): they are
 * alternative backends of the same change-notification engine.
 * (presumably these are the values stored in stat_cache.stat_cache_engine
 *  -- confirm against the code which sets that field) */
41 enum {
42   STAT_CACHE_ENGINE_SIMPLE  = 0  /*(default)*/
43  ,STAT_CACHE_ENGINE_NONE    = 1
44  ,STAT_CACHE_ENGINE_FAM     = 2  /* same as STAT_CACHE_ENGINE_INOTIFY */
45  ,STAT_CACHE_ENGINE_INOTIFY = 2  /* same as STAT_CACHE_ENGINE_FAM */
46  ,STAT_CACHE_ENGINE_KQUEUE  = 2  /* same as STAT_CACHE_ENGINE_FAM */
47 };
48 
/* forward declaration; the struct is defined only when HAVE_FAM_H is set */
49 struct stat_cache_fam;  /* declaration */
50 
/* state of the stat cache; a single file-scope instance (sc) is used */
51 typedef struct stat_cache {
52 	int stat_cache_engine;
53 	splay_tree *files; /* nodes of tree are (stat_cache_entry *) */
54 	struct stat_cache_fam *scf; /* change-notification state (used as sc.scf) */
55 } stat_cache;
56 
/* the one stat cache instance of this translation unit */
57 static stat_cache sc;
58 
59 
60 static void * stat_cache_sptree_find(splay_tree ** const sptree,
61                                      const char * const name,
62                                      uint32_t len)
63 {
64     const int ndx = splaytree_djbhash(name, len);
65     *sptree = splaytree_splay(*sptree, ndx);
66     return (*sptree && (*sptree)->key == ndx) ? (*sptree)->data : NULL;
67 }
68 
69 
/* inotify and kqueue are driven through the same code paths as FAM,
 * so make sure HAVE_FAM_H is defined whenever either one is available */
70 #if defined(HAVE_SYS_INOTIFY_H) \
71  || (defined(HAVE_SYS_EVENT_H) && defined(HAVE_KQUEUE))
72 #ifndef HAVE_FAM_H
73 #define HAVE_FAM_H
74 #endif
75 #endif
76 
77 #ifdef HAVE_FAM_H
78 
79 /* monitor changes in directories using FAM
80  *
81  * This implementation employing FAM monitors directories as they are used,
82  * and maintains a reference count for cache use within stat_cache.c.
83  * A periodic job runs in lighttpd every 32 seconds, expiring entries unused
84  * in last 64 seconds out of the cache and cancelling FAM monitoring.  Items
85  * within the cache are checked against the filesystem upon use if last stat()
86  * was greater than or equal to 16 seconds ago.
87  *
88  * This implementation does not monitor every directory in a tree, and therefore
89  * the cache may get out-of-sync with the filesystem.  Delays in receiving and
90  * processing events from FAM might also lead to stale cache entries.
91  *
92  * For many websites, a large number of files are seldom, if ever, modified,
93  * and a common practice with images is to create a new file with a new name
94  * when a new version is needed, in order for client browsers and CDNs to better
95  * cache the content.  Given this, most use will see little difference in
96  * performance between server.stat-cache-engine = "fam" and "simple" (default).
97  * The default server.stat-cache-engine = "simple" calls stat() on a target once
98  * per second, and reuses that information until the next second.  For use where
99  * changes must be immediately visible, server.stat-cache-engine = "disable"
100  * should be used.
101  *
102  * When considering use of server.stat-cache-engine = "fam", there are a few
103  * additional limitations for this cache implementation using FAM.
104  * - symlinks to files located outside of the current directory do not result
105  *   in changes to that file being monitored (unless that file is in a directory
106  *   which is monitored as a result of a different request).  symlinks can be
107  *   chained and can be circular.  This implementation *does not* readlink() or
108  *   realpath() to resolve the chains to find and monitor the ultimate target
109  *   directory.  While symlinks to files located outside the current directory
110  *   are not monitored, symlinks to directories *are* monitored, though chains
111  *   of symlinks to directories do not result in monitoring of the directories
112  *   containing intermediate symlinks to the target directory.
113  * - directory rename of a directory which is not currently being monitored will
114  *   result in stale information in the cache if there is a subdirectory that is
115  *   being monitored.
116  * Even though lighttpd will not receive FAM events in the above cases, lighttpd
117  * does re-validate the information in the cache upon use if the cache entry has
118  * not been checked in 16 seconds, so that is the upper limit for use of stale
119  * data.
120  *
121  * Use of server.stat-cache-engine = "fam" is discouraged for extremely volatile
122  * directories such as temporary directories (e.g. /tmp and maybe /var/tmp) due
123  * to the overhead of processing the additional noise generated from changes.
124  * Related, server.stat-cache-engine = "fam" is not recommended on trees of
125  * untrusted files where a malicious user could generate an excess of change
126  * events.
127  *
128  * Internal note: lighttpd walks the caches to prune trees in stat_cache when an
129  * event is received for a directory (or symlink to a directory) which has been
130  * deleted or renamed.  The splaytree data structure is suboptimal for frequent
131  * changes of large directory trees where a large number of different files
132  * have recently been accessed and are part of the stat_cache.
133  */
134 
/* Backend selection for directory change notification:
 *   - inotify, when <sys/inotify.h> is available and kqueue is not
 *   - kqueue,  when <sys/event.h> and kqueue() are available
 *   - otherwise the FAM library itself (<fam.h>)
 * The inotify and kqueue branches provide just enough of the FAM API
 * (FAMRequest, FAMClose, FAMCancelMonitor, FAMMonitorDirectory, FAMCodes)
 * for the code in this file. */
135 #if defined(HAVE_SYS_INOTIFY_H) \
136  && !(defined(HAVE_SYS_EVENT_H) && defined(HAVE_KQUEUE))
137 
138 #include <sys/inotify.h>
139 #ifndef IN_EXCL_UNLINK /*(not defined in some very old glibc headers)*/
140 #define IN_EXCL_UNLINK 0x04000000
141 #endif
142 
143 /*(translate FAM API to inotify; this is specific to stat_cache.c use of FAM)*/
144 #define fam fd /*(translate struct stat_cache_fam scf->fam -> scf->fd)*/
145 typedef int FAMRequest; /*(fr)*/
/* the "connection" is the inotify fd; closing it ends all watches */
146 #define FAMClose(fd) \
147         close(*(fd))
/* remove a single watch descriptor from the inotify fd */
148 #define FAMCancelMonitor(fd, wd) \
149         inotify_rm_watch(*(fd), *(wd))
150 #define fam_watch_mask ( IN_ATTRIB | IN_CREATE | IN_DELETE | IN_DELETE_SELF \
151                        | IN_MODIFY | IN_MOVE_SELF | IN_MOVED_FROM \
152                        | IN_EXCL_UNLINK | IN_ONLYDIR )
153                      /*(note: follows symlinks; not providing IN_DONT_FOLLOW)*/
/* add a watch on directory fn; stores the watch descriptor in *wd and
 * evaluates to non-zero on failure (userData is unused with inotify) */
154 #define FAMMonitorDirectory(fd, fn, wd, userData) \
155         ((*(wd) = inotify_add_watch(*(fd), (fn), (fam_watch_mask))) < 0)
156 typedef enum FAMCodes { /*(copied from fam.h to define arbitrary enum values)*/
157     FAMChanged=1,
158     FAMDeleted=2,
159     FAMCreated=5,
160     FAMMoved=6,
161 } FAMCodes;
162 
163 #elif defined HAVE_SYS_EVENT_H && defined HAVE_KQUEUE
164 #undef HAVE_SYS_INOTIFY_H
165 
166 #include <sys/event.h>
167 #include <sys/time.h>
168 
169 /*(translate FAM API to kqueue; this is specific to stat_cache.c use of FAM)*/
170 #define fam fd /*(translate struct stat_cache_fam scf->fam -> scf->fd)*/
171 typedef int FAMRequest; /*(fr)*/
/* the "connection" is the kqueue fd; nothing to close if it is already -1 */
172 #define FAMClose(fd) \
173         (-1 != (*(fd)) ? close(*(fd)) : 0)
/* Stop monitoring the directory fd *wd on kqueue *fd.
 * No-op returning 0 if either descriptor is -1.  Otherwise submits EV_DELETE
 * for the vnode filter, closes the directory fd, resets *wd to -1, and
 * returns the result of kevent(). */
174 static int FAMCancelMonitor (const int * const fd, int * const wd)
175 {
176     if (-1 == *fd) return 0;
177     if (-1 == *wd) return 0;
178     struct timespec t0 = { 0, 0 };
179     struct kevent kev;
180     EV_SET(&kev, *wd, EVFILT_VNODE, EV_DELETE, 0, 0, 0);
181     int rc = kevent(*fd, &kev, 1, NULL, 0, &t0);
182     close(*wd);
183     *wd = -1;
184     return rc;
185 }
/* Start monitoring directory fn on kqueue *fd.
 * Opens the directory (stored in *wd) and registers an EVFILT_VNODE event on
 * it with userData attached as the event's udata.
 * Returns -1 if the directory can not be opened, else the result of kevent().
 * Note: when kevent() fails, *wd is left open for the caller to release. */
186 static int FAMMonitorDirectory (int * const fd, char * const fn, int * const wd, void * const userData)
187 {
188     *wd = fdevent_open_dirname(fn, 1); /*(note: follows symlinks)*/
189     if (-1 == *wd) return -1;
190     struct timespec t0 = { 0, 0 };
191     struct kevent kev;
192     unsigned short kev_flags = EV_ADD | EV_ENABLE | EV_CLEAR;
193     unsigned int kev_fflags = NOTE_ATTRIB | NOTE_EXTEND | NOTE_LINK | NOTE_WRITE
194                             | NOTE_DELETE | NOTE_REVOKE | NOTE_RENAME;
195     EV_SET(&kev, *wd, EVFILT_VNODE, kev_flags, kev_fflags, 0, userData);
196     return kevent(*fd, &kev, 1, NULL, 0, &t0);
197 }
198 typedef enum FAMCodes { /*(copied from fam.h to define arbitrary enum values)*/
199     FAMChanged=1,
200     FAMDeleted=2,
201     FAMCreated=5,
202     FAMMoved=6,
203 } FAMCodes;
204 
205 #else
206 
207 #include <fam.h>
208 
/* <dlfcn.h> is needed to look up FAMNoExists() via dlsym() at runtime
 * in non-static builds (see stat_cache_init_fam()) */
209 #ifdef HAVE_FAMNOEXISTS
210 #ifndef LIGHTTPD_STATIC
211 #ifdef HAVE_DLFCN_H
212 #include <dlfcn.h>
213 #endif
214 #endif
215 #endif
216 
217 #endif
218 
/* one monitored directory */
219 typedef struct fam_dir_entry {
220 	buffer name;          /* directory path (no trailing '/' unless root "/") */
221 	int refcnt;           /* number of users; entries at 0 are pruned by
	                       * fam_dir_periodic_cleanup() */
222 	FAMRequest req;       /* monitor handle: inotify watch descriptor,
	                       * kqueue directory fd, or FAM request */
223 	unix_time64_t stat_ts;/* log_monotonic_secs at last check; 0: invalidated */
224 	dev_t st_dev;         /* device of dir when last checked */
225 	ino_t st_ino;         /* inode of dir when last checked */
226 	struct fam_dir_entry *fam_parent; /* set when this path is a symlink:
	                       * entry returned by fam_dir_monitor() for the
	                       * symlink; holds one reference on that entry */
227 } fam_dir_entry;
228 
/* change-notification state shared by the FAM, inotify and kqueue backends */
229 typedef struct stat_cache_fam {
230 	splay_tree *dirs; /* indexed by path; node data is fam_dir_entry */
231   #ifdef HAVE_SYS_INOTIFY_H
232 	splay_tree *wds;  /* indexed by inotify watch descriptor */
233   #elif defined HAVE_SYS_EVENT_H && defined HAVE_KQUEUE
234   #else
235 	FAMConnection fam;
236   #endif
237 	log_error_st *errh; /* error log handle */
238 	fdevents *ev;       /* event loop with which fd is registered */
239 	fdnode *fdn;        /* registration of fd; NULL once connection is closed */
240 	int fd;             /* inotify fd, kqueue fd, or FAM connection fd; -1 if none
	                     * (with inotify/kqueue, "fam" is #define'd to this member) */
241 } stat_cache_fam;
242 
243 __attribute_returns_nonnull__
244 static fam_dir_entry * fam_dir_entry_init(const char *name, size_t len)
245 {
246     fam_dir_entry * const fam_dir = calloc(1, sizeof(*fam_dir));
247     force_assert(NULL != fam_dir);
248 
249     buffer_copy_string_len(&fam_dir->name, name, len);
250     fam_dir->refcnt = 0;
251   #if defined HAVE_SYS_EVENT_H && defined HAVE_KQUEUE
252     fam_dir->req = -1;
253   #endif
254 
255     return fam_dir;
256 }
257 
258 static void fam_dir_entry_free(fam_dir_entry *fam_dir)
259 {
260     if (!fam_dir) return;
261     /*(fam_dir->fam_parent might be invalid pointer here; ignore)*/
262     free(fam_dir->name.ptr);
263   #if defined HAVE_SYS_EVENT_H && defined HAVE_KQUEUE
264     if (-1 != fam_dir->req)
265         close(fam_dir->req);
266   #endif
267     free(fam_dir);
268 }
269 
270 static void fam_dir_invalidate_node(fam_dir_entry *fam_dir)
271 {
272     fam_dir->stat_ts = 0;
273     if (fam_dir->fam_parent) {
274         --fam_dir->fam_parent->refcnt;
275         fam_dir->fam_parent = NULL;
276     }
277 }
278 
279 /*
280  * walk through splay_tree and collect contents of dir tree.
281  * remove tagged entries in a second loop
282  */
283 
284 static void fam_dir_tag_refcnt(splay_tree *t, int *keys, int *ndx)
285 {
286     if (*ndx == 512) return; /*(must match num array entries in keys[])*/
287     if (t->left)  fam_dir_tag_refcnt(t->left,  keys, ndx);
288     if (t->right) fam_dir_tag_refcnt(t->right, keys, ndx);
289     if (*ndx == 512) return; /*(must match num array entries in keys[])*/
290 
291     fam_dir_entry * const fam_dir = t->data;
292     if (0 == fam_dir->refcnt) {
293         fam_dir_invalidate_node(fam_dir);
294         keys[(*ndx)++] = t->key;
295     }
296 }
297 
298 __attribute_noinline__
299 static void fam_dir_periodic_cleanup() {
300     stat_cache_fam * const scf = sc.scf;
301     int max_ndx, i;
302     int keys[512]; /* 2k size on stack */
303   #if defined HAVE_SYS_EVENT_H && defined HAVE_KQUEUE
304     struct kevent kevl[512]; /* 32k size on stack to batch kevent EV_DELETE */
305   #endif
306     do {
307         if (!scf->dirs) break;
308         max_ndx = 0;
309         fam_dir_tag_refcnt(scf->dirs, keys, &max_ndx);
310         for (i = 0; i < max_ndx; ++i) {
311             const int ndx = keys[i];
312             splay_tree *node = scf->dirs = splaytree_splay(scf->dirs, ndx);
313             if (node && node->key == ndx) {
314                 fam_dir_entry *fam_dir = node->data;
315                 scf->dirs = splaytree_delete(scf->dirs, ndx);
316               #ifdef HAVE_SYS_INOTIFY_H
317                 scf->wds = splaytree_delete(scf->wds, fam_dir->req);
318               #elif defined HAVE_SYS_EVENT_H && defined HAVE_KQUEUE
319                 /* batch process kevent removal; defer cancel */
320                 EV_SET(kevl+i, fam_dir->req, EVFILT_VNODE, EV_DELETE, 0, 0, 0);
321                 fam_dir->req = -1; /*(make FAMCancelMonitor() a no-op)*/
322               #endif
323                 FAMCancelMonitor(&scf->fam, &fam_dir->req);
324                 fam_dir_entry_free(fam_dir);
325             }
326         }
327       #if defined HAVE_SYS_EVENT_H && defined HAVE_KQUEUE
328         /* batch process: kevent() to submit EV_DELETE, then close dir fds */
329         if (0 == max_ndx) break;
330         struct timespec t0 = { 0, 0 };
331         kevent(scf->fd, kevl, max_ndx, NULL, 0, &t0);
332         for (i = 0; i < max_ndx; ++i)
333             close((int)kevl[i].ident);
334       #endif
335     } while (max_ndx == sizeof(keys)/sizeof(int));
336 }
337 
338 static void fam_dir_invalidate_tree(splay_tree *t, const char *name, size_t len)
339 {
340   #ifdef __clang_analyzer__
341     force_assert(name);
342   #endif
343     /*force_assert(t);*/
344     if (t->left)  fam_dir_invalidate_tree(t->left,  name, len);
345     if (t->right) fam_dir_invalidate_tree(t->right, name, len);
346 
347     fam_dir_entry * const fam_dir = t->data;
348   #ifdef __clang_analyzer__
349     force_assert(fam_dir);
350   #endif
351     const buffer * const b = &fam_dir->name;
352     size_t blen = buffer_clen(b);
353     if (blen > len && b->ptr[len] == '/' && 0 == memcmp(b->ptr, name, len))
354         fam_dir_invalidate_node(fam_dir);
355 }
356 
357 /* declarations */
/* (defined later in this file; needed by the event handling code below) */
358 static void stat_cache_delete_tree(const char *name, uint32_t len);
359 static void stat_cache_invalidate_dir_tree(const char *name, size_t len);
/* applies one change event (code is a FAMCodes value or 0) to the caches;
 * fn/fnlen name the affected entry within fam_dir, or fnlen is 0 when the
 * event concerns the monitored directory itself */
360 static void stat_cache_handle_fdevent_fn(stat_cache_fam * const scf, fam_dir_entry * const fam_dir, const char * const fn, const uint32_t fnlen, int code);
361 
/* Drain pending change notifications from the backend and apply each one to
 * the caches via stat_cache_handle_fdevent_fn().
 *
 * Exactly one of three implementations is compiled in:
 *   - inotify: read() batches of struct inotify_event records from scf->fd
 *   - kqueue:  poll kevent() (zero timeout) for up to 256 events at a time
 *   - FAM:     FAMPending()/FAMNextEvent() loop
 * In all three, events are matched back to a fam_dir_entry and are dropped if
 * no entry matches or the entry's current monitor handle differs from the one
 * in the event (e.g. events still queued for a monitor already cancelled). */
362 static void stat_cache_handle_fdevent_in(stat_cache_fam *scf)
363 {
364   #ifdef HAVE_SYS_INOTIFY_H
365     /*(inotify pads in->len to align struct following in->name[])*/
366     char buf[4096]
367       __attribute__ ((__aligned__(__alignof__(struct inotify_event))));
368     int rd;
369     do {
370         rd = (int)read(scf->fd, buf, sizeof(buf));
371         if (rd <= 0) {
372             if (-1 == rd && errno != EINTR && errno != EAGAIN) {
373                 log_perror(scf->errh, __FILE__, __LINE__, "inotify error");
374                 /* TODO: could flush cache, close scf->fd, and re-open inotify*/
375             }
376             break;
377         }
            /* walk the variable-length records; i is advanced past the
             * current record before the record is processed */
378         for (int i = 0; i < rd; ) {
379             struct inotify_event * const in =
380               (struct inotify_event *)((uintptr_t)buf + i);
381             uint32_t len = in->len;
382             if (len > sizeof(buf)) break; /*(should not happen)*/
383             i += sizeof(struct inotify_event) + len;
384             if (i > rd) break; /*(should not happen (partial record))*/
385             if (in->mask & IN_CREATE)
386                 continue; /*(see comment below for FAMCreated)*/
387             if (in->mask & IN_Q_OVERFLOW) {
388                 log_error(scf->errh, __FILE__, __LINE__,
389                           "inotify queue overflow");
390                 continue;
391             }
392             /* ignore events which may have been pending for
393              * paths recently cancelled via FAMCancelMonitor() */
394             scf->wds = splaytree_splay(scf->wds, in->wd);
395             if (!scf->wds || scf->wds->key != in->wd)
396                 continue;
397             fam_dir_entry *fam_dir = scf->wds->data;
398             if (NULL == fam_dir)        /*(should not happen)*/
399                 continue;
400             if (fam_dir->req != in->wd) /*(should not happen)*/
401                 continue;
402             /*(specific to use here in stat_cache.c)*/
                /* map inotify mask to a FAM code; any other mask leaves
                 * code 0, which stat_cache_handle_fdevent_fn() ignores */
403             int code = 0;
404             if (in->mask & (IN_ATTRIB | IN_MODIFY))
405                 code = FAMChanged;
406             else if (in->mask & (IN_DELETE | IN_DELETE_SELF | IN_UNMOUNT))
407                 code = FAMDeleted;
408             else if (in->mask & (IN_MOVE_SELF | IN_MOVED_FROM))
409                 code = FAMMoved;
410 
                /* in->len counts the NUL padding after the name;
                 * trim it to obtain the length of the name itself */
411             if (len) {
412                 do { --len; } while (len && in->name[len-1] == '\0');
413             }
414             stat_cache_handle_fdevent_fn(scf, fam_dir, in->name, len, code);
415         }
        /* read again only if the last read came close enough to filling buf
         * that a further maximum-size record might not have fit */
416     } while (rd + sizeof(struct inotify_event) + NAME_MAX + 1 > sizeof(buf));
417   #elif defined HAVE_SYS_EVENT_H && defined HAVE_KQUEUE
418     struct kevent kevl[256];
419     struct timespec t0 = { 0, 0 };
420     int n;
421     do {
422         n = kevent(scf->fd, NULL, 0, kevl, sizeof(kevl)/sizeof(*kevl), &t0);
423         if (n <= 0) break;
424         for (int i = 0; i < n; ++i) {
425             const struct kevent * const kev = kevl+i;
426             /* ignore events which may have been pending for
427              * paths recently cancelled via FAMCancelMonitor() */
                /* udata carries the scf->dirs key given to
                 * FAMMonitorDirectory(); ident is the directory fd */
428             int ndx = (int)(intptr_t)kev->udata;
429             scf->dirs = splaytree_splay(scf->dirs, ndx);
430             if (!scf->dirs || scf->dirs->key != ndx)
431                 continue;
432             fam_dir_entry *fam_dir = scf->dirs->data;
433             if (fam_dir->req != (int)kev->ident)
434                 continue;
435             /*(specific to use here in stat_cache.c)*/
436             /* note: stat_cache only monitors on directories,
437              *       so events here are only on directories
438              * note: changes are treated as FAMDeleted since
439              *       it is unknown which file in dir was changed
440              *       This is not efficient, but this stat_cache mechanism also
441              *       should not be used on frequently modified directories. */
442             int code = 0;
443             if (kev->fflags & (NOTE_WRITE|NOTE_ATTRIB|NOTE_EXTEND|NOTE_LINK))
444                 code = FAMDeleted; /*(not FAMChanged; see comment above)*/
445             else if (kev->fflags & (NOTE_DELETE|NOTE_REVOKE))
446                 code = FAMDeleted;
447             else if (kev->fflags & NOTE_RENAME)
448                 code = FAMMoved;
449             if (kev->flags & EV_ERROR) /*(not expected; treat as FAMDeleted)*/
450                 code = FAMDeleted;
451             stat_cache_handle_fdevent_fn(scf, fam_dir, NULL, 0, code);
452         }
        /* a full array suggests more events may be pending */
453     } while (n == sizeof(kevl)/sizeof(*kevl));
454   #else
        /* i counts down the events reported by FAMPending(); FAMPending()
         * is asked again each time the count reaches zero */
455     for (int i = 0, ndx; i || (i = FAMPending(&scf->fam)) > 0; --i) {
456         FAMEvent fe;
457         if (FAMNextEvent(&scf->fam, &fe) < 0) break;
458 
459         /* ignore events which may have been pending for
460          * paths recently cancelled via FAMCancelMonitor() */
461         ndx = (int)(intptr_t)fe.userdata;
462         scf->dirs = splaytree_splay(scf->dirs, ndx);
463         if (!scf->dirs || scf->dirs->key != ndx) {
464             continue;
465         }
466         fam_dir_entry *fam_dir = scf->dirs->data;
467         if (FAMREQUEST_GETREQNUM(&fam_dir->req)
468             != FAMREQUEST_GETREQNUM(&fe.fr)) {
469             continue;
470         }
471 
            /* a relative filename names an entry within the directory;
             * an absolute one (or FAMCreated) is passed on with fnlen 0 */
472         uint32_t fnlen = (fe.code != FAMCreated && fe.filename[0] != '/')
473           ? (uint32_t)strlen(fe.filename)
474           : 0;
475         stat_cache_handle_fdevent_fn(scf, fam_dir, fe.filename, fnlen, fe.code);
476     }
477   #endif
478 }
479 
480 static void stat_cache_handle_fdevent_fn(stat_cache_fam * const scf, fam_dir_entry *fam_dir, const char * const fn, const uint32_t fnlen, int code)
481 {
482         if (fnlen) {
483             buffer * const n = &fam_dir->name;
484             fam_dir_entry *fam_link;
485             uint32_t len;
486             switch (code) {
487             case FAMCreated:
488                 /* file created in monitored dir modifies dir and
489                  * we should get a separate FAMChanged event for dir.
490                  * Therefore, ignore file FAMCreated event here.
491                  * Also, if FAMNoExists() is used, might get spurious
492                  * FAMCreated events as changes are made e.g. in monitored
493                  * sub-sub-sub dirs and the library discovers new (already
494                  * existing) dir entries */
495                 return;
496             case FAMChanged:
497                 /* file changed in monitored dir does not modify dir */
498             case FAMDeleted:
499             case FAMMoved:
500                 /* file deleted or moved in monitored dir modifies dir,
501                  * but FAM provides separate notification for that */
502 
503                 /* temporarily append filename to dir in fam_dir->name to
504                  * construct path, then delete stat_cache entry (if any)*/
505                 len = buffer_clen(n);
506                 buffer_append_path_len(n, fn, fnlen);
507                 /* (alternatively, could chose to stat() and update)*/
508                 stat_cache_invalidate_entry(BUF_PTR_LEN(n));
509 
510                 fam_link = /*(check if might be symlink to monitored dir)*/
511                 stat_cache_sptree_find(&scf->dirs, BUF_PTR_LEN(n));
512                 if (fam_link && !buffer_is_equal(&fam_link->name, n))
513                     fam_link = NULL;
514 
515                 buffer_truncate(n, len);
516 
517                 if (fam_link) {
518                     /* replaced symlink changes containing dir */
519                     stat_cache_invalidate_entry(n->ptr, len);
520                     /* handle symlink to dir as deleted dir below */
521                     code = FAMDeleted;
522                     fam_dir = fam_link;
523                     break;
524                 }
525                 return;
526             default:
527                 return;
528             }
529         }
530 
531         switch(code) {
532         case FAMChanged:
533             stat_cache_invalidate_entry(BUF_PTR_LEN(&fam_dir->name));
534             break;
535         case FAMDeleted:
536         case FAMMoved:
537             stat_cache_delete_tree(BUF_PTR_LEN(&fam_dir->name));
538             fam_dir_invalidate_node(fam_dir);
539             if (scf->dirs)
540                 fam_dir_invalidate_tree(scf->dirs,
541                                         BUF_PTR_LEN(&fam_dir->name));
542             fam_dir_periodic_cleanup();
543             break;
544         default:
545             break;
546         }
547 }
548 
549 static handler_t stat_cache_handle_fdevent(void *ctx, int revent)
550 {
551 	stat_cache_fam * const scf = ctx; /* sc.scf */
552 
553 	if (revent & FDEVENT_IN) {
554 		stat_cache_handle_fdevent_in(scf);
555 	}
556 
557 	if (revent & (FDEVENT_HUP|FDEVENT_RDHUP)) {
558 		/* fam closed the connection */
559 		log_error(scf->errh, __FILE__, __LINE__,
560 		  "FAM connection closed; disabling stat_cache.");
561 		/* (although effectively STAT_CACHE_ENGINE_NONE,
562 		 *  do not change here so that periodic jobs clean up memory)*/
563 		/*sc.stat_cache_engine = STAT_CACHE_ENGINE_NONE; */
564 		fdevent_fdnode_event_del(scf->ev, scf->fdn);
565 		fdevent_unregister(scf->ev, scf->fd);
566 		scf->fdn = NULL;
567 
568 		FAMClose(&scf->fam);
569 		scf->fd = -1;
570 	}
571 
572 	return HANDLER_GO_ON;
573 }
574 
575 static stat_cache_fam * stat_cache_init_fam(fdevents *ev, log_error_st *errh) {
576 	stat_cache_fam *scf = calloc(1, sizeof(*scf));
577 	force_assert(scf);
578 	scf->fd = -1;
579 	scf->ev = ev;
580 	scf->errh = errh;
581 
582   #ifdef HAVE_SYS_INOTIFY_H
583    #if !defined(IN_NONBLOCK) || !defined(IN_CLOEXEC)
584 	scf->fd = inotify_init();
585 	if (scf->fd >= 0 && 0 != fdevent_fcntl_set_nb_cloexec(scf->fd)) {
586 		close(scf->fd);
587 		scf->fd = -1;
588 	}
589    #else
590 	scf->fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
591    #endif
592 	if (scf->fd < 0) {
593 		log_perror(errh, __FILE__, __LINE__, "inotify_init1()");
594 		free(scf);
595 		return NULL;
596 	}
597   #elif defined HAVE_SYS_EVENT_H && defined HAVE_KQUEUE
598    #ifdef __NetBSD__
599 	scf->fd = kqueue1(O_NONBLOCK|O_CLOEXEC|O_NOSIGPIPE);
600    #else
601 	scf->fd = kqueue();
602 	if (scf->fd >= 0) fdevent_setfd_cloexec(scf->fd);
603    #endif
604 	if (scf->fd < 0) {
605 		log_perror(errh, __FILE__, __LINE__, "kqueue()");
606 		free(scf);
607 		return NULL;
608 	}
609   #else
610 	/* setup FAM */
611 	if (0 != FAMOpen2(&scf->fam, "lighttpd")) {
612 		log_error(errh, __FILE__, __LINE__,
613 		  "could not open a fam connection, dying.");
614 		free(scf);
615 		return NULL;
616 	}
617       #ifdef HAVE_FAMNOEXISTS
618       #ifdef LIGHTTPD_STATIC
619 	FAMNoExists(&scf->fam);
620       #else
621 	int (*FAMNoExists_fn)(FAMConnection *);
622 	FAMNoExists_fn =
623 	  (int (*)(FAMConnection *))(intptr_t)dlsym(RTLD_DEFAULT,"FAMNoExists");
624 	if (FAMNoExists_fn) FAMNoExists_fn(&scf->fam);
625       #endif
626       #endif
627 
628 	scf->fd = FAMCONNECTION_GETFD(&scf->fam);
629 	fdevent_setfd_cloexec(scf->fd);
630   #endif
631 	scf->fdn = fdevent_register(scf->ev, scf->fd, stat_cache_handle_fdevent, scf);
632 	fdevent_fdnode_event_set(scf->ev, scf->fdn, FDEVENT_IN | FDEVENT_RDHUP);
633 
634 	return scf;
635 }
636 
637 static void stat_cache_free_fam(stat_cache_fam *scf) {
638 	if (NULL == scf) return;
639 
640       #ifdef HAVE_SYS_INOTIFY_H
641 	while (scf->wds) {
642 		splay_tree *node = scf->wds;
643 		scf->wds = splaytree_delete(scf->wds, node->key);
644 	}
645       #elif defined HAVE_SYS_EVENT_H && defined HAVE_KQUEUE
646 	/*(quicker cleanup to close kqueue() before cancel per entry)*/
647 	close(scf->fd);
648 	scf->fd = -1;
649       #endif
650 	while (scf->dirs) {
651 		/*(skip entry invalidation and FAMCancelMonitor())*/
652 		splay_tree *node = scf->dirs;
653 		fam_dir_entry_free((fam_dir_entry *)node->data);
654 		scf->dirs = splaytree_delete(scf->dirs, node->key);
655 	}
656 
657 	if (-1 != scf->fd) {
658 		/*scf->fdn already cleaned up in fdevent_free()*/
659 		FAMClose(&scf->fam);
660 		/*scf->fd = -1;*/
661 	}
662 
663 	free(scf);
664 }
665 
/* Ensure the directory relevant to path fn is monitored, and take a reference
 * on its fam_dir_entry.
 *
 * scf:    change-notification state
 * fn:     path; must be writable, since it is temporarily NUL-terminated at
 *         the directory length (and restored) around lstat()/stat() calls
 * dirlen: length of fn
 * st:     stat information for fn.  If it describes a directory, fn itself is
 *         monitored; otherwise the last path component is stripped and the
 *         containing directory is monitored.
 *         Note: when fn is not a directory and the containing directory turns
 *         out to be a symlink, *st is overwritten with stat() of that dir.
 *
 * Returns the entry with refcnt incremented, or NULL if nothing is monitored:
 * connection closed, hash collision with a different directory already in
 * the tree, lstat()/stat() failure, or failure to (re-)establish the monitor.
 *
 * If the directory path is itself a symlink, this function recurses with the
 * lstat() result of the symlink, which monitors the directory containing the
 * symlink; the entry returned is recorded as fam_parent. */
666 static fam_dir_entry * fam_dir_monitor(stat_cache_fam *scf, char *fn, uint32_t dirlen, struct stat *st)
667 {
668     if (NULL == scf->fdn) return NULL; /* FAM connection closed; do nothing */
669     const int fn_is_dir = S_ISDIR(st->st_mode);
670     /*force_assert(0 != dirlen);*/
671     /*force_assert(fn[0] == '/');*/
672     /* consistency: ensure fn does not end in '/' unless root "/"
673      * FAM events will not end in '/', so easier to match this way */
674     if (fn[dirlen-1] == '/') --dirlen;
675     if (0 == dirlen) dirlen = 1; /* root dir ("/") */
676     /* Note: paths are expected to be normalized before calling stat_cache,
677      * e.g. without repeated '/' */
678     if (!fn_is_dir) {
            /* strip last path component; dirlen becomes index of the '/' */
679         while (fn[--dirlen] != '/') ;
680         if (0 == dirlen) dirlen = 1; /*(should not happen for file)*/
681     }
682     int dir_ndx = splaytree_djbhash(fn, dirlen);
683     fam_dir_entry *fam_dir = NULL;
684 
685     scf->dirs = splaytree_splay(scf->dirs, dir_ndx);
686     if (NULL != scf->dirs && scf->dirs->key == dir_ndx) {
687         fam_dir = scf->dirs->data;
688         if (!buffer_eq_slen(&fam_dir->name, fn, dirlen)) {
689             /* hash collision; preserve existing
690              * do not monitor new to avoid cache thrashing */
691             return NULL;
692         }
693         /* directory already registered */
694     }
695 
696     const unix_time64_t cur_ts = log_monotonic_secs;
697     struct stat lst;
698     int ck_dir = fn_is_dir;
        /* fn is not a dir: (re)check the containing directory if it is not
         * yet known or was last checked 16 or more seconds ago */
699     if (!fn_is_dir && (NULL==fam_dir || cur_ts - fam_dir->stat_ts >= 16)) {
700         ck_dir = 1;
701         /*(temporarily modify fn)*/
702         fn[dirlen] = '\0';
703         if (0 != lstat(fn, &lst)) {
704             fn[dirlen] = '/';
705             return NULL;
706         }
707         if (!S_ISLNK(lst.st_mode)) {
708             st = &lst;
709         }
710         else if (0 != stat(fn, st)) { /*st passed in now is stat() of dir*/
711             fn[dirlen] = '/';
712             return NULL;
713         }
714         fn[dirlen] = '/';
715     }
716 
        /* ck_lnk: whether to (re)evaluate below if the dir path is a symlink;
         * always done for a new entry */
717     int ck_lnk = (NULL == fam_dir);
718     if (ck_dir && NULL != fam_dir) {
719         /* check stat() matches device and inode, just in case an external event
720          * not being monitored occurs (e.g. rename of unmonitored parent dir)*/
721         if (st->st_dev != fam_dir->st_dev || st->st_ino != fam_dir->st_ino) {
722             ck_lnk = 1;
723             /*(modifies scf->dirs but no need to re-splay for dir_ndx since
724              * fam_dir is not NULL and so splaytree_insert not called below)*/
725             if (scf->dirs) fam_dir_invalidate_tree(scf->dirs, fn, dirlen);
726             if (!fn_is_dir) /*(if dir, caller is updating stat_cache_entry)*/
727                 stat_cache_update_entry(fn, dirlen, st, NULL);
728             /*(must not delete tree since caller is holding a valid node)*/
729             stat_cache_invalidate_dir_tree(fn, dirlen);
                /* directory was replaced: drop the old monitor and
                 * establish a new one on the current directory */
730           #ifdef HAVE_SYS_INOTIFY_H
731             scf->wds = splaytree_delete(scf->wds, fam_dir->req);
732           #endif
733             if (0 != FAMCancelMonitor(&scf->fam, &fam_dir->req)
734                 || 0 != FAMMonitorDirectory(&scf->fam, fam_dir->name.ptr,
735                                             &fam_dir->req,
736                                             (void *)(intptr_t)dir_ndx)) {
737                 fam_dir->stat_ts = 0; /* invalidate */
738                 return NULL;
739             }
740             fam_dir->st_dev = st->st_dev;
741             fam_dir->st_ino = st->st_ino;
742           #ifdef HAVE_SYS_INOTIFY_H
743             scf->wds = splaytree_insert(scf->wds, fam_dir->req, fam_dir);
744           #endif
745         }
746         fam_dir->stat_ts = cur_ts;
747     }
748 
        /* directory not yet known: create entry and start monitoring it */
749     if (NULL == fam_dir) {
750         fam_dir = fam_dir_entry_init(fn, dirlen);
751 
752         if (0 != FAMMonitorDirectory(&scf->fam,fam_dir->name.ptr,&fam_dir->req,
753                                      (void *)(intptr_t)dir_ndx)) {
754           #if defined(HAVE_SYS_INOTIFY_H) \
755            || (defined HAVE_SYS_EVENT_H && defined HAVE_KQUEUE)
756             log_perror(scf->errh, __FILE__, __LINE__,
757               "monitoring dir failed: %s file: %s",
758               fam_dir->name.ptr, fn);
759           #else
760             log_error(scf->errh, __FILE__, __LINE__,
761               "monitoring dir failed: %s file: %s %s",
762               fam_dir->name.ptr, fn, FamErrlist[FAMErrno]);
763           #endif
764             fam_dir_entry_free(fam_dir);
765             return NULL;
766         }
767 
768         scf->dirs = splaytree_insert(scf->dirs, dir_ndx, fam_dir);
769       #ifdef HAVE_SYS_INOTIFY_H
770         scf->wds = splaytree_insert(scf->wds, fam_dir->req, fam_dir);
771       #endif
772         fam_dir->stat_ts= cur_ts;
773         fam_dir->st_dev = st->st_dev;
774         fam_dir->st_ino = st->st_ino;
775     }
776 
777     if (ck_lnk) {
            /* (when fn is not a dir, lst was already filled by lstat() above,
             *  since ck_lnk is only set on paths where that lstat() ran) */
778         if (fn_is_dir) {
779             /*(temporarily modify fn)*/
780             char e = fn[dirlen];
781             fn[dirlen] = '\0';
782             if (0 != lstat(fn, &lst)) {
783                 fn[dirlen] = e;
784                 return NULL;
785             }
786             fn[dirlen] = e;
787         }
            /* release reference on previous parent (if any) before
             * re-evaluating whether the dir path is a symlink */
788         if (fam_dir->fam_parent) {
789             --fam_dir->fam_parent->refcnt;
790             fam_dir->fam_parent = NULL;
791         }
792         if (S_ISLNK(lst.st_mode)) {
793             fam_dir->fam_parent = fam_dir_monitor(scf, fn, dirlen, &lst);
794         }
795     }
796 
797     ++fam_dir->refcnt;
798     return fam_dir;
799 }
800 
801 #endif
802 
803 
804 __attribute_malloc__
805 __attribute_returns_nonnull__
806 static stat_cache_entry * stat_cache_entry_init(void) {
807     stat_cache_entry *sce = calloc(1, sizeof(*sce));
808     force_assert(NULL != sce);
809     sce->fd = -1;
810     sce->refcnt = 1;
811     return sce;
812 }
813 
814 static void stat_cache_entry_free(void *data) {
815     stat_cache_entry *sce = data;
816     if (!sce) return;
817 
818     if (--sce->refcnt) return;
819 
820   #ifdef HAVE_FAM_H
821     /*(decrement refcnt only;
822      * defer cancelling FAM monitor on dir even if refcnt reaches zero)*/
823     if (sce->fam_dir) --((fam_dir_entry *)sce->fam_dir)->refcnt;
824   #endif
825 
826     free(sce->name.ptr);
827     free(sce->etag.ptr);
828     if (sce->content_type.size) free(sce->content_type.ptr);
829     if (sce->fd >= 0) close(sce->fd);
830 
831     free(sce);
832 }
833 
834 void stat_cache_entry_refchg(void *data, int mod) {
835     /*(expect mod == -1 or mod == 1)*/
836     stat_cache_entry * const sce = data;
837     if (mod < 0 && 1 == sce->refcnt)
838         stat_cache_entry_free(data);
839     else
840         sce->refcnt += mod;
841 }
842 
#if defined(HAVE_XATTR) || defined(HAVE_EXTATTR)

/* name of the extended attribute read by stat_cache_attr_get() */
static const char *attrname = "Content-Type";
/* storage for the most recently read attribute value (NUL-terminated) */
static char attrval[128];
/* buffer wrapping attrval[]; .used is refreshed on each successful read */
static buffer attrb = { attrval, 0, 0 };

/* Read extended attribute 'attrname' of file 'name' into attrval[]/attrb.
 * Returns 1 if a value was read (attrb then describes it), else 0.
 * At most sizeof(attrval)-1 bytes are requested, leaving room for '\0'. */
static int stat_cache_attr_get(const char *name) {
  #if defined(HAVE_XATTR) && defined(HAVE_SYS_XATTR_H)
    const ssize_t attrlen =
      getxattr(name, attrname, attrval, sizeof(attrval)-1);
    if (attrlen <= 0) return 0;
  #elif defined(HAVE_XATTR)
    int attrlen = sizeof(attrval)-1;
    if (0 != attr_get(name, attrname, attrval, &attrlen, 0)) return 0;
  #else /* defined(HAVE_EXTATTR) */
    const ssize_t attrlen =
      extattr_get_file(name, EXTATTR_NAMESPACE_USER, attrname,
                       attrval, sizeof(attrval)-1);
    if (attrlen <= 0) return 0;
  #endif

    attrval[attrlen] = '\0';
    attrb.used = (uint32_t)(attrlen + 1); /* used counts the terminator */
    return 1;
}

#endif
873 
874 int stat_cache_init(fdevents *ev, log_error_st *errh) {
875   #ifdef HAVE_FAM_H
876     if (sc.stat_cache_engine == STAT_CACHE_ENGINE_FAM) {
877         sc.scf = stat_cache_init_fam(ev, errh);
878         if (NULL == sc.scf) return 0;
879     }
880   #else
881     UNUSED(ev);
882     UNUSED(errh);
883   #endif
884 
885     return 1;
886 }
887 
888 void stat_cache_free(void) {
889     splay_tree *sptree = sc.files;
890     while (sptree) {
891         stat_cache_entry_free(sptree->data);
892         sptree = splaytree_delete(sptree, sptree->key);
893     }
894     sc.files = NULL;
895 
896   #ifdef HAVE_FAM_H
897     stat_cache_free_fam(sc.scf);
898     sc.scf = NULL;
899   #endif
900 
901   #if defined(HAVE_XATTR) || defined(HAVE_EXTATTR)
902     attrname = "Content-Type";
903   #endif
904 
905     sc.stat_cache_engine = STAT_CACHE_ENGINE_SIMPLE; /*(default)*/
906 }
907 
/* Set the extended attribute name consulted for mimetype lookups.
 * The pointer is stored as-is (not copied).
 * No-op when built without xattr/extattr support. */
void stat_cache_xattrname (const char *name) {
  #if !defined(HAVE_XATTR) && !defined(HAVE_EXTATTR)
    UNUSED(name);
  #else
    attrname = name;
  #endif
}
915 
916 int stat_cache_choose_engine (const buffer *stat_cache_string, log_error_st *errh) {
917     if (buffer_is_blank(stat_cache_string))
918         sc.stat_cache_engine = STAT_CACHE_ENGINE_SIMPLE;
919     else if (buffer_eq_slen(stat_cache_string, CONST_STR_LEN("simple")))
920         sc.stat_cache_engine = STAT_CACHE_ENGINE_SIMPLE;
921 #ifdef HAVE_SYS_INOTIFY_H
922     else if (buffer_eq_slen(stat_cache_string, CONST_STR_LEN("inotify")))
923         sc.stat_cache_engine = STAT_CACHE_ENGINE_INOTIFY;
924         /*(STAT_CACHE_ENGINE_FAM == STAT_CACHE_ENGINE_INOTIFY)*/
925 #elif defined HAVE_SYS_EVENT_H && defined HAVE_KQUEUE
926     else if (buffer_eq_slen(stat_cache_string, CONST_STR_LEN("kqueue")))
927         sc.stat_cache_engine = STAT_CACHE_ENGINE_KQUEUE;
928         /*(STAT_CACHE_ENGINE_FAM == STAT_CACHE_ENGINE_KQUEUE)*/
929 #endif
930 #ifdef HAVE_FAM_H
931     else if (buffer_eq_slen(stat_cache_string, CONST_STR_LEN("fam")))
932         sc.stat_cache_engine = STAT_CACHE_ENGINE_FAM;
933 #endif
934     else if (buffer_eq_slen(stat_cache_string, CONST_STR_LEN("disable"))
935              || buffer_eq_slen(stat_cache_string, CONST_STR_LEN("none")))
936         sc.stat_cache_engine = STAT_CACHE_ENGINE_NONE;
937     else {
938         log_error(errh, __FILE__, __LINE__,
939           "server.stat-cache-engine can be one of \"disable\", \"simple\","
940 #ifdef HAVE_SYS_INOTIFY_H
941           " \"inotify\","
942 #elif defined HAVE_SYS_EVENT_H && defined HAVE_KQUEUE
943           " \"kqueue\","
944 #endif
945 #ifdef HAVE_FAM_H
946           " \"fam\","
947 #endif
948           " but not: %s", stat_cache_string->ptr);
949         return -1;
950     }
951     return 0;
952 }
953 
954 const buffer * stat_cache_mimetype_by_ext(const array * const mimetypes, const char * const name, const uint32_t nlen)
955 {
956     const char * const end = name + nlen; /*(end of string)*/
957     const uint32_t used = mimetypes->used;
958     if (used < 16) {
959         for (uint32_t i = 0; i < used; ++i) {
960             /* suffix match */
961             const data_string *ds = (data_string *)mimetypes->data[i];
962             const size_t klen = buffer_clen(&ds->key);
963             if (klen <= nlen && buffer_eq_icase_ssn(end-klen, ds->key.ptr, klen))
964                 return &ds->value;
965         }
966     }
967     else {
968         const char *s;
969         const data_string *ds;
970         if (nlen) {
971             for (s = end-1; s != name && *s != '/'; --s) ; /*(like memrchr())*/
972             if (*s == '/') ++s;
973         }
974         else {
975             s = name;
976         }
977         /* search for basename, then longest .ext2.ext1, then .ext1, then "" */
978         ds = (const data_string *)array_get_element_klen(mimetypes, s, end - s);
979         if (NULL != ds) return &ds->value;
980         while (++s < end) {
981             while (*s != '.' && ++s != end) ;
982             if (s == end) break;
983             /* search ".ext" then "ext" */
984             ds = (const data_string *)array_get_element_klen(mimetypes, s, end - s);
985             if (NULL != ds) return &ds->value;
986             /* repeat search without leading '.' to handle situation where
987              * admin configured mimetype.assign keys without leading '.' */
988             if (++s < end) {
989                 if (*s == '.') { --s; continue; }
990                 ds = (const data_string *)array_get_element_klen(mimetypes, s, end - s);
991                 if (NULL != ds) return &ds->value;
992             }
993         }
994         /* search for ""; catchall */
995         ds = (const data_string *)array_get_element_klen(mimetypes, CONST_STR_LEN(""));
996         if (NULL != ds) return &ds->value;
997     }
998 
999     return NULL;
1000 }
1001 
1002 #if defined(HAVE_XATTR) || defined(HAVE_EXTATTR)
1003 
1004 const buffer * stat_cache_mimetype_by_xattr(const char * const name)
1005 {
1006     return stat_cache_attr_get(name) ? &attrb : NULL;
1007 }
1008 
1009 const buffer * stat_cache_content_type_get_by_xattr(stat_cache_entry *sce, const array *mimetypes, int use_xattr)
1010 {
1011     /*(invalid caching if user config has multiple, different
1012      * r->conf.mimetypes for same extension (not expected))*/
1013     if (!buffer_is_blank(&sce->content_type)) return &sce->content_type;
1014 
1015     if (!S_ISREG(sce->st.st_mode)) return NULL;
1016 
1017     /* cache mimetype */
1018     const buffer *mtype =
1019       (use_xattr) ? stat_cache_mimetype_by_xattr(sce->name.ptr) : NULL;
1020     if (NULL == mtype)
1021         mtype = stat_cache_mimetype_by_ext(mimetypes, BUF_PTR_LEN(&sce->name));
1022     if (NULL != mtype) {
1023         if (sce->content_type.size) {
1024             buffer_copy_buffer(&sce->content_type, mtype);
1025         }
1026         else if (mtype == &attrb) {
1027             sce->content_type.ptr = NULL;
1028             buffer_copy_buffer(&sce->content_type, mtype);
1029         }
1030         else {
1031             /*(copy pointers from mimetypes array; avoid allocation)*/
1032             sce->content_type.ptr = mtype->ptr;
1033             sce->content_type.used = mtype->used;
1034             /*(leave sce->content_type.size = 0 to flag not-allocated)*/
1035         }
1036     }
1037     else
1038         buffer_clear(&sce->content_type);
1039 
1040     return &sce->content_type;
1041 }
1042 
1043 #else
1044 
1045 const buffer * stat_cache_content_type_get_by_ext(stat_cache_entry *sce, const array *mimetypes)
1046 {
1047     /*(invalid caching if user config has multiple, different
1048      * r->conf.mimetypes for same extension (not expected))*/
1049     if (!buffer_is_blank(&sce->content_type)) return &sce->content_type;
1050 
1051     if (!S_ISREG(sce->st.st_mode)) return NULL;
1052 
1053     /* cache mimetype */
1054     const buffer * const mtype =
1055       stat_cache_mimetype_by_ext(mimetypes, BUF_PTR_LEN(&sce->name));
1056     if (NULL != mtype) {
1057         /*(copy pointers from mimetypes array; avoid allocation)*/
1058         sce->content_type.ptr = mtype->ptr;
1059         sce->content_type.used = mtype->used;
1060         /*(leave sce->content_type.size = 0 to flag not-allocated)*/
1061     }
1062     else
1063         buffer_clear(&sce->content_type);
1064 
1065     return &sce->content_type;
1066 }
1067 
1068 #endif
1069 
1070 const buffer * stat_cache_etag_get(stat_cache_entry *sce, int flags) {
1071     /*(invalid caching if user cfg has multiple, different r->conf.etag_flags
1072      * for same path (not expected, since etag flags should be by filesystem))*/
1073     if (!buffer_is_blank(&sce->etag)) return &sce->etag;
1074 
1075     if (S_ISREG(sce->st.st_mode) || S_ISDIR(sce->st.st_mode)) {
1076         if (0 == flags) return NULL;
1077         http_etag_create(&sce->etag, &sce->st, flags);
1078         return &sce->etag;
1079     }
1080 
1081     return NULL;
1082 }
1083 
/* Compare the identity-relevant fields of two stat results:
 * modification time (including nanoseconds where available), size,
 * inode and device.  Returns 1 if all match, else 0. */
__attribute_pure__
static int stat_cache_stat_eq(const struct stat * const sta, const struct stat * const stb) {
  #ifdef st_mtime /* use high-precision timestamp if available */
  #if defined(__APPLE__) && defined(__MACH__)
    if (sta->st_mtimespec.tv_nsec != stb->st_mtimespec.tv_nsec) return 0;
  #else
    if (sta->st_mtim.tv_nsec != stb->st_mtim.tv_nsec) return 0;
  #endif
  #endif
    if (sta->st_mtime != stb->st_mtime) return 0;
    if (sta->st_size  != stb->st_size)  return 0;
    if (sta->st_ino   != stb->st_ino)   return 0;
    if (sta->st_dev   != stb->st_dev)   return 0;
    return 1;
}
1101 
1102 void stat_cache_update_entry(const char *name, uint32_t len,
1103                              const struct stat *st, const buffer *etagb)
1104 {
1105     if (sc.stat_cache_engine == STAT_CACHE_ENGINE_NONE) return;
1106     force_assert(0 != len);
1107     if (name[len-1] == '/') { if (0 == --len) len = 1; }
1108     splay_tree **sptree = &sc.files;
1109     stat_cache_entry *sce =
1110       stat_cache_sptree_find(sptree, name, len);
1111     if (sce && buffer_is_equal_string(&sce->name, name, len)) {
1112         if (!stat_cache_stat_eq(&sce->st, st)) {
1113             /* etagb might be NULL to clear etag (invalidate) */
1114             buffer_clear(&sce->etag);
1115             if (etagb)
1116                 buffer_copy_string_len(&sce->etag, BUF_PTR_LEN(etagb));
1117           #if defined(HAVE_XATTR) || defined(HAVE_EXTATTR)
1118             buffer_clear(&sce->content_type);
1119           #endif
1120             if (sce->fd >= 0) {
1121                 if (1 == sce->refcnt) {
1122                     close(sce->fd);
1123                     sce->fd = -1;
1124                 }
1125                 else {
1126                     --sce->refcnt; /* stat_cache_entry_free(sce); */
1127                     (*sptree)->data = sce = stat_cache_entry_init();
1128                     buffer_copy_string_len(&sce->name, name, len);
1129                 }
1130             }
1131             sce->st = *st;
1132         }
1133         sce->stat_ts = log_monotonic_secs;
1134     }
1135 }
1136 
1137 void stat_cache_delete_entry(const char *name, uint32_t len)
1138 {
1139     if (sc.stat_cache_engine == STAT_CACHE_ENGINE_NONE) return;
1140     force_assert(0 != len);
1141     if (name[len-1] == '/') { if (0 == --len) len = 1; }
1142     splay_tree **sptree = &sc.files;
1143     stat_cache_entry *sce = stat_cache_sptree_find(sptree, name, len);
1144     if (sce && buffer_is_equal_string(&sce->name, name, len)) {
1145         stat_cache_entry_free(sce);
1146         *sptree = splaytree_delete(*sptree, (*sptree)->key);
1147     }
1148 }
1149 
1150 void stat_cache_invalidate_entry(const char *name, uint32_t len)
1151 {
1152     splay_tree **sptree = &sc.files;
1153     stat_cache_entry *sce = stat_cache_sptree_find(sptree, name, len);
1154     if (sce && buffer_is_equal_string(&sce->name, name, len)) {
1155         sce->stat_ts = 0;
1156       #ifdef HAVE_FAM_H
1157         if (sce->fam_dir != NULL) {
1158             --((fam_dir_entry *)sce->fam_dir)->refcnt;
1159             sce->fam_dir = NULL;
1160         }
1161       #endif
1162     }
1163 }
1164 
1165 #ifdef HAVE_FAM_H
1166 
1167 static void stat_cache_invalidate_dir_tree_walk(splay_tree *t,
1168                                                 const char *name, size_t len)
1169 {
1170     if (t->left)  stat_cache_invalidate_dir_tree_walk(t->left,  name, len);
1171     if (t->right) stat_cache_invalidate_dir_tree_walk(t->right, name, len);
1172 
1173     const buffer * const b = &((stat_cache_entry *)t->data)->name;
1174     const size_t blen = buffer_clen(b);
1175     if (blen > len && b->ptr[len] == '/' && 0 == memcmp(b->ptr, name, len)) {
1176         stat_cache_entry *sce = t->data;
1177         sce->stat_ts = 0;
1178         if (sce->fam_dir != NULL) {
1179             --((fam_dir_entry *)sce->fam_dir)->refcnt;
1180             sce->fam_dir = NULL;
1181         }
1182     }
1183 }
1184 
1185 static void stat_cache_invalidate_dir_tree(const char *name, size_t len)
1186 {
1187     splay_tree * const sptree = sc.files;
1188     if (sptree) stat_cache_invalidate_dir_tree_walk(sptree, name, len);
1189 }
1190 
1191 #endif
1192 
1193 /*
1194  * walk though splay_tree and collect contents of dir tree.
1195  * remove tagged entries in a second loop
1196  */
1197 
1198 static void stat_cache_tag_dir_tree(splay_tree *t, const char *name, size_t len,
1199                                     int *keys, int *ndx)
1200 {
1201     if (*ndx == 8192) return; /*(must match num array entries in keys[])*/
1202     if (t->left)  stat_cache_tag_dir_tree(t->left,  name, len, keys, ndx);
1203     if (t->right) stat_cache_tag_dir_tree(t->right, name, len, keys, ndx);
1204     if (*ndx == 8192) return; /*(must match num array entries in keys[])*/
1205 
1206     const buffer * const b = &((stat_cache_entry *)t->data)->name;
1207     const size_t blen = buffer_clen(b);
1208     if (blen > len && b->ptr[len] == '/' && 0 == memcmp(b->ptr, name, len))
1209         keys[(*ndx)++] = t->key;
1210 }
1211 
1212 __attribute_noinline__
1213 static void stat_cache_prune_dir_tree(const char *name, size_t len)
1214 {
1215     splay_tree *sptree = sc.files;
1216     int max_ndx, i;
1217     int keys[8192]; /* 32k size on stack */
1218     do {
1219         if (!sptree) break;
1220         max_ndx = 0;
1221         stat_cache_tag_dir_tree(sptree, name, len, keys, &max_ndx);
1222         for (i = 0; i < max_ndx; ++i) {
1223             const int ndx = keys[i];
1224             splay_tree *node = sptree = splaytree_splay(sptree, ndx);
1225             if (node && node->key == ndx) {
1226                 stat_cache_entry_free(node->data);
1227                 sptree = splaytree_delete(sptree, ndx);
1228             }
1229         }
1230     } while (max_ndx == sizeof(keys)/sizeof(int));
1231     sc.files = sptree;
1232 }
1233 
/* Remove the cached entry for 'name' (length len) itself, then every
 * cached entry located beneath it. */
static void stat_cache_delete_tree(const char *name, uint32_t len)
{
    /* the path itself */
    stat_cache_delete_entry(name, len);
    /* everything under "name/" */
    stat_cache_prune_dir_tree(name, len);
}
1239 
/* Remove directory 'name' (length len; one trailing '/' is ignored except
 * for root "/") and everything cached beneath it.  With the FAM engine,
 * also invalidate the matching monitored-directory entries and run the
 * FAM directory cleanup. */
void stat_cache_delete_dir(const char *name, uint32_t len)
{
    force_assert(0 != len);
    if (name[len-1] == '/') { if (0 == --len) len = 1; }
    stat_cache_delete_tree(name, len);
  #ifdef HAVE_FAM_H
    if (sc.stat_cache_engine != STAT_CACHE_ENGINE_FAM) return;

    splay_tree ** const dirs = &sc.scf->dirs;
    fam_dir_entry * const fam_dir = stat_cache_sptree_find(dirs, name, len);
    /* the tree is keyed by hash; verify the name to rule out a collision */
    if (fam_dir && buffer_eq_slen(&fam_dir->name, name, len))
        fam_dir_invalidate_node(fam_dir);
    if (*dirs) fam_dir_invalidate_tree(*dirs, name, len);
    fam_dir_periodic_cleanup();
  #endif
}
1256 
1257 /***
1258  *
1259  *
1260  *
1261  * returns:
1262  *  - HANDLER_FINISHED on cache-miss (don't forget to reopen the file)
1263  *  - HANDLER_ERROR on stat() failed -> see errno for problem
1264  */
1265 
1266 stat_cache_entry * stat_cache_get_entry(const buffer * const name) {
1267 	stat_cache_entry *sce = NULL;
1268 
1269 	/* consistency: ensure lookup name does not end in '/' unless root "/"
1270 	 * (but use full path given with stat(), even with trailing '/') */
1271 	int final_slash = 0;
1272 	size_t len = buffer_clen(name);
1273 	force_assert(0 != len);
1274 	if (name->ptr[len-1] == '/') { final_slash = 1; if (0 == --len) len = 1; }
1275 	/* Note: paths are expected to be normalized before calling stat_cache,
1276 	 * e.g. without repeated '/' */
1277 
1278 	if (name->ptr[0] != '/') {
1279 		errno = EINVAL;
1280 		return NULL;
1281 	}
1282 
1283 	/*
1284 	 * check if the directory for this file has changed
1285 	 */
1286 
1287 	const unix_time64_t cur_ts = log_monotonic_secs;
1288 
1289 	const int file_ndx = splaytree_djbhash(name->ptr, len);
1290 	splay_tree *sptree = sc.files = splaytree_splay(sc.files, file_ndx);
1291 
1292 	if (sptree && (sptree->key == file_ndx)) {
1293 		/* we have seen this file already and
1294 		 * don't stat() it again in the same second */
1295 
1296 		sce = sptree->data;
1297 
1298 		/* check if the name is the same, we might have a collision */
1299 
1300 		if (buffer_is_equal_string(&sce->name, name->ptr, len)) {
1301 			if (sc.stat_cache_engine == STAT_CACHE_ENGINE_SIMPLE) {
1302 				if (sce->stat_ts == cur_ts) {
1303 					if (final_slash && !S_ISDIR(sce->st.st_mode)) {
1304 						errno = ENOTDIR;
1305 						return NULL;
1306 					}
1307 					return sce;
1308 				}
1309 			}
1310 		      #ifdef HAVE_FAM_H
1311 			else if (sc.stat_cache_engine == STAT_CACHE_ENGINE_FAM
1312 				 && sce->fam_dir) { /* entry is in monitored dir */
1313 				/* re-stat() periodically, even if monitoring for changes
1314 				 * (due to limitations in stat_cache.c use of FAM)
1315 				 * (gaps due to not continually monitoring an entire tree) */
1316 				if (cur_ts - sce->stat_ts < 16) {
1317 					if (final_slash && !S_ISDIR(sce->st.st_mode)) {
1318 						errno = ENOTDIR;
1319 						return NULL;
1320 					}
1321 					return sce;
1322 				}
1323 			}
1324 		      #endif
1325 		} else {
1326 			/* collision, forget about the entry */
1327 			sce = NULL;
1328 		}
1329 	}
1330 
1331 	struct stat st;
1332 	if (-1 == stat(name->ptr, &st)) {
1333 		return NULL;
1334 	}
1335 
1336 	if (NULL == sce) {
1337 
1338 		/* fix broken stat/open for symlinks to reg files with appended slash on freebsd,osx */
1339 		if (final_slash && S_ISREG(st.st_mode)) {
1340 			errno = ENOTDIR;
1341 			return NULL;
1342 		}
1343 
1344 		sce = stat_cache_entry_init();
1345 		buffer_copy_string_len(&sce->name, name->ptr, len);
1346 
1347 		/* already splayed file_ndx */
1348 		if (NULL != sptree && sptree->key == file_ndx) {
1349 			/* hash collision: replace old entry */
1350 			stat_cache_entry_free(sptree->data);
1351 			sptree->data = sce;
1352 		} else {
1353 			/*sptree =*/ sc.files = splaytree_insert(sptree, file_ndx, sce);
1354 		}
1355 
1356 	} else {
1357 
1358 		buffer_clear(&sce->etag);
1359 	      #if defined(HAVE_XATTR) || defined(HAVE_EXTATTR)
1360 		buffer_clear(&sce->content_type);
1361 	      #endif
1362 
1363 		/* close fd if file changed */
1364 		if (sce->fd >= 0 && !stat_cache_stat_eq(&sce->st, &st)) {
1365 			if (1 == sce->refcnt) {
1366 				close(sce->fd);
1367 				sce->fd = -1;
1368 			}
1369 			else {
1370 				--sce->refcnt; /* stat_cache_entry_free(sce); */
1371 				sptree->data = sce = stat_cache_entry_init();
1372 				buffer_copy_string_len(&sce->name, name->ptr, len);
1373 			}
1374 		}
1375 	}
1376 
1377 	sce->st = st; /*(copy prior to calling fam_dir_monitor())*/
1378 
1379 #ifdef HAVE_FAM_H
1380 	if (sc.stat_cache_engine == STAT_CACHE_ENGINE_FAM) {
1381 		if (sce->fam_dir) --((fam_dir_entry *)sce->fam_dir)->refcnt;
1382 		sce->fam_dir =
1383 		  fam_dir_monitor(sc.scf, name->ptr, len, &st);
1384 	      #if 0 /*(performed below)*/
1385 		if (NULL != sce->fam_dir) {
1386 			/*(may have been invalidated by dir change)*/
1387 			sce->stat_ts = cur_ts;
1388 		}
1389 	      #endif
1390 	}
1391 #endif
1392 
1393 	sce->stat_ts = cur_ts;
1394 	return sce;
1395 }
1396 
1397 stat_cache_entry * stat_cache_get_entry_open(const buffer * const name, const int symlinks) {
1398     stat_cache_entry * const sce = stat_cache_get_entry(name);
1399     if (NULL == sce) return NULL;
1400     if (sce->fd >= 0) return sce;
1401     if (sce->st.st_size > 0) {
1402         sce->fd = stat_cache_open_rdonly_fstat(name, &sce->st, symlinks);
1403         buffer_clear(&sce->etag);
1404     }
1405     return sce; /* (note: sce->fd might still be -1 if open() failed) */
1406 }
1407 
1408 const stat_cache_st * stat_cache_path_stat (const buffer * const name) {
1409     const stat_cache_entry * const sce = stat_cache_get_entry(name);
1410     return sce ? &sce->st : NULL;
1411 }
1412 
1413 int stat_cache_path_isdir(const buffer *name) {
1414     const stat_cache_entry * const sce = stat_cache_get_entry(name);
1415     return (sce && (S_ISDIR(sce->st.st_mode) ? 1 : (errno = ENOTDIR, 0)));
1416 }
1417 
1418 int stat_cache_path_contains_symlink(const buffer *name, log_error_st *errh) {
1419     /* caller should check for symlinks only if we should block symlinks. */
1420 
1421     /* catch the obvious symlinks
1422      *
1423      * this is not a secure check as we still have a race-condition between
1424      * the stat() and the open. We can only solve this by
1425      * 1. open() the file
1426      * 2. fstat() the fd
1427      *
1428      * and keeping the file open for the rest of the time. But this can
1429      * only be done at network level.
1430      * */
1431 
1432   #ifdef HAVE_LSTAT
1433     /* we assume "/" can not be symlink,
1434      * so skip the symlink stuff if path is "/" */
1435     size_t len = buffer_clen(name);
1436     force_assert(0 != len);
1437     force_assert(name->ptr[0] == '/');
1438     if (1 == len) return 0;
1439    #ifndef PATH_MAX
1440    #define PATH_MAX 4096
1441    #endif
1442     if (len >= PATH_MAX) return -1;
1443 
1444     char buf[PATH_MAX];
1445     memcpy(buf, name->ptr, len);
1446     char *s_cur = buf+len;
1447     do {
1448         *s_cur = '\0';
1449         struct stat st;
1450         if (0 == lstat(buf, &st)) {
1451             if (S_ISLNK(st.st_mode)) return 1;
1452         }
1453         else {
1454             log_perror(errh, __FILE__, __LINE__, "lstat failed for: %s", buf);
1455             return -1;
1456         }
1457     } while ((s_cur = strrchr(buf, '/')) > buf); /*(&buf[0]==buf; NULL < buf)*/
1458   #endif
1459 
1460     return 0;
1461 }
1462 
1463 int stat_cache_open_rdonly_fstat (const buffer *name, struct stat *st, int symlinks) {
1464 	/*(Note: O_NOFOLLOW affects only the final path segment, the target file,
1465 	 * not any intermediate symlinks along the path)*/
1466 	const int fd = fdevent_open_cloexec(name->ptr, symlinks, O_RDONLY, 0);
1467 	if (fd >= 0) {
1468 		if (0 == fstat(fd, st)) {
1469 			return fd;
1470 		} else {
1471 			const int errnum = errno;
1472 			close(fd);
1473 			errno = errnum;
1474 		}
1475 	}
1476 	return -1;
1477 }
1478 
1479 /**
1480  * remove stat() from cache which haven't been stat()ed for
1481  * more than 2 seconds
1482  *
1483  *
1484  * walk though the stat-cache, collect the ids which are too old
1485  * and remove them in a second loop
1486  */
1487 
1488 static void stat_cache_tag_old_entries(splay_tree * const t, int * const keys, int * const ndx, const time_t max_age, const unix_time64_t cur_ts) {
1489     if (*ndx == 8192) return; /*(must match num array entries in keys[])*/
1490     if (t->left)
1491         stat_cache_tag_old_entries(t->left, keys, ndx, max_age, cur_ts);
1492     if (t->right)
1493         stat_cache_tag_old_entries(t->right, keys, ndx, max_age, cur_ts);
1494     if (*ndx == 8192) return; /*(must match num array entries in keys[])*/
1495 
1496     const stat_cache_entry * const sce = t->data;
1497     if (cur_ts - sce->stat_ts > max_age)
1498         keys[(*ndx)++] = t->key;
1499 }
1500 
1501 static void stat_cache_periodic_cleanup(const time_t max_age, const unix_time64_t cur_ts) {
1502     splay_tree *sptree = sc.files;
1503     int max_ndx, i;
1504     int keys[8192]; /* 32k size on stack */
1505     do {
1506         if (!sptree) break;
1507         max_ndx = 0;
1508         stat_cache_tag_old_entries(sptree, keys, &max_ndx, max_age, cur_ts);
1509         for (i = 0; i < max_ndx; ++i) {
1510             int ndx = keys[i];
1511             sptree = splaytree_splay(sptree, ndx);
1512             if (sptree && sptree->key == ndx) {
1513                 stat_cache_entry_free(sptree->data);
1514                 sptree = splaytree_delete(sptree, ndx);
1515             }
1516         }
1517     } while (max_ndx == sizeof(keys)/sizeof(int));
1518     sc.files = sptree;
1519 }
1520 
1521 void stat_cache_trigger_cleanup(void) {
1522 	time_t max_age = 2;
1523 
1524       #ifdef HAVE_FAM_H
1525 	if (STAT_CACHE_ENGINE_FAM == sc.stat_cache_engine) {
1526 		if (log_monotonic_secs & 0x1F) return;
1527 		/* once every 32 seconds (0x1F == 31) */
1528 		max_age = 32;
1529 		fam_dir_periodic_cleanup();
1530 		/* By doing this before stat_cache_periodic_cleanup(),
1531 		 * entries used within the next max_age secs will remain
1532 		 * monitored, instead of effectively flushing and
1533 		 * rebuilding the FAM monitoring every max_age seconds */
1534 	}
1535       #endif
1536 
1537 	stat_cache_periodic_cleanup(max_age, log_monotonic_secs);
1538 }
1539