xref: /lighttpd1.4/src/stat_cache.c (revision dce44060)
1 #include "first.h"
2 
3 #include "stat_cache.h"
4 #include "log.h"
5 #include "fdevent.h"
6 #include "etag.h"
7 #include "algo_splaytree.h"
8 
9 #include <sys/types.h>
10 #include <sys/stat.h>
11 
12 #include <stdlib.h>
13 #include <string.h>
14 #include <errno.h>
15 #include <unistd.h>
16 #include <fcntl.h>
17 
18 #if defined(HAVE_SYS_XATTR_H)
19 # include <sys/xattr.h>
20 #elif defined(HAVE_ATTR_ATTRIBUTES_H)
21 # include <attr/attributes.h>
22 #endif
23 
24 #ifdef HAVE_SYS_EXTATTR_H
25 # include <sys/extattr.h>
26 #endif
27 
28 #ifndef HAVE_LSTAT
29 #define lstat stat
30 #ifndef S_ISLNK
31 #define S_ISLNK(mode) (0)
32 #endif
33 #endif
34 
35 /*
36  * stat-cache
37  *
38  * - a splay-tree is used as we can use the caching effect of it
39  */
40 
/* Values stored in sc.stat_cache_engine.
 * The event-driven backends (FAM, inotify, kqueue) are selected by the
 * preprocessor, so they deliberately share one enumerator value. */
enum {
    STAT_CACHE_ENGINE_SIMPLE  = 0, /*(default)*/
    STAT_CACHE_ENGINE_NONE    = 1,
    STAT_CACHE_ENGINE_FAM     = 2, /* same as STAT_CACHE_ENGINE_INOTIFY */
    STAT_CACHE_ENGINE_INOTIFY = 2, /* same as STAT_CACHE_ENGINE_FAM */
    STAT_CACHE_ENGINE_KQUEUE  = 2  /* same as STAT_CACHE_ENGINE_FAM */
};
48 
49 struct stat_cache_fam;  /* declaration */
50 
51 typedef struct stat_cache {
52 	int stat_cache_engine;
53 	splay_tree *files; /* nodes of tree are (stat_cache_entry *) */
54 	struct stat_cache_fam *scf;
55 } stat_cache;
56 
57 static stat_cache sc;
58 
59 
60 static void * stat_cache_sptree_find(splay_tree ** const sptree,
61                                      const char * const name,
62                                      uint32_t len)
63 {
64     const int ndx = splaytree_djbhash(name, len);
65     *sptree = splaytree_splay(*sptree, ndx);
66     return (*sptree && (*sptree)->key == ndx) ? (*sptree)->data : NULL;
67 }
68 
69 
70 #if defined(HAVE_SYS_INOTIFY_H) \
71  || (defined(HAVE_SYS_EVENT_H) && defined(HAVE_KQUEUE))
72 #ifndef HAVE_FAM_H
73 #define HAVE_FAM_H
74 #endif
75 #endif
76 
77 #ifdef HAVE_FAM_H
78 
79 /* monitor changes in directories using FAM
80  *
81  * This implementation employing FAM monitors directories as they are used,
82  * and maintains a reference count for cache use within stat_cache.c.
83  * A periodic job runs in lighttpd every 32 seconds, expiring entries unused
84  * in last 64 seconds out of the cache and cancelling FAM monitoring.  Items
85  * within the cache are checked against the filesystem upon use if last stat()
86  * was greater than or equal to 16 seconds ago.
87  *
88  * This implementation does not monitor every directory in a tree, and therefore
89  * the cache may get out-of-sync with the filesystem.  Delays in receiving and
90  * processing events from FAM might also lead to stale cache entries.
91  *
92  * For many websites, a large number of files are seldom, if ever, modified,
93  * and a common practice with images is to create a new file with a new name
94  * when a new version is needed, in order for client browsers and CDNs to better
95  * cache the content.  Given this, most use will see little difference in
96  * performance between server.stat-cache-engine = "fam" and "simple" (default).
97  * The default server.stat-cache-engine = "simple" calls stat() on a target once
98  * per second, and reuses that information until the next second.  For use where
99  * changes must be immediately visible, server.stat-cache-engine = "disable"
100  * should be used.
101  *
102  * When considering use of server.stat-cache-engine = "fam", there are a few
103  * additional limitations for this cache implementation using FAM.
104  * - symlinks to files located outside of the current directory do not result
105  *   in changes to that file being monitored (unless that file is in a directory
106  *   which is monitored as a result of a different request).  symlinks can be
107  *   chained and can be circular.  This implementation *does not* readlink() or
108  *   realpath() to resolve the chains to find and monitor the ultimate target
109  *   directory.  While symlinks to files located outside the current directory
110  *   are not monitored, symlinks to directories *are* monitored, though chains
111  *   of symlinks to directories do not result in monitoring of the directories
112  *   containing intermediate symlinks to the target directory.
113  * - directory rename of a directory which is not currently being monitored will
114  *   result in stale information in the cache if there is a subdirectory that is
115  *   being monitored.
116  * Even though lighttpd will not receive FAM events in the above cases, lighttpd
117  * does re-validate the information in the cache upon use if the cache entry has
118  * not been checked in 16 seconds, so that is the upper limit for use of stale
119  * data.
120  *
121  * Use of server.stat-cache-engine = "fam" is discouraged for extremely volatile
122  * directories such as temporary directories (e.g. /tmp and maybe /var/tmp) due
123  * to the overhead of processing the additional noise generated from changes.
124  * Related, server.stat-cache-engine = "fam" is not recommended on trees of
125  * untrusted files where a malicious user could generate an excess of change
126  * events.
127  *
128  * Internal note: lighttpd walks the caches to prune trees in stat_cache when an
129  * event is received for a directory (or symlink to a directory) which has been
130  * deleted or renamed.  The splaytree data structure is suboptimal for frequent
131  * changes of large directory trees where a large number of different files
132  * have been recently accessed and are part of the stat_cache.
133  */
134 
135 #if defined(HAVE_SYS_INOTIFY_H) \
136  && !(defined(HAVE_SYS_EVENT_H) && defined(HAVE_KQUEUE))
137 
138 #include <sys/inotify.h>
139 
140 /*(translate FAM API to inotify; this is specific to stat_cache.c use of FAM)*/
#define fam fd /*(translate struct stat_cache_fam scf->fam -> scf->fd)*/
typedef int FAMRequest; /*(fr) holds the inotify watch descriptor*/
/* close the inotify descriptor; fd is (int *) */
#define FAMClose(fd) \
        close(*(fd))
/* remove a watch; fd is (int *) inotify descriptor, wd is (FAMRequest *) */
#define FAMCancelMonitor(fd, wd) \
        inotify_rm_watch(*(fd), *(wd))
/* events requested on every monitored directory.
 * The expansion is fully parenthesized so that the mask behaves as a single
 * operand, e.g. (fam_watch_mask & IN_ONLYDIR) tests the whole mask rather
 * than binding '&' to only the last flag. */
#define fam_watch_mask ( IN_ATTRIB | IN_CREATE | IN_DELETE | IN_DELETE_SELF \
                       | IN_MODIFY | IN_MOVE_SELF | IN_MOVED_FROM \
                       | IN_EXCL_UNLINK | IN_ONLYDIR )
                     /*(note: follows symlinks; not providing IN_DONT_FOLLOW)*/
/* add a watch on directory fn and store the watch descriptor in *wd.
 * Evaluates to non-zero on failure (inotify_add_watch() returned < 0).
 * userData exists only to mirror the FAM signature and is not evaluated. */
#define FAMMonitorDirectory(fd, fn, wd, userData) \
        ((*(wd) = inotify_add_watch(*(fd), (fn), (fam_watch_mask))) < 0)
/* Event codes passed to stat_cache_handle_fdevent_fn() by this backend.
 * (copied from fam.h to define arbitrary enum values) */
typedef enum FAMCodes {
    FAMChanged = 1,
    FAMDeleted = 2,
    FAMCreated = 5,
    FAMMoved   = 6
} FAMCodes;
159 
160 #elif defined HAVE_SYS_EVENT_H && defined HAVE_KQUEUE
161 #undef HAVE_SYS_INOTIFY_H
162 
163 #include <sys/event.h>
164 #include <sys/time.h>
165 
166 /*(translate FAM API to kqueue; this is specific to stat_cache.c use of FAM)*/
#define fam fd /*(scf->fam is rewritten to scf->fd by the preprocessor)*/
typedef int FAMRequest; /*(fr) with kqueue: open fd of the watched directory*/
/* close the kqueue descriptor unless it is already marked closed (-1);
 * fd is (int *) and is evaluated more than once */
#define FAMClose(fd) \
        ((*(fd)) == -1 ? 0 : close(*(fd)))
171 static int FAMCancelMonitor (const int * const fd, int * const wd)
172 {
173     if (-1 == *fd) return 0;
174     if (-1 == *wd) return 0;
175     struct timespec t0 = { 0, 0 };
176     struct kevent kev;
177     EV_SET(&kev, *wd, EVFILT_VNODE, EV_DELETE, 0, 0, 0);
178     int rc = kevent(*fd, &kev, 1, NULL, 0, &t0);
179     close(*wd);
180     *wd = -1;
181     return rc;
182 }
183 static int FAMMonitorDirectory (int * const fd, char * const fn, int * const wd, void * const userData)
184 {
185     *wd = fdevent_open_dirname(fn, 1); /*(note: follows symlinks)*/
186     if (-1 == *wd) return -1;
187     struct timespec t0 = { 0, 0 };
188     struct kevent kev;
189     unsigned short kev_flags = EV_ADD | EV_ENABLE | EV_CLEAR;
190     unsigned int kev_fflags = NOTE_ATTRIB | NOTE_EXTEND | NOTE_LINK | NOTE_WRITE
191                             | NOTE_DELETE | NOTE_REVOKE | NOTE_RENAME;
192     EV_SET(&kev, *wd, EVFILT_VNODE, kev_flags, kev_fflags, 0, userData);
193     return kevent(*fd, &kev, 1, NULL, 0, &t0);
194 }
/* Event codes passed to stat_cache_handle_fdevent_fn() by this backend.
 * (copied from fam.h to define arbitrary enum values) */
typedef enum FAMCodes {
    FAMChanged = 1,
    FAMDeleted = 2,
    FAMCreated = 5,
    FAMMoved   = 6
} FAMCodes;
201 
202 #else
203 
204 #include <fam.h>
205 
206 #ifdef HAVE_FAMNOEXISTS
207 #ifndef LIGHTTPD_STATIC
208 #include <dlfcn.h>
209 #endif
210 #endif
211 
212 #endif
213 
214 typedef struct fam_dir_entry {
215 	buffer *name;
216 	int refcnt;
217 	FAMRequest req;
218 	time_t stat_ts;
219 	dev_t st_dev;
220 	ino_t st_ino;
221 	struct fam_dir_entry *fam_parent;
222 } fam_dir_entry;
223 
224 typedef struct stat_cache_fam {
225 	splay_tree *dirs; /* indexed by path; node data is fam_dir_entry */
226   #ifdef HAVE_SYS_INOTIFY_H
227 	splay_tree *wds;  /* indexed by inotify watch descriptor */
228   #elif defined HAVE_SYS_EVENT_H && defined HAVE_KQUEUE
229   #else
230 	FAMConnection fam;
231   #endif
232 	log_error_st *errh;
233 	fdevents *ev;
234 	fdnode *fdn;
235 	int fd;
236 } stat_cache_fam;
237 
238 static fam_dir_entry * fam_dir_entry_init(const char *name, size_t len)
239 {
240     fam_dir_entry * const fam_dir = calloc(1, sizeof(*fam_dir));
241     force_assert(NULL != fam_dir);
242 
243     fam_dir->name = buffer_init();
244     buffer_copy_string_len(fam_dir->name, name, len);
245     fam_dir->refcnt = 0;
246   #if defined HAVE_SYS_EVENT_H && defined HAVE_KQUEUE
247     fam_dir->req = -1;
248   #endif
249 
250     return fam_dir;
251 }
252 
253 static void fam_dir_entry_free(fam_dir_entry *fam_dir)
254 {
255     if (!fam_dir) return;
256     /*(fam_dir->parent might be invalid pointer here; ignore)*/
257     buffer_free(fam_dir->name);
258   #if defined HAVE_SYS_EVENT_H && defined HAVE_KQUEUE
259     if (-1 != fam_dir->req)
260         close(fam_dir->req);
261   #endif
262     free(fam_dir);
263 }
264 
265 static void fam_dir_invalidate_node(fam_dir_entry *fam_dir)
266 {
267     fam_dir->stat_ts = 0;
268     if (fam_dir->fam_parent) {
269         --fam_dir->fam_parent->refcnt;
270         fam_dir->fam_parent = NULL;
271     }
272 }
273 
274 /*
275  * walk through splay_tree and collect contents of dir tree.
276  * remove tagged entries in a second loop
277  */
278 
279 static void fam_dir_tag_refcnt(splay_tree *t, int *keys, int *ndx)
280 {
281     if (*ndx == 512) return; /*(must match num array entries in keys[])*/
282     if (t->left)  fam_dir_tag_refcnt(t->left,  keys, ndx);
283     if (t->right) fam_dir_tag_refcnt(t->right, keys, ndx);
284     if (*ndx == 512) return; /*(must match num array entries in keys[])*/
285 
286     fam_dir_entry * const fam_dir = t->data;
287     if (0 == fam_dir->refcnt) {
288         fam_dir_invalidate_node(fam_dir);
289         keys[(*ndx)++] = t->key;
290     }
291 }
292 
293 __attribute_noinline__
294 static void fam_dir_periodic_cleanup() {
295     stat_cache_fam * const scf = sc.scf;
296     int max_ndx, i;
297     int keys[512]; /* 2k size on stack */
298   #if defined HAVE_SYS_EVENT_H && defined HAVE_KQUEUE
299     struct kevent kevl[512]; /* 32k size on stack to batch kevent EV_DELETE */
300   #endif
301     do {
302         if (!scf->dirs) break;
303         max_ndx = 0;
304         fam_dir_tag_refcnt(scf->dirs, keys, &max_ndx);
305         for (i = 0; i < max_ndx; ++i) {
306             const int ndx = keys[i];
307             splay_tree *node = scf->dirs = splaytree_splay(scf->dirs, ndx);
308             if (node && node->key == ndx) {
309                 fam_dir_entry *fam_dir = node->data;
310                 scf->dirs = splaytree_delete(scf->dirs, ndx);
311               #ifdef HAVE_SYS_INOTIFY_H
312                 scf->wds = splaytree_delete(scf->wds, fam_dir->req);
313               #elif defined HAVE_SYS_EVENT_H && defined HAVE_KQUEUE
314                 /* batch process kevent removal; defer cancel */
315                 EV_SET(kevl+i, fam_dir->req, EVFILT_VNODE, EV_DELETE, 0, 0, 0);
316                 fam_dir->req = -1; /*(make FAMCancelMonitor() a no-op)*/
317               #endif
318                 FAMCancelMonitor(&scf->fam, &fam_dir->req);
319                 fam_dir_entry_free(fam_dir);
320             }
321         }
322       #if defined HAVE_SYS_EVENT_H && defined HAVE_KQUEUE
323         /* batch process: kevent() to submit EV_DELETE, then close dir fds */
324         if (0 == max_ndx) break;
325         struct timespec t0 = { 0, 0 };
326         kevent(scf->fd, kevl, max_ndx, NULL, 0, &t0);
327         for (i = 0; i < max_ndx; ++i)
328             close((int)kevl[i].ident);
329       #endif
330     } while (max_ndx == sizeof(keys)/sizeof(int));
331 }
332 
333 static void fam_dir_invalidate_tree(splay_tree *t, const char *name, size_t len)
334 {
335   #ifdef __clang_analyzer__
336     force_assert(name);
337   #endif
338     /*force_assert(t);*/
339     if (t->left)  fam_dir_invalidate_tree(t->left,  name, len);
340     if (t->right) fam_dir_invalidate_tree(t->right, name, len);
341 
342     fam_dir_entry * const fam_dir = t->data;
343   #ifdef __clang_analyzer__
344     force_assert(fam_dir);
345   #endif
346     buffer *b = fam_dir->name;
347     size_t blen = buffer_string_length(b);
348     if (blen > len && b->ptr[len] == '/' && 0 == memcmp(b->ptr, name, len))
349         fam_dir_invalidate_node(fam_dir);
350 }
351 
352 /* declarations */
353 static void stat_cache_delete_tree(const char *name, uint32_t len);
354 static void stat_cache_invalidate_dir_tree(const char *name, size_t len);
355 static void stat_cache_handle_fdevent_fn(stat_cache_fam * const scf, fam_dir_entry * const fam_dir, const char * const fn, const uint32_t fnlen, int code);
356 
/* Drain pending change notifications from the monitoring backend and hand
 * each one to stat_cache_handle_fdevent_fn().
 *
 * Exactly one of three bodies is compiled:
 * - inotify: read() batches of struct inotify_event from scf->fd
 * - kqueue:  kevent() with zero timeout, up to 256 events per call
 * - FAM:     FAMPending() / FAMNextEvent()
 * In every variant an event is skipped unless it can be matched to a
 * current fam_dir_entry whose req equals the event's watch/request. */
357 static void stat_cache_handle_fdevent_in(stat_cache_fam *scf)
358 {
359   #ifdef HAVE_SYS_INOTIFY_H
360     /*(inotify pads in->len to align struct following in->name[])*/
361     char buf[4096]
362       __attribute__ ((__aligned__(__alignof__(struct inotify_event))));
363     int rd;
364     do {
365         rd = (int)read(scf->fd, buf, sizeof(buf));
366         if (rd <= 0) {
            /* EINTR and EAGAIN are not reported; any result <= 0 ends the loop */
367             if (-1 == rd && errno != EINTR && errno != EAGAIN) {
368                 log_perror(scf->errh, __FILE__, __LINE__, "inotify error");
369                 /* TODO: could flush cache, close scf->fd, and re-open inotify*/
370             }
371             break;
372         }
373         for (int i = 0; i < rd; ) {
374             struct inotify_event * const in =
375               (struct inotify_event *)((uintptr_t)buf + i);
            /* advance i to the next event now; the current event remains
             * accessible through in for the rest of this iteration */
376             i += sizeof(struct inotify_event) + in->len;
377             if (in->mask & IN_CREATE)
378                 continue; /*(see comment below for FAMCreated)*/
379             if (in->mask & IN_Q_OVERFLOW) {
380                 log_error(scf->errh, __FILE__, __LINE__,
381                           "inotify queue overflow");
382                 continue;
383             }
384             /* ignore events which may have been pending for
385              * paths recently cancelled via FAMCancelMonitor() */
386             scf->wds = splaytree_splay(scf->wds, in->wd);
387             if (!scf->wds || scf->wds->key != in->wd)
388                 continue;
389             fam_dir_entry *fam_dir = scf->wds->data;
390             if (NULL == fam_dir)        /*(should not happen)*/
391                 continue;
392             if (fam_dir->req != in->wd) /*(should not happen)*/
393                 continue;
394             /*(specific to use here in stat_cache.c)*/
            /* map the inotify mask to a FAM code; code stays 0 (ignored by
             * stat_cache_handle_fdevent_fn()) if no tested bit is set */
395             int code = 0;
396             if (in->mask & (IN_ATTRIB | IN_MODIFY))
397                 code = FAMChanged;
398             else if (in->mask & (IN_DELETE | IN_DELETE_SELF | IN_UNMOUNT))
399                 code = FAMDeleted;
400             else if (in->mask & (IN_MOVE_SELF | IN_MOVED_FROM))
401                 code = FAMMoved;
402 
            /* in->len includes the NUL padding after the name; strip the
             * trailing NULs so that in->len is the string length of in->name
             * (in->len == 0 means the event is for the watched dir itself) */
403             if (in->len) {
404                 do { --in->len; } while (in->len && in->name[in->len-1]=='\0');
405             }
406             stat_cache_handle_fdevent_fn(scf, fam_dir, in->name, in->len, code);
407         }
    /* read again only if the last read left less free space in buf than one
     * maximum-size event needs, i.e. when more events might be pending */
408     } while (rd + sizeof(struct inotify_event) + NAME_MAX + 1 > sizeof(buf));
409   #elif defined HAVE_SYS_EVENT_H && defined HAVE_KQUEUE
410     struct kevent kevl[256];
411     struct timespec t0 = { 0, 0 };
412     int n;
413     do {
        /*(zero timeout: collect what is pending without blocking)*/
414         n = kevent(scf->fd, NULL, 0, kevl, sizeof(kevl)/sizeof(*kevl), &t0);
415         if (n <= 0) break;
416         for (int i = 0; i < n; ++i) {
417             const struct kevent * const kev = kevl+i;
418             /* ignore events which may have been pending for
419              * paths recently cancelled via FAMCancelMonitor() */
            /*(udata is the scf->dirs key passed to FAMMonitorDirectory())*/
420             int ndx = (int)(intptr_t)kev->udata;
421             scf->dirs = splaytree_splay(scf->dirs, ndx);
422             if (!scf->dirs || scf->dirs->key != ndx)
423                 continue;
424             fam_dir_entry *fam_dir = scf->dirs->data;
425             if (fam_dir->req != (int)kev->ident)
426                 continue;
427             /*(specific to use here in stat_cache.c)*/
428             /* note: stat_cache only monitors on directories,
429              *       so events here are only on directories
430              * note: changes are treated as FAMDeleted since
431              *       it is unknown which file in dir was changed
432              *       This is not efficient, but this stat_cache mechanism also
433              *       should not be used on frequently modified directories. */
434             int code = 0;
435             if (kev->fflags & (NOTE_WRITE|NOTE_ATTRIB|NOTE_EXTEND|NOTE_LINK))
436                 code = FAMDeleted; /*(not FAMChanged; see comment above)*/
437             else if (kev->fflags & (NOTE_DELETE|NOTE_REVOKE))
438                 code = FAMDeleted;
439             else if (kev->fflags & NOTE_RENAME)
440                 code = FAMMoved;
441             if (kev->flags & EV_ERROR) /*(not expected; treat as FAMDeleted)*/
442                 code = FAMDeleted;
            /*(no filename is available from kqueue: fn NULL, fnlen 0)*/
443             stat_cache_handle_fdevent_fn(scf, fam_dir, NULL, 0, code);
444         }
    /* a completely filled kevl[] suggests more events may be pending */
445     } while (n == sizeof(kevl)/sizeof(*kevl));
446   #else
    /* i counts down the events FAMPending() reported; FAMPending() is asked
     * again each time i reaches 0, and the loop ends when none are pending */
447     for (int i = 0, ndx; i || (i = FAMPending(&scf->fam)) > 0; --i) {
448         FAMEvent fe;
449         if (FAMNextEvent(&scf->fam, &fe) < 0) break;
450 
451         /* ignore events which may have been pending for
452          * paths recently cancelled via FAMCancelMonitor() */
453         ndx = (int)(intptr_t)fe.userdata;
454         scf->dirs = splaytree_splay(scf->dirs, ndx);
455         if (!scf->dirs || scf->dirs->key != ndx) {
456             continue;
457         }
458         fam_dir_entry *fam_dir = scf->dirs->data;
459         if (FAMREQUEST_GETREQNUM(&fam_dir->req)
460             != FAMREQUEST_GETREQNUM(&fe.fr)) {
461             continue;
462         }
463 
        /* fnlen is 0 for FAMCreated and for absolute filenames
         * NOTE(review): presumably an absolute filename refers to the
         * monitored directory itself rather than an entry in it -- confirm */
464         uint32_t fnlen = (fe.code != FAMCreated && fe.filename[0] != '/')
465           ? (uint32_t)strlen(fe.filename)
466           : 0;
467         stat_cache_handle_fdevent_fn(scf, fam_dir, fe.filename, fnlen, fe.code);
468     }
469   #endif
470 }
471 
472 static void stat_cache_handle_fdevent_fn(stat_cache_fam * const scf, fam_dir_entry *fam_dir, const char * const fn, const uint32_t fnlen, int code)
473 {
474         if (fnlen) {
475             buffer * const n = fam_dir->name;
476             fam_dir_entry *fam_link;
477             uint32_t len;
478             switch (code) {
479             case FAMCreated:
480                 /* file created in monitored dir modifies dir and
481                  * we should get a separate FAMChanged event for dir.
482                  * Therefore, ignore file FAMCreated event here.
483                  * Also, if FAMNoExists() is used, might get spurious
484                  * FAMCreated events as changes are made e.g. in monitored
485                  * sub-sub-sub dirs and the library discovers new (already
486                  * existing) dir entries */
487                 return;
488             case FAMChanged:
489                 /* file changed in monitored dir does not modify dir */
490             case FAMDeleted:
491             case FAMMoved:
492                 /* file deleted or moved in monitored dir modifies dir,
493                  * but FAM provides separate notification for that */
494 
495                 /* temporarily append filename to dir in fam_dir->name to
496                  * construct path, then delete stat_cache entry (if any)*/
497                 len = buffer_string_length(n);
498                 buffer_append_string_len(n, CONST_STR_LEN("/"));
499                 buffer_append_string_len(n, fn, fnlen);
500                 /* (alternatively, could chose to stat() and update)*/
501                 stat_cache_invalidate_entry(CONST_BUF_LEN(n));
502 
503                 fam_link = /*(check if might be symlink to monitored dir)*/
504                   stat_cache_sptree_find(&scf->dirs, CONST_BUF_LEN(n));
505                 if (fam_link && !buffer_is_equal(fam_link->name, n))
506                     fam_link = NULL;
507 
508                 buffer_string_set_length(n, len);
509 
510                 if (fam_link) {
511                     /* replaced symlink changes containing dir */
512                     stat_cache_invalidate_entry(CONST_BUF_LEN(n));
513                     /* handle symlink to dir as deleted dir below */
514                     code = FAMDeleted;
515                     fam_dir = fam_link;
516                     break;
517                 }
518                 return;
519             default:
520                 return;
521             }
522         }
523 
524         switch(code) {
525         case FAMChanged:
526             stat_cache_invalidate_entry(CONST_BUF_LEN(fam_dir->name));
527             break;
528         case FAMDeleted:
529         case FAMMoved:
530             stat_cache_delete_tree(CONST_BUF_LEN(fam_dir->name));
531             fam_dir_invalidate_node(fam_dir);
532             if (scf->dirs)
533                 fam_dir_invalidate_tree(scf->dirs,CONST_BUF_LEN(fam_dir->name));
534             fam_dir_periodic_cleanup();
535             break;
536         default:
537             break;
538         }
539 }
540 
541 static handler_t stat_cache_handle_fdevent(void *ctx, int revent)
542 {
543 	stat_cache_fam * const scf = ctx; /* sc.scf */
544 
545 	if (revent & FDEVENT_IN) {
546 		stat_cache_handle_fdevent_in(scf);
547 	}
548 
549 	if (revent & (FDEVENT_HUP|FDEVENT_RDHUP)) {
550 		/* fam closed the connection */
551 		log_error(scf->errh, __FILE__, __LINE__,
552 		  "FAM connection closed; disabling stat_cache.");
553 		/* (although effectively STAT_CACHE_ENGINE_NONE,
554 		 *  do not change here so that periodic jobs clean up memory)*/
555 		/*sc.stat_cache_engine = STAT_CACHE_ENGINE_NONE; */
556 		fdevent_fdnode_event_del(scf->ev, scf->fdn);
557 		fdevent_unregister(scf->ev, scf->fd);
558 		scf->fdn = NULL;
559 
560 		FAMClose(&scf->fam);
561 		scf->fd = -1;
562 	}
563 
564 	return HANDLER_GO_ON;
565 }
566 
567 static stat_cache_fam * stat_cache_init_fam(fdevents *ev, log_error_st *errh) {
568 	stat_cache_fam *scf = calloc(1, sizeof(*scf));
569 	force_assert(scf);
570 	scf->fd = -1;
571 	scf->ev = ev;
572 	scf->errh = errh;
573 
574   #ifdef HAVE_SYS_INOTIFY_H
575 	scf->fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
576 	if (scf->fd < 0) {
577 		log_perror(errh, __FILE__, __LINE__, "inotify_init1()");
578 		return NULL;
579 	}
580   #elif defined HAVE_SYS_EVENT_H && defined HAVE_KQUEUE
581    #ifdef __NetBSD__
582 	scf->fd = kqueue1(O_NONBLOCK|O_CLOEXEC|O_NOSIGPIPE);
583    #else
584 	scf->fd = kqueue();
585 	if (scf->fd >= 0) fdevent_setfd_cloexec(scf->fd);
586    #endif
587 	if (scf->fd < 0) {
588 		log_perror(errh, __FILE__, __LINE__, "kqueue()");
589 		return NULL;
590 	}
591   #else
592 	/* setup FAM */
593 	if (0 != FAMOpen2(&scf->fam, "lighttpd")) {
594 		log_error(errh, __FILE__, __LINE__,
595 		  "could not open a fam connection, dying.");
596 		return NULL;
597 	}
598       #ifdef HAVE_FAMNOEXISTS
599       #ifdef LIGHTTPD_STATIC
600 	FAMNoExists(&scf->fam);
601       #else
602 	int (*FAMNoExists_fn)(FAMConnection *);
603 	FAMNoExists_fn =
604 	  (int (*)(FAMConnection *))(intptr_t)dlsym(RTLD_DEFAULT,"FAMNoExists");
605 	if (FAMNoExists_fn) FAMNoExists_fn(&scf->fam);
606       #endif
607       #endif
608 
609 	scf->fd = FAMCONNECTION_GETFD(&scf->fam);
610 	fdevent_setfd_cloexec(scf->fd);
611   #endif
612 	scf->fdn = fdevent_register(scf->ev, scf->fd, stat_cache_handle_fdevent, scf);
613 	fdevent_fdnode_event_set(scf->ev, scf->fdn, FDEVENT_IN | FDEVENT_RDHUP);
614 
615 	return scf;
616 }
617 
618 static void stat_cache_free_fam(stat_cache_fam *scf) {
619 	if (NULL == scf) return;
620 
621       #ifdef HAVE_SYS_INOTIFY_H
622 	while (scf->wds) {
623 		splay_tree *node = scf->wds;
624 		scf->wds = splaytree_delete(scf->wds, node->key);
625 	}
626       #elif defined HAVE_SYS_EVENT_H && defined HAVE_KQUEUE
627 	/*(quicker cleanup to close kqueue() before cancel per entry)*/
628 	close(scf->fd);
629 	scf->fd = -1;
630       #endif
631 	while (scf->dirs) {
632 		/*(skip entry invalidation and FAMCancelMonitor())*/
633 		splay_tree *node = scf->dirs;
634 		fam_dir_entry_free((fam_dir_entry *)node->data);
635 		scf->dirs = splaytree_delete(scf->dirs, node->key);
636 	}
637 
638 	if (-1 != scf->fd) {
639 		/*scf->fdn already cleaned up in fdevent_free()*/
640 		FAMClose(&scf->fam);
641 		/*scf->fd = -1;*/
642 	}
643 
644 	free(scf);
645 }
646 
/* Ensure the directory relevant to path fn is being monitored and return
 * its fam_dir_entry with refcnt incremented.
 *
 * scf     monitor state
 * fn      path; must be writable since a NUL is temporarily stored at
 *         fn[dirlen] (and restored) around lstat()/stat() calls
 * dirlen  length of the path in fn; must be non-zero (fn[dirlen-1] is read)
 * st      stat() result for fn.  If fn is a directory, that directory is
 *         monitored; otherwise the containing directory (fn up to the last
 *         '/') is monitored.  NOTE: *st may be overwritten with the stat()
 *         of the containing directory when that directory is a symlink.
 *
 * Returns NULL if the backend connection is closed, on hash collision with
 * an already monitored directory, or if lstat()/stat()/monitor setup fails.
 *
 * If the monitored path is a symlink, the directory containing the symlink
 * is monitored too (recursive call) and referenced through fam_parent. */
647 static fam_dir_entry * fam_dir_monitor(stat_cache_fam *scf, char *fn, uint32_t dirlen, struct stat *st)
648 {
649     if (NULL == scf->fdn) return NULL; /* FAM connection closed; do nothing */
650     const int fn_is_dir = S_ISDIR(st->st_mode);
651     /*force_assert(0 != dirlen);*/
652     /*force_assert(fn[0] == '/');*/
653     /* consistency: ensure fn does not end in '/' unless root "/"
654      * FAM events will not end in '/', so easier to match this way */
655     if (fn[dirlen-1] == '/') --dirlen;
656     if (0 == dirlen) dirlen = 1; /* root dir ("/") */
657     /* Note: paths are expected to be normalized before calling stat_cache,
658      * e.g. without repeated '/' */
659     if (!fn_is_dir) {
        /*(back up to the last '/': fn[0..dirlen) becomes the containing dir)*/
660         while (fn[--dirlen] != '/') ;
661         if (0 == dirlen) dirlen = 1; /*(should not happen for file)*/
662     }
663     int dir_ndx = splaytree_djbhash(fn, dirlen);
664     fam_dir_entry *fam_dir = NULL;
665 
666     scf->dirs = splaytree_splay(scf->dirs, dir_ndx);
667     if (NULL != scf->dirs && scf->dirs->key == dir_ndx) {
668         fam_dir = scf->dirs->data;
669         if (!buffer_is_equal_string(fam_dir->name, fn, dirlen)) {
670             /* hash collision; preserve existing
671              * do not monitor new to avoid cache thrashing */
672             return NULL;
673         }
674         /* directory already registered */
675     }
676 
677     const time_t cur_ts = log_epoch_secs;
678     struct stat lst;
679     int ck_dir = fn_is_dir;
    /* for a non-directory fn, (re)check the containing directory if it is not
     * yet monitored or was last checked 16 or more seconds ago */
680     if (!fn_is_dir && (NULL==fam_dir || cur_ts - fam_dir->stat_ts >= 16)) {
681         ck_dir = 1;
682         /*(temporarily modify fn)*/
683         fn[dirlen] = '\0';
684         if (0 != lstat(fn, &lst)) {
685             fn[dirlen] = '/';
686             return NULL;
687         }
        /* containing dir is not a symlink: lstat() result describes the dir;
         * else stat() through the symlink (overwriting caller's *st) */
688         if (!S_ISLNK(lst.st_mode)) {
689             st = &lst;
690         }
691         else if (0 != stat(fn, st)) { /*st passed in now is stat() of dir*/
692             fn[dirlen] = '/';
693             return NULL;
694         }
695         fn[dirlen] = '/';
696     }
697 
    /* the symlink/parent relation is (re)evaluated for new entries and for
     * entries whose device/inode turned out to have changed (set below) */
698     int ck_lnk = (NULL == fam_dir);
699     if (ck_dir && NULL != fam_dir) {
700         /* check stat() matches device and inode, just in case an external event
701          * not being monitored occurs (e.g. rename of unmonitored parent dir)*/
702         if (st->st_dev != fam_dir->st_dev || st->st_ino != fam_dir->st_ino) {
703             ck_lnk = 1;
704             /*(modifies scf->dirs but no need to re-splay for dir_ndx since
705              * fam_dir is not NULL and so splaytree_insert not called below)*/
706             if (scf->dirs) fam_dir_invalidate_tree(scf->dirs, fn, dirlen);
707             if (!fn_is_dir) /*(if dir, caller is updating stat_cache_entry)*/
708                 stat_cache_update_entry(fn, dirlen, st, NULL);
709             /*(must not delete tree since caller is holding a valid node)*/
710             stat_cache_invalidate_dir_tree(fn, dirlen);
711           #ifdef HAVE_SYS_INOTIFY_H
712             scf->wds = splaytree_delete(scf->wds, fam_dir->req);
713           #endif
            /* replace the monitor: cancel the old one, then monitor the path
             * again; on failure the entry is left invalidated (stat_ts 0) */
714             if (0 != FAMCancelMonitor(&scf->fam, &fam_dir->req)
715                 || 0 != FAMMonitorDirectory(&scf->fam, fam_dir->name->ptr,
716                                             &fam_dir->req,
717                                             (void *)(intptr_t)dir_ndx)) {
718                 fam_dir->stat_ts = 0; /* invalidate */
719                 return NULL;
720             }
721             fam_dir->st_dev = st->st_dev;
722             fam_dir->st_ino = st->st_ino;
723           #ifdef HAVE_SYS_INOTIFY_H
724             scf->wds = splaytree_insert(scf->wds, fam_dir->req, fam_dir);
725           #endif
726         }
727         fam_dir->stat_ts = cur_ts;
728     }
729 
730     if (NULL == fam_dir) {
        /* directory not monitored yet: create entry and start monitoring;
         * dir_ndx is passed as user data so events can be mapped back */
731         fam_dir = fam_dir_entry_init(fn, dirlen);
732 
733         if (0 != FAMMonitorDirectory(&scf->fam,fam_dir->name->ptr,&fam_dir->req,
734                                      (void *)(intptr_t)dir_ndx)) {
735           #if defined(HAVE_SYS_INOTIFY_H) \
736            || (defined HAVE_SYS_EVENT_H && defined HAVE_KQUEUE)
737             log_perror(scf->errh, __FILE__, __LINE__,
738               "monitoring dir failed: %s file: %s",
739               fam_dir->name->ptr, fn);
740           #else
741             log_error(scf->errh, __FILE__, __LINE__,
742               "monitoring dir failed: %s file: %s %s",
743               fam_dir->name->ptr, fn, FamErrlist[FAMErrno]);
744           #endif
745             fam_dir_entry_free(fam_dir);
746             return NULL;
747         }
748 
749         scf->dirs = splaytree_insert(scf->dirs, dir_ndx, fam_dir);
750       #ifdef HAVE_SYS_INOTIFY_H
751         scf->wds = splaytree_insert(scf->wds, fam_dir->req, fam_dir);
752       #endif
753         fam_dir->stat_ts= cur_ts;
754         fam_dir->st_dev = st->st_dev;
755         fam_dir->st_ino = st->st_ino;
756     }
757 
758     if (ck_lnk) {
        /* lst is needed below; for a non-directory fn it was filled in by the
         * lstat() of the containing dir above, so only the directory case
         * has to lstat() here */
759         if (fn_is_dir) {
760             /*(temporarily modify fn)*/
761             char e = fn[dirlen];
762             fn[dirlen] = '\0';
763             if (0 != lstat(fn, &lst)) {
764                 fn[dirlen] = e;
765                 return NULL;
766             }
767             fn[dirlen] = e;
768         }
        /* drop reference on previous parent (if any) before re-evaluating */
769         if (fam_dir->fam_parent) {
770             --fam_dir->fam_parent->refcnt;
771             fam_dir->fam_parent = NULL;
772         }
        /* monitored path is a symlink: also monitor the directory containing
         * the symlink (lst is not a directory, so the recursive call backs
         * up to the containing dir) and keep the referenced entry */
773         if (S_ISLNK(lst.st_mode)) {
774             fam_dir->fam_parent = fam_dir_monitor(scf, fn, dirlen, &lst);
775         }
776     }
777 
778     ++fam_dir->refcnt;
779     return fam_dir;
780 }
781 
782 #endif
783 
784 
785 static stat_cache_entry * stat_cache_entry_init(void) {
786     stat_cache_entry *sce = calloc(1, sizeof(*sce));
787     force_assert(NULL != sce);
788     sce->fd = -1;
789     sce->refcnt = 1;
790     return sce;
791 }
792 
793 static void stat_cache_entry_free(void *data) {
794     stat_cache_entry *sce = data;
795     if (!sce) return;
796 
797     if (--sce->refcnt) return;
798 
799   #ifdef HAVE_FAM_H
800     /*(decrement refcnt only;
801      * defer cancelling FAM monitor on dir even if refcnt reaches zero)*/
802     if (sce->fam_dir) --((fam_dir_entry *)sce->fam_dir)->refcnt;
803   #endif
804 
805     free(sce->name.ptr);
806     free(sce->etag.ptr);
807     if (sce->content_type.size) free(sce->content_type.ptr);
808     if (sce->fd >= 0) close(sce->fd);
809 
810     free(sce);
811 }
812 
813 void stat_cache_entry_refchg(void *data, int mod) {
814     /*(expect mod == -1 or mod == 1)*/
815     stat_cache_entry * const sce = data;
816     if (mod < 0 && 1 == sce->refcnt)
817         stat_cache_entry_free(data);
818     else
819         sce->refcnt += mod;
820 }
821 
822 #if defined(HAVE_XATTR) || defined(HAVE_EXTATTR)
823 
/* name of the extended attribute queried for a file's mimetype
 * (replaced by stat_cache_xattrname(); reset in stat_cache_free()) */
static const char *attrname = "Content-Type";
/* scratch storage receiving the attribute value, NUL-terminated by
 * stat_cache_attr_get() */
static char attrval[128];
/* buffer wrapping attrval; .used is filled in by stat_cache_attr_get()
 * and .size stays 0 */
static buffer attrb = { attrval, 0, 0 };
827 
828 static int stat_cache_attr_get(const char *name) {
829   #if defined(HAVE_XATTR)
830    #if defined(HAVE_SYS_XATTR_H)
831     ssize_t attrlen;
832     if (0 < (attrlen = getxattr(name, attrname,
833                                 attrval, sizeof(attrval)-1)))
834    #else
835     int attrlen = sizeof(attrval)-1;
836     if (0 == attr_get(name, attrname, attrval, &attrlen, 0))
837    #endif
838   #elif defined(HAVE_EXTATTR)
839     ssize_t attrlen;
840     if (0 < (attrlen = extattr_get_file(name, EXTATTR_NAMESPACE_USER, attrname,
841                                         attrval, sizeof(attrval)-1)))
842   #endif
843     {
844         attrval[attrlen] = '\0';
845         attrb.used = (uint32_t)(attrlen + 1);
846         return 1;
847     }
848     return 0;
849 }
850 
851 #endif
852 
853 int stat_cache_init(fdevents *ev, log_error_st *errh) {
854   #ifdef HAVE_FAM_H
855     if (sc.stat_cache_engine == STAT_CACHE_ENGINE_FAM) {
856         sc.scf = stat_cache_init_fam(ev, errh);
857         if (NULL == sc.scf) return 0;
858     }
859   #else
860     UNUSED(ev);
861     UNUSED(errh);
862   #endif
863 
864     return 1;
865 }
866 
867 void stat_cache_free(void) {
868     splay_tree *sptree = sc.files;
869     while (sptree) {
870         stat_cache_entry_free(sptree->data);
871         sptree = splaytree_delete(sptree, sptree->key);
872     }
873     sc.files = NULL;
874 
875   #ifdef HAVE_FAM_H
876     stat_cache_free_fam(sc.scf);
877     sc.scf = NULL;
878   #endif
879 
880   #if defined(HAVE_XATTR) || defined(HAVE_EXTATTR)
881     attrname = "Content-Type";
882   #endif
883 
884     sc.stat_cache_engine = STAT_CACHE_ENGINE_SIMPLE; /*(default)*/
885 }
886 
/* Select the extended attribute name used for mimetype lookups.
 * Only the pointer is stored (the string is not copied), so `name` must
 * remain valid for as long as it is in use.
 * No effect when built without extended attribute support. */
void stat_cache_xattrname (const char *name) {
  #if !defined(HAVE_XATTR) && !defined(HAVE_EXTATTR)
    UNUSED(name);
  #else
    attrname = name;
  #endif
}
894 
895 int stat_cache_choose_engine (const buffer *stat_cache_string, log_error_st *errh) {
896     if (buffer_string_is_empty(stat_cache_string))
897         sc.stat_cache_engine = STAT_CACHE_ENGINE_SIMPLE;
898     else if (buffer_eq_slen(stat_cache_string, CONST_STR_LEN("simple")))
899         sc.stat_cache_engine = STAT_CACHE_ENGINE_SIMPLE;
900 #ifdef HAVE_SYS_INOTIFY_H
901     else if (buffer_eq_slen(stat_cache_string, CONST_STR_LEN("inotify")))
902         sc.stat_cache_engine = STAT_CACHE_ENGINE_INOTIFY;
903         /*(STAT_CACHE_ENGINE_FAM == STAT_CACHE_ENGINE_INOTIFY)*/
904 #elif defined HAVE_SYS_EVENT_H && defined HAVE_KQUEUE
905     else if (buffer_eq_slen(stat_cache_string, CONST_STR_LEN("kqueue")))
906         sc.stat_cache_engine = STAT_CACHE_ENGINE_KQUEUE;
907         /*(STAT_CACHE_ENGINE_FAM == STAT_CACHE_ENGINE_KQUEUE)*/
908 #endif
909 #ifdef HAVE_FAM_H
910     else if (buffer_eq_slen(stat_cache_string, CONST_STR_LEN("fam")))
911         sc.stat_cache_engine = STAT_CACHE_ENGINE_FAM;
912 #endif
913     else if (buffer_eq_slen(stat_cache_string, CONST_STR_LEN("disable")))
914         sc.stat_cache_engine = STAT_CACHE_ENGINE_NONE;
915     else {
916         log_error(errh, __FILE__, __LINE__,
917           "server.stat-cache-engine can be one of \"disable\", \"simple\","
918 #ifdef HAVE_SYS_INOTIFY_H
919           " \"inotify\","
920 #elif defined HAVE_SYS_EVENT_H && defined HAVE_KQUEUE
921           " \"kqueue\","
922 #endif
923 #ifdef HAVE_FAM_H
924           " \"fam\","
925 #endif
926           " but not: %s", stat_cache_string->ptr);
927         return -1;
928     }
929     return 0;
930 }
931 
/* Find the mimetype configured for a file name.
 *
 * mimetypes: array whose elements are read as (data_string *) key/value
 *            pairs (key: name or extension, value: mimetype)
 * name,nlen: file name or path and its length
 *
 * Two strategies, chosen by the number of configured entries:
 *  - fewer than 16: scan in order and return the first entry whose key
 *    equals the tail of name per buffer_eq_icase_ssn()
 *  - otherwise: keyed lookups, from most to least specific (see below)
 *
 * Returns a pointer to the matching value inside mimetypes, or NULL. */
const buffer * stat_cache_mimetype_by_ext(const array * const mimetypes, const char * const name, const uint32_t nlen)
{
    const char * const end = name + nlen; /*(end of string)*/
    const uint32_t used = mimetypes->used;
    if (used < 16) {
        for (uint32_t i = 0; i < used; ++i) {
            /* suffix match */
            /*(a zero-length key passes the length test for any name)*/
            const data_string *ds = (data_string *)mimetypes->data[i];
            const size_t klen = buffer_string_length(&ds->key);
            if (klen <= nlen && buffer_eq_icase_ssn(end-klen, ds->key.ptr, klen))
                return &ds->value;
        }
    }
    else {
        const char *s;
        const data_string *ds;
        if (nlen) {
            /* scan backwards to position s at the start of the basename */
            for (s = end-1; s != name && *s != '/'; --s) ; /*(like memrchr())*/
            if (*s == '/') ++s;
        }
        else {
            s = name;
        }
        /* search for basename, then longest .ext2.ext1, then .ext1, then "" */
        ds = (const data_string *)array_get_element_klen(mimetypes, s, end - s);
        if (NULL != ds) return &ds->value;
        /*(pre-increment skips the first basename char, so a leading '.'
         * in the basename does not start an extension)*/
        while (++s < end) {
            /* advance s to the next '.', or to end if there is none */
            while (*s != '.' && ++s != end) ;
            if (s == end) break;
            /* search ".ext" then "ext" */
            ds = (const data_string *)array_get_element_klen(mimetypes, s, end - s);
            if (NULL != ds) return &ds->value;
            /* repeat search without leading '.' to handle situation where
             * admin configured mimetype.assign keys without leading '.' */
            if (++s < end) {
                /*(consecutive dots: step back so the outer loop's increment
                 * lands on the second '.' and restarts the search there)*/
                if (*s == '.') { --s; continue; }
                ds = (const data_string *)array_get_element_klen(mimetypes, s, end - s);
                if (NULL != ds) return &ds->value;
            }
        }
        /* search for ""; catchall */
        ds = (const data_string *)array_get_element_klen(mimetypes, CONST_STR_LEN(""));
        if (NULL != ds) return &ds->value;
    }

    return NULL;
}
979 
980 #if defined(HAVE_XATTR) || defined(HAVE_EXTATTR)
981 
982 const buffer * stat_cache_mimetype_by_xattr(const char * const name)
983 {
984     return stat_cache_attr_get(name) ? &attrb : NULL;
985 }
986 
987 const buffer * stat_cache_content_type_get_by_xattr(stat_cache_entry *sce, const array *mimetypes, int use_xattr)
988 {
989     /*(invalid caching if user config has multiple, different
990      * r->conf.mimetypes for same extension (not expected))*/
991     if (!buffer_string_is_empty(&sce->content_type)) return &sce->content_type;
992 
993     if (!S_ISREG(sce->st.st_mode)) return NULL;
994 
995     /* cache mimetype */
996     const buffer *mtype =
997       (use_xattr) ? stat_cache_mimetype_by_xattr(sce->name.ptr) : NULL;
998     if (NULL == mtype)
999         mtype = stat_cache_mimetype_by_ext(mimetypes,CONST_BUF_LEN(&sce->name));
1000     if (NULL != mtype) {
1001         if (sce->content_type.size) {
1002             buffer_copy_buffer(&sce->content_type, mtype);
1003         }
1004         else if (mtype == &attrb) {
1005             sce->content_type.ptr = NULL;
1006             buffer_copy_buffer(&sce->content_type, mtype);
1007         }
1008         else {
1009             /*(copy pointers from mimetypes array; avoid allocation)*/
1010             sce->content_type.ptr = mtype->ptr;
1011             sce->content_type.used = mtype->used;
1012             /*(leave sce->content_type.size = 0 to flag not-allocated)*/
1013         }
1014     }
1015     else
1016         buffer_clear(&sce->content_type);
1017 
1018     return &sce->content_type;
1019 }
1020 
1021 #else
1022 
1023 const buffer * stat_cache_content_type_get_by_ext(stat_cache_entry *sce, const array *mimetypes)
1024 {
1025     /*(invalid caching if user config has multiple, different
1026      * r->conf.mimetypes for same extension (not expected))*/
1027     if (!buffer_string_is_empty(&sce->content_type)) return &sce->content_type;
1028 
1029     if (!S_ISREG(sce->st.st_mode)) return NULL;
1030 
1031     /* cache mimetype */
1032     const buffer * const mtype =
1033       stat_cache_mimetype_by_ext(mimetypes, CONST_BUF_LEN(&sce->name));
1034     if (NULL != mtype) {
1035         /*(copy pointers from mimetypes array; avoid allocation)*/
1036         sce->content_type.ptr = mtype->ptr;
1037         sce->content_type.used = mtype->used;
1038         /*(leave sce->content_type.size = 0 to flag not-allocated)*/
1039     }
1040     else
1041         buffer_clear(&sce->content_type);
1042 
1043     return &sce->content_type;
1044 }
1045 
1046 #endif
1047 
1048 const buffer * stat_cache_etag_get(stat_cache_entry *sce, int flags) {
1049     /*(invalid caching if user cfg has multiple, different r->conf.etag_flags
1050      * for same path (not expected, since etag flags should be by filesystem))*/
1051     if (!buffer_string_is_empty(&sce->etag)) return &sce->etag;
1052 
1053     if (S_ISREG(sce->st.st_mode) || S_ISDIR(sce->st.st_mode)) {
1054         if (0 == flags) return NULL;
1055         etag_create(&sce->etag, &sce->st, flags);
1056         return &sce->etag;
1057     }
1058 
1059     return NULL;
1060 }
1061 
/* Compare the fields of two stat results used to decide whether a file has
 * changed: modification time (including the nanosecond part when struct
 * stat exposes it), size, inode and device.
 * Returns non-zero if all compared fields are equal, else 0. */
__attribute_pure__
static int stat_cache_stat_eq(const struct stat * const sta, const struct stat * const stb) {
    /* Each optional operand carries its own trailing "&&" so that the
     * expression stays well-formed when st_mtime is not a macro and the
     * high-precision comparison is compiled out. */
    return
      #ifdef st_mtime /* use high-precision timestamp if available */
      #if defined(__APPLE__) && defined(__MACH__)
        sta->st_mtimespec.tv_nsec == stb->st_mtimespec.tv_nsec &&
      #else
        sta->st_mtim.tv_nsec == stb->st_mtim.tv_nsec &&
      #endif
      #endif
           sta->st_mtime == stb->st_mtime
        && sta->st_size  == stb->st_size
        && sta->st_ino   == stb->st_ino
        && sta->st_dev   == stb->st_dev;
}
1077 
1078 void stat_cache_update_entry(const char *name, uint32_t len,
1079                              struct stat *st, buffer *etagb)
1080 {
1081     if (sc.stat_cache_engine == STAT_CACHE_ENGINE_NONE) return;
1082     force_assert(0 != len);
1083     if (name[len-1] == '/') { if (0 == --len) len = 1; }
1084     splay_tree **sptree = &sc.files;
1085     stat_cache_entry *sce =
1086       stat_cache_sptree_find(sptree, name, len);
1087     if (sce && buffer_is_equal_string(&sce->name, name, len)) {
1088         if (!stat_cache_stat_eq(&sce->st, st)) {
1089             /* etagb might be NULL to clear etag (invalidate) */
1090             buffer_copy_string_len(&sce->etag, CONST_BUF_LEN(etagb));
1091           #if defined(HAVE_XATTR) || defined(HAVE_EXTATTR)
1092             buffer_clear(&sce->content_type);
1093           #endif
1094             if (sce->fd >= 0) {
1095                 if (1 == sce->refcnt) {
1096                     close(sce->fd);
1097                     sce->fd = -1;
1098                 }
1099                 else {
1100                     --sce->refcnt; /* stat_cache_entry_free(sce); */
1101                     (*sptree)->data = sce = stat_cache_entry_init();
1102                     buffer_copy_string_len(&sce->name, name, len);
1103                 }
1104             }
1105             sce->st = *st;
1106         }
1107         sce->stat_ts = log_epoch_secs;
1108     }
1109 }
1110 
1111 void stat_cache_delete_entry(const char *name, uint32_t len)
1112 {
1113     if (sc.stat_cache_engine == STAT_CACHE_ENGINE_NONE) return;
1114     force_assert(0 != len);
1115     if (name[len-1] == '/') { if (0 == --len) len = 1; }
1116     splay_tree **sptree = &sc.files;
1117     stat_cache_entry *sce = stat_cache_sptree_find(sptree, name, len);
1118     if (sce && buffer_is_equal_string(&sce->name, name, len)) {
1119         stat_cache_entry_free(sce);
1120         *sptree = splaytree_delete(*sptree, (*sptree)->key);
1121     }
1122 }
1123 
1124 void stat_cache_invalidate_entry(const char *name, uint32_t len)
1125 {
1126     splay_tree **sptree = &sc.files;
1127     stat_cache_entry *sce = stat_cache_sptree_find(sptree, name, len);
1128     if (sce && buffer_is_equal_string(&sce->name, name, len)) {
1129         sce->stat_ts = 0;
1130       #ifdef HAVE_FAM_H
1131         if (sce->fam_dir != NULL) {
1132             --((fam_dir_entry *)sce->fam_dir)->refcnt;
1133             sce->fam_dir = NULL;
1134         }
1135       #endif
1136     }
1137 }
1138 
1139 #ifdef HAVE_FAM_H
1140 
1141 static void stat_cache_invalidate_dir_tree_walk(splay_tree *t,
1142                                                 const char *name, size_t len)
1143 {
1144     if (t->left)  stat_cache_invalidate_dir_tree_walk(t->left,  name, len);
1145     if (t->right) stat_cache_invalidate_dir_tree_walk(t->right, name, len);
1146 
1147     buffer *b = &((stat_cache_entry *)t->data)->name;
1148     size_t blen = buffer_string_length(b);
1149     if (blen > len && b->ptr[len] == '/' && 0 == memcmp(b->ptr, name, len)) {
1150         stat_cache_entry *sce = t->data;
1151         sce->stat_ts = 0;
1152         if (sce->fam_dir != NULL) {
1153             --((fam_dir_entry *)sce->fam_dir)->refcnt;
1154             sce->fam_dir = NULL;
1155         }
1156     }
1157 }
1158 
1159 static void stat_cache_invalidate_dir_tree(const char *name, size_t len)
1160 {
1161     splay_tree * const sptree = sc.files;
1162     if (sptree) stat_cache_invalidate_dir_tree_walk(sptree, name, len);
1163 }
1164 
1165 #endif
1166 
/*
 * walk through splay_tree and collect contents of dir tree.
 * remove tagged entries in a second loop
 */
1171 
1172 static void stat_cache_tag_dir_tree(splay_tree *t, const char *name, size_t len,
1173                                     int *keys, int *ndx)
1174 {
1175     if (*ndx == 8192) return; /*(must match num array entries in keys[])*/
1176     if (t->left)  stat_cache_tag_dir_tree(t->left,  name, len, keys, ndx);
1177     if (t->right) stat_cache_tag_dir_tree(t->right, name, len, keys, ndx);
1178     if (*ndx == 8192) return; /*(must match num array entries in keys[])*/
1179 
1180     buffer *b = &((stat_cache_entry *)t->data)->name;
1181     size_t blen = buffer_string_length(b);
1182     if (blen > len && b->ptr[len] == '/' && 0 == memcmp(b->ptr, name, len))
1183         keys[(*ndx)++] = t->key;
1184 }
1185 
1186 __attribute_noinline__
1187 static void stat_cache_prune_dir_tree(const char *name, size_t len)
1188 {
1189     splay_tree *sptree = sc.files;
1190     int max_ndx, i;
1191     int keys[8192]; /* 32k size on stack */
1192     do {
1193         if (!sptree) break;
1194         max_ndx = 0;
1195         stat_cache_tag_dir_tree(sptree, name, len, keys, &max_ndx);
1196         for (i = 0; i < max_ndx; ++i) {
1197             const int ndx = keys[i];
1198             splay_tree *node = sptree = splaytree_splay(sptree, ndx);
1199             if (node && node->key == ndx) {
1200                 stat_cache_entry_free(node->data);
1201                 sptree = splaytree_delete(sptree, ndx);
1202             }
1203         }
1204     } while (max_ndx == sizeof(keys)/sizeof(int));
1205     sc.files = sptree;
1206 }
1207 
/* Remove the cache entry for path `name` itself and then all cached
 * entries located underneath it. */
static void stat_cache_delete_tree(const char *name, uint32_t len)
{
    /* the entry for the path itself */
    stat_cache_delete_entry(name, len);
    /* everything cached below "name/" */
    stat_cache_prune_dir_tree(name, len);
}
1213 
/* Forget everything cached for a directory and its contents.
 * name,len: directory path (one trailing '/' is ignored, except for "/")
 * With the FAM-style engine active, monitoring state for the directory and
 * for directories beneath it is invalidated as well, followed by a
 * monitoring cleanup pass. */
void stat_cache_delete_dir(const char *name, uint32_t len)
{
    force_assert(0 != len);
    if (name[len-1] == '/' && 0 == --len) len = 1;
    stat_cache_delete_tree(name, len);
  #ifdef HAVE_FAM_H
    if (sc.stat_cache_engine != STAT_CACHE_ENGINE_FAM) return;

    splay_tree ** const dirs = &sc.scf->dirs;
    fam_dir_entry * const dir = stat_cache_sptree_find(dirs, name, len);
    /*(hash lookup may yield a different path with the same key)*/
    if (NULL != dir && buffer_is_equal_string(dir->name, name, len))
        fam_dir_invalidate_node(dir);
    if (NULL != *dirs) fam_dir_invalidate_tree(*dirs, name, len);
    fam_dir_periodic_cleanup();
  #endif
}
1230 
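/***
 * look up the stat_cache entry for a path, calling stat() to create or
 * refresh the entry when no sufficiently recent cached result exists
 *
 * returns:
 *  - pointer to the cache entry on success
 *  - NULL on failure -> see errno for problem
 *    (EINVAL if path is not absolute, ENOTDIR, or errno from stat())
 */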
1239 
/* Return the cache entry for path `name`, creating or refreshing it with
 * stat() unless a sufficiently recent cached result is available.
 * Entries are keyed by a hash of the path with one trailing '/' removed
 * (except for "/" itself).  Returns NULL with errno set on failure. */
stat_cache_entry * stat_cache_get_entry(const buffer *name) {
	stat_cache_entry *sce = NULL;
	struct stat st;
	int file_ndx;

	/* consistency: ensure lookup name does not end in '/' unless root "/"
	 * (but use full path given with stat(), even with trailing '/') */
	int final_slash = 0;
	size_t len = buffer_string_length(name);
	force_assert(0 != len);
	if (name->ptr[len-1] == '/') { final_slash = 1; if (0 == --len) len = 1; }
	/* Note: paths are expected to be normalized before calling stat_cache,
	 * e.g. without repeated '/' */

	/* only absolute paths are accepted */
	if (name->ptr[0] != '/') {
		errno = EINVAL;
		return NULL;
	}

	/*
	 * look up the path in the cache: splay the tree on the hash of the
	 * (slash-stripped) name so that a matching node, if any, is the root
	 */

	const time_t cur_ts = log_epoch_secs;

	file_ndx = splaytree_djbhash(name->ptr, len);
	splay_tree *sptree = sc.files = splaytree_splay(sc.files, file_ndx);

	if (sptree && (sptree->key == file_ndx)) {
		/* we have seen this file already and
		 * don't stat() it again in the same second */

		sce = sptree->data;

		/* check if the name is the same, we might have a collision */

		if (buffer_is_equal_string(&sce->name, name->ptr, len)) {
			if (sc.stat_cache_engine == STAT_CACHE_ENGINE_SIMPLE) {
				/* simple engine: cached result is valid only within
				 * the second in which it was obtained */
				if (sce->stat_ts == cur_ts) {
					/* path given with trailing '/' must be a dir */
					if (final_slash && !S_ISDIR(sce->st.st_mode)) {
						errno = ENOTDIR;
						return NULL;
					}
					return sce;
				}
			}
		      #ifdef HAVE_FAM_H
			else if (sc.stat_cache_engine == STAT_CACHE_ENGINE_FAM
				 && sce->fam_dir) { /* entry is in monitored dir */
				/* re-stat() periodically, even if monitoring for changes
				 * (due to limitations in stat_cache.c use of FAM)
				 * (gaps due to not continually monitoring an entire tree) */
				if (cur_ts - sce->stat_ts < 16) {
					/* path given with trailing '/' must be a dir */
					if (final_slash && !S_ISDIR(sce->st.st_mode)) {
						errno = ENOTDIR;
						return NULL;
					}
					return sce;
				}
			}
		      #endif
			/*(otherwise fall through and refresh the entry below)*/
		} else {
			/* collision, forget about the entry */
			sce = NULL;
		}
	}

	/* no usable cached result: stat() the path exactly as given
	 * (on failure, errno is left as set by stat()) */
	if (-1 == stat(name->ptr, &st)) {
		return NULL;
	}

	if (S_ISREG(st.st_mode)) {
		/* fix broken stat/open for symlinks to reg files with appended slash on freebsd,osx */
		if (name->ptr[buffer_string_length(name) - 1] == '/') {
			errno = ENOTDIR;
			return NULL;
		}
	}

	if (NULL == sce) {

		/* path not yet cached (or tree node belongs to another path) */
		sce = stat_cache_entry_init();
		buffer_copy_string_len(&sce->name, name->ptr, len);

		/* already splayed file_ndx */
		if (NULL != sptree && sptree->key == file_ndx) {
			/* hash collision: replace old entry */
			stat_cache_entry_free(sptree->data);
			sptree->data = sce;
		} else {
			sptree = sc.files = splaytree_insert(sptree, file_ndx, sce);
		}

	} else {

		/* refreshing existing entry: drop cached values derived from
		 * the previous stat result */
		buffer_clear(&sce->etag);
	      #if defined(HAVE_XATTR) || defined(HAVE_EXTATTR)
		buffer_clear(&sce->content_type);
	      #endif

	}

	if (sce->fd >= 0) {
		/* close fd if file changed */
		if (!stat_cache_stat_eq(&sce->st, &st)) {
			if (1 == sce->refcnt) {
				/* cache holds the only reference; safe to close */
				close(sce->fd);
				sce->fd = -1;
			}
			else {
				/* entry is referenced elsewhere: drop the cache's
				 * reference and put a fresh entry in the tree node */
				--sce->refcnt; /* stat_cache_entry_free(sce); */
				sptree->data = sce = stat_cache_entry_init();
				buffer_copy_string_len(&sce->name, name->ptr, len);
			}
		}
	}

	sce->st = st; /*(copy prior to calling fam_dir_monitor())*/

#ifdef HAVE_FAM_H
	if (sc.stat_cache_engine == STAT_CACHE_ENGINE_FAM) {
		/* release the previously held monitored dir (if any) before
		 * storing the result of fam_dir_monitor() */
		if (sce->fam_dir) --((fam_dir_entry *)sce->fam_dir)->refcnt;
		sce->fam_dir =
		  fam_dir_monitor(sc.scf, CONST_BUF_LEN(name), &st);
	      #if 0 /*(performed below)*/
		if (NULL != sce->fam_dir) {
			/*(may have been invalidated by dir change)*/
			sce->stat_ts = cur_ts;
		}
	      #endif
	}
#endif

	sce->stat_ts = cur_ts;
	return sce;
}
1376 
1377 stat_cache_entry * stat_cache_get_entry_open(const buffer * const name, const int symlinks) {
1378     stat_cache_entry * const sce = stat_cache_get_entry(name);
1379     if (NULL == sce) return NULL;
1380     if (sce->fd >= 0) return sce;
1381     if (sce->st.st_size > 0)
1382         sce->fd = stat_cache_open_rdonly_fstat(name, &sce->st, symlinks);
1383     return sce; /* (note: sce->fd might still be -1 if open() failed) */
1384 }
1385 
1386 const stat_cache_st * stat_cache_path_stat (const buffer * const name) {
1387     const stat_cache_entry * const sce = stat_cache_get_entry(name);
1388     return sce ? &sce->st : NULL;
1389 }
1390 
1391 int stat_cache_path_isdir(const buffer *name) {
1392     const stat_cache_entry * const sce = stat_cache_get_entry(name);
1393     return (sce && (S_ISDIR(sce->st.st_mode) ? 1 : (errno = ENOTDIR, 0)));
1394 }
1395 
1396 int stat_cache_path_contains_symlink(const buffer *name, log_error_st *errh) {
1397     /* caller should check for symlinks only if we should block symlinks. */
1398 
1399     /* catch the obvious symlinks
1400      *
1401      * this is not a secure check as we still have a race-condition between
1402      * the stat() and the open. We can only solve this by
1403      * 1. open() the file
1404      * 2. fstat() the fd
1405      *
1406      * and keeping the file open for the rest of the time. But this can
1407      * only be done at network level.
1408      * */
1409 
1410   #ifdef HAVE_LSTAT
1411     /* we assume "/" can not be symlink,
1412      * so skip the symlink stuff if path is "/" */
1413     size_t len = buffer_string_length(name);
1414     force_assert(0 != len);
1415     force_assert(name->ptr[0] == '/');
1416     if (1 == len) return 0;
1417    #ifndef PATH_MAX
1418    #define PATH_MAX 4096
1419    #endif
1420     if (len >= PATH_MAX) return -1;
1421 
1422     char buf[PATH_MAX];
1423     memcpy(buf, name->ptr, len);
1424     char *s_cur = buf+len;
1425     do {
1426         *s_cur = '\0';
1427         struct stat st;
1428         if (0 == lstat(buf, &st)) {
1429             if (S_ISLNK(st.st_mode)) return 1;
1430         }
1431         else {
1432             log_perror(errh, __FILE__, __LINE__, "lstat failed for: %s", buf);
1433             return -1;
1434         }
1435     } while ((s_cur = strrchr(buf, '/')) > buf); /*(&buf[0]==buf; NULL < buf)*/
1436   #endif
1437 
1438     return 0;
1439 }
1440 
1441 int stat_cache_open_rdonly_fstat (const buffer *name, struct stat *st, int symlinks) {
1442 	/*(Note: O_NOFOLLOW affects only the final path segment, the target file,
1443 	 * not any intermediate symlinks along the path)*/
1444 	const int fd = fdevent_open_cloexec(name->ptr, symlinks, O_RDONLY, 0);
1445 	if (fd >= 0) {
1446 		if (0 == fstat(fd, st)) {
1447 			return fd;
1448 		} else {
1449 			const int errnum = errno;
1450 			close(fd);
1451 			errno = errnum;
1452 		}
1453 	}
1454 	return -1;
1455 }
1456 
/**
 * remove entries from the cache which have not been stat()ed for
 * more than max_age seconds
 * (stat_cache_trigger_cleanup() uses 2 seconds, or 32 seconds with FAM)
 *
 * walk through the stat-cache, collect the ids which are too old
 * and remove them in a second loop
 */
1465 
1466 static void stat_cache_tag_old_entries(splay_tree * const t, int * const keys, int * const ndx, const time_t max_age, const time_t cur_ts) {
1467     if (*ndx == 8192) return; /*(must match num array entries in keys[])*/
1468     if (t->left)
1469         stat_cache_tag_old_entries(t->left, keys, ndx, max_age, cur_ts);
1470     if (t->right)
1471         stat_cache_tag_old_entries(t->right, keys, ndx, max_age, cur_ts);
1472     if (*ndx == 8192) return; /*(must match num array entries in keys[])*/
1473 
1474     const stat_cache_entry * const sce = t->data;
1475     if (cur_ts - sce->stat_ts > max_age)
1476         keys[(*ndx)++] = t->key;
1477 }
1478 
1479 static void stat_cache_periodic_cleanup(const time_t max_age, const time_t cur_ts) {
1480     splay_tree *sptree = sc.files;
1481     int max_ndx, i;
1482     int keys[8192]; /* 32k size on stack */
1483     do {
1484         if (!sptree) break;
1485         max_ndx = 0;
1486         stat_cache_tag_old_entries(sptree, keys, &max_ndx, max_age, cur_ts);
1487         for (i = 0; i < max_ndx; ++i) {
1488             int ndx = keys[i];
1489             sptree = splaytree_splay(sptree, ndx);
1490             if (sptree && sptree->key == ndx) {
1491                 stat_cache_entry_free(sptree->data);
1492                 sptree = splaytree_delete(sptree, ndx);
1493             }
1494         }
1495     } while (max_ndx == sizeof(keys)/sizeof(int));
1496     sc.files = sptree;
1497 }
1498 
1499 void stat_cache_trigger_cleanup(void) {
1500 	time_t max_age = 2;
1501 
1502       #ifdef HAVE_FAM_H
1503 	if (STAT_CACHE_ENGINE_FAM == sc.stat_cache_engine) {
1504 		if (log_epoch_secs & 0x1F) return;
1505 		/* once every 32 seconds (0x1F == 31) */
1506 		max_age = 32;
1507 		fam_dir_periodic_cleanup();
1508 		/* By doing this before stat_cache_periodic_cleanup(),
1509 		 * entries used within the next max_age secs will remain
1510 		 * monitored, instead of effectively flushing and
1511 		 * rebuilding the FAM monitoring every max_age seconds */
1512 	}
1513       #endif
1514 
1515 	stat_cache_periodic_cleanup(max_age, log_epoch_secs);
1516 }
1517