/* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
#include "memcached.h"
#include "bipbuffer.h"
#include "slab_automove.h"
#include "storage.h"
#ifdef EXTSTORE
#include "slab_automove_extstore.h"
#endif
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/resource.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <signal.h>
#include <string.h>
#include <time.h>
#include <assert.h>
#include <unistd.h>
#include <poll.h>

/* Forward Declarations */
static void item_link_q(item *it);
static void item_unlink_q(item *it);

static unsigned int lru_type_map[4] = {HOT_LRU, WARM_LRU, COLD_LRU, TEMP_LRU};

#define LARGEST_ID POWER_LARGEST
typedef struct {
    uint64_t evicted;
    uint64_t evicted_nonzero;
    uint64_t reclaimed;
    uint64_t outofmemory;
    uint64_t tailrepairs;
    uint64_t expired_unfetched; /* items reclaimed but never touched */
    uint64_t evicted_unfetched; /* items evicted but never touched */
    uint64_t evicted_active; /* items evicted that should have been shuffled */
    uint64_t crawler_reclaimed;
    uint64_t crawler_items_checked;
    uint64_t lrutail_reflocked;
    uint64_t moves_to_cold;
    uint64_t moves_to_warm;
    uint64_t moves_within_lru;
    uint64_t direct_reclaims;
    uint64_t hits_to_hot;
    uint64_t hits_to_warm;
    uint64_t hits_to_cold;
    uint64_t hits_to_temp;
    uint64_t mem_requested;
    rel_time_t evicted_time;
} itemstats_t;
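
/* Note on indexing: the per-LRU arrays and stats below are indexed by a slab
 * class id OR'd with an LRU type (HOT_LRU/WARM_LRU/COLD_LRU/TEMP_LRU), which
 * is why they are sized by POWER_LARGEST rather than
 * MAX_NUMBER_OF_SLAB_CLASSES; see the `n | lru_type_map[x]` loops in the
 * stats functions further down. */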

static item *heads[LARGEST_ID];
static item *tails[LARGEST_ID];
static itemstats_t itemstats[LARGEST_ID];
static unsigned int sizes[LARGEST_ID];
static uint64_t sizes_bytes[LARGEST_ID];
static unsigned int *stats_sizes_hist = NULL;
static int stats_sizes_buckets = 0;
static uint64_t cas_id = 1;

static volatile int do_run_lru_maintainer_thread = 0;
static pthread_mutex_t lru_maintainer_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t cas_id_lock = PTHREAD_MUTEX_INITIALIZER;

void item_stats_reset(void) {
    int i;
    for (i = 0; i < LARGEST_ID; i++) {
        pthread_mutex_lock(&lru_locks[i]);
        memset(&itemstats[i], 0, sizeof(itemstats_t));
        pthread_mutex_unlock(&lru_locks[i]);
    }
}

/* called with class lru lock held */
void do_item_stats_add_crawl(const int i, const uint64_t reclaimed,
        const uint64_t unfetched, const uint64_t checked) {
    itemstats[i].crawler_reclaimed += reclaimed;
    itemstats[i].expired_unfetched += unfetched;
    itemstats[i].crawler_items_checked += checked;
}

typedef struct _lru_bump_buf {
    struct _lru_bump_buf *prev;
    struct _lru_bump_buf *next;
    pthread_mutex_t mutex;
    bipbuf_t *buf;
    uint64_t dropped;
} lru_bump_buf;

typedef struct {
    item *it;
    uint32_t hv;
} lru_bump_entry;

static lru_bump_buf *bump_buf_head = NULL;
static lru_bump_buf *bump_buf_tail = NULL;
static pthread_mutex_t bump_buf_lock = PTHREAD_MUTEX_INITIALIZER;
/* TODO: tunable? Need bench results */
#define LRU_BUMP_BUF_SIZE 8192

static bool lru_bump_async(lru_bump_buf *b, item *it, uint32_t hv);
static uint64_t lru_total_bumps_dropped(void);

/* Get the next CAS id for a new item. */
/* TODO: refactor some atomics for this. */
uint64_t get_cas_id(void) {
    pthread_mutex_lock(&cas_id_lock);
    uint64_t next_id = ++cas_id;
    pthread_mutex_unlock(&cas_id_lock);
    return next_id;
}

void set_cas_id(uint64_t new_cas) {
    pthread_mutex_lock(&cas_id_lock);
    cas_id = new_cas;
    pthread_mutex_unlock(&cas_id_lock);
}
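
/* A possible shape for the atomics TODO above: a minimal sketch (not wired
 * in), assuming C11 <stdatomic.h> is available. CAS ids only need to be
 * unique, not ordered, so relaxed ordering should be enough. */
#if 0
#include <stdatomic.h>
static _Atomic uint64_t cas_id_atomic = 1;

uint64_t get_cas_id(void) {
    /* fetch_add returns the previous value; +1 matches ++cas_id above */
    return atomic_fetch_add_explicit(&cas_id_atomic, 1, memory_order_relaxed) + 1;
}

void set_cas_id(uint64_t new_cas) {
    atomic_store_explicit(&cas_id_atomic, new_cas, memory_order_relaxed);
}
#endif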

int item_is_flushed(item *it) {
    rel_time_t oldest_live = settings.oldest_live;
    if (it->time <= oldest_live && oldest_live <= current_time)
        return 1;

    return 0;
}

/* must be locked before call */
unsigned int do_get_lru_size(uint32_t id) {
    return sizes[id];
}

/* Enable this for reference-count debugging. */
#if 0
# define DEBUG_REFCNT(it,op) \
                fprintf(stderr, "item %p refcnt(%c) %d %c%c\n", \
                        (void *)it, op, it->refcount, \
                        (it->it_flags & ITEM_LINKED) ? 'L' : ' ', \
                        (it->it_flags & ITEM_SLABBED) ? 'S' : ' ')
#else
# define DEBUG_REFCNT(it,op) while(0)
#endif

/**
 * Generates the variable-sized part of the header for an object.
 *
 * nkey    - The length of the key
 * flags   - key flags
 * nbytes  - Number of bytes to hold the value and the additional CRLF terminator
 * suffix  - Buffer for the "VALUE" line suffix (flags, size).
 * nsuffix - The length of the suffix is stored here.
 *
 * Returns the total size of the header.
 */
static size_t item_make_header(const uint8_t nkey, const client_flags_t flags, const int nbytes,
                     char *suffix, uint8_t *nsuffix) {
    if (flags == 0) {
        *nsuffix = 0;
    } else {
        *nsuffix = sizeof(flags);
    }
    return sizeof(item) + nkey + *nsuffix + nbytes;
}
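
/* Worked example (a sketch; exact numbers depend on sizeof(item) and
 * sizeof(client_flags_t) on the platform): key "foo" arrives here as
 * nkey = 3 + 1 (callers pass the key length plus a terminator), a nonzero
 * flags value makes nsuffix = sizeof(client_flags_t), and value "bar" plus
 * CRLF gives nbytes = 5, so the total is
 * sizeof(item) + 4 + sizeof(client_flags_t) + 5, before the optional
 * sizeof(uint64_t) CAS field added by the callers below. */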

item *do_item_alloc_pull(const size_t ntotal, const unsigned int id) {
    item *it = NULL;
    int i;
    /* If no memory is available, attempt a direct LRU juggle/eviction */
    /* This is a race in order to simplify lru_pull_tail; in cases where
     * locked items are on the tail, you want them to fall out and cause
     * occasional OOM's, rather than internally work around them.
     * This also gives one fewer code path for slab alloc/free
     */
    for (i = 0; i < 10; i++) {
        /* Try to reclaim memory first */
        if (!settings.lru_segmented) {
            lru_pull_tail(id, COLD_LRU, 0, 0, 0, NULL);
        }
        it = slabs_alloc(ntotal, id, 0);

        if (it == NULL) {
            // We send '0' in for "total_bytes" as this routine is always
            // pulling to evict, or forcing HOT -> COLD migration.
            // As of this writing, total_bytes isn't at all used with COLD_LRU.
            if (lru_pull_tail(id, COLD_LRU, 0, LRU_PULL_EVICT, 0, NULL) <= 0) {
                if (settings.lru_segmented) {
                    lru_pull_tail(id, HOT_LRU, 0, 0, 0, NULL);
                } else {
                    break;
                }
            }
        } else {
            break;
        }
    }

    if (i > 0) {
        pthread_mutex_lock(&lru_locks[id]);
        itemstats[id].direct_reclaims += i;
        pthread_mutex_unlock(&lru_locks[id]);
    }

    return it;
}

/* Chain another chunk onto this chunk. */
/* slab mover: if it finds a chunk without ITEM_CHUNK flag, and no ITEM_LINKED
 * flag, it counts as busy and skips.
 * I think it might still not be safe to do linking outside of the slab lock
 */
item_chunk *do_item_alloc_chunk(item_chunk *ch, const size_t bytes_remain) {
    // TODO: Should be a cleaner way of finding real size with slabber calls
    size_t size = bytes_remain + sizeof(item_chunk);
    if (size > settings.slab_chunk_size_max)
        size = settings.slab_chunk_size_max;
    unsigned int id = slabs_clsid(size);

    item_chunk *nch = (item_chunk *) do_item_alloc_pull(size, id);
    if (nch == NULL) {
        // The final chunk in a large item will attempt to be a more
        // appropriately sized chunk to minimize memory overhead. However, if
        // there's no memory available in the lower slab classes we fail the
        // SET. In these cases as a fallback we ensure we attempt to evict a
        // max-size item and reuse a large chunk.
        if (size == settings.slab_chunk_size_max) {
            return NULL;
        } else {
            size = settings.slab_chunk_size_max;
            id = slabs_clsid(size);
            nch = (item_chunk *) do_item_alloc_pull(size, id);

            if (nch == NULL)
                return NULL;
        }
    }

    // link in.
    // ITEM_CHUNK[ED] bits need to be protected by the slabs lock.
    slabs_mlock();
    nch->head = ch->head;
    ch->next = nch;
    nch->prev = ch;
    nch->next = 0;
    nch->used = 0;
    nch->slabs_clsid = id;
    nch->size = size - sizeof(item_chunk);
    nch->it_flags |= ITEM_CHUNK;
    slabs_munlock();
    return nch;
}
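
/* Resulting layout (a sketch): a chunked item is a header item whose data
 * area starts with an embedded item_chunk, followed by a doubly-linked list
 * of separately allocated chunks; every chunk's `head` points back at the
 * header item:
 *
 *   [item hdr | item_chunk] <-> [item_chunk] <-> [item_chunk]
 *                 (each chunk's head -> the header item)
 */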

item *do_item_alloc(const char *key, const size_t nkey, const client_flags_t flags,
                    const rel_time_t exptime, const int nbytes) {
    uint8_t nsuffix;
    item *it = NULL;
    char suffix[40];
    // Avoid potential underflows.
    if (nbytes < 2)
        return 0;

    size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);
    if (settings.use_cas) {
        ntotal += sizeof(uint64_t);
    }

    unsigned int id = slabs_clsid(ntotal);
    unsigned int hdr_id = 0;
    if (id == 0)
        return 0;

    /* This is a large item. Allocate a header object now, lazily allocate
     *  chunks while reading the upload.
     */
    if (ntotal > settings.slab_chunk_size_max) {
        /* We still link this item into the LRU for the larger slab class, but
         * we're pulling a header from an entirely different slab class. The
         * free routines handle large items specifically.
         */
        int htotal = nkey + 1 + nsuffix + sizeof(item) + sizeof(item_chunk);
        if (settings.use_cas) {
            htotal += sizeof(uint64_t);
        }
#ifdef NEED_ALIGN
        // header chunk needs to be padded on some systems
        int remain = htotal % 8;
        if (remain != 0) {
            htotal += 8 - remain;
        }
#endif
        hdr_id = slabs_clsid(htotal);
        it = do_item_alloc_pull(htotal, hdr_id);
        /* setting ITEM_CHUNKED is fine here because we aren't LINKED yet. */
        if (it != NULL)
            it->it_flags |= ITEM_CHUNKED;
    } else {
        it = do_item_alloc_pull(ntotal, id);
    }

    if (it == NULL) {
        pthread_mutex_lock(&lru_locks[id]);
        itemstats[id].outofmemory++;
        pthread_mutex_unlock(&lru_locks[id]);
        return NULL;
    }

    assert(it->it_flags == 0 || it->it_flags == ITEM_CHUNKED);
    //assert(it != heads[id]);

    /* Refcount is seeded to 1 by slabs_alloc() */
    it->next = it->prev = 0;

    /* Items are initially loaded into the HOT_LRU. This is '0' but I want at
     * least a note here. Compiler (hopefully?) optimizes this out.
     */
    if (settings.temp_lru &&
            exptime - current_time <= settings.temporary_ttl) {
        id |= TEMP_LRU;
    } else if (settings.lru_segmented) {
        id |= HOT_LRU;
    } else {
        /* There is only COLD in compat-mode */
        id |= COLD_LRU;
    }
    it->slabs_clsid = id;

    DEBUG_REFCNT(it, '*');
    it->it_flags |= settings.use_cas ? ITEM_CAS : 0;
    it->it_flags |= nsuffix != 0 ? ITEM_CFLAGS : 0;
    it->nkey = nkey;
    it->nbytes = nbytes;
    memcpy(ITEM_key(it), key, nkey);
    it->exptime = exptime;
    if (nsuffix > 0) {
        memcpy(ITEM_suffix(it), &flags, sizeof(flags));
    }

    /* Initialize internal chunk. */
    if (it->it_flags & ITEM_CHUNKED) {
        item_chunk *chunk = (item_chunk *) ITEM_schunk(it);

        chunk->next = 0;
        chunk->prev = 0;
        chunk->used = 0;
        chunk->size = 0;
        chunk->head = it;
        chunk->orig_clsid = hdr_id;
    }
    it->h_next = 0;

    return it;
}

void item_free(item *it) {
    size_t ntotal = ITEM_ntotal(it);
    unsigned int clsid;
    assert((it->it_flags & ITEM_LINKED) == 0);
    assert(it != heads[it->slabs_clsid]);
    assert(it != tails[it->slabs_clsid]);
    assert(it->refcount == 0);

    /* so slab size changer can tell later if item is already free or not */
    clsid = ITEM_clsid(it);
    DEBUG_REFCNT(it, 'F');
    slabs_free(it, ntotal, clsid);
}

/**
 * Returns true if an item will fit in the cache (its size does not exceed
 * the maximum for a cache entry.)
 */
bool item_size_ok(const size_t nkey, const client_flags_t flags, const int nbytes) {
    char prefix[40];
    uint8_t nsuffix;
    if (nbytes < 2)
        return false;

    size_t ntotal = item_make_header(nkey + 1, flags, nbytes,
                                     prefix, &nsuffix);
    if (settings.use_cas) {
        ntotal += sizeof(uint64_t);
    }

    return slabs_clsid(ntotal) != 0;
}

/* fixing stats/references during warm start */
void do_item_link_fixup(item *it) {
    item **head, **tail;
    int ntotal = ITEM_ntotal(it);
    uint32_t hv = hash(ITEM_key(it), it->nkey);
    assoc_insert(it, hv);

    head = &heads[it->slabs_clsid];
    tail = &tails[it->slabs_clsid];
    if (it->prev == 0 && *head == 0) *head = it;
    if (it->next == 0 && *tail == 0) *tail = it;
    sizes[it->slabs_clsid]++;
    sizes_bytes[it->slabs_clsid] += ntotal;

    STATS_LOCK();
    stats_state.curr_bytes += ntotal;
    stats_state.curr_items += 1;
    stats.total_items += 1;
    STATS_UNLOCK();

    item_stats_sizes_add(it);

    return;
}

static void do_item_link_q(item *it) { /* item is the new head */
    item **head, **tail;
    assert((it->it_flags & ITEM_SLABBED) == 0);

    head = &heads[it->slabs_clsid];
    tail = &tails[it->slabs_clsid];
    assert(it != *head);
    assert((*head && *tail) || (*head == 0 && *tail == 0));
    it->prev = 0;
    it->next = *head;
    if (it->next) it->next->prev = it;
    *head = it;
    if (*tail == 0) *tail = it;
    sizes[it->slabs_clsid]++;
#ifdef EXTSTORE
    if (it->it_flags & ITEM_HDR) {
        sizes_bytes[it->slabs_clsid] += (ITEM_ntotal(it) - it->nbytes) + sizeof(item_hdr);
    } else {
        sizes_bytes[it->slabs_clsid] += ITEM_ntotal(it);
    }
#else
    sizes_bytes[it->slabs_clsid] += ITEM_ntotal(it);
#endif

    return;
}

static void item_link_q(item *it) {
    pthread_mutex_lock(&lru_locks[it->slabs_clsid]);
    do_item_link_q(it);
    pthread_mutex_unlock(&lru_locks[it->slabs_clsid]);
}

static void item_link_q_warm(item *it) {
    pthread_mutex_lock(&lru_locks[it->slabs_clsid]);
    do_item_link_q(it);
    itemstats[it->slabs_clsid].moves_to_warm++;
    pthread_mutex_unlock(&lru_locks[it->slabs_clsid]);
}

static void do_item_unlink_q(item *it) {
    item **head, **tail;
    head = &heads[it->slabs_clsid];
    tail = &tails[it->slabs_clsid];

    if (*head == it) {
        assert(it->prev == 0);
        *head = it->next;
    }
    if (*tail == it) {
        assert(it->next == 0);
        *tail = it->prev;
    }
    assert(it->next != it);
    assert(it->prev != it);

    if (it->next) it->next->prev = it->prev;
    if (it->prev) it->prev->next = it->next;
    sizes[it->slabs_clsid]--;
#ifdef EXTSTORE
    if (it->it_flags & ITEM_HDR) {
        sizes_bytes[it->slabs_clsid] -= (ITEM_ntotal(it) - it->nbytes) + sizeof(item_hdr);
    } else {
        sizes_bytes[it->slabs_clsid] -= ITEM_ntotal(it);
    }
#else
    sizes_bytes[it->slabs_clsid] -= ITEM_ntotal(it);
#endif

    return;
}

static void item_unlink_q(item *it) {
    pthread_mutex_lock(&lru_locks[it->slabs_clsid]);
    do_item_unlink_q(it);
    pthread_mutex_unlock(&lru_locks[it->slabs_clsid]);
}

int do_item_link(item *it, const uint32_t hv, const uint64_t cas) {
    MEMCACHED_ITEM_LINK(ITEM_key(it), it->nkey, it->nbytes);
    assert((it->it_flags & (ITEM_LINKED|ITEM_SLABBED)) == 0);
    it->it_flags |= ITEM_LINKED;
    it->time = current_time;

    STATS_LOCK();
    stats_state.curr_bytes += ITEM_ntotal(it);
    stats_state.curr_items += 1;
    stats.total_items += 1;
    STATS_UNLOCK();

    /* Allocate a new CAS ID on link. */
    ITEM_set_cas(it, cas);
    assoc_insert(it, hv);
    item_link_q(it);
    refcount_incr(it);
    item_stats_sizes_add(it);

    return 1;
}

void do_item_unlink(item *it, const uint32_t hv) {
    MEMCACHED_ITEM_UNLINK(ITEM_key(it), it->nkey, it->nbytes);
    if ((it->it_flags & ITEM_LINKED) != 0) {
        it->it_flags &= ~ITEM_LINKED;
        STATS_LOCK();
        stats_state.curr_bytes -= ITEM_ntotal(it);
        stats_state.curr_items -= 1;
        STATS_UNLOCK();
        item_stats_sizes_remove(it);
        assoc_delete(ITEM_key(it), it->nkey, hv);
        item_unlink_q(it);
        do_item_remove(it);
    }
}

/* FIXME: Is it necessary to keep this copy/pasted code? */
void do_item_unlink_nolock(item *it, const uint32_t hv) {
    MEMCACHED_ITEM_UNLINK(ITEM_key(it), it->nkey, it->nbytes);
    if ((it->it_flags & ITEM_LINKED) != 0) {
        it->it_flags &= ~ITEM_LINKED;
        STATS_LOCK();
        stats_state.curr_bytes -= ITEM_ntotal(it);
        stats_state.curr_items -= 1;
        STATS_UNLOCK();
        item_stats_sizes_remove(it);
        assoc_delete(ITEM_key(it), it->nkey, hv);
        do_item_unlink_q(it);
        do_item_remove(it);
    }
}

void do_item_remove(item *it) {
    MEMCACHED_ITEM_REMOVE(ITEM_key(it), it->nkey, it->nbytes);
    assert((it->it_flags & ITEM_SLABBED) == 0);
    assert(it->refcount > 0);

    if (refcount_decr(it) == 0) {
        item_free(it);
    }
}

/* Bump the last accessed time, or relink if we're in compat mode */
void do_item_update(item *it) {
    MEMCACHED_ITEM_UPDATE(ITEM_key(it), it->nkey, it->nbytes);

    /* Hits to COLD_LRU immediately move to WARM. */
    if (settings.lru_segmented) {
        assert((it->it_flags & ITEM_SLABBED) == 0);
        if ((it->it_flags & ITEM_LINKED) != 0) {
            if (ITEM_lruid(it) == COLD_LRU && (it->it_flags & ITEM_ACTIVE)) {
                it->time = current_time;
                item_unlink_q(it);
                it->slabs_clsid = ITEM_clsid(it);
                it->slabs_clsid |= WARM_LRU;
                it->it_flags &= ~ITEM_ACTIVE;
                item_link_q_warm(it);
            } else {
                it->time = current_time;
            }
        }
    } else if (it->time < current_time - ITEM_UPDATE_INTERVAL) {
        assert((it->it_flags & ITEM_SLABBED) == 0);

        if ((it->it_flags & ITEM_LINKED) != 0) {
            it->time = current_time;
            item_unlink_q(it);
            item_link_q(it);
        }
    }
}

int do_item_replace(item *it, item *new_it, const uint32_t hv, const uint64_t cas) {
    MEMCACHED_ITEM_REPLACE(ITEM_key(it), it->nkey, it->nbytes,
                           ITEM_key(new_it), new_it->nkey, new_it->nbytes);
    assert((it->it_flags & ITEM_SLABBED) == 0);

    do_item_unlink(it, hv);
    return do_item_link(new_it, hv, cas);
}

void item_flush_expired(void) {
    int i;
    item *iter, *next;
    if (settings.oldest_live == 0)
        return;
    for (i = 0; i < LARGEST_ID; i++) {
        /* The LRU is sorted in decreasing time order, and an item's timestamp
         * is never newer than its last access time, so we only need to walk
         * back until we hit an item older than the oldest_live time.
         * The oldest_live checking will auto-expire the remaining items.
         */
        pthread_mutex_lock(&lru_locks[i]);
        for (iter = heads[i]; iter != NULL; iter = next) {
            void *hold_lock = NULL;
            next = iter->next;
            if (iter->time == 0 && iter->nkey == 0 && iter->it_flags == 1) {
                continue; // crawler item.
            }
            uint32_t hv = hash(ITEM_key(iter), iter->nkey);
            // if we can't lock the item, just give up.
            // we can't block here because the lock order is inverted.
            if ((hold_lock = item_trylock(hv)) == NULL) {
                continue;
            }

            if (iter->time >= settings.oldest_live) {
                // note: not sure why SLABBED check is here. linked and slabbed
                // are mutually exclusive, but this can't hurt and I don't
                // want to validate it right now.
                if ((iter->it_flags & ITEM_SLABBED) == 0) {
                    STORAGE_delete(ext_storage, iter);
                    // nolock version because we hold the LRU lock already.
                    do_item_unlink_nolock(iter, hash(ITEM_key(iter), iter->nkey));
                }
                item_trylock_unlock(hold_lock);
            } else {
                /* We've hit the first old item. Continue to the next queue. */
                item_trylock_unlock(hold_lock);
                break;
            }
        }
        pthread_mutex_unlock(&lru_locks[i]);
    }
}

/*@null@*/
/* This is walking the line of violating lock order, but I think it's safe.
 * If the LRU lock is held, an item in the LRU cannot be wiped and freed.
 * The data could possibly be overwritten, but this is only accessing the
 * headers.
 * It may not be the best idea to leave it like this, but for now it's safe.
 */
char *item_cachedump(const unsigned int slabs_clsid, const unsigned int limit, unsigned int *bytes) {
    unsigned int memlimit = 2 * 1024 * 1024;   /* 2MB max response size */
    char *buffer;
    unsigned int bufcurr;
    item *it;
    unsigned int len;
    unsigned int shown = 0;
    char key_temp[KEY_MAX_LENGTH + 1];
    char temp[512];
    unsigned int id = slabs_clsid;
    id |= COLD_LRU;

    pthread_mutex_lock(&lru_locks[id]);
    it = heads[id];

    buffer = malloc((size_t)memlimit);
    if (buffer == 0) {
        pthread_mutex_unlock(&lru_locks[id]);
        return NULL;
    }
    bufcurr = 0;

    while (it != NULL && (limit == 0 || shown < limit)) {
        assert(it->nkey <= KEY_MAX_LENGTH);
        // protect from printing binary keys.
        if ((it->nbytes == 0 && it->nkey == 0) || (it->it_flags & ITEM_KEY_BINARY)) {
            it = it->next;
            continue;
        }
        /* Copy the key since it may not be null-terminated in the struct */
        strncpy(key_temp, ITEM_key(it), it->nkey);
        key_temp[it->nkey] = 0x00; /* terminate */
        len = snprintf(temp, sizeof(temp), "ITEM %s [%d b; %llu s]\r\n",
                       key_temp, it->nbytes - 2,
                       it->exptime == 0 ? 0 :
                       (unsigned long long)it->exptime + process_started);
        if (bufcurr + len + 6 > memlimit)  /* 6 is END\r\n\0 */
            break;
        memcpy(buffer + bufcurr, temp, len);
        bufcurr += len;
        shown++;
        it = it->next;
    }

    memcpy(buffer + bufcurr, "END\r\n", 6);
    bufcurr += 5;

    *bytes = bufcurr;
    pthread_mutex_unlock(&lru_locks[id]);
    return buffer;
}

/* With refactoring of the various stats code the automover won't need a
 * custom function here.
 */
void fill_item_stats_automove(item_stats_automove *am) {
    int n;
    for (n = 0; n < MAX_NUMBER_OF_SLAB_CLASSES; n++) {
        item_stats_automove *cur = &am[n];

        // outofmemory records into HOT
        int i = n | HOT_LRU;
        pthread_mutex_lock(&lru_locks[i]);
        cur->outofmemory = itemstats[i].outofmemory;
        pthread_mutex_unlock(&lru_locks[i]);

        // evictions and tail age are from COLD
        i = n | COLD_LRU;
        pthread_mutex_lock(&lru_locks[i]);
        cur->evicted = itemstats[i].evicted;
        if (!tails[i]) {
            cur->age = 0;
        } else if (tails[i]->nbytes == 0 && tails[i]->nkey == 0 && tails[i]->it_flags == 1) {
            /* it's a crawler, check previous entry */
            if (tails[i]->prev) {
                cur->age = current_time - tails[i]->prev->time;
            } else {
                cur->age = 0;
            }
        } else {
            cur->age = current_time - tails[i]->time;
        }
        pthread_mutex_unlock(&lru_locks[i]);
    }
}

void item_stats_totals(ADD_STAT add_stats, void *c) {
    itemstats_t totals;
    memset(&totals, 0, sizeof(itemstats_t));
    int n;
    for (n = 0; n < MAX_NUMBER_OF_SLAB_CLASSES; n++) {
        int x;
        int i;
        for (x = 0; x < 4; x++) {
            i = n | lru_type_map[x];
            pthread_mutex_lock(&lru_locks[i]);
            totals.evicted += itemstats[i].evicted;
            totals.reclaimed += itemstats[i].reclaimed;
            totals.expired_unfetched += itemstats[i].expired_unfetched;
            totals.evicted_unfetched += itemstats[i].evicted_unfetched;
            totals.evicted_active += itemstats[i].evicted_active;
            totals.crawler_reclaimed += itemstats[i].crawler_reclaimed;
            totals.crawler_items_checked += itemstats[i].crawler_items_checked;
            totals.lrutail_reflocked += itemstats[i].lrutail_reflocked;
            totals.moves_to_cold += itemstats[i].moves_to_cold;
            totals.moves_to_warm += itemstats[i].moves_to_warm;
            totals.moves_within_lru += itemstats[i].moves_within_lru;
            totals.direct_reclaims += itemstats[i].direct_reclaims;
            pthread_mutex_unlock(&lru_locks[i]);
        }
    }
    APPEND_STAT("expired_unfetched", "%llu",
                (unsigned long long)totals.expired_unfetched);
    APPEND_STAT("evicted_unfetched", "%llu",
                (unsigned long long)totals.evicted_unfetched);
    if (settings.lru_maintainer_thread) {
        APPEND_STAT("evicted_active", "%llu",
                    (unsigned long long)totals.evicted_active);
    }
    APPEND_STAT("evictions", "%llu",
                (unsigned long long)totals.evicted);
    APPEND_STAT("reclaimed", "%llu",
                (unsigned long long)totals.reclaimed);
    APPEND_STAT("crawler_reclaimed", "%llu",
                (unsigned long long)totals.crawler_reclaimed);
    APPEND_STAT("crawler_items_checked", "%llu",
                (unsigned long long)totals.crawler_items_checked);
    APPEND_STAT("lrutail_reflocked", "%llu",
                (unsigned long long)totals.lrutail_reflocked);
    if (settings.lru_maintainer_thread) {
        APPEND_STAT("moves_to_cold", "%llu",
                    (unsigned long long)totals.moves_to_cold);
        APPEND_STAT("moves_to_warm", "%llu",
                    (unsigned long long)totals.moves_to_warm);
        APPEND_STAT("moves_within_lru", "%llu",
                    (unsigned long long)totals.moves_within_lru);
        APPEND_STAT("direct_reclaims", "%llu",
                    (unsigned long long)totals.direct_reclaims);
        APPEND_STAT("lru_bumps_dropped", "%llu",
                    (unsigned long long)lru_total_bumps_dropped());
    }
}

void item_stats(ADD_STAT add_stats, void *c) {
    struct thread_stats thread_stats;
    threadlocal_stats_aggregate(&thread_stats);
    itemstats_t totals;
    int n;
    for (n = 0; n < MAX_NUMBER_OF_SLAB_CLASSES; n++) {
        memset(&totals, 0, sizeof(itemstats_t));
        int x;
        int i;
        unsigned int size = 0;
        unsigned int age  = 0;
        unsigned int age_hot = 0;
        unsigned int age_warm = 0;
        unsigned int lru_size_map[4];
        const char *fmt = "items:%d:%s";
        char key_str[STAT_KEY_LEN];
        char val_str[STAT_VAL_LEN];
        int klen = 0, vlen = 0;
        for (x = 0; x < 4; x++) {
            i = n | lru_type_map[x];
            pthread_mutex_lock(&lru_locks[i]);
            totals.evicted += itemstats[i].evicted;
            totals.evicted_nonzero += itemstats[i].evicted_nonzero;
            totals.reclaimed += itemstats[i].reclaimed;
            totals.outofmemory += itemstats[i].outofmemory;
            totals.tailrepairs += itemstats[i].tailrepairs;
            totals.expired_unfetched += itemstats[i].expired_unfetched;
            totals.evicted_unfetched += itemstats[i].evicted_unfetched;
            totals.evicted_active += itemstats[i].evicted_active;
            totals.crawler_reclaimed += itemstats[i].crawler_reclaimed;
            totals.crawler_items_checked += itemstats[i].crawler_items_checked;
            totals.lrutail_reflocked += itemstats[i].lrutail_reflocked;
            totals.moves_to_cold += itemstats[i].moves_to_cold;
            totals.moves_to_warm += itemstats[i].moves_to_warm;
            totals.moves_within_lru += itemstats[i].moves_within_lru;
            totals.direct_reclaims += itemstats[i].direct_reclaims;
            totals.mem_requested += sizes_bytes[i];
            size += sizes[i];
            lru_size_map[x] = sizes[i];
            if (lru_type_map[x] == COLD_LRU && tails[i] != NULL) {
                age = current_time - tails[i]->time;
            } else if (lru_type_map[x] == HOT_LRU && tails[i] != NULL) {
                age_hot = current_time - tails[i]->time;
            } else if (lru_type_map[x] == WARM_LRU && tails[i] != NULL) {
                age_warm = current_time - tails[i]->time;
            }
            if (lru_type_map[x] == COLD_LRU)
                totals.evicted_time = itemstats[i].evicted_time;
            switch (lru_type_map[x]) {
                case HOT_LRU:
                    totals.hits_to_hot = thread_stats.lru_hits[i];
                    break;
                case WARM_LRU:
                    totals.hits_to_warm = thread_stats.lru_hits[i];
                    break;
                case COLD_LRU:
                    totals.hits_to_cold = thread_stats.lru_hits[i];
                    break;
                case TEMP_LRU:
                    totals.hits_to_temp = thread_stats.lru_hits[i];
                    break;
            }
            pthread_mutex_unlock(&lru_locks[i]);
        }
        if (size == 0)
            continue;
        APPEND_NUM_FMT_STAT(fmt, n, "number", "%u", size);
        if (settings.lru_maintainer_thread) {
            APPEND_NUM_FMT_STAT(fmt, n, "number_hot", "%u", lru_size_map[0]);
            APPEND_NUM_FMT_STAT(fmt, n, "number_warm", "%u", lru_size_map[1]);
            APPEND_NUM_FMT_STAT(fmt, n, "number_cold", "%u", lru_size_map[2]);
            if (settings.temp_lru) {
                APPEND_NUM_FMT_STAT(fmt, n, "number_temp", "%u", lru_size_map[3]);
            }
            APPEND_NUM_FMT_STAT(fmt, n, "age_hot", "%u", age_hot);
            APPEND_NUM_FMT_STAT(fmt, n, "age_warm", "%u", age_warm);
        }
        APPEND_NUM_FMT_STAT(fmt, n, "age", "%u", age);
        APPEND_NUM_FMT_STAT(fmt, n, "mem_requested", "%llu", (unsigned long long)totals.mem_requested);
        APPEND_NUM_FMT_STAT(fmt, n, "evicted",
                            "%llu", (unsigned long long)totals.evicted);
        APPEND_NUM_FMT_STAT(fmt, n, "evicted_nonzero",
                            "%llu", (unsigned long long)totals.evicted_nonzero);
        APPEND_NUM_FMT_STAT(fmt, n, "evicted_time",
                            "%u", totals.evicted_time);
        APPEND_NUM_FMT_STAT(fmt, n, "outofmemory",
                            "%llu", (unsigned long long)totals.outofmemory);
        APPEND_NUM_FMT_STAT(fmt, n, "tailrepairs",
                            "%llu", (unsigned long long)totals.tailrepairs);
        APPEND_NUM_FMT_STAT(fmt, n, "reclaimed",
                            "%llu", (unsigned long long)totals.reclaimed);
        APPEND_NUM_FMT_STAT(fmt, n, "expired_unfetched",
                            "%llu", (unsigned long long)totals.expired_unfetched);
        APPEND_NUM_FMT_STAT(fmt, n, "evicted_unfetched",
                            "%llu", (unsigned long long)totals.evicted_unfetched);
        if (settings.lru_maintainer_thread) {
            APPEND_NUM_FMT_STAT(fmt, n, "evicted_active",
                                "%llu", (unsigned long long)totals.evicted_active);
        }
        APPEND_NUM_FMT_STAT(fmt, n, "crawler_reclaimed",
                            "%llu", (unsigned long long)totals.crawler_reclaimed);
        APPEND_NUM_FMT_STAT(fmt, n, "crawler_items_checked",
                            "%llu", (unsigned long long)totals.crawler_items_checked);
        APPEND_NUM_FMT_STAT(fmt, n, "lrutail_reflocked",
                            "%llu", (unsigned long long)totals.lrutail_reflocked);
        if (settings.lru_maintainer_thread) {
            APPEND_NUM_FMT_STAT(fmt, n, "moves_to_cold",
                                "%llu", (unsigned long long)totals.moves_to_cold);
            APPEND_NUM_FMT_STAT(fmt, n, "moves_to_warm",
                                "%llu", (unsigned long long)totals.moves_to_warm);
            APPEND_NUM_FMT_STAT(fmt, n, "moves_within_lru",
                                "%llu", (unsigned long long)totals.moves_within_lru);
            APPEND_NUM_FMT_STAT(fmt, n, "direct_reclaims",
                                "%llu", (unsigned long long)totals.direct_reclaims);
            APPEND_NUM_FMT_STAT(fmt, n, "hits_to_hot",
                                "%llu", (unsigned long long)totals.hits_to_hot);
            APPEND_NUM_FMT_STAT(fmt, n, "hits_to_warm",
                                "%llu", (unsigned long long)totals.hits_to_warm);
            APPEND_NUM_FMT_STAT(fmt, n, "hits_to_cold",
                                "%llu", (unsigned long long)totals.hits_to_cold);
            APPEND_NUM_FMT_STAT(fmt, n, "hits_to_temp",
                                "%llu", (unsigned long long)totals.hits_to_temp);
        }
    }

    /* getting here means both ascii and binary terminators fit */
    add_stats(NULL, 0, NULL, 0, c);
}

bool item_stats_sizes_status(void) {
    bool ret = false;
    if (stats_sizes_hist != NULL)
        ret = true;
    return ret;
}

void item_stats_sizes_init(void) {
    if (stats_sizes_hist != NULL)
        return;
    stats_sizes_buckets = settings.item_size_max / 32 + 1;
    stats_sizes_hist = calloc(stats_sizes_buckets, sizeof(int));
}

void item_stats_sizes_add(item *it) {
    if (stats_sizes_hist == NULL)
        return;
    int ntotal = ITEM_ntotal(it);
    int bucket = ntotal / 32;
    if ((ntotal % 32) != 0) bucket++;
    if (bucket < stats_sizes_buckets) stats_sizes_hist[bucket]++;
}

/* I don't think there's any way for this to be accurate without using the CAS
 * value, since items that have had their time value bumped will still pass
 * this validation.
 */
void item_stats_sizes_remove(item *it) {
    if (stats_sizes_hist == NULL)
        return;
    int ntotal = ITEM_ntotal(it);
    int bucket = ntotal / 32;
    if ((ntotal % 32) != 0) bucket++;
    if (bucket < stats_sizes_buckets) stats_sizes_hist[bucket]--;
}

/** dumps out a list of objects of each size, with granularity of 32 bytes */
/*@null@*/
/* Locks are correct based on a technicality. Holds LRU lock while doing the
 * work, so items can't go invalid, and it's only looking at header sizes
 * which don't change.
 */
void item_stats_sizes(ADD_STAT add_stats, void *c) {
    if (stats_sizes_hist != NULL) {
        int i;
        for (i = 0; i < stats_sizes_buckets; i++) {
            if (stats_sizes_hist[i] != 0) {
                char key[12];
                snprintf(key, sizeof(key), "%d", i * 32);
                APPEND_STAT(key, "%u", stats_sizes_hist[i]);
            }
        }
    } else {
        APPEND_STAT("sizes_status", "disabled", "");
    }

    add_stats(NULL, 0, NULL, 0, c);
}

/** wrapper around assoc_find which does the lazy expiration logic */
item *do_item_get(const char *key, const size_t nkey, const uint32_t hv, LIBEVENT_THREAD *t, const bool do_update) {
    item *it = assoc_find(key, nkey, hv);
    if (it != NULL) {
        refcount_incr(it);
        /* Optimization for slab reassignment. prevents popular items from
         * jamming in busy wait. Can only do this here to satisfy lock order
         * of item_lock, slabs_lock. */
        /* This was made unsafe by removal of the cache_lock:
         * slab_rebalance_signal and slab_rebal.* are modified in a separate
         * thread under slabs_lock. If slab_rebalance_signal = 1, slab_start =
         * NULL (0), but slab_end is still equal to some value, this would end
         * up unlinking every item fetched.
         * This is either an acceptable loss, or if slab_rebalance_signal is
         * true, slab_start/slab_end should be put behind the slabs_lock.
         * Which would cause a huge potential slowdown.
         * Could also use a specific lock for slab_rebal.* and
         * slab_rebalance_signal (shorter lock?)
         */
        /*if (slab_rebalance_signal &&
            ((void *)it >= slab_rebal.slab_start && (void *)it < slab_rebal.slab_end)) {
            do_item_unlink(it, hv);
            do_item_remove(it);
            it = NULL;
        }*/
    }
    int was_found = 0;

    if (settings.verbose > 2) {
        int ii;
        if (it == NULL) {
            fprintf(stderr, "> NOT FOUND ");
        } else {
            fprintf(stderr, "> FOUND KEY ");
        }
        for (ii = 0; ii < nkey; ++ii) {
            fprintf(stderr, "%c", key[ii]);
        }
    }

    if (it != NULL) {
        was_found = 1;
        if (item_is_flushed(it)) {
            do_item_unlink(it, hv);
            STORAGE_delete(t->storage, it);
            do_item_remove(it);
            it = NULL;
            pthread_mutex_lock(&t->stats.mutex);
            t->stats.get_flushed++;
            pthread_mutex_unlock(&t->stats.mutex);
            if (settings.verbose > 2) {
                fprintf(stderr, " -nuked by flush");
            }
            was_found = 2;
        } else if (it->exptime != 0 && it->exptime <= current_time) {
            do_item_unlink(it, hv);
            STORAGE_delete(t->storage, it);
            do_item_remove(it);
            it = NULL;
            pthread_mutex_lock(&t->stats.mutex);
            t->stats.get_expired++;
            pthread_mutex_unlock(&t->stats.mutex);
            if (settings.verbose > 2) {
                fprintf(stderr, " -nuked by expire");
            }
            was_found = 3;
        } else {
            if (do_update) {
                do_item_bump(t, it, hv);
            }
            DEBUG_REFCNT(it, '+');
        }
    }

    if (settings.verbose > 2)
        fprintf(stderr, "\n");
    /* For now this is in addition to the above verbose logging. */
    LOGGER_LOG(t->l, LOG_FETCHERS, LOGGER_ITEM_GET, NULL, was_found, key,
               nkey, (it) ? it->nbytes : 0, (it) ? ITEM_clsid(it) : 0, t->cur_sfd);

    return it;
}

// Requires lock held for item.
// Split out of do_item_get() to allow mget functions to look through header
// data before losing state modified via the bump function.
void do_item_bump(LIBEVENT_THREAD *t, item *it, const uint32_t hv) {
    /* We update the hit markers only during fetches.
     * An item needs to be hit twice overall to be considered
     * ACTIVE, but only needs a single hit to maintain activity
     * afterward.
     * FETCHED tells if an item has ever been active.
     */
    if (settings.lru_segmented) {
        if ((it->it_flags & ITEM_ACTIVE) == 0) {
            if ((it->it_flags & ITEM_FETCHED) == 0) {
                it->it_flags |= ITEM_FETCHED;
            } else {
                it->it_flags |= ITEM_ACTIVE;
                if (ITEM_lruid(it) != COLD_LRU) {
                    it->time = current_time; // only need to bump time.
                } else if (!lru_bump_async(t->lru_bump_buf, it, hv)) {
                    // add flag before async bump to avoid race.
                    it->it_flags &= ~ITEM_ACTIVE;
                }
            }
        }
    } else {
        it->it_flags |= ITEM_FETCHED;
        do_item_update(it);
    }
}
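
/* Flag transitions in segmented mode, roughly: the first fetch sets
 * ITEM_FETCHED, the second sets ITEM_ACTIVE; a second fetch while in COLD
 * also queues an async bump toward WARM via lru_bump_async(), and the ACTIVE
 * bit is rolled back if that queueing fails. */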

item *do_item_touch(const char *key, size_t nkey, uint32_t exptime,
                    const uint32_t hv, LIBEVENT_THREAD *t) {
    item *it = do_item_get(key, nkey, hv, t, DO_UPDATE);
    if (it != NULL) {
        it->exptime = exptime;
    }
    return it;
}

/*** LRU MAINTENANCE THREAD ***/

/* Returns the number of items removed, expired, or evicted.
 * Callable from worker threads or the LRU maintainer thread */
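/* Flag semantics as used below: LRU_PULL_EVICT permits evicting from the
 * COLD tail when nothing has expired; LRU_PULL_CRAWL_BLOCKS aborts the pull
 * if a crawler item sits at the tail; LRU_PULL_RETURN_ITEM keeps the item
 * reference and hash lock and hands the tail item back via ret_it. */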
int lru_pull_tail(const int orig_id, const int cur_lru,
        const uint64_t total_bytes, const uint8_t flags, const rel_time_t max_age,
        struct lru_pull_tail_return *ret_it) {
    item *it = NULL;
    int id = orig_id;
    int removed = 0;
    if (id == 0)
        return 0;

    int tries = 5;
    item *search;
    item *next_it;
    void *hold_lock = NULL;
    unsigned int move_to_lru = 0;
    uint64_t limit = 0;

    id |= cur_lru;
    pthread_mutex_lock(&lru_locks[id]);
    search = tails[id];
    /* We walk up *only* for locked items, and if bottom is expired. */
    for (; tries > 0 && search != NULL; tries--, search=next_it) {
        /* we might relink search mid-loop, so search->prev isn't reliable */
        next_it = search->prev;
        if (search->nbytes == 0 && search->nkey == 0 && search->it_flags == 1) {
            /* We are a crawler, ignore it. */
            if (flags & LRU_PULL_CRAWL_BLOCKS) {
                pthread_mutex_unlock(&lru_locks[id]);
                return 0;
            }
            tries++;
            continue;
        }
        uint32_t hv = hash(ITEM_key(search), search->nkey);
        /* Attempt to hash item lock the "search" item. If locked, no
         * other callers can incr the refcount. Also skip ourselves. */
        if ((hold_lock = item_trylock(hv)) == NULL)
            continue;
        /* Now see if the item is refcount locked */
        if (refcount_incr(search) != 2) {
            /* Note pathological case with ref'ed items in tail.
             * Can still unlink the item, but it won't be reusable yet */
            itemstats[id].lrutail_reflocked++;
            /* In case of refcount leaks, enable for quick workaround. */
            /* WARNING: This can cause terrible corruption */
            if (settings.tail_repair_time &&
                    search->time + settings.tail_repair_time < current_time) {
                itemstats[id].tailrepairs++;
                search->refcount = 1;
                /* This will call item_remove -> item_free since refcnt is 1 */
                STORAGE_delete(ext_storage, search);
                do_item_unlink_nolock(search, hv);
                item_trylock_unlock(hold_lock);
                continue;
            }
        }

        /* Expired or flushed */
        if ((search->exptime != 0 && search->exptime < current_time)
            || item_is_flushed(search)) {
            itemstats[id].reclaimed++;
            if ((search->it_flags & ITEM_FETCHED) == 0) {
                itemstats[id].expired_unfetched++;
            }
            /* refcnt 2 -> 1 */
            do_item_unlink_nolock(search, hv);
            STORAGE_delete(ext_storage, search);
            /* refcnt 1 -> 0 -> item_free */
            do_item_remove(search);
            item_trylock_unlock(hold_lock);
            removed++;

            /* If all we're finding are expired, can keep going */
            continue;
        }

        /* If we're HOT_LRU or WARM_LRU and over size limit, send to COLD_LRU.
         * If we're COLD_LRU, send to WARM_LRU unless we need to evict
         */
        switch (cur_lru) {
            case HOT_LRU:
                limit = total_bytes * settings.hot_lru_pct / 100;
                /* fall through: HOT shares the move-to-COLD logic below */
            case WARM_LRU:
                if (limit == 0)
                    limit = total_bytes * settings.warm_lru_pct / 100;
                /* Rescue ACTIVE items aggressively */
                if ((search->it_flags & ITEM_ACTIVE) != 0) {
                    search->it_flags &= ~ITEM_ACTIVE;
                    removed++;
                    if (cur_lru == WARM_LRU) {
                        itemstats[id].moves_within_lru++;
                        do_item_unlink_q(search);
                        do_item_link_q(search);
                        do_item_remove(search);
                        item_trylock_unlock(hold_lock);
                    } else {
                        /* Active HOT_LRU items flow to WARM */
                        itemstats[id].moves_to_warm++;
                        move_to_lru = WARM_LRU;
                        do_item_unlink_q(search);
                        it = search;
                    }
                } else if (sizes_bytes[id] > limit ||
                           current_time - search->time > max_age) {
                    itemstats[id].moves_to_cold++;
                    move_to_lru = COLD_LRU;
                    do_item_unlink_q(search);
                    it = search;
                    removed++;
                    break;
                } else {
                    /* Don't want to move to COLD, not active, bail out */
                    it = search;
                }
                break;
            case COLD_LRU:
                it = search; /* No matter what, we're stopping */
                if (flags & LRU_PULL_EVICT) {
                    if (settings.evict_to_free == 0) {
                        /* Don't think we need a counter for this. It'll OOM.  */
                        break;
                    }
                    itemstats[id].evicted++;
                    itemstats[id].evicted_time = current_time - search->time;
                    if (search->exptime != 0)
                        itemstats[id].evicted_nonzero++;
                    if ((search->it_flags & ITEM_FETCHED) == 0) {
                        itemstats[id].evicted_unfetched++;
                    }
                    if ((search->it_flags & ITEM_ACTIVE)) {
                        itemstats[id].evicted_active++;
                    }
                    LOGGER_LOG(NULL, LOG_EVICTIONS, LOGGER_EVICTION, search);
                    STORAGE_delete(ext_storage, search);
                    do_item_unlink_nolock(search, hv);
                    removed++;
                    if (settings.slab_automove == 2) {
                        slabs_reassign(-1, orig_id);
                    }
                } else if (flags & LRU_PULL_RETURN_ITEM) {
                    /* Keep a reference to this item and return it. */
                    ret_it->it = it;
                    ret_it->hv = hv;
                } else if ((search->it_flags & ITEM_ACTIVE) != 0
                        && settings.lru_segmented) {
                    itemstats[id].moves_to_warm++;
                    search->it_flags &= ~ITEM_ACTIVE;
                    move_to_lru = WARM_LRU;
                    do_item_unlink_q(search);
                    removed++;
                }
                break;
            case TEMP_LRU:
                it = search; /* Kill the loop. Parent only interested in reclaims */
                break;
        }
        if (it != NULL)
            break;
    }

    pthread_mutex_unlock(&lru_locks[id]);

    if (it != NULL) {
        if (move_to_lru) {
            it->slabs_clsid = ITEM_clsid(it);
            it->slabs_clsid |= move_to_lru;
            item_link_q(it);
        }
        if ((flags & LRU_PULL_RETURN_ITEM) == 0) {
            do_item_remove(it);
            item_trylock_unlock(hold_lock);
        }
    }

    return removed;
}
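
/* Typical call shapes, both used elsewhere in this file: a worker evicting to
 * make room calls lru_pull_tail(id, COLD_LRU, 0, LRU_PULL_EVICT, 0, NULL),
 * while the maintainer thread rebalances with lru_pull_tail(id, HOT_LRU,
 * total_bytes, LRU_PULL_CRAWL_BLOCKS, hot_age, NULL). */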


/* TODO: Third place this code needs to be deduped */
static void lru_bump_buf_link_q(lru_bump_buf *b) {
    pthread_mutex_lock(&bump_buf_lock);
    assert(b != bump_buf_head);

    b->prev = 0;
    b->next = bump_buf_head;
    if (b->next) b->next->prev = b;
    bump_buf_head = b;
    if (bump_buf_tail == 0) bump_buf_tail = b;
    pthread_mutex_unlock(&bump_buf_lock);
    return;
}

void *item_lru_bump_buf_create(void) {
    lru_bump_buf *b = calloc(1, sizeof(lru_bump_buf));
    if (b == NULL) {
        return NULL;
    }

    b->buf = bipbuf_new(sizeof(lru_bump_entry) * LRU_BUMP_BUF_SIZE);
    if (b->buf == NULL) {
        free(b);
        return NULL;
    }

    pthread_mutex_init(&b->mutex, NULL);

    lru_bump_buf_link_q(b);
    return b;
}

static bool lru_bump_async(lru_bump_buf *b, item *it, uint32_t hv) {
    bool ret = true;
    refcount_incr(it);
    pthread_mutex_lock(&b->mutex);
    lru_bump_entry *be = (lru_bump_entry *) bipbuf_request(b->buf, sizeof(lru_bump_entry));
    if (be != NULL) {
        be->it = it;
        be->hv = hv;
        if (bipbuf_push(b->buf, sizeof(lru_bump_entry)) == 0) {
            ret = false;
            b->dropped++;
        }
    } else {
        ret = false;
        b->dropped++;
    }
    if (!ret) {
        refcount_decr(it);
    }
    pthread_mutex_unlock(&b->mutex);
    return ret;
}
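
/* Ownership note: the refcount taken above travels with the queued entry and
 * is released by lru_maintainer_bumps() below via do_item_remove(); on a
 * failed request/push it is dropped immediately. As I read the bipbuffer
 * API, bipbuf_request() only reserves space and bipbuf_push() commits it, so
 * a failed push leaves the buffer unchanged. */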
1336 
1337 /* TODO: Might be worth a micro-optimization of having bump buffers link
1338  * themselves back into the central queue when queue goes from zero to
1339  * non-zero, then remove from list if zero more than N times.
1340  * If very few hits on cold this would avoid extra memory barriers from LRU
1341  * maintainer thread. If many hits, they'll just stay in the list.
1342  */
lru_maintainer_bumps(void)1343 static bool lru_maintainer_bumps(void) {
1344     lru_bump_buf *b;
1345     lru_bump_entry *be;
1346     unsigned int size;
1347     unsigned int todo;
1348     bool bumped = false;
1349     pthread_mutex_lock(&bump_buf_lock);
1350     for (b = bump_buf_head; b != NULL; b=b->next) {
1351         pthread_mutex_lock(&b->mutex);
1352         be = (lru_bump_entry *) bipbuf_peek_all(b->buf, &size);
1353         pthread_mutex_unlock(&b->mutex);
1354 
1355         if (be == NULL) {
1356             continue;
1357         }
1358         todo = size;
1359         bumped = true;
1360 
1361         while (todo) {
1362             item_lock(be->hv);
1363             do_item_update(be->it);
1364             do_item_remove(be->it);
1365             item_unlock(be->hv);
1366             be++;
1367             todo -= sizeof(lru_bump_entry);
1368         }
1369 
1370         pthread_mutex_lock(&b->mutex);
1371         be = (lru_bump_entry *) bipbuf_poll(b->buf, size);
1372         pthread_mutex_unlock(&b->mutex);
1373     }
1374     pthread_mutex_unlock(&bump_buf_lock);
1375     return bumped;
1376 }

static uint64_t lru_total_bumps_dropped(void) {
    uint64_t total = 0;
    lru_bump_buf *b;
    pthread_mutex_lock(&bump_buf_lock);
    for (b = bump_buf_head; b != NULL; b=b->next) {
        pthread_mutex_lock(&b->mutex);
        total += b->dropped;
        pthread_mutex_unlock(&b->mutex);
    }
    pthread_mutex_unlock(&bump_buf_lock);
    return total;
}
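
/* The aggregate is what the general stats report as dropped LRU bumps;
 * a steadily rising value suggests the bump buffers are too small for
 * the hit rate on cold items. */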

/* Loop up to N times:
 * If too many items are in HOT_LRU, push to COLD_LRU
 * If too many items are in WARM_LRU, push to COLD_LRU
 * If too many items are in COLD_LRU, poke COLD_LRU tail
 * 500 loops per wakeup with a 1ms minimum sleep keeps us under 1m items
 * shifted/sec. The locks can't handle much more than that. Leaving a
 * TODO for how to autoadjust in the future.
 */
static int lru_maintainer_juggle(const int slabs_clsid) {
    int i;
    int did_moves = 0;
    uint64_t total_bytes = 0;
    unsigned int chunks_perslab = 0;
    //unsigned int chunks_free = 0;
    /* TODO: if free_chunks below high watermark, increase aggressiveness */
    slabs_available_chunks(slabs_clsid, NULL, &chunks_perslab);
    if (settings.temp_lru) {
        /* Only looking for reclaims. Run before we size the LRU. */
        for (i = 0; i < 500; i++) {
            if (lru_pull_tail(slabs_clsid, TEMP_LRU, 0, 0, 0, NULL) <= 0) {
                break;
            } else {
                did_moves++;
            }
        }
    }

    rel_time_t cold_age = 0;
    rel_time_t hot_age = 0;
    rel_time_t warm_age = 0;
    /* If LRU is in flat mode, force items to drain into COLD via max age of 0 */
    if (settings.lru_segmented) {
        pthread_mutex_lock(&lru_locks[slabs_clsid|COLD_LRU]);
        if (tails[slabs_clsid|COLD_LRU]) {
            cold_age = current_time - tails[slabs_clsid|COLD_LRU]->time;
        }
        // Also build up total_bytes for the classes.
        total_bytes += sizes_bytes[slabs_clsid|COLD_LRU];
        pthread_mutex_unlock(&lru_locks[slabs_clsid|COLD_LRU]);

        hot_age = cold_age * settings.hot_max_factor;
        warm_age = cold_age * settings.warm_max_factor;
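        /* Worked example, assuming the default factors (hot_max_factor
         * 0.2, warm_max_factor 2.0): if the COLD tail item is 3600s old,
         * HOT items idle longer than 720s and WARM items idle longer
         * than 7200s become eligible to age out into COLD. */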

        // total_bytes doesn't have to be exact. cache it for the juggles.
        pthread_mutex_lock(&lru_locks[slabs_clsid|HOT_LRU]);
        total_bytes += sizes_bytes[slabs_clsid|HOT_LRU];
        pthread_mutex_unlock(&lru_locks[slabs_clsid|HOT_LRU]);

        pthread_mutex_lock(&lru_locks[slabs_clsid|WARM_LRU]);
        total_bytes += sizes_bytes[slabs_clsid|WARM_LRU];
        pthread_mutex_unlock(&lru_locks[slabs_clsid|WARM_LRU]);
    }

    /* Juggle HOT/WARM up to N times */
    for (i = 0; i < 500; i++) {
        int do_more = 0;
        if (lru_pull_tail(slabs_clsid, HOT_LRU, total_bytes, LRU_PULL_CRAWL_BLOCKS, hot_age, NULL) ||
            lru_pull_tail(slabs_clsid, WARM_LRU, total_bytes, LRU_PULL_CRAWL_BLOCKS, warm_age, NULL)) {
            do_more++;
        }
        if (settings.lru_segmented) {
            do_more += lru_pull_tail(slabs_clsid, COLD_LRU, total_bytes, LRU_PULL_CRAWL_BLOCKS, 0, NULL);
        }
        if (do_more == 0)
            break;
        did_moves++;
    }
    return did_moves;
}

/* Will crawl all slab classes a minimum of once per hour */
#define MAX_MAINTCRAWL_WAIT (60 * 60)

/* Hoping user input will improve this function. This is all a wild guess.
 * Operation: Kicks the crawler for each slab id. Crawlers gather
 * statistics on items with nonzero expirations, bucketing how many items
 * will expire per minute for the next hour.
 * This function checks the results of a run, and if it thinks more than
 * 1% of expirable objects are ready to go, kicks the crawler again to
 * reap them.
 * It will also kick the crawler once per minute regardless, waiting a
 * minute longer each time it has no work to do, up to an hour's wait.
 * The latter keeps newly started daemons from waiting too long before
 * retrying a crawl.
 */
static void lru_maintainer_crawler_check(struct crawler_expired_data *cdata, logger *l) {
    int i;
    static rel_time_t next_crawls[POWER_LARGEST];
    static rel_time_t next_crawl_wait[POWER_LARGEST];
    uint8_t todo[POWER_LARGEST];
    memset(todo, 0, sizeof(uint8_t) * POWER_LARGEST);
    bool do_run = false;
    unsigned int tocrawl_limit = 0;

    // TODO: If not segmented LRU, skip non-cold
    for (i = POWER_SMALLEST; i < POWER_LARGEST; i++) {
        crawlerstats_t *s = &cdata->crawlerstats[i];
        /* Only reschedule once a crawl has completed and reported stats. */
        if (s->run_complete) {
            char *lru_name = "na";
            pthread_mutex_lock(&cdata->lock);
            int x;
            /* Should we crawl again? */
            uint64_t possible_reclaims = s->seen - s->noexp;
            uint64_t available_reclaims = 0;
            /* Need to think we can free at least 1% of the items before
             * crawling. */
            /* FIXME: Configurable? */
            uint64_t low_watermark = (possible_reclaims / 100) + 1;
            rel_time_t since_run = current_time - s->end_time;
            /* Don't bother if the payoff is too low. */
            for (x = 0; x < 60; x++) {
                available_reclaims += s->histo[x];
                if (available_reclaims > low_watermark) {
                    if (next_crawl_wait[i] < (x * 60)) {
                        next_crawl_wait[i] += 60;
                    } else if (next_crawl_wait[i] >= 60) {
                        next_crawl_wait[i] -= 60;
                    }
                    break;
                }
            }
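
            /* Worked example: seen=100000 and noexp=20000 give
             * possible_reclaims=80000, so low_watermark=801. If the
             * cumulative histogram crosses 801 at bucket x=3 (payoff
             * within ~3 minutes), a 300s wait shrinks by 60s while a
             * 120s wait grows by 60s: the schedule converges toward the
             * payoff horizon. */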

            if (available_reclaims == 0) {
                next_crawl_wait[i] += 60;
            }

            if (next_crawl_wait[i] > MAX_MAINTCRAWL_WAIT) {
                next_crawl_wait[i] = MAX_MAINTCRAWL_WAIT;
            }

            next_crawls[i] = current_time + next_crawl_wait[i] + 5;
            switch (GET_LRU(i)) {
                case HOT_LRU:
                    lru_name = "hot";
                    break;
                case WARM_LRU:
                    lru_name = "warm";
                    break;
                case COLD_LRU:
                    lru_name = "cold";
                    break;
                case TEMP_LRU:
                    lru_name = "temp";
                    break;
            }
            LOGGER_LOG(l, LOG_SYSEVENTS, LOGGER_CRAWLER_STATUS, NULL,
                    CLEAR_LRU(i),
                    lru_name,
                    (unsigned long long)low_watermark,
                    (unsigned long long)available_reclaims,
                    (unsigned int)since_run,
                    next_crawls[i] - current_time,
                    s->end_time - s->start_time,
                    s->seen,
                    s->reclaimed);
            // Got our calculation, avoid running until next actual run.
            s->run_complete = false;
            pthread_mutex_unlock(&cdata->lock);
        }
        if (current_time > next_crawls[i]) {
            pthread_mutex_lock(&lru_locks[i]);
            if (sizes[i] > tocrawl_limit) {
                tocrawl_limit = sizes[i];
            }
            pthread_mutex_unlock(&lru_locks[i]);
            todo[i] = 1;
            do_run = true;
            next_crawls[i] = current_time + 5; // minimum retry wait.
        }
    }
    if (do_run) {
        if (settings.lru_crawler_tocrawl && settings.lru_crawler_tocrawl < tocrawl_limit) {
            tocrawl_limit = settings.lru_crawler_tocrawl;
        }
        lru_crawler_start(todo, tocrawl_limit, CRAWLER_AUTOEXPIRE, cdata, NULL, 0);
    }
}

slab_automove_reg_t slab_automove_default = {
    .init = slab_automove_init,
    .free = slab_automove_free,
    .run = slab_automove_run
};
#ifdef EXTSTORE
slab_automove_reg_t slab_automove_extstore = {
    .init = slab_automove_extstore_init,
    .free = slab_automove_extstore_free,
    .run = slab_automove_extstore_run
};
#endif
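
/* The automover is a small vtable so EXTSTORE builds can swap in a
 * storage-aware policy: the maintainer thread below calls init() once,
 * run() at most once per second, and free() on shutdown or whenever the
 * automove ratio setting changes. */
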
static pthread_t lru_maintainer_tid;

/* Keep usleep() arguments under one full second; values >= 1000000 are
 * not portable. */
#define MAX_LRU_MAINTAINER_SLEEP (1000000-1)
#define MIN_LRU_MAINTAINER_SLEEP 1000
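
/* Backoff arithmetic: an idle class grows its sleep by 1/8 per pass
 * (1000us -> 1125 -> 1265 -> ...), taking roughly 59 consecutive idle
 * passes to hit the ~1s ceiling, while any pass that moves items halves
 * it, dropping to zero once under MIN_LRU_MAINTAINER_SLEEP. */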

static void *lru_maintainer_thread(void *arg) {
    slab_automove_reg_t *sam = &slab_automove_default;
#ifdef EXTSTORE
    void *storage = arg;
    if (storage != NULL)
        sam = &slab_automove_extstore;
#endif
    int i;
    useconds_t to_sleep = MIN_LRU_MAINTAINER_SLEEP;
    useconds_t last_sleep = MIN_LRU_MAINTAINER_SLEEP;
    rel_time_t last_crawler_check = 0;
    rel_time_t last_automove_check = 0;
    useconds_t next_juggles[MAX_NUMBER_OF_SLAB_CLASSES] = {0};
    useconds_t backoff_juggles[MAX_NUMBER_OF_SLAB_CLASSES] = {0};
    struct crawler_expired_data *cdata =
        calloc(1, sizeof(struct crawler_expired_data));
    if (cdata == NULL) {
        fprintf(stderr, "Failed to allocate crawler data for LRU maintainer thread\n");
        abort();
    }
    pthread_mutex_init(&cdata->lock, NULL);
    cdata->crawl_complete = true; // kick off the crawler.
    logger *l = logger_create();
    if (l == NULL) {
        fprintf(stderr, "Failed to allocate logger for LRU maintainer thread\n");
        abort();
    }

    double last_ratio = settings.slab_automove_ratio;
    void *am = sam->init(&settings);

    pthread_mutex_lock(&lru_maintainer_lock);
    if (settings.verbose > 2)
        fprintf(stderr, "Starting LRU maintainer background thread\n");
    while (do_run_lru_maintainer_thread) {
        pthread_mutex_unlock(&lru_maintainer_lock);
        if (to_sleep)
            usleep(to_sleep);
        pthread_mutex_lock(&lru_maintainer_lock);
        /* A sleep of zero counts as a minimum of a 1ms wait */
        last_sleep = to_sleep > 1000 ? to_sleep : 1000;
        to_sleep = MAX_LRU_MAINTAINER_SLEEP;

        STATS_LOCK();
        stats.lru_maintainer_juggles++;
        STATS_UNLOCK();

        /* Each slab class gets its own sleep to avoid hammering locks */
        for (i = POWER_SMALLEST; i < MAX_NUMBER_OF_SLAB_CLASSES; i++) {
            next_juggles[i] = next_juggles[i] > last_sleep ? next_juggles[i] - last_sleep : 0;

            if (next_juggles[i] > 0) {
                // Sleep the thread just for the minimum amount (or not at all)
                if (next_juggles[i] < to_sleep)
                    to_sleep = next_juggles[i];
                continue;
            }

            int did_moves = lru_maintainer_juggle(i);
            if (did_moves == 0) {
                if (backoff_juggles[i] != 0) {
                    backoff_juggles[i] += backoff_juggles[i] / 8;
                } else {
                    backoff_juggles[i] = MIN_LRU_MAINTAINER_SLEEP;
                }
                if (backoff_juggles[i] > MAX_LRU_MAINTAINER_SLEEP)
                    backoff_juggles[i] = MAX_LRU_MAINTAINER_SLEEP;
            } else if (backoff_juggles[i] > 0) {
                backoff_juggles[i] /= 2;
                if (backoff_juggles[i] < MIN_LRU_MAINTAINER_SLEEP) {
                    backoff_juggles[i] = 0;
                }
            }
            next_juggles[i] = backoff_juggles[i];
            if (next_juggles[i] < to_sleep)
                to_sleep = next_juggles[i];
        }

        /* Minimize the sleep if we had async LRU bumps to process */
        if (settings.lru_segmented && lru_maintainer_bumps() && to_sleep > 1000) {
            to_sleep = 1000;
        }

        /* Once per second at most */
        if (settings.lru_crawler && last_crawler_check != current_time) {
            lru_maintainer_crawler_check(cdata, l);
            last_crawler_check = current_time;
        }

        if (settings.slab_automove == 1 && last_automove_check != current_time) {
            if (last_ratio != settings.slab_automove_ratio) {
                sam->free(am);
                am = sam->init(&settings);
                last_ratio = settings.slab_automove_ratio;
            }
            int src, dst;
            sam->run(am, &src, &dst);
            if (src != -1 && dst != -1) {
                slabs_reassign(src, dst);
                LOGGER_LOG(l, LOG_SYSEVENTS, LOGGER_SLAB_MOVE, NULL,
                        src, dst);
            }
            // dst == 0 means reclaim to global pool, be more aggressive
            if (dst != 0) {
                last_automove_check = current_time;
            } else if (dst == 0) {
                // also ensure we minimize the thread sleep
                to_sleep = 1000;
            }
        }
    }
    pthread_mutex_unlock(&lru_maintainer_lock);
    sam->free(am);
    // LRU crawler *must* be stopped.
    free(cdata);
    if (settings.verbose > 2)
        fprintf(stderr, "LRU maintainer thread stopping\n");

    return NULL;
}

int stop_lru_maintainer_thread(void) {
    int ret;
    pthread_mutex_lock(&lru_maintainer_lock);
    /* LRU thread is a sleep loop, will die on its own */
    do_run_lru_maintainer_thread = 0;
    pthread_mutex_unlock(&lru_maintainer_lock);
    if ((ret = pthread_join(lru_maintainer_tid, NULL)) != 0) {
        fprintf(stderr, "Failed to stop LRU maintainer thread: %s\n", strerror(ret));
        return -1;
    }
    settings.lru_maintainer_thread = false;
    return 0;
}

int start_lru_maintainer_thread(void *arg) {
    int ret;

    pthread_mutex_lock(&lru_maintainer_lock);
    do_run_lru_maintainer_thread = 1;
    settings.lru_maintainer_thread = true;
    if ((ret = pthread_create(&lru_maintainer_tid, NULL,
        lru_maintainer_thread, arg)) != 0) {
        fprintf(stderr, "Can't create LRU maintainer thread: %s\n",
            strerror(ret));
        pthread_mutex_unlock(&lru_maintainer_lock);
        return -1;
    }
    thread_setname(lru_maintainer_tid, "mc-lrumaint");
    pthread_mutex_unlock(&lru_maintainer_lock);

    return 0;
}

/* While this lock is held the LRU maintainer can't wake up, so it won't
 * start new crawls or shuffle items */
void lru_maintainer_pause(void) {
    pthread_mutex_lock(&lru_maintainer_lock);
}

void lru_maintainer_resume(void) {
    pthread_mutex_unlock(&lru_maintainer_lock);
}
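
/* A sketch of typical usage: callers that need the maintainer quiescent
 * bracket their critical section:
 *
 *     lru_maintainer_pause();
 *     // ... safely mutate slab/LRU state ...
 *     lru_maintainer_resume();
 */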

/* Tail linkers and crawler for the LRU crawler. */
void do_item_linktail_q(item *it) { /* item is the new tail */
    item **head, **tail;
    assert(it->it_flags == 1);
    assert(it->nbytes == 0);

    head = &heads[it->slabs_clsid];
    tail = &tails[it->slabs_clsid];
    //assert(*tail != 0);
    assert(it != *tail);
    assert((*head && *tail) || (*head == 0 && *tail == 0));
    it->prev = *tail;
    it->next = 0;
    if (it->prev) {
        assert(it->prev->next == 0);
        it->prev->next = it;
    }
    *tail = it;
    if (*head == 0) *head = it;
    return;
}

void do_item_unlinktail_q(item *it) {
    item **head, **tail;
    head = &heads[it->slabs_clsid];
    tail = &tails[it->slabs_clsid];

    if (*head == it) {
        assert(it->prev == 0);
        *head = it->next;
    }
    if (*tail == it) {
        assert(it->next == 0);
        *tail = it->prev;
    }
    assert(it->next != it);
    assert(it->prev != it);

    if (it->next) it->next->prev = it->prev;
    if (it->prev) it->prev->next = it->next;
    return;
}

/* This is too convoluted, but it's a difficult shuffle. Try to rewrite it
 * more clearly. */
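/* Example, head-to-tail [A, B, C, D] with the crawler at C: one call
 * swaps the crawler with its prev, yielding [A, C, B, D], and returns B,
 * the item just stepped over, for the caller to inspect. Repeated calls
 * walk the crawler toward the head; once it reaches the head (prev ==
 * NULL) it pops itself off and returns NULL. */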
item *do_item_crawl_q(item *it) {
    item **head, **tail;
    assert(it->it_flags == 1);
    assert(it->nbytes == 0);
    head = &heads[it->slabs_clsid];
    tail = &tails[it->slabs_clsid];

    /* We've hit the head, pop off */
    if (it->prev == 0) {
        assert(*head == it);
        if (it->next) {
            *head = it->next;
            assert(it->next->prev == it);
            it->next->prev = 0;
        }
        return NULL; /* Done */
    }

    /* Swing ourselves in front of the next item */
    /* NB: If there is a prev, we can't be the head */
    assert(it->prev != it);
    if (it->prev) {
        if (*head == it->prev) {
            /* Prev was the head, now we're the head */
            *head = it;
        }
        if (*tail == it) {
            /* We are the tail, now they are the tail */
            *tail = it->prev;
        }
        assert(it->next != it);
        if (it->next) {
            assert(it->prev->next == it);
            it->prev->next = it->next;
            it->next->prev = it->prev;
        } else {
            /* Tail. Move this above? */
            it->prev->next = 0;
        }
        /* Splice back in: the old prev becomes our next. */
        it->next = it->prev;
        it->prev = it->next->prev;
        it->next->prev = it;
        /* New it->prev now, if we're not at the head. */
        if (it->prev) {
            it->prev->next = it;
        }
    }
    assert(it->next != it);
    assert(it->prev != it);

    return it->next; /* success */
}