/* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
/*
 * Slabs memory allocation, based on powers-of-N. Slabs are up to 1MB in size
 * and are divided into chunks. The chunk sizes start off at the size of the
 * "item" structure plus space for a small key and value. They increase by
 * a multiplier factor from there, up to half the maximum slab size. The last
 * slab size is always 1MB, since that's the maximum item size allowed by the
 * memcached protocol.
 */
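/*
 * Illustrative sketch (not part of the build): how the chunk sizes grow.
 * The constants below are stand-ins for sizeof(item) + settings.chunk_size,
 * settings.slab_chunk_size_max and CHUNK_ALIGN_BYTES; the authoritative
 * loop lives in slabs_init() below.
 */
#if 0
#include <stdio.h>

int main(void) {
    double factor = 1.25;             /* default growth factor (-f) */
    unsigned int size = 96;           /* roughly sizeof(item) + default chunk_size */
    unsigned int chunk_max = 1048576; /* stand-in for settings.slab_chunk_size_max */
    int id = 1;

    while (size < chunk_max / factor) {
        if (size % 8)
            size += 8 - (size % 8);   /* CHUNK_ALIGN_BYTES alignment */
        printf("class %3d: chunk size %u\n", id++, size);
        size *= factor;
    }
    printf("class %3d: chunk size %u (capped)\n", id, chunk_max);
    return 0;
}
#endif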
#include "memcached.h"
#include "storage.h"
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/resource.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <signal.h>
#include <assert.h>
#include <pthread.h>

//#define DEBUG_SLAB_MOVER
/* powers-of-N allocation structures */

typedef struct {
    unsigned int size;      /* sizes of items */
    unsigned int perslab;   /* how many items per slab */

    void *slots;           /* list of item ptrs */
    unsigned int sl_curr;   /* total free items in list */

    unsigned int slabs;     /* how many slabs were allocated for this class */

    void **slab_list;       /* array of slab pointers */
    unsigned int list_size; /* size of prev array */
} slabclass_t;

static slabclass_t slabclass[MAX_NUMBER_OF_SLAB_CLASSES];
static size_t mem_limit = 0;
static size_t mem_malloced = 0;
/* True once the memory limit has been hit. Used as a hint to decide when to
 * early-wake the LRU maintenance thread. */
static bool mem_limit_reached = false;
static int power_largest;

static void *mem_base = NULL;
static void *mem_current = NULL;
static size_t mem_avail = 0;
#ifdef EXTSTORE
static void *storage = NULL;
#endif
/**
 * Access to the slab allocator is protected by this lock
 */
static pthread_mutex_t slabs_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t slabs_rebalance_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Forward Declarations
 */
static int grow_slab_list (const unsigned int id);
static int do_slabs_newslab(const unsigned int id);
static void *memory_allocate(size_t size);
static void do_slabs_free(void *ptr, const size_t size, unsigned int id);
/* Preallocate as many slab pages as possible (called from slabs_init)
   on start-up, so users don't get confusing out-of-memory errors when
   they do have free (in-slab) space, but no space to make new slabs.
   If maxslabs is 18 (POWER_LARGEST - POWER_SMALLEST + 1), then all
   slab types can be made.  If max memory is less than 18 MB, only the
   smaller ones will be made.  */
static void slabs_preallocate (const unsigned int maxslabs);
#ifdef EXTSTORE
void slabs_set_storage(void *arg) {
    storage = arg;
}
#endif
/*
 * Figures out which slab class (chunk size) is required to store an item of
 * a given size.
 *
 * Given object size, return id to use when allocating/freeing memory for object
 * 0 means error: can't store such a large object
 */

unsigned int slabs_clsid(const size_t size) {
    int res = POWER_SMALLEST;

    if (size == 0 || size > settings.item_size_max)
        return 0;
    while (size > slabclass[res].size)
        if (res++ == power_largest)     /* won't fit in the biggest slab */
            return power_largest;
    return res;
}
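/*
 * Usage sketch (illustrative): mapping an object size to a class id. With
 * the default class layout a 100-byte object lands in the first class whose
 * chunk size is >= 100 bytes; a return of 0 means the object is larger than
 * settings.item_size_max and cannot be stored at all.
 */
#if 0
unsigned int id = slabs_clsid(100);
if (id == 0) {
    /* too large for any class */
} else {
    /* slabclass[id].size >= 100; allocate from it via slabs_alloc() */
}
#endif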

unsigned int slabs_size(const int clsid) {
    return slabclass[clsid].size;
}

// TODO: could this work with the restartable memory?
// Docs say hugepages only work with private shm allocs.
/* Function split out for better error path handling */
static void * alloc_large_chunk(const size_t limit)
{
    void *ptr = NULL;
#if defined(__linux__) && defined(MADV_HUGEPAGE)
    size_t pagesize = 0;
    FILE *fp;
    int ret;

    /* Get the size of huge pages */
    fp = fopen("/proc/meminfo", "r");
    if (fp != NULL) {
        char buf[64];

        while ((fgets(buf, sizeof(buf), fp)))
            if (!strncmp(buf, "Hugepagesize:", 13)) {
                ret = sscanf(buf + 13, "%zu\n", &pagesize);

                /* meminfo huge page size is in KiBs */
                pagesize <<= 10;
            }
        fclose(fp);
    }

    if (!pagesize) {
        fprintf(stderr, "Failed to get supported huge page size\n");
        return NULL;
    }

    if (settings.verbose > 1)
        fprintf(stderr, "huge page size: %zu\n", pagesize);

    /* This works because glibc simply uses mmap when the alignment is
     * above a certain limit. */
    ret = posix_memalign(&ptr, pagesize, limit);
    if (ret != 0) {
        fprintf(stderr, "Failed to get aligned memory chunk: %d\n", ret);
        return NULL;
    }

    ret = madvise(ptr, limit, MADV_HUGEPAGE);
    if (ret < 0) {
        fprintf(stderr, "Failed to set transparent hugepage hint: %d\n", ret);
        free(ptr);
        ptr = NULL;
    }
#elif defined(__FreeBSD__)
    size_t align = (sizeof(size_t) * 8 - (__builtin_clzl(4095)));
    ptr = mmap(NULL, limit, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON | MAP_ALIGNED(align) | MAP_ALIGNED_SUPER, -1, 0);
    if (ptr == MAP_FAILED) {
        fprintf(stderr, "Failed to set super pages\n");
        ptr = NULL;
    }
#else
    ptr = malloc(limit);
#endif
    return ptr;
}

unsigned int slabs_fixup(char *chunk, const int border) {
    slabclass_t *p;
    item *it = (item *)chunk;
    int id = ITEM_clsid(it);

    // memory isn't used yet. shunt to global pool.
    // (which must be 0)
    if (id == 0) {
        //assert(border == 0);
        p = &slabclass[0];
        grow_slab_list(0);
        p->slab_list[p->slabs++] = (char*)chunk;
        return -1;
    }
    p = &slabclass[id];

    // if we're on a page border, add the slab to slab class
    if (border == 0) {
        grow_slab_list(id);
        p->slab_list[p->slabs++] = chunk;
    }

    // increase free count if ITEM_SLABBED
    if (it->it_flags == ITEM_SLABBED) {
        // if ITEM_SLABBED re-stack on freelist.
        // don't have to run pointer fixups.
        it->prev = 0;
        it->next = p->slots;
        if (it->next) it->next->prev = it;
        p->slots = it;

        p->sl_curr++;
        //fprintf(stderr, "replacing into freelist\n");
    }

    return p->size;
}

/**
 * Determines the chunk sizes and initializes the slab class descriptors
 * accordingly.
 */
void slabs_init(const size_t limit, const double factor, const bool prealloc, const uint32_t *slab_sizes, void *mem_base_external, bool reuse_mem) {
    int i = POWER_SMALLEST - 1;
    unsigned int size = sizeof(item) + settings.chunk_size;

    /* Some platforms use runtime transparent hugepages. If for any reason
     * the initial allocation fails, the required settings do not persist
     * for remaining allocations. As such it makes little sense to do slab
     * preallocation. */
    bool __attribute__ ((unused)) do_slab_prealloc = false;

    mem_limit = limit;

    if (prealloc && mem_base_external == NULL) {
        mem_base = alloc_large_chunk(mem_limit);
        if (mem_base) {
            do_slab_prealloc = true;
            mem_current = mem_base;
            mem_avail = mem_limit;
        } else {
            fprintf(stderr, "Warning: Failed to allocate requested memory in"
                    " one large chunk.\nWill allocate in smaller chunks\n");
        }
    } else if (prealloc && mem_base_external != NULL) {
        // Can't (yet) mix hugepages with mmap allocations, so separate the
        // logic from above. Reusable memory also force-preallocates memory
        // pages into the global pool, which requires adjusting the mem_* variables.
        do_slab_prealloc = true;
        mem_base = mem_base_external;
        // _current shouldn't be used in this case, but we set it to where it
        // should be anyway.
        if (reuse_mem) {
            mem_current = ((char*)mem_base) + mem_limit;
            mem_avail = 0;
        } else {
            mem_current = mem_base;
            mem_avail = mem_limit;
        }
    }

    memset(slabclass, 0, sizeof(slabclass));

    while (++i < MAX_NUMBER_OF_SLAB_CLASSES-1) {
        if (slab_sizes != NULL) {
            if (slab_sizes[i-1] == 0)
                break;
            size = slab_sizes[i-1];
        } else if (size >= settings.slab_chunk_size_max / factor) {
            break;
        }
        /* Make sure items are always n-byte aligned */
        if (size % CHUNK_ALIGN_BYTES)
            size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);

        slabclass[i].size = size;
        slabclass[i].perslab = settings.slab_page_size / slabclass[i].size;
        if (slab_sizes == NULL)
            size *= factor;
        if (settings.verbose > 1) {
            fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
                    i, slabclass[i].size, slabclass[i].perslab);
        }
    }

    power_largest = i;
    slabclass[power_largest].size = settings.slab_chunk_size_max;
    slabclass[power_largest].perslab = settings.slab_page_size / settings.slab_chunk_size_max;
    if (settings.verbose > 1) {
        fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
                i, slabclass[i].size, slabclass[i].perslab);
    }

    /* for the test suite:  faking of how much we've already malloc'd */
    {
        char *t_initial_malloc = getenv("T_MEMD_INITIAL_MALLOC");
        if (t_initial_malloc) {
            int64_t env_malloced;
            if (safe_strtoll((const char *)t_initial_malloc, &env_malloced)) {
                mem_malloced = (size_t)env_malloced;
            }
        }

    }

    if (do_slab_prealloc) {
        if (!reuse_mem) {
            slabs_preallocate(power_largest);
        }
    }
}

void slabs_prefill_global(void) {
    void *ptr;
    slabclass_t *p = &slabclass[0];
    int len = settings.slab_page_size;

    while (mem_malloced < mem_limit
            && (ptr = memory_allocate(len)) != NULL) {
        grow_slab_list(0);
        // Ensure the front header is zero'd to avoid confusing restart code.
        // It's probably good enough to cast it and just zero slabs_clsid, but
        // this is extra paranoid.
        memset(ptr, 0, sizeof(item));
        p->slab_list[p->slabs++] = ptr;
    }
    mem_limit_reached = true;
}

static void slabs_preallocate (const unsigned int maxslabs) {
    int i;
    unsigned int prealloc = 0;

    /* pre-allocate a 1MB slab in every size class so people don't get
       confused by non-intuitive "SERVER_ERROR out of memory"
       messages.  this is the most common question on the mailing
       list.  if you really don't want this, you can rebuild without
       these three lines.  */

    for (i = POWER_SMALLEST; i < MAX_NUMBER_OF_SLAB_CLASSES; i++) {
        if (++prealloc > maxslabs)
            break;
        if (do_slabs_newslab(i) == 0) {
            fprintf(stderr, "Error while preallocating slab memory!\n"
                "If using -L or other prealloc options, max memory must be "
                "at least %d megabytes.\n", power_largest);
            exit(1);
        }
    }
}

static int grow_slab_list (const unsigned int id) {
    slabclass_t *p = &slabclass[id];
    if (p->slabs == p->list_size) {
        size_t new_size =  (p->list_size != 0) ? p->list_size * 2 : 16;
        void *new_list = realloc(p->slab_list, new_size * sizeof(void *));
        if (new_list == 0) return 0;
        p->list_size = new_size;
        p->slab_list = new_list;
    }
    return 1;
}
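/*
 * Growth sketch (illustrative): the per-class page array starts at 16
 * pointers and doubles whenever it fills, so list_size runs 16, 32, 64, ...
 * A hypothetical stand-alone version of the same rule (needed_pages is a
 * made-up caller-supplied count):
 */
#if 0
size_t list_size = 0;
while (list_size < needed_pages)
    list_size = (list_size != 0) ? list_size * 2 : 16;
#endif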

static void split_slab_page_into_freelist(char *ptr, const unsigned int id) {
    slabclass_t *p = &slabclass[id];
    int x;
    for (x = 0; x < p->perslab; x++) {
        do_slabs_free(ptr, 0, id);
        ptr += p->size;
    }
}

/* Fast LIFO stack of spare pages: pages are pushed and popped at the tail
 * of slabclass[SLAB_GLOBAL_PAGE_POOL].slab_list, so the most recently
 * parked page is handed back first. */
static void *get_page_from_global_pool(void) {
    slabclass_t *p = &slabclass[SLAB_GLOBAL_PAGE_POOL];
    if (p->slabs < 1) {
        return NULL;
    }
    char *ret = p->slab_list[p->slabs - 1];
    p->slabs--;
    return ret;
}
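/*
 * Behaviour sketch (illustrative): do_slabs_newslab() below prefers a parked
 * page from the global pool over a fresh allocation, roughly:
 */
#if 0
char *ptr = get_page_from_global_pool();
if (ptr == NULL)
    ptr = memory_allocate((size_t)settings.slab_page_size);
#endif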

static int do_slabs_newslab(const unsigned int id) {
    slabclass_t *p = &slabclass[id];
    slabclass_t *g = &slabclass[SLAB_GLOBAL_PAGE_POOL];
    int len = (settings.slab_reassign || settings.slab_chunk_size_max != settings.slab_page_size)
        ? settings.slab_page_size
        : p->size * p->perslab;
    char *ptr;

    if ((mem_limit && mem_malloced + len > mem_limit && p->slabs > 0
         && g->slabs == 0)) {
        mem_limit_reached = true;
        MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
        return 0;
    }

    if ((grow_slab_list(id) == 0) ||
        (((ptr = get_page_from_global_pool()) == NULL) &&
        ((ptr = memory_allocate((size_t)len)) == 0))) {

        MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
        return 0;
    }

    // Always wipe the memory at this stage: in restart mode the mmap memory
    // could be unused, yet still full of data. Better for usability if we're
    // wiping memory as it's being pulled out of the global pool instead of
    // blocking startup all at once.
    memset(ptr, 0, (size_t)len);
    split_slab_page_into_freelist(ptr, id);

    p->slab_list[p->slabs++] = ptr;
    MEMCACHED_SLABS_SLABCLASS_ALLOCATE(id);

    return 1;
}

/*@null@*/
static void *do_slabs_alloc(const size_t size, unsigned int id,
        unsigned int flags) {
    slabclass_t *p;
    void *ret = NULL;
    item *it = NULL;

    if (id < POWER_SMALLEST || id > power_largest) {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, 0);
        return NULL;
    }
    p = &slabclass[id];
    assert(p->sl_curr == 0 || (((item *)p->slots)->it_flags & ITEM_SLABBED));

    assert(size <= p->size);
    /* fail unless we have space at the end of a recently allocated page,
       we have something on our freelist, or we could allocate a new page */
    if (p->sl_curr == 0 && flags != SLABS_ALLOC_NO_NEWPAGE) {
        do_slabs_newslab(id);
    }

    if (p->sl_curr != 0) {
        /* return off our freelist */
        it = (item *)p->slots;
        p->slots = it->next;
        if (it->next) it->next->prev = 0;
        /* Kill flag and initialize refcount here for lock safety in slab
         * mover's freeness detection. */
        it->it_flags &= ~ITEM_SLABBED;
        it->refcount = 1;
        p->sl_curr--;
        ret = (void *)it;
    } else {
        ret = NULL;
    }

    if (ret) {
        MEMCACHED_SLABS_ALLOCATE(size, id, p->size, ret);
    } else {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, id);
    }

    return ret;
}

static void do_slabs_free_chunked(item *it, const size_t size) {
    item_chunk *chunk = (item_chunk *) ITEM_schunk(it);
    slabclass_t *p;

    it->it_flags = ITEM_SLABBED;
    // FIXME: refresh on how this works?
    //it->slabs_clsid = 0;
    it->prev = 0;
    // header object's original classid is stored in chunk.
    p = &slabclass[chunk->orig_clsid];
    // original class id needs to be set on free memory.
    it->slabs_clsid = chunk->orig_clsid;
    if (chunk->next) {
        chunk = chunk->next;
        chunk->prev = 0;
    } else {
        // header with no attached chunk
        chunk = NULL;
    }

    // return the header object.
    // TODO: This is in three places, here and in do_slabs_free().
    it->prev = 0;
    it->next = p->slots;
    if (it->next) it->next->prev = it;
    p->slots = it;
    p->sl_curr++;

    item_chunk *next_chunk;
    while (chunk) {
        assert(chunk->it_flags == ITEM_CHUNK);
        chunk->it_flags = ITEM_SLABBED;
        p = &slabclass[chunk->slabs_clsid];
        next_chunk = chunk->next;

        chunk->prev = 0;
        chunk->next = p->slots;
        if (chunk->next) chunk->next->prev = chunk;
        p->slots = chunk;
        p->sl_curr++;

        chunk = next_chunk;
    }

    return;
}


static void do_slabs_free(void *ptr, const size_t size, unsigned int id) {
    slabclass_t *p;
    item *it;

    assert(id >= POWER_SMALLEST && id <= power_largest);
    if (id < POWER_SMALLEST || id > power_largest)
        return;

    MEMCACHED_SLABS_FREE(size, id, ptr);
    p = &slabclass[id];

    it = (item *)ptr;
    if ((it->it_flags & ITEM_CHUNKED) == 0) {
        it->it_flags = ITEM_SLABBED;
        it->slabs_clsid = id;
        it->prev = 0;
        it->next = p->slots;
        if (it->next) it->next->prev = it;
        p->slots = it;

        p->sl_curr++;
    } else {
        do_slabs_free_chunked(it, size);
    }
    return;
}

/* With refactoring of the various stats code the automover won't need a
 * custom function here.
 */
void fill_slab_stats_automove(slab_stats_automove *am) {
    int n;
    pthread_mutex_lock(&slabs_lock);
    for (n = 0; n < MAX_NUMBER_OF_SLAB_CLASSES; n++) {
        slabclass_t *p = &slabclass[n];
        slab_stats_automove *cur = &am[n];
        cur->chunks_per_page = p->perslab;
        cur->free_chunks = p->sl_curr;
        cur->total_pages = p->slabs;
        cur->chunk_size = p->size;
    }
    pthread_mutex_unlock(&slabs_lock);
}

/* TODO: slabs_available_chunks should grow up to encompass this.
 * mem_flag is redundant with the other function.
 */
unsigned int global_page_pool_size(bool *mem_flag) {
    unsigned int ret = 0;
    pthread_mutex_lock(&slabs_lock);
    if (mem_flag != NULL)
        *mem_flag = mem_malloced >= mem_limit ? true : false;
    ret = slabclass[SLAB_GLOBAL_PAGE_POOL].slabs;
    pthread_mutex_unlock(&slabs_lock);
    return ret;
}

/*@null@*/
static void do_slabs_stats(ADD_STAT add_stats, void *c) {
    int i, total;
    /* Get the per-thread stats which contain some interesting aggregates */
    struct thread_stats thread_stats;
    threadlocal_stats_aggregate(&thread_stats);

    total = 0;
    for(i = POWER_SMALLEST; i <= power_largest; i++) {
        slabclass_t *p = &slabclass[i];
        if (p->slabs != 0) {
            uint32_t perslab, slabs;
            slabs = p->slabs;
            perslab = p->perslab;

            char key_str[STAT_KEY_LEN];
            char val_str[STAT_VAL_LEN];
            int klen = 0, vlen = 0;

            APPEND_NUM_STAT(i, "chunk_size", "%u", p->size);
            APPEND_NUM_STAT(i, "chunks_per_page", "%u", perslab);
            APPEND_NUM_STAT(i, "total_pages", "%u", slabs);
            APPEND_NUM_STAT(i, "total_chunks", "%u", slabs * perslab);
            APPEND_NUM_STAT(i, "used_chunks", "%u",
                            slabs*perslab - p->sl_curr);
            APPEND_NUM_STAT(i, "free_chunks", "%u", p->sl_curr);
            /* Stat is dead, but displaying zero instead of removing it. */
            APPEND_NUM_STAT(i, "free_chunks_end", "%u", 0);
            APPEND_NUM_STAT(i, "get_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].get_hits);
            APPEND_NUM_STAT(i, "cmd_set", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].set_cmds);
            APPEND_NUM_STAT(i, "delete_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].delete_hits);
            APPEND_NUM_STAT(i, "incr_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].incr_hits);
            APPEND_NUM_STAT(i, "decr_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].decr_hits);
            APPEND_NUM_STAT(i, "cas_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].cas_hits);
            APPEND_NUM_STAT(i, "cas_badval", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].cas_badval);
            APPEND_NUM_STAT(i, "touch_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].touch_hits);
            total++;
        }
    }

    /* add overall slab stats and append terminator */

    APPEND_STAT("active_slabs", "%d", total);
    APPEND_STAT("total_malloced", "%llu", (unsigned long long)mem_malloced);
    add_stats(NULL, 0, NULL, 0, c);
}

static void *memory_allocate(size_t size) {
    void *ret;

    if (mem_base == NULL) {
        /* We are not using a preallocated large memory chunk */
        ret = malloc(size);
    } else {
        ret = mem_current;

        if (size > mem_avail) {
            return NULL;
        }

        /* mem_current pointer _must_ be aligned!!! */
        if (size % CHUNK_ALIGN_BYTES) {
            size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
        }

        mem_current = ((char*)mem_current) + size;
        if (size < mem_avail) {
            mem_avail -= size;
        } else {
            mem_avail = 0;
        }
    }
    mem_malloced += size;

    return ret;
}
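/*
 * Worked example (illustrative): with CHUNK_ALIGN_BYTES at its usual value
 * of 8, a request for 1048579 bytes (1MB + 3) from the preallocated base is
 * rounded up before mem_current and mem_malloced are advanced:
 */
#if 0
size_t size = 1048579;
if (size % 8)                  /* 1048579 % 8 == 3 */
    size += 8 - (size % 8);    /* size is now 1048584 */
#endif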

/* Must only be used if all pages are item_size_max */
static void memory_release(void) {
    void *p = NULL;
    if (mem_base != NULL)
        return;

    if (!settings.slab_reassign)
        return;

    while (mem_malloced > mem_limit &&
            (p = get_page_from_global_pool()) != NULL) {
        free(p);
        mem_malloced -= settings.slab_page_size;
    }
}

void *slabs_alloc(size_t size, unsigned int id,
        unsigned int flags) {
    void *ret;

    pthread_mutex_lock(&slabs_lock);
    ret = do_slabs_alloc(size, id, flags);
    pthread_mutex_unlock(&slabs_lock);
    return ret;
}

void slabs_free(void *ptr, size_t size, unsigned int id) {
    pthread_mutex_lock(&slabs_lock);
    do_slabs_free(ptr, size, id);
    pthread_mutex_unlock(&slabs_lock);
}
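/*
 * Usage sketch (illustrative): callers size the request from the full item
 * footprint, map it to a class, then go through these locked wrappers; the
 * do_* variants above assume slabs_lock is already held. ntotal below is a
 * hypothetical stand-in for the computed item size.
 */
#if 0
size_t ntotal = 200;
unsigned int id = slabs_clsid(ntotal);
if (id != 0) {
    item *it = slabs_alloc(ntotal, id, 0);
    if (it != NULL) {
        /* ... fill in and link the item ... */
        slabs_free(it, ntotal, id);
    }
}
#endif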

void slabs_stats(ADD_STAT add_stats, void *c) {
    pthread_mutex_lock(&slabs_lock);
    do_slabs_stats(add_stats, c);
    pthread_mutex_unlock(&slabs_lock);
}

static bool do_slabs_adjust_mem_limit(size_t new_mem_limit) {
    /* Cannot adjust memory limit at runtime if prealloc'ed */
    if (mem_base != NULL)
        return false;
    settings.maxbytes = new_mem_limit;
    mem_limit = new_mem_limit;
    mem_limit_reached = false; /* Will reset on next alloc */
    memory_release(); /* free what might already be in the global pool */
    return true;
}

bool slabs_adjust_mem_limit(size_t new_mem_limit) {
    bool ret;
    pthread_mutex_lock(&slabs_lock);
    ret = do_slabs_adjust_mem_limit(new_mem_limit);
    pthread_mutex_unlock(&slabs_lock);
    return ret;
}

unsigned int slabs_available_chunks(const unsigned int id, bool *mem_flag,
        unsigned int *chunks_perslab) {
    unsigned int ret;
    slabclass_t *p;

    pthread_mutex_lock(&slabs_lock);
    p = &slabclass[id];
    ret = p->sl_curr;
    if (mem_flag != NULL)
        *mem_flag = mem_malloced >= mem_limit ? true : false;
    if (chunks_perslab != NULL)
        *chunks_perslab = p->perslab;
    pthread_mutex_unlock(&slabs_lock);
    return ret;
}

/* The slabber system could avoid needing to understand much, if anything,
 * about items if callbacks were strategically used. Due to how the slab mover
 * works, certain flag bits can only be adjusted while holding the slabs lock.
 * Using these functions, isolate sections of code needing this and turn them
 * into callbacks when an interface becomes more obvious.
 */
void slabs_mlock(void) {
    pthread_mutex_lock(&slabs_lock);
}

void slabs_munlock(void) {
    pthread_mutex_unlock(&slabs_lock);
}

static pthread_cond_t slab_rebalance_cond = PTHREAD_COND_INITIALIZER;
static volatile int do_run_slab_rebalance_thread = 1;

static int slab_rebalance_start(void) {
    slabclass_t *s_cls;
    int no_go = 0;

    pthread_mutex_lock(&slabs_lock);

    if (slab_rebal.s_clsid < SLAB_GLOBAL_PAGE_POOL ||
        slab_rebal.s_clsid > power_largest  ||
        slab_rebal.d_clsid < SLAB_GLOBAL_PAGE_POOL ||
        slab_rebal.d_clsid > power_largest  ||
        slab_rebal.s_clsid == slab_rebal.d_clsid)
        no_go = -2;

    s_cls = &slabclass[slab_rebal.s_clsid];

    if (!grow_slab_list(slab_rebal.d_clsid)) {
        no_go = -1;
    }

    if (s_cls->slabs < 2)
        no_go = -3;

    if (no_go != 0) {
        pthread_mutex_unlock(&slabs_lock);
        return no_go; /* Should use a wrapper function... */
    }

    /* Always kill the first available slab page as it is most likely to
     * contain the oldest items
     */
    slab_rebal.slab_start = s_cls->slab_list[0];
    slab_rebal.slab_end   = (char *)slab_rebal.slab_start +
        (s_cls->size * s_cls->perslab);
    slab_rebal.slab_pos   = slab_rebal.slab_start;
    slab_rebal.done       = 0;
    // Don't need to do chunk move work if page is in global pool.
    if (slab_rebal.s_clsid == SLAB_GLOBAL_PAGE_POOL) {
        slab_rebal.done = 1;
    }

    // Bit-vector to keep track of completed chunks
    slab_rebal.completed = (uint8_t*)calloc(s_cls->perslab,sizeof(uint8_t));

    slab_rebalance_signal = 2;

    if (settings.verbose > 1) {
        fprintf(stderr, "Started a slab rebalance\n");
    }

    pthread_mutex_unlock(&slabs_lock);

    STATS_LOCK();
    stats_state.slab_reassign_running = true;
    STATS_UNLOCK();

    return 0;
}
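/*
 * Return-code sketch (illustrative): how the caller in
 * slab_rebalance_thread() could interpret the no_go values set above.
 */
#if 0
switch (slab_rebalance_start()) {
case 0:  break;  /* page move scheduled; slab_rebalance_signal is now 2 */
case -1: break;  /* couldn't grow the destination class's slab list */
case -2: break;  /* source/destination class ids invalid or identical */
case -3: break;  /* source class has fewer than 2 pages, nothing to spare */
}
#endif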

/* CALLED WITH slabs_lock HELD */
static void *slab_rebalance_alloc(const size_t size, unsigned int id) {
    slabclass_t *s_cls;
    s_cls = &slabclass[slab_rebal.s_clsid];
    int x;
    item *new_it = NULL;

    for (x = 0; x < s_cls->perslab; x++) {
        new_it = do_slabs_alloc(size, id, SLABS_ALLOC_NO_NEWPAGE);
        /* check that memory isn't within the range to clear */
        if (new_it == NULL) {
            break;
        }
        if ((void *)new_it >= slab_rebal.slab_start
            && (void *)new_it < slab_rebal.slab_end) {
            /* Pulled something we intend to free. Mark it as freed since
             * we've already done the work of unlinking it from the freelist.
             */
            new_it->refcount = 0;
            new_it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
#ifdef DEBUG_SLAB_MOVER
            memcpy(ITEM_key(new_it), "deadbeef", 8);
#endif
            new_it = NULL;
            slab_rebal.inline_reclaim++;
        } else {
            break;
        }
    }
    return new_it;
}

/* CALLED WITH slabs_lock HELD */
/* detaches item/chunk from freelist. */
static void slab_rebalance_cut_free(slabclass_t *s_cls, item *it) {
    /* Ensure this was on the freelist and nothing else. */
    assert(it->it_flags == ITEM_SLABBED);
    if (s_cls->slots == it) {
        s_cls->slots = it->next;
    }
    if (it->next) it->next->prev = it->prev;
    if (it->prev) it->prev->next = it->next;
    s_cls->sl_curr--;
}

enum move_status {
    MOVE_PASS=0, MOVE_FROM_SLAB, MOVE_FROM_LRU, MOVE_BUSY, MOVE_LOCKED
};

#define SLAB_MOVE_MAX_LOOPS 1000

/* refcount == 0 is safe since nobody can incr while item_lock is held.
 * refcount != 0 is impossible since flags/etc can be modified in other
 * threads. instead, note we found a busy one and bail. logic in do_item_get
 * will prevent busy items from continuing to be busy
 * NOTE: This is checking it_flags outside of an item lock. I believe this
 * works since it_flags is 8 bits, and we're only ever comparing a single bit
 * regardless. ITEM_SLABBED bit will always be correct since we're holding the
 * lock which modifies that bit. ITEM_LINKED won't exist if we're between an
 * item having ITEM_SLABBED removed, and the key hasn't been added to the item
 * yet. The memory barrier from the slabs lock should order the key write and the
 * flags to the item?
 * If ITEM_LINKED did exist and was just removed, but we still see it, that's
 * still safe since it will have a valid key, which we then lock, and then
 * recheck everything.
 * This may not be safe on all platforms; If not, slabs_alloc() will need to
 * seed the item key while holding slabs_lock.
 */
static int slab_rebalance_move(void) {
    slabclass_t *s_cls;
    int was_busy = 0;
    int refcount = 0;
    uint32_t hv;
    void *hold_lock;
    enum move_status status = MOVE_PASS;

    s_cls = &slabclass[slab_rebal.s_clsid];
    // the offset to check if completed or not
    int offset = ((char*)slab_rebal.slab_pos-(char*)slab_rebal.slab_start)/(s_cls->size);

    // skip acquiring the slabs lock for items we've already fully processed.
    if (slab_rebal.completed[offset] == 0) {
        pthread_mutex_lock(&slabs_lock);
        hv = 0;
        hold_lock = NULL;
        item *it = slab_rebal.slab_pos;

        item_chunk *ch = NULL;
        status = MOVE_PASS;

        if (it->it_flags & ITEM_CHUNK) {
            /* This chunk is a chained part of a larger item. */
            ch = (item_chunk *) it;
            /* Instead, we use the head chunk to find the item and effectively
             * lock the entire structure. If a chunk has ITEM_CHUNK flag, its
             * head cannot be slabbed, so the normal routine is safe. */
            it = ch->head;
            assert(it->it_flags & ITEM_CHUNKED);
        }

        /* ITEM_FETCHED when ITEM_SLABBED is overloaded to mean we've cleared
         * the chunk for move. Only these two flags should exist.
         */
        if (it->it_flags != (ITEM_SLABBED|ITEM_FETCHED)) {
            /* ITEM_SLABBED can only be added/removed under the slabs_lock */
            if (it->it_flags & ITEM_SLABBED) {
                assert(ch == NULL);
                slab_rebalance_cut_free(s_cls, it);
                status = MOVE_FROM_SLAB;
            } else if ((it->it_flags & ITEM_LINKED) != 0) {
                /* If it doesn't have ITEM_SLABBED, the item could be in any
                 * state on its way to being freed or written to. If no
                 * ITEM_SLABBED, but it's had ITEM_LINKED, it must be active
                 * and have the key written to it already.
                 */
                hv = hash(ITEM_key(it), it->nkey);
                if ((hold_lock = item_trylock(hv)) == NULL) {
                    status = MOVE_LOCKED;
                } else {
                    bool is_linked = (it->it_flags & ITEM_LINKED);
                    refcount = refcount_incr(it);
                    if (refcount == 2) { /* item is linked but not busy */
                        /* Double check ITEM_LINKED flag here, since we're
                         * past a memory barrier from the mutex. */
                        if (is_linked) {
                            status = MOVE_FROM_LRU;
                        } else {
                            /* refcount == 1 + !ITEM_LINKED means the item is being
                             * uploaded to, or was just unlinked but hasn't been freed
                             * yet. Let it bleed off on its own and try again later */
                            status = MOVE_BUSY;
                        }
                    } else if (refcount > 2 && is_linked) {
                        // TODO: Mark items for delete/rescue and process
                        // outside of the main loop.
                        if (slab_rebal.busy_loops > SLAB_MOVE_MAX_LOOPS) {
                            slab_rebal.busy_deletes++;
                            // Only safe to hold slabs lock because refcount
                            // can't drop to 0 until we release item lock.
                            STORAGE_delete(storage, it);
                            pthread_mutex_unlock(&slabs_lock);
                            do_item_unlink(it, hv);
                            pthread_mutex_lock(&slabs_lock);
                        }
                        status = MOVE_BUSY;
                    } else {
                        if (settings.verbose > 2) {
                            fprintf(stderr, "Slab reassign hit a busy item: refcount: %d (%d -> %d)\n",
                                it->refcount, slab_rebal.s_clsid, slab_rebal.d_clsid);
                        }
                        status = MOVE_BUSY;
                    }
                    /* Item lock must be held while modifying refcount */
                    if (status == MOVE_BUSY) {
                        refcount_decr(it);
                        item_trylock_unlock(hold_lock);
                    }
                }
            } else {
                /* See above comment. No ITEM_SLABBED or ITEM_LINKED. Mark
                 * busy and wait for item to complete its upload. */
                status = MOVE_BUSY;
            }
        }

        int save_item = 0;
        item *new_it = NULL;
        size_t ntotal = 0;
        switch (status) {
            case MOVE_FROM_LRU:
                /* Lock order is LRU locks -> slabs_lock. unlink uses LRU lock.
                 * We only need to hold the slabs_lock while initially looking
                 * at an item, and at this point we have an exclusive refcount
                 * (2) + the item is locked. Drop slabs lock, drop item to
                 * refcount 1 (just our own), then fall through and wipe it.
                 */
                /* Check if expired or flushed */
                ntotal = ITEM_ntotal(it);
#ifdef EXTSTORE
                if (it->it_flags & ITEM_HDR) {
                    ntotal = (ntotal - it->nbytes) + sizeof(item_hdr);
                }
#endif
                /* REQUIRES slabs_lock: CHECK FOR cls->sl_curr > 0 */
                if (ch == NULL && (it->it_flags & ITEM_CHUNKED)) {
                    /* Chunked should be identical to non-chunked, except we need
                     * to swap out ntotal for the head-chunk-total. */
                    ntotal = s_cls->size;
                }
                if ((it->exptime != 0 && it->exptime < current_time)
                    || item_is_flushed(it)) {
                    /* Expired, don't save. */
                    save_item = 0;
                } else if (ch == NULL &&
                        (new_it = slab_rebalance_alloc(ntotal, slab_rebal.s_clsid)) == NULL) {
                    /* Not a chunk of an item, and nomem. */
                    save_item = 0;
                    slab_rebal.evictions_nomem++;
                } else if (ch != NULL &&
                        (new_it = slab_rebalance_alloc(s_cls->size, slab_rebal.s_clsid)) == NULL) {
                    /* Is a chunk of an item, and nomem. */
                    save_item = 0;
                    slab_rebal.evictions_nomem++;
                } else {
                    /* Was whatever it was, and we have memory for it. */
                    save_item = 1;
                }
                pthread_mutex_unlock(&slabs_lock);
                if (save_item) {
                    if (ch == NULL) {
                        assert((new_it->it_flags & ITEM_CHUNKED) == 0);
                        /* if free memory, memcpy. clear prev/next/h_bucket */
                        memcpy(new_it, it, ntotal);
                        new_it->prev = 0;
                        new_it->next = 0;
                        new_it->h_next = 0;
                        /* These are definitely required. else fails assert */
                        new_it->it_flags &= ~ITEM_LINKED;
                        new_it->refcount = 0;
                        do_item_replace(it, new_it, hv, ITEM_get_cas(it));
                        /* Need to walk the chunks and repoint head  */
                        if (new_it->it_flags & ITEM_CHUNKED) {
                            item_chunk *fch = (item_chunk *) ITEM_schunk(new_it);
                            fch->next->prev = fch;
                            while (fch) {
                                fch->head = new_it;
                                fch = fch->next;
                            }
                        }
                        it->refcount = 0;
                        it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
#ifdef DEBUG_SLAB_MOVER
                        memcpy(ITEM_key(it), "deadbeef", 8);
#endif
                        slab_rebal.rescues++;
                    } else {
                        item_chunk *nch = (item_chunk *) new_it;
                        /* Chunks always have head chunk (the main it) */
                        ch->prev->next = nch;
                        if (ch->next)
                            ch->next->prev = nch;
                        memcpy(nch, ch, ch->used + sizeof(item_chunk));
                        ch->refcount = 0;
                        ch->it_flags = ITEM_SLABBED|ITEM_FETCHED;
                        slab_rebal.chunk_rescues++;
#ifdef DEBUG_SLAB_MOVER
                        memcpy(ITEM_key((item *)ch), "deadbeef", 8);
#endif
                        refcount_decr(it);
                    }
                    slab_rebal.completed[offset] = 1;
                } else {
                    /* Unlink and mark as done if it's not a chunked item
                     * (chunked items require more book-keeping). */
                    STORAGE_delete(storage, it);
                    if (!ch && (it->it_flags & ITEM_CHUNKED) == 0) {
                        do_item_unlink(it, hv);
                        it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
                        it->refcount = 0;
#ifdef DEBUG_SLAB_MOVER
                        memcpy(ITEM_key(it), "deadbeef", 8);
#endif
                        slab_rebal.completed[offset] = 1;
                    } else {
                        ntotal = ITEM_ntotal(it);
                        do_item_unlink(it, hv);
                        slabs_free(it, ntotal, slab_rebal.s_clsid);
                        /* Swing around again later to remove it from the freelist. */
                        slab_rebal.busy_items++;
                        was_busy++;
                    }

                }
                item_trylock_unlock(hold_lock);
                pthread_mutex_lock(&slabs_lock);
                /* Always remove the ntotal, as we added it during
                 * do_slabs_alloc() when copying the item.
                 */
                break;
            case MOVE_FROM_SLAB:
                slab_rebal.completed[offset] = 1;
                it->refcount = 0;
                it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
#ifdef DEBUG_SLAB_MOVER
                memcpy(ITEM_key(it), "deadbeef", 8);
#endif
                break;
            case MOVE_BUSY:
            case MOVE_LOCKED:
                slab_rebal.busy_items++;
                was_busy++;
                break;
            case MOVE_PASS:
                break;
        }

        pthread_mutex_unlock(&slabs_lock);
    }

    // Note: slab_rebal.* is occasionally protected under slabs_lock, but
    // the mover thread is the only user while active: so it's only necessary
    // for start/stop synchronization.
    slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;

    if (slab_rebal.slab_pos >= slab_rebal.slab_end) {
        /* Some items were busy, start again from the top */
        if (slab_rebal.busy_items) {
            slab_rebal.slab_pos = slab_rebal.slab_start;
            STATS_LOCK();
            stats.slab_reassign_busy_items += slab_rebal.busy_items;
            STATS_UNLOCK();
            slab_rebal.busy_items = 0;
            slab_rebal.busy_loops++;
        } else {
            slab_rebal.done++;
        }
    }

    return was_busy;
}

static void slab_rebalance_finish(void) {
    slabclass_t *s_cls;
    slabclass_t *d_cls;
    int x;
    uint32_t rescues;
    uint32_t evictions_nomem;
    uint32_t inline_reclaim;
    uint32_t chunk_rescues;
    uint32_t busy_deletes;

    pthread_mutex_lock(&slabs_lock);

    s_cls = &slabclass[slab_rebal.s_clsid];
    d_cls = &slabclass[slab_rebal.d_clsid];

#ifdef DEBUG_SLAB_MOVER
    /* If the algorithm is broken, live items can sneak in. */
    slab_rebal.slab_pos = slab_rebal.slab_start;
    while (1) {
        item *it = slab_rebal.slab_pos;
        assert(it->it_flags == (ITEM_SLABBED|ITEM_FETCHED));
        assert(memcmp(ITEM_key(it), "deadbeef", 8) == 0);
        it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
        slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;
        if (slab_rebal.slab_pos >= slab_rebal.slab_end)
            break;
    }
#endif

    /* At this point the stolen slab is completely clear.
     * We always kill the "first"/"oldest" slab page in the slab_list, so
     * shuffle the page list backwards and decrement.
     */
    s_cls->slabs--;
    for (x = 0; x < s_cls->slabs; x++) {
        s_cls->slab_list[x] = s_cls->slab_list[x+1];
    }

    d_cls->slab_list[d_cls->slabs++] = slab_rebal.slab_start;
    /* Don't need to split the page into chunks if we're just storing it */
    if (slab_rebal.d_clsid > SLAB_GLOBAL_PAGE_POOL) {
        memset(slab_rebal.slab_start, 0, (size_t)settings.slab_page_size);
        split_slab_page_into_freelist(slab_rebal.slab_start,
            slab_rebal.d_clsid);
    } else if (slab_rebal.d_clsid == SLAB_GLOBAL_PAGE_POOL) {
        /* memset just enough to signal restart handler to skip */
        memset(slab_rebal.slab_start, 0, sizeof(item));
        /* mem_malloc'ed might be higher than mem_limit. */
        mem_limit_reached = false;
        memory_release();
    }

    slab_rebal.busy_loops = 0;
    slab_rebal.done       = 0;
    slab_rebal.s_clsid    = 0;
    slab_rebal.d_clsid    = 0;
    slab_rebal.slab_start = NULL;
    slab_rebal.slab_end   = NULL;
    slab_rebal.slab_pos   = NULL;
    evictions_nomem    = slab_rebal.evictions_nomem;
    inline_reclaim = slab_rebal.inline_reclaim;
    rescues   = slab_rebal.rescues;
    chunk_rescues = slab_rebal.chunk_rescues;
    busy_deletes = slab_rebal.busy_deletes;
    slab_rebal.evictions_nomem    = 0;
    slab_rebal.inline_reclaim = 0;
    slab_rebal.rescues  = 0;
    slab_rebal.chunk_rescues = 0;
    slab_rebal.busy_deletes = 0;

    slab_rebalance_signal = 0;

    free(slab_rebal.completed);
    pthread_mutex_unlock(&slabs_lock);

    STATS_LOCK();
    stats.slabs_moved++;
    stats.slab_reassign_rescues += rescues;
    stats.slab_reassign_evictions_nomem += evictions_nomem;
    stats.slab_reassign_inline_reclaim += inline_reclaim;
    stats.slab_reassign_chunk_rescues += chunk_rescues;
    stats.slab_reassign_busy_deletes += busy_deletes;
    stats_state.slab_reassign_running = false;
    STATS_UNLOCK();

    if (settings.verbose > 1) {
        fprintf(stderr, "finished a slab move\n");
    }
}

/* Slab mover thread.
 * Sits waiting for a condition to jump off and shovel some memory about
 */
static void *slab_rebalance_thread(void *arg) {
    int was_busy = 0;
    int backoff_timer = 1;
    int backoff_max = 1000;
    /* So we first pass into cond_wait with the mutex held */
    mutex_lock(&slabs_rebalance_lock);

    /* Must finish moving page before stopping */
    while (slab_rebalance_signal || do_run_slab_rebalance_thread) {
        if (slab_rebalance_signal == 1) {
            if (slab_rebalance_start() < 0) {
                /* Handle errors with more specificity as required. */
                slab_rebalance_signal = 0;
            }

            was_busy = 0;
        } else if (slab_rebalance_signal && slab_rebal.slab_start != NULL) {
            was_busy = slab_rebalance_move();
        }

        if (slab_rebal.done) {
            slab_rebalance_finish();
        } else if (was_busy) {
            /* Stuck waiting for some items to unlock, so slow down a bit
             * to give them a chance to free up */
            usleep(backoff_timer);
            backoff_timer = backoff_timer * 2;
            if (backoff_timer > backoff_max)
                backoff_timer = backoff_max;
        }

        if (slab_rebalance_signal == 0) {
            /* always hold this lock while we're running */
            pthread_cond_wait(&slab_rebalance_cond, &slabs_rebalance_lock);
        }
    }

    // TODO: cancel in-flight slab page move
    mutex_unlock(&slabs_rebalance_lock);
    return NULL;
}
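/*
 * Backoff sketch (illustrative): while items stay busy the sleep above
 * doubles from 1us and saturates at backoff_max, so successive waits run
 * 1, 2, 4, ..., 512, then 1000us (1ms) from there on.
 */
#if 0
int t = 1;
for (int i = 0; i < 12; i++) {
    usleep(t);
    t *= 2;
    if (t > 1000)
        t = 1000;
}
#endif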

/* Iterate at most once through the slab classes and pick a "random" source.
 * I like this better than calling rand() since rand() is slow enough that we
 * can just check all of the classes once instead.
 */
static int slabs_reassign_pick_any(int dst) {
    static int cur = POWER_SMALLEST - 1;
    int tries = power_largest - POWER_SMALLEST + 1;
    for (; tries > 0; tries--) {
        cur++;
        if (cur > power_largest)
            cur = POWER_SMALLEST;
        if (cur == dst)
            continue;
        if (slabclass[cur].slabs > 1) {
            return cur;
        }
    }
    return -1;
}

static enum reassign_result_type do_slabs_reassign(int src, int dst) {
    bool nospare = false;
    if (slab_rebalance_signal != 0)
        return REASSIGN_RUNNING;

    if (src == dst)
        return REASSIGN_SRC_DST_SAME;

    /* Special indicator to choose ourselves. */
    if (src == -1) {
        src = slabs_reassign_pick_any(dst);
        /* TODO: If we end up back at -1, return a new error type */
    }

    if (src < SLAB_GLOBAL_PAGE_POOL || src > power_largest ||
        dst < SLAB_GLOBAL_PAGE_POOL || dst > power_largest)
        return REASSIGN_BADCLASS;

    pthread_mutex_lock(&slabs_lock);
    if (slabclass[src].slabs < 2)
        nospare = true;
    pthread_mutex_unlock(&slabs_lock);
    if (nospare)
        return REASSIGN_NOSPARE;

    slab_rebal.s_clsid = src;
    slab_rebal.d_clsid = dst;

    slab_rebalance_signal = 1;
    pthread_cond_signal(&slab_rebalance_cond);

    return REASSIGN_OK;
}

enum reassign_result_type slabs_reassign(int src, int dst) {
    enum reassign_result_type ret;
    if (pthread_mutex_trylock(&slabs_rebalance_lock) != 0) {
        return REASSIGN_RUNNING;
    }
    ret = do_slabs_reassign(src, dst);
    pthread_mutex_unlock(&slabs_rebalance_lock);
    return ret;
}
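/*
 * Usage sketch (illustrative): returning one page from class 5 to the global
 * page pool, or letting the picker choose a source with src == -1. The enum
 * values are the ones produced by do_slabs_reassign() above.
 */
#if 0
switch (slabs_reassign(5, SLAB_GLOBAL_PAGE_POOL)) {
case REASSIGN_OK:           break; /* mover thread signalled */
case REASSIGN_RUNNING:      break; /* a move is already in progress */
case REASSIGN_BADCLASS:     break; /* class id out of range */
case REASSIGN_NOSPARE:      break; /* source class has fewer than 2 pages */
case REASSIGN_SRC_DST_SAME: break; /* src == dst */
}
#endif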

/* If we hold this lock, rebalancer can't wake up or move */
void slabs_rebalancer_pause(void) {
    pthread_mutex_lock(&slabs_rebalance_lock);
}

void slabs_rebalancer_resume(void) {
    pthread_mutex_unlock(&slabs_rebalance_lock);
}

static pthread_t rebalance_tid;

int start_slab_maintenance_thread(void) {
    int ret;
    slab_rebalance_signal = 0;
    slab_rebal.slab_start = NULL;

    if ((ret = pthread_create(&rebalance_tid, NULL,
                              slab_rebalance_thread, NULL)) != 0) {
        fprintf(stderr, "Can't create rebal thread: %s\n", strerror(ret));
        return -1;
    }
    thread_setname(rebalance_tid, "mc-slabmaint");
    return 0;
}

/* The maintenance thread is on a sleep/loop cycle, so it should join after a
 * short wait */
void stop_slab_maintenance_thread(void) {
    mutex_lock(&slabs_rebalance_lock);
    do_run_slab_rebalance_thread = 0;
    pthread_cond_signal(&slab_rebalance_cond);
    pthread_mutex_unlock(&slabs_rebalance_lock);

    /* Wait for the maintenance thread to stop */
    pthread_join(rebalance_tid, NULL);
}