xref: /memcached-1.4.29/slabs.c (revision ee461d11)
166b28810SBrad Fitzpatrick /* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
260d70942SAnatoly Vorobey /*
3c9607c6dSBrad Fitzpatrick  * Slabs memory allocation, based on powers-of-N. Slabs are up to 1MB in size
4c9607c6dSBrad Fitzpatrick  * and are divided into chunks. The chunk sizes start off at the size of the
5c9607c6dSBrad Fitzpatrick  * "item" structure plus space for a small key and value. They increase by
6c9607c6dSBrad Fitzpatrick  * a multiplier factor from there, up to half the maximum slab size. The last
7c9607c6dSBrad Fitzpatrick  * slab size is always 1MB, since that's the maximum item size allowed by the
8c9607c6dSBrad Fitzpatrick  * memcached protocol.
960d70942SAnatoly Vorobey  */
1056b8339eSSteven Grimm #include "memcached.h"
1160d70942SAnatoly Vorobey #include <sys/stat.h>
1260d70942SAnatoly Vorobey #include <sys/socket.h>
1360d70942SAnatoly Vorobey #include <sys/resource.h>
1460d70942SAnatoly Vorobey #include <fcntl.h>
15c6975ef4SPaul Lindner #include <netinet/in.h>
16c6975ef4SPaul Lindner #include <errno.h>
1760d70942SAnatoly Vorobey #include <stdlib.h>
1860d70942SAnatoly Vorobey #include <stdio.h>
1960d70942SAnatoly Vorobey #include <string.h>
20e688e97dSNatanael Copa #include <signal.h>
21c08383afSBrad Fitzpatrick #include <assert.h>
22d9220d64STrond Norbye #include <pthread.h>
23294c32d7SToru Maesaka 
24a836eabcSdormando //#define DEBUG_SLAB_MOVER
25c9607c6dSBrad Fitzpatrick /* powers-of-N allocation structures */
2686969ea4SBrad Fitzpatrick 
/* Per-size-class bookkeeping. One of these exists for every chunk size
 * the allocator supports. */
typedef struct {
    unsigned int size;      /* chunk size for items in this class, in bytes */
    unsigned int perslab;   /* how many chunks fit in one slab page */

    void *slots;           /* freelist: head of a doubly-linked list of free item ptrs */
    unsigned int sl_curr;   /* total free items in list */

    unsigned int slabs;     /* how many slab pages were allocated for this class */

    void **slab_list;       /* array of slab page pointers */
    unsigned int list_size; /* allocated capacity of slab_list, in entries */

    size_t requested; /* The number of requested bytes */
} slabclass_t;
4186969ea4SBrad Fitzpatrick 
427173856aSDustin Sallings static slabclass_t slabclass[MAX_NUMBER_OF_SLAB_CLASSES];
43c9607c6dSBrad Fitzpatrick static size_t mem_limit = 0;
44c9607c6dSBrad Fitzpatrick static size_t mem_malloced = 0;
45fb269897Sdormando /* If the memory limit has been hit once. Used as a hint to decide when to
46fb269897Sdormando  * early-wake the LRU maintenance thread */
47fb269897Sdormando static bool mem_limit_reached = false;
48c9607c6dSBrad Fitzpatrick static int power_largest;
4986969ea4SBrad Fitzpatrick 
50a6b35b44STrond Norbye static void *mem_base = NULL;
51a6b35b44STrond Norbye static void *mem_current = NULL;
52a6b35b44STrond Norbye static size_t mem_avail = 0;
53a6b35b44STrond Norbye 
54d9220d64STrond Norbye /**
55d9220d64STrond Norbye  * Access to the slab allocator is protected by this lock
56d9220d64STrond Norbye  */
57d9220d64STrond Norbye static pthread_mutex_t slabs_lock = PTHREAD_MUTEX_INITIALIZER;
583b2dc737Sdormando static pthread_mutex_t slabs_rebalance_lock = PTHREAD_MUTEX_INITIALIZER;
59d9220d64STrond Norbye 
60c9607c6dSBrad Fitzpatrick /*
6177dde9f9SPaul Lindner  * Forward Declarations
6277dde9f9SPaul Lindner  */
6356b8339eSSteven Grimm static int do_slabs_newslab(const unsigned int id);
64a6b35b44STrond Norbye static void *memory_allocate(size_t size);
65845a4fe1Sdormando static void do_slabs_free(void *ptr, const size_t size, unsigned int id);
6677dde9f9SPaul Lindner 
6777dde9f9SPaul Lindner /* Preallocate as many slab pages as possible (called from slabs_init)
6877dde9f9SPaul Lindner    on start-up, so users don't get confused out-of-memory errors when
6977dde9f9SPaul Lindner    they do have free (in-slab) space, but no space to make new slabs.
7077dde9f9SPaul Lindner    if maxslabs is 18 (POWER_LARGEST - POWER_SMALLEST + 1), then all
7177dde9f9SPaul Lindner    slab types can be made.  if max memory is less than 18 MB, only the
7277dde9f9SPaul Lindner    smaller ones will be made.  */
7377dde9f9SPaul Lindner static void slabs_preallocate (const unsigned int maxslabs);
7477dde9f9SPaul Lindner 
7577dde9f9SPaul Lindner /*
76c9607c6dSBrad Fitzpatrick  * Figures out which slab class (chunk size) is required to store an item of
77c9607c6dSBrad Fitzpatrick  * a given size.
7877dde9f9SPaul Lindner  *
7977dde9f9SPaul Lindner  * Given object size, return id to use when allocating/freeing memory for object
8077dde9f9SPaul Lindner  * 0 means error: can't store such a large object
81c9607c6dSBrad Fitzpatrick  */
8277dde9f9SPaul Lindner 
slabs_clsid(const size_t size)8377dde9f9SPaul Lindner unsigned int slabs_clsid(const size_t size) {
84c9607c6dSBrad Fitzpatrick     int res = POWER_SMALLEST;
8586969ea4SBrad Fitzpatrick 
860567967aSdormando     if (size == 0 || size > settings.item_size_max)
8760d70942SAnatoly Vorobey         return 0;
88c9607c6dSBrad Fitzpatrick     while (size > slabclass[res].size)
89c9607c6dSBrad Fitzpatrick         if (res++ == power_largest)     /* won't fit in the biggest slab */
900567967aSdormando             return power_largest;
9160d70942SAnatoly Vorobey     return res;
9260d70942SAnatoly Vorobey }
9386969ea4SBrad Fitzpatrick 
9444ef96eeSPaul Lindner /**
95c9607c6dSBrad Fitzpatrick  * Determines the chunk sizes and initializes the slab class descriptors
96c9607c6dSBrad Fitzpatrick  * accordingly.
97c9607c6dSBrad Fitzpatrick  */
/**
 * Determines the chunk sizes and initializes the slab class descriptors
 * accordingly.
 *
 * limit      - total memory budget in bytes
 * factor     - growth multiplier between successive chunk sizes
 *              (ignored when slab_sizes is supplied)
 * prealloc   - when true, grab the whole budget in one malloc up front
 *              and carve slab pages out of that arena; also pre-creates
 *              one page per class via slabs_preallocate()
 * slab_sizes - optional explicit chunk-size table terminated by a 0
 *              entry; overrides the factor-based progression
 */
void slabs_init(const size_t limit, const double factor, const bool prealloc, const uint32_t *slab_sizes) {
    int i = POWER_SMALLEST - 1;
    unsigned int size = sizeof(item) + settings.chunk_size;

    mem_limit = limit;

    if (prealloc) {
        /* Allocate everything in a big chunk with malloc */
        mem_base = malloc(mem_limit);
        if (mem_base != NULL) {
            mem_current = mem_base;
            mem_avail = mem_limit;
        } else {
            fprintf(stderr, "Warning: Failed to allocate requested memory in"
                    " one large chunk.\nWill allocate in smaller chunks\n");
        }
    }

    memset(slabclass, 0, sizeof(slabclass));

    while (++i < MAX_NUMBER_OF_SLAB_CLASSES-1) {
        if (slab_sizes != NULL) {
            /* explicit table: a 0 entry terminates the class list */
            if (slab_sizes[i-1] == 0)
                break;
            size = slab_sizes[i-1];
        } else if (size >= settings.slab_chunk_size_max / factor) {
            break;
        }
        /* Make sure items are always n-byte aligned */
        if (size % CHUNK_ALIGN_BYTES)
            size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);

        slabclass[i].size = size;
        slabclass[i].perslab = settings.slab_page_size / slabclass[i].size;
        if (slab_sizes == NULL)
            size *= factor;
        if (settings.verbose > 1) {
            fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
                    i, slabclass[i].size, slabclass[i].perslab);
        }
    }

    /* The final class always uses the maximum chunk size so the biggest
     * (possibly chunked) items have somewhere to live. */
    power_largest = i;
    slabclass[power_largest].size = settings.slab_chunk_size_max;
    slabclass[power_largest].perslab = settings.slab_page_size / settings.slab_chunk_size_max;
    if (settings.verbose > 1) {
        fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
                i, slabclass[i].size, slabclass[i].perslab);
    }

    /* for the test suite:  faking of how much we've already malloc'd */
    {
        char *t_initial_malloc = getenv("T_MEMD_INITIAL_MALLOC");
        if (t_initial_malloc) {
            mem_malloced = (size_t)atol(t_initial_malloc);
        }

    }

    if (prealloc) {
        slabs_preallocate(power_largest);
    }
}
16186969ea4SBrad Fitzpatrick 
slabs_preallocate(const unsigned int maxslabs)16277dde9f9SPaul Lindner static void slabs_preallocate (const unsigned int maxslabs) {
163c252f6e9SBrad Fitzpatrick     int i;
164c252f6e9SBrad Fitzpatrick     unsigned int prealloc = 0;
16586969ea4SBrad Fitzpatrick 
166c252f6e9SBrad Fitzpatrick     /* pre-allocate a 1MB slab in every size class so people don't get
167c252f6e9SBrad Fitzpatrick        confused by non-intuitive "SERVER_ERROR out of memory"
168c252f6e9SBrad Fitzpatrick        messages.  this is the most common question on the mailing
169c252f6e9SBrad Fitzpatrick        list.  if you really don't want this, you can rebuild without
170c252f6e9SBrad Fitzpatrick        these three lines.  */
17186969ea4SBrad Fitzpatrick 
172a2fc8e93Sdormando     for (i = POWER_SMALLEST; i < MAX_NUMBER_OF_SLAB_CLASSES; i++) {
173c252f6e9SBrad Fitzpatrick         if (++prealloc > maxslabs)
174c252f6e9SBrad Fitzpatrick             return;
175c979d213Sdormando         if (do_slabs_newslab(i) == 0) {
176c979d213Sdormando             fprintf(stderr, "Error while preallocating slab memory!\n"
177c979d213Sdormando                 "If using -L or other prealloc options, max memory must be "
178c979d213Sdormando                 "at least %d megabytes.\n", power_largest);
179c979d213Sdormando             exit(1);
180c979d213Sdormando         }
181c252f6e9SBrad Fitzpatrick     }
18286969ea4SBrad Fitzpatrick 
18360d70942SAnatoly Vorobey }
18486969ea4SBrad Fitzpatrick 
grow_slab_list(const unsigned int id)18577dde9f9SPaul Lindner static int grow_slab_list (const unsigned int id) {
186d72b1a2dSBrad Fitzpatrick     slabclass_t *p = &slabclass[id];
187d72b1a2dSBrad Fitzpatrick     if (p->slabs == p->list_size) {
18877dde9f9SPaul Lindner         size_t new_size =  (p->list_size != 0) ? p->list_size * 2 : 16;
189e3ca4e30SBrad Fitzpatrick         void *new_list = realloc(p->slab_list, new_size * sizeof(void *));
190d72b1a2dSBrad Fitzpatrick         if (new_list == 0) return 0;
191d72b1a2dSBrad Fitzpatrick         p->list_size = new_size;
192d72b1a2dSBrad Fitzpatrick         p->slab_list = new_list;
193d72b1a2dSBrad Fitzpatrick     }
194d72b1a2dSBrad Fitzpatrick     return 1;
195d72b1a2dSBrad Fitzpatrick }
19686969ea4SBrad Fitzpatrick 
split_slab_page_into_freelist(char * ptr,const unsigned int id)197845a4fe1Sdormando static void split_slab_page_into_freelist(char *ptr, const unsigned int id) {
198845a4fe1Sdormando     slabclass_t *p = &slabclass[id];
199845a4fe1Sdormando     int x;
200845a4fe1Sdormando     for (x = 0; x < p->perslab; x++) {
201845a4fe1Sdormando         do_slabs_free(ptr, 0, id);
202845a4fe1Sdormando         ptr += p->size;
203845a4fe1Sdormando     }
204845a4fe1Sdormando }
205845a4fe1Sdormando 
206d6e96467Sdormando /* Fast FIFO queue */
get_page_from_global_pool(void)207d6e96467Sdormando static void *get_page_from_global_pool(void) {
208d6e96467Sdormando     slabclass_t *p = &slabclass[SLAB_GLOBAL_PAGE_POOL];
209d6e96467Sdormando     if (p->slabs < 1) {
210d6e96467Sdormando         return NULL;
211d6e96467Sdormando     }
212d6e96467Sdormando     char *ret = p->slab_list[p->slabs - 1];
213d6e96467Sdormando     p->slabs--;
214d6e96467Sdormando     return ret;
215d6e96467Sdormando }
216d6e96467Sdormando 
/* Add one new page to slab class `id`. The page comes from the global
 * reclaim pool when available, otherwise from memory_allocate().
 * Returns 1 on success, 0 when the memory limit is hit or allocation
 * fails. Caller must hold slabs_lock.
 */
static int do_slabs_newslab(const unsigned int id) {
    slabclass_t *p = &slabclass[id];
    slabclass_t *g = &slabclass[SLAB_GLOBAL_PAGE_POOL];
    /* With slab_reassign enabled every page is item_size_max bytes so
     * pages can later move between classes; otherwise size the page to
     * an exact multiple of this class's chunk size. */
    int len = settings.slab_reassign ? settings.item_size_max
        : p->size * p->perslab;
    char *ptr;

    /* Refuse only when over the limit AND the class already owns at
     * least one page AND the global pool has nothing to recycle: a
     * class's first page is always granted so it can store something. */
    if ((mem_limit && mem_malloced + len > mem_limit && p->slabs > 0
         && g->slabs == 0)) {
        mem_limit_reached = true;
        MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
        return 0;
    }

    /* Grow the page list first; then try the reclaim pool before
     * allocating fresh memory. */
    if ((grow_slab_list(id) == 0) ||
        (((ptr = get_page_from_global_pool()) == NULL) &&
        ((ptr = memory_allocate((size_t)len)) == 0))) {

        MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
        return 0;
    }

    memset(ptr, 0, (size_t)len);
    split_slab_page_into_freelist(ptr, id);

    p->slab_list[p->slabs++] = ptr;
    MEMCACHED_SLABS_SLABCLASS_ALLOCATE(id);

    return 1;
}
24786969ea4SBrad Fitzpatrick 
/* This calculation ends up adding sizeof(void *) to the item size. */
/* Allocate a chunked item: a head item plus enough additional chunks
 * from class `p`/`id` to cover `size` bytes of payload. Chunks are
 * consumed in freelist order, reusing the next/prev links the freelist
 * already maintains. Returns the head item or NULL if enough chunks
 * could not be obtained. Caller must hold slabs_lock.
 */
static void *do_slabs_alloc_chunked(const size_t size, slabclass_t *p, unsigned int id) {
    void *ret = NULL;
    item *it = NULL;
    int x;
    /* usable payload per chunk once the item_chunk header is paid for */
    int csize = p->size - sizeof(item_chunk);
    unsigned int chunks_req = size / csize;
    if (size % csize != 0)
        chunks_req++;
    /* grow the class until the freelist can satisfy the whole chain */
    while (p->sl_curr < chunks_req) {
        if (do_slabs_newslab(id) == 0)
            break;
    }

    if (p->sl_curr >= chunks_req) {
        item_chunk *chunk = NULL;

        /* Configure the head item in the chain. */
        it = (item *)p->slots;
        p->slots = it->next;
        if (it->next) it->next->prev = 0;

        /* Squirrel away the "top chunk" into h_next for now */
        it->h_next = (item *)p->slots;
        assert(it->h_next != 0);
        chunk = (item_chunk *) it->h_next;

        /* roll down the chunks, marking them as such. */
        for (x = 0; x < chunks_req-1; x++) {
            chunk->it_flags &= ~ITEM_SLABBED;
            chunk->it_flags |= ITEM_CHUNK;
            /* Chunks always have a direct reference to the head item */
            chunk->head = it;
            chunk->size = p->size - sizeof(item_chunk);
            chunk->used = 0;
            chunk = chunk->next;
        }

        /* The final "next" is now the top of the slab freelist */
        p->slots = chunk;
        if (chunk && chunk->prev) {
            /* Disconnect the final chunk from the chain */
            chunk->prev->next = 0;
            chunk->prev = 0;
        }

        /* Mark the head live; refcount set here for slab-mover safety. */
        it->it_flags &= ~ITEM_SLABBED;
        it->it_flags |= ITEM_CHUNKED;
        it->refcount = 1;
        p->sl_curr -= chunks_req;
        ret = (void *)it;
    } else {
        ret = NULL;
    }

    return ret;
}
3050567967aSdormando 
/*@null@*/
/* Allocate one object of `size` bytes from class `id`.
 * Sizes that fit a single chunk come straight off the freelist (growing
 * the class with a new page when empty, unless flags ==
 * SLABS_ALLOC_NO_NEWPAGE); larger sizes are delegated to the chunked
 * allocator. When total_bytes is non-NULL it receives the class's
 * current requested-byte total. Returns NULL on failure.
 * Caller must hold slabs_lock.
 */
static void *do_slabs_alloc(const size_t size, unsigned int id, uint64_t *total_bytes,
        unsigned int flags) {
    slabclass_t *p;
    void *ret = NULL;
    item *it = NULL;

    if (id < POWER_SMALLEST || id > power_largest) {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, 0);
        return NULL;
    }
    p = &slabclass[id];
    /* freelist head must look free (clsid cleared by do_slabs_free) */
    assert(p->sl_curr == 0 || ((item *)p->slots)->slabs_clsid == 0);
    if (total_bytes != NULL) {
        *total_bytes = p->requested;
    }

    if (size <= p->size) {
        /* fail unless we have space at the end of a recently allocated page,
           we have something on our freelist, or we could allocate a new page */
        if (p->sl_curr == 0 && flags != SLABS_ALLOC_NO_NEWPAGE) {
            do_slabs_newslab(id);
        }

        if (p->sl_curr != 0) {
            /* return off our freelist */
            it = (item *)p->slots;
            p->slots = it->next;
            if (it->next) it->next->prev = 0;
            /* Kill flag and initialize refcount here for lock safety in slab
             * mover's freeness detection. */
            it->it_flags &= ~ITEM_SLABBED;
            it->refcount = 1;
            p->sl_curr--;
            ret = (void *)it;
        } else {
            ret = NULL;
        }
    } else {
        /* Dealing with a chunked item. */
        ret = do_slabs_alloc_chunked(size, p, id);
    }

    if (ret) {
        p->requested += size;
        MEMCACHED_SLABS_ALLOCATE(size, id, p->size, ret);
    } else {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, id);
    }

    return ret;
}
35886969ea4SBrad Fitzpatrick 
/* Return a chunked item and its entire chunk chain to class `id`'s
 * freelist. The head and every chunk are re-marked ITEM_SLABBED and the
 * whole chain is spliced, in order, onto the front of p->slots.
 * Caller must hold slabs_lock.
 */
static void do_slabs_free_chunked(item *it, const size_t size, unsigned int id,
                                  slabclass_t *p) {
    item_chunk *chunk = (item_chunk *) ITEM_data(it);
    size_t realsize = size;
    /* NOTE(review): realsize accumulates the chunk-header overhead but
     * is never read afterwards; p->requested is adjusted by `size` only.
     * Looks like leftover accounting — confirm before removing. */
    while (chunk) {
        realsize += sizeof(item_chunk);
        chunk = chunk->next;
    }
    chunk = (item_chunk *) ITEM_data(it);
    unsigned int chunks_found = 1;

    it->it_flags = ITEM_SLABBED;
    it->slabs_clsid = 0;
    it->prev = 0;
    /* the first chunk lives inside the head item's data area, so the
     * freelist link jumps straight to the second chunk */
    it->next = (item *) chunk->next;
    assert(it->next);
    /* top chunk should already point back to head */
    assert(it->next && (void*)it->next->prev == (void*)chunk);
    chunk = chunk->next;
    chunk->prev = (item_chunk *)it;

    /* walk the remaining chunks, marking each one as freed */
    while (chunk) {
        assert(chunk->it_flags == ITEM_CHUNK);
        chunk->it_flags = ITEM_SLABBED;
        chunk->slabs_clsid = 0;
        chunks_found++;
        if (chunk->next) {
            chunk = chunk->next;
        } else {
            break;
        }
    }
    /* must have had nothing hanging off of the final chunk */
    assert(chunk && chunk->next == 0);
    /* Tail chunk, link the freelist here. */
    chunk->next = p->slots;
    if (chunk->next) chunk->next->prev = chunk;

    p->slots = it;
    p->sl_curr += chunks_found;
    p->requested -= size;

    return;
}
4030567967aSdormando 
4040567967aSdormando 
do_slabs_free(void * ptr,const size_t size,unsigned int id)405d9220d64STrond Norbye static void do_slabs_free(void *ptr, const size_t size, unsigned int id) {
406c08383afSBrad Fitzpatrick     slabclass_t *p;
40710698baeSdormando     item *it;
40886969ea4SBrad Fitzpatrick 
409c9607c6dSBrad Fitzpatrick     assert(id >= POWER_SMALLEST && id <= power_largest);
410c9607c6dSBrad Fitzpatrick     if (id < POWER_SMALLEST || id > power_largest)
41160d70942SAnatoly Vorobey         return;
41286969ea4SBrad Fitzpatrick 
41368957214STrond Norbye     MEMCACHED_SLABS_FREE(size, id, ptr);
41460d70942SAnatoly Vorobey     p = &slabclass[id];
41586969ea4SBrad Fitzpatrick 
41610698baeSdormando     it = (item *)ptr;
4170567967aSdormando     if ((it->it_flags & ITEM_CHUNKED) == 0) {
418186509c2Sdormando         it->it_flags = ITEM_SLABBED;
41962415f16Sdormando         it->slabs_clsid = 0;
42010698baeSdormando         it->prev = 0;
42110698baeSdormando         it->next = p->slots;
42210698baeSdormando         if (it->next) it->next->prev = it;
42310698baeSdormando         p->slots = it;
42410698baeSdormando 
42510698baeSdormando         p->sl_curr++;
4267a82822bSTrond Norbye         p->requested -= size;
4270567967aSdormando     } else {
4280567967aSdormando         do_slabs_free_chunked(it, size, id, p);
4290567967aSdormando     }
43060d70942SAnatoly Vorobey     return;
43160d70942SAnatoly Vorobey }
43286969ea4SBrad Fitzpatrick 
/* Compare a length-delimited (not necessarily NUL-terminated) string
 * `nz` of `nzlength` bytes against the NUL-terminated string `z`.
 * Returns 0 on an exact match, -1 otherwise. */
static int nz_strcmp(int nzlength, const char *nz, const char *z) {
    size_t zlength = strlen(z);
    if ((size_t)nzlength != zlength)
        return -1;
    return (memcmp(nz, z, zlength) == 0) ? 0 : -1;
}
437bd6a8278SDustin Sallings 
/* Dispatch a "stats" request. stat_type selects the sub-report
 * ("items", "slabs", "sizes", "sizes_enable", "sizes_disable"); a NULL
 * stat_type produces the general engine stats. Output is emitted
 * through the add_stats callback for connection `c`.
 * Returns false for an unknown stat_type or a NULL add_stats callback.
 */
bool get_stats(const char *stat_type, int nkey, ADD_STAT add_stats, void *c) {
    bool ret = true;

    if (add_stats != NULL) {
        if (!stat_type) {
            /* prepare general statistics for the engine */
            STATS_LOCK();
            APPEND_STAT("bytes", "%llu", (unsigned long long)stats_state.curr_bytes);
            APPEND_STAT("curr_items", "%llu", (unsigned long long)stats_state.curr_items);
            APPEND_STAT("total_items", "%llu", (unsigned long long)stats.total_items);
            STATS_UNLOCK();
            if (settings.slab_automove > 0) {
                /* global pool size is slab-allocator state; take its lock */
                pthread_mutex_lock(&slabs_lock);
                APPEND_STAT("slab_global_page_pool", "%u", slabclass[SLAB_GLOBAL_PAGE_POOL].slabs);
                pthread_mutex_unlock(&slabs_lock);
            }
            item_stats_totals(add_stats, c);
        } else if (nz_strcmp(nkey, stat_type, "items") == 0) {
            item_stats(add_stats, c);
        } else if (nz_strcmp(nkey, stat_type, "slabs") == 0) {
            slabs_stats(add_stats, c);
        } else if (nz_strcmp(nkey, stat_type, "sizes") == 0) {
            item_stats_sizes(add_stats, c);
        } else if (nz_strcmp(nkey, stat_type, "sizes_enable") == 0) {
            item_stats_sizes_enable(add_stats, c);
        } else if (nz_strcmp(nkey, stat_type, "sizes_disable") == 0) {
            item_stats_sizes_disable(add_stats, c);
        } else {
            ret = false;
        }
    } else {
        ret = false;
    }

    return ret;
}
4744c77f591SToru Maesaka 
/*@null@*/
/* Emit per-slab-class statistics through add_stats, followed by overall
 * active_slabs / total_malloced totals and a terminating record.
 * Classes with no pages are skipped. Invoked via slabs_stats, which
 * holds slabs_lock. */
static void do_slabs_stats(ADD_STAT add_stats, void *c) {
    int i, total;
    /* Get the per-thread stats which contain some interesting aggregates */
    struct thread_stats thread_stats;
    threadlocal_stats_aggregate(&thread_stats);

    total = 0;
    for(i = POWER_SMALLEST; i <= power_largest; i++) {
        slabclass_t *p = &slabclass[i];
        if (p->slabs != 0) {
            uint32_t perslab, slabs;
            slabs = p->slabs;
            perslab = p->perslab;

            /* buffers + lengths consumed by the APPEND_NUM_STAT macro */
            char key_str[STAT_KEY_LEN];
            char val_str[STAT_VAL_LEN];
            int klen = 0, vlen = 0;

            APPEND_NUM_STAT(i, "chunk_size", "%u", p->size);
            APPEND_NUM_STAT(i, "chunks_per_page", "%u", perslab);
            APPEND_NUM_STAT(i, "total_pages", "%u", slabs);
            APPEND_NUM_STAT(i, "total_chunks", "%u", slabs * perslab);
            APPEND_NUM_STAT(i, "used_chunks", "%u",
                            slabs*perslab - p->sl_curr);
            APPEND_NUM_STAT(i, "free_chunks", "%u", p->sl_curr);
            /* Stat is dead, but displaying zero instead of removing it. */
            APPEND_NUM_STAT(i, "free_chunks_end", "%u", 0);
            APPEND_NUM_STAT(i, "mem_requested", "%llu",
                            (unsigned long long)p->requested);
            APPEND_NUM_STAT(i, "get_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].get_hits);
            APPEND_NUM_STAT(i, "cmd_set", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].set_cmds);
            APPEND_NUM_STAT(i, "delete_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].delete_hits);
            APPEND_NUM_STAT(i, "incr_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].incr_hits);
            APPEND_NUM_STAT(i, "decr_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].decr_hits);
            APPEND_NUM_STAT(i, "cas_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].cas_hits);
            APPEND_NUM_STAT(i, "cas_badval", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].cas_badval);
            APPEND_NUM_STAT(i, "touch_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].touch_hits);
            total++;
        }
    }

    /* add overall slab stats and append terminator */

    APPEND_STAT("active_slabs", "%d", total);
    APPEND_STAT("total_malloced", "%llu", (unsigned long long)mem_malloced);
    add_stats(NULL, 0, NULL, 0, c);
}
53186969ea4SBrad Fitzpatrick 
memory_allocate(size_t size)532a6b35b44STrond Norbye static void *memory_allocate(size_t size) {
533a6b35b44STrond Norbye     void *ret;
534a6b35b44STrond Norbye 
535a6b35b44STrond Norbye     if (mem_base == NULL) {
536a6b35b44STrond Norbye         /* We are not using a preallocated large memory chunk */
537a6b35b44STrond Norbye         ret = malloc(size);
538a6b35b44STrond Norbye     } else {
539a6b35b44STrond Norbye         ret = mem_current;
540a6b35b44STrond Norbye 
541a6b35b44STrond Norbye         if (size > mem_avail) {
542a6b35b44STrond Norbye             return NULL;
543a6b35b44STrond Norbye         }
544a6b35b44STrond Norbye 
545a6b35b44STrond Norbye         /* mem_current pointer _must_ be aligned!!! */
546a6b35b44STrond Norbye         if (size % CHUNK_ALIGN_BYTES) {
547a6b35b44STrond Norbye             size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
548a6b35b44STrond Norbye         }
549a6b35b44STrond Norbye 
550df1b7e42STrond Norbye         mem_current = ((char*)mem_current) + size;
551a6b35b44STrond Norbye         if (size < mem_avail) {
552a6b35b44STrond Norbye             mem_avail -= size;
553a6b35b44STrond Norbye         } else {
554a6b35b44STrond Norbye             mem_avail = 0;
555a6b35b44STrond Norbye         }
556a6b35b44STrond Norbye     }
557ec937e5eSdormando     mem_malloced += size;
558a6b35b44STrond Norbye 
559a6b35b44STrond Norbye     return ret;
560a6b35b44STrond Norbye }
561d9220d64STrond Norbye 
56231541b37Sdormando /* Must only be used if all pages are item_size_max */
memory_release()56331541b37Sdormando static void memory_release() {
56431541b37Sdormando     void *p = NULL;
56531541b37Sdormando     if (mem_base != NULL)
56631541b37Sdormando         return;
56731541b37Sdormando 
56831541b37Sdormando     if (!settings.slab_reassign)
56931541b37Sdormando         return;
57031541b37Sdormando 
57131541b37Sdormando     while (mem_malloced > mem_limit &&
57231541b37Sdormando             (p = get_page_from_global_pool()) != NULL) {
57331541b37Sdormando         free(p);
57431541b37Sdormando         mem_malloced -= settings.item_size_max;
57531541b37Sdormando     }
57631541b37Sdormando }
57731541b37Sdormando 
slabs_alloc(size_t size,unsigned int id,uint64_t * total_bytes,unsigned int flags)578*ee461d11Sdormando void *slabs_alloc(size_t size, unsigned int id, uint64_t *total_bytes,
579b1debc4cSdormando         unsigned int flags) {
580d9220d64STrond Norbye     void *ret;
581d9220d64STrond Norbye 
582d9220d64STrond Norbye     pthread_mutex_lock(&slabs_lock);
583*ee461d11Sdormando     ret = do_slabs_alloc(size, id, total_bytes, flags);
584d9220d64STrond Norbye     pthread_mutex_unlock(&slabs_lock);
585d9220d64STrond Norbye     return ret;
586d9220d64STrond Norbye }
587d9220d64STrond Norbye 
/* Thread-safe entry point: return `ptr` (of `size` bytes) to slab class
 * `id`'s freelist, serialized on slabs_lock. */
void slabs_free(void *ptr, size_t size, unsigned int id) {
    pthread_mutex_lock(&slabs_lock);
    do_slabs_free(ptr, size, id);
    pthread_mutex_unlock(&slabs_lock);
}
593d9220d64STrond Norbye 
/* Thread-safe entry point: emit per-class slab statistics through the
 * `add_stats` callback while holding slabs_lock. */
void slabs_stats(ADD_STAT add_stats, void *c) {
    pthread_mutex_lock(&slabs_lock);
    do_slabs_stats(add_stats, c);
    pthread_mutex_unlock(&slabs_lock);
}
599efad616dSTrond Norbye 
do_slabs_adjust_mem_limit(size_t new_mem_limit)60031541b37Sdormando static bool do_slabs_adjust_mem_limit(size_t new_mem_limit) {
60131541b37Sdormando     /* Cannot adjust memory limit at runtime if prealloc'ed */
60231541b37Sdormando     if (mem_base != NULL)
60331541b37Sdormando         return false;
60431541b37Sdormando     settings.maxbytes = new_mem_limit;
60531541b37Sdormando     mem_limit = new_mem_limit;
60631541b37Sdormando     mem_limit_reached = false; /* Will reset on next alloc */
60731541b37Sdormando     memory_release(); /* free what might already be in the global pool */
60831541b37Sdormando     return true;
60931541b37Sdormando }
61031541b37Sdormando 
slabs_adjust_mem_limit(size_t new_mem_limit)61131541b37Sdormando bool slabs_adjust_mem_limit(size_t new_mem_limit) {
61231541b37Sdormando     bool ret;
61331541b37Sdormando     pthread_mutex_lock(&slabs_lock);
61431541b37Sdormando     ret = do_slabs_adjust_mem_limit(new_mem_limit);
61531541b37Sdormando     pthread_mutex_unlock(&slabs_lock);
61631541b37Sdormando     return ret;
61731541b37Sdormando }
61831541b37Sdormando 
slabs_adjust_mem_requested(unsigned int id,size_t old,size_t ntotal)619efad616dSTrond Norbye void slabs_adjust_mem_requested(unsigned int id, size_t old, size_t ntotal)
620efad616dSTrond Norbye {
621efad616dSTrond Norbye     pthread_mutex_lock(&slabs_lock);
622efad616dSTrond Norbye     slabclass_t *p;
623efad616dSTrond Norbye     if (id < POWER_SMALLEST || id > power_largest) {
624efad616dSTrond Norbye         fprintf(stderr, "Internal error! Invalid slab class\n");
625efad616dSTrond Norbye         abort();
626efad616dSTrond Norbye     }
627efad616dSTrond Norbye 
628efad616dSTrond Norbye     p = &slabclass[id];
629efad616dSTrond Norbye     p->requested = p->requested - old + ntotal;
630efad616dSTrond Norbye     pthread_mutex_unlock(&slabs_lock);
631efad616dSTrond Norbye }
63210698baeSdormando 
slabs_available_chunks(const unsigned int id,bool * mem_flag,uint64_t * total_bytes,unsigned int * chunks_perslab)633fb269897Sdormando unsigned int slabs_available_chunks(const unsigned int id, bool *mem_flag,
634*ee461d11Sdormando         uint64_t *total_bytes, unsigned int *chunks_perslab) {
635fb269897Sdormando     unsigned int ret;
636fb269897Sdormando     slabclass_t *p;
637fb269897Sdormando 
638fb269897Sdormando     pthread_mutex_lock(&slabs_lock);
639fb269897Sdormando     p = &slabclass[id];
640fb269897Sdormando     ret = p->sl_curr;
641e708513aSdormando     if (mem_flag != NULL)
642fb269897Sdormando         *mem_flag = mem_limit_reached;
643*ee461d11Sdormando     if (total_bytes != NULL)
644*ee461d11Sdormando         *total_bytes = p->requested;
645d6e96467Sdormando     if (chunks_perslab != NULL)
646d6e96467Sdormando         *chunks_perslab = p->perslab;
647fb269897Sdormando     pthread_mutex_unlock(&slabs_lock);
648fb269897Sdormando     return ret;
649fb269897Sdormando }
650fb269897Sdormando 
/* Wakes the rebalance thread when there is a move queued; it waits on this
 * with slabs_rebalance_lock held (see slab_rebalance_thread). */
static pthread_cond_t slab_rebalance_cond = PTHREAD_COND_INITIALIZER;
/* Run flags for the background threads; cleared to request shutdown.
 * NOTE(review): volatile is used for cross-thread visibility here, which
 * predates C11 atomics — confirm before relying on it for new code. */
static volatile int do_run_slab_thread = 1;
static volatile int do_run_slab_rebalance_thread = 1;

/* How many chunks slab_rebalance_move examines per pass while holding
 * slabs_lock; kept small to bound lock hold time. */
#define DEFAULT_SLAB_BULK_CHECK 1
int slab_bulk_check = DEFAULT_SLAB_BULK_CHECK;
65710698baeSdormando 
slab_rebalance_start(void)65810698baeSdormando static int slab_rebalance_start(void) {
65910698baeSdormando     slabclass_t *s_cls;
66010698baeSdormando     int no_go = 0;
66110698baeSdormando 
66210698baeSdormando     pthread_mutex_lock(&slabs_lock);
66310698baeSdormando 
66410698baeSdormando     if (slab_rebal.s_clsid < POWER_SMALLEST ||
66510698baeSdormando         slab_rebal.s_clsid > power_largest  ||
666d6e96467Sdormando         slab_rebal.d_clsid < SLAB_GLOBAL_PAGE_POOL ||
6678c1c18edSdormando         slab_rebal.d_clsid > power_largest  ||
6688c1c18edSdormando         slab_rebal.s_clsid == slab_rebal.d_clsid)
66910698baeSdormando         no_go = -2;
67010698baeSdormando 
67110698baeSdormando     s_cls = &slabclass[slab_rebal.s_clsid];
67210698baeSdormando 
673423b9fd4Sdormando     if (!grow_slab_list(slab_rebal.d_clsid)) {
67410698baeSdormando         no_go = -1;
67510698baeSdormando     }
67610698baeSdormando 
67710698baeSdormando     if (s_cls->slabs < 2)
67810698baeSdormando         no_go = -3;
67910698baeSdormando 
68010698baeSdormando     if (no_go != 0) {
68110698baeSdormando         pthread_mutex_unlock(&slabs_lock);
68210698baeSdormando         return no_go; /* Should use a wrapper function... */
68310698baeSdormando     }
68410698baeSdormando 
685d5185f9cSdormando     /* Always kill the first available slab page as it is most likely to
686d5185f9cSdormando      * contain the oldest items
687d5185f9cSdormando      */
688d5185f9cSdormando     slab_rebal.slab_start = s_cls->slab_list[0];
68910698baeSdormando     slab_rebal.slab_end   = (char *)slab_rebal.slab_start +
69010698baeSdormando         (s_cls->size * s_cls->perslab);
69110698baeSdormando     slab_rebal.slab_pos   = slab_rebal.slab_start;
69210698baeSdormando     slab_rebal.done       = 0;
69310698baeSdormando 
69410698baeSdormando     /* Also tells do_item_get to search for items in this slab */
69510698baeSdormando     slab_rebalance_signal = 2;
69610698baeSdormando 
69710698baeSdormando     if (settings.verbose > 1) {
69810698baeSdormando         fprintf(stderr, "Started a slab rebalance\n");
69910698baeSdormando     }
70010698baeSdormando 
70110698baeSdormando     pthread_mutex_unlock(&slabs_lock);
70210698baeSdormando 
70310698baeSdormando     STATS_LOCK();
704cb01d504Sdormando     stats_state.slab_reassign_running = true;
70510698baeSdormando     STATS_UNLOCK();
70610698baeSdormando 
70710698baeSdormando     return 0;
70810698baeSdormando }
70910698baeSdormando 
/* CALLED WITH slabs_lock HELD */
/* Allocate a replacement chunk of `size` bytes from class `id` for a
 * rescued item, guaranteeing the returned memory does NOT lie inside the
 * page currently being evacuated. Allocations that land in that range
 * are marked freed on the spot (they were already unlinked from the
 * freelist by do_slabs_alloc) and counted as inline_reclaim; at most
 * perslab attempts are made. Returns NULL when no usable memory exists. */
static void *slab_rebalance_alloc(const size_t size, unsigned int id) {
    slabclass_t *s_cls;
    s_cls = &slabclass[slab_rebal.s_clsid];
    int x;
    item *new_it = NULL;

    for (x = 0; x < s_cls->perslab; x++) {
        new_it = do_slabs_alloc(size, id, NULL, SLABS_ALLOC_NO_NEWPAGE);
        /* check that memory isn't within the range to clear */
        if (new_it == NULL) {
            break;
        }
        if ((void *)new_it >= slab_rebal.slab_start
            && (void *)new_it < slab_rebal.slab_end) {
            /* Pulled something we intend to free. Mark it as freed since
             * we've already done the work of unlinking it from the freelist.
             */
            /* Undo the requested-bytes charge do_slabs_alloc just added. */
            s_cls->requested -= size;
            new_it->refcount = 0;
            new_it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
#ifdef DEBUG_SLAB_MOVER
            memcpy(ITEM_key(new_it), "deadbeef", 8);
#endif
            new_it = NULL;
            slab_rebal.inline_reclaim++;
        } else {
            break;
        }
    }
    return new_it;
}
742b1debc4cSdormando 
743*ee461d11Sdormando /* CALLED WITH slabs_lock HELD */
744*ee461d11Sdormando /* detatches item/chunk from freelist. */
slab_rebalance_cut_free(slabclass_t * s_cls,item * it)745*ee461d11Sdormando static void slab_rebalance_cut_free(slabclass_t *s_cls, item *it) {
746*ee461d11Sdormando     /* Ensure this was on the freelist and nothing else. */
747*ee461d11Sdormando     assert(it->it_flags == ITEM_SLABBED);
748*ee461d11Sdormando     if (s_cls->slots == it) {
749*ee461d11Sdormando         s_cls->slots = it->next;
750*ee461d11Sdormando     }
751*ee461d11Sdormando     if (it->next) it->next->prev = it->prev;
752*ee461d11Sdormando     if (it->prev) it->prev->next = it->next;
753*ee461d11Sdormando     s_cls->sl_curr--;
754*ee461d11Sdormando }
755*ee461d11Sdormando 
/* Disposition of the chunk currently under the rebalancer's cursor:
 * untouched, reclaimed from the freelist, rescued from the LRU, busy
 * (in flight elsewhere), or its hash-bucket lock could not be taken. */
enum move_status {
    MOVE_PASS=0, MOVE_FROM_SLAB, MOVE_FROM_LRU, MOVE_BUSY, MOVE_LOCKED
};
759f4983b20Sdormando 
76069d1c699Sdormando /* refcount == 0 is safe since nobody can incr while item_lock is held.
76110698baeSdormando  * refcount != 0 is impossible since flags/etc can be modified in other
76210698baeSdormando  * threads. instead, note we found a busy one and bail. logic in do_item_get
76310698baeSdormando  * will prevent busy items from continuing to be busy
76462415f16Sdormando  * NOTE: This is checking it_flags outside of an item lock. I believe this
76562415f16Sdormando  * works since it_flags is 8 bits, and we're only ever comparing a single bit
76662415f16Sdormando  * regardless. ITEM_SLABBED bit will always be correct since we're holding the
76762415f16Sdormando  * lock which modifies that bit. ITEM_LINKED won't exist if we're between an
76862415f16Sdormando  * item having ITEM_SLABBED removed, and the key hasn't been added to the item
76962415f16Sdormando  * yet. The memory barrier from the slabs lock should order the key write and the
77062415f16Sdormando  * flags to the item?
77162415f16Sdormando  * If ITEM_LINKED did exist and was just removed, but we still see it, that's
77262415f16Sdormando  * still safe since it will have a valid key, which we then lock, and then
77362415f16Sdormando  * recheck everything.
77462415f16Sdormando  * This may not be safe on all platforms; If not, slabs_alloc() will need to
77562415f16Sdormando  * seed the item key while holding slabs_lock.
77610698baeSdormando  */
/* One pass of the page evacuation state machine: examine up to
 * slab_bulk_check chunks of the page being reclaimed, and for each one
 * either cut it from the freelist, rescue the live item into freshly
 * allocated memory, or mark it busy to be retried on a later pass.
 * Returns the number of busy chunks encountered (caller backs off when
 * non-zero). Runs on the rebalance thread; takes slabs_lock itself. */
static int slab_rebalance_move(void) {
    slabclass_t *s_cls;
    int x;
    int was_busy = 0;
    int refcount = 0;
    uint32_t hv;
    void *hold_lock;
    enum move_status status = MOVE_PASS;

    pthread_mutex_lock(&slabs_lock);

    s_cls = &slabclass[slab_rebal.s_clsid];

    for (x = 0; x < slab_bulk_check; x++) {
        /* Reset per-chunk state each iteration. */
        hv = 0;
        hold_lock = NULL;
        item *it = slab_rebal.slab_pos;
        item_chunk *ch = NULL;
        status = MOVE_PASS;
        if (it->it_flags & ITEM_CHUNK) {
            /* This chunk is a chained part of a larger item. */
            ch = (item_chunk *) it;
            /* Instead, we use the head chunk to find the item and effectively
             * lock the entire structure. If a chunk has ITEM_CHUNK flag, its
             * head cannot be slabbed, so the normal routine is safe. */
            it = ch->head;
            assert(it->it_flags & ITEM_CHUNKED);
        }

        /* ITEM_FETCHED when ITEM_SLABBED is overloaded to mean we've cleared
         * the chunk for move. Only these two flags should exist.
         */
        if (it->it_flags != (ITEM_SLABBED|ITEM_FETCHED)) {
            /* ITEM_SLABBED can only be added/removed under the slabs_lock */
            if (it->it_flags & ITEM_SLABBED) {
                /* Free chunk: just pull it off the freelist. */
                assert(ch == NULL);
                slab_rebalance_cut_free(s_cls, it);
                status = MOVE_FROM_SLAB;
            } else if ((it->it_flags & ITEM_LINKED) != 0) {
                /* If it doesn't have ITEM_SLABBED, the item could be in any
                 * state on its way to being freed or written to. If no
                 * ITEM_SLABBED, but it's had ITEM_LINKED, it must be active
                 * and have the key written to it already.
                 */
                hv = hash(ITEM_key(it), it->nkey);
                if ((hold_lock = item_trylock(hv)) == NULL) {
                    status = MOVE_LOCKED;
                } else {
                    refcount = refcount_incr(&it->refcount);
                    if (refcount == 2) { /* item is linked but not busy */
                        /* Double check ITEM_LINKED flag here, since we're
                         * past a memory barrier from the mutex. */
                        if ((it->it_flags & ITEM_LINKED) != 0) {
                            status = MOVE_FROM_LRU;
                        } else {
                            /* refcount == 1 + !ITEM_LINKED means the item is being
                             * uploaded to, or was just unlinked but hasn't been freed
                             * yet. Let it bleed off on its own and try again later */
                            status = MOVE_BUSY;
                        }
                    } else {
                        if (settings.verbose > 2) {
                            fprintf(stderr, "Slab reassign hit a busy item: refcount: %d (%d -> %d)\n",
                                it->refcount, slab_rebal.s_clsid, slab_rebal.d_clsid);
                        }
                        status = MOVE_BUSY;
                    }
                    /* Item lock must be held while modifying refcount */
                    if (status == MOVE_BUSY) {
                        refcount_decr(&it->refcount);
                        item_trylock_unlock(hold_lock);
                    }
                }
            } else {
                /* See above comment. No ITEM_SLABBED or ITEM_LINKED. Mark
                 * busy and wait for item to complete its upload. */
                status = MOVE_BUSY;
            }
        }

        int save_item = 0;
        item *new_it = NULL;
        size_t ntotal = 0;
        switch (status) {
            case MOVE_FROM_LRU:
                /* Lock order is LRU locks -> slabs_lock. unlink uses LRU lock.
                 * We only need to hold the slabs_lock while initially looking
                 * at an item, and at this point we have an exclusive refcount
                 * (2) + the item is locked. Drop slabs lock, drop item to
                 * refcount 1 (just our own, then fall through and wipe it
                 */
                /* Check if expired or flushed */
                ntotal = ITEM_ntotal(it);
                /* REQUIRES slabs_lock: CHECK FOR cls->sl_curr > 0 */
                if (ch == NULL && (it->it_flags & ITEM_CHUNKED)) {
                    /* Chunked should be identical to non-chunked, except we need
                     * to swap out ntotal for the head-chunk-total. */
                    ntotal = s_cls->size;
                }
                if ((it->exptime != 0 && it->exptime < current_time)
                    || item_is_flushed(it)) {
                    /* Expired, don't save. */
                    save_item = 0;
                } else if (ch == NULL &&
                        (new_it = slab_rebalance_alloc(ntotal, slab_rebal.s_clsid)) == NULL) {
                    /* Not a chunk of an item, and nomem. */
                    save_item = 0;
                    slab_rebal.evictions_nomem++;
                } else if (ch != NULL &&
                        (new_it = slab_rebalance_alloc(s_cls->size, slab_rebal.s_clsid)) == NULL) {
                    /* Is a chunk of an item, and nomem. */
                    save_item = 0;
                    slab_rebal.evictions_nomem++;
                } else {
                    /* Was whatever it was, and we have memory for it. */
                    save_item = 1;
                }
                /* Drop slabs_lock: do_item_replace/do_item_unlink below take
                 * LRU locks, which must be acquired before slabs_lock. */
                pthread_mutex_unlock(&slabs_lock);
                unsigned int requested_adjust = 0;
                if (save_item) {
                    if (ch == NULL) {
                        assert((new_it->it_flags & ITEM_CHUNKED) == 0);
                        /* if free memory, memcpy. clear prev/next/h_bucket */
                        memcpy(new_it, it, ntotal);
                        new_it->prev = 0;
                        new_it->next = 0;
                        new_it->h_next = 0;
                        /* These are definitely required. else fails assert */
                        new_it->it_flags &= ~ITEM_LINKED;
                        new_it->refcount = 0;
                        do_item_replace(it, new_it, hv);
                        /* Need to walk the chunks and repoint head  */
                        if (new_it->it_flags & ITEM_CHUNKED) {
                            item_chunk *fch = (item_chunk *) ITEM_data(new_it);
                            fch->next->prev = fch;
                            while (fch) {
                                fch->head = new_it;
                                fch = fch->next;
                            }
                        }
                        /* Old copy now counts as cleared for the mover. */
                        it->refcount = 0;
                        it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
#ifdef DEBUG_SLAB_MOVER
                        memcpy(ITEM_key(it), "deadbeef", 8);
#endif
                        slab_rebal.rescues++;
                        requested_adjust = ntotal;
                    } else {
                        item_chunk *nch = (item_chunk *) new_it;
                        /* Chunks always have head chunk (the main it) */
                        ch->prev->next = nch;
                        if (ch->next)
                            ch->next->prev = nch;
                        memcpy(nch, ch, ch->used + sizeof(item_chunk));
                        ch->refcount = 0;
                        ch->it_flags = ITEM_SLABBED|ITEM_FETCHED;
                        slab_rebal.chunk_rescues++;
#ifdef DEBUG_SLAB_MOVER
                        memcpy(ITEM_key((item *)ch), "deadbeef", 8);
#endif
                        /* Head item stays live; drop our temporary ref. */
                        refcount_decr(&it->refcount);
                        requested_adjust = s_cls->size;
                    }
                } else {
                    /* restore ntotal in case we tried saving a head chunk. */
                    ntotal = ITEM_ntotal(it);
                    do_item_unlink(it, hv);
                    slabs_free(it, ntotal, slab_rebal.s_clsid);
                    /* Swing around again later to remove it from the freelist. */
                    slab_rebal.busy_items++;
                    was_busy++;
                }
                item_trylock_unlock(hold_lock);
                pthread_mutex_lock(&slabs_lock);
                /* Always remove the ntotal, as we added it in during
                 * do_slabs_alloc() when copying the item.
                 */
                s_cls->requested -= requested_adjust;
                break;
            case MOVE_FROM_SLAB:
                /* Freelist chunk: flag it as cleared-for-move. */
                it->refcount = 0;
                it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
#ifdef DEBUG_SLAB_MOVER
                memcpy(ITEM_key(it), "deadbeef", 8);
#endif
                break;
            case MOVE_BUSY:
            case MOVE_LOCKED:
                slab_rebal.busy_items++;
                was_busy++;
                break;
            case MOVE_PASS:
                break;
        }

        /* Advance the cursor one chunk; stop at the end of the page. */
        slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;
        if (slab_rebal.slab_pos >= slab_rebal.slab_end)
            break;
    }

    if (slab_rebal.slab_pos >= slab_rebal.slab_end) {
        /* Some items were busy, start again from the top */
        if (slab_rebal.busy_items) {
            slab_rebal.slab_pos = slab_rebal.slab_start;
            STATS_LOCK();
            stats.slab_reassign_busy_items += slab_rebal.busy_items;
            STATS_UNLOCK();
            slab_rebal.busy_items = 0;
        } else {
            /* Page fully cleared; slab_rebalance_finish will run next. */
            slab_rebal.done++;
        }
    }

    pthread_mutex_unlock(&slabs_lock);

    return was_busy;
}
99410698baeSdormando 
slab_rebalance_finish(void)99510698baeSdormando static void slab_rebalance_finish(void) {
99610698baeSdormando     slabclass_t *s_cls;
99710698baeSdormando     slabclass_t *d_cls;
998d5185f9cSdormando     int x;
9996ee8daefSdormando     uint32_t rescues;
10008fa54f7eSdormando     uint32_t evictions_nomem;
1001b1debc4cSdormando     uint32_t inline_reclaim;
1002*ee461d11Sdormando     uint32_t chunk_rescues;
100310698baeSdormando 
100410698baeSdormando     pthread_mutex_lock(&slabs_lock);
100510698baeSdormando 
100610698baeSdormando     s_cls = &slabclass[slab_rebal.s_clsid];
100710698baeSdormando     d_cls = &slabclass[slab_rebal.d_clsid];
100810698baeSdormando 
1009a836eabcSdormando #ifdef DEBUG_SLAB_MOVER
1010a836eabcSdormando     /* If the algorithm is broken, live items can sneak in. */
1011a836eabcSdormando     slab_rebal.slab_pos = slab_rebal.slab_start;
1012a836eabcSdormando     while (1) {
1013a836eabcSdormando         item *it = slab_rebal.slab_pos;
1014186509c2Sdormando         assert(it->it_flags == (ITEM_SLABBED|ITEM_FETCHED));
1015a836eabcSdormando         assert(memcmp(ITEM_key(it), "deadbeef", 8) == 0);
1016186509c2Sdormando         it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
1017a836eabcSdormando         slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;
1018a836eabcSdormando         if (slab_rebal.slab_pos >= slab_rebal.slab_end)
1019a836eabcSdormando             break;
1020a836eabcSdormando     }
1021a836eabcSdormando #endif
1022a836eabcSdormando 
1023d5185f9cSdormando     /* At this point the stolen slab is completely clear.
1024d5185f9cSdormando      * We always kill the "first"/"oldest" slab page in the slab_list, so
1025d5185f9cSdormando      * shuffle the page list backwards and decrement.
1026d5185f9cSdormando      */
1027fa51ad84Sdormando     s_cls->slabs--;
1028d5185f9cSdormando     for (x = 0; x < s_cls->slabs; x++) {
1029d5185f9cSdormando         s_cls->slab_list[x] = s_cls->slab_list[x+1];
1030d5185f9cSdormando     }
103110698baeSdormando 
103210698baeSdormando     d_cls->slab_list[d_cls->slabs++] = slab_rebal.slab_start;
1033d6e96467Sdormando     /* Don't need to split the page into chunks if we're just storing it */
1034d6e96467Sdormando     if (slab_rebal.d_clsid > SLAB_GLOBAL_PAGE_POOL) {
1035a836eabcSdormando         memset(slab_rebal.slab_start, 0, (size_t)settings.item_size_max);
1036845a4fe1Sdormando         split_slab_page_into_freelist(slab_rebal.slab_start,
1037845a4fe1Sdormando             slab_rebal.d_clsid);
103831541b37Sdormando     } else if (slab_rebal.d_clsid == SLAB_GLOBAL_PAGE_POOL) {
103931541b37Sdormando         /* mem_malloc'ed might be higher than mem_limit. */
104031541b37Sdormando         memory_release();
1041d6e96467Sdormando     }
104210698baeSdormando 
104310698baeSdormando     slab_rebal.done       = 0;
104410698baeSdormando     slab_rebal.s_clsid    = 0;
104510698baeSdormando     slab_rebal.d_clsid    = 0;
104610698baeSdormando     slab_rebal.slab_start = NULL;
104710698baeSdormando     slab_rebal.slab_end   = NULL;
104810698baeSdormando     slab_rebal.slab_pos   = NULL;
10498fa54f7eSdormando     evictions_nomem    = slab_rebal.evictions_nomem;
1050b1debc4cSdormando     inline_reclaim = slab_rebal.inline_reclaim;
10516ee8daefSdormando     rescues   = slab_rebal.rescues;
1052*ee461d11Sdormando     chunk_rescues = slab_rebal.chunk_rescues;
10538fa54f7eSdormando     slab_rebal.evictions_nomem    = 0;
1054b1debc4cSdormando     slab_rebal.inline_reclaim = 0;
10556ee8daefSdormando     slab_rebal.rescues  = 0;
105610698baeSdormando 
105710698baeSdormando     slab_rebalance_signal = 0;
105810698baeSdormando 
105910698baeSdormando     pthread_mutex_unlock(&slabs_lock);
106010698baeSdormando 
106110698baeSdormando     STATS_LOCK();
106210698baeSdormando     stats.slabs_moved++;
10636ee8daefSdormando     stats.slab_reassign_rescues += rescues;
10648fa54f7eSdormando     stats.slab_reassign_evictions_nomem += evictions_nomem;
1065b1debc4cSdormando     stats.slab_reassign_inline_reclaim += inline_reclaim;
1066*ee461d11Sdormando     stats.slab_reassign_chunk_rescues += chunk_rescues;
1067cb01d504Sdormando     stats_state.slab_reassign_running = false;
106810698baeSdormando     STATS_UNLOCK();
106910698baeSdormando 
107010698baeSdormando     if (settings.verbose > 1) {
107110698baeSdormando         fprintf(stderr, "finished a slab move\n");
107210698baeSdormando     }
107310698baeSdormando }
107410698baeSdormando 
107557a9856aSdormando /* Slab mover thread.
107657a9856aSdormando  * Sits waiting for a condition to jump off and shovel some memory about
107757a9856aSdormando  */
/* Body of the slab rebalance thread. Loops until
 * do_run_slab_rebalance_thread is cleared: starts a queued move when
 * slab_rebalance_signal == 1, then drives slab_rebalance_move() until
 * the page is clear, finishing with slab_rebalance_finish(). Sleeps on
 * slab_rebalance_cond (holding slabs_rebalance_lock) when idle. */
static void *slab_rebalance_thread(void *arg) {
    int was_busy = 0;
    /* So we first pass into cond_wait with the mutex held */
    mutex_lock(&slabs_rebalance_lock);

    while (do_run_slab_rebalance_thread) {
        if (slab_rebalance_signal == 1) {
            if (slab_rebalance_start() < 0) {
                /* Handle errors with more specificity as required. */
                slab_rebalance_signal = 0;
            }

            was_busy = 0;
        } else if (slab_rebalance_signal && slab_rebal.slab_start != NULL) {
            /* Signal is 2: a move is in flight; advance it one bulk step. */
            was_busy = slab_rebalance_move();
        }

        if (slab_rebal.done) {
            slab_rebalance_finish();
        } else if (was_busy) {
            /* Stuck waiting for some items to unlock, so slow down a bit
             * to give them a chance to free up */
            usleep(50);
        }

        if (slab_rebalance_signal == 0) {
            /* always hold this lock while we're running */
            pthread_cond_wait(&slab_rebalance_cond, &slabs_rebalance_lock);
        }
    }
    return NULL;
}
111010698baeSdormando 
1111c2e0023dSdormando /* Iterate at most once through the slab classes and pick a "random" source.
1112c2e0023dSdormando  * I like this better than calling rand() since rand() is slow enough that we
1113c2e0023dSdormando  * can just check all of the classes once instead.
1114c2e0023dSdormando  */
slabs_reassign_pick_any(int dst)1115c2e0023dSdormando static int slabs_reassign_pick_any(int dst) {
1116c2e0023dSdormando     static int cur = POWER_SMALLEST - 1;
1117c2e0023dSdormando     int tries = power_largest - POWER_SMALLEST + 1;
1118c2e0023dSdormando     for (; tries > 0; tries--) {
1119c2e0023dSdormando         cur++;
1120c2e0023dSdormando         if (cur > power_largest)
1121c2e0023dSdormando             cur = POWER_SMALLEST;
1122c2e0023dSdormando         if (cur == dst)
1123c2e0023dSdormando             continue;
1124c2e0023dSdormando         if (slabclass[cur].slabs > 1) {
1125c2e0023dSdormando             return cur;
1126c2e0023dSdormando         }
1127c2e0023dSdormando     }
1128c2e0023dSdormando     return -1;
1129c2e0023dSdormando }
1130c2e0023dSdormando 
do_slabs_reassign(int src,int dst)113110698baeSdormando static enum reassign_result_type do_slabs_reassign(int src, int dst) {
113210698baeSdormando     if (slab_rebalance_signal != 0)
113310698baeSdormando         return REASSIGN_RUNNING;
113410698baeSdormando 
11358c1c18edSdormando     if (src == dst)
11368c1c18edSdormando         return REASSIGN_SRC_DST_SAME;
11378c1c18edSdormando 
1138c2e0023dSdormando     /* Special indicator to choose ourselves. */
1139c2e0023dSdormando     if (src == -1) {
1140c2e0023dSdormando         src = slabs_reassign_pick_any(dst);
1141c2e0023dSdormando         /* TODO: If we end up back at -1, return a new error type */
1142c2e0023dSdormando     }
1143c2e0023dSdormando 
114410698baeSdormando     if (src < POWER_SMALLEST        || src > power_largest ||
1145d6e96467Sdormando         dst < SLAB_GLOBAL_PAGE_POOL || dst > power_largest)
114610698baeSdormando         return REASSIGN_BADCLASS;
114710698baeSdormando 
114810698baeSdormando     if (slabclass[src].slabs < 2)
114910698baeSdormando         return REASSIGN_NOSPARE;
115010698baeSdormando 
115110698baeSdormando     slab_rebal.s_clsid = src;
115210698baeSdormando     slab_rebal.d_clsid = dst;
115310698baeSdormando 
115410698baeSdormando     slab_rebalance_signal = 1;
115557a9856aSdormando     pthread_cond_signal(&slab_rebalance_cond);
115610698baeSdormando 
115710698baeSdormando     return REASSIGN_OK;
115810698baeSdormando }
115910698baeSdormando 
slabs_reassign(int src,int dst)116010698baeSdormando enum reassign_result_type slabs_reassign(int src, int dst) {
116110698baeSdormando     enum reassign_result_type ret;
11623b2dc737Sdormando     if (pthread_mutex_trylock(&slabs_rebalance_lock) != 0) {
11633b2dc737Sdormando         return REASSIGN_RUNNING;
11643b2dc737Sdormando     }
116510698baeSdormando     ret = do_slabs_reassign(src, dst);
11663b2dc737Sdormando     pthread_mutex_unlock(&slabs_rebalance_lock);
116710698baeSdormando     return ret;
116810698baeSdormando }
116910698baeSdormando 
/* If we hold this lock, rebalancer can't wake up or move */
void slabs_rebalancer_pause(void) {
    pthread_mutex_lock(&slabs_rebalance_lock);
}
11741c94e12cSdormando 
/* Counterpart to slabs_rebalancer_pause(): releases the lock so the
 * rebalance thread may wake and continue moving pages. */
void slabs_rebalancer_resume(void) {
    pthread_mutex_unlock(&slabs_rebalance_lock);
}
11781c94e12cSdormando 
117957a9856aSdormando static pthread_t rebalance_tid;
118010698baeSdormando 
start_slab_maintenance_thread(void)118110698baeSdormando int start_slab_maintenance_thread(void) {
118210698baeSdormando     int ret;
118310698baeSdormando     slab_rebalance_signal = 0;
118410698baeSdormando     slab_rebal.slab_start = NULL;
118510698baeSdormando     char *env = getenv("MEMCACHED_SLAB_BULK_CHECK");
118610698baeSdormando     if (env != NULL) {
118710698baeSdormando         slab_bulk_check = atoi(env);
118810698baeSdormando         if (slab_bulk_check == 0) {
118910698baeSdormando             slab_bulk_check = DEFAULT_SLAB_BULK_CHECK;
119010698baeSdormando         }
119110698baeSdormando     }
119257a9856aSdormando 
119357a9856aSdormando     if (pthread_cond_init(&slab_rebalance_cond, NULL) != 0) {
119457a9856aSdormando         fprintf(stderr, "Can't intiialize rebalance condition\n");
119557a9856aSdormando         return -1;
119657a9856aSdormando     }
11973b2dc737Sdormando     pthread_mutex_init(&slabs_rebalance_lock, NULL);
119857a9856aSdormando 
119957a9856aSdormando     if ((ret = pthread_create(&rebalance_tid, NULL,
120057a9856aSdormando                               slab_rebalance_thread, NULL)) != 0) {
120157a9856aSdormando         fprintf(stderr, "Can't create rebal thread: %s\n", strerror(ret));
120210698baeSdormando         return -1;
120310698baeSdormando     }
120410698baeSdormando     return 0;
120510698baeSdormando }
120610698baeSdormando 
/* The maintenance thread is on a sleep/loop cycle, so it should join after a
 * short wait */
void stop_slab_maintenance_thread(void) {
    mutex_lock(&slabs_rebalance_lock);
    do_run_slab_thread = 0;
    do_run_slab_rebalance_thread = 0;
    /* Wake the rebalance thread if it's parked in pthread_cond_wait() so it
     * can observe the cleared run flag and fall out of its loop. */
    pthread_cond_signal(&slab_rebalance_cond);
    pthread_mutex_unlock(&slabs_rebalance_lock);

    /* Wait for the maintenance thread to stop */
    pthread_join(rebalance_tid, NULL);
}
1219